cprover
unicode.cpp
Go to the documentation of this file.
1 /*******************************************************************\
2 
3 Module:
4 
5 Author: Daniel Kroening, kroening@kroening.com
6 
7 \*******************************************************************/
8 
9 #include "unicode.h"
10 
11 #include <cstring>
12 #include <locale>
13 #include <iomanip>
14 #include <sstream>
15 #include <cstdint>
16 
17 #ifdef _WIN32
18 #include <util/pragma_push.def>
19 #ifdef _MSC_VER
20 #pragma warning(disable:4668)
21  // using #if/#elif on undefined macro
22 #endif
23 #include <windows.h>
24 #include <util/pragma_pop.def>
25 #endif
26 
/// Convert a NUL-terminated wide string into a narrow string.
/// On Windows the result is produced by WideCharToMultiByte with CP_UTF8;
/// elsewhere every wide character is simply cast to `char`.
/// \param s: NUL-terminated wide string to convert
/// \return the narrow string
std::string narrow(const wchar_t *s)
{
#ifdef _WIN32
  const int wide_length = static_cast<int>(wcslen(s));

  // First call only computes the number of bytes required.
  const int narrow_length =
    WideCharToMultiByte(CP_UTF8, 0, s, wide_length, NULL, 0, NULL, NULL);

  std::string converted(narrow_length, 0);
  WideCharToMultiByte(
    CP_UTF8, 0, s, wide_length, &converted[0], narrow_length, NULL, NULL);
  return converted;
#else
  // dummy conversion: one output char per input wide character
  std::string converted;
  converted.reserve(wcslen(s));

  for(const wchar_t *cursor = s; *cursor != 0; ++cursor)
    converted += static_cast<char>(*cursor);

  return converted;
#endif
}
51 
/// Convert a NUL-terminated narrow string into a wide string.
/// On Windows the input is handed to MultiByteToWideChar with CP_UTF8;
/// elsewhere every `char` is simply converted to `wchar_t`.
/// \param s: NUL-terminated narrow string to convert
/// \return the wide string
std::wstring widen(const char *s)
{
#ifdef _WIN32
  const int narrow_length = static_cast<int>(strlen(s));

  // First call only computes the number of wide characters required.
  const int wide_length =
    MultiByteToWideChar(CP_UTF8, 0, s, narrow_length, NULL, 0);

  std::wstring converted(wide_length, 0);
  MultiByteToWideChar(
    CP_UTF8, 0, s, narrow_length, &converted[0], wide_length);
  return converted;
#else
  // dummy conversion: one output wide character per input char
  std::wstring converted;
  converted.reserve(strlen(s));

  for(const char *cursor = s; *cursor != 0; ++cursor)
    converted += static_cast<wchar_t>(*cursor);

  return converted;
#endif
}
76 
/// Convert a wide string into a narrow string.
/// On Windows the result is produced by WideCharToMultiByte with CP_UTF8;
/// elsewhere every wide character is simply converted to `char`.
/// \param s: wide string to convert
/// \return the narrow string
std::string narrow(const std::wstring &s)
{
#ifdef _WIN32
  const int wide_length = static_cast<int>(s.size());

  // First call only computes the number of bytes required.
  const int narrow_length = WideCharToMultiByte(
    CP_UTF8, 0, &s[0], wide_length, NULL, 0, NULL, NULL);

  std::string converted(narrow_length, 0);
  WideCharToMultiByte(
    CP_UTF8, 0, &s[0], wide_length, &converted[0], narrow_length, NULL, NULL);
  return converted;
#else
  // dummy conversion: one output char per input wide character
  std::string converted;
  converted.reserve(s.size());

  for(const wchar_t wide_char : s)
    converted.push_back(static_cast<char>(wide_char));

  return converted;
#endif
}
93 
/// Convert a narrow string into a wide string.
/// On Windows the input is handed to MultiByteToWideChar with CP_UTF8;
/// elsewhere every `char` is simply converted to `wchar_t`.
/// \param s: narrow string to convert
/// \return the wide string
std::wstring widen(const std::string &s)
{
#ifdef _WIN32
  const int narrow_length = static_cast<int>(s.size());

  // First call only computes the number of wide characters required.
  const int wide_length =
    MultiByteToWideChar(CP_UTF8, 0, &s[0], narrow_length, NULL, 0);

  std::wstring converted(wide_length, 0);
  MultiByteToWideChar(
    CP_UTF8, 0, &s[0], narrow_length, &converted[0], wide_length);
  return converted;
#else
  // dummy conversion: one output wide character per input char
  std::wstring converted;
  converted.reserve(s.size());

  for(const char narrow_char : s)
    converted.push_back(static_cast<wchar_t>(narrow_char));

  return converted;
#endif
}
110 
/// Appends a unicode character to a utf8-encoded string.
/// The number of bytes emitted (1 to 4) is chosen from the magnitude of `c`.
/// \param c: the code point to encode
/// \param result: string to which the encoded bytes are appended
static void utf8_append_code(unsigned int c, std::string &result)
{
  // Payload of a continuation byte: six bits of `bits` tagged with 10xxxxxx.
  const auto continuation = [](unsigned int bits) {
    return static_cast<char>((bits & 0x3f) | 0x80);
  };

  if(c <= 0x7f)
  {
    // single byte, value stored verbatim
    result += static_cast<char>(c);
    return;
  }

  if(c <= 0x7ff)
  {
    // two bytes: 110xxxxx 10xxxxxx
    result += static_cast<char>((c >> 6) | 0xc0);
    result += continuation(c);
    return;
  }

  if(c <= 0xffff)
  {
    // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    result += static_cast<char>((c >> 12) | 0xe0);
    result += continuation(c >> 6);
    result += continuation(c);
    return;
  }

  // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  result += static_cast<char>((c >> 18) | 0xf0);
  result += continuation(c >> 12);
  result += continuation(c >> 6);
  result += continuation(c);
}
136 
139 std::string
140 utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s)
141 {
142  std::string result;
143 
144  result.reserve(s.size()); // at least that long
145 
146  for(const auto c : s)
147  utf8_append_code(c, result);
148 
149  return result;
150 }
151 
152 std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide)
153 {
154  if(argv_wide==nullptr)
155  return std::vector<std::string>();
156 
157  std::vector<std::string> argv_narrow;
158  argv_narrow.reserve(argc);
159 
160  for(int i=0; i!=argc; ++i)
161  argv_narrow.push_back(narrow(argv_wide[i]));
162 
163  return argv_narrow;
164 }
165 
/// Appends a unicode code point to a UTF-16 encoded wide string.
/// Code points up to and including 0xFFFF are appended as a single UTF-16
/// code unit; anything larger is split into a high/low surrogate pair.
/// \param code: the code point to append
/// \param result: string to which the code unit(s) are appended
static void utf16_append_code(unsigned int code, std::wstring &result)
{
  // we do not treat 0xD800 to 0xDFFF, although
  // they are not valid unicode symbols

  // Note the inclusive bound: 0xFFFF itself still fits in one code unit.
  // Sending it to the surrogate branch would make `code-0x10000` wrap around
  // and emit a bogus surrogate pair.
  if(code<=0xFFFF)
  {
    // code is encoded as one UTF16 character
    result += static_cast<wchar_t>(code);
  }
  else // code is encoded as two UTF16 characters
  {
    // if this is valid unicode, we have
    // code<=0x10FFFF
    // but let's not check it programmatically

    // encode the code in UTF16: subtract 0x10000, then store the upper ten
    // bits in the high surrogate and the lower ten bits in the low surrogate
    code=code-0x10000;
    const uint16_t i1 = static_cast<uint16_t>(((code >> 10) & 0x3ff) | 0xD800);
    result += static_cast<wchar_t>(i1);
    const uint16_t i2 = static_cast<uint16_t>((code & 0x3ff) | 0xDC00);
    result += static_cast<wchar_t>(i2);
  }
}
190 
191 
196 std::wstring utf8_to_utf16_native_endian(const std::string &in)
197 {
198  std::wstring result;
199  result.reserve(in.size());
201  while(i<in.size())
202  {
203  unsigned char c=in[i++];
204  unsigned int code=0;
205  // the ifs that follow find out how many UTF8 characters (1-4) store the
206  // next unicode character. This is determined by the few most
207  // significant bits.
208  if(c<=0x7F)
209  {
210  // if it's one character, then code is exactly the value
211  code=c;
212  }
213  else if(c<=0xDF && i<in.size())
214  { // in other cases, we need to read the right number of chars and decode
215  // note: if we wanted to make sure that we capture incorrect strings,
216  // we should check that whatever follows first character starts with
217  // bits 10.
218  code = (c & 0x1Fu) << 6;
219  c=in[i++];
220  code += c & 0x3Fu;
221  }
222  else if(c<=0xEF && i+1<in.size())
223  {
224  code = (c & 0xFu) << 12;
225  c=in[i++];
226  code += (c & 0x3Fu) << 6;
227  c=in[i++];
228  code += c & 0x3Fu;
229  }
230  else if(c<=0xF7 && i+2<in.size())
231  {
232  code = (c & 0x7u) << 18;
233  c=in[i++];
234  code += (c & 0x3Fu) << 12;
235  c=in[i++];
236  code += (c & 0x3Fu) << 6;
237  c=in[i++];
238  code += c & 0x3Fu;
239  }
240  else
241  {
242  // The string is not a valid UTF8 string! Either it has some characters
243  // missing from a multi-character unicode symbol, or it has a char with
244  // too high value.
245  // For now, let's replace the character with a space
246  code=32;
247  }
248 
249  utf16_append_code(code, result);
250  }
251 
252  return result;
253 }
254 
/// Write a single wide character to `result` in a form suitable for use
/// inside a Java character or string literal.
/// Newline, carriage return, form feed, backspace and tab become their
/// backslash escapes; printable characters up to 255 are written as-is, with
/// a backslash in front of `"`, `\` and `'`; everything else is written as
/// `\u` followed by the value in hexadecimal, zero-padded to four digits.
/// \param ch: the character to escape
/// \param result: stream to which the escaped form is written
/// \param loc: locale used to decide whether `ch` is printable
static void utf16_native_endian_to_java(
  const wchar_t ch,
  std::ostringstream &result,
  const std::locale &loc)
{
  // \u unicode characters are translated very early by the Java compiler and so
  // \u000a or \u000d would become a newline character in a char constant, which
  // is illegal. Instead use \n or \r.
  if(ch == '\n')
    result << "\\n";
  else if(ch == '\r')
    result << "\\r";
  // \f, \b and \t do not need to be escaped, but this will improve readability
  // of generated tests.
  else if(ch == '\f')
    result << "\\f";
  else if(ch == '\b')
    result << "\\b";
  else if(ch == '\t')
    result << "\\t";
  else if(ch <= 255 && isprint(ch, loc))
  {
    const auto uch = static_cast<unsigned char>(ch);
    // ", \ and ' need to be escaped.
    if(uch == '"' || uch == '\\' || uch == '\'')
      result << '\\';
    result << uch;
  }
  else
  {
    // Format ch as a hexadecimal unicode character padded to four digits with
    // zeros.
    result << "\\u" << std::hex << std::setw(4) << std::setfill('0')
           << static_cast<unsigned int>(ch);
  }
}
295 
298 std::string utf16_native_endian_to_java(const char16_t ch)
299 {
300  std::ostringstream result;
301  const std::locale loc;
302  utf16_native_endian_to_java(ch, result, loc);
303  return result.str();
304 }
305 
308 std::string utf16_native_endian_to_java(const std::wstring &in)
309 {
310  std::ostringstream result;
311  const std::locale loc;
312  for(const auto ch : in)
313  utf16_native_endian_to_java(ch, result, loc);
314  return result.str();
315 }
narrow
std::string narrow(const wchar_t *s)
Definition: unicode.cpp:27
widen
std::wstring widen(const char *s)
Definition: unicode.cpp:52
utf8_to_utf16_native_endian
std::wstring utf8_to_utf16_native_endian(const std::string &in)
Convert UTF8-encoded string to UTF-16 with architecture-native endianness.
Definition: unicode.cpp:196
unicode.h
utf8_append_code
static void utf8_append_code(unsigned int c, std::string &result)
Appends a unicode character to a utf8-encoded string.
Definition: unicode.cpp:113
loc
#define loc()
Definition: ansi_c_lex.yy.cpp:4256
r
static int8_t r
Definition: irep_hash.h:59
narrow_argv
std::vector< std::string > narrow_argv(int argc, const wchar_t **argv_wide)
Definition: unicode.cpp:152
utf16_native_endian_to_java
static void utf16_native_endian_to_java(const wchar_t ch, std::ostringstream &result, const std::locale &loc)
Definition: unicode.cpp:259
size_type
unsignedbv_typet size_type()
Definition: c_types.cpp:58
utf16_append_code
static void utf16_append_code(unsigned int code, std::wstring &result)
Definition: unicode.cpp:166
utf32_native_endian_to_utf8
std::string utf32_native_endian_to_utf8(const std::basic_string< unsigned int > &s)
Definition: unicode.cpp:140