cprover
unicode.cpp
Go to the documentation of this file.
1 /*******************************************************************\
2 
3 Module:
4 
5 Author: Daniel Kroening, kroening@kroening.com
6 
7 \*******************************************************************/
8 
9 #include "unicode.h"
10 
11 #include <codecvt>
12 #include <cstdint>
13 #include <cstring>
14 #include <iomanip>
15 #include <locale>
16 #include <sstream>
17 
18 #include "invariant.h"
19 
20 #ifdef _WIN32
21 #include <util/pragma_push.def>
22 #ifdef _MSC_VER
23 #pragma warning(disable:4668)
24  // using #if/#elif on undefined macro
25 #pragma warning(disable : 5039)
26 // pointer or reference to potentially throwing function passed to extern C
27 #endif
28 #include <windows.h>
29 #include <util/pragma_pop.def>
30 #endif
31 
/// Convert a NUL-terminated wide string to a narrow string.
/// On Windows the conversion goes through WideCharToMultiByte with CP_UTF8;
/// on every other platform each wide character is converted to `char` with a
/// plain static_cast ("dummy conversion").
/// \param s: NUL-terminated wide string; it is dereferenced unconditionally
/// \return the narrowed string
std::string narrow(const wchar_t *s)
{
#ifdef _WIN32
  const int wide_length = static_cast<int>(wcslen(s));
  // the first call only computes how many bytes the result needs
  const int narrow_length = WideCharToMultiByte(
    CP_UTF8, 0, s, wide_length, nullptr, 0, nullptr, nullptr);
  std::string narrowed(narrow_length, 0);
  WideCharToMultiByte(
    CP_UTF8, 0, s, wide_length, &narrowed[0], narrow_length, nullptr, nullptr);
  return narrowed;
#else
  // dummy conversion
  std::string narrowed;
  narrowed.reserve(wcslen(s));
  for(const wchar_t *cursor = s; *cursor != 0; ++cursor)
    narrowed += static_cast<char>(*cursor);
  return narrowed;
#endif
}
56 
/// Convert a NUL-terminated narrow string to a wide string.
/// On Windows the input is passed to MultiByteToWideChar with CP_UTF8; on
/// every other platform each `char` is converted to `wchar_t` individually
/// ("dummy conversion").
/// \param s: NUL-terminated narrow string; it is dereferenced unconditionally
/// \return the widened string
std::wstring widen(const char *s)
{
#ifdef _WIN32
  const int narrow_length = static_cast<int>(strlen(s));
  // the first call only computes how many wide characters the result needs
  const int wide_length =
    MultiByteToWideChar(CP_UTF8, 0, s, narrow_length, nullptr, 0);
  std::wstring widened(wide_length, 0);
  MultiByteToWideChar(CP_UTF8, 0, s, narrow_length, &widened[0], wide_length);
  return widened;
#else
  // dummy conversion
  std::wstring widened;
  widened.reserve(strlen(s));
  for(const char *cursor = s; *cursor != 0; ++cursor)
    widened += static_cast<wchar_t>(*cursor);
  return widened;
#endif
}
81 
/// Convert a wide string to a narrow string.
/// On Windows the conversion goes through WideCharToMultiByte with CP_UTF8;
/// on every other platform each wide character is converted to `char`
/// individually ("dummy conversion").
/// \param s: wide string to convert (may contain embedded NULs)
/// \return the narrowed string
std::string narrow(const std::wstring &s)
{
#ifdef _WIN32
  const int wide_length = static_cast<int>(s.size());
  // the first call only computes how many bytes the result needs
  const int narrow_length = WideCharToMultiByte(
    CP_UTF8, 0, s.data(), wide_length, nullptr, 0, nullptr, nullptr);
  std::string narrowed(narrow_length, 0);
  WideCharToMultiByte(
    CP_UTF8,
    0,
    s.data(),
    wide_length,
    &narrowed[0],
    narrow_length,
    nullptr,
    nullptr);
  return narrowed;
#else
  // dummy conversion
  std::string narrowed;
  narrowed.reserve(s.size());
  for(const wchar_t wide_char : s)
    narrowed.push_back(static_cast<char>(wide_char));
  return narrowed;
#endif
}
98 
/// Convert a narrow string to a wide string.
/// On Windows the input is passed to MultiByteToWideChar with CP_UTF8; on
/// every other platform each `char` is converted to `wchar_t` individually
/// ("dummy conversion").
/// \param s: narrow string to convert (may contain embedded NULs)
/// \return the widened string
std::wstring widen(const std::string &s)
{
#ifdef _WIN32
  const int narrow_length = static_cast<int>(s.size());
  // the first call only computes how many wide characters the result needs
  const int wide_length =
    MultiByteToWideChar(CP_UTF8, 0, s.data(), narrow_length, nullptr, 0);
  std::wstring widened(wide_length, 0);
  MultiByteToWideChar(
    CP_UTF8, 0, s.data(), narrow_length, &widened[0], wide_length);
  return widened;
#else
  // dummy conversion
  std::wstring widened;
  widened.reserve(s.size());
  for(const char narrow_char : s)
    widened.push_back(static_cast<wchar_t>(narrow_char));
  return widened;
#endif
}
115 
/// Appends a unicode character to a utf8-encoded string.
/// \param c: code point to encode
/// \param result: string that receives the one to four encoded bytes
static void utf8_append_code(unsigned int c, std::string &result)
{
  // builds a continuation byte (10xxxxxx) from the low six bits of `bits`
  const auto continuation = [](unsigned int bits) {
    return static_cast<char>((bits & 0x3f) | 0x80);
  };

  if(c <= 0x7f)
  {
    // single byte, value is the code point itself
    result.push_back(static_cast<char>(c));
    return;
  }

  if(c <= 0x7ff)
  {
    // two bytes: 110xxxxx 10xxxxxx
    result.push_back(static_cast<char>((c >> 6) | 0xc0));
    result.push_back(continuation(c));
    return;
  }

  if(c <= 0xffff)
  {
    // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    result.push_back(static_cast<char>((c >> 12) | 0xe0));
    result.push_back(continuation(c >> 6));
    result.push_back(continuation(c));
    return;
  }

  // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  result.push_back(static_cast<char>((c >> 18) | 0xf0));
  result.push_back(continuation(c >> 12));
  result.push_back(continuation(c >> 6));
  result.push_back(continuation(c));
}
141 
144 std::string
145 utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s)
146 {
147  std::string result;
148 
149  result.reserve(s.size()); // at least that long
150 
151  for(const auto c : s)
152  utf8_append_code(c, result);
153 
154  return result;
155 }
156 
157 std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide)
158 {
159  if(argv_wide==nullptr)
160  return std::vector<std::string>();
161 
162  std::vector<std::string> argv_narrow;
163  argv_narrow.reserve(argc);
164 
165  for(int i=0; i!=argc; ++i)
166  argv_narrow.push_back(narrow(argv_wide[i]));
167 
168  return argv_narrow;
169 }
170 
/// Appends the UTF-16 encoding of a code point to a wide string.
/// Code points up to and including 0xFFFF are appended as a single element;
/// larger ones are appended as a high/low surrogate pair.
/// \param code: code point to encode
/// \param result: wide string that receives one or two elements
static void utf16_append_code(unsigned int code, std::wstring &result)
{
  // we do not treat 0xD800 to 0xDFFF, although
  // they are not valid unicode symbols

  // The whole basic multilingual plane, *including* 0xFFFF, fits in a single
  // UTF-16 code unit. Sending 0xFFFF down the surrogate path would make
  // `code - 0x10000` wrap around and emit a bogus surrogate pair.
  if(code <= 0xFFFF)
  {
    // code is encoded as one UTF16 character
    result += static_cast<wchar_t>(code);
  }
  else // code is encoded as two UTF16 characters
  {
    // if this is valid unicode, we have
    // code<=0x10FFFF
    // but let's not check it programmatically

    // encode the code in UTF16: 20 payload bits, ten per surrogate
    code -= 0x10000;
    const uint16_t high_surrogate =
      static_cast<uint16_t>(((code >> 10) & 0x3ff) | 0xD800);
    result += static_cast<wchar_t>(high_surrogate);
    const uint16_t low_surrogate =
      static_cast<uint16_t>((code & 0x3ff) | 0xDC00);
    result += static_cast<wchar_t>(low_surrogate);
  }
}
195 
196 
201 std::wstring utf8_to_utf16_native_endian(const std::string &in)
202 {
203  std::wstring result;
204  result.reserve(in.size());
206  while(i<in.size())
207  {
208  unsigned char c=in[i++];
209  unsigned int code=0;
210  // the ifs that follow find out how many UTF8 characters (1-4) store the
211  // next unicode character. This is determined by the few most
212  // significant bits.
213  if(c<=0x7F)
214  {
215  // if it's one character, then code is exactly the value
216  code=c;
217  }
218  else if(c<=0xDF && i<in.size())
219  { // in other cases, we need to read the right number of chars and decode
220  // note: if we wanted to make sure that we capture incorrect strings,
221  // we should check that whatever follows first character starts with
222  // bits 10.
223  code = (c & 0x1Fu) << 6;
224  c=in[i++];
225  code += c & 0x3Fu;
226  }
227  else if(c<=0xEF && i+1<in.size())
228  {
229  code = (c & 0xFu) << 12;
230  c=in[i++];
231  code += (c & 0x3Fu) << 6;
232  c=in[i++];
233  code += c & 0x3Fu;
234  }
235  else if(c<=0xF7 && i+2<in.size())
236  {
237  code = (c & 0x7u) << 18;
238  c=in[i++];
239  code += (c & 0x3Fu) << 12;
240  c=in[i++];
241  code += (c & 0x3Fu) << 6;
242  c=in[i++];
243  code += c & 0x3Fu;
244  }
245  else
246  {
247  // The string is not a valid UTF8 string! Either it has some characters
248  // missing from a multi-character unicode symbol, or it has a char with
249  // too high value.
250  // For now, let's replace the character with a space
251  code=32;
252  }
253 
254  utf16_append_code(code, result);
255  }
256 
257  return result;
258 }
259 
/// Escapes non-printable characters, whitespace except for spaces, double
/// quotes and backslashes.
/// \param ch: character to write
/// \param result: stream the (possibly escaped) character is written to
/// \param loc: locale used to decide whether `ch` is printable
static void utf16_native_endian_to_java_string(
  const wchar_t ch,
  std::ostringstream &result,
  const std::locale &loc)
{
  // \u unicode characters are translated very early by the Java compiler and so
  // \u000a or \u000d would become a newline character in a char constant, which
  // is illegal. Instead use \n or \r.
  if(ch == '\n')
    result << "\\n";
  else if(ch == '\r')
    result << "\\r";
  // \f, \b and \t do not need to be escaped, but this will improve readability
  // of generated tests.
  else if(ch == '\f')
    result << "\\f";
  else if(ch == '\b')
    result << "\\b";
  else if(ch == '\t')
    result << "\\t";
  else if(ch <= 255 && std::isprint(ch, loc))
  {
    const auto uch = static_cast<unsigned char>(ch);
    // ", and \ need to be escaped, but not ' for java strings
    // e.g. "\"\\" needs escaping but "'" does not.
    if(uch == '"' || uch == '\\')
      result << '\\';
    result << uch;
  }
  else
  {
    // Format ch as a hexadecimal unicode character padded to four digits with
    // zeros.
    result << "\\u" << std::hex << std::setw(4) << std::setfill('0')
           << static_cast<unsigned int>(ch);
  }
}
305 
313  const wchar_t ch,
314  std::ostringstream &result,
315  const std::locale &loc)
316 {
317  if(ch == (wchar_t)'\'')
318  {
319  const auto uch = static_cast<unsigned char>(ch);
320  // ' needs to be escaped for java characters, e.g. '\''
321  result << '\\' << uch;
322  }
323  else
324  {
326  }
327 }
328 
331 std::string utf16_native_endian_to_java(const char16_t ch)
332 {
333  std::ostringstream result;
334  const std::locale loc;
335  utf16_native_endian_to_java(ch, result, loc);
336  return result.str();
337 }
338 
346 std::string utf16_native_endian_to_java_string(const std::wstring &in)
347 {
348  std::ostringstream result;
349  const std::locale loc;
350  for(const auto ch : in)
352  return result.str();
353 }
354 
355 std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
356 {
357  return utf16_native_endian_to_utf8(std::u16string(1, utf16_char));
358 }
359 
/// Converts a UTF-16 string to UTF-8 using std::wstring_convert with a
/// std::codecvt_utf8_utf16 facet.
/// \param utf16_str: UTF-16 input
/// \return the UTF-8 bytes produced by the converter's to_bytes()
std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str)
{
#ifdef _MSC_VER
  // Workaround for Visual Studio bug, see
  // https://stackoverflow.com/questions/32055357
  using convertert =
    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t>;
  const std::wstring wide_string(utf16_str.begin(), utf16_str.end());
  convertert converter;
  return converter.to_bytes(wide_string);
#else
  using convertert =
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>;
  convertert converter;
  return converter.to_bytes(utf16_str);
#endif
}
373 
374 char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex)
375 {
376  PRECONDITION(hex.length() == 4);
377  return std::strtol(hex.c_str(), nullptr, 16);
378 }
379 
380 std::string codepoint_hex_to_utf8(const std::string &hex)
381 {
383 }
codepoint_hex_to_utf8
std::string codepoint_hex_to_utf8(const std::string &hex)
Definition: unicode.cpp:380
PRECONDITION
#define PRECONDITION(CONDITION)
Definition: invariant.h:464
utf16_native_endian_to_java_string
static void utf16_native_endian_to_java_string(const wchar_t ch, std::ostringstream &result, const std::locale &loc)
Escapes non-printable characters, whitespace except for spaces, double quotes and backslashes.
Definition: unicode.cpp:268
narrow
std::string narrow(const wchar_t *s)
Definition: unicode.cpp:32
utf16_native_endian_to_utf8
std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
Definition: unicode.cpp:355
widen
std::wstring widen(const char *s)
Definition: unicode.cpp:57
utf8_to_utf16_native_endian
std::wstring utf8_to_utf16_native_endian(const std::string &in)
Convert UTF8-encoded string to UTF-16 with architecture-native endianness.
Definition: unicode.cpp:201
codepoint_hex_to_utf16_native_endian
char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex)
Definition: unicode.cpp:374
invariant.h
unicode.h
utf8_append_code
static void utf8_append_code(unsigned int c, std::string &result)
Appends a unicode character to a utf8-encoded string.
Definition: unicode.cpp:118
loc
#define loc()
Definition: ansi_c_lex.yy.cpp:4684
r
static int8_t r
Definition: irep_hash.h:59
narrow_argv
std::vector< std::string > narrow_argv(int argc, const wchar_t **argv_wide)
Definition: unicode.cpp:157
utf16_native_endian_to_java
static void utf16_native_endian_to_java(const wchar_t ch, std::ostringstream &result, const std::locale &loc)
Escapes non-printable characters, whitespace except for spaces, double- and single-quotes and backsla...
Definition: unicode.cpp:312
size_type
unsignedbv_typet size_type()
Definition: c_types.cpp:58
utf16_append_code
static void utf16_append_code(unsigned int code, std::wstring &result)
Definition: unicode.cpp:171
utf32_native_endian_to_utf8
std::string utf32_native_endian_to_utf8(const std::basic_string< unsigned int > &s)
Definition: unicode.cpp:145