CBMC
unicode.cpp
Go to the documentation of this file.
1 /*******************************************************************\
2 
3 Module:
4 
5 Author: Daniel Kroening, kroening@kroening.com
6 
7 \*******************************************************************/
8 
9 #include "unicode.h"
10 
11 #include <codecvt>
12 #include <iomanip>
13 #include <locale>
14 #include <sstream>
15 
16 #include "invariant.h"
17 
18 #ifdef _WIN32
19 # include <util/pragma_push.def>
20 # ifdef _MSC_VER
21 # pragma warning(disable : 4668)
22 // using #if/#elif on undefined macro
23 # pragma warning(disable : 5039)
24 // pointer or reference to potentially throwing function passed to extern C
25 # endif
26 # include <util/pragma_pop.def>
27 # include <windows.h>
28 #endif
29 
30 static void utf8_append_code(unsigned int c, std::string &);
31 
32 std::string narrow(const wchar_t *s)
33 {
34 #ifdef _WIN32
35 
36  int slength = static_cast<int>(wcslen(s));
37  int rlength =
38  WideCharToMultiByte(CP_UTF8, 0, s, slength, NULL, 0, NULL, NULL);
39  std::string r(rlength, 0);
40  WideCharToMultiByte(CP_UTF8, 0, s, slength, &r[0], rlength, NULL, NULL);
41  return r;
42 
43 #else
44  return narrow(std::wstring(s));
45 #endif
46 }
47 
48 std::wstring widen(const char *s)
49 {
50 #ifdef _WIN32
51 
52  int slength = static_cast<int>(strlen(s));
53  int rlength = MultiByteToWideChar(CP_UTF8, 0, s, slength, NULL, 0);
54  std::wstring r(rlength, 0);
55  MultiByteToWideChar(CP_UTF8, 0, s, slength, &r[0], rlength);
56  return r;
57 
58 #else
59  return widen(std::string(s));
60 #endif
61 }
62 
63 std::string narrow(const std::wstring &s)
64 {
65 #ifdef _WIN32
66 
67  int slength = static_cast<int>(s.size());
68  int rlength =
69  WideCharToMultiByte(CP_UTF8, 0, &s[0], slength, NULL, 0, NULL, NULL);
70  std::string r(rlength, 0);
71  WideCharToMultiByte(CP_UTF8, 0, &s[0], slength, &r[0], rlength, NULL, NULL);
72  return r;
73 
74 #else
75  std::string result;
76 
77  result.reserve(s.size()); // at least that long
78 
79  for(const auto codepoint : s)
80  utf8_append_code(codepoint, result);
81 
82  return result;
83 #endif
84 }
85 
86 std::wstring widen(const std::string &s)
87 {
88 #ifdef _WIN32
89 
90  int slength = static_cast<int>(s.size());
91  int rlength = MultiByteToWideChar(CP_UTF8, 0, &s[0], slength, NULL, 0);
92  std::wstring r(rlength, 0);
93  MultiByteToWideChar(CP_UTF8, 0, &s[0], slength, &r[0], rlength);
94  return r;
95 
96 #else
97  auto utf32 = utf8_to_utf32(std::string(s));
98 
99  std::wstring r;
100  r.reserve(utf32.size());
101  for(auto codepoint : utf32)
102  r += codepoint;
103  return r;
104 #endif
105 }
106 
/// Appends a unicode character to a utf8-encoded string.
/// Values above U+10FFFF are not Unicode code points and have no well-formed
/// UTF-8 encoding; they are replaced by U+FFFD (REPLACEMENT CHARACTER) so
/// that the output never contains a malformed lead byte. Surrogate values
/// (0xD800 to 0xDFFF) are encoded as given, without a check.
/// \param c: the code point to append
/// \param result: the string that receives the encoded bytes
static void utf8_append_code(unsigned int c, std::string &result)
{
  if(c > 0x10ffff)
    c = 0xfffd;

  if(c <= 0x7f)
  {
    // one byte: 0xxxxxxx
    result += static_cast<char>(c);
  }
  else if(c <= 0x7ff)
  {
    // two bytes: 110xxxxx 10xxxxxx
    result += static_cast<char>((c >> 6) | 0xc0);
    result += static_cast<char>((c & 0x3f) | 0x80);
  }
  else if(c <= 0xffff)
  {
    // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    result += static_cast<char>((c >> 12) | 0xe0);
    result += static_cast<char>(((c >> 6) & 0x3f) | 0x80);
    result += static_cast<char>((c & 0x3f) | 0x80);
  }
  else
  {
    // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    result += static_cast<char>((c >> 18) | 0xf0);
    result += static_cast<char>(((c >> 12) & 0x3f) | 0x80);
    result += static_cast<char>(((c >> 6) & 0x3f) | 0x80);
    result += static_cast<char>((c & 0x3f) | 0x80);
  }
}
132 
135 std::string
136 utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s)
137 {
138  std::string result;
139 
140  result.reserve(s.size()); // at least that long
141 
142  for(const auto c : s)
143  utf8_append_code(c, result);
144 
145  return result;
146 }
147 
148 std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide)
149 {
150  if(argv_wide == nullptr)
151  return std::vector<std::string>();
152 
153  std::vector<std::string> argv_narrow;
154  argv_narrow.reserve(argc);
155 
156  for(int i = 0; i != argc; ++i)
157  argv_narrow.push_back(narrow(argv_wide[i]));
158 
159  return argv_narrow;
160 }
161 
/// Appends a unicode code point to a UTF-16 encoded wide string.
/// Code points up to and including U+FFFF are appended as a single code
/// unit; larger ones are appended as a surrogate pair.
/// \param code: the code point to append
/// \param result: the string that receives the UTF-16 code unit(s)
static void utf16_append_code(unsigned int code, std::wstring &result)
{
  // we do not treat 0xD800 to 0xDFFF, although
  // they are not valid unicode symbols

  // U+FFFF itself still fits in one code unit, so the bound is inclusive.
  // With an exclusive bound it would reach the branch below, where
  // code - 0x10000 wraps around and yields a bogus surrogate pair.
  if(code <= 0xFFFF)
  {
    // code is encoded as one UTF16 character
    result += static_cast<wchar_t>(code);
  }
  else // code is encoded as two UTF16 characters
  {
    // if this is valid unicode, we have
    // code<=0x10FFFF
    // but let's not check it programmatically

    // encode the code in UTF16
    code = code - 0x10000;
    // high surrogate: top ten bits of the offset
    const uint16_t i1 = static_cast<uint16_t>(((code >> 10) & 0x3ff) | 0xD800);
    result += static_cast<wchar_t>(i1);
    // low surrogate: bottom ten bits of the offset
    const uint16_t i2 = static_cast<uint16_t>((code & 0x3ff) | 0xDC00);
    result += static_cast<wchar_t>(i2);
  }
}
186 
191 std::wstring utf8_to_utf16_native_endian(const std::string &in)
192 {
193  std::wstring result;
194  result.reserve(in.size());
195 
196  for(auto codepoint : utf8_to_utf32(in))
197  utf16_append_code(codepoint, result);
198 
199  return result;
200 }
201 
/// Convert UTF8-encoded string to UTF-32 with architecture-native endianness.
/// A lead byte that cannot be decoded (value above 0xF7, or too few bytes
/// left in the input for the sequence it announces) is replaced by a single
/// space character. Continuation bytes are not checked for the 10xxxxxx
/// pattern.
/// \param utf8_str: the UTF-8 encoded input
/// \return one char32_t per decoded (or replaced) character
std::u32string utf8_to_utf32(const std::string &utf8_str)
{
  std::u32string result;
  result.reserve(utf8_str.size());
  // index of the next byte of the input that has not been consumed yet
  std::size_t i = 0;
  while(i < utf8_str.size())
  {
    unsigned char c = utf8_str[i++];
    char32_t code = 0;
    // the ifs that follow find out how many UTF8 characters (1-4) store the
    // next unicode character. This is determined by the few most
    // significant bits.
    if(c <= 0x7F)
    {
      // if it's one character, then code is exactly the value
      code = c;
    }
    else if(c <= 0xDF && i < utf8_str.size())
    { // in other cases, we need to read the right number of chars and decode
      // note: if we wanted to make sure that we capture incorrect strings,
      // we should check that whatever follows first character starts with
      // bits 10.
      code = (c & 0x1Fu) << 6;
      c = utf8_str[i++];
      code += c & 0x3Fu;
    }
    else if(c <= 0xEF && i + 1 < utf8_str.size())
    {
      code = (c & 0xFu) << 12;
      c = utf8_str[i++];
      code += (c & 0x3Fu) << 6;
      c = utf8_str[i++];
      code += c & 0x3Fu;
    }
    else if(c <= 0xF7 && i + 2 < utf8_str.size())
    {
      code = (c & 0x7u) << 18;
      c = utf8_str[i++];
      code += (c & 0x3Fu) << 12;
      c = utf8_str[i++];
      code += (c & 0x3Fu) << 6;
      c = utf8_str[i++];
      code += c & 0x3Fu;
    }
    else
    {
      // The string is not a valid UTF8 string! Either it has some characters
      // missing from a multi-character unicode symbol, or it has a char with
      // too high value.
      // For now, let's replace the character with a space
      code = 32;
    }

    result.append(1, code);
  }

  return result;
}
263 
/// Escapes non-printable characters, whitespace except for spaces, double
/// quotes and backslashes.
/// \param ch: the character to append
/// \param result: stream that receives the (possibly escaped) character
/// \param loc: locale used to decide whether \p ch is printable
static void utf16_native_endian_to_java_string(
  const wchar_t ch,
  std::ostringstream &result,
  const std::locale &loc)
{
  // \u unicode characters are translated very early by the Java compiler and so
  // \u000a or \u000d would become a newline character in a char constant, which
  // is illegal. Instead use \n or \r.
  if(ch == '\n')
    result << "\\n";
  else if(ch == '\r')
    result << "\\r";
  // \f, \b and \t do not need to be escaped, but this will improve readability
  // of generated tests.
  else if(ch == '\f')
    result << "\\f";
  else if(ch == '\b')
    result << "\\b";
  else if(ch == '\t')
    result << "\\t";
  else if(ch <= 255 && isprint(ch, loc))
  {
    const auto uch = static_cast<unsigned char>(ch);
    // ", and \ need to be escaped, but not ' for java strings
    // e.g. "\"\\" needs escaping but "'" does not.
    if(uch == '"' || uch == '\\')
      result << '\\';
    result << uch;
  }
  else
  {
    // Format ch as a hexadecimal unicode character padded to four digits with
    // zeros.
    result << "\\u" << std::hex << std::setw(4) << std::setfill('0')
           << static_cast<unsigned int>(ch);
  }
}
309 
317  const wchar_t ch,
318  std::ostringstream &result,
319  const std::locale &loc)
320 {
321  if(ch == (wchar_t)'\'')
322  {
323  const auto uch = static_cast<unsigned char>(ch);
324  // ' needs to be escaped for java characters, e.g. '\''
325  result << '\\' << uch;
326  }
327  else
328  {
329  utf16_native_endian_to_java_string(ch, result, loc);
330  }
331 }
332 
335 std::string utf16_native_endian_to_java(const char16_t ch)
336 {
337  std::ostringstream result;
338  const std::locale loc;
339  utf16_native_endian_to_java(ch, result, loc);
340  return result.str();
341 }
342 
350 std::string utf16_native_endian_to_java_string(const std::wstring &in)
351 {
352  std::ostringstream result;
353  const std::locale loc;
354  for(const auto ch : in)
355  utf16_native_endian_to_java_string(ch, result, loc);
356  return result.str();
357 }
358 
359 std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
360 {
361  return utf16_native_endian_to_utf8(std::u16string(1, utf16_char));
362 }
363 
/// Convert a UTF-16 string to UTF-8 using the standard library converter.
/// \param utf16_str: the UTF-16 encoded input
/// \return the UTF-8 encoding of \p utf16_str
std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str)
{
#ifndef _MSC_VER
  using utf16_convertert =
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>;
  utf16_convertert converter;
  return converter.to_bytes(utf16_str);
#else
  // Workaround for Visual Studio bug, see
  // https://stackoverflow.com/questions/32055357
  using wide_convertert =
    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t>;
  const std::wstring wide_string(utf16_str.begin(), utf16_str.end());
  wide_convertert converter;
  return converter.to_bytes(wide_string);
#endif
}
377 
378 char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex)
379 {
380  PRECONDITION(hex.length() == 4);
381  return std::strtol(hex.c_str(), nullptr, 16);
382 }
383 
384 std::string codepoint_hex_to_utf8(const std::string &hex)
385 {
387 }
invariant.h
codepoint_hex_to_utf8
std::string codepoint_hex_to_utf8(const std::string &hex)
Definition: unicode.cpp:384
utf8_to_utf32
std::u32string utf8_to_utf32(const std::string &utf8_str)
Convert UTF8-encoded string to UTF-32 with architecture-native endianness.
Definition: unicode.cpp:205
PRECONDITION
#define PRECONDITION(CONDITION)
Definition: invariant.h:463
utf16_native_endian_to_java_string
static void utf16_native_endian_to_java_string(const wchar_t ch, std::ostringstream &result, const std::locale &loc)
Escapes non-printable characters, whitespace except for spaces, double quotes and backslashes.
Definition: unicode.cpp:272
narrow
std::string narrow(const wchar_t *s)
Definition: unicode.cpp:32
utf16_native_endian_to_utf8
std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
Definition: unicode.cpp:359
widen
std::wstring widen(const char *s)
Definition: unicode.cpp:48
utf8_to_utf16_native_endian
std::wstring utf8_to_utf16_native_endian(const std::string &in)
Convert UTF8-encoded string to UTF-16 with architecture-native endianness.
Definition: unicode.cpp:191
codepoint_hex_to_utf16_native_endian
char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex)
Definition: unicode.cpp:378
unicode.h
utf8_append_code
static void utf8_append_code(unsigned int c, std::string &)
Appends a unicode character to a utf8-encoded string.
Definition: unicode.cpp:109
r
static int8_t r
Definition: irep_hash.h:60
narrow_argv
std::vector< std::string > narrow_argv(int argc, const wchar_t **argv_wide)
Definition: unicode.cpp:148
utf16_native_endian_to_java
static void utf16_native_endian_to_java(const wchar_t ch, std::ostringstream &result, const std::locale &loc)
Escapes non-printable characters, whitespace except for spaces, double- and single-quotes and backslashes.
Definition: unicode.cpp:316
size_type
unsignedbv_typet size_type()
Definition: c_types.cpp:68
utf16_append_code
static void utf16_append_code(unsigned int code, std::wstring &result)
Definition: unicode.cpp:162
utf32_native_endian_to_utf8
std::string utf32_native_endian_to_utf8(const std::basic_string< unsigned int > &s)
Definition: unicode.cpp:136