CBMC
convert_string_literal.cpp
Go to the documentation of this file.
1 /*******************************************************************\
2 
3 Module: C/C++ Language Conversion
4 
5 Author: Daniel Kroening, kroening@kroening.com
6 
7 \*******************************************************************/
8 
11 
12 #include "convert_string_literal.h"
13 
14 #include <util/arith_tools.h>
15 #include <util/c_types.h>
16 #include <util/unicode.h>
17 #include <util/string_constant.h>
18 
19 #include "unescape_string.h"
20 
21 std::basic_string<unsigned int> convert_one_string_literal(
22  const std::string &src)
23 {
24  assert(src.size()>=2);
25 
26  if(src[0]=='u' && src[1]=='8')
27  {
28  assert(src[src.size()-1]=='"');
29  assert(src[2]=='"');
30 
31  std::basic_string<unsigned int> value=
32  unescape_wide_string(std::string(src, 3, src.size()-4));
33 
34  // turn into utf-8
35  const std::string utf8_value = utf32_native_endian_to_utf8(value);
36 
37  // pad into wide string
38  value.resize(utf8_value.size());
39  for(std::size_t i=0; i<utf8_value.size(); i++)
40  value[i]=utf8_value[i];
41 
42  return value;
43  }
44  else if(src[0]=='L' || src[0]=='u' || src[0]=='U')
45  {
46  assert(src[src.size()-1]=='"');
47  assert(src[1]=='"');
48 
49  return unescape_wide_string(std::string(src, 2, src.size()-3));
50  }
51  else
52  {
53  assert(src[0]=='"');
54  assert(src[src.size()-1]=='"');
55 
56  std::string char_value=
57  unescape_string(std::string(src, 1, src.size()-2));
58 
59  // pad into wide string
60  std::basic_string<unsigned int> value;
61  value.resize(char_value.size());
62  for(std::size_t i=0; i<char_value.size(); i++)
63  value[i]=char_value[i];
64 
65  return value;
66  }
67 }
68 
69 exprt convert_string_literal(const std::string &src)
70 {
71  // note that 'src' could be a concatenation of string literals,
72  // e.g., something like "asd" "xyz".
73  // GCC allows "asd" L"xyz"!
74 
75  std::basic_string<unsigned int> value;
76 
77  char wide=0;
78 
79  for(std::size_t i=0; i<src.size(); i++)
80  {
81  char ch=src[i];
82 
83  // skip whitespace/newline
84  if(ch!='L' && ch!='u' && ch!='U' && ch!='"')
85  continue;
86 
87  if(ch=='L')
88  wide=ch;
89  if((ch=='u' || ch=='U') && i+1<src.size() && src[i+1]=='"')
90  wide=ch;
91 
92  // find start of sequence
93  std::size_t j=src.find('"', i);
94  CHECK_RETURN(j != std::string::npos);
95 
96  // find end of sequence, considering escaping
97  for(++j; j<src.size() && src[j]!='"'; ++j)
98  if(src[j]=='\\') // skip next character
99  ++j;
100 
101  INVARIANT(j < src.size(), "non-terminated string constant '" + src + "'");
102 
103  std::string tmp_src=std::string(src, i, j-i+1);
104  std::basic_string<unsigned int> tmp_value=
106  value.append(tmp_value);
107  i=j;
108  }
109 
110  if(wide!=0)
111  {
112  // add implicit trailing zero
113  value.push_back(0);
114 
115  // L is wchar_t, u is char16_t, U is char32_t.
116  typet subtype;
117 
118  switch(wide)
119  {
120  case 'L': subtype=wchar_t_type(); break;
121  case 'u': subtype=char16_t_type(); break;
122  case 'U': subtype=char32_t_type(); break;
123  default: assert(false);
124  }
125 
126  exprt result=exprt(ID_array);
127  result.set(ID_C_string_constant, true);
128  result.type() =
129  array_typet(subtype, from_integer(value.size(), c_index_type()));
130 
131  result.operands().resize(value.size());
132  for(std::size_t i=0; i<value.size(); i++)
133  result.operands()[i]=from_integer(value[i], subtype);
134 
135  return result;
136  }
137  else
138  {
139  std::string char_value;
140 
141  char_value.resize(value.size());
142 
143  for(std::size_t i=0; i<value.size(); i++)
144  {
145  // Loss of data here if value[i]>255.
146  // gcc issues a warning in this case.
147  char_value[i]=value[i];
148  }
149 
150  return string_constantt(char_value);
151  }
152 }
arith_tools.h
CHECK_RETURN
#define CHECK_RETURN(CONDITION)
Definition: invariant.h:495
typet
The type of an expression, extends irept.
Definition: type.h:28
char32_t_type
unsignedbv_typet char32_t_type()
Definition: c_types.cpp:185
irept::find
const irept & find(const irep_idt &name) const
Definition: irep.cpp:106
convert_string_literal.h
string_constant.h
exprt
Base class for all expressions.
Definition: expr.h:55
char16_t_type
unsignedbv_typet char16_t_type()
Definition: c_types.cpp:175
string_constantt
Definition: string_constant.h:14
exprt::type
typet & type()
Return the type of the expression.
Definition: expr.h:84
convert_one_string_literal
std::basic_string< unsigned int > convert_one_string_literal(const std::string &src)
Definition: convert_string_literal.cpp:21
irept::set
void set(const irep_idt &name, const irep_idt &value)
Definition: irep.h:420
wchar_t_type
bitvector_typet wchar_t_type()
Definition: c_types.cpp:159
unescape_wide_string
std::basic_string< unsigned int > unescape_wide_string(const std::string &src)
Definition: unescape_string.cpp:156
unescape_string.h
array_typet
Arrays with given size.
Definition: std_types.h:762
from_integer
constant_exprt from_integer(const mp_integer &int_value, const typet &type)
Definition: arith_tools.cpp:100
unicode.h
exprt::operands
operandst & operands()
Definition: expr.h:94
INVARIANT
#define INVARIANT(CONDITION, REASON)
This macro uses the wrapper function 'invariant_violated_string'.
Definition: invariant.h:423
c_index_type
bitvector_typet c_index_type()
Definition: c_types.cpp:16
unescape_string
std::string unescape_string(const std::string &src)
Definition: unescape_string.cpp:151
c_types.h
convert_string_literal
exprt convert_string_literal(const std::string &src)
Definition: convert_string_literal.cpp:69
utf32_native_endian_to_utf8
std::string utf32_native_endian_to_utf8(const std::basic_string< unsigned int > &s)
Definition: unicode.cpp:136