Boost.Nowide
utf8_codecvt.hpp
1 //
2 // Copyright (c) 2015 Artyom Beilis (Tonkikh)
3 // Copyright (c) 2020 Alexander Grund
4 //
5 // Distributed under the Boost Software License, Version 1.0.
6 // https://www.boost.org/LICENSE_1_0.txt
7 
8 #ifndef BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
9 #define BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
10 
#include <boost/nowide/replacement.hpp>
#include <boost/nowide/utf/utf.hpp>
#include <cassert>
#include <cstdint>
#include <locale>
16 
17 namespace boost {
18 namespace nowide {
19 
// The codecvt facets in this header keep one pending 16-bit UTF-16 code unit in the
// first two bytes of the std::mbstate_t object, so it must be at least that large.
static_assert(sizeof(std::mbstate_t) >= 2, "mbstate_t is too small to store a UTF-16 code unit");
namespace detail {
    /// Copy exactly two bytes from \a src to \a dst.
    /// Done byte by byte so that <cstring> does not have to be included just for std::memcpy.
    inline void copy_uint16_t(void* dst, const void* src)
    {
        auto* out_bytes = static_cast<unsigned char*>(dst);
        const auto* in_bytes = static_cast<const unsigned char*>(src);
        for(int i = 0; i < 2; ++i)
            out_bytes[i] = in_bytes[i];
    }

    /// Return the 16-bit value held in the first two bytes of \a src.
    inline std::uint16_t read_state(const std::mbstate_t& src)
    {
        std::uint16_t value = 0;
        copy_uint16_t(&value, &src);
        return value;
    }

    /// Overwrite the first two bytes of \a dst with the 16-bit value \a src.
    inline void write_state(std::mbstate_t& dst, const std::uint16_t src)
    {
        copy_uint16_t(&dst, &src);
    }
} // namespace detail
41 
/// Primary template of the UTF-8 codecvt facet. It is only declared here;
/// this header defines the partial specializations for \a CharSize 2 and 4.
///
/// \tparam CharType Internal (wide) character type
/// \tparam CharSize Size of \a CharType in bytes; defaults to sizeof(CharType) and selects the specialization
template<typename CharType, int CharSize = sizeof(CharType)> class utf8_codecvt;
50 
51  BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_BEGIN
53  template<typename CharType>
54  class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 2> : public std::codecvt<CharType, char, std::mbstate_t>
55  {
56  public:
57  static_assert(sizeof(CharType) >= 2, "CharType must be able to store UTF16 code point");
58 
59  utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
60  {}
61  BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_END
62 
63  protected:
64  using uchar = CharType;
65 
66  std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
67  {
68  if(detail::read_state(s) != 0)
69  return std::codecvt_base::error;
70  next = from;
71  return std::codecvt_base::ok;
72  }
73  int do_encoding() const noexcept override
74  {
75  return 0;
76  }
77  int do_max_length() const noexcept override
78  {
79  return 4;
80  }
81  bool do_always_noconv() const noexcept override
82  {
83  return false;
84  }
85 
86  // LCOV_EXCL_START
87  int do_length(std::mbstate_t& std_state, const char* from, const char* from_end, size_t max) const override
88  {
89  // LCOV_EXCL_STOP
90  using utf16_traits = utf::utf_traits<uchar, 2>;
91  std::uint16_t state = detail::read_state(std_state);
92  const char* save_from = from;
93  if(state && max > 0)
94  {
95  max--;
96  state = 0;
97  }
98  while(max > 0 && from < from_end)
99  {
100  const char* prev_from = from;
101  std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
102  if(ch == utf::illegal)
103  {
105  } else if(ch == utf::incomplete)
106  {
107  from = prev_from;
108  break;
109  }
110  // If we can't write the char, we have to save the low surrogate in state
111  if(BOOST_LIKELY(static_cast<size_t>(utf16_traits::width(ch)) <= max))
112  {
113  max -= utf16_traits::width(ch);
114  } else
115  {
116  static_assert(utf16_traits::max_width == 2, "Required for below");
117  std::uint16_t tmpOut[2]{};
118  utf16_traits::encode(ch, tmpOut);
119  state = tmpOut[1];
120  break;
121  }
122  }
123  detail::write_state(std_state, state);
124  return static_cast<int>(from - save_from);
125  }
126 
127  std::codecvt_base::result do_in(std::mbstate_t& std_state, // LCOV_EXCL_LINE
128  const char* from,
129  const char* from_end,
130  const char*& from_next,
131  uchar* to,
132  uchar* to_end,
133  uchar*& to_next) const override
134  {
135  std::codecvt_base::result r = std::codecvt_base::ok;
136  using utf16_traits = utf::utf_traits<uchar, 2>;
137 
138  // mbstate_t is POD type and should be initialized to 0 (i.e. state = stateT())
139  // according to standard.
140  // We use it to store a low surrogate if it was not yet written, else state is 0
141  std::uint16_t state = detail::read_state(std_state);
142  // Write low surrogate if present
143  if(state && to < to_end)
144  {
145  *to++ = static_cast<CharType>(state);
146  state = 0;
147  }
148  while(to < to_end && from < from_end)
149  {
150  const char* from_saved = from;
151 
152  uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
153 
154  if(ch == utf::illegal)
155  {
157  } else if(ch == utf::incomplete)
158  {
159  from = from_saved;
160  r = std::codecvt_base::partial;
161  break;
162  }
163  // If the encoded char fits, write directly, else safe the low surrogate in state
164  if(BOOST_LIKELY(utf16_traits::width(ch) <= to_end - to))
165  {
166  to = utf16_traits::encode(ch, to);
167  } else
168  {
169  static_assert(utf16_traits::max_width == 2, "Required for below");
170  std::uint16_t tmpOut[2]{};
171  utf16_traits::encode(ch, tmpOut);
172  *to++ = static_cast<CharType>(tmpOut[0]);
173  state = tmpOut[1];
174  break;
175  }
176  }
177  from_next = from;
178  to_next = to;
179  if(r == std::codecvt_base::ok && (from != from_end || state != 0))
180  r = std::codecvt_base::partial;
181  detail::write_state(std_state, state);
182  return r;
183  }
184 
185  std::codecvt_base::result do_out(std::mbstate_t& std_state,
186  const uchar* from,
187  const uchar* from_end,
188  const uchar*& from_next,
189  char* to,
190  char* to_end,
191  char*& to_next) const override
192  {
193  std::codecvt_base::result r = std::codecvt_base::ok;
194  using utf16_traits = utf::utf_traits<uchar, 2>;
195  // mbstate_t is POD type and should be initialized to 0
196  // (i.e. state = stateT()) according to standard.
197  // We use it to store the first observed surrogate pair, or 0 if there is none yet
198  std::uint16_t state = detail::read_state(std_state);
199  for(; to < to_end && from < from_end; ++from)
200  {
201  std::uint32_t ch = 0;
202  if(state != 0)
203  {
204  // We have a high surrogate, so now there should be a low surrogate
205  std::uint16_t w1 = state;
206  std::uint16_t w2 = *from;
207  if(BOOST_LIKELY(utf16_traits::is_trail(w2)))
208  {
209  ch = utf16_traits::combine_surrogate(w1, w2);
210  } else
211  {
213  }
214  } else
215  {
216  std::uint16_t w1 = *from;
217  if(BOOST_LIKELY(utf16_traits::is_single_codepoint(w1)))
218  {
219  ch = w1;
220  } else if(BOOST_LIKELY(utf16_traits::is_first_surrogate(w1)))
221  {
222  // Store into state and continue at next character
223  state = w1;
224  continue;
225  } else
226  {
227  // Neither a single codepoint nor a high surrogate so must be low surrogate.
228  // This is an error -> Replace character
230  }
231  }
232  assert(utf::is_valid_codepoint(ch)); // Any valid UTF16 sequence is a valid codepoint
233  int len = utf::utf_traits<char>::width(ch);
234  if(to_end - to < len)
235  {
236  r = std::codecvt_base::partial;
237  break;
238  }
239  to = utf::utf_traits<char>::encode(ch, to);
240  state = 0;
241  }
242  from_next = from;
243  to_next = to;
244  if(r == std::codecvt_base::ok && (from != from_end || state != 0))
245  r = std::codecvt_base::partial;
246  detail::write_state(std_state, state);
247  return r;
248  }
249  };
250 
251  BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_BEGIN
253  template<typename CharType>
254  class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 4> : public std::codecvt<CharType, char, std::mbstate_t>
255  {
256  public:
257  utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
258  {}
259  BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_END
260 
261  protected:
262  using uchar = CharType;
263 
264  std::codecvt_base::result
265  do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
266  {
267  next = from;
268  return std::codecvt_base::noconv;
269  }
270  int do_encoding() const noexcept override
271  {
272  return 0;
273  }
274  int do_max_length() const noexcept override
275  {
276  return 4;
277  }
278  bool do_always_noconv() const noexcept override
279  {
280  return false;
281  }
282 
283  int do_length(std::mbstate_t& /*state*/, const char* from, const char* from_end, size_t max) const override
284  {
285  const char* start_from = from;
286 
287  while(max > 0 && from < from_end)
288  {
289  const char* save_from = from;
290  std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
291  if(ch == utf::incomplete)
292  {
293  from = save_from;
294  break;
295  } else if(ch == utf::illegal)
296  {
298  }
299  max--;
300  }
301  return static_cast<int>(from - start_from);
302  }
303 
304  std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
305  const char* from,
306  const char* from_end,
307  const char*& from_next,
308  uchar* to,
309  uchar* to_end,
310  uchar*& to_next) const override
311  {
312  std::codecvt_base::result r = std::codecvt_base::ok;
313 
314  while(to < to_end && from < from_end)
315  {
316  const char* from_saved = from;
317 
318  uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
319 
320  if(ch == utf::illegal)
321  {
323  } else if(ch == utf::incomplete)
324  {
325  r = std::codecvt_base::partial;
326  from = from_saved;
327  break;
328  }
329  *to++ = ch;
330  }
331  from_next = from;
332  to_next = to;
333  if(r == std::codecvt_base::ok && from != from_end)
334  r = std::codecvt_base::partial;
335  return r;
336  }
337 
338  std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
339  const uchar* from,
340  const uchar* from_end,
341  const uchar*& from_next,
342  char* to,
343  char* to_end,
344  char*& to_next) const override
345  {
346  std::codecvt_base::result r = std::codecvt_base::ok;
347  while(to < to_end && from < from_end)
348  {
349  std::uint32_t ch = 0;
350  ch = *from;
351  if(!utf::is_valid_codepoint(ch))
352  {
354  }
355  int len = utf::utf_traits<char>::width(ch);
356  if(to_end - to < len)
357  {
358  r = std::codecvt_base::partial;
359  break;
360  }
361  to = utf::utf_traits<char>::encode(ch, to);
362  from++;
363  }
364  from_next = from;
365  to_next = to;
366  if(r == std::codecvt_base::ok && from != from_end)
367  r = std::codecvt_base::partial;
368  return r;
369  }
370  };
371 
372 } // namespace nowide
373 } // namespace boost
374 
375 #endif
UTF Traits class - functions to convert UTF sequences to and from Unicode code points.
Definition: utf.hpp:56
static Iterator encode(code_point value, Iterator out)
Definition: utf8_codecvt.hpp:49
static const code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:32
#define BOOST_NOWIDE_REPLACEMENT_CHARACTER
Definition: replacement.hpp:15
static const code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:37
static int width(code_point value)
bool is_valid_codepoint(code_point v)
The function checks whether v is a valid code point.
Definition: utf.hpp:42
static code_point decode(Iterator &p, Iterator e)