generic_codecvt.hpp 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472
  1. //
  2. // Copyright (c) 2015 Artyom Beilis (Tonkikh)
  3. // Copyright (c) 2021-2023 Alexander Grund
  4. //
  5. // Distributed under the Boost Software License, Version 1.0.
  6. // https://www.boost.org/LICENSE_1_0.txt
  7. #ifndef BOOST_LOCALE_GENERIC_CODECVT_HPP
  8. #define BOOST_LOCALE_GENERIC_CODECVT_HPP
  9. #include <boost/locale/utf.hpp>
  10. #include <cstdint>
  11. #include <locale>
  12. namespace boost { namespace locale {
  // The UTF-16 facet keeps a pending high surrogate (one 16-bit code unit) in the first
  // two bytes of the std::mbstate_t handed in by the caller, so the type must be at least that large.
  static_assert(sizeof(std::mbstate_t) >= 2, "std::mbstate_t is too small to store a UTF-16 code unit");
  14. namespace detail {
  // Copies exactly two bytes from src to dst, lowest address first.
  // Written by hand so that <cstring> does not have to be included just for std::memcpy.
  inline void copy_uint16_t(void* dst, const void* src)
  {
      const auto* in = static_cast<const unsigned char*>(src);
      auto* out = static_cast<unsigned char*>(dst);
      for(int i = 0; i < 2; ++i)
          out[i] = in[i];
  }
  23. inline uint16_t read_state(const std::mbstate_t& src)
  24. {
  25. uint16_t dst;
  26. copy_uint16_t(&dst, &src);
  27. return dst;
  28. }
  29. inline void write_state(std::mbstate_t& dst, const uint16_t src)
  30. {
  31. copy_uint16_t(&dst, &src);
  32. }
  33. } // namespace detail
  /// \brief Base class providing the constants shared by all generic_codecvt specializations
  class generic_codecvt_base {
  public:
      /// Conversion direction handed to `initial_state` of the derived implementation classes
      enum initial_convertion_state {
          to_unicode_state = 0,  ///< The state is going to be used by the to_unicode functions
          from_unicode_state = 1 ///< The state is going to be used by the from_unicode functions
      };
  };
  43. /// \brief Generic codecvt facet for various stateless encodings to UTF-16 and UTF-32 using wchar_t, char32_t
  44. /// and char16_t
  45. ///
  46. /// Implementations should derive from this class defining itself as CodecvtImpl and provide following members
  47. ///
  48. /// - `state_type` - a type of special object that allows to store intermediate cached data, for example `iconv_t`
  49. /// descriptor
  50. /// - `state_type initial_state(generic_codecvt_base::initial_convertion_state direction) const` - member function
  51. /// that creates initial state
  /// - `int max_encoding_length() const` - the maximal number of chars needed to represent one Unicode code point,
  ///   for example it is 4 for UTF-8 and 1 for ISO-8859-1
  /// - `utf::code_point to_unicode(state_type& state, const char*& begin, const char* end)` - extract the first code
  ///   point from the text in range [begin,end). In case of success begin shall point to the next character sequence
  ///   to be decoded. In case of an incomplete sequence utf::incomplete shall be returned, and in case
  ///   of an invalid input sequence utf::illegal shall be returned and begin shall remain unmodified
  58. /// - `utf::len_or_error from_unicode(state_type &state, utf::code_point u, char* begin, const char* end)` - convert
  59. /// a Unicode code point `u` into a character sequence at [begin,end). Return the length of the sequence in case of
  60. /// success, utf::incomplete in case of not enough room to encode the code point, or utf::illegal in case conversion
  61. /// can not be performed
  62. ///
  63. ///
  64. /// For example implementation of codecvt for latin1/ISO-8859-1 character set
  65. ///
  66. /// \code
  67. ///
  68. /// template<typename CharType>
  /// class latin1_codecvt: public boost::locale::generic_codecvt<CharType,latin1_codecvt<CharType> >
  70. /// {
  71. /// public:
  72. ///
  73. /// /* Standard codecvt constructor */
  74. /// latin1_codecvt(size_t refs = 0): boost::locale::generic_codecvt<CharType,latin1_codecvt<CharType> >(refs)
  75. /// {
  76. /// }
  77. ///
  78. /// /* State is unused but required by generic_codecvt */
  79. /// struct state_type {};
  80. ///
  81. /// state_type initial_state(generic_codecvt_base::initial_convertion_state /*unused*/) const
  82. /// {
  83. /// return state_type();
  84. /// }
  85. ///
  86. /// int max_encoding_length() const
  87. /// {
  88. /// return 1;
  89. /// }
  90. ///
  91. /// boost::locale::utf::code_point to_unicode(state_type&, const char*& begin, const char* end) const
  92. /// {
  93. /// if(begin == end)
  94. /// return boost::locale::utf::incomplete;
  95. /// return *begin++;
  96. /// }
  97. ///
  98. /// boost::locale::utf::len_or_error from_unicode(state_type&, boost::locale::utf::code_point u,
  99. /// char* begin, const char* end) const
  100. /// {
  101. /// if(u >= 256)
  102. /// return boost::locale::utf::illegal;
  103. /// if(begin == end)
  104. /// return boost::locale::utf::incomplete;
  105. /// *begin = u;
  106. /// return 1;
  107. /// }
  108. /// };
  109. ///
  110. /// \endcode
  111. ///
  /// When external tools are used for the encoding conversion, the `state_type` is useful to store the objects
  /// used for the conversion. For example, icu::UConverter can be saved in such a state for efficient use:
  114. ///
  115. /// \code
  116. /// template<typename CharType>
  /// class icu_codecvt: public boost::locale::generic_codecvt<CharType,icu_codecvt<CharType>>
  118. /// {
  119. /// public:
  120. ///
  121. /// /* Standard codecvt constructor */
  /// icu_codecvt(std::string const &name, size_t refs = 0):
  123. /// boost::locale::generic_codecvt<CharType,icu_codecvt<CharType>>(refs)
  124. /// { ... }
  125. ///
  126. /// using state_type = std::unique_ptr<UConverter,void (*)(UConverter*)>;
  127. ///
  128. /// state_type initial_state(generic_codecvt_base::initial_convertion_state /*unused*/) const
  129. /// {
  130. /// UErrorCode err = U_ZERO_ERROR;
  131. /// return state_type(ucnv_safeClone(converter_,0,0,&err),ucnv_close);
  132. /// }
  133. ///
  134. /// boost::locale::utf::code_point to_unicode(state_type &ptr,char const *&begin,char const *end) const
  135. /// {
  136. /// UErrorCode err = U_ZERO_ERROR;
  137. /// boost::locale::utf::code_point cp = ucnv_getNextUChar(ptr.get(),&begin,end,&err);
  138. /// ...
  139. /// }
  140. /// ...
  141. /// };
  142. /// \endcode
  143. ///
  /// \tparam CharType the character type of the internal (Unicode) side of the facet
  /// \tparam CodecvtImpl the derived class providing the actual conversion functions
  /// \tparam CharSize size of CharType, defaults to sizeof(CharType); specializations exist for 2, 4 and
  ///         for plain `char` with size 1
  template<class CharType, class CodecvtImpl, int CharSize = sizeof(CharType)>
  class generic_codecvt;
  146. /// \brief UTF-16 to/from narrow char codecvt facet to use with char16_t or wchar_t on Windows
  147. ///
  148. /// Note in order to fit the requirements of usability by std::wfstream it uses mbstate_t
  149. /// to handle intermediate states in handling of variable length UTF-16 sequences
  150. ///
  151. /// Its member functions implement standard virtual functions of basic codecvt
  152. template<typename CharType, typename CodecvtImpl>
  153. class generic_codecvt<CharType, CodecvtImpl, 2> : public std::codecvt<CharType, char, std::mbstate_t>,
  154. public generic_codecvt_base {
  155. public:
  156. typedef CharType uchar;
  157. generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
  158. const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
  159. protected:
  160. std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
  161. {
  162. if(*reinterpret_cast<char*>(&s) != 0)
  163. return std::codecvt_base::error;
  164. next = from;
  165. return std::codecvt_base::ok;
  166. }
  167. int do_encoding() const noexcept override { return 0; }
  168. int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
  169. bool do_always_noconv() const noexcept override { return false; }
  170. int do_length(std::mbstate_t& std_state, const char* from, const char* from_end, size_t max) const override
  171. {
  172. bool state = *reinterpret_cast<char*>(&std_state) != 0;
  173. const char* save_from = from;
  174. auto cvt_state = implementation().initial_state(to_unicode_state);
  175. while(max > 0 && from < from_end) {
  176. const char* prev_from = from;
  177. const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
  178. if(ch == boost::locale::utf::incomplete || ch == boost::locale::utf::illegal) {
  179. from = prev_from;
  180. break;
  181. }
  182. max--;
  183. if(ch > 0xFFFF) {
  184. if(!state)
  185. from = prev_from;
  186. state = !state;
  187. }
  188. }
  189. *reinterpret_cast<char*>(&std_state) = state;
  190. return static_cast<int>(from - save_from);
  191. }
  192. std::codecvt_base::result do_in(std::mbstate_t& std_state,
  193. const char* from,
  194. const char* from_end,
  195. const char*& from_next,
  196. uchar* to,
  197. uchar* to_end,
  198. uchar*& to_next) const override
  199. {
  200. std::codecvt_base::result r = std::codecvt_base::ok;
  201. // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
  202. // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
  203. //
  204. // if 0/false no codepoint above >0xFFFF observed, else a codepoint above 0xFFFF was observed
  205. // and first pair is written, but no input consumed
  206. bool state = *reinterpret_cast<char*>(&std_state) != 0;
  207. auto cvt_state = implementation().initial_state(to_unicode_state);
  208. while(to < to_end && from < from_end) {
  209. const char* from_saved = from;
  210. utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
  211. if(ch == boost::locale::utf::illegal) {
  212. from = from_saved;
  213. r = std::codecvt_base::error;
  214. break;
  215. }
  216. if(ch == boost::locale::utf::incomplete) {
  217. from = from_saved;
  218. r = std::codecvt_base::partial;
  219. break;
  220. }
  221. // Normal codepoints go directly to stream
  222. if(ch <= 0xFFFF)
  223. *to++ = static_cast<uchar>(ch);
  224. else {
  225. // For other codepoints we do the following
  226. //
  227. // 1. We can't consume our input as we may find ourselves
  228. // in state where all input consumed but not all output written,i.e. only
  229. // 1st pair is written
  230. // 2. We only write first pair and mark this in the state, we also revert back
  231. // the from pointer in order to make sure this codepoint would be read
  232. // once again and then we would consume our input together with writing
  233. // second surrogate pair
  234. ch -= 0x10000;
  235. std::uint16_t w1 = static_cast<std::uint16_t>(0xD800 | (ch >> 10));
  236. std::uint16_t w2 = static_cast<std::uint16_t>(0xDC00 | (ch & 0x3FF));
  237. if(!state) {
  238. from = from_saved;
  239. *to++ = w1;
  240. } else
  241. *to++ = w2;
  242. state = !state;
  243. }
  244. }
  245. from_next = from;
  246. to_next = to;
  247. if(r == std::codecvt_base::ok && (from != from_end || state))
  248. r = std::codecvt_base::partial;
  249. *reinterpret_cast<char*>(&std_state) = state;
  250. return r;
  251. }
  252. std::codecvt_base::result do_out(std::mbstate_t& std_state,
  253. const uchar* from,
  254. const uchar* from_end,
  255. const uchar*& from_next,
  256. char* to,
  257. char* to_end,
  258. char*& to_next) const override
  259. {
  260. std::codecvt_base::result r = std::codecvt_base::ok;
  261. // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
  262. // according to standard. We assume that sizeof(mbstate_t) >=2 in order
  263. // to be able to store first observed surrogate pair
  264. //
  265. // State: state!=0 - a first surrogate pair was observed (state = first pair),
  266. // we expect the second one to come and then zero the state
  267. std::uint16_t state = detail::read_state(std_state);
  268. auto cvt_state = implementation().initial_state(from_unicode_state);
  269. while(to < to_end && from < from_end) {
  270. utf::code_point ch = 0;
  271. if(state != 0) {
  272. // if the state indicates that 1st surrogate pair was written
  273. // we should make sure that the second one that comes is actually
  274. // second surrogate
  275. std::uint16_t w1 = state;
  276. std::uint16_t w2 = *from;
  277. // we don't forward from as writing may fail to incomplete or
  278. // partial conversion
  279. if(0xDC00 <= w2 && w2 <= 0xDFFF) {
  280. std::uint16_t vh = w1 - 0xD800;
  281. std::uint16_t vl = w2 - 0xDC00;
  282. ch = ((uint32_t(vh) << 10) | vl) + 0x10000;
  283. } else {
  284. // Invalid surrogate
  285. r = std::codecvt_base::error;
  286. break;
  287. }
  288. } else {
  289. ch = *from;
  290. if(0xD800 <= ch && ch <= 0xDBFF) {
  291. // if this is a first surrogate pair we put
  292. // it into the state and consume it, note we don't
  293. // go forward as it should be illegal so we increase
  294. // the from pointer manually
  295. state = static_cast<uint16_t>(ch);
  296. from++;
  297. continue;
  298. } else if(0xDC00 <= ch && ch <= 0xDFFF) {
  299. // if we observe second surrogate pair and
  300. // first only may be expected we should break from the loop with error
  301. // as it is illegal input
  302. r = std::codecvt_base::error;
  303. break;
  304. }
  305. }
  306. if(!boost::locale::utf::is_valid_codepoint(ch)) {
  307. r = std::codecvt_base::error;
  308. break;
  309. }
  310. const utf::code_point len = implementation().from_unicode(cvt_state, ch, to, to_end);
  311. if(len == boost::locale::utf::incomplete) {
  312. r = std::codecvt_base::partial;
  313. break;
  314. } else if(len == boost::locale::utf::illegal) {
  315. r = std::codecvt_base::error;
  316. break;
  317. } else
  318. to += len;
  319. state = 0;
  320. from++;
  321. }
  322. from_next = from;
  323. to_next = to;
  324. if(r == std::codecvt_base::ok && (from != from_end || state != 0))
  325. r = std::codecvt_base::partial;
  326. detail::write_state(std_state, state);
  327. return r;
  328. }
  329. };
  330. /// \brief UTF-32 to/from narrow char codecvt facet to use with char32_t or wchar_t on POSIX platforms
  331. ///
  332. /// Its member functions implement standard virtual functions of basic codecvt.
  333. /// mbstate_t is not used for UTF-32 handling due to fixed length encoding
  334. template<typename CharType, typename CodecvtImpl>
  335. class generic_codecvt<CharType, CodecvtImpl, 4> : public std::codecvt<CharType, char, std::mbstate_t>,
  336. public generic_codecvt_base {
  337. public:
  338. typedef CharType uchar;
  339. generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
  340. const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
  341. protected:
  342. std::codecvt_base::result
  343. do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
  344. {
  345. next = from;
  346. return std::codecvt_base::ok;
  347. }
  348. int do_encoding() const noexcept override { return 0; }
  349. int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
  350. bool do_always_noconv() const noexcept override { return false; }
  351. int do_length(std::mbstate_t& /*state*/, const char* from, const char* from_end, size_t max) const override
  352. {
  353. const char* start_from = from;
  354. auto cvt_state = implementation().initial_state(to_unicode_state);
  355. while(max > 0 && from < from_end) {
  356. const char* save_from = from;
  357. const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
  358. if(ch == boost::locale::utf::incomplete || ch == boost::locale::utf::illegal) {
  359. from = save_from;
  360. break;
  361. }
  362. max--;
  363. }
  364. return static_cast<int>(from - start_from);
  365. }
  366. std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
  367. const char* from,
  368. const char* from_end,
  369. const char*& from_next,
  370. uchar* to,
  371. uchar* to_end,
  372. uchar*& to_next) const override
  373. {
  374. std::codecvt_base::result r = std::codecvt_base::ok;
  375. auto cvt_state = implementation().initial_state(to_unicode_state);
  376. while(to < to_end && from < from_end) {
  377. const char* from_saved = from;
  378. const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
  379. if(ch == boost::locale::utf::illegal) {
  380. r = std::codecvt_base::error;
  381. from = from_saved;
  382. break;
  383. }
  384. if(ch == boost::locale::utf::incomplete) {
  385. r = std::codecvt_base::partial;
  386. from = from_saved;
  387. break;
  388. }
  389. *to++ = ch;
  390. }
  391. from_next = from;
  392. to_next = to;
  393. if(r == std::codecvt_base::ok && from != from_end)
  394. r = std::codecvt_base::partial;
  395. return r;
  396. }
  397. std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
  398. const uchar* from,
  399. const uchar* from_end,
  400. const uchar*& from_next,
  401. char* to,
  402. char* to_end,
  403. char*& to_next) const override
  404. {
  405. std::codecvt_base::result r = std::codecvt_base::ok;
  406. auto cvt_state = implementation().initial_state(from_unicode_state);
  407. while(to < to_end && from < from_end) {
  408. const std::uint32_t ch = *from;
  409. if(!boost::locale::utf::is_valid_codepoint(ch)) {
  410. r = std::codecvt_base::error;
  411. break;
  412. }
  413. const utf::code_point len = implementation().from_unicode(cvt_state, ch, to, to_end);
  414. if(len == boost::locale::utf::incomplete) {
  415. r = std::codecvt_base::partial;
  416. break;
  417. } else if(len == boost::locale::utf::illegal) {
  418. r = std::codecvt_base::error;
  419. break;
  420. }
  421. to += len;
  422. from++;
  423. }
  424. from_next = from;
  425. to_next = to;
  426. if(r == std::codecvt_base::ok && from != from_end)
  427. r = std::codecvt_base::partial;
  428. return r;
  429. }
  430. };
  431. template<typename CodecvtImpl>
  432. class generic_codecvt<char, CodecvtImpl, 1> : public std::codecvt<char, char, std::mbstate_t>,
  433. public generic_codecvt_base {
  434. public:
  435. typedef char uchar;
  436. const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
  437. generic_codecvt(size_t refs = 0) : std::codecvt<char, char, std::mbstate_t>(refs) {}
  438. };
  439. }} // namespace boost::locale
  440. #endif