utf8.hpp 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. /*=============================================================================
  2. Copyright (c) 2001-2014 Joel de Guzman
  3. Copyright (c) 2023 Nikita Kniazev
  4. Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. ==============================================================================*/
  7. #if !defined(BOOST_SPIRIT_X3_UC_TYPES_NOVEMBER_23_2008_0840PM)
  8. #define BOOST_SPIRIT_X3_UC_TYPES_NOVEMBER_23_2008_0840PM
  9. #include <boost/config.hpp>
  10. #include <type_traits>
  11. #include <string>
  12. namespace boost { namespace spirit { namespace x3
  13. {
  // Character and string types used by the UTF-8 conversion helpers:
  // a UCS-4 (UTF-32) code point / string and a UTF-8 code unit / string.
  using ucs4_char = char32_t;
  using utf8_char = char;
  using ucs4_string = std::basic_string<ucs4_char>;
  using utf8_string = std::basic_string<utf8_char>;
  18. namespace detail {
  19. inline void utf8_put_encode(utf8_string& out, ucs4_char x)
  20. {
  21. // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf D90
  22. if (BOOST_UNLIKELY(x > 0x10FFFFul || (0xD7FFul < x && x < 0xE000ul)))
  23. x = 0xFFFDul;
  24. // Table 3-6. UTF-8 Bit Distribution
  25. if (x < 0x80ul) {
  26. out.push_back(static_cast<unsigned char>(x));
  27. }
  28. else if (x < 0x800ul) {
  29. out.push_back(static_cast<unsigned char>(0xC0ul + (x >> 6)));
  30. out.push_back(static_cast<unsigned char>(0x80ul + (x & 0x3Ful)));
  31. }
  32. else if (x < 0x10000ul) {
  33. out.push_back(static_cast<unsigned char>(0xE0ul + (x >> 12)));
  34. out.push_back(static_cast<unsigned char>(0x80ul + ((x >> 6) & 0x3Ful)));
  35. out.push_back(static_cast<unsigned char>(0x80ul + (x & 0x3Ful)));
  36. }
  37. else {
  38. out.push_back(static_cast<unsigned char>(0xF0ul + (x >> 18)));
  39. out.push_back(static_cast<unsigned char>(0x80ul + ((x >> 12) & 0x3Ful)));
  40. out.push_back(static_cast<unsigned char>(0x80ul + ((x >> 6) & 0x3Ful)));
  41. out.push_back(static_cast<unsigned char>(0x80ul + (x & 0x3Ful)));
  42. }
  43. }
  44. }
  45. template <typename Char>
  46. inline utf8_string to_utf8(Char value)
  47. {
  48. utf8_string result;
  49. typedef typename std::make_unsigned<Char>::type UChar;
  50. detail::utf8_put_encode(result, static_cast<UChar>(value));
  51. return result;
  52. }
  53. template <typename Char>
  54. inline utf8_string to_utf8(Char const* str)
  55. {
  56. utf8_string result;
  57. typedef typename std::make_unsigned<Char>::type UChar;
  58. while (*str)
  59. detail::utf8_put_encode(result, static_cast<UChar>(*str++));
  60. return result;
  61. }
  62. template <typename Char, typename Traits, typename Allocator>
  63. inline utf8_string
  64. to_utf8(std::basic_string<Char, Traits, Allocator> const& str)
  65. {
  66. utf8_string result;
  67. typedef typename std::make_unsigned<Char>::type UChar;
  68. for (Char ch : str)
  69. detail::utf8_put_encode(result, static_cast<UChar>(ch));
  70. return result;
  71. }
  72. // Assume wchar_t content is UTF-16 on MSVC, or mingw/wineg++ with -fshort-wchar
  73. #if defined(_MSC_VER) || defined(__SIZEOF_WCHAR_T__) && __SIZEOF_WCHAR_T__ == 2
  74. inline utf8_string to_utf8(wchar_t value)
  75. {
  76. utf8_string result;
  77. detail::utf8_put_encode(result, static_cast<std::make_unsigned<wchar_t>::type>(value));
  78. return result;
  79. }
  80. namespace detail {
  81. inline ucs4_char decode_utf16(wchar_t const*& s)
  82. {
  83. typedef std::make_unsigned<wchar_t>::type uwchar_t;
  84. uwchar_t x(*s);
  85. if (x < 0xD800ul || x > 0xDFFFul)
  86. return x;
  87. // expected high-surrogate
  88. if (BOOST_UNLIKELY((x >> 10) != 0b110110ul))
  89. return 0xFFFDul;
  90. uwchar_t y(*++s);
  91. // expected low-surrogate
  92. if (BOOST_UNLIKELY((y >> 10) != 0b110111ul))
  93. return 0xFFFDul;
  94. return ((x & 0x3FFul) << 10) + (y & 0x3FFul) + 0x10000ul;
  95. }
  96. }
  97. inline utf8_string to_utf8(wchar_t const* str)
  98. {
  99. utf8_string result;
  100. for (ucs4_char c; (c = detail::decode_utf16(str)) != ucs4_char(); ++str)
  101. detail::utf8_put_encode(result, c);
  102. return result;
  103. }
  104. template <typename Traits, typename Allocator>
  105. inline utf8_string
  106. to_utf8(std::basic_string<wchar_t, Traits, Allocator> const& str)
  107. {
  108. return to_utf8(str.c_str());
  109. }
  110. #endif
  111. }}}
  112. #endif