utf8.hpp 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. /*=============================================================================
  2. Copyright (c) 2001-2011 Joel de Guzman
  3. Copyright (c) 2023 Nikita Kniazev
  4. Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. ==============================================================================*/
  7. #if !defined(BOOST_SPIRIT_UC_TYPES_NOVEMBER_23_2008_0840PM)
  8. #define BOOST_SPIRIT_UC_TYPES_NOVEMBER_23_2008_0840PM
  9. #if defined(_MSC_VER)
  10. #pragma once
  11. #endif
  12. #include <boost/config.hpp>
  13. #include <boost/cstdint.hpp>
  14. #include <boost/type_traits/make_unsigned.hpp>
  15. #include <string>
  16. namespace boost { namespace spirit
  17. {
  18. typedef ::boost::uint32_t ucs4_char;
  19. typedef char utf8_char;
  20. typedef std::basic_string<ucs4_char> ucs4_string;
  21. typedef std::basic_string<utf8_char> utf8_string;
  22. namespace detail {
  23. inline void utf8_put_encode(utf8_string& out, ucs4_char x)
  24. {
  25. // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf D90
  26. if (BOOST_UNLIKELY(x > 0x10FFFFul || (0xD7FFul < x && x < 0xE000ul)))
  27. x = 0xFFFDul;
  28. // Table 3-6. UTF-8 Bit Distribution
  29. if (x < 0x80ul) {
  30. out.push_back(static_cast<unsigned char>(x));
  31. }
  32. else if (x < 0x800ul) {
  33. out.push_back(static_cast<unsigned char>(0xC0ul + (x >> 6)));
  34. out.push_back(static_cast<unsigned char>(0x80ul + (x & 0x3Ful)));
  35. }
  36. else if (x < 0x10000ul) {
  37. out.push_back(static_cast<unsigned char>(0xE0ul + (x >> 12)));
  38. out.push_back(static_cast<unsigned char>(0x80ul + ((x >> 6) & 0x3Ful)));
  39. out.push_back(static_cast<unsigned char>(0x80ul + (x & 0x3Ful)));
  40. }
  41. else {
  42. out.push_back(static_cast<unsigned char>(0xF0ul + (x >> 18)));
  43. out.push_back(static_cast<unsigned char>(0x80ul + ((x >> 12) & 0x3Ful)));
  44. out.push_back(static_cast<unsigned char>(0x80ul + ((x >> 6) & 0x3Ful)));
  45. out.push_back(static_cast<unsigned char>(0x80ul + (x & 0x3Ful)));
  46. }
  47. }
  48. }
  49. template <typename Char>
  50. inline utf8_string to_utf8(Char value)
  51. {
  52. utf8_string result;
  53. typedef typename make_unsigned<Char>::type UChar;
  54. detail::utf8_put_encode(result, static_cast<UChar>(value));
  55. return result;
  56. }
  57. template <typename Char>
  58. inline utf8_string to_utf8(Char const* str)
  59. {
  60. utf8_string result;
  61. typedef typename make_unsigned<Char>::type UChar;
  62. while (*str)
  63. detail::utf8_put_encode(result, static_cast<UChar>(*str++));
  64. return result;
  65. }
  66. template <typename Char, typename Traits, typename Allocator>
  67. inline utf8_string
  68. to_utf8(std::basic_string<Char, Traits, Allocator> const& str)
  69. {
  70. utf8_string result;
  71. typedef typename make_unsigned<Char>::type UChar;
  72. for (Char const* ptr = str.data(),
  73. * end = ptr + str.size(); ptr < end; ++ptr)
  74. detail::utf8_put_encode(result, static_cast<UChar>(*ptr));
  75. return result;
  76. }
  77. // Assume wchar_t content is UTF-16 on MSVC, or mingw/wineg++ with -fshort-wchar
  78. #if defined(_MSC_VER) || defined(__SIZEOF_WCHAR_T__) && __SIZEOF_WCHAR_T__ == 2
  79. inline utf8_string to_utf8(wchar_t value)
  80. {
  81. utf8_string result;
  82. detail::utf8_put_encode(result, static_cast<make_unsigned<wchar_t>::type>(value));
  83. return result;
  84. }
  85. namespace detail {
  86. inline ucs4_char decode_utf16(wchar_t const*& s)
  87. {
  88. typedef make_unsigned<wchar_t>::type uwchar_t;
  89. uwchar_t x(*s);
  90. if (x < 0xD800ul || x > 0xDFFFul)
  91. return x;
  92. // expected high-surrogate
  93. if (BOOST_UNLIKELY((x >> 10) != 0x36ul))
  94. return 0xFFFDul;
  95. uwchar_t y(*++s);
  96. // expected low-surrogate
  97. if (BOOST_UNLIKELY((y >> 10) != 0x37ul))
  98. return 0xFFFDul;
  99. return ((x & 0x3FFul) << 10) + (y & 0x3FFul) + 0x10000ul;
  100. }
  101. }
  102. inline utf8_string to_utf8(wchar_t const* str)
  103. {
  104. utf8_string result;
  105. for (ucs4_char c; (c = detail::decode_utf16(str)) != ucs4_char(); ++str)
  106. detail::utf8_put_encode(result, c);
  107. return result;
  108. }
  109. template <typename Traits, typename Allocator>
  110. inline utf8_string
  111. to_utf8(std::basic_string<wchar_t, Traits, Allocator> const& str)
  112. {
  113. return to_utf8(str.c_str());
  114. }
  115. #endif
  116. }}
  117. #endif