util.hpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. //
  2. // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
  3. // Copyright (c) 2022-2023 Alexander Grund
  4. //
  5. // Distributed under the Boost Software License, Version 1.0.
  6. // https://www.boost.org/LICENSE_1_0.txt
  7. #ifndef BOOST_LOCALE_UTIL_HPP
  8. #define BOOST_LOCALE_UTIL_HPP
  9. #include <boost/locale/generator.hpp>
  10. #include <boost/locale/utf.hpp>
  11. #include <boost/assert.hpp>
  12. #include <cstdint>
  13. #include <locale>
  14. #include <memory>
  15. #include <typeinfo>
  16. namespace boost { namespace locale {
  /// \brief This namespace provides various utility functions useful for Boost.Locale's backend
  /// implementations
  19. namespace util {
  20. /// \brief Return default system locale name in POSIX format.
  21. ///
  22. /// This function tries to detect the locale using LC_ALL, LC_CTYPE and LANG environment
  23. /// variables in this order and if all of them are unset, on POSIX platforms it returns "C".
  24. /// On Windows additionally to the above environment variables, this function
  25. /// tries to create the locale name from ISO-639 and ISO-3166 country codes defined
  26. /// for the users default locale.
  27. /// If \a use_utf8_on_windows is true it sets the encoding to UTF-8,
  28. /// otherwise, if the system locale supports ANSI codepages it defines the ANSI encoding, e.g. windows-1252,
  29. /// otherwise (if ANSI codepage is not available) it uses UTF-8 encoding.
  30. BOOST_LOCALE_DECL
  31. std::string get_system_locale(bool use_utf8_on_windows = false);
  32. /// \brief Installs information facet to locale \a in based on locale name \a name
  33. ///
  34. /// This function installs boost::locale::info facet into the locale \a in and returns
  35. /// newly created locale.
  36. ///
  37. /// Note: all information is based only on parsing of string \a name;
  38. ///
  39. /// The name has following format: language[_COUNTRY][.encoding][\@variant]
  40. /// Where language is ISO-639 language code like "en" or "ru", COUNTRY is ISO-3166
  41. /// country identifier like "US" or "RU". the Encoding is a character set name
  42. /// like UTF-8 or ISO-8859-1. Variant is backend specific variant like \c euro or
  43. /// calendar=hebrew.
  44. ///
  45. /// If some parameters are missing they are specified as blanks, default encoding
  46. /// is assumed to be US-ASCII and missing language is assumed to be "C"
  47. BOOST_LOCALE_DECL
  48. std::locale create_info(const std::locale& in, const std::string& name);
  49. /// \brief This class represent a simple stateless converter from UCS-4 and to UCS-4 for
  50. /// each single code point
  51. ///
  52. /// This class is used for creation of std::codecvt facet for converting utf-16/utf-32 encoding
  53. /// to encoding supported by this converter
  54. ///
  55. /// Please note, this converter should be fully stateless. Fully stateless means it should
  56. /// never assume that it is called in any specific order on the text. Even if the
  57. /// encoding itself seems to be stateless like windows-1255 or shift-jis, some
  58. /// encoders (most notably iconv) can actually compose several code-point into one or
  59. /// decompose them in case composite characters are found. So be very careful when implementing
  60. /// these converters for certain character set.
  61. class BOOST_LOCALE_DECL base_converter {
  62. public:
  63. /// This value should be returned when an illegal input sequence or code-point is observed:
  64. /// For example if a UCS-32 code-point is in the range reserved for UTF-16 surrogates
  65. /// or an invalid UTF-8 sequence is found
  66. static constexpr utf::code_point illegal = utf::illegal;
  67. /// This value is returned in following cases: An incomplete input sequence was found or
  68. /// insufficient output buffer was provided so complete output could not be written.
  69. static constexpr utf::code_point incomplete = utf::incomplete;
  70. virtual ~base_converter();
  71. /// Return the maximal length that one Unicode code-point can be converted to, for example
  72. /// for UTF-8 it is 4, for Shift-JIS it is 2 and ISO-8859-1 is 1
  73. virtual int max_len() const { return 1; }
  74. /// Returns true if calling the functions from_unicode, to_unicode, and max_len is thread safe.
  75. ///
  76. /// Rule of thumb: if this class' implementation uses simple tables that are unchanged
  77. /// or is purely algorithmic like UTF-8 - so it does not share any mutable bit for
  78. /// independent to_unicode, from_unicode calls, you may set it to true, otherwise,
  79. /// for example if you use iconv_t descriptor or UConverter as conversion object return false,
  80. /// and this object will be cloned for each use.
  81. virtual bool is_thread_safe() const { return false; }
  82. /// Create a polymorphic copy of this object, usually called only if is_thread_safe() return false
  83. virtual base_converter* clone() const
  84. {
  85. BOOST_ASSERT(typeid(*this) == typeid(base_converter));
  86. return new base_converter();
  87. }
  88. /// Convert a single character starting at begin and ending at most at end to Unicode code-point.
  89. ///
  90. /// if valid input sequence found in [\a begin,\a code_point_end) such as \a begin < \a code_point_end && \a
  91. /// code_point_end <= \a end it is converted to its Unicode code point equivalent, \a begin is set to \a
  92. /// code_point_end
  93. ///
  94. /// if incomplete input sequence found in [\a begin,\a end), i.e. there my be such \a code_point_end that \a
  95. /// code_point_end > \a end and [\a begin, \a code_point_end) would be valid input sequence, then \a
  96. /// incomplete is returned begin stays unchanged, for example for UTF-8 conversion a *begin = 0xc2, \a begin
  97. /// +1 = \a end is such situation.
  98. ///
  99. /// if invalid input sequence found, i.e. there is a sequence [\a begin, \a code_point_end) such as \a
  100. /// code_point_end <= \a end that is illegal for this encoding, \a illegal is returned and begin stays
  101. /// unchanged. For example if *begin = 0xFF and begin < end for UTF-8, then \a illegal is returned.
  102. virtual utf::code_point to_unicode(const char*& begin, const char* end)
  103. {
  104. if(begin == end)
  105. return incomplete; // LCOV_EXCL_LINE
  106. unsigned char cp = *begin;
  107. if(cp <= 0x7F) {
  108. begin++;
  109. return cp;
  110. }
  111. return illegal;
  112. }
  113. /// Convert a single code-point \a u into encoding and store it in [begin,end) range.
  114. ///
  115. /// If u is invalid Unicode code-point, or it can not be mapped correctly to represented character set,
  116. /// \a illegal should be returned
  117. ///
  118. /// If u can be converted to a sequence of bytes c1, ... , cN (1<= N <= max_len() ) then
  119. ///
  120. /// -# If end - begin >= N, c1, ... cN are written starting at begin and N is returned
  121. /// -# If end - begin < N, incomplete is returned, it is unspecified what would be
  122. /// stored in bytes in range [begin,end)
  123. virtual utf::len_or_error from_unicode(utf::code_point u, char* begin, const char* end)
  124. {
  125. if(begin == end)
  126. return incomplete; // LCOV_EXCL_LINE
  127. if(u >= 0x80)
  128. return illegal;
  129. *begin = static_cast<char>(u);
  130. return 1;
  131. }
  132. };
  133. /// This function creates a \a base_converter that can be used for conversion between UTF-8 and
  134. /// Unicode code points
  135. BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_utf8_converter();
  136. BOOST_DEPRECATED("This function is deprecated, use 'create_utf8_converter()'")
  137. inline std::unique_ptr<base_converter> create_utf8_converter_unique_ptr()
  138. {
  139. return create_utf8_converter();
  140. }
  141. /// This function creates a \a base_converter that can be used for conversion between single byte
  142. /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points,
  143. ///
  144. /// If \a encoding is not supported, empty pointer is returned.
  145. /// So you should check whether the returned pointer is valid/non-NULL
  146. BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_simple_converter(const std::string& encoding);
  147. BOOST_DEPRECATED("This function is deprecated, use 'create_simple_converter()'")
  148. inline std::unique_ptr<base_converter> create_simple_converter_unique_ptr(const std::string& encoding)
  149. {
  150. return create_simple_converter(encoding);
  151. }
  152. /// Install codecvt facet into locale \a in and return new locale that is based on \a in and uses new
  153. /// facet.
  154. ///
  155. /// codecvt facet would convert between narrow and wide/char16_t/char32_t encodings using \a cvt converter.
  156. /// If \a cvt is null pointer, always failure conversion would be used that fails on every first input or
  157. /// output.
  158. ///
  159. /// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings, it knows to break and join
  160. /// Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should be unaware
  161. /// of wide encoding type
  162. BOOST_LOCALE_DECL
  163. std::locale create_codecvt(const std::locale& in, std::unique_ptr<base_converter> cvt, char_facet_t type);
  164. BOOST_DEPRECATED("This function is deprecated, use 'create_codecvt()'")
  165. inline std::locale create_codecvt_from_pointer(const std::locale& in, base_converter* cvt, char_facet_t type)
  166. {
  167. return create_codecvt(in, std::unique_ptr<base_converter>(cvt), type);
  168. }
  169. BOOST_DEPRECATED("This function is deprecated, use 'create_utf8_converter()'")
  170. BOOST_LOCALE_DECL base_converter* create_utf8_converter_new_ptr();
  171. BOOST_DEPRECATED("This function is deprecated, use 'create_simple_converter()'")
  172. BOOST_LOCALE_DECL base_converter* create_simple_converter_new_ptr(const std::string& encoding);
  173. /// Install utf8 codecvt to UTF-16 or UTF-32 into locale \a in and return
  174. /// new locale that is based on \a in and uses new facet.
  175. BOOST_LOCALE_DECL
  176. std::locale create_utf8_codecvt(const std::locale& in, char_facet_t type);
  177. /// This function installs codecvt that can be used for conversion between single byte
  178. /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points,
  179. ///
  180. /// \throws boost::locale::conv::invalid_charset_error: Character set is not supported or isn't a single
  181. /// byte character set
  182. BOOST_LOCALE_DECL
  183. std::locale create_simple_codecvt(const std::locale& in, const std::string& encoding, char_facet_t type);
  184. } // namespace util
  185. }} // namespace boost::locale
  186. #endif