query.hpp 11 KB


  1. /*=============================================================================
  2. Copyright (c) 2001-2011 Joel de Guzman
  3. Distributed under the Boost Software License, Version 1.0. (See accompanying
  4. file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. Autogenerated by MultiStageTable.py (Unicode multi-stage
  6. table builder) (c) Peter Kankowski, 2008
  7. ==============================================================================*/
  8. #if !defined(BOOST_SPIRIT_UNICODE_QUERY_FEBRUARY_2_2010)
  9. #define BOOST_SPIRIT_UNICODE_QUERY_FEBRUARY_2_2010
  10. #include <boost/cstdint.hpp>
  11. # include "category_table.hpp"
  12. # include "script_table.hpp"
  13. # include "lowercase_table.hpp"
  14. # include "uppercase_table.hpp"
  15. namespace boost { namespace spirit { namespace ucd
  16. {
  17. // This header provides Basic (Level 1) Unicode Support
  18. // See http://unicode.org/reports/tr18/ for details
  // Groups the enumerations used to describe a code point: its general
  // category (major and detailed), its derived-property flags and its script.
  struct properties
  {
      // The general category occupies six bits laid out as MMMCCC, where MMM
      // is the major_category and CCC selects the category within it. The
      // derived_properties flags below use the bits above those six.
      enum major_category
      {
          letter      = 0,
          mark        = 1,
          number      = 2,
          separator   = 3,
          other       = 4,
          punctuation = 5,
          symbol      = 6
      };

      // Detailed general category. Each group starts at (major_category << 3),
      // so shifting a category right by three bits yields its major category.
      enum category
      {
          // Letters: values start at 0
          uppercase_letter = letter << 3,         // [Lu] an uppercase letter
          lowercase_letter,                       // [Ll] a lowercase letter
          titlecase_letter,                       // [Lt] a digraphic character, with first part uppercase
          modifier_letter,                        // [Lm] a modifier letter
          other_letter,                           // [Lo] other letters, including syllables and ideographs

          // Marks: values start at 8
          nonspacing_mark = mark << 3,            // [Mn] a nonspacing combining mark (zero advance width)
          enclosing_mark,                         // [Me] an enclosing combining mark
          spacing_mark,                           // [Mc] a spacing combining mark (positive advance width)

          // Numbers: values start at 16
          decimal_number = number << 3,           // [Nd] a decimal digit
          letter_number,                          // [Nl] a letterlike numeric character
          other_number,                           // [No] a numeric character of other type

          // Separators: values start at 24
          space_separator = separator << 3,       // [Zs] a space character (of various non-zero widths)
          line_separator,                         // [Zl] U+2028 LINE SEPARATOR only
          paragraph_separator,                    // [Zp] U+2029 PARAGRAPH SEPARATOR only

          // Other: values start at 32
          control = other << 3,                   // [Cc] a C0 or C1 control code
          format,                                 // [Cf] a format control character
          private_use,                            // [Co] a private-use character
          surrogate,                              // [Cs] a surrogate code point
          unassigned,                             // [Cn] a reserved unassigned code point or a noncharacter

          // Punctuation: values start at 40
          dash_punctuation = punctuation << 3,    // [Pd] a dash or hyphen punctuation mark
          open_punctuation,                       // [Ps] an opening punctuation mark (of a pair)
          close_punctuation,                      // [Pe] a closing punctuation mark (of a pair)
          connector_punctuation,                  // [Pc] a connecting punctuation mark, like a tie
          other_punctuation,                      // [Po] a punctuation mark of other type
          initial_punctuation,                    // [Pi] an initial quotation mark
          final_punctuation,                      // [Pf] a final quotation mark

          // Symbols: values start at 48
          math_symbol = symbol << 3,              // [Sm] a symbol of primarily mathematical use
          currency_symbol,                        // [Sc] a currency sign
          modifier_symbol,                        // [Sk] a non-letterlike modifier symbol
          other_symbol                            // [So] a symbol of other type
      };

      // Single-bit flags; each one sits above the six category bits.
      enum derived_properties
      {
          alphabetic                   = 1 << 6,  // 64
          uppercase                    = 1 << 7,  // 128
          lowercase                    = 1 << 8,  // 256
          white_space                  = 1 << 9,  // 512
          hex_digit                    = 1 << 10, // 1024
          noncharacter_code_point      = 1 << 11, // 2048
          default_ignorable_code_point = 1 << 12  // 4096
      };

      // Script identifiers, numbered consecutively from 0 in declaration
      // order. The order is significant: do not reorder or insert entries.
      enum script
      {
          adlam = 0,
          caucasian_albanian,
          ahom,
          arabic,
          imperial_aramaic,
          armenian,
          avestan,
          balinese,
          bamum,
          bassa_vah,
          batak,
          bengali,
          bhaiksuki,
          bopomofo,
          brahmi,
          braille,
          buginese,
          buhid,
          chakma,
          canadian_aboriginal,
          carian,
          cham,
          cherokee,
          chorasmian,
          coptic,
          cypro_minoan,
          cypriot,
          cyrillic,
          devanagari,
          dives_akuru,
          dogra,
          deseret,
          duployan,
          egyptian_hieroglyphs,
          elbasan,
          elymaic,
          ethiopic,
          georgian,
          glagolitic,
          gunjala_gondi,
          masaram_gondi,
          gothic,
          grantha,
          greek,
          gujarati,
          gurmukhi,
          hangul,
          han,
          hanunoo,
          hatran,
          hebrew,
          hiragana,
          anatolian_hieroglyphs,
          pahawh_hmong,
          nyiakeng_puachue_hmong,
          katakana_or_hiragana,
          old_hungarian,
          old_italic,
          javanese,
          kayah_li,
          katakana,
          kawi,
          kharoshthi,
          khmer,
          khojki,
          khitan_small_script,
          kannada,
          kaithi,
          tai_tham,
          lao,
          latin,
          lepcha,
          limbu,
          linear_a,
          linear_b,
          lisu,
          lycian,
          lydian,
          mahajani,
          makasar,
          mandaic,
          manichaean,
          marchen,
          medefaidrin,
          mende_kikakui,
          meroitic_cursive,
          meroitic_hieroglyphs,
          malayalam,
          modi,
          mongolian,
          mro,
          meetei_mayek,
          multani,
          myanmar,
          nag_mundari,
          nandinagari,
          old_north_arabian,
          nabataean,
          newa,
          nko,
          nushu,
          ogham,
          ol_chiki,
          old_turkic,
          oriya,
          osage,
          osmanya,
          old_uyghur,
          palmyrene,
          pau_cin_hau,
          old_permic,
          phags_pa,
          inscriptional_pahlavi,
          psalter_pahlavi,
          phoenician,
          miao,
          inscriptional_parthian,
          rejang,
          hanifi_rohingya,
          runic,
          samaritan,
          old_south_arabian,
          saurashtra,
          signwriting,
          shavian,
          sharada,
          siddham,
          khudawadi,
          sinhala,
          sogdian,
          old_sogdian,
          sora_sompeng,
          soyombo,
          sundanese,
          syloti_nagri,
          syriac,
          tagbanwa,
          takri,
          tai_le,
          new_tai_lue,
          tamil,
          tangut,
          tai_viet,
          telugu,
          tifinagh,
          tagalog,
          thaana,
          thai,
          tibetan,
          tirhuta,
          tangsa,
          toto,
          ugaritic,
          vai,
          vithkuqi,
          warang_citi,
          wancho,
          old_persian,
          cuneiform,
          yezidi,
          yi,
          zanabazar_square,
          inherited,
          common,
          unknown
      };
  };
  246. inline properties::category get_category(::boost::uint32_t ch)
  247. {
  248. return static_cast<properties::category>(detail::category_lookup(ch) & 0x3F);
  249. }
  250. inline properties::major_category get_major_category(::boost::uint32_t ch)
  251. {
  252. return static_cast<properties::major_category>(get_category(ch) >> 3);
  253. }
  254. inline bool is_punctuation(::boost::uint32_t ch)
  255. {
  256. return get_major_category(ch) == properties::punctuation;
  257. }
  258. inline bool is_decimal_number(::boost::uint32_t ch)
  259. {
  260. return get_category(ch) == properties::decimal_number;
  261. }
  262. inline bool is_hex_digit(::boost::uint32_t ch)
  263. {
  264. return (detail::category_lookup(ch) & properties::hex_digit) != 0;
  265. }
  266. inline bool is_control(::boost::uint32_t ch)
  267. {
  268. return get_category(ch) == properties::control;
  269. }
  270. inline bool is_alphabetic(::boost::uint32_t ch)
  271. {
  272. return (detail::category_lookup(ch) & properties::alphabetic) != 0;
  273. }
  274. inline bool is_alphanumeric(::boost::uint32_t ch)
  275. {
  276. return is_decimal_number(ch) || is_alphabetic(ch);
  277. }
  278. inline bool is_uppercase(::boost::uint32_t ch)
  279. {
  280. return (detail::category_lookup(ch) & properties::uppercase) != 0;
  281. }
  282. inline bool is_lowercase(::boost::uint32_t ch)
  283. {
  284. return (detail::category_lookup(ch) & properties::lowercase) != 0;
  285. }
  286. inline bool is_white_space(::boost::uint32_t ch)
  287. {
  288. return (detail::category_lookup(ch) & properties::white_space) != 0;
  289. }
  290. inline bool is_blank(::boost::uint32_t ch)
  291. {
  292. switch (ch)
  293. {
  294. case '\n': case '\v': case '\f': case '\r':
  295. return false;
  296. default:
  297. return is_white_space(ch)
  298. && !( get_category(ch) == properties::line_separator
  299. || get_category(ch) == properties::paragraph_separator
  300. );
  301. }
  302. }
  303. inline bool is_graph(::boost::uint32_t ch)
  304. {
  305. return !( is_white_space(ch)
  306. || get_category(ch) == properties::control
  307. || get_category(ch) == properties::surrogate
  308. || get_category(ch) == properties::unassigned
  309. );
  310. }
  311. inline bool is_print(::boost::uint32_t ch)
  312. {
  313. return (is_graph(ch) || is_blank(ch)) && !is_control(ch);
  314. }
  315. inline bool is_noncharacter_code_point(::boost::uint32_t ch)
  316. {
  317. return (detail::category_lookup(ch) & properties::noncharacter_code_point) != 0;
  318. }
  319. inline bool is_default_ignorable_code_point(::boost::uint32_t ch)
  320. {
  321. return (detail::category_lookup(ch) & properties::default_ignorable_code_point) != 0;
  322. }
  323. inline properties::script get_script(::boost::uint32_t ch)
  324. {
  325. return static_cast<properties::script>(detail::script_lookup(ch));
  326. }
  327. inline ::boost::uint32_t to_lowercase(::boost::uint32_t ch)
  328. {
  329. // The table returns 0 to signal that this code maps to itself
  330. ::boost::uint32_t r = detail::lowercase_lookup(ch);
  331. return (r == 0)? ch : r;
  332. }
  333. inline ::boost::uint32_t to_uppercase(::boost::uint32_t ch)
  334. {
  335. // The table returns 0 to signal that this code maps to itself
  336. ::boost::uint32_t r = detail::uppercase_lookup(ch);
  337. return (r == 0)? ch : r;
  338. }
  339. }}}
  340. #endif