unicode_iterator.hpp 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862
  1. /*
  2. *
  3. * Copyright (c) 2004
  4. * John Maddock
  5. *
  6. * Use, modification and distribution are subject to the
  7. * Boost Software License, Version 1.0. (See accompanying file
  8. * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  9. *
  10. */
  11. /*
  12. * LOCATION: see http://www.boost.org for most recent version.
  13. * FILE unicode_iterator.hpp
  14. * VERSION see <boost/version.hpp>
  15. * DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
  16. */
  17. /****************************************************************************
  18. Contents:
  19. ~~~~~~~~~
  20. 1) Read Only, Input Adapters:
  21. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  22. template <class BaseIterator, class U8Type = std::uint8_t>
  23. class u32_to_u8_iterator;
  24. Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
  25. template <class BaseIterator, class U32Type = std::uint32_t>
  26. class u8_to_u32_iterator;
  27. Adapts sequence of UTF-8 code points to "look like" a sequence of UTF-32.
  28. template <class BaseIterator, class U16Type = std::uint16_t>
  29. class u32_to_u16_iterator;
  30. Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
  31. template <class BaseIterator, class U32Type = std::uint32_t>
  32. class u16_to_u32_iterator;
  33. Adapts sequence of UTF-16 code points to "look like" a sequence of UTF-32.
  34. 2) Single pass output iterator adapters:
  35. template <class BaseIterator>
  36. class utf8_output_iterator;
  37. Accepts UTF-32 code points and forwards them on as UTF-8 code points.
  38. template <class BaseIterator>
  39. class utf16_output_iterator;
  40. Accepts UTF-32 code points and forwards them on as UTF-16 code points.
  41. ****************************************************************************/
  42. #ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP
  43. #define BOOST_REGEX_UNICODE_ITERATOR_HPP
#include <cstdint>
#include <boost/regex/config.hpp>
#include <cstddef>   // std::ptrdiff_t
#include <ios>
#include <iterator>  // std::iterator_traits, iterator tags, std::advance, std::distance
#include <limits.h> // CHAR_BIT
#include <sstream>
#include <stdexcept>
#ifndef BOOST_REGEX_STANDALONE
#include <boost/throw_exception.hpp>
#endif
  53. namespace boost{
  54. namespace detail{
  55. static const std::uint16_t high_surrogate_base = 0xD7C0u;
  56. static const std::uint16_t low_surrogate_base = 0xDC00u;
  57. static const std::uint32_t ten_bit_mask = 0x3FFu;
  58. inline bool is_high_surrogate(std::uint16_t v)
  59. {
  60. return (v & 0xFFFFFC00u) == 0xd800u;
  61. }
  62. inline bool is_low_surrogate(std::uint16_t v)
  63. {
  64. return (v & 0xFFFFFC00u) == 0xdc00u;
  65. }
  66. template <class T>
  67. inline bool is_surrogate(T v)
  68. {
  69. return (v & 0xFFFFF800u) == 0xd800;
  70. }
  71. inline unsigned utf8_byte_count(std::uint8_t c)
  72. {
  73. // if the most significant bit with a zero in it is in position
  74. // 8-N then there are N bytes in this UTF-8 sequence:
  75. std::uint8_t mask = 0x80u;
  76. unsigned result = 0;
  77. while(c & mask)
  78. {
  79. ++result;
  80. mask >>= 1;
  81. }
  82. return (result == 0) ? 1 : ((result > 4) ? 4 : result);
  83. }
  84. inline unsigned utf8_trailing_byte_count(std::uint8_t c)
  85. {
  86. return utf8_byte_count(c) - 1;
  87. }
  88. #ifdef BOOST_REGEX_MSVC
  89. #pragma warning(push)
  90. #pragma warning(disable:4100)
  91. #endif
  92. #ifndef BOOST_NO_EXCEPTIONS
  93. BOOST_REGEX_NORETURN
  94. #endif
  95. inline void invalid_utf32_code_point(std::uint32_t val)
  96. {
  97. std::stringstream ss;
  98. ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
  99. std::out_of_range e(ss.str());
  100. #ifndef BOOST_REGEX_STANDALONE
  101. boost::throw_exception(e);
  102. #else
  103. throw e;
  104. #endif
  105. }
  106. #ifdef BOOST_REGEX_MSVC
  107. #pragma warning(pop)
  108. #endif
  109. } // namespace detail
  110. template <class BaseIterator, class U16Type = std::uint16_t>
  111. class u32_to_u16_iterator
  112. {
  113. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  114. static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
  115. static_assert(sizeof(U16Type)*CHAR_BIT == 16, "Incorrectly sized template argument");
  116. public:
  117. typedef std::ptrdiff_t difference_type;
  118. typedef U16Type value_type;
  119. typedef value_type const* pointer;
  120. typedef value_type const reference;
  121. typedef std::bidirectional_iterator_tag iterator_category;
  122. reference operator*()const
  123. {
  124. if(m_current == 2)
  125. extract_current();
  126. return m_values[m_current];
  127. }
  128. bool operator==(const u32_to_u16_iterator& that)const
  129. {
  130. if(m_position == that.m_position)
  131. {
  132. // Both m_currents must be equal, or both even
  133. // this is the same as saying their sum must be even:
  134. return (m_current + that.m_current) & 1u ? false : true;
  135. }
  136. return false;
  137. }
  138. bool operator!=(const u32_to_u16_iterator& that)const
  139. {
  140. return !(*this == that);
  141. }
  142. u32_to_u16_iterator& operator++()
  143. {
  144. // if we have a pending read then read now, so that we know whether
  145. // to skip a position, or move to a low-surrogate:
  146. if(m_current == 2)
  147. {
  148. // pending read:
  149. extract_current();
  150. }
  151. // move to the next surrogate position:
  152. ++m_current;
  153. // if we've reached the end skip a position:
  154. if(m_values[m_current] == 0)
  155. {
  156. m_current = 2;
  157. ++m_position;
  158. }
  159. return *this;
  160. }
  161. u32_to_u16_iterator operator++(int)
  162. {
  163. u32_to_u16_iterator r(*this);
  164. ++(*this);
  165. return r;
  166. }
  167. u32_to_u16_iterator& operator--()
  168. {
  169. if(m_current != 1)
  170. {
  171. // decrementing an iterator always leads to a valid position:
  172. --m_position;
  173. extract_current();
  174. m_current = m_values[1] ? 1 : 0;
  175. }
  176. else
  177. {
  178. m_current = 0;
  179. }
  180. return *this;
  181. }
  182. u32_to_u16_iterator operator--(int)
  183. {
  184. u32_to_u16_iterator r(*this);
  185. --(*this);
  186. return r;
  187. }
  188. BaseIterator base()const
  189. {
  190. return m_position;
  191. }
  192. // construct:
  193. u32_to_u16_iterator() : m_position(), m_current(0)
  194. {
  195. m_values[0] = 0;
  196. m_values[1] = 0;
  197. m_values[2] = 0;
  198. }
  199. u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
  200. {
  201. m_values[0] = 0;
  202. m_values[1] = 0;
  203. m_values[2] = 0;
  204. }
  205. private:
  206. void extract_current()const
  207. {
  208. // begin by checking for a code point out of range:
  209. std::uint32_t v = *m_position;
  210. if(v >= 0x10000u)
  211. {
  212. if(v > 0x10FFFFu)
  213. detail::invalid_utf32_code_point(*m_position);
  214. // split into two surrogates:
  215. m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
  216. m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
  217. m_current = 0;
  218. BOOST_REGEX_ASSERT(detail::is_high_surrogate(m_values[0]));
  219. BOOST_REGEX_ASSERT(detail::is_low_surrogate(m_values[1]));
  220. }
  221. else
  222. {
  223. // 16-bit code point:
  224. m_values[0] = static_cast<U16Type>(*m_position);
  225. m_values[1] = 0;
  226. m_current = 0;
  227. // value must not be a surrogate:
  228. if(detail::is_surrogate(m_values[0]))
  229. detail::invalid_utf32_code_point(*m_position);
  230. }
  231. }
  232. BaseIterator m_position;
  233. mutable U16Type m_values[3];
  234. mutable unsigned m_current;
  235. };
  236. template <class BaseIterator, class U32Type = std::uint32_t>
  237. class u16_to_u32_iterator
  238. {
  239. // special values for pending iterator reads:
  240. static const U32Type pending_read = 0xffffffffu;
  241. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  242. static_assert(sizeof(base_value_type)*CHAR_BIT == 16, "Incorrectly sized template argument");
  243. static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
  244. public:
  245. typedef std::ptrdiff_t difference_type;
  246. typedef U32Type value_type;
  247. typedef value_type const* pointer;
  248. typedef value_type const reference;
  249. typedef std::bidirectional_iterator_tag iterator_category;
  250. reference operator*()const
  251. {
  252. if(m_value == pending_read)
  253. extract_current();
  254. return m_value;
  255. }
  256. bool operator==(const u16_to_u32_iterator& that)const
  257. {
  258. return m_position == that.m_position;
  259. }
  260. bool operator!=(const u16_to_u32_iterator& that)const
  261. {
  262. return !(*this == that);
  263. }
  264. u16_to_u32_iterator& operator++()
  265. {
  266. // skip high surrogate first if there is one:
  267. if(detail::is_high_surrogate(*m_position)) ++m_position;
  268. ++m_position;
  269. m_value = pending_read;
  270. return *this;
  271. }
  272. u16_to_u32_iterator operator++(int)
  273. {
  274. u16_to_u32_iterator r(*this);
  275. ++(*this);
  276. return r;
  277. }
  278. u16_to_u32_iterator& operator--()
  279. {
  280. --m_position;
  281. // if we have a low surrogate then go back one more:
  282. if(detail::is_low_surrogate(*m_position))
  283. --m_position;
  284. m_value = pending_read;
  285. return *this;
  286. }
  287. u16_to_u32_iterator operator--(int)
  288. {
  289. u16_to_u32_iterator r(*this);
  290. --(*this);
  291. return r;
  292. }
  293. BaseIterator base()const
  294. {
  295. return m_position;
  296. }
  297. // construct:
  298. u16_to_u32_iterator() : m_position()
  299. {
  300. m_value = pending_read;
  301. }
  302. u16_to_u32_iterator(BaseIterator b) : m_position(b)
  303. {
  304. m_value = pending_read;
  305. }
  306. //
  307. // Range checked version:
  308. //
  309. u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
  310. {
  311. m_value = pending_read;
  312. //
  313. // The range must not start with a low surrogate, or end in a high surrogate,
  314. // otherwise we run the risk of running outside the underlying input range.
  315. // Likewise b must not be located at a low surrogate.
  316. //
  317. std::uint16_t val;
  318. if(start != end)
  319. {
  320. if((b != start) && (b != end))
  321. {
  322. val = *b;
  323. if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
  324. invalid_code_point(val);
  325. }
  326. val = *start;
  327. if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
  328. invalid_code_point(val);
  329. val = *--end;
  330. if(detail::is_high_surrogate(val))
  331. invalid_code_point(val);
  332. }
  333. }
  334. private:
  335. static void invalid_code_point(std::uint16_t val)
  336. {
  337. std::stringstream ss;
  338. ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
  339. std::out_of_range e(ss.str());
  340. #ifndef BOOST_REGEX_STANDALONE
  341. boost::throw_exception(e);
  342. #else
  343. throw e;
  344. #endif
  345. }
  346. void extract_current()const
  347. {
  348. m_value = static_cast<U32Type>(static_cast< std::uint16_t>(*m_position));
  349. // if the last value is a high surrogate then adjust m_position and m_value as needed:
  350. if(detail::is_high_surrogate(*m_position))
  351. {
  352. // precondition; next value must have be a low-surrogate:
  353. BaseIterator next(m_position);
  354. std::uint16_t t = *++next;
  355. if((t & 0xFC00u) != 0xDC00u)
  356. invalid_code_point(t);
  357. m_value = (m_value - detail::high_surrogate_base) << 10;
  358. m_value |= (static_cast<U32Type>(static_cast< std::uint16_t>(t)) & detail::ten_bit_mask);
  359. }
  360. // postcondition; result must not be a surrogate:
  361. if(detail::is_surrogate(m_value))
  362. invalid_code_point(static_cast< std::uint16_t>(m_value));
  363. }
  364. BaseIterator m_position;
  365. mutable U32Type m_value;
  366. };
  367. template <class BaseIterator, class U8Type = std::uint8_t>
  368. class u32_to_u8_iterator
  369. {
  370. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  371. static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
  372. static_assert(sizeof(U8Type)*CHAR_BIT == 8, "Incorrectly sized template argument");
  373. public:
  374. typedef std::ptrdiff_t difference_type;
  375. typedef U8Type value_type;
  376. typedef value_type const* pointer;
  377. typedef value_type const reference;
  378. typedef std::bidirectional_iterator_tag iterator_category;
  379. reference operator*()const
  380. {
  381. if(m_current == 4)
  382. extract_current();
  383. return m_values[m_current];
  384. }
  385. bool operator==(const u32_to_u8_iterator& that)const
  386. {
  387. if(m_position == that.m_position)
  388. {
  389. // either the m_current's must be equal, or one must be 0 and
  390. // the other 4: which means neither must have bits 1 or 2 set:
  391. return (m_current == that.m_current)
  392. || (((m_current | that.m_current) & 3) == 0);
  393. }
  394. return false;
  395. }
  396. bool operator!=(const u32_to_u8_iterator& that)const
  397. {
  398. return !(*this == that);
  399. }
  400. u32_to_u8_iterator& operator++()
  401. {
  402. // if we have a pending read then read now, so that we know whether
  403. // to skip a position, or move to a low-surrogate:
  404. if(m_current == 4)
  405. {
  406. // pending read:
  407. extract_current();
  408. }
  409. // move to the next surrogate position:
  410. ++m_current;
  411. // if we've reached the end skip a position:
  412. if(m_values[m_current] == 0)
  413. {
  414. m_current = 4;
  415. ++m_position;
  416. }
  417. return *this;
  418. }
  419. u32_to_u8_iterator operator++(int)
  420. {
  421. u32_to_u8_iterator r(*this);
  422. ++(*this);
  423. return r;
  424. }
  425. u32_to_u8_iterator& operator--()
  426. {
  427. if((m_current & 3) == 0)
  428. {
  429. --m_position;
  430. extract_current();
  431. m_current = 3;
  432. while(m_current && (m_values[m_current] == 0))
  433. --m_current;
  434. }
  435. else
  436. --m_current;
  437. return *this;
  438. }
  439. u32_to_u8_iterator operator--(int)
  440. {
  441. u32_to_u8_iterator r(*this);
  442. --(*this);
  443. return r;
  444. }
  445. BaseIterator base()const
  446. {
  447. return m_position;
  448. }
  449. // construct:
  450. u32_to_u8_iterator() : m_position(), m_current(0)
  451. {
  452. m_values[0] = 0;
  453. m_values[1] = 0;
  454. m_values[2] = 0;
  455. m_values[3] = 0;
  456. m_values[4] = 0;
  457. }
  458. u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
  459. {
  460. m_values[0] = 0;
  461. m_values[1] = 0;
  462. m_values[2] = 0;
  463. m_values[3] = 0;
  464. m_values[4] = 0;
  465. }
  466. private:
  467. void extract_current()const
  468. {
  469. std::uint32_t c = *m_position;
  470. if(c > 0x10FFFFu)
  471. detail::invalid_utf32_code_point(c);
  472. if(c < 0x80u)
  473. {
  474. m_values[0] = static_cast<unsigned char>(c);
  475. m_values[1] = static_cast<unsigned char>(0u);
  476. m_values[2] = static_cast<unsigned char>(0u);
  477. m_values[3] = static_cast<unsigned char>(0u);
  478. }
  479. else if(c < 0x800u)
  480. {
  481. m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
  482. m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  483. m_values[2] = static_cast<unsigned char>(0u);
  484. m_values[3] = static_cast<unsigned char>(0u);
  485. }
  486. else if(c < 0x10000u)
  487. {
  488. m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
  489. m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  490. m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  491. m_values[3] = static_cast<unsigned char>(0u);
  492. }
  493. else
  494. {
  495. m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
  496. m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
  497. m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  498. m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  499. }
  500. m_current= 0;
  501. }
  502. BaseIterator m_position;
  503. mutable U8Type m_values[5];
  504. mutable unsigned m_current;
  505. };
  506. template <class BaseIterator, class U32Type = std::uint32_t>
  507. class u8_to_u32_iterator
  508. {
  509. // special values for pending iterator reads:
  510. static const U32Type pending_read = 0xffffffffu;
  511. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  512. static_assert(sizeof(base_value_type)*CHAR_BIT == 8, "Incorrectly sized template argument");
  513. static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
  514. public:
  515. typedef std::ptrdiff_t difference_type;
  516. typedef U32Type value_type;
  517. typedef value_type const* pointer;
  518. typedef value_type const reference;
  519. typedef std::bidirectional_iterator_tag iterator_category;
  520. reference operator*()const
  521. {
  522. if(m_value == pending_read)
  523. extract_current();
  524. return m_value;
  525. }
  526. bool operator==(const u8_to_u32_iterator& that)const
  527. {
  528. return m_position == that.m_position;
  529. }
  530. bool operator!=(const u8_to_u32_iterator& that)const
  531. {
  532. return !(*this == that);
  533. }
  534. u8_to_u32_iterator& operator++()
  535. {
  536. // We must not start with a continuation character:
  537. if((static_cast<std::uint8_t>(*m_position) & 0xC0) == 0x80)
  538. invalid_sequence();
  539. // skip high surrogate first if there is one:
  540. unsigned c = detail::utf8_byte_count(*m_position);
  541. if(m_value == pending_read)
  542. {
  543. // Since we haven't read in a value, we need to validate the code points:
  544. for(unsigned i = 0; i < c; ++i)
  545. {
  546. ++m_position;
  547. // We must have a continuation byte:
  548. if((i != c - 1) && ((static_cast<std::uint8_t>(*m_position) & 0xC0) != 0x80))
  549. invalid_sequence();
  550. }
  551. }
  552. else
  553. {
  554. std::advance(m_position, c);
  555. }
  556. m_value = pending_read;
  557. return *this;
  558. }
  559. u8_to_u32_iterator operator++(int)
  560. {
  561. u8_to_u32_iterator r(*this);
  562. ++(*this);
  563. return r;
  564. }
  565. u8_to_u32_iterator& operator--()
  566. {
  567. // Keep backtracking until we don't have a trailing character:
  568. unsigned count = 0;
  569. while((*--m_position & 0xC0u) == 0x80u) ++count;
  570. // now check that the sequence was valid:
  571. if(count != detail::utf8_trailing_byte_count(*m_position))
  572. invalid_sequence();
  573. m_value = pending_read;
  574. return *this;
  575. }
  576. u8_to_u32_iterator operator--(int)
  577. {
  578. u8_to_u32_iterator r(*this);
  579. --(*this);
  580. return r;
  581. }
  582. BaseIterator base()const
  583. {
  584. return m_position;
  585. }
  586. // construct:
  587. u8_to_u32_iterator() : m_position()
  588. {
  589. m_value = pending_read;
  590. }
  591. u8_to_u32_iterator(BaseIterator b) : m_position(b)
  592. {
  593. m_value = pending_read;
  594. }
  595. //
  596. // Checked constructor:
  597. //
  598. u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
  599. {
  600. m_value = pending_read;
  601. //
  602. // We must not start with a continuation character, or end with a
  603. // truncated UTF-8 sequence otherwise we run the risk of going past
  604. // the start/end of the underlying sequence:
  605. //
  606. if(start != end)
  607. {
  608. unsigned char v = *start;
  609. if((v & 0xC0u) == 0x80u)
  610. invalid_sequence();
  611. if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u))
  612. invalid_sequence();
  613. BaseIterator pos = end;
  614. do
  615. {
  616. v = *--pos;
  617. }
  618. while((start != pos) && ((v & 0xC0u) == 0x80u));
  619. std::ptrdiff_t extra = detail::utf8_byte_count(v);
  620. if(std::distance(pos, end) < extra)
  621. invalid_sequence();
  622. }
  623. }
  624. private:
  625. static void invalid_sequence()
  626. {
  627. std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
  628. #ifndef BOOST_REGEX_STANDALONE
  629. boost::throw_exception(e);
  630. #else
  631. throw e;
  632. #endif
  633. }
  634. void extract_current()const
  635. {
  636. m_value = static_cast<U32Type>(static_cast< std::uint8_t>(*m_position));
  637. // we must not have a continuation character:
  638. if((m_value & 0xC0u) == 0x80u)
  639. invalid_sequence();
  640. // see how many extra bytes we have:
  641. unsigned extra = detail::utf8_trailing_byte_count(*m_position);
  642. // extract the extra bits, 6 from each extra byte:
  643. BaseIterator next(m_position);
  644. for(unsigned c = 0; c < extra; ++c)
  645. {
  646. ++next;
  647. m_value <<= 6;
  648. // We must have a continuation byte:
  649. if((static_cast<std::uint8_t>(*next) & 0xC0) != 0x80)
  650. invalid_sequence();
  651. m_value += static_cast<std::uint8_t>(*next) & 0x3Fu;
  652. }
  653. // we now need to remove a few of the leftmost bits, but how many depends
  654. // upon how many extra bytes we've extracted:
  655. static const std::uint32_t masks[4] =
  656. {
  657. 0x7Fu,
  658. 0x7FFu,
  659. 0xFFFFu,
  660. 0x1FFFFFu,
  661. };
  662. m_value &= masks[extra];
  663. // check the result is in range:
  664. if(m_value > static_cast<U32Type>(0x10FFFFu))
  665. invalid_sequence();
  666. // The result must not be a surrogate:
  667. if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
  668. invalid_sequence();
  669. // We should not have had an invalidly encoded UTF8 sequence:
  670. if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
  671. invalid_sequence();
  672. }
  673. BaseIterator m_position;
  674. mutable U32Type m_value;
  675. };
  676. template <class BaseIterator>
  677. class utf16_output_iterator
  678. {
  679. public:
  680. typedef void difference_type;
  681. typedef void value_type;
  682. typedef std::uint32_t* pointer;
  683. typedef std::uint32_t& reference;
  684. typedef std::output_iterator_tag iterator_category;
  685. utf16_output_iterator(const BaseIterator& b)
  686. : m_position(b){}
  687. utf16_output_iterator(const utf16_output_iterator& that)
  688. : m_position(that.m_position){}
  689. utf16_output_iterator& operator=(const utf16_output_iterator& that)
  690. {
  691. m_position = that.m_position;
  692. return *this;
  693. }
  694. const utf16_output_iterator& operator*()const
  695. {
  696. return *this;
  697. }
  698. void operator=(std::uint32_t val)const
  699. {
  700. push(val);
  701. }
  702. utf16_output_iterator& operator++()
  703. {
  704. return *this;
  705. }
  706. utf16_output_iterator& operator++(int)
  707. {
  708. return *this;
  709. }
  710. BaseIterator base()const
  711. {
  712. return m_position;
  713. }
  714. private:
  715. void push(std::uint32_t v)const
  716. {
  717. if(v >= 0x10000u)
  718. {
  719. // begin by checking for a code point out of range:
  720. if(v > 0x10FFFFu)
  721. detail::invalid_utf32_code_point(v);
  722. // split into two surrogates:
  723. *m_position++ = static_cast<std::uint16_t>(v >> 10) + detail::high_surrogate_base;
  724. *m_position++ = static_cast<std::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
  725. }
  726. else
  727. {
  728. // 16-bit code point:
  729. // value must not be a surrogate:
  730. if(detail::is_surrogate(v))
  731. detail::invalid_utf32_code_point(v);
  732. *m_position++ = static_cast<std::uint16_t>(v);
  733. }
  734. }
  735. mutable BaseIterator m_position;
  736. };
  737. template <class BaseIterator>
  738. class utf8_output_iterator
  739. {
  740. public:
  741. typedef void difference_type;
  742. typedef void value_type;
  743. typedef std::uint32_t* pointer;
  744. typedef std::uint32_t& reference;
  745. typedef std::output_iterator_tag iterator_category;
  746. utf8_output_iterator(const BaseIterator& b)
  747. : m_position(b){}
  748. utf8_output_iterator(const utf8_output_iterator& that)
  749. : m_position(that.m_position){}
  750. utf8_output_iterator& operator=(const utf8_output_iterator& that)
  751. {
  752. m_position = that.m_position;
  753. return *this;
  754. }
  755. const utf8_output_iterator& operator*()const
  756. {
  757. return *this;
  758. }
  759. void operator=(std::uint32_t val)const
  760. {
  761. push(val);
  762. }
  763. utf8_output_iterator& operator++()
  764. {
  765. return *this;
  766. }
  767. utf8_output_iterator& operator++(int)
  768. {
  769. return *this;
  770. }
  771. BaseIterator base()const
  772. {
  773. return m_position;
  774. }
  775. private:
  776. void push(std::uint32_t c)const
  777. {
  778. if(c > 0x10FFFFu)
  779. detail::invalid_utf32_code_point(c);
  780. if(c < 0x80u)
  781. {
  782. *m_position++ = static_cast<unsigned char>(c);
  783. }
  784. else if(c < 0x800u)
  785. {
  786. *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
  787. *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  788. }
  789. else if(c < 0x10000u)
  790. {
  791. *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
  792. *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  793. *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  794. }
  795. else
  796. {
  797. *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
  798. *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
  799. *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  800. *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  801. }
  802. }
  803. mutable BaseIterator m_position;
  804. };
  805. } // namespace boost
  806. #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP