string_token.hpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407
  1. // string_token.hpp
  2. // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. #ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_STRING_TOKEN_HPP
  7. #define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_STRING_TOKEN_HPP
  8. #include <algorithm>
  9. #include "size_t.hpp"
  10. #include "consts.hpp" // num_chars, num_wchar_ts
  11. #include <sstream>
  12. #include <string>
  13. #include <limits>
  14. namespace boost
  15. {
  16. namespace lexer
  17. {
  18. template<typename CharT>
  19. struct basic_string_token
  20. {
  21. typedef std::basic_string<CharT> string;
  22. bool _negated;
  23. string _charset;
  24. basic_string_token () :
  25. _negated (false)
  26. {
  27. }
  28. basic_string_token (const bool negated_, const string &charset_) :
  29. _negated (negated_),
  30. _charset (charset_)
  31. {
  32. }
  33. void remove_duplicates ()
  34. {
  35. const CharT *start_ = _charset.c_str ();
  36. const CharT *end_ = start_ + _charset.size ();
  37. // Optimisation for very large charsets:
  38. // sorting via pointers is much quicker than
  39. // via iterators...
  40. std::sort (const_cast<CharT *> (start_), const_cast<CharT *> (end_));
  41. _charset.erase (std::unique (_charset.begin (), _charset.end ()),
  42. _charset.end ());
  43. }
  44. void normalise ()
  45. {
  46. const std::size_t max_chars_ = sizeof (CharT) == 1 ?
  47. num_chars : num_wchar_ts;
  48. if (_charset.length () == max_chars_)
  49. {
  50. _negated = !_negated;
  51. _charset.clear ();
  52. }
  53. else if (_charset.length () > max_chars_ / 2)
  54. {
  55. negate ();
  56. }
  57. }
  58. void negate ()
  59. {
  60. const std::size_t max_chars_ = sizeof (CharT) == 1 ?
  61. num_chars : num_wchar_ts;
  62. CharT curr_char_ = (std::numeric_limits<CharT>::min)();
  63. string temp_;
  64. const CharT *curr_ = _charset.c_str ();
  65. const CharT *chars_end_ = curr_ + _charset.size ();
  66. _negated = !_negated;
  67. temp_.resize (max_chars_ - _charset.size ());
  68. CharT *ptr_ = const_cast<CharT *> (temp_.c_str ());
  69. std::size_t i_ = 0;
  70. while (curr_ < chars_end_)
  71. {
  72. while (*curr_ > curr_char_)
  73. {
  74. *ptr_ = curr_char_;
  75. ++ptr_;
  76. ++curr_char_;
  77. ++i_;
  78. }
  79. ++curr_char_;
  80. ++curr_;
  81. ++i_;
  82. }
  83. for (; i_ < max_chars_; ++i_)
  84. {
  85. *ptr_ = curr_char_;
  86. ++ptr_;
  87. ++curr_char_;
  88. }
  89. _charset = temp_;
  90. }
  91. bool operator < (const basic_string_token &rhs_) const
  92. {
  93. return _negated < rhs_._negated ||
  94. (_negated == rhs_._negated && _charset < rhs_._charset);
  95. }
  96. bool empty () const
  97. {
  98. return _charset.empty () && !_negated;
  99. }
  100. bool any () const
  101. {
  102. return _charset.empty () && _negated;
  103. }
  104. void clear ()
  105. {
  106. _negated = false;
  107. _charset.clear ();
  108. }
  109. void intersect (basic_string_token &rhs_, basic_string_token &overlap_)
  110. {
  111. if ((any () && rhs_.any ()) || (_negated == rhs_._negated &&
  112. !any () && !rhs_.any ()))
  113. {
  114. intersect_same_types (rhs_, overlap_);
  115. }
  116. else
  117. {
  118. intersect_diff_types (rhs_, overlap_);
  119. }
  120. }
  121. static void escape_char (const CharT ch_, string &out_)
  122. {
  123. switch (ch_)
  124. {
  125. case '\0':
  126. out_ += '\\';
  127. out_ += '0';
  128. break;
  129. case '\a':
  130. out_ += '\\';
  131. out_ += 'a';
  132. break;
  133. case '\b':
  134. out_ += '\\';
  135. out_ += 'b';
  136. break;
  137. case 27:
  138. out_ += '\\';
  139. out_ += 'x';
  140. out_ += '1';
  141. out_ += 'b';
  142. break;
  143. case '\f':
  144. out_ += '\\';
  145. out_ += 'f';
  146. break;
  147. case '\n':
  148. out_ += '\\';
  149. out_ += 'n';
  150. break;
  151. case '\r':
  152. out_ += '\\';
  153. out_ += 'r';
  154. break;
  155. case '\t':
  156. out_ += '\\';
  157. out_ += 't';
  158. break;
  159. case '\v':
  160. out_ += '\\';
  161. out_ += 'v';
  162. break;
  163. case '\\':
  164. out_ += '\\';
  165. out_ += '\\';
  166. break;
  167. case '"':
  168. out_ += '\\';
  169. out_ += '"';
  170. break;
  171. case '\'':
  172. out_ += '\\';
  173. out_ += '\'';
  174. break;
  175. default:
  176. {
  177. if (ch_ < 32 && ch_ >= 0)
  178. {
  179. std::basic_stringstream<CharT> ss_;
  180. out_ += '\\';
  181. out_ += 'x';
  182. ss_ << std::hex <<
  183. static_cast<std::size_t> (ch_);
  184. out_ += ss_.str ();
  185. }
  186. else
  187. {
  188. out_ += ch_;
  189. }
  190. break;
  191. }
  192. }
  193. }
  194. private:
  195. void intersect_same_types (basic_string_token &rhs_,
  196. basic_string_token &overlap_)
  197. {
  198. if (any ())
  199. {
  200. clear ();
  201. overlap_._negated = true;
  202. rhs_.clear ();
  203. }
  204. else
  205. {
  206. typename string::iterator iter_ = _charset.begin ();
  207. typename string::iterator end_ = _charset.end ();
  208. typename string::iterator rhs_iter_ = rhs_._charset.begin ();
  209. typename string::iterator rhs_end_ = rhs_._charset.end ();
  210. overlap_._negated = _negated;
  211. while (iter_ != end_ && rhs_iter_ != rhs_end_)
  212. {
  213. if (*iter_ < *rhs_iter_)
  214. {
  215. ++iter_;
  216. }
  217. else if (*iter_ > *rhs_iter_)
  218. {
  219. ++rhs_iter_;
  220. }
  221. else
  222. {
  223. overlap_._charset += *iter_;
  224. iter_ = _charset.erase (iter_);
  225. end_ = _charset.end ();
  226. rhs_iter_ = rhs_._charset.erase (rhs_iter_);
  227. rhs_end_ = rhs_._charset.end ();
  228. }
  229. }
  230. if (_negated)
  231. {
  232. // duplicates already merged, so safe to merge
  233. // using std lib.
  234. // src, dest
  235. merge (_charset, overlap_._charset);
  236. // duplicates already merged, so safe to merge
  237. // using std lib.
  238. // src, dest
  239. merge (rhs_._charset, overlap_._charset);
  240. _negated = false;
  241. rhs_._negated = false;
  242. std::swap (_charset, rhs_._charset);
  243. normalise ();
  244. overlap_.normalise ();
  245. rhs_.normalise ();
  246. }
  247. else if (!overlap_._charset.empty ())
  248. {
  249. normalise ();
  250. overlap_.normalise ();
  251. rhs_.normalise ();
  252. }
  253. }
  254. }
  255. void intersect_diff_types (basic_string_token &rhs_,
  256. basic_string_token &overlap_)
  257. {
  258. if (any ())
  259. {
  260. intersect_any (rhs_, overlap_);
  261. }
  262. else if (_negated)
  263. {
  264. intersect_negated (rhs_, overlap_);
  265. }
  266. else // _negated == false
  267. {
  268. intersect_charset (rhs_, overlap_);
  269. }
  270. }
  271. void intersect_any (basic_string_token &rhs_, basic_string_token &overlap_)
  272. {
  273. if (rhs_._negated)
  274. {
  275. rhs_.intersect_negated (*this, overlap_);
  276. }
  277. else // rhs._negated == false
  278. {
  279. rhs_.intersect_charset (*this, overlap_);
  280. }
  281. }
  282. void intersect_negated (basic_string_token &rhs_,
  283. basic_string_token &overlap_)
  284. {
  285. if (rhs_.any ())
  286. {
  287. overlap_._negated = true;
  288. overlap_._charset = _charset;
  289. rhs_._negated = false;
  290. rhs_._charset = _charset;
  291. clear ();
  292. }
  293. else // rhs._negated == false
  294. {
  295. rhs_.intersect_charset (*this, overlap_);
  296. }
  297. }
  298. void intersect_charset (basic_string_token &rhs_,
  299. basic_string_token &overlap_)
  300. {
  301. if (rhs_.any ())
  302. {
  303. overlap_._charset = _charset;
  304. rhs_._negated = true;
  305. rhs_._charset = _charset;
  306. clear ();
  307. }
  308. else // rhs_._negated == true
  309. {
  310. typename string::iterator iter_ = _charset.begin ();
  311. typename string::iterator end_ = _charset.end ();
  312. typename string::iterator rhs_iter_ = rhs_._charset.begin ();
  313. typename string::iterator rhs_end_ = rhs_._charset.end ();
  314. while (iter_ != end_ && rhs_iter_ != rhs_end_)
  315. {
  316. if (*iter_ < *rhs_iter_)
  317. {
  318. overlap_._charset += *iter_;
  319. rhs_iter_ = rhs_._charset.insert (rhs_iter_, *iter_);
  320. ++rhs_iter_;
  321. rhs_end_ = rhs_._charset.end ();
  322. iter_ = _charset.erase (iter_);
  323. end_ = _charset.end ();
  324. }
  325. else if (*iter_ > *rhs_iter_)
  326. {
  327. ++rhs_iter_;
  328. }
  329. else
  330. {
  331. ++iter_;
  332. ++rhs_iter_;
  333. }
  334. }
  335. if (iter_ != end_)
  336. {
  337. // nothing bigger in rhs_ than iter_,
  338. // so safe to merge using std lib.
  339. string temp_ (iter_, end_);
  340. // src, dest
  341. merge (temp_, overlap_._charset);
  342. _charset.erase (iter_, end_);
  343. }
  344. if (!overlap_._charset.empty ())
  345. {
  346. merge (overlap_._charset, rhs_._charset);
  347. // possible duplicates, so check for any and erase.
  348. rhs_._charset.erase (std::unique (rhs_._charset.begin (),
  349. rhs_._charset.end ()), rhs_._charset.end ());
  350. normalise ();
  351. overlap_.normalise ();
  352. rhs_.normalise ();
  353. }
  354. }
  355. }
  356. void merge (string &src_, string &dest_)
  357. {
  358. string tmp_ (src_.size () + dest_.size (), 0);
  359. std::merge (src_.begin (), src_.end (), dest_.begin (), dest_.end (),
  360. tmp_.begin ());
  361. dest_ = tmp_;
  362. }
  363. };
  364. }
  365. }
  366. #endif