parser.hpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512
  1. // parser.hpp
  2. // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. #ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_PARSER_HPP
  7. #define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_PARSER_HPP
  8. #include <boost/assert.hpp>
  9. #include "tree/end_node.hpp"
  10. #include "tree/iteration_node.hpp"
  11. #include "tree/leaf_node.hpp"
  12. #include "../runtime_error.hpp"
  13. #include "tree/selection_node.hpp"
  14. #include "tree/sequence_node.hpp"
  15. #include "../size_t.hpp"
  16. #include "tokeniser/re_tokeniser.hpp"
  17. #include <sstream>
  18. namespace boost
  19. {
  20. namespace lexer
  21. {
  22. namespace detail
  23. {
  24. template<typename CharT>
  25. class basic_parser
  26. {
  27. public:
  28. typedef basic_re_tokeniser<CharT> tokeniser;
  29. typedef typename tokeniser::string string;
  30. typedef std::map<string, const node *> macro_map;
  31. typedef node::node_ptr_vector node_ptr_vector;
  32. typedef typename tokeniser::num_token token;
  33. /*
  34. General principles of regex parsing:
  35. - Every regex is a sequence of sub-regexes.
  36. - Regexes consist of operands and operators
  37. - All operators decompose to sequence, selection ('|') and iteration ('*')
  38. - Regex tokens are stored on the stack.
  39. - When a complete sequence of regex tokens is on the stack it is processed.
  40. Grammar:
  41. <REGEX> -> <OREXP>
  42. <OREXP> -> <SEQUENCE> | <OREXP>'|'<SEQUENCE>
  43. <SEQUENCE> -> <SUB>
  44. <SUB> -> <EXPRESSION> | <SUB><EXPRESSION>
  45. <EXPRESSION> -> <REPEAT>
  46. <REPEAT> -> charset | macro | '('<REGEX>')' | <REPEAT><DUPLICATE>
  47. <DUPLICATE> -> '?' | '*' | '+' | '{n[,[m]]}'
  48. */
  49. static node *parse (const CharT *start_, const CharT * const end_,
  50. const std::size_t id_, const std::size_t unique_id_,
  51. const std::size_t dfa_state_, const regex_flags flags_,
  52. const std::locale &locale_, node_ptr_vector &node_ptr_vector_,
  53. const macro_map &macromap_, typename tokeniser::token_map &map_,
  54. bool &seen_BOL_assertion_, bool &seen_EOL_assertion_)
  55. {
  56. node *root_ = 0;
  57. state state_ (start_, end_, flags_, locale_);
  58. token lhs_token_;
  59. token rhs_token_;
  60. token_stack token_stack_;
  61. tree_node_stack tree_node_stack_;
  62. char action_ = 0;
  63. token_stack_.push (rhs_token_);
  64. tokeniser::next (state_, map_, rhs_token_);
  65. do
  66. {
  67. lhs_token_ = token_stack_.top ();
  68. action_ = lhs_token_.precedence (rhs_token_._type);
  69. switch (action_)
  70. {
  71. case '<':
  72. case '=':
  73. token_stack_.push (rhs_token_);
  74. tokeniser::next (state_, map_, rhs_token_);
  75. break;
  76. case '>':
  77. reduce (token_stack_, macromap_, node_ptr_vector_,
  78. tree_node_stack_);
  79. break;
  80. default:
  81. std::ostringstream ss_;
  82. ss_ << "A syntax error occurred: '" <<
  83. lhs_token_.precedence_string () <<
  84. "' against '" << rhs_token_.precedence_string () <<
  85. "' at index " << state_.index () << ".";
  86. throw runtime_error (ss_.str ().c_str ());
  87. break;
  88. }
  89. } while (!token_stack_.empty ());
  90. if (tree_node_stack_.empty ())
  91. {
  92. throw runtime_error ("Empty rules are not allowed.");
  93. }
  94. BOOST_ASSERT(tree_node_stack_.size () == 1);
  95. node *lhs_node_ = tree_node_stack_.top ();
  96. tree_node_stack_.pop ();
  97. if (id_ == 0)
  98. {
  99. // Macros have no end state...
  100. root_ = lhs_node_;
  101. }
  102. else
  103. {
  104. node_ptr_vector_->push_back (static_cast<end_node *>(0));
  105. node *rhs_node_ = new end_node (id_, unique_id_, dfa_state_);
  106. node_ptr_vector_->back () = rhs_node_;
  107. node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
  108. node_ptr_vector_->back () = new sequence_node
  109. (lhs_node_, rhs_node_);
  110. root_ = node_ptr_vector_->back ();
  111. }
  112. // Done this way as bug in VC++ 6 prevents |= operator working
  113. // properly!
  114. if (state_._seen_BOL_assertion) seen_BOL_assertion_ = true;
  115. if (state_._seen_EOL_assertion) seen_EOL_assertion_ = true;
  116. return root_;
  117. }
  118. private:
  119. typedef typename tokeniser::state state;
  120. typedef std::stack<token> token_stack;
  121. typedef node::node_stack tree_node_stack;
  122. static void reduce (token_stack &token_stack_,
  123. const macro_map &macromap_, node_ptr_vector &node_vector_ptr_,
  124. tree_node_stack &tree_node_stack_)
  125. {
  126. typename tokeniser::num_token lhs_;
  127. typename tokeniser::num_token rhs_;
  128. token_stack handle_;
  129. char action_ = 0;
  130. do
  131. {
  132. rhs_ = token_stack_.top ();
  133. token_stack_.pop ();
  134. handle_.push (rhs_);
  135. if (!token_stack_.empty ())
  136. {
  137. lhs_ = token_stack_.top ();
  138. action_ = lhs_.precedence (rhs_._type);
  139. }
  140. } while (!token_stack_.empty () && action_ == '=');
  141. BOOST_ASSERT(token_stack_.empty () || action_ == '<');
  142. switch (rhs_._type)
  143. {
  144. case token::BEGIN:
  145. // finished processing so exit
  146. break;
  147. case token::REGEX:
  148. // finished parsing, nothing to do
  149. break;
  150. case token::OREXP:
  151. orexp (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
  152. break;
  153. case token::SEQUENCE:
  154. token_stack_.push (token::OREXP);
  155. break;
  156. case token::SUB:
  157. sub (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
  158. break;
  159. case token::EXPRESSION:
  160. token_stack_.push (token::SUB);
  161. break;
  162. case token::REPEAT:
  163. repeat (handle_, token_stack_);
  164. break;
  165. case token::CHARSET:
  166. charset (handle_, token_stack_, node_vector_ptr_,
  167. tree_node_stack_);
  168. break;
  169. case token::MACRO:
  170. macro (handle_, token_stack_, macromap_, node_vector_ptr_,
  171. tree_node_stack_);
  172. break;
  173. case token::OPENPAREN:
  174. openparen (handle_, token_stack_);
  175. break;
  176. case token::OPT:
  177. case token::AOPT:
  178. optional (rhs_._type == token::OPT, node_vector_ptr_,
  179. tree_node_stack_);
  180. token_stack_.push (token::DUP);
  181. break;
  182. case token::ZEROORMORE:
  183. case token::AZEROORMORE:
  184. zero_or_more (rhs_._type == token::ZEROORMORE, node_vector_ptr_,
  185. tree_node_stack_);
  186. token_stack_.push (token::DUP);
  187. break;
  188. case token::ONEORMORE:
  189. case token::AONEORMORE:
  190. one_or_more (rhs_._type == token::ONEORMORE, node_vector_ptr_,
  191. tree_node_stack_);
  192. token_stack_.push (token::DUP);
  193. break;
  194. case token::REPEATN:
  195. case token::AREPEATN:
  196. repeatn (rhs_._type == token::REPEATN, handle_.top (),
  197. node_vector_ptr_, tree_node_stack_);
  198. token_stack_.push (token::DUP);
  199. break;
  200. default:
  201. throw runtime_error
  202. ("Internal error regex_parser::reduce");
  203. break;
  204. }
  205. }
  206. static void orexp (token_stack &handle_, token_stack &token_stack_,
  207. node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
  208. {
  209. BOOST_ASSERT(handle_.top ()._type == token::OREXP &&
  210. (handle_.size () == 1 || handle_.size () == 3));
  211. if (handle_.size () == 1)
  212. {
  213. token_stack_.push (token::REGEX);
  214. }
  215. else
  216. {
  217. handle_.pop ();
  218. BOOST_ASSERT(handle_.top ()._type == token::OR);
  219. handle_.pop ();
  220. BOOST_ASSERT(handle_.top ()._type == token::SEQUENCE);
  221. perform_or (node_ptr_vector_, tree_node_stack_);
  222. token_stack_.push (token::OREXP);
  223. }
  224. }
  225. static void sub (token_stack &handle_, token_stack &token_stack_,
  226. node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
  227. {
  228. BOOST_ASSERT(handle_.top ()._type == token::SUB &&
  229. (handle_.size () == 1 || handle_.size () == 2));
  230. if (handle_.size () == 1)
  231. {
  232. token_stack_.push (token::SEQUENCE);
  233. }
  234. else
  235. {
  236. handle_.pop ();
  237. BOOST_ASSERT(handle_.top ()._type == token::EXPRESSION);
  238. // perform join
  239. sequence (node_ptr_vector_, tree_node_stack_);
  240. token_stack_.push (token::SUB);
  241. }
  242. }
  243. static void repeat (token_stack &handle_, token_stack &token_stack_)
  244. {
  245. BOOST_ASSERT(handle_.top ()._type == token::REPEAT &&
  246. handle_.size () >= 1 && handle_.size () <= 3);
  247. if (handle_.size () == 1)
  248. {
  249. token_stack_.push (token::EXPRESSION);
  250. }
  251. else
  252. {
  253. handle_.pop ();
  254. BOOST_ASSERT(handle_.top ()._type == token::DUP);
  255. token_stack_.push (token::REPEAT);
  256. }
  257. }
  258. static void charset (token_stack &handle_, token_stack &token_stack_,
  259. node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
  260. {
  261. BOOST_ASSERT(handle_.top ()._type == token::CHARSET &&
  262. handle_.size () == 1);
  263. // store charset
  264. node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
  265. const size_t id_ = handle_.top ()._id;
  266. node_ptr_vector_->back () = new leaf_node (id_, true);
  267. tree_node_stack_.push (node_ptr_vector_->back ());
  268. token_stack_.push (token::REPEAT);
  269. }
  270. static void macro (token_stack &handle_, token_stack &token_stack_,
  271. const macro_map &macromap_, node_ptr_vector &node_ptr_vector_,
  272. tree_node_stack &tree_node_stack_)
  273. {
  274. token &top_ = handle_.top ();
  275. BOOST_ASSERT(top_._type == token::MACRO && handle_.size () == 1);
  276. typename macro_map::const_iterator iter_ =
  277. macromap_.find (top_._macro);
  278. if (iter_ == macromap_.end ())
  279. {
  280. const CharT *name_ = top_._macro;
  281. std::basic_stringstream<CharT> ss_;
  282. std::ostringstream os_;
  283. os_ << "Unknown MACRO name '";
  284. while (*name_)
  285. {
  286. os_ << ss_.narrow (*name_++, ' ');
  287. }
  288. os_ << "'.";
  289. throw runtime_error (os_.str ());
  290. }
  291. tree_node_stack_.push (iter_->second->copy (node_ptr_vector_));
  292. token_stack_.push (token::REPEAT);
  293. }
  294. static void openparen (token_stack &handle_, token_stack &token_stack_)
  295. {
  296. BOOST_ASSERT(handle_.top ()._type == token::OPENPAREN &&
  297. handle_.size () == 3);
  298. handle_.pop ();
  299. BOOST_ASSERT(handle_.top ()._type == token::REGEX);
  300. handle_.pop ();
  301. BOOST_ASSERT(handle_.top ()._type == token::CLOSEPAREN);
  302. token_stack_.push (token::REPEAT);
  303. }
  304. static void perform_or (node_ptr_vector &node_ptr_vector_,
  305. tree_node_stack &tree_node_stack_)
  306. {
  307. // perform or
  308. node *rhs_ = tree_node_stack_.top ();
  309. tree_node_stack_.pop ();
  310. node *lhs_ = tree_node_stack_.top ();
  311. node_ptr_vector_->push_back (static_cast<selection_node *>(0));
  312. node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
  313. tree_node_stack_.top () = node_ptr_vector_->back ();
  314. }
  315. static void sequence (node_ptr_vector &node_ptr_vector_,
  316. tree_node_stack &tree_node_stack_)
  317. {
  318. node *rhs_ = tree_node_stack_.top ();
  319. tree_node_stack_.pop ();
  320. node *lhs_ = tree_node_stack_.top ();
  321. node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
  322. node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
  323. tree_node_stack_.top () = node_ptr_vector_->back ();
  324. }
  325. static void optional (const bool greedy_,
  326. node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
  327. {
  328. // perform ?
  329. node *lhs_ = tree_node_stack_.top ();
  330. // You don't know if lhs_ is a leaf_node, so get firstpos.
  331. node::node_vector &firstpos_ = lhs_->firstpos ();
  332. for (node::node_vector::iterator iter_ = firstpos_.begin (),
  333. end_ = firstpos_.end (); iter_ != end_; ++iter_)
  334. {
  335. // These are leaf_nodes!
  336. (*iter_)->greedy (greedy_);
  337. }
  338. node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
  339. node *rhs_ = new leaf_node (null_token, greedy_);
  340. node_ptr_vector_->back () = rhs_;
  341. node_ptr_vector_->push_back (static_cast<selection_node *>(0));
  342. node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
  343. tree_node_stack_.top () = node_ptr_vector_->back ();
  344. }
  345. static void zero_or_more (const bool greedy_,
  346. node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
  347. {
  348. // perform *
  349. node *ptr_ = tree_node_stack_.top ();
  350. node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
  351. node_ptr_vector_->back () = new iteration_node (ptr_, greedy_);
  352. tree_node_stack_.top () = node_ptr_vector_->back ();
  353. }
  354. static void one_or_more (const bool greedy_,
  355. node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
  356. {
  357. // perform +
  358. node *lhs_ = tree_node_stack_.top ();
  359. node *copy_ = lhs_->copy (node_ptr_vector_);
  360. node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
  361. node *rhs_ = new iteration_node (copy_, greedy_);
  362. node_ptr_vector_->back () = rhs_;
  363. node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
  364. node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
  365. tree_node_stack_.top () = node_ptr_vector_->back ();
  366. }
  367. // This is one of the most mind bending routines in this code...
  368. static void repeatn (const bool greedy_, const token &token_,
  369. node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
  370. {
  371. // perform {n[,[m]]}
  372. // Semantic checks have already been performed.
  373. // {0,} = *
  374. // {0,1} = ?
  375. // {1,} = +
  376. // therefore we do not check for these cases.
  377. if (!(token_._min == 1 && !token_._comma))
  378. {
  379. const std::size_t top_ = token_._min > 0 ?
  380. token_._min : token_._max;
  381. if (token_._min == 0)
  382. {
  383. optional (greedy_, node_ptr_vector_, tree_node_stack_);
  384. }
  385. node *prev_ = tree_node_stack_.top ()->copy (node_ptr_vector_);
  386. node *curr_ = 0;
  387. for (std::size_t i_ = 2; i_ < top_; ++i_)
  388. {
  389. curr_ = prev_->copy (node_ptr_vector_);
  390. tree_node_stack_.push (static_cast<node *>(0));
  391. tree_node_stack_.top () = prev_;
  392. sequence (node_ptr_vector_, tree_node_stack_);
  393. prev_ = curr_;
  394. }
  395. if (token_._comma && token_._min > 0)
  396. {
  397. if (token_._min > 1)
  398. {
  399. curr_ = prev_->copy (node_ptr_vector_);
  400. tree_node_stack_.push (static_cast<node *>(0));
  401. tree_node_stack_.top () = prev_;
  402. sequence (node_ptr_vector_, tree_node_stack_);
  403. prev_ = curr_;
  404. }
  405. if (token_._comma && token_._max)
  406. {
  407. tree_node_stack_.push (static_cast<node *>(0));
  408. tree_node_stack_.top () = prev_;
  409. optional (greedy_, node_ptr_vector_, tree_node_stack_);
  410. prev_ = tree_node_stack_.top ();
  411. tree_node_stack_.pop ();
  412. const std::size_t count_ = token_._max - token_._min;
  413. for (std::size_t i_ = 1; i_ < count_; ++i_)
  414. {
  415. curr_ = prev_->copy (node_ptr_vector_);
  416. tree_node_stack_.push (static_cast<node *>(0));
  417. tree_node_stack_.top () = prev_;
  418. sequence (node_ptr_vector_, tree_node_stack_);
  419. prev_ = curr_;
  420. }
  421. }
  422. else
  423. {
  424. tree_node_stack_.push (static_cast<node *>(0));
  425. tree_node_stack_.top () = prev_;
  426. zero_or_more (greedy_, node_ptr_vector_, tree_node_stack_);
  427. prev_ = tree_node_stack_.top ();
  428. tree_node_stack_.pop ();
  429. }
  430. }
  431. tree_node_stack_.push (static_cast<node *>(0));
  432. tree_node_stack_.top () = prev_;
  433. sequence (node_ptr_vector_, tree_node_stack_);
  434. }
  435. }
  436. };
  437. }
  438. }
  439. }
  440. #endif