123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512 |
- // parser.hpp
- // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
- //
- // Distributed under the Boost Software License, Version 1.0. (See accompanying
- // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
- #ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_PARSER_HPP
- #define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_PARSER_HPP
- #include <boost/assert.hpp>
- #include "tree/end_node.hpp"
- #include "tree/iteration_node.hpp"
- #include "tree/leaf_node.hpp"
- #include "../runtime_error.hpp"
- #include "tree/selection_node.hpp"
- #include "tree/sequence_node.hpp"
- #include "../size_t.hpp"
- #include "tokeniser/re_tokeniser.hpp"
- #include <sstream>
- namespace boost
- {
- namespace lexer
- {
- namespace detail
- {
- template<typename CharT>
- class basic_parser
- {
- public:
- typedef basic_re_tokeniser<CharT> tokeniser;
- typedef typename tokeniser::string string;
- typedef std::map<string, const node *> macro_map;
- typedef node::node_ptr_vector node_ptr_vector;
- typedef typename tokeniser::num_token token;
- /*
- General principles of regex parsing:
- - Every regex is a sequence of sub-regexes.
- - Regexes consist of operands and operators
- - All operators decompose to sequence, selection ('|') and iteration ('*')
- - Regex tokens are stored on the stack.
- - When a complete sequence of regex tokens is on the stack it is processed.
- Grammar:
- <REGEX> -> <OREXP>
- <OREXP> -> <SEQUENCE> | <OREXP>'|'<SEQUENCE>
- <SEQUENCE> -> <SUB>
- <SUB> -> <EXPRESSION> | <SUB><EXPRESSION>
- <EXPRESSION> -> <REPEAT>
- <REPEAT> -> charset | macro | '('<REGEX>')' | <REPEAT><DUPLICATE>
- <DUPLICATE> -> '?' | '*' | '+' | '{n[,[m]]}'
- */
- static node *parse (const CharT *start_, const CharT * const end_,
- const std::size_t id_, const std::size_t unique_id_,
- const std::size_t dfa_state_, const regex_flags flags_,
- const std::locale &locale_, node_ptr_vector &node_ptr_vector_,
- const macro_map ¯omap_, typename tokeniser::token_map &map_,
- bool &seen_BOL_assertion_, bool &seen_EOL_assertion_)
- {
- node *root_ = 0;
- state state_ (start_, end_, flags_, locale_);
- token lhs_token_;
- token rhs_token_;
- token_stack token_stack_;
- tree_node_stack tree_node_stack_;
- char action_ = 0;
- token_stack_.push (rhs_token_);
- tokeniser::next (state_, map_, rhs_token_);
- do
- {
- lhs_token_ = token_stack_.top ();
- action_ = lhs_token_.precedence (rhs_token_._type);
- switch (action_)
- {
- case '<':
- case '=':
- token_stack_.push (rhs_token_);
- tokeniser::next (state_, map_, rhs_token_);
- break;
- case '>':
- reduce (token_stack_, macromap_, node_ptr_vector_,
- tree_node_stack_);
- break;
- default:
- std::ostringstream ss_;
- ss_ << "A syntax error occurred: '" <<
- lhs_token_.precedence_string () <<
- "' against '" << rhs_token_.precedence_string () <<
- "' at index " << state_.index () << ".";
- throw runtime_error (ss_.str ().c_str ());
- break;
- }
- } while (!token_stack_.empty ());
- if (tree_node_stack_.empty ())
- {
- throw runtime_error ("Empty rules are not allowed.");
- }
- BOOST_ASSERT(tree_node_stack_.size () == 1);
- node *lhs_node_ = tree_node_stack_.top ();
- tree_node_stack_.pop ();
- if (id_ == 0)
- {
- // Macros have no end state...
- root_ = lhs_node_;
- }
- else
- {
- node_ptr_vector_->push_back (static_cast<end_node *>(0));
- node *rhs_node_ = new end_node (id_, unique_id_, dfa_state_);
- node_ptr_vector_->back () = rhs_node_;
- node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
- node_ptr_vector_->back () = new sequence_node
- (lhs_node_, rhs_node_);
- root_ = node_ptr_vector_->back ();
- }
- // Done this way as bug in VC++ 6 prevents |= operator working
- // properly!
- if (state_._seen_BOL_assertion) seen_BOL_assertion_ = true;
- if (state_._seen_EOL_assertion) seen_EOL_assertion_ = true;
- return root_;
- }
- private:
- typedef typename tokeniser::state state;
- typedef std::stack<token> token_stack;
- typedef node::node_stack tree_node_stack;
- static void reduce (token_stack &token_stack_,
- const macro_map ¯omap_, node_ptr_vector &node_vector_ptr_,
- tree_node_stack &tree_node_stack_)
- {
- typename tokeniser::num_token lhs_;
- typename tokeniser::num_token rhs_;
- token_stack handle_;
- char action_ = 0;
- do
- {
- rhs_ = token_stack_.top ();
- token_stack_.pop ();
- handle_.push (rhs_);
- if (!token_stack_.empty ())
- {
- lhs_ = token_stack_.top ();
- action_ = lhs_.precedence (rhs_._type);
- }
- } while (!token_stack_.empty () && action_ == '=');
- BOOST_ASSERT(token_stack_.empty () || action_ == '<');
- switch (rhs_._type)
- {
- case token::BEGIN:
- // finished processing so exit
- break;
- case token::REGEX:
- // finished parsing, nothing to do
- break;
- case token::OREXP:
- orexp (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
- break;
- case token::SEQUENCE:
- token_stack_.push (token::OREXP);
- break;
- case token::SUB:
- sub (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
- break;
- case token::EXPRESSION:
- token_stack_.push (token::SUB);
- break;
- case token::REPEAT:
- repeat (handle_, token_stack_);
- break;
- case token::CHARSET:
- charset (handle_, token_stack_, node_vector_ptr_,
- tree_node_stack_);
- break;
- case token::MACRO:
- macro (handle_, token_stack_, macromap_, node_vector_ptr_,
- tree_node_stack_);
- break;
- case token::OPENPAREN:
- openparen (handle_, token_stack_);
- break;
- case token::OPT:
- case token::AOPT:
- optional (rhs_._type == token::OPT, node_vector_ptr_,
- tree_node_stack_);
- token_stack_.push (token::DUP);
- break;
- case token::ZEROORMORE:
- case token::AZEROORMORE:
- zero_or_more (rhs_._type == token::ZEROORMORE, node_vector_ptr_,
- tree_node_stack_);
- token_stack_.push (token::DUP);
- break;
- case token::ONEORMORE:
- case token::AONEORMORE:
- one_or_more (rhs_._type == token::ONEORMORE, node_vector_ptr_,
- tree_node_stack_);
- token_stack_.push (token::DUP);
- break;
- case token::REPEATN:
- case token::AREPEATN:
- repeatn (rhs_._type == token::REPEATN, handle_.top (),
- node_vector_ptr_, tree_node_stack_);
- token_stack_.push (token::DUP);
- break;
- default:
- throw runtime_error
- ("Internal error regex_parser::reduce");
- break;
- }
- }
- static void orexp (token_stack &handle_, token_stack &token_stack_,
- node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
- {
- BOOST_ASSERT(handle_.top ()._type == token::OREXP &&
- (handle_.size () == 1 || handle_.size () == 3));
- if (handle_.size () == 1)
- {
- token_stack_.push (token::REGEX);
- }
- else
- {
- handle_.pop ();
- BOOST_ASSERT(handle_.top ()._type == token::OR);
- handle_.pop ();
- BOOST_ASSERT(handle_.top ()._type == token::SEQUENCE);
- perform_or (node_ptr_vector_, tree_node_stack_);
- token_stack_.push (token::OREXP);
- }
- }
- static void sub (token_stack &handle_, token_stack &token_stack_,
- node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
- {
- BOOST_ASSERT(handle_.top ()._type == token::SUB &&
- (handle_.size () == 1 || handle_.size () == 2));
- if (handle_.size () == 1)
- {
- token_stack_.push (token::SEQUENCE);
- }
- else
- {
- handle_.pop ();
- BOOST_ASSERT(handle_.top ()._type == token::EXPRESSION);
- // perform join
- sequence (node_ptr_vector_, tree_node_stack_);
- token_stack_.push (token::SUB);
- }
- }
- static void repeat (token_stack &handle_, token_stack &token_stack_)
- {
- BOOST_ASSERT(handle_.top ()._type == token::REPEAT &&
- handle_.size () >= 1 && handle_.size () <= 3);
- if (handle_.size () == 1)
- {
- token_stack_.push (token::EXPRESSION);
- }
- else
- {
- handle_.pop ();
- BOOST_ASSERT(handle_.top ()._type == token::DUP);
- token_stack_.push (token::REPEAT);
- }
- }
- static void charset (token_stack &handle_, token_stack &token_stack_,
- node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
- {
- BOOST_ASSERT(handle_.top ()._type == token::CHARSET &&
- handle_.size () == 1);
- // store charset
- node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
- const size_t id_ = handle_.top ()._id;
- node_ptr_vector_->back () = new leaf_node (id_, true);
- tree_node_stack_.push (node_ptr_vector_->back ());
- token_stack_.push (token::REPEAT);
- }
- static void macro (token_stack &handle_, token_stack &token_stack_,
- const macro_map ¯omap_, node_ptr_vector &node_ptr_vector_,
- tree_node_stack &tree_node_stack_)
- {
- token &top_ = handle_.top ();
- BOOST_ASSERT(top_._type == token::MACRO && handle_.size () == 1);
- typename macro_map::const_iterator iter_ =
- macromap_.find (top_._macro);
- if (iter_ == macromap_.end ())
- {
- const CharT *name_ = top_._macro;
- std::basic_stringstream<CharT> ss_;
- std::ostringstream os_;
- os_ << "Unknown MACRO name '";
- while (*name_)
- {
- os_ << ss_.narrow (*name_++, ' ');
- }
- os_ << "'.";
- throw runtime_error (os_.str ());
- }
- tree_node_stack_.push (iter_->second->copy (node_ptr_vector_));
- token_stack_.push (token::REPEAT);
- }
- static void openparen (token_stack &handle_, token_stack &token_stack_)
- {
- BOOST_ASSERT(handle_.top ()._type == token::OPENPAREN &&
- handle_.size () == 3);
- handle_.pop ();
- BOOST_ASSERT(handle_.top ()._type == token::REGEX);
- handle_.pop ();
- BOOST_ASSERT(handle_.top ()._type == token::CLOSEPAREN);
- token_stack_.push (token::REPEAT);
- }
- static void perform_or (node_ptr_vector &node_ptr_vector_,
- tree_node_stack &tree_node_stack_)
- {
- // perform or
- node *rhs_ = tree_node_stack_.top ();
- tree_node_stack_.pop ();
- node *lhs_ = tree_node_stack_.top ();
- node_ptr_vector_->push_back (static_cast<selection_node *>(0));
- node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
- tree_node_stack_.top () = node_ptr_vector_->back ();
- }
- static void sequence (node_ptr_vector &node_ptr_vector_,
- tree_node_stack &tree_node_stack_)
- {
- node *rhs_ = tree_node_stack_.top ();
- tree_node_stack_.pop ();
- node *lhs_ = tree_node_stack_.top ();
- node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
- node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
- tree_node_stack_.top () = node_ptr_vector_->back ();
- }
- static void optional (const bool greedy_,
- node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
- {
- // perform ?
- node *lhs_ = tree_node_stack_.top ();
- // You don't know if lhs_ is a leaf_node, so get firstpos.
- node::node_vector &firstpos_ = lhs_->firstpos ();
- for (node::node_vector::iterator iter_ = firstpos_.begin (),
- end_ = firstpos_.end (); iter_ != end_; ++iter_)
- {
- // These are leaf_nodes!
- (*iter_)->greedy (greedy_);
- }
- node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
- node *rhs_ = new leaf_node (null_token, greedy_);
- node_ptr_vector_->back () = rhs_;
- node_ptr_vector_->push_back (static_cast<selection_node *>(0));
- node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
- tree_node_stack_.top () = node_ptr_vector_->back ();
- }
- static void zero_or_more (const bool greedy_,
- node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
- {
- // perform *
- node *ptr_ = tree_node_stack_.top ();
- node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
- node_ptr_vector_->back () = new iteration_node (ptr_, greedy_);
- tree_node_stack_.top () = node_ptr_vector_->back ();
- }
- static void one_or_more (const bool greedy_,
- node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
- {
- // perform +
- node *lhs_ = tree_node_stack_.top ();
- node *copy_ = lhs_->copy (node_ptr_vector_);
- node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
- node *rhs_ = new iteration_node (copy_, greedy_);
- node_ptr_vector_->back () = rhs_;
- node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
- node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
- tree_node_stack_.top () = node_ptr_vector_->back ();
- }
- // This is one of the most mind bending routines in this code...
- static void repeatn (const bool greedy_, const token &token_,
- node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
- {
- // perform {n[,[m]]}
- // Semantic checks have already been performed.
- // {0,} = *
- // {0,1} = ?
- // {1,} = +
- // therefore we do not check for these cases.
- if (!(token_._min == 1 && !token_._comma))
- {
- const std::size_t top_ = token_._min > 0 ?
- token_._min : token_._max;
- if (token_._min == 0)
- {
- optional (greedy_, node_ptr_vector_, tree_node_stack_);
- }
- node *prev_ = tree_node_stack_.top ()->copy (node_ptr_vector_);
- node *curr_ = 0;
- for (std::size_t i_ = 2; i_ < top_; ++i_)
- {
- curr_ = prev_->copy (node_ptr_vector_);
- tree_node_stack_.push (static_cast<node *>(0));
- tree_node_stack_.top () = prev_;
- sequence (node_ptr_vector_, tree_node_stack_);
- prev_ = curr_;
- }
- if (token_._comma && token_._min > 0)
- {
- if (token_._min > 1)
- {
- curr_ = prev_->copy (node_ptr_vector_);
- tree_node_stack_.push (static_cast<node *>(0));
- tree_node_stack_.top () = prev_;
- sequence (node_ptr_vector_, tree_node_stack_);
- prev_ = curr_;
- }
- if (token_._comma && token_._max)
- {
- tree_node_stack_.push (static_cast<node *>(0));
- tree_node_stack_.top () = prev_;
- optional (greedy_, node_ptr_vector_, tree_node_stack_);
- prev_ = tree_node_stack_.top ();
- tree_node_stack_.pop ();
- const std::size_t count_ = token_._max - token_._min;
- for (std::size_t i_ = 1; i_ < count_; ++i_)
- {
- curr_ = prev_->copy (node_ptr_vector_);
- tree_node_stack_.push (static_cast<node *>(0));
- tree_node_stack_.top () = prev_;
- sequence (node_ptr_vector_, tree_node_stack_);
- prev_ = curr_;
- }
- }
- else
- {
- tree_node_stack_.push (static_cast<node *>(0));
- tree_node_stack_.top () = prev_;
- zero_or_more (greedy_, node_ptr_vector_, tree_node_stack_);
- prev_ = tree_node_stack_.top ();
- tree_node_stack_.pop ();
- }
- }
- tree_node_stack_.push (static_cast<node *>(0));
- tree_node_stack_.top () = prev_;
- sequence (node_ptr_vector_, tree_node_stack_);
- }
- }
- };
- }
- }
- }
- #endif
|