lexer.hpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410
  1. // Copyright (c) 2001-2011 Hartmut Kaiser
  2. //
  3. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  4. // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. #if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_13_2007_0145PM)
  6. #define BOOST_SPIRIT_LEX_LEXER_MAR_13_2007_0145PM
  7. #if defined(_MSC_VER)
  8. #pragma once
  9. #endif
  10. #include <boost/spirit/home/support/info.hpp>
  11. #include <boost/spirit/home/qi/skip_over.hpp>
  12. #include <boost/spirit/home/qi/parser.hpp>
  13. #include <boost/spirit/home/qi/detail/assign_to.hpp>
  14. #include <boost/spirit/home/lex/reference.hpp>
  15. #include <boost/spirit/home/lex/meta_compiler.hpp>
  16. #include <boost/spirit/home/lex/lexer_type.hpp>
  17. #include <boost/spirit/home/lex/lexer/token_def.hpp>
  18. #include <boost/assert.hpp>
  19. #include <boost/noncopyable.hpp>
  20. #include <boost/fusion/include/vector.hpp>
  21. #include <boost/mpl/assert.hpp>
  22. #include <boost/proto/extends.hpp>
  23. #include <boost/proto/traits.hpp>
  24. #include <boost/range/iterator_range_core.hpp>
  25. #include <iterator> // for std::iterator_traits
  26. #include <string>
  27. namespace boost { namespace spirit { namespace lex
  28. {
  29. ///////////////////////////////////////////////////////////////////////////
  30. namespace detail
  31. {
  32. ///////////////////////////////////////////////////////////////////////
  33. #ifdef _MSC_VER
  34. # pragma warning(push)
  35. # pragma warning(disable: 4512) // assignment operator could not be generated.
  36. #endif
  37. template <typename LexerDef>
  38. struct lexer_def_
  39. : proto::extends<
  40. typename proto::terminal<
  41. lex::reference<lexer_def_<LexerDef> const>
  42. >::type
  43. , lexer_def_<LexerDef> >
  44. , qi::parser<lexer_def_<LexerDef> >
  45. , lex::lexer_type<lexer_def_<LexerDef> >
  46. {
  47. private:
  48. // avoid warnings about using 'this' in constructor
  49. lexer_def_& this_() { return *this; }
  50. typedef typename LexerDef::char_type char_type;
  51. typedef typename LexerDef::string_type string_type;
  52. typedef typename LexerDef::id_type id_type;
  53. typedef lex::reference<lexer_def_ const> reference_;
  54. typedef typename proto::terminal<reference_>::type terminal_type;
  55. typedef proto::extends<terminal_type, lexer_def_> proto_base_type;
  56. reference_ alias() const
  57. {
  58. return reference_(*this);
  59. }
  60. public:
  61. // Qi interface: metafunction calculating parser attribute type
  62. template <typename Context, typename Iterator>
  63. struct attribute
  64. {
  65. // the return value of a token set contains the matched token
  66. // id, and the corresponding pair of iterators
  67. typedef typename Iterator::base_iterator_type iterator_type;
  68. typedef
  69. fusion::vector2<id_type, iterator_range<iterator_type> >
  70. type;
  71. };
  72. // Qi interface: parse functionality
  73. template <typename Iterator, typename Context
  74. , typename Skipper, typename Attribute>
  75. bool parse(Iterator& first, Iterator const& last
  76. , Context& /*context*/, Skipper const& skipper
  77. , Attribute& attr) const
  78. {
  79. qi::skip_over(first, last, skipper); // always do a pre-skip
  80. if (first != last) {
  81. typedef typename
  82. std::iterator_traits<Iterator>::value_type
  83. token_type;
  84. token_type const& t = *first;
  85. if (token_is_valid(t) && t.state() == first.get_state()) {
  86. // any of the token definitions matched
  87. spirit::traits::assign_to(t, attr);
  88. ++first;
  89. return true;
  90. }
  91. }
  92. return false;
  93. }
  94. // Qi interface: 'what' functionality
  95. template <typename Context>
  96. info what(Context& /*context*/) const
  97. {
  98. return info("lexer");
  99. }
  100. private:
  101. // allow to use the lexer.self.add("regex1", id1)("regex2", id2);
  102. // syntax
  103. struct adder
  104. {
  105. adder(lexer_def_& def_)
  106. : def(def_) {}
  107. // Add a token definition based on a single character as given
  108. // by the first parameter, the second parameter allows to
  109. // specify the token id to use for the new token. If no token
  110. // id is given the character code is used.
  111. adder const& operator()(char_type c
  112. , id_type token_id = id_type()) const
  113. {
  114. if (id_type() == token_id)
  115. token_id = static_cast<id_type>(c);
  116. def.def.add_token (def.state.c_str(), c, token_id
  117. , def.targetstate.empty() ? 0 : def.targetstate.c_str());
  118. return *this;
  119. }
  120. // Add a token definition based on a character sequence as
  121. // given by the first parameter, the second parameter allows to
  122. // specify the token id to use for the new token. If no token
  123. // id is given this function will generate a unique id to be
  124. // used as the token's id.
  125. adder const& operator()(string_type const& s
  126. , id_type token_id = id_type()) const
  127. {
  128. if (id_type() == token_id)
  129. token_id = def.def.get_next_id();
  130. def.def.add_token (def.state.c_str(), s, token_id
  131. , def.targetstate.empty() ? 0 : def.targetstate.c_str());
  132. return *this;
  133. }
  134. template <typename Attribute>
  135. adder const& operator()(
  136. token_def<Attribute, char_type, id_type>& tokdef
  137. , id_type token_id = id_type()) const
  138. {
  139. // make sure we have a token id
  140. if (id_type() == token_id) {
  141. if (id_type() == tokdef.id()) {
  142. token_id = def.def.get_next_id();
  143. tokdef.id(token_id);
  144. }
  145. else {
  146. token_id = tokdef.id();
  147. }
  148. }
  149. else {
  150. // the following assertion makes sure that the token_def
  151. // instance has not been assigned a different id earlier
  152. BOOST_ASSERT(id_type() == tokdef.id()
  153. || token_id == tokdef.id());
  154. tokdef.id(token_id);
  155. }
  156. def.define(tokdef);
  157. return *this;
  158. }
  159. // template <typename F>
  160. // adder const& operator()(char_type c, id_type token_id, F act) const
  161. // {
  162. // if (id_type() == token_id)
  163. // token_id = def.def.get_next_id();
  164. // std::size_t unique_id =
  165. // def.def.add_token (def.state.c_str(), s, token_id);
  166. // def.def.add_action(unique_id, def.state.c_str(), act);
  167. // return *this;
  168. // }
  169. lexer_def_& def;
  170. };
  171. friend struct adder;
  172. // allow to use lexer.self.add_pattern("pattern1", "regex1")(...);
  173. // syntax
  174. struct pattern_adder
  175. {
  176. pattern_adder(lexer_def_& def_)
  177. : def(def_) {}
  178. pattern_adder const& operator()(string_type const& p
  179. , string_type const& s) const
  180. {
  181. def.def.add_pattern (def.state.c_str(), p, s);
  182. return *this;
  183. }
  184. lexer_def_& def;
  185. };
  186. friend struct pattern_adder;
  187. private:
  188. // Helper function to invoke the necessary 2 step compilation
  189. // process on token definition expressions
  190. template <typename TokenExpr>
  191. void compile2pass(TokenExpr const& expr)
  192. {
  193. expr.collect(def, state, targetstate);
  194. expr.add_actions(def);
  195. }
  196. public:
  197. ///////////////////////////////////////////////////////////////////
  198. template <typename Expr>
  199. void define(Expr const& expr)
  200. {
  201. compile2pass(compile<lex::domain>(expr));
  202. }
  203. lexer_def_(LexerDef& def_, string_type const& state_
  204. , string_type const& targetstate_ = string_type())
  205. : proto_base_type(terminal_type::make(alias()))
  206. , add(this_()), add_pattern(this_()), def(def_)
  207. , state(state_), targetstate(targetstate_)
  208. {}
  209. // allow to switch states
  210. lexer_def_ operator()(char_type const* state_) const
  211. {
  212. return lexer_def_(def, state_);
  213. }
  214. lexer_def_ operator()(char_type const* state_
  215. , char_type const* targetstate_) const
  216. {
  217. return lexer_def_(def, state_, targetstate_);
  218. }
  219. lexer_def_ operator()(string_type const& state_
  220. , string_type const& targetstate_ = string_type()) const
  221. {
  222. return lexer_def_(def, state_, targetstate_);
  223. }
  224. // allow to assign a token definition expression
  225. template <typename Expr>
  226. lexer_def_& operator= (Expr const& xpr)
  227. {
  228. // Report invalid expression error as early as possible.
  229. // If you got an error_invalid_expression error message here,
  230. // then the expression (expr) is not a valid spirit lex
  231. // expression.
  232. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  233. def.clear(state.c_str());
  234. define(xpr);
  235. return *this;
  236. }
  237. // explicitly tell the lexer that the given state will be defined
  238. // (useful in conjunction with "*")
  239. std::size_t add_state(char_type const* state_ = 0)
  240. {
  241. return def.add_state(state_ ? state_ : def.initial_state().c_str());
  242. }
  243. adder add;
  244. pattern_adder add_pattern;
  245. private:
  246. LexerDef& def;
  247. string_type state;
  248. string_type targetstate;
  249. };
  250. #ifdef _MSC_VER
  251. # pragma warning(pop)
  252. #endif
  253. #if defined(BOOST_NO_CXX11_RVALUE_REFERENCES)
  254. // allow to assign a token definition expression
  255. template <typename LexerDef, typename Expr>
  256. inline lexer_def_<LexerDef>&
  257. operator+= (lexer_def_<LexerDef>& lexdef, Expr& xpr)
  258. {
  259. // Report invalid expression error as early as possible.
  260. // If you got an error_invalid_expression error message here,
  261. // then the expression (expr) is not a valid spirit lex
  262. // expression.
  263. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  264. lexdef.define(xpr);
  265. return lexdef;
  266. }
  267. #else
  268. // allow to assign a token definition expression
  269. template <typename LexerDef, typename Expr>
  270. inline lexer_def_<LexerDef>&
  271. operator+= (lexer_def_<LexerDef>& lexdef, Expr&& xpr)
  272. {
  273. // Report invalid expression error as early as possible.
  274. // If you got an error_invalid_expression error message here,
  275. // then the expression (expr) is not a valid spirit lex
  276. // expression.
  277. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  278. lexdef.define(xpr);
  279. return lexdef;
  280. }
  281. #endif
  282. template <typename LexerDef, typename Expr>
  283. inline lexer_def_<LexerDef>&
  284. operator+= (lexer_def_<LexerDef>& lexdef, Expr const& xpr)
  285. {
  286. // Report invalid expression error as early as possible.
  287. // If you got an error_invalid_expression error message here,
  288. // then the expression (expr) is not a valid spirit lex
  289. // expression.
  290. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  291. lexdef.define(xpr);
  292. return lexdef;
  293. }
  294. }
  ///////////////////////////////////////////////////////////////////////////
  // Flags selecting the matching mode of the lexer. They are wrapped in a
  // struct so that the enumerators are spelled match_flags::xxx.
  struct match_flags
  {
      enum enum_type
      {
          // no flags
          match_default = 0,
          // the regex '.' doesn't match newlines
          match_not_dot_newline = 1,
          // all matching operations are case insensitive
          match_icase = 2
      };
  };
  307. ///////////////////////////////////////////////////////////////////////////
  308. // This represents a lexer object
  309. ///////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////
  // Token ids the library assigns automatically (when needed) start at
  // min_token_id.
  enum tokenids
  {
      min_token_id = 0x10000
  };
  317. template <typename Lexer>
  318. class lexer : public Lexer
  319. {
  320. private:
  321. // avoid warnings about using 'this' in constructor
  322. lexer& this_() { return *this; }
  323. std::size_t next_token_id; // has to be an integral type
  324. public:
  325. typedef Lexer lexer_type;
  326. typedef typename Lexer::id_type id_type;
  327. typedef typename Lexer::char_type char_type;
  328. typedef typename Lexer::iterator_type iterator_type;
  329. typedef lexer base_type;
  330. typedef detail::lexer_def_<lexer> lexer_def;
  331. typedef std::basic_string<char_type> string_type;
  332. // if `id_type` was specified but `first_id` is not provided
  333. // the `min_token_id` value may be out of range for `id_type`,
  334. // but it will be a problem only if unique ids feature is in use.
  335. lexer(unsigned int flags = match_flags::match_default)
  336. : lexer_type(flags)
  337. , next_token_id(min_token_id)
  338. , self(this_(), lexer_type::initial_state())
  339. {}
  340. lexer(unsigned int flags, id_type first_id)
  341. : lexer_type(flags)
  342. , next_token_id(first_id)
  343. , self(this_(), lexer_type::initial_state())
  344. {}
  345. // access iterator interface
  346. template <typename Iterator>
  347. iterator_type begin(Iterator& first, Iterator const& last
  348. , char_type const* initial_state = 0) const
  349. { return this->lexer_type::begin(first, last, initial_state); }
  350. iterator_type end() const
  351. { return this->lexer_type::end(); }
  352. std::size_t map_state(char_type const* state)
  353. { return this->lexer_type::add_state(state); }
  354. // create a unique token id
  355. id_type get_next_id() { return id_type(next_token_id++); }
  356. lexer_def self; // allow for easy token definition
  357. };
  358. }}}
  359. #endif