lexer.hpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. // Copyright (c) 2001-2011 Hartmut Kaiser
  2. //
  3. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  4. // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. #if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM)
  6. #define BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM
  7. #if defined(_MSC_VER)
  8. #pragma once
  9. #endif
  10. #include <iosfwd>
  11. #include <boost/spirit/home/support/detail/lexer/generator.hpp>
  12. #include <boost/spirit/home/support/detail/lexer/rules.hpp>
  13. #include <boost/spirit/home/support/detail/lexer/consts.hpp>
  14. #include <boost/spirit/home/support/unused.hpp>
  15. #include <boost/spirit/home/lex/lexer/lexertl/token.hpp>
  16. #include <boost/spirit/home/lex/lexer/lexertl/functor.hpp>
  17. #include <boost/spirit/home/lex/lexer/lexertl/functor_data.hpp>
  18. #include <boost/spirit/home/lex/lexer/lexertl/iterator.hpp>
  19. #if defined(BOOST_SPIRIT_LEXERTL_DEBUG)
  20. #include <boost/spirit/home/support/detail/lexer/debug.hpp>
  21. #endif
  22. #include <iterator> // for std::iterator_traits
  23. namespace boost { namespace spirit { namespace lex { namespace lexertl
  24. {
  25. ///////////////////////////////////////////////////////////////////////////
  26. namespace detail
  27. {
  28. ///////////////////////////////////////////////////////////////////////
  29. // The must_escape function checks if the given character value needs
  30. // to be preceded by a backslash character to disable its special
  31. // meaning in the context of a regular expression
  32. ///////////////////////////////////////////////////////////////////////
  33. template <typename Char>
  34. inline bool must_escape(Char c)
  35. {
  36. // FIXME: more needed?
  37. switch (c) {
  38. case '+': case '/': case '*': case '?':
  39. case '|':
  40. case '(': case ')':
  41. case '[': case ']':
  42. case '{': case '}':
  43. case '.':
  44. case '^': case '$':
  45. case '\\':
  46. case '"':
  47. return true;
  48. default:
  49. break;
  50. }
  51. return false;
  52. }
  53. ///////////////////////////////////////////////////////////////////////
  54. // The escape function returns the string representation of the given
  55. // character value, possibly escaped with a backslash character, to
  56. // allow it being safely used in a regular expression definition.
  57. ///////////////////////////////////////////////////////////////////////
  58. template <typename Char>
  59. inline std::basic_string<Char> escape(Char ch)
  60. {
  61. std::basic_string<Char> result(1, ch);
  62. if (detail::must_escape(ch))
  63. {
  64. typedef typename std::basic_string<Char>::size_type size_type;
  65. result.insert((size_type)0, 1, '\\');
  66. }
  67. return result;
  68. }
  69. ///////////////////////////////////////////////////////////////////////
  70. //
  71. ///////////////////////////////////////////////////////////////////////
  72. inline boost::lexer::regex_flags map_flags(unsigned int flags)
  73. {
  74. unsigned int retval = boost::lexer::none;
  75. if (flags & match_flags::match_not_dot_newline)
  76. retval |= boost::lexer::dot_not_newline;
  77. if (flags & match_flags::match_icase)
  78. retval |= boost::lexer::icase;
  79. return boost::lexer::regex_flags(retval);
  80. }
  81. }
  82. ///////////////////////////////////////////////////////////////////////////
  83. template <typename Lexer, typename F>
  84. bool generate_static(Lexer const&
  85. , std::basic_ostream<typename Lexer::char_type>&
  86. , typename Lexer::char_type const*, F);
  87. ///////////////////////////////////////////////////////////////////////////
  88. //
  89. // Every lexer type to be used as a lexer for Spirit has to conform to
  90. // the following public interface:
  91. //
  92. // typedefs:
  93. // iterator_type The type of the iterator exposed by this lexer.
  94. // token_type The type of the tokens returned from the exposed
  95. // iterators.
  96. //
  97. // functions:
  98. // default constructor
  99. // Since lexers are instantiated as base classes
  100. // only it might be a good idea to make this
  101. // constructor protected.
  102. // begin, end Return a pair of iterators, when dereferenced
  103. // returning the sequence of tokens recognized in
  104. // the input stream given as the parameters to the
  105. // begin() function.
  106. // add_token Should add the definition of a token to be
  107. // recognized by this lexer.
  108. // clear Should delete all current token definitions
  109. // associated with the given state of this lexer
  110. // object.
  111. //
  112. // template parameters:
  113. // Iterator The type of the iterator used to access the
  114. // underlying character stream.
  115. // Token The type of the tokens to be returned from the
  116. // exposed token iterator.
  117. // Functor The type of the InputPolicy to use to instantiate
  118. // the multi_pass iterator type to be used as the
  119. // token iterator (returned from begin()/end()).
  120. //
  121. ///////////////////////////////////////////////////////////////////////////
  122. ///////////////////////////////////////////////////////////////////////////
  123. //
  124. // The lexer class is an implementation of a Spirit.Lex lexer on
  125. // top of Ben Hanson's lexertl library as outlined above (For more
  126. // information about lexertl go here: http://www.benhanson.net/lexertl.html).
  127. //
  128. // This class is supposed to be used as the first and only template
  129. // parameter while instantiating instances of a lex::lexer class.
  130. //
  131. ///////////////////////////////////////////////////////////////////////////
  132. template <typename Token = token<>
  133. , typename Iterator = typename Token::iterator_type
  134. , typename Functor = functor<Token, lexertl::detail::data, Iterator> >
  135. class lexer
  136. {
  137. private:
  138. struct dummy { void true_() {} };
  139. typedef void (dummy::*safe_bool)();
  140. static std::size_t const all_states_id = static_cast<std::size_t>(-2);
  141. public:
  142. operator safe_bool() const
  143. { return initialized_dfa_ ? &dummy::true_ : 0; }
  144. typedef typename std::iterator_traits<Iterator>::value_type char_type;
  145. typedef std::basic_string<char_type> string_type;
  146. typedef boost::lexer::basic_rules<char_type> basic_rules_type;
  147. // Every lexer type to be used as a lexer for Spirit has to conform to
  148. // a public interface .
  149. typedef Token token_type;
  150. typedef typename Token::id_type id_type;
  151. typedef iterator<Functor> iterator_type;
  152. private:
  153. #ifdef _MSC_VER
  154. # pragma warning(push)
  155. # pragma warning(disable: 4512) // assignment operator could not be generated.
  156. #endif
  157. // this type is purely used for the iterator_type construction below
  158. struct iterator_data_type
  159. {
  160. typedef typename Functor::semantic_actions_type semantic_actions_type;
  161. iterator_data_type(
  162. boost::lexer::basic_state_machine<char_type> const& sm
  163. , boost::lexer::basic_rules<char_type> const& rules
  164. , semantic_actions_type const& actions)
  165. : state_machine_(sm), rules_(rules), actions_(actions)
  166. {}
  167. boost::lexer::basic_state_machine<char_type> const& state_machine_;
  168. boost::lexer::basic_rules<char_type> const& rules_;
  169. semantic_actions_type const& actions_;
  170. };
  171. #ifdef _MSC_VER
  172. # pragma warning(pop)
  173. #endif
  174. public:
  175. // Return the start iterator usable for iterating over the generated
  176. // tokens.
  177. iterator_type begin(Iterator& first, Iterator const& last
  178. , char_type const* initial_state = 0) const
  179. {
  180. if (!init_dfa()) // never minimize DFA for dynamic lexers
  181. return iterator_type();
  182. iterator_data_type iterator_data(state_machine_, rules_, actions_);
  183. return iterator_type(iterator_data, first, last, initial_state);
  184. }
  185. // Return the end iterator usable to stop iterating over the generated
  186. // tokens.
  187. iterator_type end() const
  188. {
  189. return iterator_type();
  190. }
  191. protected:
  192. // Lexer instances can be created by means of a derived class only.
  193. lexer(unsigned int flags)
  194. : flags_(detail::map_flags(flags))
  195. , rules_(flags_)
  196. , initialized_dfa_(false)
  197. {}
  198. public:
  199. // interface for token definition management
  200. std::size_t add_token(char_type const* state, char_type tokendef,
  201. std::size_t token_id, char_type const* targetstate)
  202. {
  203. add_state(state);
  204. initialized_dfa_ = false;
  205. if (state == all_states())
  206. return rules_.add(state, detail::escape(tokendef), token_id, rules_.dot());
  207. if (0 == targetstate)
  208. targetstate = state;
  209. else
  210. add_state(targetstate);
  211. return rules_.add(state, detail::escape(tokendef), token_id, targetstate);
  212. }
  213. std::size_t add_token(char_type const* state, string_type const& tokendef,
  214. std::size_t token_id, char_type const* targetstate)
  215. {
  216. add_state(state);
  217. initialized_dfa_ = false;
  218. if (state == all_states())
  219. return rules_.add(state, tokendef, token_id, rules_.dot());
  220. if (0 == targetstate)
  221. targetstate = state;
  222. else
  223. add_state(targetstate);
  224. return rules_.add(state, tokendef, token_id, targetstate);
  225. }
  226. // interface for pattern definition management
  227. void add_pattern (char_type const* state, string_type const& name,
  228. string_type const& patterndef)
  229. {
  230. add_state(state);
  231. rules_.add_macro(name.c_str(), patterndef);
  232. initialized_dfa_ = false;
  233. }
  234. boost::lexer::rules const& get_rules() const { return rules_; }
  235. void clear(char_type const* state)
  236. {
  237. std::size_t s = rules_.state(state);
  238. if (boost::lexer::npos != s)
  239. rules_.clear(state);
  240. initialized_dfa_ = false;
  241. }
  242. std::size_t add_state(char_type const* state)
  243. {
  244. if (state == all_states())
  245. return all_states_id;
  246. std::size_t stateid = rules_.state(state);
  247. if (boost::lexer::npos == stateid) {
  248. stateid = rules_.add_state(state);
  249. initialized_dfa_ = false;
  250. }
  251. return stateid;
  252. }
  253. string_type initial_state() const
  254. {
  255. return string_type(rules_.initial());
  256. }
  257. string_type all_states() const
  258. {
  259. return string_type(rules_.all_states());
  260. }
  261. // Register a semantic action with the given id
  262. template <typename F>
  263. void add_action(std::size_t unique_id, std::size_t state, F act)
  264. {
  265. // If you see an error here stating add_action is not a member of
  266. // fusion::unused_type then you are probably having semantic actions
  267. // attached to at least one token in the lexer definition without
  268. // using the lex::lexertl::actor_lexer<> as its base class.
  269. typedef typename Functor::wrap_action_type wrapper_type;
  270. if (state == all_states_id) {
  271. // add the action to all known states
  272. typedef typename
  273. basic_rules_type::string_size_t_map::const_iterator
  274. state_iterator;
  275. std::size_t states = rules_.statemap().size();
  276. for (state_iterator it = rules_.statemap().begin(),
  277. end = rules_.statemap().end(); it != end; ++it) {
  278. for (std::size_t j = 0; j < states; ++j)
  279. actions_.add_action(unique_id + j, it->second, wrapper_type::call(act));
  280. }
  281. }
  282. else {
  283. actions_.add_action(unique_id, state, wrapper_type::call(act));
  284. }
  285. }
  286. // template <typename F>
  287. // void add_action(std::size_t unique_id, char_type const* state, F act)
  288. // {
  289. // typedef typename Functor::wrap_action_type wrapper_type;
  290. // actions_.add_action(unique_id, add_state(state), wrapper_type::call(act));
  291. // }
  292. // We do not minimize the state machine by default anymore because
  293. // Ben said: "If you can afford to generate a lexer at runtime, there
  294. // is little point in calling minimise."
  295. // Go figure.
  296. bool init_dfa(bool minimize = false) const
  297. {
  298. if (!initialized_dfa_) {
  299. state_machine_.clear();
  300. typedef boost::lexer::basic_generator<char_type> generator;
  301. generator::build (rules_, state_machine_);
  302. if (minimize)
  303. generator::minimise (state_machine_);
  304. #if defined(BOOST_SPIRIT_LEXERTL_DEBUG)
  305. boost::lexer::debug::dump(state_machine_, std::cerr);
  306. #endif
  307. initialized_dfa_ = true;
  308. // // release memory held by rules description
  309. // basic_rules_type rules;
  310. // rules.init_state_info(rules_); // preserve states
  311. // std::swap(rules, rules_);
  312. }
  313. return true;
  314. }
  315. private:
  316. // lexertl specific data
  317. mutable boost::lexer::basic_state_machine<char_type> state_machine_;
  318. boost::lexer::regex_flags flags_;
  319. /*mutable*/ basic_rules_type rules_;
  320. typename Functor::semantic_actions_type actions_;
  321. mutable bool initialized_dfa_;
  322. // generator functions must be able to access members directly
  323. template <typename Lexer, typename F>
  324. friend bool generate_static(Lexer const&
  325. , std::basic_ostream<typename Lexer::char_type>&
  326. , typename Lexer::char_type const*, F);
  327. };
  328. ///////////////////////////////////////////////////////////////////////////
  329. //
  330. // The actor_lexer class is another implementation of a Spirit.Lex
  331. // lexer on top of Ben Hanson's lexertl library as outlined above (For
  332. // more information about lexertl go here:
  333. // http://www.benhanson.net/lexertl.html).
  334. //
  335. // The only difference to the lexer class above is that token_def
  336. // definitions may have semantic (lexer) actions attached while being
  337. // defined:
  338. //
  339. // int w;
  340. // token_def word = "[^ \t\n]+";
  341. // self = word[++ref(w)]; // see example: word_count_lexer
  342. //
  343. // This class is supposed to be used as the first and only template
  344. // parameter while instantiating instances of a lex::lexer class.
  345. //
  346. ///////////////////////////////////////////////////////////////////////////
  347. template <typename Token = token<>
  348. , typename Iterator = typename Token::iterator_type
  349. , typename Functor = functor<Token, lexertl::detail::data, Iterator, mpl::true_> >
  350. class actor_lexer : public lexer<Token, Iterator, Functor>
  351. {
  352. protected:
  353. // Lexer instances can be created by means of a derived class only.
  354. actor_lexer(unsigned int flags)
  355. : lexer<Token, Iterator, Functor>(flags) {}
  356. };
  357. }}}}
  358. #endif