cpp_re2c_lexer.hpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
  1. /*=============================================================================
  2. Boost.Wave: A Standard compliant C++ preprocessor library
  3. Re2C based C++ lexer
  4. http://www.boost.org/
  5. Copyright (c) 2001-2012 Hartmut Kaiser. Distributed under the Boost
  6. Software License, Version 1.0. (See accompanying file
  7. LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  8. =============================================================================*/
  9. #if !defined(BOOST_CPP_RE2C_LEXER_HPP_B81A2629_D5B1_4944_A97D_60254182B9A8_INCLUDED)
  10. #define BOOST_CPP_RE2C_LEXER_HPP_B81A2629_D5B1_4944_A97D_60254182B9A8_INCLUDED
  11. #include <string>
  12. #include <cstdio>
  13. #include <cstdarg>
  14. #if defined(BOOST_SPIRIT_DEBUG)
  15. #include <iostream>
  16. #endif // defined(BOOST_SPIRIT_DEBUG)
  17. #include <boost/concept_check.hpp>
  18. #include <boost/assert.hpp>
  19. #include <boost/wave/wave_config.hpp>
  20. #include <boost/wave/language_support.hpp>
  21. #include <boost/wave/token_ids.hpp>
  22. #include <boost/wave/util/file_position.hpp>
  23. #include <boost/wave/cpplexer/validate_universal_char.hpp>
  24. #include <boost/wave/cpplexer/cpplexer_exceptions.hpp>
  25. #include <boost/wave/cpplexer/token_cache.hpp>
  26. #include <boost/wave/cpplexer/convert_trigraphs.hpp>
  27. #include <boost/wave/cpplexer/cpp_lex_interface.hpp>
  28. #include <boost/wave/cpplexer/re2clex/scanner.hpp>
  29. #include <boost/wave/cpplexer/re2clex/cpp_re.hpp>
  30. #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
  31. #include <boost/wave/cpplexer/detect_include_guards.hpp>
  32. #endif
  33. #include <boost/wave/cpplexer/cpp_lex_interface_generator.hpp>
  34. // this must occur after all of the includes and before any code appears
  35. #ifdef BOOST_HAS_ABI_HEADERS
  36. #include BOOST_ABI_PREFIX
  37. #endif
  38. ///////////////////////////////////////////////////////////////////////////////
  39. namespace boost {
  40. namespace wave {
  41. namespace cpplexer {
  42. namespace re2clex {
  43. ///////////////////////////////////////////////////////////////////////////////
  44. //
  45. // encapsulation of the re2c based cpp lexer
  46. //
  47. ///////////////////////////////////////////////////////////////////////////////
  48. template <typename IteratorT,
  49. typename PositionT = boost::wave::util::file_position_type,
  50. typename TokenT = lex_token<PositionT> >
  51. class lexer
  52. {
  53. public:
  54. typedef TokenT token_type;
  55. typedef typename token_type::string_type string_type;
  56. lexer(IteratorT const &first, IteratorT const &last,
  57. PositionT const &pos, boost::wave::language_support language_);
  58. ~lexer();
  59. token_type& get(token_type&);
  60. void set_position(PositionT const &pos)
  61. {
  62. // set position has to change the file name and line number only
  63. filename = pos.get_file();
  64. scanner.line = pos.get_line();
  65. // scanner.column = scanner.curr_column = pos.get_column();
  66. scanner.file_name = filename.c_str();
  67. }
  68. #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
  69. bool has_include_guards(std::string& guard_name) const
  70. {
  71. return guards.detected(guard_name);
  72. }
  73. #endif
  74. // error reporting from the re2c generated lexer
  75. static int report_error(Scanner<IteratorT> const* s, int code, char const *, ...);
  76. private:
  77. static char const *tok_names[];
  78. Scanner<IteratorT> scanner;
  79. string_type filename;
  80. string_type value;
  81. bool at_eof;
  82. boost::wave::language_support language;
  83. #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
  84. include_guards<token_type> guards;
  85. #endif
  86. #if BOOST_WAVE_SUPPORT_THREADING == 0
  87. static token_cache<string_type> const cache;
  88. #else
  89. token_cache<string_type> const cache;
  90. #endif
  91. };
  92. ///////////////////////////////////////////////////////////////////////////////
  93. // initialize cpp lexer
  94. template <typename IteratorT, typename PositionT, typename TokenT>
  95. inline
  96. lexer<IteratorT, PositionT, TokenT>::lexer(IteratorT const &first,
  97. IteratorT const &last, PositionT const &pos,
  98. boost::wave::language_support language_)
  99. : scanner(first, last),
  100. filename(pos.get_file()), at_eof(false), language(language_)
  101. #if BOOST_WAVE_SUPPORT_THREADING != 0
  102. , cache()
  103. #endif
  104. {
  105. using namespace std; // some systems have memset in std
  106. scanner.line = pos.get_line();
  107. scanner.column = scanner.curr_column = pos.get_column();
  108. scanner.error_proc = report_error;
  109. scanner.file_name = filename.c_str();
  110. #if BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
  111. scanner.enable_ms_extensions = true;
  112. #else
  113. scanner.enable_ms_extensions = false;
  114. #endif
  115. #if BOOST_WAVE_SUPPORT_VARIADICS_PLACEMARKERS != 0
  116. scanner.act_in_c99_mode = boost::wave::need_c99(language_);
  117. #endif
  118. #if BOOST_WAVE_SUPPORT_IMPORT_KEYWORD != 0
  119. scanner.enable_import_keyword = !boost::wave::need_c99(language_);
  120. #else
  121. scanner.enable_import_keyword = false;
  122. #endif
  123. scanner.detect_pp_numbers = boost::wave::need_prefer_pp_numbers(language_);
  124. scanner.single_line_only = boost::wave::need_single_line(language_);
  125. #if BOOST_WAVE_SUPPORT_CPP0X != 0
  126. scanner.act_in_cpp0x_mode = boost::wave::need_cpp0x(language_);
  127. #else
  128. scanner.act_in_cpp0x_mode = false;
  129. #endif
  130. #if BOOST_WAVE_SUPPORT_CPP2A != 0
  131. scanner.act_in_cpp2a_mode = boost::wave::need_cpp2a(language_);
  132. scanner.act_in_cpp0x_mode = boost::wave::need_cpp2a(language_)
  133. || boost::wave::need_cpp0x(language_);
  134. #else
  135. scanner.act_in_cpp2a_mode = false;
  136. #endif
  137. }
  138. template <typename IteratorT, typename PositionT, typename TokenT>
  139. inline
  140. lexer<IteratorT, PositionT, TokenT>::~lexer()
  141. {
  142. using namespace std; // some systems have free in std
  143. free(scanner.bot);
  144. }
  145. ///////////////////////////////////////////////////////////////////////////////
  146. // get the next token from the input stream
  147. template <typename IteratorT, typename PositionT, typename TokenT>
  148. inline TokenT&
  149. lexer<IteratorT, PositionT, TokenT>::get(TokenT& result)
  150. {
  151. if (at_eof)
  152. return result = token_type(); // return T_EOI
  153. std::size_t actline = scanner.line;
  154. token_id id = token_id(scan(&scanner));
  155. switch (id) {
  156. case T_IDENTIFIER:
  157. // test identifier characters for validity (throws if invalid chars found)
  158. value = string_type((char const *)scanner.tok,
  159. scanner.cur-scanner.tok);
  160. if (!boost::wave::need_no_character_validation(language))
  161. impl::validate_identifier_name(value, actline, scanner.column, filename);
  162. break;
  163. case T_STRINGLIT:
  164. case T_CHARLIT:
  165. case T_RAWSTRINGLIT:
  166. // test literal characters for validity (throws if invalid chars found)
  167. value = string_type((char const *)scanner.tok,
  168. scanner.cur-scanner.tok);
  169. if (boost::wave::need_convert_trigraphs(language))
  170. value = impl::convert_trigraphs(value);
  171. if (!boost::wave::need_no_character_validation(language))
  172. impl::validate_literal(value, actline, scanner.column, filename);
  173. break;
  174. case T_PP_HHEADER:
  175. case T_PP_QHEADER:
  176. case T_PP_INCLUDE:
  177. // convert to the corresponding ..._next token, if appropriate
  178. {
  179. value = string_type((char const *)scanner.tok,
  180. scanner.cur-scanner.tok);
  181. #if BOOST_WAVE_SUPPORT_INCLUDE_NEXT != 0
  182. // Skip '#' and whitespace and see whether we find an 'include_next' here.
  183. typename string_type::size_type start = value.find("include");
  184. if (value.compare(start, 12, "include_next", 12) == 0)
  185. id = token_id(id | AltTokenType);
  186. #endif
  187. break;
  188. }
  189. case T_LONGINTLIT: // supported in C++11, C99 and long_long mode
  190. value = string_type((char const *)scanner.tok,
  191. scanner.cur-scanner.tok);
  192. if (!boost::wave::need_long_long(language)) {
  193. // syntax error: not allowed in C++ mode
  194. BOOST_WAVE_LEXER_THROW(lexing_exception, invalid_long_long_literal,
  195. value.c_str(), actline, scanner.column, filename.c_str());
  196. }
  197. break;
  198. case T_OCTALINT:
  199. case T_DECIMALINT:
  200. case T_HEXAINT:
  201. case T_INTLIT:
  202. case T_FLOATLIT:
  203. case T_FIXEDPOINTLIT:
  204. case T_CCOMMENT:
  205. case T_CPPCOMMENT:
  206. case T_SPACE:
  207. case T_SPACE2:
  208. case T_ANY:
  209. case T_PP_NUMBER:
  210. value = string_type((char const *)scanner.tok,
  211. scanner.cur-scanner.tok);
  212. break;
  213. case T_EOF:
  214. // T_EOF is returned as a valid token, the next call will return T_EOI,
  215. // i.e. the actual end of input
  216. at_eof = true;
  217. value.clear();
  218. break;
  219. case T_OR_TRIGRAPH:
  220. case T_XOR_TRIGRAPH:
  221. case T_LEFTBRACE_TRIGRAPH:
  222. case T_RIGHTBRACE_TRIGRAPH:
  223. case T_LEFTBRACKET_TRIGRAPH:
  224. case T_RIGHTBRACKET_TRIGRAPH:
  225. case T_COMPL_TRIGRAPH:
  226. case T_POUND_TRIGRAPH:
  227. if (boost::wave::need_convert_trigraphs(language)) {
  228. value = cache.get_token_value(BASEID_FROM_TOKEN(id));
  229. }
  230. else {
  231. value = string_type((char const *)scanner.tok,
  232. scanner.cur-scanner.tok);
  233. }
  234. break;
  235. case T_ANY_TRIGRAPH:
  236. if (boost::wave::need_convert_trigraphs(language)) {
  237. value = impl::convert_trigraph(
  238. string_type((char const *)scanner.tok,
  239. scanner.cur-scanner.tok));
  240. }
  241. else {
  242. value = string_type((char const *)scanner.tok,
  243. scanner.cur-scanner.tok);
  244. }
  245. break;
  246. default:
  247. if (CATEGORY_FROM_TOKEN(id) != EXTCATEGORY_FROM_TOKEN(id) ||
  248. IS_CATEGORY(id, UnknownTokenType))
  249. {
  250. value = string_type((char const *)scanner.tok,
  251. scanner.cur-scanner.tok);
  252. }
  253. else {
  254. value = cache.get_token_value(id);
  255. }
  256. break;
  257. }
  258. // std::cerr << boost::wave::get_token_name(id) << ": " << value << std::endl;
  259. // the re2c lexer reports the new line number for newline tokens
  260. result = token_type(id, value, PositionT(filename, actline, scanner.column));
  261. #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
  262. return guards.detect_guard(result);
  263. #else
  264. return result;
  265. #endif
  266. }
  267. template <typename IteratorT, typename PositionT, typename TokenT>
  268. inline int
  269. lexer<IteratorT, PositionT, TokenT>::report_error(Scanner<IteratorT> const *s, int errcode,
  270. char const *msg, ...)
  271. {
  272. BOOST_ASSERT(0 != s);
  273. BOOST_ASSERT(0 != msg);
  274. using namespace std; // some systems have vsnprintf in namespace std
  275. constexpr std::size_t bufsize = 200; // should be large enough
  276. char buffer[bufsize];
  277. va_list params;
  278. va_start(params, msg);
  279. vsnprintf(buffer, bufsize, msg, params);
  280. va_end(params);
  281. BOOST_WAVE_LEXER_THROW_VAR(lexing_exception, errcode, buffer, s->line,
  282. s->column, s->file_name);
  283. // BOOST_UNREACHABLE_RETURN(0);
  284. return 0;
  285. }
  286. ///////////////////////////////////////////////////////////////////////////////
  287. //
  288. // lex_functor
  289. //
  290. ///////////////////////////////////////////////////////////////////////////////
  291. template <typename IteratorT,
  292. typename PositionT = boost::wave::util::file_position_type,
  293. typename TokenT = typename lexer<IteratorT, PositionT>::token_type>
  294. class lex_functor
  295. : public lex_input_interface_generator<TokenT>
  296. {
  297. public:
  298. typedef TokenT token_type;
  299. lex_functor(IteratorT const &first, IteratorT const &last,
  300. PositionT const &pos, boost::wave::language_support language)
  301. : re2c_lexer(first, last, pos, language)
  302. {}
  303. virtual ~lex_functor() {}
  304. // get the next token from the input stream
  305. token_type& get(token_type& result) BOOST_OVERRIDE { return re2c_lexer.get(result); }
  306. void set_position(PositionT const &pos) BOOST_OVERRIDE { re2c_lexer.set_position(pos); }
  307. #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
  308. bool has_include_guards(std::string& guard_name) const BOOST_OVERRIDE
  309. { return re2c_lexer.has_include_guards(guard_name); }
  310. #endif
  311. private:
  312. lexer<IteratorT, PositionT, TokenT> re2c_lexer;
  313. };
  314. #if BOOST_WAVE_SUPPORT_THREADING == 0
  315. ///////////////////////////////////////////////////////////////////////////////
  316. template <typename IteratorT, typename PositionT, typename TokenT>
  317. token_cache<typename lexer<IteratorT, PositionT, TokenT>::string_type> const
  318. lexer<IteratorT, PositionT, TokenT>::cache =
  319. token_cache<typename lexer<IteratorT, PositionT, TokenT>::string_type>();
  320. #endif
  321. } // namespace re2clex
  322. ///////////////////////////////////////////////////////////////////////////////
  323. //
  324. // The new_lexer_gen<>::new_lexer function (declared in cpp_lex_interface.hpp)
  325. // should be defined inline, if the lex_functor shouldn't be instantiated
  326. // separately from the lex_iterator.
  327. //
  328. // Separate (explicit) instantiation helps to reduce compilation time.
  329. //
  330. ///////////////////////////////////////////////////////////////////////////////
  //  new_lexer is defined inline unless the lexer is instantiated separately.
  #if BOOST_WAVE_SEPARATE_LEXER_INSTANTIATION == 0
  #define BOOST_WAVE_RE2C_NEW_LEXER_INLINE inline
  #else
  #define BOOST_WAVE_RE2C_NEW_LEXER_INLINE
  #endif
  336. ///////////////////////////////////////////////////////////////////////////////
  337. //
  //  The 'new_lexer' function allows the opaque generation of a new lexer object.
  //  It is coupled to the iterator type so that the lexer and iterator
  //  configurations can be decoupled at compile time.
  //
  //  This function is declared inside the cpp_lex_token.hpp file, which is
  //  referenced both by the source file calling the lexer and by the source
  //  file which instantiates the lex_functor. It is defined here, however, so
  //  it will be instantiated only while compiling the source file which
  //  instantiates the lex_functor. While the cpp_re2c_token.hpp file may be
  //  included everywhere, this file (cpp_re2c_lexer.hpp) should be included
  //  only once. This decouples the lexer interface from the lexer
  //  implementation and reduces compilation time.
  350. //
  351. ///////////////////////////////////////////////////////////////////////////////
  352. template <typename IteratorT, typename PositionT, typename TokenT>
  353. BOOST_WAVE_RE2C_NEW_LEXER_INLINE
  354. lex_input_interface<TokenT> *
  355. new_lexer_gen<IteratorT, PositionT, TokenT>::new_lexer(IteratorT const &first,
  356. IteratorT const &last, PositionT const &pos,
  357. boost::wave::language_support language)
  358. {
  359. using re2clex::lex_functor;
  360. return new lex_functor<IteratorT, PositionT, TokenT>(first, last, pos, language);
  361. }
  362. #undef BOOST_WAVE_RE2C_NEW_LEXER_INLINE
  363. ///////////////////////////////////////////////////////////////////////////////
  364. } // namespace cpplexer
  365. } // namespace wave
  366. } // namespace boost
  367. // the suffix header occurs after all of the code
  368. #ifdef BOOST_HAS_ABI_HEADERS
  369. #include BOOST_ABI_SUFFIX
  370. #endif
  371. #endif // !defined(BOOST_CPP_RE2C_LEXER_HPP_B81A2629_D5B1_4944_A97D_60254182B9A8_INCLUDED)