ascii_number.hpp 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. // Copyright 2020-2023 Daniel Lemire
  2. // Copyright 2023 Matt Borland
  3. // Distributed under the Boost Software License, Version 1.0.
  4. // https://www.boost.org/LICENSE_1_0.txt
  5. //
  6. // Derivative of: https://github.com/fastfloat/fast_float
  7. #ifndef BOOST_CHARCONV_DETAIL_FASTFLOAT_ASCII_NUMBER_HPP
  8. #define BOOST_CHARCONV_DETAIL_FASTFLOAT_ASCII_NUMBER_HPP
  9. #include <boost/charconv/detail/fast_float/float_common.hpp>
  10. #include <cctype>
  11. #include <cstdint>
  12. #include <cstring>
  13. #include <iterator>
  14. namespace boost { namespace charconv { namespace detail { namespace fast_float {
  15. // Next function can be micro-optimized, but compilers are entirely
  16. // able to optimize it well.
  17. template <typename UC>
  18. BOOST_FORCEINLINE constexpr bool is_integer(UC c) noexcept {
  19. return !(c > UC('9') || c < UC('0'));
  20. }
  21. BOOST_FORCEINLINE constexpr uint64_t byteswap(uint64_t val) {
  22. return (val & 0xFF00000000000000) >> 56
  23. | (val & 0x00FF000000000000) >> 40
  24. | (val & 0x0000FF0000000000) >> 24
  25. | (val & 0x000000FF00000000) >> 8
  26. | (val & 0x00000000FF000000) << 8
  27. | (val & 0x0000000000FF0000) << 24
  28. | (val & 0x000000000000FF00) << 40
  29. | (val & 0x00000000000000FF) << 56;
  30. }
  31. BOOST_FORCEINLINE BOOST_CHARCONV_FASTFLOAT_CONSTEXPR20
  32. uint64_t read_u64(const char *chars) {
  33. if (cpp20_and_in_constexpr()) {
  34. uint64_t val = 0;
  35. for(int i = 0; i < 8; ++i) {
  36. val |= uint64_t(*chars) << (i*8);
  37. ++chars;
  38. }
  39. return val;
  40. }
  41. uint64_t val;
  42. ::memcpy(&val, chars, sizeof(uint64_t));
  43. #if BOOST_CHARCONV_FASTFLOAT_IS_BIG_ENDIAN == 1
  44. // Need to read as-if the number was in little-endian order.
  45. val = byteswap(val);
  46. #endif
  47. return val;
  48. }
  49. BOOST_FORCEINLINE BOOST_CHARCONV_FASTFLOAT_CONSTEXPR20
  50. void write_u64(uint8_t *chars, uint64_t val) {
  51. if (cpp20_and_in_constexpr()) {
  52. for(int i = 0; i < 8; ++i) {
  53. *chars = uint8_t(val);
  54. val >>= 8;
  55. ++chars;
  56. }
  57. return;
  58. }
  59. #if BOOST_CHARCONV_FASTFLOAT_IS_BIG_ENDIAN == 1
  60. // Need to read as-if the number was in little-endian order.
  61. val = byteswap(val);
  62. #endif
  63. ::memcpy(chars, &val, sizeof(uint64_t));
  64. }
  65. // credit @aqrit
  66. BOOST_FORCEINLINE BOOST_CHARCONV_FASTFLOAT_CONSTEXPR14
  67. uint32_t parse_eight_digits_unrolled(uint64_t val) {
  68. constexpr uint64_t mask = 0x000000FF000000FF;
  69. constexpr uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
  70. constexpr uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
  71. val -= 0x3030303030303030;
  72. val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
  73. val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32;
  74. return uint32_t(val);
  75. }
  76. BOOST_FORCEINLINE constexpr
  77. uint32_t parse_eight_digits_unrolled(const char16_t *) noexcept {
  78. return 0;
  79. }
  80. BOOST_FORCEINLINE constexpr
  81. uint32_t parse_eight_digits_unrolled(const char32_t *) noexcept {
  82. return 0;
  83. }
  84. BOOST_FORCEINLINE BOOST_CHARCONV_FASTFLOAT_CONSTEXPR20
  85. uint32_t parse_eight_digits_unrolled(const char *chars) noexcept {
  86. return parse_eight_digits_unrolled(read_u64(chars));
  87. }
  88. // credit @aqrit
  89. BOOST_FORCEINLINE constexpr bool is_made_of_eight_digits_fast(uint64_t val) noexcept {
  90. return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) & 0x8080808080808080));
  91. }
  92. BOOST_FORCEINLINE constexpr
  93. bool is_made_of_eight_digits_fast(const char16_t *) noexcept {
  94. return false;
  95. }
  96. BOOST_FORCEINLINE constexpr
  97. bool is_made_of_eight_digits_fast(const char32_t *) noexcept {
  98. return false;
  99. }
  100. BOOST_FORCEINLINE BOOST_CHARCONV_FASTFLOAT_CONSTEXPR20
  101. bool is_made_of_eight_digits_fast(const char *chars) noexcept {
  102. return is_made_of_eight_digits_fast(read_u64(chars));
  103. }
  104. template <typename UC>
  105. struct parsed_number_string_t {
  106. int64_t exponent{0};
  107. uint64_t mantissa{0};
  108. UC const * lastmatch{nullptr};
  109. bool negative{false};
  110. bool valid{false};
  111. bool too_many_digits{false};
  112. // contains the range of the significant digits
  113. span<const UC> integer{}; // non-nullable
  114. span<const UC> fraction{}; // nullable
  115. };
  116. using byte_span = span<char>;
  117. using parsed_number_string = parsed_number_string_t<char>;
  118. // Assuming that you use no more than 19 digits, this will
  119. // parse an ASCII string.
  120. template <typename UC>
  121. BOOST_FORCEINLINE BOOST_CHARCONV_FASTFLOAT_CONSTEXPR20
  122. parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, parse_options_t<UC> options) noexcept {
  123. chars_format const fmt = options.format;
  124. UC const decimal_point = options.decimal_point;
  125. parsed_number_string_t<UC> answer;
  126. answer.valid = false;
  127. answer.too_many_digits = false;
  128. answer.negative = (*p == UC('-'));
  129. #ifdef BOOST_CHARCONV_FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default
  130. if ((*p == UC('-')) || (*p == UC('+')))
  131. #else
  132. if (*p == UC('-')) // C++17 20.19.3.(7.1) explicitly forbids '+' sign here
  133. #endif
  134. {
  135. ++p;
  136. if (p == pend) {
  137. return answer;
  138. }
  139. if (!is_integer(*p) && (*p != decimal_point)) { // a sign must be followed by an integer or the dot
  140. return answer;
  141. }
  142. }
  143. UC const * const start_digits = p;
  144. uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
  145. while ((p != pend) && is_integer(*p)) {
  146. // a multiplication by 10 is cheaper than an arbitrary integer
  147. // multiplication
  148. i = 10 * i +
  149. uint64_t(*p - UC('0')); // might overflow, we will handle the overflow later
  150. ++p;
  151. }
  152. UC const * const end_of_integer_part = p;
  153. int64_t digit_count = int64_t(end_of_integer_part - start_digits);
  154. answer.integer = span<const UC>(start_digits, size_t(digit_count));
  155. int64_t exponent = 0;
  156. if ((p != pend) && (*p == decimal_point)) {
  157. ++p;
  158. UC const * before = p;
  159. // can occur at most twice without overflowing, but let it occur more, since
  160. // for integers with many digits, digit parsing is the primary bottleneck.
  161. if (std::is_same<UC,char>::value) {
  162. while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
  163. i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
  164. p += 8;
  165. }
  166. }
  167. while ((p != pend) && is_integer(*p)) {
  168. uint8_t digit = uint8_t(*p - UC('0'));
  169. ++p;
  170. i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
  171. }
  172. exponent = before - p;
  173. answer.fraction = span<const UC>(before, size_t(p - before));
  174. digit_count -= exponent;
  175. }
  176. // we must have encountered at least one integer!
  177. if (digit_count == 0) {
  178. return answer;
  179. }
  180. int64_t exp_number = 0; // explicit exponential part
  181. if ((static_cast<unsigned>(fmt) & static_cast<unsigned>(chars_format::scientific)) && (p != pend) && ((UC('e') == *p) || (UC('E') == *p))) {
  182. UC const * location_of_e = p;
  183. ++p;
  184. bool neg_exp = false;
  185. if ((p != pend) && (UC('-') == *p)) {
  186. neg_exp = true;
  187. ++p;
  188. } else if ((p != pend) && (UC('+') == *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1)
  189. ++p;
  190. }
  191. if ((p == pend) || !is_integer(*p)) {
  192. if(!(static_cast<unsigned>(fmt) & static_cast<unsigned>(chars_format::fixed))) {
  193. // We are in error.
  194. return answer;
  195. }
  196. // Otherwise, we will be ignoring the 'e'.
  197. p = location_of_e;
  198. } else {
  199. while ((p != pend) && is_integer(*p)) {
  200. uint8_t digit = uint8_t(*p - UC('0'));
  201. if (exp_number < 0x10000000) {
  202. exp_number = 10 * exp_number + digit;
  203. }
  204. ++p;
  205. }
  206. if(neg_exp) { exp_number = - exp_number; }
  207. exponent += exp_number;
  208. }
  209. } else {
  210. // If it scientific and not fixed, we have to bail out.
  211. if((static_cast<unsigned>(fmt) & static_cast<unsigned>(chars_format::scientific)) &&
  212. !(static_cast<unsigned>(fmt) & static_cast<unsigned>(chars_format::fixed)))
  213. {
  214. return answer;
  215. }
  216. }
  217. answer.lastmatch = p;
  218. answer.valid = true;
  219. // If we frequently had to deal with long strings of digits,
  220. // we could extend our code by using a 128-bit integer instead
  221. // of a 64-bit integer. However, this is uncommon.
  222. //
  223. // We can deal with up to 19 digits.
  224. if (digit_count > 19) { // this is uncommon
  225. // It is possible that the integer had an overflow.
  226. // We have to handle the case where we have 0.0000somenumber.
  227. // We need to be mindful of the case where we only have zeroes...
  228. // E.g., 0.000000000...000.
  229. UC const * start = start_digits;
  230. while ((start != pend) && (*start == UC('0') || *start == decimal_point)) {
  231. if(*start == UC('0')) { digit_count --; }
  232. start++;
  233. }
  234. if (digit_count > 19) {
  235. answer.too_many_digits = true;
  236. // Let us start again, this time, avoiding overflows.
  237. // We don't need to check if is_integer, since we use the
  238. // pre-tokenized spans from above.
  239. i = 0;
  240. p = answer.integer.ptr;
  241. UC const * int_end = p + answer.integer.len();
  242. constexpr uint64_t minimal_nineteen_digit_integer{1000000000000000000};
  243. while((i < minimal_nineteen_digit_integer) && (p != int_end)) {
  244. i = i * 10 + uint64_t(*p - UC('0'));
  245. ++p;
  246. }
  247. if (i >= minimal_nineteen_digit_integer) { // We have a big integers
  248. exponent = end_of_integer_part - p + exp_number;
  249. } else { // We have a value with a fractional component.
  250. p = answer.fraction.ptr;
  251. UC const * frac_end = p + answer.fraction.len();
  252. while((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
  253. i = i * 10 + uint64_t(*p - UC('0'));
  254. ++p;
  255. }
  256. exponent = answer.fraction.ptr - p + exp_number;
  257. }
  258. // We have now corrected both exponent and i, to a truncated value
  259. }
  260. }
  261. answer.exponent = exponent;
  262. answer.mantissa = i;
  263. return answer;
  264. }
  265. }}}} // namespace s
  266. #endif