character_set.ipp 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. //
  2. // Copyright (c) 2019-2024 Ruben Perez Hidalgo (rubenperez038 at gmail dot com)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. //
  7. #ifndef BOOST_MYSQL_IMPL_CHARACTER_SET_IPP
  8. #define BOOST_MYSQL_IMPL_CHARACTER_SET_IPP
  9. #pragma once
  10. #include <boost/mysql/character_set.hpp>
  11. #include <boost/assert.hpp>
  12. namespace boost {
  13. namespace mysql {
  14. namespace detail {
  15. inline bool in_range(unsigned char byte, unsigned char lower, unsigned char upper)
  16. {
  17. return byte >= lower && byte <= upper;
  18. }
  19. } // namespace detail
  20. } // namespace mysql
  21. } // namespace boost
  22. std::size_t boost::mysql::detail::next_char_utf8mb4(span<const unsigned char> input)
  23. {
  24. // s[0] s[1] s[2] s[3] comment
  25. // 00-7F ascii
  26. // 80-c1 invalid
  27. // c2-df 80-bf 2byte
  28. // e0 a0-bf 80-bf 3byte, case 1
  29. // e1-ec 80-bf 80-bf 3byte, case 2
  30. // ed 80-9f 80-bf 3byte, case 3 (surrogates)
  31. // ee-ef 80-bf 80-bf 3byte, case 2
  32. // f0 90-bf 80-bf 80-bf 4byte, case 1
  33. // f1-f3 80-bf 80-bf 80-bf 4byte, case 2
  34. // f4 80-8f 80-bf 80-bf 4byte, case 3
  35. BOOST_ASSERT(!input.empty());
  36. auto first_char = input.front();
  37. BOOST_ASSERT(first_char >= 0x80); // ascii range covered by call_next_char
  38. if (first_char < 0xc2)
  39. {
  40. return 0;
  41. }
  42. else if (first_char < 0xe0)
  43. {
  44. return (input.size() < 2u || !in_range(input[1], 0x80, 0xbf)) ? 0 : 2;
  45. }
  46. else if (first_char == 0xe0)
  47. {
  48. return (input.size() < 3u || !in_range(input[1], 0xa0, 0xbf) || !in_range(input[2], 0x80, 0xbf)) ? 0
  49. : 3;
  50. }
  51. else if (first_char == 0xed)
  52. {
  53. return (input.size() < 3u || !in_range(input[1], 0x80, 0x9f) || !in_range(input[2], 0x80, 0xbf)) ? 0
  54. : 3;
  55. }
  56. else if (first_char <= 0xef)
  57. {
  58. // Includes e1-ec and ee-ef
  59. return (input.size() < 3u || !in_range(input[1], 0x80, 0xbf) || !in_range(input[2], 0x80, 0xbf)) ? 0
  60. : 3;
  61. }
  62. else if (first_char == 0xf0)
  63. {
  64. return (input.size() < 4u || !in_range(input[1], 0x90, 0xbf) || !in_range(input[2], 0x80, 0xbf) ||
  65. !in_range(input[3], 0x80, 0xbf))
  66. ? 0
  67. : 4;
  68. }
  69. else if (first_char <= 0xf3)
  70. {
  71. return (input.size() < 4u || !in_range(input[1], 0x80, 0xbf) || !in_range(input[2], 0x80, 0xbf) ||
  72. !in_range(input[3], 0x80, 0xbf))
  73. ? 0
  74. : 4;
  75. }
  76. else if (first_char == 0xf4)
  77. {
  78. return (input.size() < 4u || !in_range(input[1], 0x80, 0x8f) || !in_range(input[2], 0x80, 0xbf) ||
  79. !in_range(input[3], 0x80, 0xbf))
  80. ? 0
  81. : 4;
  82. }
  83. else
  84. {
  85. return 0;
  86. }
  87. }
  88. #endif