utf8.h 45 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658
  1. // The latest version of this library is available on GitHub;
  2. // https://github.com/sheredom/utf8.h
  3. // This is free and unencumbered software released into the public domain.
  4. //
  5. // Anyone is free to copy, modify, publish, use, compile, sell, or
  6. // distribute this software, either in source code form or as a compiled
  7. // binary, for any purpose, commercial or non-commercial, and by any
  8. // means.
  9. //
  10. // In jurisdictions that recognize copyright laws, the author or authors
  11. // of this software dedicate any and all copyright interest in the
  12. // software to the public domain. We make this dedication for the benefit
  13. // of the public at large and to the detriment of our heirs and
  14. // successors. We intend this dedication to be an overt act of
  15. // relinquishment in perpetuity of all present and future rights to this
  16. // software under copyright law.
  17. //
  18. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19. // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20. // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21. // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  22. // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  23. // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  24. // OTHER DEALINGS IN THE SOFTWARE.
  25. //
  26. // For more information, please refer to <http://unlicense.org/>
  27. #ifndef SHEREDOM_UTF8_H_INCLUDED
  28. #define SHEREDOM_UTF8_H_INCLUDED
  29. #if defined(_MSC_VER)
  30. #pragma warning(push)
  31. /* disable warning: no function prototype given: converting '()' to '(void)' */
  32. #pragma warning(disable : 4255)
  33. /* disable warning: '__cplusplus' is not defined as a preprocessor macro,
  34. * replacing with '0' for '#if/#elif' */
  35. #pragma warning(disable : 4668)
  36. /* disable warning: bytes padding added after construct */
  37. #pragma warning(disable : 4820)
  38. #endif
  39. #include <stddef.h>
  40. #include <stdlib.h>
  41. #if defined(_MSC_VER)
  42. #pragma warning(pop)
  43. #endif
  44. #if defined(_MSC_VER) && (_MSC_VER < 1920)
  45. typedef __int32 utf8_int32_t;
  46. #else
  47. #include <stdint.h>
  48. typedef int32_t utf8_int32_t;
  49. #endif
  50. #if defined(__clang__)
  51. #pragma clang diagnostic push
  52. #pragma clang diagnostic ignored "-Wold-style-cast"
  53. #pragma clang diagnostic ignored "-Wcast-qual"
  54. #endif
  55. #ifdef __cplusplus
  56. extern "C" {
  57. #endif
  58. #if defined(_MSC_VER)
  59. #define utf8_nonnull
  60. #define utf8_pure
  61. #define utf8_restrict __restrict
  62. #define utf8_weak __inline
  63. #elif defined(__clang__) || defined(__GNUC__)
  64. #define utf8_nonnull __attribute__((nonnull))
  65. #define utf8_pure __attribute__((pure))
  66. #define utf8_restrict __restrict__
  67. #define utf8_weak __attribute__((weak))
  68. #else
  69. #error Non clang, non gcc, non MSVC compiler found!
  70. #endif
  71. #ifdef __cplusplus
  72. #define utf8_null NULL
  73. #else
  74. #define utf8_null 0
  75. #endif
  76. // Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
  77. // src2 respectively, case insensitive.
  78. utf8_nonnull utf8_pure utf8_weak int utf8casecmp(const void *src1,
  79. const void *src2);
  80. // Append the utf8 string src onto the utf8 string dst.
  81. utf8_nonnull utf8_weak void *utf8cat(void *utf8_restrict dst,
  82. const void *utf8_restrict src);
  83. // Find the first match of the utf8 codepoint chr in the utf8 string src.
  84. utf8_nonnull utf8_pure utf8_weak void *utf8chr(const void *src,
  85. utf8_int32_t chr);
  86. // Return less than 0, 0, greater than 0 if src1 < src2,
  87. // src1 == src2, src1 > src2 respectively.
  88. utf8_nonnull utf8_pure utf8_weak int utf8cmp(const void *src1,
  89. const void *src2);
  90. // Copy the utf8 string src onto the memory allocated in dst.
  91. utf8_nonnull utf8_weak void *utf8cpy(void *utf8_restrict dst,
  92. const void *utf8_restrict src);
  93. // Number of utf8 codepoints in the utf8 string src that consists entirely
  94. // of utf8 codepoints not from the utf8 string reject.
  95. utf8_nonnull utf8_pure utf8_weak size_t utf8cspn(const void *src,
  96. const void *reject);
  97. // Duplicate the utf8 string src by getting its size, malloc'ing a new buffer
  98. // copying over the data, and returning that. Or 0 if malloc failed.
  99. utf8_weak void *utf8dup(const void *src);
  100. // Number of utf8 codepoints in the utf8 string str,
  101. // excluding the null terminating byte.
  102. utf8_nonnull utf8_pure utf8_weak size_t utf8len(const void *str);
  103. // Similar to utf8len, except that at most n bytes of str are examined.
  104. utf8_nonnull utf8_pure utf8_weak size_t utf8nlen(const void *str, size_t n);
  105. // Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
  106. // src2 respectively, case insensitive. Checking at most n bytes of each utf8
  107. // string.
  108. utf8_nonnull utf8_pure utf8_weak int utf8ncasecmp(const void *src1,
  109. const void *src2, size_t n);
  110. // Append the utf8 string src onto the utf8 string dst,
  111. // writing at most n+1 bytes. Can produce an invalid utf8
  112. // string if n falls partway through a utf8 codepoint.
  113. utf8_nonnull utf8_weak void *utf8ncat(void *utf8_restrict dst,
  114. const void *utf8_restrict src, size_t n);
  115. // Return less than 0, 0, greater than 0 if src1 < src2,
  116. // src1 == src2, src1 > src2 respectively. Checking at most n
  117. // bytes of each utf8 string.
  118. utf8_nonnull utf8_pure utf8_weak int utf8ncmp(const void *src1,
  119. const void *src2, size_t n);
  120. // Copy the utf8 string src onto the memory allocated in dst.
  121. // Copies at most n bytes. If n falls partway through a utf8
  122. // codepoint, or if dst doesn't have enough room for a null
  123. // terminator, the final string will be cut short to preserve
  124. // utf8 validity.
  125. utf8_nonnull utf8_weak void *utf8ncpy(void *utf8_restrict dst,
  126. const void *utf8_restrict src, size_t n);
  127. // Similar to utf8dup, except that at most n bytes of src are copied. If src is
  128. // longer than n, only n bytes are copied and a null byte is added.
  129. //
  130. // Returns a new string if successful, 0 otherwise
  131. utf8_weak void *utf8ndup(const void *src, size_t n);
  132. // Locates the first occurrence in the utf8 string str of any codepoint in the
  133. // utf8 string accept, or returns 0 if no match was found.
  134. utf8_nonnull utf8_pure utf8_weak void *utf8pbrk(const void *str,
  135. const void *accept);
  136. // Find the last match of the utf8 codepoint chr in the utf8 string src.
  137. utf8_nonnull utf8_pure utf8_weak void *utf8rchr(const void *src, int chr);
  138. // Number of bytes in the utf8 string str,
  139. // including the null terminating byte.
  140. utf8_nonnull utf8_pure utf8_weak size_t utf8size(const void *str);
  141. // Similar to utf8size, except that the null terminating byte is excluded.
  142. utf8_nonnull utf8_pure utf8_weak size_t utf8size_lazy(const void *str);
  143. // Similar to utf8size, except that at most n bytes of str are examined and
  144. // the null terminating byte is excluded.
  145. utf8_nonnull utf8_pure utf8_weak size_t utf8nsize_lazy(const void *str, size_t n);
  146. // Number of utf8 codepoints in the utf8 string src that consists entirely
  147. // of utf8 codepoints from the utf8 string accept.
  148. utf8_nonnull utf8_pure utf8_weak size_t utf8spn(const void *src,
  149. const void *accept);
  150. // The position of the utf8 string needle in the utf8 string haystack.
  151. utf8_nonnull utf8_pure utf8_weak void *utf8str(const void *haystack,
  152. const void *needle);
  153. // The position of the utf8 string needle in the utf8 string haystack, case
  154. // insensitive.
  155. utf8_nonnull utf8_pure utf8_weak void *utf8casestr(const void *haystack,
  156. const void *needle);
  157. // Return 0 on success, or the position of the invalid
  158. // utf8 codepoint on failure.
  159. utf8_nonnull utf8_pure utf8_weak void *utf8valid(const void *str);
  160. // Similar to utf8valid, except that at most n bytes of str are examined.
  161. utf8_nonnull utf8_pure utf8_weak void *utf8nvalid(const void *str, size_t n);
  162. // Given a null-terminated string, makes the string valid by replacing invalid
  163. // codepoints with a 1-byte replacement. Returns 0 on success.
  164. utf8_nonnull utf8_weak int utf8makevalid(void *str,
  165. const utf8_int32_t replacement);
  166. // Sets out_codepoint to the current utf8 codepoint in str, and returns the
  167. // address of the next utf8 codepoint after the current one in str.
  168. utf8_nonnull utf8_weak void *
  169. utf8codepoint(const void *utf8_restrict str,
  170. utf8_int32_t *utf8_restrict out_codepoint);
  171. // Calculates the size of the next utf8 codepoint in str.
  172. utf8_nonnull utf8_weak size_t utf8codepointcalcsize(const void *str);
  173. // Returns the size of the given codepoint in bytes.
  174. utf8_weak size_t utf8codepointsize(utf8_int32_t chr);
  175. // Write a codepoint to the given string, and return the address to the next
  176. // place after the written codepoint. Pass how many bytes left in the buffer to
  177. // n. If there is not enough space for the codepoint, this function returns
  178. // null.
  179. utf8_nonnull utf8_weak void *utf8catcodepoint(void *str, utf8_int32_t chr,
  180. size_t n);
  181. // Returns 1 if the given character is lowercase, or 0 if it is not.
  182. utf8_weak int utf8islower(utf8_int32_t chr);
  183. // Returns 1 if the given character is uppercase, or 0 if it is not.
  184. utf8_weak int utf8isupper(utf8_int32_t chr);
  185. // Transform the given string into all lowercase codepoints.
  186. utf8_nonnull utf8_weak void utf8lwr(void *utf8_restrict str);
  187. // Transform the given string into all uppercase codepoints.
  188. utf8_nonnull utf8_weak void utf8upr(void *utf8_restrict str);
  189. // Make a codepoint lower case if possible.
  190. utf8_weak utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp);
  191. // Make a codepoint upper case if possible.
  192. utf8_weak utf8_int32_t utf8uprcodepoint(utf8_int32_t cp);
  193. // Sets out_codepoint to the current utf8 codepoint in str, and returns the
  194. // address of the previous utf8 codepoint before the current one in str.
  195. utf8_nonnull utf8_weak void *
  196. utf8rcodepoint(const void *utf8_restrict str,
  197. utf8_int32_t *utf8_restrict out_codepoint);
  198. // Duplicate the utf8 string src by getting its size, calling alloc_func_ptr to
  199. // copy over data to a new buffer, and returning that. Or 0 if alloc_func_ptr
  200. // returned null.
  201. utf8_weak void *utf8dup_ex(const void *src,
  202. void *(*alloc_func_ptr)(void *, size_t),
  203. void *user_data);
  204. // Similar to utf8dup_ex, except that at most n bytes of src are copied. If src
  205. // is longer than n, only n bytes are copied and a null byte is added.
  206. //
  207. // Returns a new string if successful, 0 otherwise.
  208. utf8_weak void *utf8ndup_ex(const void *src, size_t n,
  209. void *(*alloc_func_ptr)(void *, size_t),
  210. void *user_data);
  211. #undef utf8_weak
  212. #undef utf8_pure
  213. #undef utf8_nonnull
  214. int utf8casecmp(const void *src1, const void *src2) {
  215. utf8_int32_t src1_lwr_cp, src2_lwr_cp, src1_upr_cp, src2_upr_cp, src1_orig_cp,
  216. src2_orig_cp;
  217. for (;;) {
  218. src1 = utf8codepoint(src1, &src1_orig_cp);
  219. src2 = utf8codepoint(src2, &src2_orig_cp);
  220. // lower the srcs if required
  221. src1_lwr_cp = utf8lwrcodepoint(src1_orig_cp);
  222. src2_lwr_cp = utf8lwrcodepoint(src2_orig_cp);
  223. // lower the srcs if required
  224. src1_upr_cp = utf8uprcodepoint(src1_orig_cp);
  225. src2_upr_cp = utf8uprcodepoint(src2_orig_cp);
  226. // check if the lowered codepoints match
  227. if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
  228. return 0;
  229. } else if ((src1_lwr_cp == src2_lwr_cp) || (src1_upr_cp == src2_upr_cp)) {
  230. continue;
  231. }
  232. // if they don't match, then we return the difference between the characters
  233. return src1_lwr_cp - src2_lwr_cp;
  234. }
  235. }
  236. void *utf8cat(void *utf8_restrict dst, const void *utf8_restrict src) {
  237. char *d = (char *)dst;
  238. const char *s = (const char *)src;
  239. // find the null terminating byte in dst
  240. while ('\0' != *d) {
  241. d++;
  242. }
  243. // overwriting the null terminating byte in dst, append src byte-by-byte
  244. while ('\0' != *s) {
  245. *d++ = *s++;
  246. }
  247. // write out a new null terminating byte into dst
  248. *d = '\0';
  249. return dst;
  250. }
  251. void *utf8chr(const void *src, utf8_int32_t chr) {
  252. char c[5] = {'\0', '\0', '\0', '\0', '\0'};
  253. if (0 == chr) {
  254. // being asked to return position of null terminating byte, so
  255. // just run s to the end, and return!
  256. const char *s = (const char *)src;
  257. while ('\0' != *s) {
  258. s++;
  259. }
  260. return (void *)s;
  261. } else if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
  262. // 1-byte/7-bit ascii
  263. // (0b0xxxxxxx)
  264. c[0] = (char)chr;
  265. } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
  266. // 2-byte/11-bit utf8 code point
  267. // (0b110xxxxx 0b10xxxxxx)
  268. c[0] = 0xc0 | (char)(chr >> 6);
  269. c[1] = 0x80 | (char)(chr & 0x3f);
  270. } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
  271. // 3-byte/16-bit utf8 code point
  272. // (0b1110xxxx 0b10xxxxxx 0b10xxxxxx)
  273. c[0] = 0xe0 | (char)(chr >> 12);
  274. c[1] = 0x80 | (char)((chr >> 6) & 0x3f);
  275. c[2] = 0x80 | (char)(chr & 0x3f);
  276. } else { // if (0 == ((int)0xffe00000 & chr)) {
  277. // 4-byte/21-bit utf8 code point
  278. // (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx)
  279. c[0] = 0xf0 | (char)(chr >> 18);
  280. c[1] = 0x80 | (char)((chr >> 12) & 0x3f);
  281. c[2] = 0x80 | (char)((chr >> 6) & 0x3f);
  282. c[3] = 0x80 | (char)(chr & 0x3f);
  283. }
  284. // we've made c into a 2 utf8 codepoint string, one for the chr we are
  285. // seeking, another for the null terminating byte. Now use utf8str to
  286. // search
  287. return utf8str(src, c);
  288. }
  // Byte-wise comparison of two null-terminated utf8 strings, with bytes
  // compared as unsigned values.
  //
  // Returns -1 if src1 sorts before src2, 1 if it sorts after, 0 if equal.
  int utf8cmp(const void *src1, const void *src2) {
    const unsigned char *lhs = (const unsigned char *)src1;
    const unsigned char *rhs = (const unsigned char *)src2;

    for (;; lhs++, rhs++) {
      // first differing byte decides the ordering (a terminator sorts lowest)
      if (*lhs != *rhs) {
        return (*lhs < *rhs) ? -1 : 1;
      }

      // bytes are equal; if they are the terminator both strings ended together
      if ('\0' == *lhs) {
        return 0;
      }
    }
  }
  // NOTE(review): utf8coll is only declared here; no definition is visible in
  // this part of the file. Judging by the name alone it is presumably meant to
  // be a collating comparison of src1 and src2 -- TODO confirm it is defined
  // somewhere before calling it.
  304. int utf8coll(const void *src1, const void *src2);
  305. void *utf8cpy(void *utf8_restrict dst, const void *utf8_restrict src) {
  306. char *d = (char *)dst;
  307. const char *s = (const char *)src;
  308. // overwriting anything previously in dst, write byte-by-byte
  309. // from src
  310. while ('\0' != *s) {
  311. *d++ = *s++;
  312. }
  313. // append null terminating byte
  314. *d = '\0';
  315. return dst;
  316. }
  // Counts the codepoints at the start of src that do not occur in reject.
  //
  // For each codepoint of src, reject is scanned codepoint by codepoint; the
  // count stops at the first src codepoint that matches one in reject.
  //
  // Returns the number of leading codepoints of src not present in reject.
  size_t utf8cspn(const void *src, const void *reject) {
    const char *cursor = (const char *)src;
    size_t count = 0;

    for (; '\0' != *cursor; count++) {
      const char *candidate = (const char *)reject;
      size_t matched = 0;

      while ('\0' != *candidate) {
        // some bytes already matched and candidate now sits on the start of a
        // new codepoint (not 0b10xxxxxx): the previous codepoint matched fully
        if ((0 < matched) && (0x80 != (0xc0 & *candidate))) {
          return count;
        }

        if (*candidate == cursor[matched]) {
          // this byte of the codepoint agrees, move on to the next byte
          matched++;
          candidate++;
          continue;
        }

        // mismatch: skip the remainder of this reject codepoint and start the
        // comparison over with the next one
        do {
          candidate++;
        } while (0x80 == (0xc0 & *candidate));

        matched = 0;
      }

      // reject ended in the middle of a successful match, which had therefore
      // not been reported yet
      if (0 < matched) {
        return count;
      }

      // no reject codepoint matched: step src to the start of its next codepoint
      do {
        cursor++;
      } while (0x80 == (0xc0 & *cursor));
    }

    return count;
  }
  360. void *utf8dup(const void *src) { return utf8dup_ex(src, utf8_null, utf8_null); }
  361. void *utf8dup_ex(const void *src, void *(*alloc_func_ptr)(void *, size_t),
  362. void *user_data) {
  363. const char *s = (const char *)src;
  364. char *n = utf8_null;
  365. // figure out how many bytes (including the terminator) we need to copy first
  366. size_t bytes = utf8size(src);
  367. if (alloc_func_ptr) {
  368. n = (char *)alloc_func_ptr(user_data, bytes);
  369. } else {
  370. n = (char *)malloc(bytes);
  371. }
  372. if (utf8_null == n) {
  373. // out of memory so we bail
  374. return utf8_null;
  375. } else {
  376. bytes = 0;
  377. // copy src byte-by-byte into our new utf8 string
  378. while ('\0' != s[bytes]) {
  379. n[bytes] = s[bytes];
  380. bytes++;
  381. }
  382. // append null terminating byte
  383. n[bytes] = '\0';
  384. return n;
  385. }
  386. }
  // NOTE(review): utf8fry is only declared here; no definition is visible in
  // this part of the file, and its intended behaviour cannot be determined from
  // the declaration alone -- TODO confirm it is defined before calling it.
  387. void *utf8fry(const void *str);
  // Number of utf8 codepoints in the null-terminated string str. Implemented
  // as utf8nlen with the largest possible byte limit (SIZE_MAX).
  size_t utf8len(const void *str) {
    const size_t no_limit = SIZE_MAX;

    return utf8nlen(str, no_limit);
  }
  // Counts the utf8 codepoints in str, looking at no more than n bytes and
  // stopping at the null terminator.
  //
  // The width of each codepoint is taken from its leading byte. A codepoint
  // whose encoding would extend beyond the n-byte limit is not counted.
  //
  // A multi-byte sequence that is cut short by the null terminator is counted
  // as one codepoint and scanning stops at the terminator; the bytes that
  // follow the terminator are never read, even for malformed input.
  //
  // Returns the number of codepoints found.
  size_t utf8nlen(const void *str, size_t n) {
    const unsigned char *s = (const unsigned char *)str;
    size_t pos = 0;
    size_t length = 0;

    while ((pos < n) && ('\0' != s[pos])) {
      size_t width;
      size_t taken = 1;

      if (0xf0 == (0xf8 & s[pos])) {
        // 4-byte utf8 code point (began with 0b11110xxx)
        width = 4;
      } else if (0xe0 == (0xf0 & s[pos])) {
        // 3-byte utf8 code point (began with 0b1110xxxx)
        width = 3;
      } else if (0xc0 == (0xe0 & s[pos])) {
        // 2-byte utf8 code point (began with 0b110xxxxx)
        width = 2;
      } else {
        // 1-byte ascii (began with 0b0xxxxxxx), or a stray byte
        width = 1;
      }

      // the codepoint does not fit in what is left of the n-byte budget, so
      // it is not counted
      if (width > (n - pos)) {
        break;
      }

      // consume the rest of the sequence, but never step over a terminator:
      // a truncated sequence must not carry us past the end of the string
      while ((taken < width) && ('\0' != s[pos + taken])) {
        taken++;
      }

      pos += taken;

      // however many bytes were consumed, it was only 1 utf8 codepoint
      length++;
    }

    return length;
  }
  418. int utf8ncasecmp(const void *src1, const void *src2, size_t n) {
  419. utf8_int32_t src1_lwr_cp, src2_lwr_cp, src1_upr_cp, src2_upr_cp, src1_orig_cp,
  420. src2_orig_cp;
  421. do {
  422. const unsigned char *const s1 = (const unsigned char *)src1;
  423. const unsigned char *const s2 = (const unsigned char *)src2;
  424. // first check that we have enough bytes left in n to contain an entire
  425. // codepoint
  426. if (0 == n) {
  427. return 0;
  428. }
  429. if ((1 == n) && ((0xc0 == (0xe0 & *s1)) || (0xc0 == (0xe0 & *s2)))) {
  430. const utf8_int32_t c1 = (0xe0 & *s1);
  431. const utf8_int32_t c2 = (0xe0 & *s2);
  432. if (c1 < c2) {
  433. return c1 - c2;
  434. } else {
  435. return 0;
  436. }
  437. }
  438. if ((2 >= n) && ((0xe0 == (0xf0 & *s1)) || (0xe0 == (0xf0 & *s2)))) {
  439. const utf8_int32_t c1 = (0xf0 & *s1);
  440. const utf8_int32_t c2 = (0xf0 & *s2);
  441. if (c1 < c2) {
  442. return c1 - c2;
  443. } else {
  444. return 0;
  445. }
  446. }
  447. if ((3 >= n) && ((0xf0 == (0xf8 & *s1)) || (0xf0 == (0xf8 & *s2)))) {
  448. const utf8_int32_t c1 = (0xf8 & *s1);
  449. const utf8_int32_t c2 = (0xf8 & *s2);
  450. if (c1 < c2) {
  451. return c1 - c2;
  452. } else {
  453. return 0;
  454. }
  455. }
  456. src1 = utf8codepoint(src1, &src1_orig_cp);
  457. src2 = utf8codepoint(src2, &src2_orig_cp);
  458. n -= utf8codepointsize(src1_orig_cp);
  459. src1_lwr_cp = utf8lwrcodepoint(src1_orig_cp);
  460. src2_lwr_cp = utf8lwrcodepoint(src2_orig_cp);
  461. src1_upr_cp = utf8uprcodepoint(src1_orig_cp);
  462. src2_upr_cp = utf8uprcodepoint(src2_orig_cp);
  463. // check if the lowered codepoints match
  464. if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
  465. return 0;
  466. } else if ((src1_lwr_cp == src2_lwr_cp) || (src1_upr_cp == src2_upr_cp)) {
  467. continue;
  468. }
  469. // if they don't match, then we return the difference between the characters
  470. return src1_lwr_cp - src2_lwr_cp;
  471. } while (0 < n);
  472. // both utf8 strings matched
  473. return 0;
  474. }
  475. void *utf8ncat(void *utf8_restrict dst, const void *utf8_restrict src,
  476. size_t n) {
  477. char *d = (char *)dst;
  478. const char *s = (const char *)src;
  479. // find the null terminating byte in dst
  480. while ('\0' != *d) {
  481. d++;
  482. }
  483. // overwriting the null terminating byte in dst, append src byte-by-byte
  484. // stopping if we run out of space
  485. do {
  486. *d++ = *s++;
  487. } while (('\0' != *s) && (0 != --n));
  488. // write out a new null terminating byte into dst
  489. *d = '\0';
  490. return dst;
  491. }
  // Byte-wise comparison of at most the first n bytes of two null-terminated
  // utf8 strings, with bytes compared as unsigned values.
  //
  // Returns -1 if src1 sorts before src2, 1 if it sorts after, and 0 if no
  // difference was found within n bytes or before both strings ended.
  int utf8ncmp(const void *src1, const void *src2, size_t n) {
    const unsigned char *lhs = (const unsigned char *)src1;
    const unsigned char *rhs = (const unsigned char *)src2;
    size_t i;

    for (i = 0; i < n; i++) {
      // first differing byte decides the ordering (a terminator sorts lowest)
      if (lhs[i] != rhs[i]) {
        return (lhs[i] < rhs[i]) ? -1 : 1;
      }

      // equal bytes that are the terminator: both strings ended together
      if ('\0' == lhs[i]) {
        break;
      }
    }

    return 0;
  }
  507. void *utf8ncpy(void *utf8_restrict dst, const void *utf8_restrict src,
  508. size_t n) {
  509. char *d = (char *)dst;
  510. const char *s = (const char *)src;
  511. size_t index, check_index;
  512. // overwriting anything previously in dst, write byte-by-byte
  513. // from src
  514. for (index = 0; index < n; index++) {
  515. d[index] = s[index];
  516. if ('\0' == s[index]) {
  517. break;
  518. }
  519. }
  520. for ( check_index = index - 1; check_index > 0 && 0x80 == (0xc0 & d[check_index]); check_index--) {
  521. // just moving the index
  522. }
  523. if (check_index < index && (index - check_index) < utf8codepointsize(d[check_index])) {
  524. index = check_index;
  525. }
  526. // append null terminating byte
  527. for (; index < n; index++) {
  528. d[index] = 0;
  529. }
  530. return dst;
  531. }
  532. void *utf8ndup(const void *src, size_t n) {
  533. return utf8ndup_ex(src, n, utf8_null, utf8_null);
  534. }
  535. void *utf8ndup_ex(const void *src, size_t n,
  536. void *(*alloc_func_ptr)(void *, size_t), void *user_data) {
  537. const char *s = (const char *)src;
  538. char *c = utf8_null;
  539. size_t bytes = 0;
  540. // Find the end of the string or stop when n is reached
  541. while ('\0' != s[bytes] && bytes < n) {
  542. bytes++;
  543. }
  544. // In case bytes is actually less than n, we need to set it
  545. // to be used later in the copy byte by byte.
  546. n = bytes;
  547. if (alloc_func_ptr) {
  548. c = (char *)alloc_func_ptr(user_data, bytes + 1);
  549. } else {
  550. c = (char *)malloc(bytes + 1);
  551. }
  552. if (utf8_null == c) {
  553. // out of memory so we bail
  554. return utf8_null;
  555. }
  556. bytes = 0;
  557. // copy src byte-by-byte into our new utf8 string
  558. while ('\0' != s[bytes] && bytes < n) {
  559. c[bytes] = s[bytes];
  560. bytes++;
  561. }
  562. // append null terminating byte
  563. c[bytes] = '\0';
  564. return c;
  565. }
  566. void *utf8rchr(const void *src, int chr) {
  567. const char *s = (const char *)src;
  568. const char *match = utf8_null;
  569. char c[5] = {'\0', '\0', '\0', '\0', '\0'};
  570. if (0 == chr) {
  571. // being asked to return position of null terminating byte, so
  572. // just run s to the end, and return!
  573. while ('\0' != *s) {
  574. s++;
  575. }
  576. return (void *)s;
  577. } else if (0 == ((int)0xffffff80 & chr)) {
  578. // 1-byte/7-bit ascii
  579. // (0b0xxxxxxx)
  580. c[0] = (char)chr;
  581. } else if (0 == ((int)0xfffff800 & chr)) {
  582. // 2-byte/11-bit utf8 code point
  583. // (0b110xxxxx 0b10xxxxxx)
  584. c[0] = 0xc0 | (char)(chr >> 6);
  585. c[1] = 0x80 | (char)(chr & 0x3f);
  586. } else if (0 == ((int)0xffff0000 & chr)) {
  587. // 3-byte/16-bit utf8 code point
  588. // (0b1110xxxx 0b10xxxxxx 0b10xxxxxx)
  589. c[0] = 0xe0 | (char)(chr >> 12);
  590. c[1] = 0x80 | (char)((chr >> 6) & 0x3f);
  591. c[2] = 0x80 | (char)(chr & 0x3f);
  592. } else { // if (0 == ((int)0xffe00000 & chr)) {
  593. // 4-byte/21-bit utf8 code point
  594. // (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx)
  595. c[0] = 0xf0 | (char)(chr >> 18);
  596. c[1] = 0x80 | (char)((chr >> 12) & 0x3f);
  597. c[2] = 0x80 | (char)((chr >> 6) & 0x3f);
  598. c[3] = 0x80 | (char)(chr & 0x3f);
  599. }
  600. // we've created a 2 utf8 codepoint string in c that is
  601. // the utf8 character asked for by chr, and a null
  602. // terminating byte
  603. while ('\0' != *s) {
  604. size_t offset = 0;
  605. while (s[offset] == c[offset]) {
  606. offset++;
  607. }
  608. if ('\0' == c[offset]) {
  609. // we found a matching utf8 code point
  610. match = s;
  611. s += offset;
  612. } else {
  613. s += offset;
  614. // need to march s along to next utf8 codepoint start
  615. // (the next byte that doesn't match 0b10xxxxxx)
  616. if ('\0' != *s) {
  617. do {
  618. s++;
  619. } while (0x80 == (0xc0 & *s));
  620. }
  621. }
  622. }
  623. // return the last match we found (or 0 if no match was found)
  624. return (void *)match;
  625. }
  626. void *utf8pbrk(const void *str, const void *accept) {
  627. const char *s = (const char *)str;
  628. while ('\0' != *s) {
  629. const char *a = (const char *)accept;
  630. size_t offset = 0;
  631. while ('\0' != *a) {
  632. // checking that if *a is the start of a utf8 codepoint
  633. // (it is not 0b10xxxxxx) and we have successfully matched
  634. // a previous character (0 < offset) - we found a match
  635. if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
  636. return (void *)s;
  637. } else {
  638. if (*a == s[offset]) {
  639. // part of a utf8 codepoint matched, so move our checking
  640. // onwards to the next byte
  641. offset++;
  642. a++;
  643. } else {
  644. // r could be in the middle of an unmatching utf8 code point,
  645. // so we need to march it on to the next character beginning,
  646. do {
  647. a++;
  648. } while (0x80 == (0xc0 & *a));
  649. // reset offset too as we found a mismatch
  650. offset = 0;
  651. }
  652. }
  653. }
  654. // we found a match on the last utf8 codepoint
  655. if (0 < offset) {
  656. return (void *)s;
  657. }
  658. // the current utf8 codepoint in src did not match accept, but src
  659. // could have been partway through a utf8 codepoint, so we need to
  660. // march it onto the next utf8 codepoint starting byte
  661. do {
  662. s++;
  663. } while ((0x80 == (0xc0 & *s)));
  664. }
  665. return utf8_null;
  666. }
  // Number of bytes in str including the null terminating byte.
  size_t utf8size(const void *str) {
    // the lazy variant leaves the terminator out, so add it back here
    const size_t without_terminator = utf8size_lazy(str);
    return without_terminator + 1;
  }
  // Number of bytes in str excluding the null terminating byte.
  size_t utf8size_lazy(const void *str) {
    // SIZE_MAX as the limit means only the terminator stops the count
    const size_t no_limit = SIZE_MAX;
    return utf8nsize_lazy(str, no_limit);
  }
  // Number of bytes in str before the null terminating byte, looking at no
  // more than n bytes. Returns n when no terminator is found within them.
  size_t utf8nsize_lazy(const void *str, size_t n) {
    const char *bytes = (const char *)str;
    size_t count;

    for (count = 0; count < n; count++) {
      if ('\0' == bytes[count]) {
        break;
      }
    }

    return count;
  }
  // Count how many utf8 codepoints at the start of src consist solely of
  // codepoints that occur in accept. Counting stops at the first codepoint
  // of src that is not found in accept, or at the end of src.
  size_t utf8spn(const void *src, const void *accept) {
    const char *cursor = (const char *)src;
    size_t span = 0;

    while ('\0' != *cursor) {
      const char *candidate = (const char *)accept;
      // number of bytes of the codepoint at cursor matched so far
      size_t matched = 0;
      // set when a full match was detected before the end of accept
      int found = 0;

      while ('\0' != *candidate) {
        // candidate sits on the first byte of a codepoint (not 0b10xxxxxx)
        // while bytes are already matched: the previous codepoint of accept
        // matched completely, no need to look at the rest of accept
        if ((0 != matched) && (0x80 != (0xc0 & *candidate))) {
          found = 1;
          break;
        }

        if (cursor[matched] == *candidate) {
          matched++;
          candidate++;
        } else {
          // candidate could be in the middle of a non-matching codepoint,
          // so move it on to the beginning of the next one and start over
          candidate++;
          while (0x80 == (0xc0 & *candidate)) {
            candidate++;
          }
          matched = 0;
        }
      }

      // accept was exhausted without any pending match: the codepoint at
      // cursor is not in accept, so the span ends here
      if (!found && (0 == matched)) {
        return span;
      }

      // either an early match, or a match on the very last codepoint of
      // accept (the loop ended on its terminator with bytes still matched)
      span++;
      cursor += matched;
    }

    return span;
  }
  728. void *utf8str(const void *haystack, const void *needle) {
  729. const char *h = (const char *)haystack;
  730. utf8_int32_t throwaway_codepoint;
  731. // if needle has no utf8 codepoints before the null terminating
  732. // byte then return haystack
  733. if ('\0' == *((const char *)needle)) {
  734. return (void *)haystack;
  735. }
  736. while ('\0' != *h) {
  737. const char *maybeMatch = h;
  738. const char *n = (const char *)needle;
  739. while (*h == *n && (*h != '\0' && *n != '\0')) {
  740. n++;
  741. h++;
  742. }
  743. if ('\0' == *n) {
  744. // we found the whole utf8 string for needle in haystack at
  745. // maybeMatch, so return it
  746. return (void *)maybeMatch;
  747. } else {
  748. // h could be in the middle of an unmatching utf8 codepoint,
  749. // so we need to march it on to the next character beginning
  750. // starting from the current character
  751. h = (const char *)utf8codepoint(maybeMatch, &throwaway_codepoint);
  752. }
  753. }
  754. // no match
  755. return utf8_null;
  756. }
  757. void *utf8casestr(const void *haystack, const void *needle) {
  758. const void *h = haystack;
  759. // if needle has no utf8 codepoints before the null terminating
  760. // byte then return haystack
  761. if ('\0' == *((const char *)needle)) {
  762. return (void *)haystack;
  763. }
  764. for (;;) {
  765. const void *maybeMatch = h;
  766. const void *n = needle;
  767. utf8_int32_t h_cp, n_cp;
  768. // Get the next code point and track it
  769. const void *nextH = h = utf8codepoint(h, &h_cp);
  770. n = utf8codepoint(n, &n_cp);
  771. while ((0 != h_cp) && (0 != n_cp)) {
  772. h_cp = utf8lwrcodepoint(h_cp);
  773. n_cp = utf8lwrcodepoint(n_cp);
  774. // if we find a mismatch, bail out!
  775. if (h_cp != n_cp) {
  776. break;
  777. }
  778. h = utf8codepoint(h, &h_cp);
  779. n = utf8codepoint(n, &n_cp);
  780. }
  781. if (0 == n_cp) {
  782. // we found the whole utf8 string for needle in haystack at
  783. // maybeMatch, so return it
  784. return (void *)maybeMatch;
  785. }
  786. if (0 == h_cp) {
  787. // no match
  788. return utf8_null;
  789. }
  790. // Roll back to the next code point in the haystack to test
  791. h = nextH;
  792. }
  793. }
  // Check a null terminated string for invalid utf8; forwards to utf8nvalid
  // and returns its result unchanged.
  void *utf8valid(const void *str) {
    // SIZE_MAX as the limit means only the terminator ends the check
    const size_t no_limit = SIZE_MAX;
    return utf8nvalid(str, no_limit);
  }
  797. void *utf8nvalid(const void *str, size_t n) {
  798. const char *s = (const char *)str;
  799. const char *t = s;
  800. size_t consumed, remained;
  801. while ((void) (consumed = (size_t) (s-t)), consumed < n && '\0' != *s) {
  802. remained = n - consumed;
  803. if (0xf0 == (0xf8 & *s)) {
  804. // ensure that there's 4 bytes or more remained
  805. if (remained < 4) {
  806. return (void *)s;
  807. }
  808. // ensure each of the 3 following bytes in this 4-byte
  809. // utf8 codepoint began with 0b10xxxxxx
  810. if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])) ||
  811. (0x80 != (0xc0 & s[3]))) {
  812. return (void *)s;
  813. }
  814. // ensure that our utf8 codepoint ended after 4 bytes
  815. if (0x80 == (0xc0 & s[4])) {
  816. return (void *)s;
  817. }
  818. // ensure that the top 5 bits of this 4-byte utf8
  819. // codepoint were not 0, as then we could have used
  820. // one of the smaller encodings
  821. if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1]))) {
  822. return (void *)s;
  823. }
  824. // 4-byte utf8 code point (began with 0b11110xxx)
  825. s += 4;
  826. } else if (0xe0 == (0xf0 & *s)) {
  827. // ensure that there's 3 bytes or more remained
  828. if (remained < 3) {
  829. return (void *)s;
  830. }
  831. // ensure each of the 2 following bytes in this 3-byte
  832. // utf8 codepoint began with 0b10xxxxxx
  833. if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))) {
  834. return (void *)s;
  835. }
  836. // ensure that our utf8 codepoint ended after 3 bytes
  837. if (0x80 == (0xc0 & s[3])) {
  838. return (void *)s;
  839. }
  840. // ensure that the top 5 bits of this 3-byte utf8
  841. // codepoint were not 0, as then we could have used
  842. // one of the smaller encodings
  843. if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1]))) {
  844. return (void *)s;
  845. }
  846. // 3-byte utf8 code point (began with 0b1110xxxx)
  847. s += 3;
  848. } else if (0xc0 == (0xe0 & *s)) {
  849. // ensure that there's 2 bytes or more remained
  850. if (remained < 2) {
  851. return (void *)s;
  852. }
  853. // ensure the 1 following byte in this 2-byte
  854. // utf8 codepoint began with 0b10xxxxxx
  855. if (0x80 != (0xc0 & s[1])) {
  856. return (void *)s;
  857. }
  858. // ensure that our utf8 codepoint ended after 2 bytes
  859. if (0x80 == (0xc0 & s[2])) {
  860. return (void *)s;
  861. }
  862. // ensure that the top 4 bits of this 2-byte utf8
  863. // codepoint were not 0, as then we could have used
  864. // one of the smaller encodings
  865. if (0 == (0x1e & s[0])) {
  866. return (void *)s;
  867. }
  868. // 2-byte utf8 code point (began with 0b110xxxxx)
  869. s += 2;
  870. } else if (0x00 == (0x80 & *s)) {
  871. // 1-byte ascii (began with 0b0xxxxxxx)
  872. s += 1;
  873. } else {
  874. // we have an invalid 0b1xxxxxxx utf8 code point entry
  875. return (void *)s;
  876. }
  877. }
  878. return utf8_null;
  879. }
  880. int utf8makevalid(void *str, const utf8_int32_t replacement) {
  881. char *read = (char *)str;
  882. char *write = read;
  883. const char r = (char)replacement;
  884. utf8_int32_t codepoint;
  885. if (replacement > 0x7f) {
  886. return -1;
  887. }
  888. while ('\0' != *read) {
  889. if (0xf0 == (0xf8 & *read)) {
  890. // ensure each of the 3 following bytes in this 4-byte
  891. // utf8 codepoint began with 0b10xxxxxx
  892. if ((0x80 != (0xc0 & read[1])) || (0x80 != (0xc0 & read[2])) ||
  893. (0x80 != (0xc0 & read[3]))) {
  894. *write++ = r;
  895. read++;
  896. continue;
  897. }
  898. // 4-byte utf8 code point (began with 0b11110xxx)
  899. read = (char *)utf8codepoint(read, &codepoint);
  900. write = (char *)utf8catcodepoint(write, codepoint, 4);
  901. } else if (0xe0 == (0xf0 & *read)) {
  902. // ensure each of the 2 following bytes in this 3-byte
  903. // utf8 codepoint began with 0b10xxxxxx
  904. if ((0x80 != (0xc0 & read[1])) || (0x80 != (0xc0 & read[2]))) {
  905. *write++ = r;
  906. read++;
  907. continue;
  908. }
  909. // 3-byte utf8 code point (began with 0b1110xxxx)
  910. read = (char *)utf8codepoint(read, &codepoint);
  911. write = (char *)utf8catcodepoint(write, codepoint, 3);
  912. } else if (0xc0 == (0xe0 & *read)) {
  913. // ensure the 1 following byte in this 2-byte
  914. // utf8 codepoint began with 0b10xxxxxx
  915. if (0x80 != (0xc0 & read[1])) {
  916. *write++ = r;
  917. read++;
  918. continue;
  919. }
  920. // 2-byte utf8 code point (began with 0b110xxxxx)
  921. read = (char *)utf8codepoint(read, &codepoint);
  922. write = (char *)utf8catcodepoint(write, codepoint, 2);
  923. } else if (0x00 == (0x80 & *read)) {
  924. // 1-byte ascii (began with 0b0xxxxxxx)
  925. read = (char *)utf8codepoint(read, &codepoint);
  926. write = (char *)utf8catcodepoint(write, codepoint, 1);
  927. } else {
  928. // if we got here then we've got a dangling continuation (0b10xxxxxx)
  929. *write++ = r;
  930. read++;
  931. continue;
  932. }
  933. }
  934. *write = '\0';
  935. return 0;
  936. }
  937. void *utf8codepoint(const void *utf8_restrict str,
  938. utf8_int32_t *utf8_restrict out_codepoint) {
  939. const char *s = (const char *)str;
  940. if (0xf0 == (0xf8 & s[0])) {
  941. // 4 byte utf8 codepoint
  942. *out_codepoint = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) |
  943. ((0x3f & s[2]) << 6) | (0x3f & s[3]);
  944. s += 4;
  945. } else if (0xe0 == (0xf0 & s[0])) {
  946. // 3 byte utf8 codepoint
  947. *out_codepoint =
  948. ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]);
  949. s += 3;
  950. } else if (0xc0 == (0xe0 & s[0])) {
  951. // 2 byte utf8 codepoint
  952. *out_codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]);
  953. s += 2;
  954. } else {
  955. // 1 byte utf8 codepoint otherwise
  956. *out_codepoint = s[0];
  957. s += 1;
  958. }
  959. return (void *)s;
  960. }
  // Number of bytes the utf8 codepoint starting at str occupies, judged
  // from its first byte only. Any byte that is not a 2-, 3- or 4-byte lead
  // is reported as a 1 byte codepoint.
  size_t utf8codepointcalcsize(const void *str) {
    const char lead = *((const char *)str);
    // 1 byte utf8 codepoint unless a multi-byte lead pattern is recognised
    size_t width = 1;

    // the three lead patterns are mutually exclusive
    if (0xc0 == (0xe0 & lead)) {
      // 0b110xxxxx
      width = 2;
    } else if (0xe0 == (0xf0 & lead)) {
      // 0b1110xxxx
      width = 3;
    } else if (0xf0 == (0xf8 & lead)) {
      // 0b11110xxx
      width = 4;
    }

    return width;
  }
  976. size_t utf8codepointsize(utf8_int32_t chr) {
  977. if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
  978. return 1;
  979. } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
  980. return 2;
  981. } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
  982. return 3;
  983. } else { // if (0 == ((int)0xffe00000 & chr)) {
  984. return 4;
  985. }
  986. }
  987. void *utf8catcodepoint(void *str, utf8_int32_t chr, size_t n) {
  988. char *s = (char *)str;
  989. if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
  990. // 1-byte/7-bit ascii
  991. // (0b0xxxxxxx)
  992. if (n < 1) {
  993. return utf8_null;
  994. }
  995. s[0] = (char)chr;
  996. s += 1;
  997. } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
  998. // 2-byte/11-bit utf8 code point
  999. // (0b110xxxxx 0b10xxxxxx)
  1000. if (n < 2) {
  1001. return utf8_null;
  1002. }
  1003. s[0] = 0xc0 | (char)((chr >> 6) & 0x1f);
  1004. s[1] = 0x80 | (char)(chr & 0x3f);
  1005. s += 2;
  1006. } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
  1007. // 3-byte/16-bit utf8 code point
  1008. // (0b1110xxxx 0b10xxxxxx 0b10xxxxxx)
  1009. if (n < 3) {
  1010. return utf8_null;
  1011. }
  1012. s[0] = 0xe0 | (char)((chr >> 12) & 0x0f);
  1013. s[1] = 0x80 | (char)((chr >> 6) & 0x3f);
  1014. s[2] = 0x80 | (char)(chr & 0x3f);
  1015. s += 3;
  1016. } else { // if (0 == ((int)0xffe00000 & chr)) {
  1017. // 4-byte/21-bit utf8 code point
  1018. // (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx)
  1019. if (n < 4) {
  1020. return utf8_null;
  1021. }
  1022. s[0] = 0xf0 | (char)((chr >> 18) & 0x07);
  1023. s[1] = 0x80 | (char)((chr >> 12) & 0x3f);
  1024. s[2] = 0x80 | (char)((chr >> 6) & 0x3f);
  1025. s[3] = 0x80 | (char)(chr & 0x3f);
  1026. s += 4;
  1027. }
  1028. return s;
  1029. }
  1030. int utf8islower(utf8_int32_t chr) { return chr != utf8uprcodepoint(chr); }
  1031. int utf8isupper(utf8_int32_t chr) { return chr != utf8lwrcodepoint(chr); }
  1032. void utf8lwr(void *utf8_restrict str) {
  1033. void *p, *pn;
  1034. utf8_int32_t cp;
  1035. p = (char *)str;
  1036. pn = utf8codepoint(p, &cp);
  1037. while (cp != 0) {
  1038. const utf8_int32_t lwr_cp = utf8lwrcodepoint(cp);
  1039. const size_t size = utf8codepointsize(lwr_cp);
  1040. if (lwr_cp != cp) {
  1041. utf8catcodepoint(p, lwr_cp, size);
  1042. }
  1043. p = pn;
  1044. pn = utf8codepoint(p, &cp);
  1045. }
  1046. }
  1047. void utf8upr(void *utf8_restrict str) {
  1048. void *p, *pn;
  1049. utf8_int32_t cp;
  1050. p = (char *)str;
  1051. pn = utf8codepoint(p, &cp);
  1052. while (cp != 0) {
  1053. const utf8_int32_t lwr_cp = utf8uprcodepoint(cp);
  1054. const size_t size = utf8codepointsize(lwr_cp);
  1055. if (lwr_cp != cp) {
  1056. utf8catcodepoint(p, lwr_cp, size);
  1057. }
  1058. p = pn;
  1059. pn = utf8codepoint(p, &cp);
  1060. }
  1061. }
  1062. utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp) {
  1063. if (((0x0041 <= cp) && (0x005a >= cp)) ||
  1064. ((0x00c0 <= cp) && (0x00d6 >= cp)) ||
  1065. ((0x00d8 <= cp) && (0x00de >= cp)) ||
  1066. ((0x0391 <= cp) && (0x03a1 >= cp)) ||
  1067. ((0x03a3 <= cp) && (0x03ab >= cp)) ||
  1068. ((0x0410 <= cp) && (0x042f >= cp))) {
  1069. cp += 32;
  1070. } else if ((0x0400 <= cp) && (0x040f >= cp)) {
  1071. cp += 80;
  1072. } else if (((0x0100 <= cp) && (0x012f >= cp)) ||
  1073. ((0x0132 <= cp) && (0x0137 >= cp)) ||
  1074. ((0x014a <= cp) && (0x0177 >= cp)) ||
  1075. ((0x0182 <= cp) && (0x0185 >= cp)) ||
  1076. ((0x01a0 <= cp) && (0x01a5 >= cp)) ||
  1077. ((0x01de <= cp) && (0x01ef >= cp)) ||
  1078. ((0x01f8 <= cp) && (0x021f >= cp)) ||
  1079. ((0x0222 <= cp) && (0x0233 >= cp)) ||
  1080. ((0x0246 <= cp) && (0x024f >= cp)) ||
  1081. ((0x03d8 <= cp) && (0x03ef >= cp)) ||
  1082. ((0x0460 <= cp) && (0x0481 >= cp)) ||
  1083. ((0x048a <= cp) && (0x04ff >= cp))) {
  1084. cp |= 0x1;
  1085. } else if (((0x0139 <= cp) && (0x0148 >= cp)) ||
  1086. ((0x0179 <= cp) && (0x017e >= cp)) ||
  1087. ((0x01af <= cp) && (0x01b0 >= cp)) ||
  1088. ((0x01b3 <= cp) && (0x01b6 >= cp)) ||
  1089. ((0x01cd <= cp) && (0x01dc >= cp))) {
  1090. cp += 1;
  1091. cp &= ~0x1;
  1092. } else {
  1093. switch (cp) {
  1094. default:
  1095. break;
  1096. case 0x0178:
  1097. cp = 0x00ff;
  1098. break;
  1099. case 0x0243:
  1100. cp = 0x0180;
  1101. break;
  1102. case 0x018e:
  1103. cp = 0x01dd;
  1104. break;
  1105. case 0x023d:
  1106. cp = 0x019a;
  1107. break;
  1108. case 0x0220:
  1109. cp = 0x019e;
  1110. break;
  1111. case 0x01b7:
  1112. cp = 0x0292;
  1113. break;
  1114. case 0x01c4:
  1115. cp = 0x01c6;
  1116. break;
  1117. case 0x01c7:
  1118. cp = 0x01c9;
  1119. break;
  1120. case 0x01ca:
  1121. cp = 0x01cc;
  1122. break;
  1123. case 0x01f1:
  1124. cp = 0x01f3;
  1125. break;
  1126. case 0x01f7:
  1127. cp = 0x01bf;
  1128. break;
  1129. case 0x0187:
  1130. cp = 0x0188;
  1131. break;
  1132. case 0x018b:
  1133. cp = 0x018c;
  1134. break;
  1135. case 0x0191:
  1136. cp = 0x0192;
  1137. break;
  1138. case 0x0198:
  1139. cp = 0x0199;
  1140. break;
  1141. case 0x01a7:
  1142. cp = 0x01a8;
  1143. break;
  1144. case 0x01ac:
  1145. cp = 0x01ad;
  1146. break;
  1147. case 0x01af:
  1148. cp = 0x01b0;
  1149. break;
  1150. case 0x01b8:
  1151. cp = 0x01b9;
  1152. break;
  1153. case 0x01bc:
  1154. cp = 0x01bd;
  1155. break;
  1156. case 0x01f4:
  1157. cp = 0x01f5;
  1158. break;
  1159. case 0x023b:
  1160. cp = 0x023c;
  1161. break;
  1162. case 0x0241:
  1163. cp = 0x0242;
  1164. break;
  1165. case 0x03fd:
  1166. cp = 0x037b;
  1167. break;
  1168. case 0x03fe:
  1169. cp = 0x037c;
  1170. break;
  1171. case 0x03ff:
  1172. cp = 0x037d;
  1173. break;
  1174. case 0x037f:
  1175. cp = 0x03f3;
  1176. break;
  1177. case 0x0386:
  1178. cp = 0x03ac;
  1179. break;
  1180. case 0x0388:
  1181. cp = 0x03ad;
  1182. break;
  1183. case 0x0389:
  1184. cp = 0x03ae;
  1185. break;
  1186. case 0x038a:
  1187. cp = 0x03af;
  1188. break;
  1189. case 0x038c:
  1190. cp = 0x03cc;
  1191. break;
  1192. case 0x038e:
  1193. cp = 0x03cd;
  1194. break;
  1195. case 0x038f:
  1196. cp = 0x03ce;
  1197. break;
  1198. case 0x0370:
  1199. cp = 0x0371;
  1200. break;
  1201. case 0x0372:
  1202. cp = 0x0373;
  1203. break;
  1204. case 0x0376:
  1205. cp = 0x0377;
  1206. break;
  1207. case 0x03f4:
  1208. cp = 0x03b8;
  1209. break;
  1210. case 0x03cf:
  1211. cp = 0x03d7;
  1212. break;
  1213. case 0x03f9:
  1214. cp = 0x03f2;
  1215. break;
  1216. case 0x03f7:
  1217. cp = 0x03f8;
  1218. break;
  1219. case 0x03fa:
  1220. cp = 0x03fb;
  1221. break;
  1222. }
  1223. }
  1224. return cp;
  1225. }
  1226. utf8_int32_t utf8uprcodepoint(utf8_int32_t cp) {
  1227. if (((0x0061 <= cp) && (0x007a >= cp)) ||
  1228. ((0x00e0 <= cp) && (0x00f6 >= cp)) ||
  1229. ((0x00f8 <= cp) && (0x00fe >= cp)) ||
  1230. ((0x03b1 <= cp) && (0x03c1 >= cp)) ||
  1231. ((0x03c3 <= cp) && (0x03cb >= cp)) ||
  1232. ((0x0430 <= cp) && (0x044f >= cp))) {
  1233. cp -= 32;
  1234. } else if ((0x0450 <= cp) && (0x045f >= cp)) {
  1235. cp -= 80;
  1236. } else if (((0x0100 <= cp) && (0x012f >= cp)) ||
  1237. ((0x0132 <= cp) && (0x0137 >= cp)) ||
  1238. ((0x014a <= cp) && (0x0177 >= cp)) ||
  1239. ((0x0182 <= cp) && (0x0185 >= cp)) ||
  1240. ((0x01a0 <= cp) && (0x01a5 >= cp)) ||
  1241. ((0x01de <= cp) && (0x01ef >= cp)) ||
  1242. ((0x01f8 <= cp) && (0x021f >= cp)) ||
  1243. ((0x0222 <= cp) && (0x0233 >= cp)) ||
  1244. ((0x0246 <= cp) && (0x024f >= cp)) ||
  1245. ((0x03d8 <= cp) && (0x03ef >= cp)) ||
  1246. ((0x0460 <= cp) && (0x0481 >= cp)) ||
  1247. ((0x048a <= cp) && (0x04ff >= cp))) {
  1248. cp &= ~0x1;
  1249. } else if (((0x0139 <= cp) && (0x0148 >= cp)) ||
  1250. ((0x0179 <= cp) && (0x017e >= cp)) ||
  1251. ((0x01af <= cp) && (0x01b0 >= cp)) ||
  1252. ((0x01b3 <= cp) && (0x01b6 >= cp)) ||
  1253. ((0x01cd <= cp) && (0x01dc >= cp))) {
  1254. cp -= 1;
  1255. cp |= 0x1;
  1256. } else {
  1257. switch (cp) {
  1258. default:
  1259. break;
  1260. case 0x00ff:
  1261. cp = 0x0178;
  1262. break;
  1263. case 0x0180:
  1264. cp = 0x0243;
  1265. break;
  1266. case 0x01dd:
  1267. cp = 0x018e;
  1268. break;
  1269. case 0x019a:
  1270. cp = 0x023d;
  1271. break;
  1272. case 0x019e:
  1273. cp = 0x0220;
  1274. break;
  1275. case 0x0292:
  1276. cp = 0x01b7;
  1277. break;
  1278. case 0x01c6:
  1279. cp = 0x01c4;
  1280. break;
  1281. case 0x01c9:
  1282. cp = 0x01c7;
  1283. break;
  1284. case 0x01cc:
  1285. cp = 0x01ca;
  1286. break;
  1287. case 0x01f3:
  1288. cp = 0x01f1;
  1289. break;
  1290. case 0x01bf:
  1291. cp = 0x01f7;
  1292. break;
  1293. case 0x0188:
  1294. cp = 0x0187;
  1295. break;
  1296. case 0x018c:
  1297. cp = 0x018b;
  1298. break;
  1299. case 0x0192:
  1300. cp = 0x0191;
  1301. break;
  1302. case 0x0199:
  1303. cp = 0x0198;
  1304. break;
  1305. case 0x01a8:
  1306. cp = 0x01a7;
  1307. break;
  1308. case 0x01ad:
  1309. cp = 0x01ac;
  1310. break;
  1311. case 0x01b0:
  1312. cp = 0x01af;
  1313. break;
  1314. case 0x01b9:
  1315. cp = 0x01b8;
  1316. break;
  1317. case 0x01bd:
  1318. cp = 0x01bc;
  1319. break;
  1320. case 0x01f5:
  1321. cp = 0x01f4;
  1322. break;
  1323. case 0x023c:
  1324. cp = 0x023b;
  1325. break;
  1326. case 0x0242:
  1327. cp = 0x0241;
  1328. break;
  1329. case 0x037b:
  1330. cp = 0x03fd;
  1331. break;
  1332. case 0x037c:
  1333. cp = 0x03fe;
  1334. break;
  1335. case 0x037d:
  1336. cp = 0x03ff;
  1337. break;
  1338. case 0x03f3:
  1339. cp = 0x037f;
  1340. break;
  1341. case 0x03ac:
  1342. cp = 0x0386;
  1343. break;
  1344. case 0x03ad:
  1345. cp = 0x0388;
  1346. break;
  1347. case 0x03ae:
  1348. cp = 0x0389;
  1349. break;
  1350. case 0x03af:
  1351. cp = 0x038a;
  1352. break;
  1353. case 0x03cc:
  1354. cp = 0x038c;
  1355. break;
  1356. case 0x03cd:
  1357. cp = 0x038e;
  1358. break;
  1359. case 0x03ce:
  1360. cp = 0x038f;
  1361. break;
  1362. case 0x0371:
  1363. cp = 0x0370;
  1364. break;
  1365. case 0x0373:
  1366. cp = 0x0372;
  1367. break;
  1368. case 0x0377:
  1369. cp = 0x0376;
  1370. break;
  1371. case 0x03d1:
  1372. cp = 0x0398;
  1373. break;
  1374. case 0x03d7:
  1375. cp = 0x03cf;
  1376. break;
  1377. case 0x03f2:
  1378. cp = 0x03f9;
  1379. break;
  1380. case 0x03f8:
  1381. cp = 0x03f7;
  1382. break;
  1383. case 0x03fb:
  1384. cp = 0x03fa;
  1385. break;
  1386. }
  1387. }
  1388. return cp;
  1389. }
  1390. void *utf8rcodepoint(const void *utf8_restrict str,
  1391. utf8_int32_t *utf8_restrict out_codepoint) {
  1392. const char *s = (const char *)str;
  1393. if (0xf0 == (0xf8 & s[0])) {
  1394. // 4 byte utf8 codepoint
  1395. *out_codepoint = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) |
  1396. ((0x3f & s[2]) << 6) | (0x3f & s[3]);
  1397. } else if (0xe0 == (0xf0 & s[0])) {
  1398. // 3 byte utf8 codepoint
  1399. *out_codepoint =
  1400. ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]);
  1401. } else if (0xc0 == (0xe0 & s[0])) {
  1402. // 2 byte utf8 codepoint
  1403. *out_codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]);
  1404. } else {
  1405. // 1 byte utf8 codepoint otherwise
  1406. *out_codepoint = s[0];
  1407. }
  1408. do {
  1409. s--;
  1410. } while ((0 != (0x80 & s[0])) && (0x80 == (0xc0 & s[0])));
  1411. return (void *)s;
  1412. }
  1413. #undef utf8_restrict
  1414. #undef utf8_null
  1415. #ifdef __cplusplus
  1416. } // extern "C"
  1417. #endif
  1418. #if defined(__clang__)
  1419. #pragma clang diagnostic pop
  1420. #endif
  1421. #endif // SHEREDOM_UTF8_H_INCLUDED