├── .gitignore ├── Makefile ├── README.md ├── utf8.c └── utf8.h /.gitignore: -------------------------------------------------------------------------------- 1 | /*.o 2 | /*.do 3 | /*.a 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SRCS = utf8.c 2 | 3 | OBJS = $(SRCS:%.c=%.o) 4 | DOBJS = $(SRCS:%.c=%.do) 5 | 6 | ifneq ($(MAKECMDGOALS),debug) 7 | XOBJS = $(OBJS) 8 | else 9 | XOBJS = $(DOBJS) 10 | endif 11 | 12 | FLAGS = -Wall -Wno-strict-aliasing $(CFLAGS) 13 | 14 | DEBUGFLAGS = -ggdb3 -DDEBUG 15 | SHIPFLAGS = -O3 -DNDEBUG -falign-functions -momit-leaf-frame-pointer 16 | 17 | DEBUGFLAGS += $(FLAGS) 18 | SHIPFLAGS += $(FLAGS) 19 | 20 | default: release 21 | 22 | %.o: %.c 23 | $(CC) $(SHIPFLAGS) -c $< -o $@ 24 | %.do: %.c 25 | $(CC) $(DEBUGFLAGS) -c $< -o $@ 26 | 27 | release debug: libcutef8.a 28 | 29 | libcutef8.a: $(XOBJS) 30 | rm -rf $@ 31 | ar -rcs $@ $^ 32 | 33 | clean: 34 | rm -f *.o 35 | rm -f *.do 36 | rm -f *.a 37 | rm -f *~ *# 38 | rm -f core* 39 | rm -f libcutef8.a 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Simple, fast C library for manipulating strings in the UTF-8 encoding. 2 | 3 | NOTE: I now use and recommend [utf8proc](https://github.com/JuliaLang/utf8proc) instead of this library. 4 | -------------------------------------------------------------------------------- /utf8.c: -------------------------------------------------------------------------------- 1 | /* 2 | Basic UTF-8 manipulation routines 3 | by Jeff Bezanson 4 | placed in the public domain Fall 2005 5 | 6 | This code is designed to provide the utilities you need to manipulate 7 | UTF-8 as an internal string encoding. 
These functions do not perform the 8 | error checking normally needed when handling UTF-8 data, so if you happen 9 | to be from the Unicode Consortium you will want to flay me alive. 10 | I do this because error checking can be performed at the boundaries (I/O), 11 | with these routines reserved for higher performance on data known to be 12 | valid. 13 | A UTF-8 validation routine is included. 14 | */ 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #ifdef WIN32 24 | #include 25 | #define snprintf _snprintf 26 | #else 27 | #ifndef __FreeBSD__ 28 | #include 29 | #endif /* __FreeBSD__ */ 30 | #endif 31 | #include 32 | 33 | #include "utf8.h" 34 | 35 | static const uint32_t offsetsFromUTF8[6] = { 36 | 0x00000000UL, 0x00003080UL, 0x000E2080UL, 37 | 0x03C82080UL, 0xFA082080UL, 0x82082080UL 38 | }; 39 | 40 | static const char trailingBytesForUTF8[256] = { 41 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 42 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 43 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 44 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 45 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 46 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 47 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 48 | 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 49 | }; 50 | 51 | /* returns length of next utf-8 sequence */ 52 | size_t u8_seqlen(const char *s) 53 | { 54 | return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1; 55 | } 56 | 57 | /* returns the # of bytes needed to encode a certain character 58 | 0 means the character cannot (or should not) be encoded. 
*/ 59 | size_t u8_charlen(uint32_t ch) 60 | { 61 | if (ch < 0x80) 62 | return 1; 63 | else if (ch < 0x800) 64 | return 2; 65 | else if (ch < 0x10000) 66 | return 3; 67 | else if (ch < 0x110000) 68 | return 4; 69 | return 0; 70 | } 71 | 72 | size_t u8_codingsize(uint32_t *wcstr, size_t n) 73 | { 74 | size_t i, c=0; 75 | 76 | for(i=0; i < n; i++) 77 | c += u8_charlen(wcstr[i]); 78 | return c; 79 | } 80 | 81 | /* conversions without error checking 82 | only works for valid UTF-8, i.e. no 5- or 6-byte sequences 83 | srcsz = source size in bytes 84 | sz = dest size in # of wide characters 85 | 86 | returns # characters converted 87 | if sz == srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space. 88 | */ 89 | size_t u8_toucs(uint32_t *dest, size_t sz, const char *src, size_t srcsz) 90 | { 91 | uint32_t ch; 92 | const char *src_end = src + srcsz; 93 | size_t nb; 94 | size_t i=0; 95 | 96 | if (sz == 0 || srcsz == 0) 97 | return 0; 98 | 99 | while (i < sz) { 100 | if (!isutf(*src)) { // invalid sequence 101 | dest[i++] = 0xFFFD; 102 | src++; 103 | if (src >= src_end) break; 104 | continue; 105 | } 106 | nb = trailingBytesForUTF8[(unsigned char)*src]; 107 | if (src + nb >= src_end) 108 | break; 109 | ch = 0; 110 | switch (nb) { 111 | /* these fall through deliberately */ 112 | case 5: ch += (unsigned char)*src++; ch <<= 6; 113 | case 4: ch += (unsigned char)*src++; ch <<= 6; 114 | case 3: ch += (unsigned char)*src++; ch <<= 6; 115 | case 2: ch += (unsigned char)*src++; ch <<= 6; 116 | case 1: ch += (unsigned char)*src++; ch <<= 6; 117 | case 0: ch += (unsigned char)*src++; 118 | } 119 | ch -= offsetsFromUTF8[nb]; 120 | dest[i++] = ch; 121 | } 122 | return i; 123 | } 124 | 125 | /* srcsz = number of source characters 126 | sz = size of dest buffer in bytes 127 | 128 | returns # bytes stored in dest 129 | the destination string will never be bigger than the source string. 
130 | */ 131 | size_t u8_toutf8(char *dest, size_t sz, const uint32_t *src, size_t srcsz) 132 | { 133 | uint32_t ch; 134 | size_t i = 0; 135 | char *dest0 = dest; 136 | char *dest_end = dest + sz; 137 | 138 | while (i < srcsz) { 139 | ch = src[i]; 140 | if (ch < 0x80) { 141 | if (dest >= dest_end) 142 | break; 143 | *dest++ = (char)ch; 144 | } 145 | else if (ch < 0x800) { 146 | if (dest >= dest_end-1) 147 | break; 148 | *dest++ = (ch>>6) | 0xC0; 149 | *dest++ = (ch & 0x3F) | 0x80; 150 | } 151 | else if (ch < 0x10000) { 152 | if (dest >= dest_end-2) 153 | break; 154 | *dest++ = (ch>>12) | 0xE0; 155 | *dest++ = ((ch>>6) & 0x3F) | 0x80; 156 | *dest++ = (ch & 0x3F) | 0x80; 157 | } 158 | else if (ch < 0x110000) { 159 | if (dest >= dest_end-3) 160 | break; 161 | *dest++ = (ch>>18) | 0xF0; 162 | *dest++ = ((ch>>12) & 0x3F) | 0x80; 163 | *dest++ = ((ch>>6) & 0x3F) | 0x80; 164 | *dest++ = (ch & 0x3F) | 0x80; 165 | } 166 | i++; 167 | } 168 | return (dest-dest0); 169 | } 170 | 171 | size_t u8_wc_toutf8(char *dest, uint32_t ch) 172 | { 173 | if (ch < 0x80) { 174 | dest[0] = (char)ch; 175 | return 1; 176 | } 177 | if (ch < 0x800) { 178 | dest[0] = (ch>>6) | 0xC0; 179 | dest[1] = (ch & 0x3F) | 0x80; 180 | return 2; 181 | } 182 | if (ch < 0x10000) { 183 | dest[0] = (ch>>12) | 0xE0; 184 | dest[1] = ((ch>>6) & 0x3F) | 0x80; 185 | dest[2] = (ch & 0x3F) | 0x80; 186 | return 3; 187 | } 188 | if (ch < 0x110000) { 189 | dest[0] = (ch>>18) | 0xF0; 190 | dest[1] = ((ch>>12) & 0x3F) | 0x80; 191 | dest[2] = ((ch>>6) & 0x3F) | 0x80; 192 | dest[3] = (ch & 0x3F) | 0x80; 193 | return 4; 194 | } 195 | return 0; 196 | } 197 | 198 | /* charnum => byte offset */ 199 | size_t u8_offset(const char *s, size_t charnum) 200 | { 201 | size_t i=0; 202 | 203 | while (charnum > 0) { 204 | if (s[i++] & 0x80) { 205 | (void)(isutf(s[++i]) || isutf(s[++i]) || ++i); 206 | } 207 | charnum--; 208 | } 209 | return i; 210 | } 211 | 212 | /* byte offset => charnum */ 213 | size_t u8_charnum(const char *s, size_t 
offset) 214 | { 215 | size_t charnum = 0, i=0; 216 | 217 | while (i < offset) { 218 | if (s[i++] & 0x80) { 219 | (void)(isutf(s[++i]) || isutf(s[++i]) || ++i); 220 | } 221 | charnum++; 222 | } 223 | return charnum; 224 | } 225 | 226 | /* number of characters in NUL-terminated string */ 227 | size_t u8_strlen(const char *s) 228 | { 229 | size_t count = 0; 230 | size_t i = 0, lasti; 231 | 232 | while (1) { 233 | lasti = i; 234 | while (s[i] > 0) 235 | i++; 236 | count += (i-lasti); 237 | if (s[i++]==0) break; 238 | (void)(isutf(s[++i]) || isutf(s[++i]) || ++i); 239 | count++; 240 | } 241 | return count; 242 | } 243 | 244 | int wcwidth(wchar_t c); 245 | 246 | size_t u8_strwidth(const char *s) 247 | { 248 | uint32_t ch; 249 | size_t nb, tot=0; 250 | int w; 251 | signed char sc; 252 | 253 | while ((sc = (signed char)*s) != 0) { 254 | if (sc >= 0) { 255 | s++; 256 | if (sc) tot++; 257 | } 258 | else { 259 | if (!isutf(sc)) { tot++; s++; continue; } 260 | nb = trailingBytesForUTF8[(unsigned char)sc]; 261 | ch = 0; 262 | switch (nb) { 263 | /* these fall through deliberately */ 264 | case 5: ch += (unsigned char)*s++; ch <<= 6; 265 | case 4: ch += (unsigned char)*s++; ch <<= 6; 266 | case 3: ch += (unsigned char)*s++; ch <<= 6; 267 | case 2: ch += (unsigned char)*s++; ch <<= 6; 268 | case 1: ch += (unsigned char)*s++; ch <<= 6; 269 | case 0: ch += (unsigned char)*s++; 270 | } 271 | ch -= offsetsFromUTF8[nb]; 272 | w = wcwidth(ch); // might return -1 273 | if (w > 0) tot += w; 274 | } 275 | } 276 | return tot; 277 | } 278 | 279 | /* reads the next utf-8 sequence out of a string, updating an index */ 280 | uint32_t u8_nextchar(const char *s, size_t *i) 281 | { 282 | uint32_t ch = 0; 283 | size_t sz = 0; 284 | 285 | do { 286 | ch <<= 6; 287 | ch += (unsigned char)s[(*i)]; 288 | sz++; 289 | } while (s[*i] && (++(*i)) && !isutf(s[*i])); 290 | ch -= offsetsFromUTF8[sz-1]; 291 | 292 | return ch; 293 | } 294 | 295 | /* next character without NUL character terminator */ 296 | 
uint32_t u8_nextmemchar(const char *s, size_t *i) 297 | { 298 | uint32_t ch = 0; 299 | size_t sz = 0; 300 | 301 | do { 302 | ch <<= 6; 303 | ch += (unsigned char)s[(*i)++]; 304 | sz++; 305 | } while (!isutf(s[*i])); 306 | ch -= offsetsFromUTF8[sz-1]; 307 | 308 | return ch; 309 | } 310 | 311 | void u8_inc(const char *s, size_t *i) 312 | { 313 | (void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) || isutf(s[++(*i)]) || ++(*i)); 314 | } 315 | 316 | void u8_dec(const char *s, size_t *i) 317 | { 318 | (void)(isutf(s[--(*i)]) || isutf(s[--(*i)]) || isutf(s[--(*i)]) || --(*i)); 319 | } 320 | 321 | int octal_digit(char c) 322 | { 323 | return (c >= '0' && c <= '7'); 324 | } 325 | 326 | int hex_digit(char c) 327 | { 328 | return ((c >= '0' && c <= '9') || 329 | (c >= 'A' && c <= 'F') || 330 | (c >= 'a' && c <= 'f')); 331 | } 332 | 333 | char read_escape_control_char(char c) 334 | { 335 | if (c == 'n') 336 | return '\n'; 337 | else if (c == 't') 338 | return '\t'; 339 | else if (c == 'r') 340 | return '\r'; 341 | else if (c == 'e') 342 | return 033; // '\e' 343 | else if (c == 'b') 344 | return '\b'; 345 | else if (c == 'f') 346 | return '\f'; 347 | else if (c == 'v') 348 | return '\v'; 349 | else if (c == 'a') 350 | return '\a'; 351 | return c; 352 | } 353 | 354 | /* assumes that src points to the character after a backslash 355 | returns number of input characters processed, 0 if error */ 356 | size_t u8_read_escape_sequence(const char *str, size_t ssz, uint32_t *dest) 357 | { 358 | uint32_t ch; 359 | char digs[10]; 360 | int dno=0, ndig; 361 | size_t i=1; 362 | char c0 = str[0]; 363 | assert(ssz > 0); 364 | 365 | if (octal_digit(c0)) { 366 | i = 0; 367 | do { 368 | digs[dno++] = str[i++]; 369 | } while (i sz-c) 412 | break; 413 | memcpy(&buf[c], temp, amt); 414 | c += amt; 415 | } 416 | if (c < sz) 417 | buf[c] = '\0'; 418 | return c; 419 | } 420 | 421 | static int buf_put2c(char *buf, const char *src) 422 | { 423 | buf[0] = src[0]; 424 | buf[1] = src[1]; 425 | buf[2] = '\0'; 426 
| return 2; 427 | } 428 | 429 | int u8_escape_wchar(char *buf, size_t sz, uint32_t ch) 430 | { 431 | assert(sz > 2); 432 | if (ch == L'\n') 433 | return buf_put2c(buf, "\\n"); 434 | else if (ch == L'\t') 435 | return buf_put2c(buf, "\\t"); 436 | else if (ch == L'\r') 437 | return buf_put2c(buf, "\\r"); 438 | else if (ch == 033) // L'\e' 439 | return buf_put2c(buf, "\\e"); 440 | else if (ch == L'\b') 441 | return buf_put2c(buf, "\\b"); 442 | else if (ch == L'\f') 443 | return buf_put2c(buf, "\\f"); 444 | else if (ch == L'\v') 445 | return buf_put2c(buf, "\\v"); 446 | else if (ch == L'\a') 447 | return buf_put2c(buf, "\\a"); 448 | else if (ch == L'\\') 449 | return buf_put2c(buf, "\\\\"); 450 | else if (ch < 32 || ch == 0x7f) 451 | return snprintf(buf, sz, "\\x%.2hhx", (unsigned char)ch); 452 | else if (ch > 0xFFFF) 453 | return snprintf(buf, sz, "\\U%.8x", (uint32_t)ch); 454 | else if (ch >= 0x80) 455 | return snprintf(buf, sz, "\\u%.4hx", (unsigned short)ch); 456 | 457 | buf[0] = (char)ch; 458 | buf[1] = '\0'; 459 | return 1; 460 | } 461 | 462 | size_t u8_escape(char *buf, size_t sz, const char *src, size_t *pi, size_t end, 463 | int escape_quotes, int ascii) 464 | { 465 | size_t i = *pi, i0; 466 | uint32_t ch; 467 | char *start = buf; 468 | char *blim = start + sz-11; 469 | assert(sz > 11); 470 | 471 | while (i tempi) 563 | break; 564 | } 565 | return NULL; 566 | } 567 | 568 | int u8_is_locale_utf8(const char *locale) 569 | { 570 | /* this code based on libutf8 */ 571 | const char* cp = locale; 572 | 573 | if (locale == NULL) return 0; 574 | 575 | for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++) { 576 | if (*cp == '.') { 577 | const char* encoding = ++cp; 578 | for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++) 579 | ; 580 | if ((cp-encoding == 5 && !strncmp(encoding, "UTF-8", 5)) 581 | || (cp-encoding == 4 && !strncmp(encoding, "utf8", 4))) 582 | return 1; /* it's UTF-8 */ 583 | break; 584 | } 585 | } 586 | return 0; 587 | } 
/* formats fmt/ap with vsnprintf, converts the UTF-8 result to wide
   characters with u8_toucs and prints it with printf("%ls").

   returns the number of characters converted, or 0 if formatting or a
   memory allocation fails (nothing is printed in that case).

   output shorter than 512 bytes is handled in stack buffers; longer output
   uses the heap.
*/
size_t u8_vprintf(const char *fmt, va_list ap)
{
    char stackbuf[512];
    uint32_t stackwcs[512];
    char *buf = stackbuf;
    uint32_t *wcs = stackwcs;
    char *heapbuf = NULL;
    uint32_t *heapwcs = NULL;
    va_list ap2;
    size_t nc = 0;
    int cnt;

    /* vsnprintf consumes ap, so a second formatting pass needs its own
       copy taken before the first call */
    va_copy(ap2, ap);
    cnt = vsnprintf(stackbuf, sizeof(stackbuf), fmt, ap);
    if (cnt < 0)
        goto done;
    if ((size_t)cnt >= sizeof(stackbuf)) {
        /* truncated: format again into buffers of the exact size */
        size_t n = (size_t)cnt + 1;

        if (n > SIZE_MAX / sizeof(*heapwcs))
            goto done;
        heapbuf = (char*)malloc(n);
        heapwcs = (uint32_t*)malloc(n * sizeof(*heapwcs));
        if (heapbuf == NULL || heapwcs == NULL)
            goto done;
        if (vsnprintf(heapbuf, n, fmt, ap2) != cnt)
            goto done;
        buf = heapbuf;
        wcs = heapwcs;
    }
    /* at most cnt characters come out of cnt bytes, which leaves
       wcs[nc] inside the cnt+1 element buffer for the terminator */
    nc = u8_toucs(wcs, (size_t)cnt, buf, (size_t)cnt);
    wcs[nc] = 0;
    /* NOTE(review): the cast assumes wchar_t is a 32-bit type holding the
       same values as uint32_t - confirm for every supported platform */
    printf("%ls", (wchar_t*)wcs);
done:
    va_end(ap2);
    free(heapwcs);
    free(heapbuf);
    return nc;
}

/* printf-style front end for u8_vprintf; returns what u8_vprintf returns */
size_t u8_printf(const char *fmt, ...)
{
    size_t cnt;
    va_list args;

    va_start(args, fmt);
    cnt = u8_vprintf(fmt, args);
    va_end(args);

    return cnt;
}

/* based on the valid_utf8 routine from the PCRE library by Philip Hazel

   length is in bytes, since without knowing whether the string is valid
   it's hard to know how many characters there are!
*/ 630 | int u8_isvalid(const char *str, size_t length) 631 | { 632 | const unsigned char *p, *pend = (unsigned char*)str + length; 633 | unsigned char c; 634 | int ret = 1; /* ASCII */ 635 | size_t ab; 636 | 637 | for (p = (unsigned char*)str; p < pend; p++) { 638 | c = *p; 639 | if (c < 128) 640 | continue; 641 | ret = 2; /* non-ASCII UTF-8 */ 642 | if ((c & 0xc0) != 0xc0) 643 | return 0; 644 | ab = trailingBytesForUTF8[c]; 645 | if (length < ab) 646 | return 0; 647 | length -= ab; 648 | 649 | p++; 650 | /* Check top bits in the second byte */ 651 | if ((*p & 0xc0) != 0x80) 652 | return 0; 653 | 654 | /* Check for overlong sequences for each different length */ 655 | switch (ab) { 656 | /* Check for xx00 000x */ 657 | case 1: 658 | if ((c & 0x3e) == 0) return 0; 659 | continue; /* We know there aren't any more bytes to check */ 660 | 661 | /* Check for 1110 0000, xx0x xxxx */ 662 | case 2: 663 | if (c == 0xe0 && (*p & 0x20) == 0) return 0; 664 | break; 665 | 666 | /* Check for 1111 0000, xx00 xxxx */ 667 | case 3: 668 | if (c == 0xf0 && (*p & 0x30) == 0) return 0; 669 | break; 670 | 671 | /* Check for 1111 1000, xx00 0xxx */ 672 | case 4: 673 | if (c == 0xf8 && (*p & 0x38) == 0) return 0; 674 | break; 675 | 676 | /* Check for leading 0xfe or 0xff, 677 | and then for 1111 1100, xx00 00xx */ 678 | case 5: 679 | if (c == 0xfe || c == 0xff || 680 | (c == 0xfc && (*p & 0x3c) == 0)) return 0; 681 | break; 682 | } 683 | 684 | /* Check for valid bytes after the 2nd, if any; all must start 10 */ 685 | while (--ab > 0) { 686 | if ((*(++p) & 0xc0) != 0x80) return 0; 687 | } 688 | } 689 | 690 | return ret; 691 | } 692 | 693 | int u8_reverse(char *dest, char * src, size_t len) 694 | { 695 | size_t si=0, di=len; 696 | unsigned char c; 697 | 698 | dest[di] = '\0'; 699 | while (si < len) { 700 | c = (unsigned char)src[si]; 701 | if ((~c) & 0x80) { 702 | di--; 703 | dest[di] = c; 704 | si++; 705 | } 706 | else { 707 | switch (c>>4) { 708 | case 0xC: 709 | case 0xD: 710 | di -= 2; 
711 | *((int16_t*)&dest[di]) = *((int16_t*)&src[si]); 712 | si += 2; 713 | break; 714 | case 0xE: 715 | di -= 3; 716 | dest[di] = src[si]; 717 | *((int16_t*)&dest[di+1]) = *((int16_t*)&src[si+1]); 718 | si += 3; 719 | break; 720 | case 0xF: 721 | di -= 4; 722 | *((int32_t*)&dest[di]) = *((int32_t*)&src[si]); 723 | si += 4; 724 | break; 725 | default: 726 | return 1; 727 | } 728 | } 729 | } 730 | return 0; 731 | } 732 | -------------------------------------------------------------------------------- /utf8.h: -------------------------------------------------------------------------------- 1 | #ifndef UTF8_H 2 | #define UTF8_H 3 | 4 | extern int locale_is_utf8; 5 | 6 | /* is c the start of a utf8 sequence? */ 7 | #define isutf(c) (((c)&0xC0)!=0x80) 8 | 9 | #define UEOF ((uint32_t)-1) 10 | 11 | /* convert UTF-8 data to wide character */ 12 | size_t u8_toucs(uint32_t *dest, size_t sz, const char *src, size_t srcsz); 13 | 14 | /* the opposite conversion */ 15 | size_t u8_toutf8(char *dest, size_t sz, const uint32_t *src, size_t srcsz); 16 | 17 | /* single character to UTF-8, returns # bytes written */ 18 | size_t u8_wc_toutf8(char *dest, uint32_t ch); 19 | 20 | /* character number to byte offset */ 21 | size_t u8_offset(const char *str, size_t charnum); 22 | 23 | /* byte offset to character number */ 24 | size_t u8_charnum(const char *s, size_t offset); 25 | 26 | /* return next character, updating an index variable */ 27 | uint32_t u8_nextchar(const char *s, size_t *i); 28 | 29 | /* next character without NUL character terminator */ 30 | uint32_t u8_nextmemchar(const char *s, size_t *i); 31 | 32 | /* move to next character */ 33 | void u8_inc(const char *s, size_t *i); 34 | 35 | /* move to previous character */ 36 | void u8_dec(const char *s, size_t *i); 37 | 38 | /* returns length of next utf-8 sequence */ 39 | size_t u8_seqlen(const char *s); 40 | 41 | /* returns the # of bytes needed to encode a certain character */ 42 | size_t u8_charlen(uint32_t ch); 43 | 44 | /* 
computes the # of bytes needed to encode a WC string as UTF-8 */ 45 | size_t u8_codingsize(uint32_t *wcstr, size_t n); 46 | 47 | char read_escape_control_char(char c); 48 | 49 | /* assuming src points to the character after a backslash, read an 50 | escape sequence, storing the result in dest and returning the number of 51 | input characters processed */ 52 | size_t u8_read_escape_sequence(const char *src, size_t ssz, uint32_t *dest); 53 | 54 | /* given a wide character, convert it to an ASCII escape sequence stored in 55 | buf, where buf is "sz" bytes. returns the number of characters output. 56 | sz must be at least 3. */ 57 | int u8_escape_wchar(char *buf, size_t sz, uint32_t ch); 58 | 59 | /* convert a string "src" containing escape sequences to UTF-8 */ 60 | size_t u8_unescape(char *buf, size_t sz, const char *src); 61 | 62 | /* convert UTF-8 "src" to escape sequences. 63 | 64 | sz is buf size in bytes. must be at least 12. 65 | 66 | if escape_quotes is nonzero, quote characters will be escaped. 67 | 68 | if ascii is nonzero, the output is 7-bit ASCII, no UTF-8 survives. 69 | 70 | starts at src[*pi], updates *pi to point to the first unprocessed 71 | byte of the input. 72 | 73 | end is one more than the last allowable value of *pi. 74 | 75 | returns number of bytes placed in buf, including a NUL terminator. 76 | */ 77 | size_t u8_escape(char *buf, size_t sz, const char *src, size_t *pi, size_t end, 78 | int escape_quotes, int ascii); 79 | 80 | /* utility predicates used by the above */ 81 | int octal_digit(char c); 82 | int hex_digit(char c); 83 | 84 | /* return a pointer to the first occurrence of ch in s, or NULL if not 85 | found. character index of found character returned in *charn. */ 86 | char *u8_strchr(const char *s, uint32_t ch, size_t *charn); 87 | 88 | /* same as the above, but searches a buffer of a given size instead of 89 | a NUL-terminated string. 
*/ 90 | char *u8_memchr(const char *s, uint32_t ch, size_t sz, size_t *charn); 91 | 92 | char *u8_memrchr(const char *s, uint32_t ch, size_t sz); 93 | 94 | /* count the number of characters in a UTF-8 string */ 95 | size_t u8_strlen(const char *s); 96 | 97 | /* number of columns occupied by a string */ 98 | size_t u8_strwidth(const char *s); 99 | 100 | int u8_is_locale_utf8(const char *locale); 101 | 102 | /* printf where the format string and arguments may be in UTF-8. 103 | you can avoid this function and just use ordinary printf() if the current 104 | locale is UTF-8. */ 105 | size_t u8_vprintf(const char *fmt, va_list ap); 106 | size_t u8_printf(const char *fmt, ...); 107 | 108 | /* determine whether a sequence of bytes is valid UTF-8. length is in bytes */ 109 | int u8_isvalid(const char *str, size_t length); 110 | 111 | /* reverse a UTF-8 string. len is length in bytes. dest and src must both 112 | be allocated to at least len+1 bytes. returns 1 for error, 0 otherwise */ 113 | int u8_reverse(char *dest, char *src, size_t len); 114 | 115 | #endif 116 | --------------------------------------------------------------------------------