├── .gitignore ├── Makefile ├── README.md ├── UNLICENSE ├── test ├── benchmark.c ├── bh-utf8.h ├── tests.c └── utf8-encode.h └── utf8.h /.gitignore: -------------------------------------------------------------------------------- 1 | tests 2 | benchmark 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC = cc -std=c99 2 | CFLAGS = -Wall -Wextra -O3 -g3 -march=native 3 | 4 | all: benchmark tests 5 | 6 | benchmark: test/benchmark.c utf8.h test/utf8-encode.h test/bh-utf8.h 7 | $(CC) $(CFLAGS) $(LDFLAGS) -o $@ test/benchmark.c $(LDLIBS) 8 | 9 | tests: test/tests.c utf8.h test/utf8-encode.h 10 | $(CC) $(CFLAGS) $(LDFLAGS) -o $@ test/tests.c $(LDLIBS) 11 | 12 | bench: benchmark 13 | ./benchmark 14 | 15 | check: tests 16 | ./tests 17 | 18 | clean: 19 | rm -f benchmark tests 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Branchless UTF-8 Decoder 2 | 3 | Full article: 4 | [A Branchless UTF-8 Decoder](http://nullprogram.com/blog/2017/10/06/) 5 | 6 | ## Example usage 7 | 8 | ```c 9 | #define N (1 << 20) // 1 MiB 10 | 11 | // input buffer with 3 bytes of zero padding 12 | char buf[N+3]; 13 | char *end = buf + fread(buf, 1, N, stdin); 14 | end[0] = end[1] = end[2] = 0; 15 | 16 | // output buffer: parsed code points 17 | int len = 0; 18 | uint32_t cp[N]; 19 | 20 | int errors = 0; 21 | for (char *p = buf; p < end;) { 22 | int e; 23 | p = utf8_decode(p, cp+len++, &e); 24 | errors |= e; 25 | } 26 | if (errors) { 27 | // decode failure 28 | } 29 | ``` 30 | -------------------------------------------------------------------------------- /UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 
2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to <http://unlicense.org/> 25 | -------------------------------------------------------------------------------- /test/benchmark.c: -------------------------------------------------------------------------------- 1 | #define _POSIX_C_SOURCE 200112L 2 | #include <stdio.h> 3 | #include <stdint.h> 4 | #include <stdlib.h> 5 | #include <signal.h> 6 | 7 | #include <unistd.h> // alarm() 8 | 9 | #include "../utf8.h" 10 | #include "utf8-encode.h" 11 | #include "bh-utf8.h" 12 | 13 | #define SECONDS 6 14 | #define BUFLEN 8 // MB 15 | 16 | static uint32_t 17 | pcg32(uint64_t *s) 18 | { 19 | uint64_t m = 0x9b60933458e17d7d; 20 | uint64_t a = 0xd737232eeccdf7ed; 21 | *s = *s * m + a; 22 | int shift = 29 - (*s >> 61); 23 | return *s >> shift; 24 | } 25 | 26 | /* Generate a random codepoint whose UTF-8 length is uniformly selected. 
*/ 27 | static long 28 | randchar(uint64_t *s) 29 | { 30 | uint32_t r = pcg32(s); 31 | int len = 1 + (r & 0x3); 32 | r >>= 2; 33 | switch (len) { 34 | case 1: 35 | return r % 128; 36 | case 2: 37 | return 128 + r % (2048 - 128); 38 | case 3: 39 | return 2048 + r % (65536 - 2048); 40 | case 4: 41 | return 65536 + r % (131072 - 65536); 42 | } 43 | abort(); 44 | } 45 | 46 | static volatile sig_atomic_t running; 47 | 48 | static void 49 | alarm_handler(int signum) 50 | { 51 | (void)signum; 52 | running = 0; 53 | } 54 | 55 | /* Fill buffer with random characters, with evenly-distributed encoded 56 | * lengths. 57 | */ 58 | static void * 59 | buffer_fill(void *buf, size_t z) 60 | { 61 | uint64_t s = 0; 62 | char *p = buf; 63 | char *end = p + z; 64 | while (p < end) { 65 | long c; 66 | do 67 | c = randchar(&s); 68 | while (IS_SURROGATE(c)); 69 | p = utf8_encode(p, c); 70 | } 71 | return p; 72 | } 73 | 74 | static unsigned char * 75 | utf8_simple(unsigned char *s, long *c) 76 | { 77 | unsigned char *next; 78 | if (s[0] < 0x80) { 79 | *c = s[0]; 80 | next = s + 1; 81 | } else if ((s[0] & 0xe0) == 0xc0) { 82 | *c = ((long)(s[0] & 0x1f) << 6) | 83 | ((long)(s[1] & 0x3f) << 0); 84 | if ((s[1] & 0xc0) != 0x80) 85 | *c = -1; 86 | next = s + 2; 87 | } else if ((s[0] & 0xf0) == 0xe0) { 88 | *c = ((long)(s[0] & 0x0f) << 12) | 89 | ((long)(s[1] & 0x3f) << 6) | 90 | ((long)(s[2] & 0x3f) << 0); 91 | if ((s[1] & 0xc0) != 0x80 || 92 | (s[2] & 0xc0) != 0x80) 93 | *c = -1; 94 | next = s + 3; 95 | } else if ((s[0] & 0xf8) == 0xf0 && (s[0] <= 0xf4)) { 96 | *c = ((long)(s[0] & 0x07) << 18) | 97 | ((long)(s[1] & 0x3f) << 12) | 98 | ((long)(s[2] & 0x3f) << 6) | 99 | ((long)(s[3] & 0x3f) << 0); 100 | if ((s[1] & 0xc0) != 0x80 || 101 | (s[2] & 0xc0) != 0x80 || 102 | (s[3] & 0xc0) != 0x80) 103 | *c = -1; 104 | next = s + 4; 105 | } else { 106 | *c = -1; // invalid 107 | next = s + 1; // skip this byte 108 | } 109 | if (*c >= 0xd800 && *c <= 0xdfff) 110 | *c = -1; // surrogate half 111 | return 
next; 112 | } 113 | 114 | int 115 | main(void) 116 | { 117 | double rate; 118 | long errors, n; 119 | size_t z = BUFLEN * 1024L * 1024; 120 | unsigned char *buffer = malloc(z); 121 | unsigned char *end = buffer_fill(buffer, z - 4); 122 | 123 | /* Benchmark the branchless decoder */ 124 | running = 1; 125 | signal(SIGALRM, alarm_handler); 126 | alarm(SECONDS); 127 | errors = n = 0; 128 | do { 129 | unsigned char *p = buffer; 130 | int e = 0; 131 | uint32_t c; 132 | long count = 0; 133 | while (p < end) { 134 | p = utf8_decode(p, &c, &e); 135 | errors += !!e; // force errors to be checked 136 | count++; 137 | } 138 | if (p == end) // reached the end successfully? 139 | n++; 140 | } while (running); 141 | rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024; 142 | printf("branchless: %f MB/s, %ld errors\n", rate, errors); 143 | 144 | /* Benchmark Bjoern Hoehrmann's decoder */ 145 | running = 1; 146 | signal(SIGALRM, alarm_handler); 147 | alarm(SECONDS); 148 | errors = n = 0; 149 | do { 150 | unsigned char *p = buffer; 151 | uint32_t c; 152 | uint32_t state = 0; 153 | long count = 0; 154 | for (; p < end; p++) { 155 | if (!bh_utf8_decode(&state, &c, *p)) 156 | count++; 157 | else if (state == UTF8_REJECT) 158 | errors++; // force errors to be checked 159 | } 160 | if (p == end) // reached the end successfully? 161 | n++; 162 | } while (running); 163 | rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024; 164 | printf("Hoehrmann: %f MB/s, %ld errors\n", rate, errors); 165 | 166 | /* Benchmark simple decoder */ 167 | running = 1; 168 | signal(SIGALRM, alarm_handler); 169 | alarm(SECONDS); 170 | errors = n = 0; 171 | do { 172 | unsigned char *p = buffer; 173 | long c; 174 | long count = 0; 175 | while (p < end) { 176 | p = utf8_simple(p, &c); 177 | count++; 178 | if (c < 0) 179 | errors++; 180 | } 181 | if (p == end) // reached the end successfully? 
182 | n++; 183 | } while (running); 184 | rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024; 185 | printf("Simple: %f MB/s, %ld errors\n", rate, errors); 186 | 187 | free(buffer); 188 | } 189 | -------------------------------------------------------------------------------- /test/bh-utf8.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2008-2009 Bjoern Hoehrmann 2 | // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. 3 | 4 | #ifdef BH_ORIGINAL 5 | 6 | #define UTF8_ACCEPT 0 7 | #define UTF8_REJECT 1 8 | 9 | static const uint8_t utf8d[] = { 10 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f 11 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f 12 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f 13 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f 14 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f 15 | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf 16 | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df 17 | 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef 18 | 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff 19 | 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 20 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 21 | 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 22 | 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 23 | 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 24 | }; 25 | 26 | static uint32_t 27 | bh_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { 28 | uint32_t type = utf8d[byte]; 29 | 30 | *codep = (*state != UTF8_ACCEPT) ? 
31 | (byte & 0x3fu) | (*codep << 6) : 32 | (0xff >> type) & (byte); 33 | 34 | *state = utf8d[256 + *state*16 + type]; 35 | return *state; 36 | } 37 | 38 | #else 39 | 40 | #define UTF8_ACCEPT 0 41 | #define UTF8_REJECT 12 42 | 43 | static const uint8_t utf8d[] = { 44 | // The first part of the table maps bytes to character classes that 45 | // to reduce the size of the transition table and create bitmasks. 46 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 47 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 48 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 49 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 50 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 51 | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 52 | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 53 | 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, 54 | 55 | // The second part is a transition table that maps a combination 56 | // of a state of the automaton and a character class to a state. 57 | 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, 58 | 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, 59 | 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, 60 | 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, 61 | 12,36,12,12,12,12,12,12,12,12,12,12, 62 | }; 63 | 64 | static uint32_t 65 | bh_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { 66 | uint32_t type = utf8d[byte]; 67 | 68 | *codep = (*state != UTF8_ACCEPT) ? 
69 | (byte & 0x3fu) | (*codep << 6) : 70 | (0xff >> type) & (byte); 71 | 72 | *state = utf8d[256 + *state + type]; 73 | return *state; 74 | } 75 | 76 | #endif 77 | -------------------------------------------------------------------------------- /test/tests.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | 3 | #include "../utf8.h" 4 | #include "utf8-encode.h" 5 | 6 | static int count_pass; 7 | static int count_fail; 8 | 9 | #define TEST(x, s, ...) \ 10 | do { \ 11 | if (x) { \ 12 | printf("\033[32;1mPASS\033[0m " s "\n", __VA_ARGS__); \ 13 | count_pass++; \ 14 | } else { \ 15 | printf("\033[31;1mFAIL\033[0m " s "\n", __VA_ARGS__); \ 16 | count_fail++; \ 17 | } \ 18 | } while (0) 19 | 20 | int 21 | main(void) 22 | { 23 | /* Make sure it can decode every character */ 24 | { 25 | long failures = 0; 26 | for (unsigned long i = 0; i < 0x10ffff; i++) { 27 | if (!IS_SURROGATE(i)) { 28 | int e; 29 | uint32_t c; 30 | unsigned char buf[8] = {0}; 31 | unsigned char *end = utf8_encode(buf, i); 32 | unsigned char *res = utf8_decode(buf, &c, &e); 33 | failures += end != res || c != i || e; 34 | } 35 | } 36 | TEST(failures == 0, "decode all, errors: %ld", failures); 37 | } 38 | 39 | /* Reject everything outside of U+0000..U+10FFFF */ 40 | { 41 | long failures = 0; 42 | for (unsigned long i = 0x110000; i < 0x1fffff; i++) { 43 | int e; 44 | uint32_t c; 45 | unsigned char buf[8] = {0}; 46 | utf8_encode(buf, i); 47 | unsigned char *end = utf8_decode(buf, &c, &e); 48 | failures += !e; 49 | failures += end - buf != 4; 50 | } 51 | TEST(failures == 0, "out of range, errors: %ld", failures); 52 | } 53 | 54 | 55 | /* Does it reject all surrogate halves? 
*/ 56 | { 57 | long failures = 0; 58 | for (unsigned long i = 0xd800; i <= 0xdfff; i++) { 59 | int e; 60 | uint32_t c; 61 | unsigned char buf[8] = {0}; 62 | utf8_encode(buf, i); 63 | utf8_decode(buf, &c, &e); 64 | failures += !e; 65 | } 66 | TEST(failures == 0, "surrogate halves, errors: %ld", failures); 67 | } 68 | 69 | /* How about non-canonical encodings? */ 70 | { 71 | int e; 72 | uint32_t c; 73 | unsigned char *end; 74 | 75 | unsigned char buf2[8] = {0xc0, 0xA4}; 76 | end = utf8_decode(buf2, &c, &e); 77 | TEST(e, "non-canonical len 2, 0x%02x", e); 78 | TEST(end == buf2 + 2, "non-canonical recover 2, U+%04lx", 79 | (unsigned long)c); 80 | 81 | unsigned char buf3[8] = {0xe0, 0x80, 0xA4}; 82 | end = utf8_decode(buf3, &c, &e); 83 | TEST(e, "non-canonical len 3, 0x%02x", e); 84 | TEST(end == buf3 + 3, "non-canonical recover 3, U+%04lx", 85 | (unsigned long)c); 86 | 87 | unsigned char buf4[8] = {0xf0, 0x80, 0x80, 0xA4}; 88 | end = utf8_decode(buf4, &c, &e); 89 | TEST(e, "non-canonical encoding len 4, 0x%02x", e); 90 | TEST(end == buf4 + 4, "non-canonical recover 4, U+%04lx", 91 | (unsigned long)c); 92 | } 93 | 94 | /* Let's try some bogus byte sequences */ 95 | { 96 | int len, e; 97 | uint32_t c; 98 | 99 | /* Invalid first byte */ 100 | unsigned char buf0[4] = {0xff}; 101 | len = (unsigned char *)utf8_decode(buf0, &c, &e) - buf0; 102 | TEST(e, "bogus [ff] 0x%02x U+%04lx", e, (unsigned long)c); 103 | TEST(len == 1, "bogus [ff] recovery %d", len); 104 | 105 | /* Invalid first byte */ 106 | unsigned char buf1[4] = {0x80}; 107 | len = (unsigned char *)utf8_decode(buf1, &c, &e) - buf1; 108 | TEST(e, "bogus [80] 0x%02x U+%04lx", e, (unsigned long)c); 109 | TEST(len == 1, "bogus [80] recovery %d", len); 110 | 111 | /* Looks like a two-byte sequence but second byte is wrong */ 112 | unsigned char buf2[4] = {0xc0, 0x0a}; 113 | len = (unsigned char *)utf8_decode(buf2, &c, &e) - buf2; 114 | TEST(e, "bogus [c0 0a] 0x%02x U+%04lx", e, (unsigned long)c); 115 | TEST(len == 2, 
"bogus [c0 0a] recovery %d", len); 116 | } 117 | 118 | printf("%d fail, %d pass\n", count_fail, count_pass); 119 | return count_fail != 0; 120 | } 121 | -------------------------------------------------------------------------------- /test/utf8-encode.h: -------------------------------------------------------------------------------- 1 | #ifndef UTF8_ENCODE 2 | #define UTF8_ENCODE 3 | 4 | #define IS_SURROGATE(c) ((c) >= 0xD800U && (c) <= 0xDFFFU) 5 | 6 | static void * 7 | utf8_encode(void *buf, long c) 8 | { 9 | unsigned char *s = buf; 10 | if (c >= (1L << 16)) { 11 | s[0] = 0xf0 | (c >> 18); 12 | s[1] = 0x80 | ((c >> 12) & 0x3f); 13 | s[2] = 0x80 | ((c >> 6) & 0x3f); 14 | s[3] = 0x80 | ((c >> 0) & 0x3f); 15 | return s + 4; 16 | } else if (c >= (1L << 11)) { 17 | s[0] = 0xe0 | (c >> 12); 18 | s[1] = 0x80 | ((c >> 6) & 0x3f); 19 | s[2] = 0x80 | ((c >> 0) & 0x3f); 20 | return s + 3; 21 | } else if (c >= (1L << 7)) { 22 | s[0] = 0xc0 | (c >> 6); 23 | s[1] = 0x80 | ((c >> 0) & 0x3f); 24 | return s + 2; 25 | } else { 26 | s[0] = c; 27 | return s + 1; 28 | } 29 | } 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /utf8.h: -------------------------------------------------------------------------------- 1 | /* Branchless UTF-8 decoder 2 | * 3 | * This is free and unencumbered software released into the public domain. 4 | */ 5 | #ifndef UTF8_H 6 | #define UTF8_H 7 | 8 | #include <stdint.h> 9 | 10 | /* Decode the next character, C, from BUF, reporting errors in E. 11 | * 12 | * Since this is a branchless decoder, four bytes will be read from the 13 | * buffer regardless of the actual length of the next character. This 14 | * means the buffer _must_ have at least three bytes of zero padding 15 | * following the end of the data stream. 
16 | * 17 | * Errors are reported in E, which will be non-zero if the parsed 18 | * character was somehow invalid: invalid byte sequence, non-canonical 19 | * encoding, a surrogate half, or a code point above U+10FFFF. 
/* utf8_decode() -- decode one UTF-8 sequence without branching.
 *
 *   buf  start of the sequence; bytes buf[0] through buf[3] are always
 *        read, whatever the actual length of the sequence
 *   c    receives the decoded code point
 *   e    receives the error flags: zero on success, non-zero otherwise
 *
 * The function returns a pointer to the next character. When an error
 * occurs, this pointer will be a guess that depends on the particular
 * error, but it will always advance at least one byte.
 *
 * The code point and the error flags are built up in local variables
 * and stored through C and E once, at the end. BUF is read through an
 * unsigned char pointer, which is allowed to alias any object, so
 * updating *c and *e in between those reads would oblige the compiler
 * to write them back to memory before each following load.
 */
static void *
utf8_decode(void *buf, uint32_t *c, int *e)
{
    /* Sequence length indexed by the top five bits of the lead byte;
     * 0 marks a byte that cannot start a sequence.
     */
    static const char lengths[] = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
    };
    /* All tables below are indexed by the sequence length (0..4). */
    static const int      masks[]  = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
    static const uint32_t mins[]   = {4194304, 0, 128, 2048, 65536};
    static const int      shiftc[] = {0, 18, 12, 6, 0};
    static const int      shifte[] = {0, 6, 4, 2, 0};

    unsigned char *s = buf;
    int len = lengths[s[0] >> 3];

    /* Compute the pointer to the next character early so that the next
     * iteration can start working on the next character. A length of
     * zero (invalid lead byte) still advances by one byte.
     */
    unsigned char *next = s + len + !len;

    /* Assume a four-byte character and load four bytes. Unused bits are
     * shifted out.
     */
    uint32_t cp;
    cp  = (uint32_t)(s[0] & masks[len]) << 18;
    cp |= (uint32_t)(s[1] & 0x3f) << 12;
    cp |= (uint32_t)(s[2] & 0x3f) <<  6;
    cp |= (uint32_t)(s[3] & 0x3f) <<  0;
    cp >>= shiftc[len];

    /* Accumulate the various error conditions.
     *
     *   bit 8     code point above U+10FFFF
     *   bit 7     surrogate half (U+D800..U+DFFF)
     *   bit 6     non-canonical encoding; mins[0] is larger than any
     *             value cp can hold, so an invalid lead byte always
     *             sets this bit
     *   bits 5-4  top two bits of s[1]
     *   bits 3-2  top two bits of s[2]
     *   bits 1-0  top two bits of s[3]
     */
    int err;
    err  = (cp < mins[len]) << 6;
    err |= ((cp >> 11) == 0x1b) << 7;
    err |= (cp > 0x10FFFF) << 8;
    err |= (s[1] & 0xc0) >> 2;
    err |= (s[2] & 0xc0) >> 4;
    err |= (s[3]       ) >> 6;
    err ^= 0x2a;          /* each tail field becomes 0 iff it was binary 10 */
    err >>= shifte[len];  /* drop the checks for bytes past the sequence */

    *c = cp;
    *e = err;
    return next;
}

#endif