├── .gitignore
├── Makefile
├── README.md
├── UNLICENSE
├── test
    ├── benchmark.c
    ├── bh-utf8.h
    ├── tests.c
    └── utf8-encode.h
└── utf8.h


/.gitignore:
--------------------------------------------------------------------------------
1 | tests
2 | benchmark
3 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | CC     = cc -std=c99
 2 | CFLAGS = -Wall -Wextra -O3 -g3 -march=native
 3 | 
 4 | all: benchmark tests
 5 | 
 6 | benchmark: test/benchmark.c utf8.h test/utf8-encode.h test/bh-utf8.h
 7 | 	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ test/benchmark.c $(LDLIBS)
 8 | 
 9 | tests: test/tests.c utf8.h test/utf8-encode.h
10 | 	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ test/tests.c $(LDLIBS)
11 | 
12 | bench: benchmark
13 | 	./benchmark
14 | 
15 | check: tests
16 | 	./tests
17 | 
18 | clean:
19 | 	rm -f benchmark tests
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Branchless UTF-8 Decoder
 2 | 
 3 | Full article:
 4 | [A Branchless UTF-8 Decoder](http://nullprogram.com/blog/2017/10/06/)
 5 | 
 6 | ## Example usage
 7 | 
 8 | ```c
 9 | #define N (1 << 20)  // 1 MiB
10 | 
11 | // input buffer with 3 bytes of zero padding
12 | char buf[N+3];
13 | char *end = buf + fread(buf, 1, N, stdin);
14 | end[0] = end[1] = end[2] = 0;
15 | 
16 | // output buffer: parsed code points
17 | int len = 0;
18 | uint32_t cp[N];
19 | 
20 | int errors = 0;
21 | for (char *p = buf; p < end;) {
22 |     int e;
23 |     p = utf8_decode(p, cp+len++, &e);
24 |     errors |= e;
25 | }
26 | if (errors) {
27 |     // decode failure
28 | }
29 | ```
30 | 


--------------------------------------------------------------------------------
/UNLICENSE:
--------------------------------------------------------------------------------
 1 | This is free and unencumbered software released into the public domain.
 2 | 
 3 | Anyone is free to copy, modify, publish, use, compile, sell, or
 4 | distribute this software, either in source code form or as a compiled
 5 | binary, for any purpose, commercial or non-commercial, and by any
 6 | means.
 7 | 
 8 | In jurisdictions that recognize copyright laws, the author or authors
 9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | For more information, please refer to <http://unlicense.org/>
25 | 


--------------------------------------------------------------------------------
/test/benchmark.c:
--------------------------------------------------------------------------------
  1 | #define _POSIX_C_SOURCE 200112L
  2 | #include <stdio.h>
  3 | #include <stdint.h>
  4 | #include <stdlib.h>
  5 | #include <signal.h>
  6 | 
  7 | #include <unistd.h> // alarm()
  8 | 
  9 | #include "../utf8.h"
 10 | #include "utf8-encode.h"
 11 | #include "bh-utf8.h"
 12 | 
 13 | #define SECONDS 6
 14 | #define BUFLEN  8 // MB
 15 | 
 16 | static uint32_t
 17 | pcg32(uint64_t *s)
 18 | {
 19 |     uint64_t m = 0x9b60933458e17d7d;
 20 |     uint64_t a = 0xd737232eeccdf7ed;
 21 |     *s = *s * m + a;
 22 |     int shift = 29 - (*s >> 61);
 23 |     return *s >> shift;
 24 | }
 25 | 
 26 | /* Generate a random codepoint whose UTF-8 length is uniformly selected. */
 27 | static long
 28 | randchar(uint64_t *s)
 29 | {
 30 |     uint32_t r = pcg32(s);
 31 |     int len = 1 + (r & 0x3);
 32 |     r >>= 2;
 33 |     switch (len) {
 34 |         case 1:
 35 |             return r % 128;
 36 |         case 2:
 37 |             return 128 + r % (2048 - 128);
 38 |         case 3:
 39 |             return 2048 + r % (65536 - 2048);
 40 |         case 4:
 41 |             return 65536 + r % (131072 - 65536);
 42 |     }
 43 |     abort();
 44 | }
 45 | 
 46 | static volatile sig_atomic_t running;
 47 | 
 48 | static void
 49 | alarm_handler(int signum)
 50 | {
 51 |     (void)signum;
 52 |     running = 0;
 53 | }
 54 | 
 55 | /* Fill buffer with random characters, with evenly-distributed encoded
 56 |  * lengths.
 57 |  */
 58 | static void *
 59 | buffer_fill(void *buf, size_t z)
 60 | {
 61 |     uint64_t s = 0;
 62 |     char *p = buf;
 63 |     char *end = p + z;
 64 |     while (p < end) {
 65 |         long c;
 66 |         do
 67 |             c = randchar(&s);
 68 |         while (IS_SURROGATE(c));
 69 |         p = utf8_encode(p, c);
 70 |     }
 71 |     return p;
 72 | }
 73 | 
 74 | static unsigned char *
 75 | utf8_simple(unsigned char *s, long *c)
 76 | {
 77 |     unsigned char *next;
 78 |     if (s[0] < 0x80) {
 79 |         *c = s[0];
 80 |         next = s + 1;
 81 |     } else if ((s[0] & 0xe0) == 0xc0) {
 82 |         *c = ((long)(s[0] & 0x1f) <<  6) |
 83 |              ((long)(s[1] & 0x3f) <<  0);
 84 |         if ((s[1] & 0xc0) != 0x80)
 85 |             *c = -1;
 86 |         next = s + 2;
 87 |     } else if ((s[0] & 0xf0) == 0xe0) {
 88 |         *c = ((long)(s[0] & 0x0f) << 12) |
 89 |              ((long)(s[1] & 0x3f) <<  6) |
 90 |              ((long)(s[2] & 0x3f) <<  0);
 91 |         if ((s[1] & 0xc0) != 0x80 ||
 92 |             (s[2] & 0xc0) != 0x80)
 93 |             *c = -1;
 94 |         next = s + 3;
 95 |     } else if ((s[0] & 0xf8) == 0xf0 && (s[0] <= 0xf4)) {
 96 |         *c = ((long)(s[0] & 0x07) << 18) |
 97 |              ((long)(s[1] & 0x3f) << 12) |
 98 |              ((long)(s[2] & 0x3f) <<  6) |
 99 |              ((long)(s[3] & 0x3f) <<  0);
100 |         if ((s[1] & 0xc0) != 0x80 ||
101 |             (s[2] & 0xc0) != 0x80 ||
102 |             (s[3] & 0xc0) != 0x80)
103 |             *c = -1;
104 |         next = s + 4;
105 |     } else {
106 |         *c = -1; // invalid
107 |         next = s + 1; // skip this byte
108 |     }
109 |     if (*c >= 0xd800 && *c <= 0xdfff)
110 |         *c = -1; // surrogate half
111 |     return next;
112 | }
113 | 
114 | int
115 | main(void)
116 | {
117 |     double rate;
118 |     long errors, n;
119 |     size_t z = BUFLEN * 1024L * 1024;
120 |     unsigned char *buffer = malloc(z);
121 |     unsigned char *end = buffer_fill(buffer, z - 4);
122 | 
123 |     /* Benchmark the branchless decoder */
124 |     running = 1;
125 |     signal(SIGALRM, alarm_handler);
126 |     alarm(SECONDS);
127 |     errors = n = 0;
128 |     do {
129 |         unsigned char *p = buffer;
130 |         int e = 0;
131 |         uint32_t c;
132 |         long count = 0;
133 |         while (p < end) {
134 |             p = utf8_decode(p, &c, &e);
135 |             errors += !!e;  // force errors to be checked
136 |             count++;
137 |         }
138 |         if (p == end) // reached the end successfully?
139 |             n++;
140 |     } while (running);
141 |     rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
142 |     printf("branchless: %f MB/s, %ld errors\n", rate, errors);
143 | 
144 |     /* Benchmark Bjoern Hoehrmann's decoder */
145 |     running = 1;
146 |     signal(SIGALRM, alarm_handler);
147 |     alarm(SECONDS);
148 |     errors = n = 0;
149 |     do {
150 |         unsigned char *p = buffer;
151 |         uint32_t c;
152 |         uint32_t state = 0;
153 |         long count = 0;
154 |         for (; p < end; p++) {
155 |             if (!bh_utf8_decode(&state, &c, *p))
156 |                 count++;
157 |             else if (state == UTF8_REJECT)
158 |                 errors++;  // force errors to be checked
159 |         }
160 |         if (p == end) // reached the end successfully?
161 |             n++;
162 |     } while (running);
163 |     rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
164 |     printf("Hoehrmann:  %f MB/s, %ld errors\n", rate, errors);
165 | 
166 |     /* Benchmark simple decoder */
167 |     running = 1;
168 |     signal(SIGALRM, alarm_handler);
169 |     alarm(SECONDS);
170 |     errors = n = 0;
171 |     do {
172 |         unsigned char *p = buffer;
173 |         long c;
174 |         long count = 0;
175 |         while (p < end) {
176 |             p = utf8_simple(p, &c);
177 |             count++;
178 |             if (c < 0)
179 |                 errors++;
180 |         }
181 |         if (p == end) // reached the end successfully?
182 |             n++;
183 |     } while (running);
184 |     rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
185 |     printf("Simple:     %f MB/s, %ld errors\n", rate, errors);
186 | 
187 |     free(buffer);
188 | }
189 | 


--------------------------------------------------------------------------------
/test/bh-utf8.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 2 | // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
 3 | 
 4 | #ifdef BH_ORIGINAL
 5 | 
/* Decoder states that callers test for (BH_ORIGINAL table layout):
 * UTF8_ACCEPT is the state in which a new code point starts, UTF8_REJECT
 * is the state entered on invalid input.
 */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/* Combined lookup table. The first 256 entries map an input byte to a
 * character class; the entries from index 256 on form the state transition
 * table, indexed as 256 + state*16 + class by bh_utf8_decode().
 */
static const uint8_t utf8d[] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
25 | 
26 | static uint32_t
27 | bh_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
28 |     uint32_t type = utf8d[byte];
29 | 
30 |     *codep = (*state != UTF8_ACCEPT) ?
31 |         (byte & 0x3fu) | (*codep << 6) :
32 |         (0xff >> type) & (byte);
33 | 
34 |     *state = utf8d[256 + *state*16 + type];
35 |     return *state;
36 | }
37 | 
38 | #else
39 | 
/* Decoder states that callers test for: UTF8_ACCEPT is the state in which a
 * new code point starts, UTF8_REJECT is the state entered on invalid input.
 * In this layout the state values are offsets into the transition table.
 */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

static const uint8_t utf8d[] = {
  // The first part of the table maps bytes to character classes in
  // order to reduce the size of the transition table and create bitmasks.
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  // The second part is a transition table that maps a combination
  // of a state of the automaton and a character class to a state.
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};
63 | 
64 | static uint32_t
65 | bh_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
66 |   uint32_t type = utf8d[byte];
67 | 
68 |   *codep = (*state != UTF8_ACCEPT) ?
69 |     (byte & 0x3fu) | (*codep << 6) :
70 |     (0xff >> type) & (byte);
71 | 
72 |   *state = utf8d[256 + *state + type];
73 |   return *state;
74 | }
75 | 
76 | #endif
77 | 


--------------------------------------------------------------------------------
/test/tests.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | 
  3 | #include "../utf8.h"
  4 | #include "utf8-encode.h"
  5 | 
/* Running totals of TEST() outcomes, reported at the end of main(). */
static int count_pass;
static int count_fail;

/* Record and print the outcome of one check.
 *
 * X is the condition under test. S is a printf format string literal (it
 * is concatenated with the PASS/FAIL prefix, so it cannot be a variable)
 * and the remaining arguments are its values; at least one is required.
 * Prints a green "PASS" or red "FAIL" line using ANSI escape sequences and
 * bumps the matching counter.
 */
#define TEST(x, s, ...) \
    do { \
        if (x) { \
            printf("\033[32;1mPASS\033[0m " s "\n", __VA_ARGS__); \
            count_pass++; \
        } else { \
            printf("\033[31;1mFAIL\033[0m " s "\n", __VA_ARGS__); \
            count_fail++; \
        } \
    } while (0)
 19 | 
 20 | int
 21 | main(void)
 22 | {
 23 |     /* Make sure it can decode every character */
 24 |     {
 25 |         long failures = 0;
 26 |         for (unsigned long i = 0; i < 0x10ffff; i++) {
 27 |             if (!IS_SURROGATE(i)) {
 28 |                 int e;
 29 |                 uint32_t c;
 30 |                 unsigned char buf[8] = {0};
 31 |                 unsigned char *end = utf8_encode(buf, i);
 32 |                 unsigned char *res = utf8_decode(buf, &c, &e);
 33 |                 failures += end != res || c != i || e;
 34 |             }
 35 |         }
 36 |         TEST(failures == 0, "decode all, errors: %ld", failures);
 37 |     }
 38 | 
 39 |     /* Reject everything outside of U+0000..U+10FFFF */
 40 |     {
 41 |         long failures = 0;
 42 |         for (unsigned long i = 0x110000; i < 0x1fffff; i++) {
 43 |             int e;
 44 |             uint32_t c;
 45 |             unsigned char buf[8] = {0};
 46 |             utf8_encode(buf, i);
 47 |             unsigned char *end = utf8_decode(buf, &c, &e);
 48 |             failures += !e;
 49 |             failures += end - buf != 4;
 50 |         }
 51 |         TEST(failures == 0, "out of range, errors: %ld", failures);
 52 |     }
 53 | 
 54 | 
 55 |     /* Does it reject all surrogate halves? */
 56 |     {
 57 |         long failures = 0;
 58 |         for (unsigned long i = 0xd800; i <= 0xdfff; i++) {
 59 |             int e;
 60 |             uint32_t c;
 61 |             unsigned char buf[8] = {0};
 62 |             utf8_encode(buf, i);
 63 |             utf8_decode(buf, &c, &e);
 64 |             failures += !e;
 65 |         }
 66 |         TEST(failures == 0, "surrogate halves, errors: %ld", failures);
 67 |     }
 68 | 
 69 |     /* How about non-canonical encodings? */
 70 |     {
 71 |         int e;
 72 |         uint32_t c;
 73 |         unsigned char *end;
 74 | 
 75 |         unsigned char buf2[8] = {0xc0, 0xA4};
 76 |         end = utf8_decode(buf2, &c, &e);
 77 |         TEST(e, "non-canonical len 2, 0x%02x", e);
 78 |         TEST(end == buf2 + 2, "non-canonical recover 2, U+%04lx",
 79 |              (unsigned long)c);
 80 | 
 81 |         unsigned char buf3[8] = {0xe0, 0x80, 0xA4};
 82 |         end = utf8_decode(buf3, &c, &e);
 83 |         TEST(e, "non-canonical len 3, 0x%02x", e);
 84 |         TEST(end == buf3 + 3, "non-canonical recover 3, U+%04lx",
 85 |              (unsigned long)c);
 86 | 
 87 |         unsigned char buf4[8] = {0xf0, 0x80, 0x80, 0xA4};
 88 |         end = utf8_decode(buf4, &c, &e);
 89 |         TEST(e, "non-canonical encoding len 4, 0x%02x", e);
 90 |         TEST(end == buf4 + 4, "non-canonical recover 4, U+%04lx",
 91 |              (unsigned long)c);
 92 |     }
 93 | 
 94 |     /* Let's try some bogus byte sequences */
 95 |     {
 96 |         int len, e;
 97 |         uint32_t c;
 98 | 
 99 |         /* Invalid first byte */
100 |         unsigned char buf0[4] = {0xff};
101 |         len = (unsigned char *)utf8_decode(buf0, &c, &e) - buf0;
102 |         TEST(e, "bogus [ff] 0x%02x U+%04lx", e, (unsigned long)c);
103 |         TEST(len == 1, "bogus [ff] recovery %d", len);
104 | 
105 |         /* Invalid first byte */
106 |         unsigned char buf1[4] = {0x80};
107 |         len = (unsigned char *)utf8_decode(buf1, &c, &e) - buf1;
108 |         TEST(e, "bogus [80] 0x%02x U+%04lx", e, (unsigned long)c);
109 |         TEST(len == 1, "bogus [80] recovery %d", len);
110 | 
111 |         /* Looks like a two-byte sequence but second byte is wrong */
112 |         unsigned char buf2[4] = {0xc0, 0x0a};
113 |         len = (unsigned char *)utf8_decode(buf2, &c, &e) - buf2;
114 |         TEST(e, "bogus [c0 0a] 0x%02x U+%04lx", e, (unsigned long)c);
115 |         TEST(len == 2, "bogus [c0 0a] recovery %d", len);
116 |     }
117 | 
118 |     printf("%d fail, %d pass\n", count_fail, count_pass);
119 |     return count_fail != 0;
120 | }
121 | 


--------------------------------------------------------------------------------
/test/utf8-encode.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTF8_ENCODE
 2 | #define UTF8_ENCODE
 3 | 
/* True when C lies in the UTF-16 surrogate range U+D800..U+DFFF.
 * C is evaluated twice, so it must not have side effects.
 */
#define IS_SURROGATE(c) ((c) >= 0xD800U && (c) <= 0xDFFFU)
 5 | 
 6 | static void *
 7 | utf8_encode(void *buf, long c)
 8 | {
 9 |     unsigned char *s = buf;
10 |     if (c >= (1L << 16)) {
11 |         s[0] = 0xf0 |  (c >> 18);
12 |         s[1] = 0x80 | ((c >> 12) & 0x3f);
13 |         s[2] = 0x80 | ((c >>  6) & 0x3f);
14 |         s[3] = 0x80 | ((c >>  0) & 0x3f);
15 |         return s + 4;
16 |     } else if (c >= (1L << 11)) {
17 |         s[0] = 0xe0 |  (c >> 12);
18 |         s[1] = 0x80 | ((c >>  6) & 0x3f);
19 |         s[2] = 0x80 | ((c >>  0) & 0x3f);
20 |         return s + 3;
21 |     } else if (c >= (1L << 7)) {
22 |         s[0] = 0xc0 |  (c >>  6);
23 |         s[1] = 0x80 | ((c >>  0) & 0x3f);
24 |         return s + 2;
25 |     } else {
26 |         s[0] = c;
27 |         return s + 1;
28 |     }
29 | }
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/utf8.h:
--------------------------------------------------------------------------------
 1 | /* Branchless UTF-8 decoder
 2 |  *
 3 |  * This is free and unencumbered software released into the public domain.
 4 |  */
 5 | #ifndef UTF8_H
 6 | #define UTF8_H
 7 | 
 8 | #include <stdint.h>
 9 | 
/* Decode one UTF-8 sequence starting at BUF.
 *
 * The decoded code point is stored in *C and an error mask in *E. *E is
 * zero for a valid sequence and non-zero when the lead byte is invalid, a
 * continuation byte is malformed, the encoding is non-canonical (overlong),
 * the value is a surrogate half, or the value exceeds U+10FFFF.
 *
 * The decoder is branchless: it always reads four bytes from BUF no matter
 * how long the sequence really is. The input therefore _must_ be followed
 * by at least three bytes of zero padding.
 *
 * Returns a pointer to the start of the next sequence. After an error the
 * position is a best guess, but it is always at least one byte past BUF.
 */
static void *
utf8_decode(void *buf, uint32_t *c, int *e)
{
    /* Sequence length indexed by the top five bits of the lead byte;
     * zero marks a byte that cannot start a sequence.
     */
    static const char lengths[] = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
    };
    /* All remaining tables are indexed by sequence length (0 = invalid). */
    static const int masks[]  = {0x00, 0x7f, 0x1f, 0x0f, 0x07}; /* lead-byte payload bits */
    static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536}; /* smallest canonical value */
    static const int shiftc[] = {0, 18, 12, 6, 0}; /* drops unused code point bits */
    static const int shifte[] = {0, 6, 4, 2, 0};   /* drops unused tail-byte error bits */

    unsigned char *s = buf;
    int len = lengths[s[0] >> 3];

    /* Work out where the next sequence starts before anything else, so the
     * caller's next iteration does not have to wait for the rest of this
     * function. An invalid lead byte (len == 0) still advances one byte.
     */
    unsigned char *next = s + len + !len;

    /* Assemble as if this were a four-byte sequence, then shift away the
     * bits contributed by bytes that are not part of it.
     */
    uint32_t cp = ((uint32_t)(s[0] & masks[len]) << 18)
                | ((uint32_t)(s[1] & 0x3f) << 12)
                | ((uint32_t)(s[2] & 0x3f) <<  6)
                | ((uint32_t)(s[3] & 0x3f) <<  0);
    cp >>= shiftc[len];

    /* Gather every error condition into one mask. */
    int err = (cp < mins[len]) << 6;       /* non-canonical encoding */
    err |= ((cp >> 11) == 0x1b) << 7;      /* surrogate half */
    err |= (cp > 0x10FFFF) << 8;           /* out of range */
    err |= (s[1] & 0xc0) >> 2;             /* top two bits of each tail byte */
    err |= (s[2] & 0xc0) >> 4;
    err |= (s[3]       ) >> 6;
    err ^= 0x2a;                           /* each pair must have been binary 10 */
    err >>= shifte[len];                   /* ignore tail bytes beyond the sequence */

    *c = cp;
    *e = err;
    return next;
}
67 | 
68 | #endif
69 | 


--------------------------------------------------------------------------------