├── .gitignore
├── Makefile
├── README.md
├── UNLICENSE
├── test
│   ├── benchmark.c
│   ├── bh-utf8.h
│   ├── tests.c
│   └── utf8-encode.h
└── utf8.h
/.gitignore:
--------------------------------------------------------------------------------
1 | tests
2 | benchmark
3 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Toolchain: C99, full warnings, optimised for the build host.
CC     = cc -std=c99
CFLAGS = -Wall -Wextra -O3 -g3 -march=native

# These targets name actions, not files; without .PHONY a stray file called
# "check" or "clean" in the tree would silently disable the target.
.PHONY: all bench check clean

# Default target: build both programs.
all: benchmark tests

benchmark: test/benchmark.c utf8.h test/utf8-encode.h test/bh-utf8.h
	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ test/benchmark.c $(LDLIBS)

tests: test/tests.c utf8.h test/utf8-encode.h
	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ test/tests.c $(LDLIBS)

# Build and run the benchmark.
bench: benchmark
	./benchmark

# Build and run the test suite.
check: tests
	./tests

# Remove the built programs.
clean:
	rm -f benchmark tests
20 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Branchless UTF-8 Decoder
2 |
3 | Full article:
4 | [A Branchless UTF-8 Decoder](http://nullprogram.com/blog/2017/10/06/)
5 |
6 | ## Example usage
7 |
8 | ```c
9 | #define N (1 << 20) // 1 MiB
10 |
11 | // input buffer with 3 bytes of zero padding
12 | char buf[N+3];
13 | char *end = buf + fread(buf, 1, N, stdin);
14 | end[0] = end[1] = end[2] = 0;
15 |
16 | // output buffer: parsed code points
17 | int len = 0;
18 | uint32_t cp[N];
19 |
20 | int errors = 0;
21 | for (char *p = buf; p < end;) {
22 | int e;
23 | p = utf8_decode(p, cp+len++, &e);
24 | errors |= e;
25 | }
26 | if (errors) {
27 | // decode failure
28 | }
29 | ```
30 |
--------------------------------------------------------------------------------
/UNLICENSE:
--------------------------------------------------------------------------------
1 | This is free and unencumbered software released into the public domain.
2 |
3 | Anyone is free to copy, modify, publish, use, compile, sell, or
4 | distribute this software, either in source code form or as a compiled
5 | binary, for any purpose, commercial or non-commercial, and by any
6 | means.
7 |
8 | In jurisdictions that recognize copyright laws, the author or authors
9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 |
24 | For more information, please refer to <http://unlicense.org/>
25 |
--------------------------------------------------------------------------------
/test/benchmark.c:
--------------------------------------------------------------------------------
1 | #define _POSIX_C_SOURCE 200112L
2 | #include <stdio.h>
3 | #include <stdint.h>
4 | #include <stdlib.h>
5 | #include <signal.h>
6 |
7 | #include <unistd.h> // alarm()
8 |
9 | #include "../utf8.h"
10 | #include "utf8-encode.h"
11 | #include "bh-utf8.h"
12 |
13 | #define SECONDS 6
14 | #define BUFLEN 8 // MB
15 |
/* Step the 64-bit generator state at S and return 32 bits of output.
 *
 * The state advances as a linear congruential generator. The top three
 * bits of the new state select a right shift in the range 22..29, and
 * the shifted state is truncated to 32 bits to form the result.
 */
static uint32_t
pcg32(uint64_t *s)
{
    const uint64_t multiplier = 0x9b60933458e17d7dULL;
    const uint64_t increment  = 0xd737232eeccdf7edULL;

    uint64_t state = *s * multiplier + increment;
    *s = state;

    int shift = 29 - (int)(state >> 61);
    return (uint32_t)(state >> shift);
}
25 |
/* Draw a random code point so that each UTF-8 encoded length (1, 2, 3 or
 * 4 bytes) is picked with equal probability.
 *
 * The low two bits of one pcg32() output choose the length class; the
 * remaining bits choose a value inside that class. Note that the 3-byte
 * class can yield surrogates (callers filter those) and that the 4-byte
 * class only spans U+10000..U+1FFFF.
 */
static long
randchar(uint64_t *s)
{
    /* First code point of each length class and the number of values drawn
     * from it, indexed by (encoded length - 1).
     */
    static const uint32_t first[] = {0, 128, 2048, 65536};
    static const uint32_t span[]  = {
        128, 2048 - 128, 65536 - 2048, 131072 - 65536
    };

    uint32_t bits = pcg32(s);
    unsigned cls = bits & 0x3;
    uint32_t r = bits >> 2;
    return first[cls] + r % span[cls];
}
45 |
/* Loop flag: non-zero while a timed benchmark loop should keep running.
 * Written from the signal handler, hence volatile sig_atomic_t.
 */
static volatile sig_atomic_t running;

/* Signal handler: clear the run flag. The signal number is not used. */
static void
alarm_handler(int signum)
{
    running = 0;
    (void)signum;
}
54 |
/* Fill BUF with UTF-8 encoded random characters whose encoded lengths are
 * evenly distributed, skipping surrogates.
 *
 * Characters are appended for as long as the write position is below
 * BUF + Z. Only the start of each character is checked against that
 * limit, so the final character may extend up to three bytes beyond it;
 * the caller must leave that much slack. The generator is seeded with
 * zero, so the contents are the same on every call.
 *
 * Returns a pointer just past the last byte written.
 */
static void *
buffer_fill(void *buf, size_t z)
{
    uint64_t rng = 0;
    char *out = buf;
    char *const limit = out + z;

    while (out < limit) {
        long cp = randchar(&rng);
        while (IS_SURROGATE(cp))
            cp = randchar(&rng);
        out = utf8_encode(out, cp);
    }
    return out;
}
73 |
/* Straightforward branching UTF-8 decoder used as a benchmark baseline.
 *
 * Decodes one character starting at S into *C and returns a pointer to
 * the byte following it. *C is set to -1 when the lead byte is invalid,
 * when a continuation byte lacks the 10xxxxxx pattern, or when the
 * decoded value is a surrogate half. An invalid lead byte consumes one
 * byte; otherwise the full length implied by the lead byte is consumed
 * even on error. Overlong encodings and values above U+10FFFF that use
 * an otherwise valid lead byte are not rejected.
 */
static unsigned char *
utf8_simple(unsigned char *s, long *c)
{
    const unsigned lead = s[0];
    long cp;
    int len;
    int bad = 0;

    if (lead < 0x80) {
        cp = lead;
        len = 1;
    } else if ((lead & 0xe0) == 0xc0) {
        cp = ((long)(lead & 0x1f) << 6) |
             ((long)(s[1] & 0x3f) << 0);
        bad = (s[1] & 0xc0) != 0x80;
        len = 2;
    } else if ((lead & 0xf0) == 0xe0) {
        cp = ((long)(lead & 0x0f) << 12) |
             ((long)(s[1] & 0x3f) <<  6) |
             ((long)(s[2] & 0x3f) <<  0);
        bad = (s[1] & 0xc0) != 0x80 ||
              (s[2] & 0xc0) != 0x80;
        len = 3;
    } else if ((lead & 0xf8) == 0xf0 && lead <= 0xf4) {
        cp = ((long)(lead & 0x07) << 18) |
             ((long)(s[1] & 0x3f) << 12) |
             ((long)(s[2] & 0x3f) <<  6) |
             ((long)(s[3] & 0x3f) <<  0);
        bad = (s[1] & 0xc0) != 0x80 ||
              (s[2] & 0xc0) != 0x80 ||
              (s[3] & 0xc0) != 0x80;
        len = 4;
    } else {
        cp = -1;  // invalid lead byte
        len = 1;  // skip just this byte
    }

    if (bad)
        cp = -1;  // malformed continuation byte
    else if (cp >= 0xd800 && cp <= 0xdfff)
        cp = -1;  // surrogate half

    *c = cp;
    return s + len;
}
113 |
114 | int
115 | main(void)
116 | {
117 | double rate;
118 | long errors, n;
119 | size_t z = BUFLEN * 1024L * 1024;
120 | unsigned char *buffer = malloc(z);
121 | unsigned char *end = buffer_fill(buffer, z - 4);
122 |
123 | /* Benchmark the branchless decoder */
124 | running = 1;
125 | signal(SIGALRM, alarm_handler);
126 | alarm(SECONDS);
127 | errors = n = 0;
128 | do {
129 | unsigned char *p = buffer;
130 | int e = 0;
131 | uint32_t c;
132 | long count = 0;
133 | while (p < end) {
134 | p = utf8_decode(p, &c, &e);
135 | errors += !!e; // force errors to be checked
136 | count++;
137 | }
138 | if (p == end) // reached the end successfully?
139 | n++;
140 | } while (running);
141 | rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
142 | printf("branchless: %f MB/s, %ld errors\n", rate, errors);
143 |
144 | /* Benchmark Bjoern Hoehrmann's decoder */
145 | running = 1;
146 | signal(SIGALRM, alarm_handler);
147 | alarm(SECONDS);
148 | errors = n = 0;
149 | do {
150 | unsigned char *p = buffer;
151 | uint32_t c;
152 | uint32_t state = 0;
153 | long count = 0;
154 | for (; p < end; p++) {
155 | if (!bh_utf8_decode(&state, &c, *p))
156 | count++;
157 | else if (state == UTF8_REJECT)
158 | errors++; // force errors to be checked
159 | }
160 | if (p == end) // reached the end successfully?
161 | n++;
162 | } while (running);
163 | rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
164 | printf("Hoehrmann: %f MB/s, %ld errors\n", rate, errors);
165 |
166 | /* Benchmark simple decoder */
167 | running = 1;
168 | signal(SIGALRM, alarm_handler);
169 | alarm(SECONDS);
170 | errors = n = 0;
171 | do {
172 | unsigned char *p = buffer;
173 | long c;
174 | long count = 0;
175 | while (p < end) {
176 | p = utf8_simple(p, &c);
177 | count++;
178 | if (c < 0)
179 | errors++;
180 | }
181 | if (p == end) // reached the end successfully?
182 | n++;
183 | } while (running);
184 | rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
185 | printf("Simple: %f MB/s, %ld errors\n", rate, errors);
186 |
187 | free(buffer);
188 | }
189 |
--------------------------------------------------------------------------------
/test/bh-utf8.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2008-2009 Bjoern Hoehrmann
2 | // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
3 |
4 | #ifdef BH_ORIGINAL
5 |
// DFA states with special meaning to callers: ACCEPT is the state in which
// a complete character has just been decoded; REJECT marks invalid input.
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

// Combined lookup table for the decoder below. Entries 0..255 map an input
// byte to its character class (the trailing comments give the byte range of
// each row). The entries from index 256 onward hold the state transitions,
// sixteen per state, looked up as utf8d[256 + state*16 + class].
static const uint8_t utf8d[] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
25 |
26 | static uint32_t
27 | bh_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
28 | uint32_t type = utf8d[byte];
29 |
30 | *codep = (*state != UTF8_ACCEPT) ?
31 | (byte & 0x3fu) | (*codep << 6) :
32 | (0xff >> type) & (byte);
33 |
34 | *state = utf8d[256 + *state*16 + type];
35 | return *state;
36 | }
37 |
38 | #else
39 |
// DFA states with special meaning to callers: ACCEPT is the state in which
// a complete character has just been decoded; REJECT marks invalid input.
// In this table layout the state values are offsets into the transition
// table (multiples of 12), which is why REJECT is 12 rather than 1.
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

static const uint8_t utf8d[] = {
  // The first part of the table maps bytes to character classes, which
  // reduces the size of the transition table and helps create bitmasks.
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  // The second part is a transition table that maps a combination
  // of a state of the automaton and a character class to a state.
  // It is looked up as utf8d[256 + state + class].
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};
63 |
64 | static uint32_t
65 | bh_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
66 | uint32_t type = utf8d[byte];
67 |
68 | *codep = (*state != UTF8_ACCEPT) ?
69 | (byte & 0x3fu) | (*codep << 6) :
70 | (0xff >> type) & (byte);
71 |
72 | *state = utf8d[256 + *state + type];
73 | return *state;
74 | }
75 |
76 | #endif
77 |
--------------------------------------------------------------------------------
/test/tests.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | #include "../utf8.h"
4 | #include "utf8-encode.h"
5 |
/* Running totals of passed and failed checks. */
static int count_pass;
static int count_fail;

/* Record one check. X is the condition (evaluated once); S is a printf
 * format string literal describing the check, followed by at least one
 * format argument. Prints a coloured PASS or FAIL line and increments the
 * matching counter.
 */
#define TEST(x, s, ...) \
    do { \
        if (!(x)) { \
            printf("\033[31;1mFAIL\033[0m " s "\n", __VA_ARGS__); \
            count_fail++; \
        } else { \
            printf("\033[32;1mPASS\033[0m " s "\n", __VA_ARGS__); \
            count_pass++; \
        } \
    } while (0)
19 |
20 | int
21 | main(void)
22 | {
23 | /* Make sure it can decode every character */
24 | {
25 | long failures = 0;
26 | for (unsigned long i = 0; i < 0x10ffff; i++) {
27 | if (!IS_SURROGATE(i)) {
28 | int e;
29 | uint32_t c;
30 | unsigned char buf[8] = {0};
31 | unsigned char *end = utf8_encode(buf, i);
32 | unsigned char *res = utf8_decode(buf, &c, &e);
33 | failures += end != res || c != i || e;
34 | }
35 | }
36 | TEST(failures == 0, "decode all, errors: %ld", failures);
37 | }
38 |
39 | /* Reject everything outside of U+0000..U+10FFFF */
40 | {
41 | long failures = 0;
42 | for (unsigned long i = 0x110000; i < 0x1fffff; i++) {
43 | int e;
44 | uint32_t c;
45 | unsigned char buf[8] = {0};
46 | utf8_encode(buf, i);
47 | unsigned char *end = utf8_decode(buf, &c, &e);
48 | failures += !e;
49 | failures += end - buf != 4;
50 | }
51 | TEST(failures == 0, "out of range, errors: %ld", failures);
52 | }
53 |
54 |
55 | /* Does it reject all surrogate halves? */
56 | {
57 | long failures = 0;
58 | for (unsigned long i = 0xd800; i <= 0xdfff; i++) {
59 | int e;
60 | uint32_t c;
61 | unsigned char buf[8] = {0};
62 | utf8_encode(buf, i);
63 | utf8_decode(buf, &c, &e);
64 | failures += !e;
65 | }
66 | TEST(failures == 0, "surrogate halves, errors: %ld", failures);
67 | }
68 |
69 | /* How about non-canonical encodings? */
70 | {
71 | int e;
72 | uint32_t c;
73 | unsigned char *end;
74 |
75 | unsigned char buf2[8] = {0xc0, 0xA4};
76 | end = utf8_decode(buf2, &c, &e);
77 | TEST(e, "non-canonical len 2, 0x%02x", e);
78 | TEST(end == buf2 + 2, "non-canonical recover 2, U+%04lx",
79 | (unsigned long)c);
80 |
81 | unsigned char buf3[8] = {0xe0, 0x80, 0xA4};
82 | end = utf8_decode(buf3, &c, &e);
83 | TEST(e, "non-canonical len 3, 0x%02x", e);
84 | TEST(end == buf3 + 3, "non-canonical recover 3, U+%04lx",
85 | (unsigned long)c);
86 |
87 | unsigned char buf4[8] = {0xf0, 0x80, 0x80, 0xA4};
88 | end = utf8_decode(buf4, &c, &e);
89 | TEST(e, "non-canonical encoding len 4, 0x%02x", e);
90 | TEST(end == buf4 + 4, "non-canonical recover 4, U+%04lx",
91 | (unsigned long)c);
92 | }
93 |
94 | /* Let's try some bogus byte sequences */
95 | {
96 | int len, e;
97 | uint32_t c;
98 |
99 | /* Invalid first byte */
100 | unsigned char buf0[4] = {0xff};
101 | len = (unsigned char *)utf8_decode(buf0, &c, &e) - buf0;
102 | TEST(e, "bogus [ff] 0x%02x U+%04lx", e, (unsigned long)c);
103 | TEST(len == 1, "bogus [ff] recovery %d", len);
104 |
105 | /* Invalid first byte */
106 | unsigned char buf1[4] = {0x80};
107 | len = (unsigned char *)utf8_decode(buf1, &c, &e) - buf1;
108 | TEST(e, "bogus [80] 0x%02x U+%04lx", e, (unsigned long)c);
109 | TEST(len == 1, "bogus [80] recovery %d", len);
110 |
111 | /* Looks like a two-byte sequence but second byte is wrong */
112 | unsigned char buf2[4] = {0xc0, 0x0a};
113 | len = (unsigned char *)utf8_decode(buf2, &c, &e) - buf2;
114 | TEST(e, "bogus [c0 0a] 0x%02x U+%04lx", e, (unsigned long)c);
115 | TEST(len == 2, "bogus [c0 0a] recovery %d", len);
116 | }
117 |
118 | printf("%d fail, %d pass\n", count_fail, count_pass);
119 | return count_fail != 0;
120 | }
121 |
--------------------------------------------------------------------------------
/test/utf8-encode.h:
--------------------------------------------------------------------------------
1 | #ifndef UTF8_ENCODE
2 | #define UTF8_ENCODE
3 |
/* True when C lies in the UTF-16 surrogate range U+D800..U+DFFF. */
#define IS_SURROGATE(c) ((c) >= 0xD800U && (c) <= 0xDFFFU)

/* Encode code point C as UTF-8 at BUF and return a pointer just past the
 * last byte written (one to four bytes).
 *
 * No validation is performed: surrogates and values above U+10FFFF are
 * encoded mechanically, which the tests rely on to build invalid input.
 */
static void *
utf8_encode(void *buf, long c)
{
    unsigned char *out = buf;

    if (c < (1L << 7)) {
        *out++ = c;
    } else if (c < (1L << 11)) {
        *out++ = 0xc0 | (c >> 6);
        *out++ = 0x80 | ((c >> 0) & 0x3f);
    } else if (c < (1L << 16)) {
        *out++ = 0xe0 | (c >> 12);
        *out++ = 0x80 | ((c >> 6) & 0x3f);
        *out++ = 0x80 | ((c >> 0) & 0x3f);
    } else {
        *out++ = 0xf0 | (c >> 18);
        *out++ = 0x80 | ((c >> 12) & 0x3f);
        *out++ = 0x80 | ((c >>  6) & 0x3f);
        *out++ = 0x80 | ((c >>  0) & 0x3f);
    }
    return out;
}
30 |
31 | #endif
32 |
--------------------------------------------------------------------------------
/utf8.h:
--------------------------------------------------------------------------------
1 | /* Branchless UTF-8 decoder
2 | *
3 | * This is free and unencumbered software released into the public domain.
4 | */
5 | #ifndef UTF8_H
6 | #define UTF8_H
7 |
8 | #include <stdint.h>
9 |
/* Decode the next character, C, from BUF, reporting errors in E.
 *
 * Since this is a branchless decoder, four bytes will be read from the
 * buffer regardless of the actual length of the next character. This
 * means the buffer _must_ have at least three bytes of zero padding
 * following the end of the data stream.
 *
 * Errors are reported in E, which will be non-zero if the parsed
 * character was somehow invalid: invalid byte sequence, non-canonical
 * encoding, a surrogate half, or a value above U+10FFFF.
 *
 * The function returns a pointer to the next character. When an error
 * occurs, this pointer will be a guess that depends on the particular
 * error, but it will always advance at least one byte.
 */
static void *
utf8_decode(void *buf, uint32_t *c, int *e)
{
    /* Sequence length indexed by the top five bits of the first byte.
     * Zero marks a byte that cannot start a character (a continuation
     * byte 10xxxxxx, or 11111xxx).
     */
    static const char lengths[] = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
    };
    /* All tables below are indexed by the length found above (0..4). */
    /* Payload bits of the first byte. */
    static const int masks[]  = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
    /* Smallest code point that needs this many bytes; the entry for length
     * 0 is larger than any value the decoder can produce, so an invalid
     * first byte always sets the non-canonical error bit.
     */
    static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
    /* Right shift that discards the bits loaded from unused tail bytes. */
    static const int shiftc[] = {0, 18, 12, 6, 0};
    /* Right shift that discards the tail-byte check bits belonging to
     * bytes that are not part of this character.
     */
    static const int shifte[] = {0, 6, 4, 2, 0};

    unsigned char *s = buf;
    int len = lengths[s[0] >> 3];

    /* Compute the pointer to the next character early so that the next
     * iteration can start working on the next character. Neither Clang
     * nor GCC figure out this reordering on their own.
     */
    unsigned char *next = s + len + !len;

    /* Assume a four-byte character and load four bytes. Unused bits are
     * shifted out.
     */
    *c  = (uint32_t)(s[0] & masks[len]) << 18;
    *c |= (uint32_t)(s[1] & 0x3f) << 12;
    *c |= (uint32_t)(s[2] & 0x3f) <<  6;
    *c |= (uint32_t)(s[3] & 0x3f) <<  0;
    *c >>= shiftc[len];

    /* Accumulate the various error conditions. The low six bits collect
     * the top two bits of each of the three tail bytes; XOR with 0x2a
     * (binary 10 10 10) clears every pair that has the required "10"
     * pattern, so any bit left set there indicates a bad tail byte.
     */
    *e  = (*c < mins[len]) << 6; // non-canonical encoding
    *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
    *e |= (*c > 0x10FFFF) << 8;  // out of range?
    *e |= (s[1] & 0xc0) >> 2;
    *e |= (s[2] & 0xc0) >> 4;
    *e |= (s[3]       ) >> 6;
    *e ^= 0x2a; // top two bits of each tail byte correct?
    *e >>= shifte[len];

    return next;
}
67 |
68 | #endif
69 |
--------------------------------------------------------------------------------