├── README.md
├── utf8decoder.h
├── test.c
└── utf8decoder.c


/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floodyberry/utf8dfadecoder/HEAD/README.md


--------------------------------------------------------------------------------
/utf8decoder.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTF8DECODER_H
 2 | #define UTF8DECODER_H
 3 | 
 4 | #if defined(_MSC_VER)
 5 | typedef unsigned char uint8_t;
 6 | typedef unsigned short uint16_t;
 7 | typedef unsigned int uint32_t;
 8 | #define inline __forceinline
 9 | #else
10 | #include <stdint.h>
11 | #endif
12 | 
13 | #include <stddef.h>
14 | 
15 | #define utf_replacement 0xfffd
16 | 
17 | typedef struct utf8_decode_state_t {
18 | 	uint32_t state, c;
19 | } utf8_decode_state;
20 | 
21 | void utf8_unpack_tables(void);
22 | 
23 | int utf8_is_valid(const uint8_t *m, size_t len);
24 | 
25 | /* UTF-16 */
26 | size_t utf8_to_utf16(const uint8_t *m, size_t mlen, uint16_t *out);
27 | 
28 | void utf8_to_utf16_init(utf8_decode_state *st);
29 | void utf8_to_utf16_continue(utf8_decode_state *st, const uint8_t *m, size_t mlen, size_t *read, uint16_t *out, size_t *written);
30 | void utf8_to_utf16_finish(utf8_decode_state *st, uint16_t *out, size_t *written);
31 | 
32 | /* UTF-32 */
33 | size_t utf8_to_utf32(const uint8_t *m, size_t mlen, uint32_t *out);
34 | 
35 | void utf8_to_utf32_init(utf8_decode_state *st);
36 | void utf8_to_utf32_continue(utf8_decode_state *st, const uint8_t *m, size_t mlen, size_t *read, uint32_t *out, size_t *written);
37 | void utf8_to_utf32_finish(utf8_decode_state *st, uint32_t *out, size_t *written);
38 | 
39 | /* largest size of the resulting string from fromchar -> tochar */
40 | #define max_output_utf8_to_utf16_characters(len) (size_t)(len)
41 | #define max_output_utf8_to_utf16_bytes(len) (max_output_utf8_to_utf16_characters(len) * sizeof(uint16_t))
42 | #define max_output_utf8_to_utf32_characters(len) (size_t)(len)
43 | #define max_output_utf8_to_utf32_bytes(len) (max_output_utf8_to_utf32_characters(len) * sizeof(uint32_t))
44 | 
45 | #endif /* UTF8DECODER_H */
46 | 
47 | 


--------------------------------------------------------------------------------
/test.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <malloc.h>
  3 | #include <memory.h>
  4 | #include "utf8decoder.h"
  5 | 
  6 | /* Unicode character validity tests */
  7 | #define is_in_range(c, lo, hi) ((c >= lo) && (c <= hi))
  8 | #define is_surrogate(c) is_in_range(c, 0xd800, 0xdfff)
  9 | #define is_noncharacter(c) is_in_range(c, 0xfdd0, 0xfdef)
 10 | #define is_reserved(c) ((c & 0xfffe) == 0xfffe)
 11 | #define is_outofrange(c) (c > 0x10ffff)
 12 | #define is_invalid(c) (is_reserved(c) || is_outofrange(c) || is_noncharacter(c) || is_surrogate(c))
 13 | 
 14 | /* encode a UTF-32 character to UTF-8 */
 15 | static inline size_t
 16 | encode_utf8(uint32_t c, uint8_t *out) {
 17 | 	static const uint8_t mask[] = {0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
 18 | 	size_t len;
 19 | 	uint32_t t;
 20 | 
 21 | 	if (c < 0x80) {
 22 | 		out[0] = c;
 23 | 		return 1;
 24 | 	} else {
 25 | 		t = c;
 26 | 		len = 0;
 27 | 		if (t >= 0x10000) { out[3] = (c | 0x80) & 0xbf; c >>= 6; len += 1; }
 28 | 		if (t >= 0x800)   { out[2] = (c | 0x80) & 0xbf; c >>= 6; len += 1; }
 29 | 						  { out[1] = (c | 0x80) & 0xbf; c >>= 6; }
 30 | 						  { out[0] = (c | mask[len]); }
 31 | 		return len + 2;
 32 | 	}
 33 | }
 34 | 
 35 | /* decode a UTF-16 stream to UTF-32, no error checking */
 36 | static inline size_t
 37 | decode_utf16(uint16_t *c, uint32_t *out) {
 38 | 	if (is_in_range(c[0], 0xd800, 0xdbff)) {
 39 | 		out[0] = (((uint32_t)(c[0] & 0x3ff) << 10) | (c[1] & 0x3ff)) + 0x10000;
 40 | 		return 2;
 41 | 	} else {
 42 | 		out[0] = c[0];
 43 | 		return 1;
 44 | 	}
 45 | }
 46 | 
 47 | /* test encoding and decoding of every value from 0x0 to 0x110000 (1 above maximum) */
 48 | static void
 49 | test_full_range() {
 50 | 	uint32_t i, utf32, converted[4];
 51 | 	uint16_t utf16[4];
 52 | 	uint8_t utf8[4];
 53 | 	size_t len8, len16, len32;
 54 | 
 55 | 	for (i = 0; i <= 0x110000; i++) {
 56 | 		utf32 = (is_invalid(i)) ? utf_replacement : i; /* the expected character */
 57 | 		len8 = encode_utf8(i, utf8); /* encode the (possibly invalid) character */
 58 | 
 59 | 		/* utf16 */
 60 | 		len16 = utf8_to_utf16(utf8, len8, utf16); /* decode the (possibly invalid) chracter */
 61 | 		decode_utf16(utf16, converted);
 62 | 		if (utf32 != converted[0])
 63 | 			printf("UTF32->UTF8->UTF16: Mismatch at %x, WANT: %x, GOT %x\n", i, utf32, converted[0]);
 64 | 
 65 | 		/* utf32 */
 66 | 		len32 = utf8_to_utf32(utf8, len8, converted); /* decode the (possibly invalid) chracter */
 67 | 		if (utf32 != converted[0])
 68 | 			printf("UTF32->UTF8->UTF32: Mismatch at %x, WANT: %x, GOT %x\n", i, utf32, converted[0]);
 69 | 	}
 70 | }
 71 | 
 72 | /* test encoding and decoding of every value from 0x0 to 0x110000 (1 above maximum), streamed */
 73 | static void
 74 | test_full_range_onepass() {
 75 | 	uint32_t *utf32, *utf32_wanted, *utf32_pos;
 76 | 	uint16_t *utf16, *utf16_pos;
 77 | 	uint8_t *utf8, *utf8_pos;
 78 | 	uint32_t i;
 79 | 	size_t len8, len16, len32, pos, random_len;
 80 | 	size_t utf8_left, read, written;
 81 | 	utf8_decode_state incremental_state;
 82 | 
 83 | 	/* generate utf8 buffer and correct utf32 buffer */
 84 | 	utf32_wanted = (uint32_t *)malloc(0x110001 * sizeof(uint32_t));
 85 | 	utf8 = (uint8_t *)malloc(4 * 0x110001); /* maximum spaced needed */
 86 | 
 87 | 	for (i = 0, pos = 0; i < 0x110001; i++) {
 88 | 		utf32_wanted[i] = (is_invalid(i)) ? utf_replacement : i; /* the expected character */
 89 | 		len8 = encode_utf8(i, utf8 + pos); /* encode the (possibly invalid) character */
 90 | 		pos += len8;
 91 | 	}
 92 | 
 93 | 	utf32 = (uint32_t *)malloc(max_output_utf8_to_utf32_bytes(pos)); /* maximum spaced needed */
 94 | 
 95 | 
 96 | 	/* utf16 */
 97 | 	utf16 = (uint16_t *)malloc(max_output_utf8_to_utf16_bytes(pos)); /* maximum spaced needed */
 98 | 	memset(utf16, 0, max_output_utf8_to_utf16_bytes(pos));
 99 | 
100 | 	/* convert utf8 to utf16, then utf16 to utf32 */
101 | 	len16 = utf8_to_utf16(utf8, pos, utf16);
102 | 	for (read = 0, written = 0; read < len16;) {
103 | 		read += decode_utf16(utf16 + read, utf32 + written);
104 | 		written += 1;
105 | 	}
106 | 	if (written != 0x110001)
107 | 		printf("UTF32->UTF8->UTF16: One pass conversion resulted in %x characters, wanted %x\n", (uint32_t)written, i);
108 | 	else if (memcmp(utf32_wanted, utf32, written * sizeof(uint32_t)) != 0)
109 | 		printf("UTF32->UTF8->UTF16: One pass conversion didn't match expected values\n");
110 | 
111 | 	/* incremental utf8->utf16 */
112 | 	utf8_pos = utf8;
113 | 	utf8_left = pos;
114 | 	utf16_pos = utf16;
115 | 	memset(utf16, 0, max_output_utf8_to_utf16_bytes(pos));
116 | 	utf8_to_utf16_init(&incremental_state);
117 | 	while (utf8_left) {
118 | 		random_len = (((((utf8_left * 0xcafefade) >> 7) * 0xbeeffeed) >> 24) & 31) + 1;
119 | 		utf8_to_utf16_continue(&incremental_state, utf8_pos, (random_len <= utf8_left) ? random_len : utf8_left, &read, utf16_pos, &written);
120 | 		utf8_left -= read;
121 | 		utf8_pos += read;
122 | 		utf16_pos += written;
123 | 	}
124 | 	utf8_to_utf16_finish(&incremental_state, utf16_pos, &written);
125 | 	utf16_pos += written;
126 | 	len16 = utf16_pos - utf16;
127 | 
128 | 	for (read = 0, written = 0; read < len16;) {
129 | 		read += decode_utf16(utf16 + read, utf32 + written);
130 | 		written += 1;
131 | 	}
132 | 	len32 = written;
133 | 	if (len32 != 0x110001)
134 | 		printf("UTF32->UTF8->UTF16: Incremental conversion resulted in %x characters, wanted %x\n", (uint32_t)len32, i);
135 | 	else if (memcmp(utf32_wanted, utf32, len32 * sizeof(uint32_t)) != 0)
136 | 		printf("UTF32->UTF8->UTF16: Incremental conversion didn't match expected values\n");
137 | 
138 | 	free(utf16);
139 | 
140 | 
141 | 
142 | 
143 | 	/* utf32 */
144 | 	memset(utf32, 0, max_output_utf8_to_utf32_bytes(pos));
145 | 
146 | 	/* convert utf8 to utf32 */
147 | 	len32 = utf8_to_utf32(utf8, pos, utf32);
148 | 	if (len32 != 0x110001)
149 | 		printf("UTF32->UTF8->UTF32: One pass conversion resulted in %x characters, wanted %x\n", (uint32_t)len32, i);
150 | 	else if (memcmp(utf32_wanted, utf32, len32 * sizeof(uint32_t)) != 0)
151 | 		printf("UTF32->UTF8->UTF32: One pass conversion didn't match expected values\n");
152 | 
153 | 	/* incremental utf8->utf32 */
154 | 	utf8_pos = utf8;
155 | 	utf8_left = pos;
156 | 	utf32_pos = utf32;
157 | 	memset(utf32, 0, max_output_utf8_to_utf32_bytes(pos));
158 | 	utf8_to_utf32_init(&incremental_state);
159 | 	while (utf8_left) {
160 | 		random_len = (((((utf8_left * 0xdeadbeef) >> 8) * 0xcafebabe) >> 24) & 31) + 1;
161 | 		utf8_to_utf32_continue(&incremental_state, utf8_pos, (random_len <= utf8_left) ? random_len : utf8_left, &read, utf32_pos, &written);
162 | 		utf8_left -= read;
163 | 		utf8_pos += read;
164 | 		utf32_pos += written;
165 | 	}
166 | 	utf8_to_utf32_finish(&incremental_state, utf32_pos, &written);
167 | 	utf32_pos += written;
168 | 
169 | 	len32 = utf32_pos - utf32;
170 | 	if (len32 != 0x110001)
171 | 		printf("UTF32->UTF8->UTF32: Incremental conversion resulted in %x characters, wanted %x\n", (uint32_t)len32, i);
172 | 	else if (memcmp(utf32_wanted, utf32, len32 * sizeof(uint32_t)) != 0)
173 | 		printf("UTF32->UTF8->UTF32: Incremental conversion didn't match expected values\n");
174 | 
175 | 	free(utf32_wanted);
176 | 	free(utf32);
177 | 	free(utf8);
178 | }
179 | 
180 | 
181 | /* test decoding of all overlong sequences (including 5 and 6 byte sequences) */
182 | static void
183 | test_overlong() {
184 | 	static const uint8_t masks[] = {0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
185 | 	static const uint32_t highest_overlong[] = {0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff};
186 | 
187 | 	uint32_t i, j, len, val;
188 | 	uint32_t c, converted[6];
189 | 	uint8_t utf8[6];
190 | 
191 | 	for (i = 0; i < 5; i++) {
192 | 		len = i + 2;
193 | 		for (val = 0; val < highest_overlong[i]; val++) {
194 | 			if (val == utf_replacement)
195 | 				continue;
196 | 
197 | 			c = val;
198 | 			for (j = len; j != 0; j--) {
199 | 				utf8[j - 1] = (c | 0x80) & 0xbf;
200 | 				c >>= 6;
201 | 			}
202 | 			utf8[0] = (c | masks[i]);
203 | 
204 | 			utf8_to_utf32(utf8, len, converted);
205 | 			if (converted[0] == val)
206 | 				printf("UTF8 %u bytes: Overlong encoded value %x successfully decoded!\n", len, val);
207 | 			else if (converted[0] != utf_replacement)
208 | 				printf("UTF8 %u bytes: Overlong encoded value %x decoded incorrectly!\n", len, val);
209 | 		}
210 | 	}
211 | }
212 | 
213 | /* invalid single bytes */
214 | static void
215 | test_invalid_single_bytes() {
216 | 	uint32_t i, converted;
217 | 	uint8_t utf8[1];
218 | 
219 | 	for (i = 0x80; i <= 0xff; i++) {
220 | 		utf8[0] = i;
221 | 		utf8_to_utf32(utf8, 1, &converted);
222 | 		if (converted != utf_replacement)
223 | 			printf("UTF8: Invalid byte value %x decoded improperly to %x!\n", i, converted);
224 | 	}
225 | }
226 | 
227 | 
/* unpacks the decoder tables, then runs every test; failures are reported
   on stdout by the tests themselves */
int main() {
	/* the decoder is unusable until its tables have been generated */
	utf8_unpack_tables();

	/* round trips of the full code point range */
	test_full_range();
	test_full_range_onepass();

	/* input that must be rejected */
	test_overlong();
	test_invalid_single_bytes();

	return 0;
}


--------------------------------------------------------------------------------
/utf8decoder.c:
--------------------------------------------------------------------------------
  1 | #include "utf8decoder.h"
  2 | 
  3 | /*
  4 | 	Starting from:
  5 | 
  6 | 	LeadingByteValue[256] = {
  7 | 		Standard UTF-8 leading byte value lookup...
  8 | 	}
  9 | 
 10 | 	with 25 character types, Type = CharacterTypes[c]:
 11 | 
 12 | 	CharacterTypes[256] = {
 13 | 		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 14 | 		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 15 | 		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 16 | 		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 17 | 		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 18 | 		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 19 | 		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 20 | 		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 21 | 		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
 22 | 		 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4,
 23 | 		 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6,
 24 | 		 7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 7, 9,10,
 25 | 		11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12,
 26 | 		12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,
 27 | 		13,14,14,14,14,14,14,14,14,14,14,14,14,15,16,17,
 28 | 		18,19,19,19,20,21,21,21,22,22,22,22,23,23,24,24,
 29 | 	}
 30 | 
 31 | 	and 21 states, with the transition from each state to the next state indexed by character type:
 32 | 
 33 | 	StateTransitions[21 * 25] = {
 34 | 		0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 3,10,11,12,13,14,19,18,20, 7, 8, 9, 1,
 35 | 		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 36 | 		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 37 | 		2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 38 | 		2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 39 | 		2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 40 | 		2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 41 | 		2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 42 | 		2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 43 | 		2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 44 | 		2, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 45 | 		2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 46 | 		2, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 47 | 		2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 48 | 		2, 3, 3, 3, 3, 3, 3, 3,15, 3,16, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 49 | 		2, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 50 | 		2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 51 | 		2, 3, 3, 3, 3, 3, 3, 3, 3, 3,16, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 52 | 		2, 4,17, 4,17, 4,17, 4, 4, 4,17, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 53 | 		2, 6, 6, 4,17, 4,17, 4, 4, 4,17, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 54 | 		2, 4,17, 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 55 | 	}
 56 | 
 57 | 	where:
 58 | 
 59 | 	State 0 = Accept
 60 | 	State 1 = Reject (properly encoded UTF-8 but invalid value)
 61 | 	State 2 = Fail (unexpected byte in UTF-8 stream)
 62 | 
 63 | 	and to start, with c being the first byte of the sequence:
 64 | 
 65 | 	Type = CharacterTypes[c]
 66 | 	Codepoint = LeadingByteValue[c]
 67 | 	State = StateTransitions[(0 * 25) + Type]
 68 | 
 69 | 	and while (State > Fail), with c being the next byte each time:
 70 | 
 71 | 	Type = CharacterTypes[c]
 72 | 	Codepoint = (Codepoint << 6) + (c & 0x3f)
 73 | 	State = StateTransitions[(State * 25) + Type]
 74 | 
 75 | 	States can be pre-multiplied by 25, but that will require 16 bits
 76 | 	versus 8
 77 | 
 78 | 	Instead of separate type & transition tables, they can be combined
 79 | 	so a single lookup is used. Store the table as 256 * 21, and have each
 80 | 	256 byte array per state be the mapping of the character type to
 81 | 	the next state.
 82 | */
 83 | 
 84 | 
 85 | /* pre-multiply the table by 256? avoids a mult by 256 in the lookup, but causes
 86 |    the table to double in size */
 87 | 
 88 | /* #define UTF8_PREMULTIPLIED_TABLE */
 89 | 
 90 | #if defined(UTF8_PREMULTIPLIED_TABLE)
 91 | #define UTF8_TABLE_TYPE uint16_t
 92 | #define UTF8_TABLE_PRECALC_MULTIPLIER 256
 93 | #define UTF8_TABLE_MULTIPLIER 1
 94 | #else
 95 | #define UTF8_TABLE_TYPE uint8_t
 96 | #define UTF8_TABLE_PRECALC_MULTIPLIER 1
 97 | #define UTF8_TABLE_MULTIPLIER 256
 98 | #endif
 99 | 
100 | static uint8_t utf8_leading_byte_value[256];
101 | static UTF8_TABLE_TYPE utf8_state_table[256 * 21];
102 | 
103 | /*
104 | 	This is the 256 * 21 table, run length compressed,
105 | 	packed in to a bitstream as a 5 bit value and 5 bit run
106 | 	length. Run lengths over 20 look up their length in
107 | 	utf8_state_table_lengths, and a value of 30 indicates
108 | 	a run length of 640
109 | 
110 | 	107 bytes of lookup tables + unpacked code vs storing
111 | 	the 5376 byte table.
112 | */
113 | 
114 | static const uint16_t utf8_state_table_packed[49] = {
115 | 	0x8780,0xc45d,0x2aa8,0xc62c,0x0b42,0xcc2e,0x0720,0x670d,
116 | 	0x9220,0x1044,0x83c2,0xfa2d,0xa2d8,0x2d87,0xd97a,0x9ba2,
117 | 	0xfa2d,0xa2d9,0x2da3,0xb17a,0x8ac3,0xb63e,0xc3e8,0x2b16,
118 | 	0xd8fa,0x0fa2,0xc2fc,0x3038,0x0e88,0xb060,0x8a00,0x720e,
119 | 	0xa210,0x0d0f,0xe883,0xc5e4,0x5e40,0xe40c,0x40c5,0x0c5e,
120 | 	0x1ba2,0x5e48,0xe40c,0x40c5,0x0c5e,0x93a2,0x8317,0x62b9,
121 | 	0x0003
122 | };
123 | 
124 | static const uint8_t utf8_state_table_lengths[9] = {
125 | 	0x1e,0x20,0x30,0x37,0x3e,0x3f,0x40,0x80,0xc0
126 | };
127 | 
128 | #define utf8_accept (0 * UTF8_TABLE_PRECALC_MULTIPLIER)
129 | #define utf8_reject (1 * UTF8_TABLE_PRECALC_MULTIPLIER)
130 | #define utf8_fail   (2 * UTF8_TABLE_PRECALC_MULTIPLIER)
131 | 
132 | /* generate the tables used by the decoder */
133 | void
134 | utf8_unpack_tables(void) {
135 | 	uint32_t i, j, c, count, val, p;
136 | 	size_t bitsleft;
137 | 
138 | 	/* generate the leading byte values */
139 | 	for (c = 0, i = 128; i != 0; i >>= 1)
140 | 		for (j = 0; j < i; j++)
141 | 			utf8_leading_byte_value[c++] = (i != 64) ? (uint8_t)j : 0;
142 | 	utf8_leading_byte_value[c] = 0;
143 | 
144 | 	/* unpack the rle'd state table */
145 | 	for (c = 0, i = 0, bitsleft = 0, p = 0; c != 256 * 21;) {
146 | 		if (bitsleft < 10) {
147 | 			p |= ((uint32_t)utf8_state_table_packed[i++] << bitsleft);
148 | 			bitsleft += 16;
149 | 		}
150 | 		bitsleft -= 10;
151 | 		val = (p & 0x1f) * UTF8_TABLE_PRECALC_MULTIPLIER; p >>= 5;
152 | 		count = (p & 0x1f); p >>= 5;
153 | 		count = (count < 21) ? count : (count < 30) ? utf8_state_table_lengths[count-21] : 640;
154 | 		while (count--)
155 | 			utf8_state_table[c++] = val;
156 | 	}
157 | }
158 | 
159 | /* does this presumed UTF-8 stream have any invalid bytes or invalid codepoints? */
160 | int
161 | utf8_is_valid(const uint8_t *m, size_t len) {
162 | 	UTF8_TABLE_TYPE state;
163 | 	size_t i;
164 | 	for (i = 0, state = 0; i < len; i++)
165 | 		state = utf8_state_table[m[i] + (state * UTF8_TABLE_MULTIPLIER)];
166 | 	return state == utf8_accept;
167 | }
168 | 
169 | /* helper to decode a single unicode character from the presumed UTF-8 byte stream, input assumed to have >= 6 bytes in it */
170 | static inline const uint8_t *
171 | utf8_decode_unsafe(const uint8_t *m, uint32_t *c) {
172 | 	UTF8_TABLE_TYPE state = utf8_state_table[*m];
173 | 	*c = utf8_leading_byte_value[*m++];
174 | 	while (state > utf8_fail) {
175 | 		state = utf8_state_table[*m + (state * UTF8_TABLE_MULTIPLIER)];
176 | 		*c = (*c << 6) | (*m++ & 0x3f);
177 | 	}
178 | 	if (state != utf8_accept) {
179 | 		*c = utf_replacement;
180 | 		m -= (state == utf8_fail);
181 | 	}
182 | 	return m;
183 | }
184 | 
185 | /* helper to decode a single unicode character from the presumed UTF-8 byte stream, also verifies input length */
186 | static inline const uint8_t *
187 | utf8_decode(const uint8_t *m, const uint8_t *end, uint32_t *c) {
188 | 	UTF8_TABLE_TYPE state = utf8_state_table[*m];
189 | 	*c = utf8_leading_byte_value[*m++];
190 | 	while ((state > utf8_fail) && (m < end)) {
191 | 		state = utf8_state_table[*m + (state * UTF8_TABLE_MULTIPLIER)];
192 | 		*c = (*c << 6) | (*m++ & 0x3f);
193 | 	}
194 | 	if (state != utf8_accept) {
195 | 		*c = utf_replacement;
196 | 		m -= (state == utf8_fail);
197 | 	}
198 | 	return m;
199 | }
200 | 
/* helper to encode a UTF-32 character as UTF-16 (no error checking):
   one unit below 0x10000, a surrogate pair otherwise.
   returns the position following the units written */
static inline uint16_t *
utf32_to_utf16_unsafe(uint16_t *out, uint32_t c) {
	if (c >= 0x10000) {
		/* 0xd7c0 is 0xd800 with the 0x10000 offset (shifted down by 10) already taken off */
		*out++ = (uint16_t)(0xd7c0 + (c >> 10));
		*out++ = (uint16_t)(0xdc00 | (c & 0x3ff));
		return out;
	}

	*out++ = (uint16_t)c;
	return out;
}
213 | 
/* convert a UTF-8 stream to UTF-16, invalid input becomes utf_replacement.
   returns the number of UTF-16 units written to out */
size_t
utf8_to_utf16(const uint8_t *m, size_t mlen, uint16_t *out) {
	const uint8_t *const stop = m + mlen;
	/* below this point a whole sequence can be read without bounds checks */
	const uint8_t *const unchecked_stop = (mlen >= 6) ? (stop - 6) : m;
	uint16_t *const first = out;
	uint32_t c;

	/* bulk of the input: ASCII is copied directly, the rest decoded unchecked */
	while (m < unchecked_stop) {
		if (*m < 0x80) {
			*out++ = (uint16_t)*m++;
		} else {
			m = utf8_decode_unsafe(m, &c);
			out = utf32_to_utf16_unsafe(out, c);
		}
	}

	/* the last few bytes go through the bounds checked decoder */
	while (m < stop) {
		m = utf8_decode(m, stop, &c);
		out = utf32_to_utf16_unsafe(out, c);
	}

	return out - first;
}
238 | 
239 | /* incremental UTF-8 -> UTF-16 decoder */
240 | void
241 | utf8_to_utf16_init(utf8_decode_state *st) {
242 | 	st->c = 0;
243 | 	st->state = 0;
244 | }
245 | 
246 | static inline void
247 | utf8_decode_continue_utf16(const uint8_t **m, const uint8_t *end, UTF8_TABLE_TYPE *state, uint32_t *c, uint16_t **out) {
248 | 	while ((*state > utf8_fail) && (*m < end)) {
249 | 		*state = utf8_state_table[*(*m) + (*state * UTF8_TABLE_MULTIPLIER)];
250 | 		*c = (*c << 6) | (*(*m)++ & 0x3f);
251 | 	}
252 | 	if (*state <= utf8_fail) {
253 | 		if (*state != utf8_accept) {
254 | 			*c = utf_replacement;
255 | 			*m -= (*state == utf8_fail);
256 | 		}
257 | 		if (*c < 0x10000) {
258 | 			*(*out)++ = (uint16_t)*c;
259 | 		} else {
260 | 			(*out)[0] = (uint16_t)((0xd800 - (0x10000 >> 10)) + (*c >> 10));
261 | 			(*out)[1] = (uint16_t)((*c & 0x3ff) | 0xdc00);
262 | 			(*out) += 2;
263 | 		}
264 | 	}
265 | }
266 | 
267 | void
268 | utf8_to_utf16_continue(utf8_decode_state *st, const uint8_t *m, size_t mlen, size_t *read, uint16_t *out, size_t *written) {
269 | 	const uint8_t *m_start = m, *m_end = m_start + mlen;
270 | 	uint16_t *out_start = out;
271 | 	UTF8_TABLE_TYPE state = (UTF8_TABLE_TYPE)st->state;
272 | 	uint32_t c = st->c;
273 | 
274 | 	if (state)
275 | 		utf8_decode_continue_utf16(&m, m_end, &state, &c, &out);
276 | 
277 | 	while (m < m_end) {
278 | 		while ((m < m_end) && (*m < 0x80))
279 | 			*out++ = (uint16_t)*m++;
280 | 
281 | 		while ((m < m_end) && (*m >= 0x80)) {
282 | 			state = utf8_state_table[*m];
283 | 			c = utf8_leading_byte_value[*m++];
284 | 			utf8_decode_continue_utf16(&m, m_end, &state, &c, &out);
285 | 		}
286 | 	}
287 | 
288 | 	*read = m - m_start;
289 | 	*written = out - out_start;
290 | 	st->state = (state <= utf8_fail) ? 0 : state;
291 | 	st->c = c;
292 | }
293 | 
294 | void
295 | utf8_to_utf16_finish(utf8_decode_state *st, uint16_t *out, size_t *written) {
296 | 	*written = 0;
297 | 
298 | 	if (st->state != utf8_accept) {
299 | 		*written = 1;
300 | 		*out = utf_replacement;
301 | 	}
302 | }
303 | 
304 | 
/* convert a UTF-8 stream to UTF-32, invalid input becomes utf_replacement.
   returns the number of characters written to out */
size_t
utf8_to_utf32(const uint8_t *m, size_t mlen, uint32_t *out) {
	const uint8_t *const stop = m + mlen;
	/* below this point a whole sequence can be read without bounds checks */
	const uint8_t *const unchecked_stop = (mlen >= 6) ? (stop - 6) : m;
	uint32_t *const first = out;
	uint32_t c;

	/* bulk of the input: ASCII is copied directly, the rest decoded unchecked */
	while (m < unchecked_stop) {
		if (*m < 0x80) {
			*out++ = (uint32_t)*m++;
		} else {
			m = utf8_decode_unsafe(m, &c);
			*out++ = c;
		}
	}

	/* the last few bytes go through the bounds checked decoder */
	while (m < stop) {
		m = utf8_decode(m, stop, &c);
		*out++ = c;
	}

	return out - first;
}
328 | 
329 | /* incremental UTF-8 -> UTF-32 decoder */
330 | void
331 | utf8_to_utf32_init(utf8_decode_state *st) {
332 | 	st->c = 0;
333 | 	st->state = 0;
334 | }
335 | 
336 | static inline void
337 | utf8_decode_continue_utf32(const uint8_t **m, const uint8_t *end, UTF8_TABLE_TYPE *state, uint32_t *c, uint32_t **out) {
338 | 	while ((*state > utf8_fail) && (*m < end)) {
339 | 		*state = utf8_state_table[*(*m) + (*state * UTF8_TABLE_MULTIPLIER)];
340 | 		*c = (*c << 6) | (*(*m)++ & 0x3f);
341 | 	}
342 | 	if (*state <= utf8_fail) {
343 | 		if (*state != utf8_accept) {
344 | 			*c = utf_replacement;
345 | 			*m -= (*state == utf8_fail);
346 | 		}
347 | 		*(*out)++ = *c;
348 | 	}
349 | }
350 | 
351 | void
352 | utf8_to_utf32_continue(utf8_decode_state *st, const uint8_t *m, size_t mlen, size_t *read, uint32_t *out, size_t *written) {
353 | 	const uint8_t *m_start = m, *m_end = m_start + mlen;
354 | 	uint32_t *out_start = out;
355 | 	UTF8_TABLE_TYPE state = (UTF8_TABLE_TYPE)st->state;
356 | 	uint32_t c = st->c;
357 | 
358 | 	if (state)
359 | 		utf8_decode_continue_utf32(&m, m_end, &state, &c, &out);
360 | 
361 | 	while (m < m_end) {
362 | 		while ((m < m_end) && (*m < 0x80))
363 | 			*out++ = (uint32_t)*m++;
364 | 
365 | 		while ((m < m_end) && (*m >= 0x80)) {
366 | 			state = utf8_state_table[*m];
367 | 			c = utf8_leading_byte_value[*m++];
368 | 			utf8_decode_continue_utf32(&m, m_end, &state, &c, &out);
369 | 		}
370 | 	}
371 | 
372 | 	*read = m - m_start;
373 | 	*written = out - out_start;
374 | 	st->state = (state <= utf8_fail) ? 0 : state;
375 | 	st->c = c;
376 | }
377 | 
378 | void
379 | utf8_to_utf32_finish(utf8_decode_state *st, uint32_t *out, size_t *written) {
380 | 	*written = 0;
381 | 
382 | 	if (st->state != utf8_accept) {
383 | 		*written = 1;
384 | 		*out = utf_replacement;
385 | 	}
386 | }
387 | 
388 | 


--------------------------------------------------------------------------------