├── README.md ├── utf8decoder.h ├── test.c └── utf8decoder.c /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floodyberry/utf8dfadecoder/HEAD/README.md -------------------------------------------------------------------------------- /utf8decoder.h: -------------------------------------------------------------------------------- 1 | #ifndef UTF8DECODER_H 2 | #define UTF8DECODER_H 3 | 4 | #if defined(_MSC_VER) 5 | typedef unsigned char uint8_t; 6 | typedef unsigned short uint16_t; 7 | typedef unsigned int uint32_t; 8 | #define inline __forceinline 9 | #else 10 | #include 11 | #endif 12 | 13 | #include 14 | 15 | #define utf_replacement 0xfffd 16 | 17 | typedef struct utf8_decode_state_t { 18 | uint32_t state, c; 19 | } utf8_decode_state; 20 | 21 | void utf8_unpack_tables(void); 22 | 23 | int utf8_is_valid(const uint8_t *m, size_t len); 24 | 25 | /* UTF-16 */ 26 | size_t utf8_to_utf16(const uint8_t *m, size_t mlen, uint16_t *out); 27 | 28 | void utf8_to_utf16_init(utf8_decode_state *st); 29 | void utf8_to_utf16_continue(utf8_decode_state *st, const uint8_t *m, size_t mlen, size_t *read, uint16_t *out, size_t *written); 30 | void utf8_to_utf16_finish(utf8_decode_state *st, uint16_t *out, size_t *written); 31 | 32 | /* UTF-32 */ 33 | size_t utf8_to_utf32(const uint8_t *m, size_t mlen, uint32_t *out); 34 | 35 | void utf8_to_utf32_init(utf8_decode_state *st); 36 | void utf8_to_utf32_continue(utf8_decode_state *st, const uint8_t *m, size_t mlen, size_t *read, uint32_t *out, size_t *written); 37 | void utf8_to_utf32_finish(utf8_decode_state *st, uint32_t *out, size_t *written); 38 | 39 | /* largest size of the resulting string from fromchar -> tochar */ 40 | #define max_output_utf8_to_utf16_characters(len) (size_t)(len) 41 | #define max_output_utf8_to_utf16_bytes(len) (max_output_utf8_to_utf16_characters(len) * sizeof(uint16_t)) 42 | #define max_output_utf8_to_utf32_characters(len) (size_t)(len) 43 | #define max_output_utf8_to_utf32_bytes(len) (max_output_utf8_to_utf32_characters(len) * sizeof(uint32_t)) 44 | 45 | #endif /* UTF8DECODER_H */ 46 | 47 | -------------------------------------------------------------------------------- /test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "utf8decoder.h" 5 | 6 | /* Unicode character validity tests */ 7 | #define is_in_range(c, lo, hi) ((c >= lo) && (c <= hi)) 8 | #define is_surrogate(c) is_in_range(c, 0xd800, 0xdfff) 9 | #define is_noncharacter(c) is_in_range(c, 0xfdd0, 0xfdef) 10 | #define is_reserved(c) ((c & 0xfffe) == 0xfffe) 11 | #define is_outofrange(c) (c > 0x10ffff) 12 | #define is_invalid(c) (is_reserved(c) || is_outofrange(c) || is_noncharacter(c) || is_surrogate(c)) 13 | 14 | /* encode a UTF-32 character to UTF-8 */ 15 | static inline size_t 16 | encode_utf8(uint32_t c, uint8_t *out) { 17 | static const uint8_t mask[] = {0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; 18 | size_t len; 19 | uint32_t t; 20 | 21 | if (c < 0x80) { 22 | out[0] = c; 23 | return 1; 24 | } else { 25 | t = c; 26 | len = 0; 27 | if (t >= 0x10000) { out[3] = (c | 0x80) & 0xbf; c >>= 6; len += 1; } 28 | if (t >= 0x800) { out[2] = (c | 0x80) & 0xbf; c >>= 6; len += 1; } 29 | { out[1] = (c | 0x80) & 0xbf; c >>= 6; } 30 | { out[0] = (c | mask[len]); } 31 | return len + 2; 32 | } 33 | } 34 | 35 | /* decode a UTF-16 stream to UTF-32, no error checking */ 36 | static inline size_t 37 | decode_utf16(uint16_t *c, uint32_t *out) { 38 | if (is_in_range(c[0], 0xd800, 0xdbff)) { 39 | out[0] = (((uint32_t)(c[0] & 0x3ff) << 10) | (c[1] & 0x3ff)) + 0x10000; 40 | return 2; 41 | } else { 42 | out[0] = c[0]; 43 | return 1; 44 | } 45 | } 46 | 47 | /* test encoding and decoding of every value from 0x0 to 0x110000 (1 above maximum) */ 48 | static void 49 | test_full_range() { 50 | uint32_t i, utf32, converted[4]; 51 | uint16_t utf16[4]; 52 | uint8_t utf8[4]; 53 | size_t len8, len16, len32; 54 | 55 | for (i = 0; i <= 0x110000; i++) { 56 | utf32 = (is_invalid(i)) ? utf_replacement : i; /* the expected character */ 57 | len8 = encode_utf8(i, utf8); /* encode the (possibly invalid) character */ 58 | 59 | /* utf16 */ 60 | len16 = utf8_to_utf16(utf8, len8, utf16); /* decode the (possibly invalid) chracter */ 61 | decode_utf16(utf16, converted); 62 | if (utf32 != converted[0]) 63 | printf("UTF32->UTF8->UTF16: Mismatch at %x, WANT: %x, GOT %x\n", i, utf32, converted[0]); 64 | 65 | /* utf32 */ 66 | len32 = utf8_to_utf32(utf8, len8, converted); /* decode the (possibly invalid) chracter */ 67 | if (utf32 != converted[0]) 68 | printf("UTF32->UTF8->UTF32: Mismatch at %x, WANT: %x, GOT %x\n", i, utf32, converted[0]); 69 | } 70 | } 71 | 72 | /* test encoding and decoding of every value from 0x0 to 0x110000 (1 above maximum), streamed */ 73 | static void 74 | test_full_range_onepass() { 75 | uint32_t *utf32, *utf32_wanted, *utf32_pos; 76 | uint16_t *utf16, *utf16_pos; 77 | uint8_t *utf8, *utf8_pos; 78 | uint32_t i; 79 | size_t len8, len16, len32, pos, random_len; 80 | size_t utf8_left, read, written; 81 | utf8_decode_state incremental_state; 82 | 83 | /* generate utf8 buffer and correct utf32 buffer */ 84 | utf32_wanted = (uint32_t *)malloc(0x110001 * sizeof(uint32_t)); 85 | utf8 = (uint8_t *)malloc(4 * 0x110001); /* maximum spaced needed */ 86 | 87 | for (i = 0, pos = 0; i < 0x110001; i++) { 88 | utf32_wanted[i] = (is_invalid(i)) ? utf_replacement : i; /* the expected character */ 89 | len8 = encode_utf8(i, utf8 + pos); /* encode the (possibly invalid) character */ 90 | pos += len8; 91 | } 92 | 93 | utf32 = (uint32_t *)malloc(max_output_utf8_to_utf32_bytes(pos)); /* maximum spaced needed */ 94 | 95 | 96 | /* utf16 */ 97 | utf16 = (uint16_t *)malloc(max_output_utf8_to_utf16_bytes(pos)); /* maximum spaced needed */ 98 | memset(utf16, 0, max_output_utf8_to_utf16_bytes(pos)); 99 | 100 | /* convert utf8 to utf16, then utf16 to utf32 */ 101 | len16 = utf8_to_utf16(utf8, pos, utf16); 102 | for (read = 0, written = 0; read < len16;) { 103 | read += decode_utf16(utf16 + read, utf32 + written); 104 | written += 1; 105 | } 106 | if (written != 0x110001) 107 | printf("UTF32->UTF8->UTF16: One pass conversion resulted in %x characters, wanted %x\n", (uint32_t)written, i); 108 | else if (memcmp(utf32_wanted, utf32, written * sizeof(uint32_t)) != 0) 109 | printf("UTF32->UTF8->UTF16: One pass conversion didn't match expected values\n"); 110 | 111 | /* incremental utf8->utf16 */ 112 | utf8_pos = utf8; 113 | utf8_left = pos; 114 | utf16_pos = utf16; 115 | memset(utf16, 0, max_output_utf8_to_utf16_bytes(pos)); 116 | utf8_to_utf16_init(&incremental_state); 117 | while (utf8_left) { 118 | random_len = (((((utf8_left * 0xcafefade) >> 7) * 0xbeeffeed) >> 24) & 31) + 1; 119 | utf8_to_utf16_continue(&incremental_state, utf8_pos, (random_len <= utf8_left) ? random_len : utf8_left, &read, utf16_pos, &written); 120 | utf8_left -= read; 121 | utf8_pos += read; 122 | utf16_pos += written; 123 | } 124 | utf8_to_utf16_finish(&incremental_state, utf16_pos, &written); 125 | utf16_pos += written; 126 | len16 = utf16_pos - utf16; 127 | 128 | for (read = 0, written = 0; read < len16;) { 129 | read += decode_utf16(utf16 + read, utf32 + written); 130 | written += 1; 131 | } 132 | len32 = written; 133 | if (len32 != 0x110001) 134 | printf("UTF32->UTF8->UTF16: Incremental conversion resulted in %x characters, wanted %x\n", (uint32_t)len32, i); 135 | else if (memcmp(utf32_wanted, utf32, len32 * sizeof(uint32_t)) != 0) 136 | printf("UTF32->UTF8->UTF16: Incremental conversion didn't match expected values\n"); 137 | 138 | free(utf16); 139 | 140 | 141 | 142 | 143 | /* utf32 */ 144 | memset(utf32, 0, max_output_utf8_to_utf32_bytes(pos)); 145 | 146 | /* convert utf8 to utf32 */ 147 | len32 = utf8_to_utf32(utf8, pos, utf32); 148 | if (len32 != 0x110001) 149 | printf("UTF32->UTF8->UTF32: One pass conversion resulted in %x characters, wanted %x\n", (uint32_t)len32, i); 150 | else if (memcmp(utf32_wanted, utf32, len32 * sizeof(uint32_t)) != 0) 151 | printf("UTF32->UTF8->UTF32: One pass conversion didn't match expected values\n"); 152 | 153 | /* incremental utf8->utf32 */ 154 | utf8_pos = utf8; 155 | utf8_left = pos; 156 | utf32_pos = utf32; 157 | memset(utf32, 0, max_output_utf8_to_utf32_bytes(pos)); 158 | utf8_to_utf32_init(&incremental_state); 159 | while (utf8_left) { 160 | random_len = (((((utf8_left * 0xdeadbeef) >> 8) * 0xcafebabe) >> 24) & 31) + 1; 161 | utf8_to_utf32_continue(&incremental_state, utf8_pos, (random_len <= utf8_left) ? random_len : utf8_left, &read, utf32_pos, &written); 162 | utf8_left -= read; 163 | utf8_pos += read; 164 | utf32_pos += written; 165 | } 166 | utf8_to_utf32_finish(&incremental_state, utf32_pos, &written); 167 | utf32_pos += written; 168 | 169 | len32 = utf32_pos - utf32; 170 | if (len32 != 0x110001) 171 | printf("UTF32->UTF8->UTF32: Incremental conversion resulted in %x characters, wanted %x\n", (uint32_t)len32, i); 172 | else if (memcmp(utf32_wanted, utf32, len32 * sizeof(uint32_t)) != 0) 173 | printf("UTF32->UTF8->UTF32: Incremental conversion didn't match expected values\n"); 174 | 175 | free(utf32_wanted); 176 | free(utf32); 177 | free(utf8); 178 | } 179 | 180 | 181 | /* test decoding of all overlong sequences (including 5 and 6 byte sequences) */ 182 | static void 183 | test_overlong() { 184 | static const uint8_t masks[] = {0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; 185 | static const uint32_t highest_overlong[] = {0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff}; 186 | 187 | uint32_t i, j, len, val; 188 | uint32_t c, converted[6]; 189 | uint8_t utf8[6]; 190 | 191 | for (i = 0; i < 5; i++) { 192 | len = i + 2; 193 | for (val = 0; val < highest_overlong[i]; val++) { 194 | if (val == utf_replacement) 195 | continue; 196 | 197 | c = val; 198 | for (j = len; j != 0; j--) { 199 | utf8[j - 1] = (c | 0x80) & 0xbf; 200 | c >>= 6; 201 | } 202 | utf8[0] = (c | masks[i]); 203 | 204 | utf8_to_utf32(utf8, len, converted); 205 | if (converted[0] == val) 206 | printf("UTF8 %u bytes: Overlong encoded value %x successfully decoded!\n", len, val); 207 | else if (converted[0] != utf_replacement) 208 | printf("UTF8 %u bytes: Overlong encoded value %x decoded incorrectly!\n", len, val); 209 | } 210 | } 211 | } 212 | 213 | /* invalid single bytes */ 214 | static void 215 | test_invalid_single_bytes() { 216 | uint32_t i, converted; 217 | uint8_t utf8[1]; 218 | 219 | for (i = 0x80; i <= 0xff; i++) { 220 | utf8[0] = i; 221 | utf8_to_utf32(utf8, 1, &converted); 222 | if (converted != utf_replacement) 223 | printf("UTF8: Invalid byte value %x decoded improperly to %x!\n", i, converted); 224 | } 225 | } 226 | 227 | 228 | int main() { 229 | utf8_unpack_tables(); 230 | 231 | test_full_range(); 232 | test_full_range_onepass(); 233 | test_overlong(); 234 | test_invalid_single_bytes(); 235 | 236 | return 0; 237 | } -------------------------------------------------------------------------------- /utf8decoder.c: -------------------------------------------------------------------------------- 1 | #include "utf8decoder.h" 2 | 3 | /* 4 | Starting from: 5 | 6 | LeadingByteValue[256] = { 7 | Standard UTF-8 leading byte value lookup... 8 | } 9 | 10 | with 25 character types, Type = CharacterTypes[c]: 11 | 12 | CharacterTypes[256] = { 13 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 22 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 23 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 24 | 7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 7, 9,10, 25 | 11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12, 26 | 12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12, 27 | 13,14,14,14,14,14,14,14,14,14,14,14,14,15,16,17, 28 | 18,19,19,19,20,21,21,21,22,22,22,22,23,23,24,24, 29 | } 30 | 31 | and 21 states, with the transition from each state to the next state indexed by character type: 32 | 33 | StateTransitions[21 * 25] = { 34 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 3,10,11,12,13,14,19,18,20, 7, 8, 9, 1, 35 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 36 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 37 | 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 38 | 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 39 | 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 40 | 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 41 | 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 42 | 2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 43 | 2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 44 | 2, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 45 | 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 46 | 2, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 47 | 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 48 | 2, 3, 3, 3, 3, 3, 3, 3,15, 3,16, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 49 | 2, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 50 | 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 51 | 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,16, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 52 | 2, 4,17, 4,17, 4,17, 4, 4, 4,17, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 53 | 2, 6, 6, 4,17, 4,17, 4, 4, 4,17, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 54 | 2, 4,17, 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 55 | } 56 | 57 | where: 58 | 59 | State 0 = Accept 60 | State 1 = Reject (properly encoded UTF-8 but invalid value) 61 | State 2 = Fail (unexpected byte in UTF-8 stream) 62 | 63 | and to start: 64 | 65 | Type = CharacterType[c] 66 | Codepoint = LeadingByteValue[c] 67 | State = StateTransitions[(0 * 25) + Type] 68 | 69 | and while (State > Fail): 70 | 71 | Type = CharacterType[c] 72 | Codepoint = (Codepoint << 6) + (c & 0x3f) 73 | State = StateTransitions[(State * 25) + Type] 74 | 75 | States can be pre-multiplied by 25, but that will require 16 bits 76 | versus 8 77 | 78 | Instead of separate type & transition tables, they can be combined 79 | so a single lookup is used. Store the table as 256 * 21, and have each 80 | 256 byte array per state be the mapping of the character type to 81 | the next state. 82 | */ 83 | 84 | 85 | /* pre-multiply the table by 256? avoids a mult by 256 in the lookup, but causes 86 | the table to double in size */ 87 | 88 | /* #define UTF8_PREMULTIPLIED_TABLE */ 89 | 90 | #if defined(UTF8_PREMULTIPLIED_TABLE) 91 | #define UTF8_TABLE_TYPE uint16_t 92 | #define UTF8_TABLE_PRECALC_MULTIPLIER 256 93 | #define UTF8_TABLE_MULTIPLIER 1 94 | #else 95 | #define UTF8_TABLE_TYPE uint8_t 96 | #define UTF8_TABLE_PRECALC_MULTIPLIER 1 97 | #define UTF8_TABLE_MULTIPLIER 256 98 | #endif 99 | 100 | static uint8_t utf8_leading_byte_value[256]; 101 | static UTF8_TABLE_TYPE utf8_state_table[256 * 21]; 102 | 103 | /* 104 | This is the 256 * 21 table, run length compressed, 105 | packed in to a bitstream as a 5 bit value and 5 bit run 106 | length. Run lengths over 20 look up their length in 107 | utf8_state_table_lengths, and a value of 30 indicates 108 | a run length of 640 109 | 110 | 107 bytes of lookup tables + unpacked code vs storing 111 | the 5376 byte table. 112 | */ 113 | 114 | static const uint16_t utf8_state_table_packed[49] = { 115 | 0x8780,0xc45d,0x2aa8,0xc62c,0x0b42,0xcc2e,0x0720,0x670d, 116 | 0x9220,0x1044,0x83c2,0xfa2d,0xa2d8,0x2d87,0xd97a,0x9ba2, 117 | 0xfa2d,0xa2d9,0x2da3,0xb17a,0x8ac3,0xb63e,0xc3e8,0x2b16, 118 | 0xd8fa,0x0fa2,0xc2fc,0x3038,0x0e88,0xb060,0x8a00,0x720e, 119 | 0xa210,0x0d0f,0xe883,0xc5e4,0x5e40,0xe40c,0x40c5,0x0c5e, 120 | 0x1ba2,0x5e48,0xe40c,0x40c5,0x0c5e,0x93a2,0x8317,0x62b9, 121 | 0x0003 122 | }; 123 | 124 | static const uint8_t utf8_state_table_lengths[9] = { 125 | 0x1e,0x20,0x30,0x37,0x3e,0x3f,0x40,0x80,0xc0 126 | }; 127 | 128 | #define utf8_accept (0 * UTF8_TABLE_PRECALC_MULTIPLIER) 129 | #define utf8_reject (1 * UTF8_TABLE_PRECALC_MULTIPLIER) 130 | #define utf8_fail (2 * UTF8_TABLE_PRECALC_MULTIPLIER) 131 | 132 | /* generate the tables used by the decoder */ 133 | void 134 | utf8_unpack_tables(void) { 135 | uint32_t i, j, c, count, val, p; 136 | size_t bitsleft; 137 | 138 | /* generate the leading byte values */ 139 | for (c = 0, i = 128; i != 0; i >>= 1) 140 | for (j = 0; j < i; j++) 141 | utf8_leading_byte_value[c++] = (i != 64) ? (uint8_t)j : 0; 142 | utf8_leading_byte_value[c] = 0; 143 | 144 | /* unpack the rle'd state table */ 145 | for (c = 0, i = 0, bitsleft = 0, p = 0; c != 256 * 21;) { 146 | if (bitsleft < 10) { 147 | p |= ((uint32_t)utf8_state_table_packed[i++] << bitsleft); 148 | bitsleft += 16; 149 | } 150 | bitsleft -= 10; 151 | val = (p & 0x1f) * UTF8_TABLE_PRECALC_MULTIPLIER; p >>= 5; 152 | count = (p & 0x1f); p >>= 5; 153 | count = (count < 21) ? count : (count < 30) ? utf8_state_table_lengths[count-21] : 640; 154 | while (count--) 155 | utf8_state_table[c++] = val; 156 | } 157 | } 158 | 159 | /* does this presumed UTF-8 stream have any invalid bytes or invalid codepoints? */ 160 | int 161 | utf8_is_valid(const uint8_t *m, size_t len) { 162 | UTF8_TABLE_TYPE state; 163 | size_t i; 164 | for (i = 0, state = 0; i < len; i++) 165 | state = utf8_state_table[m[i] + (state * UTF8_TABLE_MULTIPLIER)]; 166 | return state == utf8_accept; 167 | } 168 | 169 | /* helper to decode a single unicode character from the presumed UTF-8 byte stream, input assumed to have >= 6 bytes in it */ 170 | static inline const uint8_t * 171 | utf8_decode_unsafe(const uint8_t *m, uint32_t *c) { 172 | UTF8_TABLE_TYPE state = utf8_state_table[*m]; 173 | *c = utf8_leading_byte_value[*m++]; 174 | while (state > utf8_fail) { 175 | state = utf8_state_table[*m + (state * UTF8_TABLE_MULTIPLIER)]; 176 | *c = (*c << 6) | (*m++ & 0x3f); 177 | } 178 | if (state != utf8_accept) { 179 | *c = utf_replacement; 180 | m -= (state == utf8_fail); 181 | } 182 | return m; 183 | } 184 | 185 | /* helper to decode a single unicode character from the presumed UTF-8 byte stream, also verifies input length */ 186 | static inline const uint8_t * 187 | utf8_decode(const uint8_t *m, const uint8_t *end, uint32_t *c) { 188 | UTF8_TABLE_TYPE state = utf8_state_table[*m]; 189 | *c = utf8_leading_byte_value[*m++]; 190 | while ((state > utf8_fail) && (m < end)) { 191 | state = utf8_state_table[*m + (state * UTF8_TABLE_MULTIPLIER)]; 192 | *c = (*c << 6) | (*m++ & 0x3f); 193 | } 194 | if (state != utf8_accept) { 195 | *c = utf_replacement; 196 | m -= (state == utf8_fail); 197 | } 198 | return m; 199 | } 200 | 201 | /* helper to encode a UTF-32 character as UTF-16 (no error checking) */ 202 | static inline uint16_t * 203 | utf32_to_utf16_unsafe(uint16_t *out, uint32_t c) { 204 | if (c < 0x10000) { 205 | out[0] = (uint16_t)c; 206 | return out + 1; 207 | } else { 208 | out[0] = (uint16_t)((0xd800 - (0x10000 >> 10)) + (c >> 10)); 209 | out[1] = (uint16_t)((c & 0x3ff) | 0xdc00); 210 | return out + 2; 211 | } 212 | } 213 | 214 | /* convert a UTF-8 stream to UTF-16 */ 215 | size_t 216 | utf8_to_utf16(const uint8_t *m, size_t mlen, uint16_t *out) { 217 | const uint8_t *end = m + mlen, *end6 = (mlen >= 6) ? (end - 6) : m; 218 | uint16_t *start = out; 219 | uint32_t c; 220 | 221 | while (m < end6) { 222 | while ((m < end6) && (*m < 0x80)) 223 | *out++ = (uint16_t)*m++; 224 | 225 | while ((m < end6) && (*m >= 0x80)) { 226 | m = utf8_decode_unsafe(m, &c); 227 | out = utf32_to_utf16_unsafe(out, c); 228 | } 229 | } 230 | 231 | while (m < end) { 232 | m = utf8_decode(m, end, &c); 233 | out = utf32_to_utf16_unsafe(out, c); 234 | } 235 | 236 | return out - start; 237 | } 238 | 239 | /* incremental UTF-8 -> UTF-16 decoder */ 240 | void 241 | utf8_to_utf16_init(utf8_decode_state *st) { 242 | st->c = 0; 243 | st->state = 0; 244 | } 245 | 246 | static inline void 247 | utf8_decode_continue_utf16(const uint8_t **m, const uint8_t *end, UTF8_TABLE_TYPE *state, uint32_t *c, uint16_t **out) { 248 | while ((*state > utf8_fail) && (*m < end)) { 249 | *state = utf8_state_table[*(*m) + (*state * UTF8_TABLE_MULTIPLIER)]; 250 | *c = (*c << 6) | (*(*m)++ & 0x3f); 251 | } 252 | if (*state <= utf8_fail) { 253 | if (*state != utf8_accept) { 254 | *c = utf_replacement; 255 | *m -= (*state == utf8_fail); 256 | } 257 | if (*c < 0x10000) { 258 | *(*out)++ = (uint16_t)*c; 259 | } else { 260 | (*out)[0] = (uint16_t)((0xd800 - (0x10000 >> 10)) + (*c >> 10)); 261 | (*out)[1] = (uint16_t)((*c & 0x3ff) | 0xdc00); 262 | (*out) += 2; 263 | } 264 | } 265 | } 266 | 267 | void 268 | utf8_to_utf16_continue(utf8_decode_state *st, const uint8_t *m, size_t mlen, size_t *read, uint16_t *out, size_t *written) { 269 | const uint8_t *m_start = m, *m_end = m_start + mlen; 270 | uint16_t *out_start = out; 271 | UTF8_TABLE_TYPE state = (UTF8_TABLE_TYPE)st->state; 272 | uint32_t c = st->c; 273 | 274 | if (state) 275 | utf8_decode_continue_utf16(&m, m_end, &state, &c, &out); 276 | 277 | while (m < m_end) { 278 | while ((m < m_end) && (*m < 0x80)) 279 | *out++ = (uint16_t)*m++; 280 | 281 | while ((m < m_end) && (*m >= 0x80)) { 282 | state = utf8_state_table[*m]; 283 | c = utf8_leading_byte_value[*m++]; 284 | utf8_decode_continue_utf16(&m, m_end, &state, &c, &out); 285 | } 286 | } 287 | 288 | *read = m - m_start; 289 | *written = out - out_start; 290 | st->state = (state <= utf8_fail) ? 0 : state; 291 | st->c = c; 292 | } 293 | 294 | void 295 | utf8_to_utf16_finish(utf8_decode_state *st, uint16_t *out, size_t *written) { 296 | *written = 0; 297 | 298 | if (st->state != utf8_accept) { 299 | *written = 1; 300 | *out = utf_replacement; 301 | } 302 | } 303 | 304 | 305 | /* convert a UTF-8 stream to UTF-32 */ 306 | size_t 307 | utf8_to_utf32(const uint8_t *m, size_t mlen, uint32_t *out) { 308 | const uint8_t *end = m + mlen, *end6 = (mlen >= 6) ? (end - 6) : m; 309 | uint32_t *start = out, c; 310 | 311 | while (m < end6) { 312 | while ((m < end6) && (*m < 0x80)) 313 | *out++ = (uint32_t)*m++; 314 | 315 | while ((m < end6) && (*m >= 0x80)) { 316 | m = utf8_decode_unsafe(m, &c); 317 | *out++ = c; 318 | } 319 | } 320 | 321 | while (m < end) { 322 | m = utf8_decode(m, end, &c); 323 | *out++ = c; 324 | } 325 | 326 | return out - start; 327 | } 328 | 329 | /* incremental UTF-8 -> UTF-32 decoder */ 330 | void 331 | utf8_to_utf32_init(utf8_decode_state *st) { 332 | st->c = 0; 333 | st->state = 0; 334 | } 335 | 336 | static inline void 337 | utf8_decode_continue_utf32(const uint8_t **m, const uint8_t *end, UTF8_TABLE_TYPE *state, uint32_t *c, uint32_t **out) { 338 | while ((*state > utf8_fail) && (*m < end)) { 339 | *state = utf8_state_table[*(*m) + (*state * UTF8_TABLE_MULTIPLIER)]; 340 | *c = (*c << 6) | (*(*m)++ & 0x3f); 341 | } 342 | if (*state <= utf8_fail) { 343 | if (*state != utf8_accept) { 344 | *c = utf_replacement; 345 | *m -= (*state == utf8_fail); 346 | } 347 | *(*out)++ = *c; 348 | } 349 | } 350 | 351 | void 352 | utf8_to_utf32_continue(utf8_decode_state *st, const uint8_t *m, size_t mlen, size_t *read, uint32_t *out, size_t *written) { 353 | const uint8_t *m_start = m, *m_end = m_start + mlen; 354 | uint32_t *out_start = out; 355 | UTF8_TABLE_TYPE state = (UTF8_TABLE_TYPE)st->state; 356 | uint32_t c = st->c; 357 | 358 | if (state) 359 | utf8_decode_continue_utf32(&m, m_end, &state, &c, &out); 360 | 361 | while (m < m_end) { 362 | while ((m < m_end) && (*m < 0x80)) 363 | *out++ = (uint32_t)*m++; 364 | 365 | while ((m < m_end) && (*m >= 0x80)) { 366 | state = utf8_state_table[*m]; 367 | c = utf8_leading_byte_value[*m++]; 368 | utf8_decode_continue_utf32(&m, m_end, &state, &c, &out); 369 | } 370 | } 371 | 372 | *read = m - m_start; 373 | *written = out - out_start; 374 | st->state = (state <= utf8_fail) ? 0 : state; 375 | st->c = c; 376 | } 377 | 378 | void 379 | utf8_to_utf32_finish(utf8_decode_state *st, uint32_t *out, size_t *written) { 380 | *written = 0; 381 | 382 | if (st->state != utf8_accept) { 383 | *written = 1; 384 | *out = utf_replacement; 385 | } 386 | } 387 | 388 | --------------------------------------------------------------------------------