├── README.md ├── utf8_valid.h └── test.c /README.md: -------------------------------------------------------------------------------- 1 | 2 | utf8_valid.h 3 | ============ 4 | 5 | This header file provides functions to validate UTF-8 encoding form according to the specification published by Unicode and ISO/IEC 10646:2011. 6 | 7 | 8 | ```c 9 | 10 | bool utf8_valid(const char *src, size_t len); 11 | bool utf8_check(const char *src, size_t len, size_t *cursor); 12 | size_t utf8_maximal_subpart(const char *src, size_t len); 13 | 14 | ``` 15 | -------------------------------------------------------------------------------- /utf8_valid.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Christian Hansen 3 | * 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 9 | * 1. Redistributions of source code must retain the above copyright notice, this 10 | * list of conditions and the following disclaimer. 11 | * 2. Redistributions in binary form must reproduce the above copyright notice, 12 | * this list of conditions and the following disclaimer in the documentation 13 | * and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. 
/*
 * IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#ifndef UTF8_VALID_H
#define UTF8_VALID_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 *    UTF-8 Encoding Form
 *
 *    U+0000..U+007F       0xxxxxxx
 *    U+0080..U+07FF       110xxxxx 10xxxxxx
 *    U+0800..U+FFFF       1110xxxx 10xxxxxx 10xxxxxx
 *   U+10000..U+10FFFF     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 *
 *    U+0000..U+007F       00..7F
 *                      N  C0..C1  80..BF                   1100000x 10xxxxxx
 *    U+0080..U+07FF       C2..DF  80..BF
 *                      N  E0      80..9F  80..BF           11100000 100xxxxx
 *    U+0800..U+0FFF       E0      A0..BF  80..BF
 *    U+1000..U+CFFF       E1..EC  80..BF  80..BF
 *    U+D000..U+D7FF       ED      80..9F  80..BF
 *                      S  ED      A0..BF  80..BF           11101101 101xxxxx
 *    U+E000..U+FFFF       EE..EF  80..BF  80..BF
 *                      N  F0      80..8F  80..BF  80..BF   11110000 1000xxxx
 *   U+10000..U+3FFFF      F0      90..BF  80..BF  80..BF
 *   U+40000..U+FFFFF      F1..F3  80..BF  80..BF  80..BF
 *  U+100000..U+10FFFF     F4      80..8F  80..BF  80..BF   11110100 1000xxxx
 *
 *    Legend:
 *      N = Non-shortest form
 *      S = Surrogates
 */

/*
 * Validates that the `len` bytes at `src` are well-formed UTF-8.
 *
 *   src     bytes to validate; may be NULL only when len is 0
 *   len     number of bytes at src
 *   cursor  optional (may be NULL); receives the byte offset of the first
 *           ill-formed or truncated sequence, or len when everything is valid
 *
 * Returns true when the whole input is well-formed, false otherwise.
 *
 * The scan works on byte offsets rather than on an `end - 3` pointer so that
 * no pointer before the start of the buffer is ever formed for short inputs.
 */
bool
utf8_check(const char *src, size_t len, size_t *cursor) {
  const unsigned char *bytes = (const unsigned char *)src;
  size_t pos = 0;

  while (pos < len) {
    const size_t remaining = len - pos;
    unsigned char tail[4] = { 0, 0, 0, 0 };
    const unsigned char *p = bytes + pos;
    uint32_t v;

    /*
     * Fewer than four bytes left: decode from a zero-padded copy so p[1..3]
     * can be read unconditionally. A zero byte is never a continuation byte,
     * so a truncated sequence fails the pattern checks below.
     */
    if (remaining < 4) {
      memcpy(tail, p, remaining);
      p = tail;
    }

    v = p[0];
    /* 0xxxxxxx */
    if ((v & 0x80) == 0) {
      pos += 1;
      continue;
    }

    v = (v << 8) | p[1];
    /* 110xxxxx 10xxxxxx */
    if ((v & 0xE0C0) == 0xC080) {
      /* Top 4 payload bits all zero means lead C0/C1: non-shortest form */
      if ((v & 0x1E00) == 0) {
        break;
      }
      pos += 2;
      continue;
    }

    v = (v << 8) | p[2];
    /* 1110xxxx 10xxxxxx 10xxxxxx */
    if ((v & 0xF0C0C0) == 0xE08080) {
      /* Top 5 payload bits: zero is non-shortest form, 11011 is a surrogate */
      v &= 0x0F2000;
      if (v == 0 || v == 0x0D2000) {
        break;
      }
      pos += 3;
      continue;
    }

    v = (v << 8) | p[3];
    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    if ((v & 0xF8C0C0C0) == 0xF0808080) {
      /* Top 5 payload bits: zero is non-shortest form, > 10000 is > U+10FFFF */
      v &= 0x07300000;
      if (v == 0 || v > 0x04000000) {
        break;
      }
      pos += 4;
      continue;
    }

    /* Stray continuation byte, invalid lead byte, or truncated sequence */
    break;
  }

  if (cursor) {
    *cursor = pos;
  }

  return pos == len;
}

/*
 * Convenience wrapper around utf8_check() for callers that do not need the
 * offset of the first error. Returns true when src[0..len) is well-formed.
 */
bool
utf8_valid(const char *src, size_t len) {
  return utf8_check(src, len, NULL);
}

/*
 * Returns the length in bytes of the prefix of src[0..len) that is consumed
 * as one unit at an ill-formed position (typically the offset reported by
 * utf8_check()).
 *
 * The result is `len` itself when len < 2. Otherwise it is 1 when the first
 * two bytes cannot begin any well-formed sequence (including non-shortest
 * forms, surrogates and values above U+10FFFF), or the number of leading
 * bytes (2..4) that match a well-formed sequence before the input ends or a
 * non-continuation byte is found.
 */
size_t
utf8_maximal_subpart(const char *src, size_t len) {
  const unsigned char *cur = (const unsigned char *)src;
  uint32_t v;

  if (len < 2) {
    return len;
  }

  /* Needs a lead byte 11xxxxxx followed by a continuation byte 10xxxxxx */
  v = ((uint32_t)cur[0] << 8) | cur[1];
  if ((v & 0xC0C0) != 0xC080) {
    return 1;
  }

  /* 110xxxxx: two-byte lead */
  if ((v & 0x2000) == 0) {
    if ((v & 0x1E00) == 0) {
      return 1;                       /* C0/C1: non-shortest form */
    }
    return 2;
  }

  /* 1110xxxx: three-byte lead */
  if ((v & 0x1000) == 0) {
    v &= 0x0F20;
    if (v == 0 || v == 0x0D20) {
      return 1;                       /* non-shortest form or surrogate */
    }
    if (len < 3 || (cur[2] & 0xC0) != 0x80) {
      return 2;
    }
    return 3;
  }

  /* 11110xxx: four-byte lead */
  if ((v & 0x0800) == 0) {
    v &= 0x0730;
    if (v == 0 || v > 0x0400) {
      return 1;                       /* non-shortest form or > U+10FFFF */
    }
    if (len < 3 || (cur[2] & 0xC0) != 0x80) {
      return 2;
    }
    if (len < 4 || (cur[3] & 0xC0) != 0x80) {
      return 3;
    }
    return 4;
  }

  /* 11111xxx: never a valid lead byte */
  return 1;
}

#ifdef __cplusplus
}
#endif
#endif
38 | * 39 | * To encode a Unicode scalar value to a well-formed representation: 40 | * 41 | * [U+0000, U+007F] should be encoded to a sequence length of 1 42 | * [U+0080, U+07FF] should be encoded to a sequence length of 2 43 | * [U+0800, U+D7FF] should be encoded to a sequence length of 3 44 | * [U+E000, U+FFFF] should be encoded to a sequence length of 3 45 | * [U+10000, U+10FFFF] should be encoded to a sequence length of 4 46 | * 47 | * To encode a Unicode scalar value to non-shortest form representation: 48 | * 49 | * [U+0000, U+007F] can be encoded to a sequence length of [2, 6] 50 | * [U+0080, U+07FF] can be encoded to a sequence length of [3, 6] 51 | * [U+0800, U+FFFF] can be encoded to a sequence length of [4, 6] 52 | * 53 | * To encode an ordinal outside of Unicode codespace: 54 | * 55 | * [110000, 1FFFFF] can be encoded to a sequence length of 4 56 | * [200000, 3FFFFFF] can be encoded to a sequence length of 5 57 | * [4000000, 7FFFFFFF] can be encoded to a sequence length of 6 58 | */ 59 | 60 | char * 61 | encode_ord(uint32_t ord, size_t len, char *dst) { 62 | static const uint32_t kMask[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 63 | static const uint32_t kMax[6] = { 1 << 7, 1 << 11, 1 << 16, 64 | 1 << 21, 1 << 26, 1 << 31 }; 65 | size_t i; 66 | 67 | assert(len >= 1); 68 | assert(len <= 6); 69 | assert(ord < kMax[len - 1]); 70 | 71 | for (i = len - 1; i > 0; i--) { 72 | dst[i] = (ord & 0x3F) | 0x80; 73 | ord >>= 6; 74 | } 75 | dst[0] = ord | kMask[len - 1]; 76 | return dst; 77 | } 78 | 79 | char * 80 | escape_str(const char *src, size_t len, char *dst) { 81 | static const char * const kHex = "0123456789ABCDEF"; 82 | size_t i; 83 | char *d; 84 | 85 | for (d = dst, i = 0; i < len; i++) { 86 | const unsigned char c = src[i]; 87 | if (c >= ' ' && c <= '~') { 88 | if (c == '\\' || c == '"') 89 | *d++ = '\\'; 90 | *d++ = c; 91 | } 92 | else { 93 | *d++ = '\\'; 94 | *d++ = 'x'; 95 | *d++ = kHex[c >> 4]; 96 | *d++ = kHex[c & 0x0F]; 97 | } 98 | } 99 | *d = 0; 
100 | return dst; 101 | } 102 | 103 | static size_t TestCount = 0; 104 | static size_t TestFailed = 0; 105 | 106 | void 107 | test_utf8(const char *src, size_t len, size_t exp_spl, bool exp_ret, unsigned line) { 108 | char escaped[255 * 4 + 1]; 109 | size_t offset, got_spl; 110 | bool got_ret; 111 | 112 | assert(len <= 255); 113 | 114 | got_ret = utf8_check(src, len, &offset); 115 | 116 | TestCount++; 117 | 118 | if (got_ret != exp_ret) { 119 | escape_str(src, len, escaped); 120 | 121 | printf("utf8_valid(\"%s\", %d) != %s at line %u\n", 122 | escaped, (unsigned)len, exp_ret ? "true" : "false", line); 123 | 124 | TestFailed++; 125 | } 126 | 127 | src += offset; 128 | len -= offset; 129 | 130 | TestCount++; 131 | 132 | got_spl = utf8_maximal_subpart(src, len); 133 | 134 | if (got_spl != exp_spl) { 135 | escape_str(src, len, escaped); 136 | 137 | printf("utf8_maximal_subpart(\"%s\", %d) != %d (got: %d) at line %u\n", 138 | escaped, (unsigned)len, (unsigned)exp_spl, (unsigned)got_spl, line); 139 | 140 | TestFailed++; 141 | } 142 | } 143 | 144 | #define TEST_UTF8(src, len, subpart, exp) \ 145 | test_utf8(src, len, subpart, exp, __LINE__) 146 | 147 | 148 | void 149 | test_unicode_scalar_value() { 150 | uint32_t ord; 151 | char src[4]; 152 | 153 | /* Unicode scalar value [U+0000, U+007F] */ 154 | for (ord = 0x0000; ord <= 0x007F; ord++) { 155 | encode_ord(ord, 1, src); 156 | TEST_UTF8(src, 1, 0, true); 157 | } 158 | 159 | /* 160 | * Unicode scalar value [U+0080, U+07FF] 161 | * The maximal subpart is the length of the truncated sequence 162 | */ 163 | for (ord = 0x0080; ord <= 0x07FF; ord++) { 164 | encode_ord(ord, 2, src); 165 | TEST_UTF8(src, 2, 0, true); 166 | } 167 | 168 | /* 169 | * Unicode scalar value [U+0800, U+D7FF] and [U+E000, U+FFFF] 170 | * The maximal subpart is the length of the truncated sequence 171 | */ 172 | for (ord = 0x0800; ord <= 0xFFFF && (ord & 0xF800) != 0xD800; ord++) { 173 | encode_ord(ord, 3, src); 174 | 175 | TEST_UTF8(src, 3, 0, true); 176 
| if ((ord % (1 << 6)) == 0) 177 | TEST_UTF8(src, 2, 2, false); 178 | } 179 | 180 | /* 181 | * Unicode scalar value [U+10000, U+10FFF] 182 | * The maximal subpart is the length of the truncated sequence 183 | */ 184 | for (ord = 0x10000; ord <= 0x10FFFF; ord++) { 185 | encode_ord(ord, 4, src); 186 | 187 | TEST_UTF8(src, 4, 0, true); 188 | if ((ord % (1 << 6)) == 0) 189 | TEST_UTF8(src, 3, 3, false); 190 | if ((ord % (1 << 12)) == 0) 191 | TEST_UTF8(src, 2, 2, false); 192 | } 193 | } 194 | 195 | void 196 | test_non_shortest_form() { 197 | uint32_t ord; 198 | char src[4]; 199 | 200 | /* 201 | * Non-shortest form 2-byte sequence [U+0000, U+007F] 202 | * The maximal subpart is 1-byte 203 | */ 204 | for (ord = 0x0000; ord <= 0x007F; ord++) { 205 | encode_ord(ord, 2, src); 206 | TEST_UTF8(src, 2, 1, false); 207 | } 208 | 209 | /* 210 | * Non-shortest form 3-byte sequence [U+0000, U+07FF] 211 | * The maximal subpart is 1-byte 212 | */ 213 | for (ord = 0x0000; ord <= 0x07FF; ord++) { 214 | encode_ord(ord, 3, src); 215 | 216 | TEST_UTF8(src, 3, 1, false); 217 | if ((ord % (1 << 6)) == 0) 218 | TEST_UTF8(src, 2, 1, false); 219 | } 220 | 221 | /* 222 | * Non-shortest form 4-byte sequence [U+0000, U+FFFF] 223 | * The maximal subpart is 1-byte 224 | */ 225 | for (ord = 0x0000; ord <= 0xFFFF; ord++) { 226 | encode_ord(ord, 4, src); 227 | 228 | TEST_UTF8(src, 4, 1, false); 229 | if ((ord % (1 << 6)) == 0) 230 | TEST_UTF8(src, 3, 1, false); 231 | if ((ord % (1 << 12)) == 0) 232 | TEST_UTF8(src, 2, 1, false); 233 | } 234 | } 235 | 236 | void 237 | test_non_unicode() { 238 | uint32_t ord; 239 | char src[4]; 240 | 241 | /* 242 | * Code point outside Unicode codespace 243 | * The maximal subpart is 1-byte 244 | */ 245 | for (ord = 0x110000; ord <= 0x1FFFFF; ord++) { 246 | encode_ord(ord, 4, src); 247 | 248 | TEST_UTF8(src, 4, 1, false); 249 | if ((ord % (1 << 6)) == 0) 250 | TEST_UTF8(src, 3, 1, false); 251 | if ((ord % (1 << 12)) == 0) 252 | TEST_UTF8(src, 2, 1, false); 253 | } 254 | 
} 255 | 256 | void 257 | test_surrogates() { 258 | uint32_t ord; 259 | char src[4]; 260 | 261 | /* 262 | * Surrogates [U+D800, U+DFFF] 263 | * The maximal subpart is 1-byte 264 | */ 265 | for (ord = 0xD800; ord <= 0xDFFF; ord++) { 266 | encode_ord(ord, 3, src); 267 | 268 | TEST_UTF8(src, 3, 1, false); 269 | if ((ord % (1 << 6)) == 0) 270 | TEST_UTF8(src, 2, 1, false); 271 | } 272 | } 273 | 274 | void 275 | test_continuations() { 276 | uint8_t ord; 277 | char src[4]; 278 | 279 | /* 280 | * Missplaced continuation [\x80, \xBF] 281 | * The maximal subpart is 1-byte 282 | */ 283 | for (ord = 0x80; ord <= 0xBF; ord++) { 284 | src[0] = ord; 285 | TEST_UTF8(src, 1, 1, false); 286 | } 287 | } 288 | 289 | int 290 | main(int argc, char **argv) { 291 | 292 | test_unicode_scalar_value(); 293 | test_surrogates(); 294 | test_non_shortest_form(); 295 | test_non_unicode(); 296 | test_continuations(); 297 | 298 | if (TestFailed) 299 | printf("Failed %zu tests of %zu.\n", TestFailed, TestCount); 300 | else 301 | printf("Passed %zu tests.\n", TestCount); 302 | 303 | return TestFailed ? 1 : 0; 304 | } 305 | 306 | --------------------------------------------------------------------------------