├── README.md
├── utf8_valid.h
└── test.c


/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | utf8_valid.h
 3 | ============
 4 | 
 5 | This header file provides functions for validating the UTF-8 encoding form as specified by the Unicode Standard and ISO/IEC 10646:2011.
 6 | 
 7 | 
 8 | ```c
 9 | 
10 | bool    utf8_valid(const char *src, size_t len);
11 | bool    utf8_check(const char *src, size_t len, size_t *cursor);
12 | size_t  utf8_maximal_subpart(const char *src, size_t len);
13 | 
14 | ```
15 | 


--------------------------------------------------------------------------------
/utf8_valid.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
  3 |  * <https://github.com/chansen/c-utf8-valid>
  4 |  * All rights reserved.
  5 |  * 
  6 |  * Redistribution and use in source and binary forms, with or without
  7 |  * modification, are permitted provided that the following conditions are met: 
  8 |  * 
  9 |  * 1. Redistributions of source code must retain the above copyright notice, this
 10 |  *    list of conditions and the following disclaimer. 
 11 |  * 2. Redistributions in binary form must reproduce the above copyright notice,
 12 |  *    this list of conditions and the following disclaimer in the documentation
 13 |  *    and/or other materials provided with the distribution. 
 14 |  * 
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 |  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 19 |  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  */
 26 | #ifndef UTF8_VALID_H
 27 | #define UTF8_VALID_H
 28 | #include <stddef.h>
 29 | #include <string.h>
 30 | #include <stdint.h>
 31 | #include <stdbool.h>
 32 | 
 33 | #ifdef __cplusplus
 34 | extern "C" {
 35 | #endif
 36 | 
 37 | /*
 38 |  *    UTF-8 Encoding Form
 39 |  *
 40 |  *    U+0000..U+007F       0xxxxxxx
 41 |  *    U+0080..U+07FF       110xxxxx 10xxxxxx
 42 |  *    U+0800..U+FFFF       1110xxxx 10xxxxxx 10xxxxxx
 43 |  *   U+10000..U+10FFFF     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 44 |  *
 45 |  *
 46 |  *    U+0000..U+007F       00..7F
 47 |  *                      N  C0..C1  80..BF                   1100000x 10xxxxxx
 48 |  *    U+0080..U+07FF       C2..DF  80..BF
 49 |  *                      N  E0      80..9F  80..BF           11100000 100xxxxx
 50 |  *    U+0800..U+0FFF       E0      A0..BF  80..BF
 51 |  *    U+1000..U+CFFF       E1..EC  80..BF  80..BF
 52 |  *    U+D000..U+D7FF       ED      80..9F  80..BF
 53 |  *                      S  ED      A0..BF  80..BF           11101101 101xxxxx
 54 |  *    U+E000..U+FFFF       EE..EF  80..BF  80..BF
 55 |  *                      N  F0      80..8F  80..BF  80..BF   11110000 1000xxxx
 56 |  *   U+10000..U+3FFFF      F0      90..BF  80..BF  80..BF
 57 |  *   U+40000..U+FFFFF      F1..F3  80..BF  80..BF  80..BF
 58 |  *  U+100000..U+10FFFF     F4      80..8F  80..BF  80..BF   11110100 1000xxxx
 59 |  *
 60 |  *  Legend:
 61 |  *    N = Non-shortest form
 62 |  *    S = Surrogates
 63 |  */
 64 | 
 65 | bool
 66 | utf8_check(const char *src, size_t len, size_t *cursor) {
 67 |   const unsigned char *cur = (const unsigned char *)src;
 68 |   const unsigned char *end = cur + len;
 69 |   unsigned char buf[4];
 70 |   uint32_t v;
 71 | 
 72 |   while (1) {
 73 |     const unsigned char *p;
 74 |     if (cur >= end - 3) {
 75 |       if (cur == end)
 76 |         break;
 77 |       memset(buf, 0, 4);
 78 |       memcpy(buf, cur, end - cur);
 79 |       p = (const unsigned char *)buf;
 80 |     } else {
 81 |       p = cur;
 82 |     }
 83 | 
 84 |     v = p[0];
 85 |     /* 0xxxxxxx */
 86 |     if ((v & 0x80) == 0) {
 87 |       cur += 1;
 88 |       continue;
 89 |     }
 90 | 
 91 |     v = (v << 8) | p[1];
 92 |     /* 110xxxxx 10xxxxxx */
 93 |     if ((v & 0xE0C0) == 0xC080) {
 94 |       /* Ensure that the top 4 bits is not zero */
 95 |       v = v & 0x1E00;
 96 |       if (v == 0)
 97 |         break;
 98 |       cur += 2;
 99 |       continue;
100 |     }
101 | 
102 |     v = (v << 8) | p[2];
103 |     /* 1110xxxx 10xxxxxx 10xxxxxx */
104 |     if ((v & 0xF0C0C0) == 0xE08080) {
105 |       /* Ensure that the top 5 bits is not zero and not a surrogate */
106 |       v = v & 0x0F2000;
107 |       if (v == 0 || v == 0x0D2000)
108 |         break;
109 |       cur += 3;
110 |       continue;
111 |     }
112 | 
113 |     v = (v << 8) | p[3];
114 |     /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
115 |     if ((v & 0xF8C0C0C0) == 0xF0808080) {
116 |       /* Ensure that the top 5 bits is not zero and not out of range */
117 |       v = v & 0x07300000;
118 |       if (v == 0 || v > 0x04000000)
119 |         break;
120 |       cur += 4;
121 |       continue;
122 |     }
123 | 
124 |     break;
125 |   }
126 | 
127 |   if (cursor)
128 |     *cursor = (const char *)cur - src;
129 | 
130 |   return cur == end;
131 | }
132 | 
/*
 * Convenience wrapper around utf8_check() for callers that only need a
 * yes/no answer: returns true when all len bytes at src are well-formed
 * UTF-8 and false otherwise. The offset of the failure is not reported.
 */
bool
utf8_valid(const char *src, size_t len) {
  size_t *const no_cursor = NULL;

  return utf8_check(src, len, no_cursor);
}
137 | 
/*
 * Returns the number of leading bytes of src (at most len) that form either a
 * complete sequence or the longest prefix that could still begin a
 * well-formed sequence. Inputs shorter than two bytes return len unchanged;
 * otherwise the result is between 1 and 4.
 *
 * The lead byte and the second byte are classified first, because the second
 * byte alone decides whether a sequence is overlong, a surrogate or out of
 * range. The third and fourth bytes only need to be continuation bytes.
 */
size_t
utf8_maximal_subpart(const char *src, size_t len) {
  const unsigned char *s = (const unsigned char *)src;
  unsigned lead, second, hi, lo;

  if (len < 2)
    return len;

  lead = s[0];
  second = s[1];

  /* Need a lead byte 11xxxxxx followed by a continuation byte 10xxxxxx */
  if ((lead & 0xC0) != 0xC0 || (second & 0xC0) != 0x80)
    return 1;

  /* 110xxxxx: two-byte sequence; C0 and C1 are overlong */
  if ((lead & 0x20) == 0)
    return (lead & 0x1E) == 0 ? 1 : 2;

  if ((lead & 0x10) == 0) {
    /* 1110xxxx: three-byte sequence */
    hi = lead & 0x0F;
    lo = second & 0x20;
    /* E0 80..9F is overlong, ED A0..BF is a surrogate */
    if ((hi == 0x00 && lo == 0) || (hi == 0x0D && lo != 0))
      return 1;
  } else if ((lead & 0x08) == 0) {
    /* 11110xxx: four-byte sequence */
    hi = lead & 0x07;
    lo = second & 0x30;
    /* F0 80..8F is overlong, anything above F4 8F is out of range */
    if ((hi == 0 && lo == 0) || hi > 4 || (hi == 4 && lo != 0))
      return 1;
  } else {
    /* 11111xxx is never a valid lead byte */
    return 1;
  }

  if (len < 3 || (s[2] & 0xC0) != 0x80)
    return 2;

  /* A three-byte sequence is complete at this point */
  if ((lead & 0x10) == 0)
    return 3;

  if (len < 4 || (s[3] & 0xC0) != 0x80)
    return 3;

  return 4;
}
179 | 
180 | #ifdef __cplusplus
181 | }
182 | #endif
183 | #endif
184 | 
185 | 


--------------------------------------------------------------------------------
/test.c:
--------------------------------------------------------------------------------
  1 | #include <stddef.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <assert.h>
  5 | #include <stdio.h>
  6 | #include "utf8_valid.h"
  7 | 
  8 | /*
  9 |  *  UTF-8
 10 |  *
 11 |  *     U+0000..U+007F         00..7F
 12 |  *                         n  C0..C1  80..BF
 13 |  *     U+0080..U+07FF         C2..DF  80..BF
 14 |  *                         n  E0      80..9F  80..BF
 15 |  *     U+0800..U+D7FF         E0..ED  A0..9F  80..BF
 16 |  *     U+D800..U+DFFF      s  ED      A0..BF  80..BF
 17 |  *     U+E000..U+FFFF         EE..EF  80..BF  80..BF
 18 |  *                         n  F0      80..8F  80..BF  80..BF
 19 |  *     U+0800..U+FFFF         F0      80..8F  A0..BF  80..BF
 20 |  *    U+10000..U+10FFFF       F0..F4  90..8F  80..BF  80..BF
 21 |  *
 22 |  *   U-110000..U-1FFFFF    x  F4..F7  90..BF  80..BF  80..BF
 23 |  *                         xn F8      80..87  80..BF  80..BF  80..BF
 24 |  *   U-200000..U-3FFFFFF   x  F8..FB  88..BF  80..BF  80..BF  80..BF
 25 |  *                         xn FC      80..83  80..BF  80..BF  80..BF  80..BF
 26 |  *  U-4000000..U-7FFFFFFF  x  FC..FD  84..BF  80..BF  80..BF  80..BF  80..BF
 27 |  * 
 28 |  *  Legend:
 29 |  *    n = Non-shortest form
 30 |  *    s = Surrogates
 31 |  *    x = Codepoints outside Unicode codespace
 32 |  */
 33 | 
 34 | /*
 35 |  *  Encodes the given ordinal [0, 7FFFFFFF] using the UTF-8 encoding scheme
 36 |  *  to the given sequence length [1, 6]. This routine can be used to
 37 |  *  produce well-formed and ill-formed UTF-8.
 38 |  *
 39 |  *  To encode a Unicode scalar value to a well-formed representation:
 40 |  *
 41 |  *   [U+0000, U+007F] should be encoded to a sequence length of 1
 42 |  *   [U+0080, U+07FF] should be encoded to a sequence length of 2
 43 |  *   [U+0800, U+D7FF] should be encoded to a sequence length of 3
 44 |  *   [U+E000, U+FFFF] should be encoded to a sequence length of 3
 45 |  *   [U+10000, U+10FFFF] should be encoded to a sequence length of 4
 46 |  *
 47 |  *  To encode a Unicode scalar value to non-shortest form representation:
 48 |  *
 49 |  *   [U+0000, U+007F] can be encoded to a sequence length of [2, 6]
 50 |  *   [U+0080, U+07FF] can be encoded to a sequence length of [3, 6]
 51 |  *   [U+0800, U+FFFF] can be encoded to a sequence length of [4, 6]
 52 |  *
 53 |  *  To encode an ordinal outside of Unicode codespace:
 54 |  *
 55 |  *   [110000, 1FFFFF] can be encoded to a sequence length of 4
 56 |  *   [200000, 3FFFFFF] can be encoded to a sequence length of 5
 57 |  *   [4000000, 7FFFFFFF] can be encoded to a sequence length of 6
 58 |  */
 59 | 
 60 | char *
 61 | encode_ord(uint32_t ord, size_t len, char *dst) {
 62 |   static const uint32_t kMask[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 63 |   static const uint32_t kMax[6]  = { 1 <<  7, 1 << 11, 1 << 16, 
 64 |                                      1 << 21, 1 << 26, 1 << 31 };
 65 |   size_t i;
 66 | 
 67 |   assert(len >= 1);
 68 |   assert(len <= 6);
 69 |   assert(ord < kMax[len - 1]);
 70 | 
 71 |   for (i = len - 1; i > 0; i--) {
 72 |     dst[i] = (ord & 0x3F) | 0x80;
 73 |     ord >>= 6;
 74 |   }
 75 |   dst[0] = ord | kMask[len - 1];
 76 |   return dst;
 77 | }
 78 | 
 79 | char *
 80 | escape_str(const char *src, size_t len, char *dst) {
 81 |   static const char * const kHex = "0123456789ABCDEF";
 82 |   size_t i;
 83 |   char *d;
 84 | 
 85 |   for (d = dst, i = 0; i < len; i++) {
 86 |     const unsigned char c = src[i];
 87 |     if (c >= ' ' && c <= '~') {
 88 |       if (c == '\\' || c == '"')
 89 |         *d++ = '\\';
 90 |       *d++ = c;
 91 |     }
 92 |     else {
 93 |       *d++ = '\\';
 94 |       *d++ = 'x';
 95 |       *d++ = kHex[c >> 4];
 96 |       *d++ = kHex[c & 0x0F];
 97 |     }
 98 |   }
 99 |   *d = 0;
100 |   return dst;
101 | }
102 | 
/*
 * Running totals for the whole test program: TestCount is the number of
 * individual checks performed so far, TestFailed the number of those whose
 * result did not match the expectation. main() reports both at exit.
 */
static size_t TestCount  = 0;
static size_t TestFailed = 0;
105 | 
106 | void
107 | test_utf8(const char *src, size_t len, size_t exp_spl, bool exp_ret, unsigned line) {
108 |   char escaped[255 * 4 + 1];
109 |   size_t offset, got_spl;
110 |   bool got_ret;
111 | 
112 |   assert(len <= 255);
113 | 
114 |   got_ret = utf8_check(src, len, &offset);
115 | 
116 |   TestCount++;
117 | 
118 |   if (got_ret != exp_ret) {
119 |     escape_str(src, len, escaped);
120 | 
121 |     printf("utf8_valid(\"%s\", %d) != %s at line %u\n",
122 |       escaped, (unsigned)len, exp_ret ? "true" : "false", line);
123 |     
124 |     TestFailed++;
125 |   }
126 | 
127 |   src += offset;
128 |   len -= offset;
129 | 
130 |   TestCount++;
131 | 
132 |   got_spl = utf8_maximal_subpart(src, len);
133 | 
134 |   if (got_spl != exp_spl) {
135 |     escape_str(src, len, escaped);
136 | 
137 |     printf("utf8_maximal_subpart(\"%s\", %d) != %d (got: %d) at line %u\n",
138 |       escaped, (unsigned)len, (unsigned)exp_spl, (unsigned)got_spl, line);
139 | 
140 |     TestFailed++;
141 |   }
142 | }
143 | 
/*
 * Calls test_utf8() and passes the source line of the invocation, so that a
 * failure message points at the test case that produced it.
 */
#define TEST_UTF8(src, len, subpart, exp) \
  test_utf8(src, len, subpart, exp, __LINE__)
146 | 
147 | 
/*
 * Verifies that the shortest-form encoding of every Unicode scalar value is
 * accepted, and that truncated prefixes of multi-byte encodings are rejected
 * with a maximal subpart equal to the length of the truncated prefix.
 */
void
test_unicode_scalar_value() {
  uint32_t ord;
  char src[4];

  /* Unicode scalar value [U+0000, U+007F] */
  for (ord = 0x0000; ord <= 0x007F; ord++) {
    encode_ord(ord, 1, src);
    TEST_UTF8(src, 1, 0, true);
  }

  /* Unicode scalar value [U+0080, U+07FF] */
  for (ord = 0x0080; ord <= 0x07FF; ord++) {
    encode_ord(ord, 2, src);
    TEST_UTF8(src, 2, 0, true);
  }

  /*
   * Unicode scalar value [U+0800, U+D7FF] and [U+E000, U+FFFF]
   * The maximal subpart is the length of the truncated sequence
   *
   * Surrogates are skipped rather than used as a loop exit condition, so the
   * range above them is exercised as well.
   */
  for (ord = 0x0800; ord <= 0xFFFF; ord++) {
    if ((ord & 0xF800) == 0xD800)
      continue;

    encode_ord(ord, 3, src);

    TEST_UTF8(src, 3, 0, true);
    /* Each distinct two-byte prefix is tested once, not 64 times */
    if ((ord % (1 << 6)) == 0)
      TEST_UTF8(src, 2, 2, false);
  }

  /*
   * Unicode scalar value [U+10000, U+10FFFF]
   * The maximal subpart is the length of the truncated sequence
   */
  for (ord = 0x10000; ord <= 0x10FFFF; ord++) {
    encode_ord(ord, 4, src);

    TEST_UTF8(src, 4, 0, true);
    if ((ord % (1 << 6)) == 0)
      TEST_UTF8(src, 3, 3, false);
    if ((ord % (1 << 12)) == 0)
      TEST_UTF8(src, 2, 2, false);
  }
}
194 | 
/*
 * Verifies that overlong (non-shortest form) encodings are rejected. In every
 * case, including truncated prefixes, the expected maximal subpart is a
 * single byte.
 */
void
test_non_shortest_form() {
  char seq[4];
  uint32_t cp;

  /* [U+0000, U+007F] padded out to two bytes */
  cp = 0x0000;
  while (cp <= 0x007F) {
    encode_ord(cp, 2, seq);
    TEST_UTF8(seq, 2, 1, false);
    cp++;
  }

  /* [U+0000, U+07FF] padded out to three bytes */
  cp = 0x0000;
  while (cp <= 0x07FF) {
    encode_ord(cp, 3, seq);

    TEST_UTF8(seq, 3, 1, false);
    /* The two-byte prefix only changes every 64 code points */
    if ((cp & 0x3F) == 0)
      TEST_UTF8(seq, 2, 1, false);
    cp++;
  }

  /* [U+0000, U+FFFF] padded out to four bytes */
  cp = 0x0000;
  while (cp <= 0xFFFF) {
    encode_ord(cp, 4, seq);

    TEST_UTF8(seq, 4, 1, false);
    /* Three-byte prefix changes every 64, two-byte prefix every 4096 */
    if ((cp & 0x3F) == 0)
      TEST_UTF8(seq, 3, 1, false);
    if ((cp & 0xFFF) == 0)
      TEST_UTF8(seq, 2, 1, false);
    cp++;
  }
}
235 | 
/*
 * Verifies that four-byte encodings of ordinals above U+10FFFF are rejected,
 * whether complete or truncated, with a maximal subpart of one byte.
 */
void
test_non_unicode() {
  char seq[4];
  uint32_t cp = 0x110000;

  /* Everything a four-byte sequence can express beyond the codespace */
  do {
    encode_ord(cp, 4, seq);

    TEST_UTF8(seq, 4, 1, false);
    /* Three-byte prefix changes every 64, two-byte prefix every 4096 */
    if ((cp & 0x3F) == 0)
      TEST_UTF8(seq, 3, 1, false);
    if ((cp & 0xFFF) == 0)
      TEST_UTF8(seq, 2, 1, false);
  } while (++cp <= 0x1FFFFF);
}
255 | 
/*
 * Verifies that three-byte encodings of the surrogates [U+D800, U+DFFF] are
 * rejected, whether complete or truncated, with a maximal subpart of one byte.
 */
void
test_surrogates() {
  char seq[4];
  uint32_t cp = 0xD800;

  do {
    encode_ord(cp, 3, seq);

    TEST_UTF8(seq, 3, 1, false);
    /* The two-byte prefix only changes every 64 code points */
    if ((cp & 0x3F) == 0)
      TEST_UTF8(seq, 2, 1, false);
  } while (++cp <= 0xDFFF);
}
273 | 
/*
 * Verifies that a lone continuation byte [\x80, \xBF] at the start of the
 * input is rejected with a maximal subpart of one byte.
 */
void
test_continuations() {
  char seq[4];
  unsigned byte;

  /* Only the first byte of seq is set and handed to the test */
  for (byte = 0x80; byte <= 0xBF; byte++) {
    seq[0] = (char)byte;
    TEST_UTF8(seq, 1, 1, false);
  }
}
288 | 
289 | int
290 | main(int argc, char **argv) {
291 | 
292 |   test_unicode_scalar_value();
293 |   test_surrogates();
294 |   test_non_shortest_form();
295 |   test_non_unicode();
296 |   test_continuations();
297 | 
298 |   if (TestFailed)
299 |     printf("Failed %zu tests of %zu.\n", TestFailed, TestCount);
300 |   else
301 |     printf("Passed %zu tests.\n", TestCount);
302 |   
303 |   return TestFailed ? 1 : 0;
304 | }
305 | 
306 | 


--------------------------------------------------------------------------------