├── .gitignore
├── Makefile
├── README.md
├── utf8.c
└── utf8.h


/.gitignore:
--------------------------------------------------------------------------------
1 | /*.o
2 | /*.do
3 | /*.a
4 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | SRCS = utf8.c 
 2 | 
 3 | OBJS = $(SRCS:%.c=%.o)
 4 | DOBJS = $(SRCS:%.c=%.do)
 5 | 
 6 | ifneq ($(MAKECMDGOALS),debug)
 7 | XOBJS = $(OBJS)
 8 | else
 9 | XOBJS = $(DOBJS)
10 | endif
11 | 
12 | FLAGS = -Wall -Wno-strict-aliasing $(CFLAGS)
13 | 
14 | DEBUGFLAGS = -ggdb3 -DDEBUG
15 | SHIPFLAGS = -O3 -DNDEBUG -falign-functions -momit-leaf-frame-pointer
16 | 
17 | DEBUGFLAGS += $(FLAGS)
18 | SHIPFLAGS += $(FLAGS)
19 | 
20 | default: release
21 | 
22 | %.o: %.c
23 | 	$(CC) $(SHIPFLAGS) -c $< -o $@
24 | %.do: %.c
25 | 	$(CC) $(DEBUGFLAGS) -c $< -o $@
26 | 
27 | release debug: libcutef8.a
28 | 
29 | libcutef8.a: $(XOBJS)
30 | 	rm -rf $@
31 | 	ar -rcs $@ $^
32 | 
33 | clean:
34 | 	rm -f *.o
35 | 	rm -f *.do
36 | 	rm -f *.a
37 | 	rm -f *~ *#
38 | 	rm -f core*
39 | 	rm -f libcutef8.a
40 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Simple, fast C library for manipulating strings in the UTF-8 encoding.
2 | 
3 | NOTE: I now use and recommend [utf8proc](https://github.com/JuliaLang/utf8proc) instead of this library.
4 | 


--------------------------------------------------------------------------------
/utf8.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Basic UTF-8 manipulation routines
  3 |   by Jeff Bezanson
  4 |   placed in the public domain Fall 2005
  5 | 
  6 |   This code is designed to provide the utilities you need to manipulate
  7 |   UTF-8 as an internal string encoding. These functions do not perform the
  8 |   error checking normally needed when handling UTF-8 data, so if you happen
  9 |   to be from the Unicode Consortium you will want to flay me alive.
 10 |   I do this because error checking can be performed at the boundaries (I/O),
 11 |   with these routines reserved for higher performance on data known to be
 12 |   valid.
 13 |   A UTF-8 validation routine is included.
 14 | */
 15 | #include <stdlib.h>
 16 | #include <stdio.h>
 17 | #include <string.h>
 18 | #include <stdarg.h>
 19 | #include <stdint.h>
 20 | #include <wchar.h>
 21 | #include <wctype.h>
 22 | 
 23 | #ifdef WIN32
 24 | #include <malloc.h>
 25 | #define snprintf _snprintf
 26 | #else
 27 | #ifndef __FreeBSD__
 28 | #include <alloca.h>
 29 | #endif /* __FreeBSD__ */
 30 | #endif
 31 | #include <assert.h>
 32 | 
 33 | #include "utf8.h"
 34 | 
/* Values subtracted from the raw sum accumulated while decoding a sequence,
   indexed by the number of trailing bytes. Each entry is the sum of the
   lead-byte and continuation-byte tag bits at their shifted positions,
   e.g. 0x3080 == (0xC0 << 6) + 0x80 for a 2-byte sequence. */
static const uint32_t offsetsFromUTF8[6] = {
    0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
};

/* Number of bytes that follow a given first byte, indexed by that byte's
   value: 0 for 0x00-0xBF, 1 for 0xC0-0xDF, 2 for 0xE0-0xEF, 3 for 0xF0-0xF7,
   4 for 0xF8-0xFB and 5 for 0xFC-0xFF. */
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
 50 | 
 51 | /* returns length of next utf-8 sequence */
 52 | size_t u8_seqlen(const char *s)
 53 | {
 54 |     return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1;
 55 | }
 56 | 
 57 | /* returns the # of bytes needed to encode a certain character
 58 |    0 means the character cannot (or should not) be encoded. */
 59 | size_t u8_charlen(uint32_t ch)
 60 | {
 61 |     if (ch < 0x80)
 62 |         return 1;
 63 |     else if (ch < 0x800)
 64 |         return 2;
 65 |     else if (ch < 0x10000)
 66 |         return 3;
 67 |     else if (ch < 0x110000)
 68 |         return 4;
 69 |     return 0;
 70 | }
 71 | 
 72 | size_t u8_codingsize(uint32_t *wcstr, size_t n)
 73 | {
 74 |     size_t i, c=0;
 75 | 
 76 |     for(i=0; i < n; i++)
 77 |         c += u8_charlen(wcstr[i]);
 78 |     return c;
 79 | }
 80 | 
 81 | /* conversions without error checking
 82 |    only works for valid UTF-8, i.e. no 5- or 6-byte sequences
 83 |    srcsz = source size in bytes
 84 |    sz = dest size in # of wide characters
 85 | 
 86 |    returns # characters converted
 87 |    if sz == srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
 88 | */
 89 | size_t u8_toucs(uint32_t *dest, size_t sz, const char *src, size_t srcsz)
 90 | {
 91 |     uint32_t ch;
 92 |     const char *src_end = src + srcsz;
 93 |     size_t nb;
 94 |     size_t i=0;
 95 | 
 96 |     if (sz == 0 || srcsz == 0)
 97 |         return 0;
 98 | 
 99 |     while (i < sz) {
100 |         if (!isutf(*src)) {     // invalid sequence
101 |             dest[i++] = 0xFFFD;
102 |             src++;
103 |             if (src >= src_end) break;
104 |             continue;
105 |         }
106 |         nb = trailingBytesForUTF8[(unsigned char)*src];
107 |         if (src + nb >= src_end)
108 |             break;
109 |         ch = 0;
110 |         switch (nb) {
111 |             /* these fall through deliberately */
112 |         case 5: ch += (unsigned char)*src++; ch <<= 6;
113 |         case 4: ch += (unsigned char)*src++; ch <<= 6;
114 |         case 3: ch += (unsigned char)*src++; ch <<= 6;
115 |         case 2: ch += (unsigned char)*src++; ch <<= 6;
116 |         case 1: ch += (unsigned char)*src++; ch <<= 6;
117 |         case 0: ch += (unsigned char)*src++;
118 |         }
119 |         ch -= offsetsFromUTF8[nb];
120 |         dest[i++] = ch;
121 |     }
122 |     return i;
123 | }
124 | 
/* Encode an array of code points as UTF-8.

   dest  = output buffer
   sz    = size of dest in bytes
   src   = input code points
   srcsz = number of input code points

   Returns the number of bytes stored in dest. No NUL terminator is added.
   Conversion stops at the first code point that does not fit completely in
   the remaining space. Code points of 0x110000 and above are skipped.
   The output never needs more than 4 bytes per input code point.
*/
size_t u8_toutf8(char *dest, size_t sz, const uint32_t *src, size_t srcsz)
{
    size_t out = 0;
    size_t i;

    for (i = 0; i < srcsz; i++) {
        uint32_t ch = src[i];
        /* Work with a byte count instead of "dest_end - k": the latter forms
           a pointer before the start of the buffer when sz is small. */
        size_t room = sz - out;

        if (ch < 0x80) {
            if (room < 1)
                break;
            dest[out++] = (char)ch;
        }
        else if (ch < 0x800) {
            if (room < 2)
                break;
            dest[out++] = (char)((ch>>6) | 0xC0);
            dest[out++] = (char)((ch & 0x3F) | 0x80);
        }
        else if (ch < 0x10000) {
            if (room < 3)
                break;
            dest[out++] = (char)((ch>>12) | 0xE0);
            dest[out++] = (char)(((ch>>6) & 0x3F) | 0x80);
            dest[out++] = (char)((ch & 0x3F) | 0x80);
        }
        else if (ch < 0x110000) {
            if (room < 4)
                break;
            dest[out++] = (char)((ch>>18) | 0xF0);
            dest[out++] = (char)(((ch>>12) & 0x3F) | 0x80);
            dest[out++] = (char)(((ch>>6) & 0x3F) | 0x80);
            dest[out++] = (char)((ch & 0x3F) | 0x80);
        }
        /* else: not encodable, skipped */
    }
    return out;
}
170 | 
/* Encode the single code point ch as UTF-8 into dest, which must have room
   for up to 4 bytes. Returns the number of bytes written, or 0 (writing
   nothing) when ch is 0x110000 or above. No NUL terminator is added. */
size_t u8_wc_toutf8(char *dest, uint32_t ch)
{
    char *p = dest;

    if (ch < 0x80) {
        *p++ = (char)ch;
    }
    else if (ch < 0x800) {
        *p++ = (ch>>6) | 0xC0;
        *p++ = (ch & 0x3F) | 0x80;
    }
    else if (ch < 0x10000) {
        *p++ = (ch>>12) | 0xE0;
        *p++ = ((ch>>6) & 0x3F) | 0x80;
        *p++ = (ch & 0x3F) | 0x80;
    }
    else if (ch < 0x110000) {
        *p++ = (ch>>18) | 0xF0;
        *p++ = ((ch>>12) & 0x3F) | 0x80;
        *p++ = ((ch>>6) & 0x3F) | 0x80;
        *p++ = (ch & 0x3F) | 0x80;
    }
    return (size_t)(p - dest);
}
197 | 
/* Convert a character index into a byte offset within s.
   Performs no validation and does not stop at a NUL byte. */
size_t u8_offset(const char *s, size_t charnum)
{
    size_t pos = 0;
    size_t left;

    for (left = charnum; left > 0; left--) {
        char lead = s[pos];

        pos++;
        if (lead & 0x80) {
            /* Multi-byte character: at least one more byte belongs to it.
               Step forward until a sequence start is seen, consuming at most
               four bytes in total. */
            pos++;
            if (!isutf(s[pos])) {
                pos++;
                if (!isutf(s[pos]))
                    pos++;
            }
        }
    }
    return pos;
}
211 | 
/* Convert a byte offset within s into a character index, i.e. count the
   characters that start before that offset. Performs no validation. */
size_t u8_charnum(const char *s, size_t offset)
{
    size_t pos = 0;
    size_t nchars = 0;

    for (; pos < offset; nchars++) {
        char lead = s[pos];

        pos++;
        if (lead & 0x80) {
            /* Skip the rest of a multi-byte character: at most four bytes
               in total, stopping at the next sequence start. */
            pos++;
            if (!isutf(s[pos])) {
                pos++;
                if (!isutf(s[pos]))
                    pos++;
            }
        }
    }
    return nchars;
}
225 | 
/* Number of characters in a NUL-terminated UTF-8 string.

   Every byte that is not a continuation byte starts a character, so the
   characters are counted by counting those bytes. This does not depend on
   whether plain char is signed, and it never reads beyond the terminating
   NUL even if the last sequence is cut short. */
size_t u8_strlen(const char *s)
{
    size_t count = 0;

    for (; *s != '\0'; s++) {
        if (isutf(*s))
            count++;
    }
    return count;
}
243 | 
244 | int wcwidth(wchar_t c);
245 | 
246 | size_t u8_strwidth(const char *s)
247 | {
248 |     uint32_t ch;
249 |     size_t nb, tot=0;
250 |     int w;
251 |     signed char sc;
252 | 
253 |     while ((sc = (signed char)*s) != 0) {
254 |         if (sc >= 0) {
255 |             s++;
256 |             if (sc) tot++;
257 |         }
258 |         else {
259 |             if (!isutf(sc)) { tot++; s++; continue; }
260 |             nb = trailingBytesForUTF8[(unsigned char)sc];
261 |             ch = 0;
262 |             switch (nb) {
263 |                 /* these fall through deliberately */
264 |             case 5: ch += (unsigned char)*s++; ch <<= 6;
265 |             case 4: ch += (unsigned char)*s++; ch <<= 6;
266 |             case 3: ch += (unsigned char)*s++; ch <<= 6;
267 |             case 2: ch += (unsigned char)*s++; ch <<= 6;
268 |             case 1: ch += (unsigned char)*s++; ch <<= 6;
269 |             case 0: ch += (unsigned char)*s++;
270 |             }
271 |             ch -= offsetsFromUTF8[nb];
272 |             w = wcwidth(ch);  // might return -1
273 |             if (w > 0) tot += w;
274 |         }
275 |     }
276 |     return tot;
277 | }
278 | 
279 | /* reads the next utf-8 sequence out of a string, updating an index */
280 | uint32_t u8_nextchar(const char *s, size_t *i)
281 | {
282 |     uint32_t ch = 0;
283 |     size_t sz = 0;
284 | 
285 |     do {
286 |         ch <<= 6;
287 |         ch += (unsigned char)s[(*i)];
288 |         sz++;
289 |     } while (s[*i] && (++(*i)) && !isutf(s[*i]));
290 |     ch -= offsetsFromUTF8[sz-1];
291 | 
292 |     return ch;
293 | }
294 | 
295 | /* next character without NUL character terminator */
296 | uint32_t u8_nextmemchar(const char *s, size_t *i)
297 | {
298 |     uint32_t ch = 0;
299 |     size_t sz = 0;
300 | 
301 |     do {
302 |         ch <<= 6;
303 |         ch += (unsigned char)s[(*i)++];
304 |         sz++;
305 |     } while (!isutf(s[*i]));
306 |     ch -= offsetsFromUTF8[sz-1];
307 | 
308 |     return ch;
309 | }
310 | 
/* Advance *i to the start of the next character in s: step forward until a
   sequence-start byte is found, moving at most four bytes. */
void u8_inc(const char *s, size_t *i)
{
    int tries;

    for (tries = 0; tries < 3; tries++) {
        ++(*i);
        if (isutf(s[*i]))
            return;
    }
    ++(*i);
}
315 | 
/* Move *i back to the start of the previous character in s: step backward
   until a sequence-start byte is found, moving at most four bytes. */
void u8_dec(const char *s, size_t *i)
{
    int tries;

    for (tries = 0; tries < 3; tries++) {
        --(*i);
        if (isutf(s[*i]))
            return;
    }
    --(*i);
}
320 | 
/* Returns 1 if c is one of '0'..'7', otherwise 0. */
int octal_digit(char c)
{
    if (c < '0' || c > '7')
        return 0;
    return 1;
}
325 | 
/* Returns 1 if c is a hexadecimal digit (0-9, A-F, a-f), otherwise 0. */
int hex_digit(char c)
{
    if (c >= '0' && c <= '9')
        return 1;
    if (c >= 'A' && c <= 'F')
        return 1;
    if (c >= 'a' && c <= 'f')
        return 1;
    return 0;
}
332 | 
/* Map the letter of a single-character escape (the 'n' of "\n") to the
   control character it denotes. Any other character is returned unchanged. */
char read_escape_control_char(char c)
{
    switch (c) {
    case 'n': return '\n';
    case 't': return '\t';
    case 'r': return '\r';
    case 'e': return 033; /* '\e' */
    case 'b': return '\b';
    case 'f': return '\f';
    case 'v': return '\v';
    case 'a': return '\a';
    default:  return c;
    }
}
353 | 
/* Read one escape sequence.

   str  = points at the character after the backslash
   ssz  = number of bytes available at str; must be > 0
   dest = receives the decoded value

   Handles up to 3 octal digits, \x (up to 2 hex digits), \u (up to 4),
   \U (up to 8), and the single-letter escapes understood by
   read_escape_control_char(); any other character stands for itself.

   Returns the number of input characters processed, or 0 if \x, \u or \U
   is not followed by at least one hex digit (*dest is then left untouched). */
size_t u8_read_escape_sequence(const char *str, size_t ssz, uint32_t *dest)
{
    uint32_t ch;
    char digs[10];
    int dno = 0, ndig = 0;
    size_t i = 1;
    char c0;

    assert(ssz > 0);
    c0 = str[0];

    if (octal_digit(c0)) {
        i = 0;
        do {
            digs[dno++] = str[i++];
        } while (i<ssz && octal_digit(str[i]) && dno<3);
        digs[dno] = '\0';
        ch = (uint32_t)strtoul(digs, NULL, 8);
    }
    else {
        /* maximum number of hex digits for each numeric escape */
        switch (c0) {
        case 'x': ndig = 2; break;
        case 'u': ndig = 4; break;
        case 'U': ndig = 8; break;
        default:  break;
        }
        if (ndig > 0) {
            while (i<ssz && hex_digit(str[i]) && dno<ndig) {
                digs[dno++] = str[i++];
            }
            if (dno == 0) return 0;
            digs[dno] = '\0';
            /* strtoul, not strtol: eight hex digits can exceed LONG_MAX
               where long is 32 bits, and strtol would then saturate. */
            ch = (uint32_t)strtoul(digs, NULL, 16);
        }
        else {
            ch = (uint32_t)read_escape_control_char(c0);
        }
    }
    *dest = ch;

    return i;
}
390 | 
/* Convert a string containing escape sequences (\n, \x41, \uxxxx,
   \Uxxxxxxxx, octal, ...) to UTF-8.
   example: u8_unescape(mybuf, 256, "hello\\u220e")
   note the double backslash is needed if called on a C string literal

   buf = output buffer, sz = its size in bytes, src = NUL-terminated input.

   Returns the number of bytes written. The output is NUL-terminated only if
   there is room left for the terminator. Conversion stops before the first
   character that does not fit completely.

   Text outside escape sequences, including UTF-8 multi-byte characters, is
   copied through unchanged. A backslash at the very end of src is copied
   literally. A \x, \u or \U without hex digits yields the letter itself. */
size_t u8_unescape(char *buf, size_t sz, const char *src)
{
    /* Any bound larger than the longest escape works here: the reader stops
       at the first non-digit, which includes the terminating NUL. */
    enum { ESCAPE_READ_LIMIT = 1000 };
    size_t c=0, amt;
    uint32_t ch = 0;
    char temp[4];

    while (*src && c < sz) {
        if (*src == '\\' && src[1] != '\0') {
            src++;
            if ((unsigned char)*src & 0x80) {
                /* Backslash before a non-ASCII byte: drop the backslash and
                   let the next iteration copy the sequence verbatim. */
                continue;
            }
            amt = u8_read_escape_sequence(src, ESCAPE_READ_LIMIT, &ch);
            if (amt == 0) {
                /* malformed numeric escape: keep the letter, like any other
                   unrecognized escape, instead of reusing a stale value */
                ch = (unsigned char)*src;
                amt = 1;
            }
            src += amt;
            amt = u8_wc_toutf8(temp, ch);
            if (amt > sz-c)
                break;
            memcpy(&buf[c], temp, amt);
            c += amt;
        }
        else {
            /* Literal text (or a lone trailing backslash): copy one whole
               sequence, i.e. this byte plus any continuation bytes. The scan
               stops at the NUL terminator because NUL is not a continuation
               byte. */
            amt = 1;
            while (!isutf(src[amt]))
                amt++;
            if (amt > sz-c)
                break;
            memcpy(&buf[c], src, amt);
            c += amt;
            src += amt;
        }
    }
    if (c < sz)
        buf[c] = '\0';
    return c;
}
420 | 
/* Store the first two characters of src in buf, followed by a NUL.
   buf must hold at least 3 bytes. Always returns 2. */
static int buf_put2c(char *buf, const char *src)
{
    memcpy(buf, src, 2);
    buf[2] = '\0';
    return 2;
}
428 | 
/* Write an ASCII representation of the code point ch into buf.

   buf = output buffer, sz = its size in bytes (must be > 2)

   Control characters with a letter escape and the backslash become two
   characters ("\n", "\\", ...); other control characters and 0x7f become
   \xNN; code points above 0xFFFF become \UXXXXXXXX; code points from 0x80 to
   0xFFFF become \uXXXX; everything else is copied as is.

   The result is NUL-terminated. Returns the number of characters stored in
   buf, not counting the NUL. If sz is too small for the escape, the output
   is truncated and the truncated length is returned. */
int u8_escape_wchar(char *buf, size_t sz, uint32_t ch)
{
    int n;

    assert(sz > 2);
    switch (ch) {
    case '\n': return buf_put2c(buf, "\\n");
    case '\t': return buf_put2c(buf, "\\t");
    case '\r': return buf_put2c(buf, "\\r");
    case 033:  return buf_put2c(buf, "\\e"); /* '\e' */
    case '\b': return buf_put2c(buf, "\\b");
    case '\f': return buf_put2c(buf, "\\f");
    case '\v': return buf_put2c(buf, "\\v");
    case '\a': return buf_put2c(buf, "\\a");
    case '\\': return buf_put2c(buf, "\\\\");
    default:   break;
    }

    if (ch < 32 || ch == 0x7f)
        n = snprintf(buf, sz, "\\x%.2hhx", (unsigned char)ch);
    else if (ch > 0xFFFF)
        n = snprintf(buf, sz, "\\U%.8x", (unsigned int)ch);
    else if (ch >= 0x80)
        n = snprintf(buf, sz, "\\u%.4hx", (unsigned short)ch);
    else {
        buf[0] = (char)ch;
        buf[1] = '\0';
        return 1;
    }

    /* snprintf reports the length it wanted to write, and some variants
       report a negative value on truncation. Callers advance their output
       pointer by the result, so report only what is really in buf. */
    if (n < 0 || (size_t)n >= sz) {
        buf[sz-1] = '\0';
        n = (int)strlen(buf);
    }
    return n;
}
461 | 
/* Convert UTF-8 text in src to a form with escape sequences.

   buf, sz       = output buffer and its size in bytes; sz must exceed 11
   src           = input bytes
   pi            = in: index in src to start at; out: first unprocessed index
   end           = one more than the last allowable input index
   escape_quotes = if nonzero, '"' is written as \"
   ascii         = if nonzero, every character goes through
                   u8_escape_wchar(); otherwise only characters for which
                   iswprint() is false do

   Returns the number of bytes placed in buf, including the NUL terminator. */
size_t u8_escape(char *buf, size_t sz, const char *src, size_t *pi, size_t end,
                 int escape_quotes, int ascii)
{
    char *const out0 = buf;
    /* stop 11 bytes short of the end: room for the longest escape sequence */
    char *const limit = out0 + sz-11;
    size_t pos = *pi;

    assert(sz > 11);

    while (pos < end && buf < limit) {
        size_t char_start;
        uint32_t ch;

        if (escape_quotes && src[pos] == '"') {
            buf += buf_put2c(buf, "\\\"");
            pos++;
            continue;
        }
        if (src[pos] == '\\') {
            buf += buf_put2c(buf, "\\\\");
            pos++;
            continue;
        }

        char_start = pos;
        ch = u8_nextmemchar(src, &pos);
        if (!ascii && iswprint((wint_t)ch)) {
            /* printable: copy the bytes of the character unchanged */
            pos = char_start;
            do {
                *buf++ = src[pos++];
            } while (!isutf(src[pos]));
        }
        else {
            buf += u8_escape_wchar(buf, sz - (buf-out0), ch);
        }
    }
    *buf++ = '\0';
    *pi = pos;
    return (buf-out0);
}
499 | 
/* Find the first occurrence of code point ch in the NUL-terminated string s.
   Returns a pointer to the first byte of the match, or NULL if there is none.
   *charn receives the character index of the match, or the number of
   characters examined when nothing was found. */
char *u8_strchr(const char *s, uint32_t ch, size_t *charn)
{
    size_t next = 0;
    size_t index = 0;

    while (s[next] != '\0') {
        size_t start = next;

        if (u8_nextchar(s, &next) == ch) {
            *charn = index;
            /* it's const for us, but not necessarily the caller */
            return (char*)(s + start);
        }
        index++;
    }
    *charn = index;
    return NULL;
}
517 | 
518 | char *u8_memchr(const char *s, uint32_t ch, size_t sz, size_t *charn)
519 | {
520 |     size_t i = 0, lasti=0;
521 |     uint32_t c;
522 |     int csz;
523 | 
524 |     *charn = 0;
525 |     while (i < sz) {
526 |         c = csz = 0;
527 |         do {
528 |             c <<= 6;
529 |             c += (unsigned char)s[i++];
530 |             csz++;
531 |         } while (i < sz && !isutf(s[i]));
532 |         c -= offsetsFromUTF8[csz-1];
533 | 
534 |         if (c == ch) {
535 |             return (char*)&s[lasti];
536 |         }
537 |         lasti = i;
538 |         (*charn)++;
539 |     }
540 |     return NULL;
541 | }
542 | 
543 | char *u8_memrchr(const char *s, uint32_t ch, size_t sz)
544 | {
545 |     size_t i = sz-1, tempi=0;
546 |     uint32_t c;
547 | 
548 |     if (sz == 0) return NULL;
549 | 
550 |     while (i && !isutf(s[i])) i--;
551 | 
552 |     while (1) {
553 |         tempi = i;
554 |         c = u8_nextmemchar(s, &tempi);
555 |         if (c == ch) {
556 |             return (char*)&s[i];
557 |         }
558 |         if (i == 0)
559 |             break;
560 |         tempi = i;
561 |         u8_dec(s, &i);
562 |         if (i > tempi)
563 |             break;
564 |     }
565 |     return NULL;
566 | }
567 | 
/* Returns 1 if the locale name specifies the UTF-8 encoding, else 0.
   The encoding is the text after the first '.', up to a '@', '+', ',' or the
   end of the string; it must be exactly "UTF-8" or "utf8". A '.' that appears
   after one of those modifier characters is not considered.
   (this code based on libutf8) */
int u8_is_locale_utf8(const char *locale)
{
    const char *encoding;
    size_t prefix, enclen;

    if (locale == NULL)
        return 0;

    /* locate the first '.' that precedes any modifier character */
    prefix = strcspn(locale, ".@+,");
    if (locale[prefix] != '.')
        return 0;

    encoding = locale + prefix + 1;
    enclen = strcspn(encoding, "@+,");

    if (enclen == 5 && strncmp(encoding, "UTF-8", 5) == 0)
        return 1; /* it's UTF-8 */
    if (enclen == 4 && strncmp(encoding, "utf8", 4) == 0)
        return 1; /* it's UTF-8 */
    return 0;
}
588 | 
/* Format like vprintf(), treat the result as UTF-8, convert it to code
   points and print those through printf("%ls").

   Returns the number of characters converted, or 0 if formatting or memory
   allocation fails.

   NOTE(review): the converted buffer is passed to printf as wchar_t*, which
   assumes wchar_t has the same size as uint32_t - confirm for each target. */
size_t u8_vprintf(const char *fmt, va_list ap)
{
    char stackbuf[512];
    char *buf = stackbuf;
    uint32_t *wcs;
    size_t nc;
    int cnt;
    va_list ap2;

    /* A va_list may not be used again after a v*printf call has consumed
       it, so keep a copy for the second formatting pass. */
    va_copy(ap2, ap);
    cnt = vsnprintf(stackbuf, sizeof stackbuf, fmt, ap);
    if (cnt < 0) {
        va_end(ap2);
        return 0;
    }
    if ((size_t)cnt >= sizeof stackbuf) {
        /* output did not fit: format again into an exactly sized buffer */
        buf = (char*)malloc((size_t)cnt + 1);
        if (buf == NULL) {
            va_end(ap2);
            return 0;
        }
        vsnprintf(buf, (size_t)cnt + 1, fmt, ap2);
    }
    va_end(ap2);

    /* heap, not stack: cnt is controlled by the caller's format and data */
    wcs = (uint32_t*)malloc(((size_t)cnt + 1) * sizeof *wcs);
    if (wcs == NULL) {
        if (buf != stackbuf) free(buf);
        return 0;
    }
    nc = u8_toucs(wcs, (size_t)cnt+1, buf, (size_t)cnt);
    wcs[nc] = 0;
    printf("%ls", (wchar_t*)wcs);

    free(wcs);
    if (buf != stackbuf) free(buf);
    return nc;
}
612 | 
/* printf-style front end for u8_vprintf(): collects the variable arguments
   and returns whatever u8_vprintf() returns. */
size_t u8_printf(const char *fmt, ...)
{
    va_list ap;
    size_t nchars;

    va_start(ap, fmt);
    nchars = u8_vprintf(fmt, ap);
    va_end(ap);

    return nchars;
}
625 | 
626 | /* based on the valid_utf8 routine from the PCRE library by Philip Hazel
627 | 
628 |    length is in bytes, since without knowing whether the string is valid
629 |    it's hard to know how many characters there are! */
630 | int u8_isvalid(const char *str, size_t length)
631 | {
632 |     const unsigned char *p, *pend = (unsigned char*)str + length;
633 |     unsigned char c;
634 |     int ret = 1; /* ASCII */
635 |     size_t ab;
636 | 
637 |     for (p = (unsigned char*)str; p < pend; p++) {
638 |         c = *p;
639 |         if (c < 128)
640 |             continue;
641 |         ret = 2; /* non-ASCII UTF-8 */
642 |         if ((c & 0xc0) != 0xc0)
643 |             return 0;
644 |         ab = trailingBytesForUTF8[c];
645 |         if (length < ab)
646 |             return 0;
647 |         length -= ab;
648 | 
649 |         p++;
650 |         /* Check top bits in the second byte */
651 |         if ((*p & 0xc0) != 0x80)
652 |             return 0;
653 | 
654 |         /* Check for overlong sequences for each different length */
655 |         switch (ab) {
656 |             /* Check for xx00 000x */
657 |         case 1:
658 |             if ((c & 0x3e) == 0) return 0;
659 |             continue;   /* We know there aren't any more bytes to check */
660 | 
661 |             /* Check for 1110 0000, xx0x xxxx */
662 |         case 2:
663 |             if (c == 0xe0 && (*p & 0x20) == 0) return 0;
664 |             break;
665 | 
666 |             /* Check for 1111 0000, xx00 xxxx */
667 |         case 3:
668 |             if (c == 0xf0 && (*p & 0x30) == 0) return 0;
669 |             break;
670 | 
671 |             /* Check for 1111 1000, xx00 0xxx */
672 |         case 4:
673 |             if (c == 0xf8 && (*p & 0x38) == 0) return 0;
674 |             break;
675 | 
676 |             /* Check for leading 0xfe or 0xff,
677 |                and then for 1111 1100, xx00 00xx */
678 |         case 5:
679 |             if (c == 0xfe || c == 0xff ||
680 |                 (c == 0xfc && (*p & 0x3c) == 0)) return 0;
681 |             break;
682 |         }
683 | 
684 |         /* Check for valid bytes after the 2nd, if any; all must start 10 */
685 |         while (--ab > 0) {
686 |             if ((*(++p) & 0xc0) != 0x80) return 0;
687 |         }
688 |     }
689 | 
690 |     return ret;
691 | }
692 | 
/* Reverse a UTF-8 string character by character.

   dest = output buffer of at least len+1 bytes; must not overlap src
   src  = input bytes
   len  = length of src in bytes

   dest is NUL-terminated at dest[len]. Returns 0 on success and 1 on error:
   a continuation byte where a character should start, or a final sequence
   that extends beyond len. On error the contents of dest are unspecified. */
int u8_reverse(char *dest, char * src, size_t len)
{
    size_t si=0, di=len;

    dest[di] = '\0';
    while (si < len) {
        unsigned char c = (unsigned char)src[si];
        size_t n;

        /* sequence length from the high bits of the lead byte */
        if ((c & 0x80) == 0) {
            n = 1;
        }
        else {
            switch (c>>4) {
            case 0xC:
            case 0xD:
                n = 2;
                break;
            case 0xE:
                n = 3;
                break;
            case 0xF:
                n = 4;
                break;
            default:
                return 1;
            }
        }

        /* di always equals len - si here, so this check also guarantees
           that di cannot wrap below zero. */
        if (n > len - si)
            return 1;

        di -= n;
        /* byte copy: no alignment or aliasing assumptions */
        memcpy(&dest[di], &src[si], n);
        si += n;
    }
    return 0;
}
732 | 


--------------------------------------------------------------------------------
/utf8.h:
--------------------------------------------------------------------------------
  1 | #ifndef UTF8_H
  2 | #define UTF8_H
  3 | 
  4 | extern int locale_is_utf8;
  5 | 
  6 | /* is c the start of a utf8 sequence? */
  7 | #define isutf(c) (((c)&0xC0)!=0x80)
  8 | 
  9 | #define UEOF ((uint32_t)-1)
 10 | 
 11 | /* convert UTF-8 data to wide character */
 12 | size_t u8_toucs(uint32_t *dest, size_t sz, const char *src, size_t srcsz);
 13 | 
 14 | /* the opposite conversion */
 15 | size_t u8_toutf8(char *dest, size_t sz, const uint32_t *src, size_t srcsz);
 16 | 
 17 | /* single character to UTF-8, returns # bytes written */
 18 | size_t u8_wc_toutf8(char *dest, uint32_t ch);
 19 | 
 20 | /* character number to byte offset */
 21 | size_t u8_offset(const char *str, size_t charnum);
 22 | 
 23 | /* byte offset to character number */
 24 | size_t u8_charnum(const char *s, size_t offset);
 25 | 
 26 | /* return next character, updating an index variable */
 27 | uint32_t u8_nextchar(const char *s, size_t *i);
 28 | 
 29 | /* next character without NUL character terminator */
 30 | uint32_t u8_nextmemchar(const char *s, size_t *i);
 31 | 
 32 | /* move to next character */
 33 | void u8_inc(const char *s, size_t *i);
 34 | 
 35 | /* move to previous character */
 36 | void u8_dec(const char *s, size_t *i);
 37 | 
 38 | /* returns length of next utf-8 sequence */
 39 | size_t u8_seqlen(const char *s);
 40 | 
 41 | /* returns the # of bytes needed to encode a certain character */
 42 | size_t u8_charlen(uint32_t ch);
 43 | 
 44 | /* computes the # of bytes needed to encode a WC string as UTF-8 */
 45 | size_t u8_codingsize(uint32_t *wcstr, size_t n);
 46 | 
 47 | char read_escape_control_char(char c);
 48 | 
 49 | /* assuming src points to the character after a backslash, read an
 50 |    escape sequence, storing the result in dest and returning the number of
 51 |    input characters processed */
 52 | size_t u8_read_escape_sequence(const char *src, size_t ssz, uint32_t *dest);
 53 | 
/* given a wide character, convert it to an ASCII escape sequence stored in
   buf, where buf is "sz" bytes. returns the number of characters output.
   sz must be at least 3; the longest escape (\Uxxxxxxxx plus the NUL
   terminator) needs 11 bytes. */
 57 | int u8_escape_wchar(char *buf, size_t sz, uint32_t ch);
 58 | 
 59 | /* convert a string "src" containing escape sequences to UTF-8 */
 60 | size_t u8_unescape(char *buf, size_t sz, const char *src);
 61 | 
 62 | /* convert UTF-8 "src" to escape sequences.
 63 | 
 64 |    sz is buf size in bytes. must be at least 12.
 65 | 
 66 |    if escape_quotes is nonzero, quote characters will be escaped.
 67 | 
 68 |    if ascii is nonzero, the output is 7-bit ASCII, no UTF-8 survives.
 69 | 
 70 |    starts at src[*pi], updates *pi to point to the first unprocessed
 71 |    byte of the input.
 72 | 
 73 |    end is one more than the last allowable value of *pi.
 74 | 
 75 |    returns number of bytes placed in buf, including a NUL terminator.
 76 | */
 77 | size_t u8_escape(char *buf, size_t sz, const char *src, size_t *pi, size_t end,
 78 |                  int escape_quotes, int ascii);
 79 | 
 80 | /* utility predicates used by the above */
 81 | int octal_digit(char c);
 82 | int hex_digit(char c);
 83 | 
 84 | /* return a pointer to the first occurrence of ch in s, or NULL if not
 85 |    found. character index of found character returned in *charn. */
 86 | char *u8_strchr(const char *s, uint32_t ch, size_t *charn);
 87 | 
 88 | /* same as the above, but searches a buffer of a given size instead of
 89 |    a NUL-terminated string. */
 90 | char *u8_memchr(const char *s, uint32_t ch, size_t sz, size_t *charn);
 91 | 
 92 | char *u8_memrchr(const char *s, uint32_t ch, size_t sz);
 93 | 
 94 | /* count the number of characters in a UTF-8 string */
 95 | size_t u8_strlen(const char *s);
 96 | 
 97 | /* number of columns occupied by a string */
 98 | size_t u8_strwidth(const char *s);
 99 | 
100 | int u8_is_locale_utf8(const char *locale);
101 | 
102 | /* printf where the format string and arguments may be in UTF-8.
103 |    you can avoid this function and just use ordinary printf() if the current
104 |    locale is UTF-8. */
105 | size_t u8_vprintf(const char *fmt, va_list ap);
106 | size_t u8_printf(const char *fmt, ...);
107 | 
108 | /* determine whether a sequence of bytes is valid UTF-8. length is in bytes */
109 | int u8_isvalid(const char *str, size_t length);
110 | 
111 | /* reverse a UTF-8 string. len is length in bytes. dest and src must both
112 |    be allocated to at least len+1 bytes. returns 1 for error, 0 otherwise */
113 | int u8_reverse(char *dest, char *src, size_t len);
114 | 
115 | #endif
116 | 


--------------------------------------------------------------------------------