├── is_utf8.h ├── Makefile ├── COPYRIGHT ├── README.md ├── test.sh ├── main.c └── is_utf8.c /is_utf8.h: -------------------------------------------------------------------------------- 1 | #ifndef _IS_UTF8_H 2 | #define _IS_UTF8_H 3 | 4 | #include 5 | 6 | size_t is_utf8(unsigned char *str, size_t len, char **message, int *faulty_bytes); 7 | 8 | #endif /* _IS_UTF8_H */ 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ## 2 | ## Makefile for is_utf8 3 | ## 4 | ## Made by julien palard 5 | ## Login 6 | ## 7 | 8 | NAME = isutf8 9 | SRC = main.c is_utf8.c 10 | OBJ = $(SRC:.c=.o) 11 | 12 | VERSION = 0 13 | MINOR = 0 14 | RELEASE = 0 15 | 16 | LIB_SRC = is_utf8.c 17 | 18 | LINKERNAME = lib$(NAME).so 19 | SONAME = $(LINKERNAME).$(VERSION) 20 | REALNAME = $(SONAME).$(MINOR).$(RELEASE) 21 | 22 | CC = gcc 23 | CFLAGS = -O3 -Wextra -Wall -ansi -Wstrict-prototypes 24 | 25 | $(NAME): IS_UTF8_LIB $(OBJ) 26 | $(CC) $(CFLAGS) -o $(NAME) $(OBJ) 27 | 28 | IS_UTF8_LIB: 29 | $(CC) --shared -fPIC $(CFLAGS) $(LIB_SRC) -o $(LINKERNAME) 30 | 31 | all: 32 | @make $(NAME) 33 | 34 | clean: 35 | rm -f $(NAME) $(LINKERNAME) $(OBJ) $(LIB_OBJ) 36 | 37 | re: clean all 38 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | is_utf8 is distributed under the following terms: 2 | 3 | Copyright (c) 2013 Palard Julien. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions 7 | are met: 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 2. 
Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 | SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | isutf8 is a program and a c library to check if a given file (or stdin) contains only 2 | valid utf-8 sequences. 3 | 4 | # Compiling 5 | 6 | $ make 7 | 8 | # Demo 9 | 10 | $ isutf8 * -v 11 | isutf8: line 1, char 97, byte 96: Expecting bytes in the following ranges: 00..7F C2..F4. 12 | 40 00 40 00 00 00 00 00 C0 01 00 00 00 00 00 00 | @.@............. 13 | ^^ | ^ 14 | 15 | is_utf8.o: line 1, char 66, byte 65: Expecting bytes in the following ranges: 00..7F C2..F4. 16 | 00 40 00 0E 00 0B 00 48 85 F6 48 C7 02 00 00 00 | .@.....H..H..... 17 | ^^ | ^ 18 | 19 | libisutf8.so: line 1, char 153, byte 152: After a first byte of F0, expecting 2nd byte between 90 and BF. 20 | 68 12 20 00 00 00 00 00 F0 01 00 00 00 00 00 00 | h. ............. 
21 | ^^^^^ | ^^ 22 | 23 | main.o: line 1, char 76, byte 75: Expecting bytes in the following ranges: 00..7F C2..F4. 24 | 56 41 55 41 54 55 53 48 83 EC 18 48 8B 5C 24 58 | VAUATUSH...H.\$X 25 | ^^ | ^ 26 | 27 | # Test a file 28 | 29 | `isutf8` returns 0 if the file is correctly encoded: 30 | 31 | $ isutf8 main.c 32 | $ echo $? 33 | 0 34 | 35 | Some files here only contain ASCII or correctly encoded UTF8: 36 | 37 | $ isutf8 README.md 38 | $ isutf8 test.sh 39 | 40 | But an ELF is clearly not UTF8, a verbose error is printed: 41 | 42 | $ isutf8 isutf8 43 | isutf8: line 1, char 97, byte 96: Expecting bytes in the following ranges: 00..7F C2..F4. 44 | 45 | `-v` adds some context: 46 | 47 | $ isutf8 -v isutf8 48 | isutf8: line 1, char 97, byte 96: Expecting bytes in the following ranges: 00..7F C2..F4. 49 | 40 00 40 00 00 00 00 00 C0 01 00 00 00 00 00 00 | @.@............. 50 | ^^ | ^ 51 | 52 | # Test stdin 53 | 54 | `isutf8` reads from stdin if no files are given; also note that `bash` 55 | helps a lot with the `$''` syntax allowing you to write and test hexadecimal: 56 | 57 | $ echo $'\xe9' | isutf8 58 | (standard input): line 1, char 0, byte 0: After a first byte between E1 and EC, expecting two following bytes. 59 | 60 | $ echo "Hellö world" | iconv -f utf8 -t latin1 | isutf8 61 | (standard input): line 1, char 4, byte 4: Expecting bytes in the following ranges: 00..7F C2..F4. 
62 | 63 | # Find UTF8 or non-UTF8 files 64 | 65 | As `isutf8` can take multiple arguments it's easy classify 66 | UTF8-compatible versus non UTF8-compatible files: 67 | 68 | List non-UTF8 compatible files: 69 | 70 | $ isutf8 --list * 71 | isutf8 72 | is_utf8.o 73 | libisutf8.so 74 | main.o 75 | 76 | List UTF8-compatible files: 77 | 78 | $ isutf8 --list --invert * 79 | COPYRIGHT 80 | is_utf8.c 81 | is_utf8.h 82 | main.c 83 | Makefile 84 | README.md 85 | test.sh 86 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | utf8_test_file() 4 | { 5 | printf "." 6 | to_test="$1" 7 | should_return="$2" 8 | ./isutf8 <(printf "%s" "$to_test") 2>/dev/null >/dev/null 9 | error_number=$? 10 | if ! [ z"$error_number" == z"$should_return" ] 11 | then 12 | [ $should_return == 1 ] && 13 | printf "\nThis one should have failed:\n" || 14 | printf "\nThis one should succeed:\n" 15 | printf "%s" "$to_test" | hexdump -C 16 | printf "%s" "$to_test" | ./isutf8 <(printf "%s" "$to_test") 17 | exit 1 18 | fi 19 | } 20 | 21 | utf8_test_pipe() 22 | { 23 | printf "." 24 | to_test="$1" 25 | should_return="$2" 26 | printf "%s" "$to_test" | ./isutf8 2>/dev/null >/dev/null 27 | error_number=$? 28 | if ! 
[ z"$error_number" == z"$should_return" ] 29 | then 30 | [ $should_return == 1 ] && 31 | printf "\nThis one should have failed:\n" || 32 | printf "\nThis one should succeed:\n" 33 | printf "%s" "$to_test" | hexdump -C 34 | printf "%s" "$to_test" | ./isutf8 - 35 | exit 1 36 | fi 37 | } 38 | 39 | utf8_test() 40 | { 41 | utf8_test_pipe "$@" 42 | utf8_test_file "$@" 43 | } 44 | 45 | should_pass() 46 | { 47 | while [ -n "$1" ] 48 | do 49 | utf8_test "$1" 0 50 | shift 51 | done 52 | } 53 | 54 | should_fail() 55 | { 56 | while [ -n "$1" ] 57 | do 58 | utf8_test "$1" 1 59 | shift 60 | done 61 | } 62 | 63 | should_pass "Léa" "Maïté" "Pauline" "Élise" 64 | should_fail $'\xc9lise' $'Elis\xc9' 65 | 66 | # Tests from : 67 | # Table 3-7. Well-Formed UTF-8 Byte Sequences 68 | # ----------------------------------------------------------------------------- 69 | # | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte | 70 | # | U+0000..U+007F | 00..7F | | | | 71 | should_pass $'\01' $'\02' q w e r t y "~" "foo" "bar" "baz" "123" 1 2 3 "," $'\n' $'\x0E' $'\x0F' $'\x7F' 72 | should_fail $'\x80' $'\x82' $'\x83' $'\xC0' $'\xC1' 73 | should_fail $'|\x80' $'|\x82' $'|\x83' $'|\xC0' $'|\xC1' 74 | should_fail $'\x80|' $'\x82|' $'\x83|' $'\xC0|' $'\xC1|' 75 | # | U+0080..U+07FF | C2..DF | 80..BF | | | 76 | should_pass $'\xC2\x80' $'\xC2\xBF' $'\xDF\x80' $'\xDF\xBF' 77 | should_fail $'\xC2\x79' $'\xC2\xC0' $'\xC2\xC3' 78 | should_pass $'---\xC2\x80' $'---\xC2\xBF' $'---\xDF\x80' $'---\xDF\xBF' 79 | should_fail $'\xC2\x79---' $'\xC2\xC0---' $'\xC2\xC3---' 80 | # | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 81 | should_pass $'\xE0\xA0\x80' $'\xE0\xA0\xBF' $'\xE0\xBF\x80' $'\xE0\xBF\xBF' 82 | should_fail $'\xE0\x99\x80' $'\xE0\xC5\xBF' $'\xE0\xBF\x78' $'\xE0\xBF\xEE' 83 | should_pass $'~~~~\xE0\xA0\x80' $'~~~~\xE0\xA0\xBF' $'~~~~\xE0\xBF\x80' $'~~~~\xE0\xBF\xBF' 84 | should_fail $'\xE0\x99\x80~~~~' $'\xE0\xC5\xBF~~~~' $'\xE0\xBF\x78~~~~' $'\xE0\xBF\xEE~~~~' 85 | # | U+1000..U+CFFF | 
E1..EC | 80..BF | 80..BF | | 86 | should_pass $'\xE1\x80\x80' $'\xE1\xBF\x80' $'\xE1\x80\xBF' $'\xE1\xBF\xBF' 87 | should_pass $'\xEC\x80\x80' $'\xEC\xBF\x80' $'\xEC\x80\xBF' $'\xEC\xBF\xBF' 88 | should_fail $'\xE1\x41\x80' $'\xE1\xC0\x80' $'\xE1\xC8\xBF' $'\xE1\xBF\xFE' 89 | # | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 90 | should_pass $'\xED\x80\x80' $'\xED\x80\xBF' $'\xED\x9F\x80' $'\xED\x9F\xBF' 91 | should_fail $'\xED\x80\x50' $'\xED\x80\xC1' $'\xED\xBF\x80' $'\xED\xBF\xBF' 92 | # | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | 93 | should_pass $'\xEE\x80\x80' $'\xEE\x80\xBF' $'\xEE\xBF\x80' $'\xEE\xBF\xBF' 94 | should_fail $'\xEE\x70\x80' $'\xEE\x80\x70' $'\xEE\xCF\x80' $'\xEE\xCF\xCF' 95 | # | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 96 | should_pass $'\xF0\x90\x80\x80' $'\xF0\xBF\x80\x80' $'\xF0\x90\x80\xBF' $'\xF0\xBF\xBF\xBF' 97 | should_fail $'\xF0\x70\x80\x80' $'\xF0\xCF\x70\x80' $'\xF0\xCF\x80\xCE' $'\xF0\xCF\xCF\xDF' 98 | should_fail $'\xF0\x80\x80\x80' 99 | # | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | 100 | should_pass $'\xF1\x80\x80\x80' $'\xF1\xBF\xBF\xBF' 101 | should_pass $'\xF2\x80\x80\x80' $'\xF2\xBF\xBF\xBF' 102 | should_pass $'\xF3\x80\x80\x80' $'\xF3\xBF\xBF\xBF' 103 | # | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 104 | should_pass $'\xF4\x80\x80\x80' $'\xF4\x8F\xBF\xBF' 105 | should_fail $'\xF4\x80\x80\x79' $'\xF4\xBF\xBF\xBF' 106 | # ----------------------------------------------------------------------------- 107 | 108 | should_fail $'\xf0-' $'\xf1-' $'\xf2-' $'\xf3-' $'\xE1-' $'\xEE-' $'\xED-' $'\xEF-' 109 | 110 | printf "\nAll tests are OK.\n" 111 | -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | #define _POSIX_C_SOURCE 200809L 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "is_utf8.h" 13 
#define VERSION "1.2"
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))


/*
** Print `max_length` bytes of `str` as text, one character per byte.
** Bytes outside the printable ASCII range ' '..'~' are shown as '.'.
** Returns the number of characters written.
*/
static int show_str(const char *str, unsigned int max_length)
{
  int printed = 0;

  while (max_length-- > 0)
  {
    printed += printf("%c", (*str >= ' ' && *str <= '~') ? (unsigned char)*str: '.');
    str += 1;
  }
  return printed;
}

/*
** Print `max_length` bytes of `str` as two upper-case hex digits followed
** by a space. Returns the number of characters written.
*/
static int show_hex_str(const char *str, unsigned int max_length)
{
  int printed = 0;

  while (max_length-- > 0)
    printed += printf("%.2X ", (unsigned char)*str++);
  return printed;
}

/*
** Print up to 8 bytes before and up to 8 bytes after the error, first as
** hex, then as text, followed by a line of carets underlining the
** `faulty_bytes` offending bytes in both columns.
**
** `err_pos_in_str` must lie inside `str` (0..str_length). A position
** outside the buffer would make the byte counts negative and the pointer
** arithmetic below read out of bounds, so such calls print nothing.
*/
static void show_context(char *str, int str_length, int err_pos_in_str, int faulty_bytes)
{
  int chars_before_error;
  int chars_after_error;
  int printed = 0;

  if (err_pos_in_str < 0 || err_pos_in_str > str_length)
    return;
  chars_before_error = MIN(err_pos_in_str, 8);
  chars_after_error = MIN(str_length - err_pos_in_str, 8);

  /* Hex column, padded to the width of 16 bytes. */
  printed = show_hex_str(str + err_pos_in_str - chars_before_error, chars_before_error + chars_after_error);
  printf("%*s | ", 3 * 16 - printed, "");
  /* Text column for the same bytes. */
  show_str(str + err_pos_in_str - chars_before_error, chars_before_error + chars_after_error);
  printf("\n");
  /* Caret line: each hex byte is 3 columns wide, each text byte 1. */
  printed = printf("%*s", (3 * chars_before_error), "");
  printed += printf("%.*s", faulty_bytes * 2 + faulty_bytes - 1, "^^^^^^^^^^^^^^^^");
  printf("%*s | ", 3 * 16 - printed, "");
  printf("%*s", (chars_before_error), "");
  printf("%.*s", faulty_bytes, "^^^^");
  printf("\n\n");
}

/*
** Report the outcome of one file on stdout.
**
** `message` is the diagnostic from is_utf8(), or NULL when the input is
** valid. Without `invert`, an invalid input is reported: only its name
** with `list_only`, otherwise "file: line, char, byte: message", followed
** by the hex context when `verbose` is set. With `invert`, only valid
** inputs are reported, by name. `quiet` suppresses everything.
*/
static void print_utf8_error(
  const char* file_path,
  int error_line, int error_column, int byte_no,
  char *str, int str_length, int err_pos_in_str,
  const char *message, int faulty_bytes,
  int quiet, int verbose,
  int list_only, int invert)
{
  if (quiet)
    return;
  if (message && !invert)
  {
    if (list_only)
      printf("%s\n", file_path);
    else
      printf("%s: line %d, char %d, byte %d: %s\n",
             file_path, error_line, error_column, byte_no,
             message);
    if (verbose && !list_only)
    {
      show_context(str, str_length, err_pos_in_str, faulty_bytes);
    }
  }
  if (!message && invert)
  {
    printf("%s\n", file_path);
  }
}

/* Record failure in the local `retval`, report errno and jump to `target`. */
#define handle_error(msg, target)                                   \
  do {retval = EXIT_FAILURE; perror(msg); goto target;} while (0)

/*
** Validate `stream` line by line; used for stdin and for anything that
** cannot be mmap'ed. The stream is not closed here: it belongs to the
** caller.
**
** Stops at the first invalid sequence and reports it. The reported column
** is the 0-based offset inside the line, and the byte number is the
** 0-based offset from the start of the stream.
** Returns EXIT_SUCCESS if every line is valid, EXIT_FAILURE on invalid
** UTF-8 or on a read error.
*/
static int is_utf8_readline(FILE *stream, const char *file_path,
                            int quiet, int verbose, int list_only, int invert)
{
  char *string = NULL;
  size_t size = 0;
  ssize_t str_length;
  char *message = NULL;
  int lineno = 1;
  int pos = 0;
  int offset = 0;
  int faulty_bytes = 0;

  while ((str_length = getline(&string, &size, stream)) != -1)
  {
    pos = is_utf8((unsigned char*)string, str_length, &message, &faulty_bytes);
    if (message != NULL)
    {
      offset += pos;
      print_utf8_error(file_path, lineno, pos, offset,
                       string, str_length, pos, message, faulty_bytes,
                       quiet, verbose, list_only, invert);
      break;
    }
    offset += str_length;
    lineno += 1;
  }
  /* getline() returns -1 for both EOF and errors: tell them apart so a
     read failure (e.g. a directory) is not reported as valid UTF-8. */
  if (message == NULL && ferror(stream))
  {
    perror(file_path);
    free(string);
    return EXIT_FAILURE;
  }
  free(string);
  /* Valid input: still go through the reporter so --invert lists it,
     exactly as the mmap path does. */
  if (message == NULL)
    print_utf8_error(file_path, 0, 0, 0,
                     NULL, 0, 0, NULL, 0,
                     quiet, verbose, list_only, invert);
  return message == NULL ? EXIT_SUCCESS : EXIT_FAILURE;
}

/*
** Convert the byte offset `up_to` inside `string` (of `length` bytes) to
** a 1-based line number and a 1-based column, counting '\n' as the line
** separator.
*/
static void count_lines(const char *string, int length, int up_to, int *line, int *column)
{
  int pos = 0;
  int line_start_at = 0;

  *line = 1;
  while (pos < length && pos < up_to)
  {
    if (string[pos] == '\n')
    {
      line_start_at = pos + 1;
      *line += 1;
    }
    pos += 1;
  }
  *column = 1 + up_to - line_start_at;
}

/*
** Validate the file at `file_path`, mapping it in memory when possible
** and falling back to line-by-line reading otherwise.
** Returns EXIT_SUCCESS if the file is valid UTF-8, EXIT_FAILURE if it is
** not or if it cannot be opened or read.
*/
static int is_utf8_mmap(const char *file_path, int quiet, int verbose,
                        int list_only, int invert)
{
  char *addr;
  struct stat sb;
  int fd;
  int pos = 0;
  char *message;
  int retval = EXIT_SUCCESS;
  int error_column = 1;
  int error_line = 0;
  int faulty_bytes = 0;
  FILE *stream;

  fd = open(file_path, O_RDONLY);
  if (fd == -1)
    handle_error("open", err_open);
  if (fstat(fd, &sb) == -1)           /* To obtain file size */
    handle_error("fstat", err_fstat);
  addr = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
  if (addr == MAP_FAILED)
  {
    /* Can't mmap, maybe a pipe or an empty file: read it as a stream.
       Reuse the descriptor instead of reopening the path, so nothing
       already queued in a FIFO is lost. */
    stream = fdopen(fd, "r");
    if (stream == NULL)
      handle_error("fdopen", err_fstat);
    retval = is_utf8_readline(stream, file_path,
                              quiet, verbose, list_only, invert);
    fclose(stream);                   /* Also closes fd. */
    return retval;
  }
  pos = is_utf8((unsigned char*)addr, sb.st_size, &message, &faulty_bytes);
  if (message != NULL)
    count_lines(addr, sb.st_size, pos, &error_line, &error_column);
  print_utf8_error(file_path, error_line, error_column, pos,
                   addr, sb.st_size, pos, message, faulty_bytes,
                   quiet, verbose, list_only, invert);
  if (message != NULL)
    retval = EXIT_FAILURE;
  munmap(addr, sb.st_size);
err_fstat:
  close(fd);
err_open:
  return retval;
}

/* Print the command line help on stdout. */
static void usage(const char *program_name) {
  printf("Usage: %s [OPTION]... [FILE]...\n"
         "Check whether input files are valid UTF-8.\n"
         "\n"
         " -h, --help display this help text and exit\n"
         " -q, --quiet suppress all normal output\n"
         " -l, --list print only names of FILEs containing invalid UTF-8\n"
         " -i, --invert list valid UTF-8 files instead of invalid ones\n"
         " -v, --verbose print detailed error (multiple lines)\n"
         "\n"
         "This is version %s.\n",
         program_name, VERSION);
}

/*
** Parse the options, then validate each FILE argument, or stdin when no
** file is given. Exits with EXIT_FAILURE if any input is invalid.
*/
int main(int ac, char **av)
{
  int quiet = 0;
  int exit_value = EXIT_SUCCESS;
  int i;
  int list_only = 0;
  int invert = 0;
  int verbose = 0;
  int opt;
  /* Long options set their flag directly; getopt_long() then returns 0. */
  struct option options[] = {
    { "help", no_argument, NULL, 'h' },
    { "quiet", no_argument, &quiet, 1 },
    { "list-only", no_argument, &list_only, 1 },
    { "invert", no_argument, &invert, 1 },
    { "verbose", no_argument, &verbose, 1 },
    { 0, 0, 0, 0 }
  };

  while ((opt = getopt_long(ac, av, "hqliv", options, NULL)) != -1) {
    switch (opt) {
    case 0:
      break;

    case 'h':
      usage(av[0]);
      return EXIT_SUCCESS;

    case 'q':
      quiet = 1;
      break;

    case 'l':
      list_only = 1;
      break;

    case 'i':
      invert = 1;
      break;

    case 'v':
      verbose = 1;
      break;

    case '?':
      usage(av[0]);
      return EXIT_FAILURE;

    default:
      usage(av[0]);
      return EXIT_FAILURE;
    }
  }
  if (optind == ac)
  {
    return is_utf8_readline(stdin, "(standard input)", quiet, verbose,
                            list_only, invert);
  }
  else
  {
    for (i = optind; i < ac; ++i)
    {
      if (is_utf8_mmap(av[i], quiet, verbose,
                       list_only, invert) == EXIT_FAILURE)
        exit_value = EXIT_FAILURE;
    }
    return exit_value;
  }
}
-------------------------------------------------------------------------------- 1 | #include "is_utf8.h" 2 | 3 | /* 4 | Check if the given unsigned char * is a valid utf-8 sequence. 5 | 6 | Return value : 7 | If the string is valid utf-8, 0 is returned. 8 | Else the position, starting from 1, is returned. 9 | 10 | Source: 11 | http://www.unicode.org/versions/Unicode7.0.0/UnicodeStandard-7.0.pdf 12 | page 124, 3.9 "Unicode Encoding Forms", "UTF-8" 13 | 14 | 15 | Table 3-7. Well-Formed UTF-8 Byte Sequences 16 | ----------------------------------------------------------------------------- 17 | | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte | 18 | | U+0000..U+007F | 00..7F | | | | 19 | | U+0080..U+07FF | C2..DF | 80..BF | | | 20 | | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 21 | | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | 22 | | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 23 | | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | 24 | | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 25 | | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | 26 | | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 27 | ----------------------------------------------------------------------------- 28 | 29 | Returns the first erroneous byte position, and give in 30 | `faulty_bytes` the number of actually existing bytes taking part in this error. 
31 | */ 32 | size_t is_utf8(unsigned char *str, size_t len, char **message, int *faulty_bytes) 33 | { 34 | size_t i = 0; 35 | 36 | *message = NULL; 37 | *faulty_bytes = 0; 38 | while (i < len) 39 | { 40 | if (str[i] <= 0x7F) /* 00..7F */ 41 | { 42 | i += 1; 43 | } 44 | else if (str[i] >= 0xC2 && str[i] <= 0xDF) /* C2..DF 80..BF */ 45 | { 46 | if (i + 1 < len) /* Expect a 2nd byte */ 47 | { 48 | if (str[i + 1] < 0x80 || str[i + 1] > 0xBF) 49 | { 50 | *message = "After a first byte between C2 and DF, expecting a 2nd byte between 80 and BF"; 51 | *faulty_bytes = 2; 52 | return i; 53 | } 54 | } 55 | else 56 | { 57 | *message = "After a first byte between C2 and DF, expecting a 2nd byte."; 58 | *faulty_bytes = 1; 59 | return i; 60 | } 61 | i += 2; 62 | } 63 | else if (str[i] == 0xE0) /* E0 A0..BF 80..BF */ 64 | { 65 | if (i + 2 < len) /* Expect a 2nd and 3rd byte */ 66 | { 67 | if (str[i + 1] < 0xA0 || str[i + 1] > 0xBF) 68 | { 69 | *message = "After a first byte of E0, expecting a 2nd byte between A0 and BF."; 70 | *faulty_bytes = 2; 71 | return i; 72 | } 73 | if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) 74 | { 75 | *message = "After a first byte of E0, expecting a 3nd byte between 80 and BF."; 76 | *faulty_bytes = 3; 77 | return i; 78 | } 79 | } 80 | else 81 | { 82 | *message = "After a first byte of E0, expecting two following bytes."; 83 | *faulty_bytes = 1; 84 | return i; 85 | } 86 | i += 3; 87 | } 88 | else if (str[i] >= 0xE1 && str[i] <= 0xEC) /* E1..EC 80..BF 80..BF */ 89 | { 90 | if (i + 2 < len) /* Expect a 2nd and 3rd byte */ 91 | { 92 | if (str[i + 1] < 0x80 || str[i + 1] > 0xBF) 93 | { 94 | *message = "After a first byte between E1 and EC, expecting the 2nd byte between 80 and BF."; 95 | *faulty_bytes = 2; 96 | return i; 97 | } 98 | if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) 99 | { 100 | *message = "After a first byte between E1 and EC, expecting the 3rd byte between 80 and BF."; 101 | *faulty_bytes = 3; 102 | return i; 103 | } 104 | } 105 | else 106 | { 
107 | *message = "After a first byte between E1 and EC, expecting two following bytes."; 108 | *faulty_bytes = 1; 109 | return i; 110 | } 111 | i += 3; 112 | } 113 | else if (str[i] == 0xED) /* ED 80..9F 80..BF */ 114 | { 115 | if (i + 2 < len) /* Expect a 2nd and 3rd byte */ 116 | { 117 | if (str[i + 1] < 0x80 || str[i + 1] > 0x9F) 118 | { 119 | *message = "After a first byte of ED, expecting 2nd byte between 80 and 9F."; 120 | *faulty_bytes = 2; 121 | return i; 122 | } 123 | if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) 124 | { 125 | *message = "After a first byte of ED, expecting 3rd byte between 80 and BF."; 126 | *faulty_bytes = 3; 127 | return i; 128 | } 129 | } 130 | else 131 | { 132 | *message = "After a first byte of ED, expecting two following bytes."; 133 | *faulty_bytes = 1; 134 | return i; 135 | } 136 | i += 3; 137 | } 138 | else if (str[i] >= 0xEE && str[i] <= 0xEF) /* EE..EF 80..BF 80..BF */ 139 | { 140 | if (i + 2 < len) /* Expect a 2nd and 3rd byte */ 141 | { 142 | if (str[i + 1] < 0x80 || str[i + 1] > 0xBF) 143 | { 144 | *message = "After a first byte between EE and EF, expecting 2nd byte between 80 and BF."; 145 | *faulty_bytes = 2; 146 | return i; 147 | } 148 | if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) 149 | { 150 | *message = "After a first byte between EE and EF, expecting 3rd byte between 80 and BF."; 151 | *faulty_bytes = 3; 152 | return i; 153 | } 154 | } 155 | else 156 | { 157 | *message = "After a first byte between EE and EF, two following bytes."; 158 | *faulty_bytes = 1; 159 | return i; 160 | } 161 | i += 3; 162 | } 163 | else if (str[i] == 0xF0) /* F0 90..BF 80..BF 80..BF */ 164 | { 165 | if (i + 3 < len) /* Expect a 2nd, 3rd 3th byte */ 166 | { 167 | if (str[i + 1] < 0x90 || str[i + 1] > 0xBF) 168 | { 169 | *message = "After a first byte of F0, expecting 2nd byte between 90 and BF."; 170 | *faulty_bytes = 2; 171 | return i; 172 | } 173 | if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) 174 | { 175 | *message = "After a first byte of F0, 
expecting 3rd byte between 80 and BF."; 176 | *faulty_bytes = 3; 177 | return i; 178 | } 179 | if (str[i + 3] < 0x80 || str[i + 3] > 0xBF) 180 | { 181 | *message = "After a first byte of F0, expecting 4th byte between 80 and BF."; 182 | *faulty_bytes = 4; 183 | return i; 184 | } 185 | } 186 | else 187 | { 188 | *message = "After a first byte of F0, expecting three following bytes."; 189 | *faulty_bytes = 1; 190 | return i; 191 | } 192 | i += 4; 193 | } 194 | else if (str[i] >= 0xF1 && str[i] <= 0xF3) /* F1..F3 80..BF 80..BF 80..BF */ 195 | { 196 | if (i + 3 < len) /* Expect a 2nd, 3rd 3th byte */ 197 | { 198 | if (str[i + 1] < 0x80 || str[i + 1] > 0xBF) 199 | { 200 | *message = "After a first byte of F1, F2, or F3, expecting a 2nd byte between 80 and BF."; 201 | *faulty_bytes = 2; 202 | return i; 203 | } 204 | if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) 205 | { 206 | *message = "After a first byte of F1, F2, or F3, expecting a 3rd byte between 80 and BF."; 207 | *faulty_bytes = 3; 208 | return i; 209 | } 210 | if (str[i + 3] < 0x80 || str[i + 3] > 0xBF) 211 | { 212 | *message = "After a first byte of F1, F2, or F3, expecting a 4th byte between 80 and BF."; 213 | *faulty_bytes = 4; 214 | return i; 215 | } 216 | } 217 | else 218 | { 219 | *message = "After a first byte of F1, F2, or F3, expecting three following bytes."; 220 | *faulty_bytes = 1; 221 | return i; 222 | } 223 | i += 4; 224 | } 225 | else if (str[i] == 0xF4) /* F4 80..8F 80..BF 80..BF */ 226 | { 227 | if (i + 3 < len) /* Expect a 2nd, 3rd 3th byte */ 228 | { 229 | if (str[i + 1] < 0x80 || str[i + 1] > 0x8F) 230 | { 231 | *message = "After a first byte of F4, expecting 2nd byte between 80 and 8F."; 232 | *faulty_bytes = 2; 233 | return i; 234 | } 235 | if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) 236 | { 237 | *message = "After a first byte of F4, expecting 3rd byte between 80 and BF."; 238 | *faulty_bytes = 3; 239 | return i; 240 | } 241 | if (str[i + 3] < 0x80 || str[i + 3] > 0xBF) 242 | { 243 | *message 
= "After a first byte of F4, expecting 4th byte between 80 and BF."; 244 | *faulty_bytes = 4; 245 | return i; 246 | } 247 | } 248 | else 249 | { 250 | *message = "After a first byte of F4, expecting three following bytes."; 251 | *faulty_bytes = 1; 252 | return i; 253 | } 254 | i += 4; 255 | } 256 | else 257 | { 258 | *message = "Expecting bytes in the following ranges: 00..7F C2..F4."; 259 | *faulty_bytes = 1; 260 | return i; 261 | } 262 | } 263 | message = NULL; 264 | return 0; 265 | } 266 | --------------------------------------------------------------------------------