├── is_utf8.h
├── Makefile
├── COPYRIGHT
├── README.md
├── test.sh
├── main.c
└── is_utf8.c


/is_utf8.h:
--------------------------------------------------------------------------------
#ifndef IS_UTF8_H
#define IS_UTF8_H

#include <stdlib.h>

/*
** Check whether the `len` bytes starting at `str` are well-formed UTF-8.
**
** On success, *message is set to NULL, *faulty_bytes to 0, and 0 is returned.
** On failure, *message points to a static English description of the problem,
** *faulty_bytes is the number of bytes taking part in the error, and the
** 0-based offset of the first byte of the offending sequence is returned.
**
** Because offset 0 is a legitimate error position, callers must test
** *message (not the return value) to know whether the input is valid.
*/
size_t is_utf8(unsigned char *str, size_t len, char **message, int *faulty_bytes);

#endif /* IS_UTF8_H */
9 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## Makefile for is_utf8
 3 | ##
 4 | ## Made by julien palard
 5 | ## Login   <is_utf8@mandark.fr>
 6 | ##
 7 | 
 8 | NAME = isutf8
 9 | SRC = main.c is_utf8.c
10 | OBJ = $(SRC:.c=.o)
11 | 
12 | VERSION = 0
13 | MINOR = 0
14 | RELEASE = 0
15 | 
16 | LIB_SRC = is_utf8.c
17 | 
18 | LINKERNAME = lib$(NAME).so
19 | SONAME = $(LINKERNAME).$(VERSION)
20 | REALNAME = $(SONAME).$(MINOR).$(RELEASE)
21 | 
22 | CC = gcc
23 | CFLAGS = -O3 -Wextra -Wall -ansi -Wstrict-prototypes
24 | 
25 | $(NAME):	IS_UTF8_LIB $(OBJ)
26 | 	$(CC) $(CFLAGS) -o $(NAME) $(OBJ)
27 | 
28 | IS_UTF8_LIB:
29 | 	$(CC) --shared -fPIC $(CFLAGS) $(LIB_SRC) -o $(LINKERNAME)
30 | 
31 | all:
32 | 		@make $(NAME)
33 | 
34 | clean:
35 | 		rm -f $(NAME) $(LINKERNAME) $(OBJ) $(LIB_OBJ)
36 | 
37 | re:		clean all
38 | 


--------------------------------------------------------------------------------
/COPYRIGHT:
--------------------------------------------------------------------------------
 1 | is_utf8 is distributed under the following terms:
 2 | 
 3 | Copyright (c) 2013 Palard Julien. All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions
 7 | are met:
 8 | 1. Redistributions of source code must retain the above copyright
 9 |    notice, this list of conditions and the following disclaimer.
10 | 2. Redistributions in binary form must reproduce the above copyright
11 |    notice, this list of conditions and the following disclaimer in the
12 |    documentation and/or other materials provided with the distribution.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 | SUCH DAMAGE.
25 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | isutf8 is a program and a C library to check if a given file (or stdin) contains only
 2 | valid UTF-8 sequences.
 3 | 
 4 | # Compiling
 5 | 
 6 |     $ make
 7 | 
 8 | # Demo
 9 | 
10 |     $ isutf8 * -v
11 |     isutf8: line 1, char 97, byte 96: Expecting bytes in the following ranges: 00..7F C2..F4.
12 |     40 00 40 00 00 00 00 00 C0 01 00 00 00 00 00 00  | @.@.............
13 |                             ^^                       |         ^
14 | 
15 |     is_utf8.o: line 1, char 66, byte 65: Expecting bytes in the following ranges: 00..7F C2..F4.
16 |     00 40 00 0E 00 0B 00 48 85 F6 48 C7 02 00 00 00  | .@.....H..H.....
17 |                             ^^                       |         ^
18 | 
19 |     libisutf8.so: line 1, char 153, byte 152: After a first byte of F0, expecting 2nd byte between 90 and BF.
20 |     68 12 20 00 00 00 00 00 F0 01 00 00 00 00 00 00  | h. .............
21 |                             ^^^^^                    |         ^^
22 | 
23 |     main.o: line 1, char 76, byte 75: Expecting bytes in the following ranges: 00..7F C2..F4.
24 |     56 41 55 41 54 55 53 48 83 EC 18 48 8B 5C 24 58  | VAUATUSH...H.\$X
25 |                             ^^                       |         ^
26 | 
27 | # Test a file
28 | 
29 | `isutf8` returns 0 if the file is correctly encoded:
30 | 
31 |     $ isutf8 main.c
32 |     $ echo $?
33 |     0
34 | 
35 | Some files here only contain ASCII or correctly encoded UTF8:
36 | 
37 |     $ isutf8 README.md
38 |     $ isutf8 test.sh
39 | 
40 | But an ELF is clearly not UTF8, so an error message is printed:
41 | 
42 |     $ isutf8 isutf8
43 |     isutf8: line 1, char 97, byte 96: Expecting bytes in the following ranges: 00..7F C2..F4.
44 | 
45 | `-v` adds some context:
46 | 
47 |     $ isutf8 -v isutf8
48 |     isutf8: line 1, char 97, byte 96: Expecting bytes in the following ranges: 00..7F C2..F4.
49 |     40 00 40 00 00 00 00 00 C0 01 00 00 00 00 00 00  | @.@.............
50 |                             ^^                       |         ^
51 | 
52 | # Test stdin
53 | 
54 | `isutf8` reads from stdin if no files are given; also note that `bash`
55 | helps a lot with its `$''` syntax, which lets you write and test hexadecimal bytes:
56 | 
57 |     $ echo $'\xe9' | isutf8
58 |     (standard input): line 1, char 0, byte 0: After a first byte between E1 and EC, expecting two following bytes.
59 | 
60 |     $ echo "Hellö world" | iconv -f utf8 -t latin1 | isutf8
61 |     (standard input): line 1, char 4, byte 4: Expecting bytes in the following ranges: 00..7F C2..F4.
62 | 
63 | # Find UTF8 or non-UTF8 files
64 | 
65 | As `isutf8` can take multiple arguments, it's easy to classify
66 | UTF8-compatible versus non-UTF8-compatible files:
67 | 
68 | List non-UTF8 compatible files:
69 | 
70 |     $ isutf8 --list *
71 |     isutf8
72 |     is_utf8.o
73 |     libisutf8.so
74 |     main.o
75 | 
76 | List UTF8-compatible files:
77 | 
78 |     $ isutf8 --list --invert *
79 |     COPYRIGHT
80 |     is_utf8.c
81 |     is_utf8.h
82 |     main.c
83 |     Makefile
84 |     README.md
85 |     test.sh
86 | 


--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | utf8_test_file()
  4 | {
  5 |     printf "."
  6 |     to_test="$1"
  7 |     should_return="$2"
  8 |     ./isutf8 <(printf "%s" "$to_test") 2>/dev/null >/dev/null
  9 |     error_number=$?
 10 |     if ! [ z"$error_number" == z"$should_return" ]
 11 |     then
 12 |         [ $should_return == 1 ] &&
 13 |         printf "\nThis one should have failed:\n" ||
 14 |         printf "\nThis one should succeed:\n"
 15 |         printf "%s" "$to_test" | hexdump -C
 16 |         printf "%s" "$to_test" | ./isutf8 <(printf "%s" "$to_test")
 17 |         exit 1
 18 |     fi
 19 | }
 20 | 
 21 | utf8_test_pipe()
 22 | {
 23 |     printf "."
 24 |     to_test="$1"
 25 |     should_return="$2"
 26 |     printf "%s" "$to_test" | ./isutf8 2>/dev/null >/dev/null
 27 |     error_number=$?
 28 |     if ! [ z"$error_number" == z"$should_return" ]
 29 |     then
 30 |         [ $should_return == 1 ] &&
 31 |         printf "\nThis one should have failed:\n" ||
 32 |         printf "\nThis one should succeed:\n"
 33 |         printf "%s" "$to_test" | hexdump -C
 34 |         printf "%s" "$to_test" | ./isutf8 -
 35 |         exit 1
 36 |     fi
 37 | }
 38 | 
 39 | utf8_test()
 40 | {
 41 |     utf8_test_pipe "$@"
 42 |     utf8_test_file "$@"
 43 | }
 44 | 
 45 | should_pass()
 46 | {
 47 |     while [ -n "$1" ]
 48 |     do
 49 |         utf8_test "$1" 0
 50 |         shift
 51 |     done
 52 | }
 53 | 
 54 | should_fail()
 55 | {
 56 |     while [ -n "$1" ]
 57 |     do
 58 |         utf8_test "$1" 1
 59 |         shift
 60 |     done
 61 | }
 62 | 
 63 | should_pass "Léa" "Maïté" "Pauline" "Élise"
 64 | should_fail $'\xc9lise' $'Elis\xc9'
 65 | 
 66 | # Tests from :
 67 | # Table 3-7. Well-Formed UTF-8 Byte Sequences
 68 | # -----------------------------------------------------------------------------
 69 | # |  Code Points        | First Byte | Second Byte | Third Byte | Fourth Byte |
 70 | # |  U+0000..U+007F     |     00..7F |             |            |             |
 71 | should_pass $'\01' $'\02' q w e r t y "~" "foo" "bar" "baz" "123" 1 2 3 "," $'\n' $'\x0E' $'\x0F' $'\x7F'
 72 | should_fail $'\x80' $'\x82' $'\x83' $'\xC0' $'\xC1'
 73 | should_fail $'|\x80' $'|\x82' $'|\x83' $'|\xC0' $'|\xC1'
 74 | should_fail $'\x80|' $'\x82|' $'\x83|' $'\xC0|' $'\xC1|'
 75 | # |  U+0080..U+07FF     |     C2..DF |      80..BF |            |             |
 76 | should_pass $'\xC2\x80' $'\xC2\xBF' $'\xDF\x80' $'\xDF\xBF'
 77 | should_fail $'\xC2\x79' $'\xC2\xC0' $'\xC2\xC3'
 78 | should_pass $'---\xC2\x80' $'---\xC2\xBF' $'---\xDF\x80' $'---\xDF\xBF'
 79 | should_fail $'\xC2\x79---' $'\xC2\xC0---' $'\xC2\xC3---'
 80 | # |  U+0800..U+0FFF     |         E0 |      A0..BF |     80..BF |             |
 81 | should_pass $'\xE0\xA0\x80' $'\xE0\xA0\xBF' $'\xE0\xBF\x80' $'\xE0\xBF\xBF'
 82 | should_fail $'\xE0\x99\x80' $'\xE0\xC5\xBF' $'\xE0\xBF\x78' $'\xE0\xBF\xEE'
 83 | should_pass $'~~~~\xE0\xA0\x80' $'~~~~\xE0\xA0\xBF' $'~~~~\xE0\xBF\x80' $'~~~~\xE0\xBF\xBF'
 84 | should_fail $'\xE0\x99\x80~~~~' $'\xE0\xC5\xBF~~~~' $'\xE0\xBF\x78~~~~' $'\xE0\xBF\xEE~~~~'
 85 | # |  U+1000..U+CFFF     |     E1..EC |      80..BF |     80..BF |             |
 86 | should_pass $'\xE1\x80\x80' $'\xE1\xBF\x80' $'\xE1\x80\xBF' $'\xE1\xBF\xBF'
 87 | should_pass $'\xEC\x80\x80' $'\xEC\xBF\x80' $'\xEC\x80\xBF' $'\xEC\xBF\xBF'
 88 | should_fail $'\xE1\x41\x80' $'\xE1\xC0\x80' $'\xE1\xC8\xBF' $'\xE1\xBF\xFE'
 89 | # |  U+D000..U+D7FF     |         ED |      80..9F |     80..BF |             |
 90 | should_pass $'\xED\x80\x80' $'\xED\x80\xBF' $'\xED\x9F\x80' $'\xED\x9F\xBF'
 91 | should_fail $'\xED\x80\x50' $'\xED\x80\xC1' $'\xED\xBF\x80' $'\xED\xBF\xBF'
 92 | # |  U+E000..U+FFFF     |     EE..EF |      80..BF |     80..BF |             |
 93 | should_pass $'\xEE\x80\x80' $'\xEE\x80\xBF' $'\xEE\xBF\x80' $'\xEE\xBF\xBF'
 94 | should_fail $'\xEE\x70\x80' $'\xEE\x80\x70' $'\xEE\xCF\x80' $'\xEE\xCF\xCF'
 95 | # |  U+10000..U+3FFFF   |         F0 |      90..BF |     80..BF |      80..BF |
 96 | should_pass $'\xF0\x90\x80\x80' $'\xF0\xBF\x80\x80' $'\xF0\x90\x80\xBF' $'\xF0\xBF\xBF\xBF'
 97 | should_fail $'\xF0\x70\x80\x80' $'\xF0\xCF\x70\x80' $'\xF0\xCF\x80\xCE' $'\xF0\xCF\xCF\xDF'
 98 | should_fail $'\xF0\x80\x80\x80'
 99 | # |  U+40000..U+FFFFF   |     F1..F3 |      80..BF |     80..BF |      80..BF |
100 | should_pass $'\xF1\x80\x80\x80' $'\xF1\xBF\xBF\xBF'
101 | should_pass $'\xF2\x80\x80\x80' $'\xF2\xBF\xBF\xBF'
102 | should_pass $'\xF3\x80\x80\x80' $'\xF3\xBF\xBF\xBF'
103 | # |  U+100000..U+10FFFF |         F4 |      80..8F |     80..BF |      80..BF |
104 | should_pass $'\xF4\x80\x80\x80' $'\xF4\x8F\xBF\xBF'
105 | should_fail $'\xF4\x80\x80\x79' $'\xF4\xBF\xBF\xBF'
106 | # -----------------------------------------------------------------------------
107 | 
108 | should_fail $'\xf0-' $'\xf1-' $'\xf2-' $'\xf3-' $'\xE1-' $'\xEE-' $'\xED-' $'\xEF-'
109 | 
110 | printf "\nAll tests are OK.\n"
111 | 


--------------------------------------------------------------------------------
/main.c:
--------------------------------------------------------------------------------
  1 | #define _POSIX_C_SOURCE 200809L
  2 | #include <sys/types.h>
  3 | #include <sys/stat.h>
  4 | #include <fcntl.h>
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #include <string.h>
  8 | #include <unistd.h>
  9 | #include <getopt.h>
 10 | #include <sys/mman.h>
 11 | 
 12 | #include "is_utf8.h"
 13 | 
 14 | #define VERSION "1.2"
 15 | #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
 16 | 
 17 | 
 18 | static int show_str(const char *str, unsigned int max_length)
 19 | {
 20 |     int printed = 0;
 21 | 
 22 |     while (max_length-- > 0)
 23 |     {
 24 |         printed += printf("%c", (*str >= ' ' && *str <= '~') ? (unsigned char)*str: '.');
 25 |         str += 1;
 26 |     }
 27 |     return printed;
 28 | }
 29 | 
 30 | static int show_hex_str(const char *str, unsigned int max_length)
 31 | {
 32 |     int printed = 0;
 33 | 
 34 |     while (max_length-- > 0)
 35 |         printed += printf("%.2X ", (unsigned char)*str++);
 36 |     return printed;
 37 | }
 38 | 
 39 | /*show_context(, 5, 279, 2, chars_before_error: 8, chars_after_error: -274)
 40 | **/
 41 | static void show_context(char *str, int str_length, int err_pos_in_str, int faulty_bytes)
 42 | {
 43 |     int chars_before_error = MIN(err_pos_in_str, 8);
 44 |     int chars_after_error = MIN(str_length - err_pos_in_str, 8);
 45 |     int printed = 0;
 46 | 
 47 |     printed = show_hex_str(str + err_pos_in_str - chars_before_error, chars_before_error + chars_after_error); /* Print up to error. */
 48 |     printf("%*s | ", 3 * 16 - printed, "");
 49 |     show_str(str + err_pos_in_str - chars_before_error, chars_before_error + chars_after_error); /* Print up to error. */
 50 |     printf("\n");
 51 |     printed = printf("%*s", (3 * chars_before_error), "");
 52 |     printed += printf("%.*s", faulty_bytes * 2 + faulty_bytes - 1, "^^^^^^^^^^^^^^^^");
 53 |     printf("%*s | ", 3 * 16 - printed, "");
 54 |     printf("%*s", (chars_before_error), "");
 55 |     printf("%.*s", faulty_bytes, "^^^^");
 56 |     printf("\n\n");
 57 | }
 58 | 
 59 | static void print_utf8_error(
 60 |     const char* file_path,
 61 |     int error_line, int error_column, int byte_no,
 62 |     char *str, int str_length, int err_pos_in_str,
 63 |     const char *message, int faulty_bytes,
 64 |     int quiet, int verbose,
 65 |     int list_only, int invert)
 66 | {
 67 |     if (quiet)
 68 |         return;
 69 |     if (message && !invert)
 70 |     {
 71 |         if (list_only)
 72 |             printf("%s\n", file_path);
 73 |         else
 74 |             printf("%s: line %d, char %d, byte %d: %s\n",
 75 |                    file_path, error_line, error_column, byte_no,
 76 |                    message);
 77 |         if (verbose && !list_only)
 78 |         {
 79 |             show_context(str, str_length, err_pos_in_str, faulty_bytes);
 80 |         }
 81 |     }
 82 |     if (!message && invert)
 83 |     {
 84 |         printf("%s\n", file_path);
 85 |     }
 86 | }
 87 | 
 88 | #define handle_error(msg, target)                                   \
 89 |     do {retval = EXIT_FAILURE; perror(msg); goto target;} while (0)
 90 | 
 91 | static int is_utf8_readline(FILE *stream, const char *file_path,
 92 |                             int quiet, int verbose, int list_only, int invert)
 93 | {
 94 |     char *string = NULL;
 95 |     size_t size = 0;
 96 |     ssize_t str_length;
 97 |     char *message = NULL;
 98 |     int lineno = 1;
 99 |     int pos = 0;
100 |     int offset = 0;
101 |     int faulty_bytes = 0;
102 | 
103 |     while ((str_length = getline(&string, &size, stream)) != -1)
104 |     {
105 |         pos = is_utf8((unsigned char*)string, str_length, &message, &faulty_bytes);
106 |         if (message != NULL)
107 |         {
108 |             offset += pos;
109 |             print_utf8_error(file_path, lineno, pos, offset,
110 |                              string, str_length, pos, message, faulty_bytes,
111 |                              quiet, verbose, list_only, invert);
112 |             break;
113 |         }
114 |         offset += str_length;
115 |         lineno += 1;
116 |     }
117 |     if (string != NULL)
118 |         free(string);
119 |     return message == NULL ? EXIT_SUCCESS : EXIT_FAILURE;
120 | }
121 | 
/*
** Convert the byte offset `up_to` inside `string` into a position.
**
** Scans the first min(length, up_to) bytes for '\n'.
** *line    receives 1 plus the number of newlines seen
** *column  receives 1 plus the distance in bytes from the start of the
**          last line seen to `up_to`
*/
static void count_lines(const char *string, int length, int up_to, int *line, int *column)
{
    const int limit = (length < up_to) ? length : up_to;
    int last_line_start = 0;
    int newlines = 0;
    int idx;

    for (idx = 0; idx < limit; ++idx)
    {
        if (string[idx] == '\n')
        {
            last_line_start = idx + 1;
            ++newlines;
        }
    }
    *line = 1 + newlines;
    *column = 1 + up_to - last_line_start;
}
139 | 
140 | static int is_utf8_mmap(const char *file_path, int quiet, int verbose,
141 |                         int list_only, int invert)
142 | {
143 |     char *addr;
144 |     struct stat sb;
145 |     int fd;
146 |     int pos = 0;
147 |     char *message;
148 |     int retval = EXIT_SUCCESS;
149 |     int error_column = 1;
150 |     int error_line = 0;
151 |     int faulty_bytes = 0;
152 | 
153 |     fd = open(file_path, O_RDONLY);
154 |     if (fd == -1)
155 |         handle_error("open", err_open);
156 |     if (fstat(fd, &sb) == -1)           /* To obtain file size */
157 |         handle_error("fstat", err_fstat);
158 |     addr = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
159 |     if (addr == MAP_FAILED)
160 |     {
161 |         /* Can't nmap, maybe a pipe or whatever, let's try readline. */
162 |         close(fd);
163 |         return is_utf8_readline(fopen(file_path, "r"), file_path,
164 |                                 quiet, verbose, list_only, invert);
165 |     }
166 |     pos = is_utf8((unsigned char*)addr, sb.st_size, &message, &faulty_bytes);
167 |     if (message != NULL)
168 |         count_lines(addr, sb.st_size, pos, &error_line, &error_column);
169 |     print_utf8_error(file_path, error_line, error_column, pos,
170 |                      addr, sb.st_size, pos, message, faulty_bytes,
171 |                      quiet, verbose, list_only, invert);
172 |     if (message != NULL)
173 |         retval = EXIT_FAILURE;
174 |     munmap(addr, sb.st_size);
175 | err_fstat:
176 |     close(fd);
177 | err_open:
178 |     return retval;
179 | }
180 | 
181 | static void usage(const char *program_name) {
182 |     printf("Usage: %s [OPTION]... [FILE]...\n"
183 |            "Check whether input files are valid UTF-8.\n"
184 |            "\n"
185 |            "  -h, --help       display this help text and exit\n"
186 |            "  -q, --quiet      suppress all normal output\n"
187 |            "  -l, --list       print only names of FILEs containing invalid UTF-8\n"
188 |            "  -i, --invert     list valid UTF-8 files instead of invalid ones\n"
189 |            "  -v, --verbose    print detailed error (multiple lines)\n"
190 |            "\n"
191 |            "This is version %s.\n",
192 |            program_name, VERSION);
193 | }
194 | 
/*
** Entry point: parse the options, then check standard input (no FILE given)
** or every FILE argument in turn. A FILE of "-" means standard input.
**
** Returns EXIT_SUCCESS when every input is valid UTF-8, EXIT_FAILURE when at
** least one is invalid or unreadable, or when the options are invalid.
*/
int main(int ac, char **av)
{
    int quiet = 0;
    int exit_value = EXIT_SUCCESS;
    int status;
    int i;
    int list_only = 0;
    int invert = 0;
    int verbose = 0;
    int opt;
    /* Long options with a flag pointer set the variable themselves and make
    ** getopt_long() return 0. "--list" works as an abbreviation of
    ** "--list-only". */
    struct option options[] = {
        { "help", no_argument, NULL, 'h' },
        { "quiet", no_argument, &quiet, 1 },
        { "list-only", no_argument, &list_only, 1 },
        { "invert", no_argument, &invert, 1 },
        { "verbose", no_argument, &verbose, 1 },
        { 0, 0, 0, 0 }
    };

    while ((opt = getopt_long(ac, av, "hqliv", options, NULL)) != -1) {
        switch (opt) {
            case 0:
                break;

            case 'h':
                usage(av[0]);
                return EXIT_SUCCESS;

            case 'q':
                quiet = 1;
                break;

            case 'l':
                list_only = 1;
                break;

            case 'i':
                invert = 1;
                break;

            case 'v':
                verbose = 1;
                break;

            case '?': /* Unknown option: getopt_long() already complained. */
            default:
                usage(av[0]);
                return EXIT_FAILURE;
        }
    }
    if (optind == ac)
    {
        return is_utf8_readline(stdin, "(standard input)", quiet, verbose,
                                list_only, invert);
    }
    for (i = optind; i < ac; ++i)
    {
        /* Conventional "-" operand: read standard input instead of a file. */
        if (strcmp(av[i], "-") == 0)
            status = is_utf8_readline(stdin, "(standard input)", quiet,
                                      verbose, list_only, invert);
        else
            status = is_utf8_mmap(av[i], quiet, verbose, list_only, invert);
        if (status == EXIT_FAILURE)
            exit_value = EXIT_FAILURE;
    }
    return exit_value;
}
263 | 


--------------------------------------------------------------------------------
/is_utf8.c:
--------------------------------------------------------------------------------
  1 | #include "is_utf8.h"
  2 | 
  3 | /*
  4 |   Check if the given unsigned char * is a valid utf-8 sequence.
  5 | 
  6 |   Return value :
  7 |   If the string is valid utf-8, 0 is returned.
  8 |   Else the position, starting from 1, is returned.
  9 | 
 10 |   Source:
 11 |    http://www.unicode.org/versions/Unicode7.0.0/UnicodeStandard-7.0.pdf
 12 |    page 124, 3.9 "Unicode Encoding Forms", "UTF-8"
 13 | 
 14 | 
 15 |   Table 3-7. Well-Formed UTF-8 Byte Sequences
 16 |   -----------------------------------------------------------------------------
 17 |   |  Code Points        | First Byte | Second Byte | Third Byte | Fourth Byte |
 18 |   |  U+0000..U+007F     |     00..7F |             |            |             |
 19 |   |  U+0080..U+07FF     |     C2..DF |      80..BF |            |             |
 20 |   |  U+0800..U+0FFF     |         E0 |      A0..BF |     80..BF |             |
 21 |   |  U+1000..U+CFFF     |     E1..EC |      80..BF |     80..BF |             |
 22 |   |  U+D000..U+D7FF     |         ED |      80..9F |     80..BF |             |
 23 |   |  U+E000..U+FFFF     |     EE..EF |      80..BF |     80..BF |             |
 24 |   |  U+10000..U+3FFFF   |         F0 |      90..BF |     80..BF |      80..BF |
 25 |   |  U+40000..U+FFFFF   |     F1..F3 |      80..BF |     80..BF |      80..BF |
 26 |   |  U+100000..U+10FFFF |         F4 |      80..8F |     80..BF |      80..BF |
 27 |   -----------------------------------------------------------------------------
 28 | 
 29 |   Returns the first erroneous byte position, and give in
 30 |   `faulty_bytes` the number of actually existing bytes taking part in this error.
 31 | */
 32 | size_t is_utf8(unsigned char *str, size_t len, char **message, int *faulty_bytes)
 33 | {
 34 |     size_t i = 0;
 35 | 
 36 |     *message = NULL;
 37 |     *faulty_bytes = 0;
 38 |     while (i < len)
 39 |     {
 40 |         if (str[i] <= 0x7F) /* 00..7F */
 41 |         {
 42 |             i += 1;
 43 |         }
 44 |         else if (str[i] >= 0xC2 && str[i] <= 0xDF) /* C2..DF 80..BF */
 45 |         {
 46 |             if (i + 1 < len) /* Expect a 2nd byte */
 47 |             {
 48 |                 if (str[i + 1] < 0x80 || str[i + 1] > 0xBF)
 49 |                 {
 50 |                     *message = "After a first byte between C2 and DF, expecting a 2nd byte between 80 and BF";
 51 |                     *faulty_bytes = 2;
 52 |                     return i;
 53 |                 }
 54 |             }
 55 |             else
 56 |             {
 57 |                 *message = "After a first byte between C2 and DF, expecting a 2nd byte.";
 58 |                 *faulty_bytes = 1;
 59 |                 return i;
 60 |             }
 61 |             i += 2;
 62 |         }
 63 |         else if (str[i] == 0xE0) /* E0 A0..BF 80..BF */
 64 |         {
 65 |             if (i + 2 < len) /* Expect a 2nd and 3rd byte */
 66 |             {
 67 |                 if (str[i + 1] < 0xA0 || str[i + 1] > 0xBF)
 68 |                 {
 69 |                     *message = "After a first byte of E0, expecting a 2nd byte between A0 and BF.";
 70 |                     *faulty_bytes = 2;
 71 |                     return i;
 72 |                 }
 73 |                 if (str[i + 2] < 0x80 || str[i + 2] > 0xBF)
 74 |                 {
 75 |                     *message = "After a first byte of E0, expecting a 3nd byte between 80 and BF.";
 76 |                     *faulty_bytes = 3;
 77 |                     return i;
 78 |                 }
 79 |             }
 80 |             else
 81 |             {
 82 |                 *message = "After a first byte of E0, expecting two following bytes.";
 83 |                 *faulty_bytes = 1;
 84 |                 return i;
 85 |             }
 86 |             i += 3;
 87 |         }
 88 |         else if (str[i] >= 0xE1 && str[i] <= 0xEC) /* E1..EC 80..BF 80..BF */
 89 |         {
 90 |             if (i + 2 < len) /* Expect a 2nd and 3rd byte */
 91 |             {
 92 |                 if (str[i + 1] < 0x80 || str[i + 1] > 0xBF)
 93 |                 {
 94 |                     *message = "After a first byte between E1 and EC, expecting the 2nd byte between 80 and BF.";
 95 |                     *faulty_bytes = 2;
 96 |                     return i;
 97 |                 }
 98 |                 if (str[i + 2] < 0x80 || str[i + 2] > 0xBF)
 99 |                 {
100 |                     *message = "After a first byte between E1 and EC, expecting the 3rd byte between 80 and BF.";
101 |                     *faulty_bytes = 3;
102 |                     return i;
103 |                 }
104 |             }
105 |             else
106 |             {
107 |                 *message = "After a first byte between E1 and EC, expecting two following bytes.";
108 |                 *faulty_bytes = 1;
109 |                 return i;
110 |             }
111 |             i += 3;
112 |         }
113 |         else if (str[i] == 0xED) /* ED 80..9F 80..BF */
114 |         {
115 |             if (i + 2 < len) /* Expect a 2nd and 3rd byte */
116 |             {
117 |                 if (str[i + 1] < 0x80 || str[i + 1] > 0x9F)
118 |                 {
119 |                     *message = "After a first byte of ED, expecting 2nd byte between 80 and 9F.";
120 |                     *faulty_bytes = 2;
121 |                     return i;
122 |                 }
123 |                 if (str[i + 2] < 0x80 || str[i + 2] > 0xBF)
124 |                 {
125 |                     *message = "After a first byte of ED, expecting 3rd byte between 80 and BF.";
126 |                     *faulty_bytes = 3;
127 |                     return i;
128 |                 }
129 |             }
130 |             else
131 |             {
132 |                 *message = "After a first byte of ED, expecting two following bytes.";
133 |                 *faulty_bytes = 1;
134 |                 return i;
135 |             }
136 |             i += 3;
137 |         }
138 |         else if (str[i] >= 0xEE && str[i] <= 0xEF) /* EE..EF 80..BF 80..BF */
139 |         {
140 |             if (i + 2 < len) /* Expect a 2nd and 3rd byte */
141 |             {
142 |                 if (str[i + 1] < 0x80 || str[i + 1] > 0xBF)
143 |                 {
144 |                     *message = "After a first byte between EE and EF, expecting 2nd byte between 80 and BF.";
145 |                     *faulty_bytes = 2;
146 |                     return i;
147 |                 }
148 |                 if (str[i + 2] < 0x80 || str[i + 2] > 0xBF)
149 |                 {
150 |                     *message = "After a first byte between EE and EF, expecting 3rd byte between 80 and BF.";
151 |                     *faulty_bytes = 3;
152 |                     return i;
153 |                 }
154 |             }
155 |             else
156 |             {
157 |                 *message = "After a first byte between EE and EF, two following bytes.";
158 |                 *faulty_bytes = 1;
159 |                 return i;
160 |             }
161 |             i += 3;
162 |         }
163 |         else if (str[i] == 0xF0) /* F0 90..BF 80..BF 80..BF */
164 |         {
165 |             if (i + 3 < len) /* Expect a 2nd, 3rd 3th byte */
166 |             {
167 |                 if (str[i + 1] < 0x90 || str[i + 1] > 0xBF)
168 |                 {
169 |                     *message = "After a first byte of F0, expecting 2nd byte between 90 and BF.";
170 |                     *faulty_bytes = 2;
171 |                     return i;
172 |                 }
173 |                 if (str[i + 2] < 0x80 || str[i + 2] > 0xBF)
174 |                 {
175 |                     *message = "After a first byte of F0, expecting 3rd byte between 80 and BF.";
176 |                     *faulty_bytes = 3;
177 |                     return i;
178 |                 }
179 |                 if (str[i + 3] < 0x80 || str[i + 3] > 0xBF)
180 |                 {
181 |                     *message = "After a first byte of F0, expecting 4th byte between 80 and BF.";
182 |                     *faulty_bytes = 4;
183 |                     return i;
184 |                 }
185 |             }
186 |             else
187 |             {
188 |                 *message = "After a first byte of F0, expecting three following bytes.";
189 |                 *faulty_bytes = 1;
190 |                 return i;
191 |             }
192 |             i += 4;
193 |         }
194 |         else if (str[i] >= 0xF1 && str[i] <= 0xF3) /* F1..F3 80..BF 80..BF 80..BF */
195 |         {
196 |             if (i + 3 < len) /* Expect a 2nd, 3rd 3th byte */
197 |             {
198 |                 if (str[i + 1] < 0x80 || str[i + 1] > 0xBF)
199 |                 {
200 |                     *message = "After a first byte of F1, F2, or F3, expecting a 2nd byte between 80 and BF.";
201 |                     *faulty_bytes = 2;
202 |                     return i;
203 |                 }
204 |                 if (str[i + 2] < 0x80 || str[i + 2] > 0xBF)
205 |                 {
206 |                     *message = "After a first byte of F1, F2, or F3, expecting a 3rd byte between 80 and BF.";
207 |                     *faulty_bytes = 3;
208 |                     return i;
209 |                 }
210 |                 if (str[i + 3] < 0x80 || str[i + 3] > 0xBF)
211 |                 {
212 |                     *message = "After a first byte of F1, F2, or F3, expecting a 4th byte between 80 and BF.";
213 |                     *faulty_bytes = 4;
214 |                     return i;
215 |                 }
216 |             }
217 |             else
218 |             {
219 |                 *message = "After a first byte of F1, F2, or F3, expecting three following bytes.";
220 |                 *faulty_bytes = 1;
221 |                 return i;
222 |             }
223 |             i += 4;
224 |         }
225 |         else if (str[i] == 0xF4) /* F4 80..8F 80..BF 80..BF */
226 |         {
227 |             if (i + 3 < len) /* Expect a 2nd, 3rd 3th byte */
228 |             {
229 |                 if (str[i + 1] < 0x80 || str[i + 1] > 0x8F)
230 |                 {
231 |                     *message = "After a first byte of F4, expecting 2nd byte between 80 and 8F.";
232 |                     *faulty_bytes = 2;
233 |                     return i;
234 |                 }
235 |                 if (str[i + 2] < 0x80 || str[i + 2] > 0xBF)
236 |                 {
237 |                     *message = "After a first byte of F4, expecting 3rd byte between 80 and BF.";
238 |                     *faulty_bytes = 3;
239 |                     return i;
240 |                 }
241 |                 if (str[i + 3] < 0x80 || str[i + 3] > 0xBF)
242 |                 {
243 |                     *message = "After a first byte of F4, expecting 4th byte between 80 and BF.";
244 |                     *faulty_bytes = 4;
245 |                     return i;
246 |                 }
247 |             }
248 |             else
249 |             {
250 |                 *message = "After a first byte of F4, expecting three following bytes.";
251 |                 *faulty_bytes = 1;
252 |                 return i;
253 |             }
254 |             i += 4;
255 |         }
256 |         else
257 |         {
258 |             *message = "Expecting bytes in the following ranges: 00..7F C2..F4.";
259 |             *faulty_bytes = 1;
260 |             return i;
261 |         }
262 |     }
263 |     message = NULL;
264 |     return 0;
265 | }
266 | 


--------------------------------------------------------------------------------