├── .gitignore ├── README.md ├── rules.py ├── test.lua └── z_validate.c /.gitignore: -------------------------------------------------------------------------------- 1 | /_out/ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # faster-utf8-validator 2 | This library is a very fast UTF-8 validator using AVX2/SSE4 instructions. As 3 | far as I am aware, it is the fastest validator in the world on the CPUs that 4 | support these instructions (...and not AVX-512). Using AVX2, it can validate 5 | random UTF-8 text as fast as 0.26 cycles/byte, and random ASCII text at 0.09 6 | cycles/byte. For UTF-8, this is roughly 1.5-1.7x faster than the 7 | [fastvalidate-utf-8](https://github.com/lemire/fastvalidate-utf-8) library. 8 | 9 | This repository contains the library (one C file), a build script for the 10 | [make.py](https://github.com/zwegner/make.py) build system, and a Lua test 11 | script (which requires LuaJIT due to use of the `ffi` module). 12 | 13 | A detailed description of the algorithm can be found in `z_validate.c`. 14 | This algorithm should map fairly nicely to AVX-512, and should in fact be a 15 | bit faster than 2x the speed of AVX2 since a few instructions can be saved. 16 | But I don't have an AVX-512 machine, so I haven't tried it yet. 17 | 18 | Benchmark 19 | ---- 20 | Here are some raw numbers, measured on my 2.4 GHz Haswell laptop, using a modified 21 | version of the benchmark in the fastvalidate-utf-8 repository. There are four 22 | configurations of test input: random UTF-8 bytes or random ASCII bytes, and 23 | either 64K bytes or 16M bytes. All measurements are the best of 50 runs, with 24 | each run using a different random seed, but each validator is tested with the 25 | same seeds (and thus the same inputs). All numbers are in cycles per byte. 26 | The first two rows are the fastvalidate-utf-8 AVX2 functions, and the last two 27 | rows are this library, using AVX2 and SSE4 instruction sets.
28 | 29 | | Validator | 64K UTF-8 | 64K ASCII | 16M UTF-8 | 16M ASCII | 30 | | ---------------------------------- | --------- | --------- | --------- | --------- | 31 | | `validate_utf8_fast_avx` | 0.410 | 0.410 | 0.496 | 0.429 | 32 | | `validate_utf8_fast_avx_asciipath` | 0.436 | 0.074 | 0.457 | 0.156 | 33 | | `z_validate_utf8_avx2` | 0.264 | 0.079 | 0.290 | 0.160 | 34 | | `z_validate_utf8_sse4` | 0.568 | 0.163 | 0.596 | 0.202 | 35 | -------------------------------------------------------------------------------- /rules.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def rules(ctx): 4 | files = ['z_validate'] 5 | c_flags = ['-fcolor-diagnostics', '-std=gnu11', '-march=native', 6 | '-Wall', '-Wextra', '-Werror'] 7 | configs = [ 8 | ['avx2/rel', ['-DAVX2', '-O3']], 9 | ['avx2/deb', ['-DAVX2', '-g']], 10 | ['sse4/rel', ['-DSSE4', '-O3']], 11 | ['sse4/deb', ['-DSSE4', '-g']], 12 | ] 13 | 14 | for [conf_path, conf_flags] in configs: 15 | o_files = [] 16 | for f in files: 17 | c_file = '%s.c' % f 18 | o_file = '_out/%s/%s.o' % (conf_path, f) 19 | d_file = '_out/%s/%s.d' % (conf_path, f) 20 | cmd = ['cc', '-o', o_file, '-c', c_file, '-MD', *c_flags, 21 | *conf_flags] 22 | ctx.add_rule(o_file, [c_file], cmd, d_file=d_file) 23 | o_files.append(o_file) 24 | 25 | # Main shared library 26 | bin_file = '_out/%s/zval.so' % conf_path 27 | ctx.add_rule(bin_file, o_files, 28 | ['cc', '-shared', '-o', bin_file, *c_flags, *o_files]) 29 | -------------------------------------------------------------------------------- /test.lua: -------------------------------------------------------------------------------- 1 | -- Load the library 2 | local ffi = require('ffi') 3 | local lib_avx2 = ffi.load('_out/avx2/rel/zval.so') 4 | local lib_sse4 = ffi.load('_out/sse4/rel/zval.so') 5 | ffi.cdef([[ 6 | bool z_validate_utf8_avx2(const char *data, size_t len); 7 | bool z_validate_utf8_sse4(const char *data, size_t len); 8 | ]]) 9 | 10 | local VALIDATORS = { 11 | lib_avx2.z_validate_utf8_avx2, 12 | lib_sse4.z_validate_utf8_sse4 13 | } 14 | 15 | -- Ranges for certain kinds of bytes 16 | local ANY = { 0, 0xFF } 17 | local ASCII = { 0, 0x7F } 18 | local CONT = { 0x80, 0xBF } 19 | 20 | -- Test cases. Format is { expected-result, byte-ranges... } where byte-ranges 21 | -- are 2-element tables { lo, hi }. For each byte, all byte values between the 22 | -- corresponding lo and hi values are tested. 23 | local TEST_CASES = { 24 | -- ASCII. 
First byte is ' ' for keeping combinatorial explosions down 25 | { true, { 0x20, 0x20 }, ASCII, ASCII, ASCII }, 26 | 27 | -- 2-byte sequences 28 | { false, { 0xC2, 0xC2 }, }, 29 | { false, { 0xC2, 0xC2 }, ASCII }, 30 | { true, { 0xC2, 0xC2 }, CONT }, 31 | { false, { 0xC2, 0xC2 }, { 0xC0, 0xFF} }, 32 | { false, { 0xC2, 0xC2 }, CONT, CONT }, 33 | { false, { 0xC2, 0xC2 }, CONT, CONT, CONT }, 34 | 35 | -- 3-byte sequences 36 | { false, { 0xE1, 0xE1 }, }, 37 | { false, { 0xE1, 0xE1 }, CONT }, 38 | { true, { 0xE1, 0xE1 }, CONT, CONT }, 39 | { true, { 0xE1, 0xE1 }, CONT, CONT, ASCII }, 40 | { false, { 0xE1, 0xE1 }, CONT, ASCII }, 41 | { false, { 0xE1, 0xE1 }, CONT, CONT, CONT }, 42 | 43 | -- 4-byte sequences 44 | { false, { 0xF1, 0xF1 }, }, 45 | { false, { 0xF1, 0xF1 }, CONT }, 46 | { false, { 0xF1, 0xF1 }, CONT, CONT }, 47 | { true, { 0xF1, 0xF1 }, CONT, CONT, CONT }, 48 | { false, { 0xF1, 0xF1 }, CONT, CONT, ASCII }, 49 | { true, { 0xF1, 0xF1 }, CONT, CONT, CONT, ASCII }, 50 | 51 | -- No C0/C1 bytes (overlong) 52 | { false, { 0xC0, 0xC1 }, ANY }, 53 | { false, { 0xC0, 0xC1 }, ANY, ANY }, 54 | { false, { 0xC0, 0xC1 }, ANY, ANY, ANY }, 55 | 56 | -- No E0 followed by 80..9F (overlong) 57 | { false, { 0xE0, 0xE0 }, { 0x00, 0x9F }, CONT }, 58 | { true, { 0xE0, 0xE0 }, { 0xA0, 0xBF }, CONT }, 59 | 60 | -- No surrogate pairs 61 | { true, { 0xE1, 0xEC }, CONT, CONT }, 62 | { true, { 0xED, 0xED }, { 0x80, 0x9F }, CONT }, 63 | { false, { 0xED, 0xED }, { 0xA0, 0xBF }, CONT }, 64 | { true, { 0xEE, 0xEF }, CONT, CONT }, 65 | 66 | -- No F0 followed by 80..8F (overlong) 67 | { false, { 0xF0, 0xF0 }, { 0x80, 0x8F }, CONT, CONT }, 68 | { true, { 0xF0, 0xF0 }, { 0x90, 0xBF }, CONT, CONT }, 69 | 70 | -- No code points above U+10FFFF 71 | { true, { 0xF4, 0xF4 }, { 0x80, 0x8F }, CONT, CONT }, 72 | { false, { 0xF4, 0xF4 }, { 0x90, 0xBF }, CONT, CONT }, 73 | 74 | -- No bytes above F4 75 | { false, { 0xF5, 0xFF }, ANY }, 76 | { false, { 0xF5, 0xFF }, ANY, ANY }, 77 | { false, { 0xF5, 0xFF }, ANY, ANY, ANY }, 78 | } 79 | 80 | -- Array string 81 | function astr(array) 82 | local r = '{' 83 | for _, value in ipairs(array) do 84 | r = r .. ('%2X'):format(value) .. ',' 85 | end 86 | return r .. 
'}' 87 | end 88 | 89 | -- A little helper function for running an input on each validator 90 | function test_validators(str, len, buffer, expected, count, fails) 91 | for _, validate in ipairs(VALIDATORS) do 92 | local result = validate(str, len) 93 | if result ~= expected then 94 | fails = fails + 1 95 | print('failure:', result, expected, astr(buffer)) 96 | assert(false) 97 | end 98 | count = count + 1 99 | end 100 | return count, fails 101 | end 102 | 103 | local count, fails = 0, 0 104 | for idx, test in ipairs(TEST_CASES) do 105 | local expected = table.remove(test, 1) 106 | local lo_1, hi_1 = unpack(table.remove(test, 1)) 107 | 108 | -- Loop through various frame shifts, to make sure we catch any issues due 109 | -- to vector alignment 110 | for _, k in ipairs{1, 10, 28, 29, 30, 31, 32, 33} do 111 | local buffer = {} 112 | for j = 1, 64 do buffer[j] = 0 end 113 | 114 | local last_count = count 115 | 116 | -- Loop through first byte 117 | for b = lo_1, hi_1 do 118 | buffer[k] = b 119 | 120 | -- Find maximum range of values in remaining bytes 121 | for offset = 0, 255 do 122 | local any_valid = false 123 | for i, range in ipairs(test) do 124 | i = i + k 125 | local lo_2, hi_2 = unpack(range) 126 | buffer[i] = lo_2 + offset 127 | if buffer[i] > hi_2 then 128 | buffer[i] = hi_2 129 | else 130 | any_valid = true 131 | end 132 | end 133 | -- Break if we've run through the range of all bytes 134 | if #test > 0 and not any_valid then 135 | break 136 | end 137 | 138 | -- Run the validators 139 | local str = ffi.string(string.char(unpack(buffer)), #buffer) 140 | count, fails = test_validators(str, #buffer, buffer, expected, 141 | count, fails) 142 | end 143 | end 144 | 145 | -- Make sure we're running tests 146 | assert(count > last_count) 147 | end 148 | end 149 | 150 | -- Test that we're correctly dealing with input lengths, by feeding buffers 151 | -- with invalid bytes before and after the given range 152 | local TRAILING_TESTS = { 153 | { true, }, 154 | { true, 0x40 }, 155 | { true, 0xC2, 0x80 }, 156 | { true, 0xE0, 0xA0, 0x80 }, 157 | { true, 0xE1, 0x80, 0x80 }, 158 | { true, 0xED, 0x80, 0x80 }, 159 | { true, 0xED, 0x80, 0x80 }, 160 | { true, 0xF4, 0x8F, 0x80, 0x80 }, 161 | { false, 0xC2, }, 162 | { false, 0xE1, 0x80 }, 163 | { false, 0xF4, 0x80, 0x80 }, 164 | } 165 | 166 | for _, test in ipairs(TRAILING_TESTS) do 167 | local expected = table.remove(test, 1) 168 | for pre = 0, 40 do 169 | for post = 0, 40 do 170 | local buffer = {} 171 | local len = pre + #test + post 172 | -- Fill in invalid bytes everywhere 173 | for j = 1, 128 do buffer[j] = 0xFF end 174 | -- Fill in valid bytes in the range being tested 175 | for j = 2, len+1 do buffer[j] = 0x20 end 176 | -- Fill in the test sequence 177 | for j = 1, #test do buffer[1+pre+j] = test[j] end 178 | 179 | local _str = ffi.string(string.char(unpack(buffer)), #buffer) 180 | local str = ffi.cast('const char *', _str) + 1 181 | count, fails = test_validators(str, len, buffer, expected, 182 | count, fails) 183 | end 184 | end 185 | end 186 | 187 | print(('passed %d/%d tests'):format(count - fails, count)) 188 | -------------------------------------------------------------------------------- /z_validate.c: -------------------------------------------------------------------------------- 1 | // faster-utf8-validator 2 | // 3 | // Copyright (c) 2019 Zach Wegner 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the
Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in 13 | // all copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | #include <immintrin.h> 24 | #include <stdint.h> 25 | 26 | // How this validator works: 27 | // 28 | // [[[ UTF-8 refresher: UTF-8 encodes text in sequences of "code points", 29 | // each one from 1-4 bytes. For each code point that is longer than one byte, 30 | // the code point begins with a unique prefix that specifies how many bytes 31 | // follow. All bytes in the code point after this first have a continuation 32 | // marker. All code points in UTF-8 will thus look like one of the following 33 | // binary sequences, with x meaning "don't care": 34 | // 1 byte: 0xxxxxxx 35 | // 2 bytes: 110xxxxx 10xxxxxx 36 | // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx 37 | // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 38 | // ]]] 39 | // 40 | // This validator works in two basic steps: checking continuation bytes, and 41 | // handling special cases. Each step works on one vector's worth of input 42 | // bytes at a time. 43 | // 44 | // The continuation bytes are handled in a fairly straightforward manner in 45 | // the scalar domain. A mask is created from the input byte vector for each 46 | // of the highest four bits of every byte. The first mask allows us to quickly 47 | // skip pure ASCII input vectors, for which that mask has no bits set. The first and 48 | // (inverted) second masks together give us every continuation byte (10xxxxxx). 49 | // The other masks are used to find prefixes of multi-byte code points (110, 50 | // 1110, 11110). For these, we keep a "required continuation" mask, by shifting 51 | // these masks 1, 2, and 3 bits respectively forward in the byte stream. That 52 | // is, we take a mask of all bytes that start with 11, and shift it left one 53 | // bit forward to get the mask of all the first continuation bytes, then do the 54 | // same for the second and third continuation bytes. Here's an example input 55 | // sequence along with the corresponding masks: 56 | // 57 | // bytes: 61 C3 80 62 E0 A0 80 63 F0 90 80 80 00 58 | // code points: 61|C3 80|62|E0 A0 80|63|F0 90 80 80|00 59 | // # of bytes: 1 |2 - |1 |3 - - |1 |4 - - - |1 60 | // cont. mask 1: - - 1 - - 1 - - - 1 - - - 61 | // cont. mask 2: - - - - - - 1 - - - 1 - - 62 | // cont. mask 3: - - - - - - - - - - - 1 - 63 | // cont. mask *: 0 0 1 0 0 1 1 0 0 1 1 1 0 64 | // 65 | // The final required continuation mask is then compared with the mask of 66 | // actual continuation bytes, and must match exactly in valid UTF-8.
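//
// For reference, the structural rule being enforced here can also be written as
// plain scalar code. This is an illustrative sketch only (the helper name
// check_continuations is made up for this comment, and it is not part of the
// library); the real implementation below computes the same answer for a whole
// vector at once using the shifted-mask comparison described above:
//
//     static int check_continuations(const uint8_t *p, size_t len) {
//         size_t need = 0;                      // continuation bytes still owed
//         for (size_t i = 0; i < len; i++) {
//             int is_cont = (p[i] & 0xC0) == 0x80;
//             if (need > 0) {
//                 if (!is_cont) return 0;       // required continuation missing
//                 need--;
//             } else if (is_cont) {
//                 return 0;                     // stray continuation byte
//             } else if ((p[i] & 0xE0) == 0xC0) {
//                 need = 1;                     // 110xxxxx leader: one continuation
//             } else if ((p[i] & 0xF0) == 0xE0) {
//                 need = 2;                     // 1110xxxx leader: two continuations
//             } else if ((p[i] & 0xF8) == 0xF0) {
//                 need = 3;                     // 11110xxx leader: three continuations
//             }
//         }
//         return need == 0;
//     }
//
// (Illegal leader bytes such as F8..FF are ignored by this sketch; in the real
// code they are rejected by the special-case tables described below.)
//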
The only 67 | // complication in this step is that the shifted masks can cross vector 68 | // boundaries, so we need to keep a "carry" mask of the bits that were shifted 69 | // past the boundary in the last loop iteration. 70 | // 71 | // Besides the basic prefix coding of UTF-8, there are several invalid byte 72 | // sequences that need special handling. These are due to three factors: 73 | // code points that could be encoded in fewer bytes, code points that are 74 | // part of a surrogate pair (which are only valid in UTF-16), and code points 75 | // that are past the highest valid code point U+10FFFF. 76 | // 77 | // All of the invalid sequences can be detected by independently observing 78 | // the first three nibbles of each code point. Since AVX2 can do a 16-entry (4-bit index) table 79 | // lookup in parallel for all 32 bytes in a vector, we can create bit masks 80 | // for all of these error conditions, look up the bit masks for the three 81 | // nibbles for all input bytes, and AND them together to get a final error mask, 82 | // which must be all zero for valid UTF-8. This is somewhat complicated by 83 | // needing to shift the error masks from the first and second nibbles forward in 84 | // the byte stream to line up with the third nibble. 85 | // 86 | // We have these possible values for valid UTF-8 sequences, broken down 87 | // by the first three nibbles: 88 | // 89 | // 1st 2nd 3rd comment 90 | // 0..7 0..F ASCII 91 | // 8..B 0..F continuation bytes 92 | // C 2..F 8..B C0 xx and C1 xx can be encoded in 1 byte 93 | // D 0..F 8..B D0..DF are valid with a continuation byte 94 | // E 0 A..B E0 8x and E0 9x can be encoded with 2 bytes 95 | // 1..C 8..B E1..EC are valid with continuation bytes 96 | // D 8..9 ED Ax and ED Bx correspond to surrogate pairs 97 | // E..F 8..B EE..EF are valid with continuation bytes 98 | // F 0 9..B F0 8x can be encoded with 3 bytes 99 | // 1..3 8..B F1..F3 are valid with continuation bytes 100 | // 4 8 F4 8F BF BF is the maximum valid code point 101 | // 102 | // That leaves us with these invalid sequences, which would otherwise fit 103 | // into UTF-8's prefix encoding. Each of these invalid sequences needs to 104 | // be detected separately, each with its own bit in the error mask. 105 | // 106 | // 1st 2nd 3rd error bit 107 | // C 0..1 0..F 0x01 108 | // E 0 8..9 0x02 109 | // D A..B 0x04 110 | // F 0 0..8 0x08 111 | // 4 9..F 0x10 112 | // 5..F 0..F 0x20 113 | // 114 | // For every possible value of the first, second, and third nibbles, we keep 115 | // a lookup table that contains the bitwise OR of all errors that that nibble 116 | // value can cause. For example, the first nibble has zeroes in every entry 117 | // except for C, E, and F, and the third nibble lookup has the 0x21 bits in 118 | // every entry, since those errors don't depend on the third nibble. After 119 | // doing a parallel lookup of the first/second/third nibble values for all 120 | // bytes, we AND them together. Only when all three have an error bit in common 121 | // do we fail validation. 122 | 123 | #if defined(AVX2) 124 | 125 | // AVX2 definitions 126 | 127 | # define z_validate_utf8 z_validate_utf8_avx2 128 | # define z_validate_vec z_validate_vec_avx2 129 | 130 | # define V_LEN (32) 131 | 132 | // Vector and vector mask types.
We use #defines instead of typedefs so this 133 | // header can be included multiple times with different configurations 134 | 135 | # define vec_t __m256i 136 | # define vmask_t uint32_t 137 | # define vmask2_t uint64_t 138 | 139 | # define v_load(x) _mm256_loadu_si256((vec_t *)(x)) 140 | # define v_set1 _mm256_set1_epi8 141 | # define v_and _mm256_and_si256 142 | 143 | # define v_test_bit(input, bit) \ 144 | _mm256_movemask_epi8(_mm256_slli_epi16((input), 7 - (bit))) 145 | 146 | // Parallel table lookup for all bytes in a vector. We need to AND with 0x0F 147 | // for the lookup, because vpshufb has the neat "feature" that negative values 148 | // in an index byte will result in a zero. 149 | 150 | # define v_lookup(table, index, shift) \ 151 | _mm256_shuffle_epi8((table), \ 152 | v_and(_mm256_srli_epi16((index), (shift)), v_set1(0x0F))) 153 | 154 | # define v_testz _mm256_testz_si256 155 | 156 | // Simple macro to make a vector lookup table for use with vpshufb. Since 157 | // AVX2 is two 16-byte halves, we duplicate the input values. 158 | 159 | # define V_TABLE_16(...) _mm256_setr_epi8(__VA_ARGS__, __VA_ARGS__) 160 | 161 | # define v_shift_lanes_left v_shift_lanes_left_avx2 162 | 163 | // Move all the bytes in "input" to the left by one and fill in the first byte 164 | // with zero. Since AVX2 generally works on two separate 16-byte vectors glued 165 | // together, this needs two steps. The permute2x128 takes the middle 32 bytes 166 | // of the 64-byte concatenation v_zero:input. The align then gives the final 167 | // result in each half: 168 | // top half: input_L:input_H --> input_L[15]:input_H[0:14] 169 | // bottom half: zero_H:input_L --> zero_H[15]:input_L[0:14] 170 | static inline vec_t v_shift_lanes_left(vec_t input) { 171 | vec_t zero = v_set1(0); 172 | vec_t shl_16 = _mm256_permute2x128_si256(input, zero, 0x03); 173 | return _mm256_alignr_epi8(input, shl_16, 15); 174 | } 175 | 176 | #elif defined(SSE4) 177 | 178 | // SSE definitions. We require at least SSE4.1 for _mm_test_all_zeros() 179 | 180 | # define z_validate_utf8 z_validate_utf8_sse4 181 | # define z_validate_vec z_validate_vec_sse4 182 | 183 | # define V_LEN (16) 184 | 185 | # define vec_t __m128i 186 | # define vmask_t uint16_t 187 | # define vmask2_t uint32_t 188 | 189 | # define v_load(x) _mm_lddqu_si128((vec_t *)(x)) 190 | # define v_set1 _mm_set1_epi8 191 | # define v_and _mm_and_si128 192 | # define v_testz _mm_test_all_zeros 193 | 194 | # define v_test_bit(input, bit) \ 195 | _mm_movemask_epi8(_mm_slli_epi16((input), (uint8_t)(7 - (bit)))) 196 | 197 | # define v_lookup(table, index, shift) \ 198 | _mm_shuffle_epi8((table), \ 199 | v_and(_mm_srli_epi16((index), (shift)), v_set1(0x0F))) 200 | 201 | # define V_TABLE_16(...) 
_mm_setr_epi8(__VA_ARGS__) 202 | 203 | # define v_shift_lanes_left v_shift_lanes_left_sse4 204 | static inline vec_t v_shift_lanes_left(vec_t top) { 205 | return _mm_alignr_epi8(top, v_set1(0), 15); 206 | } 207 | 208 | #else 209 | 210 | # error "No valid configuration: must define one of AVX2 or SSE4" 211 | 212 | #endif 213 | 214 | // Validate one vector's worth of input bytes 215 | inline int z_validate_vec(vec_t bytes, vec_t shifted_bytes, vmask_t *last_cont) { 216 | // Error lookup tables for the first, second, and third nibbles 217 | const vec_t error_1 = V_TABLE_16( 218 | 0x00, 0x00, 0x00, 0x00, 219 | 0x00, 0x00, 0x00, 0x00, 220 | 0x00, 0x00, 0x00, 0x00, 221 | 0x01, 0x00, 0x06, 0x38 222 | ); 223 | const vec_t error_2 = V_TABLE_16( 224 | 0x0B, 0x01, 0x00, 0x00, 225 | 0x10, 0x20, 0x20, 0x20, 226 | 0x20, 0x20, 0x20, 0x20, 227 | 0x20, 0x24, 0x20, 0x20 228 | ); 229 | const vec_t error_3 = V_TABLE_16( 230 | 0x29, 0x29, 0x29, 0x29, 231 | 0x29, 0x29, 0x29, 0x29, 232 | 0x2B, 0x33, 0x35, 0x35, 233 | 0x31, 0x31, 0x31, 0x31 234 | ); 235 | 236 | // Quick skip for ASCII-only input. If there are no bytes with the high bit 237 | // set, we don't need to do any more work. We return either valid or 238 | // invalid based on whether we expected any continuation bytes here. 239 | vmask_t high = v_test_bit(bytes, 7); 240 | if (!high) 241 | return *last_cont == 0; 242 | 243 | // Which bytes are required to be continuation bytes 244 | vmask2_t req = *last_cont; 245 | // A bitmask of the actual continuation bytes in the input 246 | vmask_t cont; 247 | 248 | // Compute the continuation byte mask by finding bytes that start with 249 | // 11x, 111x, and 1111. For each of these prefixes, we get a bitmask 250 | // and shift it forward by 1, 2, or 3. This loop should be unrolled by 251 | // the compiler, and the (n == 1) branch inside eliminated. 252 | vmask_t set = high; 253 | for (int n = 1; n <= 3; n++) { 254 | set &= v_test_bit(bytes, 7 - n); 255 | // Mark continuation bytes: those that have the high bit set but 256 | // not the next one 257 | if (n == 1) 258 | cont = high ^ set; 259 | 260 | // We add the shifted mask here instead of ORing it, which would 261 | // be the more natural operation, so that this line can be done 262 | // with one lea. While adding could give a different result due 263 | // to carries, this will only happen for invalid UTF-8 sequences, 264 | // and in a way that won't cause it to pass validation. Reasoning: 265 | // Any bits for required continuation bytes come after the bits 266 | // for their leader bytes, and are all contiguous. For a carry to 267 | // happen, two of these bit sequences would have to overlap. If 268 | // this is the case, there is a leader byte before the second set 269 | // of required continuation bytes (and thus before the bit that 270 | // will be cleared by a carry). This leader byte will not be 271 | // in the continuation mask, despite being required. QEDish. 272 | req += (vmask2_t)set << n; 273 | } 274 | // Check that continuation bytes match. We must cast req from vmask2_t 275 | // (which holds the carry mask in the upper half) to vmask_t, which 276 | // zeroes out the upper bits 277 | if (cont != (vmask_t)req) 278 | return 0; 279 | 280 | // Look up error masks for three consecutive nibbles.
281 | vec_t e_1 = v_lookup(error_1, shifted_bytes, 4); 282 | vec_t e_2 = v_lookup(error_2, shifted_bytes, 0); 283 | vec_t e_3 = v_lookup(error_3, bytes, 4); 284 | 285 | // Check if any bits are set in all three error masks 286 | if (!v_testz(v_and(e_1, e_2), e_3)) 287 | return 0; 288 | 289 | // Save continuation bits and input bytes for the next round 290 | *last_cont = req >> V_LEN; 291 | return 1; 292 | } 293 | 294 | int z_validate_utf8(const char *data, size_t len) { 295 | vec_t bytes, shifted_bytes; 296 | 297 | // Keep continuation bits from the previous iteration that carry over to 298 | // each input chunk vector 299 | vmask_t last_cont = 0; 300 | 301 | size_t offset = 0; 302 | // Deal with the input up until the last section of bytes 303 | if (len >= V_LEN) { 304 | // We need a vector of the input byte stream shifted forward one byte. 305 | // Since we don't want to read the memory before the data pointer 306 | // (which might not even be mapped), for the first chunk of input just 307 | // use vector instructions. 308 | shifted_bytes = v_shift_lanes_left(v_load(data)); 309 | 310 | // Loop over input in V_LEN-byte chunks, as long as we can safely read 311 | // that far into memory 312 | for (; offset + V_LEN < len; offset += V_LEN) { 313 | bytes = v_load(data + offset); 314 | if (!z_validate_vec(bytes, shifted_bytes, &last_cont)) 315 | return 0; 316 | shifted_bytes = v_load(data + offset + V_LEN - 1); 317 | } 318 | } 319 | // Deal with any bytes remaining. Rather than making a separate scalar path, 320 | // just fill in a buffer, reading bytes only up to len, and load from that. 321 | if (offset < len) { 322 | char buffer[V_LEN + 1] = { 0 }; 323 | if (offset > 0) 324 | buffer[0] = data[offset - 1]; 325 | for (int i = 0; i < (int)(len - offset); i++) 326 | buffer[i + 1] = data[offset + i]; 327 | 328 | bytes = v_load(buffer + 1); 329 | shifted_bytes = v_load(buffer); 330 | if (!z_validate_vec(bytes, shifted_bytes, &last_cont)) 331 | return 0; 332 | } 333 | 334 | // The input is valid if we don't have any more expected continuation bytes 335 | return last_cont == 0; 336 | } 337 | 338 | // Undefine all macros 339 | 340 | #undef z_validate_utf8 341 | #undef z_validate_vec 342 | #undef V_LEN 343 | #undef vec_t 344 | #undef vmask_t 345 | #undef vmask2_t 346 | #undef v_load 347 | #undef v_set1 348 | #undef v_and 349 | #undef v_test_bit 350 | #undef v_testz 351 | #undef v_lookup 352 | #undef V_TABLE_16 353 | #undef v_shift_lanes_left 354 | --------------------------------------------------------------------------------
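
For anyone who wants to exercise the validator without going through make.py or LuaJIT, here is one way it could be driven directly from C. This is a sketch, not part of the repository: the file name `example_main.c`, the build lines, and the sample strings are assumptions made for illustration; the symbol name and signature come from `z_validate.c` above (built with `-DAVX2`, `z_validate_utf8` is compiled as `z_validate_utf8_avx2`, and it requires an AVX2-capable CPU).

```c
// example_main.c -- hypothetical standalone caller, not part of the repository.
//
// One possible build, mirroring the compiler flags in rules.py:
//   cc -O3 -march=native -DAVX2 -c z_validate.c
//   cc -O3 example_main.c z_validate.o -o example
#include <stdio.h>
#include <string.h>

// Defined in z_validate.c; with -DAVX2 the function is compiled under this name.
int z_validate_utf8_avx2(const char *data, size_t len);

int main(void) {
    const char *ok  = "caf\xC3\xA9";      // "café": C3 A9 is a valid 2-byte sequence
    const char *bad = "\xED\xA0\x80";     // a UTF-16 surrogate encoded directly: invalid
    printf("ok  -> %d\n", z_validate_utf8_avx2(ok, strlen(ok)));    // prints 1
    printf("bad -> %d\n", z_validate_utf8_avx2(bad, strlen(bad)));  // prints 0
    return 0;
}
```

The repository's own test driver is `test.lua` above; once the shared libraries have been built into `_out/`, it can be run from the repository root with `luajit test.lua`.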