├── .gitignore ├── README.md ├── rules.py ├── test.lua └── z_validate.c /.gitignore: -------------------------------------------------------------------------------- 1 | /_out/ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # faster-utf8-validator 2 | This library is a very fast UTF-8 validator using AVX2/SSE4 instructions. As 3 | far as I am aware, it is the fastest validator in the world on the CPUs that 4 | support these instructions (...and not AVX-512). Using AVX2, it can validate 5 | random UTF-8 text as fast as 0.26 cycles/byte, and random ASCII text at 0.09 6 | cycles/byte. For UTF-8, this is roughly 1.5-1.7x faster than the 7 | [fastvalidate-utf-8](https://github.com/lemire/fastvalidate-utf-8) library. 8 | 9 | This repository contains the library (one C file), a build script for the 10 | [make.py](https://github.com/zwegner/make.py) build system, and a Lua test 11 | script (which requires LuaJIT due to use of the `ffi` module). 12 | 13 | A detailed description of the algorithm can be found in `z_validate.c`. 14 | This algorithm should map fairly nicely to AVX-512, and should in fact be a 15 | bit faster than 2x the speed of AVX2 since a few instructions can be saved. 16 | But I don't have an AVX-512 machine, so I haven't tried it yet. 17 | 18 | Benchmark 19 | ---- 20 | Here are some raw numbers, measured on my 2.4 GHz Haswell laptop, using a modified 21 | version of the benchmark in the fastvalidate-utf-8 repository. There are four 22 | configurations of test input: random UTF-8 bytes or random ASCII bytes, and 23 | either 64K bytes or 16M bytes. All measurements are the best of 50 runs, with 24 | each run using a different random seed, but each validator is tested with the 25 | same seeds (and thus the same inputs). All numbers are in cycles per byte. 26 | The first two rows are the fastvalidate-utf-8 AVX2 functions, and the last two 27 | rows are this library, using AVX2 and SSE4 instruction sets.
28 | 29 | | Validator | 64K UTF-8 | 64K ASCII | 16M UTF-8 | 16M ASCII | 30 | | ---------------------------------- | --------- | --------- | --------- | --------- | 31 | | `validate_utf8_fast_avx` | 0.410 | 0.410 | 0.496 | 0.429 | 32 | | `validate_utf8_fast_avx_asciipath` | 0.436 | 0.074 | 0.457 | 0.156 | 33 | | `z_validate_utf8_avx2` | 0.264 | 0.079 | 0.290 | 0.160 | 34 | | `z_validate_utf8_sse4` | 0.568 | 0.163 | 0.596 | 0.202 | 35 | -------------------------------------------------------------------------------- /rules.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def rules(ctx): 4 | files = ['z_validate'] 5 | c_flags = ['-fcolor-diagnostics', '-std=gnu11', '-march=native', 6 | '-Wall', '-Wextra', '-Werror'] 7 | configs = [ 8 | ['avx2/rel', ['-DAVX2', '-O3']], 9 | ['avx2/deb', ['-DAVX2', '-g']], 10 | ['sse4/rel', ['-DSSE4', '-O3']], 11 | ['sse4/deb', ['-DSSE4', '-g']], 12 | ] 13 | 14 | for [conf_path, conf_flags] in configs: 15 | o_files = [] 16 | for f in files: 17 | c_file = '%s.c' % f 18 | o_file = '_out/%s/%s.o' % (conf_path, f) 19 | d_file = '_out/%s/%s.d' % (conf_path, f) 20 | cmd = ['cc', '-o', o_file, '-c', c_file, '-MD', *c_flags, 21 | *conf_flags] 22 | ctx.add_rule(o_file, [c_file], cmd, d_file=d_file) 23 | o_files.append(o_file) 24 | 25 | # Main shared library 26 | bin_file = '_out/%s/zval.so' % conf_path 27 | ctx.add_rule(bin_file, o_files, 28 | ['cc', '-shared', '-o', bin_file, *c_flags, *o_files]) 29 | -------------------------------------------------------------------------------- /test.lua: -------------------------------------------------------------------------------- 1 | -- Load the library 2 | local ffi = require('ffi') 3 | local lib_avx2 = ffi.load('_out/avx2/rel/zval.so') 4 | local lib_sse4 = ffi.load('_out/sse4/rel/zval.so') 5 | ffi.cdef([[ 6 | bool z_validate_utf8_avx2(const char *data, size_t len); 7 | bool z_validate_utf8_sse4(const char *data, size_t len); 8 | ]]) 9 | 10 | local VALIDATORS = { 11 | lib_avx2.z_validate_utf8_avx2, 12 | lib_sse4.z_validate_utf8_sse4 13 | } 14 | 15 | -- Ranges for certain kinds of bytes 16 | local ANY = { 0, 0xFF } 17 | local ASCII = { 0, 0x7F } 18 | local CONT = { 0x80, 0xBF } 19 | 20 | -- Test cases. Format is { expected-result, byte-ranges... } where byte-ranges 21 | -- are 2-element tables { lo, hi }. For each byte, all byte values between the 22 | -- corresponding lo and hi values are tested. 23 | local TEST_CASES = { 24 | -- ASCII. 
First byte is ' ' for keeping combinatorial explosions down 25 | { true, { 0x20, 0x20 }, ASCII, ASCII, ASCII }, 26 | 27 | -- 2-byte sequences 28 | { false, { 0xC2, 0xC2 }, }, 29 | { false, { 0xC2, 0xC2 }, ASCII }, 30 | { true, { 0xC2, 0xC2 }, CONT }, 31 | { false, { 0xC2, 0xC2 }, { 0xC0, 0xFF} }, 32 | { false, { 0xC2, 0xC2 }, CONT, CONT }, 33 | { false, { 0xC2, 0xC2 }, CONT, CONT, CONT }, 34 | 35 | -- 3-byte sequences 36 | { false, { 0xE1, 0xE1 }, }, 37 | { false, { 0xE1, 0xE1 }, CONT }, 38 | { true, { 0xE1, 0xE1 }, CONT, CONT }, 39 | { true, { 0xE1, 0xE1 }, CONT, CONT, ASCII }, 40 | { false, { 0xE1, 0xE1 }, CONT, ASCII }, 41 | { false, { 0xE1, 0xE1 }, CONT, CONT, CONT }, 42 | 43 | -- 4-byte sequences 44 | { false, { 0xF1, 0xF1 }, }, 45 | { false, { 0xF1, 0xF1 }, CONT }, 46 | { false, { 0xF1, 0xF1 }, CONT, CONT }, 47 | { true, { 0xF1, 0xF1 }, CONT, CONT, CONT }, 48 | { false, { 0xF1, 0xF1 }, CONT, CONT, ASCII }, 49 | { true, { 0xF1, 0xF1 }, CONT, CONT, CONT, ASCII }, 50 | 51 | -- No C0/C1 bytes (overlong) 52 | { false, { 0xC0, 0xC1 }, ANY }, 53 | { false, { 0xC0, 0xC1 }, ANY, ANY }, 54 | { false, { 0xC0, 0xC1 }, ANY, ANY, ANY }, 55 | 56 | -- No E0 followed by 80..9F (overlong) 57 | { false, { 0xE0, 0xE0 }, { 0x00, 0x9F }, CONT }, 58 | { true, { 0xE0, 0xE0 }, { 0xA0, 0xBF }, CONT }, 59 | 60 | -- No surrogate pairs 61 | { true, { 0xE1, 0xEC }, CONT, CONT }, 62 | { true, { 0xED, 0xED }, { 0x80, 0x9F }, CONT }, 63 | { false, { 0xED, 0xED }, { 0xA0, 0xBF }, CONT }, 64 | { true, { 0xEE, 0xEF }, CONT, CONT }, 65 | 66 | -- No F0 followed by 80..8F (overlong) 67 | { false, { 0xF0, 0xF0 }, { 0x80, 0x8F }, CONT, CONT }, 68 | { true, { 0xF0, 0xF0 }, { 0x90, 0xBF }, CONT, CONT }, 69 | 70 | -- No code points above U+10FFFF 71 | { true, { 0xF4, 0xF4 }, { 0x80, 0x8F }, CONT, CONT }, 72 | { false, { 0xF4, 0xF4 }, { 0x90, 0xBF }, CONT, CONT }, 73 | 74 | -- No bytes above F4 75 | { false, { 0xF5, 0xFF }, ANY }, 76 | { false, { 0xF5, 0xFF }, ANY, ANY }, 77 | { false, { 0xF5, 0xFF }, ANY, ANY, ANY }, 78 | } 79 | 80 | -- Array string 81 | function astr(array) 82 | local r = '{' 83 | for _, value in ipairs(array) do 84 | r = r .. ('%2X'):format(value) .. ',' 85 | end 86 | return r .. 
'}' 87 | end 88 | 89 | -- A little helper function for running an input on each validator 90 | function test_validators(str, len, buffer, expected, count, fails) 91 | for _, validate in ipairs(VALIDATORS) do 92 | local result = validate(str, len) 93 | if result ~= expected then 94 | fails = fails + 1 95 | print('failure:', result, expected, astr(buffer)) 96 | assert(false) 97 | end 98 | count = count + 1 99 | end 100 | return count, fails 101 | end 102 | 103 | local count, fails = 0, 0 104 | for idx, test in ipairs(TEST_CASES) do 105 | local expected = table.remove(test, 1) 106 | local lo_1, hi_1 = unpack(table.remove(test, 1)) 107 | 108 | -- Loop through various frame shifts, to make sure we catch any issues due 109 | -- to vector alignment 110 | for _, k in ipairs{1, 10, 28, 29, 30, 31, 32, 33} do 111 | local buffer = {} 112 | for j = 1, 64 do buffer[j] = 0 end 113 | 114 | local last_count = count 115 | 116 | -- Loop through first byte 117 | for b = lo_1, hi_1 do 118 | buffer[k] = b 119 | 120 | -- Find maximum range of values in remaining bytes 121 | for offset = 0, 255 do 122 | local any_valid = false 123 | for i, range in ipairs(test) do 124 | i = i + k 125 | local lo_2, hi_2 = unpack(range) 126 | buffer[i] = lo_2 + offset 127 | if buffer[i] > hi_2 then 128 | buffer[i] = hi_2 129 | else 130 | any_valid = true 131 | end 132 | end 133 | -- Break if we've run through the range of all bytes 134 | if #test > 0 and not any_valid then 135 | break 136 | end 137 | 138 | -- Run the validators 139 | local str = ffi.string(string.char(unpack(buffer)), #buffer) 140 | count, fails = test_validators(str, #buffer, buffer, expected, 141 | count, fails) 142 | end 143 | end 144 | 145 | -- Make sure we're running tests 146 | assert(count > last_count) 147 | end 148 | end 149 | 150 | -- Test that we're correctly dealing with input lengths, by feeding buffers 151 | -- with invalid bytes before and after the given range 152 | local TRAILING_TESTS = { 153 | { true, }, 154 | { true, 0x40 }, 155 | { true, 0xC2, 0x80 }, 156 | { true, 0xE0, 0xA0, 0x80 }, 157 | { true, 0xE1, 0x80, 0x80 }, 158 | { true, 0xED, 0x80, 0x80 }, 159 | { true, 0xED, 0x80, 0x80 }, 160 | { true, 0xF4, 0x8F, 0x80, 0x80 }, 161 | { false, 0xC2, }, 162 | { false, 0xE1, 0x80 }, 163 | { false, 0xF4, 0x80, 0x80 }, 164 | } 165 | 166 | for _, test in ipairs(TRAILING_TESTS) do 167 | local expected = table.remove(test, 1) 168 | for pre = 0, 40 do 169 | for post = 0, 40 do 170 | local buffer = {} 171 | local len = pre + #test + post 172 | -- Fill in invalid bytes everywhere 173 | for j = 1, 128 do buffer[j] = 0xFF end 174 | -- Fill in valid bytes in the range being tested 175 | for j = 2, len+1 do buffer[j] = 0x20 end 176 | -- Fill in the test sequence 177 | for j = 1, #test do buffer[1+pre+j] = test[j] end 178 | 179 | local _str = ffi.string(string.char(unpack(buffer)), #buffer) 180 | local str = ffi.cast('const char *', _str) + 1 181 | count, fails = test_validators(str, len, buffer, expected, 182 | count, fails) 183 | end 184 | end 185 | end 186 | 187 | print(('passed %d/%d tests'):format(count - fails, count)) 188 | -------------------------------------------------------------------------------- /z_validate.c: -------------------------------------------------------------------------------- 1 | // faster-utf8-validator 2 | // 3 | // Copyright (c) 2019 Zach Wegner 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the
Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in 13 | // all copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | #include <immintrin.h> 24 | #include <stdint.h> 25 | 26 | // How this validator works: 27 | // 28 | // [[[ UTF-8 refresher: UTF-8 encodes text in sequences of "code points", 29 | // each one from 1-4 bytes. For each code point that is longer than one byte, 30 | // the code point begins with a unique prefix that specifies how many bytes 31 | // follow. All bytes in the code point after this first have a continuation 32 | // marker. All code points in UTF-8 will thus look like one of the following 33 | // binary sequences, with x meaning "don't care": 34 | // 1 byte: 0xxxxxxx 35 | // 2 bytes: 110xxxxx 10xxxxxx 36 | // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx 37 | // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 38 | // ]]] 39 | // 40 | // This validator works in two basic steps: checking continuation bytes, and 41 | // handling special cases. Each step works on one vector's worth of input 42 | // bytes at a time. 43 | // 44 | // The continuation bytes are handled in a fairly straightforward manner in 45 | // the scalar domain. A mask is created from the input byte vector for each 46 | // of the highest four bits of every byte. The first mask allows us to quickly 47 | // skip pure ASCII input vectors, for which that mask has no bits set. The first and 48 | // (inverted) second masks together give us every continuation byte (10xxxxxx). 49 | // The other masks are used to find prefixes of multi-byte code points (110, 50 | // 1110, 11110). For these, we keep a "required continuation" mask, by shifting 51 | // these masks 1, 2, and 3 bits respectively forward in the byte stream. That 52 | // is, we take a mask of all bytes that start with 11, and shift it left one 53 | // bit forward to get the mask of all the first continuation bytes, then do the 54 | // same for the second and third continuation bytes. Here's an example input 55 | // sequence along with the corresponding masks: 56 | // 57 | // bytes: 61 C3 80 62 E0 A0 80 63 F0 90 80 80 00 58 | // code points: 61|C3 80|62|E0 A0 80|63|F0 90 80 80|00 59 | // # of bytes: 1 |2 - |1 |3 - - |1 |4 - - - |1 60 | // cont. mask 1: - - 1 - - 1 - - - 1 - - - 61 | // cont. mask 2: - - - - - - 1 - - - 1 - - 62 | // cont. mask 3: - - - - - - - - - - - 1 - 63 | // cont. mask *: 0 0 1 0 0 1 1 0 0 1 1 1 0 64 | // 65 | // The final required continuation mask is then compared with the mask of 66 | // actual continuation bytes, and must match exactly in valid UTF-8.
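//
// For reference, the structural rule being enforced here can also be written as
// plain scalar code. This is an illustrative sketch only (the helper name
// check_continuations is made up for this comment, and it is not part of the
// library); the real implementation below computes the same answer for a whole
// vector at once using the shifted-mask comparison described above:
//
//     static int check_continuations(const uint8_t *p, size_t len) {
//         size_t need = 0;                      // continuation bytes still owed
//         for (size_t i = 0; i < len; i++) {
//             int is_cont = (p[i] & 0xC0) == 0x80;
//             if (need > 0) {
//                 if (!is_cont) return 0;       // required continuation missing
//                 need--;
//             } else if (is_cont) {
//                 return 0;                     // stray continuation byte
//             } else if ((p[i] & 0xE0) == 0xC0) {
//                 need = 1;                     // 110xxxxx leader: one continuation
//             } else if ((p[i] & 0xF0) == 0xE0) {
//                 need = 2;                     // 1110xxxx leader: two continuations
//             } else if ((p[i] & 0xF8) == 0xF0) {
//                 need = 3;                     // 11110xxx leader: three continuations
//             }
//         }
//         return need == 0;
//     }
//
// (Illegal leader bytes such as F8..FF are ignored by this sketch; in the real
// code they are rejected by the special-case tables described below.)
//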
The only 67 | // complication in this step is that the shifted masks can cross vector 68 | // boundaries, so we need to keep a "carry" mask of the bits that were shifted 69 | // past the boundary in the last loop iteration. 70 | // 71 | // Besides the basic prefix coding of UTF-8, there are several invalid byte 72 | // sequences that need special handling. These are due to three factors: 73 | // code points that could be encoded in fewer bytes, code points that are 74 | // part of a surrogate pair (which are only valid in UTF-16), and code points 75 | // that are past the highest valid code point U+10FFFF. 76 | // 77 | // All of the invalid sequences can be detected by independently observing 78 | // the first three nibbles of each code point. Since AVX2 can do a 16-entry (4-bit index) table 79 | // lookup in parallel for all 32 bytes in a vector, we can create bit masks 80 | // for all of these error conditions, look up the bit masks for the three 81 | // nibbles for all input bytes, and AND them together to get a final error mask, 82 | // which must be all zero for valid UTF-8. This is somewhat complicated by 83 | // needing to shift the error masks from the first and second nibbles forward in 84 | // the byte stream to line up with the third nibble. 85 | // 86 | // We have these possible values for valid UTF-8 sequences, broken down 87 | // by the first three nibbles: 88 | // 89 | // 1st 2nd 3rd comment 90 | // 0..7 0..F ASCII 91 | // 8..B 0..F continuation bytes 92 | // C 2..F 8..B C0 xx and C1 xx can be encoded in 1 byte 93 | // D 0..F 8..B D0..DF are valid with a continuation byte 94 | // E 0 A..B E0 8x and E0 9x can be encoded with 2 bytes 95 | // 1..C 8..B E1..EC are valid with continuation bytes 96 | // D 8..9 ED Ax and ED Bx correspond to surrogate pairs 97 | // E..F 8..B EE..EF are valid with continuation bytes 98 | // F 0 9..B F0 8x can be encoded with 3 bytes 99 | // 1..3 8..B F1..F3 are valid with continuation bytes 100 | // 4 8 F4 8F BF BF is the maximum valid code point 101 | // 102 | // That leaves us with these invalid sequences, which would otherwise fit 103 | // into UTF-8's prefix encoding. Each of these invalid sequences needs to 104 | // be detected separately, each with its own bit in the error mask. 105 | // 106 | // 1st 2nd 3rd error bit 107 | // C 0..1 0..F 0x01 108 | // E 0 8..9 0x02 109 | // D A..B 0x04 110 | // F 0 0..8 0x08 111 | // 4 9..F 0x10 112 | // 5..F 0..F 0x20 113 | // 114 | // For every possible value of the first, second, and third nibbles, we keep 115 | // a lookup table that contains the bitwise OR of all errors that that nibble 116 | // value can cause. For example, the first nibble has zeroes in every entry 117 | // except for C, E, and F, and the third nibble lookup has the 0x21 bits in 118 | // every entry, since those errors don't depend on the third nibble. After 119 | // doing a parallel lookup of the first/second/third nibble values for all 120 | // bytes, we AND them together. Only when all three have an error bit in common 121 | // do we fail validation. 122 | 123 | #if defined(AVX2) 124 | 125 | // AVX2 definitions 126 | 127 | # define z_validate_utf8 z_validate_utf8_avx2 128 | # define z_validate_vec z_validate_vec_avx2 129 | 130 | # define V_LEN (32) 131 | 132 | // Vector and vector mask types.
We use #defines instead of typedefs so this 133 | // header can be included multiple times with different configurations 134 | 135 | # define vec_t __m256i 136 | # define vmask_t uint32_t 137 | # define vmask2_t uint64_t 138 | 139 | # define v_load(x) _mm256_loadu_si256((vec_t *)(x)) 140 | # define v_set1 _mm256_set1_epi8 141 | # define v_and _mm256_and_si256 142 | 143 | # define v_test_bit(input, bit) \ 144 | _mm256_movemask_epi8(_mm256_slli_epi16((input), 7 - (bit))) 145 | 146 | // Parallel table lookup for all bytes in a vector. We need to AND with 0x0F 147 | // for the lookup, because vpshufb has the neat "feature" that negative values 148 | // in an index byte will result in a zero. 149 | 150 | # define v_lookup(table, index, shift) \ 151 | _mm256_shuffle_epi8((table), \ 152 | v_and(_mm256_srli_epi16((index), (shift)), v_set1(0x0F))) 153 | 154 | # define v_testz _mm256_testz_si256 155 | 156 | // Simple macro to make a vector lookup table for use with vpshufb. Since 157 | // AVX2 is two 16-byte halves, we duplicate the input values. 158 | 159 | # define V_TABLE_16(...) _mm256_setr_epi8(__VA_ARGS__, __VA_ARGS__) 160 | 161 | # define v_shift_lanes_left v_shift_lanes_left_avx2 162 | 163 | // Move all the bytes in "input" to the left by one and fill in the first byte 164 | // with zero. Since AVX2 generally works on two separate 16-byte vectors glued 165 | // together, this needs two steps. The permute2x128 takes the middle 32 bytes 166 | // of the 64-byte concatenation v_zero:input. The align then gives the final 167 | // result in each half: 168 | // top half: input_L:input_H --> input_L[15]:input_H[0:14] 169 | // bottom half: zero_H:input_L --> zero_H[15]:input_L[0:14] 170 | static inline vec_t v_shift_lanes_left(vec_t input) { 171 | vec_t zero = v_set1(0); 172 | vec_t shl_16 = _mm256_permute2x128_si256(input, zero, 0x03); 173 | return _mm256_alignr_epi8(input, shl_16, 15); 174 | } 175 | 176 | #elif defined(SSE4) 177 | 178 | // SSE definitions. We require at least SSE4.1 for _mm_test_all_zeros() 179 | 180 | # define z_validate_utf8 z_validate_utf8_sse4 181 | # define z_validate_vec z_validate_vec_sse4 182 | 183 | # define V_LEN (16) 184 | 185 | # define vec_t __m128i 186 | # define vmask_t uint16_t 187 | # define vmask2_t uint32_t 188 | 189 | # define v_load(x) _mm_lddqu_si128((vec_t *)(x)) 190 | # define v_set1 _mm_set1_epi8 191 | # define v_and _mm_and_si128 192 | # define v_testz _mm_test_all_zeros 193 | 194 | # define v_test_bit(input, bit) \ 195 | _mm_movemask_epi8(_mm_slli_epi16((input), (uint8_t)(7 - (bit)))) 196 | 197 | # define v_lookup(table, index, shift) \ 198 | _mm_shuffle_epi8((table), \ 199 | v_and(_mm_srli_epi16((index), (shift)), v_set1(0x0F))) 200 | 201 | # define V_TABLE_16(...) 
_mm_setr_epi8(__VA_ARGS__) 202 | 203 | # define v_shift_lanes_left v_shift_lanes_left_sse4 204 | static inline vec_t v_shift_lanes_left(vec_t top) { 205 | return _mm_alignr_epi8(top, v_set1(0), 15); 206 | } 207 | 208 | #else 209 | 210 | # error "No valid configuration: must define one of AVX2 or SSE4" 211 | 212 | #endif 213 | 214 | // Validate one vector's worth of input bytes 215 | inline int z_validate_vec(vec_t bytes, vec_t shifted_bytes, vmask_t *last_cont) { 216 | // Error lookup tables for the first, second, and third nibbles 217 | const vec_t error_1 = V_TABLE_16( 218 | 0x00, 0x00, 0x00, 0x00, 219 | 0x00, 0x00, 0x00, 0x00, 220 | 0x00, 0x00, 0x00, 0x00, 221 | 0x01, 0x00, 0x06, 0x38 222 | ); 223 | const vec_t error_2 = V_TABLE_16( 224 | 0x0B, 0x01, 0x00, 0x00, 225 | 0x10, 0x20, 0x20, 0x20, 226 | 0x20, 0x20, 0x20, 0x20, 227 | 0x20, 0x24, 0x20, 0x20 228 | ); 229 | const vec_t error_3 = V_TABLE_16( 230 | 0x29, 0x29, 0x29, 0x29, 231 | 0x29, 0x29, 0x29, 0x29, 232 | 0x2B, 0x33, 0x35, 0x35, 233 | 0x31, 0x31, 0x31, 0x31 234 | ); 235 | 236 | // Quick skip for ASCII-only input. If there are no bytes with the high bit 237 | // set, we don't need to do any more work. We return either valid or 238 | // invalid based on whether we expected any continuation bytes here. 239 | vmask_t high = v_test_bit(bytes, 7); 240 | if (!high) 241 | return *last_cont == 0; 242 | 243 | // Which bytes are required to be continuation bytes 244 | vmask2_t req = *last_cont; 245 | // A bitmask of the actual continuation bytes in the input 246 | vmask_t cont; 247 | 248 | // Compute the continuation byte mask by finding bytes that start with 249 | // 11x, 111x, and 1111. For each of these prefixes, we get a bitmask 250 | // and shift it forward by 1, 2, or 3. This loop should be unrolled by 251 | // the compiler, and the (n == 1) branch inside eliminated. 252 | vmask_t set = high; 253 | for (int n = 1; n <= 3; n++) { 254 | set &= v_test_bit(bytes, 7 - n); 255 | // Mark continuation bytes: those that have the high bit set but 256 | // not the next one 257 | if (n == 1) 258 | cont = high ^ set; 259 | 260 | // We add the shifted mask here instead of ORing it, which would 261 | // be the more natural operation, so that this line can be done 262 | // with one lea. While adding could give a different result due 263 | // to carries, this will only happen for invalid UTF-8 sequences, 264 | // and in a way that won't cause it to pass validation. Reasoning: 265 | // Any bits for required continuation bytes come after the bits 266 | // for their leader bytes, and are all contiguous. For a carry to 267 | // happen, two of these bit sequences would have to overlap. If 268 | // this is the case, there is a leader byte before the second set 269 | // of required continuation bytes (and thus before the bit that 270 | // will be cleared by a carry). This leader byte will not be 271 | // in the continuation mask, despite being required. QEDish. 272 | req += (vmask2_t)set << n; 273 | } 274 | // Check that continuation bytes match. We must cast req from vmask2_t 275 | // (which holds the carry mask in the upper half) to vmask_t, which 276 | // zeroes out the upper bits 277 | if (cont != (vmask_t)req) 278 | return 0; 279 | 280 | // Look up error masks for three consecutive nibbles.
281 | vec_t e_1 = v_lookup(error_1, shifted_bytes, 4); 282 | vec_t e_2 = v_lookup(error_2, shifted_bytes, 0); 283 | vec_t e_3 = v_lookup(error_3, bytes, 4); 284 | 285 | // Check if any bits are set in all three error masks 286 | if (!v_testz(v_and(e_1, e_2), e_3)) 287 | return 0; 288 | 289 | // Save continuation bits and input bytes for the next round 290 | *last_cont = req >> V_LEN; 291 | return 1; 292 | } 293 | 294 | int z_validate_utf8(const char *data, size_t len) { 295 | vec_t bytes, shifted_bytes; 296 | 297 | // Keep continuation bits from the previous iteration that carry over to 298 | // each input chunk vector 299 | vmask_t last_cont = 0; 300 | 301 | size_t offset = 0; 302 | // Deal with the input up until the last section of bytes 303 | if (len >= V_LEN) { 304 | // We need a vector of the input byte stream shifted forward one byte. 305 | // Since we don't want to read the memory before the data pointer 306 | // (which might not even be mapped), for the first chunk of input just 307 | // use vector instructions. 308 | shifted_bytes = v_shift_lanes_left(v_load(data)); 309 | 310 | // Loop over input in V_LEN-byte chunks, as long as we can safely read 311 | // that far into memory 312 | for (; offset + V_LEN < len; offset += V_LEN) { 313 | bytes = v_load(data + offset); 314 | if (!z_validate_vec(bytes, shifted_bytes, &last_cont)) 315 | return 0; 316 | shifted_bytes = v_load(data + offset + V_LEN - 1); 317 | } 318 | } 319 | // Deal with any bytes remaining. Rather than making a separate scalar path, 320 | // just fill in a buffer, reading bytes only up to len, and load from that. 321 | if (offset < len) { 322 | char buffer[V_LEN + 1] = { 0 }; 323 | if (offset > 0) 324 | buffer[0] = data[offset - 1]; 325 | for (int i = 0; i < (int)(len - offset); i++) 326 | buffer[i + 1] = data[offset + i]; 327 | 328 | bytes = v_load(buffer + 1); 329 | shifted_bytes = v_load(buffer); 330 | if (!z_validate_vec(bytes, shifted_bytes, &last_cont)) 331 | return 0; 332 | } 333 | 334 | // The input is valid if we don't have any more expected continuation bytes 335 | return last_cont == 0; 336 | } 337 | 338 | // Undefine all macros 339 | 340 | #undef z_validate_utf8 341 | #undef z_validate_vec 342 | #undef V_LEN 343 | #undef vec_t 344 | #undef vmask_t 345 | #undef vmask2_t 346 | #undef v_load 347 | #undef v_set1 348 | #undef v_and 349 | #undef v_test_bit 350 | #undef v_testz 351 | #undef v_lookup 352 | #undef V_TABLE_16 353 | #undef v_shift_lanes_left 354 | --------------------------------------------------------------------------------
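
For anyone who wants to exercise the validator without going through make.py or LuaJIT, here is one way it could be driven directly from C. This is a sketch, not part of the repository: the file name `example_main.c`, the build lines, and the sample strings are assumptions made for illustration; the symbol name and signature come from `z_validate.c` above (built with `-DAVX2`, `z_validate_utf8` is compiled as `z_validate_utf8_avx2`, and it requires an AVX2-capable CPU).

```c
// example_main.c -- hypothetical standalone caller, not part of the repository.
//
// One possible build, mirroring the compiler flags in rules.py:
//   cc -O3 -march=native -DAVX2 -c z_validate.c
//   cc -O3 example_main.c z_validate.o -o example
#include <stdio.h>
#include <string.h>

// Defined in z_validate.c; with -DAVX2 the function is compiled under this name.
int z_validate_utf8_avx2(const char *data, size_t len);

int main(void) {
    const char *ok  = "caf\xC3\xA9";      // "café": C3 A9 is a valid 2-byte sequence
    const char *bad = "\xED\xA0\x80";     // a UTF-16 surrogate encoded directly: invalid
    printf("ok  -> %d\n", z_validate_utf8_avx2(ok, strlen(ok)));    // prints 1
    printf("bad -> %d\n", z_validate_utf8_avx2(bad, strlen(bad)));  // prints 0
    return 0;
}
```

The repository's own test driver is `test.lua` above; once the shared libraries have been built into `_out/`, it can be run from the repository root with `luajit test.lua`.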