├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── blake3.c ├── blake3.h ├── blake3_avx2.c ├── blake3_avx2_x86-64_unix.S ├── blake3_avx2_x86-64_windows_gnu.S ├── blake3_avx2_x86-64_windows_msvc.asm ├── blake3_avx512.c ├── blake3_avx512_x86-64_unix.S ├── blake3_avx512_x86-64_windows_gnu.S ├── blake3_avx512_x86-64_windows_msvc.asm ├── blake3_dispatch.c ├── blake3_impl.h ├── blake3_portable.c ├── blake3_sse2.c ├── blake3_sse2_x86-64_unix.S ├── blake3_sse2_x86-64_windows_gnu.S ├── blake3_sse2_x86-64_windows_msvc.asm ├── blake3_sse41.c ├── blake3_sse41_x86-64_unix.S ├── blake3_sse41_x86-64_windows_gnu.S ├── blake3_sse41_x86-64_windows_msvc.asm ├── build.sh ├── compiled └── .gitkeep ├── config.m4 ├── config.w32 ├── php_blake3.c └── php_blake3.h /.gitignore: -------------------------------------------------------------------------------- 1 | .deps 2 | .libs/ 3 | Makefile 4 | Makefile.fragments 5 | Makefile.global 6 | Makefile.objects 7 | acinclude.m4 8 | aclocal.m4 9 | autom4te.cache/ 10 | blake2.la 11 | blake2b-ref.lo 12 | blake2s-ref.lo 13 | build/ 14 | config.guess 15 | config.h 16 | config.h.in 17 | config.log 18 | config.nice 19 | config.status 20 | config.sub 21 | configure 22 | configure.in 23 | install-sh 24 | libtool 25 | ltmain.sh 26 | missing 27 | mkinstalldirs 28 | modules/ 29 | include/ 30 | php_blake2.lo 31 | run-tests.php 32 | *.lo 33 | .idea/ 34 | configure.ac 35 | /compiled/*.so -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | php: 3 | - '8.1' 4 | - '8.2' 5 | - '8.3' 6 | env: 7 | - NO_INTERACTION=1 8 | before_script: 9 | - phpize 10 | - ./configure --enable-blake3 11 | - make 12 | script: make test 13 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | LABEL authors="cypherbits" 3 | ENV LC_ALL=C.UTF-8 4 | RUN apt-get update -y && apt-get dist-upgrade -y && apt-get install software-properties-common -y 5 | RUN add-apt-repository ppa:ondrej/php 6 | RUN apt-get update -y && apt-get install php8.3 php8.3-dev -y 7 | COPY . /making 8 | RUN cd /making && phpize && ./configure --enable-blake3 && make && make install 9 | CMD bash -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | CFLAGS = -g -O2 -mavx -mavx2 -mssse3 -mavx512f -msse4.1 -msse -mpclmul -mavx512dq -mavx512vl 2 | 3 | 4 | Copyright (c) 2012-present strawbrary 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PHP BLAKE3 Extension 2 | ============================ 3 | 4 | BLAKE3 is an improved and faster version of BLAKE2. 5 | 6 | This extension uses the official BLAKE3 C implementation, which is single-threaded, but it was still faster than SHA-256 or SHA-512 in my benchmarks (run on PHP 7.4). 7 | 8 | Installation 9 | ------------ 10 | Clone the repository and compile it: 11 | ```sh 12 | $ git clone https://github.com/cypherbits/php-blake3.git 13 | $ cd php-blake3 14 | $ phpize 15 | $ ./configure --enable-blake3 16 | $ make && sudo make install 17 | ``` 18 | 19 | Enable the extension by adding the following line to your php.ini file: 20 | 21 | ```sh 22 | extension=blake3.so 23 | ``` 24 | 25 | You may need to restart your web server to load the extension. 26 | 27 | 28 | Usage 29 | ---- 30 | 31 | **Global constants:** 32 | 33 | `BLAKE3_OUT_LEN: 32` 34 | 35 | **Functions:** 36 | 37 | ```php 38 | string blake3 ( string $str [, int $outputSize = 32, string $key, bool $rawOutput = false ] ) 39 | ``` 40 | 41 | * $str: The string to hash 42 | * $outputSize: The length of the output hash in bytes (between 1 and 64; defaults to 32) 43 | * $key: Turns the output into a keyed hash using the specified key. It MUST be exactly 32 bytes long. 44 | * $rawOutput: If set to true, then the hash is returned in raw binary format 45 | 46 | * Return value: A hex string containing the BLAKE3 hash of the input string (raw binary if $rawOutput is true). Default output size: 32 bytes. 47 | 48 | ```php 49 | string blake3_file ( string $filename [, bool $rawOutput = false ] ) 50 | ``` 51 | 52 | * $filename: The filename of the file to hash 53 | * $rawOutput: If set to true, then the hash is returned in raw binary format 54 | * Return value: A hex string containing the BLAKE3 hash of the input file 55 | 56 | Examples 57 | -------- 58 | ```php 59 | echo blake3(''); 60 | ``` 61 | 62 | af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 63 | 64 | ```php 65 | echo blake3('Hello world', 20); 66 | ``` 67 | 68 | e7e6fb7d2869d109b62cdb1227208d4016cdaa0a 69 | 70 | ```php 71 | echo blake3('Hello world', 32, 'cae8954e7b3415ea18303db548e15207'); 72 | ``` 73 | 74 | 75672fafd13480d2325914f0665795eceecad4e668d9ea2a87c40e71232a7d3a 75 | 76 | Benchmarks 77 | -------- 78 | ```php 79 | -------------------------------------------------------------------------------- /blake3.c: -------------------------------------------------------------------------------- 2 | #include <stdbool.h> 3 | #include <string.h> 4 | 5 | #include "blake3.h" 6 | #include "blake3_impl.h" 7 | 8 | 9 | /* inlen, at least, should be uint64_t. Others can be size_t. */ 10 | int blake3( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ) 11 | { 12 | // Initialize the hasher. 
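// This convenience wrapper drives the streaming API defined below: it selects
// keyed or plain initialization depending on whether a 32-byte key was
// supplied, feeds the whole input in a single blake3_hasher_update() call,
// and finalizes to `outlen` bytes (BLAKE3 is an extendable-output function,
// so any output length is valid).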
13 | blake3_hasher hasher; 14 | 15 | /* Verify parameters */ 16 | if ( NULL == in && inlen > 0 ) return -1; 17 | 18 | if ( NULL == out ) return -1; 19 | 20 | if( NULL == key && keylen > 0 ) return -1; 21 | 22 | if( keylen == BLAKE3_KEY_LEN ) 23 | { 24 | blake3_hasher_init_keyed(&hasher, key); 25 | } 26 | else 27 | { 28 | blake3_hasher_init(&hasher); 29 | } 30 | 31 | blake3_hasher_update(&hasher, in, inlen); 32 | 33 | blake3_hasher_finalize(&hasher, out, outlen); 34 | 35 | return 0; 36 | } 37 | 38 | INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], 39 | uint8_t flags) { 40 | memcpy(self->cv, key, BLAKE3_KEY_LEN); 41 | self->chunk_counter = 0; 42 | memset(self->buf, 0, BLAKE3_BLOCK_LEN); 43 | self->buf_len = 0; 44 | self->blocks_compressed = 0; 45 | self->flags = flags; 46 | } 47 | 48 | INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], 49 | uint64_t chunk_counter) { 50 | memcpy(self->cv, key, BLAKE3_KEY_LEN); 51 | self->chunk_counter = chunk_counter; 52 | self->blocks_compressed = 0; 53 | memset(self->buf, 0, BLAKE3_BLOCK_LEN); 54 | self->buf_len = 0; 55 | } 56 | 57 | INLINE size_t chunk_state_len(const blake3_chunk_state *self) { 58 | return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + 59 | ((size_t)self->buf_len); 60 | } 61 | 62 | INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, 63 | const uint8_t *input, size_t input_len) { 64 | size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); 65 | if (take > input_len) { 66 | take = input_len; 67 | } 68 | uint8_t *dest = self->buf + ((size_t)self->buf_len); 69 | memcpy(dest, input, take); 70 | self->buf_len += (uint8_t)take; 71 | return take; 72 | } 73 | 74 | INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { 75 | if (self->blocks_compressed == 0) { 76 | return CHUNK_START; 77 | } else { 78 | return 0; 79 | } 80 | } 81 | 82 | typedef struct { 83 | uint32_t input_cv[8]; 84 | uint64_t counter; 85 | uint8_t block[BLAKE3_BLOCK_LEN]; 86 | uint8_t block_len; 87 | uint8_t flags; 88 | } output_t; 89 | 90 | INLINE output_t make_output(const uint32_t input_cv[8], 91 | const uint8_t block[BLAKE3_BLOCK_LEN], 92 | uint8_t block_len, uint64_t counter, 93 | uint8_t flags) { 94 | output_t ret; 95 | memcpy(ret.input_cv, input_cv, 32); 96 | memcpy(ret.block, block, BLAKE3_BLOCK_LEN); 97 | ret.block_len = block_len; 98 | ret.counter = counter; 99 | ret.flags = flags; 100 | return ret; 101 | } 102 | 103 | // Chaining values within a given chunk (specifically the compress_in_place 104 | // interface) are represented as words. This avoids unnecessary bytes<->words 105 | // conversion overhead in the portable implementation. However, the hash_many 106 | // interface handles both user input and parent node blocks, so it accepts 107 | // bytes. For that reason, chaining values in the CV stack are represented as 108 | // bytes. 
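// An output_t is, in effect, a deferred compression: it captures the input
// chaining value, the final block, the block length, the counter and the
// flags of a node without compressing it yet, so the caller can later decide
// whether to finalize it as an interior node (output_chaining_value) or as
// the root (output_root_bytes, which adds the ROOT flag and can produce an
// arbitrarily long, seekable output stream).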
109 | INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { 110 | uint32_t cv_words[8]; 111 | memcpy(cv_words, self->input_cv, 32); 112 | blake3_compress_in_place(cv_words, self->block, self->block_len, 113 | self->counter, self->flags); 114 | store_cv_words(cv, cv_words); 115 | } 116 | 117 | INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, 118 | size_t out_len) { 119 | uint64_t output_block_counter = seek / 64; 120 | size_t offset_within_block = seek % 64; 121 | uint8_t wide_buf[64]; 122 | while (out_len > 0) { 123 | blake3_compress_xof(self->input_cv, self->block, self->block_len, 124 | output_block_counter, self->flags | ROOT, wide_buf); 125 | size_t available_bytes = 64 - offset_within_block; 126 | size_t memcpy_len; 127 | if (out_len > available_bytes) { 128 | memcpy_len = available_bytes; 129 | } else { 130 | memcpy_len = out_len; 131 | } 132 | memcpy(out, wide_buf + offset_within_block, memcpy_len); 133 | out += memcpy_len; 134 | out_len -= memcpy_len; 135 | output_block_counter += 1; 136 | offset_within_block = 0; 137 | } 138 | } 139 | 140 | INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, 141 | size_t input_len) { 142 | if (self->buf_len > 0) { 143 | size_t take = chunk_state_fill_buf(self, input, input_len); 144 | input += take; 145 | input_len -= take; 146 | if (input_len > 0) { 147 | blake3_compress_in_place( 148 | self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, 149 | self->flags | chunk_state_maybe_start_flag(self)); 150 | self->blocks_compressed += 1; 151 | self->buf_len = 0; 152 | memset(self->buf, 0, BLAKE3_BLOCK_LEN); 153 | } 154 | } 155 | 156 | while (input_len > BLAKE3_BLOCK_LEN) { 157 | blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, 158 | self->chunk_counter, 159 | self->flags | chunk_state_maybe_start_flag(self)); 160 | self->blocks_compressed += 1; 161 | input += BLAKE3_BLOCK_LEN; 162 | input_len -= BLAKE3_BLOCK_LEN; 163 | } 164 | 165 | size_t take = chunk_state_fill_buf(self, input, input_len); 166 | input += take; 167 | input_len -= take; 168 | } 169 | 170 | INLINE output_t chunk_state_output(const blake3_chunk_state *self) { 171 | uint8_t block_flags = 172 | self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; 173 | return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, 174 | block_flags); 175 | } 176 | 177 | INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], 178 | const uint32_t key[8], uint8_t flags) { 179 | return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); 180 | } 181 | 182 | // Given some input larger than one chunk, return the number of bytes that 183 | // should go in the left subtree. This is the largest power-of-2 number of 184 | // chunks that leaves at least 1 byte for the right subtree. 185 | INLINE size_t left_len(size_t content_len) { 186 | // Subtract 1 to reserve at least one byte for the right side. content_len 187 | // should always be greater than BLAKE3_CHUNK_LEN. 188 | size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; 189 | return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; 190 | } 191 | 192 | // Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time 193 | // on a single thread. Write out the chunk chaining values and return the 194 | // number of chunks hashed. These chunks are never the root and never empty; 195 | // those cases use a different codepath. 
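// Illustration (assuming the AVX2 path, SIMD degree 8): given 5000 bytes of
// input, the four complete 1024-byte chunks are hashed in one
// blake3_hash_many() call, the trailing 904 bytes go through a scalar
// chunk_state, five chaining values are written to `out`, and 5 is returned.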
196 | INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, 197 | const uint32_t key[8], 198 | uint64_t chunk_counter, uint8_t flags, 199 | uint8_t *out) { 200 | #if defined(BLAKE3_TESTING) 201 | assert(0 < input_len); 202 | assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); 203 | #endif 204 | 205 | const uint8_t *chunks_array[MAX_SIMD_DEGREE]; 206 | size_t input_position = 0; 207 | size_t chunks_array_len = 0; 208 | while (input_len - input_position >= BLAKE3_CHUNK_LEN) { 209 | chunks_array[chunks_array_len] = &input[input_position]; 210 | input_position += BLAKE3_CHUNK_LEN; 211 | chunks_array_len += 1; 212 | } 213 | 214 | blake3_hash_many(chunks_array, chunks_array_len, 215 | BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, 216 | true, flags, CHUNK_START, CHUNK_END, out); 217 | 218 | // Hash the remaining partial chunk, if there is one. Note that the empty 219 | // chunk (meaning the empty message) is a different codepath. 220 | if (input_len > input_position) { 221 | uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; 222 | blake3_chunk_state chunk_state; 223 | chunk_state_init(&chunk_state, key, flags); 224 | chunk_state.chunk_counter = counter; 225 | chunk_state_update(&chunk_state, &input[input_position], 226 | input_len - input_position); 227 | output_t output = chunk_state_output(&chunk_state); 228 | output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); 229 | return chunks_array_len + 1; 230 | } else { 231 | return chunks_array_len; 232 | } 233 | } 234 | 235 | // Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time 236 | // on a single thread. Write out the parent chaining values and return the 237 | // number of parents hashed. (If there's an odd input chaining value left over, 238 | // return it as an additional output.) These parents are never the root and 239 | // never empty; those cases use a different codepath. 240 | INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, 241 | size_t num_chaining_values, 242 | const uint32_t key[8], uint8_t flags, 243 | uint8_t *out) { 244 | #if defined(BLAKE3_TESTING) 245 | assert(2 <= num_chaining_values); 246 | assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); 247 | #endif 248 | 249 | const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; 250 | size_t parents_array_len = 0; 251 | while (num_chaining_values - (2 * parents_array_len) >= 2) { 252 | parents_array[parents_array_len] = 253 | &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; 254 | parents_array_len += 1; 255 | } 256 | 257 | blake3_hash_many(parents_array, parents_array_len, 1, key, 258 | 0, // Parents always use counter 0. 259 | false, flags | PARENT, 260 | 0, // Parents have no start flags. 261 | 0, // Parents have no end flags. 262 | out); 263 | 264 | // If there's an odd child left over, it becomes an output. 265 | if (num_chaining_values > 2 * parents_array_len) { 266 | memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], 267 | &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], 268 | BLAKE3_OUT_LEN); 269 | return parents_array_len + 1; 270 | } else { 271 | return parents_array_len; 272 | } 273 | } 274 | 275 | // The wide helper function returns (writes out) an array of chaining values 276 | // and returns the length of that array. The number of chaining values returned 277 | // is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, 278 | // if the input is shorter than that many chunks. 
The reason for maintaining a 279 | // wide array of chaining values going back up the tree, is to allow the 280 | // implementation to hash as many parents in parallel as possible. 281 | // 282 | // As a special case when the SIMD degree is 1, this function will still return 283 | // at least 2 outputs. This guarantees that this function doesn't perform the 284 | // root compression. (If it did, it would use the wrong flags, and also we 285 | // wouldn't be able to implement exendable ouput.) Note that this function is 286 | // not used when the whole input is only 1 chunk long; that's a different 287 | // codepath. 288 | // 289 | // Why not just have the caller split the input on the first update(), instead 290 | // of implementing this special rule? Because we don't want to limit SIMD or 291 | // multi-threading parallelism for that update(). 292 | static size_t blake3_compress_subtree_wide(const uint8_t *input, 293 | size_t input_len, 294 | const uint32_t key[8], 295 | uint64_t chunk_counter, 296 | uint8_t flags, uint8_t *out) { 297 | // Note that the single chunk case does *not* bump the SIMD degree up to 2 298 | // when it is 1. If this implementation adds multi-threading in the future, 299 | // this gives us the option of multi-threading even the 2-chunk case, which 300 | // can help performance on smaller platforms. 301 | if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { 302 | return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, 303 | out); 304 | } 305 | 306 | // With more than simd_degree chunks, we need to recurse. Start by dividing 307 | // the input into left and right subtrees. (Note that this is only optimal 308 | // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree 309 | // of 3 or something, we'll need a more complicated strategy.) 310 | size_t left_input_len = left_len(input_len); 311 | size_t right_input_len = input_len - left_input_len; 312 | const uint8_t *right_input = &input[left_input_len]; 313 | uint64_t right_chunk_counter = 314 | chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); 315 | 316 | // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to 317 | // account for the special case of returning 2 outputs when the SIMD degree 318 | // is 1. 319 | uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; 320 | size_t degree = blake3_simd_degree(); 321 | if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { 322 | // The special case: We always use a degree of at least two, to make 323 | // sure there are two outputs. Except, as noted above, at the chunk 324 | // level, where we allow degree=1. (Note that the 1-chunk-input case is 325 | // a different codepath.) 326 | degree = 2; 327 | } 328 | uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; 329 | 330 | // Recurse! If this implementation adds multi-threading support in the 331 | // future, this is where it will go. 332 | size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, 333 | chunk_counter, flags, cv_array); 334 | size_t right_n = blake3_compress_subtree_wide( 335 | right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); 336 | 337 | // The special case again. If simd_degree=1, then we'll have left_n=1 and 338 | // right_n=1. Rather than compressing them into a single output, return 339 | // them directly, to make sure we always have at least two outputs. 
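// (When left_n == 1 the left subtree was a single chunk, so the right
// subtree is at most one chunk too and right_n == 1; the 2 * BLAKE3_OUT_LEN
// memcpy below therefore copies exactly the two child CVs.)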
340 | if (left_n == 1) { 341 | memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); 342 | return 2; 343 | } 344 | 345 | // Otherwise, do one layer of parent node compression. 346 | size_t num_chaining_values = left_n + right_n; 347 | return compress_parents_parallel(cv_array, num_chaining_values, key, flags, 348 | out); 349 | } 350 | 351 | // Hash a subtree with compress_subtree_wide(), and then condense the resulting 352 | // list of chaining values down to a single parent node. Don't compress that 353 | // last parent node, however. Instead, return its message bytes (the 354 | // concatenated chaining values of its children). This is necessary when the 355 | // first call to update() supplies a complete subtree, because the topmost 356 | // parent node of that subtree could end up being the root. It's also necessary 357 | // for extended output in the general case. 358 | // 359 | // As with compress_subtree_wide(), this function is not used on inputs of 1 360 | // chunk or less. That's a different codepath. 361 | INLINE void compress_subtree_to_parent_node( 362 | const uint8_t *input, size_t input_len, const uint32_t key[8], 363 | uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { 364 | #if defined(BLAKE3_TESTING) 365 | assert(input_len > BLAKE3_CHUNK_LEN); 366 | #endif 367 | 368 | uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; 369 | size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, 370 | chunk_counter, flags, cv_array); 371 | 372 | // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, 373 | // compress_subtree_wide() returns more than 2 chaining values. Condense 374 | // them into 2 by forming parent nodes repeatedly. 375 | uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; 376 | while (num_cvs > 2) { 377 | num_cvs = 378 | compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); 379 | memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); 380 | } 381 | memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); 382 | } 383 | 384 | INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], 385 | uint8_t flags) { 386 | memcpy(self->key, key, BLAKE3_KEY_LEN); 387 | chunk_state_init(&self->chunk, key, flags); 388 | self->cv_stack_len = 0; 389 | } 390 | 391 | void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } 392 | 393 | void blake3_hasher_init_keyed(blake3_hasher *self, 394 | const uint8_t key[BLAKE3_KEY_LEN]) { 395 | uint32_t key_words[8]; 396 | load_key_words(key, key_words); 397 | hasher_init_base(self, key_words, KEYED_HASH); 398 | } 399 | 400 | void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, 401 | size_t context_len) { 402 | blake3_hasher context_hasher; 403 | hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); 404 | blake3_hasher_update(&context_hasher, context, context_len); 405 | uint8_t context_key[BLAKE3_KEY_LEN]; 406 | blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); 407 | uint32_t context_key_words[8]; 408 | load_key_words(context_key, context_key_words); 409 | hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); 410 | } 411 | 412 | void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { 413 | blake3_hasher_init_derive_key_raw(self, context, strlen(context)); 414 | } 415 | 416 | // As described in hasher_push_cv() below, we do "lazy merging", delaying 417 | // merges until right before the next CV is about to be added. This is 418 | // different from the reference implementation. 
Another difference is that we 419 | // aren't always merging 1 chunk at a time. Instead, each CV might represent 420 | // any power-of-two number of chunks, as long as the smaller-above-larger stack 421 | // order is maintained. Instead of the "count the trailing 0-bits" algorithm 422 | // described in the spec, we use a "count the total number of 1-bits" variant 423 | // that doesn't require us to retain the subtree size of the CV on top of the 424 | // stack. The principle is the same: each CV that should remain in the stack is 425 | // represented by a 1-bit in the total number of chunks (or bytes) so far. 426 | INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { 427 | size_t post_merge_stack_len = (size_t)popcnt(total_len); 428 | while (self->cv_stack_len > post_merge_stack_len) { 429 | uint8_t *parent_node = 430 | &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; 431 | output_t output = parent_output(parent_node, self->key, self->chunk.flags); 432 | output_chaining_value(&output, parent_node); 433 | self->cv_stack_len -= 1; 434 | } 435 | } 436 | 437 | // In reference_impl.rs, we merge the new CV with existing CVs from the stack 438 | // before pushing it. We can do that because we know more input is coming, so 439 | // we know none of the merges are root. 440 | // 441 | // This setting is different. We want to feed as much input as possible to 442 | // compress_subtree_wide(), without setting aside anything for the chunk_state. 443 | // If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once 444 | // as a single subtree, if at all possible. 445 | // 446 | // This leads to two problems: 447 | // 1) This 64 KiB input might be the only call that ever gets made to update. 448 | // In this case, the root node of the 64 KiB subtree would be the root node 449 | // of the whole tree, and it would need to be ROOT finalized. We can't 450 | // compress it until we know. 451 | // 2) This 64 KiB input might complete a larger tree, whose root node is 452 | // similarly going to be the the root of the whole tree. For example, maybe 453 | // we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the 454 | // node at the root of the 256 KiB subtree until we know how to finalize it. 455 | // 456 | // The second problem is solved with "lazy merging". That is, when we're about 457 | // to add a CV to the stack, we don't merge it with anything first, as the 458 | // reference impl does. Instead we do merges using the *previous* CV that was 459 | // added, which is sitting on top of the stack, and we put the new CV 460 | // (unmerged) on top of the stack afterwards. This guarantees that we never 461 | // merge the root node until finalize(). 462 | // 463 | // Solving the first problem requires an additional tool, 464 | // compress_subtree_to_parent_node(). That function always returns the top 465 | // *two* chaining values of the subtree it's compressing. We then do lazy 466 | // merging with each of them separately, so that the second CV will always 467 | // remain unmerged. (That also helps us support extendable output when we're 468 | // hashing an input all-at-once.) 
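// Worked example of the rule above (for illustration): after 7 chunks the
// stack holds CVs for subtrees of 4, 2 and 1 chunks, matching popcnt(7) == 3.
// Pushing the CV of the 8th chunk first calls hasher_merge_cv_stack() with
// total_len == 7, which merges nothing, so the new CV becomes a 4th entry.
// Only when the 9th chunk's CV is pushed (total_len == 8, popcnt == 1) do
// those four entries collapse into a single CV covering the first 8 chunks.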
469 | INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], 470 | uint64_t chunk_counter) { 471 | hasher_merge_cv_stack(self, chunk_counter); 472 | memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, 473 | BLAKE3_OUT_LEN); 474 | self->cv_stack_len += 1; 475 | } 476 | 477 | void blake3_hasher_update(blake3_hasher *self, const void *input, 478 | size_t input_len) { 479 | // Explicitly checking for zero avoids causing UB by passing a null pointer 480 | // to memcpy. This comes up in practice with things like: 481 | // std::vector v; 482 | // blake3_hasher_update(&hasher, v.data(), v.size()); 483 | if (input_len == 0) { 484 | return; 485 | } 486 | 487 | const uint8_t *input_bytes = (const uint8_t *)input; 488 | 489 | // If we have some partial chunk bytes in the internal chunk_state, we need 490 | // to finish that chunk first. 491 | if (chunk_state_len(&self->chunk) > 0) { 492 | size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); 493 | if (take > input_len) { 494 | take = input_len; 495 | } 496 | chunk_state_update(&self->chunk, input_bytes, take); 497 | input_bytes += take; 498 | input_len -= take; 499 | // If we've filled the current chunk and there's more coming, finalize this 500 | // chunk and proceed. In this case we know it's not the root. 501 | if (input_len > 0) { 502 | output_t output = chunk_state_output(&self->chunk); 503 | uint8_t chunk_cv[32]; 504 | output_chaining_value(&output, chunk_cv); 505 | hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); 506 | chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); 507 | } else { 508 | return; 509 | } 510 | } 511 | 512 | // Now the chunk_state is clear, and we have more input. If there's more than 513 | // a single chunk (so, definitely not the root chunk), hash the largest whole 514 | // subtree we can, with the full benefits of SIMD (and maybe in the future, 515 | // multi-threading) parallelism. Two restrictions: 516 | // - The subtree has to be a power-of-2 number of chunks. Only subtrees along 517 | // the right edge can be incomplete, and we don't know where the right edge 518 | // is going to be until we get to finalize(). 519 | // - The subtree must evenly divide the total number of chunks up until this 520 | // point (if total is not 0). If the current incomplete subtree is only 521 | // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have 522 | // to complete the current subtree first. 523 | // Because we might need to break up the input to form powers of 2, or to 524 | // evenly divide what we already have, this part runs in a loop. 525 | while (input_len > BLAKE3_CHUNK_LEN) { 526 | size_t subtree_len = round_down_to_power_of_2(input_len); 527 | uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; 528 | // Shrink the subtree_len until it evenly divides the count so far. We know 529 | // that subtree_len itself is a power of 2, so we can use a bitmasking 530 | // trick instead of an actual remainder operation. (Note that if the caller 531 | // consistently passes power-of-2 inputs of the same size, as is hopefully 532 | // typical, this loop condition will always fail, and subtree_len will 533 | // always be the full length of the input.) 534 | // 535 | // An aside: We don't have to shrink subtree_len quite this much. For 536 | // example, if count_so_far is 1, we could pass 2 chunks to 537 | // compress_subtree_to_parent_node. 
Since we'll get 2 CVs back, we'll still 538 | // get the right answer in the end, and we might get to use 2-way SIMD 539 | // parallelism. The problem with this optimization, is that it gets us 540 | // stuck always hashing 2 chunks. The total number of chunks will remain 541 | // odd, and we'll never graduate to higher degrees of parallelism. See 542 | // https://github.com/BLAKE3-team/BLAKE3/issues/69. 543 | while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { 544 | subtree_len /= 2; 545 | } 546 | // The shrunken subtree_len might now be 1 chunk long. If so, hash that one 547 | // chunk by itself. Otherwise, compress the subtree into a pair of CVs. 548 | uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; 549 | if (subtree_len <= BLAKE3_CHUNK_LEN) { 550 | blake3_chunk_state chunk_state; 551 | chunk_state_init(&chunk_state, self->key, self->chunk.flags); 552 | chunk_state.chunk_counter = self->chunk.chunk_counter; 553 | chunk_state_update(&chunk_state, input_bytes, subtree_len); 554 | output_t output = chunk_state_output(&chunk_state); 555 | uint8_t cv[BLAKE3_OUT_LEN]; 556 | output_chaining_value(&output, cv); 557 | hasher_push_cv(self, cv, chunk_state.chunk_counter); 558 | } else { 559 | // This is the high-performance happy path, though getting here depends 560 | // on the caller giving us a long enough input. 561 | uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; 562 | compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, 563 | self->chunk.chunk_counter, 564 | self->chunk.flags, cv_pair); 565 | hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); 566 | hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], 567 | self->chunk.chunk_counter + (subtree_chunks / 2)); 568 | } 569 | self->chunk.chunk_counter += subtree_chunks; 570 | input_bytes += subtree_len; 571 | input_len -= subtree_len; 572 | } 573 | 574 | // If there's any remaining input less than a full chunk, add it to the chunk 575 | // state. In that case, also do a final merge loop to make sure the subtree 576 | // stack doesn't contain any unmerged pairs. The remaining input means we 577 | // know these merges are non-root. This merge loop isn't strictly necessary 578 | // here, because hasher_push_chunk_cv already does its own merge loop, but it 579 | // simplifies blake3_hasher_finalize below. 580 | if (input_len > 0) { 581 | chunk_state_update(&self->chunk, input_bytes, input_len); 582 | hasher_merge_cv_stack(self, self->chunk.chunk_counter); 583 | } 584 | } 585 | 586 | void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, 587 | size_t out_len) { 588 | blake3_hasher_finalize_seek(self, 0, out, out_len); 589 | } 590 | 591 | void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, 592 | uint8_t *out, size_t out_len) { 593 | // Explicitly checking for zero avoids causing UB by passing a null pointer 594 | // to memcpy. This comes up in practice with things like: 595 | // std::vector v; 596 | // blake3_hasher_finalize(&hasher, v.data(), v.size()); 597 | if (out_len == 0) { 598 | return; 599 | } 600 | 601 | // If the subtree stack is empty, then the current chunk is the root. 602 | if (self->cv_stack_len == 0) { 603 | output_t output = chunk_state_output(&self->chunk); 604 | output_root_bytes(&output, seek, out, out_len); 605 | return; 606 | } 607 | // If there are any bytes in the chunk state, finalize that chunk and do a 608 | // roll-up merge between that chunk hash and every subtree in the stack. 
In 609 | // this case, the extra merge loop at the end of blake3_hasher_update 610 | // guarantees that none of the subtrees in the stack need to be merged with 611 | // each other first. Otherwise, if there are no bytes in the chunk state, 612 | // then the top of the stack is a chunk hash, and we start the merge from 613 | // that. 614 | output_t output; 615 | size_t cvs_remaining; 616 | if (chunk_state_len(&self->chunk) > 0) { 617 | cvs_remaining = self->cv_stack_len; 618 | output = chunk_state_output(&self->chunk); 619 | } else { 620 | // There are always at least 2 CVs in the stack in this case. 621 | cvs_remaining = self->cv_stack_len - 2; 622 | output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, 623 | self->chunk.flags); 624 | } 625 | while (cvs_remaining > 0) { 626 | cvs_remaining -= 1; 627 | uint8_t parent_block[BLAKE3_BLOCK_LEN]; 628 | memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); 629 | output_chaining_value(&output, &parent_block[32]); 630 | output = parent_output(parent_block, self->key, self->chunk.flags); 631 | } 632 | output_root_bytes(&output, seek, out, out_len); 633 | } 634 | -------------------------------------------------------------------------------- /blake3.h: -------------------------------------------------------------------------------- 1 | #ifndef BLAKE3_H 2 | #define BLAKE3_H 3 | 4 | #include 5 | #include 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | #define BLAKE3_KEY_LEN 32 12 | #define BLAKE3_OUT_LEN 32 13 | #define BLAKE3_BLOCK_LEN 64 14 | #define BLAKE3_CHUNK_LEN 1024 15 | #define BLAKE3_MAX_DEPTH 54 16 | #define BLAKE3_MAX_SIMD_DEGREE 16 17 | 18 | // This struct is a private implementation detail. It has to be here because 19 | // it's part of blake3_hasher below. 20 | typedef struct { 21 | uint32_t cv[8]; 22 | uint64_t chunk_counter; 23 | uint8_t buf[BLAKE3_BLOCK_LEN]; 24 | uint8_t buf_len; 25 | uint8_t blocks_compressed; 26 | uint8_t flags; 27 | } blake3_chunk_state; 28 | 29 | typedef struct { 30 | uint32_t key[8]; 31 | blake3_chunk_state chunk; 32 | uint8_t cv_stack_len; 33 | // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, 34 | // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk 35 | // requires a 4th entry, rather than merging everything down to 1, because we 36 | // don't know whether more input is coming. This is different from how the 37 | // reference implementation does things. 
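// (With BLAKE3_MAX_DEPTH == 54, the stack below reserves 55 * 32 = 1760
// bytes: up to 54 merged subtree CVs plus one extra slot for the lazily
// pushed, still-unmerged CV.)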
38 | uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; 39 | } blake3_hasher; 40 | 41 | void blake3_hasher_init(blake3_hasher *self); 42 | void blake3_hasher_init_keyed(blake3_hasher *self, 43 | const uint8_t key[BLAKE3_KEY_LEN]); 44 | void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); 45 | void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, 46 | size_t context_len); 47 | void blake3_hasher_update(blake3_hasher *self, const void *input, 48 | size_t input_len); 49 | void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, 50 | size_t out_len); 51 | void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, 52 | uint8_t *out, size_t out_len); 53 | 54 | 55 | /* This is simply an alias for blake2b */ 56 | int blake3( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ); 57 | 58 | #ifdef __cplusplus 59 | } 60 | #endif 61 | 62 | #endif /* BLAKE3_H */ 63 | -------------------------------------------------------------------------------- /blake3_avx2.c: -------------------------------------------------------------------------------- 1 | #include "blake3_impl.h" 2 | 3 | #include 4 | 5 | #define DEGREE 8 6 | 7 | INLINE __m256i loadu(const uint8_t src[32]) { 8 | return _mm256_loadu_si256((const __m256i *)src); 9 | } 10 | 11 | INLINE void storeu(__m256i src, uint8_t dest[16]) { 12 | _mm256_storeu_si256((__m256i *)dest, src); 13 | } 14 | 15 | INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } 16 | 17 | // Note that clang-format doesn't like the name "xor" for some reason. 18 | INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } 19 | 20 | INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } 21 | 22 | INLINE __m256i rot16(__m256i x) { 23 | return _mm256_shuffle_epi8( 24 | x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 25 | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); 26 | } 27 | 28 | INLINE __m256i rot12(__m256i x) { 29 | return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)); 30 | } 31 | 32 | INLINE __m256i rot8(__m256i x) { 33 | return _mm256_shuffle_epi8( 34 | x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, 35 | 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); 36 | } 37 | 38 | INLINE __m256i rot7(__m256i x) { 39 | return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)); 40 | } 41 | 42 | INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { 43 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); 44 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); 45 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); 46 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); 47 | v[0] = addv(v[0], v[4]); 48 | v[1] = addv(v[1], v[5]); 49 | v[2] = addv(v[2], v[6]); 50 | v[3] = addv(v[3], v[7]); 51 | v[12] = xorv(v[12], v[0]); 52 | v[13] = xorv(v[13], v[1]); 53 | v[14] = xorv(v[14], v[2]); 54 | v[15] = xorv(v[15], v[3]); 55 | v[12] = rot16(v[12]); 56 | v[13] = rot16(v[13]); 57 | v[14] = rot16(v[14]); 58 | v[15] = rot16(v[15]); 59 | v[8] = addv(v[8], v[12]); 60 | v[9] = addv(v[9], v[13]); 61 | v[10] = addv(v[10], v[14]); 62 | v[11] = addv(v[11], v[15]); 63 | v[4] = xorv(v[4], v[8]); 64 | v[5] = xorv(v[5], v[9]); 65 | v[6] = xorv(v[6], v[10]); 66 | v[7] = xorv(v[7], v[11]); 67 | v[4] = rot12(v[4]); 68 | v[5] = rot12(v[5]); 69 | v[6] = rot12(v[6]); 70 | v[7] = rot12(v[7]); 71 | v[0] = addv(v[0], 
m[(size_t)MSG_SCHEDULE[r][1]]); 72 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); 73 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); 74 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); 75 | v[0] = addv(v[0], v[4]); 76 | v[1] = addv(v[1], v[5]); 77 | v[2] = addv(v[2], v[6]); 78 | v[3] = addv(v[3], v[7]); 79 | v[12] = xorv(v[12], v[0]); 80 | v[13] = xorv(v[13], v[1]); 81 | v[14] = xorv(v[14], v[2]); 82 | v[15] = xorv(v[15], v[3]); 83 | v[12] = rot8(v[12]); 84 | v[13] = rot8(v[13]); 85 | v[14] = rot8(v[14]); 86 | v[15] = rot8(v[15]); 87 | v[8] = addv(v[8], v[12]); 88 | v[9] = addv(v[9], v[13]); 89 | v[10] = addv(v[10], v[14]); 90 | v[11] = addv(v[11], v[15]); 91 | v[4] = xorv(v[4], v[8]); 92 | v[5] = xorv(v[5], v[9]); 93 | v[6] = xorv(v[6], v[10]); 94 | v[7] = xorv(v[7], v[11]); 95 | v[4] = rot7(v[4]); 96 | v[5] = rot7(v[5]); 97 | v[6] = rot7(v[6]); 98 | v[7] = rot7(v[7]); 99 | 100 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); 101 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); 102 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); 103 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); 104 | v[0] = addv(v[0], v[5]); 105 | v[1] = addv(v[1], v[6]); 106 | v[2] = addv(v[2], v[7]); 107 | v[3] = addv(v[3], v[4]); 108 | v[15] = xorv(v[15], v[0]); 109 | v[12] = xorv(v[12], v[1]); 110 | v[13] = xorv(v[13], v[2]); 111 | v[14] = xorv(v[14], v[3]); 112 | v[15] = rot16(v[15]); 113 | v[12] = rot16(v[12]); 114 | v[13] = rot16(v[13]); 115 | v[14] = rot16(v[14]); 116 | v[10] = addv(v[10], v[15]); 117 | v[11] = addv(v[11], v[12]); 118 | v[8] = addv(v[8], v[13]); 119 | v[9] = addv(v[9], v[14]); 120 | v[5] = xorv(v[5], v[10]); 121 | v[6] = xorv(v[6], v[11]); 122 | v[7] = xorv(v[7], v[8]); 123 | v[4] = xorv(v[4], v[9]); 124 | v[5] = rot12(v[5]); 125 | v[6] = rot12(v[6]); 126 | v[7] = rot12(v[7]); 127 | v[4] = rot12(v[4]); 128 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); 129 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); 130 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); 131 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); 132 | v[0] = addv(v[0], v[5]); 133 | v[1] = addv(v[1], v[6]); 134 | v[2] = addv(v[2], v[7]); 135 | v[3] = addv(v[3], v[4]); 136 | v[15] = xorv(v[15], v[0]); 137 | v[12] = xorv(v[12], v[1]); 138 | v[13] = xorv(v[13], v[2]); 139 | v[14] = xorv(v[14], v[3]); 140 | v[15] = rot8(v[15]); 141 | v[12] = rot8(v[12]); 142 | v[13] = rot8(v[13]); 143 | v[14] = rot8(v[14]); 144 | v[10] = addv(v[10], v[15]); 145 | v[11] = addv(v[11], v[12]); 146 | v[8] = addv(v[8], v[13]); 147 | v[9] = addv(v[9], v[14]); 148 | v[5] = xorv(v[5], v[10]); 149 | v[6] = xorv(v[6], v[11]); 150 | v[7] = xorv(v[7], v[8]); 151 | v[4] = xorv(v[4], v[9]); 152 | v[5] = rot7(v[5]); 153 | v[6] = rot7(v[6]); 154 | v[7] = rot7(v[7]); 155 | v[4] = rot7(v[4]); 156 | } 157 | 158 | INLINE void transpose_vecs(__m256i vecs[DEGREE]) { 159 | // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high 160 | // is 22/33/66/77. 161 | __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); 162 | __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); 163 | __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); 164 | __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); 165 | __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); 166 | __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); 167 | __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); 168 | __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); 169 | 170 | // Interleave 64-bit lates. 
The low unpack is lanes 00/22 and the high is 171 | // 11/33. 172 | __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); 173 | __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); 174 | __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); 175 | __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); 176 | __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); 177 | __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); 178 | __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); 179 | __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); 180 | 181 | // Interleave 128-bit lanes. 182 | vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); 183 | vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); 184 | vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); 185 | vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); 186 | vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); 187 | vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); 188 | vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); 189 | vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); 190 | } 191 | 192 | INLINE void transpose_msg_vecs(const uint8_t *const *inputs, 193 | size_t block_offset, __m256i out[16]) { 194 | out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]); 195 | out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]); 196 | out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]); 197 | out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]); 198 | out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]); 199 | out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]); 200 | out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]); 201 | out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]); 202 | out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]); 203 | out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]); 204 | out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]); 205 | out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]); 206 | out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]); 207 | out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]); 208 | out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); 209 | out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); 210 | for (size_t i = 0; i < 8; ++i) { 211 | _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); 212 | } 213 | transpose_vecs(&out[0]); 214 | transpose_vecs(&out[8]); 215 | } 216 | 217 | INLINE void load_counters(uint64_t counter, bool increment_counter, 218 | __m256i *out_lo, __m256i *out_hi) { 219 | const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter); 220 | const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); 221 | const __m256i add1 = _mm256_and_si256(mask, add0); 222 | __m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1); 223 | __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), 224 | _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); 225 | __m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry); 226 | *out_lo = l; 227 | *out_hi = h; 228 | } 229 | 230 | void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, 231 | const uint32_t key[8], uint64_t counter, 232 | bool increment_counter, uint8_t flags, 233 | uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 234 | __m256i h_vecs[8] = { 235 | set1(key[0]), 
set1(key[1]), set1(key[2]), set1(key[3]), 236 | set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), 237 | }; 238 | __m256i counter_low_vec, counter_high_vec; 239 | load_counters(counter, increment_counter, &counter_low_vec, 240 | &counter_high_vec); 241 | uint8_t block_flags = flags | flags_start; 242 | 243 | for (size_t block = 0; block < blocks; block++) { 244 | if (block + 1 == blocks) { 245 | block_flags |= flags_end; 246 | } 247 | __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN); 248 | __m256i block_flags_vec = set1(block_flags); 249 | __m256i msg_vecs[16]; 250 | transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); 251 | 252 | __m256i v[16] = { 253 | h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], 254 | h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], 255 | set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), 256 | counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, 257 | }; 258 | round_fn(v, msg_vecs, 0); 259 | round_fn(v, msg_vecs, 1); 260 | round_fn(v, msg_vecs, 2); 261 | round_fn(v, msg_vecs, 3); 262 | round_fn(v, msg_vecs, 4); 263 | round_fn(v, msg_vecs, 5); 264 | round_fn(v, msg_vecs, 6); 265 | h_vecs[0] = xorv(v[0], v[8]); 266 | h_vecs[1] = xorv(v[1], v[9]); 267 | h_vecs[2] = xorv(v[2], v[10]); 268 | h_vecs[3] = xorv(v[3], v[11]); 269 | h_vecs[4] = xorv(v[4], v[12]); 270 | h_vecs[5] = xorv(v[5], v[13]); 271 | h_vecs[6] = xorv(v[6], v[14]); 272 | h_vecs[7] = xorv(v[7], v[15]); 273 | 274 | block_flags = flags; 275 | } 276 | 277 | transpose_vecs(h_vecs); 278 | storeu(h_vecs[0], &out[0 * sizeof(__m256i)]); 279 | storeu(h_vecs[1], &out[1 * sizeof(__m256i)]); 280 | storeu(h_vecs[2], &out[2 * sizeof(__m256i)]); 281 | storeu(h_vecs[3], &out[3 * sizeof(__m256i)]); 282 | storeu(h_vecs[4], &out[4 * sizeof(__m256i)]); 283 | storeu(h_vecs[5], &out[5 * sizeof(__m256i)]); 284 | storeu(h_vecs[6], &out[6 * sizeof(__m256i)]); 285 | storeu(h_vecs[7], &out[7 * sizeof(__m256i)]); 286 | } 287 | 288 | #if !defined(BLAKE3_NO_SSE41) 289 | void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, 290 | size_t blocks, const uint32_t key[8], 291 | uint64_t counter, bool increment_counter, 292 | uint8_t flags, uint8_t flags_start, 293 | uint8_t flags_end, uint8_t *out); 294 | #else 295 | void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, 296 | size_t blocks, const uint32_t key[8], 297 | uint64_t counter, bool increment_counter, 298 | uint8_t flags, uint8_t flags_start, 299 | uint8_t flags_end, uint8_t *out); 300 | #endif 301 | 302 | void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, 303 | size_t blocks, const uint32_t key[8], 304 | uint64_t counter, bool increment_counter, 305 | uint8_t flags, uint8_t flags_start, 306 | uint8_t flags_end, uint8_t *out) { 307 | while (num_inputs >= DEGREE) { 308 | blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, 309 | flags_start, flags_end, out); 310 | if (increment_counter) { 311 | counter += DEGREE; 312 | } 313 | inputs += DEGREE; 314 | num_inputs -= DEGREE; 315 | out = &out[DEGREE * BLAKE3_OUT_LEN]; 316 | } 317 | #if !defined(BLAKE3_NO_SSE41) 318 | blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, 319 | increment_counter, flags, flags_start, flags_end, out); 320 | #else 321 | blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, 322 | increment_counter, flags, flags_start, flags_end, 323 | out); 324 | #endif 325 | } 326 | -------------------------------------------------------------------------------- /blake3_avx512.c: 
-------------------------------------------------------------------------------- 1 | #include "blake3_impl.h" 2 | 3 | #include 4 | 5 | #define _mm_shuffle_ps2(a, b, c) \ 6 | (_mm_castps_si128( \ 7 | _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) 8 | 9 | INLINE __m128i loadu_128(const uint8_t src[16]) { 10 | return _mm_loadu_si128((const __m128i *)src); 11 | } 12 | 13 | INLINE __m256i loadu_256(const uint8_t src[32]) { 14 | return _mm256_loadu_si256((const __m256i *)src); 15 | } 16 | 17 | INLINE __m512i loadu_512(const uint8_t src[64]) { 18 | return _mm512_loadu_si512((const __m512i *)src); 19 | } 20 | 21 | INLINE void storeu_128(__m128i src, uint8_t dest[16]) { 22 | _mm_storeu_si128((__m128i *)dest, src); 23 | } 24 | 25 | INLINE void storeu_256(__m256i src, uint8_t dest[16]) { 26 | _mm256_storeu_si256((__m256i *)dest, src); 27 | } 28 | 29 | INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } 30 | 31 | INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } 32 | 33 | INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); } 34 | 35 | INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } 36 | 37 | INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } 38 | 39 | INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); } 40 | 41 | INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); } 42 | 43 | INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } 44 | 45 | INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); } 46 | 47 | INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { 48 | return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); 49 | } 50 | 51 | INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); } 52 | 53 | INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); } 54 | 55 | INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); } 56 | 57 | INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); } 58 | 59 | INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); } 60 | 61 | INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); } 62 | 63 | INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); } 64 | 65 | INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); } 66 | 67 | INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); } 68 | 69 | INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); } 70 | 71 | INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); } 72 | 73 | INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); } 74 | 75 | /* 76 | * ---------------------------------------------------------------------------- 77 | * compress_avx512 78 | * ---------------------------------------------------------------------------- 79 | */ 80 | 81 | INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, 82 | __m128i m) { 83 | *row0 = add_128(add_128(*row0, m), *row1); 84 | *row3 = xor_128(*row3, *row0); 85 | *row3 = rot16_128(*row3); 86 | *row2 = add_128(*row2, *row3); 87 | *row1 = xor_128(*row1, *row2); 88 | *row1 = rot12_128(*row1); 89 | } 90 | 91 | INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, 92 | __m128i m) { 93 | *row0 = add_128(add_128(*row0, m), *row1); 94 | *row3 = xor_128(*row3, *row0); 95 | *row3 = rot8_128(*row3); 96 | *row2 = 
add_128(*row2, *row3); 97 | *row1 = xor_128(*row1, *row2); 98 | *row1 = rot7_128(*row1); 99 | } 100 | 101 | // Note the optimization here of leaving row1 as the unrotated row, rather than 102 | // row0. All the message loads below are adjusted to compensate for this. See 103 | // discussion at https://github.com/sneves/blake2-avx2/pull/4 104 | INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { 105 | *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); 106 | *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); 107 | *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); 108 | } 109 | 110 | INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { 111 | *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); 112 | *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); 113 | *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); 114 | } 115 | 116 | INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], 117 | const uint8_t block[BLAKE3_BLOCK_LEN], 118 | uint8_t block_len, uint64_t counter, uint8_t flags) { 119 | rows[0] = loadu_128((uint8_t *)&cv[0]); 120 | rows[1] = loadu_128((uint8_t *)&cv[4]); 121 | rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); 122 | rows[3] = set4(counter_low(counter), counter_high(counter), 123 | (uint32_t)block_len, (uint32_t)flags); 124 | 125 | __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]); 126 | __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]); 127 | __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]); 128 | __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); 129 | 130 | __m128i t0, t1, t2, t3, tt; 131 | 132 | // Round 1. The first round permutes the message words from the original 133 | // input order, into the groups that get mixed in parallel. 134 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 135 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 136 | t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 137 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 138 | diagonalize(&rows[0], &rows[2], &rows[3]); 139 | t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 140 | t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 141 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 142 | t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 143 | t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 144 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 145 | undiagonalize(&rows[0], &rows[2], &rows[3]); 146 | m0 = t0; 147 | m1 = t1; 148 | m2 = t2; 149 | m3 = t3; 150 | 151 | // Round 2. This round and all following rounds apply a fixed permutation 152 | // to the message words from the round before. 
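// (Rounds 2 through 7 below are textually identical: each block applies one
// step of the BLAKE3 message permutation via the same shuffle/blend
// sequence, then mixes columns and diagonals with g1/g2.)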
153 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 154 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 155 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 156 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 157 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 158 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 159 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 160 | diagonalize(&rows[0], &rows[2], &rows[3]); 161 | t2 = _mm_unpacklo_epi64(m3, m1); 162 | tt = _mm_blend_epi16(t2, m2, 0xC0); 163 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 164 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 165 | t3 = _mm_unpackhi_epi32(m1, m3); 166 | tt = _mm_unpacklo_epi32(m2, t3); 167 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 168 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 169 | undiagonalize(&rows[0], &rows[2], &rows[3]); 170 | m0 = t0; 171 | m1 = t1; 172 | m2 = t2; 173 | m3 = t3; 174 | 175 | // Round 3 176 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 177 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 178 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 179 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 180 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 181 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 182 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 183 | diagonalize(&rows[0], &rows[2], &rows[3]); 184 | t2 = _mm_unpacklo_epi64(m3, m1); 185 | tt = _mm_blend_epi16(t2, m2, 0xC0); 186 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 187 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 188 | t3 = _mm_unpackhi_epi32(m1, m3); 189 | tt = _mm_unpacklo_epi32(m2, t3); 190 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 191 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 192 | undiagonalize(&rows[0], &rows[2], &rows[3]); 193 | m0 = t0; 194 | m1 = t1; 195 | m2 = t2; 196 | m3 = t3; 197 | 198 | // Round 4 199 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 200 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 201 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 202 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 203 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 204 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 205 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 206 | diagonalize(&rows[0], &rows[2], &rows[3]); 207 | t2 = _mm_unpacklo_epi64(m3, m1); 208 | tt = _mm_blend_epi16(t2, m2, 0xC0); 209 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 210 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 211 | t3 = _mm_unpackhi_epi32(m1, m3); 212 | tt = _mm_unpacklo_epi32(m2, t3); 213 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 214 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 215 | undiagonalize(&rows[0], &rows[2], &rows[3]); 216 | m0 = t0; 217 | m1 = t1; 218 | m2 = t2; 219 | m3 = t3; 220 | 221 | // Round 5 222 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 223 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 224 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 225 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 226 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 227 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 228 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 229 | diagonalize(&rows[0], &rows[2], &rows[3]); 230 | t2 = _mm_unpacklo_epi64(m3, m1); 231 | tt = _mm_blend_epi16(t2, m2, 0xC0); 232 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 233 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 234 | t3 = _mm_unpackhi_epi32(m1, m3); 235 | tt 
= _mm_unpacklo_epi32(m2, t3); 236 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 237 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 238 | undiagonalize(&rows[0], &rows[2], &rows[3]); 239 | m0 = t0; 240 | m1 = t1; 241 | m2 = t2; 242 | m3 = t3; 243 | 244 | // Round 6 245 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 246 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 247 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 248 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 249 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 250 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 251 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 252 | diagonalize(&rows[0], &rows[2], &rows[3]); 253 | t2 = _mm_unpacklo_epi64(m3, m1); 254 | tt = _mm_blend_epi16(t2, m2, 0xC0); 255 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 256 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 257 | t3 = _mm_unpackhi_epi32(m1, m3); 258 | tt = _mm_unpacklo_epi32(m2, t3); 259 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 260 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 261 | undiagonalize(&rows[0], &rows[2], &rows[3]); 262 | m0 = t0; 263 | m1 = t1; 264 | m2 = t2; 265 | m3 = t3; 266 | 267 | // Round 7 268 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 269 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 270 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 271 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 272 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 273 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 274 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 275 | diagonalize(&rows[0], &rows[2], &rows[3]); 276 | t2 = _mm_unpacklo_epi64(m3, m1); 277 | tt = _mm_blend_epi16(t2, m2, 0xC0); 278 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 279 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 280 | t3 = _mm_unpackhi_epi32(m1, m3); 281 | tt = _mm_unpacklo_epi32(m2, t3); 282 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 283 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 284 | undiagonalize(&rows[0], &rows[2], &rows[3]); 285 | } 286 | 287 | void blake3_compress_xof_avx512(const uint32_t cv[8], 288 | const uint8_t block[BLAKE3_BLOCK_LEN], 289 | uint8_t block_len, uint64_t counter, 290 | uint8_t flags, uint8_t out[64]) { 291 | __m128i rows[4]; 292 | compress_pre(rows, cv, block, block_len, counter, flags); 293 | storeu_128(xor_128(rows[0], rows[2]), &out[0]); 294 | storeu_128(xor_128(rows[1], rows[3]), &out[16]); 295 | storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); 296 | storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); 297 | } 298 | 299 | void blake3_compress_in_place_avx512(uint32_t cv[8], 300 | const uint8_t block[BLAKE3_BLOCK_LEN], 301 | uint8_t block_len, uint64_t counter, 302 | uint8_t flags) { 303 | __m128i rows[4]; 304 | compress_pre(rows, cv, block, block_len, counter, flags); 305 | storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); 306 | storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); 307 | } 308 | 309 | /* 310 | * ---------------------------------------------------------------------------- 311 | * hash4_avx512 312 | * ---------------------------------------------------------------------------- 313 | */ 314 | 315 | INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) { 316 | v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); 317 | v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); 318 | v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); 319 | v[3] = 
add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); 320 | v[0] = add_128(v[0], v[4]); 321 | v[1] = add_128(v[1], v[5]); 322 | v[2] = add_128(v[2], v[6]); 323 | v[3] = add_128(v[3], v[7]); 324 | v[12] = xor_128(v[12], v[0]); 325 | v[13] = xor_128(v[13], v[1]); 326 | v[14] = xor_128(v[14], v[2]); 327 | v[15] = xor_128(v[15], v[3]); 328 | v[12] = rot16_128(v[12]); 329 | v[13] = rot16_128(v[13]); 330 | v[14] = rot16_128(v[14]); 331 | v[15] = rot16_128(v[15]); 332 | v[8] = add_128(v[8], v[12]); 333 | v[9] = add_128(v[9], v[13]); 334 | v[10] = add_128(v[10], v[14]); 335 | v[11] = add_128(v[11], v[15]); 336 | v[4] = xor_128(v[4], v[8]); 337 | v[5] = xor_128(v[5], v[9]); 338 | v[6] = xor_128(v[6], v[10]); 339 | v[7] = xor_128(v[7], v[11]); 340 | v[4] = rot12_128(v[4]); 341 | v[5] = rot12_128(v[5]); 342 | v[6] = rot12_128(v[6]); 343 | v[7] = rot12_128(v[7]); 344 | v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); 345 | v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); 346 | v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); 347 | v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); 348 | v[0] = add_128(v[0], v[4]); 349 | v[1] = add_128(v[1], v[5]); 350 | v[2] = add_128(v[2], v[6]); 351 | v[3] = add_128(v[3], v[7]); 352 | v[12] = xor_128(v[12], v[0]); 353 | v[13] = xor_128(v[13], v[1]); 354 | v[14] = xor_128(v[14], v[2]); 355 | v[15] = xor_128(v[15], v[3]); 356 | v[12] = rot8_128(v[12]); 357 | v[13] = rot8_128(v[13]); 358 | v[14] = rot8_128(v[14]); 359 | v[15] = rot8_128(v[15]); 360 | v[8] = add_128(v[8], v[12]); 361 | v[9] = add_128(v[9], v[13]); 362 | v[10] = add_128(v[10], v[14]); 363 | v[11] = add_128(v[11], v[15]); 364 | v[4] = xor_128(v[4], v[8]); 365 | v[5] = xor_128(v[5], v[9]); 366 | v[6] = xor_128(v[6], v[10]); 367 | v[7] = xor_128(v[7], v[11]); 368 | v[4] = rot7_128(v[4]); 369 | v[5] = rot7_128(v[5]); 370 | v[6] = rot7_128(v[6]); 371 | v[7] = rot7_128(v[7]); 372 | 373 | v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); 374 | v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); 375 | v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); 376 | v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); 377 | v[0] = add_128(v[0], v[5]); 378 | v[1] = add_128(v[1], v[6]); 379 | v[2] = add_128(v[2], v[7]); 380 | v[3] = add_128(v[3], v[4]); 381 | v[15] = xor_128(v[15], v[0]); 382 | v[12] = xor_128(v[12], v[1]); 383 | v[13] = xor_128(v[13], v[2]); 384 | v[14] = xor_128(v[14], v[3]); 385 | v[15] = rot16_128(v[15]); 386 | v[12] = rot16_128(v[12]); 387 | v[13] = rot16_128(v[13]); 388 | v[14] = rot16_128(v[14]); 389 | v[10] = add_128(v[10], v[15]); 390 | v[11] = add_128(v[11], v[12]); 391 | v[8] = add_128(v[8], v[13]); 392 | v[9] = add_128(v[9], v[14]); 393 | v[5] = xor_128(v[5], v[10]); 394 | v[6] = xor_128(v[6], v[11]); 395 | v[7] = xor_128(v[7], v[8]); 396 | v[4] = xor_128(v[4], v[9]); 397 | v[5] = rot12_128(v[5]); 398 | v[6] = rot12_128(v[6]); 399 | v[7] = rot12_128(v[7]); 400 | v[4] = rot12_128(v[4]); 401 | v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); 402 | v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); 403 | v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); 404 | v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); 405 | v[0] = add_128(v[0], v[5]); 406 | v[1] = add_128(v[1], v[6]); 407 | v[2] = add_128(v[2], v[7]); 408 | v[3] = add_128(v[3], v[4]); 409 | v[15] = xor_128(v[15], v[0]); 410 | v[12] = xor_128(v[12], v[1]); 411 | v[13] = xor_128(v[13], v[2]); 412 | v[14] = xor_128(v[14], v[3]); 413 | v[15] = rot8_128(v[15]); 414 | v[12] = rot8_128(v[12]); 415 | v[13] = 
rot8_128(v[13]); 416 | v[14] = rot8_128(v[14]); 417 | v[10] = add_128(v[10], v[15]); 418 | v[11] = add_128(v[11], v[12]); 419 | v[8] = add_128(v[8], v[13]); 420 | v[9] = add_128(v[9], v[14]); 421 | v[5] = xor_128(v[5], v[10]); 422 | v[6] = xor_128(v[6], v[11]); 423 | v[7] = xor_128(v[7], v[8]); 424 | v[4] = xor_128(v[4], v[9]); 425 | v[5] = rot7_128(v[5]); 426 | v[6] = rot7_128(v[6]); 427 | v[7] = rot7_128(v[7]); 428 | v[4] = rot7_128(v[4]); 429 | } 430 | 431 | INLINE void transpose_vecs_128(__m128i vecs[4]) { 432 | // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is 433 | // 22/33. Note that this doesn't split the vector into two lanes, as the 434 | // AVX2 counterparts do. 435 | __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); 436 | __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); 437 | __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); 438 | __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); 439 | 440 | // Interleave 64-bit lanes. 441 | __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); 442 | __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); 443 | __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); 444 | __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); 445 | 446 | vecs[0] = abcd_0; 447 | vecs[1] = abcd_1; 448 | vecs[2] = abcd_2; 449 | vecs[3] = abcd_3; 450 | } 451 | 452 | INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, 453 | size_t block_offset, __m128i out[16]) { 454 | out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]); 455 | out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]); 456 | out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]); 457 | out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]); 458 | out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]); 459 | out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]); 460 | out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]); 461 | out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]); 462 | out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]); 463 | out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]); 464 | out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]); 465 | out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]); 466 | out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]); 467 | out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]); 468 | out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]); 469 | out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]); 470 | for (size_t i = 0; i < 4; ++i) { 471 | _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); 472 | } 473 | transpose_vecs_128(&out[0]); 474 | transpose_vecs_128(&out[4]); 475 | transpose_vecs_128(&out[8]); 476 | transpose_vecs_128(&out[12]); 477 | } 478 | 479 | INLINE void load_counters4(uint64_t counter, bool increment_counter, 480 | __m128i *out_lo, __m128i *out_hi) { 481 | uint64_t mask = (increment_counter ? 
~0 : 0); 482 | __m256i mask_vec = _mm256_set1_epi64x(mask); 483 | __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3); 484 | deltas = _mm256_and_si256(mask_vec, deltas); 485 | __m256i counters = 486 | _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas); 487 | *out_lo = _mm256_cvtepi64_epi32(counters); 488 | *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32)); 489 | } 490 | 491 | void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, 492 | const uint32_t key[8], uint64_t counter, 493 | bool increment_counter, uint8_t flags, 494 | uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 495 | __m128i h_vecs[8] = { 496 | set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), 497 | set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), 498 | }; 499 | __m128i counter_low_vec, counter_high_vec; 500 | load_counters4(counter, increment_counter, &counter_low_vec, 501 | &counter_high_vec); 502 | uint8_t block_flags = flags | flags_start; 503 | 504 | for (size_t block = 0; block < blocks; block++) { 505 | if (block + 1 == blocks) { 506 | block_flags |= flags_end; 507 | } 508 | __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN); 509 | __m128i block_flags_vec = set1_128(block_flags); 510 | __m128i msg_vecs[16]; 511 | transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); 512 | 513 | __m128i v[16] = { 514 | h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], 515 | h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], 516 | set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), 517 | counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, 518 | }; 519 | round_fn4(v, msg_vecs, 0); 520 | round_fn4(v, msg_vecs, 1); 521 | round_fn4(v, msg_vecs, 2); 522 | round_fn4(v, msg_vecs, 3); 523 | round_fn4(v, msg_vecs, 4); 524 | round_fn4(v, msg_vecs, 5); 525 | round_fn4(v, msg_vecs, 6); 526 | h_vecs[0] = xor_128(v[0], v[8]); 527 | h_vecs[1] = xor_128(v[1], v[9]); 528 | h_vecs[2] = xor_128(v[2], v[10]); 529 | h_vecs[3] = xor_128(v[3], v[11]); 530 | h_vecs[4] = xor_128(v[4], v[12]); 531 | h_vecs[5] = xor_128(v[5], v[13]); 532 | h_vecs[6] = xor_128(v[6], v[14]); 533 | h_vecs[7] = xor_128(v[7], v[15]); 534 | 535 | block_flags = flags; 536 | } 537 | 538 | transpose_vecs_128(&h_vecs[0]); 539 | transpose_vecs_128(&h_vecs[4]); 540 | // The first four vecs now contain the first half of each output, and the 541 | // second four vecs contain the second half of each output. 
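// (After the two 4x4 transposes, h_vecs[i] holds words 0..3 of output i and
// h_vecs[i + 4] holds words 4..7 of output i, so the interleaved stores below
// write each 32-byte chaining value as one contiguous run of out.)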
542 | storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]); 543 | storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]); 544 | storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]); 545 | storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]); 546 | storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]); 547 | storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]); 548 | storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]); 549 | storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]); 550 | } 551 | 552 | /* 553 | * ---------------------------------------------------------------------------- 554 | * hash8_avx512 555 | * ---------------------------------------------------------------------------- 556 | */ 557 | 558 | INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) { 559 | v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); 560 | v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); 561 | v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); 562 | v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); 563 | v[0] = add_256(v[0], v[4]); 564 | v[1] = add_256(v[1], v[5]); 565 | v[2] = add_256(v[2], v[6]); 566 | v[3] = add_256(v[3], v[7]); 567 | v[12] = xor_256(v[12], v[0]); 568 | v[13] = xor_256(v[13], v[1]); 569 | v[14] = xor_256(v[14], v[2]); 570 | v[15] = xor_256(v[15], v[3]); 571 | v[12] = rot16_256(v[12]); 572 | v[13] = rot16_256(v[13]); 573 | v[14] = rot16_256(v[14]); 574 | v[15] = rot16_256(v[15]); 575 | v[8] = add_256(v[8], v[12]); 576 | v[9] = add_256(v[9], v[13]); 577 | v[10] = add_256(v[10], v[14]); 578 | v[11] = add_256(v[11], v[15]); 579 | v[4] = xor_256(v[4], v[8]); 580 | v[5] = xor_256(v[5], v[9]); 581 | v[6] = xor_256(v[6], v[10]); 582 | v[7] = xor_256(v[7], v[11]); 583 | v[4] = rot12_256(v[4]); 584 | v[5] = rot12_256(v[5]); 585 | v[6] = rot12_256(v[6]); 586 | v[7] = rot12_256(v[7]); 587 | v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); 588 | v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); 589 | v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); 590 | v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); 591 | v[0] = add_256(v[0], v[4]); 592 | v[1] = add_256(v[1], v[5]); 593 | v[2] = add_256(v[2], v[6]); 594 | v[3] = add_256(v[3], v[7]); 595 | v[12] = xor_256(v[12], v[0]); 596 | v[13] = xor_256(v[13], v[1]); 597 | v[14] = xor_256(v[14], v[2]); 598 | v[15] = xor_256(v[15], v[3]); 599 | v[12] = rot8_256(v[12]); 600 | v[13] = rot8_256(v[13]); 601 | v[14] = rot8_256(v[14]); 602 | v[15] = rot8_256(v[15]); 603 | v[8] = add_256(v[8], v[12]); 604 | v[9] = add_256(v[9], v[13]); 605 | v[10] = add_256(v[10], v[14]); 606 | v[11] = add_256(v[11], v[15]); 607 | v[4] = xor_256(v[4], v[8]); 608 | v[5] = xor_256(v[5], v[9]); 609 | v[6] = xor_256(v[6], v[10]); 610 | v[7] = xor_256(v[7], v[11]); 611 | v[4] = rot7_256(v[4]); 612 | v[5] = rot7_256(v[5]); 613 | v[6] = rot7_256(v[6]); 614 | v[7] = rot7_256(v[7]); 615 | 616 | v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); 617 | v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); 618 | v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); 619 | v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); 620 | v[0] = add_256(v[0], v[5]); 621 | v[1] = add_256(v[1], v[6]); 622 | v[2] = add_256(v[2], v[7]); 623 | v[3] = add_256(v[3], v[4]); 624 | v[15] = xor_256(v[15], v[0]); 625 | v[12] = xor_256(v[12], v[1]); 626 | v[13] = xor_256(v[13], v[2]); 627 | v[14] = xor_256(v[14], v[3]); 628 | v[15] = rot16_256(v[15]); 629 | v[12] = rot16_256(v[12]); 630 | v[13] = rot16_256(v[13]); 631 | v[14] = rot16_256(v[14]); 632 | v[10] = add_256(v[10], v[15]); 633 | v[11] = add_256(v[11], 
v[12]); 634 | v[8] = add_256(v[8], v[13]); 635 | v[9] = add_256(v[9], v[14]); 636 | v[5] = xor_256(v[5], v[10]); 637 | v[6] = xor_256(v[6], v[11]); 638 | v[7] = xor_256(v[7], v[8]); 639 | v[4] = xor_256(v[4], v[9]); 640 | v[5] = rot12_256(v[5]); 641 | v[6] = rot12_256(v[6]); 642 | v[7] = rot12_256(v[7]); 643 | v[4] = rot12_256(v[4]); 644 | v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); 645 | v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); 646 | v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); 647 | v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); 648 | v[0] = add_256(v[0], v[5]); 649 | v[1] = add_256(v[1], v[6]); 650 | v[2] = add_256(v[2], v[7]); 651 | v[3] = add_256(v[3], v[4]); 652 | v[15] = xor_256(v[15], v[0]); 653 | v[12] = xor_256(v[12], v[1]); 654 | v[13] = xor_256(v[13], v[2]); 655 | v[14] = xor_256(v[14], v[3]); 656 | v[15] = rot8_256(v[15]); 657 | v[12] = rot8_256(v[12]); 658 | v[13] = rot8_256(v[13]); 659 | v[14] = rot8_256(v[14]); 660 | v[10] = add_256(v[10], v[15]); 661 | v[11] = add_256(v[11], v[12]); 662 | v[8] = add_256(v[8], v[13]); 663 | v[9] = add_256(v[9], v[14]); 664 | v[5] = xor_256(v[5], v[10]); 665 | v[6] = xor_256(v[6], v[11]); 666 | v[7] = xor_256(v[7], v[8]); 667 | v[4] = xor_256(v[4], v[9]); 668 | v[5] = rot7_256(v[5]); 669 | v[6] = rot7_256(v[6]); 670 | v[7] = rot7_256(v[7]); 671 | v[4] = rot7_256(v[4]); 672 | } 673 | 674 | INLINE void transpose_vecs_256(__m256i vecs[8]) { 675 | // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high 676 | // is 22/33/66/77. 677 | __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); 678 | __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); 679 | __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); 680 | __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); 681 | __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); 682 | __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); 683 | __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); 684 | __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); 685 | 686 | // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is 687 | // 11/33. 688 | __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); 689 | __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); 690 | __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); 691 | __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); 692 | __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); 693 | __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); 694 | __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); 695 | __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); 696 | 697 | // Interleave 128-bit lanes. 
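// (_mm256_permute2x128_si256 with immediate 0x20 puts a's low 128-bit lane in
// the low half of the result and b's low lane in the high half; 0x31 does the
// same with the two high lanes, pairing the abcd_* and efgh_* halves below.)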
698 | vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); 699 | vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); 700 | vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); 701 | vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); 702 | vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); 703 | vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); 704 | vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); 705 | vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); 706 | } 707 | 708 | INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, 709 | size_t block_offset, __m256i out[16]) { 710 | out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]); 711 | out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]); 712 | out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]); 713 | out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]); 714 | out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]); 715 | out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]); 716 | out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]); 717 | out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]); 718 | out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]); 719 | out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]); 720 | out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]); 721 | out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]); 722 | out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]); 723 | out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]); 724 | out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]); 725 | out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]); 726 | for (size_t i = 0; i < 8; ++i) { 727 | _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); 728 | } 729 | transpose_vecs_256(&out[0]); 730 | transpose_vecs_256(&out[8]); 731 | } 732 | 733 | INLINE void load_counters8(uint64_t counter, bool increment_counter, 734 | __m256i *out_lo, __m256i *out_hi) { 735 | uint64_t mask = (increment_counter ? 
~0 : 0); 736 | __m512i mask_vec = _mm512_set1_epi64(mask); 737 | __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); 738 | deltas = _mm512_and_si512(mask_vec, deltas); 739 | __m512i counters = 740 | _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas); 741 | *out_lo = _mm512_cvtepi64_epi32(counters); 742 | *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32)); 743 | } 744 | 745 | void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, 746 | const uint32_t key[8], uint64_t counter, 747 | bool increment_counter, uint8_t flags, 748 | uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 749 | __m256i h_vecs[8] = { 750 | set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), 751 | set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), 752 | }; 753 | __m256i counter_low_vec, counter_high_vec; 754 | load_counters8(counter, increment_counter, &counter_low_vec, 755 | &counter_high_vec); 756 | uint8_t block_flags = flags | flags_start; 757 | 758 | for (size_t block = 0; block < blocks; block++) { 759 | if (block + 1 == blocks) { 760 | block_flags |= flags_end; 761 | } 762 | __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN); 763 | __m256i block_flags_vec = set1_256(block_flags); 764 | __m256i msg_vecs[16]; 765 | transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); 766 | 767 | __m256i v[16] = { 768 | h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], 769 | h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], 770 | set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), 771 | counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, 772 | }; 773 | round_fn8(v, msg_vecs, 0); 774 | round_fn8(v, msg_vecs, 1); 775 | round_fn8(v, msg_vecs, 2); 776 | round_fn8(v, msg_vecs, 3); 777 | round_fn8(v, msg_vecs, 4); 778 | round_fn8(v, msg_vecs, 5); 779 | round_fn8(v, msg_vecs, 6); 780 | h_vecs[0] = xor_256(v[0], v[8]); 781 | h_vecs[1] = xor_256(v[1], v[9]); 782 | h_vecs[2] = xor_256(v[2], v[10]); 783 | h_vecs[3] = xor_256(v[3], v[11]); 784 | h_vecs[4] = xor_256(v[4], v[12]); 785 | h_vecs[5] = xor_256(v[5], v[13]); 786 | h_vecs[6] = xor_256(v[6], v[14]); 787 | h_vecs[7] = xor_256(v[7], v[15]); 788 | 789 | block_flags = flags; 790 | } 791 | 792 | transpose_vecs_256(h_vecs); 793 | storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]); 794 | storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]); 795 | storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]); 796 | storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]); 797 | storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]); 798 | storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]); 799 | storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]); 800 | storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]); 801 | } 802 | 803 | /* 804 | * ---------------------------------------------------------------------------- 805 | * hash16_avx512 806 | * ---------------------------------------------------------------------------- 807 | */ 808 | 809 | INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) { 810 | v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); 811 | v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); 812 | v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); 813 | v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); 814 | v[0] = add_512(v[0], v[4]); 815 | v[1] = add_512(v[1], v[5]); 816 | v[2] = add_512(v[2], v[6]); 817 | v[3] = add_512(v[3], v[7]); 818 | v[12] = xor_512(v[12], v[0]); 819 | v[13] = xor_512(v[13], v[1]); 820 | v[14] = xor_512(v[14], v[2]); 821 | v[15] = xor_512(v[15], v[3]); 822 | 
v[12] = rot16_512(v[12]); 823 | v[13] = rot16_512(v[13]); 824 | v[14] = rot16_512(v[14]); 825 | v[15] = rot16_512(v[15]); 826 | v[8] = add_512(v[8], v[12]); 827 | v[9] = add_512(v[9], v[13]); 828 | v[10] = add_512(v[10], v[14]); 829 | v[11] = add_512(v[11], v[15]); 830 | v[4] = xor_512(v[4], v[8]); 831 | v[5] = xor_512(v[5], v[9]); 832 | v[6] = xor_512(v[6], v[10]); 833 | v[7] = xor_512(v[7], v[11]); 834 | v[4] = rot12_512(v[4]); 835 | v[5] = rot12_512(v[5]); 836 | v[6] = rot12_512(v[6]); 837 | v[7] = rot12_512(v[7]); 838 | v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); 839 | v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); 840 | v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); 841 | v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); 842 | v[0] = add_512(v[0], v[4]); 843 | v[1] = add_512(v[1], v[5]); 844 | v[2] = add_512(v[2], v[6]); 845 | v[3] = add_512(v[3], v[7]); 846 | v[12] = xor_512(v[12], v[0]); 847 | v[13] = xor_512(v[13], v[1]); 848 | v[14] = xor_512(v[14], v[2]); 849 | v[15] = xor_512(v[15], v[3]); 850 | v[12] = rot8_512(v[12]); 851 | v[13] = rot8_512(v[13]); 852 | v[14] = rot8_512(v[14]); 853 | v[15] = rot8_512(v[15]); 854 | v[8] = add_512(v[8], v[12]); 855 | v[9] = add_512(v[9], v[13]); 856 | v[10] = add_512(v[10], v[14]); 857 | v[11] = add_512(v[11], v[15]); 858 | v[4] = xor_512(v[4], v[8]); 859 | v[5] = xor_512(v[5], v[9]); 860 | v[6] = xor_512(v[6], v[10]); 861 | v[7] = xor_512(v[7], v[11]); 862 | v[4] = rot7_512(v[4]); 863 | v[5] = rot7_512(v[5]); 864 | v[6] = rot7_512(v[6]); 865 | v[7] = rot7_512(v[7]); 866 | 867 | v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); 868 | v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); 869 | v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); 870 | v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); 871 | v[0] = add_512(v[0], v[5]); 872 | v[1] = add_512(v[1], v[6]); 873 | v[2] = add_512(v[2], v[7]); 874 | v[3] = add_512(v[3], v[4]); 875 | v[15] = xor_512(v[15], v[0]); 876 | v[12] = xor_512(v[12], v[1]); 877 | v[13] = xor_512(v[13], v[2]); 878 | v[14] = xor_512(v[14], v[3]); 879 | v[15] = rot16_512(v[15]); 880 | v[12] = rot16_512(v[12]); 881 | v[13] = rot16_512(v[13]); 882 | v[14] = rot16_512(v[14]); 883 | v[10] = add_512(v[10], v[15]); 884 | v[11] = add_512(v[11], v[12]); 885 | v[8] = add_512(v[8], v[13]); 886 | v[9] = add_512(v[9], v[14]); 887 | v[5] = xor_512(v[5], v[10]); 888 | v[6] = xor_512(v[6], v[11]); 889 | v[7] = xor_512(v[7], v[8]); 890 | v[4] = xor_512(v[4], v[9]); 891 | v[5] = rot12_512(v[5]); 892 | v[6] = rot12_512(v[6]); 893 | v[7] = rot12_512(v[7]); 894 | v[4] = rot12_512(v[4]); 895 | v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); 896 | v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); 897 | v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); 898 | v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); 899 | v[0] = add_512(v[0], v[5]); 900 | v[1] = add_512(v[1], v[6]); 901 | v[2] = add_512(v[2], v[7]); 902 | v[3] = add_512(v[3], v[4]); 903 | v[15] = xor_512(v[15], v[0]); 904 | v[12] = xor_512(v[12], v[1]); 905 | v[13] = xor_512(v[13], v[2]); 906 | v[14] = xor_512(v[14], v[3]); 907 | v[15] = rot8_512(v[15]); 908 | v[12] = rot8_512(v[12]); 909 | v[13] = rot8_512(v[13]); 910 | v[14] = rot8_512(v[14]); 911 | v[10] = add_512(v[10], v[15]); 912 | v[11] = add_512(v[11], v[12]); 913 | v[8] = add_512(v[8], v[13]); 914 | v[9] = add_512(v[9], v[14]); 915 | v[5] = xor_512(v[5], v[10]); 916 | v[6] = xor_512(v[6], v[11]); 917 | v[7] = xor_512(v[7], v[8]); 918 | v[4] = xor_512(v[4], v[9]); 919 | v[5] = 
rot7_512(v[5]); 920 | v[6] = rot7_512(v[6]); 921 | v[7] = rot7_512(v[7]); 922 | v[4] = rot7_512(v[4]); 923 | } 924 | 925 | // 0b10001000, or lanes a0/a2/b0/b2 in little-endian order 926 | #define LO_IMM8 0x88 927 | 928 | INLINE __m512i unpack_lo_128(__m512i a, __m512i b) { 929 | return _mm512_shuffle_i32x4(a, b, LO_IMM8); 930 | } 931 | 932 | // 0b11011101, or lanes a1/a3/b1/b3 in little-endian order 933 | #define HI_IMM8 0xdd 934 | 935 | INLINE __m512i unpack_hi_128(__m512i a, __m512i b) { 936 | return _mm512_shuffle_i32x4(a, b, HI_IMM8); 937 | } 938 | 939 | INLINE void transpose_vecs_512(__m512i vecs[16]) { 940 | // Interleave 32-bit lanes. The _0 unpack is lanes 941 | // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes 942 | // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15. 943 | __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]); 944 | __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]); 945 | __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]); 946 | __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]); 947 | __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]); 948 | __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]); 949 | __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]); 950 | __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]); 951 | __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]); 952 | __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]); 953 | __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]); 954 | __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]); 955 | __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]); 956 | __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]); 957 | __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]); 958 | __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]); 959 | 960 | // Interleave 64-bit lates. The _0 unpack is lanes 961 | // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes 962 | // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes 963 | // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes 964 | // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15. 965 | __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0); 966 | __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0); 967 | __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2); 968 | __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2); 969 | __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0); 970 | __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0); 971 | __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2); 972 | __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2); 973 | __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0); 974 | __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0); 975 | __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2); 976 | __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2); 977 | __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0); 978 | __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0); 979 | __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2); 980 | __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2); 981 | 982 | // Interleave 128-bit lanes. The _0 unpack is 983 | // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is 984 | // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on. 
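// (Taken together, the three interleave stages transpose the 16x16 matrix of
// 32-bit words: in transpose_msg_vecs16 below, sixteen 64-byte blocks loaded
// row-wise become sixteen vectors that each hold one message word from every
// input, one input per 32-bit lane.)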
985 | __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0); 986 | __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1); 987 | __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2); 988 | __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3); 989 | __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0); 990 | __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1); 991 | __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2); 992 | __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3); 993 | __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0); 994 | __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1); 995 | __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2); 996 | __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3); 997 | __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0); 998 | __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1); 999 | __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2); 1000 | __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3); 1001 | 1002 | // Interleave 128-bit lanes again for the final outputs. 1003 | vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0); 1004 | vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1); 1005 | vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2); 1006 | vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3); 1007 | vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4); 1008 | vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5); 1009 | vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6); 1010 | vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7); 1011 | vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0); 1012 | vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1); 1013 | vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2); 1014 | vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3); 1015 | vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4); 1016 | vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5); 1017 | vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6); 1018 | vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7); 1019 | } 1020 | 1021 | INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, 1022 | size_t block_offset, __m512i out[16]) { 1023 | out[0] = loadu_512(&inputs[0][block_offset]); 1024 | out[1] = loadu_512(&inputs[1][block_offset]); 1025 | out[2] = loadu_512(&inputs[2][block_offset]); 1026 | out[3] = loadu_512(&inputs[3][block_offset]); 1027 | out[4] = loadu_512(&inputs[4][block_offset]); 1028 | out[5] = loadu_512(&inputs[5][block_offset]); 1029 | out[6] = loadu_512(&inputs[6][block_offset]); 1030 | out[7] = loadu_512(&inputs[7][block_offset]); 1031 | out[8] = loadu_512(&inputs[8][block_offset]); 1032 | out[9] = loadu_512(&inputs[9][block_offset]); 1033 | out[10] = loadu_512(&inputs[10][block_offset]); 1034 | out[11] = loadu_512(&inputs[11][block_offset]); 1035 | out[12] = loadu_512(&inputs[12][block_offset]); 1036 | out[13] = loadu_512(&inputs[13][block_offset]); 1037 | out[14] = loadu_512(&inputs[14][block_offset]); 1038 | out[15] = loadu_512(&inputs[15][block_offset]); 1039 | for (size_t i = 0; i < 16; ++i) { 1040 | _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); 1041 | } 1042 | transpose_vecs_512(out); 1043 | } 1044 | 1045 | INLINE void load_counters16(uint64_t counter, bool increment_counter, 1046 | __m512i *out_lo, __m512i *out_hi) { 1047 | const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter); 1048 | const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 1049 | const __m512i add1 = _mm512_and_si512(mask, add0); 1050 | __m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1); 1051 | __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, 
_MM_CMPINT_LT); 1052 | __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1)); 1053 | *out_lo = l; 1054 | *out_hi = h; 1055 | } 1056 | 1057 | void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, 1058 | const uint32_t key[8], uint64_t counter, 1059 | bool increment_counter, uint8_t flags, 1060 | uint8_t flags_start, uint8_t flags_end, 1061 | uint8_t *out) { 1062 | __m512i h_vecs[8] = { 1063 | set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), 1064 | set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]), 1065 | }; 1066 | __m512i counter_low_vec, counter_high_vec; 1067 | load_counters16(counter, increment_counter, &counter_low_vec, 1068 | &counter_high_vec); 1069 | uint8_t block_flags = flags | flags_start; 1070 | 1071 | for (size_t block = 0; block < blocks; block++) { 1072 | if (block + 1 == blocks) { 1073 | block_flags |= flags_end; 1074 | } 1075 | __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN); 1076 | __m512i block_flags_vec = set1_512(block_flags); 1077 | __m512i msg_vecs[16]; 1078 | transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); 1079 | 1080 | __m512i v[16] = { 1081 | h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], 1082 | h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], 1083 | set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), 1084 | counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, 1085 | }; 1086 | round_fn16(v, msg_vecs, 0); 1087 | round_fn16(v, msg_vecs, 1); 1088 | round_fn16(v, msg_vecs, 2); 1089 | round_fn16(v, msg_vecs, 3); 1090 | round_fn16(v, msg_vecs, 4); 1091 | round_fn16(v, msg_vecs, 5); 1092 | round_fn16(v, msg_vecs, 6); 1093 | h_vecs[0] = xor_512(v[0], v[8]); 1094 | h_vecs[1] = xor_512(v[1], v[9]); 1095 | h_vecs[2] = xor_512(v[2], v[10]); 1096 | h_vecs[3] = xor_512(v[3], v[11]); 1097 | h_vecs[4] = xor_512(v[4], v[12]); 1098 | h_vecs[5] = xor_512(v[5], v[13]); 1099 | h_vecs[6] = xor_512(v[6], v[14]); 1100 | h_vecs[7] = xor_512(v[7], v[15]); 1101 | 1102 | block_flags = flags; 1103 | } 1104 | 1105 | // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8 1106 | // state vectors. Pad the matrix with zeros. After transposition, store the 1107 | // lower half of each vector. 
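// (Each chaining value is 8 words, i.e. 32 bytes or half a __m512i, so the
// masked 256-bit stores below keep only the low half of each transposed
// vector; the zero padding lands in the upper halves, which are discarded.)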
1108 | __m512i padded[16] = { 1109 | h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], 1110 | h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], 1111 | set1_512(0), set1_512(0), set1_512(0), set1_512(0), 1112 | set1_512(0), set1_512(0), set1_512(0), set1_512(0), 1113 | }; 1114 | transpose_vecs_512(padded); 1115 | _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0])); 1116 | _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1])); 1117 | _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2])); 1118 | _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3])); 1119 | _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4])); 1120 | _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5])); 1121 | _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6])); 1122 | _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7])); 1123 | _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8])); 1124 | _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9])); 1125 | _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10])); 1126 | _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11])); 1127 | _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12])); 1128 | _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13])); 1129 | _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14])); 1130 | _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15])); 1131 | } 1132 | 1133 | /* 1134 | * ---------------------------------------------------------------------------- 1135 | * hash_many_avx512 1136 | * ---------------------------------------------------------------------------- 1137 | */ 1138 | 1139 | INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, 1140 | const uint32_t key[8], uint64_t counter, 1141 | uint8_t flags, uint8_t flags_start, 1142 | uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { 1143 | uint32_t cv[8]; 1144 | memcpy(cv, key, BLAKE3_KEY_LEN); 1145 | uint8_t block_flags = flags | flags_start; 1146 | while (blocks > 0) { 1147 | if (blocks == 1) { 1148 | block_flags |= flags_end; 1149 | } 1150 | blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, 1151 | block_flags); 1152 | input = &input[BLAKE3_BLOCK_LEN]; 1153 | blocks -= 1; 1154 | block_flags = flags; 1155 | } 1156 | memcpy(out, cv, BLAKE3_OUT_LEN); 1157 | } 1158 | 1159 | void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, 1160 | size_t blocks, const uint32_t key[8], 1161 | uint64_t counter, bool increment_counter, 1162 | uint8_t flags, uint8_t flags_start, 1163 | uint8_t flags_end, uint8_t *out) { 1164 | while (num_inputs >= 16) { 1165 | blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, 1166 | flags_start, flags_end, out); 1167 | if (increment_counter) { 1168 | counter += 16; 1169 | } 1170 | inputs += 16; 1171 | num_inputs -= 16; 1172 | out = &out[16 * BLAKE3_OUT_LEN]; 1173 | } 1174 
| while (num_inputs >= 8) { 1175 | blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, 1176 | flags_start, flags_end, out); 1177 | if (increment_counter) { 1178 | counter += 8; 1179 | } 1180 | inputs += 8; 1181 | num_inputs -= 8; 1182 | out = &out[8 * BLAKE3_OUT_LEN]; 1183 | } 1184 | while (num_inputs >= 4) { 1185 | blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, 1186 | flags_start, flags_end, out); 1187 | if (increment_counter) { 1188 | counter += 4; 1189 | } 1190 | inputs += 4; 1191 | num_inputs -= 4; 1192 | out = &out[4 * BLAKE3_OUT_LEN]; 1193 | } 1194 | while (num_inputs > 0) { 1195 | hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, 1196 | flags_end, out); 1197 | if (increment_counter) { 1198 | counter += 1; 1199 | } 1200 | inputs += 1; 1201 | num_inputs -= 1; 1202 | out = &out[BLAKE3_OUT_LEN]; 1203 | } 1204 | } 1205 | -------------------------------------------------------------------------------- /blake3_dispatch.c: -------------------------------------------------------------------------------- 1 | #include <stdbool.h> 2 | #include <stddef.h> 3 | #include <stdint.h> 4 | 5 | #include "blake3_impl.h" 6 | 7 | #if defined(IS_X86) 8 | #if defined(_MSC_VER) 9 | #include <intrin.h> 10 | #elif defined(__GNUC__) 11 | #include <immintrin.h> 12 | #else 13 | #error "Unimplemented!" 14 | #endif 15 | #endif 16 | 17 | #define MAYBE_UNUSED(x) (void)((x)) 18 | 19 | #if defined(IS_X86) 20 | static uint64_t xgetbv() { 21 | #if defined(_MSC_VER) 22 | return _xgetbv(0); 23 | #else 24 | uint32_t eax = 0, edx = 0; 25 | __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); 26 | return ((uint64_t)edx << 32) | eax; 27 | #endif 28 | } 29 | 30 | static void cpuid(uint32_t out[4], uint32_t id) { 31 | #if defined(_MSC_VER) 32 | __cpuid((int *)out, id); 33 | #elif defined(__i386__) || defined(_M_IX86) 34 | __asm__ __volatile__("movl %%ebx, %1\n" 35 | "cpuid\n" 36 | "xchgl %1, %%ebx\n" 37 | : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) 38 | : "a"(id)); 39 | #else 40 | __asm__ __volatile__("cpuid\n" 41 | : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) 42 | : "a"(id)); 43 | #endif 44 | } 45 | 46 | static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { 47 | #if defined(_MSC_VER) 48 | __cpuidex((int *)out, id, sid); 49 | #elif defined(__i386__) || defined(_M_IX86) 50 | __asm__ __volatile__("movl %%ebx, %1\n" 51 | "cpuid\n" 52 | "xchgl %1, %%ebx\n" 53 | : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) 54 | : "a"(id), "c"(sid)); 55 | #else 56 | __asm__ __volatile__("cpuid\n" 57 | : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) 58 | : "a"(id), "c"(sid)); 59 | #endif 60 | } 61 | 62 | #endif 63 | 64 | enum cpu_feature { 65 | SSE2 = 1 << 0, 66 | SSSE3 = 1 << 1, 67 | SSE41 = 1 << 2, 68 | AVX = 1 << 3, 69 | AVX2 = 1 << 4, 70 | AVX512F = 1 << 5, 71 | AVX512VL = 1 << 6, 72 | /* ...
*/ 73 | UNDEFINED = 1 << 30 74 | }; 75 | 76 | #if !defined(BLAKE3_TESTING) 77 | static /* Allow the variable to be controlled manually for testing */ 78 | #endif 79 | enum cpu_feature g_cpu_features = UNDEFINED; 80 | 81 | #if !defined(BLAKE3_TESTING) 82 | static 83 | #endif 84 | enum cpu_feature 85 | get_cpu_features() { 86 | 87 | if (g_cpu_features != UNDEFINED) { 88 | return g_cpu_features; 89 | } else { 90 | #if defined(IS_X86) 91 | uint32_t regs[4] = {0}; 92 | uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; 93 | (void)edx; 94 | enum cpu_feature features = 0; 95 | cpuid(regs, 0); 96 | const int max_id = *eax; 97 | cpuid(regs, 1); 98 | #if defined(__amd64__) || defined(_M_X64) 99 | features |= SSE2; 100 | #else 101 | if (*edx & (1UL << 26)) 102 | features |= SSE2; 103 | #endif 104 | if (*ecx & (1UL << 0)) 105 | features |= SSSE3; 106 | if (*ecx & (1UL << 19)) 107 | features |= SSE41; 108 | 109 | if (*ecx & (1UL << 27)) { // OSXSAVE 110 | const uint64_t mask = xgetbv(); 111 | if ((mask & 6) == 6) { // SSE and AVX states 112 | if (*ecx & (1UL << 28)) 113 | features |= AVX; 114 | if (max_id >= 7) { 115 | cpuidex(regs, 7, 0); 116 | if (*ebx & (1UL << 5)) 117 | features |= AVX2; 118 | if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm 119 | if (*ebx & (1UL << 31)) 120 | features |= AVX512VL; 121 | if (*ebx & (1UL << 16)) 122 | features |= AVX512F; 123 | } 124 | } 125 | } 126 | } 127 | g_cpu_features = features; 128 | return features; 129 | #else 130 | /* How to detect NEON? */ 131 | return 0; 132 | #endif 133 | } 134 | } 135 | 136 | void blake3_compress_in_place(uint32_t cv[8], 137 | const uint8_t block[BLAKE3_BLOCK_LEN], 138 | uint8_t block_len, uint64_t counter, 139 | uint8_t flags) { 140 | #if defined(IS_X86) 141 | const enum cpu_feature features = get_cpu_features(); 142 | MAYBE_UNUSED(features); 143 | #if !defined(BLAKE3_NO_AVX512) 144 | if (features & AVX512VL) { 145 | blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); 146 | return; 147 | } 148 | #endif 149 | #if !defined(BLAKE3_NO_SSE41) 150 | if (features & SSE41) { 151 | blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); 152 | return; 153 | } 154 | #endif 155 | #if !defined(BLAKE3_NO_SSE2) 156 | if (features & SSE2) { 157 | blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); 158 | return; 159 | } 160 | #endif 161 | #endif 162 | blake3_compress_in_place_portable(cv, block, block_len, counter, flags); 163 | } 164 | 165 | void blake3_compress_xof(const uint32_t cv[8], 166 | const uint8_t block[BLAKE3_BLOCK_LEN], 167 | uint8_t block_len, uint64_t counter, uint8_t flags, 168 | uint8_t out[64]) { 169 | #if defined(IS_X86) 170 | const enum cpu_feature features = get_cpu_features(); 171 | MAYBE_UNUSED(features); 172 | #if !defined(BLAKE3_NO_AVX512) 173 | if (features & AVX512VL) { 174 | blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); 175 | return; 176 | } 177 | #endif 178 | #if !defined(BLAKE3_NO_SSE41) 179 | if (features & SSE41) { 180 | blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); 181 | return; 182 | } 183 | #endif 184 | #if !defined(BLAKE3_NO_SSE2) 185 | if (features & SSE2) { 186 | blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); 187 | return; 188 | } 189 | #endif 190 | #endif 191 | blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); 192 | } 193 | 194 | void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, 195 | size_t blocks, const uint32_t key[8], 
uint64_t counter, 196 | bool increment_counter, uint8_t flags, 197 | uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 198 | #if defined(IS_X86) 199 | const enum cpu_feature features = get_cpu_features(); 200 | MAYBE_UNUSED(features); 201 | #if !defined(BLAKE3_NO_AVX512) 202 | if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { 203 | blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, 204 | increment_counter, flags, flags_start, flags_end, 205 | out); 206 | return; 207 | } 208 | #endif 209 | #if !defined(BLAKE3_NO_AVX2) 210 | if (features & AVX2) { 211 | blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, 212 | increment_counter, flags, flags_start, flags_end, 213 | out); 214 | return; 215 | } 216 | #endif 217 | #if !defined(BLAKE3_NO_SSE41) 218 | if (features & SSE41) { 219 | blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, 220 | increment_counter, flags, flags_start, flags_end, 221 | out); 222 | return; 223 | } 224 | #endif 225 | #if !defined(BLAKE3_NO_SSE2) 226 | if (features & SSE2) { 227 | blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, 228 | increment_counter, flags, flags_start, flags_end, 229 | out); 230 | return; 231 | } 232 | #endif 233 | #endif 234 | 235 | #if defined(BLAKE3_USE_NEON) 236 | blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, 237 | increment_counter, flags, flags_start, flags_end, out); 238 | return; 239 | #endif 240 | 241 | blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, 242 | increment_counter, flags, flags_start, flags_end, 243 | out); 244 | } 245 | 246 | // The dynamically detected SIMD degree of the current platform. 247 | size_t blake3_simd_degree(void) { 248 | #if defined(IS_X86) 249 | const enum cpu_feature features = get_cpu_features(); 250 | MAYBE_UNUSED(features); 251 | #if !defined(BLAKE3_NO_AVX512) 252 | if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { 253 | return 16; 254 | } 255 | #endif 256 | #if !defined(BLAKE3_NO_AVX2) 257 | if (features & AVX2) { 258 | return 8; 259 | } 260 | #endif 261 | #if !defined(BLAKE3_NO_SSE41) 262 | if (features & SSE41) { 263 | return 4; 264 | } 265 | #endif 266 | #if !defined(BLAKE3_NO_SSE2) 267 | if (features & SSE2) { 268 | return 4; 269 | } 270 | #endif 271 | #endif 272 | #if defined(BLAKE3_USE_NEON) 273 | return 4; 274 | #endif 275 | return 1; 276 | } 277 | -------------------------------------------------------------------------------- /blake3_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef BLAKE3_IMPL_H 2 | #define BLAKE3_IMPL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "blake3.h" 11 | 12 | // internal flags 13 | enum blake3_flags { 14 | CHUNK_START = 1 << 0, 15 | CHUNK_END = 1 << 1, 16 | PARENT = 1 << 2, 17 | ROOT = 1 << 3, 18 | KEYED_HASH = 1 << 4, 19 | DERIVE_KEY_CONTEXT = 1 << 5, 20 | DERIVE_KEY_MATERIAL = 1 << 6, 21 | }; 22 | 23 | // This C implementation tries to support recent versions of GCC, Clang, and 24 | // MSVC. 
25 | #if defined(_MSC_VER) 26 | #define INLINE static __forceinline 27 | #else 28 | #define INLINE static inline __attribute__((always_inline)) 29 | #endif 30 | 31 | #if defined(__x86_64__) || defined(_M_X64) 32 | #define IS_X86 33 | #define IS_X86_64 34 | #endif 35 | 36 | #if defined(__i386__) || defined(_M_IX86) 37 | #define IS_X86 38 | #define IS_X86_32 39 | #endif 40 | 41 | #if defined(IS_X86) 42 | #if defined(_MSC_VER) 43 | #include 44 | #endif 45 | #include 46 | #endif 47 | 48 | #if defined(IS_X86) 49 | #define MAX_SIMD_DEGREE 16 50 | #elif defined(BLAKE3_USE_NEON) 51 | #define MAX_SIMD_DEGREE 4 52 | #else 53 | #define MAX_SIMD_DEGREE 1 54 | #endif 55 | 56 | // There are some places where we want a static size that's equal to the 57 | // MAX_SIMD_DEGREE, but also at least 2. 58 | #define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2) 59 | 60 | static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 61 | 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, 62 | 0x1F83D9ABUL, 0x5BE0CD19UL}; 63 | 64 | static const uint8_t MSG_SCHEDULE[7][16] = { 65 | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, 66 | {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, 67 | {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, 68 | {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, 69 | {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, 70 | {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, 71 | {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, 72 | }; 73 | 74 | /* Find index of the highest set bit */ 75 | /* x is assumed to be nonzero. */ 76 | static unsigned int highest_one(uint64_t x) { 77 | #if defined(__GNUC__) || defined(__clang__) 78 | return 63 ^ __builtin_clzll(x); 79 | #elif defined(_MSC_VER) && defined(IS_X86_64) 80 | unsigned long index; 81 | _BitScanReverse64(&index, x); 82 | return index; 83 | #elif defined(_MSC_VER) && defined(IS_X86_32) 84 | if(x >> 32) { 85 | unsigned long index; 86 | _BitScanReverse(&index, x >> 32); 87 | return 32 + index; 88 | } else { 89 | unsigned long index; 90 | _BitScanReverse(&index, x); 91 | return index; 92 | } 93 | #else 94 | unsigned int c = 0; 95 | if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } 96 | if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } 97 | if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } 98 | if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } 99 | if(x & 0x000000000000000cULL) { x >>= 2; c += 2; } 100 | if(x & 0x0000000000000002ULL) { c += 1; } 101 | return c; 102 | #endif 103 | } 104 | 105 | // Count the number of 1 bits. 106 | INLINE unsigned int popcnt(uint64_t x) { 107 | #if defined(__GNUC__) || defined(__clang__) 108 | return __builtin_popcountll(x); 109 | #else 110 | unsigned int count = 0; 111 | while (x != 0) { 112 | count += 1; 113 | x &= x - 1; 114 | } 115 | return count; 116 | #endif 117 | } 118 | 119 | // Largest power of two less than or equal to x. As a special case, returns 1 120 | // when x is 0. 
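// For example, round_down_to_power_of_2(1) == 1, round_down_to_power_of_2(7) == 4,
// and round_down_to_power_of_2(64) == 64; the "x | 1" below is what makes the
// zero input return 1 instead of calling highest_one(0).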
121 | INLINE uint64_t round_down_to_power_of_2(uint64_t x) { 122 | return 1ULL << highest_one(x | 1); 123 | } 124 | 125 | INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } 126 | 127 | INLINE uint32_t counter_high(uint64_t counter) { 128 | return (uint32_t)(counter >> 32); 129 | } 130 | 131 | INLINE uint32_t load32(const void *src) { 132 | const uint8_t *p = (const uint8_t *)src; 133 | return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | 134 | ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); 135 | } 136 | 137 | INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], 138 | uint32_t key_words[8]) { 139 | key_words[0] = load32(&key[0 * 4]); 140 | key_words[1] = load32(&key[1 * 4]); 141 | key_words[2] = load32(&key[2 * 4]); 142 | key_words[3] = load32(&key[3 * 4]); 143 | key_words[4] = load32(&key[4 * 4]); 144 | key_words[5] = load32(&key[5 * 4]); 145 | key_words[6] = load32(&key[6 * 4]); 146 | key_words[7] = load32(&key[7 * 4]); 147 | } 148 | 149 | INLINE void store32(void *dst, uint32_t w) { 150 | uint8_t *p = (uint8_t *)dst; 151 | p[0] = (uint8_t)(w >> 0); 152 | p[1] = (uint8_t)(w >> 8); 153 | p[2] = (uint8_t)(w >> 16); 154 | p[3] = (uint8_t)(w >> 24); 155 | } 156 | 157 | INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { 158 | store32(&bytes_out[0 * 4], cv_words[0]); 159 | store32(&bytes_out[1 * 4], cv_words[1]); 160 | store32(&bytes_out[2 * 4], cv_words[2]); 161 | store32(&bytes_out[3 * 4], cv_words[3]); 162 | store32(&bytes_out[4 * 4], cv_words[4]); 163 | store32(&bytes_out[5 * 4], cv_words[5]); 164 | store32(&bytes_out[6 * 4], cv_words[6]); 165 | store32(&bytes_out[7 * 4], cv_words[7]); 166 | } 167 | 168 | void blake3_compress_in_place(uint32_t cv[8], 169 | const uint8_t block[BLAKE3_BLOCK_LEN], 170 | uint8_t block_len, uint64_t counter, 171 | uint8_t flags); 172 | 173 | void blake3_compress_xof(const uint32_t cv[8], 174 | const uint8_t block[BLAKE3_BLOCK_LEN], 175 | uint8_t block_len, uint64_t counter, uint8_t flags, 176 | uint8_t out[64]); 177 | 178 | void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, 179 | size_t blocks, const uint32_t key[8], uint64_t counter, 180 | bool increment_counter, uint8_t flags, 181 | uint8_t flags_start, uint8_t flags_end, uint8_t *out); 182 | 183 | size_t blake3_simd_degree(void); 184 | 185 | 186 | // Declarations for implementation-specific functions. 
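// Every backend below provides a blake3_hash_many_* function; the portable,
// SSE2, SSE4.1, and AVX-512 backends also provide compress_in_place /
// compress_xof variants. blake3_dispatch.c picks among them at runtime based
// on get_cpu_features().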
187 | void blake3_compress_in_place_portable(uint32_t cv[8], 188 | const uint8_t block[BLAKE3_BLOCK_LEN], 189 | uint8_t block_len, uint64_t counter, 190 | uint8_t flags); 191 | 192 | void blake3_compress_xof_portable(const uint32_t cv[8], 193 | const uint8_t block[BLAKE3_BLOCK_LEN], 194 | uint8_t block_len, uint64_t counter, 195 | uint8_t flags, uint8_t out[64]); 196 | 197 | void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, 198 | size_t blocks, const uint32_t key[8], 199 | uint64_t counter, bool increment_counter, 200 | uint8_t flags, uint8_t flags_start, 201 | uint8_t flags_end, uint8_t *out); 202 | 203 | #if defined(IS_X86) 204 | #if !defined(BLAKE3_NO_SSE2) 205 | void blake3_compress_in_place_sse2(uint32_t cv[8], 206 | const uint8_t block[BLAKE3_BLOCK_LEN], 207 | uint8_t block_len, uint64_t counter, 208 | uint8_t flags); 209 | void blake3_compress_xof_sse2(const uint32_t cv[8], 210 | const uint8_t block[BLAKE3_BLOCK_LEN], 211 | uint8_t block_len, uint64_t counter, 212 | uint8_t flags, uint8_t out[64]); 213 | void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, 214 | size_t blocks, const uint32_t key[8], 215 | uint64_t counter, bool increment_counter, 216 | uint8_t flags, uint8_t flags_start, 217 | uint8_t flags_end, uint8_t *out); 218 | #endif 219 | #if !defined(BLAKE3_NO_SSE41) 220 | void blake3_compress_in_place_sse41(uint32_t cv[8], 221 | const uint8_t block[BLAKE3_BLOCK_LEN], 222 | uint8_t block_len, uint64_t counter, 223 | uint8_t flags); 224 | void blake3_compress_xof_sse41(const uint32_t cv[8], 225 | const uint8_t block[BLAKE3_BLOCK_LEN], 226 | uint8_t block_len, uint64_t counter, 227 | uint8_t flags, uint8_t out[64]); 228 | void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, 229 | size_t blocks, const uint32_t key[8], 230 | uint64_t counter, bool increment_counter, 231 | uint8_t flags, uint8_t flags_start, 232 | uint8_t flags_end, uint8_t *out); 233 | #endif 234 | #if !defined(BLAKE3_NO_AVX2) 235 | void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, 236 | size_t blocks, const uint32_t key[8], 237 | uint64_t counter, bool increment_counter, 238 | uint8_t flags, uint8_t flags_start, 239 | uint8_t flags_end, uint8_t *out); 240 | #endif 241 | #if !defined(BLAKE3_NO_AVX512) 242 | void blake3_compress_in_place_avx512(uint32_t cv[8], 243 | const uint8_t block[BLAKE3_BLOCK_LEN], 244 | uint8_t block_len, uint64_t counter, 245 | uint8_t flags); 246 | 247 | void blake3_compress_xof_avx512(const uint32_t cv[8], 248 | const uint8_t block[BLAKE3_BLOCK_LEN], 249 | uint8_t block_len, uint64_t counter, 250 | uint8_t flags, uint8_t out[64]); 251 | 252 | void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, 253 | size_t blocks, const uint32_t key[8], 254 | uint64_t counter, bool increment_counter, 255 | uint8_t flags, uint8_t flags_start, 256 | uint8_t flags_end, uint8_t *out); 257 | #endif 258 | #endif 259 | 260 | #if defined(BLAKE3_USE_NEON) 261 | void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, 262 | size_t blocks, const uint32_t key[8], 263 | uint64_t counter, bool increment_counter, 264 | uint8_t flags, uint8_t flags_start, 265 | uint8_t flags_end, uint8_t *out); 266 | #endif 267 | 268 | 269 | #endif /* BLAKE3_IMPL_H */ 270 | -------------------------------------------------------------------------------- /blake3_portable.c: -------------------------------------------------------------------------------- 1 | #include "blake3_impl.h" 2 | 
#include 3 | 4 | INLINE uint32_t rotr32(uint32_t w, uint32_t c) { 5 | return (w >> c) | (w << (32 - c)); 6 | } 7 | 8 | INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, 9 | uint32_t x, uint32_t y) { 10 | state[a] = state[a] + state[b] + x; 11 | state[d] = rotr32(state[d] ^ state[a], 16); 12 | state[c] = state[c] + state[d]; 13 | state[b] = rotr32(state[b] ^ state[c], 12); 14 | state[a] = state[a] + state[b] + y; 15 | state[d] = rotr32(state[d] ^ state[a], 8); 16 | state[c] = state[c] + state[d]; 17 | state[b] = rotr32(state[b] ^ state[c], 7); 18 | } 19 | 20 | INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) { 21 | // Select the message schedule based on the round. 22 | const uint8_t *schedule = MSG_SCHEDULE[round]; 23 | 24 | // Mix the columns. 25 | g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); 26 | g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); 27 | g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); 28 | g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); 29 | 30 | // Mix the rows. 31 | g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); 32 | g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); 33 | g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); 34 | g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); 35 | } 36 | 37 | INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], 38 | const uint8_t block[BLAKE3_BLOCK_LEN], 39 | uint8_t block_len, uint64_t counter, uint8_t flags) { 40 | uint32_t block_words[16]; 41 | block_words[0] = load32(block + 4 * 0); 42 | block_words[1] = load32(block + 4 * 1); 43 | block_words[2] = load32(block + 4 * 2); 44 | block_words[3] = load32(block + 4 * 3); 45 | block_words[4] = load32(block + 4 * 4); 46 | block_words[5] = load32(block + 4 * 5); 47 | block_words[6] = load32(block + 4 * 6); 48 | block_words[7] = load32(block + 4 * 7); 49 | block_words[8] = load32(block + 4 * 8); 50 | block_words[9] = load32(block + 4 * 9); 51 | block_words[10] = load32(block + 4 * 10); 52 | block_words[11] = load32(block + 4 * 11); 53 | block_words[12] = load32(block + 4 * 12); 54 | block_words[13] = load32(block + 4 * 13); 55 | block_words[14] = load32(block + 4 * 14); 56 | block_words[15] = load32(block + 4 * 15); 57 | 58 | state[0] = cv[0]; 59 | state[1] = cv[1]; 60 | state[2] = cv[2]; 61 | state[3] = cv[3]; 62 | state[4] = cv[4]; 63 | state[5] = cv[5]; 64 | state[6] = cv[6]; 65 | state[7] = cv[7]; 66 | state[8] = IV[0]; 67 | state[9] = IV[1]; 68 | state[10] = IV[2]; 69 | state[11] = IV[3]; 70 | state[12] = counter_low(counter); 71 | state[13] = counter_high(counter); 72 | state[14] = (uint32_t)block_len; 73 | state[15] = (uint32_t)flags; 74 | 75 | round_fn(state, &block_words[0], 0); 76 | round_fn(state, &block_words[0], 1); 77 | round_fn(state, &block_words[0], 2); 78 | round_fn(state, &block_words[0], 3); 79 | round_fn(state, &block_words[0], 4); 80 | round_fn(state, &block_words[0], 5); 81 | round_fn(state, &block_words[0], 6); 82 | } 83 | 84 | void blake3_compress_in_place_portable(uint32_t cv[8], 85 | const uint8_t block[BLAKE3_BLOCK_LEN], 86 | uint8_t block_len, uint64_t counter, 87 | uint8_t flags) { 88 | uint32_t state[16]; 89 | compress_pre(state, cv, block, block_len, counter, flags); 90 | cv[0] = state[0] ^ state[8]; 91 | cv[1] = state[1] ^ state[9]; 92 | cv[2] = state[2] ^ state[10]; 93 | cv[3] = state[3] ^ state[11]; 94 | cv[4] = state[4] ^ state[12]; 95 | cv[5] = state[5] ^ state[13]; 96 | cv[6] = state[6] ^ state[14]; 97 | 
cv[7] = state[7] ^ state[15]; 98 | } 99 | 100 | void blake3_compress_xof_portable(const uint32_t cv[8], 101 | const uint8_t block[BLAKE3_BLOCK_LEN], 102 | uint8_t block_len, uint64_t counter, 103 | uint8_t flags, uint8_t out[64]) { 104 | uint32_t state[16]; 105 | compress_pre(state, cv, block, block_len, counter, flags); 106 | 107 | store32(&out[0 * 4], state[0] ^ state[8]); 108 | store32(&out[1 * 4], state[1] ^ state[9]); 109 | store32(&out[2 * 4], state[2] ^ state[10]); 110 | store32(&out[3 * 4], state[3] ^ state[11]); 111 | store32(&out[4 * 4], state[4] ^ state[12]); 112 | store32(&out[5 * 4], state[5] ^ state[13]); 113 | store32(&out[6 * 4], state[6] ^ state[14]); 114 | store32(&out[7 * 4], state[7] ^ state[15]); 115 | store32(&out[8 * 4], state[8] ^ cv[0]); 116 | store32(&out[9 * 4], state[9] ^ cv[1]); 117 | store32(&out[10 * 4], state[10] ^ cv[2]); 118 | store32(&out[11 * 4], state[11] ^ cv[3]); 119 | store32(&out[12 * 4], state[12] ^ cv[4]); 120 | store32(&out[13 * 4], state[13] ^ cv[5]); 121 | store32(&out[14 * 4], state[14] ^ cv[6]); 122 | store32(&out[15 * 4], state[15] ^ cv[7]); 123 | } 124 | 125 | INLINE void hash_one_portable(const uint8_t *input, size_t blocks, 126 | const uint32_t key[8], uint64_t counter, 127 | uint8_t flags, uint8_t flags_start, 128 | uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { 129 | uint32_t cv[8]; 130 | memcpy(cv, key, BLAKE3_KEY_LEN); 131 | uint8_t block_flags = flags | flags_start; 132 | while (blocks > 0) { 133 | if (blocks == 1) { 134 | block_flags |= flags_end; 135 | } 136 | blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, 137 | block_flags); 138 | input = &input[BLAKE3_BLOCK_LEN]; 139 | blocks -= 1; 140 | block_flags = flags; 141 | } 142 | store_cv_words(out, cv); 143 | } 144 | 145 | void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, 146 | size_t blocks, const uint32_t key[8], 147 | uint64_t counter, bool increment_counter, 148 | uint8_t flags, uint8_t flags_start, 149 | uint8_t flags_end, uint8_t *out) { 150 | while (num_inputs > 0) { 151 | hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, 152 | flags_end, out); 153 | if (increment_counter) { 154 | counter += 1; 155 | } 156 | inputs += 1; 157 | num_inputs -= 1; 158 | out = &out[BLAKE3_OUT_LEN]; 159 | } 160 | } 161 | -------------------------------------------------------------------------------- /blake3_sse2.c: -------------------------------------------------------------------------------- 1 | #include "blake3_impl.h" 2 | 3 | #include 4 | 5 | #define DEGREE 4 6 | 7 | #define _mm_shuffle_ps2(a, b, c) \ 8 | (_mm_castps_si128( \ 9 | _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) 10 | 11 | INLINE __m128i loadu(const uint8_t src[16]) { 12 | return _mm_loadu_si128((const __m128i *)src); 13 | } 14 | 15 | INLINE void storeu(__m128i src, uint8_t dest[16]) { 16 | _mm_storeu_si128((__m128i *)dest, src); 17 | } 18 | 19 | INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } 20 | 21 | // Note that clang-format doesn't like the name "xor" for some reason. 
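// Plain SSE2 has no byte-shuffle instruction (_mm_shuffle_epi8 is SSSE3),
// so rot16 below is built from two 16-bit lane shuffles and rot12/rot8/rot7
// from paired shifts combined with XOR (the shifted halves never overlap,
// so XOR acts as OR). The mask-based blend_epi16 helper further down
// likewise stands in for the SSE4.1 _mm_blend_epi16 intrinsic.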
22 | INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } 23 | 24 | INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } 25 | 26 | INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { 27 | return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); 28 | } 29 | 30 | INLINE __m128i rot16(__m128i x) { 31 | return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1); 32 | } 33 | 34 | INLINE __m128i rot12(__m128i x) { 35 | return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); 36 | } 37 | 38 | INLINE __m128i rot8(__m128i x) { 39 | return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8)); 40 | } 41 | 42 | INLINE __m128i rot7(__m128i x) { 43 | return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); 44 | } 45 | 46 | INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, 47 | __m128i m) { 48 | *row0 = addv(addv(*row0, m), *row1); 49 | *row3 = xorv(*row3, *row0); 50 | *row3 = rot16(*row3); 51 | *row2 = addv(*row2, *row3); 52 | *row1 = xorv(*row1, *row2); 53 | *row1 = rot12(*row1); 54 | } 55 | 56 | INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, 57 | __m128i m) { 58 | *row0 = addv(addv(*row0, m), *row1); 59 | *row3 = xorv(*row3, *row0); 60 | *row3 = rot8(*row3); 61 | *row2 = addv(*row2, *row3); 62 | *row1 = xorv(*row1, *row2); 63 | *row1 = rot7(*row1); 64 | } 65 | 66 | // Note the optimization here of leaving row1 as the unrotated row, rather than 67 | // row0. All the message loads below are adjusted to compensate for this. See 68 | // discussion at https://github.com/sneves/blake2-avx2/pull/4 69 | INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { 70 | *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); 71 | *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); 72 | *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); 73 | } 74 | 75 | INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { 76 | *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); 77 | *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); 78 | *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); 79 | } 80 | 81 | INLINE __m128i blend_epi16(__m128i a, __m128i b, const int imm8) { 82 | const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); 83 | __m128i mask = _mm_set1_epi16(imm8); 84 | mask = _mm_and_si128(mask, bits); 85 | mask = _mm_cmpeq_epi16(mask, bits); 86 | return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); 87 | } 88 | 89 | INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], 90 | const uint8_t block[BLAKE3_BLOCK_LEN], 91 | uint8_t block_len, uint64_t counter, uint8_t flags) { 92 | rows[0] = loadu((uint8_t *)&cv[0]); 93 | rows[1] = loadu((uint8_t *)&cv[4]); 94 | rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); 95 | rows[3] = set4(counter_low(counter), counter_high(counter), 96 | (uint32_t)block_len, (uint32_t)flags); 97 | 98 | __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); 99 | __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); 100 | __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); 101 | __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); 102 | 103 | __m128i t0, t1, t2, t3, tt; 104 | 105 | // Round 1. The first round permutes the message words from the original 106 | // input order, into the groups that get mixed in parallel. 
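// m0..m3 hold message words 0-3, 4-7, 8-11 and 12-15. The _mm_shuffle_ps2
// calls below gather the even-numbered words for the g1 half of each step
// and the odd-numbered words for the g2 half; trailing comments such as
// "6 4 2 0" list the gathered word indices from the highest lane down to
// lane 0.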
107 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 108 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 109 | t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 110 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 111 | diagonalize(&rows[0], &rows[2], &rows[3]); 112 | t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 113 | t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 114 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 115 | t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 116 | t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 117 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 118 | undiagonalize(&rows[0], &rows[2], &rows[3]); 119 | m0 = t0; 120 | m1 = t1; 121 | m2 = t2; 122 | m3 = t3; 123 | 124 | // Round 2. This round and all following rounds apply a fixed permutation 125 | // to the message words from the round before. 126 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 127 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 128 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 129 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 130 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 131 | t1 = blend_epi16(tt, t1, 0xCC); 132 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 133 | diagonalize(&rows[0], &rows[2], &rows[3]); 134 | t2 = _mm_unpacklo_epi64(m3, m1); 135 | tt = blend_epi16(t2, m2, 0xC0); 136 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 137 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 138 | t3 = _mm_unpackhi_epi32(m1, m3); 139 | tt = _mm_unpacklo_epi32(m2, t3); 140 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 141 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 142 | undiagonalize(&rows[0], &rows[2], &rows[3]); 143 | m0 = t0; 144 | m1 = t1; 145 | m2 = t2; 146 | m3 = t3; 147 | 148 | // Round 3 149 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 150 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 151 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 152 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 153 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 154 | t1 = blend_epi16(tt, t1, 0xCC); 155 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 156 | diagonalize(&rows[0], &rows[2], &rows[3]); 157 | t2 = _mm_unpacklo_epi64(m3, m1); 158 | tt = blend_epi16(t2, m2, 0xC0); 159 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 160 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 161 | t3 = _mm_unpackhi_epi32(m1, m3); 162 | tt = _mm_unpacklo_epi32(m2, t3); 163 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 164 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 165 | undiagonalize(&rows[0], &rows[2], &rows[3]); 166 | m0 = t0; 167 | m1 = t1; 168 | m2 = t2; 169 | m3 = t3; 170 | 171 | // Round 4 172 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 173 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 174 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 175 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 176 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 177 | t1 = blend_epi16(tt, t1, 0xCC); 178 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 179 | diagonalize(&rows[0], &rows[2], &rows[3]); 180 | t2 = _mm_unpacklo_epi64(m3, m1); 181 | tt = blend_epi16(t2, m2, 0xC0); 182 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 183 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 184 | t3 = _mm_unpackhi_epi32(m1, m3); 185 | tt = _mm_unpacklo_epi32(m2, 
t3); 186 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 187 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 188 | undiagonalize(&rows[0], &rows[2], &rows[3]); 189 | m0 = t0; 190 | m1 = t1; 191 | m2 = t2; 192 | m3 = t3; 193 | 194 | // Round 5 195 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 196 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 197 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 198 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 199 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 200 | t1 = blend_epi16(tt, t1, 0xCC); 201 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 202 | diagonalize(&rows[0], &rows[2], &rows[3]); 203 | t2 = _mm_unpacklo_epi64(m3, m1); 204 | tt = blend_epi16(t2, m2, 0xC0); 205 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 206 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 207 | t3 = _mm_unpackhi_epi32(m1, m3); 208 | tt = _mm_unpacklo_epi32(m2, t3); 209 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 210 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 211 | undiagonalize(&rows[0], &rows[2], &rows[3]); 212 | m0 = t0; 213 | m1 = t1; 214 | m2 = t2; 215 | m3 = t3; 216 | 217 | // Round 6 218 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 219 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 220 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 221 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 222 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 223 | t1 = blend_epi16(tt, t1, 0xCC); 224 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 225 | diagonalize(&rows[0], &rows[2], &rows[3]); 226 | t2 = _mm_unpacklo_epi64(m3, m1); 227 | tt = blend_epi16(t2, m2, 0xC0); 228 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 229 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 230 | t3 = _mm_unpackhi_epi32(m1, m3); 231 | tt = _mm_unpacklo_epi32(m2, t3); 232 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 233 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 234 | undiagonalize(&rows[0], &rows[2], &rows[3]); 235 | m0 = t0; 236 | m1 = t1; 237 | m2 = t2; 238 | m3 = t3; 239 | 240 | // Round 7 241 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 242 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 243 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 244 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 245 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 246 | t1 = blend_epi16(tt, t1, 0xCC); 247 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 248 | diagonalize(&rows[0], &rows[2], &rows[3]); 249 | t2 = _mm_unpacklo_epi64(m3, m1); 250 | tt = blend_epi16(t2, m2, 0xC0); 251 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 252 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 253 | t3 = _mm_unpackhi_epi32(m1, m3); 254 | tt = _mm_unpacklo_epi32(m2, t3); 255 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 256 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 257 | undiagonalize(&rows[0], &rows[2], &rows[3]); 258 | } 259 | 260 | void blake3_compress_in_place_sse2(uint32_t cv[8], 261 | const uint8_t block[BLAKE3_BLOCK_LEN], 262 | uint8_t block_len, uint64_t counter, 263 | uint8_t flags) { 264 | __m128i rows[4]; 265 | compress_pre(rows, cv, block, block_len, counter, flags); 266 | storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); 267 | storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); 268 | } 269 | 270 | void blake3_compress_xof_sse2(const uint32_t cv[8], 271 | const uint8_t block[BLAKE3_BLOCK_LEN], 272 | uint8_t block_len, uint64_t 
counter, 273 | uint8_t flags, uint8_t out[64]) { 274 | __m128i rows[4]; 275 | compress_pre(rows, cv, block, block_len, counter, flags); 276 | storeu(xorv(rows[0], rows[2]), &out[0]); 277 | storeu(xorv(rows[1], rows[3]), &out[16]); 278 | storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); 279 | storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); 280 | } 281 | 282 | INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { 283 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); 284 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); 285 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); 286 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); 287 | v[0] = addv(v[0], v[4]); 288 | v[1] = addv(v[1], v[5]); 289 | v[2] = addv(v[2], v[6]); 290 | v[3] = addv(v[3], v[7]); 291 | v[12] = xorv(v[12], v[0]); 292 | v[13] = xorv(v[13], v[1]); 293 | v[14] = xorv(v[14], v[2]); 294 | v[15] = xorv(v[15], v[3]); 295 | v[12] = rot16(v[12]); 296 | v[13] = rot16(v[13]); 297 | v[14] = rot16(v[14]); 298 | v[15] = rot16(v[15]); 299 | v[8] = addv(v[8], v[12]); 300 | v[9] = addv(v[9], v[13]); 301 | v[10] = addv(v[10], v[14]); 302 | v[11] = addv(v[11], v[15]); 303 | v[4] = xorv(v[4], v[8]); 304 | v[5] = xorv(v[5], v[9]); 305 | v[6] = xorv(v[6], v[10]); 306 | v[7] = xorv(v[7], v[11]); 307 | v[4] = rot12(v[4]); 308 | v[5] = rot12(v[5]); 309 | v[6] = rot12(v[6]); 310 | v[7] = rot12(v[7]); 311 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); 312 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); 313 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); 314 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); 315 | v[0] = addv(v[0], v[4]); 316 | v[1] = addv(v[1], v[5]); 317 | v[2] = addv(v[2], v[6]); 318 | v[3] = addv(v[3], v[7]); 319 | v[12] = xorv(v[12], v[0]); 320 | v[13] = xorv(v[13], v[1]); 321 | v[14] = xorv(v[14], v[2]); 322 | v[15] = xorv(v[15], v[3]); 323 | v[12] = rot8(v[12]); 324 | v[13] = rot8(v[13]); 325 | v[14] = rot8(v[14]); 326 | v[15] = rot8(v[15]); 327 | v[8] = addv(v[8], v[12]); 328 | v[9] = addv(v[9], v[13]); 329 | v[10] = addv(v[10], v[14]); 330 | v[11] = addv(v[11], v[15]); 331 | v[4] = xorv(v[4], v[8]); 332 | v[5] = xorv(v[5], v[9]); 333 | v[6] = xorv(v[6], v[10]); 334 | v[7] = xorv(v[7], v[11]); 335 | v[4] = rot7(v[4]); 336 | v[5] = rot7(v[5]); 337 | v[6] = rot7(v[6]); 338 | v[7] = rot7(v[7]); 339 | 340 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); 341 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); 342 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); 343 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); 344 | v[0] = addv(v[0], v[5]); 345 | v[1] = addv(v[1], v[6]); 346 | v[2] = addv(v[2], v[7]); 347 | v[3] = addv(v[3], v[4]); 348 | v[15] = xorv(v[15], v[0]); 349 | v[12] = xorv(v[12], v[1]); 350 | v[13] = xorv(v[13], v[2]); 351 | v[14] = xorv(v[14], v[3]); 352 | v[15] = rot16(v[15]); 353 | v[12] = rot16(v[12]); 354 | v[13] = rot16(v[13]); 355 | v[14] = rot16(v[14]); 356 | v[10] = addv(v[10], v[15]); 357 | v[11] = addv(v[11], v[12]); 358 | v[8] = addv(v[8], v[13]); 359 | v[9] = addv(v[9], v[14]); 360 | v[5] = xorv(v[5], v[10]); 361 | v[6] = xorv(v[6], v[11]); 362 | v[7] = xorv(v[7], v[8]); 363 | v[4] = xorv(v[4], v[9]); 364 | v[5] = rot12(v[5]); 365 | v[6] = rot12(v[6]); 366 | v[7] = rot12(v[7]); 367 | v[4] = rot12(v[4]); 368 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); 369 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); 370 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); 371 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); 372 | v[0] = 
addv(v[0], v[5]); 373 | v[1] = addv(v[1], v[6]); 374 | v[2] = addv(v[2], v[7]); 375 | v[3] = addv(v[3], v[4]); 376 | v[15] = xorv(v[15], v[0]); 377 | v[12] = xorv(v[12], v[1]); 378 | v[13] = xorv(v[13], v[2]); 379 | v[14] = xorv(v[14], v[3]); 380 | v[15] = rot8(v[15]); 381 | v[12] = rot8(v[12]); 382 | v[13] = rot8(v[13]); 383 | v[14] = rot8(v[14]); 384 | v[10] = addv(v[10], v[15]); 385 | v[11] = addv(v[11], v[12]); 386 | v[8] = addv(v[8], v[13]); 387 | v[9] = addv(v[9], v[14]); 388 | v[5] = xorv(v[5], v[10]); 389 | v[6] = xorv(v[6], v[11]); 390 | v[7] = xorv(v[7], v[8]); 391 | v[4] = xorv(v[4], v[9]); 392 | v[5] = rot7(v[5]); 393 | v[6] = rot7(v[6]); 394 | v[7] = rot7(v[7]); 395 | v[4] = rot7(v[4]); 396 | } 397 | 398 | INLINE void transpose_vecs(__m128i vecs[DEGREE]) { 399 | // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is 400 | // 22/33. Note that this doesn't split the vector into two lanes, as the 401 | // AVX2 counterparts do. 402 | __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); 403 | __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); 404 | __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); 405 | __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); 406 | 407 | // Interleave 64-bit lanes. 408 | __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); 409 | __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); 410 | __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); 411 | __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); 412 | 413 | vecs[0] = abcd_0; 414 | vecs[1] = abcd_1; 415 | vecs[2] = abcd_2; 416 | vecs[3] = abcd_3; 417 | } 418 | 419 | INLINE void transpose_msg_vecs(const uint8_t *const *inputs, 420 | size_t block_offset, __m128i out[16]) { 421 | out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); 422 | out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); 423 | out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); 424 | out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); 425 | out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); 426 | out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); 427 | out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); 428 | out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); 429 | out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); 430 | out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); 431 | out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); 432 | out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); 433 | out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); 434 | out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); 435 | out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); 436 | out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); 437 | for (size_t i = 0; i < 4; ++i) { 438 | _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); 439 | } 440 | transpose_vecs(&out[0]); 441 | transpose_vecs(&out[4]); 442 | transpose_vecs(&out[8]); 443 | transpose_vecs(&out[12]); 444 | } 445 | 446 | INLINE void load_counters(uint64_t counter, bool increment_counter, 447 | __m128i *out_lo, __m128i *out_hi) { 448 | const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); 449 | const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); 450 | const __m128i add1 = _mm_and_si128(mask, add0); 451 | __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1); 452 | __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), 453 | _mm_xor_si128( l, 
_mm_set1_epi32(0x80000000))); 454 | __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry); 455 | *out_lo = l; 456 | *out_hi = h; 457 | } 458 | 459 | void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, 460 | const uint32_t key[8], uint64_t counter, 461 | bool increment_counter, uint8_t flags, 462 | uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 463 | __m128i h_vecs[8] = { 464 | set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), 465 | set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), 466 | }; 467 | __m128i counter_low_vec, counter_high_vec; 468 | load_counters(counter, increment_counter, &counter_low_vec, 469 | &counter_high_vec); 470 | uint8_t block_flags = flags | flags_start; 471 | 472 | for (size_t block = 0; block < blocks; block++) { 473 | if (block + 1 == blocks) { 474 | block_flags |= flags_end; 475 | } 476 | __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); 477 | __m128i block_flags_vec = set1(block_flags); 478 | __m128i msg_vecs[16]; 479 | transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); 480 | 481 | __m128i v[16] = { 482 | h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], 483 | h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], 484 | set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), 485 | counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, 486 | }; 487 | round_fn(v, msg_vecs, 0); 488 | round_fn(v, msg_vecs, 1); 489 | round_fn(v, msg_vecs, 2); 490 | round_fn(v, msg_vecs, 3); 491 | round_fn(v, msg_vecs, 4); 492 | round_fn(v, msg_vecs, 5); 493 | round_fn(v, msg_vecs, 6); 494 | h_vecs[0] = xorv(v[0], v[8]); 495 | h_vecs[1] = xorv(v[1], v[9]); 496 | h_vecs[2] = xorv(v[2], v[10]); 497 | h_vecs[3] = xorv(v[3], v[11]); 498 | h_vecs[4] = xorv(v[4], v[12]); 499 | h_vecs[5] = xorv(v[5], v[13]); 500 | h_vecs[6] = xorv(v[6], v[14]); 501 | h_vecs[7] = xorv(v[7], v[15]); 502 | 503 | block_flags = flags; 504 | } 505 | 506 | transpose_vecs(&h_vecs[0]); 507 | transpose_vecs(&h_vecs[4]); 508 | // The first four vecs now contain the first half of each output, and the 509 | // second four vecs contain the second half of each output. 
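// After the two 4x4 transposes, h_vecs[i] holds words 0-3 of output i and
// h_vecs[i+4] holds words 4-7 of output i, so the interleaved stores below
// write four contiguous 32-byte chaining values into out.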
510 | storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); 511 | storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); 512 | storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); 513 | storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); 514 | storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); 515 | storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); 516 | storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); 517 | storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); 518 | } 519 | 520 | INLINE void hash_one_sse2(const uint8_t *input, size_t blocks, 521 | const uint32_t key[8], uint64_t counter, 522 | uint8_t flags, uint8_t flags_start, 523 | uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { 524 | uint32_t cv[8]; 525 | memcpy(cv, key, BLAKE3_KEY_LEN); 526 | uint8_t block_flags = flags | flags_start; 527 | while (blocks > 0) { 528 | if (blocks == 1) { 529 | block_flags |= flags_end; 530 | } 531 | blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter, 532 | block_flags); 533 | input = &input[BLAKE3_BLOCK_LEN]; 534 | blocks -= 1; 535 | block_flags = flags; 536 | } 537 | memcpy(out, cv, BLAKE3_OUT_LEN); 538 | } 539 | 540 | void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, 541 | size_t blocks, const uint32_t key[8], 542 | uint64_t counter, bool increment_counter, 543 | uint8_t flags, uint8_t flags_start, 544 | uint8_t flags_end, uint8_t *out) { 545 | while (num_inputs >= DEGREE) { 546 | blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags, 547 | flags_start, flags_end, out); 548 | if (increment_counter) { 549 | counter += DEGREE; 550 | } 551 | inputs += DEGREE; 552 | num_inputs -= DEGREE; 553 | out = &out[DEGREE * BLAKE3_OUT_LEN]; 554 | } 555 | while (num_inputs > 0) { 556 | hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start, 557 | flags_end, out); 558 | if (increment_counter) { 559 | counter += 1; 560 | } 561 | inputs += 1; 562 | num_inputs -= 1; 563 | out = &out[BLAKE3_OUT_LEN]; 564 | } 565 | } 566 | -------------------------------------------------------------------------------- /blake3_sse41.c: -------------------------------------------------------------------------------- 1 | #include "blake3_impl.h" 2 | 3 | #include 4 | 5 | #define DEGREE 4 6 | 7 | #define _mm_shuffle_ps2(a, b, c) \ 8 | (_mm_castps_si128( \ 9 | _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) 10 | 11 | INLINE __m128i loadu(const uint8_t src[16]) { 12 | return _mm_loadu_si128((const __m128i *)src); 13 | } 14 | 15 | INLINE void storeu(__m128i src, uint8_t dest[16]) { 16 | _mm_storeu_si128((__m128i *)dest, src); 17 | } 18 | 19 | INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } 20 | 21 | // Note that clang-format doesn't like the name "xor" for some reason. 
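// This file appears to mirror blake3_sse2.c: the only differences are that
// the 16- and 8-bit rotations use a single _mm_shuffle_epi8 byte shuffle and
// the message blends use the native SSE4.1 _mm_blend_epi16, rather than the
// emulated versions in the SSE2 path.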
22 | INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } 23 | 24 | INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } 25 | 26 | INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { 27 | return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); 28 | } 29 | 30 | INLINE __m128i rot16(__m128i x) { 31 | return _mm_shuffle_epi8( 32 | x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); 33 | } 34 | 35 | INLINE __m128i rot12(__m128i x) { 36 | return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); 37 | } 38 | 39 | INLINE __m128i rot8(__m128i x) { 40 | return _mm_shuffle_epi8( 41 | x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); 42 | } 43 | 44 | INLINE __m128i rot7(__m128i x) { 45 | return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); 46 | } 47 | 48 | INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, 49 | __m128i m) { 50 | *row0 = addv(addv(*row0, m), *row1); 51 | *row3 = xorv(*row3, *row0); 52 | *row3 = rot16(*row3); 53 | *row2 = addv(*row2, *row3); 54 | *row1 = xorv(*row1, *row2); 55 | *row1 = rot12(*row1); 56 | } 57 | 58 | INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, 59 | __m128i m) { 60 | *row0 = addv(addv(*row0, m), *row1); 61 | *row3 = xorv(*row3, *row0); 62 | *row3 = rot8(*row3); 63 | *row2 = addv(*row2, *row3); 64 | *row1 = xorv(*row1, *row2); 65 | *row1 = rot7(*row1); 66 | } 67 | 68 | // Note the optimization here of leaving row1 as the unrotated row, rather than 69 | // row0. All the message loads below are adjusted to compensate for this. See 70 | // discussion at https://github.com/sneves/blake2-avx2/pull/4 71 | INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { 72 | *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); 73 | *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); 74 | *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); 75 | } 76 | 77 | INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { 78 | *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); 79 | *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); 80 | *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); 81 | } 82 | 83 | INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], 84 | const uint8_t block[BLAKE3_BLOCK_LEN], 85 | uint8_t block_len, uint64_t counter, uint8_t flags) { 86 | rows[0] = loadu((uint8_t *)&cv[0]); 87 | rows[1] = loadu((uint8_t *)&cv[4]); 88 | rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); 89 | rows[3] = set4(counter_low(counter), counter_high(counter), 90 | (uint32_t)block_len, (uint32_t)flags); 91 | 92 | __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); 93 | __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); 94 | __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); 95 | __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); 96 | 97 | __m128i t0, t1, t2, t3, tt; 98 | 99 | // Round 1. The first round permutes the message words from the original 100 | // input order, into the groups that get mixed in parallel. 
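// Round 1 consumes the message words in their natural order (the identity
// schedule); rounds 2-7 then repeat one fixed shuffle sequence, because each
// later round applies the same word permutation to the previous round's
// t0..t3 vectors.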
101 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 102 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 103 | t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 104 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 105 | diagonalize(&rows[0], &rows[2], &rows[3]); 106 | t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 107 | t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 108 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 109 | t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 110 | t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 111 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 112 | undiagonalize(&rows[0], &rows[2], &rows[3]); 113 | m0 = t0; 114 | m1 = t1; 115 | m2 = t2; 116 | m3 = t3; 117 | 118 | // Round 2. This round and all following rounds apply a fixed permutation 119 | // to the message words from the round before. 120 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 121 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 122 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 123 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 124 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 125 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 126 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 127 | diagonalize(&rows[0], &rows[2], &rows[3]); 128 | t2 = _mm_unpacklo_epi64(m3, m1); 129 | tt = _mm_blend_epi16(t2, m2, 0xC0); 130 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 131 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 132 | t3 = _mm_unpackhi_epi32(m1, m3); 133 | tt = _mm_unpacklo_epi32(m2, t3); 134 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 135 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 136 | undiagonalize(&rows[0], &rows[2], &rows[3]); 137 | m0 = t0; 138 | m1 = t1; 139 | m2 = t2; 140 | m3 = t3; 141 | 142 | // Round 3 143 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 144 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 145 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 146 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 147 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 148 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 149 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 150 | diagonalize(&rows[0], &rows[2], &rows[3]); 151 | t2 = _mm_unpacklo_epi64(m3, m1); 152 | tt = _mm_blend_epi16(t2, m2, 0xC0); 153 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 154 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 155 | t3 = _mm_unpackhi_epi32(m1, m3); 156 | tt = _mm_unpacklo_epi32(m2, t3); 157 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 158 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 159 | undiagonalize(&rows[0], &rows[2], &rows[3]); 160 | m0 = t0; 161 | m1 = t1; 162 | m2 = t2; 163 | m3 = t3; 164 | 165 | // Round 4 166 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 167 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 168 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 169 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 170 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 171 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 172 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 173 | diagonalize(&rows[0], &rows[2], &rows[3]); 174 | t2 = _mm_unpacklo_epi64(m3, m1); 175 | tt = _mm_blend_epi16(t2, m2, 0xC0); 176 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 177 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 178 | t3 = _mm_unpackhi_epi32(m1, m3); 179 | tt = 
_mm_unpacklo_epi32(m2, t3); 180 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 181 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 182 | undiagonalize(&rows[0], &rows[2], &rows[3]); 183 | m0 = t0; 184 | m1 = t1; 185 | m2 = t2; 186 | m3 = t3; 187 | 188 | // Round 5 189 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 190 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 191 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 192 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 193 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 194 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 195 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 196 | diagonalize(&rows[0], &rows[2], &rows[3]); 197 | t2 = _mm_unpacklo_epi64(m3, m1); 198 | tt = _mm_blend_epi16(t2, m2, 0xC0); 199 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 200 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 201 | t3 = _mm_unpackhi_epi32(m1, m3); 202 | tt = _mm_unpacklo_epi32(m2, t3); 203 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 204 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 205 | undiagonalize(&rows[0], &rows[2], &rows[3]); 206 | m0 = t0; 207 | m1 = t1; 208 | m2 = t2; 209 | m3 = t3; 210 | 211 | // Round 6 212 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 213 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 214 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 215 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 216 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 217 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 218 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 219 | diagonalize(&rows[0], &rows[2], &rows[3]); 220 | t2 = _mm_unpacklo_epi64(m3, m1); 221 | tt = _mm_blend_epi16(t2, m2, 0xC0); 222 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 223 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 224 | t3 = _mm_unpackhi_epi32(m1, m3); 225 | tt = _mm_unpacklo_epi32(m2, t3); 226 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 227 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 228 | undiagonalize(&rows[0], &rows[2], &rows[3]); 229 | m0 = t0; 230 | m1 = t1; 231 | m2 = t2; 232 | m3 = t3; 233 | 234 | // Round 7 235 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 236 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 237 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 238 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 239 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 240 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 241 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 242 | diagonalize(&rows[0], &rows[2], &rows[3]); 243 | t2 = _mm_unpacklo_epi64(m3, m1); 244 | tt = _mm_blend_epi16(t2, m2, 0xC0); 245 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 246 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 247 | t3 = _mm_unpackhi_epi32(m1, m3); 248 | tt = _mm_unpacklo_epi32(m2, t3); 249 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 250 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 251 | undiagonalize(&rows[0], &rows[2], &rows[3]); 252 | } 253 | 254 | void blake3_compress_in_place_sse41(uint32_t cv[8], 255 | const uint8_t block[BLAKE3_BLOCK_LEN], 256 | uint8_t block_len, uint64_t counter, 257 | uint8_t flags) { 258 | __m128i rows[4]; 259 | compress_pre(rows, cv, block, block_len, counter, flags); 260 | storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); 261 | storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); 262 | } 263 | 264 | void blake3_compress_xof_sse41(const uint32_t cv[8], 265 | const uint8_t 
block[BLAKE3_BLOCK_LEN], 266 | uint8_t block_len, uint64_t counter, 267 | uint8_t flags, uint8_t out[64]) { 268 | __m128i rows[4]; 269 | compress_pre(rows, cv, block, block_len, counter, flags); 270 | storeu(xorv(rows[0], rows[2]), &out[0]); 271 | storeu(xorv(rows[1], rows[3]), &out[16]); 272 | storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); 273 | storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); 274 | } 275 | 276 | INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { 277 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); 278 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); 279 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); 280 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); 281 | v[0] = addv(v[0], v[4]); 282 | v[1] = addv(v[1], v[5]); 283 | v[2] = addv(v[2], v[6]); 284 | v[3] = addv(v[3], v[7]); 285 | v[12] = xorv(v[12], v[0]); 286 | v[13] = xorv(v[13], v[1]); 287 | v[14] = xorv(v[14], v[2]); 288 | v[15] = xorv(v[15], v[3]); 289 | v[12] = rot16(v[12]); 290 | v[13] = rot16(v[13]); 291 | v[14] = rot16(v[14]); 292 | v[15] = rot16(v[15]); 293 | v[8] = addv(v[8], v[12]); 294 | v[9] = addv(v[9], v[13]); 295 | v[10] = addv(v[10], v[14]); 296 | v[11] = addv(v[11], v[15]); 297 | v[4] = xorv(v[4], v[8]); 298 | v[5] = xorv(v[5], v[9]); 299 | v[6] = xorv(v[6], v[10]); 300 | v[7] = xorv(v[7], v[11]); 301 | v[4] = rot12(v[4]); 302 | v[5] = rot12(v[5]); 303 | v[6] = rot12(v[6]); 304 | v[7] = rot12(v[7]); 305 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); 306 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); 307 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); 308 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); 309 | v[0] = addv(v[0], v[4]); 310 | v[1] = addv(v[1], v[5]); 311 | v[2] = addv(v[2], v[6]); 312 | v[3] = addv(v[3], v[7]); 313 | v[12] = xorv(v[12], v[0]); 314 | v[13] = xorv(v[13], v[1]); 315 | v[14] = xorv(v[14], v[2]); 316 | v[15] = xorv(v[15], v[3]); 317 | v[12] = rot8(v[12]); 318 | v[13] = rot8(v[13]); 319 | v[14] = rot8(v[14]); 320 | v[15] = rot8(v[15]); 321 | v[8] = addv(v[8], v[12]); 322 | v[9] = addv(v[9], v[13]); 323 | v[10] = addv(v[10], v[14]); 324 | v[11] = addv(v[11], v[15]); 325 | v[4] = xorv(v[4], v[8]); 326 | v[5] = xorv(v[5], v[9]); 327 | v[6] = xorv(v[6], v[10]); 328 | v[7] = xorv(v[7], v[11]); 329 | v[4] = rot7(v[4]); 330 | v[5] = rot7(v[5]); 331 | v[6] = rot7(v[6]); 332 | v[7] = rot7(v[7]); 333 | 334 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); 335 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); 336 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); 337 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); 338 | v[0] = addv(v[0], v[5]); 339 | v[1] = addv(v[1], v[6]); 340 | v[2] = addv(v[2], v[7]); 341 | v[3] = addv(v[3], v[4]); 342 | v[15] = xorv(v[15], v[0]); 343 | v[12] = xorv(v[12], v[1]); 344 | v[13] = xorv(v[13], v[2]); 345 | v[14] = xorv(v[14], v[3]); 346 | v[15] = rot16(v[15]); 347 | v[12] = rot16(v[12]); 348 | v[13] = rot16(v[13]); 349 | v[14] = rot16(v[14]); 350 | v[10] = addv(v[10], v[15]); 351 | v[11] = addv(v[11], v[12]); 352 | v[8] = addv(v[8], v[13]); 353 | v[9] = addv(v[9], v[14]); 354 | v[5] = xorv(v[5], v[10]); 355 | v[6] = xorv(v[6], v[11]); 356 | v[7] = xorv(v[7], v[8]); 357 | v[4] = xorv(v[4], v[9]); 358 | v[5] = rot12(v[5]); 359 | v[6] = rot12(v[6]); 360 | v[7] = rot12(v[7]); 361 | v[4] = rot12(v[4]); 362 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); 363 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); 364 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); 365 | v[3] = 
addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); 366 | v[0] = addv(v[0], v[5]); 367 | v[1] = addv(v[1], v[6]); 368 | v[2] = addv(v[2], v[7]); 369 | v[3] = addv(v[3], v[4]); 370 | v[15] = xorv(v[15], v[0]); 371 | v[12] = xorv(v[12], v[1]); 372 | v[13] = xorv(v[13], v[2]); 373 | v[14] = xorv(v[14], v[3]); 374 | v[15] = rot8(v[15]); 375 | v[12] = rot8(v[12]); 376 | v[13] = rot8(v[13]); 377 | v[14] = rot8(v[14]); 378 | v[10] = addv(v[10], v[15]); 379 | v[11] = addv(v[11], v[12]); 380 | v[8] = addv(v[8], v[13]); 381 | v[9] = addv(v[9], v[14]); 382 | v[5] = xorv(v[5], v[10]); 383 | v[6] = xorv(v[6], v[11]); 384 | v[7] = xorv(v[7], v[8]); 385 | v[4] = xorv(v[4], v[9]); 386 | v[5] = rot7(v[5]); 387 | v[6] = rot7(v[6]); 388 | v[7] = rot7(v[7]); 389 | v[4] = rot7(v[4]); 390 | } 391 | 392 | INLINE void transpose_vecs(__m128i vecs[DEGREE]) { 393 | // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is 394 | // 22/33. Note that this doesn't split the vector into two lanes, as the 395 | // AVX2 counterparts do. 396 | __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); 397 | __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); 398 | __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); 399 | __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); 400 | 401 | // Interleave 64-bit lanes. 402 | __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); 403 | __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); 404 | __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); 405 | __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); 406 | 407 | vecs[0] = abcd_0; 408 | vecs[1] = abcd_1; 409 | vecs[2] = abcd_2; 410 | vecs[3] = abcd_3; 411 | } 412 | 413 | INLINE void transpose_msg_vecs(const uint8_t *const *inputs, 414 | size_t block_offset, __m128i out[16]) { 415 | out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); 416 | out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); 417 | out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); 418 | out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); 419 | out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); 420 | out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); 421 | out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); 422 | out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); 423 | out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); 424 | out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); 425 | out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); 426 | out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); 427 | out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); 428 | out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); 429 | out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); 430 | out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); 431 | for (size_t i = 0; i < 4; ++i) { 432 | _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); 433 | } 434 | transpose_vecs(&out[0]); 435 | transpose_vecs(&out[4]); 436 | transpose_vecs(&out[8]); 437 | transpose_vecs(&out[12]); 438 | } 439 | 440 | INLINE void load_counters(uint64_t counter, bool increment_counter, 441 | __m128i *out_lo, __m128i *out_hi) { 442 | const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); 443 | const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); 444 | const __m128i add1 = _mm_and_si128(mask, add0); 445 | __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1); 446 | __m128i carry = 
_mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), 447 | _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); 448 | __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry); 449 | *out_lo = l; 450 | *out_hi = h; 451 | } 452 | 453 | void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, 454 | const uint32_t key[8], uint64_t counter, 455 | bool increment_counter, uint8_t flags, 456 | uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 457 | __m128i h_vecs[8] = { 458 | set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), 459 | set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), 460 | }; 461 | __m128i counter_low_vec, counter_high_vec; 462 | load_counters(counter, increment_counter, &counter_low_vec, 463 | &counter_high_vec); 464 | uint8_t block_flags = flags | flags_start; 465 | 466 | for (size_t block = 0; block < blocks; block++) { 467 | if (block + 1 == blocks) { 468 | block_flags |= flags_end; 469 | } 470 | __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); 471 | __m128i block_flags_vec = set1(block_flags); 472 | __m128i msg_vecs[16]; 473 | transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); 474 | 475 | __m128i v[16] = { 476 | h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], 477 | h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], 478 | set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), 479 | counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, 480 | }; 481 | round_fn(v, msg_vecs, 0); 482 | round_fn(v, msg_vecs, 1); 483 | round_fn(v, msg_vecs, 2); 484 | round_fn(v, msg_vecs, 3); 485 | round_fn(v, msg_vecs, 4); 486 | round_fn(v, msg_vecs, 5); 487 | round_fn(v, msg_vecs, 6); 488 | h_vecs[0] = xorv(v[0], v[8]); 489 | h_vecs[1] = xorv(v[1], v[9]); 490 | h_vecs[2] = xorv(v[2], v[10]); 491 | h_vecs[3] = xorv(v[3], v[11]); 492 | h_vecs[4] = xorv(v[4], v[12]); 493 | h_vecs[5] = xorv(v[5], v[13]); 494 | h_vecs[6] = xorv(v[6], v[14]); 495 | h_vecs[7] = xorv(v[7], v[15]); 496 | 497 | block_flags = flags; 498 | } 499 | 500 | transpose_vecs(&h_vecs[0]); 501 | transpose_vecs(&h_vecs[4]); 502 | // The first four vecs now contain the first half of each output, and the 503 | // second four vecs contain the second half of each output. 
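// Same output layout as the SSE2 path: h_vecs[i] and h_vecs[i+4] are the
// low and high halves of output i, stored below as four contiguous 32-byte
// chaining values.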
504 | storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); 505 | storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); 506 | storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); 507 | storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); 508 | storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); 509 | storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); 510 | storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); 511 | storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); 512 | } 513 | 514 | INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, 515 | const uint32_t key[8], uint64_t counter, 516 | uint8_t flags, uint8_t flags_start, 517 | uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { 518 | uint32_t cv[8]; 519 | memcpy(cv, key, BLAKE3_KEY_LEN); 520 | uint8_t block_flags = flags | flags_start; 521 | while (blocks > 0) { 522 | if (blocks == 1) { 523 | block_flags |= flags_end; 524 | } 525 | blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, 526 | block_flags); 527 | input = &input[BLAKE3_BLOCK_LEN]; 528 | blocks -= 1; 529 | block_flags = flags; 530 | } 531 | memcpy(out, cv, BLAKE3_OUT_LEN); 532 | } 533 | 534 | void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, 535 | size_t blocks, const uint32_t key[8], 536 | uint64_t counter, bool increment_counter, 537 | uint8_t flags, uint8_t flags_start, 538 | uint8_t flags_end, uint8_t *out) { 539 | while (num_inputs >= DEGREE) { 540 | blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, 541 | flags_start, flags_end, out); 542 | if (increment_counter) { 543 | counter += DEGREE; 544 | } 545 | inputs += DEGREE; 546 | num_inputs -= DEGREE; 547 | out = &out[DEGREE * BLAKE3_OUT_LEN]; 548 | } 549 | while (num_inputs > 0) { 550 | hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, 551 | flags_end, out); 552 | if (increment_counter) { 553 | counter += 1; 554 | } 555 | inputs += 1; 556 | num_inputs -= 1; 557 | out = &out[BLAKE3_OUT_LEN]; 558 | } 559 | } 560 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | docker build --tag "blake3build:latest" . 
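# The container created below is never started; blake3.so already exists in
# the image (presumably produced during the image build), and the container
# only exists so `docker cp` can copy it into ./compiled before the
# container is removed.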
2 | docker create --name blake3build_container blake3build 3 | #/making/modules/blake3.so 4 | docker cp blake3build_container:/making/modules/blake3.so ./compiled/blake3.so 5 | docker rm blake3build_container -------------------------------------------------------------------------------- /compiled/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cypherbits/php-blake3/c269f1a25436fec48c04e8b771a8bbbfd5c198ac/compiled/.gitkeep -------------------------------------------------------------------------------- /config.m4: -------------------------------------------------------------------------------- 1 | PHP_ARG_ENABLE(blake3, 2 | [Whether to enable BLAKE3 support], 3 | [--enable-blake3 Enable BLAKE3 Extension]) 4 | 5 | if test "$PHP_BLAKE3" != "no"; then 6 | PHP_NEW_EXTENSION(blake3, php_blake3.c blake3.c blake3_dispatch.c blake3_portable.c blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S blake3_avx512_x86-64_unix.S, $ext_shared) 7 | fi -------------------------------------------------------------------------------- /config.w32: -------------------------------------------------------------------------------- 1 | ARG_ENABLE('blake3', 'blake3 support', 'no'); 2 | 3 | if (PHP_BLAKE3 != 'no') { 4 | AC_DEFINE('HAVE_BLAKE3', 1, 'blake3 support enabled'); 5 | 6 | EXTENSION('blake3', 'php_blake3.c', true, '/DZEND_ENABLE_STATIC_TSRMLS_CACHE=1'); 7 | ADD_SOURCES(configure_module_dirname, 'blake3b-ref.c blake3s-ref.c', 'blake3'); 8 | } 9 | -------------------------------------------------------------------------------- /php_blake3.c: -------------------------------------------------------------------------------- 1 | #ifdef HAVE_CONFIG_H 2 | #include "config.h" 3 | #endif 4 | 5 | #include "php.h" 6 | #include "ext/standard/info.h" 7 | #include "ext/hash/php_hash.h" 8 | #include "blake3.h" 9 | #include "php_blake3.h" 10 | 11 | #define PHP_BLAKE3_NAME "BLAKE3" 12 | #define PHP_BLAKE3_VERSION "0.1.0" 13 | 14 | ZEND_BEGIN_ARG_INFO_EX(arginfo_void, 0, 0, 0) 15 | ZEND_END_ARG_INFO() 16 | 17 | ZEND_BEGIN_ARG_INFO_EX(arginfo_blake3, 0, 0, 1) 18 | ZEND_ARG_INFO(0, str) 19 | ZEND_ARG_INFO(0, outputSize) 20 | ZEND_ARG_INFO(0, key) 21 | ZEND_ARG_INFO(0, rawOutput) 22 | ZEND_END_ARG_INFO() 23 | 24 | ZEND_BEGIN_ARG_INFO_EX(arginfo_blake3_file, 0, 0, 1) 25 | ZEND_ARG_INFO(0, filename) 26 | ZEND_ARG_INFO(0, rawOutput) 27 | ZEND_END_ARG_INFO() 28 | 29 | zend_function_entry blake3_functions[] = { 30 | PHP_FE(blake3, arginfo_blake3) 31 | PHP_FE(blake3_file, arginfo_blake3_file) 32 | {NULL, NULL, NULL} 33 | }; 34 | 35 | PHP_MINIT_FUNCTION(blake3){ 36 | REGISTER_LONG_CONSTANT("BLAKE3_OUT_LEN", 37 | BLAKE3_OUT_LEN, CONST_CS | CONST_PERSISTENT); 38 | } 39 | 40 | zend_module_entry blake3_module_entry = { 41 | #if ZEND_MODULE_API_NO >= 20010901 42 | STANDARD_MODULE_HEADER, 43 | #endif 44 | PHP_BLAKE3_NAME, 45 | blake3_functions, 46 | PHP_MINIT(blake3), 47 | NULL, 48 | NULL, 49 | NULL, 50 | NULL, 51 | #if ZEND_MODULE_API_NO >= 20010901 52 | PHP_BLAKE3_VERSION, 53 | #endif 54 | STANDARD_MODULE_PROPERTIES 55 | }; 56 | 57 | #ifdef COMPILE_DL_BLAKE3 58 | ZEND_GET_MODULE(blake3) 59 | #endif 60 | 61 | PHP_FUNCTION(blake3) 62 | { 63 | #if ZEND_MODULE_API_NO >= 20151012 64 | zend_long hashByteLength = BLAKE3_OUT_LEN; 65 | size_t dataByteLength; 66 | size_t keyLength = 0; 67 | #else 68 | long hashByteLength = BLAKE3_OUT_LEN; 69 | int dataByteLength; 70 | int keyLength = 0; 71 | #endif 72 | unsigned char *data; 73 | unsigned char 
*key; 74 | zend_bool rawOutput = 0; 75 | 76 | if (zend_parse_parameters(ZEND_NUM_ARGS(), "s|lsb", &data, &dataByteLength, &hashByteLength, &key, &keyLength, &rawOutput) == FAILURE) { 77 | return; 78 | } 79 | 80 | zend_bool hasError = 0; 81 | 82 | if (hashByteLength < 1) { 83 | hasError = 1; 84 | zend_error(E_ERROR, "BLAKE3 output length cannot be zero"); 85 | } 86 | 87 | if (keyLength > 0 && keyLength != BLAKE3_KEY_LEN) { 88 | hasError = 1; 89 | zend_error(E_ERROR, "BLAKE3 key length MUST be 32 bytes"); 90 | } 91 | 92 | if (hasError) { 93 | RETURN_FALSE; 94 | } 95 | 96 | char* hashOutput = (unsigned char*) emalloc(hashByteLength); 97 | 98 | int result = blake3(hashOutput, hashByteLength, data, dataByteLength, key, keyLength); 99 | 100 | if (result != 0) { 101 | zend_error(E_ERROR, "Error generating BLAKE3 hash"); 102 | efree(hashOutput); 103 | RETURN_FALSE; 104 | } 105 | 106 | if (rawOutput) { 107 | #if ZEND_MODULE_API_NO >= 20151012 108 | RETVAL_STRINGL(hashOutput, hashByteLength); 109 | #else 110 | RETVAL_STRINGL(hashOutput, hashByteLength, 1); 111 | #endif 112 | } else { 113 | char* hex = (char*) emalloc(hashByteLength * 2 + 1); 114 | php_hash_bin2hex(hex, (unsigned char *) hashOutput, hashByteLength); 115 | hex[hashByteLength * 2] = '\0'; 116 | 117 | #if ZEND_MODULE_API_NO >= 20151012 118 | RETVAL_STRING(hex); 119 | #else 120 | RETVAL_STRING(hex,1); 121 | #endif 122 | 123 | efree(hex); 124 | } 125 | 126 | efree(hashOutput); 127 | } 128 | 129 | PHP_FUNCTION(blake3_file) 130 | { 131 | #if ZEND_MODULE_API_NO >= 20151012 132 | zend_long hashByteLength = BLAKE3_OUT_LEN; 133 | size_t dataByteLength; 134 | #else 135 | long hashByteLength = BLAKE3_OUT_LEN; 136 | int dataByteLength; 137 | #endif 138 | 139 | char *data; 140 | int rawOutput = 0; 141 | 142 | php_stream *stream; 143 | int n; 144 | unsigned char buf[1024]; 145 | 146 | blake3_hasher hasher; 147 | 148 | if (zend_parse_parameters(ZEND_NUM_ARGS(), "p|b", &data, &dataByteLength, &rawOutput) == FAILURE) { 149 | return; 150 | } 151 | 152 | stream = php_stream_open_wrapper(data, "rb", REPORT_ERRORS, NULL); 153 | if (!stream) { 154 | RETURN_FALSE; 155 | } 156 | 157 | char* hashOutput = (char*) emalloc(hashByteLength); 158 | 159 | blake3_hasher_init(&hasher); 160 | 161 | while ((n = php_stream_read(stream, buf, sizeof(buf))) > 0) { 162 | blake3_hasher_update(&hasher, (const uint8_t *)buf, n); 163 | } 164 | 165 | blake3_hasher_finalize(&hasher, hashOutput, hashByteLength); 166 | 167 | php_stream_close(stream); 168 | 169 | if (n<0) { 170 | efree(hashOutput); 171 | RETURN_FALSE; 172 | } 173 | 174 | if (rawOutput) { 175 | #if ZEND_MODULE_API_NO >= 20151012 176 | RETVAL_STRINGL(hashOutput, hashByteLength); 177 | #else 178 | RETVAL_STRINGL(hashOutput, hashByteLength, 1); 179 | #endif 180 | } else { 181 | char* hex = (char*) emalloc(hashByteLength * 2 + 1); 182 | php_hash_bin2hex(hex, (unsigned char *) hashOutput, hashByteLength); 183 | hex[hashByteLength * 2] = '\0'; 184 | #if ZEND_MODULE_API_NO >= 20151012 185 | RETVAL_STRING(hex); 186 | #else 187 | RETVAL_STRING(hex,1); 188 | #endif 189 | efree(hex); 190 | } 191 | 192 | efree(hashOutput); 193 | } 194 | -------------------------------------------------------------------------------- /php_blake3.h: -------------------------------------------------------------------------------- 1 | #ifndef PHP_BLAKE3_H 2 | #define PHP_BLAKE3_H 3 | 4 | PHP_FUNCTION(blake3); 5 | PHP_FUNCTION(blake3_file); 6 | 7 | extern zend_module_entry blake3_module_entry; 8 | #define phpext_blake3_ptr &blake3_module_entry 9 | 10 | 
#endif 11 | --------------------------------------------------------------------------------