├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── blake3.c ├── blake3.h ├── blake3_avx2.c ├── blake3_avx2_x86-64_unix.S ├── blake3_avx2_x86-64_windows_gnu.S ├── blake3_avx2_x86-64_windows_msvc.asm ├── blake3_avx512.c ├── blake3_avx512_x86-64_unix.S ├── blake3_avx512_x86-64_windows_gnu.S ├── blake3_avx512_x86-64_windows_msvc.asm ├── blake3_dispatch.c ├── blake3_impl.h ├── blake3_portable.c ├── blake3_sse2.c ├── blake3_sse2_x86-64_unix.S ├── blake3_sse2_x86-64_windows_gnu.S ├── blake3_sse2_x86-64_windows_msvc.asm ├── blake3_sse41.c ├── blake3_sse41_x86-64_unix.S ├── blake3_sse41_x86-64_windows_gnu.S ├── blake3_sse41_x86-64_windows_msvc.asm ├── build.sh ├── compiled └── .gitkeep ├── config.m4 ├── config.w32 ├── php_blake3.c └── php_blake3.h /.gitignore: -------------------------------------------------------------------------------- 1 | .deps 2 | .libs/ 3 | Makefile 4 | Makefile.fragments 5 | Makefile.global 6 | Makefile.objects 7 | acinclude.m4 8 | aclocal.m4 9 | autom4te.cache/ 10 | blake2.la 11 | blake2b-ref.lo 12 | blake2s-ref.lo 13 | build/ 14 | config.guess 15 | config.h 16 | config.h.in 17 | config.log 18 | config.nice 19 | config.status 20 | config.sub 21 | configure 22 | configure.in 23 | install-sh 24 | libtool 25 | ltmain.sh 26 | missing 27 | mkinstalldirs 28 | modules/ 29 | include/ 30 | php_blake2.lo 31 | run-tests.php 32 | *.lo 33 | .idea/ 34 | configure.ac 35 | /compiled/*.so -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | php: 3 | - '8.1' 4 | - '8.2' 5 | - '8.3' 6 | env: 7 | - NO_INTERACTION=1 8 | before_script: 9 | - phpize 10 | - ./configure --enable-blake3 11 | - make 12 | script: make test 13 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | LABEL authors="cypherbits" 3 | ENV LC_ALL=C.UTF-8 4 | RUN apt-get update -y && apt-get dist-upgrade -y && apt-get install software-properties-common -y 5 | RUN add-apt-repository ppa:ondrej/php 6 | RUN apt-get update -y && apt-get install php8.3 php8.3-dev -y 7 | COPY . /making 8 | RUN cd /making && phpize && ./configure --enable-blake3 && make && make install 9 | CMD bash -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | CFLAGS = -g -O2 -mavx -mavx2 -mssse3 -mavx512f -msse4.1 -msse -mpclmul -mavx512dq -mavx512vl 2 | 3 | 4 | Copyright (c) 2012-present strawbrary 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PHP BLAKE3 Extension 2 | ============================ 3 | 4 | BLAKE3 is an improved and faster version of BLAKE2. 5 | 6 | This extension uses the official BLAKE3 C implementation, which is single-threaded, but it was still faster than SHA-256 or SHA-512 in my benchmarks (run on PHP 7.4). 7 | 8 | Installation 9 | ------------ 10 | Clone the repository and compile it: 11 | ```sh 12 | $ git clone https://github.com/cypherbits/php-blake3.git 13 | $ cd php-blake3 14 | $ phpize 15 | $ ./configure --enable-blake3 16 | $ make && sudo make install 17 | ``` 18 | 19 | Enable the extension by adding the following line to your php.ini file: 20 | 21 | ```sh 22 | extension=blake3.so 23 | ``` 24 | 25 | You may need to restart your web server to load the extension. 26 | 27 | 28 | Usage 29 | ---- 30 | 31 | **Global constants:** 32 | 33 | `BLAKE3_OUT_LEN: 32` 34 | 35 | **Functions:** 36 | 37 | ```php 38 | string blake3 ( string $str [, int $outputSize = 32, string $key, bool $rawOutput = false ] ) 39 | ``` 40 | 41 | * $str: The string to hash 42 | * $outputSize: The length of the output hash in bytes (between 1 and 64; defaults to 32) 43 | * $key: Turns the output into a keyed hash using the specified key. It MUST be exactly 32 bytes long. 44 | * $rawOutput: If set to true, then the hash is returned in raw binary format 45 | 46 | * Return value: A hex string containing the BLAKE3 hash of the input string (raw binary if $rawOutput is true). Default output size: 32 bytes. 47 | 48 | ```php 49 | string blake3_file ( string $filename [, bool $rawOutput = false ] ) 50 | ``` 51 | 52 | * $filename: The filename of the file to hash 53 | * $rawOutput: If set to true, then the hash is returned in raw binary format 54 | * Return value: A hex string containing the BLAKE3 hash of the input file 55 | 56 | Examples 57 | -------- 58 | ```php 59 | echo blake3(''); 60 | ``` 61 | 62 | af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 63 | 64 | ```php 65 | echo blake3('Hello world', 20); 66 | ``` 67 | 68 | e7e6fb7d2869d109b62cdb1227208d4016cdaa0a 69 | 70 | ```php 71 | echo blake3('Hello world', 32, 'cae8954e7b3415ea18303db548e15207'); 72 | ``` 73 | 74 | 75672fafd13480d2325914f0665795eceecad4e668d9ea2a87c40e71232a7d3a 75 | 76 | Benchmarks 77 | -------- 78 | ```php 79 | -------------------------------------------------------------------------------- /blake3.c: -------------------------------------------------------------------------------- 2 | #include <stdbool.h> 3 | #include <string.h> 4 | 5 | #include "blake3.h" 6 | #include "blake3_impl.h" 7 | 8 | 9 | /* inlen, at least, should be uint64_t. Others can be size_t. */ 10 | int blake3( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ) 11 | { 12 | // Initialize the hasher. 
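// This convenience wrapper drives the streaming API defined below: it selects
// keyed or plain initialization depending on whether a 32-byte key was
// supplied, feeds the whole input in a single blake3_hasher_update() call,
// and finalizes to `outlen` bytes (BLAKE3 is an extendable-output function,
// so any output length is valid).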
13 | blake3_hasher hasher; 14 | 15 | /* Verify parameters */ 16 | if ( NULL == in && inlen > 0 ) return -1; 17 | 18 | if ( NULL == out ) return -1; 19 | 20 | if( NULL == key && keylen > 0 ) return -1; 21 | 22 | if( keylen == BLAKE3_KEY_LEN ) 23 | { 24 | blake3_hasher_init_keyed(&hasher, key); 25 | } 26 | else 27 | { 28 | blake3_hasher_init(&hasher); 29 | } 30 | 31 | blake3_hasher_update(&hasher, in, inlen); 32 | 33 | blake3_hasher_finalize(&hasher, out, outlen); 34 | 35 | return 0; 36 | } 37 | 38 | INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], 39 | uint8_t flags) { 40 | memcpy(self->cv, key, BLAKE3_KEY_LEN); 41 | self->chunk_counter = 0; 42 | memset(self->buf, 0, BLAKE3_BLOCK_LEN); 43 | self->buf_len = 0; 44 | self->blocks_compressed = 0; 45 | self->flags = flags; 46 | } 47 | 48 | INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], 49 | uint64_t chunk_counter) { 50 | memcpy(self->cv, key, BLAKE3_KEY_LEN); 51 | self->chunk_counter = chunk_counter; 52 | self->blocks_compressed = 0; 53 | memset(self->buf, 0, BLAKE3_BLOCK_LEN); 54 | self->buf_len = 0; 55 | } 56 | 57 | INLINE size_t chunk_state_len(const blake3_chunk_state *self) { 58 | return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + 59 | ((size_t)self->buf_len); 60 | } 61 | 62 | INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, 63 | const uint8_t *input, size_t input_len) { 64 | size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); 65 | if (take > input_len) { 66 | take = input_len; 67 | } 68 | uint8_t *dest = self->buf + ((size_t)self->buf_len); 69 | memcpy(dest, input, take); 70 | self->buf_len += (uint8_t)take; 71 | return take; 72 | } 73 | 74 | INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { 75 | if (self->blocks_compressed == 0) { 76 | return CHUNK_START; 77 | } else { 78 | return 0; 79 | } 80 | } 81 | 82 | typedef struct { 83 | uint32_t input_cv[8]; 84 | uint64_t counter; 85 | uint8_t block[BLAKE3_BLOCK_LEN]; 86 | uint8_t block_len; 87 | uint8_t flags; 88 | } output_t; 89 | 90 | INLINE output_t make_output(const uint32_t input_cv[8], 91 | const uint8_t block[BLAKE3_BLOCK_LEN], 92 | uint8_t block_len, uint64_t counter, 93 | uint8_t flags) { 94 | output_t ret; 95 | memcpy(ret.input_cv, input_cv, 32); 96 | memcpy(ret.block, block, BLAKE3_BLOCK_LEN); 97 | ret.block_len = block_len; 98 | ret.counter = counter; 99 | ret.flags = flags; 100 | return ret; 101 | } 102 | 103 | // Chaining values within a given chunk (specifically the compress_in_place 104 | // interface) are represented as words. This avoids unnecessary bytes<->words 105 | // conversion overhead in the portable implementation. However, the hash_many 106 | // interface handles both user input and parent node blocks, so it accepts 107 | // bytes. For that reason, chaining values in the CV stack are represented as 108 | // bytes. 
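// An output_t is, in effect, a deferred compression: it captures the input
// chaining value, the final block, the block length, the counter and the
// flags of a node without compressing it yet, so the caller can later decide
// whether to finalize it as an interior node (output_chaining_value) or as
// the root (output_root_bytes, which adds the ROOT flag and can produce an
// arbitrarily long, seekable output stream).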
109 | INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { 110 | uint32_t cv_words[8]; 111 | memcpy(cv_words, self->input_cv, 32); 112 | blake3_compress_in_place(cv_words, self->block, self->block_len, 113 | self->counter, self->flags); 114 | store_cv_words(cv, cv_words); 115 | } 116 | 117 | INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, 118 | size_t out_len) { 119 | uint64_t output_block_counter = seek / 64; 120 | size_t offset_within_block = seek % 64; 121 | uint8_t wide_buf[64]; 122 | while (out_len > 0) { 123 | blake3_compress_xof(self->input_cv, self->block, self->block_len, 124 | output_block_counter, self->flags | ROOT, wide_buf); 125 | size_t available_bytes = 64 - offset_within_block; 126 | size_t memcpy_len; 127 | if (out_len > available_bytes) { 128 | memcpy_len = available_bytes; 129 | } else { 130 | memcpy_len = out_len; 131 | } 132 | memcpy(out, wide_buf + offset_within_block, memcpy_len); 133 | out += memcpy_len; 134 | out_len -= memcpy_len; 135 | output_block_counter += 1; 136 | offset_within_block = 0; 137 | } 138 | } 139 | 140 | INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, 141 | size_t input_len) { 142 | if (self->buf_len > 0) { 143 | size_t take = chunk_state_fill_buf(self, input, input_len); 144 | input += take; 145 | input_len -= take; 146 | if (input_len > 0) { 147 | blake3_compress_in_place( 148 | self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, 149 | self->flags | chunk_state_maybe_start_flag(self)); 150 | self->blocks_compressed += 1; 151 | self->buf_len = 0; 152 | memset(self->buf, 0, BLAKE3_BLOCK_LEN); 153 | } 154 | } 155 | 156 | while (input_len > BLAKE3_BLOCK_LEN) { 157 | blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, 158 | self->chunk_counter, 159 | self->flags | chunk_state_maybe_start_flag(self)); 160 | self->blocks_compressed += 1; 161 | input += BLAKE3_BLOCK_LEN; 162 | input_len -= BLAKE3_BLOCK_LEN; 163 | } 164 | 165 | size_t take = chunk_state_fill_buf(self, input, input_len); 166 | input += take; 167 | input_len -= take; 168 | } 169 | 170 | INLINE output_t chunk_state_output(const blake3_chunk_state *self) { 171 | uint8_t block_flags = 172 | self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; 173 | return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, 174 | block_flags); 175 | } 176 | 177 | INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], 178 | const uint32_t key[8], uint8_t flags) { 179 | return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); 180 | } 181 | 182 | // Given some input larger than one chunk, return the number of bytes that 183 | // should go in the left subtree. This is the largest power-of-2 number of 184 | // chunks that leaves at least 1 byte for the right subtree. 185 | INLINE size_t left_len(size_t content_len) { 186 | // Subtract 1 to reserve at least one byte for the right side. content_len 187 | // should always be greater than BLAKE3_CHUNK_LEN. 188 | size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; 189 | return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; 190 | } 191 | 192 | // Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time 193 | // on a single thread. Write out the chunk chaining values and return the 194 | // number of chunks hashed. These chunks are never the root and never empty; 195 | // those cases use a different codepath. 
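// Illustration (assuming the AVX2 path, SIMD degree 8): given 5000 bytes of
// input, the four complete 1024-byte chunks are hashed in one
// blake3_hash_many() call, the trailing 904 bytes go through a scalar
// chunk_state, five chaining values are written to `out`, and 5 is returned.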
196 | INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, 197 | const uint32_t key[8], 198 | uint64_t chunk_counter, uint8_t flags, 199 | uint8_t *out) { 200 | #if defined(BLAKE3_TESTING) 201 | assert(0 < input_len); 202 | assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); 203 | #endif 204 | 205 | const uint8_t *chunks_array[MAX_SIMD_DEGREE]; 206 | size_t input_position = 0; 207 | size_t chunks_array_len = 0; 208 | while (input_len - input_position >= BLAKE3_CHUNK_LEN) { 209 | chunks_array[chunks_array_len] = &input[input_position]; 210 | input_position += BLAKE3_CHUNK_LEN; 211 | chunks_array_len += 1; 212 | } 213 | 214 | blake3_hash_many(chunks_array, chunks_array_len, 215 | BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, 216 | true, flags, CHUNK_START, CHUNK_END, out); 217 | 218 | // Hash the remaining partial chunk, if there is one. Note that the empty 219 | // chunk (meaning the empty message) is a different codepath. 220 | if (input_len > input_position) { 221 | uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; 222 | blake3_chunk_state chunk_state; 223 | chunk_state_init(&chunk_state, key, flags); 224 | chunk_state.chunk_counter = counter; 225 | chunk_state_update(&chunk_state, &input[input_position], 226 | input_len - input_position); 227 | output_t output = chunk_state_output(&chunk_state); 228 | output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); 229 | return chunks_array_len + 1; 230 | } else { 231 | return chunks_array_len; 232 | } 233 | } 234 | 235 | // Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time 236 | // on a single thread. Write out the parent chaining values and return the 237 | // number of parents hashed. (If there's an odd input chaining value left over, 238 | // return it as an additional output.) These parents are never the root and 239 | // never empty; those cases use a different codepath. 240 | INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, 241 | size_t num_chaining_values, 242 | const uint32_t key[8], uint8_t flags, 243 | uint8_t *out) { 244 | #if defined(BLAKE3_TESTING) 245 | assert(2 <= num_chaining_values); 246 | assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); 247 | #endif 248 | 249 | const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; 250 | size_t parents_array_len = 0; 251 | while (num_chaining_values - (2 * parents_array_len) >= 2) { 252 | parents_array[parents_array_len] = 253 | &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; 254 | parents_array_len += 1; 255 | } 256 | 257 | blake3_hash_many(parents_array, parents_array_len, 1, key, 258 | 0, // Parents always use counter 0. 259 | false, flags | PARENT, 260 | 0, // Parents have no start flags. 261 | 0, // Parents have no end flags. 262 | out); 263 | 264 | // If there's an odd child left over, it becomes an output. 265 | if (num_chaining_values > 2 * parents_array_len) { 266 | memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], 267 | &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], 268 | BLAKE3_OUT_LEN); 269 | return parents_array_len + 1; 270 | } else { 271 | return parents_array_len; 272 | } 273 | } 274 | 275 | // The wide helper function returns (writes out) an array of chaining values 276 | // and returns the length of that array. The number of chaining values returned 277 | // is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, 278 | // if the input is shorter than that many chunks. 
The reason for maintaining a 279 | // wide array of chaining values going back up the tree, is to allow the 280 | // implementation to hash as many parents in parallel as possible. 281 | // 282 | // As a special case when the SIMD degree is 1, this function will still return 283 | // at least 2 outputs. This guarantees that this function doesn't perform the 284 | // root compression. (If it did, it would use the wrong flags, and also we 285 | // wouldn't be able to implement exendable ouput.) Note that this function is 286 | // not used when the whole input is only 1 chunk long; that's a different 287 | // codepath. 288 | // 289 | // Why not just have the caller split the input on the first update(), instead 290 | // of implementing this special rule? Because we don't want to limit SIMD or 291 | // multi-threading parallelism for that update(). 292 | static size_t blake3_compress_subtree_wide(const uint8_t *input, 293 | size_t input_len, 294 | const uint32_t key[8], 295 | uint64_t chunk_counter, 296 | uint8_t flags, uint8_t *out) { 297 | // Note that the single chunk case does *not* bump the SIMD degree up to 2 298 | // when it is 1. If this implementation adds multi-threading in the future, 299 | // this gives us the option of multi-threading even the 2-chunk case, which 300 | // can help performance on smaller platforms. 301 | if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { 302 | return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, 303 | out); 304 | } 305 | 306 | // With more than simd_degree chunks, we need to recurse. Start by dividing 307 | // the input into left and right subtrees. (Note that this is only optimal 308 | // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree 309 | // of 3 or something, we'll need a more complicated strategy.) 310 | size_t left_input_len = left_len(input_len); 311 | size_t right_input_len = input_len - left_input_len; 312 | const uint8_t *right_input = &input[left_input_len]; 313 | uint64_t right_chunk_counter = 314 | chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); 315 | 316 | // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to 317 | // account for the special case of returning 2 outputs when the SIMD degree 318 | // is 1. 319 | uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; 320 | size_t degree = blake3_simd_degree(); 321 | if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { 322 | // The special case: We always use a degree of at least two, to make 323 | // sure there are two outputs. Except, as noted above, at the chunk 324 | // level, where we allow degree=1. (Note that the 1-chunk-input case is 325 | // a different codepath.) 326 | degree = 2; 327 | } 328 | uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; 329 | 330 | // Recurse! If this implementation adds multi-threading support in the 331 | // future, this is where it will go. 332 | size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, 333 | chunk_counter, flags, cv_array); 334 | size_t right_n = blake3_compress_subtree_wide( 335 | right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); 336 | 337 | // The special case again. If simd_degree=1, then we'll have left_n=1 and 338 | // right_n=1. Rather than compressing them into a single output, return 339 | // them directly, to make sure we always have at least two outputs. 
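// (When left_n == 1 the left subtree was a single chunk, so the right
// subtree is at most one chunk too and right_n == 1; the 2 * BLAKE3_OUT_LEN
// memcpy below therefore copies exactly the two child CVs.)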
340 | if (left_n == 1) { 341 | memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); 342 | return 2; 343 | } 344 | 345 | // Otherwise, do one layer of parent node compression. 346 | size_t num_chaining_values = left_n + right_n; 347 | return compress_parents_parallel(cv_array, num_chaining_values, key, flags, 348 | out); 349 | } 350 | 351 | // Hash a subtree with compress_subtree_wide(), and then condense the resulting 352 | // list of chaining values down to a single parent node. Don't compress that 353 | // last parent node, however. Instead, return its message bytes (the 354 | // concatenated chaining values of its children). This is necessary when the 355 | // first call to update() supplies a complete subtree, because the topmost 356 | // parent node of that subtree could end up being the root. It's also necessary 357 | // for extended output in the general case. 358 | // 359 | // As with compress_subtree_wide(), this function is not used on inputs of 1 360 | // chunk or less. That's a different codepath. 361 | INLINE void compress_subtree_to_parent_node( 362 | const uint8_t *input, size_t input_len, const uint32_t key[8], 363 | uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { 364 | #if defined(BLAKE3_TESTING) 365 | assert(input_len > BLAKE3_CHUNK_LEN); 366 | #endif 367 | 368 | uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; 369 | size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, 370 | chunk_counter, flags, cv_array); 371 | 372 | // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, 373 | // compress_subtree_wide() returns more than 2 chaining values. Condense 374 | // them into 2 by forming parent nodes repeatedly. 375 | uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; 376 | while (num_cvs > 2) { 377 | num_cvs = 378 | compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); 379 | memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); 380 | } 381 | memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); 382 | } 383 | 384 | INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], 385 | uint8_t flags) { 386 | memcpy(self->key, key, BLAKE3_KEY_LEN); 387 | chunk_state_init(&self->chunk, key, flags); 388 | self->cv_stack_len = 0; 389 | } 390 | 391 | void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } 392 | 393 | void blake3_hasher_init_keyed(blake3_hasher *self, 394 | const uint8_t key[BLAKE3_KEY_LEN]) { 395 | uint32_t key_words[8]; 396 | load_key_words(key, key_words); 397 | hasher_init_base(self, key_words, KEYED_HASH); 398 | } 399 | 400 | void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, 401 | size_t context_len) { 402 | blake3_hasher context_hasher; 403 | hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); 404 | blake3_hasher_update(&context_hasher, context, context_len); 405 | uint8_t context_key[BLAKE3_KEY_LEN]; 406 | blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); 407 | uint32_t context_key_words[8]; 408 | load_key_words(context_key, context_key_words); 409 | hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); 410 | } 411 | 412 | void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { 413 | blake3_hasher_init_derive_key_raw(self, context, strlen(context)); 414 | } 415 | 416 | // As described in hasher_push_cv() below, we do "lazy merging", delaying 417 | // merges until right before the next CV is about to be added. This is 418 | // different from the reference implementation. 
Another difference is that we 419 | // aren't always merging 1 chunk at a time. Instead, each CV might represent 420 | // any power-of-two number of chunks, as long as the smaller-above-larger stack 421 | // order is maintained. Instead of the "count the trailing 0-bits" algorithm 422 | // described in the spec, we use a "count the total number of 1-bits" variant 423 | // that doesn't require us to retain the subtree size of the CV on top of the 424 | // stack. The principle is the same: each CV that should remain in the stack is 425 | // represented by a 1-bit in the total number of chunks (or bytes) so far. 426 | INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { 427 | size_t post_merge_stack_len = (size_t)popcnt(total_len); 428 | while (self->cv_stack_len > post_merge_stack_len) { 429 | uint8_t *parent_node = 430 | &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; 431 | output_t output = parent_output(parent_node, self->key, self->chunk.flags); 432 | output_chaining_value(&output, parent_node); 433 | self->cv_stack_len -= 1; 434 | } 435 | } 436 | 437 | // In reference_impl.rs, we merge the new CV with existing CVs from the stack 438 | // before pushing it. We can do that because we know more input is coming, so 439 | // we know none of the merges are root. 440 | // 441 | // This setting is different. We want to feed as much input as possible to 442 | // compress_subtree_wide(), without setting aside anything for the chunk_state. 443 | // If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once 444 | // as a single subtree, if at all possible. 445 | // 446 | // This leads to two problems: 447 | // 1) This 64 KiB input might be the only call that ever gets made to update. 448 | // In this case, the root node of the 64 KiB subtree would be the root node 449 | // of the whole tree, and it would need to be ROOT finalized. We can't 450 | // compress it until we know. 451 | // 2) This 64 KiB input might complete a larger tree, whose root node is 452 | // similarly going to be the the root of the whole tree. For example, maybe 453 | // we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the 454 | // node at the root of the 256 KiB subtree until we know how to finalize it. 455 | // 456 | // The second problem is solved with "lazy merging". That is, when we're about 457 | // to add a CV to the stack, we don't merge it with anything first, as the 458 | // reference impl does. Instead we do merges using the *previous* CV that was 459 | // added, which is sitting on top of the stack, and we put the new CV 460 | // (unmerged) on top of the stack afterwards. This guarantees that we never 461 | // merge the root node until finalize(). 462 | // 463 | // Solving the first problem requires an additional tool, 464 | // compress_subtree_to_parent_node(). That function always returns the top 465 | // *two* chaining values of the subtree it's compressing. We then do lazy 466 | // merging with each of them separately, so that the second CV will always 467 | // remain unmerged. (That also helps us support extendable output when we're 468 | // hashing an input all-at-once.) 
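// Worked example of the rule above (for illustration): after 7 chunks the
// stack holds CVs for subtrees of 4, 2 and 1 chunks, matching popcnt(7) == 3.
// Pushing the CV of the 8th chunk first calls hasher_merge_cv_stack() with
// total_len == 7, which merges nothing, so the new CV becomes a 4th entry.
// Only when the 9th chunk's CV is pushed (total_len == 8, popcnt == 1) do
// those four entries collapse into a single CV covering the first 8 chunks.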
469 | INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], 470 | uint64_t chunk_counter) { 471 | hasher_merge_cv_stack(self, chunk_counter); 472 | memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, 473 | BLAKE3_OUT_LEN); 474 | self->cv_stack_len += 1; 475 | } 476 | 477 | void blake3_hasher_update(blake3_hasher *self, const void *input, 478 | size_t input_len) { 479 | // Explicitly checking for zero avoids causing UB by passing a null pointer 480 | // to memcpy. This comes up in practice with things like: 481 | // std::vector v; 482 | // blake3_hasher_update(&hasher, v.data(), v.size()); 483 | if (input_len == 0) { 484 | return; 485 | } 486 | 487 | const uint8_t *input_bytes = (const uint8_t *)input; 488 | 489 | // If we have some partial chunk bytes in the internal chunk_state, we need 490 | // to finish that chunk first. 491 | if (chunk_state_len(&self->chunk) > 0) { 492 | size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); 493 | if (take > input_len) { 494 | take = input_len; 495 | } 496 | chunk_state_update(&self->chunk, input_bytes, take); 497 | input_bytes += take; 498 | input_len -= take; 499 | // If we've filled the current chunk and there's more coming, finalize this 500 | // chunk and proceed. In this case we know it's not the root. 501 | if (input_len > 0) { 502 | output_t output = chunk_state_output(&self->chunk); 503 | uint8_t chunk_cv[32]; 504 | output_chaining_value(&output, chunk_cv); 505 | hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); 506 | chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); 507 | } else { 508 | return; 509 | } 510 | } 511 | 512 | // Now the chunk_state is clear, and we have more input. If there's more than 513 | // a single chunk (so, definitely not the root chunk), hash the largest whole 514 | // subtree we can, with the full benefits of SIMD (and maybe in the future, 515 | // multi-threading) parallelism. Two restrictions: 516 | // - The subtree has to be a power-of-2 number of chunks. Only subtrees along 517 | // the right edge can be incomplete, and we don't know where the right edge 518 | // is going to be until we get to finalize(). 519 | // - The subtree must evenly divide the total number of chunks up until this 520 | // point (if total is not 0). If the current incomplete subtree is only 521 | // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have 522 | // to complete the current subtree first. 523 | // Because we might need to break up the input to form powers of 2, or to 524 | // evenly divide what we already have, this part runs in a loop. 525 | while (input_len > BLAKE3_CHUNK_LEN) { 526 | size_t subtree_len = round_down_to_power_of_2(input_len); 527 | uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; 528 | // Shrink the subtree_len until it evenly divides the count so far. We know 529 | // that subtree_len itself is a power of 2, so we can use a bitmasking 530 | // trick instead of an actual remainder operation. (Note that if the caller 531 | // consistently passes power-of-2 inputs of the same size, as is hopefully 532 | // typical, this loop condition will always fail, and subtree_len will 533 | // always be the full length of the input.) 534 | // 535 | // An aside: We don't have to shrink subtree_len quite this much. For 536 | // example, if count_so_far is 1, we could pass 2 chunks to 537 | // compress_subtree_to_parent_node. 
Since we'll get 2 CVs back, we'll still 538 | // get the right answer in the end, and we might get to use 2-way SIMD 539 | // parallelism. The problem with this optimization, is that it gets us 540 | // stuck always hashing 2 chunks. The total number of chunks will remain 541 | // odd, and we'll never graduate to higher degrees of parallelism. See 542 | // https://github.com/BLAKE3-team/BLAKE3/issues/69. 543 | while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { 544 | subtree_len /= 2; 545 | } 546 | // The shrunken subtree_len might now be 1 chunk long. If so, hash that one 547 | // chunk by itself. Otherwise, compress the subtree into a pair of CVs. 548 | uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; 549 | if (subtree_len <= BLAKE3_CHUNK_LEN) { 550 | blake3_chunk_state chunk_state; 551 | chunk_state_init(&chunk_state, self->key, self->chunk.flags); 552 | chunk_state.chunk_counter = self->chunk.chunk_counter; 553 | chunk_state_update(&chunk_state, input_bytes, subtree_len); 554 | output_t output = chunk_state_output(&chunk_state); 555 | uint8_t cv[BLAKE3_OUT_LEN]; 556 | output_chaining_value(&output, cv); 557 | hasher_push_cv(self, cv, chunk_state.chunk_counter); 558 | } else { 559 | // This is the high-performance happy path, though getting here depends 560 | // on the caller giving us a long enough input. 561 | uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; 562 | compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, 563 | self->chunk.chunk_counter, 564 | self->chunk.flags, cv_pair); 565 | hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); 566 | hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], 567 | self->chunk.chunk_counter + (subtree_chunks / 2)); 568 | } 569 | self->chunk.chunk_counter += subtree_chunks; 570 | input_bytes += subtree_len; 571 | input_len -= subtree_len; 572 | } 573 | 574 | // If there's any remaining input less than a full chunk, add it to the chunk 575 | // state. In that case, also do a final merge loop to make sure the subtree 576 | // stack doesn't contain any unmerged pairs. The remaining input means we 577 | // know these merges are non-root. This merge loop isn't strictly necessary 578 | // here, because hasher_push_chunk_cv already does its own merge loop, but it 579 | // simplifies blake3_hasher_finalize below. 580 | if (input_len > 0) { 581 | chunk_state_update(&self->chunk, input_bytes, input_len); 582 | hasher_merge_cv_stack(self, self->chunk.chunk_counter); 583 | } 584 | } 585 | 586 | void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, 587 | size_t out_len) { 588 | blake3_hasher_finalize_seek(self, 0, out, out_len); 589 | } 590 | 591 | void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, 592 | uint8_t *out, size_t out_len) { 593 | // Explicitly checking for zero avoids causing UB by passing a null pointer 594 | // to memcpy. This comes up in practice with things like: 595 | // std::vector v; 596 | // blake3_hasher_finalize(&hasher, v.data(), v.size()); 597 | if (out_len == 0) { 598 | return; 599 | } 600 | 601 | // If the subtree stack is empty, then the current chunk is the root. 602 | if (self->cv_stack_len == 0) { 603 | output_t output = chunk_state_output(&self->chunk); 604 | output_root_bytes(&output, seek, out, out_len); 605 | return; 606 | } 607 | // If there are any bytes in the chunk state, finalize that chunk and do a 608 | // roll-up merge between that chunk hash and every subtree in the stack. 
In 609 | // this case, the extra merge loop at the end of blake3_hasher_update 610 | // guarantees that none of the subtrees in the stack need to be merged with 611 | // each other first. Otherwise, if there are no bytes in the chunk state, 612 | // then the top of the stack is a chunk hash, and we start the merge from 613 | // that. 614 | output_t output; 615 | size_t cvs_remaining; 616 | if (chunk_state_len(&self->chunk) > 0) { 617 | cvs_remaining = self->cv_stack_len; 618 | output = chunk_state_output(&self->chunk); 619 | } else { 620 | // There are always at least 2 CVs in the stack in this case. 621 | cvs_remaining = self->cv_stack_len - 2; 622 | output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, 623 | self->chunk.flags); 624 | } 625 | while (cvs_remaining > 0) { 626 | cvs_remaining -= 1; 627 | uint8_t parent_block[BLAKE3_BLOCK_LEN]; 628 | memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); 629 | output_chaining_value(&output, &parent_block[32]); 630 | output = parent_output(parent_block, self->key, self->chunk.flags); 631 | } 632 | output_root_bytes(&output, seek, out, out_len); 633 | } 634 | -------------------------------------------------------------------------------- /blake3.h: -------------------------------------------------------------------------------- 1 | #ifndef BLAKE3_H 2 | #define BLAKE3_H 3 | 4 | #include 5 | #include 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | #define BLAKE3_KEY_LEN 32 12 | #define BLAKE3_OUT_LEN 32 13 | #define BLAKE3_BLOCK_LEN 64 14 | #define BLAKE3_CHUNK_LEN 1024 15 | #define BLAKE3_MAX_DEPTH 54 16 | #define BLAKE3_MAX_SIMD_DEGREE 16 17 | 18 | // This struct is a private implementation detail. It has to be here because 19 | // it's part of blake3_hasher below. 20 | typedef struct { 21 | uint32_t cv[8]; 22 | uint64_t chunk_counter; 23 | uint8_t buf[BLAKE3_BLOCK_LEN]; 24 | uint8_t buf_len; 25 | uint8_t blocks_compressed; 26 | uint8_t flags; 27 | } blake3_chunk_state; 28 | 29 | typedef struct { 30 | uint32_t key[8]; 31 | blake3_chunk_state chunk; 32 | uint8_t cv_stack_len; 33 | // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, 34 | // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk 35 | // requires a 4th entry, rather than merging everything down to 1, because we 36 | // don't know whether more input is coming. This is different from how the 37 | // reference implementation does things. 
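// (With BLAKE3_MAX_DEPTH == 54, the stack below reserves 55 * 32 = 1760
// bytes: up to 54 merged subtree CVs plus one extra slot for the lazily
// pushed, still-unmerged CV.)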
38 | uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; 39 | } blake3_hasher; 40 | 41 | void blake3_hasher_init(blake3_hasher *self); 42 | void blake3_hasher_init_keyed(blake3_hasher *self, 43 | const uint8_t key[BLAKE3_KEY_LEN]); 44 | void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); 45 | void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, 46 | size_t context_len); 47 | void blake3_hasher_update(blake3_hasher *self, const void *input, 48 | size_t input_len); 49 | void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, 50 | size_t out_len); 51 | void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, 52 | uint8_t *out, size_t out_len); 53 | 54 | 55 | /* This is simply an alias for blake2b */ 56 | int blake3( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ); 57 | 58 | #ifdef __cplusplus 59 | } 60 | #endif 61 | 62 | #endif /* BLAKE3_H */ 63 | -------------------------------------------------------------------------------- /blake3_avx2.c: -------------------------------------------------------------------------------- 1 | #include "blake3_impl.h" 2 | 3 | #include 4 | 5 | #define DEGREE 8 6 | 7 | INLINE __m256i loadu(const uint8_t src[32]) { 8 | return _mm256_loadu_si256((const __m256i *)src); 9 | } 10 | 11 | INLINE void storeu(__m256i src, uint8_t dest[16]) { 12 | _mm256_storeu_si256((__m256i *)dest, src); 13 | } 14 | 15 | INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } 16 | 17 | // Note that clang-format doesn't like the name "xor" for some reason. 18 | INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } 19 | 20 | INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } 21 | 22 | INLINE __m256i rot16(__m256i x) { 23 | return _mm256_shuffle_epi8( 24 | x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 25 | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); 26 | } 27 | 28 | INLINE __m256i rot12(__m256i x) { 29 | return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)); 30 | } 31 | 32 | INLINE __m256i rot8(__m256i x) { 33 | return _mm256_shuffle_epi8( 34 | x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, 35 | 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); 36 | } 37 | 38 | INLINE __m256i rot7(__m256i x) { 39 | return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)); 40 | } 41 | 42 | INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { 43 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); 44 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); 45 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); 46 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); 47 | v[0] = addv(v[0], v[4]); 48 | v[1] = addv(v[1], v[5]); 49 | v[2] = addv(v[2], v[6]); 50 | v[3] = addv(v[3], v[7]); 51 | v[12] = xorv(v[12], v[0]); 52 | v[13] = xorv(v[13], v[1]); 53 | v[14] = xorv(v[14], v[2]); 54 | v[15] = xorv(v[15], v[3]); 55 | v[12] = rot16(v[12]); 56 | v[13] = rot16(v[13]); 57 | v[14] = rot16(v[14]); 58 | v[15] = rot16(v[15]); 59 | v[8] = addv(v[8], v[12]); 60 | v[9] = addv(v[9], v[13]); 61 | v[10] = addv(v[10], v[14]); 62 | v[11] = addv(v[11], v[15]); 63 | v[4] = xorv(v[4], v[8]); 64 | v[5] = xorv(v[5], v[9]); 65 | v[6] = xorv(v[6], v[10]); 66 | v[7] = xorv(v[7], v[11]); 67 | v[4] = rot12(v[4]); 68 | v[5] = rot12(v[5]); 69 | v[6] = rot12(v[6]); 70 | v[7] = rot12(v[7]); 71 | v[0] = addv(v[0], 
m[(size_t)MSG_SCHEDULE[r][1]]); 72 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); 73 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); 74 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); 75 | v[0] = addv(v[0], v[4]); 76 | v[1] = addv(v[1], v[5]); 77 | v[2] = addv(v[2], v[6]); 78 | v[3] = addv(v[3], v[7]); 79 | v[12] = xorv(v[12], v[0]); 80 | v[13] = xorv(v[13], v[1]); 81 | v[14] = xorv(v[14], v[2]); 82 | v[15] = xorv(v[15], v[3]); 83 | v[12] = rot8(v[12]); 84 | v[13] = rot8(v[13]); 85 | v[14] = rot8(v[14]); 86 | v[15] = rot8(v[15]); 87 | v[8] = addv(v[8], v[12]); 88 | v[9] = addv(v[9], v[13]); 89 | v[10] = addv(v[10], v[14]); 90 | v[11] = addv(v[11], v[15]); 91 | v[4] = xorv(v[4], v[8]); 92 | v[5] = xorv(v[5], v[9]); 93 | v[6] = xorv(v[6], v[10]); 94 | v[7] = xorv(v[7], v[11]); 95 | v[4] = rot7(v[4]); 96 | v[5] = rot7(v[5]); 97 | v[6] = rot7(v[6]); 98 | v[7] = rot7(v[7]); 99 | 100 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); 101 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); 102 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); 103 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); 104 | v[0] = addv(v[0], v[5]); 105 | v[1] = addv(v[1], v[6]); 106 | v[2] = addv(v[2], v[7]); 107 | v[3] = addv(v[3], v[4]); 108 | v[15] = xorv(v[15], v[0]); 109 | v[12] = xorv(v[12], v[1]); 110 | v[13] = xorv(v[13], v[2]); 111 | v[14] = xorv(v[14], v[3]); 112 | v[15] = rot16(v[15]); 113 | v[12] = rot16(v[12]); 114 | v[13] = rot16(v[13]); 115 | v[14] = rot16(v[14]); 116 | v[10] = addv(v[10], v[15]); 117 | v[11] = addv(v[11], v[12]); 118 | v[8] = addv(v[8], v[13]); 119 | v[9] = addv(v[9], v[14]); 120 | v[5] = xorv(v[5], v[10]); 121 | v[6] = xorv(v[6], v[11]); 122 | v[7] = xorv(v[7], v[8]); 123 | v[4] = xorv(v[4], v[9]); 124 | v[5] = rot12(v[5]); 125 | v[6] = rot12(v[6]); 126 | v[7] = rot12(v[7]); 127 | v[4] = rot12(v[4]); 128 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); 129 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); 130 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); 131 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); 132 | v[0] = addv(v[0], v[5]); 133 | v[1] = addv(v[1], v[6]); 134 | v[2] = addv(v[2], v[7]); 135 | v[3] = addv(v[3], v[4]); 136 | v[15] = xorv(v[15], v[0]); 137 | v[12] = xorv(v[12], v[1]); 138 | v[13] = xorv(v[13], v[2]); 139 | v[14] = xorv(v[14], v[3]); 140 | v[15] = rot8(v[15]); 141 | v[12] = rot8(v[12]); 142 | v[13] = rot8(v[13]); 143 | v[14] = rot8(v[14]); 144 | v[10] = addv(v[10], v[15]); 145 | v[11] = addv(v[11], v[12]); 146 | v[8] = addv(v[8], v[13]); 147 | v[9] = addv(v[9], v[14]); 148 | v[5] = xorv(v[5], v[10]); 149 | v[6] = xorv(v[6], v[11]); 150 | v[7] = xorv(v[7], v[8]); 151 | v[4] = xorv(v[4], v[9]); 152 | v[5] = rot7(v[5]); 153 | v[6] = rot7(v[6]); 154 | v[7] = rot7(v[7]); 155 | v[4] = rot7(v[4]); 156 | } 157 | 158 | INLINE void transpose_vecs(__m256i vecs[DEGREE]) { 159 | // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high 160 | // is 22/33/66/77. 161 | __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); 162 | __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); 163 | __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); 164 | __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); 165 | __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); 166 | __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); 167 | __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); 168 | __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); 169 | 170 | // Interleave 64-bit lates. 
The low unpack is lanes 00/22 and the high is 171 | // 11/33. 172 | __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); 173 | __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); 174 | __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); 175 | __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); 176 | __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); 177 | __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); 178 | __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); 179 | __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); 180 | 181 | // Interleave 128-bit lanes. 182 | vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); 183 | vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); 184 | vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); 185 | vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); 186 | vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); 187 | vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); 188 | vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); 189 | vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); 190 | } 191 | 192 | INLINE void transpose_msg_vecs(const uint8_t *const *inputs, 193 | size_t block_offset, __m256i out[16]) { 194 | out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]); 195 | out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]); 196 | out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]); 197 | out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]); 198 | out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]); 199 | out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]); 200 | out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]); 201 | out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]); 202 | out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]); 203 | out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]); 204 | out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]); 205 | out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]); 206 | out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]); 207 | out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]); 208 | out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); 209 | out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); 210 | for (size_t i = 0; i < 8; ++i) { 211 | _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); 212 | } 213 | transpose_vecs(&out[0]); 214 | transpose_vecs(&out[8]); 215 | } 216 | 217 | INLINE void load_counters(uint64_t counter, bool increment_counter, 218 | __m256i *out_lo, __m256i *out_hi) { 219 | const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter); 220 | const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); 221 | const __m256i add1 = _mm256_and_si256(mask, add0); 222 | __m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1); 223 | __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), 224 | _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); 225 | __m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry); 226 | *out_lo = l; 227 | *out_hi = h; 228 | } 229 | 230 | void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, 231 | const uint32_t key[8], uint64_t counter, 232 | bool increment_counter, uint8_t flags, 233 | uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 234 | __m256i h_vecs[8] = { 235 | set1(key[0]), 
set1(key[1]), set1(key[2]), set1(key[3]), 236 | set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), 237 | }; 238 | __m256i counter_low_vec, counter_high_vec; 239 | load_counters(counter, increment_counter, &counter_low_vec, 240 | &counter_high_vec); 241 | uint8_t block_flags = flags | flags_start; 242 | 243 | for (size_t block = 0; block < blocks; block++) { 244 | if (block + 1 == blocks) { 245 | block_flags |= flags_end; 246 | } 247 | __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN); 248 | __m256i block_flags_vec = set1(block_flags); 249 | __m256i msg_vecs[16]; 250 | transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); 251 | 252 | __m256i v[16] = { 253 | h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], 254 | h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], 255 | set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), 256 | counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, 257 | }; 258 | round_fn(v, msg_vecs, 0); 259 | round_fn(v, msg_vecs, 1); 260 | round_fn(v, msg_vecs, 2); 261 | round_fn(v, msg_vecs, 3); 262 | round_fn(v, msg_vecs, 4); 263 | round_fn(v, msg_vecs, 5); 264 | round_fn(v, msg_vecs, 6); 265 | h_vecs[0] = xorv(v[0], v[8]); 266 | h_vecs[1] = xorv(v[1], v[9]); 267 | h_vecs[2] = xorv(v[2], v[10]); 268 | h_vecs[3] = xorv(v[3], v[11]); 269 | h_vecs[4] = xorv(v[4], v[12]); 270 | h_vecs[5] = xorv(v[5], v[13]); 271 | h_vecs[6] = xorv(v[6], v[14]); 272 | h_vecs[7] = xorv(v[7], v[15]); 273 | 274 | block_flags = flags; 275 | } 276 | 277 | transpose_vecs(h_vecs); 278 | storeu(h_vecs[0], &out[0 * sizeof(__m256i)]); 279 | storeu(h_vecs[1], &out[1 * sizeof(__m256i)]); 280 | storeu(h_vecs[2], &out[2 * sizeof(__m256i)]); 281 | storeu(h_vecs[3], &out[3 * sizeof(__m256i)]); 282 | storeu(h_vecs[4], &out[4 * sizeof(__m256i)]); 283 | storeu(h_vecs[5], &out[5 * sizeof(__m256i)]); 284 | storeu(h_vecs[6], &out[6 * sizeof(__m256i)]); 285 | storeu(h_vecs[7], &out[7 * sizeof(__m256i)]); 286 | } 287 | 288 | #if !defined(BLAKE3_NO_SSE41) 289 | void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, 290 | size_t blocks, const uint32_t key[8], 291 | uint64_t counter, bool increment_counter, 292 | uint8_t flags, uint8_t flags_start, 293 | uint8_t flags_end, uint8_t *out); 294 | #else 295 | void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, 296 | size_t blocks, const uint32_t key[8], 297 | uint64_t counter, bool increment_counter, 298 | uint8_t flags, uint8_t flags_start, 299 | uint8_t flags_end, uint8_t *out); 300 | #endif 301 | 302 | void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, 303 | size_t blocks, const uint32_t key[8], 304 | uint64_t counter, bool increment_counter, 305 | uint8_t flags, uint8_t flags_start, 306 | uint8_t flags_end, uint8_t *out) { 307 | while (num_inputs >= DEGREE) { 308 | blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, 309 | flags_start, flags_end, out); 310 | if (increment_counter) { 311 | counter += DEGREE; 312 | } 313 | inputs += DEGREE; 314 | num_inputs -= DEGREE; 315 | out = &out[DEGREE * BLAKE3_OUT_LEN]; 316 | } 317 | #if !defined(BLAKE3_NO_SSE41) 318 | blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, 319 | increment_counter, flags, flags_start, flags_end, out); 320 | #else 321 | blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, 322 | increment_counter, flags, flags_start, flags_end, 323 | out); 324 | #endif 325 | } 326 | -------------------------------------------------------------------------------- /blake3_avx512.c: 
-------------------------------------------------------------------------------- 1 | #include "blake3_impl.h" 2 | 3 | #include 4 | 5 | #define _mm_shuffle_ps2(a, b, c) \ 6 | (_mm_castps_si128( \ 7 | _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) 8 | 9 | INLINE __m128i loadu_128(const uint8_t src[16]) { 10 | return _mm_loadu_si128((const __m128i *)src); 11 | } 12 | 13 | INLINE __m256i loadu_256(const uint8_t src[32]) { 14 | return _mm256_loadu_si256((const __m256i *)src); 15 | } 16 | 17 | INLINE __m512i loadu_512(const uint8_t src[64]) { 18 | return _mm512_loadu_si512((const __m512i *)src); 19 | } 20 | 21 | INLINE void storeu_128(__m128i src, uint8_t dest[16]) { 22 | _mm_storeu_si128((__m128i *)dest, src); 23 | } 24 | 25 | INLINE void storeu_256(__m256i src, uint8_t dest[16]) { 26 | _mm256_storeu_si256((__m256i *)dest, src); 27 | } 28 | 29 | INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } 30 | 31 | INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } 32 | 33 | INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); } 34 | 35 | INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } 36 | 37 | INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } 38 | 39 | INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); } 40 | 41 | INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); } 42 | 43 | INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } 44 | 45 | INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); } 46 | 47 | INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { 48 | return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); 49 | } 50 | 51 | INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); } 52 | 53 | INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); } 54 | 55 | INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); } 56 | 57 | INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); } 58 | 59 | INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); } 60 | 61 | INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); } 62 | 63 | INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); } 64 | 65 | INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); } 66 | 67 | INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); } 68 | 69 | INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); } 70 | 71 | INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); } 72 | 73 | INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); } 74 | 75 | /* 76 | * ---------------------------------------------------------------------------- 77 | * compress_avx512 78 | * ---------------------------------------------------------------------------- 79 | */ 80 | 81 | INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, 82 | __m128i m) { 83 | *row0 = add_128(add_128(*row0, m), *row1); 84 | *row3 = xor_128(*row3, *row0); 85 | *row3 = rot16_128(*row3); 86 | *row2 = add_128(*row2, *row3); 87 | *row1 = xor_128(*row1, *row2); 88 | *row1 = rot12_128(*row1); 89 | } 90 | 91 | INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, 92 | __m128i m) { 93 | *row0 = add_128(add_128(*row0, m), *row1); 94 | *row3 = xor_128(*row3, *row0); 95 | *row3 = rot8_128(*row3); 96 | *row2 = 
add_128(*row2, *row3); 97 | *row1 = xor_128(*row1, *row2); 98 | *row1 = rot7_128(*row1); 99 | } 100 | 101 | // Note the optimization here of leaving row1 as the unrotated row, rather than 102 | // row0. All the message loads below are adjusted to compensate for this. See 103 | // discussion at https://github.com/sneves/blake2-avx2/pull/4 104 | INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { 105 | *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); 106 | *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); 107 | *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); 108 | } 109 | 110 | INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { 111 | *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); 112 | *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); 113 | *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); 114 | } 115 | 116 | INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], 117 | const uint8_t block[BLAKE3_BLOCK_LEN], 118 | uint8_t block_len, uint64_t counter, uint8_t flags) { 119 | rows[0] = loadu_128((uint8_t *)&cv[0]); 120 | rows[1] = loadu_128((uint8_t *)&cv[4]); 121 | rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); 122 | rows[3] = set4(counter_low(counter), counter_high(counter), 123 | (uint32_t)block_len, (uint32_t)flags); 124 | 125 | __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]); 126 | __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]); 127 | __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]); 128 | __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); 129 | 130 | __m128i t0, t1, t2, t3, tt; 131 | 132 | // Round 1. The first round permutes the message words from the original 133 | // input order, into the groups that get mixed in parallel. 134 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 135 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 136 | t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 137 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 138 | diagonalize(&rows[0], &rows[2], &rows[3]); 139 | t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 140 | t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 141 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 142 | t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 143 | t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 144 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 145 | undiagonalize(&rows[0], &rows[2], &rows[3]); 146 | m0 = t0; 147 | m1 = t1; 148 | m2 = t2; 149 | m3 = t3; 150 | 151 | // Round 2. This round and all following rounds apply a fixed permutation 152 | // to the message words from the round before. 
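// (Rounds 2 through 7 below are textually identical: each block applies one
// step of the BLAKE3 message permutation via the same shuffle/blend
// sequence, then mixes columns and diagonals with g1/g2.)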
153 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 154 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 155 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 156 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 157 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 158 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 159 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 160 | diagonalize(&rows[0], &rows[2], &rows[3]); 161 | t2 = _mm_unpacklo_epi64(m3, m1); 162 | tt = _mm_blend_epi16(t2, m2, 0xC0); 163 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 164 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 165 | t3 = _mm_unpackhi_epi32(m1, m3); 166 | tt = _mm_unpacklo_epi32(m2, t3); 167 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 168 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 169 | undiagonalize(&rows[0], &rows[2], &rows[3]); 170 | m0 = t0; 171 | m1 = t1; 172 | m2 = t2; 173 | m3 = t3; 174 | 175 | // Round 3 176 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 177 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 178 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 179 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 180 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 181 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 182 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 183 | diagonalize(&rows[0], &rows[2], &rows[3]); 184 | t2 = _mm_unpacklo_epi64(m3, m1); 185 | tt = _mm_blend_epi16(t2, m2, 0xC0); 186 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 187 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 188 | t3 = _mm_unpackhi_epi32(m1, m3); 189 | tt = _mm_unpacklo_epi32(m2, t3); 190 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 191 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 192 | undiagonalize(&rows[0], &rows[2], &rows[3]); 193 | m0 = t0; 194 | m1 = t1; 195 | m2 = t2; 196 | m3 = t3; 197 | 198 | // Round 4 199 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 200 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 201 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 202 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 203 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 204 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 205 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 206 | diagonalize(&rows[0], &rows[2], &rows[3]); 207 | t2 = _mm_unpacklo_epi64(m3, m1); 208 | tt = _mm_blend_epi16(t2, m2, 0xC0); 209 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 210 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 211 | t3 = _mm_unpackhi_epi32(m1, m3); 212 | tt = _mm_unpacklo_epi32(m2, t3); 213 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 214 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 215 | undiagonalize(&rows[0], &rows[2], &rows[3]); 216 | m0 = t0; 217 | m1 = t1; 218 | m2 = t2; 219 | m3 = t3; 220 | 221 | // Round 5 222 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 223 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 224 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 225 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 226 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 227 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 228 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 229 | diagonalize(&rows[0], &rows[2], &rows[3]); 230 | t2 = _mm_unpacklo_epi64(m3, m1); 231 | tt = _mm_blend_epi16(t2, m2, 0xC0); 232 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 233 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 234 | t3 = _mm_unpackhi_epi32(m1, m3); 235 | tt 
= _mm_unpacklo_epi32(m2, t3); 236 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 237 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 238 | undiagonalize(&rows[0], &rows[2], &rows[3]); 239 | m0 = t0; 240 | m1 = t1; 241 | m2 = t2; 242 | m3 = t3; 243 | 244 | // Round 6 245 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 246 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 247 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 248 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 249 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 250 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 251 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 252 | diagonalize(&rows[0], &rows[2], &rows[3]); 253 | t2 = _mm_unpacklo_epi64(m3, m1); 254 | tt = _mm_blend_epi16(t2, m2, 0xC0); 255 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 256 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 257 | t3 = _mm_unpackhi_epi32(m1, m3); 258 | tt = _mm_unpacklo_epi32(m2, t3); 259 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 260 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 261 | undiagonalize(&rows[0], &rows[2], &rows[3]); 262 | m0 = t0; 263 | m1 = t1; 264 | m2 = t2; 265 | m3 = t3; 266 | 267 | // Round 7 268 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 269 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 270 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 271 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 272 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 273 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 274 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 275 | diagonalize(&rows[0], &rows[2], &rows[3]); 276 | t2 = _mm_unpacklo_epi64(m3, m1); 277 | tt = _mm_blend_epi16(t2, m2, 0xC0); 278 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 279 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 280 | t3 = _mm_unpackhi_epi32(m1, m3); 281 | tt = _mm_unpacklo_epi32(m2, t3); 282 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 283 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 284 | undiagonalize(&rows[0], &rows[2], &rows[3]); 285 | } 286 | 287 | void blake3_compress_xof_avx512(const uint32_t cv[8], 288 | const uint8_t block[BLAKE3_BLOCK_LEN], 289 | uint8_t block_len, uint64_t counter, 290 | uint8_t flags, uint8_t out[64]) { 291 | __m128i rows[4]; 292 | compress_pre(rows, cv, block, block_len, counter, flags); 293 | storeu_128(xor_128(rows[0], rows[2]), &out[0]); 294 | storeu_128(xor_128(rows[1], rows[3]), &out[16]); 295 | storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); 296 | storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); 297 | } 298 | 299 | void blake3_compress_in_place_avx512(uint32_t cv[8], 300 | const uint8_t block[BLAKE3_BLOCK_LEN], 301 | uint8_t block_len, uint64_t counter, 302 | uint8_t flags) { 303 | __m128i rows[4]; 304 | compress_pre(rows, cv, block, block_len, counter, flags); 305 | storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); 306 | storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); 307 | } 308 | 309 | /* 310 | * ---------------------------------------------------------------------------- 311 | * hash4_avx512 312 | * ---------------------------------------------------------------------------- 313 | */ 314 | 315 | INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) { 316 | v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); 317 | v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); 318 | v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); 319 | v[3] = 
add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); 320 | v[0] = add_128(v[0], v[4]); 321 | v[1] = add_128(v[1], v[5]); 322 | v[2] = add_128(v[2], v[6]); 323 | v[3] = add_128(v[3], v[7]); 324 | v[12] = xor_128(v[12], v[0]); 325 | v[13] = xor_128(v[13], v[1]); 326 | v[14] = xor_128(v[14], v[2]); 327 | v[15] = xor_128(v[15], v[3]); 328 | v[12] = rot16_128(v[12]); 329 | v[13] = rot16_128(v[13]); 330 | v[14] = rot16_128(v[14]); 331 | v[15] = rot16_128(v[15]); 332 | v[8] = add_128(v[8], v[12]); 333 | v[9] = add_128(v[9], v[13]); 334 | v[10] = add_128(v[10], v[14]); 335 | v[11] = add_128(v[11], v[15]); 336 | v[4] = xor_128(v[4], v[8]); 337 | v[5] = xor_128(v[5], v[9]); 338 | v[6] = xor_128(v[6], v[10]); 339 | v[7] = xor_128(v[7], v[11]); 340 | v[4] = rot12_128(v[4]); 341 | v[5] = rot12_128(v[5]); 342 | v[6] = rot12_128(v[6]); 343 | v[7] = rot12_128(v[7]); 344 | v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); 345 | v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); 346 | v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); 347 | v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); 348 | v[0] = add_128(v[0], v[4]); 349 | v[1] = add_128(v[1], v[5]); 350 | v[2] = add_128(v[2], v[6]); 351 | v[3] = add_128(v[3], v[7]); 352 | v[12] = xor_128(v[12], v[0]); 353 | v[13] = xor_128(v[13], v[1]); 354 | v[14] = xor_128(v[14], v[2]); 355 | v[15] = xor_128(v[15], v[3]); 356 | v[12] = rot8_128(v[12]); 357 | v[13] = rot8_128(v[13]); 358 | v[14] = rot8_128(v[14]); 359 | v[15] = rot8_128(v[15]); 360 | v[8] = add_128(v[8], v[12]); 361 | v[9] = add_128(v[9], v[13]); 362 | v[10] = add_128(v[10], v[14]); 363 | v[11] = add_128(v[11], v[15]); 364 | v[4] = xor_128(v[4], v[8]); 365 | v[5] = xor_128(v[5], v[9]); 366 | v[6] = xor_128(v[6], v[10]); 367 | v[7] = xor_128(v[7], v[11]); 368 | v[4] = rot7_128(v[4]); 369 | v[5] = rot7_128(v[5]); 370 | v[6] = rot7_128(v[6]); 371 | v[7] = rot7_128(v[7]); 372 | 373 | v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); 374 | v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); 375 | v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); 376 | v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); 377 | v[0] = add_128(v[0], v[5]); 378 | v[1] = add_128(v[1], v[6]); 379 | v[2] = add_128(v[2], v[7]); 380 | v[3] = add_128(v[3], v[4]); 381 | v[15] = xor_128(v[15], v[0]); 382 | v[12] = xor_128(v[12], v[1]); 383 | v[13] = xor_128(v[13], v[2]); 384 | v[14] = xor_128(v[14], v[3]); 385 | v[15] = rot16_128(v[15]); 386 | v[12] = rot16_128(v[12]); 387 | v[13] = rot16_128(v[13]); 388 | v[14] = rot16_128(v[14]); 389 | v[10] = add_128(v[10], v[15]); 390 | v[11] = add_128(v[11], v[12]); 391 | v[8] = add_128(v[8], v[13]); 392 | v[9] = add_128(v[9], v[14]); 393 | v[5] = xor_128(v[5], v[10]); 394 | v[6] = xor_128(v[6], v[11]); 395 | v[7] = xor_128(v[7], v[8]); 396 | v[4] = xor_128(v[4], v[9]); 397 | v[5] = rot12_128(v[5]); 398 | v[6] = rot12_128(v[6]); 399 | v[7] = rot12_128(v[7]); 400 | v[4] = rot12_128(v[4]); 401 | v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); 402 | v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); 403 | v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); 404 | v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); 405 | v[0] = add_128(v[0], v[5]); 406 | v[1] = add_128(v[1], v[6]); 407 | v[2] = add_128(v[2], v[7]); 408 | v[3] = add_128(v[3], v[4]); 409 | v[15] = xor_128(v[15], v[0]); 410 | v[12] = xor_128(v[12], v[1]); 411 | v[13] = xor_128(v[13], v[2]); 412 | v[14] = xor_128(v[14], v[3]); 413 | v[15] = rot8_128(v[15]); 414 | v[12] = rot8_128(v[12]); 415 | v[13] = 
rot8_128(v[13]); 416 | v[14] = rot8_128(v[14]); 417 | v[10] = add_128(v[10], v[15]); 418 | v[11] = add_128(v[11], v[12]); 419 | v[8] = add_128(v[8], v[13]); 420 | v[9] = add_128(v[9], v[14]); 421 | v[5] = xor_128(v[5], v[10]); 422 | v[6] = xor_128(v[6], v[11]); 423 | v[7] = xor_128(v[7], v[8]); 424 | v[4] = xor_128(v[4], v[9]); 425 | v[5] = rot7_128(v[5]); 426 | v[6] = rot7_128(v[6]); 427 | v[7] = rot7_128(v[7]); 428 | v[4] = rot7_128(v[4]); 429 | } 430 | 431 | INLINE void transpose_vecs_128(__m128i vecs[4]) { 432 | // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is 433 | // 22/33. Note that this doesn't split the vector into two lanes, as the 434 | // AVX2 counterparts do. 435 | __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); 436 | __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); 437 | __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); 438 | __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); 439 | 440 | // Interleave 64-bit lanes. 441 | __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); 442 | __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); 443 | __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); 444 | __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); 445 | 446 | vecs[0] = abcd_0; 447 | vecs[1] = abcd_1; 448 | vecs[2] = abcd_2; 449 | vecs[3] = abcd_3; 450 | } 451 | 452 | INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, 453 | size_t block_offset, __m128i out[16]) { 454 | out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]); 455 | out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]); 456 | out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]); 457 | out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]); 458 | out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]); 459 | out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]); 460 | out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]); 461 | out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]); 462 | out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]); 463 | out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]); 464 | out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]); 465 | out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]); 466 | out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]); 467 | out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]); 468 | out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]); 469 | out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]); 470 | for (size_t i = 0; i < 4; ++i) { 471 | _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); 472 | } 473 | transpose_vecs_128(&out[0]); 474 | transpose_vecs_128(&out[4]); 475 | transpose_vecs_128(&out[8]); 476 | transpose_vecs_128(&out[12]); 477 | } 478 | 479 | INLINE void load_counters4(uint64_t counter, bool increment_counter, 480 | __m128i *out_lo, __m128i *out_hi) { 481 | uint64_t mask = (increment_counter ? 
~0 : 0); 482 | __m256i mask_vec = _mm256_set1_epi64x(mask); 483 | __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3); 484 | deltas = _mm256_and_si256(mask_vec, deltas); 485 | __m256i counters = 486 | _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas); 487 | *out_lo = _mm256_cvtepi64_epi32(counters); 488 | *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32)); 489 | } 490 | 491 | void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, 492 | const uint32_t key[8], uint64_t counter, 493 | bool increment_counter, uint8_t flags, 494 | uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 495 | __m128i h_vecs[8] = { 496 | set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), 497 | set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), 498 | }; 499 | __m128i counter_low_vec, counter_high_vec; 500 | load_counters4(counter, increment_counter, &counter_low_vec, 501 | &counter_high_vec); 502 | uint8_t block_flags = flags | flags_start; 503 | 504 | for (size_t block = 0; block < blocks; block++) { 505 | if (block + 1 == blocks) { 506 | block_flags |= flags_end; 507 | } 508 | __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN); 509 | __m128i block_flags_vec = set1_128(block_flags); 510 | __m128i msg_vecs[16]; 511 | transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); 512 | 513 | __m128i v[16] = { 514 | h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], 515 | h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], 516 | set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), 517 | counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, 518 | }; 519 | round_fn4(v, msg_vecs, 0); 520 | round_fn4(v, msg_vecs, 1); 521 | round_fn4(v, msg_vecs, 2); 522 | round_fn4(v, msg_vecs, 3); 523 | round_fn4(v, msg_vecs, 4); 524 | round_fn4(v, msg_vecs, 5); 525 | round_fn4(v, msg_vecs, 6); 526 | h_vecs[0] = xor_128(v[0], v[8]); 527 | h_vecs[1] = xor_128(v[1], v[9]); 528 | h_vecs[2] = xor_128(v[2], v[10]); 529 | h_vecs[3] = xor_128(v[3], v[11]); 530 | h_vecs[4] = xor_128(v[4], v[12]); 531 | h_vecs[5] = xor_128(v[5], v[13]); 532 | h_vecs[6] = xor_128(v[6], v[14]); 533 | h_vecs[7] = xor_128(v[7], v[15]); 534 | 535 | block_flags = flags; 536 | } 537 | 538 | transpose_vecs_128(&h_vecs[0]); 539 | transpose_vecs_128(&h_vecs[4]); 540 | // The first four vecs now contain the first half of each output, and the 541 | // second four vecs contain the second half of each output. 
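// (After the two 4x4 transposes, h_vecs[i] holds words 0..3 of output i and
// h_vecs[i + 4] holds words 4..7 of output i, so the interleaved stores below
// write each 32-byte chaining value as one contiguous run of out.)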
542 | storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]); 543 | storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]); 544 | storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]); 545 | storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]); 546 | storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]); 547 | storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]); 548 | storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]); 549 | storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]); 550 | } 551 | 552 | /* 553 | * ---------------------------------------------------------------------------- 554 | * hash8_avx512 555 | * ---------------------------------------------------------------------------- 556 | */ 557 | 558 | INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) { 559 | v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); 560 | v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); 561 | v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); 562 | v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); 563 | v[0] = add_256(v[0], v[4]); 564 | v[1] = add_256(v[1], v[5]); 565 | v[2] = add_256(v[2], v[6]); 566 | v[3] = add_256(v[3], v[7]); 567 | v[12] = xor_256(v[12], v[0]); 568 | v[13] = xor_256(v[13], v[1]); 569 | v[14] = xor_256(v[14], v[2]); 570 | v[15] = xor_256(v[15], v[3]); 571 | v[12] = rot16_256(v[12]); 572 | v[13] = rot16_256(v[13]); 573 | v[14] = rot16_256(v[14]); 574 | v[15] = rot16_256(v[15]); 575 | v[8] = add_256(v[8], v[12]); 576 | v[9] = add_256(v[9], v[13]); 577 | v[10] = add_256(v[10], v[14]); 578 | v[11] = add_256(v[11], v[15]); 579 | v[4] = xor_256(v[4], v[8]); 580 | v[5] = xor_256(v[5], v[9]); 581 | v[6] = xor_256(v[6], v[10]); 582 | v[7] = xor_256(v[7], v[11]); 583 | v[4] = rot12_256(v[4]); 584 | v[5] = rot12_256(v[5]); 585 | v[6] = rot12_256(v[6]); 586 | v[7] = rot12_256(v[7]); 587 | v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); 588 | v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); 589 | v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); 590 | v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); 591 | v[0] = add_256(v[0], v[4]); 592 | v[1] = add_256(v[1], v[5]); 593 | v[2] = add_256(v[2], v[6]); 594 | v[3] = add_256(v[3], v[7]); 595 | v[12] = xor_256(v[12], v[0]); 596 | v[13] = xor_256(v[13], v[1]); 597 | v[14] = xor_256(v[14], v[2]); 598 | v[15] = xor_256(v[15], v[3]); 599 | v[12] = rot8_256(v[12]); 600 | v[13] = rot8_256(v[13]); 601 | v[14] = rot8_256(v[14]); 602 | v[15] = rot8_256(v[15]); 603 | v[8] = add_256(v[8], v[12]); 604 | v[9] = add_256(v[9], v[13]); 605 | v[10] = add_256(v[10], v[14]); 606 | v[11] = add_256(v[11], v[15]); 607 | v[4] = xor_256(v[4], v[8]); 608 | v[5] = xor_256(v[5], v[9]); 609 | v[6] = xor_256(v[6], v[10]); 610 | v[7] = xor_256(v[7], v[11]); 611 | v[4] = rot7_256(v[4]); 612 | v[5] = rot7_256(v[5]); 613 | v[6] = rot7_256(v[6]); 614 | v[7] = rot7_256(v[7]); 615 | 616 | v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); 617 | v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); 618 | v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); 619 | v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); 620 | v[0] = add_256(v[0], v[5]); 621 | v[1] = add_256(v[1], v[6]); 622 | v[2] = add_256(v[2], v[7]); 623 | v[3] = add_256(v[3], v[4]); 624 | v[15] = xor_256(v[15], v[0]); 625 | v[12] = xor_256(v[12], v[1]); 626 | v[13] = xor_256(v[13], v[2]); 627 | v[14] = xor_256(v[14], v[3]); 628 | v[15] = rot16_256(v[15]); 629 | v[12] = rot16_256(v[12]); 630 | v[13] = rot16_256(v[13]); 631 | v[14] = rot16_256(v[14]); 632 | v[10] = add_256(v[10], v[15]); 633 | v[11] = add_256(v[11], 
v[12]); 634 | v[8] = add_256(v[8], v[13]); 635 | v[9] = add_256(v[9], v[14]); 636 | v[5] = xor_256(v[5], v[10]); 637 | v[6] = xor_256(v[6], v[11]); 638 | v[7] = xor_256(v[7], v[8]); 639 | v[4] = xor_256(v[4], v[9]); 640 | v[5] = rot12_256(v[5]); 641 | v[6] = rot12_256(v[6]); 642 | v[7] = rot12_256(v[7]); 643 | v[4] = rot12_256(v[4]); 644 | v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); 645 | v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); 646 | v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); 647 | v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); 648 | v[0] = add_256(v[0], v[5]); 649 | v[1] = add_256(v[1], v[6]); 650 | v[2] = add_256(v[2], v[7]); 651 | v[3] = add_256(v[3], v[4]); 652 | v[15] = xor_256(v[15], v[0]); 653 | v[12] = xor_256(v[12], v[1]); 654 | v[13] = xor_256(v[13], v[2]); 655 | v[14] = xor_256(v[14], v[3]); 656 | v[15] = rot8_256(v[15]); 657 | v[12] = rot8_256(v[12]); 658 | v[13] = rot8_256(v[13]); 659 | v[14] = rot8_256(v[14]); 660 | v[10] = add_256(v[10], v[15]); 661 | v[11] = add_256(v[11], v[12]); 662 | v[8] = add_256(v[8], v[13]); 663 | v[9] = add_256(v[9], v[14]); 664 | v[5] = xor_256(v[5], v[10]); 665 | v[6] = xor_256(v[6], v[11]); 666 | v[7] = xor_256(v[7], v[8]); 667 | v[4] = xor_256(v[4], v[9]); 668 | v[5] = rot7_256(v[5]); 669 | v[6] = rot7_256(v[6]); 670 | v[7] = rot7_256(v[7]); 671 | v[4] = rot7_256(v[4]); 672 | } 673 | 674 | INLINE void transpose_vecs_256(__m256i vecs[8]) { 675 | // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high 676 | // is 22/33/66/77. 677 | __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); 678 | __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); 679 | __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); 680 | __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); 681 | __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); 682 | __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); 683 | __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); 684 | __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); 685 | 686 | // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is 687 | // 11/33. 688 | __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); 689 | __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); 690 | __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); 691 | __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); 692 | __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); 693 | __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); 694 | __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); 695 | __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); 696 | 697 | // Interleave 128-bit lanes. 
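// (_mm256_permute2x128_si256 with immediate 0x20 puts a's low 128-bit lane in
// the low half of the result and b's low lane in the high half; 0x31 does the
// same with the two high lanes, pairing the abcd_* and efgh_* halves below.)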
698 | vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); 699 | vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); 700 | vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); 701 | vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); 702 | vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); 703 | vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); 704 | vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); 705 | vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); 706 | } 707 | 708 | INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, 709 | size_t block_offset, __m256i out[16]) { 710 | out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]); 711 | out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]); 712 | out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]); 713 | out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]); 714 | out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]); 715 | out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]); 716 | out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]); 717 | out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]); 718 | out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]); 719 | out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]); 720 | out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]); 721 | out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]); 722 | out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]); 723 | out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]); 724 | out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]); 725 | out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]); 726 | for (size_t i = 0; i < 8; ++i) { 727 | _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); 728 | } 729 | transpose_vecs_256(&out[0]); 730 | transpose_vecs_256(&out[8]); 731 | } 732 | 733 | INLINE void load_counters8(uint64_t counter, bool increment_counter, 734 | __m256i *out_lo, __m256i *out_hi) { 735 | uint64_t mask = (increment_counter ? 
~0 : 0); 736 | __m512i mask_vec = _mm512_set1_epi64(mask); 737 | __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); 738 | deltas = _mm512_and_si512(mask_vec, deltas); 739 | __m512i counters = 740 | _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas); 741 | *out_lo = _mm512_cvtepi64_epi32(counters); 742 | *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32)); 743 | } 744 | 745 | void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, 746 | const uint32_t key[8], uint64_t counter, 747 | bool increment_counter, uint8_t flags, 748 | uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 749 | __m256i h_vecs[8] = { 750 | set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), 751 | set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), 752 | }; 753 | __m256i counter_low_vec, counter_high_vec; 754 | load_counters8(counter, increment_counter, &counter_low_vec, 755 | &counter_high_vec); 756 | uint8_t block_flags = flags | flags_start; 757 | 758 | for (size_t block = 0; block < blocks; block++) { 759 | if (block + 1 == blocks) { 760 | block_flags |= flags_end; 761 | } 762 | __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN); 763 | __m256i block_flags_vec = set1_256(block_flags); 764 | __m256i msg_vecs[16]; 765 | transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); 766 | 767 | __m256i v[16] = { 768 | h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], 769 | h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], 770 | set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), 771 | counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, 772 | }; 773 | round_fn8(v, msg_vecs, 0); 774 | round_fn8(v, msg_vecs, 1); 775 | round_fn8(v, msg_vecs, 2); 776 | round_fn8(v, msg_vecs, 3); 777 | round_fn8(v, msg_vecs, 4); 778 | round_fn8(v, msg_vecs, 5); 779 | round_fn8(v, msg_vecs, 6); 780 | h_vecs[0] = xor_256(v[0], v[8]); 781 | h_vecs[1] = xor_256(v[1], v[9]); 782 | h_vecs[2] = xor_256(v[2], v[10]); 783 | h_vecs[3] = xor_256(v[3], v[11]); 784 | h_vecs[4] = xor_256(v[4], v[12]); 785 | h_vecs[5] = xor_256(v[5], v[13]); 786 | h_vecs[6] = xor_256(v[6], v[14]); 787 | h_vecs[7] = xor_256(v[7], v[15]); 788 | 789 | block_flags = flags; 790 | } 791 | 792 | transpose_vecs_256(h_vecs); 793 | storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]); 794 | storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]); 795 | storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]); 796 | storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]); 797 | storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]); 798 | storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]); 799 | storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]); 800 | storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]); 801 | } 802 | 803 | /* 804 | * ---------------------------------------------------------------------------- 805 | * hash16_avx512 806 | * ---------------------------------------------------------------------------- 807 | */ 808 | 809 | INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) { 810 | v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); 811 | v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); 812 | v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); 813 | v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); 814 | v[0] = add_512(v[0], v[4]); 815 | v[1] = add_512(v[1], v[5]); 816 | v[2] = add_512(v[2], v[6]); 817 | v[3] = add_512(v[3], v[7]); 818 | v[12] = xor_512(v[12], v[0]); 819 | v[13] = xor_512(v[13], v[1]); 820 | v[14] = xor_512(v[14], v[2]); 821 | v[15] = xor_512(v[15], v[3]); 822 | 
v[12] = rot16_512(v[12]); 823 | v[13] = rot16_512(v[13]); 824 | v[14] = rot16_512(v[14]); 825 | v[15] = rot16_512(v[15]); 826 | v[8] = add_512(v[8], v[12]); 827 | v[9] = add_512(v[9], v[13]); 828 | v[10] = add_512(v[10], v[14]); 829 | v[11] = add_512(v[11], v[15]); 830 | v[4] = xor_512(v[4], v[8]); 831 | v[5] = xor_512(v[5], v[9]); 832 | v[6] = xor_512(v[6], v[10]); 833 | v[7] = xor_512(v[7], v[11]); 834 | v[4] = rot12_512(v[4]); 835 | v[5] = rot12_512(v[5]); 836 | v[6] = rot12_512(v[6]); 837 | v[7] = rot12_512(v[7]); 838 | v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); 839 | v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); 840 | v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); 841 | v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); 842 | v[0] = add_512(v[0], v[4]); 843 | v[1] = add_512(v[1], v[5]); 844 | v[2] = add_512(v[2], v[6]); 845 | v[3] = add_512(v[3], v[7]); 846 | v[12] = xor_512(v[12], v[0]); 847 | v[13] = xor_512(v[13], v[1]); 848 | v[14] = xor_512(v[14], v[2]); 849 | v[15] = xor_512(v[15], v[3]); 850 | v[12] = rot8_512(v[12]); 851 | v[13] = rot8_512(v[13]); 852 | v[14] = rot8_512(v[14]); 853 | v[15] = rot8_512(v[15]); 854 | v[8] = add_512(v[8], v[12]); 855 | v[9] = add_512(v[9], v[13]); 856 | v[10] = add_512(v[10], v[14]); 857 | v[11] = add_512(v[11], v[15]); 858 | v[4] = xor_512(v[4], v[8]); 859 | v[5] = xor_512(v[5], v[9]); 860 | v[6] = xor_512(v[6], v[10]); 861 | v[7] = xor_512(v[7], v[11]); 862 | v[4] = rot7_512(v[4]); 863 | v[5] = rot7_512(v[5]); 864 | v[6] = rot7_512(v[6]); 865 | v[7] = rot7_512(v[7]); 866 | 867 | v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); 868 | v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); 869 | v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); 870 | v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); 871 | v[0] = add_512(v[0], v[5]); 872 | v[1] = add_512(v[1], v[6]); 873 | v[2] = add_512(v[2], v[7]); 874 | v[3] = add_512(v[3], v[4]); 875 | v[15] = xor_512(v[15], v[0]); 876 | v[12] = xor_512(v[12], v[1]); 877 | v[13] = xor_512(v[13], v[2]); 878 | v[14] = xor_512(v[14], v[3]); 879 | v[15] = rot16_512(v[15]); 880 | v[12] = rot16_512(v[12]); 881 | v[13] = rot16_512(v[13]); 882 | v[14] = rot16_512(v[14]); 883 | v[10] = add_512(v[10], v[15]); 884 | v[11] = add_512(v[11], v[12]); 885 | v[8] = add_512(v[8], v[13]); 886 | v[9] = add_512(v[9], v[14]); 887 | v[5] = xor_512(v[5], v[10]); 888 | v[6] = xor_512(v[6], v[11]); 889 | v[7] = xor_512(v[7], v[8]); 890 | v[4] = xor_512(v[4], v[9]); 891 | v[5] = rot12_512(v[5]); 892 | v[6] = rot12_512(v[6]); 893 | v[7] = rot12_512(v[7]); 894 | v[4] = rot12_512(v[4]); 895 | v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); 896 | v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); 897 | v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); 898 | v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); 899 | v[0] = add_512(v[0], v[5]); 900 | v[1] = add_512(v[1], v[6]); 901 | v[2] = add_512(v[2], v[7]); 902 | v[3] = add_512(v[3], v[4]); 903 | v[15] = xor_512(v[15], v[0]); 904 | v[12] = xor_512(v[12], v[1]); 905 | v[13] = xor_512(v[13], v[2]); 906 | v[14] = xor_512(v[14], v[3]); 907 | v[15] = rot8_512(v[15]); 908 | v[12] = rot8_512(v[12]); 909 | v[13] = rot8_512(v[13]); 910 | v[14] = rot8_512(v[14]); 911 | v[10] = add_512(v[10], v[15]); 912 | v[11] = add_512(v[11], v[12]); 913 | v[8] = add_512(v[8], v[13]); 914 | v[9] = add_512(v[9], v[14]); 915 | v[5] = xor_512(v[5], v[10]); 916 | v[6] = xor_512(v[6], v[11]); 917 | v[7] = xor_512(v[7], v[8]); 918 | v[4] = xor_512(v[4], v[9]); 919 | v[5] = 
rot7_512(v[5]); 920 | v[6] = rot7_512(v[6]); 921 | v[7] = rot7_512(v[7]); 922 | v[4] = rot7_512(v[4]); 923 | } 924 | 925 | // 0b10001000, or lanes a0/a2/b0/b2 in little-endian order 926 | #define LO_IMM8 0x88 927 | 928 | INLINE __m512i unpack_lo_128(__m512i a, __m512i b) { 929 | return _mm512_shuffle_i32x4(a, b, LO_IMM8); 930 | } 931 | 932 | // 0b11011101, or lanes a1/a3/b1/b3 in little-endian order 933 | #define HI_IMM8 0xdd 934 | 935 | INLINE __m512i unpack_hi_128(__m512i a, __m512i b) { 936 | return _mm512_shuffle_i32x4(a, b, HI_IMM8); 937 | } 938 | 939 | INLINE void transpose_vecs_512(__m512i vecs[16]) { 940 | // Interleave 32-bit lanes. The _0 unpack is lanes 941 | // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes 942 | // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15. 943 | __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]); 944 | __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]); 945 | __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]); 946 | __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]); 947 | __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]); 948 | __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]); 949 | __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]); 950 | __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]); 951 | __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]); 952 | __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]); 953 | __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]); 954 | __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]); 955 | __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]); 956 | __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]); 957 | __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]); 958 | __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]); 959 | 960 | // Interleave 64-bit lates. The _0 unpack is lanes 961 | // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes 962 | // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes 963 | // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes 964 | // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15. 965 | __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0); 966 | __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0); 967 | __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2); 968 | __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2); 969 | __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0); 970 | __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0); 971 | __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2); 972 | __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2); 973 | __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0); 974 | __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0); 975 | __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2); 976 | __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2); 977 | __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0); 978 | __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0); 979 | __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2); 980 | __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2); 981 | 982 | // Interleave 128-bit lanes. The _0 unpack is 983 | // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is 984 | // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on. 
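// (Taken together, the three interleave stages transpose the 16x16 matrix of
// 32-bit words: in transpose_msg_vecs16 below, sixteen 64-byte blocks loaded
// row-wise become sixteen vectors that each hold one message word from every
// input, one input per 32-bit lane.)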
985 | __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0); 986 | __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1); 987 | __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2); 988 | __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3); 989 | __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0); 990 | __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1); 991 | __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2); 992 | __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3); 993 | __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0); 994 | __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1); 995 | __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2); 996 | __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3); 997 | __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0); 998 | __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1); 999 | __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2); 1000 | __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3); 1001 | 1002 | // Interleave 128-bit lanes again for the final outputs. 1003 | vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0); 1004 | vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1); 1005 | vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2); 1006 | vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3); 1007 | vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4); 1008 | vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5); 1009 | vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6); 1010 | vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7); 1011 | vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0); 1012 | vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1); 1013 | vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2); 1014 | vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3); 1015 | vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4); 1016 | vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5); 1017 | vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6); 1018 | vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7); 1019 | } 1020 | 1021 | INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, 1022 | size_t block_offset, __m512i out[16]) { 1023 | out[0] = loadu_512(&inputs[0][block_offset]); 1024 | out[1] = loadu_512(&inputs[1][block_offset]); 1025 | out[2] = loadu_512(&inputs[2][block_offset]); 1026 | out[3] = loadu_512(&inputs[3][block_offset]); 1027 | out[4] = loadu_512(&inputs[4][block_offset]); 1028 | out[5] = loadu_512(&inputs[5][block_offset]); 1029 | out[6] = loadu_512(&inputs[6][block_offset]); 1030 | out[7] = loadu_512(&inputs[7][block_offset]); 1031 | out[8] = loadu_512(&inputs[8][block_offset]); 1032 | out[9] = loadu_512(&inputs[9][block_offset]); 1033 | out[10] = loadu_512(&inputs[10][block_offset]); 1034 | out[11] = loadu_512(&inputs[11][block_offset]); 1035 | out[12] = loadu_512(&inputs[12][block_offset]); 1036 | out[13] = loadu_512(&inputs[13][block_offset]); 1037 | out[14] = loadu_512(&inputs[14][block_offset]); 1038 | out[15] = loadu_512(&inputs[15][block_offset]); 1039 | for (size_t i = 0; i < 16; ++i) { 1040 | _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); 1041 | } 1042 | transpose_vecs_512(out); 1043 | } 1044 | 1045 | INLINE void load_counters16(uint64_t counter, bool increment_counter, 1046 | __m512i *out_lo, __m512i *out_hi) { 1047 | const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter); 1048 | const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 1049 | const __m512i add1 = _mm512_and_si512(mask, add0); 1050 | __m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1); 1051 | __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, 
_MM_CMPINT_LT); 1052 | __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1)); 1053 | *out_lo = l; 1054 | *out_hi = h; 1055 | } 1056 | 1057 | void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, 1058 | const uint32_t key[8], uint64_t counter, 1059 | bool increment_counter, uint8_t flags, 1060 | uint8_t flags_start, uint8_t flags_end, 1061 | uint8_t *out) { 1062 | __m512i h_vecs[8] = { 1063 | set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), 1064 | set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]), 1065 | }; 1066 | __m512i counter_low_vec, counter_high_vec; 1067 | load_counters16(counter, increment_counter, &counter_low_vec, 1068 | &counter_high_vec); 1069 | uint8_t block_flags = flags | flags_start; 1070 | 1071 | for (size_t block = 0; block < blocks; block++) { 1072 | if (block + 1 == blocks) { 1073 | block_flags |= flags_end; 1074 | } 1075 | __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN); 1076 | __m512i block_flags_vec = set1_512(block_flags); 1077 | __m512i msg_vecs[16]; 1078 | transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); 1079 | 1080 | __m512i v[16] = { 1081 | h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], 1082 | h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], 1083 | set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), 1084 | counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, 1085 | }; 1086 | round_fn16(v, msg_vecs, 0); 1087 | round_fn16(v, msg_vecs, 1); 1088 | round_fn16(v, msg_vecs, 2); 1089 | round_fn16(v, msg_vecs, 3); 1090 | round_fn16(v, msg_vecs, 4); 1091 | round_fn16(v, msg_vecs, 5); 1092 | round_fn16(v, msg_vecs, 6); 1093 | h_vecs[0] = xor_512(v[0], v[8]); 1094 | h_vecs[1] = xor_512(v[1], v[9]); 1095 | h_vecs[2] = xor_512(v[2], v[10]); 1096 | h_vecs[3] = xor_512(v[3], v[11]); 1097 | h_vecs[4] = xor_512(v[4], v[12]); 1098 | h_vecs[5] = xor_512(v[5], v[13]); 1099 | h_vecs[6] = xor_512(v[6], v[14]); 1100 | h_vecs[7] = xor_512(v[7], v[15]); 1101 | 1102 | block_flags = flags; 1103 | } 1104 | 1105 | // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8 1106 | // state vectors. Pad the matrix with zeros. After transposition, store the 1107 | // lower half of each vector. 
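// (Each chaining value is 8 words, i.e. 32 bytes or half a __m512i, so the
// masked 256-bit stores below keep only the low half of each transposed
// vector; the zero padding lands in the upper halves, which are discarded.)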
1108 | __m512i padded[16] = { 1109 | h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], 1110 | h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], 1111 | set1_512(0), set1_512(0), set1_512(0), set1_512(0), 1112 | set1_512(0), set1_512(0), set1_512(0), set1_512(0), 1113 | }; 1114 | transpose_vecs_512(padded); 1115 | _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0])); 1116 | _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1])); 1117 | _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2])); 1118 | _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3])); 1119 | _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4])); 1120 | _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5])); 1121 | _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6])); 1122 | _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7])); 1123 | _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8])); 1124 | _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9])); 1125 | _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10])); 1126 | _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11])); 1127 | _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12])); 1128 | _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13])); 1129 | _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14])); 1130 | _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15])); 1131 | } 1132 | 1133 | /* 1134 | * ---------------------------------------------------------------------------- 1135 | * hash_many_avx512 1136 | * ---------------------------------------------------------------------------- 1137 | */ 1138 | 1139 | INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, 1140 | const uint32_t key[8], uint64_t counter, 1141 | uint8_t flags, uint8_t flags_start, 1142 | uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { 1143 | uint32_t cv[8]; 1144 | memcpy(cv, key, BLAKE3_KEY_LEN); 1145 | uint8_t block_flags = flags | flags_start; 1146 | while (blocks > 0) { 1147 | if (blocks == 1) { 1148 | block_flags |= flags_end; 1149 | } 1150 | blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, 1151 | block_flags); 1152 | input = &input[BLAKE3_BLOCK_LEN]; 1153 | blocks -= 1; 1154 | block_flags = flags; 1155 | } 1156 | memcpy(out, cv, BLAKE3_OUT_LEN); 1157 | } 1158 | 1159 | void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, 1160 | size_t blocks, const uint32_t key[8], 1161 | uint64_t counter, bool increment_counter, 1162 | uint8_t flags, uint8_t flags_start, 1163 | uint8_t flags_end, uint8_t *out) { 1164 | while (num_inputs >= 16) { 1165 | blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, 1166 | flags_start, flags_end, out); 1167 | if (increment_counter) { 1168 | counter += 16; 1169 | } 1170 | inputs += 16; 1171 | num_inputs -= 16; 1172 | out = &out[16 * BLAKE3_OUT_LEN]; 1173 | } 1174 
| while (num_inputs >= 8) { 1175 | blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, 1176 | flags_start, flags_end, out); 1177 | if (increment_counter) { 1178 | counter += 8; 1179 | } 1180 | inputs += 8; 1181 | num_inputs -= 8; 1182 | out = &out[8 * BLAKE3_OUT_LEN]; 1183 | } 1184 | while (num_inputs >= 4) { 1185 | blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, 1186 | flags_start, flags_end, out); 1187 | if (increment_counter) { 1188 | counter += 4; 1189 | } 1190 | inputs += 4; 1191 | num_inputs -= 4; 1192 | out = &out[4 * BLAKE3_OUT_LEN]; 1193 | } 1194 | while (num_inputs > 0) { 1195 | hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, 1196 | flags_end, out); 1197 | if (increment_counter) { 1198 | counter += 1; 1199 | } 1200 | inputs += 1; 1201 | num_inputs -= 1; 1202 | out = &out[BLAKE3_OUT_LEN]; 1203 | } 1204 | } 1205 | -------------------------------------------------------------------------------- /blake3_dispatch.c: -------------------------------------------------------------------------------- 1 | #include <stdbool.h> 2 | #include <stddef.h> 3 | #include <stdint.h> 4 | 5 | #include "blake3_impl.h" 6 | 7 | #if defined(IS_X86) 8 | #if defined(_MSC_VER) 9 | #include <intrin.h> 10 | #elif defined(__GNUC__) 11 | #include <immintrin.h> 12 | #else 13 | #error "Unimplemented!" 14 | #endif 15 | #endif 16 | 17 | #define MAYBE_UNUSED(x) (void)((x)) 18 | 19 | #if defined(IS_X86) 20 | static uint64_t xgetbv() { 21 | #if defined(_MSC_VER) 22 | return _xgetbv(0); 23 | #else 24 | uint32_t eax = 0, edx = 0; 25 | __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); 26 | return ((uint64_t)edx << 32) | eax; 27 | #endif 28 | } 29 | 30 | static void cpuid(uint32_t out[4], uint32_t id) { 31 | #if defined(_MSC_VER) 32 | __cpuid((int *)out, id); 33 | #elif defined(__i386__) || defined(_M_IX86) 34 | __asm__ __volatile__("movl %%ebx, %1\n" 35 | "cpuid\n" 36 | "xchgl %1, %%ebx\n" 37 | : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) 38 | : "a"(id)); 39 | #else 40 | __asm__ __volatile__("cpuid\n" 41 | : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) 42 | : "a"(id)); 43 | #endif 44 | } 45 | 46 | static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { 47 | #if defined(_MSC_VER) 48 | __cpuidex((int *)out, id, sid); 49 | #elif defined(__i386__) || defined(_M_IX86) 50 | __asm__ __volatile__("movl %%ebx, %1\n" 51 | "cpuid\n" 52 | "xchgl %1, %%ebx\n" 53 | : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) 54 | : "a"(id), "c"(sid)); 55 | #else 56 | __asm__ __volatile__("cpuid\n" 57 | : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) 58 | : "a"(id), "c"(sid)); 59 | #endif 60 | } 61 | 62 | #endif 63 | 64 | enum cpu_feature { 65 | SSE2 = 1 << 0, 66 | SSSE3 = 1 << 1, 67 | SSE41 = 1 << 2, 68 | AVX = 1 << 3, 69 | AVX2 = 1 << 4, 70 | AVX512F = 1 << 5, 71 | AVX512VL = 1 << 6, 72 | /* ...
*/ 73 | UNDEFINED = 1 << 30 74 | }; 75 | 76 | #if !defined(BLAKE3_TESTING) 77 | static /* Allow the variable to be controlled manually for testing */ 78 | #endif 79 | enum cpu_feature g_cpu_features = UNDEFINED; 80 | 81 | #if !defined(BLAKE3_TESTING) 82 | static 83 | #endif 84 | enum cpu_feature 85 | get_cpu_features() { 86 | 87 | if (g_cpu_features != UNDEFINED) { 88 | return g_cpu_features; 89 | } else { 90 | #if defined(IS_X86) 91 | uint32_t regs[4] = {0}; 92 | uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; 93 | (void)edx; 94 | enum cpu_feature features = 0; 95 | cpuid(regs, 0); 96 | const int max_id = *eax; 97 | cpuid(regs, 1); 98 | #if defined(__amd64__) || defined(_M_X64) 99 | features |= SSE2; 100 | #else 101 | if (*edx & (1UL << 26)) 102 | features |= SSE2; 103 | #endif 104 | if (*ecx & (1UL << 0)) 105 | features |= SSSE3; 106 | if (*ecx & (1UL << 19)) 107 | features |= SSE41; 108 | 109 | if (*ecx & (1UL << 27)) { // OSXSAVE 110 | const uint64_t mask = xgetbv(); 111 | if ((mask & 6) == 6) { // SSE and AVX states 112 | if (*ecx & (1UL << 28)) 113 | features |= AVX; 114 | if (max_id >= 7) { 115 | cpuidex(regs, 7, 0); 116 | if (*ebx & (1UL << 5)) 117 | features |= AVX2; 118 | if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm 119 | if (*ebx & (1UL << 31)) 120 | features |= AVX512VL; 121 | if (*ebx & (1UL << 16)) 122 | features |= AVX512F; 123 | } 124 | } 125 | } 126 | } 127 | g_cpu_features = features; 128 | return features; 129 | #else 130 | /* How to detect NEON? */ 131 | return 0; 132 | #endif 133 | } 134 | } 135 | 136 | void blake3_compress_in_place(uint32_t cv[8], 137 | const uint8_t block[BLAKE3_BLOCK_LEN], 138 | uint8_t block_len, uint64_t counter, 139 | uint8_t flags) { 140 | #if defined(IS_X86) 141 | const enum cpu_feature features = get_cpu_features(); 142 | MAYBE_UNUSED(features); 143 | #if !defined(BLAKE3_NO_AVX512) 144 | if (features & AVX512VL) { 145 | blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); 146 | return; 147 | } 148 | #endif 149 | #if !defined(BLAKE3_NO_SSE41) 150 | if (features & SSE41) { 151 | blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); 152 | return; 153 | } 154 | #endif 155 | #if !defined(BLAKE3_NO_SSE2) 156 | if (features & SSE2) { 157 | blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); 158 | return; 159 | } 160 | #endif 161 | #endif 162 | blake3_compress_in_place_portable(cv, block, block_len, counter, flags); 163 | } 164 | 165 | void blake3_compress_xof(const uint32_t cv[8], 166 | const uint8_t block[BLAKE3_BLOCK_LEN], 167 | uint8_t block_len, uint64_t counter, uint8_t flags, 168 | uint8_t out[64]) { 169 | #if defined(IS_X86) 170 | const enum cpu_feature features = get_cpu_features(); 171 | MAYBE_UNUSED(features); 172 | #if !defined(BLAKE3_NO_AVX512) 173 | if (features & AVX512VL) { 174 | blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); 175 | return; 176 | } 177 | #endif 178 | #if !defined(BLAKE3_NO_SSE41) 179 | if (features & SSE41) { 180 | blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); 181 | return; 182 | } 183 | #endif 184 | #if !defined(BLAKE3_NO_SSE2) 185 | if (features & SSE2) { 186 | blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); 187 | return; 188 | } 189 | #endif 190 | #endif 191 | blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); 192 | } 193 | 194 | void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, 195 | size_t blocks, const uint32_t key[8], 
uint64_t counter, 196 | bool increment_counter, uint8_t flags, 197 | uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 198 | #if defined(IS_X86) 199 | const enum cpu_feature features = get_cpu_features(); 200 | MAYBE_UNUSED(features); 201 | #if !defined(BLAKE3_NO_AVX512) 202 | if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { 203 | blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, 204 | increment_counter, flags, flags_start, flags_end, 205 | out); 206 | return; 207 | } 208 | #endif 209 | #if !defined(BLAKE3_NO_AVX2) 210 | if (features & AVX2) { 211 | blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, 212 | increment_counter, flags, flags_start, flags_end, 213 | out); 214 | return; 215 | } 216 | #endif 217 | #if !defined(BLAKE3_NO_SSE41) 218 | if (features & SSE41) { 219 | blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, 220 | increment_counter, flags, flags_start, flags_end, 221 | out); 222 | return; 223 | } 224 | #endif 225 | #if !defined(BLAKE3_NO_SSE2) 226 | if (features & SSE2) { 227 | blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, 228 | increment_counter, flags, flags_start, flags_end, 229 | out); 230 | return; 231 | } 232 | #endif 233 | #endif 234 | 235 | #if defined(BLAKE3_USE_NEON) 236 | blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, 237 | increment_counter, flags, flags_start, flags_end, out); 238 | return; 239 | #endif 240 | 241 | blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, 242 | increment_counter, flags, flags_start, flags_end, 243 | out); 244 | } 245 | 246 | // The dynamically detected SIMD degree of the current platform. 247 | size_t blake3_simd_degree(void) { 248 | #if defined(IS_X86) 249 | const enum cpu_feature features = get_cpu_features(); 250 | MAYBE_UNUSED(features); 251 | #if !defined(BLAKE3_NO_AVX512) 252 | if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { 253 | return 16; 254 | } 255 | #endif 256 | #if !defined(BLAKE3_NO_AVX2) 257 | if (features & AVX2) { 258 | return 8; 259 | } 260 | #endif 261 | #if !defined(BLAKE3_NO_SSE41) 262 | if (features & SSE41) { 263 | return 4; 264 | } 265 | #endif 266 | #if !defined(BLAKE3_NO_SSE2) 267 | if (features & SSE2) { 268 | return 4; 269 | } 270 | #endif 271 | #endif 272 | #if defined(BLAKE3_USE_NEON) 273 | return 4; 274 | #endif 275 | return 1; 276 | } 277 | -------------------------------------------------------------------------------- /blake3_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef BLAKE3_IMPL_H 2 | #define BLAKE3_IMPL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "blake3.h" 11 | 12 | // internal flags 13 | enum blake3_flags { 14 | CHUNK_START = 1 << 0, 15 | CHUNK_END = 1 << 1, 16 | PARENT = 1 << 2, 17 | ROOT = 1 << 3, 18 | KEYED_HASH = 1 << 4, 19 | DERIVE_KEY_CONTEXT = 1 << 5, 20 | DERIVE_KEY_MATERIAL = 1 << 6, 21 | }; 22 | 23 | // This C implementation tries to support recent versions of GCC, Clang, and 24 | // MSVC. 
25 | #if defined(_MSC_VER) 26 | #define INLINE static __forceinline 27 | #else 28 | #define INLINE static inline __attribute__((always_inline)) 29 | #endif 30 | 31 | #if defined(__x86_64__) || defined(_M_X64) 32 | #define IS_X86 33 | #define IS_X86_64 34 | #endif 35 | 36 | #if defined(__i386__) || defined(_M_IX86) 37 | #define IS_X86 38 | #define IS_X86_32 39 | #endif 40 | 41 | #if defined(IS_X86) 42 | #if defined(_MSC_VER) 43 | #include 44 | #endif 45 | #include 46 | #endif 47 | 48 | #if defined(IS_X86) 49 | #define MAX_SIMD_DEGREE 16 50 | #elif defined(BLAKE3_USE_NEON) 51 | #define MAX_SIMD_DEGREE 4 52 | #else 53 | #define MAX_SIMD_DEGREE 1 54 | #endif 55 | 56 | // There are some places where we want a static size that's equal to the 57 | // MAX_SIMD_DEGREE, but also at least 2. 58 | #define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2) 59 | 60 | static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 61 | 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, 62 | 0x1F83D9ABUL, 0x5BE0CD19UL}; 63 | 64 | static const uint8_t MSG_SCHEDULE[7][16] = { 65 | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, 66 | {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, 67 | {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, 68 | {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, 69 | {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, 70 | {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, 71 | {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, 72 | }; 73 | 74 | /* Find index of the highest set bit */ 75 | /* x is assumed to be nonzero. */ 76 | static unsigned int highest_one(uint64_t x) { 77 | #if defined(__GNUC__) || defined(__clang__) 78 | return 63 ^ __builtin_clzll(x); 79 | #elif defined(_MSC_VER) && defined(IS_X86_64) 80 | unsigned long index; 81 | _BitScanReverse64(&index, x); 82 | return index; 83 | #elif defined(_MSC_VER) && defined(IS_X86_32) 84 | if(x >> 32) { 85 | unsigned long index; 86 | _BitScanReverse(&index, x >> 32); 87 | return 32 + index; 88 | } else { 89 | unsigned long index; 90 | _BitScanReverse(&index, x); 91 | return index; 92 | } 93 | #else 94 | unsigned int c = 0; 95 | if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } 96 | if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } 97 | if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } 98 | if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } 99 | if(x & 0x000000000000000cULL) { x >>= 2; c += 2; } 100 | if(x & 0x0000000000000002ULL) { c += 1; } 101 | return c; 102 | #endif 103 | } 104 | 105 | // Count the number of 1 bits. 106 | INLINE unsigned int popcnt(uint64_t x) { 107 | #if defined(__GNUC__) || defined(__clang__) 108 | return __builtin_popcountll(x); 109 | #else 110 | unsigned int count = 0; 111 | while (x != 0) { 112 | count += 1; 113 | x &= x - 1; 114 | } 115 | return count; 116 | #endif 117 | } 118 | 119 | // Largest power of two less than or equal to x. As a special case, returns 1 120 | // when x is 0. 
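// For example, round_down_to_power_of_2(1) == 1, round_down_to_power_of_2(7) == 4,
// and round_down_to_power_of_2(64) == 64; the "x | 1" below is what makes the
// zero input return 1 instead of calling highest_one(0).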
121 | INLINE uint64_t round_down_to_power_of_2(uint64_t x) { 122 | return 1ULL << highest_one(x | 1); 123 | } 124 | 125 | INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } 126 | 127 | INLINE uint32_t counter_high(uint64_t counter) { 128 | return (uint32_t)(counter >> 32); 129 | } 130 | 131 | INLINE uint32_t load32(const void *src) { 132 | const uint8_t *p = (const uint8_t *)src; 133 | return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | 134 | ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); 135 | } 136 | 137 | INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], 138 | uint32_t key_words[8]) { 139 | key_words[0] = load32(&key[0 * 4]); 140 | key_words[1] = load32(&key[1 * 4]); 141 | key_words[2] = load32(&key[2 * 4]); 142 | key_words[3] = load32(&key[3 * 4]); 143 | key_words[4] = load32(&key[4 * 4]); 144 | key_words[5] = load32(&key[5 * 4]); 145 | key_words[6] = load32(&key[6 * 4]); 146 | key_words[7] = load32(&key[7 * 4]); 147 | } 148 | 149 | INLINE void store32(void *dst, uint32_t w) { 150 | uint8_t *p = (uint8_t *)dst; 151 | p[0] = (uint8_t)(w >> 0); 152 | p[1] = (uint8_t)(w >> 8); 153 | p[2] = (uint8_t)(w >> 16); 154 | p[3] = (uint8_t)(w >> 24); 155 | } 156 | 157 | INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { 158 | store32(&bytes_out[0 * 4], cv_words[0]); 159 | store32(&bytes_out[1 * 4], cv_words[1]); 160 | store32(&bytes_out[2 * 4], cv_words[2]); 161 | store32(&bytes_out[3 * 4], cv_words[3]); 162 | store32(&bytes_out[4 * 4], cv_words[4]); 163 | store32(&bytes_out[5 * 4], cv_words[5]); 164 | store32(&bytes_out[6 * 4], cv_words[6]); 165 | store32(&bytes_out[7 * 4], cv_words[7]); 166 | } 167 | 168 | void blake3_compress_in_place(uint32_t cv[8], 169 | const uint8_t block[BLAKE3_BLOCK_LEN], 170 | uint8_t block_len, uint64_t counter, 171 | uint8_t flags); 172 | 173 | void blake3_compress_xof(const uint32_t cv[8], 174 | const uint8_t block[BLAKE3_BLOCK_LEN], 175 | uint8_t block_len, uint64_t counter, uint8_t flags, 176 | uint8_t out[64]); 177 | 178 | void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, 179 | size_t blocks, const uint32_t key[8], uint64_t counter, 180 | bool increment_counter, uint8_t flags, 181 | uint8_t flags_start, uint8_t flags_end, uint8_t *out); 182 | 183 | size_t blake3_simd_degree(void); 184 | 185 | 186 | // Declarations for implementation-specific functions. 
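// Every backend below provides a blake3_hash_many_* function; the portable,
// SSE2, SSE4.1, and AVX-512 backends also provide compress_in_place /
// compress_xof variants. blake3_dispatch.c picks among them at runtime based
// on get_cpu_features().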
187 | void blake3_compress_in_place_portable(uint32_t cv[8], 188 | const uint8_t block[BLAKE3_BLOCK_LEN], 189 | uint8_t block_len, uint64_t counter, 190 | uint8_t flags); 191 | 192 | void blake3_compress_xof_portable(const uint32_t cv[8], 193 | const uint8_t block[BLAKE3_BLOCK_LEN], 194 | uint8_t block_len, uint64_t counter, 195 | uint8_t flags, uint8_t out[64]); 196 | 197 | void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, 198 | size_t blocks, const uint32_t key[8], 199 | uint64_t counter, bool increment_counter, 200 | uint8_t flags, uint8_t flags_start, 201 | uint8_t flags_end, uint8_t *out); 202 | 203 | #if defined(IS_X86) 204 | #if !defined(BLAKE3_NO_SSE2) 205 | void blake3_compress_in_place_sse2(uint32_t cv[8], 206 | const uint8_t block[BLAKE3_BLOCK_LEN], 207 | uint8_t block_len, uint64_t counter, 208 | uint8_t flags); 209 | void blake3_compress_xof_sse2(const uint32_t cv[8], 210 | const uint8_t block[BLAKE3_BLOCK_LEN], 211 | uint8_t block_len, uint64_t counter, 212 | uint8_t flags, uint8_t out[64]); 213 | void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, 214 | size_t blocks, const uint32_t key[8], 215 | uint64_t counter, bool increment_counter, 216 | uint8_t flags, uint8_t flags_start, 217 | uint8_t flags_end, uint8_t *out); 218 | #endif 219 | #if !defined(BLAKE3_NO_SSE41) 220 | void blake3_compress_in_place_sse41(uint32_t cv[8], 221 | const uint8_t block[BLAKE3_BLOCK_LEN], 222 | uint8_t block_len, uint64_t counter, 223 | uint8_t flags); 224 | void blake3_compress_xof_sse41(const uint32_t cv[8], 225 | const uint8_t block[BLAKE3_BLOCK_LEN], 226 | uint8_t block_len, uint64_t counter, 227 | uint8_t flags, uint8_t out[64]); 228 | void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, 229 | size_t blocks, const uint32_t key[8], 230 | uint64_t counter, bool increment_counter, 231 | uint8_t flags, uint8_t flags_start, 232 | uint8_t flags_end, uint8_t *out); 233 | #endif 234 | #if !defined(BLAKE3_NO_AVX2) 235 | void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, 236 | size_t blocks, const uint32_t key[8], 237 | uint64_t counter, bool increment_counter, 238 | uint8_t flags, uint8_t flags_start, 239 | uint8_t flags_end, uint8_t *out); 240 | #endif 241 | #if !defined(BLAKE3_NO_AVX512) 242 | void blake3_compress_in_place_avx512(uint32_t cv[8], 243 | const uint8_t block[BLAKE3_BLOCK_LEN], 244 | uint8_t block_len, uint64_t counter, 245 | uint8_t flags); 246 | 247 | void blake3_compress_xof_avx512(const uint32_t cv[8], 248 | const uint8_t block[BLAKE3_BLOCK_LEN], 249 | uint8_t block_len, uint64_t counter, 250 | uint8_t flags, uint8_t out[64]); 251 | 252 | void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, 253 | size_t blocks, const uint32_t key[8], 254 | uint64_t counter, bool increment_counter, 255 | uint8_t flags, uint8_t flags_start, 256 | uint8_t flags_end, uint8_t *out); 257 | #endif 258 | #endif 259 | 260 | #if defined(BLAKE3_USE_NEON) 261 | void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, 262 | size_t blocks, const uint32_t key[8], 263 | uint64_t counter, bool increment_counter, 264 | uint8_t flags, uint8_t flags_start, 265 | uint8_t flags_end, uint8_t *out); 266 | #endif 267 | 268 | 269 | #endif /* BLAKE3_IMPL_H */ 270 | -------------------------------------------------------------------------------- /blake3_portable.c: -------------------------------------------------------------------------------- 1 | #include "blake3_impl.h" 2 | 
#include 3 | 4 | INLINE uint32_t rotr32(uint32_t w, uint32_t c) { 5 | return (w >> c) | (w << (32 - c)); 6 | } 7 | 8 | INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, 9 | uint32_t x, uint32_t y) { 10 | state[a] = state[a] + state[b] + x; 11 | state[d] = rotr32(state[d] ^ state[a], 16); 12 | state[c] = state[c] + state[d]; 13 | state[b] = rotr32(state[b] ^ state[c], 12); 14 | state[a] = state[a] + state[b] + y; 15 | state[d] = rotr32(state[d] ^ state[a], 8); 16 | state[c] = state[c] + state[d]; 17 | state[b] = rotr32(state[b] ^ state[c], 7); 18 | } 19 | 20 | INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) { 21 | // Select the message schedule based on the round. 22 | const uint8_t *schedule = MSG_SCHEDULE[round]; 23 | 24 | // Mix the columns. 25 | g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); 26 | g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); 27 | g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); 28 | g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); 29 | 30 | // Mix the rows. 31 | g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); 32 | g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); 33 | g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); 34 | g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); 35 | } 36 | 37 | INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], 38 | const uint8_t block[BLAKE3_BLOCK_LEN], 39 | uint8_t block_len, uint64_t counter, uint8_t flags) { 40 | uint32_t block_words[16]; 41 | block_words[0] = load32(block + 4 * 0); 42 | block_words[1] = load32(block + 4 * 1); 43 | block_words[2] = load32(block + 4 * 2); 44 | block_words[3] = load32(block + 4 * 3); 45 | block_words[4] = load32(block + 4 * 4); 46 | block_words[5] = load32(block + 4 * 5); 47 | block_words[6] = load32(block + 4 * 6); 48 | block_words[7] = load32(block + 4 * 7); 49 | block_words[8] = load32(block + 4 * 8); 50 | block_words[9] = load32(block + 4 * 9); 51 | block_words[10] = load32(block + 4 * 10); 52 | block_words[11] = load32(block + 4 * 11); 53 | block_words[12] = load32(block + 4 * 12); 54 | block_words[13] = load32(block + 4 * 13); 55 | block_words[14] = load32(block + 4 * 14); 56 | block_words[15] = load32(block + 4 * 15); 57 | 58 | state[0] = cv[0]; 59 | state[1] = cv[1]; 60 | state[2] = cv[2]; 61 | state[3] = cv[3]; 62 | state[4] = cv[4]; 63 | state[5] = cv[5]; 64 | state[6] = cv[6]; 65 | state[7] = cv[7]; 66 | state[8] = IV[0]; 67 | state[9] = IV[1]; 68 | state[10] = IV[2]; 69 | state[11] = IV[3]; 70 | state[12] = counter_low(counter); 71 | state[13] = counter_high(counter); 72 | state[14] = (uint32_t)block_len; 73 | state[15] = (uint32_t)flags; 74 | 75 | round_fn(state, &block_words[0], 0); 76 | round_fn(state, &block_words[0], 1); 77 | round_fn(state, &block_words[0], 2); 78 | round_fn(state, &block_words[0], 3); 79 | round_fn(state, &block_words[0], 4); 80 | round_fn(state, &block_words[0], 5); 81 | round_fn(state, &block_words[0], 6); 82 | } 83 | 84 | void blake3_compress_in_place_portable(uint32_t cv[8], 85 | const uint8_t block[BLAKE3_BLOCK_LEN], 86 | uint8_t block_len, uint64_t counter, 87 | uint8_t flags) { 88 | uint32_t state[16]; 89 | compress_pre(state, cv, block, block_len, counter, flags); 90 | cv[0] = state[0] ^ state[8]; 91 | cv[1] = state[1] ^ state[9]; 92 | cv[2] = state[2] ^ state[10]; 93 | cv[3] = state[3] ^ state[11]; 94 | cv[4] = state[4] ^ state[12]; 95 | cv[5] = state[5] ^ state[13]; 96 | cv[6] = state[6] ^ state[14]; 97 | 
cv[7] = state[7] ^ state[15]; 98 | } 99 | 100 | void blake3_compress_xof_portable(const uint32_t cv[8], 101 | const uint8_t block[BLAKE3_BLOCK_LEN], 102 | uint8_t block_len, uint64_t counter, 103 | uint8_t flags, uint8_t out[64]) { 104 | uint32_t state[16]; 105 | compress_pre(state, cv, block, block_len, counter, flags); 106 | 107 | store32(&out[0 * 4], state[0] ^ state[8]); 108 | store32(&out[1 * 4], state[1] ^ state[9]); 109 | store32(&out[2 * 4], state[2] ^ state[10]); 110 | store32(&out[3 * 4], state[3] ^ state[11]); 111 | store32(&out[4 * 4], state[4] ^ state[12]); 112 | store32(&out[5 * 4], state[5] ^ state[13]); 113 | store32(&out[6 * 4], state[6] ^ state[14]); 114 | store32(&out[7 * 4], state[7] ^ state[15]); 115 | store32(&out[8 * 4], state[8] ^ cv[0]); 116 | store32(&out[9 * 4], state[9] ^ cv[1]); 117 | store32(&out[10 * 4], state[10] ^ cv[2]); 118 | store32(&out[11 * 4], state[11] ^ cv[3]); 119 | store32(&out[12 * 4], state[12] ^ cv[4]); 120 | store32(&out[13 * 4], state[13] ^ cv[5]); 121 | store32(&out[14 * 4], state[14] ^ cv[6]); 122 | store32(&out[15 * 4], state[15] ^ cv[7]); 123 | } 124 | 125 | INLINE void hash_one_portable(const uint8_t *input, size_t blocks, 126 | const uint32_t key[8], uint64_t counter, 127 | uint8_t flags, uint8_t flags_start, 128 | uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { 129 | uint32_t cv[8]; 130 | memcpy(cv, key, BLAKE3_KEY_LEN); 131 | uint8_t block_flags = flags | flags_start; 132 | while (blocks > 0) { 133 | if (blocks == 1) { 134 | block_flags |= flags_end; 135 | } 136 | blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, 137 | block_flags); 138 | input = &input[BLAKE3_BLOCK_LEN]; 139 | blocks -= 1; 140 | block_flags = flags; 141 | } 142 | store_cv_words(out, cv); 143 | } 144 | 145 | void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, 146 | size_t blocks, const uint32_t key[8], 147 | uint64_t counter, bool increment_counter, 148 | uint8_t flags, uint8_t flags_start, 149 | uint8_t flags_end, uint8_t *out) { 150 | while (num_inputs > 0) { 151 | hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, 152 | flags_end, out); 153 | if (increment_counter) { 154 | counter += 1; 155 | } 156 | inputs += 1; 157 | num_inputs -= 1; 158 | out = &out[BLAKE3_OUT_LEN]; 159 | } 160 | } 161 | -------------------------------------------------------------------------------- /blake3_sse2.c: -------------------------------------------------------------------------------- 1 | #include "blake3_impl.h" 2 | 3 | #include 4 | 5 | #define DEGREE 4 6 | 7 | #define _mm_shuffle_ps2(a, b, c) \ 8 | (_mm_castps_si128( \ 9 | _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) 10 | 11 | INLINE __m128i loadu(const uint8_t src[16]) { 12 | return _mm_loadu_si128((const __m128i *)src); 13 | } 14 | 15 | INLINE void storeu(__m128i src, uint8_t dest[16]) { 16 | _mm_storeu_si128((__m128i *)dest, src); 17 | } 18 | 19 | INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } 20 | 21 | // Note that clang-format doesn't like the name "xor" for some reason. 
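// Plain SSE2 has no byte-shuffle instruction (_mm_shuffle_epi8 is SSSE3),
// so rot16 below is built from two 16-bit lane shuffles and rot12/rot8/rot7
// from paired shifts combined with XOR (the shifted halves never overlap,
// so XOR acts as OR). The mask-based blend_epi16 helper further down
// likewise stands in for the SSE4.1 _mm_blend_epi16 intrinsic.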
22 | INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } 23 | 24 | INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } 25 | 26 | INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { 27 | return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); 28 | } 29 | 30 | INLINE __m128i rot16(__m128i x) { 31 | return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1); 32 | } 33 | 34 | INLINE __m128i rot12(__m128i x) { 35 | return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); 36 | } 37 | 38 | INLINE __m128i rot8(__m128i x) { 39 | return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8)); 40 | } 41 | 42 | INLINE __m128i rot7(__m128i x) { 43 | return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); 44 | } 45 | 46 | INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, 47 | __m128i m) { 48 | *row0 = addv(addv(*row0, m), *row1); 49 | *row3 = xorv(*row3, *row0); 50 | *row3 = rot16(*row3); 51 | *row2 = addv(*row2, *row3); 52 | *row1 = xorv(*row1, *row2); 53 | *row1 = rot12(*row1); 54 | } 55 | 56 | INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, 57 | __m128i m) { 58 | *row0 = addv(addv(*row0, m), *row1); 59 | *row3 = xorv(*row3, *row0); 60 | *row3 = rot8(*row3); 61 | *row2 = addv(*row2, *row3); 62 | *row1 = xorv(*row1, *row2); 63 | *row1 = rot7(*row1); 64 | } 65 | 66 | // Note the optimization here of leaving row1 as the unrotated row, rather than 67 | // row0. All the message loads below are adjusted to compensate for this. See 68 | // discussion at https://github.com/sneves/blake2-avx2/pull/4 69 | INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { 70 | *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); 71 | *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); 72 | *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); 73 | } 74 | 75 | INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { 76 | *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); 77 | *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); 78 | *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); 79 | } 80 | 81 | INLINE __m128i blend_epi16(__m128i a, __m128i b, const int imm8) { 82 | const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); 83 | __m128i mask = _mm_set1_epi16(imm8); 84 | mask = _mm_and_si128(mask, bits); 85 | mask = _mm_cmpeq_epi16(mask, bits); 86 | return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); 87 | } 88 | 89 | INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], 90 | const uint8_t block[BLAKE3_BLOCK_LEN], 91 | uint8_t block_len, uint64_t counter, uint8_t flags) { 92 | rows[0] = loadu((uint8_t *)&cv[0]); 93 | rows[1] = loadu((uint8_t *)&cv[4]); 94 | rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); 95 | rows[3] = set4(counter_low(counter), counter_high(counter), 96 | (uint32_t)block_len, (uint32_t)flags); 97 | 98 | __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); 99 | __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); 100 | __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); 101 | __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); 102 | 103 | __m128i t0, t1, t2, t3, tt; 104 | 105 | // Round 1. The first round permutes the message words from the original 106 | // input order, into the groups that get mixed in parallel. 
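// m0..m3 hold message words 0-3, 4-7, 8-11 and 12-15. The _mm_shuffle_ps2
// calls below gather the even-numbered words for the g1 half of each step
// and the odd-numbered words for the g2 half; trailing comments such as
// "6 4 2 0" list the gathered word indices from the highest lane down to
// lane 0.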
107 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 108 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 109 | t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 110 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 111 | diagonalize(&rows[0], &rows[2], &rows[3]); 112 | t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 113 | t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 114 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 115 | t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 116 | t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 117 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 118 | undiagonalize(&rows[0], &rows[2], &rows[3]); 119 | m0 = t0; 120 | m1 = t1; 121 | m2 = t2; 122 | m3 = t3; 123 | 124 | // Round 2. This round and all following rounds apply a fixed permutation 125 | // to the message words from the round before. 126 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 127 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 128 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 129 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 130 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 131 | t1 = blend_epi16(tt, t1, 0xCC); 132 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 133 | diagonalize(&rows[0], &rows[2], &rows[3]); 134 | t2 = _mm_unpacklo_epi64(m3, m1); 135 | tt = blend_epi16(t2, m2, 0xC0); 136 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 137 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 138 | t3 = _mm_unpackhi_epi32(m1, m3); 139 | tt = _mm_unpacklo_epi32(m2, t3); 140 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 141 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 142 | undiagonalize(&rows[0], &rows[2], &rows[3]); 143 | m0 = t0; 144 | m1 = t1; 145 | m2 = t2; 146 | m3 = t3; 147 | 148 | // Round 3 149 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 150 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 151 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 152 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 153 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 154 | t1 = blend_epi16(tt, t1, 0xCC); 155 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 156 | diagonalize(&rows[0], &rows[2], &rows[3]); 157 | t2 = _mm_unpacklo_epi64(m3, m1); 158 | tt = blend_epi16(t2, m2, 0xC0); 159 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 160 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 161 | t3 = _mm_unpackhi_epi32(m1, m3); 162 | tt = _mm_unpacklo_epi32(m2, t3); 163 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 164 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 165 | undiagonalize(&rows[0], &rows[2], &rows[3]); 166 | m0 = t0; 167 | m1 = t1; 168 | m2 = t2; 169 | m3 = t3; 170 | 171 | // Round 4 172 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 173 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 174 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 175 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 176 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 177 | t1 = blend_epi16(tt, t1, 0xCC); 178 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 179 | diagonalize(&rows[0], &rows[2], &rows[3]); 180 | t2 = _mm_unpacklo_epi64(m3, m1); 181 | tt = blend_epi16(t2, m2, 0xC0); 182 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 183 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 184 | t3 = _mm_unpackhi_epi32(m1, m3); 185 | tt = _mm_unpacklo_epi32(m2, 
t3); 186 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 187 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 188 | undiagonalize(&rows[0], &rows[2], &rows[3]); 189 | m0 = t0; 190 | m1 = t1; 191 | m2 = t2; 192 | m3 = t3; 193 | 194 | // Round 5 195 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 196 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 197 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 198 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 199 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 200 | t1 = blend_epi16(tt, t1, 0xCC); 201 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 202 | diagonalize(&rows[0], &rows[2], &rows[3]); 203 | t2 = _mm_unpacklo_epi64(m3, m1); 204 | tt = blend_epi16(t2, m2, 0xC0); 205 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 206 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 207 | t3 = _mm_unpackhi_epi32(m1, m3); 208 | tt = _mm_unpacklo_epi32(m2, t3); 209 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 210 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 211 | undiagonalize(&rows[0], &rows[2], &rows[3]); 212 | m0 = t0; 213 | m1 = t1; 214 | m2 = t2; 215 | m3 = t3; 216 | 217 | // Round 6 218 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 219 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 220 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 221 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 222 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 223 | t1 = blend_epi16(tt, t1, 0xCC); 224 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 225 | diagonalize(&rows[0], &rows[2], &rows[3]); 226 | t2 = _mm_unpacklo_epi64(m3, m1); 227 | tt = blend_epi16(t2, m2, 0xC0); 228 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 229 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 230 | t3 = _mm_unpackhi_epi32(m1, m3); 231 | tt = _mm_unpacklo_epi32(m2, t3); 232 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 233 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 234 | undiagonalize(&rows[0], &rows[2], &rows[3]); 235 | m0 = t0; 236 | m1 = t1; 237 | m2 = t2; 238 | m3 = t3; 239 | 240 | // Round 7 241 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 242 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 243 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 244 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 245 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 246 | t1 = blend_epi16(tt, t1, 0xCC); 247 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 248 | diagonalize(&rows[0], &rows[2], &rows[3]); 249 | t2 = _mm_unpacklo_epi64(m3, m1); 250 | tt = blend_epi16(t2, m2, 0xC0); 251 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 252 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 253 | t3 = _mm_unpackhi_epi32(m1, m3); 254 | tt = _mm_unpacklo_epi32(m2, t3); 255 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 256 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 257 | undiagonalize(&rows[0], &rows[2], &rows[3]); 258 | } 259 | 260 | void blake3_compress_in_place_sse2(uint32_t cv[8], 261 | const uint8_t block[BLAKE3_BLOCK_LEN], 262 | uint8_t block_len, uint64_t counter, 263 | uint8_t flags) { 264 | __m128i rows[4]; 265 | compress_pre(rows, cv, block, block_len, counter, flags); 266 | storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); 267 | storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); 268 | } 269 | 270 | void blake3_compress_xof_sse2(const uint32_t cv[8], 271 | const uint8_t block[BLAKE3_BLOCK_LEN], 272 | uint8_t block_len, uint64_t 
counter, 273 | uint8_t flags, uint8_t out[64]) { 274 | __m128i rows[4]; 275 | compress_pre(rows, cv, block, block_len, counter, flags); 276 | storeu(xorv(rows[0], rows[2]), &out[0]); 277 | storeu(xorv(rows[1], rows[3]), &out[16]); 278 | storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); 279 | storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); 280 | } 281 | 282 | INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { 283 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); 284 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); 285 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); 286 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); 287 | v[0] = addv(v[0], v[4]); 288 | v[1] = addv(v[1], v[5]); 289 | v[2] = addv(v[2], v[6]); 290 | v[3] = addv(v[3], v[7]); 291 | v[12] = xorv(v[12], v[0]); 292 | v[13] = xorv(v[13], v[1]); 293 | v[14] = xorv(v[14], v[2]); 294 | v[15] = xorv(v[15], v[3]); 295 | v[12] = rot16(v[12]); 296 | v[13] = rot16(v[13]); 297 | v[14] = rot16(v[14]); 298 | v[15] = rot16(v[15]); 299 | v[8] = addv(v[8], v[12]); 300 | v[9] = addv(v[9], v[13]); 301 | v[10] = addv(v[10], v[14]); 302 | v[11] = addv(v[11], v[15]); 303 | v[4] = xorv(v[4], v[8]); 304 | v[5] = xorv(v[5], v[9]); 305 | v[6] = xorv(v[6], v[10]); 306 | v[7] = xorv(v[7], v[11]); 307 | v[4] = rot12(v[4]); 308 | v[5] = rot12(v[5]); 309 | v[6] = rot12(v[6]); 310 | v[7] = rot12(v[7]); 311 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); 312 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); 313 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); 314 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); 315 | v[0] = addv(v[0], v[4]); 316 | v[1] = addv(v[1], v[5]); 317 | v[2] = addv(v[2], v[6]); 318 | v[3] = addv(v[3], v[7]); 319 | v[12] = xorv(v[12], v[0]); 320 | v[13] = xorv(v[13], v[1]); 321 | v[14] = xorv(v[14], v[2]); 322 | v[15] = xorv(v[15], v[3]); 323 | v[12] = rot8(v[12]); 324 | v[13] = rot8(v[13]); 325 | v[14] = rot8(v[14]); 326 | v[15] = rot8(v[15]); 327 | v[8] = addv(v[8], v[12]); 328 | v[9] = addv(v[9], v[13]); 329 | v[10] = addv(v[10], v[14]); 330 | v[11] = addv(v[11], v[15]); 331 | v[4] = xorv(v[4], v[8]); 332 | v[5] = xorv(v[5], v[9]); 333 | v[6] = xorv(v[6], v[10]); 334 | v[7] = xorv(v[7], v[11]); 335 | v[4] = rot7(v[4]); 336 | v[5] = rot7(v[5]); 337 | v[6] = rot7(v[6]); 338 | v[7] = rot7(v[7]); 339 | 340 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); 341 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); 342 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); 343 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); 344 | v[0] = addv(v[0], v[5]); 345 | v[1] = addv(v[1], v[6]); 346 | v[2] = addv(v[2], v[7]); 347 | v[3] = addv(v[3], v[4]); 348 | v[15] = xorv(v[15], v[0]); 349 | v[12] = xorv(v[12], v[1]); 350 | v[13] = xorv(v[13], v[2]); 351 | v[14] = xorv(v[14], v[3]); 352 | v[15] = rot16(v[15]); 353 | v[12] = rot16(v[12]); 354 | v[13] = rot16(v[13]); 355 | v[14] = rot16(v[14]); 356 | v[10] = addv(v[10], v[15]); 357 | v[11] = addv(v[11], v[12]); 358 | v[8] = addv(v[8], v[13]); 359 | v[9] = addv(v[9], v[14]); 360 | v[5] = xorv(v[5], v[10]); 361 | v[6] = xorv(v[6], v[11]); 362 | v[7] = xorv(v[7], v[8]); 363 | v[4] = xorv(v[4], v[9]); 364 | v[5] = rot12(v[5]); 365 | v[6] = rot12(v[6]); 366 | v[7] = rot12(v[7]); 367 | v[4] = rot12(v[4]); 368 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); 369 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); 370 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); 371 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); 372 | v[0] = 
addv(v[0], v[5]); 373 | v[1] = addv(v[1], v[6]); 374 | v[2] = addv(v[2], v[7]); 375 | v[3] = addv(v[3], v[4]); 376 | v[15] = xorv(v[15], v[0]); 377 | v[12] = xorv(v[12], v[1]); 378 | v[13] = xorv(v[13], v[2]); 379 | v[14] = xorv(v[14], v[3]); 380 | v[15] = rot8(v[15]); 381 | v[12] = rot8(v[12]); 382 | v[13] = rot8(v[13]); 383 | v[14] = rot8(v[14]); 384 | v[10] = addv(v[10], v[15]); 385 | v[11] = addv(v[11], v[12]); 386 | v[8] = addv(v[8], v[13]); 387 | v[9] = addv(v[9], v[14]); 388 | v[5] = xorv(v[5], v[10]); 389 | v[6] = xorv(v[6], v[11]); 390 | v[7] = xorv(v[7], v[8]); 391 | v[4] = xorv(v[4], v[9]); 392 | v[5] = rot7(v[5]); 393 | v[6] = rot7(v[6]); 394 | v[7] = rot7(v[7]); 395 | v[4] = rot7(v[4]); 396 | } 397 | 398 | INLINE void transpose_vecs(__m128i vecs[DEGREE]) { 399 | // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is 400 | // 22/33. Note that this doesn't split the vector into two lanes, as the 401 | // AVX2 counterparts do. 402 | __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); 403 | __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); 404 | __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); 405 | __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); 406 | 407 | // Interleave 64-bit lanes. 408 | __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); 409 | __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); 410 | __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); 411 | __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); 412 | 413 | vecs[0] = abcd_0; 414 | vecs[1] = abcd_1; 415 | vecs[2] = abcd_2; 416 | vecs[3] = abcd_3; 417 | } 418 | 419 | INLINE void transpose_msg_vecs(const uint8_t *const *inputs, 420 | size_t block_offset, __m128i out[16]) { 421 | out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); 422 | out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); 423 | out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); 424 | out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); 425 | out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); 426 | out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); 427 | out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); 428 | out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); 429 | out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); 430 | out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); 431 | out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); 432 | out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); 433 | out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); 434 | out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); 435 | out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); 436 | out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); 437 | for (size_t i = 0; i < 4; ++i) { 438 | _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); 439 | } 440 | transpose_vecs(&out[0]); 441 | transpose_vecs(&out[4]); 442 | transpose_vecs(&out[8]); 443 | transpose_vecs(&out[12]); 444 | } 445 | 446 | INLINE void load_counters(uint64_t counter, bool increment_counter, 447 | __m128i *out_lo, __m128i *out_hi) { 448 | const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); 449 | const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); 450 | const __m128i add1 = _mm_and_si128(mask, add0); 451 | __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1); 452 | __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), 453 | _mm_xor_si128( l, 
_mm_set1_epi32(0x80000000))); 454 | __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry); 455 | *out_lo = l; 456 | *out_hi = h; 457 | } 458 | 459 | void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, 460 | const uint32_t key[8], uint64_t counter, 461 | bool increment_counter, uint8_t flags, 462 | uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 463 | __m128i h_vecs[8] = { 464 | set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), 465 | set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), 466 | }; 467 | __m128i counter_low_vec, counter_high_vec; 468 | load_counters(counter, increment_counter, &counter_low_vec, 469 | &counter_high_vec); 470 | uint8_t block_flags = flags | flags_start; 471 | 472 | for (size_t block = 0; block < blocks; block++) { 473 | if (block + 1 == blocks) { 474 | block_flags |= flags_end; 475 | } 476 | __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); 477 | __m128i block_flags_vec = set1(block_flags); 478 | __m128i msg_vecs[16]; 479 | transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); 480 | 481 | __m128i v[16] = { 482 | h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], 483 | h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], 484 | set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), 485 | counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, 486 | }; 487 | round_fn(v, msg_vecs, 0); 488 | round_fn(v, msg_vecs, 1); 489 | round_fn(v, msg_vecs, 2); 490 | round_fn(v, msg_vecs, 3); 491 | round_fn(v, msg_vecs, 4); 492 | round_fn(v, msg_vecs, 5); 493 | round_fn(v, msg_vecs, 6); 494 | h_vecs[0] = xorv(v[0], v[8]); 495 | h_vecs[1] = xorv(v[1], v[9]); 496 | h_vecs[2] = xorv(v[2], v[10]); 497 | h_vecs[3] = xorv(v[3], v[11]); 498 | h_vecs[4] = xorv(v[4], v[12]); 499 | h_vecs[5] = xorv(v[5], v[13]); 500 | h_vecs[6] = xorv(v[6], v[14]); 501 | h_vecs[7] = xorv(v[7], v[15]); 502 | 503 | block_flags = flags; 504 | } 505 | 506 | transpose_vecs(&h_vecs[0]); 507 | transpose_vecs(&h_vecs[4]); 508 | // The first four vecs now contain the first half of each output, and the 509 | // second four vecs contain the second half of each output. 
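// After the two 4x4 transposes, h_vecs[i] holds words 0-3 of output i and
// h_vecs[i+4] holds words 4-7 of output i, so the interleaved stores below
// write four contiguous 32-byte chaining values into out.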
510 | storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); 511 | storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); 512 | storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); 513 | storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); 514 | storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); 515 | storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); 516 | storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); 517 | storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); 518 | } 519 | 520 | INLINE void hash_one_sse2(const uint8_t *input, size_t blocks, 521 | const uint32_t key[8], uint64_t counter, 522 | uint8_t flags, uint8_t flags_start, 523 | uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { 524 | uint32_t cv[8]; 525 | memcpy(cv, key, BLAKE3_KEY_LEN); 526 | uint8_t block_flags = flags | flags_start; 527 | while (blocks > 0) { 528 | if (blocks == 1) { 529 | block_flags |= flags_end; 530 | } 531 | blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter, 532 | block_flags); 533 | input = &input[BLAKE3_BLOCK_LEN]; 534 | blocks -= 1; 535 | block_flags = flags; 536 | } 537 | memcpy(out, cv, BLAKE3_OUT_LEN); 538 | } 539 | 540 | void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, 541 | size_t blocks, const uint32_t key[8], 542 | uint64_t counter, bool increment_counter, 543 | uint8_t flags, uint8_t flags_start, 544 | uint8_t flags_end, uint8_t *out) { 545 | while (num_inputs >= DEGREE) { 546 | blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags, 547 | flags_start, flags_end, out); 548 | if (increment_counter) { 549 | counter += DEGREE; 550 | } 551 | inputs += DEGREE; 552 | num_inputs -= DEGREE; 553 | out = &out[DEGREE * BLAKE3_OUT_LEN]; 554 | } 555 | while (num_inputs > 0) { 556 | hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start, 557 | flags_end, out); 558 | if (increment_counter) { 559 | counter += 1; 560 | } 561 | inputs += 1; 562 | num_inputs -= 1; 563 | out = &out[BLAKE3_OUT_LEN]; 564 | } 565 | } 566 | -------------------------------------------------------------------------------- /blake3_sse41.c: -------------------------------------------------------------------------------- 1 | #include "blake3_impl.h" 2 | 3 | #include 4 | 5 | #define DEGREE 4 6 | 7 | #define _mm_shuffle_ps2(a, b, c) \ 8 | (_mm_castps_si128( \ 9 | _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) 10 | 11 | INLINE __m128i loadu(const uint8_t src[16]) { 12 | return _mm_loadu_si128((const __m128i *)src); 13 | } 14 | 15 | INLINE void storeu(__m128i src, uint8_t dest[16]) { 16 | _mm_storeu_si128((__m128i *)dest, src); 17 | } 18 | 19 | INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } 20 | 21 | // Note that clang-format doesn't like the name "xor" for some reason. 
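// This file appears to mirror blake3_sse2.c: the only differences are that
// the 16- and 8-bit rotations use a single _mm_shuffle_epi8 byte shuffle and
// the message blends use the native SSE4.1 _mm_blend_epi16, rather than the
// emulated versions in the SSE2 path.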
22 | INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } 23 | 24 | INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } 25 | 26 | INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { 27 | return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); 28 | } 29 | 30 | INLINE __m128i rot16(__m128i x) { 31 | return _mm_shuffle_epi8( 32 | x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); 33 | } 34 | 35 | INLINE __m128i rot12(__m128i x) { 36 | return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); 37 | } 38 | 39 | INLINE __m128i rot8(__m128i x) { 40 | return _mm_shuffle_epi8( 41 | x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); 42 | } 43 | 44 | INLINE __m128i rot7(__m128i x) { 45 | return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); 46 | } 47 | 48 | INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, 49 | __m128i m) { 50 | *row0 = addv(addv(*row0, m), *row1); 51 | *row3 = xorv(*row3, *row0); 52 | *row3 = rot16(*row3); 53 | *row2 = addv(*row2, *row3); 54 | *row1 = xorv(*row1, *row2); 55 | *row1 = rot12(*row1); 56 | } 57 | 58 | INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, 59 | __m128i m) { 60 | *row0 = addv(addv(*row0, m), *row1); 61 | *row3 = xorv(*row3, *row0); 62 | *row3 = rot8(*row3); 63 | *row2 = addv(*row2, *row3); 64 | *row1 = xorv(*row1, *row2); 65 | *row1 = rot7(*row1); 66 | } 67 | 68 | // Note the optimization here of leaving row1 as the unrotated row, rather than 69 | // row0. All the message loads below are adjusted to compensate for this. See 70 | // discussion at https://github.com/sneves/blake2-avx2/pull/4 71 | INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { 72 | *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); 73 | *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); 74 | *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); 75 | } 76 | 77 | INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { 78 | *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); 79 | *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); 80 | *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); 81 | } 82 | 83 | INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], 84 | const uint8_t block[BLAKE3_BLOCK_LEN], 85 | uint8_t block_len, uint64_t counter, uint8_t flags) { 86 | rows[0] = loadu((uint8_t *)&cv[0]); 87 | rows[1] = loadu((uint8_t *)&cv[4]); 88 | rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); 89 | rows[3] = set4(counter_low(counter), counter_high(counter), 90 | (uint32_t)block_len, (uint32_t)flags); 91 | 92 | __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); 93 | __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); 94 | __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); 95 | __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); 96 | 97 | __m128i t0, t1, t2, t3, tt; 98 | 99 | // Round 1. The first round permutes the message words from the original 100 | // input order, into the groups that get mixed in parallel. 
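// Round 1 consumes the message words in their natural order (the identity
// schedule); rounds 2-7 then repeat one fixed shuffle sequence, because each
// later round applies the same word permutation to the previous round's
// t0..t3 vectors.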
101 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 102 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 103 | t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 104 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 105 | diagonalize(&rows[0], &rows[2], &rows[3]); 106 | t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 107 | t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 108 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 109 | t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 110 | t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 111 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 112 | undiagonalize(&rows[0], &rows[2], &rows[3]); 113 | m0 = t0; 114 | m1 = t1; 115 | m2 = t2; 116 | m3 = t3; 117 | 118 | // Round 2. This round and all following rounds apply a fixed permutation 119 | // to the message words from the round before. 120 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 121 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 122 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 123 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 124 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 125 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 126 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 127 | diagonalize(&rows[0], &rows[2], &rows[3]); 128 | t2 = _mm_unpacklo_epi64(m3, m1); 129 | tt = _mm_blend_epi16(t2, m2, 0xC0); 130 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 131 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 132 | t3 = _mm_unpackhi_epi32(m1, m3); 133 | tt = _mm_unpacklo_epi32(m2, t3); 134 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 135 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 136 | undiagonalize(&rows[0], &rows[2], &rows[3]); 137 | m0 = t0; 138 | m1 = t1; 139 | m2 = t2; 140 | m3 = t3; 141 | 142 | // Round 3 143 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 144 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 145 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 146 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 147 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 148 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 149 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 150 | diagonalize(&rows[0], &rows[2], &rows[3]); 151 | t2 = _mm_unpacklo_epi64(m3, m1); 152 | tt = _mm_blend_epi16(t2, m2, 0xC0); 153 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 154 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 155 | t3 = _mm_unpackhi_epi32(m1, m3); 156 | tt = _mm_unpacklo_epi32(m2, t3); 157 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 158 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 159 | undiagonalize(&rows[0], &rows[2], &rows[3]); 160 | m0 = t0; 161 | m1 = t1; 162 | m2 = t2; 163 | m3 = t3; 164 | 165 | // Round 4 166 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 167 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 168 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 169 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 170 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 171 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 172 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 173 | diagonalize(&rows[0], &rows[2], &rows[3]); 174 | t2 = _mm_unpacklo_epi64(m3, m1); 175 | tt = _mm_blend_epi16(t2, m2, 0xC0); 176 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 177 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 178 | t3 = _mm_unpackhi_epi32(m1, m3); 179 | tt = 
_mm_unpacklo_epi32(m2, t3); 180 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 181 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 182 | undiagonalize(&rows[0], &rows[2], &rows[3]); 183 | m0 = t0; 184 | m1 = t1; 185 | m2 = t2; 186 | m3 = t3; 187 | 188 | // Round 5 189 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 190 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 191 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 192 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 193 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 194 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 195 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 196 | diagonalize(&rows[0], &rows[2], &rows[3]); 197 | t2 = _mm_unpacklo_epi64(m3, m1); 198 | tt = _mm_blend_epi16(t2, m2, 0xC0); 199 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 200 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 201 | t3 = _mm_unpackhi_epi32(m1, m3); 202 | tt = _mm_unpacklo_epi32(m2, t3); 203 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 204 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 205 | undiagonalize(&rows[0], &rows[2], &rows[3]); 206 | m0 = t0; 207 | m1 = t1; 208 | m2 = t2; 209 | m3 = t3; 210 | 211 | // Round 6 212 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 213 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 214 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 215 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 216 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 217 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 218 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 219 | diagonalize(&rows[0], &rows[2], &rows[3]); 220 | t2 = _mm_unpacklo_epi64(m3, m1); 221 | tt = _mm_blend_epi16(t2, m2, 0xC0); 222 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 223 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 224 | t3 = _mm_unpackhi_epi32(m1, m3); 225 | tt = _mm_unpacklo_epi32(m2, t3); 226 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 227 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 228 | undiagonalize(&rows[0], &rows[2], &rows[3]); 229 | m0 = t0; 230 | m1 = t1; 231 | m2 = t2; 232 | m3 = t3; 233 | 234 | // Round 7 235 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); 236 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); 237 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); 238 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); 239 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); 240 | t1 = _mm_blend_epi16(tt, t1, 0xCC); 241 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 242 | diagonalize(&rows[0], &rows[2], &rows[3]); 243 | t2 = _mm_unpacklo_epi64(m3, m1); 244 | tt = _mm_blend_epi16(t2, m2, 0xC0); 245 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); 246 | g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); 247 | t3 = _mm_unpackhi_epi32(m1, m3); 248 | tt = _mm_unpacklo_epi32(m2, t3); 249 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); 250 | g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 251 | undiagonalize(&rows[0], &rows[2], &rows[3]); 252 | } 253 | 254 | void blake3_compress_in_place_sse41(uint32_t cv[8], 255 | const uint8_t block[BLAKE3_BLOCK_LEN], 256 | uint8_t block_len, uint64_t counter, 257 | uint8_t flags) { 258 | __m128i rows[4]; 259 | compress_pre(rows, cv, block, block_len, counter, flags); 260 | storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); 261 | storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); 262 | } 263 | 264 | void blake3_compress_xof_sse41(const uint32_t cv[8], 265 | const uint8_t 
block[BLAKE3_BLOCK_LEN], 266 | uint8_t block_len, uint64_t counter, 267 | uint8_t flags, uint8_t out[64]) { 268 | __m128i rows[4]; 269 | compress_pre(rows, cv, block, block_len, counter, flags); 270 | storeu(xorv(rows[0], rows[2]), &out[0]); 271 | storeu(xorv(rows[1], rows[3]), &out[16]); 272 | storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); 273 | storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); 274 | } 275 | 276 | INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { 277 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); 278 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); 279 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); 280 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); 281 | v[0] = addv(v[0], v[4]); 282 | v[1] = addv(v[1], v[5]); 283 | v[2] = addv(v[2], v[6]); 284 | v[3] = addv(v[3], v[7]); 285 | v[12] = xorv(v[12], v[0]); 286 | v[13] = xorv(v[13], v[1]); 287 | v[14] = xorv(v[14], v[2]); 288 | v[15] = xorv(v[15], v[3]); 289 | v[12] = rot16(v[12]); 290 | v[13] = rot16(v[13]); 291 | v[14] = rot16(v[14]); 292 | v[15] = rot16(v[15]); 293 | v[8] = addv(v[8], v[12]); 294 | v[9] = addv(v[9], v[13]); 295 | v[10] = addv(v[10], v[14]); 296 | v[11] = addv(v[11], v[15]); 297 | v[4] = xorv(v[4], v[8]); 298 | v[5] = xorv(v[5], v[9]); 299 | v[6] = xorv(v[6], v[10]); 300 | v[7] = xorv(v[7], v[11]); 301 | v[4] = rot12(v[4]); 302 | v[5] = rot12(v[5]); 303 | v[6] = rot12(v[6]); 304 | v[7] = rot12(v[7]); 305 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); 306 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); 307 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); 308 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); 309 | v[0] = addv(v[0], v[4]); 310 | v[1] = addv(v[1], v[5]); 311 | v[2] = addv(v[2], v[6]); 312 | v[3] = addv(v[3], v[7]); 313 | v[12] = xorv(v[12], v[0]); 314 | v[13] = xorv(v[13], v[1]); 315 | v[14] = xorv(v[14], v[2]); 316 | v[15] = xorv(v[15], v[3]); 317 | v[12] = rot8(v[12]); 318 | v[13] = rot8(v[13]); 319 | v[14] = rot8(v[14]); 320 | v[15] = rot8(v[15]); 321 | v[8] = addv(v[8], v[12]); 322 | v[9] = addv(v[9], v[13]); 323 | v[10] = addv(v[10], v[14]); 324 | v[11] = addv(v[11], v[15]); 325 | v[4] = xorv(v[4], v[8]); 326 | v[5] = xorv(v[5], v[9]); 327 | v[6] = xorv(v[6], v[10]); 328 | v[7] = xorv(v[7], v[11]); 329 | v[4] = rot7(v[4]); 330 | v[5] = rot7(v[5]); 331 | v[6] = rot7(v[6]); 332 | v[7] = rot7(v[7]); 333 | 334 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); 335 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); 336 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); 337 | v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); 338 | v[0] = addv(v[0], v[5]); 339 | v[1] = addv(v[1], v[6]); 340 | v[2] = addv(v[2], v[7]); 341 | v[3] = addv(v[3], v[4]); 342 | v[15] = xorv(v[15], v[0]); 343 | v[12] = xorv(v[12], v[1]); 344 | v[13] = xorv(v[13], v[2]); 345 | v[14] = xorv(v[14], v[3]); 346 | v[15] = rot16(v[15]); 347 | v[12] = rot16(v[12]); 348 | v[13] = rot16(v[13]); 349 | v[14] = rot16(v[14]); 350 | v[10] = addv(v[10], v[15]); 351 | v[11] = addv(v[11], v[12]); 352 | v[8] = addv(v[8], v[13]); 353 | v[9] = addv(v[9], v[14]); 354 | v[5] = xorv(v[5], v[10]); 355 | v[6] = xorv(v[6], v[11]); 356 | v[7] = xorv(v[7], v[8]); 357 | v[4] = xorv(v[4], v[9]); 358 | v[5] = rot12(v[5]); 359 | v[6] = rot12(v[6]); 360 | v[7] = rot12(v[7]); 361 | v[4] = rot12(v[4]); 362 | v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); 363 | v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); 364 | v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); 365 | v[3] = 
addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); 366 | v[0] = addv(v[0], v[5]); 367 | v[1] = addv(v[1], v[6]); 368 | v[2] = addv(v[2], v[7]); 369 | v[3] = addv(v[3], v[4]); 370 | v[15] = xorv(v[15], v[0]); 371 | v[12] = xorv(v[12], v[1]); 372 | v[13] = xorv(v[13], v[2]); 373 | v[14] = xorv(v[14], v[3]); 374 | v[15] = rot8(v[15]); 375 | v[12] = rot8(v[12]); 376 | v[13] = rot8(v[13]); 377 | v[14] = rot8(v[14]); 378 | v[10] = addv(v[10], v[15]); 379 | v[11] = addv(v[11], v[12]); 380 | v[8] = addv(v[8], v[13]); 381 | v[9] = addv(v[9], v[14]); 382 | v[5] = xorv(v[5], v[10]); 383 | v[6] = xorv(v[6], v[11]); 384 | v[7] = xorv(v[7], v[8]); 385 | v[4] = xorv(v[4], v[9]); 386 | v[5] = rot7(v[5]); 387 | v[6] = rot7(v[6]); 388 | v[7] = rot7(v[7]); 389 | v[4] = rot7(v[4]); 390 | } 391 | 392 | INLINE void transpose_vecs(__m128i vecs[DEGREE]) { 393 | // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is 394 | // 22/33. Note that this doesn't split the vector into two lanes, as the 395 | // AVX2 counterparts do. 396 | __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); 397 | __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); 398 | __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); 399 | __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); 400 | 401 | // Interleave 64-bit lanes. 402 | __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); 403 | __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); 404 | __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); 405 | __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); 406 | 407 | vecs[0] = abcd_0; 408 | vecs[1] = abcd_1; 409 | vecs[2] = abcd_2; 410 | vecs[3] = abcd_3; 411 | } 412 | 413 | INLINE void transpose_msg_vecs(const uint8_t *const *inputs, 414 | size_t block_offset, __m128i out[16]) { 415 | out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); 416 | out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); 417 | out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); 418 | out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); 419 | out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); 420 | out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); 421 | out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); 422 | out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); 423 | out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); 424 | out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); 425 | out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); 426 | out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); 427 | out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); 428 | out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); 429 | out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); 430 | out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); 431 | for (size_t i = 0; i < 4; ++i) { 432 | _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); 433 | } 434 | transpose_vecs(&out[0]); 435 | transpose_vecs(&out[4]); 436 | transpose_vecs(&out[8]); 437 | transpose_vecs(&out[12]); 438 | } 439 | 440 | INLINE void load_counters(uint64_t counter, bool increment_counter, 441 | __m128i *out_lo, __m128i *out_hi) { 442 | const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); 443 | const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); 444 | const __m128i add1 = _mm_and_si128(mask, add0); 445 | __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1); 446 | __m128i carry = 
_mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), 447 | _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); 448 | __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry); 449 | *out_lo = l; 450 | *out_hi = h; 451 | } 452 | 453 | void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, 454 | const uint32_t key[8], uint64_t counter, 455 | bool increment_counter, uint8_t flags, 456 | uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 457 | __m128i h_vecs[8] = { 458 | set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), 459 | set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), 460 | }; 461 | __m128i counter_low_vec, counter_high_vec; 462 | load_counters(counter, increment_counter, &counter_low_vec, 463 | &counter_high_vec); 464 | uint8_t block_flags = flags | flags_start; 465 | 466 | for (size_t block = 0; block < blocks; block++) { 467 | if (block + 1 == blocks) { 468 | block_flags |= flags_end; 469 | } 470 | __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); 471 | __m128i block_flags_vec = set1(block_flags); 472 | __m128i msg_vecs[16]; 473 | transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); 474 | 475 | __m128i v[16] = { 476 | h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], 477 | h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], 478 | set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), 479 | counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, 480 | }; 481 | round_fn(v, msg_vecs, 0); 482 | round_fn(v, msg_vecs, 1); 483 | round_fn(v, msg_vecs, 2); 484 | round_fn(v, msg_vecs, 3); 485 | round_fn(v, msg_vecs, 4); 486 | round_fn(v, msg_vecs, 5); 487 | round_fn(v, msg_vecs, 6); 488 | h_vecs[0] = xorv(v[0], v[8]); 489 | h_vecs[1] = xorv(v[1], v[9]); 490 | h_vecs[2] = xorv(v[2], v[10]); 491 | h_vecs[3] = xorv(v[3], v[11]); 492 | h_vecs[4] = xorv(v[4], v[12]); 493 | h_vecs[5] = xorv(v[5], v[13]); 494 | h_vecs[6] = xorv(v[6], v[14]); 495 | h_vecs[7] = xorv(v[7], v[15]); 496 | 497 | block_flags = flags; 498 | } 499 | 500 | transpose_vecs(&h_vecs[0]); 501 | transpose_vecs(&h_vecs[4]); 502 | // The first four vecs now contain the first half of each output, and the 503 | // second four vecs contain the second half of each output. 
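// Same output layout as the SSE2 path: h_vecs[i] and h_vecs[i+4] are the
// low and high halves of output i, stored below as four contiguous 32-byte
// chaining values.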
504 | storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); 505 | storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); 506 | storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); 507 | storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); 508 | storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); 509 | storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); 510 | storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); 511 | storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); 512 | } 513 | 514 | INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, 515 | const uint32_t key[8], uint64_t counter, 516 | uint8_t flags, uint8_t flags_start, 517 | uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { 518 | uint32_t cv[8]; 519 | memcpy(cv, key, BLAKE3_KEY_LEN); 520 | uint8_t block_flags = flags | flags_start; 521 | while (blocks > 0) { 522 | if (blocks == 1) { 523 | block_flags |= flags_end; 524 | } 525 | blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, 526 | block_flags); 527 | input = &input[BLAKE3_BLOCK_LEN]; 528 | blocks -= 1; 529 | block_flags = flags; 530 | } 531 | memcpy(out, cv, BLAKE3_OUT_LEN); 532 | } 533 | 534 | void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, 535 | size_t blocks, const uint32_t key[8], 536 | uint64_t counter, bool increment_counter, 537 | uint8_t flags, uint8_t flags_start, 538 | uint8_t flags_end, uint8_t *out) { 539 | while (num_inputs >= DEGREE) { 540 | blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, 541 | flags_start, flags_end, out); 542 | if (increment_counter) { 543 | counter += DEGREE; 544 | } 545 | inputs += DEGREE; 546 | num_inputs -= DEGREE; 547 | out = &out[DEGREE * BLAKE3_OUT_LEN]; 548 | } 549 | while (num_inputs > 0) { 550 | hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, 551 | flags_end, out); 552 | if (increment_counter) { 553 | counter += 1; 554 | } 555 | inputs += 1; 556 | num_inputs -= 1; 557 | out = &out[BLAKE3_OUT_LEN]; 558 | } 559 | } 560 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | docker build --tag "blake3build:latest" . 
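# The container created below is never started; blake3.so already exists in
# the image (presumably produced during the image build), and the container
# only exists so `docker cp` can copy it into ./compiled before the
# container is removed.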
2 | docker create --name blake3build_container blake3build 3 | #/making/modules/blake3.so 4 | docker cp blake3build_container:/making/modules/blake3.so ./compiled/blake3.so 5 | docker rm blake3build_container -------------------------------------------------------------------------------- /compiled/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cypherbits/php-blake3/c269f1a25436fec48c04e8b771a8bbbfd5c198ac/compiled/.gitkeep -------------------------------------------------------------------------------- /config.m4: -------------------------------------------------------------------------------- 1 | PHP_ARG_ENABLE(blake3, 2 | [Whether to enable BLAKE3 support], 3 | [--enable-blake3 Enable BLAKE3 Extension]) 4 | 5 | if test "$PHP_BLAKE3" != "no"; then 6 | PHP_NEW_EXTENSION(blake3, php_blake3.c blake3.c blake3_dispatch.c blake3_portable.c blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S blake3_avx512_x86-64_unix.S, $ext_shared) 7 | fi -------------------------------------------------------------------------------- /config.w32: -------------------------------------------------------------------------------- 1 | ARG_ENABLE('blake3', 'blake3 support', 'no'); 2 | 3 | if (PHP_BLAKE3 != 'no') { 4 | AC_DEFINE('HAVE_BLAKE3', 1, 'blake3 support enabled'); 5 | 6 | EXTENSION('blake3', 'php_blake3.c', true, '/DZEND_ENABLE_STATIC_TSRMLS_CACHE=1'); 7 | ADD_SOURCES(configure_module_dirname, 'blake3b-ref.c blake3s-ref.c', 'blake3'); 8 | } 9 | -------------------------------------------------------------------------------- /php_blake3.c: -------------------------------------------------------------------------------- 1 | #ifdef HAVE_CONFIG_H 2 | #include "config.h" 3 | #endif 4 | 5 | #include "php.h" 6 | #include "ext/standard/info.h" 7 | #include "ext/hash/php_hash.h" 8 | #include "blake3.h" 9 | #include "php_blake3.h" 10 | 11 | #define PHP_BLAKE3_NAME "BLAKE3" 12 | #define PHP_BLAKE3_VERSION "0.1.0" 13 | 14 | ZEND_BEGIN_ARG_INFO_EX(arginfo_void, 0, 0, 0) 15 | ZEND_END_ARG_INFO() 16 | 17 | ZEND_BEGIN_ARG_INFO_EX(arginfo_blake3, 0, 0, 1) 18 | ZEND_ARG_INFO(0, str) 19 | ZEND_ARG_INFO(0, outputSize) 20 | ZEND_ARG_INFO(0, key) 21 | ZEND_ARG_INFO(0, rawOutput) 22 | ZEND_END_ARG_INFO() 23 | 24 | ZEND_BEGIN_ARG_INFO_EX(arginfo_blake3_file, 0, 0, 1) 25 | ZEND_ARG_INFO(0, filename) 26 | ZEND_ARG_INFO(0, rawOutput) 27 | ZEND_END_ARG_INFO() 28 | 29 | zend_function_entry blake3_functions[] = { 30 | PHP_FE(blake3, arginfo_blake3) 31 | PHP_FE(blake3_file, arginfo_blake3_file) 32 | {NULL, NULL, NULL} 33 | }; 34 | 35 | PHP_MINIT_FUNCTION(blake3){ 36 | REGISTER_LONG_CONSTANT("BLAKE3_OUT_LEN", 37 | BLAKE3_OUT_LEN, CONST_CS | CONST_PERSISTENT); 38 | } 39 | 40 | zend_module_entry blake3_module_entry = { 41 | #if ZEND_MODULE_API_NO >= 20010901 42 | STANDARD_MODULE_HEADER, 43 | #endif 44 | PHP_BLAKE3_NAME, 45 | blake3_functions, 46 | PHP_MINIT(blake3), 47 | NULL, 48 | NULL, 49 | NULL, 50 | NULL, 51 | #if ZEND_MODULE_API_NO >= 20010901 52 | PHP_BLAKE3_VERSION, 53 | #endif 54 | STANDARD_MODULE_PROPERTIES 55 | }; 56 | 57 | #ifdef COMPILE_DL_BLAKE3 58 | ZEND_GET_MODULE(blake3) 59 | #endif 60 | 61 | PHP_FUNCTION(blake3) 62 | { 63 | #if ZEND_MODULE_API_NO >= 20151012 64 | zend_long hashByteLength = BLAKE3_OUT_LEN; 65 | size_t dataByteLength; 66 | size_t keyLength = 0; 67 | #else 68 | long hashByteLength = BLAKE3_OUT_LEN; 69 | int dataByteLength; 70 | int keyLength = 0; 71 | #endif 72 | unsigned char *data; 73 | unsigned char 
*key; 74 | zend_bool rawOutput = 0; 75 | 76 | if (zend_parse_parameters(ZEND_NUM_ARGS(), "s|lsb", &data, &dataByteLength, &hashByteLength, &key, &keyLength, &rawOutput) == FAILURE) { 77 | return; 78 | } 79 | 80 | zend_bool hasError = 0; 81 | 82 | if (hashByteLength < 1) { 83 | hasError = 1; 84 | zend_error(E_ERROR, "BLAKE3 output length cannot be zero"); 85 | } 86 | 87 | if (keyLength > 0 && keyLength != BLAKE3_KEY_LEN) { 88 | hasError = 1; 89 | zend_error(E_ERROR, "BLAKE3 key length MUST be 32 bytes"); 90 | } 91 | 92 | if (hasError) { 93 | RETURN_FALSE; 94 | } 95 | 96 | char* hashOutput = (unsigned char*) emalloc(hashByteLength); 97 | 98 | int result = blake3(hashOutput, hashByteLength, data, dataByteLength, key, keyLength); 99 | 100 | if (result != 0) { 101 | zend_error(E_ERROR, "Error generating BLAKE3 hash"); 102 | efree(hashOutput); 103 | RETURN_FALSE; 104 | } 105 | 106 | if (rawOutput) { 107 | #if ZEND_MODULE_API_NO >= 20151012 108 | RETVAL_STRINGL(hashOutput, hashByteLength); 109 | #else 110 | RETVAL_STRINGL(hashOutput, hashByteLength, 1); 111 | #endif 112 | } else { 113 | char* hex = (char*) emalloc(hashByteLength * 2 + 1); 114 | php_hash_bin2hex(hex, (unsigned char *) hashOutput, hashByteLength); 115 | hex[hashByteLength * 2] = '\0'; 116 | 117 | #if ZEND_MODULE_API_NO >= 20151012 118 | RETVAL_STRING(hex); 119 | #else 120 | RETVAL_STRING(hex,1); 121 | #endif 122 | 123 | efree(hex); 124 | } 125 | 126 | efree(hashOutput); 127 | } 128 | 129 | PHP_FUNCTION(blake3_file) 130 | { 131 | #if ZEND_MODULE_API_NO >= 20151012 132 | zend_long hashByteLength = BLAKE3_OUT_LEN; 133 | size_t dataByteLength; 134 | #else 135 | long hashByteLength = BLAKE3_OUT_LEN; 136 | int dataByteLength; 137 | #endif 138 | 139 | char *data; 140 | int rawOutput = 0; 141 | 142 | php_stream *stream; 143 | int n; 144 | unsigned char buf[1024]; 145 | 146 | blake3_hasher hasher; 147 | 148 | if (zend_parse_parameters(ZEND_NUM_ARGS(), "p|b", &data, &dataByteLength, &rawOutput) == FAILURE) { 149 | return; 150 | } 151 | 152 | stream = php_stream_open_wrapper(data, "rb", REPORT_ERRORS, NULL); 153 | if (!stream) { 154 | RETURN_FALSE; 155 | } 156 | 157 | char* hashOutput = (char*) emalloc(hashByteLength); 158 | 159 | blake3_hasher_init(&hasher); 160 | 161 | while ((n = php_stream_read(stream, buf, sizeof(buf))) > 0) { 162 | blake3_hasher_update(&hasher, (const uint8_t *)buf, n); 163 | } 164 | 165 | blake3_hasher_finalize(&hasher, hashOutput, hashByteLength); 166 | 167 | php_stream_close(stream); 168 | 169 | if (n<0) { 170 | efree(hashOutput); 171 | RETURN_FALSE; 172 | } 173 | 174 | if (rawOutput) { 175 | #if ZEND_MODULE_API_NO >= 20151012 176 | RETVAL_STRINGL(hashOutput, hashByteLength); 177 | #else 178 | RETVAL_STRINGL(hashOutput, hashByteLength, 1); 179 | #endif 180 | } else { 181 | char* hex = (char*) emalloc(hashByteLength * 2 + 1); 182 | php_hash_bin2hex(hex, (unsigned char *) hashOutput, hashByteLength); 183 | hex[hashByteLength * 2] = '\0'; 184 | #if ZEND_MODULE_API_NO >= 20151012 185 | RETVAL_STRING(hex); 186 | #else 187 | RETVAL_STRING(hex,1); 188 | #endif 189 | efree(hex); 190 | } 191 | 192 | efree(hashOutput); 193 | } 194 | -------------------------------------------------------------------------------- /php_blake3.h: -------------------------------------------------------------------------------- 1 | #ifndef PHP_BLAKE3_H 2 | #define PHP_BLAKE3_H 3 | 4 | PHP_FUNCTION(blake3); 5 | PHP_FUNCTION(blake3_file); 6 | 7 | extern zend_module_entry blake3_module_entry; 8 | #define phpext_blake3_ptr &blake3_module_entry 9 | 10 | 
#endif 11 | --------------------------------------------------------------------------------