├── .gitignore ├── src ├── .gitignore ├── FindHashtree.cmake ├── hashtree.h ├── lib.rs ├── hashtree.c ├── Makefile ├── sha256_armv8_crypto.S ├── bench.c ├── sha256_generic.c ├── sha256_armv8_neon_x1.S ├── sha256_avx_x4.S ├── sha256_sse_x1.S ├── sha256_avx_x1.S └── sha256_armv8_neon_x4.S ├── hashtree_amd64.syso ├── hashtree_linux_arm64.syso ├── hashtree_darwin_arm64.syso ├── hashtree_windows_amd64.syso ├── go.mod ├── .clang-format ├── go.sum ├── bindings_arm64.go ├── Dockerfile ├── bindings_amd64.go ├── wrapper_arm64.s ├── Makefile ├── tests └── test_hashtree_abi.nim ├── Cargo.toml ├── hashtree_abi.nimble ├── wrapper_linux_amd64.s ├── wrapper_windows_amd64.s ├── LICENSE ├── examples └── basic_usage.rs ├── hashtree_abi.nim ├── bindings.go ├── .github └── workflows │ └── test.yml ├── sha256_1_generic.go ├── README.md └── bindings_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | Cargo.lock 2 | target 3 | build 4 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | hashtree.pc 3 | libhashtree.a 4 | libhashtree.lib 5 | -------------------------------------------------------------------------------- /hashtree_amd64.syso: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OffchainLabs/hashtree/HEAD/hashtree_amd64.syso -------------------------------------------------------------------------------- /hashtree_linux_arm64.syso: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OffchainLabs/hashtree/HEAD/hashtree_linux_arm64.syso -------------------------------------------------------------------------------- /hashtree_darwin_arm64.syso: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OffchainLabs/hashtree/HEAD/hashtree_darwin_arm64.syso -------------------------------------------------------------------------------- /hashtree_windows_amd64.syso: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OffchainLabs/hashtree/HEAD/hashtree_windows_amd64.syso -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/OffchainLabs/hashtree 2 | 3 | go 1.22.2 4 | 5 | require github.com/klauspost/cpuid/v2 v2.0.9 6 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | Language: Cpp 2 | BasedOnStyle: Google 3 | ColumnLimit: 120 4 | ConstructorInitializerIndentWidth: 4 5 | ContinuationIndentWidth: 4 6 | IndentWidth: 4 7 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4= 2 | github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= 3 | -------------------------------------------------------------------------------- /bindings_arm64.go: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | package hashtree 5 | 6 | import ( 7 | "github.com/klauspost/cpuid/v2" 8 | ) 9 | 10 | var hasShani = cpuid.CPU.Supports(cpuid.SHA2) 11 | var supportedCPU = true 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rust:latest 2 | 3 | ARG TARGET 4 | 5 | RUN rustup target add 
$TARGET 6 | 7 | COPY . /usr/src/hashtree 8 | WORKDIR /usr/src/hashtree 9 | 10 | RUN cargo build --release --target $TARGET 11 | 12 | RUN cargo test --target $TARGET -------------------------------------------------------------------------------- /bindings_amd64.go: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | package hashtree 5 | 6 | import ( 7 | "github.com/klauspost/cpuid/v2" 8 | ) 9 | 10 | var hasAVX512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512VL) 11 | var hasAVX2 = cpuid.CPU.Supports(cpuid.AVX2, cpuid.BMI2) 12 | var hasShani = cpuid.CPU.Supports(cpuid.SHA, cpuid.AVX) 13 | var supportedCPU = hasAVX2 || hasShani || hasAVX512 14 | -------------------------------------------------------------------------------- /wrapper_arm64.s: -------------------------------------------------------------------------------- 1 | // +build arm64 2 | 3 | TEXT ·HashtreeHash(SB), 0, $2048-24 4 | MOVD output+0(FP), R0 5 | MOVD input+8(FP), R1 6 | MOVD count+16(FP), R2 7 | 8 | 9 | #define check_shani R19 10 | MOVBU ·hasShani(SB), check_shani 11 | CBNZ check_shani, shani 12 | 13 | BL hashtree_sha256_neon_x4(SB) 14 | RET 15 | 16 | shani: 17 | BL hashtree_sha256_sha_x1(SB) 18 | RET 19 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | libhashtree: 2 | $(MAKE) -C src 3 | 4 | clean: 5 | $(MAKE) -C src clean 6 | cargo clean 7 | 8 | test: 9 | $(MAKE) -C src test 10 | 11 | bench: 12 | $(MAKE) -C src bench 13 | 14 | .PHONY: rust_bindings rust_tests go_bindings 15 | rust_bindings: 16 | cd rust_bindings && cargo build --release 17 | rust_tests: 18 | cd rust_bindings && cargo test 19 | go_bindings: 20 | $(MAKE) -C src go_bindings 21 | all: 22 | $(MAKE) -C src all 23 | 24 | install: 25 | $(MAKE) -C src install 26 | 27 | uninstall: 28 | $(MAKE) -C src uninstall 29 | 
-------------------------------------------------------------------------------- /tests/test_hashtree_abi.nim: -------------------------------------------------------------------------------- 1 | # hashtree nim bindings 2 | # Copyright (c) 2024 Status Research & Development GmbH 3 | # Licensed and distributed under 4 | # * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT). 5 | # This file may not be copied, modified, or distributed except according to those terms. 6 | 7 | import ../hashtree_abi 8 | 9 | func test() {.raises: [].} = 10 | var data: array[64, byte] 11 | hashtree_hash(addr data[0], addr data[0], 1) 12 | 13 | doAssert data[0] == 245 14 | when isMainModule: 15 | test() 16 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hashtree-rs" 3 | version = "0.2.0" 4 | authors = ["Potuz "] 5 | edition = "2021" 6 | description = "Rust bindings for the hashtree library" 7 | documentation = "https://github.com/OffchainLabs/hashtree" 8 | repository = "https://github.com/OffchainLabs/hashtree" 9 | homepage = "https://github.com/OffchainLabs/hashtree" 10 | license = "MIT" 11 | build = "build.rs" 12 | keywords = ["hash", "crypto", "sha256", "merkle"] 13 | include = ["build.rs", "Cargo.toml", "src/lib.rs", "src/*.c", "src/*.h", "src/*.S", "Makefile", "src/Makefile"] 14 | 15 | [lib] 16 | crate-type = ["cdylib", "rlib"] 17 | 18 | [dependencies] 19 | libc = "0.2" 20 | 21 | [build-dependencies] 22 | cc = "1.0" 23 | -------------------------------------------------------------------------------- /hashtree_abi.nimble: -------------------------------------------------------------------------------- 1 | # hashtree nim bindings 2 | # Copyright (c) 2024 Status Research & Development GmbH 3 | # Licensed and distributed under 4 | # * MIT license (license terms in the root directory or at 
https://opensource.org/licenses/MIT). 5 | # This file may not be copied, modified, or distributed except according to those terms. 6 | 7 | mode = ScriptMode.Verbose 8 | 9 | packageName = "hashtree_abi" 10 | version = "0.2.0" 11 | author = "Prysmatic labs, Status Research and Development GmbH" 12 | description = "Low-level ABI package for hashtree, a SHA256 implementation specialized for merkle trees and 64-byte chunks" 13 | license = "MIT" 14 | installDirs = @["src"] 15 | installFiles = @["hashtree_abi.nim"] 16 | 17 | requires "nim >= 1.6.0" 18 | -------------------------------------------------------------------------------- /wrapper_linux_amd64.s: -------------------------------------------------------------------------------- 1 | // +build linux,amd64 2 | 3 | TEXT ·HashtreeHash(SB), 0, $2048-24 4 | MOVQ output+0(FP), DI 5 | MOVQ input+8(FP), SI 6 | MOVQ count+16(FP), DX 7 | 8 | MOVQ SP, BX 9 | ADDQ $2048, SP 10 | ANDQ $~31, SP 11 | 12 | CMPB ·hasShani(SB), $1 13 | JE shani 14 | CMPB ·hasAVX512(SB), $1 15 | JE avx512 16 | CMPB ·hasAVX2(SB), $1 17 | JE avx2 18 | CALL hashtree_sha256_avx_x1(SB) 19 | JMP epilog 20 | 21 | shani: 22 | CALL hashtree_sha256_shani_x2(SB) 23 | JMP epilog 24 | 25 | avx512: 26 | CALL hashtree_sha256_avx512_x16(SB) 27 | JMP epilog 28 | 29 | avx2: 30 | CALL hashtree_sha256_avx2_x8(SB) 31 | 32 | epilog: 33 | MOVQ BX, SP 34 | RET 35 | -------------------------------------------------------------------------------- /wrapper_windows_amd64.s: -------------------------------------------------------------------------------- 1 | // +build windows,amd64 2 | 3 | TEXT ·HashtreeHash(SB), 0, $1024-24 4 | MOVQ output+0(FP), CX 5 | MOVQ input+8(FP), DX 6 | MOVQ R8, R12 // R12 is saved on windows 7 | MOVQ count+16(FP), R8 8 | 9 | MOVQ SP, BX 10 | ADDQ $2048, SP 11 | ANDQ $~31, SP 12 | 13 | CMPB ·hasShani(SB), $1 14 | JE shani 15 | CMPB ·hasAVX512(SB), $1 16 | JE avx512 17 | CMPB ·hasAVX2(SB), $1 18 | JE avx2 19 | CALL hashtree_sha256_avx_x1(SB) 20 | JMP 
epilog 21 | 22 | shani: 23 | CALL hashtree_sha256_shani_x2(SB) 24 | JMP epilog 25 | 26 | avx512: 27 | CALL hashtree_sha256_avx512_x16(SB) 28 | JMP epilog 29 | 30 | avx2: 31 | CALL hashtree_sha256_avx2_x8(SB) 32 | 33 | epilog: 34 | MOVQ BX, SP 35 | MOVQ R12, R8 36 | RET 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Prysmatic Labs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/FindHashtree.cmake: -------------------------------------------------------------------------------- 1 | #[=======================================================================[.rst: 2 | FindHashtree 3 | ----------- 4 | 5 | Find the Hashtree library 6 | 7 | IMPORTED targets 8 | ^^^^^^^^^^^^^^^^ 9 | 10 | This module defines the following :prop_tgt:`IMPORTED` target: 11 | 12 | ``Hashtree::Hashtree`` 13 | 14 | Result variables 15 | ^^^^^^^^^^^^^^^^ 16 | 17 | This module will set the following variables if found: 18 | 19 | ``Hashtree_INCLUDE_DIRS`` 20 | where to find hashtree.h 21 | ``Hashtree_LIBRARIES`` 22 | the libraries to link against to use Hashtree. 23 | ``Hashtree_FOUND`` 24 | TRUE if found 25 | 26 | #]=======================================================================] 27 | 28 | # Look for the necessary header 29 | find_path(Hashtree_INCLUDE_DIR NAMES hashtree.h) 30 | mark_as_advanced(Hashtree_INCLUDE_DIR) 31 | 32 | # Look for the necessary library 33 | find_library(Hashtree_LIBRARY NAMES hashtree) 34 | mark_as_advanced(Hashtree_LIBRARY) 35 | 36 | include(FindPackageHandleStandardArgs) 37 | find_package_handle_standard_args(Hashtree 38 | REQUIRED_VARS Hashtree_INCLUDE_DIR Hashtree_LIBRARY 39 | ) 40 | 41 | # Create the imported target 42 | if(Hashtree_FOUND) 43 | set(Hashtree_INCLUDE_DIRS ${Hashtree_INCLUDE_DIR}) 44 | set(Hashtree_LIBRARIES ${Hashtree_LIBRARY}) 45 | if(NOT TARGET Hashtree::Hashtree) 46 | add_library(Hashtree::Hashtree UNKNOWN IMPORTED) 47 | set_target_properties(Hashtree::Hashtree PROPERTIES 48 | IMPORTED_LOCATION "${Hashtree_LIBRARY}" 49 | INTERFACE_INCLUDE_DIRECTORIES "${Hashtree_INCLUDE_DIR}") 50 | endif() 51 | endif() 52 | -------------------------------------------------------------------------------- /examples/basic_usage.rs: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | 
Copyright (c) 2021-2024 Prysmatic Labs 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | */ 24 | 25 | 26 | extern crate hashtree_rs; 27 | 28 | fn main() { 29 | println!("Initializing hashtree..."); 30 | hashtree_rs::init(); 31 | println!("Hashtree initialized."); 32 | 33 | let chunks: [u8; 64] = [0xAB; 64]; 34 | let mut out = [0u8; 32]; 35 | 36 | hashtree_rs::hash(&mut out, &chunks, 1); 37 | 38 | let hex_string: String = out.iter().map(|byte| format!("{:02x}", byte)).collect(); 39 | 40 | println!("Computed hash: 0x{}", hex_string); 41 | } 42 | -------------------------------------------------------------------------------- /hashtree_abi.nim: -------------------------------------------------------------------------------- 1 | # hashtree nim bindings 2 | # Copyright (c) 2024 Status Research & Development GmbH 3 | # Licensed and distributed under 4 | # * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT). 5 | # This file may not be copied, modified, or distributed except according to those terms. 6 | 7 | {.pragma: hashtreedecl, importc, cdecl, gcsafe, raises: [].} 8 | 9 | import std/[os, strutils] 10 | 11 | const srcDir = currentSourcePath.parentDir.replace('\\', '/') & "/src/" 12 | 13 | {.compile: srcDir & "hashtree.c".} 14 | {.compile: srcDir & "sha256_generic.c".} 15 | 16 | # The assember files use gnu/binutils-specific macros and lack mac support in 17 | # general 18 | when 19 | ((defined(linux) or defined(windows)) and defined(gcc)) or 20 | (defined(linux) and defined(clang)) or 21 | (defined(macosx) and defined(clang) and defined(arm64)): 22 | 23 | const cflags = 24 | when defined(clang) and (defined(linux) or defined(macosx)): 25 | # The integrated `clang` assembler uses a different macro syntax but on 26 | # linux and macos we can convince it to use the system assembler which _tends_ to be 27 | # the binutils variant 28 | "-fno-integrated-as" 29 | else: 30 | "" 31 | 32 | when defined(arm64): 33 | {.compile(srcDir & "sha256_armv8_crypto.S", cflags).} 34 | {.compile(srcDir & "sha256_armv8_neon_x1.S", 
cflags).} 35 | {.compile(srcDir & "sha256_armv8_neon_x4.S", cflags).} 36 | 37 | elif defined(amd64): 38 | {.compile(srcDir & "sha256_avx_x1.S", cflags).} 39 | {.compile(srcDir & "sha256_avx_x4.S", cflags).} 40 | {.compile(srcDir & "sha256_avx_x8.S", cflags).} 41 | {.compile(srcDir & "sha256_avx_x16.S", cflags).} 42 | {.compile(srcDir & "sha256_shani.S", cflags).} 43 | {.compile(srcDir & "sha256_sse_x1.S", cflags).} 44 | 45 | type HashFcn* = proc(output: pointer, input: pointer, count: uint64) {. 46 | cdecl, noSideEffect, gcsafe, raises: [].} 47 | 48 | proc hashtree_init*(override: HashFcn) {.hashtreedecl.} 49 | func hashtree_hash*(output: pointer, input: pointer, count: uint64) {. 50 | hashtreedecl.} 51 | -------------------------------------------------------------------------------- /bindings.go: -------------------------------------------------------------------------------- 1 | package hashtree 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "unsafe" 7 | ) 8 | 9 | var ( 10 | // ErrOddChunks is returned when the number of chunks is odd. 11 | ErrOddChunks = errors.New("odd number of chunks") 12 | // ErrNotEnoughDigests is returned when the number of digests is not enough. 13 | ErrNotEnoughDigests = errors.New("not enough digest length") 14 | // ErrChunksNotMultipleOf64 is returned when the chunks are not multiple of 64 bytes. 15 | ErrChunksNotMultipleOf64 = errors.New("chunks not multiple of 64 bytes") 16 | // ErrDigestsNotMultipleOf32 is returned when the digests are not multiple of 32 bytes. 17 | ErrDigestsNotMultipleOf32 = errors.New("digests not multiple of 32 bytes") 18 | ) 19 | 20 | //go:noescape 21 | func HashtreeHash(output *byte, input *byte, count uint64) 22 | 23 | // Hash hashes the chunks two at the time and outputs the digests on the first 24 | // argument. It does check for lengths on the inputs. 
25 | func Hash(digests [][32]byte, chunks [][32]byte) error { 26 | if len(chunks) == 0 { 27 | return nil 28 | } 29 | 30 | if len(chunks)%2 == 1 { 31 | return ErrOddChunks 32 | } 33 | if len(digests) < len(chunks)/2 { 34 | return fmt.Errorf("%w: need at least %v, got %v", ErrNotEnoughDigests, len(chunks)/2, len(digests)) 35 | } 36 | if supportedCPU { 37 | HashtreeHash(&digests[0][0], &chunks[0][0], uint64(len(chunks)/2)) 38 | } else { 39 | sha256_1_generic(digests, chunks) 40 | } 41 | return nil 42 | } 43 | 44 | // HashByteSlice is the same as hash but it takes byte slices instead of slices of arrays. 45 | func HashByteSlice(digests []byte, chunks []byte) error { 46 | if len(chunks) == 0 { 47 | return nil 48 | } 49 | 50 | if len(chunks)%64 != 0 { 51 | return ErrChunksNotMultipleOf64 52 | } 53 | 54 | if len(digests)%32 != 0 { 55 | return ErrDigestsNotMultipleOf32 56 | } 57 | 58 | if len(digests) < len(chunks)/2 { 59 | return fmt.Errorf("%w: need at least %v, got %v", ErrNotEnoughDigests, len(chunks)/2, len(digests)) 60 | } 61 | // We use an unsafe pointer to cast []byte to [][32]byte. The length and 62 | // capacity of the slice need to be divided accordingly by 32. 
63 | sizeChunks := (len(chunks) >> 5) 64 | chunkedChunks := unsafe.Slice((*[32]byte)(unsafe.Pointer(&chunks[0])), sizeChunks) 65 | 66 | sizeDigests := (len(digests) >> 5) 67 | chunkedDigest := unsafe.Slice((*[32]byte)(unsafe.Pointer(&digests[0])), sizeDigests) 68 | if supportedCPU { 69 | Hash(chunkedDigest, chunkedChunks) 70 | } else { 71 | sha256_1_generic(chunkedDigest, chunkedChunks) 72 | } 73 | return nil 74 | } 75 | -------------------------------------------------------------------------------- /src/hashtree.h: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021-2024 Prysmatic Labs 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | */ 24 | #ifndef HASHTREE_H 25 | #define HASHTREE_H 26 | 27 | #include 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | typedef void (*hashtree_hash_fcn)(unsigned char*, const unsigned char*, uint64_t); 34 | 35 | /** Initialize the library to use the given hash tree function or perform 36 | * auto-detection based on the CPU if `NULL` is given. 37 | * 38 | * Calling this function is optional - if it is not called, auto-detection happens 39 | * automatically on the first hash computation. 40 | */ 41 | void hashtree_init(hashtree_hash_fcn override); 42 | 43 | /* Undefined behavior if called without appropriate hardware support */ 44 | void hashtree_hash(unsigned char* output, const unsigned char* input, uint64_t count); 45 | void hashtree_sha256_generic(unsigned char* output, const unsigned char* input, uint64_t count); 46 | 47 | #ifdef __aarch64__ 48 | void hashtree_sha256_neon_x1(unsigned char* output, const unsigned char* input, uint64_t count); 49 | void hashtree_sha256_neon_x4(unsigned char* output, const unsigned char* input, uint64_t count); 50 | void hashtree_sha256_sha_x1(unsigned char* output, const unsigned char* input, uint64_t count); 51 | #endif 52 | 53 | #ifdef __x86_64__ 54 | void hashtree_sha256_sse_x1(unsigned char* output, const unsigned char* input, uint64_t count); 55 | void hashtree_sha256_avx_x1(unsigned char* output, const unsigned char* input, uint64_t count); 56 | void hashtree_sha256_avx_x4(unsigned char* output, const unsigned char* input, uint64_t count); 57 | void hashtree_sha256_avx2_x8(unsigned char* output, const unsigned char* input, uint64_t count); 58 | void hashtree_sha256_avx512_x16(unsigned char* output, const unsigned char* input, uint64_t count); 59 | void hashtree_sha256_shani_x2(unsigned char* output, const unsigned char* input, uint64_t count); 60 | #endif 61 | #ifdef __cplusplus 62 | } 63 | #endif 64 | #endif 65 | -------------------------------------------------------------------------------- 
/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Hasthree rust bindings 2 | //! 3 | //! hashtree is a SHA256 library highly optimized for Merkle tree computations. It is based on 4 | //! Intel's implementation (intel-ipsec-mb) with a few modifications like hardcoding the scheduled 5 | //! words. This library exposes a single function that takes an input slice of bytes to be 6 | //! considered as chunks of 64 bytes each, and another slice where the digests of each chunk will 7 | //! be written consecutively 8 | //! 9 | 10 | /* 11 | MIT License 12 | 13 | Copyright (c) 2021-2024 Prysmatic Labs 14 | 15 | Permission is hereby granted, free of charge, to any person obtaining a copy 16 | of this software and associated documentation files (the "Software"), to deal 17 | in the Software without restriction, including without limitation the rights 18 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 19 | copies of the Software, and to permit persons to whom the Software is 20 | furnished to do so, subject to the following conditions: 21 | 22 | The above copyright notice and this permission notice shall be included in all 23 | copies or substantial portions of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 26 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 27 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 28 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 29 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 30 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 | SOFTWARE. 
32 | */ 33 | 34 | extern crate libc; 35 | use libc::c_int; 36 | use std::ptr; 37 | 38 | type HashFunction = extern "C" fn(*mut u8, *const u8, u64); 39 | 40 | extern "C" { 41 | fn hashtree_init(override_: *const HashFunction) -> c_int; 42 | fn hashtree_hash(output: *mut u8, input: *const u8, count: u64); 43 | } 44 | 45 | /// init is used to initialize the hashtree library. It automatically chooses the best 46 | /// implementation. 47 | pub fn init() -> i32 { 48 | unsafe { hashtree_init(ptr::null()) } 49 | } 50 | 51 | /// hash takes a mutable slice where the digests will be stored (overwritten), a slice with the 52 | /// chunks to merkleize and the number of chunks to merkleize 53 | pub fn hash(out: &mut [u8], chunks: &[u8], count: usize) { 54 | unsafe { hashtree_hash(out.as_mut_ptr(), chunks.as_ptr(), count as u64) } 55 | } 56 | 57 | #[cfg(test)] 58 | mod tests { 59 | use super::*; 60 | 61 | #[test] 62 | fn test_init() { 63 | init(); // test passes if this doesn't panic 64 | } 65 | 66 | #[test] 67 | fn test_hash() { 68 | let chunks: [u8; 64] = [0xAB; 64]; 69 | let mut out = [0u8; 32]; 70 | 71 | hash(&mut out, &chunks, 1); 72 | 73 | let expected_hash: [u8; 32] = [ 74 | 0xec, 0x65, 0xc8, 0x79, 0x8e, 0xcf, 0x95, 0x90, 0x24, 0x13, 0xc4, 0x0f, 0x7b, 0x9e, 75 | 0x6d, 0x4b, 0x00, 0x68, 0x88, 0x5f, 0x5f, 0x32, 0x4a, 0xba, 0x1f, 0x9b, 0xa1, 0xc8, 76 | 0xe1, 0x4a, 0xea, 0x61, 77 | ]; 78 | 79 | assert_eq!( 80 | out, expected_hash, 81 | "The generated hash did not match the expected hash." 
82 | ); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/hashtree.c: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021-2024 Prysmatic Labs 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | */ 24 | 25 | #include "hashtree.h" 26 | 27 | #include 28 | #ifdef __x86_64__ 29 | #include 30 | #endif 31 | #ifdef __aarch64__ 32 | #ifndef __APPLE__ 33 | #include 34 | #include 35 | #endif 36 | #endif 37 | 38 | static void init_and_hash(unsigned char *output, const unsigned char *input, uint64_t count); 39 | 40 | static hashtree_hash_fcn hash_ptr = init_and_hash; 41 | 42 | static hashtree_hash_fcn hashtree_detect() { 43 | #ifdef __x86_64__ 44 | uint32_t a = 0, b = 0, c = 0, d = 0; 45 | __get_cpuid_count(7, 0, &a, &b, &c, &d); 46 | 47 | if (b & bit_SHA) { 48 | /* Although AVX512 may be faster for full 16-block hashes, SHANI 49 | outperforms it significantly on smaller lists - thus, avoid pathological 50 | behavior. */ 51 | return &hashtree_sha256_shani_x2; 52 | } 53 | if ((b & bit_AVX512F) && (b & bit_AVX512VL)) { 54 | return &hashtree_sha256_avx512_x16; 55 | } 56 | if (b & bit_AVX2) { 57 | return &hashtree_sha256_avx2_x8; 58 | } 59 | __get_cpuid_count(1, 0, &a, &b, &c, &d); 60 | if (c & bit_AVX) { 61 | return &hashtree_sha256_avx_x4; 62 | } 63 | if (c & bit_AVX) { 64 | return &hashtree_sha256_sse_x1; 65 | } 66 | #endif 67 | #ifdef __aarch64__ 68 | #ifdef __APPLE__ 69 | return &hashtree_sha256_sha_x1; 70 | #else 71 | long hwcaps = getauxval(AT_HWCAP); 72 | if (hwcaps & HWCAP_SHA2) { 73 | return &hashtree_sha256_sha_x1; 74 | } 75 | 76 | if (hwcaps & HWCAP_ASIMD) { 77 | return &hashtree_sha256_neon_x4; 78 | } 79 | #endif 80 | #endif 81 | return &hashtree_sha256_generic; 82 | } 83 | 84 | void hashtree_init(hashtree_hash_fcn override) { 85 | if (override) { 86 | hash_ptr = override; 87 | } else { 88 | hash_ptr = hashtree_detect(); 89 | } 90 | } 91 | 92 | void hashtree_hash(unsigned char *output, const unsigned char *input, uint64_t count) { 93 | (*hash_ptr)(output, input, count); 94 | } 95 | 96 | static void init_and_hash(unsigned char *output, const unsigned char *input, uint64_t count) { 97 | hash_ptr = hashtree_detect(); 98 | assert(hash_ptr); 99 | 100 | 
hashtree_hash(output, input, count); 101 | } 102 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | on: [pull_request, workflow_dispatch] 3 | 4 | jobs: 5 | build: 6 | name: Library - ${{ matrix.settings.name }} 7 | runs-on: ${{ matrix.settings.runner }} 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | settings: 12 | - name: MacOS arm64 13 | target: aarch64-apple-darwin 14 | runner: macos-latest 15 | - name: Linux x86 16 | target: x86_64-unknown-linux-gnu 17 | runner: ubuntu-latest 18 | - name: Linux arm64 19 | target: aarch64-unknown-linux-gnu 20 | runner: ubuntu-latest 21 | - name: Linux x86 using clang 22 | target: x86_64-unknown-linux-gnu 23 | runner: ubuntu-latest 24 | cc: clang 25 | - name: Linux arm64 using clang 26 | target: aarch64-unknown-linux-gnu 27 | runner: ubuntu-latest 28 | cc: clang 29 | - name: Windows x86 30 | target: x86_64-pc-windows-msvc 31 | runner: windows-latest 32 | - name: Windows arm64 33 | target: aarch64-pc-windows-msvc 34 | runner: windows-latest 35 | steps: 36 | - name: Checkout 37 | uses: actions/checkout@v4 38 | - name: Cross-Compile Build 39 | if: ${{ matrix.settings.cc != '' }} 40 | run: CC=${{ matrix.settings.cc }} make all 41 | - name: Build 42 | if: ${{ !matrix.settings.cc }} 43 | run: make all 44 | - name: Run tests 45 | run: ./build/test 46 | 47 | go-bindings: 48 | name: Go Bindings - ${{ matrix.settings.name }} 49 | runs-on: ${{ matrix.settings.runner }} 50 | strategy: 51 | fail-fast: false 52 | matrix: 53 | go-version: [ '1.21', '1.22.x' ] 54 | settings: 55 | - name: MacOS arm64 56 | target: aarch64-apple-darwin 57 | runner: macos-latest 58 | - name: Linux x86 59 | target: x86_64-unknown-linux-gnu 60 | runner: ubuntu-latest 61 | - name: Linux arm64 62 | target: aarch64-unknown-linux-gnu 63 | runner: ubuntu-latest 64 | - name: Linux x86 using clang 65 | target: 
x86_64-unknown-linux-gnu 66 | runner: ubuntu-latest 67 | cc: clang 68 | - name: Linux arm64 using clang 69 | target: aarch64-unknown-linux-gnu 70 | runner: ubuntu-latest 71 | cc: clang 72 | - name: Windows x86 73 | target: x86_64-pc-windows-msvc 74 | runner: windows-latest 75 | - name: Windows arm64 76 | target: aarch64-pc-windows-msvc 77 | runner: windows-latest 78 | steps: 79 | - name: Checkout 80 | uses: actions/checkout@v4 81 | - name: Setup Go ${{ matrix.go-version }} 82 | uses: actions/setup-go@v5 83 | with: 84 | go-version: ${{ matrix.go-version }} 85 | - name: Cross-Compile Build 86 | if: ${{ matrix.settings.cc != '' }} 87 | run: CC=${{ matrix.settings.cc }} make go_bindings 88 | - name: Build 89 | if: ${{ !matrix.settings.cc }} 90 | run: make go_bindings 91 | - name: Run tests 92 | run: go test . 93 | - name: Run benchmarks 94 | run: go test -bench=. 95 | 96 | rust-bindings: 97 | name: Rust Bindings - ${{ matrix.settings.name }} 98 | runs-on: ${{ matrix.settings.runner }} 99 | strategy: 100 | fail-fast: false 101 | matrix: 102 | settings: 103 | - name: MacOS arm64 104 | target: aarch64-apple-darwin 105 | runner: macos-latest 106 | - name: Linux x86 107 | target: x86_64-unknown-linux-gnu 108 | runner: ubuntu-latest 109 | - name: Linux arm64 110 | target: aarch64-unknown-linux-gnu 111 | runner: ubuntu-latest 112 | use-docker: true 113 | - name: Windows x86 114 | target: x86_64-pc-windows-msvc 115 | runner: windows-latest 116 | # - name: Windows arm64 117 | # target: aarch64-pc-windows-msvc 118 | # runner: windows-latest 119 | # use-docker: true 120 | steps: 121 | - name: checkout 122 | uses: actions/checkout@v3 123 | - name: Set up QEMU 124 | if: ${{ matrix.settings.use-docker && matrix.settings.runner == 'ubuntu-latest' }} 125 | uses: docker/setup-qemu-action@v3 126 | with: 127 | platforms: arm64 128 | - name: Build in docker 129 | if: ${{ matrix.settings.use-docker }} 130 | run: | 131 | docker buildx build \ 132 | --build-arg TARGET=${{ 
matrix.settings.target }} \ 133 | --platform linux/arm64 \ 134 | -t ${{ matrix.settings.target }} \ 135 | . 136 | - name: install rustup 137 | if: ${{ !matrix.settings.use-docker }} 138 | run: | 139 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rustup-init.sh 140 | sh rustup-init.sh -y --default-toolchain none 141 | rustup target add ${{ matrix.settings.target }} 142 | - name: Build and Test 143 | if: ${{ !matrix.settings.use-docker }} 144 | run: | 145 | cargo build --release --target ${{ matrix.settings.target }} 146 | cargo test --target ${{ matrix.settings.target }} 147 | 148 | nim: 149 | name: nim 150 | runs-on: ubuntu-latest 151 | steps: 152 | - uses: jiro4989/setup-nim-action@v1 153 | - uses: actions/checkout@v3 154 | - name: Run Nim test 155 | run: nim c -r tests/test_hashtree_abi.nim 156 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 Prysmatic Labs 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | ############################################################################### 24 | # Constants and Configuration Variables 25 | ############################################################################### 26 | VERSION := 0.2.0 27 | OUT_DIR ?= $(CURDIR)/../build 28 | BASE_DIR := $(CURDIR)/../ 29 | OBJ_DIR := $(OUT_DIR)/obj 30 | LIB_DIR := $(OUT_DIR)/lib 31 | 32 | # Ensure these directories exist 33 | $(shell mkdir -p $(OBJ_DIR)) 34 | $(shell mkdir -p $(LIB_DIR)) 35 | 36 | ASFLAGS += -g -fpic 37 | CFLAGS += -g -Wall -Werror -O3 38 | CLANG_ASFLAGS = -fno-integrated-as 39 | LDFLAGS += -L . 40 | testlibs = -lhashtree 41 | benchlibs = -lhashtree -lm 42 | 43 | ############################################################################### 44 | # Platform Configuration 45 | ############################################################################### 46 | # Platform detection. 
47 | ifndef OS 48 | OS := $(shell uname -s) 49 | endif 50 | 51 | ifeq ($(OS),Windows_NT) 52 | PLATFORM = Windows 53 | else 54 | ifeq ($(OS),Darwin) 55 | PLATFORM = Darwin 56 | else 57 | PLATFORM = Linux 58 | endif 59 | endif 60 | 61 | # ARM architecture detection 62 | ifdef CC 63 | ARM = $(shell $(CC) -dM -E - < /dev/null | grep "aarch" | awk '{ print $$3 }') 64 | ifneq ($(findstring mingw, $(CC)),) 65 | ifneq ($(ARM),1) 66 | PLATFORM = Windows 67 | endif 68 | endif 69 | else 70 | ARCH = $(shell uname -m) 71 | ARM = $(shell echo $(ARCH) | grep -E '^(arm|aarch64)' >/dev/null && echo 1 || echo 0) 72 | endif 73 | 74 | # Cross-platform compiler selection 75 | # check for default, skip setting if user passed in specific cross-compilation lib 76 | ifeq ($(CC), cc) 77 | ifeq ($(PLATFORM),Darwin) 78 | CC = clang 79 | else 80 | CC = gcc 81 | endif 82 | endif 83 | 84 | ifeq ($(CC),clang) 85 | ifneq ($(ARM),1) 86 | ASFLAGS += $(CLANG_ASFLAGS) 87 | endif 88 | endif 89 | 90 | ifeq ($(HAVE_OPENSSL),1) 91 | CFLAGS += -DHAVE_OPENSSL 92 | benchlibs += -lcrypto 93 | testlibs += -lcrypto 94 | endif 95 | 96 | ifeq ($(PLATFORM),Windows) 97 | libname = $(LIB_DIR)/libhashtree.lib 98 | else 99 | libname = $(LIB_DIR)/libhashtree.a 100 | endif 101 | 102 | ifeq ($(ARM), 1) 103 | OBJ_LIST = $(OBJ_DIR)/sha256_armv8_neon_x4.o\ 104 | $(OBJ_DIR)/sha256_armv8_neon_x1.o\ 105 | $(OBJ_DIR)/sha256_armv8_crypto.o\ 106 | $(OBJ_DIR)/sha256_generic.o\ 107 | $(OBJ_DIR)/hashtree.o 108 | else 109 | OBJ_LIST = $(OBJ_DIR)/sha256_shani.o\ 110 | $(OBJ_DIR)/sha256_avx_x16.o\ 111 | $(OBJ_DIR)/sha256_avx_x8.o\ 112 | $(OBJ_DIR)/sha256_avx_x4.o\ 113 | $(OBJ_DIR)/sha256_avx_x1.o\ 114 | $(OBJ_DIR)/sha256_sse_x1.o\ 115 | $(OBJ_DIR)/sha256_generic.o\ 116 | $(OBJ_DIR)/hashtree.o 117 | endif 118 | 119 | ############################################################################### 120 | # Commands 121 | ############################################################################### 122 | 123 | .PHONY : clean .FORCE 124 | 
.FORCE: 125 | 126 | $(OBJ_DIR)/%.o: %.S 127 | $(CC) $(ASFLAGS) -c $< -o $@ 128 | 129 | $(OBJ_DIR)/%.o: %.c 130 | $(CC) $(CFLAGS) -c $< -o $@ 131 | 132 | $(libname): $(OBJ_LIST) 133 | $(AR) rcs $@ $(OBJ_LIST) 134 | 135 | ifeq ($(PLATFORM),Windows) 136 | all: $(libname) test 137 | else 138 | all: $(libname) test bench 139 | endif 140 | 141 | go_bindings: $(libname) 142 | cp $(libname) $(BASE_DIR)/hashtree.syso 143 | go build $(BASE_DIR) 144 | 145 | test: hashtree.h acutest.h test.c $(libname) 146 | $(CC) $(CFLAGS) $(LDFLAGS) -L$(LIB_DIR) -o $(OUT_DIR)/test test.c $(testlibs) 147 | 148 | bench: hashtree.h ubench.h bench.c $(libname) 149 | $(CC) $(CFLAGS) $(LDFLAGS) -L$(LIB_DIR) -o $(OUT_DIR)/bench bench.c $(benchlibs) 150 | 151 | clean: 152 | -rm -f $(OBJ_LIST) $(LIB_DIR)/libhashtree.a $(LIB_DIR)/libhashtree.lib $(OUT_DIR)/test $(OUT_DIR)/test.exe $(OUT_DIR)/bench hashtree.pc $(BASE_DIR)/hashtree.syso 153 | 154 | ifeq ($(PREFIX),) 155 | PREFIX := /usr 156 | endif 157 | 158 | hashtree.pc: .FORCE 159 | @echo 'prefix='$(PREFIX) > hashtree.pc 160 | @echo 'exec_prefix=$${prefix}' >> hashtree.pc 161 | @echo 'libdir=$${prefix}/lib' >> hashtree.pc 162 | @echo 'includedir=$${prefix}/include' >> hashtree.pc 163 | @echo '' >> hashtree.pc 164 | @echo 'Name: hashtree' >> hashtree.pc 165 | @echo 'Description: Fast hashing of Merkle trees' >> hashtree.pc 166 | @echo 'Version: '$(VERSION) >> hashtree.pc 167 | @echo 'URL: https://github.com/OffchainLabs/hashtree' >> hashtree.pc 168 | @echo 'LIBS: -L$${libdir} -lhashtree' >> hashtree.pc 169 | @echo 'Cflags: -I$${includedir}'>> hashtree.pc 170 | 171 | ifneq ($(PLATFORM),Windows) 172 | install: $(libname) hashtree.pc 173 | install -d $(DESTDIR)$(PREFIX)/lib 174 | install -m 644 $(libname) $(DESTDIR)$(PREFIX)/lib/ 175 | install -d $(DESTDIR)$(PREFIX)/include 176 | install -m 644 hashtree.h $(DESTDIR)$(PREFIX)/include/ 177 | install -d $(DESTDIR)$(PREFIX)/lib/pkgconfig 178 | install -m 644 hashtree.pc 
$(DESTDIR)$(PREFIX)/lib/pkgconfig/hashtree.pc 179 | 180 | uninstall: $(libname) 181 | rm $(DESTDIR)$(PREFIX)/lib/libhashtree.a 182 | rm $(DESTDIR)$(PREFIX)/include/hashtree.h 183 | endif 184 | 185 | -------------------------------------------------------------------------------- /sha256_1_generic.go: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | # Copyright (c) 2021-2022 Prysmatic Labs 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | */ 24 | package hashtree 25 | 26 | import ( 27 | "encoding/binary" 28 | "math/bits" 29 | ) 30 | 31 | const ( 32 | init0 = uint32(0x6A09E667) 33 | init1 = uint32(0xBB67AE85) 34 | init2 = uint32(0x3C6EF372) 35 | init3 = uint32(0xA54FF53A) 36 | init4 = uint32(0x510E527F) 37 | init5 = uint32(0x9B05688C) 38 | init6 = uint32(0x1F83D9AB) 39 | init7 = uint32(0x5BE0CD19) 40 | ) 41 | 42 | var _P = []uint32{ 43 | 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 44 | 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 45 | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 46 | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374, 47 | 0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254, 48 | 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa, 49 | 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7, 50 | 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0, 51 | 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd, 52 | 0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16, 53 | 0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537, 54 | 0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37, 55 | 0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7, 56 | 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890, 57 | 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c, 58 | 0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76, 59 | } 60 | 61 | var _K = []uint32{ 62 | 0x428a2f98, 63 | 0x71374491, 64 | 0xb5c0fbcf, 65 | 0xe9b5dba5, 66 | 0x3956c25b, 67 | 0x59f111f1, 68 | 0x923f82a4, 69 | 0xab1c5ed5, 70 | 0xd807aa98, 71 | 0x12835b01, 72 | 0x243185be, 73 | 0x550c7dc3, 74 | 0x72be5d74, 75 | 0x80deb1fe, 76 | 0x9bdc06a7, 77 | 0xc19bf174, 78 | 0xe49b69c1, 79 | 0xefbe4786, 80 | 0x0fc19dc6, 81 | 0x240ca1cc, 82 | 0x2de92c6f, 83 | 0x4a7484aa, 84 | 0x5cb0a9dc, 85 | 0x76f988da, 86 | 0x983e5152, 87 | 0xa831c66d, 88 | 0xb00327c8, 89 | 0xbf597fc7, 90 | 0xc6e00bf3, 91 | 0xd5a79147, 92 | 0x06ca6351, 93 | 0x14292967, 94 | 0x27b70a85, 95 | 0x2e1b2138, 96 | 0x4d2c6dfc, 97 | 0x53380d13, 98 | 0x650a7354, 99 | 0x766a0abb, 100 | 0x81c2c92e, 101 | 0x92722c85, 102 | 0xa2bfe8a1, 103 | 
0xa81a664b, 104 | 0xc24b8b70, 105 | 0xc76c51a3, 106 | 0xd192e819, 107 | 0xd6990624, 108 | 0xf40e3585, 109 | 0x106aa070, 110 | 0x19a4c116, 111 | 0x1e376c08, 112 | 0x2748774c, 113 | 0x34b0bcb5, 114 | 0x391c0cb3, 115 | 0x4ed8aa4a, 116 | 0x5b9cca4f, 117 | 0x682e6ff3, 118 | 0x748f82ee, 119 | 0x78a5636f, 120 | 0x84c87814, 121 | 0x8cc70208, 122 | 0x90befffa, 123 | 0xa4506ceb, 124 | 0xbef9a3f7, 125 | 0xc67178f2, 126 | } 127 | 128 | func sha256_1_generic(digests [][32]byte, p [][32]byte) { 129 | var w [16]uint32 130 | for k := 0; k < len(p)/2; k++ { 131 | // First 16 rounds 132 | a, b, c, d, e, f, g, h := init0, init1, init2, init3, init4, init5, init6, init7 133 | for i := 0; i < 8; i++ { 134 | j := i * 4 135 | w[i] = uint32(p[2*k][j])<<24 | uint32(p[2*k][j+1])<<16 | uint32(p[2*k][j+2])<<8 | uint32(p[2*k][j+3]) 136 | t1 := h + ((bits.RotateLeft32(e, -6)) ^ (bits.RotateLeft32(e, -11)) ^ (bits.RotateLeft32(e, -25))) + ((e & f) ^ (^e & g)) + _K[i] + w[i] 137 | 138 | t2 := ((bits.RotateLeft32(a, -2)) ^ (bits.RotateLeft32(a, -13)) ^ (bits.RotateLeft32(a, -22))) + ((a & b) ^ (a & c) ^ (b & c)) 139 | 140 | h = g 141 | g = f 142 | f = e 143 | e = d + t1 144 | d = c 145 | c = b 146 | b = a 147 | a = t1 + t2 148 | } 149 | for i := 8; i < 16; i++ { 150 | j := (i - 8) * 4 151 | w[i] = uint32(p[2*k+1][j])<<24 | uint32(p[2*k+1][j+1])<<16 | uint32(p[2*k+1][j+2])<<8 | uint32(p[2*k+1][j+3]) 152 | t1 := h + ((bits.RotateLeft32(e, -6)) ^ (bits.RotateLeft32(e, -11)) ^ (bits.RotateLeft32(e, -25))) + ((e & f) ^ (^e & g)) + _K[i] + w[i] 153 | 154 | t2 := ((bits.RotateLeft32(a, -2)) ^ (bits.RotateLeft32(a, -13)) ^ (bits.RotateLeft32(a, -22))) + ((a & b) ^ (a & c) ^ (b & c)) 155 | 156 | h = g 157 | g = f 158 | f = e 159 | e = d + t1 160 | d = c 161 | c = b 162 | b = a 163 | a = t1 + t2 164 | } 165 | // Last 48 rounds 166 | for i := 16; i < 64; i++ { 167 | v1 := w[(i-2)%16] 168 | t1 := (bits.RotateLeft32(v1, -17)) ^ (bits.RotateLeft32(v1, -19)) ^ (v1 >> 10) 169 | v2 := w[(i-15)%16] 170 | t2 := 
(bits.RotateLeft32(v2, -7)) ^ (bits.RotateLeft32(v2, -18)) ^ (v2 >> 3) 171 | w[i%16] += t1 + w[(i-7)%16] + t2 172 | 173 | t1 = h + ((bits.RotateLeft32(e, -6)) ^ (bits.RotateLeft32(e, -11)) ^ (bits.RotateLeft32(e, -25))) + ((e & f) ^ (^e & g)) + _K[i] + w[i%16] 174 | t2 = ((bits.RotateLeft32(a, -2)) ^ (bits.RotateLeft32(a, -13)) ^ (bits.RotateLeft32(a, -22))) + ((a & b) ^ (a & c) ^ (b & c)) 175 | h = g 176 | g = f 177 | f = e 178 | e = d + t1 179 | d = c 180 | c = b 181 | b = a 182 | a = t1 + t2 183 | } 184 | // Add original digest 185 | a += init0 186 | b += init1 187 | c += init2 188 | d += init3 189 | e += init4 190 | f += init5 191 | g += init6 192 | h += init7 193 | 194 | h0, h1, h2, h3, h4, h5, h6, h7 := a, b, c, d, e, f, g, h 195 | // Rounds with padding 196 | for i := 0; i < 64; i++ { 197 | t1 := h + ((bits.RotateLeft32(e, -6)) ^ (bits.RotateLeft32(e, -11)) ^ (bits.RotateLeft32(e, -25))) + ((e & f) ^ (^e & g)) + _P[i] 198 | 199 | t2 := ((bits.RotateLeft32(a, -2)) ^ (bits.RotateLeft32(a, -13)) ^ (bits.RotateLeft32(a, -22))) + ((a & b) ^ (a & c) ^ (b & c)) 200 | 201 | h = g 202 | g = f 203 | f = e 204 | e = d + t1 205 | d = c 206 | c = b 207 | b = a 208 | a = t1 + t2 209 | } 210 | 211 | h0 += a 212 | h1 += b 213 | h2 += c 214 | h3 += d 215 | h4 += e 216 | h5 += f 217 | h6 += g 218 | h7 += h 219 | 220 | var dig [32]byte 221 | binary.BigEndian.PutUint32(dig[0:4], h0) 222 | binary.BigEndian.PutUint32(dig[4:8], h1) 223 | binary.BigEndian.PutUint32(dig[8:12], h2) 224 | binary.BigEndian.PutUint32(dig[12:16], h3) 225 | binary.BigEndian.PutUint32(dig[16:20], h4) 226 | binary.BigEndian.PutUint32(dig[20:24], h5) 227 | binary.BigEndian.PutUint32(dig[24:28], h6) 228 | binary.BigEndian.PutUint32(dig[28:32], h7) 229 | (digests)[k] = dig 230 | } 231 | } 232 | -------------------------------------------------------------------------------- /src/sha256_armv8_crypto.S: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 
| Copyright (c) 2021-2024 Prysmatic Labs 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | */ 24 | 25 | 26 | ######################################################################################################### 27 | # 28 | # void sha256_armv8_crypto(unsigned char *output, unsigned char *input, size_t count) 29 | # 30 | # armv8-a implementation with crypto extensions 31 | # as in the Apple Silicon M1 32 | # 33 | # There are no bound checks, caller is responsible to check that memory up to output + 32*count 34 | # is writable. 
35 | # 36 | # Used registers: x0, x1, x2, x3, x4, x5 37 | # 38 | # SIMD registers: all vector registers except v12-v15 39 | # 40 | ######################################################################################################## 41 | 42 | #ifdef __aarch64__ 43 | .text 44 | .arch armv8-a+sha2 45 | .altmacro 46 | 47 | output .req x0 48 | input .req x1 49 | count .req x2 50 | last .req x2 51 | 52 | digest .req x3 53 | k256 .req x4 54 | padding .req x5 55 | 56 | 57 | .macro hashupdate WORD 58 | sha256h q2, q3, \WORD 59 | sha256h2 q3, q8, \WORD 60 | mov v8.16b, v2.16b 61 | .endm 62 | 63 | .macro schedule A, B, C, D, E, WORD 64 | add \WORD, \B, \A 65 | sha256su0 \B, \C 66 | sha256su1 \E, \C, \D 67 | hashupdate \WORD 68 | .endm 69 | 70 | #ifdef __APPLE__ 71 | .global _hashtree_sha256_sha_x1 72 | #else 73 | .global hashtree_sha256_sha_x1 74 | #endif 75 | #ifndef __APPLE__ 76 | .type hashtree_sha256_sha_x1,%function 77 | #endif 78 | .align 5 79 | #ifdef __APPLE__ 80 | _hashtree_sha256_sha_x1: 81 | #else 82 | hashtree_sha256_sha_x1: 83 | #endif 84 | // Set up stack, need to save the clobbered registers d8-d11 85 | sub sp, sp, #32 86 | stp d8, d9, [sp] 87 | 88 | #ifdef __APPLE__ 89 | adrp digest, .LDIGEST@PAGE 90 | add digest, digest, #:lo12:.LDIGEST@PAGEOFF 91 | adrp k256, .LK256@PAGE 92 | add k256, k256, #:lo12:.LK256@PAGEOFF 93 | #else 94 | adrp digest, .LDIGEST 95 | add digest, digest, #:lo12:.LDIGEST 96 | adrp k256, .LK256 97 | add k256, k256, #:lo12:.LK256 98 | #endif 99 | stp d10, d11, [sp, #16] 100 | #ifdef __APPLE__ 101 | adrp padding, .LPADDING@PAGE 102 | add padding, padding, #:lo12:.LPADDING@PAGEOFF 103 | #else 104 | adrp padding, .LPADDING 105 | add padding, padding, #:lo12:.LPADDING 106 | #endif 107 | add last, output, count, lsl #5 108 | 109 | ld1 {v0.4s, v1.4s}, [digest] 110 | 111 | .Lshani_loop: 112 | cmp last, output 113 | beq .Lshani_finish 114 | 115 | // Load all K constants 116 | ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [k256], #64 117 | ld1 {v20.4s, 
v21.4s, v22.4s, v23.4s}, [k256], #64 118 | ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [k256], #64 119 | ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [k256] 120 | sub k256, k256, #192 121 | 122 | // Load one block 123 | ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [input], #64 124 | mov v2.16b, v0.16b 125 | mov v3.16b, v1.16b 126 | mov v8.16b, v2.16b 127 | 128 | // Reverse endinanness 129 | rev32 v4.16b, v4.16b 130 | rev32 v5.16b, v5.16b 131 | rev32 v6.16b, v6.16b 132 | rev32 v7.16b, v7.16b 133 | 134 | add v9.4s, v4.4s, v16.4s 135 | sha256su0 v4.4s, v5.4s 136 | hashupdate v9.4s 137 | 138 | schedule v17.4s, v5.4s, v6.4s, v7.4s, v4.4s, v9.4s 139 | schedule v18.4s, v6.4s, v7.4s, v4.4s, v5.4s, v9.4s 140 | schedule v19.4s, v7.4s, v4.4s, v5.4s, v6.4s, v9.4s 141 | schedule v20.4s, v4.4s, v5.4s, v6.4s, v7.4s, v9.4s 142 | schedule v21.4s, v5.4s, v6.4s, v7.4s, v4.4s, v9.4s 143 | schedule v22.4s, v6.4s, v7.4s, v4.4s, v5.4s, v9.4s 144 | schedule v23.4s, v7.4s, v4.4s, v5.4s, v6.4s, v9.4s 145 | schedule v24.4s, v4.4s, v5.4s, v6.4s, v7.4s, v9.4s 146 | schedule v25.4s, v5.4s, v6.4s, v7.4s, v4.4s, v9.4s 147 | schedule v26.4s, v6.4s, v7.4s, v4.4s, v5.4s, v9.4s 148 | schedule v27.4s, v7.4s, v4.4s, v5.4s, v6.4s, v9.4s 149 | 150 | add v9.4s, v4.4s, v28.4s 151 | hashupdate v9.4s 152 | sha256su1 v7.4s, v5.4s, v6.4s 153 | add v9.4s, v5.4s, v29.4s 154 | hashupdate v9.4s 155 | add v9.4s, v6.4s, v30.4s 156 | hashupdate v9.4s 157 | add v9.4s, v7.4s, v31.4s 158 | hashupdate v9.4s 159 | 160 | // Add initial digest and back it up 161 | add v2.4s, v0.4s, v2.4s 162 | add v3.4s, v1.4s, v3.4s 163 | mov v10.16b, v2.16b 164 | mov v11.16b, v3.16b 165 | 166 | // Rounds with padding 167 | 168 | // Load prescheduled constants 169 | ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [padding], #64 170 | ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [padding], #64 171 | mov v8.16b, v2.16b 172 | ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [padding], #64 173 | ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [padding] 174 | sub padding, padding, #192 175 | 176 | 
hashupdate v16.4s 177 | hashupdate v17.4s 178 | hashupdate v18.4s 179 | hashupdate v19.4s 180 | hashupdate v20.4s 181 | hashupdate v21.4s 182 | hashupdate v22.4s 183 | hashupdate v23.4s 184 | hashupdate v24.4s 185 | hashupdate v25.4s 186 | hashupdate v26.4s 187 | hashupdate v27.4s 188 | hashupdate v28.4s 189 | hashupdate v29.4s 190 | hashupdate v30.4s 191 | hashupdate v31.4s 192 | 193 | // Add backed up digest 194 | add v2.4s, v10.4s, v2.4s 195 | add v3.4s, v11.4s, v3.4s 196 | 197 | rev32 v2.16b, v2.16b 198 | rev32 v3.16b, v3.16b 199 | st1 {v2.4s, v3.4s}, [output], #32 200 | 201 | b .Lshani_loop 202 | 203 | .Lshani_finish: 204 | ldp d8,d9, [sp], #16 205 | ldp d10, d11, [sp], #16 206 | ret 207 | 208 | .section .rodata, "a" 209 | .align 4 210 | .LDIGEST: 211 | .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,\ 212 | 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 213 | .LK256: 214 | .word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,\ 215 | 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,\ 216 | 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,\ 217 | 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,\ 218 | 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,\ 219 | 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,\ 220 | 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,\ 221 | 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,\ 222 | 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,\ 223 | 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,\ 224 | 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,\ 225 | 0xd192e819,0xd6990624,0xf40e3585,0x106aa070,\ 226 | 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,\ 227 | 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,\ 228 | 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,\ 229 | 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 230 | 231 | .LPADDING: 232 | .word 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,\ 233 | 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,\ 234 | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,\ 235 | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,\ 236 | 0x649b69c1, 
0xf0fe4786, 0xfe1edc6, 0x240cf254,\ 237 | 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,\ 238 | 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,\ 239 | 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,\ 240 | 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,\ 241 | 0xa35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,\ 242 | 0x7f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,\ 243 | 0x17406110, 0xd8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,\ 244 | 0x83613bda, 0xdb48a363, 0xb02e931, 0x6fd15ca7,\ 245 | 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,\ 246 | 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,\ 247 | 0xd2c741c6, 0x7237ea3, 0xa4954b68, 0x4c191d76 248 | #endif 249 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hashtree 2 | 3 | Hashtree is a SHA256 library highly optimized for Merkle tree computation. It is 4 | based on [Intel's implementation](https://github.com/intel/intel-ipsec-mb) with 5 | a few modifications like hardcoding the scheduled words of the padding block. 6 | 7 | The library exposes a single header file with the low level sha functions. They 8 | all have the following signature: 9 | ```c 10 | void sha256(unsigned char *output, unsigned char *input, uint64_t count) 11 | ``` 12 | where `input` is the buffer containing `count` chunks of 64 bytes to be hashed, 13 | and `output` is a pre-allocated buffer that will hold the `count` 32-byte digests. 14 | These are low level functions that do not perform any error check: the caller is 15 | responsible for checking that `input` is at least `64*count` bytes long and that 16 | `output` is writable and at least `32*count` bytes long. The caller is 17 | responsible for memory allocation and de-allocation of these buffers. 18 | 19 | ## Dependencies 20 | There are no dependencies besides the standard `C` header `stdint.h`. Benchmarks 21 | have a dependency on `libm`. 
Tests and benchmarks on x86-64 an extra 22 | dependency on `cpuid.h` is needed. An optional dependency on openssl allows to 23 | test and benchmark against openssl. The only build-time dependency is a GCC and 24 | GNU assembler compatible compiler like `gcc` and `gas`. On Mac OS X with newer Apple Silicon processors the library can be built with the default clang compiler. 25 | 26 | ## Compilation 27 | - Start by cloning the repository 28 | ```shell 29 | $ git clone https://github.com/potuz/hashtree.git 30 | $ cd hashtree 31 | ``` 32 | - To build the library 33 | ```shell 34 | $ make 35 | ``` 36 | this produces a statically linked library `libhashtree.a` in the `src` 37 | directory. 38 | - To build tests or benchmarks or all respectively: 39 | ```shell 40 | $ make test 41 | $ make bench 42 | $ make all 43 | ``` 44 | - To test or benchmark against OPENSSL: 45 | ```shell 46 | $ make clean 47 | $ HAVE_OPENSSL=1 make all 48 | ``` 49 | - To cross compile for ARMv8 50 | ```shell 51 | $ CC=aarch64-linux-gnu-gcc make 52 | ``` 53 | - To cross compile for Windows (benchmarks will not work) 54 | ```shell 55 | $ make clean 56 | $ CC=x86_64-w64-mingw32-gcc make test 57 | ``` 58 | ## Running Tests and Benchmarks 59 | ```shell 60 | $ make test 61 | $ ./src/test 62 | Test hash_sse_1... [ OK ] 63 | Test hash_sse_1_multiple_blocks... [ OK ] 64 | Test hash_avx_1... [ OK ] 65 | Test hash_avx_1_multiple_blocks... [ OK ] 66 | Test hash_avx_4... [ OK ] 67 | Test hash_avx_4_6blocks... [ OK ] 68 | Test hash_avx_8... [ OK ] 69 | Test hash_avx_8_13blocks... [ OK ] 70 | Test hash_shani... [ CPU does not support SHA-ni ] 71 | Test hash_shani_13blocks... [ CPU does not support SHA-ni ] 72 | Test hash_avx_16... [ OK ] 73 | Test hash_avx_16_30blocks... [ OK ] 74 | ``` 75 | This is running in a CPU that does not support SHA extensions, that is why two tests fail. Your system may output a different combination. 
76 | 77 | To run benchmarks: 78 | ```shell 79 | $ HAVE_OPENSSL=1 make all 80 | $ ./src/bench 81 | [==========] Running 9 benchmarks. 82 | [ RUN ] sse.sse_x1_one_at_time 83 | [ OK ] sse.sse_x1_one_at_time (mean 32.300ms, confidence interval +- 1.725613%) 84 | [ RUN ] sse.sse_x1 85 | [ OK ] sse.sse_x1 (mean 31.837ms, confidence interval +- 0.327542%) 86 | [ RUN ] avx.avx_x1_one_at_time 87 | [ OK ] avx.avx_x1_one_at_time (mean 32.299ms, confidence interval +- 0.464452%) 88 | [ RUN ] avx.avx_x1 89 | [ OK ] avx.avx_x1 (mean 31.855ms, confidence interval +- 0.150833%) 90 | [ RUN ] avx.avx_x4 91 | [ OK ] avx.avx_x4 (mean 12.368ms, confidence interval +- 1.758262%) 92 | [ RUN ] avx.avx_x8 93 | [ OK ] avx.avx_x8 (mean 6.519ms, confidence interval +- 2.142718%) 94 | [ RUN ] shani.shani 95 | [ OK ] shani.shani (mean -20.-559us, confidence interval +- -89188758235664.375000%) 96 | [ RUN ] shani.shani_one_at_time 97 | [ OK ] shani.shani_one_at_time (mean -3281090326183.-673us, confidence interval +- -9846.737643%) 98 | [ RUN ] openssl.openssl_one_at_time 99 | [ OK ] openssl.openssl_one_at_time (mean 30.519ms, confidence interval +- 0.330545%) 100 | [==========] 9 benchmarks ran. 101 | [ PASSED ] 9 benchmarks. 102 | ``` 103 | The results for the SHA-ni benchmarks are spurious since the CPU does not 104 | support them. Here we see that the benchmark against openssl native 105 | implementation for this CPU runs at the same speed as the single buffer SSE and 106 | AVX implementation, while the AVX2 implementation 8 blocks at a time runs 5 107 | times faster. 108 | 109 | A benchmark on a cascade-lake supporting AVX-512: 110 | ```shell 111 | ./src/bench 112 | [==========] Running 9 benchmarks. 
113 | [ RUN ] sse.sse_x1_one_at_time 114 | [ OK ] sse.sse_x1_one_at_time (mean 29.182ms, confidence interval +- 0.149473%) 115 | [ RUN ] sse.sse_x1 116 | [ OK ] sse.sse_x1 (mean 28.833ms, confidence interval +- 0.074605%) 117 | [ RUN ] avx.avx_x1_one_at_time 118 | [ OK ] avx.avx_x1_one_at_time (mean 29.205ms, confidence interval +- 0.138581%) 119 | [ RUN ] avx.avx_x1 120 | [ OK ] avx.avx_x1 (mean 28.871ms, confidence interval +- 0.200034%) 121 | [ RUN ] avx.avx_x4 122 | [ OK ] avx.avx_x4 (mean 11.078ms, confidence interval +- 0.140484%) 123 | [ RUN ] avx.avx_x8 124 | [ OK ] avx.avx_x8 (mean 5.650ms, confidence interval +- 0.118668%) 125 | [ RUN ] avx.avx_x16 126 | [ OK ] avx.avx_x16 (mean 2.413ms, confidence interval +- 0.223049%) 127 | [ RUN ] shani.shani 128 | [ OK ] shani.shani (mean -4.-941us, confidence interval +- -1102817647134393.500000%) 129 | [ RUN ] shani.shani_one_at_time 130 | [ OK ] shani.shani_one_at_time (mean 0.-140us, confidence interval +- -70078699044457400.000000%) 131 | [==========] 9 benchmarks ran. 132 | [ PASSED ] 9 benchmarks. 133 | ``` 134 | We see that AVX-512 (x16) runs 12 times faster than a single block 135 | implementation. This is slightly better than a native SHA extension CPU were 136 | gains were about x10. 137 | 138 | A similar benchmark on a Raspberry Pi 4 model B: 139 | ```shell 140 | $ ./src/bench 141 | [==========] Running 3 benchmarks. 142 | [ RUN ] armv8.neon_x1_one_at_time 143 | [ OK ] armv8.neon_x1_one_at_time (mean 79.853ms, confidence interval +- 0.157599%) 144 | [ RUN ] armv8.neon_x1 145 | [ OK ] armv8.neon_x1 (mean 79.035ms, confidence interval +- 0.070254%) 146 | [ RUN ] armv8.neon_x4 147 | [ OK ] armv8.neon_x4 (mean 58.356ms, confidence interval +- 0.076089%) 148 | [==========] 3 benchmarks ran. 149 | [ PASSED ] 3 benchmarks. 150 | ``` 151 | We see that a ASIMD version 4 blocks at a time, while not that much of an 152 | improvement as in the x86-64, is still 27% faster. 
153 | 154 | ## Using the library 155 | The library exposes several architecture dependent SHA implementations. It is the caller responsibility to choose the right one. This can be done at runtime once at application launch. For x86_64 systems for example one can use cpuid.h, see [here](https://github.com/potuz/mammon/blob/main/ssz/hasher.cpp#L43) for an example on how to choose an implementation. 156 | 157 | Most vectorized implementations exploit the fact that independent branches in the Merkle tree can be hashed in "parallel" within one CPU, to take advantage of this, 158 | Merkleization algorithms that loop over consecutive tree layers hashing two blocks at a time need to be updated to pass the entire layer, or all consecutive blocks. A naive example on how to accomplish this can be found in [this document](https://hackmd.io/80mJ75A5QeeRcrNmqcuU-g?view) 159 | 160 | Some examples benchmarks running the algorithms in this library vs prysm's current implementation are 161 | ``` 162 | goos: linux 163 | goarch: amd64 164 | cpu: AMD Ryzen 5 3600 6-Core Processor 165 | BenchmarkHashBalanceShani-12 160 7629704 ns/op 166 | BenchmarkHashBalanceShaniPrysm-12 15 74012328 ns/op 167 | PASS 168 | 169 | goos: linux 170 | goarch: amd64 171 | cpu: Intel(R) Core(TM) i5-3570 CPU @ 3.40GHz 172 | BenchmarkHashBalanceAVX-4 68 26677965 ns/op 173 | BenchmarkHashBalancePrysm-4 7 165434686 ns/op 174 | PASS 175 | 176 | goos: linux 177 | goarch: amd64 178 | cpu: Intel(R) Core(TM) i5-7200U CPU @ 2.50GHz 179 | BenchmarkHashBalanceAVX2-4 121 9711482 ns/op 180 | BenchmarkHashBalancePrysm-4 10 103716714 ns/op 181 | PASS 182 | ``` 183 | 184 | ## Nim bindings 185 | 186 | The library offers low-level bindings for Nim that can be installed using: 187 | 188 | ```sh 189 | nimble install https://github.com/OffchainLabs/hashtree/ 190 | ``` 191 | 192 | or used in a package with: 193 | 194 | ```nim 195 | requires "https://github.com/OffchainLabs/hashtree/" 196 | ``` 197 | 198 | ## Rust bindings 199 | 200 
| At the top directory you can run 201 | 202 | ``` 203 | $ make rust_bindings 204 | ``` 205 | 206 | To run tests: 207 | 208 | ``` 209 | $ make rust_tests 210 | ``` 211 | 212 | See the `examples` directory for examples on how to use the library 213 | -------------------------------------------------------------------------------- /src/bench.c: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021 Prysmatic Labs 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | */ 24 | #ifdef __x86_64__ 25 | #include 26 | #endif 27 | #include 28 | 29 | #include "hashtree.h" 30 | #include "ubench.h" 31 | #ifdef HAVE_OPENSSL 32 | #include 33 | #endif 34 | 35 | #define buffer_size 4800000 36 | 37 | #ifdef __aarch64__ 38 | UBENCH_EX(armv8, neon_x1_one_at_time) { 39 | int *buffer = (int *)malloc(buffer_size); 40 | unsigned char digest[32]; 41 | for (int i = 0; i < buffer_size / sizeof(int); i++) { 42 | buffer[i] = rand(); 43 | } 44 | UBENCH_DO_BENCHMARK() { 45 | for (int i = 0; i < buffer_size; i += 64) { 46 | hashtree_sha256_neon_x1(digest, (unsigned char *)(buffer + i / sizeof(int)), 1); 47 | } 48 | } 49 | free(buffer); 50 | } 51 | 52 | UBENCH_EX(armv8, neon_x1) { 53 | int *buffer = (int *)malloc(buffer_size); 54 | unsigned char *digest = (unsigned char *)malloc(buffer_size / 2); 55 | for (int i = 0; i < buffer_size / sizeof(int); i++) { 56 | buffer[i] = rand(); 57 | } 58 | UBENCH_DO_BENCHMARK() { hashtree_sha256_neon_x1(digest, (unsigned char *)buffer, buffer_size / 64); } 59 | free(buffer); 60 | free(digest); 61 | } 62 | 63 | UBENCH_EX(armv8, neon_x4) { 64 | int *buffer = (int *)malloc(buffer_size); 65 | unsigned char *digest = (unsigned char *)malloc(buffer_size / 2); 66 | for (int i = 0; i < buffer_size / sizeof(int); i++) { 67 | buffer[i] = rand(); 68 | } 69 | UBENCH_DO_BENCHMARK() { hashtree_sha256_neon_x4(digest, (unsigned char *)buffer, buffer_size / 64); } 70 | free(buffer); 71 | free(digest); 72 | } 73 | 74 | UBENCH_EX(armv8, crypto) { 75 | int *buffer = (int *)malloc(buffer_size); 76 | unsigned char *digest = (unsigned char *)malloc(buffer_size / 2); 77 | for (int i = 0; i < buffer_size / sizeof(int); i++) { 78 | buffer[i] = rand(); 79 | } 80 | UBENCH_DO_BENCHMARK() { hashtree_sha256_sha_x1(digest, (unsigned char *)buffer, buffer_size / 64); } 81 | free(buffer); 82 | free(digest); 83 | } 84 | 85 | #endif 86 | 87 | UBENCH_EX(generic, generic) { 88 | int *buffer = (int *)malloc(buffer_size); 89 | unsigned char digest[32]; 90 | 
for (int i = 0; i < buffer_size / sizeof(int); i++) { 91 | buffer[i] = rand(); 92 | } 93 | UBENCH_DO_BENCHMARK() { 94 | for (int i = 0; i < buffer_size; i += 64) { 95 | hashtree_sha256_generic(digest, (unsigned char *)(buffer + i / sizeof(int)), 1); 96 | } 97 | } 98 | free(buffer); 99 | } 100 | 101 | #ifdef __x86_64__ 102 | UBENCH_EX(sse, sse_x1_one_at_time) { 103 | int *buffer = (int *)malloc(buffer_size); 104 | unsigned char digest[32]; 105 | for (int i = 0; i < buffer_size / sizeof(int); i++) { 106 | buffer[i] = rand(); 107 | } 108 | UBENCH_DO_BENCHMARK() { 109 | for (int i = 0; i < buffer_size; i += 64) { 110 | hashtree_sha256_sse_x1(digest, (unsigned char *)(buffer + i / sizeof(int)), 1); 111 | } 112 | } 113 | free(buffer); 114 | } 115 | 116 | UBENCH_EX(sse, sse_x1) { 117 | int *buffer = (int *)malloc(buffer_size); 118 | unsigned char *digest = (unsigned char *)malloc(buffer_size / 2); 119 | for (int i = 0; i < buffer_size / sizeof(int); i++) { 120 | buffer[i] = rand(); 121 | } 122 | UBENCH_DO_BENCHMARK() { hashtree_sha256_sse_x1(digest, (unsigned char *)buffer, buffer_size / 64); } 123 | free(buffer); 124 | free(digest); 125 | } 126 | 127 | UBENCH_EX(avx, avx_x1_one_at_time) { 128 | uint32_t a = 0, b = 0, c = 0, d = 0; 129 | __get_cpuid_count(1, 0, &a, &b, &c, &d); 130 | if (!(c & bit_AVX)) { 131 | return; 132 | } 133 | int *buffer = (int *)malloc(buffer_size); 134 | unsigned char digest[32]; 135 | for (int i = 0; i < buffer_size / sizeof(int); i++) { 136 | buffer[i] = rand(); 137 | } 138 | UBENCH_DO_BENCHMARK() { 139 | for (int i = 0; i < buffer_size; i += 64) { 140 | hashtree_sha256_avx_x1(digest, (unsigned char *)(buffer + i / sizeof(int)), 1); 141 | } 142 | } 143 | free(buffer); 144 | } 145 | 146 | UBENCH_EX(avx, avx_x1) { 147 | uint32_t a = 0, b = 0, c = 0, d = 0; 148 | __get_cpuid_count(1, 0, &a, &b, &c, &d); 149 | if (!(c & bit_AVX)) { 150 | return; 151 | } 152 | 153 | int *buffer = (int *)malloc(buffer_size); 154 | unsigned char *digest = (unsigned 
char *)malloc(buffer_size / 2); 155 | for (int i = 0; i < buffer_size / sizeof(int); i++) { 156 | buffer[i] = rand(); 157 | } 158 | UBENCH_DO_BENCHMARK() { hashtree_sha256_avx_x1(digest, (unsigned char *)buffer, buffer_size / 64); } 159 | free(buffer); 160 | free(digest); 161 | } 162 | 163 | UBENCH_EX(avx, avx_x4) { 164 | uint32_t a = 0, b = 0, c = 0, d = 0; 165 | __get_cpuid_count(1, 0, &a, &b, &c, &d); 166 | if (!(c & bit_AVX)) { 167 | return; 168 | } 169 | 170 | int *buffer = (int *)malloc(buffer_size); 171 | unsigned char *digest = (unsigned char *)malloc(buffer_size / 2); 172 | for (int i = 0; i < buffer_size / sizeof(int); i++) { 173 | buffer[i] = rand(); 174 | } 175 | UBENCH_DO_BENCHMARK() { hashtree_sha256_avx_x4(digest, (unsigned char *)buffer, buffer_size / 64); } 176 | free(buffer); 177 | free(digest); 178 | } 179 | 180 | UBENCH_EX(avx, avx_x8) { 181 | uint32_t a = 0, b = 0, c = 0, d = 0; 182 | __get_cpuid_count(7, 0, &a, &b, &c, &d); 183 | if (!(b & bit_AVX2)) { 184 | return; 185 | } 186 | 187 | int *buffer = (int *)malloc(buffer_size); 188 | unsigned char *digest = (unsigned char *)malloc(buffer_size / 2); 189 | for (int i = 0; i < buffer_size / sizeof(int); i++) { 190 | buffer[i] = rand(); 191 | } 192 | UBENCH_DO_BENCHMARK() { hashtree_sha256_avx2_x8(digest, (unsigned char *)buffer, buffer_size / 64); } 193 | free(buffer); 194 | free(digest); 195 | } 196 | 197 | UBENCH_EX(avx, avx_x16) { 198 | uint32_t a = 0, b = 0, c = 0, d = 0; 199 | __get_cpuid_count(7, 0, &a, &b, &c, &d); 200 | if (!(b & bit_AVX512F) || !(b & bit_AVX512VL)) { 201 | return; 202 | } 203 | 204 | int *buffer = (int *)malloc(buffer_size); 205 | unsigned char *digest = (unsigned char *)malloc(buffer_size / 2); 206 | for (int i = 0; i < buffer_size / sizeof(int); i++) { 207 | buffer[i] = rand(); 208 | } 209 | UBENCH_DO_BENCHMARK() { hashtree_sha256_avx512_x16(digest, (unsigned char *)buffer, buffer_size / 64); } 210 | free(buffer); 211 | free(digest); 212 | } 213 | 214 | UBENCH_EX(shani, 
shani) { 215 | uint32_t a = 0, b = 0, c = 0, d = 0; 216 | __get_cpuid_count(7, 0, &a, &b, &c, &d); 217 | if (!(b & bit_SHA)) { 218 | return; 219 | } 220 | 221 | int *buffer = (int *)malloc(buffer_size); 222 | unsigned char *digest = (unsigned char *)malloc(buffer_size / 2); 223 | for (int i = 0; i < buffer_size / sizeof(int); i++) { 224 | buffer[i] = rand(); 225 | } 226 | UBENCH_DO_BENCHMARK() { hashtree_sha256_shani_x2(digest, (unsigned char *)buffer, buffer_size / 64); } 227 | free(buffer); 228 | free(digest); 229 | } 230 | 231 | UBENCH_EX(shani, shani_one_at_time) { 232 | uint32_t a = 0, b = 0, c = 0, d = 0; 233 | __get_cpuid_count(7, 0, &a, &b, &c, &d); 234 | if (!(b & bit_SHA)) { 235 | return; 236 | } 237 | int *buffer = (int *)malloc(buffer_size); 238 | unsigned char digest[32]; 239 | for (int i = 0; i < buffer_size / sizeof(int); i++) { 240 | buffer[i] = rand(); 241 | } 242 | UBENCH_DO_BENCHMARK() { 243 | for (int i = 0; i < buffer_size; i += 64) { 244 | hashtree_sha256_shani_x2(digest, (unsigned char *)(buffer + i / sizeof(int)), 1); 245 | } 246 | } 247 | free(buffer); 248 | } 249 | #endif 250 | #ifdef HAVE_OPENSSL 251 | UBENCH_EX(openssl, openssl_one_at_time) { 252 | int *buffer = (int *)malloc(buffer_size); 253 | unsigned char digest[32]; 254 | for (int i = 0; i < buffer_size / sizeof(int); i++) { 255 | buffer[i] = rand(); 256 | } 257 | UBENCH_DO_BENCHMARK() { 258 | for (int i = 0; i < buffer_size; i += 64) { 259 | SHA256((unsigned char *)(buffer + i / sizeof(int)), 64, digest); 260 | } 261 | } 262 | free(buffer); 263 | } 264 | #endif 265 | 266 | UBENCH_MAIN() 267 | -------------------------------------------------------------------------------- /src/sha256_generic.c: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021-2025 Offchain Labs 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files 
(the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | 24 | This code is based on Intel's implementation found in 25 | https://github.com/intel/intel-ipsec-mb 26 | Such software is licensed under the BSD 3-Clause License and is 27 | Copyright (c) 2012-2023, Intel Corporation 28 | */ 29 | 30 | #include 31 | 32 | static const uint32_t init[] = { 33 | 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, 34 | }; 35 | 36 | static const uint32_t K[] = { 37 | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 38 | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 39 | 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 40 | 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 41 | 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 42 | 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 
0xf40e3585, 0x106aa070, 43 | 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 44 | 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 45 | }; 46 | 47 | static const uint32_t P[] = { 48 | 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 49 | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374, 50 | 0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254, 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa, 51 | 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7, 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0, 52 | 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd, 0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16, 53 | 0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537, 0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37, 54 | 0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7, 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890, 55 | 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c, 0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76, 56 | }; 57 | 58 | static inline uint32_t rotr(uint32_t x, int r) { return (x >> r) | (x << (32 - r)); } 59 | 60 | static inline uint32_t be32(const unsigned char* b) { 61 | return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16) | ((uint32_t)b[2] << 8) | b[3]; 62 | } 63 | 64 | void hashtree_sha256_generic(unsigned char* output, const unsigned char* input, uint64_t count) { 65 | uint32_t w[16]; 66 | for (int k = 0; k < count; k++) { 67 | // First 16 roudnds 68 | uint32_t a = init[0]; 69 | uint32_t b = init[1]; 70 | uint32_t c = init[2]; 71 | uint32_t d = init[3]; 72 | uint32_t e = init[4]; 73 | uint32_t f = init[5]; 74 | uint32_t g = init[6]; 75 | uint32_t h = init[7]; 76 | for (int i = 0; i < 16; i++) { 77 | w[i] = be32(&input[k * 64 + (i << 2)]); 78 | uint32_t t1 = h + (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25)) + ((e & f) ^ (~e & g)) + K[i] + w[i]; 79 | uint32_t t2 = (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 
22)) + ((a & b) ^ (a & c) ^ (b & c)); 80 | h = g; 81 | g = f; 82 | f = e; 83 | e = d + t1; 84 | d = c; 85 | c = b; 86 | b = a; 87 | a = t1 + t2; 88 | } 89 | 90 | // Last 48 rounds with loop unrolling (4 rounds at a time) 91 | for (int i = 16; i < 64; i += 4) { 92 | // Round i 93 | uint32_t v1 = w[(i - 2) & 0xF]; 94 | uint32_t t1 = rotr(v1, 17) ^ rotr(v1, 19) ^ (v1 >> 10); 95 | uint32_t v2 = w[(i - 15) & 0xF]; 96 | uint32_t t2 = rotr(v2, 7) ^ rotr(v2, 18) ^ (v2 >> 3); 97 | w[i & 0xF] += t1 + w[(i - 7) & 0xF] + t2; 98 | 99 | t1 = h + (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25)) + ((e & f) ^ (~e & g)) + K[i] + w[i & 0xF]; 100 | t2 = (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22)) + ((a & b) ^ (a & c) ^ (b & c)); 101 | h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2; 102 | 103 | // Round i+1 104 | v1 = w[(i + 1 - 2) & 0xF]; 105 | t1 = rotr(v1, 17) ^ rotr(v1, 19) ^ (v1 >> 10); 106 | v2 = w[(i + 1 - 15) & 0xF]; 107 | t2 = rotr(v2, 7) ^ rotr(v2, 18) ^ (v2 >> 3); 108 | w[(i + 1) & 0xF] += t1 + w[(i + 1 - 7) & 0xF] + t2; 109 | 110 | t1 = h + (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25)) + ((e & f) ^ (~e & g)) + K[i + 1] + w[(i + 1) & 0xF]; 111 | t2 = (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22)) + ((a & b) ^ (a & c) ^ (b & c)); 112 | h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2; 113 | 114 | // Round i+2 115 | v1 = w[(i + 2 - 2) & 0xF]; 116 | t1 = rotr(v1, 17) ^ rotr(v1, 19) ^ (v1 >> 10); 117 | v2 = w[(i + 2 - 15) & 0xF]; 118 | t2 = rotr(v2, 7) ^ rotr(v2, 18) ^ (v2 >> 3); 119 | w[(i + 2) & 0xF] += t1 + w[(i + 2 - 7) & 0xF] + t2; 120 | 121 | t1 = h + (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25)) + ((e & f) ^ (~e & g)) + K[i + 2] + w[(i + 2) & 0xF]; 122 | t2 = (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22)) + ((a & b) ^ (a & c) ^ (b & c)); 123 | h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2; 124 | 125 | // Round i+3 126 | v1 = w[(i + 3 - 2) & 0xF]; 127 | t1 = rotr(v1, 17) ^ rotr(v1, 19) ^ (v1 >> 10); 128 | v2 = w[(i + 3 - 15) & 0xF]; 129 | t2 = 
rotr(v2, 7) ^ rotr(v2, 18) ^ (v2 >> 3); 130 | w[(i + 3) & 0xF] += t1 + w[(i + 3 - 7) & 0xF] + t2; 131 | 132 | t1 = h + (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25)) + ((e & f) ^ (~e & g)) + K[i + 3] + w[(i + 3) & 0xF]; 133 | t2 = (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22)) + ((a & b) ^ (a & c) ^ (b & c)); 134 | h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2; 135 | } 136 | // Add original digest 137 | a += init[0]; 138 | b += init[1]; 139 | c += init[2]; 140 | d += init[3]; 141 | e += init[4]; 142 | f += init[5]; 143 | g += init[6]; 144 | h += init[7]; 145 | 146 | // Rounds with padding 147 | uint32_t h0 = a; 148 | uint32_t h1 = b; 149 | uint32_t h2 = c; 150 | uint32_t h3 = d; 151 | uint32_t h4 = e; 152 | uint32_t h5 = f; 153 | uint32_t h6 = g; 154 | uint32_t h7 = h; 155 | // Padding rounds with loop unrolling (4 rounds at a time) 156 | for (int i = 0; i < 64; i += 4) { 157 | // Round i 158 | uint32_t t1 = h + (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25)) + ((e & f) ^ (~e & g)) + P[i]; 159 | uint32_t t2 = (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22)) + ((a & b) ^ (a & c) ^ (b & c)); 160 | h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2; 161 | 162 | // Round i+1 163 | t1 = h + (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25)) + ((e & f) ^ (~e & g)) + P[i + 1]; 164 | t2 = (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22)) + ((a & b) ^ (a & c) ^ (b & c)); 165 | h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2; 166 | 167 | // Round i+2 168 | t1 = h + (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25)) + ((e & f) ^ (~e & g)) + P[i + 2]; 169 | t2 = (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22)) + ((a & b) ^ (a & c) ^ (b & c)); 170 | h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2; 171 | 172 | // Round i+3 173 | t1 = h + (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25)) + ((e & f) ^ (~e & g)) + P[i + 3]; 174 | t2 = (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22)) + ((a & b) ^ (a & c) ^ (b & c)); 175 | h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2; 176 
| } 177 | 178 | h0 += a; 179 | h1 += b; 180 | h2 += c; 181 | h3 += d; 182 | h4 += e; 183 | h5 += f; 184 | h6 += g; 185 | h7 += h; 186 | unsigned char* out = &output[k * 32]; 187 | out[0] = h0 >> 24; out[1] = h0 >> 16; out[2] = h0 >> 8; out[3] = h0; 188 | out[4] = h1 >> 24; out[5] = h1 >> 16; out[6] = h1 >> 8; out[7] = h1; 189 | out[8] = h2 >> 24; out[9] = h2 >> 16; out[10] = h2 >> 8; out[11] = h2; 190 | out[12] = h3 >> 24; out[13] = h3 >> 16; out[14] = h3 >> 8; out[15] = h3; 191 | out[16] = h4 >> 24; out[17] = h4 >> 16; out[18] = h4 >> 8; out[19] = h4; 192 | out[20] = h5 >> 24; out[21] = h5 >> 16; out[22] = h5 >> 8; out[23] = h5; 193 | out[24] = h6 >> 24; out[25] = h6 >> 16; out[26] = h6 >> 8; out[27] = h6; 194 | out[28] = h7 >> 24; out[29] = h7 >> 16; out[30] = h7 >> 8; out[31] = h7; 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /src/sha256_armv8_neon_x1.S: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021-2024 Prysmatic Labs 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | */ 24 | 25 | #ifdef __aarch64__ 26 | .text 27 | .arch armv8-a 28 | .altmacro 29 | 30 | output .req x0 31 | input .req x1 32 | count .req x2 33 | last .req x2 34 | 35 | digest .req x19 36 | k256 .req x20 37 | padding .req x21 38 | 39 | VR0 .req v0 40 | VR1 .req v1 41 | VR2 .req v2 42 | VR3 .req v3 43 | QR0 .req q0 44 | 45 | VTMP0 .req v4 46 | VTMP1 .req v5 47 | VTMP2 .req v6 48 | VTMP3 .req v7 49 | VTMP4 .req v17 50 | VTMP5 .req v18 51 | VTMP6 .req v19 52 | KV0 .req v20 53 | KV1 .req v21 54 | KV2 .req v22 55 | KV3 .req v23 56 | KQ0 .req q20 57 | KQ1 .req q21 58 | KQ2 .req q22 59 | KQ3 .req q23 60 | 61 | VZ .req v16 62 | 63 | A_ .req w3 64 | B_ .req w4 65 | C_ .req w5 66 | D_ .req w6 67 | E_ .req w7 68 | F_ .req w9 69 | G_ .req w10 70 | H_ .req w11 71 | 72 | AX_ .req x3 73 | BX_ .req x4 74 | CX_ .req x5 75 | DX_ .req x6 76 | EX_ .req x7 77 | FX_ .req x9 78 | GX_ .req x10 79 | HX_ .req x11 80 | 81 | 82 | T1 .req w12 83 | T2 .req w13 84 | T3 .req w14 85 | T4 .req w15 86 | T5 .req w22 87 | 88 | ################################################################################################### 89 | # 90 | # the functions roundx_sched for x=1..4 schedule 4 words and perform 4 rounds at the time, interleaving 91 | # the ASIMD instructions with the ALU ones for better use of the RISC pipeline 92 | # 93 | ################################################################################################## 94 | .macro round1_sched A, B, C, D, E, F, G, H, VV0, VV1, VV2, VV3 95 | ext VTMP0.16b, \VV2\().16b, \VV3\().16b, #4 96 | ror T1, \E, #6 97 | ldr T3, [sp] 98 | ror T2, \A, #2 99 | ror T4, \A, #13 100 | 101 | ext VTMP1.16b, \VV0\().16b, \VV1\().16b, #4 // (W1..W4) 102 | eor T2, T2, T4 103 | add \H, 
\H, T3 104 | ror T3, \E, #11 105 | 106 | add VTMP0.4s, VTMP0.4s, \VV0\().4s // (W0 + W9..W3 + W12) 107 | eor T1, T1, T3 108 | ror T3, \E, #25 109 | ror T4, \A, #22 110 | 111 | ushr VTMP2.4s, VTMP1.4s, #7 112 | eor T1, T1, T3 // Sigma_1 113 | eor T2, T2, T4 // Sigma_0 114 | eor T3, \F, \G 115 | 116 | shl VTMP3.4s, VTMP1.4s, #(32-7) 117 | eor T4, \A, \C 118 | and T3, T3, \E 119 | and T4, T4, \B 120 | eor T3, T3, \G // CH 121 | 122 | ushr VTMP4.4s, VTMP1.4s, #18 123 | add T1, T1, T3 124 | and T3, \A, \C 125 | add \H, \H, T1 126 | 127 | orr VTMP3.16b, VTMP3.16b, VTMP2.16b // Ror^7(W1..W4) 128 | eor T4, T4, T3 // MAJ 129 | add \D, \D, \H 130 | add T2, T2, T4 131 | 132 | ushr VTMP2.4s, VTMP1.4s, #3 133 | add \H, \H, T2 134 | .endm 135 | 136 | .macro round2_sched A, B, C, D, E, F, G, H, VV3 137 | ldr T3, [sp, #4] 138 | ror T1, \E, #6 139 | shl VTMP1.4s, VTMP1.4s, #(32-18) 140 | ror T2, \A, #2 141 | ror T4, \A, #13 142 | add \H, \H, T3 143 | eor VTMP3.16b, VTMP3.16b, VTMP2.16b 144 | ror T3, \E, #11 145 | eor T2, T2, T4 146 | eor T1, T1, T3 147 | eor VTMP1.16b, VTMP4.16b, VTMP1.16b // Ror^18(W1..W4) 148 | ror T3, \E, #25 149 | ror T4, \A, #22 150 | eor T1, T1, T3 // Sigma_1 151 | zip2 VTMP5.4s, \VV3\().4s, \VV3\().4s // (W12, W12, W13, W13) 152 | eor T2, T2, T4 // Sigma_0 153 | eor T3, \F, \G 154 | eor T4, \A, \C 155 | eor VTMP1.16b, VTMP3.16b, VTMP1.16b // sigma_0(W1..W4) 156 | and T3, T3, \E 157 | and T4, T4, \B 158 | eor T3, T3, \G // CH 159 | ushr VTMP6.4s, VTMP5.4s, #10 160 | add T1, T1, T3 161 | and T3, \A, \C 162 | add \H, \H, T1 163 | ushr VTMP3.2d, VTMP5.2d, #19 // Ror^19(W12, x, W13, x) 164 | eor T4, T4, T3 // MAJ 165 | add \D, \D, \H 166 | add T2, T2, T4 167 | ushr VTMP2.2d, VTMP5.2d, #17 // Ror^17(W12, x, W13, x) 168 | add \H, \H, T2 169 | .endm 170 | 171 | .macro round3_sched A, B, C, D, E, F, G, H 172 | ldr T3, [sp, #8] 173 | ror T1, \E, #6 174 | eor VTMP3.16b, VTMP3.16b, VTMP6.16b 175 | ror T2, \A, #2 176 | ror T4, \A, #13 177 | add \H, \H, T3 178 | add 
VTMP0.4s, VTMP0.4s, VTMP1.4s // W0 + W9 + sigma_0 179 | ror T3, \E, #11 180 | eor T2, T2, T4 181 | eor T1, T1, T3 182 | eor VTMP1.16b, VTMP3.16b, VTMP2.16b // sigma_1(W12, x, W13, x) 183 | ror T3, \E, #25 184 | ror T4, \A, #22 185 | eor T1, T1, T3 // Sigma_1 186 | xtn VTMP1.2s, VTMP1.2d // sigma_1(W12, W13, 0, 0) 187 | eor T2, T2, T4 // Sigma_0 188 | eor T3, \F, \G 189 | eor T4, \A, \C 190 | add VTMP0.4s, VTMP0.4s, VTMP1.4s // (W16, W17,..) 191 | and T3, T3, \E 192 | and T4, T4, \B 193 | eor T3, T3, \G // CH 194 | zip1 VTMP2.4s, VTMP0.4s, VTMP0.4s // (W16, W16, W17, W17) 195 | add T1, T1, T3 196 | and T3, \A, \C 197 | add \H, \H, T1 198 | eor T4, T4, T3 // MAJ 199 | add \D, \D, \H 200 | add T2, T2, T4 201 | ushr VTMP1.4s, VTMP2.4s, #10 202 | add \H, \H, T2 203 | .endm 204 | 205 | .macro round4_sched A, B, C, D, E, F, G, H, VV0 206 | ldr T3, [sp, #12] 207 | ror T1, \E, #6 208 | ror T2, \A, #2 209 | ushr VTMP3.2d, VTMP2.2d, #19 // Ror^19(W16, x, W17, x) 210 | ror T4, \A, #13 211 | add \H, \H, T3 212 | ror T3, \E, #11 213 | eor T2, T2, T4 214 | ushr VTMP2.2d, VTMP2.2d, #17 // ROR^17(W16, x, W17, x) 215 | eor T1, T1, T3 216 | ror T3, \E, #25 217 | ror T4, \A, #22 218 | eor T1, T1, T3 // Sigma_1 219 | eor VTMP1.16b, VTMP1.16b, VTMP3.16b 220 | eor T2, T2, T4 // Sigma_0 221 | eor T3, \F, \G 222 | eor T4, \A, \C 223 | eor VTMP1.16b, VTMP1.16b, VTMP2.16b // sigma_1(W16, x, W17, x) 224 | and T3, T3, \E 225 | and T4, T4, \B 226 | eor T3, T3, \G // CH 227 | uzp1 VTMP1.4s, VZ.4s, VTMP1.4s 228 | add T1, T1, T3 229 | and T3, \A, \C 230 | add \H, \H, T1 231 | eor T4, T4, T3 // MAJ 232 | add \D, \D, \H 233 | add T2, T2, T4 234 | add \VV0\().4s, VTMP1.4s, VTMP0.4s 235 | add \H, \H, T2 236 | .endm 237 | 238 | 239 | .macro four_rounds_sched A, B, C, D, E, F, G, H, VV0, VV1, VV2, VV3 240 | round1_sched \A, \B, \C, \D, \E, \F, \G, \H, \VV0, \VV1, \VV2, \VV3 241 | round2_sched \H, \A, \B, \C, \D, \E, \F, \G, \VV3 242 | round3_sched \G, \H, \A, \B, \C, \D, \E, \F 243 | round4_sched \F, 
\G, \H, \A, \B, \C, \D, \E, \VV0 244 | .endm 245 | 246 | ################################################################################### 247 | # one_round performs a one round transition of the working variables A..H 248 | # it reads pre-scheduled words from ptr + offset. 249 | ################################################################################## 250 | .macro one_round A, B, C, D, E, F, G, H, ptr, offset 251 | ldr T3, [\ptr, \offset] 252 | ror T1, \E, #6 253 | ror T2, \A, #2 254 | ror T4, \A, #13 255 | add \H, \H, T3 256 | ror T3, \E, #11 257 | eor T2, T2, T4 258 | eor T1, T1, T3 259 | ror T3, \E, #25 260 | ror T4, \A, #22 261 | eor T1, T1, T3 // Sigma_1 262 | eor T2, T2, T4 // Sigma_0 263 | eor T3, \F, \G 264 | eor T4, \A, \C 265 | and T3, T3, \E 266 | and T4, T4, \B 267 | eor T3, T3, \G // CH 268 | add T1, T1, T3 269 | and T3, \A, \C 270 | add \H, \H, T1 271 | eor T4, T4, T3 // MAJ 272 | add \D, \D, \H 273 | add T2, T2, T4 274 | add \H, \H, T2 275 | .endm 276 | 277 | ############################################################################## 278 | # 279 | # four_rounds performs 4 transitions of the working variables A..H. 
280 | # it reads pre-scheduled words from ptr+offset 281 | # 282 | ############################################################################# 283 | .macro four_rounds A, B, C, D, E, F, G, H, ptr, offset 284 | one_round \A, \B, \C, \D, \E, \F, \G, \H, \ptr, \offset 285 | one_round \H, \A, \B, \C, \D, \E, \F, \G, \ptr, \offset + 4 286 | one_round \G, \H, \A, \B, \C, \D, \E, \F, \ptr, \offset + 8 287 | one_round \F, \G, \H, \A, \B, \C, \D, \E, \ptr, \offset + 12 288 | .endm 289 | 290 | ######################################################################################################## 291 | # 292 | # void sha256_armv8_neon_x1( unsigned char *output, unsigned char *input, size_t count) 293 | # 294 | # armv8-a implementation with Neon but no crypto extensions 295 | # as in the Cortex A-72 of a Raspberry-Pi 4.b 296 | # 297 | # It reads one block at a time, and schedules 4 words at the same time using ASIMD instructions 298 | # There are no bound checks, caller is responsible to check that memory up to output + 32*count 299 | # is writable. 
300 | # 301 | ######################################################################################################## 302 | 303 | #ifdef __APPLE__ 304 | .global _hashtree_sha256_neon_x1 305 | #else 306 | .global hashtree_sha256_neon_x1 307 | .type hashtree_sha256_neon_x1,%function 308 | #endif 309 | .align 4 310 | #ifdef __APPLE__ 311 | _hashtree_sha256_neon_x1: 312 | #else 313 | hashtree_sha256_neon_x1: 314 | #endif 315 | sub sp, sp, #64 316 | stp digest,k256, [sp, #48] 317 | 318 | movi VZ.4s, #0 319 | stp padding, x22, [sp, #32] 320 | #ifdef __APPLE__ 321 | adrp digest, .LDIGEST@PAGE 322 | add digest, digest, .LDIGEST@PAGEOFF 323 | adrp padding, .LPADDING@PAGE 324 | add padding, padding, .LPADDING@PAGEOFF 325 | #else 326 | adrp digest, .LDIGEST 327 | add digest, digest, #:lo12:.LDIGEST 328 | adrp padding, .LPADDING 329 | add padding, padding, #:lo12:.LPADDING 330 | #endif 331 | add last, output, count, lsl #5 332 | 333 | .Lhash_1_block_loop: 334 | # load one block 335 | cmp output, last 336 | beq .Larmv8_neon_x1_finish 337 | 338 | ld1 {VR0.4s, VR1.4s, VR2.4s, VR3.4s}, [input], #64 339 | #ifdef __APPLE__ 340 | adrp k256, .LK256@PAGE 341 | add k256, k256, #:lo12:.LK256@PAGEOFF 342 | #else 343 | adrp k256, .LK256 344 | add k256, k256, #:lo12:.LK256 345 | #endif 346 | # change endianness 347 | rev32 VR0.16b, VR0.16b 348 | rev32 VR1.16b, VR1.16b 349 | rev32 VR2.16b, VR2.16b 350 | rev32 VR3.16b, VR3.16b 351 | 352 | # load initial digest 353 | ldp A_, B_, [digest] 354 | ldp C_, D_, [digest, #8] 355 | ldp E_, F_, [digest, #16] 356 | ldp G_, H_, [digest, #24] 357 | 358 | .rept 3 359 | ld1 {KV0.4s, KV1.4s, KV2.4s, KV3.4s}, [k256], #64 360 | add KV0.4s, KV0.4s, VR0.4s 361 | str KQ0, [sp] 362 | four_rounds_sched A_, B_, C_, D_, E_, F_, G_, H_, VR0, VR1, VR2, VR3 363 | add KV1.4s, KV1.4s, VR1.4s 364 | str KQ1, [sp] 365 | four_rounds_sched E_, F_, G_, H_, A_, B_, C_, D_, VR1, VR2, VR3, VR0 366 | add KV2.4s, KV2.4s, VR2.4s 367 | str KQ2, [sp] 368 | four_rounds_sched A_, B_, 
C_, D_, E_, F_, G_, H_, VR2, VR3, VR0, VR1 369 | add KV3.4s, KV3.4s, VR3.4s 370 | str KQ3, [sp] 371 | four_rounds_sched E_, F_, G_, H_, A_, B_, C_, D_, VR3, VR0, VR1, VR2 372 | .endr 373 | .Lremaining_rounds: 374 | ld1 {KV0.4s, KV1.4s, KV2.4s, KV3.4s}, [k256], #64 375 | add KV0.4s, KV0.4s, VR0.4s 376 | str KQ0, [sp] 377 | four_rounds A_, B_, C_, D_, E_, F_, G_, H_, sp, #0 378 | add KV1.4s, KV1.4s, VR1.4s 379 | str KQ1, [sp] 380 | four_rounds E_, F_, G_, H_, A_, B_, C_, D_, sp, #0 381 | add KV2.4s, KV2.4s, VR2.4s 382 | str KQ2, [sp] 383 | four_rounds A_, B_, C_, D_, E_, F_, G_, H_, sp, #0 384 | add KV3.4s, KV3.4s, VR3.4s 385 | str KQ3, [sp] 386 | four_rounds E_, F_, G_, H_, A_, B_, C_, D_, sp, #0 387 | 388 | .Lrounds_with_padding: 389 | ldp T1, T2, [digest] 390 | ldp T3, T4, [digest, #8] 391 | add A_, A_, T1 392 | add B_, B_, T2 393 | add C_, C_, T3 394 | add D_, D_, T4 395 | ldp T1, T2, [digest, #16] 396 | stp A_, B_, [sp] 397 | stp C_, D_, [sp, #8] 398 | ldp T3, T4, [digest, #24] 399 | add E_, E_, T1 400 | add F_, F_, T2 401 | add G_, G_, T3 402 | stp E_, F_, [sp, #16] 403 | add H_, H_, T4 404 | stp G_, H_, [sp, #24] 405 | 406 | .irp i,0,1,2,3,4,5,6,7 407 | four_rounds A_, B_, C_, D_, E_, F_, G_, H_, padding, \i * 32 408 | four_rounds E_, F_, G_, H_, A_, B_, C_, D_, padding, \i * 32 + 16 409 | .endr 410 | 411 | ldp T1, T2, [sp] 412 | ldp T3, T4, [sp, #8] 413 | add A_, A_, T1 414 | add B_, B_, T2 415 | rev32 AX_, AX_ 416 | rev32 BX_, BX_ 417 | add C_, C_, T3 418 | add D_, D_, T4 419 | stp A_, B_, [output], #8 420 | ldp T1, T2, [sp, #16] 421 | rev32 CX_, CX_ 422 | rev32 DX_, DX_ 423 | stp C_, D_, [output], #8 424 | ldp T3, T4, [sp, #24] 425 | add E_, E_, T1 426 | add F_, F_, T2 427 | rev32 EX_, EX_ 428 | rev32 FX_, FX_ 429 | add G_, G_, T3 430 | add H_, H_, T4 431 | rev32 GX_, GX_ 432 | rev32 HX_, HX_ 433 | stp E_, F_, [output], #8 434 | stp G_, H_, [output], #8 435 | 436 | b .Lhash_1_block_loop 437 | 438 | .Larmv8_neon_x1_finish: 439 | ldp digest,k256, [sp, #48] 
440 | ldp padding, x22, [sp, #32] 441 | add sp, sp, #64 442 | ret 443 | 444 | .section .rodata, "a" 445 | .align 4 446 | .LDIGEST: 447 | .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,\ 448 | 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 449 | .LK256: 450 | .word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,\ 451 | 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,\ 452 | 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,\ 453 | 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,\ 454 | 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,\ 455 | 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,\ 456 | 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,\ 457 | 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,\ 458 | 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,\ 459 | 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,\ 460 | 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,\ 461 | 0xd192e819,0xd6990624,0xf40e3585,0x106aa070,\ 462 | 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,\ 463 | 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,\ 464 | 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,\ 465 | 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 466 | 467 | .LPADDING: 468 | .word 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,\ 469 | 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,\ 470 | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,\ 471 | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,\ 472 | 0x649b69c1, 0xf0fe4786, 0xfe1edc6, 0x240cf254,\ 473 | 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,\ 474 | 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,\ 475 | 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,\ 476 | 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,\ 477 | 0xa35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,\ 478 | 0x7f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,\ 479 | 0x17406110, 0xd8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,\ 480 | 0x83613bda, 0xdb48a363, 0xb02e931, 0x6fd15ca7,\ 481 | 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,\ 482 | 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,\ 483 | 0xd2c741c6, 0x7237ea3, 0xa4954b68, 0x4c191d76 484 
| #endif 485 | -------------------------------------------------------------------------------- /src/sha256_avx_x4.S: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021-2024 Prysmatic Labs 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | 24 | This code is based on Intel's implementation found in 25 | https://github.com/intel/intel-ipsec-mb 26 | Such software is licensed under the BSD 3-Clause License and is 27 | Copyright (c) 2012-2023, Intel Corporation 28 | */ 29 | 30 | #ifdef __x86_64__ 31 | .intel_syntax noprefix 32 | .section .rodata 33 | .align 64 34 | 35 | .LK256_4: // SHA-256 round constants (FIPS 180-4), each 32-bit K broadcast to all 4 SIMD lanes 36 | .quad 0x428a2f98428a2f98, 0x428a2f98428a2f98 37 | .quad 0x7137449171374491, 0x7137449171374491 38 | .quad 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf 39 | .quad 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 40 | .quad 0x3956c25b3956c25b, 0x3956c25b3956c25b 41 | .quad 0x59f111f159f111f1, 0x59f111f159f111f1 42 | .quad 0x923f82a4923f82a4, 0x923f82a4923f82a4 43 | .quad 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 44 | .quad 0xd807aa98d807aa98, 0xd807aa98d807aa98 45 | .quad 0x12835b0112835b01, 0x12835b0112835b01 46 | .quad 0x243185be243185be, 0x243185be243185be 47 | .quad 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 48 | .quad 0x72be5d7472be5d74, 0x72be5d7472be5d74 49 | .quad 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe 50 | .quad 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 51 | .quad 0xc19bf174c19bf174, 0xc19bf174c19bf174 52 | .quad 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 53 | .quad 0xefbe4786efbe4786, 0xefbe4786efbe4786 54 | .quad 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 55 | .quad 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc 56 | .quad 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f 57 | .quad 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa 58 | .quad 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc 59 | .quad 0x76f988da76f988da, 0x76f988da76f988da 60 | .quad 0x983e5152983e5152, 0x983e5152983e5152 61 | .quad 0xa831c66da831c66d, 0xa831c66da831c66d 62 | .quad 0xb00327c8b00327c8, 0xb00327c8b00327c8 63 | .quad 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 64 | .quad 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 65 | .quad 0xd5a79147d5a79147, 0xd5a79147d5a79147 66 | .quad 0x06ca635106ca6351, 0x06ca635106ca6351 67 | .quad 0x1429296714292967, 0x1429296714292967 68 | .quad 0x27b70a8527b70a85, 0x27b70a8527b70a85 69 | .quad 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 70 | .quad 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc 71 | .quad 0x53380d1353380d13, 0x53380d1353380d13 72 | .quad 0x650a7354650a7354, 0x650a7354650a7354 73 | .quad 0x766a0abb766a0abb, 0x766a0abb766a0abb 74 | .quad 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e 75 | .quad 0x92722c8592722c85, 0x92722c8592722c85 76 | .quad 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 77 | .quad 0xa81a664ba81a664b, 0xa81a664ba81a664b 78 | .quad 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 79 | .quad 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 80 | .quad 0xd192e819d192e819, 0xd192e819d192e819 81 | .quad 0xd6990624d6990624, 0xd6990624d6990624 82 | .quad 0xf40e3585f40e3585, 0xf40e3585f40e3585 83 | .quad 0x106aa070106aa070, 0x106aa070106aa070 84 | .quad 0x19a4c11619a4c116, 0x19a4c11619a4c116 85 | .quad 0x1e376c081e376c08, 0x1e376c081e376c08 86 | .quad 0x2748774c2748774c, 0x2748774c2748774c 87 | .quad 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 88 | .quad 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 89 | .quad 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a 90 | .quad 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f 91 | .quad 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 92 | .quad 0x748f82ee748f82ee, 0x748f82ee748f82ee 93 | .quad 0x78a5636f78a5636f, 0x78a5636f78a5636f 94 | .quad 0x84c8781484c87814, 0x84c8781484c87814 95 | .quad 0x8cc702088cc70208, 0x8cc702088cc70208 96 | .quad 0x90befffa90befffa, 0x90befffa90befffa 97 | .quad 0xa4506ceba4506ceb, 0xa4506ceba4506ceb 98 | .quad 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 99 | .quad 0xc67178f2c67178f2, 0xc67178f2c67178f2 100 | 101 | .LPADDING_4: // K constants pre-added with the message schedule of the fixed padding block of a 64-byte message, broadcast x4 (first word 0xc28a2f98 = 0x428a2f98 + 0x80000000) 102 | .quad 0xc28a2f98c28a2f98, 0xc28a2f98c28a2f98 103 | .quad 0x7137449171374491, 0x7137449171374491 104 | .quad 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf 105 | .quad 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 106 | .quad 0x3956c25b3956c25b, 0x3956c25b3956c25b 107 | .quad 0x59f111f159f111f1, 0x59f111f159f111f1 108 | .quad 0x923f82a4923f82a4, 0x923f82a4923f82a4 109 | .quad 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 110 | .quad 0xd807aa98d807aa98, 0xd807aa98d807aa98 111 | .quad 0x12835b0112835b01, 0x12835b0112835b01 112 | .quad 0x243185be243185be, 0x243185be243185be 113 | .quad 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 114 | .quad 0x72be5d7472be5d74, 0x72be5d7472be5d74 115 | .quad 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe 116 | .quad 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 117 | .quad 0xc19bf374c19bf374, 0xc19bf374c19bf374 118 | .quad 0x649b69c1649b69c1, 0x649b69c1649b69c1 119 | .quad 0xf0fe4786f0fe4786, 0xf0fe4786f0fe4786 120 | .quad 0x0fe1edc60fe1edc6, 0x0fe1edc60fe1edc6 121 | .quad 0x240cf254240cf254, 0x240cf254240cf254 122 | .quad 0x4fe9346f4fe9346f, 0x4fe9346f4fe9346f 123 | .quad 0x6cc984be6cc984be, 0x6cc984be6cc984be 124 | .quad 0x61b9411e61b9411e, 0x61b9411e61b9411e 125 | .quad 0x16f988fa16f988fa, 0x16f988fa16f988fa 126 | .quad 0xf2c65152f2c65152, 0xf2c65152f2c65152 127 | .quad 0xa88e5a6da88e5a6d, 0xa88e5a6da88e5a6d 128 | .quad 0xb019fc65b019fc65, 0xb019fc65b019fc65 129 | .quad 0xb9d99ec7b9d99ec7, 0xb9d99ec7b9d99ec7 130 | .quad 0x9a1231c39a1231c3, 0x9a1231c39a1231c3 131 | .quad 0xe70eeaa0e70eeaa0, 0xe70eeaa0e70eeaa0 132 | .quad 0xfdb1232bfdb1232b, 0xfdb1232bfdb1232b 133 | .quad 0xc7353eb0c7353eb0, 0xc7353eb0c7353eb0 134 | .quad 0x3069bad53069bad5, 0x3069bad53069bad5 135 | .quad 0xcb976d5fcb976d5f, 0xcb976d5fcb976d5f 136 | .quad 0x5a0f118f5a0f118f, 0x5a0f118f5a0f118f 137 | .quad 0xdc1eeefddc1eeefd, 0xdc1eeefddc1eeefd 138 | .quad 0x0a35b6890a35b689, 0x0a35b6890a35b689 139 | .quad 0xde0b7a04de0b7a04, 0xde0b7a04de0b7a04 140 | .quad 0x58f4ca9d58f4ca9d, 0x58f4ca9d58f4ca9d 141 | .quad 0xe15d5b16e15d5b16, 0xe15d5b16e15d5b16 142 | .quad 0x007f3e86007f3e86, 0x007f3e86007f3e86 143 | .quad 0x3708898037088980, 0x3708898037088980 144 | .quad 0xa507ea32a507ea32, 0xa507ea32a507ea32 145 | .quad 0x6fab95376fab9537, 0x6fab95376fab9537 146 | .quad 0x1740611017406110, 0x1740611017406110 147 | .quad 0x0d8cd6f10d8cd6f1, 0x0d8cd6f10d8cd6f1 148 | .quad 0xcdaa3b6dcdaa3b6d, 0xcdaa3b6dcdaa3b6d 149 | .quad 0xc0bbbe37c0bbbe37, 0xc0bbbe37c0bbbe37 150 | .quad 0x83613bda83613bda, 0x83613bda83613bda 151 | .quad 0xdb48a363db48a363, 0xdb48a363db48a363 152 | .quad 0x0b02e9310b02e931, 0x0b02e9310b02e931 153 | .quad 0x6fd15ca76fd15ca7, 0x6fd15ca76fd15ca7 154 | .quad 0x521afaca521afaca, 0x521afaca521afaca 155 | .quad 0x3133843131338431, 0x3133843131338431 156 | .quad 0x6ed41a956ed41a95, 0x6ed41a956ed41a95 157 | .quad 0x6d4378906d437890, 0x6d4378906d437890 158 | .quad 0xc39c91f2c39c91f2, 0xc39c91f2c39c91f2 159 | .quad 0x9eccabbd9eccabbd, 0x9eccabbd9eccabbd 160 | .quad 0xb5c9a0e6b5c9a0e6, 0xb5c9a0e6b5c9a0e6 161 | .quad 0x532fb63c532fb63c, 0x532fb63c532fb63c 162 | .quad 0xd2c741c6d2c741c6, 0xd2c741c6d2c741c6 163 | .quad 0x07237ea307237ea3, 0x07237ea307237ea3 164 | .quad 0xa4954b68a4954b68, 0xa4954b68a4954b68 165 | .quad 0x4c191d764c191d76, 0x4c191d764c191d76 166 | 167 | .LDIGEST_4: // initial hash state H0..H7 (FIPS 180-4), broadcast x4 in pre-transposed (one-word-per-row) layout 168 | .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 169 | .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 170 | .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 171 | .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a 172 | .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f 173 | .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c 174 | .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab 175 | .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 176 | 177 | .LPSHUFFLE_BYTE_FLIP_MASK: // pshufb mask: byte-swap each 32-bit word (message words are big-endian) 178 | .quad 0x0405060700010203, 0x0c0d0e0f08090a0b 179 | 180 | #ifdef __WIN64__ 181 | .equiv OUTPUT_PTR, rcx // 1st arg 182 | .equiv DATA_PTR, rdx // 2nd arg 183 | .equiv NUM_BLKS, r8 // 3rd arg 184 | .equiv TBL, rax 185 | #else 186 | .equiv OUTPUT_PTR, rdi // 1st arg 187 | .equiv DATA_PTR, rsi // 2nd arg 188 | .equiv NUM_BLKS, rdx // 3rd arg 189 | .equiv TBL, rcx 190 | #endif 191 | 192 | .equiv ROUND, r9 // byte offset into the round-constant table; advanced by SZ4 per round 193 | 194 | .equ a, xmm0 195 | .equ b, xmm1 196 | .equ c, xmm2 197 | .equ d, xmm3 198 | .equ e, xmm4 199 | .equ f, xmm5 200 | .equ g, xmm6 201 | .equ h, xmm7 // a..h: working state, one 4-lane vector each; names are rotated by ROTATE_ARGS (hence .equ, which allows redefinition) 202 | 203 | .equiv a0, xmm8 204 | .equiv
a1, xmm9 205 | .equiv a2, xmm10 206 | 207 | .equiv TT0, xmm14 208 | .equiv TT1, xmm13 209 | .equiv TT2, xmm12 210 | .equiv TT3, xmm11 211 | .equiv TT4, xmm10 212 | .equiv TT5, xmm9 213 | 214 | .equiv T1, xmm14 215 | .equiv TMP, xmm15 216 | 217 | #define SZ4 16 218 | #define SHA256_DIGEST_WORD_SIZE 4 219 | #define NUM_SHA256_DIGEST_WORDS 8 220 | #define ROUNDS 1024 // 64 rounds * SZ4 bytes per table entry 221 | 222 | // stack usage 223 | #define _DATA 0 224 | #define _DIGEST 256 225 | #ifdef __WIN64__ 226 | #define _XMM_SAVE 384 227 | #define sha256_avx_4_stack_size 568 228 | #else 229 | #define sha256_avx_4_stack_size 408 230 | #endif 231 | 232 | #define VMOVPS vmovups 233 | 234 | .macro TRANSPOSE r0, r1, r2, r3, t0, t1 // 4x4 dword transpose; results land in t0, r1, r0, r3 235 | vshufps \t0, \r0, \r1, 0x44 // t0 = {b1 b0 a1 a0} 236 | vshufps \r0, \r0, \r1, 0xEE // r0 = {b3 b2 a3 a2} 237 | 238 | vshufps \t1, \r2, \r3, 0x44 // t1 = {d1 d0 c1 c0} 239 | vshufps \r2, \r2, \r3, 0xEE // r2 = {d3 d2 c3 c2} 240 | 241 | vshufps \r1, \t0, \t1, 0xDD // r1 = {d1 c1 b1 a1} 242 | 243 | vshufps \r3, \r0, \r2, 0xDD // r3 = {d3 c3 b3 a3} 244 | 245 | vshufps \r0, \r0, \r2, 0x88 // r0 = {d2 c2 b2 a2} 246 | vshufps \t0, \t0, \t1, 0x88 // t0 = {d0 c0 b0 a0} 247 | .endm 248 | 249 | .macro ROTATE_ARGS // rotate the symbolic register names a..h after each round 250 | .equ TMP_, h 251 | .equ h, g 252 | .equ g, f 253 | .equ f, e 254 | .equ e, d 255 | .equ d, c 256 | .equ c, b 257 | .equ b, a 258 | .equ a, TMP_ 259 | .endm 260 | 261 | 262 | .macro PRORD3 reg, imm, tmp // reg = ror32(reg, imm) via shift/shift/or (AVX has no vector rotate) 263 | vpslld \tmp, \reg, (32-(\imm)) 264 | vpsrld \reg, \reg, \imm 265 | vpor \reg, \reg, \tmp 266 | .endm 267 | 268 | .macro PRORD_nd4 reg, imm, tmp, src // reg = ror32(src, imm), non-destructive 269 | vpslld \tmp, \src, (32-(\imm)) 270 | vpsrld \reg, \src, \imm 271 | vpor \reg, \reg, \tmp 272 | .endm 273 | 274 | .macro PRORD src, imm 275 | PRORD3 \src, \imm, TMP 276 | .endm 277 | 278 | .macro PRORD_nd dst, src, amt 279 | PRORD_nd4 \dst, \amt, TMP, \src 280 | .endm 281 | 282 | .macro ROUND_00_15 T1, index // one SHA-256 round across 4 lanes; expansion depends on .Lpadding at the use site 283 | PRORD_nd a0, e, (11-6) // sig1: a0 = (e >> 5) 284 | 285 | vpxor a2, f, g // ch: a2 = f^g 286 | vpand a2, a2, e // ch: a2 = (f^g)&e 287 | vpxor a2, a2, g // a2 = ch 288 | 289 | PRORD_nd a1, e, 25 // sig1: a1 = (e >> 25) 290 | .if .Lpadding - 1 291 | vmovdqa [SZ4*(\index&0xf) + rsp + _DATA], \T1 // normal mode: save W for the schedule recurrence, then add K 292 | vpaddd \T1, \T1, [TBL + ROUND] // T1 = W + K 293 | .else 294 | vmovdqa \T1, [TBL + ROUND] // padding mode: table already holds W+K merged 295 | .endif 296 | vpxor a0, a0, e // sig1: a0 = e ^ (e >> 5) 297 | PRORD a0, 6 // sig1: a0 = (e >> 6) ^ (e >> 11) 298 | vpaddd h, h, a2 // h = h + ch 299 | PRORD_nd a2, a, (13-2) // sig0: a2 = (a >> 11) 300 | vpaddd h, h, \T1 // h = h + ch + W + K 301 | vpxor a0, a0, a1 // a0 = sigma1 302 | PRORD_nd a1, a, 22 // sig0: a1 = (a >> 22) 303 | vpxor \T1, a, c // maj: T1 = a^c 304 | add ROUND, SZ4 // ROUND++ 305 | vpand \T1, \T1, b // maj: T1 = (a^c)&b 306 | vpaddd h, h, a0 307 | 308 | vpaddd d, d, h 309 | 310 | vpxor a2, a2, a // sig0: a2 = a ^ (a >> 11) 311 | PRORD a2, 2 // sig0: a2 = (a >> 2) ^ (a >> 13) 312 | vpxor a2, a2, a1 // a2 = sig0 313 | vpand a1, a, c // maj: a1 = a&c 314 | vpor a1, a1, \T1 // a1 = maj 315 | vpaddd h, h, a1 // h = h + ch + W + K + maj 316 | vpaddd h, h, a2 // h = h + ch + W + K + maj + sigma0 317 | 318 | ROTATE_ARGS 319 | .endm 320 | 321 | 322 | //; arguments passed implicitly in preprocessor symbols i, a...h 323 | .macro ROUND_16_XX T1, index // compute W[\index] from the schedule recurrence, then run one round 324 | vmovdqa \T1, [SZ4*((\index-15)&0xf) + rsp + _DATA] 325 | vmovdqa a1, [SZ4*((\index-2)&0xf) + rsp + _DATA] 326 | vmovdqa a0, \T1 327 | PRORD \T1, 18-7 328 | vmovdqa a2, a1 329 | PRORD a1, 19-17 330 | vpxor \T1, \T1, a0 331 | PRORD \T1, 7 332 | vpxor a1, a1, a2 333 | PRORD a1, 17 334 | vpsrld a0, a0, 3 335 | vpxor \T1, \T1, a0 336 | vpsrld a2, a2, 10 337 | vpxor a1, a1, a2 338 | vpaddd \T1, \T1, [SZ4*((\index-16)&0xf) + rsp + _DATA] 339 | vpaddd a1, a1, [SZ4*((\index-7)&0xf) + rsp + _DATA] 340 | vpaddd \T1, \T1, a1 341 | 342 | ROUND_00_15 \T1, \index 343 | .endm 344 | 345 | .text 346 | .global hashtree_sha256_avx_x4 347 | #ifndef __WIN64__ 348 | .type hashtree_sha256_avx_x4,%function 349 | #endif 350 | .align 16 351 |
hashtree_sha256_avx_x4: // args per the .equiv table above (OUTPUT_PTR, DATA_PTR, NUM_BLKS); hashes 4 independent 64-byte chunks per pass, each with the fixed IV plus a pre-merged padding compression, producing one 32-byte digest per chunk 352 | endbr64 353 | cmp NUM_BLKS, 0 354 | jne .Lstart_routine 355 | ret 356 | .Lstart_routine: 357 | sub rsp, sha256_avx_4_stack_size 358 | #ifdef __WIN64__ 359 | vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6 360 | vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7 361 | vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8 362 | vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9 363 | vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10 364 | vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11 365 | vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12 366 | vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13 367 | vmovdqa [rsp + _XMM_SAVE + 8*16],xmm14 368 | vmovdqa [rsp + _XMM_SAVE + 9*16],xmm15 369 | #endif 370 | 371 | .Lsha256_4_avx_loop: 372 | .set .Lpadding, 0 // normal mode for ROUND_00_15 (compute and save the schedule) 373 | cmp NUM_BLKS, 4 374 | jl .Lsha256_4_avx_epilog 375 | 376 | xor ROUND, ROUND 377 | 378 | // Load the pre-transposed incoming digest. 379 | lea TBL, [rip + .LDIGEST_4] 380 | vmovdqa a,[TBL + 0*SZ4] 381 | vmovdqa b,[TBL + 1*SZ4] 382 | vmovdqa c,[TBL + 2*SZ4] 383 | vmovdqa d,[TBL + 3*SZ4] 384 | vmovdqa e,[TBL + 4*SZ4] 385 | vmovdqa f,[TBL + 5*SZ4] 386 | vmovdqa g,[TBL + 6*SZ4] 387 | vmovdqa h,[TBL + 7*SZ4] 388 | 389 | lea TBL, [rip + .LK256_4] 390 | 391 | .set .Li, 0 392 | .rept 4 // load and transpose 4x16 message bytes per iteration, 4 rounds each 393 | vmovdqa TMP, [rip + .LPSHUFFLE_BYTE_FLIP_MASK] 394 | VMOVPS TT2,[DATA_PTR + 0*64 + .Li*16] 395 | VMOVPS TT1,[DATA_PTR + 1*64 + .Li*16] 396 | VMOVPS TT4,[DATA_PTR + 2*64 + .Li*16] 397 | VMOVPS TT3,[DATA_PTR + 3*64 + .Li*16] 398 | TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 399 | vpshufb TT0, TT0, TMP 400 | vpshufb TT1, TT1, TMP 401 | vpshufb TT2, TT2, TMP 402 | vpshufb TT3, TT3, TMP 403 | ROUND_00_15 TT0, 4*.Li 404 | ROUND_00_15 TT1, 4*.Li + 1 405 | ROUND_00_15 TT2, 4*.Li + 2 406 | ROUND_00_15 TT3, 4*.Li + 3 407 | .set .Li, .Li+1 408 | .endr 409 | .set .Li, 4*.Li 410 | jmp .Lrounds_16_xx // jump over the alignment padding 411 | .align 16 412 | .Lrounds_16_xx: 413 | .rept 16 414 | ROUND_16_XX T1, .Li 415 | .set .Li, .Li+1 416 | .endr 417 | 418 | cmp ROUND,ROUNDS 419 | jb .Lrounds_16_xx 420 | 421 | // add old digest 422 | lea TBL, [rip + .LDIGEST_4] 423 | vpaddd a, a, [TBL + 0*SZ4] 424 | vpaddd b, b, [TBL + 1*SZ4] 425 | vpaddd c, c, [TBL + 2*SZ4] 426 | vpaddd d, d, [TBL + 3*SZ4] 427 | vpaddd e, e, [TBL + 4*SZ4] 428 | vpaddd f, f, [TBL + 5*SZ4] 429 | vpaddd g, g, [TBL + 6*SZ4] 430 | vpaddd h, h, [TBL + 7*SZ4] 431 | 432 | // rounds with padding 433 | 434 | // save old digest 435 | 436 | vmovdqa [rsp + _DIGEST + 0*SZ4], a 437 | vmovdqa [rsp + _DIGEST + 1*SZ4], b 438 | vmovdqa [rsp + _DIGEST + 2*SZ4], c 439 | vmovdqa [rsp + _DIGEST + 3*SZ4], d 440 | vmovdqa [rsp + _DIGEST + 4*SZ4], e 441 | vmovdqa [rsp + _DIGEST + 5*SZ4], f 442 | vmovdqa [rsp + _DIGEST + 6*SZ4], g 443 | vmovdqa [rsp + _DIGEST + 7*SZ4], h 444 | 445 | lea TBL, [rip + .LPADDING_4] // second compression: fixed padding block of a 64-byte message, W+K pre-merged in the table 446 | xor ROUND,ROUND 447 | jmp .Lrounds_padding 448 | 449 | .align 16 450 | .Lrounds_padding: 451 | .set .Lpadding, 1 // padding mode: ROUND_00_15 reads W+K directly from .LPADDING_4 452 | .rept 64 453 | ROUND_00_15 T1, 0 454 | .endr 455 | 456 | // add old digest 457 | vpaddd a, a, [rsp + _DIGEST + 0*SZ4] 458 | vpaddd b, b, [rsp + _DIGEST + 1*SZ4] 459 | vpaddd c, c, [rsp + _DIGEST + 2*SZ4] 460 | vpaddd d, d, [rsp + _DIGEST + 3*SZ4] 461 | vpaddd e, e, [rsp + _DIGEST + 4*SZ4] 462 | vpaddd f, f, [rsp + _DIGEST + 5*SZ4] 463 | vpaddd g, g, [rsp + _DIGEST + 6*SZ4] 464 | vpaddd h, h, [rsp + _DIGEST + 7*SZ4] 465 | 466 | // transpose the digest and convert to little endian to get the registers correctly 467 | 468 | TRANSPOSE a, b, c, d, TT0, TT1 469 | TRANSPOSE e, f, g, h, TT2, TT1 470 | 471 | vmovdqa TMP, [rip + .LPSHUFFLE_BYTE_FLIP_MASK] 472 | vpshufb TT0, TT0, TMP 473 | vpshufb TT2, TT2, TMP 474 | vpshufb b, b, TMP 475 | vpshufb f, f, TMP 476 | vpshufb a, a, TMP 477 | vpshufb e, e, TMP 478 | vpshufb d, d, TMP 479 | vpshufb h, h, TMP 480 | 481 | 482 | // write to output 483 | 484 | vmovdqu [OUTPUT_PTR + 0*SZ4],TT0 485 | vmovdqu [OUTPUT_PTR + 1*SZ4],TT2 486 | vmovdqu [OUTPUT_PTR + 2*SZ4],b 487 | vmovdqu [OUTPUT_PTR + 3*SZ4],f 488 | vmovdqu [OUTPUT_PTR + 4*SZ4],a 489 | vmovdqu [OUTPUT_PTR + 5*SZ4],e 490 | vmovdqu [OUTPUT_PTR + 6*SZ4],d 491 | vmovdqu [OUTPUT_PTR + 7*SZ4],h 492 | 493 | // update pointers and loop 494 | 495 | add DATA_PTR, 64*4 496 | add OUTPUT_PTR, 32*4 497 | sub NUM_BLKS, 4 498 | jmp .Lsha256_4_avx_loop 499 | 500 | .Lsha256_4_avx_epilog: 501 | #ifdef __WIN64__ 502 | vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16] 503 | vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16] 504 | vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16] 505 | vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16] 506 | vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16] 507 | vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16] 508 | vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16] 509 | vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16] 510 | vmovdqa xmm14,[rsp + _XMM_SAVE + 8*16] 511 | vmovdqa xmm15,[rsp + _XMM_SAVE + 9*16] 512 | #endif 513 | 514 | add rsp, sha256_avx_4_stack_size 515 | jmp hashtree_sha256_avx_x1 // tail call: process the remaining NUM_BLKS (< 4) chunks one at a time 516 | #ifdef __linux__ 517 | .size hashtree_sha256_avx_x4,.-hashtree_sha256_avx_x4 518 | .section .note.GNU-stack,"",@progbits 519 | #endif 520 | #endif 521 | -------------------------------------------------------------------------------- /src/sha256_sse_x1.S: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021-2024 Prysmatic Labs 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software.
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | 24 | This code is based on Intel's implementation found in 25 | https://github.com/intel/intel-ipsec-mb 26 | Copied parts are 27 | Copyright (c) 2012-2021, Intel Corporation 28 | */ 29 | 30 | #ifdef __x86_64__ 31 | .intel_syntax noprefix 32 | 33 | .section .rodata 34 | .align 64 35 | .LK256: // SHA-256 round constants K0..K63 (FIPS 180-4) 36 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 37 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 38 | .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 39 | .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 40 | .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 41 | .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 42 | .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 43 | .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 44 | .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 45 | .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 46 | .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 47 | .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 48 | .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 49 | .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 50 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 51 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 52 | 53 | .LDIGEST: // SHA-256 initial hash values H0..H7 54 | .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a 55 | .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 56 | 57 | .LPADDING: // K pre-added with the schedule of the fixed padding block of a 64-byte message; used by DO_ROUND without re-scheduling 58 | .long 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 59 | .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 60 | .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 61 | .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374 62 | .long 0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254 63 | .long 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa 64 | .long 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7 65 | .long 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0 66 | .long 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd 67 | .long 0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16 68 | .long 0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537 69 | .long 0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37 70 | .long 0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7 71 | .long 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890 72 | .long 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c 73 | .long 0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76 74 | 75 | 76 | .LPSHUFFLE_BYTE_FLIP_MASK: //.longq 0x0c0d0e0f08090a0b0405060700010203 77 | .quad 0x0405060700010203, 0x0c0d0e0f08090a0b 78 | 79 | // shuffle xBxA -> 00BA 80 | .L_SHUF_00BA: //d.quad 0xFFFFFFFFFFFFFFFF0b0a090803020100 81 | .quad 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF 82 | 83 | // shuffle xDxC -> DC00 84 | .L_SHUF_DC00: //d.quad 0x0b0a090803020100FFFFFFFFFFFFFFFF 85 | .quad 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 86 | 87 | 88 | #define MOVDQ movdqu 89 | 90 | .macro COPY_XMM_AND_BSWAP t1, t2, t3 // load 16 bytes from \t2 into \t1 and byte-swap each dword via pshufb mask \t3 91 | MOVDQ \t1, \t2 92 | pshufb \t1, \t3 93 | .endm 94 | 95 | 96 | .equ X0, xmm4 97 | .equ X1, xmm5 98 | .equ X2, xmm6 99 | .equ X3, xmm7 // X0..X3: 16 schedule words; names rotated by rotate_Xs (hence .equ) 100 | 101 | .equiv XTMP0, xmm0 102 | .equiv XTMP1, xmm1 103 | .equiv XTMP2, xmm2 104 | .equiv XTMP3, xmm3 105 | .equiv XTMP4, xmm8 106 | .equiv XFER , xmm9 107 | 108 | .equiv SHUF_00BA, xmm10 109 | .equiv SHUF_DC00, xmm11 110 | .equiv BYTE_FLIP_MASK, xmm12 111 | 112 | #ifdef __WIN64__ 113 | .equiv OUTPUT_PTR, rcx 114 | .equiv DATA_PTR, rdx 115 | .equiv count, r8 116 | .equ c, edi 117 | .equ d, esi 118 | #else 119 | .equiv OUTPUT_PTR, rdi 120 | .equiv DATA_PTR, rsi 121 | .equiv count, rdx 122 | .equ c, ecx 123 | .equ d, r8d 124 | #endif 125 | .equiv TBL, rbp 126 | .equ a, eax
127 | .equ b, ebx 128 | .equ f, r9d 129 | .equ g, r10d 130 | .equ h, r11d 131 | .equ e, r12d 132 | 133 | .equiv y0, r13d 134 | .equiv y1, r14d 135 | .equiv y2, r15d 136 | 137 | # stack usage 138 | #ifdef __WIN64__ 139 | #define _XMM_SAVE 64 140 | #define STACK_SIZE 192 141 | #else 142 | #define STACK_SIZE 88 143 | #endif 144 | #define _DIGEST 32 145 | .macro ROTATE_ARGS # rotate the symbolic state names a..h after each round 146 | .equ TMP_, h 147 | .equ h, g 148 | .equ g, f 149 | .equ f, e 150 | .equ e, d 151 | .equ d, c 152 | .equ c, b 153 | .equ b, a 154 | .equ a, TMP_ 155 | .endm 156 | 157 | .macro rotate_Xs # rotate the symbolic schedule register names X0..X3 158 | .equ X_, X0 159 | .equ X0, X1 160 | .equ X1, X2 161 | .equ X2, X3 162 | .equ X3, X_ 163 | .endm 164 | 165 | .macro FOUR_ROUNDS_AND_SCHED # 4 scalar rounds interleaved with computing the next 4 schedule words into X0 166 | # compute s0 four at a time and s1 two at a time 167 | # compute W[-16] + W[-7] 4 at a time 168 | movdqa XTMP0, X3 169 | mov y0, e // y0 = e 170 | ror y0, (25-11) // y0 = e >> (25-11) 171 | mov y1, a // y1 = a 172 | palignr XTMP0, X2, 4 // XTMP0 = W[-7] 173 | ror y1, (22-13) // y1 = a >> (22-13) 174 | xor y0, e // y0 = e ^ (e >> (25-11)) 175 | mov y2, f // y2 = f 176 | ror y0, (11-6) // y0 = (e >> (11-6)) ^ (e >> (25-6)) 177 | movdqa XTMP1, X1 178 | xor y1, a // y1 = a ^ (a >> (22-13) 179 | xor y2, g // y2 = f^g 180 | paddd XTMP0, X0 // XTMP0 = W[-7] + W[-16] 181 | xor y0, e // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 182 | and y2, e // y2 = (f^g)&e 183 | ror y1, (13-2) // y1 = (a >> (13-2)) ^ (a >> (22-2)) 184 | # compute s0 185 | palignr XTMP1, X0, 4 // XTMP1 = W[-15] 186 | xor y1, a // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 187 | ror y0, 6 // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 188 | xor y2, g // y2 = CH = ((f^g)&e)^g 189 | movdqa XTMP2, XTMP1 // XTMP2 = W[-15] 190 | ror y1, 2 // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 191 | add y2, y0 // y2 = S1 + CH 192 | add y2, [rsp + 0*4] // y2 = k + w + S1 + CH 193 | movdqa XTMP3, XTMP1 // XTMP3 = W[-15] 194 | mov y0, a // y0 = a 195 | add h, y2 // h = h + S1 + CH + k + w 196 | mov y2, a // y2 = a 197 | pslld XTMP1, (32-7) 198 | or y0, c // y0 = a|c 199 | add d, h // d = d + h + S1 + CH + k + w 200 | and y2, c // y2 = a&c 201 | psrld XTMP2, 7 202 | and y0, b // y0 = (a|c)&b 203 | add h, y1 // h = h + S1 + CH + k + w + S0 204 | por XTMP1, XTMP2 // XTMP1 = W[-15] ror 7 205 | or y0, y2 // y0 = MAJ = (a|c)&b)|(a&c) 206 | add h, y0 // h = h + S1 + CH + k + w + S0 + MAJ 207 | 208 | ROTATE_ARGS 209 | movdqa XTMP2, XTMP3 // XTMP2 = W[-15] 210 | mov y0, e // y0 = e 211 | mov y1, a // y1 = a 212 | movdqa XTMP4, XTMP3 // XTMP4 = W[-15] 213 | ror y0, (25-11) // y0 = e >> (25-11) 214 | xor y0, e // y0 = e ^ (e >> (25-11)) 215 | mov y2, f // y2 = f 216 | ror y1, (22-13) // y1 = a >> (22-13) 217 | pslld XTMP3, (32-18) 218 | xor y1, a // y1 = a ^ (a >> (22-13) 219 | ror y0, (11-6) // y0 = (e >> (11-6)) ^ (e >> (25-6)) 220 | xor y2, g // y2 = f^g 221 | psrld XTMP2, 18 222 | ror y1, (13-2) // y1 = (a >> (13-2)) ^ (a >> (22-2)) 223 | xor y0, e // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 224 | and y2, e // y2 = (f^g)&e 225 | ror y0, 6 // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 226 | pxor XTMP1, XTMP3 227 | xor y1, a // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 228 | xor y2, g // y2 = CH = ((f^g)&e)^g 229 | psrld XTMP4, 3 // XTMP4 = W[-15] >> 3 230 | add y2, y0 // y2 = S1 + CH 231 | add y2, [rsp + 1*4] // y2 = k + w + S1 + CH 232 | ror y1, 2 // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 233 | pxor XTMP1, XTMP2 // XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 234 | mov y0, a // y0 = a 235 | add h, y2 // h = h + S1 + CH + k + w 236 | mov y2, a // y2 = a 237 | pxor XTMP1, XTMP4 // XTMP1 = s0 238 | or y0, c // y0 = a|c 239 | add d, h // d = d + h + S1 + CH + k + w 240 | and y2, c // y2 = a&c 241 | # compute low s1 242 | pshufd XTMP2, X3, 0b11111010 // XTMP2 = W[-2] {BBAA} 243 | and y0, b // y0 = (a|c)&b 244 | add h, y1 // h = h + S1 + CH + k + w + S0 245 | paddd XTMP0, XTMP1 // XTMP0 = W[-16] + W[-7] + s0 246 | or y0, y2 // y0 = MAJ = (a|c)&b)|(a&c) 247 | add h, y0 // h = h + S1 + CH + k + w + S0 + MAJ 248 | 249 | ROTATE_ARGS 250 | movdqa XTMP3, XTMP2 // XTMP3 = W[-2] {BBAA} 251 | mov y0, e // y0 = e 252 | mov y1, a // y1 = a 253 | ror y0, (25-11) // y0 = e >> (25-11) 254 | movdqa XTMP4, XTMP2 // XTMP4 = W[-2] {BBAA} 255 | xor y0, e // y0 = e ^ (e >> (25-11)) 256 | ror y1, (22-13) // y1 = a >> (22-13) 257 | mov y2, f // y2 = f 258 | xor y1, a // y1 = a ^ (a >> (22-13) 259 | ror y0, (11-6) // y0 = (e >> (11-6)) ^ (e >> (25-6)) 260 | psrlq XTMP2, 17 // XTMP2 = W[-2] ror 17 {xBxA} 261 | xor y2, g // y2 = f^g 262 | psrlq XTMP3, 19 // XTMP3 = W[-2] ror 19 {xBxA} 263 | xor y0, e // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 264 | and y2, e // y2 = (f^g)&e 265 | psrld XTMP4, 10 // XTMP4 = W[-2] >> 10 {BBAA} 266 | ror y1, (13-2) // y1 = (a >> (13-2)) ^ (a >> (22-2)) 267 | xor y1, a // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 268 | xor y2, g // y2 = CH = ((f^g)&e)^g 269 | ror y0, 6 // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 270 | pxor XTMP2, XTMP3 271 | add y2, y0 // y2 = S1 + CH 272 | ror y1, 2 // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 273 | add y2, [rsp + 2*4] // y2 = k + w + S1 + CH 274 | pxor XTMP4, XTMP2 // XTMP4 = s1 {xBxA} 275 | mov y0, a // y0 = a 276 | add h, y2 // h = h + S1 + CH + k + w 277 | mov y2, a // y2 = a 278 | pshufb XTMP4, SHUF_00BA // XTMP4 = s1 {00BA} 279 | or y0, c // y0 = a|c 280 | add d, h // d = d + h + S1 + CH + k + w 281 | and y2, c // y2 = a&c 282 | paddd XTMP0, XTMP4 // XTMP0 = {..., ..., W[1], W[0]} 283 | and y0, b // y0 = (a|c)&b 284 | add h, y1 // h = h + S1 + CH + k + w + S0 285 | # compute high s1 286 | pshufd XTMP2, XTMP0, 0b01010000 // XTMP2 = W[-2] {DDCC} 287 | or y0, y2 // y0 = MAJ = (a|c)&b)|(a&c) 288 | add h, y0 // h = h + S1 + CH + k + w + S0 + MAJ 289 | 290 | ROTATE_ARGS 291 | movdqa XTMP3, XTMP2 // XTMP3 = W[-2] {DDCC} 292 | mov y0, e // y0 = e 293 | ror y0, (25-11) // y0 = e >> (25-11) 294 | mov y1, a // y1 = a 295 | movdqa X0, XTMP2 // X0 = W[-2] {DDCC} 296 | ror y1, (22-13) // y1 = a >> (22-13) 297 | xor y0, e // y0 = e ^ (e >> (25-11)) 298 | mov y2, f // y2 = f 299 | ror y0, (11-6) // y0 = (e >> (11-6)) ^ (e >> (25-6)) 300 | psrlq XTMP2, 17 // XTMP2 = W[-2] ror 17 {xDxC} 301 | xor y1, a // y1 = a ^ (a >> (22-13) 302 | xor y2, g // y2 = f^g 303 | psrlq XTMP3, 19 // XTMP3 = W[-2] ror 19 {xDxC} 304 | xor y0, e // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 305 | and y2, e // y2 = (f^g)&e 306 | ror y1, (13-2) // y1 = (a >> (13-2)) ^ (a >> (22-2)) 307 | psrld X0, 10 // X0 = W[-2] >> 10 {DDCC} 308 | xor y1, a // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 309 | ror y0, 6 // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 310 | xor y2, g // y2 = CH = ((f^g)&e)^g 311 | pxor XTMP2, XTMP3 312 | ror y1, 2 // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 313 | add y2, y0 // y2 = S1 + CH 314 | add y2, [rsp + 3*4] // y2 = k + w + S1 + CH 315 | pxor X0, XTMP2 // X0 = s1 {xDxC} 316 | mov y0, a // y0 = a 317 | add h, y2 // h = h + S1 + CH + k + w 318 | mov y2, a // y2 = a 319 | pshufb X0, SHUF_DC00 // X0 = s1 {DC00} 320 | or y0, c // y0 = a|c 321 | add d, h // d = d + h + S1 + CH + k + w 322 | and y2, c // y2 = a&c 323 | paddd X0, XTMP0 // X0 = {W[3], W[2], W[1], W[0]} 324 | and y0, b // y0 = (a|c)&b 325 | add h, y1 // h = h + S1 + CH + k + w + S0 326 | or y0, y2 // y0 = MAJ = (a|c)&b)|(a&c) 327 | add h, y0 // h = h + S1 + CH + k + w + S0 + MAJ 328 | ROTATE_ARGS 329 | rotate_Xs 330 | .endm 331 | 332 | .macro DO_ROUND base, offset # one scalar SHA-256 round; W+K is read from memory at base+offset 333 | mov y0, e // y0 = e 334 | ror y0, (25-11) // y0 = e >> (25-11) 335 | mov y1, a // y1 = a 336 | xor y0, e // y0 = e ^ (e >> (25-11)) 337 | ror y1, (22-13) // y1 = a >> (22-13) 338 | mov y2, f // y2 = f 339 | xor y1, a // y1 = a ^ (a >> (22-13) 340 | ror y0, (11-6) // y0 = (e >> (11-6)) ^ (e >> (25-6)) 341 | xor y2, g // y2 = f^g 342 | xor y0, e // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 343 | ror y1, (13-2) // y1 = (a >> (13-2)) ^ (a >> (22-2)) 344 | and y2, e // y2 = (f^g)&e 345 | xor y1, a // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 346 | ror y0, 6 // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 347 | xor y2, g // y2 = CH = ((f^g)&e)^g 348 | add y2,
y0 // y2 = S1 + CH 349 | ror y1, 2 // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 350 | add y2, [\base + \offset] // y2 = k + w + S1 + CH 351 | mov y0, a // y0 = a 352 | add h, y2 // h = h + S1 + CH + k + w 353 | mov y2, a // y2 = a 354 | or y0, c // y0 = a|c 355 | add d, h // d = d + h + S1 + CH + k + w 356 | and y2, c // y2 = a&c 357 | and y0, b // y0 = (a|c)&b 358 | add h, y1 // h = h + S1 + CH + k + w + S0 359 | or y0, y2 // y0 = MAJ = (a|c)&b)|(a&c) 360 | add h, y0 // h = h + S1 + CH + k + w + S0 + MAJ 361 | ROTATE_ARGS 362 | .endm 363 | 364 | .text 365 | .global hashtree_sha256_sse_x1 366 | #ifndef __WIN64__ 367 | .type hashtree_sha256_sse_x1,%function 368 | #endif 369 | .align 32 370 | hashtree_sha256_sse_x1: // scalar+SSE fallback path (no VEX instructions in the loop): one 64-byte chunk per iteration, fixed IV plus pre-merged padding compression 371 | push rbx 372 | #ifdef __WIN64__ 373 | push r8 374 | push rsi 375 | push rdi 376 | #endif 377 | push rbp 378 | push r12 379 | push r13 380 | push r14 381 | push r15 382 | 383 | sub rsp,STACK_SIZE 384 | #ifdef __WIN64__ 385 | movdqa [rsp + _XMM_SAVE + 0*16],xmm6 386 | movdqa [rsp + _XMM_SAVE + 1*16],xmm7 387 | movdqa [rsp + _XMM_SAVE + 2*16],xmm8 388 | movdqa [rsp + _XMM_SAVE + 3*16],xmm9 389 | movdqa [rsp + _XMM_SAVE + 4*16],xmm10 390 | movdqa [rsp + _XMM_SAVE + 5*16],xmm11 391 | movdqa [rsp + _XMM_SAVE + 6*16],xmm12 392 | movdqa [rsp + _XMM_SAVE + 7*16],xmm13 393 | #endif 394 | movdqa BYTE_FLIP_MASK, [rip + .LPSHUFFLE_BYTE_FLIP_MASK] 395 | movdqa SHUF_00BA, [rip + .L_SHUF_00BA] 396 | movdqa SHUF_DC00, [rip + .L_SHUF_DC00] 397 | 398 | shl count, 5 399 | add count, OUTPUT_PTR // count = OUTPUT_PTR + 32*count: end-of-output sentinel for the loop 400 | 401 | .Lsha256_sse_1_block_loop: 402 | cmp OUTPUT_PTR, count 403 | je .Lsha256_1_sse_epilog 404 | 405 | 406 | # load initial digest 407 | lea TBL,[rip + .LDIGEST] 408 | mov a, [4*0 + TBL] 409 | mov b, [4*1 + TBL] 410 | mov c, [4*2 + TBL] 411 | mov d, [4*3 + TBL] 412 | mov e, [4*4 + TBL] 413 | mov f, [4*5 + TBL] 414 | mov g, [4*6 + TBL] 415 | mov h, [4*7 + TBL] 416 | 417 | lea TBL,[rip + .LK256] 418 | 419 | # byte swap first 16 dwords 420 | COPY_XMM_AND_BSWAP X0, [DATA_PTR + 0*16], BYTE_FLIP_MASK 421 | COPY_XMM_AND_BSWAP X1, [DATA_PTR + 1*16], BYTE_FLIP_MASK 422 | COPY_XMM_AND_BSWAP X2, [DATA_PTR + 2*16], BYTE_FLIP_MASK 423 | COPY_XMM_AND_BSWAP X3, [DATA_PTR + 3*16], BYTE_FLIP_MASK 424 | 425 | # schedule 48 input dwords, by doing 3 rounds of 16 each 426 | 427 | .rept 3 428 | .align 16 429 | movdqa XFER, [TBL + 0*16] 430 | paddd XFER, X0 431 | movdqa [rsp], XFER 432 | FOUR_ROUNDS_AND_SCHED 433 | 434 | movdqa XFER, [TBL + 1*16] 435 | paddd XFER, X0 436 | movdqa [rsp], XFER 437 | FOUR_ROUNDS_AND_SCHED 438 | 439 | movdqa XFER, [TBL + 2*16] 440 | paddd XFER, X0 441 | movdqa [rsp], XFER 442 | FOUR_ROUNDS_AND_SCHED 443 | 444 | movdqa XFER, [TBL + 3*16] 445 | paddd XFER, X0 446 | movdqa [rsp], XFER 447 | add TBL, 4*16 448 | FOUR_ROUNDS_AND_SCHED 449 | .endr 450 | 451 | .rept 2 // last 16 rounds: W is already complete in X0..X3, no more scheduling needed 452 | paddd X0, [TBL + 0*16] 453 | movdqa [rsp], X0 454 | DO_ROUND rsp, 0 455 | DO_ROUND rsp, 4 456 | DO_ROUND rsp, 8 457 | DO_ROUND rsp, 12 458 | paddd X1, [TBL + 1*16] 459 | movdqa [rsp], X1 460 | add TBL, 2*16 461 | DO_ROUND rsp, 0 462 | DO_ROUND rsp, 4 463 | DO_ROUND rsp, 8 464 | DO_ROUND rsp, 12 465 | 466 | movdqa X0, X2 467 | movdqa X1, X3 468 | .endr 469 | 470 | lea TBL,[rip + .LDIGEST] 471 | add a, [TBL + 0*4] 472 | add b, [TBL + 1*4] 473 | add c, [TBL + 2*4] 474 | add d, [TBL + 3*4] 475 | add e, [TBL + 4*4] 476 | add f, [TBL + 5*4] 477 | add g, [TBL + 6*4] 478 | add h, [TBL + 7*4] 479 | 480 | // rounds with padding 481 | 482 | // save old digest 483 | mov [rsp + _DIGEST + 0*4], a 484 | mov [rsp + _DIGEST + 1*4], b 485 | mov [rsp + _DIGEST + 2*4], c 486 | mov [rsp + _DIGEST + 3*4], d 487 | mov [rsp + _DIGEST + 4*4], e 488 | mov [rsp + _DIGEST + 5*4], f 489 | mov [rsp + _DIGEST + 6*4], g 490 | mov [rsp + _DIGEST + 7*4], h 491 | 492 | lea TBL,[rip + .LPADDING] // second compression over the fixed padding block; W+K pre-merged in the table 493 | 494 | .set .Li, 0 495 | .rept 64 496 | DO_ROUND TBL, .Li 497 | .set .Li, .Li+4 498 | .endr 499 | 500 | // add the previous digest 501 | add a, [rsp + _DIGEST + 0*4] 502 | add b, [rsp + _DIGEST + 1*4]
503 | add c, [rsp + _DIGEST + 2*4] 504 | add d, [rsp + _DIGEST + 3*4] 505 | add e, [rsp + _DIGEST + 4*4] 506 | add f, [rsp + _DIGEST + 5*4] 507 | add g, [rsp + _DIGEST + 6*4] 508 | add h, [rsp + _DIGEST + 7*4] 509 | 510 | // shuffle the bytes to little endian 511 | bswap a 512 | bswap b 513 | bswap c 514 | bswap d 515 | bswap e 516 | bswap f 517 | bswap g 518 | bswap h 519 | 520 | // write resulting hash 521 | mov [OUTPUT_PTR + 0*4], a 522 | mov [OUTPUT_PTR + 1*4], b 523 | mov [OUTPUT_PTR + 2*4], c 524 | mov [OUTPUT_PTR + 3*4], d 525 | mov [OUTPUT_PTR + 4*4], e 526 | mov [OUTPUT_PTR + 5*4], f 527 | mov [OUTPUT_PTR + 6*4], g 528 | mov [OUTPUT_PTR + 7*4], h 529 | 530 | add OUTPUT_PTR, 32 531 | add DATA_PTR, 64 532 | jmp .Lsha256_sse_1_block_loop 533 | 534 | .Lsha256_1_sse_epilog: 535 | 536 | #ifdef __WIN64__ 537 | movdqa xmm6,[rsp + _XMM_SAVE + 0*16] // restore with SSE movdqa, matching the prolog saves: a VEX-encoded vmovdqa would #UD on the non-AVX CPUs this SSE fallback path targets 538 | movdqa xmm7,[rsp + _XMM_SAVE + 1*16] 539 | movdqa xmm8,[rsp + _XMM_SAVE + 2*16] 540 | movdqa xmm9,[rsp + _XMM_SAVE + 3*16] 541 | movdqa xmm10,[rsp + _XMM_SAVE + 4*16] 542 | movdqa xmm11,[rsp + _XMM_SAVE + 5*16] 543 | movdqa xmm12,[rsp + _XMM_SAVE + 6*16] 544 | movdqa xmm13,[rsp + _XMM_SAVE + 7*16] 545 | #endif 546 | 547 | add rsp, STACK_SIZE 548 | 549 | pop r15 550 | pop r14 551 | pop r13 552 | pop r12 553 | pop rbp 554 | #ifdef __WIN64__ 555 | pop rdi 556 | pop rsi 557 | pop r8 558 | #endif 559 | pop rbx 560 | 561 | ret 562 | #ifdef __linux__ 563 | .size hashtree_sha256_sse_x1,.-hashtree_sha256_sse_x1 564 | .section .note.GNU-stack,"",@progbits 565 | #endif 566 | #endif 567 | -------------------------------------------------------------------------------- /src/sha256_avx_x1.S: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021-2024 Prysmatic Labs 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the
Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | 24 | This code is based on Intel's implementation found in 25 | https://github.com/intel/intel-ipsec-mb 26 | Such software is licensed under the BSD 3-Clause License and is 27 | Copyright (c) 2012-2023, Intel Corporation 28 | */ 29 | 30 | #ifdef __x86_64__ 31 | .intel_syntax noprefix 32 | .section .rodata 33 | .align 16 34 | .LK256: 35 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 36 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 37 | .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 38 | .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 39 | .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 40 | .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 41 | .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 42 | .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 43 | .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 44 | .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 45 | .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 46 | .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 47 | .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 48 | .long 
0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 49 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 50 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 51 | 52 | .LDIGEST: 53 | .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a 54 | .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 55 | 56 | .LPADDING: 57 | .long 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 58 | .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 59 | .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 60 | .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374 61 | .long 0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254 62 | .long 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa 63 | .long 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7 64 | .long 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0 65 | .long 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd 66 | .long 0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16 67 | .long 0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537 68 | .long 0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37 69 | .long 0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7 70 | .long 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890 71 | .long 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c 72 | .long 0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76 73 | 74 | 75 | .LPSHUFFLE_BYTE_FLIP_MASK: //.longq 0x0c0d0e0f08090a0b0405060700010203 76 | .quad 0x0405060700010203, 0x0c0d0e0f08090a0b 77 | 78 | // shuffle xBxA -> 00BA 79 | .L_SHUF_00BA: //d.quad 0xFFFFFFFFFFFFFFFF0b0a090803020100 80 | .quad 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF 81 | 82 | // shuffle xDxC -> DC00 83 | .L_SHUF_DC00: //d.quad 0x0b0a090803020100FFFFFFFFFFFFFFFF 84 | .quad 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 85 | 86 | .text 87 | # define VMOVDQ vmovdqu 88 | 89 | .macro MY_ROR src, shf 90 | shld \src, \src, (32-\shf) 91 | .endm 92 | 93 | // COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask 94 | // Load xmm with mem and byte swap each dword 95 | .macro COPY_XMM_AND_BSWAP dst, src, msk 96 | VMOVDQ \dst, \src 97 | vpshufb \dst, \dst, \msk 
98 | .endm 99 | 100 | .equ X0, xmm4 101 | .equ X1, xmm5 102 | .equ X2, xmm6 103 | .equ X3, xmm7 104 | 105 | .equiv XTMP0, xmm0 106 | .equiv XTMP1, xmm1 107 | .equiv XTMP2, xmm2 108 | .equiv XTMP3, xmm3 109 | .equiv XTMP4, xmm8 110 | .equiv XFER, xmm9 111 | .equiv XTMP5, xmm11 112 | 113 | .equiv SHUF_00BA, xmm10 // shuffle xBxA -> 00BA 114 | .equiv SHUF_DC00, xmm12 // shuffle xDxC -> DC00 115 | .equiv BYTE_FLIP_MASK, xmm13 116 | 117 | #ifdef __WIN64__ 118 | .equ OUTPUT_PTR, rcx // 1st arg 119 | .equ DATA_PTR, rdx // 2nd arg 120 | .equ d_, ebp 121 | .equiv count, r8 // 3rd arg 122 | 123 | .equ TBL, rsi 124 | .equ c_, edi 125 | #define _XMM_SAVE 64 126 | #define stack_size 192 127 | #else 128 | .equ OUTPUT_PTR, rdi // 1st arg 129 | .equ DATA_PTR, rsi // 2nd arg 130 | .equ c_, ebp 131 | .equiv count, rdx // 3rd arg 132 | 133 | .equ TBL, rcx 134 | .equ d_, r8d 135 | #define stack_size 88 136 | #endif 137 | 138 | #define _DIGEST 32 139 | 140 | .equ a_, eax 141 | .equ b_, ebx 142 | .equ e_, r9d 143 | .equ f_, r10d 144 | .equ g_, r11d 145 | .equ h_, r12d 146 | 147 | .equiv y0, r13d 148 | .equiv y1, r14d 149 | .equiv y2, r15d 150 | 151 | 152 | .macro rotate_Xs 153 | .equ X_, X0 154 | .equ X0, X1 155 | .equ X1, X2 156 | .equ X2, X3 157 | .equ X3, X_ 158 | .endm 159 | 160 | .macro ROTATE_ARGS 161 | .equ TMP_, h_ 162 | .equ h_, g_ 163 | .equ g_, f_ 164 | .equ f_, e_ 165 | .equ e_, d_ 166 | .equ d_, c_ 167 | .equ c_, b_ 168 | .equ b_, a_ 169 | .equ a_, TMP_ 170 | .endm 171 | 172 | .macro FOUR_ROUNDS_AND_SCHED 173 | //// compute s0 four at a time and s1 two at a time 174 | //// compute W[-16] + W[-7] 4 at a time 175 | //vmovdqa XTMP0, X3 176 | mov y0, e_ // y0 = e 177 | MY_ROR y0, (25-11) // y0 = e >> (25-11) 178 | mov y1, a_ // y1 = a 179 | vpalignr XTMP0, X3, X2, 4 // XTMP0 = W[-7] 180 | MY_ROR y1, (22-13) // y1 = a >> (22-13) 181 | xor y0, e_ // y0 = e ^ (e >> (25-11)) 182 | mov y2, f_ // y2 = f 183 | MY_ROR y0, (11-6) // y0 = (e >> (11-6)) ^ (e >> (25-6)) 184 | //vmovdqa 
XTMP1, X1 185 | xor y1, a_ // y1 = a ^ (a >> (22-13) 186 | xor y2, g_ // y2 = f^g 187 | vpaddd XTMP0, XTMP0, X0 // XTMP0 = W[-7] + W[-16] 188 | xor y0, e_ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 189 | and y2, e_ // y2 = (f^g)&e 190 | MY_ROR y1, (13-2) // y1 = (a >> (13-2)) ^ (a >> (22-2)) 191 | //// compute s0 192 | vpalignr XTMP1, X1, X0, 4 // XTMP1 = W[-15] 193 | xor y1, a_ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 194 | MY_ROR y0, 6 // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 195 | xor y2, g_ // y2 = CH = ((f^g)&e)^g 196 | 197 | MY_ROR y1, 2 // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 198 | add y2, y0 // y2 = S1 + CH 199 | add y2, [rsp + 0*4] // y2 = k + w + S1 + CH 200 | 201 | mov y0, a_ // y0 = a 202 | add h_, y2 // h = h + S1 + CH + k + w 203 | mov y2, a_ // y2 = a 204 | 205 | vpsrld XTMP2, XTMP1, 7 206 | 207 | or y0, c_ // y0 = a|c 208 | add d_, h_ // d = d + h + S1 + CH + k + w 209 | and y2, c_ // y2 = a&c 210 | 211 | vpslld XTMP3, XTMP1, (32-7) 212 | 213 | and y0, b_ // y0 = (a|c)&b 214 | add h_, y1 // h = h + S1 + CH + k + w + S0 215 | 216 | vpor XTMP3, XTMP3, XTMP2 // XTMP1 = W[-15] MY_ROR 7 217 | 218 | or y0, y2 // y0 = MAJ = (a|c)&b)|(a&c) 219 | add h_, y0 // h = h + S1 + CH + k + w + S0 + MAJ 220 | 221 | ROTATE_ARGS 222 | 223 | mov y0, e_ // y0 = e 224 | mov y1, a_ // y1 = a 225 | 226 | 227 | MY_ROR y0, (25-11) // y0 = e >> (25-11) 228 | xor y0, e_ // y0 = e ^ (e >> (25-11)) 229 | mov y2, f_ // y2 = f 230 | MY_ROR y1, (22-13) // y1 = a >> (22-13) 231 | 232 | vpsrld XTMP2, XTMP1,18 233 | 234 | xor y1, a_ // y1 = a ^ (a >> (22-13) 235 | MY_ROR y0, (11-6) // y0 = (e >> (11-6)) ^ (e >> (25-6)) 236 | xor y2, g_ // y2 = f^g 237 | 238 | vpsrld XTMP4, XTMP1, 3 // XTMP4 = W[-15] >> 3 239 | 240 | MY_ROR y1, (13-2) // y1 = (a >> (13-2)) ^ (a >> (22-2)) 241 | xor y0, e_ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 242 | and y2, e_ // y2 = (f^g)&e 243 | MY_ROR y0, 6 // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 244 | 245 | vpslld XTMP1, XTMP1, (32-18) 246 | 247 | xor y1, a_ // 
y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 248 | xor y2, g_ // y2 = CH = ((f^g)&e)^g 249 | 250 | vpxor XTMP3, XTMP3, XTMP1 251 | 252 | add y2, y0 // y2 = S1 + CH 253 | add y2, [rsp + 1*4] // y2 = k + w + S1 + CH 254 | MY_ROR y1, 2 // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 255 | 256 | vpxor XTMP3, XTMP3, XTMP2 // XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 257 | 258 | mov y0, a_ // y0 = a 259 | add h_, y2 // h = h + S1 + CH + k + w 260 | mov y2, a_ // y2 = a 261 | 262 | vpxor XTMP1, XTMP3, XTMP4 // XTMP1 = s0 263 | 264 | or y0, c_ // y0 = a|c 265 | add d_, h_ // d = d + h + S1 + CH + k + w 266 | and y2, c_ // y2 = a&c 267 | //// compute low s1 268 | vpshufd XTMP2, X3, 0b11111010 // XTMP2 = W[-2] {BBAA} 269 | and y0, b_ // y0 = (a|c)&b 270 | add h_, y1 // h = h + S1 + CH + k + w + S0 271 | vpaddd XTMP0, XTMP0, XTMP1 // XTMP0 = W[-16] + W[-7] + s0 272 | or y0, y2 // y0 = MAJ = (a|c)&b)|(a&c) 273 | add h_, y0 // h = h + S1 + CH + k + w + S0 + MAJ 274 | 275 | ROTATE_ARGS 276 | //vmovdqa XTMP3, XTMP2 // XTMP3 = W[-2] {BBAA} 277 | 278 | mov y0, e_ // y0 = e 279 | mov y1, a_ // y1 = a 280 | MY_ROR y0, (25-11) // y0 = e >> (25-11) 281 | 282 | //vmovdqa XTMP4, XTMP2 // XTMP4 = W[-2] {BBAA} 283 | 284 | xor y0, e_ // y0 = e ^ (e >> (25-11)) 285 | MY_ROR y1, (22-13) // y1 = a >> (22-13) 286 | mov y2, f_ // y2 = f 287 | xor y1, a_ // y1 = a ^ (a >> (22-13) 288 | MY_ROR y0, (11-6) // y0 = (e >> (11-6)) ^ (e >> (25-6)) 289 | 290 | vpsrld XTMP4, XTMP2, 10 // XTMP4 = W[-2] >> 10 {BBAA} 291 | 292 | xor y2, g_ // y2 = f^g 293 | 294 | vpsrlq XTMP3, XTMP2, 19 // XTMP3 = W[-2] MY_ROR 19 {xBxA} 295 | 296 | xor y0, e_ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 297 | and y2, e_ // y2 = (f^g)&e 298 | 299 | vpsrlq XTMP2, XTMP2, 17 // XTMP2 = W[-2] MY_ROR 17 {xBxA} 300 | 301 | MY_ROR y1, (13-2) // y1 = (a >> (13-2)) ^ (a >> (22-2)) 302 | xor y1, a_ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 303 | xor y2, g_ // y2 = CH = ((f^g)&e)^g 304 | MY_ROR y0, 6 // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 305 | vpxor 
XTMP2, XTMP2, XTMP3 306 | add y2, y0 // y2 = S1 + CH 307 | MY_ROR y1, 2 // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 308 | add y2, [rsp + 2*4] // y2 = k + w + S1 + CH 309 | vpxor XTMP4, XTMP4, XTMP2 // XTMP4 = s1 {xBxA} 310 | mov y0, a_ // y0 = a 311 | add h_, y2 // h = h + S1 + CH + k + w 312 | mov y2, a_ // y2 = a 313 | vpshufb XTMP4, XTMP4, SHUF_00BA // XTMP4 = s1 {00BA} 314 | or y0, c_ // y0 = a|c 315 | add d_, h_ // d = d + h + S1 + CH + k + w 316 | and y2, c_ // y2 = a&c 317 | vpaddd XTMP0, XTMP0, XTMP4 // XTMP0 = {..., ..., W[1], W[0]} 318 | and y0, b_ // y0 = (a|c)&b 319 | add h_, y1 // h = h + S1 + CH + k + w + S0 320 | //// compute high s1 321 | vpshufd XTMP2, XTMP0, 0b01010000 // XTMP2 = W[-2] {DDCC} 322 | or y0, y2 // y0 = MAJ = (a|c)&b)|(a&c) 323 | add h_, y0 // h = h + S1 + CH + k + w + S0 + MAJ 324 | 325 | ROTATE_ARGS 326 | //vmovdqa XTMP3, XTMP2 // XTMP3 = W[-2] {DDCC} 327 | mov y0, e_ // y0 = e 328 | MY_ROR y0, (25-11) // y0 = e >> (25-11) 329 | mov y1, a_ // y1 = a 330 | //vmovdqa XTMP5, XTMP2 // XTMP5 = W[-2] {DDCC} 331 | MY_ROR y1, (22-13) // y1 = a >> (22-13) 332 | xor y0, e_ // y0 = e ^ (e >> (25-11)) 333 | mov y2, f_ // y2 = f 334 | MY_ROR y0, (11-6) // y0 = (e >> (11-6)) ^ (e >> (25-6)) 335 | 336 | vpsrld XTMP5, XTMP2, 10 // XTMP5 = W[-2] >> 10 {DDCC} 337 | 338 | xor y1, a_ // y1 = a ^ (a >> (22-13) 339 | xor y2, g_ // y2 = f^g 340 | 341 | vpsrlq XTMP3, XTMP2, 19 // XTMP3 = W[-2] MY_ROR 19 {xDxC} 342 | 343 | xor y0, e_ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 344 | and y2, e_ // y2 = (f^g)&e 345 | MY_ROR y1, (13-2) // y1 = (a >> (13-2)) ^ (a >> (22-2)) 346 | 347 | vpsrlq XTMP2, XTMP2, 17 // XTMP2 = W[-2] MY_ROR 17 {xDxC} 348 | 349 | xor y1, a_ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 350 | MY_ROR y0, 6 // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 351 | xor y2, g_ // y2 = CH = ((f^g)&e)^g 352 | 353 | vpxor XTMP2, XTMP2, XTMP3 354 | 355 | MY_ROR y1, 2 // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 356 | add y2, y0 // y2 = S1 + CH 357 | add y2, [rsp + 3*4] // 
y2 = k + w + S1 + CH 358 | vpxor XTMP5, XTMP5, XTMP2 // XTMP5 = s1 {xDxC} 359 | mov y0, a_ // y0 = a 360 | add h_, y2 // h = h + S1 + CH + k + w 361 | mov y2, a_ // y2 = a 362 | vpshufb XTMP5, XTMP5, SHUF_DC00 // XTMP5 = s1 {DC00} 363 | or y0, c_ // y0 = a|c 364 | add d_, h_ // d = d + h + S1 + CH + k + w 365 | and y2, c_ // y2 = a&c 366 | vpaddd X0, XTMP5, XTMP0 // X0 = {W[3], W[2], W[1], W[0]} 367 | and y0, b_ // y0 = (a|c)&b 368 | add h_, y1 // h = h + S1 + CH + k + w + S0 369 | or y0, y2 // y0 = MAJ = (a|c)&b)|(a&c) 370 | add h_, y0 // h = h + S1 + CH + k + w + S0 + MAJ 371 | ROTATE_ARGS 372 | rotate_Xs 373 | .endm 374 | 375 | .macro DO_ROUND base offset 376 | mov y0, e_ // y0 = e 377 | MY_ROR y0, (25-11) // y0 = e >> (25-11) 378 | mov y1, a_ // y1 = a 379 | xor y0, e_ // y0 = e ^ (e >> (25-11)) 380 | MY_ROR y1, (22-13) // y1 = a >> (22-13) 381 | mov y2, f_ // y2 = f 382 | xor y1, a_ // y1 = a ^ (a >> (22-13) 383 | MY_ROR y0, (11-6) // y0 = (e >> (11-6)) ^ (e >> (25-6)) 384 | xor y2, g_ // y2 = f^g 385 | xor y0, e_ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 386 | MY_ROR y1, (13-2) // y1 = (a >> (13-2)) ^ (a >> (22-2)) 387 | and y2, e_ // y2 = (f^g)&e 388 | xor y1, a_ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 389 | MY_ROR y0, 6 // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 390 | xor y2, g_ // y2 = CH = ((f^g)&e)^g 391 | add y2, y0 // y2 = S1 + CH 392 | MY_ROR y1, 2 // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 393 | add y2, [\base + \offset] // y2 = k + w + S1 + CH 394 | mov y0, a_ // y0 = a 395 | add h_, y2 // h = h + S1 + CH + k + w 396 | mov y2, a_ // y2 = a 397 | or y0, c_ // y0 = a|c 398 | add d_, h_ // d = d + h + S1 + CH + k + w 399 | and y2, c_ // y2 = a&c 400 | and y0, b_ // y0 = (a|c)&b 401 | add h_, y1 // h = h + S1 + CH + k + w + S0 402 | or y0, y2 // y0 = MAJ = (a|c)&b)|(a&c) 403 | add h_, y0 // h = h + S1 + CH + k + w + S0 + MAJ 404 | ROTATE_ARGS 405 | .endm 406 | 407 | .global hashtree_sha256_avx_x1 408 | #ifndef __WIN64__ 409 | .type 
hashtree_sha256_avx_x1,%function 410 | #endif 411 | .align 32 412 | hashtree_sha256_avx_x1: 413 | endbr64 414 | push rbx 415 | #ifdef __WIN64__ 416 | push r8 417 | push rsi 418 | push rdi 419 | #endif 420 | push rbp 421 | push r12 422 | push r13 423 | push r14 424 | push r15 425 | 426 | sub rsp, stack_size 427 | #ifdef __WIN64__ 428 | vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6 429 | vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7 430 | vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8 431 | vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9 432 | vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10 433 | vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11 434 | vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12 435 | vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13 436 | #endif 437 | vmovdqa BYTE_FLIP_MASK, [rip + .LPSHUFFLE_BYTE_FLIP_MASK] 438 | vmovdqa SHUF_00BA, [rip + .L_SHUF_00BA] 439 | vmovdqa SHUF_DC00, [rip + .L_SHUF_DC00] 440 | 441 | shl count, 5 442 | add count, OUTPUT_PTR 443 | 444 | .Lsha256_avx_1_block_loop: 445 | cmp OUTPUT_PTR, count 446 | je .Lsha256_1_avx_epilog 447 | 448 | //; load initial digest 449 | lea TBL,[rip + .LDIGEST] 450 | mov a_, [TBL + 0*4] 451 | mov b_, [TBL + 1*4] 452 | mov c_, [TBL + 2*4] 453 | mov d_, [TBL + 3*4] 454 | mov e_, [TBL + 4*4] 455 | mov f_, [TBL + 5*4] 456 | mov g_, [TBL + 6*4] 457 | mov h_, [TBL + 7*4] 458 | 459 | lea TBL,[rip + .LK256] 460 | 461 | //; byte swap first 16 dwords 462 | COPY_XMM_AND_BSWAP X0, [DATA_PTR + 0*16], BYTE_FLIP_MASK 463 | COPY_XMM_AND_BSWAP X1, [DATA_PTR + 1*16], BYTE_FLIP_MASK 464 | COPY_XMM_AND_BSWAP X2, [DATA_PTR + 2*16], BYTE_FLIP_MASK 465 | COPY_XMM_AND_BSWAP X3, [DATA_PTR + 3*16], BYTE_FLIP_MASK 466 | 467 | //; schedule 48 input dwords, by doing 3 rounds of 16 each 468 | .rept 3 469 | .align 32 470 | vpaddd XFER, X0, [TBL + 0*16] 471 | vmovdqa [rsp], XFER 472 | FOUR_ROUNDS_AND_SCHED 473 | 474 | vpaddd XFER, X0, [TBL + 1*16] 475 | vmovdqa [rsp], XFER 476 | FOUR_ROUNDS_AND_SCHED 477 | 478 | vpaddd XFER, X0, [TBL + 2*16] 479 | vmovdqa [rsp], XFER 480 | FOUR_ROUNDS_AND_SCHED 481 | 
482 | vpaddd XFER, X0, [TBL + 3*16] 483 | vmovdqa [rsp], XFER 484 | add TBL, 4*16 485 | FOUR_ROUNDS_AND_SCHED 486 | .endr 487 | 488 | .rept 2 489 | vpaddd XFER, X0, [TBL + 0*16] 490 | vmovdqa [rsp], XFER 491 | DO_ROUND rsp, 0 492 | DO_ROUND rsp, 4 493 | DO_ROUND rsp, 8 494 | DO_ROUND rsp, 12 495 | 496 | vpaddd XFER, X1, [TBL + 1*16] 497 | vmovdqa [rsp], XFER 498 | add TBL, 2*16 499 | DO_ROUND rsp, 0 500 | DO_ROUND rsp, 4 501 | DO_ROUND rsp, 8 502 | DO_ROUND rsp, 12 503 | 504 | vmovdqa X0, X2 505 | vmovdqa X1, X3 506 | .endr 507 | 508 | // add old digest 509 | 510 | lea TBL,[rip + .LDIGEST] 511 | add a_, [TBL + 0*4] 512 | add b_, [TBL + 1*4] 513 | add c_, [TBL + 2*4] 514 | add d_, [TBL + 3*4] 515 | add e_, [TBL + 4*4] 516 | add f_, [TBL + 5*4] 517 | add g_, [TBL + 6*4] 518 | add h_, [TBL + 7*4] 519 | 520 | 521 | // rounds with padding 522 | 523 | // save old digest 524 | // 525 | mov [rsp + _DIGEST + 0*4], a_ 526 | mov [rsp + _DIGEST + 1*4], b_ 527 | mov [rsp + _DIGEST + 2*4], c_ 528 | mov [rsp + _DIGEST + 3*4], d_ 529 | mov [rsp + _DIGEST + 4*4], e_ 530 | mov [rsp + _DIGEST + 5*4], f_ 531 | mov [rsp + _DIGEST + 6*4], g_ 532 | mov [rsp + _DIGEST + 7*4], h_ 533 | 534 | lea TBL,[rip + .LPADDING] 535 | 536 | .set .Li, 0 537 | .rept 64 538 | DO_ROUND TBL, .Li 539 | .set .Li, .Li+4 540 | .endr 541 | 542 | //; add the previous digest 543 | add a_, [rsp + _DIGEST + 0*4] 544 | add b_, [rsp + _DIGEST + 1*4] 545 | add c_, [rsp + _DIGEST + 2*4] 546 | add d_, [rsp + _DIGEST + 3*4] 547 | add e_, [rsp + _DIGEST + 4*4] 548 | add f_, [rsp + _DIGEST + 5*4] 549 | add g_, [rsp + _DIGEST + 6*4] 550 | add h_, [rsp + _DIGEST + 7*4] 551 | 552 | //; shuffle the bytes to little endian 553 | bswap a_ 554 | bswap b_ 555 | bswap c_ 556 | bswap d_ 557 | bswap e_ 558 | bswap f_ 559 | bswap g_ 560 | bswap h_ 561 | 562 | //; write resulting hash 563 | mov [OUTPUT_PTR + 0*4], a_ 564 | mov [OUTPUT_PTR + 1*4], b_ 565 | mov [OUTPUT_PTR + 2*4], c_ 566 | mov [OUTPUT_PTR + 3*4], d_ 567 | mov [OUTPUT_PTR 
+ 4*4], e_ 568 | mov [OUTPUT_PTR + 5*4], f_ 569 | mov [OUTPUT_PTR + 6*4], g_ 570 | mov [OUTPUT_PTR + 7*4], h_ 571 | 572 | add OUTPUT_PTR, 32 573 | add DATA_PTR, 64 574 | jmp .Lsha256_avx_1_block_loop 575 | 576 | .Lsha256_1_avx_epilog: 577 | 578 | #ifdef __WIN64__ 579 | vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16] 580 | vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16] 581 | vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16] 582 | vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16] 583 | vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16] 584 | vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16] 585 | vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16] 586 | vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16] 587 | #endif 588 | 589 | add rsp, stack_size 590 | 591 | pop r15 592 | pop r14 593 | pop r13 594 | pop r12 595 | pop rbp 596 | #ifdef __WIN64__ 597 | pop rdi 598 | pop rsi 599 | pop r8 600 | #endif 601 | pop rbx 602 | 603 | ret 604 | #ifdef __linux__ 605 | .size hashtree_sha256_avx_x1,.-hashtree_sha256_avx_x1 606 | .section .note.GNU-stack,"",@progbits 607 | #endif 608 | #endif 609 | -------------------------------------------------------------------------------- /bindings_test.go: -------------------------------------------------------------------------------- 1 | package hashtree 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | var _test_32_block = [][32]byte{ 9 | {0x7a, 0xee, 0xd5, 0xc9, 0x66, 0x17, 0x59, 0x7f, 0x89, 0xd6, 0xd9, 0xe8, 0xa8, 0xa7, 0x01, 0x47, 0x60, 0xc6, 0x88, 0xfd, 0x2a, 0x7a, 0xf6, 0x1d, 0x10, 0x20, 0x62, 0x7e, 0x7c, 0xd0, 0x1a, 0x0b}, 10 | {0xd4, 0x1f, 0xa7, 0x89, 0x8c, 0xf9, 0x05, 0xfc, 0x1e, 0xb0, 0x04, 0xd7, 0xaa, 0x56, 0x35, 0xec, 0x36, 0xf5, 0x0d, 0x41, 0x75, 0x64, 0x34, 0x71, 0xf0, 0x3b, 0x5b, 0xb2, 0xcc, 0xfa, 0x8c, 0xca}, 11 | {0xf8, 0xd9, 0x9e, 0xa7, 0x9c, 0xa1, 0xe0, 0x3a, 0x19, 0x4f, 0xd3, 0x2d, 0xbd, 0x40, 0x3a, 0xa3, 0x28, 0xe8, 0xa4, 0x27, 0x58, 0x44, 0x12, 0xf7, 0x69, 0x01, 0x66, 0xfa, 0xf1, 0x97, 0x30, 0xfe}, 12 | {0x99, 0x7c, 0x24, 0x0e, 0xed, 0x31, 0x0a, 0xda, 0x12, 0x16, 0x0e, 0x06, 0x44, 0xb8, 0x3f, 0xa2, 
0x40, 0x52, 0xbc, 0x2d, 0xaf, 0x97, 0x00, 0x01, 0x5d, 0xbb, 0x0d, 0x06, 0x66, 0xb1, 0x59, 0xf2}, 13 | {0x99, 0x43, 0x52, 0x77, 0x28, 0x39, 0x6b, 0xeb, 0x03, 0x51, 0xc4, 0x5f, 0x7d, 0xd3, 0xe1, 0x41, 0x17, 0x66, 0x7b, 0x0e, 0xc9, 0x51, 0x01, 0xa7, 0x39, 0xf3, 0xc8, 0x63, 0x95, 0xa5, 0x92, 0x6b}, 14 | {0xce, 0x6e, 0xab, 0xd2, 0xe8, 0xad, 0x90, 0xad, 0xbe, 0xe5, 0x94, 0x96, 0xa9, 0x98, 0xe7, 0x83, 0x07, 0xa4, 0x0f, 0x8e, 0xe5, 0xb3, 0x5a, 0x05, 0xcd, 0xfd, 0xae, 0x9c, 0x07, 0xad, 0x26, 0xaa}, 15 | {0xf5, 0xee, 0x66, 0x87, 0x00, 0xed, 0xeb, 0x8b, 0xc2, 0x7d, 0x97, 0x52, 0x2d, 0xfc, 0x0a, 0x2a, 0x32, 0x0e, 0x92, 0xd2, 0x91, 0xd1, 0x69, 0x29, 0x9d, 0xb1, 0x3a, 0x65, 0x9f, 0x8e, 0x7e, 0x2a}, 16 | {0x88, 0x4a, 0xc8, 0x81, 0xdb, 0xa6, 0x79, 0x36, 0x54, 0xe9, 0x15, 0x5c, 0xff, 0x06, 0x35, 0x8b, 0x6e, 0x0d, 0xaa, 0x3e, 0x7a, 0x82, 0x7c, 0x4a, 0xfe, 0x8a, 0x91, 0xb4, 0x34, 0xed, 0xe3, 0x17}, 17 | {0xe7, 0x92, 0xa4, 0x91, 0xdc, 0x1d, 0x83, 0xc8, 0x72, 0x5a, 0xd1, 0x27, 0x17, 0x78, 0x2b, 0xc7, 0x67, 0xe9, 0x56, 0xf2, 0xb4, 0x37, 0x51, 0xa1, 0x6b, 0x23, 0x8c, 0xc9, 0x03, 0x3d, 0x90, 0x1e}, 18 | {0xc4, 0x1f, 0xcc, 0x5e, 0xcb, 0x5e, 0x7d, 0x02, 0x12, 0x3f, 0x15, 0x9f, 0x35, 0xf4, 0x49, 0x55, 0xba, 0xc6, 0x47, 0xd2, 0x85, 0x85, 0x61, 0x69, 0xa5, 0x60, 0x7a, 0x32, 0x7f, 0x8e, 0x09, 0x5f}, 19 | {0x60, 0xb6, 0xab, 0xb5, 0x6b, 0x4d, 0xce, 0x6f, 0x1d, 0x77, 0x2e, 0x9b, 0x0d, 0x60, 0x76, 0xe3, 0xcb, 0x79, 0xbc, 0x40, 0x2d, 0x16, 0xf6, 0xa3, 0x06, 0x12, 0x36, 0x71, 0xda, 0xfd, 0x28, 0x89}, 20 | {0x67, 0xdd, 0x7f, 0x26, 0x6d, 0x2e, 0xf3, 0xef, 0x13, 0xb6, 0x09, 0x73, 0x82, 0xbc, 0x73, 0x25, 0x83, 0xc0, 0x34, 0x90, 0xe8, 0xad, 0xf0, 0x17, 0x8d, 0xed, 0xad, 0x29, 0xf7, 0x78, 0x9c, 0x28}, 21 | {0x00, 0xb0, 0xd5, 0xd0, 0x8e, 0x9b, 0xe5, 0xf0, 0x46, 0x8e, 0x60, 0x25, 0x95, 0xe5, 0x3a, 0x46, 0xb1, 0x07, 0x74, 0x97, 0xed, 0x0a, 0x2f, 0x9a, 0x3f, 0xf3, 0x94, 0x2f, 0xb3, 0x12, 0xa1, 0x91}, 22 | {0x8d, 0x36, 0x16, 0xc6, 0x00, 0x88, 0xd6, 0x69, 0xb4, 0x5a, 0x71, 0x18, 0x41, 0xe5, 0x4d, 0xb2, 0xd9, 
0x00, 0x7a, 0x17, 0x63, 0x6a, 0x9b, 0x2e, 0x22, 0x12, 0x5b, 0xa3, 0x74, 0x7c, 0x95, 0xc9}, 23 | {0x4e, 0xfc, 0x5c, 0x18, 0xd1, 0x8a, 0x5b, 0x57, 0x7c, 0x86, 0x3e, 0xe2, 0x75, 0x91, 0xf2, 0xb3, 0x5f, 0xd0, 0x92, 0xbc, 0x77, 0xbe, 0x1b, 0xef, 0x1a, 0x7c, 0xe2, 0xd8, 0x8d, 0x7b, 0xef, 0xf7}, 24 | {0xb7, 0x80, 0xc2, 0x31, 0xe6, 0x75, 0x0c, 0xad, 0x0f, 0xe8, 0xed, 0x59, 0x34, 0xdb, 0xfb, 0x41, 0xd4, 0x38, 0x73, 0x7a, 0x47, 0x01, 0xb8, 0xea, 0xea, 0x2e, 0x01, 0x8e, 0x4f, 0x09, 0x64, 0x82}, 25 | {0x99, 0x43, 0x52, 0x77, 0x28, 0x39, 0x6b, 0xeb, 0x03, 0x51, 0xc4, 0x5f, 0x7d, 0xd3, 0xe1, 0x41, 0x17, 0x66, 0x7b, 0x0e, 0xc9, 0x51, 0x01, 0xa7, 0x39, 0xf3, 0xc8, 0x63, 0x95, 0xa5, 0x92, 0x6b}, 26 | {0xce, 0x6e, 0xab, 0xd2, 0xe8, 0xad, 0x90, 0xad, 0xbe, 0xe5, 0x94, 0x96, 0xa9, 0x98, 0xe7, 0x83, 0x07, 0xa4, 0x0f, 0x8e, 0xe5, 0xb3, 0x5a, 0x05, 0xcd, 0xfd, 0xae, 0x9c, 0x07, 0xad, 0x26, 0xaa}, 27 | {0xf5, 0xee, 0x66, 0x87, 0x00, 0xed, 0xeb, 0x8b, 0xc2, 0x7d, 0x97, 0x52, 0x2d, 0xfc, 0x0a, 0x2a, 0x32, 0x0e, 0x92, 0xd2, 0x91, 0xd1, 0x69, 0x29, 0x9d, 0xb1, 0x3a, 0x65, 0x9f, 0x8e, 0x7e, 0x2a}, 28 | {0x88, 0x4a, 0xc8, 0x81, 0xdb, 0xa6, 0x79, 0x36, 0x54, 0xe9, 0x15, 0x5c, 0xff, 0x06, 0x35, 0x8b, 0x6e, 0x0d, 0xaa, 0x3e, 0x7a, 0x82, 0x7c, 0x4a, 0xfe, 0x8a, 0x91, 0xb4, 0x34, 0xed, 0xe3, 0x17}, 29 | {0xe7, 0x92, 0xa4, 0x91, 0xdc, 0x1d, 0x83, 0xc8, 0x72, 0x5a, 0xd1, 0x27, 0x17, 0x78, 0x2b, 0xc7, 0x67, 0xe9, 0x56, 0xf2, 0xb4, 0x37, 0x51, 0xa1, 0x6b, 0x23, 0x8c, 0xc9, 0x03, 0x3d, 0x90, 0x1e}, 30 | {0xc4, 0x1f, 0xcc, 0x5e, 0xcb, 0x5e, 0x7d, 0x02, 0x12, 0x3f, 0x15, 0x9f, 0x35, 0xf4, 0x49, 0x55, 0xba, 0xc6, 0x47, 0xd2, 0x85, 0x85, 0x61, 0x69, 0xa5, 0x60, 0x7a, 0x32, 0x7f, 0x8e, 0x09, 0x5f}, 31 | {0x60, 0xb6, 0xab, 0xb5, 0x6b, 0x4d, 0xce, 0x6f, 0x1d, 0x77, 0x2e, 0x9b, 0x0d, 0x60, 0x76, 0xe3, 0xcb, 0x79, 0xbc, 0x40, 0x2d, 0x16, 0xf6, 0xa3, 0x06, 0x12, 0x36, 0x71, 0xda, 0xfd, 0x28, 0x89}, 32 | {0x67, 0xdd, 0x7f, 0x26, 0x6d, 0x2e, 0xf3, 0xef, 0x13, 0xb6, 0x09, 0x73, 0x82, 0xbc, 0x73, 0x25, 0x83, 0xc0, 
0x34, 0x90, 0xe8, 0xad, 0xf0, 0x17, 0x8d, 0xed, 0xad, 0x29, 0xf7, 0x78, 0x9c, 0x28}, 33 | {0x00, 0xb0, 0xd5, 0xd0, 0x8e, 0x9b, 0xe5, 0xf0, 0x46, 0x8e, 0x60, 0x25, 0x95, 0xe5, 0x3a, 0x46, 0xb1, 0x07, 0x74, 0x97, 0xed, 0x0a, 0x2f, 0x9a, 0x3f, 0xf3, 0x94, 0x2f, 0xb3, 0x12, 0xa1, 0x91}, 34 | {0x8d, 0x36, 0x16, 0xc6, 0x00, 0x88, 0xd6, 0x69, 0xb4, 0x5a, 0x71, 0x18, 0x41, 0xe5, 0x4d, 0xb2, 0xd9, 0x00, 0x7a, 0x17, 0x63, 0x6a, 0x9b, 0x2e, 0x22, 0x12, 0x5b, 0xa3, 0x74, 0x7c, 0x95, 0xc9}, 35 | {0x4e, 0xfc, 0x5c, 0x18, 0xd1, 0x8a, 0x5b, 0x57, 0x7c, 0x86, 0x3e, 0xe2, 0x75, 0x91, 0xf2, 0xb3, 0x5f, 0xd0, 0x92, 0xbc, 0x77, 0xbe, 0x1b, 0xef, 0x1a, 0x7c, 0xe2, 0xd8, 0x8d, 0x7b, 0xef, 0xf7}, 36 | {0xcd, 0x78, 0x15, 0x64, 0x2c, 0x78, 0x57, 0x74, 0x2b, 0xb7, 0xdb, 0x74, 0xe2, 0xab, 0x82, 0xbb, 0x61, 0x32, 0x3e, 0xe4, 0xb1, 0x00, 0xde, 0xb2, 0x35, 0x1e, 0x3e, 0x1c, 0x91, 0x9d, 0x87, 0xde}, 37 | {0x17, 0xcc, 0x52, 0x5c, 0x60, 0x9e, 0xd8, 0xd4, 0xf4, 0x56, 0x28, 0x16, 0xde, 0xde, 0x73, 0xfe, 0xd9, 0x92, 0xb7, 0x99, 0x15, 0x24, 0x1b, 0x40, 0xb0, 0xda, 0x9a, 0xf8, 0x24, 0x38, 0x13, 0xbd}, 38 | {0xd0, 0x45, 0x9b, 0xe3, 0x9a, 0xae, 0x78, 0x41, 0xcd, 0x12, 0x9a, 0x6b, 0x91, 0x58, 0x29, 0x75, 0xae, 0x21, 0xd3, 0xf2, 0x5e, 0x98, 0xab, 0x09, 0xb0, 0xaa, 0x62, 0x96, 0x35, 0x64, 0x18, 0x48}, 39 | {0xd2, 0x5b, 0x10, 0xf1, 0x35, 0xaa, 0x04, 0x49, 0x4e, 0x51, 0x30, 0x0d, 0xb6, 0xbf, 0xa0, 0x9b, 0xa0, 0xf5, 0x66, 0x5f, 0x28, 0xc7, 0x8d, 0xa8, 0x3e, 0x0f, 0xe4, 0xa7, 0xc9, 0xd4, 0x0f, 0x7d}, 40 | {0xb7, 0x80, 0xc2, 0x31, 0xe6, 0x75, 0x0c, 0xad, 0x0f, 0xe8, 0xed, 0x59, 0x34, 0xdb, 0xfb, 0x41, 0xd4, 0x38, 0x73, 0x7a, 0x47, 0x01, 0xb8, 0xea, 0xea, 0x2e, 0x01, 0x8e, 0x4f, 0x09, 0x64, 0x82}, 41 | {0xe4, 0x8b, 0x12, 0xd3, 0xd0, 0x78, 0xb5, 0x5f, 0x3e, 0x9d, 0x94, 0x7f, 0x93, 0x84, 0x77, 0x77, 0xdb, 0x78, 0x41, 0xe8, 0x91, 0xfb, 0x6d, 0x0d, 0xef, 0x00, 0x30, 0x8e, 0x0a, 0xe4, 0x7b, 0xec}, 42 | {0xe7, 0xb2, 0x76, 0xe7, 0x6c, 0xba, 0x8f, 0x8c, 0x0b, 0xf2, 0xa3, 0xad, 0xc2, 0x2d, 0x92, 0xb4, 0xd5, 0xf2, 0x83, 
0x42, 0x65, 0x02, 0xd6, 0x67, 0x9a, 0x78, 0x6a, 0xc1, 0xca, 0x91, 0x87, 0x7c}, 43 | {0x16, 0x99, 0x13, 0xf8, 0xa9, 0x20, 0x62, 0x2e, 0xc1, 0x84, 0xc0, 0x25, 0xdc, 0x35, 0x1f, 0xe6, 0x32, 0x49, 0x37, 0x79, 0x78, 0xfb, 0xf5, 0xf7, 0x34, 0xf4, 0xa5, 0x49, 0x9f, 0xc8, 0xfa, 0x8e}, 44 | {0x28, 0x9b, 0x27, 0xae, 0x21, 0x12, 0x14, 0x57, 0x56, 0xf6, 0x9d, 0x7f, 0x0d, 0x28, 0x03, 0xbd, 0x05, 0xd0, 0x11, 0x9e, 0xf1, 0x98, 0x8e, 0x1c, 0xbe, 0xc1, 0x83, 0xdb, 0x1a, 0x65, 0x08, 0x0d}, 45 | {0xef, 0x42, 0x3a, 0x0b, 0x2f, 0xea, 0xdf, 0xfe, 0xeb, 0xd9, 0x72, 0x9a, 0xcf, 0x5a, 0xac, 0x19, 0x09, 0x75, 0x25, 0x64, 0x61, 0x19, 0xf5, 0xcd, 0xdb, 0x9d, 0xcf, 0x4a, 0xa9, 0xf5, 0x48, 0x2c}, 46 | {0x47, 0x69, 0xaa, 0x80, 0x3f, 0xd3, 0x02, 0x67, 0xe9, 0x8b, 0x82, 0xa8, 0x02, 0xe8, 0xcf, 0x60, 0x66, 0xaa, 0xcf, 0x05, 0x0a, 0x85, 0xeb, 0x3d, 0x87, 0x21, 0xcc, 0xe2, 0xdd, 0x6c, 0x42, 0x54}, 47 | {0xd8, 0xb4, 0x39, 0x4f, 0x78, 0xce, 0xd8, 0xad, 0x57, 0xbe, 0xda, 0x18, 0x8f, 0x4a, 0x9b, 0x41, 0xfe, 0x58, 0x9d, 0xa1, 0xd4, 0x71, 0x6e, 0x2f, 0x04, 0xaf, 0x37, 0xa0, 0x29, 0x60, 0x6f, 0x9d}, 48 | {0x84, 0x4a, 0x39, 0x0a, 0x5e, 0x24, 0x81, 0x2e, 0x63, 0xc9, 0xb6, 0xde, 0xc3, 0xf1, 0x82, 0x7b, 0x82, 0x14, 0x07, 0xde, 0x46, 0x03, 0x25, 0x27, 0x4d, 0x09, 0x6b, 0x7e, 0xb9, 0x82, 0x98, 0x41}, 49 | {0x68, 0xf8, 0x98, 0x04, 0xb2, 0x61, 0x78, 0xbf, 0x8a, 0x69, 0x4d, 0xc7, 0x83, 0x4a, 0xe7, 0x77, 0xf7, 0x4b, 0x00, 0x28, 0x34, 0xe6, 0x36, 0xca, 0xa2, 0x58, 0x37, 0x61, 0x60, 0x95, 0x0d, 0xa6}, 50 | {0x20, 0x00, 0x7e, 0x29, 0xa8, 0x6e, 0xca, 0xb8, 0x1b, 0xbc, 0x94, 0x29, 0x2b, 0x18, 0xaa, 0x56, 0x0f, 0x4c, 0x38, 0x1a, 0x7a, 0x16, 0xe8, 0xbb, 0x51, 0xb7, 0xb3, 0xe3, 0x22, 0x8e, 0x9c, 0x05}, 51 | {0xa8, 0x0f, 0x08, 0x4d, 0xf1, 0xd1, 0xd8, 0x2c, 0xac, 0xe8, 0x73, 0x43, 0xcc, 0x73, 0x6b, 0x03, 0x40, 0x21, 0x85, 0x9b, 0x9d, 0x63, 0xa8, 0x44, 0x6a, 0x6c, 0x23, 0xe3, 0x4e, 0x76, 0xb1, 0x51}, 52 | {0x90, 0x61, 0x31, 0xfe, 0xf7, 0x4a, 0x8f, 0x06, 0x9e, 0x75, 0x6a, 0x5a, 0x66, 0xdd, 0xa2, 0xe4, 0x9b, 0x8f, 0x98, 0xbb, 
0x18, 0x9a, 0x96, 0x84, 0xfa, 0xe4, 0x3c, 0xd2, 0x2c, 0x96, 0x61, 0xd8}, 53 | {0x96, 0xb4, 0x84, 0xa8, 0x8b, 0x6f, 0xeb, 0xc5, 0x3e, 0xa3, 0x48, 0xd5, 0x00, 0x95, 0x47, 0xda, 0xc1, 0x2d, 0x95, 0x68, 0x49, 0x29, 0x15, 0xb9, 0x36, 0x59, 0x4c, 0x0b, 0x77, 0xdc, 0x01, 0x06}, 54 | {0x58, 0x37, 0xa7, 0x03, 0x40, 0x70, 0x91, 0xee, 0x29, 0x75, 0x10, 0xd4, 0xec, 0x01, 0x87, 0x5f, 0x2e, 0xb5, 0x56, 0xc6, 0x2d, 0xe9, 0x2b, 0xb4, 0xab, 0x95, 0x82, 0x1f, 0x11, 0xf2, 0xb8, 0xc9}, 55 | {0x81, 0xbf, 0xb0, 0x58, 0xcc, 0xdd, 0x0e, 0xf1, 0x9c, 0x17, 0x6b, 0xa0, 0xe6, 0x42, 0x8c, 0x1a, 0x3c, 0x9c, 0x20, 0x18, 0x0b, 0x52, 0x66, 0x5a, 0xc1, 0xe5, 0xc5, 0x66, 0x35, 0xe5, 0x26, 0x4f}, 56 | {0xca, 0x73, 0xe0, 0x95, 0x2c, 0xc7, 0xa9, 0x22, 0x58, 0x68, 0x49, 0xb3, 0x68, 0xdc, 0x34, 0xe1, 0x3b, 0x17, 0x67, 0xaa, 0x82, 0xa1, 0xb6, 0xbd, 0x69, 0x9b, 0xf6, 0x00, 0x71, 0x51, 0x08, 0xca}, 57 | {0xce, 0x06, 0x68, 0x95, 0x13, 0x37, 0x8b, 0x32, 0xc9, 0x62, 0x38, 0xc9, 0x78, 0x90, 0x89, 0x0e, 0x3a, 0x5d, 0x85, 0x50, 0x1c, 0x4c, 0xd6, 0x80, 0xcc, 0x5f, 0x63, 0xf0, 0xc9, 0xfe, 0x7a, 0xb5}, 58 | {0x79, 0x78, 0x8d, 0x38, 0x13, 0xdf, 0xb7, 0x37, 0x18, 0x78, 0xbd, 0x2f, 0x3e, 0xc7, 0x2c, 0x46, 0xd2, 0x74, 0x01, 0xe9, 0xa1, 0x3f, 0xfe, 0x46, 0x11, 0xb0, 0x85, 0x2f, 0x6d, 0x4b, 0x4b, 0x8e}, 59 | {0x11, 0xce, 0x55, 0xe4, 0xba, 0xf7, 0x11, 0xcd, 0xe8, 0xa8, 0x04, 0x33, 0xbd, 0x19, 0xe8, 0xbe, 0xa1, 0x00, 0xd3, 0x28, 0xca, 0x78, 0x56, 0x6d, 0xde, 0xe5, 0x71, 0x13, 0xc2, 0xbd, 0xd8, 0xc2}, 60 | {0x04, 0x64, 0xdb, 0xdb, 0x8b, 0x4f, 0x73, 0x0e, 0x0a, 0x9e, 0xfe, 0xd0, 0x5d, 0x92, 0x3e, 0xf8, 0xf4, 0x8b, 0xef, 0xb6, 0x6f, 0x42, 0xc9, 0xea, 0x73, 0xfb, 0xb6, 0x8e, 0x37, 0x74, 0xae, 0x39}, 61 | {0x91, 0x1e, 0x40, 0x74, 0x23, 0xa7, 0xa8, 0x00, 0xfc, 0xa1, 0x16, 0xed, 0xcf, 0xff, 0xce, 0xea, 0x3f, 0x31, 0x54, 0xad, 0x19, 0x98, 0xcb, 0x5d, 0xfd, 0x82, 0xe2, 0x48, 0xbf, 0xc3, 0x74, 0x71}, 62 | {0x5f, 0x45, 0x5f, 0xba, 0x82, 0x5d, 0xc4, 0x20, 0x12, 0x67, 0x65, 0x0d, 0x8b, 0x14, 0x45, 0x20, 0xd3, 0xbc, 0xb4, 0x23, 0x26, 
0x98, 0xfc, 0x05, 0x8f, 0xa5, 0x99, 0xe2, 0x78, 0x74, 0x72, 0x71}, 63 | {0xda, 0xa5, 0x2a, 0xc1, 0x13, 0xa4, 0x3b, 0xeb, 0x41, 0x51, 0x1b, 0x96, 0xa3, 0xa0, 0x5b, 0xd8, 0xed, 0x5e, 0x69, 0x67, 0xfb, 0xc5, 0x27, 0x66, 0x56, 0x8a, 0xb2, 0x1e, 0x93, 0xbf, 0xb0, 0x36}, 64 | {0x54, 0xb8, 0x17, 0xb6, 0xd2, 0x26, 0x22, 0x93, 0xdc, 0xb5, 0xd5, 0x32, 0x1b, 0x76, 0x3c, 0xfa, 0x24, 0x04, 0xcb, 0xa0, 0x1b, 0xcb, 0xa3, 0x12, 0x20, 0x60, 0x3b, 0x59, 0xe5, 0xdf, 0xf7, 0xbf}, 65 | {0x41, 0x42, 0x6c, 0xbf, 0xfa, 0x23, 0xcc, 0xee, 0x3e, 0xf6, 0xf3, 0xbf, 0xa1, 0x39, 0x9b, 0x6e, 0x7f, 0xfb, 0x2c, 0x7f, 0x4e, 0xf5, 0x35, 0x78, 0xb5, 0x5e, 0x77, 0x02, 0x40, 0x2a, 0xbc, 0x77}, 66 | {0x9b, 0xc5, 0x2f, 0xb6, 0xa1, 0x3d, 0x5a, 0xc0, 0x9a, 0x23, 0xce, 0xbf, 0x9b, 0x94, 0xad, 0xd4, 0xe4, 0x6f, 0x0f, 0x0a, 0x64, 0x55, 0x22, 0x26, 0xbc, 0x8b, 0xba, 0xdf, 0xb9, 0x04, 0x3a, 0x5b}, 67 | {0x7b, 0x66, 0x20, 0xcf, 0x63, 0xeb, 0x29, 0xb9, 0x11, 0xc5, 0x5e, 0x18, 0x98, 0x15, 0x2f, 0x69, 0x60, 0xa7, 0xf1, 0x0c, 0xc1, 0x6b, 0x6f, 0xba, 0xd3, 0x2c, 0x83, 0x7d, 0x9d, 0x8e, 0x2b, 0x74}, 68 | {0x7b, 0x9b, 0xcd, 0x1a, 0xe3, 0xfd, 0xd9, 0xd4, 0x74, 0x2e, 0x0d, 0xbc, 0xe1, 0x3c, 0x54, 0x2c, 0xc1, 0x81, 0xb5, 0x0b, 0xa0, 0xf9, 0xd5, 0xe1, 0xca, 0x18, 0x00, 0xf9, 0xb5, 0x84, 0x85, 0xca}, 69 | {0xe7, 0xc9, 0xe2, 0xc8, 0x33, 0x41, 0x31, 0x15, 0xb3, 0x84, 0x3f, 0x79, 0x18, 0xe9, 0x98, 0x5a, 0x51, 0x60, 0xf0, 0x5a, 0x5b, 0xf8, 0x7f, 0x5f, 0xdd, 0x70, 0x27, 0xe3, 0x8f, 0xe3, 0x39, 0xf4}, 70 | {0x36, 0x0d, 0x5b, 0xa8, 0x0e, 0x59, 0xe2, 0x82, 0xa2, 0x39, 0xdf, 0x28, 0x34, 0x4d, 0x4f, 0x74, 0xee, 0xd8, 0x6b, 0xa0, 0xd8, 0x9d, 0xe7, 0x88, 0x05, 0x4e, 0xba, 0x6b, 0x50, 0x03, 0x89, 0xa2}, 71 | {0x89, 0xd6, 0x81, 0x5f, 0x68, 0x39, 0x36, 0x6c, 0x25, 0xad, 0xb6, 0x43, 0xff, 0x6b, 0x5e, 0x19, 0x63, 0xd3, 0xff, 0xd0, 0xce, 0x1a, 0xa7, 0x8c, 0x7f, 0xeb, 0x5a, 0x6e, 0x99, 0xf1, 0xb4, 0xdb}, 72 | {0x1f, 0x36, 0x6f, 0x27, 0xc8, 0x2f, 0x23, 0x81, 0xfc, 0x02, 0x80, 0x4f, 0x8b, 0x8d, 0xa8, 0x2f, 0x3d, 0x35, 0x91, 0xe3, 0x60, 0x90, 
0x7c, 0x57, 0x03, 0xc3, 0xa9, 0xed, 0xb1, 0x72, 0x3e, 0x3e}, 73 | } 74 | 75 | var _test_32_digests = [][32]byte{ 76 | {0x22, 0xd8, 0x35, 0x89, 0xe6, 0x42, 0xe1, 0xb1, 0x40, 0xed, 0x1b, 0x48, 0x48, 0x5b, 0x44, 0xc7, 0x07, 0x9d, 0xf3, 0xb2, 0x04, 0xbe, 0x48, 0x69, 0x42, 0x1d, 0x45, 0x49, 0xf3, 0x9e, 0x2c, 0xc7}, 77 | {0xac, 0xfe, 0x28, 0x1d, 0x11, 0x77, 0x7c, 0x1e, 0x22, 0xe0, 0xb7, 0x16, 0x0f, 0x01, 0x66, 0x92, 0xa7, 0xb3, 0xb5, 0x69, 0xed, 0x12, 0x8d, 0x93, 0xcf, 0xce, 0x27, 0x49, 0xfd, 0x1c, 0x85, 0x01}, 78 | {0xbc, 0xb2, 0xa2, 0x0b, 0x95, 0x58, 0x91, 0x64, 0x1f, 0x3a, 0x5d, 0x80, 0xaa, 0x11, 0x49, 0xa5, 0x1b, 0xac, 0xb7, 0x1e, 0x06, 0x62, 0x45, 0x34, 0xa5, 0x66, 0xd1, 0xc7, 0x5a, 0xa9, 0x68, 0xc9}, 79 | {0x4d, 0xe2, 0xaa, 0x4b, 0xc4, 0x6c, 0x1c, 0x3d, 0x42, 0x65, 0x34, 0x8a, 0x2c, 0x7a, 0x64, 0xa8, 0xd9, 0x8a, 0x82, 0xe4, 0x8b, 0x9c, 0xc9, 0x3c, 0x3c, 0xcd, 0x34, 0x4d, 0x71, 0x76, 0xda, 0x69}, 80 | {0x1e, 0x00, 0xd3, 0xc6, 0x59, 0x37, 0x27, 0x6a, 0x6a, 0xae, 0xa7, 0xd8, 0x37, 0x51, 0xac, 0x74, 0x2d, 0xe0, 0xb6, 0x7e, 0xc5, 0xa8, 0xa7, 0x56, 0x5b, 0x0f, 0x10, 0xba, 0x8a, 0x40, 0xe2, 0x1c}, 81 | {0x30, 0x96, 0xdb, 0x9d, 0xcf, 0xa9, 0x5c, 0xf4, 0xa4, 0xc4, 0xc9, 0xd5, 0xa0, 0x1e, 0xd4, 0x30, 0xe5, 0xe8, 0xad, 0x9d, 0xaa, 0x8e, 0x79, 0x1c, 0x5d, 0x6c, 0xac, 0x1a, 0xb3, 0x65, 0xb5, 0x14}, 82 | {0x7a, 0xee, 0xd5, 0xc9, 0x66, 0x17, 0x59, 0x7f, 0x89, 0xd6, 0xd9, 0xe8, 0xa8, 0xa7, 0x01, 0x47, 0x60, 0xc6, 0x88, 0xfd, 0x2a, 0x7a, 0xf6, 0x1d, 0x10, 0x20, 0x62, 0x7e, 0x7c, 0xd0, 0x1a, 0x0b}, 83 | {0xce, 0x0c, 0x94, 0xa7, 0x41, 0x25, 0xa5, 0xe3, 0x96, 0x77, 0xd6, 0xbd, 0x91, 0xca, 0xe6, 0x06, 0xf3, 0x90, 0xe0, 0x37, 0xcc, 0xc1, 0x2c, 0x7d, 0x97, 0x97, 0xf3, 0x56, 0xf0, 0xbd, 0x66, 0x43}, 84 | {0xbc, 0xb2, 0xa2, 0x0b, 0x95, 0x58, 0x91, 0x64, 0x1f, 0x3a, 0x5d, 0x80, 0xaa, 0x11, 0x49, 0xa5, 0x1b, 0xac, 0xb7, 0x1e, 0x06, 0x62, 0x45, 0x34, 0xa5, 0x66, 0xd1, 0xc7, 0x5a, 0xa9, 0x68, 0xc9}, 85 | {0x4d, 0xe2, 0xaa, 0x4b, 0xc4, 0x6c, 0x1c, 0x3d, 0x42, 0x65, 0x34, 0x8a, 0x2c, 0x7a, 0x64, 
0xa8, 0xd9, 0x8a, 0x82, 0xe4, 0x8b, 0x9c, 0xc9, 0x3c, 0x3c, 0xcd, 0x34, 0x4d, 0x71, 0x76, 0xda, 0x69}, 86 | {0x1e, 0x00, 0xd3, 0xc6, 0x59, 0x37, 0x27, 0x6a, 0x6a, 0xae, 0xa7, 0xd8, 0x37, 0x51, 0xac, 0x74, 0x2d, 0xe0, 0xb6, 0x7e, 0xc5, 0xa8, 0xa7, 0x56, 0x5b, 0x0f, 0x10, 0xba, 0x8a, 0x40, 0xe2, 0x1c}, 87 | {0x30, 0x96, 0xdb, 0x9d, 0xcf, 0xa9, 0x5c, 0xf4, 0xa4, 0xc4, 0xc9, 0xd5, 0xa0, 0x1e, 0xd4, 0x30, 0xe5, 0xe8, 0xad, 0x9d, 0xaa, 0x8e, 0x79, 0x1c, 0x5d, 0x6c, 0xac, 0x1a, 0xb3, 0x65, 0xb5, 0x14}, 88 | {0x7a, 0xee, 0xd5, 0xc9, 0x66, 0x17, 0x59, 0x7f, 0x89, 0xd6, 0xd9, 0xe8, 0xa8, 0xa7, 0x01, 0x47, 0x60, 0xc6, 0x88, 0xfd, 0x2a, 0x7a, 0xf6, 0x1d, 0x10, 0x20, 0x62, 0x7e, 0x7c, 0xd0, 0x1a, 0x0b}, 89 | {0xd4, 0x1f, 0xa7, 0x89, 0x8c, 0xf9, 0x05, 0xfc, 0x1e, 0xb0, 0x04, 0xd7, 0xaa, 0x56, 0x35, 0xec, 0x36, 0xf5, 0x0d, 0x41, 0x75, 0x64, 0x34, 0x71, 0xf0, 0x3b, 0x5b, 0xb2, 0xcc, 0xfa, 0x8c, 0xca}, 90 | {0xf8, 0xd9, 0x9e, 0xa7, 0x9c, 0xa1, 0xe0, 0x3a, 0x19, 0x4f, 0xd3, 0x2d, 0xbd, 0x40, 0x3a, 0xa3, 0x28, 0xe8, 0xa4, 0x27, 0x58, 0x44, 0x12, 0xf7, 0x69, 0x01, 0x66, 0xfa, 0xf1, 0x97, 0x30, 0xfe}, 91 | {0x99, 0x7c, 0x24, 0x0e, 0xed, 0x31, 0x0a, 0xda, 0x12, 0x16, 0x0e, 0x06, 0x44, 0xb8, 0x3f, 0xa2, 0x40, 0x52, 0xbc, 0x2d, 0xaf, 0x97, 0x00, 0x01, 0x5d, 0xbb, 0x0d, 0x06, 0x66, 0xb1, 0x59, 0xf2}, 92 | {0x99, 0x43, 0x52, 0x77, 0x28, 0x39, 0x6b, 0xeb, 0x03, 0x51, 0xc4, 0x5f, 0x7d, 0xd3, 0xe1, 0x41, 0x17, 0x66, 0x7b, 0x0e, 0xc9, 0x51, 0x01, 0xa7, 0x39, 0xf3, 0xc8, 0x63, 0x95, 0xa5, 0x92, 0x6b}, 93 | {0xce, 0x6e, 0xab, 0xd2, 0xe8, 0xad, 0x90, 0xad, 0xbe, 0xe5, 0x94, 0x96, 0xa9, 0x98, 0xe7, 0x83, 0x07, 0xa4, 0x0f, 0x8e, 0xe5, 0xb3, 0x5a, 0x05, 0xcd, 0xfd, 0xae, 0x9c, 0x07, 0xad, 0x26, 0xaa}, 94 | {0xf5, 0xee, 0x66, 0x87, 0x00, 0xed, 0xeb, 0x8b, 0xc2, 0x7d, 0x97, 0x52, 0x2d, 0xfc, 0x0a, 0x2a, 0x32, 0x0e, 0x92, 0xd2, 0x91, 0xd1, 0x69, 0x29, 0x9d, 0xb1, 0x3a, 0x65, 0x9f, 0x8e, 0x7e, 0x2a}, 95 | {0x88, 0x4a, 0xc8, 0x81, 0xdb, 0xa6, 0x79, 0x36, 0x54, 0xe9, 0x15, 0x5c, 0xff, 0x06, 0x35, 0x8b, 
0x6e, 0x0d, 0xaa, 0x3e, 0x7a, 0x82, 0x7c, 0x4a, 0xfe, 0x8a, 0x91, 0xb4, 0x34, 0xed, 0xe3, 0x17}, 96 | {0xe7, 0x92, 0xa4, 0x91, 0xdc, 0x1d, 0x83, 0xc8, 0x72, 0x5a, 0xd1, 0x27, 0x17, 0x78, 0x2b, 0xc7, 0x67, 0xe9, 0x56, 0xf2, 0xb4, 0x37, 0x51, 0xa1, 0x6b, 0x23, 0x8c, 0xc9, 0x03, 0x3d, 0x90, 0x1e}, 97 | {0xc4, 0x1f, 0xcc, 0x5e, 0xcb, 0x5e, 0x7d, 0x02, 0x12, 0x3f, 0x15, 0x9f, 0x35, 0xf4, 0x49, 0x55, 0xba, 0xc6, 0x47, 0xd2, 0x85, 0x85, 0x61, 0x69, 0xa5, 0x60, 0x7a, 0x32, 0x7f, 0x8e, 0x09, 0x5f}, 98 | {0x60, 0xb6, 0xab, 0xb5, 0x6b, 0x4d, 0xce, 0x6f, 0x1d, 0x77, 0x2e, 0x9b, 0x0d, 0x60, 0x76, 0xe3, 0xcb, 0x79, 0xbc, 0x40, 0x2d, 0x16, 0xf6, 0xa3, 0x06, 0x12, 0x36, 0x71, 0xda, 0xfd, 0x28, 0x89}, 99 | {0x67, 0xdd, 0x7f, 0x26, 0x6d, 0x2e, 0xf3, 0xef, 0x13, 0xb6, 0x09, 0x73, 0x82, 0xbc, 0x73, 0x25, 0x83, 0xc0, 0x34, 0x90, 0xe8, 0xad, 0xf0, 0x17, 0x8d, 0xed, 0xad, 0x29, 0xf7, 0x78, 0x9c, 0x28}, 100 | {0x00, 0xb0, 0xd5, 0xd0, 0x8e, 0x9b, 0xe5, 0xf0, 0x46, 0x8e, 0x60, 0x25, 0x95, 0xe5, 0x3a, 0x46, 0xb1, 0x07, 0x74, 0x97, 0xed, 0x0a, 0x2f, 0x9a, 0x3f, 0xf3, 0x94, 0x2f, 0xb3, 0x12, 0xa1, 0x91}, 101 | {0x8d, 0x36, 0x16, 0xc6, 0x00, 0x88, 0xd6, 0x69, 0xb4, 0x5a, 0x71, 0x18, 0x41, 0xe5, 0x4d, 0xb2, 0xd9, 0x00, 0x7a, 0x17, 0x63, 0x6a, 0x9b, 0x2e, 0x22, 0x12, 0x5b, 0xa3, 0x74, 0x7c, 0x95, 0xc9}, 102 | {0x4e, 0xfc, 0x5c, 0x18, 0xd1, 0x8a, 0x5b, 0x57, 0x7c, 0x86, 0x3e, 0xe2, 0x75, 0x91, 0xf2, 0xb3, 0x5f, 0xd0, 0x92, 0xbc, 0x77, 0xbe, 0x1b, 0xef, 0x1a, 0x7c, 0xe2, 0xd8, 0x8d, 0x7b, 0xef, 0xf7}, 103 | {0xcd, 0x78, 0x15, 0x64, 0x2c, 0x78, 0x57, 0x74, 0x2b, 0xb7, 0xdb, 0x74, 0xe2, 0xab, 0x82, 0xbb, 0x61, 0x32, 0x3e, 0xe4, 0xb1, 0x00, 0xde, 0xb2, 0x35, 0x1e, 0x3e, 0x1c, 0x91, 0x9d, 0x87, 0xde}, 104 | {0x17, 0xcc, 0x52, 0x5c, 0x60, 0x9e, 0xd8, 0xd4, 0xf4, 0x56, 0x28, 0x16, 0xde, 0xde, 0x73, 0xfe, 0xd9, 0x92, 0xb7, 0x99, 0x15, 0x24, 0x1b, 0x40, 0xb0, 0xda, 0x9a, 0xf8, 0x24, 0x38, 0x13, 0xbd}, 105 | {0xd0, 0x45, 0x9b, 0xe3, 0x9a, 0xae, 0x78, 0x41, 0xcd, 0x12, 0x9a, 0x6b, 0x91, 0x58, 0x29, 0x75, 
0xae, 0x21, 0xd3, 0xf2, 0x5e, 0x98, 0xab, 0x09, 0xb0, 0xaa, 0x62, 0x96, 0x35, 0x64, 0x18, 0x48}, 106 | {0xd2, 0x5b, 0x10, 0xf1, 0x35, 0xaa, 0x04, 0x49, 0x4e, 0x51, 0x30, 0x0d, 0xb6, 0xbf, 0xa0, 0x9b, 0xa0, 0xf5, 0x66, 0x5f, 0x28, 0xc7, 0x8d, 0xa8, 0x3e, 0x0f, 0xe4, 0xa7, 0xc9, 0xd4, 0x0f, 0x7d}, 107 | {0xb7, 0x80, 0xc2, 0x31, 0xe6, 0x75, 0x0c, 0xad, 0x0f, 0xe8, 0xed, 0x59, 0x34, 0xdb, 0xfb, 0x41, 0xd4, 0x38, 0x73, 0x7a, 0x47, 0x01, 0xb8, 0xea, 0xea, 0x2e, 0x01, 0x8e, 0x4f, 0x09, 0x64, 0x82}, 108 | } 109 | 110 | func TestHash(t *testing.T) { 111 | tests := []struct { 112 | name string 113 | count uint32 114 | }{ 115 | { 116 | name: "hash 1 block", 117 | count: 1, 118 | }, 119 | { 120 | name: "hash 4 blocks", 121 | count: 4, 122 | }, 123 | { 124 | name: "hash 8 blocks", 125 | count: 8, 126 | }, 127 | { 128 | name: "hash 16 blocks", 129 | count: 16, 130 | }, 131 | { 132 | name: "hash 18 blocks", 133 | count: 18, 134 | }, 135 | { 136 | name: "hash 24 blocks", 137 | count: 24, 138 | }, 139 | { 140 | name: "hash 32 blocks", 141 | count: 32, 142 | }, 143 | { 144 | name: "hash 31 blocks", 145 | count: 31, 146 | }, 147 | } 148 | for _, tt := range tests { 149 | t.Run(tt.name, func(t *testing.T) { 150 | digests := make([][32]byte, tt.count) 151 | err := Hash(digests, _test_32_block[:2*tt.count]) 152 | if err != nil { 153 | t.Log(err) 154 | t.Fail() 155 | } 156 | if !reflect.DeepEqual(digests, _test_32_digests[:tt.count]) { 157 | t.Logf("Digests are different\n Expected: %x\n Produced: %x\n", 158 | _test_32_digests[:tt.count], digests) 159 | t.Fail() 160 | } 161 | digests2 := make([][32]byte, tt.count) 162 | sha256_1_generic(digests2, _test_32_block[:2*tt.count]) 163 | if err != nil { 164 | t.Log(err) 165 | t.Fail() 166 | } 167 | if !reflect.DeepEqual(digests2, _test_32_digests[:tt.count]) { 168 | t.Logf("Digests are different\n Expected: %x\n Produced: %x\n", 169 | _test_32_digests[:tt.count], digests) 170 | t.Fail() 171 | } 172 | }) 173 | } 174 | } 175 | 176 | func 
TestHashByteSlice(t *testing.T) { // exercises the flat []byte front-end against the same known digest vectors as TestHash 177 | tests := []struct { 178 | name string 179 | count uint32 180 | }{ 181 | { 182 | name: "hash 1 block", 183 | count: 1, 184 | }, 185 | { 186 | name: "hash 4 blocks", 187 | count: 4, 188 | }, 189 | { 190 | name: "hash 8 blocks", 191 | count: 8, 192 | }, 193 | { 194 | name: "hash 16 blocks", 195 | count: 16, 196 | }, 197 | { 198 | name: "hash 18 blocks", 199 | count: 18, 200 | }, 201 | { 202 | name: "hash 24 blocks", 203 | count: 24, 204 | }, 205 | { 206 | name: "hash 32 blocks", 207 | count: 32, 208 | }, 209 | { 210 | name: "hash 31 blocks", 211 | count: 31, 212 | }, 213 | } 214 | for _, tt := range tests { 215 | t.Run(tt.name, func(t *testing.T) { 216 | digests := make([]byte, 32*tt.count) 217 | chunks := make([]byte, 64*tt.count) 218 | for i := 0; i < int(2*tt.count); i += 2 { // pack each pair of 32-byte test blocks into one contiguous 64-byte chunk 219 | if n := copy(chunks[32*i:32*i+32], _test_32_block[i][:]); n != 32 { 220 | t.Logf("copied wrong number of bytes") 221 | t.Fail() 222 | } 223 | if n := copy(chunks[32*i+32:32*i+64], _test_32_block[i+1][:]); n != 32 { 224 | t.Logf("copied wrong number of bytes") 225 | t.Fail() 226 | } 227 | } 228 | 229 | err := HashByteSlice(digests, chunks) 230 | if err != nil { 231 | t.Log(err) 232 | t.Fail() 233 | } 234 | for i := 0; i < int(tt.count); i++ { // each 32-byte window of the output must match the expected digest 235 | if !reflect.DeepEqual(digests[32*i:32*i+32], _test_32_digests[i][:]) { 236 | t.Logf("Digests are different\n Expected: %x\n Produced: %x\n", 237 | _test_32_digests[i][:], digests[32*i:32*i+32]) 238 | t.Fail() 239 | } 240 | } 241 | }) 242 | } 243 | } 244 | 245 | func BenchmarkHash_1(b *testing.B) { // one digest from two chunks per iteration 246 | chunks := make([][32]byte, 2) 247 | digests := make([][32]byte, 1) 248 | b.ResetTimer() 249 | for i := 0; i < b.N; i++ { 250 | Hash(digests, chunks) 251 | } 252 | } 253 | 254 | func BenchmarkHash_4(b *testing.B) { // four digests from eight chunks per iteration 255 | chunks := make([][32]byte, 8) 256 | digests := make([][32]byte, 4) 257 | b.ResetTimer() 258 | for i := 0; i < b.N; i++ { 259 | Hash(digests, chunks) 260 | } 261 | } 262 | 263 | func
BenchmarkHash_8(b *testing.B) { 264 | chunks := make([][32]byte, 16) 265 | digests := make([][32]byte, 8) 266 | b.ResetTimer() 267 | for i := 0; i < b.N; i++ { 268 | Hash(digests, chunks) 269 | } 270 | } 271 | 272 | func BenchmarkHash_16(b *testing.B) { 273 | chunks := make([][32]byte, 32) 274 | digests := make([][32]byte, 16) 275 | b.ResetTimer() 276 | for i := 0; i < b.N; i++ { 277 | Hash(digests, chunks) 278 | } 279 | } 280 | 281 | func BenchmarkHashList(b *testing.B) { 282 | balances := make([][32]byte, 400000) 283 | for i := 0; i < len(balances); i++ { 284 | balances[i] = [32]byte{'A'} 285 | } 286 | digests := make([][32]byte, 200000) 287 | b.ResetTimer() 288 | for i := 0; i < b.N; i++ { 289 | Hash(digests, balances) 290 | } 291 | } 292 | -------------------------------------------------------------------------------- /src/sha256_armv8_neon_x4.S: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021-2024 Prysmatic Labs 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | */ 24 | 25 | 26 | ######################################################################################################### 27 | # 28 | # void hashtree_sha256_neon_x4( unsigned char *output, unsigned char *input, size_t count) 29 | # 30 | # armv8-a implementation with Neon but no crypto extensions 31 | # as in the Cortex A-72 of a Raspberry-Pi 4.b 32 | # 33 | # It reads four blocks at a time, and schedules 4 words at the same time using ASIMD instructions 34 | # There are no bound checks, caller is responsible to check that memory up to output + 32*count 35 | # is writable. 36 | # 37 | ######################################################################################################## 38 | 39 | #ifdef __aarch64__ 40 | .text 41 | .arch armv8-a 42 | 43 | output .req x0 44 | input .req x1 45 | count .req x2 46 | last .req x2 47 | 48 | digest .req x3 49 | k256 .req x4 50 | padding .req x5 51 | digest2 .req x6 52 | post64 .req x7 53 | postminus176 .req x9 54 | post32 .req x10 55 | postminus80 .req x11 56 | 57 | A_ .req v0 58 | B_ .req v1 59 | C_ .req v2 60 | D_ .req v3 61 | E_ .req v4 62 | F_ .req v5 63 | G_ .req v6 64 | H_ .req v7 65 | 66 | M1 .req v16 67 | M2 .req v17 68 | M3 .req v18 69 | M4 .req v19 70 | MQ1 .req q16 71 | MQ2 .req q17 72 | MQ3 .req q18 73 | MQ4 .req q19 74 | 75 | VR1 .req v24 76 | VR2 .req v25 77 | VR3 .req v26 78 | VR4 .req v27 79 | QR2 .req q25 80 | QR4 .req q27 81 | T1 .req v28 82 | T2 .req v29 83 | T3 .req v30 84 | T4 .req v31 85 | T5 .req v20 86 | T6 .req v21 87 | T7 .req v22 88 | T8 .req v23 89 | TQ4 .req q31 90 | TQ5 .req q20 91 | TQ6 .req q21 92 | TQ7 .req q22 93 | 94 | 95 | ############################################################################ 96 | #
round computes one round, 4 lanes at a time. Constants are read from k256, 97 | # one message word is consumed from MW and saved to sp transposed this macro 98 | # adds 16 to sp. offset has to be set to 0 on the first round 99 | ############################################################################ 100 | .macro round A, B, C, D, E, F, G, H, MV, MQ 101 | ushr T1.4s, \E\().4s, #6 102 | shl T2.4s, \E\().4s, #(32-6) 103 | ushr VR2.4s, \E\().4s, #11 104 | shl VR1.4s, \E\().4s, #(32-11) 105 | and T3.16b, \E\().16b, \F\().16b 106 | bic T4.16b, \G\().16b, \E\().16b 107 | orr T1.16b, T1.16b, T2.16b // ROTR^6(E) 108 | ushr T2.4s, \E\().4s, #25 109 | ldr QR4, [k256, #.Loffset] 110 | shl VR3.4s, \E\().4s, #(32-25) 111 | orr VR1.16b, VR2.16b, VR1.16b // ROTR^11(E) 112 | eor T3.16b, T3.16b, T4.16b // CH(E,F,G) 113 | orr T2.16b, T2.16b, VR3.16b // ROTR^25(E) 114 | eor VR3.16b, \A\().16b, \C\().16b 115 | eor T1.16b, T1.16b, VR1.16b 116 | add T4.4s, \MV\().4s, VR4.4s // W + K 117 | add \H\().4s, \H\().4s, T3.4s 118 | ushr T3.4s, \A\().4s, #2 119 | and VR3.16b, VR3.16b, \B\().16b 120 | shl VR4.4s, \A\().4s, #(32-2) 121 | eor T1.16b, T1.16b, T2.16b // Sigma1 122 | ushr T2.4s, \A\().4s, #13 123 | shl VR1.4s, \A\().4s, #(32-13) 124 | add \H\().4s, \H\().4s, T4.4s 125 | orr T3.16b, T3.16b, VR4.16b // ROTR^2(A) 126 | and VR4.16b, \A\().16b, \C\().16b 127 | ushr T4.4s, \A\().4s, #22 128 | shl VR2.4s, \A\().4s, #(32 - 22) 129 | orr T2.16b, T2.16b, VR1.16b // ROTR^13(A) 130 | add \H\().4s, \H\().4s, T1.4s 131 | eor VR3.16b, VR3.16b, VR4.16b // MAJ(A,B,C) 132 | orr T4.16b, T4.16b, VR2.16b // ROTR^22(A) 133 | eor T2.16b, T2.16b, T3.16b 134 | add \D\().4s, \D\().4s, \H\().4s 135 | add \H\().4s, \H\().4s, VR3.4s 136 | eor T2.16b, T2.16b, T4.16b // Sigma0 137 | str \MQ, [sp, #.Loffset] 138 | add \H\().4s, \H\().4s, T2.4s 139 | .set .Loffset, .Loffset + 16 140 | .endm 141 | .macro four_rounds A, B, C, D, E, F, G, H, MV1, MV2, MV3, MV4, MQ1, MQ2, MQ3, MQ4 142 | round \A, \B, \C, \D, \E, \F, 
\G, \H, \MV1, \MQ1 143 | round \H, \A, \B, \C, \D, \E, \F, \G, \MV2, \MQ2 144 | round \G, \H, \A, \B, \C, \D, \E, \F, \MV3, \MQ3 145 | round \F, \G, \H, \A, \B, \C, \D, \E, \MV4, \MQ4 146 | .endm 147 | 148 | ################################################################################## 149 | # round_and_sched performs one round and schedules one word, 4 lanes at a time 150 | # reads previous scheduled words from sp, constants from k256 151 | # 152 | # 153 | ################################################################################## 154 | .macro round_and_sched A, B, C, D, E, F, G, H 155 | ldp TQ6, TQ5, [sp, #(.Loffset-256)] // W16, W15 156 | ushr T1.4s, \E\().4s, #6 157 | shl T2.4s, \E\().4s, #(32-6) 158 | ushr VR2.4s, \E\().4s, #11 159 | shl VR1.4s, \E\().4s, #(32-11) 160 | and T3.16b, \E\().16b, \F\().16b 161 | bic T4.16b, \G\().16b, \E\().16b 162 | ushr M1.4s, T5.4s, #7 163 | ldr TQ7, [sp, #(.Loffset - 32)] // W2 164 | shl M2.4s, T5.4s, #(32-7) 165 | orr T1.16b, T1.16b, T2.16b // ROTR^6(E) 166 | ushr T2.4s, \E\().4s, #25 167 | shl VR3.4s, \E\().4s, #(32-25) 168 | orr VR1.16b, VR2.16b, VR1.16b // ROTR^11(E) 169 | eor T3.16b, T3.16b, T4.16b // CH(E,F,G) 170 | ldr QR4, [k256, #.Loffset] 171 | 172 | orr M1.16b, M1.16b, M2.16b // ROTR7(W15) 173 | ushr M3.4s, T7.4s, #17 174 | shl M4.4s, T7.4s, #(32-17) 175 | ushr M2.4s, T5.4s, #18 176 | shl T8.4s, T5.4s, #(32-18) 177 | 178 | orr T2.16b, T2.16b, VR3.16b // ROTR^25(E) 179 | eor VR3.16b, \A\().16b, \C\().16b 180 | orr M3.16b, M3.16b, M4.16b // ROTR^17(W2) 181 | ldr TQ4, [sp, #(.Loffset - 112)] // W7 182 | ushr M4.4s, T7.4s, #19 183 | shl VR2.4s, T7.4s, #(32-19) 184 | orr M2.16b, M2.16b, T8.16b // ROTR^18(W15) 185 | ushr T8.4s, T5.4s, #3 186 | orr M4.16b, M4.16b, VR2.16b // ROTR^19(W2) 187 | 188 | eor T1.16b, T1.16b, VR1.16b 189 | eor M1.16b, M1.16b, M2.16b 190 | ushr M2.4s, T7.4s, #10 191 | eor M3.16b, M3.16b, M4.16b 192 | add \H\().4s, \H\().4s, T3.4s 193 | eor M1.16b, M1.16b, T8.16b // sigma0 194 | add 
T6.4s, T6.4s, T4.4s // W7 + W16 195 | eor M3.16b, M3.16b, M2.16b // sigma1 196 | 197 | 198 | ushr T3.4s, \A\().4s, #2 199 | and VR3.16b, VR3.16b, \B\().16b 200 | add M1.4s, M1.4s, T6.4s 201 | shl T6.4s, \A\().4s, #(32-2) 202 | eor T1.16b, T1.16b, T2.16b // Sigma1 203 | ushr T2.4s, \A\().4s, #13 204 | add M1.4s, M1.4s, M3.4s // W0 205 | add \H\().4s, \H\().4s, T1.4s 206 | shl VR1.4s, \A\().4s, #(32-13) 207 | orr T3.16b, T3.16b, T6.16b // ROTR^2(A) 208 | add T5.4s, M1.4s, VR4.4s // W + K 209 | str MQ1, [sp, #.Loffset] 210 | and VR4.16b, \A\().16b, \C\().16b 211 | ushr T4.4s, \A\().4s, #22 212 | shl VR2.4s, \A\().4s, #(32 - 22) 213 | add \H\().4s, \H\().4s, T5.4s 214 | orr T2.16b, T2.16b, VR1.16b // ROTR^13(A) 215 | eor VR3.16b, VR3.16b, VR4.16b // MAJ(A,B,C) 216 | orr T4.16b, T4.16b, VR2.16b // ROTR^22(A) 217 | eor T2.16b, T2.16b, T3.16b 218 | add \D\().4s, \D\().4s, \H\().4s 219 | add \H\().4s, \H\().4s, VR3.4s 220 | eor T2.16b, T2.16b, T4.16b // Sigma0 221 | add \H\().4s, \H\().4s, T2.4s 222 | .set .Loffset, .Loffset + 16 223 | .endm 224 | .macro four_rounds_and_sched A, B, C, D, E, F, G, H 225 | round_and_sched \A, \B, \C, \D, \E, \F, \G, \H 226 | round_and_sched \H, \A, \B, \C, \D, \E, \F, \G 227 | round_and_sched \G, \H, \A, \B, \C, \D, \E, \F 228 | round_and_sched \F, \G, \H, \A, \B, \C, \D, \E 229 | .endm 230 | 231 | ########################################################################## 232 | # performs one round reading the precomputed words from padding 233 | ########################################################################## 234 | .macro round_padding A, B, C, D, E, F, G, H 235 | ushr T1.4s, \E\().4s, #6 236 | shl T2.4s, \E\().4s, #(32-6) 237 | ushr VR2.4s, \E\().4s, #11 238 | shl VR1.4s, \E\().4s, #(32-11) 239 | and T3.16b, \E\().16b, \F\().16b 240 | bic T4.16b, \G\().16b, \E\().16b 241 | orr T1.16b, T1.16b, T2.16b // ROTR^6(E) 242 | ushr T2.4s, \E\().4s, #25 243 | shl VR3.4s, \E\().4s, #(32-25) 244 | orr VR1.16b, VR2.16b, VR1.16b // ROTR^11(E) 
245 | eor T3.16b, T3.16b, T4.16b // CH(E,F,G) 246 | orr T2.16b, T2.16b, VR3.16b // ROTR^25(E) 247 | eor VR3.16b, \A\().16b, \C\().16b 248 | eor T1.16b, T1.16b, VR1.16b 249 | add \H\().4s, \H\().4s, T3.4s 250 | ushr T3.4s, \A\().4s, #2 251 | ldr QR2, [padding, #.Loffset] 252 | and VR3.16b, VR3.16b, \B\().16b 253 | shl VR4.4s, \A\().4s, #(32-2) 254 | eor T1.16b, T1.16b, T2.16b // Sigma1 255 | ushr T2.4s, \A\().4s, #13 256 | shl VR1.4s, \A\().4s, #(32-13) 257 | add \H\().4s, \H\().4s, VR2.4s 258 | orr T3.16b, T3.16b, VR4.16b // ROTR^2(A) 259 | and VR4.16b, \A\().16b, \C\().16b 260 | ushr T4.4s, \A\().4s, #22 261 | shl VR2.4s, \A\().4s, #(32 - 22) 262 | orr T2.16b, T2.16b, VR1.16b // ROTR^13(A) 263 | add \H\().4s, \H\().4s, T1.4s 264 | eor VR3.16b, VR3.16b, VR4.16b // MAJ(A,B,C) 265 | orr T4.16b, T4.16b, VR2.16b // ROTR^22(A) 266 | eor T2.16b, T2.16b, T3.16b 267 | add \D\().4s, \D\().4s, \H\().4s 268 | add \H\().4s, \H\().4s, VR3.4s 269 | eor T2.16b, T2.16b, T4.16b // Sigma0 270 | add \H\().4s, \H\().4s, T2.4s 271 | .set .Loffset, .Loffset + 16 272 | .endm 273 | .macro four_rounds_padding A, B, C, D, E, F, G, H 274 | round_padding \A, \B, \C, \D, \E, \F, \G, \H 275 | round_padding \H, \A, \B, \C, \D, \E, \F, \G 276 | round_padding \G, \H, \A, \B, \C, \D, \E, \F 277 | round_padding \F, \G, \H, \A, \B, \C, \D, \E 278 | .endm 279 | 280 | #ifdef __APPLE__ 281 | .global _hashtree_sha256_neon_x4 282 | #else 283 | .global hashtree_sha256_neon_x4 284 | #endif 285 | #ifdef __APPLE__ 286 | //.type hashtree_sha256_neon_x4,%function 287 | #else 288 | .type hashtree_sha256_neon_x4,%function 289 | #endif 290 | 291 | .align 5 292 | #ifdef __APPLE__ 293 | _hashtree_sha256_neon_x4: 294 | #else 295 | hashtree_sha256_neon_x4: 296 | #endif 297 | sub sp, sp, #1024 298 | 299 | #ifdef __APPLE__ 300 | adrp k256,.LK256x4@GOTPAGE 301 | ldr k256, [k256, .LK256x4@GOTPAGEOFF] 302 | adrp padding, .LPADDINGx4@GOTPAGE 303 | ldr padding, [padding, .LPADDINGx4@GOTPAGEOFF] 304 | adrp digest, 
.LDIGESTx4L@GOTPAGE 305 | ldr digest, [digest, .LDIGESTx4L@GOTPAGEOFF] 306 | adrp digest2, .LDIGESTx4H@GOTPAGE 307 | ldr digest2, [digest2, .LDIGESTx4H@GOTPAGEOFF] 308 | #else 309 | adrp k256,.LK256x4 310 | add k256, k256, #:lo12:.LK256x4 311 | adrp padding, .LPADDINGx4 312 | add padding, padding, #:lo12:.LPADDINGx4 313 | adrp digest, .LDIGESTx4L 314 | add digest, digest, #:lo12:.LDIGESTx4L 315 | adrp digest2, .LDIGESTx4H 316 | add digest2, digest2, #:lo12:.LDIGESTx4H 317 | #endif 318 | mov post64, #64 319 | mov post32, #32 320 | mov postminus80, #-80 321 | mov postminus176, #-176 322 | .Larmv8_neon_x4_loop: 323 | cmp count, 4 324 | b.lo .Lsha256_armv8_x4_epilog 325 | ld1 {A_.4s, B_.4s, C_.4s, D_.4s}, [digest] 326 | ld1 {E_.4s, F_.4s, G_.4s, H_.4s}, [digest2] // stall 8 cycles 327 | 328 | .set .Loffset, 0 329 | .rept 2 330 | ld4 {M1.s, M2.s, M3.s, M4.s}[0], [input], post64 331 | ld4 {M1.s, M2.s, M3.s, M4.s}[1], [input], post64 332 | ld4 {M1.s, M2.s, M3.s, M4.s}[2], [input], post64 333 | ld4 {M1.s, M2.s, M3.s, M4.s}[3], [input], postminus176 334 | 335 | rev32 M1.16b, M1.16b 336 | rev32 M2.16b, M2.16b 337 | rev32 M3.16b, M3.16b 338 | rev32 M4.16b, M4.16b 339 | 340 | four_rounds A_, B_, C_, D_, E_, F_, G_, H_, M1, M2, M3, M4, MQ1, MQ2, MQ3, MQ4 341 | 342 | ld4 {M1.s, M2.s, M3.s, M4.s}[0], [input], post64 343 | ld4 {M1.s, M2.s, M3.s, M4.s}[1], [input], post64 344 | ld4 {M1.s, M2.s, M3.s, M4.s}[2], [input], post64 345 | ld4 {M1.s, M2.s, M3.s, M4.s}[3], [input], postminus176 346 | 347 | rev32 M1.16b, M1.16b 348 | rev32 M2.16b, M2.16b 349 | rev32 M3.16b, M3.16b 350 | rev32 M4.16b, M4.16b 351 | 352 | four_rounds E_, F_, G_, H_, A_, B_, C_, D_, M1, M2, M3, M4, MQ1, MQ2, MQ3, MQ4 353 | .endr 354 | .rept 6 355 | four_rounds_and_sched A_, B_, C_, D_, E_, F_, G_, H_ 356 | four_rounds_and_sched E_, F_, G_, H_, A_, B_, C_, D_ 357 | .endr 358 | 359 | # add previous digest 360 | ld1 {M1.4s, M2.4s, M3.4s, M4.4s}, [digest] 361 | ld1 {T5.4s, T6.4s, T7.4s, T8.4s}, [digest2] // stall 8 
cycles 362 | add A_.4s, A_.4s, M1.4s 363 | add B_.4s, B_.4s, M2.4s 364 | add C_.4s, C_.4s, M3.4s 365 | add D_.4s, D_.4s, M4.4s 366 | add E_.4s, E_.4s, T5.4s 367 | add F_.4s, F_.4s, T6.4s 368 | add G_.4s, G_.4s, T7.4s 369 | add H_.4s, H_.4s, T8.4s 370 | 371 | 372 | # save state 373 | mov M1.16b, A_.16b 374 | mov M2.16b, B_.16b 375 | mov M3.16b, C_.16b 376 | mov M4.16b, D_.16b 377 | mov T5.16b, E_.16b 378 | mov T6.16b, F_.16b 379 | mov T7.16b, G_.16b 380 | mov T8.16b, H_.16b 381 | 382 | # rounds with padding 383 | .set .Loffset, 0 384 | .rept 8 385 | four_rounds_padding A_, B_, C_, D_, E_, F_, G_, H_ 386 | four_rounds_padding E_, F_, G_, H_, A_, B_, C_, D_ 387 | .endr 388 | 389 | #add previous digest 390 | add A_.4s, A_.4s, M1.4s 391 | add B_.4s, B_.4s, M2.4s 392 | add C_.4s, C_.4s, M3.4s 393 | add D_.4s, D_.4s, M4.4s 394 | add E_.4s, E_.4s, T5.4s 395 | add F_.4s, F_.4s, T6.4s 396 | add G_.4s, G_.4s, T7.4s 397 | add H_.4s, H_.4s, T8.4s 398 | #change endianness transpose and store 399 | rev32 A_.16b, A_.16b 400 | rev32 B_.16b, B_.16b 401 | rev32 C_.16b, C_.16b 402 | rev32 D_.16b, D_.16b 403 | rev32 E_.16b, E_.16b 404 | rev32 F_.16b, F_.16b 405 | rev32 G_.16b, G_.16b 406 | rev32 H_.16b, H_.16b 407 | 408 | st4 {A_.s, B_.s, C_.s, D_.s}[0], [output], post32 409 | st4 {A_.s, B_.s, C_.s, D_.s}[1], [output], post32 410 | st4 {A_.s, B_.s, C_.s, D_.s}[2], [output], post32 411 | st4 {A_.s, B_.s, C_.s, D_.s}[3], [output], postminus80 412 | st4 {E_.s, F_.s, G_.s, H_.s}[0], [output], post32 413 | st4 {E_.s, F_.s, G_.s, H_.s}[1], [output], post32 414 | st4 {E_.s, F_.s, G_.s, H_.s}[2], [output], post32 415 | st4 {E_.s, F_.s, G_.s, H_.s}[3], [output], #16 416 | add input, input, #192 417 | sub count, count, #4 418 | b .Larmv8_neon_x4_loop 419 | .Lsha256_armv8_x4_epilog: 420 | add sp, sp, #1024 421 | #ifdef __APPLE__ 422 | b _hashtree_sha256_neon_x1 423 | #else 424 | b hashtree_sha256_neon_x1 425 | #endif 426 | .section .rodata,"a" 427 | .align 4 428 | .LDIGESTx4L: 429 | .word 
0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667,\ 430 | 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85,\ 431 | 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372,\ 432 | 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a 433 | .LDIGESTx4H: 434 | .word 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f,\ 435 | 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c,\ 436 | 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab,\ 437 | 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 438 | .LK256x4: 439 | .word 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98,\ 440 | 0x71374491, 0x71374491, 0x71374491, 0x71374491,\ 441 | 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf,\ 442 | 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5,\ 443 | 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b,\ 444 | 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1,\ 445 | 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4,\ 446 | 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5,\ 447 | 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98,\ 448 | 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01,\ 449 | 0x243185be, 0x243185be, 0x243185be, 0x243185be,\ 450 | 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3,\ 451 | 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74,\ 452 | 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe,\ 453 | 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7,\ 454 | 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174,\ 455 | 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1,\ 456 | 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786,\ 457 | 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6,\ 458 | 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc,\ 459 | 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f,\ 460 | 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa,\ 461 | 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc,\ 462 | 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da,\ 463 | 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152,\ 464 | 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d,\ 465 | 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8,\ 466 | 0xbf597fc7, 0xbf597fc7, 
0xbf597fc7, 0xbf597fc7,\ 467 | 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3,\ 468 | 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147,\ 469 | 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351,\ 470 | 0x14292967, 0x14292967, 0x14292967, 0x14292967,\ 471 | 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85,\ 472 | 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138,\ 473 | 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc,\ 474 | 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13,\ 475 | 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354,\ 476 | 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb,\ 477 | 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e,\ 478 | 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85,\ 479 | 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1,\ 480 | 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b,\ 481 | 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70,\ 482 | 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3,\ 483 | 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819,\ 484 | 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624,\ 485 | 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585,\ 486 | 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070,\ 487 | 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116,\ 488 | 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08,\ 489 | 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c,\ 490 | 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5,\ 491 | 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3,\ 492 | 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a,\ 493 | 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f,\ 494 | 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3,\ 495 | 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee,\ 496 | 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f,\ 497 | 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814,\ 498 | 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208,\ 499 | 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa,\ 500 | 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb,\ 501 | 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7,\ 502 | 0xc67178f2, 0xc67178f2, 0xc67178f2, 
0xc67178f2 503 | 504 | .LPADDINGx4: 505 | .word 0xc28a2f98, 0xc28a2f98, 0xc28a2f98, 0xc28a2f98,\ 506 | 0x71374491, 0x71374491, 0x71374491, 0x71374491,\ 507 | 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf,\ 508 | 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5,\ 509 | 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b,\ 510 | 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1,\ 511 | 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4,\ 512 | 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5,\ 513 | 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98,\ 514 | 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01,\ 515 | 0x243185be, 0x243185be, 0x243185be, 0x243185be,\ 516 | 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3,\ 517 | 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74,\ 518 | 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe,\ 519 | 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7,\ 520 | 0xc19bf374, 0xc19bf374, 0xc19bf374, 0xc19bf374,\ 521 | 0x649b69c1, 0x649b69c1, 0x649b69c1, 0x649b69c1,\ 522 | 0xf0fe4786, 0xf0fe4786, 0xf0fe4786, 0xf0fe4786,\ 523 | 0x0fe1edc6, 0x0fe1edc6, 0x0fe1edc6, 0x0fe1edc6,\ 524 | 0x240cf254, 0x240cf254, 0x240cf254, 0x240cf254,\ 525 | 0x4fe9346f, 0x4fe9346f, 0x4fe9346f, 0x4fe9346f,\ 526 | 0x6cc984be, 0x6cc984be, 0x6cc984be, 0x6cc984be,\ 527 | 0x61b9411e, 0x61b9411e, 0x61b9411e, 0x61b9411e,\ 528 | 0x16f988fa, 0x16f988fa, 0x16f988fa, 0x16f988fa,\ 529 | 0xf2c65152, 0xf2c65152, 0xf2c65152, 0xf2c65152,\ 530 | 0xa88e5a6d, 0xa88e5a6d, 0xa88e5a6d, 0xa88e5a6d,\ 531 | 0xb019fc65, 0xb019fc65, 0xb019fc65, 0xb019fc65,\ 532 | 0xb9d99ec7, 0xb9d99ec7, 0xb9d99ec7, 0xb9d99ec7,\ 533 | 0x9a1231c3, 0x9a1231c3, 0x9a1231c3, 0x9a1231c3,\ 534 | 0xe70eeaa0, 0xe70eeaa0, 0xe70eeaa0, 0xe70eeaa0,\ 535 | 0xfdb1232b, 0xfdb1232b, 0xfdb1232b, 0xfdb1232b,\ 536 | 0xc7353eb0, 0xc7353eb0, 0xc7353eb0, 0xc7353eb0,\ 537 | 0x3069bad5, 0x3069bad5, 0x3069bad5, 0x3069bad5,\ 538 | 0xcb976d5f, 0xcb976d5f, 0xcb976d5f, 0xcb976d5f,\ 539 | 0x5a0f118f, 0x5a0f118f, 0x5a0f118f, 0x5a0f118f,\ 540 | 0xdc1eeefd, 0xdc1eeefd, 
0xdc1eeefd, 0xdc1eeefd,\ 541 | 0x0a35b689, 0x0a35b689, 0x0a35b689, 0x0a35b689,\ 542 | 0xde0b7a04, 0xde0b7a04, 0xde0b7a04, 0xde0b7a04,\ 543 | 0x58f4ca9d, 0x58f4ca9d, 0x58f4ca9d, 0x58f4ca9d,\ 544 | 0xe15d5b16, 0xe15d5b16, 0xe15d5b16, 0xe15d5b16,\ 545 | 0x007f3e86, 0x007f3e86, 0x007f3e86, 0x007f3e86,\ 546 | 0x37088980, 0x37088980, 0x37088980, 0x37088980,\ 547 | 0xa507ea32, 0xa507ea32, 0xa507ea32, 0xa507ea32,\ 548 | 0x6fab9537, 0x6fab9537, 0x6fab9537, 0x6fab9537,\ 549 | 0x17406110, 0x17406110, 0x17406110, 0x17406110,\ 550 | 0x0d8cd6f1, 0x0d8cd6f1, 0x0d8cd6f1, 0x0d8cd6f1,\ 551 | 0xcdaa3b6d, 0xcdaa3b6d, 0xcdaa3b6d, 0xcdaa3b6d,\ 552 | 0xc0bbbe37, 0xc0bbbe37, 0xc0bbbe37, 0xc0bbbe37,\ 553 | 0x83613bda, 0x83613bda, 0x83613bda, 0x83613bda,\ 554 | 0xdb48a363, 0xdb48a363, 0xdb48a363, 0xdb48a363,\ 555 | 0x0b02e931, 0x0b02e931, 0x0b02e931, 0x0b02e931,\ 556 | 0x6fd15ca7, 0x6fd15ca7, 0x6fd15ca7, 0x6fd15ca7,\ 557 | 0x521afaca, 0x521afaca, 0x521afaca, 0x521afaca,\ 558 | 0x31338431, 0x31338431, 0x31338431, 0x31338431,\ 559 | 0x6ed41a95, 0x6ed41a95, 0x6ed41a95, 0x6ed41a95,\ 560 | 0x6d437890, 0x6d437890, 0x6d437890, 0x6d437890,\ 561 | 0xc39c91f2, 0xc39c91f2, 0xc39c91f2, 0xc39c91f2,\ 562 | 0x9eccabbd, 0x9eccabbd, 0x9eccabbd, 0x9eccabbd,\ 563 | 0xb5c9a0e6, 0xb5c9a0e6, 0xb5c9a0e6, 0xb5c9a0e6,\ 564 | 0x532fb63c, 0x532fb63c, 0x532fb63c, 0x532fb63c,\ 565 | 0xd2c741c6, 0xd2c741c6, 0xd2c741c6, 0xd2c741c6,\ 566 | 0x07237ea3, 0x07237ea3, 0x07237ea3, 0x07237ea3,\ 567 | 0xa4954b68, 0xa4954b68, 0xa4954b68, 0xa4954b68,\ 568 | 0x4c191d76, 0x4c191d76, 0x4c191d76, 0x4c191d76 569 | 570 | #endif 571 | --------------------------------------------------------------------------------