├── C ├── .gitignore ├── README.md ├── ChaCha20-SIMD │ ├── chacha20_dispatch.c │ ├── LICENSE │ ├── chacha20.h │ ├── cpu_features.h │ ├── cpu_features.c │ ├── chacha20.c │ ├── chacha20_sse2.c │ └── chacha20_avx2.c ├── CMakeLists.txt ├── xelis_hash_v2.c └── xelis_hash_v3.c ├── .gitignore ├── .cargo └── config.toml ├── .gitmodules ├── go ├── aes │ ├── aes_generic.go │ ├── aes_amd64.s │ ├── aes_amd64.go │ ├── aes_test.go │ └── aes.go ├── go.mod ├── hash │ └── hash.go ├── xelis_hash.go ├── go.sum ├── v1 │ ├── v1_test.go │ ├── v1.go │ └── keccak.go ├── v2 │ ├── v2_test.go │ └── v2.go └── v3 │ ├── v3_test.go │ └── v3.go ├── benches ├── v2.rs ├── v1.rs └── v3.rs ├── src ├── lib.rs ├── scratchpad.rs ├── tracker.rs ├── v1.rs ├── v3.rs └── v2.rs ├── LICENSE ├── Cargo.toml └── README.md /C/.gitignore: -------------------------------------------------------------------------------- 1 | /build -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | *.png 3 | .vscode/ -------------------------------------------------------------------------------- /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | rustflags = ["-C", "target-cpu=native"] -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "C/BLAKE3"] 2 | path = C/BLAKE3 3 | url = https://github.com/BLAKE3-team/BLAKE3.git 4 | -------------------------------------------------------------------------------- /C/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | git clone --recursive https://github.com/xelis-project/xelis-hash.git 3 | cd xelis-hash/C 4 | mkdir build && cd build 5 | cmake .. 
6 | make 7 | ``` -------------------------------------------------------------------------------- /go/aes/aes_generic.go: -------------------------------------------------------------------------------- 1 | //go:build !amd64 || purego 2 | 3 | package aes 4 | 5 | // CipherRound performs a single AES round using software implementation 6 | func CipherRound(block *[16]byte, key *[16]byte) { 7 | CipherRoundGeneric(block, key) 8 | } 9 | -------------------------------------------------------------------------------- /go/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/xelis-project/xelis-hash/go 2 | 3 | go 1.24.0 4 | 5 | toolchain go1.24.11 6 | 7 | require ( 8 | github.com/chocolatkey/chacha8 v0.0.0-20200308092524-06a0ce7f6716 9 | golang.org/x/sys v0.38.0 10 | lukechampine.com/blake3 v1.2.1 11 | lukechampine.com/uint128 v1.3.0 12 | ) 13 | 14 | require ( 15 | github.com/klauspost/cpuid/v2 v2.0.9 // indirect 16 | ) 17 | -------------------------------------------------------------------------------- /go/aes/aes_amd64.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // func aesCipherRoundAsm(block *[16]byte, key *[16]byte) 4 | TEXT ·aesCipherRoundAsm(SB), NOSPLIT, $0-16 5 | MOVQ block+0(FP), AX 6 | MOVQ key+8(FP), BX 7 | 8 | // Load block into XMM0 9 | MOVOU (AX), X0 10 | 11 | // Load key into XMM1 12 | MOVOU (BX), X1 13 | 14 | // Perform AES round: AESENC = SubBytes + ShiftRows + MixColumns + AddRoundKey 15 | AESENC X1, X0 16 | 17 | // Store result back to block 18 | MOVOU X0, (AX) 19 | RET 20 | -------------------------------------------------------------------------------- /go/aes/aes_amd64.go: -------------------------------------------------------------------------------- 1 | //go:build amd64 && !purego 2 | 3 | package aes 4 | 5 | import ( 6 | "golang.org/x/sys/cpu" 7 | ) 8 | 9 | // hasAESNI indicates whether AES-NI instructions are 
available 10 | var hasAESNI = cpu.X86.HasAES 11 | 12 | // CipherRound performs a single AES round using AES-NI instructions if available 13 | func CipherRound(block *[16]byte, key *[16]byte) { 14 | if hasAESNI { 15 | aesCipherRoundAsm(block, key) 16 | } else { 17 | CipherRoundGeneric(block, key) 18 | } 19 | } 20 | 21 | // aesCipherRoundAsm is implemented in assembly using AES-NI 22 | func aesCipherRoundAsm(block *[16]byte, key *[16]byte) 23 | -------------------------------------------------------------------------------- /benches/v2.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | use xelis_hash::v2::{xelis_hash, ScratchPad}; 3 | 4 | const FIXED_INPUT: &[u8] = b"Hello World from xelis hash v2!"; 5 | 6 | fn bench_zero_input(c: &mut Criterion) { 7 | let mut scratch_pad = ScratchPad::default(); 8 | let input = [0u8; 112]; 9 | c.bench_function("v2::zero_input", |b| b.iter(|| xelis_hash(&input, &mut scratch_pad))); 10 | } 11 | 12 | fn bench_fixed_input(c: &mut Criterion) { 13 | let mut scratch_pad = ScratchPad::default(); 14 | c.bench_function("v2::fixed_input", |b| b.iter(|| xelis_hash(FIXED_INPUT, &mut scratch_pad))); 15 | } 16 | 17 | criterion_group!(benches, bench_zero_input, bench_fixed_input); 18 | criterion_main!(benches); -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error as ThisError; 2 | 3 | #[cfg(feature = "v1")] 4 | pub mod v1; 5 | #[cfg(feature = "v2")] 6 | pub mod v2; 7 | #[cfg(feature = "v3")] 8 | pub mod v3; 9 | 10 | pub mod scratchpad; 11 | 12 | #[cfg(feature = "tracker")] 13 | pub mod tracker; 14 | 15 | // Number of bytes in a hash 16 | const HASH_SIZE: usize = 32; 17 | 18 | // Hash type alias 19 | pub type Hash = [u8; HASH_SIZE]; 20 | 21 | // Error that can occur while hashing 22 | #[derive(Debug, 
ThisError)] 23 | #[error("Error while hashing")] 24 | pub enum Error { 25 | #[error("Error while hashing")] 26 | Error, 27 | #[error("Error while casting: {0}")] 28 | CastError(bytemuck::PodCastError), 29 | #[error("Error on format")] 30 | FormatError, 31 | } 32 | 33 | -------------------------------------------------------------------------------- /C/ChaCha20-SIMD/chacha20_dispatch.c: -------------------------------------------------------------------------------- 1 | #include "cpu_features.h" 2 | #include "chacha20.h" 3 | 4 | static bool initialized = false; 5 | static cpu_features_t f; 6 | 7 | void chacha_init() 8 | { 9 | get_cpu_features(&f); 10 | 11 | initialized = true; 12 | } 13 | 14 | void chacha_encrypt(uint8_t *key, uint8_t *nonce, uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds) 15 | { 16 | 17 | if (!initialized) 18 | { 19 | chacha_init(); 20 | } 21 | 22 | if (f.HW_AVX2) 23 | { 24 | chacha_encrypt_avx2(key, nonce, in, out, bytes, rounds); 25 | return; 26 | } 27 | 28 | if (f.HW_SSE2) 29 | { 30 | chacha_encrypt_sse2(key, nonce, in, out, bytes, rounds); 31 | return; 32 | } 33 | 34 | chacha_encrypt_portable(key, nonce, in, out, bytes, rounds); 35 | } -------------------------------------------------------------------------------- /benches/v1.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | use xelis_hash::v1::{xelis_hash, ScratchPad}; 3 | 4 | const FIXED_INPUT: &[u8] = b"Hello World from xelis hash v1!"; 5 | 6 | fn bench_zero_input(c: &mut Criterion) { 7 | let mut scratch_pad = ScratchPad::default(); 8 | let input = [0u8; 200]; 9 | c.bench_function("v1::zero_input", |b| b.iter(|| xelis_hash(&mut input.clone(), &mut scratch_pad))); 10 | } 11 | 12 | fn bench_fixed_input(c: &mut Criterion) { 13 | let mut scratch_pad = ScratchPad::default(); 14 | let mut input = [0u8; 200]; 15 | input[0..FIXED_INPUT.len()].copy_from_slice(FIXED_INPUT); 16 | 17 | 
c.bench_function("v1::fixed_input", |b| b.iter(|| xelis_hash(&mut input.clone(), &mut scratch_pad))); 18 | } 19 | 20 | criterion_group!(benches, bench_zero_input, bench_fixed_input); 21 | criterion_main!(benches); -------------------------------------------------------------------------------- /go/hash/hash.go: -------------------------------------------------------------------------------- 1 | package hash 2 | 3 | import ( 4 | "encoding/hex" 5 | "errors" 6 | ) 7 | 8 | const HashSize = 32 9 | 10 | var ErrInvalidHashLength = errors.New("invalid hash length") 11 | 12 | type Hash [HashSize]byte 13 | 14 | func Zero() Hash { 15 | return Hash{} 16 | } 17 | 18 | func NewHash(input [HashSize]byte) Hash { 19 | return Hash(input) 20 | } 21 | 22 | func (h *Hash) Bytes() []byte { 23 | return h[:] 24 | } 25 | 26 | func (h *Hash) String() string { 27 | return hex.EncodeToString(h[:]) 28 | } 29 | 30 | func FromBytes(data []byte) (Hash, error) { 31 | var h Hash 32 | if len(data) != HashSize { 33 | return h, ErrInvalidHashLength 34 | } 35 | copy(h[:], data) 36 | return h, nil 37 | } 38 | 39 | func FromString(s string) (Hash, error) { 40 | var h Hash 41 | data, err := hex.DecodeString(s) 42 | if err != nil { 43 | return h, err 44 | } 45 | return FromBytes(data) 46 | } 47 | -------------------------------------------------------------------------------- /go/xelis_hash.go: -------------------------------------------------------------------------------- 1 | package xelis_hash 2 | 3 | import ( 4 | "errors" 5 | 6 | hash "github.com/xelis-project/xelis-hash/go/hash" 7 | v1 "github.com/xelis-project/xelis-hash/go/v1" 8 | v2 "github.com/xelis-project/xelis-hash/go/v2" 9 | v3 "github.com/xelis-project/xelis-hash/go/v3" 10 | ) 11 | 12 | func HashV1(input []byte) (hash.Hash, error) { 13 | var padded [v1.BytesArrayInput]byte 14 | if len(input) <= v1.BytesArrayInput { 15 | copy(padded[:], input) 16 | } else { 17 | return hash.Zero(), errors.New("input too long for v1 hash (max 120 bytes)") 18 | 
} 19 | 20 | scratchPad := v1.NewScratchPad() 21 | return v1.XelisHash(&padded, scratchPad) 22 | } 23 | 24 | func HashV2(input []byte) (hash.Hash, error) { 25 | scratchPad := v2.NewScratchPad() 26 | return v2.XelisHash(input, scratchPad) 27 | } 28 | 29 | func HashV3(input []byte) (hash.Hash, error) { 30 | scratchPad := v3.NewScratchPad() 31 | return v3.XelisHash(input, scratchPad) 32 | } 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 XELIS 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /C/ChaCha20-SIMD/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Yury Myakotin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "xelis-hash" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | # Common dependencies 10 | thiserror = "1.0.58" 11 | aes = { version = "0.8.3", features = ["hazmat"] } 12 | bytemuck = { version = "1.15.0", features = ["derive"] } 13 | 14 | # v1 features 15 | tiny-keccak = { version = "2.0", features = ["k12"], optional = true } 16 | 17 | # v2 features 18 | blake3 = { version = "1.5.1", optional = true } 19 | chacha20 = { version = "0.9.1", optional = true } 20 | plotters = { version = "0.3.7", optional = true } 21 | anyhow = { version = "1", optional = true } 22 | 23 | [dev-dependencies] 24 | rand = "0.8.5" 25 | criterion = "0.5.1" 26 | 27 | [features] 28 | default = ["v1", "v2", "v3"] 29 | v1 = ["dep:tiny-keccak"] 30 | v2 = ["dep:blake3", "dep:chacha20"] 31 | v3 = ["v2"] 32 | # Only available in v2/v3 33 | tracker = ["dep:plotters", "dep:anyhow"] 34 | 35 | [[bench]] 36 | name = "v1" 37 | harness = false 38 | required-features = ["v1"] 39 | 40 | [[bench]] 41 | name = "v2" 42 | harness = false 43 | required-features = ["v2"] 44 | 45 | [[bench]] 46 | name = "v3" 47 | harness = false 48 | required-features = ["v3"] 49 | -------------------------------------------------------------------------------- /C/ChaCha20-SIMD/chacha20.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef CHACHA20_H 3 | #define CHACHA20_H 4 | 5 | #include 6 | #include 7 | 8 | #define ChaCha20StateSizeBytes 48; 9 | #define ChaCha20KeySizeByte 32 10 | #define ChaCha20NonceSizeByte 12 11 | #define ChaCha20CounterSizeByte 4 12 | 13 | #ifdef __cplusplus 14 | extern "C" 15 | { 16 | #endif 17 | 18 | void ChaCha20SetKey(uint8_t *state, 
const uint8_t *Key); 19 | void ChaCha20SetNonce(uint8_t *state, const uint8_t *Nonce); 20 | // void ChaCha20SetCtr(uint8_t *state, const uint8_t *Ctr); 21 | // void ChaCha20EncryptBytes(uint8_t *state, uint8_t *In, uint8_t *Out, size_t Size, uint32_t rounds); // if In=nullptr - just fill Out 22 | void ChaCha20IncrementNonce(uint8_t *state); 23 | void ChaCha20AddCounter(uint8_t *ChaCha, const uint32_t value_to_add); 24 | 25 | void chacha_encrypt_portable(uint8_t *key, uint8_t *nonce, uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds); 26 | void chacha_encrypt_sse2(uint8_t *key, uint8_t *nonce, uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds); 27 | void chacha_encrypt_avx2(uint8_t *key, uint8_t *nonce, uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds); 28 | void chacha_encrypt(uint8_t *key, uint8_t *nonce, uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds); 29 | 30 | #ifdef __cplusplus 31 | } 32 | #endif 33 | 34 | #endif // CHACHA20_H -------------------------------------------------------------------------------- /src/scratchpad.rs: -------------------------------------------------------------------------------- 1 | use crate::Error; 2 | 3 | 4 | // Scratchpad used to store intermediate values 5 | // It has a fixed size of `MEMORY_SIZE` u64s 6 | // It can be easily reused for multiple hashing operations safely 7 | #[derive(Debug, Clone)] 8 | pub struct ScratchPad(Box<[u64; M]>); 9 | 10 | impl ScratchPad { 11 | // Retrieve the scratchpad size 12 | #[inline(always)] 13 | pub fn len(&self) -> usize { 14 | self.0.len() 15 | } 16 | 17 | // Get the inner scratch pad as a mutable u64 slice 18 | #[inline(always)] 19 | pub fn as_mut_slice(&mut self) -> &mut [u64; M] { 20 | &mut self.0 21 | } 22 | 23 | // Retrieve the scratch pad as a mutable bytes slice 24 | #[inline(always)] 25 | pub fn as_mut_bytes(&mut self) -> Result<&mut [u8; M_BYTES], Error> { 26 | bytemuck::try_cast_slice_mut(self.as_mut_slice()) 27 | .map_err(|e| Error::CastError(e))? 
28 | .try_into() 29 | .map_err(|_| Error::FormatError) 30 | } 31 | } 32 | 33 | impl Default for ScratchPad { 34 | fn default() -> Self { 35 | Self( 36 | vec![0; M] 37 | .into_boxed_slice() 38 | .try_into() 39 | .expect("Failed generating scratchpad") 40 | ) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /go/aes/aes_test.go: -------------------------------------------------------------------------------- 1 | package aes 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestAESCipherRound(t *testing.T) { 8 | // Test vector for a single AES round 9 | block := [16]byte{0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff} 10 | key := [16]byte{0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f} 11 | 12 | // Test with hardware intrinsics (if available) 13 | blockHW := block 14 | CipherRound(&blockHW, &key) 15 | 16 | // Test with generic implementation 17 | blockSW := block 18 | CipherRoundGeneric(&blockSW, &key) 19 | 20 | // Both should produce the same result 21 | if blockHW != blockSW { 22 | t.Errorf("Hardware and software implementations produce different results\nHW: %x\nSW: %x", blockHW, blockSW) 23 | } 24 | } 25 | 26 | func BenchmarkAESCipherRound(b *testing.B) { 27 | block := [16]byte{0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff} 28 | key := [16]byte{0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f} 29 | 30 | b.Run("Hardware", func(b *testing.B) { 31 | for i := 0; i < b.N; i++ { 32 | CipherRound(&block, &key) 33 | } 34 | }) 35 | 36 | b.Run("Software", func(b *testing.B) { 37 | for i := 0; i < b.N; i++ { 38 | CipherRoundGeneric(&block, &key) 39 | } 40 | }) 41 | } 42 | -------------------------------------------------------------------------------- /C/ChaCha20-SIMD/cpu_features.h: 
-------------------------------------------------------------------------------- 1 | #ifndef CPU_FEATURES_H 2 | #define CPU_FEATURES_H 3 | 4 | #include 5 | 6 | typedef struct cpu_features_t 7 | { 8 | // Misc. 9 | bool HW_MMX; 10 | bool HW_x64; 11 | bool HW_ABM; // Advanced Bit Manipulation 12 | bool HW_RDRAND; 13 | bool HW_BMI1; 14 | bool HW_BMI2; 15 | bool HW_ADX; 16 | bool HW_PREFETCHWT1; 17 | 18 | // SIMD: 128-bit 19 | bool HW_SSE; 20 | bool HW_SSE2; 21 | bool HW_SSE3; 22 | bool HW_SSSE3; 23 | bool HW_SSE41; 24 | bool HW_SSE42; 25 | bool HW_SSE4a; 26 | bool HW_AES; 27 | bool HW_SHA; 28 | 29 | // SIMD: 256-bit 30 | bool HW_AVX; 31 | bool HW_XOP; 32 | bool HW_FMA3; 33 | bool HW_FMA4; 34 | bool HW_AVX2; 35 | 36 | // SIMD: 512-bit 37 | bool HW_AVX512F; // AVX512 Foundation 38 | bool HW_AVX512CD; // AVX512 Conflict Detection 39 | bool HW_AVX512PF; // AVX512 Prefetch 40 | bool HW_AVX512ER; // AVX512 Exponential + Reciprocal 41 | bool HW_AVX512VL; // AVX512 Vector Length Extensions 42 | bool HW_AVX512BW; // AVX512 Byte + Word 43 | bool HW_AVX512DQ; // AVX512 Doubleword + Quadword 44 | bool HW_AVX512IFMA; // AVX512 Integer 52-bit Fused Multiply-Add 45 | bool HW_AVX512VBMI; // AVX512 Vector Byte Manipulation Instructions 46 | } cpu_features_t; 47 | 48 | void get_cpu_features(cpu_features_t *f); 49 | 50 | #endif // CPU_FEATURES_H -------------------------------------------------------------------------------- /C/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | project(xelishash_v3 C) 4 | 5 | set(CMAKE_C_STANDARD 11) 6 | set(CMAKE_C_EXTENSIONS ON) 7 | 8 | if(NOT CMAKE_BUILD_TYPE) 9 | set(CMAKE_BUILD_TYPE Release) 10 | endif() 11 | 12 | add_subdirectory(BLAKE3/c) 13 | 14 | set(SOURCES 15 | ChaCha20-SIMD/chacha20.c 16 | ChaCha20-SIMD/chacha20_sse2.c 17 | ChaCha20-SIMD/chacha20_avx2.c 18 | ChaCha20-SIMD/chacha20_dispatch.c 19 | ChaCha20-SIMD/cpu_features.c 20 | xelis_hash_v3.c 21 
| ) 22 | 23 | # Per-file ISA flags 24 | if(MSVC) 25 | set_source_files_properties(ChaCha20-SIMD/chacha20_avx2.c PROPERTIES COMPILE_FLAGS "/arch:AVX2") 26 | else() 27 | set_source_files_properties(ChaCha20-SIMD/chacha20_sse2.c PROPERTIES COMPILE_FLAGS "-msse2") 28 | set_source_files_properties(ChaCha20-SIMD/chacha20_avx2.c PROPERTIES COMPILE_FLAGS "-mavx2") 29 | endif() 30 | 31 | add_executable(${PROJECT_NAME} ${SOURCES}) 32 | 33 | # Target flags 34 | if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang") 35 | target_compile_options(${PROJECT_NAME} PRIVATE -O3) 36 | if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|i[3-6]86") 37 | target_compile_options(${PROJECT_NAME} PRIVATE -maes -mpclmul) 38 | endif() 39 | elseif(MSVC) 40 | target_compile_options(${PROJECT_NAME} PRIVATE /O2) 41 | endif() 42 | 43 | # Link 44 | if(UNIX AND NOT APPLE) 45 | target_link_libraries(${PROJECT_NAME} PRIVATE blake3 pthread m) 46 | else() 47 | target_link_libraries(${PROJECT_NAME} PRIVATE blake3) 48 | endif() -------------------------------------------------------------------------------- /benches/v3.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion, BatchSize}; 2 | use rand::{Rng, SeedableRng, rngs::StdRng}; 3 | use xelis_hash::v3::*; 4 | 5 | const FIXED_INPUT: &[u8] = b"Hello World from xelis hash v3!"; 6 | 7 | fn bench_zero_input(c: &mut Criterion) { 8 | let mut scratch_pad = ScratchPad::default(); 9 | let input = [0u8; 112]; 10 | c.bench_function("v3::zero_input", |b| b.iter(|| xelis_hash(&input, &mut scratch_pad))); 11 | } 12 | 13 | fn bench_fixed_input(c: &mut Criterion) { 14 | let mut scratch_pad = ScratchPad::default(); 15 | c.bench_function("v3::fixed_input", |b| b.iter(|| xelis_hash(FIXED_INPUT, &mut scratch_pad))); 16 | } 17 | 18 | fn bench_pick_half(c: &mut Criterion) { 19 | let mut rng = StdRng::seed_from_u64(0xDEADBEEFCAFEBABE); 20 | 21 | c.bench_function("v3::pick_half", |b| { 22 | 
b.iter_batched( 23 | || rng.gen::(), 24 | |seed| pick_half(seed), 25 | BatchSize::SmallInput 26 | ) 27 | }); 28 | } 29 | 30 | fn bench_map_index(c: &mut Criterion) { 31 | let mut rng = StdRng::seed_from_u64(0xDEADBEEFCAFEBABE); 32 | 33 | c.bench_function("v3::map_index", |b| { 34 | b.iter_batched( 35 | || rng.gen::(), 36 | |seed| map_index(seed), 37 | BatchSize::SmallInput 38 | ) 39 | }); 40 | } 41 | 42 | criterion_group!(benches, bench_zero_input, bench_fixed_input, bench_pick_half, bench_map_index); 43 | criterion_main!(benches); -------------------------------------------------------------------------------- /go/go.sum: -------------------------------------------------------------------------------- 1 | github.com/chocolatkey/chacha8 v0.0.0-20200308092524-06a0ce7f6716 h1:NSjnwJb5rlX8weAJPotMIFtWSFt4Tjtkjt7nTBil1dA= 2 | github.com/chocolatkey/chacha8 v0.0.0-20200308092524-06a0ce7f6716/go.mod h1:NvCEVATmyDtfApL4hee9mqF2c7+AFTpltRm62q68ppU= 3 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 4 | github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4= 5 | github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= 6 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 7 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 8 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 9 | golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34= 10 | golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc= 11 | golang.org/x/sys v0.0.0-20190902133755-9109b7679e13/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 12 | golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= 13 | golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= 14 | gopkg.in/check.v1 
v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 15 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 16 | lukechampine.com/blake3 v1.2.1 h1:YuqqRuaqsGV71BV/nm9xlI0MKUv4QC54jQnBChWbGnI= 17 | lukechampine.com/blake3 v1.2.1/go.mod h1:0OFRp7fBtAylGVCO40o87sbupkyIGgbpv1+M1k1LM6k= 18 | lukechampine.com/uint128 v1.3.0 h1:cDdUVfRwDUDovz610ABgFD17nXD4/uDgVHl2sC3+sbo= 19 | lukechampine.com/uint128 v1.3.0/go.mod h1:c4eWIwlEGaxC/+H1VguhU4PHXNWDCDMUlWdIWl2j1gk= 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # XELIS Hash 2 | 3 | XELIS Hash is expected to run on CPU and GPUs with a controlled gap. 4 | It is relying on two famous algorithms: ChaCha8 and Blake3. 5 | 6 | ## V3 7 | 8 | Same base as the V2 but with a scratchpad of 544 KB. 9 | 10 | Stage 1 and (Final) stage 4 are the same as V2. 11 | 12 | Stage 3 has been modified to increase memory accesses while having a random memory access pattern. 13 | 14 | ## V2 15 | 16 | New version use a scratchpad of ~440 KB which can be reused at each hash. 17 | 18 | Stage 1 will randomize the scratchpad based on the input used as a key for the ChaCha8 stream cipher. 19 | The key is a Blake3 hash of (previous hash + input chunk). 20 | 21 | First nonce is based on the first 12 bytes of the input's blake3 hash result. 22 | The input is splitted into several 32 bytes chunks padded with zeroes if size is smaller. 23 | It cannot be parallelized due to the nonce based on the previous iteration. 24 | 25 | Stage 2 has been removed because the whole work is now done in stage 3. 26 | 27 | Stage 3 is expected to do a lot of random access in memory while being forced to stay sequential. 28 | There is 4 reads and 2 writes per iteration, making it memory bound. 
29 | A branching part is included in the inner loop to be power-hungry and reduce efficiency of FPGA and GPUs. 30 | 31 | (Final) stage 4 is using Blake3 algorithm to hash the whole scratchpad to give a final good-quality hash. 32 | It is also used to prevent skipping a part of the scratchpad, to force it to be fully computed. 33 | 34 | Blake3 and ChaCha8 are used as they are really fast and can be highly parallelized, one thread can have high hashrate to reduce verification time. 35 | 36 | Expected time per hash is around 1.20-1.50ms. 37 | 38 | ## Features 39 | 40 | - `v1`: deprecated algorithm 41 | - `v2`: new algorithm with 440 KB scratchpad 42 | - `v3`: new algorithm version based on the v2 with bigger scratchpad and others changes 43 | - `tracker`: track branches selection, memory accesses and generate charts for it. -------------------------------------------------------------------------------- /go/v1/v1_test.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | import ( 4 | "slices" 5 | "testing" 6 | ) 7 | 8 | func TestZeroInput(t *testing.T) { 9 | var input [BytesArrayInput]byte 10 | scratchPad := NewScratchPad() 11 | 12 | hash, err := XelisHash(&input, scratchPad) 13 | if err != nil { 14 | t.Fatalf("Hash failed: %v", err) 15 | } 16 | 17 | expected := []byte{ 18 | 0x0e, 0xbb, 0xbd, 0x8a, 0x31, 0xed, 0xad, 0xfe, 0x09, 0x8f, 0x2d, 0x77, 0x0d, 0x84, 19 | 0xb7, 0x19, 0x58, 0x86, 0x75, 0xab, 0x88, 0xa0, 0xa1, 0x70, 0x67, 0xd0, 0x0a, 0x8f, 20 | 0x36, 0x18, 0x22, 0x65} 21 | 22 | if !slices.Equal(expected, hash[:]) { 23 | t.Errorf("Hash mismatch:\nGot: %x\nExpected: %x", hash, expected) 24 | } 25 | t.Logf("Hash: %x", hash) 26 | } 27 | 28 | func TestXelisInput(t *testing.T) { 29 | var input [BytesArrayInput]byte 30 | custom := []byte("xelis-hashing-algorithm") 31 | copy(input[:], custom) 32 | 33 | scratchPad := NewScratchPad() 34 | hash, err := XelisHash(&input, scratchPad) 35 | if err != nil { 36 | 
t.Fatalf("Hash failed: %v", err) 37 | } 38 | 39 | expected := []byte{ 40 | 106, 106, 173, 8, 207, 59, 118, 108, 176, 196, 9, 124, 250, 195, 3, 41 | 61, 30, 146, 238, 182, 88, 83, 115, 81, 139, 56, 3, 28, 176, 86, 68, 21, 42 | } 43 | 44 | if !slices.Equal(expected, hash[:]) { 45 | t.Errorf("Hash mismatch:\nGot: %x\nExpected: %x", hash, expected) 46 | } 47 | 48 | t.Logf("Hash: %x", hash) 49 | } 50 | 51 | func TestScratchPadReuse(t *testing.T) { 52 | var input [BytesArrayInput]byte 53 | scratchPad := NewScratchPad() 54 | 55 | hash1, err := XelisHash(&input, scratchPad) 56 | if err != nil { 57 | t.Fatalf("First hash failed: %v", err) 58 | } 59 | 60 | hash2, err := XelisHash(&input, scratchPad) 61 | if err != nil { 62 | t.Fatalf("Second hash failed: %v", err) 63 | } 64 | 65 | if hash1 != hash2 { 66 | t.Errorf("Hash mismatch:\nGot: %x\nExpected: %x", hash1, hash2) 67 | } 68 | } 69 | 70 | func BenchmarkXelisHashV1(b *testing.B) { 71 | var input [BytesArrayInput]byte 72 | copy(input[:], []byte("benchmark data")) 73 | scratchPad := NewScratchPad() 74 | 75 | b.ResetTimer() 76 | for i := 0; i < b.N; i++ { 77 | _, _ = XelisHash(&input, scratchPad) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /go/v2/v2_test.go: -------------------------------------------------------------------------------- 1 | package v2 2 | 3 | import ( 4 | "slices" 5 | "testing" 6 | ) 7 | 8 | func TestZeroHash(t *testing.T) { 9 | input := make([]byte, 112) 10 | scratchPad := NewScratchPad() 11 | 12 | hash, err := XelisHash(input, scratchPad) 13 | if err != nil { 14 | t.Fatalf("Hash failed: %v", err) 15 | } 16 | 17 | expected := [32]byte{ 18 | 126, 219, 112, 240, 116, 133, 115, 144, 39, 40, 164, 19 | 105, 30, 158, 45, 126, 64, 67, 238, 52, 200, 35, 20 | 161, 19, 144, 211, 214, 225, 95, 190, 146, 27, 21 | } 22 | 23 | if !slices.Equal(expected[:], hash[:]) { 24 | t.Errorf("Hash mismatch:\nGot: %x\nExpected: %x", hash, expected) 25 | } 26 | } 27 | 28 | func 
TestReusedScratchpad(t *testing.T) { 29 | input := make([]byte, 112) 30 | for i := range input { 31 | input[i] = byte(i % 256) 32 | } 33 | 34 | scratchPad := NewScratchPad() 35 | 36 | hash1, err := XelisHash(input, scratchPad) 37 | if err != nil { 38 | t.Fatalf("First hash failed: %v", err) 39 | } 40 | 41 | hash2, err := XelisHash(input, scratchPad) 42 | if err != nil { 43 | t.Fatalf("Second hash failed: %v", err) 44 | } 45 | 46 | for i := range hash1 { 47 | if hash1[i] != hash2[i] { 48 | t.Errorf("Hash mismatch when reusing scratchpad") 49 | break 50 | } 51 | } 52 | } 53 | 54 | func TestVerifyOutput(t *testing.T) { 55 | input := []byte{ 56 | 172, 236, 108, 212, 181, 31, 109, 45, 44, 242, 54, 225, 143, 133, 57 | 89, 44, 179, 108, 39, 191, 32, 116, 229, 33, 63, 130, 33, 120, 185, 89, 58 | 146, 141, 10, 79, 183, 107, 238, 122, 92, 222, 25, 134, 90, 107, 116, 59 | 110, 236, 53, 255, 5, 214, 126, 24, 216, 97, 199, 148, 239, 253, 102, 60 | 199, 184, 232, 253, 158, 145, 86, 187, 112, 81, 78, 70, 80, 110, 33, 61 | 37, 159, 233, 198, 1, 178, 108, 210, 100, 109, 155, 106, 124, 124, 83, 62 | 89, 50, 197, 115, 231, 32, 74, 2, 92, 47, 25, 220, 135, 249, 122, 63 | 172, 220, 137, 143, 234, 68, 188, 64 | } 65 | 66 | scratchPad := NewScratchPad() 67 | hash, err := XelisHash(input, scratchPad) 68 | if err != nil { 69 | t.Fatalf("Hash failed: %v", err) 70 | } 71 | 72 | expected := []byte{ 73 | 199, 114, 154, 28, 4, 164, 196, 178, 117, 17, 148, 74 | 203, 125, 228, 51, 145, 162, 222, 106, 202, 205, 75 | 55, 244, 178, 94, 29, 248, 242, 98, 221, 158, 179, 76 | } 77 | 78 | if !slices.Equal(expected, hash[:]) { 79 | t.Errorf("Hash mismatch:\nGot: %x\nExpected: %x", hash, expected) 80 | } 81 | } 82 | 83 | func BenchmarkXelisHashV2(b *testing.B) { 84 | input := make([]byte, 112) 85 | scratchPad := NewScratchPad() 86 | 87 | b.ResetTimer() 88 | for i := 0; i < b.N; i++ { 89 | _, _ = XelisHash(input, scratchPad) 90 | } 91 | } 92 | 
-------------------------------------------------------------------------------- /C/ChaCha20-SIMD/cpu_features.c: -------------------------------------------------------------------------------- 1 | // CPU features set 2 | #include "cpu_features.h" 3 | 4 | static void cpuid(int cpuinfo[4], int info_type) 5 | { 6 | __asm__ __volatile__( 7 | "cpuid" : "=a"(cpuinfo[0]), 8 | "=b"(cpuinfo[1]), 9 | "=c"(cpuinfo[2]), 10 | "=d"(cpuinfo[3]) : "a"(info_type), "c"(0)); 11 | } 12 | 13 | void get_cpu_features(cpu_features_t *f) 14 | { 15 | int info[4]; 16 | cpuid(info, 0); 17 | int nIds = info[0]; 18 | 19 | cpuid(info, 0x80000000); 20 | unsigned nExIds = info[0]; 21 | 22 | // Detect Features 23 | if (nIds >= 0x00000001) 24 | { 25 | cpuid(info, 0x00000001); 26 | f->HW_MMX = (info[3] & ((int)1 << 23)) != 0; 27 | f->HW_SSE = (info[3] & ((int)1 << 25)) != 0; 28 | f->HW_SSE2 = (info[3] & ((int)1 << 26)) != 0; 29 | f->HW_SSE3 = (info[2] & ((int)1 << 0)) != 0; 30 | 31 | f->HW_SSSE3 = (info[2] & ((int)1 << 9)) != 0; 32 | f->HW_SSE41 = (info[2] & ((int)1 << 19)) != 0; 33 | f->HW_SSE42 = (info[2] & ((int)1 << 20)) != 0; 34 | f->HW_AES = (info[2] & ((int)1 << 25)) != 0; 35 | 36 | f->HW_AVX = (info[2] & ((int)1 << 28)) != 0; 37 | f->HW_FMA3 = (info[2] & ((int)1 << 12)) != 0; 38 | 39 | f->HW_RDRAND = (info[2] & ((int)1 << 30)) != 0; 40 | } 41 | if (nIds >= 0x00000007) 42 | { 43 | cpuid(info, 0x00000007); 44 | f->HW_AVX2 = (info[1] & ((int)1 << 5)) != 0; 45 | 46 | f->HW_BMI1 = (info[1] & ((int)1 << 3)) != 0; 47 | f->HW_BMI2 = (info[1] & ((int)1 << 8)) != 0; 48 | f->HW_ADX = (info[1] & ((int)1 << 19)) != 0; 49 | f->HW_SHA = (info[1] & ((int)1 << 29)) != 0; 50 | f->HW_PREFETCHWT1 = (info[2] & ((int)1 << 0)) != 0; 51 | 52 | f->HW_AVX512F = (info[1] & ((int)1 << 16)) != 0; 53 | f->HW_AVX512CD = (info[1] & ((int)1 << 28)) != 0; 54 | f->HW_AVX512PF = (info[1] & ((int)1 << 26)) != 0; 55 | f->HW_AVX512ER = (info[1] & ((int)1 << 27)) != 0; 56 | f->HW_AVX512VL = (info[1] & ((int)1 << 31)) != 0; 57 | 
f->HW_AVX512BW = (info[1] & ((int)1 << 30)) != 0; 58 | f->HW_AVX512DQ = (info[1] & ((int)1 << 17)) != 0; 59 | f->HW_AVX512IFMA = (info[1] & ((int)1 << 21)) != 0; 60 | f->HW_AVX512VBMI = (info[2] & ((int)1 << 1)) != 0; 61 | } 62 | if (nExIds >= 0x80000001) 63 | { 64 | cpuid(info, 0x80000001); 65 | f->HW_x64 = (info[3] & ((int)1 << 29)) != 0; 66 | f->HW_ABM = (info[2] & ((int)1 << 5)) != 0; 67 | f->HW_SSE4a = (info[2] & ((int)1 << 6)) != 0; 68 | f->HW_FMA4 = (info[2] & ((int)1 << 16)) != 0; 69 | f->HW_XOP = (info[2] & ((int)1 << 11)) != 0; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /go/aes/aes.go: -------------------------------------------------------------------------------- 1 | package aes 2 | 3 | // AES S-box 4 | var sbox = [256]byte{ 5 | 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 6 | 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 7 | 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 8 | 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, 9 | 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 10 | 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 11 | 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 12 | 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 13 | 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 14 | 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 15 | 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 16 | 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 
// AES forward S-box (FIPS-197).
var sbox = [256]byte{
	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
}

// gmul2 doubles b in GF(2^8) with the AES reduction polynomial 0x11b.
// Branchless form: the conditional XOR of 0x1b is selected by the high
// bit that the shift pushes out.
func gmul2(b byte) byte {
	return (b << 1) ^ (0x1b * (b >> 7))
}

// gmul3 triples b in GF(2^8): 3*b = 2*b XOR b.
func gmul3(b byte) byte {
	return gmul2(b) ^ b
}

// subBytes substitutes every state byte through the S-box.
func subBytes(state *[16]byte) {
	for i, b := range state {
		state[i] = sbox[b]
	}
}

// shiftRows rotates row r of the column-major state left by r positions.
func shiftRows(state *[16]byte) {
	var out [16]byte
	for c := 0; c < 4; c++ {
		for r := 0; r < 4; r++ {
			// The byte at (row r, column c) comes from column (c+r) mod 4.
			out[4*c+r] = state[4*((c+r)%4)+r]
		}
	}
	*state = out
}

// mixColumns multiplies each state column by the fixed AES MDS matrix
// [2 3 1 1; 1 2 3 1; 1 1 2 3; 3 1 1 2] over GF(2^8).
func mixColumns(state *[16]byte) {
	for col := 0; col < 16; col += 4 {
		a0, a1, a2, a3 := state[col], state[col+1], state[col+2], state[col+3]
		state[col+0] = gmul2(a0) ^ gmul3(a1) ^ a2 ^ a3
		state[col+1] = a0 ^ gmul2(a1) ^ gmul3(a2) ^ a3
		state[col+2] = a0 ^ a1 ^ gmul2(a2) ^ gmul3(a3)
		state[col+3] = gmul3(a0) ^ a1 ^ a2 ^ gmul2(a3)
	}
}

// addRoundKey XORs the 16-byte round key into the state.
func addRoundKey(state *[16]byte, key *[16]byte) {
	for i := range state {
		state[i] ^= key[i]
	}
}

// CipherRoundGeneric performs a single AES round (SubBytes, ShiftRows, MixColumns, AddRoundKey)
// This matches aes::hazmat::cipher_round in Rust
func CipherRoundGeneric(block *[16]byte, key *[16]byte) {
	subBytes(block)
	shiftRows(block)
	mixColumns(block)
	addRoundKey(block, key)
}
134, 90, 107, 116, 60 | 110, 236, 53, 255, 5, 214, 126, 24, 216, 97, 199, 148, 239, 253, 102, 61 | 199, 184, 232, 253, 158, 145, 86, 187, 112, 81, 78, 70, 80, 110, 33, 62 | 37, 159, 233, 198, 1, 178, 108, 210, 100, 109, 155, 106, 124, 124, 83, 63 | 89, 50, 197, 115, 231, 32, 74, 2, 92, 47, 25, 220, 135, 249, 122, 64 | 172, 220, 137, 143, 234, 68, 188, 65 | } 66 | 67 | scratchPad := NewScratchPad() 68 | hash, err := XelisHash(input, scratchPad) 69 | if err != nil { 70 | t.Fatalf("Hash failed: %v", err) 71 | } 72 | 73 | expected := []byte{ 74 | 242, 8, 176, 222, 203, 27, 104, 75 | 187, 22, 40, 68, 73, 79, 79, 65, 76 | 83, 138, 101, 10, 116, 194, 41, 153, 77 | 21, 92, 163, 12, 206, 231, 156, 70, 83, 78 | } 79 | 80 | if !slices.Equal(expected, hash[:]) { 81 | t.Errorf("Hash mismatch:\nGot: %x\nExpected: %x", hash, expected) 82 | } 83 | } 84 | 85 | func TestMapIndex(t *testing.T) { 86 | // Test that mapIndex always returns valid indices 87 | for i := 0; i < 10000; i++ { 88 | idx := mapIndex(uint64(i)) 89 | if idx < 0 || idx >= BufferSize { 90 | t.Errorf("Invalid index %d from mapIndex(%d)", idx, i) 91 | } 92 | } 93 | 94 | // Edge cases 95 | if mapIndex(0) < 0 || mapIndex(0) >= BufferSize { 96 | t.Error("mapIndex(0) out of bounds") 97 | } 98 | if mapIndex(^uint64(0)) < 0 || mapIndex(^uint64(0)) >= BufferSize { 99 | t.Error("mapIndex(MAX) out of bounds") 100 | } 101 | } 102 | 103 | func TestPickHalf(t *testing.T) { 104 | // Test that pickHalf produces roughly 50/50 distribution 105 | ones := 0 106 | zeros := 0 107 | iterations := 100000 108 | 109 | for i := 0; i < iterations; i++ { 110 | if pickHalf(uint64(i)) { 111 | ones++ 112 | } else { 113 | zeros++ 114 | } 115 | } 116 | 117 | ratio := float64(ones) / float64(ones+zeros) 118 | t.Logf("pickHalf ratio: %f (ones: %d, zeros: %d)", ratio, ones, zeros) 119 | 120 | // Allow 5% deviation from 0.5 121 | if ratio < 0.45 || ratio > 0.55 { 122 | t.Errorf("pickHalf distribution is skewed: %f", ratio) 123 | } 124 | } 125 | 126 | 
/* 48-byte ChaCha20 state layout used throughout this file:
 *   bytes  0..31  key      (words 0..7)
 *   bytes 32..35  counter  (word  8)
 *   bytes 36..47  nonce    (words 9..11)
 */
static const int32_t KeyDataSize = 48;
static const int32_t rounds = 20;

/* "expand 32-byte k" */
static const uint32_t ConstState[4] = {1634760805, 857760878, 2036477234, 1797285236};

/* Copy the 32-byte key into words 0..7 of the state. */
void ChaCha20SetKey(uint8_t *state, const uint8_t *Key)
{
    memcpy(state, Key, 32);
}

/* Copy the 12-byte nonce into words 9..11 of the state. */
void ChaCha20SetNonce(uint8_t *state, const uint8_t *Nonce)
{
    memcpy(state + 36, Nonce, 12);
}

/* Copy the 4-byte block counter into word 8 of the state. */
void ChaCha20SetCtr(uint8_t *state, const uint8_t *Ctr)
{
    memcpy(state + 32, Ctr, 4);
}

/* Advance the 96-bit nonce by one, carrying across words 9..11, and
 * reset the block counter to zero. */
void ChaCha20IncrementNonce(uint8_t *state)
{
    uint32_t *words = (uint32_t *)state;
    words[8] = 0; /* reset counter */
    for (int i = 9; i <= 11; ++i)
    {
        if (++words[i] != 0)
            break; /* no carry into the next word */
    }
}

/* Add value_to_add to the 32-bit block counter (word 8), wrapping. */
void ChaCha20AddCounter(uint8_t *ChaCha, const uint32_t value_to_add)
{
    ((uint32_t *)ChaCha)[8] += value_to_add;
}
49 | uint32_t *state_dwords = (uint32_t *)state; 50 | uint32_t b[16]; 51 | while (1) 52 | { 53 | b[0] = ConstState[0]; 54 | b[1] = ConstState[1]; 55 | b[2] = ConstState[2]; 56 | b[3] = ConstState[3]; 57 | memcpy(((uint8_t *)b) + 16, state, 48); 58 | 59 | for (int i = rounds; i > 0; i -= 2) 60 | { 61 | b[0] = b[0] + b[4]; 62 | b[12] = (b[12] ^ b[0]) << 16 | (b[12] ^ b[0]) >> 16; 63 | b[8] = b[8] + b[12]; 64 | b[4] = (b[4] ^ b[8]) << 12 | (b[4] ^ b[8]) >> 20; 65 | b[0] = b[0] + b[4]; 66 | b[12] = (b[12] ^ b[0]) << 8 | (b[12] ^ b[0]) >> 24; 67 | b[8] = b[8] + b[12]; 68 | b[4] = (b[4] ^ b[8]) << 7 | (b[4] ^ b[8]) >> 25; 69 | b[1] = b[1] + b[5]; 70 | b[13] = (b[13] ^ b[1]) << 16 | (b[13] ^ b[1]) >> 16; 71 | b[9] = b[9] + b[13]; 72 | b[5] = (b[5] ^ b[9]) << 12 | (b[5] ^ b[9]) >> 20; 73 | b[1] = b[1] + b[5]; 74 | b[13] = (b[13] ^ b[1]) << 8 | (b[13] ^ b[1]) >> 24; 75 | b[9] = b[9] + b[13]; 76 | b[5] = (b[5] ^ b[9]) << 7 | (b[5] ^ b[9]) >> 25; 77 | b[2] = b[2] + b[6]; 78 | b[14] = (b[14] ^ b[2]) << 16 | (b[14] ^ b[2]) >> 16; 79 | b[10] = b[10] + b[14]; 80 | b[6] = (b[6] ^ b[10]) << 12 | (b[6] ^ b[10]) >> 20; 81 | b[2] = b[2] + b[6]; 82 | b[14] = (b[14] ^ b[2]) << 8 | (b[14] ^ b[2]) >> 24; 83 | b[10] = b[10] + b[14]; 84 | b[6] = (b[6] ^ b[10]) << 7 | (b[6] ^ b[10]) >> 25; 85 | b[3] = b[3] + b[7]; 86 | b[15] = (b[15] ^ b[3]) << 16 | (b[15] ^ b[3]) >> 16; 87 | b[11] = b[11] + b[15]; 88 | b[7] = (b[7] ^ b[11]) << 12 | (b[7] ^ b[11]) >> 20; 89 | b[3] = b[3] + b[7]; 90 | b[15] = (b[15] ^ b[3]) << 8 | (b[15] ^ b[3]) >> 24; 91 | b[11] = b[11] + b[15]; 92 | b[7] = (b[7] ^ b[11]) << 7 | (b[7] ^ b[11]) >> 25; 93 | b[0] = b[0] + b[5]; 94 | b[15] = (b[15] ^ b[0]) << 16 | (b[15] ^ b[0]) >> 16; 95 | b[10] = b[10] + b[15]; 96 | b[5] = (b[5] ^ b[10]) << 12 | (b[5] ^ b[10]) >> 20; 97 | b[0] = b[0] + b[5]; 98 | b[15] = (b[15] ^ b[0]) << 8 | (b[15] ^ b[0]) >> 24; 99 | b[10] = b[10] + b[15]; 100 | b[5] = (b[5] ^ b[10]) << 7 | (b[5] ^ b[10]) >> 25; 101 | b[1] = b[1] + b[6]; 102 | b[12] = (b[12] 
^ b[1]) << 16 | (b[12] ^ b[1]) >> 16; 103 | b[11] = b[11] + b[12]; 104 | b[6] = (b[6] ^ b[11]) << 12 | (b[6] ^ b[11]) >> 20; 105 | b[1] = b[1] + b[6]; 106 | b[12] = (b[12] ^ b[1]) << 8 | (b[12] ^ b[1]) >> 24; 107 | b[11] = b[11] + b[12]; 108 | b[6] = (b[6] ^ b[11]) << 7 | (b[6] ^ b[11]) >> 25; 109 | b[2] = b[2] + b[7]; 110 | b[13] = (b[13] ^ b[2]) << 16 | (b[13] ^ b[2]) >> 16; 111 | b[8] = b[8] + b[13]; 112 | b[7] = (b[7] ^ b[8]) << 12 | (b[7] ^ b[8]) >> 20; 113 | b[2] = b[2] + b[7]; 114 | b[13] = (b[13] ^ b[2]) << 8 | (b[13] ^ b[2]) >> 24; 115 | b[8] = b[8] + b[13]; 116 | b[7] = (b[7] ^ b[8]) << 7 | (b[7] ^ b[8]) >> 25; 117 | b[3] = b[3] + b[4]; 118 | b[14] = (b[14] ^ b[3]) << 16 | (b[14] ^ b[3]) >> 16; 119 | b[9] = b[9] + b[14]; 120 | b[4] = (b[4] ^ b[9]) << 12 | (b[4] ^ b[9]) >> 20; 121 | b[3] = b[3] + b[4]; 122 | b[14] = (b[14] ^ b[3]) << 8 | (b[14] ^ b[3]) >> 24; 123 | b[9] = b[9] + b[14]; 124 | b[4] = (b[4] ^ b[9]) << 7 | (b[4] ^ b[9]) >> 25; 125 | } 126 | 127 | for (uint32_t i = 0; i < 4; ++i) 128 | { 129 | b[i] += ConstState[i]; 130 | } 131 | for (uint32_t i = 0; i < 12; ++i) 132 | { 133 | b[i + 4] += state_dwords[i]; 134 | } 135 | 136 | ++state_dwords[8]; // counter 137 | 138 | if (RemainingBytes >= 64) 139 | { 140 | if (In) 141 | { 142 | uint32_t *In32bits = (uint32_t *)CurrentIn; 143 | uint32_t *Out32bits = (uint32_t *)CurrentOut; 144 | for (uint32_t i = 0; i < 16; i++) 145 | { 146 | Out32bits[i] = In32bits[i] ^ b[i]; 147 | } 148 | } 149 | else 150 | memcpy(CurrentOut, b, 64); 151 | 152 | if (In) 153 | CurrentIn += 64; 154 | CurrentOut += 64; 155 | RemainingBytes -= 64; 156 | if (RemainingBytes == 0) 157 | return; 158 | continue; 159 | } 160 | else 161 | { 162 | if (In) 163 | { 164 | for (int32_t i = 0; i < RemainingBytes; i++) 165 | CurrentOut[i] = CurrentIn[i] ^ ((uint8_t *)b)[i]; 166 | } 167 | else 168 | memcpy(CurrentOut, b, RemainingBytes); 169 | return; 170 | } 171 | } 172 | } 173 | 174 | void chacha_encrypt_portable(uint8_t *key, uint8_t *nonce, 
uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds) 175 | { 176 | uint8_t state[48] = {0}; 177 | ChaCha20SetKey(state, key); 178 | ChaCha20SetNonce(state, nonce); 179 | ChaCha20EncryptBytes(state, in, out, bytes, rounds); 180 | } -------------------------------------------------------------------------------- /go/v1/v1.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | import ( 4 | "encoding/binary" 5 | "math/bits" 6 | "unsafe" 7 | 8 | "github.com/xelis-project/xelis-hash/go/aes" 9 | "github.com/xelis-project/xelis-hash/go/hash" 10 | ) 11 | 12 | const ( 13 | MemorySize = 32768 14 | ScratchpadIters = 5000 15 | Iters = 1 16 | BufferSize = 42 17 | SlotLength = 256 18 | KeccakWords = 25 19 | BytesArrayInput = KeccakWords * 8 20 | Stage1Max = MemorySize / KeccakWords 21 | ) 22 | 23 | type ScratchPad [MemorySize]uint64 24 | 25 | func Stage1(input *[KeccakWords]uint64, scratchPad *[MemorySize]uint64, aRange, bRange [2]int) { 26 | for i := aRange[0]; i <= aRange[1]; i++ { 27 | KeccakF1600(input) 28 | 29 | var randInt uint64 = 0 30 | for j := bRange[0]; j <= bRange[1]; j++ { 31 | pairIdx := (j + 1) % KeccakWords 32 | pairIdx2 := (j + 2) % KeccakWords 33 | 34 | targetIdx := i*KeccakWords + j 35 | a := input[j] ^ randInt 36 | 37 | // Branching 38 | left := input[pairIdx] 39 | right := input[pairIdx2] 40 | xor := left ^ right 41 | var v uint64 42 | switch xor & 0x3 { 43 | case 0: 44 | v = left & right 45 | case 1: 46 | v = ^(left & right) 47 | case 2: 48 | v = ^xor 49 | case 3: 50 | v = xor 51 | } 52 | 53 | b := a ^ v 54 | randInt = b 55 | scratchPad[targetIdx] = b 56 | } 57 | } 58 | } 59 | 60 | func XelisHash(input *[BytesArrayInput]byte, scratchPad *ScratchPad) (hash.Hash, error) { 61 | // Convert input bytes to u64 array 62 | var intInput [KeccakWords]uint64 63 | for i := 0; i < KeccakWords; i++ { 64 | intInput[i] = binary.LittleEndian.Uint64(input[i*8:]) 65 | } 66 | 67 | // Stage 1 68 | Stage1(&intInput, 
(*[MemorySize]uint64)(scratchPad), [2]int{0, Stage1Max - 1}, [2]int{0, KeccakWords - 1}) 69 | Stage1(&intInput, (*[MemorySize]uint64)(scratchPad), [2]int{Stage1Max, Stage1Max}, [2]int{0, 17}) 70 | 71 | // Stage 2 72 | var slots [SlotLength]uint32 73 | // Convert scratchpad to u32 using unsafe pointer (no copy) 74 | smallPad := (*[MemorySize * 2]uint32)(unsafe.Pointer(&scratchPad[0])) 75 | 76 | copy(slots[:], smallPad[len(smallPad)-SlotLength:]) 77 | 78 | var indices [SlotLength]uint16 79 | for iter := 0; iter < Iters; iter++ { 80 | for j := 0; j < len(smallPad)/SlotLength; j++ { 81 | // Initialize indices and precompute the total sum 82 | var totalSum uint32 = 0 83 | for k := 0; k < SlotLength; k++ { 84 | indices[k] = uint16(k) 85 | if slots[k]>>31 == 0 { 86 | totalSum += smallPad[j*SlotLength+k] 87 | } else { 88 | totalSum -= smallPad[j*SlotLength+k] 89 | } 90 | } 91 | 92 | for slotIdx := SlotLength - 1; slotIdx >= 0; slotIdx-- { 93 | indexInIndices := int(smallPad[j*SlotLength+slotIdx] % uint32(slotIdx+1)) 94 | index := int(indices[indexInIndices]) 95 | indices[indexInIndices] = indices[slotIdx] 96 | 97 | localSum := totalSum 98 | s1 := int32(slots[index] >> 31) 99 | padValue := smallPad[j*SlotLength+index] 100 | if s1 == 0 { 101 | localSum -= padValue 102 | } else { 103 | localSum += padValue 104 | } 105 | 106 | // Apply the sum to the slot 107 | slots[index] += localSum 108 | 109 | // Update the total sum 110 | s2 := int32(slots[index] >> 31) 111 | totalSum -= 2 * smallPad[j*SlotLength+index] * uint32(-s1+s2) 112 | } 113 | } 114 | } 115 | 116 | copy(smallPad[MemorySize*2-SlotLength:], slots[:]) 117 | // No need to convert back - smallPad points directly to scratchPad memory 118 | 119 | // Stage 3 120 | var key [16]byte // zero key 121 | var block [16]byte 122 | 123 | addrA := (scratchPad[MemorySize-1] >> 15) & 0x7FFF 124 | addrB := scratchPad[MemorySize-1] & 0x7FFF 125 | 126 | var memBufferA [BufferSize]uint64 127 | var memBufferB [BufferSize]uint64 128 | 129 | 
for i := uint64(0); i < BufferSize; i++ { 130 | memBufferA[i] = scratchPad[(addrA+i)%MemorySize] 131 | memBufferB[i] = scratchPad[(addrB+i)%MemorySize] 132 | } 133 | 134 | var finalResult hash.Hash 135 | 136 | for i := 0; i < ScratchpadIters; i++ { 137 | memA := memBufferA[i%BufferSize] 138 | memB := memBufferB[i%BufferSize] 139 | 140 | binary.LittleEndian.PutUint64(block[0:8], memB) 141 | binary.LittleEndian.PutUint64(block[8:16], memA) 142 | 143 | // Use single AES round instead of full encryption 144 | aes.CipherRound(&block, &key) 145 | 146 | hash1 := binary.LittleEndian.Uint64(block[0:8]) 147 | hash2 := memA ^ memB 148 | 149 | result := ^(hash1 ^ hash2) 150 | 151 | for j := 0; j < hash.HashSize; j++ { 152 | a := memBufferA[(j+i)%BufferSize] 153 | b := memBufferB[(j+i)%BufferSize] 154 | 155 | switch (result >> (j * 2)) & 0xf { 156 | case 0: 157 | result = bits.RotateLeft64(result, j) ^ b 158 | case 1: 159 | result = ^(bits.RotateLeft64(result, j) ^ a) 160 | case 2: 161 | result = ^(result ^ a) 162 | case 3: 163 | result ^= b 164 | case 4: 165 | result ^= (a + b) 166 | case 5: 167 | result ^= (a - b) 168 | case 6: 169 | result ^= (b - a) 170 | case 7: 171 | result ^= (a * b) 172 | case 8: 173 | result ^= (a & b) 174 | case 9: 175 | result ^= (a | b) 176 | case 10: 177 | result ^= (a ^ b) 178 | case 11: 179 | result ^= (a - result) 180 | case 12: 181 | result ^= (b - result) 182 | case 13: 183 | result ^= (a + result) 184 | case 14: 185 | result ^= (result - a) 186 | case 15: 187 | result ^= (result - b) 188 | } 189 | } 190 | 191 | addrB = result & 0x7FFF 192 | memBufferA[i%BufferSize] = result 193 | memBufferB[i%BufferSize] = scratchPad[addrB] 194 | 195 | addrA = (result >> 15) & 0x7FFF 196 | scratchPad[addrA] = result 197 | 198 | index := ScratchpadIters - i - 1 199 | if index < 4 { 200 | var resultBytes [8]byte 201 | binary.BigEndian.PutUint64(resultBytes[:], result) 202 | copy(finalResult[index*8:(ScratchpadIters-i)*8], resultBytes[:]) 203 | } 204 | } 205 | 
206 | return finalResult, nil 207 | } 208 | 209 | // NewScratchPad creates a new zeroed scratchpad 210 | func NewScratchPad() *ScratchPad { 211 | return &ScratchPad{} 212 | } 213 | -------------------------------------------------------------------------------- /go/v2/v2.go: -------------------------------------------------------------------------------- 1 | package v2 2 | 3 | import ( 4 | "encoding/binary" 5 | "unsafe" 6 | 7 | "github.com/chocolatkey/chacha8" 8 | "github.com/xelis-project/xelis-hash/go/aes" 9 | "github.com/xelis-project/xelis-hash/go/hash" 10 | "lukechampine.com/blake3" 11 | "lukechampine.com/uint128" 12 | ) 13 | 14 | const ( 15 | MemorySize = 429 * 128 16 | ScratchpadIters = 3 17 | BufferSize = MemorySize / 2 18 | ChunkSize = 32 19 | NonceSize = 12 20 | MemorySizeBytes = MemorySize * 8 21 | ) 22 | 23 | var Key = [16]byte{'x', 'e', 'l', 'i', 's', 'h', 'a', 's', 'h', '-', 'p', 'o', 'w', '-', 'v', '2'} 24 | 25 | type ScratchPad [MemorySize]uint64 26 | 27 | // Stage1 generates the scratchpad using ChaCha8 28 | func Stage1(input []byte, scratchPad *ScratchPad) error { 29 | // Convert scratchpad to bytes 30 | scratchPadBytes := (*[MemorySizeBytes]byte)(unsafe.Pointer(scratchPad))[:] 31 | 32 | // Reset scratchpad 33 | for i := range scratchPadBytes { 34 | scratchPadBytes[i] = 0 35 | } 36 | 37 | outputOffset := 0 38 | nonce := make([]byte, NonceSize) 39 | 40 | // Generate nonce from input 41 | inputHash := blake3.Sum256(input) 42 | copy(nonce, inputHash[:NonceSize]) 43 | 44 | numChunks := (len(input) + ChunkSize - 1) / ChunkSize 45 | 46 | for chunkIndex := 0; chunkIndex*ChunkSize < len(input); chunkIndex++ { 47 | start := chunkIndex * ChunkSize 48 | end := start + ChunkSize 49 | if end > len(input) { 50 | end = len(input) 51 | } 52 | chunk := input[start:end] 53 | 54 | // Concatenate input hash with chunk 55 | tmp := make([]byte, hash.HashSize*2) 56 | copy(tmp[0:hash.HashSize], inputHash[:]) 57 | copy(tmp[hash.HashSize:hash.HashSize+len(chunk)], chunk) 
// isqrt returns the integer square root of n — the largest r with
// r*r <= n — via Newton's method on integers: the iterate starts at or
// above the root and decreases monotonically until it stabilizes.
func isqrt(n uint64) uint64 {
	if n < 2 {
		return n
	}

	prev := n
	cur := (n + 1) >> 1
	for cur < prev {
		prev = cur
		cur = (prev + n/prev) >> 1
	}
	return prev
}
137 | result := ^(hash1 ^ hash2) 138 | 139 | for j := 0; j < BufferSize; j++ { 140 | indexA := int(result % uint64(BufferSize)) 141 | indexB := int((^bits_RotateRight64(result, uint(r))) % uint64(BufferSize)) 142 | 143 | a := memBufferA[indexA] 144 | b := memBufferB[indexB] 145 | 146 | var c uint64 147 | if r < BufferSize { 148 | c = memBufferA[r] 149 | } else { 150 | c = memBufferB[r-BufferSize] 151 | } 152 | 153 | if r < MemorySize-1 { 154 | r++ 155 | } else { 156 | r = 0 157 | } 158 | 159 | branchIdx := uint8((bits_RotateLeft64(result, uint(c)) & 0xf)) 160 | 161 | var v uint64 162 | switch branchIdx { 163 | case 0: 164 | v = result ^ (bits_RotateLeft64(c, uint(i*j)) ^ b) 165 | case 1: 166 | v = result ^ (bits_RotateRight64(c, uint(i*j)) ^ a) 167 | case 2: 168 | v = result ^ (a ^ b ^ c) 169 | case 3: 170 | v = result ^ ((a + b) * c) 171 | case 4: 172 | v = result ^ ((b - c) * a) 173 | case 5: 174 | v = result ^ (c - a + b) 175 | case 6: 176 | v = result ^ (a - b + c) 177 | case 7: 178 | v = result ^ (b*c + a) 179 | case 8: 180 | v = result ^ (c*a + b) 181 | case 9: 182 | v = result ^ (a * b * c) 183 | case 10: 184 | // combine_u64(a, b) % (c | 1) 185 | // Rust: combine_u64(high, low) where a is high, b is low 186 | t1 := uint128.New(b, a) // New(lo, hi) 187 | t2 := uint128.From64(c | 1) 188 | v = result ^ t1.Mod(t2).Lo 189 | case 11: 190 | // combine_u64(b, c) % combine_u64(result.rotate_left(r), a | 2) 191 | // Rust: combine_u64(high, low) 192 | t1 := uint128.New(c, b) // New(lo, hi) where b is high, c is low 193 | t2 := uint128.New(a|2, bits_RotateLeft64(result, uint(r))) // New(lo, hi) 194 | v = result ^ t1.Mod(t2).Lo 195 | case 12: 196 | // combine_u64(c, a) / (b | 4) 197 | // Rust: combine_u64(high, low) where c is high, a is low 198 | t1 := uint128.New(a, c) // New(lo, hi) 199 | t2 := uint128.From64(b | 4) 200 | v = result ^ t1.Div(t2).Lo 201 | case 13: 202 | // combine_u64(result.rotate_left(r), b) where first arg is high 203 | // combine_u64(a, c|8) where 
a is high 204 | t1 := uint128.New(b, bits_RotateLeft64(result, uint(r))) // New(lo, hi) 205 | t2 := uint128.New(c|8, a) // New(lo, hi) 206 | if t1.Cmp(t2) > 0 { 207 | v = result ^ t1.Div(t2).Lo 208 | } else { 209 | v = result ^ (a ^ b) 210 | } 211 | case 14: 212 | // (combine_u64(b, a) * c) >> 64 213 | // Rust wrapping_mul on u128 then >> 64 gets high 64 bits 214 | t1 := uint128.New(a, b) // New(lo, hi) where b is high, a is low 215 | prod := t1.MulWrap64(c) // Wrapping mul 216 | v = result ^ prod.Hi 217 | case 15: 218 | // (combine_u64(a, c) * combine_u64(result.rotate_right(r), b)) >> 64 219 | // Rust wrapping_mul on u128 then >> 64 gets high 64 bits 220 | rr := bits_RotateRight64(result, uint(r)) 221 | t1 := uint128.New(c, a) // New(lo, hi) where a is high, c is low 222 | t2 := uint128.New(b, rr) // New(lo, hi) where rr is high, b is low 223 | prod := t1.MulWrap(t2) // Wrapping mul 224 | v = result ^ prod.Hi 225 | } 226 | 227 | result = bits_RotateLeft64(v, 1) 228 | 229 | t := memBufferA[BufferSize-j-1] ^ result 230 | memBufferA[BufferSize-j-1] = t 231 | memBufferB[j] ^= bits_RotateRight64(t, uint(result)) 232 | } 233 | 234 | addrA = result 235 | addrB = isqrt(result) 236 | } 237 | 238 | return nil 239 | } 240 | 241 | // Stage4 hashes the entire scratchpad with Blake3 242 | func Stage4(scratchPad *ScratchPad) hash.Hash { 243 | scratchPadBytes := (*[MemorySizeBytes]byte)(unsafe.Pointer(scratchPad))[:] 244 | return blake3.Sum256(scratchPadBytes) 245 | } 246 | 247 | func XelisHash(input []byte, scratchPad *ScratchPad) (hash.Hash, error) { 248 | err := Stage1(input, scratchPad) 249 | if err != nil { 250 | return hash.Zero(), err 251 | } 252 | 253 | err = Stage3(scratchPad) 254 | if err != nil { 255 | return hash.Zero(), err 256 | } 257 | 258 | return Stage4(scratchPad), nil 259 | } 260 | 261 | func NewScratchPad() *ScratchPad { 262 | return &ScratchPad{} 263 | } 264 | 265 | // Helper functions for bit rotation 266 | func bits_RotateLeft64(x uint64, k uint) uint64 { 
// murmurhash3 applies a MurmurHash3-style 64-bit finalizer (xor-shift /
// multiply rounds) to seed.  Note the shift amounts (55, 32, 15) differ
// from the canonical 33/33/33 finalizer; they are part of this PoW's
// definition and must not be "corrected".
func murmurhash3(seed uint64) uint64 {
	x := seed
	x ^= x >> 55
	x *= 0xff51afd7ed558ccd
	x ^= x >> 32
	x *= 0xc4ceb9fe1a85ec53
	x ^= x >> 15
	return x
}

// pickHalf derives a single pseudo-random bit from seed: bit 58 of the
// mixed value.
func pickHalf(seed uint64) bool {
	const mask = uint64(1) << 58
	return murmurhash3(seed)&mask != 0
}
| } 66 | 67 | func modularPower(base, exp, mod uint64) uint64 { 68 | result := uint64(1) 69 | base %= mod 70 | 71 | for exp > 0 { 72 | if exp&1 == 1 { 73 | result = mulmod(result, base, mod) 74 | } 75 | base = mulmod(base, base, mod) 76 | exp /= 2 77 | } 78 | 79 | return result 80 | } 81 | 82 | // mulmod computes (a * b) % m avoiding overflow 83 | func mulmod(a, b, m uint64) uint64 { 84 | t1 := uint128.From64(a) 85 | t2 := uint128.From64(b) 86 | prod := t1.MulWrap(t2) 87 | mod := uint128.From64(m) 88 | return prod.Mod(mod).Lo 89 | } 90 | 91 | // Stage3 performs the complex memory operations with branching 92 | func Stage3(scratchPad *ScratchPad) error { 93 | key := Key 94 | var block [16]byte 95 | 96 | // Split scratchpad 97 | memBufferA := scratchPad[:BufferSize] 98 | memBufferB := scratchPad[BufferSize:] 99 | 100 | addrA := memBufferB[BufferSize-1] 101 | addrB := memBufferA[BufferSize-1] >> 32 102 | 103 | r := 0 104 | 105 | for i := 0; i < ScratchpadIters; i++ { 106 | indexA := mapIndex(addrA) 107 | memA := memBufferA[indexA] 108 | 109 | indexB := mapIndex(memA ^ addrB) 110 | memB := memBufferB[indexB] 111 | 112 | binary.LittleEndian.PutUint64(block[0:8], memB) 113 | binary.LittleEndian.PutUint64(block[8:16], memA) 114 | 115 | aes.CipherRound(&block, &key) 116 | 117 | hash1 := binary.LittleEndian.Uint64(block[0:8]) 118 | hash2 := binary.LittleEndian.Uint64(block[8:16]) 119 | 120 | result := ^(hash1 ^ hash2) 121 | 122 | for j := 0; j < BufferSize; j++ { 123 | indexA := mapIndex(result) 124 | a := memBufferA[indexA] 125 | 126 | indexB := mapIndex(a ^ (^bits_RotateRight64(result, uint(r)))) 127 | b := memBufferB[indexB] 128 | 129 | var c uint64 130 | if r < BufferSize { 131 | c = memBufferA[r] 132 | } else { 133 | c = memBufferB[r-BufferSize] 134 | } 135 | 136 | if r < MemorySize-1 { 137 | r++ 138 | } else { 139 | r = 0 140 | } 141 | 142 | branchIdx := uint8(bits_RotateLeft64(result, uint(c)) & 0xf) 143 | 144 | var v uint64 145 | switch branchIdx { 146 | case 0: 147 
| // combine_u64((a + i), isqrt(b + j)) % (murmurhash3(c ^ result ^ i ^ j) | 1) 148 | t1 := uint128.New(isqrt(b+uint64(j)), a+uint64(i)) // New(lo, hi) 149 | denom := uint128.From64(murmurhash3(c^result^uint64(i)^uint64(j)) | 1) 150 | v = t1.Mod(denom).Lo 151 | case 1: 152 | // ROTL((c + i) % isqrt(b | 2), i + j) * isqrt(a + j) 153 | sqrt := isqrt(b | 2) 154 | if sqrt == 0 { 155 | sqrt = 1 156 | } 157 | t1 := (c + uint64(i)) % sqrt 158 | t2 := bits_RotateLeft64(t1, uint(i+j)) 159 | t3 := isqrt(a + uint64(j)) 160 | v = t2 * t3 161 | case 2: 162 | // (isqrt(a + i) * isqrt(c + j)) ^ (b + i + j) 163 | t1 := isqrt(a + uint64(i)) 164 | t2 := isqrt(c + uint64(j)) 165 | t3 := t1 * t2 166 | v = t3 ^ (b + uint64(i) + uint64(j)) 167 | case 3: 168 | v = (a + b) * c 169 | case 4: 170 | v = (b - c) * a 171 | case 5: 172 | v = c - a + b 173 | case 6: 174 | v = a - b + c 175 | case 7: 176 | v = b*c + a 177 | case 8: 178 | v = c*a + b 179 | case 9: 180 | v = a * b * c 181 | case 10: 182 | t1 := uint128.New(b, a) // New(lo, hi) 183 | t2 := uint128.From64(c | 1) 184 | v = t1.Mod(t2).Lo 185 | case 11: 186 | t1 := uint128.New(c, b) // New(lo, hi) 187 | t2 := uint128.New(a|2, bits_RotateLeft64(result, uint(r))) // New(lo, hi) 188 | if t2.Cmp(t1) > 0 { 189 | v = c 190 | } else { 191 | v = t1.Mod(t2).Lo 192 | } 193 | case 12: 194 | t1 := uint128.New(a, c) // New(lo, hi) 195 | t2 := uint128.From64(b | 4) 196 | v = t1.Div(t2).Lo 197 | case 13: 198 | t1 := uint128.New(b, bits_RotateLeft64(result, uint(r))) // New(lo, hi) 199 | t2 := uint128.New(c|8, a) // New(lo, hi) 200 | if t1.Cmp(t2) > 0 { 201 | v = t1.Div(t2).Lo 202 | } else { 203 | v = a ^ b 204 | } 205 | case 14: 206 | // (combine_u64(b, a) * c) >> 64 207 | t1 := uint128.New(a, b) // New(lo, hi) 208 | prod := t1.MulWrap64(c) 209 | v = prod.Hi 210 | case 15: 211 | // (combine_u64(a, c) * combine_u64(result.rotate_right(r), b)) >> 64 212 | rr := bits_RotateRight64(result, uint(r)) 213 | t1 := uint128.New(c, a) // New(lo, hi) 214 | t2 := 
uint128.New(b, rr) // New(lo, hi) 215 | prod := t1.MulWrap(t2) 216 | v = prod.Hi 217 | } 218 | 219 | seed := v ^ result 220 | result = bits_RotateLeft64(seed, uint(r)) 221 | 222 | useBufferB := pickHalf(v) 223 | indexT := mapIndex(seed) 224 | var t uint64 225 | if useBufferB { 226 | t = memBufferB[indexT] ^ result 227 | } else { 228 | t = memBufferA[indexT] ^ result 229 | } 230 | 231 | indexA2 := mapIndex(t ^ result ^ 0x9e3779b97f4a7c15) 232 | indexB2 := mapIndex(uint64(indexA2) ^ ^result ^ 0xd2b74407b1ce6e93) 233 | 234 | oldA := memBufferA[indexA2] 235 | memBufferA[indexA2] = t 236 | memBufferB[indexB2] ^= oldA ^ bits_RotateRight64(t, uint(i+j)) 237 | } 238 | 239 | addrA = modularPower(addrA, addrB, result) 240 | addrB = isqrt(result) * uint64(r+1) * isqrt(addrA) 241 | } 242 | 243 | return nil 244 | } 245 | 246 | // Stage1 generates the scratchpad using ChaCha8 (same as v2 but with v3's memory size) 247 | func Stage1(input []byte, scratchPad *ScratchPad) error { 248 | // Convert scratchpad to bytes 249 | scratchPadBytes := (*[MemorySizeBytes]byte)(unsafe.Pointer(scratchPad))[:] 250 | 251 | // Reset scratchpad 252 | for i := range scratchPadBytes { 253 | scratchPadBytes[i] = 0 254 | } 255 | 256 | const ChunkSize = 32 257 | const NonceSize = 12 258 | 259 | outputOffset := 0 260 | nonce := make([]byte, NonceSize) 261 | 262 | // Generate nonce from input 263 | inputHash := blake3.Sum256(input) 264 | copy(nonce, inputHash[:NonceSize]) 265 | 266 | numChunks := (len(input) + ChunkSize - 1) / ChunkSize 267 | 268 | for chunkIndex := 0; chunkIndex*ChunkSize < len(input); chunkIndex++ { 269 | start := chunkIndex * ChunkSize 270 | end := start + ChunkSize 271 | if end > len(input) { 272 | end = len(input) 273 | } 274 | chunk := input[start:end] 275 | 276 | // Concatenate input hash with chunk 277 | tmp := make([]byte, hash.HashSize*2) 278 | copy(tmp[0:hash.HashSize], inputHash[:]) 279 | copy(tmp[hash.HashSize:hash.HashSize+len(chunk)], chunk) 280 | 281 | // Hash it 282 | 
inputHash = blake3.Sum256(tmp) 283 | 284 | cipher, err := chacha8.New(inputHash[:], nonce) 285 | if err != nil { 286 | return err 287 | } 288 | 289 | // Calculate output size for this iteration 290 | remainingOutputSize := MemorySizeBytes - outputOffset 291 | chunksLeft := numChunks - chunkIndex 292 | chunkOutputSize := remainingOutputSize / chunksLeft 293 | currentOutputSize := remainingOutputSize 294 | if currentOutputSize > chunkOutputSize { 295 | currentOutputSize = chunkOutputSize 296 | } 297 | 298 | // Apply keystream 299 | offset := chunkIndex * currentOutputSize 300 | part := scratchPadBytes[offset : offset+currentOutputSize] 301 | cipher.XORKeyStream(part, part) 302 | 303 | outputOffset += currentOutputSize 304 | 305 | // Update nonce 306 | nonceStart := currentOutputSize - NonceSize 307 | if nonceStart < 0 { 308 | nonceStart = 0 309 | } 310 | copy(nonce, part[nonceStart:]) 311 | } 312 | 313 | return nil 314 | } 315 | 316 | func XelisHash(input []byte, scratchPad *ScratchPad) (hash.Hash, error) { 317 | // Use v3's Stage1 with correct memory size 318 | err := Stage1(input, scratchPad) 319 | if err != nil { 320 | return hash.Zero(), err 321 | } 322 | 323 | // V3's custom Stage3 324 | err = Stage3(scratchPad) 325 | if err != nil { 326 | return hash.Zero(), err 327 | } 328 | 329 | // Stage4: hash the whole scratchpad 330 | scratchPadBytes := (*[MemorySizeBytes]byte)(unsafe.Pointer(scratchPad))[:] 331 | return blake3.Sum256(scratchPadBytes), nil 332 | } 333 | 334 | func NewScratchPad() *ScratchPad { 335 | return &ScratchPad{} 336 | } 337 | 338 | // Helper functions 339 | func bits_RotateLeft64(x uint64, k uint) uint64 { 340 | const n = 64 341 | s := k & (n - 1) 342 | return x<>(n-s) 343 | } 344 | 345 | func bits_RotateRight64(x uint64, k uint) uint64 { 346 | const n = 64 347 | s := k & (n - 1) 348 | return x>>s | x<<(n-s) 349 | } 350 | -------------------------------------------------------------------------------- /src/tracker.rs: 
--------------------------------------------------------------------------------
use plotters::{
    chart::ChartBuilder,
    prelude::*,
    style::{
        text_anchor::{HPos, Pos, VPos},
        Color,
        IntoFont,
        RGBColor,
        TextStyle,
        WHITE
    }
};

/// A single memory access, classified by direction.
#[derive(Debug, Clone, Copy)]
pub enum MemOp {
    Read,
    Write,
}

/// Per-scratchpad-index counters of read and write accesses.
#[derive(Debug, Clone, Copy, Default)]
pub struct MemTracker {
    pub read: u64,
    pub write: u64,
}

// Track the operations used in each iteration
// This is used to verify that we have a good distribution
// in branches and memory operations
#[derive(Debug)]
pub struct OpsTracker {
    // how many times each of the 16 branch ids was taken
    branches: [usize; 16],
    // memory operations per scratchpad index: entry k holds the
    // read/write counters for scratchpad index k
    // NOTE(review): the generic parameter was stripped by extraction;
    // reconstructed as Vec<MemTracker> — confirm against the repository.
    mem_ops: Vec<MemTracker>,
}

impl OpsTracker {
    /// Create a tracker for a scratchpad of `scratchpad` entries,
    /// with all counters zeroed.
    pub fn new(scratchpad: usize) -> Self {
        Self {
            branches: [0; 16],
            mem_ops: vec![Default::default(); scratchpad],
        }
    }

    /// Record that branch `branch` was taken once.
    /// Panics if `branch >= 16`.
    pub fn add_branch(&mut self, branch: u8) {
        self.branches[branch as usize] += 1;
    }

    /// Record one read or write access at scratchpad index `index`.
    /// Panics if `index` is out of range.
    pub fn add_mem_op(&mut self, index: usize, mem_op: MemOp) {
        let tracker = &mut self.mem_ops[index];
        match mem_op {
            MemOp::Read => tracker.read += 1,
            MemOp::Write => tracker.write += 1,
        }
    }

    /// Raw branch counters, indexed by branch id.
    pub fn get_branches(&self) -> &[usize; 16] {
        &self.branches
    }

    /// Raw per-index memory access counters.
    pub fn get_mem_ops(&self) -> &Vec<MemTracker> {
        &self.mem_ops
    }

    /// Generate a percentage-based heatmap of branch usage
    /// and write it as a bitmap image to `output_path`.
    pub fn generate_branch_distribution(&self, output_path: &str) -> Result<(), anyhow::Error> {
        let total: usize = self.branches.iter().sum();
        // avoid division by zero when nothing was recorded
        let total = total.max(1);

        let percentages: Vec<f64> = self.branches
            .iter()
            .map(|&b| (b as f64 / total as f64) * 100.0)
            .collect();

        // Choose a reasonable y max (at least a little above the tallest bar)
        let max_val = percentages
            .iter()
            .cloned()
            .fold(0.0_f64, f64::max)
            .max(10.0);

        // Create drawing area
        let root = BitMapBackend::new(output_path, (1000, 600)).into_drawing_area();
        root.fill(&WHITE)?;

        // Use f64 for x-range so we can put label at i + 0.5
        let mut chart = ChartBuilder::on(&root)
            .caption("Branch Usage Distribution (%)", ("sans-serif", 30))
            .margin(20)
            .x_label_area_size(40)
            .y_label_area_size(60)
            .build_cartesian_2d(0f64..16f64, 0f64..(max_val * 1.12))?; // leave headroom for labels

        chart
            .configure_mesh()
            .x_labels(16)
            .x_label_formatter(&|x| format!("{}", *x as usize))
            .x_desc("Branch ID")
            .y_desc("Usage (%)")
            .axis_desc_style(("sans-serif", 20))
            .draw()?;

        // Bar color
        let bar_style = RGBColor(30, 120, 200).filled();

        // Draw bars using f64 coordinates
        for (i, &pct) in percentages.iter().enumerate() {
            let x0 = i as f64;
            let x1 = x0 + 0.9; // slightly narrower than 1.0 for spacing
            chart.draw_series(std::iter::once(Rectangle::new(
                [(x0, 0.0), (x1, pct)],
                bar_style,
            )))?;
        }

        // Prepare a TextStyle and position it anchored to center above the bar
        let label_style = TextStyle::from(("sans-serif", 14).into_font())
            .pos(Pos::new(HPos::Center, VPos::Bottom));

        // Draw labels
        for (i, &pct) in percentages.iter().enumerate() {
            let x_center = i as f64 + 0.45; // center given x1 = x0 + 0.9
            let y = pct + (max_val * 0.02); // small offset above the bar
            chart.draw_series(std::iter::once(Text::new(
                format!("{:.1}%", pct),
                (x_center, y),
                label_style.clone(),
            )))?;
        }

        root.present()?;
        Ok(())
    }

    /// Plot per-index read/write access counts as overlaid bars, plus a
    /// zero-phase moving average (forward + backward boxcar, "filtfilt")
    /// of each series, and write the chart to `output_path`.
    /// `ma_window` is the boxcar width (clamped to at least 1).
    pub fn generate_memory_usage_graph(
        &self,
        output_path: &str,
        ma_window: usize,
    ) -> Result<(), anyhow::Error> {
        use plotters::prelude::*;

        let scratchpad_size = self.mem_ops.len();
        let mut read_counts = vec![0usize; scratchpad_size];
        let mut write_counts = vec![0usize; scratchpad_size];

        for (i, ops) in self.mem_ops.iter().enumerate() {
            read_counts[i] = ops.read as usize;
            write_counts[i] = ops.write as usize;
        }

        // ---- zero-phase moving average (filtfilt for a boxcar) ----

        // Forward boxcar average over usize data; the first w-1 outputs
        // average only the prefix seen so far (denominator min(i+1, w)).
        #[inline]
        fn ma_forward_usize(data: &[usize], w: usize) -> Vec<f64> {
            let w = w.max(1);
            let mut out = vec![0.0; data.len()];
            let mut sum: u64 = 0;
            for i in 0..data.len() {
                sum += data[i] as u64;
                if i >= w { sum -= data[i - w] as u64; }
                let denom = (i + 1).min(w) as f64;
                out[i] = sum as f64 / denom;
            }
            out
        }

        // Same forward boxcar, but over f64 data (used for the reverse pass).
        #[inline]
        fn ma_forward_f64(data: &[f64], w: usize) -> Vec<f64> {
            let w = w.max(1);
            let mut out = vec![0.0; data.len()];
            let mut sum: f64 = 0.0;
            for i in 0..data.len() {
                sum += data[i];
                if i >= w { sum -= data[i - w]; }
                let denom = (i + 1).min(w) as f64;
                out[i] = sum / denom;
            }
            out
        }

        // Apply the boxcar forward, then again on the reversed signal, and
        // reverse back: cancels the phase lag of a one-sided average.
        #[inline]
        fn filtfilt_ma_usize(data: &[usize], w: usize) -> Vec<f64> {
            let fwd = ma_forward_usize(data, w);
            let mut rev = fwd.clone();
            rev.reverse();
            let rev2 = ma_forward_f64(&rev, w);
            let mut out = rev2;
            out.reverse();
            out
        }

        let read_ma = filtfilt_ma_usize(&read_counts, ma_window);
        let write_ma = filtfilt_ma_usize(&write_counts, ma_window);

        // Y-axis: tall enough for both the raw bars and the MA overlays
        let counts_max = read_counts.iter().zip(write_counts.iter())
            .map(|(&r, &w)| r.max(w))
            .max()
            .unwrap_or(1) as f64;
        let ma_max = read_ma.iter().cloned().fold(0.0, f64::max)
            .max(write_ma.iter().cloned().fold(0.0, f64::max));
        let y_max = counts_max.max(ma_max) * 1.15;

        // ---- plot ----
        let root = BitMapBackend::new(output_path, (1920, 1080)).into_drawing_area();
        root.fill(&WHITE)?;

        let mut chart = ChartBuilder::on(&root)
            .caption(
                format!("Memory Accesses per Index (Read/Write + filtfilt MA({}))", ma_window.max(1)),
                ("sans-serif", 28),
            )
            .margin(20)
            .x_label_area_size(40)
            .y_label_area_size(60)
            .build_cartesian_2d(0f64..scratchpad_size as f64, 0f64..y_max)?;

        chart
            .configure_mesh()
            .x_labels(20)
            .x_label_formatter(&|x| format!("{}", *x as usize))
            .x_desc("Memory Index")
            .y_desc("Access Count")
            .axis_desc_style(("sans-serif", 18))
            .draw()?;

        let read_fill = RGBColor(30, 144, 255).filled();
        let write_fill = RGBColor(220, 50, 47).filled();
        let read_line = RGBColor(30, 144, 255);
        let write_line = RGBColor(220, 50, 47);

        let avg_read_line = RGBColor(100, 180, 255);
        let avg_write_line = RGBColor(255, 100, 100);

        let bar_width = 1.0;

        // Draw the taller bar first so the shorter one remains visible on top
        for i in 0..scratchpad_size {
            let x0 = i as f64 - bar_width / 2.0;
            let x1 = i as f64 + bar_width / 2.0;
            let r = read_counts[i] as f64;
            let w = write_counts[i] as f64;

            if r > w {
                chart.draw_series(std::iter::once(Rectangle::new([(x0, 0.0), (x1, r)], read_fill.clone())))?;
                if w > 0.0 {
                    chart.draw_series(std::iter::once(Rectangle::new([(x0, 0.0), (x1, w)], write_fill.clone())))?;
                }
            } else {
                chart.draw_series(std::iter::once(Rectangle::new([(x0, 0.0), (x1, w)], write_fill.clone())))?;
                if r > 0.0 {
                    chart.draw_series(std::iter::once(Rectangle::new([(x0, 0.0), (x1, r)], read_fill.clone())))?;
                }
            }
        }

        // Zero-phase MA overlays
        chart.draw_series(LineSeries::new(
            (0..scratchpad_size).map(|i| (i as f64, read_ma[i])),
            ShapeStyle::from(&avg_read_line).stroke_width(3),
        ))?.label(format!("Read MA_filtfilt({})", ma_window.max(1)));

        chart.draw_series(LineSeries::new(
            (0..scratchpad_size).map(|i| (i as f64, write_ma[i])),
            ShapeStyle::from(&avg_write_line).stroke_width(3),
        ))?.label(format!("Write MA_filtfilt({})", ma_window.max(1)));

        // Legend: degenerate (invisible) series used purely to attach
        // legend entries with the right swatch shapes
        chart
            .draw_series(std::iter::once(Rectangle::new([(0.0, 0.0), (0.0, 0.0)], read_fill.clone())))?
            .label("Read")
            .legend(move |(x, y)| Rectangle::new([(x, y - 5), (x + 10, y + 5)], read_fill.clone()));
        chart
            .draw_series(std::iter::once(Rectangle::new([(0.0, 0.0), (0.0, 0.0)], write_fill.clone())))?
            .label("Write")
            .legend(move |(x, y)| Rectangle::new([(x, y - 5), (x + 10, y + 5)], write_fill.clone()));
        chart
            .draw_series(std::iter::once(PathElement::new(
                vec![(0.0, 0.0), (0.0, 0.0)],
                ShapeStyle::from(&read_line).stroke_width(3),
            )))?
            .label(format!("Read MA_filtfilt({})", ma_window.max(1)))
            .legend(move |(x, y)| PathElement::new(
                vec![(x, y), (x + 14, y)],
                ShapeStyle::from(&read_line).stroke_width(3),
            ));
        chart
            .draw_series(std::iter::once(PathElement::new(
                vec![(0.0, 0.0), (0.0, 0.0)],
                ShapeStyle::from(&write_line).stroke_width(3),
            )))?
            .label(format!("Write MA_filtfilt({})", ma_window.max(1)))
            .legend(move |(x, y)| PathElement::new(
                vec![(x, y), (x + 14, y)],
                ShapeStyle::from(&write_line).stroke_width(3),
            ));

        chart
            .configure_series_labels()
            .position(SeriesLabelPosition::UpperRight)
            .border_style(&BLACK)
            .background_style(WHITE.mix(0.8))
            .draw()?;

        root.present()?;
        Ok(())
    }

}
-------------------------------------------------------------------------------- /src/v1.rs:
--------------------------------------------------------------------------------
use aes::cipher::generic_array::GenericArray;
use tiny_keccak::keccakp;

use crate::{Hash, HASH_SIZE, Error, scratchpad::ScratchPad as ScratchPadInternal};

// These are tweakable parameters
pub const MEMORY_SIZE: usize = 32768;
pub const SCRATCHPAD_ITERS: usize = 5000;
pub const ITERS: usize = 1;
pub const BUFFER_SIZE: usize = 42;
pub const SLOT_LENGTH: usize = 256;

// Untweakable parameters
pub const KECCAK_WORDS: usize = 25;
pub const BYTES_ARRAY_INPUT: usize = KECCAK_WORDS * 8;
pub const STAGE_1_MAX: usize = MEMORY_SIZE / KECCAK_WORDS;

// Scratchpad used to store intermediate values
// It has a fixed size of `MEMORY_SIZE` u64s
// It can be easily reused for multiple hashing operations safely
// NOTE(review): the generic parameter was stripped by extraction;
// reconstructed as <MEMORY_SIZE> — confirm against src/scratchpad.rs.
pub type ScratchPad = ScratchPadInternal<MEMORY_SIZE>;

// Align the input to 8 bytes
const ALIGNMENT: usize = 8;

#[derive(Debug, bytemuck::Pod, bytemuck::Zeroable, Copy, Clone)]
#[repr(C, align(8))]
pub struct Bytes8Alignment([u8; ALIGNMENT]);

// This is a workaround to force the correct alignment on Windows and MacOS
// We need an input of `BYTES_ARRAY_INPUT` bytes, but we need to ensure that it's aligned to 8 bytes
// to be able to cast it to a `[u64; KECCAK_WORDS]` later on.
#[derive(Debug, Clone)]
pub struct AlignedInput {
    // backing storage of 8-byte-aligned chunks; viewed as raw bytes by the
    // accessors below
    // NOTE(review): generic parameter stripped by extraction; reconstructed
    // as Vec<Bytes8Alignment> — confirm against the repository.
    data: Vec<Bytes8Alignment>,
}

impl Default for AlignedInput {
    // Allocate enough aligned chunks to cover BYTES_ARRAY_INPUT bytes,
    // rounding up if it is not a multiple of ALIGNMENT.
    fn default() -> Self {
        let mut n = BYTES_ARRAY_INPUT / ALIGNMENT;
        if BYTES_ARRAY_INPUT % ALIGNMENT != 0 {
            n += 1;
        }

        Self {
            data: vec![Bytes8Alignment([0; ALIGNMENT]); n]
        }
    }
}

impl AlignedInput {
    // The number of elements in the input
    pub fn len(&self) -> usize {
        self.data.len()
    }

    // The size of the input in bytes
    pub fn size(&self) -> usize {
        self.data.len() * ALIGNMENT
    }

    // Get a mutable pointer to the input
    pub fn as_mut_ptr(&mut self) -> *mut u8 {
        self.data.as_mut_ptr() as *mut u8
    }

    // Retrieve the input as a mutable slice
    pub fn as_mut_slice(&mut self) -> Result<&mut [u8; BYTES_ARRAY_INPUT], Error> {
        bytemuck::cast_slice_mut(&mut self.data).try_into().map_err(|_| Error::FormatError)
    }

    // Retrieve the input as a slice
    pub fn as_slice(&self) -> Result<&[u8; BYTES_ARRAY_INPUT], Error> {
        bytemuck::cast_slice(&self.data).try_into().map_err(|_| Error::FormatError)
    }
}

// Stage 1: fill `scratch_pad` from repeated keccak-p permutations of the
// input state. `a` is the inclusive range of outer (row) iterations, `b`
// the inclusive range of words written per row. Each written word mixes the
// permuted state with a running value (`rand_int`) and a 2-bit branch on
// the XOR of two neighboring state words. Consensus-critical: behavior
// must not change.
#[inline(always)]
fn stage_1(input: &mut [u64; KECCAK_WORDS], scratch_pad: &mut [u64; MEMORY_SIZE], a: (usize, usize), b: (usize, usize)) {
    for i in a.0..=a.1 {
        keccakp(input);

        let mut rand_int: u64 = 0;
        for j in b.0..=b.1 {
            let pair_idx = (j + 1) % KECCAK_WORDS;
            let pair_idx2 = (j + 2) % KECCAK_WORDS;

            let target_idx = i * KECCAK_WORDS + j;
            let a = input[j] ^ rand_int;
            // Branching
            let left = input[pair_idx];
            let right = input[pair_idx2];
            let xor = left ^ right;
            let v = match xor & 0x3 {
                0 => left & right,
                1 => !(left & right),
                2 => !xor,
                3 => xor,
                _ => unreachable!(),
            };
            let b = a ^ v;
            rand_int = b;
            scratch_pad[target_idx] = b;
        }
    }
}

// This function is used to hash the input using the generated scratch pad
// NOTE: The scratchpad is completely overwritten in stage 1 and can be reused without any issues
// Stages: (1) keccak-based scratchpad fill, (2) slot-shuffle pass over the
// scratchpad viewed as u32s, (3) AES-round mixing loop whose last four
// results form the 32-byte output. All arithmetic is wrapping and
// consensus-critical.
// NOTE(review): return type generics stripped by extraction; reconstructed
// as Result<Hash, Error> — consistent with the test usage below.
pub fn xelis_hash(input: &mut [u8; BYTES_ARRAY_INPUT], scratch_pad: &mut ScratchPad) -> Result<Hash, Error> {
    let int_input: &mut [u64; KECCAK_WORDS] = bytemuck::try_from_bytes_mut(input)
        .map_err(|e| Error::CastError(e))?;

    // stage 1
    let scratch_pad = scratch_pad.as_mut_slice();
    stage_1(int_input, scratch_pad, (0, STAGE_1_MAX - 1), (0, KECCAK_WORDS - 1));
    stage_1(int_input, scratch_pad, (STAGE_1_MAX, STAGE_1_MAX), (0, 17));

    // stage 2
    let mut slots: [u32; SLOT_LENGTH] = [0; SLOT_LENGTH];
    // this is equal to MEMORY_SIZE, just in u32 format
    let small_pad: &mut [u32; MEMORY_SIZE * 2] = bytemuck::try_cast_slice_mut(scratch_pad)
        .map_err(|e| Error::CastError(e))?
        .try_into()
        .map_err(|_| Error::FormatError)?;

    slots.copy_from_slice(&small_pad[small_pad.len() - SLOT_LENGTH..]);

    let mut indices: [u16; SLOT_LENGTH] = [0; SLOT_LENGTH];
    for _ in 0..ITERS {
        for j in 0..small_pad.len() / SLOT_LENGTH {
            // Initialize indices and precompute the total sum of small pad
            // (sign of each slot's top bit decides add vs subtract)
            let mut total_sum: u32 = 0;
            for k in 0..SLOT_LENGTH {
                indices[k] = k as u16;
                if slots[k] >> 31 == 0 {
                    total_sum = total_sum.wrapping_add(small_pad[j * SLOT_LENGTH + k]);
                } else {
                    total_sum = total_sum.wrapping_sub(small_pad[j * SLOT_LENGTH + k]);
                }
            }

            // Fisher-Yates-style selection: pick a remaining slot at random
            // (driven by pad data), update it, and keep total_sum consistent
            for slot_idx in (0..SLOT_LENGTH).rev() {
                let index_in_indices = (small_pad[j * SLOT_LENGTH + slot_idx] % (slot_idx as u32 + 1)) as usize;
                let index = indices[index_in_indices] as usize;
                indices[index_in_indices] = indices[slot_idx];

                let mut local_sum = total_sum;
                let s1 = (slots[index] >> 31) as i32;
                let pad_value = small_pad[j * SLOT_LENGTH + index];
                if s1 == 0 {
                    local_sum = local_sum.wrapping_sub(pad_value);
                } else {
                    local_sum = local_sum.wrapping_add(pad_value);
                }

                // Apply the sum to the slot
                slots[index] = slots[index].wrapping_add(local_sum);

                // Update the total sum
                // (corrects for a possible sign flip of the updated slot)
                let s2 = (slots[index] >> 31) as i32;
                total_sum = total_sum.wrapping_sub(2u32.wrapping_mul(small_pad[(j * SLOT_LENGTH).wrapping_add(index)].wrapping_mul((-s1).wrapping_add(s2) as u32)));
            }
        }
    }

    small_pad[(MEMORY_SIZE * 2) - SLOT_LENGTH..].copy_from_slice(&slots);

    // stage 3
    let key = GenericArray::from([0u8; 16]);
    let mut block = GenericArray::from([0u8; 16]);

    // 15-bit addresses into the scratchpad (MEMORY_SIZE == 2^15)
    let mut addr_a = (scratch_pad[MEMORY_SIZE - 1] >> 15) & 0x7FFF;
    let mut addr_b = scratch_pad[MEMORY_SIZE - 1] & 0x7FFF;

    let mut mem_buffer_a: [u64; BUFFER_SIZE] = [0; BUFFER_SIZE];
    let mut mem_buffer_b: [u64; BUFFER_SIZE] = [0; BUFFER_SIZE];

    for i in 0..BUFFER_SIZE as u64 {
        mem_buffer_a[i as usize] = scratch_pad[((addr_a + i) % MEMORY_SIZE as u64) as usize];
        mem_buffer_b[i as usize] = scratch_pad[((addr_b + i) % MEMORY_SIZE as u64) as usize];
    }

    let mut final_result = [0; HASH_SIZE];

    for i in 0..SCRATCHPAD_ITERS {
        let mem_a = mem_buffer_a[i % BUFFER_SIZE];
        let mem_b = mem_buffer_b[i % BUFFER_SIZE];

        block[..8].copy_from_slice(&mem_b.to_le_bytes());
        block[8..].copy_from_slice(&mem_a.to_le_bytes());

        aes::hazmat::cipher_round(&mut block, &key);

        let hash1 = u64::from_le_bytes(block[0..8].try_into().map_err(|_| Error::FormatError)?);
        let hash2 = mem_a ^ mem_b;

        let mut result = !(hash1 ^ hash2);

        for j in 0..HASH_SIZE {
            let a = mem_buffer_a[(j + i) % BUFFER_SIZE];
            let b = mem_buffer_b[(j + i) % BUFFER_SIZE];

            // more branching
            let v = match (result >> (j * 2)) & 0xf {
                0 => result.rotate_left(j as u32) ^ b,
                1 => !(result.rotate_left(j as u32) ^ a),
                2 => !(result ^ a),
                3 => result ^ b,
                4 => result ^ (a.wrapping_add(b)),
                5 => result ^ (a.wrapping_sub(b)),
                6 => result ^ (b.wrapping_sub(a)),
                7 => result ^ (a.wrapping_mul(b)),
                8 => result ^ (a & b),
                9 => result ^ (a | b),
                10 => result ^ (a ^ b),
                11 => result ^ (a.wrapping_sub(result)),
                12 => result ^ (b.wrapping_sub(result)),
                13 => result ^ (a.wrapping_add(result)),
                14 => result ^ (result.wrapping_sub(a)),
                15 => result ^ (result.wrapping_sub(b)),
                _ => unreachable!(),
            };

            result = v;
        }

        addr_b = result & 0x7FFF;
        mem_buffer_a[i % BUFFER_SIZE] = result;
        mem_buffer_b[i % BUFFER_SIZE] = scratch_pad[addr_b as usize];

        addr_a = (result >> 15) & 0x7FFF;
        scratch_pad[addr_a as usize] = result;

        // The last 4 iterations fill the 32-byte output, most significant
        // chunk first (index == SCRATCHPAD_ITERS - i - 1, so the slice is
        // exactly 8 bytes wide)
        let index = SCRATCHPAD_ITERS - i - 1;
        if index < 4 {
            final_result[index * 8..(SCRATCHPAD_ITERS - i) * 8].copy_from_slice(&result.to_be_bytes());
        }
    }

    Ok(final_result)
}

#[cfg(test)]
mod tests {
    use super::*;

    // Run the hash on `input` and compare against a known test vector
    fn test_input(input: &mut [u8; BYTES_ARRAY_INPUT], expected_hash: Hash) {
        let mut scratch_pad = ScratchPad::default();
        let hash = xelis_hash(input, &mut scratch_pad).unwrap();
        assert_eq!(hash, expected_hash);
    }

    #[test]
    fn test_zero_input() {
        let mut input = [0u8; 200];
        let expected_hash = [
            0x0e, 0xbb, 0xbd, 0x8a, 0x31, 0xed, 0xad, 0xfe, 0x09, 0x8f, 0x2d, 0x77, 0x0d, 0x84,
            0xb7, 0x19, 0x58, 0x86, 0x75, 0xab, 0x88, 0xa0, 0xa1, 0x70, 0x67, 0xd0, 0x0a, 0x8f,
            0x36, 0x18, 0x22, 0x65,
        ];

        test_input(&mut input, expected_hash);
    }

    #[test]
    fn test_xelis_input() {
        let mut input = [0u8; BYTES_ARRAY_INPUT];

        let custom = b"xelis-hashing-algorithm";
        input[0..custom.len()].copy_from_slice(custom);

        let expected_hash = [
            106, 106, 173, 8, 207, 59, 118, 108, 176, 196, 9, 124, 250, 195, 3,
            61, 30, 146, 238, 182, 88, 83, 115, 81, 139, 56, 3, 28, 176, 86, 68, 21
        ];
        test_input(&mut input, expected_hash);
    }

    #[test]
    fn test_scratch_pad() {
        let mut scratch_pad = ScratchPad::default();
        let mut input = AlignedInput::default();

        let hash = xelis_hash(input.as_mut_slice().unwrap(), &mut scratch_pad).unwrap();
        let expected_hash = [
            0x0e, 0xbb, 0xbd, 0x8a, 0x31, 0xed, 0xad, 0xfe, 0x09, 0x8f, 0x2d, 0x77, 0x0d, 0x84,
            0xb7, 0x19, 0x58, 0x86, 0x75, 0xab, 0x88, 0xa0, 0xa1, 0x70, 0x67, 0xd0, 0x0a, 0x8f,
            0x36, 0x18, 0x22, 0x65,
        ];
        assert_eq!(hash, expected_hash);
    }

    #[test]
    fn test_bytes_alignment() {
        // NOTE(review): turbofish target stripped by extraction;
        // reconstructed as Bytes8Alignment, matching the assertion below.
        let alignment = std::mem::align_of::<Bytes8Alignment>();
        assert_eq!(alignment, 8);
    }
}
-------------------------------------------------------------------------------- /go/v1/keccak.go:
--------------------------------------------------------------------------------
package v1

import "math/bits"

const RC_LEN = 12

// rc stores the round constants for use in the ι step.
var rc = [RC_LEN]uint64{
	0x000000008000808b,
	0x800000000000008b,
	0x8000000000008089,
	0x8000000000008003,
	0x8000000000008002,
	0x8000000000000080,
	0x000000000000800a,
	0x800000008000000a,
	0x8000000080008081,
	0x8000000000008080,
	0x0000000080000001,
	0x8000000080008008,
}

// keccakF1600 applies the Keccak permutation to a 1600b-wide
// state represented as a slice of 25 uint64s.
// This is copied directly from golang.org/x/crypto/sha3/keccakf.go
func KeccakF1600(a *[25]uint64) {
	// Implementation translated from Keccak-inplace.c
	// in the keccak reference code.
29 | var t, bc0, bc1, bc2, bc3, bc4, d0, d1, d2, d3, d4 uint64 30 | 31 | for i := 0; i < RC_LEN; i += 4 { 32 | // Combines the 5 steps in each round into 2 steps. 33 | // Unrolls 4 rounds per loop and spreads some steps across rounds. 34 | 35 | // Round 1 36 | bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20] 37 | bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21] 38 | bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22] 39 | bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23] 40 | bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24] 41 | d0 = bc4 ^ (bc1<<1 | bc1>>63) 42 | d1 = bc0 ^ (bc2<<1 | bc2>>63) 43 | d2 = bc1 ^ (bc3<<1 | bc3>>63) 44 | d3 = bc2 ^ (bc4<<1 | bc4>>63) 45 | d4 = bc3 ^ (bc0<<1 | bc0>>63) 46 | 47 | bc0 = a[0] ^ d0 48 | t = a[6] ^ d1 49 | bc1 = bits.RotateLeft64(t, 44) 50 | t = a[12] ^ d2 51 | bc2 = bits.RotateLeft64(t, 43) 52 | t = a[18] ^ d3 53 | bc3 = bits.RotateLeft64(t, 21) 54 | t = a[24] ^ d4 55 | bc4 = bits.RotateLeft64(t, 14) 56 | a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i] 57 | a[6] = bc1 ^ (bc3 &^ bc2) 58 | a[12] = bc2 ^ (bc4 &^ bc3) 59 | a[18] = bc3 ^ (bc0 &^ bc4) 60 | a[24] = bc4 ^ (bc1 &^ bc0) 61 | 62 | t = a[10] ^ d0 63 | bc2 = bits.RotateLeft64(t, 3) 64 | t = a[16] ^ d1 65 | bc3 = bits.RotateLeft64(t, 45) 66 | t = a[22] ^ d2 67 | bc4 = bits.RotateLeft64(t, 61) 68 | t = a[3] ^ d3 69 | bc0 = bits.RotateLeft64(t, 28) 70 | t = a[9] ^ d4 71 | bc1 = bits.RotateLeft64(t, 20) 72 | a[10] = bc0 ^ (bc2 &^ bc1) 73 | a[16] = bc1 ^ (bc3 &^ bc2) 74 | a[22] = bc2 ^ (bc4 &^ bc3) 75 | a[3] = bc3 ^ (bc0 &^ bc4) 76 | a[9] = bc4 ^ (bc1 &^ bc0) 77 | 78 | t = a[20] ^ d0 79 | bc4 = bits.RotateLeft64(t, 18) 80 | t = a[1] ^ d1 81 | bc0 = bits.RotateLeft64(t, 1) 82 | t = a[7] ^ d2 83 | bc1 = bits.RotateLeft64(t, 6) 84 | t = a[13] ^ d3 85 | bc2 = bits.RotateLeft64(t, 25) 86 | t = a[19] ^ d4 87 | bc3 = bits.RotateLeft64(t, 8) 88 | a[20] = bc0 ^ (bc2 &^ bc1) 89 | a[1] = bc1 ^ (bc3 &^ bc2) 90 | a[7] = bc2 ^ (bc4 &^ bc3) 91 | a[13] = bc3 ^ (bc0 &^ bc4) 92 | a[19] = bc4 ^ (bc1 &^ bc0) 93 | 94 | t = a[5] ^ d0 95 | bc1 = 
bits.RotateLeft64(t, 36) 96 | t = a[11] ^ d1 97 | bc2 = bits.RotateLeft64(t, 10) 98 | t = a[17] ^ d2 99 | bc3 = bits.RotateLeft64(t, 15) 100 | t = a[23] ^ d3 101 | bc4 = bits.RotateLeft64(t, 56) 102 | t = a[4] ^ d4 103 | bc0 = bits.RotateLeft64(t, 27) 104 | a[5] = bc0 ^ (bc2 &^ bc1) 105 | a[11] = bc1 ^ (bc3 &^ bc2) 106 | a[17] = bc2 ^ (bc4 &^ bc3) 107 | a[23] = bc3 ^ (bc0 &^ bc4) 108 | a[4] = bc4 ^ (bc1 &^ bc0) 109 | 110 | t = a[15] ^ d0 111 | bc3 = bits.RotateLeft64(t, 41) 112 | t = a[21] ^ d1 113 | bc4 = bits.RotateLeft64(t, 2) 114 | t = a[2] ^ d2 115 | bc0 = bits.RotateLeft64(t, 62) 116 | t = a[8] ^ d3 117 | bc1 = bits.RotateLeft64(t, 55) 118 | t = a[14] ^ d4 119 | bc2 = bits.RotateLeft64(t, 39) 120 | a[15] = bc0 ^ (bc2 &^ bc1) 121 | a[21] = bc1 ^ (bc3 &^ bc2) 122 | a[2] = bc2 ^ (bc4 &^ bc3) 123 | a[8] = bc3 ^ (bc0 &^ bc4) 124 | a[14] = bc4 ^ (bc1 &^ bc0) 125 | 126 | // Round 2 127 | bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20] 128 | bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21] 129 | bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22] 130 | bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23] 131 | bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24] 132 | d0 = bc4 ^ (bc1<<1 | bc1>>63) 133 | d1 = bc0 ^ (bc2<<1 | bc2>>63) 134 | d2 = bc1 ^ (bc3<<1 | bc3>>63) 135 | d3 = bc2 ^ (bc4<<1 | bc4>>63) 136 | d4 = bc3 ^ (bc0<<1 | bc0>>63) 137 | 138 | bc0 = a[0] ^ d0 139 | t = a[16] ^ d1 140 | bc1 = bits.RotateLeft64(t, 44) 141 | t = a[7] ^ d2 142 | bc2 = bits.RotateLeft64(t, 43) 143 | t = a[23] ^ d3 144 | bc3 = bits.RotateLeft64(t, 21) 145 | t = a[14] ^ d4 146 | bc4 = bits.RotateLeft64(t, 14) 147 | a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+1] 148 | a[16] = bc1 ^ (bc3 &^ bc2) 149 | a[7] = bc2 ^ (bc4 &^ bc3) 150 | a[23] = bc3 ^ (bc0 &^ bc4) 151 | a[14] = bc4 ^ (bc1 &^ bc0) 152 | 153 | t = a[20] ^ d0 154 | bc2 = bits.RotateLeft64(t, 3) 155 | t = a[11] ^ d1 156 | bc3 = bits.RotateLeft64(t, 45) 157 | t = a[2] ^ d2 158 | bc4 = bits.RotateLeft64(t, 61) 159 | t = a[18] ^ d3 160 | bc0 = bits.RotateLeft64(t, 28) 161 | t = a[9] ^ 
d4 162 | bc1 = bits.RotateLeft64(t, 20) 163 | a[20] = bc0 ^ (bc2 &^ bc1) 164 | a[11] = bc1 ^ (bc3 &^ bc2) 165 | a[2] = bc2 ^ (bc4 &^ bc3) 166 | a[18] = bc3 ^ (bc0 &^ bc4) 167 | a[9] = bc4 ^ (bc1 &^ bc0) 168 | 169 | t = a[15] ^ d0 170 | bc4 = bits.RotateLeft64(t, 18) 171 | t = a[6] ^ d1 172 | bc0 = bits.RotateLeft64(t, 1) 173 | t = a[22] ^ d2 174 | bc1 = bits.RotateLeft64(t, 6) 175 | t = a[13] ^ d3 176 | bc2 = bits.RotateLeft64(t, 25) 177 | t = a[4] ^ d4 178 | bc3 = bits.RotateLeft64(t, 8) 179 | a[15] = bc0 ^ (bc2 &^ bc1) 180 | a[6] = bc1 ^ (bc3 &^ bc2) 181 | a[22] = bc2 ^ (bc4 &^ bc3) 182 | a[13] = bc3 ^ (bc0 &^ bc4) 183 | a[4] = bc4 ^ (bc1 &^ bc0) 184 | 185 | t = a[10] ^ d0 186 | bc1 = bits.RotateLeft64(t, 36) 187 | t = a[1] ^ d1 188 | bc2 = bits.RotateLeft64(t, 10) 189 | t = a[17] ^ d2 190 | bc3 = bits.RotateLeft64(t, 15) 191 | t = a[8] ^ d3 192 | bc4 = bits.RotateLeft64(t, 56) 193 | t = a[24] ^ d4 194 | bc0 = bits.RotateLeft64(t, 27) 195 | a[10] = bc0 ^ (bc2 &^ bc1) 196 | a[1] = bc1 ^ (bc3 &^ bc2) 197 | a[17] = bc2 ^ (bc4 &^ bc3) 198 | a[8] = bc3 ^ (bc0 &^ bc4) 199 | a[24] = bc4 ^ (bc1 &^ bc0) 200 | 201 | t = a[5] ^ d0 202 | bc3 = bits.RotateLeft64(t, 41) 203 | t = a[21] ^ d1 204 | bc4 = bits.RotateLeft64(t, 2) 205 | t = a[12] ^ d2 206 | bc0 = bits.RotateLeft64(t, 62) 207 | t = a[3] ^ d3 208 | bc1 = bits.RotateLeft64(t, 55) 209 | t = a[19] ^ d4 210 | bc2 = bits.RotateLeft64(t, 39) 211 | a[5] = bc0 ^ (bc2 &^ bc1) 212 | a[21] = bc1 ^ (bc3 &^ bc2) 213 | a[12] = bc2 ^ (bc4 &^ bc3) 214 | a[3] = bc3 ^ (bc0 &^ bc4) 215 | a[19] = bc4 ^ (bc1 &^ bc0) 216 | 217 | // Round 3 218 | bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20] 219 | bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21] 220 | bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22] 221 | bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23] 222 | bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24] 223 | d0 = bc4 ^ (bc1<<1 | bc1>>63) 224 | d1 = bc0 ^ (bc2<<1 | bc2>>63) 225 | d2 = bc1 ^ (bc3<<1 | bc3>>63) 226 | d3 = bc2 ^ (bc4<<1 | bc4>>63) 227 | d4 = bc3 ^ 
(bc0<<1 | bc0>>63) 228 | 229 | bc0 = a[0] ^ d0 230 | t = a[11] ^ d1 231 | bc1 = bits.RotateLeft64(t, 44) 232 | t = a[22] ^ d2 233 | bc2 = bits.RotateLeft64(t, 43) 234 | t = a[8] ^ d3 235 | bc3 = bits.RotateLeft64(t, 21) 236 | t = a[19] ^ d4 237 | bc4 = bits.RotateLeft64(t, 14) 238 | a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+2] 239 | a[11] = bc1 ^ (bc3 &^ bc2) 240 | a[22] = bc2 ^ (bc4 &^ bc3) 241 | a[8] = bc3 ^ (bc0 &^ bc4) 242 | a[19] = bc4 ^ (bc1 &^ bc0) 243 | 244 | t = a[15] ^ d0 245 | bc2 = bits.RotateLeft64(t, 3) 246 | t = a[1] ^ d1 247 | bc3 = bits.RotateLeft64(t, 45) 248 | t = a[12] ^ d2 249 | bc4 = bits.RotateLeft64(t, 61) 250 | t = a[23] ^ d3 251 | bc0 = bits.RotateLeft64(t, 28) 252 | t = a[9] ^ d4 253 | bc1 = bits.RotateLeft64(t, 20) 254 | a[15] = bc0 ^ (bc2 &^ bc1) 255 | a[1] = bc1 ^ (bc3 &^ bc2) 256 | a[12] = bc2 ^ (bc4 &^ bc3) 257 | a[23] = bc3 ^ (bc0 &^ bc4) 258 | a[9] = bc4 ^ (bc1 &^ bc0) 259 | 260 | t = a[5] ^ d0 261 | bc4 = bits.RotateLeft64(t, 18) 262 | t = a[16] ^ d1 263 | bc0 = bits.RotateLeft64(t, 1) 264 | t = a[2] ^ d2 265 | bc1 = bits.RotateLeft64(t, 6) 266 | t = a[13] ^ d3 267 | bc2 = bits.RotateLeft64(t, 25) 268 | t = a[24] ^ d4 269 | bc3 = bits.RotateLeft64(t, 8) 270 | a[5] = bc0 ^ (bc2 &^ bc1) 271 | a[16] = bc1 ^ (bc3 &^ bc2) 272 | a[2] = bc2 ^ (bc4 &^ bc3) 273 | a[13] = bc3 ^ (bc0 &^ bc4) 274 | a[24] = bc4 ^ (bc1 &^ bc0) 275 | 276 | t = a[20] ^ d0 277 | bc1 = bits.RotateLeft64(t, 36) 278 | t = a[6] ^ d1 279 | bc2 = bits.RotateLeft64(t, 10) 280 | t = a[17] ^ d2 281 | bc3 = bits.RotateLeft64(t, 15) 282 | t = a[3] ^ d3 283 | bc4 = bits.RotateLeft64(t, 56) 284 | t = a[14] ^ d4 285 | bc0 = bits.RotateLeft64(t, 27) 286 | a[20] = bc0 ^ (bc2 &^ bc1) 287 | a[6] = bc1 ^ (bc3 &^ bc2) 288 | a[17] = bc2 ^ (bc4 &^ bc3) 289 | a[3] = bc3 ^ (bc0 &^ bc4) 290 | a[14] = bc4 ^ (bc1 &^ bc0) 291 | 292 | t = a[10] ^ d0 293 | bc3 = bits.RotateLeft64(t, 41) 294 | t = a[21] ^ d1 295 | bc4 = bits.RotateLeft64(t, 2) 296 | t = a[7] ^ d2 297 | bc0 = bits.RotateLeft64(t, 62) 298 
| t = a[18] ^ d3 299 | bc1 = bits.RotateLeft64(t, 55) 300 | t = a[4] ^ d4 301 | bc2 = bits.RotateLeft64(t, 39) 302 | a[10] = bc0 ^ (bc2 &^ bc1) 303 | a[21] = bc1 ^ (bc3 &^ bc2) 304 | a[7] = bc2 ^ (bc4 &^ bc3) 305 | a[18] = bc3 ^ (bc0 &^ bc4) 306 | a[4] = bc4 ^ (bc1 &^ bc0) 307 | 308 | // Round 4 309 | bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20] 310 | bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21] 311 | bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22] 312 | bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23] 313 | bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24] 314 | d0 = bc4 ^ (bc1<<1 | bc1>>63) 315 | d1 = bc0 ^ (bc2<<1 | bc2>>63) 316 | d2 = bc1 ^ (bc3<<1 | bc3>>63) 317 | d3 = bc2 ^ (bc4<<1 | bc4>>63) 318 | d4 = bc3 ^ (bc0<<1 | bc0>>63) 319 | 320 | bc0 = a[0] ^ d0 321 | t = a[1] ^ d1 322 | bc1 = bits.RotateLeft64(t, 44) 323 | t = a[2] ^ d2 324 | bc2 = bits.RotateLeft64(t, 43) 325 | t = a[3] ^ d3 326 | bc3 = bits.RotateLeft64(t, 21) 327 | t = a[4] ^ d4 328 | bc4 = bits.RotateLeft64(t, 14) 329 | a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+3] 330 | a[1] = bc1 ^ (bc3 &^ bc2) 331 | a[2] = bc2 ^ (bc4 &^ bc3) 332 | a[3] = bc3 ^ (bc0 &^ bc4) 333 | a[4] = bc4 ^ (bc1 &^ bc0) 334 | 335 | t = a[5] ^ d0 336 | bc2 = bits.RotateLeft64(t, 3) 337 | t = a[6] ^ d1 338 | bc3 = bits.RotateLeft64(t, 45) 339 | t = a[7] ^ d2 340 | bc4 = bits.RotateLeft64(t, 61) 341 | t = a[8] ^ d3 342 | bc0 = bits.RotateLeft64(t, 28) 343 | t = a[9] ^ d4 344 | bc1 = bits.RotateLeft64(t, 20) 345 | a[5] = bc0 ^ (bc2 &^ bc1) 346 | a[6] = bc1 ^ (bc3 &^ bc2) 347 | a[7] = bc2 ^ (bc4 &^ bc3) 348 | a[8] = bc3 ^ (bc0 &^ bc4) 349 | a[9] = bc4 ^ (bc1 &^ bc0) 350 | 351 | t = a[10] ^ d0 352 | bc4 = bits.RotateLeft64(t, 18) 353 | t = a[11] ^ d1 354 | bc0 = bits.RotateLeft64(t, 1) 355 | t = a[12] ^ d2 356 | bc1 = bits.RotateLeft64(t, 6) 357 | t = a[13] ^ d3 358 | bc2 = bits.RotateLeft64(t, 25) 359 | t = a[14] ^ d4 360 | bc3 = bits.RotateLeft64(t, 8) 361 | a[10] = bc0 ^ (bc2 &^ bc1) 362 | a[11] = bc1 ^ (bc3 &^ bc2) 363 | a[12] = bc2 ^ (bc4 &^ bc3) 364 | a[13] = 
bc3 ^ (bc0 &^ bc4) 365 | a[14] = bc4 ^ (bc1 &^ bc0) 366 | 367 | t = a[15] ^ d0 368 | bc1 = bits.RotateLeft64(t, 36) 369 | t = a[16] ^ d1 370 | bc2 = bits.RotateLeft64(t, 10) 371 | t = a[17] ^ d2 372 | bc3 = bits.RotateLeft64(t, 15) 373 | t = a[18] ^ d3 374 | bc4 = bits.RotateLeft64(t, 56) 375 | t = a[19] ^ d4 376 | bc0 = bits.RotateLeft64(t, 27) 377 | a[15] = bc0 ^ (bc2 &^ bc1) 378 | a[16] = bc1 ^ (bc3 &^ bc2) 379 | a[17] = bc2 ^ (bc4 &^ bc3) 380 | a[18] = bc3 ^ (bc0 &^ bc4) 381 | a[19] = bc4 ^ (bc1 &^ bc0) 382 | 383 | t = a[20] ^ d0 384 | bc3 = bits.RotateLeft64(t, 41) 385 | t = a[21] ^ d1 386 | bc4 = bits.RotateLeft64(t, 2) 387 | t = a[22] ^ d2 388 | bc0 = bits.RotateLeft64(t, 62) 389 | t = a[23] ^ d3 390 | bc1 = bits.RotateLeft64(t, 55) 391 | t = a[24] ^ d4 392 | bc2 = bits.RotateLeft64(t, 39) 393 | a[20] = bc0 ^ (bc2 &^ bc1) 394 | a[21] = bc1 ^ (bc3 &^ bc2) 395 | a[22] = bc2 ^ (bc4 &^ bc3) 396 | a[23] = bc3 ^ (bc0 &^ bc4) 397 | a[24] = bc4 ^ (bc1 &^ bc0) 398 | } 399 | } 400 | -------------------------------------------------------------------------------- /C/xelis_hash_v2.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "BLAKE3/c/blake3.h" 12 | #include "ChaCha20-SIMD/chacha20.h" 13 | 14 | #define INPUT_LEN (112) 15 | #define MEMSIZE (429 * 128) 16 | #define ITERS (3) 17 | #define HASH_SIZE (32) 18 | #define CHUNK_SIZE (32) 19 | #define NONCE_SIZE (12) 20 | #define OUTPUT_SIZE (MEMSIZE * 8) 21 | #define CHUNKS (4) 22 | #define INPUT_LEN (112) 23 | 24 | static inline void blake3(const uint8_t *input, int len, uint8_t *output) 25 | { 26 | blake3_hasher hasher; 27 | blake3_hasher_init(&hasher); 28 | blake3_hasher_update(&hasher, input, len); 29 | blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); 30 | } 31 | 32 | 33 | void stage1(const uint8_t *input, size_t input_len, 
uint8_t scratch_pad[OUTPUT_SIZE]) 34 | { 35 | uint8_t key[CHUNK_SIZE * CHUNKS] = {0}; 36 | uint8_t input_hash[HASH_SIZE]; 37 | uint8_t buffer[CHUNK_SIZE * 2]; 38 | memcpy(key, input, INPUT_LEN); 39 | blake3(input, INPUT_LEN, buffer); 40 | 41 | uint8_t *t = scratch_pad; 42 | 43 | memcpy(buffer + CHUNK_SIZE, key + 0 * CHUNK_SIZE, CHUNK_SIZE); 44 | blake3(buffer, CHUNK_SIZE * 2, input_hash); 45 | chacha_encrypt(input_hash, buffer, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 46 | 47 | t += OUTPUT_SIZE / CHUNKS; 48 | memcpy(buffer, input_hash, CHUNK_SIZE); 49 | memcpy(buffer + CHUNK_SIZE, key + 1 * CHUNK_SIZE, CHUNK_SIZE); 50 | blake3(buffer, CHUNK_SIZE * 2, input_hash); 51 | chacha_encrypt(input_hash, t - NONCE_SIZE, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 52 | 53 | t += OUTPUT_SIZE / CHUNKS; 54 | memcpy(buffer, input_hash, CHUNK_SIZE); 55 | memcpy(buffer + CHUNK_SIZE, key + 2 * CHUNK_SIZE, CHUNK_SIZE); 56 | blake3(buffer, CHUNK_SIZE * 2, input_hash); 57 | chacha_encrypt(input_hash, t - NONCE_SIZE, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 58 | 59 | t += OUTPUT_SIZE / CHUNKS; 60 | memcpy(buffer, input_hash, CHUNK_SIZE); 61 | memcpy(buffer + CHUNK_SIZE, key + 3 * CHUNK_SIZE, CHUNK_SIZE); 62 | blake3(buffer, CHUNK_SIZE * 2, input_hash); 63 | chacha_encrypt(input_hash, t - NONCE_SIZE, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 64 | } 65 | 66 | #define KEY "xelishash-pow-v2" 67 | #define BUFSIZE (MEMSIZE / 2) 68 | 69 | // https://danlark.org/2020/06/14/128-bit-division 70 | static inline uint64_t Divide128Div64To64(uint64_t high, uint64_t low, uint64_t divisor, uint64_t *remainder) 71 | { 72 | uint64_t result; 73 | __asm__("divq %[v]" 74 | : "=a"(result), "=d"(*remainder) // Output parametrs, =a for rax, =d for rdx, [v] is an 75 | // alias for divisor, input paramters "a" and "d" for low and high. 
76 | : [v] "r"(divisor), "a"(low), "d"(high)); 77 | return result; 78 | } 79 | 80 | static inline uint64_t udiv(uint64_t high, uint64_t low, uint64_t divisor) 81 | { 82 | uint64_t remainder; 83 | 84 | if (high < divisor) 85 | { 86 | return Divide128Div64To64(high, low, divisor, &remainder); 87 | } 88 | else 89 | { 90 | uint64_t qhi = Divide128Div64To64(0, high, divisor, &high); 91 | return Divide128Div64To64(high, low, divisor, &remainder); 92 | } 93 | } 94 | 95 | static inline uint64_t ROTR(uint64_t x, uint32_t r) 96 | { 97 | asm("rorq %%cl, %0" : "+r"(x) : "c"(r)); 98 | return x; 99 | } 100 | 101 | static inline uint64_t ROTL(uint64_t x, uint32_t r) 102 | { 103 | asm("rolq %%cl, %0" : "+r"(x) : "c"(r)); 104 | return x; 105 | } 106 | 107 | static inline __uint128_t combine_uint64(uint64_t high, uint64_t low) 108 | { 109 | return ((__uint128_t)high << 64) | low; 110 | } 111 | 112 | /* 113 | uint64_t isqrt(uint64_t n) { 114 | if (n < 2) 115 | return n; 116 | 117 | uint64_t x = n; 118 | uint64_t y = (x + 1) >> 1; 119 | 120 | while (y < x) { 121 | x = y; 122 | y = (x + n / x) >> 1; 123 | } 124 | 125 | return x; 126 | } 127 | */ 128 | 129 | uint64_t isqrt(uint64_t n) 130 | { 131 | if (n < 2) 132 | return n; 133 | 134 | uint64_t x = n; 135 | uint64_t result = 0; 136 | uint64_t bit = (uint64_t)1 << 62; // The second-to-top bit is set 137 | 138 | // "bit" starts at the highest power of four <= the argument. 
139 | while (bit > x) 140 | bit >>= 2; 141 | 142 | while (bit != 0) 143 | { 144 | if (x >= result + bit) 145 | { 146 | x -= result + bit; 147 | result = (result >> 1) + bit; 148 | } 149 | else 150 | { 151 | result >>= 1; 152 | } 153 | bit >>= 2; 154 | } 155 | 156 | return result; 157 | } 158 | 159 | void static inline uint64_to_le_bytes(uint64_t value, uint8_t *bytes) 160 | { 161 | for (int i = 0; i < 8; i++) 162 | { 163 | bytes[i] = value & 0xFF; 164 | value >>= 8; 165 | } 166 | } 167 | 168 | uint64_t static inline le_bytes_to_uint64(const uint8_t *bytes) 169 | { 170 | uint64_t value = 0; 171 | for (int i = 7; i >= 0; i--) 172 | value = (value << 8) | bytes[i]; 173 | return value; 174 | } 175 | 176 | void static inline aes_single_round(uint8_t *block, const uint8_t *key) 177 | { 178 | __m128i block_vec = _mm_loadu_si128((const __m128i *)block); 179 | __m128i key_vec = _mm_loadu_si128((const __m128i *)key); 180 | 181 | // Perform single AES encryption round 182 | block_vec = _mm_aesenc_si128(block_vec, key_vec); 183 | 184 | _mm_storeu_si128((__m128i *)block, block_vec); 185 | } 186 | 187 | void stage3(uint64_t *scratch) 188 | { 189 | uint64_t *mem_buffer_a = scratch; 190 | uint64_t *mem_buffer_b = &scratch[BUFSIZE]; 191 | 192 | uint64_t addr_a = mem_buffer_b[BUFSIZE - 1]; 193 | uint64_t addr_b = mem_buffer_a[BUFSIZE - 1] >> 32; 194 | uint32_t r = 0; 195 | 196 | for (uint32_t i = 0; i < ITERS; i++) 197 | { 198 | uint64_t mem_a = mem_buffer_a[addr_a % BUFSIZE]; 199 | uint64_t mem_b = mem_buffer_b[addr_b % BUFSIZE]; 200 | 201 | uint8_t block[16]; 202 | uint64_to_le_bytes(mem_b, block); 203 | uint64_to_le_bytes(mem_a, block + 8); 204 | aes_single_round(block, KEY); 205 | 206 | uint64_t hash1 = le_bytes_to_uint64(block); 207 | uint64_t hash2 = mem_a ^ mem_b; 208 | uint64_t result = ~(hash1 ^ hash2); 209 | 210 | for (uint32_t j = 0; j < BUFSIZE; j++) 211 | { 212 | uint64_t a = mem_buffer_a[result % BUFSIZE]; 213 | uint64_t b = mem_buffer_b[~ROTR(result, r) % BUFSIZE]; 
214 | uint64_t c = (r < BUFSIZE) ? mem_buffer_a[r] : mem_buffer_b[r - BUFSIZE]; 215 | r = (r < MEMSIZE - 1) ? r + 1 : 0; 216 | 217 | uint64_t v; 218 | __uint128_t t1, t2; 219 | switch (ROTL(result, (uint32_t)c) & 0xf) 220 | { 221 | case 0: 222 | v = ROTL(c, i * j) ^ b; 223 | break; 224 | case 1: 225 | v = ROTR(c, i * j) ^ a; 226 | break; 227 | case 2: 228 | v = a ^ b ^ c; 229 | break; 230 | case 3: 231 | v = ((a + b) * c); 232 | break; 233 | case 4: 234 | v = ((b - c) * a); 235 | break; 236 | case 5: 237 | v = (c - a + b); 238 | break; 239 | case 6: 240 | v = (a - b + c); 241 | break; 242 | case 7: 243 | v = (b * c + a); 244 | break; 245 | case 8: 246 | v = (c * a + b); 247 | break; 248 | case 9: 249 | v = (a * b * c); 250 | break; 251 | case 10: 252 | { 253 | t1 = combine_uint64(a, b); 254 | uint64_t t2 = c | 1; 255 | v = t1 % t2; 256 | } 257 | break; 258 | case 11: 259 | { 260 | t1 = combine_uint64(b, c); 261 | t2 = combine_uint64(ROTL(result, r), a | 2); 262 | v = (t2 > t1) ? c : t1 % t2; 263 | } 264 | break; 265 | case 12: 266 | v = udiv(c, a, b | 4); 267 | break; 268 | case 13: 269 | { 270 | t1 = combine_uint64(ROTL(result, r), b); 271 | t2 = combine_uint64(a, c | 8); 272 | v = (t1 > t2) ? 
t1 / t2 : a ^ b; 273 | } 274 | break; 275 | case 14: 276 | { 277 | t1 = combine_uint64(b, a); 278 | uint64_t t2 = c; 279 | v = (t1 * t2) >> 64; 280 | } 281 | break; 282 | case 15: 283 | { 284 | t1 = combine_uint64(a, c); 285 | t2 = combine_uint64(ROTR(result, r), b); 286 | v = (t1 * t2) >> 64; 287 | } 288 | break; 289 | } 290 | result = ROTL(result ^ v, 1); 291 | 292 | uint64_t t = mem_buffer_a[BUFSIZE - j - 1] ^ result; 293 | mem_buffer_a[BUFSIZE - j - 1] = t; 294 | mem_buffer_b[j] ^= ROTR(t, result); 295 | } 296 | addr_a = result; 297 | addr_b = isqrt(result); 298 | } 299 | } 300 | 301 | int xelis_hash_v2_init() 302 | { 303 | // return sodium_init(); 304 | } 305 | 306 | void xelis_hash_v2(uint8_t in[INPUT_LEN], uint8_t hash[HASH_SIZE], uint64_t scratch[MEMSIZE]) 307 | { 308 | uint8_t *scratch_uint8 = (uint8_t *)scratch; 309 | 310 | stage1(in, INPUT_LEN, scratch_uint8); 311 | stage3(scratch); 312 | blake3(scratch_uint8, OUTPUT_SIZE, hash); 313 | } 314 | 315 | double display_time(const char *stage, struct timespec start, struct timespec end, int iterations) 316 | { 317 | uint64_t total_time = (end.tv_sec - start.tv_sec) * 1000000000ULL + (end.tv_nsec - start.tv_nsec); 318 | double time_per = (double)total_time / iterations; 319 | printf("%s: %.3f ms\n", stage, time_per / 1000000.0); 320 | return time_per; 321 | } 322 | 323 | void timing_test(int N) 324 | { 325 | uint8_t hash[HASH_SIZE]; 326 | struct timespec start, end; 327 | double time_per, time_sum = 0; 328 | 329 | uint8_t *input = (uint8_t *)calloc(INPUT_LEN, sizeof(uint8_t)); 330 | uint64_t *scratch = (uint64_t *)calloc(MEMSIZE, sizeof(uint64_t)); 331 | uint8_t *scratch_uint8 = (uint8_t *)scratch; 332 | 333 | xelis_hash_v2_init(); 334 | 335 | printf("Timing:\n"); 336 | clock_gettime(CLOCK_MONOTONIC, &start); 337 | for (int i = 0; i < N; i++) 338 | stage1(input, INPUT_LEN, scratch_uint8); 339 | clock_gettime(CLOCK_MONOTONIC, &end); 340 | time_sum += display_time("stage1", start, end, N); 341 | 342 | 
clock_gettime(CLOCK_MONOTONIC, &start); 343 | for (int i = 0; i < N; i++) 344 | stage3(scratch); 345 | clock_gettime(CLOCK_MONOTONIC, &end); 346 | time_sum += display_time("stage3", start, end, N); 347 | 348 | clock_gettime(CLOCK_MONOTONIC, &start); 349 | for (int i = 0; i < N; i++) 350 | blake3(scratch_uint8, OUTPUT_SIZE, hash); 351 | clock_gettime(CLOCK_MONOTONIC, &end); 352 | time_sum += display_time("stage4", start, end, N); 353 | 354 | printf("Total: %.3f ms (%d avg)\n", time_sum / 1000000.0, N); 355 | 356 | // verify output 357 | uint8_t gold[HASH_SIZE] = { 358 | 126, 219, 112, 240, 116, 133, 115, 359 | 144, 39, 40, 164, 105, 30, 158, 45, 360 | 126, 64, 67, 238, 52, 200, 35, 161, 19, 361 | 144, 211, 214, 225, 95, 190, 146, 27}; 362 | 363 | xelis_hash_v2(input, hash, scratch); 364 | if (memcmp(gold, hash, HASH_SIZE)) 365 | printf("Failed!\n"); 366 | else 367 | printf("Passed!\n"); 368 | 369 | free(input); 370 | free(scratch); 371 | } 372 | 373 | typedef struct 374 | { 375 | int thread_id; 376 | int iterations; 377 | uint8_t *input; 378 | uint64_t *scratch; 379 | uint8_t *hash; 380 | } thread_data_t; 381 | 382 | void set_thread_affinity(int thread_id) 383 | { 384 | cpu_set_t cpuset; 385 | CPU_ZERO(&cpuset); 386 | CPU_SET(thread_id % sysconf(_SC_NPROCESSORS_ONLN), &cpuset); 387 | int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); 388 | if (rc != 0) 389 | { 390 | fprintf(stderr, "Error: Unable to set CPU affinity for thread %d\n", thread_id); 391 | } 392 | } 393 | 394 | void *hash_thread(void *arg) 395 | { 396 | thread_data_t *data = (thread_data_t *)arg; 397 | // set_thread_affinity(data->thread_id); 398 | 399 | for (int i = 0; i < data->iterations; ++i) 400 | xelis_hash_v2(data->input, data->hash, data->scratch); 401 | 402 | pthread_exit(NULL); 403 | } 404 | 405 | void hash_test(int t, int i) 406 | { 407 | pthread_t *threads; 408 | thread_data_t *thread_data; 409 | 410 | xelis_hash_v2_init(); 411 | 412 | printf("\n%-10s %-15s %-10s\n", 
"Threads", "Hashes", "Hash/s"); 413 | for (int tc = 1; tc <= t; ++tc) 414 | { 415 | threads = (pthread_t *)malloc(tc * sizeof(pthread_t)); 416 | thread_data = (thread_data_t *)malloc(tc * sizeof(thread_data_t)); 417 | struct timespec start, end; 418 | 419 | clock_gettime(CLOCK_REALTIME, &start); 420 | for (int j = 0; j < tc; ++j) 421 | { 422 | thread_data[j].thread_id = j; 423 | thread_data[j].iterations = i; 424 | thread_data[j].input = (uint8_t *)calloc(INPUT_LEN, sizeof(uint8_t)); 425 | thread_data[j].scratch = (uint64_t *)calloc(MEMSIZE, sizeof(uint64_t)); 426 | thread_data[j].hash = (uint8_t *)calloc(HASH_SIZE, sizeof(uint8_t)); 427 | pthread_create(&threads[j], NULL, hash_thread, (void *)&thread_data[j]); 428 | } 429 | 430 | for (int j = 0; j < tc; ++j) 431 | { 432 | pthread_join(threads[j], NULL); 433 | free(thread_data[j].input); 434 | free(thread_data[j].scratch); 435 | free(thread_data[j].hash); 436 | } 437 | 438 | clock_gettime(CLOCK_REALTIME, &end); 439 | 440 | double time_taken = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9; 441 | double hashes_per_second = (double)(tc * i) / time_taken; 442 | printf("%-10d %-15d %-10.2f\n", tc, i * tc, hashes_per_second); 443 | 444 | free(threads); 445 | free(thread_data); 446 | } 447 | } 448 | 449 | void print_usage(const char *prog_name) 450 | { 451 | printf("Usage: %s [-n iterations] [-t threads]\n", prog_name); 452 | printf(" -n iterations Number of iterations for tests\n"); 453 | printf(" -t threads Number of threads to test\n"); 454 | printf(" -h Show this help message\n"); 455 | } 456 | 457 | int main(int argc, char *argv[]) 458 | { 459 | int N = 1000, T = 8; 460 | int opt; 461 | 462 | while ((opt = getopt(argc, argv, "n:t:h")) != -1) 463 | { 464 | switch (opt) 465 | { 466 | case 'n': 467 | N = atoi(optarg); 468 | break; 469 | case 't': 470 | T = atoi(optarg); 471 | break; 472 | case 'h': 473 | print_usage(argv[0]); 474 | return 0; 475 | default: 476 | print_usage(argv[0]); 477 | return 1; 
478 | } 479 | } 480 | 481 | timing_test(N); 482 | hash_test(T, N); 483 | } 484 | -------------------------------------------------------------------------------- /C/xelis_hash_v3.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "BLAKE3/c/blake3.h" 12 | #include "ChaCha20-SIMD/chacha20.h" 13 | #include 14 | 15 | #define INPUT_LEN (112) 16 | #define MEMSIZE (531 * 128) 17 | #define ITERS (2) 18 | #define HASH_SIZE (32) 19 | 20 | static inline void blake3(const uint8_t *input, int len, uint8_t *output) { 21 | blake3_hasher hasher; 22 | blake3_hasher_init(&hasher); 23 | blake3_hasher_update(&hasher, input, len); 24 | blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); 25 | } 26 | 27 | #define CHUNK_SIZE (32) 28 | #define NONCE_SIZE (12) 29 | #define OUTPUT_SIZE (MEMSIZE * 8) 30 | #define CHUNKS (4) 31 | #define INPUT_LEN (112) 32 | 33 | void stage1(const uint8_t *input, size_t input_len, uint8_t scratch_pad[OUTPUT_SIZE]) { 34 | uint8_t key[CHUNK_SIZE * CHUNKS] = {0}; 35 | uint8_t input_hash[HASH_SIZE]; 36 | uint8_t buffer[CHUNK_SIZE * 2]; 37 | memcpy(key, input, INPUT_LEN); 38 | blake3(input, INPUT_LEN, buffer); 39 | 40 | uint8_t *t = scratch_pad; 41 | 42 | memcpy(buffer + CHUNK_SIZE, key + 0 * CHUNK_SIZE, CHUNK_SIZE); 43 | blake3(buffer, CHUNK_SIZE * 2, input_hash); 44 | chacha_encrypt(input_hash, buffer, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 45 | 46 | t += OUTPUT_SIZE / CHUNKS; 47 | memcpy(buffer, input_hash, CHUNK_SIZE); 48 | memcpy(buffer + CHUNK_SIZE, key + 1 * CHUNK_SIZE, CHUNK_SIZE); 49 | blake3(buffer, CHUNK_SIZE * 2, input_hash); 50 | chacha_encrypt(input_hash, t - NONCE_SIZE, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 51 | 52 | t += OUTPUT_SIZE / CHUNKS; 53 | memcpy(buffer, input_hash, CHUNK_SIZE); 54 | memcpy(buffer + CHUNK_SIZE, key + 2 * CHUNK_SIZE, CHUNK_SIZE); 55 | 
blake3(buffer, CHUNK_SIZE * 2, input_hash); 56 | chacha_encrypt(input_hash, t - NONCE_SIZE, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 57 | 58 | t += OUTPUT_SIZE / CHUNKS; 59 | memcpy(buffer, input_hash, CHUNK_SIZE); 60 | memcpy(buffer + CHUNK_SIZE, key + 3 * CHUNK_SIZE, CHUNK_SIZE); 61 | blake3(buffer, CHUNK_SIZE * 2, input_hash); 62 | chacha_encrypt(input_hash, t - NONCE_SIZE, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 63 | } 64 | 65 | #define KEY "xelishash-pow-v3" 66 | #define BUFSIZE (MEMSIZE / 2) 67 | 68 | // https://danlark.org/2020/06/14/128-bit-division 69 | static inline uint64_t Divide128Div64To64(uint64_t high, uint64_t low, uint64_t divisor, uint64_t *remainder) { 70 | uint64_t result; 71 | __asm__("divq %[v]" 72 | : "=a"(result), "=d"(*remainder) // Output parametrs, =a for rax, =d for rdx, [v] is an 73 | // alias for divisor, input paramters "a" and "d" for low and high. 74 | : [v] "r"(divisor), "a"(low), "d"(high)); 75 | return result; 76 | } 77 | 78 | static inline uint64_t udiv(uint64_t high, uint64_t low, uint64_t divisor) { 79 | uint64_t remainder; 80 | 81 | if (high < divisor) { 82 | return Divide128Div64To64(high, low, divisor, &remainder); 83 | } 84 | else { 85 | (void)Divide128Div64To64(0, high, divisor, &high); 86 | return Divide128Div64To64(high, low, divisor, &remainder); 87 | } 88 | } 89 | 90 | static inline uint64_t ROTR(uint64_t x, uint32_t r) { 91 | asm("rorq %%cl, %0" : "+r"(x) : "c"(r)); 92 | return x; 93 | } 94 | 95 | static inline uint64_t ROTL(uint64_t x, uint32_t r) { 96 | asm("rolq %%cl, %0" : "+r"(x) : "c"(r)); 97 | return x; 98 | } 99 | 100 | static inline __uint128_t combine_uint64(uint64_t high, uint64_t low) { 101 | return ((__uint128_t)high << 64) | low; 102 | } 103 | 104 | static inline uint64_t murmurhash3(uint64_t seed) { 105 | seed ^= seed >> 55; 106 | seed *= 0xff51afd7ed558ccdULL; 107 | seed ^= seed >> 32; 108 | seed *= 0xc4ceb9fe1a85ec53ULL; 109 | seed ^= seed >> 15; 110 | return seed; 111 | } 112 | 113 | static inline uint64_t 
map_index(uint64_t x) { 114 | x ^= x >> 33; 115 | x *= 0xff51afd7ed558ccdULL; 116 | return (uint64_t)(((__uint128_t)x * BUFSIZE) >> 64); 117 | } 118 | 119 | static inline int pick_half(uint64_t seed) { 120 | return (murmurhash3(seed) & (1ULL << 58)) != 0; 121 | } 122 | 123 | uint64_t isqrt(uint64_t n) { 124 | if (n < 2) 125 | return n; 126 | 127 | // Compute the floating-point square root 128 | uint64_t approx = (uint64_t)sqrt((double)n); 129 | 130 | // Verify and adjust if necessary 131 | if (approx * approx > n) { 132 | return approx - 1; 133 | } else if ((approx + 1) * (approx + 1) <= n) { 134 | return approx + 1; 135 | } else { 136 | return approx; 137 | } 138 | } 139 | 140 | uint64_t modular_power(uint64_t base, uint64_t exp, uint64_t mod) { 141 | uint64_t result = 1; 142 | base %= mod; // Ensure base is within the range of mod 143 | 144 | while (exp > 0) { 145 | // If exp is odd, multiply base with result 146 | if (exp & 1) { 147 | result = (uint64_t)(((__uint128_t)result * base) % mod); 148 | } 149 | 150 | // Square the base and reduce by mod 151 | base = (uint64_t)(((__uint128_t)base * base) % mod); 152 | exp /= 2; // Halve the exponent 153 | } 154 | 155 | return result; 156 | } 157 | 158 | void static inline uint64_to_le_bytes(uint64_t value, uint8_t *bytes) { 159 | for (int i = 0; i < 8; i++) { 160 | bytes[i] = value & 0xFF; 161 | value >>= 8; 162 | } 163 | } 164 | 165 | uint64_t static inline le_bytes_to_uint64(const uint8_t *bytes) { 166 | uint64_t value = 0; 167 | for (int i = 7; i >= 0; i--) 168 | value = (value << 8) | bytes[i]; 169 | return value; 170 | } 171 | 172 | void static inline aes_single_round(uint8_t *block, const uint8_t *key) { 173 | __m128i block_vec = _mm_loadu_si128((const __m128i *)block); 174 | __m128i key_vec = _mm_loadu_si128((const __m128i *)key); 175 | 176 | // Perform single AES encryption round 177 | block_vec = _mm_aesenc_si128(block_vec, key_vec); 178 | 179 | _mm_storeu_si128((__m128i *)block, block_vec); 180 | } 181 | 182 | 
void stage3(uint64_t *scratch) { 183 | uint64_t *mem_buffer_a = scratch; 184 | uint64_t *mem_buffer_b = &scratch[BUFSIZE]; 185 | 186 | uint64_t addr_a = mem_buffer_b[BUFSIZE - 1]; 187 | uint64_t addr_b = mem_buffer_a[BUFSIZE - 1] >> 32; 188 | uint32_t r = 0; 189 | 190 | for (uint32_t i = 0; i < ITERS; i++) { 191 | uint64_t mem_a = mem_buffer_a[map_index(addr_a)]; 192 | uint64_t mem_b = mem_buffer_b[map_index(mem_a ^ addr_b)]; 193 | 194 | uint8_t block[16]; 195 | uint64_to_le_bytes(mem_b, block); 196 | uint64_to_le_bytes(mem_a, block + 8); 197 | aes_single_round(block, KEY); 198 | 199 | uint64_t hash1 = le_bytes_to_uint64(block); 200 | uint64_t hash2 = le_bytes_to_uint64(block + 8); 201 | uint64_t result = ~(hash1 ^ hash2); 202 | 203 | for (uint32_t j = 0; j < BUFSIZE; j++) { 204 | uint64_t a = mem_buffer_a[map_index(result)]; 205 | uint64_t b = mem_buffer_b[map_index(a ^ ~ROTR(result, r))]; 206 | uint64_t c = (r < BUFSIZE) ? mem_buffer_a[r] : mem_buffer_b[r - BUFSIZE]; 207 | r = (r < MEMSIZE - 1) ? 
r + 1 : 0; 208 | 209 | uint64_t v; 210 | __uint128_t t1, t2; 211 | switch (ROTL(result, (uint32_t)c) & 0xf) { 212 | case 0: 213 | t1 = combine_uint64(a + i, isqrt(b + j)); 214 | uint64_t denom = murmurhash3(c ^ result ^ i ^ j) | 1; 215 | v = (uint64_t)(t1 % denom); 216 | break; 217 | case 1: 218 | v = ROTL((c + i) % isqrt(b | 2), i + j) * isqrt(a + j); 219 | break; 220 | case 2: 221 | v = (isqrt(a + i) * isqrt(c + j)) ^ (b + i + j); 222 | break; 223 | case 3: 224 | v = ((a + b) * c); 225 | break; 226 | case 4: 227 | v = ((b - c) * a); 228 | break; 229 | case 5: 230 | v = (c - a + b); 231 | break; 232 | case 6: 233 | v = (a - b + c); 234 | break; 235 | case 7: 236 | v = (b * c + a); 237 | break; 238 | case 8: 239 | v = (c * a + b); 240 | break; 241 | case 9: 242 | v = (a * b * c); 243 | break; 244 | case 10: 245 | t1 = combine_uint64(a, b); 246 | v = t1 % (c | 1); 247 | break; 248 | case 11: 249 | t1 = combine_uint64(b, c); 250 | t2 = combine_uint64(ROTL(result, r), a | 2); 251 | v = (t2 > t1) ? c : t1 % t2; 252 | break; 253 | case 12: 254 | v = udiv(c, a, b | 4); 255 | break; 256 | case 13: 257 | t1 = combine_uint64(ROTL(result, r), b); 258 | t2 = combine_uint64(a, c | 8); 259 | v = (t1 > t2) ? t1 / t2 : a ^ b; 260 | break; 261 | case 14: 262 | t1 = combine_uint64(b, a); 263 | v = (t1 * c) >> 64; 264 | break; 265 | case 15: 266 | t1 = combine_uint64(a, c); 267 | t2 = combine_uint64(ROTR(result, r), b); 268 | v = (t1 * t2) >> 64; 269 | break; 270 | } 271 | uint64_t idx_seed = v ^ result; 272 | result = ROTL(idx_seed, r); 273 | 274 | uint64_t use_buffer_b = pick_half(v); 275 | uint64_t idx_t = map_index(idx_seed); 276 | uint64_t t = (use_buffer_b ? 
mem_buffer_b[idx_t] : mem_buffer_a[idx_t]) ^ result; 277 | 278 | uint64_t idx_a = map_index(t ^ result ^ 0x9e3779b97f4a7c15); 279 | uint64_t idx_b = map_index(idx_a ^ ~result ^ 0xd2b74407b1ce6e93); 280 | 281 | uint64_t mem_a = mem_buffer_a[idx_a]; 282 | mem_buffer_a[idx_a] = t; 283 | mem_buffer_b[idx_b] ^= mem_a ^ ROTR(t, i + j); 284 | } 285 | 286 | addr_a = modular_power(addr_a, addr_b, result); 287 | addr_b = isqrt(result) * (r + 1) * isqrt(addr_a); 288 | } 289 | } 290 | 291 | int xelis_hash_v3_init() { 292 | // return sodium_init(); 293 | } 294 | 295 | void xelis_hash_v3(uint8_t in[INPUT_LEN], uint8_t hash[HASH_SIZE], uint64_t scratch[MEMSIZE]) { 296 | uint8_t *scratch_uint8 = (uint8_t *)scratch; 297 | 298 | stage1(in, INPUT_LEN, scratch_uint8); 299 | stage3(scratch); 300 | blake3(scratch_uint8, OUTPUT_SIZE, hash); 301 | } 302 | 303 | double display_time(const char *stage, struct timespec start, struct timespec end, int iterations) { 304 | uint64_t total_time = (end.tv_sec - start.tv_sec) * 1000000000ULL + (end.tv_nsec - start.tv_nsec); 305 | double time_per = (double)total_time / iterations; 306 | printf("%s: %.3f ms\n", stage, time_per / 1000000.0); 307 | return time_per; 308 | } 309 | 310 | void timing_test(int N) { 311 | uint8_t hash[HASH_SIZE]; 312 | struct timespec start, end; 313 | double time_per, time_sum = 0; 314 | 315 | uint8_t *input = (uint8_t *)calloc(INPUT_LEN, sizeof(uint8_t)); 316 | uint64_t *scratch = (uint64_t *)calloc(MEMSIZE, sizeof(uint64_t)); 317 | uint8_t *scratch_uint8 = (uint8_t *)scratch; 318 | 319 | xelis_hash_v3_init(); 320 | 321 | printf("Timing:\n"); 322 | clock_gettime(CLOCK_MONOTONIC, &start); 323 | for (int i = 0; i < N; i++) 324 | stage1(input, INPUT_LEN, scratch_uint8); 325 | clock_gettime(CLOCK_MONOTONIC, &end); 326 | time_sum += display_time("stage1", start, end, N); 327 | 328 | clock_gettime(CLOCK_MONOTONIC, &start); 329 | for (int i = 0; i < N; i++) 330 | stage3(scratch); 331 | clock_gettime(CLOCK_MONOTONIC, &end); 332 | 
time_sum += display_time("stage3", start, end, N); 333 | 334 | clock_gettime(CLOCK_MONOTONIC, &start); 335 | for (int i = 0; i < N; i++) 336 | blake3(scratch_uint8, OUTPUT_SIZE, hash); 337 | clock_gettime(CLOCK_MONOTONIC, &end); 338 | time_sum += display_time("stage4", start, end, N); 339 | 340 | printf("Total: %.3f ms (%d avg)\n", time_sum / 1000000.0, N); 341 | 342 | // verify output 343 | uint8_t gold[HASH_SIZE] = { 344 | 105, 172, 103, 40, 94, 253, 92, 162, 345 | 42, 252, 5, 196, 236, 238, 91, 218, 346 | 22, 157, 228, 233, 239, 8, 250, 57, 347 | 212, 166, 121, 132, 148, 205, 103, 163 348 | }; 349 | 350 | xelis_hash_v3(input, hash, scratch); 351 | if (memcmp(gold, hash, HASH_SIZE)) { 352 | printf("Failed!\n"); 353 | printf("Expected: "); 354 | for (int i = 0; i < HASH_SIZE; i++) { 355 | printf("%u", gold[i]); 356 | if (i != HASH_SIZE - 1) { 357 | printf(", "); 358 | } 359 | } 360 | printf("\nGot: "); 361 | for (int i = 0; i < HASH_SIZE; i++) { 362 | printf("%u", hash[i]); 363 | if (i != HASH_SIZE - 1) { 364 | printf(", "); 365 | } 366 | } 367 | printf("\n"); 368 | } 369 | else { 370 | printf("Passed!\n"); 371 | } 372 | 373 | free(input); 374 | free(scratch); 375 | } 376 | 377 | typedef struct { 378 | int thread_id; 379 | int iterations; 380 | uint8_t *input; 381 | uint64_t *scratch; 382 | uint8_t *hash; 383 | } thread_data_t; 384 | 385 | void set_thread_affinity(int thread_id) { 386 | cpu_set_t cpuset; 387 | CPU_ZERO(&cpuset); 388 | CPU_SET(thread_id % sysconf(_SC_NPROCESSORS_ONLN), &cpuset); 389 | int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); 390 | if (rc != 0) { 391 | fprintf(stderr, "Error: Unable to set CPU affinity for thread %d\n", thread_id); 392 | } 393 | } 394 | 395 | void *hash_thread(void *arg) { 396 | thread_data_t *data = (thread_data_t *)arg; 397 | // set_thread_affinity(data->thread_id); 398 | 399 | for (int i = 0; i < data->iterations; ++i) 400 | xelis_hash_v3(data->input, data->hash, data->scratch); 401 | 402 | 
pthread_exit(NULL); 403 | } 404 | 405 | void hash_test(int t, int i) { 406 | pthread_t *threads; 407 | thread_data_t *thread_data; 408 | 409 | xelis_hash_v3_init(); 410 | 411 | printf("\n%-10s %-15s %-10s\n", "Threads", "Hashes", "Hash/s"); 412 | for (int tc = 1; tc <= t; ++tc) { 413 | threads = (pthread_t *)malloc(tc * sizeof(pthread_t)); 414 | thread_data = (thread_data_t *)malloc(tc * sizeof(thread_data_t)); 415 | struct timespec start, end; 416 | 417 | clock_gettime(CLOCK_REALTIME, &start); 418 | for (int j = 0; j < tc; ++j) { 419 | thread_data[j].thread_id = j; 420 | thread_data[j].iterations = i; 421 | thread_data[j].input = (uint8_t *)calloc(INPUT_LEN, sizeof(uint8_t)); 422 | thread_data[j].scratch = (uint64_t *)calloc(MEMSIZE, sizeof(uint64_t)); 423 | thread_data[j].hash = (uint8_t *)calloc(HASH_SIZE, sizeof(uint8_t)); 424 | pthread_create(&threads[j], NULL, hash_thread, (void *)&thread_data[j]); 425 | } 426 | 427 | for (int j = 0; j < tc; ++j) { 428 | pthread_join(threads[j], NULL); 429 | free(thread_data[j].input); 430 | free(thread_data[j].scratch); 431 | free(thread_data[j].hash); 432 | } 433 | 434 | clock_gettime(CLOCK_REALTIME, &end); 435 | 436 | double time_taken = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9; 437 | double hashes_per_second = (double)(tc * i) / time_taken; 438 | printf("%-10d %-15d %-10.2f\n", tc, i * tc, hashes_per_second); 439 | 440 | free(threads); 441 | free(thread_data); 442 | } 443 | } 444 | 445 | void print_usage(const char *prog_name) { 446 | printf("Usage: %s [-n iterations] [-t threads]\n", prog_name); 447 | printf(" -n iterations Number of iterations for tests\n"); 448 | printf(" -t threads Number of threads to test\n"); 449 | printf(" -h Show this help message\n"); 450 | } 451 | 452 | int main(int argc, char *argv[]) { 453 | int N = 1000, T = 8; 454 | int opt; 455 | 456 | while ((opt = getopt(argc, argv, "n:t:h")) != -1) { 457 | switch (opt) { 458 | case 'n': 459 | N = atoi(optarg); 460 | break; 461 | 
case 't': 462 | T = atoi(optarg); 463 | break; 464 | case 'h': 465 | print_usage(argv[0]); 466 | return 0; 467 | default: 468 | print_usage(argv[0]); 469 | return 1; 470 | } 471 | } 472 | 473 | timing_test(N); 474 | if (T) 475 | hash_test(T, N); 476 | } -------------------------------------------------------------------------------- /src/v3.rs: -------------------------------------------------------------------------------- 1 | use aes::cipher::generic_array::GenericArray; 2 | use crate::{v2, Error, Hash, scratchpad::ScratchPad as ScratchPadInternal}; 3 | 4 | #[cfg(feature = "tracker")] 5 | use crate::tracker::*; 6 | 7 | // These are tweakable parameters 8 | // Memory size is the size of the scratch pad in u64s 9 | // In bytes, this is equal to ~ 544 kB 10 | const MEMORY_SIZE: usize = 531 * 128; 11 | const MEMORY_SIZE_BYTES: usize = MEMORY_SIZE * 8; 12 | const SCRATCHPAD_ITERS: usize = 2; 13 | const BUFFER_SIZE: usize = MEMORY_SIZE / 2; 14 | 15 | // Stage 3 AES key 16 | const KEY: [u8; 16] = *b"xelishash-pow-v3"; 17 | 18 | pub type ScratchPad = ScratchPadInternal; 19 | 20 | #[inline] 21 | const fn murmurhash3(mut seed: u64) -> u64 { 22 | /* MurmurHash3 finalizer. 23 | * Avalanches the input seed to produce a uniformly distributed output. 24 | */ 25 | seed ^= seed >> 55; 26 | seed = seed.wrapping_mul(0xff51afd7ed558ccd); 27 | seed ^= seed >> 32; 28 | seed = seed.wrapping_mul(0xc4ceb9fe1a85ec53); 29 | seed ^= seed >> 15; 30 | 31 | seed 32 | } 33 | 34 | #[inline(always)] 35 | pub fn map_index(mut x: u64) -> usize { 36 | /* MurmurHash3-like finalizer + multiply-high reduction. 37 | * The finalizer avalanches the input seed; the mulhi step maps 38 | * uniformly into [0, BUFSIZE) with minimal modulo bias. 
39 | */ 40 | x ^= x >> 33; 41 | x = x.wrapping_mul(0xff51afd7ed558ccd); 42 | 43 | ((x as u128) * (BUFFER_SIZE as u128) >> 64) as usize 44 | } 45 | 46 | #[inline(always)] 47 | pub fn pick_half(seed: u64) -> bool { 48 | // // Murmur3 finalizer to get a uniform selector bit 49 | (murmurhash3(seed) & (1u64 << 58)) != 0 50 | } 51 | 52 | #[inline(always)] 53 | pub fn isqrt(n: u64) -> u64 { 54 | if n < 2 { 55 | return n; 56 | } 57 | 58 | // Compute floating-point square root as an approximation 59 | let approx = (n as f64).sqrt() as u64; 60 | 61 | // Verify and adjust if necessary 62 | if approx * approx > n { 63 | approx - 1 64 | } else if (approx + 1) * (approx + 1) <= n { 65 | approx + 1 66 | } else { 67 | approx 68 | } 69 | } 70 | 71 | const fn modular_power(mut base: u64, mut exp: u64, mod_: u64) -> u64 { 72 | let mut result: u64 = 1; 73 | // Ensure base is within the range of mod 74 | base %= mod_; 75 | 76 | while exp > 0 { 77 | // If exp is odd, multiply base with result 78 | if exp & 1 == 1 { 79 | result = ((result as u128 * base as u128) % mod_ as u128) as u64; 80 | } 81 | 82 | // Square the base and reduce by mod 83 | base = ((base as u128 * base as u128) % mod_ as u128) as u64; 84 | exp /= 2; 85 | } 86 | 87 | result 88 | } 89 | 90 | pub(crate) fn stage_3(scratch_pad: &mut [u64; MEMORY_SIZE], #[cfg(feature = "tracker")] tracker: &mut OpsTracker) -> Result<(), Error> { 91 | let key = GenericArray::from(KEY); 92 | let mut block = GenericArray::from([0u8; 16]); 93 | 94 | // Create two new slices for each half 95 | let (mem_buffer_a, mem_buffer_b) = scratch_pad.as_mut_slice().split_at_mut(BUFFER_SIZE); 96 | 97 | let mut addr_a = mem_buffer_b[BUFFER_SIZE-1]; 98 | let mut addr_b = mem_buffer_a[BUFFER_SIZE-1] >> 32; 99 | 100 | #[cfg(feature = "tracker")] 101 | { 102 | tracker.add_mem_op(BUFFER_SIZE-1, MemOp::Read); 103 | tracker.add_mem_op(MEMORY_SIZE-1, MemOp::Read); 104 | } 105 | 106 | let mut r: usize = 0; 107 | 108 | for i in 0..SCRATCHPAD_ITERS { 109 | let index_a 
= map_index(addr_a); 110 | let mem_a = mem_buffer_a[index_a]; 111 | 112 | let index_b = map_index(mem_a ^ addr_b); 113 | let mem_b = mem_buffer_b[index_b]; 114 | 115 | #[cfg(feature = "tracker")] 116 | { 117 | tracker.add_mem_op(index_a, MemOp::Read); 118 | tracker.add_mem_op(BUFFER_SIZE + index_b, MemOp::Read); 119 | } 120 | 121 | block[..8].copy_from_slice(&mem_b.to_le_bytes()); 122 | block[8..].copy_from_slice(&mem_a.to_le_bytes()); 123 | 124 | aes::hazmat::cipher_round(&mut block, &key); 125 | 126 | let hash1 = u64::from_le_bytes(block[..8] 127 | .try_into() 128 | .map_err(|_| Error::FormatError)?); 129 | 130 | let hash2 = u64::from_le_bytes(block[8..] 131 | .try_into() 132 | .map_err(|_| Error::FormatError)?); 133 | 134 | let mut result = !(hash1 ^ hash2); 135 | 136 | for j in 0..BUFFER_SIZE { 137 | let index_a = map_index(result); 138 | let a = mem_buffer_a[index_a]; 139 | 140 | let index_b = map_index(a ^ !result.rotate_right(r as u32)); 141 | let b = mem_buffer_b[index_b]; 142 | 143 | #[cfg(feature = "tracker")] 144 | { 145 | tracker.add_mem_op(index_a, MemOp::Read); 146 | tracker.add_mem_op(BUFFER_SIZE + index_b, MemOp::Read); 147 | 148 | // This is the same index in scratchpad 149 | tracker.add_mem_op(r, MemOp::Read); 150 | } 151 | 152 | let c = if r < BUFFER_SIZE { 153 | mem_buffer_a[r] 154 | } else { 155 | mem_buffer_b[r-BUFFER_SIZE] 156 | }; 157 | r = if r < MEMORY_SIZE - 1 { 158 | r + 1 159 | } else { 160 | 0 161 | }; 162 | 163 | let branch_idx = (result.rotate_left(c as u32) & 0xf) as u8; 164 | #[cfg(feature = "tracker")] 165 | { 166 | tracker.add_branch(branch_idx); 167 | } 168 | 169 | let v = match branch_idx { 170 | // combine_u64((a + i), isqrt(b + j)) % (murmurhash3(c ^ result ^ i ^ j) | 1) 171 | 0 => { 172 | let t1 = v2::combine_u64( 173 | a.wrapping_add(i as u64), 174 | isqrt(b.wrapping_add(j as u64)), 175 | ); 176 | let denom = murmurhash3(c ^ result ^ i as u64 ^ j as u64) | 1; 177 | (t1 % (denom as u128)) as u64 178 | } 179 | // ROTL((c + i) 
% isqrt(b | 2), i + j) * isqrt(a + j) 180 | 1 => { 181 | let t1 = c.wrapping_add(i as u64).wrapping_rem(isqrt(b | 2)); 182 | let t2 = t1.rotate_left((i.wrapping_add(j)) as u32); 183 | let t3 = isqrt(a.wrapping_add(j as u64)); 184 | t2.wrapping_mul(t3) 185 | } 186 | // (isqrt(a + i) * isqrt(c + j)) ^ (b + i + j) 187 | 2 => { 188 | let t1 = isqrt(a.wrapping_add(i as u64)); 189 | let t2 = isqrt(c.wrapping_add(j as u64)); 190 | let t3 = t1.wrapping_mul(t2); 191 | t3 ^ b.wrapping_add(i as u64).wrapping_add(j as u64) 192 | } 193 | // (a + b) * c 194 | 3 => a.wrapping_add(b).wrapping_mul(c), 195 | // (b - c) * a 196 | 4 => b.wrapping_sub(c).wrapping_mul(a), 197 | // c - a + b 198 | 5 => c.wrapping_sub(a).wrapping_add(b), 199 | // a - b + c 200 | 6 => a.wrapping_sub(b).wrapping_add(c), 201 | // b * c + a 202 | 7 => b.wrapping_mul(c).wrapping_add(a), 203 | // c * a + b 204 | 8 => c.wrapping_mul(a).wrapping_add(b), 205 | // a * b * c 206 | 9 => a.wrapping_mul(b).wrapping_mul(c), 207 | 10 => { 208 | let t1 = v2::combine_u64(a, b); 209 | let t2 = (c | 1) as u128; 210 | t1.wrapping_rem(t2) as u64 211 | }, 212 | 11 => { 213 | let t1 = v2::combine_u64(b, c); 214 | let t2 = v2::combine_u64(result.rotate_left(r as u32), a | 2); 215 | if t2 > t1 { c } else { t1.wrapping_rem(t2) as u64 } 216 | }, 217 | 12 => { 218 | let t1 = v2::combine_u64(c, a); 219 | let t2 = (b | 4) as u128; 220 | t1.wrapping_div(t2) as u64 221 | }, 222 | 13 => { 223 | let t1 = v2::combine_u64(result.rotate_left(r as u32), b); 224 | let t2 = v2::combine_u64(a, c | 8); 225 | if t1 > t2 {t1.wrapping_div(t2) as u64} else {a^b} 226 | }, 227 | 14 => { 228 | let t1 = v2::combine_u64(b, a); 229 | let t2 = c as u128; 230 | (t1.wrapping_mul(t2) >> 64) as u64 231 | }, 232 | 15 => { 233 | let t1 = v2::combine_u64(a, c); 234 | let t2 = v2::combine_u64(result.rotate_right(r as u32), b); 235 | (t1.wrapping_mul(t2) >> 64) as u64 236 | }, 237 | _ => unreachable!(), 238 | }; 239 | 240 | let seed = v ^ result; 241 | result = 
seed.rotate_left(r as u32); 242 | 243 | let use_buffer_b = pick_half(v); 244 | let index_t = map_index(seed); 245 | let t = if use_buffer_b { mem_buffer_b[index_t] } else { mem_buffer_a[index_t] } ^ result; 246 | 247 | let index_a = map_index(t ^ result ^ 0x9e3779b97f4a7c15); 248 | let index_b = map_index(index_a as u64 ^ !result ^ 0xd2b74407b1ce6e93); 249 | 250 | let a = std::mem::replace(&mut mem_buffer_a[index_a], t); 251 | mem_buffer_b[index_b] ^= a ^ t.rotate_right(i.wrapping_add(j) as u32); 252 | 253 | #[cfg(feature = "tracker")] 254 | { 255 | if use_buffer_b { 256 | tracker.add_mem_op(BUFFER_SIZE + index_t, MemOp::Read); 257 | } else { 258 | tracker.add_mem_op(index_t, MemOp::Read); 259 | } 260 | 261 | // mem_buffer_a[index_a] and mem_buffer_b[index_b] are written 262 | tracker.add_mem_op(index_a, MemOp::Read); 263 | tracker.add_mem_op(index_a, MemOp::Write); 264 | 265 | tracker.add_mem_op(BUFFER_SIZE + index_b, MemOp::Read); 266 | tracker.add_mem_op(BUFFER_SIZE + index_b, MemOp::Write); 267 | } 268 | } 269 | 270 | addr_a = modular_power(addr_a, addr_b, result); 271 | addr_b = isqrt(result).wrapping_mul((r as u64).wrapping_add(1)).wrapping_mul(isqrt(addr_a)); 272 | } 273 | 274 | Ok(()) 275 | } 276 | 277 | pub fn xelis_hash(input: &[u8], scratch_pad: &mut ScratchPad, #[cfg(feature = "tracker")] distribution: &mut OpsTracker) -> Result { 278 | v2::stage_1::(input, scratch_pad)?; 279 | 280 | let scratch_pad = scratch_pad.as_mut_slice(); 281 | 282 | // stage 3 is customized compared to v2 283 | stage_3(scratch_pad, #[cfg(feature = "tracker")] distribution)?; 284 | 285 | // final stage 4 286 | v2::stage_4(scratch_pad) 287 | } 288 | 289 | 290 | #[cfg(test)] 291 | mod tests { 292 | use rand::{RngCore, rngs::OsRng}; 293 | use super::*; 294 | 295 | #[test] 296 | fn test_reused_scratchpad() { 297 | let mut scratch_pad = ScratchPad::default(); 298 | let mut input = [0u8; 112]; 299 | OsRng.fill_bytes(&mut input); 300 | 301 | // Do a first hash 302 | let expected_hash = 
xelis_hash(&input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap();

        // Do a second hash with dirty scratch pad but same input
        // (stage 1 overwrites the scratchpad, so reuse must not change the digest)
        let hash = xelis_hash(&input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap();
        assert_eq!(hash, expected_hash);
    }

    // Known-answer (consensus) vector: all-zero 112-byte input must always
    // produce exactly this digest.
    #[test]
    fn test_zero_hash() {
        let mut scratch_pad = ScratchPad::default();
        let mut input = [0u8; 112];

        let hash = xelis_hash(&mut input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap();
        let expected_hash = [
            105, 172, 103, 40, 94, 253, 92, 162,
            42, 252, 5, 196, 236, 238, 91, 218,
            22, 157, 228, 233, 239, 8, 250, 57,
            212, 166, 121, 132, 148, 205, 103, 163
        ];

        assert_eq!(hash, expected_hash);
    }

    // Known-answer (consensus) vector with a fixed 112-byte input.
    #[test]
    fn test_verify_output() {
        let input = [
            172, 236, 108, 212, 181, 31, 109, 45, 44, 242, 54, 225, 143, 133,
            89, 44, 179, 108, 39, 191, 32, 116, 229, 33, 63, 130, 33, 120, 185, 89,
            146, 141, 10, 79, 183, 107, 238, 122, 92, 222, 25, 134, 90, 107, 116,
            110, 236, 53, 255, 5, 214, 126, 24, 216, 97, 199, 148, 239, 253, 102,
            199, 184, 232, 253, 158, 145, 86, 187, 112, 81, 78, 70, 80, 110, 33,
            37, 159, 233, 198, 1, 178, 108, 210, 100, 109, 155, 106, 124, 124, 83,
            89, 50, 197, 115, 231, 32, 74, 2, 92, 47, 25, 220, 135, 249, 122,
            172, 220, 137, 143, 234, 68, 188
        ];

        let mut scratch_pad = ScratchPad::default();
        let hash = xelis_hash(&input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap();

        let expected_hash = [
            242, 8, 176, 222, 203, 27, 104,
            187, 22, 40, 68, 73, 79, 79, 65,
            83, 138, 101, 10, 116, 194, 41, 153,
            21, 92, 163, 12, 206, 231, 156, 70, 83
        ];

        assert_eq!(hash, expected_hash);
    }

    #[test]
352 | #[cfg(feature = "tracker")] 353 | fn test_distribution() { 354 | const ITERATIONS: usize = 50_000; 355 | 356 | let mut scratch_pad = ScratchPad::default(); 357 | let mut input = [0u8; 112]; 358 | let mut distribution = OpsTracker::new(MEMORY_SIZE); 359 | for _ in 0..ITERATIONS { 360 | OsRng.fill_bytes(&mut input); 361 | let _ = xelis_hash(&input, &mut scratch_pad, &mut distribution).unwrap(); 362 | } 363 | 364 | distribution.generate_branch_distribution("branch_v3.png").unwrap(); 365 | distribution.generate_memory_usage_graph("memory_v3.png", 1000).unwrap(); 366 | } 367 | 368 | #[test] 369 | fn test_pick_half() { 370 | let mut ones = 0; 371 | let mut zeros = 0; 372 | 373 | for _ in 0..1_000_000 { 374 | let i = OsRng.next_u64(); 375 | if pick_half(i) { 376 | ones += 1; 377 | } else { 378 | zeros += 1; 379 | } 380 | } 381 | 382 | let ratio = ones as f64 / (ones + zeros) as f64; 383 | assert!((ratio - 0.5).abs() < 0.01, "pick_half is not balanced: ratio={}", ratio); 384 | } 385 | 386 | #[test] 387 | fn test_map_index() { 388 | for _ in 0..10_000_000 { 389 | let i = OsRng.next_u64(); 390 | let index = map_index(i); 391 | 392 | assert!(index < BUFFER_SIZE); 393 | } 394 | 395 | assert!(map_index(0) == 0); 396 | assert!(map_index(u64::MAX) < BUFFER_SIZE); 397 | } 398 | } -------------------------------------------------------------------------------- /src/v2.rs: -------------------------------------------------------------------------------- 1 | use aes::cipher::generic_array::GenericArray; 2 | use blake3::hash as blake3_hash; 3 | use chacha20::{ 4 | cipher::{KeyIvInit, StreamCipher}, 5 | ChaCha8, 6 | }; 7 | 8 | use crate::{ 9 | scratchpad::ScratchPad as ScratchPadInternal, 10 | Error, 11 | Hash, 12 | HASH_SIZE 13 | }; 14 | 15 | #[cfg(feature = "tracker")] 16 | use crate::tracker::{OpsTracker, MemOp}; 17 | 18 | // These are tweakable parameters 19 | // Memory size is the size of the scratch pad in u64s 20 | // In bytes, this is equal to ~ 440 kB 21 | const 
MEMORY_SIZE: usize = 429 * 128; 22 | 23 | // Scratchpad iterations in stage 3 24 | const SCRATCHPAD_ITERS: usize = 3; 25 | // Buffer size for stage 3 (inner loop iterations) 26 | const BUFFER_SIZE: usize = MEMORY_SIZE / 2; 27 | 28 | // Stage 1 config 29 | const CHUNK_SIZE: usize = 32; 30 | const NONCE_SIZE: usize = 12; 31 | const MEMORY_SIZE_BYTES: usize = MEMORY_SIZE * 8; 32 | 33 | // Stage 3 AES key 34 | const KEY: [u8; 16] = *b"xelishash-pow-v2"; 35 | 36 | pub type ScratchPad = ScratchPadInternal; 37 | 38 | // Combine two u64 into a u128 39 | #[inline(always)] 40 | pub(crate) fn combine_u64(high: u64, low: u64) -> u128 { 41 | (high as u128) << 64 | low as u128 42 | } 43 | 44 | // Stage 1 of the hashing algorithm 45 | // This stage is responsible for generating the scratch pad 46 | // The scratch pad is generated using ChaCha8 with a custom nonce 47 | // that is updated after each iteration 48 | pub(crate) fn stage_1(input: &[u8], scratch_pad: &mut ScratchPadInternal) -> Result<(), Error> { 49 | let bytes = scratch_pad.as_mut_bytes::()?; 50 | 51 | // Reset the scratchpad to 0 52 | // This is done to ensure that the scratchpad is clean 53 | // and prevent us to do multiple heap allocations in below loop 54 | bytes.fill(0); 55 | 56 | let mut output_offset = 0; 57 | let mut nonce = [0u8; NONCE_SIZE]; 58 | 59 | // Generate the nonce from the input 60 | let mut input_hash: Hash = blake3_hash(input).into(); 61 | nonce.copy_from_slice(&input_hash[..NONCE_SIZE]); 62 | 63 | let num_chunks = (input.len() + CHUNK_SIZE - 1) / CHUNK_SIZE; 64 | 65 | for (chunk_index, chunk) in input.chunks(CHUNK_SIZE).enumerate() { 66 | // Concatenate the input hash with the chunk 67 | let mut tmp = [0u8; HASH_SIZE * 2]; 68 | tmp[0..HASH_SIZE].copy_from_slice(&input_hash); 69 | tmp[HASH_SIZE..HASH_SIZE + chunk.len()].copy_from_slice(chunk); 70 | 71 | // Hash it to not trust the input 72 | input_hash = blake3_hash(&tmp).into(); 73 | 74 | let mut cipher = ChaCha8::new(&input_hash.into(), 
&nonce.into()); 75 | 76 | // Calculate the remaining size and how much to generate this iteration 77 | let remaining_output_size = OUTPUT_SIZE - output_offset; 78 | // Remaining chunks 79 | let chunks_left = num_chunks - chunk_index; 80 | let chunk_output_size = remaining_output_size / chunks_left; 81 | let current_output_size = remaining_output_size.min(chunk_output_size); 82 | 83 | // Apply the keystream to the output 84 | let offset = chunk_index * current_output_size; 85 | let part = &mut bytes[offset..offset+current_output_size]; 86 | cipher.apply_keystream(part); 87 | 88 | output_offset += current_output_size; 89 | 90 | // Update the nonce with the last NONCE_SIZE bytes of temp_output 91 | let nonce_start = current_output_size.saturating_sub(NONCE_SIZE); 92 | 93 | // Copy the new nonce 94 | nonce.copy_from_slice(&part[nonce_start..]); 95 | } 96 | 97 | Ok(()) 98 | } 99 | 100 | // Stage 3 of the hashing algorithm 101 | // This stage is responsible for hashing the scratch pad 102 | // Its goal is to have lot of random memory accesses 103 | // and some branching to make it hard to optimize on GPUs 104 | // it shouldn't be possible to parallelize this stage 105 | pub(crate) fn stage_3(scratch_pad: &mut [u64; MEMORY_SIZE], #[cfg(feature = "tracker")] tracker: &mut OpsTracker) -> Result<(), Error> { 106 | let key = GenericArray::from(KEY); 107 | let mut block = GenericArray::from([0u8; 16]); 108 | let buffer_size = BUFFER_SIZE as u64; 109 | 110 | // Create two new slices for each half 111 | let (mem_buffer_a, mem_buffer_b) = scratch_pad.as_mut_slice().split_at_mut(BUFFER_SIZE); 112 | 113 | let mut addr_a = mem_buffer_b[BUFFER_SIZE-1]; 114 | let mut addr_b = mem_buffer_a[BUFFER_SIZE-1] >> 32; 115 | 116 | #[cfg(feature = "tracker")] 117 | { 118 | tracker.add_mem_op(BUFFER_SIZE-1, MemOp::Read); 119 | tracker.add_mem_op(MEMORY_SIZE-1, MemOp::Read); 120 | } 121 | 122 | let mut r: usize = 0; 123 | 124 | for i in 0..SCRATCHPAD_ITERS { 125 | let index_a = (addr_a % 
buffer_size) as usize; 126 | let index_b = (addr_b % buffer_size) as usize; 127 | 128 | let mem_a = mem_buffer_a[index_a]; 129 | let mem_b = mem_buffer_b[index_b]; 130 | 131 | #[cfg(feature = "tracker")] 132 | { 133 | tracker.add_mem_op(index_a, MemOp::Read); 134 | tracker.add_mem_op(BUFFER_SIZE + index_b, MemOp::Read); 135 | } 136 | 137 | block[..8].copy_from_slice(&mem_b.to_le_bytes()); 138 | block[8..].copy_from_slice(&mem_a.to_le_bytes()); 139 | 140 | aes::hazmat::cipher_round(&mut block, &key); 141 | 142 | let hash1 = u64::from_le_bytes(block[0..8] 143 | .try_into() 144 | .map_err(|_| Error::FormatError)?); 145 | 146 | let hash2 = mem_a ^ mem_b; 147 | let mut result = !(hash1 ^ hash2); 148 | 149 | for j in 0..BUFFER_SIZE { 150 | let index_a = (result % buffer_size) as usize; 151 | let index_b = (!result.rotate_right(r as u32) % buffer_size) as usize; 152 | 153 | #[cfg(feature = "tracker")] 154 | { 155 | tracker.add_mem_op(index_a, MemOp::Read); 156 | tracker.add_mem_op(BUFFER_SIZE + index_b, MemOp::Read); 157 | } 158 | 159 | let a = mem_buffer_a[index_a]; 160 | let b = mem_buffer_b[index_b]; 161 | 162 | #[cfg(feature = "tracker")] 163 | { 164 | // This is the same index in scratchpad 165 | tracker.add_mem_op(r, MemOp::Read); 166 | } 167 | 168 | let c = if r < BUFFER_SIZE {mem_buffer_a[r]} else {mem_buffer_b[r-BUFFER_SIZE]}; 169 | r = if r < MEMORY_SIZE-1 {r+1} else {0}; 170 | 171 | let branch_idx = (result.rotate_left(c as u32) & 0xf) as u8; 172 | #[cfg(feature = "tracker")] 173 | { 174 | tracker.add_branch(branch_idx); 175 | } 176 | 177 | let v = result ^ match branch_idx { 178 | 0 => c.rotate_left(i.wrapping_mul(j) as u32) ^ b, 179 | 1 => c.rotate_right(i.wrapping_mul(j) as u32) ^ a, 180 | 2 => a ^ b ^ c, 181 | 3 => a.wrapping_add(b).wrapping_mul(c), 182 | 4 => b.wrapping_sub(c).wrapping_mul(a), 183 | 5 => c.wrapping_sub(a).wrapping_add(b), 184 | 6 => a.wrapping_sub(b).wrapping_add(c), 185 | 7 => b.wrapping_mul(c).wrapping_add(a), 186 | 8 => 
c.wrapping_mul(a).wrapping_add(b), 187 | 9 => a.wrapping_mul(b).wrapping_mul(c), 188 | 10 => { 189 | let t1 = combine_u64(a, b); 190 | let t2 = (c | 1) as u128; 191 | t1.wrapping_rem(t2) as u64 192 | }, 193 | 11 => { 194 | let t1 = combine_u64(b, c); 195 | let t2 = combine_u64(result.rotate_left(r as u32), a | 2); 196 | t1.wrapping_rem(t2) as u64 197 | }, 198 | 12 => { 199 | let t1 = combine_u64(c, a); 200 | let t2 = (b | 4) as u128; 201 | t1.wrapping_div(t2) as u64 202 | }, 203 | 13 => { 204 | let t1 = combine_u64(result.rotate_left(r as u32), b); 205 | let t2 = combine_u64(a, c | 8); 206 | if t1 > t2 {t1.wrapping_div(t2) as u64} else {a^b} 207 | }, 208 | 14 => { 209 | let t1 = combine_u64(b, a); 210 | let t2 = c as u128; 211 | (t1.wrapping_mul(t2) >> 64) as u64 212 | }, 213 | 15 => { 214 | let t1 = combine_u64(a, c); 215 | let t2 = combine_u64(result.rotate_right(r as u32), b); 216 | (t1.wrapping_mul(t2) >> 64) as u64 217 | }, 218 | _ => unreachable!(), 219 | }; 220 | 221 | result = v.rotate_left(1); 222 | 223 | #[cfg(feature = "tracker")] 224 | { 225 | tracker.add_mem_op(BUFFER_SIZE-j-1, MemOp::Write); 226 | tracker.add_mem_op(BUFFER_SIZE+j, MemOp::Write); 227 | } 228 | 229 | let t = mem_buffer_a[BUFFER_SIZE-j-1] ^ result; 230 | mem_buffer_a[BUFFER_SIZE-j-1] = t; 231 | mem_buffer_b[j] ^= t.rotate_right(result as u32); 232 | } 233 | addr_a = result; 234 | addr_b = isqrt(result); 235 | } 236 | 237 | Ok(()) 238 | } 239 | 240 | // Stage 4 hash the whole scratchpad using Blake3 to prevent any shortcut in 241 | // the scratchpad computation 242 | #[inline] 243 | pub(crate) fn stage_4(scratch_pad: &[u64]) -> Result { 244 | let bytes: &[u8] = bytemuck::try_cast_slice(scratch_pad) 245 | .map_err(Error::CastError)?; 246 | 247 | Ok(blake3_hash(bytes).into()) 248 | } 249 | 250 | fn isqrt(n: u64) -> u64 { 251 | if n < 2 { 252 | return n; 253 | } 254 | 255 | let mut x = n; 256 | let mut y = (x.wrapping_add(1)) >> 1; 257 | 258 | while y < x { 259 | x = y; 260 | y = 
(x.wrapping_add(n.wrapping_div(x))) >> 1; 261 | } 262 | 263 | x 264 | } 265 | 266 | // This function is used to hash the input using the generated scratch pad 267 | // NOTE: The scratchpad is completely overwritten in stage 1 and can be reused without any issues 268 | pub fn xelis_hash(input: &[u8], scratch_pad: &mut ScratchPad, #[cfg(feature = "tracker")] distribution: &mut OpsTracker) -> Result { 269 | stage_1::(input, scratch_pad)?; 270 | 271 | let scratch_pad = scratch_pad.as_mut_slice(); 272 | 273 | // stage 2 got removed as it got completely optimized on GPUs 274 | 275 | // stage 3 276 | stage_3(scratch_pad, #[cfg(feature = "tracker")] distribution)?; 277 | 278 | // final stage 4 279 | stage_4(scratch_pad) 280 | } 281 | 282 | #[cfg(test)] 283 | mod tests { 284 | use rand::{rngs::OsRng, RngCore}; 285 | use std::time::Instant; 286 | use super::*; 287 | 288 | const ITERATIONS: usize = 1000; 289 | 290 | #[test] 291 | fn test_reused_scratchpad() { 292 | let mut scratch_pad = ScratchPad::default(); 293 | let mut input = [0u8; 112]; 294 | OsRng.fill_bytes(&mut input); 295 | 296 | // Do a first hash 297 | let expected_hash = xelis_hash(&input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap(); 298 | 299 | // Do a second hash with dirty scratch pad but same input 300 | let hash = xelis_hash(&input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap(); 301 | assert_eq!(hash, expected_hash); 302 | } 303 | 304 | #[test] 305 | fn test_zero_hash() { 306 | let mut scratch_pad = ScratchPad::default(); 307 | let mut input = [0u8; 112]; 308 | 309 | let hash = xelis_hash(&mut input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap(); 310 | let expected_hash = [ 311 | 126, 219, 112, 240, 116, 133, 115, 144, 39, 40, 164, 312 | 105, 30, 158, 45, 126, 64, 67, 238, 52, 200, 35, 313 | 161, 19, 144, 211, 214, 225, 95, 190, 146, 27 314 | ]; 315 | 316 | assert_eq!(hash, 
expected_hash); 317 | } 318 | 319 | #[test] 320 | fn test_xelis_stages() { 321 | let mut input = [0u8; 112]; 322 | OsRng.fill_bytes(&mut input); 323 | 324 | let mut scratch_pad = ScratchPad::default(); 325 | let instant = Instant::now(); 326 | for i in 0..ITERATIONS { 327 | input[0] = i as u8; 328 | std::hint::black_box(stage_1::(&mut input, &mut scratch_pad).unwrap()); 329 | } 330 | println!("Stage 1 took: {} microseconds", instant.elapsed().as_micros() / ITERATIONS as u128); 331 | 332 | let instant = Instant::now(); 333 | for _ in 0..ITERATIONS { 334 | std::hint::black_box(stage_3(scratch_pad.as_mut_slice(), #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap()); 335 | } 336 | println!("Stage 3 took: {} microseconds", instant.elapsed().as_micros() / ITERATIONS as u128); 337 | 338 | let instant = Instant::now(); 339 | for _ in 0..ITERATIONS { 340 | std::hint::black_box(blake3_hash(scratch_pad.as_mut_bytes::().unwrap())); 341 | } 342 | println!("Stage 4 took: {} microseconds", instant.elapsed().as_micros() / ITERATIONS as u128); 343 | } 344 | 345 | #[test] 346 | fn test_verify_output() { 347 | let input = [ 348 | 172, 236, 108, 212, 181, 31, 109, 45, 44, 242, 54, 225, 143, 133, 349 | 89, 44, 179, 108, 39, 191, 32, 116, 229, 33, 63, 130, 33, 120, 185, 89, 350 | 146, 141, 10, 79, 183, 107, 238, 122, 92, 222, 25, 134, 90, 107, 116, 351 | 110, 236, 53, 255, 5, 214, 126, 24, 216, 97, 199, 148, 239, 253, 102, 352 | 199, 184, 232, 253, 158, 145, 86, 187, 112, 81, 78, 70, 80, 110, 33, 353 | 37, 159, 233, 198, 1, 178, 108, 210, 100, 109, 155, 106, 124, 124, 83, 354 | 89, 50, 197, 115, 231, 32, 74, 2, 92, 47, 25, 220, 135, 249, 122, 355 | 172, 220, 137, 143, 234, 68, 188 356 | ]; 357 | 358 | let mut scratch_pad = ScratchPad::default(); 359 | let hash = xelis_hash(&input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap(); 360 | 361 | let expected_hash = [ 362 | 199, 114, 154, 28, 4, 164, 196, 178, 117, 17, 148, 363 | 
203, 125, 228, 51, 145, 162, 222, 106, 202, 205, 364 | 55, 244, 178, 94, 29, 248, 242, 98, 221, 158, 179 365 | ]; 366 | 367 | assert_eq!(hash, expected_hash); 368 | } 369 | 370 | #[test] 371 | #[cfg(feature = "tracker")] 372 | fn test_distribution() { 373 | let mut scratch_pad = ScratchPad::default(); 374 | let mut input = [0u8; 112]; 375 | let mut distribution = OpsTracker::new(MEMORY_SIZE); 376 | for _ in 0..ITERATIONS { 377 | OsRng.fill_bytes(&mut input); 378 | let _ = xelis_hash(&input, &mut scratch_pad, &mut distribution).unwrap(); 379 | } 380 | 381 | distribution.generate_branch_distribution("branch_v2.png").unwrap(); 382 | distribution.generate_memory_usage_graph("memory_v2.png", 100).unwrap(); 383 | } 384 | } -------------------------------------------------------------------------------- /C/ChaCha20-SIMD/chacha20_sse2.c: -------------------------------------------------------------------------------- 1 | #include "chacha20.h" 2 | #include 3 | #include 4 | 5 | static inline void PartialXor(const __m128i val, uint8_t *Src, uint8_t *Dest, uint64_t Size) 6 | { 7 | _Alignas(16) uint8_t BuffForPartialOp[16]; 8 | memcpy(BuffForPartialOp, Src, Size); 9 | _mm_storeu_si128((__m128i *)(BuffForPartialOp), _mm_xor_si128(val, _mm_loadu_si128((const __m128i *)BuffForPartialOp))); 10 | memcpy(Dest, BuffForPartialOp, Size); 11 | } 12 | static inline void PartialStore(const __m128i val, uint8_t *Dest, uint64_t Size) 13 | { 14 | _Alignas(16) uint8_t BuffForPartialOp[16]; 15 | _mm_storeu_si128((__m128i *)(BuffForPartialOp), val); 16 | memcpy(Dest, BuffForPartialOp, Size); 17 | } 18 | 19 | static inline __m128i RotateLeft7(const __m128i val) 20 | { 21 | return _mm_or_si128(_mm_slli_epi32(val, 7), _mm_srli_epi32(val, 32 - 7)); 22 | } 23 | 24 | static inline __m128i RotateLeft8(const __m128i val) 25 | { 26 | return _mm_or_si128(_mm_slli_epi32(val, 8), _mm_srli_epi32(val, 32 - 8)); 27 | } 28 | 29 | static inline __m128i RotateLeft12(const __m128i val) 30 | { 31 | return 
_mm_or_si128(_mm_slli_epi32(val, 12), _mm_srli_epi32(val, 32 - 12)); 32 | } 33 | 34 | static inline __m128i RotateLeft16(const __m128i val) 35 | { 36 | return _mm_or_si128(_mm_slli_epi32(val, 16), _mm_srli_epi32(val, 32 - 16)); 37 | } 38 | 39 | static void ChaCha20EncryptBytes(uint8_t *state, uint8_t *In, uint8_t *Out, size_t Size, uint32_t rounds) 40 | { 41 | 42 | uint8_t *CurrentIn = In; 43 | uint8_t *CurrentOut = Out; 44 | 45 | uint64_t FullBlocksCount = Size / 256; 46 | uint64_t RemainingBytes = Size % 256; 47 | 48 | const __m128i state0 = _mm_set_epi32(1797285236, 2036477234, 857760878, 1634760805); //"expand 32-byte k" 49 | const __m128i state1 = _mm_loadu_si128((const __m128i *)(state)); 50 | const __m128i state2 = _mm_loadu_si128((const __m128i *)((state) + 16)); 51 | 52 | for (int64_t n = 0; n < FullBlocksCount; n++) 53 | { 54 | 55 | const __m128i state3 = _mm_loadu_si128((const __m128i *)((state) + 32)); 56 | 57 | __m128i r0_0 = state0; 58 | __m128i r0_1 = state1; 59 | __m128i r0_2 = state2; 60 | __m128i r0_3 = state3; 61 | 62 | __m128i r1_0 = state0; 63 | __m128i r1_1 = state1; 64 | __m128i r1_2 = state2; 65 | __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1)); 66 | 67 | __m128i r2_0 = state0; 68 | __m128i r2_1 = state1; 69 | __m128i r2_2 = state2; 70 | __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2)); 71 | 72 | __m128i r3_0 = state0; 73 | __m128i r3_1 = state1; 74 | __m128i r3_2 = state2; 75 | __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3)); 76 | 77 | for (int i = rounds; i > 0; i -= 2) 78 | { 79 | r0_0 = _mm_add_epi32(r0_0, r0_1); 80 | r1_0 = _mm_add_epi32(r1_0, r1_1); 81 | r2_0 = _mm_add_epi32(r2_0, r2_1); 82 | r3_0 = _mm_add_epi32(r3_0, r3_1); 83 | 84 | r0_3 = _mm_xor_si128(r0_3, r0_0); 85 | r1_3 = _mm_xor_si128(r1_3, r1_0); 86 | r2_3 = _mm_xor_si128(r2_3, r2_0); 87 | r3_3 = _mm_xor_si128(r3_3, r3_0); 88 | 89 | r0_3 = RotateLeft16(r0_3); 90 | r1_3 = RotateLeft16(r1_3); 91 | r2_3 = RotateLeft16(r2_3); 92 | r3_3 = 
RotateLeft16(r3_3); 93 | 94 | r0_2 = _mm_add_epi32(r0_2, r0_3); 95 | r1_2 = _mm_add_epi32(r1_2, r1_3); 96 | r2_2 = _mm_add_epi32(r2_2, r2_3); 97 | r3_2 = _mm_add_epi32(r3_2, r3_3); 98 | 99 | r0_1 = _mm_xor_si128(r0_1, r0_2); 100 | r1_1 = _mm_xor_si128(r1_1, r1_2); 101 | r2_1 = _mm_xor_si128(r2_1, r2_2); 102 | r3_1 = _mm_xor_si128(r3_1, r3_2); 103 | 104 | r0_1 = RotateLeft12(r0_1); 105 | r1_1 = RotateLeft12(r1_1); 106 | r2_1 = RotateLeft12(r2_1); 107 | r3_1 = RotateLeft12(r3_1); 108 | 109 | r0_0 = _mm_add_epi32(r0_0, r0_1); 110 | r1_0 = _mm_add_epi32(r1_0, r1_1); 111 | r2_0 = _mm_add_epi32(r2_0, r2_1); 112 | r3_0 = _mm_add_epi32(r3_0, r3_1); 113 | 114 | r0_3 = _mm_xor_si128(r0_3, r0_0); 115 | r1_3 = _mm_xor_si128(r1_3, r1_0); 116 | r2_3 = _mm_xor_si128(r2_3, r2_0); 117 | r3_3 = _mm_xor_si128(r3_3, r3_0); 118 | 119 | r0_3 = RotateLeft8(r0_3); 120 | r1_3 = RotateLeft8(r1_3); 121 | r2_3 = RotateLeft8(r2_3); 122 | r3_3 = RotateLeft8(r3_3); 123 | 124 | r0_2 = _mm_add_epi32(r0_2, r0_3); 125 | r1_2 = _mm_add_epi32(r1_2, r1_3); 126 | r2_2 = _mm_add_epi32(r2_2, r2_3); 127 | r3_2 = _mm_add_epi32(r3_2, r3_3); 128 | 129 | r0_1 = _mm_xor_si128(r0_1, r0_2); 130 | r1_1 = _mm_xor_si128(r1_1, r1_2); 131 | r2_1 = _mm_xor_si128(r2_1, r2_2); 132 | r3_1 = _mm_xor_si128(r3_1, r3_2); 133 | 134 | r0_1 = RotateLeft7(r0_1); 135 | r1_1 = RotateLeft7(r1_1); 136 | r2_1 = RotateLeft7(r2_1); 137 | r3_1 = RotateLeft7(r3_1); 138 | 139 | r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1)); 140 | r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); 141 | r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3)); 142 | 143 | r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1)); 144 | r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); 145 | r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3)); 146 | 147 | r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1)); 148 | r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); 149 | r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3)); 
150 | 151 | r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1)); 152 | r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); 153 | r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3)); 154 | 155 | r0_0 = _mm_add_epi32(r0_0, r0_1); 156 | r1_0 = _mm_add_epi32(r1_0, r1_1); 157 | r2_0 = _mm_add_epi32(r2_0, r2_1); 158 | r3_0 = _mm_add_epi32(r3_0, r3_1); 159 | 160 | r0_3 = _mm_xor_si128(r0_3, r0_0); 161 | r1_3 = _mm_xor_si128(r1_3, r1_0); 162 | r2_3 = _mm_xor_si128(r2_3, r2_0); 163 | r3_3 = _mm_xor_si128(r3_3, r3_0); 164 | 165 | r0_3 = RotateLeft16(r0_3); 166 | r1_3 = RotateLeft16(r1_3); 167 | r2_3 = RotateLeft16(r2_3); 168 | r3_3 = RotateLeft16(r3_3); 169 | 170 | r0_2 = _mm_add_epi32(r0_2, r0_3); 171 | r1_2 = _mm_add_epi32(r1_2, r1_3); 172 | r2_2 = _mm_add_epi32(r2_2, r2_3); 173 | r3_2 = _mm_add_epi32(r3_2, r3_3); 174 | 175 | r0_1 = _mm_xor_si128(r0_1, r0_2); 176 | r1_1 = _mm_xor_si128(r1_1, r1_2); 177 | r2_1 = _mm_xor_si128(r2_1, r2_2); 178 | r3_1 = _mm_xor_si128(r3_1, r3_2); 179 | 180 | r0_1 = RotateLeft12(r0_1); 181 | r1_1 = RotateLeft12(r1_1); 182 | r2_1 = RotateLeft12(r2_1); 183 | r3_1 = RotateLeft12(r3_1); 184 | 185 | r0_0 = _mm_add_epi32(r0_0, r0_1); 186 | r1_0 = _mm_add_epi32(r1_0, r1_1); 187 | r2_0 = _mm_add_epi32(r2_0, r2_1); 188 | r3_0 = _mm_add_epi32(r3_0, r3_1); 189 | 190 | r0_3 = _mm_xor_si128(r0_3, r0_0); 191 | r1_3 = _mm_xor_si128(r1_3, r1_0); 192 | r2_3 = _mm_xor_si128(r2_3, r2_0); 193 | r3_3 = _mm_xor_si128(r3_3, r3_0); 194 | 195 | r0_3 = RotateLeft8(r0_3); 196 | r1_3 = RotateLeft8(r1_3); 197 | r2_3 = RotateLeft8(r2_3); 198 | r3_3 = RotateLeft8(r3_3); 199 | 200 | r0_2 = _mm_add_epi32(r0_2, r0_3); 201 | r1_2 = _mm_add_epi32(r1_2, r1_3); 202 | r2_2 = _mm_add_epi32(r2_2, r2_3); 203 | r3_2 = _mm_add_epi32(r3_2, r3_3); 204 | 205 | r0_1 = _mm_xor_si128(r0_1, r0_2); 206 | r1_1 = _mm_xor_si128(r1_1, r1_2); 207 | r2_1 = _mm_xor_si128(r2_1, r2_2); 208 | r3_1 = _mm_xor_si128(r3_1, r3_2); 209 | 210 | r0_1 = RotateLeft7(r0_1); 211 | r1_1 = 
RotateLeft7(r1_1); 212 | r2_1 = RotateLeft7(r2_1); 213 | r3_1 = RotateLeft7(r3_1); 214 | 215 | r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3)); 216 | r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); 217 | r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1)); 218 | 219 | r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3)); 220 | r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); 221 | r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1)); 222 | 223 | r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3)); 224 | r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); 225 | r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1)); 226 | 227 | r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3)); 228 | r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); 229 | r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1)); 230 | } 231 | 232 | r0_0 = _mm_add_epi32(r0_0, state0); 233 | r0_1 = _mm_add_epi32(r0_1, state1); 234 | r0_2 = _mm_add_epi32(r0_2, state2); 235 | r0_3 = _mm_add_epi32(r0_3, state3); 236 | 237 | r1_0 = _mm_add_epi32(r1_0, state0); 238 | r1_1 = _mm_add_epi32(r1_1, state1); 239 | r1_2 = _mm_add_epi32(r1_2, state2); 240 | r1_3 = _mm_add_epi32(r1_3, state3); 241 | r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1)); 242 | 243 | r2_0 = _mm_add_epi32(r2_0, state0); 244 | r2_1 = _mm_add_epi32(r2_1, state1); 245 | r2_2 = _mm_add_epi32(r2_2, state2); 246 | r2_3 = _mm_add_epi32(r2_3, state3); 247 | r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2)); 248 | 249 | r3_0 = _mm_add_epi32(r3_0, state0); 250 | r3_1 = _mm_add_epi32(r3_1, state1); 251 | r3_2 = _mm_add_epi32(r3_2, state2); 252 | r3_3 = _mm_add_epi32(r3_3, state3); 253 | r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3)); 254 | 255 | if (In) 256 | { 257 | _mm_storeu_si128((__m128i *)(CurrentOut + 0 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 0 * 16)), r0_0)); 258 | _mm_storeu_si128((__m128i *)(CurrentOut + 1 * 16), 
_mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 1 * 16)), r0_1)); 259 | _mm_storeu_si128((__m128i *)(CurrentOut + 2 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 2 * 16)), r0_2)); 260 | _mm_storeu_si128((__m128i *)(CurrentOut + 3 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 3 * 16)), r0_3)); 261 | 262 | _mm_storeu_si128((__m128i *)(CurrentOut + 4 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 4 * 16)), r1_0)); 263 | _mm_storeu_si128((__m128i *)(CurrentOut + 5 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 5 * 16)), r1_1)); 264 | _mm_storeu_si128((__m128i *)(CurrentOut + 6 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 6 * 16)), r1_2)); 265 | _mm_storeu_si128((__m128i *)(CurrentOut + 7 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 7 * 16)), r1_3)); 266 | 267 | _mm_storeu_si128((__m128i *)(CurrentOut + 8 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 8 * 16)), r2_0)); 268 | _mm_storeu_si128((__m128i *)(CurrentOut + 9 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 9 * 16)), r2_1)); 269 | _mm_storeu_si128((__m128i *)(CurrentOut + 10 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 10 * 16)), r2_2)); 270 | _mm_storeu_si128((__m128i *)(CurrentOut + 11 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 11 * 16)), r2_3)); 271 | 272 | _mm_storeu_si128((__m128i *)(CurrentOut + 12 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 12 * 16)), r3_0)); 273 | _mm_storeu_si128((__m128i *)(CurrentOut + 13 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 13 * 16)), r3_1)); 274 | _mm_storeu_si128((__m128i *)(CurrentOut + 14 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 14 * 16)), r3_2)); 275 | _mm_storeu_si128((__m128i *)(CurrentOut + 15 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 15 * 16)), r3_3)); 
276 | CurrentIn += 256; 277 | } 278 | else 279 | { 280 | _mm_storeu_si128((__m128i *)(CurrentOut + 0 * 16), r0_0); 281 | _mm_storeu_si128((__m128i *)(CurrentOut + 1 * 16), r0_1); 282 | _mm_storeu_si128((__m128i *)(CurrentOut + 2 * 16), r0_2); 283 | _mm_storeu_si128((__m128i *)(CurrentOut + 3 * 16), r0_3); 284 | 285 | _mm_storeu_si128((__m128i *)(CurrentOut + 4 * 16), r1_0); 286 | _mm_storeu_si128((__m128i *)(CurrentOut + 5 * 16), r1_1); 287 | _mm_storeu_si128((__m128i *)(CurrentOut + 6 * 16), r1_2); 288 | _mm_storeu_si128((__m128i *)(CurrentOut + 7 * 16), r1_3); 289 | 290 | _mm_storeu_si128((__m128i *)(CurrentOut + 8 * 16), r2_0); 291 | _mm_storeu_si128((__m128i *)(CurrentOut + 9 * 16), r2_1); 292 | _mm_storeu_si128((__m128i *)(CurrentOut + 10 * 16), r2_2); 293 | _mm_storeu_si128((__m128i *)(CurrentOut + 11 * 16), r2_3); 294 | 295 | _mm_storeu_si128((__m128i *)(CurrentOut + 12 * 16), r3_0); 296 | _mm_storeu_si128((__m128i *)(CurrentOut + 13 * 16), r3_1); 297 | _mm_storeu_si128((__m128i *)(CurrentOut + 14 * 16), r3_2); 298 | _mm_storeu_si128((__m128i *)(CurrentOut + 15 * 16), r3_3); 299 | } 300 | 301 | CurrentOut += 256; 302 | 303 | ChaCha20AddCounter(state, 4); 304 | } 305 | 306 | if (RemainingBytes == 0) 307 | return; 308 | 309 | while (1) 310 | { 311 | const __m128i state3 = _mm_loadu_si128((const __m128i *)((state) + 32)); 312 | 313 | __m128i r0_0 = state0; 314 | __m128i r0_1 = state1; 315 | __m128i r0_2 = state2; 316 | __m128i r0_3 = state3; 317 | 318 | for (int i = rounds; i > 0; i -= 2) 319 | { 320 | r0_0 = _mm_add_epi32(r0_0, r0_1); 321 | 322 | r0_3 = _mm_xor_si128(r0_3, r0_0); 323 | 324 | r0_3 = RotateLeft16(r0_3); 325 | 326 | r0_2 = _mm_add_epi32(r0_2, r0_3); 327 | 328 | r0_1 = _mm_xor_si128(r0_1, r0_2); 329 | 330 | r0_1 = RotateLeft12(r0_1); 331 | 332 | r0_0 = _mm_add_epi32(r0_0, r0_1); 333 | 334 | r0_3 = _mm_xor_si128(r0_3, r0_0); 335 | 336 | r0_3 = RotateLeft8(r0_3); 337 | 338 | r0_2 = _mm_add_epi32(r0_2, r0_3); 339 | 340 | r0_1 = _mm_xor_si128(r0_1, 
r0_2); 341 | 342 | r0_1 = RotateLeft7(r0_1); 343 | 344 | r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1)); 345 | r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); 346 | r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3)); 347 | 348 | r0_0 = _mm_add_epi32(r0_0, r0_1); 349 | 350 | r0_3 = _mm_xor_si128(r0_3, r0_0); 351 | 352 | r0_3 = RotateLeft16(r0_3); 353 | 354 | r0_2 = _mm_add_epi32(r0_2, r0_3); 355 | 356 | r0_1 = _mm_xor_si128(r0_1, r0_2); 357 | 358 | r0_1 = RotateLeft12(r0_1); 359 | 360 | r0_0 = _mm_add_epi32(r0_0, r0_1); 361 | 362 | r0_3 = _mm_xor_si128(r0_3, r0_0); 363 | 364 | r0_3 = RotateLeft8(r0_3); 365 | 366 | r0_2 = _mm_add_epi32(r0_2, r0_3); 367 | 368 | r0_1 = _mm_xor_si128(r0_1, r0_2); 369 | 370 | r0_1 = RotateLeft7(r0_1); 371 | 372 | r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3)); 373 | r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); 374 | r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1)); 375 | } 376 | 377 | r0_0 = _mm_add_epi32(r0_0, state0); 378 | r0_1 = _mm_add_epi32(r0_1, state1); 379 | r0_2 = _mm_add_epi32(r0_2, state2); 380 | r0_3 = _mm_add_epi32(r0_3, state3); 381 | 382 | if (RemainingBytes >= 64) 383 | { 384 | 385 | if (In) 386 | { 387 | _mm_storeu_si128((__m128i *)(CurrentOut + 0 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 0 * 16)), r0_0)); 388 | _mm_storeu_si128((__m128i *)(CurrentOut + 1 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 1 * 16)), r0_1)); 389 | _mm_storeu_si128((__m128i *)(CurrentOut + 2 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 2 * 16)), r0_2)); 390 | _mm_storeu_si128((__m128i *)(CurrentOut + 3 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 3 * 16)), r0_3)); 391 | CurrentIn += 64; 392 | } 393 | else 394 | { 395 | _mm_storeu_si128((__m128i *)(CurrentOut + 0 * 16), r0_0); 396 | _mm_storeu_si128((__m128i *)(CurrentOut + 1 * 16), r0_1); 397 | _mm_storeu_si128((__m128i *)(CurrentOut + 2 * 16), r0_2); 398 
/*
 * chacha_encrypt_sse2 — one-shot ChaCha encryption/keystream entry point
 * (SSE2 path).
 *
 * key    : 32-byte key (consumed by ChaCha20SetKey).
 * nonce  : nonce bytes (consumed by ChaCha20SetNonce).
 * in     : plaintext to XOR with the keystream, or NULL to emit raw
 *          keystream into `out` (the kernel branches on a NULL input).
 * out    : destination buffer, at least `bytes` long.
 * bytes  : number of bytes to produce.
 * rounds : ChaCha round count (20 for standard ChaCha20; the kernel
 *          iterates `rounds / 2` double-rounds).
 *
 * The 48-byte state is key(32) || counter+nonce(16); the block counter
 * starts at 0 because the buffer is zero-initialized.
 */
void chacha_encrypt_sse2(uint8_t *key, uint8_t *nonce, uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds)
{
    /* _Alignas(16): the SIMD kernel reads this buffer with 16-byte vector
       loads; a bare uint8_t array only guarantees byte alignment, which
       would fault on any aligned-load path (the AVX2 twin of this kernel
       uses _mm_load_si128 on the same layout).  Alignment is harmless when
       only unaligned loads are used, so this is safe either way. */
    _Alignas(16) uint8_t state[48] = {0};
    ChaCha20SetKey(state, key);
    ChaCha20SetNonce(state, nonce);
    ChaCha20EncryptBytes(state, in, out, bytes, rounds);
}
/*
 * PartialXor — XOR fewer than 32 keystream bytes (`val`) against Src into
 * Dest.  Bounces through an aligned scratch buffer so the 32-byte vector
 * load/store never touches memory outside the caller's Size bytes.
 */
static inline void PartialXor(const __m256i val, const uint8_t *Src, uint8_t *Dest, uint64_t Size)
{
    _Alignas(32) uint8_t BuffForPartialOp[32];
    memcpy(BuffForPartialOp, Src, Size);
    _mm256_storeu_si256((__m256i *)(BuffForPartialOp),
                        _mm256_xor_si256(val, _mm256_loadu_si256((const __m256i *)BuffForPartialOp)));
    memcpy(Dest, BuffForPartialOp, Size);
}

/* PartialStore — write only the first `Size` (< 32) bytes of `val` to Dest. */
static inline void PartialStore(const __m256i val, uint8_t *Dest, uint64_t Size)
{
    _Alignas(32) uint8_t BuffForPartialOp[32];
    _mm256_storeu_si256((__m256i *)(BuffForPartialOp), val);
    memcpy(Dest, BuffForPartialOp, Size);
}

/* Per-lane 32-bit left rotations.  Rotates by 7 and 12 use shift+or;
   rotates by 8 and 16 are byte-granular, so they are implemented as a
   single byte shuffle (_mm256_shuffle_epi8) instead of two shifts. */
static __m256i RotateLeft7(const __m256i val)
{
    return _mm256_or_si256(_mm256_slli_epi32(val, 7),
                           _mm256_srli_epi32(val, 32 - 7));
}

static __m256i RotateLeft12(const __m256i val)
{
    return _mm256_or_si256(_mm256_slli_epi32(val, 12),
                           _mm256_srli_epi32(val, 32 - 12));
}

static __m256i RotateLeft8(const __m256i val)
{
    const __m256i mask =
        _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, 14,
                        13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
    return _mm256_shuffle_epi8(val, mask);
}

static __m256i RotateLeft16(const __m256i val)
{
    const __m256i mask =
        _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13,
                        12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
    return _mm256_shuffle_epi8(val, mask);
}

/*
 * ChaCha20EncryptBytes — AVX2 ChaCha kernel.
 *
 * state  : 48 bytes = key(32) || counter+nonce(16).  MUST be 16-byte
 *          aligned: the three _mm_load_si128 calls below are aligned loads.
 * In     : plaintext, or NULL to emit raw keystream.
 * Out    : destination (>= Size bytes).
 * Size   : byte count.
 * rounds : ChaCha round count; the loop consumes two rounds per iteration.
 *
 * Layout trick: each __m256i row holds the same 128-bit ChaCha state row
 * broadcast to both lanes, with different block counters added per lane via
 * CTRn, so every X-register quadruple computes TWO blocks at once.  The
 * main loop runs four such quadruples (X0..X3) = 8 blocks = 512 bytes per
 * iteration; _mm256_permute2x128_si256 afterwards regroups lane halves
 * back into contiguous 64-byte blocks.
 */
static void ChaCha20EncryptBytes(uint8_t *state, uint8_t *In, uint8_t *Out, size_t Size, uint32_t rounds)
{

    uint8_t *CurrentIn = In;
    uint8_t *CurrentOut = Out;

    uint64_t FullBlocksCount = Size / 512;
    uint64_t RemainingBytes = Size % 512;

    /* Row 0 is the ChaCha constant "expand 32-byte k"; rows 1-2 are the key. */
    const __m256i state0 = _mm256_broadcastsi128_si256(_mm_set_epi32(1797285236, 2036477234, 857760878, 1634760805)); //"expand 32-byte k"
    const __m256i state1 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i *)(state)));
    const __m256i state2 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i *)(state + 16)));

    /* Counter increments per lane: high lane = base+0..3, low lane = base+4..7. */
    __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 4);
    const __m256i CTR1 = _mm256_set_epi32(0, 0, 0, 1, 0, 0, 0, 5);
    const __m256i CTR2 = _mm256_set_epi32(0, 0, 0, 2, 0, 0, 0, 6);
    const __m256i CTR3 = _mm256_set_epi32(0, 0, 0, 3, 0, 0, 0, 7);

    for (uint64_t n = 0; n < FullBlocksCount; n++)
    {
        /* Row 3 (counter || nonce) is reloaded each iteration because
           ChaCha20AddCounter below mutates it in memory. */
        const __m256i state3 = _mm256_broadcastsi128_si256(
            _mm_load_si128((const __m128i *)(state + 32)));

        __m256i X0_0 = state0;
        __m256i X0_1 = state1;
        __m256i X0_2 = state2;
        __m256i X0_3 = _mm256_add_epi32(state3, CTR0);

        __m256i X1_0 = state0;
        __m256i X1_1 = state1;
        __m256i X1_2 = state2;
        __m256i X1_3 = _mm256_add_epi32(state3, CTR1);

        __m256i X2_0 = state0;
        __m256i X2_1 = state1;
        __m256i X2_2 = state2;
        __m256i X2_3 = _mm256_add_epi32(state3, CTR2);

        __m256i X3_0 = state0;
        __m256i X3_1 = state1;
        __m256i X3_2 = state2;
        __m256i X3_3 = _mm256_add_epi32(state3, CTR3);

        /* Each iteration performs one column round + one diagonal round
           (a ChaCha double-round) across all four block pairs. */
        for (int i = rounds; i > 0; i -= 2)
        {
            /* --- column round: a+=b; d^=a; d<<<=16; c+=d; b^=c; b<<<=12;
                   a+=b; d^=a; d<<<=8; c+=d; b^=c; b<<<=7 --- */
            X0_0 = _mm256_add_epi32(X0_0, X0_1);
            X1_0 = _mm256_add_epi32(X1_0, X1_1);
            X2_0 = _mm256_add_epi32(X2_0, X2_1);
            X3_0 = _mm256_add_epi32(X3_0, X3_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);
            X1_3 = _mm256_xor_si256(X1_3, X1_0);
            X2_3 = _mm256_xor_si256(X2_3, X2_0);
            X3_3 = _mm256_xor_si256(X3_3, X3_0);

            X0_3 = RotateLeft16(X0_3);
            X1_3 = RotateLeft16(X1_3);
            X2_3 = RotateLeft16(X2_3);
            X3_3 = RotateLeft16(X3_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);
            X1_2 = _mm256_add_epi32(X1_2, X1_3);
            X2_2 = _mm256_add_epi32(X2_2, X2_3);
            X3_2 = _mm256_add_epi32(X3_2, X3_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);
            X1_1 = _mm256_xor_si256(X1_1, X1_2);
            X2_1 = _mm256_xor_si256(X2_1, X2_2);
            X3_1 = _mm256_xor_si256(X3_1, X3_2);

            X0_1 = RotateLeft12(X0_1);
            X1_1 = RotateLeft12(X1_1);
            X2_1 = RotateLeft12(X2_1);
            X3_1 = RotateLeft12(X3_1);

            X0_0 = _mm256_add_epi32(X0_0, X0_1);
            X1_0 = _mm256_add_epi32(X1_0, X1_1);
            X2_0 = _mm256_add_epi32(X2_0, X2_1);
            X3_0 = _mm256_add_epi32(X3_0, X3_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);
            X1_3 = _mm256_xor_si256(X1_3, X1_0);
            X2_3 = _mm256_xor_si256(X2_3, X2_0);
            X3_3 = _mm256_xor_si256(X3_3, X3_0);

            X0_3 = RotateLeft8(X0_3);
            X1_3 = RotateLeft8(X1_3);
            X2_3 = RotateLeft8(X2_3);
            X3_3 = RotateLeft8(X3_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);
            X1_2 = _mm256_add_epi32(X1_2, X1_3);
            X2_2 = _mm256_add_epi32(X2_2, X2_3);
            X3_2 = _mm256_add_epi32(X3_2, X3_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);
            X1_1 = _mm256_xor_si256(X1_1, X1_2);
            X2_1 = _mm256_xor_si256(X2_1, X2_2);
            X3_1 = _mm256_xor_si256(X3_1, X3_2);

            X0_1 = RotateLeft7(X0_1);
            X1_1 = RotateLeft7(X1_1);
            X2_1 = RotateLeft7(X2_1);
            X3_1 = RotateLeft7(X3_1);

            /* Rotate rows 1-3 so the next round operates on diagonals. */
            X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));
            X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
            X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));

            X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1));
            X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
            X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3));

            X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1));
            X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
            X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3));

            X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1));
            X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
            X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3));

            /* --- diagonal round (same quarter-round on rotated rows) --- */
            X0_0 = _mm256_add_epi32(X0_0, X0_1);
            X1_0 = _mm256_add_epi32(X1_0, X1_1);
            X2_0 = _mm256_add_epi32(X2_0, X2_1);
            X3_0 = _mm256_add_epi32(X3_0, X3_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);
            X1_3 = _mm256_xor_si256(X1_3, X1_0);
            X2_3 = _mm256_xor_si256(X2_3, X2_0);
            X3_3 = _mm256_xor_si256(X3_3, X3_0);

            X0_3 = RotateLeft16(X0_3);
            X1_3 = RotateLeft16(X1_3);
            X2_3 = RotateLeft16(X2_3);
            X3_3 = RotateLeft16(X3_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);
            X1_2 = _mm256_add_epi32(X1_2, X1_3);
            X2_2 = _mm256_add_epi32(X2_2, X2_3);
            X3_2 = _mm256_add_epi32(X3_2, X3_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);
            X1_1 = _mm256_xor_si256(X1_1, X1_2);
            X2_1 = _mm256_xor_si256(X2_1, X2_2);
            X3_1 = _mm256_xor_si256(X3_1, X3_2);

            X0_1 = RotateLeft12(X0_1);
            X1_1 = RotateLeft12(X1_1);
            X2_1 = RotateLeft12(X2_1);
            X3_1 = RotateLeft12(X3_1);

            X0_0 = _mm256_add_epi32(X0_0, X0_1);
            X1_0 = _mm256_add_epi32(X1_0, X1_1);
            X2_0 = _mm256_add_epi32(X2_0, X2_1);
            X3_0 = _mm256_add_epi32(X3_0, X3_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);
            X1_3 = _mm256_xor_si256(X1_3, X1_0);
            X2_3 = _mm256_xor_si256(X2_3, X2_0);
            X3_3 = _mm256_xor_si256(X3_3, X3_0);

            X0_3 = RotateLeft8(X0_3);
            X1_3 = RotateLeft8(X1_3);
            X2_3 = RotateLeft8(X2_3);
            X3_3 = RotateLeft8(X3_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);
            X1_2 = _mm256_add_epi32(X1_2, X1_3);
            X2_2 = _mm256_add_epi32(X2_2, X2_3);
            X3_2 = _mm256_add_epi32(X3_2, X3_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);
            X1_1 = _mm256_xor_si256(X1_1, X1_2);
            X2_1 = _mm256_xor_si256(X2_1, X2_2);
            X3_1 = _mm256_xor_si256(X3_1, X3_2);

            X0_1 = RotateLeft7(X0_1);
            X1_1 = RotateLeft7(X1_1);
            X2_1 = RotateLeft7(X2_1);
            X3_1 = RotateLeft7(X3_1);

            /* Undo the diagonalization (inverse row rotations). */
            X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));
            X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
            X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));

            X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3));
            X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
            X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1));

            X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3));
            X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
            X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1));

            X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3));
            X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
            X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1));
        }

        /* Feed-forward: add the initial state (incl. per-lane counters). */
        X0_0 = _mm256_add_epi32(X0_0, state0);
        X0_1 = _mm256_add_epi32(X0_1, state1);
        X0_2 = _mm256_add_epi32(X0_2, state2);
        X0_3 = _mm256_add_epi32(X0_3, state3);
        X0_3 = _mm256_add_epi32(X0_3, CTR0);

        X1_0 = _mm256_add_epi32(X1_0, state0);
        X1_1 = _mm256_add_epi32(X1_1, state1);
        X1_2 = _mm256_add_epi32(X1_2, state2);
        X1_3 = _mm256_add_epi32(X1_3, state3);
        X1_3 = _mm256_add_epi32(X1_3, CTR1);

        X2_0 = _mm256_add_epi32(X2_0, state0);
        X2_1 = _mm256_add_epi32(X2_1, state1);
        X2_2 = _mm256_add_epi32(X2_2, state2);
        X2_3 = _mm256_add_epi32(X2_3, state3);
        X2_3 = _mm256_add_epi32(X2_3, CTR2);

        X3_0 = _mm256_add_epi32(X3_0, state0);
        X3_1 = _mm256_add_epi32(X3_1, state1);
        X3_2 = _mm256_add_epi32(X3_2, state2);
        X3_3 = _mm256_add_epi32(X3_3, state3);
        X3_3 = _mm256_add_epi32(X3_3, CTR3);

        /* Regroup lane halves into contiguous blocks: imm 0x31 (1+(3<<4))
           pairs the HIGH lanes (blocks counter+0..3, first 256 bytes);
           imm 0x20 (0+(2<<4)) pairs the LOW lanes (blocks counter+4..7). */
        if (In)
        {
            _mm256_storeu_si256((__m256i *)(CurrentOut + 0 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 0 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 1 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 1 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 2 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 2 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 3 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 3 * 32))));

            _mm256_storeu_si256((__m256i *)(CurrentOut + 4 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 4 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 5 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 5 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 6 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 6 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 7 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 7 * 32))));

            _mm256_storeu_si256((__m256i *)(CurrentOut + 8 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 8 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 9 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 9 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 10 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 10 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 11 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 11 * 32))));

            _mm256_storeu_si256((__m256i *)(CurrentOut + 12 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 12 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 13 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 13 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 14 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 14 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 15 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 15 * 32))));
        }
        else
        {
            /* Keystream-only path: store without XOR. */
            _mm256_storeu_si256((__m256i *)(CurrentOut + 0 * 32),
                                _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 1 * 32),
                                _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 2 * 32),
                                _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 3 * 32),
                                _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)));

            _mm256_storeu_si256((__m256i *)(CurrentOut + 4 * 32),
                                _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 5 * 32),
                                _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 6 * 32),
                                _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 7 * 32),
                                _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)));

            _mm256_storeu_si256((__m256i *)(CurrentOut + 8 * 32),
                                _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 9 * 32),
                                _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 10 * 32),
                                _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 11 * 32),
                                _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)));

            _mm256_storeu_si256((__m256i *)(CurrentOut + 12 * 32),
                                _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 13 * 32),
                                _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 14 * 32),
                                _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 15 * 32),
                                _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)));
        }

        ChaCha20AddCounter(state, 8); /* consumed 8 blocks */
        if (CurrentIn)
            CurrentIn += 512;
        CurrentOut += 512;
    }

    if (RemainingBytes == 0)
        return;

    /* Tail path: one register set = 2 blocks (128 bytes) per pass.
       High lane = counter+0, low lane = counter+1. */
    CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1);

    while (1)
    {

        const __m256i state3 = _mm256_broadcastsi128_si256(
            _mm_load_si128((const __m128i *)(state + 32)));

        __m256i X0_0 = state0;
        __m256i X0_1 = state1;
        __m256i X0_2 = state2;
        __m256i X0_3 = _mm256_add_epi32(state3, CTR0);

        for (int i = rounds; i > 0; i -= 2)
        {
            /* column round */
            X0_0 = _mm256_add_epi32(X0_0, X0_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);

            X0_3 = RotateLeft16(X0_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);

            X0_1 = RotateLeft12(X0_1);

            X0_0 = _mm256_add_epi32(X0_0, X0_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);

            X0_3 = RotateLeft8(X0_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);

            X0_1 = RotateLeft7(X0_1);

            X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));
            X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
            X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));

            /* diagonal round */
            X0_0 = _mm256_add_epi32(X0_0, X0_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);

            X0_3 = RotateLeft16(X0_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);

            X0_1 = RotateLeft12(X0_1);

            X0_0 = _mm256_add_epi32(X0_0, X0_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);

            X0_3 = RotateLeft8(X0_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);

            X0_1 = RotateLeft7(X0_1);

            X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));
            X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
            X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));
        }

        X0_0 = _mm256_add_epi32(X0_0, state0);
        X0_1 = _mm256_add_epi32(X0_1, state1);
        X0_2 = _mm256_add_epi32(X0_2, state2);
        X0_3 = _mm256_add_epi32(X0_3, state3);
        X0_3 = _mm256_add_epi32(X0_3, CTR0);

        if (RemainingBytes >= 128)
        {
            if (In)
            {
                _mm256_storeu_si256((__m256i *)(CurrentOut + 0 * 32),
                                    _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)),
                                                     _mm256_loadu_si256((const __m256i *)(CurrentIn + 0 * 32))));
                _mm256_storeu_si256((__m256i *)(CurrentOut + 1 * 32),
                                    _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)),
                                                     _mm256_loadu_si256((const __m256i *)(CurrentIn + 1 * 32))));
                _mm256_storeu_si256((__m256i *)(CurrentOut + 2 * 32),
                                    _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)),
                                                     _mm256_loadu_si256((const __m256i *)(CurrentIn + 2 * 32))));
                _mm256_storeu_si256((__m256i *)(CurrentOut + 3 * 32),
                                    _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)),
                                                     _mm256_loadu_si256((const __m256i *)(CurrentIn + 3 * 32))));
            }
            else
            {
                _mm256_storeu_si256((__m256i *)(CurrentOut + 0 * 32),
                                    _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));
                _mm256_storeu_si256((__m256i *)(CurrentOut + 1 * 32),
                                    _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));
                _mm256_storeu_si256((__m256i *)(CurrentOut + 2 * 32),
                                    _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));
                _mm256_storeu_si256((__m256i *)(CurrentOut + 3 * 32),
                                    _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));
            }
            ChaCha20AddCounter(state, 2);
            RemainingBytes -= 128;
            if (RemainingBytes == 0)
                return;
            if (CurrentIn)
                CurrentIn += 128;
            CurrentOut += 128;
            continue;
        }
        else // last, partial block
        {
            /* Emit the (up to) four 32-byte chunks one by one; the counter
               bump is 1 while still inside block counter+0 and 2 once the
               second block's bytes (chunks 2-3) have been touched. */
            __m256i tmp;
            if (In) // encrypt
            {
                tmp = _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4));
                if (RemainingBytes < 32)
                {
                    PartialXor(tmp, CurrentIn, CurrentOut, RemainingBytes);
                    ChaCha20AddCounter(state, 1);
                    return;
                }
                _mm256_storeu_si256((__m256i *)(CurrentOut), _mm256_xor_si256(tmp, _mm256_loadu_si256((const __m256i *)(CurrentIn))));
                RemainingBytes -= 32;
                if (RemainingBytes == 0)
                {
                    ChaCha20AddCounter(state, 1);
                    return;
                }

                CurrentIn += 32;
                CurrentOut += 32;

                tmp = _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4));
                if (RemainingBytes < 32)
                {
                    PartialXor(tmp, CurrentIn, CurrentOut, RemainingBytes);
                    ChaCha20AddCounter(state, 1);
                    return;
                }
                _mm256_storeu_si256((__m256i *)(CurrentOut), _mm256_xor_si256(tmp, _mm256_loadu_si256((const __m256i *)(CurrentIn))));
                RemainingBytes -= 32;
                if (RemainingBytes == 0)
                {
                    ChaCha20AddCounter(state, 1);
                    return;
                }
                CurrentIn += 32;
                CurrentOut += 32;

                tmp = _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4));
                if (RemainingBytes < 32)
                {
                    PartialXor(tmp, CurrentIn, CurrentOut, RemainingBytes);
                    ChaCha20AddCounter(state, 2);
                    return;
                }
                _mm256_storeu_si256((__m256i *)(CurrentOut), _mm256_xor_si256(tmp, _mm256_loadu_si256((const __m256i *)(CurrentIn))));
                RemainingBytes -= 32;
                if (RemainingBytes == 0)
                {
                    ChaCha20AddCounter(state, 2);
                    return;
                }
                CurrentIn += 32;
                CurrentOut += 32;

                tmp = _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4));
                PartialXor(tmp, CurrentIn, CurrentOut, RemainingBytes);
                ChaCha20AddCounter(state, 2);
                return;
            }
            else
            {
                /* keystream-only tail */
                tmp = _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4));
                if (RemainingBytes < 32)
                {
                    PartialStore(tmp, CurrentOut, RemainingBytes);
                    ChaCha20AddCounter(state, 1);
                    return;
                }
                _mm256_storeu_si256((__m256i *)(CurrentOut), tmp);
                RemainingBytes -= 32;
                if (RemainingBytes == 0)
                {
                    ChaCha20AddCounter(state, 1);
                    return;
                }
                CurrentOut += 32;

                tmp = _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4));

                if (RemainingBytes < 32)
                {
                    PartialStore(tmp, CurrentOut, RemainingBytes);
                    ChaCha20AddCounter(state, 1);
                    return;
                }
                _mm256_storeu_si256((__m256i *)(CurrentOut), tmp);
                RemainingBytes -= 32;
                if (RemainingBytes == 0)
                {
                    ChaCha20AddCounter(state, 1);
                    return;
                }
                CurrentOut += 32;

                tmp = _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4));
                if (RemainingBytes < 32)
                {
                    PartialStore(tmp, CurrentOut, RemainingBytes);
                    ChaCha20AddCounter(state, 2);
                    return;
                }
                _mm256_storeu_si256((__m256i *)(CurrentOut), tmp);
                RemainingBytes -= 32;
                if (RemainingBytes == 0)
                {
                    ChaCha20AddCounter(state, 2);
                    return;
                }
                CurrentOut += 32;

                tmp = _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4));
                PartialStore(tmp, CurrentOut, RemainingBytes);
                ChaCha20AddCounter(state, 2);
                return;
            }
        }
    }
}

/*
 * chacha_encrypt_avx2 — one-shot ChaCha encryption/keystream entry point
 * (AVX2 path).  Parameters mirror chacha_encrypt_sse2: in == NULL emits
 * raw keystream.  The 48-byte state is key(32) || counter+nonce(16), with
 * the counter starting at 0.
 */
void chacha_encrypt_avx2(uint8_t *key, uint8_t *nonce, uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds)
{
    /* FIX: ChaCha20EncryptBytes reads this buffer with ALIGNED loads
       (_mm_load_si128 at state, state+16, state+32).  A plain uint8_t
       array only guarantees 1-byte alignment, so a misaligned stack slot
       would fault (vmovdqa requires 16-byte alignment).  _Alignas(16)
       makes the precondition hold by construction. */
    _Alignas(16) uint8_t state[48] = {0};
    ChaCha20SetKey(state, key);
    ChaCha20SetNonce(state, nonce);
    ChaCha20EncryptBytes(state, in, out, bytes, rounds);
}