├── C ├── .gitignore ├── README.md ├── ChaCha20-SIMD │ ├── chacha20_dispatch.c │ ├── LICENSE │ ├── chacha20.h │ ├── cpu_features.h │ ├── cpu_features.c │ ├── chacha20.c │ ├── chacha20_sse2.c │ └── chacha20_avx2.c ├── CMakeLists.txt ├── xelis_hash_v2.c └── xelis_hash_v3.c ├── .gitignore ├── .cargo └── config.toml ├── .gitmodules ├── go ├── aes │ ├── aes_generic.go │ ├── aes_amd64.s │ ├── aes_amd64.go │ ├── aes_test.go │ └── aes.go ├── go.mod ├── hash │ └── hash.go ├── xelis_hash.go ├── go.sum ├── v1 │ ├── v1_test.go │ ├── v1.go │ └── keccak.go ├── v2 │ ├── v2_test.go │ └── v2.go └── v3 │ ├── v3_test.go │ └── v3.go ├── benches ├── v2.rs ├── v1.rs └── v3.rs ├── src ├── lib.rs ├── scratchpad.rs ├── tracker.rs ├── v1.rs ├── v3.rs └── v2.rs ├── LICENSE ├── Cargo.toml └── README.md /C/.gitignore: -------------------------------------------------------------------------------- 1 | /build -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | *.png 3 | .vscode/ -------------------------------------------------------------------------------- /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | rustflags = ["-C", "target-cpu=native"] -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "C/BLAKE3"] 2 | path = C/BLAKE3 3 | url = https://github.com/BLAKE3-team/BLAKE3.git 4 | -------------------------------------------------------------------------------- /C/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | git clone --recursive https://github.com/xelis-project/xelis-hash.git 3 | cd xelis-hash/C 4 | mkdir build && cd build 5 | cmake .. 
6 | make 7 | ``` -------------------------------------------------------------------------------- /go/aes/aes_generic.go: -------------------------------------------------------------------------------- 1 | //go:build !amd64 || purego 2 | 3 | package aes 4 | 5 | // CipherRound performs a single AES round using software implementation 6 | func CipherRound(block *[16]byte, key *[16]byte) { 7 | CipherRoundGeneric(block, key) 8 | } 9 | -------------------------------------------------------------------------------- /go/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/xelis-project/xelis-hash/go 2 | 3 | go 1.24.0 4 | 5 | toolchain go1.24.11 6 | 7 | require ( 8 | github.com/chocolatkey/chacha8 v0.0.0-20200308092524-06a0ce7f6716 9 | golang.org/x/sys v0.38.0 10 | lukechampine.com/blake3 v1.2.1 11 | lukechampine.com/uint128 v1.3.0 12 | ) 13 | 14 | require ( 15 | github.com/klauspost/cpuid/v2 v2.0.9 // indirect 16 | ) 17 | -------------------------------------------------------------------------------- /go/aes/aes_amd64.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // func aesCipherRoundAsm(block *[16]byte, key *[16]byte) 4 | TEXT ·aesCipherRoundAsm(SB), NOSPLIT, $0-16 5 | MOVQ block+0(FP), AX 6 | MOVQ key+8(FP), BX 7 | 8 | // Load block into XMM0 9 | MOVOU (AX), X0 10 | 11 | // Load key into XMM1 12 | MOVOU (BX), X1 13 | 14 | // Perform AES round: AESENC = SubBytes + ShiftRows + MixColumns + AddRoundKey 15 | AESENC X1, X0 16 | 17 | // Store result back to block 18 | MOVOU X0, (AX) 19 | RET 20 | -------------------------------------------------------------------------------- /go/aes/aes_amd64.go: -------------------------------------------------------------------------------- 1 | //go:build amd64 && !purego 2 | 3 | package aes 4 | 5 | import ( 6 | "golang.org/x/sys/cpu" 7 | ) 8 | 9 | // hasAESNI indicates whether AES-NI instructions are 
available 10 | var hasAESNI = cpu.X86.HasAES 11 | 12 | // CipherRound performs a single AES round using AES-NI instructions if available 13 | func CipherRound(block *[16]byte, key *[16]byte) { 14 | if hasAESNI { 15 | aesCipherRoundAsm(block, key) 16 | } else { 17 | CipherRoundGeneric(block, key) 18 | } 19 | } 20 | 21 | // aesCipherRoundAsm is implemented in assembly using AES-NI 22 | func aesCipherRoundAsm(block *[16]byte, key *[16]byte) 23 | -------------------------------------------------------------------------------- /benches/v2.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | use xelis_hash::v2::{xelis_hash, ScratchPad}; 3 | 4 | const FIXED_INPUT: &[u8] = b"Hello World from xelis hash v2!"; 5 | 6 | fn bench_zero_input(c: &mut Criterion) { 7 | let mut scratch_pad = ScratchPad::default(); 8 | let input = [0u8; 112]; 9 | c.bench_function("v2::zero_input", |b| b.iter(|| xelis_hash(&input, &mut scratch_pad))); 10 | } 11 | 12 | fn bench_fixed_input(c: &mut Criterion) { 13 | let mut scratch_pad = ScratchPad::default(); 14 | c.bench_function("v2::fixed_input", |b| b.iter(|| xelis_hash(FIXED_INPUT, &mut scratch_pad))); 15 | } 16 | 17 | criterion_group!(benches, bench_zero_input, bench_fixed_input); 18 | criterion_main!(benches); -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error as ThisError; 2 | 3 | #[cfg(feature = "v1")] 4 | pub mod v1; 5 | #[cfg(feature = "v2")] 6 | pub mod v2; 7 | #[cfg(feature = "v3")] 8 | pub mod v3; 9 | 10 | pub mod scratchpad; 11 | 12 | #[cfg(feature = "tracker")] 13 | pub mod tracker; 14 | 15 | // Number of bytes in a hash 16 | const HASH_SIZE: usize = 32; 17 | 18 | // Hash type alias 19 | pub type Hash = [u8; HASH_SIZE]; 20 | 21 | // Error that can occur while hashing 22 | #[derive(Debug, 
ThisError)] 23 | #[error("Error while hashing")] 24 | pub enum Error { 25 | #[error("Error while hashing")] 26 | Error, 27 | #[error("Error while casting: {0}")] 28 | CastError(bytemuck::PodCastError), 29 | #[error("Error on format")] 30 | FormatError, 31 | } 32 | 33 | -------------------------------------------------------------------------------- /C/ChaCha20-SIMD/chacha20_dispatch.c: -------------------------------------------------------------------------------- 1 | #include "cpu_features.h" 2 | #include "chacha20.h" 3 | 4 | static bool initialized = false; 5 | static cpu_features_t f; 6 | 7 | void chacha_init() 8 | { 9 | get_cpu_features(&f); 10 | 11 | initialized = true; 12 | } 13 | 14 | void chacha_encrypt(uint8_t *key, uint8_t *nonce, uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds) 15 | { 16 | 17 | if (!initialized) 18 | { 19 | chacha_init(); 20 | } 21 | 22 | if (f.HW_AVX2) 23 | { 24 | chacha_encrypt_avx2(key, nonce, in, out, bytes, rounds); 25 | return; 26 | } 27 | 28 | if (f.HW_SSE2) 29 | { 30 | chacha_encrypt_sse2(key, nonce, in, out, bytes, rounds); 31 | return; 32 | } 33 | 34 | chacha_encrypt_portable(key, nonce, in, out, bytes, rounds); 35 | } -------------------------------------------------------------------------------- /benches/v1.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | use xelis_hash::v1::{xelis_hash, ScratchPad}; 3 | 4 | const FIXED_INPUT: &[u8] = b"Hello World from xelis hash v1!"; 5 | 6 | fn bench_zero_input(c: &mut Criterion) { 7 | let mut scratch_pad = ScratchPad::default(); 8 | let input = [0u8; 200]; 9 | c.bench_function("v1::zero_input", |b| b.iter(|| xelis_hash(&mut input.clone(), &mut scratch_pad))); 10 | } 11 | 12 | fn bench_fixed_input(c: &mut Criterion) { 13 | let mut scratch_pad = ScratchPad::default(); 14 | let mut input = [0u8; 200]; 15 | input[0..FIXED_INPUT.len()].copy_from_slice(FIXED_INPUT); 16 | 17 | 
c.bench_function("v1::fixed_input", |b| b.iter(|| xelis_hash(&mut input.clone(), &mut scratch_pad))); 18 | } 19 | 20 | criterion_group!(benches, bench_zero_input, bench_fixed_input); 21 | criterion_main!(benches); -------------------------------------------------------------------------------- /go/hash/hash.go: -------------------------------------------------------------------------------- 1 | package hash 2 | 3 | import ( 4 | "encoding/hex" 5 | "errors" 6 | ) 7 | 8 | const HashSize = 32 9 | 10 | var ErrInvalidHashLength = errors.New("invalid hash length") 11 | 12 | type Hash [HashSize]byte 13 | 14 | func Zero() Hash { 15 | return Hash{} 16 | } 17 | 18 | func NewHash(input [HashSize]byte) Hash { 19 | return Hash(input) 20 | } 21 | 22 | func (h *Hash) Bytes() []byte { 23 | return h[:] 24 | } 25 | 26 | func (h *Hash) String() string { 27 | return hex.EncodeToString(h[:]) 28 | } 29 | 30 | func FromBytes(data []byte) (Hash, error) { 31 | var h Hash 32 | if len(data) != HashSize { 33 | return h, ErrInvalidHashLength 34 | } 35 | copy(h[:], data) 36 | return h, nil 37 | } 38 | 39 | func FromString(s string) (Hash, error) { 40 | var h Hash 41 | data, err := hex.DecodeString(s) 42 | if err != nil { 43 | return h, err 44 | } 45 | return FromBytes(data) 46 | } 47 | -------------------------------------------------------------------------------- /go/xelis_hash.go: -------------------------------------------------------------------------------- 1 | package xelis_hash 2 | 3 | import ( 4 | "errors" 5 | 6 | hash "github.com/xelis-project/xelis-hash/go/hash" 7 | v1 "github.com/xelis-project/xelis-hash/go/v1" 8 | v2 "github.com/xelis-project/xelis-hash/go/v2" 9 | v3 "github.com/xelis-project/xelis-hash/go/v3" 10 | ) 11 | 12 | func HashV1(input []byte) (hash.Hash, error) { 13 | var padded [v1.BytesArrayInput]byte 14 | if len(input) <= v1.BytesArrayInput { 15 | copy(padded[:], input) 16 | } else { 17 | return hash.Zero(), errors.New("input too long for v1 hash (max 120 bytes)") 18 | 
} 19 | 20 | scratchPad := v1.NewScratchPad() 21 | return v1.XelisHash(&padded, scratchPad) 22 | } 23 | 24 | func HashV2(input []byte) (hash.Hash, error) { 25 | scratchPad := v2.NewScratchPad() 26 | return v2.XelisHash(input, scratchPad) 27 | } 28 | 29 | func HashV3(input []byte) (hash.Hash, error) { 30 | scratchPad := v3.NewScratchPad() 31 | return v3.XelisHash(input, scratchPad) 32 | } 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 XELIS 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /C/ChaCha20-SIMD/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Yury Myakotin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "xelis-hash" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | # Common dependencies 10 | thiserror = "1.0.58" 11 | aes = { version = "0.8.3", features = ["hazmat"] } 12 | bytemuck = { version = "1.15.0", features = ["derive"] } 13 | 14 | # v1 features 15 | tiny-keccak = { version = "2.0", features = ["k12"], optional = true } 16 | 17 | # v2 features 18 | blake3 = { version = "1.5.1", optional = true } 19 | chacha20 = { version = "0.9.1", optional = true } 20 | plotters = { version = "0.3.7", optional = true } 21 | anyhow = { version = "1", optional = true } 22 | 23 | [dev-dependencies] 24 | rand = "0.8.5" 25 | criterion = "0.5.1" 26 | 27 | [features] 28 | default = ["v1", "v2", "v3"] 29 | v1 = ["dep:tiny-keccak"] 30 | v2 = ["dep:blake3", "dep:chacha20"] 31 | v3 = ["v2"] 32 | # Only available in v2/v3 33 | tracker = ["dep:plotters", "dep:anyhow"] 34 | 35 | [[bench]] 36 | name = "v1" 37 | harness = false 38 | required-features = ["v1"] 39 | 40 | [[bench]] 41 | name = "v2" 42 | harness = false 43 | required-features = ["v2"] 44 | 45 | [[bench]] 46 | name = "v3" 47 | harness = false 48 | required-features = ["v3"] 49 | -------------------------------------------------------------------------------- /C/ChaCha20-SIMD/chacha20.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef CHACHA20_H 3 | #define CHACHA20_H 4 | 5 | #include 6 | #include 7 | 8 | #define ChaCha20StateSizeBytes 48; 9 | #define ChaCha20KeySizeByte 32 10 | #define ChaCha20NonceSizeByte 12 11 | #define ChaCha20CounterSizeByte 4 12 | 13 | #ifdef __cplusplus 14 | extern "C" 15 | { 16 | #endif 17 | 18 | void ChaCha20SetKey(uint8_t *state, 
const uint8_t *Key); 19 | void ChaCha20SetNonce(uint8_t *state, const uint8_t *Nonce); 20 | // void ChaCha20SetCtr(uint8_t *state, const uint8_t *Ctr); 21 | // void ChaCha20EncryptBytes(uint8_t *state, uint8_t *In, uint8_t *Out, size_t Size, uint32_t rounds); // if In=nullptr - just fill Out 22 | void ChaCha20IncrementNonce(uint8_t *state); 23 | void ChaCha20AddCounter(uint8_t *ChaCha, const uint32_t value_to_add); 24 | 25 | void chacha_encrypt_portable(uint8_t *key, uint8_t *nonce, uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds); 26 | void chacha_encrypt_sse2(uint8_t *key, uint8_t *nonce, uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds); 27 | void chacha_encrypt_avx2(uint8_t *key, uint8_t *nonce, uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds); 28 | void chacha_encrypt(uint8_t *key, uint8_t *nonce, uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds); 29 | 30 | #ifdef __cplusplus 31 | } 32 | #endif 33 | 34 | #endif // CHACHA20_H -------------------------------------------------------------------------------- /src/scratchpad.rs: -------------------------------------------------------------------------------- 1 | use crate::Error; 2 | 3 | 4 | // Scratchpad used to store intermediate values 5 | // It has a fixed size of `MEMORY_SIZE` u64s 6 | // It can be easily reused for multiple hashing operations safely 7 | #[derive(Debug, Clone)] 8 | pub struct ScratchPad(Box<[u64; M]>); 9 | 10 | impl ScratchPad { 11 | // Retrieve the scratchpad size 12 | #[inline(always)] 13 | pub fn len(&self) -> usize { 14 | self.0.len() 15 | } 16 | 17 | // Get the inner scratch pad as a mutable u64 slice 18 | #[inline(always)] 19 | pub fn as_mut_slice(&mut self) -> &mut [u64; M] { 20 | &mut self.0 21 | } 22 | 23 | // Retrieve the scratch pad as a mutable bytes slice 24 | #[inline(always)] 25 | pub fn as_mut_bytes(&mut self) -> Result<&mut [u8; M_BYTES], Error> { 26 | bytemuck::try_cast_slice_mut(self.as_mut_slice()) 27 | .map_err(|e| Error::CastError(e))? 
28 | .try_into() 29 | .map_err(|_| Error::FormatError) 30 | } 31 | } 32 | 33 | impl Default for ScratchPad { 34 | fn default() -> Self { 35 | Self( 36 | vec![0; M] 37 | .into_boxed_slice() 38 | .try_into() 39 | .expect("Failed generating scratchpad") 40 | ) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /go/aes/aes_test.go: -------------------------------------------------------------------------------- 1 | package aes 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestAESCipherRound(t *testing.T) { 8 | // Test vector for a single AES round 9 | block := [16]byte{0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff} 10 | key := [16]byte{0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f} 11 | 12 | // Test with hardware intrinsics (if available) 13 | blockHW := block 14 | CipherRound(&blockHW, &key) 15 | 16 | // Test with generic implementation 17 | blockSW := block 18 | CipherRoundGeneric(&blockSW, &key) 19 | 20 | // Both should produce the same result 21 | if blockHW != blockSW { 22 | t.Errorf("Hardware and software implementations produce different results\nHW: %x\nSW: %x", blockHW, blockSW) 23 | } 24 | } 25 | 26 | func BenchmarkAESCipherRound(b *testing.B) { 27 | block := [16]byte{0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff} 28 | key := [16]byte{0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f} 29 | 30 | b.Run("Hardware", func(b *testing.B) { 31 | for i := 0; i < b.N; i++ { 32 | CipherRound(&block, &key) 33 | } 34 | }) 35 | 36 | b.Run("Software", func(b *testing.B) { 37 | for i := 0; i < b.N; i++ { 38 | CipherRoundGeneric(&block, &key) 39 | } 40 | }) 41 | } 42 | -------------------------------------------------------------------------------- /C/ChaCha20-SIMD/cpu_features.h: 
-------------------------------------------------------------------------------- 1 | #ifndef CPU_FEATURES_H 2 | #define CPU_FEATURES_H 3 | 4 | #include 5 | 6 | typedef struct cpu_features_t 7 | { 8 | // Misc. 9 | bool HW_MMX; 10 | bool HW_x64; 11 | bool HW_ABM; // Advanced Bit Manipulation 12 | bool HW_RDRAND; 13 | bool HW_BMI1; 14 | bool HW_BMI2; 15 | bool HW_ADX; 16 | bool HW_PREFETCHWT1; 17 | 18 | // SIMD: 128-bit 19 | bool HW_SSE; 20 | bool HW_SSE2; 21 | bool HW_SSE3; 22 | bool HW_SSSE3; 23 | bool HW_SSE41; 24 | bool HW_SSE42; 25 | bool HW_SSE4a; 26 | bool HW_AES; 27 | bool HW_SHA; 28 | 29 | // SIMD: 256-bit 30 | bool HW_AVX; 31 | bool HW_XOP; 32 | bool HW_FMA3; 33 | bool HW_FMA4; 34 | bool HW_AVX2; 35 | 36 | // SIMD: 512-bit 37 | bool HW_AVX512F; // AVX512 Foundation 38 | bool HW_AVX512CD; // AVX512 Conflict Detection 39 | bool HW_AVX512PF; // AVX512 Prefetch 40 | bool HW_AVX512ER; // AVX512 Exponential + Reciprocal 41 | bool HW_AVX512VL; // AVX512 Vector Length Extensions 42 | bool HW_AVX512BW; // AVX512 Byte + Word 43 | bool HW_AVX512DQ; // AVX512 Doubleword + Quadword 44 | bool HW_AVX512IFMA; // AVX512 Integer 52-bit Fused Multiply-Add 45 | bool HW_AVX512VBMI; // AVX512 Vector Byte Manipulation Instructions 46 | } cpu_features_t; 47 | 48 | void get_cpu_features(cpu_features_t *f); 49 | 50 | #endif // CPU_FEATURES_H -------------------------------------------------------------------------------- /C/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | project(xelishash_v3 C) 4 | 5 | set(CMAKE_C_STANDARD 11) 6 | set(CMAKE_C_EXTENSIONS ON) 7 | 8 | if(NOT CMAKE_BUILD_TYPE) 9 | set(CMAKE_BUILD_TYPE Release) 10 | endif() 11 | 12 | add_subdirectory(BLAKE3/c) 13 | 14 | set(SOURCES 15 | ChaCha20-SIMD/chacha20.c 16 | ChaCha20-SIMD/chacha20_sse2.c 17 | ChaCha20-SIMD/chacha20_avx2.c 18 | ChaCha20-SIMD/chacha20_dispatch.c 19 | ChaCha20-SIMD/cpu_features.c 20 | xelis_hash_v3.c 21 
| ) 22 | 23 | # Per-file ISA flags 24 | if(MSVC) 25 | set_source_files_properties(ChaCha20-SIMD/chacha20_avx2.c PROPERTIES COMPILE_FLAGS "/arch:AVX2") 26 | else() 27 | set_source_files_properties(ChaCha20-SIMD/chacha20_sse2.c PROPERTIES COMPILE_FLAGS "-msse2") 28 | set_source_files_properties(ChaCha20-SIMD/chacha20_avx2.c PROPERTIES COMPILE_FLAGS "-mavx2") 29 | endif() 30 | 31 | add_executable(${PROJECT_NAME} ${SOURCES}) 32 | 33 | # Target flags 34 | if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang") 35 | target_compile_options(${PROJECT_NAME} PRIVATE -O3) 36 | if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|i[3-6]86") 37 | target_compile_options(${PROJECT_NAME} PRIVATE -maes -mpclmul) 38 | endif() 39 | elseif(MSVC) 40 | target_compile_options(${PROJECT_NAME} PRIVATE /O2) 41 | endif() 42 | 43 | # Link 44 | if(UNIX AND NOT APPLE) 45 | target_link_libraries(${PROJECT_NAME} PRIVATE blake3 pthread m) 46 | else() 47 | target_link_libraries(${PROJECT_NAME} PRIVATE blake3) 48 | endif() -------------------------------------------------------------------------------- /benches/v3.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion, BatchSize}; 2 | use rand::{Rng, SeedableRng, rngs::StdRng}; 3 | use xelis_hash::v3::*; 4 | 5 | const FIXED_INPUT: &[u8] = b"Hello World from xelis hash v3!"; 6 | 7 | fn bench_zero_input(c: &mut Criterion) { 8 | let mut scratch_pad = ScratchPad::default(); 9 | let input = [0u8; 112]; 10 | c.bench_function("v3::zero_input", |b| b.iter(|| xelis_hash(&input, &mut scratch_pad))); 11 | } 12 | 13 | fn bench_fixed_input(c: &mut Criterion) { 14 | let mut scratch_pad = ScratchPad::default(); 15 | c.bench_function("v3::fixed_input", |b| b.iter(|| xelis_hash(FIXED_INPUT, &mut scratch_pad))); 16 | } 17 | 18 | fn bench_pick_half(c: &mut Criterion) { 19 | let mut rng = StdRng::seed_from_u64(0xDEADBEEFCAFEBABE); 20 | 21 | c.bench_function("v3::pick_half", |b| { 22 | 
b.iter_batched( 23 | || rng.gen::(), 24 | |seed| pick_half(seed), 25 | BatchSize::SmallInput 26 | ) 27 | }); 28 | } 29 | 30 | fn bench_map_index(c: &mut Criterion) { 31 | let mut rng = StdRng::seed_from_u64(0xDEADBEEFCAFEBABE); 32 | 33 | c.bench_function("v3::map_index", |b| { 34 | b.iter_batched( 35 | || rng.gen::(), 36 | |seed| map_index(seed), 37 | BatchSize::SmallInput 38 | ) 39 | }); 40 | } 41 | 42 | criterion_group!(benches, bench_zero_input, bench_fixed_input, bench_pick_half, bench_map_index); 43 | criterion_main!(benches); -------------------------------------------------------------------------------- /go/go.sum: -------------------------------------------------------------------------------- 1 | github.com/chocolatkey/chacha8 v0.0.0-20200308092524-06a0ce7f6716 h1:NSjnwJb5rlX8weAJPotMIFtWSFt4Tjtkjt7nTBil1dA= 2 | github.com/chocolatkey/chacha8 v0.0.0-20200308092524-06a0ce7f6716/go.mod h1:NvCEVATmyDtfApL4hee9mqF2c7+AFTpltRm62q68ppU= 3 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 4 | github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4= 5 | github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= 6 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 7 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 8 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 9 | golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34= 10 | golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc= 11 | golang.org/x/sys v0.0.0-20190902133755-9109b7679e13/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 12 | golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= 13 | golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= 14 | gopkg.in/check.v1 
v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 15 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 16 | lukechampine.com/blake3 v1.2.1 h1:YuqqRuaqsGV71BV/nm9xlI0MKUv4QC54jQnBChWbGnI= 17 | lukechampine.com/blake3 v1.2.1/go.mod h1:0OFRp7fBtAylGVCO40o87sbupkyIGgbpv1+M1k1LM6k= 18 | lukechampine.com/uint128 v1.3.0 h1:cDdUVfRwDUDovz610ABgFD17nXD4/uDgVHl2sC3+sbo= 19 | lukechampine.com/uint128 v1.3.0/go.mod h1:c4eWIwlEGaxC/+H1VguhU4PHXNWDCDMUlWdIWl2j1gk= 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # XELIS Hash 2 | 3 | XELIS Hash is expected to run on CPU and GPUs with a controlled gap. 4 | It is relying on two famous algorithms: ChaCha8 and Blake3. 5 | 6 | ## V3 7 | 8 | Same base as the V2 but with a scratchpad of 544 KB. 9 | 10 | Stage 1 and (Final) stage 4 are the same as V2. 11 | 12 | Stage 3 has been modified to increase memory accesses while having a random memory access pattern. 13 | 14 | ## V2 15 | 16 | New version use a scratchpad of ~440 KB which can be reused at each hash. 17 | 18 | Stage 1 will randomize the scratchpad based on the input used as a key for the ChaCha8 stream cipher. 19 | The key is a Blake3 hash of (previous hash + input chunk). 20 | 21 | First nonce is based on the first 12 bytes of the input's blake3 hash result. 22 | The input is splitted into several 32 bytes chunks padded with zeroes if size is smaller. 23 | It cannot be parallelized due to the nonce based on the previous iteration. 24 | 25 | Stage 2 has been removed because the whole work is now done in stage 3. 26 | 27 | Stage 3 is expected to do a lot of random access in memory while being forced to stay sequential. 28 | There is 4 reads and 2 writes per iteration, making it memory bound. 
29 | A branching part is included in the inner loop to be power-hungry and reduce efficiency of FPGA and GPUs. 30 | 31 | (Final) stage 4 is using Blake3 algorithm to hash the whole scratchpad to give a final good-quality hash. 32 | It is also used to prevent skipping a part of the scratchpad, to force it to be fully computed. 33 | 34 | Blake3 and ChaCha8 are used as they are really fast and can be highly parallelized, one thread can have high hashrate to reduce verification time. 35 | 36 | Expected time per hash is around 1.20-1.50ms. 37 | 38 | ## Features 39 | 40 | - `v1`: deprecated algorithm 41 | - `v2`: new algorithm with 440 KB scratchpad 42 | - `v3`: new algorithm version based on the v2 with bigger scratchpad and others changes 43 | - `tracker`: track branches selection, memory accesses and generate charts for it. -------------------------------------------------------------------------------- /go/v1/v1_test.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | import ( 4 | "slices" 5 | "testing" 6 | ) 7 | 8 | func TestZeroInput(t *testing.T) { 9 | var input [BytesArrayInput]byte 10 | scratchPad := NewScratchPad() 11 | 12 | hash, err := XelisHash(&input, scratchPad) 13 | if err != nil { 14 | t.Fatalf("Hash failed: %v", err) 15 | } 16 | 17 | expected := []byte{ 18 | 0x0e, 0xbb, 0xbd, 0x8a, 0x31, 0xed, 0xad, 0xfe, 0x09, 0x8f, 0x2d, 0x77, 0x0d, 0x84, 19 | 0xb7, 0x19, 0x58, 0x86, 0x75, 0xab, 0x88, 0xa0, 0xa1, 0x70, 0x67, 0xd0, 0x0a, 0x8f, 20 | 0x36, 0x18, 0x22, 0x65} 21 | 22 | if !slices.Equal(expected, hash[:]) { 23 | t.Errorf("Hash mismatch:\nGot: %x\nExpected: %x", hash, expected) 24 | } 25 | t.Logf("Hash: %x", hash) 26 | } 27 | 28 | func TestXelisInput(t *testing.T) { 29 | var input [BytesArrayInput]byte 30 | custom := []byte("xelis-hashing-algorithm") 31 | copy(input[:], custom) 32 | 33 | scratchPad := NewScratchPad() 34 | hash, err := XelisHash(&input, scratchPad) 35 | if err != nil { 36 | 
t.Fatalf("Hash failed: %v", err) 37 | } 38 | 39 | expected := []byte{ 40 | 106, 106, 173, 8, 207, 59, 118, 108, 176, 196, 9, 124, 250, 195, 3, 41 | 61, 30, 146, 238, 182, 88, 83, 115, 81, 139, 56, 3, 28, 176, 86, 68, 21, 42 | } 43 | 44 | if !slices.Equal(expected, hash[:]) { 45 | t.Errorf("Hash mismatch:\nGot: %x\nExpected: %x", hash, expected) 46 | } 47 | 48 | t.Logf("Hash: %x", hash) 49 | } 50 | 51 | func TestScratchPadReuse(t *testing.T) { 52 | var input [BytesArrayInput]byte 53 | scratchPad := NewScratchPad() 54 | 55 | hash1, err := XelisHash(&input, scratchPad) 56 | if err != nil { 57 | t.Fatalf("First hash failed: %v", err) 58 | } 59 | 60 | hash2, err := XelisHash(&input, scratchPad) 61 | if err != nil { 62 | t.Fatalf("Second hash failed: %v", err) 63 | } 64 | 65 | if hash1 != hash2 { 66 | t.Errorf("Hash mismatch:\nGot: %x\nExpected: %x", hash1, hash2) 67 | } 68 | } 69 | 70 | func BenchmarkXelisHashV1(b *testing.B) { 71 | var input [BytesArrayInput]byte 72 | copy(input[:], []byte("benchmark data")) 73 | scratchPad := NewScratchPad() 74 | 75 | b.ResetTimer() 76 | for i := 0; i < b.N; i++ { 77 | _, _ = XelisHash(&input, scratchPad) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /go/v2/v2_test.go: -------------------------------------------------------------------------------- 1 | package v2 2 | 3 | import ( 4 | "slices" 5 | "testing" 6 | ) 7 | 8 | func TestZeroHash(t *testing.T) { 9 | input := make([]byte, 112) 10 | scratchPad := NewScratchPad() 11 | 12 | hash, err := XelisHash(input, scratchPad) 13 | if err != nil { 14 | t.Fatalf("Hash failed: %v", err) 15 | } 16 | 17 | expected := [32]byte{ 18 | 126, 219, 112, 240, 116, 133, 115, 144, 39, 40, 164, 19 | 105, 30, 158, 45, 126, 64, 67, 238, 52, 200, 35, 20 | 161, 19, 144, 211, 214, 225, 95, 190, 146, 27, 21 | } 22 | 23 | if !slices.Equal(expected[:], hash[:]) { 24 | t.Errorf("Hash mismatch:\nGot: %x\nExpected: %x", hash, expected) 25 | } 26 | } 27 | 28 | func 
TestReusedScratchpad(t *testing.T) { 29 | input := make([]byte, 112) 30 | for i := range input { 31 | input[i] = byte(i % 256) 32 | } 33 | 34 | scratchPad := NewScratchPad() 35 | 36 | hash1, err := XelisHash(input, scratchPad) 37 | if err != nil { 38 | t.Fatalf("First hash failed: %v", err) 39 | } 40 | 41 | hash2, err := XelisHash(input, scratchPad) 42 | if err != nil { 43 | t.Fatalf("Second hash failed: %v", err) 44 | } 45 | 46 | for i := range hash1 { 47 | if hash1[i] != hash2[i] { 48 | t.Errorf("Hash mismatch when reusing scratchpad") 49 | break 50 | } 51 | } 52 | } 53 | 54 | func TestVerifyOutput(t *testing.T) { 55 | input := []byte{ 56 | 172, 236, 108, 212, 181, 31, 109, 45, 44, 242, 54, 225, 143, 133, 57 | 89, 44, 179, 108, 39, 191, 32, 116, 229, 33, 63, 130, 33, 120, 185, 89, 58 | 146, 141, 10, 79, 183, 107, 238, 122, 92, 222, 25, 134, 90, 107, 116, 59 | 110, 236, 53, 255, 5, 214, 126, 24, 216, 97, 199, 148, 239, 253, 102, 60 | 199, 184, 232, 253, 158, 145, 86, 187, 112, 81, 78, 70, 80, 110, 33, 61 | 37, 159, 233, 198, 1, 178, 108, 210, 100, 109, 155, 106, 124, 124, 83, 62 | 89, 50, 197, 115, 231, 32, 74, 2, 92, 47, 25, 220, 135, 249, 122, 63 | 172, 220, 137, 143, 234, 68, 188, 64 | } 65 | 66 | scratchPad := NewScratchPad() 67 | hash, err := XelisHash(input, scratchPad) 68 | if err != nil { 69 | t.Fatalf("Hash failed: %v", err) 70 | } 71 | 72 | expected := []byte{ 73 | 199, 114, 154, 28, 4, 164, 196, 178, 117, 17, 148, 74 | 203, 125, 228, 51, 145, 162, 222, 106, 202, 205, 75 | 55, 244, 178, 94, 29, 248, 242, 98, 221, 158, 179, 76 | } 77 | 78 | if !slices.Equal(expected, hash[:]) { 79 | t.Errorf("Hash mismatch:\nGot: %x\nExpected: %x", hash, expected) 80 | } 81 | } 82 | 83 | func BenchmarkXelisHashV2(b *testing.B) { 84 | input := make([]byte, 112) 85 | scratchPad := NewScratchPad() 86 | 87 | b.ResetTimer() 88 | for i := 0; i < b.N; i++ { 89 | _, _ = XelisHash(input, scratchPad) 90 | } 91 | } 92 | 
-------------------------------------------------------------------------------- /C/ChaCha20-SIMD/cpu_features.c: -------------------------------------------------------------------------------- 1 | // CPU features set 2 | #include "cpu_features.h" 3 | 4 | static void cpuid(int cpuinfo[4], int info_type) 5 | { 6 | __asm__ __volatile__( 7 | "cpuid" : "=a"(cpuinfo[0]), 8 | "=b"(cpuinfo[1]), 9 | "=c"(cpuinfo[2]), 10 | "=d"(cpuinfo[3]) : "a"(info_type), "c"(0)); 11 | } 12 | 13 | void get_cpu_features(cpu_features_t *f) 14 | { 15 | int info[4]; 16 | cpuid(info, 0); 17 | int nIds = info[0]; 18 | 19 | cpuid(info, 0x80000000); 20 | unsigned nExIds = info[0]; 21 | 22 | // Detect Features 23 | if (nIds >= 0x00000001) 24 | { 25 | cpuid(info, 0x00000001); 26 | f->HW_MMX = (info[3] & ((int)1 << 23)) != 0; 27 | f->HW_SSE = (info[3] & ((int)1 << 25)) != 0; 28 | f->HW_SSE2 = (info[3] & ((int)1 << 26)) != 0; 29 | f->HW_SSE3 = (info[2] & ((int)1 << 0)) != 0; 30 | 31 | f->HW_SSSE3 = (info[2] & ((int)1 << 9)) != 0; 32 | f->HW_SSE41 = (info[2] & ((int)1 << 19)) != 0; 33 | f->HW_SSE42 = (info[2] & ((int)1 << 20)) != 0; 34 | f->HW_AES = (info[2] & ((int)1 << 25)) != 0; 35 | 36 | f->HW_AVX = (info[2] & ((int)1 << 28)) != 0; 37 | f->HW_FMA3 = (info[2] & ((int)1 << 12)) != 0; 38 | 39 | f->HW_RDRAND = (info[2] & ((int)1 << 30)) != 0; 40 | } 41 | if (nIds >= 0x00000007) 42 | { 43 | cpuid(info, 0x00000007); 44 | f->HW_AVX2 = (info[1] & ((int)1 << 5)) != 0; 45 | 46 | f->HW_BMI1 = (info[1] & ((int)1 << 3)) != 0; 47 | f->HW_BMI2 = (info[1] & ((int)1 << 8)) != 0; 48 | f->HW_ADX = (info[1] & ((int)1 << 19)) != 0; 49 | f->HW_SHA = (info[1] & ((int)1 << 29)) != 0; 50 | f->HW_PREFETCHWT1 = (info[2] & ((int)1 << 0)) != 0; 51 | 52 | f->HW_AVX512F = (info[1] & ((int)1 << 16)) != 0; 53 | f->HW_AVX512CD = (info[1] & ((int)1 << 28)) != 0; 54 | f->HW_AVX512PF = (info[1] & ((int)1 << 26)) != 0; 55 | f->HW_AVX512ER = (info[1] & ((int)1 << 27)) != 0; 56 | f->HW_AVX512VL = (info[1] & ((int)1 << 31)) != 0; 57 | 
f->HW_AVX512BW = (info[1] & ((int)1 << 30)) != 0; 58 | f->HW_AVX512DQ = (info[1] & ((int)1 << 17)) != 0; 59 | f->HW_AVX512IFMA = (info[1] & ((int)1 << 21)) != 0; 60 | f->HW_AVX512VBMI = (info[2] & ((int)1 << 1)) != 0; 61 | } 62 | if (nExIds >= 0x80000001) 63 | { 64 | cpuid(info, 0x80000001); 65 | f->HW_x64 = (info[3] & ((int)1 << 29)) != 0; 66 | f->HW_ABM = (info[2] & ((int)1 << 5)) != 0; 67 | f->HW_SSE4a = (info[2] & ((int)1 << 6)) != 0; 68 | f->HW_FMA4 = (info[2] & ((int)1 << 16)) != 0; 69 | f->HW_XOP = (info[2] & ((int)1 << 11)) != 0; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /go/aes/aes.go: -------------------------------------------------------------------------------- 1 | package aes 2 | 3 | // AES S-box 4 | var sbox = [256]byte{ 5 | 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 6 | 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 7 | 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 8 | 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, 9 | 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 10 | 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 11 | 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 12 | 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 13 | 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 14 | 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 15 | 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 16 | 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 
// AES forward S-box (FIPS-197).
var sbox = [256]byte{
	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
}

// gmul2 doubles b in GF(2^8) with the AES reduction polynomial 0x11b.
// Branchless form: the conditional XOR of 0x1b is selected by the high
// bit that the shift pushes out.
func gmul2(b byte) byte {
	return (b << 1) ^ (0x1b * (b >> 7))
}

// gmul3 triples b in GF(2^8): 3*b = 2*b XOR b.
func gmul3(b byte) byte {
	return gmul2(b) ^ b
}

// subBytes substitutes every state byte through the S-box.
func subBytes(state *[16]byte) {
	for i, b := range state {
		state[i] = sbox[b]
	}
}

// shiftRows rotates row r of the column-major state left by r positions.
func shiftRows(state *[16]byte) {
	var out [16]byte
	for c := 0; c < 4; c++ {
		for r := 0; r < 4; r++ {
			// The byte at (row r, column c) comes from column (c+r) mod 4.
			out[4*c+r] = state[4*((c+r)%4)+r]
		}
	}
	*state = out
}

// mixColumns multiplies each state column by the fixed AES MDS matrix
// [2 3 1 1; 1 2 3 1; 1 1 2 3; 3 1 1 2] over GF(2^8).
func mixColumns(state *[16]byte) {
	for col := 0; col < 16; col += 4 {
		a0, a1, a2, a3 := state[col], state[col+1], state[col+2], state[col+3]
		state[col+0] = gmul2(a0) ^ gmul3(a1) ^ a2 ^ a3
		state[col+1] = a0 ^ gmul2(a1) ^ gmul3(a2) ^ a3
		state[col+2] = a0 ^ a1 ^ gmul2(a2) ^ gmul3(a3)
		state[col+3] = gmul3(a0) ^ a1 ^ a2 ^ gmul2(a3)
	}
}

// addRoundKey XORs the 16-byte round key into the state.
func addRoundKey(state *[16]byte, key *[16]byte) {
	for i := range state {
		state[i] ^= key[i]
	}
}

// CipherRoundGeneric performs a single AES round (SubBytes, ShiftRows, MixColumns, AddRoundKey)
// This matches aes::hazmat::cipher_round in Rust
func CipherRoundGeneric(block *[16]byte, key *[16]byte) {
	subBytes(block)
	shiftRows(block)
	mixColumns(block)
	addRoundKey(block, key)
}
134, 90, 107, 116, 60 | 110, 236, 53, 255, 5, 214, 126, 24, 216, 97, 199, 148, 239, 253, 102, 61 | 199, 184, 232, 253, 158, 145, 86, 187, 112, 81, 78, 70, 80, 110, 33, 62 | 37, 159, 233, 198, 1, 178, 108, 210, 100, 109, 155, 106, 124, 124, 83, 63 | 89, 50, 197, 115, 231, 32, 74, 2, 92, 47, 25, 220, 135, 249, 122, 64 | 172, 220, 137, 143, 234, 68, 188, 65 | } 66 | 67 | scratchPad := NewScratchPad() 68 | hash, err := XelisHash(input, scratchPad) 69 | if err != nil { 70 | t.Fatalf("Hash failed: %v", err) 71 | } 72 | 73 | expected := []byte{ 74 | 242, 8, 176, 222, 203, 27, 104, 75 | 187, 22, 40, 68, 73, 79, 79, 65, 76 | 83, 138, 101, 10, 116, 194, 41, 153, 77 | 21, 92, 163, 12, 206, 231, 156, 70, 83, 78 | } 79 | 80 | if !slices.Equal(expected, hash[:]) { 81 | t.Errorf("Hash mismatch:\nGot: %x\nExpected: %x", hash, expected) 82 | } 83 | } 84 | 85 | func TestMapIndex(t *testing.T) { 86 | // Test that mapIndex always returns valid indices 87 | for i := 0; i < 10000; i++ { 88 | idx := mapIndex(uint64(i)) 89 | if idx < 0 || idx >= BufferSize { 90 | t.Errorf("Invalid index %d from mapIndex(%d)", idx, i) 91 | } 92 | } 93 | 94 | // Edge cases 95 | if mapIndex(0) < 0 || mapIndex(0) >= BufferSize { 96 | t.Error("mapIndex(0) out of bounds") 97 | } 98 | if mapIndex(^uint64(0)) < 0 || mapIndex(^uint64(0)) >= BufferSize { 99 | t.Error("mapIndex(MAX) out of bounds") 100 | } 101 | } 102 | 103 | func TestPickHalf(t *testing.T) { 104 | // Test that pickHalf produces roughly 50/50 distribution 105 | ones := 0 106 | zeros := 0 107 | iterations := 100000 108 | 109 | for i := 0; i < iterations; i++ { 110 | if pickHalf(uint64(i)) { 111 | ones++ 112 | } else { 113 | zeros++ 114 | } 115 | } 116 | 117 | ratio := float64(ones) / float64(ones+zeros) 118 | t.Logf("pickHalf ratio: %f (ones: %d, zeros: %d)", ratio, ones, zeros) 119 | 120 | // Allow 5% deviation from 0.5 121 | if ratio < 0.45 || ratio > 0.55 { 122 | t.Errorf("pickHalf distribution is skewed: %f", ratio) 123 | } 124 | } 125 | 126 | 
/* 48-byte ChaCha20 state layout used throughout this file:
 *   bytes  0..31  key      (words 0..7)
 *   bytes 32..35  counter  (word  8)
 *   bytes 36..47  nonce    (words 9..11)
 */
static const int32_t KeyDataSize = 48;
static const int32_t rounds = 20;

/* "expand 32-byte k" */
static const uint32_t ConstState[4] = {1634760805, 857760878, 2036477234, 1797285236};

/* Copy the 32-byte key into words 0..7 of the state. */
void ChaCha20SetKey(uint8_t *state, const uint8_t *Key)
{
    memcpy(state, Key, 32);
}

/* Copy the 12-byte nonce into words 9..11 of the state. */
void ChaCha20SetNonce(uint8_t *state, const uint8_t *Nonce)
{
    memcpy(state + 36, Nonce, 12);
}

/* Copy the 4-byte block counter into word 8 of the state. */
void ChaCha20SetCtr(uint8_t *state, const uint8_t *Ctr)
{
    memcpy(state + 32, Ctr, 4);
}

/* Advance the 96-bit nonce by one, carrying across words 9..11, and
 * reset the block counter to zero. */
void ChaCha20IncrementNonce(uint8_t *state)
{
    uint32_t *words = (uint32_t *)state;
    words[8] = 0; /* reset counter */
    for (int i = 9; i <= 11; ++i)
    {
        if (++words[i] != 0)
            break; /* no carry into the next word */
    }
}

/* Add value_to_add to the 32-bit block counter (word 8), wrapping. */
void ChaCha20AddCounter(uint8_t *ChaCha, const uint32_t value_to_add)
{
    ((uint32_t *)ChaCha)[8] += value_to_add;
}
49 | uint32_t *state_dwords = (uint32_t *)state; 50 | uint32_t b[16]; 51 | while (1) 52 | { 53 | b[0] = ConstState[0]; 54 | b[1] = ConstState[1]; 55 | b[2] = ConstState[2]; 56 | b[3] = ConstState[3]; 57 | memcpy(((uint8_t *)b) + 16, state, 48); 58 | 59 | for (int i = rounds; i > 0; i -= 2) 60 | { 61 | b[0] = b[0] + b[4]; 62 | b[12] = (b[12] ^ b[0]) << 16 | (b[12] ^ b[0]) >> 16; 63 | b[8] = b[8] + b[12]; 64 | b[4] = (b[4] ^ b[8]) << 12 | (b[4] ^ b[8]) >> 20; 65 | b[0] = b[0] + b[4]; 66 | b[12] = (b[12] ^ b[0]) << 8 | (b[12] ^ b[0]) >> 24; 67 | b[8] = b[8] + b[12]; 68 | b[4] = (b[4] ^ b[8]) << 7 | (b[4] ^ b[8]) >> 25; 69 | b[1] = b[1] + b[5]; 70 | b[13] = (b[13] ^ b[1]) << 16 | (b[13] ^ b[1]) >> 16; 71 | b[9] = b[9] + b[13]; 72 | b[5] = (b[5] ^ b[9]) << 12 | (b[5] ^ b[9]) >> 20; 73 | b[1] = b[1] + b[5]; 74 | b[13] = (b[13] ^ b[1]) << 8 | (b[13] ^ b[1]) >> 24; 75 | b[9] = b[9] + b[13]; 76 | b[5] = (b[5] ^ b[9]) << 7 | (b[5] ^ b[9]) >> 25; 77 | b[2] = b[2] + b[6]; 78 | b[14] = (b[14] ^ b[2]) << 16 | (b[14] ^ b[2]) >> 16; 79 | b[10] = b[10] + b[14]; 80 | b[6] = (b[6] ^ b[10]) << 12 | (b[6] ^ b[10]) >> 20; 81 | b[2] = b[2] + b[6]; 82 | b[14] = (b[14] ^ b[2]) << 8 | (b[14] ^ b[2]) >> 24; 83 | b[10] = b[10] + b[14]; 84 | b[6] = (b[6] ^ b[10]) << 7 | (b[6] ^ b[10]) >> 25; 85 | b[3] = b[3] + b[7]; 86 | b[15] = (b[15] ^ b[3]) << 16 | (b[15] ^ b[3]) >> 16; 87 | b[11] = b[11] + b[15]; 88 | b[7] = (b[7] ^ b[11]) << 12 | (b[7] ^ b[11]) >> 20; 89 | b[3] = b[3] + b[7]; 90 | b[15] = (b[15] ^ b[3]) << 8 | (b[15] ^ b[3]) >> 24; 91 | b[11] = b[11] + b[15]; 92 | b[7] = (b[7] ^ b[11]) << 7 | (b[7] ^ b[11]) >> 25; 93 | b[0] = b[0] + b[5]; 94 | b[15] = (b[15] ^ b[0]) << 16 | (b[15] ^ b[0]) >> 16; 95 | b[10] = b[10] + b[15]; 96 | b[5] = (b[5] ^ b[10]) << 12 | (b[5] ^ b[10]) >> 20; 97 | b[0] = b[0] + b[5]; 98 | b[15] = (b[15] ^ b[0]) << 8 | (b[15] ^ b[0]) >> 24; 99 | b[10] = b[10] + b[15]; 100 | b[5] = (b[5] ^ b[10]) << 7 | (b[5] ^ b[10]) >> 25; 101 | b[1] = b[1] + b[6]; 102 | b[12] = (b[12] 
^ b[1]) << 16 | (b[12] ^ b[1]) >> 16; 103 | b[11] = b[11] + b[12]; 104 | b[6] = (b[6] ^ b[11]) << 12 | (b[6] ^ b[11]) >> 20; 105 | b[1] = b[1] + b[6]; 106 | b[12] = (b[12] ^ b[1]) << 8 | (b[12] ^ b[1]) >> 24; 107 | b[11] = b[11] + b[12]; 108 | b[6] = (b[6] ^ b[11]) << 7 | (b[6] ^ b[11]) >> 25; 109 | b[2] = b[2] + b[7]; 110 | b[13] = (b[13] ^ b[2]) << 16 | (b[13] ^ b[2]) >> 16; 111 | b[8] = b[8] + b[13]; 112 | b[7] = (b[7] ^ b[8]) << 12 | (b[7] ^ b[8]) >> 20; 113 | b[2] = b[2] + b[7]; 114 | b[13] = (b[13] ^ b[2]) << 8 | (b[13] ^ b[2]) >> 24; 115 | b[8] = b[8] + b[13]; 116 | b[7] = (b[7] ^ b[8]) << 7 | (b[7] ^ b[8]) >> 25; 117 | b[3] = b[3] + b[4]; 118 | b[14] = (b[14] ^ b[3]) << 16 | (b[14] ^ b[3]) >> 16; 119 | b[9] = b[9] + b[14]; 120 | b[4] = (b[4] ^ b[9]) << 12 | (b[4] ^ b[9]) >> 20; 121 | b[3] = b[3] + b[4]; 122 | b[14] = (b[14] ^ b[3]) << 8 | (b[14] ^ b[3]) >> 24; 123 | b[9] = b[9] + b[14]; 124 | b[4] = (b[4] ^ b[9]) << 7 | (b[4] ^ b[9]) >> 25; 125 | } 126 | 127 | for (uint32_t i = 0; i < 4; ++i) 128 | { 129 | b[i] += ConstState[i]; 130 | } 131 | for (uint32_t i = 0; i < 12; ++i) 132 | { 133 | b[i + 4] += state_dwords[i]; 134 | } 135 | 136 | ++state_dwords[8]; // counter 137 | 138 | if (RemainingBytes >= 64) 139 | { 140 | if (In) 141 | { 142 | uint32_t *In32bits = (uint32_t *)CurrentIn; 143 | uint32_t *Out32bits = (uint32_t *)CurrentOut; 144 | for (uint32_t i = 0; i < 16; i++) 145 | { 146 | Out32bits[i] = In32bits[i] ^ b[i]; 147 | } 148 | } 149 | else 150 | memcpy(CurrentOut, b, 64); 151 | 152 | if (In) 153 | CurrentIn += 64; 154 | CurrentOut += 64; 155 | RemainingBytes -= 64; 156 | if (RemainingBytes == 0) 157 | return; 158 | continue; 159 | } 160 | else 161 | { 162 | if (In) 163 | { 164 | for (int32_t i = 0; i < RemainingBytes; i++) 165 | CurrentOut[i] = CurrentIn[i] ^ ((uint8_t *)b)[i]; 166 | } 167 | else 168 | memcpy(CurrentOut, b, RemainingBytes); 169 | return; 170 | } 171 | } 172 | } 173 | 174 | void chacha_encrypt_portable(uint8_t *key, uint8_t *nonce, 
uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds) 175 | { 176 | uint8_t state[48] = {0}; 177 | ChaCha20SetKey(state, key); 178 | ChaCha20SetNonce(state, nonce); 179 | ChaCha20EncryptBytes(state, in, out, bytes, rounds); 180 | } -------------------------------------------------------------------------------- /go/v1/v1.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | import ( 4 | "encoding/binary" 5 | "math/bits" 6 | "unsafe" 7 | 8 | "github.com/xelis-project/xelis-hash/go/aes" 9 | "github.com/xelis-project/xelis-hash/go/hash" 10 | ) 11 | 12 | const ( 13 | MemorySize = 32768 14 | ScratchpadIters = 5000 15 | Iters = 1 16 | BufferSize = 42 17 | SlotLength = 256 18 | KeccakWords = 25 19 | BytesArrayInput = KeccakWords * 8 20 | Stage1Max = MemorySize / KeccakWords 21 | ) 22 | 23 | type ScratchPad [MemorySize]uint64 24 | 25 | func Stage1(input *[KeccakWords]uint64, scratchPad *[MemorySize]uint64, aRange, bRange [2]int) { 26 | for i := aRange[0]; i <= aRange[1]; i++ { 27 | KeccakF1600(input) 28 | 29 | var randInt uint64 = 0 30 | for j := bRange[0]; j <= bRange[1]; j++ { 31 | pairIdx := (j + 1) % KeccakWords 32 | pairIdx2 := (j + 2) % KeccakWords 33 | 34 | targetIdx := i*KeccakWords + j 35 | a := input[j] ^ randInt 36 | 37 | // Branching 38 | left := input[pairIdx] 39 | right := input[pairIdx2] 40 | xor := left ^ right 41 | var v uint64 42 | switch xor & 0x3 { 43 | case 0: 44 | v = left & right 45 | case 1: 46 | v = ^(left & right) 47 | case 2: 48 | v = ^xor 49 | case 3: 50 | v = xor 51 | } 52 | 53 | b := a ^ v 54 | randInt = b 55 | scratchPad[targetIdx] = b 56 | } 57 | } 58 | } 59 | 60 | func XelisHash(input *[BytesArrayInput]byte, scratchPad *ScratchPad) (hash.Hash, error) { 61 | // Convert input bytes to u64 array 62 | var intInput [KeccakWords]uint64 63 | for i := 0; i < KeccakWords; i++ { 64 | intInput[i] = binary.LittleEndian.Uint64(input[i*8:]) 65 | } 66 | 67 | // Stage 1 68 | Stage1(&intInput, 
(*[MemorySize]uint64)(scratchPad), [2]int{0, Stage1Max - 1}, [2]int{0, KeccakWords - 1}) 69 | Stage1(&intInput, (*[MemorySize]uint64)(scratchPad), [2]int{Stage1Max, Stage1Max}, [2]int{0, 17}) 70 | 71 | // Stage 2 72 | var slots [SlotLength]uint32 73 | // Convert scratchpad to u32 using unsafe pointer (no copy) 74 | smallPad := (*[MemorySize * 2]uint32)(unsafe.Pointer(&scratchPad[0])) 75 | 76 | copy(slots[:], smallPad[len(smallPad)-SlotLength:]) 77 | 78 | var indices [SlotLength]uint16 79 | for iter := 0; iter < Iters; iter++ { 80 | for j := 0; j < len(smallPad)/SlotLength; j++ { 81 | // Initialize indices and precompute the total sum 82 | var totalSum uint32 = 0 83 | for k := 0; k < SlotLength; k++ { 84 | indices[k] = uint16(k) 85 | if slots[k]>>31 == 0 { 86 | totalSum += smallPad[j*SlotLength+k] 87 | } else { 88 | totalSum -= smallPad[j*SlotLength+k] 89 | } 90 | } 91 | 92 | for slotIdx := SlotLength - 1; slotIdx >= 0; slotIdx-- { 93 | indexInIndices := int(smallPad[j*SlotLength+slotIdx] % uint32(slotIdx+1)) 94 | index := int(indices[indexInIndices]) 95 | indices[indexInIndices] = indices[slotIdx] 96 | 97 | localSum := totalSum 98 | s1 := int32(slots[index] >> 31) 99 | padValue := smallPad[j*SlotLength+index] 100 | if s1 == 0 { 101 | localSum -= padValue 102 | } else { 103 | localSum += padValue 104 | } 105 | 106 | // Apply the sum to the slot 107 | slots[index] += localSum 108 | 109 | // Update the total sum 110 | s2 := int32(slots[index] >> 31) 111 | totalSum -= 2 * smallPad[j*SlotLength+index] * uint32(-s1+s2) 112 | } 113 | } 114 | } 115 | 116 | copy(smallPad[MemorySize*2-SlotLength:], slots[:]) 117 | // No need to convert back - smallPad points directly to scratchPad memory 118 | 119 | // Stage 3 120 | var key [16]byte // zero key 121 | var block [16]byte 122 | 123 | addrA := (scratchPad[MemorySize-1] >> 15) & 0x7FFF 124 | addrB := scratchPad[MemorySize-1] & 0x7FFF 125 | 126 | var memBufferA [BufferSize]uint64 127 | var memBufferB [BufferSize]uint64 128 | 129 | 
for i := uint64(0); i < BufferSize; i++ { 130 | memBufferA[i] = scratchPad[(addrA+i)%MemorySize] 131 | memBufferB[i] = scratchPad[(addrB+i)%MemorySize] 132 | } 133 | 134 | var finalResult hash.Hash 135 | 136 | for i := 0; i < ScratchpadIters; i++ { 137 | memA := memBufferA[i%BufferSize] 138 | memB := memBufferB[i%BufferSize] 139 | 140 | binary.LittleEndian.PutUint64(block[0:8], memB) 141 | binary.LittleEndian.PutUint64(block[8:16], memA) 142 | 143 | // Use single AES round instead of full encryption 144 | aes.CipherRound(&block, &key) 145 | 146 | hash1 := binary.LittleEndian.Uint64(block[0:8]) 147 | hash2 := memA ^ memB 148 | 149 | result := ^(hash1 ^ hash2) 150 | 151 | for j := 0; j < hash.HashSize; j++ { 152 | a := memBufferA[(j+i)%BufferSize] 153 | b := memBufferB[(j+i)%BufferSize] 154 | 155 | switch (result >> (j * 2)) & 0xf { 156 | case 0: 157 | result = bits.RotateLeft64(result, j) ^ b 158 | case 1: 159 | result = ^(bits.RotateLeft64(result, j) ^ a) 160 | case 2: 161 | result = ^(result ^ a) 162 | case 3: 163 | result ^= b 164 | case 4: 165 | result ^= (a + b) 166 | case 5: 167 | result ^= (a - b) 168 | case 6: 169 | result ^= (b - a) 170 | case 7: 171 | result ^= (a * b) 172 | case 8: 173 | result ^= (a & b) 174 | case 9: 175 | result ^= (a | b) 176 | case 10: 177 | result ^= (a ^ b) 178 | case 11: 179 | result ^= (a - result) 180 | case 12: 181 | result ^= (b - result) 182 | case 13: 183 | result ^= (a + result) 184 | case 14: 185 | result ^= (result - a) 186 | case 15: 187 | result ^= (result - b) 188 | } 189 | } 190 | 191 | addrB = result & 0x7FFF 192 | memBufferA[i%BufferSize] = result 193 | memBufferB[i%BufferSize] = scratchPad[addrB] 194 | 195 | addrA = (result >> 15) & 0x7FFF 196 | scratchPad[addrA] = result 197 | 198 | index := ScratchpadIters - i - 1 199 | if index < 4 { 200 | var resultBytes [8]byte 201 | binary.BigEndian.PutUint64(resultBytes[:], result) 202 | copy(finalResult[index*8:(ScratchpadIters-i)*8], resultBytes[:]) 203 | } 204 | } 205 | 
206 | return finalResult, nil 207 | } 208 | 209 | // NewScratchPad creates a new zeroed scratchpad 210 | func NewScratchPad() *ScratchPad { 211 | return &ScratchPad{} 212 | } 213 | -------------------------------------------------------------------------------- /go/v2/v2.go: -------------------------------------------------------------------------------- 1 | package v2 2 | 3 | import ( 4 | "encoding/binary" 5 | "unsafe" 6 | 7 | "github.com/chocolatkey/chacha8" 8 | "github.com/xelis-project/xelis-hash/go/aes" 9 | "github.com/xelis-project/xelis-hash/go/hash" 10 | "lukechampine.com/blake3" 11 | "lukechampine.com/uint128" 12 | ) 13 | 14 | const ( 15 | MemorySize = 429 * 128 16 | ScratchpadIters = 3 17 | BufferSize = MemorySize / 2 18 | ChunkSize = 32 19 | NonceSize = 12 20 | MemorySizeBytes = MemorySize * 8 21 | ) 22 | 23 | var Key = [16]byte{'x', 'e', 'l', 'i', 's', 'h', 'a', 's', 'h', '-', 'p', 'o', 'w', '-', 'v', '2'} 24 | 25 | type ScratchPad [MemorySize]uint64 26 | 27 | // Stage1 generates the scratchpad using ChaCha8 28 | func Stage1(input []byte, scratchPad *ScratchPad) error { 29 | // Convert scratchpad to bytes 30 | scratchPadBytes := (*[MemorySizeBytes]byte)(unsafe.Pointer(scratchPad))[:] 31 | 32 | // Reset scratchpad 33 | for i := range scratchPadBytes { 34 | scratchPadBytes[i] = 0 35 | } 36 | 37 | outputOffset := 0 38 | nonce := make([]byte, NonceSize) 39 | 40 | // Generate nonce from input 41 | inputHash := blake3.Sum256(input) 42 | copy(nonce, inputHash[:NonceSize]) 43 | 44 | numChunks := (len(input) + ChunkSize - 1) / ChunkSize 45 | 46 | for chunkIndex := 0; chunkIndex*ChunkSize < len(input); chunkIndex++ { 47 | start := chunkIndex * ChunkSize 48 | end := start + ChunkSize 49 | if end > len(input) { 50 | end = len(input) 51 | } 52 | chunk := input[start:end] 53 | 54 | // Concatenate input hash with chunk 55 | tmp := make([]byte, hash.HashSize*2) 56 | copy(tmp[0:hash.HashSize], inputHash[:]) 57 | copy(tmp[hash.HashSize:hash.HashSize+len(chunk)], chunk) 
// isqrt returns the integer square root of n — the largest r with
// r*r <= n — via Newton's method on integers: the iterate starts at or
// above the root and decreases monotonically until it stabilizes.
func isqrt(n uint64) uint64 {
	if n < 2 {
		return n
	}

	prev := n
	cur := (n + 1) >> 1
	for cur < prev {
		prev = cur
		cur = (prev + n/prev) >> 1
	}
	return prev
}
137 | result := ^(hash1 ^ hash2) 138 | 139 | for j := 0; j < BufferSize; j++ { 140 | indexA := int(result % uint64(BufferSize)) 141 | indexB := int((^bits_RotateRight64(result, uint(r))) % uint64(BufferSize)) 142 | 143 | a := memBufferA[indexA] 144 | b := memBufferB[indexB] 145 | 146 | var c uint64 147 | if r < BufferSize { 148 | c = memBufferA[r] 149 | } else { 150 | c = memBufferB[r-BufferSize] 151 | } 152 | 153 | if r < MemorySize-1 { 154 | r++ 155 | } else { 156 | r = 0 157 | } 158 | 159 | branchIdx := uint8((bits_RotateLeft64(result, uint(c)) & 0xf)) 160 | 161 | var v uint64 162 | switch branchIdx { 163 | case 0: 164 | v = result ^ (bits_RotateLeft64(c, uint(i*j)) ^ b) 165 | case 1: 166 | v = result ^ (bits_RotateRight64(c, uint(i*j)) ^ a) 167 | case 2: 168 | v = result ^ (a ^ b ^ c) 169 | case 3: 170 | v = result ^ ((a + b) * c) 171 | case 4: 172 | v = result ^ ((b - c) * a) 173 | case 5: 174 | v = result ^ (c - a + b) 175 | case 6: 176 | v = result ^ (a - b + c) 177 | case 7: 178 | v = result ^ (b*c + a) 179 | case 8: 180 | v = result ^ (c*a + b) 181 | case 9: 182 | v = result ^ (a * b * c) 183 | case 10: 184 | // combine_u64(a, b) % (c | 1) 185 | // Rust: combine_u64(high, low) where a is high, b is low 186 | t1 := uint128.New(b, a) // New(lo, hi) 187 | t2 := uint128.From64(c | 1) 188 | v = result ^ t1.Mod(t2).Lo 189 | case 11: 190 | // combine_u64(b, c) % combine_u64(result.rotate_left(r), a | 2) 191 | // Rust: combine_u64(high, low) 192 | t1 := uint128.New(c, b) // New(lo, hi) where b is high, c is low 193 | t2 := uint128.New(a|2, bits_RotateLeft64(result, uint(r))) // New(lo, hi) 194 | v = result ^ t1.Mod(t2).Lo 195 | case 12: 196 | // combine_u64(c, a) / (b | 4) 197 | // Rust: combine_u64(high, low) where c is high, a is low 198 | t1 := uint128.New(a, c) // New(lo, hi) 199 | t2 := uint128.From64(b | 4) 200 | v = result ^ t1.Div(t2).Lo 201 | case 13: 202 | // combine_u64(result.rotate_left(r), b) where first arg is high 203 | // combine_u64(a, c|8) where 
a is high 204 | t1 := uint128.New(b, bits_RotateLeft64(result, uint(r))) // New(lo, hi) 205 | t2 := uint128.New(c|8, a) // New(lo, hi) 206 | if t1.Cmp(t2) > 0 { 207 | v = result ^ t1.Div(t2).Lo 208 | } else { 209 | v = result ^ (a ^ b) 210 | } 211 | case 14: 212 | // (combine_u64(b, a) * c) >> 64 213 | // Rust wrapping_mul on u128 then >> 64 gets high 64 bits 214 | t1 := uint128.New(a, b) // New(lo, hi) where b is high, a is low 215 | prod := t1.MulWrap64(c) // Wrapping mul 216 | v = result ^ prod.Hi 217 | case 15: 218 | // (combine_u64(a, c) * combine_u64(result.rotate_right(r), b)) >> 64 219 | // Rust wrapping_mul on u128 then >> 64 gets high 64 bits 220 | rr := bits_RotateRight64(result, uint(r)) 221 | t1 := uint128.New(c, a) // New(lo, hi) where a is high, c is low 222 | t2 := uint128.New(b, rr) // New(lo, hi) where rr is high, b is low 223 | prod := t1.MulWrap(t2) // Wrapping mul 224 | v = result ^ prod.Hi 225 | } 226 | 227 | result = bits_RotateLeft64(v, 1) 228 | 229 | t := memBufferA[BufferSize-j-1] ^ result 230 | memBufferA[BufferSize-j-1] = t 231 | memBufferB[j] ^= bits_RotateRight64(t, uint(result)) 232 | } 233 | 234 | addrA = result 235 | addrB = isqrt(result) 236 | } 237 | 238 | return nil 239 | } 240 | 241 | // Stage4 hashes the entire scratchpad with Blake3 242 | func Stage4(scratchPad *ScratchPad) hash.Hash { 243 | scratchPadBytes := (*[MemorySizeBytes]byte)(unsafe.Pointer(scratchPad))[:] 244 | return blake3.Sum256(scratchPadBytes) 245 | } 246 | 247 | func XelisHash(input []byte, scratchPad *ScratchPad) (hash.Hash, error) { 248 | err := Stage1(input, scratchPad) 249 | if err != nil { 250 | return hash.Zero(), err 251 | } 252 | 253 | err = Stage3(scratchPad) 254 | if err != nil { 255 | return hash.Zero(), err 256 | } 257 | 258 | return Stage4(scratchPad), nil 259 | } 260 | 261 | func NewScratchPad() *ScratchPad { 262 | return &ScratchPad{} 263 | } 264 | 265 | // Helper functions for bit rotation 266 | func bits_RotateLeft64(x uint64, k uint) uint64 { 
// murmurhash3 applies a MurmurHash3-style 64-bit finalizer (xor-shift /
// multiply rounds) to seed.  Note the shift amounts (55, 32, 15) differ
// from the canonical 33/33/33 finalizer; they are part of this PoW's
// definition and must not be "corrected".
func murmurhash3(seed uint64) uint64 {
	x := seed
	x ^= x >> 55
	x *= 0xff51afd7ed558ccd
	x ^= x >> 32
	x *= 0xc4ceb9fe1a85ec53
	x ^= x >> 15
	return x
}

// pickHalf derives a single pseudo-random bit from seed: bit 58 of the
// mixed value.
func pickHalf(seed uint64) bool {
	const mask = uint64(1) << 58
	return murmurhash3(seed)&mask != 0
}
| } 66 | 67 | func modularPower(base, exp, mod uint64) uint64 { 68 | result := uint64(1) 69 | base %= mod 70 | 71 | for exp > 0 { 72 | if exp&1 == 1 { 73 | result = mulmod(result, base, mod) 74 | } 75 | base = mulmod(base, base, mod) 76 | exp /= 2 77 | } 78 | 79 | return result 80 | } 81 | 82 | // mulmod computes (a * b) % m avoiding overflow 83 | func mulmod(a, b, m uint64) uint64 { 84 | t1 := uint128.From64(a) 85 | t2 := uint128.From64(b) 86 | prod := t1.MulWrap(t2) 87 | mod := uint128.From64(m) 88 | return prod.Mod(mod).Lo 89 | } 90 | 91 | // Stage3 performs the complex memory operations with branching 92 | func Stage3(scratchPad *ScratchPad) error { 93 | key := Key 94 | var block [16]byte 95 | 96 | // Split scratchpad 97 | memBufferA := scratchPad[:BufferSize] 98 | memBufferB := scratchPad[BufferSize:] 99 | 100 | addrA := memBufferB[BufferSize-1] 101 | addrB := memBufferA[BufferSize-1] >> 32 102 | 103 | r := 0 104 | 105 | for i := 0; i < ScratchpadIters; i++ { 106 | indexA := mapIndex(addrA) 107 | memA := memBufferA[indexA] 108 | 109 | indexB := mapIndex(memA ^ addrB) 110 | memB := memBufferB[indexB] 111 | 112 | binary.LittleEndian.PutUint64(block[0:8], memB) 113 | binary.LittleEndian.PutUint64(block[8:16], memA) 114 | 115 | aes.CipherRound(&block, &key) 116 | 117 | hash1 := binary.LittleEndian.Uint64(block[0:8]) 118 | hash2 := binary.LittleEndian.Uint64(block[8:16]) 119 | 120 | result := ^(hash1 ^ hash2) 121 | 122 | for j := 0; j < BufferSize; j++ { 123 | indexA := mapIndex(result) 124 | a := memBufferA[indexA] 125 | 126 | indexB := mapIndex(a ^ (^bits_RotateRight64(result, uint(r)))) 127 | b := memBufferB[indexB] 128 | 129 | var c uint64 130 | if r < BufferSize { 131 | c = memBufferA[r] 132 | } else { 133 | c = memBufferB[r-BufferSize] 134 | } 135 | 136 | if r < MemorySize-1 { 137 | r++ 138 | } else { 139 | r = 0 140 | } 141 | 142 | branchIdx := uint8(bits_RotateLeft64(result, uint(c)) & 0xf) 143 | 144 | var v uint64 145 | switch branchIdx { 146 | case 0: 147 
| // combine_u64((a + i), isqrt(b + j)) % (murmurhash3(c ^ result ^ i ^ j) | 1) 148 | t1 := uint128.New(isqrt(b+uint64(j)), a+uint64(i)) // New(lo, hi) 149 | denom := uint128.From64(murmurhash3(c^result^uint64(i)^uint64(j)) | 1) 150 | v = t1.Mod(denom).Lo 151 | case 1: 152 | // ROTL((c + i) % isqrt(b | 2), i + j) * isqrt(a + j) 153 | sqrt := isqrt(b | 2) 154 | if sqrt == 0 { 155 | sqrt = 1 156 | } 157 | t1 := (c + uint64(i)) % sqrt 158 | t2 := bits_RotateLeft64(t1, uint(i+j)) 159 | t3 := isqrt(a + uint64(j)) 160 | v = t2 * t3 161 | case 2: 162 | // (isqrt(a + i) * isqrt(c + j)) ^ (b + i + j) 163 | t1 := isqrt(a + uint64(i)) 164 | t2 := isqrt(c + uint64(j)) 165 | t3 := t1 * t2 166 | v = t3 ^ (b + uint64(i) + uint64(j)) 167 | case 3: 168 | v = (a + b) * c 169 | case 4: 170 | v = (b - c) * a 171 | case 5: 172 | v = c - a + b 173 | case 6: 174 | v = a - b + c 175 | case 7: 176 | v = b*c + a 177 | case 8: 178 | v = c*a + b 179 | case 9: 180 | v = a * b * c 181 | case 10: 182 | t1 := uint128.New(b, a) // New(lo, hi) 183 | t2 := uint128.From64(c | 1) 184 | v = t1.Mod(t2).Lo 185 | case 11: 186 | t1 := uint128.New(c, b) // New(lo, hi) 187 | t2 := uint128.New(a|2, bits_RotateLeft64(result, uint(r))) // New(lo, hi) 188 | if t2.Cmp(t1) > 0 { 189 | v = c 190 | } else { 191 | v = t1.Mod(t2).Lo 192 | } 193 | case 12: 194 | t1 := uint128.New(a, c) // New(lo, hi) 195 | t2 := uint128.From64(b | 4) 196 | v = t1.Div(t2).Lo 197 | case 13: 198 | t1 := uint128.New(b, bits_RotateLeft64(result, uint(r))) // New(lo, hi) 199 | t2 := uint128.New(c|8, a) // New(lo, hi) 200 | if t1.Cmp(t2) > 0 { 201 | v = t1.Div(t2).Lo 202 | } else { 203 | v = a ^ b 204 | } 205 | case 14: 206 | // (combine_u64(b, a) * c) >> 64 207 | t1 := uint128.New(a, b) // New(lo, hi) 208 | prod := t1.MulWrap64(c) 209 | v = prod.Hi 210 | case 15: 211 | // (combine_u64(a, c) * combine_u64(result.rotate_right(r), b)) >> 64 212 | rr := bits_RotateRight64(result, uint(r)) 213 | t1 := uint128.New(c, a) // New(lo, hi) 214 | t2 := 
uint128.New(b, rr) // New(lo, hi) 215 | prod := t1.MulWrap(t2) 216 | v = prod.Hi 217 | } 218 | 219 | seed := v ^ result 220 | result = bits_RotateLeft64(seed, uint(r)) 221 | 222 | useBufferB := pickHalf(v) 223 | indexT := mapIndex(seed) 224 | var t uint64 225 | if useBufferB { 226 | t = memBufferB[indexT] ^ result 227 | } else { 228 | t = memBufferA[indexT] ^ result 229 | } 230 | 231 | indexA2 := mapIndex(t ^ result ^ 0x9e3779b97f4a7c15) 232 | indexB2 := mapIndex(uint64(indexA2) ^ ^result ^ 0xd2b74407b1ce6e93) 233 | 234 | oldA := memBufferA[indexA2] 235 | memBufferA[indexA2] = t 236 | memBufferB[indexB2] ^= oldA ^ bits_RotateRight64(t, uint(i+j)) 237 | } 238 | 239 | addrA = modularPower(addrA, addrB, result) 240 | addrB = isqrt(result) * uint64(r+1) * isqrt(addrA) 241 | } 242 | 243 | return nil 244 | } 245 | 246 | // Stage1 generates the scratchpad using ChaCha8 (same as v2 but with v3's memory size) 247 | func Stage1(input []byte, scratchPad *ScratchPad) error { 248 | // Convert scratchpad to bytes 249 | scratchPadBytes := (*[MemorySizeBytes]byte)(unsafe.Pointer(scratchPad))[:] 250 | 251 | // Reset scratchpad 252 | for i := range scratchPadBytes { 253 | scratchPadBytes[i] = 0 254 | } 255 | 256 | const ChunkSize = 32 257 | const NonceSize = 12 258 | 259 | outputOffset := 0 260 | nonce := make([]byte, NonceSize) 261 | 262 | // Generate nonce from input 263 | inputHash := blake3.Sum256(input) 264 | copy(nonce, inputHash[:NonceSize]) 265 | 266 | numChunks := (len(input) + ChunkSize - 1) / ChunkSize 267 | 268 | for chunkIndex := 0; chunkIndex*ChunkSize < len(input); chunkIndex++ { 269 | start := chunkIndex * ChunkSize 270 | end := start + ChunkSize 271 | if end > len(input) { 272 | end = len(input) 273 | } 274 | chunk := input[start:end] 275 | 276 | // Concatenate input hash with chunk 277 | tmp := make([]byte, hash.HashSize*2) 278 | copy(tmp[0:hash.HashSize], inputHash[:]) 279 | copy(tmp[hash.HashSize:hash.HashSize+len(chunk)], chunk) 280 | 281 | // Hash it 282 | 
inputHash = blake3.Sum256(tmp) 283 | 284 | cipher, err := chacha8.New(inputHash[:], nonce) 285 | if err != nil { 286 | return err 287 | } 288 | 289 | // Calculate output size for this iteration 290 | remainingOutputSize := MemorySizeBytes - outputOffset 291 | chunksLeft := numChunks - chunkIndex 292 | chunkOutputSize := remainingOutputSize / chunksLeft 293 | currentOutputSize := remainingOutputSize 294 | if currentOutputSize > chunkOutputSize { 295 | currentOutputSize = chunkOutputSize 296 | } 297 | 298 | // Apply keystream 299 | offset := chunkIndex * currentOutputSize 300 | part := scratchPadBytes[offset : offset+currentOutputSize] 301 | cipher.XORKeyStream(part, part) 302 | 303 | outputOffset += currentOutputSize 304 | 305 | // Update nonce 306 | nonceStart := currentOutputSize - NonceSize 307 | if nonceStart < 0 { 308 | nonceStart = 0 309 | } 310 | copy(nonce, part[nonceStart:]) 311 | } 312 | 313 | return nil 314 | } 315 | 316 | func XelisHash(input []byte, scratchPad *ScratchPad) (hash.Hash, error) { 317 | // Use v3's Stage1 with correct memory size 318 | err := Stage1(input, scratchPad) 319 | if err != nil { 320 | return hash.Zero(), err 321 | } 322 | 323 | // V3's custom Stage3 324 | err = Stage3(scratchPad) 325 | if err != nil { 326 | return hash.Zero(), err 327 | } 328 | 329 | // Stage4: hash the whole scratchpad 330 | scratchPadBytes := (*[MemorySizeBytes]byte)(unsafe.Pointer(scratchPad))[:] 331 | return blake3.Sum256(scratchPadBytes), nil 332 | } 333 | 334 | func NewScratchPad() *ScratchPad { 335 | return &ScratchPad{} 336 | } 337 | 338 | // Helper functions 339 | func bits_RotateLeft64(x uint64, k uint) uint64 { 340 | const n = 64 341 | s := k & (n - 1) 342 | return x<>(n-s) 343 | } 344 | 345 | func bits_RotateRight64(x uint64, k uint) uint64 { 346 | const n = 64 347 | s := k & (n - 1) 348 | return x>>s | x<<(n-s) 349 | } 350 | -------------------------------------------------------------------------------- /src/tracker.rs: 
--------------------------------------------------------------------------------
use plotters::{
    chart::ChartBuilder,
    prelude::*,
    style::{
        text_anchor::{HPos, Pos, VPos},
        Color,
        IntoFont,
        RGBColor,
        TextStyle,
        WHITE
    }
};

/// A single memory access, classified by direction.
#[derive(Debug, Clone, Copy)]
pub enum MemOp {
    Read,
    Write,
}

/// Per-scratchpad-index counters of read and write accesses.
#[derive(Debug, Clone, Copy, Default)]
pub struct MemTracker {
    pub read: u64,
    pub write: u64,
}

// Track the operations used in each iteration
// This is used to verify that we have a good distribution
// in branches and memory operations
#[derive(Debug)]
pub struct OpsTracker {
    // how many times each of the 16 branch ids was taken
    branches: [usize; 16],
    // memory operations per scratchpad index: entry k holds the
    // read/write counters for scratchpad index k
    // NOTE(review): the generic parameter was stripped by extraction;
    // reconstructed as Vec<MemTracker> — confirm against the repository.
    mem_ops: Vec<MemTracker>,
}

impl OpsTracker {
    /// Create a tracker for a scratchpad of `scratchpad` entries,
    /// with all counters zeroed.
    pub fn new(scratchpad: usize) -> Self {
        Self {
            branches: [0; 16],
            mem_ops: vec![Default::default(); scratchpad],
        }
    }

    /// Record that branch `branch` was taken once.
    /// Panics if `branch >= 16`.
    pub fn add_branch(&mut self, branch: u8) {
        self.branches[branch as usize] += 1;
    }

    /// Record one read or write access at scratchpad index `index`.
    /// Panics if `index` is out of range.
    pub fn add_mem_op(&mut self, index: usize, mem_op: MemOp) {
        let tracker = &mut self.mem_ops[index];
        match mem_op {
            MemOp::Read => tracker.read += 1,
            MemOp::Write => tracker.write += 1,
        }
    }

    /// Raw branch counters, indexed by branch id.
    pub fn get_branches(&self) -> &[usize; 16] {
        &self.branches
    }

    /// Raw per-index memory access counters.
    pub fn get_mem_ops(&self) -> &Vec<MemTracker> {
        &self.mem_ops
    }

    /// Generate a percentage-based heatmap of branch usage
    /// and write it as a bitmap image to `output_path`.
    pub fn generate_branch_distribution(&self, output_path: &str) -> Result<(), anyhow::Error> {
        let total: usize = self.branches.iter().sum();
        // avoid division by zero when nothing was recorded
        let total = total.max(1);

        let percentages: Vec<f64> = self.branches
            .iter()
            .map(|&b| (b as f64 / total as f64) * 100.0)
            .collect();

        // Choose a reasonable y max (at least a little above the tallest bar)
        let max_val = percentages
            .iter()
            .cloned()
            .fold(0.0_f64, f64::max)
            .max(10.0);

        // Create drawing area
        let root = BitMapBackend::new(output_path, (1000, 600)).into_drawing_area();
        root.fill(&WHITE)?;

        // Use f64 for x-range so we can put label at i + 0.5
        let mut chart = ChartBuilder::on(&root)
            .caption("Branch Usage Distribution (%)", ("sans-serif", 30))
            .margin(20)
            .x_label_area_size(40)
            .y_label_area_size(60)
            .build_cartesian_2d(0f64..16f64, 0f64..(max_val * 1.12))?; // leave headroom for labels

        chart
            .configure_mesh()
            .x_labels(16)
            .x_label_formatter(&|x| format!("{}", *x as usize))
            .x_desc("Branch ID")
            .y_desc("Usage (%)")
            .axis_desc_style(("sans-serif", 20))
            .draw()?;

        // Bar color
        let bar_style = RGBColor(30, 120, 200).filled();

        // Draw bars using f64 coordinates
        for (i, &pct) in percentages.iter().enumerate() {
            let x0 = i as f64;
            let x1 = x0 + 0.9; // slightly narrower than 1.0 for spacing
            chart.draw_series(std::iter::once(Rectangle::new(
                [(x0, 0.0), (x1, pct)],
                bar_style,
            )))?;
        }

        // Prepare a TextStyle and position it anchored to center above the bar
        let label_style = TextStyle::from(("sans-serif", 14).into_font())
            .pos(Pos::new(HPos::Center, VPos::Bottom));

        // Draw labels
        for (i, &pct) in percentages.iter().enumerate() {
            let x_center = i as f64 + 0.45; // center given x1 = x0 + 0.9
            let y = pct + (max_val * 0.02); // small offset above the bar
            chart.draw_series(std::iter::once(Text::new(
                format!("{:.1}%", pct),
                (x_center, y),
                label_style.clone(),
            )))?;
        }

        root.present()?;
        Ok(())
    }

    /// Plot per-index read/write access counts as overlaid bars, plus a
    /// zero-phase moving average (forward + backward boxcar, "filtfilt")
    /// of each series, and write the chart to `output_path`.
    /// `ma_window` is the boxcar width (clamped to at least 1).
    pub fn generate_memory_usage_graph(
        &self,
        output_path: &str,
        ma_window: usize,
    ) -> Result<(), anyhow::Error> {
        use plotters::prelude::*;

        let scratchpad_size = self.mem_ops.len();
        let mut read_counts = vec![0usize; scratchpad_size];
        let mut write_counts = vec![0usize; scratchpad_size];

        for (i, ops) in self.mem_ops.iter().enumerate() {
            read_counts[i] = ops.read as usize;
            write_counts[i] = ops.write as usize;
        }

        // ---- zero-phase moving average (filtfilt for a boxcar) ----

        // Forward boxcar average over usize data; the first w-1 outputs
        // average only the prefix seen so far (denominator min(i+1, w)).
        #[inline]
        fn ma_forward_usize(data: &[usize], w: usize) -> Vec<f64> {
            let w = w.max(1);
            let mut out = vec![0.0; data.len()];
            let mut sum: u64 = 0;
            for i in 0..data.len() {
                sum += data[i] as u64;
                if i >= w { sum -= data[i - w] as u64; }
                let denom = (i + 1).min(w) as f64;
                out[i] = sum as f64 / denom;
            }
            out
        }

        // Same forward boxcar, but over f64 data (used for the reverse pass).
        #[inline]
        fn ma_forward_f64(data: &[f64], w: usize) -> Vec<f64> {
            let w = w.max(1);
            let mut out = vec![0.0; data.len()];
            let mut sum: f64 = 0.0;
            for i in 0..data.len() {
                sum += data[i];
                if i >= w { sum -= data[i - w]; }
                let denom = (i + 1).min(w) as f64;
                out[i] = sum / denom;
            }
            out
        }

        // Apply the boxcar forward, then again on the reversed signal, and
        // reverse back: cancels the phase lag of a one-sided average.
        #[inline]
        fn filtfilt_ma_usize(data: &[usize], w: usize) -> Vec<f64> {
            let fwd = ma_forward_usize(data, w);
            let mut rev = fwd.clone();
            rev.reverse();
            let rev2 = ma_forward_f64(&rev, w);
            let mut out = rev2;
            out.reverse();
            out
        }

        let read_ma = filtfilt_ma_usize(&read_counts, ma_window);
        let write_ma = filtfilt_ma_usize(&write_counts, ma_window);

        // Y-axis: tall enough for both the raw bars and the MA overlays
        let counts_max = read_counts.iter().zip(write_counts.iter())
            .map(|(&r, &w)| r.max(w))
            .max()
            .unwrap_or(1) as f64;
        let ma_max = read_ma.iter().cloned().fold(0.0, f64::max)
            .max(write_ma.iter().cloned().fold(0.0, f64::max));
        let y_max = counts_max.max(ma_max) * 1.15;

        // ---- plot ----
        let root = BitMapBackend::new(output_path, (1920, 1080)).into_drawing_area();
        root.fill(&WHITE)?;

        let mut chart = ChartBuilder::on(&root)
            .caption(
                format!("Memory Accesses per Index (Read/Write + filtfilt MA({}))", ma_window.max(1)),
                ("sans-serif", 28),
            )
            .margin(20)
            .x_label_area_size(40)
            .y_label_area_size(60)
            .build_cartesian_2d(0f64..scratchpad_size as f64, 0f64..y_max)?;

        chart
            .configure_mesh()
            .x_labels(20)
            .x_label_formatter(&|x| format!("{}", *x as usize))
            .x_desc("Memory Index")
            .y_desc("Access Count")
            .axis_desc_style(("sans-serif", 18))
            .draw()?;

        let read_fill = RGBColor(30, 144, 255).filled();
        let write_fill = RGBColor(220, 50, 47).filled();
        let read_line = RGBColor(30, 144, 255);
        let write_line = RGBColor(220, 50, 47);

        let avg_read_line = RGBColor(100, 180, 255);
        let avg_write_line = RGBColor(255, 100, 100);

        let bar_width = 1.0;

        // Draw the taller bar first so the shorter one remains visible on top
        for i in 0..scratchpad_size {
            let x0 = i as f64 - bar_width / 2.0;
            let x1 = i as f64 + bar_width / 2.0;
            let r = read_counts[i] as f64;
            let w = write_counts[i] as f64;

            if r > w {
                chart.draw_series(std::iter::once(Rectangle::new([(x0, 0.0), (x1, r)], read_fill.clone())))?;
                if w > 0.0 {
                    chart.draw_series(std::iter::once(Rectangle::new([(x0, 0.0), (x1, w)], write_fill.clone())))?;
                }
            } else {
                chart.draw_series(std::iter::once(Rectangle::new([(x0, 0.0), (x1, w)], write_fill.clone())))?;
                if r > 0.0 {
                    chart.draw_series(std::iter::once(Rectangle::new([(x0, 0.0), (x1, r)], read_fill.clone())))?;
                }
            }
        }

        // Zero-phase MA overlays
        chart.draw_series(LineSeries::new(
            (0..scratchpad_size).map(|i| (i as f64, read_ma[i])),
            ShapeStyle::from(&avg_read_line).stroke_width(3),
        ))?.label(format!("Read MA_filtfilt({})", ma_window.max(1)));

        chart.draw_series(LineSeries::new(
            (0..scratchpad_size).map(|i| (i as f64, write_ma[i])),
            ShapeStyle::from(&avg_write_line).stroke_width(3),
        ))?.label(format!("Write MA_filtfilt({})", ma_window.max(1)));

        // Legend: degenerate (invisible) series used purely to attach
        // legend entries with the right swatch shapes
        chart
            .draw_series(std::iter::once(Rectangle::new([(0.0, 0.0), (0.0, 0.0)], read_fill.clone())))?
            .label("Read")
            .legend(move |(x, y)| Rectangle::new([(x, y - 5), (x + 10, y + 5)], read_fill.clone()));
        chart
            .draw_series(std::iter::once(Rectangle::new([(0.0, 0.0), (0.0, 0.0)], write_fill.clone())))?
            .label("Write")
            .legend(move |(x, y)| Rectangle::new([(x, y - 5), (x + 10, y + 5)], write_fill.clone()));
        chart
            .draw_series(std::iter::once(PathElement::new(
                vec![(0.0, 0.0), (0.0, 0.0)],
                ShapeStyle::from(&read_line).stroke_width(3),
            )))?
            .label(format!("Read MA_filtfilt({})", ma_window.max(1)))
            .legend(move |(x, y)| PathElement::new(
                vec![(x, y), (x + 14, y)],
                ShapeStyle::from(&read_line).stroke_width(3),
            ));
        chart
            .draw_series(std::iter::once(PathElement::new(
                vec![(0.0, 0.0), (0.0, 0.0)],
                ShapeStyle::from(&write_line).stroke_width(3),
            )))?
            .label(format!("Write MA_filtfilt({})", ma_window.max(1)))
            .legend(move |(x, y)| PathElement::new(
                vec![(x, y), (x + 14, y)],
                ShapeStyle::from(&write_line).stroke_width(3),
            ));

        chart
            .configure_series_labels()
            .position(SeriesLabelPosition::UpperRight)
            .border_style(&BLACK)
            .background_style(WHITE.mix(0.8))
            .draw()?;

        root.present()?;
        Ok(())
    }

}
-------------------------------------------------------------------------------- /src/v1.rs:
--------------------------------------------------------------------------------
use aes::cipher::generic_array::GenericArray;
use tiny_keccak::keccakp;

use crate::{Hash, HASH_SIZE, Error, scratchpad::ScratchPad as ScratchPadInternal};

// These are tweakable parameters
pub const MEMORY_SIZE: usize = 32768;
pub const SCRATCHPAD_ITERS: usize = 5000;
pub const ITERS: usize = 1;
pub const BUFFER_SIZE: usize = 42;
pub const SLOT_LENGTH: usize = 256;

// Untweakable parameters
pub const KECCAK_WORDS: usize = 25;
pub const BYTES_ARRAY_INPUT: usize = KECCAK_WORDS * 8;
pub const STAGE_1_MAX: usize = MEMORY_SIZE / KECCAK_WORDS;

// Scratchpad used to store intermediate values
// It has a fixed size of `MEMORY_SIZE` u64s
// It can be easily reused for multiple hashing operations safely
// NOTE(review): the generic parameter was stripped by extraction;
// reconstructed as <MEMORY_SIZE> — confirm against src/scratchpad.rs.
pub type ScratchPad = ScratchPadInternal<MEMORY_SIZE>;

// Align the input to 8 bytes
const ALIGNMENT: usize = 8;

#[derive(Debug, bytemuck::Pod, bytemuck::Zeroable, Copy, Clone)]
#[repr(C, align(8))]
pub struct Bytes8Alignment([u8; ALIGNMENT]);

// This is a workaround to force the correct alignment on Windows and MacOS
// We need an input of `BYTES_ARRAY_INPUT` bytes, but we need to ensure that it's aligned to 8 bytes
// to be able to cast it to a `[u64; KECCAK_WORDS]` later on.
#[derive(Debug, Clone)]
pub struct AlignedInput {
    // backing storage of 8-byte-aligned chunks; viewed as raw bytes by the
    // accessors below
    // NOTE(review): generic parameter stripped by extraction; reconstructed
    // as Vec<Bytes8Alignment> — confirm against the repository.
    data: Vec<Bytes8Alignment>,
}

impl Default for AlignedInput {
    // Allocate enough aligned chunks to cover BYTES_ARRAY_INPUT bytes,
    // rounding up if it is not a multiple of ALIGNMENT.
    fn default() -> Self {
        let mut n = BYTES_ARRAY_INPUT / ALIGNMENT;
        if BYTES_ARRAY_INPUT % ALIGNMENT != 0 {
            n += 1;
        }

        Self {
            data: vec![Bytes8Alignment([0; ALIGNMENT]); n]
        }
    }
}

impl AlignedInput {
    // The number of elements in the input
    pub fn len(&self) -> usize {
        self.data.len()
    }

    // The size of the input in bytes
    pub fn size(&self) -> usize {
        self.data.len() * ALIGNMENT
    }

    // Get a mutable pointer to the input
    pub fn as_mut_ptr(&mut self) -> *mut u8 {
        self.data.as_mut_ptr() as *mut u8
    }

    // Retrieve the input as a mutable slice
    pub fn as_mut_slice(&mut self) -> Result<&mut [u8; BYTES_ARRAY_INPUT], Error> {
        bytemuck::cast_slice_mut(&mut self.data).try_into().map_err(|_| Error::FormatError)
    }

    // Retrieve the input as a slice
    pub fn as_slice(&self) -> Result<&[u8; BYTES_ARRAY_INPUT], Error> {
        bytemuck::cast_slice(&self.data).try_into().map_err(|_| Error::FormatError)
    }
}

// Stage 1: fill `scratch_pad` from repeated keccak-p permutations of the
// input state. `a` is the inclusive range of outer (row) iterations, `b`
// the inclusive range of words written per row. Each written word mixes the
// permuted state with a running value (`rand_int`) and a 2-bit branch on
// the XOR of two neighboring state words. Consensus-critical: behavior
// must not change.
#[inline(always)]
fn stage_1(input: &mut [u64; KECCAK_WORDS], scratch_pad: &mut [u64; MEMORY_SIZE], a: (usize, usize), b: (usize, usize)) {
    for i in a.0..=a.1 {
        keccakp(input);

        let mut rand_int: u64 = 0;
        for j in b.0..=b.1 {
            let pair_idx = (j + 1) % KECCAK_WORDS;
            let pair_idx2 = (j + 2) % KECCAK_WORDS;

            let target_idx = i * KECCAK_WORDS + j;
            let a = input[j] ^ rand_int;
            // Branching
            let left = input[pair_idx];
            let right = input[pair_idx2];
            let xor = left ^ right;
            let v = match xor & 0x3 {
                0 => left & right,
                1 => !(left & right),
                2 => !xor,
                3 => xor,
                _ => unreachable!(),
            };
            let b = a ^ v;
            rand_int = b;
            scratch_pad[target_idx] = b;
        }
    }
}

// This function is used to hash the input using the generated scratch pad
// NOTE: The scratchpad is completely overwritten in stage 1 and can be reused without any issues
// Stages: (1) keccak-based scratchpad fill, (2) slot-shuffle pass over the
// scratchpad viewed as u32s, (3) AES-round mixing loop whose last four
// results form the 32-byte output. All arithmetic is wrapping and
// consensus-critical.
// NOTE(review): return type generics stripped by extraction; reconstructed
// as Result<Hash, Error> — consistent with the test usage below.
pub fn xelis_hash(input: &mut [u8; BYTES_ARRAY_INPUT], scratch_pad: &mut ScratchPad) -> Result<Hash, Error> {
    let int_input: &mut [u64; KECCAK_WORDS] = bytemuck::try_from_bytes_mut(input)
        .map_err(|e| Error::CastError(e))?;

    // stage 1
    let scratch_pad = scratch_pad.as_mut_slice();
    stage_1(int_input, scratch_pad, (0, STAGE_1_MAX - 1), (0, KECCAK_WORDS - 1));
    stage_1(int_input, scratch_pad, (STAGE_1_MAX, STAGE_1_MAX), (0, 17));

    // stage 2
    let mut slots: [u32; SLOT_LENGTH] = [0; SLOT_LENGTH];
    // this is equal to MEMORY_SIZE, just in u32 format
    let small_pad: &mut [u32; MEMORY_SIZE * 2] = bytemuck::try_cast_slice_mut(scratch_pad)
        .map_err(|e| Error::CastError(e))?
        .try_into()
        .map_err(|_| Error::FormatError)?;

    slots.copy_from_slice(&small_pad[small_pad.len() - SLOT_LENGTH..]);

    let mut indices: [u16; SLOT_LENGTH] = [0; SLOT_LENGTH];
    for _ in 0..ITERS {
        for j in 0..small_pad.len() / SLOT_LENGTH {
            // Initialize indices and precompute the total sum of small pad
            // (sign of each slot's top bit decides add vs subtract)
            let mut total_sum: u32 = 0;
            for k in 0..SLOT_LENGTH {
                indices[k] = k as u16;
                if slots[k] >> 31 == 0 {
                    total_sum = total_sum.wrapping_add(small_pad[j * SLOT_LENGTH + k]);
                } else {
                    total_sum = total_sum.wrapping_sub(small_pad[j * SLOT_LENGTH + k]);
                }
            }

            // Fisher-Yates-style selection: pick a remaining slot at random
            // (driven by pad data), update it, and keep total_sum consistent
            for slot_idx in (0..SLOT_LENGTH).rev() {
                let index_in_indices = (small_pad[j * SLOT_LENGTH + slot_idx] % (slot_idx as u32 + 1)) as usize;
                let index = indices[index_in_indices] as usize;
                indices[index_in_indices] = indices[slot_idx];

                let mut local_sum = total_sum;
                let s1 = (slots[index] >> 31) as i32;
                let pad_value = small_pad[j * SLOT_LENGTH + index];
                if s1 == 0 {
                    local_sum = local_sum.wrapping_sub(pad_value);
                } else {
                    local_sum = local_sum.wrapping_add(pad_value);
                }

                // Apply the sum to the slot
                slots[index] = slots[index].wrapping_add(local_sum);

                // Update the total sum
                // (corrects for a possible sign flip of the updated slot)
                let s2 = (slots[index] >> 31) as i32;
                total_sum = total_sum.wrapping_sub(2u32.wrapping_mul(small_pad[(j * SLOT_LENGTH).wrapping_add(index)].wrapping_mul((-s1).wrapping_add(s2) as u32)));
            }
        }
    }

    small_pad[(MEMORY_SIZE * 2) - SLOT_LENGTH..].copy_from_slice(&slots);

    // stage 3
    let key = GenericArray::from([0u8; 16]);
    let mut block = GenericArray::from([0u8; 16]);

    // 15-bit addresses into the scratchpad (MEMORY_SIZE == 2^15)
    let mut addr_a = (scratch_pad[MEMORY_SIZE - 1] >> 15) & 0x7FFF;
    let mut addr_b = scratch_pad[MEMORY_SIZE - 1] & 0x7FFF;

    let mut mem_buffer_a: [u64; BUFFER_SIZE] = [0; BUFFER_SIZE];
    let mut mem_buffer_b: [u64; BUFFER_SIZE] = [0; BUFFER_SIZE];

    for i in 0..BUFFER_SIZE as u64 {
        mem_buffer_a[i as usize] = scratch_pad[((addr_a + i) % MEMORY_SIZE as u64) as usize];
        mem_buffer_b[i as usize] = scratch_pad[((addr_b + i) % MEMORY_SIZE as u64) as usize];
    }

    let mut final_result = [0; HASH_SIZE];

    for i in 0..SCRATCHPAD_ITERS {
        let mem_a = mem_buffer_a[i % BUFFER_SIZE];
        let mem_b = mem_buffer_b[i % BUFFER_SIZE];

        block[..8].copy_from_slice(&mem_b.to_le_bytes());
        block[8..].copy_from_slice(&mem_a.to_le_bytes());

        aes::hazmat::cipher_round(&mut block, &key);

        let hash1 = u64::from_le_bytes(block[0..8].try_into().map_err(|_| Error::FormatError)?);
        let hash2 = mem_a ^ mem_b;

        let mut result = !(hash1 ^ hash2);

        for j in 0..HASH_SIZE {
            let a = mem_buffer_a[(j + i) % BUFFER_SIZE];
            let b = mem_buffer_b[(j + i) % BUFFER_SIZE];

            // more branching
            let v = match (result >> (j * 2)) & 0xf {
                0 => result.rotate_left(j as u32) ^ b,
                1 => !(result.rotate_left(j as u32) ^ a),
                2 => !(result ^ a),
                3 => result ^ b,
                4 => result ^ (a.wrapping_add(b)),
                5 => result ^ (a.wrapping_sub(b)),
                6 => result ^ (b.wrapping_sub(a)),
                7 => result ^ (a.wrapping_mul(b)),
                8 => result ^ (a & b),
                9 => result ^ (a | b),
                10 => result ^ (a ^ b),
                11 => result ^ (a.wrapping_sub(result)),
                12 => result ^ (b.wrapping_sub(result)),
                13 => result ^ (a.wrapping_add(result)),
                14 => result ^ (result.wrapping_sub(a)),
                15 => result ^ (result.wrapping_sub(b)),
                _ => unreachable!(),
            };

            result = v;
        }

        addr_b = result & 0x7FFF;
        mem_buffer_a[i % BUFFER_SIZE] = result;
        mem_buffer_b[i % BUFFER_SIZE] = scratch_pad[addr_b as usize];

        addr_a = (result >> 15) & 0x7FFF;
        scratch_pad[addr_a as usize] = result;

        // The last 4 iterations fill the 32-byte output, most significant
        // chunk first (index == SCRATCHPAD_ITERS - i - 1, so the slice is
        // exactly 8 bytes wide)
        let index = SCRATCHPAD_ITERS - i - 1;
        if index < 4 {
            final_result[index * 8..(SCRATCHPAD_ITERS - i) * 8].copy_from_slice(&result.to_be_bytes());
        }
    }

    Ok(final_result)
}

#[cfg(test)]
mod tests {
    use super::*;

    // Run the hash on `input` and compare against a known test vector
    fn test_input(input: &mut [u8; BYTES_ARRAY_INPUT], expected_hash: Hash) {
        let mut scratch_pad = ScratchPad::default();
        let hash = xelis_hash(input, &mut scratch_pad).unwrap();
        assert_eq!(hash, expected_hash);
    }

    #[test]
    fn test_zero_input() {
        let mut input = [0u8; 200];
        let expected_hash = [
            0x0e, 0xbb, 0xbd, 0x8a, 0x31, 0xed, 0xad, 0xfe, 0x09, 0x8f, 0x2d, 0x77, 0x0d, 0x84,
            0xb7, 0x19, 0x58, 0x86, 0x75, 0xab, 0x88, 0xa0, 0xa1, 0x70, 0x67, 0xd0, 0x0a, 0x8f,
            0x36, 0x18, 0x22, 0x65,
        ];

        test_input(&mut input, expected_hash);
    }

    #[test]
    fn test_xelis_input() {
        let mut input = [0u8; BYTES_ARRAY_INPUT];

        let custom = b"xelis-hashing-algorithm";
        input[0..custom.len()].copy_from_slice(custom);

        let expected_hash = [
            106, 106, 173, 8, 207, 59, 118, 108, 176, 196, 9, 124, 250, 195, 3,
            61, 30, 146, 238, 182, 88, 83, 115, 81, 139, 56, 3, 28, 176, 86, 68, 21
        ];
        test_input(&mut input, expected_hash);
    }

    #[test]
    fn test_scratch_pad() {
        let mut scratch_pad = ScratchPad::default();
        let mut input = AlignedInput::default();

        let hash = xelis_hash(input.as_mut_slice().unwrap(), &mut scratch_pad).unwrap();
        let expected_hash = [
            0x0e, 0xbb, 0xbd, 0x8a, 0x31, 0xed, 0xad, 0xfe, 0x09, 0x8f, 0x2d, 0x77, 0x0d, 0x84,
            0xb7, 0x19, 0x58, 0x86, 0x75, 0xab, 0x88, 0xa0, 0xa1, 0x70, 0x67, 0xd0, 0x0a, 0x8f,
            0x36, 0x18, 0x22, 0x65,
        ];
        assert_eq!(hash, expected_hash);
    }

    #[test]
    fn test_bytes_alignment() {
        // NOTE(review): turbofish target stripped by extraction;
        // reconstructed as Bytes8Alignment, matching the assertion below.
        let alignment = std::mem::align_of::<Bytes8Alignment>();
        assert_eq!(alignment, 8);
    }
}
-------------------------------------------------------------------------------- /go/v1/keccak.go:
--------------------------------------------------------------------------------
package v1

import "math/bits"

const RC_LEN = 12

// rc stores the round constants for use in the ι step.
var rc = [RC_LEN]uint64{
	0x000000008000808b,
	0x800000000000008b,
	0x8000000000008089,
	0x8000000000008003,
	0x8000000000008002,
	0x8000000000000080,
	0x000000000000800a,
	0x800000008000000a,
	0x8000000080008081,
	0x8000000000008080,
	0x0000000080000001,
	0x8000000080008008,
}

// keccakF1600 applies the Keccak permutation to a 1600b-wide
// state represented as a slice of 25 uint64s.
// This is copied directly from golang.org/x/crypto/sha3/keccakf.go
func KeccakF1600(a *[25]uint64) {
	// Implementation translated from Keccak-inplace.c
	// in the keccak reference code.
29 | var t, bc0, bc1, bc2, bc3, bc4, d0, d1, d2, d3, d4 uint64 30 | 31 | for i := 0; i < RC_LEN; i += 4 { 32 | // Combines the 5 steps in each round into 2 steps. 33 | // Unrolls 4 rounds per loop and spreads some steps across rounds. 34 | 35 | // Round 1 36 | bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20] 37 | bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21] 38 | bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22] 39 | bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23] 40 | bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24] 41 | d0 = bc4 ^ (bc1<<1 | bc1>>63) 42 | d1 = bc0 ^ (bc2<<1 | bc2>>63) 43 | d2 = bc1 ^ (bc3<<1 | bc3>>63) 44 | d3 = bc2 ^ (bc4<<1 | bc4>>63) 45 | d4 = bc3 ^ (bc0<<1 | bc0>>63) 46 | 47 | bc0 = a[0] ^ d0 48 | t = a[6] ^ d1 49 | bc1 = bits.RotateLeft64(t, 44) 50 | t = a[12] ^ d2 51 | bc2 = bits.RotateLeft64(t, 43) 52 | t = a[18] ^ d3 53 | bc3 = bits.RotateLeft64(t, 21) 54 | t = a[24] ^ d4 55 | bc4 = bits.RotateLeft64(t, 14) 56 | a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i] 57 | a[6] = bc1 ^ (bc3 &^ bc2) 58 | a[12] = bc2 ^ (bc4 &^ bc3) 59 | a[18] = bc3 ^ (bc0 &^ bc4) 60 | a[24] = bc4 ^ (bc1 &^ bc0) 61 | 62 | t = a[10] ^ d0 63 | bc2 = bits.RotateLeft64(t, 3) 64 | t = a[16] ^ d1 65 | bc3 = bits.RotateLeft64(t, 45) 66 | t = a[22] ^ d2 67 | bc4 = bits.RotateLeft64(t, 61) 68 | t = a[3] ^ d3 69 | bc0 = bits.RotateLeft64(t, 28) 70 | t = a[9] ^ d4 71 | bc1 = bits.RotateLeft64(t, 20) 72 | a[10] = bc0 ^ (bc2 &^ bc1) 73 | a[16] = bc1 ^ (bc3 &^ bc2) 74 | a[22] = bc2 ^ (bc4 &^ bc3) 75 | a[3] = bc3 ^ (bc0 &^ bc4) 76 | a[9] = bc4 ^ (bc1 &^ bc0) 77 | 78 | t = a[20] ^ d0 79 | bc4 = bits.RotateLeft64(t, 18) 80 | t = a[1] ^ d1 81 | bc0 = bits.RotateLeft64(t, 1) 82 | t = a[7] ^ d2 83 | bc1 = bits.RotateLeft64(t, 6) 84 | t = a[13] ^ d3 85 | bc2 = bits.RotateLeft64(t, 25) 86 | t = a[19] ^ d4 87 | bc3 = bits.RotateLeft64(t, 8) 88 | a[20] = bc0 ^ (bc2 &^ bc1) 89 | a[1] = bc1 ^ (bc3 &^ bc2) 90 | a[7] = bc2 ^ (bc4 &^ bc3) 91 | a[13] = bc3 ^ (bc0 &^ bc4) 92 | a[19] = bc4 ^ (bc1 &^ bc0) 93 | 94 | t = a[5] ^ d0 95 | bc1 = 
bits.RotateLeft64(t, 36) 96 | t = a[11] ^ d1 97 | bc2 = bits.RotateLeft64(t, 10) 98 | t = a[17] ^ d2 99 | bc3 = bits.RotateLeft64(t, 15) 100 | t = a[23] ^ d3 101 | bc4 = bits.RotateLeft64(t, 56) 102 | t = a[4] ^ d4 103 | bc0 = bits.RotateLeft64(t, 27) 104 | a[5] = bc0 ^ (bc2 &^ bc1) 105 | a[11] = bc1 ^ (bc3 &^ bc2) 106 | a[17] = bc2 ^ (bc4 &^ bc3) 107 | a[23] = bc3 ^ (bc0 &^ bc4) 108 | a[4] = bc4 ^ (bc1 &^ bc0) 109 | 110 | t = a[15] ^ d0 111 | bc3 = bits.RotateLeft64(t, 41) 112 | t = a[21] ^ d1 113 | bc4 = bits.RotateLeft64(t, 2) 114 | t = a[2] ^ d2 115 | bc0 = bits.RotateLeft64(t, 62) 116 | t = a[8] ^ d3 117 | bc1 = bits.RotateLeft64(t, 55) 118 | t = a[14] ^ d4 119 | bc2 = bits.RotateLeft64(t, 39) 120 | a[15] = bc0 ^ (bc2 &^ bc1) 121 | a[21] = bc1 ^ (bc3 &^ bc2) 122 | a[2] = bc2 ^ (bc4 &^ bc3) 123 | a[8] = bc3 ^ (bc0 &^ bc4) 124 | a[14] = bc4 ^ (bc1 &^ bc0) 125 | 126 | // Round 2 127 | bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20] 128 | bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21] 129 | bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22] 130 | bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23] 131 | bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24] 132 | d0 = bc4 ^ (bc1<<1 | bc1>>63) 133 | d1 = bc0 ^ (bc2<<1 | bc2>>63) 134 | d2 = bc1 ^ (bc3<<1 | bc3>>63) 135 | d3 = bc2 ^ (bc4<<1 | bc4>>63) 136 | d4 = bc3 ^ (bc0<<1 | bc0>>63) 137 | 138 | bc0 = a[0] ^ d0 139 | t = a[16] ^ d1 140 | bc1 = bits.RotateLeft64(t, 44) 141 | t = a[7] ^ d2 142 | bc2 = bits.RotateLeft64(t, 43) 143 | t = a[23] ^ d3 144 | bc3 = bits.RotateLeft64(t, 21) 145 | t = a[14] ^ d4 146 | bc4 = bits.RotateLeft64(t, 14) 147 | a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+1] 148 | a[16] = bc1 ^ (bc3 &^ bc2) 149 | a[7] = bc2 ^ (bc4 &^ bc3) 150 | a[23] = bc3 ^ (bc0 &^ bc4) 151 | a[14] = bc4 ^ (bc1 &^ bc0) 152 | 153 | t = a[20] ^ d0 154 | bc2 = bits.RotateLeft64(t, 3) 155 | t = a[11] ^ d1 156 | bc3 = bits.RotateLeft64(t, 45) 157 | t = a[2] ^ d2 158 | bc4 = bits.RotateLeft64(t, 61) 159 | t = a[18] ^ d3 160 | bc0 = bits.RotateLeft64(t, 28) 161 | t = a[9] ^ 
d4 162 | bc1 = bits.RotateLeft64(t, 20) 163 | a[20] = bc0 ^ (bc2 &^ bc1) 164 | a[11] = bc1 ^ (bc3 &^ bc2) 165 | a[2] = bc2 ^ (bc4 &^ bc3) 166 | a[18] = bc3 ^ (bc0 &^ bc4) 167 | a[9] = bc4 ^ (bc1 &^ bc0) 168 | 169 | t = a[15] ^ d0 170 | bc4 = bits.RotateLeft64(t, 18) 171 | t = a[6] ^ d1 172 | bc0 = bits.RotateLeft64(t, 1) 173 | t = a[22] ^ d2 174 | bc1 = bits.RotateLeft64(t, 6) 175 | t = a[13] ^ d3 176 | bc2 = bits.RotateLeft64(t, 25) 177 | t = a[4] ^ d4 178 | bc3 = bits.RotateLeft64(t, 8) 179 | a[15] = bc0 ^ (bc2 &^ bc1) 180 | a[6] = bc1 ^ (bc3 &^ bc2) 181 | a[22] = bc2 ^ (bc4 &^ bc3) 182 | a[13] = bc3 ^ (bc0 &^ bc4) 183 | a[4] = bc4 ^ (bc1 &^ bc0) 184 | 185 | t = a[10] ^ d0 186 | bc1 = bits.RotateLeft64(t, 36) 187 | t = a[1] ^ d1 188 | bc2 = bits.RotateLeft64(t, 10) 189 | t = a[17] ^ d2 190 | bc3 = bits.RotateLeft64(t, 15) 191 | t = a[8] ^ d3 192 | bc4 = bits.RotateLeft64(t, 56) 193 | t = a[24] ^ d4 194 | bc0 = bits.RotateLeft64(t, 27) 195 | a[10] = bc0 ^ (bc2 &^ bc1) 196 | a[1] = bc1 ^ (bc3 &^ bc2) 197 | a[17] = bc2 ^ (bc4 &^ bc3) 198 | a[8] = bc3 ^ (bc0 &^ bc4) 199 | a[24] = bc4 ^ (bc1 &^ bc0) 200 | 201 | t = a[5] ^ d0 202 | bc3 = bits.RotateLeft64(t, 41) 203 | t = a[21] ^ d1 204 | bc4 = bits.RotateLeft64(t, 2) 205 | t = a[12] ^ d2 206 | bc0 = bits.RotateLeft64(t, 62) 207 | t = a[3] ^ d3 208 | bc1 = bits.RotateLeft64(t, 55) 209 | t = a[19] ^ d4 210 | bc2 = bits.RotateLeft64(t, 39) 211 | a[5] = bc0 ^ (bc2 &^ bc1) 212 | a[21] = bc1 ^ (bc3 &^ bc2) 213 | a[12] = bc2 ^ (bc4 &^ bc3) 214 | a[3] = bc3 ^ (bc0 &^ bc4) 215 | a[19] = bc4 ^ (bc1 &^ bc0) 216 | 217 | // Round 3 218 | bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20] 219 | bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21] 220 | bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22] 221 | bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23] 222 | bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24] 223 | d0 = bc4 ^ (bc1<<1 | bc1>>63) 224 | d1 = bc0 ^ (bc2<<1 | bc2>>63) 225 | d2 = bc1 ^ (bc3<<1 | bc3>>63) 226 | d3 = bc2 ^ (bc4<<1 | bc4>>63) 227 | d4 = bc3 ^ 
(bc0<<1 | bc0>>63) 228 | 229 | bc0 = a[0] ^ d0 230 | t = a[11] ^ d1 231 | bc1 = bits.RotateLeft64(t, 44) 232 | t = a[22] ^ d2 233 | bc2 = bits.RotateLeft64(t, 43) 234 | t = a[8] ^ d3 235 | bc3 = bits.RotateLeft64(t, 21) 236 | t = a[19] ^ d4 237 | bc4 = bits.RotateLeft64(t, 14) 238 | a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+2] 239 | a[11] = bc1 ^ (bc3 &^ bc2) 240 | a[22] = bc2 ^ (bc4 &^ bc3) 241 | a[8] = bc3 ^ (bc0 &^ bc4) 242 | a[19] = bc4 ^ (bc1 &^ bc0) 243 | 244 | t = a[15] ^ d0 245 | bc2 = bits.RotateLeft64(t, 3) 246 | t = a[1] ^ d1 247 | bc3 = bits.RotateLeft64(t, 45) 248 | t = a[12] ^ d2 249 | bc4 = bits.RotateLeft64(t, 61) 250 | t = a[23] ^ d3 251 | bc0 = bits.RotateLeft64(t, 28) 252 | t = a[9] ^ d4 253 | bc1 = bits.RotateLeft64(t, 20) 254 | a[15] = bc0 ^ (bc2 &^ bc1) 255 | a[1] = bc1 ^ (bc3 &^ bc2) 256 | a[12] = bc2 ^ (bc4 &^ bc3) 257 | a[23] = bc3 ^ (bc0 &^ bc4) 258 | a[9] = bc4 ^ (bc1 &^ bc0) 259 | 260 | t = a[5] ^ d0 261 | bc4 = bits.RotateLeft64(t, 18) 262 | t = a[16] ^ d1 263 | bc0 = bits.RotateLeft64(t, 1) 264 | t = a[2] ^ d2 265 | bc1 = bits.RotateLeft64(t, 6) 266 | t = a[13] ^ d3 267 | bc2 = bits.RotateLeft64(t, 25) 268 | t = a[24] ^ d4 269 | bc3 = bits.RotateLeft64(t, 8) 270 | a[5] = bc0 ^ (bc2 &^ bc1) 271 | a[16] = bc1 ^ (bc3 &^ bc2) 272 | a[2] = bc2 ^ (bc4 &^ bc3) 273 | a[13] = bc3 ^ (bc0 &^ bc4) 274 | a[24] = bc4 ^ (bc1 &^ bc0) 275 | 276 | t = a[20] ^ d0 277 | bc1 = bits.RotateLeft64(t, 36) 278 | t = a[6] ^ d1 279 | bc2 = bits.RotateLeft64(t, 10) 280 | t = a[17] ^ d2 281 | bc3 = bits.RotateLeft64(t, 15) 282 | t = a[3] ^ d3 283 | bc4 = bits.RotateLeft64(t, 56) 284 | t = a[14] ^ d4 285 | bc0 = bits.RotateLeft64(t, 27) 286 | a[20] = bc0 ^ (bc2 &^ bc1) 287 | a[6] = bc1 ^ (bc3 &^ bc2) 288 | a[17] = bc2 ^ (bc4 &^ bc3) 289 | a[3] = bc3 ^ (bc0 &^ bc4) 290 | a[14] = bc4 ^ (bc1 &^ bc0) 291 | 292 | t = a[10] ^ d0 293 | bc3 = bits.RotateLeft64(t, 41) 294 | t = a[21] ^ d1 295 | bc4 = bits.RotateLeft64(t, 2) 296 | t = a[7] ^ d2 297 | bc0 = bits.RotateLeft64(t, 62) 298 
| t = a[18] ^ d3 299 | bc1 = bits.RotateLeft64(t, 55) 300 | t = a[4] ^ d4 301 | bc2 = bits.RotateLeft64(t, 39) 302 | a[10] = bc0 ^ (bc2 &^ bc1) 303 | a[21] = bc1 ^ (bc3 &^ bc2) 304 | a[7] = bc2 ^ (bc4 &^ bc3) 305 | a[18] = bc3 ^ (bc0 &^ bc4) 306 | a[4] = bc4 ^ (bc1 &^ bc0) 307 | 308 | // Round 4 309 | bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20] 310 | bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21] 311 | bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22] 312 | bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23] 313 | bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24] 314 | d0 = bc4 ^ (bc1<<1 | bc1>>63) 315 | d1 = bc0 ^ (bc2<<1 | bc2>>63) 316 | d2 = bc1 ^ (bc3<<1 | bc3>>63) 317 | d3 = bc2 ^ (bc4<<1 | bc4>>63) 318 | d4 = bc3 ^ (bc0<<1 | bc0>>63) 319 | 320 | bc0 = a[0] ^ d0 321 | t = a[1] ^ d1 322 | bc1 = bits.RotateLeft64(t, 44) 323 | t = a[2] ^ d2 324 | bc2 = bits.RotateLeft64(t, 43) 325 | t = a[3] ^ d3 326 | bc3 = bits.RotateLeft64(t, 21) 327 | t = a[4] ^ d4 328 | bc4 = bits.RotateLeft64(t, 14) 329 | a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+3] 330 | a[1] = bc1 ^ (bc3 &^ bc2) 331 | a[2] = bc2 ^ (bc4 &^ bc3) 332 | a[3] = bc3 ^ (bc0 &^ bc4) 333 | a[4] = bc4 ^ (bc1 &^ bc0) 334 | 335 | t = a[5] ^ d0 336 | bc2 = bits.RotateLeft64(t, 3) 337 | t = a[6] ^ d1 338 | bc3 = bits.RotateLeft64(t, 45) 339 | t = a[7] ^ d2 340 | bc4 = bits.RotateLeft64(t, 61) 341 | t = a[8] ^ d3 342 | bc0 = bits.RotateLeft64(t, 28) 343 | t = a[9] ^ d4 344 | bc1 = bits.RotateLeft64(t, 20) 345 | a[5] = bc0 ^ (bc2 &^ bc1) 346 | a[6] = bc1 ^ (bc3 &^ bc2) 347 | a[7] = bc2 ^ (bc4 &^ bc3) 348 | a[8] = bc3 ^ (bc0 &^ bc4) 349 | a[9] = bc4 ^ (bc1 &^ bc0) 350 | 351 | t = a[10] ^ d0 352 | bc4 = bits.RotateLeft64(t, 18) 353 | t = a[11] ^ d1 354 | bc0 = bits.RotateLeft64(t, 1) 355 | t = a[12] ^ d2 356 | bc1 = bits.RotateLeft64(t, 6) 357 | t = a[13] ^ d3 358 | bc2 = bits.RotateLeft64(t, 25) 359 | t = a[14] ^ d4 360 | bc3 = bits.RotateLeft64(t, 8) 361 | a[10] = bc0 ^ (bc2 &^ bc1) 362 | a[11] = bc1 ^ (bc3 &^ bc2) 363 | a[12] = bc2 ^ (bc4 &^ bc3) 364 | a[13] = 
bc3 ^ (bc0 &^ bc4) 365 | a[14] = bc4 ^ (bc1 &^ bc0) 366 | 367 | t = a[15] ^ d0 368 | bc1 = bits.RotateLeft64(t, 36) 369 | t = a[16] ^ d1 370 | bc2 = bits.RotateLeft64(t, 10) 371 | t = a[17] ^ d2 372 | bc3 = bits.RotateLeft64(t, 15) 373 | t = a[18] ^ d3 374 | bc4 = bits.RotateLeft64(t, 56) 375 | t = a[19] ^ d4 376 | bc0 = bits.RotateLeft64(t, 27) 377 | a[15] = bc0 ^ (bc2 &^ bc1) 378 | a[16] = bc1 ^ (bc3 &^ bc2) 379 | a[17] = bc2 ^ (bc4 &^ bc3) 380 | a[18] = bc3 ^ (bc0 &^ bc4) 381 | a[19] = bc4 ^ (bc1 &^ bc0) 382 | 383 | t = a[20] ^ d0 384 | bc3 = bits.RotateLeft64(t, 41) 385 | t = a[21] ^ d1 386 | bc4 = bits.RotateLeft64(t, 2) 387 | t = a[22] ^ d2 388 | bc0 = bits.RotateLeft64(t, 62) 389 | t = a[23] ^ d3 390 | bc1 = bits.RotateLeft64(t, 55) 391 | t = a[24] ^ d4 392 | bc2 = bits.RotateLeft64(t, 39) 393 | a[20] = bc0 ^ (bc2 &^ bc1) 394 | a[21] = bc1 ^ (bc3 &^ bc2) 395 | a[22] = bc2 ^ (bc4 &^ bc3) 396 | a[23] = bc3 ^ (bc0 &^ bc4) 397 | a[24] = bc4 ^ (bc1 &^ bc0) 398 | } 399 | } 400 | -------------------------------------------------------------------------------- /C/xelis_hash_v2.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "BLAKE3/c/blake3.h" 12 | #include "ChaCha20-SIMD/chacha20.h" 13 | 14 | #define INPUT_LEN (112) 15 | #define MEMSIZE (429 * 128) 16 | #define ITERS (3) 17 | #define HASH_SIZE (32) 18 | #define CHUNK_SIZE (32) 19 | #define NONCE_SIZE (12) 20 | #define OUTPUT_SIZE (MEMSIZE * 8) 21 | #define CHUNKS (4) 22 | #define INPUT_LEN (112) 23 | 24 | static inline void blake3(const uint8_t *input, int len, uint8_t *output) 25 | { 26 | blake3_hasher hasher; 27 | blake3_hasher_init(&hasher); 28 | blake3_hasher_update(&hasher, input, len); 29 | blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); 30 | } 31 | 32 | 33 | void stage1(const uint8_t *input, size_t input_len, 
uint8_t scratch_pad[OUTPUT_SIZE]) 34 | { 35 | uint8_t key[CHUNK_SIZE * CHUNKS] = {0}; 36 | uint8_t input_hash[HASH_SIZE]; 37 | uint8_t buffer[CHUNK_SIZE * 2]; 38 | memcpy(key, input, INPUT_LEN); 39 | blake3(input, INPUT_LEN, buffer); 40 | 41 | uint8_t *t = scratch_pad; 42 | 43 | memcpy(buffer + CHUNK_SIZE, key + 0 * CHUNK_SIZE, CHUNK_SIZE); 44 | blake3(buffer, CHUNK_SIZE * 2, input_hash); 45 | chacha_encrypt(input_hash, buffer, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 46 | 47 | t += OUTPUT_SIZE / CHUNKS; 48 | memcpy(buffer, input_hash, CHUNK_SIZE); 49 | memcpy(buffer + CHUNK_SIZE, key + 1 * CHUNK_SIZE, CHUNK_SIZE); 50 | blake3(buffer, CHUNK_SIZE * 2, input_hash); 51 | chacha_encrypt(input_hash, t - NONCE_SIZE, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 52 | 53 | t += OUTPUT_SIZE / CHUNKS; 54 | memcpy(buffer, input_hash, CHUNK_SIZE); 55 | memcpy(buffer + CHUNK_SIZE, key + 2 * CHUNK_SIZE, CHUNK_SIZE); 56 | blake3(buffer, CHUNK_SIZE * 2, input_hash); 57 | chacha_encrypt(input_hash, t - NONCE_SIZE, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 58 | 59 | t += OUTPUT_SIZE / CHUNKS; 60 | memcpy(buffer, input_hash, CHUNK_SIZE); 61 | memcpy(buffer + CHUNK_SIZE, key + 3 * CHUNK_SIZE, CHUNK_SIZE); 62 | blake3(buffer, CHUNK_SIZE * 2, input_hash); 63 | chacha_encrypt(input_hash, t - NONCE_SIZE, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 64 | } 65 | 66 | #define KEY "xelishash-pow-v2" 67 | #define BUFSIZE (MEMSIZE / 2) 68 | 69 | // https://danlark.org/2020/06/14/128-bit-division 70 | static inline uint64_t Divide128Div64To64(uint64_t high, uint64_t low, uint64_t divisor, uint64_t *remainder) 71 | { 72 | uint64_t result; 73 | __asm__("divq %[v]" 74 | : "=a"(result), "=d"(*remainder) // Output parametrs, =a for rax, =d for rdx, [v] is an 75 | // alias for divisor, input paramters "a" and "d" for low and high. 
76 | : [v] "r"(divisor), "a"(low), "d"(high)); 77 | return result; 78 | } 79 | 80 | static inline uint64_t udiv(uint64_t high, uint64_t low, uint64_t divisor) 81 | { 82 | uint64_t remainder; 83 | 84 | if (high < divisor) 85 | { 86 | return Divide128Div64To64(high, low, divisor, &remainder); 87 | } 88 | else 89 | { 90 | uint64_t qhi = Divide128Div64To64(0, high, divisor, &high); 91 | return Divide128Div64To64(high, low, divisor, &remainder); 92 | } 93 | } 94 | 95 | static inline uint64_t ROTR(uint64_t x, uint32_t r) 96 | { 97 | asm("rorq %%cl, %0" : "+r"(x) : "c"(r)); 98 | return x; 99 | } 100 | 101 | static inline uint64_t ROTL(uint64_t x, uint32_t r) 102 | { 103 | asm("rolq %%cl, %0" : "+r"(x) : "c"(r)); 104 | return x; 105 | } 106 | 107 | static inline __uint128_t combine_uint64(uint64_t high, uint64_t low) 108 | { 109 | return ((__uint128_t)high << 64) | low; 110 | } 111 | 112 | /* 113 | uint64_t isqrt(uint64_t n) { 114 | if (n < 2) 115 | return n; 116 | 117 | uint64_t x = n; 118 | uint64_t y = (x + 1) >> 1; 119 | 120 | while (y < x) { 121 | x = y; 122 | y = (x + n / x) >> 1; 123 | } 124 | 125 | return x; 126 | } 127 | */ 128 | 129 | uint64_t isqrt(uint64_t n) 130 | { 131 | if (n < 2) 132 | return n; 133 | 134 | uint64_t x = n; 135 | uint64_t result = 0; 136 | uint64_t bit = (uint64_t)1 << 62; // The second-to-top bit is set 137 | 138 | // "bit" starts at the highest power of four <= the argument. 
139 | while (bit > x) 140 | bit >>= 2; 141 | 142 | while (bit != 0) 143 | { 144 | if (x >= result + bit) 145 | { 146 | x -= result + bit; 147 | result = (result >> 1) + bit; 148 | } 149 | else 150 | { 151 | result >>= 1; 152 | } 153 | bit >>= 2; 154 | } 155 | 156 | return result; 157 | } 158 | 159 | void static inline uint64_to_le_bytes(uint64_t value, uint8_t *bytes) 160 | { 161 | for (int i = 0; i < 8; i++) 162 | { 163 | bytes[i] = value & 0xFF; 164 | value >>= 8; 165 | } 166 | } 167 | 168 | uint64_t static inline le_bytes_to_uint64(const uint8_t *bytes) 169 | { 170 | uint64_t value = 0; 171 | for (int i = 7; i >= 0; i--) 172 | value = (value << 8) | bytes[i]; 173 | return value; 174 | } 175 | 176 | void static inline aes_single_round(uint8_t *block, const uint8_t *key) 177 | { 178 | __m128i block_vec = _mm_loadu_si128((const __m128i *)block); 179 | __m128i key_vec = _mm_loadu_si128((const __m128i *)key); 180 | 181 | // Perform single AES encryption round 182 | block_vec = _mm_aesenc_si128(block_vec, key_vec); 183 | 184 | _mm_storeu_si128((__m128i *)block, block_vec); 185 | } 186 | 187 | void stage3(uint64_t *scratch) 188 | { 189 | uint64_t *mem_buffer_a = scratch; 190 | uint64_t *mem_buffer_b = &scratch[BUFSIZE]; 191 | 192 | uint64_t addr_a = mem_buffer_b[BUFSIZE - 1]; 193 | uint64_t addr_b = mem_buffer_a[BUFSIZE - 1] >> 32; 194 | uint32_t r = 0; 195 | 196 | for (uint32_t i = 0; i < ITERS; i++) 197 | { 198 | uint64_t mem_a = mem_buffer_a[addr_a % BUFSIZE]; 199 | uint64_t mem_b = mem_buffer_b[addr_b % BUFSIZE]; 200 | 201 | uint8_t block[16]; 202 | uint64_to_le_bytes(mem_b, block); 203 | uint64_to_le_bytes(mem_a, block + 8); 204 | aes_single_round(block, KEY); 205 | 206 | uint64_t hash1 = le_bytes_to_uint64(block); 207 | uint64_t hash2 = mem_a ^ mem_b; 208 | uint64_t result = ~(hash1 ^ hash2); 209 | 210 | for (uint32_t j = 0; j < BUFSIZE; j++) 211 | { 212 | uint64_t a = mem_buffer_a[result % BUFSIZE]; 213 | uint64_t b = mem_buffer_b[~ROTR(result, r) % BUFSIZE]; 
214 | uint64_t c = (r < BUFSIZE) ? mem_buffer_a[r] : mem_buffer_b[r - BUFSIZE]; 215 | r = (r < MEMSIZE - 1) ? r + 1 : 0; 216 | 217 | uint64_t v; 218 | __uint128_t t1, t2; 219 | switch (ROTL(result, (uint32_t)c) & 0xf) 220 | { 221 | case 0: 222 | v = ROTL(c, i * j) ^ b; 223 | break; 224 | case 1: 225 | v = ROTR(c, i * j) ^ a; 226 | break; 227 | case 2: 228 | v = a ^ b ^ c; 229 | break; 230 | case 3: 231 | v = ((a + b) * c); 232 | break; 233 | case 4: 234 | v = ((b - c) * a); 235 | break; 236 | case 5: 237 | v = (c - a + b); 238 | break; 239 | case 6: 240 | v = (a - b + c); 241 | break; 242 | case 7: 243 | v = (b * c + a); 244 | break; 245 | case 8: 246 | v = (c * a + b); 247 | break; 248 | case 9: 249 | v = (a * b * c); 250 | break; 251 | case 10: 252 | { 253 | t1 = combine_uint64(a, b); 254 | uint64_t t2 = c | 1; 255 | v = t1 % t2; 256 | } 257 | break; 258 | case 11: 259 | { 260 | t1 = combine_uint64(b, c); 261 | t2 = combine_uint64(ROTL(result, r), a | 2); 262 | v = (t2 > t1) ? c : t1 % t2; 263 | } 264 | break; 265 | case 12: 266 | v = udiv(c, a, b | 4); 267 | break; 268 | case 13: 269 | { 270 | t1 = combine_uint64(ROTL(result, r), b); 271 | t2 = combine_uint64(a, c | 8); 272 | v = (t1 > t2) ? 
t1 / t2 : a ^ b; 273 | } 274 | break; 275 | case 14: 276 | { 277 | t1 = combine_uint64(b, a); 278 | uint64_t t2 = c; 279 | v = (t1 * t2) >> 64; 280 | } 281 | break; 282 | case 15: 283 | { 284 | t1 = combine_uint64(a, c); 285 | t2 = combine_uint64(ROTR(result, r), b); 286 | v = (t1 * t2) >> 64; 287 | } 288 | break; 289 | } 290 | result = ROTL(result ^ v, 1); 291 | 292 | uint64_t t = mem_buffer_a[BUFSIZE - j - 1] ^ result; 293 | mem_buffer_a[BUFSIZE - j - 1] = t; 294 | mem_buffer_b[j] ^= ROTR(t, result); 295 | } 296 | addr_a = result; 297 | addr_b = isqrt(result); 298 | } 299 | } 300 | 301 | int xelis_hash_v2_init() 302 | { 303 | // return sodium_init(); 304 | } 305 | 306 | void xelis_hash_v2(uint8_t in[INPUT_LEN], uint8_t hash[HASH_SIZE], uint64_t scratch[MEMSIZE]) 307 | { 308 | uint8_t *scratch_uint8 = (uint8_t *)scratch; 309 | 310 | stage1(in, INPUT_LEN, scratch_uint8); 311 | stage3(scratch); 312 | blake3(scratch_uint8, OUTPUT_SIZE, hash); 313 | } 314 | 315 | double display_time(const char *stage, struct timespec start, struct timespec end, int iterations) 316 | { 317 | uint64_t total_time = (end.tv_sec - start.tv_sec) * 1000000000ULL + (end.tv_nsec - start.tv_nsec); 318 | double time_per = (double)total_time / iterations; 319 | printf("%s: %.3f ms\n", stage, time_per / 1000000.0); 320 | return time_per; 321 | } 322 | 323 | void timing_test(int N) 324 | { 325 | uint8_t hash[HASH_SIZE]; 326 | struct timespec start, end; 327 | double time_per, time_sum = 0; 328 | 329 | uint8_t *input = (uint8_t *)calloc(INPUT_LEN, sizeof(uint8_t)); 330 | uint64_t *scratch = (uint64_t *)calloc(MEMSIZE, sizeof(uint64_t)); 331 | uint8_t *scratch_uint8 = (uint8_t *)scratch; 332 | 333 | xelis_hash_v2_init(); 334 | 335 | printf("Timing:\n"); 336 | clock_gettime(CLOCK_MONOTONIC, &start); 337 | for (int i = 0; i < N; i++) 338 | stage1(input, INPUT_LEN, scratch_uint8); 339 | clock_gettime(CLOCK_MONOTONIC, &end); 340 | time_sum += display_time("stage1", start, end, N); 341 | 342 | 
clock_gettime(CLOCK_MONOTONIC, &start); 343 | for (int i = 0; i < N; i++) 344 | stage3(scratch); 345 | clock_gettime(CLOCK_MONOTONIC, &end); 346 | time_sum += display_time("stage3", start, end, N); 347 | 348 | clock_gettime(CLOCK_MONOTONIC, &start); 349 | for (int i = 0; i < N; i++) 350 | blake3(scratch_uint8, OUTPUT_SIZE, hash); 351 | clock_gettime(CLOCK_MONOTONIC, &end); 352 | time_sum += display_time("stage4", start, end, N); 353 | 354 | printf("Total: %.3f ms (%d avg)\n", time_sum / 1000000.0, N); 355 | 356 | // verify output 357 | uint8_t gold[HASH_SIZE] = { 358 | 126, 219, 112, 240, 116, 133, 115, 359 | 144, 39, 40, 164, 105, 30, 158, 45, 360 | 126, 64, 67, 238, 52, 200, 35, 161, 19, 361 | 144, 211, 214, 225, 95, 190, 146, 27}; 362 | 363 | xelis_hash_v2(input, hash, scratch); 364 | if (memcmp(gold, hash, HASH_SIZE)) 365 | printf("Failed!\n"); 366 | else 367 | printf("Passed!\n"); 368 | 369 | free(input); 370 | free(scratch); 371 | } 372 | 373 | typedef struct 374 | { 375 | int thread_id; 376 | int iterations; 377 | uint8_t *input; 378 | uint64_t *scratch; 379 | uint8_t *hash; 380 | } thread_data_t; 381 | 382 | void set_thread_affinity(int thread_id) 383 | { 384 | cpu_set_t cpuset; 385 | CPU_ZERO(&cpuset); 386 | CPU_SET(thread_id % sysconf(_SC_NPROCESSORS_ONLN), &cpuset); 387 | int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); 388 | if (rc != 0) 389 | { 390 | fprintf(stderr, "Error: Unable to set CPU affinity for thread %d\n", thread_id); 391 | } 392 | } 393 | 394 | void *hash_thread(void *arg) 395 | { 396 | thread_data_t *data = (thread_data_t *)arg; 397 | // set_thread_affinity(data->thread_id); 398 | 399 | for (int i = 0; i < data->iterations; ++i) 400 | xelis_hash_v2(data->input, data->hash, data->scratch); 401 | 402 | pthread_exit(NULL); 403 | } 404 | 405 | void hash_test(int t, int i) 406 | { 407 | pthread_t *threads; 408 | thread_data_t *thread_data; 409 | 410 | xelis_hash_v2_init(); 411 | 412 | printf("\n%-10s %-15s %-10s\n", 
"Threads", "Hashes", "Hash/s"); 413 | for (int tc = 1; tc <= t; ++tc) 414 | { 415 | threads = (pthread_t *)malloc(tc * sizeof(pthread_t)); 416 | thread_data = (thread_data_t *)malloc(tc * sizeof(thread_data_t)); 417 | struct timespec start, end; 418 | 419 | clock_gettime(CLOCK_REALTIME, &start); 420 | for (int j = 0; j < tc; ++j) 421 | { 422 | thread_data[j].thread_id = j; 423 | thread_data[j].iterations = i; 424 | thread_data[j].input = (uint8_t *)calloc(INPUT_LEN, sizeof(uint8_t)); 425 | thread_data[j].scratch = (uint64_t *)calloc(MEMSIZE, sizeof(uint64_t)); 426 | thread_data[j].hash = (uint8_t *)calloc(HASH_SIZE, sizeof(uint8_t)); 427 | pthread_create(&threads[j], NULL, hash_thread, (void *)&thread_data[j]); 428 | } 429 | 430 | for (int j = 0; j < tc; ++j) 431 | { 432 | pthread_join(threads[j], NULL); 433 | free(thread_data[j].input); 434 | free(thread_data[j].scratch); 435 | free(thread_data[j].hash); 436 | } 437 | 438 | clock_gettime(CLOCK_REALTIME, &end); 439 | 440 | double time_taken = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9; 441 | double hashes_per_second = (double)(tc * i) / time_taken; 442 | printf("%-10d %-15d %-10.2f\n", tc, i * tc, hashes_per_second); 443 | 444 | free(threads); 445 | free(thread_data); 446 | } 447 | } 448 | 449 | void print_usage(const char *prog_name) 450 | { 451 | printf("Usage: %s [-n iterations] [-t threads]\n", prog_name); 452 | printf(" -n iterations Number of iterations for tests\n"); 453 | printf(" -t threads Number of threads to test\n"); 454 | printf(" -h Show this help message\n"); 455 | } 456 | 457 | int main(int argc, char *argv[]) 458 | { 459 | int N = 1000, T = 8; 460 | int opt; 461 | 462 | while ((opt = getopt(argc, argv, "n:t:h")) != -1) 463 | { 464 | switch (opt) 465 | { 466 | case 'n': 467 | N = atoi(optarg); 468 | break; 469 | case 't': 470 | T = atoi(optarg); 471 | break; 472 | case 'h': 473 | print_usage(argv[0]); 474 | return 0; 475 | default: 476 | print_usage(argv[0]); 477 | return 1; 
478 | } 479 | } 480 | 481 | timing_test(N); 482 | hash_test(T, N); 483 | } 484 | -------------------------------------------------------------------------------- /C/xelis_hash_v3.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "BLAKE3/c/blake3.h" 12 | #include "ChaCha20-SIMD/chacha20.h" 13 | #include 14 | 15 | #define INPUT_LEN (112) 16 | #define MEMSIZE (531 * 128) 17 | #define ITERS (2) 18 | #define HASH_SIZE (32) 19 | 20 | static inline void blake3(const uint8_t *input, int len, uint8_t *output) { 21 | blake3_hasher hasher; 22 | blake3_hasher_init(&hasher); 23 | blake3_hasher_update(&hasher, input, len); 24 | blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); 25 | } 26 | 27 | #define CHUNK_SIZE (32) 28 | #define NONCE_SIZE (12) 29 | #define OUTPUT_SIZE (MEMSIZE * 8) 30 | #define CHUNKS (4) 31 | #define INPUT_LEN (112) 32 | 33 | void stage1(const uint8_t *input, size_t input_len, uint8_t scratch_pad[OUTPUT_SIZE]) { 34 | uint8_t key[CHUNK_SIZE * CHUNKS] = {0}; 35 | uint8_t input_hash[HASH_SIZE]; 36 | uint8_t buffer[CHUNK_SIZE * 2]; 37 | memcpy(key, input, INPUT_LEN); 38 | blake3(input, INPUT_LEN, buffer); 39 | 40 | uint8_t *t = scratch_pad; 41 | 42 | memcpy(buffer + CHUNK_SIZE, key + 0 * CHUNK_SIZE, CHUNK_SIZE); 43 | blake3(buffer, CHUNK_SIZE * 2, input_hash); 44 | chacha_encrypt(input_hash, buffer, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 45 | 46 | t += OUTPUT_SIZE / CHUNKS; 47 | memcpy(buffer, input_hash, CHUNK_SIZE); 48 | memcpy(buffer + CHUNK_SIZE, key + 1 * CHUNK_SIZE, CHUNK_SIZE); 49 | blake3(buffer, CHUNK_SIZE * 2, input_hash); 50 | chacha_encrypt(input_hash, t - NONCE_SIZE, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 51 | 52 | t += OUTPUT_SIZE / CHUNKS; 53 | memcpy(buffer, input_hash, CHUNK_SIZE); 54 | memcpy(buffer + CHUNK_SIZE, key + 2 * CHUNK_SIZE, CHUNK_SIZE); 55 | 
blake3(buffer, CHUNK_SIZE * 2, input_hash); 56 | chacha_encrypt(input_hash, t - NONCE_SIZE, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 57 | 58 | t += OUTPUT_SIZE / CHUNKS; 59 | memcpy(buffer, input_hash, CHUNK_SIZE); 60 | memcpy(buffer + CHUNK_SIZE, key + 3 * CHUNK_SIZE, CHUNK_SIZE); 61 | blake3(buffer, CHUNK_SIZE * 2, input_hash); 62 | chacha_encrypt(input_hash, t - NONCE_SIZE, NULL, t, OUTPUT_SIZE / CHUNKS, 8); 63 | } 64 | 65 | #define KEY "xelishash-pow-v3" 66 | #define BUFSIZE (MEMSIZE / 2) 67 | 68 | // https://danlark.org/2020/06/14/128-bit-division 69 | static inline uint64_t Divide128Div64To64(uint64_t high, uint64_t low, uint64_t divisor, uint64_t *remainder) { 70 | uint64_t result; 71 | __asm__("divq %[v]" 72 | : "=a"(result), "=d"(*remainder) // Output parametrs, =a for rax, =d for rdx, [v] is an 73 | // alias for divisor, input paramters "a" and "d" for low and high. 74 | : [v] "r"(divisor), "a"(low), "d"(high)); 75 | return result; 76 | } 77 | 78 | static inline uint64_t udiv(uint64_t high, uint64_t low, uint64_t divisor) { 79 | uint64_t remainder; 80 | 81 | if (high < divisor) { 82 | return Divide128Div64To64(high, low, divisor, &remainder); 83 | } 84 | else { 85 | (void)Divide128Div64To64(0, high, divisor, &high); 86 | return Divide128Div64To64(high, low, divisor, &remainder); 87 | } 88 | } 89 | 90 | static inline uint64_t ROTR(uint64_t x, uint32_t r) { 91 | asm("rorq %%cl, %0" : "+r"(x) : "c"(r)); 92 | return x; 93 | } 94 | 95 | static inline uint64_t ROTL(uint64_t x, uint32_t r) { 96 | asm("rolq %%cl, %0" : "+r"(x) : "c"(r)); 97 | return x; 98 | } 99 | 100 | static inline __uint128_t combine_uint64(uint64_t high, uint64_t low) { 101 | return ((__uint128_t)high << 64) | low; 102 | } 103 | 104 | static inline uint64_t murmurhash3(uint64_t seed) { 105 | seed ^= seed >> 55; 106 | seed *= 0xff51afd7ed558ccdULL; 107 | seed ^= seed >> 32; 108 | seed *= 0xc4ceb9fe1a85ec53ULL; 109 | seed ^= seed >> 15; 110 | return seed; 111 | } 112 | 113 | static inline uint64_t 
map_index(uint64_t x) { 114 | x ^= x >> 33; 115 | x *= 0xff51afd7ed558ccdULL; 116 | return (uint64_t)(((__uint128_t)x * BUFSIZE) >> 64); 117 | } 118 | 119 | static inline int pick_half(uint64_t seed) { 120 | return (murmurhash3(seed) & (1ULL << 58)) != 0; 121 | } 122 | 123 | uint64_t isqrt(uint64_t n) { 124 | if (n < 2) 125 | return n; 126 | 127 | // Compute the floating-point square root 128 | uint64_t approx = (uint64_t)sqrt((double)n); 129 | 130 | // Verify and adjust if necessary 131 | if (approx * approx > n) { 132 | return approx - 1; 133 | } else if ((approx + 1) * (approx + 1) <= n) { 134 | return approx + 1; 135 | } else { 136 | return approx; 137 | } 138 | } 139 | 140 | uint64_t modular_power(uint64_t base, uint64_t exp, uint64_t mod) { 141 | uint64_t result = 1; 142 | base %= mod; // Ensure base is within the range of mod 143 | 144 | while (exp > 0) { 145 | // If exp is odd, multiply base with result 146 | if (exp & 1) { 147 | result = (uint64_t)(((__uint128_t)result * base) % mod); 148 | } 149 | 150 | // Square the base and reduce by mod 151 | base = (uint64_t)(((__uint128_t)base * base) % mod); 152 | exp /= 2; // Halve the exponent 153 | } 154 | 155 | return result; 156 | } 157 | 158 | void static inline uint64_to_le_bytes(uint64_t value, uint8_t *bytes) { 159 | for (int i = 0; i < 8; i++) { 160 | bytes[i] = value & 0xFF; 161 | value >>= 8; 162 | } 163 | } 164 | 165 | uint64_t static inline le_bytes_to_uint64(const uint8_t *bytes) { 166 | uint64_t value = 0; 167 | for (int i = 7; i >= 0; i--) 168 | value = (value << 8) | bytes[i]; 169 | return value; 170 | } 171 | 172 | void static inline aes_single_round(uint8_t *block, const uint8_t *key) { 173 | __m128i block_vec = _mm_loadu_si128((const __m128i *)block); 174 | __m128i key_vec = _mm_loadu_si128((const __m128i *)key); 175 | 176 | // Perform single AES encryption round 177 | block_vec = _mm_aesenc_si128(block_vec, key_vec); 178 | 179 | _mm_storeu_si128((__m128i *)block, block_vec); 180 | } 181 | 182 | 
void stage3(uint64_t *scratch) { 183 | uint64_t *mem_buffer_a = scratch; 184 | uint64_t *mem_buffer_b = &scratch[BUFSIZE]; 185 | 186 | uint64_t addr_a = mem_buffer_b[BUFSIZE - 1]; 187 | uint64_t addr_b = mem_buffer_a[BUFSIZE - 1] >> 32; 188 | uint32_t r = 0; 189 | 190 | for (uint32_t i = 0; i < ITERS; i++) { 191 | uint64_t mem_a = mem_buffer_a[map_index(addr_a)]; 192 | uint64_t mem_b = mem_buffer_b[map_index(mem_a ^ addr_b)]; 193 | 194 | uint8_t block[16]; 195 | uint64_to_le_bytes(mem_b, block); 196 | uint64_to_le_bytes(mem_a, block + 8); 197 | aes_single_round(block, KEY); 198 | 199 | uint64_t hash1 = le_bytes_to_uint64(block); 200 | uint64_t hash2 = le_bytes_to_uint64(block + 8); 201 | uint64_t result = ~(hash1 ^ hash2); 202 | 203 | for (uint32_t j = 0; j < BUFSIZE; j++) { 204 | uint64_t a = mem_buffer_a[map_index(result)]; 205 | uint64_t b = mem_buffer_b[map_index(a ^ ~ROTR(result, r))]; 206 | uint64_t c = (r < BUFSIZE) ? mem_buffer_a[r] : mem_buffer_b[r - BUFSIZE]; 207 | r = (r < MEMSIZE - 1) ? 
r + 1 : 0; 208 | 209 | uint64_t v; 210 | __uint128_t t1, t2; 211 | switch (ROTL(result, (uint32_t)c) & 0xf) { 212 | case 0: 213 | t1 = combine_uint64(a + i, isqrt(b + j)); 214 | uint64_t denom = murmurhash3(c ^ result ^ i ^ j) | 1; 215 | v = (uint64_t)(t1 % denom); 216 | break; 217 | case 1: 218 | v = ROTL((c + i) % isqrt(b | 2), i + j) * isqrt(a + j); 219 | break; 220 | case 2: 221 | v = (isqrt(a + i) * isqrt(c + j)) ^ (b + i + j); 222 | break; 223 | case 3: 224 | v = ((a + b) * c); 225 | break; 226 | case 4: 227 | v = ((b - c) * a); 228 | break; 229 | case 5: 230 | v = (c - a + b); 231 | break; 232 | case 6: 233 | v = (a - b + c); 234 | break; 235 | case 7: 236 | v = (b * c + a); 237 | break; 238 | case 8: 239 | v = (c * a + b); 240 | break; 241 | case 9: 242 | v = (a * b * c); 243 | break; 244 | case 10: 245 | t1 = combine_uint64(a, b); 246 | v = t1 % (c | 1); 247 | break; 248 | case 11: 249 | t1 = combine_uint64(b, c); 250 | t2 = combine_uint64(ROTL(result, r), a | 2); 251 | v = (t2 > t1) ? c : t1 % t2; 252 | break; 253 | case 12: 254 | v = udiv(c, a, b | 4); 255 | break; 256 | case 13: 257 | t1 = combine_uint64(ROTL(result, r), b); 258 | t2 = combine_uint64(a, c | 8); 259 | v = (t1 > t2) ? t1 / t2 : a ^ b; 260 | break; 261 | case 14: 262 | t1 = combine_uint64(b, a); 263 | v = (t1 * c) >> 64; 264 | break; 265 | case 15: 266 | t1 = combine_uint64(a, c); 267 | t2 = combine_uint64(ROTR(result, r), b); 268 | v = (t1 * t2) >> 64; 269 | break; 270 | } 271 | uint64_t idx_seed = v ^ result; 272 | result = ROTL(idx_seed, r); 273 | 274 | uint64_t use_buffer_b = pick_half(v); 275 | uint64_t idx_t = map_index(idx_seed); 276 | uint64_t t = (use_buffer_b ? 
mem_buffer_b[idx_t] : mem_buffer_a[idx_t]) ^ result; 277 | 278 | uint64_t idx_a = map_index(t ^ result ^ 0x9e3779b97f4a7c15); 279 | uint64_t idx_b = map_index(idx_a ^ ~result ^ 0xd2b74407b1ce6e93); 280 | 281 | uint64_t mem_a = mem_buffer_a[idx_a]; 282 | mem_buffer_a[idx_a] = t; 283 | mem_buffer_b[idx_b] ^= mem_a ^ ROTR(t, i + j); 284 | } 285 | 286 | addr_a = modular_power(addr_a, addr_b, result); 287 | addr_b = isqrt(result) * (r + 1) * isqrt(addr_a); 288 | } 289 | } 290 | 291 | int xelis_hash_v3_init() { 292 | // return sodium_init(); 293 | } 294 | 295 | void xelis_hash_v3(uint8_t in[INPUT_LEN], uint8_t hash[HASH_SIZE], uint64_t scratch[MEMSIZE]) { 296 | uint8_t *scratch_uint8 = (uint8_t *)scratch; 297 | 298 | stage1(in, INPUT_LEN, scratch_uint8); 299 | stage3(scratch); 300 | blake3(scratch_uint8, OUTPUT_SIZE, hash); 301 | } 302 | 303 | double display_time(const char *stage, struct timespec start, struct timespec end, int iterations) { 304 | uint64_t total_time = (end.tv_sec - start.tv_sec) * 1000000000ULL + (end.tv_nsec - start.tv_nsec); 305 | double time_per = (double)total_time / iterations; 306 | printf("%s: %.3f ms\n", stage, time_per / 1000000.0); 307 | return time_per; 308 | } 309 | 310 | void timing_test(int N) { 311 | uint8_t hash[HASH_SIZE]; 312 | struct timespec start, end; 313 | double time_per, time_sum = 0; 314 | 315 | uint8_t *input = (uint8_t *)calloc(INPUT_LEN, sizeof(uint8_t)); 316 | uint64_t *scratch = (uint64_t *)calloc(MEMSIZE, sizeof(uint64_t)); 317 | uint8_t *scratch_uint8 = (uint8_t *)scratch; 318 | 319 | xelis_hash_v3_init(); 320 | 321 | printf("Timing:\n"); 322 | clock_gettime(CLOCK_MONOTONIC, &start); 323 | for (int i = 0; i < N; i++) 324 | stage1(input, INPUT_LEN, scratch_uint8); 325 | clock_gettime(CLOCK_MONOTONIC, &end); 326 | time_sum += display_time("stage1", start, end, N); 327 | 328 | clock_gettime(CLOCK_MONOTONIC, &start); 329 | for (int i = 0; i < N; i++) 330 | stage3(scratch); 331 | clock_gettime(CLOCK_MONOTONIC, &end); 332 | 
time_sum += display_time("stage3", start, end, N); 333 | 334 | clock_gettime(CLOCK_MONOTONIC, &start); 335 | for (int i = 0; i < N; i++) 336 | blake3(scratch_uint8, OUTPUT_SIZE, hash); 337 | clock_gettime(CLOCK_MONOTONIC, &end); 338 | time_sum += display_time("stage4", start, end, N); 339 | 340 | printf("Total: %.3f ms (%d avg)\n", time_sum / 1000000.0, N); 341 | 342 | // verify output 343 | uint8_t gold[HASH_SIZE] = { 344 | 105, 172, 103, 40, 94, 253, 92, 162, 345 | 42, 252, 5, 196, 236, 238, 91, 218, 346 | 22, 157, 228, 233, 239, 8, 250, 57, 347 | 212, 166, 121, 132, 148, 205, 103, 163 348 | }; 349 | 350 | xelis_hash_v3(input, hash, scratch); 351 | if (memcmp(gold, hash, HASH_SIZE)) { 352 | printf("Failed!\n"); 353 | printf("Expected: "); 354 | for (int i = 0; i < HASH_SIZE; i++) { 355 | printf("%u", gold[i]); 356 | if (i != HASH_SIZE - 1) { 357 | printf(", "); 358 | } 359 | } 360 | printf("\nGot: "); 361 | for (int i = 0; i < HASH_SIZE; i++) { 362 | printf("%u", hash[i]); 363 | if (i != HASH_SIZE - 1) { 364 | printf(", "); 365 | } 366 | } 367 | printf("\n"); 368 | } 369 | else { 370 | printf("Passed!\n"); 371 | } 372 | 373 | free(input); 374 | free(scratch); 375 | } 376 | 377 | typedef struct { 378 | int thread_id; 379 | int iterations; 380 | uint8_t *input; 381 | uint64_t *scratch; 382 | uint8_t *hash; 383 | } thread_data_t; 384 | 385 | void set_thread_affinity(int thread_id) { 386 | cpu_set_t cpuset; 387 | CPU_ZERO(&cpuset); 388 | CPU_SET(thread_id % sysconf(_SC_NPROCESSORS_ONLN), &cpuset); 389 | int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); 390 | if (rc != 0) { 391 | fprintf(stderr, "Error: Unable to set CPU affinity for thread %d\n", thread_id); 392 | } 393 | } 394 | 395 | void *hash_thread(void *arg) { 396 | thread_data_t *data = (thread_data_t *)arg; 397 | // set_thread_affinity(data->thread_id); 398 | 399 | for (int i = 0; i < data->iterations; ++i) 400 | xelis_hash_v3(data->input, data->hash, data->scratch); 401 | 402 | 
pthread_exit(NULL); 403 | } 404 | 405 | void hash_test(int t, int i) { 406 | pthread_t *threads; 407 | thread_data_t *thread_data; 408 | 409 | xelis_hash_v3_init(); 410 | 411 | printf("\n%-10s %-15s %-10s\n", "Threads", "Hashes", "Hash/s"); 412 | for (int tc = 1; tc <= t; ++tc) { 413 | threads = (pthread_t *)malloc(tc * sizeof(pthread_t)); 414 | thread_data = (thread_data_t *)malloc(tc * sizeof(thread_data_t)); 415 | struct timespec start, end; 416 | 417 | clock_gettime(CLOCK_REALTIME, &start); 418 | for (int j = 0; j < tc; ++j) { 419 | thread_data[j].thread_id = j; 420 | thread_data[j].iterations = i; 421 | thread_data[j].input = (uint8_t *)calloc(INPUT_LEN, sizeof(uint8_t)); 422 | thread_data[j].scratch = (uint64_t *)calloc(MEMSIZE, sizeof(uint64_t)); 423 | thread_data[j].hash = (uint8_t *)calloc(HASH_SIZE, sizeof(uint8_t)); 424 | pthread_create(&threads[j], NULL, hash_thread, (void *)&thread_data[j]); 425 | } 426 | 427 | for (int j = 0; j < tc; ++j) { 428 | pthread_join(threads[j], NULL); 429 | free(thread_data[j].input); 430 | free(thread_data[j].scratch); 431 | free(thread_data[j].hash); 432 | } 433 | 434 | clock_gettime(CLOCK_REALTIME, &end); 435 | 436 | double time_taken = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9; 437 | double hashes_per_second = (double)(tc * i) / time_taken; 438 | printf("%-10d %-15d %-10.2f\n", tc, i * tc, hashes_per_second); 439 | 440 | free(threads); 441 | free(thread_data); 442 | } 443 | } 444 | 445 | void print_usage(const char *prog_name) { 446 | printf("Usage: %s [-n iterations] [-t threads]\n", prog_name); 447 | printf(" -n iterations Number of iterations for tests\n"); 448 | printf(" -t threads Number of threads to test\n"); 449 | printf(" -h Show this help message\n"); 450 | } 451 | 452 | int main(int argc, char *argv[]) { 453 | int N = 1000, T = 8; 454 | int opt; 455 | 456 | while ((opt = getopt(argc, argv, "n:t:h")) != -1) { 457 | switch (opt) { 458 | case 'n': 459 | N = atoi(optarg); 460 | break; 461 | 
case 't': 462 | T = atoi(optarg); 463 | break; 464 | case 'h': 465 | print_usage(argv[0]); 466 | return 0; 467 | default: 468 | print_usage(argv[0]); 469 | return 1; 470 | } 471 | } 472 | 473 | timing_test(N); 474 | if (T) 475 | hash_test(T, N); 476 | } -------------------------------------------------------------------------------- /src/v3.rs: -------------------------------------------------------------------------------- 1 | use aes::cipher::generic_array::GenericArray; 2 | use crate::{v2, Error, Hash, scratchpad::ScratchPad as ScratchPadInternal}; 3 | 4 | #[cfg(feature = "tracker")] 5 | use crate::tracker::*; 6 | 7 | // These are tweakable parameters 8 | // Memory size is the size of the scratch pad in u64s 9 | // In bytes, this is equal to ~ 544 kB 10 | const MEMORY_SIZE: usize = 531 * 128; 11 | const MEMORY_SIZE_BYTES: usize = MEMORY_SIZE * 8; 12 | const SCRATCHPAD_ITERS: usize = 2; 13 | const BUFFER_SIZE: usize = MEMORY_SIZE / 2; 14 | 15 | // Stage 3 AES key 16 | const KEY: [u8; 16] = *b"xelishash-pow-v3"; 17 | 18 | pub type ScratchPad = ScratchPadInternal; 19 | 20 | #[inline] 21 | const fn murmurhash3(mut seed: u64) -> u64 { 22 | /* MurmurHash3 finalizer. 23 | * Avalanches the input seed to produce a uniformly distributed output. 24 | */ 25 | seed ^= seed >> 55; 26 | seed = seed.wrapping_mul(0xff51afd7ed558ccd); 27 | seed ^= seed >> 32; 28 | seed = seed.wrapping_mul(0xc4ceb9fe1a85ec53); 29 | seed ^= seed >> 15; 30 | 31 | seed 32 | } 33 | 34 | #[inline(always)] 35 | pub fn map_index(mut x: u64) -> usize { 36 | /* MurmurHash3-like finalizer + multiply-high reduction. 37 | * The finalizer avalanches the input seed; the mulhi step maps 38 | * uniformly into [0, BUFSIZE) with minimal modulo bias. 
39 | */ 40 | x ^= x >> 33; 41 | x = x.wrapping_mul(0xff51afd7ed558ccd); 42 | 43 | ((x as u128) * (BUFFER_SIZE as u128) >> 64) as usize 44 | } 45 | 46 | #[inline(always)] 47 | pub fn pick_half(seed: u64) -> bool { 48 | // // Murmur3 finalizer to get a uniform selector bit 49 | (murmurhash3(seed) & (1u64 << 58)) != 0 50 | } 51 | 52 | #[inline(always)] 53 | pub fn isqrt(n: u64) -> u64 { 54 | if n < 2 { 55 | return n; 56 | } 57 | 58 | // Compute floating-point square root as an approximation 59 | let approx = (n as f64).sqrt() as u64; 60 | 61 | // Verify and adjust if necessary 62 | if approx * approx > n { 63 | approx - 1 64 | } else if (approx + 1) * (approx + 1) <= n { 65 | approx + 1 66 | } else { 67 | approx 68 | } 69 | } 70 | 71 | const fn modular_power(mut base: u64, mut exp: u64, mod_: u64) -> u64 { 72 | let mut result: u64 = 1; 73 | // Ensure base is within the range of mod 74 | base %= mod_; 75 | 76 | while exp > 0 { 77 | // If exp is odd, multiply base with result 78 | if exp & 1 == 1 { 79 | result = ((result as u128 * base as u128) % mod_ as u128) as u64; 80 | } 81 | 82 | // Square the base and reduce by mod 83 | base = ((base as u128 * base as u128) % mod_ as u128) as u64; 84 | exp /= 2; 85 | } 86 | 87 | result 88 | } 89 | 90 | pub(crate) fn stage_3(scratch_pad: &mut [u64; MEMORY_SIZE], #[cfg(feature = "tracker")] tracker: &mut OpsTracker) -> Result<(), Error> { 91 | let key = GenericArray::from(KEY); 92 | let mut block = GenericArray::from([0u8; 16]); 93 | 94 | // Create two new slices for each half 95 | let (mem_buffer_a, mem_buffer_b) = scratch_pad.as_mut_slice().split_at_mut(BUFFER_SIZE); 96 | 97 | let mut addr_a = mem_buffer_b[BUFFER_SIZE-1]; 98 | let mut addr_b = mem_buffer_a[BUFFER_SIZE-1] >> 32; 99 | 100 | #[cfg(feature = "tracker")] 101 | { 102 | tracker.add_mem_op(BUFFER_SIZE-1, MemOp::Read); 103 | tracker.add_mem_op(MEMORY_SIZE-1, MemOp::Read); 104 | } 105 | 106 | let mut r: usize = 0; 107 | 108 | for i in 0..SCRATCHPAD_ITERS { 109 | let index_a 
= map_index(addr_a); 110 | let mem_a = mem_buffer_a[index_a]; 111 | 112 | let index_b = map_index(mem_a ^ addr_b); 113 | let mem_b = mem_buffer_b[index_b]; 114 | 115 | #[cfg(feature = "tracker")] 116 | { 117 | tracker.add_mem_op(index_a, MemOp::Read); 118 | tracker.add_mem_op(BUFFER_SIZE + index_b, MemOp::Read); 119 | } 120 | 121 | block[..8].copy_from_slice(&mem_b.to_le_bytes()); 122 | block[8..].copy_from_slice(&mem_a.to_le_bytes()); 123 | 124 | aes::hazmat::cipher_round(&mut block, &key); 125 | 126 | let hash1 = u64::from_le_bytes(block[..8] 127 | .try_into() 128 | .map_err(|_| Error::FormatError)?); 129 | 130 | let hash2 = u64::from_le_bytes(block[8..] 131 | .try_into() 132 | .map_err(|_| Error::FormatError)?); 133 | 134 | let mut result = !(hash1 ^ hash2); 135 | 136 | for j in 0..BUFFER_SIZE { 137 | let index_a = map_index(result); 138 | let a = mem_buffer_a[index_a]; 139 | 140 | let index_b = map_index(a ^ !result.rotate_right(r as u32)); 141 | let b = mem_buffer_b[index_b]; 142 | 143 | #[cfg(feature = "tracker")] 144 | { 145 | tracker.add_mem_op(index_a, MemOp::Read); 146 | tracker.add_mem_op(BUFFER_SIZE + index_b, MemOp::Read); 147 | 148 | // This is the same index in scratchpad 149 | tracker.add_mem_op(r, MemOp::Read); 150 | } 151 | 152 | let c = if r < BUFFER_SIZE { 153 | mem_buffer_a[r] 154 | } else { 155 | mem_buffer_b[r-BUFFER_SIZE] 156 | }; 157 | r = if r < MEMORY_SIZE - 1 { 158 | r + 1 159 | } else { 160 | 0 161 | }; 162 | 163 | let branch_idx = (result.rotate_left(c as u32) & 0xf) as u8; 164 | #[cfg(feature = "tracker")] 165 | { 166 | tracker.add_branch(branch_idx); 167 | } 168 | 169 | let v = match branch_idx { 170 | // combine_u64((a + i), isqrt(b + j)) % (murmurhash3(c ^ result ^ i ^ j) | 1) 171 | 0 => { 172 | let t1 = v2::combine_u64( 173 | a.wrapping_add(i as u64), 174 | isqrt(b.wrapping_add(j as u64)), 175 | ); 176 | let denom = murmurhash3(c ^ result ^ i as u64 ^ j as u64) | 1; 177 | (t1 % (denom as u128)) as u64 178 | } 179 | // ROTL((c + i) 
% isqrt(b | 2), i + j) * isqrt(a + j) 180 | 1 => { 181 | let t1 = c.wrapping_add(i as u64).wrapping_rem(isqrt(b | 2)); 182 | let t2 = t1.rotate_left((i.wrapping_add(j)) as u32); 183 | let t3 = isqrt(a.wrapping_add(j as u64)); 184 | t2.wrapping_mul(t3) 185 | } 186 | // (isqrt(a + i) * isqrt(c + j)) ^ (b + i + j) 187 | 2 => { 188 | let t1 = isqrt(a.wrapping_add(i as u64)); 189 | let t2 = isqrt(c.wrapping_add(j as u64)); 190 | let t3 = t1.wrapping_mul(t2); 191 | t3 ^ b.wrapping_add(i as u64).wrapping_add(j as u64) 192 | } 193 | // (a + b) * c 194 | 3 => a.wrapping_add(b).wrapping_mul(c), 195 | // (b - c) * a 196 | 4 => b.wrapping_sub(c).wrapping_mul(a), 197 | // c - a + b 198 | 5 => c.wrapping_sub(a).wrapping_add(b), 199 | // a - b + c 200 | 6 => a.wrapping_sub(b).wrapping_add(c), 201 | // b * c + a 202 | 7 => b.wrapping_mul(c).wrapping_add(a), 203 | // c * a + b 204 | 8 => c.wrapping_mul(a).wrapping_add(b), 205 | // a * b * c 206 | 9 => a.wrapping_mul(b).wrapping_mul(c), 207 | 10 => { 208 | let t1 = v2::combine_u64(a, b); 209 | let t2 = (c | 1) as u128; 210 | t1.wrapping_rem(t2) as u64 211 | }, 212 | 11 => { 213 | let t1 = v2::combine_u64(b, c); 214 | let t2 = v2::combine_u64(result.rotate_left(r as u32), a | 2); 215 | if t2 > t1 { c } else { t1.wrapping_rem(t2) as u64 } 216 | }, 217 | 12 => { 218 | let t1 = v2::combine_u64(c, a); 219 | let t2 = (b | 4) as u128; 220 | t1.wrapping_div(t2) as u64 221 | }, 222 | 13 => { 223 | let t1 = v2::combine_u64(result.rotate_left(r as u32), b); 224 | let t2 = v2::combine_u64(a, c | 8); 225 | if t1 > t2 {t1.wrapping_div(t2) as u64} else {a^b} 226 | }, 227 | 14 => { 228 | let t1 = v2::combine_u64(b, a); 229 | let t2 = c as u128; 230 | (t1.wrapping_mul(t2) >> 64) as u64 231 | }, 232 | 15 => { 233 | let t1 = v2::combine_u64(a, c); 234 | let t2 = v2::combine_u64(result.rotate_right(r as u32), b); 235 | (t1.wrapping_mul(t2) >> 64) as u64 236 | }, 237 | _ => unreachable!(), 238 | }; 239 | 240 | let seed = v ^ result; 241 | result = 
seed.rotate_left(r as u32); 242 | 243 | let use_buffer_b = pick_half(v); 244 | let index_t = map_index(seed); 245 | let t = if use_buffer_b { mem_buffer_b[index_t] } else { mem_buffer_a[index_t] } ^ result; 246 | 247 | let index_a = map_index(t ^ result ^ 0x9e3779b97f4a7c15); 248 | let index_b = map_index(index_a as u64 ^ !result ^ 0xd2b74407b1ce6e93); 249 | 250 | let a = std::mem::replace(&mut mem_buffer_a[index_a], t); 251 | mem_buffer_b[index_b] ^= a ^ t.rotate_right(i.wrapping_add(j) as u32); 252 | 253 | #[cfg(feature = "tracker")] 254 | { 255 | if use_buffer_b { 256 | tracker.add_mem_op(BUFFER_SIZE + index_t, MemOp::Read); 257 | } else { 258 | tracker.add_mem_op(index_t, MemOp::Read); 259 | } 260 | 261 | // mem_buffer_a[index_a] and mem_buffer_b[index_b] are written 262 | tracker.add_mem_op(index_a, MemOp::Read); 263 | tracker.add_mem_op(index_a, MemOp::Write); 264 | 265 | tracker.add_mem_op(BUFFER_SIZE + index_b, MemOp::Read); 266 | tracker.add_mem_op(BUFFER_SIZE + index_b, MemOp::Write); 267 | } 268 | } 269 | 270 | addr_a = modular_power(addr_a, addr_b, result); 271 | addr_b = isqrt(result).wrapping_mul((r as u64).wrapping_add(1)).wrapping_mul(isqrt(addr_a)); 272 | } 273 | 274 | Ok(()) 275 | } 276 | 277 | pub fn xelis_hash(input: &[u8], scratch_pad: &mut ScratchPad, #[cfg(feature = "tracker")] distribution: &mut OpsTracker) -> Result { 278 | v2::stage_1::(input, scratch_pad)?; 279 | 280 | let scratch_pad = scratch_pad.as_mut_slice(); 281 | 282 | // stage 3 is customized compared to v2 283 | stage_3(scratch_pad, #[cfg(feature = "tracker")] distribution)?; 284 | 285 | // final stage 4 286 | v2::stage_4(scratch_pad) 287 | } 288 | 289 | 290 | #[cfg(test)] 291 | mod tests { 292 | use rand::{RngCore, rngs::OsRng}; 293 | use super::*; 294 | 295 | #[test] 296 | fn test_reused_scratchpad() { 297 | let mut scratch_pad = ScratchPad::default(); 298 | let mut input = [0u8; 112]; 299 | OsRng.fill_bytes(&mut input); 300 | 301 | // Do a first hash 302 | let expected_hash = 
xelis_hash(&input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap();

        // Do a second hash with dirty scratch pad but same input
        // (stage 1 overwrites the scratchpad, so reuse must not change the digest)
        let hash = xelis_hash(&input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap();
        assert_eq!(hash, expected_hash);
    }

    // Known-answer (consensus) vector: all-zero 112-byte input must always
    // produce exactly this digest.
    #[test]
    fn test_zero_hash() {
        let mut scratch_pad = ScratchPad::default();
        let mut input = [0u8; 112];

        let hash = xelis_hash(&mut input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap();
        let expected_hash = [
            105, 172, 103, 40, 94, 253, 92, 162,
            42, 252, 5, 196, 236, 238, 91, 218,
            22, 157, 228, 233, 239, 8, 250, 57,
            212, 166, 121, 132, 148, 205, 103, 163
        ];

        assert_eq!(hash, expected_hash);
    }

    // Known-answer (consensus) vector with a fixed 112-byte input.
    #[test]
    fn test_verify_output() {
        let input = [
            172, 236, 108, 212, 181, 31, 109, 45, 44, 242, 54, 225, 143, 133,
            89, 44, 179, 108, 39, 191, 32, 116, 229, 33, 63, 130, 33, 120, 185, 89,
            146, 141, 10, 79, 183, 107, 238, 122, 92, 222, 25, 134, 90, 107, 116,
            110, 236, 53, 255, 5, 214, 126, 24, 216, 97, 199, 148, 239, 253, 102,
            199, 184, 232, 253, 158, 145, 86, 187, 112, 81, 78, 70, 80, 110, 33,
            37, 159, 233, 198, 1, 178, 108, 210, 100, 109, 155, 106, 124, 124, 83,
            89, 50, 197, 115, 231, 32, 74, 2, 92, 47, 25, 220, 135, 249, 122,
            172, 220, 137, 143, 234, 68, 188
        ];

        let mut scratch_pad = ScratchPad::default();
        let hash = xelis_hash(&input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap();

        let expected_hash = [
            242, 8, 176, 222, 203, 27, 104,
            187, 22, 40, 68, 73, 79, 79, 65,
            83, 138, 101, 10, 116, 194, 41, 153,
            21, 92, 163, 12, 206, 231, 156, 70, 83
        ];

        assert_eq!(hash, expected_hash);
    }

    #[test]
352 | #[cfg(feature = "tracker")] 353 | fn test_distribution() { 354 | const ITERATIONS: usize = 50_000; 355 | 356 | let mut scratch_pad = ScratchPad::default(); 357 | let mut input = [0u8; 112]; 358 | let mut distribution = OpsTracker::new(MEMORY_SIZE); 359 | for _ in 0..ITERATIONS { 360 | OsRng.fill_bytes(&mut input); 361 | let _ = xelis_hash(&input, &mut scratch_pad, &mut distribution).unwrap(); 362 | } 363 | 364 | distribution.generate_branch_distribution("branch_v3.png").unwrap(); 365 | distribution.generate_memory_usage_graph("memory_v3.png", 1000).unwrap(); 366 | } 367 | 368 | #[test] 369 | fn test_pick_half() { 370 | let mut ones = 0; 371 | let mut zeros = 0; 372 | 373 | for _ in 0..1_000_000 { 374 | let i = OsRng.next_u64(); 375 | if pick_half(i) { 376 | ones += 1; 377 | } else { 378 | zeros += 1; 379 | } 380 | } 381 | 382 | let ratio = ones as f64 / (ones + zeros) as f64; 383 | assert!((ratio - 0.5).abs() < 0.01, "pick_half is not balanced: ratio={}", ratio); 384 | } 385 | 386 | #[test] 387 | fn test_map_index() { 388 | for _ in 0..10_000_000 { 389 | let i = OsRng.next_u64(); 390 | let index = map_index(i); 391 | 392 | assert!(index < BUFFER_SIZE); 393 | } 394 | 395 | assert!(map_index(0) == 0); 396 | assert!(map_index(u64::MAX) < BUFFER_SIZE); 397 | } 398 | } -------------------------------------------------------------------------------- /src/v2.rs: -------------------------------------------------------------------------------- 1 | use aes::cipher::generic_array::GenericArray; 2 | use blake3::hash as blake3_hash; 3 | use chacha20::{ 4 | cipher::{KeyIvInit, StreamCipher}, 5 | ChaCha8, 6 | }; 7 | 8 | use crate::{ 9 | scratchpad::ScratchPad as ScratchPadInternal, 10 | Error, 11 | Hash, 12 | HASH_SIZE 13 | }; 14 | 15 | #[cfg(feature = "tracker")] 16 | use crate::tracker::{OpsTracker, MemOp}; 17 | 18 | // These are tweakable parameters 19 | // Memory size is the size of the scratch pad in u64s 20 | // In bytes, this is equal to ~ 440 kB 21 | const 
MEMORY_SIZE: usize = 429 * 128; 22 | 23 | // Scratchpad iterations in stage 3 24 | const SCRATCHPAD_ITERS: usize = 3; 25 | // Buffer size for stage 3 (inner loop iterations) 26 | const BUFFER_SIZE: usize = MEMORY_SIZE / 2; 27 | 28 | // Stage 1 config 29 | const CHUNK_SIZE: usize = 32; 30 | const NONCE_SIZE: usize = 12; 31 | const MEMORY_SIZE_BYTES: usize = MEMORY_SIZE * 8; 32 | 33 | // Stage 3 AES key 34 | const KEY: [u8; 16] = *b"xelishash-pow-v2"; 35 | 36 | pub type ScratchPad = ScratchPadInternal; 37 | 38 | // Combine two u64 into a u128 39 | #[inline(always)] 40 | pub(crate) fn combine_u64(high: u64, low: u64) -> u128 { 41 | (high as u128) << 64 | low as u128 42 | } 43 | 44 | // Stage 1 of the hashing algorithm 45 | // This stage is responsible for generating the scratch pad 46 | // The scratch pad is generated using ChaCha8 with a custom nonce 47 | // that is updated after each iteration 48 | pub(crate) fn stage_1(input: &[u8], scratch_pad: &mut ScratchPadInternal) -> Result<(), Error> { 49 | let bytes = scratch_pad.as_mut_bytes::()?; 50 | 51 | // Reset the scratchpad to 0 52 | // This is done to ensure that the scratchpad is clean 53 | // and prevent us to do multiple heap allocations in below loop 54 | bytes.fill(0); 55 | 56 | let mut output_offset = 0; 57 | let mut nonce = [0u8; NONCE_SIZE]; 58 | 59 | // Generate the nonce from the input 60 | let mut input_hash: Hash = blake3_hash(input).into(); 61 | nonce.copy_from_slice(&input_hash[..NONCE_SIZE]); 62 | 63 | let num_chunks = (input.len() + CHUNK_SIZE - 1) / CHUNK_SIZE; 64 | 65 | for (chunk_index, chunk) in input.chunks(CHUNK_SIZE).enumerate() { 66 | // Concatenate the input hash with the chunk 67 | let mut tmp = [0u8; HASH_SIZE * 2]; 68 | tmp[0..HASH_SIZE].copy_from_slice(&input_hash); 69 | tmp[HASH_SIZE..HASH_SIZE + chunk.len()].copy_from_slice(chunk); 70 | 71 | // Hash it to not trust the input 72 | input_hash = blake3_hash(&tmp).into(); 73 | 74 | let mut cipher = ChaCha8::new(&input_hash.into(), 
&nonce.into()); 75 | 76 | // Calculate the remaining size and how much to generate this iteration 77 | let remaining_output_size = OUTPUT_SIZE - output_offset; 78 | // Remaining chunks 79 | let chunks_left = num_chunks - chunk_index; 80 | let chunk_output_size = remaining_output_size / chunks_left; 81 | let current_output_size = remaining_output_size.min(chunk_output_size); 82 | 83 | // Apply the keystream to the output 84 | let offset = chunk_index * current_output_size; 85 | let part = &mut bytes[offset..offset+current_output_size]; 86 | cipher.apply_keystream(part); 87 | 88 | output_offset += current_output_size; 89 | 90 | // Update the nonce with the last NONCE_SIZE bytes of temp_output 91 | let nonce_start = current_output_size.saturating_sub(NONCE_SIZE); 92 | 93 | // Copy the new nonce 94 | nonce.copy_from_slice(&part[nonce_start..]); 95 | } 96 | 97 | Ok(()) 98 | } 99 | 100 | // Stage 3 of the hashing algorithm 101 | // This stage is responsible for hashing the scratch pad 102 | // Its goal is to have lot of random memory accesses 103 | // and some branching to make it hard to optimize on GPUs 104 | // it shouldn't be possible to parallelize this stage 105 | pub(crate) fn stage_3(scratch_pad: &mut [u64; MEMORY_SIZE], #[cfg(feature = "tracker")] tracker: &mut OpsTracker) -> Result<(), Error> { 106 | let key = GenericArray::from(KEY); 107 | let mut block = GenericArray::from([0u8; 16]); 108 | let buffer_size = BUFFER_SIZE as u64; 109 | 110 | // Create two new slices for each half 111 | let (mem_buffer_a, mem_buffer_b) = scratch_pad.as_mut_slice().split_at_mut(BUFFER_SIZE); 112 | 113 | let mut addr_a = mem_buffer_b[BUFFER_SIZE-1]; 114 | let mut addr_b = mem_buffer_a[BUFFER_SIZE-1] >> 32; 115 | 116 | #[cfg(feature = "tracker")] 117 | { 118 | tracker.add_mem_op(BUFFER_SIZE-1, MemOp::Read); 119 | tracker.add_mem_op(MEMORY_SIZE-1, MemOp::Read); 120 | } 121 | 122 | let mut r: usize = 0; 123 | 124 | for i in 0..SCRATCHPAD_ITERS { 125 | let index_a = (addr_a % 
buffer_size) as usize; 126 | let index_b = (addr_b % buffer_size) as usize; 127 | 128 | let mem_a = mem_buffer_a[index_a]; 129 | let mem_b = mem_buffer_b[index_b]; 130 | 131 | #[cfg(feature = "tracker")] 132 | { 133 | tracker.add_mem_op(index_a, MemOp::Read); 134 | tracker.add_mem_op(BUFFER_SIZE + index_b, MemOp::Read); 135 | } 136 | 137 | block[..8].copy_from_slice(&mem_b.to_le_bytes()); 138 | block[8..].copy_from_slice(&mem_a.to_le_bytes()); 139 | 140 | aes::hazmat::cipher_round(&mut block, &key); 141 | 142 | let hash1 = u64::from_le_bytes(block[0..8] 143 | .try_into() 144 | .map_err(|_| Error::FormatError)?); 145 | 146 | let hash2 = mem_a ^ mem_b; 147 | let mut result = !(hash1 ^ hash2); 148 | 149 | for j in 0..BUFFER_SIZE { 150 | let index_a = (result % buffer_size) as usize; 151 | let index_b = (!result.rotate_right(r as u32) % buffer_size) as usize; 152 | 153 | #[cfg(feature = "tracker")] 154 | { 155 | tracker.add_mem_op(index_a, MemOp::Read); 156 | tracker.add_mem_op(BUFFER_SIZE + index_b, MemOp::Read); 157 | } 158 | 159 | let a = mem_buffer_a[index_a]; 160 | let b = mem_buffer_b[index_b]; 161 | 162 | #[cfg(feature = "tracker")] 163 | { 164 | // This is the same index in scratchpad 165 | tracker.add_mem_op(r, MemOp::Read); 166 | } 167 | 168 | let c = if r < BUFFER_SIZE {mem_buffer_a[r]} else {mem_buffer_b[r-BUFFER_SIZE]}; 169 | r = if r < MEMORY_SIZE-1 {r+1} else {0}; 170 | 171 | let branch_idx = (result.rotate_left(c as u32) & 0xf) as u8; 172 | #[cfg(feature = "tracker")] 173 | { 174 | tracker.add_branch(branch_idx); 175 | } 176 | 177 | let v = result ^ match branch_idx { 178 | 0 => c.rotate_left(i.wrapping_mul(j) as u32) ^ b, 179 | 1 => c.rotate_right(i.wrapping_mul(j) as u32) ^ a, 180 | 2 => a ^ b ^ c, 181 | 3 => a.wrapping_add(b).wrapping_mul(c), 182 | 4 => b.wrapping_sub(c).wrapping_mul(a), 183 | 5 => c.wrapping_sub(a).wrapping_add(b), 184 | 6 => a.wrapping_sub(b).wrapping_add(c), 185 | 7 => b.wrapping_mul(c).wrapping_add(a), 186 | 8 => 
c.wrapping_mul(a).wrapping_add(b), 187 | 9 => a.wrapping_mul(b).wrapping_mul(c), 188 | 10 => { 189 | let t1 = combine_u64(a, b); 190 | let t2 = (c | 1) as u128; 191 | t1.wrapping_rem(t2) as u64 192 | }, 193 | 11 => { 194 | let t1 = combine_u64(b, c); 195 | let t2 = combine_u64(result.rotate_left(r as u32), a | 2); 196 | t1.wrapping_rem(t2) as u64 197 | }, 198 | 12 => { 199 | let t1 = combine_u64(c, a); 200 | let t2 = (b | 4) as u128; 201 | t1.wrapping_div(t2) as u64 202 | }, 203 | 13 => { 204 | let t1 = combine_u64(result.rotate_left(r as u32), b); 205 | let t2 = combine_u64(a, c | 8); 206 | if t1 > t2 {t1.wrapping_div(t2) as u64} else {a^b} 207 | }, 208 | 14 => { 209 | let t1 = combine_u64(b, a); 210 | let t2 = c as u128; 211 | (t1.wrapping_mul(t2) >> 64) as u64 212 | }, 213 | 15 => { 214 | let t1 = combine_u64(a, c); 215 | let t2 = combine_u64(result.rotate_right(r as u32), b); 216 | (t1.wrapping_mul(t2) >> 64) as u64 217 | }, 218 | _ => unreachable!(), 219 | }; 220 | 221 | result = v.rotate_left(1); 222 | 223 | #[cfg(feature = "tracker")] 224 | { 225 | tracker.add_mem_op(BUFFER_SIZE-j-1, MemOp::Write); 226 | tracker.add_mem_op(BUFFER_SIZE+j, MemOp::Write); 227 | } 228 | 229 | let t = mem_buffer_a[BUFFER_SIZE-j-1] ^ result; 230 | mem_buffer_a[BUFFER_SIZE-j-1] = t; 231 | mem_buffer_b[j] ^= t.rotate_right(result as u32); 232 | } 233 | addr_a = result; 234 | addr_b = isqrt(result); 235 | } 236 | 237 | Ok(()) 238 | } 239 | 240 | // Stage 4 hash the whole scratchpad using Blake3 to prevent any shortcut in 241 | // the scratchpad computation 242 | #[inline] 243 | pub(crate) fn stage_4(scratch_pad: &[u64]) -> Result { 244 | let bytes: &[u8] = bytemuck::try_cast_slice(scratch_pad) 245 | .map_err(Error::CastError)?; 246 | 247 | Ok(blake3_hash(bytes).into()) 248 | } 249 | 250 | fn isqrt(n: u64) -> u64 { 251 | if n < 2 { 252 | return n; 253 | } 254 | 255 | let mut x = n; 256 | let mut y = (x.wrapping_add(1)) >> 1; 257 | 258 | while y < x { 259 | x = y; 260 | y = 
(x.wrapping_add(n.wrapping_div(x))) >> 1; 261 | } 262 | 263 | x 264 | } 265 | 266 | // This function is used to hash the input using the generated scratch pad 267 | // NOTE: The scratchpad is completely overwritten in stage 1 and can be reused without any issues 268 | pub fn xelis_hash(input: &[u8], scratch_pad: &mut ScratchPad, #[cfg(feature = "tracker")] distribution: &mut OpsTracker) -> Result { 269 | stage_1::(input, scratch_pad)?; 270 | 271 | let scratch_pad = scratch_pad.as_mut_slice(); 272 | 273 | // stage 2 got removed as it got completely optimized on GPUs 274 | 275 | // stage 3 276 | stage_3(scratch_pad, #[cfg(feature = "tracker")] distribution)?; 277 | 278 | // final stage 4 279 | stage_4(scratch_pad) 280 | } 281 | 282 | #[cfg(test)] 283 | mod tests { 284 | use rand::{rngs::OsRng, RngCore}; 285 | use std::time::Instant; 286 | use super::*; 287 | 288 | const ITERATIONS: usize = 1000; 289 | 290 | #[test] 291 | fn test_reused_scratchpad() { 292 | let mut scratch_pad = ScratchPad::default(); 293 | let mut input = [0u8; 112]; 294 | OsRng.fill_bytes(&mut input); 295 | 296 | // Do a first hash 297 | let expected_hash = xelis_hash(&input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap(); 298 | 299 | // Do a second hash with dirty scratch pad but same input 300 | let hash = xelis_hash(&input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap(); 301 | assert_eq!(hash, expected_hash); 302 | } 303 | 304 | #[test] 305 | fn test_zero_hash() { 306 | let mut scratch_pad = ScratchPad::default(); 307 | let mut input = [0u8; 112]; 308 | 309 | let hash = xelis_hash(&mut input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap(); 310 | let expected_hash = [ 311 | 126, 219, 112, 240, 116, 133, 115, 144, 39, 40, 164, 312 | 105, 30, 158, 45, 126, 64, 67, 238, 52, 200, 35, 313 | 161, 19, 144, 211, 214, 225, 95, 190, 146, 27 314 | ]; 315 | 316 | assert_eq!(hash, 
expected_hash); 317 | } 318 | 319 | #[test] 320 | fn test_xelis_stages() { 321 | let mut input = [0u8; 112]; 322 | OsRng.fill_bytes(&mut input); 323 | 324 | let mut scratch_pad = ScratchPad::default(); 325 | let instant = Instant::now(); 326 | for i in 0..ITERATIONS { 327 | input[0] = i as u8; 328 | std::hint::black_box(stage_1::(&mut input, &mut scratch_pad).unwrap()); 329 | } 330 | println!("Stage 1 took: {} microseconds", instant.elapsed().as_micros() / ITERATIONS as u128); 331 | 332 | let instant = Instant::now(); 333 | for _ in 0..ITERATIONS { 334 | std::hint::black_box(stage_3(scratch_pad.as_mut_slice(), #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap()); 335 | } 336 | println!("Stage 3 took: {} microseconds", instant.elapsed().as_micros() / ITERATIONS as u128); 337 | 338 | let instant = Instant::now(); 339 | for _ in 0..ITERATIONS { 340 | std::hint::black_box(blake3_hash(scratch_pad.as_mut_bytes::().unwrap())); 341 | } 342 | println!("Stage 4 took: {} microseconds", instant.elapsed().as_micros() / ITERATIONS as u128); 343 | } 344 | 345 | #[test] 346 | fn test_verify_output() { 347 | let input = [ 348 | 172, 236, 108, 212, 181, 31, 109, 45, 44, 242, 54, 225, 143, 133, 349 | 89, 44, 179, 108, 39, 191, 32, 116, 229, 33, 63, 130, 33, 120, 185, 89, 350 | 146, 141, 10, 79, 183, 107, 238, 122, 92, 222, 25, 134, 90, 107, 116, 351 | 110, 236, 53, 255, 5, 214, 126, 24, 216, 97, 199, 148, 239, 253, 102, 352 | 199, 184, 232, 253, 158, 145, 86, 187, 112, 81, 78, 70, 80, 110, 33, 353 | 37, 159, 233, 198, 1, 178, 108, 210, 100, 109, 155, 106, 124, 124, 83, 354 | 89, 50, 197, 115, 231, 32, 74, 2, 92, 47, 25, 220, 135, 249, 122, 355 | 172, 220, 137, 143, 234, 68, 188 356 | ]; 357 | 358 | let mut scratch_pad = ScratchPad::default(); 359 | let hash = xelis_hash(&input, &mut scratch_pad, #[cfg(feature = "tracker")] &mut OpsTracker::new(MEMORY_SIZE)).unwrap(); 360 | 361 | let expected_hash = [ 362 | 199, 114, 154, 28, 4, 164, 196, 178, 117, 17, 148, 363 | 
203, 125, 228, 51, 145, 162, 222, 106, 202, 205, 364 | 55, 244, 178, 94, 29, 248, 242, 98, 221, 158, 179 365 | ]; 366 | 367 | assert_eq!(hash, expected_hash); 368 | } 369 | 370 | #[test] 371 | #[cfg(feature = "tracker")] 372 | fn test_distribution() { 373 | let mut scratch_pad = ScratchPad::default(); 374 | let mut input = [0u8; 112]; 375 | let mut distribution = OpsTracker::new(MEMORY_SIZE); 376 | for _ in 0..ITERATIONS { 377 | OsRng.fill_bytes(&mut input); 378 | let _ = xelis_hash(&input, &mut scratch_pad, &mut distribution).unwrap(); 379 | } 380 | 381 | distribution.generate_branch_distribution("branch_v2.png").unwrap(); 382 | distribution.generate_memory_usage_graph("memory_v2.png", 100).unwrap(); 383 | } 384 | } -------------------------------------------------------------------------------- /C/ChaCha20-SIMD/chacha20_sse2.c: -------------------------------------------------------------------------------- 1 | #include "chacha20.h" 2 | #include 3 | #include 4 | 5 | static inline void PartialXor(const __m128i val, uint8_t *Src, uint8_t *Dest, uint64_t Size) 6 | { 7 | _Alignas(16) uint8_t BuffForPartialOp[16]; 8 | memcpy(BuffForPartialOp, Src, Size); 9 | _mm_storeu_si128((__m128i *)(BuffForPartialOp), _mm_xor_si128(val, _mm_loadu_si128((const __m128i *)BuffForPartialOp))); 10 | memcpy(Dest, BuffForPartialOp, Size); 11 | } 12 | static inline void PartialStore(const __m128i val, uint8_t *Dest, uint64_t Size) 13 | { 14 | _Alignas(16) uint8_t BuffForPartialOp[16]; 15 | _mm_storeu_si128((__m128i *)(BuffForPartialOp), val); 16 | memcpy(Dest, BuffForPartialOp, Size); 17 | } 18 | 19 | static inline __m128i RotateLeft7(const __m128i val) 20 | { 21 | return _mm_or_si128(_mm_slli_epi32(val, 7), _mm_srli_epi32(val, 32 - 7)); 22 | } 23 | 24 | static inline __m128i RotateLeft8(const __m128i val) 25 | { 26 | return _mm_or_si128(_mm_slli_epi32(val, 8), _mm_srli_epi32(val, 32 - 8)); 27 | } 28 | 29 | static inline __m128i RotateLeft12(const __m128i val) 30 | { 31 | return 
_mm_or_si128(_mm_slli_epi32(val, 12), _mm_srli_epi32(val, 32 - 12)); 32 | } 33 | 34 | static inline __m128i RotateLeft16(const __m128i val) 35 | { 36 | return _mm_or_si128(_mm_slli_epi32(val, 16), _mm_srli_epi32(val, 32 - 16)); 37 | } 38 | 39 | static void ChaCha20EncryptBytes(uint8_t *state, uint8_t *In, uint8_t *Out, size_t Size, uint32_t rounds) 40 | { 41 | 42 | uint8_t *CurrentIn = In; 43 | uint8_t *CurrentOut = Out; 44 | 45 | uint64_t FullBlocksCount = Size / 256; 46 | uint64_t RemainingBytes = Size % 256; 47 | 48 | const __m128i state0 = _mm_set_epi32(1797285236, 2036477234, 857760878, 1634760805); //"expand 32-byte k" 49 | const __m128i state1 = _mm_loadu_si128((const __m128i *)(state)); 50 | const __m128i state2 = _mm_loadu_si128((const __m128i *)((state) + 16)); 51 | 52 | for (int64_t n = 0; n < FullBlocksCount; n++) 53 | { 54 | 55 | const __m128i state3 = _mm_loadu_si128((const __m128i *)((state) + 32)); 56 | 57 | __m128i r0_0 = state0; 58 | __m128i r0_1 = state1; 59 | __m128i r0_2 = state2; 60 | __m128i r0_3 = state3; 61 | 62 | __m128i r1_0 = state0; 63 | __m128i r1_1 = state1; 64 | __m128i r1_2 = state2; 65 | __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1)); 66 | 67 | __m128i r2_0 = state0; 68 | __m128i r2_1 = state1; 69 | __m128i r2_2 = state2; 70 | __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2)); 71 | 72 | __m128i r3_0 = state0; 73 | __m128i r3_1 = state1; 74 | __m128i r3_2 = state2; 75 | __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3)); 76 | 77 | for (int i = rounds; i > 0; i -= 2) 78 | { 79 | r0_0 = _mm_add_epi32(r0_0, r0_1); 80 | r1_0 = _mm_add_epi32(r1_0, r1_1); 81 | r2_0 = _mm_add_epi32(r2_0, r2_1); 82 | r3_0 = _mm_add_epi32(r3_0, r3_1); 83 | 84 | r0_3 = _mm_xor_si128(r0_3, r0_0); 85 | r1_3 = _mm_xor_si128(r1_3, r1_0); 86 | r2_3 = _mm_xor_si128(r2_3, r2_0); 87 | r3_3 = _mm_xor_si128(r3_3, r3_0); 88 | 89 | r0_3 = RotateLeft16(r0_3); 90 | r1_3 = RotateLeft16(r1_3); 91 | r2_3 = RotateLeft16(r2_3); 92 | r3_3 = 
RotateLeft16(r3_3); 93 | 94 | r0_2 = _mm_add_epi32(r0_2, r0_3); 95 | r1_2 = _mm_add_epi32(r1_2, r1_3); 96 | r2_2 = _mm_add_epi32(r2_2, r2_3); 97 | r3_2 = _mm_add_epi32(r3_2, r3_3); 98 | 99 | r0_1 = _mm_xor_si128(r0_1, r0_2); 100 | r1_1 = _mm_xor_si128(r1_1, r1_2); 101 | r2_1 = _mm_xor_si128(r2_1, r2_2); 102 | r3_1 = _mm_xor_si128(r3_1, r3_2); 103 | 104 | r0_1 = RotateLeft12(r0_1); 105 | r1_1 = RotateLeft12(r1_1); 106 | r2_1 = RotateLeft12(r2_1); 107 | r3_1 = RotateLeft12(r3_1); 108 | 109 | r0_0 = _mm_add_epi32(r0_0, r0_1); 110 | r1_0 = _mm_add_epi32(r1_0, r1_1); 111 | r2_0 = _mm_add_epi32(r2_0, r2_1); 112 | r3_0 = _mm_add_epi32(r3_0, r3_1); 113 | 114 | r0_3 = _mm_xor_si128(r0_3, r0_0); 115 | r1_3 = _mm_xor_si128(r1_3, r1_0); 116 | r2_3 = _mm_xor_si128(r2_3, r2_0); 117 | r3_3 = _mm_xor_si128(r3_3, r3_0); 118 | 119 | r0_3 = RotateLeft8(r0_3); 120 | r1_3 = RotateLeft8(r1_3); 121 | r2_3 = RotateLeft8(r2_3); 122 | r3_3 = RotateLeft8(r3_3); 123 | 124 | r0_2 = _mm_add_epi32(r0_2, r0_3); 125 | r1_2 = _mm_add_epi32(r1_2, r1_3); 126 | r2_2 = _mm_add_epi32(r2_2, r2_3); 127 | r3_2 = _mm_add_epi32(r3_2, r3_3); 128 | 129 | r0_1 = _mm_xor_si128(r0_1, r0_2); 130 | r1_1 = _mm_xor_si128(r1_1, r1_2); 131 | r2_1 = _mm_xor_si128(r2_1, r2_2); 132 | r3_1 = _mm_xor_si128(r3_1, r3_2); 133 | 134 | r0_1 = RotateLeft7(r0_1); 135 | r1_1 = RotateLeft7(r1_1); 136 | r2_1 = RotateLeft7(r2_1); 137 | r3_1 = RotateLeft7(r3_1); 138 | 139 | r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1)); 140 | r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); 141 | r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3)); 142 | 143 | r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1)); 144 | r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); 145 | r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3)); 146 | 147 | r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1)); 148 | r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); 149 | r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3)); 
150 | 151 | r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1)); 152 | r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); 153 | r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3)); 154 | 155 | r0_0 = _mm_add_epi32(r0_0, r0_1); 156 | r1_0 = _mm_add_epi32(r1_0, r1_1); 157 | r2_0 = _mm_add_epi32(r2_0, r2_1); 158 | r3_0 = _mm_add_epi32(r3_0, r3_1); 159 | 160 | r0_3 = _mm_xor_si128(r0_3, r0_0); 161 | r1_3 = _mm_xor_si128(r1_3, r1_0); 162 | r2_3 = _mm_xor_si128(r2_3, r2_0); 163 | r3_3 = _mm_xor_si128(r3_3, r3_0); 164 | 165 | r0_3 = RotateLeft16(r0_3); 166 | r1_3 = RotateLeft16(r1_3); 167 | r2_3 = RotateLeft16(r2_3); 168 | r3_3 = RotateLeft16(r3_3); 169 | 170 | r0_2 = _mm_add_epi32(r0_2, r0_3); 171 | r1_2 = _mm_add_epi32(r1_2, r1_3); 172 | r2_2 = _mm_add_epi32(r2_2, r2_3); 173 | r3_2 = _mm_add_epi32(r3_2, r3_3); 174 | 175 | r0_1 = _mm_xor_si128(r0_1, r0_2); 176 | r1_1 = _mm_xor_si128(r1_1, r1_2); 177 | r2_1 = _mm_xor_si128(r2_1, r2_2); 178 | r3_1 = _mm_xor_si128(r3_1, r3_2); 179 | 180 | r0_1 = RotateLeft12(r0_1); 181 | r1_1 = RotateLeft12(r1_1); 182 | r2_1 = RotateLeft12(r2_1); 183 | r3_1 = RotateLeft12(r3_1); 184 | 185 | r0_0 = _mm_add_epi32(r0_0, r0_1); 186 | r1_0 = _mm_add_epi32(r1_0, r1_1); 187 | r2_0 = _mm_add_epi32(r2_0, r2_1); 188 | r3_0 = _mm_add_epi32(r3_0, r3_1); 189 | 190 | r0_3 = _mm_xor_si128(r0_3, r0_0); 191 | r1_3 = _mm_xor_si128(r1_3, r1_0); 192 | r2_3 = _mm_xor_si128(r2_3, r2_0); 193 | r3_3 = _mm_xor_si128(r3_3, r3_0); 194 | 195 | r0_3 = RotateLeft8(r0_3); 196 | r1_3 = RotateLeft8(r1_3); 197 | r2_3 = RotateLeft8(r2_3); 198 | r3_3 = RotateLeft8(r3_3); 199 | 200 | r0_2 = _mm_add_epi32(r0_2, r0_3); 201 | r1_2 = _mm_add_epi32(r1_2, r1_3); 202 | r2_2 = _mm_add_epi32(r2_2, r2_3); 203 | r3_2 = _mm_add_epi32(r3_2, r3_3); 204 | 205 | r0_1 = _mm_xor_si128(r0_1, r0_2); 206 | r1_1 = _mm_xor_si128(r1_1, r1_2); 207 | r2_1 = _mm_xor_si128(r2_1, r2_2); 208 | r3_1 = _mm_xor_si128(r3_1, r3_2); 209 | 210 | r0_1 = RotateLeft7(r0_1); 211 | r1_1 = 
RotateLeft7(r1_1); 212 | r2_1 = RotateLeft7(r2_1); 213 | r3_1 = RotateLeft7(r3_1); 214 | 215 | r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3)); 216 | r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); 217 | r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1)); 218 | 219 | r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3)); 220 | r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); 221 | r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1)); 222 | 223 | r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3)); 224 | r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); 225 | r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1)); 226 | 227 | r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3)); 228 | r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); 229 | r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1)); 230 | } 231 | 232 | r0_0 = _mm_add_epi32(r0_0, state0); 233 | r0_1 = _mm_add_epi32(r0_1, state1); 234 | r0_2 = _mm_add_epi32(r0_2, state2); 235 | r0_3 = _mm_add_epi32(r0_3, state3); 236 | 237 | r1_0 = _mm_add_epi32(r1_0, state0); 238 | r1_1 = _mm_add_epi32(r1_1, state1); 239 | r1_2 = _mm_add_epi32(r1_2, state2); 240 | r1_3 = _mm_add_epi32(r1_3, state3); 241 | r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1)); 242 | 243 | r2_0 = _mm_add_epi32(r2_0, state0); 244 | r2_1 = _mm_add_epi32(r2_1, state1); 245 | r2_2 = _mm_add_epi32(r2_2, state2); 246 | r2_3 = _mm_add_epi32(r2_3, state3); 247 | r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2)); 248 | 249 | r3_0 = _mm_add_epi32(r3_0, state0); 250 | r3_1 = _mm_add_epi32(r3_1, state1); 251 | r3_2 = _mm_add_epi32(r3_2, state2); 252 | r3_3 = _mm_add_epi32(r3_3, state3); 253 | r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3)); 254 | 255 | if (In) 256 | { 257 | _mm_storeu_si128((__m128i *)(CurrentOut + 0 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 0 * 16)), r0_0)); 258 | _mm_storeu_si128((__m128i *)(CurrentOut + 1 * 16), 
_mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 1 * 16)), r0_1)); 259 | _mm_storeu_si128((__m128i *)(CurrentOut + 2 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 2 * 16)), r0_2)); 260 | _mm_storeu_si128((__m128i *)(CurrentOut + 3 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 3 * 16)), r0_3)); 261 | 262 | _mm_storeu_si128((__m128i *)(CurrentOut + 4 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 4 * 16)), r1_0)); 263 | _mm_storeu_si128((__m128i *)(CurrentOut + 5 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 5 * 16)), r1_1)); 264 | _mm_storeu_si128((__m128i *)(CurrentOut + 6 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 6 * 16)), r1_2)); 265 | _mm_storeu_si128((__m128i *)(CurrentOut + 7 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 7 * 16)), r1_3)); 266 | 267 | _mm_storeu_si128((__m128i *)(CurrentOut + 8 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 8 * 16)), r2_0)); 268 | _mm_storeu_si128((__m128i *)(CurrentOut + 9 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 9 * 16)), r2_1)); 269 | _mm_storeu_si128((__m128i *)(CurrentOut + 10 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 10 * 16)), r2_2)); 270 | _mm_storeu_si128((__m128i *)(CurrentOut + 11 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 11 * 16)), r2_3)); 271 | 272 | _mm_storeu_si128((__m128i *)(CurrentOut + 12 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 12 * 16)), r3_0)); 273 | _mm_storeu_si128((__m128i *)(CurrentOut + 13 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 13 * 16)), r3_1)); 274 | _mm_storeu_si128((__m128i *)(CurrentOut + 14 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 14 * 16)), r3_2)); 275 | _mm_storeu_si128((__m128i *)(CurrentOut + 15 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 15 * 16)), r3_3)); 
276 | CurrentIn += 256; 277 | } 278 | else 279 | { 280 | _mm_storeu_si128((__m128i *)(CurrentOut + 0 * 16), r0_0); 281 | _mm_storeu_si128((__m128i *)(CurrentOut + 1 * 16), r0_1); 282 | _mm_storeu_si128((__m128i *)(CurrentOut + 2 * 16), r0_2); 283 | _mm_storeu_si128((__m128i *)(CurrentOut + 3 * 16), r0_3); 284 | 285 | _mm_storeu_si128((__m128i *)(CurrentOut + 4 * 16), r1_0); 286 | _mm_storeu_si128((__m128i *)(CurrentOut + 5 * 16), r1_1); 287 | _mm_storeu_si128((__m128i *)(CurrentOut + 6 * 16), r1_2); 288 | _mm_storeu_si128((__m128i *)(CurrentOut + 7 * 16), r1_3); 289 | 290 | _mm_storeu_si128((__m128i *)(CurrentOut + 8 * 16), r2_0); 291 | _mm_storeu_si128((__m128i *)(CurrentOut + 9 * 16), r2_1); 292 | _mm_storeu_si128((__m128i *)(CurrentOut + 10 * 16), r2_2); 293 | _mm_storeu_si128((__m128i *)(CurrentOut + 11 * 16), r2_3); 294 | 295 | _mm_storeu_si128((__m128i *)(CurrentOut + 12 * 16), r3_0); 296 | _mm_storeu_si128((__m128i *)(CurrentOut + 13 * 16), r3_1); 297 | _mm_storeu_si128((__m128i *)(CurrentOut + 14 * 16), r3_2); 298 | _mm_storeu_si128((__m128i *)(CurrentOut + 15 * 16), r3_3); 299 | } 300 | 301 | CurrentOut += 256; 302 | 303 | ChaCha20AddCounter(state, 4); 304 | } 305 | 306 | if (RemainingBytes == 0) 307 | return; 308 | 309 | while (1) 310 | { 311 | const __m128i state3 = _mm_loadu_si128((const __m128i *)((state) + 32)); 312 | 313 | __m128i r0_0 = state0; 314 | __m128i r0_1 = state1; 315 | __m128i r0_2 = state2; 316 | __m128i r0_3 = state3; 317 | 318 | for (int i = rounds; i > 0; i -= 2) 319 | { 320 | r0_0 = _mm_add_epi32(r0_0, r0_1); 321 | 322 | r0_3 = _mm_xor_si128(r0_3, r0_0); 323 | 324 | r0_3 = RotateLeft16(r0_3); 325 | 326 | r0_2 = _mm_add_epi32(r0_2, r0_3); 327 | 328 | r0_1 = _mm_xor_si128(r0_1, r0_2); 329 | 330 | r0_1 = RotateLeft12(r0_1); 331 | 332 | r0_0 = _mm_add_epi32(r0_0, r0_1); 333 | 334 | r0_3 = _mm_xor_si128(r0_3, r0_0); 335 | 336 | r0_3 = RotateLeft8(r0_3); 337 | 338 | r0_2 = _mm_add_epi32(r0_2, r0_3); 339 | 340 | r0_1 = _mm_xor_si128(r0_1, 
r0_2); 341 | 342 | r0_1 = RotateLeft7(r0_1); 343 | 344 | r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1)); 345 | r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); 346 | r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3)); 347 | 348 | r0_0 = _mm_add_epi32(r0_0, r0_1); 349 | 350 | r0_3 = _mm_xor_si128(r0_3, r0_0); 351 | 352 | r0_3 = RotateLeft16(r0_3); 353 | 354 | r0_2 = _mm_add_epi32(r0_2, r0_3); 355 | 356 | r0_1 = _mm_xor_si128(r0_1, r0_2); 357 | 358 | r0_1 = RotateLeft12(r0_1); 359 | 360 | r0_0 = _mm_add_epi32(r0_0, r0_1); 361 | 362 | r0_3 = _mm_xor_si128(r0_3, r0_0); 363 | 364 | r0_3 = RotateLeft8(r0_3); 365 | 366 | r0_2 = _mm_add_epi32(r0_2, r0_3); 367 | 368 | r0_1 = _mm_xor_si128(r0_1, r0_2); 369 | 370 | r0_1 = RotateLeft7(r0_1); 371 | 372 | r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3)); 373 | r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); 374 | r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1)); 375 | } 376 | 377 | r0_0 = _mm_add_epi32(r0_0, state0); 378 | r0_1 = _mm_add_epi32(r0_1, state1); 379 | r0_2 = _mm_add_epi32(r0_2, state2); 380 | r0_3 = _mm_add_epi32(r0_3, state3); 381 | 382 | if (RemainingBytes >= 64) 383 | { 384 | 385 | if (In) 386 | { 387 | _mm_storeu_si128((__m128i *)(CurrentOut + 0 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 0 * 16)), r0_0)); 388 | _mm_storeu_si128((__m128i *)(CurrentOut + 1 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 1 * 16)), r0_1)); 389 | _mm_storeu_si128((__m128i *)(CurrentOut + 2 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 2 * 16)), r0_2)); 390 | _mm_storeu_si128((__m128i *)(CurrentOut + 3 * 16), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(CurrentIn + 3 * 16)), r0_3)); 391 | CurrentIn += 64; 392 | } 393 | else 394 | { 395 | _mm_storeu_si128((__m128i *)(CurrentOut + 0 * 16), r0_0); 396 | _mm_storeu_si128((__m128i *)(CurrentOut + 1 * 16), r0_1); 397 | _mm_storeu_si128((__m128i *)(CurrentOut + 2 * 16), r0_2); 398 
/*
 * chacha_encrypt_sse2 — one-shot ChaCha encryption/keystream entry point
 * (SSE2 path).
 *
 * key    : 32-byte key (consumed by ChaCha20SetKey).
 * nonce  : nonce bytes (consumed by ChaCha20SetNonce).
 * in     : plaintext to XOR with the keystream, or NULL to emit raw
 *          keystream into `out` (the kernel branches on a NULL input).
 * out    : destination buffer, at least `bytes` long.
 * bytes  : number of bytes to produce.
 * rounds : ChaCha round count (20 for standard ChaCha20; the kernel
 *          iterates `rounds / 2` double-rounds).
 *
 * The 48-byte state is key(32) || counter+nonce(16); the block counter
 * starts at 0 because the buffer is zero-initialized.
 */
void chacha_encrypt_sse2(uint8_t *key, uint8_t *nonce, uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds)
{
    /* _Alignas(16): the SIMD kernel reads this buffer with 16-byte vector
       loads; a bare uint8_t array only guarantees byte alignment, which
       would fault on any aligned-load path (the AVX2 twin of this kernel
       uses _mm_load_si128 on the same layout).  Alignment is harmless when
       only unaligned loads are used, so this is safe either way. */
    _Alignas(16) uint8_t state[48] = {0};
    ChaCha20SetKey(state, key);
    ChaCha20SetNonce(state, nonce);
    ChaCha20EncryptBytes(state, in, out, bytes, rounds);
}
/*
 * PartialXor — XOR fewer than 32 keystream bytes (`val`) against Src into
 * Dest.  Bounces through an aligned scratch buffer so the 32-byte vector
 * load/store never touches memory outside the caller's Size bytes.
 */
static inline void PartialXor(const __m256i val, const uint8_t *Src, uint8_t *Dest, uint64_t Size)
{
    _Alignas(32) uint8_t BuffForPartialOp[32];
    memcpy(BuffForPartialOp, Src, Size);
    _mm256_storeu_si256((__m256i *)(BuffForPartialOp),
                        _mm256_xor_si256(val, _mm256_loadu_si256((const __m256i *)BuffForPartialOp)));
    memcpy(Dest, BuffForPartialOp, Size);
}

/* PartialStore — write only the first `Size` (< 32) bytes of `val` to Dest. */
static inline void PartialStore(const __m256i val, uint8_t *Dest, uint64_t Size)
{
    _Alignas(32) uint8_t BuffForPartialOp[32];
    _mm256_storeu_si256((__m256i *)(BuffForPartialOp), val);
    memcpy(Dest, BuffForPartialOp, Size);
}

/* Per-lane 32-bit left rotations.  Rotates by 7 and 12 use shift+or;
   rotates by 8 and 16 are byte-granular, so they are implemented as a
   single byte shuffle (_mm256_shuffle_epi8) instead of two shifts. */
static __m256i RotateLeft7(const __m256i val)
{
    return _mm256_or_si256(_mm256_slli_epi32(val, 7),
                           _mm256_srli_epi32(val, 32 - 7));
}

static __m256i RotateLeft12(const __m256i val)
{
    return _mm256_or_si256(_mm256_slli_epi32(val, 12),
                           _mm256_srli_epi32(val, 32 - 12));
}

static __m256i RotateLeft8(const __m256i val)
{
    const __m256i mask =
        _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, 14,
                        13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
    return _mm256_shuffle_epi8(val, mask);
}

static __m256i RotateLeft16(const __m256i val)
{
    const __m256i mask =
        _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13,
                        12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
    return _mm256_shuffle_epi8(val, mask);
}

/*
 * ChaCha20EncryptBytes — AVX2 ChaCha kernel.
 *
 * state  : 48 bytes = key(32) || counter+nonce(16).  MUST be 16-byte
 *          aligned: the three _mm_load_si128 calls below are aligned loads.
 * In     : plaintext, or NULL to emit raw keystream.
 * Out    : destination (>= Size bytes).
 * Size   : byte count.
 * rounds : ChaCha round count; the loop consumes two rounds per iteration.
 *
 * Layout trick: each __m256i row holds the same 128-bit ChaCha state row
 * broadcast to both lanes, with different block counters added per lane via
 * CTRn, so every X-register quadruple computes TWO blocks at once.  The
 * main loop runs four such quadruples (X0..X3) = 8 blocks = 512 bytes per
 * iteration; _mm256_permute2x128_si256 afterwards regroups lane halves
 * back into contiguous 64-byte blocks.
 */
static void ChaCha20EncryptBytes(uint8_t *state, uint8_t *In, uint8_t *Out, size_t Size, uint32_t rounds)
{

    uint8_t *CurrentIn = In;
    uint8_t *CurrentOut = Out;

    uint64_t FullBlocksCount = Size / 512;
    uint64_t RemainingBytes = Size % 512;

    /* Row 0 is the ChaCha constant "expand 32-byte k"; rows 1-2 are the key. */
    const __m256i state0 = _mm256_broadcastsi128_si256(_mm_set_epi32(1797285236, 2036477234, 857760878, 1634760805)); //"expand 32-byte k"
    const __m256i state1 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i *)(state)));
    const __m256i state2 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i *)(state + 16)));

    /* Counter increments per lane: high lane = base+0..3, low lane = base+4..7. */
    __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 4);
    const __m256i CTR1 = _mm256_set_epi32(0, 0, 0, 1, 0, 0, 0, 5);
    const __m256i CTR2 = _mm256_set_epi32(0, 0, 0, 2, 0, 0, 0, 6);
    const __m256i CTR3 = _mm256_set_epi32(0, 0, 0, 3, 0, 0, 0, 7);

    for (uint64_t n = 0; n < FullBlocksCount; n++)
    {
        /* Row 3 (counter || nonce) is reloaded each iteration because
           ChaCha20AddCounter below mutates it in memory. */
        const __m256i state3 = _mm256_broadcastsi128_si256(
            _mm_load_si128((const __m128i *)(state + 32)));

        __m256i X0_0 = state0;
        __m256i X0_1 = state1;
        __m256i X0_2 = state2;
        __m256i X0_3 = _mm256_add_epi32(state3, CTR0);

        __m256i X1_0 = state0;
        __m256i X1_1 = state1;
        __m256i X1_2 = state2;
        __m256i X1_3 = _mm256_add_epi32(state3, CTR1);

        __m256i X2_0 = state0;
        __m256i X2_1 = state1;
        __m256i X2_2 = state2;
        __m256i X2_3 = _mm256_add_epi32(state3, CTR2);

        __m256i X3_0 = state0;
        __m256i X3_1 = state1;
        __m256i X3_2 = state2;
        __m256i X3_3 = _mm256_add_epi32(state3, CTR3);

        /* Each iteration performs one column round + one diagonal round
           (a ChaCha double-round) across all four block pairs. */
        for (int i = rounds; i > 0; i -= 2)
        {
            /* --- column round: a+=b; d^=a; d<<<=16; c+=d; b^=c; b<<<=12;
                   a+=b; d^=a; d<<<=8; c+=d; b^=c; b<<<=7 --- */
            X0_0 = _mm256_add_epi32(X0_0, X0_1);
            X1_0 = _mm256_add_epi32(X1_0, X1_1);
            X2_0 = _mm256_add_epi32(X2_0, X2_1);
            X3_0 = _mm256_add_epi32(X3_0, X3_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);
            X1_3 = _mm256_xor_si256(X1_3, X1_0);
            X2_3 = _mm256_xor_si256(X2_3, X2_0);
            X3_3 = _mm256_xor_si256(X3_3, X3_0);

            X0_3 = RotateLeft16(X0_3);
            X1_3 = RotateLeft16(X1_3);
            X2_3 = RotateLeft16(X2_3);
            X3_3 = RotateLeft16(X3_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);
            X1_2 = _mm256_add_epi32(X1_2, X1_3);
            X2_2 = _mm256_add_epi32(X2_2, X2_3);
            X3_2 = _mm256_add_epi32(X3_2, X3_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);
            X1_1 = _mm256_xor_si256(X1_1, X1_2);
            X2_1 = _mm256_xor_si256(X2_1, X2_2);
            X3_1 = _mm256_xor_si256(X3_1, X3_2);

            X0_1 = RotateLeft12(X0_1);
            X1_1 = RotateLeft12(X1_1);
            X2_1 = RotateLeft12(X2_1);
            X3_1 = RotateLeft12(X3_1);

            X0_0 = _mm256_add_epi32(X0_0, X0_1);
            X1_0 = _mm256_add_epi32(X1_0, X1_1);
            X2_0 = _mm256_add_epi32(X2_0, X2_1);
            X3_0 = _mm256_add_epi32(X3_0, X3_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);
            X1_3 = _mm256_xor_si256(X1_3, X1_0);
            X2_3 = _mm256_xor_si256(X2_3, X2_0);
            X3_3 = _mm256_xor_si256(X3_3, X3_0);

            X0_3 = RotateLeft8(X0_3);
            X1_3 = RotateLeft8(X1_3);
            X2_3 = RotateLeft8(X2_3);
            X3_3 = RotateLeft8(X3_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);
            X1_2 = _mm256_add_epi32(X1_2, X1_3);
            X2_2 = _mm256_add_epi32(X2_2, X2_3);
            X3_2 = _mm256_add_epi32(X3_2, X3_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);
            X1_1 = _mm256_xor_si256(X1_1, X1_2);
            X2_1 = _mm256_xor_si256(X2_1, X2_2);
            X3_1 = _mm256_xor_si256(X3_1, X3_2);

            X0_1 = RotateLeft7(X0_1);
            X1_1 = RotateLeft7(X1_1);
            X2_1 = RotateLeft7(X2_1);
            X3_1 = RotateLeft7(X3_1);

            /* Rotate rows 1-3 so the next round operates on diagonals. */
            X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));
            X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
            X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));

            X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1));
            X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
            X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3));

            X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1));
            X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
            X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3));

            X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1));
            X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
            X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3));

            /* --- diagonal round (same quarter-round on rotated rows) --- */
            X0_0 = _mm256_add_epi32(X0_0, X0_1);
            X1_0 = _mm256_add_epi32(X1_0, X1_1);
            X2_0 = _mm256_add_epi32(X2_0, X2_1);
            X3_0 = _mm256_add_epi32(X3_0, X3_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);
            X1_3 = _mm256_xor_si256(X1_3, X1_0);
            X2_3 = _mm256_xor_si256(X2_3, X2_0);
            X3_3 = _mm256_xor_si256(X3_3, X3_0);

            X0_3 = RotateLeft16(X0_3);
            X1_3 = RotateLeft16(X1_3);
            X2_3 = RotateLeft16(X2_3);
            X3_3 = RotateLeft16(X3_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);
            X1_2 = _mm256_add_epi32(X1_2, X1_3);
            X2_2 = _mm256_add_epi32(X2_2, X2_3);
            X3_2 = _mm256_add_epi32(X3_2, X3_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);
            X1_1 = _mm256_xor_si256(X1_1, X1_2);
            X2_1 = _mm256_xor_si256(X2_1, X2_2);
            X3_1 = _mm256_xor_si256(X3_1, X3_2);

            X0_1 = RotateLeft12(X0_1);
            X1_1 = RotateLeft12(X1_1);
            X2_1 = RotateLeft12(X2_1);
            X3_1 = RotateLeft12(X3_1);

            X0_0 = _mm256_add_epi32(X0_0, X0_1);
            X1_0 = _mm256_add_epi32(X1_0, X1_1);
            X2_0 = _mm256_add_epi32(X2_0, X2_1);
            X3_0 = _mm256_add_epi32(X3_0, X3_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);
            X1_3 = _mm256_xor_si256(X1_3, X1_0);
            X2_3 = _mm256_xor_si256(X2_3, X2_0);
            X3_3 = _mm256_xor_si256(X3_3, X3_0);

            X0_3 = RotateLeft8(X0_3);
            X1_3 = RotateLeft8(X1_3);
            X2_3 = RotateLeft8(X2_3);
            X3_3 = RotateLeft8(X3_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);
            X1_2 = _mm256_add_epi32(X1_2, X1_3);
            X2_2 = _mm256_add_epi32(X2_2, X2_3);
            X3_2 = _mm256_add_epi32(X3_2, X3_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);
            X1_1 = _mm256_xor_si256(X1_1, X1_2);
            X2_1 = _mm256_xor_si256(X2_1, X2_2);
            X3_1 = _mm256_xor_si256(X3_1, X3_2);

            X0_1 = RotateLeft7(X0_1);
            X1_1 = RotateLeft7(X1_1);
            X2_1 = RotateLeft7(X2_1);
            X3_1 = RotateLeft7(X3_1);

            /* Undo the diagonalization (inverse row rotations). */
            X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));
            X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
            X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));

            X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3));
            X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
            X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1));

            X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3));
            X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
            X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1));

            X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3));
            X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
            X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1));
        }

        /* Feed-forward: add the initial state (incl. per-lane counters). */
        X0_0 = _mm256_add_epi32(X0_0, state0);
        X0_1 = _mm256_add_epi32(X0_1, state1);
        X0_2 = _mm256_add_epi32(X0_2, state2);
        X0_3 = _mm256_add_epi32(X0_3, state3);
        X0_3 = _mm256_add_epi32(X0_3, CTR0);

        X1_0 = _mm256_add_epi32(X1_0, state0);
        X1_1 = _mm256_add_epi32(X1_1, state1);
        X1_2 = _mm256_add_epi32(X1_2, state2);
        X1_3 = _mm256_add_epi32(X1_3, state3);
        X1_3 = _mm256_add_epi32(X1_3, CTR1);

        X2_0 = _mm256_add_epi32(X2_0, state0);
        X2_1 = _mm256_add_epi32(X2_1, state1);
        X2_2 = _mm256_add_epi32(X2_2, state2);
        X2_3 = _mm256_add_epi32(X2_3, state3);
        X2_3 = _mm256_add_epi32(X2_3, CTR2);

        X3_0 = _mm256_add_epi32(X3_0, state0);
        X3_1 = _mm256_add_epi32(X3_1, state1);
        X3_2 = _mm256_add_epi32(X3_2, state2);
        X3_3 = _mm256_add_epi32(X3_3, state3);
        X3_3 = _mm256_add_epi32(X3_3, CTR3);

        /* Regroup lane halves into contiguous blocks: imm 0x31 (1+(3<<4))
           pairs the HIGH lanes (blocks counter+0..3, first 256 bytes);
           imm 0x20 (0+(2<<4)) pairs the LOW lanes (blocks counter+4..7). */
        if (In)
        {
            _mm256_storeu_si256((__m256i *)(CurrentOut + 0 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 0 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 1 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 1 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 2 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 2 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 3 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 3 * 32))));

            _mm256_storeu_si256((__m256i *)(CurrentOut + 4 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 4 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 5 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 5 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 6 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 6 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 7 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 7 * 32))));

            _mm256_storeu_si256((__m256i *)(CurrentOut + 8 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 8 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 9 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 9 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 10 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 10 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 11 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 11 * 32))));

            _mm256_storeu_si256((__m256i *)(CurrentOut + 12 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 12 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 13 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 13 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 14 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 14 * 32))));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 15 * 32),
                                _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)),
                                                 _mm256_loadu_si256((const __m256i *)(CurrentIn + 15 * 32))));
        }
        else
        {
            /* Keystream-only path: store without XOR. */
            _mm256_storeu_si256((__m256i *)(CurrentOut + 0 * 32),
                                _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 1 * 32),
                                _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 2 * 32),
                                _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 3 * 32),
                                _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)));

            _mm256_storeu_si256((__m256i *)(CurrentOut + 4 * 32),
                                _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 5 * 32),
                                _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 6 * 32),
                                _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 7 * 32),
                                _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)));

            _mm256_storeu_si256((__m256i *)(CurrentOut + 8 * 32),
                                _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 9 * 32),
                                _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 10 * 32),
                                _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 11 * 32),
                                _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)));

            _mm256_storeu_si256((__m256i *)(CurrentOut + 12 * 32),
                                _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 13 * 32),
                                _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 14 * 32),
                                _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)));
            _mm256_storeu_si256((__m256i *)(CurrentOut + 15 * 32),
                                _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)));
        }

        ChaCha20AddCounter(state, 8); /* consumed 8 blocks */
        if (CurrentIn)
            CurrentIn += 512;
        CurrentOut += 512;
    }

    if (RemainingBytes == 0)
        return;

    /* Tail path: one register set = 2 blocks (128 bytes) per pass.
       High lane = counter+0, low lane = counter+1. */
    CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1);

    while (1)
    {

        const __m256i state3 = _mm256_broadcastsi128_si256(
            _mm_load_si128((const __m128i *)(state + 32)));

        __m256i X0_0 = state0;
        __m256i X0_1 = state1;
        __m256i X0_2 = state2;
        __m256i X0_3 = _mm256_add_epi32(state3, CTR0);

        for (int i = rounds; i > 0; i -= 2)
        {
            /* column round */
            X0_0 = _mm256_add_epi32(X0_0, X0_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);

            X0_3 = RotateLeft16(X0_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);

            X0_1 = RotateLeft12(X0_1);

            X0_0 = _mm256_add_epi32(X0_0, X0_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);

            X0_3 = RotateLeft8(X0_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);

            X0_1 = RotateLeft7(X0_1);

            X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));
            X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
            X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));

            /* diagonal round */
            X0_0 = _mm256_add_epi32(X0_0, X0_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);

            X0_3 = RotateLeft16(X0_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);

            X0_1 = RotateLeft12(X0_1);

            X0_0 = _mm256_add_epi32(X0_0, X0_1);

            X0_3 = _mm256_xor_si256(X0_3, X0_0);

            X0_3 = RotateLeft8(X0_3);

            X0_2 = _mm256_add_epi32(X0_2, X0_3);

            X0_1 = _mm256_xor_si256(X0_1, X0_2);

            X0_1 = RotateLeft7(X0_1);

            X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));
            X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
            X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));
        }

        X0_0 = _mm256_add_epi32(X0_0, state0);
        X0_1 = _mm256_add_epi32(X0_1, state1);
        X0_2 = _mm256_add_epi32(X0_2, state2);
        X0_3 = _mm256_add_epi32(X0_3, state3);
        X0_3 = _mm256_add_epi32(X0_3, CTR0);

        if (RemainingBytes >= 128)
        {
            if (In)
            {
                _mm256_storeu_si256((__m256i *)(CurrentOut + 0 * 32),
                                    _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)),
                                                     _mm256_loadu_si256((const __m256i *)(CurrentIn + 0 * 32))));
                _mm256_storeu_si256((__m256i *)(CurrentOut + 1 * 32),
                                    _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)),
                                                     _mm256_loadu_si256((const __m256i *)(CurrentIn + 1 * 32))));
                _mm256_storeu_si256((__m256i *)(CurrentOut + 2 * 32),
                                    _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)),
                                                     _mm256_loadu_si256((const __m256i *)(CurrentIn + 2 * 32))));
                _mm256_storeu_si256((__m256i *)(CurrentOut + 3 * 32),
                                    _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)),
                                                     _mm256_loadu_si256((const __m256i *)(CurrentIn + 3 * 32))));
            }
            else
            {
                _mm256_storeu_si256((__m256i *)(CurrentOut + 0 * 32),
                                    _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));
                _mm256_storeu_si256((__m256i *)(CurrentOut + 1 * 32),
                                    _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));
                _mm256_storeu_si256((__m256i *)(CurrentOut + 2 * 32),
                                    _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));
                _mm256_storeu_si256((__m256i *)(CurrentOut + 3 * 32),
                                    _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));
            }
            ChaCha20AddCounter(state, 2);
            RemainingBytes -= 128;
            if (RemainingBytes == 0)
                return;
            if (CurrentIn)
                CurrentIn += 128;
            CurrentOut += 128;
            continue;
        }
        else // last, partial block
        {
            /* Emit the (up to) four 32-byte chunks one by one; the counter
               bump is 1 while still inside block counter+0 and 2 once the
               second block's bytes (chunks 2-3) have been touched. */
            __m256i tmp;
            if (In) // encrypt
            {
                tmp = _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4));
                if (RemainingBytes < 32)
                {
                    PartialXor(tmp, CurrentIn, CurrentOut, RemainingBytes);
                    ChaCha20AddCounter(state, 1);
                    return;
                }
                _mm256_storeu_si256((__m256i *)(CurrentOut), _mm256_xor_si256(tmp, _mm256_loadu_si256((const __m256i *)(CurrentIn))));
                RemainingBytes -= 32;
                if (RemainingBytes == 0)
                {
                    ChaCha20AddCounter(state, 1);
                    return;
                }

                CurrentIn += 32;
                CurrentOut += 32;

                tmp = _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4));
                if (RemainingBytes < 32)
                {
                    PartialXor(tmp, CurrentIn, CurrentOut, RemainingBytes);
                    ChaCha20AddCounter(state, 1);
                    return;
                }
                _mm256_storeu_si256((__m256i *)(CurrentOut), _mm256_xor_si256(tmp, _mm256_loadu_si256((const __m256i *)(CurrentIn))));
                RemainingBytes -= 32;
                if (RemainingBytes == 0)
                {
                    ChaCha20AddCounter(state, 1);
                    return;
                }
                CurrentIn += 32;
                CurrentOut += 32;

                tmp = _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4));
                if (RemainingBytes < 32)
                {
                    PartialXor(tmp, CurrentIn, CurrentOut, RemainingBytes);
                    ChaCha20AddCounter(state, 2);
                    return;
                }
                _mm256_storeu_si256((__m256i *)(CurrentOut), _mm256_xor_si256(tmp, _mm256_loadu_si256((const __m256i *)(CurrentIn))));
                RemainingBytes -= 32;
                if (RemainingBytes == 0)
                {
                    ChaCha20AddCounter(state, 2);
                    return;
                }
                CurrentIn += 32;
                CurrentOut += 32;

                tmp = _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4));
                PartialXor(tmp, CurrentIn, CurrentOut, RemainingBytes);
                ChaCha20AddCounter(state, 2);
                return;
            }
            else
            {
                /* keystream-only tail */
                tmp = _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4));
                if (RemainingBytes < 32)
                {
                    PartialStore(tmp, CurrentOut, RemainingBytes);
                    ChaCha20AddCounter(state, 1);
                    return;
                }
                _mm256_storeu_si256((__m256i *)(CurrentOut), tmp);
                RemainingBytes -= 32;
                if (RemainingBytes == 0)
                {
                    ChaCha20AddCounter(state, 1);
                    return;
                }
                CurrentOut += 32;

                tmp = _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4));

                if (RemainingBytes < 32)
                {
                    PartialStore(tmp, CurrentOut, RemainingBytes);
                    ChaCha20AddCounter(state, 1);
                    return;
                }
                _mm256_storeu_si256((__m256i *)(CurrentOut), tmp);
                RemainingBytes -= 32;
                if (RemainingBytes == 0)
                {
                    ChaCha20AddCounter(state, 1);
                    return;
                }
                CurrentOut += 32;

                tmp = _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4));
                if (RemainingBytes < 32)
                {
                    PartialStore(tmp, CurrentOut, RemainingBytes);
                    ChaCha20AddCounter(state, 2);
                    return;
                }
                _mm256_storeu_si256((__m256i *)(CurrentOut), tmp);
                RemainingBytes -= 32;
                if (RemainingBytes == 0)
                {
                    ChaCha20AddCounter(state, 2);
                    return;
                }
                CurrentOut += 32;

                tmp = _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4));
                PartialStore(tmp, CurrentOut, RemainingBytes);
                ChaCha20AddCounter(state, 2);
                return;
            }
        }
    }
}

/*
 * chacha_encrypt_avx2 — one-shot ChaCha encryption/keystream entry point
 * (AVX2 path).  Parameters mirror chacha_encrypt_sse2: in == NULL emits
 * raw keystream.  The 48-byte state is key(32) || counter+nonce(16), with
 * the counter starting at 0.
 */
void chacha_encrypt_avx2(uint8_t *key, uint8_t *nonce, uint8_t *in, uint8_t *out, size_t bytes, uint32_t rounds)
{
    /* FIX: ChaCha20EncryptBytes reads this buffer with ALIGNED loads
       (_mm_load_si128 at state, state+16, state+32).  A plain uint8_t
       array only guarantees 1-byte alignment, so a misaligned stack slot
       would fault (vmovdqa requires 16-byte alignment).  _Alignas(16)
       makes the precondition hold by construction. */
    _Alignas(16) uint8_t state[48] = {0};
    ChaCha20SetKey(state, key);
    ChaCha20SetNonce(state, nonce);
    ChaCha20EncryptBytes(state, in, out, bytes, rounds);
}