├── .gitignore ├── Makefile ├── .gitmodules ├── util └── KeccakSum │ ├── base64.h │ └── base64.c ├── .github └── workflows │ └── CI.yml ├── tests ├── testPerformance.h ├── testKangarooTwelve.h ├── timing.c ├── main.c ├── testPerformance.c ├── timing.h └── testKangarooTwelve.c ├── lib ├── Plain64 │ ├── KeccakP-1600-plain64.c │ └── KeccakP-1600-SnP.h ├── KT-threadpool.c ├── align.h ├── Inplace32BI │ └── KeccakP-1600-SnP.h ├── KangarooTwelve-threading.h ├── ARMv8Asha3 │ ├── KeccakP-1600-SnP.h │ ├── KeccakP-1600-opt64.c │ └── KeccakP-1600-runtimeDispatch.c ├── KT-threadpool-sequential.c ├── Optimized64 │ ├── KeccakP-1600-SnP.h │ ├── KeccakP-1600-AVX512-plainC.c │ ├── KeccakP-1600-runtimeDispatch.c │ └── KeccakP-1600-timesN-SSSE3.c ├── KT-threadpool.h ├── brg_endian.h ├── KT-threadpool-pthread.c ├── KangarooTwelve.h └── KangarooTwelve-threading.c ├── Python ├── Utils.py ├── KangarooTwelve.py ├── TurboSHAKE256Tests.py ├── TurboSHAKE128Tests.py ├── KangarooTwelveTests.py └── TurboSHAKE.py ├── README.markdown ├── Makefile.build └── .travis.yml /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | var/ 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | XKCBpath = support/XKCBuild 2 | include $(XKCBpath)/src/Main.makefile 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "support/XKCBuild"] 2 | path = support/XKCBuild 3 | url = https://github.com/XKCP/XKCBuild.git 4 | -------------------------------------------------------------------------------- /util/KeccakSum/base64.h: -------------------------------------------------------------------------------- 1 | /* 2 | Implementation taken from: 3 | 
https://en.wikibooks.org/wiki/Algorithm_Implementation/Miscellaneous/Base64 4 | (2015-12-16) 5 | 6 | Available under the Creative Commons Attribution-ShareAlike License: 7 | https://creativecommons.org/licenses/by-sa/3.0/ 8 | */ 9 | 10 | #include 11 | 12 | int base64encode(const void* data_buf, size_t dataLength, char* result, size_t resultSize); 13 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | 9 | jobs: 10 | build: 11 | strategy: 12 | matrix: 13 | target: ["generic32", "generic64", "generic64noAsm", "plain64"] 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | with: 18 | submodules: recursive 19 | - name: Install xsltproc 20 | run: sudo apt-get install xsltproc 21 | - name: Build 22 | run: make ${{ matrix.target }}/KTtests ${{ matrix.target }}/libKT.a ${{ matrix.target }}/libKT.so 23 | - name: Test 24 | run: bin/${{ matrix.target }}/KTtests -K12 25 | -------------------------------------------------------------------------------- /tests/testPerformance.h: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. 6 | 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 
14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | */ 16 | 17 | #ifndef _testPerformance_h_ 18 | #define _testPerformance_h_ 19 | 20 | void testPerformance(void); 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /tests/testKangarooTwelve.h: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. 6 | 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | */ 16 | 17 | #ifndef _TestKangarooTwelve_h_ 18 | #define _TestKangarooTwelve_h_ 19 | 20 | void testKangarooTwelve(void); 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /lib/Plain64/KeccakP-1600-plain64.c: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. 6 | 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 
14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | 16 | --- 17 | 18 | Please refer to the XKCP for more details. 19 | */ 20 | 21 | const char * KeccakP1600_GetImplementation() 22 | { 23 | return "generic 64-bit implementation"; 24 | } 25 | -------------------------------------------------------------------------------- /lib/KT-threadpool.c: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | Thread pool abstraction layer - common functions. 6 | 7 | To the extent possible under law, the implementer has waived all copyright 8 | and related or neighboring rights to the source code in this file. 9 | http://creativecommons.org/publicdomain/zero/1.0/ 10 | */ 11 | 12 | #include "KT-threadpool.h" 13 | 14 | /* Detect pthread availability */ 15 | #if defined(_POSIX_THREADS) || defined(__unix__) || defined(__unix) || \ 16 | (defined(__APPLE__) && defined(__MACH__)) || defined(__linux__) || \ 17 | defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) 18 | #define HAVE_PTHREADS 1 19 | #endif 20 | 21 | const KT_ThreadPool_API* KT_ThreadPool_GetDefault(void) 22 | { 23 | #ifdef HAVE_PTHREADS 24 | return &KT_ThreadPool_Pthread; 25 | #else 26 | return &KT_ThreadPool_Sequential; 27 | #endif 28 | } 29 | -------------------------------------------------------------------------------- /lib/align.h: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. 6 | 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 
8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | */ 16 | 17 | #ifndef _align_h_ 18 | #define _align_h_ 19 | 20 | #ifdef ALIGN 21 | #undef ALIGN 22 | #endif 23 | 24 | #if defined(__GNUC__) 25 | #define ALIGN(x) __attribute__ ((aligned(x))) 26 | #elif defined(_MSC_VER) 27 | #define ALIGN(x) __declspec(align(x)) 28 | #elif defined(__ARMCC_VERSION) 29 | #define ALIGN(x) __align(x) 30 | #else 31 | #define ALIGN(x) 32 | #endif 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /lib/Inplace32BI/KeccakP-1600-SnP.h: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. 6 | 7 | Implementation by Ronny Van Keer, hereby denoted as "the implementer". 8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | 16 | --- 17 | 18 | Please refer to the XKCP for more details. 
19 | */ 20 | 21 | #ifndef _KeccakP_1600_SnP_h_ 22 | #define _KeccakP_1600_SnP_h_ 23 | 24 | #define KeccakP1600_stateSizeInBytes 200 25 | #define KeccakP1600_stateAlignment 8 26 | #define KeccakP1600_disableParallelism 27 | 28 | const char * KeccakP1600_GetImplementation(); 29 | void KeccakP1600_Initialize(void *state); 30 | void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); 31 | void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); 32 | void KeccakP1600_Permute_12rounds(void *state); 33 | void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /Python/Utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Implementation by Gilles Van Assche and Benoit Viguier, hereby denoted as "the implementers". 3 | # 4 | # For more information, feedback or questions, please refer to our website: 5 | # https://keccak.team/ 6 | # 7 | # To the extent possible under law, the implementers have waived all copyright 8 | # and related or neighboring rights to the source code in this file.
9 | # http://creativecommons.org/publicdomain/zero/1.0/ 10 | 11 | from __future__ import print_function 12 | 13 | def hexString(s): 14 | r = '' 15 | for i in range(len(s)): 16 | if r != '': r = r + ' ' 17 | r = r + "{0:02X}".format(s[i]) 18 | return r 19 | 20 | def hexStringSpecial(s): 21 | if len(s) == 0: 22 | return "`00`^0" 23 | else: 24 | return "`"+hexString(s)+"`" 25 | 26 | def numberStringSpecial(base, exponent): 27 | if exponent == 0: 28 | return "1" 29 | elif exponent == 1: 30 | return "{0:d}".format(base) 31 | else: 32 | return "{0:d}**{1:d}".format(base, exponent) 33 | 34 | def outputHex(s): 35 | for i in range(len(s)): 36 | print("{0:02X}".format(s[i]), end=' ') 37 | if i % 16 == 15: 38 | print() 39 | print() 40 | print() 41 | 42 | def printTestVectorOutput(s): 43 | print(' `', end='') 44 | for i in range(len(s)): 45 | print("{0:02X}".format(s[i]), end=('`' if i == len(s) - 1 else ' ')) 46 | if i % 16 == 15: 47 | print() 48 | print(' ', end='') 49 | print() 50 | -------------------------------------------------------------------------------- /tests/timing.c: -------------------------------------------------------------------------------- 1 | /* 2 | The eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 6 | 7 | For more information, feedback or questions, please refer to the Keccak Team website: 8 | https://keccak.team/ 9 | 10 | To the extent possible under law, the implementer has waived all copyright 11 | and related or neighboring rights to the source code in this file. 
12 | http://creativecommons.org/publicdomain/zero/1.0/ 13 | */ 14 | 15 | #include "timing.h" 16 | 17 | const char * getTimerUnit() 18 | { 19 | #if defined(__aarch64__) 20 | return "ns"; 21 | #else 22 | return "cycles"; 23 | #endif 24 | } 25 | 26 | double timerCorrectionFactor = 1.0; 27 | 28 | static double getTimerCorrectionFactor() 29 | { 30 | #if defined(__aarch64__) 31 | int64_t virtual_timer_freq; 32 | asm volatile("mrs %0, cntfrq_el0" : "=r"(virtual_timer_freq)); 33 | return (double)1.0e9 / (double)virtual_timer_freq; 34 | #else 35 | return 1.0; 36 | #endif 37 | } 38 | 39 | cycles_t CalibrateTimer() 40 | { 41 | cycles_t dtMin = CYCLES_MAX; /* big number to start */ 42 | cycles_t t0,t1,i; 43 | 44 | timerCorrectionFactor = getTimerCorrectionFactor(); 45 | 46 | for (i=0;i < TIMER_SAMPLE_CNT;i++) /* calibrate the overhead for measuring time */ 47 | { 48 | t0 = CycleTimer(); 49 | t1 = CycleTimer(); 50 | if (dtMin > t1-t0) /* keep only the minimum time */ 51 | dtMin = t1-t0; 52 | } 53 | return dtMin; 54 | } 55 | -------------------------------------------------------------------------------- /lib/Plain64/KeccakP-1600-SnP.h: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. 6 | 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | 16 | --- 17 | 18 | Please refer to the XKCP for more details. 
19 | */ 20 | 21 | #ifndef _KeccakP_1600_SnP_h_ 22 | #define _KeccakP_1600_SnP_h_ 23 | 24 | /* Keccak-p[1600] */ 25 | 26 | #define KeccakP1600_stateSizeInBytes 200 27 | #define KeccakP1600_stateAlignment 8 28 | #define KeccakP1600_12rounds_FastLoop_supported 29 | #define KeccakP1600_disableParallelism 30 | 31 | const char * KeccakP1600_GetImplementation(); 32 | void KeccakP1600_Initialize(void *state); 33 | void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); 34 | void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); 35 | void KeccakP1600_Permute_12rounds(void *state); 36 | void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); 37 | size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); 38 | 39 | // Instead of defining proxy functions which do nothing, simply rename the 40 | // symbols of the opt64 implementation where they are used. 41 | #define KeccakP1600_opt64_Initialize KeccakP1600_Initialize 42 | #define KeccakP1600_opt64_AddByte KeccakP1600_AddByte 43 | #define KeccakP1600_opt64_AddBytes KeccakP1600_AddBytes 44 | #define KeccakP1600_opt64_Permute_12rounds KeccakP1600_Permute_12rounds 45 | #define KeccakP1600_opt64_ExtractBytes KeccakP1600_ExtractBytes 46 | #define KeccakP1600_opt64_12rounds_FastLoop_Absorb KeccakP1600_12rounds_FastLoop_Absorb 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /lib/KangarooTwelve-threading.h: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. 
6 | 7 | Threading support implementation using portable thread pool abstraction. 8 | 9 | PLATFORM COMPATIBILITY: 10 | The library uses a portable thread pool abstraction that supports multiple backends: 11 | - Built-in pthread backend (Linux, macOS, BSD, Unix-like systems) 12 | - Built-in sequential backend (all platforms, no actual parallelism) 13 | - Custom application-provided backends (see KT-threadpool.h) 14 | 15 | By default, the pthread backend is used on systems with pthread support, 16 | and the sequential backend is used elsewhere. 17 | 18 | To the extent possible under law, the implementer has waived all copyright 19 | and related or neighboring rights to the source code in this file. 20 | http://creativecommons.org/publicdomain/zero/1.0/ 21 | */ 22 | 23 | #ifndef _KangarooTwelve_threading_h_ 24 | #define _KangarooTwelve_threading_h_ 25 | 26 | #include 27 | #include "KT-threadpool.h" 28 | 29 | /** 30 | * Internal function to process multiple chunks in parallel using threads. 31 | * 32 | * @param threadpool_api Thread pool API implementation 33 | * @param threadpool_handle Thread pool handle 34 | * @param thread_count Number of threads in the pool 35 | * @param input Pointer to input data (multiple chunks) 36 | * @param chunkCount Number of chunks to process 37 | * @param output Pointer to output buffer for chaining values 38 | * @param securityLevel 128 for KT128 or 256 for KT256 39 | * @return 0 if successful, 1 otherwise 40 | */ 41 | int KT_ProcessChunksThreaded(const KT_ThreadPool_API* threadpool_api, 42 | void* threadpool_handle, 43 | int thread_count, 44 | const unsigned char *input, 45 | size_t chunkCount, 46 | unsigned char *output, 47 | int securityLevel); 48 | 49 | #endif /* _KangarooTwelve_threading_h_ */ 50 | -------------------------------------------------------------------------------- /lib/ARMv8Asha3/KeccakP-1600-SnP.h: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended 
Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. 6 | 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | 16 | --- 17 | 18 | Please refer to the XKCP for more details. 19 | */ 20 | 21 | #ifndef _KeccakP_1600_SnP_h_ 22 | #define _KeccakP_1600_SnP_h_ 23 | 24 | /* Keccak-p[1600] */ 25 | 26 | #define KeccakP1600_stateSizeInBytes 200 27 | #define KeccakP1600_stateAlignment 8 28 | #define KeccakP1600_12rounds_FastLoop_supported 29 | 30 | const char * KeccakP1600_GetImplementation(); 31 | void KeccakP1600_Initialize(void *state); 32 | void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); 33 | void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); 34 | void KeccakP1600_Permute_12rounds(void *state); 35 | void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); 36 | size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); 37 | 38 | /* Keccak-p[1600]×2 */ 39 | 40 | int KeccakP1600times2_IsAvailable(); 41 | const char * KeccakP1600times2_GetImplementation(); 42 | void KeccakP1600times2_Permute_12rounds(void *state); 43 | void KT128_Process2Leaves(const unsigned char *input, unsigned char *output); 44 | void KT256_Process2Leaves(const unsigned char *input, unsigned char *output); 45 | 46 | /* Keccak-p[1600]×4 */ 47 | 48 | int KeccakP1600times4_IsAvailable(); 49 | const char * 
KeccakP1600times4_GetImplementation(); 50 | 51 | /* Keccak-p[1600]×8 */ 52 | 53 | int KeccakP1600times8_IsAvailable(); 54 | const char * KeccakP1600times8_GetImplementation(); 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /Python/KangarooTwelve.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Implementation by Gilles Van Assche, hereby denoted as "the implementer". 3 | # 4 | # For more information, feedback or questions, please refer to our website: 5 | # https://keccak.team/ 6 | # 7 | # To the extent possible under law, the implementer has waived all copyright 8 | # and related or neighboring rights to the source code in this file. 9 | # http://creativecommons.org/publicdomain/zero/1.0/ 10 | 11 | from TurboSHAKE import TurboSHAKE128, TurboSHAKE256 12 | from Utils import outputHex 13 | 14 | def length_encode(x): 15 | S = bytearray() 16 | while x > 0: 17 | S = bytearray([x % 256]) + S 18 | x = x//256 19 | S = S + bytearray([len(S)]) 20 | return S 21 | 22 | # inputMessage and customString must be of type byte string or byte array 23 | def KT128(inputMessage, customString, outputByteLen): 24 | S = inputMessage + customString 25 | S = S + length_encode(len(customString)) 26 | 27 | if len(S) <= 8192: 28 | return TurboSHAKE128(S, 0x07, outputByteLen) 29 | else: 30 | # === Kangaroo hopping === 31 | FinalNode = S[0:8192] + bytearray([0x03] + [0x00]*7) 32 | offset = 8192 33 | numBlock = 0 34 | while offset < len(S): 35 | blockSize = min(len(S) - offset, 8192) 36 | CV = TurboSHAKE128(S[offset : offset + blockSize], 0x0B, 32) 37 | FinalNode = FinalNode + CV 38 | numBlock += 1 39 | offset += blockSize 40 | FinalNode = FinalNode + length_encode( numBlock ) + bytearray([0xFF, 0xFF]) 41 | return TurboSHAKE128(FinalNode, 0x06, outputByteLen) 42 | 43 | def KT256(inputMessage, customString, outputByteLen): 44 | S = inputMessage + customString 45 | S =
S + length_encode(len(customString)) 46 | 47 | if len(S) <= 8192: 48 | return TurboSHAKE256(S, 0x07, outputByteLen) 49 | else: 50 | # === Kangaroo hopping === 51 | FinalNode = S[0:8192] + bytearray([0x03] + [0x00]*7) 52 | offset = 8192 53 | numBlock = 0 54 | while offset < len(S): 55 | blockSize = min(len(S) - offset, 8192) 56 | CV = TurboSHAKE256(S[offset : offset + blockSize], 0x0B, 64) 57 | FinalNode = FinalNode + CV 58 | numBlock += 1 59 | offset += blockSize 60 | FinalNode = FinalNode + length_encode( numBlock ) + bytearray([0xFF, 0xFF]) 61 | return TurboSHAKE256(FinalNode, 0x06, outputByteLen) 62 | -------------------------------------------------------------------------------- /tests/main.c: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. 6 | 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 
14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "align.h" 22 | #include "KangarooTwelve.h" 23 | #include "testKangarooTwelve.h" 24 | #include "testPerformance.h" 25 | 26 | #define BENCH1GB 27 | 28 | void printHelp() 29 | { 30 | printf("Usage: KeccakTests command(s), where the commands can be\n"); 31 | printf(" --help or -h To display this page\n"); 32 | printf(" --all or -a All tests\n"); 33 | printf(" --KangarooTwelve or -K12 Tests on KangarooTwelve\n"); 34 | printf(" --speed or -s Speed measuresments\n"); 35 | #ifdef BENCH1GB 36 | printf(" --1GB Just hash 1GB of data and exit\n"); 37 | #endif 38 | } 39 | 40 | #ifdef BENCH1GB 41 | void bench1GB() 42 | { 43 | #define INPUT_SIZE 1000000000 44 | static ALIGN(64) unsigned char input[INPUT_SIZE]; 45 | static ALIGN(64) unsigned char output[32]; 46 | KT128(input, INPUT_SIZE, output, 32, 0, 0); 47 | #undef INPUT_SIZE 48 | } 49 | #endif 50 | 51 | int process(int argc, char* argv[]) 52 | { 53 | int i; 54 | int help = 0; 55 | int KangarooTwelve = 0; 56 | int speed = 0; 57 | 58 | if (argc <= 1) 59 | help = 1; 60 | 61 | #ifdef BENCH1GB 62 | if (argc > 1 && strcmp("--1GB", argv[1]) == 0) { 63 | bench1GB(); 64 | return 0; 65 | } 66 | #endif 67 | 68 | for(i=1; i 18 | 19 | #define MAX_JOBS 256 20 | 21 | /* Sequential pool context */ 22 | typedef struct { 23 | /* Job queue */ 24 | struct { 25 | void (*work_fn)(void*); 26 | void* work_data; 27 | } jobs[MAX_JOBS]; 28 | int job_count; 29 | int valid; /* Marker to detect use-after-free */ 30 | } SequentialPool; 31 | 32 | /* Create sequential pool */ 33 | static void* sequential_create_pool(int num_threads) 34 | { 35 | /* Ignore num_threads - we're always sequential */ 36 | (void)num_threads; 37 | 38 | SequentialPool* pool = (SequentialPool*)malloc(sizeof(SequentialPool)); 39 | if (!pool) 40 | return NULL; 41 | 42 | pool->job_count = 0; 43 | pool->valid = 0x12345678; /* Magic number for validation 
*/ 44 | 45 | return pool; 46 | } 47 | 48 | /* Submit work to sequential pool (just queue it) */ 49 | static int sequential_submit(void* pool_handle, void (*work_fn)(void*), void* work_data) 50 | { 51 | SequentialPool* pool = (SequentialPool*)pool_handle; 52 | if (!pool || pool->valid != 0x12345678 || !work_fn) 53 | return 1; 54 | 55 | if (pool->job_count >= MAX_JOBS) 56 | return 1; /* Job queue full */ 57 | 58 | pool->jobs[pool->job_count].work_fn = work_fn; 59 | pool->jobs[pool->job_count].work_data = work_data; 60 | pool->job_count++; 61 | 62 | return 0; 63 | } 64 | 65 | /* Wait for all work (execute queued jobs sequentially) */ 66 | static void sequential_wait_all(void* pool_handle) 67 | { 68 | SequentialPool* pool = (SequentialPool*)pool_handle; 69 | if (!pool || pool->valid != 0x12345678) 70 | return; 71 | 72 | /* Execute all queued jobs sequentially */ 73 | for (int i = 0; i < pool->job_count; i++) { 74 | if (pool->jobs[i].work_fn) { 75 | pool->jobs[i].work_fn(pool->jobs[i].work_data); 76 | } 77 | } 78 | 79 | /* Reset for next batch */ 80 | pool->job_count = 0; 81 | } 82 | 83 | /* Destroy sequential pool */ 84 | static void sequential_destroy(void* pool_handle) 85 | { 86 | SequentialPool* pool = (SequentialPool*)pool_handle; 87 | if (!pool) 88 | return; 89 | 90 | pool->valid = 0; /* Invalidate */ 91 | free(pool); 92 | } 93 | 94 | /* Export sequential backend API */ 95 | const KT_ThreadPool_API KT_ThreadPool_Sequential = { 96 | .min_input_size_for_threading = 2097152, /* 2 MB default threshold */ 97 | .create = sequential_create_pool, 98 | .submit = sequential_submit, 99 | .wait_all = sequential_wait_all, 100 | .destroy = sequential_destroy 101 | }; 102 | -------------------------------------------------------------------------------- /util/KeccakSum/base64.c: -------------------------------------------------------------------------------- 1 | /* 2 | Implementation taken from: 3 | https://en.wikibooks.org/wiki/Algorithm_Implementation/Miscellaneous/Base64 4 | 
(2015-12-16) 5 | 6 | Available under the Creative Commons Attribution-ShareAlike License: 7 | https://creativecommons.org/licenses/by-sa/3.0/ 8 | */ 9 | 10 | #include 11 | #include 12 | 13 | int base64encode(const void* data_buf, size_t dataLength, char* result, size_t resultSize) 14 | { 15 | const char base64chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 16 | const uint8_t *data = (const uint8_t *)data_buf; 17 | size_t resultIndex = 0; 18 | size_t x; 19 | uint32_t n = 0; 20 | int padCount = dataLength % 3; 21 | uint8_t n0, n1, n2, n3; 22 | 23 | /* increment over the length of the string, three characters at a time */ 24 | for (x = 0; x < dataLength; x += 3) 25 | { 26 | /* these three 8-bit (ASCII) characters become one 24-bit number */ 27 | n = ((uint32_t)data[x]) << 16; /* parenthesis needed, compiler depending on flags can do the shifting before conversion to uint32_t, resulting to 0 */ 28 | 29 | if((x+1) < dataLength) 30 | n += ((uint32_t)data[x+1]) << 8; /* parenthesis needed, compiler depending on flags can do the shifting before conversion to uint32_t, resulting to 0 */ 31 | 32 | if((x+2) < dataLength) 33 | n += data[x+2]; 34 | 35 | /* this 24-bit number gets separated into four 6-bit numbers */ 36 | n0 = (uint8_t)(n >> 18) & 63; 37 | n1 = (uint8_t)(n >> 12) & 63; 38 | n2 = (uint8_t)(n >> 6) & 63; 39 | n3 = (uint8_t)n & 63; 40 | 41 | /* 42 | * if we have one byte available, then its encoding is spread 43 | * out over two characters 44 | */ 45 | if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */ 46 | result[resultIndex++] = base64chars[n0]; 47 | if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */ 48 | result[resultIndex++] = base64chars[n1]; 49 | 50 | /* 51 | * if we have only two bytes available, then their encoding is 52 | * spread out over three chars 53 | */ 54 | if((x+1) < dataLength) 55 | { 56 | if(resultIndex >= resultSize) return 1; /* indicate failure: buffer 
too small */ 57 | result[resultIndex++] = base64chars[n2]; 58 | } 59 | 60 | /* 61 | * if we have all three bytes available, then their encoding is spread 62 | * out over four characters 63 | */ 64 | if((x+2) < dataLength) 65 | { 66 | if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */ 67 | result[resultIndex++] = base64chars[n3]; 68 | } 69 | } 70 | 71 | /* 72 | * create and add padding that is required if we did not have a multiple of 3 73 | * number of characters available 74 | */ 75 | if (padCount > 0) 76 | { 77 | for (; padCount < 3; padCount++) 78 | { 79 | if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */ 80 | result[resultIndex++] = '='; 81 | } 82 | } 83 | if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */ 84 | result[resultIndex] = 0; 85 | return 0; /* indicate success */ 86 | } 87 | -------------------------------------------------------------------------------- /Python/TurboSHAKE256Tests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Implementation by Gilles Van Assche and Benoit Viguier, hereby denoted as "the implementers". 3 | # 4 | # For more information, feedback or questions, please refer to our website: 5 | # https://keccak.team/ 6 | # 7 | # To the extent possible under law, the implementers have waived all copyright 8 | # and related or neighboring rights to the source code in this file.
9 | # http://creativecommons.org/publicdomain/zero/1.0/ 10 | 11 | from __future__ import print_function 12 | from TurboSHAKE import TurboSHAKE256 13 | from Utils import hexString, printTestVectorOutput 14 | 15 | def generateSimpleRawMaterial(length, seed1, seed2): 16 | seed2 = seed2 % 8 17 | return bytes([(seed1 + 161*length - ((i%256) << seed2) - ((i%256) >> (8-seed2)) + i)%256 for i in range(length)]) 18 | 19 | customizationByteSize = 32 20 | 21 | def performTestTurboSHAKE256OneInput(inputLen, outputLen, customLen): 22 | customization = 97 23 | inputMessage = generateSimpleRawMaterial(inputLen, outputLen, inputLen + customLen) 24 | print("outputLen {0:5d}, inputLen {1:5d}, customLen {2:3d}".format(outputLen, inputLen, customLen)) 25 | output = TurboSHAKE256(inputMessage, customization, outputLen) 26 | print("Kangaroo-Twelve") 27 | print("Input of {0:d} bytes:".format(inputLen), end='') 28 | for i in range(min(inputLen, 16)): 29 | print(" {0:02x}".format(inputMessage[i]), end='') 30 | if (inputLen > 16): 31 | print(" ...", end='') 32 | print("") 33 | print("Output of {0:d} bytes:".format(outputLen), end='') 34 | for i in range(outputLen): 35 | print(" {0:02x}".format(output[i]), end='') 36 | print("") 37 | print("") 38 | 39 | def performTestTurboSHAKE256(): 40 | cBlockSize = 8192 41 | outputLen = 256//8 42 | customLen = 0 43 | for inputLen in range(cBlockSize*9+124): 44 | performTestTurboSHAKE256OneInput(inputLen, outputLen, customLen) 45 | 46 | outputLen = 128//8 47 | while(outputLen <= 512//8): 48 | inputLen = 0 49 | while(inputLen <= (3*cBlockSize)): 50 | customLen = 0 51 | while(customLen <= customizationByteSize): 52 | performTestTurboSHAKE256OneInput(inputLen, outputLen, customLen) 53 | customLen += 7 54 | inputLen = (inputLen + 167) if (inputLen > 0) else 1 55 | outputLen = outputLen*2 56 | 57 | def performShortTestTurboSHAKE256(): 58 | cBlockSize = 8192 59 | outputLen = 256//8 60 | customLen = 0 61 | for inputLen in range(4): 62 | 
performTestTurboSHAKE256OneInput(inputLen, outputLen, customLen) 63 | performTestTurboSHAKE256OneInput(27121, outputLen, customLen) 64 | 65 | #performTestTurboSHAKE256() 66 | #performShortTestTurboSHAKE256() 67 | 68 | def printTestVectors(): 69 | print(" TurboSHAKE256(M=`00`^0, D=`1F`, 64):") 70 | printTestVectorOutput(TurboSHAKE256(b'', 0x1F, 64)) 71 | print(" TurboSHAKE256(M=`00`^0, D=`1F`, 10032), last 32 bytes:") 72 | printTestVectorOutput(TurboSHAKE256(b'', 0x1F, 10032)[10000:]) 73 | for i in range(7): 74 | M = bytearray([(j % 251) for j in range(17**i)]) 75 | print(" TurboSHAKE256(M=ptn(17**{0:d} bytes), D=`1F`, 64):".format(i)) 76 | printTestVectorOutput(TurboSHAKE256(M, 0x1F, 64)) 77 | for D in [0x01, 0x06, 0x07, 0x0B, 0x30, 0x7F]: 78 | i = D%3 + 1 79 | M = bytearray([0xFF for j in range(2**i-1)]) 80 | print(" TurboSHAKE256(M=`{0}`, D=`{1:02X}`, 64):".format(hexString(M), D)) 81 | printTestVectorOutput(TurboSHAKE256(M, D, 64)) 82 | 83 | printTestVectors() 84 | -------------------------------------------------------------------------------- /Python/TurboSHAKE128Tests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Implementation by Gilles Van Assche and Benoit Viguier, hereby denoted as "the implementers". 3 | # 4 | # For more information, feedback or questions, please refer to our website: 5 | # https://keccak.team/ 6 | # 7 | # To the extent possible under law, the implementers has waived all copyright 8 | # and related or neighboring rights to the source code in this file. 
9 | # http://creativecommons.org/publicdomain/zero/1.0/ 10 | 11 | from __future__ import print_function 12 | from TurboSHAKE import TurboSHAKE128 13 | from Utils import hexString, printTestVectorOutput 14 | 15 | def generateSimpleRawMaterial(length, seed1, seed2): 16 | seed2 = seed2 % 8 17 | return bytes([(seed1 + 161*length - ((i%256) << seed2) - ((i%256) >> (8-seed2)) + i)%256 for i in range(length)]) 18 | 19 | customizationByteSize = 32 20 | 21 | def performTestTurboSHAKE128OneInput(inputLen, outputLen, customLen): 22 | customization = 97 23 | inputMessage = generateSimpleRawMaterial(inputLen, outputLen, inputLen + customLen) 24 | print("outputLen {0:5d}, inputLen {1:5d}, customLen {2:3d}".format(outputLen, inputLen, customLen)) 25 | output = TurboSHAKE128(inputMessage, customization, outputLen) 26 | print("Kangaroo-Twelve") 27 | print("Input of {0:d} bytes:".format(inputLen), end='') 28 | for i in range(min(inputLen, 16)): 29 | print(" {0:02x}".format(inputMessage[i]), end='') 30 | if (inputLen > 16): 31 | print(" ...", end='') 32 | print("") 33 | print("Output of {0:d} bytes:".format(outputLen), end='') 34 | for i in range(outputLen): 35 | print(" {0:02x}".format(output[i]), end='') 36 | print("") 37 | print("") 38 | 39 | def performTestTurboSHAKE128(): 40 | cBlockSize = 8192 41 | outputLen = 256//8 42 | customLen = 0 43 | for inputLen in range(cBlockSize*9+124): 44 | performTestTurboSHAKE128OneInput(inputLen, outputLen, customLen) 45 | 46 | outputLen = 128//8 47 | while(outputLen <= 512//8): 48 | inputLen = 0 49 | while(inputLen <= (3*cBlockSize)): 50 | customLen = 0 51 | while(customLen <= customizationByteSize): 52 | performTestTurboSHAKE128OneInput(inputLen, outputLen, customLen) 53 | customLen += 7 54 | inputLen = (inputLen + 167) if (inputLen > 0) else 1 55 | outputLen = outputLen*2 56 | 57 | def performShortTestTurboSHAKE128(): 58 | cBlockSize = 8192 59 | outputLen = 256//8 60 | customLen = 0 61 | for inputLen in range(4): 62 | 
performTestTurboSHAKE128OneInput(inputLen, outputLen, customLen) 63 | performTestTurboSHAKE128OneInput(27121, outputLen, customLen) 64 | 65 | #performTestTurboSHAKE128() 66 | #performShortTestTurboSHAKE128() 67 | 68 | def printTestVectors(): 69 | print(" TurboSHAKE128(M=`00`^0, D=`1F`, 32):") 70 | printTestVectorOutput(TurboSHAKE128(b'', 0x1F, 32)) 71 | print(" TurboSHAKE128(M=`00`^0, D=`1F`, 64):") 72 | printTestVectorOutput(TurboSHAKE128(b'', 0x1F, 64)) 73 | print(" TurboSHAKE128(M=`00`^0, D=`1F`, 10032), last 32 bytes:") 74 | printTestVectorOutput(TurboSHAKE128(b'', 0x1F, 10032)[10000:]) 75 | for i in range(7): 76 | M = bytearray([(j % 251) for j in range(17**i)]) 77 | print(" TurboSHAKE128(M=ptn(17**{0:d} bytes), D=`1F`, 32):".format(i)) 78 | printTestVectorOutput(TurboSHAKE128(M, 0x1F, 32)) 79 | for D in [0x01, 0x06, 0x07, 0x0B, 0x30, 0x7F]: 80 | i = D%3 + 1 81 | M = bytearray([0xFF for j in range(2**i-1)]) 82 | print(" TurboSHAKE128(M=`{0}`, D=`{1:02X}`, 32):".format(hexString(M), D)) 83 | printTestVectorOutput(TurboSHAKE128(M, D, 32)) 84 | 85 | printTestVectors() 86 | -------------------------------------------------------------------------------- /lib/Optimized64/KeccakP-1600-SnP.h: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. 6 | 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 
14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | 16 | --- 17 | 18 | Please refer to the XKCP for more details. 19 | */ 20 | 21 | #ifndef _KeccakP_1600_SnP_h_ 22 | #define _KeccakP_1600_SnP_h_ 23 | 24 | /* Keccak-p[1600] */ 25 | 26 | #define KeccakP1600_stateSizeInBytes 200 27 | #define KeccakP1600_stateAlignment 8 28 | #define KeccakP1600_12rounds_FastLoop_supported 29 | 30 | const char * KeccakP1600_GetImplementation(); 31 | void KeccakP1600_Initialize(void *state); 32 | void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); 33 | void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); 34 | void KeccakP1600_Permute_12rounds(void *state); 35 | void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); 36 | size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); 37 | 38 | void KeccakP1600_AVX512_Initialize(void *state); 39 | void KeccakP1600_AVX512_AddByte(void *state, unsigned char data, unsigned int offset); 40 | void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); 41 | void KeccakP1600_AVX512_Permute_12rounds(void *state); 42 | void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); 43 | size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); 44 | 45 | void KeccakP1600_AVX2_Initialize(void *state); 46 | void KeccakP1600_AVX2_AddByte(void *state, unsigned char data, unsigned int offset); 47 | void KeccakP1600_AVX2_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); 48 | void KeccakP1600_AVX2_Permute_12rounds(void *state); 49 | void KeccakP1600_AVX2_ExtractBytes(const void *state, unsigned char *data, unsigned 
int offset, unsigned int length); 50 | size_t KeccakP1600_AVX2_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); 51 | 52 | void KeccakP1600_opt64_Initialize(void *state); 53 | void KeccakP1600_opt64_AddByte(void *state, unsigned char data, unsigned int offset); 54 | void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); 55 | void KeccakP1600_opt64_Permute_12rounds(void *state); 56 | void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); 57 | size_t KeccakP1600_opt64_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); 58 | 59 | /* Keccak-p[1600]×2 */ 60 | 61 | int KeccakP1600times2_IsAvailable(); 62 | const char * KeccakP1600times2_GetImplementation(); 63 | 64 | /* Keccak-p[1600]×4 */ 65 | 66 | int KeccakP1600times4_IsAvailable(); 67 | const char * KeccakP1600times4_GetImplementation(); 68 | 69 | /* Keccak-p[1600]×8 */ 70 | 71 | int KeccakP1600times8_IsAvailable(); 72 | const char * KeccakP1600times8_GetImplementation(); 73 | 74 | #endif 75 | -------------------------------------------------------------------------------- /Python/KangarooTwelveTests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Implementation by Gilles Van Assche, hereby denoted as "the implementer". 3 | # 4 | # For more information, feedback or questions, please refer to our website: 5 | # https://keccak.team/ 6 | # 7 | # To the extent possible under law, the implementer has waived all copyright 8 | # and related or neighboring rights to the source code in this file. 
# http://creativecommons.org/publicdomain/zero/1.0/

from KangarooTwelve import KT128, KT256
from Utils import hexStringSpecial, numberStringSpecial, printTestVectorOutput


def _ptn(length):
    """Return the `length`-byte test pattern in which byte j has the value j mod 251."""
    return bytearray(j % 251 for j in range(length))


def printKT128TestVectors():
    """Print the KT128 test vectors, one labelled entry per (M, C, output length) case."""
    no_bytes = b''

    # Empty message and customization string, at several output lengths.
    print(" KT128(M=`00`^0, C=`00`^0, 32):")
    printTestVectorOutput(KT128(no_bytes, no_bytes, 32))
    print(" KT128(M=`00`^0, C=`00`^0, 64):")
    printTestVectorOutput(KT128(no_bytes, no_bytes, 64))
    print(" KT128(M=`00`^0, C=`00`^0, 10032), last 32 bytes:")
    printTestVectorOutput(KT128(no_bytes, no_bytes, 10032)[10000:])

    # Pattern messages of 17**k bytes with an empty customization string.
    for k in range(7):
        message = _ptn(17**k)
        print(" KT128(M=ptn({0:s} bytes), C=`00`^0, 32):".format(numberStringSpecial(17, k)))
        printTestVectorOutput(KT128(message, no_bytes, 32))

    # Messages of 2**k - 1 bytes of 0xFF with pattern customization strings of 41**k bytes.
    for k in range(4):
        message = bytearray([0xFF] * (2**k - 1))
        custom = _ptn(41**k)
        print(" KT128({0:s}, C=ptn({1:s} bytes), 32):".format(hexStringSpecial(message), numberStringSpecial(41, k)))
        printTestVectorOutput(KT128(message, custom, 32))

    # 8191 bytes of M: right_encode of the empty C is 1 byte, so S is exactly 8192 bytes.
    print(" KT128(M=ptn(8191 bytes), C=`00`^0, 32):")
    printTestVectorOutput(KT128(_ptn(8191), no_bytes, 32))

    # 8192 bytes of M: with the 1-byte right_encode of the empty C, S is 8193 bytes
    # and therefore spills into a new block.
    print(" KT128(M=ptn(8192 bytes), C=`00`^0, 32):")
    printTestVectorOutput(KT128(_ptn(8192), no_bytes, 32))

    # 8192 bytes of M + 8189 bytes of C: right_encode(8189) is 3 bytes, so S is exactly 2 * 8192 bytes.
    # 8192 bytes of M + 8190 bytes of C: right_encode(8190) is 3 bytes, so S is exactly 2 * 8192 + 1 bytes.
    for custom_len in (8189, 8190):
        custom = _ptn(custom_len)
        print(" KT128(M=ptn(8192 bytes), C=ptn({0:d} bytes), 32):".format(custom_len))
        printTestVectorOutput(KT128(_ptn(8192), custom, 32))


def printKT256TestVectors():
    """Print the KT256 test vectors, one labelled entry per (M, C, output length) case."""
    no_bytes = b''

    # Empty message and customization string, at several output lengths.
    print(" KT256(M=`00`^0, C=`00`^0, 64):")
    printTestVectorOutput(KT256(no_bytes, no_bytes, 64))
    print(" KT256(M=`00`^0, C=`00`^0, 128):")
    printTestVectorOutput(KT256(no_bytes, no_bytes, 128))
    print(" KT256(M=`00`^0, C=`00`^0, 10064), last 64 bytes:")
    printTestVectorOutput(KT256(no_bytes, no_bytes, 10064)[10000:])

    # Pattern messages of 17**k bytes with an empty customization string.
    for k in range(7):
        message = _ptn(17**k)
        print(" KT256(M=ptn({0:s} bytes), C=`00`^0, 64):".format(numberStringSpecial(17, k)))
        printTestVectorOutput(KT256(message, no_bytes, 64))

    # Messages of 2**k - 1 bytes of 0xFF with pattern customization strings of 41**k bytes.
    for k in range(4):
        message = bytearray([0xFF] * (2**k - 1))
        custom = _ptn(41**k)
        print(" KT256({0:s}, C=ptn({1:s} bytes), 64):".format(hexStringSpecial(message), numberStringSpecial(41, k)))
        printTestVectorOutput(KT256(message, custom, 64))

    # 8191 bytes of M: right_encode of the empty C is 1 byte, so S is exactly 8192 bytes.
    print(" KT256(M=ptn(8191 bytes), C=`00`^0, 64):")
    printTestVectorOutput(KT256(_ptn(8191), no_bytes, 64))

    # 8192 bytes of M: with the 1-byte right_encode of the empty C, S is 8193 bytes
    # and therefore spills into a new block.
    print(" KT256(M=ptn(8192 bytes), C=`00`^0, 64):")
    printTestVectorOutput(KT256(_ptn(8192), no_bytes, 64))

    # 8192 bytes of M + 8189 bytes of C: right_encode(8189) is 3 bytes, so S is exactly 2 * 8192 bytes.
    # 8192 bytes of M + 8190 bytes of C: right_encode(8190) is 3 bytes, so S is exactly 2 * 8192 + 1 bytes.
    for custom_len in (8189, 8190):
        custom = _ptn(custom_len)
        print(" KT256(M=ptn(8192 bytes), C=ptn({0:d} bytes), 64):".format(custom_len))
        printTestVectorOutput(KT256(_ptn(8192), custom, 64))


printKT128TestVectors()
printKT256TestVectors()
-------------------------------------------------------------------------------- /lib/KT-threadpool.h: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | Thread pool abstraction layer for portable threading support. 6 | 7 | This provides a simple, application-implementable thread pool API that allows 8 | KangarooTwelve to use custom threading implementations or fall back to 9 | sequential execution on platforms without threading support. 10 | 11 | To the extent possible under law, the implementer has waived all copyright 12 | and related or neighboring rights to the source code in this file. 13 | http://creativecommons.org/publicdomain/zero/1.0/ 14 | */ 15 | 16 | #ifndef _KT_threadpool_h_ 17 | #define _KT_threadpool_h_ 18 | 19 | #include 20 | 21 | /** 22 | * Abstract thread pool API. 23 | * 24 | * Applications can implement this interface to provide custom threading 25 | * backends (e.g., Windows thread pool, custom work-stealing scheduler, etc.). 26 | * 27 | * The API is designed for batch job processing: submit multiple jobs, then 28 | * wait for all to complete. This matches the KangarooTwelve tree hashing 29 | * pattern where chunk processing is distributed across threads. 30 | */ 31 | typedef struct KT_ThreadPool_API { 32 | /** 33 | * Minimum input size (in bytes) required to enable parallel processing. 34 | * 35 | * If the total input size is smaller than this threshold, KangarooTwelve 36 | * will not use threading for that particular Update() call, avoiding 37 | * thread overhead for small inputs. 38 | * 39 | * Default: 2097152 (2 MB) 40 | * Rationale: Threading overhead outweighs benefits for small inputs. 41 | * Optimal results typically seen with inputs > 10 MB. 42 | */ 43 | size_t min_input_size_for_threading; 44 | 45 | /** 46 | * Create a thread pool with the specified number of worker threads. 
47 | * 48 | * @param num_threads Number of worker threads to create. 49 | * If 1, implementation may skip thread creation overhead. 50 | * @return Opaque pool handle on success, NULL on failure 51 | * @note This is called once during KangarooTwelve initialization 52 | */ 53 | void* (*create)(int num_threads); 54 | 55 | /** 56 | * Submit work to the thread pool. 57 | * 58 | * The work function will be called with work_data as its argument. 59 | * Multiple jobs may be submitted before calling wait_all(). 60 | * 61 | * @param pool Opaque pool handle from create() 62 | * @param work_fn Function to execute (called as work_fn(work_data)) 63 | * @param work_data Opaque pointer passed to work_fn 64 | * @return 0 on success, non-zero on error 65 | * @note work_fn must be thread-safe and not access shared mutable state 66 | */ 67 | int (*submit)(void* pool, void (*work_fn)(void*), void* work_data); 68 | 69 | /** 70 | * Wait for all submitted work to complete. 71 | * 72 | * Blocks until all jobs submitted since the last wait_all() have finished. 73 | * After this returns, it is safe to submit new work. 74 | * 75 | * @param pool Opaque pool handle from create() 76 | * @note This may be called multiple times to wait for different batches 77 | */ 78 | void (*wait_all)(void* pool); 79 | 80 | /** 81 | * Destroy the thread pool and free all resources. 82 | * 83 | * All work must be complete before calling this (call wait_all() first). 84 | * After destruction, the pool handle must not be used. 85 | * 86 | * @param pool Opaque pool handle from create() 87 | * @note This is called during KangarooTwelve cleanup 88 | */ 89 | void (*destroy)(void* pool); 90 | } KT_ThreadPool_API; 91 | 92 | /** 93 | * Built-in thread pool backend using POSIX threads (pthreads). 94 | * 95 | * Available on Linux, macOS, BSD, and other Unix-like systems with pthread support. 96 | * Provides true parallel execution using worker threads. 97 | * 98 | * This is the default backend on pthread-capable platforms. 
99 | */ 100 | extern const KT_ThreadPool_API KT_ThreadPool_Pthread; 101 | 102 | /** 103 | * Built-in sequential (no-threading) backend. 104 | * 105 | * Available on all platforms. Executes work functions immediately in the 106 | * calling thread (no actual parallelism). Useful as a fallback on platforms 107 | * without threading support or for testing/debugging. 108 | * 109 | * This is the default backend on platforms without pthread support. 110 | */ 111 | extern const KT_ThreadPool_API KT_ThreadPool_Sequential; 112 | 113 | /** 114 | * Get the default thread pool API for the current platform. 115 | * 116 | * Returns pthread backend if available, otherwise sequential backend. 117 | * 118 | * @return Pointer to the default thread pool API 119 | */ 120 | const KT_ThreadPool_API* KT_ThreadPool_GetDefault(void); 121 | 122 | #endif /* _KT_threadpool_h_ */ 123 | -------------------------------------------------------------------------------- /Python/TurboSHAKE.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Implementation by Gilles Van Assche, hereby denoted as "the implementer". 3 | # 4 | # For more information, feedback or questions, please refer to our website: 5 | # https://keccak.team/ 6 | # 7 | # To the extent possible under law, the implementer has waived all copyright 8 | # and related or neighboring rights to the source code in this file. 
# http://creativecommons.org/publicdomain/zero/1.0/

def ROL64(a, n):
    """Rotate the 64-bit lane `a` left by `n` bit positions (`n` is taken modulo 64)."""
    amount = n % 64
    wrapped = a >> (64 - amount)    # bits pushed off the top re-enter at the bottom
    shifted = a << amount
    return (wrapped + shifted) % (1 << 64)

def load64(b):
    """Combine the first 8 entries of `b` into one integer, least-significant byte first."""
    lane = 0
    for index in range(8):
        lane += b[index] << (8 * index)
    return lane

def store64(a):
    """Split `a` into 8 bytes, least-significant byte first, and return them as a bytearray."""
    return bytearray((a >> shift) % 256 for shift in range(0, 64, 8))

def hex2lane(hexstring):
    """Parse whitespace-separated hexadecimal byte tokens and load them as a lane."""
    return load64([int(token, 16) for token in hexstring.split()])

def KP(state):
    """Apply the 12-round permutation to a 200-byte state.

    `state` is a sliceable sequence of 200 byte values (list or bytearray).
    The state is viewed as a 5x5 array of 64-bit lanes and goes through 12
    rounds of theta, rho and pi, chi and iota. The result is returned as a
    new 200-byte bytearray; the argument is not modified.
    """
    round_constants = [hex2lane(text) for text in (
        "8B 80 00 80 00 00 00 00",
        "8B 00 00 00 00 00 00 80",
        "89 80 00 00 00 00 00 80",
        "03 80 00 00 00 00 00 80",
        "02 80 00 00 00 00 00 80",
        "80 00 00 00 00 00 00 80",
        "0A 80 00 00 00 00 00 00",
        "0A 00 00 80 00 00 00 80",
        "81 80 00 80 00 00 00 80",
        "80 80 00 00 00 00 00 80",
        "01 00 00 80 00 00 00 00",
        "08 80 00 80 00 00 00 80",
    )]

    # lanes[x][y] is the lane stored at byte offset 8*(x + 5*y) of the state.
    lanes = [[load64(state[8*(x + 5*y):8*(x + 5*y) + 8]) for y in range(5)]
             for x in range(5)]

    for round_constant in round_constants:
        # theta
        parity = [lanes[x][0] ^ lanes[x][1] ^ lanes[x][2] ^ lanes[x][3] ^ lanes[x][4]
                  for x in range(5)]
        effect = [parity[(x + 4) % 5] ^ ROL64(parity[(x + 1) % 5], 1)
                  for x in range(5)]
        for x in range(5):
            for y in range(5):
                lanes[x][y] ^= effect[x]

        # rho and pi
        x, y = 1, 0
        carried = lanes[x][y]
        for t in range(24):
            x, y = y, (2*x + 3*y) % 5
            displaced = lanes[x][y]
            lanes[x][y] = ROL64(carried, (t + 1)*(t + 2)//2)
            carried = displaced

        # chi
        for y in range(5):
            row = [lanes[x][y] for x in range(5)]
            for x in range(5):
                lanes[x][y] = row[x] ^ ((~row[(x + 1) % 5]) & row[(x + 2) % 5])

        # iota
        lanes[0][0] ^= round_constant

    permuted = bytearray()
    for y in range(5):
        for x in range(5):
            permuted += store64(lanes[x][y])

    return permuted

def XOR(state1, state2):
    """Return the element-wise XOR of two byte sequences as a list, truncated to the shorter one."""
    return [left ^ right for left, right in zip(state1, state2)]

def _turboshake(rate, message, separationByte, outputByteLen):
    """Sponge shared by TurboSHAKE128 and TurboSHAKE256.

    `rate` is the number of state bytes absorbed or squeezed per permutation
    call. `separationByte` is appended to `message` before absorbing.
    Returns a bytearray of `outputByteLen` bytes.
    """
    padded = list(message) + [separationByte]
    untouched = [0x00] * (200 - rate)   # state bytes beyond the rate
    state = [0x00] * 200
    offset = 0

    # === Absorb complete blocks ===
    # The final block (1 to `rate` bytes, separation byte included) is kept for below.
    while len(padded) - offset > rate:
        state = KP(XOR(state, padded[offset:offset + rate] + untouched))
        offset += rate

    # === Absorb last block and treatment of padding ===
    last_block = padded[offset:]
    state = XOR(state, last_block + [0x00] * (200 - len(last_block)))
    state = XOR(state, [0x00] * (rate - 1) + [0x80] + untouched)
    state = KP(state)

    # === Squeeze ===
    output = bytearray()
    remaining = outputByteLen
    while remaining > rate:
        output += state[:rate]
        remaining -= rate
        state = KP(state)

    output += state[:remaining]
    return output

def TurboSHAKE128(message, separationByte, outputByteLen):
    """Return `outputByteLen` bytes of TurboSHAKE128 output (168-byte rate) as a bytearray."""
    return _turboshake(168, message, separationByte, outputByteLen)

def TurboSHAKE256(message, separationByte, outputByteLen):
    """Return `outputByteLen` bytes of TurboSHAKE256 output (136-byte rate) as a bytearray."""
    return _turboshake(136, message, separationByte, outputByteLen)
-------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # What is KangarooTwelve ? 2 | 3 | [**KangarooTwelve**][k12] is a family of two (**KT128** and **KT256**) fast and secure extendable-output functions (XOF), the generalization of hash functions to arbitrary output lengths. 4 | Derived from Keccak, they aim at higher speeds than FIPS 202's SHA-3 and SHAKE functions, while retaining their flexibility and basis of security. 5 | 6 | On high-end platforms, they can exploit a high degree of parallelism, whether using multiple cores or the single-instruction multiple-data (SIMD) instruction set of modern processors. 7 | On Intel's Haswell and Skylake architectures, KT128 tops at less than 1.5 cycles/byte for long messages on a single core, and at 0.51 cycles/byte on the SkylakeX and Cascade Lake architectures. 8 | On the latest Apple A14 and M1 processors, KangarooTwelve can take advantage of the ARMv8-A's SHA-3 dedicated instructions and KT128 delivers 0.75 cycles/byte for long messages on a single core. 9 | On low-end platforms, as well as for short messages, KT128 also benefits from about a factor two speed-up compared to the fastest FIPS 202 instance SHAKE128. 10 | 11 | More details can be found in our [ACNS paper][eprint] (KT128 only) and in [RFC 9861][rfc9861]. 12 | 13 | # What can I find here? 14 | 15 | This repository contains source code that implements the extendable output (or hash) functions **KT128** and **KT256**. 16 | Its purpose is to offer optimized implementations of KangarooTwelve and nothing else. 17 | 18 | The code comes from the [**eXtended Keccak Code Package**][xkcp] (or **XKCP**), after much trimming to keep only what is needed for KT. 19 | It is still structured like the XKCP in two layers. 
The lower layer implements the permutation Keccak-_p_[1600, 12] and possibly parallel versions thereof, whereas the higher layer implements the sponge construction and the tree hash mode. 20 | Also, some sources have been merged to reduce the file count. 21 | 22 | * For the higher layer, we kept only the code needed for KT. 23 | * For the lower layer, we removed all the functions that are not needed for KT. The lower layer therefore implements a subset of the SnP and PlSnP interfaces. 24 | 25 | For Keccak or Xoodoo-based functions other than KT128 and KT256, it is recommended to use the XKCP itself instead and not to mix both this repository and the XKCP. 26 | 27 | 28 | # Is there a tool to compute the hash of a file? 29 | 30 | Not in this repository, but Jack O'Connor's [`kangarootwelve_xkcp.rs` repository](https://github.com/oconnor663/kangarootwelve_xkcp.rs) contains Rust bindings to this code and a `k12sum` utility. 31 | Pre-built binaries can be found [there](https://github.com/oconnor663/kangarootwelve_xkcp.rs/releases). 32 | 33 | 34 | # How can I build this code? 35 | 36 | This repository uses the same build system as that of the XKCP. 37 | To build, the following tools are needed: 38 | 39 | * *GCC* 40 | * *GNU make* 41 | * *xsltproc* 42 | 43 | The different targets are defined in [`Makefile.build`](Makefile.build). This file is expanded into a regular makefile using *xsltproc*. To use it, simply type, e.g., 44 | 45 | ``` 46 | make generic64/K12Tests 47 | ``` 48 | 49 | to build K12Tests generically optimized for 64-bit platforms. The name before the slash indicates the platform, while the part after the slash is the executable to build. As another example, the static (resp. dynamic) library is built by typing `make generic64/libK12.a` (resp. `.so`) or similarly with `generic64` replaced with the appropriate platform name. An alternate C compiler can be specified via the `CC` environment variable. 
50 | 51 | Instead of building an executable with *GCC*, one can choose to select the files needed and make a package. For this, simply append `.pack` to the target name, e.g., 52 | 53 | ``` 54 | make generic64/K12Tests.pack 55 | ``` 56 | 57 | This creates a `.tar.gz` archive with all the necessary files to build the given target. 58 | 59 | The list of targets can be found at the end of [`Makefile.build`](Makefile.build) or by running `make` without parameters. 60 | 61 | ## Microsoft Visual Studio support 62 | 63 | KangarooTwelve can be compiled with Microsoft Visual Studio (MSVC). The XKCP build system offers support for the creation of project files. To get a project file for a given target, simply append `.vcxproj` to the target name, e.g., 64 | 65 | ``` 66 | make generic64noAsm/K12Tests.vcxproj 67 | ``` 68 | 69 | The targets `generic32` and `generic64noAsm` can be used with MSVC, but not `generic64` as it contains assembly implementations in the GCC syntax, which at this point cannot be used with MSVC. 70 | Please refer to the documentation of [XKCP][xkcp] for more details on the limitations of the support of MSVC. 
71 | 72 | [k12]: https://keccak.team/kangarootwelve.html 73 | [xkcp]: https://github.com/XKCP/XKCP 74 | [eprint]: https://eprint.iacr.org/2016/770.pdf 75 | [rfc9861]: https://datatracker.ietf.org/doc/rfc9861/ 76 | 77 | 78 | # Acknowledgments 79 | 80 | We wish to thank: 81 | 82 | - Andy Polyakov for his expertise with the ARMv8-A+SHA3 code, and in particular for his core routine from [CRYPTOGAMS](https://github.com/dot-asm/cryptogams) 83 | - Duc Tri Nguyen for his benchmark on the Apple M1 84 | - Jack O'Connor for bug fixes and more importantly for his [Rust bindings](https://github.com/oconnor663/kangarootwelve_xkcp.rs) 85 | - Kent Ross for his contributions to this code and its quality 86 | - Hadi El Yakhni for adding KT256 87 | -------------------------------------------------------------------------------- /Makefile.build: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 19 | 20 | -fomit-frame-pointer 21 | -O2 22 | -g0 23 | 24 | 25 | 26 | 27 | 28 | lib/Inplace32BI/KeccakP-1600-inplace32BI.c 29 | lib/Inplace32BI/KeccakP-1600-SnP.h 30 | 31 | 32 | 33 | lib/Optimized64/KeccakP-1600-opt64.c 34 | lib/Optimized64/KeccakP-1600-SnP.h 35 | lib/Optimized64/KeccakP-1600-AVX2.s 36 | lib/Optimized64/KeccakP-1600-AVX512.s 37 | lib/Optimized64/KeccakP-1600-timesN-SSSE3.c 38 | lib/Optimized64/KeccakP-1600-timesN-AVX2.c 39 | lib/Optimized64/KeccakP-1600-timesN-AVX512.c 40 | lib/Optimized64/KeccakP-1600-runtimeDispatch.c 41 | 42 | 43 | 44 | lib/Optimized64/KeccakP-1600-opt64.c 45 | lib/Optimized64/KeccakP-1600-AVX512-plainC.c 46 | lib/Optimized64/KeccakP-1600-SnP.h 47 | lib/Optimized64/KeccakP-1600-timesN-SSSE3.c 48 | lib/Optimized64/KeccakP-1600-timesN-AVX2.c 49 | lib/Optimized64/KeccakP-1600-timesN-AVX512.c 50 | lib/Optimized64/KeccakP-1600-runtimeDispatch.c 51 | KeccakP1600_noAssembly 52 | 53 | 54 | 55 | lib/Optimized64/KeccakP-1600-opt64.c 56 | lib/Plain64/KeccakP-1600-plain64.c 57 | lib/Plain64/KeccakP-1600-SnP.h 58 | 59 | 60 
| 61 | lib/Optimized64/KeccakP-1600-opt64.c 62 | lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S 63 | lib/ARMv8Asha3/KeccakP-1600-runtimeDispatch.c 64 | lib/ARMv8Asha3/KeccakP-1600-SnP.h 65 | 66 | 67 | 68 | 69 | 70 | XKCP_has_KangarooTwelve 71 | lib/align.h 72 | lib/KangarooTwelve.c 73 | lib/KangarooTwelve.h 74 | lib/KT-threadpool.h 75 | lib/KT-threadpool.c 76 | lib/KT-threadpool-pthread.c 77 | lib/KT-threadpool-sequential.c 78 | lib/KangarooTwelve-threading.c 79 | lib/KangarooTwelve-threading.h 80 | -pthread 81 | 82 | 83 | 84 | 85 | 86 | lib/align.h 87 | lib/brg_endian.h 88 | 89 | 90 | 91 | 92 | tests/main.c 93 | tests/testPerformance.c 94 | tests/timing.c 95 | tests/timing.h 96 | tests/testPerformance.h 97 | tests/testKangarooTwelve.c 98 | tests/testKangarooTwelve.h 99 | -lm 100 | KeccakP1600_enable_simd_options 101 | 102 | 103 | 104 | 105 | util/KeccakSum/KeccakSum.c 106 | util/KeccakSum/base64.c 107 | util/KeccakSum/base64.h 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | notifications: 2 | email: 3 | recipients: 4 | - gilles-travis@noekeon.org 5 | on_failure: always 6 | 7 | language: c 8 | 9 | sudo: required 10 | 11 | before_install: 12 | - |- 13 | case $TRAVIS_OS_NAME in 14 | linux) 15 | sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test 16 | sudo apt-get -qq update 17 | sudo apt-get install xsltproc 18 | ;; 19 | windows) 20 | choco install -y make 21 | choco install -y xsltproc 22 | ;; 23 | esac 24 | 25 | jobs: 26 | allow_failures: 27 | - script: make all 28 | - script: make generic32/K12Tests && ./bin/generic32/K12Tests -a 29 | - script: make generic64/K12Tests && ./bin/generic64/K12Tests -a 30 | - script: make generic64noAsm/K12Tests && 
./bin/generic64noAsm/K12Tests -a 31 | - script: make plain64/K12Tests && ./bin/plain64/K12Tests -a 32 | include: 33 | - stage: "Tests on AMD64 (windows)" 34 | script: make generic32/K12Tests && ./bin/generic32/K12Tests -a 35 | os: windows 36 | arch: amd64 37 | compiler: gcc 38 | name: "generic32 (gcc)" 39 | - script: make generic64/K12Tests && ./bin/generic64/K12Tests -a 40 | os: windows 41 | arch: amd64 42 | compiler: gcc 43 | name: "generic64 (gcc)" 44 | - script: make generic64noAsm/K12Tests && ./bin/generic64noAsm/K12Tests -a 45 | os: windows 46 | arch: amd64 47 | compiler: gcc 48 | name: "generic64noAsm (gcc)" 49 | - script: make plain64/K12Tests && ./bin/plain64/K12Tests -a 50 | os: windows 51 | arch: amd64 52 | compiler: gcc 53 | name: "plain64 (gcc)" 54 | - script: make all 55 | os: windows 56 | arch: amd64 57 | compiler: gcc 58 | name: "all (gcc)" 59 | - script: make generic32/K12Tests && ./bin/generic32/K12Tests -a 60 | os: windows 61 | arch: amd64 62 | compiler: clang 63 | name: "generic32 (clang)" 64 | - script: make generic64/K12Tests && ./bin/generic64/K12Tests -a 65 | os: windows 66 | arch: amd64 67 | compiler: clang 68 | name: "generic64 (clang)" 69 | - script: make generic64noAsm/K12Tests && ./bin/generic64noAsm/K12Tests -a 70 | os: windows 71 | arch: amd64 72 | compiler: clang 73 | name: "generic64noAsm (clang)" 74 | - script: make plain64/K12Tests && ./bin/plain64/K12Tests -a 75 | os: windows 76 | arch: amd64 77 | compiler: clang 78 | name: "plain64 (clang)" 79 | - script: make all 80 | os: windows 81 | arch: amd64 82 | compiler: clang 83 | name: "all (clang)" 84 | 85 | - stage: "Tests on AMD64 (linux)" 86 | script: make generic32/K12Tests && ./bin/generic32/K12Tests -a 87 | os: linux 88 | arch: amd64 89 | compiler: gcc 90 | name: "generic32 (gcc)" 91 | - script: make generic64/K12Tests && ./bin/generic64/K12Tests -a 92 | os: linux 93 | arch: amd64 94 | compiler: gcc 95 | name: "generic64 (gcc)" 96 | - script: make generic64noAsm/K12Tests && 
./bin/generic64noAsm/K12Tests -a 97 | os: linux 98 | arch: amd64 99 | compiler: gcc 100 | name: "generic64noAsm (gcc)" 101 | - script: make plain64/K12Tests && ./bin/plain64/K12Tests -a 102 | os: linux 103 | arch: amd64 104 | compiler: gcc 105 | name: "plain64 (gcc)" 106 | - script: make all 107 | os: linux 108 | arch: amd64 109 | compiler: gcc 110 | name: "all (gcc)" 111 | - script: make generic32/K12Tests && ./bin/generic32/K12Tests -a 112 | os: linux 113 | arch: amd64 114 | compiler: clang 115 | name: "generic32 (clang)" 116 | - script: make generic64/K12Tests && ./bin/generic64/K12Tests -a 117 | os: linux 118 | arch: amd64 119 | compiler: clang 120 | name: "generic64 (clang)" 121 | - script: make generic64noAsm/K12Tests && ./bin/generic64noAsm/K12Tests -a 122 | os: linux 123 | arch: amd64 124 | compiler: clang 125 | name: "generic64noAsm (clang)" 126 | - script: make plain64/K12Tests && ./bin/plain64/K12Tests -a 127 | os: linux 128 | arch: amd64 129 | compiler: clang 130 | name: "plain64 (clang)" 131 | - script: make all 132 | os: linux 133 | arch: amd64 134 | compiler: clang 135 | name: "all (clang)" 136 | 137 | - stage: "Tests on AMD64 (osx)" 138 | script: make generic32/K12Tests && ./bin/generic32/K12Tests -a 139 | os: osx 140 | arch: amd64 141 | compiler: gcc 142 | name: "generic32 (gcc)" 143 | - script: make generic64/K12Tests && ./bin/generic64/K12Tests -a 144 | os: osx 145 | arch: amd64 146 | compiler: gcc 147 | name: "generic64 (gcc)" 148 | - script: make generic64noAsm/K12Tests && ./bin/generic64noAsm/K12Tests -a 149 | os: osx 150 | arch: amd64 151 | compiler: gcc 152 | name: "generic64noAsm (gcc)" 153 | - script: make plain64/K12Tests && ./bin/plain64/K12Tests -a 154 | os: osx 155 | arch: amd64 156 | compiler: gcc 157 | name: "plain64 (gcc)" 158 | - script: make all 159 | os: osx 160 | arch: amd64 161 | compiler: gcc 162 | name: "all (gcc)" 163 | - script: make generic32/K12Tests && ./bin/generic32/K12Tests -a 164 | os: osx 165 | arch: amd64 166 | 
compiler: clang 167 | name: "generic32 (clang)" 168 | - script: make generic64/K12Tests && ./bin/generic64/K12Tests -a 169 | os: osx 170 | arch: amd64 171 | compiler: clang 172 | name: "generic64 (clang)" 173 | - script: make generic64noAsm/K12Tests && ./bin/generic64noAsm/K12Tests -a 174 | os: osx 175 | arch: amd64 176 | compiler: clang 177 | name: "generic64noAsm (clang)" 178 | - script: make plain64/K12Tests && ./bin/plain64/K12Tests -a 179 | os: osx 180 | arch: amd64 181 | compiler: clang 182 | name: "plain64 (clang)" 183 | - script: make all 184 | os: osx 185 | arch: amd64 186 | compiler: clang 187 | name: "all (clang)" 188 | -------------------------------------------------------------------------------- /lib/brg_endian.h: -------------------------------------------------------------------------------- 1 | /* 2 | --------------------------------------------------------------------------- 3 | Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. 4 | 5 | LICENSE TERMS 6 | 7 | The redistribution and use of this software (with or without changes) 8 | is allowed without the payment of fees or royalties provided that: 9 | 10 | 1. source code distributions include the above copyright notice, this 11 | list of conditions and the following disclaimer; 12 | 13 | 2. binary distributions include the above copyright notice, this list 14 | of conditions and the following disclaimer in their documentation; 15 | 16 | 3. the name of the copyright holder is not used to endorse products 17 | built using this software without specific written permission. 18 | 19 | DISCLAIMER 20 | 21 | This software is provided 'as is' with no explicit or implied warranties 22 | in respect of its properties, including, but not limited to, correctness 23 | and/or fitness for purpose. 
24 | --------------------------------------------------------------------------- 25 | Issue Date: 20/12/2007 26 | Changes for ARM 9/9/2010 27 | */ 28 | 29 | #ifndef _BRG_ENDIAN_H 30 | #define _BRG_ENDIAN_H 31 | 32 | #define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ 33 | #define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ 34 | 35 | #if 0 36 | /* Include files where endian defines and byteswap functions may reside */ 37 | #if defined( __sun ) 38 | # include 39 | #elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) 40 | # include 41 | #elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ 42 | defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) 43 | # include 44 | #elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) 45 | # if !defined( __MINGW32__ ) && !defined( _AIX ) 46 | # include 47 | # if !defined( __BEOS__ ) 48 | # include 49 | # endif 50 | # endif 51 | #endif 52 | #endif 53 | 54 | /* Now attempt to set the define for platform byte order using any */ 55 | /* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ 56 | /* seem to encompass most endian symbol definitions */ 57 | 58 | #if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) 59 | # if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN 60 | # define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN 61 | # elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN 62 | # define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN 63 | # endif 64 | #elif defined( BIG_ENDIAN ) 65 | # define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN 66 | #elif defined( LITTLE_ENDIAN ) 67 | # define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN 68 | #endif 69 | 70 | #if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) 71 | # if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN 72 | # define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN 73 | # elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN 74 | # define PLATFORM_BYTE_ORDER 
IS_LITTLE_ENDIAN 75 | # endif 76 | #elif defined( _BIG_ENDIAN ) 77 | # define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN 78 | #elif defined( _LITTLE_ENDIAN ) 79 | # define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN 80 | #endif 81 | 82 | #if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) 83 | # if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN 84 | # define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN 85 | # elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN 86 | # define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN 87 | # endif 88 | #elif defined( __BIG_ENDIAN ) 89 | # define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN 90 | #elif defined( __LITTLE_ENDIAN ) 91 | # define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN 92 | #endif 93 | 94 | #if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) 95 | # if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ 96 | # define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN 97 | # elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ 98 | # define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN 99 | # endif 100 | #elif defined( __BIG_ENDIAN__ ) 101 | # define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN 102 | #elif defined( __LITTLE_ENDIAN__ ) 103 | # define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN 104 | #endif 105 | 106 | /* if the platform byte order could not be determined, then try to */ 107 | /* set this define using common machine defines */ 108 | #if !defined(PLATFORM_BYTE_ORDER) 109 | 110 | #if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ 111 | defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ 112 | defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ 113 | defined( vax ) || defined( vms ) || defined( VMS ) || \ 114 | defined( __VMS ) || defined( _M_X64 ) 115 | # define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN 116 | 117 | #elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ 118 | defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ 119 | defined( ibm370 ) || 
defined( mc68000 ) || defined( m68k ) || \ 120 | defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ 121 | defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ 122 | defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ 123 | defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) || \ 124 | defined( __s390__ ) || defined( __s390x__ ) || defined( __zarch__ ) 125 | # define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN 126 | 127 | #elif defined(__arm__) 128 | # ifdef __BIG_ENDIAN 129 | # define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN 130 | # else 131 | # define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN 132 | # endif 133 | #elif 1 /* **** EDIT HERE IF NECESSARY **** */ 134 | # define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN 135 | #elif 0 /* **** EDIT HERE IF NECESSARY **** */ 136 | # define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN 137 | #else 138 | # error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order 139 | #endif 140 | 141 | #endif 142 | 143 | #endif 144 | -------------------------------------------------------------------------------- /lib/ARMv8Asha3/KeccakP-1600-opt64.c: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. 6 | 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | 16 | --- 17 | 18 | Please refer to the XKCP for more details. 
19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | const char * KeccakP1600_GetImplementation() 27 | { 28 | return "ARMv8-A+SHA3 optimized implementation"; 29 | } 30 | 31 | /* ---------------------------------------------------------------- */ 32 | 33 | void KeccakP1600_opt64_Initialize(void *state) 34 | { 35 | memset(state, 0, 200); 36 | } 37 | 38 | /* ---------------------------------------------------------------- */ 39 | 40 | void KeccakP1600_opt64_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length) 41 | { 42 | uint64_t lane; 43 | 44 | if (length == 0) 45 | return; 46 | if (length == 1) 47 | lane = data[0]; 48 | else { 49 | lane = 0; 50 | memcpy(&lane, data, length); 51 | } 52 | lane <<= offset*8; 53 | ((uint64_t*)state)[lanePosition] ^= lane; 54 | } 55 | 56 | /* ---------------------------------------------------------------- */ 57 | 58 | static void KeccakP1600_opt64_AddLanes(void *state, const unsigned char *data, unsigned int laneCount) 59 | { 60 | unsigned int i = 0; 61 | 62 | for( ; (i+8)<=laneCount; i+=8) { 63 | ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0]; 64 | ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1]; 65 | ((uint64_t*)state)[i+2] ^= ((uint64_t*)data)[i+2]; 66 | ((uint64_t*)state)[i+3] ^= ((uint64_t*)data)[i+3]; 67 | ((uint64_t*)state)[i+4] ^= ((uint64_t*)data)[i+4]; 68 | ((uint64_t*)state)[i+5] ^= ((uint64_t*)data)[i+5]; 69 | ((uint64_t*)state)[i+6] ^= ((uint64_t*)data)[i+6]; 70 | ((uint64_t*)state)[i+7] ^= ((uint64_t*)data)[i+7]; 71 | } 72 | for( ; (i+4)<=laneCount; i+=4) { 73 | ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0]; 74 | ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1]; 75 | ((uint64_t*)state)[i+2] ^= ((uint64_t*)data)[i+2]; 76 | ((uint64_t*)state)[i+3] ^= ((uint64_t*)data)[i+3]; 77 | } 78 | for( ; (i+2)<=laneCount; i+=2) { 79 | ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0]; 80 | ((uint64_t*)state)[i+1] ^= 
((uint64_t*)data)[i+1]; 81 | } 82 | if (i 0) { \ 112 | unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ 113 | if (_bytesInLane > _sizeLeft) \ 114 | _bytesInLane = _sizeLeft; \ 115 | SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ 116 | _sizeLeft -= _bytesInLane; \ 117 | _lanePosition++; \ 118 | _offsetInLane = 0; \ 119 | _curData += _bytesInLane; \ 120 | } \ 121 | } \ 122 | } 123 | 124 | void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) 125 | { 126 | SnP_AddBytes(state, data, offset, length, KeccakP1600_opt64_AddLanes, KeccakP1600_opt64_AddBytesInLane, 8); 127 | } 128 | 129 | /* ---------------------------------------------------------------- */ 130 | 131 | void KeccakP1600_opt64_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length) 132 | { 133 | uint64_t lane = ((uint64_t*)state)[lanePosition]; 134 | { 135 | uint64_t lane1[1]; 136 | lane1[0] = lane; 137 | memcpy(data, (uint8_t*)lane1+offset, length); 138 | } 139 | } 140 | 141 | /* ---------------------------------------------------------------- */ 142 | 143 | void KeccakP1600_opt64_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount) 144 | { 145 | memcpy(data, state, laneCount*8); 146 | } 147 | 148 | /* ---------------------------------------------------------------- */ 149 | 150 | #define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \ 151 | { \ 152 | if ((offset) == 0) { \ 153 | SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \ 154 | SnP_ExtractBytesInLane(state, \ 155 | (length)/SnP_laneLengthInBytes, \ 156 | (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \ 157 | 0, \ 158 | (length)%SnP_laneLengthInBytes); \ 159 | } \ 160 | else { \ 161 | unsigned int _sizeLeft = (length); \ 162 | unsigned int 
_lanePosition = (offset)/SnP_laneLengthInBytes; \ 163 | unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \ 164 | unsigned char *_curData = (data); \ 165 | while(_sizeLeft > 0) { \ 166 | unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ 167 | if (_bytesInLane > _sizeLeft) \ 168 | _bytesInLane = _sizeLeft; \ 169 | SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ 170 | _sizeLeft -= _bytesInLane; \ 171 | _lanePosition++; \ 172 | _offsetInLane = 0; \ 173 | _curData += _bytesInLane; \ 174 | } \ 175 | } \ 176 | } 177 | 178 | void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) 179 | { 180 | SnP_ExtractBytes(state, data, offset, length, KeccakP1600_opt64_ExtractLanes, KeccakP1600_opt64_ExtractBytesInLane, 8); 181 | } 182 | 183 | /* ---------------------------------------------------------------- */ 184 | 185 | /* Keccak-p[1600]×2 */ 186 | 187 | int KeccakP1600times2_IsAvailable() 188 | { 189 | return 1; 190 | } 191 | 192 | const char * KeccakP1600times2_GetImplementation() 193 | { 194 | return "ARMv8-A+SHA3 optimized implementation"; 195 | } 196 | 197 | /* Keccak-p[1600]×4 */ 198 | 199 | int KeccakP1600times4_IsAvailable() 200 | { 201 | return 0; 202 | } 203 | 204 | const char * KeccakP1600times4_GetImplementation() 205 | { 206 | return ""; 207 | } 208 | 209 | void KT128_Process4Leaves(const unsigned char *input, unsigned char *output) 210 | { 211 | } 212 | 213 | void KT256_Process4Leaves(const unsigned char *input, unsigned char *output) 214 | { 215 | } 216 | 217 | /* Keccak-p[1600]×8 */ 218 | 219 | int KeccakP1600times8_IsAvailable() 220 | { 221 | return 0; 222 | } 223 | 224 | const char * KeccakP1600times8_GetImplementation() 225 | { 226 | return ""; 227 | } 228 | 229 | void KT128_Process8Leaves(const unsigned char *input, unsigned char *output) 230 | { 231 | } 232 | 233 | void KT256_Process8Leaves(const unsigned char *input, unsigned 
char *output) 234 | { 235 | } 236 | -------------------------------------------------------------------------------- /tests/testPerformance.c: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. 6 | 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "align.h" 24 | #include "KangarooTwelve.h" 25 | #include "KeccakP-1600-SnP.h" 26 | #include "timing.h" 27 | #include "testPerformance.h" 28 | 29 | #if !defined(__x86_64__) && !defined(_M_X64) && !defined(__i386__) && !defined(_M_IX86) 30 | #undef KeccakP1600_enable_simd_options 31 | #endif 32 | 33 | #define BIG_BUFFER_SIZE (2*1024*1024) 34 | ALIGN(64) uint8_t bigBuffer[BIG_BUFFER_SIZE]; 35 | 36 | cycles_t measurePerformance(int (*impl)(const unsigned char*, size_t, 37 | unsigned char*, size_t, 38 | const unsigned char*, size_t), 39 | cycles_t dtMin, unsigned int inputLen) 40 | { 41 | ALIGN(64) unsigned char output[32]; 42 | measureTimingDeclare 43 | 44 | assert(inputLen <= BIG_BUFFER_SIZE); 45 | 46 | memset(bigBuffer, 0xA5, 16); 47 | 48 | measureTimingBeginDeclared 49 | impl(bigBuffer, inputLen, output, 32, (const unsigned char *)"", 0); 50 | measureTimingEnd 51 | } 52 | 53 | #if defined(KeccakP1600_enable_simd_options) && !defined(KeccakP1600_disableParallelism) 54 | void 
KangarooTwelve_SetProcessorCapabilities(); 55 | #endif 56 | 57 | void printKangarooTwelvePerformanceHeader(int securityLevel) 58 | { 59 | #if defined(KeccakP1600_enable_simd_options) && !defined(KeccakP1600_disableParallelism) 60 | KangarooTwelve_SetProcessorCapabilities(); 61 | #endif 62 | printf("*** KT%d ***\n", securityLevel); 63 | printf("Using Keccak-p[1600,12] implementations:\n"); 64 | printf("- \303\2271: %s\n", KeccakP1600_GetImplementation()); 65 | #if defined(KeccakP1600_12rounds_FastLoop_supported) 66 | printf(" + KeccakP1600_12rounds_FastLoop_Absorb()\n"); 67 | #endif 68 | 69 | #ifndef KeccakP1600_disableParallelism 70 | if (KeccakP1600times2_IsAvailable()) { 71 | printf("- \303\2272: %s\n", KeccakP1600times2_GetImplementation()); 72 | #if defined(KeccakP1600times2_12rounds_FastLoop_supported) 73 | printf(" + KeccakP1600times2_12rounds_FastLoop_Absorb()\n"); 74 | #endif 75 | } 76 | else 77 | printf("- \303\2272: not used\n"); 78 | 79 | if (KeccakP1600times4_IsAvailable()) { 80 | printf("- \303\2274: %s\n", KeccakP1600times4_GetImplementation()); 81 | #if defined(KeccakP1600times4_12rounds_FastLoop_supported) 82 | printf(" + KeccakP1600times4_12rounds_FastLoop_Absorb()\n"); 83 | #endif 84 | } 85 | else 86 | printf("- \303\2274: not used\n"); 87 | 88 | if (KeccakP1600times8_IsAvailable()) { 89 | printf("- \303\2278: %s\n", KeccakP1600times8_GetImplementation()); 90 | #if defined(KeccakP1600times8_12rounds_FastLoop_supported) 91 | printf(" + KeccakP1600times8_12rounds_FastLoop_Absorb()\n"); 92 | #endif 93 | } 94 | else 95 | printf("- \303\2278: not used\n"); 96 | #endif 97 | 98 | printf("\n"); 99 | } 100 | 101 | void testPerformanceFull(int (*impl)(const unsigned char*, size_t, 102 | unsigned char*, size_t, 103 | const unsigned char*, size_t), int extra) 104 | { 105 | const unsigned int chunkSize = 8192; 106 | unsigned halfTones; 107 | cycles_t calibration = CalibrateTimer(); 108 | unsigned int chunkSizeLog = (unsigned 
int)floor(log(chunkSize)/log(2.0)+0.5); 109 | int displaySlope = 0; 110 | 111 | measurePerformance(impl, calibration, 500000); 112 | for(halfTones=chunkSizeLog*12-28; halfTones<=13*12; halfTones+=4) { 113 | double I = pow(2.0, halfTones/12.0); 114 | unsigned int i = (unsigned int)floor(I+0.5); 115 | cycles_t time, timePlus1Block, timePlus2Blocks, timePlus4Blocks, timePlus8Blocks; 116 | cycles_t timePlus168Blocks; 117 | time = measurePerformance(impl, calibration, i); 118 | if (i == chunkSize) { 119 | displaySlope = 1; 120 | timePlus1Block = measurePerformance(impl, calibration, i+1*chunkSize); 121 | timePlus2Blocks = measurePerformance(impl, calibration, i+2*chunkSize); 122 | timePlus4Blocks = measurePerformance(impl, calibration, i+4*chunkSize); 123 | timePlus8Blocks = measurePerformance(impl, calibration, i+8*chunkSize); 124 | timePlus168Blocks = measurePerformance(impl, calibration, i+extra*chunkSize); 125 | } 126 | printf("%8u bytes: %9"PRId64" %s, %6.3f %s/byte\n", i, time, getTimerUnit(), time*1.0/i, getTimerUnit()); 127 | if (displaySlope) { 128 | printf(" +1 block: %9"PRId64" %s, %6.3f %s/byte (slope)\n", timePlus1Block, getTimerUnit(), (timePlus1Block-(double)(time))*1.0/chunkSize/1.0, getTimerUnit()); 129 | printf(" +2 blocks: %9"PRId64" %s, %6.3f %s/byte (slope)\n", timePlus2Blocks, getTimerUnit(), (timePlus2Blocks-(double)(time))*1.0/chunkSize/2.0, getTimerUnit()); 130 | printf(" +4 blocks: %9"PRId64" %s, %6.3f %s/byte (slope)\n", timePlus4Blocks, getTimerUnit(), (timePlus4Blocks-(double)(time))*1.0/chunkSize/4.0, getTimerUnit()); 131 | printf(" +8 blocks: %9"PRId64" %s, %6.3f %s/byte (slope)\n", timePlus8Blocks, getTimerUnit(), (timePlus8Blocks-(double)(time))*1.0/chunkSize/8.0, getTimerUnit()); 132 | printf(" +%d blocks: %9"PRId64" %s, %6.3f %s/byte (slope)\n", extra, timePlus168Blocks, getTimerUnit(), (timePlus168Blocks-(double)(time))*1.0/chunkSize/(extra*1.0), getTimerUnit()); 133 | displaySlope = 0; 134 | } 135 | } 136 | for(halfTones=12*12; 
halfTones<=20*12; halfTones+=4) { 137 | double I = chunkSize + pow(2.0, halfTones/12.0); 138 | unsigned int i = (unsigned int)floor(I+0.5); 139 | cycles_t time; 140 | time = measurePerformance(impl, calibration, i); 141 | printf("%8u bytes: %9"PRId64" %s, %6.3f %s/byte\n", i, time, getTimerUnit(), time*1.0/i, getTimerUnit()); 142 | } 143 | printf("\n\n"); 144 | } 145 | 146 | void testKangarooTwelvePerformance() 147 | { 148 | printKangarooTwelvePerformanceHeader(128); 149 | testPerformanceFull(KT128, 168); 150 | printKangarooTwelvePerformanceHeader(256); 151 | testPerformanceFull(KT256, 136); 152 | } 153 | void testPerformance() 154 | { 155 | #if defined(KeccakP1600_enable_simd_options) && !defined(KeccakP1600_disableParallelism) 156 | // Read feature availability 157 | KangarooTwelve_EnableAllCpuFeatures(); 158 | int cpu_has_AVX512 = KangarooTwelve_DisableAVX512(); 159 | int cpu_has_AVX2 = KangarooTwelve_DisableAVX2(); 160 | int cpu_has_SSSE3 = KangarooTwelve_DisableSSSE3(); 161 | #endif 162 | 163 | // Test without vectorization 164 | testKangarooTwelvePerformance(); 165 | 166 | #if defined(KeccakP1600_enable_simd_options) && !defined(KeccakP1600_disableParallelism) 167 | // Test with SSSE3 only if it's available 168 | if (cpu_has_SSSE3) { 169 | printf("\n"); 170 | KangarooTwelve_EnableAllCpuFeatures(); 171 | KangarooTwelve_DisableAVX512(); 172 | KangarooTwelve_DisableAVX2(); 173 | testKangarooTwelvePerformance(); 174 | } 175 | // Test with SSSE3 and AVX2 if they're available 176 | if (cpu_has_AVX2) { 177 | printf("\n"); 178 | KangarooTwelve_EnableAllCpuFeatures(); 179 | KangarooTwelve_DisableAVX512(); 180 | testKangarooTwelvePerformance(); 181 | } 182 | // Finally, test with everything enabled if we have AVX512 183 | if (cpu_has_AVX512) { 184 | printf("\n"); 185 | KangarooTwelve_EnableAllCpuFeatures(); 186 | testKangarooTwelvePerformance(); 187 | } 188 | #endif 189 | 190 | // Set `comparison` to your own function here to directly 191 | // compare performance 
against K12. It should have the same signature 192 | // as KangarooTwelve(...): the parameters are input, output, and 193 | // customization buffers. 194 | int (*comparison)(const unsigned char*, size_t, 195 | unsigned char*, size_t, 196 | const unsigned char*, size_t) = NULL; 197 | 198 | if (comparison != NULL) { 199 | printf("\n*** Non-K12 function for comparison: ***\n"); 200 | testPerformanceFull(comparison, 128); 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /lib/KT-threadpool-pthread.c: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | Thread pool implementation using POSIX threads (pthreads). 6 | 7 | To the extent possible under law, the implementer has waived all copyright 8 | and related or neighboring rights to the source code in this file. 9 | http://creativecommons.org/publicdomain/zero/1.0/ 10 | */ 11 | 12 | #include "KT-threadpool.h" 13 | #include 14 | #include 15 | 16 | /* Only compile pthread backend if pthreads are available */ 17 | #if defined(_POSIX_THREADS) || defined(__unix__) || defined(__unix) || \ 18 | (defined(__APPLE__) && defined(__MACH__)) || defined(__linux__) || \ 19 | defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) 20 | 21 | #include 22 | 23 | #define MAX_THREADS 64 24 | #define MAX_JOBS 256 25 | 26 | /* Job structure */ 27 | typedef struct { 28 | void (*work_fn)(void*); 29 | void* work_data; 30 | } Job; 31 | 32 | /* Thread pool context */ 33 | typedef struct { 34 | pthread_t* threads; 35 | int num_threads; 36 | 37 | /* Job queue */ 38 | Job job_queue[MAX_JOBS]; 39 | int job_count; 40 | int jobs_grabbed; /* Number of jobs grabbed by workers */ 41 | int jobs_finished; /* Number of jobs actually completed */ 42 | 43 | /* Synchronization */ 44 | pthread_mutex_t mutex; 45 | pthread_cond_t work_available; 46 | pthread_cond_t 
work_complete; 47 | 48 | /* Thread IDs for passing to worker threads */ 49 | int* thread_ids; 50 | 51 | /* Lifecycle */ 52 | int shutdown; 53 | } PthreadPool; 54 | 55 | /* Worker thread function */ 56 | static void* worker_thread(void* arg) 57 | { 58 | PthreadPool* pool = (PthreadPool*)arg; 59 | 60 | while (1) { 61 | pthread_mutex_lock(&pool->mutex); 62 | 63 | /* Wait for work or shutdown */ 64 | while (!pool->shutdown && pool->jobs_grabbed >= pool->job_count) { 65 | pthread_cond_wait(&pool->work_available, &pool->mutex); 66 | } 67 | 68 | if (pool->shutdown) { 69 | pthread_mutex_unlock(&pool->mutex); 70 | break; 71 | } 72 | 73 | /* Get next available job atomically */ 74 | Job job; 75 | int has_job = 0; 76 | if (pool->jobs_grabbed < pool->job_count) { 77 | job = pool->job_queue[pool->jobs_grabbed]; 78 | pool->jobs_grabbed++; 79 | has_job = 1; 80 | } 81 | 82 | pthread_mutex_unlock(&pool->mutex); 83 | 84 | /* Execute job outside the lock */ 85 | if (has_job && job.work_fn) { 86 | job.work_fn(job.work_data); 87 | 88 | /* Mark job as finished */ 89 | pthread_mutex_lock(&pool->mutex); 90 | pool->jobs_finished++; 91 | if (pool->jobs_finished >= pool->job_count) { 92 | pthread_cond_signal(&pool->work_complete); 93 | } 94 | pthread_mutex_unlock(&pool->mutex); 95 | } 96 | } 97 | 98 | return NULL; 99 | } 100 | 101 | /* Create pthread pool */ 102 | static void* pthread_create_pool(int num_threads) 103 | { 104 | if (num_threads < 1 || num_threads > MAX_THREADS) 105 | return NULL; 106 | 107 | PthreadPool* pool = (PthreadPool*)malloc(sizeof(PthreadPool)); 108 | if (!pool) 109 | return NULL; 110 | 111 | memset(pool, 0, sizeof(PthreadPool)); 112 | pool->num_threads = num_threads; 113 | 114 | /* Initialize synchronization primitives */ 115 | if (pthread_mutex_init(&pool->mutex, NULL) != 0) { 116 | free(pool); 117 | return NULL; 118 | } 119 | 120 | if (pthread_cond_init(&pool->work_available, NULL) != 0) { 121 | pthread_mutex_destroy(&pool->mutex); 122 | free(pool); 123 | return 
NULL; 124 | } 125 | 126 | if (pthread_cond_init(&pool->work_complete, NULL) != 0) { 127 | pthread_mutex_destroy(&pool->mutex); 128 | pthread_cond_destroy(&pool->work_available); 129 | free(pool); 130 | return NULL; 131 | } 132 | 133 | /* Allocate thread array */ 134 | pool->threads = (pthread_t*)malloc(num_threads * sizeof(pthread_t)); 135 | if (!pool->threads) { 136 | pthread_mutex_destroy(&pool->mutex); 137 | pthread_cond_destroy(&pool->work_available); 138 | pthread_cond_destroy(&pool->work_complete); 139 | free(pool); 140 | return NULL; 141 | } 142 | 143 | pool->thread_ids = (int*)malloc(num_threads * sizeof(int)); 144 | if (!pool->thread_ids) { 145 | free(pool->threads); 146 | pthread_mutex_destroy(&pool->mutex); 147 | pthread_cond_destroy(&pool->work_available); 148 | pthread_cond_destroy(&pool->work_complete); 149 | free(pool); 150 | return NULL; 151 | } 152 | 153 | /* Create worker threads */ 154 | pool->shutdown = 0; 155 | pool->job_count = 0; 156 | pool->jobs_grabbed = 0; 157 | pool->jobs_finished = 0; 158 | 159 | for (int i = 0; i < num_threads; i++) { 160 | pool->thread_ids[i] = i; 161 | if (pthread_create(&pool->threads[i], NULL, worker_thread, pool) != 0) { 162 | /* Failed to create thread - clean up. The shutdown flag is set and broadcast while holding the mutex, as in pthread_destroy(), so that an already-running worker cannot test the flag and then block in pthread_cond_wait() after the broadcast (a lost wake-up that would hang the joins below). */ 163 | pthread_mutex_lock(&pool->mutex); pool->shutdown = 1; 164 | pthread_cond_broadcast(&pool->work_available); pthread_mutex_unlock(&pool->mutex); 165 | for (int j = 0; j < i; j++) { 166 | pthread_join(pool->threads[j], NULL); 167 | } 168 | free(pool->thread_ids); 169 | free(pool->threads); 170 | pthread_mutex_destroy(&pool->mutex); 171 | pthread_cond_destroy(&pool->work_available); 172 | pthread_cond_destroy(&pool->work_complete); 173 | free(pool); 174 | return NULL; 175 | } 176 | } 177 | 178 | return pool; 179 | } 180 | 181 | /* Submit work to pthread pool */ 182 | static int pthread_submit(void* pool_handle, void (*work_fn)(void*), void* work_data) 183 | { 184 | PthreadPool* pool = (PthreadPool*)pool_handle; 185 | if (!pool || !work_fn) 186 | return 1; 187 | 188 | pthread_mutex_lock(&pool->mutex); 189 | 190 | if
(pool->job_count >= MAX_JOBS) { 191 | pthread_mutex_unlock(&pool->mutex); 192 | return 1; /* Job queue full */ 193 | } 194 | 195 | pool->job_queue[pool->job_count].work_fn = work_fn; 196 | pool->job_queue[pool->job_count].work_data = work_data; 197 | pool->job_count++; 198 | 199 | pthread_mutex_unlock(&pool->mutex); 200 | 201 | return 0; 202 | } 203 | 204 | /* Wait for all work to complete */ 205 | static void pthread_wait_all(void* pool_handle) 206 | { 207 | PthreadPool* pool = (PthreadPool*)pool_handle; 208 | if (!pool) 209 | return; 210 | 211 | pthread_mutex_lock(&pool->mutex); 212 | 213 | /* Wake up workers. The counters are deliberately NOT reset here: they are already zero (set at pool creation and after each batch below), and a worker that is not blocked in pthread_cond_wait() may already have grabbed jobs queued by pthread_submit(). Zeroing them now would hand those jobs out a second time and could let this function return while a job is still running. */ 214 | 215 | 216 | pthread_cond_broadcast(&pool->work_available); 217 | 218 | /* Wait for all jobs to finish execution */ 219 | while (pool->jobs_finished < pool->job_count) { 220 | pthread_cond_wait(&pool->work_complete, &pool->mutex); 221 | } 222 | 223 | /* Reset for next batch */ 224 | pool->job_count = 0; 225 | pool->jobs_grabbed = 0; 226 | pool->jobs_finished = 0; 227 | 228 | pthread_mutex_unlock(&pool->mutex); 229 | } 230 | 231 | /* Destroy pthread pool */ 232 | static void pthread_destroy(void* pool_handle) 233 | { 234 | PthreadPool* pool = (PthreadPool*)pool_handle; 235 | if (!pool) 236 | return; 237 | 238 | pthread_mutex_lock(&pool->mutex); 239 | pool->shutdown = 1; 240 | pthread_cond_broadcast(&pool->work_available); 241 | pthread_mutex_unlock(&pool->mutex); 242 | 243 | /* Wait for all threads to finish */ 244 | for (int i = 0; i < pool->num_threads; i++) { 245 | pthread_join(pool->threads[i], NULL); 246 | } 247 | 248 | /* Cleanup */ 249 | free(pool->thread_ids); 250 | free(pool->threads); 251 | pthread_mutex_destroy(&pool->mutex); 252 | pthread_cond_destroy(&pool->work_available); 253 | pthread_cond_destroy(&pool->work_complete); 254 | free(pool); 255 | } 256 | 257 | /* Export pthread backend API */ 258 | const KT_ThreadPool_API KT_ThreadPool_Pthread = { 259 |
.min_input_size_for_threading = 2097152, /* 2 MB default threshold */ 260 | .create = pthread_create_pool, 261 | .submit = pthread_submit, 262 | .wait_all = pthread_wait_all, 263 | .destroy = pthread_destroy 264 | }; 265 | 266 | #else /* !HAVE_PTHREADS */ 267 | 268 | /* Pthread not available on this platform - provide stub */ 269 | static void* pthread_stub_create(int num_threads) { 270 | (void)num_threads; 271 | return NULL; 272 | } 273 | 274 | static int pthread_stub_submit(void* pool, void (*work_fn)(void*), void* work_data) { 275 | (void)pool; (void)work_fn; (void)work_data; 276 | return 1; 277 | } 278 | 279 | static void pthread_stub_wait_all(void* pool) { 280 | (void)pool; 281 | } 282 | 283 | static void pthread_stub_destroy(void* pool) { 284 | (void)pool; 285 | } 286 | 287 | const KT_ThreadPool_API KT_ThreadPool_Pthread = { 288 | .min_input_size_for_threading = 2097152, /* 2 MB default threshold */ 289 | .create = pthread_stub_create, 290 | .submit = pthread_stub_submit, 291 | .wait_all = pthread_stub_wait_all, 292 | .destroy = pthread_stub_destroy 293 | }; 294 | 295 | #endif /* HAVE_PTHREADS */ 296 | -------------------------------------------------------------------------------- /lib/KangarooTwelve.h: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. 6 | 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 
14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | */ 16 | 17 | #ifndef _KangarooTwelve_h_ 18 | #define _KangarooTwelve_h_ 19 | 20 | #include 21 | #include 22 | #include "align.h" 23 | #include "KeccakP-1600-SnP.h" 24 | 25 | typedef struct TurboSHAKE_InstanceStruct { 26 | uint8_t state[KeccakP1600_stateSizeInBytes]; 27 | unsigned int rate; 28 | uint8_t byteIOIndex; 29 | uint8_t squeezing; 30 | } TurboSHAKE_Instance; 31 | 32 | typedef struct KangarooTwelve_InstanceStruct { 33 | ALIGN(KeccakP1600_stateAlignment) TurboSHAKE_Instance queueNode; 34 | ALIGN(KeccakP1600_stateAlignment) TurboSHAKE_Instance finalNode; 35 | size_t fixedOutputLength; 36 | size_t blockNumber; 37 | unsigned int queueAbsorbedLen; 38 | int phase; 39 | int securityLevel; 40 | /* Thread pool for parallel chunk processing (optional, can be NULL) */ 41 | const void* threadpool_api; /* KT_ThreadPool_API* */ 42 | void* threadpool_handle; 43 | int thread_count; 44 | } KangarooTwelve_Instance; 45 | 46 | /** Extendable output function KangarooTwelve. 47 | * @param securityLevel 128 for KT128 or 256 for KT256 48 | * @param input Pointer to the input message (M). 49 | * @param inputByteLen The length of the input message in bytes. 50 | * @param output Pointer to the output buffer. 51 | * @param outputByteLen The desired number of output bytes. 52 | * @param customization Pointer to the customization string (C). 53 | * @param customByteLen The length of the customization string in bytes. 54 | * @return 0 if successful, 1 otherwise. 55 | */ 56 | int KangarooTwelve(int securityLevel, const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen); 57 | 58 | /** 59 | * Wrapper around `KangarooTwelve` to use the 128-bit security level. 
60 | */ 61 | int KT128(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen); 62 | 63 | /** 64 | * Wrapper around `KangarooTwelve` to use the 256-bit security level. 65 | */ 66 | int KT256(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen); 67 | 68 | /** 69 | * Function to initialize a KangarooTwelve instance. 70 | * @param ktInstance Pointer to the instance to be initialized. 71 | * @param securityLevel 128 for KT128 or 256 for KT256 72 | * @param outputByteLen The desired number of output bytes, 73 | * or 0 for an arbitrarily-long output. 74 | * @return 0 if successful, 1 otherwise. 75 | */ 76 | int KangarooTwelve_Initialize(KangarooTwelve_Instance *ktInstance, int securityLevel, size_t outputByteLen); 77 | 78 | /** 79 | * Function to initialize a KangarooTwelve instance with threading support. 80 | * @param ktInstance Pointer to the instance to be initialized. 81 | * @param securityLevel 128 for KT128 or 256 for KT256 82 | * @param outputByteLen The desired number of output bytes, 83 | * or 0 for an arbitrarily-long output. 84 | * @param threadpool_api Thread pool API (NULL for no threading). 85 | * Must point to a KT_ThreadPool_API struct. 86 | * @param threadpool_handle Thread pool handle (from threadpool_api->create()). 87 | * Ignored if threadpool_api is NULL. 88 | * @param thread_count Number of threads in the pool. 89 | * Ignored if threadpool_api is NULL. 90 | * @return 0 if successful, 1 otherwise. 
91 | */ 92 | int KangarooTwelve_Initialize_Threaded(KangarooTwelve_Instance *ktInstance, int securityLevel, size_t outputByteLen, 93 | const void *threadpool_api, void *threadpool_handle, int thread_count); 94 | 95 | #define KT128_Initialize(instance, outputByteLen) \ 96 | KangarooTwelve_Initialize((instance), 128, (outputByteLen)) 97 | 98 | #define KT256_Initialize(instance, outputByteLen) \ 99 | KangarooTwelve_Initialize((instance), 256, (outputByteLen)) 100 | 101 | #define KT128_Initialize_Threaded(instance, outputByteLen, threadpool_api, threadpool_handle, thread_count) \ 102 | KangarooTwelve_Initialize_Threaded((instance), 128, (outputByteLen), (threadpool_api), (threadpool_handle), (thread_count)) 103 | 104 | #define KT256_Initialize_Threaded(instance, outputByteLen, threadpool_api, threadpool_handle, thread_count) \ 105 | KangarooTwelve_Initialize_Threaded((instance), 256, (outputByteLen), (threadpool_api), (threadpool_handle), (thread_count)) 106 | 107 | /** 108 | * Function to give input data to be absorbed. 109 | * @param ktInstance Pointer to the instance initialized by KangarooTwelve_Initialize(). 110 | * @param input Pointer to the input message data (M). 111 | * @param inputByteLen The number of bytes provided in the input message data. 112 | * @return 0 if successful, 1 otherwise. 113 | */ 114 | int KangarooTwelve_Update(KangarooTwelve_Instance *ktInstance, const unsigned char *input, size_t inputByteLen); 115 | 116 | /** 117 | * Function to call after all the input message has been input, and to get 118 | * output bytes if the length was specified when calling KangarooTwelve_Initialize(). 119 | * @param ktInstance Pointer to the hash instance initialized by KangarooTwelve_Initialize(). 120 | * If @a outputByteLen was not 0 in the call to KangarooTwelve_Initialize(), the number of 121 | * output bytes is equal to @a outputByteLen. 
122 | * If @a outputByteLen was 0 in the call to KangarooTwelve_Initialize(), the output bytes 123 | * must be extracted using the KangarooTwelve_Squeeze() function. 124 | * @param output Pointer to the buffer where to store the output data. 125 | * @param customization Pointer to the customization string (C). 126 | * @param customByteLen The length of the customization string in bytes. 127 | * @return 0 if successful, 1 otherwise. 128 | */ 129 | int KangarooTwelve_Final(KangarooTwelve_Instance *ktInstance, unsigned char *output, const unsigned char *customization, size_t customByteLen); 130 | 131 | /** 132 | * Function to squeeze output data. 133 | * @param ktInstance Pointer to the hash instance initialized by KangarooTwelve_Initialize(). 134 | * @param output Pointer to the buffer where to store the output data. 135 | * @param outputByteLen The number of output bytes desired. 136 | * @pre KangarooTwelve_Final() must have been already called. 137 | * @return 0 if successful, 1 otherwise. 138 | */ 139 | int KangarooTwelve_Squeeze(KangarooTwelve_Instance *ktInstance, unsigned char *output, size_t outputByteLen); 140 | 141 | #if !defined(KeccakP1600_disableParallelism) && defined(KeccakP1600_enable_simd_options) 142 | /** 143 | * Functions to selectively disable the use of CPU features. Should be rarely 144 | * needed; if you're not sure this is what you want, don't worry about it. 145 | * 146 | * /!\ WARNING /!\: Calling these functions REQUIRES that there are no 147 | * KangarooTwelve instances in use. The effects are global and affect the code 148 | * paths taken by every call, as well as the details of the represented states. 149 | * Calling these functions in the middle of your program (as opposed to during 150 | * setup) is PROBABLY WRONG. 151 | * 152 | * These functions are at present only used to increase test suite coverage, 153 | * and demonstrate comparative performance between implementations in different 154 | * instruction sets. 
To enable them, the macro KeccakP1600_enable_simd_options 155 | * must be defined at compile time. 156 | * 157 | * They can potentially also be useful in an environment where it is 158 | * detrimental to online large vector units on the CPU, since doing so can lead 159 | * to downclocking, performance hits in other threads sharing the same CPU 160 | * core, and short delays while the CPU's power license is increased to online 161 | * the vector unit. 162 | * 163 | * In the majority of situations, however, this should rarely matter and it is 164 | * usually the case that the performance difference will be a wash or even an 165 | * overall improvement despite the downsides. 166 | * 167 | * @return 1 if the feature was enabled and available and has been turned off, 168 | * 0 if it was already disabled or unavailable. 169 | */ 170 | int KangarooTwelve_DisableAVX512(void); 171 | int KangarooTwelve_DisableAVX2(void); 172 | int KangarooTwelve_DisableSSSE3(void); 173 | int KangarooTwelve_DisableNeon(void); 174 | int KangarooTwelve_DisableArmSha3(void); 175 | 176 | /** 177 | * Function to reset all CPU features to enabled-if-available. Calling this 178 | * always has no effect if no CPU features have been explicitly disabled. 179 | */ 180 | void KangarooTwelve_EnableAllCpuFeatures(void); 181 | void KangarooTwelve_EnableAllArmCpuFeatures(void); 182 | #endif // !KeccakP1600_disableParallelism && KeccakP1600_enable_simd_options 183 | 184 | #endif 185 | -------------------------------------------------------------------------------- /lib/Optimized64/KeccakP-1600-AVX512-plainC.c: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. 6 | 7 | Implementation by Ronny Van Keer, hereby denoted as "the implementer". 
8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | 16 | --- 17 | 18 | We would like to thank Vladimir Sedach, we have used parts of his Keccak AVX-512 C++ code. 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include "align.h" 30 | 31 | typedef __m512i V512; 32 | 33 | #define XOR(a,b) _mm512_xor_si512(a,b) 34 | #define XOR3(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0x96) 35 | #define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) 36 | #define ROL(a,offset) _mm512_rol_epi64(a,offset) 37 | #define Chi(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0xD2) 38 | 39 | #define LOAD_Lanes(m,a) _mm512_maskz_loadu_epi64(m,a) 40 | #define LOAD_Lane(a) LOAD_Lanes(0x01,a) 41 | #define LOAD_Plane(a) LOAD_Lanes(0x1F,a) 42 | #define LOAD_8Lanes(a) LOAD_Lanes(0xFF,a) 43 | #define STORE_Lanes(a,m,v) _mm512_mask_storeu_epi64(a,m,v) 44 | #define STORE_Lane(a,v) STORE_Lanes(a,0x01,v) 45 | #define STORE_Plane(a,v) STORE_Lanes(a,0x1F,v) 46 | #define STORE_8Lanes(a,v) STORE_Lanes(a,0xFF,v) 47 | 48 | /* ---------------------------------------------------------------- */ 49 | 50 | void KeccakP1600_AVX512_Initialize(void *state) 51 | { 52 | memset(state, 0, 1600/8); 53 | } 54 | 55 | /* ---------------------------------------------------------------- */ 56 | 57 | void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) 58 | { 59 | uint8_t *stateAsBytes; 60 | uint64_t *stateAsLanes; 61 | 62 | for( stateAsBytes = (uint8_t*)state; ((offset % 8) != 0) && (length != 0); ++offset, --length) 63 | stateAsBytes[offset] ^= *(data++); 64 | for (stateAsLanes = (uint64_t*)(stateAsBytes + offset); length >= 
8*8; stateAsLanes += 8, data += 8*8, length -= 8*8) 65 | STORE_8Lanes( stateAsLanes, XOR(LOAD_8Lanes(stateAsLanes), LOAD_8Lanes((const uint64_t*)data))); 66 | for (/* empty */; length >= 8; ++stateAsLanes, data += 8, length -= 8) 67 | STORE_Lane( stateAsLanes, XOR(LOAD_Lane(stateAsLanes), LOAD_Lane((const uint64_t*)data))); 68 | for ( stateAsBytes = (uint8_t*)stateAsLanes; length != 0; --length) 69 | *(stateAsBytes++) ^= *(data++); 70 | } 71 | 72 | /* ---------------------------------------------------------------- */ 73 | 74 | void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) 75 | { 76 | memcpy(data, (unsigned char*)state+offset, length); 77 | } 78 | 79 | /* ---------------------------------------------------------------- */ 80 | 81 | const uint64_t KeccakP1600RoundConstants[24] = { 82 | 0x0000000000000001ULL, 83 | 0x0000000000008082ULL, 84 | 0x800000000000808aULL, 85 | 0x8000000080008000ULL, 86 | 0x000000000000808bULL, 87 | 0x0000000080000001ULL, 88 | 0x8000000080008081ULL, 89 | 0x8000000000008009ULL, 90 | 0x000000000000008aULL, 91 | 0x0000000000000088ULL, 92 | 0x0000000080008009ULL, 93 | 0x000000008000000aULL, 94 | 0x000000008000808bULL, 95 | 0x800000000000008bULL, 96 | 0x8000000000008089ULL, 97 | 0x8000000000008003ULL, 98 | 0x8000000000008002ULL, 99 | 0x8000000000000080ULL, 100 | 0x000000000000800aULL, 101 | 0x800000008000000aULL, 102 | 0x8000000080008081ULL, 103 | 0x8000000000008080ULL, 104 | 0x0000000080000001ULL, 105 | 0x8000000080008008ULL }; 106 | 107 | #define KeccakP_DeclareVars \ 108 | V512 b0, b1, b2, b3, b4; \ 109 | V512 Baeiou, Gaeiou, Kaeiou, Maeiou, Saeiou; \ 110 | V512 moveThetaPrev = _mm512_setr_epi64(4, 0, 1, 2, 3, 5, 6, 7); \ 111 | V512 moveThetaNext = _mm512_setr_epi64(1, 2, 3, 4, 0, 5, 6, 7); \ 112 | V512 rhoB = _mm512_setr_epi64( 0, 1, 62, 28, 27, 0, 0, 0); \ 113 | V512 rhoG = _mm512_setr_epi64(36, 44, 6, 55, 20, 0, 0, 0); \ 114 | V512 rhoK = _mm512_setr_epi64( 3, 10, 43, 
25, 39, 0, 0, 0); \ 115 | V512 rhoM = _mm512_setr_epi64(41, 45, 15, 21, 8, 0, 0, 0); \ 116 | V512 rhoS = _mm512_setr_epi64(18, 2, 61, 56, 14, 0, 0, 0); \ 117 | V512 pi1B = _mm512_setr_epi64(0, 3, 1, 4, 2, 5, 6, 7); \ 118 | V512 pi1G = _mm512_setr_epi64(1, 4, 2, 0, 3, 5, 6, 7); \ 119 | V512 pi1K = _mm512_setr_epi64(2, 0, 3, 1, 4, 5, 6, 7); \ 120 | V512 pi1M = _mm512_setr_epi64(3, 1, 4, 2, 0, 5, 6, 7); \ 121 | V512 pi1S = _mm512_setr_epi64(4, 2, 0, 3, 1, 5, 6, 7); \ 122 | V512 pi2S1 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 0+8, 2+8); \ 123 | V512 pi2S2 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 1+8, 3+8); \ 124 | V512 pi2BG = _mm512_setr_epi64(0, 1, 0+8, 1+8, 6, 5, 6, 7); \ 125 | V512 pi2KM = _mm512_setr_epi64(2, 3, 2+8, 3+8, 7, 5, 6, 7); \ 126 | V512 pi2S3 = _mm512_setr_epi64(4, 5, 4+8, 5+8, 4, 5, 6, 7); 127 | 128 | #define copyFromState(pState) \ 129 | Baeiou = LOAD_Plane(pState+ 0); \ 130 | Gaeiou = LOAD_Plane(pState+ 5); \ 131 | Kaeiou = LOAD_Plane(pState+10); \ 132 | Maeiou = LOAD_Plane(pState+15); \ 133 | Saeiou = LOAD_Plane(pState+20); 134 | 135 | #define copyToState(pState) \ 136 | STORE_Plane(pState+ 0, Baeiou); \ 137 | STORE_Plane(pState+ 5, Gaeiou); \ 138 | STORE_Plane(pState+10, Kaeiou); \ 139 | STORE_Plane(pState+15, Maeiou); \ 140 | STORE_Plane(pState+20, Saeiou); 141 | 142 | #define KeccakP_Round(i) \ 143 | /* Theta */ \ 144 | b0 = XOR5( Baeiou, Gaeiou, Kaeiou, Maeiou, Saeiou ); \ 145 | b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); \ 146 | b0 = _mm512_permutexvar_epi64(moveThetaNext, b0); \ 147 | b0 = _mm512_rol_epi64(b0, 1); \ 148 | Baeiou = XOR3( Baeiou, b0, b1 ); \ 149 | Gaeiou = XOR3( Gaeiou, b0, b1 ); \ 150 | Kaeiou = XOR3( Kaeiou, b0, b1 ); \ 151 | Maeiou = XOR3( Maeiou, b0, b1 ); \ 152 | Saeiou = XOR3( Saeiou, b0, b1 ); \ 153 | /* Rho */ \ 154 | Baeiou = _mm512_rolv_epi64(Baeiou, rhoB); \ 155 | Gaeiou = _mm512_rolv_epi64(Gaeiou, rhoG); \ 156 | Kaeiou = _mm512_rolv_epi64(Kaeiou, rhoK); \ 157 | Maeiou = _mm512_rolv_epi64(Maeiou, rhoM); \ 158 | Saeiou 
= _mm512_rolv_epi64(Saeiou, rhoS); \ 159 | /* Pi 1 */ \ 160 | b0 = _mm512_permutexvar_epi64(pi1B, Baeiou); \ 161 | b1 = _mm512_permutexvar_epi64(pi1G, Gaeiou); \ 162 | b2 = _mm512_permutexvar_epi64(pi1K, Kaeiou); \ 163 | b3 = _mm512_permutexvar_epi64(pi1M, Maeiou); \ 164 | b4 = _mm512_permutexvar_epi64(pi1S, Saeiou); \ 165 | /* Chi */ \ 166 | Baeiou = Chi(b0, b1, b2); \ 167 | Gaeiou = Chi(b1, b2, b3); \ 168 | Kaeiou = Chi(b2, b3, b4); \ 169 | Maeiou = Chi(b3, b4, b0); \ 170 | Saeiou = Chi(b4, b0, b1); \ 171 | /* Iota */ \ 172 | Baeiou = XOR(Baeiou, LOAD_Lane(KeccakP1600RoundConstants+i)); \ 173 | /* Pi 2 */ \ 174 | b0 = _mm512_unpacklo_epi64(Baeiou, Gaeiou); \ 175 | b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); \ 176 | b0 = _mm512_permutex2var_epi64(b0, pi2S1, Saeiou); \ 177 | b2 = _mm512_unpackhi_epi64(Baeiou, Gaeiou); \ 178 | b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); \ 179 | b2 = _mm512_permutex2var_epi64(b2, pi2S2, Saeiou); \ 180 | Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); \ 181 | Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); \ 182 | Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); \ 183 | Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); \ 184 | b0 = _mm512_permutex2var_epi64(b0, pi2S3, b1); \ 185 | Saeiou = _mm512_mask_blend_epi64(0x10, b0, Saeiou) 186 | 187 | #define rounds12 \ 188 | KeccakP_Round( 12 ); \ 189 | KeccakP_Round( 13 ); \ 190 | KeccakP_Round( 14 ); \ 191 | KeccakP_Round( 15 ); \ 192 | KeccakP_Round( 16 ); \ 193 | KeccakP_Round( 17 ); \ 194 | KeccakP_Round( 18 ); \ 195 | KeccakP_Round( 19 ); \ 196 | KeccakP_Round( 20 ); \ 197 | KeccakP_Round( 21 ); \ 198 | KeccakP_Round( 22 ); \ 199 | KeccakP_Round( 23 ) 200 | 201 | /* ---------------------------------------------------------------- */ 202 | 203 | void KeccakP1600_AVX512_Permute_12rounds(void *state) 204 | { 205 | KeccakP_DeclareVars 206 | uint64_t *stateAsLanes = (uint64_t*)state; 207 | 208 | copyFromState(stateAsLanes); 209 | rounds12; 210 | copyToState(stateAsLanes); 211 | } 
212 | 213 | /* ---------------------------------------------------------------- */ 214 | 215 | #include 216 | 217 | size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) 218 | { 219 | size_t originalDataByteLen = dataByteLen; 220 | 221 | assert(laneCount == 21 || laneCount == 17); 222 | 223 | KeccakP_DeclareVars; 224 | uint64_t *stateAsLanes = (uint64_t*)state; 225 | uint64_t *inDataAsLanes = (uint64_t*)data; 226 | 227 | if (laneCount == 21) { 228 | #define laneCount 21 229 | copyFromState(stateAsLanes); 230 | while(dataByteLen >= 21*8) { 231 | Baeiou = XOR(Baeiou, LOAD_Plane(inDataAsLanes+ 0)); 232 | Gaeiou = XOR(Gaeiou, LOAD_Plane(inDataAsLanes+ 5)); 233 | Kaeiou = XOR(Kaeiou, LOAD_Plane(inDataAsLanes+10)); 234 | Maeiou = XOR(Maeiou, LOAD_Plane(inDataAsLanes+15)); 235 | Saeiou = XOR(Saeiou, LOAD_Lane(inDataAsLanes+20)); 236 | rounds12; 237 | inDataAsLanes += 21; 238 | dataByteLen -= 21*8; 239 | } 240 | #undef laneCount 241 | copyToState(stateAsLanes); 242 | } else if (laneCount == 17) { 243 | // TODO: further optimization needed for this case, laneCount == 17. 244 | while(dataByteLen >= laneCount*8) { 245 | KeccakP1600_AddBytes(state, data, 0, laneCount*8); 246 | KeccakP1600_Permute_12rounds(state); 247 | data += laneCount*8; 248 | dataByteLen -= laneCount*8; 249 | } 250 | } 251 | 252 | return originalDataByteLen - dataByteLen; 253 | } 254 | -------------------------------------------------------------------------------- /tests/timing.h: -------------------------------------------------------------------------------- 1 | // Adapted from Google Benchmark (https://github.com/google/benchmark). 2 | // 3 | // Copyright 2020 Google Inc. 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 
7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | #ifndef _XKCP_timing_h_ 18 | #define _XKCP_timing_h_ 19 | 20 | #include 21 | 22 | #if defined(__GNUC__) 23 | #define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline)) 24 | #elif defined(_MSC_VER) && !defined(__clang__) 25 | #define BENCHMARK_ALWAYS_INLINE __forceinline 26 | #if _MSC_VER >= 1900 27 | #else 28 | #endif 29 | #define __func__ __FUNCTION__ 30 | #else 31 | #define BENCHMARK_ALWAYS_INLINE 32 | #endif 33 | 34 | #ifndef __has_feature 35 | #define __has_feature(x) 0 36 | #endif 37 | 38 | #if defined(__clang__) 39 | #if defined(__ibmxl__) 40 | #if !defined(COMPILER_IBMXL) 41 | #define COMPILER_IBMXL 42 | #endif 43 | #elif !defined(COMPILER_CLANG) 44 | #define COMPILER_CLANG 45 | #endif 46 | #elif defined(_MSC_VER) 47 | #if !defined(COMPILER_MSVC) 48 | #define COMPILER_MSVC 49 | #endif 50 | #elif defined(__GNUC__) 51 | #if !defined(COMPILER_GCC) 52 | #define COMPILER_GCC 53 | #endif 54 | #endif 55 | 56 | #if defined(__CYGWIN__) 57 | #define BENCHMARK_OS_CYGWIN 1 58 | #elif defined(_WIN32) 59 | #define BENCHMARK_OS_WINDOWS 1 60 | #if defined(__MINGW32__) 61 | #define BENCHMARK_OS_MINGW 1 62 | #endif 63 | #elif defined(__APPLE__) 64 | #define BENCHMARK_OS_APPLE 1 65 | #include "TargetConditionals.h" 66 | #if defined(TARGET_OS_MAC) 67 | #define BENCHMARK_OS_MACOSX 1 68 | #if defined(TARGET_OS_IPHONE) 69 | #define BENCHMARK_OS_IOS 1 70 | #endif 71 | #endif 72 | #elif defined(__FreeBSD__) 73 | #define BENCHMARK_OS_FREEBSD 1 74 | #elif defined(__NetBSD__) 75 | #define BENCHMARK_OS_NETBSD 1 76 | #elif 
defined(__OpenBSD__) 77 | #define BENCHMARK_OS_OPENBSD 1 78 | #elif defined(__DragonFly__) 79 | #define BENCHMARK_OS_DRAGONFLY 1 80 | #elif defined(__linux__) 81 | #define BENCHMARK_OS_LINUX 1 82 | #elif defined(__native_client__) 83 | #define BENCHMARK_OS_NACL 1 84 | #elif defined(__EMSCRIPTEN__) 85 | #define BENCHMARK_OS_EMSCRIPTEN 1 86 | #elif defined(__rtems__) 87 | #define BENCHMARK_OS_RTEMS 1 88 | #elif defined(__Fuchsia__) 89 | #define BENCHMARK_OS_FUCHSIA 1 90 | #elif defined (__SVR4) && defined (__sun) 91 | #define BENCHMARK_OS_SOLARIS 1 92 | #elif defined(__QNX__) 93 | #define BENCHMARK_OS_QNX 1 94 | #elif defined(__MVS__) 95 | #define BENCHMARK_OS_ZOS 1 96 | #endif 97 | 98 | #if defined(BENCHMARK_OS_MACOSX) 99 | #include 100 | #endif 101 | // For MSVC, we want to use '_asm rdtsc' when possible (since it works 102 | // with even ancient MSVC compilers), and when not possible the 103 | // __rdtsc intrinsic, declared in <intrin.h>. Unfortunately, in some 104 | // environments, <windows.h> and <intrin.h> have conflicting 105 | // declarations of some other intrinsics, breaking compilation. 106 | // Therefore, we simply declare __rdtsc ourselves. See also 107 | // http://connect.microsoft.com/VisualStudio/feedback/details/262047 108 | #if defined(COMPILER_MSVC) && !defined(_M_IX86) 109 | uint64_t __rdtsc(); 110 | #pragma intrinsic(__rdtsc) 111 | #endif 112 | 113 | #if !defined(BENCHMARK_OS_WINDOWS) || defined(BENCHMARK_OS_MINGW) 114 | #include 115 | #include 116 | #endif 117 | 118 | #ifdef BENCHMARK_OS_EMSCRIPTEN 119 | #include 120 | #endif 121 | 122 | // NOTE: only i386 and x86_64 have been well tested. 123 | // PPC, sparc, alpha, and ia64 are based on 124 | // http://peter.kuscsik.com/wordpress/?p=14 125 | // with modifications by m3b. See also 126 | // https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h 127 | 128 | // This should return the number of cycles since power-on. Thread-safe. 
129 | inline BENCHMARK_ALWAYS_INLINE int64_t CycleTimer() { 130 | #if defined(BENCHMARK_OS_EMSCRIPTEN) 131 | // this goes above x86-specific code because old versions of Emscripten 132 | // define __x86_64__, although they have nothing to do with it. 133 | return (int64_t)(emscripten_get_now() * 1e+6); 134 | #elif defined(__i386__) 135 | int64_t ret; 136 | __asm__ volatile("rdtsc" : "=A"(ret)); 137 | return ret; 138 | #elif defined(__x86_64__) || defined(__amd64__) 139 | uint64_t low, high; 140 | __asm__ volatile("rdtsc" : "=a"(low), "=d"(high)); 141 | return (high << 32) | low; 142 | #elif defined(BENCHMARK_OS_MACOSX) 143 | // this goes at the top because we need ALL Macs, regardless of 144 | // architecture, to return the number of "mach time units" that 145 | // have passed since startup. See sysinfo.cc where 146 | // InitializeSystemInfo() sets the supposed cpu clock frequency of 147 | // macs to the number of mach time units per second, not actual 148 | // CPU clock frequency (which can change in the face of CPU 149 | // frequency scaling). Also note that when the Mac sleeps, this 150 | // counter pauses; it does not continue counting, nor does it 151 | // reset to zero. 152 | // XKCP-specific: moved this below i386 and x86_64 tests to favor real CPU cycles when available 153 | return mach_absolute_time(); 154 | #elif defined(__powerpc__) || defined(__ppc__) 155 | // This returns a time-base, which is not always precisely a cycle-count. 
156 | #if defined(__powerpc64__) || defined(__ppc64__) 157 | int64_t tb; 158 | asm volatile("mfspr %0, 268" : "=r"(tb)); 159 | return tb; 160 | #else 161 | uint32_t tbl, tbu0, tbu1; 162 | asm volatile( 163 | "mftbu %0\n" 164 | "mftb %1\n" 165 | "mftbu %2" 166 | : "=r"(tbu0), "=r"(tbl), "=r"(tbu1)); 167 | tbl &= -(int32_t)(tbu0 == tbu1); 168 | // high 32 bits in tbu1; low 32 bits in tbl (tbu0 is no longer needed) 169 | return ((uint64_t)(tbu1) << 32) | tbl; 170 | #endif 171 | #elif defined(__sparc__) 172 | int64_t tick; 173 | asm(".byte 0x83, 0x41, 0x00, 0x00"); 174 | asm("mov %%g1, %0" : "=r"(tick)); 175 | return tick; 176 | #elif defined(__ia64__) 177 | int64_t itc; 178 | asm("mov %0 = ar.itc" : "=r"(itc)); 179 | return itc; 180 | #elif defined(COMPILER_MSVC) && defined(_M_IX86) 181 | // Older MSVC compilers (like 7.x) don't seem to support the 182 | // __rdtsc intrinsic properly, so I prefer to use _asm instead 183 | // when I know it will work. Otherwise, I'll use __rdtsc and hope 184 | // the code is being compiled with a non-ancient compiler. 185 | _asm rdtsc 186 | #elif defined(COMPILER_MSVC) 187 | return __rdtsc(); 188 | #elif defined(BENCHMARK_OS_NACL) 189 | // Native Client validator on x86/x86-64 allows RDTSC instructions, 190 | // and this case is handled above. Native Client validator on ARM 191 | // rejects MRC instructions (used in the ARM-specific sequence below), 192 | // so we handle it here. Portable Native Client compiles to 193 | // architecture-agnostic bytecode, which doesn't provide any 194 | // cycle counter access mnemonics. 195 | 196 | // Native Client does not provide any API to access cycle counter. 197 | // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday 198 | // because it provides nanosecond resolution (which is noticeable at 199 | // least for PNaCl modules running on x86 Mac & Linux). 200 | // Initialize to always return 0 if clock_gettime fails. 
201 | struct timespec ts = {0, 0}; 202 | clock_gettime(CLOCK_MONOTONIC, &ts); 203 | return (int64_t)(ts.tv_sec) * 1000000000 + ts.tv_nsec; 204 | #elif defined(__aarch64__) 205 | // System timer of ARMv8 runs at a different frequency than the CPU's. 206 | // The frequency is fixed, typically in the range 1-50MHz. It can be 207 | // read at CNTFRQ special register. We assume the OS has set up 208 | // the virtual timer properly. 209 | int64_t virtual_timer_value; 210 | asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value)); 211 | return virtual_timer_value; 212 | #elif defined(__ARM_ARCH) 213 | // V6 is the earliest arch that has a standard cyclecount 214 | // Native Client validator doesn't allow MRC instructions. 215 | #if (__ARM_ARCH >= 6) 216 | uint32_t pmccntr; 217 | uint32_t pmuseren; 218 | uint32_t pmcntenset; 219 | // Read the user mode perf monitor counter access permissions. 220 | asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren)); 221 | if (pmuseren & 1) { // Allows reading perfmon counters for user mode code. 222 | asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset)); 223 | if (pmcntenset & 0x80000000ul) { // Is it counting? 224 | asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr)); 225 | // The counter is set up to count every 64th cycle 226 | return (int64_t)(pmccntr) * 64; // Should optimize to << 6 227 | } 228 | } 229 | #endif 230 | struct timeval tv; 231 | gettimeofday(&tv, NULL); 232 | return (int64_t)(tv.tv_sec) * 1000000 + tv.tv_usec; 233 | #elif defined(__mips__) || defined(__m68k__) 234 | // mips apparently only allows rdtsc for superusers, so we fall 235 | // back to gettimeofday. It's possible clock_gettime would be better. 236 | struct timeval tv; 237 | gettimeofday(&tv, NULL); 238 | return (int64_t)(tv.tv_sec) * 1000000 + tv.tv_usec; 239 | #elif defined(__s390__) // Covers both s390 and s390x. 240 | // Return the CPU clock. 
241 | uint64_t tsc; 242 | #if defined(BENCHMARK_OS_ZOS) && defined(COMPILER_IBMXL) 243 | // z/OS XL compiler HLASM syntax. 244 | asm(" stck %0" : "=m"(tsc) : : "cc"); 245 | #else 246 | asm("stck %0" : "=Q"(tsc) : : "cc"); 247 | #endif 248 | return tsc; 249 | #elif defined(__riscv) // RISC-V 250 | // Use RDCYCLE (and RDCYCLEH on riscv32) 251 | #if __riscv_xlen == 32 252 | uint32_t cycles_lo, cycles_hi0, cycles_hi1; 253 | // This asm also includes the PowerPC overflow handling strategy, as above. 254 | // Implemented in assembly because Clang insisted on branching. 255 | asm volatile( 256 | "rdcycleh %0\n" 257 | "rdcycle %1\n" 258 | "rdcycleh %2\n" 259 | "sub %0, %0, %2\n" 260 | "seqz %0, %0\n" 261 | "sub %0, zero, %0\n" 262 | "and %1, %1, %0\n" 263 | : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1)); 264 | return ((uint64_t)(cycles_hi1) << 32) | cycles_lo; 265 | #else 266 | uint64_t cycles; 267 | asm volatile("rdcycle %0" : "=r"(cycles)); 268 | return cycles; 269 | #endif 270 | #else 271 | // The soft failover to a generic implementation is automatic only for ARM. 272 | // For other platforms the developer is expected to make an attempt to create 273 | // a fast implementation and use generic version if nothing better is available. 274 | #error You need to define CycleTimer for your OS and CPU 275 | #endif 276 | } 277 | 278 | /* ---------------------------------------------------------------- */ 279 | /* XKCP-specific definitions follow. 
*/ 280 | /* ---------------------------------------------------------------- */ 281 | 282 | 283 | typedef int64_t cycles_t; 284 | #define CYCLES_MAX INT64_MAX 285 | 286 | #define TIMER_SAMPLE_CNT (100) 287 | 288 | const char * getTimerUnit(); 289 | extern double timerCorrectionFactor; 290 | cycles_t CalibrateTimer(); 291 | 292 | #define measureTimingDeclare \ 293 | cycles_t tMin = CYCLES_MAX; \ 294 | cycles_t t0,t1,i; 295 | 296 | #define measureTimingBeginDeclared \ 297 | for (i=0;i < TIMER_SAMPLE_CNT;i++) \ 298 | { \ 299 | t0 = CycleTimer(); 300 | 301 | #define measureTimingBegin \ 302 | cycles_t tMin = CYCLES_MAX; \ 303 | cycles_t t0,t1,i; \ 304 | for (i=0;i < TIMER_SAMPLE_CNT;i++) \ 305 | { \ 306 | t0 = CycleTimer(); 307 | 308 | #define measureTimingEnd \ 309 | t1 = CycleTimer(); \ 310 | if (tMin > t1-t0 - dtMin) \ 311 | tMin = t1-t0 - dtMin; \ 312 | } \ 313 | return (cycles_t)(tMin * timerCorrectionFactor + 0.5); 314 | 315 | #endif // _XKCP_timing_h_ 316 | -------------------------------------------------------------------------------- /lib/ARMv8Asha3/KeccakP-1600-runtimeDispatch.c: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. 6 | 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | 16 | --- 17 | 18 | Please refer to the XKCP for more details. 19 | 20 | ARM CPU feature detection adapted from libaegis by Frank Denis. 
21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | #include "KeccakP-1600-SnP.h" 27 | 28 | #ifdef KeccakP1600_disableParallelism 29 | #undef KeccakP1600_enable_simd_options 30 | #else 31 | 32 | // Forward declarations 33 | void KangarooTwelve_SetArmProcessorCapabilities(); 34 | 35 | #ifdef KeccakP1600_enable_simd_options 36 | int K12_NEON_requested_disabled = 0; 37 | int K12_ARM_SHA3_requested_disabled = 0; 38 | #endif // KeccakP1600_enable_simd_options 39 | 40 | int K12_enableNEON = 0; 41 | int K12_enableARM_SHA3 = 0; 42 | 43 | /* ---------------------------------------------------------------- */ 44 | /* Platform-specific includes for CPU feature detection */ 45 | /* ---------------------------------------------------------------- */ 46 | 47 | #if defined(__linux__) && (defined(__aarch64__) || defined(__arm__)) 48 | #define K12_HAVE_LINUX_ARM 49 | #if defined(__GLIBC__) || defined(__BIONIC__) 50 | #include 51 | #define K12_HAVE_GETAUXVAL 52 | #endif 53 | #endif 54 | 55 | #if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm__)) 56 | #define K12_HAVE_APPLE_ARM 57 | #include 58 | #endif 59 | 60 | #if defined(_WIN32) && (defined(_M_ARM64) || defined(_M_ARM)) 61 | #define K12_HAVE_WINDOWS_ARM 62 | #include 63 | #endif 64 | 65 | #if defined(__ANDROID__) && (defined(__aarch64__) || defined(__arm__)) 66 | #define K12_HAVE_ANDROID_ARM 67 | #include 68 | #endif 69 | 70 | /* ---------------------------------------------------------------- */ 71 | /* Hardware capability constants */ 72 | /* ---------------------------------------------------------------- */ 73 | 74 | // 32-bit ARM hwcaps (AT_HWCAP) 75 | #ifndef K12_ARM_HWCAP_NEON 76 | #define K12_ARM_HWCAP_NEON (1L << 12) 77 | #endif 78 | 79 | // AArch64 hwcaps (AT_HWCAP) 80 | #ifndef K12_AARCH64_HWCAP_ASIMD 81 | #define K12_AARCH64_HWCAP_ASIMD (1L << 1) 82 | #endif 83 | 84 | #ifndef K12_AARCH64_HWCAP_SHA3 85 | #define K12_AARCH64_HWCAP_SHA3 (1L << 17) 86 | #endif 87 | 88 | /* 
---------------------------------------------------------------- */ 89 | /* CPU feature detection */ 90 | /* ---------------------------------------------------------------- */ 91 | 92 | enum arm_cpu_feature { 93 | ARM_NEON = 1 << 0, 94 | ARM_SHA3 = 1 << 1, 95 | ARM_UNDEFINED = 1 << 30 96 | }; 97 | 98 | static enum arm_cpu_feature g_arm_cpu_features = ARM_UNDEFINED; 99 | 100 | #if defined(K12_HAVE_LINUX_ARM) && defined(K12_HAVE_GETAUXVAL) 101 | static int _have_hwcap(unsigned long hwcap_bit) { 102 | unsigned long hwcap = getauxval(AT_HWCAP); 103 | return (hwcap & hwcap_bit) != 0; 104 | } 105 | #endif 106 | 107 | #if defined(K12_HAVE_APPLE_ARM) 108 | static int _have_arm_feature(const char *feature_name) { 109 | int64_t feature_present = 0; 110 | size_t size = sizeof(feature_present); 111 | if (sysctlbyname(feature_name, &feature_present, &size, NULL, 0) != 0) { 112 | return 0; 113 | } 114 | return feature_present != 0; 115 | } 116 | #endif 117 | 118 | static enum arm_cpu_feature get_arm_cpu_features(void) { 119 | if (g_arm_cpu_features != ARM_UNDEFINED) { 120 | return g_arm_cpu_features; 121 | } 122 | 123 | enum arm_cpu_feature features = 0; 124 | 125 | /* ---------------------------------------------------------------- */ 126 | /* NEON Detection */ 127 | /* ---------------------------------------------------------------- */ 128 | 129 | // Compile-time check - if built with NEON, assume available 130 | #if defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64) 131 | features |= ARM_NEON; 132 | #elif defined(K12_HAVE_LINUX_ARM) && defined(K12_HAVE_GETAUXVAL) 133 | // Runtime detection on Linux ARM 134 | #if defined(__aarch64__) 135 | if (_have_hwcap(K12_AARCH64_HWCAP_ASIMD)) { 136 | features |= ARM_NEON; 137 | } 138 | #elif defined(__arm__) 139 | if (_have_hwcap(K12_ARM_HWCAP_NEON)) { 140 | features |= ARM_NEON; 141 | } 142 | #endif 143 | #elif defined(K12_HAVE_ANDROID_ARM) 144 | // Android detection 145 | uint64_t android_features = 
android_getCpuFeatures(); 146 | if (android_features & ANDROID_CPU_ARM_FEATURE_NEON) { 147 | features |= ARM_NEON; 148 | } 149 | #elif defined(K12_HAVE_WINDOWS_ARM) 150 | // Windows ARM64 - assume all have NEON 151 | features |= ARM_NEON; 152 | #endif 153 | 154 | /* ---------------------------------------------------------------- */ 155 | /* SHA3 Detection (requires NEON) */ 156 | /* ---------------------------------------------------------------- */ 157 | 158 | if (features & ARM_NEON) { 159 | // Compile-time check 160 | #if defined(__ARM_FEATURE_SHA3) 161 | features |= ARM_SHA3; 162 | #elif defined(K12_HAVE_LINUX_ARM) && defined(K12_HAVE_GETAUXVAL) && defined(__aarch64__) 163 | // Runtime detection on Linux AArch64 164 | if (_have_hwcap(K12_AARCH64_HWCAP_SHA3)) { 165 | features |= ARM_SHA3; 166 | } 167 | #elif defined(K12_HAVE_APPLE_ARM) 168 | // macOS/Apple Silicon detection 169 | if (_have_arm_feature("hw.optional.arm.FEAT_SHA3")) { 170 | features |= ARM_SHA3; 171 | } 172 | #endif 173 | } 174 | 175 | g_arm_cpu_features = features; 176 | return features; 177 | } 178 | 179 | void KangarooTwelve_SetArmProcessorCapabilities() { 180 | enum arm_cpu_feature features = get_arm_cpu_features(); 181 | K12_enableNEON = (features & ARM_NEON) != 0; 182 | K12_enableARM_SHA3 = (features & ARM_SHA3) != 0; 183 | 184 | #ifdef KeccakP1600_enable_simd_options 185 | K12_enableNEON = K12_enableNEON && !K12_NEON_requested_disabled; 186 | K12_enableARM_SHA3 = K12_enableARM_SHA3 && !K12_ARM_SHA3_requested_disabled; 187 | #endif // KeccakP1600_enable_simd_options 188 | } 189 | 190 | /* ---------------------------------------------------------------- */ 191 | /* External function declarations */ 192 | /* ---------------------------------------------------------------- */ 193 | 194 | // Generic ARM64 implementations (from KeccakP-1600-opt64.c) 195 | extern void KeccakP1600_opt64_Initialize(void *state); 196 | extern void KeccakP1600_opt64_AddByte(void *state, unsigned char data, unsigned 
int offset); 197 | extern void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); 198 | extern void KeccakP1600_opt64_Permute_12rounds(void *state); 199 | extern void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); 200 | extern size_t KeccakP1600_opt64_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); 201 | 202 | // ARMv8-A SHA3 optimized implementations (from assembly) 203 | extern void KeccakP1600_ARMv8Asha3_Permute_12rounds(void *state); 204 | extern size_t KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); 205 | extern void KeccakP1600times2_ARMv8Asha3_Permute_12rounds(void *state); 206 | extern void KT128_ARMv8Asha3_Process2Leaves(const unsigned char *input, unsigned char *output); 207 | extern void KT256_ARMv8Asha3_Process2Leaves(const unsigned char *input, unsigned char *output); 208 | 209 | /* ---------------------------------------------------------------- */ 210 | /* Dispatch functions for Keccak-p[1600] */ 211 | /* ---------------------------------------------------------------- */ 212 | 213 | const char * KeccakP1600_GetImplementation() { 214 | KangarooTwelve_SetArmProcessorCapabilities(); 215 | if (K12_enableARM_SHA3) { 216 | return "ARMv8-A+SHA3 optimized implementation"; 217 | } else { 218 | return "Generic ARM64 implementation"; 219 | } 220 | } 221 | 222 | void KeccakP1600_Initialize(void *state) { 223 | KangarooTwelve_SetArmProcessorCapabilities(); 224 | KeccakP1600_opt64_Initialize(state); // Both use same initialization 225 | } 226 | 227 | void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset) { 228 | KangarooTwelve_SetArmProcessorCapabilities(); 229 | KeccakP1600_opt64_AddByte(state, data, offset); // Both use same AddByte 230 | } 231 | 232 | void 
KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) { 233 | KangarooTwelve_SetArmProcessorCapabilities(); 234 | KeccakP1600_opt64_AddBytes(state, data, offset, length); // Both use same AddBytes 235 | } 236 | 237 | void KeccakP1600_Permute_12rounds(void *state) { 238 | KangarooTwelve_SetArmProcessorCapabilities(); 239 | if (K12_enableARM_SHA3) { 240 | KeccakP1600_ARMv8Asha3_Permute_12rounds(state); 241 | } else { 242 | KeccakP1600_opt64_Permute_12rounds(state); 243 | } 244 | } 245 | 246 | void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) { 247 | KangarooTwelve_SetArmProcessorCapabilities(); 248 | KeccakP1600_opt64_ExtractBytes(state, data, offset, length); // Both use same ExtractBytes 249 | } 250 | 251 | size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) { 252 | KangarooTwelve_SetArmProcessorCapabilities(); 253 | if (K12_enableARM_SHA3) { 254 | return KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); 255 | } else { 256 | return KeccakP1600_opt64_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); 257 | } 258 | } 259 | 260 | /* ---------------------------------------------------------------- */ 261 | /* Dispatch functions for Keccak-p[1600]×2 */ 262 | /* ---------------------------------------------------------------- */ 263 | 264 | int KeccakP1600times2_IsAvailable() { 265 | KangarooTwelve_SetArmProcessorCapabilities(); 266 | return K12_enableARM_SHA3; 267 | } 268 | 269 | const char * KeccakP1600times2_GetImplementation() { 270 | KangarooTwelve_SetArmProcessorCapabilities(); 271 | if (K12_enableARM_SHA3) { 272 | return "ARMv8-A+SHA3 optimized implementation"; 273 | } else { 274 | return ""; 275 | } 276 | } 277 | 278 | void KeccakP1600times2_Permute_12rounds(void *state) { 279 | KangarooTwelve_SetArmProcessorCapabilities(); 280 | if 
(K12_enableARM_SHA3) { 281 | KeccakP1600times2_ARMv8Asha3_Permute_12rounds(state); 282 | } 283 | } 284 | 285 | void KT128_Process2Leaves(const unsigned char *input, unsigned char *output) { 286 | KangarooTwelve_SetArmProcessorCapabilities(); 287 | if (K12_enableARM_SHA3) { 288 | KT128_ARMv8Asha3_Process2Leaves(input, output); 289 | } 290 | } 291 | 292 | void KT256_Process2Leaves(const unsigned char *input, unsigned char *output) { 293 | KangarooTwelve_SetArmProcessorCapabilities(); 294 | if (K12_enableARM_SHA3) { 295 | KT256_ARMv8Asha3_Process2Leaves(input, output); 296 | } 297 | } 298 | 299 | /* ---------------------------------------------------------------- */ 300 | /* Keccak-p[1600]×4 (not available on ARM) */ 301 | /* ---------------------------------------------------------------- */ 302 | 303 | int KeccakP1600times4_IsAvailable() { 304 | return 0; 305 | } 306 | 307 | const char * KeccakP1600times4_GetImplementation() { 308 | return ""; 309 | } 310 | 311 | void KT128_Process4Leaves(const unsigned char *input, unsigned char *output) { 312 | (void)input; 313 | (void)output; 314 | } 315 | 316 | void KT256_Process4Leaves(const unsigned char *input, unsigned char *output) { 317 | (void)input; 318 | (void)output; 319 | } 320 | 321 | /* ---------------------------------------------------------------- */ 322 | /* Keccak-p[1600]×8 (not available on ARM) */ 323 | /* ---------------------------------------------------------------- */ 324 | 325 | int KeccakP1600times8_IsAvailable() { 326 | return 0; 327 | } 328 | 329 | const char * KeccakP1600times8_GetImplementation() { 330 | return ""; 331 | } 332 | 333 | void KT128_Process8Leaves(const unsigned char *input, unsigned char *output) { 334 | (void)input; 335 | (void)output; 336 | } 337 | 338 | void KT256_Process8Leaves(const unsigned char *input, unsigned char *output) { 339 | (void)input; 340 | (void)output; 341 | } 342 | 343 | /* ---------------------------------------------------------------- */ 344 | /* Optional API 
for disabling CPU features */ 345 | /* ---------------------------------------------------------------- */ 346 | 347 | #ifdef KeccakP1600_enable_simd_options 348 | 349 | int KangarooTwelve_DisableNeon(void) { 350 | KangarooTwelve_SetArmProcessorCapabilities(); 351 | K12_NEON_requested_disabled = 1; 352 | if (K12_enableNEON) { 353 | KangarooTwelve_SetArmProcessorCapabilities(); 354 | return 1; // NEON was disabled on this call. 355 | } else { 356 | return 0; // Nothing changed. 357 | } 358 | } 359 | 360 | int KangarooTwelve_DisableArmSha3(void) { 361 | KangarooTwelve_SetArmProcessorCapabilities(); 362 | K12_ARM_SHA3_requested_disabled = 1; 363 | if (K12_enableARM_SHA3) { 364 | KangarooTwelve_SetArmProcessorCapabilities(); 365 | return 1; // ARM SHA3 was disabled on this call. 366 | } else { 367 | return 0; // Nothing changed. 368 | } 369 | } 370 | 371 | void KangarooTwelve_EnableAllArmCpuFeatures(void) { 372 | K12_NEON_requested_disabled = 0; 373 | K12_ARM_SHA3_requested_disabled = 0; 374 | KangarooTwelve_SetArmProcessorCapabilities(); 375 | } 376 | 377 | #endif // KeccakP1600_enable_simd_options 378 | 379 | #endif // !KeccakP1600_disableParallelism 380 | -------------------------------------------------------------------------------- /lib/Optimized64/KeccakP-1600-runtimeDispatch.c: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. 6 | 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 
14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | 16 | --- 17 | 18 | Please refer to the XKCP for more details. 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include "brg_endian.h" 25 | #include "KeccakP-1600-SnP.h" 26 | 27 | #ifdef KeccakP1600_disableParallelism 28 | #undef KeccakP1600_enable_simd_options 29 | #else 30 | 31 | // Forward declaration 32 | void KangarooTwelve_SetProcessorCapabilities(); 33 | #ifdef KeccakP1600_enable_simd_options 34 | int K12_SSSE3_requested_disabled = 0; 35 | int K12_AVX2_requested_disabled = 0; 36 | int K12_AVX512_requested_disabled = 0; 37 | #endif // KeccakP1600_enable_simd_options 38 | int K12_enableSSSE3 = 0; 39 | int K12_enableAVX2 = 0; 40 | int K12_enableAVX512 = 0; 41 | 42 | /* ---------------------------------------------------------------- */ 43 | 44 | void KT128_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output); 45 | void KT128_AVX512_Process2Leaves(const unsigned char *input, unsigned char *output); 46 | 47 | void KT256_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output); 48 | void KT256_AVX512_Process2Leaves(const unsigned char *input, unsigned char *output); 49 | 50 | int KeccakP1600times2_IsAvailable() 51 | { 52 | int result = 0; 53 | result |= K12_enableAVX512; 54 | result |= K12_enableSSSE3; 55 | return result; 56 | } 57 | 58 | const char * KeccakP1600times2_GetImplementation() 59 | { 60 | if (K12_enableAVX512) { 61 | return "AVX-512 implementation"; 62 | } else if (K12_enableSSSE3) { 63 | return "SSSE3 implementation"; 64 | } else { 65 | return ""; 66 | } 67 | } 68 | 69 | void KT128_Process2Leaves(const unsigned char *input, unsigned char *output) 70 | { 71 | if (K12_enableAVX512) { 72 | KT128_AVX512_Process2Leaves(input, output); 73 | } else if (K12_enableSSSE3) { 74 | KT128_SSSE3_Process2Leaves(input, output); 75 | } 76 | } 77 | 78 | void KT256_Process2Leaves(const unsigned char *input, unsigned char *output) 79 | { 80 | if (K12_enableAVX512) { 81 
| KT256_AVX512_Process2Leaves(input, output); 82 | } else if (K12_enableSSSE3) { 83 | KT256_SSSE3_Process2Leaves(input, output); 84 | } 85 | } 86 | 87 | 88 | void KT128_AVX2_Process4Leaves(const unsigned char *input, unsigned char *output); 89 | void KT128_AVX512_Process4Leaves(const unsigned char *input, unsigned char *output); 90 | 91 | void KT256_AVX2_Process4Leaves(const unsigned char *input, unsigned char *output); 92 | void KT256_AVX512_Process4Leaves(const unsigned char *input, unsigned char *output); 93 | 94 | int KeccakP1600times4_IsAvailable() 95 | { 96 | int result = 0; 97 | result |= K12_enableAVX512; 98 | result |= K12_enableAVX2; 99 | return result; 100 | } 101 | 102 | const char * KeccakP1600times4_GetImplementation() 103 | { 104 | if (K12_enableAVX512) { 105 | return "AVX-512 implementation"; 106 | } else if (K12_enableAVX2) { 107 | return "AVX2 implementation"; 108 | } else { 109 | return ""; 110 | } 111 | } 112 | 113 | void KT128_Process4Leaves(const unsigned char *input, unsigned char *output) 114 | { 115 | if (K12_enableAVX512) { 116 | KT128_AVX512_Process4Leaves(input, output); 117 | } else if (K12_enableAVX2) { 118 | KT128_AVX2_Process4Leaves(input, output); 119 | } 120 | } 121 | 122 | void KT256_Process4Leaves(const unsigned char *input, unsigned char *output) 123 | { 124 | if (K12_enableAVX512) { 125 | KT256_AVX512_Process4Leaves(input, output); 126 | } else if (K12_enableAVX2) { 127 | KT256_AVX2_Process4Leaves(input, output); 128 | } 129 | } 130 | 131 | void KT128_AVX512_Process8Leaves(const unsigned char *input, unsigned char *output); 132 | 133 | void KT256_AVX512_Process8Leaves(const unsigned char *input, unsigned char *output); 134 | 135 | int KeccakP1600times8_IsAvailable() 136 | { 137 | int result = 0; 138 | result |= K12_enableAVX512; 139 | return result; 140 | } 141 | 142 | const char * KeccakP1600times8_GetImplementation() 143 | { 144 | if (K12_enableAVX512) { 145 | return "AVX-512 implementation"; 146 | } else { 147 | return ""; 
148 | } 149 | } 150 | 151 | void KT128_Process8Leaves(const unsigned char *input, unsigned char *output) 152 | { 153 | if (K12_enableAVX512) 154 | KT128_AVX512_Process8Leaves(input, output); 155 | } 156 | 157 | void KT256_Process8Leaves(const unsigned char *input, unsigned char *output) 158 | { 159 | if (K12_enableAVX512) 160 | KT256_AVX512_Process8Leaves(input, output); 161 | } 162 | 163 | #endif // KeccakP1600_disableParallelism 164 | 165 | const char * KeccakP1600_GetImplementation() 166 | { 167 | if (K12_enableAVX512) 168 | return "AVX-512 implementation"; 169 | else 170 | #ifndef KeccakP1600_noAssembly 171 | if (K12_enableAVX2) 172 | return "AVX2 implementation"; 173 | else 174 | #endif 175 | return "generic 64-bit implementation"; 176 | } 177 | 178 | void KeccakP1600_Initialize(void *state) 179 | { 180 | KangarooTwelve_SetProcessorCapabilities(); 181 | if (K12_enableAVX512) 182 | KeccakP1600_AVX512_Initialize(state); 183 | else 184 | #ifndef KeccakP1600_noAssembly 185 | if (K12_enableAVX2) 186 | KeccakP1600_AVX2_Initialize(state); 187 | else 188 | #endif 189 | KeccakP1600_opt64_Initialize(state); 190 | } 191 | 192 | void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset) 193 | { 194 | if (K12_enableAVX512) 195 | ((unsigned char*)(state))[offset] ^= data; 196 | else 197 | #ifndef KeccakP1600_noAssembly 198 | if (K12_enableAVX2) 199 | KeccakP1600_AVX2_AddByte(state, data, offset); 200 | else 201 | #endif 202 | KeccakP1600_opt64_AddByte(state, data, offset); 203 | } 204 | 205 | void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) 206 | { 207 | if (K12_enableAVX512) 208 | KeccakP1600_AVX512_AddBytes(state, data, offset, length); 209 | else 210 | #ifndef KeccakP1600_noAssembly 211 | if (K12_enableAVX2) 212 | KeccakP1600_AVX2_AddBytes(state, data, offset, length); 213 | else 214 | #endif 215 | KeccakP1600_opt64_AddBytes(state, data, offset, length); 216 | } 217 | 218 | void 
KeccakP1600_Permute_12rounds(void *state) 219 | { 220 | if (K12_enableAVX512) 221 | KeccakP1600_AVX512_Permute_12rounds(state); 222 | else 223 | #ifndef KeccakP1600_noAssembly 224 | if (K12_enableAVX2) 225 | KeccakP1600_AVX2_Permute_12rounds(state); 226 | else 227 | #endif 228 | KeccakP1600_opt64_Permute_12rounds(state); 229 | } 230 | 231 | void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) 232 | { 233 | if (K12_enableAVX512) 234 | KeccakP1600_AVX512_ExtractBytes(state, data, offset, length); 235 | else 236 | #ifndef KeccakP1600_noAssembly 237 | if (K12_enableAVX2) 238 | KeccakP1600_AVX2_ExtractBytes(state, data, offset, length); 239 | else 240 | #endif 241 | KeccakP1600_opt64_ExtractBytes(state, data, offset, length); 242 | } 243 | 244 | size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) 245 | { 246 | if (K12_enableAVX512) 247 | return KeccakP1600_AVX512_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); 248 | else 249 | #ifndef KeccakP1600_noAssembly 250 | if (K12_enableAVX2) 251 | return KeccakP1600_AVX2_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); 252 | else 253 | #endif 254 | return KeccakP1600_opt64_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); 255 | } 256 | 257 | /* ---------------------------------------------------------------- */ 258 | 259 | /* Processor capability detection code by Samuel Neves and Jack O'Connor, see 260 | * https://github.com/BLAKE3-team/BLAKE3/blob/master/c/blake3_dispatch.c 261 | */ 262 | 263 | #if defined(__x86_64__) || defined(_M_X64) 264 | #define IS_X86 265 | #define IS_X86_64 266 | #endif 267 | 268 | #if defined(__i386__) || defined(_M_IX86) 269 | #define IS_X86 270 | #define IS_X86_32 271 | #endif 272 | 273 | #if defined(IS_X86) 274 | static uint64_t xgetbv() { 275 | #if defined(_MSC_VER) 276 | return _xgetbv(0); 277 | #else 278 | uint32_t eax = 
0, edx = 0; 279 | __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); 280 | return ((uint64_t)edx << 32) | eax; 281 | #endif 282 | } 283 | 284 | static void cpuid(uint32_t out[4], uint32_t id) { 285 | #if defined(_MSC_VER) 286 | __cpuid((int *)out, id); 287 | #elif defined(__i386__) || defined(_M_IX86) 288 | __asm__ __volatile__("movl %%ebx, %1\n" 289 | "cpuid\n" 290 | "xchgl %1, %%ebx\n" 291 | : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) 292 | : "a"(id)); 293 | #else 294 | __asm__ __volatile__("cpuid\n" 295 | : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) 296 | : "a"(id)); 297 | #endif 298 | } 299 | 300 | static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { 301 | #if defined(_MSC_VER) 302 | __cpuidex((int *)out, id, sid); 303 | #elif defined(__i386__) || defined(_M_IX86) 304 | __asm__ __volatile__("movl %%ebx, %1\n" 305 | "cpuid\n" 306 | "xchgl %1, %%ebx\n" 307 | : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) 308 | : "a"(id), "c"(sid)); 309 | #else 310 | __asm__ __volatile__("cpuid\n" 311 | : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) 312 | : "a"(id), "c"(sid)); 313 | #endif 314 | } 315 | 316 | #endif 317 | 318 | enum cpu_feature { 319 | SSE2 = 1 << 0, 320 | SSSE3 = 1 << 1, 321 | SSE41 = 1 << 2, 322 | AVX = 1 << 3, 323 | AVX2 = 1 << 4, 324 | AVX512F = 1 << 5, 325 | AVX512VL = 1 << 6, 326 | /* ... 
*/ 327 | UNDEFINED = 1 << 30 328 | }; 329 | 330 | static enum cpu_feature g_cpu_features = UNDEFINED; 331 | 332 | static enum cpu_feature 333 | get_cpu_features(void) { 334 | 335 | if (g_cpu_features != UNDEFINED) { 336 | return g_cpu_features; 337 | } else { 338 | #if defined(IS_X86) 339 | uint32_t regs[4] = {0}; 340 | uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; 341 | (void)edx; 342 | enum cpu_feature features = 0; 343 | cpuid(regs, 0); 344 | const int max_id = *eax; 345 | cpuid(regs, 1); 346 | #if defined(__amd64__) || defined(_M_X64) 347 | features |= SSE2; 348 | #else 349 | if (*edx & (1UL << 26)) 350 | features |= SSE2; 351 | #endif 352 | if (*ecx & (1UL << 9)) 353 | features |= SSSE3; 354 | if (*ecx & (1UL << 19)) 355 | features |= SSE41; 356 | 357 | if (*ecx & (1UL << 27)) { // OSXSAVE 358 | const uint64_t mask = xgetbv(); 359 | if ((mask & 6) == 6) { // SSE and AVX states 360 | if (*ecx & (1UL << 28)) 361 | features |= AVX; 362 | if (max_id >= 7) { 363 | cpuidex(regs, 7, 0); 364 | if (*ebx & (1UL << 5)) 365 | features |= AVX2; 366 | if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm 367 | if (*ebx & (1UL << 31)) 368 | features |= AVX512VL; 369 | if (*ebx & (1UL << 16)) 370 | features |= AVX512F; 371 | } 372 | } 373 | } 374 | } 375 | g_cpu_features = features; 376 | return features; 377 | #else 378 | /* How to detect NEON? 
*/ 379 | return 0; 380 | #endif 381 | } 382 | } 383 | 384 | void KangarooTwelve_SetProcessorCapabilities() 385 | { 386 | enum cpu_feature features = get_cpu_features(); 387 | K12_enableSSSE3 = (features & SSSE3); 388 | K12_enableAVX2 = (features & AVX2); 389 | K12_enableAVX512 = (features & AVX512F) && (features & AVX512VL); 390 | #ifdef KeccakP1600_enable_simd_options 391 | K12_enableSSSE3 = K12_enableSSSE3 && !K12_SSSE3_requested_disabled; 392 | K12_enableAVX2 = K12_enableAVX2 && !K12_AVX2_requested_disabled; 393 | K12_enableAVX512 = K12_enableAVX512 && !K12_AVX512_requested_disabled; 394 | #endif // KeccakP1600_enable_simd_options 395 | } 396 | 397 | #ifdef KeccakP1600_enable_simd_options 398 | int KangarooTwelve_DisableSSSE3(void) { 399 | KangarooTwelve_SetProcessorCapabilities(); 400 | K12_SSSE3_requested_disabled = 1; 401 | if (K12_enableSSSE3) { 402 | KangarooTwelve_SetProcessorCapabilities(); 403 | return 1; // SSSE3 was disabled on this call. 404 | } else { 405 | return 0; // Nothing changed. 406 | } 407 | } 408 | 409 | int KangarooTwelve_DisableAVX2(void) { 410 | KangarooTwelve_SetProcessorCapabilities(); 411 | K12_AVX2_requested_disabled = 1; 412 | if (K12_enableAVX2) { 413 | KangarooTwelve_SetProcessorCapabilities(); 414 | return 1; // AVX2 was disabled on this call. 415 | } else { 416 | return 0; // Nothing changed. 417 | } 418 | } 419 | 420 | int KangarooTwelve_DisableAVX512(void) { 421 | KangarooTwelve_SetProcessorCapabilities(); 422 | K12_AVX512_requested_disabled = 1; 423 | if (K12_enableAVX512) { 424 | KangarooTwelve_SetProcessorCapabilities(); 425 | return 1; // AVX512 was disabled on this call. 426 | } else { 427 | return 0; // Nothing changed. 
428 | } 429 | } 430 | 431 | void KangarooTwelve_EnableAllCpuFeatures(void) { 432 | K12_SSSE3_requested_disabled = 0; 433 | K12_AVX2_requested_disabled = 0; 434 | K12_AVX512_requested_disabled = 0; 435 | KangarooTwelve_SetProcessorCapabilities(); 436 | } 437 | #endif // KeccakP1600_enable_simd_options 438 | -------------------------------------------------------------------------------- /lib/KangarooTwelve-threading.c: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. 6 | 7 | Threading support implementation using portable thread pool abstraction. 8 | 9 | To the extent possible under law, the implementer has waived all copyright 10 | and related or neighboring rights to the source code in this file. 11 | http://creativecommons.org/publicdomain/zero/1.0/ 12 | */ 13 | 14 | #include "KangarooTwelve-threading.h" 15 | #include "KangarooTwelve.h" 16 | #include "KT-threadpool.h" 17 | #include "KeccakP-1600-SnP.h" 18 | #include 19 | #include 20 | 21 | /* Constants from KangarooTwelve.c */ 22 | #define K12_chunkSize 8192 23 | #define K12_suffixLeaf 0x0B 24 | #define KT128_capacityInBytes 32 25 | #define KT256_capacityInBytes 64 26 | 27 | /* Thread pool configuration */ 28 | #define MAX_THREADS 64 29 | #define MIN_CHUNKS_PER_THREAD 8 30 | 31 | /* Batch-based work distribution */ 32 | #define CHUNKS_PER_BATCH 32 /* 32 chunks = 256 KB per batch - matches Zig */ 33 | #define MAX_BATCHES 256 /* Maximum concurrent batches */ 34 | 35 | /* SIMD leaf processing functions (from KeccakP-1600-runtimeDispatch.c) */ 36 | #ifndef KeccakP1600_disableParallelism 37 | void KT128_Process2Leaves(const unsigned char *input, unsigned char *output); 38 | void KT128_Process4Leaves(const unsigned char *input, unsigned char *output); 39 | 
void KT128_Process8Leaves(const unsigned char *input, unsigned char *output); 40 | void KT256_Process2Leaves(const unsigned char *input, unsigned char *output); 41 | void KT256_Process4Leaves(const unsigned char *input, unsigned char *output); 42 | void KT256_Process8Leaves(const unsigned char *input, unsigned char *output); 43 | int KeccakP1600times2_IsAvailable(void); 44 | int KeccakP1600times4_IsAvailable(void); 45 | int KeccakP1600times8_IsAvailable(void); 46 | #endif 47 | 48 | /* Work item for chunk processing */ 49 | typedef struct { 50 | const unsigned char *input; 51 | size_t start_chunk; 52 | size_t end_chunk; 53 | unsigned char *output; 54 | int security_level; 55 | int capacity_bytes; 56 | } ChunkWork; 57 | 58 | /* TurboSHAKE instance for local use */ 59 | typedef struct { 60 | uint8_t state[KeccakP1600_stateSizeInBytes]; 61 | unsigned int rate; 62 | uint8_t byteIOIndex; 63 | uint8_t squeezing; 64 | } TurboSHAKE_Instance_Local; 65 | 66 | /* Forward declarations */ 67 | static void process_chunk_range(void *work_ptr); 68 | static void TurboSHAKE_Initialize_Local(TurboSHAKE_Instance_Local *instance, unsigned int capacity); 69 | static void TurboSHAKE_Absorb_Local(TurboSHAKE_Instance_Local *instance, const unsigned char *data, size_t dataByteLen); 70 | static void TurboSHAKE_AbsorbDomainSeparationByte_Local(TurboSHAKE_Instance_Local *instance, unsigned char D); 71 | static void TurboSHAKE_Squeeze_Local(TurboSHAKE_Instance_Local *instance, unsigned char *data, size_t dataByteLen); 72 | 73 | /* TurboSHAKE helper functions for thread-local processing */ 74 | static void TurboSHAKE_Initialize_Local(TurboSHAKE_Instance_Local *instance, unsigned int capacity) 75 | { 76 | KeccakP1600_Initialize(instance->state); 77 | instance->rate = 1600 - capacity; 78 | instance->byteIOIndex = 0; 79 | instance->squeezing = 0; 80 | } 81 | 82 | static void TurboSHAKE_Absorb_Local(TurboSHAKE_Instance_Local *instance, const unsigned char *data, size_t dataByteLen) 83 | { 84 | size_t 
i, j; 85 | uint8_t partialBlock; 86 | const unsigned char *curData; 87 | const uint8_t rateInBytes = instance->rate/8; 88 | 89 | i = 0; 90 | curData = data; 91 | while(i < dataByteLen) { 92 | if ((instance->byteIOIndex == 0) && (dataByteLen-i >= rateInBytes)) { 93 | #ifdef KeccakP1600_12rounds_FastLoop_supported 94 | j = KeccakP1600_12rounds_FastLoop_Absorb(instance->state, instance->rate/64, curData, dataByteLen - i); 95 | i += j; 96 | curData += j; 97 | #endif 98 | for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) { 99 | KeccakP1600_AddBytes(instance->state, curData, 0, rateInBytes); 100 | KeccakP1600_Permute_12rounds(instance->state); 101 | curData+=rateInBytes; 102 | } 103 | i = dataByteLen - j; 104 | } else { 105 | if (dataByteLen - i > (size_t)rateInBytes - instance->byteIOIndex) { 106 | partialBlock = rateInBytes-instance->byteIOIndex; 107 | } else { 108 | partialBlock = (uint8_t)(dataByteLen - i); 109 | } 110 | i += partialBlock; 111 | 112 | KeccakP1600_AddBytes(instance->state, curData, instance->byteIOIndex, partialBlock); 113 | curData += partialBlock; 114 | instance->byteIOIndex += partialBlock; 115 | if (instance->byteIOIndex == rateInBytes) { 116 | KeccakP1600_Permute_12rounds(instance->state); 117 | instance->byteIOIndex = 0; 118 | } 119 | } 120 | } 121 | } 122 | 123 | static void TurboSHAKE_AbsorbDomainSeparationByte_Local(TurboSHAKE_Instance_Local *instance, unsigned char D) 124 | { 125 | const unsigned int rateInBytes = instance->rate/8; 126 | 127 | KeccakP1600_AddByte(instance->state, D, instance->byteIOIndex); 128 | if ((D >= 0x80) && (instance->byteIOIndex == (rateInBytes-1))) 129 | KeccakP1600_Permute_12rounds(instance->state); 130 | KeccakP1600_AddByte(instance->state, 0x80, rateInBytes-1); 131 | KeccakP1600_Permute_12rounds(instance->state); 132 | instance->byteIOIndex = 0; 133 | instance->squeezing = 1; 134 | } 135 | 136 | static void TurboSHAKE_Squeeze_Local(TurboSHAKE_Instance_Local *instance, unsigned char *data, size_t dataByteLen) 
137 | { 138 | size_t i, j; 139 | unsigned int partialBlock; 140 | const unsigned int rateInBytes = instance->rate/8; 141 | unsigned char *curData; 142 | 143 | if (!instance->squeezing) 144 | TurboSHAKE_AbsorbDomainSeparationByte_Local(instance, 0x01); 145 | 146 | i = 0; 147 | curData = data; 148 | while(i < dataByteLen) { 149 | if ((instance->byteIOIndex == rateInBytes) && (dataByteLen-i >= rateInBytes)) { 150 | for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) { 151 | KeccakP1600_Permute_12rounds(instance->state); 152 | KeccakP1600_ExtractBytes(instance->state, curData, 0, rateInBytes); 153 | curData+=rateInBytes; 154 | } 155 | i = dataByteLen - j; 156 | } else { 157 | if (instance->byteIOIndex == rateInBytes) { 158 | KeccakP1600_Permute_12rounds(instance->state); 159 | instance->byteIOIndex = 0; 160 | } 161 | if (dataByteLen-i > rateInBytes-instance->byteIOIndex) 162 | partialBlock = rateInBytes-instance->byteIOIndex; 163 | else 164 | partialBlock = (unsigned int)(dataByteLen - i); 165 | i += partialBlock; 166 | 167 | KeccakP1600_ExtractBytes(instance->state, curData, instance->byteIOIndex, partialBlock); 168 | curData += partialBlock; 169 | instance->byteIOIndex += partialBlock; 170 | } 171 | } 172 | } 173 | 174 | /* Process a single chunk without SIMD (scalar fallback) */ 175 | static void process_single_chunk(const unsigned char *input, unsigned char *output, 176 | int security_level, int capacity_bytes) 177 | { 178 | TurboSHAKE_Instance_Local queueNode; 179 | 180 | /* Initialize TurboSHAKE for this chunk */ 181 | TurboSHAKE_Initialize_Local(&queueNode, 2 * security_level); 182 | 183 | /* Absorb the chunk */ 184 | TurboSHAKE_Absorb_Local(&queueNode, input, K12_chunkSize); 185 | 186 | /* Finalize with domain separation */ 187 | TurboSHAKE_AbsorbDomainSeparationByte_Local(&queueNode, K12_suffixLeaf); 188 | 189 | /* Squeeze out the chaining value */ 190 | TurboSHAKE_Squeeze_Local(&queueNode, output, capacity_bytes); 191 | } 192 | 193 | /* Process a range of 
chunks - adapted to work as thread pool job */ 194 | static void process_chunk_range(void *work_ptr) 195 | { 196 | ChunkWork *work = (ChunkWork *)work_ptr; 197 | const unsigned char *chunk_ptr = work->input + (work->start_chunk * K12_chunkSize); 198 | unsigned char *output_ptr = work->output + (work->start_chunk * work->capacity_bytes); 199 | size_t chunks_remaining = work->end_chunk - work->start_chunk; 200 | 201 | #ifndef KeccakP1600_disableParallelism 202 | /* Use SIMD parallelism when available - process 8/4/2 chunks at a time */ 203 | if (work->security_level == 128) { 204 | /* KT128 mode */ 205 | if (KeccakP1600times8_IsAvailable()) { 206 | while (chunks_remaining >= 8) { 207 | KT128_Process8Leaves(chunk_ptr, output_ptr); 208 | chunk_ptr += 8 * K12_chunkSize; 209 | output_ptr += 8 * KT128_capacityInBytes; 210 | chunks_remaining -= 8; 211 | } 212 | } 213 | 214 | if (KeccakP1600times4_IsAvailable()) { 215 | while (chunks_remaining >= 4) { 216 | KT128_Process4Leaves(chunk_ptr, output_ptr); 217 | chunk_ptr += 4 * K12_chunkSize; 218 | output_ptr += 4 * KT128_capacityInBytes; 219 | chunks_remaining -= 4; 220 | } 221 | } 222 | 223 | if (KeccakP1600times2_IsAvailable()) { 224 | while (chunks_remaining >= 2) { 225 | KT128_Process2Leaves(chunk_ptr, output_ptr); 226 | chunk_ptr += 2 * K12_chunkSize; 227 | output_ptr += 2 * KT128_capacityInBytes; 228 | chunks_remaining -= 2; 229 | } 230 | } 231 | } else { 232 | /* KT256 mode */ 233 | if (KeccakP1600times8_IsAvailable()) { 234 | while (chunks_remaining >= 8) { 235 | KT256_Process8Leaves(chunk_ptr, output_ptr); 236 | chunk_ptr += 8 * K12_chunkSize; 237 | output_ptr += 8 * KT256_capacityInBytes; 238 | chunks_remaining -= 8; 239 | } 240 | } 241 | 242 | if (KeccakP1600times4_IsAvailable()) { 243 | while (chunks_remaining >= 4) { 244 | KT256_Process4Leaves(chunk_ptr, output_ptr); 245 | chunk_ptr += 4 * K12_chunkSize; 246 | output_ptr += 4 * KT256_capacityInBytes; 247 | chunks_remaining -= 4; 248 | } 249 | } 250 | 251 | if 
(KeccakP1600times2_IsAvailable()) { 252 | while (chunks_remaining >= 2) { 253 | KT256_Process2Leaves(chunk_ptr, output_ptr); 254 | chunk_ptr += 2 * K12_chunkSize; 255 | output_ptr += 2 * KT256_capacityInBytes; 256 | chunks_remaining -= 2; 257 | } 258 | } 259 | } 260 | #endif /* KeccakP1600_disableParallelism */ 261 | 262 | /* Process any remaining chunks with scalar code */ 263 | while (chunks_remaining > 0) { 264 | process_single_chunk(chunk_ptr, output_ptr, 265 | work->security_level, work->capacity_bytes); 266 | chunk_ptr += K12_chunkSize; 267 | output_ptr += work->capacity_bytes; 268 | chunks_remaining--; 269 | } 270 | } 271 | 272 | /* Main function to process chunks in parallel 273 | * 274 | * Each batch is CHUNKS_PER_BATCH chunks (256 KB), which: 275 | * - Is large enough to amortize task scheduling overhead 276 | * - Is small enough to allow good load balancing via work-stealing 277 | */ 278 | int KT_ProcessChunksThreaded(const KT_ThreadPool_API* threadpool_api, 279 | void* threadpool_handle, 280 | int thread_count, 281 | const unsigned char *input, 282 | size_t chunkCount, 283 | unsigned char *output, 284 | int securityLevel) 285 | { 286 | if (chunkCount == 0) 287 | return 1; 288 | 289 | /* No threading configured - should not happen, but handle gracefully */ 290 | if (!threadpool_api || !threadpool_handle || thread_count < 1) 291 | return 1; 292 | 293 | /* Determine capacity in bytes */ 294 | int capacity_bytes = (securityLevel == 128) ? 
KT128_capacityInBytes : KT256_capacityInBytes; 295 | 296 | /* Calculate number of batches needed */ 297 | size_t num_batches = (chunkCount + CHUNKS_PER_BATCH - 1) / CHUNKS_PER_BATCH; 298 | 299 | /* If only one batch worth of work, process sequentially */ 300 | if (num_batches <= 1) { 301 | ChunkWork work; 302 | work.input = input; 303 | work.start_chunk = 0; 304 | work.end_chunk = chunkCount; 305 | work.output = output; 306 | work.security_level = securityLevel; 307 | work.capacity_bytes = capacity_bytes; 308 | process_chunk_range(&work); 309 | return 0; 310 | } 311 | 312 | /* Cap number of batches to avoid excessive overhead */ 313 | if (num_batches > MAX_BATCHES) 314 | num_batches = MAX_BATCHES; 315 | 316 | /* Allocate work items for batch processing */ 317 | ChunkWork* work_items = (ChunkWork*)malloc(num_batches * sizeof(ChunkWork)); 318 | if (!work_items) 319 | return 1; 320 | 321 | /* Distribute chunks evenly across batches 322 | * When batch count is capped, ensure balanced distribution 323 | * instead of giving all remaining chunks to the last batch. 324 | */ 325 | size_t chunks_per_batch = chunkCount / num_batches; 326 | size_t extra_chunks = chunkCount % num_batches; 327 | size_t current_chunk = 0; 328 | 329 | for (size_t i = 0; i < num_batches; i++) { 330 | /* First 'extra_chunks' batches get one additional chunk */ 331 | size_t batch_chunks = chunks_per_batch + (i < extra_chunks ? 
1 : 0); 332 | 333 | work_items[i].input = input; 334 | work_items[i].start_chunk = current_chunk; 335 | work_items[i].end_chunk = current_chunk + batch_chunks; 336 | work_items[i].output = output; 337 | work_items[i].security_level = securityLevel; 338 | work_items[i].capacity_bytes = capacity_bytes; 339 | 340 | /* Submit batch to thread pool */ 341 | if (threadpool_api->submit(threadpool_handle, process_chunk_range, 342 | &work_items[i]) != 0) { 343 | free(work_items); 344 | return 1; 345 | } 346 | 347 | current_chunk += batch_chunks; 348 | } 349 | 350 | /* Wait for all batches to complete */ 351 | threadpool_api->wait_all(threadpool_handle); 352 | 353 | free(work_items); 354 | return 0; 355 | } 356 | -------------------------------------------------------------------------------- /tests/testKangarooTwelve.c: -------------------------------------------------------------------------------- 1 | /* 2 | K12 based on the eXtended Keccak Code Package (XKCP) 3 | https://github.com/XKCP/XKCP 4 | 5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. 6 | 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 8 | 9 | For more information, feedback or questions, please refer to the Keccak Team website: 10 | https://keccak.team/ 11 | 12 | To the extent possible under law, the implementer has waived all copyright 13 | and related or neighboring rights to the source code in this file. 
14 | http://creativecommons.org/publicdomain/zero/1.0/ 15 | */ 16 | 17 | #include "KangarooTwelve.h" 18 | #include "KeccakP-1600-SnP.h" 19 | 20 | /* #define OUTPUT */ 21 | /* #define VERBOSE */ 22 | 23 | #define SnP_width 1600 24 | #define inputByteSize (80*1024) 25 | #define outputByteSize 256 26 | #define customizationByteSize 32 27 | #define checksumByteSize 16 28 | #define cChunkSize 8192 29 | 30 | #if !defined(__x86_64__) && !defined(_M_X64) && !defined(__i386__) && !defined(_M_IX86) 31 | #undef KeccakP1600_enable_simd_options 32 | #endif 33 | 34 | #if (defined(OUTPUT) || defined(VERBOSE) || !defined(EMBEDDED)) 35 | #include 36 | #endif 37 | #include 38 | #include 39 | #include 40 | 41 | #if defined(EMBEDDED) 42 | static void assert(int condition) 43 | { 44 | if (!condition) 45 | { 46 | for ( ; ; ) ; 47 | } 48 | } 49 | #else 50 | #include 51 | #endif 52 | 53 | static void generateSimpleRawMaterial(unsigned char* data, unsigned int length, unsigned char seed1, unsigned int seed2) 54 | { 55 | unsigned int i; 56 | 57 | for(i=0; i> (8-seed2)); 62 | byte = seed1 + 161*length - iRolled + i; 63 | data[i] = byte; 64 | } 65 | } 66 | 67 | static void performTestKangarooTwelveOneInput(int securityLevel, unsigned int inputLen, unsigned int outputLen, unsigned int customLen, KangarooTwelve_Instance *pSpongeChecksum, unsigned int mode, unsigned int useSqueeze) 68 | { 69 | unsigned char input[inputByteSize]; 70 | unsigned char output[outputByteSize]; 71 | unsigned char customization[customizationByteSize]; 72 | int result; 73 | unsigned int i; 74 | 75 | generateSimpleRawMaterial(customization, customizationByteSize, customLen, 97); 76 | generateSimpleRawMaterial(input, inputLen, outputLen, inputLen + customLen); 77 | 78 | #ifdef VERBOSE 79 | printf("outputLen %5u, inputLen %5u, customLen %3u\n", outputLen, inputLen, customLen); 80 | #endif 81 | if (!useSqueeze) 82 | { 83 | if (mode == 0) 84 | { 85 | /* Input/Output full size in one call */ 86 | result = 
KangarooTwelve(securityLevel, input, inputLen, output, outputLen, customization, customLen); 87 | assert(result == 0); 88 | } 89 | else if (mode == 1) 90 | { 91 | /* Input/Output one byte per call */ 92 | KangarooTwelve_Instance kt; 93 | result = KangarooTwelve_Initialize(&kt, securityLevel, outputLen); 94 | assert(result == 0); 95 | for (i = 0; i < inputLen; ++i) 96 | { 97 | result = KangarooTwelve_Update(&kt, input + i, 1); 98 | assert(result == 0); 99 | } 100 | result = KangarooTwelve_Final(&kt, output, customization, customLen); 101 | assert(result == 0); 102 | } 103 | else if (mode == 2) 104 | { 105 | /* Input/Output random number of bytes per call */ 106 | KangarooTwelve_Instance kt; 107 | unsigned char *pInput = input; 108 | result = KangarooTwelve_Initialize(&kt, securityLevel, outputLen); 109 | assert(result == 0); 110 | while (inputLen) 111 | { 112 | unsigned int len = ((rand() * 32768) + rand()) % (inputLen + 1); 113 | result = KangarooTwelve_Update(&kt, pInput, len); 114 | assert(result == 0); 115 | pInput += len; 116 | inputLen -= len; 117 | } 118 | result = KangarooTwelve_Final(&kt, output, customization, customLen); 119 | assert(result == 0); 120 | } 121 | } 122 | else 123 | { 124 | if (mode == 0) 125 | { 126 | KangarooTwelve_Instance kt; 127 | result = KangarooTwelve_Initialize(&kt, securityLevel, 0); 128 | assert(result == 0); 129 | result = KangarooTwelve_Update(&kt, input, inputLen); 130 | assert(result == 0); 131 | result = KangarooTwelve_Final(&kt, 0, customization, customLen); 132 | assert(result == 0); 133 | result = KangarooTwelve_Squeeze(&kt, output, outputLen); 134 | assert(result == 0); 135 | } 136 | else if (mode == 1) 137 | { 138 | KangarooTwelve_Instance kt; 139 | result = KangarooTwelve_Initialize(&kt, securityLevel, 0); 140 | assert(result == 0); 141 | result = KangarooTwelve_Update(&kt, input, inputLen); 142 | assert(result == 0); 143 | result = KangarooTwelve_Final(&kt, 0, customization, customLen); 144 | assert(result == 0); 145 | 
146 | for (i = 0; i < outputLen; ++i) 147 | { 148 | result = KangarooTwelve_Squeeze(&kt, output + i, 1); 149 | assert(result == 0); 150 | } 151 | } 152 | else if (mode == 2) 153 | { 154 | KangarooTwelve_Instance kt; 155 | unsigned int len; 156 | result = KangarooTwelve_Initialize(&kt, securityLevel, 0); 157 | assert(result == 0); 158 | result = KangarooTwelve_Update(&kt, input, inputLen); 159 | assert(result == 0); 160 | result = KangarooTwelve_Final(&kt, 0, customization, customLen); 161 | assert(result == 0); 162 | 163 | for (i = 0; i < outputLen; i += len) 164 | { 165 | len = ((rand() << 15) ^ rand()) % ((outputLen-i) + 1); 166 | result = KangarooTwelve_Squeeze(&kt, output+i, len); 167 | assert(result == 0); 168 | } 169 | } 170 | } 171 | 172 | #ifdef VERBOSE 173 | { 174 | unsigned int i; 175 | 176 | printf("KT%d\n", securityLevel); 177 | printf("Input of %u bytes:", inputLen); 178 | for(i=0; (i 16) 181 | printf(" ..."); 182 | printf("\n"); 183 | printf("Output of %u bytes:", outputLen); 184 | for(i=0; i= cChunkSize*2)) ? 32 : 1))) { 206 | assert(inputLen <= inputByteSize); 207 | performTestKangarooTwelveOneInput(securityLevel, inputLen, outputLen, customLen, &spongeChecksum, mode, useSqueeze); 208 | } 209 | } 210 | 211 | for(outputLen = 128/8; outputLen <= 512/8; outputLen <<= 1) 212 | for(inputLen = 0; inputLen <= (3*cChunkSize) && inputLen <= inputByteSize; inputLen = inputLen ? 
(inputLen + 167) : 1) 213 | for(customLen = 0; customLen <= customizationByteSize; customLen += 7) 214 | { 215 | assert(inputLen <= inputByteSize); 216 | performTestKangarooTwelveOneInput(securityLevel, inputLen, outputLen, customLen, &spongeChecksum, 0, useSqueeze); 217 | } 218 | KangarooTwelve_Final(&spongeChecksum, 0, (const unsigned char *)"", 0); 219 | KangarooTwelve_Squeeze(&spongeChecksum, checksum, checksumByteSize); 220 | 221 | #ifdef VERBOSE 222 | { 223 | unsigned int i; 224 | printf("KT%d\n", securityLevel); 225 | printf("Checksum: "); 226 | for(i=0; i 22 | #include 23 | #include "KeccakP-1600-SnP.h" 24 | #include "align.h" 25 | 26 | #define KeccakP1600times2_SSSE3_unrolling 2 27 | 28 | #define SSSE3alignment 16 29 | 30 | #define ANDnu128(a, b) _mm_andnot_si128(a, b) 31 | #define CONST128(a) _mm_load_si128((const __m128i *)&(a)) 32 | #define LOAD128(a) _mm_load_si128((const __m128i *)&(a)) 33 | #define LOAD6464(a, b) _mm_set_epi64x(a, b) 34 | #define CONST128_64(a) _mm_set1_epi64x(a) 35 | #define ROL64in128(a, o) _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o))) 36 | #define ROL64in128_8(a) _mm_shuffle_epi8(a, CONST128(rho8)) 37 | #define ROL64in128_56(a) _mm_shuffle_epi8(a, CONST128(rho56)) 38 | static const uint64_t rho8[2] = {0x0605040302010007, 0x0E0D0C0B0A09080F}; 39 | static const uint64_t rho56[2] = {0x0007060504030201, 0x080F0E0D0C0B0A09}; 40 | #define STORE128(a, b) _mm_store_si128((__m128i *)&(a), b) 41 | #define STORE128u(a, b) _mm_storeu_si128((__m128i *)&(a), b) 42 | #define XOR128(a, b) _mm_xor_si128(a, b) 43 | #define XOReq128(a, b) a = _mm_xor_si128(a, b) 44 | #define UNPACKL( a, b ) _mm_unpacklo_epi64((a), (b)) 45 | #define UNPACKH( a, b ) _mm_unpackhi_epi64((a), (b)) 46 | #define ZERO() _mm_setzero_si128() 47 | 48 | static ALIGN(SSSE3alignment) const uint64_t KeccakP1600RoundConstants[24] = { 49 | 0x0000000000000001ULL, 50 | 0x0000000000008082ULL, 51 | 0x800000000000808aULL, 52 | 0x8000000080008000ULL, 53 | 
0x000000000000808bULL, 54 | 0x0000000080000001ULL, 55 | 0x8000000080008081ULL, 56 | 0x8000000000008009ULL, 57 | 0x000000000000008aULL, 58 | 0x0000000000000088ULL, 59 | 0x0000000080008009ULL, 60 | 0x000000008000000aULL, 61 | 0x000000008000808bULL, 62 | 0x800000000000008bULL, 63 | 0x8000000000008089ULL, 64 | 0x8000000000008003ULL, 65 | 0x8000000000008002ULL, 66 | 0x8000000000000080ULL, 67 | 0x000000000000800aULL, 68 | 0x800000008000000aULL, 69 | 0x8000000080008081ULL, 70 | 0x8000000000008080ULL, 71 | 0x0000000080000001ULL, 72 | 0x8000000080008008ULL}; 73 | 74 | #define declareABCDE \ 75 | __m128i Aba, Abe, Abi, Abo, Abu; \ 76 | __m128i Aga, Age, Agi, Ago, Agu; \ 77 | __m128i Aka, Ake, Aki, Ako, Aku; \ 78 | __m128i Ama, Ame, Ami, Amo, Amu; \ 79 | __m128i Asa, Ase, Asi, Aso, Asu; \ 80 | __m128i Bba, Bbe, Bbi, Bbo, Bbu; \ 81 | __m128i Bga, Bge, Bgi, Bgo, Bgu; \ 82 | __m128i Bka, Bke, Bki, Bko, Bku; \ 83 | __m128i Bma, Bme, Bmi, Bmo, Bmu; \ 84 | __m128i Bsa, Bse, Bsi, Bso, Bsu; \ 85 | __m128i Ca, Ce, Ci, Co, Cu; \ 86 | __m128i Da, De, Di, Do, Du; \ 87 | __m128i Eba, Ebe, Ebi, Ebo, Ebu; \ 88 | __m128i Ega, Ege, Egi, Ego, Egu; \ 89 | __m128i Eka, Eke, Eki, Eko, Eku; \ 90 | __m128i Ema, Eme, Emi, Emo, Emu; \ 91 | __m128i Esa, Ese, Esi, Eso, Esu; \ 92 | 93 | #define prepareTheta \ 94 | Ca = XOR128(Aba, XOR128(Aga, XOR128(Aka, XOR128(Ama, Asa)))); \ 95 | Ce = XOR128(Abe, XOR128(Age, XOR128(Ake, XOR128(Ame, Ase)))); \ 96 | Ci = XOR128(Abi, XOR128(Agi, XOR128(Aki, XOR128(Ami, Asi)))); \ 97 | Co = XOR128(Abo, XOR128(Ago, XOR128(Ako, XOR128(Amo, Aso)))); \ 98 | Cu = XOR128(Abu, XOR128(Agu, XOR128(Aku, XOR128(Amu, Asu)))); \ 99 | 100 | /* --- Theta Rho Pi Chi Iota Prepare-theta */ 101 | /* --- 64-bit lanes mapped to 64-bit words */ 102 | #define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ 103 | Da = XOR128(Cu, ROL64in128(Ce, 1)); \ 104 | De = XOR128(Ca, ROL64in128(Ci, 1)); \ 105 | Di = XOR128(Ce, ROL64in128(Co, 1)); \ 106 | Do = XOR128(Ci, ROL64in128(Cu, 1)); \ 107 | Du = XOR128(Co, 
ROL64in128(Ca, 1)); \ 108 | \ 109 | XOReq128(A##ba, Da); \ 110 | Bba = A##ba; \ 111 | XOReq128(A##ge, De); \ 112 | Bbe = ROL64in128(A##ge, 44); \ 113 | XOReq128(A##ki, Di); \ 114 | Bbi = ROL64in128(A##ki, 43); \ 115 | E##ba = XOR128(Bba, ANDnu128(Bbe, Bbi)); \ 116 | XOReq128(E##ba, CONST128_64(KeccakP1600RoundConstants[i])); \ 117 | Ca = E##ba; \ 118 | XOReq128(A##mo, Do); \ 119 | Bbo = ROL64in128(A##mo, 21); \ 120 | E##be = XOR128(Bbe, ANDnu128(Bbi, Bbo)); \ 121 | Ce = E##be; \ 122 | XOReq128(A##su, Du); \ 123 | Bbu = ROL64in128(A##su, 14); \ 124 | E##bi = XOR128(Bbi, ANDnu128(Bbo, Bbu)); \ 125 | Ci = E##bi; \ 126 | E##bo = XOR128(Bbo, ANDnu128(Bbu, Bba)); \ 127 | Co = E##bo; \ 128 | E##bu = XOR128(Bbu, ANDnu128(Bba, Bbe)); \ 129 | Cu = E##bu; \ 130 | \ 131 | XOReq128(A##bo, Do); \ 132 | Bga = ROL64in128(A##bo, 28); \ 133 | XOReq128(A##gu, Du); \ 134 | Bge = ROL64in128(A##gu, 20); \ 135 | XOReq128(A##ka, Da); \ 136 | Bgi = ROL64in128(A##ka, 3); \ 137 | E##ga = XOR128(Bga, ANDnu128(Bge, Bgi)); \ 138 | XOReq128(Ca, E##ga); \ 139 | XOReq128(A##me, De); \ 140 | Bgo = ROL64in128(A##me, 45); \ 141 | E##ge = XOR128(Bge, ANDnu128(Bgi, Bgo)); \ 142 | XOReq128(Ce, E##ge); \ 143 | XOReq128(A##si, Di); \ 144 | Bgu = ROL64in128(A##si, 61); \ 145 | E##gi = XOR128(Bgi, ANDnu128(Bgo, Bgu)); \ 146 | XOReq128(Ci, E##gi); \ 147 | E##go = XOR128(Bgo, ANDnu128(Bgu, Bga)); \ 148 | XOReq128(Co, E##go); \ 149 | E##gu = XOR128(Bgu, ANDnu128(Bga, Bge)); \ 150 | XOReq128(Cu, E##gu); \ 151 | \ 152 | XOReq128(A##be, De); \ 153 | Bka = ROL64in128(A##be, 1); \ 154 | XOReq128(A##gi, Di); \ 155 | Bke = ROL64in128(A##gi, 6); \ 156 | XOReq128(A##ko, Do); \ 157 | Bki = ROL64in128(A##ko, 25); \ 158 | E##ka = XOR128(Bka, ANDnu128(Bke, Bki)); \ 159 | XOReq128(Ca, E##ka); \ 160 | XOReq128(A##mu, Du); \ 161 | Bko = ROL64in128_8(A##mu); \ 162 | E##ke = XOR128(Bke, ANDnu128(Bki, Bko)); \ 163 | XOReq128(Ce, E##ke); \ 164 | XOReq128(A##sa, Da); \ 165 | Bku = ROL64in128(A##sa, 18); \ 166 | E##ki = XOR128(Bki, 
ANDnu128(Bko, Bku)); \ 167 | XOReq128(Ci, E##ki); \ 168 | E##ko = XOR128(Bko, ANDnu128(Bku, Bka)); \ 169 | XOReq128(Co, E##ko); \ 170 | E##ku = XOR128(Bku, ANDnu128(Bka, Bke)); \ 171 | XOReq128(Cu, E##ku); \ 172 | \ 173 | XOReq128(A##bu, Du); \ 174 | Bma = ROL64in128(A##bu, 27); \ 175 | XOReq128(A##ga, Da); \ 176 | Bme = ROL64in128(A##ga, 36); \ 177 | XOReq128(A##ke, De); \ 178 | Bmi = ROL64in128(A##ke, 10); \ 179 | E##ma = XOR128(Bma, ANDnu128(Bme, Bmi)); \ 180 | XOReq128(Ca, E##ma); \ 181 | XOReq128(A##mi, Di); \ 182 | Bmo = ROL64in128(A##mi, 15); \ 183 | E##me = XOR128(Bme, ANDnu128(Bmi, Bmo)); \ 184 | XOReq128(Ce, E##me); \ 185 | XOReq128(A##so, Do); \ 186 | Bmu = ROL64in128_56(A##so); \ 187 | E##mi = XOR128(Bmi, ANDnu128(Bmo, Bmu)); \ 188 | XOReq128(Ci, E##mi); \ 189 | E##mo = XOR128(Bmo, ANDnu128(Bmu, Bma)); \ 190 | XOReq128(Co, E##mo); \ 191 | E##mu = XOR128(Bmu, ANDnu128(Bma, Bme)); \ 192 | XOReq128(Cu, E##mu); \ 193 | \ 194 | XOReq128(A##bi, Di); \ 195 | Bsa = ROL64in128(A##bi, 62); \ 196 | XOReq128(A##go, Do); \ 197 | Bse = ROL64in128(A##go, 55); \ 198 | XOReq128(A##ku, Du); \ 199 | Bsi = ROL64in128(A##ku, 39); \ 200 | E##sa = XOR128(Bsa, ANDnu128(Bse, Bsi)); \ 201 | XOReq128(Ca, E##sa); \ 202 | XOReq128(A##ma, Da); \ 203 | Bso = ROL64in128(A##ma, 41); \ 204 | E##se = XOR128(Bse, ANDnu128(Bsi, Bso)); \ 205 | XOReq128(Ce, E##se); \ 206 | XOReq128(A##se, De); \ 207 | Bsu = ROL64in128(A##se, 2); \ 208 | E##si = XOR128(Bsi, ANDnu128(Bso, Bsu)); \ 209 | XOReq128(Ci, E##si); \ 210 | E##so = XOR128(Bso, ANDnu128(Bsu, Bsa)); \ 211 | XOReq128(Co, E##so); \ 212 | E##su = XOR128(Bsu, ANDnu128(Bsa, Bse)); \ 213 | XOReq128(Cu, E##su); \ 214 | \ 215 | 216 | /* --- Theta Rho Pi Chi Iota */ 217 | /* --- 64-bit lanes mapped to 64-bit words */ 218 | #define thetaRhoPiChiIota(i, A, E) \ 219 | Da = XOR128(Cu, ROL64in128(Ce, 1)); \ 220 | De = XOR128(Ca, ROL64in128(Ci, 1)); \ 221 | Di = XOR128(Ce, ROL64in128(Co, 1)); \ 222 | Do = XOR128(Ci, ROL64in128(Cu, 1)); \ 223 | Du = 
XOR128(Co, ROL64in128(Ca, 1)); \ 224 | \ 225 | XOReq128(A##ba, Da); \ 226 | Bba = A##ba; \ 227 | XOReq128(A##ge, De); \ 228 | Bbe = ROL64in128(A##ge, 44); \ 229 | XOReq128(A##ki, Di); \ 230 | Bbi = ROL64in128(A##ki, 43); \ 231 | E##ba = XOR128(Bba, ANDnu128(Bbe, Bbi)); \ 232 | XOReq128(E##ba, CONST128_64(KeccakP1600RoundConstants[i])); \ 233 | XOReq128(A##mo, Do); \ 234 | Bbo = ROL64in128(A##mo, 21); \ 235 | E##be = XOR128(Bbe, ANDnu128(Bbi, Bbo)); \ 236 | XOReq128(A##su, Du); \ 237 | Bbu = ROL64in128(A##su, 14); \ 238 | E##bi = XOR128(Bbi, ANDnu128(Bbo, Bbu)); \ 239 | E##bo = XOR128(Bbo, ANDnu128(Bbu, Bba)); \ 240 | E##bu = XOR128(Bbu, ANDnu128(Bba, Bbe)); \ 241 | \ 242 | XOReq128(A##bo, Do); \ 243 | Bga = ROL64in128(A##bo, 28); \ 244 | XOReq128(A##gu, Du); \ 245 | Bge = ROL64in128(A##gu, 20); \ 246 | XOReq128(A##ka, Da); \ 247 | Bgi = ROL64in128(A##ka, 3); \ 248 | E##ga = XOR128(Bga, ANDnu128(Bge, Bgi)); \ 249 | XOReq128(A##me, De); \ 250 | Bgo = ROL64in128(A##me, 45); \ 251 | E##ge = XOR128(Bge, ANDnu128(Bgi, Bgo)); \ 252 | XOReq128(A##si, Di); \ 253 | Bgu = ROL64in128(A##si, 61); \ 254 | E##gi = XOR128(Bgi, ANDnu128(Bgo, Bgu)); \ 255 | E##go = XOR128(Bgo, ANDnu128(Bgu, Bga)); \ 256 | E##gu = XOR128(Bgu, ANDnu128(Bga, Bge)); \ 257 | \ 258 | XOReq128(A##be, De); \ 259 | Bka = ROL64in128(A##be, 1); \ 260 | XOReq128(A##gi, Di); \ 261 | Bke = ROL64in128(A##gi, 6); \ 262 | XOReq128(A##ko, Do); \ 263 | Bki = ROL64in128(A##ko, 25); \ 264 | E##ka = XOR128(Bka, ANDnu128(Bke, Bki)); \ 265 | XOReq128(A##mu, Du); \ 266 | Bko = ROL64in128_8(A##mu); \ 267 | E##ke = XOR128(Bke, ANDnu128(Bki, Bko)); \ 268 | XOReq128(A##sa, Da); \ 269 | Bku = ROL64in128(A##sa, 18); \ 270 | E##ki = XOR128(Bki, ANDnu128(Bko, Bku)); \ 271 | E##ko = XOR128(Bko, ANDnu128(Bku, Bka)); \ 272 | E##ku = XOR128(Bku, ANDnu128(Bka, Bke)); \ 273 | \ 274 | XOReq128(A##bu, Du); \ 275 | Bma = ROL64in128(A##bu, 27); \ 276 | XOReq128(A##ga, Da); \ 277 | Bme = ROL64in128(A##ga, 36); \ 278 | XOReq128(A##ke, De); \ 
279 | Bmi = ROL64in128(A##ke, 10); \ 280 | E##ma = XOR128(Bma, ANDnu128(Bme, Bmi)); \ 281 | XOReq128(A##mi, Di); \ 282 | Bmo = ROL64in128(A##mi, 15); \ 283 | E##me = XOR128(Bme, ANDnu128(Bmi, Bmo)); \ 284 | XOReq128(A##so, Do); \ 285 | Bmu = ROL64in128_56(A##so); \ 286 | E##mi = XOR128(Bmi, ANDnu128(Bmo, Bmu)); \ 287 | E##mo = XOR128(Bmo, ANDnu128(Bmu, Bma)); \ 288 | E##mu = XOR128(Bmu, ANDnu128(Bma, Bme)); \ 289 | \ 290 | XOReq128(A##bi, Di); \ 291 | Bsa = ROL64in128(A##bi, 62); \ 292 | XOReq128(A##go, Do); \ 293 | Bse = ROL64in128(A##go, 55); \ 294 | XOReq128(A##ku, Du); \ 295 | Bsi = ROL64in128(A##ku, 39); \ 296 | E##sa = XOR128(Bsa, ANDnu128(Bse, Bsi)); \ 297 | XOReq128(A##ma, Da); \ 298 | Bso = ROL64in128(A##ma, 41); \ 299 | E##se = XOR128(Bse, ANDnu128(Bsi, Bso)); \ 300 | XOReq128(A##se, De); \ 301 | Bsu = ROL64in128(A##se, 2); \ 302 | E##si = XOR128(Bsi, ANDnu128(Bso, Bsu)); \ 303 | E##so = XOR128(Bso, ANDnu128(Bsu, Bsa)); \ 304 | E##su = XOR128(Bsu, ANDnu128(Bsa, Bse)); \ 305 | \ 306 | 307 | #define initializeState(X) \ 308 | X##ba = ZERO(); \ 309 | X##be = ZERO(); \ 310 | X##bi = ZERO(); \ 311 | X##bo = ZERO(); \ 312 | X##bu = ZERO(); \ 313 | X##ga = ZERO(); \ 314 | X##ge = ZERO(); \ 315 | X##gi = ZERO(); \ 316 | X##go = ZERO(); \ 317 | X##gu = ZERO(); \ 318 | X##ka = ZERO(); \ 319 | X##ke = ZERO(); \ 320 | X##ki = ZERO(); \ 321 | X##ko = ZERO(); \ 322 | X##ku = ZERO(); \ 323 | X##ma = ZERO(); \ 324 | X##me = ZERO(); \ 325 | X##mi = ZERO(); \ 326 | X##mo = ZERO(); \ 327 | X##mu = ZERO(); \ 328 | X##sa = ZERO(); \ 329 | X##se = ZERO(); \ 330 | X##si = ZERO(); \ 331 | X##so = ZERO(); \ 332 | X##su = ZERO(); \ 333 | 334 | #define XORdata4(X, data0, data1) \ 335 | XOReq128(X##ba, LOAD6464((data1)[ 0], (data0)[ 0])); \ 336 | XOReq128(X##be, LOAD6464((data1)[ 1], (data0)[ 1])); \ 337 | XOReq128(X##bi, LOAD6464((data1)[ 2], (data0)[ 2])); \ 338 | XOReq128(X##bo, LOAD6464((data1)[ 3], (data0)[ 3])); \ 339 | 340 | #define XORdata16(X, data0, data1) \ 341 | 
XOReq128(X##ba, LOAD6464((data1)[ 0], (data0)[ 0])); \ 342 | XOReq128(X##be, LOAD6464((data1)[ 1], (data0)[ 1])); \ 343 | XOReq128(X##bi, LOAD6464((data1)[ 2], (data0)[ 2])); \ 344 | XOReq128(X##bo, LOAD6464((data1)[ 3], (data0)[ 3])); \ 345 | XOReq128(X##bu, LOAD6464((data1)[ 4], (data0)[ 4])); \ 346 | XOReq128(X##ga, LOAD6464((data1)[ 5], (data0)[ 5])); \ 347 | XOReq128(X##ge, LOAD6464((data1)[ 6], (data0)[ 6])); \ 348 | XOReq128(X##gi, LOAD6464((data1)[ 7], (data0)[ 7])); \ 349 | XOReq128(X##go, LOAD6464((data1)[ 8], (data0)[ 8])); \ 350 | XOReq128(X##gu, LOAD6464((data1)[ 9], (data0)[ 9])); \ 351 | XOReq128(X##ka, LOAD6464((data1)[10], (data0)[10])); \ 352 | XOReq128(X##ke, LOAD6464((data1)[11], (data0)[11])); \ 353 | XOReq128(X##ki, LOAD6464((data1)[12], (data0)[12])); \ 354 | XOReq128(X##ko, LOAD6464((data1)[13], (data0)[13])); \ 355 | XOReq128(X##ku, LOAD6464((data1)[14], (data0)[14])); \ 356 | XOReq128(X##ma, LOAD6464((data1)[15], (data0)[15])); \ 357 | 358 | #define XORdata17(X, data0, data1) \ 359 | XORdata16(X, data0, data1) \ 360 | XOReq128(X##me, LOAD6464((data1)[16], (data0)[16])); \ 361 | 362 | #define XORdata21(X, data0, data1) \ 363 | XORdata17(X, data0, data1) \ 364 | XOReq128(X##mi, LOAD6464((data1)[17], (data0)[17])); \ 365 | XOReq128(X##mo, LOAD6464((data1)[18], (data0)[18])); \ 366 | XOReq128(X##mu, LOAD6464((data1)[19], (data0)[19])); \ 367 | XOReq128(X##sa, LOAD6464((data1)[20], (data0)[20])); \ 368 | 369 | #if ((defined(KeccakP1600times2_SSSE3_fullUnrolling)) || (KeccakP1600times2_SSSE3_unrolling == 12)) 370 | #define rounds12 \ 371 | prepareTheta \ 372 | thetaRhoPiChiIotaPrepareTheta(12, A, E) \ 373 | thetaRhoPiChiIotaPrepareTheta(13, E, A) \ 374 | thetaRhoPiChiIotaPrepareTheta(14, A, E) \ 375 | thetaRhoPiChiIotaPrepareTheta(15, E, A) \ 376 | thetaRhoPiChiIotaPrepareTheta(16, A, E) \ 377 | thetaRhoPiChiIotaPrepareTheta(17, E, A) \ 378 | thetaRhoPiChiIotaPrepareTheta(18, A, E) \ 379 | thetaRhoPiChiIotaPrepareTheta(19, E, A) \ 380 | 
thetaRhoPiChiIotaPrepareTheta(20, A, E) \ 381 | thetaRhoPiChiIotaPrepareTheta(21, E, A) \ 382 | thetaRhoPiChiIotaPrepareTheta(22, A, E) \ 383 | thetaRhoPiChiIota(23, E, A) \ 384 | 385 | #elif (KeccakP1600times2_SSSE3_unrolling == 6) 386 | #define rounds12 \ 387 | prepareTheta \ 388 | for(i=12; i<24; i+=6) { \ 389 | thetaRhoPiChiIotaPrepareTheta(i , A, E) \ 390 | thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ 391 | thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ 392 | thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ 393 | thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ 394 | thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ 395 | } \ 396 | 397 | #elif (KeccakP1600times2_SSSE3_unrolling == 4) 398 | #define rounds12 \ 399 | prepareTheta \ 400 | for(i=12; i<24; i+=4) { \ 401 | thetaRhoPiChiIotaPrepareTheta(i , A, E) \ 402 | thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ 403 | thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ 404 | thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ 405 | } \ 406 | 407 | #elif (KeccakP1600times2_SSSE3_unrolling == 2) 408 | #define rounds12 \ 409 | prepareTheta \ 410 | for(i=12; i<24; i+=2) { \ 411 | thetaRhoPiChiIotaPrepareTheta(i , A, E) \ 412 | thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ 413 | } \ 414 | 415 | #else 416 | #error "KeccakP1600times2_SSSE3_unrolling is not correctly specified!" 
417 | #endif 418 | /* chunkSize is the number of input bytes consumed per leaf by the two Process2Leaves functions; the second leaf is read starting at input + chunkSize. The two rates are 21 and 17 64-bit words, i.e. 168 and 136 bytes. */ 419 | #define chunkSize 8192 420 | #define KT128_rateInBytes (21*8) 421 | #define KT256_rateInBytes (17*8) 422 | /* KT128_SSSE3_Process2Leaves: absorbs two chunkSize-byte leaves side by side (one at input, one at input + chunkSize) with a 168-byte rate and stores 64 bytes to output. input is read through const uint64_t pointers; output is written with STORE128u. NOTE(review): i is declared whenever KeccakP1600times2_SSSE3_fullUnrolling is undefined, which includes the unrolling == 12 configuration where rounds12 does not use it. */ 423 | void KT128_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output) 424 | { 425 | declareABCDE 426 | #ifndef KeccakP1600times2_SSSE3_fullUnrolling 427 | unsigned int i; 428 | #endif 429 | unsigned int j; 430 | /* j only counts the bytes consumed so far; the data position itself is tracked by advancing input. */ 431 | initializeState(A); 432 | /* Full blocks: while more than one rate of data remains, XOR 21 words from each leaf into the state and run rounds12 (48 iterations with chunkSize 8192 and a 168-byte rate). */ 433 | for(j = 0; j < (chunkSize - KT128_rateInBytes); j += KT128_rateInBytes) { 434 | XORdata21(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); 435 | rounds12 436 | input += KT128_rateInBytes; 437 | } 438 | /* Last block: the remaining 128 bytes (16 words) are absorbed, then 0x0B is XORed into word 16 (Ame) and the top bit into word 20 (Asa), the last word of the 21-word rate. Presumably these are the leaf suffix and the final padding bit; TODO confirm against the KangarooTwelve specification. */ 439 | XORdata16(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); 440 | XOReq128(Ame, _mm_set1_epi64x(0x0BULL)); 441 | XOReq128(Asa, _mm_set1_epi64x(0x8000000000000000ULL)); 442 | rounds12 443 | /* Output: four 16-byte stores. The UNPACKL results go to bytes 0..31 and the UNPACKH results to bytes 32..63 (presumably 32 bytes for the first leaf followed by 32 bytes for the second; depends on UNPACKL, UNPACKH and LOAD6464, which are defined elsewhere). */ 444 | STORE128u( *(__m128i*)&(output[ 0]), UNPACKL( Aba, Abe ) ); 445 | STORE128u( *(__m128i*)&(output[16]), UNPACKL( Abi, Abo ) ); 446 | STORE128u( *(__m128i*)&(output[32]), UNPACKH( Aba, Abe ) ); 447 | STORE128u( *(__m128i*)&(output[48]), UNPACKH( Abi, Abo ) ); 448 | } 449 | /* KT256_SSSE3_Process2Leaves: same structure as KT128_SSSE3_Process2Leaves but with a 17-word (136-byte) rate; stores 128 bytes to output. */ 450 | void KT256_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output) 451 | { 452 | declareABCDE 453 | #ifndef KeccakP1600times2_SSSE3_fullUnrolling 454 | unsigned int i; 455 | #endif 456 | unsigned int j; 457 | 458 | initializeState(A); 459 | /* Full blocks: 17 words per leaf per iteration while more than one rate of data remains (60 iterations with chunkSize 8192 and a 136-byte rate). */ 460 | for(j = 0; j < (chunkSize - KT256_rateInBytes); j += KT256_rateInBytes) { 461 | XORdata17(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); 462 | rounds12 463 | input += KT256_rateInBytes; 464 | } 465 | /* Last block: the remaining 32 bytes (4 words) are absorbed with XORdata4 (defined outside this excerpt), then 0x0B is XORed into word 4 (Abu) and the top bit into word 16 (Ame), the last word of the 17-word rate. */ 466 | XORdata4(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); 467 | XOReq128(Abu, _mm_set1_epi64x(0x0BULL)); 468 | XOReq128(Ame, _mm_set1_epi64x(0x8000000000000000ULL)); 469 | rounds12 470 | /* Output: eight 16-byte stores. The UNPACKL results fill bytes 0..63 and the UNPACKH results bytes 64..127 (presumably 64 bytes per leaf; TODO confirm against the UNPACKL/UNPACKH definitions). */ 471 | STORE128u( *(__m128i*)&(output[ 0]), UNPACKL( Aba, Abe ) ); 472 | STORE128u( *(__m128i*)&(output[16]), UNPACKL( Abi, Abo ) ); 473 | STORE128u( *(__m128i*)&(output[32]), UNPACKL( Abu, Aga 
) ); 474 | STORE128u( *(__m128i*)&(output[48]), UNPACKL( Age, Agi ) ); 475 | STORE128u( *(__m128i*)&(output[64]), UNPACKH( Aba, Abe ) ); 476 | STORE128u( *(__m128i*)&(output[80]), UNPACKH( Abi, Abo ) ); 477 | STORE128u( *(__m128i*)&(output[96]), UNPACKH( Abu, Aga ) ); 478 | STORE128u( *(__m128i*)&(output[112]), UNPACKH( Age, Agi ) ); 479 | } 480 | --------------------------------------------------------------------------------