├── .gitignore
├── Makefile
├── .gitmodules
├── util
    └── KeccakSum
    │   ├── base64.h
    │   └── base64.c
├── .github
    └── workflows
    │   └── CI.yml
├── tests
    ├── testPerformance.h
    ├── testKangarooTwelve.h
    ├── timing.c
    ├── main.c
    ├── testPerformance.c
    ├── timing.h
    └── testKangarooTwelve.c
├── lib
    ├── Plain64
    │   ├── KeccakP-1600-plain64.c
    │   └── KeccakP-1600-SnP.h
    ├── KT-threadpool.c
    ├── align.h
    ├── Inplace32BI
    │   └── KeccakP-1600-SnP.h
    ├── KangarooTwelve-threading.h
    ├── ARMv8Asha3
    │   ├── KeccakP-1600-SnP.h
    │   ├── KeccakP-1600-opt64.c
    │   └── KeccakP-1600-runtimeDispatch.c
    ├── KT-threadpool-sequential.c
    ├── Optimized64
    │   ├── KeccakP-1600-SnP.h
    │   ├── KeccakP-1600-AVX512-plainC.c
    │   ├── KeccakP-1600-runtimeDispatch.c
    │   └── KeccakP-1600-timesN-SSSE3.c
    ├── KT-threadpool.h
    ├── brg_endian.h
    ├── KT-threadpool-pthread.c
    ├── KangarooTwelve.h
    └── KangarooTwelve-threading.c
├── Python
    ├── Utils.py
    ├── KangarooTwelve.py
    ├── TurboSHAKE256Tests.py
    ├── TurboSHAKE128Tests.py
    ├── KangarooTwelveTests.py
    └── TurboSHAKE.py
├── README.markdown
├── Makefile.build
└── .travis.yml


/.gitignore:
--------------------------------------------------------------------------------
1 | bin/
2 | var/
3 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | XKCBpath = support/XKCBuild
2 | include $(XKCBpath)/src/Main.makefile
3 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "support/XKCBuild"]
2 | 	path = support/XKCBuild
3 | 	url = https://github.com/XKCP/XKCBuild.git
4 | 


--------------------------------------------------------------------------------
/util/KeccakSum/base64.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Implementation taken from:
 3 | https://en.wikibooks.org/wiki/Algorithm_Implementation/Miscellaneous/Base64
 4 | (2015-12-16)
 5 | 
 6 | Available under the Creative Commons Attribution-ShareAlike License:
 7 | https://creativecommons.org/licenses/by-sa/3.0/
 8 | */
 9 | 
10 | #include <string.h>
11 | 
12 | /*
13 |  * Base64-encode the dataLength bytes at data_buf into result, which can hold
14 |  * resultSize characters. The implementation in base64.c returns 1 as soon as
15 |  * result turns out to be too small. NOTE(review): the success return value
16 |  * and any terminator written to result should be confirmed in base64.c.
17 |  */
18 | int base64encode(const void* data_buf, size_t dataLength, char* result, size_t resultSize);
13 | 


--------------------------------------------------------------------------------
/.github/workflows/CI.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ "master" ]
 6 |   pull_request:
 7 |     branches: [ "master" ]
 8 | 
 9 | jobs:
10 |   build:
11 |     strategy:
12 |       matrix:
13 |         target: ["generic32", "generic64", "generic64noAsm", "plain64"]
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |     - uses: actions/checkout@v3
17 |       with:
18 |         submodules: recursive
19 |     - name: Install xsltproc
20 |       run: sudo apt-get install xsltproc
21 |     - name: Build
22 |       run: make ${{ matrix.target }}/KTtests ${{ matrix.target }}/libKT.a ${{ matrix.target }}/libKT.so
23 |     - name: Test
24 |       run: bin/${{ matrix.target }}/KTtests -K12
25 | 


--------------------------------------------------------------------------------
/tests/testPerformance.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | K12 based on the eXtended Keccak Code Package (XKCP)
 3 | https://github.com/XKCP/XKCP
 4 | 
 5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier.
 6 | 
 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
 8 | 
 9 | For more information, feedback or questions, please refer to the Keccak Team website:
10 | https://keccak.team/
11 | 
12 | To the extent possible under law, the implementer has waived all copyright
13 | and related or neighboring rights to the source code in this file.
14 | http://creativecommons.org/publicdomain/zero/1.0/
15 | */
16 | 
17 | #ifndef _testPerformance_h_
18 | #define _testPerformance_h_
19 | 
20 | void testPerformance(void);
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/tests/testKangarooTwelve.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | K12 based on the eXtended Keccak Code Package (XKCP)
 3 | https://github.com/XKCP/XKCP
 4 | 
 5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier.
 6 | 
 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
 8 | 
 9 | For more information, feedback or questions, please refer to the Keccak Team website:
10 | https://keccak.team/
11 | 
12 | To the extent possible under law, the implementer has waived all copyright
13 | and related or neighboring rights to the source code in this file.
14 | http://creativecommons.org/publicdomain/zero/1.0/
15 | */
16 | 
17 | #ifndef _TestKangarooTwelve_h_
18 | #define _TestKangarooTwelve_h_
19 | 
20 | void testKangarooTwelve(void);
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/lib/Plain64/KeccakP-1600-plain64.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | K12 based on the eXtended Keccak Code Package (XKCP)
 3 | https://github.com/XKCP/XKCP
 4 | 
 5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
 6 | 
 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
 8 | 
 9 | For more information, feedback or questions, please refer to the Keccak Team website:
10 | https://keccak.team/
11 | 
12 | To the extent possible under law, the implementer has waived all copyright
13 | and related or neighboring rights to the source code in this file.
14 | http://creativecommons.org/publicdomain/zero/1.0/
15 | 
16 | ---
17 | 
18 | Please refer to the XKCP for more details.
19 | */
20 | 
21 | /* Returns a static string naming this Keccak-p[1600] implementation. */
22 | const char * KeccakP1600_GetImplementation(void)
23 | {
24 |     return "generic 64-bit implementation";
25 | }
25 | 


--------------------------------------------------------------------------------
/lib/KT-threadpool.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | K12 based on the eXtended Keccak Code Package (XKCP)
 3 | https://github.com/XKCP/XKCP
 4 | 
 5 | Thread pool abstraction layer - common functions.
 6 | 
 7 | To the extent possible under law, the implementer has waived all copyright
 8 | and related or neighboring rights to the source code in this file.
 9 | http://creativecommons.org/publicdomain/zero/1.0/
10 | */
11 | 
12 | #include "KT-threadpool.h"
13 | 
14 | /* Detect pthread availability */
15 | #if defined(_POSIX_THREADS) || defined(__unix__) || defined(__unix) || \
16 |     (defined(__APPLE__) && defined(__MACH__)) || defined(__linux__) || \
17 |     defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
18 | #define HAVE_PTHREADS 1
19 | #endif
20 | 
21 | /*
22 |  * Select the built-in backend for this build: the pthread backend when
23 |  * HAVE_PTHREADS is defined, the sequential backend otherwise.
24 |  */
25 | const KT_ThreadPool_API* KT_ThreadPool_GetDefault(void)
26 | {
27 |     const KT_ThreadPool_API* backend;
28 | 
29 | #ifdef HAVE_PTHREADS
30 |     backend = &KT_ThreadPool_Pthread;
31 | #else
32 |     backend = &KT_ThreadPool_Sequential;
33 | #endif
34 |     return backend;
35 | }
29 | 


--------------------------------------------------------------------------------
/lib/align.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | K12 based on the eXtended Keccak Code Package (XKCP)
 3 | https://github.com/XKCP/XKCP
 4 | 
 5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier.
 6 | 
 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
 8 | 
 9 | For more information, feedback or questions, please refer to the Keccak Team website:
10 | https://keccak.team/
11 | 
12 | To the extent possible under law, the implementer has waived all copyright
13 | and related or neighboring rights to the source code in this file.
14 | http://creativecommons.org/publicdomain/zero/1.0/
15 | */
16 | 
17 | #ifndef _align_h_
18 | #define _align_h_
19 | 
20 | #ifdef ALIGN
21 | #undef ALIGN
22 | #endif
23 | 
24 | #if defined(__GNUC__)
25 | #define ALIGN(x) __attribute__ ((aligned(x)))
26 | #elif defined(_MSC_VER)
27 | #define ALIGN(x) __declspec(align(x))
28 | #elif defined(__ARMCC_VERSION)
29 | #define ALIGN(x) __align(x)
30 | #else
31 | #define ALIGN(x)
32 | #endif
33 | 
34 | #endif
35 | 


--------------------------------------------------------------------------------
/lib/Inplace32BI/KeccakP-1600-SnP.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | K12 based on the eXtended Keccak Code Package (XKCP)
 3 | https://github.com/XKCP/XKCP
 4 | 
 5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
 6 | 
 7 | Implementation by Ronny Van Keer, hereby denoted as "the implementer".
 8 | 
 9 | For more information, feedback or questions, please refer to the Keccak Team website:
10 | https://keccak.team/
11 | 
12 | To the extent possible under law, the implementer has waived all copyright
13 | and related or neighboring rights to the source code in this file.
14 | http://creativecommons.org/publicdomain/zero/1.0/
15 | 
16 | ---
17 | 
18 | Please refer to the XKCP for more details.
19 | */
20 | 
21 | #ifndef _KeccakP_1600_SnP_h_
22 | #define _KeccakP_1600_SnP_h_
23 | 
24 | #define KeccakP1600_stateSizeInBytes    200
25 | #define KeccakP1600_stateAlignment      8
26 | #define KeccakP1600_disableParallelism
27 | 
28 | const char * KeccakP1600_GetImplementation();
29 | void KeccakP1600_Initialize(void *state);
30 | void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset);
31 | void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
32 | void KeccakP1600_Permute_12rounds(void *state);
33 | void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
34 | 
35 | #endif
36 | 


--------------------------------------------------------------------------------
/Python/Utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Implementation by Gilles Van Assche and Benoit Viguier, hereby denoted as "the implementers".
 3 | #
 4 | # For more information, feedback or questions, please refer to our website:
 5 | # https://keccak.team/
 6 | #
 7 | # To the extent possible under law, the implementers have waived all copyright
 8 | # and related or neighboring rights to the source code in this file.
 9 | # http://creativecommons.org/publicdomain/zero/1.0/
10 | 
11 | from __future__ import print_function
12 | 
13 | def hexString(s):
14 |     r = ''
15 |     for i in range(len(s)):
16 |         if r != '': r = r + ' '
17 |         r = r + "{0:02X}".format(s[i])
18 |     return r
19 | 
20 | def hexStringSpecial(s):
21 |     if len(s) == 0:
22 |         return "`00`^0"
23 |     else:
24 |         return "`"+hexString(s)+"`"
25 | 
26 | def numberStringSpecial(base, exponent):
27 |     if exponent == 0:
28 |         return "1"
29 |     elif exponent == 1:
30 |         return "{0:d}".format(base)
31 |     else:
32 |         return "{0:d}**{1:d}".format(base, exponent)
33 | 
34 | def outputHex(s):
35 |     for i in range(len(s)):
36 |         print("{0:02X}".format(s[i]), end=' ')
37 |         if i % 16 == 15:
38 |             print()
39 |     print()
40 |     print()
41 | 
42 | def printTestVectorOutput(s):
43 |     print('    `', end='')
44 |     for i in range(len(s)):
45 |         print("{0:02X}".format(s[i]), end=('`' if i == len(s) - 1 else ' '))
46 |         if i % 16 == 15:
47 |             print()
48 |             print('     ', end='')
49 |     print()
50 | 


--------------------------------------------------------------------------------
/tests/timing.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | The eXtended Keccak Code Package (XKCP)
 3 | https://github.com/XKCP/XKCP
 4 | 
 5 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
 6 | 
 7 | For more information, feedback or questions, please refer to the Keccak Team website:
 8 | https://keccak.team/
 9 | 
10 | To the extent possible under law, the implementer has waived all copyright
11 | and related or neighboring rights to the source code in this file.
12 | http://creativecommons.org/publicdomain/zero/1.0/
13 | */
14 | 
15 | #include "timing.h"
16 | 
17 | /* Name of the unit used for timings: "ns" on AArch64 builds, "cycles" otherwise. */
18 | const char * getTimerUnit(void)
19 | {
20 | #if defined(__aarch64__)
21 |     return "ns";
22 | #else
23 |     return "cycles";
24 | #endif
25 | }
25 | 
26 | double timerCorrectionFactor = 1.0;
27 | 
28 | /*
29 |  * Factor applied to raw timer readings. On AArch64 it is 1e9 divided by the
30 |  * counter frequency read from CNTFRQ_EL0 (matching the "ns" unit reported by
31 |  * getTimerUnit); on every other target it is 1.0.
32 |  */
33 | static double getTimerCorrectionFactor(void)
34 | {
35 | #if defined(__aarch64__)
36 |     int64_t virtual_timer_freq;
37 |     /* __asm__ instead of asm: the plain keyword is rejected in strict ISO C modes. */
38 |     __asm__ volatile("mrs %0, cntfrq_el0" : "=r"(virtual_timer_freq));
39 |     return (double)1.0e9 / (double)virtual_timer_freq;
40 | #else
41 |     return 1.0;
42 | #endif
43 | }
38 | 
39 | /*
40 |  * Refresh timerCorrectionFactor, then take TIMER_SAMPLE_CNT pairs of
41 |  * back-to-back CycleTimer() readings and return the smallest difference,
42 |  * i.e. the minimum observed overhead of taking a measurement.
43 |  */
44 | cycles_t CalibrateTimer(void)
45 | {
46 |     cycles_t overhead = CYCLES_MAX;     /* big number to start */
47 |     cycles_t begin, end, sample;
48 | 
49 |     timerCorrectionFactor = getTimerCorrectionFactor();
50 | 
51 |     for (sample = 0; sample < TIMER_SAMPLE_CNT; sample++) {
52 |         begin = CycleTimer();
53 |         end = CycleTimer();
54 |         if (end - begin < overhead)     /* keep only the minimum time */
55 |             overhead = end - begin;
56 |     }
57 |     return overhead;
58 | }
55 | 


--------------------------------------------------------------------------------
/lib/Plain64/KeccakP-1600-SnP.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | K12 based on the eXtended Keccak Code Package (XKCP)
 3 | https://github.com/XKCP/XKCP
 4 | 
 5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
 6 | 
 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
 8 | 
 9 | For more information, feedback or questions, please refer to the Keccak Team website:
10 | https://keccak.team/
11 | 
12 | To the extent possible under law, the implementer has waived all copyright
13 | and related or neighboring rights to the source code in this file.
14 | http://creativecommons.org/publicdomain/zero/1.0/
15 | 
16 | ---
17 | 
18 | Please refer to the XKCP for more details.
19 | */
20 | 
21 | #ifndef _KeccakP_1600_SnP_h_
22 | #define _KeccakP_1600_SnP_h_
23 | 
24 | /* Keccak-p[1600] */
25 | 
26 | #define KeccakP1600_stateSizeInBytes    200
27 | #define KeccakP1600_stateAlignment      8
28 | #define KeccakP1600_12rounds_FastLoop_supported
29 | #define KeccakP1600_disableParallelism
30 | 
31 | const char * KeccakP1600_GetImplementation();
32 | void KeccakP1600_Initialize(void *state);
33 | void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset);
34 | void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
35 | void KeccakP1600_Permute_12rounds(void *state);
36 | void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
37 | size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
38 | 
39 | // Instead of defining proxy functions which do nothing, simply rename the
40 | // symbols of the opt64 implementation where they are used.
41 | #define KeccakP1600_opt64_Initialize KeccakP1600_Initialize
42 | #define KeccakP1600_opt64_AddByte KeccakP1600_AddByte
43 | #define KeccakP1600_opt64_AddBytes KeccakP1600_AddBytes
44 | #define KeccakP1600_opt64_Permute_12rounds KeccakP1600_Permute_12rounds
45 | #define KeccakP1600_opt64_ExtractBytes KeccakP1600_ExtractBytes
46 | #define KeccakP1600_opt64_12rounds_FastLoop_Absorb KeccakP1600_12rounds_FastLoop_Absorb
47 | 
48 | #endif
49 | 


--------------------------------------------------------------------------------
/lib/KangarooTwelve-threading.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | K12 based on the eXtended Keccak Code Package (XKCP)
 3 | https://github.com/XKCP/XKCP
 4 | 
 5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier.
 6 | 
 7 | Threading support implementation using portable thread pool abstraction.
 8 | 
 9 | PLATFORM COMPATIBILITY:
10 | The library uses a portable thread pool abstraction that supports multiple backends:
11 | - Built-in pthread backend (Linux, macOS, BSD, Unix-like systems)
12 | - Built-in sequential backend (all platforms, no actual parallelism)
13 | - Custom application-provided backends (see KT-threadpool.h)
14 | 
15 | By default, the pthread backend is used on systems with pthread support,
16 | and the sequential backend is used elsewhere.
17 | 
18 | To the extent possible under law, the implementer has waived all copyright
19 | and related or neighboring rights to the source code in this file.
20 | http://creativecommons.org/publicdomain/zero/1.0/
21 | */
22 | 
23 | #ifndef _KangarooTwelve_threading_h_
24 | #define _KangarooTwelve_threading_h_
25 | 
26 | #include <stddef.h>
27 | #include "KT-threadpool.h"
28 | 
29 | /**
30 |  * Internal function to process multiple chunks in parallel using threads.
31 |  *
32 |  * @param threadpool_api    Thread pool API implementation
33 |  * @param threadpool_handle Thread pool handle
34 |  * @param thread_count      Number of threads in the pool
35 |  * @param input             Pointer to input data (multiple chunks)
36 |  * @param chunkCount        Number of chunks to process
37 |  * @param output            Pointer to output buffer for chaining values
38 |  * @param securityLevel     128 for KT128 or 256 for KT256
39 |  * @return 0 if successful, 1 otherwise
40 |  */
41 | int KT_ProcessChunksThreaded(const KT_ThreadPool_API* threadpool_api,
42 |                              void* threadpool_handle,
43 |                              int thread_count,
44 |                              const unsigned char *input,
45 |                              size_t chunkCount,
46 |                              unsigned char *output,
47 |                              int securityLevel);
48 | 
49 | #endif /* _KangarooTwelve_threading_h_ */
50 | 


--------------------------------------------------------------------------------
/lib/ARMv8Asha3/KeccakP-1600-SnP.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | K12 based on the eXtended Keccak Code Package (XKCP)
 3 | https://github.com/XKCP/XKCP
 4 | 
 5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
 6 | 
 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
 8 | 
 9 | For more information, feedback or questions, please refer to the Keccak Team website:
10 | https://keccak.team/
11 | 
12 | To the extent possible under law, the implementer has waived all copyright
13 | and related or neighboring rights to the source code in this file.
14 | http://creativecommons.org/publicdomain/zero/1.0/
15 | 
16 | ---
17 | 
18 | Please refer to the XKCP for more details.
19 | */
20 | 
21 | #ifndef _KeccakP_1600_SnP_h_
22 | #define _KeccakP_1600_SnP_h_
23 | 
24 | /* Keccak-p[1600] */
25 | 
26 | #define KeccakP1600_stateSizeInBytes    200
27 | #define KeccakP1600_stateAlignment      8
28 | #define KeccakP1600_12rounds_FastLoop_supported
29 | 
30 | const char * KeccakP1600_GetImplementation();
31 | void KeccakP1600_Initialize(void *state);
32 | void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset);
33 | void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
34 | void KeccakP1600_Permute_12rounds(void *state);
35 | void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
36 | size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
37 | 
38 | /* Keccak-p[1600]×2 */
39 | 
40 | int KeccakP1600times2_IsAvailable();
41 | const char * KeccakP1600times2_GetImplementation();
42 | void KeccakP1600times2_Permute_12rounds(void *state);
43 | void KT128_Process2Leaves(const unsigned char *input, unsigned char *output);
44 | void KT256_Process2Leaves(const unsigned char *input, unsigned char *output);
45 | 
46 | /* Keccak-p[1600]×4 */
47 | 
48 | int KeccakP1600times4_IsAvailable();
49 | const char * KeccakP1600times4_GetImplementation();
50 | 
51 | /* Keccak-p[1600]×8 */
52 | 
53 | int KeccakP1600times8_IsAvailable();
54 | const char * KeccakP1600times8_GetImplementation();
55 | 
56 | #endif
57 | 


--------------------------------------------------------------------------------
/Python/KangarooTwelve.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Implementation by Gilles Van Assche, hereby denoted as "the implementer".
 3 | #
 4 | # For more information, feedback or questions, please refer to our website:
 5 | # https://keccak.team/
 6 | #
 7 | # To the extent possible under law, the implementer has waived all copyright
 8 | # and related or neighboring rights to the source code in this file.
 9 | # http://creativecommons.org/publicdomain/zero/1.0/
10 | 
11 | from TurboSHAKE import TurboSHAKE128, TurboSHAKE256
12 | from Utils import outputHex
13 | 
14 | def length_encode(x):
15 |     S = bytearray()
16 |     while x > 0:
17 |         S = bytearray([x % 256]) + S
18 |         x = x//256
19 |     S = S + bytearray([len(S)])
20 |     return S
21 | 
22 | # inputMessage and customizationString must be of type byte string or byte array
23 | def KT128(inputMessage, customString, outputByteLen):
24 |     S = inputMessage + customString
25 |     S = S + length_encode(len(customString))
26 | 
27 |     if len(S) <= 8192:
28 |         return TurboSHAKE128(S, 0x07, outputByteLen)
29 |     else:
30 |         # === Kangaroo hopping ===
31 |         FinalNode = S[0:8192] + bytearray([0x03] + [0x00]*7)
32 |         offset = 8192
33 |         numBlock = 0
34 |         while offset < len(S):
35 |             blockSize = min(len(S) - offset, 8192)
36 |             CV = TurboSHAKE128(S[offset : offset + blockSize], 0x0B, 32)
37 |             FinalNode = FinalNode + CV
38 |             numBlock += 1
39 |             offset   += blockSize
40 |         FinalNode = FinalNode + length_encode( numBlock ) + bytearray([0xFF, 0xFF])
41 |         return TurboSHAKE128(FinalNode, 0x06, outputByteLen)
42 | 
43 | def KT256(inputMessage, customString, outputByteLen):
44 |     S = inputMessage + customString
45 |     S = S + length_encode(len(customString))
46 | 
47 |     if len(S) <= 8192:
48 |         return TurboSHAKE256(S, 0x07, outputByteLen)
49 |     else:
50 |         # === Kangaroo hopping ===
51 |         FinalNode = S[0:8192] + bytearray([0x03] + [0x00]*7)
52 |         offset = 8192
53 |         numBlock = 0
54 |         while offset < len(S):
55 |             blockSize = min(len(S) - offset, 8192)
56 |             CV = TurboSHAKE256(S[offset : offset + blockSize], 0x0B, 64)
57 |             FinalNode = FinalNode + CV
58 |             numBlock += 1
59 |             offset   += blockSize
60 |         FinalNode = FinalNode + length_encode( numBlock ) + bytearray([0xFF, 0xFF])
61 |         return TurboSHAKE256(FinalNode, 0x06, outputByteLen)
62 | 


--------------------------------------------------------------------------------
/tests/main.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | K12 based on the eXtended Keccak Code Package (XKCP)
 3 | https://github.com/XKCP/XKCP
 4 | 
 5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier.
 6 | 
 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
 8 | 
 9 | For more information, feedback or questions, please refer to the Keccak Team website:
10 | https://keccak.team/
11 | 
12 | To the extent possible under law, the implementer has waived all copyright
13 | and related or neighboring rights to the source code in this file.
14 | http://creativecommons.org/publicdomain/zero/1.0/
15 | */
16 | 
17 | #include <assert.h>
18 | #include <stdio.h>
19 | #include <stdlib.h>
20 | #include <string.h>
21 | #include "align.h"
22 | #include "KangarooTwelve.h"
23 | #include "testKangarooTwelve.h"
24 | #include "testPerformance.h"
25 | 
26 | #define BENCH1GB
27 | 
28 | /* Print the list of command-line options understood by the test program. */
29 | void printHelp(void)
30 | {
31 |     printf("Usage: KTtests command(s), where the commands can be\n");
32 |     printf("  --help or -h              To display this page\n");
33 |     printf("  --all or -a               All tests\n");
34 |     printf("  --KangarooTwelve or -K12  Tests on KangarooTwelve\n");
35 |     printf("  --speed or -s             Speed measurements\n");
36 | #ifdef BENCH1GB
37 |     printf("  --1GB                     Just hash 1GB of data and exit\n");
38 | #endif
39 | }
39 | 
40 | #ifdef BENCH1GB
41 | /*
42 |  * Run KT128 once over a static, zero-initialized buffer of 10^9 bytes,
43 |  * requesting 32 bytes of output. The result is not inspected.
44 |  */
45 | void bench1GB(void)
46 | {
47 |     #define INPUT_SIZE 1000000000
48 |     static ALIGN(64) unsigned char input[INPUT_SIZE];
49 |     static ALIGN(64) unsigned char output[32];
50 |     KT128(input, INPUT_SIZE, output, sizeof(output), 0, 0);
51 |     #undef INPUT_SIZE
52 | }
53 | #endif
50 | 
51 | /*
52 |  * Interpret the command line and run the requested tests.
53 |  * Returns 0 on success and -1 when an argument is not recognized.
54 |  */
55 | int process(int argc, char* argv[])
56 | {
57 |     int argIndex;
58 |     int showHelp = (argc <= 1);   /* no arguments at all: show the usage text */
59 |     int runKangarooTwelve = 0;
60 |     int runSpeed = 0;
61 | 
62 | #ifdef BENCH1GB
63 |     /* --1GB is honoured only as the first argument and bypasses everything else. */
64 |     if (argc > 1 && strcmp("--1GB", argv[1]) == 0) {
65 |         bench1GB();
66 |         return 0;
67 |     }
68 | #endif
69 | 
70 |     for (argIndex = 1; argIndex < argc; argIndex++) {
71 |         const char *arg = argv[argIndex];
72 | 
73 |         if (!strcmp("--help", arg) || !strcmp("-h", arg)) {
74 |             showHelp = 1;
75 |         } else if (!strcmp("--all", arg) || !strcmp("-a", arg)) {
76 |             runKangarooTwelve = 1;
77 |             runSpeed = 1;
78 |         } else if (!strcmp("--KangarooTwelve", arg) || !strcmp("-K12", arg)) {
79 |             runKangarooTwelve = 1;
80 |         } else if (!strcmp("--speed", arg) || !strcmp("-s", arg)) {
81 |             runSpeed = 1;
82 |         } else {
83 |             printf("Unrecognized command '%s'\n", arg);
84 |             return -1;
85 |         }
86 |     }
87 | 
88 |     if (showHelp) {
89 |         printHelp();
90 |         return 0;
91 |     }
92 |     if (runKangarooTwelve)
93 |         testKangarooTwelve();
94 |     if (runSpeed)
95 |         testPerformance();
96 |     return 0;
97 | }
94 | 
95 | int main(int argc, char* argv[])
96 | {
97 |     const int status = process(argc, argv);  /* becomes the exit status */
98 |     return status;
99 | }
99 | 


--------------------------------------------------------------------------------
/lib/KT-threadpool-sequential.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | K12 based on the eXtended Keccak Code Package (XKCP)
  3 | https://github.com/XKCP/XKCP
  4 | 
  5 | Sequential (no-threading) thread pool implementation.
  6 | 
  7 | This backend queues submitted work and runs it in the thread calling wait_all,
  8 | providing no actual parallelism. It serves as a portable fallback for
  9 | platforms without threading support and is useful for testing/debugging.
 10 | 
 11 | To the extent possible under law, the implementer has waived all copyright
 12 | and related or neighboring rights to the source code in this file.
 13 | http://creativecommons.org/publicdomain/zero/1.0/
 14 | */
 15 | 
 16 | #include "KT-threadpool.h"
 17 | #include <stdlib.h>
 18 | 
 19 | #define MAX_JOBS 256  /* capacity of the job queue; submit fails once it is full */
 20 | 
 21 | /* Sequential pool context */
 22 | typedef struct {
 23 |     /* Jobs recorded by submit, in submission order, until wait_all runs them */
 24 |     struct {
 25 |         void (*work_fn)(void*);
 26 |         void* work_data;
 27 |     } jobs[MAX_JOBS];
 28 |     int job_count;  /* number of used entries in jobs[] */
 29 |     int valid;  /* 0x12345678 while the pool is live, cleared by destroy (use-after-free marker) */
 30 | } SequentialPool;
 31 | 
 32 | /*
 33 |  * Allocate an empty job queue. num_threads is ignored because this backend
 34 |  * never runs anything in parallel. Returns NULL when malloc fails.
 35 |  */
 36 | static void* sequential_create_pool(int num_threads)
 37 | {
 38 |     SequentialPool* created;
 39 | 
 40 |     (void)num_threads;
 41 | 
 42 |     created = (SequentialPool*)malloc(sizeof(*created));
 43 |     if (created != NULL) {
 44 |         created->job_count = 0;
 45 |         created->valid = 0x12345678;  /* marker tested by submit and wait_all */
 46 |     }
 47 |     return created;
 48 | }
 47 | 
 48 | /*
 49 |  * Record one job for a later wait_all. Nothing is executed here.
 50 |  * Returns 0 when the job was queued, 1 when the handle is NULL or not
 51 |  * marked valid, work_fn is NULL, or MAX_JOBS jobs are already queued.
 52 |  */
 53 | static int sequential_submit(void* pool_handle, void (*work_fn)(void*), void* work_data)
 54 | {
 55 |     SequentialPool* queue = (SequentialPool*)pool_handle;
 56 |     int slot;
 57 | 
 58 |     if (queue == NULL || queue->valid != 0x12345678 || work_fn == NULL)
 59 |         return 1;
 60 | 
 61 |     slot = queue->job_count;
 62 |     if (slot >= MAX_JOBS)
 63 |         return 1;  /* Job queue full */
 64 | 
 65 |     queue->jobs[slot].work_fn = work_fn;
 66 |     queue->jobs[slot].work_data = work_data;
 67 |     queue->job_count = slot + 1;
 68 | 
 69 |     return 0;
 70 | }
 64 | 
 65 | /*
 66 |  * Run every queued job in submission order on the calling thread, then
 67 |  * empty the queue. Does nothing for a NULL or invalid handle.
 68 |  */
 69 | static void sequential_wait_all(void* pool_handle)
 70 | {
 71 |     SequentialPool* queue = (SequentialPool*)pool_handle;
 72 |     int next = 0;
 73 | 
 74 |     if (queue == NULL || queue->valid != 0x12345678)
 75 |         return;
 76 | 
 77 |     /* job_count is re-read on every pass, so jobs queued by a running job run too */
 78 |     while (next < queue->job_count) {
 79 |         if (queue->jobs[next].work_fn)
 80 |             queue->jobs[next].work_fn(queue->jobs[next].work_data);
 81 |         next++;
 82 |     }
 83 | 
 84 |     queue->job_count = 0;  /* ready for the next batch */
 85 | }
 82 | 
 83 | /* Release a pool created by sequential_create_pool; NULL is ignored. */
 84 | static void sequential_destroy(void* pool_handle)
 85 | {
 86 |     SequentialPool* doomed = (SequentialPool*)pool_handle;
 87 | 
 88 |     if (doomed != NULL) {
 89 |         doomed->valid = 0;  /* clear the marker before the memory is freed */
 90 |         free(doomed);
 91 |     }
 92 | }
 93 | 
 94 | /* Function table exposing the sequential backend through KT_ThreadPool_API. */
 95 | const KT_ThreadPool_API KT_ThreadPool_Sequential = {
 96 |     .create = sequential_create_pool,
 97 |     .submit = sequential_submit,
 98 |     .wait_all = sequential_wait_all,
 99 |     .destroy = sequential_destroy,
100 |     .min_input_size_for_threading = 2 * 1024 * 1024  /* 2 MB default threshold */
101 | };
102 | 


--------------------------------------------------------------------------------
/util/KeccakSum/base64.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | Implementation taken from:
 3 | https://en.wikibooks.org/wiki/Algorithm_Implementation/Miscellaneous/Base64
 4 | (2015-12-16)
 5 | 
 6 | Available under the Creative Commons Attribution-ShareAlike License:
 7 | https://creativecommons.org/licenses/by-sa/3.0/
 8 | */
 9 | 
10 | #include <stdint.h>
11 | #include <string.h>
12 | 
13 | int base64encode(const void* data_buf, size_t dataLength, char* result, size_t resultSize)
14 | {
15 |    const char base64chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
16 |    const uint8_t *data = (const uint8_t *)data_buf;
17 |    size_t resultIndex = 0;   /* next free position in result */
18 |    size_t x;
19 |    uint32_t n = 0;
20 |    int padCount = dataLength % 3;   /* bytes in the final partial group; 0 when dataLength is a multiple of 3 */
21 |    uint8_t n0, n1, n2, n3;
22 | 
23 |    /* walk the input three bytes at a time; each group yields up to four output characters */
24 |    for (x = 0; x < dataLength; x += 3)
25 |    {
26 |       /* up to three input bytes are packed into one 24-bit number; bytes past the end count as zero */
27 |       n = ((uint32_t)data[x]) << 16; /* widen to uint32_t before shifting; the byte lands in bits 16..23 */
28 | 
29 |       if((x+1) < dataLength)
30 |          n += ((uint32_t)data[x+1]) << 8; /* widen to uint32_t before shifting; the byte lands in bits 8..15 */
31 | 
32 |       if((x+2) < dataLength)
33 |          n += data[x+2];
34 | 
35 |       /* this 24-bit number gets separated into four 6-bit numbers */
36 |       n0 = (uint8_t)(n >> 18) & 63;
37 |       n1 = (uint8_t)(n >> 12) & 63;
38 |       n2 = (uint8_t)(n >> 6) & 63;
39 |       n3 = (uint8_t)n & 63;
40 | 
41 |       /*
42 |        * the first byte of a group always contributes to the first two
43 |        * output characters
44 |        */
45 |       if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small (result is left unterminated) */
46 |       result[resultIndex++] = base64chars[n0];
47 |       if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
48 |       result[resultIndex++] = base64chars[n1];
49 | 
50 |       /*
51 |        * a third output character is emitted only when the group has at
52 |        * least two input bytes
53 |        */
54 |       if((x+1) < dataLength)
55 |       {
56 |          if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
57 |          result[resultIndex++] = base64chars[n2];
58 |       }
59 | 
60 |       /*
61 |        * a fourth output character is emitted only when the group has all
62 |        * three input bytes
63 |        */
64 |       if((x+2) < dataLength)
65 |       {
66 |          if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
67 |          result[resultIndex++] = base64chars[n3];
68 |       }
69 |    }
70 | 
71 |    /*
72 |     * pad with '=' when the input length is not a multiple of 3: one
73 |     * leftover byte gives two '=', two leftover bytes give one '='
74 |     */
75 |    if (padCount > 0)
76 |    {
77 |       for (; padCount < 3; padCount++)
78 |       {
79 |          if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
80 |          result[resultIndex++] = '=';
81 |       }
82 |    }
83 |    if(resultIndex >= resultSize) return 1;   /* indicate failure: no room left for the terminating NUL */
84 |    result[resultIndex] = 0;   /* NUL-terminate the encoded string */
85 |    return 0;   /* indicate success */
86 | }
87 | 


--------------------------------------------------------------------------------
/Python/TurboSHAKE256Tests.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Implementation by Gilles Van Assche and Benoit Viguier, hereby denoted as "the implementers".
 3 | #
 4 | # For more information, feedback or questions, please refer to our website:
 5 | # https://keccak.team/
 6 | #
 7 | # To the extent possible under law, the implementers has waived all copyright
 8 | # and related or neighboring rights to the source code in this file.
 9 | # http://creativecommons.org/publicdomain/zero/1.0/
10 | 
11 | from __future__ import print_function
12 | from TurboSHAKE import TurboSHAKE256
13 | from Utils import hexString, printTestVectorOutput
14 | 
15 | def generateSimpleRawMaterial(length, seed1, seed2):
16 |     seed2 = seed2 % 8
17 |     return bytes([(seed1 + 161*length - ((i%256) << seed2) - ((i%256) >> (8-seed2)) + i)%256 for i in range(length)])
18 | 
19 | customizationByteSize = 32
20 | 
21 | def performTestTurboSHAKE256OneInput(inputLen, outputLen, customLen):  # hash one generated message and print input and output as hex
22 |     customization = 97  # passed below as the separationByte argument of TurboSHAKE256
23 |     inputMessage = generateSimpleRawMaterial(inputLen, outputLen, inputLen + customLen)  # seeds: outputLen and inputLen + customLen; customLen is not used for anything else here
24 |     print("outputLen {0:5d}, inputLen {1:5d}, customLen {2:3d}".format(outputLen, inputLen, customLen))
25 |     output = TurboSHAKE256(inputMessage, customization, outputLen)
26 |     print("Kangaroo-Twelve")  # NOTE(review): label says "Kangaroo-Twelve" although TurboSHAKE256 is under test -- confirm this is intended
27 |     print("Input of {0:d} bytes:".format(inputLen), end='')
28 |     for i in range(min(inputLen, 16)):  # show at most the first 16 input bytes
29 |         print(" {0:02x}".format(inputMessage[i]), end='')
30 |     if (inputLen > 16):
31 |         print(" ...", end='')  # marks a truncated input listing
32 |     print("")
33 |     print("Output of {0:d} bytes:".format(outputLen), end='')
34 |     for i in range(outputLen):  # the output is always printed in full
35 |         print(" {0:02x}".format(output[i]), end='')
36 |     print("")
37 |     print("")
38 | 
39 | def performTestTurboSHAKE256():
40 |     cBlockSize = 8192
41 |     outputLen = 256//8
42 |     customLen = 0
43 |     for inputLen in range(cBlockSize*9+124):
44 |         performTestTurboSHAKE256OneInput(inputLen, outputLen, customLen)
45 | 
46 |     outputLen = 128//8
47 |     while(outputLen <= 512//8):
48 |         inputLen = 0
49 |         while(inputLen <= (3*cBlockSize)):
50 |             customLen = 0
51 |             while(customLen <= customizationByteSize):
52 |                 performTestTurboSHAKE256OneInput(inputLen, outputLen, customLen)
53 |                 customLen += 7
54 |             inputLen = (inputLen + 167) if (inputLen > 0) else 1
55 |         outputLen = outputLen*2
56 | 
57 | def performShortTestTurboSHAKE256():
58 |     cBlockSize = 8192
59 |     outputLen = 256//8
60 |     customLen = 0
61 |     for inputLen in range(4):
62 |         performTestTurboSHAKE256OneInput(inputLen, outputLen, customLen)
63 |     performTestTurboSHAKE256OneInput(27121, outputLen, customLen)
64 | 
65 | #performTestTurboSHAKE256()
66 | #performShortTestTurboSHAKE256()
67 | 
68 | def printTestVectors():
69 |     print("  TurboSHAKE256(M=`00`^0, D=`1F`, 64):")
70 |     printTestVectorOutput(TurboSHAKE256(b'', 0x1F, 64))
71 |     print("  TurboSHAKE256(M=`00`^0, D=`1F`, 10032), last 32 bytes:")
72 |     printTestVectorOutput(TurboSHAKE256(b'', 0x1F, 10032)[10000:])
73 |     for i in range(7):
74 |         M = bytearray([(j % 251) for j in range(17**i)])
75 |         print("  TurboSHAKE256(M=ptn(17**{0:d} bytes), D=`1F`, 64):".format(i))
76 |         printTestVectorOutput(TurboSHAKE256(M, 0x1F, 64))
77 |     for D in [0x01, 0x06, 0x07, 0x0B, 0x30, 0x7F]:
78 |         i = D%3 + 1
79 |         M = bytearray([0xFF for j in range(2**i-1)])
80 |         print("  TurboSHAKE256(M=`{0}`, D=`{1:02X}`, 64):".format(hexString(M), D))
81 |         printTestVectorOutput(TurboSHAKE256(M, D, 64))
82 | 
83 | printTestVectors()
84 | 


--------------------------------------------------------------------------------
/Python/TurboSHAKE128Tests.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Implementation by Gilles Van Assche and Benoit Viguier, hereby denoted as "the implementers".
 3 | #
 4 | # For more information, feedback or questions, please refer to our website:
 5 | # https://keccak.team/
 6 | #
 7 | # To the extent possible under law, the implementers has waived all copyright
 8 | # and related or neighboring rights to the source code in this file.
 9 | # http://creativecommons.org/publicdomain/zero/1.0/
10 | 
11 | from __future__ import print_function
12 | from TurboSHAKE import TurboSHAKE128
13 | from Utils import hexString, printTestVectorOutput
14 | 
15 | def generateSimpleRawMaterial(length, seed1, seed2):
16 |     seed2 = seed2 % 8
17 |     return bytes([(seed1 + 161*length - ((i%256) << seed2) - ((i%256) >> (8-seed2)) + i)%256 for i in range(length)])
18 | 
19 | customizationByteSize = 32
20 | 
21 | def performTestTurboSHAKE128OneInput(inputLen, outputLen, customLen):  # hash one generated message and print input and output as hex
22 |     customization = 97  # passed below as the separationByte argument of TurboSHAKE128
23 |     inputMessage = generateSimpleRawMaterial(inputLen, outputLen, inputLen + customLen)  # seeds: outputLen and inputLen + customLen; customLen is not used for anything else here
24 |     print("outputLen {0:5d}, inputLen {1:5d}, customLen {2:3d}".format(outputLen, inputLen, customLen))
25 |     output = TurboSHAKE128(inputMessage, customization, outputLen)
26 |     print("Kangaroo-Twelve")  # NOTE(review): label says "Kangaroo-Twelve" although TurboSHAKE128 is under test -- confirm this is intended
27 |     print("Input of {0:d} bytes:".format(inputLen), end='')
28 |     for i in range(min(inputLen, 16)):  # show at most the first 16 input bytes
29 |         print(" {0:02x}".format(inputMessage[i]), end='')
30 |     if (inputLen > 16):
31 |         print(" ...", end='')  # marks a truncated input listing
32 |     print("")
33 |     print("Output of {0:d} bytes:".format(outputLen), end='')
34 |     for i in range(outputLen):  # the output is always printed in full
35 |         print(" {0:02x}".format(output[i]), end='')
36 |     print("")
37 |     print("")
38 | 
39 | def performTestTurboSHAKE128():
40 |     cBlockSize = 8192
41 |     outputLen = 256//8
42 |     customLen = 0
43 |     for inputLen in range(cBlockSize*9+124):
44 |         performTestTurboSHAKE128OneInput(inputLen, outputLen, customLen)
45 | 
46 |     outputLen = 128//8
47 |     while(outputLen <= 512//8):
48 |         inputLen = 0
49 |         while(inputLen <= (3*cBlockSize)):
50 |             customLen = 0
51 |             while(customLen <= customizationByteSize):
52 |                 performTestTurboSHAKE128OneInput(inputLen, outputLen, customLen)
53 |                 customLen += 7
54 |             inputLen = (inputLen + 167) if (inputLen > 0) else 1
55 |         outputLen = outputLen*2
56 | 
57 | def performShortTestTurboSHAKE128():
58 |     cBlockSize = 8192
59 |     outputLen = 256//8
60 |     customLen = 0
61 |     for inputLen in range(4):
62 |         performTestTurboSHAKE128OneInput(inputLen, outputLen, customLen)
63 |     performTestTurboSHAKE128OneInput(27121, outputLen, customLen)
64 | 
65 | #performTestTurboSHAKE128()
66 | #performShortTestTurboSHAKE128()
67 | 
68 | def printTestVectors():
69 |     print("  TurboSHAKE128(M=`00`^0, D=`1F`, 32):")
70 |     printTestVectorOutput(TurboSHAKE128(b'', 0x1F, 32))
71 |     print("  TurboSHAKE128(M=`00`^0, D=`1F`, 64):")
72 |     printTestVectorOutput(TurboSHAKE128(b'', 0x1F, 64))
73 |     print("  TurboSHAKE128(M=`00`^0, D=`1F`, 10032), last 32 bytes:")
74 |     printTestVectorOutput(TurboSHAKE128(b'', 0x1F, 10032)[10000:])
75 |     for i in range(7):
76 |         M = bytearray([(j % 251) for j in range(17**i)])
77 |         print("  TurboSHAKE128(M=ptn(17**{0:d} bytes), D=`1F`, 32):".format(i))
78 |         printTestVectorOutput(TurboSHAKE128(M, 0x1F, 32))
79 |     for D in [0x01, 0x06, 0x07, 0x0B, 0x30, 0x7F]:
80 |         i = D%3 + 1
81 |         M = bytearray([0xFF for j in range(2**i-1)])
82 |         print("  TurboSHAKE128(M=`{0}`, D=`{1:02X}`, 32):".format(hexString(M), D))
83 |         printTestVectorOutput(TurboSHAKE128(M, D, 32))
84 | 
85 | printTestVectors()
86 | 


--------------------------------------------------------------------------------
/lib/Optimized64/KeccakP-1600-SnP.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | K12 based on the eXtended Keccak Code Package (XKCP)
 3 | https://github.com/XKCP/XKCP
 4 | 
 5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
 6 | 
 7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
 8 | 
 9 | For more information, feedback or questions, please refer to the Keccak Team website:
10 | https://keccak.team/
11 | 
12 | To the extent possible under law, the implementer has waived all copyright
13 | and related or neighboring rights to the source code in this file.
14 | http://creativecommons.org/publicdomain/zero/1.0/
15 | 
16 | ---
17 | 
18 | Please refer to the XKCP for more details.
19 | */
20 | 
21 | #ifndef _KeccakP_1600_SnP_h_
22 | #define _KeccakP_1600_SnP_h_
23 | #include <stddef.h> /* size_t, used by the FastLoop_Absorb declarations below */
24 | 
25 | /* Keccak-p[1600] */
26 | #define KeccakP1600_stateSizeInBytes    200
27 | #define KeccakP1600_stateAlignment      8
28 | #define KeccakP1600_12rounds_FastLoop_supported
29 | 
30 | const char * KeccakP1600_GetImplementation(void); /* (void): an empty () is not a prototype before C23 */
31 | void KeccakP1600_Initialize(void *state);
32 | void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset);
33 | void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
34 | void KeccakP1600_Permute_12rounds(void *state);
35 | void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
36 | size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
37 | 
38 | void KeccakP1600_AVX512_Initialize(void *state);
39 | void KeccakP1600_AVX512_AddByte(void *state, unsigned char data, unsigned int offset);
40 | void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
41 | void KeccakP1600_AVX512_Permute_12rounds(void *state);
42 | void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
43 | size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
44 | 
45 | void KeccakP1600_AVX2_Initialize(void *state);
46 | void KeccakP1600_AVX2_AddByte(void *state, unsigned char data, unsigned int offset);
47 | void KeccakP1600_AVX2_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
48 | void KeccakP1600_AVX2_Permute_12rounds(void *state);
49 | void KeccakP1600_AVX2_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
50 | size_t KeccakP1600_AVX2_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
51 | 
52 | void KeccakP1600_opt64_Initialize(void *state);
53 | void KeccakP1600_opt64_AddByte(void *state, unsigned char data, unsigned int offset);
54 | void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
55 | void KeccakP1600_opt64_Permute_12rounds(void *state);
56 | void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
57 | size_t KeccakP1600_opt64_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
58 | 
59 | /* Keccak-p[1600]×2 */
60 | 
61 | int KeccakP1600times2_IsAvailable(void);
62 | const char * KeccakP1600times2_GetImplementation(void);
63 | 
64 | /* Keccak-p[1600]×4 */
65 | 
66 | int KeccakP1600times4_IsAvailable(void);
67 | const char * KeccakP1600times4_GetImplementation(void);
68 | 
69 | /* Keccak-p[1600]×8 */
70 | 
71 | int KeccakP1600times8_IsAvailable(void);
72 | const char * KeccakP1600times8_GetImplementation(void);
73 | 
74 | #endif
75 | 


--------------------------------------------------------------------------------
/Python/KangarooTwelveTests.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Implementation by Gilles Van Assche, hereby denoted as "the implementer".
 3 | #
 4 | # For more information, feedback or questions, please refer to our website:
 5 | # https://keccak.team/
 6 | #
 7 | # To the extent possible under law, the implementer has waived all copyright
 8 | # and related or neighboring rights to the source code in this file.
 9 | # http://creativecommons.org/publicdomain/zero/1.0/
10 | 
11 | from KangarooTwelve import KT128, KT256
12 | from Utils import hexStringSpecial, numberStringSpecial, printTestVectorOutput
13 | 
14 | def printKT128TestVectors():  # print labelled KT128 outputs for a fixed list of (M, C, output length) cases
15 |     print("  KT128(M=`00`^0, C=`00`^0, 32):")
16 |     printTestVectorOutput(KT128(b'', b'', 32))
17 |     print("  KT128(M=`00`^0, C=`00`^0, 64):")
18 |     printTestVectorOutput(KT128(b'', b'', 64))
19 |     print("  KT128(M=`00`^0, C=`00`^0, 10032), last 32 bytes:")
20 |     printTestVectorOutput(KT128(b'', b'', 10032)[10000:])
21 |     for i in range(7):
22 |         C = b''
23 |         M = bytearray([(j % 251) for j in range(17**i)])
24 |         print("  KT128(M=ptn({0:s} bytes), C=`00`^0, 32):".format(numberStringSpecial(17, i)))
25 |         printTestVectorOutput(KT128(M, C, 32))
26 |     for i in range(4):
27 |         M = bytearray([0xFF for j in range(2**i-1)])
28 |         C = bytearray([(j % 251) for j in range(41**i)])
29 |         print("  KT128({0:s}, C=ptn({1:s} bytes), 32):".format(hexStringSpecial(M), numberStringSpecial(41, i)))
30 |         printTestVectorOutput(KT128(M, C, 32))
31 |     # We test with 8191 bytes of M because right_encode of the empty C is 1 byte, so S is exactly 8192 bytes
32 |     print("  KT128(M=ptn(8191 bytes), C=`00`^0, 32):")
33 |     printTestVectorOutput(KT128(bytearray([(j % 251) for j in range(8191)]), b'', 32))
34 |     # We test with 8192 bytes of M because right_encode of the empty C is 1 byte, so S is 8193 bytes and starts a new block
35 |     print("  KT128(M=ptn(8192 bytes), C=`00`^0, 32):")
36 |     printTestVectorOutput(KT128(bytearray([(j % 251) for j in range(8192)]), b'', 32))
37 |     # We test with 8192 bytes of M + 8189 bytes of C because right_encode(8189) is 3 bytes, thus S is exactly 2 * 8192 bytes
38 |     # We test with 8192 bytes of M + 8190 bytes of C because right_encode(8190) is 3 bytes, thus S is exactly 2 * 8192 + 1 bytes
39 |     for c in [8189, 8190]:
40 |         C = bytearray([(j % 251) for j in range(c)])
41 |         print("  KT128(M=ptn(8192 bytes), C=ptn({0:d} bytes), 32):".format(c))
42 |         printTestVectorOutput(KT128(bytearray([(j % 251) for j in range(8192)]), C, 32))
43 | 
44 | def printKT256TestVectors():  # print labelled KT256 outputs for a fixed list of (M, C, output length) cases
45 |     print("  KT256(M=`00`^0, C=`00`^0, 64):")
46 |     printTestVectorOutput(KT256(b'', b'', 64))
47 |     print("  KT256(M=`00`^0, C=`00`^0, 128):")
48 |     printTestVectorOutput(KT256(b'', b'', 128))
49 |     print("  KT256(M=`00`^0, C=`00`^0, 10064), last 64 bytes:")
50 |     printTestVectorOutput(KT256(b'', b'', 10064)[10000:])
51 |     for i in range(7):
52 |         C = b''
53 |         M = bytearray([(j % 251) for j in range(17**i)])
54 |         print("  KT256(M=ptn({0:s} bytes), C=`00`^0, 64):".format(numberStringSpecial(17, i)))
55 |         printTestVectorOutput(KT256(M, C, 64))
56 |     for i in range(4):
57 |         M = bytearray([0xFF for j in range(2**i-1)])
58 |         C = bytearray([(j % 251) for j in range(41**i)])
59 |         print("  KT256({0:s}, C=ptn({1:s} bytes), 64):".format(hexStringSpecial(M), numberStringSpecial(41, i)))
60 |         printTestVectorOutput(KT256(M, C, 64))
61 |     # We test with 8191 bytes of M because right_encode of the empty C is 1 byte, so S is exactly 8192 bytes
62 |     print("  KT256(M=ptn(8191 bytes), C=`00`^0, 64):")
63 |     printTestVectorOutput(KT256(bytearray([(j % 251) for j in range(8191)]), b'', 64))
64 |     # We test with 8192 bytes of M because right_encode of the empty C is 1 byte, so S is 8193 bytes and starts a new block
65 |     print("  KT256(M=ptn(8192 bytes), C=`00`^0, 64):")
66 |     printTestVectorOutput(KT256(bytearray([(j % 251) for j in range(8192)]), b'', 64))
67 |     # We test with 8192 bytes of M + 8189 bytes of C because right_encode(8189) is 3 bytes, thus S is exactly 2 * 8192 bytes
68 |     # We test with 8192 bytes of M + 8190 bytes of C because right_encode(8190) is 3 bytes, thus S is exactly 2 * 8192 + 1 bytes
69 |     for c in [8189, 8190]:
70 |         C = bytearray([(j % 251) for j in range(c)])
71 |         print("  KT256(M=ptn(8192 bytes), C=ptn({0:d} bytes), 64):".format(c))
72 |         printTestVectorOutput(KT256(bytearray([(j % 251) for j in range(8192)]), C, 64))
73 | 
74 | printKT128TestVectors()
75 | printKT256TestVectors()
76 | 


--------------------------------------------------------------------------------
/lib/KT-threadpool.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | K12 based on the eXtended Keccak Code Package (XKCP)
  3 | https://github.com/XKCP/XKCP
  4 | 
  5 | Thread pool abstraction layer for portable threading support.
  6 | 
  7 | This provides a simple, application-implementable thread pool API that allows
  8 | KangarooTwelve to use custom threading implementations or fall back to
  9 | sequential execution on platforms without threading support.
 10 | 
 11 | To the extent possible under law, the implementer has waived all copyright
 12 | and related or neighboring rights to the source code in this file.
 13 | http://creativecommons.org/publicdomain/zero/1.0/
 14 | */
 15 | 
 16 | #ifndef _KT_threadpool_h_
 17 | #define _KT_threadpool_h_
 18 | 
 19 | #include <stddef.h>
 20 | 
 21 | /**
 22 |  * Abstract thread pool API.
 23 |  *
 24 |  * Applications can implement this interface to provide custom threading
 25 |  * backends (e.g., Windows thread pool, custom work-stealing scheduler, etc.).
 26 |  *
 27 |  * The API is designed for batch job processing: submit multiple jobs, then
 28 |  * wait for all to complete. This matches the KangarooTwelve tree hashing
 29 |  * pattern where chunk processing is distributed across threads.
 30 |  */
 31 | typedef struct KT_ThreadPool_API {
 32 |     /**
 33 |      * Minimum input size (in bytes) required to enable parallel processing.
 34 |      *
 35 |      * If the total input size is smaller than this threshold, KangarooTwelve
 36 |      * will not use threading for that particular Update() call, avoiding
 37 |      * thread overhead for small inputs.
 38 |      *
 39 |      * Default: 2097152 (2 MB)
 40 |      * Rationale: Threading overhead outweighs benefits for small inputs.
 41 |      *            Optimal results typically seen with inputs > 10 MB.
 42 |      */
 43 |     size_t min_input_size_for_threading;
 44 | 
 45 |     /**
 46 |      * Create a thread pool with the specified number of worker threads.
 47 |      *
 48 |      * @param num_threads  Number of worker threads to create.
 49 |      *                     If 1, implementation may skip thread creation overhead.
 50 |      * @return Opaque pool handle on success, NULL on failure
 51 |      * @note This is called once during KangarooTwelve initialization
 52 |      */
 53 |     void* (*create)(int num_threads);
 54 | 
 55 |     /**
 56 |      * Submit work to the thread pool.
 57 |      *
 58 |      * The work function will be called with work_data as its argument.
 59 |      * Multiple jobs may be submitted before calling wait_all().
 60 |      *
 61 |      * @param pool       Opaque pool handle from create()
 62 |      * @param work_fn    Function to execute (called as work_fn(work_data))
 63 |      * @param work_data  Opaque pointer passed to work_fn
 64 |      * @return 0 on success, non-zero on error
 65 |      * @note work_fn must be thread-safe and not access shared mutable state
 66 |      */
 67 |     int (*submit)(void* pool, void (*work_fn)(void*), void* work_data);
 68 | 
 69 |     /**
 70 |      * Wait for all submitted work to complete.
 71 |      *
 72 |      * Blocks until all jobs submitted since the last wait_all() have finished.
 73 |      * After this returns, it is safe to submit new work.
 74 |      *
 75 |      * @param pool  Opaque pool handle from create()
 76 |      * @note This may be called multiple times to wait for different batches
 77 |      */
 78 |     void (*wait_all)(void* pool);
 79 | 
 80 |     /**
 81 |      * Destroy the thread pool and free all resources.
 82 |      *
 83 |      * All work must be complete before calling this (call wait_all() first).
 84 |      * After destruction, the pool handle must not be used.
 85 |      *
 86 |      * @param pool  Opaque pool handle from create()
 87 |      * @note This is called during KangarooTwelve cleanup
 88 |      */
 89 |     void (*destroy)(void* pool);
 90 | } KT_ThreadPool_API;
 91 | 
 92 | /**
 93 |  * Built-in thread pool backend using POSIX threads (pthreads).
 94 |  *
 95 |  * Available on Linux, macOS, BSD, and other Unix-like systems with pthread support.
 96 |  * Provides true parallel execution using worker threads.
 97 |  *
 98 |  * This is the default backend on pthread-capable platforms.
 99 |  */
100 | extern const KT_ThreadPool_API KT_ThreadPool_Pthread;
101 | 
102 | /**
103 |  * Built-in sequential (no-threading) backend.
104 |  *
105 |  * Available on all platforms. Submitted jobs are queued and then run one by
106 |  * one in the thread that calls wait_all() (no actual parallelism). Useful as a
107 |  * fallback on platforms without threading support or for testing/debugging.
108 |  *
109 |  * This is the default backend on platforms without pthread support.
110 |  */
111 | extern const KT_ThreadPool_API KT_ThreadPool_Sequential;
112 | 
113 | /**
114 |  * Get the default thread pool API for the current platform.
115 |  *
116 |  * Returns pthread backend if available, otherwise sequential backend.
117 |  *
118 |  * @return Pointer to the default thread pool API
119 |  */
120 | const KT_ThreadPool_API* KT_ThreadPool_GetDefault(void);
121 | 
122 | #endif /* _KT_threadpool_h_ */
123 | 


--------------------------------------------------------------------------------
/Python/TurboSHAKE.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # Implementation by Gilles Van Assche, hereby denoted as "the implementer".
  3 | #
  4 | # For more information, feedback or questions, please refer to our website:
  5 | # https://keccak.team/
  6 | #
  7 | # To the extent possible under law, the implementer has waived all copyright
  8 | # and related or neighboring rights to the source code in this file.
  9 | # http://creativecommons.org/publicdomain/zero/1.0/
 10 | 
 11 | def ROL64(a, n):
 12 |     return ((a >> (64-(n%64))) + (a << (n%64))) % (1 << 64)
 13 | 
 14 | def load64(b):
 15 |     return sum((b[i] << (8*i)) for i in range(8))
 16 | 
 17 | def store64(a):
 18 |     return bytearray((a >> (8*i)) % 256 for i in range(8))
 19 | 
 20 | def hex2lane(hexstring):
 21 |     bytez = [int(token, 16) for token in hexstring.split()]
 22 |     return load64(bytez)
 23 | 
 24 | def KP(state):  # applies 12 rounds to a 200-byte state and returns the result as a new bytearray
 25 |     RC = [  # one 64-bit constant per round, each spelled as 8 bytes in little-endian order
 26 |         hex2lane("8B 80 00 80 00 00 00 00"),
 27 |         hex2lane("8B 00 00 00 00 00 00 80"),
 28 |         hex2lane("89 80 00 00 00 00 00 80"),
 29 |         hex2lane("03 80 00 00 00 00 00 80"),
 30 |         hex2lane("02 80 00 00 00 00 00 80"),
 31 |         hex2lane("80 00 00 00 00 00 00 80"),
 32 |         hex2lane("0A 80 00 00 00 00 00 00"),
 33 |         hex2lane("0A 00 00 80 00 00 00 80"),
 34 |         hex2lane("81 80 00 80 00 00 00 80"),
 35 |         hex2lane("80 80 00 00 00 00 00 80"),
 36 |         hex2lane("01 00 00 80 00 00 00 00"),
 37 |         hex2lane("08 80 00 80 00 00 00 80"),
 38 |     ]
 39 | 
 40 |     lanes = [[0 for _ in range(5)] for _ in range(5)]  # 5x5 grid of integers, indexed lanes[x][y]
 41 |     for x in range(5):
 42 |         for y in range(5):
 43 |             lanes[x][y] = load64(state[8*(x+5*y):8*(x+5*y)+8])  # lane (x, y) is read from the 8 bytes starting at 8*(x+5*y)
 44 | 
 45 |     for round in range(12):
 46 |         # theta: XOR into every lane a value built from two column parities
 47 |         C = [0]*5  # C[x] becomes the XOR of the five lanes lanes[x][0..4]
 48 |         for x in range(5):
 49 |             C[x] = lanes[x][0]
 50 |             C[x] ^= lanes[x][1]
 51 |             C[x] ^= lanes[x][2]
 52 |             C[x] ^= lanes[x][3]
 53 |             C[x] ^= lanes[x][4]
 54 |         D = [0]*5  # D[x] mixes the parity at x-1 with the parity at x+1 rotated left by 1 (indices mod 5)
 55 |         for x in range(5):
 56 |             D[x] = C[(x+4) % 5] ^ ROL64(C[(x+1) % 5], 1)
 57 |         for y in range(5):
 58 |             for x in range(5):
 59 |                 lanes[x][y] = lanes[x][y]^D[x]
 60 | 
 61 |         # rho and pi: starting from lane (1, 0), 24 steps each rotate the carried lane and store it at the next position
 62 |         (x, y) = (1, 0)
 63 |         current = lanes[x][y]
 64 |         for t in range(24):
 65 |             (x, y) = (y, (2*x+3*y) % 5)
 66 |             (current, lanes[x][y]) = (lanes[x][y], ROL64(current, (t+1)*(t+2)//2))  # keep the overwritten lane for the next step; the rotation amount is the (t+1)-th triangular number
 67 | 
 68 |         # chi: each lane is combined with the next two lanes at the same y
 69 |         for y in range(5):
 70 |             T = [0]*5  # snapshot of the five lanes at this y, so updates below read the old values
 71 |             for x in range(5):
 72 |                 T[x] = lanes[x][y]
 73 |             for x in range(5):
 74 |                 lanes[x][y] = T[x] ^((~T[(x+1) % 5]) & T[(x+2) % 5])  # ~ gives a negative Python int; the & with a non-negative lane makes the result non-negative again
 75 | 
 76 |         # iota: XOR this round's constant into lane (0, 0)
 77 |         lanes[0][0] ^= RC[round]
 78 | 
 79 |     state = bytearray()  # rebuilt from scratch; the caller's state object is not modified
 80 |     for y in range(5):
 81 |         for x in range(5):
 82 |             state = state + store64(lanes[x][y])  # y outer, x inner: lane (x, y) lands at byte offset 8*(x+5*y) again
 83 | 
 84 |     return state
 85 | 
 86 | def XOR(state1, state2):
 87 |     return [state1[i] ^ state2[i] for i in range(min(len(state1), len(state2)))]
 88 | 
 89 | def TurboSHAKE128(message, separationByte, outputByteLen):
 90 |     offset = 0
 91 |     state = [0x00]*200
 92 |     input = list(message) + [separationByte]
 93 | 
 94 |     # === Absorb complete blocks ===
 95 |     while offset < len(input) - 168:
 96 |         state = XOR(state, input[offset : offset + 168] + [0x00]*32)
 97 |         state = KP(state)
 98 |         offset += 168
 99 | 
100 |     # === Absorb last block and treatment of padding ===
101 |     LastBlockLength = len(input) - offset
102 |     state = XOR(state, input[offset:] + [0x00]*(200-LastBlockLength))
103 |     state = XOR(state, [0x00]*167 + [0x80] + [0x00]*32)
104 |     state = KP(state)
105 | 
106 |     # === Squeeze ===
107 |     output = bytearray()
108 |     while outputByteLen > 168:
109 |         output = output + state[0:168]
110 |         outputByteLen -= 168
111 |         state = KP(state)
112 | 
113 |     output = output + state[0:outputByteLen]
114 |     return output
115 | 
116 | def TurboSHAKE256(message, separationByte, outputByteLen):
117 |     offset = 0
118 |     state = [0x00]*200
119 |     input = list(message) + [separationByte]
120 | 
121 |     # === Absorb complete blocks ===
122 |     while offset < len(input) - 136:
123 |         state = XOR(state, input[offset : offset + 136] + [0x00]*64)
124 |         state = KP(state)
125 |         offset += 136
126 | 
127 |     # === Absorb last block and treatment of padding ===
128 |     LastBlockLength = len(input) - offset
129 |     state = XOR(state, input[offset:] + [0x00]*(200-LastBlockLength))
130 |     state = XOR(state, [0x00]*135 + [0x80] + [0x00]*64)
131 |     state = KP(state)
132 | 
133 |     # === Squeeze ===
134 |     output = bytearray()
135 |     while outputByteLen > 136:
136 |         output = output + state[0:136]
137 |         outputByteLen -= 136
138 |         state = KP(state)
139 | 
140 |     output = output + state[0:outputByteLen]
141 |     return output
142 | 


--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
 1 | # What is KangarooTwelve ?
 2 | 
 3 | [**KangarooTwelve**][k12] is a family of two (**KT128** and **KT256**) fast and secure extendable-output functions (XOF), the generalization of hash functions to arbitrary output lengths.
 4 | Derived from Keccak, they aim at higher speeds than FIPS 202's SHA-3 and SHAKE functions, while retaining their flexibility and basis of security.
 5 | 
 6 | On high-end platforms, they can exploit a high degree of parallelism, whether using multiple cores or the single-instruction multiple-data (SIMD) instruction set of modern processors.
 7 | On Intel's Haswell and Skylake architectures, KT128 tops at less than 1.5 cycles/byte for long messages on a single core, and at 0.51 cycles/byte on the SkylakeX and Cascade Lake architectures.
 8 | On the latest Apple A14 and M1 processors, KangarooTwelve can take advantage of the ARMv8-A's SHA-3 dedicated instructions and KT128 delivers 0.75 cycles/byte for long messages on a single core.
 9 | On low-end platforms, as well as for short messages, KT128 also benefits from about a factor two speed-up compared to the fastest FIPS 202 instance SHAKE128.
10 | 
11 | More details can be found in our [ACNS paper][eprint] (KT128 only) and in [RFC 9861][rfc9861].
12 | 
13 | # What can I find here?
14 | 
15 | This repository contains source code that implements the extendable-output (or hash) functions **KT128** and **KT256**.
16 | Its purpose is to offer optimized implementations of KangarooTwelve and nothing else.
17 | 
18 | The code comes from the [**eXtended Keccak Code Package**][xkcp] (or **XKCP**), after much trimming to keep only what is needed for KT.
19 | It is still structured like the XKCP in two layers. The lower layer implements the permutation Keccak-_p_[1600, 12] and possibly parallel versions thereof, whereas the higher layer implements the sponge construction and the tree hash mode.
20 | Also, some sources have been merged to reduce the file count.
21 | 
22 | * For the higher layer, we kept only the code needed for KT.
23 | * For the lower layer, we removed all the functions that are not needed for KT. The lower layer therefore implements a subset of the SnP and PlSnP interfaces.
24 | 
25 | For Keccak or Xoodoo-based functions other than KT128 and KT256, it is recommended to use the XKCP itself instead and not to mix both this repository and the XKCP.
26 | 
27 | 
28 | # Is there a tool to compute the hash of a file?
29 | 
30 | Not in this repository, but Jack O'Connor's [`kangarootwelve_xkcp.rs` repository](https://github.com/oconnor663/kangarootwelve_xkcp.rs) contains Rust bindings to this code and a `k12sum` utility.
31 | Pre-built binaries can be found [there](https://github.com/oconnor663/kangarootwelve_xkcp.rs/releases).
32 | 
33 | 
34 | # How can I build this code?
35 | 
36 | This repository uses the same build system as that of the XKCP.
37 | To build, the following tools are needed:
38 | 
39 | * *GCC*
40 | * *GNU make*
41 | * *xsltproc*
42 | 
43 | The different targets are defined in [`Makefile.build`](Makefile.build). This file is expanded into a regular makefile using *xsltproc*. To use it, simply type, e.g.,
44 | 
45 | ```
46 | make generic64/KTtests
47 | ```
48 | 
49 | to build KTtests generically optimized for 64-bit platforms. The name before the slash indicates the platform, while the part after the slash is the executable to build. As another example, the static (resp. dynamic) library is built by typing `make generic64/libKT.a` (resp. `.so`) or similarly with `generic64` replaced with the appropriate platform name.  An alternate C compiler can be specified via the `CC` environment variable.
50 | 
51 | Instead of building an executable with *GCC*, one can choose to select the files needed and make a package. For this, simply append `.pack` to the target name, e.g.,
52 | 
53 | ```
54 | make generic64/KTtests.pack
55 | ```
56 | 
57 | This creates a `.tar.gz` archive with all the necessary files to build the given target.
58 | 
59 | The list of targets can be found at the end of [`Makefile.build`](Makefile.build) or by running `make` without parameters.
60 | 
61 | ## Microsoft Visual Studio support
62 | 
63 | KangarooTwelve can be compiled with Microsoft Visual Studio (MSVC). The XKCP build system offers support for the creation of project files. To get a project file for a given target, simply append `.vcxproj` to the target name, e.g.,
64 | 
65 | ```
66 | make generic64noAsm/KTtests.vcxproj
67 | ```
68 | 
69 | The targets `generic32` and `generic64noAsm` can be used with MSVC, but not `generic64` as it contains assembly implementations in the GCC syntax, which at this point cannot be used with MSVC.
70 | Please refer to the documentation of [XKCP][xkcp] for more details on the limitations of the support of MSVC.
71 | 
72 | [k12]: https://keccak.team/kangarootwelve.html
73 | [xkcp]: https://github.com/XKCP/XKCP
74 | [eprint]: https://eprint.iacr.org/2016/770.pdf
75 | [rfc9861]: https://datatracker.ietf.org/doc/rfc9861/
76 | 
77 | 
78 | # Acknowledgments
79 | 
80 | We wish to thank:
81 | 
82 | - Andy Polyakov for his expertise with the ARMv8-A+SHA3 code, and in particular for his core routine from [CRYPTOGAMS](https://github.com/dot-asm/cryptogams)
83 | - Duc Tri Nguyen for his benchmark on the Apple M1
84 | - Jack O'Connor for bug fixes and more importantly for his [Rust bindings](https://github.com/oconnor663/kangarootwelve_xkcp.rs)
85 | - Kent Ross for his contributions to this code and its quality
86 | - Hadi El Yakhni for adding KT256
87 | 


--------------------------------------------------------------------------------
/Makefile.build:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0"?>
  2 | <!--
  3 | K12 based on the eXtended Keccak Code Package (XKCP)
  4 | https://github.com/XKCP/XKCP
  5 | 
  6 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier.
  7 | 
  8 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
  9 | 
 10 | For more information, feedback or questions, please refer to the Keccak Team website:
 11 | https://keccak.team/
 12 | 
 13 | To the extent possible under law, the implementer has waived all copyright
 14 | and related or neighboring rights to the source code in this file.
 15 | http://creativecommons.org/publicdomain/zero/1.0/
 16 | -->
 17 | <build xmlns:xi="http://www.w3.org/2001/XInclude">
 18 | 
 19 |     <fragment name="optimized">
 20 |         <gcc>-fomit-frame-pointer</gcc>
 21 |         <gcc>-O2</gcc>
 22 |         <gcc>-g0</gcc>
 23 |     </fragment>
 24 | 
 25 |     <!-- Keccak-p[1600] -->
 26 | 
 27 |     <fragment name="inplace32bi" inherits="optimized">
 28 |         <c>lib/Inplace32BI/KeccakP-1600-inplace32BI.c</c>
 29 |         <h>lib/Inplace32BI/KeccakP-1600-SnP.h</h>
 30 |     </fragment>
 31 | 
 32 |     <fragment name="optimized64" inherits="optimized">
 33 |         <c>lib/Optimized64/KeccakP-1600-opt64.c</c>
 34 |         <h>lib/Optimized64/KeccakP-1600-SnP.h</h>
 35 |         <s>lib/Optimized64/KeccakP-1600-AVX2.s</s>
 36 |         <s>lib/Optimized64/KeccakP-1600-AVX512.s</s>
 37 |         <c gcc="-mssse3">lib/Optimized64/KeccakP-1600-timesN-SSSE3.c</c>
 38 |         <c gcc="-mavx2">lib/Optimized64/KeccakP-1600-timesN-AVX2.c</c>
 39 |         <c gcc="-mavx512f -mavx512vl">lib/Optimized64/KeccakP-1600-timesN-AVX512.c</c>
 40 |         <c>lib/Optimized64/KeccakP-1600-runtimeDispatch.c</c>
 41 |     </fragment>
 42 | 
 43 |     <fragment name="optimized64noAsm" inherits="optimized">
 44 |         <c>lib/Optimized64/KeccakP-1600-opt64.c</c>
 45 |         <c gcc="-mavx512f -mavx512vl">lib/Optimized64/KeccakP-1600-AVX512-plainC.c</c>
 46 |         <h>lib/Optimized64/KeccakP-1600-SnP.h</h>
 47 |         <c gcc="-mssse3">lib/Optimized64/KeccakP-1600-timesN-SSSE3.c</c>
 48 |         <c gcc="-mavx2">lib/Optimized64/KeccakP-1600-timesN-AVX2.c</c>
 49 |         <c gcc="-mavx512f -mavx512vl">lib/Optimized64/KeccakP-1600-timesN-AVX512.c</c>
 50 |         <c>lib/Optimized64/KeccakP-1600-runtimeDispatch.c</c>
 51 |         <define>KeccakP1600_noAssembly</define>
 52 |     </fragment>
 53 | 
 54 |     <fragment name="optimized64plain" inherits="optimized">
 55 |         <c>lib/Optimized64/KeccakP-1600-opt64.c</c>
 56 |         <c>lib/Plain64/KeccakP-1600-plain64.c</c>
 57 |         <h>lib/Plain64/KeccakP-1600-SnP.h</h>
 58 |     </fragment>
 59 | 
 60 |     <fragment name="ARMv8Asha3" inherits="optimized">
 61 |         <c>lib/Optimized64/KeccakP-1600-opt64.c</c>
 62 |         <s gcc="-march=armv8.4-a+sha3">lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S</s>
 63 |         <c>lib/ARMv8Asha3/KeccakP-1600-runtimeDispatch.c</c>
 64 |         <h>lib/ARMv8Asha3/KeccakP-1600-SnP.h</h>
 65 |     </fragment>
 66 | 
 67 |     <!-- KangarooTwelve -->
 68 | 
 69 |     <fragment name="KangarooTwelve">
 70 |         <config>XKCP_has_KangarooTwelve</config>
 71 |         <h>lib/align.h</h>
 72 |         <c>lib/KangarooTwelve.c</c>
 73 |         <h>lib/KangarooTwelve.h</h>
 74 |         <h>lib/KT-threadpool.h</h>
 75 |         <c>lib/KT-threadpool.c</c>
 76 |         <c>lib/KT-threadpool-pthread.c</c>
 77 |         <c>lib/KT-threadpool-sequential.c</c>
 78 |         <c>lib/KangarooTwelve-threading.c</c>
 79 |         <h>lib/KangarooTwelve-threading.h</h>
 80 |         <gcc>-pthread</gcc>
 81 |     </fragment>
 82 | 
 83 |     <!-- For the name of the targets, please see the end of this file. -->
 84 | 
 85 |     <fragment name="common">
 86 |         <h>lib/align.h</h>
 87 |         <h>lib/brg_endian.h</h>
 88 |     </fragment>
 89 | 
 90 |     <!-- To run many tests -->
 91 |     <fragment name="KTtests" inherits="common KangarooTwelve">
 92 |         <c>tests/main.c</c>
 93 |         <c>tests/testPerformance.c</c>
 94 |         <c>tests/timing.c</c>
 95 |         <h>tests/timing.h</h>
 96 |         <h>tests/testPerformance.h</h>
 97 |         <c>tests/testKangarooTwelve.c</c>
 98 |         <h>tests/testKangarooTwelve.h</h>
 99 |         <gcc>-lm</gcc>
100 |         <define>KeccakP1600_enable_simd_options</define>
101 |     </fragment>
102 | 
103 |     <!-- To compute a hash of a file -->
104 |     <fragment name="KeccakSum" inherits="KangarooTwelve">
105 |         <c>util/KeccakSum/KeccakSum.c</c>
106 |         <c>util/KeccakSum/base64.c</c>
107 |         <h>util/KeccakSum/base64.h</h>
108 |     </fragment>
109 | 
110 |     <!-- To make a library -->
111 |     <fragment name="libKT.a" inherits="KangarooTwelve"/>
112 |     <fragment name="libKT.so" inherits="KangarooTwelve"/>
113 |     <fragment name="libKT.dylib" inherits="KangarooTwelve"/>
114 | 
115 |     <!-- Generically optimized 32-bit implementation -->
116 |     <fragment name="generic32" inherits="inplace32bi"/>
117 | 
118 |     <!-- Generically optimized 64-bit implementation, including SSSE3, AVX2 and AVX512 -->
119 |     <fragment name="generic64" inherits="optimized64"/>
120 | 
121 |     <!-- Same, but without the assembly file (for MS Visual Studio) -->
122 |     <fragment name="generic64noAsm" inherits="optimized64noAsm"/>
123 | 
124 |     <!-- Plain C optimized 64-bit implementation only -->
125 |     <fragment name="plain64" inherits="optimized64plain"/>
126 | 
127 |     <!-- Target names are of the form x/y where x is taken from the first set and y from the second set. -->
128 |     <group all="all">
129 |         <product delimiter="/">
130 |             <factor set="generic32 generic64 generic64noAsm plain64 ARMv8Asha3"/>
131 |             <factor set="KTtests libKT.a libKT.so libKT.dylib KeccakSum"/>
132 |         </product>
133 |     </group>
134 | </build>
135 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
  1 | notifications:
  2 |   email:
  3 |     recipients:
  4 |       - gilles-travis@noekeon.org
  5 |     on_failure: always
  6 | 
  7 | language: c
  8 | 
  9 | sudo: required
 10 | 
 11 | before_install:
 12 | - |-
 13 |     case $TRAVIS_OS_NAME in
 14 |       linux)
 15 |         sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test 
 16 |         sudo apt-get -qq update 
 17 |         sudo apt-get install xsltproc
 18 |       ;;
 19 |       windows)
 20 |         choco install -y make 
 21 |         choco install -y xsltproc
 22 |       ;;
 23 |     esac
 24 | 
 25 | jobs:
 26 |   allow_failures:
 27 |     - script: make all
 28 |     - script: make generic32/K12Tests && ./bin/generic32/K12Tests -a
 29 |     - script: make generic64/K12Tests && ./bin/generic64/K12Tests -a
 30 |     - script: make generic64noAsm/K12Tests && ./bin/generic64noAsm/K12Tests -a
 31 |     - script: make plain64/K12Tests && ./bin/plain64/K12Tests -a
 32 |   include:
 33 |     - stage: "Tests on AMD64 (windows)"
 34 |       script: make generic32/K12Tests && ./bin/generic32/K12Tests -a
 35 |       os: windows
 36 |       arch: amd64
 37 |       compiler: gcc
 38 |       name: "generic32 (gcc)"
 39 |     - script: make generic64/K12Tests && ./bin/generic64/K12Tests -a
 40 |       os: windows
 41 |       arch: amd64
 42 |       compiler: gcc
 43 |       name: "generic64 (gcc)"
 44 |     - script: make generic64noAsm/K12Tests && ./bin/generic64noAsm/K12Tests -a
 45 |       os: windows
 46 |       arch: amd64
 47 |       compiler: gcc
 48 |       name: "generic64noAsm (gcc)"
 49 |     - script: make plain64/K12Tests && ./bin/plain64/K12Tests -a
 50 |       os: windows
 51 |       arch: amd64
 52 |       compiler: gcc
 53 |       name: "plain64 (gcc)"
 54 |     - script: make all
 55 |       os: windows
 56 |       arch: amd64
 57 |       compiler: gcc
 58 |       name: "all (gcc)"
 59 |     - script: make generic32/K12Tests && ./bin/generic32/K12Tests -a
 60 |       os: windows
 61 |       arch: amd64
 62 |       compiler: clang
 63 |       name: "generic32 (clang)"
 64 |     - script: make generic64/K12Tests && ./bin/generic64/K12Tests -a
 65 |       os: windows
 66 |       arch: amd64
 67 |       compiler: clang
 68 |       name: "generic64 (clang)"
 69 |     - script: make generic64noAsm/K12Tests && ./bin/generic64noAsm/K12Tests -a
 70 |       os: windows
 71 |       arch: amd64
 72 |       compiler: clang
 73 |       name: "generic64noAsm (clang)"
 74 |     - script: make plain64/K12Tests && ./bin/plain64/K12Tests -a
 75 |       os: windows
 76 |       arch: amd64
 77 |       compiler: clang
 78 |       name: "plain64 (clang)"
 79 |     - script: make all
 80 |       os: windows
 81 |       arch: amd64
 82 |       compiler: clang
 83 |       name: "all (clang)"
 84 | 
 85 |     - stage: "Tests on AMD64 (linux)"
 86 |       script: make generic32/K12Tests && ./bin/generic32/K12Tests -a
 87 |       os: linux
 88 |       arch: amd64
 89 |       compiler: gcc
 90 |       name: "generic32 (gcc)"
 91 |     - script: make generic64/K12Tests && ./bin/generic64/K12Tests -a
 92 |       os: linux
 93 |       arch: amd64
 94 |       compiler: gcc
 95 |       name: "generic64 (gcc)"
 96 |     - script: make generic64noAsm/K12Tests && ./bin/generic64noAsm/K12Tests -a
 97 |       os: linux
 98 |       arch: amd64
 99 |       compiler: gcc
100 |       name: "generic64noAsm (gcc)"
101 |     - script: make plain64/K12Tests && ./bin/plain64/K12Tests -a
102 |       os: linux
103 |       arch: amd64
104 |       compiler: gcc
105 |       name: "plain64 (gcc)"
106 |     - script: make all
107 |       os: linux
108 |       arch: amd64
109 |       compiler: gcc
110 |       name: "all (gcc)"
111 |     - script: make generic32/K12Tests && ./bin/generic32/K12Tests -a
112 |       os: linux
113 |       arch: amd64
114 |       compiler: clang
115 |       name: "generic32 (clang)"
116 |     - script: make generic64/K12Tests && ./bin/generic64/K12Tests -a
117 |       os: linux
118 |       arch: amd64
119 |       compiler: clang
120 |       name: "generic64 (clang)"
121 |     - script: make generic64noAsm/K12Tests && ./bin/generic64noAsm/K12Tests -a
122 |       os: linux
123 |       arch: amd64
124 |       compiler: clang
125 |       name: "generic64noAsm (clang)"
126 |     - script: make plain64/K12Tests && ./bin/plain64/K12Tests -a
127 |       os: linux
128 |       arch: amd64
129 |       compiler: clang
130 |       name: "plain64 (clang)"
131 |     - script: make all
132 |       os: linux
133 |       arch: amd64
134 |       compiler: clang
135 |       name: "all (clang)"
136 | 
137 |     - stage: "Tests on AMD64 (osx)"
138 |       script: make generic32/K12Tests && ./bin/generic32/K12Tests -a
139 |       os: osx
140 |       arch: amd64
141 |       compiler: gcc
142 |       name: "generic32 (gcc)"
143 |     - script: make generic64/K12Tests && ./bin/generic64/K12Tests -a
144 |       os: osx
145 |       arch: amd64
146 |       compiler: gcc
147 |       name: "generic64 (gcc)"
148 |     - script: make generic64noAsm/K12Tests && ./bin/generic64noAsm/K12Tests -a
149 |       os: osx
150 |       arch: amd64
151 |       compiler: gcc
152 |       name: "generic64noAsm (gcc)"
153 |     - script: make plain64/K12Tests && ./bin/plain64/K12Tests -a
154 |       os: osx
155 |       arch: amd64
156 |       compiler: gcc
157 |       name: "plain64 (gcc)"
158 |     - script: make all
159 |       os: osx
160 |       arch: amd64
161 |       compiler: gcc
162 |       name: "all (gcc)"
163 |     - script: make generic32/K12Tests && ./bin/generic32/K12Tests -a
164 |       os: osx
165 |       arch: amd64
166 |       compiler: clang
167 |       name: "generic32 (clang)"
168 |     - script: make generic64/K12Tests && ./bin/generic64/K12Tests -a
169 |       os: osx
170 |       arch: amd64
171 |       compiler: clang
172 |       name: "generic64 (clang)"
173 |     - script: make generic64noAsm/K12Tests && ./bin/generic64noAsm/K12Tests -a
174 |       os: osx
175 |       arch: amd64
176 |       compiler: clang
177 |       name: "generic64noAsm (clang)"
178 |     - script: make plain64/K12Tests && ./bin/plain64/K12Tests -a
179 |       os: osx
180 |       arch: amd64
181 |       compiler: clang
182 |       name: "plain64 (clang)"
183 |     - script: make all
184 |       os: osx
185 |       arch: amd64
186 |       compiler: clang
187 |       name: "all (clang)"
188 | 


--------------------------------------------------------------------------------
/lib/brg_endian.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  ---------------------------------------------------------------------------
  3 |  Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
  4 | 
  5 |  LICENSE TERMS
  6 | 
  7 |  The redistribution and use of this software (with or without changes)
  8 |  is allowed without the payment of fees or royalties provided that:
  9 | 
 10 |   1. source code distributions include the above copyright notice, this
 11 |      list of conditions and the following disclaimer;
 12 | 
 13 |   2. binary distributions include the above copyright notice, this list
 14 |      of conditions and the following disclaimer in their documentation;
 15 | 
 16 |   3. the name of the copyright holder is not used to endorse products
 17 |      built using this software without specific written permission.
 18 | 
 19 |  DISCLAIMER
 20 | 
 21 |  This software is provided 'as is' with no explicit or implied warranties
 22 |  in respect of its properties, including, but not limited to, correctness
 23 |  and/or fitness for purpose.
 24 |  ---------------------------------------------------------------------------
 25 |  Issue Date: 20/12/2007
 26 |  Changes for ARM 9/9/2010
 27 | */
 28 | 
 29 | #ifndef _BRG_ENDIAN_H
 30 | #define _BRG_ENDIAN_H
 31 | 
 32 | #define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
 33 | #define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
 34 | 
 35 | #if 0
 36 | /* Include files where endian defines and byteswap functions may reside */
 37 | #if defined( __sun )
 38 | #  include <sys/isa_defs.h>
 39 | #elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
 40 | #  include <sys/endian.h>
 41 | #elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
 42 |       defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
 43 | #  include <machine/endian.h>
 44 | #elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
 45 | #  if !defined( __MINGW32__ ) && !defined( _AIX )
 46 | #    include <endian.h>
 47 | #    if !defined( __BEOS__ )
 48 | #      include <byteswap.h>
 49 | #    endif
 50 | #  endif
 51 | #endif
 52 | #endif
 53 | 
 54 | /* Now attempt to set the define for platform byte order using any  */
 55 | /* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which  */
 56 | /* seem to encompass most endian symbol definitions                 */
 57 | /* Form SYMBOL: with both BIG_ENDIAN and LITTLE_ENDIAN defined, BYTE_ORDER decides; with only one defined, that one decides. */
 58 | #if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
 59 | #  if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
 60 | #    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
 61 | #  elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
 62 | #    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
 63 | #  endif
 64 | #elif defined( BIG_ENDIAN )
 65 | #  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
 66 | #elif defined( LITTLE_ENDIAN )
 67 | #  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
 68 | #endif
 69 | /* Form _SYMBOL: same test with _BIG_ENDIAN, _LITTLE_ENDIAN and _BYTE_ORDER (no check whether an earlier form already decided). */
 70 | #if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
 71 | #  if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
 72 | #    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
 73 | #  elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
 74 | #    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
 75 | #  endif
 76 | #elif defined( _BIG_ENDIAN )
 77 | #  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
 78 | #elif defined( _LITTLE_ENDIAN )
 79 | #  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
 80 | #endif
 81 | /* Form __SYMBOL: same test with __BIG_ENDIAN, __LITTLE_ENDIAN and __BYTE_ORDER. */
 82 | #if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
 83 | #  if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
 84 | #    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
 85 | #  elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
 86 | #    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
 87 | #  endif
 88 | #elif defined( __BIG_ENDIAN )
 89 | #  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
 90 | #elif defined( __LITTLE_ENDIAN )
 91 | #  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
 92 | #endif
 93 | /* Form __SYMBOL__: same test with __BIG_ENDIAN__, __LITTLE_ENDIAN__ and __BYTE_ORDER__. */
 94 | #if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
 95 | #  if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
 96 | #    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
 97 | #  elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
 98 | #    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
 99 | #  endif
100 | #elif defined( __BIG_ENDIAN__ )
101 | #  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
102 | #elif defined( __LITTLE_ENDIAN__ )
103 | #  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
104 | #endif
105 | 
106 | /*  if the platform byte order could not be determined, then try to */
107 | /*  set this define using common machine defines                    */
108 | #if !defined(PLATFORM_BYTE_ORDER)
109 | 
110 | #if   defined( __alpha__ ) || defined( __alpha ) || defined( i386 )       || \
111 |       defined( __i386__ )  || defined( _M_I86 )  || defined( _M_IX86 )    || \
112 |       defined( __OS2__ )   || defined( sun386 )  || defined( __TURBOC__ ) || \
113 |       defined( vax )       || defined( vms )     || defined( VMS )        || \
114 |       defined( __VMS )     || defined( _M_X64 )
115 | #  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
116 | 
117 | #elif defined( AMIGA )    || defined( applec )    || defined( __AS400__ )  || \
118 |       defined( _CRAY )    || defined( __hppa )    || defined( __hp9000 )   || \
119 |       defined( ibm370 )   || defined( mc68000 )   || defined( m68k )       || \
120 |       defined( __MRC__ )  || defined( __MVS__ )   || defined( __MWERKS__ ) || \
121 |       defined( sparc )    || defined( __sparc)    || defined( SYMANTEC_C ) || \
122 |       defined( __VOS__ )  || defined( __TIGCC__ ) || defined( __TANDEM )   || \
123 |       defined( THINK_C )  || defined( __VMCMS__ ) || defined( _AIX )       || \
124 |       defined( __s390__ ) || defined( __s390x__ ) || defined( __zarch__ )
125 | #  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
126 | 
127 | #elif defined(__arm__)
128 | # ifdef __BIG_ENDIAN
129 | #  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
130 | # else
131 | #  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
132 | # endif
133 | #elif 1     /* **** EDIT HERE IF NECESSARY **** */
134 | #  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
135 | #elif 0     /* **** EDIT HERE IF NECESSARY **** */
136 | #  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
137 | #else
138 | #  error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order
139 | #endif
140 | 
141 | #endif
142 | 
143 | #endif
144 | 


--------------------------------------------------------------------------------
/lib/ARMv8Asha3/KeccakP-1600-opt64.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | K12 based on the eXtended Keccak Code Package (XKCP)
  3 | https://github.com/XKCP/XKCP
  4 | 
  5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
  6 | 
  7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
  8 | 
  9 | For more information, feedback or questions, please refer to the Keccak Team website:
 10 | https://keccak.team/
 11 | 
 12 | To the extent possible under law, the implementer has waived all copyright
 13 | and related or neighboring rights to the source code in this file.
 14 | http://creativecommons.org/publicdomain/zero/1.0/
 15 | 
 16 | ---
 17 | 
 18 | Please refer to the XKCP for more details.
 19 | */
 20 | 
 21 | #include <stdint.h>
 22 | #include <stdlib.h>
 23 | #include <string.h>
 24 | #include <KeccakP-1600-SnP.h>
 25 | /* Returns a constant string naming this Keccak-p[1600] implementation. */
 26 | const char * KeccakP1600_GetImplementation(void)
 27 | {
 28 |     return "ARMv8-A+SHA3 optimized implementation";
 29 | }
 30 | 
 31 | /* ---------------------------------------------------------------- */
 32 | /* Sets the whole state (25 lanes of 8 bytes, i.e. 200 bytes) pointed to by `state` to zero. */
 33 | void KeccakP1600_opt64_Initialize(void *state)
 34 | {
 35 |     memset(state, 0, 25 * sizeof(uint64_t));
 36 | }
 37 | 
 38 | /* ---------------------------------------------------------------- */
 39 | /* XORs `length` bytes of `data` into lane `lanePosition`, shifted left by `offset` bytes within that lane. */
 40 | void KeccakP1600_opt64_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
 41 | {
 42 |     uint64_t *lanes = (uint64_t*)state;
 43 |     uint64_t chunk = 0;
 44 | 
 45 |     /* Nothing to add: leave the state untouched. */
 46 |     if (length == 0)
 47 |         return;
 48 |     /* A single byte is read directly; several bytes are copied to the start of `chunk`. */
 49 |     if (length == 1)
 50 |         chunk = data[0];
 51 |     else
 52 |         memcpy(&chunk, data, length);
 53 |     lanes[lanePosition] ^= chunk << (offset*8);
 54 | }
 55 | 
 56 | /* ---------------------------------------------------------------- */
 57 | /*
 58 |  * XORs the first `laneCount` 64-bit lanes read from `data` into the state.
 59 |  * `data` is only byte-aligned, so lanes are fetched with memcpy() instead of
 60 |  * through a uint64_t pointer cast; lanes are taken in host byte order.
 61 |  */
 62 | static void KeccakP1600_opt64_AddLanes(void *state, const unsigned char *data, unsigned int laneCount)
 63 | {
 64 |     uint64_t *lanes = (uint64_t*)state;
 65 |     uint64_t in[8];
 66 |     unsigned int i = 0;
 67 | 
 68 |     /* Main loop: eight lanes (64 bytes) per iteration. */
 69 |     for( ; (i+8)<=laneCount; i+=8) {
 70 |         memcpy(in, data + 8*i, sizeof(in));
 71 |         lanes[i+0] ^= in[0];
 72 |         lanes[i+1] ^= in[1];
 73 |         lanes[i+2] ^= in[2];
 74 |         lanes[i+3] ^= in[3];
 75 |         lanes[i+4] ^= in[4];
 76 |         lanes[i+5] ^= in[5];
 77 |         lanes[i+6] ^= in[6];
 78 |         lanes[i+7] ^= in[7];
 79 |     }
 80 |     /* Tail: fewer than eight lanes left, one lane at a time. */
 81 |     for( ; i<laneCount; i++) {
 82 |         memcpy(in, data + 8*i, sizeof(in[0]));
 83 |         lanes[i] ^= in[0];
 84 |     }
 85 | }
 86 | 
 87 | /* ---------------------------------------------------------------- */
 88 | /* XORs the single byte `byte` into the state at byte position `offset`. */
 89 | void KeccakP1600_opt64_AddByte(void *state, unsigned char byte, unsigned int offset)
 90 | {
 91 |     *((unsigned char*)state + offset) ^= byte;
 92 | }
 93 | 
 94 | /* ---------------------------------------------------------------- */
 95 | /* Generic add-bytes helper: for offset 0, whole lanes via SnP_AddLanes plus a trailing partial lane; otherwise lane by lane via SnP_AddBytesInLane. */
 96 | #define SnP_AddBytes(state, data, offset, length, SnP_AddLanes, SnP_AddBytesInLane, SnP_laneLengthInBytes) \
 97 |     { \
 98 |         if ((offset) == 0) { \
 99 |             SnP_AddLanes(state, data, (length)/SnP_laneLengthInBytes); \
100 |             SnP_AddBytesInLane(state, \
101 |                 (length)/SnP_laneLengthInBytes, \
102 |                 (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
103 |                 0, \
104 |                 (length)%SnP_laneLengthInBytes); \
105 |         } \
106 |         else { \
107 |             unsigned int _sizeLeft = (length); \
108 |             unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
109 |             unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
110 |             const unsigned char *_curData = (data); \
111 |             while(_sizeLeft > 0) { \
112 |                 unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
113 |                 if (_bytesInLane > _sizeLeft) \
114 |                     _bytesInLane = _sizeLeft; \
115 |                 SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
116 |                 _sizeLeft -= _bytesInLane; \
117 |                 _lanePosition++; \
118 |                 _offsetInLane = 0; \
119 |                 _curData += _bytesInLane; \
120 |             } \
121 |         } \
122 |     }
123 | /* XORs `length` bytes of `data` into the state starting at byte `offset`, using 8-byte lanes. */
124 | void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
125 | {
126 |     SnP_AddBytes(state, data, offset, length, KeccakP1600_opt64_AddLanes, KeccakP1600_opt64_AddBytesInLane, 8);
127 | }
128 | 
129 | /* ---------------------------------------------------------------- */
130 | /* Copies `length` bytes of lane `lanePosition`, starting at byte `offset` within that lane, to `data`. */
131 | void KeccakP1600_opt64_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length)
132 | {
133 |     const uint64_t *lanes = (const uint64_t*)state;
134 |     uint64_t laneCopy = lanes[lanePosition];
135 |     const uint8_t *laneBytes = (const uint8_t*)&laneCopy;
136 | 
137 |     /* Bytes are taken from the local copy of the lane, in host byte order. */
138 |     memcpy(data, laneBytes + offset, length);
139 | }
140 | 
141 | /* ---------------------------------------------------------------- */
142 | /* Copies the first `laneCount` lanes (8 bytes each) of the state to `data`. */
143 | void KeccakP1600_opt64_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount)
144 | {
145 |     memcpy(data, (const unsigned char*)state, 8*laneCount);
146 | }
147 | 
148 | /* ---------------------------------------------------------------- */
149 | /* Generic extract-bytes helper: for offset 0, whole lanes via SnP_ExtractLanes plus a trailing partial lane; otherwise lane by lane via SnP_ExtractBytesInLane. */
150 | #define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \
151 |     { \
152 |         if ((offset) == 0) { \
153 |             SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \
154 |             SnP_ExtractBytesInLane(state, \
155 |                 (length)/SnP_laneLengthInBytes, \
156 |                 (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
157 |                 0, \
158 |                 (length)%SnP_laneLengthInBytes); \
159 |         } \
160 |         else { \
161 |             unsigned int _sizeLeft = (length); \
162 |             unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
163 |             unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
164 |             unsigned char *_curData = (data); \
165 |             while(_sizeLeft > 0) { \
166 |                 unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
167 |                 if (_bytesInLane > _sizeLeft) \
168 |                     _bytesInLane = _sizeLeft; \
169 |                 SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
170 |                 _sizeLeft -= _bytesInLane; \
171 |                 _lanePosition++; \
172 |                 _offsetInLane = 0; \
173 |                 _curData += _bytesInLane; \
174 |             } \
175 |         } \
176 |     }
177 | /* Copies `length` bytes of the state, starting at byte `offset`, to `data`, using 8-byte lanes. */
178 | void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length)
179 | {
180 |     SnP_ExtractBytes(state, data, offset, length, KeccakP1600_opt64_ExtractLanes, KeccakP1600_opt64_ExtractBytesInLane, 8);
181 | }
182 | 
183 | /* ---------------------------------------------------------------- */
184 | 
185 | /* Keccak-p[1600]×2 */
186 | /* Always returns 1: the ×2 variant is reported as available by this file. */
187 | int KeccakP1600times2_IsAvailable(void)
188 | {
189 |     return 1;
190 | }
191 | /* Returns a constant string naming the ×2 implementation. */
192 | const char * KeccakP1600times2_GetImplementation(void)
193 | {
194 |     return "ARMv8-A+SHA3 optimized implementation";
195 | }
196 | 
197 | /* Keccak-p[1600]×4 */
198 | 
199 | int KeccakP1600times4_IsAvailable()
200 | {
201 |     return 0;
202 | }
203 | 
204 | const char * KeccakP1600times4_GetImplementation()
205 | {
206 |     return "";
207 | }
208 | 
209 | void KT128_Process4Leaves(const unsigned char *input, unsigned char *output)
210 | {
211 | }
212 | 
/*
 * Placeholder for the 4-leaf KT256 routine: this build has no 4-way code
 * path, so the function does nothing and leaves `output` untouched.
 */
void KT256_Process4Leaves(const unsigned char *input, unsigned char *output)
{
    /* Intentionally empty; consume the arguments to keep compilers quiet. */
    (void)input;
    (void)output;
}
216 | 
217 | /* Keccak-p[1600]×8 */
218 | 
/*
 * Tell the caller whether an 8-way parallel Keccak-p[1600] code path exists.
 * This build has none, so the answer is always 0.
 *
 * Declared with an explicit (void) parameter list so the definition is a
 * real prototype.
 *
 * Returns 0.
 */
int KeccakP1600times8_IsAvailable(void)
{
    return 0;
}
223 | 
/*
 * Return the name of the 8-way parallel implementation. None is provided in
 * this build, so the result is the empty string (never NULL).
 *
 * Declared with an explicit (void) parameter list so the definition is a
 * real prototype.
 */
const char * KeccakP1600times8_GetImplementation(void)
{
    return "";
}
228 | 
/*
 * Placeholder for the 8-leaf KT128 routine: this build has no 8-way code
 * path, so the function does nothing and leaves `output` untouched.
 */
void KT128_Process8Leaves(const unsigned char *input, unsigned char *output)
{
    /* Intentionally empty; consume the arguments to keep compilers quiet. */
    (void)input;
    (void)output;
}
232 | 
/*
 * Placeholder for the 8-leaf KT256 routine: this build has no 8-way code
 * path, so the function does nothing and leaves `output` untouched.
 */
void KT256_Process8Leaves(const unsigned char *input, unsigned char *output)
{
    /* Intentionally empty; consume the arguments to keep compilers quiet. */
    (void)input;
    (void)output;
}
236 | 


--------------------------------------------------------------------------------
/tests/testPerformance.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | K12 based on the eXtended Keccak Code Package (XKCP)
  3 | https://github.com/XKCP/XKCP
  4 | 
  5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier.
  6 | 
  7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
  8 | 
  9 | For more information, feedback or questions, please refer to the Keccak Team website:
 10 | https://keccak.team/
 11 | 
 12 | To the extent possible under law, the implementer has waived all copyright
 13 | and related or neighboring rights to the source code in this file.
 14 | http://creativecommons.org/publicdomain/zero/1.0/
 15 | */
 16 | 
 17 | #include <assert.h>
 18 | #include <inttypes.h>
 19 | #include <math.h>
 20 | #include <stdint.h>
 21 | #include <stdio.h>
 22 | #include <string.h>
 23 | #include "align.h"
 24 | #include "KangarooTwelve.h"
 25 | #include "KeccakP-1600-SnP.h"
 26 | #include "timing.h"
 27 | #include "testPerformance.h"
 28 | 
 29 | #if !defined(__x86_64__) && !defined(_M_X64) && !defined(__i386__) && !defined(_M_IX86)
 30 | #undef KeccakP1600_enable_simd_options
 31 | #endif
 32 | 
 33 | #define BIG_BUFFER_SIZE (2*1024*1024)
 34 | ALIGN(64) uint8_t bigBuffer[BIG_BUFFER_SIZE];
 35 | 
/*
 * Time `impl` hashing the first `inputLen` bytes of the global bigBuffer
 * into a 32-byte output, with an empty customization string.
 *
 * impl      function under test; takes (input, inputLen, output, outputLen,
 *           customization, customLen)
 * dtMin     not referenced by name in this body
 *           NOTE(review): presumably consumed by the measureTiming* macros
 *           from timing.h -- confirm
 * inputLen  number of input bytes; asserted to be at most BIG_BUFFER_SIZE
 *
 * NOTE(review): no explicit return statement is visible; the cycles_t result
 * is presumably returned from inside the measureTimingEnd macro -- confirm
 * against timing.h.
 */
cycles_t measurePerformance(int (*impl)(const unsigned char*, size_t,
                                       unsigned char*, size_t,
                                       const unsigned char*, size_t),
                           cycles_t dtMin, unsigned int inputLen)
{
    ALIGN(64) unsigned char output[32];
    measureTimingDeclare

    assert(inputLen <= BIG_BUFFER_SIZE);

    /* Only the first 16 bytes are overwritten; the remainder of bigBuffer
       keeps whatever it already held. */
    memset(bigBuffer, 0xA5, 16);

    /* The call between the Begin/End macros is the code being timed. */
    measureTimingBeginDeclared
    impl(bigBuffer, inputLen, output, 32, (const unsigned char *)"", 0);
    measureTimingEnd
}
 52 | 
 53 | #if defined(KeccakP1600_enable_simd_options) && !defined(KeccakP1600_disableParallelism)
 54 | void KangarooTwelve_SetProcessorCapabilities();
 55 | #endif
 56 | 
/*
 * Print the banner that precedes a benchmark run: the security level and,
 * for the x1, x2, x4 and x8 Keccak-p[1600] variants, either the name of the
 * implementation in use or "not used".
 *
 * securityLevel  value printed in the "*** KT%d ***" banner line
 *
 * The "\303\227" escapes are the two UTF-8 bytes of the multiplication sign
 * (U+00D7).
 */
void printKangarooTwelvePerformanceHeader(int securityLevel)
{
#if defined(KeccakP1600_enable_simd_options) && !defined(KeccakP1600_disableParallelism)
    /* NOTE(review): presumably refreshes the CPU-feature selection before the
       availability queries below -- confirm in the library sources. */
    KangarooTwelve_SetProcessorCapabilities();
#endif
    printf("*** KT%d ***\n", securityLevel);
    printf("Using Keccak-p[1600,12] implementations:\n");
    printf("- \303\2271: %s\n", KeccakP1600_GetImplementation());
    #if defined(KeccakP1600_12rounds_FastLoop_supported)
    printf("      + KeccakP1600_12rounds_FastLoop_Absorb()\n");
    #endif

#ifndef KeccakP1600_disableParallelism
    /* For each parallel width, availability is queried at run time; the
       FastLoop line is only compiled in when the matching macro is defined. */
    if (KeccakP1600times2_IsAvailable()) {
        printf("- \303\2272: %s\n", KeccakP1600times2_GetImplementation());
    #if defined(KeccakP1600times2_12rounds_FastLoop_supported)
        printf("      + KeccakP1600times2_12rounds_FastLoop_Absorb()\n");
    #endif
    }
    else
        printf("- \303\2272: not used\n");

    if (KeccakP1600times4_IsAvailable()) {
        printf("- \303\2274: %s\n", KeccakP1600times4_GetImplementation());
    #if defined(KeccakP1600times4_12rounds_FastLoop_supported)
        printf("      + KeccakP1600times4_12rounds_FastLoop_Absorb()\n");
    #endif
    }
    else
        printf("- \303\2274: not used\n");

    if (KeccakP1600times8_IsAvailable()) {
        printf("- \303\2278: %s\n", KeccakP1600times8_GetImplementation());
    #if defined(KeccakP1600times8_12rounds_FastLoop_supported)
        printf("      + KeccakP1600times8_12rounds_FastLoop_Absorb()\n");
    #endif
    }
    else
        printf("- \303\2278: not used\n");
#endif

    printf("\n");
}
100 | 
101 | void testPerformanceFull(int (*impl)(const unsigned char*, size_t,
102 |                                      unsigned char*, size_t,
103 |                                      const unsigned char*, size_t), int extra)
104 | {
105 |     const unsigned int chunkSize = 8192;
106 |     unsigned halfTones;
107 |     cycles_t calibration = CalibrateTimer();
108 |     unsigned int chunkSizeLog = (unsigned int)floor(log(chunkSize)/log(2.0)+0.5);
109 |     int displaySlope = 0;
110 | 
111 |     measurePerformance(impl, calibration, 500000);
112 |     for(halfTones=chunkSizeLog*12-28; halfTones<=13*12; halfTones+=4) {
113 |         double I = pow(2.0, halfTones/12.0);
114 |         unsigned int i  = (unsigned int)floor(I+0.5);
115 |         cycles_t time, timePlus1Block, timePlus2Blocks, timePlus4Blocks, timePlus8Blocks;
116 |         cycles_t timePlus168Blocks;
117 |         time = measurePerformance(impl, calibration, i);
118 |         if (i == chunkSize) {
119 |             displaySlope = 1;
120 |             timePlus1Block  = measurePerformance(impl, calibration, i+1*chunkSize);
121 |             timePlus2Blocks = measurePerformance(impl, calibration, i+2*chunkSize);
122 |             timePlus4Blocks = measurePerformance(impl, calibration, i+4*chunkSize);
123 |             timePlus8Blocks = measurePerformance(impl, calibration, i+8*chunkSize);
124 |             timePlus168Blocks = measurePerformance(impl, calibration, i+extra*chunkSize);
125 |         }
126 |         printf("%8u bytes: %9"PRId64" %s, %6.3f %s/byte\n", i, time, getTimerUnit(), time*1.0/i, getTimerUnit());
127 |         if (displaySlope) {
128 |             printf("     +1 block:  %9"PRId64" %s, %6.3f %s/byte (slope)\n", timePlus1Block, getTimerUnit(), (timePlus1Block-(double)(time))*1.0/chunkSize/1.0, getTimerUnit());
129 |             printf("     +2 blocks: %9"PRId64" %s, %6.3f %s/byte (slope)\n", timePlus2Blocks, getTimerUnit(), (timePlus2Blocks-(double)(time))*1.0/chunkSize/2.0, getTimerUnit());
130 |             printf("     +4 blocks: %9"PRId64" %s, %6.3f %s/byte (slope)\n", timePlus4Blocks, getTimerUnit(), (timePlus4Blocks-(double)(time))*1.0/chunkSize/4.0, getTimerUnit());
131 |             printf("     +8 blocks: %9"PRId64" %s, %6.3f %s/byte (slope)\n", timePlus8Blocks, getTimerUnit(), (timePlus8Blocks-(double)(time))*1.0/chunkSize/8.0, getTimerUnit());
132 |             printf("   +%d blocks: %9"PRId64" %s, %6.3f %s/byte (slope)\n", extra, timePlus168Blocks, getTimerUnit(), (timePlus168Blocks-(double)(time))*1.0/chunkSize/(extra*1.0), getTimerUnit());
133 |             displaySlope = 0;
134 |         }
135 |     }
136 |     for(halfTones=12*12; halfTones<=20*12; halfTones+=4) {
137 |         double I = chunkSize + pow(2.0, halfTones/12.0);
138 |         unsigned int i  = (unsigned int)floor(I+0.5);
139 |         cycles_t time;
140 |         time = measurePerformance(impl, calibration, i);
141 |         printf("%8u bytes: %9"PRId64" %s, %6.3f %s/byte\n", i, time, getTimerUnit(), time*1.0/i, getTimerUnit());
142 |     }
143 |     printf("\n\n");
144 | }
145 | 
/*
 * Run the full benchmark for both security levels: print the header and
 * benchmark KT128, then do the same for KT256.
 *
 * The second argument of testPerformanceFull (168 for KT128, 136 for KT256)
 * is the number of extra chunks used for the last slope measurement.
 */
void testKangarooTwelvePerformance()
{
    printKangarooTwelvePerformanceHeader(128);
    testPerformanceFull(KT128, 168);
    printKangarooTwelvePerformanceHeader(256);
    testPerformanceFull(KT256, 136);
}
/*
 * Entry point of the performance tests.
 *
 * Always benchmarks once with every SIMD feature switched off (when SIMD
 * options are compiled in) or with the default configuration (when they are
 * not). With SIMD options compiled in, it then repeats the benchmark with
 * progressively more features enabled: SSSE3 only, SSSE3 and AVX2, and
 * finally everything, each step only if the CPU reported that feature.
 */
void testPerformance()
{
#if defined(KeccakP1600_enable_simd_options) && !defined(KeccakP1600_disableParallelism)
    // Read feature availability: each Disable call returns 1 only if the
    // feature was enabled and available, so after re-enabling everything the
    // return values double as capability flags. As a side effect all three
    // features are now off.
    KangarooTwelve_EnableAllCpuFeatures();
    int cpu_has_AVX512 = KangarooTwelve_DisableAVX512();
    int cpu_has_AVX2 = KangarooTwelve_DisableAVX2();
    int cpu_has_SSSE3 = KangarooTwelve_DisableSSSE3();
#endif

    // Test without vectorization
    testKangarooTwelvePerformance();

#if defined(KeccakP1600_enable_simd_options) && !defined(KeccakP1600_disableParallelism)
    // Test with SSSE3 only if it's available
    if (cpu_has_SSSE3) {
        printf("\n");
        KangarooTwelve_EnableAllCpuFeatures();
        KangarooTwelve_DisableAVX512();
        KangarooTwelve_DisableAVX2();
        testKangarooTwelvePerformance();
    }
    // Test with SSSE3 and AVX2 if they're available
    if (cpu_has_AVX2) {
        printf("\n");
        KangarooTwelve_EnableAllCpuFeatures();
        KangarooTwelve_DisableAVX512();
        testKangarooTwelvePerformance();
    }
    // Finally, test with everything enabled if we have AVX512
    if (cpu_has_AVX512) {
        printf("\n");
        KangarooTwelve_EnableAllCpuFeatures();
        testKangarooTwelvePerformance();
    }
#endif

    // Set `comparison` to your own function here to directly
    // compare performance against K12. It must have the same signature
    // as KT128(...) / KT256(...): (input, inputByteLen, output,
    // outputByteLen, customization, customByteLen). Note that this is not
    // the signature of KangarooTwelve(...), which takes the security level
    // as an extra first argument.
    int (*comparison)(const unsigned char*, size_t,
                      unsigned char*, size_t,
                      const unsigned char*, size_t) = NULL;

    // With the default NULL value this branch is never taken.
    if (comparison != NULL) {
      printf("\n*** Non-K12 function for comparison: ***\n");
      testPerformanceFull(comparison, 128);
    }
}
203 | 


--------------------------------------------------------------------------------
/lib/KT-threadpool-pthread.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | K12 based on the eXtended Keccak Code Package (XKCP)
  3 | https://github.com/XKCP/XKCP
  4 | 
  5 | Thread pool implementation using POSIX threads (pthreads).
  6 | 
  7 | To the extent possible under law, the implementer has waived all copyright
  8 | and related or neighboring rights to the source code in this file.
  9 | http://creativecommons.org/publicdomain/zero/1.0/
 10 | */
 11 | 
 12 | #include "KT-threadpool.h"
 13 | #include <stdlib.h>
 14 | #include <string.h>
 15 | 
 16 | /* Only compile pthread backend if pthreads are available */
 17 | #if defined(_POSIX_THREADS) || defined(__unix__) || defined(__unix) || \
 18 |     (defined(__APPLE__) && defined(__MACH__)) || defined(__linux__) || \
 19 |     defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
 20 | 
 21 | #include <pthread.h>
 22 | 
 23 | #define MAX_THREADS 64
 24 | #define MAX_JOBS 256
 25 | 
/* One unit of work in the queue: a callback and the argument passed to it. */
typedef struct {
    void (*work_fn)(void*);   /* function a worker calls to run the job */
    void* work_data;          /* passed unchanged as the argument of work_fn */
} Job;
 31 | 
/*
 * Thread pool context.
 *
 * Jobs are stored in a fixed array and handed out in index order. The three
 * counters and the shutdown flag are shared between the submitting thread
 * and the workers; apart from one error path during pool creation, they are
 * accessed with `mutex` held.
 */
typedef struct {
    pthread_t* threads;  /* heap array of num_threads worker handles */
    int num_threads;     /* number of workers requested at creation */

    /* Job queue */
    Job job_queue[MAX_JOBS];
    int job_count;       /* Number of jobs submitted in the current batch */
    int jobs_grabbed;    /* Number of jobs grabbed by workers */
    int jobs_finished;   /* Number of jobs actually completed */

    /* Synchronization */
    pthread_mutex_t mutex;
    pthread_cond_t work_available;  /* workers wait on this for jobs or shutdown */
    pthread_cond_t work_complete;   /* signalled when jobs_finished reaches job_count */

    /* Thread IDs for passing to worker threads */
    /* (filled with 0..num_threads-1 at creation; the workers themselves are
       handed the pool pointer, not an entry of this array) */
    int* thread_ids;

    /* Lifecycle */
    int shutdown;        /* set to 1 to make the workers leave their loop */
} PthreadPool;
 54 | 
/*
 * Worker thread function.
 *
 * Each worker loops forever: it sleeps on `work_available` until there is an
 * unclaimed job (jobs_grabbed < job_count) or the pool is shutting down,
 * claims the next job by index while holding the mutex, runs it with the
 * mutex released, then records completion and signals `work_complete` once
 * every job of the batch has finished.
 *
 * arg      pointer to the PthreadPool this worker serves
 * returns  NULL, after `shutdown` has been observed
 */
static void* worker_thread(void* arg)
{
    PthreadPool* pool = (PthreadPool*)arg;

    while (1) {
        pthread_mutex_lock(&pool->mutex);

        /* Wait for work or shutdown */
        /* (the predicate is re-checked after every wake-up, so a wake-up
           without work simply goes back to sleep) */
        while (!pool->shutdown && pool->jobs_grabbed >= pool->job_count) {
            pthread_cond_wait(&pool->work_available, &pool->mutex);
        }

        /* Shutdown takes priority: unclaimed jobs are not run. */
        if (pool->shutdown) {
            pthread_mutex_unlock(&pool->mutex);
            break;
        }

        /* Get next available job atomically */
        /* (the job is copied by value so it can be used after unlocking) */
        Job job;
        int has_job = 0;
        if (pool->jobs_grabbed < pool->job_count) {
            job = pool->job_queue[pool->jobs_grabbed];
            pool->jobs_grabbed++;
            has_job = 1;
        }

        pthread_mutex_unlock(&pool->mutex);

        /* Execute job outside the lock */
        if (has_job && job.work_fn) {
            job.work_fn(job.work_data);

            /* Mark job as finished */
            pthread_mutex_lock(&pool->mutex);
            pool->jobs_finished++;
            if (pool->jobs_finished >= pool->job_count) {
                pthread_cond_signal(&pool->work_complete);
            }
            pthread_mutex_unlock(&pool->mutex);
        }
    }

    return NULL;
}
100 | 
101 | /* Create pthread pool */
102 | static void* pthread_create_pool(int num_threads)
103 | {
104 |     if (num_threads < 1 || num_threads > MAX_THREADS)
105 |         return NULL;
106 | 
107 |     PthreadPool* pool = (PthreadPool*)malloc(sizeof(PthreadPool));
108 |     if (!pool)
109 |         return NULL;
110 | 
111 |     memset(pool, 0, sizeof(PthreadPool));
112 |     pool->num_threads = num_threads;
113 | 
114 |     /* Initialize synchronization primitives */
115 |     if (pthread_mutex_init(&pool->mutex, NULL) != 0) {
116 |         free(pool);
117 |         return NULL;
118 |     }
119 | 
120 |     if (pthread_cond_init(&pool->work_available, NULL) != 0) {
121 |         pthread_mutex_destroy(&pool->mutex);
122 |         free(pool);
123 |         return NULL;
124 |     }
125 | 
126 |     if (pthread_cond_init(&pool->work_complete, NULL) != 0) {
127 |         pthread_mutex_destroy(&pool->mutex);
128 |         pthread_cond_destroy(&pool->work_available);
129 |         free(pool);
130 |         return NULL;
131 |     }
132 | 
133 |     /* Allocate thread array */
134 |     pool->threads = (pthread_t*)malloc(num_threads * sizeof(pthread_t));
135 |     if (!pool->threads) {
136 |         pthread_mutex_destroy(&pool->mutex);
137 |         pthread_cond_destroy(&pool->work_available);
138 |         pthread_cond_destroy(&pool->work_complete);
139 |         free(pool);
140 |         return NULL;
141 |     }
142 | 
143 |     pool->thread_ids = (int*)malloc(num_threads * sizeof(int));
144 |     if (!pool->thread_ids) {
145 |         free(pool->threads);
146 |         pthread_mutex_destroy(&pool->mutex);
147 |         pthread_cond_destroy(&pool->work_available);
148 |         pthread_cond_destroy(&pool->work_complete);
149 |         free(pool);
150 |         return NULL;
151 |     }
152 | 
153 |     /* Create worker threads */
154 |     pool->shutdown = 0;
155 |     pool->job_count = 0;
156 |     pool->jobs_grabbed = 0;
157 |     pool->jobs_finished = 0;
158 | 
159 |     for (int i = 0; i < num_threads; i++) {
160 |         pool->thread_ids[i] = i;
161 |         if (pthread_create(&pool->threads[i], NULL, worker_thread, pool) != 0) {
162 |             /* Failed to create thread - clean up */
163 |             pool->shutdown = 1;
164 |             pthread_cond_broadcast(&pool->work_available);
165 |             for (int j = 0; j < i; j++) {
166 |                 pthread_join(pool->threads[j], NULL);
167 |             }
168 |             free(pool->thread_ids);
169 |             free(pool->threads);
170 |             pthread_mutex_destroy(&pool->mutex);
171 |             pthread_cond_destroy(&pool->work_available);
172 |             pthread_cond_destroy(&pool->work_complete);
173 |             free(pool);
174 |             return NULL;
175 |         }
176 |     }
177 | 
178 |     return pool;
179 | }
180 | 
181 | /* Submit work to pthread pool */
182 | static int pthread_submit(void* pool_handle, void (*work_fn)(void*), void* work_data)
183 | {
184 |     PthreadPool* pool = (PthreadPool*)pool_handle;
185 |     if (!pool || !work_fn)
186 |         return 1;
187 | 
188 |     pthread_mutex_lock(&pool->mutex);
189 | 
190 |     if (pool->job_count >= MAX_JOBS) {
191 |         pthread_mutex_unlock(&pool->mutex);
192 |         return 1;  /* Job queue full */
193 |     }
194 | 
195 |     pool->job_queue[pool->job_count].work_fn = work_fn;
196 |     pool->job_queue[pool->job_count].work_data = work_data;
197 |     pool->job_count++;
198 | 
199 |     pthread_mutex_unlock(&pool->mutex);
200 | 
201 |     return 0;
202 | }
203 | 
204 | /* Wait for all work to complete */
205 | static void pthread_wait_all(void* pool_handle)
206 | {
207 |     PthreadPool* pool = (PthreadPool*)pool_handle;
208 |     if (!pool)
209 |         return;
210 | 
211 |     pthread_mutex_lock(&pool->mutex);
212 | 
213 |     /* Reset counters and wake up workers */
214 |     pool->jobs_grabbed = 0;
215 |     pool->jobs_finished = 0;
216 |     pthread_cond_broadcast(&pool->work_available);
217 | 
218 |     /* Wait for all jobs to finish execution */
219 |     while (pool->jobs_finished < pool->job_count) {
220 |         pthread_cond_wait(&pool->work_complete, &pool->mutex);
221 |     }
222 | 
223 |     /* Reset for next batch */
224 |     pool->job_count = 0;
225 |     pool->jobs_grabbed = 0;
226 |     pool->jobs_finished = 0;
227 | 
228 |     pthread_mutex_unlock(&pool->mutex);
229 | }
230 | 
231 | /* Destroy pthread pool */
232 | static void pthread_destroy(void* pool_handle)
233 | {
234 |     PthreadPool* pool = (PthreadPool*)pool_handle;
235 |     if (!pool)
236 |         return;
237 | 
238 |     pthread_mutex_lock(&pool->mutex);
239 |     pool->shutdown = 1;
240 |     pthread_cond_broadcast(&pool->work_available);
241 |     pthread_mutex_unlock(&pool->mutex);
242 | 
243 |     /* Wait for all threads to finish */
244 |     for (int i = 0; i < pool->num_threads; i++) {
245 |         pthread_join(pool->threads[i], NULL);
246 |     }
247 | 
248 |     /* Cleanup */
249 |     free(pool->thread_ids);
250 |     free(pool->threads);
251 |     pthread_mutex_destroy(&pool->mutex);
252 |     pthread_cond_destroy(&pool->work_available);
253 |     pthread_cond_destroy(&pool->work_complete);
254 |     free(pool);
255 | }
256 | 
/* Export pthread backend API: the function table through which callers
 * reach the pthread-based pool defined in this file. */
const KT_ThreadPool_API KT_ThreadPool_Pthread = {
    .min_input_size_for_threading = 2097152,  /* 2 MB default threshold */
    .create = pthread_create_pool,
    .submit = pthread_submit,
    .wait_all = pthread_wait_all,
    .destroy = pthread_destroy
};
265 | 
266 | #else /* !HAVE_PTHREADS */
267 | 
/* Pthread not available on this platform - provide stub */
/* Stub pool constructor: ignores num_threads and always returns NULL, i.e.
 * no pool is ever created. */
static void* pthread_stub_create(int num_threads) {
    (void)num_threads;
    return NULL;
}
273 | 
/* Stub submit: ignores all arguments and always returns 1, the same value
 * the real backend uses to report that a job was not queued. */
static int pthread_stub_submit(void* pool, void (*work_fn)(void*), void* work_data) {
    (void)pool; (void)work_fn; (void)work_data;
    return 1;
}
278 | 
/* Stub wait: nothing can have been queued, so it returns immediately. */
static void pthread_stub_wait_all(void* pool) {
    (void)pool;
}
282 | 
/* Stub destroy: there is nothing to release, so it does nothing. */
static void pthread_stub_destroy(void* pool) {
    (void)pool;
}
286 | 
/* Function table exported when pthreads are unavailable. It keeps the symbol
 * defined, but create() always yields NULL and submit() always reports
 * failure. */
const KT_ThreadPool_API KT_ThreadPool_Pthread = {
    .min_input_size_for_threading = 2097152,  /* 2 MB default threshold */
    .create = pthread_stub_create,
    .submit = pthread_stub_submit,
    .wait_all = pthread_stub_wait_all,
    .destroy = pthread_stub_destroy
};
294 | 
295 | #endif /* HAVE_PTHREADS */
296 | 


--------------------------------------------------------------------------------
/lib/KangarooTwelve.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | K12 based on the eXtended Keccak Code Package (XKCP)
  3 | https://github.com/XKCP/XKCP
  4 | 
  5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier.
  6 | 
  7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
  8 | 
  9 | For more information, feedback or questions, please refer to the Keccak Team website:
 10 | https://keccak.team/
 11 | 
 12 | To the extent possible under law, the implementer has waived all copyright
 13 | and related or neighboring rights to the source code in this file.
 14 | http://creativecommons.org/publicdomain/zero/1.0/
 15 | */
 16 | 
 17 | #ifndef _KangarooTwelve_h_
 18 | #define _KangarooTwelve_h_
 19 | 
 20 | #include <stddef.h>
 21 | #include <stdint.h>
 22 | #include "align.h"
 23 | #include "KeccakP-1600-SnP.h"
 24 | 
/* TurboSHAKE context: a Keccak-p[1600] state buffer plus bookkeeping. */
typedef struct TurboSHAKE_InstanceStruct {
    uint8_t state[KeccakP1600_stateSizeInBytes];  /* permutation state, held as raw bytes */
    unsigned int rate;    /* NOTE(review): presumably the sponge rate; unit not visible here -- confirm */
    uint8_t byteIOIndex;  /* NOTE(review): presumably the current byte position within the rate -- confirm */
    uint8_t squeezing;    /* NOTE(review): presumably non-zero once output extraction has started -- confirm */
} TurboSHAKE_Instance;
 31 | 
/*
 * State of one incremental KangarooTwelve computation, as used by the
 * KangarooTwelve_Initialize / _Update / _Final / _Squeeze functions.
 * It embeds two TurboSHAKE contexts, each aligned to
 * KeccakP1600_stateAlignment.
 */
typedef struct KangarooTwelve_InstanceStruct {
    ALIGN(KeccakP1600_stateAlignment) TurboSHAKE_Instance queueNode;  /* NOTE(review): presumably the context of the chunk currently being absorbed -- confirm */
    ALIGN(KeccakP1600_stateAlignment) TurboSHAKE_Instance finalNode;  /* NOTE(review): presumably the context that yields the final output -- confirm */
    size_t fixedOutputLength;       /* NOTE(review): presumably the outputByteLen given at initialization (0 = arbitrarily long) -- confirm */
    size_t blockNumber;             /* NOTE(review): presumably counts the chunks processed so far -- confirm */
    unsigned int queueAbsorbedLen;  /* NOTE(review): presumably the bytes already absorbed into queueNode -- confirm */
    int phase;                      /* NOTE(review): presumably tracks the absorb/final/squeeze stage -- confirm */
    int securityLevel;              /* 128 for KT128 or 256 for KT256, per the initializer documentation */
    /* Thread pool for parallel chunk processing (optional, can be NULL) */
    const void* threadpool_api;  /* KT_ThreadPool_API* */
    void* threadpool_handle;     /* handle obtained from threadpool_api->create() */
    int thread_count;            /* number of threads in the pool */
} KangarooTwelve_Instance;
 45 | 
 46 | /** Extendable output function KangarooTwelve.
 47 |   * @param  securityLevel   128 for KT128 or 256 for KT256
 48 |   * @param  input           Pointer to the input message (M).
 49 |   * @param  inputByteLen    The length of the input message in bytes.
 50 |   * @param  output          Pointer to the output buffer.
 51 |   * @param  outputByteLen   The desired number of output bytes.
 52 |   * @param  customization   Pointer to the customization string (C).
 53 |   * @param  customByteLen   The length of the customization string in bytes.
 54 |   * @return 0 if successful, 1 otherwise.
 55 |   */
 56 | int KangarooTwelve(int securityLevel, const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen);
 57 | 
 58 | /**
 59 |  * Wrapper around `KangarooTwelve` to use the 128-bit security level.
 60 | */
 61 | int KT128(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen);
 62 | 
 63 | /**
 64 |  * Wrapper around `KangarooTwelve` to use the 256-bit security level.
 65 | */
 66 | int KT256(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen);
 67 | 
 68 | /**
 69 |   * Function to initialize a KangarooTwelve instance.
 70 |   * @param  ktInstance      Pointer to the instance to be initialized.
 71 |   * @param  securityLevel   128 for KT128 or 256 for KT256
 72 |   * @param  outputByteLen   The desired number of output bytes,
 73 |   *                         or 0 for an arbitrarily-long output.
 74 |   * @return 0 if successful, 1 otherwise.
 75 |   */
 76 | int KangarooTwelve_Initialize(KangarooTwelve_Instance *ktInstance, int securityLevel, size_t outputByteLen);
 77 | 
 78 | /**
 79 |   * Function to initialize a KangarooTwelve instance with threading support.
 80 |   * @param  ktInstance      Pointer to the instance to be initialized.
 81 |   * @param  securityLevel   128 for KT128 or 256 for KT256
 82 |   * @param  outputByteLen   The desired number of output bytes,
 83 |   *                         or 0 for an arbitrarily-long output.
 84 |   * @param  threadpool_api  Thread pool API (NULL for no threading).
 85 |   *                         Must point to a KT_ThreadPool_API struct.
 86 |   * @param  threadpool_handle  Thread pool handle (from threadpool_api->create()).
 87 |   *                            Ignored if threadpool_api is NULL.
 88 |   * @param  thread_count    Number of threads in the pool.
 89 |   *                         Ignored if threadpool_api is NULL.
 90 |   * @return 0 if successful, 1 otherwise.
 91 |   */
 92 | int KangarooTwelve_Initialize_Threaded(KangarooTwelve_Instance *ktInstance, int securityLevel, size_t outputByteLen,
 93 |                                        const void *threadpool_api, void *threadpool_handle, int thread_count);
 94 | 
 95 | #define KT128_Initialize(instance, outputByteLen) \
 96 |     KangarooTwelve_Initialize((instance), 128, (outputByteLen))
 97 | 
 98 | #define KT256_Initialize(instance, outputByteLen) \
 99 |     KangarooTwelve_Initialize((instance), 256, (outputByteLen))
100 | 
101 | #define KT128_Initialize_Threaded(instance, outputByteLen, threadpool_api, threadpool_handle, thread_count) \
102 |     KangarooTwelve_Initialize_Threaded((instance), 128, (outputByteLen), (threadpool_api), (threadpool_handle), (thread_count))
103 | 
104 | #define KT256_Initialize_Threaded(instance, outputByteLen, threadpool_api, threadpool_handle, thread_count) \
105 |     KangarooTwelve_Initialize_Threaded((instance), 256, (outputByteLen), (threadpool_api), (threadpool_handle), (thread_count))
106 | 
107 | /**
108 |   * Function to give input data to be absorbed.
109 |   * @param  ktInstance      Pointer to the instance initialized by KangarooTwelve_Initialize().
110 |   * @param  input           Pointer to the input message data (M).
111 |   * @param  inputByteLen    The number of bytes provided in the input message data.
112 |   * @return 0 if successful, 1 otherwise.
113 |   */
114 | int KangarooTwelve_Update(KangarooTwelve_Instance *ktInstance, const unsigned char *input, size_t inputByteLen);
115 | 
116 | /**
117 |   * Function to call after all the input message has been input, and to get
118 |   * output bytes if the length was specified when calling KangarooTwelve_Initialize().
119 |   * @param  ktInstance      Pointer to the hash instance initialized by KangarooTwelve_Initialize().
120 |   * If @a outputByteLen was not 0 in the call to KangarooTwelve_Initialize(), the number of
121 |   *     output bytes is equal to @a outputByteLen.
122 |   * If @a outputByteLen was 0 in the call to KangarooTwelve_Initialize(), the output bytes
123 |   *     must be extracted using the KangarooTwelve_Squeeze() function.
124 |   * @param  output          Pointer to the buffer where to store the output data.
125 |   * @param  customization   Pointer to the customization string (C).
126 |   * @param  customByteLen   The length of the customization string in bytes.
127 |   * @return 0 if successful, 1 otherwise.
128 |   */
129 | int KangarooTwelve_Final(KangarooTwelve_Instance *ktInstance, unsigned char *output, const unsigned char *customization, size_t customByteLen);
130 | 
131 | /**
132 |   * Function to squeeze output data.
133 |   * @param  ktInstance     Pointer to the hash instance initialized by KangarooTwelve_Initialize().
134 |   * @param  output         Pointer to the buffer where to store the output data.
135 |   * @param  outputByteLen  The number of output bytes desired.
136 |   * @pre    KangarooTwelve_Final() must have been already called.
137 |   * @return 0 if successful, 1 otherwise.
138 |   */
139 | int KangarooTwelve_Squeeze(KangarooTwelve_Instance *ktInstance, unsigned char *output, size_t outputByteLen);
140 | 
141 | #if !defined(KeccakP1600_disableParallelism) && defined(KeccakP1600_enable_simd_options)
142 | /**
143 |   * Functions to selectively disable the use of CPU features. Should be rarely
144 |   * needed; if you're not sure this is what you want, don't worry about it.
145 |   *
146 |   * /!\ WARNING /!\: Calling these functions REQUIRES that there are no
147 |   * KangarooTwelve instances in use. The effects are global and affect the code
148 |   * paths taken by every call, as well as the details of the represented states.
149 |   * Calling these functions in the middle of your program (as opposed to during
150 |   * setup) is PROBABLY WRONG.
151 |   *
152 |   * These functions are at present only used to increase test suite coverage,
153 |   * and demonstrate comparative performance between implementations in different
154 |   * instruction sets. To enable them, the macro KeccakP1600_enable_simd_options
155 |   * must be defined at compile time.
156 |   *
157 |   * They can potentially also be useful in an environment where it is
158 |   * detrimental to online large vector units on the CPU, since doing so can lead
159 |   * to downclocking, performance hits in other threads sharing the same CPU
160 |   * core, and short delays while the CPU's power license is increased to online
161 |   * the vector unit.
162 |   *
163 |   * In the majority of situations, however, this should rarely matter and it is
164 |   * usually the case that the performance difference will be a wash or even an
165 |   * overall improvement despite the downsides.
166 |   *
167 |   * @return 1 if the feature was enabled and available and has been turned off,
168 |   *     0 if it was already disabled or unavailable.
169 |   */
170 | int KangarooTwelve_DisableAVX512(void);
171 | int KangarooTwelve_DisableAVX2(void);
172 | int KangarooTwelve_DisableSSSE3(void);
173 | int KangarooTwelve_DisableNeon(void);
174 | int KangarooTwelve_DisableArmSha3(void);
175 | 
176 | /**
177 |   * Function to reset all CPU features to enabled-if-available. Calling this
178 |   * always has no effect if no CPU features have been explicitly disabled.
179 |   */
180 | void KangarooTwelve_EnableAllCpuFeatures(void);
181 | void KangarooTwelve_EnableAllArmCpuFeatures(void);
182 | #endif  // !KeccakP1600_disableParallelism && KeccakP1600_enable_simd_options
183 | 
184 | #endif
185 | 


--------------------------------------------------------------------------------
/lib/Optimized64/KeccakP-1600-AVX512-plainC.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | K12 based on the eXtended Keccak Code Package (XKCP)
  3 | https://github.com/XKCP/XKCP
  4 | 
  5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
  6 | 
  7 | Implementation by Ronny Van Keer, hereby denoted as "the implementer".
  8 | 
  9 | For more information, feedback or questions, please refer to the Keccak Team website:
 10 | https://keccak.team/
 11 | 
 12 | To the extent possible under law, the implementer has waived all copyright
 13 | and related or neighboring rights to the source code in this file.
 14 | http://creativecommons.org/publicdomain/zero/1.0/
 15 | 
 16 | ---
 17 | 
 18 | We would like to thank Vladimir Sedach, we have used parts of his Keccak AVX-512 C++ code.
 19 |  */
 20 | 
 21 | #include <stdio.h>
 22 | #include <stdlib.h>
 23 | #include <string.h>
 24 | #include <stdint.h>
 25 | #include <smmintrin.h>
 26 | #include <wmmintrin.h>
 27 | #include <immintrin.h>
 28 | #include <emmintrin.h>
 29 | #include "align.h"
 30 | 
/* 512-bit vector holding up to eight 64-bit lanes. */
typedef __m512i     V512;

/* Lane-wise logic helpers. The last argument of the ternary-logic intrinsic
 * is an 8-entry truth table indexed by the bits (a,b,c):
 *   0x96 -> a ^ b ^ c          (three-way XOR)
 *   0xD2 -> a ^ (~b & c)       (the Keccak chi step)
 */
#define XOR(a,b)                    _mm512_xor_si512(a,b)
#define XOR3(a,b,c)                 _mm512_ternarylogic_epi64(a,b,c,0x96)
#define XOR5(a,b,c,d,e)             XOR3(XOR3(a,b,c),d,e)
#define ROL(a,offset)               _mm512_rol_epi64(a,offset)
#define Chi(a,b,c)                  _mm512_ternarylogic_epi64(a,b,c,0xD2)

/* Masked unaligned loads/stores. Bit i of the mask selects 64-bit lane i:
 * 0x01 = one lane, 0x1F = five lanes (one plane), 0xFF = all eight lanes.
 * Lanes not selected are zeroed on load and left untouched on store. */
#define LOAD_Lanes(m,a)             _mm512_maskz_loadu_epi64(m,a)
#define LOAD_Lane(a)                LOAD_Lanes(0x01,a)
#define LOAD_Plane(a)               LOAD_Lanes(0x1F,a)
#define LOAD_8Lanes(a)              LOAD_Lanes(0xFF,a)
#define STORE_Lanes(a,m,v)          _mm512_mask_storeu_epi64(a,m,v)
#define STORE_Lane(a,v)             STORE_Lanes(a,0x01,v)
#define STORE_Plane(a,v)            STORE_Lanes(a,0x1F,v)
#define STORE_8Lanes(a,v)           STORE_Lanes(a,0xFF,v)
 47 | 
 48 | /* ---------------------------------------------------------------- */
 49 | /* Reset the state to all zeros. `state` must point to at least 1600/8 = 200 writable bytes. */
 50 | void KeccakP1600_AVX512_Initialize(void *state)
 51 | {
 52 |     memset(state, 0, 1600/8);
 53 | }
 54 | 
 55 | /* ---------------------------------------------------------------- */
 56 | /* XOR `length` bytes of `data` into the state, starting at byte position `offset` of the state. No range check is done on offset/length. */
 57 | void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
 58 | {
 59 |     uint8_t  *stateAsBytes;
 60 |     uint64_t *stateAsLanes;
 61 | 
 62 |     for( stateAsBytes = (uint8_t*)state; ((offset % 8) != 0) && (length != 0); ++offset, --length)  /* 1) byte by byte until offset is a multiple of 8 */
 63 |         stateAsBytes[offset] ^= *(data++);
 64 |     for (stateAsLanes = (uint64_t*)(stateAsBytes + offset); length >= 8*8; stateAsLanes += 8, data += 8*8, length -= 8*8)  /* 2) eight lanes (64 bytes) per iteration */
 65 |         STORE_8Lanes( stateAsLanes, XOR(LOAD_8Lanes(stateAsLanes), LOAD_8Lanes((const uint64_t*)data)));
 66 |     for (/* empty */; length >= 8; ++stateAsLanes, data += 8, length -= 8)  /* 3) one lane (8 bytes) per iteration */
 67 |         STORE_Lane( stateAsLanes, XOR(LOAD_Lane(stateAsLanes), LOAD_Lane((const uint64_t*)data)));
 68 |     for ( stateAsBytes = (uint8_t*)stateAsLanes; length != 0; --length)  /* 4) the remaining 0..7 bytes */
 69 |         *(stateAsBytes++) ^= *(data++);
 70 | }
 71 | 
 72 | /* ---------------------------------------------------------------- */
 73 | /* Copy `length` bytes of the state, starting at byte `offset`, into `data` (plain memcpy, no range check). */
 74 | void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length)
 75 | {
 76 |     memcpy(data, (unsigned char*)state+offset, length);
 77 | }
 78 | 
 79 | /* ---------------------------------------------------------------- */
 80 | /* Per-round constants indexed by round number; KeccakP_Round(i) XORs entry i into lane 0 of the state (Iota step). */
 81 | const uint64_t KeccakP1600RoundConstants[24] = {
 82 |     0x0000000000000001ULL,
 83 |     0x0000000000008082ULL,
 84 |     0x800000000000808aULL,
 85 |     0x8000000080008000ULL,
 86 |     0x000000000000808bULL,
 87 |     0x0000000080000001ULL,
 88 |     0x8000000080008081ULL,
 89 |     0x8000000000008009ULL,
 90 |     0x000000000000008aULL,
 91 |     0x0000000000000088ULL,
 92 |     0x0000000080008009ULL,
 93 |     0x000000008000000aULL,
 94 |     0x000000008000808bULL,  /* [12]: first entry used by the rounds12 macro in this file */
 95 |     0x800000000000008bULL,
 96 |     0x8000000000008089ULL,
 97 |     0x8000000000008003ULL,
 98 |     0x8000000000008002ULL,
 99 |     0x8000000000000080ULL,
100 |     0x000000000000800aULL,
101 |     0x800000008000000aULL,
102 |     0x8000000080008081ULL,
103 |     0x8000000000008080ULL,
104 |     0x0000000080000001ULL,
105 |     0x8000000080008008ULL };  /* [23]: last round */
106 | /* Declares everything KeccakP_Round needs: scratch vectors, the five plane vectors, and constant rotation/index vectors. */
107 | #define KeccakP_DeclareVars \
108 |     V512    b0, b1, b2, b3, b4; /* scratch */ \
109 |     V512    Baeiou, Gaeiou, Kaeiou, Maeiou, Saeiou; /* the state, five lanes per vector (filled by copyFromState) */ \
110 |     V512    moveThetaPrev = _mm512_setr_epi64(4, 0, 1, 2, 3, 5, 6, 7); /* permute index: lane x <- lane x-1 (mod 5) */ \
111 |     V512    moveThetaNext = _mm512_setr_epi64(1, 2, 3, 4, 0, 5, 6, 7); /* permute index: lane x <- lane x+1 (mod 5) */ \
112 |     V512    rhoB = _mm512_setr_epi64( 0,  1, 62, 28, 27, 0, 0, 0); /* rho*: per-lane left-rotation counts, one vector per plane */ \
113 |     V512    rhoG = _mm512_setr_epi64(36, 44,  6, 55, 20, 0, 0, 0); \
114 |     V512    rhoK = _mm512_setr_epi64( 3, 10, 43, 25, 39, 0, 0, 0); \
115 |     V512    rhoM = _mm512_setr_epi64(41, 45, 15, 21,  8, 0, 0, 0); \
116 |     V512    rhoS = _mm512_setr_epi64(18,  2, 61, 56, 14, 0, 0, 0); \
117 |     V512    pi1B = _mm512_setr_epi64(0, 3, 1, 4, 2, 5, 6, 7); /* pi1*: single-source lane permutations used in the "Pi 1" step */ \
118 |     V512    pi1G = _mm512_setr_epi64(1, 4, 2, 0, 3, 5, 6, 7); \
119 |     V512    pi1K = _mm512_setr_epi64(2, 0, 3, 1, 4, 5, 6, 7); \
120 |     V512    pi1M = _mm512_setr_epi64(3, 1, 4, 2, 0, 5, 6, 7); \
121 |     V512    pi1S = _mm512_setr_epi64(4, 2, 0, 3, 1, 5, 6, 7); \
122 |     V512    pi2S1 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 0+8, 2+8); /* pi2*: two-source indices for the "Pi 2" step; "+8" picks a lane of the second source */ \
123 |     V512    pi2S2 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 1+8, 3+8); \
124 |     V512    pi2BG = _mm512_setr_epi64(0, 1, 0+8, 1+8, 6, 5, 6, 7); \
125 |     V512    pi2KM = _mm512_setr_epi64(2, 3, 2+8, 3+8, 7, 5, 6, 7); \
126 |     V512    pi2S3 = _mm512_setr_epi64(4, 5, 4+8, 5+8, 4, 5, 6, 7);
127 | /* Load the 25 lanes at pState (a uint64_t pointer) into the five plane vectors, five consecutive lanes each; lanes 5..7 of each vector are zeroed. */
128 | #define copyFromState(pState) \
129 |     Baeiou = LOAD_Plane(pState+ 0); \
130 |     Gaeiou = LOAD_Plane(pState+ 5); \
131 |     Kaeiou = LOAD_Plane(pState+10); \
132 |     Maeiou = LOAD_Plane(pState+15); \
133 |     Saeiou = LOAD_Plane(pState+20);
134 | /* Inverse of copyFromState: write lanes 0..4 of each plane vector back to the 25 lanes at pState (masked stores, five lanes each). */
135 | #define copyToState(pState) \
136 |     STORE_Plane(pState+ 0, Baeiou); \
137 |     STORE_Plane(pState+ 5, Gaeiou); \
138 |     STORE_Plane(pState+10, Kaeiou); \
139 |     STORE_Plane(pState+15, Maeiou); \
140 |     STORE_Plane(pState+20, Saeiou);
141 | /* One round on the five plane vectors; `i` indexes KeccakP1600RoundConstants. Needs KeccakP_DeclareVars in scope; expands without a trailing semicolon. */
142 | #define KeccakP_Round(i) \
143 |     /* Theta */ \
144 |     b0 = XOR5( Baeiou, Gaeiou, Kaeiou, Maeiou, Saeiou ); /* XOR of the five planes */ \
145 |     b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); \
146 |     b0 = _mm512_permutexvar_epi64(moveThetaNext, b0); \
147 |     b0 = _mm512_rol_epi64(b0, 1); \
148 |     Baeiou = XOR3( Baeiou, b0, b1 ); \
149 |     Gaeiou = XOR3( Gaeiou, b0, b1 ); \
150 |     Kaeiou = XOR3( Kaeiou, b0, b1 ); \
151 |     Maeiou = XOR3( Maeiou, b0, b1 ); \
152 |     Saeiou = XOR3( Saeiou, b0, b1 ); \
153 |     /* Rho */ \
154 |     Baeiou = _mm512_rolv_epi64(Baeiou, rhoB); \
155 |     Gaeiou = _mm512_rolv_epi64(Gaeiou, rhoG); \
156 |     Kaeiou = _mm512_rolv_epi64(Kaeiou, rhoK); \
157 |     Maeiou = _mm512_rolv_epi64(Maeiou, rhoM); \
158 |     Saeiou = _mm512_rolv_epi64(Saeiou, rhoS); \
159 |     /* Pi 1 */ \
160 |     b0 = _mm512_permutexvar_epi64(pi1B, Baeiou); \
161 |     b1 = _mm512_permutexvar_epi64(pi1G, Gaeiou); \
162 |     b2 = _mm512_permutexvar_epi64(pi1K, Kaeiou); \
163 |     b3 = _mm512_permutexvar_epi64(pi1M, Maeiou); \
164 |     b4 = _mm512_permutexvar_epi64(pi1S, Saeiou); \
165 |     /* Chi */ \
166 |     Baeiou = Chi(b0, b1, b2); \
167 |     Gaeiou = Chi(b1, b2, b3); \
168 |     Kaeiou = Chi(b2, b3, b4); \
169 |     Maeiou = Chi(b3, b4, b0); \
170 |     Saeiou = Chi(b4, b0, b1); \
171 |     /* Iota */ \
172 |     Baeiou = XOR(Baeiou, LOAD_Lane(KeccakP1600RoundConstants+i)); /* LOAD_Lane zeroes lanes 1..7, so only lane 0 changes */ \
173 |     /* Pi 2 */ \
174 |     b0 = _mm512_unpacklo_epi64(Baeiou, Gaeiou); \
175 |     b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); \
176 |     b0 = _mm512_permutex2var_epi64(b0, pi2S1, Saeiou); \
177 |     b2 = _mm512_unpackhi_epi64(Baeiou, Gaeiou); \
178 |     b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); \
179 |     b2 = _mm512_permutex2var_epi64(b2, pi2S2, Saeiou); \
180 |     Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); \
181 |     Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); \
182 |     Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); \
183 |     Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); \
184 |     b0 = _mm512_permutex2var_epi64(b0, pi2S3, b1); \
185 |     Saeiou = _mm512_mask_blend_epi64(0x10, b0, Saeiou)  /* mask 0x10: lane 4 kept from Saeiou, all other lanes taken from b0 */
186 | /* Twelve consecutive rounds using round-constant indices 12..23 (the last half of the 24-entry table). No trailing semicolon. */
187 | #define rounds12 \
188 |     KeccakP_Round( 12 ); \
189 |     KeccakP_Round( 13 ); \
190 |     KeccakP_Round( 14 ); \
191 |     KeccakP_Round( 15 ); \
192 |     KeccakP_Round( 16 ); \
193 |     KeccakP_Round( 17 ); \
194 |     KeccakP_Round( 18 ); \
195 |     KeccakP_Round( 19 ); \
196 |     KeccakP_Round( 20 ); \
197 |     KeccakP_Round( 21 ); \
198 |     KeccakP_Round( 22 ); \
199 |     KeccakP_Round( 23 )
200 | 
201 | /* ---------------------------------------------------------------- */
202 | /* Apply the 12-round permutation in place to the 25-lane state at `state`: load into vectors, run rounds12, store back. */
203 | void KeccakP1600_AVX512_Permute_12rounds(void *state)
204 | {
205 |     KeccakP_DeclareVars  /* the macro body already ends with ';' */
206 |     uint64_t *stateAsLanes = (uint64_t*)state;
207 | 
208 |     copyFromState(stateAsLanes);
209 |     rounds12;
210 |     copyToState(stateAsLanes);
211 | }
212 | 
213 | /* ---------------------------------------------------------------- */
214 | 
215 | #include <assert.h>
216 | /* Absorb every whole block of laneCount*8 bytes from `data` (XOR into the state, then 12 rounds); returns the number of bytes consumed, leaving any partial block to the caller. */
217 | size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen)
218 | {
219 |     size_t originalDataByteLen = dataByteLen;
220 | 
221 |     assert(laneCount == 21 || laneCount == 17);  /* only these two block sizes are implemented */
222 | 
223 |     KeccakP_DeclareVars;
224 |     uint64_t *stateAsLanes = (uint64_t*)state;
225 |     const uint64_t *inDataAsLanes = (const uint64_t*)data;  /* input is only read, so keep it const */
226 |     /* 21 lanes (168-byte blocks): the state stays in vector registers across all blocks. */
227 |     if (laneCount == 21) {
228 |         #define laneCount 21
229 |         copyFromState(stateAsLanes);
230 |         while(dataByteLen >= 21*8) {
231 |             Baeiou = XOR(Baeiou, LOAD_Plane(inDataAsLanes+ 0));
232 |             Gaeiou = XOR(Gaeiou, LOAD_Plane(inDataAsLanes+ 5));
233 |             Kaeiou = XOR(Kaeiou, LOAD_Plane(inDataAsLanes+10));
234 |             Maeiou = XOR(Maeiou, LOAD_Plane(inDataAsLanes+15));
235 |             Saeiou = XOR(Saeiou, LOAD_Lane(inDataAsLanes+20));  /* 21st lane only; the other lanes of this plane get no input */
236 |             rounds12;
237 |             inDataAsLanes += 21;
238 |             dataByteLen -= 21*8;
239 |         }
240 |         #undef laneCount
241 |         copyToState(stateAsLanes);
242 |     } else if (laneCount == 17) {
243 |         // TODO: further optimization needed for this case, laneCount == 17.
244 |         while(dataByteLen >= laneCount*8) {
245 |             KeccakP1600_AVX512_AddBytes(state, data, 0, laneCount*8);  /* this file's own AVX-512 routines; the generic KeccakP1600_* entry points are not declared here */
246 |             KeccakP1600_AVX512_Permute_12rounds(state);
247 |             data += laneCount*8;
248 |             dataByteLen -= laneCount*8;
249 |         }
250 |     }
251 | 
252 |     return originalDataByteLen - dataByteLen;
253 | }
254 | 


--------------------------------------------------------------------------------
/tests/timing.h:
--------------------------------------------------------------------------------
  1 | // Adapted from Google Benchmark (https://github.com/google/benchmark).
  2 | //
  3 | // Copyright 2020 Google Inc.
  4 | //
  5 | // Licensed under the Apache License, Version 2.0 (the "License");
  6 | // you may not use this file except in compliance with the License.
  7 | // You may obtain a copy of the License at
  8 | //
  9 | //     http://www.apache.org/licenses/LICENSE-2.0
 10 | //
 11 | // Unless required by applicable law or agreed to in writing, software
 12 | // distributed under the License is distributed on an "AS IS" BASIS,
 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | // See the License for the specific language governing permissions and
 15 | // limitations under the License.
 16 | 
 17 | #ifndef _XKCP_timing_h_
 18 | #define _XKCP_timing_h_
 19 | 
 20 | #include <stdint.h>
 21 | 
 22 | #if defined(__GNUC__)  // GCC-compatible compilers
 23 | #define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
 24 | #elif defined(_MSC_VER) && !defined(__clang__)  // MSVC proper (not clang in MSVC mode)
 25 | #define BENCHMARK_ALWAYS_INLINE __forceinline
 26 | #if _MSC_VER >= 1900  // NOTE(review): both branches of this conditional are empty in this file
 27 | #else
 28 | #endif
 29 | #define __func__ __FUNCTION__
 30 | #else  // any other compiler: no forced inlining
 31 | #define BENCHMARK_ALWAYS_INLINE
 32 | #endif
 33 | 
 34 | #ifndef __has_feature  // fallback so __has_feature(x) can be used on compilers that lack it
 35 | #define __has_feature(x) 0
 36 | #endif
 37 | // Compiler detection: defines one of COMPILER_IBMXL, COMPILER_CLANG, COMPILER_MSVC or COMPILER_GCC unless already defined.
 38 | #if defined(__clang__)  // tested first, before _MSC_VER and __GNUC__
 39 |   #if defined(__ibmxl__)
 40 |     #if !defined(COMPILER_IBMXL)
 41 |       #define COMPILER_IBMXL
 42 |     #endif
 43 |   #elif !defined(COMPILER_CLANG)
 44 |     #define COMPILER_CLANG
 45 |   #endif
 46 | #elif defined(_MSC_VER)
 47 |   #if !defined(COMPILER_MSVC)
 48 |     #define COMPILER_MSVC
 49 |   #endif
 50 | #elif defined(__GNUC__)
 51 |   #if !defined(COMPILER_GCC)
 52 |     #define COMPILER_GCC
 53 |   #endif
 54 | #endif
 55 | // Operating-system detection: the first matching branch defines the corresponding BENCHMARK_OS_* macro(s) to 1.
 56 | #if defined(__CYGWIN__)
 57 |   #define BENCHMARK_OS_CYGWIN 1
 58 | #elif defined(_WIN32)
 59 |   #define BENCHMARK_OS_WINDOWS 1
 60 |   #if defined(__MINGW32__)
 61 |     #define BENCHMARK_OS_MINGW 1
 62 |   #endif
 63 | #elif defined(__APPLE__)
 64 |   #define BENCHMARK_OS_APPLE 1
 65 |   #include "TargetConditionals.h"
 66 |   #if defined(TARGET_OS_MAC)  // NOTE(review): tests only that the macro is defined, not its value
 67 |     #define BENCHMARK_OS_MACOSX 1
 68 |     #if defined(TARGET_OS_IPHONE)  // NOTE(review): same here - definedness, not value
 69 |       #define BENCHMARK_OS_IOS 1
 70 |     #endif
 71 |   #endif
 72 | #elif defined(__FreeBSD__)
 73 |   #define BENCHMARK_OS_FREEBSD 1
 74 | #elif defined(__NetBSD__)
 75 |   #define BENCHMARK_OS_NETBSD 1
 76 | #elif defined(__OpenBSD__)
 77 |   #define BENCHMARK_OS_OPENBSD 1
 78 | #elif defined(__DragonFly__)
 79 |   #define BENCHMARK_OS_DRAGONFLY 1
 80 | #elif defined(__linux__)
 81 |   #define BENCHMARK_OS_LINUX 1
 82 | #elif defined(__native_client__)
 83 |   #define BENCHMARK_OS_NACL 1
 84 | #elif defined(__EMSCRIPTEN__)
 85 |   #define BENCHMARK_OS_EMSCRIPTEN 1
 86 | #elif defined(__rtems__)
 87 |   #define BENCHMARK_OS_RTEMS 1
 88 | #elif defined(__Fuchsia__)
 89 | #define BENCHMARK_OS_FUCHSIA 1
 90 | #elif defined (__SVR4) && defined (__sun)
 91 | #define BENCHMARK_OS_SOLARIS 1
 92 | #elif defined(__QNX__)
 93 | #define BENCHMARK_OS_QNX 1
 94 | #elif defined(__MVS__)
 95 | #define BENCHMARK_OS_ZOS 1
 96 | #endif
 97 | 
 98 | #if defined(BENCHMARK_OS_MACOSX)
 99 | #include <mach/mach_time.h>
100 | #endif
101 | // For MSVC, we want to use '_asm rdtsc' when possible (since it works
102 | // with even ancient MSVC compilers), and when not possible the
103 | // __rdtsc intrinsic, declared in <intrin.h>.  Unfortunately, in some
104 | // environments, <windows.h> and <intrin.h> have conflicting
105 | // declarations of some other intrinsics, breaking compilation.
106 | // Therefore, we simply declare __rdtsc ourselves. See also
107 | // http://connect.microsoft.com/VisualStudio/feedback/details/262047
108 | #if defined(COMPILER_MSVC) && !defined(_M_IX86)
109 | uint64_t __rdtsc();
110 | #pragma intrinsic(__rdtsc)
111 | #endif
112 | 
113 | #if !defined(BENCHMARK_OS_WINDOWS) || defined(BENCHMARK_OS_MINGW)
114 | #include <sys/time.h>
115 | #include <time.h>
116 | #endif
117 | 
118 | #ifdef BENCHMARK_OS_EMSCRIPTEN
119 | #include <emscripten.h>
120 | #endif
121 | 
122 | // NOTE: only i386 and x86_64 have been well tested.
123 | // PPC, sparc, alpha, and ia64 are based on
124 | //    http://peter.kuscsik.com/wordpress/?p=14
125 | // with modifications by m3b.  See also
126 | //    https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h
127 | // NOTE: the unit of the returned value depends on the branch compiled in (CPU cycles, timer ticks, ns or us).
128 | // This should return the number of cycles since power-on.  Thread-safe.
129 | inline BENCHMARK_ALWAYS_INLINE int64_t CycleTimer() {
130 | #if defined(BENCHMARK_OS_EMSCRIPTEN)
131 |   // this goes above x86-specific code because old versions of Emscripten
132 |   // define __x86_64__, although they have nothing to do with it.
133 |   return (int64_t)(emscripten_get_now() * 1e+6);
134 | #elif defined(__i386__)
135 |   int64_t ret;
136 |   __asm__ volatile("rdtsc" : "=A"(ret));  // "=A": 64-bit result taken from the edx:eax pair
137 |   return ret;
138 | #elif defined(__x86_64__) || defined(__amd64__)
139 |   uint64_t low, high;
140 |   __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
141 |   return (high << 32) | low;  // recombine the two 32-bit halves
142 | #elif defined(BENCHMARK_OS_MACOSX)
143 |   // this goes at the top because we need ALL Macs, regardless of
144 |   // architecture, to return the number of "mach time units" that
145 |   // have passed since startup.  See sysinfo.cc where
146 |   // InitializeSystemInfo() sets the supposed cpu clock frequency of
147 |   // macs to the number of mach time units per second, not actual
148 |   // CPU clock frequency (which can change in the face of CPU
149 |   // frequency scaling).  Also note that when the Mac sleeps, this
150 |   // counter pauses; it does not continue counting, nor does it
151 |   // reset to zero.
152 |   // XKCP-specific: moved this below i386 and x86_64 tests to favor real CPU cycles when available
153 |   return mach_absolute_time();
154 | #elif defined(__powerpc__) || defined(__ppc__)
155 |   // This returns a time-base, which is not always precisely a cycle-count.
156 | #if defined(__powerpc64__) || defined(__ppc64__)
157 |   int64_t tb;
158 |   asm volatile("mfspr %0, 268" : "=r"(tb));
159 |   return tb;
160 | #else
161 |   uint32_t tbl, tbu0, tbu1;
162 |   asm volatile(
163 |       "mftbu %0\n"
164 |       "mftb %1\n"
165 |       "mftbu %2"
166 |       : "=r"(tbu0), "=r"(tbl), "=r"(tbu1));
167 |   tbl &= -(int32_t)(tbu0 == tbu1);  // zero the low word if the upper word changed between the two reads
168 |   // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is no longer needed)
169 |   return ((uint64_t)(tbu1) << 32) | tbl;
170 | #endif
171 | #elif defined(__sparc__)
172 |   int64_t tick;
173 |   asm(".byte 0x83, 0x41, 0x00, 0x00");
174 |   asm("mov   %%g1, %0" : "=r"(tick));
175 |   return tick;
176 | #elif defined(__ia64__)
177 |   int64_t itc;
178 |   asm("mov %0 = ar.itc" : "=r"(itc));
179 |   return itc;
180 | #elif defined(COMPILER_MSVC) && defined(_M_IX86)
181 |   // Older MSVC compilers (like 7.x) don't seem to support the
182 |   // __rdtsc intrinsic properly, so I prefer to use _asm instead
183 |   // when I know it will work.  Otherwise, I'll use __rdtsc and hope
184 |   // the code is being compiled with a non-ancient compiler.
185 |   _asm rdtsc  // NOTE(review): no return statement; presumably relies on edx:eax being the return registers - confirm
186 | #elif defined(COMPILER_MSVC)
187 |   return __rdtsc();
188 | #elif defined(BENCHMARK_OS_NACL)
189 |   // Native Client validator on x86/x86-64 allows RDTSC instructions,
190 |   // and this case is handled above. Native Client validator on ARM
191 |   // rejects MRC instructions (used in the ARM-specific sequence below),
192 |   // so we handle it here. Portable Native Client compiles to
193 |   // architecture-agnostic bytecode, which doesn't provide any
194 |   // cycle counter access mnemonics.
195 | 
196 |   // Native Client does not provide any API to access cycle counter.
197 |   // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
198 |   // because it provides nanosecond resolution (which is noticeable at
199 |   // least for PNaCl modules running on x86 Mac & Linux).
200 |   // Initialize to always return 0 if clock_gettime fails.
201 |   struct timespec ts = {0, 0};
202 |   clock_gettime(CLOCK_MONOTONIC, &ts);
203 |   return (int64_t)(ts.tv_sec) * 1000000000 + ts.tv_nsec;
204 | #elif defined(__aarch64__)
205 |   // System timer of ARMv8 runs at a different frequency than the CPU's.
206 |   // The frequency is fixed, typically in the range 1-50MHz.  It can be
207 |   // read at CNTFRQ special register.  We assume the OS has set up
208 |   // the virtual timer properly.
209 |   int64_t virtual_timer_value;
210 |   asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));
211 |   return virtual_timer_value;
212 | #elif defined(__ARM_ARCH)
213 |   // V6 is the earliest arch that has a standard cyclecount
214 |   // Native Client validator doesn't allow MRC instructions.
215 | #if (__ARM_ARCH >= 6)
216 |   uint32_t pmccntr;
217 |   uint32_t pmuseren;
218 |   uint32_t pmcntenset;
219 |   // Read the user mode perf monitor counter access permissions.
220 |   asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
221 |   if (pmuseren & 1) {  // Allows reading perfmon counters for user mode code.
222 |     asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
223 |     if (pmcntenset & 0x80000000ul) {  // Is it counting?
224 |       asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
225 |       // The counter is set up to count every 64th cycle
226 |       return (int64_t)(pmccntr) * 64;  // Should optimize to << 6
227 |     }
228 |   }
229 | #endif
230 |   struct timeval tv;  // fallback when the cycle counter is not readable: wall-clock time
231 |   gettimeofday(&tv, NULL);
232 |   return (int64_t)(tv.tv_sec) * 1000000 + tv.tv_usec;  // microseconds
233 | #elif defined(__mips__) || defined(__m68k__)
234 |   // mips apparently only allows rdtsc for superusers, so we fall
235 |   // back to gettimeofday.  It's possible clock_gettime would be better.
236 |   struct timeval tv;
237 |   gettimeofday(&tv, NULL);
238 |   return (int64_t)(tv.tv_sec) * 1000000 + tv.tv_usec;  // microseconds
239 | #elif defined(__s390__)  // Covers both s390 and s390x.
240 |   // Return the CPU clock.
241 |   uint64_t tsc;
242 | #if defined(BENCHMARK_OS_ZOS) && defined(COMPILER_IBMXL)
243 |   // z/OS XL compiler HLASM syntax.
244 |   asm(" stck %0" : "=m"(tsc) : : "cc");
245 | #else
246 |   asm("stck %0" : "=Q"(tsc) : : "cc");
247 | #endif
248 |   return tsc;
249 | #elif defined(__riscv) // RISC-V
250 |   // Use RDCYCLE (and RDCYCLEH on riscv32)
251 | #if __riscv_xlen == 32
252 |   uint32_t cycles_lo, cycles_hi0, cycles_hi1;
253 |   // This asm also includes the PowerPC overflow handling strategy, as above.
254 |   // Implemented in assembly because Clang insisted on branching.
255 |   asm volatile(
256 |       "rdcycleh %0\n"
257 |       "rdcycle %1\n"
258 |       "rdcycleh %2\n"
259 |       "sub %0, %0, %2\n"
260 |       "seqz %0, %0\n"
261 |       "sub %0, zero, %0\n"
262 |       "and %1, %1, %0\n"
263 |       : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1));
264 |   return ((uint64_t)(cycles_hi1) << 32) | cycles_lo;
265 | #else
266 |   uint64_t cycles;
267 |   asm volatile("rdcycle %0" : "=r"(cycles));
268 |   return cycles;
269 | #endif
270 | #else
271 | // The soft failover to a generic implementation is automatic only for ARM.
272 | // For other platforms the developer is expected to make an attempt to create
273 | // a fast implementation and use generic version if nothing better is available.
274 | #error You need to define CycleTimer for your OS and CPU
275 | #endif
276 | }
277 | 
278 | /* ---------------------------------------------------------------- */
279 | /*           XKCP-specific definitions follow.                      */
280 | /* ---------------------------------------------------------------- */
281 | 
282 | /* Timing value type plus helper macros that wrap a code fragment in a "minimum over TIMER_SAMPLE_CNT runs" measurement. */
283 | typedef int64_t cycles_t;
284 | #define CYCLES_MAX INT64_MAX
285 | 
286 | #define TIMER_SAMPLE_CNT (100)  /* number of timed repetitions per measurement */
287 | 
288 | const char * getTimerUnit();
289 | extern double timerCorrectionFactor;  /* multiplied into the result by measureTimingEnd */
290 | cycles_t CalibrateTimer();
291 | /* Declares the variables used by the measurement loop; pair with measureTimingBeginDeclared. */
292 | #define measureTimingDeclare \
293 |     cycles_t tMin = CYCLES_MAX; \
294 |     cycles_t t0,t1,i;
295 | /* Opens the sampling loop and takes the start timestamp; expects the variables from measureTimingDeclare. */
296 | #define measureTimingBeginDeclared \
297 |     for (i=0;i < TIMER_SAMPLE_CNT;i++) \
298 |         { \
299 |         t0 = CycleTimer();
300 | /* measureTimingDeclare and measureTimingBeginDeclared combined in one macro. */
301 | #define measureTimingBegin \
302 |     cycles_t tMin = CYCLES_MAX; \
303 |     cycles_t t0,t1,i; \
304 |     for (i=0;i < TIMER_SAMPLE_CNT;i++) \
305 |         { \
306 |         t0 = CycleTimer();
307 | /* Closes the loop, keeps the smallest (t1-t0 - dtMin) and returns it scaled and rounded; `dtMin` must be in scope where this expands, inside a function returning cycles_t. */
308 | #define measureTimingEnd \
309 |         t1 = CycleTimer(); \
310 |         if (tMin > t1-t0 - dtMin) \
311 |             tMin = t1-t0 - dtMin; \
312 |         } \
313 |     return (cycles_t)(tMin * timerCorrectionFactor + 0.5);
314 | 
315 | #endif  // _XKCP_timing_h_
316 | 


--------------------------------------------------------------------------------
/lib/ARMv8Asha3/KeccakP-1600-runtimeDispatch.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | K12 based on the eXtended Keccak Code Package (XKCP)
  3 | https://github.com/XKCP/XKCP
  4 | 
  5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
  6 | 
  7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
  8 | 
  9 | For more information, feedback or questions, please refer to the Keccak Team website:
 10 | https://keccak.team/
 11 | 
 12 | To the extent possible under law, the implementer has waived all copyright
 13 | and related or neighboring rights to the source code in this file.
 14 | http://creativecommons.org/publicdomain/zero/1.0/
 15 | 
 16 | ---
 17 | 
 18 | Please refer to the XKCP for more details.
 19 | 
 20 | ARM CPU feature detection adapted from libaegis by Frank Denis.
 21 | */
 22 | 
 23 | #include <stdint.h>
 24 | #include <stdlib.h>
 25 | #include <string.h>
 26 | #include "KeccakP-1600-SnP.h"
 27 | 
 28 | #ifdef KeccakP1600_disableParallelism
 29 | #undef KeccakP1600_enable_simd_options
 30 | #else
 31 | 
 32 | // Forward declarations
 33 | void KangarooTwelve_SetArmProcessorCapabilities();
 34 | // Opt-out switches: when nonzero, KangarooTwelve_SetArmProcessorCapabilities() forces the matching K12_enable* flag to 0.
 35 | #ifdef KeccakP1600_enable_simd_options
 36 | int K12_NEON_requested_disabled = 0;
 37 | int K12_ARM_SHA3_requested_disabled = 0;
 38 | #endif  // KeccakP1600_enable_simd_options
 39 | // Effective feature flags (0 or 1), recomputed by every call to KangarooTwelve_SetArmProcessorCapabilities().
 40 | int K12_enableNEON = 0;
 41 | int K12_enableARM_SHA3 = 0;
 42 | 
 43 | /* ---------------------------------------------------------------- */
 44 | /* Platform-specific includes for CPU feature detection */
 45 | /* ---------------------------------------------------------------- */
 46 | 
 47 | #if defined(__linux__) && (defined(__aarch64__) || defined(__arm__))
 48 | #define K12_HAVE_LINUX_ARM
 49 | #if defined(__GLIBC__) || defined(__BIONIC__)
 50 | #include <sys/auxv.h>
 51 | #define K12_HAVE_GETAUXVAL
 52 | #endif
 53 | #endif
 54 | 
 55 | #if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm__))
 56 | #define K12_HAVE_APPLE_ARM
 57 | #include <sys/sysctl.h>
 58 | #endif
 59 | 
 60 | #if defined(_WIN32) && (defined(_M_ARM64) || defined(_M_ARM))
 61 | #define K12_HAVE_WINDOWS_ARM
 62 | #include <windows.h>
 63 | #endif
 64 | 
 65 | #if defined(__ANDROID__) && (defined(__aarch64__) || defined(__arm__))
 66 | #define K12_HAVE_ANDROID_ARM
 67 | #include <cpu-features.h>
 68 | #endif
 69 | 
 70 | /* ---------------------------------------------------------------- */
 71 | /* Hardware capability constants */
 72 | /* ---------------------------------------------------------------- */
 73 | 
 74 | // 32-bit ARM hwcaps (AT_HWCAP)
 75 | #ifndef K12_ARM_HWCAP_NEON  // each mask may be overridden by defining it before this point
 76 | #define K12_ARM_HWCAP_NEON (1L << 12)
 77 | #endif
 78 | 
 79 | // AArch64 hwcaps (AT_HWCAP)
 80 | #ifndef K12_AARCH64_HWCAP_ASIMD
 81 | #define K12_AARCH64_HWCAP_ASIMD (1L << 1)
 82 | #endif
 83 | // Bit mask passed to _have_hwcap() by get_arm_cpu_features() to detect the SHA3 instructions.
 84 | #ifndef K12_AARCH64_HWCAP_SHA3
 85 | #define K12_AARCH64_HWCAP_SHA3 (1L << 17)
 86 | #endif
 87 | 
 88 | /* ---------------------------------------------------------------- */
 89 | /* CPU feature detection */
 90 | /* ---------------------------------------------------------------- */
 91 | 
 92 | enum arm_cpu_feature {  // bit flags, combined with | in get_arm_cpu_features()
 93 |     ARM_NEON = 1 << 0,
 94 |     ARM_SHA3 = 1 << 1,
 95 |     ARM_UNDEFINED = 1 << 30  // sentinel: detection has not run yet
 96 | };
 97 | // Cached detection result; stays ARM_UNDEFINED until get_arm_cpu_features() has run once.
 98 | static enum arm_cpu_feature g_arm_cpu_features = ARM_UNDEFINED;
 99 | 
100 | #if defined(K12_HAVE_LINUX_ARM) && defined(K12_HAVE_GETAUXVAL)
101 | // Returns 1 if any bit of hwcap_bit is set in the AT_HWCAP auxiliary-vector word, else 0.
102 | static int _have_hwcap(unsigned long hwcap_bit) {
103 |     return (getauxval(AT_HWCAP) & hwcap_bit) != 0;
104 | }
105 | #endif
106 | 
107 | #if defined(K12_HAVE_APPLE_ARM)
108 | // Looks up a sysctl by name; returns 1 only if the lookup succeeds and the value read is nonzero, else 0.
109 | static int _have_arm_feature(const char *feature_name) {
110 |     int64_t value = 0;
111 |     size_t value_size = sizeof(value);
112 |     const int lookup_ok = (sysctlbyname(feature_name, &value, &value_size, NULL, 0) == 0);
113 | 
114 |     return lookup_ok && (value != 0);
115 | }
116 | #endif
117 | // Detects NEON and SHA3 support once, caches the result in g_arm_cpu_features, and returns the cache on later calls.
118 | static enum arm_cpu_feature get_arm_cpu_features(void) {
119 |     if (g_arm_cpu_features != ARM_UNDEFINED) {
120 |         return g_arm_cpu_features;
121 |     }
122 | 
123 |     enum arm_cpu_feature features = 0;  // stays 0 (no features) if no detection branch below applies
124 | 
125 |     /* ---------------------------------------------------------------- */
126 |     /* NEON Detection */
127 |     /* ---------------------------------------------------------------- */
128 | 
129 |     // Compile-time check - if built with NEON, assume available
130 | #if defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64)
131 |     features |= ARM_NEON;
132 | #elif defined(K12_HAVE_LINUX_ARM) && defined(K12_HAVE_GETAUXVAL)
133 |     // Runtime detection on Linux ARM
134 | #if defined(__aarch64__)  // NOTE(review): looks unreachable - __aarch64__ already selects the first branch above
135 |     if (_have_hwcap(K12_AARCH64_HWCAP_ASIMD)) {
136 |         features |= ARM_NEON;
137 |     }
138 | #elif defined(__arm__)
139 |     if (_have_hwcap(K12_ARM_HWCAP_NEON)) {
140 |         features |= ARM_NEON;
141 |     }
142 | #endif
143 | #elif defined(K12_HAVE_ANDROID_ARM)
144 |     // Android detection
145 |     uint64_t android_features = android_getCpuFeatures();
146 |     if (android_features & ANDROID_CPU_ARM_FEATURE_NEON) {
147 |         features |= ARM_NEON;
148 |     }
149 | #elif defined(K12_HAVE_WINDOWS_ARM)
150 |     // Windows ARM64 - assume all have NEON (NOTE(review): _M_ARM64 builds already take the first branch above)
151 |     features |= ARM_NEON;
152 | #endif
153 | 
154 |     /* ---------------------------------------------------------------- */
155 |     /* SHA3 Detection (requires NEON) */
156 |     /* ---------------------------------------------------------------- */
157 | 
158 |     if (features & ARM_NEON) {
159 |         // Compile-time check
160 | #if defined(__ARM_FEATURE_SHA3)
161 |         features |= ARM_SHA3;
162 | #elif defined(K12_HAVE_LINUX_ARM) && defined(K12_HAVE_GETAUXVAL) && defined(__aarch64__)
163 |         // Runtime detection on Linux AArch64
164 |         if (_have_hwcap(K12_AARCH64_HWCAP_SHA3)) {
165 |             features |= ARM_SHA3;
166 |         }
167 | #elif defined(K12_HAVE_APPLE_ARM)
168 |         // macOS/Apple Silicon detection
169 |         if (_have_arm_feature("hw.optional.arm.FEAT_SHA3")) {
170 |             features |= ARM_SHA3;
171 |         }
172 | #endif
173 |     }
174 | 
175 |     g_arm_cpu_features = features;  // cache for subsequent calls
176 |     return features;
177 | }
178 | 
179 | void KangarooTwelve_SetArmProcessorCapabilities() {
180 |     // Publish the detected features as 0/1 flags, then apply any requested opt-outs.
181 |     const enum arm_cpu_feature detected = get_arm_cpu_features();
182 |     K12_enableNEON = (detected & ARM_NEON) ? 1 : 0;
183 |     K12_enableARM_SHA3 = (detected & ARM_SHA3) ? 1 : 0;
184 | #ifdef KeccakP1600_enable_simd_options
185 |     if (K12_NEON_requested_disabled) K12_enableNEON = 0;
186 |     if (K12_ARM_SHA3_requested_disabled) K12_enableARM_SHA3 = 0;
187 | #endif  // KeccakP1600_enable_simd_options
188 | }
189 | 
190 | /* ---------------------------------------------------------------- */
191 | /* External function declarations */
192 | /* ---------------------------------------------------------------- */
193 | 
194 | // Generic ARM64 implementations (from KeccakP-1600-opt64.c)
195 | extern void KeccakP1600_opt64_Initialize(void *state);
196 | extern void KeccakP1600_opt64_AddByte(void *state, unsigned char data, unsigned int offset);
197 | extern void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
198 | extern void KeccakP1600_opt64_Permute_12rounds(void *state);
199 | extern void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
200 | extern size_t KeccakP1600_opt64_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
201 | 
202 | // ARMv8-A SHA3 optimized implementations (from assembly)
203 | extern void KeccakP1600_ARMv8Asha3_Permute_12rounds(void *state);
204 | extern size_t KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
205 | extern void KeccakP1600times2_ARMv8Asha3_Permute_12rounds(void *state);
206 | extern void KT128_ARMv8Asha3_Process2Leaves(const unsigned char *input, unsigned char *output);
207 | extern void KT256_ARMv8Asha3_Process2Leaves(const unsigned char *input, unsigned char *output);
208 | 
209 | /* ---------------------------------------------------------------- */
210 | /* Dispatch functions for Keccak-p[1600] */
211 | /* ---------------------------------------------------------------- */
212 | 
213 | const char * KeccakP1600_GetImplementation() {
214 |     // Human-readable name of the single-state implementation that the
215 |     // dispatch functions below will use, given the current capability flags.
216 |     KangarooTwelve_SetArmProcessorCapabilities();
217 |     return K12_enableARM_SHA3
218 |         ? "ARMv8-A+SHA3 optimized implementation"
219 |         : "Generic ARM64 implementation";
220 | }
221 | // Initialize the state by delegating to the generic opt64 routine, after refreshing the capability flags.
222 | void KeccakP1600_Initialize(void *state) {
223 |     KangarooTwelve_SetArmProcessorCapabilities();
224 |     KeccakP1600_opt64_Initialize(state);  // Both code paths share this routine; no SHA3-specific variant is declared
225 | }
226 | // Forward a single-byte add at byte position `offset` to the generic opt64 routine, after refreshing the capability flags.
227 | void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset) {
228 |     KangarooTwelve_SetArmProcessorCapabilities();
229 |     KeccakP1600_opt64_AddByte(state, data, offset);  // Both code paths share this routine; no SHA3-specific variant is declared
230 | }
231 | // Forward a `length`-byte add starting at byte position `offset` to the generic opt64 routine, after refreshing the capability flags.
232 | void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) {
233 |     KangarooTwelve_SetArmProcessorCapabilities();
234 |     KeccakP1600_opt64_AddBytes(state, data, offset, length);  // Both code paths share this routine; no SHA3-specific variant is declared
235 | }
236 | 
237 | void KeccakP1600_Permute_12rounds(void *state) {  /* Runs the 12-round permutation on state via the enabled backend. */
238 |     KangarooTwelve_SetArmProcessorCapabilities();
239 |     if (!K12_enableARM_SHA3) {
240 |         KeccakP1600_opt64_Permute_12rounds(state);  /* opt64 code path */
241 |         return;
242 |     }
243 |     KeccakP1600_ARMv8Asha3_Permute_12rounds(state);  /* ARMv8-A SHA3 code path */
244 | }
245 | 
246 | void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) {  // Forwards all arguments unchanged to the opt64 ExtractBytes.
247 |     KangarooTwelve_SetArmProcessorCapabilities();  // result is not consulted here: there is only one ExtractBytes path
248 |     KeccakP1600_opt64_ExtractBytes(state, data, offset, length);  // Both backends use the same ExtractBytes
249 | }
250 | 
251 | size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) {
252 |     /* Bulk-absorb entry point: arguments and return value pass straight through to the selected backend. */
253 |     KangarooTwelve_SetArmProcessorCapabilities();
254 |     if (!K12_enableARM_SHA3)
255 |         return KeccakP1600_opt64_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen);
256 | 
257 |     return KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen);
258 | }
259 | 
260 | /* ---------------------------------------------------------------- */
261 | /* Dispatch functions for Keccak-p[1600]×2 */
262 | /* ---------------------------------------------------------------- */
263 | 
264 | int KeccakP1600times2_IsAvailable() {  // Reports whether the two-way (times2) routines in this section will do any work.
265 |     KangarooTwelve_SetArmProcessorCapabilities();
266 |     return K12_enableARM_SHA3;  // the times2 functions below act only when this flag is set
267 | }
268 | 
269 | const char * KeccakP1600times2_GetImplementation() {
270 |     /* Label of the two-way parallel backend; empty string when the flag is off. */
271 |     KangarooTwelve_SetArmProcessorCapabilities();
272 |     if (K12_enableARM_SHA3)
273 |         return "ARMv8-A+SHA3 optimized implementation";
274 | 
275 |     return "";
276 | }
277 | 
278 | void KeccakP1600times2_Permute_12rounds(void *state) {  /* Two-way 12-round permutation; acts only when the SHA3 flag is set. */
279 |     KangarooTwelve_SetArmProcessorCapabilities();
280 |     if (!K12_enableARM_SHA3)
281 |         return;  /* no fallback in this function: state is left untouched */
282 |     KeccakP1600times2_ARMv8Asha3_Permute_12rounds(state);
283 | }
284 | 
285 | void KT128_Process2Leaves(const unsigned char *input, unsigned char *output) {  /* KT128 two-leaf routine; acts only when the SHA3 flag is set. */
286 |     KangarooTwelve_SetArmProcessorCapabilities();
287 |     if (!K12_enableARM_SHA3)
288 |         return;  /* no fallback in this function: output is not written */
289 |     KT128_ARMv8Asha3_Process2Leaves(input, output);
290 | }
291 | 
292 | void KT256_Process2Leaves(const unsigned char *input, unsigned char *output) {  /* KT256 two-leaf routine; acts only when the SHA3 flag is set. */
293 |     KangarooTwelve_SetArmProcessorCapabilities();
294 |     if (!K12_enableARM_SHA3)
295 |         return;  /* no fallback in this function: output is not written */
296 |     KT256_ARMv8Asha3_Process2Leaves(input, output);
297 | }
298 | 
299 | /* ---------------------------------------------------------------- */
300 | /* Keccak-p[1600]×4 (not available on ARM) */
301 | /* ---------------------------------------------------------------- */
302 | 
303 | int KeccakP1600times4_IsAvailable() {  // Always 0: this dispatcher provides no four-way backend.
304 |     return 0;
305 | }
306 | 
307 | const char * KeccakP1600times4_GetImplementation() {  // Always the empty string: there is no four-way backend to name.
308 |     return "";
309 | }
310 | 
311 | void KT128_Process4Leaves(const unsigned char *input, unsigned char *output) {  // Stub: does nothing and never writes to output.
312 |     (void)input;   // casts to void mark the parameters as intentionally unused
313 |     (void)output;
314 | }
315 | 
316 | void KT256_Process4Leaves(const unsigned char *input, unsigned char *output) {  // Stub: does nothing and never writes to output.
317 |     (void)input;   // casts to void mark the parameters as intentionally unused
318 |     (void)output;
319 | }
320 | 
321 | /* ---------------------------------------------------------------- */
322 | /* Keccak-p[1600]×8 (not available on ARM) */
323 | /* ---------------------------------------------------------------- */
324 | 
325 | int KeccakP1600times8_IsAvailable() {  // Always 0: this dispatcher provides no eight-way backend.
326 |     return 0;
327 | }
328 | 
329 | const char * KeccakP1600times8_GetImplementation() {  // Always the empty string: there is no eight-way backend to name.
330 |     return "";
331 | }
332 | 
333 | void KT128_Process8Leaves(const unsigned char *input, unsigned char *output) {  // Stub: does nothing and never writes to output.
334 |     (void)input;   // casts to void mark the parameters as intentionally unused
335 |     (void)output;
336 | }
337 | 
338 | void KT256_Process8Leaves(const unsigned char *input, unsigned char *output) {  // Stub: does nothing and never writes to output.
339 |     (void)input;   // casts to void mark the parameters as intentionally unused
340 |     (void)output;
341 | }
342 | 
343 | /* ---------------------------------------------------------------- */
344 | /* Optional API for disabling CPU features */
345 | /* ---------------------------------------------------------------- */
346 | 
347 | #ifdef KeccakP1600_enable_simd_options
348 | 
349 | int KangarooTwelve_DisableNeon(void) {
350 |     int wasEnabled;  /* 1 when NEON was still enabled once the request had been recorded */
351 | 
352 |     KangarooTwelve_SetArmProcessorCapabilities();
353 |     K12_NEON_requested_disabled = 1;
354 |     wasEnabled = K12_enableNEON ? 1 : 0;
355 |     if (wasEnabled)
356 |         KangarooTwelve_SetArmProcessorCapabilities();  /* run again now that the request is set */
357 |     return wasEnabled;  /* 1: NEON was disabled on this call; 0: nothing changed */
358 | }
359 | 
360 | int KangarooTwelve_DisableArmSha3(void) {
361 |     int wasEnabled;  /* 1 when ARM SHA3 was still enabled once the request had been recorded */
362 | 
363 |     KangarooTwelve_SetArmProcessorCapabilities();
364 |     K12_ARM_SHA3_requested_disabled = 1;
365 |     wasEnabled = K12_enableARM_SHA3 ? 1 : 0;
366 |     if (wasEnabled)
367 |         KangarooTwelve_SetArmProcessorCapabilities();  /* run again now that the request is set */
368 |     return wasEnabled;  /* 1: ARM SHA3 was disabled on this call; 0: nothing changed */
369 | }
370 | 
371 | void KangarooTwelve_EnableAllArmCpuFeatures(void) {  // Withdraws both disable requests, then re-runs capability setup.
372 |     K12_NEON_requested_disabled = 0;
373 |     K12_ARM_SHA3_requested_disabled = 0;
374 |     KangarooTwelve_SetArmProcessorCapabilities();  // presumably recomputes the K12_enable* flags from the cleared requests - TODO confirm
375 | }
376 | 
377 | #endif  // KeccakP1600_enable_simd_options
378 | 
379 | #endif  // !KeccakP1600_disableParallelism
380 | 


--------------------------------------------------------------------------------
/lib/Optimized64/KeccakP-1600-runtimeDispatch.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | K12 based on the eXtended Keccak Code Package (XKCP)
  3 | https://github.com/XKCP/XKCP
  4 | 
  5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
  6 | 
  7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
  8 | 
  9 | For more information, feedback or questions, please refer to the Keccak Team website:
 10 | https://keccak.team/
 11 | 
 12 | To the extent possible under law, the implementer has waived all copyright
 13 | and related or neighboring rights to the source code in this file.
 14 | http://creativecommons.org/publicdomain/zero/1.0/
 15 | 
 16 | ---
 17 | 
 18 | Please refer to the XKCP for more details.
 19 | */
 20 | 
 21 | #include <stdint.h>
 22 | #include <stdlib.h>
 23 | #include <string.h>
 24 | #include "brg_endian.h"
 25 | #include "KeccakP-1600-SnP.h"
 26 | 
 27 | #ifdef KeccakP1600_disableParallelism
 28 | #undef KeccakP1600_enable_simd_options
 29 | #else
 30 | 
 31 | // Forward declaration (the definition is near the end of this file)
 32 | void KangarooTwelve_SetProcessorCapabilities();
 33 | #ifdef KeccakP1600_enable_simd_options
 34 | int K12_SSSE3_requested_disabled = 0;   // set to 1 by KangarooTwelve_DisableSSSE3(), cleared by KangarooTwelve_EnableAllCpuFeatures()
 35 | int K12_AVX2_requested_disabled = 0;    // set to 1 by KangarooTwelve_DisableAVX2(), cleared by KangarooTwelve_EnableAllCpuFeatures()
 36 | int K12_AVX512_requested_disabled = 0;  // set to 1 by KangarooTwelve_DisableAVX512(), cleared by KangarooTwelve_EnableAllCpuFeatures()
 37 | #endif  // KeccakP1600_enable_simd_options
 38 | int K12_enableSSSE3 = 0;   // dispatch flags: all start at 0 and are written by KangarooTwelve_SetProcessorCapabilities()
 39 | int K12_enableAVX2 = 0;
 40 | int K12_enableAVX512 = 0;
 41 | 
 42 | /* ---------------------------------------------------------------- */
 43 | 
 44 | void KT128_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output);
 45 | void KT128_AVX512_Process2Leaves(const unsigned char *input, unsigned char *output);
 46 | 
 47 | void KT256_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output);
 48 | void KT256_AVX512_Process2Leaves(const unsigned char *input, unsigned char *output);
 49 | 
 50 | int KeccakP1600times2_IsAvailable()
 51 | {
 52 |     /* Non-zero when a two-way backend (AVX-512 or SSSE3) is enabled.
 53 |      * The result is the bitwise OR of the two flags exactly as stored;
 54 |      * it is not normalized to 1. */
 55 |     return K12_enableAVX512 | K12_enableSSSE3;
 56 | }
 57 | 
 58 | const char * KeccakP1600times2_GetImplementation()
 59 | {
 60 |     /* Detect first: the K12_enable* flags start at 0, so without this call the answer is "" until some state has been initialized. */
 61 |     KangarooTwelve_SetProcessorCapabilities();
 62 |     if (K12_enableAVX512)
 63 |         return "AVX-512 implementation";
 64 |     if (K12_enableSSSE3)
 65 |         return "SSSE3 implementation";
 66 |     return "";  /* no two-way backend enabled */
 67 | }
 68 | 
 69 | void KT128_Process2Leaves(const unsigned char *input, unsigned char *output)
 70 | {
 71 |     /* Select the KT128 two-leaf routine: AVX-512 first, then SSSE3. */
 72 |     void (*leaves)(const unsigned char *, unsigned char *) = NULL;
 73 |     if (K12_enableAVX512) leaves = KT128_AVX512_Process2Leaves;
 74 |     else if (K12_enableSSSE3) leaves = KT128_SSSE3_Process2Leaves;
 75 |     if (leaves != NULL) leaves(input, output);  /* neither flag set: nothing is done */
 76 | }
 77 | 
 78 | void KT256_Process2Leaves(const unsigned char *input, unsigned char *output)
 79 | {
 80 |     /* Select the KT256 two-leaf routine: AVX-512 first, then SSSE3. */
 81 |     void (*leaves)(const unsigned char *, unsigned char *) = NULL;
 82 |     if (K12_enableAVX512) leaves = KT256_AVX512_Process2Leaves;
 83 |     else if (K12_enableSSSE3) leaves = KT256_SSSE3_Process2Leaves;
 84 |     if (leaves != NULL) leaves(input, output);  /* neither flag set: nothing is done */
 85 | }
 86 | 
 87 | 
 88 | void KT128_AVX2_Process4Leaves(const unsigned char *input, unsigned char *output);
 89 | void KT128_AVX512_Process4Leaves(const unsigned char *input, unsigned char *output);
 90 | 
 91 | void KT256_AVX2_Process4Leaves(const unsigned char *input, unsigned char *output);
 92 | void KT256_AVX512_Process4Leaves(const unsigned char *input, unsigned char *output);
 93 | 
 94 | int KeccakP1600times4_IsAvailable()
 95 | {
 96 |     /* Non-zero when a four-way backend (AVX-512 or AVX2) is enabled.
 97 |      * The result is the bitwise OR of the two flags exactly as stored;
 98 |      * it is not normalized to 1. */
 99 |     return K12_enableAVX512 | K12_enableAVX2;
100 | }
101 | 
102 | const char * KeccakP1600times4_GetImplementation()
103 | {
104 |     /* Detect first: the K12_enable* flags start at 0, so without this call the answer is "" until some state has been initialized. */
105 |     KangarooTwelve_SetProcessorCapabilities();
106 |     if (K12_enableAVX512)
107 |         return "AVX-512 implementation";
108 |     if (K12_enableAVX2)
109 |         return "AVX2 implementation";
110 |     return "";  /* no four-way backend enabled */
111 | }
112 | 
113 | void KT128_Process4Leaves(const unsigned char *input, unsigned char *output)
114 | {
115 |     /* Select the KT128 four-leaf routine: AVX-512 first, then AVX2. */
116 |     void (*leaves)(const unsigned char *, unsigned char *) = NULL;
117 |     if (K12_enableAVX512) leaves = KT128_AVX512_Process4Leaves;
118 |     else if (K12_enableAVX2) leaves = KT128_AVX2_Process4Leaves;
119 |     if (leaves != NULL) leaves(input, output);  /* neither flag set: nothing is done */
120 | }
121 | 
122 | void KT256_Process4Leaves(const unsigned char *input, unsigned char *output)
123 | {
124 |     /* Select the KT256 four-leaf routine: AVX-512 first, then AVX2. */
125 |     void (*leaves)(const unsigned char *, unsigned char *) = NULL;
126 |     if (K12_enableAVX512) leaves = KT256_AVX512_Process4Leaves;
127 |     else if (K12_enableAVX2) leaves = KT256_AVX2_Process4Leaves;
128 |     if (leaves != NULL) leaves(input, output);  /* neither flag set: nothing is done */
129 | }
130 | 
131 | void KT128_AVX512_Process8Leaves(const unsigned char *input, unsigned char *output);
132 | 
133 | void KT256_AVX512_Process8Leaves(const unsigned char *input, unsigned char *output);
134 | 
135 | int KeccakP1600times8_IsAvailable()
136 | {
137 |     /* Only AVX-512 provides an eight-way backend, so the answer is
138 |      * that flag's value exactly as stored. */
139 |     return K12_enableAVX512;
140 | }
141 | 
142 | const char * KeccakP1600times8_GetImplementation()
143 | {
144 |     /* Detect first: the K12_enable* flags start at 0, so without this call
145 |      * the answer is "" until some state has been initialized. */
146 |     KangarooTwelve_SetProcessorCapabilities();
147 | 
148 |     return K12_enableAVX512 ? "AVX-512 implementation" : "";
149 | }
150 | 
151 | void KT128_Process8Leaves(const unsigned char *input, unsigned char *output)
152 | {
153 |     if (!K12_enableAVX512) return;  /* AVX-512 is the only eight-leaf backend; otherwise nothing is done */
154 |     KT128_AVX512_Process8Leaves(input, output);
155 | }
156 | 
157 | void KT256_Process8Leaves(const unsigned char *input, unsigned char *output)
158 | {
159 |     if (!K12_enableAVX512) return;  /* AVX-512 is the only eight-leaf backend; otherwise nothing is done */
160 |     KT256_AVX512_Process8Leaves(input, output);
161 | }
162 | 
163 | #endif  // KeccakP1600_disableParallelism
164 | 
165 | const char * KeccakP1600_GetImplementation()
166 | {
167 |     /* Detect first: the K12_enable* flags start at 0, so without this call the answer is "generic" until some state has been initialized. */
168 |     KangarooTwelve_SetProcessorCapabilities();
169 |     if (K12_enableAVX512)
170 |         return "AVX-512 implementation";
171 | #ifndef KeccakP1600_noAssembly
172 |     if (K12_enableAVX2)  /* the AVX2 backend exists only in assembly builds */
173 |         return "AVX2 implementation";
174 | #endif
175 |     return "generic 64-bit implementation";
176 | }
177 | 
178 | void KeccakP1600_Initialize(void *state)
179 | {   /* Initializes state with the enabled backend: AVX-512, else AVX2 (unless KeccakP1600_noAssembly), else opt64. */
180 |     KangarooTwelve_SetProcessorCapabilities();  /* the only single-state entry point that runs detection */
181 |     if (K12_enableAVX512)
182 |         KeccakP1600_AVX512_Initialize(state);
183 |     else
184 | #ifndef KeccakP1600_noAssembly
185 |     if (K12_enableAVX2)
186 |         KeccakP1600_AVX2_Initialize(state);
187 |     else
188 | #endif
189 |         KeccakP1600_opt64_Initialize(state);  /* binds to the nearest `else` left after preprocessing */
190 | }
191 | 
192 | void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset)
193 | {   /* Adds one byte at position offset using the enabled backend; reads the flags as they are, without running detection. */
194 |     if (K12_enableAVX512)
195 |         ((unsigned char*)(state))[offset] ^= data;  /* done inline: XOR into byte `offset` of the state buffer */
196 |     else
197 | #ifndef KeccakP1600_noAssembly
198 |     if (K12_enableAVX2)
199 |         KeccakP1600_AVX2_AddByte(state, data, offset);
200 |     else
201 | #endif
202 |         KeccakP1600_opt64_AddByte(state, data, offset);  /* binds to the nearest `else` left after preprocessing */
203 | }
204 | 
205 | void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
206 | {   /* Forwards all arguments to the AddBytes of the enabled backend; reads the flags as they are, without running detection. */
207 |     if (K12_enableAVX512)
208 |         KeccakP1600_AVX512_AddBytes(state, data, offset, length);
209 |     else
210 | #ifndef KeccakP1600_noAssembly
211 |     if (K12_enableAVX2)
212 |         KeccakP1600_AVX2_AddBytes(state, data, offset, length);
213 |     else
214 | #endif
215 |         KeccakP1600_opt64_AddBytes(state, data, offset, length);  /* binds to the nearest `else` left after preprocessing */
216 | }
217 | 
218 | void KeccakP1600_Permute_12rounds(void *state)
219 | {   /* Applies the 12-round permutation with the enabled backend; reads the flags as they are, without running detection. */
220 |     if (K12_enableAVX512)
221 |         KeccakP1600_AVX512_Permute_12rounds(state);
222 |     else
223 | #ifndef KeccakP1600_noAssembly
224 |     if (K12_enableAVX2)
225 |         KeccakP1600_AVX2_Permute_12rounds(state);
226 |     else
227 | #endif
228 |         KeccakP1600_opt64_Permute_12rounds(state);  /* binds to the nearest `else` left after preprocessing */
229 | }
230 | 
231 | void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length)
232 | {   /* Forwards all arguments to the ExtractBytes of the enabled backend; reads the flags as they are, without running detection. */
233 |     if (K12_enableAVX512)
234 |         KeccakP1600_AVX512_ExtractBytes(state, data, offset, length);
235 |     else
236 | #ifndef KeccakP1600_noAssembly
237 |     if (K12_enableAVX2)
238 |         KeccakP1600_AVX2_ExtractBytes(state, data, offset, length);
239 |     else
240 | #endif
241 |         KeccakP1600_opt64_ExtractBytes(state, data, offset, length);  /* binds to the nearest `else` left after preprocessing */
242 | }
243 | 
244 | size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen)
245 | {   /* Returns whatever the enabled backend's fast absorb loop returns; reads the flags as they are, without running detection. */
246 |     if (K12_enableAVX512)
247 |         return KeccakP1600_AVX512_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen);
248 |     else
249 | #ifndef KeccakP1600_noAssembly
250 |     if (K12_enableAVX2)
251 |         return KeccakP1600_AVX2_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen);
252 |     else
253 | #endif
254 |         return KeccakP1600_opt64_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen);  /* binds to the nearest `else` left after preprocessing */
255 | }
256 | 
257 | /* ---------------------------------------------------------------- */
258 | 
259 | /* Processor capability detection code by Samuel Neves and Jack O'Connor, see
260 |  * https://github.com/BLAKE3-team/BLAKE3/blob/master/c/blake3_dispatch.c
261 |  */
262 | 
263 | #if defined(__x86_64__) || defined(_M_X64)
264 | #define IS_X86
265 | #define IS_X86_64
266 | #endif
267 | 
268 | #if defined(__i386__) || defined(_M_IX86)
269 | #define IS_X86
270 | #define IS_X86_32
271 | #endif
272 | 
273 | #if defined(IS_X86)
274 | static uint64_t xgetbv() {  /* Runs XGETBV with index 0 and returns the result as a single 64-bit value. */
275 | #if defined(_MSC_VER)
276 |   return _xgetbv(0);  /* compiler intrinsic on MSVC */
277 | #else
278 |   uint32_t eax = 0, edx = 0;
279 |   __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));  /* ECX = 0 in; low half in EAX, high half in EDX out */
280 |   return ((uint64_t)edx << 32) | eax;
281 | #endif
282 | }
283 | 
284 | static void cpuid(uint32_t out[4], uint32_t id) {  /* Runs CPUID for leaf `id`; out[0..3] receive EAX, EBX, ECX, EDX. */
285 | #if defined(_MSC_VER)
286 |   __cpuid((int *)out, id);
287 | #elif defined(__i386__) || defined(_M_IX86)
288 |   __asm__ __volatile__("movl %%ebx, %1\n"  /* 32-bit build: EBX is swapped through a scratch register instead of being an output (presumably because it may be reserved, e.g. for PIC - TODO confirm) */
289 |                        "cpuid\n"
290 |                        "xchgl %1, %%ebx\n"  /* scratch operand gets CPUID's EBX, EBX gets its saved value back */
291 |                        : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
292 |                        : "a"(id));
293 | #else
294 |   __asm__ __volatile__("cpuid\n"
295 |                        : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
296 |                        : "a"(id));
297 | #endif
298 | }
299 | 
300 | static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {  /* Like cpuid(), but also passes sub-leaf `sid` in ECX. */
301 | #if defined(_MSC_VER)
302 |   __cpuidex((int *)out, id, sid);
303 | #elif defined(__i386__) || defined(_M_IX86)
304 |   __asm__ __volatile__("movl %%ebx, %1\n"  /* same EBX save/restore sequence as in cpuid() */
305 |                        "cpuid\n"
306 |                        "xchgl %1, %%ebx\n"
307 |                        : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
308 |                        : "a"(id), "c"(sid));
309 | #else
310 |   __asm__ __volatile__("cpuid\n"
311 |                        : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
312 |                        : "a"(id), "c"(sid));
313 | #endif
314 | }
315 | 
316 | #endif
317 | 
318 | enum cpu_feature {  /* One bit per instruction-set extension reported by get_cpu_features(). */
319 |   SSE2 = 1 << 0,
320 |   SSSE3 = 1 << 1,
321 |   SSE41 = 1 << 2,
322 |   AVX = 1 << 3,
323 |   AVX2 = 1 << 4,
324 |   AVX512F = 1 << 5,
325 |   AVX512VL = 1 << 6,
326 |   /* ... */
327 |   UNDEFINED = 1 << 30  /* sentinel meaning "detection has not stored a result yet" */
328 | };
329 | 
330 | static enum cpu_feature g_cpu_features = UNDEFINED;  /* cached detection result; written only on the x86 path of get_cpu_features() */
331 | 
332 | static enum cpu_feature
333 |     get_cpu_features(void) {
334 |   /* Returns the detected feature bit set. On x86 the first result is cached in g_cpu_features; elsewhere 0 is returned and nothing is cached. */
335 |   if (g_cpu_features != UNDEFINED) {
336 |     return g_cpu_features;
337 |   } else {
338 | #if defined(IS_X86)
339 |     uint32_t regs[4] = {0};
340 |     uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
341 |     (void)edx;  /* edx is not read on the 64-bit branch below */
342 |     enum cpu_feature features = 0;
343 |     cpuid(regs, 0);
344 |     const int max_id = *eax;  /* EAX of leaf 0; compared against 7 before leaf 7 is queried */
345 |     cpuid(regs, 1);
346 | #if defined(__amd64__) || defined(_M_X64)
347 |     features |= SSE2;  /* set unconditionally on 64-bit builds */
348 | #else
349 |     if (*edx & (1UL << 26))
350 |       features |= SSE2;
351 | #endif
352 |     if (*ecx & (1UL << 9))
353 |       features |= SSSE3;
354 |     if (*ecx & (1UL << 19))
355 |       features |= SSE41;
356 | 
357 |     if (*ecx & (1UL << 27)) { // OSXSAVE
358 |       const uint64_t mask = xgetbv();
359 |       if ((mask & 6) == 6) { // SSE and AVX states
360 |         if (*ecx & (1UL << 28))
361 |           features |= AVX;
362 |         if (max_id >= 7) {
363 |           cpuidex(regs, 7, 0);  /* regs now hold leaf 7, sub-leaf 0; the leaf-1 values are overwritten */
364 |           if (*ebx & (1UL << 5))
365 |             features |= AVX2;
366 |           if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
367 |             if (*ebx & (1UL << 31))
368 |               features |= AVX512VL;
369 |             if (*ebx & (1UL << 16))
370 |               features |= AVX512F;
371 |           }
372 |         }
373 |       }
374 |     }
375 |     g_cpu_features = features;  /* later calls take the early return at the top */
376 |     return features;
377 | #else
378 |     /* How to detect NEON? (open question: no detection is implemented on non-x86 targets) */
379 |     return 0;
380 | #endif
381 |   }
382 | }
383 | 
384 | void KangarooTwelve_SetProcessorCapabilities()
385 | {   /* Derives the K12_enable* dispatch flags from the detected CPU features and any pending disable requests. */
386 |     const enum cpu_feature features = get_cpu_features();
387 |     int ssse3 = (features & SSSE3) != 0, avx2 = (features & AVX2) != 0;  /* built in locals, normalized to 0/1 */
388 |     int avx512 = ((features & AVX512F) != 0) && ((features & AVX512VL) != 0);  /* needs both F and VL */
389 | #ifdef KeccakP1600_enable_simd_options
390 |     ssse3 = ssse3 && !K12_SSSE3_requested_disabled;
391 |     avx2 = avx2 && !K12_AVX2_requested_disabled;
392 |     avx512 = avx512 && !K12_AVX512_requested_disabled;
393 | #endif  // KeccakP1600_enable_simd_options
394 |     K12_enableSSSE3 = ssse3; K12_enableAVX2 = avx2; K12_enableAVX512 = avx512;  /* one store per flag: a disabled feature is never briefly published as enabled */
395 | }
396 | 
397 | #ifdef KeccakP1600_enable_simd_options
398 | int KangarooTwelve_DisableSSSE3(void) {
399 |     int wasEnabled;  /* 1 when SSSE3 was enabled before this request took effect */
400 | 
401 |     KangarooTwelve_SetProcessorCapabilities();
402 |     K12_SSSE3_requested_disabled = 1;
403 |     wasEnabled = K12_enableSSSE3 ? 1 : 0;
404 |     if (wasEnabled)
405 |         KangarooTwelve_SetProcessorCapabilities();  /* recompute so the request is applied */
406 |     return wasEnabled;  /* 1: SSSE3 was disabled on this call; 0: nothing changed */
407 | }
408 | 
409 | int KangarooTwelve_DisableAVX2(void) {
410 |     int wasEnabled;  /* 1 when AVX2 was enabled before this request took effect */
411 | 
412 |     KangarooTwelve_SetProcessorCapabilities();
413 |     K12_AVX2_requested_disabled = 1;
414 |     wasEnabled = K12_enableAVX2 ? 1 : 0;
415 |     if (wasEnabled)
416 |         KangarooTwelve_SetProcessorCapabilities();  /* recompute so the request is applied */
417 |     return wasEnabled;  /* 1: AVX2 was disabled on this call; 0: nothing changed */
418 | }
419 | 
420 | int KangarooTwelve_DisableAVX512(void) {
421 |     int wasEnabled;  /* 1 when AVX512 was enabled before this request took effect */
422 | 
423 |     KangarooTwelve_SetProcessorCapabilities();
424 |     K12_AVX512_requested_disabled = 1;
425 |     wasEnabled = K12_enableAVX512 ? 1 : 0;
426 |     if (wasEnabled)
427 |         KangarooTwelve_SetProcessorCapabilities();  /* recompute so the request is applied */
428 |     return wasEnabled;  /* 1: AVX512 was disabled on this call; 0: nothing changed */
429 | }
430 | 
431 | void KangarooTwelve_EnableAllCpuFeatures(void) {
432 |     /* Withdraw every pending disable request, then recompute the dispatch flags. */
433 |     K12_AVX512_requested_disabled = 0;
434 |     K12_SSSE3_requested_disabled = K12_AVX2_requested_disabled = 0;
435 |     KangarooTwelve_SetProcessorCapabilities();
436 | }
437 | #endif  // KeccakP1600_enable_simd_options
438 | 


--------------------------------------------------------------------------------
/lib/KangarooTwelve-threading.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | K12 based on the eXtended Keccak Code Package (XKCP)
  3 | https://github.com/XKCP/XKCP
  4 | 
  5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier.
  6 | 
  7 | Threading support implementation using portable thread pool abstraction.
  8 | 
  9 | To the extent possible under law, the implementer has waived all copyright
 10 | and related or neighboring rights to the source code in this file.
 11 | http://creativecommons.org/publicdomain/zero/1.0/
 12 | */
 13 | 
 14 | #include "KangarooTwelve-threading.h"
 15 | #include "KangarooTwelve.h"
 16 | #include "KT-threadpool.h"
 17 | #include "KeccakP-1600-SnP.h"
 18 | #include <stdlib.h>
 19 | #include <string.h>
 20 | 
 21 | /* Constants from KangarooTwelve.c */
 22 | #define K12_chunkSize       8192
 23 | #define K12_suffixLeaf      0x0B
 24 | #define KT128_capacityInBytes   32
 25 | #define KT256_capacityInBytes   64
 26 | 
 27 | /* Thread pool configuration */
 28 | #define MAX_THREADS 64
 29 | #define MIN_CHUNKS_PER_THREAD 8
 30 | 
 31 | /* Batch-based work distribution */
 32 | #define CHUNKS_PER_BATCH 32      /* 32 chunks = 256 KB per batch - matches Zig */
 33 | #define MAX_BATCHES 256          /* Maximum concurrent batches */
 34 | 
 35 | /* SIMD leaf processing functions (from KeccakP-1600-runtimeDispatch.c) */
 36 | #ifndef KeccakP1600_disableParallelism
 37 | void KT128_Process2Leaves(const unsigned char *input, unsigned char *output);
 38 | void KT128_Process4Leaves(const unsigned char *input, unsigned char *output);
 39 | void KT128_Process8Leaves(const unsigned char *input, unsigned char *output);
 40 | void KT256_Process2Leaves(const unsigned char *input, unsigned char *output);
 41 | void KT256_Process4Leaves(const unsigned char *input, unsigned char *output);
 42 | void KT256_Process8Leaves(const unsigned char *input, unsigned char *output);
 43 | int KeccakP1600times2_IsAvailable(void);
 44 | int KeccakP1600times4_IsAvailable(void);
 45 | int KeccakP1600times8_IsAvailable(void);
 46 | #endif
 47 | 
 48 | /* Work item for chunk processing */
 49 | typedef struct {
 50 |     const unsigned char *input;   /* base of the chunk data; chunk k is read at input + k * K12_chunkSize */
 51 |     size_t start_chunk;           /* first chunk index of this job */
 52 |     size_t end_chunk;             /* one past the last chunk index (count = end_chunk - start_chunk) */
 53 |     unsigned char *output;        /* base of the results; chunk k is written at output + k * capacity_bytes */
 54 |     int security_level;           /* 128 takes the KT128 path; any other value takes the KT256 path */
 55 |     int capacity_bytes;           /* bytes produced per chunk */
 56 | } ChunkWork;
 57 | 
 58 | /* TurboSHAKE instance for local use */
 59 | typedef struct {
 60 |     uint8_t state[KeccakP1600_stateSizeInBytes];  /* Keccak-p[1600] state passed to the KeccakP1600_* calls */
 61 |     unsigned int rate;        /* rate in bits (set to 1600 - capacity; used as rate/8 bytes and rate/64 lanes) */
 62 |     uint8_t byteIOIndex;      /* byte position inside the current rate block */
 63 |     uint8_t squeezing;        /* 0 while absorbing; set to 1 once the domain-separation byte has been absorbed */
 64 | } TurboSHAKE_Instance_Local;
 65 | 
 66 | /* Forward declarations */
 67 | static void process_chunk_range(void *work_ptr);
 68 | static void TurboSHAKE_Initialize_Local(TurboSHAKE_Instance_Local *instance, unsigned int capacity);
 69 | static void TurboSHAKE_Absorb_Local(TurboSHAKE_Instance_Local *instance, const unsigned char *data, size_t dataByteLen);
 70 | static void TurboSHAKE_AbsorbDomainSeparationByte_Local(TurboSHAKE_Instance_Local *instance, unsigned char D);
 71 | static void TurboSHAKE_Squeeze_Local(TurboSHAKE_Instance_Local *instance, unsigned char *data, size_t dataByteLen);
 72 | 
 73 | /* TurboSHAKE helper functions for thread-local processing */
 74 | static void TurboSHAKE_Initialize_Local(TurboSHAKE_Instance_Local *instance, unsigned int capacity)
 75 | {   /* Prepares a fresh instance in absorbing mode; capacity is given in bits. */
 76 |     instance->squeezing = 0;
 77 |     instance->byteIOIndex = 0;
 78 |     instance->rate = 1600 - capacity;  /* the bits of the 1600-bit state not reserved as capacity */
 79 |     KeccakP1600_Initialize(instance->state);
 80 | }
 81 | 
 82 | static void TurboSHAKE_Absorb_Local(TurboSHAKE_Instance_Local *instance, const unsigned char *data, size_t dataByteLen)
 83 | {   /* Absorbs dataByteLen bytes of data into instance->state, permuting each time a full rate block has been added. */
 84 |     size_t i, j;  /* i: bytes consumed so far; j: scratch (fast-loop byte count, then bytes left) */
 85 |     uint8_t partialBlock;
 86 |     const unsigned char *curData;
 87 |     const uint8_t rateInBytes = instance->rate/8;  /* instance->rate is in bits */
 88 | 
 89 |     i = 0;
 90 |     curData = data;
 91 |     while(i < dataByteLen) {
 92 |         if ((instance->byteIOIndex == 0) && (dataByteLen-i >= rateInBytes)) {  /* at a block boundary with at least one whole block available */
 93 | #ifdef KeccakP1600_12rounds_FastLoop_supported
 94 |             j = KeccakP1600_12rounds_FastLoop_Absorb(instance->state, instance->rate/64, curData, dataByteLen - i);  /* rate/64 is the lane count; j is used as the number of bytes consumed */
 95 |             i += j;
 96 |             curData += j;
 97 | #endif
 98 |             for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) {  /* whole blocks not taken by the fast loop (all of them when it is compiled out) */
 99 |                 KeccakP1600_AddBytes(instance->state, curData, 0, rateInBytes);
100 |                 KeccakP1600_Permute_12rounds(instance->state);
101 |                 curData+=rateInBytes;
102 |             }
103 |             i = dataByteLen - j;  /* j is now the tail length (< rateInBytes) */
104 |         } else {
105 |             if (dataByteLen - i > (size_t)rateInBytes - instance->byteIOIndex) {  /* more input than room left in the current block */
106 |                 partialBlock = rateInBytes-instance->byteIOIndex;
107 |             } else {
108 |                 partialBlock = (uint8_t)(dataByteLen - i);
109 |             }
110 |             i += partialBlock;
111 | 
112 |             KeccakP1600_AddBytes(instance->state, curData, instance->byteIOIndex, partialBlock);
113 |             curData += partialBlock;
114 |             instance->byteIOIndex += partialBlock;
115 |             if (instance->byteIOIndex == rateInBytes) {  /* block filled: permute and start a new one */
116 |                 KeccakP1600_Permute_12rounds(instance->state);
117 |                 instance->byteIOIndex = 0;
118 |             }
119 |         }
120 |     }
121 | }
122 | 
123 | static void TurboSHAKE_AbsorbDomainSeparationByte_Local(TurboSHAKE_Instance_Local *instance, unsigned char D)
124 | {   /* Adds suffix byte D and the closing 0x80 byte, permutes, and switches the instance to squeezing mode. */
125 |     const unsigned int lastRateByte = instance->rate/8 - 1;  /* index of the final byte of a rate block */
126 | 
127 |     KeccakP1600_AddByte(instance->state, D, instance->byteIOIndex);
128 |     if ((instance->byteIOIndex == lastRateByte) && (D >= 0x80))  /* extra permutation when D sits in the last byte with its top bit set */
129 |         KeccakP1600_Permute_12rounds(instance->state);
130 |     KeccakP1600_AddByte(instance->state, 0x80, lastRateByte);
131 |     KeccakP1600_Permute_12rounds(instance->state);
132 |     instance->byteIOIndex = 0;
133 |     instance->squeezing = 1;
134 | }
135 | 
136 | static void TurboSHAKE_Squeeze_Local(TurboSHAKE_Instance_Local *instance, unsigned char *data, size_t dataByteLen)
137 | {   /* Writes dataByteLen output bytes to data; switches the instance to squeezing mode first if that has not happened yet. */
138 |     size_t i, j;  /* i: bytes produced so far; j: bytes left inside the whole-block loop */
139 |     unsigned int partialBlock;
140 |     const unsigned int rateInBytes = instance->rate/8;  /* instance->rate is in bits */
141 |     unsigned char *curData;
142 | 
143 |     if (!instance->squeezing)
144 |         TurboSHAKE_AbsorbDomainSeparationByte_Local(instance, 0x01);  /* not finalized by the caller: close the input with suffix byte 0x01 */
145 | 
146 |     i = 0;
147 |     curData = data;
148 |     while(i < dataByteLen) {
149 |         if ((instance->byteIOIndex == rateInBytes) && (dataByteLen-i >= rateInBytes)) {  /* current block used up and at least one whole block still wanted */
150 |             for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) {
151 |                 KeccakP1600_Permute_12rounds(instance->state);
152 |                 KeccakP1600_ExtractBytes(instance->state, curData, 0, rateInBytes);
153 |                 curData+=rateInBytes;
154 |             }
155 |             i = dataByteLen - j;  /* j is now the tail length (< rateInBytes) */
156 |         } else {
157 |             if (instance->byteIOIndex == rateInBytes) {  /* block used up: permute before reading further */
158 |                 KeccakP1600_Permute_12rounds(instance->state);
159 |                 instance->byteIOIndex = 0;
160 |             }
161 |             if (dataByteLen-i > rateInBytes-instance->byteIOIndex)  /* more wanted than the current block still holds */
162 |                 partialBlock = rateInBytes-instance->byteIOIndex;
163 |             else
164 |                 partialBlock = (unsigned int)(dataByteLen - i);
165 |             i += partialBlock;
166 | 
167 |             KeccakP1600_ExtractBytes(instance->state, curData, instance->byteIOIndex, partialBlock);
168 |             curData += partialBlock;
169 |             instance->byteIOIndex += partialBlock;
170 |         }
171 |     }
172 | }
173 | 
174 | /* Scalar path: hashes exactly one chunk with a local TurboSHAKE instance */
175 | static void process_single_chunk(const unsigned char *input, unsigned char *output,
176 |                                  int security_level, int capacity_bytes)
177 | {
178 |     /* The sponge capacity is 2 * security_level bits. K12_chunkSize bytes
179 |      * are read from input, the leaf suffix closes the message, and
180 |      * capacity_bytes bytes are written to output. */
181 |     TurboSHAKE_Instance_Local leaf;
182 |     const unsigned int capacityBits = 2 * security_level;
183 |     const size_t resultLen = capacity_bytes;
184 | 
185 |     TurboSHAKE_Initialize_Local(&leaf, capacityBits);
186 |     TurboSHAKE_Absorb_Local(&leaf, input, K12_chunkSize);
187 |     TurboSHAKE_AbsorbDomainSeparationByte_Local(&leaf, K12_suffixLeaf);
188 |     /* The call above already put the instance in squeezing mode, so the
189 |      * squeeze below only extracts bytes. */
190 |     TurboSHAKE_Squeeze_Local(&leaf, output, resultLen);
191 | }
192 | 
193 | /* Thread-pool job: hashes chunks [start_chunk, end_chunk) of a ChunkWork, widest available SIMD routine first, then one chunk at a time */
194 | static void process_chunk_range(void *work_ptr)
195 | {
196 |     ChunkWork *work = (ChunkWork *)work_ptr;  /* the job argument is a ChunkWork */
197 |     const unsigned char *chunk_ptr = work->input + (work->start_chunk * K12_chunkSize);  /* first chunk of this range */
198 |     unsigned char *output_ptr = work->output + (work->start_chunk * work->capacity_bytes);  /* result slot of that chunk */
199 |     size_t chunks_remaining = work->end_chunk - work->start_chunk;
200 | 
201 | #ifndef KeccakP1600_disableParallelism
202 |     /* Use SIMD parallelism when available - process 8/4/2 chunks at a time; each stage leaves fewer chunks than its width for the next one */
203 |     if (work->security_level == 128) {
204 |         /* KT128 mode: the output advances by KT128_capacityInBytes per chunk */
205 |         if (KeccakP1600times8_IsAvailable()) {
206 |             while (chunks_remaining >= 8) {
207 |                 KT128_Process8Leaves(chunk_ptr, output_ptr);
208 |                 chunk_ptr += 8 * K12_chunkSize;
209 |                 output_ptr += 8 * KT128_capacityInBytes;
210 |                 chunks_remaining -= 8;
211 |             }
212 |         }
213 | 
214 |         if (KeccakP1600times4_IsAvailable()) {
215 |             while (chunks_remaining >= 4) {
216 |                 KT128_Process4Leaves(chunk_ptr, output_ptr);
217 |                 chunk_ptr += 4 * K12_chunkSize;
218 |                 output_ptr += 4 * KT128_capacityInBytes;
219 |                 chunks_remaining -= 4;
220 |             }
221 |         }
222 | 
223 |         if (KeccakP1600times2_IsAvailable()) {
224 |             while (chunks_remaining >= 2) {
225 |                 KT128_Process2Leaves(chunk_ptr, output_ptr);
226 |                 chunk_ptr += 2 * K12_chunkSize;
227 |                 output_ptr += 2 * KT128_capacityInBytes;
228 |                 chunks_remaining -= 2;
229 |             }
230 |         }
231 |     } else {
232 |         /* KT256 mode (any security_level other than 128): the output advances by KT256_capacityInBytes per chunk */
233 |         if (KeccakP1600times8_IsAvailable()) {
234 |             while (chunks_remaining >= 8) {
235 |                 KT256_Process8Leaves(chunk_ptr, output_ptr);
236 |                 chunk_ptr += 8 * K12_chunkSize;
237 |                 output_ptr += 8 * KT256_capacityInBytes;
238 |                 chunks_remaining -= 8;
239 |             }
240 |         }
241 | 
242 |         if (KeccakP1600times4_IsAvailable()) {
243 |             while (chunks_remaining >= 4) {
244 |                 KT256_Process4Leaves(chunk_ptr, output_ptr);
245 |                 chunk_ptr += 4 * K12_chunkSize;
246 |                 output_ptr += 4 * KT256_capacityInBytes;
247 |                 chunks_remaining -= 4;
248 |             }
249 |         }
250 | 
251 |         if (KeccakP1600times2_IsAvailable()) {
252 |             while (chunks_remaining >= 2) {
253 |                 KT256_Process2Leaves(chunk_ptr, output_ptr);
254 |                 chunk_ptr += 2 * K12_chunkSize;
255 |                 output_ptr += 2 * KT256_capacityInBytes;
256 |                 chunks_remaining -= 2;
257 |             }
258 |         }
259 |     }
260 | #endif  /* KeccakP1600_disableParallelism */
261 | 
262 |     /* Process any remaining chunks with scalar code (all of them when no SIMD routine is available or parallelism is compiled out) */
263 |     while (chunks_remaining > 0) {
264 |         process_single_chunk(chunk_ptr, output_ptr,
265 |                             work->security_level, work->capacity_bytes);
266 |         chunk_ptr += K12_chunkSize;
267 |         output_ptr += work->capacity_bytes;  /* the scalar stride comes from the work item, not from the KT128/KT256 constants */
268 |         chunks_remaining--;
269 |     }
270 | }
271 | 
272 | /* Main function to process chunks in parallel
273 |  *
274 |  * Each batch is CHUNKS_PER_BATCH chunks (256 KB), which:
275 |  * - Is large enough to amortize task scheduling overhead
276 |  * - Is small enough to allow good load balancing via work-stealing
277 |  */
278 | int KT_ProcessChunksThreaded(const KT_ThreadPool_API* threadpool_api,
279 |                              void* threadpool_handle,
280 |                              int thread_count,
281 |                              const unsigned char *input,
282 |                              size_t chunkCount,
283 |                              unsigned char *output,
284 |                              int securityLevel)
285 | {
286 |     if (chunkCount == 0)
287 |         return 1;
288 | 
289 |     /* No threading configured - should not happen, but handle gracefully */
290 |     if (!threadpool_api || !threadpool_handle || thread_count < 1)
291 |         return 1;
292 | 
293 |     /* Determine capacity in bytes */
294 |     int capacity_bytes = (securityLevel == 128) ? KT128_capacityInBytes : KT256_capacityInBytes;
295 | 
296 |     /* Calculate number of batches needed */
297 |     size_t num_batches = (chunkCount + CHUNKS_PER_BATCH - 1) / CHUNKS_PER_BATCH;
298 | 
299 |     /* If only one batch worth of work, process sequentially */
300 |     if (num_batches <= 1) {
301 |         ChunkWork work;
302 |         work.input = input;
303 |         work.start_chunk = 0;
304 |         work.end_chunk = chunkCount;
305 |         work.output = output;
306 |         work.security_level = securityLevel;
307 |         work.capacity_bytes = capacity_bytes;
308 |         process_chunk_range(&work);
309 |         return 0;
310 |     }
311 | 
312 |     /* Cap number of batches to avoid excessive overhead */
313 |     if (num_batches > MAX_BATCHES)
314 |         num_batches = MAX_BATCHES;
315 | 
316 |     /* Allocate work items for batch processing */
317 |     ChunkWork* work_items = (ChunkWork*)malloc(num_batches * sizeof(ChunkWork));
318 |     if (!work_items)
319 |         return 1;
320 | 
321 |     /* Distribute chunks evenly across batches
322 |      * When batch count is capped, ensure balanced distribution
323 |      * instead of giving all remaining chunks to the last batch.
324 |      */
325 |     size_t chunks_per_batch = chunkCount / num_batches;
326 |     size_t extra_chunks = chunkCount % num_batches;
327 |     size_t current_chunk = 0;
328 | 
329 |     for (size_t i = 0; i < num_batches; i++) {
330 |         /* First 'extra_chunks' batches get one additional chunk */
331 |         size_t batch_chunks = chunks_per_batch + (i < extra_chunks ? 1 : 0);
332 | 
333 |         work_items[i].input = input;
334 |         work_items[i].start_chunk = current_chunk;
335 |         work_items[i].end_chunk = current_chunk + batch_chunks;
336 |         work_items[i].output = output;
337 |         work_items[i].security_level = securityLevel;
338 |         work_items[i].capacity_bytes = capacity_bytes;
339 | 
340 |         /* Submit batch to thread pool */
341 |         if (threadpool_api->submit(threadpool_handle, process_chunk_range,
342 |                                    &work_items[i]) != 0) {
343 |             free(work_items);
344 |             return 1;
345 |         }
346 | 
347 |         current_chunk += batch_chunks;
348 |     }
349 | 
350 |     /* Wait for all batches to complete */
351 |     threadpool_api->wait_all(threadpool_handle);
352 | 
353 |     free(work_items);
354 |     return 0;
355 | }
356 | 


--------------------------------------------------------------------------------
/tests/testKangarooTwelve.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | K12 based on the eXtended Keccak Code Package (XKCP)
  3 | https://github.com/XKCP/XKCP
  4 | 
  5 | KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier.
  6 | 
  7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
  8 | 
  9 | For more information, feedback or questions, please refer to the Keccak Team website:
 10 | https://keccak.team/
 11 | 
 12 | To the extent possible under law, the implementer has waived all copyright
 13 | and related or neighboring rights to the source code in this file.
 14 | http://creativecommons.org/publicdomain/zero/1.0/
 15 | */
 16 | 
 17 | #include "KangarooTwelve.h"
 18 | #include "KeccakP-1600-SnP.h"
 19 | 
 20 | /* #define OUTPUT */
 21 | /* #define VERBOSE */
 22 | 
 23 | #define SnP_width               1600
 24 | #define inputByteSize           (80*1024)
 25 | #define outputByteSize          256
 26 | #define customizationByteSize   32
 27 | #define checksumByteSize        16
 28 | #define cChunkSize              8192
 29 | 
 30 | #if !defined(__x86_64__) && !defined(_M_X64) && !defined(__i386__) && !defined(_M_IX86)
 31 | #undef KeccakP1600_enable_simd_options
 32 | #endif
 33 | 
 34 | #if (defined(OUTPUT) || defined(VERBOSE) || !defined(EMBEDDED))
 35 | #include <stdio.h>
 36 | #endif
 37 | #include <stdint.h>
 38 | #include <stdlib.h>
 39 | #include <string.h>
 40 | 
 41 | #if defined(EMBEDDED)
 42 | static void assert(int condition)
 43 | {
 44 |     if (!condition)
 45 |     {
 46 |         for ( ; ; ) ;
 47 |     }
 48 | }
 49 | #else
 50 | #include <assert.h>
 51 | #endif
 52 | 
 53 | static void generateSimpleRawMaterial(unsigned char* data, unsigned int length, unsigned char seed1, unsigned int seed2)
 54 | {
 55 |     unsigned int i;
 56 | 
 57 |     for(i=0; i<length; i++) {
 58 |         unsigned char iRolled;
 59 |         unsigned char byte;
 60 |         seed2 = seed2 % 8;
 61 |         iRolled = ((unsigned char)i << seed2) | ((unsigned char)i >> (8-seed2));
 62 |         byte = seed1 + 161*length - iRolled + i;
 63 |         data[i] = byte;
 64 |     }
 65 | }
 66 | 
 67 | static void performTestKangarooTwelveOneInput(int securityLevel, unsigned int inputLen, unsigned int outputLen, unsigned int customLen, KangarooTwelve_Instance *pSpongeChecksum, unsigned int mode, unsigned int useSqueeze)
 68 | {
 69 |     unsigned char input[inputByteSize];
 70 |     unsigned char output[outputByteSize];
 71 |     unsigned char customization[customizationByteSize];
 72 |     int result;
 73 |     unsigned int i;
 74 | 
 75 |     generateSimpleRawMaterial(customization, customizationByteSize, customLen, 97);
 76 |     generateSimpleRawMaterial(input, inputLen, outputLen, inputLen + customLen);
 77 | 
 78 |     #ifdef VERBOSE
 79 |     printf("outputLen %5u, inputLen %5u, customLen %3u\n", outputLen, inputLen, customLen);
 80 |     #endif
 81 |     if (!useSqueeze)
 82 |     {
 83 |         if (mode == 0)
 84 |         {
 85 |             /* Input/Output full size in one call */
 86 |             result = KangarooTwelve(securityLevel, input, inputLen, output, outputLen, customization, customLen);
 87 |             assert(result == 0);
 88 |         }
 89 |         else if (mode == 1)
 90 |         {
 91 |             /* Input/Output one byte per call */
 92 |             KangarooTwelve_Instance kt;
 93 |             result = KangarooTwelve_Initialize(&kt, securityLevel, outputLen);
 94 |             assert(result == 0);
 95 |             for (i = 0; i < inputLen; ++i)
 96 |             {
 97 |                 result = KangarooTwelve_Update(&kt, input + i, 1);
 98 |                 assert(result == 0);
 99 |             }
100 |             result = KangarooTwelve_Final(&kt, output, customization, customLen);
101 |             assert(result == 0);
102 |         }
103 |         else if (mode == 2)
104 |         {
105 |             /* Input/Output random number of bytes per call */
106 |             KangarooTwelve_Instance kt;
107 |             unsigned char *pInput = input;
108 |             result = KangarooTwelve_Initialize(&kt, securityLevel, outputLen);
109 |             assert(result == 0);
110 |             while (inputLen)
111 |             {
112 |                 unsigned int len = ((rand() * 32768) + rand()) % (inputLen + 1);
113 |                 result = KangarooTwelve_Update(&kt, pInput, len);
114 |                 assert(result == 0);
115 |                 pInput += len;
116 |                 inputLen -= len;
117 |             }
118 |             result = KangarooTwelve_Final(&kt, output, customization, customLen);
119 |             assert(result == 0);
120 |         }
121 |     }
122 |     else
123 |     {
124 |         if (mode == 0)
125 |         {
126 |             KangarooTwelve_Instance kt;
127 |             result = KangarooTwelve_Initialize(&kt, securityLevel, 0);
128 |             assert(result == 0);
129 |             result = KangarooTwelve_Update(&kt, input, inputLen);
130 |             assert(result == 0);
131 |             result = KangarooTwelve_Final(&kt, 0, customization, customLen);
132 |             assert(result == 0);
133 |             result = KangarooTwelve_Squeeze(&kt, output, outputLen);
134 |             assert(result == 0);
135 |         }
136 |         else if (mode == 1)
137 |         {
138 |             KangarooTwelve_Instance kt;
139 |             result = KangarooTwelve_Initialize(&kt, securityLevel, 0);
140 |             assert(result == 0);
141 |             result = KangarooTwelve_Update(&kt, input, inputLen);
142 |             assert(result == 0);
143 |             result = KangarooTwelve_Final(&kt, 0, customization, customLen);
144 |             assert(result == 0);
145 | 
146 |             for (i = 0; i < outputLen; ++i)
147 |             {
148 |                 result = KangarooTwelve_Squeeze(&kt, output + i, 1);
149 |                 assert(result == 0);
150 |             }
151 |         }
152 |         else if (mode == 2)
153 |         {
154 |             KangarooTwelve_Instance kt;
155 |             unsigned int len;
156 |             result = KangarooTwelve_Initialize(&kt, securityLevel, 0);
157 |             assert(result == 0);
158 |             result = KangarooTwelve_Update(&kt, input, inputLen);
159 |             assert(result == 0);
160 |             result = KangarooTwelve_Final(&kt, 0, customization, customLen);
161 |             assert(result == 0);
162 | 
163 |             for (i = 0; i < outputLen; i += len)
164 |             {
165 |                 len = ((rand() << 15) ^ rand()) % ((outputLen-i) + 1);
166 |                 result = KangarooTwelve_Squeeze(&kt, output+i, len);
167 |                 assert(result == 0);
168 |             }
169 |         }
170 |     }
171 | 
172 |     #ifdef VERBOSE
173 |     {
174 |         unsigned int i;
175 | 
176 |         printf("KT%d\n", securityLevel);
177 |         printf("Input of %u bytes:", inputLen);
178 |         for(i=0; (i<inputLen) && (i<16); i++)
179 |             printf(" %02x", (int)input[i]);
180 |         if (inputLen > 16)
181 |             printf(" ...");
182 |         printf("\n");
183 |         printf("Output of %u bytes:", outputLen);
184 |         for(i=0; i<outputLen; i++)
185 |             printf(" %02x", (int)output[i]);
186 |         printf("\n\n");
187 |         fflush(stdout);
188 |     }
189 |     #endif
190 | 
191 |     KangarooTwelve_Update(pSpongeChecksum, output, outputLen);
192 | }
193 | 
194 | static void performTestKangarooTwelve(int securityLevel, unsigned char *checksum, unsigned int mode, unsigned int useSqueeze)
195 | {
196 |     unsigned int inputLen, outputLen, customLen;
197 | 
198 |     /* Acumulated test vector */
199 |     KangarooTwelve_Instance spongeChecksum;
200 |     KT128_Initialize(&spongeChecksum, 0);
201 | 
202 |     if (mode != 1) {
203 |         outputLen = securityLevel*2/8;
204 |         customLen = 0;
205 |         for(inputLen=0; inputLen<=cChunkSize*9+123; inputLen += (useSqueeze ? 23 : (((mode == 2) && (inputLen >= cChunkSize*2)) ? 32 : 1))) {
206 |             assert(inputLen <= inputByteSize);
207 |             performTestKangarooTwelveOneInput(securityLevel, inputLen, outputLen, customLen, &spongeChecksum, mode, useSqueeze);
208 |         }
209 |     }
210 | 
211 |     for(outputLen = 128/8; outputLen <= 512/8; outputLen <<= 1)
212 |     for(inputLen = 0; inputLen <= (3*cChunkSize) && inputLen <= inputByteSize; inputLen = inputLen ? (inputLen + 167) : 1)
213 |     for(customLen = 0; customLen <= customizationByteSize; customLen += 7) 
214 |     {
215 |         assert(inputLen <= inputByteSize);
216 |         performTestKangarooTwelveOneInput(securityLevel, inputLen, outputLen, customLen, &spongeChecksum, 0, useSqueeze);
217 |     }
218 |     KangarooTwelve_Final(&spongeChecksum, 0, (const unsigned char *)"", 0);
219 |     KangarooTwelve_Squeeze(&spongeChecksum, checksum, checksumByteSize);
220 | 
221 |     #ifdef VERBOSE
222 |     {
223 |         unsigned int i;
224 |         printf("KT%d\n", securityLevel);
225 |         printf("Checksum: ");
226 |         for(i=0; i<checksumByteSize; i++)
227 |             printf("\\x%02x", (int)checksum[i]);
228 |         printf("\n\n");
229 |     }
230 |     #endif
231 | }
232 | 
233 | void selfTestKT128()
234 | {
235 |     const unsigned char* expected[6] = {
236 |         (const unsigned char*)"\x61\x4d\x7a\xf8\xd5\xcc\xd0\xe1\x02\x53\x7d\x21\x5e\x39\x05\xed",
237 |         (const unsigned char*)"\x60\x9c\x95\xbe\xce\xdc\xcd\x58\x43\xf2\x4d\xdf\x15\xf3\x91\xdb",
238 |         (const unsigned char*)"\xcb\x8d\x23\xf4\xbd\xfc\x2a\x5a\x27\xb1\x6a\xfa\x65\x3a\x76\xbe",
239 |         (const unsigned char*)"\x5a\xac\xd7\x2d\x46\x7a\x4f\xa6\xf3\xc2\xa8\xe6\x10\x02\x8d\xc5",
240 |         (const unsigned char*)"\x60\x9c\x95\xbe\xce\xdc\xcd\x58\x43\xf2\x4d\xdf\x15\xf3\x91\xdb",
241 |         (const unsigned char*)"\x5a\xac\xd7\x2d\x46\x7a\x4f\xa6\xf3\xc2\xa8\xe6\x10\x02\x8d\xc5",
242 |     };
243 |     unsigned char checksum[checksumByteSize];
244 |     unsigned int mode, useSqueeze;
245 | 
246 |     #ifndef EMBEDDED
247 |     printf("Testing KT128");
248 |     fflush(stdout);
249 |     #endif
250 |     for(useSqueeze = 0; useSqueeze <= 1; ++useSqueeze)
251 |     for(mode = 0; mode <= 2; ++mode) {
252 |         #ifndef EMBEDDED
253 |         printf(".");
254 |         fflush(stdout);
255 |         #endif
256 |         performTestKangarooTwelve(128, checksum, mode, useSqueeze);
257 |         assert(memcmp(expected[useSqueeze*3 + mode], checksum, checksumByteSize) == 0);
258 |     }
259 |     #ifndef EMBEDDED
260 |     printf("\n   - OK.\n");
261 |     #endif
262 | }
263 | 
264 | void selfTestKT256()
265 | {
266 |     const unsigned char* expected[6] = {
267 |         (const unsigned char*)"\x03\xff\x7b\xfc\x96\x80\x77\xf6\x4e\x19\x2e\xc6\xb6\x73\xe4\x5b",
268 |         (const unsigned char*)"\x92\x45\x23\x33\x8f\x38\xe8\x7e\x8a\x5a\x2d\x35\x01\x36\xfa\x3e",
269 |         (const unsigned char*)"\x94\xb4\xa8\x2e\x9e\x70\xe7\xcd\x66\x1f\x84\xf2\xc6\xcc\x97\x02",
270 |         (const unsigned char*)"\x83\xe2\xa2\x5c\x0f\x24\xdd\x58\x46\x84\xab\x7c\xe4\xd9\x03\xbd",
271 |         (const unsigned char*)"\x92\x45\x23\x33\x8f\x38\xe8\x7e\x8a\x5a\x2d\x35\x01\x36\xfa\x3e",
272 |         (const unsigned char*)"\x83\xe2\xa2\x5c\x0f\x24\xdd\x58\x46\x84\xab\x7c\xe4\xd9\x03\xbd",
273 |     };
274 |     unsigned char checksum[checksumByteSize];
275 |     unsigned int mode, useSqueeze;
276 | 
277 |     #ifndef EMBEDDED
278 |     printf("Testing KT256");
279 |     fflush(stdout);
280 |     #endif
281 |     for(useSqueeze = 0; useSqueeze <= 1; ++useSqueeze)
282 |     for(mode = 0; mode <= 2; ++mode) {
283 |         #ifndef EMBEDDED
284 |         printf(".");
285 |         fflush(stdout);
286 |         #endif
287 |         performTestKangarooTwelve(256, checksum, mode, useSqueeze);
288 |         assert(memcmp(expected[useSqueeze*3 + mode], checksum, checksumByteSize) == 0);
289 |     }
290 |     #ifndef EMBEDDED
291 |     printf("\n   - OK.\n");
292 |     #endif
293 | }
294 | 
/* Runs the KT128 self-test, then the KT256 self-test. */
void selfTestKangarooTwelve()
{
    selfTestKT128();
    selfTestKT256();
}
300 | 
301 | #ifdef OUTPUT
302 | void writeTestKangarooTwelveOne(int securityLevel, FILE *f)
303 | {
304 |     unsigned char checksum[checksumByteSize];
305 |     unsigned int offset;
306 | 
307 |     performTestKangarooTwelve(securityLevel, checksum, 0, 0);
308 |     fprintf(f, "    selfTestKT%d(\"", securityLevel);
309 |         for(offset=0; offset<checksumByteSize; offset++)
310 |             fprintf(f, "\\x%02x", checksum[offset]);
311 |         fprintf(f, "\");\n");
312 |     }
313 | 
/*
 * Creates (or truncates) `filename` and writes the reference checksum lines
 * for security levels 128 and 256, in that order.
 */
void writeTestKangarooTwelve(const char *filename)
{
    static const int securityLevels[] = { 128, 256 };
    unsigned int k;
    FILE *f;

    f = fopen(filename, "w");
    assert(f != NULL);

    for (k = 0; k < sizeof(securityLevels)/sizeof(securityLevels[0]); ++k)
        writeTestKangarooTwelveOne(securityLevels[k], f);

    fclose(f);
}
322 | #endif
323 | 
/*
 * Prints `length` bytes of `data` as two-digit lowercase hex values, each
 * followed by a space, then two newlines. Does nothing on EMBEDDED builds.
 */
static void outputHex(const unsigned char *data, unsigned char length)
{
    #ifndef EMBEDDED
    const unsigned char *const end = data + length;

    while (data != end) {
        printf("%02x ", (int)*data);
        ++data;
    }
    printf("\n\n");
    #endif
}
333 | 
/*
 * Prints KT128 test vectors to stdout:
 *   - empty message and customization, with 32, 64 and 10032 output bytes
 *     (only the last 32 bytes of the latter are shown);
 *   - messages of 17^i bytes (i = 0..6) filled with the repeating pattern
 *     0x00..0xFA, no customization;
 *   - messages of 2^i - 1 bytes of 0xFF (i = 0..3) with a customization
 *     string of 41^i pattern bytes.
 *
 * Allocation failures are reported through assert().
 */
void printKT128TestVectors()
{
    unsigned char *M, *C;
    unsigned char output[10032];
    unsigned int i, j, l;

    printf("KT128(M=empty, C=empty, 32 output bytes):\n");
    KT128(0, 0, output, 32, 0, 0);
    outputHex(output, 32);
    printf("KT128(M=empty, C=empty, 64 output bytes):\n");
    KT128(0, 0, output, 64, 0, 0);
    outputHex(output, 64);
    printf("KT128(M=empty, C=empty, 10032 output bytes), last 32 bytes:\n");
    KT128(0, 0, output, 10032, 0, 0);
    outputHex(output+10000, 32);
    for(l=1, i=0; i<7; i++, l=l*17) {
        /* The largest message is 17^6 bytes (about 24 MB): check the allocation. */
        M = (unsigned char*)malloc(l);
        assert(M != NULL);
        for(j=0; j<l; j++)
            M[j] = j%251;
        printf("KT128(M=pattern 0x00 to 0xFA for 17^%u bytes, C=empty, 32 output bytes):\n", i);
        KT128(M, l, output, 32, 0, 0);
        outputHex(output, 32);
        free(M);
    }
    for(l=1, i=0; i<4; i++, l=l*41) {
        unsigned int ll = (1u << i)-1;
        /* ll is 0 on the first pass; malloc(0) may return NULL, so always
         * request at least one byte to keep the pointer valid for memset. */
        M = (unsigned char*)malloc(ll ? ll : 1);
        assert(M != NULL);
        memset(M, 0xFF, ll);
        C = (unsigned char*)malloc(l);
        assert(C != NULL);
        for(j=0; j<l; j++)
            C[j] = j%251;
        printf("KT128(M=%u times byte 0xFF, C=pattern 0x00 to 0xFA for 41^%u bytes, 32 output bytes):\n", ll, i);
        KT128(M, ll, output, 32, C, l);
        outputHex(output, 32);
        free(M);
        free(C);
    }
}
372 | 
/*
 * Prints KT256 test vectors to stdout:
 *   - empty message and customization, with 64, 128 and 10064 output bytes
 *     (only the last 64 bytes of the latter are shown);
 *   - messages of 17^i bytes (i = 0..6) filled with the repeating pattern
 *     0x00..0xFA, no customization;
 *   - messages of 2^i - 1 bytes of 0xFF (i = 0..3) with a customization
 *     string of 41^i pattern bytes.
 *
 * Allocation failures are reported through assert().
 */
void printKT256TestVectors()
{
    unsigned char *M, *C;
    unsigned char output[10064];
    unsigned int i, j, l;

    printf("KT256(M=empty, C=empty, 64 output bytes):\n");
    KT256(0, 0, output, 64, 0, 0);
    outputHex(output, 64);
    printf("KT256(M=empty, C=empty, 128 output bytes):\n");
    KT256(0, 0, output, 128, 0, 0);
    outputHex(output, 128);
    printf("KT256(M=empty, C=empty, 10064 output bytes), last 64 bytes:\n");
    KT256(0, 0, output, 10064, 0, 0);
    outputHex(output+10000, 64);
    for(l=1, i=0; i<7; i++, l=l*17) {
        /* The largest message is 17^6 bytes (about 24 MB): check the allocation. */
        M = (unsigned char*)malloc(l);
        assert(M != NULL);
        for(j=0; j<l; j++)
            M[j] = j%251;
        printf("KT256(M=pattern 0x00 to 0xFA for 17^%u bytes, C=empty, 64 output bytes):\n", i);
        KT256(M, l, output, 64, 0, 0);
        outputHex(output, 64);
        free(M);
    }
    for(l=1, i=0; i<4; i++, l=l*41) {
        unsigned int ll = (1u << i)-1;
        /* ll is 0 on the first pass; malloc(0) may return NULL, so always
         * request at least one byte to keep the pointer valid for memset. */
        M = (unsigned char*)malloc(ll ? ll : 1);
        assert(M != NULL);
        memset(M, 0xFF, ll);
        C = (unsigned char*)malloc(l);
        assert(C != NULL);
        for(j=0; j<l; j++)
            C[j] = j%251;
        printf("KT256(M=%u times byte 0xFF, C=pattern 0x00 to 0xFA for 41^%u bytes, 64 output bytes):\n", ll, i);
        KT256(M, ll, output, 64, C, l);
        outputHex(output, 64);
        free(M);
        free(C);
    }
}
411 | 
/*
 * Entry point of the KangarooTwelve tests.
 *
 * With OUTPUT defined, first prints the test vectors and writes the reference
 * checksums to "KangarooTwelve.txt". The self-tests are then run once, or,
 * when SIMD options are compiled in and parallelism is not disabled, once per
 * instruction-set configuration reported as present on this CPU.
 */
void testKangarooTwelve(void)
{
#ifdef OUTPUT
    printKT128TestVectors();
    printKT256TestVectors();
    writeTestKangarooTwelve("KangarooTwelve.txt");
#endif

#if defined(KeccakP1600_enable_simd_options) && !defined(KeccakP1600_disableParallelism)
    /* Switch everything on, then turn the SIMD options off one by one. The
     * value returned by each Disable call is used below as "this CPU has
     * the feature". */
    KangarooTwelve_EnableAllCpuFeatures();
    const int haveAVX512 = KangarooTwelve_DisableAVX512();
    const int haveAVX2 = KangarooTwelve_DisableAVX2();
    const int haveSSSE3 = KangarooTwelve_DisableSSSE3();

    /* Baseline pass: every SIMD option is now disabled. */
    printf(" * Testing without vectorization:\n");
#endif
    selfTestKangarooTwelve();

#if defined(KeccakP1600_enable_simd_options) && !defined(KeccakP1600_disableParallelism)
    /* SSSE3 only */
    if (haveSSSE3) {
        printf("\n * Testing with SSSE3 enabled:\n");
        KangarooTwelve_EnableAllCpuFeatures();
        KangarooTwelve_DisableAVX512();
        KangarooTwelve_DisableAVX2();
        selfTestKangarooTwelve();
    }

    /* SSSE3 and AVX2 */
    if (haveAVX2) {
        printf("\n * Testing with AVX2 enabled:\n");
        KangarooTwelve_EnableAllCpuFeatures();
        KangarooTwelve_DisableAVX512();
        selfTestKangarooTwelve();
    }

    /* Everything, AVX512 included */
    if (haveAVX512) {
        printf("\n * Testing with AVX512 enabled:\n");
        KangarooTwelve_EnableAllCpuFeatures();
        selfTestKangarooTwelve();
    }
#endif
}
456 | 


--------------------------------------------------------------------------------
/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | K12 based on the eXtended Keccak Code Package (XKCP)
  3 | https://github.com/XKCP/XKCP
  4 | 
  5 | The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
  6 | 
  7 | Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
  8 | 
  9 | For more information, feedback or questions, please refer to the Keccak Team website:
 10 | https://keccak.team/
 11 | 
 12 | To the extent possible under law, the implementer has waived all copyright
 13 | and related or neighboring rights to the source code in this file.
 14 | http://creativecommons.org/publicdomain/zero/1.0/
 15 | 
 16 | ---
 17 | 
 18 | Please refer to the XKCP for more details.
 19 | */
 20 | 
 21 | #include <stdint.h>
 22 | #include <tmmintrin.h>
 23 | #include "KeccakP-1600-SnP.h"
 24 | #include "align.h"
 25 | 
 26 | #define KeccakP1600times2_SSSE3_unrolling 2
 27 | 
 28 | #define SSSE3alignment 16
 29 | 
 30 | #define ANDnu128(a, b)      _mm_andnot_si128(a, b)
 31 | #define CONST128(a)         _mm_load_si128((const __m128i *)&(a))
 32 | #define LOAD128(a)          _mm_load_si128((const __m128i *)&(a))
 33 | #define LOAD6464(a, b)      _mm_set_epi64x(a, b)
 34 | #define CONST128_64(a)      _mm_set1_epi64x(a)
 35 | #define ROL64in128(a, o)    _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o)))
 36 | #define ROL64in128_8(a)     _mm_shuffle_epi8(a, CONST128(rho8))
 37 | #define ROL64in128_56(a)    _mm_shuffle_epi8(a, CONST128(rho56))
 38 | static const uint64_t rho8[2] = {0x0605040302010007, 0x0E0D0C0B0A09080F};
 39 | static const uint64_t rho56[2] = {0x0007060504030201, 0x080F0E0D0C0B0A09};
 40 | #define STORE128(a, b)      _mm_store_si128((__m128i *)&(a), b)
 41 | #define STORE128u(a, b)     _mm_storeu_si128((__m128i *)&(a), b)
 42 | #define XOR128(a, b)        _mm_xor_si128(a, b)
 43 | #define XOReq128(a, b)      a = _mm_xor_si128(a, b)
 44 | #define UNPACKL( a, b )     _mm_unpacklo_epi64((a), (b))
 45 | #define UNPACKH( a, b )     _mm_unpackhi_epi64((a), (b))
 46 | #define ZERO()              _mm_setzero_si128()
 47 | 
 48 | static ALIGN(SSSE3alignment) const uint64_t KeccakP1600RoundConstants[24] = {
 49 |     0x0000000000000001ULL,
 50 |     0x0000000000008082ULL,
 51 |     0x800000000000808aULL,
 52 |     0x8000000080008000ULL,
 53 |     0x000000000000808bULL,
 54 |     0x0000000080000001ULL,
 55 |     0x8000000080008081ULL,
 56 |     0x8000000000008009ULL,
 57 |     0x000000000000008aULL,
 58 |     0x0000000000000088ULL,
 59 |     0x0000000080008009ULL,
 60 |     0x000000008000000aULL,
 61 |     0x000000008000808bULL,
 62 |     0x800000000000008bULL,
 63 |     0x8000000000008089ULL,
 64 |     0x8000000000008003ULL,
 65 |     0x8000000000008002ULL,
 66 |     0x8000000000000080ULL,
 67 |     0x000000000000800aULL,
 68 |     0x800000008000000aULL,
 69 |     0x8000000080008081ULL,
 70 |     0x8000000000008080ULL,
 71 |     0x0000000080000001ULL,
 72 |     0x8000000080008008ULL};
 73 | 
 74 | #define declareABCDE \
 75 |     __m128i Aba, Abe, Abi, Abo, Abu; \
 76 |     __m128i Aga, Age, Agi, Ago, Agu; \
 77 |     __m128i Aka, Ake, Aki, Ako, Aku; \
 78 |     __m128i Ama, Ame, Ami, Amo, Amu; \
 79 |     __m128i Asa, Ase, Asi, Aso, Asu; \
 80 |     __m128i Bba, Bbe, Bbi, Bbo, Bbu; \
 81 |     __m128i Bga, Bge, Bgi, Bgo, Bgu; \
 82 |     __m128i Bka, Bke, Bki, Bko, Bku; \
 83 |     __m128i Bma, Bme, Bmi, Bmo, Bmu; \
 84 |     __m128i Bsa, Bse, Bsi, Bso, Bsu; \
 85 |     __m128i Ca, Ce, Ci, Co, Cu; \
 86 |     __m128i Da, De, Di, Do, Du; \
 87 |     __m128i Eba, Ebe, Ebi, Ebo, Ebu; \
 88 |     __m128i Ega, Ege, Egi, Ego, Egu; \
 89 |     __m128i Eka, Eke, Eki, Eko, Eku; \
 90 |     __m128i Ema, Eme, Emi, Emo, Emu; \
 91 |     __m128i Esa, Ese, Esi, Eso, Esu; \
 92 | 
 93 | #define prepareTheta \
 94 |     Ca = XOR128(Aba, XOR128(Aga, XOR128(Aka, XOR128(Ama, Asa)))); \
 95 |     Ce = XOR128(Abe, XOR128(Age, XOR128(Ake, XOR128(Ame, Ase)))); \
 96 |     Ci = XOR128(Abi, XOR128(Agi, XOR128(Aki, XOR128(Ami, Asi)))); \
 97 |     Co = XOR128(Abo, XOR128(Ago, XOR128(Ako, XOR128(Amo, Aso)))); \
 98 |     Cu = XOR128(Abu, XOR128(Agu, XOR128(Aku, XOR128(Amu, Asu)))); \
 99 | 
/* --- Theta Rho Pi Chi Iota Prepare-theta */
/* --- 64-bit lanes mapped to 64-bit words */
/*
 * One round that reads state A##xx and writes state E##xx.
 *
 *   i - index into KeccakP1600RoundConstants
 *   A - name prefix of the input state lanes (modified in place)
 *   E - name prefix of the output state lanes
 *
 * On entry Ca..Cu must hold the per-column XORs of A (what prepareTheta
 * computes). The macro then:
 *   - derives Da..Du from Ca..Cu and XORs them into the A lanes,
 *   - rotates each updated A lane into a B temporary,
 *   - computes each E lane as B ^ (~B' & B'') over five B lanes at a time,
 *   - XORs round constant i into E##ba,
 *   - leaves in Ca..Cu the per-column XORs of E, ready for the next round.
 */
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
    Da = XOR128(Cu, ROL64in128(Ce, 1)); \
    De = XOR128(Ca, ROL64in128(Ci, 1)); \
    Di = XOR128(Ce, ROL64in128(Co, 1)); \
    Do = XOR128(Ci, ROL64in128(Cu, 1)); \
    Du = XOR128(Co, ROL64in128(Ca, 1)); \
    /* E##b* from A lanes ba, ge, ki, mo, su; starts the new Ca..Cu */ \
    XOReq128(A##ba, Da); \
    Bba = A##ba; \
    XOReq128(A##ge, De); \
    Bbe = ROL64in128(A##ge, 44); \
    XOReq128(A##ki, Di); \
    Bbi = ROL64in128(A##ki, 43); \
    E##ba = XOR128(Bba, ANDnu128(Bbe, Bbi)); \
    XOReq128(E##ba, CONST128_64(KeccakP1600RoundConstants[i])); \
    Ca = E##ba; \
    XOReq128(A##mo, Do); \
    Bbo = ROL64in128(A##mo, 21); \
    E##be = XOR128(Bbe, ANDnu128(Bbi, Bbo)); \
    Ce = E##be; \
    XOReq128(A##su, Du); \
    Bbu = ROL64in128(A##su, 14); \
    E##bi = XOR128(Bbi, ANDnu128(Bbo, Bbu)); \
    Ci = E##bi; \
    E##bo = XOR128(Bbo, ANDnu128(Bbu, Bba)); \
    Co = E##bo; \
    E##bu = XOR128(Bbu, ANDnu128(Bba, Bbe)); \
    Cu = E##bu; \
    /* E##g* from A lanes bo, gu, ka, me, si */ \
    XOReq128(A##bo, Do); \
    Bga = ROL64in128(A##bo, 28); \
    XOReq128(A##gu, Du); \
    Bge = ROL64in128(A##gu, 20); \
    XOReq128(A##ka, Da); \
    Bgi = ROL64in128(A##ka, 3); \
    E##ga = XOR128(Bga, ANDnu128(Bge, Bgi)); \
    XOReq128(Ca, E##ga); \
    XOReq128(A##me, De); \
    Bgo = ROL64in128(A##me, 45); \
    E##ge = XOR128(Bge, ANDnu128(Bgi, Bgo)); \
    XOReq128(Ce, E##ge); \
    XOReq128(A##si, Di); \
    Bgu = ROL64in128(A##si, 61); \
    E##gi = XOR128(Bgi, ANDnu128(Bgo, Bgu)); \
    XOReq128(Ci, E##gi); \
    E##go = XOR128(Bgo, ANDnu128(Bgu, Bga)); \
    XOReq128(Co, E##go); \
    E##gu = XOR128(Bgu, ANDnu128(Bga, Bge)); \
    XOReq128(Cu, E##gu); \
    /* E##k* from A lanes be, gi, ko, mu, sa (mu is rotated by 8 via byte shuffle) */ \
    XOReq128(A##be, De); \
    Bka = ROL64in128(A##be, 1); \
    XOReq128(A##gi, Di); \
    Bke = ROL64in128(A##gi, 6); \
    XOReq128(A##ko, Do); \
    Bki = ROL64in128(A##ko, 25); \
    E##ka = XOR128(Bka, ANDnu128(Bke, Bki)); \
    XOReq128(Ca, E##ka); \
    XOReq128(A##mu, Du); \
    Bko = ROL64in128_8(A##mu); \
    E##ke = XOR128(Bke, ANDnu128(Bki, Bko)); \
    XOReq128(Ce, E##ke); \
    XOReq128(A##sa, Da); \
    Bku = ROL64in128(A##sa, 18); \
    E##ki = XOR128(Bki, ANDnu128(Bko, Bku)); \
    XOReq128(Ci, E##ki); \
    E##ko = XOR128(Bko, ANDnu128(Bku, Bka)); \
    XOReq128(Co, E##ko); \
    E##ku = XOR128(Bku, ANDnu128(Bka, Bke)); \
    XOReq128(Cu, E##ku); \
    /* E##m* from A lanes bu, ga, ke, mi, so (so is rotated by 56 via byte shuffle) */ \
    XOReq128(A##bu, Du); \
    Bma = ROL64in128(A##bu, 27); \
    XOReq128(A##ga, Da); \
    Bme = ROL64in128(A##ga, 36); \
    XOReq128(A##ke, De); \
    Bmi = ROL64in128(A##ke, 10); \
    E##ma = XOR128(Bma, ANDnu128(Bme, Bmi)); \
    XOReq128(Ca, E##ma); \
    XOReq128(A##mi, Di); \
    Bmo = ROL64in128(A##mi, 15); \
    E##me = XOR128(Bme, ANDnu128(Bmi, Bmo)); \
    XOReq128(Ce, E##me); \
    XOReq128(A##so, Do); \
    Bmu = ROL64in128_56(A##so); \
    E##mi = XOR128(Bmi, ANDnu128(Bmo, Bmu)); \
    XOReq128(Ci, E##mi); \
    E##mo = XOR128(Bmo, ANDnu128(Bmu, Bma)); \
    XOReq128(Co, E##mo); \
    E##mu = XOR128(Bmu, ANDnu128(Bma, Bme)); \
    XOReq128(Cu, E##mu); \
    /* E##s* from A lanes bi, go, ku, ma, se */ \
    XOReq128(A##bi, Di); \
    Bsa = ROL64in128(A##bi, 62); \
    XOReq128(A##go, Do); \
    Bse = ROL64in128(A##go, 55); \
    XOReq128(A##ku, Du); \
    Bsi = ROL64in128(A##ku, 39); \
    E##sa = XOR128(Bsa, ANDnu128(Bse, Bsi)); \
    XOReq128(Ca, E##sa); \
    XOReq128(A##ma, Da); \
    Bso = ROL64in128(A##ma, 41); \
    E##se = XOR128(Bse, ANDnu128(Bsi, Bso)); \
    XOReq128(Ce, E##se); \
    XOReq128(A##se, De); \
    Bsu = ROL64in128(A##se, 2); \
    E##si = XOR128(Bsi, ANDnu128(Bso, Bsu)); \
    XOReq128(Ci, E##si); \
    E##so = XOR128(Bso, ANDnu128(Bsu, Bsa)); \
    XOReq128(Co, E##so); \
    E##su = XOR128(Bsu, ANDnu128(Bsa, Bse)); \
    XOReq128(Cu, E##su); \
\
215 | 
216 | /* --- Theta Rho Pi Chi Iota: one round from state A into state E. Reads Ca..Cu, XORs Da..Du into A in place, and never updates Ca..Cu (rounds12 uses it for round 23 when fully unrolled). */
217 | /* --- Every variable is a 128-bit value handled as 64-bit lanes (XOR128, ROL64in128); presumably one lane per parallel state -- the helper macros are defined outside this view. */
218 | #define thetaRhoPiChiIota(i, A, E) /* theta: D[x] = C[x-1] ^ ROL(C[x+1], 1) */ \
219 |     Da = XOR128(Cu, ROL64in128(Ce, 1)); \
220 |     De = XOR128(Ca, ROL64in128(Ci, 1)); \
221 |     Di = XOR128(Ce, ROL64in128(Co, 1)); \
222 |     Do = XOR128(Ci, ROL64in128(Cu, 1)); \
223 |     Du = XOR128(Co, ROL64in128(Ca, 1)); \
224 | /* Output row b: add D to five A lanes, rotate them into Bba..Bbu, combine as B ^ ANDnu128(next, next-next) (presumably and-not); the round constant i is XORed into E##ba only. */ \
225 |     XOReq128(A##ba, Da); \
226 |     Bba = A##ba; \
227 |     XOReq128(A##ge, De); \
228 |     Bbe = ROL64in128(A##ge, 44); \
229 |     XOReq128(A##ki, Di); \
230 |     Bbi = ROL64in128(A##ki, 43); \
231 |     E##ba = XOR128(Bba, ANDnu128(Bbe, Bbi)); \
232 |     XOReq128(E##ba, CONST128_64(KeccakP1600RoundConstants[i])); \
233 |     XOReq128(A##mo, Do); \
234 |     Bbo = ROL64in128(A##mo, 21); \
235 |     E##be = XOR128(Bbe, ANDnu128(Bbi, Bbo)); \
236 |     XOReq128(A##su, Du); \
237 |     Bbu = ROL64in128(A##su, 14); \
238 |     E##bi = XOR128(Bbi, ANDnu128(Bbo, Bbu)); \
239 |     E##bo = XOR128(Bbo, ANDnu128(Bbu, Bba)); \
240 |     E##bu = XOR128(Bbu, ANDnu128(Bba, Bbe)); \
241 | /* Output row g: same pattern, sourced from A##bo, A##gu, A##ka, A##me, A##si. */ \
242 |     XOReq128(A##bo, Do); \
243 |     Bga = ROL64in128(A##bo, 28); \
244 |     XOReq128(A##gu, Du); \
245 |     Bge = ROL64in128(A##gu, 20); \
246 |     XOReq128(A##ka, Da); \
247 |     Bgi = ROL64in128(A##ka, 3); \
248 |     E##ga = XOR128(Bga, ANDnu128(Bge, Bgi)); \
249 |     XOReq128(A##me, De); \
250 |     Bgo = ROL64in128(A##me, 45); \
251 |     E##ge = XOR128(Bge, ANDnu128(Bgi, Bgo)); \
252 |     XOReq128(A##si, Di); \
253 |     Bgu = ROL64in128(A##si, 61); \
254 |     E##gi = XOR128(Bgi, ANDnu128(Bgo, Bgu)); \
255 |     E##go = XOR128(Bgo, ANDnu128(Bgu, Bga)); \
256 |     E##gu = XOR128(Bgu, ANDnu128(Bga, Bge)); \
257 | /* Output row k: sourced from A##be, A##gi, A##ko, A##mu, A##sa; the rotation by 8 goes through the dedicated ROL64in128_8 macro. */ \
258 |     XOReq128(A##be, De); \
259 |     Bka = ROL64in128(A##be, 1); \
260 |     XOReq128(A##gi, Di); \
261 |     Bke = ROL64in128(A##gi, 6); \
262 |     XOReq128(A##ko, Do); \
263 |     Bki = ROL64in128(A##ko, 25); \
264 |     E##ka = XOR128(Bka, ANDnu128(Bke, Bki)); \
265 |     XOReq128(A##mu, Du); \
266 |     Bko = ROL64in128_8(A##mu); \
267 |     E##ke = XOR128(Bke, ANDnu128(Bki, Bko)); \
268 |     XOReq128(A##sa, Da); \
269 |     Bku = ROL64in128(A##sa, 18); \
270 |     E##ki = XOR128(Bki, ANDnu128(Bko, Bku)); \
271 |     E##ko = XOR128(Bko, ANDnu128(Bku, Bka)); \
272 |     E##ku = XOR128(Bku, ANDnu128(Bka, Bke)); \
273 | /* Output row m: sourced from A##bu, A##ga, A##ke, A##mi, A##so; the rotation by 56 goes through the dedicated ROL64in128_56 macro. */ \
274 |     XOReq128(A##bu, Du); \
275 |     Bma = ROL64in128(A##bu, 27); \
276 |     XOReq128(A##ga, Da); \
277 |     Bme = ROL64in128(A##ga, 36); \
278 |     XOReq128(A##ke, De); \
279 |     Bmi = ROL64in128(A##ke, 10); \
280 |     E##ma = XOR128(Bma, ANDnu128(Bme, Bmi)); \
281 |     XOReq128(A##mi, Di); \
282 |     Bmo = ROL64in128(A##mi, 15); \
283 |     E##me = XOR128(Bme, ANDnu128(Bmi, Bmo)); \
284 |     XOReq128(A##so, Do); \
285 |     Bmu = ROL64in128_56(A##so); \
286 |     E##mi = XOR128(Bmi, ANDnu128(Bmo, Bmu)); \
287 |     E##mo = XOR128(Bmo, ANDnu128(Bmu, Bma)); \
288 |     E##mu = XOR128(Bmu, ANDnu128(Bma, Bme)); \
289 | /* Output row s: sourced from A##bi, A##go, A##ku, A##ma, A##se. */ \
290 |     XOReq128(A##bi, Di); \
291 |     Bsa = ROL64in128(A##bi, 62); \
292 |     XOReq128(A##go, Do); \
293 |     Bse = ROL64in128(A##go, 55); \
294 |     XOReq128(A##ku, Du); \
295 |     Bsi = ROL64in128(A##ku, 39); \
296 |     E##sa = XOR128(Bsa, ANDnu128(Bse, Bsi)); \
297 |     XOReq128(A##ma, Da); \
298 |     Bso = ROL64in128(A##ma, 41); \
299 |     E##se = XOR128(Bse, ANDnu128(Bsi, Bso)); \
300 |     XOReq128(A##se, De); \
301 |     Bsu = ROL64in128(A##se, 2); \
302 |     E##si = XOR128(Bsi, ANDnu128(Bso, Bsu)); \
303 |     E##so = XOR128(Bso, ANDnu128(Bsu, Bsa)); \
304 |     E##su = XOR128(Bsu, ANDnu128(Bsa, Bse)); \
305 | \
306 | 
307 | #define initializeState(X) /* Assigns ZERO() to all 25 lane variables X##ba .. X##su (ZERO is defined outside this view). */ \
308 |     X##ba = ZERO(); \
309 |     X##be = ZERO(); \
310 |     X##bi = ZERO(); \
311 |     X##bo = ZERO(); \
312 |     X##bu = ZERO(); \
313 |     X##ga = ZERO(); \
314 |     X##ge = ZERO(); \
315 |     X##gi = ZERO(); \
316 |     X##go = ZERO(); \
317 |     X##gu = ZERO(); \
318 |     X##ka = ZERO(); \
319 |     X##ke = ZERO(); \
320 |     X##ki = ZERO(); \
321 |     X##ko = ZERO(); \
322 |     X##ku = ZERO(); \
323 |     X##ma = ZERO(); \
324 |     X##me = ZERO(); \
325 |     X##mi = ZERO(); \
326 |     X##mo = ZERO(); \
327 |     X##mu = ZERO(); \
328 |     X##sa = ZERO(); \
329 |     X##se = ZERO(); \
330 |     X##si = ZERO(); \
331 |     X##so = ZERO(); \
332 |     X##su = ZERO(); \
333 | 
334 | #define XORdata4(X, data0, data1) /* XORs 64-bit words 0..3 of data0 and data1 into X##ba..X##bo; LOAD6464 presumably packs data1[k] (high) with data0[k] (low) -- TODO confirm against its definition. */ \
335 |     XOReq128(X##ba, LOAD6464((data1)[ 0], (data0)[ 0])); \
336 |     XOReq128(X##be, LOAD6464((data1)[ 1], (data0)[ 1])); \
337 |     XOReq128(X##bi, LOAD6464((data1)[ 2], (data0)[ 2])); \
338 |     XOReq128(X##bo, LOAD6464((data1)[ 3], (data0)[ 3])); \
339 | 
340 | #define XORdata16(X, data0, data1) /* XORs 64-bit words 0..15 of data0 and data1 into X##ba..X##ma (word k goes to the k-th lane in ba, be, bi, bo, bu, ga, ... order). */ \
341 |     XOReq128(X##ba, LOAD6464((data1)[ 0], (data0)[ 0])); \
342 |     XOReq128(X##be, LOAD6464((data1)[ 1], (data0)[ 1])); \
343 |     XOReq128(X##bi, LOAD6464((data1)[ 2], (data0)[ 2])); \
344 |     XOReq128(X##bo, LOAD6464((data1)[ 3], (data0)[ 3])); \
345 |     XOReq128(X##bu, LOAD6464((data1)[ 4], (data0)[ 4])); \
346 |     XOReq128(X##ga, LOAD6464((data1)[ 5], (data0)[ 5])); \
347 |     XOReq128(X##ge, LOAD6464((data1)[ 6], (data0)[ 6])); \
348 |     XOReq128(X##gi, LOAD6464((data1)[ 7], (data0)[ 7])); \
349 |     XOReq128(X##go, LOAD6464((data1)[ 8], (data0)[ 8])); \
350 |     XOReq128(X##gu, LOAD6464((data1)[ 9], (data0)[ 9])); \
351 |     XOReq128(X##ka, LOAD6464((data1)[10], (data0)[10])); \
352 |     XOReq128(X##ke, LOAD6464((data1)[11], (data0)[11])); \
353 |     XOReq128(X##ki, LOAD6464((data1)[12], (data0)[12])); \
354 |     XOReq128(X##ko, LOAD6464((data1)[13], (data0)[13])); \
355 |     XOReq128(X##ku, LOAD6464((data1)[14], (data0)[14])); \
356 |     XOReq128(X##ma, LOAD6464((data1)[15], (data0)[15])); \
357 | 
358 | #define XORdata17(X, data0, data1) /* XORdata16 plus word 16 into X##me: 17 words = 136 bytes per input, matching KT256_rateInBytes. */ \
359 |     XORdata16(X, data0, data1) \
360 |     XOReq128(X##me, LOAD6464((data1)[16], (data0)[16])); \
361 | 
362 | #define XORdata21(X, data0, data1) /* XORdata17 plus words 17..20 into X##mi..X##sa: 21 words = 168 bytes per input, matching KT128_rateInBytes. */ \
363 |     XORdata17(X, data0, data1) \
364 |     XOReq128(X##mi, LOAD6464((data1)[17], (data0)[17])); \
365 |     XOReq128(X##mo, LOAD6464((data1)[18], (data0)[18])); \
366 |     XOReq128(X##mu, LOAD6464((data1)[19], (data0)[19])); \
367 |     XOReq128(X##sa, LOAD6464((data1)[20], (data0)[20])); \
368 | 
369 | #if ((defined(KeccakP1600times2_SSSE3_fullUnrolling)) || (KeccakP1600times2_SSSE3_unrolling == 12)) /* rounds12: round constants 12..23, alternating A->E and E->A so the result ends in A. Fully unrolled form: the final round uses thetaRhoPiChiIota, which does not update Ca..Cu. */
370 | #define rounds12 \
371 |     prepareTheta \
372 |     thetaRhoPiChiIotaPrepareTheta(12, A, E) \
373 |     thetaRhoPiChiIotaPrepareTheta(13, E, A) \
374 |     thetaRhoPiChiIotaPrepareTheta(14, A, E) \
375 |     thetaRhoPiChiIotaPrepareTheta(15, E, A) \
376 |     thetaRhoPiChiIotaPrepareTheta(16, A, E) \
377 |     thetaRhoPiChiIotaPrepareTheta(17, E, A) \
378 |     thetaRhoPiChiIotaPrepareTheta(18, A, E) \
379 |     thetaRhoPiChiIotaPrepareTheta(19, E, A) \
380 |     thetaRhoPiChiIotaPrepareTheta(20, A, E) \
381 |     thetaRhoPiChiIotaPrepareTheta(21, E, A) \
382 |     thetaRhoPiChiIotaPrepareTheta(22, A, E) \
383 |     thetaRhoPiChiIota(23, E, A) \
384 | 
385 | #elif (KeccakP1600times2_SSSE3_unrolling == 6) /* Two loop iterations of six rounds; needs a variable i declared by the expanding function; every round, including the last, uses the PrepareTheta variant. */
386 | #define rounds12 \
387 |     prepareTheta \
388 |     for(i=12; i<24; i+=6) { \
389 |         thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
390 |         thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
391 |         thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
392 |         thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
393 |         thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
394 |         thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
395 |     } \
396 | 
397 | #elif (KeccakP1600times2_SSSE3_unrolling == 4) /* Three loop iterations of four rounds; same requirements as the six-round variant. */
398 | #define rounds12 \
399 |     prepareTheta \
400 |     for(i=12; i<24; i+=4) { \
401 |         thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
402 |         thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
403 |         thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
404 |         thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
405 |     } \
406 | 
407 | #elif (KeccakP1600times2_SSSE3_unrolling == 2) /* Six loop iterations of two rounds; same requirements as the six-round variant. */
408 | #define rounds12 \
409 |     prepareTheta \
410 |     for(i=12; i<24; i+=2) { \
411 |         thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
412 |         thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
413 |     } \
414 | 
415 | #else
416 | #error "KeccakP1600times2_SSSE3_unrolling is not correctly specified!"
417 | #endif
418 | 
419 | #define chunkSize 8192 /* bytes absorbed per leaf; the second leaf is read at input + chunkSize */
420 | #define KT128_rateInBytes (21*8) /* 168 bytes = 21 64-bit words per block (see XORdata21) */
421 | #define KT256_rateInBytes (17*8) /* 136 bytes = 17 64-bit words per block (see XORdata17) */
422 | /* Absorbs two adjacent chunkSize-byte leaves (input and input+chunkSize, 2*chunkSize bytes read in total) side by side and stores 32 bytes per leaf: 64 bytes written to output, presumably first leaf at output[0..31] and second at output[32..63]. */
423 | void KT128_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output)
424 | {
425 |     declareABCDE
426 |     #ifndef KeccakP1600times2_SSSE3_fullUnrolling
427 |     unsigned int i; /* loop variable consumed by the looped variants of rounds12 */
428 |     #endif
429 |     unsigned int j;
430 |     /* NOTE(review): input is reinterpreted as const uint64_t*; whether unaligned input is safe depends on LOAD6464, which is not visible here. */
431 |     initializeState(A);
432 |     /* Full-rate blocks: j runs 0, 168, ..., 7896 (48 iterations), consuming 8064 bytes of each leaf. */
433 |     for(j = 0; j < (chunkSize - KT128_rateInBytes); j += KT128_rateInBytes) {
434 |         XORdata21(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize));
435 |         rounds12
436 |         input += KT128_rateInBytes;
437 |     }
438 |     /* Final block: the remaining 128 bytes (16 words), then 0x0B into word 16 (Ame) and the top bit into word 20 (Asa), the last word of the 21-word rate; presumably suffix and padding bits. */
439 |     XORdata16(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize));
440 |     XOReq128(Ame, _mm_set1_epi64x(0x0BULL));
441 |     XOReq128(Asa, _mm_set1_epi64x(0x8000000000000000ULL));
442 |     rounds12
443 |     /* Emit words ba, be, bi, bo of each state: UNPACKL presumably gathers the low (first-leaf) halves, UNPACKH the high (second-leaf) halves. */
444 |     STORE128u( *(__m128i*)&(output[ 0]), UNPACKL( Aba, Abe ) );
445 |     STORE128u( *(__m128i*)&(output[16]), UNPACKL( Abi, Abo ) );
446 |     STORE128u( *(__m128i*)&(output[32]), UNPACKH( Aba, Abe ) );
447 |     STORE128u( *(__m128i*)&(output[48]), UNPACKH( Abi, Abo ) );
448 | }
449 | /* Absorbs two adjacent chunkSize-byte leaves (input and input+chunkSize, 2*chunkSize bytes read in total) side by side and stores 64 bytes per leaf: 128 bytes written to output, presumably first leaf at output[0..63] and second at output[64..127]. */
450 | void KT256_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output)
451 | {
452 |     declareABCDE
453 |     #ifndef KeccakP1600times2_SSSE3_fullUnrolling
454 |     unsigned int i; /* loop variable consumed by the looped variants of rounds12 */
455 |     #endif
456 |     unsigned int j;
457 |     /* NOTE(review): input is reinterpreted as const uint64_t*; whether unaligned input is safe depends on LOAD6464, which is not visible here. */
458 |     initializeState(A);
459 |     /* Full-rate blocks: j runs 0, 136, ..., 8024 (60 iterations), consuming 8160 bytes of each leaf. */
460 |     for(j = 0; j < (chunkSize - KT256_rateInBytes); j += KT256_rateInBytes) {
461 |         XORdata17(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize));
462 |         rounds12
463 |         input += KT256_rateInBytes;
464 |     }
465 |     /* Final block: the remaining 32 bytes (4 words), then 0x0B into word 4 (Abu) and the top bit into word 16 (Ame), the last word of the 17-word rate; presumably suffix and padding bits. */
466 |     XORdata4(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize));
467 |     XOReq128(Abu, _mm_set1_epi64x(0x0BULL));
468 |     XOReq128(Ame, _mm_set1_epi64x(0x8000000000000000ULL));
469 |     rounds12
470 |     /* Emit words ba..gi (eight words) of each state: UNPACKL presumably gathers the low (first-leaf) halves, UNPACKH the high (second-leaf) halves. */
471 |     STORE128u( *(__m128i*)&(output[ 0]), UNPACKL( Aba, Abe ) );
472 |     STORE128u( *(__m128i*)&(output[16]), UNPACKL( Abi, Abo ) );
473 |     STORE128u( *(__m128i*)&(output[32]), UNPACKL( Abu, Aga ) );
474 |     STORE128u( *(__m128i*)&(output[48]), UNPACKL( Age, Agi ) );
475 |     STORE128u( *(__m128i*)&(output[64]), UNPACKH( Aba, Abe ) );
476 |     STORE128u( *(__m128i*)&(output[80]), UNPACKH( Abi, Abo ) );
477 |     STORE128u( *(__m128i*)&(output[96]), UNPACKH( Abu, Aga ) );
478 |     STORE128u( *(__m128i*)&(output[112]), UNPACKH( Age, Agi ) );
479 | }
480 | 


--------------------------------------------------------------------------------