├── avx2
    ├── kem.c
    ├── kem.h
    ├── indcpa.h
    ├── verify.h
    ├── fips202.c
    ├── randombytes.c
    ├── randombytes.h
    ├── symmetric-shake.c
    ├── test
    │   ├── cpucycles.c
    │   ├── cpucycles.h
    │   ├── speed_print.c
    │   ├── speed_print.h
    │   ├── test_kyber.c
    │   ├── test_speed.c
    │   └── test_vectors.c
    ├── keccak4x
    │   ├── KeccakP-SIMD256-config.h
    │   ├── KeccakP-align.h
    │   ├── KeccakP-1600-times4-SnP.h
    │   ├── KeccakP-brg_endian.h
    │   └── KeccakP-1600-unrolling.macros
    ├── .gitignore
    ├── reduce.h
    ├── cbd.h
    ├── align.h
    ├── rejsample.h
    ├── shuffle.inc
    ├── fq.inc
    ├── ntt.h
    ├── consts.h
    ├── polyvec.h
    ├── symmetric.h
    ├── fq.S
    ├── params.h
    ├── fips202.h
    ├── fips202x4.h
    ├── verify.c
    ├── basemul.S
    ├── poly.h
    ├── api.h
    ├── ntt.S
    ├── Makefile
    ├── cbd.c
    ├── shuffle.S
    ├── invntt.S
    ├── consts.c
    ├── fips202x4.c
    └── polyvec.c
├── .gitattributes
├── .gitignore
├── AUTHORS
├── ref
    ├── randombytes.h
    ├── test
    │   ├── speed_print.h
    │   ├── cpucycles.c
    │   ├── cpucycles.h
    │   ├── speed_print.c
    │   ├── test_vectors.c
    │   ├── test_kyber.c
    │   └── test_speed.c
    ├── .gitignore
    ├── cbd.h
    ├── reduce.h
    ├── verify.h
    ├── ntt.h
    ├── indcpa.h
    ├── kem.h
    ├── polyvec.h
    ├── symmetric.h
    ├── reduce.c
    ├── nistkat
    │   ├── rng.h
    │   ├── rng.c
    │   └── PQCgenKAT_kem.c
    ├── params.h
    ├── poly.h
    ├── randombytes.c
    ├── fips202.h
    ├── verify.c
    ├── symmetric-shake.c
    ├── api.h
    ├── cbd.c
    ├── Makefile
    ├── ntt.c
    ├── kem.c
    ├── polyvec.c
    └── poly.c
├── SHA256SUMS
├── LICENSE
├── runlcov.sh
├── runtests.sh
├── .travis.yml
├── Common_META.yml
├── Kyber512_META.yml
├── Kyber768_META.yml
├── Kyber1024_META.yml
└── README.md


/avx2/kem.c:
--------------------------------------------------------------------------------
1 | ../ref/kem.c


--------------------------------------------------------------------------------
/avx2/kem.h:
--------------------------------------------------------------------------------
1 | ../ref/kem.h


--------------------------------------------------------------------------------
/avx2/indcpa.h:
--------------------------------------------------------------------------------
1 | ../ref/indcpa.h


--------------------------------------------------------------------------------
/avx2/verify.h:
--------------------------------------------------------------------------------
1 | ../ref/verify.h


--------------------------------------------------------------------------------
/avx2/fips202.c:
--------------------------------------------------------------------------------
1 | ../ref/fips202.c


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | SHA256SUMS eol=lf
2 | 


--------------------------------------------------------------------------------
/avx2/randombytes.c:
--------------------------------------------------------------------------------
1 | ../ref/randombytes.c


--------------------------------------------------------------------------------
/avx2/randombytes.h:
--------------------------------------------------------------------------------
1 | ../ref/randombytes.h


--------------------------------------------------------------------------------
/avx2/symmetric-shake.c:
--------------------------------------------------------------------------------
1 | ../ref/symmetric-shake.c


--------------------------------------------------------------------------------
/avx2/test/cpucycles.c:
--------------------------------------------------------------------------------
1 | ../../ref/test/cpucycles.c


--------------------------------------------------------------------------------
/avx2/test/cpucycles.h:
--------------------------------------------------------------------------------
1 | ../../ref/test/cpucycles.h


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | tvecs1024
2 | tvecs512
3 | tvecs768
4 | 


--------------------------------------------------------------------------------
/avx2/test/speed_print.c:
--------------------------------------------------------------------------------
1 | ../../ref/test/speed_print.c


--------------------------------------------------------------------------------
/avx2/test/speed_print.h:
--------------------------------------------------------------------------------
1 | ../../ref/test/speed_print.h


--------------------------------------------------------------------------------
/avx2/test/test_kyber.c:
--------------------------------------------------------------------------------
1 | ../../ref/test/test_kyber.c


--------------------------------------------------------------------------------
/avx2/test/test_speed.c:
--------------------------------------------------------------------------------
1 | ../../ref/test/test_speed.c


--------------------------------------------------------------------------------
/avx2/test/test_vectors.c:
--------------------------------------------------------------------------------
1 | ../../ref/test/test_vectors.c


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
 1 | Joppe Bos,
 2 | Léo Ducas,
 3 | Eike Kiltz,
 4 | Tancrède Lepoint,
 5 | Vadim Lyubashevsky,
 6 | John Schanck,
 7 | Peter Schwabe,
 8 | Gregor Seiler,
 9 | Damien Stehlé
10 | 


--------------------------------------------------------------------------------
/avx2/keccak4x/KeccakP-SIMD256-config.h:
--------------------------------------------------------------------------------
1 | #define KeccakP1600times4_implementation_config "AVX2, all rounds unrolled"
2 | #define KeccakP1600times4_fullUnrolling
3 | #define KeccakP1600times4_useAVX2
4 | 


--------------------------------------------------------------------------------
/ref/randombytes.h:
--------------------------------------------------------------------------------
 1 | #ifndef RANDOMBYTES_H
 2 | #define RANDOMBYTES_H
 3 | 
 4 | #include <stddef.h>
 5 | #include <stdint.h>
 6 | 
 7 | void randombytes(uint8_t *out, size_t outlen);
 8 | 
 9 | #endif
10 | 


--------------------------------------------------------------------------------
/ref/test/speed_print.h:
--------------------------------------------------------------------------------
 1 | #ifndef PRINT_SPEED_H
 2 | #define PRINT_SPEED_H
 3 | 
 4 | #include <stddef.h>
 5 | #include <stdint.h>
 6 | 
 7 | void print_results(const char *s, uint64_t *t, size_t tlen);
 8 | 
 9 | #endif
10 | 


--------------------------------------------------------------------------------
/SHA256SUMS:
--------------------------------------------------------------------------------
1 | 4d34994299a8f8dcb36c550951a00f6e16918d6d5b6f280ee2aa12a7bf8375a0  tvecs512
2 | b59ac4d2b429b1f8c3b8a5542fb638179da2fd8b1212891d2f976e70e219fed1  tvecs768
3 | 3f577090c7cb7a345ce0417a2a2353153a9f1b8d79f8d927cb6a7b4ec17fd2a1  tvecs1024
4 | 


--------------------------------------------------------------------------------
/avx2/.gitignore:
--------------------------------------------------------------------------------
 1 | *.so
 2 | *.o
 3 | test/test_kyber1024
 4 | test/test_kyber512
 5 | test/test_kyber768
 6 | test/test_speed1024
 7 | test/test_speed512
 8 | test/test_speed768
 9 | test/test_vectors1024
10 | test/test_vectors512
11 | test/test_vectors768
12 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/);
2 | or Apache 2.0 License (https://www.apache.org/licenses/LICENSE-2.0.html).
3 | 
4 | For Keccak and AES we are using public-domain
5 | code from sources and by authors listed in
6 | comments on top of the respective files.
7 | 


--------------------------------------------------------------------------------
/avx2/reduce.h:
--------------------------------------------------------------------------------
 1 | #ifndef REDUCE_H
 2 | #define REDUCE_H
 3 | 
 4 | #include "params.h"
 5 | #include <immintrin.h>
 6 | 
 7 | #define reduce_avx KYBER_NAMESPACE(reduce_avx)
 8 | void reduce_avx(__m256i *r, const __m256i *qdata);
 9 | #define tomont_avx KYBER_NAMESPACE(tomont_avx)
10 | void tomont_avx(__m256i *r, const __m256i *qdata);
11 | 
12 | #endif
13 | 


--------------------------------------------------------------------------------
/ref/.gitignore:
--------------------------------------------------------------------------------
 1 | *.so
 2 | *.o
 3 | test/test_kyber1024
 4 | test/test_kyber512
 5 | test/test_kyber768
 6 | test/test_speed1024
 7 | test/test_speed512
 8 | test/test_speed768
 9 | test/test_vectors1024
10 | test/test_vectors512
11 | test/test_vectors768
12 | nistkat/PQCgenKAT_kem512
13 | nistkat/PQCgenKAT_kem768
14 | nistkat/PQCgenKAT_kem1024
15 | nistkat/*.req
16 | nistkat/*.rsp
17 | 


--------------------------------------------------------------------------------
/ref/test/cpucycles.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include "cpucycles.h"
 3 | 
 4 | /* Smallest difference seen between two back-to-back cpucycles() reads. */
 5 | uint64_t cpucycles_overhead(void) {
 6 |   uint64_t best = UINT64_MAX;
 7 |   unsigned int round;
 8 | 
 9 |   for(round = 100000; round > 0; round--) {
10 |     const uint64_t start = cpucycles();
11 |     __asm__ volatile ("");  /* empty asm statement kept between the two reads */
12 |     const uint64_t delta = cpucycles() - start;
13 |     if(delta < best)
14 |       best = delta;
15 |   }
16 |   return best;
17 | }
18 | 


--------------------------------------------------------------------------------
/ref/cbd.h:
--------------------------------------------------------------------------------
 1 | #ifndef CBD_H
 2 | #define CBD_H
 3 | 
 4 | #include <stdint.h>
 5 | #include "params.h"
 6 | #include "poly.h"
 7 | 
 8 | #define poly_cbd_eta1 KYBER_NAMESPACE(poly_cbd_eta1)
 9 | void poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1*KYBER_N/4]);
10 | 
11 | #define poly_cbd_eta2 KYBER_NAMESPACE(poly_cbd_eta2)
12 | void poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2*KYBER_N/4]);
13 | 
14 | #endif
15 | 


--------------------------------------------------------------------------------
/ref/reduce.h:
--------------------------------------------------------------------------------
 1 | #ifndef REDUCE_H
 2 | #define REDUCE_H
 3 | 
 4 | #include <stdint.h>
 5 | #include "params.h"
 6 | 
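 7 | #define MONT -1044 // 2^16 mod q, as a signed representative (congruent to 2285 if q = 3329)
 8 | #define QINV -3327 // q^-1 mod 2^16, as a signed 16-bit value (congruent to 62209)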
 9 | 
10 | #define montgomery_reduce KYBER_NAMESPACE(montgomery_reduce)
11 | int16_t montgomery_reduce(int32_t a);
12 | 
13 | #define barrett_reduce KYBER_NAMESPACE(barrett_reduce)
14 | int16_t barrett_reduce(int16_t a);
15 | 
16 | #endif
17 | 


--------------------------------------------------------------------------------
/avx2/cbd.h:
--------------------------------------------------------------------------------
 1 | #ifndef CBD_H
 2 | #define CBD_H
 3 | 
 4 | #include <stdint.h>
 5 | #include <immintrin.h>
 6 | #include "params.h"
 7 | #include "poly.h"
 8 | 
 9 | #define poly_cbd_eta1 KYBER_NAMESPACE(poly_cbd_eta1)
10 | void poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1*KYBER_N/128+1]);
11 | 
12 | #define poly_cbd_eta2 KYBER_NAMESPACE(poly_cbd_eta2)
13 | void poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2*KYBER_N/128]);
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/avx2/align.h:
--------------------------------------------------------------------------------
 1 | #ifndef ALIGN_H
 2 | #define ALIGN_H
 3 | 
 4 | #include <stdint.h>
 5 | #include <immintrin.h>
 6 | 
 7 | #define ALIGNED_UINT8(N)          /* union: N bytes overlaid with enough __m256i to cover them */ \
 8 |     union {                       \
 9 |         uint8_t coeffs[(N)];      /* (N): an argument such as 1 << 6 must stay one operand */ \
10 |         __m256i vec[((N)+31)/32]; /* number of 32-byte vectors, rounded up */ \
11 |     }
12 | 
13 | #define ALIGNED_INT16(N)          /* union: N int16_t overlaid with enough __m256i to cover them */ \
14 |     union {                       \
15 |         int16_t coeffs[(N)];      /* (N): an argument such as 1 << 4 must stay one operand */ \
16 |         __m256i vec[((N)+15)/16]; /* number of 16-element vectors, rounded up */ \
17 |     }
18 | 
19 | #endif
20 | 


--------------------------------------------------------------------------------
/runlcov.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh -e
 2 | # Collects lcov coverage of the reference implementation for all three
 3 | # parameter sets and merges the traces into ref/kyber.lcov.
 4 | 
 5 | cd ref
 6 | 
 7 | for alg in 512 768 1024; do
 8 |   make -B "test/test_kyber$alg" CFLAGS="-O0 -g --coverage"
 9 |   "./test/test_kyber$alg"
10 |   lcov -c -d . -o "kyber$alg.lcov"
11 |   lcov -z -d .   # reset the counters before the next parameter set
12 |   rm "test/test_kyber$alg"
13 | done
14 | 
15 | # Merge the per-parameter-set traces (no trailing backslash on the last line).
16 | lcov -o kyber.lcov \
17 |   -a kyber512.lcov \
18 |   -a kyber768.lcov \
19 |   -a kyber1024.lcov
20 | 
21 | # Remove the test driver itself from the merged trace.
22 | lcov -r kyber.lcov -o kyber.lcov \
23 |   '*/test/test_kyber.c'
24 | 
25 | exit 0
26 | 


--------------------------------------------------------------------------------
/avx2/rejsample.h:
--------------------------------------------------------------------------------
 1 | #ifndef REJSAMPLE_H
 2 | #define REJSAMPLE_H
 3 | 
 4 | #include <stdint.h>
 5 | #include "params.h"
 6 | #include "symmetric.h"
 7 | 
 8 | #define REJ_UNIFORM_AVX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* block count: 12*KYBER_N/8 bytes (KYBER_N 12-bit values) scaled by 2^12/KYBER_Q, divided by the block size rounded down, plus one block; NOTE(review): 2^12/KYBER_Q presumably compensates for rejected candidates - confirm */
 9 | #define REJ_UNIFORM_AVX_BUFLEN (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES) /* the same amount expressed in bytes */
10 | 
11 | #define rej_uniform_avx KYBER_NAMESPACE(rej_uniform_avx)
12 | unsigned int rej_uniform_avx(int16_t *r, const uint8_t *buf);
13 | 
14 | #endif
15 | 


--------------------------------------------------------------------------------
/ref/verify.h:
--------------------------------------------------------------------------------
 1 | #ifndef VERIFY_H
 2 | #define VERIFY_H
 3 | 
 4 | #include <stddef.h>
 5 | #include <stdint.h>
 6 | #include "params.h"
 7 | 
 8 | #define verify KYBER_NAMESPACE(verify)
 9 | int verify(const uint8_t *a, const uint8_t *b, size_t len);
10 | 
11 | #define cmov KYBER_NAMESPACE(cmov)
12 | void cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);
13 | 
14 | #define cmov_int16 KYBER_NAMESPACE(cmov_int16)
15 | void cmov_int16(int16_t *r, int16_t v, uint16_t b);
16 | 
17 | #endif
18 | 


--------------------------------------------------------------------------------
/ref/ntt.h:
--------------------------------------------------------------------------------
 1 | #ifndef NTT_H
 2 | #define NTT_H
 3 | 
 4 | #include <stdint.h>
 5 | #include "params.h"
 6 | 
 7 | #define zetas KYBER_NAMESPACE(zetas)
 8 | extern const int16_t zetas[128];
 9 | 
10 | #define ntt KYBER_NAMESPACE(ntt)
11 | void ntt(int16_t poly[256]);
12 | 
13 | #define invntt KYBER_NAMESPACE(invntt)
14 | void invntt(int16_t poly[256]);
15 | 
16 | #define basemul KYBER_NAMESPACE(basemul)
17 | void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta);
18 | 
19 | #endif
20 | 


--------------------------------------------------------------------------------
/avx2/shuffle.inc:
--------------------------------------------------------------------------------
 1 | .macro shuffle8 r0,r1,r2,r3	# regroup by 128-bit lane; arguments are ymm register numbers, r2 and r3 receive the results
 2 | vperm2i128	$0x20,%ymm\r1,%ymm\r0,%ymm\r2	# r2 = (low lane of r0, low lane of r1)
 3 | vperm2i128	$0x31,%ymm\r1,%ymm\r0,%ymm\r3	# r3 = (high lane of r0, high lane of r1)
 4 | .endm
 5 | 
 6 | .macro shuffle4 r0,r1,r2,r3	# interleave 64-bit elements of r0 and r1 within each 128-bit lane
 7 | vpunpcklqdq	%ymm\r1,%ymm\r0,%ymm\r2	# r2 = per lane (low qword of r0, low qword of r1)
 8 | vpunpckhqdq	%ymm\r1,%ymm\r0,%ymm\r3	# r3 = per lane (high qword of r0, high qword of r1)
 9 | .endm
10 | 
11 | .macro shuffle2 r0,r1,r2,r3	# interleave 32-bit elements of r0 and r1; note that r0 is overwritten
12 | #vpsllq		$32,%ymm\r1,%ymm\r2
13 | vmovsldup	%ymm\r1,%ymm\r2	# r2 = r1 with every even dword copied into the odd slot after it
14 | vpblendd	$0xAA,%ymm\r2,%ymm\r0,%ymm\r2	# r2 = even dwords of r0 paired with even dwords of r1
15 | vpsrlq		$32,%ymm\r0,%ymm\r0	# move the odd dwords of r0 down into the even slots (clobbers r0)
16 | #vmovshdup	%ymm\r0,%ymm\r0
17 | vpblendd	$0xAA,%ymm\r1,%ymm\r0,%ymm\r3	# r3 = odd dwords of r0 paired with odd dwords of r1
18 | .endm
19 | 
20 | .macro shuffle1 r0,r1,r2,r3	# interleave 16-bit elements of r0 and r1; note that r0 is overwritten
21 | vpslld		$16,%ymm\r1,%ymm\r2	# r2 = r1 with its even words moved up into the odd slots
22 | vpblendw	$0xAA,%ymm\r2,%ymm\r0,%ymm\r2	# r2 = even words of r0 paired with even words of r1
23 | vpsrld		$16,%ymm\r0,%ymm\r0	# move the odd words of r0 down into the even slots (clobbers r0)
24 | vpblendw	$0xAA,%ymm\r1,%ymm\r0,%ymm\r3	# r3 = odd words of r0 paired with odd words of r1
25 | .endm
26 | 


--------------------------------------------------------------------------------
/ref/test/cpucycles.h:
--------------------------------------------------------------------------------
 1 | #ifndef CPUCYCLES_H
 2 | #define CPUCYCLES_H
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | #ifdef USE_RDPMC  /* Needs echo 2 > /sys/devices/cpu/rdpmc */
 7 | 
 8 | static inline uint64_t cpucycles(void) { /* USE_RDPMC variant: reads a performance counter with rdpmc */
 9 |   const uint32_t ecx = (1U << 30) + 1; /* counter selector passed in ecx; NOTE(review): presumably fixed-function counter 1 - confirm against the Intel SDM */
10 |   uint64_t result;
11 | 
12 |   __asm__ volatile ("rdpmc; shlq $32,%%rdx; orq %%rdx,%%rax" /* combine the halves: rax = (rdx << 32) | rax */
13 |     : "=a" (result) : "c" (ecx) : "rdx"); /* output in rax, input in ecx, rdx is clobbered */
14 | 
15 |   return result;
16 | }
17 | 
18 | #else
19 | 
20 | static inline uint64_t cpucycles(void) { /* default variant: reads the time-stamp counter with rdtsc */
21 |   uint64_t result;
22 | 
23 |   __asm__ volatile ("rdtsc; shlq $32,%%rdx; orq %%rdx,%%rax" /* combine the halves: rax = (rdx << 32) | rax */
24 |     : "=a" (result) : : "%rdx"); /* output in rax, no inputs, rdx is clobbered */
25 | 
26 |   return result;
27 | }
28 | 
29 | #endif
30 | 
31 | uint64_t cpucycles_overhead(void);
32 | 
33 | #endif
34 | 


--------------------------------------------------------------------------------
/avx2/fq.inc:
--------------------------------------------------------------------------------
 1 | .macro red16 r,rs=0,x=12	# r -= ymm0 * (quotient estimate from ymm1), x is scratch; NOTE(review): presumably a Barrett reduction with ymm0 = q and ymm1 = v in every lane - confirm at the call sites
 2 | vpmulhw         %ymm1,%ymm\r,%ymm\x	# x = high 16 bits of r * ymm1, per 16-bit lane
 3 | .if \rs
 4 | vpmulhrsw	%ymm\rs,%ymm\x,%ymm\x	# rs given: rounding high multiply by that register instead of the shift
 5 | .else
 6 | vpsraw          $10,%ymm\x,%ymm\x	# default (rs = 0): arithmetic shift right by 10
 7 | .endif
 8 | vpmullw         %ymm0,%ymm\x,%ymm\x	# x = low 16 bits of x * ymm0
 9 | vpsubw          %ymm\x,%ymm\r,%ymm\r	# r = r - x
10 | .endm
11 | 
12 | .macro csubq r,x=12	# subtract ymm0 from r, then add it back in the lanes that went negative; x is scratch (ymm0 presumably holds q - confirm)
13 | vpsubw		%ymm0,%ymm\r,%ymm\r	# r = r - ymm0
14 | vpsraw		$15,%ymm\r,%ymm\x	# x = all ones where r is negative, else zero
15 | vpand		%ymm0,%ymm\x,%ymm\x	# x = ymm0 in the negative lanes, else zero
16 | vpaddw		%ymm\x,%ymm\r,%ymm\r	# r = r + x
17 | .endm
18 | 
19 | .macro caddq r,x=12	# add ymm0 to every negative lane of r; x is scratch (ymm0 presumably holds q - confirm)
20 | vpsraw		$15,%ymm\r,%ymm\x	# x = all ones where r is negative, else zero
21 | vpand		%ymm0,%ymm\x,%ymm\x	# x = ymm0 in the negative lanes, else zero
22 | vpaddw		%ymm\x,%ymm\r,%ymm\r	# r = r + x
23 | .endm
24 | 
25 | .macro fqmulprecomp al,ah,b,x=12	# b = high(ah*b) - high(ymm0 * low(al*b)), x is scratch; NOTE(review): presumably a Montgomery multiplication with a precomputed factor pair al/ah - confirm
26 | vpmullw		%ymm\al,%ymm\b,%ymm\x	# x = low 16 bits of al * b
27 | vpmulhw		%ymm\ah,%ymm\b,%ymm\b	# b = high 16 bits of ah * b
28 | vpmulhw		%ymm0,%ymm\x,%ymm\x	# x = high 16 bits of x * ymm0
29 | vpsubw		%ymm\x,%ymm\b,%ymm\b	# b = b - x
30 | .endm
31 | 


--------------------------------------------------------------------------------
/runtests.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh -e
 2 | # Builds each implementation, runs its tests (natively and under valgrind)
 3 | # and checks the generated test vectors against SHA256SUMS.
 4 | 
 5 | # Parallel make jobs: honour a caller-supplied $nproc, otherwise ask the
 6 | # nproc command, and fall back to 2 when that command is unavailable.
 7 | nproc="${nproc:-$(nproc 2>/dev/null || echo 2)}"
 8 | 
 9 | ARCH="${ARCH:-amd64}"
10 | ARCH="${TRAVIS_CPU_ARCH:-$ARCH}"
11 | 
12 | # avx2 is only built on amd64, and not when TRAVIS_OS_NAME is "osx".
13 | if [ "$ARCH" = "amd64" ] && [ "${TRAVIS_OS_NAME:-}" != "osx" ]; then
14 |   DIRS="ref avx2"
15 | else
16 |   DIRS="ref"
17 | fi
18 | 
19 | if [ "$ARCH" = "amd64" ] || [ "$ARCH" = "arm64" ]; then
20 |   export CC=/usr/bin/gcc
21 | #  export CFLAGS="-fsanitize=undefined,address ${CFLAGS}"
22 | fi
23 | 
24 | for dir in $DIRS; do
25 |   make -j"$nproc" -C "$dir" clean
26 |   make -j"$nproc" -C "$dir"
27 |   for alg in 512 768 1024; do
28 |     valgrind --vex-guest-max-insns=25 "./$dir/test/test_kyber$alg"
29 |     echo "test_kyber$alg"
30 |     "./$dir/test/test_kyber$alg"
31 |     "./$dir/test/test_vectors$alg" > "tvecs$alg"
32 |   done
33 |   shasum -a256 -c SHA256SUMS
34 | done
35 | 
36 | exit 0
37 | 


--------------------------------------------------------------------------------
/avx2/ntt.h:
--------------------------------------------------------------------------------
 1 | #ifndef NTT_H
 2 | #define NTT_H
 3 | 
 4 | #include <stdint.h>
 5 | #include <immintrin.h>
 6 | 
 7 | #define ntt_avx KYBER_NAMESPACE(ntt_avx)
 8 | void ntt_avx(__m256i *r, const __m256i *qdata);
 9 | #define invntt_avx KYBER_NAMESPACE(invntt_avx)
10 | void invntt_avx(__m256i *r, const __m256i *qdata);
11 | 
12 | #define nttpack_avx KYBER_NAMESPACE(nttpack_avx)
13 | void nttpack_avx(__m256i *r, const __m256i *qdata);
14 | #define nttunpack_avx KYBER_NAMESPACE(nttunpack_avx)
15 | void nttunpack_avx(__m256i *r, const __m256i *qdata);
16 | 
17 | #define basemul_avx KYBER_NAMESPACE(basemul_avx)
18 | void basemul_avx(__m256i *r,
19 |                  const __m256i *a,
20 |                  const __m256i *b,
21 |                  const __m256i *qdata);
22 | 
23 | #define ntttobytes_avx KYBER_NAMESPACE(ntttobytes_avx)
24 | void ntttobytes_avx(uint8_t *r, const __m256i *a, const __m256i *qdata);
25 | #define nttfrombytes_avx KYBER_NAMESPACE(nttfrombytes_avx)
26 | void nttfrombytes_avx(__m256i *r, const uint8_t *a, const __m256i *qdata);
27 | 
28 | #endif
29 | 


--------------------------------------------------------------------------------
/ref/indcpa.h:
--------------------------------------------------------------------------------
 1 | #ifndef INDCPA_H
 2 | #define INDCPA_H
 3 | 
 4 | #include <stdint.h>
 5 | #include "params.h"
 6 | #include "polyvec.h"
 7 | 
 8 | #define gen_matrix KYBER_NAMESPACE(gen_matrix)
 9 | void gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed);
10 | 
11 | #define indcpa_keypair_derand KYBER_NAMESPACE(indcpa_keypair_derand)
12 | void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
13 |                            uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES],
14 |                            const uint8_t coins[KYBER_SYMBYTES]);
15 | 
16 | #define indcpa_enc KYBER_NAMESPACE(indcpa_enc)
17 | void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
18 |                 const uint8_t m[KYBER_INDCPA_MSGBYTES],
19 |                 const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
20 |                 const uint8_t coins[KYBER_SYMBYTES]);
21 | 
22 | #define indcpa_dec KYBER_NAMESPACE(indcpa_dec)
23 | void indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
24 |                 const uint8_t c[KYBER_INDCPA_BYTES],
25 |                 const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]);
26 | 
27 | #endif
28 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: c
 2 | os: linux
 3 | dist: bionic
 4 | compiler:
 5 |   - gcc
 6 |   - clang
 7 | arch:
 8 |   - amd64
 9 |   - arm64
10 |   - ppc64le
11 |   - s390x
12 | script: ./runtests.sh
13 | 
14 | jobs:
15 |   include:
16 |     - os: osx
17 |       compiler: clang
18 |       env:
19 |         - CFLAGS="-I/usr/local/opt/openssl@1.1/include/"
20 |         - NISTFLAGS="-I/usr/local/opt/openssl@1.1/include/"
21 |         - LDFLAGS="-L/usr/local/opt/openssl@1.1/lib/"
22 | 
23 |     - stage: coverage
24 |       os: linux
25 |       compiler: gcc
26 |       before_install:
27 |         - sudo apt-get update
28 |         - sudo apt-get install -y lcov
29 |         - gem install coveralls-lcov
30 |       script: ./runlcov.sh
31 |       after_success:
32 |         - coveralls-lcov ref/kyber.lcov
33 | 
34 |   allow_failures:
35 |     - os: windows
36 |       compiler: msvc19
37 |       before_install:
38 |         - choco install openssl
39 |       script:
40 |         - mkdir build
41 |         - cd build
42 |         - cmake -A x64 ..
43 |         - cmake --build .
44 |         - ctest --output-on-failure
45 | 
46 | 


--------------------------------------------------------------------------------
/avx2/keccak4x/KeccakP-align.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
 3 | Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
 4 | denoted as "the implementer".
 5 | 
 6 | For more information, feedback or questions, please refer to our websites:
 7 | http://keccak.noekeon.org/
 8 | http://keyak.noekeon.org/
 9 | http://ketje.noekeon.org/
10 | 
11 | To the extent possible under law, the implementer has waived all copyright
12 | and related or neighboring rights to the source code in this file.
13 | http://creativecommons.org/publicdomain/zero/1.0/
14 | */
15 | 
16 | #ifndef _keccakp_align_h_
17 | #define _keccakp_align_h_
18 | 
19 | /* on Mac OS-X and possibly others, ALIGN(x) is defined in param.h, and -Werror chokes on the redef. */
20 | #ifdef ALIGN
21 | #undef ALIGN
22 | #endif
23 | 
24 | #if defined(__GNUC__)
25 | #define ALIGN(x) __attribute__ ((aligned(x)))
26 | #elif defined(_MSC_VER)
27 | #define ALIGN(x) __declspec(align(x))
28 | #elif defined(__ARMCC_VERSION)
29 | #define ALIGN(x) __align(x)
30 | #else
31 | #define ALIGN(x)
32 | #endif
33 | 
34 | #endif
35 | 


--------------------------------------------------------------------------------
/Common_META.yml:
--------------------------------------------------------------------------------
 1 | commons:
 2 |   - name: common_ref
 3 |     folder_name: ref
 4 |     sources: fips202.c fips202.h
 5 |   - name: common_aes
 6 |     folder_name: avx2
 7 |     sources: aes256ctr.c aes256ctr.h
 8 |     supported_platforms:
 9 |       - architecture: x86_64
10 |         operating_systems:
11 |           - Darwin
12 |           - Linux
13 |         required_flags:
14 |           - sse2
15 |           - ssse3
16 |   - name: common_avx2
17 |     folder_name: avx2
18 |     sources: fips202.c fips202.h fips202x4.c fips202x4.h
19 |     supported_platforms:
20 |       - architecture: x86_64
21 |         operating_systems:
22 |           - Darwin
23 |           - Linux
24 |         required_flags:
25 |           - avx2
26 |   - name: common_keccak4x_avx2
27 |     folder_name: avx2
28 |     sources: fips202x4.h keccak4x/KeccakP-1600-times4-SIMD256.c keccak4x/KeccakP-1600-times4-SnP.h keccak4x/KeccakP-1600-unrolling.macros keccak4x/KeccakP-SIMD256-config.h keccak4x/KeccakP-align.h keccak4x/KeccakP-brg_endian.h
29 |     supported_platforms:
30 |       - architecture: x86_64
31 |         operating_systems:
32 |           - Darwin
33 |           - Linux
34 |         required_flags:
35 |           - avx2
36 | 


--------------------------------------------------------------------------------
/avx2/consts.h:
--------------------------------------------------------------------------------
 1 | #ifndef CONSTS_H
 2 | #define CONSTS_H
 3 | 
 4 | #include "params.h"
 5 | 
 6 | #define _16XQ            0 /* NOTE(review): these constants look like int16_t element offsets into qdata - confirm in consts.c */
 7 | #define _16XQINV        16
 8 | #define _16XV           32
 9 | #define _16XFLO         48
10 | #define _16XFHI         64
11 | #define _16XMONTSQLO    80
12 | #define _16XMONTSQHI    96
13 | #define _16XMASK       112
14 | #define _REVIDXB       128
15 | #define _REVIDXD       144
16 | #define _ZETAS_EXP     160
17 | #define	_16XSHIFT      624 /* 624 + 16 = 640, the element count of qdata_t declared below */
18 | 
19 | /* On macOS and Windows targets (see the #if below) the C ABI
20 |  * prefixes every exported symbol with an underscore. Assembly code
21 |  * must therefore use the decorated name, both when it defines a
22 |  * symbol that C calls and when it refers to a symbol defined in C.
23 |  *
24 |  * cdecl() produces the namespaced, correctly decorated symbol name.
25 |  */
26 | #ifdef __ASSEMBLER__
27 | #if defined(__WIN32__) || defined(__APPLE__)
28 | #define decorate(s) _##s
29 | #define cdecl2(s) decorate(s)
30 | #define cdecl(s) cdecl2(KYBER_NAMESPACE(##s))
31 | #else
32 | #define cdecl(s) KYBER_NAMESPACE(##s)
33 | #endif
34 | #endif
35 | 
36 | #ifndef __ASSEMBLER__
37 | #include "align.h"
38 | typedef ALIGNED_INT16(640) qdata_t;
39 | #define qdata KYBER_NAMESPACE(qdata)
40 | extern const qdata_t qdata;
41 | #endif
42 | 
43 | #endif
44 | 


--------------------------------------------------------------------------------
/ref/kem.h:
--------------------------------------------------------------------------------
 1 | #ifndef KEM_H
 2 | #define KEM_H
 3 | 
 4 | #include <stdint.h>
 5 | #include "params.h"
 6 | 
 7 | #define CRYPTO_SECRETKEYBYTES  KYBER_SECRETKEYBYTES
 8 | #define CRYPTO_PUBLICKEYBYTES  KYBER_PUBLICKEYBYTES
 9 | #define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES
10 | #define CRYPTO_BYTES           KYBER_SSBYTES
11 | 
12 | #if   (KYBER_K == 2)
13 | #define CRYPTO_ALGNAME "Kyber512"
14 | #elif (KYBER_K == 3)
15 | #define CRYPTO_ALGNAME "Kyber768"
16 | #elif (KYBER_K == 4)
17 | #define CRYPTO_ALGNAME "Kyber1024"
18 | #endif
19 | 
20 | #define crypto_kem_keypair_derand KYBER_NAMESPACE(keypair_derand)
21 | int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
22 | 
23 | #define crypto_kem_keypair KYBER_NAMESPACE(keypair)
24 | int crypto_kem_keypair(uint8_t *pk, uint8_t *sk);
25 | 
26 | #define crypto_kem_enc_derand KYBER_NAMESPACE(enc_derand)
27 | int crypto_kem_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
28 | 
29 | #define crypto_kem_enc KYBER_NAMESPACE(enc)
30 | int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
31 | 
32 | #define crypto_kem_dec KYBER_NAMESPACE(dec)
33 | int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
34 | 
35 | #endif
36 | 


--------------------------------------------------------------------------------
/ref/test/speed_print.c:
--------------------------------------------------------------------------------
 1 | #include <stddef.h>
 2 | #include <stdint.h>
 3 | #include <stdlib.h>
 4 | #include <stdio.h>
 5 | #include "cpucycles.h"
 6 | #include "speed_print.h"
 7 | 
 8 | static int cmp_uint64(const void *a, const void *b) { /* qsort comparator: ascending uint64_t order */
 9 |   const uint64_t x = *(const uint64_t *)a; /* values are only read, so the const qualifier is kept */
10 |   const uint64_t y = *(const uint64_t *)b;
11 |   return (x > y) - (x < y); /* 1, 0 or -1 */
12 | }
13 | 
14 | /* Sorts l ascending in place and returns its median; for an even llen the two middle values are averaged, rounded down. */
15 | static uint64_t median(uint64_t *l, size_t llen) {
16 |   if(llen == 0) return 0; /* nothing to sort; also avoids indexing l[-1] below */
17 |   qsort(l,llen,sizeof(uint64_t),cmp_uint64);
18 |   if(llen%2) return l[llen/2];
19 |   return l[llen/2-1] + (l[llen/2] - l[llen/2-1])/2; } /* same value as (a+b)/2 but cannot wrap; l[llen/2] >= l[llen/2-1] after sorting */
20 | 
21 | /* Mean of t[0..tlen-1], rounded down; the running sum wraps modulo 2^64. */
22 | static uint64_t average(uint64_t *t, size_t tlen) {
23 |   const uint64_t *cur = t;
24 |   const uint64_t *const end = t + tlen;
25 |   uint64_t sum = 0;
26 |   while(cur != end)
27 |     sum += *cur++;
28 |   return sum/tlen;
29 | }
30 | 
31 | /* Prints label s, then the median and average of the tlen-1 differences between consecutive counts in t; t is overwritten and reordered. */
32 | void print_results(const char *s, uint64_t *t, size_t tlen) {
33 |   size_t i;
34 |   static uint64_t overhead = -1; /* (uint64_t)-1 means the overhead has not been measured yet */
35 |   if(tlen < 2) {
36 |     fprintf(stderr, "ERROR: Need at least two cycle counts!\n");
37 |     return;
38 |   }
39 | 
40 |   if(overhead  == (uint64_t)-1) /* measured once, on the first call */
41 |     overhead = cpucycles_overhead();
42 |   /* turn the tlen counts into tlen-1 differences in place, minus the overhead */
43 |   tlen--;
44 |   for(i=0;i<tlen;++i)
45 |     t[i] = t[i+1] - t[i] - overhead;
46 | 
47 |   printf("%s\n", s);
48 |   printf("median: %llu cycles/ticks\n", (unsigned long long)median(t, tlen));
49 |   printf("average: %llu cycles/ticks\n", (unsigned long long)average(t, tlen));
50 |   printf("\n");
51 | }
52 | 


--------------------------------------------------------------------------------
/ref/polyvec.h:
--------------------------------------------------------------------------------
 1 | #ifndef POLYVEC_H
 2 | #define POLYVEC_H
 3 | 
 4 | #include <stdint.h>
 5 | #include "params.h"
 6 | #include "poly.h"
 7 | 
 8 | typedef struct{
 9 |   poly vec[KYBER_K];
10 | } polyvec;
11 | 
12 | #define polyvec_compress KYBER_NAMESPACE(polyvec_compress)
13 | void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a);
14 | #define polyvec_decompress KYBER_NAMESPACE(polyvec_decompress)
15 | void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]);
16 | 
17 | #define polyvec_tobytes KYBER_NAMESPACE(polyvec_tobytes)
18 | void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a);
19 | #define polyvec_frombytes KYBER_NAMESPACE(polyvec_frombytes)
20 | void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);
21 | 
22 | #define polyvec_ntt KYBER_NAMESPACE(polyvec_ntt)
23 | void polyvec_ntt(polyvec *r);
24 | #define polyvec_invntt_tomont KYBER_NAMESPACE(polyvec_invntt_tomont)
25 | void polyvec_invntt_tomont(polyvec *r);
26 | 
27 | #define polyvec_basemul_acc_montgomery KYBER_NAMESPACE(polyvec_basemul_acc_montgomery)
28 | void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b);
29 | 
30 | #define polyvec_reduce KYBER_NAMESPACE(polyvec_reduce)
31 | void polyvec_reduce(polyvec *r);
32 | 
33 | #define polyvec_add KYBER_NAMESPACE(polyvec_add)
34 | void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);
35 | 
36 | #endif
37 | 


--------------------------------------------------------------------------------
/avx2/polyvec.h:
--------------------------------------------------------------------------------
 1 | #ifndef POLYVEC_H
 2 | #define POLYVEC_H
 3 | 
 4 | #include <stdint.h>
 5 | #include "params.h"
 6 | #include "poly.h"
 7 | 
 8 | typedef struct{ /* vector of KYBER_K polynomials */
 9 |   poly vec[KYBER_K]; /* the components; poly comes from poly.h included above */
10 | } polyvec;
11 | 
12 | #define polyvec_compress KYBER_NAMESPACE(polyvec_compress)
13 | void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES+2], const polyvec *a);
14 | #define polyvec_decompress KYBER_NAMESPACE(polyvec_decompress)
15 | void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES+12]);
16 | 
17 | #define polyvec_tobytes KYBER_NAMESPACE(polyvec_tobytes)
18 | void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a);
19 | #define polyvec_frombytes KYBER_NAMESPACE(polyvec_frombytes)
20 | void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);
21 | 
22 | #define polyvec_ntt KYBER_NAMESPACE(polyvec_ntt)
23 | void polyvec_ntt(polyvec *r);
24 | #define polyvec_invntt_tomont KYBER_NAMESPACE(polyvec_invntt_tomont)
25 | void polyvec_invntt_tomont(polyvec *r);
26 | 
27 | #define polyvec_basemul_acc_montgomery KYBER_NAMESPACE(polyvec_basemul_acc_montgomery)
28 | void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b);
29 | 
30 | #define polyvec_reduce KYBER_NAMESPACE(polyvec_reduce)
31 | void polyvec_reduce(polyvec *r);
32 | 
33 | #define polyvec_add KYBER_NAMESPACE(polyvec_add)
34 | void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);
35 | 
36 | #endif
37 | 


--------------------------------------------------------------------------------
/ref/symmetric.h:
--------------------------------------------------------------------------------
 1 | #ifndef SYMMETRIC_H
 2 | #define SYMMETRIC_H
 3 | 
 4 | #include <stddef.h>
 5 | #include <stdint.h>
 6 | #include "params.h"
 7 | 
 8 | #include "fips202.h"
 9 | 
10 | typedef keccak_state xof_state;
11 | 
12 | #define kyber_shake128_absorb KYBER_NAMESPACE(kyber_shake128_absorb)
13 | void kyber_shake128_absorb(keccak_state *s,
14 |                            const uint8_t seed[KYBER_SYMBYTES],
15 |                            uint8_t x,
16 |                            uint8_t y);
17 | 
18 | #define kyber_shake256_prf KYBER_NAMESPACE(kyber_shake256_prf)
19 | void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce);
20 | 
21 | #define kyber_shake256_rkprf KYBER_NAMESPACE(kyber_shake256_rkprf)
22 | void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]);
23 | 
24 | #define XOF_BLOCKBYTES SHAKE128_RATE
25 | 
26 | #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
27 | #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
28 | #define xof_absorb(STATE, SEED, X, Y) kyber_shake128_absorb(STATE, SEED, X, Y)
29 | #define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
30 | #define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
31 | #define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT)
32 | 
33 | #endif /* SYMMETRIC_H */
34 | 


--------------------------------------------------------------------------------
/ref/reduce.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include "params.h"
 3 | #include "reduce.h"
 4 | 
 5 | /*************************************************
 6 | * Name:        montgomery_reduce
 7 | *
 8 | * Description: Montgomery reduction with R = 2^16: maps a 32-bit integer a
 9 | *              to a 16-bit integer congruent to a * R^-1 modulo q.
10 | *
11 | * Arguments:   - int32_t a: value to be reduced;
12 | *                           must lie in {-q2^15,...,q2^15-1}
13 | *
14 | * Returns:     integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q.
15 | **************************************************/
16 | int16_t montgomery_reduce(int32_t a)
17 | {
18 |   /* Low 16 bits of a*QINV (QINV is defined outside this file). */
19 |   const int16_t m = (int16_t)a*QINV;
20 |   /* Subtract m*q, then keep the upper 16 bits of the 32-bit difference. */
21 |   const int32_t diff = a - (int32_t)m*KYBER_Q;
22 |   return (int16_t)(diff >> 16);
23 | }
24 | 
25 | /*************************************************
26 | * Name:        barrett_reduce
27 | *
28 | * Description: Barrett reduction: maps a 16-bit integer a to the centered
29 | *              representative of a mod q, a value in {-(q-1)/2,...,(q-1)/2}
30 | *
31 | * Arguments:   - int16_t a: value to be reduced
32 | *
33 | * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
34 | **************************************************/
35 | int16_t barrett_reduce(int16_t a) {
36 |   /* v is 2^26/q rounded to the nearest integer */
37 |   const int16_t v = ((1<<26) + KYBER_Q/2)/KYBER_Q;
38 |   /* rounded value of a*v / 2^26, i.e. the estimated quotient a/q */
39 |   const int16_t quot = ((int32_t)v*a + (1<<25)) >> 26;
40 |   const int16_t multiple = quot*KYBER_Q;
41 |   return a - multiple;
42 | }
43 | 


--------------------------------------------------------------------------------
/avx2/symmetric.h:
--------------------------------------------------------------------------------
 1 | #ifndef SYMMETRIC_H
 2 | #define SYMMETRIC_H
 3 | 
 4 | #include <stddef.h>
 5 | #include <stdint.h>
 6 | #include "params.h"
 7 | 
 8 | #include "fips202.h"
 9 | #include "fips202x4.h"
10 | 
11 | typedef keccak_state xof_state;
12 | 
13 | #define kyber_shake128_absorb KYBER_NAMESPACE(kyber_shake128_absorb)
14 | void kyber_shake128_absorb(keccak_state *s,
15 |                            const uint8_t seed[KYBER_SYMBYTES],
16 |                            uint8_t x,
17 |                            uint8_t y);
18 | 
19 | #define kyber_shake256_prf KYBER_NAMESPACE(kyber_shake256_prf)
20 | void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce);
21 | 
22 | #define kyber_shake256_rkprf KYBER_NAMESPACE(kyber_shake256_rkprf)
23 | void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]);
24 | 
25 | #define XOF_BLOCKBYTES SHAKE128_RATE
26 | 
27 | #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
28 | #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
29 | #define xof_absorb(STATE, SEED, X, Y) kyber_shake128_absorb(STATE, SEED, X, Y)
30 | #define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
31 | #define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
32 | #define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT)
33 | 
34 | #endif /* SYMMETRIC_H */
35 | 


--------------------------------------------------------------------------------
/ref/nistkat/rng.h:
--------------------------------------------------------------------------------
 1 | //
 2 | //  rng.h
 3 | //
 4 | //  Created by Bassham, Lawrence E (Fed) on 8/29/17.
 5 | //  Copyright © 2017 Bassham, Lawrence E (Fed). All rights reserved.
 6 | //
 7 | 
 8 | #ifndef rng_h
 9 | #define rng_h
10 | 
11 | #include <stdio.h>
12 | 
13 | #define RNG_SUCCESS      0
14 | #define RNG_BAD_MAXLEN  -1
15 | #define RNG_BAD_OUTBUF  -2
16 | #define RNG_BAD_REQ_LEN -3
17 | 
18 | typedef struct { /* context passed to seedexpander_init() and seedexpander() declared below */
19 |     unsigned char   buffer[16]; /* 16-byte buffer; presumably one cipher output block - confirm in rng.c */
20 |     int             buffer_pos; /* presumably the read offset into buffer - confirm in rng.c */
21 |     unsigned long   length_remaining; /* presumably the output bytes still allowed (cf. maxlen) - confirm in rng.c */
22 |     unsigned char   key[32]; /* 32-byte key */
23 |     unsigned char   ctr[16]; /* 16-byte counter block */
24 | } AES_XOF_struct;
25 | 
26 | typedef struct { /* DRBG state; no prototype in this header takes it, presumably used inside rng.c */
27 |     unsigned char   Key[32]; /* 32-byte key; AES256_CTR_DRBG_Update() below has a parameter of the same name */
28 |     unsigned char   V[16]; /* 16-byte value; AES256_CTR_DRBG_Update() below has a parameter of the same name */
29 |     int             reseed_counter; /* presumably counts generate calls since seeding - confirm in rng.c */
30 | } AES256_CTR_DRBG_struct;
31 | 
32 | 
33 | void
34 | AES256_CTR_DRBG_Update(unsigned char *provided_data,
35 |                        unsigned char *Key,
36 |                        unsigned char *V);
37 | 
38 | int
39 | seedexpander_init(AES_XOF_struct *ctx,
40 |                   unsigned char *seed,
41 |                   unsigned char *diversifier,
42 |                   unsigned long maxlen);
43 | 
44 | int
45 | seedexpander(AES_XOF_struct *ctx, unsigned char *x, unsigned long xlen);
46 | 
47 | void
48 | randombytes_init(unsigned char *entropy_input,
49 |                  unsigned char *personalization_string,
50 |                  int security_strength);
51 | 
52 | int
53 | randombytes(unsigned char *x, unsigned long long xlen);
54 | 
55 | #endif /* rng_h */
56 | 


--------------------------------------------------------------------------------
/avx2/fq.S:
--------------------------------------------------------------------------------
 1 | #include "consts.h"
 2 | .include "fq.inc"
 3 | 
 4 | .text
 5 | reduce128_avx: /* file-local helper (not .global): applies red16 to the 256 bytes at (%rdi), in place */
 6 | #load
 7 | vmovdqa		(%rdi),%ymm2 /* eight aligned 32-byte loads fill ymm2..ymm9 */
 8 | vmovdqa		32(%rdi),%ymm3
 9 | vmovdqa		64(%rdi),%ymm4
10 | vmovdqa		96(%rdi),%ymm5
11 | vmovdqa		128(%rdi),%ymm6
12 | vmovdqa		160(%rdi),%ymm7
13 | vmovdqa		192(%rdi),%ymm8
14 | vmovdqa		224(%rdi),%ymm9
15 | 
16 | red16		2 /* red16 is not defined in this file (fq.inc is .include'd); arguments 2..9 match the registers loaded above */
17 | red16		3
18 | red16		4
19 | red16		5
20 | red16		6
21 | red16		7
22 | red16		8
23 | red16		9
24 | 
25 | #store
26 | vmovdqa		%ymm2,(%rdi) /* write ymm2..ymm9 back to the addresses they were loaded from */
27 | vmovdqa		%ymm3,32(%rdi)
28 | vmovdqa		%ymm4,64(%rdi)
29 | vmovdqa		%ymm5,96(%rdi)
30 | vmovdqa		%ymm6,128(%rdi)
31 | vmovdqa		%ymm7,160(%rdi)
32 | vmovdqa		%ymm8,192(%rdi)
33 | vmovdqa		%ymm9,224(%rdi)
34 | 
35 | ret
36 | 
37 | .global cdecl(reduce_avx)
38 | cdecl(reduce_avx): /* exported entry: runs reduce128_avx over the 512 bytes starting at (%rdi) */
39 | #consts
40 | vmovdqa		_16XQ*2(%rsi),%ymm0 /* constants are read relative to %rsi; _16XQ and _16XV are not defined in this file */
41 | vmovdqa		_16XV*2(%rsi),%ymm1 /* presumably red16 consumes ymm0/ymm1 - confirm in fq.inc */
42 | call		reduce128_avx /* first 256 bytes */
43 | add		$256,%rdi
44 | call		reduce128_avx /* following 256 bytes */
45 | ret
46 | 
47 | tomont128_avx: /* file-local helper (not .global): applies fqmulprecomp to the 256 bytes at (%rdi), in place */
48 | #load
49 | vmovdqa		(%rdi),%ymm3 /* eight aligned 32-byte loads fill ymm3..ymm10 */
50 | vmovdqa		32(%rdi),%ymm4
51 | vmovdqa		64(%rdi),%ymm5
52 | vmovdqa		96(%rdi),%ymm6
53 | vmovdqa		128(%rdi),%ymm7
54 | vmovdqa		160(%rdi),%ymm8
55 | vmovdqa		192(%rdi),%ymm9
56 | vmovdqa		224(%rdi),%ymm10
57 | 
58 | fqmulprecomp	1,2,3,11 /* macro not defined in this file (fq.inc is .include'd); third argument matches the data register */
59 | fqmulprecomp	1,2,4,12 /* presumably 1,2 name ymm1/ymm2 set by the caller and the last argument is a scratch register - confirm in fq.inc */
60 | fqmulprecomp	1,2,5,13
61 | fqmulprecomp	1,2,6,14
62 | fqmulprecomp	1,2,7,15
63 | fqmulprecomp	1,2,8,11
64 | fqmulprecomp	1,2,9,12
65 | fqmulprecomp	1,2,10,13
66 | 
67 | #store
68 | vmovdqa		%ymm3,(%rdi) /* write ymm3..ymm10 back to the addresses they were loaded from */
69 | vmovdqa		%ymm4,32(%rdi)
70 | vmovdqa		%ymm5,64(%rdi)
71 | vmovdqa		%ymm6,96(%rdi)
72 | vmovdqa		%ymm7,128(%rdi)
73 | vmovdqa		%ymm8,160(%rdi)
74 | vmovdqa		%ymm9,192(%rdi)
75 | vmovdqa		%ymm10,224(%rdi)
76 | 
77 | ret
78 | 
79 | .global cdecl(tomont_avx)
80 | cdecl(tomont_avx): /* exported entry: runs tomont128_avx over the 512 bytes starting at (%rdi) */
81 | #consts
82 | vmovdqa		_16XQ*2(%rsi),%ymm0 /* constants are read relative to %rsi; the _16X* offsets are not defined in this file */
83 | vmovdqa		_16XMONTSQLO*2(%rsi),%ymm1
84 | vmovdqa		_16XMONTSQHI*2(%rsi),%ymm2
85 | call		tomont128_avx /* first 256 bytes */
86 | add		$256,%rdi
87 | call		tomont128_avx /* following 256 bytes */
88 | ret
89 | 


--------------------------------------------------------------------------------
/ref/params.h:
--------------------------------------------------------------------------------
 1 | #ifndef PARAMS_H
 2 | #define PARAMS_H
 3 | 
 4 | #ifndef KYBER_K
 5 | #define KYBER_K 3	/* Change this for different security strengths */
 6 | #endif
 7 | 
 8 | 
 9 | /* Don't change parameters below this line */
10 | #if   (KYBER_K == 2)
11 | #define KYBER_NAMESPACE(s) pqcrystals_kyber512_ref_##s
12 | #elif (KYBER_K == 3)
13 | #define KYBER_NAMESPACE(s) pqcrystals_kyber768_ref_##s
14 | #elif (KYBER_K == 4)
15 | #define KYBER_NAMESPACE(s) pqcrystals_kyber1024_ref_##s
16 | #else
17 | #error "KYBER_K must be in {2,3,4}"
18 | #endif
19 | 
20 | #define KYBER_N 256
21 | #define KYBER_Q 3329
22 | 
23 | #define KYBER_SYMBYTES 32   /* size in bytes of hashes, and seeds */
24 | #define KYBER_SSBYTES  32   /* size in bytes of shared key */
25 | 
26 | #define KYBER_POLYBYTES		384
27 | #define KYBER_POLYVECBYTES	(KYBER_K * KYBER_POLYBYTES)
28 | 
29 | #if KYBER_K == 2
30 | #define KYBER_ETA1 3
31 | #define KYBER_POLYCOMPRESSEDBYTES    128
32 | #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
33 | #elif KYBER_K == 3
34 | #define KYBER_ETA1 2
35 | #define KYBER_POLYCOMPRESSEDBYTES    128
36 | #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
37 | #elif KYBER_K == 4
38 | #define KYBER_ETA1 2
39 | #define KYBER_POLYCOMPRESSEDBYTES    160
40 | #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)
41 | #endif
42 | 
43 | #define KYBER_ETA2 2
44 | 
45 | #define KYBER_INDCPA_MSGBYTES       (KYBER_SYMBYTES)
46 | #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
47 | #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
48 | #define KYBER_INDCPA_BYTES          (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
49 | 
50 | #define KYBER_PUBLICKEYBYTES  (KYBER_INDCPA_PUBLICKEYBYTES)
51 | /* 2*KYBER_SYMBYTES of additional space: 32 bytes to save H(pk) plus one more 32-byte value (z in the Kyber specification) */
52 | #define KYBER_SECRETKEYBYTES  (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES)
53 | #define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES)
54 | 
55 | #endif
56 | 


--------------------------------------------------------------------------------
/ref/poly.h:
--------------------------------------------------------------------------------
 1 | #ifndef POLY_H
 2 | #define POLY_H
 3 | 
 4 | #include <stdint.h>
 5 | #include "params.h"
 6 | 
 7 | /*
 8 |  * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
 9 |  * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
10 |  */
11 | typedef struct{
12 |   int16_t coeffs[KYBER_N]; /* KYBER_N signed 16-bit coefficients; coeffs[i] belongs to X^i, per the comment above */
13 | } poly;
14 | 
15 | #define poly_compress KYBER_NAMESPACE(poly_compress)
16 | void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a);
17 | #define poly_decompress KYBER_NAMESPACE(poly_decompress)
18 | void poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]);
19 | 
20 | #define poly_tobytes KYBER_NAMESPACE(poly_tobytes)
21 | void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a);
22 | #define poly_frombytes KYBER_NAMESPACE(poly_frombytes)
23 | void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]);
24 | 
25 | #define poly_frommsg KYBER_NAMESPACE(poly_frommsg)
26 | void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);
27 | #define poly_tomsg KYBER_NAMESPACE(poly_tomsg)
28 | void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *r);
29 | 
30 | #define poly_getnoise_eta1 KYBER_NAMESPACE(poly_getnoise_eta1)
31 | void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
32 | 
33 | #define poly_getnoise_eta2 KYBER_NAMESPACE(poly_getnoise_eta2)
34 | void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
35 | 
36 | #define poly_ntt KYBER_NAMESPACE(poly_ntt)
37 | void poly_ntt(poly *r);
38 | #define poly_invntt_tomont KYBER_NAMESPACE(poly_invntt_tomont)
39 | void poly_invntt_tomont(poly *r);
40 | #define poly_basemul_montgomery KYBER_NAMESPACE(poly_basemul_montgomery)
41 | void poly_basemul_montgomery(poly *r, const poly *a, const poly *b);
42 | #define poly_tomont KYBER_NAMESPACE(poly_tomont)
43 | void poly_tomont(poly *r);
44 | 
45 | #define poly_reduce KYBER_NAMESPACE(poly_reduce)
46 | void poly_reduce(poly *r);
47 | 
48 | #define poly_add KYBER_NAMESPACE(poly_add)
49 | void poly_add(poly *r, const poly *a, const poly *b);
50 | #define poly_sub KYBER_NAMESPACE(poly_sub)
51 | void poly_sub(poly *r, const poly *a, const poly *b);
52 | 
53 | #endif
54 | 


--------------------------------------------------------------------------------
/Kyber512_META.yml:
--------------------------------------------------------------------------------
 1 | name: Kyber512
 2 | type: kem
 3 | claimed-nist-level: 1
 4 | claimed-security: IND-CCA2
 5 | length-public-key: 800
 6 | length-ciphertext: 768
 7 | length-secret-key: 1632
 8 | length-shared-secret: 32
 9 | nistkat-sha256: bb0481d3325d828817900b709d23917cefbc10026fc857f098979451f67bb0ca
10 | testvectors-sha256: 6730bb552c22d9d2176ffb5568e48eb30952cf1f065073ec5f9724f6a3c6ea85
11 | principal-submitters:
12 |   - Peter Schwabe
13 | auxiliary-submitters:
14 |   - Roberto Avanzi
15 |   - Joppe Bos
16 |   - Léo Ducas
17 |   - Eike Kiltz
18 |   - Tancrède Lepoint
19 |   - Vadim Lyubashevsky
20 |   - John M. Schanck
21 |   - Gregor Seiler
22 |   - Damien Stehlé
23 | implementations:
24 |   - name: ref
25 |     version: https://github.com/pq-crystals/kyber/commit/74cad307858b61e434490c75f812cb9b9ef7279b
26 |     folder_name: ref
27 |     compile_opts: -DKYBER_K=2 
28 |     signature_keypair: pqcrystals_kyber512_ref_keypair
29 |     signature_enc: pqcrystals_kyber512_ref_enc
30 |     signature_dec: pqcrystals_kyber512_ref_dec
31 |     sources: ../LICENSE kem.c indcpa.c polyvec.c poly.c reduce.c ntt.c cbd.c verify.c kem.h params.h api.h indcpa.h polyvec.h poly.h reduce.h ntt.h cbd.h verify.h symmetric.h fips202.h symmetric-shake.c
32 |     common_dep: common_ref
33 |   - name: avx2
34 |     version: https://github.com/pq-crystals/kyber/commit/36414d64fc1890ed58d1ca8b1e0cab23635d1ac2
35 |     compile_opts: -DKYBER_K=2 
36 |     signature_keypair: pqcrystals_kyber512_avx2_keypair
37 |     signature_enc: pqcrystals_kyber512_avx2_enc
38 |     signature_dec: pqcrystals_kyber512_avx2_dec
39 |     sources: ../LICENSE kem.c indcpa.c polyvec.c poly.c fq.S shuffle.S ntt.S invntt.S basemul.S consts.c rejsample.c cbd.c verify.c align.h kem.h params.h api.h indcpa.h polyvec.h poly.h reduce.h fq.inc shuffle.inc ntt.h consts.h rejsample.h cbd.h verify.h symmetric.h fips202.h fips202x4.h symmetric-shake.c
40 |     common_dep: common_avx2 common_keccak4x_avx2
41 |     supported_platforms:
42 |       - architecture: x86_64
43 |         operating_systems:
44 |           - Linux
45 |           - Darwin
46 |         required_flags:
47 |           - avx2
48 |           - bmi2
49 |           - popcnt
50 | 


--------------------------------------------------------------------------------
/Kyber768_META.yml:
--------------------------------------------------------------------------------
 1 | name: Kyber768
 2 | type: kem
 3 | claimed-nist-level: 3
 4 | claimed-security: IND-CCA2
 5 | length-public-key: 1184
 6 | length-ciphertext: 1088
 7 | length-secret-key: 2400
 8 | length-shared-secret: 32
 9 | nistkat-sha256: 89e82a5bf2d4ddb2c6444e10409e6d9ca65dafbca67d1a0db2c9b54920a29172
10 | testvectors-sha256: 667c8ca2ca93729c0df6ff24588460bad1bbdbfb64ece0fe8563852a7ff348c6
11 | principal-submitters:
12 |   - Peter Schwabe
13 | auxiliary-submitters:
14 |   - Roberto Avanzi
15 |   - Joppe Bos
16 |   - Léo Ducas
17 |   - Eike Kiltz
18 |   - Tancrède Lepoint
19 |   - Vadim Lyubashevsky
20 |   - John M. Schanck
21 |   - Gregor Seiler
22 |   - Damien Stehlé
23 | implementations:
24 |   - name: ref
25 |     version: https://github.com/pq-crystals/kyber/commit/28413dfbf523fdde181246451c2bd77199c0f7ff
26 |     folder_name: ref
27 |     compile_opts: -DKYBER_K=3
28 |     signature_keypair: pqcrystals_kyber768_ref_keypair
29 |     signature_enc: pqcrystals_kyber768_ref_enc
30 |     signature_dec: pqcrystals_kyber768_ref_dec
31 |     sources: ../LICENSE kem.c indcpa.c polyvec.c poly.c reduce.c ntt.c cbd.c verify.c kem.h params.h api.h indcpa.h polyvec.h poly.h reduce.h ntt.h cbd.h verify.h symmetric.h fips202.h symmetric-shake.c
32 |     common_dep: common_ref
33 |   - name: avx2
34 |     version: https://github.com/pq-crystals/kyber/commit/28413dfbf523fdde181246451c2bd77199c0f7ff
35 |     compile_opts: -DKYBER_K=3
36 |     signature_keypair: pqcrystals_kyber768_avx2_keypair
37 |     signature_enc: pqcrystals_kyber768_avx2_enc
38 |     signature_dec: pqcrystals_kyber768_avx2_dec
39 |     sources: ../LICENSE kem.c indcpa.c polyvec.c poly.c fq.S shuffle.S ntt.S invntt.S basemul.S consts.c rejsample.c cbd.c verify.c align.h kem.h params.h api.h indcpa.h polyvec.h poly.h reduce.h fq.inc shuffle.inc ntt.h consts.h rejsample.h cbd.h verify.h symmetric.h fips202.h fips202x4.h symmetric-shake.c
40 |     common_dep: common_avx2 common_keccak4x_avx2
41 |     supported_platforms:
42 |       - architecture: x86_64
43 |         operating_systems:
44 |           - Linux
45 |           - Darwin
46 |         required_flags:
47 |           - avx2
48 |           - bmi2
49 |           - popcnt
50 | 


--------------------------------------------------------------------------------
/Kyber1024_META.yml:
--------------------------------------------------------------------------------
 1 | name: Kyber1024
 2 | type: kem
 3 | claimed-nist-level: 5
 4 | claimed-security: IND-CCA2
 5 | length-public-key: 1568
 6 | length-ciphertext: 1568
 7 | length-secret-key: 3168
 8 | length-shared-secret: 32
 9 | nistkat-sha256: 5afcf2a568ad32d49b55105b032af1850f03f3888ff9e2a72f4059c58e968f60
10 | testvectors-sha256: ff1a854b9b6761a70c65ccae85246fe0596a949e72eae0866a8a2a2d4ea54b10
11 | principal-submitters:
12 |   - Peter Schwabe
13 | auxiliary-submitters:
14 |   - Roberto Avanzi
15 |   - Joppe Bos
16 |   - Léo Ducas
17 |   - Eike Kiltz
18 |   - Tancrède Lepoint
19 |   - Vadim Lyubashevsky
20 |   - John M. Schanck
21 |   - Gregor Seiler
22 |   - Damien Stehlé
23 | implementations:
24 |   - name: ref
25 |     version: https://github.com/pq-crystals/kyber/commit/28413dfbf523fdde181246451c2bd77199c0f7ff
26 |     folder_name: ref
27 |     compile_opts: -DKYBER_K=4
28 |     signature_keypair: pqcrystals_kyber1024_ref_keypair
29 |     signature_enc: pqcrystals_kyber1024_ref_enc
30 |     signature_dec: pqcrystals_kyber1024_ref_dec
31 |     sources: ../LICENSE kem.c indcpa.c polyvec.c poly.c reduce.c ntt.c cbd.c verify.c kem.h params.h api.h indcpa.h polyvec.h poly.h reduce.h ntt.h cbd.h verify.h symmetric.h fips202.h symmetric-shake.c
32 |     common_dep: common_ref
33 |   - name: avx2
34 |     version: https://github.com/pq-crystals/kyber/commit/28413dfbf523fdde181246451c2bd77199c0f7ff
35 |     compile_opts: -DKYBER_K=4
36 |     signature_keypair: pqcrystals_kyber1024_avx2_keypair
37 |     signature_enc: pqcrystals_kyber1024_avx2_enc
38 |     signature_dec: pqcrystals_kyber1024_avx2_dec
39 |     sources: ../LICENSE kem.c indcpa.c polyvec.c poly.c fq.S shuffle.S ntt.S invntt.S basemul.S consts.c rejsample.c cbd.c verify.c align.h kem.h params.h api.h indcpa.h polyvec.h poly.h reduce.h fq.inc shuffle.inc ntt.h consts.h rejsample.h cbd.h verify.h symmetric.h fips202.h fips202x4.h symmetric-shake.c
40 |     common_dep: common_avx2 common_keccak4x_avx2
41 |     supported_platforms:
42 |       - architecture: x86_64
43 |         operating_systems:
44 |           - Linux
45 |           - Darwin
46 |         required_flags:
47 |           - avx2
48 |           - bmi2
49 |           - popcnt
50 | 


--------------------------------------------------------------------------------
/ref/randombytes.c:
--------------------------------------------------------------------------------
 1 | #if defined(__linux__) && !defined(_GNU_SOURCE)
 2 | #define _GNU_SOURCE /* feature-test macro: only effective when defined before the first system header */
 3 | #endif
 4 | #include <stddef.h>
 5 | #include <stdint.h>
 6 | #include <stdlib.h>
 7 | #include "randombytes.h"
 8 | 
 9 | #ifdef _WIN32
10 | #include <windows.h>
11 | #include <wincrypt.h>
12 | #else
13 | #include <fcntl.h>
14 | #include <errno.h>
15 | #include <unistd.h> /* syscall() on Linux, read() in the /dev/urandom fallback */
16 | #ifdef __linux__
17 | #include <sys/syscall.h>
18 | #elif defined(__NetBSD__)
19 | #include <sys/random.h>
20 | #endif
21 | #endif
22 | 
23 | #ifdef _WIN32
24 | void randombytes(uint8_t *out, size_t outlen) {
25 |   /* Fills out[0..outlen) through the Windows CryptoAPI; aborts on any failure. */
26 |   HCRYPTPROV provider;
27 |   size_t chunk;
28 | 
29 |   if(CryptAcquireContext(&provider, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT) == 0)
30 |     abort();
31 | 
32 |   /* Request at most 1048576 bytes (1 MiB) per CryptGenRandom call. */
33 |   for(; outlen != 0; out += chunk, outlen -= chunk) {
34 |     chunk = (outlen < 1048576) ? outlen : 1048576;
35 |     if(CryptGenRandom(provider, chunk, (BYTE *)out) == 0)
36 |       abort();
37 |   }
38 | 
39 |   /* Release the provider handle acquired above. */
40 |   if(CryptReleaseContext(provider, 0) == 0)
41 |     abort();
42 | }
43 | #elif defined(__linux__) && defined(SYS_getrandom)
44 | void randombytes(uint8_t *out, size_t outlen) {
45 |   /* Fills out[0..outlen) using the getrandom system call (flags = 0). */
46 |   while(outlen != 0) {
47 |     const ssize_t got = syscall(SYS_getrandom, out, outlen, 0);
48 |     if(got == -1) {
49 |       if(errno != EINTR)
50 |         abort();   /* any error other than EINTR is fatal */
51 |       continue;    /* EINTR: issue the same request again */
52 |     }
53 |     /* Fewer bytes than requested may be delivered; loop for the rest. */
54 |     out += got;
55 |     outlen -= got;
56 |   }
57 | }
58 | #elif defined(__NetBSD__)
59 | void randombytes(uint8_t *out, size_t outlen) {
60 |   /* Fills out[0..outlen) using getrandom() (flags = 0). */
61 |   while(outlen != 0) {
62 |     const ssize_t got = getrandom(out, outlen, 0);
63 |     if(got == -1) {
64 |       if(errno != EINTR)
65 |         abort();   /* any error other than EINTR is fatal */
66 |       continue;    /* EINTR: issue the same request again */
67 |     }
68 |     /* Fewer bytes than requested may be delivered; loop for the rest. */
69 |     out += got;
70 |     outlen -= got;
71 |   }
72 | }
73 | #else
74 | void randombytes(uint8_t *out, size_t outlen) {
75 |   /* Descriptor for /dev/urandom; opened on the first call and kept open afterwards. */
76 |   static int fd = -1;
77 | 
78 |   /* Open the device, retrying while open() fails with EINTR; other failures abort. */
79 |   while(fd == -1) {
80 |     fd = open("/dev/urandom", O_RDONLY);
81 |     if(fd == -1 && errno != EINTR)
82 |       abort();
83 |   }
84 | 
85 |   /* Read until the request is satisfied; read() may return a short count. */
86 |   while(outlen != 0) {
87 |     const ssize_t got = read(fd, out, outlen);
88 |     if(got == -1) {
89 |       if(errno != EINTR)
90 |         abort();
91 |       continue;
92 |     }
93 |     out += got;
94 |     outlen -= got;
95 |   }
96 | }
97 | #endif
98 | 


--------------------------------------------------------------------------------
/avx2/params.h:
--------------------------------------------------------------------------------
 1 | #ifndef PARAMS_H
 2 | #define PARAMS_H
 3 | 
 4 | #ifndef KYBER_K
 5 | #define KYBER_K 3	/* Change this for different security strengths */
 6 | #endif
 7 | 
 8 | //#define KYBER_90S	/* Uncomment this if you want the 90S variant */
 9 | 
10 | /* Don't change parameters below this line */
11 | #if   (KYBER_K == 2)
12 | #ifdef KYBER_90S
13 | #define KYBER_NAMESPACE(s) pqcrystals_kyber512_90s_avx2_##s
14 | #else
15 | #define KYBER_NAMESPACE(s) pqcrystals_kyber512_avx2_##s
16 | #endif
17 | #elif (KYBER_K == 3)
18 | #ifdef KYBER_90S
19 | #define KYBER_NAMESPACE(s) pqcrystals_kyber768_90s_avx2_##s
20 | #else
21 | #define KYBER_NAMESPACE(s) pqcrystals_kyber768_avx2_##s
22 | #endif
23 | #elif (KYBER_K == 4)
24 | #ifdef KYBER_90S
25 | #define KYBER_NAMESPACE(s) pqcrystals_kyber1024_90s_avx2_##s
26 | #else
27 | #define KYBER_NAMESPACE(s) pqcrystals_kyber1024_avx2_##s
28 | #endif
29 | #else
30 | #error "KYBER_K must be in {2,3,4}"
31 | #endif
32 | 
33 | #define KYBER_N 256
34 | #define KYBER_Q 3329
35 | 
36 | #define KYBER_SYMBYTES 32   /* size in bytes of hashes, and seeds */
37 | #define KYBER_SSBYTES  32   /* size in bytes of shared key */
38 | 
39 | #define KYBER_POLYBYTES		384
40 | #define KYBER_POLYVECBYTES	(KYBER_K * KYBER_POLYBYTES)
41 | 
42 | #if KYBER_K == 2
43 | #define KYBER_ETA1 3
44 | #define KYBER_POLYCOMPRESSEDBYTES    128
45 | #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
46 | #elif KYBER_K == 3
47 | #define KYBER_ETA1 2
48 | #define KYBER_POLYCOMPRESSEDBYTES    128
49 | #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
50 | #elif KYBER_K == 4
51 | #define KYBER_ETA1 2
52 | #define KYBER_POLYCOMPRESSEDBYTES    160
53 | #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)
54 | #endif
55 | 
56 | #define KYBER_ETA2 2
57 | 
58 | #define KYBER_INDCPA_MSGBYTES       (KYBER_SYMBYTES)
59 | #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
60 | #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
61 | #define KYBER_INDCPA_BYTES          (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
62 | 
63 | #define KYBER_PUBLICKEYBYTES  (KYBER_INDCPA_PUBLICKEYBYTES)
64 | /* 2*KYBER_SYMBYTES of additional space: 32 bytes to save H(pk) plus one more 32-byte value (z in the Kyber specification) */
65 | #define KYBER_SECRETKEYBYTES  (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES)
66 | #define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES)
67 | 
68 | #endif
69 | 


--------------------------------------------------------------------------------
/ref/test/test_vectors.c:
--------------------------------------------------------------------------------
 1 | /* Deterministic randombytes by Daniel J. Bernstein */
 2 | /* taken from SUPERCOP (https://bench.cr.yp.to)     */
 3 | 
 4 | #include <stddef.h>
 5 | #include <stdint.h>
 6 | #include <stdio.h>
 7 | #include "../kem.h"
 8 | #include "../randombytes.h"
 9 | #include "../fips202.h"
10 | 
11 | #define NTESTS 10000
12 | 
13 | 
14 | /* Initial state after absorbing the empty string.
15 |  * Permute before squeeze is achieved by setting pos to SHAKE128_RATE. */
16 | static keccak_state rngstate = {{0x1F, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, (1ULL << 63), 0, 0, 0, 0}, SHAKE128_RATE};
17 | 
18 | void randombytes(uint8_t *x,size_t xlen) /* deterministic replacement: output is taken from rngstate above */
19 | {
20 |   shake128_squeeze(x, xlen, &rngstate); /* squeeze xlen bytes out of the file-scope state into x */
21 | }
22 | 
23 | /* Writes label, then len bytes of buf as two-digit lowercase hex, then a newline, to stdout. */
24 | static void print_hex(const char *label, const uint8_t *buf, size_t len)
25 | {
26 |   size_t k;
27 | 
28 |   fputs(label, stdout);
29 |   for(k=0;k<len;k++)
30 |     printf("%02x",buf[k]);
31 |   printf("\n");
32 | }
33 | 
34 | /*
35 |  * Test-vector generator: runs NTESTS rounds of key generation, encapsulation
36 |  * and decapsulation and prints every value in hex on stdout. Each round also
37 |  * decapsulates a ciphertext filled by randombytes() and prints the resulting key.
38 |  * Returns 0 on success, or -1 (after printing "ERROR" to stderr) as soon as
39 |  * the two shared secrets of a round differ.
40 |  */
41 | int main(void)
42 | {
43 |   unsigned int i,j;
44 |   uint8_t pk[CRYPTO_PUBLICKEYBYTES];
45 |   uint8_t sk[CRYPTO_SECRETKEYBYTES];
46 |   uint8_t ct[CRYPTO_CIPHERTEXTBYTES];
47 |   uint8_t key_a[CRYPTO_BYTES];
48 |   uint8_t key_b[CRYPTO_BYTES];
49 | 
50 |   for(i=0;i<NTESTS;i++) {
51 |     // Key-pair generation
52 |     crypto_kem_keypair(pk, sk);
53 |     print_hex("Public Key: ", pk, CRYPTO_PUBLICKEYBYTES);
54 |     print_hex("Secret Key: ", sk, CRYPTO_SECRETKEYBYTES);
55 | 
56 |     // Encapsulation
57 |     crypto_kem_enc(ct, key_b, pk);
58 |     print_hex("Ciphertext: ", ct, CRYPTO_CIPHERTEXTBYTES);
59 |     print_hex("Shared Secret B: ", key_b, CRYPTO_BYTES);
60 | 
61 |     // Decapsulation
62 |     crypto_kem_dec(key_a, ct, sk);
63 |     print_hex("Shared Secret A: ", key_a, CRYPTO_BYTES);
64 | 
65 |     for(j=0;j<CRYPTO_BYTES;j++) {
66 |       if(key_a[j] != key_b[j]) {
67 |         fprintf(stderr, "ERROR\n");
68 |         return -1;
69 |       }
70 |     }
71 | 
72 |     // Decapsulation of invalid (random) ciphertexts; fill exactly the declared size of ct
73 |     randombytes(ct, CRYPTO_CIPHERTEXTBYTES);
74 |     crypto_kem_dec(key_a, ct, sk);
75 |     print_hex("Pseudorandom shared Secret A: ", key_a, CRYPTO_BYTES);
76 |   }
77 | 
78 |   return 0;
79 | }
80 | 


--------------------------------------------------------------------------------
/ref/fips202.h:
--------------------------------------------------------------------------------
 1 | #ifndef FIPS202_H
 2 | #define FIPS202_H
 3 | 
 4 | #include <stddef.h>
 5 | #include <stdint.h>
 6 | 
 7 | #define SHAKE128_RATE 168
 8 | #define SHAKE256_RATE 136
 9 | #define SHA3_256_RATE 136
10 | #define SHA3_512_RATE 72
11 | 
12 | #define FIPS202_NAMESPACE(s) pqcrystals_kyber_fips202_ref_##s
13 | 
14 | typedef struct {
15 |   uint64_t s[25]; /* 25 64-bit words = 1600 bits of state */
16 |   unsigned int pos; /* presumably a byte offset within the current rate block - confirm in fips202.c */
17 | } keccak_state;
18 | 
19 | #define shake128_init FIPS202_NAMESPACE(shake128_init)
20 | void shake128_init(keccak_state *state);
21 | #define shake128_absorb FIPS202_NAMESPACE(shake128_absorb)
22 | void shake128_absorb(keccak_state *state, const uint8_t *in, size_t inlen);
23 | #define shake128_finalize FIPS202_NAMESPACE(shake128_finalize)
24 | void shake128_finalize(keccak_state *state);
25 | #define shake128_squeeze FIPS202_NAMESPACE(shake128_squeeze)
26 | void shake128_squeeze(uint8_t *out, size_t outlen, keccak_state *state);
27 | #define shake128_absorb_once FIPS202_NAMESPACE(shake128_absorb_once)
28 | void shake128_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen);
29 | #define shake128_squeezeblocks FIPS202_NAMESPACE(shake128_squeezeblocks)
30 | void shake128_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state);
31 | 
32 | #define shake256_init FIPS202_NAMESPACE(shake256_init)
33 | void shake256_init(keccak_state *state);
34 | #define shake256_absorb FIPS202_NAMESPACE(shake256_absorb)
35 | void shake256_absorb(keccak_state *state, const uint8_t *in, size_t inlen);
36 | #define shake256_finalize FIPS202_NAMESPACE(shake256_finalize)
37 | void shake256_finalize(keccak_state *state);
38 | #define shake256_squeeze FIPS202_NAMESPACE(shake256_squeeze)
39 | void shake256_squeeze(uint8_t *out, size_t outlen, keccak_state *state);
40 | #define shake256_absorb_once FIPS202_NAMESPACE(shake256_absorb_once)
41 | void shake256_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen);
42 | #define shake256_squeezeblocks FIPS202_NAMESPACE(shake256_squeezeblocks)
43 | void shake256_squeezeblocks(uint8_t *out, size_t nblocks,  keccak_state *state);
44 | 
45 | #define shake128 FIPS202_NAMESPACE(shake128)
46 | void shake128(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen);
47 | #define shake256 FIPS202_NAMESPACE(shake256)
48 | void shake256(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen);
49 | #define sha3_256 FIPS202_NAMESPACE(sha3_256)
50 | void sha3_256(uint8_t h[32], const uint8_t *in, size_t inlen);
51 | #define sha3_512 FIPS202_NAMESPACE(sha3_512)
52 | void sha3_512(uint8_t h[64], const uint8_t *in, size_t inlen);
53 | 
54 | #endif
55 | 


--------------------------------------------------------------------------------
/avx2/fips202.h:
--------------------------------------------------------------------------------
 1 | #ifndef FIPS202_H
 2 | #define FIPS202_H
 3 | 
 4 | #include <stddef.h>
 5 | #include <stdint.h>
 6 | 
/* Sponge rates in bytes, i.e. the block size processed per Keccak
 * permutation, as fixed by FIPS 202: rate = (1600 - capacity) / 8. */
#define SHAKE128_RATE 168
#define SHAKE256_RATE 136
#define SHA3_256_RATE 136
#define SHA3_512_RATE 72

/* Builds the link-time symbol for every function declared in this header by
 * prepending the AVX2-specific prefix to the short name s. */
#define FIPS202_NAMESPACE(s) pqcrystals_kyber_fips202_avx2_##s
13 | 
/* Context for the incremental SHAKE functions declared in this header.
 *   s:   25 64-bit words, i.e. one 1600-bit Keccak state.
 *   pos: counter kept alongside the state; presumably the byte position
 *        inside the current rate block -- confirm in fips202.c. */
typedef struct {
    uint64_t s[25];
    unsigned int pos;
} keccak_state;
18 | 
/* Incremental SHAKE128 interface.
 * Each short name is first mapped to its namespaced symbol and then declared.
 * All functions operate on a caller-provided keccak_state.
 * NOTE(review): the behavioural contract (required call order, whether
 * squeezeblocks emits nblocks * SHAKE128_RATE bytes, etc.) is defined in
 * fips202.c and cannot be verified from this header alone. */
#define shake128_init FIPS202_NAMESPACE(shake128_init)
void shake128_init(keccak_state *state);
#define shake128_absorb FIPS202_NAMESPACE(shake128_absorb)
void shake128_absorb(keccak_state *state, const uint8_t *in, size_t inlen);
#define shake128_finalize FIPS202_NAMESPACE(shake128_finalize)
void shake128_finalize(keccak_state *state);
#define shake128_squeeze FIPS202_NAMESPACE(shake128_squeeze)
void shake128_squeeze(uint8_t *out, size_t outlen, keccak_state *state);
/* Single-call absorb of inlen bytes; presumably also finalizes -- confirm. */
#define shake128_absorb_once FIPS202_NAMESPACE(shake128_absorb_once)
void shake128_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen);
/* Output length is given in blocks (nblocks), not bytes. */
#define shake128_squeezeblocks FIPS202_NAMESPACE(shake128_squeezeblocks)
void shake128_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state);
31 | 
/* Incremental SHAKE256 interface; mirrors the SHAKE128 declarations.
 * Each short name is first mapped to its namespaced symbol and then declared.
 * NOTE(review): the behavioural contract (required call order, whether
 * squeezeblocks emits nblocks * SHAKE256_RATE bytes, etc.) is defined in
 * fips202.c and cannot be verified from this header alone. */
#define shake256_init FIPS202_NAMESPACE(shake256_init)
void shake256_init(keccak_state *state);
#define shake256_absorb FIPS202_NAMESPACE(shake256_absorb)
void shake256_absorb(keccak_state *state, const uint8_t *in, size_t inlen);
#define shake256_finalize FIPS202_NAMESPACE(shake256_finalize)
void shake256_finalize(keccak_state *state);
#define shake256_squeeze FIPS202_NAMESPACE(shake256_squeeze)
void shake256_squeeze(uint8_t *out, size_t outlen, keccak_state *state);
/* Single-call absorb of inlen bytes; presumably also finalizes -- confirm. */
#define shake256_absorb_once FIPS202_NAMESPACE(shake256_absorb_once)
void shake256_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen);
/* Output length is given in blocks (nblocks), not bytes. */
#define shake256_squeezeblocks FIPS202_NAMESPACE(shake256_squeezeblocks)
void shake256_squeezeblocks(uint8_t *out, size_t nblocks,  keccak_state *state);
44 | 
/* One-shot interfaces: no keccak_state is exposed to the caller.
 * shake128/shake256 take an explicit output length (outlen bytes);
 * sha3_256/sha3_512 write a fixed-size digest of 32 and 64 bytes. */
#define shake128 FIPS202_NAMESPACE(shake128)
void shake128(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen);
#define shake256 FIPS202_NAMESPACE(shake256)
void shake256(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen);
#define sha3_256 FIPS202_NAMESPACE(sha3_256)
void sha3_256(uint8_t h[32], const uint8_t *in, size_t inlen);
#define sha3_512 FIPS202_NAMESPACE(sha3_512)
void sha3_512(uint8_t h[64], const uint8_t *in, size_t inlen);
53 | 
54 | #endif
55 | 


--------------------------------------------------------------------------------
/ref/verify.c:
--------------------------------------------------------------------------------
 1 | #include <stddef.h>
 2 | #include <stdint.h>
 3 | #include "verify.h"
 4 | 
 5 | /*************************************************
 6 | * Name:        verify
 7 | *
 8 | * Description: Compare two arrays for equality in constant time.
 9 | *
10 | * Arguments:   const uint8_t *a: pointer to first byte array
11 | *              const uint8_t *b: pointer to second byte array
12 | *              size_t len:       length of the byte arrays
13 | *
14 | * Returns 0 if the byte arrays are equal, 1 otherwise
15 | **************************************************/
16 | int verify(const uint8_t *a, const uint8_t *b, size_t len)
17 | {
18 |   size_t i;
19 |   uint8_t r = 0;
20 | 
21 |   for(i=0;i<len;i++)
22 |     r |= a[i] ^ b[i];
23 | 
24 |   return (-(uint64_t)r) >> 63;
25 | }
26 | 
/*************************************************
* Name:        cmov
*
* Description: Conditional move of a byte array in constant time.
*              If b is 1, the len bytes at x are copied to r;
*              if b is 0, r is left unchanged. b must be in {0,1}.
*              Relies on two's complement so that -1 is an all-ones mask.
*
* Arguments:   uint8_t *r:       destination byte array
*              const uint8_t *x: source byte array
*              size_t len:       number of bytes to (conditionally) copy
*              uint8_t b:        condition bit; has to be in {0,1}
**************************************************/
void cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b)
{
  uint8_t mask;
  size_t k;

#if defined(__GNUC__) || defined(__clang__)
  // Empty asm that claims to read and write b. It stops the compiler from
  //    1) learning that b can only be 0 or 1, and
  //    2) turning the masked copy below into a branch on b.
  // With verify.c and kem.c built as separate translation units this is
  // redundant, but the code is expected to be copied into other build setups.
  __asm__("" : "+r"(b) : /* no inputs */);
#endif

  /* 0x00 when b == 0, 0xFF when b == 1 */
  mask = -b;
  for(k=0;k<len;k++)
    r[k] ^= mask & (r[k] ^ x[k]);
}
58 | 
59 | 
/*************************************************
* Name:        cmov_int16
*
* Description: Copy input v to *r if b is 1, don't modify *r if b is 0.
*              Requires b to be in {0,1};
*              Runs in constant time.
*
* Arguments:   int16_t *r:       pointer to output int16_t
*              int16_t v:        input int16_t
*              uint16_t b:       Condition bit; has to be in {0,1}
**************************************************/
void cmov_int16(int16_t *r, int16_t v, uint16_t b)
{
#if defined(__GNUC__) || defined(__clang__)
  // Same optimisation barrier as in cmov: keep the compiler from inferring
  // that b is 0/1-valued and replacing the masked update with a branch,
  // e.g. after this function has been inlined into its caller.
  __asm__("" : "+r"(b) : /* no inputs */);
#endif

  /* 0x0000 when b == 0, 0xFFFF when b == 1 */
  b = -b;
  *r ^= b & ((*r) ^ v);
}
76 | 


--------------------------------------------------------------------------------
/avx2/fips202x4.h:
--------------------------------------------------------------------------------
 1 | #ifndef FIPS202X4_H
 2 | #define FIPS202X4_H
 3 | 
 4 | #include <stddef.h>
 5 | #include <stdint.h>
 6 | #include <immintrin.h>
 7 | 
/* Builds the link-time symbol for every function declared in this header by
 * prepending the AVX2 four-way prefix to the short name s. */
#define FIPS202X4_NAMESPACE(s) pqcrystals_kyber_fips202x4_avx2_##s

/* State for the four-way functions below: 25 256-bit vectors.
 * Presumably each vector packs the same 64-bit Keccak word of four
 * independent instances -- confirm in fips202x4.c. */
typedef struct {
  __m256i s[25];
} keccakx4_state;
13 | 
/* Four-way SHAKE128 absorb: takes four input pointers that share one common
 * length inlen and a single keccakx4_state. */
#define shake128x4_absorb_once FIPS202X4_NAMESPACE(shake128x4_absorb_once)
void shake128x4_absorb_once(keccakx4_state *state,
                            const uint8_t *in0,
                            const uint8_t *in1,
                            const uint8_t *in2,
                            const uint8_t *in3,
                            size_t inlen);

/* Four-way SHAKE128 squeeze: four output pointers, output length given in
 * blocks (nblocks) and shared by all four outputs.
 * NOTE(review): block size is presumably SHAKE128_RATE bytes -- confirm. */
#define shake128x4_squeezeblocks FIPS202X4_NAMESPACE(shake128x4_squeezeblocks)
void shake128x4_squeezeblocks(uint8_t *out0,
                              uint8_t *out1,
                              uint8_t *out2,
                              uint8_t *out3,
                              size_t nblocks,
                              keccakx4_state *state);
29 | 
/* Four-way SHAKE256 absorb: takes four input pointers that share one common
 * length inlen and a single keccakx4_state. */
#define shake256x4_absorb_once FIPS202X4_NAMESPACE(shake256x4_absorb_once)
void shake256x4_absorb_once(keccakx4_state *state,
                            const uint8_t *in0,
                            const uint8_t *in1,
                            const uint8_t *in2,
                            const uint8_t *in3,
                            size_t inlen);

/* Four-way SHAKE256 squeeze: four output pointers, output length given in
 * blocks (nblocks) and shared by all four outputs.
 * NOTE(review): block size is presumably SHAKE256_RATE bytes -- confirm. */
#define shake256x4_squeezeblocks FIPS202X4_NAMESPACE(shake256x4_squeezeblocks)
void shake256x4_squeezeblocks(uint8_t *out0,
                              uint8_t *out1,
                              uint8_t *out2,
                              uint8_t *out3,
                              size_t nblocks,
                              keccakx4_state *state);
45 | 
/* One-shot four-way SHAKE128: four outputs of outlen bytes each from four
 * inputs of inlen bytes each; no state is exposed to the caller. */
#define shake128x4 FIPS202X4_NAMESPACE(shake128x4)
void shake128x4(uint8_t *out0,
                uint8_t *out1,
                uint8_t *out2,
                uint8_t *out3,
                size_t outlen,
                const uint8_t *in0,
                const uint8_t *in1,
                const uint8_t *in2,
                const uint8_t *in3,
                size_t inlen);

/* One-shot four-way SHAKE256: same parameter layout as shake128x4. */
#define shake256x4 FIPS202X4_NAMESPACE(shake256x4)
void shake256x4(uint8_t *out0,
                uint8_t *out1,
                uint8_t *out2,
                uint8_t *out3,
                size_t outlen,
                const uint8_t *in0,
                const uint8_t *in1,
                const uint8_t *in2,
                const uint8_t *in3,
                size_t inlen);
69 | 
70 | #endif
71 | 


--------------------------------------------------------------------------------
/avx2/verify.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <stdint.h>
 3 | #include <immintrin.h>
 4 | #include "verify.h"
 5 | 
 6 | /*************************************************
 7 | * Name:        verify
 8 | *
 9 | * Description: Compare two arrays for equality in constant time.
10 | *
11 | * Arguments:   const uint8_t *a: pointer to first byte array
12 | *              const uint8_t *b: pointer to second byte array
13 | *              size_t len: length of the byte arrays
14 | *
15 | * Returns 0 if the byte arrays are equal, 1 otherwise
16 | **************************************************/
17 | int verify(const uint8_t *a, const uint8_t *b, size_t len)
18 | {
19 |   size_t i;
20 |   uint64_t r;
21 |   __m256i f, g, h;
22 | 
23 |   h = _mm256_setzero_si256();
24 |   for(i=0;i<len/32;i++) {
25 |     f = _mm256_loadu_si256((__m256i *)&a[32*i]);
26 |     g = _mm256_loadu_si256((__m256i *)&b[32*i]);
27 |     f = _mm256_xor_si256(f,g);
28 |     h = _mm256_or_si256(h,f);
29 |   }
30 |   r = 1 - _mm256_testz_si256(h,h);
31 | 
32 |   a += 32*i;
33 |   b += 32*i;
34 |   len -= 32*i;
35 |   for(i=0;i<len;i++)
36 |     r |= a[i] ^ b[i];
37 | 
38 |   r = (-r) >> 63;
39 |   return r;
40 | }
41 | 
/*************************************************
* Name:        cmov
*
* Description: Copy len bytes from x to r if b is 1;
*              don't modify r if b is 0. Requires b to be in {0,1};
*              assumes two's complement representation of negative integers.
*              Runs in constant time.
*              Full 32-byte chunks are handled with an AVX2 byte blend;
*              the remaining len % 32 bytes use a scalar masked XOR.
*
* Arguments:   uint8_t *r: pointer to output byte array
*              const uint8_t *x: pointer to input byte array
*              size_t len: Amount of bytes to be copied
*              uint8_t b: Condition bit; has to be in {0,1}
**************************************************/
void cmov(uint8_t * restrict r, const uint8_t *x, size_t len, uint8_t b)
{
  size_t i;
  __m256i xvec, rvec, bvec;

#if defined(__GNUC__) || defined(__clang__)
  // Prevent the compiler from
  //    1) inferring that b is 0/1-valued, and
  //    2) handling the two cases with a branch.
  // This is not necessary when verify.c and kem.c are separate translation
  // units, but we expect that downstream consumers will copy this code and/or
  // change how it is built.
  __asm__("" : "+r"(b) : /* no inputs */);
#endif

  /* all-zero vector for b == 0, all-ones vector for b == 1 */
  bvec = _mm256_set1_epi64x(-(uint64_t)b);
  for(i=0;i<len/32;i++) {
    /* loads only read memory, so keep them const-qualified */
    rvec = _mm256_loadu_si256((const __m256i *)&r[32*i]);
    xvec = _mm256_loadu_si256((const __m256i *)&x[32*i]);
    /* blendv picks xvec bytes where the mask byte has its top bit set */
    rvec = _mm256_blendv_epi8(rvec,xvec,bvec);
    _mm256_storeu_si256((__m256i *)&r[32*i],rvec);
  }

  /* scalar tail: i still holds the number of full chunks processed */
  r += 32*i;
  x += 32*i;
  len -= 32*i;
  for(i=0;i<len;i++)
    r[i] ^= -b & (x[i] ^ r[i]);
}
84 | 


--------------------------------------------------------------------------------
/ref/symmetric-shake.c:
--------------------------------------------------------------------------------
 1 | #include <stddef.h>
 2 | #include <stdint.h>
 3 | #include <string.h>
 4 | #include "params.h"
 5 | #include "symmetric.h"
 6 | #include "fips202.h"
 7 | 
 8 | /*************************************************
 9 | * Name:        kyber_shake128_absorb
10 | *
11 | * Description: Absorb step of the SHAKE128 specialized for the Kyber context.
12 | *
13 | * Arguments:   - keccak_state *state: pointer to (uninitialized) output Keccak state
14 | *              - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state
15 | *              - uint8_t i: additional byte of input
16 | *              - uint8_t j: additional byte of input
17 | **************************************************/
18 | void kyber_shake128_absorb(keccak_state *state,
19 |                            const uint8_t seed[KYBER_SYMBYTES],
20 |                            uint8_t x,
21 |                            uint8_t y)
22 | {
23 |   uint8_t extseed[KYBER_SYMBYTES+2];
24 | 
25 |   memcpy(extseed, seed, KYBER_SYMBYTES);
26 |   extseed[KYBER_SYMBYTES+0] = x;
27 |   extseed[KYBER_SYMBYTES+1] = y;
28 | 
29 |   shake128_absorb_once(state, extseed, sizeof(extseed));
30 | }
31 | 
32 | /*************************************************
33 | * Name:        kyber_shake256_prf
34 | *
35 | * Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
36 | *              and then generates outlen bytes of SHAKE256 output
37 | *
38 | * Arguments:   - uint8_t *out: pointer to output
39 | *              - size_t outlen: number of requested output bytes
40 | *              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
41 | *              - uint8_t nonce: single-byte nonce (public PRF input)
42 | **************************************************/
43 | void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce)
44 | {
45 |   uint8_t extkey[KYBER_SYMBYTES+1];
46 | 
47 |   memcpy(extkey, key, KYBER_SYMBYTES);
48 |   extkey[KYBER_SYMBYTES] = nonce;
49 | 
50 |   shake256(out, outlen, extkey, sizeof(extkey));
51 | }
52 | 
53 | /*************************************************
54 | * Name:        kyber_shake256_prf
55 | *
56 | * Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
57 | *              and then generates outlen bytes of SHAKE256 output
58 | *
59 | * Arguments:   - uint8_t *out: pointer to output
60 | *              - size_t outlen: number of requested output bytes
61 | *              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
62 | *              - uint8_t nonce: single-byte nonce (public PRF input)
63 | **************************************************/
64 | void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES])
65 | {
66 |   keccak_state s;
67 | 
68 |   shake256_init(&s);
69 |   shake256_absorb(&s, key, KYBER_SYMBYTES);
70 |   shake256_absorb(&s, input, KYBER_CIPHERTEXTBYTES);
71 |   shake256_finalize(&s);
72 |   shake256_squeeze(out, KYBER_SSBYTES, &s);
73 | }
74 | 


--------------------------------------------------------------------------------
/avx2/basemul.S:
--------------------------------------------------------------------------------
  1 | #include "consts.h"
  2 | 
/*
 * schoolbook off
 *
 * Processes the 64-coefficient slice number `off` (128 bytes, four 32-byte
 * vectors of 16-bit lanes) of the arrays at %rsi and %rdx and stores four
 * result vectors to the same slice at %rdi.
 *
 * Register / memory usage visible in the body:
 *   %rsi  first input;  vectors named a0, b0, a1, b1 in the comments
 *   %rdx  second input; vectors named c0, d0, c1, d1 in the comments
 *   %rdi  output
 *   %rcx  base of the constants table (entries _16XQINV and _16XQ are read)
 *   %r9   two consecutive 32-byte vectors: (%r9) is used with vpmullw and
 *         32(%r9) with vpmulhw on b0d0 / b1d1 -- presumably zeta*qinv and
 *         zeta; confirm against consts.c
 *   (%rsp) 32-byte scratch slot (written with vmovdqa, so it must be
 *         32-byte aligned); holds a0c0.hi while %ymm13 is reused
 *   %ymm0-%ymm15 are all overwritten
 *
 * Each product x*y is computed as
 *   vpmulhw(x, y) - vpmulhw(vpmullw(vpmullw(x, QINV), y), Q)
 * which has the shape of a signed Montgomery multiplication.
 *
 * Values stored (names as in the inline comments):
 *   out[ 0..15] = rb0d0 - (-a0c0)   i.e. a0c0 + rb0d0
 *   out[16..31] = a0d0 + b0c0
 *   out[32..47] = a1c1 - rb1d1
 *   out[48..63] = a1d1 + b1c1
 */
.macro schoolbook off
vmovdqa		_16XQINV*2(%rcx),%ymm0
vmovdqa		(64*\off+ 0)*2(%rsi),%ymm1		# a0
vmovdqa		(64*\off+16)*2(%rsi),%ymm2		# b0
vmovdqa		(64*\off+32)*2(%rsi),%ymm3		# a1
vmovdqa		(64*\off+48)*2(%rsi),%ymm4		# b1

vpmullw		%ymm0,%ymm1,%ymm9			# a0.lo
vpmullw		%ymm0,%ymm2,%ymm10			# b0.lo
vpmullw		%ymm0,%ymm3,%ymm11			# a1.lo
vpmullw		%ymm0,%ymm4,%ymm12			# b1.lo

vmovdqa		(64*\off+ 0)*2(%rdx),%ymm5		# c0
vmovdqa		(64*\off+16)*2(%rdx),%ymm6		# d0

vpmulhw		%ymm5,%ymm1,%ymm13			# a0c0.hi
vpmulhw		%ymm6,%ymm1,%ymm1			# a0d0.hi
vpmulhw		%ymm5,%ymm2,%ymm14			# b0c0.hi
vpmulhw		%ymm6,%ymm2,%ymm2			# b0d0.hi

vmovdqa		(64*\off+32)*2(%rdx),%ymm7		# c1
vmovdqa		(64*\off+48)*2(%rdx),%ymm8		# d1

vpmulhw		%ymm7,%ymm3,%ymm15			# a1c1.hi
vpmulhw		%ymm8,%ymm3,%ymm3			# a1d1.hi
vpmulhw		%ymm7,%ymm4,%ymm0			# b1c1.hi
vpmulhw		%ymm8,%ymm4,%ymm4			# b1d1.hi

vmovdqa		%ymm13,(%rsp)

vpmullw		%ymm5,%ymm9,%ymm13			# a0c0.lo
vpmullw		%ymm6,%ymm9,%ymm9			# a0d0.lo
vpmullw		%ymm5,%ymm10,%ymm5			# b0c0.lo
vpmullw		%ymm6,%ymm10,%ymm10			# b0d0.lo

vpmullw		%ymm7,%ymm11,%ymm6			# a1c1.lo
vpmullw		%ymm8,%ymm11,%ymm11			# a1d1.lo
vpmullw		%ymm7,%ymm12,%ymm7			# b1c1.lo
vpmullw		%ymm8,%ymm12,%ymm12			# b1d1.lo

vmovdqa		_16XQ*2(%rcx),%ymm8
vpmulhw		%ymm8,%ymm13,%ymm13
vpmulhw		%ymm8,%ymm9,%ymm9
vpmulhw		%ymm8,%ymm5,%ymm5
vpmulhw		%ymm8,%ymm10,%ymm10
vpmulhw		%ymm8,%ymm6,%ymm6
vpmulhw		%ymm8,%ymm11,%ymm11
vpmulhw		%ymm8,%ymm7,%ymm7
vpmulhw		%ymm8,%ymm12,%ymm12

vpsubw		(%rsp),%ymm13,%ymm13			# -a0c0
vpsubw		%ymm9,%ymm1,%ymm9			# a0d0
vpsubw		%ymm5,%ymm14,%ymm5			# b0c0
vpsubw		%ymm10,%ymm2,%ymm10			# b0d0

vpsubw		%ymm6,%ymm15,%ymm6			# a1c1
vpsubw		%ymm11,%ymm3,%ymm11			# a1d1
vpsubw		%ymm7,%ymm0,%ymm7			# b1c1
vpsubw		%ymm12,%ymm4,%ymm12			# b1d1

vmovdqa		(%r9),%ymm0
vmovdqa		32(%r9),%ymm1
vpmullw		%ymm0,%ymm10,%ymm2
vpmullw		%ymm0,%ymm12,%ymm3
vpmulhw		%ymm1,%ymm10,%ymm10
vpmulhw		%ymm1,%ymm12,%ymm12
vpmulhw		%ymm8,%ymm2,%ymm2
vpmulhw		%ymm8,%ymm3,%ymm3
vpsubw		%ymm2,%ymm10,%ymm10			# rb0d0
vpsubw		%ymm3,%ymm12,%ymm12			# rb1d1

vpaddw		%ymm5,%ymm9,%ymm9
vpaddw		%ymm7,%ymm11,%ymm11
vpsubw		%ymm13,%ymm10,%ymm13
vpsubw		%ymm12,%ymm6,%ymm6

vmovdqa		%ymm13,(64*\off+ 0)*2(%rdi)
vmovdqa		%ymm9,(64*\off+16)*2(%rdi)
vmovdqa		%ymm6,(64*\off+32)*2(%rdi)
vmovdqa		%ymm11,(64*\off+48)*2(%rdi)
.endm
 84 | 
/*
 * basemul_avx
 *
 * Runs the schoolbook macro on slices 0..3, i.e. on 4 * 64 = 256 16-bit
 * coefficients of the arrays at %rsi and %rdx, writing to %rdi; %rcx is the
 * base of the constants table.
 * NOTE(review): this register assignment matches the first four pointer
 * arguments of the System V AMD64 calling convention -- confirm against the
 * C prototype.
 *
 * Stack handling: the incoming %rsp is saved in %r8, %rsp is rounded down to
 * a 32-byte boundary and 32 bytes are reserved as the aligned scratch slot
 * the macro accesses through (%rsp); %rsp is restored from %r8 before ret.
 *
 * %r9 starts at element _ZETAS_EXP+176 of the 16-bit constants table and is
 * advanced by 32, 192 and 32 elements between the four macro invocations.
 * All accesses use vmovdqa, so the three arrays and the table must be
 * 32-byte aligned.
 */
.text
.global cdecl(basemul_avx)
cdecl(basemul_avx):
mov		%rsp,%r8
and		$-32,%rsp
sub		$32,%rsp

lea		(_ZETAS_EXP+176)*2(%rcx),%r9
schoolbook	0

add		$32*2,%r9
schoolbook	1

add		$192*2,%r9
schoolbook	2

add		$32*2,%r9
schoolbook	3

mov		%r8,%rsp
ret
106 | 


--------------------------------------------------------------------------------
/ref/test/test_kyber.c:
--------------------------------------------------------------------------------
  1 | #include <stddef.h>
  2 | #include <stdio.h>
  3 | #include <string.h>
  4 | #include "../kem.h"
  5 | #include "../randombytes.h"
  6 | 
  7 | #define NTESTS 1000
  8 | 
  9 | static int test_keys(void)
 10 | {
 11 |   uint8_t pk[CRYPTO_PUBLICKEYBYTES];
 12 |   uint8_t sk[CRYPTO_SECRETKEYBYTES];
 13 |   uint8_t ct[CRYPTO_CIPHERTEXTBYTES];
 14 |   uint8_t key_a[CRYPTO_BYTES];
 15 |   uint8_t key_b[CRYPTO_BYTES];
 16 | 
 17 |   //Alice generates a public key
 18 |   crypto_kem_keypair(pk, sk);
 19 | 
 20 |   //Bob derives a secret key and creates a response
 21 |   crypto_kem_enc(ct, key_b, pk);
 22 | 
 23 |   //Alice uses Bobs response to get her shared key
 24 |   crypto_kem_dec(key_a, ct, sk);
 25 | 
 26 |   if(memcmp(key_a, key_b, CRYPTO_BYTES)) {
 27 |     printf("ERROR keys\n");
 28 |     return 1;
 29 |   }
 30 | 
 31 |   return 0;
 32 | }
 33 | 
 34 | static int test_invalid_sk_a(void)
 35 | {
 36 |   uint8_t pk[CRYPTO_PUBLICKEYBYTES];
 37 |   uint8_t sk[CRYPTO_SECRETKEYBYTES];
 38 |   uint8_t ct[CRYPTO_CIPHERTEXTBYTES];
 39 |   uint8_t key_a[CRYPTO_BYTES];
 40 |   uint8_t key_b[CRYPTO_BYTES];
 41 | 
 42 |   //Alice generates a public key
 43 |   crypto_kem_keypair(pk, sk);
 44 | 
 45 |   //Bob derives a secret key and creates a response
 46 |   crypto_kem_enc(ct, key_b, pk);
 47 | 
 48 |   //Replace secret key with random values
 49 |   randombytes(sk, CRYPTO_SECRETKEYBYTES);
 50 | 
 51 |   //Alice uses Bobs response to get her shared key
 52 |   crypto_kem_dec(key_a, ct, sk);
 53 | 
 54 |   if(!memcmp(key_a, key_b, CRYPTO_BYTES)) {
 55 |     printf("ERROR invalid sk\n");
 56 |     return 1;
 57 |   }
 58 | 
 59 |   return 0;
 60 | }
 61 | 
 62 | static int test_invalid_ciphertext(void)
 63 | {
 64 |   uint8_t pk[CRYPTO_PUBLICKEYBYTES];
 65 |   uint8_t sk[CRYPTO_SECRETKEYBYTES];
 66 |   uint8_t ct[CRYPTO_CIPHERTEXTBYTES];
 67 |   uint8_t key_a[CRYPTO_BYTES];
 68 |   uint8_t key_b[CRYPTO_BYTES];
 69 |   uint8_t b;
 70 |   size_t pos;
 71 | 
 72 |   do {
 73 |     randombytes(&b, sizeof(uint8_t));
 74 |   } while(!b);
 75 |   randombytes((uint8_t *)&pos, sizeof(size_t));
 76 | 
 77 |   //Alice generates a public key
 78 |   crypto_kem_keypair(pk, sk);
 79 | 
 80 |   //Bob derives a secret key and creates a response
 81 |   crypto_kem_enc(ct, key_b, pk);
 82 | 
 83 |   //Change some byte in the ciphertext (i.e., encapsulated key)
 84 |   ct[pos % CRYPTO_CIPHERTEXTBYTES] ^= b;
 85 | 
 86 |   //Alice uses Bobs response to get her shared key
 87 |   crypto_kem_dec(key_a, ct, sk);
 88 | 
 89 |   if(!memcmp(key_a, key_b, CRYPTO_BYTES)) {
 90 |     printf("ERROR invalid ciphertext\n");
 91 |     return 1;
 92 |   }
 93 | 
 94 |   return 0;
 95 | }
 96 | 
 97 | int main(void)
 98 | {
 99 |   unsigned int i;
100 |   int r;
101 | 
102 |   for(i=0;i<NTESTS;i++) {
103 |     r  = test_keys();
104 |     r |= test_invalid_sk_a();
105 |     r |= test_invalid_ciphertext();
106 |     if(r)
107 |       return 1;
108 |   }
109 | 
110 |   printf("CRYPTO_SECRETKEYBYTES:  %d\n",CRYPTO_SECRETKEYBYTES);
111 |   printf("CRYPTO_PUBLICKEYBYTES:  %d\n",CRYPTO_PUBLICKEYBYTES);
112 |   printf("CRYPTO_CIPHERTEXTBYTES: %d\n",CRYPTO_CIPHERTEXTBYTES);
113 | 
114 |   return 0;
115 | }
116 | 


--------------------------------------------------------------------------------
/avx2/poly.h:
--------------------------------------------------------------------------------
 1 | #ifndef POLY_H
 2 | #define POLY_H
 3 | 
 4 | #include <stdint.h>
 5 | #include "align.h"
 6 | #include "params.h"
 7 | 
/* Polynomial type: a container for KYBER_N 16-bit coefficients built with
 * the ALIGNED_INT16 macro; member names and alignment are defined by that
 * macro in align.h. */
typedef ALIGNED_INT16(KYBER_N) poly;
 9 | 
/* Conversions between a poly and byte strings. Each short name is mapped to
 * its namespaced symbol before being declared; the array bounds in the
 * prototypes give the exact buffer sizes expected. */

/* poly <-> KYBER_POLYCOMPRESSEDBYTES bytes */
#define poly_compress KYBER_NAMESPACE(poly_compress)
void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a);
#define poly_decompress KYBER_NAMESPACE(poly_decompress)
void poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]);

/* poly <-> KYBER_POLYBYTES bytes */
#define poly_tobytes KYBER_NAMESPACE(poly_tobytes)
void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a);
#define poly_frombytes KYBER_NAMESPACE(poly_frombytes)
void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]);

/* poly <-> KYBER_INDCPA_MSGBYTES-byte message */
#define poly_frommsg KYBER_NAMESPACE(poly_frommsg)
void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);
#define poly_tomsg KYBER_NAMESPACE(poly_tomsg)
void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *r);

/* Fill r from a KYBER_SYMBYTES-byte seed and a one-byte nonce; the two
 * variants presumably differ in the noise parameter (eta1 vs. eta2) --
 * confirm in poly.c. */
#define poly_getnoise_eta1 KYBER_NAMESPACE(poly_getnoise_eta1)
void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);

#define poly_getnoise_eta2 KYBER_NAMESPACE(poly_getnoise_eta2)
void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
30 | 
31 | #ifndef KYBER_90S
32 | #define poly_getnoise_eta1_4x KYBER_NAMESPACE(poly_getnoise_eta2_4x)
33 | void poly_getnoise_eta1_4x(poly *r0,
34 |                            poly *r1,
35 |                            poly *r2,
36 |                            poly *r3,
37 |                            const uint8_t seed[32],
38 |                            uint8_t nonce0,
39 |                            uint8_t nonce1,
40 |                            uint8_t nonce2,
41 |                            uint8_t nonce3);
42 | 
43 | #if KYBER_K == 2
44 | #define poly_getnoise_eta1122_4x KYBER_NAMESPACE(poly_getnoise_eta1122_4x)
45 | void poly_getnoise_eta1122_4x(poly *r0,
46 |                               poly *r1,
47 |                               poly *r2,
48 |                               poly *r3,
49 |                               const uint8_t seed[32],
50 |                               uint8_t nonce0,
51 |                               uint8_t nonce1,
52 |                               uint8_t nonce2,
53 |                               uint8_t nonce3);
54 | #endif
55 | #endif
56 | 
57 | 
/* Transform and arithmetic routines on poly. Each short name is mapped to
 * its namespaced symbol before being declared. Functions taking a single
 * non-const poly *r operate in place on r.
 * NOTE(review): coefficient ranges and ordering before/after each call are
 * defined by the implementations (poly.c and the assembly files), not by
 * this header. */
#define poly_ntt KYBER_NAMESPACE(poly_ntt)
void poly_ntt(poly *r);
#define poly_invntt_tomont KYBER_NAMESPACE(poly_invntt_tomont)
void poly_invntt_tomont(poly *r);
#define poly_nttunpack KYBER_NAMESPACE(poly_nttunpack)
void poly_nttunpack(poly *r);
/* r receives the product of a and b; inputs are not modified */
#define poly_basemul_montgomery KYBER_NAMESPACE(poly_basemul_montgomery)
void poly_basemul_montgomery(poly *r, const poly *a, const poly *b);
#define poly_tomont KYBER_NAMESPACE(poly_tomont)
void poly_tomont(poly *r);

#define poly_reduce KYBER_NAMESPACE(poly_reduce)
void poly_reduce(poly *r);

/* r receives the sum / difference of a and b; inputs are not modified */
#define poly_add KYBER_NAMESPACE(poly_add)
void poly_add(poly *r, const poly *a, const poly *b);
#define poly_sub KYBER_NAMESPACE(poly_sub)
void poly_sub(poly *r, const poly *a, const poly *b);
76 | 
77 | #endif
78 | 


--------------------------------------------------------------------------------
/ref/api.h:
--------------------------------------------------------------------------------
 1 | #ifndef API_H
 2 | #define API_H
 3 | 
 4 | #include <stdint.h>
 5 | 
/* Kyber512: object sizes in bytes (secret key, public key, ciphertext,
 * randomness for key generation and encapsulation, shared secret). */
#define pqcrystals_kyber512_SECRETKEYBYTES 1632
#define pqcrystals_kyber512_PUBLICKEYBYTES 800
#define pqcrystals_kyber512_CIPHERTEXTBYTES 768
#define pqcrystals_kyber512_KEYPAIRCOINBYTES 64
#define pqcrystals_kyber512_ENCCOINBYTES 32
#define pqcrystals_kyber512_BYTES 32

/* Aliases carrying the implementation name (ref); same values as above. */
#define pqcrystals_kyber512_ref_SECRETKEYBYTES pqcrystals_kyber512_SECRETKEYBYTES
#define pqcrystals_kyber512_ref_PUBLICKEYBYTES pqcrystals_kyber512_PUBLICKEYBYTES
#define pqcrystals_kyber512_ref_CIPHERTEXTBYTES pqcrystals_kyber512_CIPHERTEXTBYTES
#define pqcrystals_kyber512_ref_KEYPAIRCOINBYTES pqcrystals_kyber512_KEYPAIRCOINBYTES
#define pqcrystals_kyber512_ref_ENCCOINBYTES pqcrystals_kyber512_ENCCOINBYTES
#define pqcrystals_kyber512_ref_BYTES pqcrystals_kyber512_BYTES

/* Kyber512 reference implementation entry points. The *_derand variants
 * take the randomness as an explicit `coins` argument (presumably
 * KEYPAIRCOINBYTES / ENCCOINBYTES long -- confirm in kem.c); the others take
 * no randomness argument. Buffers are expected to have the sizes above. */
int pqcrystals_kyber512_ref_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
int pqcrystals_kyber512_ref_keypair(uint8_t *pk, uint8_t *sk);
int pqcrystals_kyber512_ref_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
int pqcrystals_kyber512_ref_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
int pqcrystals_kyber512_ref_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
25 | 
/* Kyber768: object sizes in bytes (secret key, public key, ciphertext,
 * randomness for key generation and encapsulation, shared secret). */
#define pqcrystals_kyber768_SECRETKEYBYTES 2400
#define pqcrystals_kyber768_PUBLICKEYBYTES 1184
#define pqcrystals_kyber768_CIPHERTEXTBYTES 1088
#define pqcrystals_kyber768_KEYPAIRCOINBYTES 64
#define pqcrystals_kyber768_ENCCOINBYTES 32
#define pqcrystals_kyber768_BYTES 32

/* Aliases carrying the implementation name (ref); same values as above. */
#define pqcrystals_kyber768_ref_SECRETKEYBYTES pqcrystals_kyber768_SECRETKEYBYTES
#define pqcrystals_kyber768_ref_PUBLICKEYBYTES pqcrystals_kyber768_PUBLICKEYBYTES
#define pqcrystals_kyber768_ref_CIPHERTEXTBYTES pqcrystals_kyber768_CIPHERTEXTBYTES
#define pqcrystals_kyber768_ref_KEYPAIRCOINBYTES pqcrystals_kyber768_KEYPAIRCOINBYTES
#define pqcrystals_kyber768_ref_ENCCOINBYTES pqcrystals_kyber768_ENCCOINBYTES
#define pqcrystals_kyber768_ref_BYTES pqcrystals_kyber768_BYTES

/* Kyber768 reference implementation entry points. The *_derand variants
 * take the randomness as an explicit `coins` argument (presumably
 * KEYPAIRCOINBYTES / ENCCOINBYTES long -- confirm in kem.c); the others take
 * no randomness argument. Buffers are expected to have the sizes above. */
int pqcrystals_kyber768_ref_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
int pqcrystals_kyber768_ref_keypair(uint8_t *pk, uint8_t *sk);
int pqcrystals_kyber768_ref_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
int pqcrystals_kyber768_ref_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
int pqcrystals_kyber768_ref_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
45 | 
/* Kyber1024: object sizes in bytes (secret key, public key, ciphertext,
 * randomness for key generation and encapsulation, shared secret). */
#define pqcrystals_kyber1024_SECRETKEYBYTES 3168
#define pqcrystals_kyber1024_PUBLICKEYBYTES 1568
#define pqcrystals_kyber1024_CIPHERTEXTBYTES 1568
#define pqcrystals_kyber1024_KEYPAIRCOINBYTES 64
#define pqcrystals_kyber1024_ENCCOINBYTES 32
#define pqcrystals_kyber1024_BYTES 32

/* Aliases carrying the implementation name (ref); same values as above. */
#define pqcrystals_kyber1024_ref_SECRETKEYBYTES pqcrystals_kyber1024_SECRETKEYBYTES
#define pqcrystals_kyber1024_ref_PUBLICKEYBYTES pqcrystals_kyber1024_PUBLICKEYBYTES
#define pqcrystals_kyber1024_ref_CIPHERTEXTBYTES pqcrystals_kyber1024_CIPHERTEXTBYTES
#define pqcrystals_kyber1024_ref_KEYPAIRCOINBYTES pqcrystals_kyber1024_KEYPAIRCOINBYTES
#define pqcrystals_kyber1024_ref_ENCCOINBYTES pqcrystals_kyber1024_ENCCOINBYTES
#define pqcrystals_kyber1024_ref_BYTES pqcrystals_kyber1024_BYTES

/* Kyber1024 reference implementation entry points. The *_derand variants
 * take the randomness as an explicit `coins` argument (presumably
 * KEYPAIRCOINBYTES / ENCCOINBYTES long -- confirm in kem.c); the others take
 * no randomness argument. Buffers are expected to have the sizes above. */
int pqcrystals_kyber1024_ref_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
int pqcrystals_kyber1024_ref_keypair(uint8_t *pk, uint8_t *sk);
int pqcrystals_kyber1024_ref_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
int pqcrystals_kyber1024_ref_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
int pqcrystals_kyber1024_ref_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
65 | 
66 | #endif
67 | 


--------------------------------------------------------------------------------
/avx2/api.h:
--------------------------------------------------------------------------------
 1 | #ifndef API_H
 2 | #define API_H
 3 | 
 4 | #include <stdint.h>
 5 | 
/* Kyber512: object sizes in bytes (secret key, public key, ciphertext,
 * randomness for key generation and encapsulation, shared secret). */
#define pqcrystals_kyber512_SECRETKEYBYTES 1632
#define pqcrystals_kyber512_PUBLICKEYBYTES 800
#define pqcrystals_kyber512_CIPHERTEXTBYTES 768
#define pqcrystals_kyber512_KEYPAIRCOINBYTES 64
#define pqcrystals_kyber512_ENCCOINBYTES 32
#define pqcrystals_kyber512_BYTES 32

/* Aliases carrying the implementation name (avx2); same values as above. */
#define pqcrystals_kyber512_avx2_SECRETKEYBYTES pqcrystals_kyber512_SECRETKEYBYTES
#define pqcrystals_kyber512_avx2_PUBLICKEYBYTES pqcrystals_kyber512_PUBLICKEYBYTES
#define pqcrystals_kyber512_avx2_CIPHERTEXTBYTES pqcrystals_kyber512_CIPHERTEXTBYTES
#define pqcrystals_kyber512_avx2_KEYPAIRCOINBYTES pqcrystals_kyber512_KEYPAIRCOINBYTES
#define pqcrystals_kyber512_avx2_ENCCOINBYTES pqcrystals_kyber512_ENCCOINBYTES
#define pqcrystals_kyber512_avx2_BYTES pqcrystals_kyber512_BYTES

/* Kyber512 AVX2 implementation entry points. The *_derand variants take the
 * randomness as an explicit `coins` argument (presumably KEYPAIRCOINBYTES /
 * ENCCOINBYTES long -- confirm in kem.c); the others take no randomness
 * argument. Buffers are expected to have the sizes above. */
int pqcrystals_kyber512_avx2_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
int pqcrystals_kyber512_avx2_keypair(uint8_t *pk, uint8_t *sk);
int pqcrystals_kyber512_avx2_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
int pqcrystals_kyber512_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
int pqcrystals_kyber512_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
25 | 
/* Kyber768: object sizes in bytes (secret key, public key, ciphertext,
 * randomness for key generation and encapsulation, shared secret). */
#define pqcrystals_kyber768_SECRETKEYBYTES 2400
#define pqcrystals_kyber768_PUBLICKEYBYTES 1184
#define pqcrystals_kyber768_CIPHERTEXTBYTES 1088
#define pqcrystals_kyber768_KEYPAIRCOINBYTES 64
#define pqcrystals_kyber768_ENCCOINBYTES 32
#define pqcrystals_kyber768_BYTES 32

/* Aliases carrying the implementation name (avx2); same values as above. */
#define pqcrystals_kyber768_avx2_SECRETKEYBYTES pqcrystals_kyber768_SECRETKEYBYTES
#define pqcrystals_kyber768_avx2_PUBLICKEYBYTES pqcrystals_kyber768_PUBLICKEYBYTES
#define pqcrystals_kyber768_avx2_CIPHERTEXTBYTES pqcrystals_kyber768_CIPHERTEXTBYTES
#define pqcrystals_kyber768_avx2_KEYPAIRCOINBYTES pqcrystals_kyber768_KEYPAIRCOINBYTES
#define pqcrystals_kyber768_avx2_ENCCOINBYTES pqcrystals_kyber768_ENCCOINBYTES
#define pqcrystals_kyber768_avx2_BYTES pqcrystals_kyber768_BYTES

/* Kyber768 AVX2 implementation entry points. The *_derand variants take the
 * randomness as an explicit `coins` argument (presumably KEYPAIRCOINBYTES /
 * ENCCOINBYTES long -- confirm in kem.c); the others take no randomness
 * argument. Buffers are expected to have the sizes above. */
int pqcrystals_kyber768_avx2_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
int pqcrystals_kyber768_avx2_keypair(uint8_t *pk, uint8_t *sk);
int pqcrystals_kyber768_avx2_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
int pqcrystals_kyber768_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
int pqcrystals_kyber768_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
45 | 
46 | #define pqcrystals_kyber1024_SECRETKEYBYTES 3168
47 | #define pqcrystals_kyber1024_PUBLICKEYBYTES 1568
48 | #define pqcrystals_kyber1024_CIPHERTEXTBYTES 1568
49 | #define pqcrystals_kyber1024_KEYPAIRCOINBYTES 64
50 | #define pqcrystals_kyber1024_ENCCOINBYTES 32
51 | #define pqcrystals_kyber1024_BYTES 32
52 | 
53 | #define pqcrystals_kyber1024_avx2_SECRETKEYBYTES pqcrystals_kyber1024_SECRETKEYBYTES
54 | #define pqcrystals_kyber1024_avx2_PUBLICKEYBYTES pqcrystals_kyber1024_PUBLICKEYBYTES
55 | #define pqcrystals_kyber1024_avx2_CIPHERTEXTBYTES pqcrystals_kyber1024_CIPHERTEXTBYTES
56 | #define pqcrystals_kyber1024_avx2_KEYPAIRCOINBYTES pqcrystals_kyber1024_KEYPAIRCOINBYTES
57 | #define pqcrystals_kyber1024_avx2_ENCCOINBYTES pqcrystals_kyber1024_ENCCOINBYTES
58 | #define pqcrystals_kyber1024_avx2_BYTES pqcrystals_kyber1024_BYTES
59 | 
60 | int pqcrystals_kyber1024_avx2_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
61 | int pqcrystals_kyber1024_avx2_keypair(uint8_t *pk, uint8_t *sk);
62 | int pqcrystals_kyber1024_avx2_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
63 | int pqcrystals_kyber1024_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
64 | int pqcrystals_kyber1024_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
65 | 
66 | #endif
67 | 


--------------------------------------------------------------------------------
/ref/cbd.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | #include "params.h"
  3 | #include "cbd.h"
  4 | 
  5 | /*************************************************
  6 | * Name:        load32_littleendian
  7 | *
  8 | * Description: assemble a 32-bit unsigned integer from
  9 | *              4 consecutive bytes, least significant byte first
 10 | *
 11 | * Arguments:   - const uint8_t *x: pointer to the 4 input bytes
 12 | *
 13 | * Returns x[0] | x[1]<<8 | x[2]<<16 | x[3]<<24 as a 32-bit unsigned integer
 14 | **************************************************/
 15 | static uint32_t load32_littleendian(const uint8_t x[4])
 16 | {
 17 |   unsigned int k;
 18 |   uint32_t word = 0;
 19 |   /* start with the most significant byte; each step shifts earlier bytes up */
 20 |   for(k=4;k>0;k--)
 21 |     word = (word << 8) | x[k-1];
 22 |   return word;
 23 | }
 24 | 
 25 | /*************************************************
 26 | * Name:        load24_littleendian
 27 | *
 28 | * Description: assemble a 32-bit unsigned integer from
 29 | *              3 consecutive bytes, least significant byte first.
 30 | *              Only compiled when KYBER_ETA1 == 3 (Kyber-512)
 31 | *
 32 | * Arguments:   - const uint8_t *x: pointer to the 3 input bytes
 33 | *
 34 | * Returns x[0] | x[1]<<8 | x[2]<<16 (the most significant byte is zero)
 35 | **************************************************/
 36 | #if KYBER_ETA1 == 3
 37 | static uint32_t load24_littleendian(const uint8_t x[3])
 38 | {
 39 |   const uint32_t lo  = x[0];
 40 |   const uint32_t mid = x[1];
 41 |   const uint32_t hi  = x[2];
 42 |   /* bits 24..31 are never set, so the top byte of the result is zero */
 43 |   return lo | (mid << 8) | (hi << 16);
 44 | }
 45 | #endif
 46 | 
 47 | 
 48 | /*************************************************
 49 | * Name:        cbd2
 50 | *
 51 | * Description: Turn an array of uniformly random bytes into a
 52 | *              polynomial whose coefficients follow a centered
 53 | *              binomial distribution with parameter eta=2
 54 | *
 55 | * Arguments:   - poly *r: pointer to output polynomial
 56 | *              - const uint8_t *buf: pointer to input byte array
 57 | **************************************************/
 58 | static void cbd2(poly *r, const uint8_t buf[2*KYBER_N/4])
 59 | {
 60 |   unsigned int blk,k;
 61 |   uint32_t word,sums;
 62 |   int16_t pos,neg;
 63 | 
 64 |   for(blk=0;blk<KYBER_N/8;blk++) {
 65 |     word = load32_littleendian(&buf[4*blk]);
 66 |     /* every 2-bit field of sums counts the set bits of one bit pair of word */
 67 |     sums = (word & 0x55555555) + ((word>>1) & 0x55555555);
 68 | 
 69 |     for(k=0;k<8;k++) {
 70 |       pos = (sums >> (4*k)) & 0x3;
 71 |       neg = (sums >> (4*k+2)) & 0x3;
 72 |       r->coeffs[8*blk+k] = pos - neg;
 73 |     }
 74 |   }
 75 | }
 76 | 
 77 | /*************************************************
 78 | * Name:        cbd3
 79 | *
 80 | * Description: Turn an array of uniformly random bytes into a
 81 | *              polynomial whose coefficients follow a centered
 82 | *              binomial distribution with parameter eta=3.
 83 | *              Only compiled when KYBER_ETA1 == 3 (Kyber-512)
 84 | *
 85 | * Arguments:   - poly *r: pointer to output polynomial
 86 | *              - const uint8_t *buf: pointer to input byte array
 87 | **************************************************/
 88 | #if KYBER_ETA1 == 3
 89 | static void cbd3(poly *r, const uint8_t buf[3*KYBER_N/4])
 90 | {
 91 |   unsigned int blk,k;
 92 |   uint32_t word,sums;
 93 |   int16_t pos,neg;
 94 | 
 95 |   for(blk=0;blk<KYBER_N/4;blk++) {
 96 |     word = load24_littleendian(&buf[3*blk]);
 97 |     /* every 3-bit field of sums counts the set bits of one bit triple of word */
 98 |     sums = (word & 0x00249249)
 99 |          + ((word>>1) & 0x00249249) + ((word>>2) & 0x00249249);
100 | 
101 |     for(k=0;k<4;k++) {
102 |       pos = (sums >> (6*k)) & 0x7;
103 |       neg = (sums >> (6*k+3)) & 0x7;
104 |       r->coeffs[4*blk+k] = pos - neg;
105 |     }
106 |   }
107 | }
108 | #endif
109 | /* poly_cbd_eta1: fill r from buf with the sampler (cbd2 or cbd3) selected by KYBER_ETA1 at compile time */
110 | void poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1*KYBER_N/4])
111 | {
112 | #if KYBER_ETA1 == 3
113 |   cbd3(r, buf);
114 | #elif KYBER_ETA1 == 2
115 |   cbd2(r, buf);
116 | #else
117 | #error "This implementation requires eta1 in {2,3}"
118 | #endif
119 | }
120 | /* poly_cbd_eta2: fill r from buf with cbd2; any KYBER_ETA2 other than 2 is rejected at compile time */
121 | void poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2*KYBER_N/4])
122 | {
123 | #if KYBER_ETA2 != 2
124 | #error "This implementation requires eta2 = 2"
125 | #else
126 |   cbd2(r, buf);
127 | #endif
128 | }
129 | 


--------------------------------------------------------------------------------
/ref/test/test_speed.c:
--------------------------------------------------------------------------------
  1 | #include <stddef.h>
  2 | #include <stdint.h>
  3 | #include <stdlib.h>
  4 | #include <stdio.h>
  5 | #include "../kem.h"
  6 | #include "../params.h"
  7 | #include "../indcpa.h"
  8 | #include "../polyvec.h"
  9 | #include "../poly.h"
 10 | #include "../randombytes.h"
 11 | #include "cpucycles.h"
 12 | #include "speed_print.h"
 13 | 
 14 | #define NTESTS 1000 /* number of timed iterations per benchmarked function */
 15 | 
 16 | uint64_t t[NTESTS]; /* one cpucycles() reading per iteration, handed to print_results */
 17 | uint8_t seed[KYBER_SYMBYTES] = {0}; /* fixed all-zero seed/coins input for the internal-function benchmarks */
 18 | 
 19 | int main(void) /* benchmarks internal routines first, then the indcpa_* and crypto_kem_* calls; always returns 0 */
 20 | {
 21 |   unsigned int i;
 22 |   uint8_t pk[CRYPTO_PUBLICKEYBYTES];
 23 |   uint8_t sk[CRYPTO_SECRETKEYBYTES];
 24 |   uint8_t ct[CRYPTO_CIPHERTEXTBYTES];
 25 |   uint8_t key[CRYPTO_BYTES];
 26 |   uint8_t coins32[KYBER_SYMBYTES];
 27 |   uint8_t coins64[2*KYBER_SYMBYTES];
 28 |   polyvec matrix[KYBER_K];
 29 |   poly ap;
 30 | 
 31 |   randombytes(coins32, KYBER_SYMBYTES); /* random coins are drawn once and reused by every *_derand call below */
 32 |   randombytes(coins64, 2*KYBER_SYMBYTES);
 33 | 
 34 |   for(i=0;i<NTESTS;i++) { /* pattern used by every loop: store a timestamp, then make one call */
 35 |     t[i] = cpucycles();
 36 |     gen_matrix(matrix, seed, 0); /* also fills matrix, which later benchmarks read */
 37 |   }
 38 |   print_results("gen_a: ", t, NTESTS); /* receives the raw timestamps; presumably differences them - confirm in speed_print.c */
 39 | 
 40 |   for(i=0;i<NTESTS;i++) {
 41 |     t[i] = cpucycles();
 42 |     poly_getnoise_eta1(&ap, seed, 0);
 43 |   }
 44 |   print_results("poly_getnoise_eta1: ", t, NTESTS);
 45 | 
 46 |   for(i=0;i<NTESTS;i++) {
 47 |     t[i] = cpucycles();
 48 |     poly_getnoise_eta2(&ap, seed, 0);
 49 |   }
 50 |   print_results("poly_getnoise_eta2: ", t, NTESTS);
 51 | 
 52 |   for(i=0;i<NTESTS;i++) {
 53 |     t[i] = cpucycles();
 54 |     poly_ntt(&ap); /* the same ap is passed on every iteration without being re-sampled */
 55 |   }
 56 |   print_results("NTT: ", t, NTESTS);
 57 | 
 58 |   for(i=0;i<NTESTS;i++) {
 59 |     t[i] = cpucycles();
 60 |     poly_invntt_tomont(&ap);
 61 |   }
 62 |   print_results("INVNTT: ", t, NTESTS);
 63 | 
 64 |   for(i=0;i<NTESTS;i++) {
 65 |     t[i] = cpucycles();
 66 |     polyvec_basemul_acc_montgomery(&ap, &matrix[0], &matrix[1]); /* operands are the first two entries of matrix */
 67 |   }
 68 |   print_results("polyvec_basemul_acc_montgomery: ", t, NTESTS);
 69 | 
 70 |   for(i=0;i<NTESTS;i++) {
 71 |     t[i] = cpucycles();
 72 |     poly_tomsg(ct,&ap); /* ct doubles as scratch buffer for the msg/compress benchmarks */
 73 |   }
 74 |   print_results("poly_tomsg: ", t, NTESTS);
 75 | 
 76 |   for(i=0;i<NTESTS;i++) {
 77 |     t[i] = cpucycles();
 78 |     poly_frommsg(&ap,ct);
 79 |   }
 80 |   print_results("poly_frommsg: ", t, NTESTS);
 81 | 
 82 |   for(i=0;i<NTESTS;i++) {
 83 |     t[i] = cpucycles();
 84 |     poly_compress(ct,&ap);
 85 |   }
 86 |   print_results("poly_compress: ", t, NTESTS);
 87 | 
 88 |   for(i=0;i<NTESTS;i++) {
 89 |     t[i] = cpucycles();
 90 |     poly_decompress(&ap,ct);
 91 |   }
 92 |   print_results("poly_decompress: ", t, NTESTS);
 93 | 
 94 |   for(i=0;i<NTESTS;i++) {
 95 |     t[i] = cpucycles();
 96 |     polyvec_compress(ct,&matrix[0]);
 97 |   }
 98 |   print_results("polyvec_compress: ", t, NTESTS);
 99 | 
100 |   for(i=0;i<NTESTS;i++) {
101 |     t[i] = cpucycles();
102 |     polyvec_decompress(&matrix[0],ct);
103 |   }
104 |   print_results("polyvec_decompress: ", t, NTESTS);
105 | 
106 |   for(i=0;i<NTESTS;i++) {
107 |     t[i] = cpucycles();
108 |     indcpa_keypair_derand(pk, sk, coins32); /* reported under the label "indcpa_keypair" (no _derand suffix) */
109 |   }
110 |   print_results("indcpa_keypair: ", t, NTESTS);
111 | 
112 |   for(i=0;i<NTESTS;i++) {
113 |     t[i] = cpucycles();
114 |     indcpa_enc(ct, key, pk, seed); /* NOTE(review): nothing has written key before this call - confirm reading it as input here is intended */
115 |   }
116 |   print_results("indcpa_enc: ", t, NTESTS);
117 | 
118 |   for(i=0;i<NTESTS;i++) {
119 |     t[i] = cpucycles();
120 |     indcpa_dec(key, ct, sk); /* ct and sk are the ones left by the two indcpa loops above */
121 |   }
122 |   print_results("indcpa_dec: ", t, NTESTS);
123 | 
124 |   for(i=0;i<NTESTS;i++) {
125 |     t[i] = cpucycles();
126 |     crypto_kem_keypair_derand(pk, sk, coins64); /* from here on pk and sk are passed to the KEM-level functions */
127 |   }
128 |   print_results("kyber_keypair_derand: ", t, NTESTS);
129 | 
130 |   for(i=0;i<NTESTS;i++) {
131 |     t[i] = cpucycles();
132 |     crypto_kem_keypair(pk, sk);
133 |   }
134 |   print_results("kyber_keypair: ", t, NTESTS);
135 | 
136 |   for(i=0;i<NTESTS;i++) {
137 |     t[i] = cpucycles();
138 |     crypto_kem_enc_derand(ct, key, pk, coins32);
139 |   }
140 |   print_results("kyber_encaps_derand: ", t, NTESTS);
141 | 
142 |   for(i=0;i<NTESTS;i++) {
143 |     t[i] = cpucycles();
144 |     crypto_kem_enc(ct, key, pk);
145 |   }
146 |   print_results("kyber_encaps: ", t, NTESTS);
147 | 
148 |   for(i=0;i<NTESTS;i++) {
149 |     t[i] = cpucycles();
150 |     crypto_kem_dec(key, ct, sk); /* uses ct from the crypto_kem_enc loop and sk from crypto_kem_keypair */
151 |   }
152 |   print_results("kyber_decaps: ", t, NTESTS);
153 | 
154 |   return 0;
155 | }
156 | 


--------------------------------------------------------------------------------
/avx2/keccak4x/KeccakP-1600-times4-SnP.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
 3 | Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
 4 | denoted as "the implementer".
 5 | 
 6 | For more information, feedback or questions, please refer to our websites:
 7 | http://keccak.noekeon.org/
 8 | http://keyak.noekeon.org/
 9 | http://ketje.noekeon.org/
10 | 
11 | To the extent possible under law, the implementer has waived all copyright
12 | and related or neighboring rights to the source code in this file.
13 | http://creativecommons.org/publicdomain/zero/1.0/
14 | */
15 | 
16 | #ifndef _KeccakP_1600_times4_SnP_h_
17 | #define _KeccakP_1600_times4_SnP_h_
18 | 
19 | /** For the documentation, see PlSnP-documentation.h.
20 |  */
21 | 
22 | #include "KeccakP-SIMD256-config.h"
23 | #include "../fips202x4.h"
24 | 
25 | #define KeccakP1600times4_implementation        "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")"
26 | #define KeccakP1600times4_statesSizeInBytes     800
27 | #define KeccakP1600times4_statesAlignment       32
28 | #define KeccakF1600times4_FastLoop_supported
29 | #define KeccakP1600times4_12rounds_FastLoop_supported
30 | 
31 | #include <stddef.h>
32 | 
33 | #define KeccakP1600times4_StaticInitialize()
34 | #define KeccakP1600times4_InitializeAll FIPS202X4_NAMESPACE(KeccakP1600times4_InitializeAll)
35 | void KeccakP1600times4_InitializeAll(void *states);
36 | #define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \
37 |     ((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte) /* XORs byte into states viewed as raw bytes: every group of 8 offsets maps to one 32-byte row, and instanceIndex selects an 8-byte slot inside that row */
38 | #define KeccakP1600times4_AddBytes FIPS202X4_NAMESPACE(KeccakP1600times4_AddBytes)
39 | void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length);
40 | #define KeccakP1600times4_AddLanesAll FIPS202X4_NAMESPACE(KeccakP1600times4_AddLanesAll)
41 | void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
42 | #define KeccakP1600times4_OverwriteBytes FIPS202X4_NAMESPACE(KeccakP1600times4_OverwriteBytes)
43 | void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length);
44 | #define KeccakP1600times4_OverwriteLanesAll FIPS202X4_NAMESPACE(KeccakP1600times4_OverwriteLanesAll)
45 | void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
46 | #define KeccakP1600times4_OverwriteWithZeroes FIPS202X4_NAMESPACE(KeccakP1600times4_OverwriteWithZeroes)
47 | void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount);
48 | #define KeccakP1600times4_PermuteAll_12rounds FIPS202X4_NAMESPACE(KeccakP1600times4_PermuteAll_12rounds)
49 | void KeccakP1600times4_PermuteAll_12rounds(void *states);
50 | #define KeccakP1600times4_PermuteAll_24rounds FIPS202X4_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds)
51 | void KeccakP1600times4_PermuteAll_24rounds(void *states);
52 | #define KeccakP1600times4_ExtractBytes FIPS202X4_NAMESPACE(KeccakP1600times4_ExtractBytes)
53 | void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length);
54 | #define KeccakP1600times4_ExtractLanesAll FIPS202X4_NAMESPACE(KeccakP1600times4_ExtractLanesAll)
55 | void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
56 | #define KeccakP1600times4_ExtractAndAddBytes FIPS202X4_NAMESPACE(KeccakP1600times4_ExtractAndAddBytes)
57 | void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex,  const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length);
58 | #define KeccakP1600times4_ExtractAndAddLanesAll FIPS202X4_NAMESPACE(KeccakP1600times4_ExtractAndAddLanesAll)
59 | void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset);
60 | #define KeccakF1600times4_FastLoop_Absorb FIPS202X4_NAMESPACE(KeccakF1600times4_FastLoop_Absorb)
61 | size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen);
62 | #define KeccakP1600times4_12rounds_FastLoop_Absorb FIPS202X4_NAMESPACE(KeccakP1600times4_12rounds_FastLoop_Absorb)
63 | size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen);
64 | 
65 | #endif
66 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Kyber
 2 | 
 3 | [![Build Status](https://travis-ci.org/pq-crystals/kyber.svg?branch=master)](https://travis-ci.org/pq-crystals/kyber) 
 4 | [![Coverage Status](https://coveralls.io/repos/github/pq-crystals/kyber/badge.svg?branch=master)](https://coveralls.io/github/pq-crystals/kyber?branch=master)
 5 | 
 6 | This repository contains the official reference implementation of the [Kyber](https://www.pq-crystals.org/kyber/) key encapsulation mechanism, 
 7 | and an optimized implementation for x86 CPUs supporting the AVX2 instruction set. 
 8 | Kyber has been selected for standardization in [round 3](https://csrc.nist.gov/Projects/post-quantum-cryptography/round-3-submissions) 
 9 | of the [NIST PQC](https://csrc.nist.gov/projects/post-quantum-cryptography) standardization project.
10 | 
11 | 
12 | ## Build instructions
13 | 
14 | The implementations contain several test and benchmarking programs and a Makefile to facilitate compilation.
15 | 
16 | ### Prerequisites
17 | 
18 | Some of the test programs require [OpenSSL](https://openssl.org). 
19 | If the OpenSSL header files and/or shared libraries do not lie in one of the standard locations on your system, 
20 | it is necessary to specify their location via compiler and linker flags in the environment variables `CFLAGS`, `NISTFLAGS`, and `LDFLAGS`.
21 | 
22 | For example, on macOS you can install OpenSSL via [Homebrew](https://brew.sh) by running
23 | ```sh
24 | brew install openssl
25 | ```
26 | Then, run
27 | ```sh
28 | export CFLAGS="-I/usr/local/opt/openssl@1.1/include"
29 | export NISTFLAGS="-I/usr/local/opt/openssl@1.1/include"
30 | export LDFLAGS="-L/usr/local/opt/openssl@1.1/lib"
31 | ```
32 | before compilation to add the OpenSSL header and library locations to the respective search paths.
33 | 
34 | ### Building all binaries
35 | 
36 | To compile the test and benchmarking programs on Linux or macOS, go to the `ref/` or `avx2/` directory and run
37 | ```sh
38 | make
39 | ```
40 | This produces the executables
41 | ```sh
42 | test/test_kyber$ALG
43 | test/test_vectors$ALG
44 | test/test_speed$ALG
45 | ```
46 | where `$ALG` ranges over the parameter sets 512, 768, 1024.
47 | 
48 | * `test_kyber$ALG` runs 1000 rounds of key generation, encapsulation of a random key, and decapsulation, checking that the encapsulated key is recovered correctly. 
49 |   The program also checks that decapsulation does not recover the correct key when given a random secret key 
50 |   or a ciphertext in which a single random byte has been distorted, in order to test for trivial failures of CCA security. 
51 |   The program will abort with an error message and return 1 if there was an error. 
52 |   Otherwise it will output the key and ciphertext sizes and return 0.
53 | * `test_vectors$ALG` generates 10000 sets of test vectors containing keys, ciphertexts and shared secrets 
54 |   whose byte-strings are output in hexadecimal. It also generates test vectors for decapsulation of invalid
55 |   (pseudorandom) ciphertexts.
56 |   The required random bytes are deterministic and come from SHAKE128 on empty input.
57 | * `test_speed$ALG` reports the median and average cycle counts of 1000 executions of various internal functions 
58 |   and the API functions for key generation, encapsulation and decapsulation. 
59 |   By default the Time Stamp Counter is used. 
60 |   If instead you want to obtain the actual cycle counts from the Performance Measurement Counters, export `CFLAGS="-DUSE_RDPMC"` before compilation.
61 | 
62 | Please note that the reference implementation in `ref/` is not optimized for any platform, and, since it prioritises clean code, 
63 | is significantly slower than a trivially optimized but still platform-independent implementation. 
64 | Hence benchmarking the reference code does not provide particularly meaningful results.
65 | 
66 | <!--
67 | Our Kyber implementations are contained in the [SUPERCOP](https://bench.cr.yp.to) benchmarking framework. 
68 | See [here](http://bench.cr.yp.to/results-kem.html#amd64-kizomba) for cycle counts on an Intel KabyLake CPU.
69 | -->
70 | 
71 | ## Shared libraries
72 | 
73 | All implementations can be compiled into shared libraries by running
74 | ```sh
75 | make shared
76 | ```
77 | For example in the directory `ref/` of the reference implementation, this produces the libraries
78 | ```sh
79 | libpqcrystals_kyber$ALG_ref.so
80 | ```
81 | for all parameter sets `$ALG`, and the required symmetric crypto libraries
82 | ```
83 | libpqcrystals_aes256ctr_ref.so
84 | libpqcrystals_fips202_ref.so
85 | ```
86 | All global symbols in the libraries lie in the namespaces `pqcrystals_kyber$ALG_ref`, `libpqcrystals_aes256ctr_ref` and `libpqcrystals_fips202_ref`. Hence it is possible to link a program against all libraries simultaneously and obtain access to all implementations for all parameter sets. The corresponding API header file is `ref/api.h`, which contains prototypes for all API functions and preprocessor defines for the key and ciphertext lengths.
87 | 
88 | 


--------------------------------------------------------------------------------
/avx2/ntt.S:
--------------------------------------------------------------------------------
  1 | #include "consts.h"
  2 | .include "shuffle.inc"
  3 | /* mul: multiply rh0..rh3 by twiddle registers. Low 16 bits of the rh*zl products go to ymm12..ymm15; high 16 bits of the rh*zh products replace rh0..rh3. */
  4 | .macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2
  5 | vpmullw		%ymm\zl0,%ymm\rh0,%ymm12
  6 | vpmullw		%ymm\zl0,%ymm\rh1,%ymm13
  7 | 
  8 | vpmullw		%ymm\zl1,%ymm\rh2,%ymm14
  9 | vpmullw		%ymm\zl1,%ymm\rh3,%ymm15
 10 | /* with the default zl1=15 the vpmullw above overwrites its own twiddle register; the vpmulhw group below then overwrites rh0..rh3 */
 11 | vpmulhw		%ymm\zh0,%ymm\rh0,%ymm\rh0
 12 | vpmulhw		%ymm\zh0,%ymm\rh1,%ymm\rh1
 13 | 
 14 | vpmulhw		%ymm\zh1,%ymm\rh2,%ymm\rh2
 15 | vpmulhw		%ymm\zh1,%ymm\rh3,%ymm\rh3
 16 | .endm
 17 | /* reduce: replace each of ymm12..ymm15 by the high 16 bits of its product with ymm0 (ntt_avx loads ymm0 from _16XQ) */
 18 | .macro reduce
 19 | vpmulhw		%ymm0,%ymm12,%ymm12
 20 | vpmulhw		%ymm0,%ymm13,%ymm13
 21 | 
 22 | vpmulhw		%ymm0,%ymm14,%ymm14
 23 | vpmulhw		%ymm0,%ymm15,%ymm15
 24 | .endm
 25 | /* update: four butterflies. The sums rl+rh are written to rln,rl0,rl1,rl2 (one register earlier than the rl input they came from); the differences rl-rh replace rh0..rh3. */
 26 | .macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3
 27 | vpaddw		%ymm\rh0,%ymm\rl0,%ymm\rln
 28 | vpsubw		%ymm\rh0,%ymm\rl0,%ymm\rh0
 29 | vpaddw		%ymm\rh1,%ymm\rl1,%ymm\rl0
 30 | 
 31 | vpsubw		%ymm\rh1,%ymm\rl1,%ymm\rh1
 32 | vpaddw		%ymm\rh2,%ymm\rl2,%ymm\rl1
 33 | vpsubw		%ymm\rh2,%ymm\rl2,%ymm\rh2
 34 | 
 35 | vpaddw		%ymm\rh3,%ymm\rl3,%ymm\rl2
 36 | vpsubw		%ymm\rh3,%ymm\rl3,%ymm\rh3
 37 | /* apply the ymm12..ymm15 terms: subtracted from each sum, added to each difference */
 38 | vpsubw		%ymm12,%ymm\rln,%ymm\rln
 39 | vpaddw		%ymm12,%ymm\rh0,%ymm\rh0
 40 | vpsubw		%ymm13,%ymm\rl0,%ymm\rl0
 41 | 
 42 | vpaddw		%ymm13,%ymm\rh1,%ymm\rh1
 43 | vpsubw		%ymm14,%ymm\rl1,%ymm\rl1
 44 | vpaddw		%ymm14,%ymm\rh2,%ymm\rh2
 45 | 
 46 | vpsubw		%ymm15,%ymm\rl2,%ymm\rl2
 47 | vpaddw		%ymm15,%ymm\rh3,%ymm\rh3
 48 | .endm
 49 | /* level0: first butterfly level, in place at (%rdi): pairs 16-bit coefficient 64*off+k with coefficient 64*off+128+k for k=0..63 */
 50 | .macro level0 off
 51 | vpbroadcastq	(_ZETAS_EXP+0)*2(%rsi),%ymm15
 52 | vmovdqa		(64*\off+128)*2(%rdi),%ymm8
 53 | vmovdqa		(64*\off+144)*2(%rdi),%ymm9
 54 | vmovdqa		(64*\off+160)*2(%rdi),%ymm10
 55 | vmovdqa		(64*\off+176)*2(%rdi),%ymm11
 56 | vpbroadcastq	(_ZETAS_EXP+4)*2(%rsi),%ymm2
 57 | 
 58 | mul		8,9,10,11
 59 | /* the low partners are loaded only now, between mul and reduce; update is their first use */
 60 | vmovdqa		(64*\off+  0)*2(%rdi),%ymm4
 61 | vmovdqa		(64*\off+ 16)*2(%rdi),%ymm5
 62 | vmovdqa		(64*\off+ 32)*2(%rdi),%ymm6
 63 | vmovdqa		(64*\off+ 48)*2(%rdi),%ymm7
 64 | 
 65 | reduce
 66 | update		3,4,5,6,7,8,9,10,11
 67 | 
 68 | vmovdqa		%ymm3,(64*\off+  0)*2(%rdi)
 69 | vmovdqa		%ymm4,(64*\off+ 16)*2(%rdi)
 70 | vmovdqa		%ymm5,(64*\off+ 32)*2(%rdi)
 71 | vmovdqa		%ymm6,(64*\off+ 48)*2(%rdi)
 72 | vmovdqa		%ymm8,(64*\off+128)*2(%rdi)
 73 | vmovdqa		%ymm9,(64*\off+144)*2(%rdi)
 74 | vmovdqa		%ymm10,(64*\off+160)*2(%rdi)
 75 | vmovdqa		%ymm11,(64*\off+176)*2(%rdi)
 76 | .endm
 77 | /* levels1t6: six further butterfly levels on the 128 coefficients starting at 128*off; coefficients are loaded once at level 1 and stored once at the end, twiddles are read from _ZETAS_EXP+224*off+... */
 78 | .macro levels1t6 off
 79 | /* level 1: pairs coefficient k with coefficient k+64 inside this 128-coefficient half */
 80 | vmovdqa		(_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15
 81 | vmovdqa		(128*\off+ 64)*2(%rdi),%ymm8
 82 | vmovdqa		(128*\off+ 80)*2(%rdi),%ymm9
 83 | vmovdqa		(128*\off+ 96)*2(%rdi),%ymm10
 84 | vmovdqa		(128*\off+112)*2(%rdi),%ymm11
 85 | vmovdqa		(_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2
 86 | 
 87 | mul		8,9,10,11
 88 | 
 89 | vmovdqa		(128*\off+  0)*2(%rdi),%ymm4
 90 | vmovdqa	 	(128*\off+ 16)*2(%rdi),%ymm5
 91 | vmovdqa		(128*\off+ 32)*2(%rdi),%ymm6
 92 | vmovdqa		(128*\off+ 48)*2(%rdi),%ymm7
 93 | 
 94 | reduce
 95 | update		3,4,5,6,7,8,9,10,11
 96 | 
 97 | /* level 2: shuffle8 (defined in shuffle.inc) regroups the registers around the butterflies; presumably it separates butterfly partners into different registers - confirm in shuffle.inc */
 98 | shuffle8	5,10,7,10
 99 | shuffle8	6,11,5,11
100 | 
101 | vmovdqa		(_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15
102 | vmovdqa		(_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2
103 | 
104 | mul		7,10,5,11
105 | 
106 | shuffle8	3,8,6,8
107 | shuffle8	4,9,3,9
108 | 
109 | reduce
110 | update		4,6,8,3,9,7,10,5,11
111 | 
112 | /* level 3: same pattern as level 2, using shuffle4 */
113 | shuffle4	8,5,9,5
114 | shuffle4	3,11,8,11
115 | 
116 | vmovdqa		(_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15
117 | vmovdqa		(_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2
118 | 
119 | mul		9,5,8,11
120 | 
121 | shuffle4	4,7,3,7
122 | shuffle4	6,10,4,10
123 | 
124 | reduce
125 | update		6,3,7,4,10,9,5,8,11
126 | 
127 | /* level 4: same pattern, using shuffle2 */
128 | shuffle2	7,8,10,8
129 | shuffle2	4,11,7,11
130 | 
131 | vmovdqa		(_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15
132 | vmovdqa		(_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2
133 | 
134 | mul		10,8,7,11
135 | 
136 | shuffle2	6,9,4,9
137 | shuffle2	3,5,6,5
138 | 
139 | reduce
140 | update		3,4,9,6,5,10,8,7,11
141 | 
142 | /* level 5: same pattern, using shuffle1 */
143 | shuffle1	9,7,5,7
144 | shuffle1	6,11,9,11
145 | 
146 | vmovdqa		(_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15
147 | vmovdqa		(_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2
148 | 
149 | mul		5,7,9,11
150 | 
151 | shuffle1	3,10,6,10
152 | shuffle1	4,8,3,8
153 | 
154 | reduce
155 | update		4,6,10,3,8,5,7,9,11
156 | 
157 | /* level 6: no shuffle; the two register pairs get different twiddle vectors (zl0=ymm14, zl1=ymm15, zh0=ymm8, zh1=ymm2), and ymm8 is reused as an output by update */
158 | vmovdqa		(_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14
159 | vmovdqa		(_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15
160 | vmovdqa		(_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8
161 | vmovdqa		(_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2
162 | 
163 | mul		10,3,9,11,14,15,8,2
164 | 
165 | reduce
166 | update		8,4,6,5,7,10,3,9,11
167 | 
168 | vmovdqa		%ymm8,(128*\off+  0)*2(%rdi)
169 | vmovdqa		%ymm4,(128*\off+ 16)*2(%rdi)
170 | vmovdqa		%ymm10,(128*\off+ 32)*2(%rdi)
171 | vmovdqa		%ymm3,(128*\off+ 48)*2(%rdi)
172 | vmovdqa		%ymm6,(128*\off+ 64)*2(%rdi)
173 | vmovdqa		%ymm5,(128*\off+ 80)*2(%rdi)
174 | vmovdqa		%ymm9,(128*\off+ 96)*2(%rdi)
175 | vmovdqa		%ymm11,(128*\off+112)*2(%rdi)
176 | .endm
177 | /* ntt_avx: transforms the 256 16-bit coefficients at (%rdi) in place; %rsi is the base address for the _16XQ and _ZETAS_EXP constant offsets. Level 0 runs on both column groups, then levels 1-6 on each 128-coefficient half. */
178 | .text
179 | .global cdecl(ntt_avx)
180 | cdecl(ntt_avx):
181 | vmovdqa		_16XQ*2(%rsi),%ymm0
182 | 
183 | level0		0
184 | level0		1
185 | 
186 | levels1t6	0
187 | levels1t6	1
188 | 
189 | ret
190 | 


--------------------------------------------------------------------------------
/ref/Makefile:
--------------------------------------------------------------------------------
  1 | CC ?= /usr/bin/cc
  2 | CFLAGS += -Wall -Wextra -Wpedantic -Wmissing-prototypes -Wredundant-decls \
  3 |   -Wshadow -Wpointer-arith -O3 -fomit-frame-pointer -z noexecstack
  4 | NISTFLAGS += -Wno-unused-result -O3 -fomit-frame-pointer
  5 | RM = /bin/rm
  6 | # SOURCES are compiled by every rule; HEADERS only appear as prerequisites so that header edits trigger rebuilds
  7 | SOURCES = kem.c indcpa.c polyvec.c poly.c ntt.c cbd.c reduce.c verify.c
  8 | SOURCESKECCAK = $(SOURCES) fips202.c symmetric-shake.c
  9 | HEADERS = params.h kem.h indcpa.h polyvec.h poly.h ntt.h cbd.h reduce.h verify.h symmetric.h
 10 | HEADERSKECCAK = $(HEADERS) fips202.h
 11 | # test/ and nistkat/ also exist as directories, so the aggregate targets of the same name are declared phony
 12 | .PHONY: all test speed shared nistkat clean
 13 | 
 14 | all: test speed shared nistkat
 15 | 
 16 | test: \
 17 |   test/test_kyber512 \
 18 |   test/test_kyber768 \
 19 |   test/test_kyber1024 \
 20 |   test/test_vectors512 \
 21 |   test/test_vectors768 \
 22 |   test/test_vectors1024 \
 23 | 
 24 | speed: \
 25 |   test/test_speed512 \
 26 |   test/test_speed768 \
 27 |   test/test_speed1024 \
 28 | 
 29 | shared: \
 30 |   lib/libpqcrystals_kyber512_ref.so \
 31 |   lib/libpqcrystals_kyber768_ref.so \
 32 |   lib/libpqcrystals_kyber1024_ref.so \
 33 |   lib/libpqcrystals_fips202_ref.so \
 34 | 
 35 | nistkat: \
 36 | 	nistkat/PQCgenKAT_kem512 \
 37 | 	nistkat/PQCgenKAT_kem768 \
 38 | 	nistkat/PQCgenKAT_kem1024 \
 39 | 
 40 | 
 41 | lib/libpqcrystals_fips202_ref.so: fips202.c fips202.h
 42 | 	mkdir -p lib
 43 | 	$(CC) -shared -fPIC $(CFLAGS) fips202.c -o $@
 44 | 
 45 | lib/libpqcrystals_kyber512_ref.so: $(SOURCES) $(HEADERS) symmetric-shake.c
 46 | 	mkdir -p lib
 47 | 	$(CC) -shared -fPIC $(CFLAGS) -DKYBER_K=2 $(SOURCES) symmetric-shake.c -o $@
 48 | 
 49 | lib/libpqcrystals_kyber768_ref.so: $(SOURCES) $(HEADERS) symmetric-shake.c
 50 | 	mkdir -p lib
 51 | 	$(CC) -shared -fPIC $(CFLAGS) -DKYBER_K=3 $(SOURCES) symmetric-shake.c -o $@
 52 | 
 53 | lib/libpqcrystals_kyber1024_ref.so: $(SOURCES) $(HEADERS) symmetric-shake.c
 54 | 	mkdir -p lib
 55 | 	$(CC) -shared -fPIC $(CFLAGS) -DKYBER_K=4 $(SOURCES) symmetric-shake.c -o $@
 56 | 
 57 | test/test_kyber512: $(SOURCESKECCAK) $(HEADERSKECCAK) test/test_kyber.c randombytes.c
 58 | 	$(CC) $(CFLAGS) -DKYBER_K=2 $(SOURCESKECCAK) randombytes.c test/test_kyber.c -o $@
 59 | 
 60 | test/test_kyber768: $(SOURCESKECCAK) $(HEADERSKECCAK) test/test_kyber.c randombytes.c
 61 | 	$(CC) $(CFLAGS) -DKYBER_K=3 $(SOURCESKECCAK) randombytes.c test/test_kyber.c -o $@
 62 | 
 63 | test/test_kyber1024: $(SOURCESKECCAK) $(HEADERSKECCAK) test/test_kyber.c randombytes.c
 64 | 	$(CC) $(CFLAGS) -DKYBER_K=4 $(SOURCESKECCAK) randombytes.c test/test_kyber.c -o $@
 65 | 
 66 | test/test_vectors512: $(SOURCESKECCAK) $(HEADERSKECCAK) test/test_vectors.c
 67 | 	$(CC) $(CFLAGS) -DKYBER_K=2 $(SOURCESKECCAK) test/test_vectors.c -o $@
 68 | 
 69 | test/test_vectors768: $(SOURCESKECCAK) $(HEADERSKECCAK) test/test_vectors.c
 70 | 	$(CC) $(CFLAGS) -DKYBER_K=3 $(SOURCESKECCAK) test/test_vectors.c -o $@
 71 | 
 72 | test/test_vectors1024: $(SOURCESKECCAK) $(HEADERSKECCAK) test/test_vectors.c
 73 | 	$(CC) $(CFLAGS) -DKYBER_K=4 $(SOURCESKECCAK) test/test_vectors.c -o $@
 74 | 
 75 | test/test_speed512: $(SOURCESKECCAK) $(HEADERSKECCAK) test/cpucycles.h test/cpucycles.c test/speed_print.h test/speed_print.c test/test_speed.c randombytes.c
 76 | 	$(CC) $(CFLAGS) -DKYBER_K=2 $(SOURCESKECCAK) randombytes.c test/cpucycles.c test/speed_print.c test/test_speed.c -o $@
 77 | 
 78 | test/test_speed768: $(SOURCESKECCAK) $(HEADERSKECCAK) test/cpucycles.h test/cpucycles.c test/speed_print.h test/speed_print.c test/test_speed.c randombytes.c
 79 | 	$(CC) $(CFLAGS) -DKYBER_K=3 $(SOURCESKECCAK) randombytes.c test/cpucycles.c test/speed_print.c test/test_speed.c -o $@
 80 | 
 81 | test/test_speed1024: $(SOURCESKECCAK) $(HEADERSKECCAK) test/cpucycles.h test/cpucycles.c test/speed_print.h test/speed_print.c test/test_speed.c randombytes.c
 82 | 	$(CC) $(CFLAGS) -DKYBER_K=4 $(SOURCESKECCAK) randombytes.c test/cpucycles.c test/speed_print.c test/test_speed.c -o $@
 83 | 
 84 | nistkat/PQCgenKAT_kem512: $(SOURCESKECCAK) $(HEADERSKECCAK) nistkat/PQCgenKAT_kem.c nistkat/rng.c nistkat/rng.h
 85 | 	$(CC) $(NISTFLAGS) -DKYBER_K=2 -o $@ $(SOURCESKECCAK) nistkat/rng.c nistkat/PQCgenKAT_kem.c $(LDFLAGS) -lcrypto
 86 | 
 87 | nistkat/PQCgenKAT_kem768: $(SOURCESKECCAK) $(HEADERSKECCAK) nistkat/PQCgenKAT_kem.c nistkat/rng.c nistkat/rng.h
 88 | 	$(CC) $(NISTFLAGS) -DKYBER_K=3 -o $@ $(SOURCESKECCAK) nistkat/rng.c nistkat/PQCgenKAT_kem.c $(LDFLAGS) -lcrypto
 89 | 
 90 | nistkat/PQCgenKAT_kem1024: $(SOURCESKECCAK) $(HEADERSKECCAK) nistkat/PQCgenKAT_kem.c nistkat/rng.c nistkat/rng.h
 91 | 	$(CC) $(NISTFLAGS) -DKYBER_K=4 -o $@ $(SOURCESKECCAK) nistkat/rng.c nistkat/PQCgenKAT_kem.c $(LDFLAGS) -lcrypto
 92 | 
 93 | clean:
 94 | 	-$(RM) -f *.gcno *.gcda *.lcov *.o *.so
 95 | 	-$(RM) -f test/test_kyber512
 96 | 	-$(RM) -f test/test_kyber768
 97 | 	-$(RM) -f test/test_kyber1024
 98 | 	-$(RM) -f test/test_vectors512
 99 | 	-$(RM) -f test/test_vectors768
100 | 	-$(RM) -f test/test_vectors1024
101 | 	-$(RM) -f test/test_speed512
102 | 	-$(RM) -f test/test_speed768
103 | 	-$(RM) -f test/test_speed1024
104 | 	-$(RM) -f nistkat/PQCgenKAT_kem512
105 | 	-$(RM) -f nistkat/PQCgenKAT_kem768
106 | 	-$(RM) -f nistkat/PQCgenKAT_kem1024
107 | 	-$(RM) -f nistkat/*.req
108 | 	-$(RM) -f nistkat/*.rsp
109 | 	-$(RM) -rf lib/
110 | 
111 | 


--------------------------------------------------------------------------------
/avx2/Makefile:
--------------------------------------------------------------------------------
  1 | CC ?= /usr/bin/cc
  2 | CFLAGS += -Wall -Wextra -Wpedantic -Wmissing-prototypes -Wredundant-decls \
  3 |   -Wshadow -Wpointer-arith -mavx2 -mbmi2 -mpopcnt \
  4 |   -march=native -mtune=native -O3 -fomit-frame-pointer -z noexecstack
  5 | NISTFLAGS += -Wno-unused-result -mavx2 -mbmi2 -mpopcnt \
  6 |   -march=native -mtune=native -O3 -fomit-frame-pointer
  7 | RM = /bin/rm
  8 | 
  9 | SOURCES = kem.c indcpa.c polyvec.c poly.c fq.S shuffle.S ntt.S invntt.S \
 10 |   basemul.S consts.c rejsample.c cbd.c verify.c
 11 | SOURCESKECCAK   = $(SOURCES) fips202.c fips202x4.c symmetric-shake.c \
 12 |   keccak4x/KeccakP-1600-times4-SIMD256.o
 13 | HEADERS = params.h align.h kem.h indcpa.h polyvec.h poly.h reduce.h fq.inc shuffle.inc \
 14 |   ntt.h consts.h rejsample.h cbd.h verify.h symmetric.h randombytes.h
 15 | HEADERSKECCAK   = $(HEADERS) fips202.h fips202x4.h
 16 | 
 17 | .PHONY: all speed shared clean
 18 | # all, speed and shared only group other targets; none of them names a real file.
 19 | all: \
 20 |   test/test_kyber512 \
 21 |   test/test_kyber768 \
 22 |   test/test_kyber1024 \
 23 |   test/test_vectors512 \
 24 |   test/test_vectors768 \
 25 |   test/test_vectors1024 \
 26 |   speed
 27 | 
 28 | speed: \
 29 |   test/test_speed512 \
 30 |   test/test_speed768 \
 31 |   test/test_speed1024
 32 | 
 33 | shared: \
 34 |   libpqcrystals_kyber512_avx2.so \
 35 |   libpqcrystals_kyber768_avx2.so \
 36 |   libpqcrystals_kyber1024_avx2.so \
 37 |   libpqcrystals_fips202_ref.so \
 38 |   libpqcrystals_fips202x4_avx2.so
 39 | 
 40 | keccak4x/KeccakP-1600-times4-SIMD256.o: \
 41 |   keccak4x/KeccakP-1600-times4-SIMD256.c \
 42 |   keccak4x/KeccakP-1600-times4-SnP.h \
 43 |   keccak4x/KeccakP-1600-unrolling.macros \
 44 |   keccak4x/KeccakP-SIMD256-config.h \
 45 |   keccak4x/KeccakP-align.h \
 46 |   keccak4x/KeccakP-brg_endian.h
 47 | 	$(CC) $(CFLAGS) -c $< -o $@
 48 | 
 49 | libpqcrystals_fips202_ref.so: fips202.c fips202.h
 50 | 	$(CC) -shared -fPIC $(CFLAGS) -o $@ $<
 51 | 
 52 | libpqcrystals_fips202x4_avx2.so: fips202x4.c fips202x4.h \
 53 |   keccak4x/KeccakP-1600-times4-SIMD256.c \
 54 |   keccak4x/KeccakP-1600-times4-SnP.h \
 55 |   keccak4x/KeccakP-1600-unrolling.macros \
 56 |   keccak4x/KeccakP-SIMD256-config.h \
 57 |   keccak4x/KeccakP-align.h \
 58 |   keccak4x/KeccakP-brg_endian.h
 59 | 	$(CC) -shared -fPIC $(CFLAGS) -o $@ $< keccak4x/KeccakP-1600-times4-SIMD256.c
 60 | 
 61 | libpqcrystals_kyber512_avx2.so: $(SOURCES) $(HEADERS) symmetric-shake.c
 62 | 	$(CC) -shared -fpic $(CFLAGS) -DKYBER_K=2 $(SOURCES) \
 63 | 	  symmetric-shake.c -o libpqcrystals_kyber512_avx2.so
 64 | 
 65 | libpqcrystals_kyber768_avx2.so: $(SOURCES) $(HEADERS) symmetric-shake.c
 66 | 	$(CC) -shared -fpic $(CFLAGS) -DKYBER_K=3 $(SOURCES) \
 67 | 	  symmetric-shake.c -o libpqcrystals_kyber768_avx2.so
 68 | 
 69 | libpqcrystals_kyber1024_avx2.so: $(SOURCES) $(HEADERS) symmetric-shake.c
 70 | 	$(CC) -shared -fpic $(CFLAGS) -DKYBER_K=4 $(SOURCES) \
 71 | 	  symmetric-shake.c -o libpqcrystals_kyber1024_avx2.so
 72 | 
 73 | test/test_kyber512: $(SOURCESKECCAK) $(HEADERSKECCAK) test/test_kyber.c randombytes.c
 74 | 	$(CC) $(CFLAGS) -DKYBER_K=2 $(SOURCESKECCAK) randombytes.c test/test_kyber.c -o $@
 75 | 
 76 | test/test_kyber768: $(SOURCESKECCAK) $(HEADERSKECCAK) test/test_kyber.c randombytes.c
 77 | 	$(CC) $(CFLAGS) -DKYBER_K=3 $(SOURCESKECCAK) randombytes.c test/test_kyber.c -o $@
 78 | 
 79 | test/test_kyber1024: $(SOURCESKECCAK) $(HEADERSKECCAK) test/test_kyber.c randombytes.c
 80 | 	$(CC) $(CFLAGS) -DKYBER_K=4 $(SOURCESKECCAK) randombytes.c test/test_kyber.c -o $@
 81 | 
 82 | test/test_vectors512: $(SOURCESKECCAK) $(HEADERSKECCAK) test/test_vectors.c
 83 | 	$(CC) $(CFLAGS) -DKYBER_K=2 $(SOURCESKECCAK) test/test_vectors.c -o $@
 84 | 
 85 | test/test_vectors768: $(SOURCESKECCAK) $(HEADERSKECCAK) test/test_vectors.c
 86 | 	$(CC) $(CFLAGS) -DKYBER_K=3 $(SOURCESKECCAK) test/test_vectors.c -o $@
 87 | 
 88 | test/test_vectors1024: $(SOURCESKECCAK) $(HEADERSKECCAK) test/test_vectors.c
 89 | 	$(CC) $(CFLAGS) -DKYBER_K=4 $(SOURCESKECCAK) test/test_vectors.c -o $@
 90 | 
 91 | test/test_speed512: $(SOURCESKECCAK) $(HEADERSKECCAK) test/cpucycles.h test/cpucycles.c test/speed_print.h test/speed_print.c test/test_speed.c randombytes.c
 92 | 	$(CC) $(CFLAGS) -DKYBER_K=2 $(SOURCESKECCAK) randombytes.c test/cpucycles.c test/speed_print.c test/test_speed.c -o $@
 93 | 
 94 | test/test_speed768: $(SOURCESKECCAK) $(HEADERSKECCAK) test/cpucycles.h test/cpucycles.c test/speed_print.h test/speed_print.c test/test_speed.c randombytes.c
 95 | 	$(CC) $(CFLAGS) -DKYBER_K=3 $(SOURCESKECCAK) randombytes.c test/cpucycles.c test/speed_print.c test/test_speed.c -o $@
 96 | 
 97 | test/test_speed1024: $(SOURCESKECCAK) $(HEADERSKECCAK) test/cpucycles.h test/cpucycles.c test/speed_print.h test/speed_print.c test/test_speed.c randombytes.c
 98 | 	$(CC) $(CFLAGS) -DKYBER_K=4 $(SOURCESKECCAK) randombytes.c test/cpucycles.c test/speed_print.c test/test_speed.c -o $@
 99 | 
100 | 
101 | clean:
102 | 	-$(RM) -rf *.o *.a *.so
103 | 	-$(RM) -rf test/test_kyber512
104 | 	-$(RM) -rf test/test_kyber768
105 | 	-$(RM) -rf test/test_kyber1024
106 | 	-$(RM) -rf test/test_vectors512
107 | 	-$(RM) -rf test/test_vectors768
108 | 	-$(RM) -rf test/test_vectors1024
109 | 	-$(RM) -rf test/test_speed512
110 | 	-$(RM) -rf test/test_speed768
111 | 	-$(RM) -rf test/test_speed1024
112 | 	-$(RM) -rf keccak4x/KeccakP-1600-times4-SIMD256.o
113 | 


--------------------------------------------------------------------------------
/avx2/cbd.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | #include <immintrin.h>
  3 | #include "params.h"
  4 | #include "cbd.h"
  5 | 
  6 | /*************************************************
  7 | * Name:        cbd2
  8 | *
  9 | * Description: Given an array of uniformly random bytes, compute
 10 | *              polynomial with coefficients distributed according to
 11 | *              a centered binomial distribution with parameter eta=2.
 12 | *              Every nibble of input yields one coefficient in -2..2.
 13 | * Arguments:   - poly *r: pointer to output polynomial
 14 | *              - const __m256i *buf: pointer to aligned input byte array
 15 | **************************************************/
 16 | static void cbd2(poly * restrict r, const __m256i buf[2*KYBER_N/128])
 17 | {
 18 |   unsigned int i;
 19 |   __m256i f0, f1, f2, f3;
 20 |   const __m256i mask55 = _mm256_set1_epi32(0x55555555);
 21 |   const __m256i mask33 = _mm256_set1_epi32(0x33333333);
 22 |   const __m256i mask03 = _mm256_set1_epi32(0x03030303);
 23 |   const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F);
 24 | 
 25 |   for(i = 0; i < KYBER_N/64; i++) {
 26 |     f0 = _mm256_load_si256(&buf[i]);
 27 |     /* add adjacent bits: every 2-bit field now holds a count in 0..2 */
 28 |     f1 = _mm256_srli_epi16(f0, 1);
 29 |     f0 = _mm256_and_si256(mask55, f0);
 30 |     f1 = _mm256_and_si256(mask55, f1);
 31 |     f0 = _mm256_add_epi8(f0, f1);
 32 |     /* per nibble: (low pair count) + 3 - (high pair count), a value in 1..5 */
 33 |     f1 = _mm256_srli_epi16(f0, 2);
 34 |     f0 = _mm256_and_si256(mask33, f0);
 35 |     f1 = _mm256_and_si256(mask33, f1);
 36 |     f0 = _mm256_add_epi8(f0, mask33);
 37 |     f0 = _mm256_sub_epi8(f0, f1);
 38 |     /* move low and high nibbles into separate byte vectors, remove the +3 bias */
 39 |     f1 = _mm256_srli_epi16(f0, 4);
 40 |     f0 = _mm256_and_si256(mask0F, f0);
 41 |     f1 = _mm256_and_si256(mask0F, f1);
 42 |     f0 = _mm256_sub_epi8(f0, mask03);
 43 |     f1 = _mm256_sub_epi8(f1, mask03);
 44 |     /* interleave so that the two results of each input byte are adjacent */
 45 |     f2 = _mm256_unpacklo_epi8(f0, f1);
 46 |     f3 = _mm256_unpackhi_epi8(f0, f1);
 47 |     /* sign-extend every 128-bit half to sixteen 16-bit coefficients */
 48 |     f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2));
 49 |     f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2,1));
 50 |     f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3));
 51 |     f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3,1));
 52 |     /* f2 is stored before f1 because the unpacks above work per 128-bit lane */
 53 |     _mm256_store_si256(&r->vec[4*i+0], f0);
 54 |     _mm256_store_si256(&r->vec[4*i+1], f2);
 55 |     _mm256_store_si256(&r->vec[4*i+2], f1);
 56 |     _mm256_store_si256(&r->vec[4*i+3], f3);
 57 |   }
 58 | }
 59 | 
 60 | #if KYBER_ETA1 == 3
 61 | /*************************************************
 62 | * Name:        cbd3
 63 | *
 64 | * Description: Given an array of uniformly random bytes, compute
 65 | *              polynomial with coefficients distributed according to
 66 | *              a centered binomial distribution with parameter eta=3.
 67 | *              This function is only needed for Kyber-512.
 68 | * Arguments:   - poly *r: pointer to output polynomial
 69 | *              - const uint8_t *buf: pointer to input byte array; read with
 70 | *                unaligned 32-byte loads every 24 bytes, hence the +8 padding
 71 | **************************************************/
 72 | static void cbd3(poly * restrict r, const uint8_t buf[3*KYBER_N/4+8])
 73 | {
 74 |   unsigned int i;
 75 |   __m256i f0, f1, f2, f3;
 76 |   const __m256i mask249 = _mm256_set1_epi32(0x249249);
 77 |   const __m256i mask6DB = _mm256_set1_epi32(0x6DB6DB);
 78 |   const __m256i mask07 = _mm256_set1_epi32(7);
 79 |   const __m256i mask70 = _mm256_set1_epi32(7 << 16);
 80 |   const __m256i mask3 = _mm256_set1_epi16(3);
 81 |   const __m256i shufbidx = _mm256_set_epi8(-1,15,14,13,-1,12,11,10,-1, 9, 8, 7,-1, 6, 5, 4,
 82 |                                            -1,11,10, 9,-1, 8, 7, 6,-1, 5, 4, 3,-1, 2, 1, 0);
 83 | 
 84 |   for(i = 0; i < KYBER_N/32; i++) {
 85 |     f0 = _mm256_loadu_si256((const __m256i *)&buf[24*i]); /* keep buf's const qualifier */
 86 |     f0 = _mm256_permute4x64_epi64(f0,0x94);
 87 |     f0 = _mm256_shuffle_epi8(f0,shufbidx);
 88 |     /* each 32-bit lane now holds 24 input bits; count the set bits of every 3-bit group */
 89 |     f1 = _mm256_srli_epi32(f0,1);
 90 |     f2 = _mm256_srli_epi32(f0,2);
 91 |     f0 = _mm256_and_si256(mask249,f0);
 92 |     f1 = _mm256_and_si256(mask249,f1);
 93 |     f2 = _mm256_and_si256(mask249,f2);
 94 |     f0 = _mm256_add_epi32(f0,f1);
 95 |     f0 = _mm256_add_epi32(f0,f2);
 96 |     /* per 6 input bits: (low group count) + 3 - (high group count) */
 97 |     f1 = _mm256_srli_epi32(f0,3);
 98 |     f0 = _mm256_add_epi32(f0,mask6DB);
 99 |     f0 = _mm256_sub_epi32(f0,f1);
100 |     /* move the four 3-bit results of each lane into 16-bit slots, then remove the +3 bias */
101 |     f1 = _mm256_slli_epi32(f0,10);
102 |     f2 = _mm256_srli_epi32(f0,12);
103 |     f3 = _mm256_srli_epi32(f0, 2);
104 |     f0 = _mm256_and_si256(f0,mask07);
105 |     f1 = _mm256_and_si256(f1,mask70);
106 |     f2 = _mm256_and_si256(f2,mask07);
107 |     f3 = _mm256_and_si256(f3,mask70);
108 |     f0 = _mm256_add_epi16(f0,f1);
109 |     f1 = _mm256_add_epi16(f2,f3);
110 |     f0 = _mm256_sub_epi16(f0,mask3);
111 |     f1 = _mm256_sub_epi16(f1,mask3);
112 |     /* interleave the two result vectors back into coefficient order */
113 |     f2 = _mm256_unpacklo_epi32(f0,f1);
114 |     f3 = _mm256_unpackhi_epi32(f0,f1);
115 | 
116 |     f0 = _mm256_permute2x128_si256(f2,f3,0x20);
117 |     f1 = _mm256_permute2x128_si256(f2,f3,0x31);
118 | 
119 |     _mm256_store_si256(&r->vec[2*i+0], f0);
120 |     _mm256_store_si256(&r->vec[2*i+1], f1);
121 |   }
122 | }
123 | #endif
124 | 
125 | /* Sample r with parameter eta1; buf has one extra vector (32 bytes) because cbd3's last load reads past its input */
126 | void poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1*KYBER_N/128+1])
127 | {
128 | #if KYBER_ETA1 == 2
129 |   cbd2(r, buf);
130 | #elif KYBER_ETA1 == 3
131 |   cbd3(r, (const uint8_t *)buf);
132 | #else
133 | #error "This implementation requires eta1 in {2,3}"
134 | #endif
135 | }
136 | 
137 | /* Sample r with parameter eta2; only eta2 = 2 is implemented, any other value stops the build. */
138 | void poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2*KYBER_N/128])
139 | {
140 | #if KYBER_ETA2 != 2
141 | #error "This implementation requires eta2 = 2"
142 | #endif
143 |   cbd2(r, buf);
144 | }
145 | 


--------------------------------------------------------------------------------
/ref/ntt.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | #include "params.h"
  3 | #include "ntt.h"
  4 | #include "reduce.h"
  5 | 
  6 | /* Code to generate the zetas table used in the number-theoretic transform:
  7 | 
  8 | #define KYBER_ROOT_OF_UNITY 17
  9 | 
 10 | static const uint8_t tree[128] = {
 11 |   0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120,
 12 |   4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124,
 13 |   2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122,
 14 |   6, 70, 38, 102, 22, 86, 54, 118, 14, 78, 46, 110, 30, 94, 62, 126,
 15 |   1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121,
 16 |   5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125,
 17 |   3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123,
 18 |   7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127
 19 | };
 20 | 
 21 | void init_ntt() {
 22 |   unsigned int i;
 23 |   int16_t tmp[128];
 24 | 
 25 |   tmp[0] = MONT;
 26 |   for(i=1;i<128;i++)
 27 |     tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q);
 28 | 
 29 |   for(i=0;i<128;i++) {
 30 |     zetas[i] = tmp[tree[i]];
 31 |     if(zetas[i] > KYBER_Q/2)
 32 |       zetas[i] -= KYBER_Q;
 33 |     if(zetas[i] < -KYBER_Q/2)
 34 |       zetas[i] += KYBER_Q;
 35 |   }
 36 | }
 37 | */
 38 | 
 39 | const int16_t zetas[128] = { /* twiddle factors read by ntt() and invntt(); see the generator sketch earlier in this file */
 40 |   -1044,  -758,  -359, -1517,  1493,  1422,   287,   202,
 41 |    -171,   622,  1577,   182,   962, -1202, -1474,  1468,
 42 |     573, -1325,   264,   383,  -829,  1458, -1602,  -130,
 43 |    -681,  1017,   732,   608, -1542,   411,  -205, -1571,
 44 |    1223,   652,  -552,  1015, -1293,  1491,  -282, -1544,
 45 |     516,    -8,  -320,  -666, -1618, -1162,   126,  1469,
 46 |    -853,   -90,  -271,   830,   107, -1421,  -247,  -951,
 47 |    -398,   961, -1508,  -725,   448, -1065,   677, -1275,
 48 |   -1103,   430,   555,   843, -1251,   871,  1550,   105,
 49 |     422,   587,   177,  -235,  -291,  -460,  1574,  1653,
 50 |    -246,   778,  1159,  -147,  -777,  1483,  -602,  1119,
 51 |   -1590,   644,  -872,   349,   418,   329,  -156,   -75,
 52 |     817,  1097,   603,   610,  1322, -1285, -1465,   384,
 53 |   -1215,  -136,  1218, -1335,  -874,   220, -1187, -1659,
 54 |   -1185, -1530, -1278,   794, -1510,  -854,  -870,   478,
 55 |    -108,  -308,   996,   991,   958, -1460,  1522,  1628
 56 | };
 57 | 
 58 | /*************************************************
 59 | * Name:        fqmul
 60 | *
 61 | * Description: Forms the full 32-bit product of two 16-bit values and
 62 | *              passes it through montgomery_reduce
 63 | * Arguments:   - int16_t a: first factor
 64 | *              - int16_t b: second factor
 65 | * Returns 16-bit integer congruent to a*b*R^{-1} mod q
 66 | **************************************************/
 67 | static int16_t fqmul(int16_t a, int16_t b) {
 68 |   const int32_t wide = (int32_t)a * b;
 69 |   return montgomery_reduce(wide);
 70 | }
 71 | 
 72 | /*************************************************
 73 | * Name:        ntt
 74 | *
 75 | * Description: Forward number-theoretic transform (NTT) in Rq, computed in place.
 76 | *              Takes coefficients in standard order, leaves them in bitreversed order
 77 | *
 78 | * Arguments:   - int16_t r[256]: pointer to input/output vector of elements of Zq
 79 | **************************************************/
 80 | void ntt(int16_t r[256]) {
 81 |   unsigned int half, base, lo, zidx = 1;
 82 |   int16_t twiddle, prod;
 83 | 
 84 |   for(half = 128; half >= 2; half /= 2) {       /* butterfly distance per layer */
 85 |     for(base = 0; base < 256; base += 2*half) { /* one twiddle per block of 2*half */
 86 |       twiddle = zetas[zidx];
 87 |       zidx += 1;
 88 |       for(lo = base; lo < base + half; lo++) {
 89 |         prod = fqmul(twiddle, r[lo + half]);
 90 |         r[lo + half] = r[lo] - prod;
 91 |         r[lo] += prod;
 92 |       }
 93 |     }
 94 |   }
 95 | }
 96 | 
 97 | /*************************************************
 98 | * Name:        invntt
 99 | *
100 | * Description: Inplace inverse number-theoretic transform in Rq and
101 | *              multiplication by Montgomery factor 2^16.
102 | *              Input is in bitreversed order, output is in standard order
103 | *
104 | * Arguments:   - int16_t r[256]: pointer to input/output vector of elements of Zq
105 | **************************************************/
106 | void invntt(int16_t r[256]) {
107 |   unsigned int start, len, j, k;
108 |   int16_t t, zeta;
109 |   const int16_t f = 1441; // mont^2/128
110 |   /* butterfly distance grows from 2 to 128; zetas[] is read backwards from index 127 */
111 |   k = 127;
112 |   for(len = 2; len <= 128; len <<= 1) {
113 |     for(start = 0; start < 256; start = j + len) {
114 |       zeta = zetas[k--];
115 |       for(j = start; j < start + len; j++) {
116 |         t = r[j];
117 |         r[j] = barrett_reduce(t + r[j + len]);
118 |         r[j + len] = r[j + len] - t;
119 |         r[j + len] = fqmul(zeta, r[j + len]);
120 |       }
121 |     }
122 |   }
123 |   /* final pass: multiply every coefficient by f through fqmul */
124 |   for(j = 0; j < 256; j++)
125 |     r[j] = fqmul(r[j], f);
126 | }
127 | 
128 | /*************************************************
129 | * Name:        basemul
130 | *
131 | * Description: Product of two degree-1 polynomials in Zq[X]/(X^2-zeta),
132 | *              used for multiplication of elements in Rq in NTT domain
133 | * Arguments:   - int16_t r[2]: pointer to the output polynomial
134 | *              - const int16_t a[2]: pointer to the first factor
135 | *              - const int16_t b[2]: pointer to the second factor
136 | *              - int16_t zeta: integer defining the reduction polynomial
137 | **************************************************/
138 | void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta)
139 | {
140 |   /* r[0]: a1*b1*zeta + a0*b0, every product going through fqmul */
141 |   r[0] = fqmul(fqmul(a[1], b[1]), zeta);
142 |   r[0] = (int16_t)(r[0] + fqmul(a[0], b[0]));
143 |   /* r[1]: a0*b1 + a1*b0 */
144 |   r[1] = fqmul(a[0], b[1]);
145 |   r[1] = (int16_t)(r[1] + fqmul(a[1], b[0]));
146 | }
147 | 


--------------------------------------------------------------------------------
/avx2/shuffle.S:
--------------------------------------------------------------------------------
  1 | #include "consts.h"
  2 | .include "fq.inc"
  3 | .include "shuffle.inc"
  4 | 
  5 | /*
  6 | nttpack_avx:
  7 | #load
  8 | vmovdqa		(%rdi),%ymm4
  9 | vmovdqa		32(%rdi),%ymm5
 10 | vmovdqa		64(%rdi),%ymm6
 11 | vmovdqa		96(%rdi),%ymm7
 12 | vmovdqa		128(%rdi),%ymm8
 13 | vmovdqa		160(%rdi),%ymm9
 14 | vmovdqa		192(%rdi),%ymm10
 15 | vmovdqa		224(%rdi),%ymm11
 16 | 
 17 | shuffle1	4,5,3,5
 18 | shuffle1	6,7,4,7
 19 | shuffle1	8,9,6,9
 20 | shuffle1	10,11,8,11
 21 | 
 22 | shuffle2	3,4,10,4
 23 | shuffle2	6,8,3,8
 24 | shuffle2	5,7,6,7
 25 | shuffle2	9,11,5,11
 26 | 
 27 | shuffle4	10,3,9,3
 28 | shuffle4	6,5,10,5
 29 | shuffle4	4,8,6,8
 30 | shuffle4	7,11,4,11
 31 | 
 32 | shuffle8	9,10,7,10
 33 | shuffle8	6,4,9,4
 34 | shuffle8	3,5,6,5
 35 | shuffle8	8,11,3,11
 36 | 
 37 | #store
 38 | vmovdqa		%ymm7,(%rdi)
 39 | vmovdqa		%ymm9,32(%rdi)
 40 | vmovdqa		%ymm6,64(%rdi)
 41 | vmovdqa		%ymm3,96(%rdi)
 42 | vmovdqa		%ymm10,128(%rdi)
 43 | vmovdqa		%ymm4,160(%rdi)
 44 | vmovdqa		%ymm5,192(%rdi)
 45 | vmovdqa		%ymm11,224(%rdi)
 46 | 
 47 | ret
 48 | */
 49 | /* nttunpack128_avx: in-place reordering of the eight aligned 32-byte vectors (256 bytes) at %rdi, using the shuffle macros from shuffle.inc */
 50 | .text
 51 | nttunpack128_avx:
 52 | #load
 53 | vmovdqa		(%rdi),%ymm4
 54 | vmovdqa		32(%rdi),%ymm5
 55 | vmovdqa		64(%rdi),%ymm6
 56 | vmovdqa		96(%rdi),%ymm7
 57 | vmovdqa		128(%rdi),%ymm8
 58 | vmovdqa		160(%rdi),%ymm9
 59 | vmovdqa		192(%rdi),%ymm10
 60 | vmovdqa		224(%rdi),%ymm11
 61 | /* four rounds (shuffle8, shuffle4, shuffle2, shuffle1), each applied to four register pairs */
 62 | shuffle8	4,8,3,8
 63 | shuffle8	5,9,4,9
 64 | shuffle8	6,10,5,10
 65 | shuffle8	7,11,6,11
 66 | 
 67 | shuffle4	3,5,7,5
 68 | shuffle4	8,10,3,10
 69 | shuffle4	4,6,8,6
 70 | shuffle4	9,11,4,11
 71 | 
 72 | shuffle2	7,8,9,8
 73 | shuffle2	5,6,7,6
 74 | shuffle2	3,4,5,4
 75 | shuffle2	10,11,3,11
 76 | 
 77 | shuffle1	9,5,10,5
 78 | shuffle1	8,4,9,4
 79 | shuffle1	7,3,8,3
 80 | shuffle1	6,11,7,11
 81 | /* the results sit in ymm3..ymm11 in permuted register order; write them back over the input */
 82 | #store
 83 | vmovdqa		%ymm10,(%rdi)
 84 | vmovdqa		%ymm5,32(%rdi)
 85 | vmovdqa		%ymm9,64(%rdi)
 86 | vmovdqa		%ymm4,96(%rdi)
 87 | vmovdqa		%ymm8,128(%rdi)
 88 | vmovdqa		%ymm3,160(%rdi)
 89 | vmovdqa		%ymm7,192(%rdi)
 90 | vmovdqa		%ymm11,224(%rdi)
 91 | 
 92 | ret
 93 | /* nttunpack_avx: exported entry point; runs nttunpack128_avx on the 256 bytes at %rdi and then on the 256 bytes that follow */
 94 | .global cdecl(nttunpack_avx)
 95 | cdecl(nttunpack_avx):
 96 | call		nttunpack128_avx
 97 | add		$256,%rdi
 98 | call		nttunpack128_avx
 99 | ret
100 | /* ntttobytes128_avx: packs the eight aligned vectors (256 bytes) at %rsi into six vectors (192 bytes) stored unaligned at %rdi */
101 | ntttobytes128_avx:
102 | #load
103 | vmovdqa		(%rsi),%ymm5
104 | vmovdqa		32(%rsi),%ymm6
105 | vmovdqa		64(%rsi),%ymm7
106 | vmovdqa		96(%rsi),%ymm8
107 | vmovdqa		128(%rsi),%ymm9
108 | vmovdqa		160(%rsi),%ymm10
109 | vmovdqa		192(%rsi),%ymm11
110 | vmovdqa		224(%rsi),%ymm12
111 | /* csubq is a macro from fq.inc; presumably a conditional subtraction of q that relies on ymm0 set by the caller -- TODO confirm */
112 | #csubq
113 | csubq		5,13
114 | csubq		6,13
115 | csubq		7,13
116 | csubq		8,13
117 | csubq		9,13
118 | csubq		10,13
119 | csubq		11,13
120 | csubq		12,13
121 | /* four 16-bit words w0..w3 (one from each of four vectors) become three: w0|w1<<12, w1>>4|w2<<8, w2>>8|w3<<4 */
122 | #bitpack
123 | vpsllw		$12,%ymm6,%ymm4
124 | vpor		%ymm4,%ymm5,%ymm4
125 | 
126 | vpsrlw		$4,%ymm6,%ymm5
127 | vpsllw		$8,%ymm7,%ymm6
128 | vpor		%ymm5,%ymm6,%ymm5
129 | 
130 | vpsrlw		$8,%ymm7,%ymm6
131 | vpsllw		$4,%ymm8,%ymm7
132 | vpor		%ymm6,%ymm7,%ymm6
133 | 
134 | vpsllw		$12,%ymm10,%ymm7
135 | vpor		%ymm7,%ymm9,%ymm7
136 | 
137 | vpsrlw		$4,%ymm10,%ymm8
138 | vpsllw		$8,%ymm11,%ymm9
139 | vpor		%ymm8,%ymm9,%ymm8
140 | 
141 | vpsrlw		$8,%ymm11,%ymm9
142 | vpsllw		$4,%ymm12,%ymm10
143 | vpor		%ymm9,%ymm10,%ymm9
144 | /* reorder the six packed vectors with the shuffle macros from shuffle.inc before storing */
145 | shuffle1	4,5,3,5
146 | shuffle1	6,7,4,7
147 | shuffle1	8,9,6,9
148 | 
149 | shuffle2	3,4,8,4
150 | shuffle2	6,5,3,5
151 | shuffle2	7,9,6,9
152 | 
153 | shuffle4	8,3,7,3
154 | shuffle4	6,4,8,4
155 | shuffle4	5,9,6,9
156 | 
157 | shuffle8	7,8,5,8
158 | shuffle8	6,3,7,3
159 | shuffle8	4,9,6,9
160 | 
161 | #store
162 | vmovdqu		%ymm5,(%rdi)
163 | vmovdqu		%ymm7,32(%rdi)
164 | vmovdqu		%ymm6,64(%rdi)
165 | vmovdqu		%ymm8,96(%rdi)
166 | vmovdqu		%ymm3,128(%rdi)
167 | vmovdqu		%ymm9,160(%rdi)
168 | 
169 | ret
170 | /* ntttobytes_avx: exported entry point; %rdi = output bytes, %rsi = input words, %rdx = constant table. Converts 2 x 256 input bytes into 2 x 192 output bytes */
171 | .global cdecl(ntttobytes_avx)
172 | cdecl(ntttobytes_avx):
173 | #consts
174 | vmovdqa		_16XQ*2(%rdx),%ymm0
175 | call		ntttobytes128_avx
176 | add		$256,%rsi
177 | add		$192,%rdi
178 | call		ntttobytes128_avx
179 | ret
180 | /* nttfrombytes128_avx: expands the six unaligned vectors (192 bytes) at %rsi into eight aligned vectors (256 bytes) at %rdi */
181 | nttfrombytes128_avx:
182 | #load
183 | vmovdqu		(%rsi),%ymm4
184 | vmovdqu		32(%rsi),%ymm5
185 | vmovdqu		64(%rsi),%ymm6
186 | vmovdqu		96(%rsi),%ymm7
187 | vmovdqu		128(%rsi),%ymm8
188 | vmovdqu		160(%rsi),%ymm9
189 | /* reorder the six input vectors with the shuffle macros from shuffle.inc */
190 | shuffle8	4,7,3,7
191 | shuffle8	5,8,4,8
192 | shuffle8	6,9,5,9
193 | 
194 | shuffle4	3,8,6,8
195 | shuffle4	7,5,3,5
196 | shuffle4	4,9,7,9
197 | 
198 | shuffle2	6,5,4,5
199 | shuffle2	8,7,6,7
200 | shuffle2	3,9,8,9
201 | 
202 | shuffle1	4,7,10,7
203 | shuffle1	5,8,4,8
204 | shuffle1	6,9,5,9
205 | /* rebuild four words from every three packed ones; each result is masked with ymm0 (loaded from _16XMASK by the exported wrapper) */
206 | #bitunpack
207 | vpsrlw		$12,%ymm10,%ymm11
208 | vpsllw		$4,%ymm7,%ymm12
209 | vpor		%ymm11,%ymm12,%ymm11
210 | vpand		%ymm0,%ymm10,%ymm10
211 | vpand		%ymm0,%ymm11,%ymm11
212 | 
213 | vpsrlw		$8,%ymm7,%ymm12
214 | vpsllw		$8,%ymm4,%ymm13
215 | vpor		%ymm12,%ymm13,%ymm12
216 | vpand		%ymm0,%ymm12,%ymm12
217 | 
218 | vpsrlw		$4,%ymm4,%ymm13
219 | vpand		%ymm0,%ymm13,%ymm13
220 | 
221 | vpsrlw		$12,%ymm8,%ymm14
222 | vpsllw		$4,%ymm5,%ymm15
223 | vpor		%ymm14,%ymm15,%ymm14
224 | vpand		%ymm0,%ymm8,%ymm8
225 | vpand		%ymm0,%ymm14,%ymm14
226 | 
227 | vpsrlw		$8,%ymm5,%ymm15
228 | vpsllw		$8,%ymm9,%ymm1
229 | vpor		%ymm15,%ymm1,%ymm15
230 | vpand		%ymm0,%ymm15,%ymm15
231 | 
232 | vpsrlw		$4,%ymm9,%ymm1
233 | vpand		%ymm0,%ymm1,%ymm1
234 | 
235 | #store
236 | vmovdqa		%ymm10,(%rdi)
237 | vmovdqa		%ymm11,32(%rdi)
238 | vmovdqa		%ymm12,64(%rdi)
239 | vmovdqa		%ymm13,96(%rdi)
240 | vmovdqa		%ymm8,128(%rdi)
241 | vmovdqa		%ymm14,160(%rdi)
242 | vmovdqa		%ymm15,192(%rdi)
243 | vmovdqa		%ymm1,224(%rdi)
244 | 
245 | ret
246 | /* nttfrombytes_avx: exported entry point; %rdi = output words, %rsi = input bytes, %rdx = constant table. Converts 2 x 192 input bytes into 2 x 256 output bytes */
247 | .global cdecl(nttfrombytes_avx)
248 | cdecl(nttfrombytes_avx):
249 | #consts
250 | vmovdqa		_16XMASK*2(%rdx),%ymm0
251 | call		nttfrombytes128_avx
252 | add		$256,%rdi
253 | add		$192,%rsi
254 | call		nttfrombytes128_avx
255 | ret
256 | 


--------------------------------------------------------------------------------
/avx2/invntt.S:
--------------------------------------------------------------------------------
  1 | #include "consts.h"
  2 | .include "shuffle.inc"
  3 | .include "fq.inc"
  4 | /* butterfly: for four register pairs (rl, rh): rl <- rl + rh, and rh <- (rh - rl) multiplied by a twiddle given as a low half (zl0/zl1) and a high half (zh0/zh1). Uses ymm12-ymm15 as temporaries and reads ymm0. */
  5 | .macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3
  6 | vpsubw		%ymm\rl0,%ymm\rh0,%ymm12
  7 | vpaddw		%ymm\rh0,%ymm\rl0,%ymm\rl0
  8 | vpsubw		%ymm\rl1,%ymm\rh1,%ymm13
  9 | 
 10 | vpmullw		%ymm\zl0,%ymm12,%ymm\rh0
 11 | vpaddw		%ymm\rh1,%ymm\rl1,%ymm\rl1
 12 | vpsubw		%ymm\rl2,%ymm\rh2,%ymm14
 13 | 
 14 | vpmullw		%ymm\zl0,%ymm13,%ymm\rh1
 15 | vpaddw		%ymm\rh2,%ymm\rl2,%ymm\rl2
 16 | vpsubw		%ymm\rl3,%ymm\rh3,%ymm15
 17 | 
 18 | vpmullw		%ymm\zl1,%ymm14,%ymm\rh2
 19 | vpaddw		%ymm\rh3,%ymm\rl3,%ymm\rl3
 20 | vpmullw		%ymm\zl1,%ymm15,%ymm\rh3
 21 | /* high 16 bits of (difference * zh), kept in the temporaries */
 22 | vpmulhw		%ymm\zh0,%ymm12,%ymm12
 23 | vpmulhw		%ymm\zh0,%ymm13,%ymm13
 24 | 
 25 | vpmulhw		%ymm\zh1,%ymm14,%ymm14
 26 | vpmulhw		%ymm\zh1,%ymm15,%ymm15
 27 | /* high 16 bits of (low product * ymm0) */
 28 | vpmulhw		%ymm0,%ymm\rh0,%ymm\rh0
 29 | 
 30 | vpmulhw		%ymm0,%ymm\rh1,%ymm\rh1
 31 | 
 32 | vpmulhw		%ymm0,%ymm\rh2,%ymm\rh2
 33 | vpmulhw		%ymm0,%ymm\rh3,%ymm\rh3
 34 | 
 35 | #
 36 | 
 37 | #
 38 | /* rh <- temporary - rh: the difference of the two high halves is the result */
 39 | vpsubw		%ymm\rh0,%ymm12,%ymm\rh0
 40 | 
 41 | vpsubw		%ymm\rh1,%ymm13,%ymm\rh1
 42 | 
 43 | vpsubw		%ymm\rh2,%ymm14,%ymm\rh2
 44 | vpsubw		%ymm\rh3,%ymm15,%ymm\rh3
 45 | .endm
 46 | /* intt_levels0t5 off: levels 0-5 on the 128 words (256 bytes) at word offset 128*off of the array at %rdi; %rsi points to the constant table. ymm0 must already be loaded by the caller. */
 47 | .macro intt_levels0t5 off
 48 | /* level 0 */
 49 | vmovdqa		_16XFLO*2(%rsi),%ymm2
 50 | vmovdqa		_16XFHI*2(%rsi),%ymm3
 51 | /* every input vector first goes through fqmulprecomp (macro from fq.inc) with the _16XFLO/_16XFHI constants in ymm2/ymm3 */
 52 | vmovdqa         (128*\off+  0)*2(%rdi),%ymm4
 53 | vmovdqa         (128*\off+ 32)*2(%rdi),%ymm6
 54 | vmovdqa         (128*\off+ 16)*2(%rdi),%ymm5
 55 | vmovdqa         (128*\off+ 48)*2(%rdi),%ymm7
 56 | 
 57 | fqmulprecomp	2,3,4
 58 | fqmulprecomp	2,3,6
 59 | fqmulprecomp	2,3,5
 60 | fqmulprecomp	2,3,7
 61 | 
 62 | vmovdqa         (128*\off+ 64)*2(%rdi),%ymm8
 63 | vmovdqa         (128*\off+ 96)*2(%rdi),%ymm10
 64 | vmovdqa         (128*\off+ 80)*2(%rdi),%ymm9
 65 | vmovdqa         (128*\off+112)*2(%rdi),%ymm11
 66 | 
 67 | fqmulprecomp	2,3,8
 68 | fqmulprecomp	2,3,10
 69 | fqmulprecomp	2,3,9
 70 | fqmulprecomp	2,3,11
 71 | /* twiddles come from the _ZETAS_EXP block selected by (1-off); here they are reordered with vpermq and a vpshufb through the _REVIDXB index vector */
 72 | vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15
 73 | vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1
 74 | vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2
 75 | vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3
 76 | vmovdqa		_REVIDXB*2(%rsi),%ymm12
 77 | vpshufb		%ymm12,%ymm15,%ymm15
 78 | vpshufb		%ymm12,%ymm1,%ymm1
 79 | vpshufb		%ymm12,%ymm2,%ymm2
 80 | vpshufb		%ymm12,%ymm3,%ymm3
 81 | 
 82 | butterfly	4,5,8,9,6,7,10,11,15,1,2,3
 83 | 
 84 | /* level 1 */
 85 | vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2
 86 | vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3
 87 | vmovdqa		_REVIDXB*2(%rsi),%ymm1
 88 | vpshufb		%ymm1,%ymm2,%ymm2
 89 | vpshufb		%ymm1,%ymm3,%ymm3
 90 | 
 91 | butterfly	4,5,6,7,8,9,10,11,2,2,3,3
 92 | 
 93 | shuffle1	4,5,3,5
 94 | shuffle1	6,7,4,7
 95 | shuffle1	8,9,6,9
 96 | shuffle1	10,11,8,11
 97 | 
 98 | /* level 2 */
 99 | vmovdqa		_REVIDXD*2(%rsi),%ymm12
100 | vpermd		(_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2
101 | vpermd		(_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10
102 | 
103 | butterfly	3,4,6,8,5,7,9,11,2,2,10,10
104 | /* red16 is a macro from fq.inc; ymm1 is loaded with _16XV just before it, presumably as its multiplier -- TODO confirm */
105 | vmovdqa		_16XV*2(%rsi),%ymm1
106 | red16		3
107 | 
108 | shuffle2	3,4,10,4
109 | shuffle2	6,8,3,8
110 | shuffle2	5,7,6,7
111 | shuffle2	9,11,5,11
112 | 
113 | /* level 3 */
114 | vpermq		$0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2
115 | vpermq		$0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9
116 | 
117 | butterfly	10,3,6,5,4,8,7,11,2,2,9,9
118 | 
119 | shuffle4	10,3,9,3
120 | shuffle4	6,5,10,5
121 | shuffle4	4,8,6,8
122 | shuffle4	7,11,4,11
123 | 
124 | /* level 4 */
125 | vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2
126 | vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7
127 | 
128 | butterfly	9,10,6,4,3,5,8,11,2,2,7,7
129 | 
130 | red16		9
131 | 
132 | shuffle8	9,10,7,10
133 | shuffle8	6,4,9,4
134 | shuffle8	3,5,6,5
135 | shuffle8	8,11,3,11
136 | 
137 | /* level 5 */
138 | vmovdqa		(_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2
139 | vmovdqa		(_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8
140 | 
141 | butterfly	7,9,6,3,10,4,5,11,2,2,8,8
142 | /* write the eight result vectors back over the same 128 words */
143 | vmovdqa         %ymm7,(128*\off+  0)*2(%rdi)
144 | vmovdqa         %ymm9,(128*\off+ 16)*2(%rdi)
145 | vmovdqa         %ymm6,(128*\off+ 32)*2(%rdi)
146 | vmovdqa         %ymm3,(128*\off+ 48)*2(%rdi)
147 | vmovdqa         %ymm10,(128*\off+ 64)*2(%rdi)
148 | vmovdqa         %ymm4,(128*\off+ 80)*2(%rdi)
149 | vmovdqa         %ymm5,(128*\off+ 96)*2(%rdi)
150 | vmovdqa         %ymm11,(128*\off+112)*2(%rdi)
151 | .endm
152 | /* intt_level6 off: last level; butterflies between the four vectors at word offset 64*off and the four vectors 128 words further on, with both twiddle halves broadcast from the start of _ZETAS_EXP */
153 | .macro intt_level6 off
154 | /* level 6 */
155 | vmovdqa         (64*\off+  0)*2(%rdi),%ymm4
156 | vmovdqa         (64*\off+128)*2(%rdi),%ymm8
157 | vmovdqa         (64*\off+ 16)*2(%rdi),%ymm5
158 | vmovdqa         (64*\off+144)*2(%rdi),%ymm9
159 | vpbroadcastq	(_ZETAS_EXP+0)*2(%rsi),%ymm2
160 | 
161 | vmovdqa         (64*\off+ 32)*2(%rdi),%ymm6
162 | vmovdqa         (64*\off+160)*2(%rdi),%ymm10
163 | vmovdqa         (64*\off+ 48)*2(%rdi),%ymm7
164 | vmovdqa         (64*\off+176)*2(%rdi),%ymm11
165 | vpbroadcastq	(_ZETAS_EXP+4)*2(%rsi),%ymm3
166 | /* butterfly's default twiddle registers are ymm2 (low half) and ymm3 (high half) */
167 | butterfly	4,5,6,7,8,9,10,11
168 | /* only the off == 0 pass applies red16 (macro from fq.inc), and only to ymm4 */
169 | .if \off == 0
170 | red16		4
171 | .endif
172 | 
173 | vmovdqa		%ymm4,(64*\off+  0)*2(%rdi)
174 | vmovdqa		%ymm5,(64*\off+ 16)*2(%rdi)
175 | vmovdqa		%ymm6,(64*\off+ 32)*2(%rdi)
176 | vmovdqa		%ymm7,(64*\off+ 48)*2(%rdi)
177 | vmovdqa		%ymm8,(64*\off+128)*2(%rdi)
178 | vmovdqa		%ymm9,(64*\off+144)*2(%rdi)
179 | vmovdqa		%ymm10,(64*\off+160)*2(%rdi)
180 | vmovdqa		%ymm11,(64*\off+176)*2(%rdi)
181 | .endm
182 | /* invntt_avx: exported entry point; %rdi = array of 256 16-bit words, transformed in place; %rsi = constant table */
183 | .text
184 | .global cdecl(invntt_avx)
185 | cdecl(invntt_avx):
186 | vmovdqa         _16XQ*2(%rsi),%ymm0
187 | /* ymm0 keeps the _16XQ vector for all macros below; levels 0-5 run on each 128-word half, level 6 then combines the halves */
188 | intt_levels0t5	0
189 | intt_levels0t5	1
190 | 
191 | intt_level6	0
192 | intt_level6	1
193 | ret
194 | 


--------------------------------------------------------------------------------
/avx2/consts.c:
--------------------------------------------------------------------------------
  1 | #include "align.h"
  2 | #include "params.h"
  3 | #include "consts.h"
  4 | 
  5 | #define Q KYBER_Q // modulus q; KYBER_Q is presumably defined in params.h
  6 | #define MONT -1044 // 2^16 mod q, as a signed representative
  7 | #define QINV -3327 // q^-1 mod 2^16, as a signed 16-bit value
  8 | #define V 20159 // floor(2^26/q + 0.5)
  9 | #define FHI 1441 // mont^2/128
 10 | #define FLO -10079 // qinv*FHI mod 2^16, as a signed 16-bit value
 11 | #define MONTSQHI 1353 // mont^2 mod q
 12 | #define MONTSQLO 20553 // qinv*MONTSQHI mod 2^16
 13 | #define MASK 4095 // 2^12 - 1, keeps the low 12 bits of a word
 14 | #define SHIFT 32 // NOTE(review): not referenced in the visible part of this file
 15 | 
 16 | const qdata_t qdata = {{
 17 | #define _16XQ 0
 18 |   Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q,
 19 | 
 20 | #define _16XQINV 16
 21 |   QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,
 22 |   QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,
 23 | 
 24 | #define _16XV 32
 25 |   V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
 26 | 
 27 | #define _16XFLO 48
 28 |   FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,
 29 |   FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,
 30 | 
 31 | #define _16XFHI 64
 32 |   FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,
 33 |   FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,
 34 | 
 35 | #define _16XMONTSQLO 80
 36 |   MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
 37 |   MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
 38 |   MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
 39 |   MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
 40 | 
 41 | #define _16XMONTSQHI 96
 42 |   MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
 43 |   MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
 44 |   MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
 45 |   MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
 46 | 
 47 | #define _16XMASK 112
 48 |   MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,
 49 |   MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,
 50 | 
 51 | #define _REVIDXB 128
 52 |   3854, 3340, 2826, 2312, 1798, 1284, 770, 256,
 53 |   3854, 3340, 2826, 2312, 1798, 1284, 770, 256,
 54 | 
 55 | #define _REVIDXD 144
 56 |   7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0,
 57 | 
 58 | #define _ZETAS_EXP 160
 59 |    31498,  31498,  31498,  31498,   -758,   -758,   -758,   -758,
 60 |     5237,   5237,   5237,   5237,   1397,   1397,   1397,   1397,
 61 |    14745,  14745,  14745,  14745,  14745,  14745,  14745,  14745,
 62 |    14745,  14745,  14745,  14745,  14745,  14745,  14745,  14745,
 63 |     -359,   -359,   -359,   -359,   -359,   -359,   -359,   -359,
 64 |     -359,   -359,   -359,   -359,   -359,   -359,   -359,   -359,
 65 |    13525,  13525,  13525,  13525,  13525,  13525,  13525,  13525,
 66 |   -12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402,
 67 |     1493,   1493,   1493,   1493,   1493,   1493,   1493,   1493,
 68 |     1422,   1422,   1422,   1422,   1422,   1422,   1422,   1422,
 69 |   -20907, -20907, -20907, -20907,  27758,  27758,  27758,  27758,
 70 |    -3799,  -3799,  -3799,  -3799, -15690, -15690, -15690, -15690,
 71 |     -171,   -171,   -171,   -171,    622,    622,    622,    622,
 72 |     1577,   1577,   1577,   1577,    182,    182,    182,    182,
 73 |    -5827,  -5827,  17363,  17363, -26360, -26360, -29057, -29057,
 74 |     5571,   5571,  -1102,  -1102,  21438,  21438, -26242, -26242,
 75 |      573,    573,  -1325,  -1325,    264,    264,    383,    383,
 76 |     -829,   -829,   1458,   1458,  -1602,  -1602,   -130,   -130,
 77 |    -5689,  -6516,   1496,  30967, -23565,  20179,  20710,  25080,
 78 |   -12796,  26616,  16064, -12442,   9134,   -650, -25986,  27837,
 79 |     1223,    652,   -552,   1015,  -1293,   1491,   -282,  -1544,
 80 |      516,     -8,   -320,   -666,  -1618,  -1162,    126,   1469,
 81 |     -335, -11477, -32227,  20494, -27738,    945, -14883,   6182,
 82 |    32010,  10631,  29175, -28762, -18486,  17560, -14430,  -5276,
 83 |    -1103,    555,  -1251,   1550,    422,    177,   -291,   1574,
 84 |     -246,   1159,   -777,   -602,  -1590,   -872,    418,   -156,
 85 |    11182,  13387, -14233, -21655,  13131,  -4587,  23092,   5493,
 86 |   -32502,  30317, -18741,  12639,  20100,  18525,  19529, -12619,
 87 |      430,    843,    871,    105,    587,   -235,   -460,   1653,
 88 |      778,   -147,   1483,   1119,    644,    349,    329,    -75,
 89 |      787,    787,    787,    787,    787,    787,    787,    787,
 90 |      787,    787,    787,    787,    787,    787,    787,    787,
 91 |    -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,
 92 |    -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,  -1517,
 93 |    28191,  28191,  28191,  28191,  28191,  28191,  28191,  28191,
 94 |   -16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694,
 95 |      287,    287,    287,    287,    287,    287,    287,    287,
 96 |      202,    202,    202,    202,    202,    202,    202,    202,
 97 |    10690,  10690,  10690,  10690,   1358,   1358,   1358,   1358,
 98 |   -11202, -11202, -11202, -11202,  31164,  31164,  31164,  31164,
 99 |      962,    962,    962,    962,  -1202,  -1202,  -1202,  -1202,
100 |    -1474,  -1474,  -1474,  -1474,   1468,   1468,   1468,   1468,
101 |   -28073, -28073,  24313,  24313, -10532, -10532,   8800,   8800,
102 |    18426,  18426,   8859,   8859,  26675,  26675, -16163, -16163,
103 |     -681,   -681,   1017,   1017,    732,    732,    608,    608,
104 |    -1542,  -1542,    411,    411,   -205,   -205,  -1571,  -1571,
105 |    19883, -28250, -15887,  -8898, -28309,   9075, -30199,  18249,
106 |    13426,  14017, -29156, -12757,  16832,   4311, -24155, -17915,
107 |     -853,    -90,   -271,    830,    107,  -1421,   -247,   -951,
108 |     -398,    961,  -1508,   -725,    448,  -1065,    677,  -1275,
109 |   -31183,  25435,  -7382,  24391, -20927,  10946,  24214,  16989,
110 |    10335,  -7934, -22502,  10906,  31636,  28644,  23998, -17422,
111 |      817,    603,   1322,  -1465,  -1215,   1218,   -874,  -1187,
112 |    -1185,  -1278,  -1510,   -870,   -108,    996,    958,   1522,
113 |    20297,   2146,  15355, -32384,  -6280, -14903, -11044,  14469,
114 |   -21498, -20198,  23210, -17442, -23860, -20257,   7756,  23132,
115 |     1097,    610,  -1285,    384,   -136,  -1335,    220,  -1659,
116 |    -1530,    794,   -854,    478,   -308,    991,  -1460,   1628,
117 | 
118 | #define _16XSHIFT 624
119 |   SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT,
120 |   SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT
121 | }};
122 | 


--------------------------------------------------------------------------------
/avx2/keccak4x/KeccakP-brg_endian.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  ---------------------------------------------------------------------------
  3 |  Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
  4 | 
  5 |  LICENSE TERMS
  6 | 
  7 |  The redistribution and use of this software (with or without changes)
  8 |  is allowed without the payment of fees or royalties provided that:
  9 | 
 10 |   1. source code distributions include the above copyright notice, this
 11 |      list of conditions and the following disclaimer;
 12 | 
 13 |   2. binary distributions include the above copyright notice, this list
 14 |      of conditions and the following disclaimer in their documentation;
 15 | 
 16 |   3. the name of the copyright holder is not used to endorse products
 17 |      built using this software without specific written permission.
 18 | 
 19 |  DISCLAIMER
 20 | 
 21 |  This software is provided 'as is' with no explicit or implied warranties
 22 |  in respect of its properties, including, but not limited to, correctness
 23 |  and/or fitness for purpose.
 24 |  ---------------------------------------------------------------------------
 25 |  Issue Date: 20/12/2007
 26 |  Changes for ARM 9/9/2010
 27 | */
 28 | 
#ifndef _KECCAKP_BRG_ENDIAN_H
#define _KECCAKP_BRG_ENDIAN_H

/* Values that PLATFORM_BYTE_ORDER is set to by the detection logic below. */
#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */

/* The system-header probing in this section is compiled out by the "#if 0"; */
/* detection therefore relies only on macros that are already defined when  */
/* this header is read.                                                     */
#if 0
/* Include files where endian defines and byteswap functions may reside */
#if defined( __sun )
#  include <sys/isa_defs.h>
#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
#  include <sys/endian.h>
#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
#  include <machine/endian.h>
#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
#  if !defined( __MINGW32__ ) && !defined( _AIX )
#    include <endian.h>
#    if !defined( __BEOS__ )
#      include <byteswap.h>
#    endif
#  endif
#endif
#endif

/* Now attempt to set the define for platform byte order using any  */
/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which  */
/* seem to encompass most endian symbol definitions                 */
/* Each of the four groups below follows the same pattern: when     */
/* both the big- and little-endian symbols exist, the matching      */
/* BYTE_ORDER symbol is compared against them; when only one of the */
/* two exists, its mere presence selects the byte order.            */

/* Form 1: SYMBOL */
#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
#  if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#  elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#  endif
#elif defined( BIG_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( LITTLE_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif

/* Form 2: _SYMBOL */
#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
#  if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#  elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#  endif
#elif defined( _BIG_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( _LITTLE_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif

/* Form 3: __SYMBOL */
#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
#  if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#  elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#  endif
#elif defined( __BIG_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif

/* Form 4: __SYMBOL__ */
#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
#  if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#  elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#  endif
#elif defined( __BIG_ENDIAN__ )
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN__ )
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif

/*  if the platform byte order could not be determined, then try to */
/*  set this define using common machine defines                    */
#if !defined(PLATFORM_BYTE_ORDER)

/* Machine/compiler macros treated as little-endian targets */
#if   defined( __alpha__ ) || defined( __alpha ) || defined( i386 )       || \
      defined( __i386__ )  || defined( _M_I86 )  || defined( _M_IX86 )    || \
      defined( __OS2__ )   || defined( sun386 )  || defined( __TURBOC__ ) || \
      defined( vax )       || defined( vms )     || defined( VMS )        || \
      defined( __VMS )     || defined( _M_X64 )
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN

/* Machine/compiler macros treated as big-endian targets */
#elif defined( AMIGA )   || defined( applec )    || defined( __AS400__ )  || \
      defined( _CRAY )   || defined( __hppa )    || defined( __hp9000 )   || \
      defined( ibm370 )  || defined( mc68000 )   || defined( m68k )       || \
      defined( __MRC__ ) || defined( __MVS__ )   || defined( __MWERKS__ ) || \
      defined( sparc )   || defined( __sparc)    || defined( SYMANTEC_C ) || \
      defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM )   || \
      defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN

/* ARM: only the presence of __BIG_ENDIAN is tested here, not its value */
#elif defined(__arm__)
# ifdef __BIG_ENDIAN
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# else
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
/* Unconditional default: as written, this branch is always taken when it   */
/* is reached, so the "#elif 0" and "#else"/#error branches after it are    */
/* never selected unless the constants on these lines are edited.           */
#elif 1     /* **** EDIT HERE IF NECESSARY **** */
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif 0     /* **** EDIT HERE IF NECESSARY **** */
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#else
#  error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order
#endif

#endif

#endif
143 | 


--------------------------------------------------------------------------------
/ref/nistkat/rng.c:
--------------------------------------------------------------------------------
  1 | //
  2 | //  rng.c
  3 | //
  4 | //  Created by Bassham, Lawrence E (Fed) on 8/29/17.
  5 | //  Copyright © 2017 Bassham, Lawrence E (Fed). All rights reserved.
  6 | //
  7 | 
  8 | #include <string.h>
  9 | #include "rng.h"
 10 | #include <openssl/conf.h>
 11 | #include <openssl/evp.h>
 12 | #include <openssl/err.h>
 13 | 
// Global DRBG state (Key, V, reseed_counter): initialised by randombytes_init()
// and advanced by every randombytes() call in this file.
AES256_CTR_DRBG_struct  DRBG_ctx;

// Forward declaration: AES256_ECB is defined further down in this file but is
// already called by seedexpander() above its definition.
void    AES256_ECB(unsigned char *key, unsigned char *ctr, unsigned char *buffer);
 17 | 
 18 | /*
 19 |  seedexpander_init()
 20 |  ctx            - stores the current state of an instance of the seed expander
 21 |  seed           - a 32 byte random value
 22 |  diversifier    - an 8 byte diversifier
 23 |  maxlen         - maximum number of bytes (less than 2**32) generated under this seed and diversifier
 24 |  */
 25 | int
 26 | seedexpander_init(AES_XOF_struct *ctx,
 27 |                   unsigned char *seed,
 28 |                   unsigned char *diversifier,
 29 |                   unsigned long maxlen)
 30 | {
 31 |     if ( maxlen >= 0x100000000 )
 32 |         return RNG_BAD_MAXLEN;
 33 | 
 34 |     ctx->length_remaining = maxlen;
 35 | 
 36 |     memcpy(ctx->key, seed, 32);
 37 | 
 38 |     memcpy(ctx->ctr, diversifier, 8);
 39 |     ctx->ctr[11] = maxlen % 256;
 40 |     maxlen >>= 8;
 41 |     ctx->ctr[10] = maxlen % 256;
 42 |     maxlen >>= 8;
 43 |     ctx->ctr[9] = maxlen % 256;
 44 |     maxlen >>= 8;
 45 |     ctx->ctr[8] = maxlen % 256;
 46 |     memset(ctx->ctr+12, 0x00, 4);
 47 | 
 48 |     ctx->buffer_pos = 16;
 49 |     memset(ctx->buffer, 0x00, 16);
 50 | 
 51 |     return RNG_SUCCESS;
 52 | }
 53 | 
 54 | /*
 55 |  seedexpander()
 56 |     ctx  - stores the current state of an instance of the seed expander
 57 |     x    - returns the XOF data
 58 |     xlen - number of bytes to return
 59 |  */
 60 | int
 61 | seedexpander(AES_XOF_struct *ctx, unsigned char *x, unsigned long xlen)
 62 | {
 63 |     unsigned long   offset;
 64 | 
 65 |     if ( x == NULL )
 66 |         return RNG_BAD_OUTBUF;
 67 |     if ( xlen >= ctx->length_remaining )
 68 |         return RNG_BAD_REQ_LEN;
 69 | 
 70 |     ctx->length_remaining -= xlen;
 71 | 
 72 |     offset = 0;
 73 |     while ( xlen > 0 ) {
 74 |         if ( xlen <= (16-ctx->buffer_pos) ) { // buffer has what we need
 75 |             memcpy(x+offset, ctx->buffer+ctx->buffer_pos, xlen);
 76 |             ctx->buffer_pos += xlen;
 77 | 
 78 |             return RNG_SUCCESS;
 79 |         }
 80 | 
 81 |         // take what's in the buffer
 82 |         memcpy(x+offset, ctx->buffer+ctx->buffer_pos, 16-ctx->buffer_pos);
 83 |         xlen -= 16-ctx->buffer_pos;
 84 |         offset += 16-ctx->buffer_pos;
 85 | 
 86 |         AES256_ECB(ctx->key, ctx->ctr, ctx->buffer);
 87 |         ctx->buffer_pos = 0;
 88 | 
 89 |         //increment the counter
 90 |         for (int i=15; i>=12; i--) {
 91 |             if ( ctx->ctr[i] == 0xff )
 92 |                 ctx->ctr[i] = 0x00;
 93 |             else {
 94 |                 ctx->ctr[i]++;
 95 |                 break;
 96 |             }
 97 |         }
 98 | 
 99 |     }
100 | 
101 |     return RNG_SUCCESS;
102 | }
103 | 
104 | 
// Error sink for the OpenSSL calls in this file: prints the OpenSSL error
// queue to stderr and then terminates the process with abort(), so it does
// not return to its caller.
void handleErrors(void)
{
    ERR_print_errors_fp(stderr);
    abort();
}
110 | 
111 | // Use whatever AES implementation you have. This uses AES from openSSL library
112 | //    key - 256-bit AES key
113 | //    ctr - a 128-bit plaintext value
114 | //    buffer - a 128-bit ciphertext value
115 | void
116 | AES256_ECB(unsigned char *key, unsigned char *ctr, unsigned char *buffer)
117 | {
118 |     EVP_CIPHER_CTX *ctx;
119 | 
120 |     int len;
121 | 
122 |     int ciphertext_len;
123 | 
124 |     /* Create and initialise the context */
125 |     if(!(ctx = EVP_CIPHER_CTX_new())) handleErrors();
126 | 
127 |     if(1 != EVP_EncryptInit_ex(ctx, EVP_aes_256_ecb(), NULL, key, NULL))
128 |         handleErrors();
129 | 
130 |     if(1 != EVP_EncryptUpdate(ctx, buffer, &len, ctr, 16))
131 |         handleErrors();
132 |     ciphertext_len = len;
133 | 
134 |     /* Clean up */
135 |     EVP_CIPHER_CTX_free(ctx);
136 | }
137 | 
138 | void
139 | randombytes_init(unsigned char *entropy_input,
140 |                  unsigned char *personalization_string,
141 |                  int security_strength)
142 | {
143 |     unsigned char   seed_material[48];
144 | 
145 |     memcpy(seed_material, entropy_input, 48);
146 |     if (personalization_string)
147 |         for (int i=0; i<48; i++)
148 |             seed_material[i] ^= personalization_string[i];
149 |     memset(DRBG_ctx.Key, 0x00, 32);
150 |     memset(DRBG_ctx.V, 0x00, 16);
151 |     AES256_CTR_DRBG_Update(seed_material, DRBG_ctx.Key, DRBG_ctx.V);
152 |     DRBG_ctx.reseed_counter = 1;
153 | }
154 | 
155 | int
156 | randombytes(unsigned char *x, unsigned long long xlen)
157 | {
158 |     unsigned char   block[16];
159 |     int             i = 0;
160 | 
161 |     while ( xlen > 0 ) {
162 |         //increment V
163 |         for (int j=15; j>=0; j--) {
164 |             if ( DRBG_ctx.V[j] == 0xff )
165 |                 DRBG_ctx.V[j] = 0x00;
166 |             else {
167 |                 DRBG_ctx.V[j]++;
168 |                 break;
169 |             }
170 |         }
171 |         AES256_ECB(DRBG_ctx.Key, DRBG_ctx.V, block);
172 |         if ( xlen > 15 ) {
173 |             memcpy(x+i, block, 16);
174 |             i += 16;
175 |             xlen -= 16;
176 |         }
177 |         else {
178 |             memcpy(x+i, block, xlen);
179 |             xlen = 0;
180 |         }
181 |     }
182 |     AES256_CTR_DRBG_Update(NULL, DRBG_ctx.Key, DRBG_ctx.V);
183 |     DRBG_ctx.reseed_counter++;
184 | 
185 |     return RNG_SUCCESS;
186 | }
187 | 
// Derives a new (Key, V) pair from the current one.
//    provided_data - optional 48 bytes XORed into the derived material (may be NULL)
//    Key           - 32-byte key, replaced in place
//    V             - 16-byte counter value, replaced in place
void
AES256_CTR_DRBG_Update(unsigned char *provided_data,
                       unsigned char *Key,
                       unsigned char *V)
{
    unsigned char   temp[48];
    unsigned char   *blk = temp;

    // three AES blocks (48 bytes) under the current key, each from the next counter value
    for (int round = 0; round < 3; round++, blk += 16) {
        // add one to V as a big-endian integer: keep carrying while a byte wraps to zero
        for (int j = 15; j >= 0 && ++V[j] == 0x00; j--)
            ;

        AES256_ECB(Key, V, blk);
    }

    if ( provided_data != NULL ) {
        for (int i = 0; i < 48; i++)
            temp[i] ^= provided_data[i];
    }

    // first 32 bytes become the new key, last 16 bytes the new V
    memcpy(Key, temp, 32);
    memcpy(V, temp+32, 16);
}
214 | 
215 | 
216 | 
217 | 
218 | 
219 | 
220 | 
221 | 
222 | 
223 | 


--------------------------------------------------------------------------------
/ref/kem.c:
--------------------------------------------------------------------------------
  1 | #include <stddef.h>
  2 | #include <stdint.h>
  3 | #include <string.h>
  4 | #include "params.h"
  5 | #include "kem.h"
  6 | #include "indcpa.h"
  7 | #include "verify.h"
  8 | #include "symmetric.h"
  9 | #include "randombytes.h"
 10 | /*************************************************
 11 | * Name:        crypto_kem_keypair_derand
 12 | *
 13 | * Description: Generates public and private key
 14 | *              for CCA-secure Kyber key encapsulation mechanism
 15 | *
 16 | * Arguments:   - uint8_t *pk: pointer to output public key
 17 | *                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
 18 | *              - uint8_t *sk: pointer to output private key
 19 | *                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
 20 | *              - uint8_t *coins: pointer to input randomness
 21 | *                (an already allocated array filled with 2*KYBER_SYMBYTES random bytes)
 22 | **
 23 | * Returns 0 (success)
 24 | **************************************************/
 25 | int crypto_kem_keypair_derand(uint8_t *pk,
 26 |                               uint8_t *sk,
 27 |                               const uint8_t *coins)
 28 | {
 29 |   indcpa_keypair_derand(pk, sk, coins);
 30 |   memcpy(sk+KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_PUBLICKEYBYTES);
 31 |   hash_h(sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
 32 |   /* Value z for pseudo-random output on reject */
 33 |   memcpy(sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES, coins+KYBER_SYMBYTES, KYBER_SYMBYTES);
 34 |   return 0;
 35 | }
 36 | 
 37 | /*************************************************
 38 | * Name:        crypto_kem_keypair
 39 | *
 40 | * Description: Generates public and private key
 41 | *              for CCA-secure Kyber key encapsulation mechanism
 42 | *
 43 | * Arguments:   - uint8_t *pk: pointer to output public key
 44 | *                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
 45 | *              - uint8_t *sk: pointer to output private key
 46 | *                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
 47 | *
 48 | * Returns 0 (success)
 49 | **************************************************/
 50 | int crypto_kem_keypair(uint8_t *pk,
 51 |                        uint8_t *sk)
 52 | {
 53 |   uint8_t coins[2*KYBER_SYMBYTES];
 54 |   randombytes(coins, 2*KYBER_SYMBYTES);
 55 |   crypto_kem_keypair_derand(pk, sk, coins);
 56 |   return 0;
 57 | }
 58 | 
 59 | /*************************************************
 60 | * Name:        crypto_kem_enc_derand
 61 | *
 62 | * Description: Generates cipher text and shared
 63 | *              secret for given public key
 64 | *
 65 | * Arguments:   - uint8_t *ct: pointer to output cipher text
 66 | *                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
 67 | *              - uint8_t *ss: pointer to output shared secret
 68 | *                (an already allocated array of KYBER_SSBYTES bytes)
 69 | *              - const uint8_t *pk: pointer to input public key
 70 | *                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
 71 | *              - const uint8_t *coins: pointer to input randomness
 72 | *                (an already allocated array filled with KYBER_SYMBYTES random bytes)
 73 | **
 74 | * Returns 0 (success)
 75 | **************************************************/
 76 | int crypto_kem_enc_derand(uint8_t *ct,
 77 |                           uint8_t *ss,
 78 |                           const uint8_t *pk,
 79 |                           const uint8_t *coins)
 80 | {
 81 |   uint8_t buf[2*KYBER_SYMBYTES];
 82 |   /* Will contain key, coins */
 83 |   uint8_t kr[2*KYBER_SYMBYTES];
 84 | 
 85 |   memcpy(buf, coins, KYBER_SYMBYTES);
 86 | 
 87 |   /* Multitarget countermeasure for coins + contributory KEM */
 88 |   hash_h(buf+KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
 89 |   hash_g(kr, buf, 2*KYBER_SYMBYTES);
 90 | 
 91 |   /* coins are in kr+KYBER_SYMBYTES */
 92 |   indcpa_enc(ct, buf, pk, kr+KYBER_SYMBYTES);
 93 | 
 94 |   memcpy(ss,kr,KYBER_SYMBYTES);
 95 |   return 0;
 96 | }
 97 | 
 98 | /*************************************************
 99 | * Name:        crypto_kem_enc
100 | *
101 | * Description: Generates cipher text and shared
102 | *              secret for given public key
103 | *
104 | * Arguments:   - uint8_t *ct: pointer to output cipher text
105 | *                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
106 | *              - uint8_t *ss: pointer to output shared secret
107 | *                (an already allocated array of KYBER_SSBYTES bytes)
108 | *              - const uint8_t *pk: pointer to input public key
109 | *                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
110 | *
111 | * Returns 0 (success)
112 | **************************************************/
113 | int crypto_kem_enc(uint8_t *ct,
114 |                    uint8_t *ss,
115 |                    const uint8_t *pk)
116 | {
117 |   uint8_t coins[KYBER_SYMBYTES];
118 |   randombytes(coins, KYBER_SYMBYTES);
119 |   crypto_kem_enc_derand(ct, ss, pk, coins);
120 |   return 0;
121 | }
122 | 
123 | /*************************************************
124 | * Name:        crypto_kem_dec
125 | *
126 | * Description: Generates shared secret for given
127 | *              cipher text and private key
128 | *
129 | * Arguments:   - uint8_t *ss: pointer to output shared secret
130 | *                (an already allocated array of KYBER_SSBYTES bytes)
131 | *              - const uint8_t *ct: pointer to input cipher text
132 | *                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
133 | *              - const uint8_t *sk: pointer to input private key
134 | *                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
135 | *
136 | * Returns 0.
137 | *
138 | * On failure, ss will contain a pseudo-random value.
139 | **************************************************/
140 | int crypto_kem_dec(uint8_t *ss,
141 |                    const uint8_t *ct,
142 |                    const uint8_t *sk)
143 | {
144 |   int fail;
145 |   uint8_t buf[2*KYBER_SYMBYTES];
146 |   /* Will contain key, coins */
147 |   uint8_t kr[2*KYBER_SYMBYTES];
148 | //  uint8_t cmp[KYBER_CIPHERTEXTBYTES+KYBER_SYMBYTES];
149 |   uint8_t cmp[KYBER_CIPHERTEXTBYTES];
150 |   const uint8_t *pk = sk+KYBER_INDCPA_SECRETKEYBYTES;
151 | 
152 |   indcpa_dec(buf, ct, sk);
153 | 
154 |   /* Multitarget countermeasure for coins + contributory KEM */
155 |   memcpy(buf+KYBER_SYMBYTES, sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, KYBER_SYMBYTES);
156 |   hash_g(kr, buf, 2*KYBER_SYMBYTES);
157 | 
158 |   /* coins are in kr+KYBER_SYMBYTES */
159 |   indcpa_enc(cmp, buf, pk, kr+KYBER_SYMBYTES);
160 | 
161 |   fail = verify(ct, cmp, KYBER_CIPHERTEXTBYTES);
162 | 
163 |   /* Compute rejection key */
164 |   rkprf(ss,sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES,ct);
165 | 
166 |   /* Copy true key to return buffer if fail is false */
167 |   cmov(ss,kr,KYBER_SYMBYTES,!fail);
168 | 
169 |   return 0;
170 | }
171 | 


--------------------------------------------------------------------------------
/ref/nistkat/PQCgenKAT_kem.c:
--------------------------------------------------------------------------------
  1 | 
  2 | //
  3 | //  PQCgenKAT_kem.c
  4 | //
  5 | //  Created by Bassham, Lawrence E (Fed) on 8/29/17.
  6 | //  Copyright © 2017 Bassham, Lawrence E (Fed). All rights reserved.
  7 | //
  8 | #include <stdio.h>
  9 | #include <stdlib.h>
 10 | #include <string.h>
 11 | #include <ctype.h>
 12 | #include "rng.h"
 13 | #include "../kem.h"
 14 | 
 15 | #define	MAX_MARKER_LEN		50
 16 | #define KAT_SUCCESS          0
 17 | #define KAT_FILE_OPEN_ERROR -1
 18 | #define KAT_DATA_ERROR      -3
 19 | #define KAT_CRYPTO_FAILURE  -4
 20 | 
 21 | int		FindMarker(FILE *infile, const char *marker);
 22 | int		ReadHex(FILE *infile, unsigned char *A, int Length, char *str);
 23 | void	fprintBstr(FILE *fp, char *S, unsigned char *A, unsigned long long L);
 24 | 
 25 | int
 26 | main()
 27 | {
 28 |     char                fn_req[32], fn_rsp[32];
 29 |     FILE                *fp_req, *fp_rsp;
 30 |     unsigned char       seed[48];
 31 |     unsigned char       entropy_input[48];
 32 |     unsigned char       ct[CRYPTO_CIPHERTEXTBYTES], ss[CRYPTO_BYTES], ss1[CRYPTO_BYTES];
 33 |     int                 count;
 34 |     int                 done;
 35 |     unsigned char       pk[CRYPTO_PUBLICKEYBYTES], sk[CRYPTO_SECRETKEYBYTES];
 36 |     int                 ret_val;
 37 | 
 38 |     // Create the REQUEST file
 39 |     sprintf(fn_req, "PQCkemKAT_%d.req", CRYPTO_SECRETKEYBYTES);
 40 |     if ( (fp_req = fopen(fn_req, "w")) == NULL ) {
 41 |         printf("Couldn't open <%s> for write\n", fn_req);
 42 |         return KAT_FILE_OPEN_ERROR;
 43 |     }
 44 |     sprintf(fn_rsp, "PQCkemKAT_%d.rsp", CRYPTO_SECRETKEYBYTES);
 45 |     if ( (fp_rsp = fopen(fn_rsp, "w")) == NULL ) {
 46 |         printf("Couldn't open <%s> for write\n", fn_rsp);
 47 |         return KAT_FILE_OPEN_ERROR;
 48 |     }
 49 | 
 50 |     for (int i=0; i<48; i++)
 51 |         entropy_input[i] = i;
 52 | 
 53 |     randombytes_init(entropy_input, NULL, 256);
 54 |     for (int i=0; i<100; i++) {
 55 |         fprintf(fp_req, "count = %d\n", i);
 56 |         randombytes(seed, 48);
 57 |         fprintBstr(fp_req, "seed = ", seed, 48);
 58 |         fprintf(fp_req, "pk =\n");
 59 |         fprintf(fp_req, "sk =\n");
 60 |         fprintf(fp_req, "ct =\n");
 61 |         fprintf(fp_req, "ss =\n\n");
 62 |     }
 63 |     fclose(fp_req);
 64 | 
 65 |     //Create the RESPONSE file based on what's in the REQUEST file
 66 |     if ( (fp_req = fopen(fn_req, "r")) == NULL ) {
 67 |         printf("Couldn't open <%s> for read\n", fn_req);
 68 |         return KAT_FILE_OPEN_ERROR;
 69 |     }
 70 | 
 71 |     fprintf(fp_rsp, "# %s\n\n", CRYPTO_ALGNAME);
 72 |     done = 0;
 73 |     do {
 74 |         if ( FindMarker(fp_req, "count = ") )
 75 |             fscanf(fp_req, "%d", &count);
 76 |         else {
 77 |             done = 1;
 78 |             break;
 79 |         }
 80 |         fprintf(fp_rsp, "count = %d\n", count);
 81 | 
 82 |         if ( !ReadHex(fp_req, seed, 48, "seed = ") ) {
 83 |             printf("ERROR: unable to read 'seed' from <%s>\n", fn_req);
 84 |             return KAT_DATA_ERROR;
 85 |         }
 86 |         fprintBstr(fp_rsp, "seed = ", seed, 48);
 87 | 
 88 |         randombytes_init(seed, NULL, 256);
 89 | 
 90 |         // Generate the public/private keypair
 91 |         if ( (ret_val = crypto_kem_keypair(pk, sk)) != 0) {
 92 |             printf("crypto_kem_keypair returned <%d>\n", ret_val);
 93 |             return KAT_CRYPTO_FAILURE;
 94 |         }
 95 |         fprintBstr(fp_rsp, "pk = ", pk, CRYPTO_PUBLICKEYBYTES);
 96 |         fprintBstr(fp_rsp, "sk = ", sk, CRYPTO_SECRETKEYBYTES);
 97 | 
 98 |         if ( (ret_val = crypto_kem_enc(ct, ss, pk)) != 0) {
 99 |             printf("crypto_kem_enc returned <%d>\n", ret_val);
100 |             return KAT_CRYPTO_FAILURE;
101 |         }
102 |         fprintBstr(fp_rsp, "ct = ", ct, CRYPTO_CIPHERTEXTBYTES);
103 |         fprintBstr(fp_rsp, "ss = ", ss, CRYPTO_BYTES);
104 | 
105 |         fprintf(fp_rsp, "\n");
106 | 
107 |         if ( (ret_val = crypto_kem_dec(ss1, ct, sk)) != 0) {
108 |             printf("crypto_kem_dec returned <%d>\n", ret_val);
109 |             return KAT_CRYPTO_FAILURE;
110 |         }
111 | 
112 |         if ( memcmp(ss, ss1, CRYPTO_BYTES) ) {
113 |             printf("crypto_kem_dec returned bad 'ss' value\n");
114 |             return KAT_CRYPTO_FAILURE;
115 |         }
116 | 
117 |     } while ( !done );
118 | 
119 |     fclose(fp_req);
120 |     fclose(fp_rsp);
121 | 
122 |     return KAT_SUCCESS;
123 | }
124 | 
125 | 
126 | 
127 | //
128 | // ALLOW TO READ HEXADECIMAL ENTRY (KEYS, DATA, TEXT, etc.)
129 | //
130 | //
131 | // ALLOW TO READ HEXADECIMAL ENTRY (KEYS, DATA, TEXT, etc.)
132 | //
133 | int
134 | FindMarker(FILE *infile, const char *marker)
135 | {
136 | 	char	line[MAX_MARKER_LEN];
137 | 	int		i, len;
138 | 	int curr_line;
139 | 
140 | 	len = (int)strlen(marker);
141 | 	if ( len > MAX_MARKER_LEN-1 )
142 | 		len = MAX_MARKER_LEN-1;
143 | 
144 | 	for ( i=0; i<len; i++ )
145 | 	  {
146 | 	    curr_line = fgetc(infile);
147 | 	    line[i] = curr_line;
148 | 	    if (curr_line == EOF )
149 | 	      return 0;
150 | 	  }
151 | 	line[len] = '\0';
152 | 
153 | 	while ( 1 ) {
154 | 		if ( !strncmp(line, marker, len) )
155 | 			return 1;
156 | 
157 | 		for ( i=0; i<len-1; i++ )
158 | 			line[i] = line[i+1];
159 | 		curr_line = fgetc(infile);
160 | 		line[len-1] = curr_line;
161 | 		if (curr_line == EOF )
162 | 		    return 0;
163 | 		line[len] = '\0';
164 | 	}
165 | 
166 | 	// shouldn't get here
167 | 	return 0;
168 | }
169 | 
//
// ALLOW TO READ HEXADECIMAL ENTRY (KEYS, DATA, TEXT, etc.)
//
// Locates the marker str in infile and parses the hexadecimal digits that
// follow it into the Length-byte array A. Every digit shifts the whole array
// left by four bits and is appended as the lowest nibble of the last byte.
// Non-hex characters before the first digit are skipped, except that a
// newline ends the entry; after the first digit any non-hex character ends it.
// Returns 1 on success (also when Length is 0), 0 if the marker is not found.
//
int
ReadHex(FILE *infile, unsigned char *A, int Length, char *str)
{
	int				ch;
	int				seen_digit = 0;
	unsigned char	nibble;

	if ( Length == 0 ) {
		A[0] = 0x00;
		return 1;
	}
	memset(A, 0x00, Length);

	if ( !FindMarker(infile, str) )
		return 0;

	while ( (ch = fgetc(infile)) != EOF ) {
		if ( !isxdigit(ch) ) {
			if ( seen_digit || ch == '\n' )
				break;
			continue;
		}
		seen_digit = 1;

		if ( (ch >= '0') && (ch <= '9') )
			nibble = ch - '0';
		else if ( (ch >= 'A') && (ch <= 'F') )
			nibble = ch - 'A' + 10;
		else if ( (ch >= 'a') && (ch <= 'f') )
			nibble = ch - 'a' + 10;
		else // shouldn't ever get here
			nibble = 0;

		// shift the whole array left by one nibble, then append the new digit
		for ( int k=0; k<Length-1; k++ )
			A[k] = (A[k] << 4) | (A[k+1] >> 4);
		A[Length-1] = (A[Length-1] << 4) | nibble;
	}

	return 1;
}
216 | 
//
// WRITE A LABELLED BYTE STRING AS ONE LINE OF UPPERCASE HEX
//
// Prints S, then the L bytes of A as two hex digits each, then a newline.
// An empty byte string (L == 0) is printed as "00".
//
void
fprintBstr(FILE *fp, char *S, unsigned char *A, unsigned long long L)
{
	const unsigned char	*p = A;
	unsigned long long	left;

	fputs(S, fp);

	if ( L == 0 )
		fputs("00", fp);
	for ( left = L; left > 0; left--, p++ )
		fprintf(fp, "%02X", *p);

	fputc('\n', fp);
}
232 | 
233 | 


--------------------------------------------------------------------------------
/avx2/fips202x4.c:
--------------------------------------------------------------------------------
  1 | #include <stddef.h>
  2 | #include <stdint.h>
  3 | #include <immintrin.h>
  4 | #include <string.h>
  5 | #include "fips202.h"
  6 | #include "fips202x4.h"
  7 | 
/* Use implementation from the Keccak Code Package */
/* The short local name is mapped onto the namespaced symbol
 * KeccakP1600times4_PermuteAll_24rounds, which is only declared here and
 * defined outside this file. It is called on an array of __m256i state words. */
#define KeccakF1600_StatePermute4x FIPS202X4_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds)
extern void KeccakF1600_StatePermute4x(__m256i *s);
 11 | 
 12 | static void keccakx4_absorb_once(__m256i s[25],
 13 |                                  unsigned int r,
 14 |                                  const uint8_t *in0,
 15 |                                  const uint8_t *in1,
 16 |                                  const uint8_t *in2,
 17 |                                  const uint8_t *in3,
 18 |                                  size_t inlen,
 19 |                                  uint8_t p)
 20 | {
 21 |   size_t i;
 22 |   uint64_t pos = 0;
 23 |   __m256i t, idx;
 24 | 
 25 |   for(i = 0; i < 25; ++i)
 26 |     s[i] = _mm256_setzero_si256();
 27 | 
 28 |   idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0);
 29 |   while(inlen >= r) {
 30 |     for(i = 0; i < r/8; ++i) {
 31 |       t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
 32 |       s[i] = _mm256_xor_si256(s[i], t);
 33 |       pos += 8;
 34 |     }
 35 |     inlen -= r;
 36 | 
 37 |     KeccakF1600_StatePermute4x(s);
 38 |   }
 39 | 
 40 |   for(i = 0; i < inlen/8; ++i) {
 41 |     t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
 42 |     s[i] = _mm256_xor_si256(s[i], t);
 43 |     pos += 8;
 44 |   }
 45 |   inlen -= 8*i;
 46 | 
 47 |   if(inlen) {
 48 |     t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
 49 |     idx = _mm256_set1_epi64x((1ULL << (8*inlen)) - 1);
 50 |     t = _mm256_and_si256(t, idx);
 51 |     s[i] = _mm256_xor_si256(s[i], t);
 52 |   }
 53 | 
 54 |   t = _mm256_set1_epi64x((uint64_t)p << 8*inlen);
 55 |   s[i] = _mm256_xor_si256(s[i], t);
 56 |   t = _mm256_set1_epi64x(1ULL << 63);
 57 |   s[r/8 - 1] = _mm256_xor_si256(s[r/8 - 1], t);
 58 | }
 59 | 
 60 | static void keccakx4_squeezeblocks(uint8_t *out0,
 61 |                                    uint8_t *out1,
 62 |                                    uint8_t *out2,
 63 |                                    uint8_t *out3,
 64 |                                    size_t nblocks,
 65 |                                    unsigned int r,
 66 |                                    __m256i s[25])
 67 | {
 68 |   unsigned int i;
 69 |   __m128d t;
 70 | 
 71 |   while(nblocks > 0) {
 72 |     KeccakF1600_StatePermute4x(s);
 73 |     for(i=0; i < r/8; ++i) {
 74 |       t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i]));
 75 |       _mm_storel_pd((__attribute__((__may_alias__)) double *)&out0[8*i], t);
 76 |       _mm_storeh_pd((__attribute__((__may_alias__)) double *)&out1[8*i], t);
 77 |       t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i],1));
 78 |       _mm_storel_pd((__attribute__((__may_alias__)) double *)&out2[8*i], t);
 79 |       _mm_storeh_pd((__attribute__((__may_alias__)) double *)&out3[8*i], t);
 80 |     }
 81 | 
 82 |     out0 += r;
 83 |     out1 += r;
 84 |     out2 += r;
 85 |     out3 += r;
 86 |     --nblocks;
 87 |   }
 88 | }
 89 | 
 90 | void shake128x4_absorb_once(keccakx4_state *state,
 91 |                             const uint8_t *in0,
 92 |                             const uint8_t *in1,
 93 |                             const uint8_t *in2,
 94 |                             const uint8_t *in3,
 95 |                             size_t inlen)
 96 | {
 97 |   keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F);
 98 | }
 99 | 
100 | void shake128x4_squeezeblocks(uint8_t *out0,
101 |                               uint8_t *out1,
102 |                               uint8_t *out2,
103 |                               uint8_t *out3,
104 |                               size_t nblocks,
105 |                               keccakx4_state *state)
106 | {
107 |   keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s);
108 | }
109 | 
110 | void shake256x4_absorb_once(keccakx4_state *state,
111 |                             const uint8_t *in0,
112 |                             const uint8_t *in1,
113 |                             const uint8_t *in2,
114 |                             const uint8_t *in3,
115 |                             size_t inlen)
116 | {
117 |   keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F);
118 | }
119 | 
120 | void shake256x4_squeezeblocks(uint8_t *out0,
121 |                               uint8_t *out1,
122 |                               uint8_t *out2,
123 |                               uint8_t *out3,
124 |                               size_t nblocks,
125 |                               keccakx4_state *state)
126 | {
127 |   keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s);
128 | }
129 | 
130 | void shake128x4(uint8_t *out0,
131 |                 uint8_t *out1,
132 |                 uint8_t *out2,
133 |                 uint8_t *out3,
134 |                 size_t outlen,
135 |                 const uint8_t *in0,
136 |                 const uint8_t *in1,
137 |                 const uint8_t *in2,
138 |                 const uint8_t *in3,
139 |                 size_t inlen)
140 | {
141 |   unsigned int i;
142 |   size_t nblocks = outlen/SHAKE128_RATE;
143 |   uint8_t t[4][SHAKE128_RATE];
144 |   keccakx4_state state;
145 | 
146 |   shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen);
147 |   shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state);
148 | 
149 |   out0 += nblocks*SHAKE128_RATE;
150 |   out1 += nblocks*SHAKE128_RATE;
151 |   out2 += nblocks*SHAKE128_RATE;
152 |   out3 += nblocks*SHAKE128_RATE;
153 |   outlen -= nblocks*SHAKE128_RATE;
154 | 
155 |   if(outlen) {
156 |     shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state);
157 |     for(i = 0; i < outlen; ++i) {
158 |       out0[i] = t[0][i];
159 |       out1[i] = t[1][i];
160 |       out2[i] = t[2][i];
161 |       out3[i] = t[3][i];
162 |     }
163 |   }
164 | }
165 | 
166 | void shake256x4(uint8_t *out0,
167 |                 uint8_t *out1,
168 |                 uint8_t *out2,
169 |                 uint8_t *out3,
170 |                 size_t outlen,
171 |                 const uint8_t *in0,
172 |                 const uint8_t *in1,
173 |                 const uint8_t *in2,
174 |                 const uint8_t *in3,
175 |                 size_t inlen)
176 | {
177 |   unsigned int i;
178 |   size_t nblocks = outlen/SHAKE256_RATE;
179 |   uint8_t t[4][SHAKE256_RATE];
180 |   keccakx4_state state;
181 | 
182 |   shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen);
183 |   shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state);
184 | 
185 |   out0 += nblocks*SHAKE256_RATE;
186 |   out1 += nblocks*SHAKE256_RATE;
187 |   out2 += nblocks*SHAKE256_RATE;
188 |   out3 += nblocks*SHAKE256_RATE;
189 |   outlen -= nblocks*SHAKE256_RATE;
190 | 
191 |   if(outlen) {
192 |     shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state);
193 |     for(i = 0; i < outlen; ++i) {
194 |       out0[i] = t[0][i];
195 |       out1[i] = t[1][i];
196 |       out2[i] = t[2][i];
197 |       out3[i] = t[3][i];
198 |     }
199 |   }
200 | }
201 | 


--------------------------------------------------------------------------------
/avx2/keccak4x/KeccakP-1600-unrolling.macros:
--------------------------------------------------------------------------------
  1 | /*
  2 | Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
  3 | Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
  4 | denoted as "the implementer".
  5 | 
  6 | For more information, feedback or questions, please refer to our websites:
  7 | http://keccak.noekeon.org/
  8 | http://keyak.noekeon.org/
  9 | http://ketje.noekeon.org/
 10 | 
 11 | To the extent possible under law, the implementer has waived all copyright
 12 | and related or neighboring rights to the source code in this file.
 13 | http://creativecommons.org/publicdomain/zero/1.0/
 14 | */
 15 | 
 16 | #if (defined(FullUnrolling))
/*
 * Fully unrolled variant: every step is spelled out and no loop variable is
 * needed. rounds24 expands to prepareTheta followed by 24 steps numbered
 * 0..23. The A and E arguments swap places from one step to the next
 * (presumably source and destination state variables -- confirm against the
 * definition of thetaRhoPiChiIotaPrepareTheta). The last step uses
 * thetaRhoPiChiIota, i.e. the form without "PrepareTheta".
 */
#define rounds24 \
    prepareTheta \
    thetaRhoPiChiIotaPrepareTheta( 0, A, E) \
    thetaRhoPiChiIotaPrepareTheta( 1, E, A) \
    thetaRhoPiChiIotaPrepareTheta( 2, A, E) \
    thetaRhoPiChiIotaPrepareTheta( 3, E, A) \
    thetaRhoPiChiIotaPrepareTheta( 4, A, E) \
    thetaRhoPiChiIotaPrepareTheta( 5, E, A) \
    thetaRhoPiChiIotaPrepareTheta( 6, A, E) \
    thetaRhoPiChiIotaPrepareTheta( 7, E, A) \
    thetaRhoPiChiIotaPrepareTheta( 8, A, E) \
    thetaRhoPiChiIotaPrepareTheta( 9, E, A) \
    thetaRhoPiChiIotaPrepareTheta(10, A, E) \
    thetaRhoPiChiIotaPrepareTheta(11, E, A) \
    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
    thetaRhoPiChiIota(23, E, A) \

/* rounds12: same construction restricted to the 12 steps numbered 12..23. */
#define rounds12 \
    prepareTheta \
    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
    thetaRhoPiChiIota(23, E, A) \
 58 | 
 59 | #elif (Unrolling == 12)
/*
 * Unrolling == 12: rounds24 runs a 12-step body twice (i = 0 and i = 12).
 * The loop variable i is not declared here, so the code expanding the macro
 * has to provide it. Every step in the loop uses the ...PrepareTheta form,
 * including the one numbered 23.
 */
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i+=12) { \
        thetaRhoPiChiIotaPrepareTheta(i   , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \
    } \

/*
 * rounds12 needs only a single pass of 12 steps, so it is written without a
 * loop: steps 12..23, the last one in the thetaRhoPiChiIota form.
 */
#define rounds12 \
    prepareTheta \
    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
    thetaRhoPiChiIota(23, E, A) \
 91 | 
 92 | #elif (Unrolling == 6)
/*
 * Unrolling == 6: six steps per loop iteration. rounds24 iterates
 * i = 0, 6, 12, 18. The loop variable i must be supplied by the expanding
 * code.
 */
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i+=6) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
    } \

/* rounds12: same loop body, started at i = 12 (iterations i = 12, 18). */
#define rounds12 \
    prepareTheta \
    for(i=12; i<24; i+=6) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
    } \
114 | 
115 | #elif (Unrolling == 4)
/*
 * Unrolling == 4: four steps per loop iteration. rounds24 iterates
 * i = 0, 4, ..., 20. The loop variable i must be supplied by the expanding
 * code.
 */
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i+=4) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
    } \

/* rounds12: same loop body, started at i = 12 (iterations i = 12, 16, 20). */
#define rounds12 \
    prepareTheta \
    for(i=12; i<24; i+=4) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
    } \
133 | 
134 | #elif (Unrolling == 3)
/*
 * Unrolling == 3: three steps per loop iteration. The third step is an
 * A, E step, so unlike the even unrolling factors each iteration ends with
 * copyStateVariables(A, E) (presumably copying the E variables back into A
 * so the next iteration can start from A again -- confirm against its
 * definition). The loop variable i must be supplied by the expanding code.
 */
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i+=3) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        copyStateVariables(A, E) \
    } \

/* rounds12: same loop body, started at i = 12. */
#define rounds12 \
    prepareTheta \
    for(i=12; i<24; i+=3) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        copyStateVariables(A, E) \
    } \
152 | 
153 | #elif (Unrolling == 2)
/*
 * Unrolling == 2: one A, E step and one E, A step per loop iteration.
 * The loop variable i must be supplied by the expanding code.
 */
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i+=2) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
    } \

/* rounds12: same loop body, started at i = 12. */
#define rounds12 \
    prepareTheta \
    for(i=12; i<24; i+=2) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
    } \
167 | 
168 | #elif (Unrolling == 1)
/*
 * Unrolling == 1: a single A, E step per loop iteration, each followed by
 * copyStateVariables(A, E). The loop variable i must be supplied by the
 * expanding code.
 */
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i++) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        copyStateVariables(A, E) \
    } \

/* rounds12: same loop body, started at i = 12. */
#define rounds12 \
    prepareTheta \
    for(i=12; i<24; i++) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        copyStateVariables(A, E) \
    } \
182 | 
183 | #else
184 | #error "Unrolling is not correctly specified!"
185 | #endif
186 | 
/*
 * roundsN(__nrounds): run the last __nrounds steps, i.e. those numbered
 * 24-__nrounds .. 23. It is defined for every Unrolling setting. If the
 * first step number is odd, one A, E step plus copyStateVariables(A, E) is
 * executed on its own; the remaining steps then run in A, E / E, A pairs.
 * The loop variable i must be supplied by the expanding code.
 */
#define roundsN(__nrounds) \
    prepareTheta \
    i = 24 - (__nrounds); \
    if ((i&1) != 0) { \
        thetaRhoPiChiIotaPrepareTheta(i, A, E) \
        copyStateVariables(A, E) \
        ++i; \
    } \
    for( /* empty */; i<24; i+=2) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
    }
199 | 


--------------------------------------------------------------------------------
/ref/polyvec.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | #include "params.h"
  3 | #include "poly.h"
  4 | #include "polyvec.h"
  5 | 
  6 | /*************************************************
  7 | * Name:        polyvec_compress
  8 | *
  9 | * Description: Compress and serialize vector of polynomials
 10 | *
 11 | * Arguments:   - uint8_t *r: pointer to output byte array
 12 | *                            (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
 13 | *              - const polyvec *a: pointer to input vector of polynomials
 14 | **************************************************/
 15 | void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
 16 | {
 17 |   unsigned int i,j,k;
 18 |   uint64_t d0;
 19 | 
 20 | #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
 21 |   uint16_t t[8];
 22 |   for(i=0;i<KYBER_K;i++) {
 23 |     for(j=0;j<KYBER_N/8;j++) {
 24 |       for(k=0;k<8;k++) {
 25 |         t[k]  = a->vec[i].coeffs[8*j+k];
 26 |         t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
 27 | /*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
 28 |         d0 = t[k];
 29 |         d0 <<= 11;
 30 |         d0 += 1664;
 31 |         d0 *= 645084;
 32 |         d0 >>= 31;
 33 |         t[k] = d0 & 0x7ff;
 34 |       }
 35 | 
 36 |       r[ 0] = (t[0] >>  0);
 37 |       r[ 1] = (t[0] >>  8) | (t[1] << 3);
 38 |       r[ 2] = (t[1] >>  5) | (t[2] << 6);
 39 |       r[ 3] = (t[2] >>  2);
 40 |       r[ 4] = (t[2] >> 10) | (t[3] << 1);
 41 |       r[ 5] = (t[3] >>  7) | (t[4] << 4);
 42 |       r[ 6] = (t[4] >>  4) | (t[5] << 7);
 43 |       r[ 7] = (t[5] >>  1);
 44 |       r[ 8] = (t[5] >>  9) | (t[6] << 2);
 45 |       r[ 9] = (t[6] >>  6) | (t[7] << 5);
 46 |       r[10] = (t[7] >>  3);
 47 |       r += 11;
 48 |     }
 49 |   }
 50 | #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
 51 |   uint16_t t[4];
 52 |   for(i=0;i<KYBER_K;i++) {
 53 |     for(j=0;j<KYBER_N/4;j++) {
 54 |       for(k=0;k<4;k++) {
 55 |         t[k]  = a->vec[i].coeffs[4*j+k];
 56 |         t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
 57 | /*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
 58 |         d0 = t[k];
 59 |         d0 <<= 10;
 60 |         d0 += 1665;
 61 |         d0 *= 1290167;
 62 |         d0 >>= 32;
 63 |         t[k] = d0 & 0x3ff;
 64 |       }
 65 | 
 66 |       r[0] = (t[0] >> 0);
 67 |       r[1] = (t[0] >> 8) | (t[1] << 2);
 68 |       r[2] = (t[1] >> 6) | (t[2] << 4);
 69 |       r[3] = (t[2] >> 4) | (t[3] << 6);
 70 |       r[4] = (t[3] >> 2);
 71 |       r += 5;
 72 |     }
 73 |   }
 74 | #else
 75 | #error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
 76 | #endif
 77 | }
 78 | 
 79 | /*************************************************
 80 | * Name:        polyvec_decompress
 81 | *
 82 | * Description: De-serialize and decompress vector of polynomials;
 83 | *              approximate inverse of polyvec_compress
 84 | *
 85 | * Arguments:   - polyvec *r:       pointer to output vector of polynomials
 86 | *              - const uint8_t *a: pointer to input byte array
 87 | *                                  (of length KYBER_POLYVECCOMPRESSEDBYTES)
 88 | **************************************************/
 89 | void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES])
 90 | {
 91 |   unsigned int i,j,k;
 92 | 
 93 | #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
 94 |   uint16_t t[8];
 95 |   for(i=0;i<KYBER_K;i++) {
 96 |     for(j=0;j<KYBER_N/8;j++) {
 97 |       t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8);
 98 |       t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5);
 99 |       t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10);
100 |       t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7);
101 |       t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4);
102 |       t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9);
103 |       t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6);
104 |       t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3);
105 |       a += 11;
106 | 
107 |       for(k=0;k<8;k++)
108 |         r->vec[i].coeffs[8*j+k] = ((uint32_t)(t[k] & 0x7FF)*KYBER_Q + 1024) >> 11;
109 |     }
110 |   }
111 | #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
112 |   uint16_t t[4];
113 |   for(i=0;i<KYBER_K;i++) {
114 |     for(j=0;j<KYBER_N/4;j++) {
115 |       t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8);
116 |       t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6);
117 |       t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4);
118 |       t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2);
119 |       a += 5;
120 | 
121 |       for(k=0;k<4;k++)
122 |         r->vec[i].coeffs[4*j+k] = ((uint32_t)(t[k] & 0x3FF)*KYBER_Q + 512) >> 10;
123 |     }
124 |   }
125 | #else
126 | #error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
127 | #endif
128 | }
129 | 
130 | /*************************************************
131 | * Name:        polyvec_tobytes
132 | *
133 | * Description: Serialize vector of polynomials
134 | *
135 | * Arguments:   - uint8_t *r: pointer to output byte array
136 | *                            (needs space for KYBER_POLYVECBYTES)
137 | *              - const polyvec *a: pointer to input vector of polynomials
138 | **************************************************/
139 | void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a)
140 | {
141 |   unsigned int i;
142 |   for(i=0;i<KYBER_K;i++)
143 |     poly_tobytes(r+i*KYBER_POLYBYTES, &a->vec[i]);
144 | }
145 | 
146 | /*************************************************
147 | * Name:        polyvec_frombytes
148 | *
149 | * Description: De-serialize vector of polynomials;
150 | *              inverse of polyvec_tobytes
151 | *
152 | * Arguments:   - uint8_t *r:       pointer to output byte array
153 | *              - const polyvec *a: pointer to input vector of polynomials
154 | *                                  (of length KYBER_POLYVECBYTES)
155 | **************************************************/
156 | void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES])
157 | {
158 |   unsigned int i;
159 |   for(i=0;i<KYBER_K;i++)
160 |     poly_frombytes(&r->vec[i], a+i*KYBER_POLYBYTES);
161 | }
162 | 
163 | /*************************************************
164 | * Name:        polyvec_ntt
165 | *
166 | * Description: Apply forward NTT to all elements of a vector of polynomials
167 | *
168 | * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
169 | **************************************************/
170 | void polyvec_ntt(polyvec *r)
171 | {
172 |   unsigned int i;
173 |   for(i=0;i<KYBER_K;i++)
174 |     poly_ntt(&r->vec[i]);
175 | }
176 | 
177 | /*************************************************
178 | * Name:        polyvec_invntt_tomont
179 | *
180 | * Description: Apply inverse NTT to all elements of a vector of polynomials
181 | *              and multiply by Montgomery factor 2^16
182 | *
183 | * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
184 | **************************************************/
185 | void polyvec_invntt_tomont(polyvec *r)
186 | {
187 |   unsigned int i;
188 |   for(i=0;i<KYBER_K;i++)
189 |     poly_invntt_tomont(&r->vec[i]);
190 | }
191 | 
192 | /*************************************************
193 | * Name:        polyvec_basemul_acc_montgomery
194 | *
195 | * Description: Multiply elements of a and b in NTT domain, accumulate into r,
196 | *              and multiply by 2^-16.
197 | *
198 | * Arguments: - poly *r: pointer to output polynomial
199 | *            - const polyvec *a: pointer to first input vector of polynomials
200 | *            - const polyvec *b: pointer to second input vector of polynomials
201 | **************************************************/
202 | void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
203 | {
204 |   unsigned int i;
205 |   poly t;
206 | 
207 |   poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]);
208 |   for(i=1;i<KYBER_K;i++) {
209 |     poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]);
210 |     poly_add(r, r, &t);
211 |   }
212 | 
213 |   poly_reduce(r);
214 | }
215 | 
216 | /*************************************************
217 | * Name:        polyvec_reduce
218 | *
219 | * Description: Applies Barrett reduction to each coefficient
220 | *              of each element of a vector of polynomials;
221 | *              for details of the Barrett reduction see comments in reduce.c
222 | *
223 | * Arguments:   - polyvec *r: pointer to input/output polynomial
224 | **************************************************/
225 | void polyvec_reduce(polyvec *r)
226 | {
227 |   unsigned int i;
228 |   for(i=0;i<KYBER_K;i++)
229 |     poly_reduce(&r->vec[i]);
230 | }
231 | 
232 | /*************************************************
233 | * Name:        polyvec_add
234 | *
235 | * Description: Add vectors of polynomials
236 | *
237 | * Arguments: - polyvec *r: pointer to output vector of polynomials
238 | *            - const polyvec *a: pointer to first input vector of polynomials
239 | *            - const polyvec *b: pointer to second input vector of polynomials
240 | **************************************************/
241 | void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b)
242 | {
243 |   unsigned int i;
244 |   for(i=0;i<KYBER_K;i++)
245 |     poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
246 | }
247 | 


--------------------------------------------------------------------------------
/avx2/polyvec.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | #include <immintrin.h>
  3 | #include <string.h>
  4 | #include "params.h"
  5 | #include "polyvec.h"
  6 | #include "poly.h"
  7 | #include "ntt.h"
  8 | #include "consts.h"
  9 | 
 10 | #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
/*
 * Compress one polynomial to 10 bits per coefficient and serialize it into
 * the 320-byte array r.
 *
 * Each of the KYBER_N/16 iterations loads one 256-bit vector of sixteen
 * 16-bit coefficients from a (aligned load) and emits 16*10 bits = 20 bytes:
 * a 16-byte unaligned store at r[20*i] plus a 4-byte memcpy at r[20*i+16].
 * All writes therefore stay inside r[0..319].
 */
static void poly_compress10(uint8_t r[320], const poly * restrict a)
{
  unsigned int i;
  __m256i f0, f1, f2;
  __m128i t0, t1;
  /* NOTE(review): v comes from the qdata table at _16XV; presumably the
   * reciprocal-style constant used to avoid a division by q -- confirm in
   * consts.h/consts.c. */
  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
  const __m256i v8 = _mm256_slli_epi16(v,3);
  const __m256i off = _mm256_set1_epi16(15);
  const __m256i shift1 = _mm256_set1_epi16(1 << 12);
  const __m256i mask = _mm256_set1_epi16(1023);
  /* 16-bit multipliers 1,1024,1,1024 per 64-bit group (used by madd below) */
  const __m256i shift2 = _mm256_set1_epi64x((1024LL << 48) + (1LL << 32) + (1024 << 16) + 1);
  /* seen as 32-bit shift counts this is 12,0,12,0,... */
  const __m256i sllvdidx = _mm256_set1_epi64x(12);
  /* low lane: bytes 0-4 and 8-12 become output bytes 0-9;
   * high lane: bytes 0-4 and 8 go to positions 10-15, bytes 9-12 to 0-3 */
  const __m256i shufbidx = _mm256_set_epi8( 8, 4, 3, 2, 1, 0,-1,-1,-1,-1,-1,-1,12,11,10, 9,
                                           -1,-1,-1,-1,-1,-1,12,11,10, 9, 8, 4, 3, 2, 1, 0);

  for(i=0;i<KYBER_N/16;i++) {
    f0 = _mm256_load_si256(&a->vec[i]);
    /* NOTE(review): the next nine instructions look like a division-free
     * computation of the rounded 10-bit quotient, with f1/f2 forming a
     * one-unit correction term -- confirm against the reference
     * implementation before relying on this description. */
    f1 = _mm256_mullo_epi16(f0,v8);
    f2 = _mm256_add_epi16(f0,off);
    f0 = _mm256_slli_epi16(f0,3);
    f0 = _mm256_mulhi_epi16(f0,v);
    f2 = _mm256_sub_epi16(f1,f2);
    f1 = _mm256_andnot_si256(f1,f2);
    f1 = _mm256_srli_epi16(f1,15);
    f0 = _mm256_sub_epi16(f0,f1);
    f0 = _mm256_mulhrs_epi16(f0,shift1);
    /* keep 10 bits per coefficient */
    f0 = _mm256_and_si256(f0,mask);
    /* merge neighbouring coefficients: lo + 1024*hi -> 20 bits per 32-bit lane */
    f0 = _mm256_madd_epi16(f0,shift2);
    /* shift even 32-bit lanes up by 12, then each 64-bit lane down by 12:
     * 40 contiguous bits at the bottom of every 64-bit lane */
    f0 = _mm256_sllv_epi32(f0,sllvdidx);
    f0 = _mm256_srli_epi64(f0,12);
    /* gather the 5 payload bytes of every 64-bit lane */
    f0 = _mm256_shuffle_epi8(f0,shufbidx);
    t0 = _mm256_castsi256_si128(f0);
    t1 = _mm256_extracti128_si256(f0,1);
    /* 16-bit words 5..7 (bytes 10..15) are taken from the high lane */
    t0 = _mm_blend_epi16(t0,t1,0xE0);
    _mm_storeu_si128((__m128i *)&r[20*i+ 0],t0);
    /* the last 4 payload bytes sit in bytes 0..3 of t1 */
    memcpy(&r[20*i+16],&t1,4);
  }
}
 49 | 
/*
 * De-serialize and decompress one polynomial stored with 10 bits per
 * coefficient (320 bytes).
 *
 * Each of the KYBER_N/16 iterations consumes 20 input bytes but performs a
 * full 32-byte unaligned load at a[20*i]; the last iteration therefore reads
 * a[300..331], i.e. 12 bytes beyond the 320 payload bytes. This is why the
 * parameter is declared as a[320+12]: the caller must make those bytes
 * readable (their values do not reach the shuffle output, which only selects
 * source bytes 0..19).
 */
static void poly_decompress10(poly * restrict r, const uint8_t a[320+12])
{
  unsigned int i;
  __m256i f;
  /* 16-bit lanes alternate 4*KYBER_Q, KYBER_Q */
  const __m256i q = _mm256_set1_epi32((KYBER_Q << 16) + 4*KYBER_Q);
  /* per 16-bit output lane: the two input bytes that contain the 10-bit value */
  const __m256i shufbidx = _mm256_set_epi8(11,10,10, 9, 9, 8, 8, 7,
                                            6, 5, 5, 4, 4, 3, 3, 2,
                                            9, 8, 8, 7, 7, 6, 6, 5,
                                            4, 3, 3, 2, 2, 1, 1, 0);
  /* seen as 32-bit shift counts this is 4,0,4,0,... */
  const __m256i sllvdidx = _mm256_set1_epi64x(4);
  /* 16-bit lanes alternate 1023<<3 (8184) and 1023<<5 (32736) */
  const __m256i mask = _mm256_set1_epi32((32736 << 16) + 8184);

  for(i=0;i<KYBER_N/16;i++) {
    f = _mm256_loadu_si256((__m256i *)&a[20*i]);
    /* 64-bit lanes become (0,1,1,2): the high 128-bit half now starts at
     * input byte 8, so the in-lane byte shuffle can reach bytes 10..19 */
    f = _mm256_permute4x64_epi64(f,0x94);
    f = _mm256_shuffle_epi8(f,shufbidx);
    /* NOTE(review): the shifts line each 10-bit field up with the bit
     * positions selected by mask; together with the matching multiplier in
     * q and the rounding multiply-high this presumably yields
     * round(x*KYBER_Q/1024) -- confirm against the reference code. */
    f = _mm256_sllv_epi32(f,sllvdidx);
    f = _mm256_srli_epi16(f,1);
    f = _mm256_and_si256(f,mask);
    f = _mm256_mulhrs_epi16(f,q);
    _mm256_store_si256(&r->vec[i],f);
  }
}
 73 | 
 74 | #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
/*
 * Compress one polynomial to 11 bits per coefficient and serialize it
 * (352 payload bytes).
 *
 * Each of the KYBER_N/16 iterations loads sixteen 16-bit coefficients and
 * produces 16*11 bits = 22 bytes, but stores 24 bytes: 16 at r[22*i] and 8 at
 * r[22*i+16]. The two surplus bytes are zero and are overwritten by the next
 * iteration; after the last iteration they land in r[352] and r[353], which
 * is why the parameter is declared as r[352+2].
 */
static void poly_compress11(uint8_t r[352+2], const poly * restrict a)
{
  unsigned int i;
  __m256i f0, f1, f2;
  __m128i t0, t1;
  /* NOTE(review): v comes from the qdata table at _16XV; presumably the
   * reciprocal-style constant used to avoid a division by q -- confirm in
   * consts.h/consts.c. */
  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
  const __m256i v8 = _mm256_slli_epi16(v,3);
  const __m256i off = _mm256_set1_epi16(36);
  const __m256i shift1 = _mm256_set1_epi16(1 << 13);
  const __m256i mask = _mm256_set1_epi16(2047);
  /* 16-bit multipliers 1,2048,1,2048 per 64-bit group (used by madd below) */
  const __m256i shift2 = _mm256_set1_epi64x((2048LL << 48) + (1LL << 32) + (2048 << 16) + 1);
  /* seen as 32-bit shift counts this is 10,0,10,0,... */
  const __m256i sllvdidx = _mm256_set1_epi64x(10);
  /* per 128-bit lane: low 64-bit word >> 10, high 64-bit word >> 30 */
  const __m256i srlvqidx = _mm256_set_epi64x(30,10,30,10);
  /* low lane: bytes 0..10 stay in place, bytes 11..15 zeroed (index -1);
   * high lane: bytes 0..4 go to positions 11..15, bytes 5..10 to 0..5.
   * The low half doubles as the byte-blend mask further down. */
  const __m256i shufbidx = _mm256_set_epi8( 4, 3, 2, 1, 0, 0,-1,-1,-1,-1,10, 9, 8, 7, 6, 5,
                                           -1,-1,-1,-1,-1,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

  for(i=0;i<KYBER_N/16;i++) {
    f0 = _mm256_load_si256(&a->vec[i]);
    /* NOTE(review): the next nine instructions look like a division-free
     * computation of the rounded 11-bit quotient, with f1/f2 forming a
     * one-unit correction term -- confirm against the reference
     * implementation before relying on this description. */
    f1 = _mm256_mullo_epi16(f0,v8);
    f2 = _mm256_add_epi16(f0,off);
    f0 = _mm256_slli_epi16(f0,3);
    f0 = _mm256_mulhi_epi16(f0,v);
    f2 = _mm256_sub_epi16(f1,f2);
    f1 = _mm256_andnot_si256(f1,f2);
    f1 = _mm256_srli_epi16(f1,15);
    f0 = _mm256_sub_epi16(f0,f1);
    f0 = _mm256_mulhrs_epi16(f0,shift1);
    /* keep 11 bits per coefficient */
    f0 = _mm256_and_si256(f0,mask);
    /* merge neighbouring coefficients: lo + 2048*hi -> 22 bits per 32-bit lane */
    f0 = _mm256_madd_epi16(f0,shift2);
    /* even 32-bit lanes up by 10: 44 contiguous bits (bits 10..53) per 64-bit lane */
    f0 = _mm256_sllv_epi32(f0,sllvdidx);
    /* f1: upper 64-bit word of each 128-bit lane moved to the lower position */
    f1 = _mm256_bsrli_epi128(f0,8);
    /* lower word: 44 bits at 0..43; upper word: its top 24 bits at 0..23 */
    f0 = _mm256_srlv_epi64(f0,srlvqidx);
    /* low 20 bits of the upper word's field placed at bits 44..63 */
    f1 = _mm256_slli_epi64(f1,34);
    /* bit ranges do not overlap in the lower word, so add acts as OR there:
     * 88 contiguous bits in each 128-bit lane (bytes 0..10 are the ones used) */
    f0 = _mm256_add_epi64(f0,f1);
    f0 = _mm256_shuffle_epi8(f0,shufbidx);
    t0 = _mm256_castsi256_si128(f0);
    t1 = _mm256_extracti128_si256(f0,1);
    /* mask bytes with the top bit set (positions 11..15) select t1 */
    t0 = _mm_blendv_epi8(t0,t1,_mm256_castsi256_si128(shufbidx));
    _mm_storeu_si128((__m128i *)&r[22*i+ 0],t0);
    /* 8-byte store: 6 payload bytes followed by 2 zero bytes */
    _mm_storel_epi64((__m128i *)&r[22*i+16],t1);
  }
}
117 | 
/*
 * De-serialize and decompress one polynomial stored with 11 bits per
 * coefficient (352 bytes).
 *
 * Each of the KYBER_N/16 iterations consumes 22 input bytes but performs a
 * full 32-byte unaligned load at a[22*i]; the last iteration therefore reads
 * a[330..361], i.e. 10 bytes beyond the 352 payload bytes. This is why the
 * parameter is declared as a[352+10]: the caller must make those bytes
 * readable (the shuffle only selects source bytes 0..21).
 */
static void poly_decompress11(poly * restrict r, const uint8_t a[352+10])
{
  unsigned int i;
  __m256i f;
  /* NOTE(review): loaded from the qdata table at _16XQ; presumably sixteen
   * copies of KYBER_Q -- confirm in consts.h/consts.c. */
  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
  /* per 16-bit output lane: two input bytes that overlap the 11-bit value */
  const __m256i shufbidx = _mm256_set_epi8(13,12,12,11,10, 9, 9, 8,
                                            8, 7, 6, 5, 5, 4, 4, 3,
                                           10, 9, 9, 8, 7, 6, 6, 5,
                                            5, 4, 3, 2, 2, 1, 1, 0);
  /* per-lane shift counts and multipliers used to align the 11-bit fields */
  const __m256i srlvdidx = _mm256_set_epi32(0,0,1,0,0,0,1,0);
  const __m256i srlvqidx = _mm256_set_epi64x(2,0,2,0);
  const __m256i shift = _mm256_set_epi16(4,32,1,8,32,1,4,32,4,32,1,8,32,1,4,32);
  /* 32752 = 2047 << 4: an 11-bit field at bits 4..14 of every 16-bit lane */
  const __m256i mask = _mm256_set1_epi16(32752);

  for(i=0;i<KYBER_N/16;i++) {
    f = _mm256_loadu_si256((__m256i *)&a[22*i]);
    /* 64-bit lanes become (0,1,1,2): the high 128-bit half now starts at
     * input byte 8, so the in-lane byte shuffle can reach bytes 11..21 */
    f = _mm256_permute4x64_epi64(f,0x94);
    f = _mm256_shuffle_epi8(f,shufbidx);
    /* NOTE(review): the variable shifts and the per-lane multiply move each
     * 11-bit field to the position selected by mask; the rounding
     * multiply-high by q then presumably yields round(x*KYBER_Q/2048) --
     * confirm against the reference code. */
    f = _mm256_srlv_epi32(f,srlvdidx);
    f = _mm256_srlv_epi64(f,srlvqidx);
    f = _mm256_mullo_epi16(f,shift);
    f = _mm256_srli_epi16(f,1);
    f = _mm256_and_si256(f,mask);
    f = _mm256_mulhrs_epi16(f,q);
    _mm256_store_si256(&r->vec[i],f);
  }
}
145 | 
146 | #endif
147 | 
148 | /*************************************************
149 | * Name:        polyvec_compress
150 | *
151 | * Description: Compress and serialize vector of polynomials
152 | *
153 | * Arguments:   - uint8_t *r: pointer to output byte array
154 | *                            (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
155 | *              - polyvec *a: pointer to input vector of polynomials
156 | **************************************************/
157 | void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES+2], const polyvec *a)
158 | {
159 |   unsigned int i;
160 | 
161 | #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
162 |   for(i=0;i<KYBER_K;i++)
163 |     poly_compress10(&r[320*i],&a->vec[i]);
164 | #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
165 |   for(i=0;i<KYBER_K;i++)
166 |     poly_compress11(&r[352*i],&a->vec[i]);
167 | #endif
168 | }
169 | 
170 | /*************************************************
171 | * Name:        polyvec_decompress
172 | *
173 | * Description: De-serialize and decompress vector of polynomials;
174 | *              approximate inverse of polyvec_compress
175 | *
176 | * Arguments:   - polyvec *r: pointer to output vector of polynomials
177 | *              - const uint8_t *a: pointer to input byte array
178 | *                                  (of length KYBER_POLYVECCOMPRESSEDBYTES)
179 | **************************************************/
180 | void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES+12])
181 | {
182 |   unsigned int i;
183 | 
184 | #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
185 |   for(i=0;i<KYBER_K;i++)
186 |     poly_decompress10(&r->vec[i],&a[320*i]);
187 | #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
188 |   for(i=0;i<KYBER_K;i++)
189 |     poly_decompress11(&r->vec[i],&a[352*i]);
190 | #endif
191 | }
192 | 
193 | /*************************************************
194 | * Name:        polyvec_tobytes
195 | *
196 | * Description: Serialize vector of polynomials
197 | *
198 | * Arguments:   - uint8_t *r: pointer to output byte array
199 | *                            (needs space for KYBER_POLYVECBYTES)
200 | *              - polyvec *a: pointer to input vector of polynomials
201 | **************************************************/
202 | void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a)
203 | {
204 |   unsigned int i;
205 |   for(i=0;i<KYBER_K;i++)
206 |     poly_tobytes(r+i*KYBER_POLYBYTES, &a->vec[i]);
207 | }
208 | 
209 | /*************************************************
210 | * Name:        polyvec_frombytes
211 | *
212 | * Description: De-serialize vector of polynomials;
213 | *              inverse of polyvec_tobytes
214 | *
215 | * Arguments:   - uint8_t *r: pointer to output byte array
216 | *              - const polyvec *a: pointer to input vector of polynomials
217 | *                                  (of length KYBER_POLYVECBYTES)
218 | **************************************************/
219 | void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES])
220 | {
221 |   unsigned int i;
222 |   for(i=0;i<KYBER_K;i++)
223 |     poly_frombytes(&r->vec[i], a+i*KYBER_POLYBYTES);
224 | }
225 | 
226 | /*************************************************
227 | * Name:        polyvec_ntt
228 | *
229 | * Description: Apply forward NTT to all elements of a vector of polynomials
230 | *
231 | * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
232 | **************************************************/
233 | void polyvec_ntt(polyvec *r)
234 | {
235 |   unsigned int i;
236 |   for(i=0;i<KYBER_K;i++)
237 |     poly_ntt(&r->vec[i]);
238 | }
239 | 
240 | /*************************************************
241 | * Name:        polyvec_invntt_tomont
242 | *
243 | * Description: Apply inverse NTT to all elements of a vector of polynomials
244 | *              and multiply by Montgomery factor 2^16
245 | *
246 | * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
247 | **************************************************/
248 | void polyvec_invntt_tomont(polyvec *r)
249 | {
250 |   unsigned int i;
251 |   for(i=0;i<KYBER_K;i++)
252 |     poly_invntt_tomont(&r->vec[i]);
253 | }
254 | 
255 | /*************************************************
256 | * Name:        polyvec_basemul_acc_montgomery
257 | *
258 | * Description: Multiply elements in a and b in NTT domain, accumulate into r,
259 | *              and multiply by 2^-16.
260 | *
261 | * Arguments: - poly *r: pointer to output polynomial
262 | *            - const polyvec *a: pointer to first input vector of polynomials
263 | *            - const polyvec *b: pointer to second input vector of polynomials
264 | **************************************************/
265 | void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
266 | {
267 |   unsigned int i;
268 |   poly tmp;
269 | 
270 |   poly_basemul_montgomery(r,&a->vec[0],&b->vec[0]);
271 |   for(i=1;i<KYBER_K;i++) {
272 |     poly_basemul_montgomery(&tmp,&a->vec[i],&b->vec[i]);
273 |     poly_add(r,r,&tmp);
274 |   }
275 | }
276 | 
277 | /*************************************************
278 | * Name:        polyvec_reduce
279 | *
280 | * Description: Applies Barrett reduction to each coefficient
281 | *              of each element of a vector of polynomials;
282 | *              for details of the Barrett reduction see comments in reduce.c
283 | *
284 | * Arguments:   - polyvec *r: pointer to input/output polynomial
285 | **************************************************/
286 | void polyvec_reduce(polyvec *r)
287 | {
288 |   unsigned int i;
289 |   for(i=0;i<KYBER_K;i++)
290 |     poly_reduce(&r->vec[i]);
291 | }
292 | 
293 | /*************************************************
294 | * Name:        polyvec_add
295 | *
296 | * Description: Add vectors of polynomials
297 | *
298 | * Arguments: - polyvec *r:       pointer to output vector of polynomials
299 | *            - const polyvec *a: pointer to first input vector of polynomials
300 | *            - const polyvec *b: pointer to second input vector of polynomials
301 | **************************************************/
302 | void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b)
303 | {
304 |   unsigned int i;
305 |   for(i=0;i<KYBER_K;i++)
306 |     poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
307 | }
308 | 


--------------------------------------------------------------------------------
/ref/poly.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | #include "params.h"
  3 | #include "poly.h"
  4 | #include "ntt.h"
  5 | #include "reduce.h"
  6 | #include "cbd.h"
  7 | #include "symmetric.h"
  8 | #include "verify.h"
  9 | 
 10 | /*************************************************
 11 | * Name:        poly_compress
 12 | *
 13 | * Description: Compression and subsequent serialization of a polynomial
 14 | *
 15 | * Arguments:   - uint8_t *r: pointer to output byte array
 16 | *                            (of length KYBER_POLYCOMPRESSEDBYTES)
 17 | *              - const poly *a: pointer to input polynomial
 18 | **************************************************/
 19 | void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a)
 20 | {
 21 |   unsigned int i,j;
 22 |   int16_t u;
 23 |   uint32_t d0;
 24 |   uint8_t t[8];
 25 | 
 26 | #if (KYBER_POLYCOMPRESSEDBYTES == 128)
 27 | 
 28 |   for(i=0;i<KYBER_N/8;i++) {
 29 |     for(j=0;j<8;j++) {
 30 |       // map to positive standard representatives
 31 |       u  = a->coeffs[8*i+j];
 32 |       u += (u >> 15) & KYBER_Q;
 33 | /*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */
 34 |       d0 = u << 4;
 35 |       d0 += 1665;
 36 |       d0 *= 80635;
 37 |       d0 >>= 28;
 38 |       t[j] = d0 & 0xf;
 39 |     }
 40 | 
 41 |     r[0] = t[0] | (t[1] << 4);
 42 |     r[1] = t[2] | (t[3] << 4);
 43 |     r[2] = t[4] | (t[5] << 4);
 44 |     r[3] = t[6] | (t[7] << 4);
 45 |     r += 4;
 46 |   }
 47 | #elif (KYBER_POLYCOMPRESSEDBYTES == 160)
 48 |   for(i=0;i<KYBER_N/8;i++) {
 49 |     for(j=0;j<8;j++) {
 50 |       // map to positive standard representatives
 51 |       u  = a->coeffs[8*i+j];
 52 |       u += (u >> 15) & KYBER_Q;
 53 | /*    t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */
 54 |       d0 = u << 5;
 55 |       d0 += 1664;
 56 |       d0 *= 40318;
 57 |       d0 >>= 27;
 58 |       t[j] = d0 & 0x1f;
 59 |     }
 60 | 
 61 |     r[0] = (t[0] >> 0) | (t[1] << 5);
 62 |     r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
 63 |     r[2] = (t[3] >> 1) | (t[4] << 4);
 64 |     r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
 65 |     r[4] = (t[6] >> 2) | (t[7] << 3);
 66 |     r += 5;
 67 |   }
 68 | #else
 69 | #error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
 70 | #endif
 71 | }
 72 | 
 73 | /*************************************************
 74 | * Name:        poly_decompress
 75 | *
 76 | * Description: De-serialization and subsequent decompression of a polynomial;
 77 | *              approximate inverse of poly_compress
 78 | *
 79 | * Arguments:   - poly *r: pointer to output polynomial
 80 | *              - const uint8_t *a: pointer to input byte array
 81 | *                                  (of length KYBER_POLYCOMPRESSEDBYTES bytes)
 82 | **************************************************/
 83 | void poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES])
 84 | {
 85 |   unsigned int i;
 86 | 
 87 | #if (KYBER_POLYCOMPRESSEDBYTES == 128)
 88 |   for(i=0;i<KYBER_N/2;i++) {
 89 |     r->coeffs[2*i+0] = (((uint16_t)(a[0] & 15)*KYBER_Q) + 8) >> 4;
 90 |     r->coeffs[2*i+1] = (((uint16_t)(a[0] >> 4)*KYBER_Q) + 8) >> 4;
 91 |     a += 1;
 92 |   }
 93 | #elif (KYBER_POLYCOMPRESSEDBYTES == 160)
 94 |   unsigned int j;
 95 |   uint8_t t[8];
 96 |   for(i=0;i<KYBER_N/8;i++) {
 97 |     t[0] = (a[0] >> 0);
 98 |     t[1] = (a[0] >> 5) | (a[1] << 3);
 99 |     t[2] = (a[1] >> 2);
100 |     t[3] = (a[1] >> 7) | (a[2] << 1);
101 |     t[4] = (a[2] >> 4) | (a[3] << 4);
102 |     t[5] = (a[3] >> 1);
103 |     t[6] = (a[3] >> 6) | (a[4] << 2);
104 |     t[7] = (a[4] >> 3);
105 |     a += 5;
106 | 
107 |     for(j=0;j<8;j++)
108 |       r->coeffs[8*i+j] = ((uint32_t)(t[j] & 31)*KYBER_Q + 16) >> 5;
109 |   }
110 | #else
111 | #error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
112 | #endif
113 | }
114 | 
115 | /*************************************************
116 | * Name:        poly_tobytes
117 | *
118 | * Description: Serialization of a polynomial
119 | *
120 | * Arguments:   - uint8_t *r: pointer to output byte array
121 | *                            (needs space for KYBER_POLYBYTES bytes)
122 | *              - const poly *a: pointer to input polynomial
123 | **************************************************/
124 | void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a)
125 | {
126 |   unsigned int i;
127 |   uint16_t t0, t1;
128 | 
129 |   for(i=0;i<KYBER_N/2;i++) {
130 |     // map to positive standard representatives
131 |     t0  = a->coeffs[2*i];
132 |     t0 += ((int16_t)t0 >> 15) & KYBER_Q;
133 |     t1 = a->coeffs[2*i+1];
134 |     t1 += ((int16_t)t1 >> 15) & KYBER_Q;
135 |     r[3*i+0] = (t0 >> 0);
136 |     r[3*i+1] = (t0 >> 8) | (t1 << 4);
137 |     r[3*i+2] = (t1 >> 4);
138 |   }
139 | }
140 | 
141 | /*************************************************
142 | * Name:        poly_frombytes
143 | *
144 | * Description: De-serialization of a polynomial;
145 | *              inverse of poly_tobytes
146 | *
147 | * Arguments:   - poly *r: pointer to output polynomial
148 | *              - const uint8_t *a: pointer to input byte array
149 | *                                  (of KYBER_POLYBYTES bytes)
150 | **************************************************/
151 | void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES])
152 | {
153 |   unsigned int i;
154 |   for(i=0;i<KYBER_N/2;i++) {
155 |     r->coeffs[2*i]   = ((a[3*i+0] >> 0) | ((uint16_t)a[3*i+1] << 8)) & 0xFFF;
156 |     r->coeffs[2*i+1] = ((a[3*i+1] >> 4) | ((uint16_t)a[3*i+2] << 4)) & 0xFFF;
157 |   }
158 | }
159 | 
160 | /*************************************************
161 | * Name:        poly_frommsg
162 | *
163 | * Description: Convert 32-byte message to polynomial
164 | *
165 | * Arguments:   - poly *r: pointer to output polynomial
166 | *              - const uint8_t *msg: pointer to input message
167 | **************************************************/
168 | void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES])
169 | {
170 |   unsigned int i,j;
171 | 
172 | #if (KYBER_INDCPA_MSGBYTES != KYBER_N/8)
173 | #error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!"
174 | #endif
175 | 
176 |   for(i=0;i<KYBER_N/8;i++) {
177 |     for(j=0;j<8;j++) {
178 |       r->coeffs[8*i+j] = 0;
179 |       cmov_int16(r->coeffs+8*i+j, ((KYBER_Q+1)/2), (msg[i] >> j)&1);
180 |     }
181 |   }
182 | }
183 | 
184 | /*************************************************
185 | * Name:        poly_tomsg
186 | *
187 | * Description: Convert polynomial to 32-byte message
188 | *
189 | * Arguments:   - uint8_t *msg: pointer to output message
190 | *              - const poly *a: pointer to input polynomial
191 | **************************************************/
192 | void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a)
193 | {
194 |   unsigned int i,j;
195 |   uint32_t t;
196 | 
197 |   for(i=0;i<KYBER_N/8;i++) {
198 |     msg[i] = 0;
199 |     for(j=0;j<8;j++) {
200 |       t  = a->coeffs[8*i+j];
201 |       // t += ((int16_t)t >> 15) & KYBER_Q;
202 |       // t  = (((t << 1) + KYBER_Q/2)/KYBER_Q) & 1;
203 |       t <<= 1;
204 |       t += 1665;
205 |       t *= 80635;
206 |       t >>= 28;
207 |       t &= 1;
208 |       msg[i] |= t << j;
209 |     }
210 |   }
211 | }
212 | 
213 | /*************************************************
214 | * Name:        poly_getnoise_eta1
215 | *
216 | * Description: Sample a polynomial deterministically from a seed and a nonce,
217 | *              with output polynomial close to centered binomial distribution
218 | *              with parameter KYBER_ETA1
219 | *
220 | * Arguments:   - poly *r: pointer to output polynomial
221 | *              - const uint8_t *seed: pointer to input seed
222 | *                                     (of length KYBER_SYMBYTES bytes)
223 | *              - uint8_t nonce: one-byte input nonce
224 | **************************************************/
225 | void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
226 | {
227 |   uint8_t buf[KYBER_ETA1*KYBER_N/4];
228 |   prf(buf, sizeof(buf), seed, nonce);
229 |   poly_cbd_eta1(r, buf);
230 | }
231 | 
232 | /*************************************************
233 | * Name:        poly_getnoise_eta2
234 | *
235 | * Description: Sample a polynomial deterministically from a seed and a nonce,
236 | *              with output polynomial close to centered binomial distribution
237 | *              with parameter KYBER_ETA2
238 | *
239 | * Arguments:   - poly *r: pointer to output polynomial
240 | *              - const uint8_t *seed: pointer to input seed
241 | *                                     (of length KYBER_SYMBYTES bytes)
242 | *              - uint8_t nonce: one-byte input nonce
243 | **************************************************/
244 | void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
245 | {
246 |   uint8_t buf[KYBER_ETA2*KYBER_N/4];
247 |   prf(buf, sizeof(buf), seed, nonce);
248 |   poly_cbd_eta2(r, buf);
249 | }
250 | 
251 | 
252 | /*************************************************
253 | * Name:        poly_ntt
254 | *
255 | * Description: Computes negacyclic number-theoretic transform (NTT) of
256 | *              a polynomial in place;
257 | *              inputs assumed to be in normal order, output in bitreversed order
258 | *
259 | * Arguments:   - uint16_t *r: pointer to in/output polynomial
260 | **************************************************/
261 | void poly_ntt(poly *r)
262 | {
263 |   ntt(r->coeffs);
264 |   poly_reduce(r);
265 | }
266 | 
267 | /*************************************************
268 | * Name:        poly_invntt_tomont
269 | *
270 | * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
271 | *              of a polynomial in place;
272 | *              inputs assumed to be in bitreversed order, output in normal order
273 | *
274 | * Arguments:   - uint16_t *a: pointer to in/output polynomial
275 | **************************************************/
276 | void poly_invntt_tomont(poly *r)
277 | {
278 |   invntt(r->coeffs);
279 | }
280 | 
281 | /*************************************************
282 | * Name:        poly_basemul_montgomery
283 | *
284 | * Description: Multiplication of two polynomials in NTT domain
285 | *
286 | * Arguments:   - poly *r: pointer to output polynomial
287 | *              - const poly *a: pointer to first input polynomial
288 | *              - const poly *b: pointer to second input polynomial
289 | **************************************************/
290 | void poly_basemul_montgomery(poly *r, const poly *a, const poly *b)
291 | {
292 |   unsigned int i;
293 |   for(i=0;i<KYBER_N/4;i++) {
294 |     basemul(&r->coeffs[4*i], &a->coeffs[4*i], &b->coeffs[4*i], zetas[64+i]);
295 |     basemul(&r->coeffs[4*i+2], &a->coeffs[4*i+2], &b->coeffs[4*i+2], -zetas[64+i]);
296 |   }
297 | }
298 | 
299 | /*************************************************
300 | * Name:        poly_tomont
301 | *
302 | * Description: Inplace conversion of all coefficients of a polynomial
303 | *              from normal domain to Montgomery domain
304 | *
305 | * Arguments:   - poly *r: pointer to input/output polynomial
306 | **************************************************/
307 | void poly_tomont(poly *r)
308 | {
309 |   unsigned int i;
310 |   const int16_t f = (1ULL << 32) % KYBER_Q;
311 |   for(i=0;i<KYBER_N;i++)
312 |     r->coeffs[i] = montgomery_reduce((int32_t)r->coeffs[i]*f);
313 | }
314 | 
315 | /*************************************************
316 | * Name:        poly_reduce
317 | *
318 | * Description: Applies Barrett reduction to all coefficients of a polynomial
319 | *              for details of the Barrett reduction see comments in reduce.c
320 | *
321 | * Arguments:   - poly *r: pointer to input/output polynomial
322 | **************************************************/
323 | void poly_reduce(poly *r)
324 | {
325 |   unsigned int i;
326 |   for(i=0;i<KYBER_N;i++)
327 |     r->coeffs[i] = barrett_reduce(r->coeffs[i]);
328 | }
329 | 
330 | /*************************************************
331 | * Name:        poly_add
332 | *
333 | * Description: Add two polynomials; no modular reduction is performed
334 | *
335 | * Arguments: - poly *r: pointer to output polynomial
336 | *            - const poly *a: pointer to first input polynomial
337 | *            - const poly *b: pointer to second input polynomial
338 | **************************************************/
339 | void poly_add(poly *r, const poly *a, const poly *b)
340 | {
341 |   unsigned int i;
342 |   for(i=0;i<KYBER_N;i++)
343 |     r->coeffs[i] = a->coeffs[i] + b->coeffs[i];
344 | }
345 | 
346 | /*************************************************
347 | * Name:        poly_sub
348 | *
349 | * Description: Subtract two polynomials; no modular reduction is performed
350 | *
351 | * Arguments: - poly *r:       pointer to output polynomial
352 | *            - const poly *a: pointer to first input polynomial
353 | *            - const poly *b: pointer to second input polynomial
354 | **************************************************/
355 | void poly_sub(poly *r, const poly *a, const poly *b)
356 | {
357 |   unsigned int i;
358 |   for(i=0;i<KYBER_N;i++)
359 |     r->coeffs[i] = a->coeffs[i] - b->coeffs[i];
360 | }
361 | 


--------------------------------------------------------------------------------