├── src
    ├── NOTES.md
    ├── sort.h
    ├── bitrev16_table.c
    ├── bitrev16_table.h
    ├── bitrev256_table.h
    ├── bitrev512_table.h
    ├── test_avx_support.c
    ├── bitrev1024_table.h
    ├── data_poly1024.h
    ├── tests_in_paper
    │   ├── test_ntt_red1024b.c
    │   ├── test_ntt_red1024.c
    │   ├── test_intt_red1024b.c
    │   ├── test_ntt_red1024e.c
    │   ├── test_ntt_red1024f.c
    │   ├── test_intt_red1024.c
    │   ├── test_ntt_red1024d.c
    │   ├── test_ntt_red1024c.c
    │   └── Makefile
    ├── test_shift.c
    ├── test_bitrev_tables.h
    ├── ntt32_tables.h
    ├── sort.c
    ├── ntt16.c
    ├── ntt256.c
    ├── ntt512.c
    ├── ntt1024.c
    ├── test_ntt_tables.h
    ├── kat_mul1024.c
    ├── test_mul1024.c
    ├── naive_ntt16.c
    ├── kat_mul1024_red.c
    ├── naive_ntt256.c
    ├── naive_ntt512.c
    ├── kat_mul1024_red_asm.c
    ├── naive_ntt1024.c
    ├── bitrev256_table.c
    ├── intervals.h
    ├── ntt256.h
    ├── ntt512.h
    ├── ntt1024.h
    ├── ntt_red16.h
    ├── ntt_red256.h
    ├── ntt_red512.h
    ├── speed_mul1024.c
    ├── ntt_red1024.h
    ├── speed_mul1024_red.c
    ├── ntt_red_asm16.h
    ├── speed_mul1024_naive.c
    ├── speed_mul1024_red_asm.c
    ├── ntt_red_asm256.h
    ├── ntt_red_asm512.h
    ├── ntt_red_asm1024.h
    ├── red_bounds.h
    ├── ntt16.h
    ├── naive_ntt16.h
    ├── README.md
    ├── naive_ntt256.h
    ├── naive_ntt512.h
    ├── test_ntt_red_tables.h
    ├── naive_ntt1024.h
    ├── ntt32_tables.c
    ├── make_bitrev_table.c
    ├── ntt_red16.c
    ├── ntt_red256.c
    ├── ntt_red512.c
    ├── ntt_red1024.c
    ├── test_mod.c
    ├── bitrev512_table.c
    ├── ntt_red_asm16.c
    ├── ntt_red_asm1024.c
    ├── ntt_red_asm256.c
    ├── ntt_red_asm512.c
    └── test_red.c
├── paper
    ├── slides.pdf
    └── main_final.pdf
├── verifier
    ├── vstte20-benchmarks
    │   ├── sort.h
    │   ├── bitrev1024_table.h
    │   ├── harness_ntt_red1024b.c
    │   ├── harness_ntt_red1024e.c
    │   ├── harness_intt_red1024b.c
    │   ├── harness_ntt_red1024.c
    │   ├── harness_ntt_red1024f.c
    │   ├── harness_intt_red1024.c
    │   ├── harness_ntt_red1024c.c
    │   ├── harness_ntt_red1024d.c
    │   ├── sort.c
    │   ├── verify.sh
    │   ├── clam.h
    │   ├── verify_all
    │   ├── ntt_red1024.h
    │   ├── red_bounds.h
    │   ├── make_bitrev_table.c
    │   └── ntt_red1024.c
    ├── install.sh
    └── src
    │   └── CMakeLists.txt
├── .gitignore
├── LICENSE
├── README.md
└── data
    └── primitive-roots-1024.txt


/src/NOTES.md:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/paper/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SRI-CSL/NTT/HEAD/paper/slides.pdf


--------------------------------------------------------------------------------
/paper/main_final.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SRI-CSL/NTT/HEAD/paper/main_final.pdf


--------------------------------------------------------------------------------
/src/sort.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Sort an array of uint64_t numbers in increasing order
 3 |  */
 4 | 
 5 | #ifndef __SORT_H
 6 | #define __SORT_H
 7 | 
 8 | #include <stdint.h>
 9 | 
10 | extern void sort(uint64_t *a, uint32_t n);
11 | 
12 | #endif
13 | 


--------------------------------------------------------------------------------
/src/bitrev16_table.c:
--------------------------------------------------------------------------------
1 | #include "bitrev16_table.h"
2 | 
/*
 * Bit-reverse pairs for n = 16: each entry {i, j} has j equal to i with its
 * 4 bits reversed, and i < j. Indices that are their own reversal are omitted,
 * which leaves BITREV16_NPAIRS entries.
 */
const uint16_t bitrev16[BITREV16_NPAIRS][2] = {
    {     1,     8 }, {     2,     4 }, {     3,    12 }, {     5,    10 },
    {     7,    14 }, {    11,    13 },
};
7 | 
8 | 


--------------------------------------------------------------------------------
/src/bitrev16_table.h:
--------------------------------------------------------------------------------
 1 | #ifndef __BITREV16_TABLE_H
 2 | #define __BITREV16_TABLE_H
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | #define BITREV16_NPAIRS 6
 7 | 
 8 | extern const uint16_t bitrev16[BITREV16_NPAIRS][2];
 9 | 
10 | #endif /* __BITREV16_TABLE_H */
11 | 


--------------------------------------------------------------------------------
/src/bitrev256_table.h:
--------------------------------------------------------------------------------
 1 | #ifndef __BITREV256_TABLE_H
 2 | #define __BITREV256_TABLE_H
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | #define BITREV256_NPAIRS 120
 7 | 
 8 | extern const uint16_t bitrev256[BITREV256_NPAIRS][2];
 9 | 
10 | #endif /* __BITREV256_TABLE_H */
11 | 


--------------------------------------------------------------------------------
/src/bitrev512_table.h:
--------------------------------------------------------------------------------
 1 | #ifndef __BITREV512_TABLE_H
 2 | #define __BITREV512_TABLE_H
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | #define BITREV512_NPAIRS 240
 7 | 
 8 | extern const uint16_t bitrev512[BITREV512_NPAIRS][2];
 9 | 
10 | #endif /* __BITREV512_TABLE_H */
11 | 


--------------------------------------------------------------------------------
/src/test_avx_support.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include "ntt_asm.h"
 3 | 
 4 | int main(void) {
 5 |   if (avx2_supported()) {
 6 |     printf("AVX2 is supported\n");
 7 |   } else {
 8 |     printf("AVX2 is not supported\n");
 9 |   }
10 |   return 0;
11 | }
12 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/sort.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Sort an array of uint64_t numbers in increasing order
 3 |  */
 4 | 
 5 | #ifndef __SORT_H
 6 | #define __SORT_H
 7 | 
 8 | #include <stdint.h>
 9 | 
10 | extern void sort(uint64_t *a, uint32_t n);
11 | 
12 | #endif
13 | 


--------------------------------------------------------------------------------
/src/bitrev1024_table.h:
--------------------------------------------------------------------------------
 1 | #ifndef __BITREV1024_TABLE_H
 2 | #define __BITREV1024_TABLE_H
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | #define BITREV1024_NPAIRS 496
 7 | 
 8 | extern const uint16_t bitrev1024[BITREV1024_NPAIRS][2];
 9 | 
10 | #endif /* __BITREV1024_TABLE_H */
11 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/bitrev1024_table.h:
--------------------------------------------------------------------------------
 1 | #ifndef __BITREV1024_TABLE_H
 2 | #define __BITREV1024_TABLE_H
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | #define BITREV1024_NPAIRS 496
 7 | 
 8 | extern const uint16_t bitrev1024[BITREV1024_NPAIRS][2];
 9 | 
10 | #endif /* __BITREV1024_TABLE_H */
11 | 


--------------------------------------------------------------------------------
/verifier/install.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | mkdir build && cd build
 4 | cmake .. 
 5 | cmake --build . --target clam-seadsa && cmake ..
 6 | cmake --build . --target clam-seallvm && cmake ..
 7 | cmake --build . --target ntt-clam && cmake ..
 8 | cmake --build . --target crab && cmake ..
 9 | cmake --build . --target install
10 | 


--------------------------------------------------------------------------------
/src/data_poly1024.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Declarations for KAT
 3 |  */
 4 | 
 5 | #ifndef __DATA_POLY1024_H
 6 | #define __DATA_POLY1024_H
 7 | 
 8 | #include <stdint.h>
 9 | 
/*
 * Dimensions of the known-answer-test data:
 * - REPETITIONS rows per array
 * - N coefficients per row. This is data for polynomials with 1024
 *   coefficients, so N must be 1024 (it was 1025, one element too many).
 *   NOTE(review): the defining .c file is not visible here; confirm that it
 *   uses the same dimensions.
 */
#define REPETITIONS 100
#define N 1024

extern int32_t a[REPETITIONS][N], b[REPETITIONS][N], c[REPETITIONS][N];
14 | 
15 | extern void build_kat(void);
16 | 
17 | #endif /* __DATA_POLY1024_H */
18 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/harness_ntt_red1024b.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdbool.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <inttypes.h>
 6 | 
 7 | #include "ntt.h"
 8 | #include "bitrev1024_table.h"
 9 | #include "ntt_red1024.h"
10 | #include "sort.h"
11 | 
12 | #include "clam.h"
13 | 
14 | #define Q 12289
15 | 
16 | static int32_t nd_a[1024];
17 | 
/*
 * Verification harness: applies ASSUME_FORALL(nd_a, 1024, 0, Q) to the
 * 1024-element input array, then runs ntt_red_ct_rev2std on it with the
 * ntt_red1024_omega_powers table.
 * NOTE(review): ASSUME_FORALL comes from clam.h (not shown); presumably it
 * constrains every nd_a[i] to the range given by its last two arguments.
 * Whether the upper bound Q is inclusive should be confirmed there.
 */
int main(void) {
  ASSUME_FORALL(nd_a, 1024, 0, Q)
  // defined in ntt_red.c
  ntt_red_ct_rev2std(nd_a, 1024, ntt_red1024_omega_powers);

  return 0;
}
25 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/harness_ntt_red1024e.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdbool.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <inttypes.h>
 6 | 
 7 | #include "ntt.h"
 8 | #include "bitrev1024_table.h"
 9 | #include "ntt_red1024.h"
10 | #include "sort.h"
11 | 
12 | #include "clam.h"
13 | 
14 | #define Q 12289
15 | 
16 | static int32_t nd_a[1024];
17 | 
/*
 * Verification harness: applies ASSUME_FORALL(nd_a, 1024, 0, Q) to the
 * 1024-element input array, then runs ntt_red_gs_std2rev on it with the
 * ntt_red1024_omega_powers table.
 * NOTE(review): ASSUME_FORALL comes from clam.h (not shown); presumably it
 * constrains every nd_a[i] to the range given by its last two arguments.
 * Whether the upper bound Q is inclusive should be confirmed there.
 */
int main(void) {
  ASSUME_FORALL(nd_a, 1024, 0, Q)
  // defined in ntt_red.c
  ntt_red_gs_std2rev(nd_a, 1024, ntt_red1024_omega_powers);

  return 0;
}
25 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/harness_intt_red1024b.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdbool.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <inttypes.h>
 6 | 
 7 | #include "ntt.h"
 8 | #include "bitrev1024_table.h"
 9 | #include "ntt_red1024.h"
10 | #include "sort.h"
11 | 
12 | #include "clam.h"
13 | 
14 | #define Q 12289
15 | 
16 | static int32_t nd_a[1024];
17 | 
/*
 * Verification harness: applies ASSUME_FORALL(nd_a, 1024, 0, Q) to the
 * 1024-element input array, then runs ntt_red_ct_rev2std on it with the
 * ntt_red1024_inv_omega_powers table.
 * NOTE(review): ASSUME_FORALL comes from clam.h (not shown); presumably it
 * constrains every nd_a[i] to the range given by its last two arguments.
 * Whether the upper bound Q is inclusive should be confirmed there.
 */
int main(void) {
  ASSUME_FORALL(nd_a, 1024, 0, Q)
  // defined in ntt_red.c
  ntt_red_ct_rev2std(nd_a, 1024, ntt_red1024_inv_omega_powers);

  return 0;
}
25 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/harness_ntt_red1024.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdbool.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <inttypes.h>
 6 | 
 7 | #include "ntt.h"
 8 | #include "bitrev1024_table.h"
 9 | #include "ntt_red1024.h"
10 | #include "sort.h"
11 | 
12 | #include "clam.h"
13 | 
14 | #define Q 12289
15 | 
16 | static int32_t nd_a[1024];
17 | 
/*
 * Verification harness: applies ASSUME_FORALL(nd_a, 1024, 0, Q) to the
 * 1024-element input array, then runs ntt_red_ct_std2rev on it with the
 * ntt_red1024_omega_powers_rev table.
 * NOTE(review): ASSUME_FORALL comes from clam.h (not shown); presumably it
 * constrains every nd_a[i] to the range given by its last two arguments.
 * Whether the upper bound Q is inclusive should be confirmed there.
 */
int main(void) {
  ASSUME_FORALL(nd_a, 1024, 0, Q)
  // defined in ntt_red.c
  ntt_red_ct_std2rev(nd_a, 1024, ntt_red1024_omega_powers_rev);

  return 0;
}
25 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/harness_ntt_red1024f.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdbool.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <inttypes.h>
 6 | 
 7 | #include "ntt.h"
 8 | #include "bitrev1024_table.h"
 9 | #include "ntt_red1024.h"
10 | #include "sort.h"
11 | 
12 | #include "clam.h"
13 | 
14 | #define Q 12289
15 | 
16 | static int32_t nd_a[1024];
17 | 
/*
 * Verification harness: applies ASSUME_FORALL(nd_a, 1024, 0, Q) to the
 * 1024-element input array, then runs ntt_red_gs_rev2std on it with the
 * ntt_red1024_omega_powers_rev table.
 * NOTE(review): ASSUME_FORALL comes from clam.h (not shown); presumably it
 * constrains every nd_a[i] to the range given by its last two arguments.
 * Whether the upper bound Q is inclusive should be confirmed there.
 */
int main(void) {
  ASSUME_FORALL(nd_a, 1024, 0, Q)
  // defined in ntt_red.c
  ntt_red_gs_rev2std(nd_a, 1024, ntt_red1024_omega_powers_rev);

  return 0;
}
25 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/harness_intt_red1024.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdbool.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <inttypes.h>
 6 | 
 7 | #include "ntt.h"
 8 | #include "bitrev1024_table.h"
 9 | #include "ntt_red1024.h"
10 | #include "sort.h"
11 | 
12 | #include "clam.h"
13 | 
14 | #define Q 12289
15 | 
16 | static int32_t nd_a[1024];
17 | 
/*
 * Verification harness: applies ASSUME_FORALL(nd_a, 1024, 0, Q) to the
 * 1024-element input array, then runs ntt_red_ct_std2rev on it with the
 * ntt_red1024_inv_omega_powers_rev table.
 * NOTE(review): ASSUME_FORALL comes from clam.h (not shown); presumably it
 * constrains every nd_a[i] to the range given by its last two arguments.
 * Whether the upper bound Q is inclusive should be confirmed there.
 */
int main(void) {
  ASSUME_FORALL(nd_a, 1024, 0, Q)
  // defined in ntt_red.c
  ntt_red_ct_std2rev(nd_a, 1024, ntt_red1024_inv_omega_powers_rev);

  return 0;
}
25 | 


--------------------------------------------------------------------------------
/src/tests_in_paper/test_ntt_red1024b.c:
--------------------------------------------------------------------------------
 1 | #include "../ntt_red_interval.h"
 2 | #include "ntt_red1024_tables.h"
 3 | 
 4 | #define Q 12289
 5 | 
 6 | /*
 7 |  * forward NTT, CT, rev2std
 8 |  *
 9 |  * static inline void ntt_red1024_ct_rev2std(int32_t *a) {
10 |  *   ntt_red_ct_rev2std(a, 1024, ntt_red1024_omega_powers);
11 |  * }
12 |  */
13 | 
14 | int main(void) {
15 |   interval_t *a[1024];
16 |   uint32_t i;
17 | 
18 |   for (i=0; i<1024; i++) {
19 |     a[i] = interval(0, Q-1);
20 |   }
21 |   abstract_ntt_red_ct_rev2std(a, 1024, ntt_red1024_omega_powers);
22 | 
23 |   return 0;
24 | }
25 | 


--------------------------------------------------------------------------------
/src/tests_in_paper/test_ntt_red1024.c:
--------------------------------------------------------------------------------
 1 | #include "../ntt_red_interval.h"
 2 | #include "ntt_red1024_tables.h"
 3 | 
 4 | #define Q 12289
 5 | 
 6 | /*
 7 |  * forward NTT, CT, std2rev
 8 |  *
 9 |  * static inline void ntt_red1024_ct_std2rev(int32_t *a) {
10 |  *   ntt_red_ct_std2rev(a, 1024, ntt_red1024_omega_powers_rev);
11 |  * }
12 |  */
13 | 
14 | int main(void) {
15 |   interval_t *a[1024];
16 |   uint32_t i;
17 | 
18 |   for (i=0; i<1024; i++) {
19 |     a[i] = interval(0, Q-1);
20 |   }
21 |   abstract_ntt_red_ct_std2rev(a, 1024, ntt_red1024_omega_powers_rev);
22 | 
23 |   return 0;
24 | }
25 | 


--------------------------------------------------------------------------------
/src/tests_in_paper/test_intt_red1024b.c:
--------------------------------------------------------------------------------
 1 | #include "../ntt_red_interval.h"
 2 | #include "ntt_red1024_tables.h"
 3 | 
 4 | #define Q 12289
 5 | 
 6 | /*
 7 |  * inverse NTT, CT, rev2std
 8 |  *
 9 |  * static inline void intt_red1024_ct_rev2std(int32_t *a) {
10 |  *   ntt_red_ct_rev2std(a, 1024, ntt_red1024_inv_omega_powers);
11 |  * }
12 |  */
13 | 
14 | int main(void) {
15 |   interval_t *a[1024];
16 |   uint32_t i;
17 | 
18 |   for (i=0; i<1024; i++) {
19 |     a[i] = interval(0, Q-1);
20 |   }
21 |   abstract_ntt_red_ct_rev2std(a, 1024, ntt_red1024_inv_omega_powers);
22 | 
23 |   return 0;
24 | }
25 | 


--------------------------------------------------------------------------------
/src/tests_in_paper/test_ntt_red1024e.c:
--------------------------------------------------------------------------------
 1 | #include "../ntt_red_interval.h"
 2 | #include "ntt_red1024_tables.h"
 3 | 
 4 | #define Q 12289
 5 | 
 6 | /*
 7 |  * forward NTT, GS, std2rev
 8 |  *
 9 |  * static inline void ntt_red1024_gs_std2rev(int32_t *a) {
10 |  *   ntt_red_gs_std2rev(a, 1024, ntt_red1024_omega_powers);
11 |  * }
12 |  *
13 |  */
14 | 
15 | int main(void) {
16 |   interval_t *a[1024];
17 |   uint32_t i;
18 | 
19 |   for (i=0; i<1024; i++) {
20 |     a[i] = interval(0, Q-1);
21 |   }
22 |   abstract_ntt_red_gs_std2rev(a, 1024, ntt_red1024_omega_powers);
23 | 
24 |   return 0;
25 | }
26 | 


--------------------------------------------------------------------------------
/src/tests_in_paper/test_ntt_red1024f.c:
--------------------------------------------------------------------------------
 1 | #include "../ntt_red_interval.h"
 2 | #include "ntt_red1024_tables.h"
 3 | 
 4 | #define Q 12289
 5 | 
 6 | /*
 7 |  * forward NTT, GS, rev2std
 8 |  *
 9 |  * static inline void ntt_red1024_gs_rev2std(int32_t *a) {
10 |  *   ntt_red_gs_rev2std(a, 1024, ntt_red1024_omega_powers_rev);
11 |  * }
12 |  */
13 | 
14 | int main(void) {
15 |   interval_t *a[1024];
16 |   uint32_t i;
17 | 
18 |   for (i=0; i<1024; i++) {
19 |     a[i] = interval(0, Q-1);
20 |   }
21 |   abstract_ntt_red_gs_rev2std(a, 1024, ntt_red1024_omega_powers_rev);
22 | 
23 |   return 0;
24 | }
25 | 


--------------------------------------------------------------------------------
/src/tests_in_paper/test_intt_red1024.c:
--------------------------------------------------------------------------------
 1 | #include "../ntt_red_interval.h"
 2 | #include "ntt_red1024_tables.h"
 3 | 
 4 | #define Q 12289
 5 | 
 6 | /*
 7 |  * inverse NTT, CT, std2rev
 8 |  *
 9 |  * static inline void intt_red1024_ct_std2rev(int32_t *a) {
10 |  *   ntt_red_ct_std2rev(a, 1024, ntt_red1024_inv_omega_powers_rev);
11 |  * }
12 |  */
13 | 
14 | int main(void) {
15 |   interval_t *a[1024];
16 |   uint32_t i;
17 | 
18 |   for (i=0; i<1024; i++) {
19 |     a[i] = interval(0, Q-1);
20 |   }
21 |   abstract_ntt_red_ct_std2rev(a, 1024, ntt_red1024_inv_omega_powers_rev);
22 | 
23 |   return 0;
24 | }
25 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/harness_ntt_red1024c.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdbool.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <inttypes.h>
 6 | 
 7 | #include "ntt.h"
 8 | #include "bitrev1024_table.h"
 9 | #include "ntt_red1024.h"
10 | #include "sort.h"
11 | 
12 | #include "clam.h"
13 | 
14 | #define Q 12289
15 | 
16 | static int32_t nd_a[1024];
17 | static int16_t nd_p[1024];
18 | 
/*
 * Verification harness with a non-deterministic table: applies
 * ASSUME_FORALL(nd_a, 1024, 0, Q) to the input array and
 * ASSUME_FORALL(nd_p, 1024, -6144, 6144) to the table, then runs
 * ntt_red_ct_std2rev(nd_a, 1024, nd_p).
 * NOTE(review): ASSUME_FORALL comes from clam.h (not shown); presumably it
 * constrains every array element to the range given by its last two
 * arguments. Whether the bounds are inclusive should be confirmed there.
 */
int main(void) {
  ASSUME_FORALL(nd_a, 1024, 0, Q)
  ASSUME_FORALL(nd_p, 1024, -6144, 6144)  
  
  // defined in ntt_red.c
  ntt_red_ct_std2rev(nd_a, 1024, nd_p);

  return 0;
}
28 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/harness_ntt_red1024d.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdbool.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <inttypes.h>
 6 | 
 7 | #include "ntt.h"
 8 | #include "bitrev1024_table.h"
 9 | #include "ntt_red1024.h"
10 | #include "sort.h"
11 | 
12 | #include "clam.h"
13 | 
14 | #define Q 12289
15 | 
16 | static int32_t nd_a[1024];
17 | static int16_t nd_p[1024];
18 | 
/*
 * Verification harness with a non-deterministic table: applies
 * ASSUME_FORALL(nd_a, 1024, 0, Q) to the input array and
 * ASSUME_FORALL(nd_p, 1024, -6144, 6144) to the table, then runs
 * ntt_red_ct_rev2std(nd_a, 1024, nd_p).
 * NOTE(review): ASSUME_FORALL comes from clam.h (not shown); presumably it
 * constrains every array element to the range given by its last two
 * arguments. Whether the bounds are inclusive should be confirmed there.
 */
int main(void) {
  ASSUME_FORALL(nd_a, 1024, 0, Q)
  ASSUME_FORALL(nd_p, 1024, -6144, 6144)  
  
  // defined in ntt_red.c
  ntt_red_ct_rev2std(nd_a, 1024, nd_p);

  return 0;
}
28 | 


--------------------------------------------------------------------------------
/src/tests_in_paper/test_ntt_red1024d.c:
--------------------------------------------------------------------------------
 1 | #include "../ntt_red_interval.h"
 2 | 
 3 | #define Q 12289
 4 | 
 5 | /*
 6 |  * forward NTT, CT, rev2std
 7 |  *
 8 |  * static inline void ntt_red1024_ct_rev2std(int32_t *a) {
 9 |  *   ntt_red_ct_rev2std(a, 1024, ntt_red1024_omega_powers);
10 |  * }
11 |  */
12 | 
13 | int main(void) {
14 |   interval_t *a[1024];
15 |   interval_t *p[1024];
16 |   uint32_t i;
17 | 
18 |   for (i=0; i<1024; i++) {
19 |     a[i] = interval(0, Q-1);
20 |   }
21 |   for (i=0; i<1024; i++) {
22 |     p[i] = interval(-(Q-1)/2, (Q-1)/2);
23 |   }
24 | 
25 |   abstract2_ntt_red_ct_rev2std(a, 1024, (const interval_t **) p);
26 | 
27 |   return 0;
28 | }
29 | 


--------------------------------------------------------------------------------
/src/tests_in_paper/test_ntt_red1024c.c:
--------------------------------------------------------------------------------
 1 | #include "../ntt_red_interval.h"
 2 | 
 3 | #define Q 12289
 4 | 
 5 | /*
 6 |  * forward NTT, CT, std2rev.
 7 |  *
 8 |  * static inline void ntt_red1024_ct_std2rev(int32_t *a) {
 9 |  *   ntt_red_ct_std2rev(a, 1024, ntt_red1024_omega_powers_rev);
10 |  * }
11 |  */
12 | 
13 | int main(void) {
14 |   interval_t *a[1024];
15 |   interval_t *p[1024];
16 |   uint32_t i;
17 | 
18 |   for (i=0; i<1024; i++) {
19 |     a[i] = interval(0, Q-1);
20 |   }
21 |   for (i=0; i<1024; i++) {
22 |     p[i] = interval(-(Q-1)/2, (Q-1)/2);
23 |   }
24 | 
25 |   abstract2_ntt_red_ct_std2rev(a, 1024, (const interval_t **) p);
26 | 
27 |   return 0;
28 | }
29 | 


--------------------------------------------------------------------------------
/src/test_shift.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <inttypes.h>
 3 | 
 4 | static void print_shift(int32_t x) {
 5 |   printf("shift_31(%"PRId32") = %"PRId32"\n", x, (x >> 31));
 6 |   printf("shift_11(%"PRId32") = %"PRId32"\n", x, (x >> 11));
 7 | }
 8 | 
 9 | static void print_shift_and(int32_t x, int32_t q) {
10 |   printf("shift_31(%"PRId32" & %"PRId32") = %"PRId32"\n", x, q, (x >> 31) & q);
11 |   printf("shift_11(%"PRId32" & %"PRId32") = %"PRId32"\n", x, q, (x >> 11) & q);
12 | }
13 | 
/*
 * For every k in 0 .. 1002, print the shift results for k and -k,
 * first unmasked and then masked with 12289.
 */
int main(void) {
  int32_t k = 0;

  while (k < 1003) {
    print_shift(k);
    print_shift(-k);
    print_shift_and(k, 12289);
    print_shift_and(-k, 12289);
    k ++;
  }

  return 0;
}
26 | 


--------------------------------------------------------------------------------
/src/test_bitrev_tables.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Bitreverse tables.
 3 |  */
 4 | 
 5 | #ifndef __TEST_BITREV_TABLES_H
 6 | #define __TEST_BITREV_TABLES_H
 7 | 
 8 | #include <stdint.h>
 9 | 
10 | /*
11 |  * Tables for bit-reverse shuffle for n=128, 256, 512, 1024, 2048
12 |  */
13 | #define BITREV128_NPAIRS 56
14 | #define BITREV256_NPAIRS 120
15 | #define BITREV512_NPAIRS 240
16 | #define BITREV1024_NPAIRS 496
17 | #define BITREV2048_NPAIRS 992
18 | 
19 | extern const uint16_t bitrev128_pair[BITREV128_NPAIRS][2];
20 | extern const uint16_t bitrev256_pair[BITREV256_NPAIRS][2];
21 | extern const uint16_t bitrev512_pair[BITREV512_NPAIRS][2];
22 | extern const uint16_t bitrev1024_pair[BITREV1024_NPAIRS][2];
23 | extern const uint16_t bitrev2048_pair[BITREV2048_NPAIRS][2];
24 | 
25 | #endif /* __TEST_BITREV_TABLES_H */
26 | 


--------------------------------------------------------------------------------
/verifier/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_definitions(-D__STDC_CONSTANT_MACROS)
 2 | add_definitions(-D__STDC_LIMIT_MACROS)
 3 | 
 4 | set(LLVM_LINK_COMPONENTS 
 5 |   irreader 
 6 |   bitwriter 
 7 |   ipo 
 8 |   scalaropts 
 9 |   instrumentation
10 |   transformutils
11 |   core 
12 |   codegen 
13 |   objcarcopts)
14 | 
15 | 
16 | add_llvm_executable(nttverifier DISABLE_LLVM_LINK_LLVM_DYLIB
17 |   ntt_verifier.cpp
18 |   ntt_intervals.cpp)
19 | 
20 | target_link_libraries (nttverifier PRIVATE
21 |   ${LLVM_SEAHORN_LIBS}
22 |   ${SEA_DSA_BS}
23 |   ${CLAM_LIBS}
24 | )
25 | llvm_config (nttverifier ${LLVM_LINK_COMPONENTS})
26 | install(TARGETS nttverifier RUNTIME DESTINATION bin)
27 | 
28 | if (NTT_VERIFIER_STATIC_EXE)
29 |   set (CMAKE_EXE_LINKER_FLAGS "-static -static-libgcc -static-libstdc++")
30 |   set_target_properties (nttverifier PROPERTIES LINK_SEARCH_START_STATIC ON)
31 |   set_target_properties (nttverifier PROPERTIES LINK_SEARCH_END_STATIC ON)
32 | endif()
33 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.o
 2 | *~
 3 | *.dSYM
 4 | test_ntt
 5 | test_ntt16
 6 | test_ntt256
 7 | test_ntt512
 8 | test_ntt1024
 9 | kat_mul1024
10 | speed_mul1024
11 | kat_mul1024_red
12 | speed_mul1024_red
13 | kat_mul1024_red_asm
14 | speed_mul1024_red_asm
15 | test_ntt_red16
16 | test_ntt_red256
17 | test_ntt_red512
18 | test_ntt_red1024
19 | test_ntt_red_asm16
20 | test_ntt_red_asm256
21 | test_ntt_red_asm512
22 | test_ntt_red_asm1024
23 | test_ntt_red
24 | test_red_bounds
25 | test_avx
26 | test_avx_support
27 | test_ntt_avx
28 | make_tables
29 | make_red_tables
30 | make_bitrev_table
31 | ntt16_tables.h
32 | ntt16_tables.c
33 | ntt256_tables.h
34 | ntt256_tables.c
35 | ntt512_tables.h
36 | ntt512_tables.c
37 | ntt1024_tables.h
38 | ntt1024_tables.c
39 | ntt_red16_tables.h
40 | ntt_red16_tables.c
41 | ntt_red256_tables.h
42 | ntt_red256_tables.c
43 | ntt_red512_tables.h
44 | ntt_red512_tables.c
45 | ntt_red1024_tables.h
46 | ntt_red1024_tables.c
47 | bitrev16_tables.h
48 | bitrev16_tables.c
49 | bitrev256_tables.h
50 | bitrev256_tables.c
51 | bitrev512_tables.h
52 | bitrev512_tables.c
53 | bitrev1024_tables.h
54 | bitrev1024_tables.c
55 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 SRI International's Computer Science Laboratory
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # The Number Theoretic Transform
 2 | 
 3 | The Number Theoretic Transform ([NTT](https://en.wikipedia.org/wiki/Discrete_Fourier_transform_(general)#Number-theoretic_transform)) is an efficient algorithm for
 4 | computing the products of polynomials whose coefficients belong to
 5 | a finite field.
 6 | 
 7 | This repository contains SRI's various implementations of the NTT (developed while
 8 | implementing [Bliss](https://github.com/SRI-CSL/Bliss)).
 9 | 
10 | It also includes the verification of these algorithms.
11 | 
12 | The repository is organized into three subdirectories:
13 | 
14 | * [src](https://github.com/SRI-CSL/NTT/tree/master/src/README.md) contains a plethora of code implementing the algorithms described in the paper.
15 | 
16 | * [verifier](https://github.com/SRI-CSL/NTT/tree/master/verifier/README.md) contains the code of the verifier that proves the absence of integer overflows in the programs described in [src](https://github.com/SRI-CSL/NTT/tree/master/src/README.md).
17 | 
18 | * [paper](https://github.com/SRI-CSL/NTT/blob/master/paper/main_final.pdf) contains the [VSTTE20 conference](https://sri-csl.github.io/VSTTE20/) version of the paper, as well as the [slides](https://github.com/SRI-CSL/NTT/blob/master/paper/slides.pdf) from the conference talk.
19 | 


--------------------------------------------------------------------------------
/src/tests_in_paper/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # all tests
 3 | #
 4 | tests=test_intt_red1024 test_intt_red1024b \
 5 |   test_ntt_red1024 test_ntt_red1024b test_ntt_red1024c \
 6 |   test_ntt_red1024d test_ntt_red1024e test_ntt_red1024f
 7 | 
 8 | CC?=clang
 9 | CFLAGS=-Wall -I../
10 | 
11 | #
12 | # We assume ../intervals.o ../red_bounds.o ../ntt_red_interval.o ../ntt_red1024_tables.o all exist
13 | # and are up to date.
14 | #
15 | obj=../intervals.o ../red_bounds.o ../ntt_red_interval.o ../ntt_red1024_tables.o
16 | 
17 | all: $(tests)
18 | 
19 | test_intt_red1024: test_intt_red1024.c
20 | 	$(CC) $(CFLAGS) -o $@ $^ $(obj)
21 | 
22 | test_intt_red1024b: test_intt_red1024b.c
23 | 	$(CC) $(CFLAGS) -o $@ $^ $(obj)
24 | 
25 | test_ntt_red1024: test_ntt_red1024.c
26 | 	$(CC) $(CFLAGS) -o $@ $^ $(obj)
27 | 
28 | test_ntt_red1024b: test_ntt_red1024b.c
29 | 	$(CC) $(CFLAGS) -o $@ $^ $(obj)
30 | 
31 | test_ntt_red1024c: test_ntt_red1024c.c
32 | 	$(CC) $(CFLAGS) -o $@ $^ $(obj)
33 | 
34 | test_ntt_red1024d: test_ntt_red1024d.c
35 | 	$(CC) $(CFLAGS) -o $@ $^ $(obj)
36 | 
37 | test_ntt_red1024e: test_ntt_red1024e.c
38 | 	$(CC) $(CFLAGS) -o $@ $^ $(obj)
39 | 
40 | test_ntt_red1024f: test_ntt_red1024f.c
41 | 	$(CC) $(CFLAGS) -o $@ $^ $(obj)
42 | 
43 | #
44 | # Clean up
45 | #
46 | clean:
47 | 	rm -f $(tests)
48 | 	rm -f *.o
49 | 


--------------------------------------------------------------------------------
/src/ntt32_tables.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Parameters:
 3 |  * - q = 12289
 4 |  * - n = 32
 5 |  * - psi = 563
 6 |  * - omega = psi^2 = 9744
 7 |  * - inverse of psi = 5828
 8 |  * - inverse of omega = 11077
 9 |  * - inverse of n = 11905
10 |  */
11 | 
12 | #ifndef __NTT32_TABLES_H
13 | #define __NTT32_TABLES_H
14 | 
15 | #include <stdint.h>
16 | 
17 | /*
18 |  * PARAMETERS
19 |  */
20 | static const int32_t ntt32_psi = 563;
21 | static const int32_t ntt32_omega = 9744;
22 | static const int32_t ntt32_inv_psi = 5828;
23 | static const int32_t ntt32_inv_omega = 11077;
24 | static const int32_t ntt32_inv_n = 11905;
25 | 
26 | /*
27 |  * BIT-REVERSE SHUFFLE
28 |  */
29 | #define BITREV32_NPAIRS 12
30 | 
31 | extern const uint16_t ntt32_bitrev[BITREV32_NPAIRS][2];
32 | 
33 | /*
34 |  * POWERS OF PSI
35 |  */
36 | extern const uint16_t ntt32_psi_powers[32];
37 | extern const uint16_t ntt32_inv_psi_powers[32];
38 | extern const uint16_t ntt32_scaled_inv_psi_powers[32];
39 | 
40 | /*
41 |  * TABLES FOR NTT COMPUTATION
42 |  */
43 | extern const uint16_t ntt32_omega_powers[32];
44 | extern const uint16_t ntt32_omega_powers_rev[32];
45 | extern const uint16_t ntt32_inv_omega_powers[32];
46 | extern const uint16_t ntt32_inv_omega_powers_rev[32];
47 | extern const uint16_t ntt32_mixed_powers[32];
48 | extern const uint16_t ntt32_mixed_powers_rev[32];
49 | extern const uint16_t ntt32_inv_mixed_powers[32];
50 | extern const uint16_t ntt32_inv_mixed_powers_rev[32];
51 | 
52 | #endif /* __NTT32_TABLES_H */
53 | 


--------------------------------------------------------------------------------
/src/sort.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * BASIC SORT FOR INTEGER ARRAYS
 3 |  */
 4 | 
 5 | #include "sort.h"
 6 | 
 7 | static void qsort_int_array(uint64_t *a, uint32_t n);
 8 | 
 9 | // insertion sort
10 | static void isort_int_array(uint64_t *a, uint32_t n) {
11 |   uint32_t i, j;
12 |   uint64_t x, y;
13 | 
14 |   for (i=1; i<n; i++) {
15 |     x = a[i];
16 |     j = 0;
17 |     while (a[j] < x) j ++;
18 |     while (j < i) {
19 |       y = a[j]; a[j] = x; x = y;
20 |       j ++;
21 |     }
22 |     a[j] = x;
23 |   }
24 | }
25 | 
/*
 * Sort a[0 .. n-1]: arrays of 10 or more elements go to quick sort,
 * shorter ones to insertion sort.
 */
static inline void sort_array(uint64_t *a, uint32_t n) {
  if (n >= 10) {
    qsort_int_array(a, n);
    return;
  }
  isort_int_array(a, n);
}
33 | 
34 | // quick sort: requires n > 1
35 | static void qsort_int_array(uint64_t *a, uint32_t n) {
36 |   uint32_t i, j;
37 |   uint64_t x, y;
38 | 
39 |   // x = random pivot
40 |   i = n/2;
41 |   x = a[i];
42 | 
43 |   // swap x and a[0]
44 |   a[i] = a[0];
45 |   a[0] = x;
46 | 
47 |   i = 0;
48 |   j = n;
49 | 
50 |   do { j--; } while (a[j] > x);
51 |   do { i++; } while (i <= j && a[i] < x);
52 | 
53 |   while (i < j) {
54 |     y = a[i]; a[i] = a[j]; a[j] = y;
55 | 
56 |     do { j--; } while (a[j] > x);
57 |     do { i++; } while (a[i] < x);
58 |   }
59 | 
60 |   // pivot goes into a[j]
61 |   a[0] = a[j];
62 |   a[j] = x;
63 | 
64 |   // sort a[0...j-1] and a[j+1 .. n-1]
65 |   sort_array(a, j);
66 |   j++;
67 |   sort_array(a + j, n - j);
68 | }
69 | 
70 | 
71 | /*
72 |  * External call
73 |  */
74 | void sort(uint64_t *a, uint32_t n) {
75 |   sort_array(a, n);
76 | }
77 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/sort.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * BASIC SORT FOR INTEGER ARRAYS
 3 |  */
 4 | 
 5 | #include "sort.h"
 6 | 
 7 | static void qsort_int_array(uint64_t *a, uint32_t n);
 8 | 
 9 | // insertion sort
10 | static void isort_int_array(uint64_t *a, uint32_t n) {
11 |   uint32_t i, j;
12 |   uint64_t x, y;
13 | 
14 |   for (i=1; i<n; i++) {
15 |     x = a[i];
16 |     j = 0;
17 |     while (a[j] < x) j ++;
18 |     while (j < i) {
19 |       y = a[j]; a[j] = x; x = y;
20 |       j ++;
21 |     }
22 |     a[j] = x;
23 |   }
24 | }
25 | 
26 | static inline void sort_array(uint64_t *a, uint32_t n) {
27 |   if (n < 10) {
28 |     isort_int_array(a, n);
29 |   } else {
30 |     qsort_int_array(a, n);
31 |   }
32 | }
33 | 
34 | // quick sort: requires n > 1
35 | static void qsort_int_array(uint64_t *a, uint32_t n) {
36 |   uint32_t i, j;
37 |   uint64_t x, y;
38 | 
39 |   // x = random pivot
40 |   i = n/2;
41 |   x = a[i];
42 | 
43 |   // swap x and a[0]
44 |   a[i] = a[0];
45 |   a[0] = x;
46 | 
47 |   i = 0;
48 |   j = n;
49 | 
50 |   do { j--; } while (a[j] > x);
51 |   do { i++; } while (i <= j && a[i] < x);
52 | 
53 |   while (i < j) {
54 |     y = a[i]; a[i] = a[j]; a[j] = y;
55 | 
56 |     do { j--; } while (a[j] > x);
57 |     do { i++; } while (a[i] < x);
58 |   }
59 | 
60 |   // pivot goes into a[j]
61 |   a[0] = a[j];
62 |   a[j] = x;
63 | 
64 |   // sort a[0...j-1] and a[j+1 .. n-1]
65 |   sort_array(a, j);
66 |   j++;
67 |   sort_array(a + j, n - j);
68 | }
69 | 
70 | 
71 | /*
72 |  * External call
73 |  */
74 | void sort(uint64_t *a, uint32_t n) {
75 |   sort_array(a, n);
76 | }
77 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/verify.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Make sure we exit if there is a failure
 4 | set -e
 5 | 
 6 | usage() {
 7 |     echo "Usage: $0 FILE [-inline]"
 8 | }
 9 | 
10 | if [[ $# -lt 1 ]]; then
11 |     usage
12 |     exit 1
13 | fi
14 | 
15 | INLINE=false 
16 | POSITIONAL=()
17 | while [[ $# -gt 0 ]]
18 | do
19 | key="$1"
20 | case $key in
21 |     -inline|--inline)
22 | 	shift # past argument
23 | 	INLINE=true
24 | 	;;
25 |     -help|--help)
26 | 	usage
27 | 	exit 0
28 | 	;;
29 |     *)  # unknown option
30 | 	POSITIONAL+=("$1") # save it in an array for later
31 | 	shift # past argument
32 | 	;;
33 | esac
34 | done
35 | set -- "${POSITIONAL[@]}" # restore positional parameters
36 | FILE=$1
37 | shift
38 | 
39 | INSTALL_DIR=$(pwd)/../install/bin
40 | 
41 | CLAMPP=${INSTALL_DIR}/clam-pp
42 | if [ "${CLAMPP}" == "" ]; then
43 |     echo "Cannot find clam-pp"
44 |     exit 1
45 | fi    
46 | 
47 | SEAOPT=${INSTALL_DIR}/seaopt
48 | if [ "${SEAOPT}" == "" ]; then
49 |     echo "Cannot find seaopt"
50 |     exit 1
51 | fi    
52 | 
53 | NTTVERIFIER=${INSTALL_DIR}/nttverifier
54 | if [ "${NTTVERIFIER}" == "" ]; then
55 |     echo "Cannot find nttverifier"
56 |     exit 1
57 | fi    
58 | 
59 | ### Clam preprocessor
60 | CLAMPP_OPTS="--simplifycfg-sink-common=false --clam-devirt --devirt-resolver=sea-dsa --sea-dsa-type-aware=true"
61 | if [ ${INLINE} == true ] ; then
62 | CLAMPP_OPTS="${CLAMPP_OPTS} --clam-inline-all" 
63 | fi    
64 | ${CLAMPP} ${FILE} ${CLAMPP_OPTS} -o ${FILE}.pp.bc
65 | ### Static loop unrolling
66 | ${SEAOPT} -O1  ${FILE}.pp.bc \
67 |      -loop-simplify -fake-latch-exit -loop-unroll -unroll-threshold=99999999 \
68 |      -o ${FILE}.unrolled.pp.bc	       
69 | ### NTT Verifier
70 | ${NTTVERIFIER} ${FILE}.unrolled.pp.bc
71 | 
72 | exit 0
73 | 


--------------------------------------------------------------------------------
/src/ntt16.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289 and n=16.
 3 |  */
 4 | 
 5 | #include "ntt16.h"
 6 | 
 7 | /*
 8 |  * Product of two polynomials
 9 |  */
10 | void ntt16_product1(int32_t *c, int32_t *a, int32_t *b) {
11 |   mul_array16(a, 16, ntt16_psi_powers);
12 |   ntt16_ct_std2rev(a);
13 |   mul_array16(b, 16, ntt16_psi_powers);
14 |   ntt16_ct_std2rev(b);
15 |   mul_array(c, 16, a, b);
16 |   intt16_ct_rev2std(c);
17 |   mul_array16(c, 16, ntt16_scaled_inv_psi_powers);
18 | }
19 | 
20 | void ntt16_product2(int32_t *c, int32_t *a, int32_t *b) {
21 |   mul_array16(a, 16, ntt16_psi_powers);
22 |   ntt16_gs_std2rev(a);
23 |   mul_array16(b, 16, ntt16_psi_powers);
24 |   ntt16_gs_std2rev(b);
25 |   mul_array(c, 16, a, b);
26 |   intt16_ct_rev2std(c);
27 |   mul_array16(c, 16, ntt16_scaled_inv_psi_powers);
28 | }
29 | 
30 | void ntt16_product3(int32_t *c, int32_t *a, int32_t *b) {
31 |   mul_array16(a, 16, ntt16_psi_powers);
32 |   ntt16_ct_std2rev(a);
33 |   mul_array16(b, 16, ntt16_psi_powers);
34 |   ntt16_ct_std2rev(b);
35 |   mul_array(c, 16, a, b);
36 |   intt16_gs_rev2std(c);
37 |   mul_array16(c, 16, ntt16_scaled_inv_psi_powers);
38 | }
39 | 
40 | void ntt16_product4(int32_t *c, int32_t *a, int32_t *b) {
41 |   mul_array16(a, 16, ntt16_psi_powers);
42 |   ntt16_gs_std2rev(a);
43 |   mul_array16(b, 16, ntt16_psi_powers);
44 |   ntt16_gs_std2rev(b);
45 |   mul_array(c, 16, a, b);
46 |   intt16_gs_rev2std(c);
47 |   mul_array16(c, 16, ntt16_scaled_inv_psi_powers);
48 | }
49 | 
50 | 
51 | /*
52 |  * Use combined mulntt then inttmul
53 |  */
54 | void ntt16_product5(int32_t *c, int32_t *a, int32_t *b) {
55 |   mulntt16_ct_std2rev(a);
56 |   mulntt16_ct_std2rev(b);
57 |   mul_array(c, 16, a, b);
58 |   inttmul16_gs_rev2std(c);
59 |   scalar_mul_array(c, 16, ntt16_inv_n); // divide by n
60 | }
61 | 


--------------------------------------------------------------------------------
/src/ntt256.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289 and n=256.
 3 |  */
 4 | 
 5 | #include "ntt256.h"
 6 | 
 7 | /*
 8 |  * Product of two polynomials
 9 |  */
10 | void ntt256_product1(int32_t *c, int32_t *a, int32_t *b) {
11 |   mul_array16(a, 256, ntt256_psi_powers);
12 |   ntt256_ct_std2rev(a);
13 |   mul_array16(b, 256, ntt256_psi_powers);
14 |   ntt256_ct_std2rev(b);
15 |   mul_array(c, 256, a, b);
16 |   intt256_ct_rev2std(c);
17 |   mul_array16(c, 256, ntt256_scaled_inv_psi_powers);
18 | }
19 | 
20 | void ntt256_product2(int32_t *c, int32_t *a, int32_t *b) {
21 |   mul_array16(a, 256, ntt256_psi_powers);
22 |   ntt256_gs_std2rev(a);
23 |   mul_array16(b, 256, ntt256_psi_powers);
24 |   ntt256_gs_std2rev(b);
25 |   mul_array(c, 256, a, b);
26 |   intt256_ct_rev2std(c);
27 |   mul_array16(c, 256, ntt256_scaled_inv_psi_powers);
28 | }
29 | 
30 | void ntt256_product3(int32_t *c, int32_t *a, int32_t *b) {
31 |   mul_array16(a, 256, ntt256_psi_powers);
32 |   ntt256_ct_std2rev(a);
33 |   mul_array16(b, 256, ntt256_psi_powers);
34 |   ntt256_ct_std2rev(b);
35 |   mul_array(c, 256, a, b);
36 |   intt256_gs_rev2std(c);
37 |   mul_array16(c, 256, ntt256_scaled_inv_psi_powers);
38 | }
39 | 
40 | void ntt256_product4(int32_t *c, int32_t *a, int32_t *b) {
41 |   mul_array16(a, 256, ntt256_psi_powers);
42 |   ntt256_gs_std2rev(a);
43 |   mul_array16(b, 256, ntt256_psi_powers);
44 |   ntt256_gs_std2rev(b);
45 |   mul_array(c, 256, a, b);
46 |   intt256_gs_rev2std(c);
47 |   mul_array16(c, 256, ntt256_scaled_inv_psi_powers);
48 | }
49 | 
50 | /*
51 |  * Use combined mulntt then inttmul
52 |  */
53 | void ntt256_product5(int32_t *c, int32_t *a, int32_t *b) {
54 |   mulntt256_ct_std2rev(a);
55 |   mulntt256_ct_std2rev(b);
56 |   mul_array(c, 256, a, b);
57 |   inttmul256_gs_rev2std(c);
58 |   scalar_mul_array(c, 256, ntt256_inv_n); // divide by n
59 | }
60 | 


--------------------------------------------------------------------------------
/src/ntt512.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289 and n=512.
 3 |  */
 4 | 
 5 | #include "ntt512.h"
 6 | 
 7 | /*
 8 |  * Product of two polynomials
 9 |  */
10 | void ntt512_product1(int32_t *c, int32_t *a, int32_t *b) {
11 |   mul_array16(a, 512, ntt512_psi_powers);
12 |   ntt512_ct_std2rev(a);
13 |   mul_array16(b, 512, ntt512_psi_powers);
14 |   ntt512_ct_std2rev(b);
15 |   mul_array(c, 512, a, b);
16 |   intt512_ct_rev2std(c);
17 |   mul_array16(c, 512, ntt512_scaled_inv_psi_powers);
18 | }
19 | 
20 | void ntt512_product2(int32_t *c, int32_t *a, int32_t *b) {
21 |   mul_array16(a, 512, ntt512_psi_powers);
22 |   ntt512_gs_std2rev(a);
23 |   mul_array16(b, 512, ntt512_psi_powers);
24 |   ntt512_gs_std2rev(b);
25 |   mul_array(c, 512, a, b);
26 |   intt512_ct_rev2std(c);
27 |   mul_array16(c, 512, ntt512_scaled_inv_psi_powers);
28 | }
29 | 
30 | void ntt512_product3(int32_t *c, int32_t *a, int32_t *b) {
31 |   mul_array16(a, 512, ntt512_psi_powers);
32 |   ntt512_ct_std2rev(a);
33 |   mul_array16(b, 512, ntt512_psi_powers);
34 |   ntt512_ct_std2rev(b);
35 |   mul_array(c, 512, a, b);
36 |   intt512_gs_rev2std(c);
37 |   mul_array16(c, 512, ntt512_scaled_inv_psi_powers);
38 | }
39 | 
40 | void ntt512_product4(int32_t *c, int32_t *a, int32_t *b) {
41 |   mul_array16(a, 512, ntt512_psi_powers);
42 |   ntt512_gs_std2rev(a);
43 |   mul_array16(b, 512, ntt512_psi_powers);
44 |   ntt512_gs_std2rev(b);
45 |   mul_array(c, 512, a, b);
46 |   intt512_gs_rev2std(c);
47 |   mul_array16(c, 512, ntt512_scaled_inv_psi_powers);
48 | }
49 | 
50 | /*
51 |  * Use combined mulntt then inttmul
52 |  */
53 | void ntt512_product5(int32_t *c, int32_t *a, int32_t *b) {
54 |   mulntt512_ct_std2rev(a);
55 |   mulntt512_ct_std2rev(b);
56 |   mul_array(c, 512, a, b);
57 |   inttmul512_gs_rev2std(c);
58 |   scalar_mul_array(c, 512, ntt512_inv_n); // divide by n
59 | }
60 | 


--------------------------------------------------------------------------------
/src/ntt1024.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289 and n=1024.
 3 |  */
 4 | 
 5 | #include "ntt1024.h"
 6 | 
 7 | /*
 8 |  * Product of two polynomials
 9 |  */
10 | void ntt1024_product1(int32_t *c, int32_t *a, int32_t *b) {
11 |   mul_array16(a, 1024, ntt1024_psi_powers);
12 |   ntt1024_ct_std2rev(a);
13 |   mul_array16(b, 1024, ntt1024_psi_powers);
14 |   ntt1024_ct_std2rev(b);
15 |   mul_array(c, 1024, a, b);
16 |   intt1024_ct_rev2std(c);
17 |   mul_array16(c, 1024, ntt1024_scaled_inv_psi_powers);
18 | }
19 | 
20 | void ntt1024_product2(int32_t *c, int32_t *a, int32_t *b) {
21 |   mul_array16(a, 1024, ntt1024_psi_powers);
22 |   ntt1024_gs_std2rev(a);
23 |   mul_array16(b, 1024, ntt1024_psi_powers);
24 |   ntt1024_gs_std2rev(b);
25 |   mul_array(c, 1024, a, b);
26 |   intt1024_ct_rev2std(c);
27 |   mul_array16(c, 1024, ntt1024_scaled_inv_psi_powers);
28 | }
29 | 
30 | void ntt1024_product3(int32_t *c, int32_t *a, int32_t *b) {
31 |   mul_array16(a, 1024, ntt1024_psi_powers);
32 |   ntt1024_ct_std2rev(a);
33 |   mul_array16(b, 1024, ntt1024_psi_powers);
34 |   ntt1024_ct_std2rev(b);
35 |   mul_array(c, 1024, a, b);
36 |   intt1024_gs_rev2std(c);
37 |   mul_array16(c, 1024, ntt1024_scaled_inv_psi_powers);
38 | }
39 | 
40 | void ntt1024_product4(int32_t *c, int32_t *a, int32_t *b) {
41 |   mul_array16(a, 1024, ntt1024_psi_powers);
42 |   ntt1024_gs_std2rev(a);
43 |   mul_array16(b, 1024, ntt1024_psi_powers);
44 |   ntt1024_gs_std2rev(b);
45 |   mul_array(c, 1024, a, b);
46 |   intt1024_gs_rev2std(c);
47 |   mul_array16(c, 1024, ntt1024_scaled_inv_psi_powers);
48 | }
49 | 
50 | /*
51 |  * Use combined mulntt then inttmul
52 |  */
53 | void ntt1024_product5(int32_t *c, int32_t *a, int32_t *b) {
54 |   mulntt1024_ct_std2rev(a);
55 |   mulntt1024_ct_std2rev(b);
56 |   mul_array(c, 1024, a, b);
57 |   inttmul1024_gs_rev2std(c);
58 |   scalar_mul_array(c, 1024, ntt1024_inv_n); // divide by n
59 | }
60 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/clam.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #ifdef __cplusplus
 4 | extern "C" {
 5 | #endif        
 6 | 
 7 | extern void __CRAB_assert(int);
 8 | extern void __CRAB_assume(int);
 9 | extern int int_nd(void);
10 | 
11 | #ifdef __cplusplus
12 | }
13 | #endif
14 | 
15 | 
16 | #define clam_assume __CRAB_assume
17 | #define clam_assert(X) __CRAB_assert(X)
18 | 
19 | // Enable this for verify2
20 | #define UNROLL_ASSUME_FORALL 
21 | 
22 | 
23 | #define STRINGIFY_(A) #A
24 | #define STRINGIFY(A) STRINGIFY_(A)
25 | 
26 | // forall i :: ARRAY[i] \in [LB_VAL, UB_VAL)
27 | #define ASSUME_FORALL_WITH_LOOP(ARRAY, ARRAY_SIZE, LB_VAL, UB_VAL)  \
28 |   {							  \
29 | int i;							  \
30 | _Pragma("nounroll")                                       \
31 | for(i=0; i<ARRAY_SIZE; i++) {				  \
32 | int x = int_nd();	                                  \
33 | clam_assume(x >= LB_VAL);                                 \
34 | clam_assume(x < UB_VAL);                                  \
35 | ARRAY[i] = x;                                             \
36 | }						          \
37 | }
38 | 
39 | 
40 | // forall i :: ARRAY[i] \in [LB_VAL, UB_VAL)
41 | #define ASSUME_FORALL_WITHOUT_LOOP(ARRAY, ARRAY_SIZE, LB_VAL, UB_VAL)  \
42 | {							  \
43 | int i;							  \
44 | _Pragma("unroll(1024)")                                   \
45 | for(i=0; i<ARRAY_SIZE; i++) {				  \
46 | int x = int_nd();	                                  \
47 | clam_assume(x >= LB_VAL);                                 \
48 | clam_assume(x < UB_VAL);                                  \
49 | ARRAY[i] = x;                                             \
50 | }						          \
51 | }
52 | 
53 | 
54 | #ifndef UNROLL_ASSUME_FORALL
55 | #define ASSUME_FORALL(ARRAY, ARRAY_SIZE, LB_VAL, UB_VAL)	\
56 |   ASSUME_FORALL_WITH_LOOP(ARRAY, ARRAY_SIZE, LB_VAL, UB_VAL)
57 | #else
58 | #define ASSUME_FORALL(ARRAY, ARRAY_SIZE, LB_VAL, UB_VAL) \
59 |   ASSUME_FORALL_WITHOUT_LOOP(ARRAY, ARRAY_SIZE, LB_VAL, UB_VAL)
60 | #endif 
61 | 


--------------------------------------------------------------------------------
/src/test_ntt_tables.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tables for testing the NTT functions
 3 |  */
 4 | 
 5 | #ifndef __TEST_NTT_TABLES_H
 6 | #define __TEST_NTT_TABLES_H
 7 | 
 8 | #include <stdint.h>
 9 | 
10 | /*
11 |  * Powers of psi:
12 |  * - for n=16: psi=1212, omega=6553
13 |  * - for n=256: psi=1002, omega=8595
14 |  * - for n=512: psi=1003, omega=10600
15 |  * - for n=1024: psi=1014, omega=8209
16 |  */
17 | extern const uint16_t psi_powers_ntt16_12289[16];
18 | extern const uint16_t psi_powers_ntt256_12289[256];
19 | extern const uint16_t psi_powers_ntt512_12289[512];
20 | extern const uint16_t psi_powers_ntt1024_12289[1024];
21 | 
22 | /*
23 |  * Powers of omega in Shoup-style format
24 |  * - use the same parameters psi/omega as above
25 |  */
26 | extern const uint16_t shoup_ntt16_12289[16];
27 | extern const uint16_t shoup_ntt256_12289[256];
28 | extern const uint16_t shoup_ntt512_12289[512];
29 | extern const uint16_t shoup_ntt1024_12289[1024];
30 | 
31 | /*
32 |  * Scaled tables in Shoup-style format:
33 |  * - powers of omega multiplied by powers of psi
34 |  */
35 | extern const uint16_t shoup_scaled_ntt16_12289[16];
36 | extern const uint16_t shoup_scaled_ntt256_12289[256];
37 | extern const uint16_t shoup_scaled_ntt512_12289[512];
38 | extern const uint16_t shoup_scaled_ntt1024_12289[1024];
39 | 
40 | /*
41 |  * Powers of omega in bitreverse/Shoup-style format
42 |  * - use the same parameters psi/omega as above
43 |  */
44 | extern const uint16_t rev_shoup_ntt16_12289[16];
45 | extern const uint16_t rev_shoup_ntt256_12289[256];
46 | extern const uint16_t rev_shoup_ntt512_12289[512];
47 | extern const uint16_t rev_shoup_ntt1024_12289[1024];
48 | 
49 | /*
50 |  * Powers of omega and spi in bitreverse/Shoup-style format
51 |  * - use the same parameters psi/omega as above
52 |  */
53 | extern const uint16_t rev_shoup_scaled_ntt16_12289[16];
54 | extern const uint16_t rev_shoup_scaled_ntt256_12289[256];
55 | extern const uint16_t rev_shoup_scaled_ntt512_12289[512];
56 | extern const uint16_t rev_shoup_scaled_ntt1024_12289[1024];
57 | 
58 | 
59 | #endif /* __TEST_NTT_TABLES_H */
60 | 


--------------------------------------------------------------------------------
/src/kat_mul1024.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | 
 3 |   Tancrede: I added to the repository Harvey’s NTT (with a file
 4 |   tools/precomputation-ntt-harvey.sage to explain how the constants
 5 |   are generated), and a new test test_poly.c which uses known values
 6 |   (a[i]*b[i]=c[i]) and verifies that INTT(NTT(a[i])*NTT(b[i])) == c[i]
 7 |   with Harvey’s NTT. I believe similar tests should be possible with
 8 |   the others NTTs in the repository, although they do not output
 9 |   numbers in [0, PARAM_Q) so the test should be adapted.
10 | 
11 | */
12 | 
13 | #include <stdint.h>
14 | #include <stdio.h>
15 | #include <stdlib.h>
16 | #include <inttypes.h>
17 | 
18 | #include "ntt1024.h"
19 | #include "data_poly1024.h"
20 | 
/*
 * Copy the 1024 coefficients of b into a
 */
static void copy_poly(int32_t a[1024], const int32_t b[1024]) {
  const int32_t *src = b;
  const int32_t *end = b + 1024;
  int32_t *dst = a;

  while (src != end) {
    *dst++ = *src++;
  }
}
28 | 
29 | static void test_mul_from_KAT_values(void (*f)(int32_t *, int32_t *, int32_t *)) {
30 |   int32_t ua[1024], ub[1024], uc[1024];
31 | 
32 |   for (int i = 0; i < REPETITIONS; i++) {
33 |     copy_poly(ua, a[i]);
34 |     copy_poly(ub, b[i]);
35 |     f(uc, ua, ub);
36 | 
37 |     for (int j = 0; j < 1024; j++) {
38 |       if (uc[j] != c[i][j]) {
39 | 	printf("\t Failure at round %d on coeff %d: %"PRIi32" != %"PRIi32".\n", i, j, uc[j], c[i][j]);
40 | 	exit(EXIT_FAILURE);
41 |       }
42 |     }
43 |   }
44 | 
45 |   printf("\t Success after %d tests\n", REPETITIONS);
46 | }
47 | 
48 | int main(void){
49 |   build_kat();
50 | 
51 |   printf("Testing ntt1024_product1 (KAT values)\n");
52 |   test_mul_from_KAT_values(ntt1024_product1);
53 | 
54 |   printf("\nTesting ntt1024_product2 (KAT values)\n");
55 |   test_mul_from_KAT_values(ntt1024_product2);
56 | 
57 |   printf("\nTesting ntt1024_product3 (KAT values)\n");
58 |   test_mul_from_KAT_values(ntt1024_product3);
59 | 
60 |   printf("\nTesting ntt1024_product4 (KAT values)\n");
61 |   test_mul_from_KAT_values(ntt1024_product4);
62 | 
63 |   printf("\nTesting ntt1024_product5 (KAT values)\n");
64 |   test_mul_from_KAT_values(ntt1024_product5);
65 | 
66 |   return 0;
67 | }
68 | 


--------------------------------------------------------------------------------
/src/test_mul1024.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | 
 3 |   Tancrede: I added to the repository Harvey’s NTT (with a file
 4 |   tools/precomputation-ntt-harvey.sage to explain how the constants
 5 |   are generated), and a new test test_poly.c which uses known values
 6 |   (a[i]*b[i]=c[i]) and verifies that INTT(NTT(a[i])*NTT(b[i])) == c[i]
 7 |   with Harvey’s NTT. I believe similar tests should be possible with
 8 |   the others NTTs in the repository, although they do not output
 9 |   numbers in [0, PARAM_Q) so the test should be adapted.
10 | 
11 | */
12 | 
13 | #include <stdint.h>
14 | #include <stdio.h>
15 | #include <stdlib.h>
16 | #include <inttypes.h>
17 | 
18 | #include "ntts1024.h"
19 | 
/*
 * Copy the 1024 coefficients of b into a
 */
static void copy_poly(int32_t a[1024], const int32_t b[1024]) {
  const int32_t *src = b;
  const int32_t *end = b + 1024;
  int32_t *dst = a;

  while (src != end) {
    *dst++ = *src++;
  }
}
27 | 
28 | static void test_mul_from_KAT_values(void (*f)(int32_t *, int32_t *, int32_t *)) {
29 |   int32_t ua[1024], ub[1024], uc[1024];
30 | 
31 |   // Include KAT vectors
32 |   #include "data_poly1024.c"
33 | 
34 |   for (int i = 0; i < REPETITIONS; i++) {
35 |     copy_poly(ua, a[i]);
36 |     copy_poly(ub, b[i]);
37 |     f(uc, ua, ub);
38 | 
39 |     for (int j = 0; j < 1024; j++) {
40 |       if (uc[j] != c[i][j]) {
41 | 	printf("\t Failure at round %d on coeff %d: %"PRIi32" != %"PRIi32".\n", i, j, uc[j], c[i][j]);
42 | 	exit(EXIT_FAILURE);
43 |       }
44 |     }
45 |   }
46 | 
47 |   printf("\t Success after %d tests\n", REPETITIONS);
48 | }
49 | 
50 | int main(void){
51 |   printf("\nTesting ntt1024_product1 (KAT values)\n");
52 |   test_mul_from_KAT_values(ntt1024_product1);
53 | 
54 |   printf("\nTesting ntt1024_product2 (KAT values)\n");
55 |   test_mul_from_KAT_values(ntt1024_product2);
56 | 
57 |   printf("\nTesting ntt1024_product3 (KAT values)\n");
58 |   test_mul_from_KAT_values(ntt1024_product3);
59 | 
60 |   printf("\nTesting ntt1024_product4 (KAT values)\n");
61 |   test_mul_from_KAT_values(ntt1024_product4);
62 | 
63 |   printf("\nTesting ntt1024_product5 (KAT values)\n");
64 |   test_mul_from_KAT_values(ntt1024_product5);
65 | 
66 |   return 0;
67 | }
68 | 


--------------------------------------------------------------------------------
/src/naive_ntt16.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289 and n=16.
 3 |  */
 4 | 
 5 | #include "naive_ntt16.h"
 6 | 
 7 | /*
 8 |  * Product of two polynomials
 9 |  */
10 | void naive_ntt16_product1(int32_t *c, int32_t *a, int32_t *b) {
11 |   mul_array16_naive(a, 16, ntt16_psi_powers, 12289);
12 |   naive_ntt16_ct_std2rev(a);
13 |   mul_array16_naive(b, 16, ntt16_psi_powers, 12289);
14 |   naive_ntt16_ct_std2rev(b);
15 |   mul_array_naive(c, 16, a, b, 12289);
16 |   naive_intt16_ct_rev2std(c);
17 |   mul_array16_naive(c, 16, ntt16_scaled_inv_psi_powers, 12289);
18 | }
19 | 
20 | void naive_ntt16_product2(int32_t *c, int32_t *a, int32_t *b) {
21 |   mul_array16_naive(a, 16, ntt16_psi_powers, 12289);
22 |   naive_ntt16_gs_std2rev(a);
23 |   mul_array16_naive(b, 16, ntt16_psi_powers, 12289);
24 |   naive_ntt16_gs_std2rev(b);
25 |   mul_array_naive(c, 16, a, b, 12289);
26 |   naive_intt16_ct_rev2std(c);
27 |   mul_array16_naive(c, 16, ntt16_scaled_inv_psi_powers, 12289);
28 | }
29 | 
30 | void naive_ntt16_product3(int32_t *c, int32_t *a, int32_t *b) {
31 |   mul_array16_naive(a, 16, ntt16_psi_powers, 12289);
32 |   naive_ntt16_ct_std2rev(a);
33 |   mul_array16_naive(b, 16, ntt16_psi_powers, 12289);
34 |   naive_ntt16_ct_std2rev(b);
35 |   mul_array_naive(c, 16, a, b, 12289);
36 |   naive_intt16_gs_rev2std(c);
37 |   mul_array16_naive(c, 16, ntt16_scaled_inv_psi_powers, 12289);
38 | }
39 | 
40 | void naive_ntt16_product4(int32_t *c, int32_t *a, int32_t *b) {
41 |   mul_array16_naive(a, 16, ntt16_psi_powers, 12289);
42 |   naive_ntt16_gs_std2rev(a);
43 |   mul_array16_naive(b, 16, ntt16_psi_powers, 12289);
44 |   naive_ntt16_gs_std2rev(b);
45 |   mul_array_naive(c, 16, a, b, 12289);
46 |   naive_intt16_gs_rev2std(c);
47 |   mul_array16_naive(c, 16, ntt16_scaled_inv_psi_powers, 12289);
48 | }
49 | 
50 | 
51 | /*
52 |  * Use combined mulntt then inttmul
53 |  */
54 | void naive_ntt16_product5(int32_t *c, int32_t *a, int32_t *b) {
55 |   naive_mulntt16_ct_std2rev(a);
56 |   naive_mulntt16_ct_std2rev(b);
57 |   mul_array_naive(c, 16, a, b, 12289);
58 |   naive_inttmul16_gs_rev2std(c);
59 |   scalar_mul_array_naive(c, 16, ntt16_inv_n, 12289); // divide by n
60 | }
61 | 


--------------------------------------------------------------------------------
/src/kat_mul1024_red.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | 
 3 |   Tancrede: I added to the repository Harvey’s NTT (with a file
 4 |   tools/precomputation-ntt-harvey.sage to explain how the constants
 5 |   are generated), and a new test test_poly.c which uses known values
 6 |   (a[i]*b[i]=c[i]) and verifies that INTT(NTT(a[i])*NTT(b[i])) == c[i]
 7 |   with Harvey’s NTT. I believe similar tests should be possible with
 8 |   the others NTTs in the repository, although they do not output
 9 |   numbers in [0, PARAM_Q) so the test should be adapted.
10 | 
11 | */
12 | 
13 | #include <stdint.h>
14 | #include <stdio.h>
15 | #include <stdlib.h>
16 | #include <inttypes.h>
17 | 
18 | #include "ntt_red1024.h"
19 | #include "data_poly1024.h"
20 | #include "sort.h"
21 | 
/*
 * Copy the 1024 coefficients of b into a
 */
static void copy_poly(int32_t a[1024], const int32_t b[1024]) {
  const int32_t *src = b;
  const int32_t *end = b + 1024;
  int32_t *dst = a;

  while (src != end) {
    *dst++ = *src++;
  }
}
29 | 
30 | static void test_mul_from_KAT_values(void (*f)(int32_t *, int32_t *, int32_t *)) {
31 |   int32_t ua[1024], ub[1024], uc[1024];
32 | 
33 |   for (int i = 0; i < REPETITIONS; i++) {
34 |     copy_poly(ua, a[i]);
35 |     copy_poly(ub, b[i]);
36 |     f(uc, ua, ub);
37 | 
38 |     for (int j = 0; j < 1024; j++) {
39 |       if (uc[j] != c[i][j]) {
40 | 	printf("\t Failure at round %d on coeff %d: %"PRIi32" != %"PRIi32".\n", i, j, uc[j], c[i][j]);
41 | 	exit(EXIT_FAILURE);
42 |       }
43 |     }
44 |   }
45 | 
46 |   printf("\t Success after %d tests\n", REPETITIONS);
47 | }
48 | 
49 | int main(void){
50 |   build_kat();
51 | 
52 |   printf("Testing ntt_red1024_product1 (KAT values)\n");
53 |   test_mul_from_KAT_values(ntt_red1024_product1);
54 | 
55 |   printf("\nTesting ntt_red1024_product2 (KAT values)\n");
56 |   test_mul_from_KAT_values(ntt_red1024_product2);
57 | 
58 |   printf("\nTesting ntt_red1024_product3 (KAT values)\n");
59 |   test_mul_from_KAT_values(ntt_red1024_product3);
60 | 
61 |   printf("\nTesting ntt_red1024_product4 (KAT values)\n");
62 |   test_mul_from_KAT_values(ntt_red1024_product4);
63 | 
64 |   printf("\nTesting ntt_red1024_product5 (KAT values)\n");
65 |   test_mul_from_KAT_values(ntt_red1024_product5);
66 | 
67 |   return 0;
68 | }
69 | 


--------------------------------------------------------------------------------
/src/naive_ntt256.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289 and n=256.
 3 |  */
 4 | 
 5 | #include "naive_ntt256.h"
 6 | 
 7 | /*
 8 |  * Product of two polynomials
 9 |  */
10 | void naive_ntt256_product1(int32_t *c, int32_t *a, int32_t *b) {
11 |   mul_array16_naive(a, 256, ntt256_psi_powers, 12289);
12 |   naive_ntt256_ct_std2rev(a);
13 |   mul_array16_naive(b, 256, ntt256_psi_powers, 12289);
14 |   naive_ntt256_ct_std2rev(b);
15 |   mul_array_naive(c, 256, a, b, 12289);
16 |   naive_intt256_ct_rev2std(c);
17 |   mul_array16_naive(c, 256, ntt256_scaled_inv_psi_powers, 12289);
18 | }
19 | 
20 | void naive_ntt256_product2(int32_t *c, int32_t *a, int32_t *b) {
21 |   mul_array16_naive(a, 256, ntt256_psi_powers, 12289);
22 |   naive_ntt256_gs_std2rev(a);
23 |   mul_array16_naive(b, 256, ntt256_psi_powers, 12289);
24 |   naive_ntt256_gs_std2rev(b);
25 |   mul_array_naive(c, 256, a, b, 12289);
26 |   naive_intt256_ct_rev2std(c);
27 |   mul_array16_naive(c, 256, ntt256_scaled_inv_psi_powers, 12289);
28 | }
29 | 
30 | void naive_ntt256_product3(int32_t *c, int32_t *a, int32_t *b) {
31 |   mul_array16_naive(a, 256, ntt256_psi_powers, 12289);
32 |   naive_ntt256_ct_std2rev(a);
33 |   mul_array16_naive(b, 256, ntt256_psi_powers, 12289);
34 |   naive_ntt256_ct_std2rev(b);
35 |   mul_array_naive(c, 256, a, b, 12289);
36 |   naive_intt256_gs_rev2std(c);
37 |   mul_array16_naive(c, 256, ntt256_scaled_inv_psi_powers, 12289);
38 | }
39 | 
40 | void naive_ntt256_product4(int32_t *c, int32_t *a, int32_t *b) {
41 |   mul_array16_naive(a, 256, ntt256_psi_powers, 12289);
42 |   naive_ntt256_gs_std2rev(a);
43 |   mul_array16_naive(b, 256, ntt256_psi_powers, 12289);
44 |   naive_ntt256_gs_std2rev(b);
45 |   mul_array_naive(c, 256, a, b, 12289);
46 |   naive_intt256_gs_rev2std(c);
47 |   mul_array16_naive(c, 256, ntt256_scaled_inv_psi_powers, 12289);
48 | }
49 | 
50 | 
51 | /*
52 |  * Use combined mulntt then inttmul
53 |  */
54 | void naive_ntt256_product5(int32_t *c, int32_t *a, int32_t *b) {
55 |   naive_mulntt256_ct_std2rev(a);
56 |   naive_mulntt256_ct_std2rev(b);
57 |   mul_array_naive(c, 256, a, b, 12289);
58 |   naive_inttmul256_gs_rev2std(c);
59 |   scalar_mul_array_naive(c, 256, ntt256_inv_n, 12289); // divide by n
60 | }
61 | 


--------------------------------------------------------------------------------
/src/naive_ntt512.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289 and n=512.
 3 |  */
 4 | 
 5 | #include "naive_ntt512.h"
 6 | 
 7 | /*
 8 |  * Product of two polynomials
 9 |  */
10 | void naive_ntt512_product1(int32_t *c, int32_t *a, int32_t *b) {
11 |   mul_array16_naive(a, 512, ntt512_psi_powers, 12289);
12 |   naive_ntt512_ct_std2rev(a);
13 |   mul_array16_naive(b, 512, ntt512_psi_powers, 12289);
14 |   naive_ntt512_ct_std2rev(b);
15 |   mul_array_naive(c, 512, a, b, 12289);
16 |   naive_intt512_ct_rev2std(c);
17 |   mul_array16_naive(c, 512, ntt512_scaled_inv_psi_powers, 12289);
18 | }
19 | 
20 | void naive_ntt512_product2(int32_t *c, int32_t *a, int32_t *b) {
21 |   mul_array16_naive(a, 512, ntt512_psi_powers, 12289);
22 |   naive_ntt512_gs_std2rev(a);
23 |   mul_array16_naive(b, 512, ntt512_psi_powers, 12289);
24 |   naive_ntt512_gs_std2rev(b);
25 |   mul_array_naive(c, 512, a, b, 12289);
26 |   naive_intt512_ct_rev2std(c);
27 |   mul_array16_naive(c, 512, ntt512_scaled_inv_psi_powers, 12289);
28 | }
29 | 
30 | void naive_ntt512_product3(int32_t *c, int32_t *a, int32_t *b) {
31 |   mul_array16_naive(a, 512, ntt512_psi_powers, 12289);
32 |   naive_ntt512_ct_std2rev(a);
33 |   mul_array16_naive(b, 512, ntt512_psi_powers, 12289);
34 |   naive_ntt512_ct_std2rev(b);
35 |   mul_array_naive(c, 512, a, b, 12289);
36 |   naive_intt512_gs_rev2std(c);
37 |   mul_array16_naive(c, 512, ntt512_scaled_inv_psi_powers, 12289);
38 | }
39 | 
40 | void naive_ntt512_product4(int32_t *c, int32_t *a, int32_t *b) {
41 |   mul_array16_naive(a, 512, ntt512_psi_powers, 12289);
42 |   naive_ntt512_gs_std2rev(a);
43 |   mul_array16_naive(b, 512, ntt512_psi_powers, 12289);
44 |   naive_ntt512_gs_std2rev(b);
45 |   mul_array_naive(c, 512, a, b, 12289);
46 |   naive_intt512_gs_rev2std(c);
47 |   mul_array16_naive(c, 512, ntt512_scaled_inv_psi_powers, 12289);
48 | }
49 | 
50 | 
51 | /*
52 |  * Use combined mulntt then inttmul
53 |  */
54 | void naive_ntt512_product5(int32_t *c, int32_t *a, int32_t *b) {
55 |   naive_mulntt512_ct_std2rev(a);
56 |   naive_mulntt512_ct_std2rev(b);
57 |   mul_array_naive(c, 512, a, b, 12289);
58 |   naive_inttmul512_gs_rev2std(c);
59 |   scalar_mul_array_naive(c, 512, ntt512_inv_n, 12289); // divide by n
60 | }
61 | 


--------------------------------------------------------------------------------
/src/kat_mul1024_red_asm.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | 
 3 |   Tancrede: I added to the repository Harvey’s NTT (with a file
 4 |   tools/precomputation-ntt-harvey.sage to explain how the constants
 5 |   are generated), and a new test test_poly.c which uses known values
 6 |   (a[i]*b[i]=c[i]) and verifies that INTT(NTT(a[i])*NTT(b[i])) == c[i]
 7 |   with Harvey’s NTT. I believe similar tests should be possible with
 8 |   the other NTTs in the repository, although they do not output
 9 |   numbers in [0, PARAM_Q) so the test should be adapted.
10 | 
11 | */
12 | 
13 | #include <stdint.h>
14 | #include <stdio.h>
15 | #include <stdlib.h>
16 | #include <inttypes.h>
17 | 
18 | #include "ntt_red_asm1024.h"
19 | #include "data_poly1024.h"
20 | #include "sort.h"
21 | // Copy the 1024 coefficients of b into a, one element at a time.
22 | static void copy_poly(int32_t a[1024], const int32_t b[1024]) {
23 |   uint32_t i;
24 | 
25 |   for (i=0; i<1024; i++) {
26 |     a[i] = b[i]; // b is only read; a[0..1023] is overwritten
27 |   }
28 | }
29 | // Run product function f on each of the REPETITIONS stored (a, b) pairs and compare the output with the stored c.
30 | static void test_mul_from_KAT_values(void (*f)(int32_t *, int32_t *, int32_t *)) {
31 |   int32_t ua[1024], ub[1024], uc[1024]; // scratch buffers: f is given these, never the stored arrays themselves
32 | 
33 |   for (int i = 0; i < REPETITIONS; i++) {
34 |     copy_poly(ua, a[i]); // a, b, c and REPETITIONS are not defined in this file (presumably data_poly1024.h -- confirm)
35 |     copy_poly(ub, b[i]);
36 |     f(uc, ua, ub); // argument order is (result, input, input)
37 | 
38 |     for (int j = 0; j < 1024; j++) {
39 |       if (uc[j] != c[i][j]) { // exact equality with the stored expected coefficient
40 | 	printf("\t Failure at round %d on coeff %d: %"PRIi32" != %"PRIi32".\n", i, j, uc[j], c[i][j]);
41 | 	exit(EXIT_FAILURE); // the first mismatch terminates the whole program
42 |       }
43 |     }
44 |   }
45 | 
46 |   printf("\t Success after %d tests\n", REPETITIONS); // only reached when every round matched
47 | }
48 | // Entry point: call build_kat(), then check the five ntt_red1024_product*_asm variants in order.
49 | int main(void){
50 |   build_kat(); // defined outside this file; presumably fills the a/b/c arrays used by the helper -- confirm
51 | 
52 |   printf("Testing ntt_red1024_product1_asm (KAT values)\n");
53 |   test_mul_from_KAT_values(ntt_red1024_product1_asm);
54 | 
55 |   printf("\nTesting ntt_red1024_product2_asm (KAT values)\n");
56 |   test_mul_from_KAT_values(ntt_red1024_product2_asm);
57 | 
58 |   printf("\nTesting ntt_red1024_product3_asm (KAT values)\n");
59 |   test_mul_from_KAT_values(ntt_red1024_product3_asm);
60 | 
61 |   printf("\nTesting ntt_red1024_product4_asm (KAT values)\n");
62 |   test_mul_from_KAT_values(ntt_red1024_product4_asm);
63 | 
64 |   printf("\nTesting ntt_red1024_product5_asm (KAT values)\n");
65 |   test_mul_from_KAT_values(ntt_red1024_product5_asm);
66 | 
67 |   return 0; // reached only if every variant passed: a mismatch exits with EXIT_FAILURE inside the helper
68 | }
69 | 


--------------------------------------------------------------------------------
/src/naive_ntt1024.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289 and n=1024.
 3 |  */
 4 | 
 5 | #include "naive_ntt1024.h"
 6 | 
 7 | /*
 8 |  * Product of two polynomials; variant 1 uses ct_std2rev forward passes and the ct_rev2std inverse pass
 9 |  */
10 | void naive_ntt1024_product1(int32_t *c, int32_t *a, int32_t *b) {
11 |   mul_array16_naive(a, 1024, ntt1024_psi_powers, 12289); // a combined with the psi-powers table; 12289 is Q from the file header
12 |   naive_ntt1024_ct_std2rev(a); // forward pass on a (ct flavour); a is handed over non-const, so assume it is overwritten
13 |   mul_array16_naive(b, 1024, ntt1024_psi_powers, 12289); // same two steps for b
14 |   naive_ntt1024_ct_std2rev(b);
15 |   mul_array_naive(c, 1024, a, b, 12289); // c is produced from the two processed arrays (presumably element-wise product -- confirm)
16 |   naive_intt1024_ct_rev2std(c); // inverse pass on c (ct flavour)
17 |   mul_array16_naive(c, 1024, ntt1024_scaled_inv_psi_powers, 12289); // final pass over c with the scaled inverse-psi table
18 | }
19 | // Product variant 2 (size 1024): forward passes use gs_std2rev, the inverse pass uses ct_rev2std.
20 | void naive_ntt1024_product2(int32_t *c, int32_t *a, int32_t *b) {
21 |   mul_array16_naive(a, 1024, ntt1024_psi_powers, 12289); // a combined with the psi-powers table; 12289 is Q from the file header
22 |   naive_ntt1024_gs_std2rev(a); // forward pass on a (gs flavour); a is handed over non-const, so assume it is overwritten
23 |   mul_array16_naive(b, 1024, ntt1024_psi_powers, 12289); // same two steps for b
24 |   naive_ntt1024_gs_std2rev(b);
25 |   mul_array_naive(c, 1024, a, b, 12289); // c is produced from the two processed arrays (presumably element-wise product -- confirm)
26 |   naive_intt1024_ct_rev2std(c); // inverse pass on c (ct flavour)
27 |   mul_array16_naive(c, 1024, ntt1024_scaled_inv_psi_powers, 12289); // final pass over c with the scaled inverse-psi table
28 | }
29 | // Product variant 3 (size 1024): forward passes use ct_std2rev, the inverse pass uses gs_rev2std.
30 | void naive_ntt1024_product3(int32_t *c, int32_t *a, int32_t *b) {
31 |   mul_array16_naive(a, 1024, ntt1024_psi_powers, 12289); // a combined with the psi-powers table; 12289 is Q from the file header
32 |   naive_ntt1024_ct_std2rev(a); // forward pass on a (ct flavour); a is handed over non-const, so assume it is overwritten
33 |   mul_array16_naive(b, 1024, ntt1024_psi_powers, 12289); // same two steps for b
34 |   naive_ntt1024_ct_std2rev(b);
35 |   mul_array_naive(c, 1024, a, b, 12289); // c is produced from the two processed arrays (presumably element-wise product -- confirm)
36 |   naive_intt1024_gs_rev2std(c); // inverse pass on c (gs flavour)
37 |   mul_array16_naive(c, 1024, ntt1024_scaled_inv_psi_powers, 12289); // final pass over c with the scaled inverse-psi table
38 | }
39 | // Product variant 4 (size 1024): forward passes use gs_std2rev, the inverse pass uses gs_rev2std.
40 | void naive_ntt1024_product4(int32_t *c, int32_t *a, int32_t *b) {
41 |   mul_array16_naive(a, 1024, ntt1024_psi_powers, 12289); // a combined with the psi-powers table; 12289 is Q from the file header
42 |   naive_ntt1024_gs_std2rev(a); // forward pass on a (gs flavour); a is handed over non-const, so assume it is overwritten
43 |   mul_array16_naive(b, 1024, ntt1024_psi_powers, 12289); // same two steps for b
44 |   naive_ntt1024_gs_std2rev(b);
45 |   mul_array_naive(c, 1024, a, b, 12289); // c is produced from the two processed arrays (presumably element-wise product -- confirm)
46 |   naive_intt1024_gs_rev2std(c); // inverse pass on c (gs flavour)
47 |   mul_array16_naive(c, 1024, ntt1024_scaled_inv_psi_powers, 12289); // final pass over c with the scaled inverse-psi table
48 | }
49 | 
50 | 
51 | /*
52 |  * Use combined mulntt then inttmul, followed by a scalar pass with ntt1024_inv_n (size 1024)
53 |  */
54 | void naive_ntt1024_product5(int32_t *c, int32_t *a, int32_t *b) {
55 |   naive_mulntt1024_ct_std2rev(a); // one combined call replaces the separate psi-powers pass + forward pass of variants 1-4
56 |   naive_mulntt1024_ct_std2rev(b);
57 |   mul_array_naive(c, 1024, a, b, 12289); // c is produced from the two processed arrays (presumably element-wise product -- confirm)
58 |   naive_inttmul1024_gs_rev2std(c); // combined inverse call on c
59 |   scalar_mul_array_naive(c, 1024, ntt1024_inv_n, 12289); // divide by n
60 | }
61 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/verify_all:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Rebuild and install the harness bitcode, then time verify.sh on all eight harnesses: first plain, then with --inline.
 3 | VERIFY=./verify.sh
 4 | make realclean # NOTE(review): no exit-status checks below -- a failed make step does not stop the verification runs
 5 | make all
 6 | make install INSTALL_DIR=./bitcode # the runs below read their inputs from ./bitcode
 7 | 
 8 | echo "=== Verifying harness_intt_red1024.all_linked.bc ==="
 9 | time $VERIFY ./bitcode/harness_intt_red1024.all_linked.bc # `time` reports how long each verification takes
10 | echo "=== Verifying harness_intt_red1024b.all_linked.bc ==="
11 | time $VERIFY ./bitcode/harness_intt_red1024b.all_linked.bc
12 | echo "=== Verifying harness_ntt_red1024.all_linked.bc ==="
13 | time $VERIFY ./bitcode/harness_ntt_red1024.all_linked.bc
14 | echo "=== Verifying harness_ntt_red1024b.all_linked.bc ==="
15 | time $VERIFY ./bitcode/harness_ntt_red1024b.all_linked.bc
16 | echo "=== Verifying harness_ntt_red1024c.all_linked.bc ==="
17 | time $VERIFY ./bitcode/harness_ntt_red1024c.all_linked.bc
18 | echo "=== Verifying harness_ntt_red1024d.all_linked.bc ==="
19 | time $VERIFY ./bitcode/harness_ntt_red1024d.all_linked.bc
20 | echo "=== Verifying harness_ntt_red1024e.all_linked.bc ==="
21 | time $VERIFY ./bitcode/harness_ntt_red1024e.all_linked.bc
22 | echo "=== Verifying harness_ntt_red1024f.all_linked.bc ==="
23 | time $VERIFY ./bitcode/harness_ntt_red1024f.all_linked.bc
24 | # Second pass: the same eight harnesses in the same order, with --inline passed to verify.sh.
25 | echo "=== Verifying harness_intt_red1024.all_linked.bc with inlining ==="
26 | time $VERIFY --inline ./bitcode/harness_intt_red1024.all_linked.bc
27 | echo "=== Verifying harness_intt_red1024b.all_linked.bc with inlining ==="
28 | time $VERIFY --inline ./bitcode/harness_intt_red1024b.all_linked.bc
29 | echo "=== Verifying harness_ntt_red1024.all_linked.bc with inlining ==="
30 | time $VERIFY --inline ./bitcode/harness_ntt_red1024.all_linked.bc
31 | echo "=== Verifying harness_ntt_red1024b.all_linked.bc with inlining ==="
32 | time $VERIFY --inline ./bitcode/harness_ntt_red1024b.all_linked.bc
33 | echo "=== Verifying harness_ntt_red1024c.all_linked.bc with inlining ==="
34 | time $VERIFY --inline ./bitcode/harness_ntt_red1024c.all_linked.bc
35 | echo "=== Verifying harness_ntt_red1024d.all_linked.bc with inlining ==="
36 | time $VERIFY --inline ./bitcode/harness_ntt_red1024d.all_linked.bc
37 | echo "=== Verifying harness_ntt_red1024e.all_linked.bc with inlining ==="
38 | time $VERIFY --inline ./bitcode/harness_ntt_red1024e.all_linked.bc
39 | echo "=== Verifying harness_ntt_red1024f.all_linked.bc with inlining ==="
40 | time $VERIFY --inline ./bitcode/harness_ntt_red1024f.all_linked.bc
41 | 


--------------------------------------------------------------------------------
/src/bitrev256_table.c:
--------------------------------------------------------------------------------
 1 | #include "bitrev256_table.h"
 2 | // Index pairs {i, j} with i < j; in the entries spot-checked, j is i with its 8 bits reversed (e.g. 1 <-> 128, 3 <-> 192).
 3 | const uint16_t bitrev256[BITREV256_NPAIRS][2] = { // 120 pairs are listed below; indices that would map to themselves do not appear
 4 |     {     1,   128 }, {     2,    64 }, {     3,   192 }, {     4,    32 },
 5 |     {     5,   160 }, {     6,    96 }, {     7,   224 }, {     8,    16 },
 6 |     {     9,   144 }, {    10,    80 }, {    11,   208 }, {    12,    48 },
 7 |     {    13,   176 }, {    14,   112 }, {    15,   240 }, {    17,   136 },
 8 |     {    18,    72 }, {    19,   200 }, {    20,    40 }, {    21,   168 },
 9 |     {    22,   104 }, {    23,   232 }, {    25,   152 }, {    26,    88 },
10 |     {    27,   216 }, {    28,    56 }, {    29,   184 }, {    30,   120 },
11 |     {    31,   248 }, {    33,   132 }, {    34,    68 }, {    35,   196 },
12 |     {    37,   164 }, {    38,   100 }, {    39,   228 }, {    41,   148 },
13 |     {    42,    84 }, {    43,   212 }, {    44,    52 }, {    45,   180 },
14 |     {    46,   116 }, {    47,   244 }, {    49,   140 }, {    50,    76 },
15 |     {    51,   204 }, {    53,   172 }, {    54,   108 }, {    55,   236 },
16 |     {    57,   156 }, {    58,    92 }, {    59,   220 }, {    61,   188 },
17 |     {    62,   124 }, {    63,   252 }, {    65,   130 }, {    67,   194 },
18 |     {    69,   162 }, {    70,    98 }, {    71,   226 }, {    73,   146 },
19 |     {    74,    82 }, {    75,   210 }, {    77,   178 }, {    78,   114 },
20 |     {    79,   242 }, {    81,   138 }, {    83,   202 }, {    85,   170 },
21 |     {    86,   106 }, {    87,   234 }, {    89,   154 }, {    91,   218 },
22 |     {    93,   186 }, {    94,   122 }, {    95,   250 }, {    97,   134 },
23 |     {    99,   198 }, {   101,   166 }, {   103,   230 }, {   105,   150 },
24 |     {   107,   214 }, {   109,   182 }, {   110,   118 }, {   111,   246 },
25 |     {   113,   142 }, {   115,   206 }, {   117,   174 }, {   119,   238 },
26 |     {   121,   158 }, {   123,   222 }, {   125,   190 }, {   127,   254 },
27 |     {   131,   193 }, {   133,   161 }, {   135,   225 }, {   137,   145 },
28 |     {   139,   209 }, {   141,   177 }, {   143,   241 }, {   147,   201 },
29 |     {   149,   169 }, {   151,   233 }, {   155,   217 }, {   157,   185 },
30 |     {   159,   249 }, {   163,   197 }, {   167,   229 }, {   171,   213 },
31 |     {   173,   181 }, {   175,   245 }, {   179,   205 }, {   183,   237 },
32 |     {   187,   221 }, {   191,   253 }, {   199,   227 }, {   203,   211 },
33 |     {   207,   243 }, {   215,   235 }, {   223,   251 }, {   239,   247 },
34 | }; // presumably consumed as swap pairs by a bit-reversal shuffle -- confirm against the users of this table
35 | 
36 | 


--------------------------------------------------------------------------------
/src/intervals.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Interval abstract domain
 3 |  * - we represent intervals as pairs of 64bit signed numbers,
 4 |  *   which should be safe for our NTT
 5 |  */
 6 | 
 7 | #ifndef INTERVALS_H
 8 | #define INTERVALS_H
 9 | 
10 | #include <stdint.h>
11 | 
12 | typedef struct interval_s { // one interval [min, max] of 64-bit signed values
13 |   int64_t min; // lower bound (the "l" of the [l, h] notation used in the comments of this header)
14 |   int64_t max; // upper bound (the "h" of that notation)
15 | } interval_t;
16 | 
17 | 
18 | /*
19 |  * Constructors: all allocate and return a pointer to an interval
20 |  * structure.
21 |  */
22 | extern interval_t *point(int64_t x);
23 | extern interval_t *interval(int64_t min, int64_t max);
24 | 
25 | /*
26 |  * Destructor: just calls free
27 |  */
28 | extern void delete_interval(interval_t *a);
29 | 
30 | /*
31 |  * Basic operations
32 |  */
33 | extern interval_t *add(const interval_t *a, const interval_t *b);
34 | extern interval_t *sub(const interval_t *a, const interval_t *b);
35 | extern interval_t *neg(const interval_t *a);
36 | 
37 | /*
38 |  * Reductions
39 |  * - red(a) = [l, h] such that l <= red(x) <= h for any x in a
40 |  * - red_mul(a, b) = [l, h] such that l <= red(x * y) <= h for any x in a and y in b.
41 |  * - red_scale(k, a) = [l, h] such that l <= red(x * k) <= h for any x in a
42 |  * - red_twice(a) = [l, h] such that l <= red(red(x)) <= h for x in a.
43 |  */
44 | extern interval_t *red(const interval_t *a);
45 | extern interval_t *red_mul(const interval_t *a, const interval_t *b);
46 | extern interval_t *red_scale(int64_t k, const interval_t *a);
47 | extern interval_t *red_twice(const interval_t *a);
48 | 
49 | /*
50 |  * Reduction modulo q: [l, h] such that l <= x % q <= h whenever x is in a.
51 |  * The modulo operation returns an integer between 0 and  q-1 here.
52 |  * - q is 12289.
53 |  */
54 | extern interval_t *normal(const interval_t *a);
55 | 
56 | /*
57 |  * Multiply by inverse(3) then reduce modulo q
58 |  */
59 | extern interval_t *normal_inv3(const interval_t *a);
60 | 
61 | /*
62 |  * Shift representation: a must be a sub-interval of [0 .. q-1]
63 |  * - returns [l, h] such that l <= shift(x) <= h where
64 |  *   shift(x) = x if 0 <= x <= (q-1)/2
65 |  *   shift(x) = x - q if (q-1)/2 < x <= q-1
66 |  */
67 | extern interval_t *shift(const interval_t *a);
68 | 
69 | 
70 | /*
71 |  * Correct: assume x is in the interval [-q, 2*q-1] then
72 |  * correct(x) is if (x<0) then x+q elsif (x >= q) then x-q else x.
73 |  * So correct(x) is in the interval [0 .. q-1]
74 |  *
75 |  * Interval a must be a subinterval of [-q, 2q-1]
76 |  * Correct(a) returns [l, h] such that l <= correct(x) <= h for x in a.
77 |  */
78 | extern interval_t *correct(const interval_t *a);
79 | 
80 | 
81 | #endif /* INTERVALS_H */
82 | 


--------------------------------------------------------------------------------
/src/ntt256.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289 and n=256
 3 |  */
 4 | 
 5 | #ifndef __NTT256_H
 6 | #define __NTT256_H
 7 | 
 8 | #include "ntt256_tables.h"
 9 | #include "ntt.h"
10 | 
11 | /*
12 |  * NTT VARIANTS
13 |  *
14 |  * - the input a is an array of n integers that must be between 0 and Q-1
15 |  * - the result is stored in place
16 |  * - the inverse transforms return a result scaled by n:
17 |  *    we have intt(ntt(a)) = n * a
18 |  */
19 | // forward
20 | static inline void ntt256_ct_rev2std(int32_t *a) { // forward transform, n = 256; result is stored in place in a
21 |   ntt_ct_rev2std(a, 256, ntt256_omega_powers); // size-generic ct_rev2std routine with the forward omega-powers table
22 | }
23 | 
24 | static inline void ntt256_gs_rev2std(int32_t *a) { // forward transform, gs_rev2std flavour, n = 256, in place
25 |   ntt_gs_rev2std(a, 256, ntt256_omega_powers_rev); // note the _rev table, unlike ntt256_ct_rev2std
26 | }
27 | 
28 | static inline void ntt256_ct_std2rev(int32_t *a) { // forward transform, ct_std2rev flavour, n = 256, in place
29 |   ntt_ct_std2rev(a, 256, ntt256_omega_powers_rev); // uses the _rev table, like ntt256_gs_rev2std
30 | }
31 | 
32 | static inline void ntt256_gs_std2rev(int32_t *a) { // forward transform, gs_std2rev flavour, n = 256, in place
33 |   ntt_gs_std2rev(a, 256, ntt256_omega_powers); // uses the plain (non-_rev) table, like ntt256_ct_rev2std
34 | }
35 | 
36 | // inverse
37 | static inline void intt256_ct_rev2std(int32_t *a) { // inverse transform, n = 256, in place; per the header note the result is scaled by n
38 |   ntt_ct_rev2std(a, 256, ntt256_inv_omega_powers); // same generic routine as ntt256_ct_rev2std; only the table (inverse omega powers) differs
39 | }
40 | 
41 | static inline void intt256_gs_rev2std(int32_t *a) { // inverse transform, gs_rev2std flavour, n = 256, in place, scaled by n
42 |   ntt_gs_rev2std(a, 256, ntt256_inv_omega_powers_rev); // generic gs_rev2std routine with the _rev inverse table
43 | }
44 | 
45 | static inline void intt256_ct_std2rev(int32_t *a) { // inverse transform, ct_std2rev flavour, n = 256, in place, scaled by n
46 |   ntt_ct_std2rev(a, 256, ntt256_inv_omega_powers_rev); // generic ct_std2rev routine with the _rev inverse table
47 | }
48 | 
49 | static inline void intt256_gs_std2rev(int32_t *a) { // inverse transform, gs_std2rev flavour, n = 256, in place, scaled by n
50 |   ntt_gs_std2rev(a, 256, ntt256_inv_omega_powers); // generic gs_std2rev routine with the plain inverse table
51 | }
52 | 
53 | // multiplication by powers of psi then forward ntt
54 | static inline void mulntt256_ct_rev2std(int32_t *a) { // psi-powers multiplication fused with the forward transform, n = 256
55 |   mulntt_ct_rev2std(a, 256, ntt256_mixed_powers); // combined routine; takes the mixed-powers table instead of an omega table
56 | }
57 | 
58 | static inline void mulntt256_ct_std2rev(int32_t *a) { // psi-powers multiplication fused with the forward transform, std2rev flavour, n = 256
59 |   mulntt_ct_std2rev(a, 256, ntt256_mixed_powers_rev); // combined routine with the _rev mixed-powers table
60 | }
61 | 
62 | // inverse ntt then multiplication by powers of psi^-1
63 | static inline void inttmul256_gs_rev2std(int32_t *a) { // inverse transform fused with multiplication by powers of psi^-1, n = 256
64 |   nttmul_gs_rev2std(a, 256, ntt256_inv_mixed_powers_rev); // generic routine is spelled nttmul_* (no leading 'i'); the inverse presumably comes from the inv_ table
65 | }
66 | 
67 | static inline void inttmul256_gs_std2rev(int32_t *a) { // inverse transform fused with multiplication by powers of psi^-1, std2rev flavour, n = 256
68 |   nttmul_gs_std2rev(a, 256, ntt256_inv_mixed_powers); // generic nttmul_* routine with the plain inverse mixed-powers table
69 | }
70 | 
71 | 
72 | /*
73 |  * PRODUCTS
74 |  */
75 | 
76 | /*
77 |  * Input: two arrays a and b in standard order
78 |  * Result: 
79 |  * - the product is stored in array c, in standard order.
80 |  * - arrays a and b are modified
81 |  *
82 |  * The input arrays must contain elements in the range [0 .. Q-1]
83 |  * The result is also in that range.
84 |  */
85 | extern void ntt256_product1(int32_t *c, int32_t *a, int32_t *b);
86 | extern void ntt256_product2(int32_t *c, int32_t *a, int32_t *b);
87 | extern void ntt256_product3(int32_t *c, int32_t *a, int32_t *b);
88 | extern void ntt256_product4(int32_t *c, int32_t *a, int32_t *b);
89 | extern void ntt256_product5(int32_t *c, int32_t *a, int32_t *b);
90 | extern void ntt256_product6(int32_t *c, int32_t *a, int32_t *b);
91 | 
92 | #endif /* __NTT256_H */
93 | 


--------------------------------------------------------------------------------
/src/ntt512.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289 and n=512
 3 |  */
 4 | 
 5 | #ifndef __NTT512_H
 6 | #define __NTT512_H
 7 | 
 8 | #include "ntt512_tables.h"
 9 | #include "ntt.h"
10 | 
11 | /*
12 |  * NTT VARIANTS
13 |  *
14 |  * - the input a is an array of n integers that must be between 0 and Q-1
15 |  * - the result is stored in place
16 |  * - the inverse transforms return a result scaled by n:
17 |  *    we have intt(ntt(a)) = n * a
18 |  */
19 | // forward
20 | static inline void ntt512_ct_rev2std(int32_t *a) { // forward transform, n = 512; result is stored in place in a
21 |   ntt_ct_rev2std(a, 512, ntt512_omega_powers); // size-generic ct_rev2std routine with the forward omega-powers table
22 | }
23 | 
24 | static inline void ntt512_gs_rev2std(int32_t *a) { // forward transform, gs_rev2std flavour, n = 512, in place
25 |   ntt_gs_rev2std(a, 512, ntt512_omega_powers_rev); // note the _rev table, unlike ntt512_ct_rev2std
26 | }
27 | 
28 | static inline void ntt512_ct_std2rev(int32_t *a) { // forward transform, ct_std2rev flavour, n = 512, in place
29 |   ntt_ct_std2rev(a, 512, ntt512_omega_powers_rev); // uses the _rev table, like ntt512_gs_rev2std
30 | }
31 | 
32 | static inline void ntt512_gs_std2rev(int32_t *a) { // forward transform, gs_std2rev flavour, n = 512, in place
33 |   ntt_gs_std2rev(a, 512, ntt512_omega_powers); // uses the plain (non-_rev) table, like ntt512_ct_rev2std
34 | }
35 | 
36 | // inverse
37 | static inline void intt512_ct_rev2std(int32_t *a) { // inverse transform, n = 512, in place; per the header note the result is scaled by n
38 |   ntt_ct_rev2std(a, 512, ntt512_inv_omega_powers); // same generic routine as ntt512_ct_rev2std; only the table (inverse omega powers) differs
39 | }
40 | 
41 | static inline void intt512_gs_rev2std(int32_t *a) { // inverse transform, gs_rev2std flavour, n = 512, in place, scaled by n
42 |   ntt_gs_rev2std(a, 512, ntt512_inv_omega_powers_rev); // generic gs_rev2std routine with the _rev inverse table
43 | }
44 | 
45 | static inline void intt512_ct_std2rev(int32_t *a) { // inverse transform, ct_std2rev flavour, n = 512, in place, scaled by n
46 |   ntt_ct_std2rev(a, 512, ntt512_inv_omega_powers_rev); // generic ct_std2rev routine with the _rev inverse table
47 | }
48 | 
49 | static inline void intt512_gs_std2rev(int32_t *a) { // inverse transform, gs_std2rev flavour, n = 512, in place, scaled by n
50 |   ntt_gs_std2rev(a, 512, ntt512_inv_omega_powers); // generic gs_std2rev routine with the plain inverse table
51 | }
52 | 
53 | // multiplication by powers of psi then forward ntt
54 | static inline void mulntt512_ct_rev2std(int32_t *a) { // psi-powers multiplication fused with the forward transform, n = 512
55 |   mulntt_ct_rev2std(a, 512, ntt512_mixed_powers); // combined routine; takes the mixed-powers table instead of an omega table
56 | }
57 | 
58 | static inline void mulntt512_ct_std2rev(int32_t *a) { // psi-powers multiplication fused with the forward transform, std2rev flavour, n = 512
59 |   mulntt_ct_std2rev(a, 512, ntt512_mixed_powers_rev); // combined routine with the _rev mixed-powers table
60 | }
61 | 
62 | // inverse ntt then multiplication by powers of psi^-1
63 | static inline void inttmul512_gs_rev2std(int32_t *a) { // inverse transform fused with multiplication by powers of psi^-1, n = 512
64 |   nttmul_gs_rev2std(a, 512, ntt512_inv_mixed_powers_rev); // generic routine is spelled nttmul_* (no leading 'i'); the inverse presumably comes from the inv_ table
65 | }
66 | 
67 | static inline void inttmul512_gs_std2rev(int32_t *a) { // inverse transform fused with multiplication by powers of psi^-1, std2rev flavour, n = 512
68 |   nttmul_gs_std2rev(a, 512, ntt512_inv_mixed_powers); // generic nttmul_* routine with the plain inverse mixed-powers table
69 | }
70 | 
71 | 
72 | /*
73 |  * PRODUCTS
74 |  */
75 | 
76 | /*
77 |  * Input: two arrays a and b in standard order
78 |  * Result: 
79 |  * - the product is stored in array c, in standard order.
80 |  * - arrays a and b are modified
81 |  *
82 |  * The input arrays must contain elements in the range [0 .. Q-1]
83 |  * The result is also in that range.
84 |  */
85 | extern void ntt512_product1(int32_t *c, int32_t *a, int32_t *b);
86 | extern void ntt512_product2(int32_t *c, int32_t *a, int32_t *b);
87 | extern void ntt512_product3(int32_t *c, int32_t *a, int32_t *b);
88 | extern void ntt512_product4(int32_t *c, int32_t *a, int32_t *b);
89 | extern void ntt512_product5(int32_t *c, int32_t *a, int32_t *b);
90 | extern void ntt512_product6(int32_t *c, int32_t *a, int32_t *b);
91 | 
92 | #endif /* __NTT512_H */
93 | 


--------------------------------------------------------------------------------
/src/ntt1024.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289 and n=1024
 3 |  */
 4 | 
 5 | #ifndef __NTT1024_H
 6 | #define __NTT1024_H
 7 | 
 8 | #include "ntt1024_tables.h"
 9 | #include "ntt.h"
10 | 
11 | 
12 | /*
13 |  * NTT VARIANTS
14 |  *
15 |  * - the input a is an array of n integers that must be between 0 and Q-1
16 |  * - the result is stored in place
17 |  * - the inverse transforms return a result scaled by n:
18 |  *    we have intt(ntt(a)) = n * a
19 |  */
20 | // forward
21 | static inline void ntt1024_ct_rev2std(int32_t *a) { // forward transform, n = 1024; result is stored in place in a
22 |   ntt_ct_rev2std(a, 1024, ntt1024_omega_powers); // size-generic ct_rev2std routine with the forward omega-powers table
23 | }
24 | 
25 | static inline void ntt1024_gs_rev2std(int32_t *a) { // forward transform, gs_rev2std flavour, n = 1024, in place
26 |   ntt_gs_rev2std(a, 1024, ntt1024_omega_powers_rev); // note the _rev table, unlike ntt1024_ct_rev2std
27 | }
28 | 
29 | static inline void ntt1024_ct_std2rev(int32_t *a) { // forward transform, ct_std2rev flavour, n = 1024, in place
30 |   ntt_ct_std2rev(a, 1024, ntt1024_omega_powers_rev); // uses the _rev table, like ntt1024_gs_rev2std
31 | }
32 | 
33 | static inline void ntt1024_gs_std2rev(int32_t *a) { // forward transform, gs_std2rev flavour, n = 1024, in place
34 |   ntt_gs_std2rev(a, 1024, ntt1024_omega_powers); // uses the plain (non-_rev) table, like ntt1024_ct_rev2std
35 | }
36 | 
37 | // inverse
38 | static inline void intt1024_ct_rev2std(int32_t *a) { // inverse transform, n = 1024, in place; per the header note the result is scaled by n
39 |   ntt_ct_rev2std(a, 1024, ntt1024_inv_omega_powers); // same generic routine as ntt1024_ct_rev2std; only the table (inverse omega powers) differs
40 | }
41 | 
42 | static inline void intt1024_gs_rev2std(int32_t *a) { // inverse transform, gs_rev2std flavour, n = 1024, in place, scaled by n
43 |   ntt_gs_rev2std(a, 1024, ntt1024_inv_omega_powers_rev); // generic gs_rev2std routine with the _rev inverse table
44 | }
45 | 
46 | static inline void intt1024_ct_std2rev(int32_t *a) { // inverse transform, ct_std2rev flavour, n = 1024, in place, scaled by n
47 |   ntt_ct_std2rev(a, 1024, ntt1024_inv_omega_powers_rev); // generic ct_std2rev routine with the _rev inverse table
48 | }
49 | 
50 | static inline void intt1024_gs_std2rev(int32_t *a) { // inverse transform, gs_std2rev flavour, n = 1024, in place, scaled by n
51 |   ntt_gs_std2rev(a, 1024, ntt1024_inv_omega_powers); // generic gs_std2rev routine with the plain inverse table
52 | }
53 | 
54 | // multiplication by powers of psi then forward ntt
55 | static inline void mulntt1024_ct_rev2std(int32_t *a) { // psi-powers multiplication fused with the forward transform, n = 1024
56 |   mulntt_ct_rev2std(a, 1024, ntt1024_mixed_powers); // combined routine; takes the mixed-powers table instead of an omega table
57 | }
58 | 
59 | static inline void mulntt1024_ct_std2rev(int32_t *a) { // psi-powers multiplication fused with the forward transform, std2rev flavour, n = 1024
60 |   mulntt_ct_std2rev(a, 1024, ntt1024_mixed_powers_rev); // combined routine with the _rev mixed-powers table
61 | }
62 | 
63 | // inverse ntt then multiplication by powers of psi^-1
64 | static inline void inttmul1024_gs_rev2std(int32_t *a) { // inverse transform fused with multiplication by powers of psi^-1, n = 1024
65 |   nttmul_gs_rev2std(a, 1024, ntt1024_inv_mixed_powers_rev); // generic routine is spelled nttmul_* (no leading 'i'); the inverse presumably comes from the inv_ table
66 | }
67 | 
68 | static inline void inttmul1024_gs_std2rev(int32_t *a) { // inverse transform fused with multiplication by powers of psi^-1, std2rev flavour, n = 1024
69 |   nttmul_gs_std2rev(a, 1024, ntt1024_inv_mixed_powers); // generic nttmul_* routine with the plain inverse mixed-powers table
70 | }
71 | 
72 | 
73 | /*
74 |  * PRODUCTS
75 |  */
76 | 
77 | /*
78 |  * Input: two arrays a and b in standard order
79 |  * Result: 
80 |  * - the product is stored in array c, in standard order.
81 |  * - arrays a and b are modified
82 |  *
83 |  * The input arrays must contain elements in the range [0 .. Q-1]
84 |  * The result is also in that range.
85 |  */
86 | extern void ntt1024_product1(int32_t *c, int32_t *a, int32_t *b);
87 | extern void ntt1024_product2(int32_t *c, int32_t *a, int32_t *b);
88 | extern void ntt1024_product3(int32_t *c, int32_t *a, int32_t *b);
89 | extern void ntt1024_product4(int32_t *c, int32_t *a, int32_t *b);
90 | extern void ntt1024_product5(int32_t *c, int32_t *a, int32_t *b);
91 | extern void ntt1024_product6(int32_t *c, int32_t *a, int32_t *b);
92 | 
93 | #endif /* __NTT1024_H */
94 | 


--------------------------------------------------------------------------------
/src/ntt_red16.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289, n=16, using the Longa/Naehrig reduction method.
 3 |  */
 4 | 
 5 | #ifndef __NTT_RED16_H
 6 | #define __NTT_RED16_H
 7 | 
 8 | #include "ntt_red16_tables.h"
 9 | #include "ntt_red.h"
10 | 
11 | /*
12 |  * NTT Variants: as in ntt_red.h
13 |  * using tables from ntt_red16_tables.h
14 |  *
15 |  * Input: a[i] for i=0 .. 15 is expected to satisfy
16 |  *   -21499 <= a[i] <= 21499
17 |  *
18 |  * The result is stored in a, it is not reduced modulo Q.
19 |  */
20 | // forward NTTs
21 | static inline void ntt_red16_ct_rev2std(int32_t *a) { // forward transform, n = 16; result stays in a and is not reduced modulo Q (header note)
22 |   ntt_red_ct_rev2std(a, 16, ntt_red16_omega_powers); // size-generic ntt_red routine with the forward omega-powers table
23 | }
24 | 
25 | static inline void ntt_red16_gs_rev2std(int32_t *a) { // forward transform, gs_rev2std flavour, n = 16; result stays in a, not reduced modulo Q
26 |   ntt_red_gs_rev2std(a, 16, ntt_red16_omega_powers_rev); // note the _rev table, unlike ntt_red16_ct_rev2std
27 | }
28 | 
29 | static inline void ntt_red16_ct_std2rev(int32_t *a) { // forward transform, ct_std2rev flavour, n = 16; result stays in a, not reduced modulo Q
30 |   ntt_red_ct_std2rev(a, 16, ntt_red16_omega_powers_rev); // uses the _rev table, like ntt_red16_gs_rev2std
31 | }
32 | 
33 | static inline void ntt_red16_gs_std2rev(int32_t *a) { // forward transform, gs_std2rev flavour, n = 16; result stays in a, not reduced modulo Q
34 |   ntt_red_gs_std2rev(a, 16, ntt_red16_omega_powers); // uses the plain (non-_rev) table, like ntt_red16_ct_rev2std
35 | }
36 | 
37 | // inverse
38 | static inline void intt_red16_ct_rev2std(int32_t *a) { // inverse transform, n = 16; result stays in a and is not reduced modulo Q (header note)
39 |   ntt_red_ct_rev2std(a, 16, ntt_red16_inv_omega_powers); // same generic routine as ntt_red16_ct_rev2std; only the table (inverse omega powers) differs
40 | }
41 | 
42 | static inline void intt_red16_gs_rev2std(int32_t *a) { // inverse transform, gs_rev2std flavour, n = 16; result stays in a, not reduced modulo Q
43 |   ntt_red_gs_rev2std(a, 16, ntt_red16_inv_omega_powers_rev); // generic gs_rev2std routine with the _rev inverse table
44 | }
45 | 
46 | static inline void intt_red16_ct_std2rev(int32_t *a) { // inverse transform, ct_std2rev flavour, n = 16; result stays in a, not reduced modulo Q
47 |   ntt_red_ct_std2rev(a, 16, ntt_red16_inv_omega_powers_rev); // generic ct_std2rev routine with the _rev inverse table
48 | }
49 | 
50 | static inline void intt_red16_gs_std2rev(int32_t *a) { // inverse transform, gs_std2rev flavour, n = 16; result stays in a, not reduced modulo Q
51 |   ntt_red_gs_std2rev(a, 16, ntt_red16_inv_omega_powers); // generic gs_std2rev routine with the plain inverse table
52 | }
53 | 
54 | // multiplication by powers of psi then forward ntt
55 | static inline void mulntt_red16_ct_rev2std(int32_t *a) { // psi-powers multiplication fused with the forward transform, n = 16
56 |   mulntt_red_ct_rev2std(a, 16, ntt_red16_mixed_powers); // combined routine; takes the mixed-powers table instead of an omega table
57 | }
58 | 
59 | static inline void mulntt_red16_ct_std2rev(int32_t *a) { // psi-powers multiplication fused with the forward transform, std2rev flavour, n = 16
60 |   mulntt_red_ct_std2rev(a, 16, ntt_red16_mixed_powers_rev); // combined routine with the _rev mixed-powers table
61 | }
62 | 
63 | // inverse ntt then multiplication by powers of psi^-1
64 | static inline void inttmul_red16_gs_rev2std(int32_t *a) { // inverse transform fused with multiplication by powers of psi^-1, n = 16
65 |   nttmul_red_gs_rev2std(a, 16, ntt_red16_inv_mixed_powers_rev); // generic routine is spelled nttmul_* (no leading 'i'); the inverse presumably comes from the inv_ table
66 | }
67 | 
68 | static inline void inttmul_red16_gs_std2rev(int32_t *a) { // inverse transform fused with multiplication by powers of psi^-1, std2rev flavour, n = 16
69 |   nttmul_red_gs_std2rev(a, 16, ntt_red16_inv_mixed_powers); // generic nttmul_* routine with the plain inverse mixed-powers table
70 | }
71 | 
72 | 
73 | /*
74 |  * PRODUCTS
75 |  */
76 | 
77 | /*
78 |  * Input: two arrays a and b in standard order
79 |  *
80 |  * Result: 
81 |  * - the product is stored in array c, in standard order.
82 |  * - arrays a and b are modified
83 |  *
84 |  * The input arrays must contain elements in the range [0, Q-1]
85 |  * The result is also in that range.
86 |  */
87 | extern void ntt_red16_product1(int32_t *c, int32_t *a, int32_t *b);
88 | extern void ntt_red16_product2(int32_t *c, int32_t *a, int32_t *b);
89 | extern void ntt_red16_product3(int32_t *c, int32_t *a, int32_t *b);
90 | extern void ntt_red16_product4(int32_t *c, int32_t *a, int32_t *b);
91 | extern void ntt_red16_product5(int32_t *c, int32_t *a, int32_t *b);
92 | 
93 | #endif /* __NTT_RED16_H */
94 | 


--------------------------------------------------------------------------------
/src/ntt_red256.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289, n=256, using the Longa/Naehrig reduction method.
 3 |  */
 4 | 
 5 | #ifndef __NTT_RED256_H
 6 | #define __NTT_RED256_H
 7 | 
 8 | #include "ntt_red256_tables.h"
 9 | #include "ntt_red.h"
10 | 
11 | /*
12 |  * NTT Variants: as in ntt_red.h
13 |  * using tables from ntt_red256_tables.h
14 |  *
15 |  * Input: a[i] for i=0 .. 255 is expected to satisfy
16 |  *   -21499 <= a[i] <= 21499
17 |  *
18 |  * The result is stored in a, it is not reduced modulo Q.
19 |  */
20 | // forward NTTs
21 | static inline void ntt_red256_ct_rev2std(int32_t *a) { // forward transform, n = 256; result stays in a and is not reduced modulo Q (header note)
22 |   ntt_red_ct_rev2std(a, 256, ntt_red256_omega_powers); // size-generic ntt_red routine with the forward omega-powers table
23 | }
24 | 
25 | static inline void ntt_red256_gs_rev2std(int32_t *a) { // forward transform, gs_rev2std flavour, n = 256; result stays in a, not reduced modulo Q
26 |   ntt_red_gs_rev2std(a, 256, ntt_red256_omega_powers_rev); // note the _rev table, unlike ntt_red256_ct_rev2std
27 | }
28 | 
29 | static inline void ntt_red256_ct_std2rev(int32_t *a) { // forward transform, ct_std2rev flavour, n = 256; result stays in a, not reduced modulo Q
30 |   ntt_red_ct_std2rev(a, 256, ntt_red256_omega_powers_rev); // uses the _rev table, like ntt_red256_gs_rev2std
31 | }
32 | 
33 | static inline void ntt_red256_gs_std2rev(int32_t *a) { // forward transform, gs_std2rev flavour, n = 256; result stays in a, not reduced modulo Q
34 |   ntt_red_gs_std2rev(a, 256, ntt_red256_omega_powers); // uses the plain (non-_rev) table, like ntt_red256_ct_rev2std
35 | }
36 | 
37 | // inverse
38 | static inline void intt_red256_ct_rev2std(int32_t *a) { // inverse transform, n = 256; result stays in a and is not reduced modulo Q (header note)
39 |   ntt_red_ct_rev2std(a, 256, ntt_red256_inv_omega_powers); // same generic routine as ntt_red256_ct_rev2std; only the table (inverse omega powers) differs
40 | }
41 | 
42 | static inline void intt_red256_gs_rev2std(int32_t *a) { // inverse transform, gs_rev2std flavour, n = 256; result stays in a, not reduced modulo Q
43 |   ntt_red_gs_rev2std(a, 256, ntt_red256_inv_omega_powers_rev); // generic gs_rev2std routine with the _rev inverse table
44 | }
45 | 
46 | static inline void intt_red256_ct_std2rev(int32_t *a) { // inverse transform, ct_std2rev flavour, n = 256; result stays in a, not reduced modulo Q
47 |   ntt_red_ct_std2rev(a, 256, ntt_red256_inv_omega_powers_rev); // generic ct_std2rev routine with the _rev inverse table
48 | }
49 | 
50 | static inline void intt_red256_gs_std2rev(int32_t *a) { // inverse transform, gs_std2rev flavour, n = 256; result stays in a, not reduced modulo Q
51 |   ntt_red_gs_std2rev(a, 256, ntt_red256_inv_omega_powers); // generic gs_std2rev routine with the plain inverse table
52 | }
53 | 
54 | // multiplication by powers of psi then forward ntt
55 | static inline void mulntt_red256_ct_rev2std(int32_t *a) { // psi-powers multiplication fused with the forward transform, n = 256
56 |   mulntt_red_ct_rev2std(a, 256, ntt_red256_mixed_powers); // combined routine; takes the mixed-powers table instead of an omega table
57 | }
58 | 
59 | static inline void mulntt_red256_ct_std2rev(int32_t *a) { // psi-powers multiplication fused with the forward transform, std2rev flavour, n = 256
60 |   mulntt_red_ct_std2rev(a, 256, ntt_red256_mixed_powers_rev); // combined routine with the _rev mixed-powers table
61 | }
62 | 
63 | // inverse ntt then multiplication by powers of psi^-1
64 | static inline void inttmul_red256_gs_rev2std(int32_t *a) { // inverse transform fused with multiplication by powers of psi^-1, n = 256
65 |   nttmul_red_gs_rev2std(a, 256, ntt_red256_inv_mixed_powers_rev); // generic routine is spelled nttmul_* (no leading 'i'); the inverse presumably comes from the inv_ table
66 | }
67 | 
68 | static inline void inttmul_red256_gs_std2rev(int32_t *a) { // inverse transform fused with multiplication by powers of psi^-1, std2rev flavour, n = 256
69 |   nttmul_red_gs_std2rev(a, 256, ntt_red256_inv_mixed_powers); // generic nttmul_* routine with the plain inverse mixed-powers table
70 | }
71 | 
72 | 
73 | /*
74 |  * PRODUCTS
75 |  */
76 | 
77 | /*
78 |  * Input: two arrays a and b in standard order
79 |  *
80 |  * Result: 
81 |  * - the product is stored in array c, in standard order.
82 |  * - arrays a and b are modified
83 |  *
84 |  * The input arrays must contain elements in the range [0, Q-1]
85 |  * The result is also in that range.
86 |  */
87 | extern void ntt_red256_product1(int32_t *c, int32_t *a, int32_t *b);
88 | extern void ntt_red256_product2(int32_t *c, int32_t *a, int32_t *b);
89 | extern void ntt_red256_product3(int32_t *c, int32_t *a, int32_t *b);
90 | extern void ntt_red256_product4(int32_t *c, int32_t *a, int32_t *b);
91 | extern void ntt_red256_product5(int32_t *c, int32_t *a, int32_t *b);
92 | 
93 | #endif /* __NTT_RED256_H */
94 | 


--------------------------------------------------------------------------------
/src/ntt_red512.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289, n=512, using the Longa/Naehrig reduction method.
 3 |  */
 4 | 
 5 | #ifndef __NTT_RED512_H
 6 | #define __NTT_RED512_H
 7 | 
 8 | #include "ntt_red512_tables.h"
 9 | #include "ntt_red.h"
10 | 
11 | /*
12 |  * NTT Variants: as in ntt_red.h
13 |  * using tables from ntt_red512_tables.h
14 |  *
15 |  * Input: a[i] for i=0 .. 511 is expected to satisfy
16 |  *   -21499 <= a[i] <= 21499
17 |  *
18 |  * The result is stored in a, it is not reduced modulo Q.
19 |  */
20 | // forward NTTs
21 | static inline void ntt_red512_ct_rev2std(int32_t *a) { // forward transform, n = 512; result stays in a and is not reduced modulo Q (header note)
22 |   ntt_red_ct_rev2std(a, 512, ntt_red512_omega_powers); // size-generic ntt_red routine with the forward omega-powers table
23 | }
24 | 
25 | static inline void ntt_red512_gs_rev2std(int32_t *a) { // forward transform, gs_rev2std flavour, n = 512; result stays in a, not reduced modulo Q
26 |   ntt_red_gs_rev2std(a, 512, ntt_red512_omega_powers_rev); // note the _rev table, unlike ntt_red512_ct_rev2std
27 | }
28 | 
29 | static inline void ntt_red512_ct_std2rev(int32_t *a) { // forward transform, ct_std2rev flavour, n = 512; result stays in a, not reduced modulo Q
30 |   ntt_red_ct_std2rev(a, 512, ntt_red512_omega_powers_rev); // uses the _rev table, like ntt_red512_gs_rev2std
31 | }
32 | 
33 | static inline void ntt_red512_gs_std2rev(int32_t *a) { // forward transform, gs_std2rev flavour, n = 512; result stays in a, not reduced modulo Q
34 |   ntt_red_gs_std2rev(a, 512, ntt_red512_omega_powers); // uses the plain (non-_rev) table, like ntt_red512_ct_rev2std
35 | }
36 | 
37 | // inverse
38 | static inline void intt_red512_ct_rev2std(int32_t *a) { // inverse transform, n = 512; result stays in a and is not reduced modulo Q (header note)
39 |   ntt_red_ct_rev2std(a, 512, ntt_red512_inv_omega_powers); // same generic routine as ntt_red512_ct_rev2std; only the table (inverse omega powers) differs
40 | }
41 | 
42 | static inline void intt_red512_gs_rev2std(int32_t *a) { // inverse transform, gs_rev2std flavour, n = 512; result stays in a, not reduced modulo Q
43 |   ntt_red_gs_rev2std(a, 512, ntt_red512_inv_omega_powers_rev); // generic gs_rev2std routine with the _rev inverse table
44 | }
45 | 
46 | static inline void intt_red512_ct_std2rev(int32_t *a) { // inverse transform, ct_std2rev flavour, n = 512; result stays in a, not reduced modulo Q
47 |   ntt_red_ct_std2rev(a, 512, ntt_red512_inv_omega_powers_rev); // generic ct_std2rev routine with the _rev inverse table
48 | }
49 | 
50 | static inline void intt_red512_gs_std2rev(int32_t *a) { // inverse transform, gs_std2rev flavour, n = 512; result stays in a, not reduced modulo Q
51 |   ntt_red_gs_std2rev(a, 512, ntt_red512_inv_omega_powers); // generic gs_std2rev routine with the plain inverse table
52 | }
53 | 
54 | // multiplication by powers of psi then forward ntt
55 | static inline void mulntt_red512_ct_rev2std(int32_t *a) { // psi-powers multiplication fused with the forward transform, n = 512
56 |   mulntt_red_ct_rev2std(a, 512, ntt_red512_mixed_powers); // combined routine; takes the mixed-powers table instead of an omega table
57 | }
58 | 
59 | static inline void mulntt_red512_ct_std2rev(int32_t *a) { // psi-powers multiplication fused with the forward transform, std2rev flavour, n = 512
60 |   mulntt_red_ct_std2rev(a, 512, ntt_red512_mixed_powers_rev); // combined routine with the _rev mixed-powers table
61 | }
62 | 
63 | // inverse ntt then multiplication by powers of psi^-1
64 | static inline void inttmul_red512_gs_rev2std(int32_t *a) { // inverse transform fused with multiplication by powers of psi^-1, n = 512
65 |   nttmul_red_gs_rev2std(a, 512, ntt_red512_inv_mixed_powers_rev); // generic routine is spelled nttmul_* (no leading 'i'); the inverse presumably comes from the inv_ table
66 | }
67 | 
68 | static inline void inttmul_red512_gs_std2rev(int32_t *a) { // inverse transform fused with multiplication by powers of psi^-1, std2rev flavour, n = 512
69 |   nttmul_red_gs_std2rev(a, 512, ntt_red512_inv_mixed_powers); // generic nttmul_* routine with the plain inverse mixed-powers table
70 | }
71 | 
72 | 
73 | /*
74 |  * PRODUCTS
75 |  */
76 | 
77 | /*
78 |  * Input: two arrays a and b in standard order
79 |  *
80 |  * Result: 
81 |  * - the product is stored in array c, in standard order.
82 |  * - arrays a and b are modified
83 |  *
84 |  * The input arrays must contain elements in the range [0, Q-1]
85 |  * The result is also in that range.
86 |  */
87 | extern void ntt_red512_product1(int32_t *c, int32_t *a, int32_t *b);
88 | extern void ntt_red512_product2(int32_t *c, int32_t *a, int32_t *b);
89 | extern void ntt_red512_product3(int32_t *c, int32_t *a, int32_t *b);
90 | extern void ntt_red512_product4(int32_t *c, int32_t *a, int32_t *b);
91 | extern void ntt_red512_product5(int32_t *c, int32_t *a, int32_t *b);
92 | 
93 | #endif /* __NTT_RED512_H */
94 | 


--------------------------------------------------------------------------------
/src/speed_mul1024.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <inttypes.h>
  5 | 
  6 | #include "ntt1024.h"
  7 | #include "sort.h"
  8 | 
  9 | /*
 10 |  * PERFORMANCE MEASUREMENTS
 11 |  */
 12 | 
 13 | /*
 14 |  * For speed measurements: counter of CPU cycles, read from the time-stamp counter (x86-64 inline asm only)
 15 |  */
 16 | static inline uint64_t cpucycles(void) {
 17 |   uint64_t result;
 18 |   __asm__ volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax" // bytes 0x0F 0x31 encode RDTSC; then rax = (rdx << 32) | rax
 19 |     : "=a" (result) ::  "%rdx"); // the value is returned in rax; rdx is declared as clobbered
 20 |   return result; // no serializing instruction is issued around the read
 21 | }
 22 | 
 23 | #define NTESTS 102400
 24 | 
 25 | static uint64_t t[NTESTS];
 26 | 
 27 | // Average run time: mean of the NTESTS entries of t[], computed with truncating integer division
 28 | static uint64_t average_time(void) {
 29 |   uint64_t s;
 30 |   uint32_t i;
 31 | 
 32 |   s = 0;
 33 |   for (i=0; i<NTESTS; i++) {
 34 |     s += t[i]; // running sum of all samples
 35 |   }
 36 |   return s/NTESTS;
 37 | }
 38 | 
 39 | // Median: sorts t[] in place, checks the ordering, and returns the element at index NTESTS/2
 40 | static uint64_t median_time(void) {
 41 |   uint32_t i;
 42 | 
 43 |   sort(t, NTESTS); // sort() is presumably declared in sort.h; t[] is reordered as a side effect
 44 |   for (i=1; i<NTESTS; i++) { // sanity check: the sorted array must be non-decreasing
 45 |     if (t[i] < t[i-1]) {
 46 |       fprintf(stderr, "BUG in sort\n");
 47 |       exit(1);
 48 |     }
 49 |   }
 50 | 
 51 |   return t[NTESTS/2]; // NTESTS is even, so this is the upper of the two middle samples
 52 | }
 53 | 
 54 | static void print_results(const char *s, uint64_t c) { // s: label printed first; c: counter value read after the last timed call
 55 |   uint32_t i;
 56 | // On entry t[i] holds the counter value read just before call i; the loop below turns these into per-call differences in place.
 57 |   for(i=0 ;i<NTESTS-1; i++) {
 58 |     t[i] = t[i+1] - t[i];
 59 |   }
 60 |   t[i] = c - t[i]; // here i == NTESTS-1: the last difference uses the final reading c
 61 | 
 62 |   printf("%s\n", s);
 63 |   printf("median: %"PRIu64"\n", median_time()); // median_time() sorts t[]; the sum used for the average does not depend on order
 64 |   printf("average: %"PRIu64"\n", average_time());
 65 |   printf("\n");
 66 | }
 67 | 
 68 | static void test_mul(void) { // benchmark driver: times ntt1024_product1..5, NTESTS calls each
 69 |   int32_t a[1024], b[1024], c[1024];
 70 |   uint32_t i;
 71 | // a and b are reset to a[i] = b[i] = i before each variant only, not between the NTESTS timed calls.
 72 |   for (i=0; i<1024; i++) {
 73 |     a[i] = i;
 74 |     b[i] = i;
 75 |   }
 76 |   // NOTE(review): if the product routines modify a and b in place, every call after the first sees altered inputs - confirm this is intended
 77 |   for (i=0; i<NTESTS; i++) {
 78 |     t[i] = cpucycles(); // counter value just before call i
 79 |     ntt1024_product1(c, a, b);
 80 |   }
 81 |   print_results("ntt1024_product1 ", cpucycles()); // the final reading closes the last interval
 82 | 
 83 |   for (i=0; i<1024; i++) {
 84 |     a[i] = i;
 85 |     b[i] = i;
 86 |   }
 87 |   
 88 |   for (i=0; i<NTESTS; i++) {
 89 |     t[i] = cpucycles();
 90 |     ntt1024_product2(c, a, b);
 91 |   }
 92 |   print_results("ntt1024_product2 ", cpucycles());
 93 | 
 94 |   for (i=0; i<1024; i++) {
 95 |     a[i] = i;
 96 |     b[i] = i;
 97 |   }
 98 |   
 99 |   for (i=0; i<NTESTS; i++) {
100 |     t[i] = cpucycles();
101 |     ntt1024_product3(c, a, b);
102 |   }
103 |   print_results("ntt1024_product3 ", cpucycles());
104 | 
105 |   for (i=0; i<1024; i++) {
106 |     a[i] = i;
107 |     b[i] = i;
108 |   }
109 |   
110 |   for (i=0; i<NTESTS; i++) {
111 |     t[i] = cpucycles();
112 |     ntt1024_product4(c, a, b);
113 |   }
114 |   print_results("ntt1024_product4 ", cpucycles());
115 | 
116 |   for (i=0; i<1024; i++) {
117 |     a[i] = i;
118 |     b[i] = i;
119 |   }
120 |   
121 |   for (i=0; i<NTESTS; i++) {
122 |     t[i] = cpucycles();
123 |     ntt1024_product5(c, a, b);
124 |   }
125 |   print_results("ntt1024_product5 ", cpucycles());
126 | }
127 | 
128 | int main(void){ // entry point: print a banner, run the benchmark, return 0
129 |   printf("Testing ntt1024 product functions\n\n");
130 |   test_mul();
131 |   return 0;
132 | }
133 | 


--------------------------------------------------------------------------------
/src/ntt_red1024.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289, n=1024, using the Longa/Naehrig reduction method.
 3 |  */
 4 | 
 5 | #ifndef __NTT_RED1024_H
 6 | #define __NTT_RED1024_H
 7 | 
 8 | #include "ntt_red1024_tables.h"
 9 | #include "ntt_red.h"
10 | 
11 | /*
12 |  * NTT Variants: as in ntt_red.h
13 |  * using tables from ntt_red1024_tables.h
14 |  *
15 |  * Input: a[i] for i=0 .. 1023 is expected to satisfy
16 |  *   -21499 <= a[i] <= 21499
17 |  *
18 |  * The result is stored in a, it is not reduced modulo Q.
19 |  */
20 | // forward NTTs: thin wrappers that fix n=1024 and pass an omega-power table to the generic ntt_red_* routine; a is transformed in place (ct/gs and rev2std/std2rev presumably name the butterfly scheme and the input/output ordering - confirm in ntt_red.h)
21 | static inline void ntt_red1024_ct_rev2std(int32_t *a) {
22 |   ntt_red_ct_rev2std(a, 1024, ntt_red1024_omega_powers);
23 | }
24 | 
25 | static inline void ntt_red1024_gs_rev2std(int32_t *a) {
26 |   ntt_red_gs_rev2std(a, 1024, ntt_red1024_omega_powers_rev); // gs_rev2std and ct_std2rev take the _rev table
27 | }
28 | 
29 | static inline void ntt_red1024_ct_std2rev(int32_t *a) {
30 |   ntt_red_ct_std2rev(a, 1024, ntt_red1024_omega_powers_rev);
31 | }
32 | 
33 | static inline void ntt_red1024_gs_std2rev(int32_t *a) {
34 |   ntt_red_gs_std2rev(a, 1024, ntt_red1024_omega_powers); // ct_rev2std and gs_std2rev take the plain table
35 | }
36 | 
37 | // inverse: the same four generic routines, called with the inv_omega_powers tables
38 | static inline void intt_red1024_ct_rev2std(int32_t *a) {
39 |   ntt_red_ct_rev2std(a, 1024, ntt_red1024_inv_omega_powers);
40 | }
41 | 
42 | static inline void intt_red1024_gs_rev2std(int32_t *a) {
43 |   ntt_red_gs_rev2std(a, 1024, ntt_red1024_inv_omega_powers_rev);
44 | }
45 | 
46 | static inline void intt_red1024_ct_std2rev(int32_t *a) {
47 |   ntt_red_ct_std2rev(a, 1024, ntt_red1024_inv_omega_powers_rev);
48 | }
49 | 
50 | static inline void intt_red1024_gs_std2rev(int32_t *a) {
51 |   ntt_red_gs_std2rev(a, 1024, ntt_red1024_inv_omega_powers);
52 | }
53 | 
54 | // multiplication by powers of psi then forward ntt (n=1024, mixed_powers tables)
55 | static inline void mulntt_red1024_ct_rev2std(int32_t *a) {
56 |   mulntt_red_ct_rev2std(a, 1024, ntt_red1024_mixed_powers);
57 | }
58 | 
59 | static inline void mulntt_red1024_ct_std2rev(int32_t *a) {
60 |   mulntt_red_ct_std2rev(a, 1024, ntt_red1024_mixed_powers_rev);
61 | }
62 | 
63 | // inverse ntt then multiplication by powers of psi^-1 (n=1024, inv_mixed_powers tables; the generic routines called are named nttmul_red_gs_*)
64 | static inline void inttmul_red1024_gs_rev2std(int32_t *a) {
65 |   nttmul_red_gs_rev2std(a, 1024, ntt_red1024_inv_mixed_powers_rev);
66 | }
67 | 
68 | static inline void inttmul_red1024_gs_std2rev(int32_t *a) {
69 |   nttmul_red_gs_std2rev(a, 1024, ntt_red1024_inv_mixed_powers);
70 | }
71 | 
72 | 
73 | /*
74 |  * PRODUCTS
75 |  */
76 | 
77 | /*
78 |  * Input: two arrays a and b in standard order
79 |  *
80 |  * Result: 
81 |  * - the product is stored in array c, in standard order.
82 |  * - arrays a and b are modified
83 |  *
84 |  * The input arrays must contain elements in the range [0, Q-1]
85 |  * The result is also in that range.
86 |  */
87 | extern void ntt_red1024_product1(int32_t *c, int32_t *a, int32_t *b);
88 | extern void ntt_red1024_product2(int32_t *c, int32_t *a, int32_t *b);
89 | extern void ntt_red1024_product3(int32_t *c, int32_t *a, int32_t *b);
90 | extern void ntt_red1024_product4(int32_t *c, int32_t *a, int32_t *b);
91 | extern void ntt_red1024_product5(int32_t *c, int32_t *a, int32_t *b);
92 | 
93 | #endif /* __NTT_RED1024_H */
94 | 


--------------------------------------------------------------------------------
/src/speed_mul1024_red.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <inttypes.h>
  5 | 
  6 | #include "ntt_red1024.h"
  7 | #include "sort.h"
  8 | 
  9 | /*
 10 |  * PERFORMANCE MEASUREMENTS
 11 |  */
 12 | 
 13 | /*
 14 |  * For speed measurements: counter of CPU cycles, read from the time-stamp counter (x86-64 inline asm only)
 15 |  */
 16 | static inline uint64_t cpucycles(void) {
 17 |   uint64_t result;
 18 |   __asm__ volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax" // bytes 0x0F 0x31 encode RDTSC; then rax = (rdx << 32) | rax
 19 |     : "=a" (result) ::  "%rdx"); // the value is returned in rax; rdx is declared as clobbered
 20 |   return result; // no serializing instruction is issued around the read
 21 | }
 22 | 
 23 | #define NTESTS 102400
 24 | 
 25 | static uint64_t t[NTESTS];
 26 | 
 27 | // Average run time: mean of the NTESTS entries of t[], computed with truncating integer division
 28 | static uint64_t average_time(void) {
 29 |   uint64_t s;
 30 |   uint32_t i;
 31 | 
 32 |   s = 0;
 33 |   for (i=0; i<NTESTS; i++) {
 34 |     s += t[i]; // running sum of all samples
 35 |   }
 36 |   return s/NTESTS;
 37 | }
 38 | 
 39 | // Median: sorts t[] in place, checks the ordering, and returns the element at index NTESTS/2
 40 | static uint64_t median_time(void) {
 41 |   uint32_t i;
 42 | 
 43 |   sort(t, NTESTS); // sort() is presumably declared in sort.h; t[] is reordered as a side effect
 44 |   for (i=1; i<NTESTS; i++) { // sanity check: the sorted array must be non-decreasing
 45 |     if (t[i] < t[i-1]) {
 46 |       fprintf(stderr, "BUG in sort\n");
 47 |       exit(1);
 48 |     }
 49 |   }
 50 | 
 51 |   return t[NTESTS/2]; // NTESTS is even, so this is the upper of the two middle samples
 52 | }
 53 | 
 54 | static void print_results(const char *s, uint64_t c) { // s: label printed first; c: counter value read after the last timed call
 55 |   uint32_t i;
 56 | // On entry t[i] holds the counter value read just before call i; the loop below turns these into per-call differences in place.
 57 |   for(i=0 ;i<NTESTS-1; i++) {
 58 |     t[i] = t[i+1] - t[i];
 59 |   }
 60 |   t[i] = c - t[i]; // here i == NTESTS-1: the last difference uses the final reading c
 61 | 
 62 |   printf("%s\n", s);
 63 |   printf("median: %"PRIu64"\n", median_time()); // median_time() sorts t[]; the sum used for the average does not depend on order
 64 |   printf("average: %"PRIu64"\n", average_time());
 65 |   printf("\n");
 66 | }
 67 | 
 68 | static void test_mul(void) { // benchmark driver: times ntt_red1024_product1..5, NTESTS calls each
 69 |   int32_t a[1024], b[1024], c[1024];
 70 |   uint32_t i;
 71 | // a and b are reset to a[i] = b[i] = i before each variant only, not between the NTESTS timed calls.
 72 |   for (i=0; i<1024; i++) {
 73 |     a[i] = i;
 74 |     b[i] = i;
 75 |   }
 76 |   // NOTE(review): ntt_red1024.h says a and b are modified and must lie in [0, Q-1]; calls after the first reuse the modified arrays - confirm this is intended
 77 |   for (i=0; i<NTESTS; i++) {
 78 |     t[i] = cpucycles(); // counter value just before call i
 79 |     ntt_red1024_product1(c, a, b);
 80 |   }
 81 |   print_results("ntt_red1024_product1 ", cpucycles()); // the final reading closes the last interval
 82 | 
 83 |   for (i=0; i<1024; i++) {
 84 |     a[i] = i;
 85 |     b[i] = i;
 86 |   }
 87 |   
 88 |   for (i=0; i<NTESTS; i++) {
 89 |     t[i] = cpucycles();
 90 |     ntt_red1024_product2(c, a, b);
 91 |   }
 92 |   print_results("ntt_red1024_product2 ", cpucycles());
 93 | 
 94 |   for (i=0; i<1024; i++) {
 95 |     a[i] = i;
 96 |     b[i] = i;
 97 |   }
 98 |   
 99 |   for (i=0; i<NTESTS; i++) {
100 |     t[i] = cpucycles();
101 |     ntt_red1024_product3(c, a, b);
102 |   }
103 |   print_results("ntt_red1024_product3 ", cpucycles());
104 | 
105 |   for (i=0; i<1024; i++) {
106 |     a[i] = i;
107 |     b[i] = i;
108 |   }
109 |   
110 |   for (i=0; i<NTESTS; i++) {
111 |     t[i] = cpucycles();
112 |     ntt_red1024_product4(c, a, b);
113 |   }
114 |   print_results("ntt_red1024_product4 ", cpucycles());
115 | 
116 |   for (i=0; i<1024; i++) {
117 |     a[i] = i;
118 |     b[i] = i;
119 |   }
120 |   
121 |   for (i=0; i<NTESTS; i++) {
122 |     t[i] = cpucycles();
123 |     ntt_red1024_product5(c, a, b);
124 |   }
125 |   print_results("ntt_red1024_product5 ", cpucycles());
126 | }
127 | 
128 | int main(void){ // entry point: print a banner, run the benchmark, return 0
129 |   printf("Testing ntt_red1024 product functions\n\n");
130 |   test_mul();
131 |   return 0;
132 | }
133 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/ntt_red1024.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289, n=1024, using the Longa/Naehrig reduction method.
 3 |  */
 4 | 
 5 | #ifndef __NTT_RED1024_H
 6 | #define __NTT_RED1024_H
 7 | 
 8 | #include "ntt_red1024_tables.h"
 9 | #include "ntt_red.h"
10 | 
11 | /*
12 |  * NTT Variants: as in ntt_red.h
13 |  * using tables from ntt_red1024_tables.h
14 |  *
15 |  * Input: a[i] for i=0 .. 1023 is expected to satisfy
16 |  *   -21499 <= a[i] <= 21499
17 |  *
18 |  * The result is stored in a, it is not reduced modulo Q.
19 |  */
20 | // forward NTTs: thin wrappers that fix n=1024 and pass an omega-power table to the generic ntt_red_* routine; a is transformed in place (ct/gs and rev2std/std2rev presumably name the butterfly scheme and the input/output ordering - confirm in ntt_red.h)
21 | static inline void ntt_red1024_ct_rev2std(int32_t *a) {
22 |   ntt_red_ct_rev2std(a, 1024, ntt_red1024_omega_powers);
23 | }
24 | 
25 | static inline void ntt_red1024_gs_rev2std(int32_t *a) {
26 |   ntt_red_gs_rev2std(a, 1024, ntt_red1024_omega_powers_rev); // gs_rev2std and ct_std2rev take the _rev table
27 | }
28 | 
29 | static inline void ntt_red1024_ct_std2rev(int32_t *a) {
30 |   ntt_red_ct_std2rev(a, 1024, ntt_red1024_omega_powers_rev);
31 | }
32 | 
33 | static inline void ntt_red1024_gs_std2rev(int32_t *a) {
34 |   ntt_red_gs_std2rev(a, 1024, ntt_red1024_omega_powers); // ct_rev2std and gs_std2rev take the plain table
35 | }
36 | 
37 | // inverse: the same four generic routines, called with the inv_omega_powers tables
38 | static inline void intt_red1024_ct_rev2std(int32_t *a) {
39 |   ntt_red_ct_rev2std(a, 1024, ntt_red1024_inv_omega_powers);
40 | }
41 | 
42 | static inline void intt_red1024_gs_rev2std(int32_t *a) {
43 |   ntt_red_gs_rev2std(a, 1024, ntt_red1024_inv_omega_powers_rev);
44 | }
45 | 
46 | static inline void intt_red1024_ct_std2rev(int32_t *a) {
47 |   ntt_red_ct_std2rev(a, 1024, ntt_red1024_inv_omega_powers_rev);
48 | }
49 | 
50 | static inline void intt_red1024_gs_std2rev(int32_t *a) {
51 |   ntt_red_gs_std2rev(a, 1024, ntt_red1024_inv_omega_powers);
52 | }
53 | 
54 | // multiplication by powers of psi then forward ntt (n=1024, mixed_powers tables)
55 | static inline void mulntt_red1024_ct_rev2std(int32_t *a) {
56 |   mulntt_red_ct_rev2std(a, 1024, ntt_red1024_mixed_powers);
57 | }
58 | 
59 | static inline void mulntt_red1024_ct_std2rev(int32_t *a) {
60 |   mulntt_red_ct_std2rev(a, 1024, ntt_red1024_mixed_powers_rev);
61 | }
62 | 
63 | // inverse ntt then multiplication by powers of psi^-1 (n=1024, inv_mixed_powers tables; the generic routines called are named nttmul_red_gs_*)
64 | static inline void inttmul_red1024_gs_rev2std(int32_t *a) {
65 |   nttmul_red_gs_rev2std(a, 1024, ntt_red1024_inv_mixed_powers_rev);
66 | }
67 | 
68 | static inline void inttmul_red1024_gs_std2rev(int32_t *a) {
69 |   nttmul_red_gs_std2rev(a, 1024, ntt_red1024_inv_mixed_powers);
70 | }
71 | 
72 | 
73 | /*
74 |  * PRODUCTS
75 |  */
76 | 
77 | /*
78 |  * Input: two arrays a and b in standard order
79 |  *
80 |  * Result: 
81 |  * - the product is stored in array c, in standard order.
82 |  * - arrays a and b are modified
83 |  *
84 |  * The input arrays must contain elements in the range [0, Q-1]
85 |  * The result is also in that range.
86 |  */
87 | extern void ntt_red1024_product1(int32_t *c, int32_t *a, int32_t *b);
88 | extern void ntt_red1024_product2(int32_t *c, int32_t *a, int32_t *b);
89 | extern void ntt_red1024_product3(int32_t *c, int32_t *a, int32_t *b);
90 | extern void ntt_red1024_product4(int32_t *c, int32_t *a, int32_t *b);
91 | extern void ntt_red1024_product5(int32_t *c, int32_t *a, int32_t *b);
92 | 
93 | #endif /* __NTT_RED1024_H */
94 | 


--------------------------------------------------------------------------------
/src/ntt_red_asm16.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289, n=16, using the Longa/Naehrig reduction method.
 3 |  */
 4 | 
 5 | #ifndef __NTT_RED_ASM16_H
 6 | #define __NTT_RED_ASM16_H
 7 | 
 8 | #include "ntt_red16_tables.h"
 9 | #include "ntt_asm.h"
10 | 
11 | /*
12 |  * NTT Variants: as in ntt_asm.h
13 |  * using tables from ntt_red16_tables.h
14 |  *
15 |  * Input: a[i] for i=0 .. 15 is expected to satisfy
16 |  *   -21499 <= a[i] <= 21499
17 |  *
18 |  * The result is stored in a, it is not reduced modulo Q.
19 |  */
20 | // forward NTTs: thin wrappers that fix n=16 and pass an omega-power table to the generic ntt_red_*_asm routine; a is transformed in place (ct/gs and rev2std/std2rev presumably name the butterfly scheme and the input/output ordering - confirm in ntt_asm.h)
21 | static inline void ntt_red16_ct_rev2std_asm(int32_t *a) {
22 |   ntt_red_ct_rev2std_asm(a, 16, ntt_red16_omega_powers);
23 | }
24 | 
25 | static inline void ntt_red16_gs_rev2std_asm(int32_t *a) {
26 |   ntt_red_gs_rev2std_asm(a, 16, ntt_red16_omega_powers_rev); // gs_rev2std and ct_std2rev take the _rev table
27 | }
28 | 
29 | static inline void ntt_red16_ct_std2rev_asm(int32_t *a) {
30 |   ntt_red_ct_std2rev_asm(a, 16, ntt_red16_omega_powers_rev);
31 | }
32 | 
33 | static inline void ntt_red16_gs_std2rev_asm(int32_t *a) {
34 |   ntt_red_gs_std2rev_asm(a, 16, ntt_red16_omega_powers); // ct_rev2std and gs_std2rev take the plain table
35 | }
36 | 
37 | // inverse: the same four generic routines, called with the inv_omega_powers tables
38 | static inline void intt_red16_ct_rev2std_asm(int32_t *a) {
39 |   ntt_red_ct_rev2std_asm(a, 16, ntt_red16_inv_omega_powers);
40 | }
41 | 
42 | static inline void intt_red16_gs_rev2std_asm(int32_t *a) {
43 |   ntt_red_gs_rev2std_asm(a, 16, ntt_red16_inv_omega_powers_rev);
44 | }
45 | 
46 | static inline void intt_red16_ct_std2rev_asm(int32_t *a) {
47 |   ntt_red_ct_std2rev_asm(a, 16, ntt_red16_inv_omega_powers_rev);
48 | }
49 | 
50 | static inline void intt_red16_gs_std2rev_asm(int32_t *a) {
51 |   ntt_red_gs_std2rev_asm(a, 16, ntt_red16_inv_omega_powers);
52 | }
53 | 
54 | // multiplication by powers of psi then forward ntt (n=16, mixed_powers tables)
55 | static inline void mulntt_red16_ct_rev2std_asm(int32_t *a) {
56 |   mulntt_red_ct_rev2std_asm(a, 16, ntt_red16_mixed_powers);
57 | }
58 | 
59 | static inline void mulntt_red16_ct_std2rev_asm(int32_t *a) {
60 |   mulntt_red_ct_std2rev_asm(a, 16, ntt_red16_mixed_powers_rev);
61 | }
62 | 
63 | // inverse ntt then multiplication by powers of psi^-1 (n=16, inv_mixed_powers tables; the generic routines called are named nttmul_red_gs_*_asm)
64 | static inline void inttmul_red16_gs_rev2std_asm(int32_t *a) {
65 |   nttmul_red_gs_rev2std_asm(a, 16, ntt_red16_inv_mixed_powers_rev);
66 | }
67 | 
68 | static inline void inttmul_red16_gs_std2rev_asm(int32_t *a) {
69 |   nttmul_red_gs_std2rev_asm(a, 16, ntt_red16_inv_mixed_powers);
70 | }
71 | 
72 | 
73 | /*
74 |  * PRODUCTS
75 |  */
76 | 
77 | /*
78 |  * Input: two arrays a and b in standard order
79 |  *
80 |  * Result: 
81 |  * - the product is stored in array c, in standard order.
82 |  * - arrays a and b are modified
83 |  *
84 |  * The input arrays must contain elements in the range [0, Q-1]
85 |  * The result is also in that range.
86 |  */
87 | extern void ntt_red16_product1_asm(int32_t *c, int32_t *a, int32_t *b);
88 | extern void ntt_red16_product2_asm(int32_t *c, int32_t *a, int32_t *b);
89 | extern void ntt_red16_product3_asm(int32_t *c, int32_t *a, int32_t *b);
90 | extern void ntt_red16_product4_asm(int32_t *c, int32_t *a, int32_t *b);
91 | extern void ntt_red16_product5_asm(int32_t *c, int32_t *a, int32_t *b);
92 | 
93 | #endif /* __NTT_RED_ASM16_H */
94 | 


--------------------------------------------------------------------------------
/src/speed_mul1024_naive.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <inttypes.h>
  5 | 
  6 | #include "naive_ntt1024.h"
  7 | #include "sort.h"
  8 | 
  9 | /*
 10 |  * PERFORMANCE MEASUREMENTS
 11 |  */
 12 | 
 13 | /*
 14 |  * For speed measurements: counter of CPU cycles, read from the time-stamp counter (x86-64 inline asm only)
 15 |  */
 16 | static inline uint64_t cpucycles(void) {
 17 |   uint64_t result;
 18 |   __asm__ volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax" // bytes 0x0F 0x31 encode RDTSC; then rax = (rdx << 32) | rax
 19 |     : "=a" (result) ::  "%rdx"); // the value is returned in rax; rdx is declared as clobbered
 20 |   return result; // no serializing instruction is issued around the read
 21 | }
 22 | 
 23 | #define NTESTS 102400
 24 | 
 25 | static uint64_t t[NTESTS];
 26 | 
 27 | // Average run time: mean of the NTESTS entries of t[], computed with truncating integer division
 28 | static uint64_t average_time(void) {
 29 |   uint64_t s;
 30 |   uint32_t i;
 31 | 
 32 |   s = 0;
 33 |   for (i=0; i<NTESTS; i++) {
 34 |     s += t[i]; // running sum of all samples
 35 |   }
 36 |   return s/NTESTS;
 37 | }
 38 | 
 39 | // Median: sorts t[] in place, checks the ordering, and returns the element at index NTESTS/2
 40 | static uint64_t median_time(void) {
 41 |   uint32_t i;
 42 | 
 43 |   sort(t, NTESTS); // sort() is presumably declared in sort.h; t[] is reordered as a side effect
 44 |   for (i=1; i<NTESTS; i++) { // sanity check: the sorted array must be non-decreasing
 45 |     if (t[i] < t[i-1]) {
 46 |       fprintf(stderr, "BUG in sort\n");
 47 |       exit(1);
 48 |     }
 49 |   }
 50 | 
 51 |   return t[NTESTS/2]; // NTESTS is even, so this is the upper of the two middle samples
 52 | }
 53 | 
 54 | static void print_results(const char *s, uint64_t c) { // s: label printed first; c: counter value read after the last timed call
 55 |   uint32_t i;
 56 | // On entry t[i] holds the counter value read just before call i; the loop below turns these into per-call differences in place.
 57 |   for(i=0 ;i<NTESTS-1; i++) {
 58 |     t[i] = t[i+1] - t[i];
 59 |   }
 60 |   t[i] = c - t[i]; // here i == NTESTS-1: the last difference uses the final reading c
 61 | 
 62 |   printf("%s\n", s);
 63 |   printf("median: %"PRIu64"\n", median_time()); // median_time() sorts t[]; the sum used for the average does not depend on order
 64 |   printf("average: %"PRIu64"\n", average_time());
 65 |   printf("\n");
 66 | }
 67 | 
 68 | static void test_mul(void) { // benchmark driver: times naive_ntt1024_product1..5, NTESTS calls each
 69 |   int32_t a[1024], b[1024], c[1024];
 70 |   uint32_t i;
 71 | // a and b are reset to a[i] = b[i] = i before each variant only, not between the NTESTS timed calls.
 72 |   for (i=0; i<1024; i++) {
 73 |     a[i] = i;
 74 |     b[i] = i;
 75 |   }
 76 |   // NOTE(review): if the product routines modify a and b in place, every call after the first sees altered inputs - confirm this is intended
 77 |   for (i=0; i<NTESTS; i++) {
 78 |     t[i] = cpucycles(); // counter value just before call i
 79 |     naive_ntt1024_product1(c, a, b);
 80 |   }
 81 |   print_results("naive_ntt1024_product1 ", cpucycles()); // the final reading closes the last interval
 82 | 
 83 |   for (i=0; i<1024; i++) {
 84 |     a[i] = i;
 85 |     b[i] = i;
 86 |   }
 87 |   
 88 |   for (i=0; i<NTESTS; i++) {
 89 |     t[i] = cpucycles();
 90 |     naive_ntt1024_product2(c, a, b);
 91 |   }
 92 |   print_results("naive_ntt1024_product2 ", cpucycles());
 93 | 
 94 |   for (i=0; i<1024; i++) {
 95 |     a[i] = i;
 96 |     b[i] = i;
 97 |   }
 98 |   
 99 |   for (i=0; i<NTESTS; i++) {
100 |     t[i] = cpucycles();
101 |     naive_ntt1024_product3(c, a, b);
102 |   }
103 |   print_results("naive_ntt1024_product3 ", cpucycles());
104 | 
105 |   for (i=0; i<1024; i++) {
106 |     a[i] = i;
107 |     b[i] = i;
108 |   }
109 |   
110 |   for (i=0; i<NTESTS; i++) {
111 |     t[i] = cpucycles();
112 |     naive_ntt1024_product4(c, a, b);
113 |   }
114 |   print_results("naive_ntt1024_product4 ", cpucycles());
115 | 
116 |   for (i=0; i<1024; i++) {
117 |     a[i] = i;
118 |     b[i] = i;
119 |   }
120 |   
121 |   for (i=0; i<NTESTS; i++) {
122 |     t[i] = cpucycles();
123 |     naive_ntt1024_product5(c, a, b);
124 |   }
125 |   print_results("naive_ntt1024_product5 ", cpucycles());
126 | }
127 | 
128 | int main(void){ // entry point: print a banner, run the benchmark, return 0
129 |   printf("Testing naive_ntt1024 product functions\n\n");
130 |   test_mul();
131 |   return 0;
132 | }
133 | 


--------------------------------------------------------------------------------
/src/speed_mul1024_red_asm.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <inttypes.h>
  5 | 
  6 | #include "ntt_red_asm1024.h"
  7 | #include "sort.h"
  8 | 
  9 | /*
 10 |  * PERFORMANCE MEASUREMENTS
 11 |  */
 12 | 
 13 | /*
 14 |  * For speed measurements: counter of CPU cycles, read from the time-stamp counter (x86-64 inline asm only)
 15 |  */
 16 | static inline uint64_t cpucycles(void) {
 17 |   uint64_t result;
 18 |   __asm__ volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax" // bytes 0x0F 0x31 encode RDTSC; then rax = (rdx << 32) | rax
 19 |     : "=a" (result) ::  "%rdx"); // the value is returned in rax; rdx is declared as clobbered
 20 |   return result; // no serializing instruction is issued around the read
 21 | }
 22 | 
 23 | #define NTESTS 102400
 24 | 
 25 | static uint64_t t[NTESTS];
 26 | 
 27 | // Average run time: mean of the NTESTS entries of t[], computed with truncating integer division
 28 | static uint64_t average_time(void) {
 29 |   uint64_t s;
 30 |   uint32_t i;
 31 | 
 32 |   s = 0;
 33 |   for (i=0; i<NTESTS; i++) {
 34 |     s += t[i]; // running sum of all samples
 35 |   }
 36 |   return s/NTESTS;
 37 | }
 38 | 
 39 | // Median: sorts t[] in place, checks the ordering, and returns the element at index NTESTS/2
 40 | static uint64_t median_time(void) {
 41 |   uint32_t i;
 42 | 
 43 |   sort(t, NTESTS); // sort() is presumably declared in sort.h; t[] is reordered as a side effect
 44 |   for (i=1; i<NTESTS; i++) { // sanity check: the sorted array must be non-decreasing
 45 |     if (t[i] < t[i-1]) {
 46 |       fprintf(stderr, "BUG in sort\n");
 47 |       exit(1);
 48 |     }
 49 |   }
 50 | 
 51 |   return t[NTESTS/2]; // NTESTS is even, so this is the upper of the two middle samples
 52 | }
 53 | 
 54 | static void print_results(const char *s, uint64_t c) { // s: label printed first; c: counter value read after the last timed call
 55 |   uint32_t i;
 56 | // On entry t[i] holds the counter value read just before call i; the loop below turns these into per-call differences in place.
 57 |   for(i=0 ;i<NTESTS-1; i++) {
 58 |     t[i] = t[i+1] - t[i];
 59 |   }
 60 |   t[i] = c - t[i]; // here i == NTESTS-1: the last difference uses the final reading c
 61 | 
 62 |   printf("%s\n", s);
 63 |   printf("median: %"PRIu64"\n", median_time()); // median_time() sorts t[]; the sum used for the average does not depend on order
 64 |   printf("average: %"PRIu64"\n", average_time());
 65 |   printf("\n");
 66 | }
 67 | 
 68 | static void test_mul(void) { // benchmark driver: times ntt_red1024_product1..5_asm, NTESTS calls each
 69 |   int32_t a[1024], b[1024], c[1024];
 70 |   uint32_t i;
 71 | // a and b are reset to a[i] = b[i] = i before each variant only, not between the NTESTS timed calls.
 72 |   for (i=0; i<1024; i++) {
 73 |     a[i] = i;
 74 |     b[i] = i;
 75 |   }
 76 |   // NOTE(review): ntt_red_asm1024.h says a and b are modified and must lie in [0, Q-1]; calls after the first reuse the modified arrays - confirm this is intended
 77 |   for (i=0; i<NTESTS; i++) {
 78 |     t[i] = cpucycles(); // counter value just before call i
 79 |     ntt_red1024_product1_asm(c, a, b);
 80 |   }
 81 |   print_results("ntt_red1024_product1_asm ", cpucycles()); // the final reading closes the last interval
 82 | 
 83 |   for (i=0; i<1024; i++) {
 84 |     a[i] = i;
 85 |     b[i] = i;
 86 |   }
 87 |   
 88 |   for (i=0; i<NTESTS; i++) {
 89 |     t[i] = cpucycles();
 90 |     ntt_red1024_product2_asm(c, a, b);
 91 |   }
 92 |   print_results("ntt_red1024_product2_asm ", cpucycles());
 93 | 
 94 |   for (i=0; i<1024; i++) {
 95 |     a[i] = i;
 96 |     b[i] = i;
 97 |   }
 98 |   
 99 |   for (i=0; i<NTESTS; i++) {
100 |     t[i] = cpucycles();
101 |     ntt_red1024_product3_asm(c, a, b);
102 |   }
103 |   print_results("ntt_red1024_product3_asm ", cpucycles());
104 | 
105 |   for (i=0; i<1024; i++) {
106 |     a[i] = i;
107 |     b[i] = i;
108 |   }
109 |   
110 |   for (i=0; i<NTESTS; i++) {
111 |     t[i] = cpucycles();
112 |     ntt_red1024_product4_asm(c, a, b);
113 |   }
114 |   print_results("ntt_red1024_product4_asm ", cpucycles());
115 | 
116 |   for (i=0; i<1024; i++) {
117 |     a[i] = i;
118 |     b[i] = i;
119 |   }
120 |   
121 |   for (i=0; i<NTESTS; i++) {
122 |     t[i] = cpucycles();
123 |     ntt_red1024_product5_asm(c, a, b);
124 |   }
125 |   print_results("ntt_red1024_product5_asm ", cpucycles());
126 | }
127 | 
128 | int main(void){ // entry point: print a banner, run the benchmark, return 0
129 |   printf("Testing ntt_red_asm1024 product functions\n\n");
130 |   test_mul();
131 |   return 0;
132 | }
133 | 


--------------------------------------------------------------------------------
/src/ntt_red_asm256.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289, n=256, using the Longa/Naehrig reduction method.
 3 |  */
 4 | 
 5 | #ifndef __NTT_RED_ASM256_H
 6 | #define __NTT_RED_ASM256_H
 7 | 
 8 | #include "ntt_red256_tables.h"
 9 | #include "ntt_asm.h"
10 | 
11 | /*
12 |  * NTT Variants: as in ntt_asm.h
13 |  * using tables from ntt_red256_tables.h
14 |  *
15 |  * Input: a[i] for i=0 .. 255 is expected to satisfy
16 |  *   -21499 <= a[i] <= 21499
17 |  *
18 |  * The result is stored in a, it is not reduced modulo Q.
19 |  */
20 | // forward NTTs: thin wrappers that fix n=256 and pass an omega-power table to the generic ntt_red_*_asm routine; a is transformed in place (ct/gs and rev2std/std2rev presumably name the butterfly scheme and the input/output ordering - confirm in ntt_asm.h)
21 | static inline void ntt_red256_ct_rev2std_asm(int32_t *a) {
22 |   ntt_red_ct_rev2std_asm(a, 256, ntt_red256_omega_powers);
23 | }
24 | 
25 | static inline void ntt_red256_gs_rev2std_asm(int32_t *a) {
26 |   ntt_red_gs_rev2std_asm(a, 256, ntt_red256_omega_powers_rev); // gs_rev2std and ct_std2rev take the _rev table
27 | }
28 | 
29 | static inline void ntt_red256_ct_std2rev_asm(int32_t *a) {
30 |   ntt_red_ct_std2rev_asm(a, 256, ntt_red256_omega_powers_rev);
31 | }
32 | 
33 | static inline void ntt_red256_gs_std2rev_asm(int32_t *a) {
34 |   ntt_red_gs_std2rev_asm(a, 256, ntt_red256_omega_powers); // ct_rev2std and gs_std2rev take the plain table
35 | }
36 | 
37 | // inverse: the same four generic routines, called with the inv_omega_powers tables
38 | static inline void intt_red256_ct_rev2std_asm(int32_t *a) {
39 |   ntt_red_ct_rev2std_asm(a, 256, ntt_red256_inv_omega_powers);
40 | }
41 | 
42 | static inline void intt_red256_gs_rev2std_asm(int32_t *a) {
43 |   ntt_red_gs_rev2std_asm(a, 256, ntt_red256_inv_omega_powers_rev);
44 | }
45 | 
46 | static inline void intt_red256_ct_std2rev_asm(int32_t *a) {
47 |   ntt_red_ct_std2rev_asm(a, 256, ntt_red256_inv_omega_powers_rev);
48 | }
49 | 
50 | static inline void intt_red256_gs_std2rev_asm(int32_t *a) {
51 |   ntt_red_gs_std2rev_asm(a, 256, ntt_red256_inv_omega_powers);
52 | }
53 | 
54 | // multiplication by powers of psi then forward ntt (n=256, mixed_powers tables)
55 | static inline void mulntt_red256_ct_rev2std_asm(int32_t *a) {
56 |   mulntt_red_ct_rev2std_asm(a, 256, ntt_red256_mixed_powers);
57 | }
58 | 
59 | static inline void mulntt_red256_ct_std2rev_asm(int32_t *a) {
60 |   mulntt_red_ct_std2rev_asm(a, 256, ntt_red256_mixed_powers_rev);
61 | }
62 | 
63 | // inverse ntt then multiplication by powers of psi^-1 (n=256, inv_mixed_powers tables; the generic routines called are named nttmul_red_gs_*_asm)
64 | static inline void inttmul_red256_gs_rev2std_asm(int32_t *a) {
65 |   nttmul_red_gs_rev2std_asm(a, 256, ntt_red256_inv_mixed_powers_rev);
66 | }
67 | 
68 | static inline void inttmul_red256_gs_std2rev_asm(int32_t *a) {
69 |   nttmul_red_gs_std2rev_asm(a, 256, ntt_red256_inv_mixed_powers);
70 | }
71 | 
72 | 
73 | /*
74 |  * PRODUCTS
75 |  */
76 | 
77 | /*
78 |  * Input: two arrays a and b in standard order
79 |  *
80 |  * Result: 
81 |  * - the product is stored in array c, in standard order.
82 |  * - arrays a and b are modified
83 |  *
84 |  * The input arrays must contain elements in the range [0, Q-1]
85 |  * The result is also in that range.
86 |  */
87 | extern void ntt_red256_product1_asm(int32_t *c, int32_t *a, int32_t *b);
88 | extern void ntt_red256_product2_asm(int32_t *c, int32_t *a, int32_t *b);
89 | extern void ntt_red256_product3_asm(int32_t *c, int32_t *a, int32_t *b);
90 | extern void ntt_red256_product4_asm(int32_t *c, int32_t *a, int32_t *b);
91 | extern void ntt_red256_product5_asm(int32_t *c, int32_t *a, int32_t *b);
92 | 
93 | #endif /* __NTT_RED_ASM256_H */
94 | 


--------------------------------------------------------------------------------
/src/ntt_red_asm512.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289, n=512, using the Longa/Naehrig reduction method.
 3 |  */
 4 | 
 5 | #ifndef __NTT_RED_ASM512_H
 6 | #define __NTT_RED_ASM512_H
 7 | 
 8 | #include "ntt_red512_tables.h"
 9 | #include "ntt_asm.h"
10 | 
11 | /*
12 |  * NTT Variants: as in ntt_asm.h
13 |  * using tables from ntt_red512_tables.h
14 |  *
15 |  * Input: a[i] for i=0 .. 511 is expected to satisfy
16 |  *   -21499 <= a[i] <= 21499
17 |  *
18 |  * The result is stored in a, it is not reduced modulo Q.
19 |  */
20 | // forward NTTs: thin wrappers that fix n=512 and pass an omega-power table to the generic ntt_red_*_asm routine; a is transformed in place (ct/gs and rev2std/std2rev presumably name the butterfly scheme and the input/output ordering - confirm in ntt_asm.h)
21 | static inline void ntt_red512_ct_rev2std_asm(int32_t *a) {
22 |   ntt_red_ct_rev2std_asm(a, 512, ntt_red512_omega_powers);
23 | }
24 | 
25 | static inline void ntt_red512_gs_rev2std_asm(int32_t *a) {
26 |   ntt_red_gs_rev2std_asm(a, 512, ntt_red512_omega_powers_rev); // gs_rev2std and ct_std2rev take the _rev table
27 | }
28 | 
29 | static inline void ntt_red512_ct_std2rev_asm(int32_t *a) {
30 |   ntt_red_ct_std2rev_asm(a, 512, ntt_red512_omega_powers_rev);
31 | }
32 | 
33 | static inline void ntt_red512_gs_std2rev_asm(int32_t *a) {
34 |   ntt_red_gs_std2rev_asm(a, 512, ntt_red512_omega_powers); // ct_rev2std and gs_std2rev take the plain table
35 | }
36 | 
37 | // inverse: the same four generic routines, called with the inv_omega_powers tables
38 | static inline void intt_red512_ct_rev2std_asm(int32_t *a) {
39 |   ntt_red_ct_rev2std_asm(a, 512, ntt_red512_inv_omega_powers);
40 | }
41 | 
42 | static inline void intt_red512_gs_rev2std_asm(int32_t *a) {
43 |   ntt_red_gs_rev2std_asm(a, 512, ntt_red512_inv_omega_powers_rev);
44 | }
45 | 
46 | static inline void intt_red512_ct_std2rev_asm(int32_t *a) {
47 |   ntt_red_ct_std2rev_asm(a, 512, ntt_red512_inv_omega_powers_rev);
48 | }
49 | 
50 | static inline void intt_red512_gs_std2rev_asm(int32_t *a) {
51 |   ntt_red_gs_std2rev_asm(a, 512, ntt_red512_inv_omega_powers);
52 | }
53 | 
54 | // multiplication by powers of psi then forward ntt (n=512, mixed_powers tables)
55 | static inline void mulntt_red512_ct_rev2std_asm(int32_t *a) {
56 |   mulntt_red_ct_rev2std_asm(a, 512, ntt_red512_mixed_powers);
57 | }
58 | 
59 | static inline void mulntt_red512_ct_std2rev_asm(int32_t *a) {
60 |   mulntt_red_ct_std2rev_asm(a, 512, ntt_red512_mixed_powers_rev);
61 | }
62 | 
63 | // inverse ntt then multiplication by powers of psi^-1 (n=512, inv_mixed_powers tables; the generic routines called are named nttmul_red_gs_*_asm)
64 | static inline void inttmul_red512_gs_rev2std_asm(int32_t *a) {
65 |   nttmul_red_gs_rev2std_asm(a, 512, ntt_red512_inv_mixed_powers_rev);
66 | }
67 | 
68 | static inline void inttmul_red512_gs_std2rev_asm(int32_t *a) {
69 |   nttmul_red_gs_std2rev_asm(a, 512, ntt_red512_inv_mixed_powers);
70 | }
71 | 
72 | 
73 | /*
74 |  * PRODUCTS
75 |  */
76 | 
77 | /*
78 |  * Input: two arrays a and b in standard order
79 |  *
80 |  * Result: 
81 |  * - the product is stored in array c, in standard order.
82 |  * - arrays a and b are modified
83 |  *
84 |  * The input arrays must contain elements in the range [0, Q-1]
85 |  * The result is also in that range.
86 |  */
87 | extern void ntt_red512_product1_asm(int32_t *c, int32_t *a, int32_t *b);
88 | extern void ntt_red512_product2_asm(int32_t *c, int32_t *a, int32_t *b);
89 | extern void ntt_red512_product3_asm(int32_t *c, int32_t *a, int32_t *b);
90 | extern void ntt_red512_product4_asm(int32_t *c, int32_t *a, int32_t *b);
91 | extern void ntt_red512_product5_asm(int32_t *c, int32_t *a, int32_t *b);
92 | 
93 | #endif /* __NTT_RED_ASM512_H */
94 | 


--------------------------------------------------------------------------------
/src/ntt_red_asm1024.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NTT for Q=12289, n=1024, using the Longa/Naehrig reduction method.
 3 |  * AVX implementation.
 4 |  */
 5 | 
 6 | #ifndef __NTT_RED_ASM1024_H
 7 | #define __NTT_RED_ASM1024_H
 8 | 
 9 | #include "ntt_red1024_tables.h"
10 | #include "ntt_asm.h"
11 | 
12 | /*
13 |  * NTT Variants: as in ntt_asm.h
14 |  * using tables from ntt_red1024_tables.h
15 |  *
16 |  * Input: a[i] for i=0 .. 1023 is expected to satisfy
17 |  *   -21499 <= a[i] <= 21499
18 |  *
19 |  * The result is stored in a, it is not reduced modulo Q.
20 |  */
21 | // forward NTTs
22 | static inline void ntt_red1024_ct_rev2std_asm(int32_t *a) {
23 |   ntt_red_ct_rev2std_asm(a, 1024, ntt_red1024_omega_powers);
24 | }
25 | 
26 | static inline void ntt_red1024_gs_rev2std_asm(int32_t *a) {
27 |   ntt_red_gs_rev2std_asm(a, 1024, ntt_red1024_omega_powers_rev);
28 | }
29 | 
30 | static inline void ntt_red1024_ct_std2rev_asm(int32_t *a) {
31 |   ntt_red_ct_std2rev_asm(a, 1024, ntt_red1024_omega_powers_rev);
32 | }
33 | 
34 | static inline void ntt_red1024_gs_std2rev_asm(int32_t *a) {
35 |   ntt_red_gs_std2rev_asm(a, 1024, ntt_red1024_omega_powers);
36 | }
37 | 
38 | // inverse
39 | static inline void intt_red1024_ct_rev2std_asm(int32_t *a) {
40 |   ntt_red_ct_rev2std_asm(a, 1024, ntt_red1024_inv_omega_powers);
41 | }
42 | 
43 | static inline void intt_red1024_gs_rev2std_asm(int32_t *a) {
44 |   ntt_red_gs_rev2std_asm(a, 1024, ntt_red1024_inv_omega_powers_rev);
45 | }
46 | 
47 | static inline void intt_red1024_ct_std2rev_asm(int32_t *a) {
48 |   ntt_red_ct_std2rev_asm(a, 1024, ntt_red1024_inv_omega_powers_rev);
49 | }
50 | 
51 | static inline void intt_red1024_gs_std2rev_asm(int32_t *a) {
52 |   ntt_red_gs_std2rev_asm(a, 1024, ntt_red1024_inv_omega_powers);
53 | }
54 | 
55 | // multiplication by powers of psi then forward ntt: a single call to mulntt_red_ct_*_asm with n=1024 and ntt_red1024_mixed_powers (rev2std) or ntt_red1024_mixed_powers_rev (std2rev)
56 | static inline void mulntt_red1024_ct_rev2std_asm(int32_t *a) {
57 |   mulntt_red_ct_rev2std_asm(a, 1024, ntt_red1024_mixed_powers);
58 | }
59 | 
60 | static inline void mulntt_red1024_ct_std2rev_asm(int32_t *a) {
61 |   mulntt_red_ct_std2rev_asm(a, 1024, ntt_red1024_mixed_powers_rev);
62 | }
63 | 
64 | // inverse ntt then multiplication by powers of psi^-1: a single call to nttmul_red_gs_*_asm with n=1024 and ntt_red1024_inv_mixed_powers_rev (rev2std) or ntt_red1024_inv_mixed_powers (std2rev)
65 | static inline void inttmul_red1024_gs_rev2std_asm(int32_t *a) {
66 |   nttmul_red_gs_rev2std_asm(a, 1024, ntt_red1024_inv_mixed_powers_rev);
67 | }
68 | 
69 | static inline void inttmul_red1024_gs_std2rev_asm(int32_t *a) {
70 |   nttmul_red_gs_std2rev_asm(a, 1024, ntt_red1024_inv_mixed_powers);
71 | }
72 | 
73 | 
74 | /*
75 |  * PRODUCTS
76 |  */
77 | 
78 | /*
79 |  * Input: two arrays a and b in standard order
80 |  *
81 |  * Result: 
82 |  * - the product is stored in array c, in standard order.
83 |  * - arrays a and b are modified
84 |  *
85 |  * The input arrays must contain elements in the range [0, Q-1]
86 |  * The result is also in that range.
87 |  */
88 | extern void ntt_red1024_product1_asm(int32_t *c, int32_t *a, int32_t *b);
89 | extern void ntt_red1024_product2_asm(int32_t *c, int32_t *a, int32_t *b);
90 | extern void ntt_red1024_product3_asm(int32_t *c, int32_t *a, int32_t *b);
91 | extern void ntt_red1024_product4_asm(int32_t *c, int32_t *a, int32_t *b);
92 | extern void ntt_red1024_product5_asm(int32_t *c, int32_t *a, int32_t *b);
93 | 
94 | #endif /* __NTT_RED_ASM1024_H */
95 | 


--------------------------------------------------------------------------------
/src/red_bounds.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Bounds on the reduction function
  3 |  */
  4 | 
  5 | #ifndef __RED_BOUNDS_H
  6 | #define __RED_BOUNDS_H
  7 | 
  8 | #include <stdint.h>
  9 | 
 10 | /*
 11 |  * Maximum of red(x) for a <= x <= b
 12 |  * - red(x) is returned, x is stored in *m
 13 |  */
 14 | extern int64_t max_red(int64_t a, int64_t b, int64_t *m);
 15 | 
 16 | /*
 17 |  * Minimum of red(x) for a <= x <= b
 18 |  */
 19 | extern int64_t min_red(int64_t a, int64_t b, int64_t *m);
 20 | 
 21 | /*
 22 |  * Maximum of red(w * x) for a <= x <= b
 23 |  */
 24 | extern int64_t max_red_mul(int64_t a, int64_t b, int64_t w, int64_t *m);
 25 | 
 26 | /*
 27 |  * Minimum of red(w*x) for a <= x <= b
 28 |  */
 29 | extern int64_t min_red_mul(int64_t a, int64_t b, int64_t w, int64_t *m);
 30 | 
 31 | /*
 32 |  * Maximum of red(w * x) for a <= x <= b and low <= w <= high.
 33 |  * - the max is returned. The corresponding x and w are stored in *m and
 34 |  * *mw, respectively.
 35 |  */
 36 | extern int64_t max_red_mul_interval(int64_t a, int64_t b, int64_t low, int64_t high, int64_t *m, int64_t *mw);
 37 | 
 38 | /*
 39 |  * Minimum of red(x * w) for a <= x <= b and low <= w <= high
 40 |  */
 41 | extern int64_t min_red_mul_interval(int64_t a, int64_t b, int64_t low, int64_t high, int64_t *m, int64_t *wm);
 42 | 
 43 | /*
 44 |  * Bounds after a CT step
 45 |  * - assuming |x| <= b and |y| <= b, this function returns b'
 46 |  *   such that |x'| <= b' and |y'| <= b'  after executing
 47 |  *      x' = x + red(w * y)
 48 |  *      y' = x - red(w * y)
 49 |  *  for the worst-case w in interval [low, high].
 50 |  */
 51 | extern int64_t ct_bound(int64_t b, int64_t low, int64_t high);
 52 | 
 53 | /*
 54 |  * Bounds after a GS step: same as CT but the updates are
 55 |  *  x' = x + y
 56 |  *  y' = (x - y) * w.
 57 |  */
 58 | extern int64_t gs_bound(int64_t b, int64_t low, int64_t high);
 59 | 
 60 | 
 61 | /*
 62 |  * Bounds after a CT step with a fixed w
 63 |  * - assuming |x| <= b and |y| <= b, returns b' such that
 64 |  *    |x + red(w * y)| <= b' and |x - red(w * y)| <= b'
 65 |  */
 66 | extern int64_t ct_bound_fixed(int64_t b, int64_t w);
 67 | 
 68 | /*
 69 |  * Bounds after a GS step with a fixed w
 70 |  * - assuming |x| <= b and |y| <= b, returns b' such that
 71 |  *    |x + y| <= b' and |(x - y) * w| <= b'
 72 |  */
 73 | extern int64_t gs_bound_fixed(int64_t b, int64_t w);
 74 | 
 75 | 
 76 | /*
 77 |  * Bounds after ntt computations based on Cooley Tukey
 78 |  * - b0 = bound on the input
 79 |  * - p = array of coefficients used in the algorithm
 80 |  *   p[t + i] = omega^(n/2t)^i (or a variant of this).
 81 |  * We assume the input coefficients a[i] satisfy |a[i]| <= b0.
 82 |  *
 83 |  * The final bound is returned.
 84 |  * Bounds for each round are stored in array bound (must be of size log_2(n))
 85 |  */
 86 | extern int64_t ntt_ct_bounds(int64_t b0, uint32_t n, const int16_t *p, int64_t *bound);
 87 | 
 88 | /*
 89 |  * Bounds after ntt computations based on Gentleman Sande
 90 |  * - b0 = bound on the input
 91 |  * - p = array of coefficients used in the algorithm
 92 |  *   p[t + i] = omega^(n/2t)^i (or a variant of this).
 93 |  * We assume the input coefficients a[i] satisfy |a[i]| <= b0.
 94 |  *
 95 |  * The final bound is returned.
 96 |  * Bounds for each round are stored in array bound (must be of size log_2(n))
 97 |  */
 98 | extern int64_t ntt_gs_bounds(int64_t b0, uint32_t n, const int16_t *p, int64_t *bound);
 99 | 
100 | #endif /* __RED_BOUNDS_H */
101 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/red_bounds.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Bounds on the reduction function
  3 |  */
  4 | 
  5 | #ifndef __RED_BOUNDS_H
  6 | #define __RED_BOUNDS_H
  7 | 
  8 | #include <stdint.h>
  9 | 
 10 | /*
 11 |  * Maximum of red(x) for a <= x <= b
 12 |  * - red(x) is returned, x is stored in *m
 13 |  */
 14 | extern int64_t max_red(int64_t a, int64_t b, int64_t *m);
 15 | 
 16 | /*
 17 |  * Minimum of red(x) for a <= x <= b
 18 |  */
 19 | extern int64_t min_red(int64_t a, int64_t b, int64_t *m);
 20 | 
 21 | /*
 22 |  * Maximum of red(w * x) for a <= x <= b
 23 |  */
 24 | extern int64_t max_red_mul(int64_t a, int64_t b, int64_t w, int64_t *m);
 25 | 
 26 | /*
 27 |  * Minimum of red(w*x) for a <= x <= b
 28 |  */
 29 | extern int64_t min_red_mul(int64_t a, int64_t b, int64_t w, int64_t *m);
 30 | 
 31 | /*
 32 |  * Maximum of red(w * x) for a <= x <= b and low <= w <= high. 
 33 |  * - the max is returned. The corresponding x and w are stored in *m and
 34 |  * *mw, respectively.
 35 |  */
 36 | extern int64_t max_red_mul_interval(int64_t a, int64_t b, int64_t low, int64_t high,
 37 | 				    int64_t *m, int64_t *mw);
 38 | 
 39 |  /*
 40 |   * Minimum of red(x * w) for a <= x <= b and low <= w <= high
 41 |   */
 42 | extern int64_t min_red_mul_interval(int64_t a, int64_t b, int64_t low, int64_t high,
 43 | 				    int64_t *m, int64_t *wm);
 44 | 
 45 | 
 46 | /*
 47 |  * Bounds after a CT step
 48 |  * - assuming |x| <= b and |y| <= b, this function returns b' 
 49 |  *   such that |x'| <= b' and |y'| <= b'  after executing
 50 |  *      x' = x + red(w * y)
 51 |  *      y' = x - red(w * y)
 52 |  *  for the worst-case w in interval [low, high].
 53 |  */
 54 | extern int64_t ct_bound(int64_t b, int64_t low, int64_t high);
 55 | 
 56 | /*
 57 |  * Bounds after a GS step: same as CT but the updates are
 58 |  *  x' = x + y
 59 |  *  y' = (x - y) * w.
 60 |  */
 61 | extern int64_t gs_bound(int64_t b, int64_t low, int64_t high);
 62 | 
 63 | 
 64 | /*
 65 |  * Bounds after a CT step with a fixed w
 66 |  * - assuming |x| <= b and |y| <= b, returns b' such that
 67 |  *    |x + red(w * y)| <= b' and |x - red(w * y)| <= b'
 68 |  */
 69 | extern int64_t ct_bound_fixed(int64_t b, int64_t w);
 70 | 
 71 | /*
 72 |  * Bounds after a GS step with a fixed w
 73 |  * - assuming |x| <= b and |y| <= b, returns b' such that
 74 |  *    |x + y| <= b' and |(x - y) * w| <= b'
 75 |  */
 76 | extern int64_t gs_bound_fixed(int64_t b, int64_t w);
 77 | 
 78 | 
 79 | /*
 80 |  * Bounds after ntt computations based on Cooley Tukey
 81 |  * - b0 = bound on the input
 82 |  * - p = array of coefficients used in the algorithm
 83 |  *   p[t + i] = omega^(n/2t)^i (or a variant of this).
 84 |  * We assume the input coefficients a[i] satisfy |a[i]| <= b0.
 85 |  *
 86 |  * The final bound is returned.
 87 |  * Bounds for each round are stored in array bound (must be of size log_2(n))
 88 |  */
 89 | extern int64_t ntt_ct_bounds(int64_t b0, uint32_t n, const int16_t *p, int64_t *bound);
 90 | 
 91 | /*
 92 |  * Bounds after ntt computations based on Gentleman Sande
 93 |  * - b0 = bound on the input
 94 |  * - p = array of coefficients used in the algorithm
 95 |  *   p[t + i] = omega^(n/2t)^i (or a variant of this).
 96 |  * We assume the input coefficients a[i] satisfy |a[i]| <= b0.
 97 |  *
 98 |  * The final bound is returned.
 99 |  * Bounds for each round are stored in array bound (must be of size log_2(n))
100 |  */
101 | extern int64_t ntt_gs_bounds(int64_t b0, uint32_t n, const int16_t *p, int64_t *bound);
102 | 
103 | #endif /* __RED_BOUNDS_H */
104 | 


--------------------------------------------------------------------------------
/src/ntt16.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * NTT for Q=12289 and n=16
  3 |  */
  4 | 
  5 | #ifndef __NTT16_H
  6 | #define __NTT16_H
  7 | 
  8 | #include "ntt16_tables.h"
  9 | #include "ntt.h"
 10 | 
 11 | 
 12 | /*
 13 |  * NTT VARIANTS
 14 |  *
 15 |  * - the input a is an array of n integers that must be between 0 and Q-1
 16 |  * - the result is stored in place
 17 |  * - the inverse transforms return a result scaled by n:
 18 |  *    we have intt(ntt(a)) = n * a
 19 |  */
 20 | // forward: each wrapper calls the matching generic ntt_* routine with n=16; ct_rev2std and gs_std2rev take ntt16_omega_powers, gs_rev2std and ct_std2rev take ntt16_omega_powers_rev
 21 | static inline void ntt16_ct_rev2std(int32_t *a) {
 22 |   ntt_ct_rev2std(a, 16, ntt16_omega_powers);
 23 | }
 24 | 
 25 | static inline void ntt16_gs_rev2std(int32_t *a) {
 26 |   ntt_gs_rev2std(a, 16, ntt16_omega_powers_rev);
 27 | }
 28 | 
 29 | static inline void ntt16_ct_std2rev(int32_t *a) {
 30 |   ntt_ct_std2rev(a, 16, ntt16_omega_powers_rev);
 31 | }
 32 | 
 33 | static inline void ntt16_gs_std2rev(int32_t *a) {
 34 |   ntt_gs_std2rev(a, 16, ntt16_omega_powers);
 35 | }
 36 | 
 37 | // inverse: same four generic ntt_* routines with n=16, called with ntt16_inv_omega_powers (ct_rev2std, gs_std2rev) or ntt16_inv_omega_powers_rev (gs_rev2std, ct_std2rev)
 38 | static inline void intt16_ct_rev2std(int32_t *a) {
 39 |   ntt_ct_rev2std(a, 16, ntt16_inv_omega_powers);
 40 | }
 41 | 
 42 | static inline void intt16_gs_rev2std(int32_t *a) {
 43 |   ntt_gs_rev2std(a, 16, ntt16_inv_omega_powers_rev);
 44 | }
 45 | 
 46 | static inline void intt16_ct_std2rev(int32_t *a) {
 47 |   ntt_ct_std2rev(a, 16, ntt16_inv_omega_powers_rev);
 48 | }
 49 | 
 50 | static inline void intt16_gs_std2rev(int32_t *a) {
 51 |   ntt_gs_std2rev(a, 16, ntt16_inv_omega_powers);
 52 | }
 53 | 
 54 | // multiplication by powers of psi then forward ntt: a single call to mulntt_ct_* with n=16 and ntt16_mixed_powers (rev2std) or ntt16_mixed_powers_rev (std2rev)
 55 | static inline void mulntt16_ct_rev2std(int32_t *a) {
 56 |   mulntt_ct_rev2std(a, 16, ntt16_mixed_powers);
 57 | }
 58 | 
 59 | static inline void mulntt16_ct_std2rev(int32_t *a) {
 60 |   mulntt_ct_std2rev(a, 16, ntt16_mixed_powers_rev);
 61 | }
 62 | 
 63 | // inverse ntt then multiplication by powers of psi^-1: a single call to nttmul_gs_* with n=16 and ntt16_inv_mixed_powers_rev (rev2std) or ntt16_inv_mixed_powers (std2rev)
 64 | static inline void inttmul16_gs_rev2std(int32_t *a) {
 65 |   nttmul_gs_rev2std(a, 16, ntt16_inv_mixed_powers_rev);
 66 | }
 67 | 
 68 | static inline void inttmul16_gs_std2rev(int32_t *a) {
 69 |   nttmul_gs_std2rev(a, 16, ntt16_inv_mixed_powers);
 70 | }
 71 | 
 72 | 
 73 | /*
 74 |  * PRODUCTS
 75 |  */
 76 | 
 77 | /*
 78 |  * Input: two arrays a and b in standard order
 79 |  *
 80 |  * Result: 
 81 |  * - the product is stored in array c, in standard order.
 82 |  * - arrays a and b are modified
 83 |  *
 84 |  * The input arrays must contain elements in the range [0 .. Q-1]
 85 |  * The result is also in that range.
 86 |  *
 87 |  * The first four variants have the following form:
 88 |  * - multiply a and b by powers of psi
 89 |  * - compute NTT(a) and NTT(b) using a std2rev variant
 90 |  * - c = elementwise product of NTT(a) and NTT(b)
 91 |  * - compute INTT(c) using a rev2std variant
 92 |  * - multiply the result by n^(-1) * powers of psi^(-1)
 93 |  * There are two choices for the NTT and INTT functions:
 94 |  * - NTT:  either ntt_ct_std2rev or ntt_gs_std2rev
 95 |  * - INTT: either intt_ct_rev2std or intt_gs_rev2std
 96 |  *
 97 |  * Product5 uses the combined mul/ntt variants:
 98 |  * - compute MULNTT(a) and MULNTT(b) using mulntt_ct_std2rev
 99 |  * - c = elementwise product
100 |  * - compute INTTMUL(c) using inttmul_gs_rev2std
101 |  * - multiply the result by n^(-1)
102 |  */
103 | extern void ntt16_product1(int32_t *c, int32_t *a, int32_t *b);
104 | extern void ntt16_product2(int32_t *c, int32_t *a, int32_t *b);
105 | extern void ntt16_product3(int32_t *c, int32_t *a, int32_t *b);
106 | extern void ntt16_product4(int32_t *c, int32_t *a, int32_t *b);
107 | 
108 | extern void ntt16_product5(int32_t *c, int32_t *a, int32_t *b);
109 | 
110 | #endif /* __NTT16_H */
111 | 


--------------------------------------------------------------------------------
/src/naive_ntt16.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Naive NTT for Q=12289 and n=16
  3 |  */
  4 | 
  5 | #ifndef __NAIVE_NTT16_H
  6 | #define __NAIVE_NTT16_H
  7 | 
  8 | #include "ntt16_tables.h"
  9 | #include "naive_ntt.h"
 10 | 
 11 | 
 12 | /*
 13 |  * NTT VARIANTS
 14 |  *
 15 |  * - the input a is an array of n integers that must be between 0 and Q-1
 16 |  * - the result is stored in place
 17 |  * - the inverse transforms return a result scaled by n:
 18 |  *    we have intt(ntt(a)) = n * a
 19 |  */
 20 | // forward: each wrapper calls the matching generic ntt_*_naive routine with n=16 and Q=12289 as last argument; ct_rev2std and gs_std2rev take ntt16_omega_powers, gs_rev2std and ct_std2rev take ntt16_omega_powers_rev
 21 | static inline void naive_ntt16_ct_rev2std(int32_t *a) {
 22 |   ntt_ct_rev2std_naive(a, 16, ntt16_omega_powers, 12289);
 23 | }
 24 | 
 25 | static inline void naive_ntt16_gs_rev2std(int32_t *a) {
 26 |   ntt_gs_rev2std_naive(a, 16, ntt16_omega_powers_rev, 12289);
 27 | }
 28 | 
 29 | static inline void naive_ntt16_ct_std2rev(int32_t *a) {
 30 |   ntt_ct_std2rev_naive(a, 16, ntt16_omega_powers_rev, 12289);
 31 | }
 32 | 
 33 | static inline void naive_ntt16_gs_std2rev(int32_t *a) {
 34 |   ntt_gs_std2rev_naive(a, 16, ntt16_omega_powers, 12289);
 35 | }
 36 | 
 37 | // inverse: same four generic ntt_*_naive routines with n=16 and Q=12289, called with ntt16_inv_omega_powers (ct_rev2std, gs_std2rev) or ntt16_inv_omega_powers_rev (gs_rev2std, ct_std2rev)
 38 | static inline void naive_intt16_ct_rev2std(int32_t *a) {
 39 |   ntt_ct_rev2std_naive(a, 16, ntt16_inv_omega_powers, 12289);
 40 | }
 41 | 
 42 | static inline void naive_intt16_gs_rev2std(int32_t *a) {
 43 |   ntt_gs_rev2std_naive(a, 16, ntt16_inv_omega_powers_rev, 12289);
 44 | }
 45 | 
 46 | static inline void naive_intt16_ct_std2rev(int32_t *a) {
 47 |   ntt_ct_std2rev_naive(a, 16, ntt16_inv_omega_powers_rev, 12289);
 48 | }
 49 | 
 50 | static inline void naive_intt16_gs_std2rev(int32_t *a) {
 51 |   ntt_gs_std2rev_naive(a, 16, ntt16_inv_omega_powers, 12289);
 52 | }
 53 | 
 54 | // multiplication by powers of psi then forward ntt: a single call to mulntt_ct_*_naive with n=16, Q=12289 and ntt16_mixed_powers (rev2std) or ntt16_mixed_powers_rev (std2rev)
 55 | static inline void naive_mulntt16_ct_rev2std(int32_t *a) {
 56 |   mulntt_ct_rev2std_naive(a, 16, ntt16_mixed_powers, 12289);
 57 | }
 58 | 
 59 | static inline void naive_mulntt16_ct_std2rev(int32_t *a) {
 60 |   mulntt_ct_std2rev_naive(a, 16, ntt16_mixed_powers_rev, 12289);
 61 | }
 62 | 
 63 | // inverse ntt then multiplication by powers of psi^-1: a single call to nttmul_gs_*_naive with n=16, Q=12289 and ntt16_inv_mixed_powers_rev (rev2std) or ntt16_inv_mixed_powers (std2rev)
 64 | static inline void naive_inttmul16_gs_rev2std(int32_t *a) {
 65 |   nttmul_gs_rev2std_naive(a, 16, ntt16_inv_mixed_powers_rev, 12289);
 66 | }
 67 | 
 68 | static inline void naive_inttmul16_gs_std2rev(int32_t *a) {
 69 |   nttmul_gs_std2rev_naive(a, 16, ntt16_inv_mixed_powers, 12289);
 70 | }
 71 | 
 72 | 
 73 | /*
 74 |  * PRODUCTS
 75 |  */
 76 | 
 77 | /*
 78 |  * Input: two arrays a and b in standard order
 79 |  *
 80 |  * Result: 
 81 |  * - the product is stored in array c, in standard order.
 82 |  * - arrays a and b are modified
 83 |  *
 84 |  * The input arrays must contain elements in the range [0 .. Q-1]
 85 |  * The result is also in that range.
 86 |  *
 87 |  * The first four variants have the following form:
 88 |  * - multiply a and b by powers of psi
 89 |  * - compute NTT(a) and NTT(b) using a std2rev variant
 90 |  * - c = elementwise product of NTT(a) and NTT(b)
 91 |  * - compute INTT(c) using a rev2std variant
 92 |  * - multiply the result by n^(-1) * powers of psi^(-1)
 93 |  * There are two choices for the NTT and INTT functions:
 94 |  * - NTT:  either ntt_ct_std2rev or ntt_gs_std2rev
 95 |  * - INTT: either intt_ct_rev2std or intt_gs_rev2std
 96 |  *
 97 |  * Product5 uses the combined mul/ntt variants:
 98 |  * - compute MULNTT(a) and MULNTT(b) using mulntt_ct_std2rev
 99 |  * - c = elementwise product
100 |  * - compute INTTMUL(c) using inttmul_gs_rev2std
101 |  * - multiply the result by n^(-1)
102 |  */
103 | extern void naive_ntt16_product1(int32_t *c, int32_t *a, int32_t *b);
104 | extern void naive_ntt16_product2(int32_t *c, int32_t *a, int32_t *b);
105 | extern void naive_ntt16_product3(int32_t *c, int32_t *a, int32_t *b);
106 | extern void naive_ntt16_product4(int32_t *c, int32_t *a, int32_t *b);
107 | 
108 | extern void naive_ntt16_product5(int32_t *c, int32_t *a, int32_t *b);
109 | 
110 | #endif /* __NAIVE_NTT16_H */
111 | 


--------------------------------------------------------------------------------
/src/README.md:
--------------------------------------------------------------------------------
 1 | # Sources
 2 | 
 3 | 
 4 | ## NTT Implementations
 5 | 
 6 | The various algorithms are parametric in Q and n. Q is the prime order of the finite field, and is 
 7 | always assumed to be 12289. n is the degree of the polynomials under consideration. 
 8 | n must be a power of two, no larger than 2048. In addition, the NTT is based on two parameters
 9 | ``phi`` and ``psi`` such that ``phi^n = 1`` and ``psi^2 = phi``.
10 | 
11 | We include four variant implementations of the basic algorithms:
12 | - naive implementation (unoptimized modular arithmetic)
13 | - default implementation (optimized modular arithmetic for Q=12289)
14 | - implementation in C based on the Longa-Naehrig reduction
15 | - implementation in x86-64 assembler that uses the AVX2 vector instructions (also using the Longa-Naehrig reduction)
16 | 
17 | Each source file includes variant procedures for constructing forward and backward transforms, using either
18 | the Cooley-Tukey or the Gentleman-Sande approaches. Some variants implement pre or post-multiplication by
19 | powers of ``psi``. The source files also include utilities for shuffling array components, multiplying
20 | by scalars, and more utilities that can be used to implement products of polynomials.
21 | 
22 | The main source files include:
23 | - ``naive_ntt.c`` and ``naive_ntt.h``: naive implementation
24 | - ``ntt.c`` and ``ntt.h``: default implementation
25 | - ``ntt_red.c`` and ``ntt_red.h``: Longa-Naehrig reduction (C implementation)
26 | - ``ntt_asm.S`` and ``ntt_asm.h``: Longa-Naehrig reduction (assembler/AVX2 implementation)
27 | 
28 | For testing and experimentation, we instantiate the generic procedures for n=16, 256, 512, and 1024,
29 | and for fixed values of the parameters ``phi`` and ``psi``.
30 | For example ``ntt1024.c`` uses the default NTT procedure (from ``ntt.h`` and ``ntt.c``). 
31 | It is specialized for ``n=1024`` and it includes five procedures that compute products of polynomials.
32 | The five procedures are semantically equivalent but they use different forward/backward transforms.
33 | 
34 | ## Tables
35 | 
36 | All the NTT procedures we implement take a table of constants as argument.
37 | This table is derived from the parameters ``phi``, ``psi``, and ``n``.  We include
38 | two utilities that generate the relevant tables based on these parameters.
39 | 
40 | * `make_tables` generates tables suitable for ``naive_ntt`` and ``ntt``. The resulting
41 |    tables are in ``ntt[16, 256, 512, 1024]_tables.h``.
42 | 
43 | * `make_red_tables` generates suitable tables for ``ntt_red`` and ``ntt_asm``. 
44 |    The resulting tables are in ``ntt_red[16, 256, 512, 1024]_tables.h``.
45 | 
46 | For shuffling array elements in the bit-reverse order, we also use a table that defines
47 | an index permutation and we include a utility to generate this table:
48 | 
49 | * `bitrev[16, 256, 512, 1024]_table.h` are generated by `make_bitrev_table`
50 | 
51 | 
52 | ## Tests
53 | 
54 | Basic tests include
55 | 
56 | ```
57 | test_ntt
58 | test_ntt_red
59 | test_ntt_avx
60 | ```
61 | These run a first round of tests to validate the implementations and a second
62 | round of tests to measure speed.
63 | 
64 | The following variants do more extensive testing and are specialized for a fixed ``n``:
65 | 
66 | ```
67 | test_naive_ntt[16, 256, 512, 1024]
68 | test_ntt[16, 256, 512, 1024]
69 | test_ntt_red[16, 256, 512, 1024]
70 | test_ntt_red_asm[16, 256, 512, 1024]
71 | ```
72 | 
73 | We also include Known Answer Tests (kat) for n=1024:
74 | ```
75 | data_poly1024.[ch]
76 | kat_mul1024[, _red, _red_asm].c
77 | speed_mul1024[, _naive, _red, _red_asm].c
78 | ```
79 | 
80 | The tests in the paper can be found in this [subdirectory](https://github.com/SRI-CSL/NTT/tree/master/src/tests_in_paper). To make them one can simply do
81 | ```
82 | make paper_tests
83 | ```
84 | in *this* directory (not the subdirectory).
85 | 
86 | 
87 | 
88 | 


--------------------------------------------------------------------------------
/src/naive_ntt256.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Naive NTT for Q=12289 and n=256
  3 |  */
  4 | 
  5 | #ifndef __NAIVE_NTT256_H
  6 | #define __NAIVE_NTT256_H
  7 | 
  8 | #include "ntt256_tables.h"
  9 | #include "naive_ntt.h"
 10 | 
 11 | 
 12 | /*
 13 |  * NTT VARIANTS
 14 |  *
 15 |  * - the input a is an array of n integers that must be between 0 and Q-1
 16 |  * - the result is stored in place
 17 |  * - the inverse transforms return a result scaled by n:
 18 |  *    we have intt(ntt(a)) = n * a
 19 |  */
 20 | // forward: each wrapper calls the matching generic ntt_*_naive routine with n=256 and Q=12289 as last argument; ct_rev2std and gs_std2rev take ntt256_omega_powers, gs_rev2std and ct_std2rev take ntt256_omega_powers_rev
 21 | static inline void naive_ntt256_ct_rev2std(int32_t *a) {
 22 |   ntt_ct_rev2std_naive(a, 256, ntt256_omega_powers, 12289);
 23 | }
 24 | 
 25 | static inline void naive_ntt256_gs_rev2std(int32_t *a) {
 26 |   ntt_gs_rev2std_naive(a, 256, ntt256_omega_powers_rev, 12289);
 27 | }
 28 | 
 29 | static inline void naive_ntt256_ct_std2rev(int32_t *a) {
 30 |   ntt_ct_std2rev_naive(a, 256, ntt256_omega_powers_rev, 12289);
 31 | }
 32 | 
 33 | static inline void naive_ntt256_gs_std2rev(int32_t *a) {
 34 |   ntt_gs_std2rev_naive(a, 256, ntt256_omega_powers, 12289);
 35 | }
 36 | 
 37 | // inverse: same four generic ntt_*_naive routines with n=256 and Q=12289, called with ntt256_inv_omega_powers (ct_rev2std, gs_std2rev) or ntt256_inv_omega_powers_rev (gs_rev2std, ct_std2rev)
 38 | static inline void naive_intt256_ct_rev2std(int32_t *a) {
 39 |   ntt_ct_rev2std_naive(a, 256, ntt256_inv_omega_powers, 12289);
 40 | }
 41 | 
 42 | static inline void naive_intt256_gs_rev2std(int32_t *a) {
 43 |   ntt_gs_rev2std_naive(a, 256, ntt256_inv_omega_powers_rev, 12289);
 44 | }
 45 | 
 46 | static inline void naive_intt256_ct_std2rev(int32_t *a) {
 47 |   ntt_ct_std2rev_naive(a, 256, ntt256_inv_omega_powers_rev, 12289);
 48 | }
 49 | 
 50 | static inline void naive_intt256_gs_std2rev(int32_t *a) {
 51 |   ntt_gs_std2rev_naive(a, 256, ntt256_inv_omega_powers, 12289);
 52 | }
 53 | 
 54 | // multiplication by powers of psi then forward ntt: a single call to mulntt_ct_*_naive with n=256, Q=12289 and ntt256_mixed_powers (rev2std) or ntt256_mixed_powers_rev (std2rev)
 55 | static inline void naive_mulntt256_ct_rev2std(int32_t *a) {
 56 |   mulntt_ct_rev2std_naive(a, 256, ntt256_mixed_powers, 12289);
 57 | }
 58 | 
 59 | static inline void naive_mulntt256_ct_std2rev(int32_t *a) {
 60 |   mulntt_ct_std2rev_naive(a, 256, ntt256_mixed_powers_rev, 12289);
 61 | }
 62 | 
 63 | // inverse ntt then multiplication by powers of psi^-1: a single call to nttmul_gs_*_naive with n=256, Q=12289 and ntt256_inv_mixed_powers_rev (rev2std) or ntt256_inv_mixed_powers (std2rev)
 64 | static inline void naive_inttmul256_gs_rev2std(int32_t *a) {
 65 |   nttmul_gs_rev2std_naive(a, 256, ntt256_inv_mixed_powers_rev, 12289);
 66 | }
 67 | 
 68 | static inline void naive_inttmul256_gs_std2rev(int32_t *a) {
 69 |   nttmul_gs_std2rev_naive(a, 256, ntt256_inv_mixed_powers, 12289);
 70 | }
 71 | 
 72 | 
 73 | /*
 74 |  * PRODUCTS
 75 |  */
 76 | 
 77 | /*
 78 |  * Input: two arrays a and b in standard order
 79 |  *
 80 |  * Result: 
 81 |  * - the product is stored in array c, in standard order.
 82 |  * - arrays a and b are modified
 83 |  *
 84 |  * The input arrays must contain elements in the range [0 .. Q-1]
 85 |  * The result is also in that range.
 86 |  *
 87 |  * The first four variants have the following form:
 88 |  * - multiply a and b by powers of psi
 89 |  * - compute NTT(a) and NTT(b) using a std2rev variant
 90 |  * - c = elementwise product of NTT(a) and NTT(b)
 91 |  * - compute INTT(c) using a rev2std variant
 92 |  * - multiply the result by n^(-1) * powers of psi^(-1)
 93 |  * There are two choices for the NTT and INTT functions:
 94 |  * - NTT:  either ntt_ct_std2rev or ntt_gs_std2rev
 95 |  * - INTT: either intt_ct_rev2std or intt_gs_rev2std
 96 |  *
 97 |  * Product5 uses the combined mul/ntt variants:
 98 |  * - compute MULNTT(a) and MULNTT(b) using mulntt_ct_std2rev
 99 |  * - c = elementwise product
100 |  * - compute INTTMUL(c) using inttmul_gs_rev2std
101 |  * - multiply the result by n^(-1)
102 |  */
103 | extern void naive_ntt256_product1(int32_t *c, int32_t *a, int32_t *b);
104 | extern void naive_ntt256_product2(int32_t *c, int32_t *a, int32_t *b);
105 | extern void naive_ntt256_product3(int32_t *c, int32_t *a, int32_t *b);
106 | extern void naive_ntt256_product4(int32_t *c, int32_t *a, int32_t *b);
107 | 
108 | extern void naive_ntt256_product5(int32_t *c, int32_t *a, int32_t *b);
109 | 
110 | #endif /* __NAIVE_NTT256_H */
111 | 


--------------------------------------------------------------------------------
/src/naive_ntt512.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Naive NTT for Q=12289 and n=512
  3 |  */
  4 | 
  5 | #ifndef __NAIVE_NTT512_H
  6 | #define __NAIVE_NTT512_H
  7 | 
  8 | #include "ntt512_tables.h"
  9 | #include "naive_ntt.h"
 10 | 
 11 | 
 12 | /*
 13 |  * NTT VARIANTS
 14 |  *
 15 |  * - the input a is an array of n integers that must be between 0 and Q-1
 16 |  * - the result is stored in place
 17 |  * - the inverse transforms return a result scaled by n:
 18 |  *    we have intt(ntt(a)) = n * a
 19 |  */
 20 | // forward: each wrapper calls the matching generic ntt_*_naive routine with n=512 and Q=12289 as last argument; ct_rev2std and gs_std2rev take ntt512_omega_powers, gs_rev2std and ct_std2rev take ntt512_omega_powers_rev
 21 | static inline void naive_ntt512_ct_rev2std(int32_t *a) {
 22 |   ntt_ct_rev2std_naive(a, 512, ntt512_omega_powers, 12289);
 23 | }
 24 | 
 25 | static inline void naive_ntt512_gs_rev2std(int32_t *a) {
 26 |   ntt_gs_rev2std_naive(a, 512, ntt512_omega_powers_rev, 12289);
 27 | }
 28 | 
 29 | static inline void naive_ntt512_ct_std2rev(int32_t *a) {
 30 |   ntt_ct_std2rev_naive(a, 512, ntt512_omega_powers_rev, 12289);
 31 | }
 32 | 
 33 | static inline void naive_ntt512_gs_std2rev(int32_t *a) {
 34 |   ntt_gs_std2rev_naive(a, 512, ntt512_omega_powers, 12289);
 35 | }
 36 | 
 37 | // inverse: same four generic ntt_*_naive routines with n=512 and Q=12289, called with ntt512_inv_omega_powers (ct_rev2std, gs_std2rev) or ntt512_inv_omega_powers_rev (gs_rev2std, ct_std2rev)
 38 | static inline void naive_intt512_ct_rev2std(int32_t *a) {
 39 |   ntt_ct_rev2std_naive(a, 512, ntt512_inv_omega_powers, 12289);
 40 | }
 41 | 
 42 | static inline void naive_intt512_gs_rev2std(int32_t *a) {
 43 |   ntt_gs_rev2std_naive(a, 512, ntt512_inv_omega_powers_rev, 12289);
 44 | }
 45 | 
 46 | static inline void naive_intt512_ct_std2rev(int32_t *a) {
 47 |   ntt_ct_std2rev_naive(a, 512, ntt512_inv_omega_powers_rev, 12289);
 48 | }
 49 | 
 50 | static inline void naive_intt512_gs_std2rev(int32_t *a) {
 51 |   ntt_gs_std2rev_naive(a, 512, ntt512_inv_omega_powers, 12289);
 52 | }
 53 | 
 54 | // multiplication by powers of psi then forward ntt: a single call to mulntt_ct_*_naive with n=512, Q=12289 and ntt512_mixed_powers (rev2std) or ntt512_mixed_powers_rev (std2rev)
 55 | static inline void naive_mulntt512_ct_rev2std(int32_t *a) {
 56 |   mulntt_ct_rev2std_naive(a, 512, ntt512_mixed_powers, 12289);
 57 | }
 58 | 
 59 | static inline void naive_mulntt512_ct_std2rev(int32_t *a) {
 60 |   mulntt_ct_std2rev_naive(a, 512, ntt512_mixed_powers_rev, 12289);
 61 | }
 62 | 
 63 | // inverse ntt then multiplication by powers of psi^-1: a single call to nttmul_gs_*_naive with n=512, Q=12289 and ntt512_inv_mixed_powers_rev (rev2std) or ntt512_inv_mixed_powers (std2rev)
 64 | static inline void naive_inttmul512_gs_rev2std(int32_t *a) {
 65 |   nttmul_gs_rev2std_naive(a, 512, ntt512_inv_mixed_powers_rev, 12289);
 66 | }
 67 | 
 68 | static inline void naive_inttmul512_gs_std2rev(int32_t *a) {
 69 |   nttmul_gs_std2rev_naive(a, 512, ntt512_inv_mixed_powers, 12289);
 70 | }
 71 | 
 72 | 
 73 | /*
 74 |  * PRODUCTS
 75 |  */
 76 | 
 77 | /*
 78 |  * Input: two arrays a and b in standard order
 79 |  *
 80 |  * Result: 
 81 |  * - the product is stored in array c, in standard order.
 82 |  * - arrays a and b are modified
 83 |  *
 84 |  * The input arrays must contain elements in the range [0 .. Q-1]
 85 |  * The result is also in that range.
 86 |  *
 87 |  * The first four variants have the following form:
 88 |  * - multiply a and b by powers of psi
 89 |  * - compute NTT(a) and NTT(b) using a std2rev variant
 90 |  * - c = elementwise product of NTT(a) and NTT(b)
 91 |  * - compute INTT(c) using a rev2std variant
 92 |  * - multiply the result by n^(-1) * powers of psi^(-1)
 93 |  * There are two choices for the NTT and INTT functions:
 94 |  * - NTT:  either ntt_ct_std2rev or ntt_gs_std2rev
 95 |  * - INTT: either intt_ct_rev2std or intt_gs_rev2std
 96 |  *
 97 |  * Product5 uses the combined mul/ntt variants:
 98 |  * - compute MULNTT(a) and MULNTT(b) using mulntt_ct_std2rev
 99 |  * - c = elementwise product
100 |  * - compute INTTMUL(c) using inttmul_gs_rev2std
101 |  * - multiply the result by n^(-1)
102 |  */
103 | extern void naive_ntt512_product1(int32_t *c, int32_t *a, int32_t *b);
104 | extern void naive_ntt512_product2(int32_t *c, int32_t *a, int32_t *b);
105 | extern void naive_ntt512_product3(int32_t *c, int32_t *a, int32_t *b);
106 | extern void naive_ntt512_product4(int32_t *c, int32_t *a, int32_t *b);
107 | 
108 | extern void naive_ntt512_product5(int32_t *c, int32_t *a, int32_t *b);
109 | 
110 | #endif /* __NAIVE_NTT512_H */
111 | 


--------------------------------------------------------------------------------
/src/test_ntt_red_tables.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tables for testing NTT using Longa&Naehrig reduction.
 3 |  */
 4 | 
 5 | #ifndef __TEST_NTT_RED_TABLES_H
 6 | #define __TEST_NTT_RED_TABLES_H
 7 | 
 8 | #include <stdint.h>
 9 | 
10 | /*
11 |  * Parameters
12 |  * - for n=16: psi=1212, omega=6553
13 |  * - for n=128: psi=1022, omega=12208
14 |  * - for n=256: psi=1002, omega=8595
15 |  * - for n=512: psi=1003, omega=10600
16 |  * - for n=1024: psi=1014, omega=8209
17 |  * - for n=2048: psi=1016, omega=12269
18 |  *
19 |  * All tables are scaled by inverse(3) = 8193 (modulo Q = 12289)
20 |  */
21 | 
22 | // powers of omega in standard order
23 | extern const int16_t shoup_red_ntt16_12289[16];
24 | extern const int16_t shoup_red_ntt128_12289[128];
25 | extern const int16_t shoup_red_ntt256_12289[256];
26 | extern const int16_t shoup_red_ntt512_12289[512];
27 | extern const int16_t shoup_red_ntt1024_12289[1024];
28 | extern const int16_t shoup_red_ntt2048_12289[2048];
29 | 
30 | // powers of omega in bit-reverse order
31 | extern const int16_t rev_shoup_red_ntt16_12289[16];
32 | extern const int16_t rev_shoup_red_ntt128_12289[128];
33 | extern const int16_t rev_shoup_red_ntt256_12289[256];
34 | extern const int16_t rev_shoup_red_ntt512_12289[512];
35 | extern const int16_t rev_shoup_red_ntt1024_12289[1024];
36 | extern const int16_t rev_shoup_red_ntt2048_12289[2048];
37 | 
38 | // powers of omega and psi in standard order
39 | extern const int16_t shoup_red_scaled_ntt16_12289[16];
40 | extern const int16_t shoup_red_scaled_ntt128_12289[128];
41 | extern const int16_t shoup_red_scaled_ntt256_12289[256];
42 | extern const int16_t shoup_red_scaled_ntt512_12289[512];
43 | extern const int16_t shoup_red_scaled_ntt1024_12289[1024];
44 | extern const int16_t shoup_red_scaled_ntt2048_12289[2048];
45 | 
46 | // powers of omega and psi in bit-reverse order
47 | extern const int16_t rev_shoup_red_scaled_ntt16_12289[16];
48 | extern const int16_t rev_shoup_red_scaled_ntt128_12289[128];
49 | extern const int16_t rev_shoup_red_scaled_ntt256_12289[256];
50 | extern const int16_t rev_shoup_red_scaled_ntt512_12289[512];
51 | extern const int16_t rev_shoup_red_scaled_ntt1024_12289[1024];
52 | extern const int16_t rev_shoup_red_scaled_ntt2048_12289[2048];
53 | 
54 | /*
55 |  * Same tables but with coefficients in the interval [-6144,+6144]
56 |  */
57 | // powers of omega in standard order
58 | extern const int16_t shoup_sred_ntt16_12289[16];
59 | extern const int16_t shoup_sred_ntt128_12289[128];
60 | extern const int16_t shoup_sred_ntt256_12289[256];
61 | extern const int16_t shoup_sred_ntt512_12289[512];
62 | extern const int16_t shoup_sred_ntt1024_12289[1024];
63 | extern const int16_t shoup_sred_ntt2048_12289[2048];
64 | 
65 | // powers of omega in bit-reverse order
66 | extern const int16_t rev_shoup_sred_ntt16_12289[16];
67 | extern const int16_t rev_shoup_sred_ntt128_12289[128];
68 | extern const int16_t rev_shoup_sred_ntt256_12289[256];
69 | extern const int16_t rev_shoup_sred_ntt512_12289[512];
70 | extern const int16_t rev_shoup_sred_ntt1024_12289[1024];
71 | extern const int16_t rev_shoup_sred_ntt2048_12289[2048];
72 | 
73 | // powers of omega and psi in standard order
74 | extern const int16_t shoup_sred_scaled_ntt16_12289[16];
75 | extern const int16_t shoup_sred_scaled_ntt128_12289[128];
76 | extern const int16_t shoup_sred_scaled_ntt256_12289[256];
77 | extern const int16_t shoup_sred_scaled_ntt512_12289[512];
78 | extern const int16_t shoup_sred_scaled_ntt1024_12289[1024];
79 | extern const int16_t shoup_sred_scaled_ntt2048_12289[2048];
80 | 
81 | // powers of omega and psi in bit-reverse order
82 | extern const int16_t rev_shoup_sred_scaled_ntt16_12289[16];
83 | extern const int16_t rev_shoup_sred_scaled_ntt128_12289[128];
84 | extern const int16_t rev_shoup_sred_scaled_ntt256_12289[256];
85 | extern const int16_t rev_shoup_sred_scaled_ntt512_12289[512];
86 | extern const int16_t rev_shoup_sred_scaled_ntt1024_12289[1024];
87 | extern const int16_t rev_shoup_sred_scaled_ntt2048_12289[2048];
88 | 
89 | 
90 | #endif /* __TEST_NTT_RED_TABLES_H */
91 | 


--------------------------------------------------------------------------------
/src/naive_ntt1024.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Naive NTT for Q=12289 and n=1024
  3 |  */
  4 | 
  5 | #ifndef __NAIVE_NTT1024_H
  6 | #define __NAIVE_NTT1024_H
  7 | 
  8 | #include "ntt1024_tables.h"
  9 | #include "naive_ntt.h"
 10 | 
 11 | 
 12 | /*
 13 |  * NTT VARIANTS
 14 |  *
 15 |  * - the input a is an array of n integers that must be between 0 and Q-1
 16 |  * - the result is stored in place
 17 |  * - the inverse transforms return a result scaled by n:
 18 |  *    we have intt(ntt(a)) = n * a
 19 |  */
 20 | // forward
 21 | static inline void naive_ntt1024_ct_rev2std(int32_t *a) {
 22 |   ntt_ct_rev2std_naive(a, 1024, ntt1024_omega_powers, 12289);
 23 | }
 24 | 
 25 | static inline void naive_ntt1024_gs_rev2std(int32_t *a) {
 26 |   ntt_gs_rev2std_naive(a, 1024, ntt1024_omega_powers_rev, 12289);
 27 | }
 28 | 
 29 | static inline void naive_ntt1024_ct_std2rev(int32_t *a) {
 30 |   ntt_ct_std2rev_naive(a, 1024, ntt1024_omega_powers_rev, 12289);
 31 | }
 32 | 
 33 | static inline void naive_ntt1024_gs_std2rev(int32_t *a) {
 34 |   ntt_gs_std2rev_naive(a, 1024, ntt1024_omega_powers, 12289);
 35 | }
 36 | 
 37 | // inverse
 38 | static inline void naive_intt1024_ct_rev2std(int32_t *a) {
 39 |   ntt_ct_rev2std_naive(a, 1024, ntt1024_inv_omega_powers, 12289);
 40 | }
 41 | 
 42 | static inline void naive_intt1024_gs_rev2std(int32_t *a) {
 43 |   ntt_gs_rev2std_naive(a, 1024, ntt1024_inv_omega_powers_rev, 12289);
 44 | }
 45 | 
 46 | static inline void naive_intt1024_ct_std2rev(int32_t *a) {
 47 |   ntt_ct_std2rev_naive(a, 1024, ntt1024_inv_omega_powers_rev, 12289);
 48 | }
 49 | 
 50 | static inline void naive_intt1024_gs_std2rev(int32_t *a) {
 51 |   ntt_gs_std2rev_naive(a, 1024, ntt1024_inv_omega_powers, 12289);
 52 | }
 53 | 
 54 | // multiplication by powers of psi then forward ntt
 55 | static inline void naive_mulntt1024_ct_rev2std(int32_t *a) {
 56 |   mulntt_ct_rev2std_naive(a, 1024, ntt1024_mixed_powers, 12289);
 57 | }
 58 | 
 59 | static inline void naive_mulntt1024_ct_std2rev(int32_t *a) {
 60 |   mulntt_ct_std2rev_naive(a, 1024, ntt1024_mixed_powers_rev, 12289);
 61 | }
 62 | 
 63 | // inverse ntt then multiplication by powers of psi^-1
 64 | static inline void naive_inttmul1024_gs_rev2std(int32_t *a) {
 65 |   nttmul_gs_rev2std_naive(a, 1024, ntt1024_inv_mixed_powers_rev, 12289);
 66 | }
 67 | 
 68 | static inline void naive_inttmul1024_gs_std2rev(int32_t *a) {
 69 |   nttmul_gs_std2rev_naive(a, 1024, ntt1024_inv_mixed_powers, 12289);
 70 | }
 71 | 
 72 | 
 73 | /*
 74 |  * PRODUCTS
 75 |  */
 76 | 
 77 | /*
 78 |  * Input: two arrays a and b in standard order
 79 |  *
 80 |  * Result: 
 81 |  * - the product is stored in array c, in standard order.
 82 |  * - arrays a and b are modified
 83 |  *
 84 |  * The input arrays must contain elements in the range [0 .. Q-1]
 85 |  * The result is also in that range.
 86 |  *
 87 |  * The first four variants have the following form:
 88 |  * - multiply a and b by powers of psi
 89 |  * - compute NTT(a) and NTT(b) using a std2rev variant
 90 |  * - c = elementwise product of NTT(a) and NTT(b)
 91 |  * - compute INTT(c) using a rev2std variant
 92 |  * - multiply the result by n^(-1) * powers of psi^(-1)
 93 |  * There are two choices for the NTT and INTT functions:
 94 |  * - NTT:  either ntt_ct_std2rev or ntt_gs_std2rev
 95 |  * - INTT: either intt_ct_rev2std or intt_gs_rev2std
 96 |  *
 97 |  * Product5 uses the combined mul/ntt variants:
 98 |  * - compute MULNTT(a) and MULNTT(b) using mulntt_ct_std2rev
 99 |  * - c = elementwise product
100 |  * - compute INTTMUL(c) using inttmul_gs_rev2std
101 |  * - multiply the result by n^(-1)
102 |  */
103 | extern void naive_ntt1024_product1(int32_t *c, int32_t *a, int32_t *b);
104 | extern void naive_ntt1024_product2(int32_t *c, int32_t *a, int32_t *b);
105 | extern void naive_ntt1024_product3(int32_t *c, int32_t *a, int32_t *b);
106 | extern void naive_ntt1024_product4(int32_t *c, int32_t *a, int32_t *b);
107 | 
108 | extern void naive_ntt1024_product5(int32_t *c, int32_t *a, int32_t *b);
109 | 
110 | #endif /* __NAIVE_NTT1024_H */
111 | 


--------------------------------------------------------------------------------
/src/ntt32_tables.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Parameters:
 3 |  * - q = 12289
 4 |  * - n = 32
 5 |  * - psi = 563
 6 |  * - omega = psi^2 = 9744
 7 |  * - inverse of psi = 5828
 8 |  * - inverse of omega = 11077
 9 |  * - inverse of n = 11905
10 |  */
11 | 
12 | #include "ntt32_tables.h"
13 | 
14 | const uint16_t ntt32_bitrev[BITREV32_NPAIRS][2] = {
15 |     {     1,    16 }, {     2,     8 }, {     3,    24 }, {     5,    20 },
16 |     {     6,    12 }, {     7,    28 }, {     9,    18 }, {    11,    26 },
17 |     {    13,    22 }, {    15,    30 }, {    19,    25 }, {    23,    29 },
18 | };
19 | 
/*
 * Powers of psi in standard order:
 * ntt32_psi_powers[i] = psi^i mod q, with psi = 563 and q = 12289.
 */
const uint16_t ntt32_psi_powers[32] = {
  /*  0 ..  7 */ 1, 563, 9744, 4978, 722, 949, 5860, 5728,
  /*  8 .. 15 */ 5146, 9283, 3504, 6512, 4134, 4821, 10643, 7266,
  /* 16 .. 23 */ 10810, 2975, 3621, 10938, 1305, 9664, 9094, 7698,
  /* 24 .. 31 */ 8246, 9545, 3542, 3328, 5736, 9650, 1212, 6461,
};
26 | 
/*
 * Powers of the inverse of psi in standard order:
 * ntt32_inv_psi_powers[i] = (psi^-1)^i mod q, with psi^-1 = 5828.
 */
const uint16_t ntt32_inv_psi_powers[32] = {
  /*  0 ..  7 */ 1, 5828, 11077, 2639, 6553, 8961, 8747, 2744,
  /*  8 .. 15 */ 4043, 4591, 3195, 2625, 10984, 1351, 8668, 9314,
  /* 16 .. 23 */ 1479, 5023, 1646, 7468, 8155, 5777, 8785, 3006,
  /* 24 .. 31 */ 7143, 6561, 6429, 11340, 11567, 7311, 2545, 11726,
};
33 | 
/*
 * Powers of the inverse of psi, each multiplied by the inverse of n:
 * entry i is n^-1 * (psi^-1)^i mod q, with n^-1 = 11905 (entry 0).
 */
const uint16_t ntt32_scaled_inv_psi_powers[32] = {
  /*  0 ..  7 */ 11905, 10935, 10715, 6611, 2893, 12185, 8338, 3158,
  /*  8 .. 15 */ 8191, 6672, 2020, 11987, 9560, 9643, 1807, 11812,
  /* 16 .. 23 */ 9647, 541, 6964, 7914, 2175, 5941, 6035, 862,
  /* 24 .. 31 */ 9824, 12110, 1353, 8035, 6890, 6757, 5840, 7279,
};
40 | 
/*
 * Powers of omega grouped by level. For t = 1, 2, 4, 8, 16 the block of
 * t entries starting at index t holds omega^(j * 16/t) for j = 0 .. t-1
 * (omega = 9744). Index 0 is a zero filler.
 */
const uint16_t ntt32_omega_powers[32] = {
  /* filler */ 0,
  /* t = 1  */ 1,
  /* t = 2  */ 1, 10810,
  /* t = 4  */ 1, 5146, 10810, 8246,
  /* t = 8  */ 1, 722, 5146, 4134, 10810, 1305, 8246, 5736,
  /* t = 16 */ 1, 9744, 722, 5860, 5146, 3504, 4134, 10643,
               10810, 3621, 1305, 9094, 8246, 3542, 5736, 1212,
};
47 | 
/*
 * Same level blocks as ntt32_omega_powers, but inside each block the
 * entries are stored in bit-reversed order of j. Index 0 is a zero filler.
 */
const uint16_t ntt32_omega_powers_rev[32] = {
  /* filler */ 0,
  /* t = 1  */ 1,
  /* t = 2  */ 1, 10810,
  /* t = 4  */ 1, 10810, 5146, 8246,
  /* t = 8  */ 1, 10810, 5146, 8246, 722, 1305, 4134, 5736,
  /* t = 16 */ 1, 10810, 5146, 8246, 722, 1305, 4134, 5736,
               9744, 3621, 3504, 3542, 5860, 9094, 10643, 1212,
};
54 | 
/*
 * Powers of the inverse of omega (11077) grouped by level: the block of
 * t entries starting at index t holds (omega^-1)^(j * 16/t), j = 0 .. t-1,
 * for t = 1, 2, 4, 8, 16. Index 0 is a zero filler.
 */
const uint16_t ntt32_inv_omega_powers[32] = {
  /* filler */ 0,
  /* t = 1  */ 1,
  /* t = 2  */ 1, 1479,
  /* t = 4  */ 1, 4043, 1479, 7143,
  /* t = 8  */ 1, 6553, 4043, 10984, 1479, 8155, 7143, 11567,
  /* t = 16 */ 1, 11077, 6553, 8747, 4043, 3195, 10984, 8668,
               1479, 1646, 8155, 8785, 7143, 6429, 11567, 2545,
};
61 | 
/*
 * Same level blocks as ntt32_inv_omega_powers, with the entries of each
 * block stored in bit-reversed order of j. Index 0 is a zero filler.
 */
const uint16_t ntt32_inv_omega_powers_rev[32] = {
  /* filler */ 0,
  /* t = 1  */ 1,
  /* t = 2  */ 1, 1479,
  /* t = 4  */ 1, 1479, 4043, 7143,
  /* t = 8  */ 1, 1479, 4043, 7143, 6553, 8155, 10984, 11567,
  /* t = 16 */ 1, 1479, 4043, 7143, 6553, 8155, 10984, 11567,
               11077, 1646, 3195, 6429, 8747, 8785, 8668, 2545,
};
68 | 
/*
 * Odd multiples of powers of psi grouped by level: for t = 1, 2, 4, 8, 16
 * the block of t entries starting at index t holds psi^((2j+1) * 16/t)
 * for j = 0 .. t-1 (psi = 563). Index 0 is a zero filler.
 */
const uint16_t ntt32_mixed_powers[32] = {
  /* filler */ 0,
  /* t = 1  */ 10810,
  /* t = 2  */ 5146, 8246,
  /* t = 4  */ 722, 4134, 1305, 5736,
  /* t = 8  */ 9744, 5860, 3504, 10643, 3621, 9094, 3542, 1212,
  /* t = 16 */ 563, 4978, 949, 5728, 9283, 6512, 4821, 7266,
               2975, 10938, 9664, 7698, 9545, 3328, 9650, 6461,
};
75 | 
/*
 * Same level blocks as ntt32_mixed_powers, with the entries of each block
 * stored in bit-reversed order of j. Index 0 is a zero filler.
 */
const uint16_t ntt32_mixed_powers_rev[32] = {
  /* filler */ 0,
  /* t = 1  */ 10810,
  /* t = 2  */ 5146, 8246,
  /* t = 4  */ 722, 1305, 4134, 5736,
  /* t = 8  */ 9744, 3621, 3504, 3542, 5860, 9094, 10643, 1212,
  /* t = 16 */ 563, 2975, 9283, 9545, 949, 9664, 4821, 9650,
               4978, 10938, 6512, 3328, 5728, 7698, 7266, 6461,
};
82 | 
/*
 * Odd multiples of powers of the inverse of psi (5828) grouped by level:
 * the block of t entries starting at index t holds
 * (psi^-1)^((2j+1) * 16/t), j = 0 .. t-1, for t = 1, 2, 4, 8, 16.
 * Index 0 is a zero filler.
 */
const uint16_t ntt32_inv_mixed_powers[32] = {
  /* filler */ 0,
  /* t = 1  */ 1479,
  /* t = 2  */ 4043, 7143,
  /* t = 4  */ 6553, 10984, 8155, 11567,
  /* t = 8  */ 11077, 8747, 3195, 8668, 1646, 8785, 6429, 2545,
  /* t = 16 */ 5828, 2639, 8961, 2744, 4591, 2625, 1351, 9314,
               5023, 7468, 5777, 3006, 6561, 11340, 7311, 11726,
};
89 | 
/*
 * Same level blocks as ntt32_inv_mixed_powers, with the entries of each
 * block stored in bit-reversed order of j. Index 0 is a zero filler.
 */
const uint16_t ntt32_inv_mixed_powers_rev[32] = {
  /* filler */ 0,
  /* t = 1  */ 1479,
  /* t = 2  */ 4043, 7143,
  /* t = 4  */ 6553, 8155, 10984, 11567,
  /* t = 8  */ 11077, 1646, 3195, 6429, 8747, 8785, 8668, 2545,
  /* t = 16 */ 5828, 5023, 4591, 6561, 8961, 5777, 1351, 7311,
               2639, 7468, 2625, 11340, 2744, 3006, 9314, 11726,
};
96 | 
97 | 


--------------------------------------------------------------------------------
/src/make_bitrev_table.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Table for bit-reverse shuffle.
  3 |  *
  4 |  * Input: n = table size.
  5 |  */
  6 | 
  7 | #include <assert.h>
  8 | #include <stdbool.h>
  9 | #include <stdlib.h>
 10 | #include <stdio.h>
 11 | #include <stdint.h>
 12 | #include <inttypes.h>
 13 | 
 14 | /*
 15 |  * Check that n is a power of two and return k such that n=2^k.
 16 |  */
 17 | static bool logtwo(uint32_t n, uint32_t *k) {
 18 |   uint32_t i;
 19 | 
 20 |   i = 0;
 21 |   while ((n & 1) == 0) {
 22 |     i ++;
 23 |     n >>= 1;
 24 |   }
 25 |   if (n == 1) {
 26 |     *k = i;
 27 |     return true;
 28 |   }
 29 |   return false;
 30 | }
 31 | 
 32 | /*
 33 |  * Bitreverse of i, interpreted as a k-bit integer
 34 |  */
 35 | static uint32_t reverse(uint32_t i, uint32_t k) {
 36 |   uint32_t x, b, j;
 37 | 
 38 |   x = 0;
 39 |   for (j=0; j<k; j++) {
 40 |     b = i & 1;
 41 |     x = (x<<1) | b;
 42 |     i >>= 1;
 43 |   }
 44 | 
 45 |   return x;
 46 | }
 47 | 
 48 | /*
 49 |  * Count the number of i such that i < bitrev(i, k), where n=2^k.
 50 |  */
 51 | static uint32_t rev_table_npairs(uint32_t n, uint32_t k) {
 52 |   uint32_t i, c;
 53 | 
 54 |   c = 0;
 55 |   for (i=0; i<n; i++) {
 56 |     c += (i < reverse(i, k));
 57 |   }
 58 |   return c;
 59 | }
 60 | 
 61 | 
 62 | /*
 63 |  * Print the table of pairs (i, reverse(i)) where i < reverse(i)
 64 |  * - n = 2^k
 65 |  */
 66 | static void print_bitrev_table(FILE *f, uint32_t n, uint32_t k) {
 67 |   uint32_t i, j, z;
 68 | 
 69 |   fprintf(f, "#include \"bitrev%"PRIu32"_table.h\"\n\n", n);
 70 |   fprintf(f, "const uint16_t bitrev%"PRIu32"[BITREV%"PRIu32"_NPAIRS][2] = {\n", n, n);
 71 |   z = 0;
 72 |   for (i=0; i<n; i++) {
 73 |     j = reverse(i, k);
 74 |     if (i < j) {
 75 |       if (z == 0) fprintf(f, "   ");
 76 |       fprintf(f, " { %5"PRIu32", %5"PRIu32" },", i, j);
 77 |       z ++;
 78 |       if (z == 4) {
 79 | 	fprintf(f, "\n");
 80 | 	z = 0;
 81 |       }
 82 |     }
 83 |   }
 84 |   if (z > 0) fprintf(f, "\n");
 85 |   fprintf(f, "};\n\n");
 86 | }
 87 | 
 88 | 
 89 | /*
 90 |  * Declarations in file f
 91 |  */
 92 | static void print_bitrev_declarations(FILE *f, uint32_t n, uint32_t k) {
 93 |   uint32_t m;
 94 | 
 95 |   m = rev_table_npairs(n, k);
 96 | 
 97 |   fprintf(f, "#ifndef __BITREV%"PRIu32"_TABLE_H\n", n);
 98 |   fprintf(f, "#define __BITREV%"PRIu32"_TABLE_H\n\n", n);
 99 |   fprintf(f, "#include <stdint.h>\n\n");
100 |   fprintf(f, "#define BITREV%"PRIu32"_NPAIRS %"PRIu32"\n\n", n, m);
101 |   fprintf(f, "extern const uint16_t bitrev%"PRIu32"[BITREV%"PRIu32"_NPAIRS][2];\n\n", n, n);
102 | 
103 |   fprintf(f, "#endif /* __BITREV%"PRIu32"_TABLE_H */\n", n);
104 | }
105 | 
/*
 * Open file "bitrev<n>_table.<suffix>" for writing
 * (suffix is "h" or "c" in this program).
 * - return NULL if the name can't be built (snprintf error or name too
 *   long for the buffer) or if the file can't be created
 */
#define BUFFER_SIZE 100

static FILE *open_file(uint32_t n, const char *suffix) {
  char filename[BUFFER_SIZE];
  int len;

  len = snprintf(filename, BUFFER_SIZE, "bitrev%"PRIu32"_table.%s", n, suffix);
  // snprintf returns a negative value on error and the untruncated length
  // otherwise: in both failure cases filename must not be used.
  if (len < 0 || len >= BUFFER_SIZE) {
    return NULL;
  }
  return fopen(filename, "w");
}
124 | 
/*
 * Usage: <program> <size>
 * - size must be a power of two, at least 2 and less than UINT16_MAX
 * - writes bitrev<size>_table.h and bitrev<size>_table.c in the current
 *   directory
 * - exits with EXIT_FAILURE on a bad argument or if a file can't be written
 */
int main(int argc, char *argv[]) {
  uint32_t n, log_n;
  long x;
  char *end;
  bool write_error;
  FILE *f;

  if (argc != 2) {
    fprintf(stderr, "Usage: %s <size>\n", argv[0]);
    exit(EXIT_FAILURE);
  }

  // size: strtol lets us reject empty/non-numeric input and trailing
  // garbage. On overflow it returns LONG_MIN or LONG_MAX, which the range
  // checks below reject.
  x = strtol(argv[1], &end, 10);
  if (end == argv[1] || *end != '\0') {
    fprintf(stderr, "Invalid size '%s': not an integer\n", argv[1]);
    exit(EXIT_FAILURE);
  }
  if (x <= 1) {
    fprintf(stderr, "Invalid size %ld: must be at least 2\n", x);
    exit(EXIT_FAILURE);
  }
  if (x >= UINT16_MAX) {
    fprintf(stderr, "The size is too large: max = %"PRIu32"\n", (uint32_t)UINT16_MAX);
    exit(EXIT_FAILURE);
  }
  n = (uint32_t) x;
  if (!logtwo(n, &log_n)) {
    fprintf(stderr, "Invalid size: %"PRIu32" is not a power of two\n", n);
    exit(EXIT_FAILURE);
  }

  // header file
  f = open_file(n, "h");
  if (f == NULL) {
    fprintf(stderr, "failed to open file 'bitrev%"PRIu32"_table.h'\n", n);
    exit(EXIT_FAILURE);
  }
  print_bitrev_declarations(f, n, log_n);
  write_error = (ferror(f) != 0);
  if (fclose(f) != 0) {
    write_error = true;
  }
  if (write_error) {
    fprintf(stderr, "failed to write file 'bitrev%"PRIu32"_table.h'\n", n);
    exit(EXIT_FAILURE);
  }

  // table definition
  f = open_file(n, "c");
  if (f == NULL) {
    fprintf(stderr, "failed to open file 'bitrev%"PRIu32"_table.c'\n", n);
    exit(EXIT_FAILURE);
  }
  print_bitrev_table(f, n, log_n);
  write_error = (ferror(f) != 0);
  if (fclose(f) != 0) {
    write_error = true;
  }
  if (write_error) {
    fprintf(stderr, "failed to write file 'bitrev%"PRIu32"_table.c'\n", n);
    exit(EXIT_FAILURE);
  }

  return 0;
}
169 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/make_bitrev_table.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Table for bit-reverse shuffle.
  3 |  *
  4 |  * Input: n = table size.
  5 |  */
  6 | 
  7 | #include <assert.h>
  8 | #include <stdbool.h>
  9 | #include <stdlib.h>
 10 | #include <stdio.h>
 11 | #include <stdint.h>
 12 | #include <inttypes.h>
 13 | 
 14 | /*
 15 |  * Check that n is a power of two and return k such that n=2^k.
 16 |  */
 17 | static bool logtwo(uint32_t n, uint32_t *k) {
 18 |   uint32_t i;
 19 | 
 20 |   i = 0;
 21 |   while ((n & 1) == 0) {
 22 |     i ++;
 23 |     n >>= 1;
 24 |   }
 25 |   if (n == 1) {
 26 |     *k = i;
 27 |     return true;
 28 |   }
 29 |   return false;
 30 | }
 31 | 
 32 | /*
 33 |  * Bitreverse of i, interpreted as a k-bit integer
 34 |  */
 35 | static uint32_t reverse(uint32_t i, uint32_t k) {
 36 |   uint32_t x, b, j;
 37 | 
 38 |   x = 0;
 39 |   for (j=0; j<k; j++) {
 40 |     b = i & 1;
 41 |     x = (x<<1) | b;
 42 |     i >>= 1;
 43 |   }
 44 | 
 45 |   return x;
 46 | }
 47 | 
 48 | /*
 49 |  * Count the number of i such that i < bitrev(i, k), where n=2^k.
 50 |  */
 51 | static uint32_t rev_table_npairs(uint32_t n, uint32_t k) {
 52 |   uint32_t i, c;
 53 | 
 54 |   c = 0;
 55 |   for (i=0; i<n; i++) {
 56 |     c += (i < reverse(i, k));
 57 |   }
 58 |   return c;
 59 | }
 60 | 
 61 | 
 62 | /*
 63 |  * Print the table of pairs (i, reverse(i)) where i < reverse(i)
 64 |  * - n = 2^k
 65 |  */
 66 | static void print_bitrev_table(FILE *f, uint32_t n, uint32_t k) {
 67 |   uint32_t i, j, z;
 68 | 
 69 |   fprintf(f, "#include \"bitrev%"PRIu32"_table.h\"\n\n", n);
 70 |   fprintf(f, "const uint16_t bitrev%"PRIu32"[BITREV%"PRIu32"_NPAIRS][2] = {\n", n, n);
 71 |   z = 0;
 72 |   for (i=0; i<n; i++) {
 73 |     j = reverse(i, k);
 74 |     if (i < j) {
 75 |       if (z == 0) fprintf(f, "   ");
 76 |       fprintf(f, " { %5"PRIu32", %5"PRIu32" },", i, j);
 77 |       z ++;
 78 |       if (z == 4) {
 79 | 	fprintf(f, "\n");
 80 | 	z = 0;
 81 |       }
 82 |     }
 83 |   }
 84 |   if (z > 0) fprintf(f, "\n");
 85 |   fprintf(f, "};\n\n");
 86 | }
 87 | 
 88 | 
 89 | /*
 90 |  * Declarations in file f
 91 |  */
 92 | static void print_bitrev_declarations(FILE *f, uint32_t n, uint32_t k) {
 93 |   uint32_t m;
 94 | 
 95 |   m = rev_table_npairs(n, k);
 96 | 
 97 |   fprintf(f, "#ifndef __BITREV%"PRIu32"_TABLE_H\n", n);
 98 |   fprintf(f, "#define __BITREV%"PRIu32"_TABLE_H\n\n", n);
 99 |   fprintf(f, "#include <stdint.h>\n\n");
100 |   fprintf(f, "#define BITREV%"PRIu32"_NPAIRS %"PRIu32"\n\n", n, m);
101 |   fprintf(f, "extern const uint16_t bitrev%"PRIu32"[BITREV%"PRIu32"_NPAIRS][2];\n\n", n, n);
102 | 
103 |   fprintf(f, "#endif /* __BITREV%"PRIu32"_TABLE_H */\n", n);
104 | }
105 | 
/*
 * Open file "bitrev<n>_table.<suffix>" for writing
 * (suffix is "h" or "c" in this program).
 * - return NULL if the name can't be built (snprintf error or name too
 *   long for the buffer) or if the file can't be created
 */
#define BUFFER_SIZE 100

static FILE *open_file(uint32_t n, const char *suffix) {
  char filename[BUFFER_SIZE];
  int len;

  len = snprintf(filename, BUFFER_SIZE, "bitrev%"PRIu32"_table.%s", n, suffix);
  // snprintf returns a negative value on error and the untruncated length
  // otherwise: in both failure cases filename must not be used.
  if (len < 0 || len >= BUFFER_SIZE) {
    return NULL;
  }
  return fopen(filename, "w");
}
124 | 
/*
 * Usage: <program> <size>
 * - size must be a power of two, at least 2 and less than UINT16_MAX
 * - writes bitrev<size>_table.h and bitrev<size>_table.c in the current
 *   directory
 * - exits with EXIT_FAILURE on a bad argument or if a file can't be written
 */
int main(int argc, char *argv[]) {
  uint32_t n, log_n;
  long x;
  char *end;
  bool write_error;
  FILE *f;

  if (argc != 2) {
    fprintf(stderr, "Usage: %s <size>\n", argv[0]);
    exit(EXIT_FAILURE);
  }

  // size: strtol lets us reject empty/non-numeric input and trailing
  // garbage. On overflow it returns LONG_MIN or LONG_MAX, which the range
  // checks below reject.
  x = strtol(argv[1], &end, 10);
  if (end == argv[1] || *end != '\0') {
    fprintf(stderr, "Invalid size '%s': not an integer\n", argv[1]);
    exit(EXIT_FAILURE);
  }
  if (x <= 1) {
    fprintf(stderr, "Invalid size %ld: must be at least 2\n", x);
    exit(EXIT_FAILURE);
  }
  if (x >= UINT16_MAX) {
    fprintf(stderr, "The size is too large: max = %"PRIu32"\n", (uint32_t)UINT16_MAX);
    exit(EXIT_FAILURE);
  }
  n = (uint32_t) x;
  if (!logtwo(n, &log_n)) {
    fprintf(stderr, "Invalid size: %"PRIu32" is not a power of two\n", n);
    exit(EXIT_FAILURE);
  }

  // header file
  f = open_file(n, "h");
  if (f == NULL) {
    fprintf(stderr, "failed to open file 'bitrev%"PRIu32"_table.h'\n", n);
    exit(EXIT_FAILURE);
  }
  print_bitrev_declarations(f, n, log_n);
  write_error = (ferror(f) != 0);
  if (fclose(f) != 0) {
    write_error = true;
  }
  if (write_error) {
    fprintf(stderr, "failed to write file 'bitrev%"PRIu32"_table.h'\n", n);
    exit(EXIT_FAILURE);
  }

  // table definition
  f = open_file(n, "c");
  if (f == NULL) {
    fprintf(stderr, "failed to open file 'bitrev%"PRIu32"_table.c'\n", n);
    exit(EXIT_FAILURE);
  }
  print_bitrev_table(f, n, log_n);
  write_error = (ferror(f) != 0);
  if (fclose(f) != 0) {
    write_error = true;
  }
  if (write_error) {
    fprintf(stderr, "failed to write file 'bitrev%"PRIu32"_table.c'\n", n);
    exit(EXIT_FAILURE);
  }

  return 0;
}
169 | 


--------------------------------------------------------------------------------
/src/ntt_red16.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * NTT for Q=12289, n=16, using the Longa/Naehrig reduction method.
  3 |  */
  4 | 
  5 | #include "ntt_red16.h"
  6 | 
  7 | /*
  8 |  * Input: two arrays a and b in standard order
  9 |  *
 10 |  * Result: 
 11 |  * - the product is stored in array c, in standard order.
 12 |  * - arrays a and b are modified
 13 |  *
 14 |  * The input arrays must contain elements in the range [0, Q-1]
 15 |  * The result is also in that range.
 16 |  */
 17 | void ntt_red16_product1(int32_t *c, int32_t *a, int32_t *b) {
 18 |   shift_array(a, 16); // convert to [-(Q-1)/2, (Q-1)/2]
 19 |   mul_reduce_array16(a, 16, ntt_red16_psi_powers);
 20 |   ntt_red16_ct_std2rev(a);
 21 |   reduce_array(a, 16);
 22 | 
 23 |   shift_array(b, 16);
 24 |   mul_reduce_array16(b, 16, ntt_red16_psi_powers);
 25 |   ntt_red16_ct_std2rev(b);
 26 |   reduce_array(b, 16);
 27 |   
 28 |   // at this point:
 29 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 30 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 31 |   mul_reduce_array(c, 16, a, b); // c[i] = 3 * a[i] * b[i] 
 32 |   reduce_array_twice(c, 16);  // c[i] = 9 * c[i] mod Q
 33 | 
 34 |   // we have: -130 <= c[i] <= 12413
 35 |   intt_red16_ct_rev2std(c);
 36 |   mul_reduce_array16(c, 16, ntt_red16_scaled_inv_psi_powers);
 37 |   reduce_array_twice(c, 16); // c[i] = 9 * c[i] mod Q
 38 |   correct(c, 16);
 39 | }
 40 | 
 41 | void ntt_red16_product2(int32_t *c, int32_t *a, int32_t *b) {
 42 |   shift_array(a, 16); // convert to [-(Q-1)/2, (Q-1)/2]
 43 |   mul_reduce_array16(a, 16, ntt_red16_psi_powers);
 44 |   ntt_red16_gs_std2rev(a);
 45 |   reduce_array(a, 16);
 46 | 
 47 |   shift_array(b, 16);
 48 |   mul_reduce_array16(b, 16, ntt_red16_psi_powers);
 49 |   ntt_red16_gs_std2rev(b);
 50 |   reduce_array(b, 16);
 51 |   
 52 |   // at this point:
 53 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 54 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 55 |   mul_reduce_array(c, 16, a, b); // c[i] = 3 * a[i] * b[i] 
 56 |   reduce_array_twice(c, 16);  // c[i] = 9 * c[i] mod Q
 57 | 
 58 |   // we have: -130 <= c[i] <= 12413
 59 |   intt_red16_ct_rev2std(c);
 60 |   mul_reduce_array16(c, 16, ntt_red16_scaled_inv_psi_powers);
 61 |   reduce_array_twice(c, 16); // c[i] = 9 * c[i] mod Q
 62 |   correct(c, 16);
 63 | }
 64 | 
 65 | void ntt_red16_product3(int32_t *c, int32_t *a, int32_t *b) {
 66 |   shift_array(a, 16); // convert to [-(Q-1)/2, (Q-1)/2]
 67 |   mul_reduce_array16(a, 16, ntt_red16_psi_powers);
 68 |   ntt_red16_ct_std2rev(a);
 69 |   reduce_array(a, 16);
 70 | 
 71 |   shift_array(b, 16);
 72 |   mul_reduce_array16(b, 16, ntt_red16_psi_powers);
 73 |   ntt_red16_ct_std2rev(b);
 74 |   reduce_array(b, 16);
 75 |   
 76 |   // at this point:
 77 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 78 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 79 |   mul_reduce_array(c, 16, a, b); // c[i] = 3 * a[i] * b[i] 
 80 |   reduce_array_twice(c, 16);  // c[i] = 9 * c[i] mod Q
 81 | 
 82 |   // we have: -130 <= c[i] <= 12413
 83 |   intt_red16_gs_rev2std(c);
 84 |   mul_reduce_array16(c, 16, ntt_red16_scaled_inv_psi_powers);
 85 |   reduce_array_twice(c, 16); // c[i] = 9 * c[i] mod Q
 86 |   correct(c, 16);
 87 | }
 88 | 
 89 | void ntt_red16_product4(int32_t *c, int32_t *a, int32_t *b) {
 90 |   shift_array(a, 16); // convert to [-(Q-1)/2, (Q-1)/2]
 91 |   mul_reduce_array16(a, 16, ntt_red16_psi_powers);
 92 |   ntt_red16_gs_std2rev(a);
 93 |   reduce_array(a, 16);
 94 | 
 95 |   shift_array(b, 16);
 96 |   mul_reduce_array16(b, 16, ntt_red16_psi_powers);
 97 |   ntt_red16_gs_std2rev(b);
 98 |   reduce_array(b, 16);
 99 |   
100 |   // at this point:
101 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
102 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
103 |   mul_reduce_array(c, 16, a, b); // c[i] = 3 * a[i] * b[i] 
104 |   reduce_array_twice(c, 16);  // c[i] = 9 * c[i] mod Q
105 | 
106 |   // we have: -130 <= c[i] <= 12413
107 |   intt_red16_gs_rev2std(c);
108 |   mul_reduce_array16(c, 16, ntt_red16_scaled_inv_psi_powers);
109 |   reduce_array_twice(c, 16); // c[i] = 9 * c[i] mod Q
110 |   correct(c, 16);
111 | }
112 | 
113 | void ntt_red16_product5(int32_t *c, int32_t *a, int32_t *b) {
114 |   shift_array(a, 16);
115 |   mulntt_red16_ct_std2rev(a);
116 |   reduce_array(a, 16);
117 | 
118 |   shift_array(b, 16);
119 |   mulntt_red16_ct_std2rev(b);
120 |   reduce_array(b, 16);
121 | 
122 |   mul_reduce_array(c, 16, a, b); // c[i] = 3 * a[i] * b[i] 
123 |   reduce_array_twice(c, 16);  // c[i] = 9 * c[i] mod Q
124 | 
125 |   inttmul_red16_gs_rev2std(c);
126 |   scalar_mul_reduce_array(c, 16, ntt_red16_rescale8);
127 |   reduce_array_twice(c, 16);
128 |   correct(c, 16);
129 | }
130 | 


--------------------------------------------------------------------------------
/src/ntt_red256.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * NTT for Q=12289, n=256, using the Longa/Naehrig reduction method.
  3 |  */
  4 | 
  5 | #include "ntt_red256.h"
  6 | 
  7 | /*
  8 |  * Input: two arrays a and b in standard order
  9 |  *
 10 |  * Result: 
 11 |  * - the product is stored in array c, in standard order.
 12 |  * - arrays a and b are modified
 13 |  *
 14 |  * The input arrays must contain elements in the range [0, Q-1]
 15 |  * The result is also in that range.
 16 |  */
 17 | void ntt_red256_product1(int32_t *c, int32_t *a, int32_t *b) {
 18 |   shift_array(a, 256); // convert to [-(Q-1)/2, (Q-1)/2]
 19 |   mul_reduce_array16(a, 256, ntt_red256_psi_powers);
 20 |   ntt_red256_ct_std2rev(a);
 21 |   reduce_array(a, 256);
 22 | 
 23 |   shift_array(b, 256);
 24 |   mul_reduce_array16(b, 256, ntt_red256_psi_powers);
 25 |   ntt_red256_ct_std2rev(b);
 26 |   reduce_array(b, 256);
 27 |   
 28 |   // at this point:
 29 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 30 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 31 |   mul_reduce_array(c, 256, a, b); // c[i] = 3 * a[i] * b[i] 
 32 |   reduce_array_twice(c, 256);  // c[i] = 9 * c[i] mod Q
 33 | 
 34 |   // we have: -130 <= c[i] <= 12413
 35 |   intt_red256_ct_rev2std(c);
 36 |   mul_reduce_array16(c, 256, ntt_red256_scaled_inv_psi_powers);
 37 |   reduce_array_twice(c, 256); // c[i] = 9 * c[i] mod Q
 38 |   correct(c, 256);
 39 | }
 40 | 
 41 | void ntt_red256_product2(int32_t *c, int32_t *a, int32_t *b) {
 42 |   shift_array(a, 256); // convert to [-(Q-1)/2, (Q-1)/2]
 43 |   mul_reduce_array16(a, 256, ntt_red256_psi_powers);
 44 |   ntt_red256_gs_std2rev(a);
 45 |   reduce_array(a, 256);
 46 | 
 47 |   shift_array(b, 256);
 48 |   mul_reduce_array16(b, 256, ntt_red256_psi_powers);
 49 |   ntt_red256_gs_std2rev(b);
 50 |   reduce_array(b, 256);
 51 |   
 52 |   // at this point:
 53 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 54 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 55 |   mul_reduce_array(c, 256, a, b); // c[i] = 3 * a[i] * b[i] 
 56 |   reduce_array_twice(c, 256);  // c[i] = 9 * c[i] mod Q
 57 | 
 58 |   // we have: -130 <= c[i] <= 12413
 59 |   intt_red256_ct_rev2std(c);
 60 |   mul_reduce_array16(c, 256, ntt_red256_scaled_inv_psi_powers);
 61 |   reduce_array_twice(c, 256); // c[i] = 9 * c[i] mod Q
 62 |   correct(c, 256);
 63 | }
 64 | 
 65 | void ntt_red256_product3(int32_t *c, int32_t *a, int32_t *b) {
 66 |   shift_array(a, 256); // convert to [-(Q-1)/2, (Q-1)/2]
 67 |   mul_reduce_array16(a, 256, ntt_red256_psi_powers);
 68 |   ntt_red256_ct_std2rev(a);
 69 |   reduce_array(a, 256);
 70 | 
 71 |   shift_array(b, 256);
 72 |   mul_reduce_array16(b, 256, ntt_red256_psi_powers);
 73 |   ntt_red256_ct_std2rev(b);
 74 |   reduce_array(b, 256);
 75 |   
 76 |   // at this point:
 77 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 78 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 79 |   mul_reduce_array(c, 256, a, b); // c[i] = 3 * a[i] * b[i] 
 80 |   reduce_array_twice(c, 256);  // c[i] = 9 * c[i] mod Q
 81 | 
 82 |   // we have: -130 <= c[i] <= 12413
 83 |   intt_red256_gs_rev2std(c);
 84 |   mul_reduce_array16(c, 256, ntt_red256_scaled_inv_psi_powers);
 85 |   reduce_array_twice(c, 256); // c[i] = 9 * c[i] mod Q
 86 |   correct(c, 256);
 87 | }
 88 | 
 89 | void ntt_red256_product4(int32_t *c, int32_t *a, int32_t *b) {
 90 |   shift_array(a, 256); // convert to [-(Q-1)/2, (Q-1)/2]
 91 |   mul_reduce_array16(a, 256, ntt_red256_psi_powers);
 92 |   ntt_red256_gs_std2rev(a);
 93 |   reduce_array(a, 256);
 94 | 
 95 |   shift_array(b, 256);
 96 |   mul_reduce_array16(b, 256, ntt_red256_psi_powers);
 97 |   ntt_red256_gs_std2rev(b);
 98 |   reduce_array(b, 256);
 99 |   
100 |   // at this point:
101 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
102 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
103 |   mul_reduce_array(c, 256, a, b); // c[i] = 3 * a[i] * b[i] 
104 |   reduce_array_twice(c, 256);  // c[i] = 9 * c[i] mod Q
105 | 
106 |   // we have: -130 <= c[i] <= 12413
107 |   intt_red256_gs_rev2std(c);
108 |   mul_reduce_array16(c, 256, ntt_red256_scaled_inv_psi_powers);
109 |   reduce_array_twice(c, 256); // c[i] = 9 * c[i] mod Q
110 |   correct(c, 256);
111 | }
112 | 
113 | void ntt_red256_product5(int32_t *c, int32_t *a, int32_t *b) {
114 |   shift_array(a, 256);
115 |   mulntt_red256_ct_std2rev(a);
116 |   reduce_array(a, 256);
117 | 
118 |   shift_array(b, 256);
119 |   mulntt_red256_ct_std2rev(b);
120 |   reduce_array(b, 256);
121 | 
122 |   mul_reduce_array(c, 256, a, b); // c[i] = 3 * a[i] * b[i] 
123 |   reduce_array_twice(c, 256);  // c[i] = 9 * c[i] mod Q
124 | 
125 |   inttmul_red256_gs_rev2std(c);
126 |   scalar_mul_reduce_array(c, 256, ntt_red256_rescale8);
127 |   reduce_array_twice(c, 256);
128 |   correct(c, 256);
129 | }
130 | 


--------------------------------------------------------------------------------
/src/ntt_red512.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * NTT for Q=12289, n=512, using the Longa/Naehrig reduction method.
  3 |  */
  4 | 
  5 | #include "ntt_red512.h"
  6 | 
  7 | /*
  8 |  * Input: two arrays a and b in standard order
  9 |  *
 10 |  * Result: 
 11 |  * - the product is stored in array c, in standard order.
 12 |  * - arrays a and b are modified
 13 |  *
 14 |  * The input arrays must contain elements in the range [0, Q-1]
 15 |  * The result is also in that range.
 16 |  */
 17 | void ntt_red512_product1(int32_t *c, int32_t *a, int32_t *b) {
 18 |   shift_array(a, 512); // convert to [-(Q-1)/2, (Q-1)/2]
 19 |   mul_reduce_array16(a, 512, ntt_red512_psi_powers);
 20 |   ntt_red512_ct_std2rev(a);
 21 |   reduce_array(a, 512);
 22 | 
 23 |   shift_array(b, 512);
 24 |   mul_reduce_array16(b, 512, ntt_red512_psi_powers);
 25 |   ntt_red512_ct_std2rev(b);
 26 |   reduce_array(b, 512);
 27 |   
 28 |   // at this point:
 29 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 30 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 31 |   mul_reduce_array(c, 512, a, b); // c[i] = 3 * a[i] * b[i] 
 32 |   reduce_array_twice(c, 512);  // c[i] = 9 * c[i] mod Q
 33 | 
 34 |   // we have: -130 <= c[i] <= 12413
 35 |   intt_red512_ct_rev2std(c);
 36 |   mul_reduce_array16(c, 512, ntt_red512_scaled_inv_psi_powers);
 37 |   reduce_array_twice(c, 512); // c[i] = 9 * c[i] mod Q
 38 |   correct(c, 512);
 39 | }
 40 | 
 41 | void ntt_red512_product2(int32_t *c, int32_t *a, int32_t *b) {
 42 |   shift_array(a, 512); // convert to [-(Q-1)/2, (Q-1)/2]
 43 |   mul_reduce_array16(a, 512, ntt_red512_psi_powers);
 44 |   ntt_red512_gs_std2rev(a);
 45 |   reduce_array(a, 512);
 46 | 
 47 |   shift_array(b, 512);
 48 |   mul_reduce_array16(b, 512, ntt_red512_psi_powers);
 49 |   ntt_red512_gs_std2rev(b);
 50 |   reduce_array(b, 512);
 51 |   
 52 |   // at this point:
 53 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 54 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 55 |   mul_reduce_array(c, 512, a, b); // c[i] = 3 * a[i] * b[i] 
 56 |   reduce_array_twice(c, 512);  // c[i] = 9 * c[i] mod Q
 57 | 
 58 |   // we have: -130 <= c[i] <= 12413
 59 |   intt_red512_ct_rev2std(c);
 60 |   mul_reduce_array16(c, 512, ntt_red512_scaled_inv_psi_powers);
 61 |   reduce_array_twice(c, 512); // c[i] = 9 * c[i] mod Q
 62 |   correct(c, 512);
 63 | }
 64 | 
 65 | void ntt_red512_product3(int32_t *c, int32_t *a, int32_t *b) {
 66 |   shift_array(a, 512); // convert to [-(Q-1)/2, (Q-1)/2]
 67 |   mul_reduce_array16(a, 512, ntt_red512_psi_powers);
 68 |   ntt_red512_ct_std2rev(a);
 69 |   reduce_array(a, 512);
 70 | 
 71 |   shift_array(b, 512);
 72 |   mul_reduce_array16(b, 512, ntt_red512_psi_powers);
 73 |   ntt_red512_ct_std2rev(b);
 74 |   reduce_array(b, 512);
 75 |   
 76 |   // at this point:
 77 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 78 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 79 |   mul_reduce_array(c, 512, a, b); // c[i] = 3 * a[i] * b[i] 
 80 |   reduce_array_twice(c, 512);  // c[i] = 9 * c[i] mod Q
 81 | 
 82 |   // we have: -130 <= c[i] <= 12413
 83 |   intt_red512_gs_rev2std(c);
 84 |   mul_reduce_array16(c, 512, ntt_red512_scaled_inv_psi_powers);
 85 |   reduce_array_twice(c, 512); // c[i] = 9 * c[i] mod Q
 86 |   correct(c, 512);
 87 | }
 88 | 
 89 | void ntt_red512_product4(int32_t *c, int32_t *a, int32_t *b) {
 90 |   shift_array(a, 512); // convert to [-(Q-1)/2, (Q-1)/2]
 91 |   mul_reduce_array16(a, 512, ntt_red512_psi_powers);
 92 |   ntt_red512_gs_std2rev(a);
 93 |   reduce_array(a, 512);
 94 | 
 95 |   shift_array(b, 512);
 96 |   mul_reduce_array16(b, 512, ntt_red512_psi_powers);
 97 |   ntt_red512_gs_std2rev(b);
 98 |   reduce_array(b, 512);
 99 |   
100 |   // at this point:
101 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
102 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
103 |   mul_reduce_array(c, 512, a, b); // c[i] = 3 * a[i] * b[i] 
104 |   reduce_array_twice(c, 512);  // c[i] = 9 * c[i] mod Q
105 | 
106 |   // we have: -130 <= c[i] <= 12413
107 |   intt_red512_gs_rev2std(c);
108 |   mul_reduce_array16(c, 512, ntt_red512_scaled_inv_psi_powers);
109 |   reduce_array_twice(c, 512); // c[i] = 9 * c[i] mod Q
110 |   correct(c, 512);
111 | }
112 | 
113 | void ntt_red512_product5(int32_t *c, int32_t *a, int32_t *b) {
114 |   shift_array(a, 512);
115 |   mulntt_red512_ct_std2rev(a);
116 |   reduce_array(a, 512);
117 | 
118 |   shift_array(b, 512);
119 |   mulntt_red512_ct_std2rev(b);
120 |   reduce_array(b, 512);
121 | 
122 |   mul_reduce_array(c, 512, a, b); // c[i] = 3 * a[i] * b[i] 
123 |   reduce_array_twice(c, 512);  // c[i] = 9 * c[i] mod Q
124 | 
125 |   inttmul_red512_gs_rev2std(c);
126 |   scalar_mul_reduce_array(c, 512, ntt_red512_rescale8);
127 |   reduce_array_twice(c, 512);
128 |   correct(c, 512);
129 | }
130 | 


--------------------------------------------------------------------------------
/src/ntt_red1024.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * NTT for Q=12289, n=1024, using the Longa/Naehrig reduction method.
  3 |  */
  4 | 
  5 | #include "ntt_red1024.h"
  6 | 
  7 | /*
  8 |  * Input: two arrays a and b in standard order
  9 |  *
 10 |  * Result: 
 11 |  * - the product is stored in array c, in standard order.
 12 |  * - arrays a and b are modified
 13 |  *
 14 |  * The input arrays must contain elements in the range [0, Q-1]
 15 |  * The result is also in that range.
 16 |  */
 17 | void ntt_red1024_product1(int32_t *c, int32_t *a, int32_t *b) {
 18 |   mul_reduce_array16(a, 1024, ntt_red1024_psi_powers);
 19 |   ntt_red1024_ct_std2rev(a);
 20 |   reduce_array(a, 1024);
 21 | 
 22 |   mul_reduce_array16(b, 1024, ntt_red1024_psi_powers);
 23 |   ntt_red1024_ct_std2rev(b);
 24 |   reduce_array(b, 1024);
 25 |   
 26 |   // at this point:
 27 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 28 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 29 |   mul_reduce_array(c, 1024, a, b); // c[i] = 3 * a[i] * b[i] 
 30 |   reduce_array_twice(c, 1024);  // c[i] = 9 * c[i] mod Q
 31 | 
 32 |   // we have: -130 <= c[i] <= 12413
 33 |   intt_red1024_ct_rev2std(c);
 34 |   mul_reduce_array16(c, 1024, ntt_red1024_scaled_inv_psi_powers);
 35 |   reduce_array_twice(c, 1024); // c[i] = 9 * c[i] mod Q
 36 |   correct(c, 1024);
 37 | }
 38 | 
 39 | void ntt_red1024_product2(int32_t *c, int32_t *a, int32_t *b) {
 40 |   //  shift_array(a, 1024); // convert to [-(Q-1)/2, (Q-1)/2]
 41 |   mul_reduce_array16(a, 1024, ntt_red1024_psi_powers);
 42 |   ntt_red1024_gs_std2rev(a);
 43 |   reduce_array(a, 1024);
 44 | 
 45 |   //  shift_array(b, 1024);
 46 |   mul_reduce_array16(b, 1024, ntt_red1024_psi_powers);
 47 |   ntt_red1024_gs_std2rev(b);
 48 |   reduce_array(b, 1024);
 49 |   
 50 |   // at this point:
 51 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 52 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 53 |   mul_reduce_array(c, 1024, a, b); // c[i] = 3 * a[i] * b[i] 
 54 |   reduce_array_twice(c, 1024);  // c[i] = 9 * c[i] mod Q
 55 | 
 56 |   // we have: -130 <= c[i] <= 12413
 57 |   intt_red1024_ct_rev2std(c);
 58 |   mul_reduce_array16(c, 1024, ntt_red1024_scaled_inv_psi_powers);
 59 |   reduce_array_twice(c, 1024); // c[i] = 9 * c[i] mod Q
 60 |   correct(c, 1024);
 61 | }
 62 | 
 63 | void ntt_red1024_product3(int32_t *c, int32_t *a, int32_t *b) {
 64 |   //  shift_array(a, 1024); // convert to [-(Q-1)/2, (Q-1)/2]
 65 |   mul_reduce_array16(a, 1024, ntt_red1024_psi_powers);
 66 |   ntt_red1024_ct_std2rev(a);
 67 |   reduce_array(a, 1024);
 68 | 
 69 |   //  shift_array(b, 1024);
 70 |   mul_reduce_array16(b, 1024, ntt_red1024_psi_powers);
 71 |   ntt_red1024_ct_std2rev(b);
 72 |   reduce_array(b, 1024);
 73 |   
 74 |   // at this point:
 75 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 76 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 77 |   mul_reduce_array(c, 1024, a, b); // c[i] = 3 * a[i] * b[i] 
 78 |   reduce_array_twice(c, 1024);  // c[i] = 9 * c[i] mod Q
 79 | 
 80 |   // we have: -130 <= c[i] <= 12413
 81 |   intt_red1024_gs_rev2std(c);
 82 |   mul_reduce_array16(c, 1024, ntt_red1024_scaled_inv_psi_powers);
 83 |   reduce_array_twice(c, 1024); // c[i] = 9 * c[i] mod Q
 84 |   correct(c, 1024);
 85 | }
 86 | 
 87 | void ntt_red1024_product4(int32_t *c, int32_t *a, int32_t *b) {
 88 |   //  shift_array(a, 1024); // convert to [-(Q-1)/2, (Q-1)/2]
 89 |   mul_reduce_array16(a, 1024, ntt_red1024_psi_powers);
 90 |   ntt_red1024_gs_std2rev(a);
 91 |   reduce_array(a, 1024);
 92 | 
 93 |   //  shift_array(b, 1024);
 94 |   mul_reduce_array16(b, 1024, ntt_red1024_psi_powers);
 95 |   ntt_red1024_gs_std2rev(b);
 96 |   reduce_array(b, 1024);
 97 |   
 98 |   // at this point:
 99 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
100 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
101 |   mul_reduce_array(c, 1024, a, b); // c[i] = 3 * a[i] * b[i] 
102 |   reduce_array_twice(c, 1024);  // c[i] = 9 * c[i] mod Q
103 | 
104 |   // we have: -130 <= c[i] <= 12413
105 |   intt_red1024_gs_rev2std(c);
106 |   mul_reduce_array16(c, 1024, ntt_red1024_scaled_inv_psi_powers);
107 |   reduce_array_twice(c, 1024); // c[i] = 9 * c[i] mod Q
108 |   correct(c, 1024);
109 | }
110 | 
111 | void ntt_red1024_product5(int32_t *c, int32_t *a, int32_t *b) {
112 |   //  shift_array(a, 1024);
113 |   mulntt_red1024_ct_std2rev(a);
114 |   reduce_array(a, 1024);
115 | 
116 |   //  shift_array(b, 1024);
117 |   mulntt_red1024_ct_std2rev(b);
118 |   reduce_array(b, 1024);
119 | 
120 |   mul_reduce_array(c, 1024, a, b); // c[i] = 3 * a[i] * b[i] 
121 |   reduce_array_twice(c, 1024);  // c[i] = 9 * c[i] mod Q
122 | 
123 |   inttmul_red1024_gs_rev2std(c);
124 |   scalar_mul_reduce_array(c, 1024, ntt_red1024_rescale8);
125 |   reduce_array_twice(c, 1024);
126 |   correct(c, 1024);
127 | }
128 | 


--------------------------------------------------------------------------------
/src/test_mod.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <inttypes.h>
  3 | #include <assert.h>
  4 | 
  5 | #define Q 12289
  6 | #define P 16
  7 | #define R 4
  8 | 
  9 | int32_t smodq(int32_t x) {
 10 |   return x % Q;
 11 | }
 12 | 
 13 | uint32_t umodq(uint32_t x) {
 14 |   return x % Q;
 15 | }
 16 | 
 17 | int32_t pmodq(int32_t x) {
 18 |   int32_t r;
 19 |   r = smodq(x);
 20 |   return (r < 0) ? r + Q : r;
 21 | }
 22 | 
 23 | static uint32_t approx_qinv(uint32_t n) {
 24 |   return (((uint64_t)1)<< n)/Q+1;
 25 | }
 26 | static uint32_t approx_divq(uint32_t x, uint32_t n) {
 27 |   return (((uint64_t) x) * approx_qinv(n)) >> n;
 28 | }
 29 | 
 30 | static uint32_t approx_modq(uint32_t x, uint32_t n) {
 31 |   return x - approx_divq(x, n) * Q;
 32 | }
 33 | 
 34 | static void test_approx(uint32_t n) {
 35 |   uint32_t x;
 36 | 
 37 |   x = 0;
 38 |   do {
 39 |     if (approx_modq(x, n) != (x % Q)) {
 40 |       printf("approx %"PRIu32" fails for %"PRIu32"\n", n, x);
 41 |       return;
 42 |     }
 43 |     x ++;
 44 |     //  } while (x != 0); to test on full 32bit range
 45 |   } while (x <= (Q - 1) * (Q -1));
 46 | 
 47 |   printf("approx %"PRIu32" works\n", n);
 48 | }
 49 | 
 50 | static void test_all_approx(void) {
 51 |   uint32_t n;
 52 | 
 53 |   for (n=30; n<50; n++) {
 54 |     printf("trying approx %"PRIu32", approx(1/q) = %"PRIu32"\n", n, approx_qinv(n));
 55 |     test_approx(n);
 56 |   }
 57 | }
 58 | 
 59 | 
 60 | static int64_t approx_pinv(uint32_t n) {
 61 |   return (((int64_t) 1) << n)/Q+1;
 62 | }
 63 | 
 64 | static int32_t approx_pdivq(int32_t x, uint32_t n) {
 65 |   return (((int64_t) x) * approx_pinv(n)) >> n;
 66 | }
 67 | 
 68 | static int32_t approx_pmodq(int32_t x, uint32_t n) {
 69 |   int32_t r;
 70 |   r = x - approx_pdivq(x, n) * Q;
 71 |   return (r < Q) ? r : 0;
 72 | }
 73 | 
 74 | // check that pmod(x, n) is between 0 and Q
 75 | static void check_approx_pmodq_bound(uint32_t n) {
 76 |   int32_t x, r;
 77 | 
 78 |   for (x = 0; x<INT32_MAX; x++) {
 79 |     r = approx_pmodq(x, n);
 80 |     if (r < 0 || r > Q) {
 81 |       printf("signed-approx bound (with n=%"PRIu32") fails for x = %"PRId32" (approx = %"PRId32")\n", n, x, r);
 82 |       return;
 83 |     }
 84 |   }
 85 | 
 86 |   r = approx_pmodq(x, n);
 87 |   if (r < 0 || r > Q) {
 88 |     printf("signed-approx bound (with n=%"PRIu32") fails for x = %"PRId32" (approx = %"PRId32")\n", n, x, r);
 89 |     return;
 90 |   }
 91 | 
 92 |   for (x=-1; x>INT32_MIN; x--) {
 93 |     r = approx_pmodq(x, n);
 94 |     if (r < 0 || r > Q) {
 95 |       printf("signed-approx bound (with n=%"PRIu32") fails for x = %"PRId32" (approx = %"PRId32")\n", n, x, r);
 96 |       return;
 97 |     }
 98 |   }
 99 |   
100 |   r = approx_pmodq(x, n);
101 |   if (r < 0 || r > Q) {
102 |     printf("signed-approx bound (with n=%"PRIu32") fails for x = %"PRId32" (approx = %"PRId32")\n", n, x, r);
103 |     return;
104 |   }
105 | 
106 |   printf("Bound for signed-approx (with n=%"PRIu32") holds\n", n);
107 | }
108 | 
/*
 * Compare approx_pmodq(x, n) with pmodq(x) for every 32-bit x.
 *
 * Inputs are visited as 0, 1, ..., INT32_MAX and then
 * -1, -2, ..., INT32_MIN. The first mismatch is reported and ends the
 * test. If all values agree, the bound check
 * check_approx_pmodq_bound(n) is run as well.
 */
static void test_pmod_approx(uint32_t n) {
  int64_t v; // 64-bit counter so that the first loop can include INT32_MAX
  int32_t x, expected, got;

  for (v = 0; v <= INT32_MAX; v++) {
    x = (int32_t) v;
    got = approx_pmodq(x, n);
    expected = pmodq(x);
    if (got != expected) {
      printf("signed-approx %"PRIu32" fails for %"PRId32": pmod = %"PRId32", approx = %"PRId32"\n", n, x, expected, got);
      return;
    }
  }

  // negative inputs down to INT32_MIN + 1 are reported with a "neg:" tag
  for (v = -1; v > INT32_MIN; v--) {
    x = (int32_t) v;
    got = approx_pmodq(x, n);
    expected = pmodq(x);
    if (got != expected) {
      printf("neg: signed-approx %"PRIu32" fails for %"PRId32": pmod = %"PRId32", approx = %"PRId32"\n", n, x, expected, got);
      return;
    }
  }

  // last input: INT32_MIN
  x = INT32_MIN;
  got = approx_pmodq(x, n);
  expected = pmodq(x);
  if (got != expected) {
    printf("signed-approx %"PRIu32" fails for %"PRId32": pmod = %"PRId32", approx = %"PRId32"\n", n, x, expected, got);
    return;
  }

  printf("signed-approx %"PRIu32" works\n", n);
  printf("checking bounds\n");
  check_approx_pmodq_bound(n);
}
140 | 
/*
 * Run test_pmod_approx for every exponent n in [30, 49], printing the
 * multiplier approx_pinv(n) used for each one and a blank line after
 * each run.
 */
static void test_all_pmod_approx(void) {
  uint32_t n = 30;

  while (n < 50) {
    printf("trying signed-approx %"PRIu32", approx(1/q) = %"PRId64"\n", n, approx_pinv(n));
    test_pmod_approx(n);
    printf("\n");
    n++;
  }
}
150 | 
151 | int main(void) {
152 |   int32_t i, x, m, min;
153 | 
154 |   min = Q;
155 |   x = R;
156 |   for (i=0; i<100000; i++) {
157 |     m = smodq(x);
158 |     if (m < min) min = m;
159 |     x = P * x + R;
160 |   }
161 |   
162 |   printf("min = %"PRId32"\n", min);
163 | 
164 |   test_all_approx();
165 |   printf("\n");
166 |   test_all_pmod_approx();
167 | 
168 |   return 0;
169 | }
170 | 


--------------------------------------------------------------------------------
/verifier/vstte20-benchmarks/ntt_red1024.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * NTT for Q=12289, n=1024, using the Longa/Naehrig reduction method.
  3 |  */
  4 | 
  5 | #include "ntt_red1024.h"
  6 | 
  7 | /*
  8 |  * Input: two arrays a and b in standard order
  9 |  *
 10 |  * Result: 
 11 |  * - the product is stored in array c, in standard order.
 12 |  * - arrays a and b are modified
 13 |  *
 14 |  * The input arrays must contain elements in the range [0, Q-1]
 15 |  * The result is also in that range.
 16 |  */
 17 | void ntt_red1024_product1(int32_t *c, int32_t *a, int32_t *b) {
 18 |   //  shift_array(a, 1024); // convert to [-(Q-1)/2, (Q-1)/2]
 19 |   mul_reduce_array16(a, 1024, ntt_red1024_psi_powers);
 20 |   ntt_red1024_ct_std2rev(a);
 21 |   reduce_array(a, 1024);
 22 | 
 23 |   //  shift_array(b, 1024);
 24 |   mul_reduce_array16(b, 1024, ntt_red1024_psi_powers);
 25 |   ntt_red1024_ct_std2rev(b);
 26 |   reduce_array(b, 1024);
 27 |   
 28 |   // at this point:
 29 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 30 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 31 |   mul_reduce_array(c, 1024, a, b); // c[i] = 3 * a[i] * b[i] 
 32 |   reduce_array_twice(c, 1024);  // c[i] = 9 * c[i] mod Q
 33 | 
 34 |   // we have: -130 <= c[i] <= 12413
 35 |   intt_red1024_ct_rev2std(c);
 36 |   mul_reduce_array16(c, 1024, ntt_red1024_scaled_inv_psi_powers);
 37 |   reduce_array_twice(c, 1024); // c[i] = 9 * c[i] mod Q
 38 |   correct(c, 1024);
 39 | }
 40 | 
 41 | void ntt_red1024_product2(int32_t *c, int32_t *a, int32_t *b) {
 42 |   //  shift_array(a, 1024); // convert to [-(Q-1)/2, (Q-1)/2]
 43 |   mul_reduce_array16(a, 1024, ntt_red1024_psi_powers);
 44 |   ntt_red1024_gs_std2rev(a);
 45 |   reduce_array(a, 1024);
 46 | 
 47 |   //  shift_array(b, 1024);
 48 |   mul_reduce_array16(b, 1024, ntt_red1024_psi_powers);
 49 |   ntt_red1024_gs_std2rev(b);
 50 |   reduce_array(b, 1024);
 51 |   
 52 |   // at this point:
 53 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 54 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 55 |   mul_reduce_array(c, 1024, a, b); // c[i] = 3 * a[i] * b[i] 
 56 |   reduce_array_twice(c, 1024);  // c[i] = 9 * c[i] mod Q
 57 | 
 58 |   // we have: -130 <= c[i] <= 12413
 59 |   intt_red1024_ct_rev2std(c);
 60 |   mul_reduce_array16(c, 1024, ntt_red1024_scaled_inv_psi_powers);
 61 |   reduce_array_twice(c, 1024); // c[i] = 9 * c[i] mod Q
 62 |   correct(c, 1024);
 63 | }
 64 | 
 65 | void ntt_red1024_product3(int32_t *c, int32_t *a, int32_t *b) {
 66 |   //  shift_array(a, 1024); // convert to [-(Q-1)/2, (Q-1)/2]
 67 |   mul_reduce_array16(a, 1024, ntt_red1024_psi_powers);
 68 |   ntt_red1024_ct_std2rev(a);
 69 |   reduce_array(a, 1024);
 70 | 
 71 |   //  shift_array(b, 1024);
 72 |   mul_reduce_array16(b, 1024, ntt_red1024_psi_powers);
 73 |   ntt_red1024_ct_std2rev(b);
 74 |   reduce_array(b, 1024);
 75 |   
 76 |   // at this point:
 77 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 78 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 79 |   mul_reduce_array(c, 1024, a, b); // c[i] = 3 * a[i] * b[i] 
 80 |   reduce_array_twice(c, 1024);  // c[i] = 9 * c[i] mod Q
 81 | 
 82 |   // we have: -130 <= c[i] <= 12413
 83 |   intt_red1024_gs_rev2std(c);
 84 |   mul_reduce_array16(c, 1024, ntt_red1024_scaled_inv_psi_powers);
 85 |   reduce_array_twice(c, 1024); // c[i] = 9 * c[i] mod Q
 86 |   correct(c, 1024);
 87 | }
 88 | 
 89 | void ntt_red1024_product4(int32_t *c, int32_t *a, int32_t *b) {
 90 |   //  shift_array(a, 1024); // convert to [-(Q-1)/2, (Q-1)/2]
 91 |   mul_reduce_array16(a, 1024, ntt_red1024_psi_powers);
 92 |   ntt_red1024_gs_std2rev(a);
 93 |   reduce_array(a, 1024);
 94 | 
 95 |   //  shift_array(b, 1024);
 96 |   mul_reduce_array16(b, 1024, ntt_red1024_psi_powers);
 97 |   ntt_red1024_gs_std2rev(b);
 98 |   reduce_array(b, 1024);
 99 |   
100 |   // at this point:
101 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
102 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
103 |   mul_reduce_array(c, 1024, a, b); // c[i] = 3 * a[i] * b[i] 
104 |   reduce_array_twice(c, 1024);  // c[i] = 9 * c[i] mod Q
105 | 
106 |   // we have: -130 <= c[i] <= 12413
107 |   intt_red1024_gs_rev2std(c);
108 |   mul_reduce_array16(c, 1024, ntt_red1024_scaled_inv_psi_powers);
109 |   reduce_array_twice(c, 1024); // c[i] = 9 * c[i] mod Q
110 |   correct(c, 1024);
111 | }
112 | 
113 | void ntt_red1024_product5(int32_t *c, int32_t *a, int32_t *b) {
114 |   //  shift_array(a, 1024);
115 |   mulntt_red1024_ct_std2rev(a);
116 |   reduce_array(a, 1024);
117 | 
118 |   //  shift_array(b, 1024);
119 |   mulntt_red1024_ct_std2rev(b);
120 |   reduce_array(b, 1024);
121 | 
122 |   mul_reduce_array(c, 1024, a, b); // c[i] = 3 * a[i] * b[i] 
123 |   reduce_array_twice(c, 1024);  // c[i] = 9 * c[i] mod Q
124 | 
125 |   inttmul_red1024_gs_rev2std(c);
126 |   scalar_mul_reduce_array(c, 1024, ntt_red1024_rescale);
127 |   reduce_array_twice(c, 1024);
128 |   correct(c, 1024);
129 | }
130 | 


--------------------------------------------------------------------------------
/src/bitrev512_table.c:
--------------------------------------------------------------------------------
 1 | #include "bitrev512_table.h"
 2 | 
/*
 * Bit-reversal pairs for indices of 9 bits (0 .. 511).
 *
 * Each entry is a pair {i, j} with i < j where j is i with its 9 bits
 * written in reverse order (e.g. 1 = 000000001b and 256 = 100000000b).
 * Indices that are equal to their own reversal do not appear. The pairs
 * are sorted by increasing i; 240 pairs are listed here.
 *
 * Presumably used to swap elements i and j when moving an array of 512
 * elements between standard and bit-reversed order -- confirm against
 * the callers.
 */
const uint16_t bitrev512[BITREV512_NPAIRS][2] = {
    {     1,   256 }, {     2,   128 }, {     3,   384 }, {     4,    64 },
    {     5,   320 }, {     6,   192 }, {     7,   448 }, {     8,    32 },
    {     9,   288 }, {    10,   160 }, {    11,   416 }, {    12,    96 },
    {    13,   352 }, {    14,   224 }, {    15,   480 }, {    17,   272 },
    {    18,   144 }, {    19,   400 }, {    20,    80 }, {    21,   336 },
    {    22,   208 }, {    23,   464 }, {    24,    48 }, {    25,   304 },
    {    26,   176 }, {    27,   432 }, {    28,   112 }, {    29,   368 },
    {    30,   240 }, {    31,   496 }, {    33,   264 }, {    34,   136 },
    {    35,   392 }, {    36,    72 }, {    37,   328 }, {    38,   200 },
    {    39,   456 }, {    41,   296 }, {    42,   168 }, {    43,   424 },
    {    44,   104 }, {    45,   360 }, {    46,   232 }, {    47,   488 },
    {    49,   280 }, {    50,   152 }, {    51,   408 }, {    52,    88 },
    {    53,   344 }, {    54,   216 }, {    55,   472 }, {    57,   312 },
    {    58,   184 }, {    59,   440 }, {    60,   120 }, {    61,   376 },
    {    62,   248 }, {    63,   504 }, {    65,   260 }, {    66,   132 },
    {    67,   388 }, {    69,   324 }, {    70,   196 }, {    71,   452 },
    {    73,   292 }, {    74,   164 }, {    75,   420 }, {    76,   100 },
    {    77,   356 }, {    78,   228 }, {    79,   484 }, {    81,   276 },
    {    82,   148 }, {    83,   404 }, {    85,   340 }, {    86,   212 },
    {    87,   468 }, {    89,   308 }, {    90,   180 }, {    91,   436 },
    {    92,   116 }, {    93,   372 }, {    94,   244 }, {    95,   500 },
    {    97,   268 }, {    98,   140 }, {    99,   396 }, {   101,   332 },
    {   102,   204 }, {   103,   460 }, {   105,   300 }, {   106,   172 },
    {   107,   428 }, {   109,   364 }, {   110,   236 }, {   111,   492 },
    {   113,   284 }, {   114,   156 }, {   115,   412 }, {   117,   348 },
    {   118,   220 }, {   119,   476 }, {   121,   316 }, {   122,   188 },
    {   123,   444 }, {   125,   380 }, {   126,   252 }, {   127,   508 },
    {   129,   258 }, {   131,   386 }, {   133,   322 }, {   134,   194 },
    {   135,   450 }, {   137,   290 }, {   138,   162 }, {   139,   418 },
    {   141,   354 }, {   142,   226 }, {   143,   482 }, {   145,   274 },
    {   147,   402 }, {   149,   338 }, {   150,   210 }, {   151,   466 },
    {   153,   306 }, {   154,   178 }, {   155,   434 }, {   157,   370 },
    {   158,   242 }, {   159,   498 }, {   161,   266 }, {   163,   394 },
    {   165,   330 }, {   166,   202 }, {   167,   458 }, {   169,   298 },
    {   171,   426 }, {   173,   362 }, {   174,   234 }, {   175,   490 },
    {   177,   282 }, {   179,   410 }, {   181,   346 }, {   182,   218 },
    {   183,   474 }, {   185,   314 }, {   187,   442 }, {   189,   378 },
    {   190,   250 }, {   191,   506 }, {   193,   262 }, {   195,   390 },
    {   197,   326 }, {   199,   454 }, {   201,   294 }, {   203,   422 },
    {   205,   358 }, {   206,   230 }, {   207,   486 }, {   209,   278 },
    {   211,   406 }, {   213,   342 }, {   215,   470 }, {   217,   310 },
    {   219,   438 }, {   221,   374 }, {   222,   246 }, {   223,   502 },
    {   225,   270 }, {   227,   398 }, {   229,   334 }, {   231,   462 },
    {   233,   302 }, {   235,   430 }, {   237,   366 }, {   239,   494 },
    {   241,   286 }, {   243,   414 }, {   245,   350 }, {   247,   478 },
    {   249,   318 }, {   251,   446 }, {   253,   382 }, {   255,   510 },
    {   259,   385 }, {   261,   321 }, {   263,   449 }, {   265,   289 },
    {   267,   417 }, {   269,   353 }, {   271,   481 }, {   275,   401 },
    {   277,   337 }, {   279,   465 }, {   281,   305 }, {   283,   433 },
    {   285,   369 }, {   287,   497 }, {   291,   393 }, {   293,   329 },
    {   295,   457 }, {   299,   425 }, {   301,   361 }, {   303,   489 },
    {   307,   409 }, {   309,   345 }, {   311,   473 }, {   315,   441 },
    {   317,   377 }, {   319,   505 }, {   323,   389 }, {   327,   453 },
    {   331,   421 }, {   333,   357 }, {   335,   485 }, {   339,   405 },
    {   343,   469 }, {   347,   437 }, {   349,   373 }, {   351,   501 },
    {   355,   397 }, {   359,   461 }, {   363,   429 }, {   367,   493 },
    {   371,   413 }, {   375,   477 }, {   379,   445 }, {   383,   509 },
    {   391,   451 }, {   395,   419 }, {   399,   483 }, {   407,   467 },
    {   411,   435 }, {   415,   499 }, {   423,   459 }, {   431,   491 },
    {   439,   475 }, {   447,   507 }, {   463,   487 }, {   479,   503 },
};
65 | 
66 | 


--------------------------------------------------------------------------------
/src/ntt_red_asm16.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * NTT for Q=12289, n=16, using the Longa/Naehrig reduction method.
  3 |  */
  4 | 
  5 | #include "ntt_red_asm16.h"
  6 | 
  7 | /*
  8 |  * Input: two arrays a and b in standard order
  9 |  *
 10 |  * Result: 
 11 |  * - the product is stored in array c, in standard order.
 12 |  * - arrays a and b are modified
 13 |  *
 14 |  * The input arrays must contain elements in the range [0, Q-1]
 15 |  * The result is also in that range.
 16 |  */
 17 | void ntt_red16_product1_asm(int32_t *c, int32_t *a, int32_t *b) {
 18 |   shift_array_asm(a, 16); // convert to [-(Q-1)/2, (Q-1)/2]
 19 |   mul_reduce_array16_asm(a, 16, ntt_red16_psi_powers);
 20 |   ntt_red16_ct_std2rev_asm(a);
 21 |   reduce_array_asm(a, 16);
 22 | 
 23 |   shift_array_asm(b, 16);
 24 |   mul_reduce_array16_asm(b, 16, ntt_red16_psi_powers);
 25 |   ntt_red16_ct_std2rev_asm(b);
 26 |   reduce_array_asm(b, 16);
 27 |   
 28 |   // at this point:
 29 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 30 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 31 |   mul_reduce_array_asm(c, 16, a, b); // c[i] = 3 * a[i] * b[i] 
 32 |   reduce_array_twice_asm(c, 16);     // c[i] = 9 * c[i] mod Q
 33 | 
 34 |   // we have: -130 <= c[i] <= 12413
 35 |   intt_red16_ct_rev2std_asm(c);
 36 |   mul_reduce_array16_asm(c, 16, ntt_red16_scaled_inv_psi_powers);
 37 |   reduce_array_twice_asm(c, 16); // c[i] = 9 * c[i] mod Q
 38 |   correct_asm(c, 16);
 39 | }
 40 | 
 41 | void ntt_red16_product2_asm(int32_t *c, int32_t *a, int32_t *b) {
 42 |   shift_array_asm(a, 16); // convert to [-(Q-1)/2, (Q-1)/2]
 43 |   mul_reduce_array16_asm(a, 16, ntt_red16_psi_powers);
 44 |   ntt_red16_gs_std2rev_asm(a);
 45 |   reduce_array_asm(a, 16);
 46 | 
 47 |   shift_array_asm(b, 16);
 48 |   mul_reduce_array16_asm(b, 16, ntt_red16_psi_powers);
 49 |   ntt_red16_gs_std2rev_asm(b);
 50 |   reduce_array_asm(b, 16);
 51 |   
 52 |   // at this point:
 53 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 54 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 55 |   mul_reduce_array_asm(c, 16, a, b); // c[i] = 3 * a[i] * b[i] 
 56 |   reduce_array_twice_asm(c, 16);  // c[i] = 9 * c[i] mod Q
 57 | 
 58 |   // we have: -130 <= c[i] <= 12413
 59 |   intt_red16_ct_rev2std_asm(c);
 60 |   mul_reduce_array16_asm(c, 16, ntt_red16_scaled_inv_psi_powers);
 61 |   reduce_array_twice_asm(c, 16); // c[i] = 9 * c[i] mod Q
 62 |   correct_asm(c, 16);
 63 | }
 64 | 
 65 | void ntt_red16_product3_asm(int32_t *c, int32_t *a, int32_t *b) {
 66 |   shift_array_asm(a, 16); // convert to [-(Q-1)/2, (Q-1)/2]
 67 |   mul_reduce_array16_asm(a, 16, ntt_red16_psi_powers);
 68 |   ntt_red16_ct_std2rev_asm(a);
 69 |   reduce_array_asm(a, 16);
 70 | 
 71 |   shift_array_asm(b, 16);
 72 |   mul_reduce_array16_asm(b, 16, ntt_red16_psi_powers);
 73 |   ntt_red16_ct_std2rev_asm(b);
 74 |   reduce_array_asm(b, 16);
 75 |   
 76 |   // at this point:
 77 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 78 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 79 |   mul_reduce_array_asm(c, 16, a, b); // c[i] = 3 * a[i] * b[i] 
 80 |   reduce_array_twice_asm(c, 16);  // c[i] = 9 * c[i] mod Q
 81 | 
 82 |   // we have: -130 <= c[i] <= 12413
 83 |   intt_red16_gs_rev2std_asm(c);
 84 |   mul_reduce_array16_asm(c, 16, ntt_red16_scaled_inv_psi_powers);
 85 |   reduce_array_twice_asm(c, 16); // c[i] = 9 * c[i] mod Q
 86 |   correct_asm(c, 16);
 87 | }
 88 | 
 89 | void ntt_red16_product4_asm(int32_t *c, int32_t *a, int32_t *b) {
 90 |   shift_array_asm(a, 16); // convert to [-(Q-1)/2, (Q-1)/2]
 91 |   mul_reduce_array16_asm(a, 16, ntt_red16_psi_powers);
 92 |   ntt_red16_gs_std2rev_asm(a);
 93 |   reduce_array_asm(a, 16);
 94 | 
 95 |   shift_array_asm(b, 16);
 96 |   mul_reduce_array16_asm(b, 16, ntt_red16_psi_powers);
 97 |   ntt_red16_gs_std2rev_asm(b);
 98 |   reduce_array_asm(b, 16);
 99 |   
100 |   // at this point:
101 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
102 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
103 |   mul_reduce_array_asm(c, 16, a, b); // c[i] = 3 * a[i] * b[i] 
104 |   reduce_array_twice_asm(c, 16);  // c[i] = 9 * c[i] mod Q
105 | 
106 |   // we have: -130 <= c[i] <= 12413
107 |   intt_red16_gs_rev2std_asm(c);
108 |   mul_reduce_array16_asm(c, 16, ntt_red16_scaled_inv_psi_powers);
109 |   reduce_array_twice_asm(c, 16); // c[i] = 9 * c[i] mod Q
110 |   correct_asm(c, 16);
111 | }
112 | 
113 | void ntt_red16_product5_asm(int32_t *c, int32_t *a, int32_t *b) {
114 |   shift_array_asm(a, 16);
115 |   mulntt_red16_ct_std2rev_asm(a);
116 |   reduce_array_asm(a, 16);
117 | 
118 |   shift_array_asm(b, 16);
119 |   mulntt_red16_ct_std2rev_asm(b);
120 |   reduce_array_asm(b, 16);
121 | 
122 |   mul_reduce_array_asm(c, 16, a, b); // c[i] = 3 * a[i] * b[i] 
123 |   reduce_array_twice_asm(c, 16);  // c[i] = 9 * c[i] mod Q
124 | 
125 |   inttmul_red16_gs_rev2std_asm(c);
126 |   scalar_mul_reduce_array_asm(c, 16, ntt_red16_rescale8);
127 |   reduce_array_twice_asm(c, 16);
128 |   correct_asm(c, 16);
129 | }
130 | 


--------------------------------------------------------------------------------
/src/ntt_red_asm1024.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * NTT for Q=12289, n=1024, using the Longa/Naehrig reduction method.
  3 |  */
  4 | 
  5 | #include "ntt_red_asm1024.h"
  6 | 
  7 | /*
  8 |  * Input: two arrays a and b in standard order
  9 |  *
 10 |  * Result: 
 11 |  * - the product is stored in array c, in standard order.
 12 |  * - arrays a and b are modified
 13 |  *
 14 |  * The input arrays must contain elements in the range [0, Q-1]
 15 |  * The result is also in that range.
 16 |  */
 17 | void ntt_red1024_product1_asm(int32_t *c, int32_t *a, int32_t *b) {
 18 |   mul_reduce_array16_asm(a, 1024, ntt_red1024_psi_powers);
 19 |   ntt_red1024_ct_std2rev_asm(a);
 20 |   reduce_array_asm(a, 1024);
 21 | 
 22 |   mul_reduce_array16_asm(b, 1024, ntt_red1024_psi_powers);
 23 |   ntt_red1024_ct_std2rev_asm(b);
 24 |   reduce_array_asm(b, 1024);
 25 |   
 26 |   // at this point:
 27 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 28 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 29 |   mul_reduce_array_asm(c, 1024, a, b); // c[i] = 3 * a[i] * b[i] 
 30 |   //  reduce_array_twice_asm(c, 1024);  // c[i] = 9 * c[i] mod Q
 31 | 
 32 |   // we have: -130 <= c[i] <= 12413
 33 |   intt_red1024_ct_rev2std_asm(c);
 34 |   mul_reduce_array16_asm(c, 1024, ntt_red1024_scaled_inv_psi_powers_var);
 35 |   reduce_array_twice_asm(c, 1024); // c[i] = 9 * c[i] mod Q
 36 |   correct_asm(c, 1024);
 37 | }
 38 | 
 39 | void ntt_red1024_product2_asm(int32_t *c, int32_t *a, int32_t *b) {
 40 |   //  shift_array(a, 1024); // convert to [-(Q-1)/2, (Q-1)/2]
 41 |   mul_reduce_array16_asm(a, 1024, ntt_red1024_psi_powers);
 42 |   ntt_red1024_gs_std2rev_asm(a);
 43 |   reduce_array_asm(a, 1024);
 44 | 
 45 |   //  shift_array(b, 1024);
 46 |   mul_reduce_array16_asm(b, 1024, ntt_red1024_psi_powers);
 47 |   ntt_red1024_gs_std2rev_asm(b);
 48 |   reduce_array_asm(b, 1024);
 49 |   
 50 |   // at this point:
 51 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 52 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 53 |   mul_reduce_array_asm(c, 1024, a, b); // c[i] = 3 * a[i] * b[i] 
 54 |   reduce_array_twice_asm(c, 1024);  // c[i] = 9 * c[i] mod Q
 55 | 
 56 |   // we have: -130 <= c[i] <= 12413
 57 |   intt_red1024_ct_rev2std_asm(c);
 58 |   mul_reduce_array16_asm(c, 1024, ntt_red1024_scaled_inv_psi_powers);
 59 |   reduce_array_twice_asm(c, 1024); // c[i] = 9 * c[i] mod Q
 60 |   correct_asm(c, 1024);
 61 | }
 62 | 
 63 | void ntt_red1024_product3_asm(int32_t *c, int32_t *a, int32_t *b) { // variant 3: ct_std2rev forward transforms, gs_rev2std inverse transform; the result is written to c
 64 |   // NOTE: the conversion to [-(Q-1)/2, (Q-1)/2] is disabled in this version:  shift_array(a, 1024);
 65 |   mul_reduce_array16_asm(a, 1024, ntt_red1024_psi_powers); // scale a by the psi_powers table (presumably a[i] * psi^i, reduced)
 66 |   ntt_red1024_ct_std2rev_asm(a); // forward NTT of a, in place
 67 |   reduce_array_asm(a, 1024); // reduction pass; the bounds listed below hold after it
 68 | 
 69 |   // Same steps for b (shift_array(b, 1024) is disabled as well)
 70 |   mul_reduce_array16_asm(b, 1024, ntt_red1024_psi_powers);
 71 |   ntt_red1024_ct_std2rev_asm(b);
 72 |   reduce_array_asm(b, 1024);
 73 |   
 74 |   // at this point:
 75 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 76 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 77 |   mul_reduce_array_asm(c, 1024, a, b); // element-wise product: c[i] = 3 * a[i] * b[i]
 78 |   reduce_array_twice_asm(c, 1024);  // c[i] = 9 * c[i] mod Q
 79 | 
 80 |   // we have: -130 <= c[i] <= 12413
 81 |   intt_red1024_gs_rev2std_asm(c); // inverse NTT of c, in place (per its name)
 82 |   mul_reduce_array16_asm(c, 1024, ntt_red1024_scaled_inv_psi_powers); // multiply by the scaled inverse psi powers (presumably undoes the psi scaling; confirm against the table)
 83 |   reduce_array_twice_asm(c, 1024); // c[i] = 9 * c[i] mod Q
 84 |   correct_asm(c, 1024); // final correction step (presumably maps c[i] into [0, Q-1]; confirm in the helper)
 85 | }
 86 | 
 87 | void ntt_red1024_product4_asm(int32_t *c, int32_t *a, int32_t *b) { // variant 4: gs_std2rev forward transforms, gs_rev2std inverse transform; the result is written to c
 88 |   // NOTE: the conversion to [-(Q-1)/2, (Q-1)/2] is disabled in this version:  shift_array(a, 1024);
 89 |   mul_reduce_array16_asm(a, 1024, ntt_red1024_psi_powers); // scale a by the psi_powers table (presumably a[i] * psi^i, reduced)
 90 |   ntt_red1024_gs_std2rev_asm(a); // forward NTT of a, in place
 91 |   reduce_array_asm(a, 1024); // reduction pass; the bounds listed below hold after it
 92 | 
 93 |   // Same steps for b (shift_array(b, 1024) is disabled as well)
 94 |   mul_reduce_array16_asm(b, 1024, ntt_red1024_psi_powers);
 95 |   ntt_red1024_gs_std2rev_asm(b);
 96 |   reduce_array_asm(b, 1024);
 97 |   
 98 |   // at this point:
 99 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
100 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
101 |   mul_reduce_array_asm(c, 1024, a, b); // element-wise product: c[i] = 3 * a[i] * b[i]
102 |   reduce_array_twice_asm(c, 1024);  // c[i] = 9 * c[i] mod Q
103 | 
104 |   // we have: -130 <= c[i] <= 12413
105 |   intt_red1024_gs_rev2std_asm(c); // inverse NTT of c, in place (per its name)
106 |   mul_reduce_array16_asm(c, 1024, ntt_red1024_scaled_inv_psi_powers); // multiply by the scaled inverse psi powers (presumably undoes the psi scaling; confirm against the table)
107 |   reduce_array_twice_asm(c, 1024); // c[i] = 9 * c[i] mod Q
108 |   correct_asm(c, 1024); // final correction step (presumably maps c[i] into [0, Q-1]; confirm in the helper)
109 | }
110 | 
111 | void ntt_red1024_product5_asm(int32_t *c, int32_t *a, int32_t *b) { // variant 5: uses the combined mulntt / inttmul transforms; the result is written to c
112 |   // NOTE: the range conversion is disabled in this version:  shift_array(a, 1024);
113 |   mulntt_red1024_ct_std2rev_asm(a); // forward transform of a, in place (presumably fuses the psi-power multiplication that the other variants do as a separate call)
114 |   reduce_array_asm(a, 1024); // reduction pass over a
115 | 
116 |   // Same steps for b (shift_array(b, 1024) is disabled as well)
117 |   mulntt_red1024_ct_std2rev_asm(b);
118 |   reduce_array_asm(b, 1024);
119 | 
120 |   mul_reduce_array_asm(c, 1024, a, b); // element-wise product: c[i] = 3 * a[i] * b[i]
121 |   reduce_array_twice_asm(c, 1024);  // c[i] = 9 * c[i] mod Q
122 | 
123 |   inttmul_red1024_gs_rev2std_asm(c); // inverse transform of c, in place (presumably fuses the inverse psi-power multiplication)
124 |   scalar_mul_reduce_array_asm(c, 1024, ntt_red1024_rescale8); // multiply every element by the single factor ntt_red1024_rescale8, with reduction (per the helper's name)
125 |   reduce_array_twice_asm(c, 1024); // two further reduction passes
126 |   correct_asm(c, 1024); // final correction step (presumably maps c[i] into [0, Q-1]; confirm in the helper)
127 | }
128 | 


--------------------------------------------------------------------------------
/src/ntt_red_asm256.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * NTT for Q=12289, n=256, using the Longa/Naehrig reduction method.
  3 |  */
  4 | 
  5 | #include "ntt_red_asm256.h"
  6 | 
  7 | /*
  8 |  * Input: two arrays a and b in standard order
  9 |  *
 10 |  * Result: 
 11 |  * - the product is stored in array c, in standard order.
 12 |  * - arrays a and b are modified
 13 |  *
 14 |  * The input arrays must contain elements in the range [0, Q-1]
 15 |  * The result is also in that range.
 16 |  */
 17 | void ntt_red256_product1_asm(int32_t *c, int32_t *a, int32_t *b) { // variant 1: ct_std2rev forward transforms, ct_rev2std inverse transform
 18 |   shift_array_asm(a, 256); // convert to [-(Q-1)/2, (Q-1)/2]
 19 |   mul_reduce_array16_asm(a, 256, ntt_red256_psi_powers); // scale a by the psi_powers table (presumably a[i] * psi^i, reduced)
 20 |   ntt_red256_ct_std2rev_asm(a); // forward NTT of a, in place
 21 |   reduce_array_asm(a, 256); // reduction pass; the bounds listed below hold after it
 22 | 
 23 |   shift_array_asm(b, 256); // same four steps for b
 24 |   mul_reduce_array16_asm(b, 256, ntt_red256_psi_powers);
 25 |   ntt_red256_ct_std2rev_asm(b);
 26 |   reduce_array_asm(b, 256);
 27 |   
 28 |   // at this point:
 29 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 30 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 31 |   mul_reduce_array_asm(c, 256, a, b); // element-wise product: c[i] = 3 * a[i] * b[i]
 32 |   reduce_array_twice_asm(c, 256);  // c[i] = 9 * c[i] mod Q
 33 | 
 34 |   // we have: -130 <= c[i] <= 12413
 35 |   intt_red256_ct_rev2std_asm(c); // inverse NTT of c, in place (per its name)
 36 |   mul_reduce_array16_asm(c, 256, ntt_red256_scaled_inv_psi_powers); // multiply by the scaled inverse psi powers (presumably undoes the psi scaling; confirm against the table)
 37 |   reduce_array_twice_asm(c, 256); // c[i] = 9 * c[i] mod Q
 38 |   correct_asm(c, 256); // final correction step (presumably what brings c[i] into [0, Q-1]; confirm in the helper)
 39 | }
 40 | 
 41 | void ntt_red256_product2_asm(int32_t *c, int32_t *a, int32_t *b) { // variant 2: gs_std2rev forward transforms, ct_rev2std inverse transform; the result is written to c
 42 |   shift_array_asm(a, 256); // convert to [-(Q-1)/2, (Q-1)/2]
 43 |   mul_reduce_array16_asm(a, 256, ntt_red256_psi_powers); // scale a by the psi_powers table (presumably a[i] * psi^i, reduced)
 44 |   ntt_red256_gs_std2rev_asm(a); // forward NTT of a, in place
 45 |   reduce_array_asm(a, 256); // reduction pass; the bounds listed below hold after it
 46 | 
 47 |   shift_array_asm(b, 256); // same four steps for b
 48 |   mul_reduce_array16_asm(b, 256, ntt_red256_psi_powers);
 49 |   ntt_red256_gs_std2rev_asm(b);
 50 |   reduce_array_asm(b, 256);
 51 |   
 52 |   // at this point:
 53 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 54 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 55 |   mul_reduce_array_asm(c, 256, a, b); // element-wise product: c[i] = 3 * a[i] * b[i]
 56 |   reduce_array_twice_asm(c, 256);  // c[i] = 9 * c[i] mod Q
 57 | 
 58 |   // we have: -130 <= c[i] <= 12413
 59 |   intt_red256_ct_rev2std_asm(c); // inverse NTT of c, in place (per its name)
 60 |   mul_reduce_array16_asm(c, 256, ntt_red256_scaled_inv_psi_powers); // multiply by the scaled inverse psi powers (presumably undoes the psi scaling; confirm against the table)
 61 |   reduce_array_twice_asm(c, 256); // c[i] = 9 * c[i] mod Q
 62 |   correct_asm(c, 256); // final correction step (presumably maps c[i] into [0, Q-1]; confirm in the helper)
 63 | }
 64 | 
 65 | void ntt_red256_product3_asm(int32_t *c, int32_t *a, int32_t *b) { // variant 3: ct_std2rev forward transforms, gs_rev2std inverse transform; the result is written to c
 66 |   shift_array_asm(a, 256); // convert to [-(Q-1)/2, (Q-1)/2]
 67 |   mul_reduce_array16_asm(a, 256, ntt_red256_psi_powers); // scale a by the psi_powers table (presumably a[i] * psi^i, reduced)
 68 |   ntt_red256_ct_std2rev_asm(a); // forward NTT of a, in place
 69 |   reduce_array_asm(a, 256); // reduction pass; the bounds listed below hold after it
 70 | 
 71 |   shift_array_asm(b, 256); // same four steps for b
 72 |   mul_reduce_array16_asm(b, 256, ntt_red256_psi_powers);
 73 |   ntt_red256_ct_std2rev_asm(b);
 74 |   reduce_array_asm(b, 256);
 75 |   
 76 |   // at this point:
 77 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 78 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 79 |   mul_reduce_array_asm(c, 256, a, b); // element-wise product: c[i] = 3 * a[i] * b[i]
 80 |   reduce_array_twice_asm(c, 256);  // c[i] = 9 * c[i] mod Q
 81 | 
 82 |   // we have: -130 <= c[i] <= 12413
 83 |   intt_red256_gs_rev2std_asm(c); // inverse NTT of c, in place (per its name)
 84 |   mul_reduce_array16_asm(c, 256, ntt_red256_scaled_inv_psi_powers); // multiply by the scaled inverse psi powers (presumably undoes the psi scaling; confirm against the table)
 85 |   reduce_array_twice_asm(c, 256); // c[i] = 9 * c[i] mod Q
 86 |   correct_asm(c, 256); // final correction step (presumably maps c[i] into [0, Q-1]; confirm in the helper)
 87 | }
 88 | 
 89 | void ntt_red256_product4_asm(int32_t *c, int32_t *a, int32_t *b) { // variant 4: gs_std2rev forward transforms, gs_rev2std inverse transform; the result is written to c
 90 |   shift_array_asm(a, 256); // convert to [-(Q-1)/2, (Q-1)/2]
 91 |   mul_reduce_array16_asm(a, 256, ntt_red256_psi_powers); // scale a by the psi_powers table (presumably a[i] * psi^i, reduced)
 92 |   ntt_red256_gs_std2rev_asm(a); // forward NTT of a, in place
 93 |   reduce_array_asm(a, 256); // reduction pass; the bounds listed below hold after it
 94 | 
 95 |   shift_array_asm(b, 256); // same four steps for b
 96 |   mul_reduce_array16_asm(b, 256, ntt_red256_psi_powers);
 97 |   ntt_red256_gs_std2rev_asm(b);
 98 |   reduce_array_asm(b, 256);
 99 |   
100 |   // at this point:
101 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
102 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
103 |   mul_reduce_array_asm(c, 256, a, b); // element-wise product: c[i] = 3 * a[i] * b[i]
104 |   reduce_array_twice_asm(c, 256);  // c[i] = 9 * c[i] mod Q
105 | 
106 |   // we have: -130 <= c[i] <= 12413
107 |   intt_red256_gs_rev2std_asm(c); // inverse NTT of c, in place (per its name)
108 |   mul_reduce_array16_asm(c, 256, ntt_red256_scaled_inv_psi_powers); // multiply by the scaled inverse psi powers (presumably undoes the psi scaling; confirm against the table)
109 |   reduce_array_twice_asm(c, 256); // c[i] = 9 * c[i] mod Q
110 |   correct_asm(c, 256); // final correction step (presumably maps c[i] into [0, Q-1]; confirm in the helper)
111 | }
112 | 
113 | void ntt_red256_product5_asm(int32_t *c, int32_t *a, int32_t *b) { // variant 5: uses the combined mulntt / inttmul transforms; the result is written to c
114 |   shift_array_asm(a, 256); // same range conversion as in the other variants of this file
115 |   mulntt_red256_ct_std2rev_asm(a); // forward transform of a, in place (presumably fuses the psi-power multiplication that the other variants do as a separate call)
116 |   reduce_array_asm(a, 256); // reduction pass over a
117 | 
118 |   shift_array_asm(b, 256); // same three steps for b
119 |   mulntt_red256_ct_std2rev_asm(b);
120 |   reduce_array_asm(b, 256);
121 | 
122 |   mul_reduce_array_asm(c, 256, a, b); // element-wise product: c[i] = 3 * a[i] * b[i]
123 |   reduce_array_twice_asm(c, 256);  // c[i] = 9 * c[i] mod Q
124 | 
125 |   inttmul_red256_gs_rev2std_asm(c); // inverse transform of c, in place (presumably fuses the inverse psi-power multiplication)
126 |   scalar_mul_reduce_array_asm(c, 256, ntt_red256_rescale8); // multiply every element by the single factor ntt_red256_rescale8, with reduction (per the helper's name)
127 |   reduce_array_twice_asm(c, 256); // two further reduction passes
128 |   correct_asm(c, 256); // final correction step (presumably maps c[i] into [0, Q-1]; confirm in the helper)
129 | }
130 | 


--------------------------------------------------------------------------------
/src/ntt_red_asm512.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * NTT for Q=12289, n=512, using the Longa/Naehrig reduction method.
  3 |  */
  4 | 
  5 | #include "ntt_red_asm512.h"
  6 | 
  7 | /*
  8 |  * Input: two arrays a and b in standard order
  9 |  *
 10 |  * Result: 
 11 |  * - the product is stored in array c, in standard order.
 12 |  * - arrays a and b are modified
 13 |  *
 14 |  * The input arrays must contain elements in the range [0, Q-1]
 15 |  * The result is also in that range.
 16 |  */
 17 | void ntt_red512_product1_asm(int32_t *c, int32_t *a, int32_t *b) { // variant 1: ct_std2rev forward transforms, ct_rev2std inverse transform
 18 |   shift_array_asm(a, 512); // convert to [-(Q-1)/2, (Q-1)/2]
 19 |   mul_reduce_array16_asm(a, 512, ntt_red512_psi_powers); // scale a by the psi_powers table (presumably a[i] * psi^i, reduced)
 20 |   ntt_red512_ct_std2rev_asm(a); // forward NTT of a, in place
 21 |   reduce_array_asm(a, 512); // reduction pass; the bounds listed below hold after it
 22 | 
 23 |   shift_array_asm(b, 512); // same four steps for b
 24 |   mul_reduce_array16_asm(b, 512, ntt_red512_psi_powers);
 25 |   ntt_red512_ct_std2rev_asm(b);
 26 |   reduce_array_asm(b, 512);
 27 |   
 28 |   // at this point:
 29 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 30 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 31 |   mul_reduce_array_asm(c, 512, a, b); // element-wise product: c[i] = 3 * a[i] * b[i]
 32 |   reduce_array_twice_asm(c, 512);  // c[i] = 9 * c[i] mod Q
 33 | 
 34 |   // we have: -130 <= c[i] <= 12413
 35 |   intt_red512_ct_rev2std_asm(c); // inverse NTT of c, in place (per its name)
 36 |   mul_reduce_array16_asm(c, 512, ntt_red512_scaled_inv_psi_powers); // multiply by the scaled inverse psi powers (presumably undoes the psi scaling; confirm against the table)
 37 |   reduce_array_twice_asm(c, 512); // c[i] = 9 * c[i] mod Q
 38 |   correct_asm(c, 512); // final correction step (presumably what brings c[i] into [0, Q-1]; confirm in the helper)
 39 | }
 40 | 
 41 | void ntt_red512_product2_asm(int32_t *c, int32_t *a, int32_t *b) { // variant 2: gs_std2rev forward transforms, ct_rev2std inverse transform; the result is written to c
 42 |   shift_array_asm(a, 512); // convert to [-(Q-1)/2, (Q-1)/2]
 43 |   mul_reduce_array16_asm(a, 512, ntt_red512_psi_powers); // scale a by the psi_powers table (presumably a[i] * psi^i, reduced)
 44 |   ntt_red512_gs_std2rev_asm(a); // forward NTT of a, in place
 45 |   reduce_array_asm(a, 512); // reduction pass; the bounds listed below hold after it
 46 | 
 47 |   shift_array_asm(b, 512); // same four steps for b
 48 |   mul_reduce_array16_asm(b, 512, ntt_red512_psi_powers);
 49 |   ntt_red512_gs_std2rev_asm(b);
 50 |   reduce_array_asm(b, 512);
 51 |   
 52 |   // at this point:
 53 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 54 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 55 |   mul_reduce_array_asm(c, 512, a, b); // element-wise product: c[i] = 3 * a[i] * b[i]
 56 |   reduce_array_twice_asm(c, 512);  // c[i] = 9 * c[i] mod Q
 57 | 
 58 |   // we have: -130 <= c[i] <= 12413
 59 |   intt_red512_ct_rev2std_asm(c); // inverse NTT of c, in place (per its name)
 60 |   mul_reduce_array16_asm(c, 512, ntt_red512_scaled_inv_psi_powers); // multiply by the scaled inverse psi powers (presumably undoes the psi scaling; confirm against the table)
 61 |   reduce_array_twice_asm(c, 512); // c[i] = 9 * c[i] mod Q
 62 |   correct_asm(c, 512); // final correction step (presumably maps c[i] into [0, Q-1]; confirm in the helper)
 63 | }
 64 | 
 65 | void ntt_red512_product3_asm(int32_t *c, int32_t *a, int32_t *b) { // variant 3: ct_std2rev forward transforms, gs_rev2std inverse transform; the result is written to c
 66 |   shift_array_asm(a, 512); // convert to [-(Q-1)/2, (Q-1)/2]
 67 |   mul_reduce_array16_asm(a, 512, ntt_red512_psi_powers); // scale a by the psi_powers table (presumably a[i] * psi^i, reduced)
 68 |   ntt_red512_ct_std2rev_asm(a); // forward NTT of a, in place
 69 |   reduce_array_asm(a, 512); // reduction pass; the bounds listed below hold after it
 70 | 
 71 |   shift_array_asm(b, 512); // same four steps for b
 72 |   mul_reduce_array16_asm(b, 512, ntt_red512_psi_powers);
 73 |   ntt_red512_ct_std2rev_asm(b);
 74 |   reduce_array_asm(b, 512);
 75 |   
 76 |   // at this point:
 77 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
 78 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
 79 |   mul_reduce_array_asm(c, 512, a, b); // element-wise product: c[i] = 3 * a[i] * b[i]
 80 |   reduce_array_twice_asm(c, 512);  // c[i] = 9 * c[i] mod Q
 81 | 
 82 |   // we have: -130 <= c[i] <= 12413
 83 |   intt_red512_gs_rev2std_asm(c); // inverse NTT of c, in place (per its name)
 84 |   mul_reduce_array16_asm(c, 512, ntt_red512_scaled_inv_psi_powers); // multiply by the scaled inverse psi powers (presumably undoes the psi scaling; confirm against the table)
 85 |   reduce_array_twice_asm(c, 512); // c[i] = 9 * c[i] mod Q
 86 |   correct_asm(c, 512); // final correction step (presumably maps c[i] into [0, Q-1]; confirm in the helper)
 87 | }
 88 | 
 89 | void ntt_red512_product4_asm(int32_t *c, int32_t *a, int32_t *b) { // variant 4: gs_std2rev forward transforms, gs_rev2std inverse transform; the result is written to c
 90 |   shift_array_asm(a, 512); // convert to [-(Q-1)/2, (Q-1)/2]
 91 |   mul_reduce_array16_asm(a, 512, ntt_red512_psi_powers); // scale a by the psi_powers table (presumably a[i] * psi^i, reduced)
 92 |   ntt_red512_gs_std2rev_asm(a); // forward NTT of a, in place
 93 |   reduce_array_asm(a, 512); // reduction pass; the bounds listed below hold after it
 94 | 
 95 |   shift_array_asm(b, 512); // same four steps for b
 96 |   mul_reduce_array16_asm(b, 512, ntt_red512_psi_powers);
 97 |   ntt_red512_gs_std2rev_asm(b);
 98 |   reduce_array_asm(b, 512);
 99 |   
100 |   // at this point:
101 |   // a = NTT(a) * 3, -524287 <= a[i] <= 536573
102 |   // b = NTT(b) * 3, -524287 <= b[i] <= 536573
103 |   mul_reduce_array_asm(c, 512, a, b); // element-wise product: c[i] = 3 * a[i] * b[i]
104 |   reduce_array_twice_asm(c, 512);  // c[i] = 9 * c[i] mod Q
105 | 
106 |   // we have: -130 <= c[i] <= 12413
107 |   intt_red512_gs_rev2std_asm(c); // inverse NTT of c, in place (per its name)
108 |   mul_reduce_array16_asm(c, 512, ntt_red512_scaled_inv_psi_powers); // multiply by the scaled inverse psi powers (presumably undoes the psi scaling; confirm against the table)
109 |   reduce_array_twice_asm(c, 512); // c[i] = 9 * c[i] mod Q
110 |   correct_asm(c, 512); // final correction step (presumably maps c[i] into [0, Q-1]; confirm in the helper)
111 | }
112 | 
113 | void ntt_red512_product5_asm(int32_t *c, int32_t *a, int32_t *b) { // variant 5: uses the combined mulntt / inttmul transforms; the result is written to c
114 |   shift_array_asm(a, 512); // same range conversion as in the other variants of this file
115 |   mulntt_red512_ct_std2rev_asm(a); // forward transform of a, in place (presumably fuses the psi-power multiplication that the other variants do as a separate call)
116 |   reduce_array_asm(a, 512); // reduction pass over a
117 | 
118 |   shift_array_asm(b, 512); // same three steps for b
119 |   mulntt_red512_ct_std2rev_asm(b);
120 |   reduce_array_asm(b, 512);
121 | 
122 |   mul_reduce_array_asm(c, 512, a, b); // element-wise product: c[i] = 3 * a[i] * b[i]
123 |   reduce_array_twice_asm(c, 512);  // c[i] = 9 * c[i] mod Q
124 | 
125 |   inttmul_red512_gs_rev2std_asm(c); // inverse transform of c, in place (presumably fuses the inverse psi-power multiplication)
126 |   scalar_mul_reduce_array_asm(c, 512, ntt_red512_rescale8); // multiply every element by the single factor ntt_red512_rescale8, with reduction (per the helper's name)
127 |   reduce_array_twice_asm(c, 512); // two further reduction passes
128 |   correct_asm(c, 512); // final correction step (presumably maps c[i] into [0, Q-1]; confirm in the helper)
129 | }
130 | 


--------------------------------------------------------------------------------
/data/primitive-roots-1024.txt:
--------------------------------------------------------------------------------
  1 | #
  2 | # These are all the primitive n-th roots of unity in Z_q
  3 | # when q=12289 and n=1024.
  4 | #
  5 | 49
  6 | 52
  7 | 56
  8 | 58
  9 | 64
 10 | 142
 11 | 147
 12 | 151
 13 | 156
 14 | 168
 15 | 174
 16 | 192
 17 | 218
 18 | 241
 19 | 295
 20 | 316
 21 | 325
 22 | 347
 23 | 350
 24 | 382
 25 | 400
 26 | 418
 27 | 421
 28 | 426
 29 | 431
 30 | 441
 31 | 453
 32 | 468
 33 | 504
 34 | 522
 35 | 576
 36 | 605
 37 | 652
 38 | 654
 39 | 677
 40 | 683
 41 | 709
 42 | 723
 43 | 787
 44 | 835
 45 | 885
 46 | 922
 47 | 948
 48 | 973
 49 | 975
 50 | 1003
 51 | 1010
 52 | 1018
 53 | 1041
 54 | 1050
 55 | 1058
 56 | 1105
 57 | 1112
 58 | 1146
 59 | 1159
 60 | 1190
 61 | 1200
 62 | 1254
 63 | 1263
 64 | 1278
 65 | 1293
 66 | 1319
 67 | 1321
 68 | 1323
 69 | 1359
 70 | 1360
 71 | 1404
 72 | 1483
 73 | 1489
 74 | 1512
 75 | 1566
 76 | 1579
 77 | 1583
 78 | 1594
 79 | 1693
 80 | 1702
 81 | 1728
 82 | 1747
 83 | 1805
 84 | 1815
 85 | 1843
 86 | 1858
 87 | 1922
 88 | 1954
 89 | 1956
 90 | 1958
 91 | 1962
 92 | 1973
 93 | 1975
 94 | 1987
 95 | 2031
 96 | 2033
 97 | 2049
 98 | 2051
 99 | 2057
100 | 2078
101 | 2127
102 | 2169
103 | 2281
104 | 2302
105 | 2344
106 | 2361
107 | 2447
108 | 2459
109 | 2500
110 | 2503
111 | 2505
112 | 2555
113 | 2655
114 | 2683
115 | 2692
116 | 2738
117 | 2766
118 | 2767
119 | 2839
120 | 2844
121 | 2882
122 | 2908
123 | 2919
124 | 2920
125 | 2925
126 | 2948
127 | 3009
128 | 3029
129 | 3030
130 | 3054
131 | 3123
132 | 3127
133 | 3150
134 | 3174
135 | 3199
136 | 3202
137 | 3262
138 | 3263
139 | 3315
140 | 3329
141 | 3336
142 | 3434
143 | 3438
144 | 3445
145 | 3477
146 | 3482
147 | 3514
148 | 3529
149 | 3532
150 | 3565
151 | 3570
152 | 3600
153 | 3602
154 | 3643
155 | 3656
156 | 3710
157 | 3728
158 | 3757
159 | 3762
160 | 3772
161 | 3789
162 | 3818
163 | 3834
164 | 3860
165 | 3879
166 | 3956
167 | 3957
168 | 3963
169 | 3969
170 | 3988
171 | 3991
172 | 3998
173 | 4016
174 | 4046
175 | 4049
176 | 4075
177 | 4077
178 | 4079
179 | 4080
180 | 4115
181 | 4169
182 | 4212
183 | 4213
184 | 4240
185 | 4298
186 | 4322
187 | 4324
188 | 4433
189 | 4449
190 | 4467
191 | 4493
192 | 4536
193 | 4624
194 | 4698
195 | 4737
196 | 4749
197 | 4754
198 | 4774
199 | 4780
200 | 4782
201 | 4789
202 | 4912
203 | 4916
204 | 4948
205 | 5009
206 | 5057
207 | 5079
208 | 5106
209 | 5184
210 | 5206
211 | 5241
212 | 5257
213 | 5297
214 | 5315
215 | 5333
216 | 5339
217 | 5369
218 | 5383
219 | 5415
220 | 5429
221 | 5435
222 | 5445
223 | 5446
224 | 5456
225 | 5468
226 | 5486
227 | 5529
228 | 5537
229 | 5574
230 | 5594
231 | 5681
232 | 5735
233 | 5766
234 | 5782
235 | 5862
236 | 5868
237 | 5874
238 | 5876
239 | 5886
240 | 5906
241 | 5908
242 | 5915
243 | 5919
244 | 5925
245 | 5942
246 | 5961
247 | 5990
248 | 6008
249 | 6055
250 | 6065
251 | 6068
252 | 6077
253 | 6093
254 | 6099
255 | 6118
256 | 6119
257 | 6122
258 | 6136
259 | 6137
260 | 6142
261 | 6147
262 | 6152
263 | 6153
264 | 6167
265 | 6170
266 | 6171
267 | 6190
268 | 6196
269 | 6212
270 | 6221
271 | 6224
272 | 6234
273 | 6281
274 | 6299
275 | 6328
276 | 6347
277 | 6364
278 | 6370
279 | 6374
280 | 6381
281 | 6383
282 | 6403
283 | 6413
284 | 6415
285 | 6421
286 | 6427
287 | 6507
288 | 6523
289 | 6554
290 | 6608
291 | 6695
292 | 6715
293 | 6752
294 | 6760
295 | 6803
296 | 6821
297 | 6833
298 | 6843
299 | 6844
300 | 6854
301 | 6860
302 | 6874
303 | 6906
304 | 6920
305 | 6950
306 | 6956
307 | 6974
308 | 6992
309 | 7032
310 | 7048
311 | 7083
312 | 7105
313 | 7183
314 | 7210
315 | 7232
316 | 7280
317 | 7341
318 | 7373
319 | 7377
320 | 7500
321 | 7507
322 | 7509
323 | 7515
324 | 7535
325 | 7540
326 | 7552
327 | 7591
328 | 7665
329 | 7753
330 | 7796
331 | 7822
332 | 7840
333 | 7856
334 | 7965
335 | 7967
336 | 7991
337 | 8049
338 | 8076
339 | 8077
340 | 8120
341 | 8174
342 | 8209
343 | 8210
344 | 8212
345 | 8214
346 | 8240
347 | 8243
348 | 8273
349 | 8291
350 | 8298
351 | 8301
352 | 8320
353 | 8326
354 | 8332
355 | 8333
356 | 8410
357 | 8429
358 | 8455
359 | 8471
360 | 8500
361 | 8517
362 | 8527
363 | 8532
364 | 8561
365 | 8579
366 | 8633
367 | 8646
368 | 8687
369 | 8689
370 | 8719
371 | 8724
372 | 8757
373 | 8760
374 | 8775
375 | 8807
376 | 8812
377 | 8844
378 | 8851
379 | 8855
380 | 8953
381 | 8960
382 | 8974
383 | 9026
384 | 9027
385 | 9087
386 | 9090
387 | 9115
388 | 9139
389 | 9162
390 | 9166
391 | 9235
392 | 9259
393 | 9260
394 | 9280
395 | 9341
396 | 9364
397 | 9369
398 | 9370
399 | 9381
400 | 9407
401 | 9445
402 | 9450
403 | 9522
404 | 9523
405 | 9551
406 | 9597
407 | 9606
408 | 9634
409 | 9734
410 | 9784
411 | 9786
412 | 9789
413 | 9830
414 | 9842
415 | 9928
416 | 9945
417 | 9987
418 | 10008
419 | 10120
420 | 10162
421 | 10211
422 | 10232
423 | 10238
424 | 10240
425 | 10256
426 | 10258
427 | 10302
428 | 10314
429 | 10316
430 | 10327
431 | 10331
432 | 10333
433 | 10335
434 | 10367
435 | 10431
436 | 10446
437 | 10474
438 | 10484
439 | 10542
440 | 10561
441 | 10587
442 | 10596
443 | 10695
444 | 10706
445 | 10710
446 | 10723
447 | 10777
448 | 10800
449 | 10806
450 | 10885
451 | 10929
452 | 10930
453 | 10966
454 | 10968
455 | 10970
456 | 10996
457 | 11011
458 | 11026
459 | 11035
460 | 11089
461 | 11099
462 | 11130
463 | 11143
464 | 11177
465 | 11184
466 | 11231
467 | 11239
468 | 11248
469 | 11271
470 | 11279
471 | 11286
472 | 11314
473 | 11316
474 | 11341
475 | 11367
476 | 11404
477 | 11454
478 | 11502
479 | 11566
480 | 11580
481 | 11606
482 | 11612
483 | 11635
484 | 11637
485 | 11684
486 | 11713
487 | 11767
488 | 11785
489 | 11821
490 | 11836
491 | 11848
492 | 11858
493 | 11863
494 | 11868
495 | 11871
496 | 11889
497 | 11907
498 | 11939
499 | 11942
500 | 11964
501 | 11973
502 | 11994
503 | 12048
504 | 12071
505 | 12097
506 | 12115
507 | 12121
508 | 12133
509 | 12138
510 | 12142
511 | 12147
512 | 12225
513 | 12231
514 | 12233
515 | 12237
516 | 12240
517 | 
518 | 


--------------------------------------------------------------------------------
/src/test_red.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <inttypes.h>
  3 | #include <stdbool.h>
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | 
  7 | /*
  8 |  * Modulus and related constants: Q = 12289 = K * 2^12 + 1 with K = 3.
  9 |  * MASK = 2^12 - 1 selects the low 12 bits of a value.
 10 |  */
 11 | #define Q 12289
 12 | #define K 3
 13 | #define MASK 4095 // not referenced below: red() spells out 4095 directly
 14 | 
 15 | #define MAXABS 715827882 // INT32_MAX / 3 rounded down; only used by the disabled test() helper
 16 | #define MAXABS2 (((int64_t) 1) << 43) // 2^43; not referenced elsewhere in this file
 17 | // Reduction step: with x = high * 2^12 + low, returns 3 * low - high, which is congruent to 3 * x modulo Q.
 18 | static int64_t red(int64_t x) {
 19 |   const int64_t low = x & 4095, high = x >> 12;
 20 |   return 3 * low - high;
 21 | }
 22 | #if 0
 23 | static void test(int64_t x, int64_t w) { // debug helper: reports red(x * w) falling outside the int32_t range
 24 |   const int64_t reduced = red(x * w);
 25 |   if (INT32_MIN <= reduced && reduced <= INT32_MAX) {
 26 |     return; // fits in 32 bits: nothing to report
 27 |   }
 28 |   printf("  32bit overflow detected for x = %"PRId64", w = %"PRId64"\n", x, w);
 29 |   printf("  red(x * w) = %"PRId64"\n", reduced);
 30 |   if (x < -MAXABS || MAXABS < x) {
 31 |     return; // x is outside [-MAXABS, MAXABS]: report only
 32 |   }
 33 |   printf("---> x is within limits\n");
 34 |   fflush(stdout);
 35 |   exit(1);
 36 | }
 37 | 
 38 | // Returns true, after printing x and red(x), when red(x) does not fit in an int32_t; false otherwise.
 39 | static bool test2(int64_t x) {
 40 |   const int64_t reduced = red(x);
 41 |   const bool overflow = reduced > INT32_MAX || reduced < INT32_MIN;
 42 | 
 43 |   if (overflow) {
 44 |     printf("  32bit overflow detected for x = %"PRId64"\n", x);
 45 |     printf("  red(x) = %"PRId64"\n", reduced);
 46 |   }
 47 | 
 48 |   return overflow;
 49 | }
 50 | #endif
 51 | 
 52 | static int64_t lower_bound, upper_bound; // written by find_lower_bound() / find_upper_bound(), read by main()
 53 | 
 54 | static bool test_overflow(int64_t x) { // true when red(x) lies outside [INT32_MIN, INT32_MAX]
 55 |   const int64_t reduced = red(x);
 56 |   const bool fits = INT32_MIN <= reduced && reduced <= INT32_MAX;
 57 |   return !fits;
 58 | }
 59 | 
 60 | // Returns true when red() overflows 32 bits for at least one value in [4096*x, 4096*x + 4095].
 61 | static bool test_overflow2(int64_t x) {
 62 |   const int64_t base = 4096 * x;
 63 |   int64_t offset = 0;
 64 | 
 65 |   // advance until an overflowing value is hit or all 4096 values have been checked
 66 |   while (offset < 4096 && !test_overflow(base + offset)) {
 67 |     offset++;
 68 |   }
 69 |   // stopping before offset 4096 means an overflow was found
 70 |   return offset < 4096;
 71 | }
 72 | 
 73 | static void find_lower_bound(void) {
 74 |   int64_t bad = INT64_MIN/4096; // block index treated as overflowing (the starting value itself is never tested)
 75 |   int64_t good = -1;            // block index treated as safe (the starting value itself is never tested)
 76 | 
 77 |   // Bisect on block indices (block k covers [4096*k, 4096*k + 4095]) until bad and good are adjacent.
 78 |   do {
 79 |     const int64_t mid = (bad + good)/2;
 80 |     assert(bad < mid && mid <= good);
 81 |     if (test_overflow2(mid)) {
 82 |       bad = mid;
 83 |     } else {
 84 |       good = mid;
 85 |     }
 86 |   } while (bad + 1 < good);
 87 | 
 88 |   lower_bound = 4096 * good; // first value of the lowest block found safe
 89 |   printf("Safe lower bound = %"PRId64"\n", lower_bound);
 90 | }
 91 | 
 92 | static void find_upper_bound(void) {
 93 |   int64_t good = 0;              // block index treated as safe (the starting value itself is never tested)
 94 |   int64_t bad = INT64_MAX/4096;  // block index treated as overflowing (the starting value itself is never tested)
 95 | 
 96 |   // Bisect on block indices (block k covers [4096*k, 4096*k + 4095]) until good and bad are adjacent.
 97 |   do {
 98 |     const int64_t mid = (good + bad)/2;
 99 |     assert(good <= mid && mid < bad);
100 |     if (test_overflow2(mid)) {
101 |       bad = mid;
102 |     } else {
103 |       good = mid;
104 |     }
105 |   } while (good + 1 < bad);
106 | 
107 |   upper_bound = 4096 * good + 4095; // last value of the highest block found safe
108 |   printf("Safe upper bound = %"PRId64"\n", upper_bound);
109 | }
110 | 
111 | static void iter_bound_abs(void) {
112 |   // Prints 20 steps of a bound recurrence starting at Q-1, first for the CT rule, then for the GS rule.
113 |   int64_t bound;
114 | 
115 |   // CT rule: b -> (K+1) * b + Q - K
116 |   printf("CT updates\n");
117 |   bound = Q-1;
118 |   for (uint32_t step = 0; step < 20; step++) {
119 |     printf("B%"PRIu32" = %"PRId64"\n", step, bound);
120 |     if ((K+1) * bound + Q - K > INT32_MAX) {
121 |       // the next bound would not fit in 32 bits: shrink the current one first
122 |       bound = bound/4096 + Q - K;
123 |       printf("Reduction to %"PRId64"\n", bound);
124 |     }
125 |     bound = (K+1) * bound + Q - K;
126 |   }
127 |   printf("\n");
128 | 
129 |   // GS rule: b -> 2 * K * b + Q - K
130 |   printf("GS updates\n");
131 |   bound = Q-1;
132 |   for (uint32_t step = 0; step < 20; step++) {
133 |     printf("B%"PRIu32" = %"PRIi64"\n", step, bound);
134 |     if (2 * K * bound + Q - K > INT32_MAX) {
135 |       // the next bound would not fit in 32 bits: shrink the current one first
136 |       bound = bound/4096 + Q - K;
137 |       printf("Reduction to %"PRId64"\n", bound);
138 |     }
139 |     bound = 2 * K * bound + Q - K;
140 |   }
141 |   printf("\n");
142 | }
143 | 
144 | 
145 | /*
146 |  * CT butterfly: x' = x + red(w * y)
147 |  *               y' = x - red(w * y)
148 |  *
149 |  * Returns a bound on |x'| and |y'| given |x| <= b and |y| <= b:
150 |  * (K+1)*b, plus the term (Q-K) - (b+1)/4096 whenever that term is positive.
151 |  */
152 | static int64_t ct_bound(int64_t b) {
153 |   const int64_t slack = (Q - K) - (b+1)/4096;
154 |   const int64_t scaled = (K + 1) * b;
155 |   return slack > 0 ? scaled + slack : scaled;
156 | }
157 | 
158 | /*
159 |  * GS butterfly: x' = x + y
160 |  *               y' = red(w * (x - y))
161 |  *
162 |  * Returns a bound on |x'| and |y'| given |x| <= b and |y| <= b:
163 |  * K*(2b), plus the term (Q-K) - (2b+1)/4096 whenever that term is positive.
164 |  */
165 | static int64_t gs_bound(int64_t b) {
166 |   const int64_t diff = 2 * b; // |x - y| <= 2b
167 |   const int64_t slack = (Q - K) - (diff + 1)/4096;
168 |   const int64_t scaled = K * diff;
169 |   return slack > 0 ? scaled + slack : scaled;
170 | }
171 | 
172 | static void iter_bound_abs2(void) {
173 |   // Same report as iter_bound_abs(), but each step uses ct_bound() / gs_bound() instead of the inline formulas.
174 |   int64_t bound;
175 | 
176 |   // CT rule: b -> ct_bound(b)
177 |   printf("CT updates\n");
178 |   bound = Q-1;
179 |   for (uint32_t step = 0; step < 20; step++) {
180 |     printf("B%"PRIu32" = %"PRId64"\n", step, bound);
181 |     if (ct_bound(bound) > INT32_MAX) {
182 |       // the next bound would not fit in 32 bits: shrink the current one first
183 |       bound = bound/4096 + Q - K;
184 |       printf("Reduction to %"PRId64"\n", bound);
185 |     }
186 |     bound = ct_bound(bound);
187 |   }
188 | 
189 |   // GS rule: b -> gs_bound(b)
190 |   printf("\nGS updates\n");
191 |   bound = Q-1;
192 |   for (uint32_t step = 0; step < 20; step++) {
193 |     printf("B%"PRIu32" = %"PRId64"\n", step, bound);
194 |     if (gs_bound(bound) > INT32_MAX) {
195 |       // the next bound would not fit in 32 bits: shrink the current one first
196 |       bound = bound/4096 + Q - K;
197 |       printf("Reduction to %"PRId64"\n", bound);
198 |     }
199 |     bound = gs_bound(bound);
200 |   }
201 | }
202 | 
203 | int main(void) {
204 |   int64_t x;
205 |   uint64_t u;
206 |   // Step 1: bisect for the interval on which red() stays within 32 bits (sets lower_bound and upper_bound).
207 |   find_lower_bound();
208 |   find_upper_bound();
209 |   // Step 2: re-check red() value by value around both ends of that interval; every overflow found is printed.
210 |   for (x=-10000; x < 1000000000; x++) {
211 |     if (test_overflow(lower_bound + x)) {
212 |       printf("Overflow for lower_bound + %"PRId64"\n", x);
213 |     }
214 |   }
215 |   printf("\n");
216 |   for (x= -1000000000; x < 10000; x++) {
217 |     if (test_overflow(upper_bound + x)) {
218 |       printf("Overflow for upper_bound + %"PRId64"\n", x);
219 |     }
220 |   }
221 |   printf("\n");
222 |   u = lower_bound; // converted to uint64_t so the hexadecimal bit pattern can be printed
223 |   printf("Hex: lower_bound = %"PRIx64"\n", u); // PRIx64 matches uint64_t on every platform (unlike %llx)
224 |   u = upper_bound;
225 |   printf("Hex: upper_bound = %"PRIx64"\n", u);
226 |   printf("\n");
227 |   // Step 3: print the iterated CT / GS bound tables.
228 |   iter_bound_abs();
229 |   iter_bound_abs2();
230 | 
231 |   return 0;
232 | }
233 | 


--------------------------------------------------------------------------------