├── kex.c
├── README.txt
├── License.txt
├── Visual Studio
├── tests
│ ├── tests.vcxproj.filters
│ └── tests.vcxproj
└── LatticeCrypto
│ ├── LatticeCrypto.vcxproj.filters
│ ├── LatticeCrypto.sln
│ └── LatticeCrypto.vcxproj
├── AMD64
├── ntt_x64.c
├── consts.c
├── error_asm.S
└── ntt_x64_asm.S
├── tests
├── test_extras.h
├── test_extras.c
└── tests.c
├── makefile
├── random.c
├── LatticeCrypto_priv.h
├── generic
└── ntt.c
├── LatticeCrypto.h
└── ntt_constants.c
/kex.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/b/LatticeCrypto/master/kex.c
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/b/LatticeCrypto/master/README.txt
--------------------------------------------------------------------------------
/License.txt:
--------------------------------------------------------------------------------
1 | LatticeCrypto
2 |
3 | Copyright (c) Microsoft Corporation
4 | All rights reserved.
5 |
6 | MIT License
7 |
8 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
9 | associated documentation files (the "Software"), to deal in the Software without restriction,
10 | including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 | and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
12 | subject to the following conditions:
13 |
14 | The above copyright notice and this permission notice shall be included in all copies or substantial
15 | portions of the Software.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
18 | LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Visual Studio/tests/tests.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;hm;inl;inc;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 |
18 |
19 | Header Files
20 |
21 |
22 | Header Files
23 |
24 |
25 | Header Files
26 |
27 |
28 |
29 |
30 | Source Files
31 |
32 |
33 | Source Files
34 |
35 |
36 |
--------------------------------------------------------------------------------
/Visual Studio/LatticeCrypto/LatticeCrypto.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;hm;inl;inc;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 | {d2c0b572-de10-4258-8700-245d332f161c}
18 |
19 |
20 |
21 |
22 | Source Files
23 |
24 |
25 | Source Files
26 |
27 |
28 | Source Files\generic
29 |
30 |
31 | Source Files
32 |
33 |
34 |
35 |
36 | Header Files
37 |
38 |
39 | Header Files
40 |
41 |
42 |
--------------------------------------------------------------------------------
/AMD64/ntt_x64.c:
--------------------------------------------------------------------------------
1 | /****************************************************************************************
2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
3 | *
4 | * Copyright (c) Microsoft Corporation. All rights reserved.
5 | *
6 | *
7 | * Abstract: NTT functions and other low-level operations
8 | *
9 | *****************************************************************************************/
10 |
11 | #include "../LatticeCrypto_priv.h"
12 |
13 |
void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N)
{ // Forward NTT (standard-order input, bit-reversed-order output).
  // Thin wrapper dispatching to the x64 assembly implementation
  // (NTT_CT_std2rev_12289_asm, declared in LatticeCrypto_priv.h; see AMD64/ntt_x64_asm.S).
    NTT_CT_std2rev_12289_asm(a, psi_rev, N);
}
18 |
19 |
void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N)
{ // Inverse NTT (bit-reversed-order input, standard-order output).
  // Thin wrapper dispatching to the x64 assembly implementation
  // (see AMD64/ntt_x64_asm.S); scaling factors Ninv and omegainv1N_rev are passed through.
    INTT_GS_rev2std_12289_asm(a, omegainv_rev, omegainv1N_rev, Ninv, N);
}
24 |
25 |
void two_reduce12289(int32_t* a, unsigned int N)
{ // Two consecutive reductions modulo q = 12289 applied to each of the N coefficients.
  // Thin wrapper dispatching to the x64 assembly implementation (see AMD64/ntt_x64_asm.S).
    two_reduce12289_asm(a, N);
}
30 |
31 |
void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N)
{ // Component-wise (pointwise) multiplication c[i] = a[i]*b[i] reduced mod q.
  // Thin wrapper dispatching to the x64 assembly implementation (see AMD64/ntt_x64_asm.S).
    pmul_asm(a, b, c, N);
}
36 |
37 |
void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N)
{ // Component-wise multiply-accumulate d[i] = a[i]*b[i] + c[i] reduced mod q.
  // Thin wrapper dispatching to the x64 assembly implementation (see AMD64/ntt_x64_asm.S).
    pmuladd_asm(a, b, c, d, N);
}
42 |
43 |
44 | void smul(int32_t* a, int32_t scalar, unsigned int N)
45 | {
46 | unsigned int i;
47 |
48 | for (i = 0; i < N; i++) {
49 | a[i] = a[i]*scalar;
50 | }
51 | }
52 |
53 |
54 | void correction(int32_t* a, int32_t p, unsigned int N)
55 | {
56 | unsigned int i;
57 | int32_t mask;
58 |
59 | for (i = 0; i < N; i++) {
60 | mask = a[i] >> (4*sizeof(int32_t) - 1);
61 | a[i] += (p & mask) - p;
62 | mask = a[i] >> (4*sizeof(int32_t) - 1);
63 | a[i] += (p & mask);
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/AMD64/consts.c:
--------------------------------------------------------------------------------
1 | /****************************************************************************************
2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
3 | *
4 | * Copyright (c) Microsoft Corporation. All rights reserved.
5 | *
6 | *
7 | * Abstract: constants for the x64 assembly implementation
8 | *
9 | *****************************************************************************************/
10 |
11 | #include "../LatticeCrypto_priv.h"
12 | #include <stdint.h>
13 |
14 |
// Vector constants consumed by the x64 assembly (ntt_x64_asm.S / error_asm.S):
// values replicated per SIMD lane, plus lane-index tables (PERM*) and bit masks.
uint32_t PRIME8x[8] = {PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q};  // q = 12289 in all 8 lanes
uint8_t ONE32x[32] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};  // byte constant 1 in 32 lanes
uint32_t MASK12x8[8] = {0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff};                 // low-12-bit mask (q = 3*2^12 + 1)
uint32_t PERM0246[4] = {0,2,4,6};              // indices of the even 32-bit lanes
uint32_t PERM00224466[8] = {0,0,2,2,4,4,6,6};  // even lanes, each duplicated
uint32_t PERM02134657[8] = {0,2,1,3,4,6,5,7};  // middle pair swapped within each half
uint64_t PERM0145[4] = {0,1,4,5};              // low 64-bit lanes of each 128-bit half
uint64_t PERM2367[4] = {2,3,6,7};              // high 64-bit lanes of each 128-bit half
uint64_t MASK32[4] = {0xffffffff,0,0xffffffff,0};        // low 32 bits of alternating 64-bit lanes
uint64_t MASK42[4] = {0x3fff0000000,0,0x3fff0000000,0};  // 14-bit field at bit 28

// 14-bit field masks at bit offsets 0, 14, 28, 42 — four 14-bit coefficients packed
// into 56 bits of a 64-bit lane (q < 2^14).
uint64_t MASK14_1[4] = {0x3fff,0,0x3fff,0};
uint64_t MASK14_2[4] = {0xFFFC000,0,0xFFFC000,0};
uint64_t MASK14_3[4] = {0x3FFF0000000,0,0x3FFF0000000,0};
uint64_t MASK14_4[4] = {0xFFFC0000000000,0,0xFFFC0000000000,0};

// Small scalars and rounded multiples of q replicated 8-wide; values match the
// PARAMETER_kQ4 / PARAMETER_kQ2 macros (ceil(k*q/4), ceil(k*q/2)) in LatticeCrypto_priv.h.
uint32_t ONE8x[8] = {1,1,1,1,1,1,1,1};
uint32_t THREE8x[8] = {3,3,3,3,3,3,3,3};
uint32_t FOUR8x[8] = {4,4,4,4,4,4,4,4};
uint32_t PARAM_Q4x8[8] = {3073,3073,3073,3073,3073,3073,3073,3073};          // ceil(q/4)
uint32_t PARAM_3Q4x8[8] = {9217,9217,9217,9217,9217,9217,9217,9217};         // ceil(3q/4)
uint32_t PARAM_5Q4x8[8] = {15362,15362,15362,15362,15362,15362,15362,15362}; // ceil(5q/4)
uint32_t PARAM_7Q4x8[8] = {21506,21506,21506,21506,21506,21506,21506,21506}; // ceil(7q/4)
uint32_t PARAM_Q2x8[8] = {6145,6145,6145,6145,6145,6145,6145,6145};          // ceil(q/2)
uint32_t PARAM_3Q2x8[8] = {18434,18434,18434,18434,18434,18434,18434,18434}; // ceil(3q/2)
40 |
41 |
--------------------------------------------------------------------------------
/Visual Studio/LatticeCrypto/LatticeCrypto.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 2013
4 | VisualStudioVersion = 12.0.21005.1
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tests", "..\tests\tests.vcxproj", "{C9639168-C3FF-4427-BC3B-D907FF11DE73}"
7 | ProjectSection(ProjectDependencies) = postProject
8 | {8283DD76-E88A-4B63-ABDE-33F014178413} = {8283DD76-E88A-4B63-ABDE-33F014178413}
9 | EndProjectSection
10 | EndProject
11 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LatticeCrypto", "LatticeCrypto.vcxproj", "{8283DD76-E88A-4B63-ABDE-33F014178413}"
12 | EndProject
13 | Global
14 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
15 | Debug|Win32 = Debug|Win32
16 | Debug|x64 = Debug|x64
17 | Generic|Win32 = Generic|Win32
18 | Generic|x64 = Generic|x64
19 | Release|Win32 = Release|Win32
20 | Release|x64 = Release|x64
21 | EndGlobalSection
22 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
23 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Debug|Win32.ActiveCfg = Debug|Win32
24 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Debug|x64.ActiveCfg = Debug|x64
25 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Generic|Win32.ActiveCfg = Generic|Win32
26 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Generic|Win32.Build.0 = Generic|Win32
27 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Generic|x64.ActiveCfg = Generic|x64
28 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Generic|x64.Build.0 = Generic|x64
29 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Release|Win32.ActiveCfg = Release|Win32
30 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Release|x64.ActiveCfg = Release|x64
31 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Debug|Win32.ActiveCfg = Debug|Win32
32 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Debug|x64.ActiveCfg = Debug|x64
33 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Generic|Win32.ActiveCfg = Generic|Win32
34 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Generic|Win32.Build.0 = Generic|Win32
35 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Generic|x64.ActiveCfg = Generic|x64
36 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Generic|x64.Build.0 = Generic|x64
37 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Release|Win32.ActiveCfg = Release|Win32
38 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Release|x64.ActiveCfg = Release|x64
39 | EndGlobalSection
40 | GlobalSection(SolutionProperties) = preSolution
41 | HideSolutionNode = FALSE
42 | EndGlobalSection
43 | EndGlobal
44 |
--------------------------------------------------------------------------------
/tests/test_extras.h:
--------------------------------------------------------------------------------
1 | /****************************************************************************************
2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
3 | *
4 | * Copyright (c) Microsoft Corporation. All rights reserved.
5 | *
6 | *
7 | * Abstract: utility header file for tests
8 | *
9 | *****************************************************************************************/
10 |
11 | #ifndef __TEST_EXTRAS_H__
12 | #define __TEST_EXTRAS_H__
13 |
14 |
15 | // For C++
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 |
20 |
21 | #include "../LatticeCrypto_priv.h"
22 |
23 |
24 | // Access system counter for benchmarking
25 | int64_t cpucycles(void);
26 |
27 | // Generate "nbytes" of random values and output the result to random_array.
28 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY.
29 | CRYPTO_STATUS random_bytes_test(unsigned int nbytes, unsigned char* random_array);
30 |
31 | // Generate "array_ndigits" of 32-bit values and output the result to extended_array.
32 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY.
33 | CRYPTO_STATUS extendable_output_test(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array);
34 |
35 | // Generate "array_nbytes" of values and output the result to stream_array.
36 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY.
37 | CRYPTO_STATUS stream_output_test(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array);
38 |
39 | // Generating a pseudo-random polynomial a[x] over GF(p)
40 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY.
41 | void random_poly_test(int32_t* a, unsigned int p, unsigned int pbits, unsigned int N);
42 |
43 | // Comparing two polynomials over GF(p), a[x]=b[x]? : (0) a=b, (1) a!=b
44 | // NOTE: TO BE USED FOR TESTING ONLY.
45 | int compare_poly(int32_t* a, int32_t* b, unsigned int N);
46 |
47 | // Modular reduction
48 | // NOTE: TO BE USED FOR TESTING ONLY.
49 | int reduce(int a, int p);
50 |
51 | // Polynomial multiplication using the schoolbook method, c[x] = a[x]*b[x]
52 | // NOTE: TO BE USED FOR TESTING ONLY.
53 | void mul_test(int32_t* a, int32_t* b, int32_t* c, uint32_t p, unsigned int N);
54 |
55 | // Polynomial addition, c[x] = a[x] + b[x]
56 | // NOTE: TO BE USED FOR TESTING ONLY.
57 | void add_test(int32_t* a, int32_t* b, int32_t* c, uint32_t p, unsigned int N);
58 |
59 |
60 | #ifdef __cplusplus
61 | }
62 | #endif
63 |
64 |
65 | #endif
--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
1 | #### Makefile for compilation on Linux ####
2 |
3 | OPT=-O3 # Optimization option by default
4 |
5 | ifeq "$(CC)" "gcc"
6 | COMPILER=gcc
7 | else ifeq "$(CC)" "clang"
8 | COMPILER=clang
9 | endif
10 |
11 | ifeq "$(ARCH)" "x64"
12 | ARCHITECTURE=_AMD64_
13 | else ifeq "$(ARCH)" "x86"
14 | ARCHITECTURE=_X86_
15 | else ifeq "$(ARCH)" "ARM"
16 | ARCHITECTURE=_ARM_
17 | endif
18 |
19 | ADDITIONAL_SETTINGS=
20 | ifeq "$(SET)" "EXTENDED"
21 | ADDITIONAL_SETTINGS=-fwrapv -fomit-frame-pointer -march=native
22 | endif
23 |
24 | ifeq "$(ASM)" "TRUE"
25 | USE_ASM=-D _ASM_
26 | endif
27 |
28 | ifeq "$(GENERIC)" "TRUE"
29 | USE_GENERIC=-D _GENERIC_
30 | endif
31 |
32 | ifeq "$(AVX2)" "TRUE"
33 | USE_AVX2=-D _AVX2_
34 | SIMD=-mavx2
35 | endif
36 |
37 | ifeq "$(ARCH)" "ARM"
38 | ARM_SETTING=-lrt
39 | endif
40 |
41 | cc=$(COMPILER)
42 | CFLAGS=-c $(OPT) $(ADDITIONAL_SETTINGS) $(SIMD) -D $(ARCHITECTURE) -D __LINUX__ $(USE_AVX2) $(USE_ASM) $(USE_GENERIC)
43 | LDFLAGS=
44 | ifeq "$(GENERIC)" "TRUE"
45 | OTHER_OBJECTS=ntt.o
46 | else
47 | ifeq "$(ASM)" "TRUE"
48 | OTHER_OBJECTS=ntt_x64.o consts.o
49 | ASM_OBJECTS=ntt_x64_asm.o error_asm.o
50 | endif
51 | endif
52 | OBJECTS=kex.o random.o ntt_constants.o $(ASM_OBJECTS) $(OTHER_OBJECTS)
53 | OBJECTS_TEST=tests.o test_extras.o $(OBJECTS)
54 | OBJECTS_ALL=$(OBJECTS) $(OBJECTS_TEST)
55 |
56 | test: $(OBJECTS_TEST)
57 | $(CC) -o test $(OBJECTS_TEST) $(ARM_SETTING)
58 |
59 | kex.o: kex.c LatticeCrypto_priv.h
60 | $(CC) $(CFLAGS) kex.c
61 |
62 | random.o: random.c LatticeCrypto_priv.h
63 | $(CC) $(CFLAGS) random.c
64 |
65 | ntt_constants.o: ntt_constants.c LatticeCrypto_priv.h
66 | $(CC) $(CFLAGS) ntt_constants.c
67 |
68 | ifeq "$(GENERIC)" "TRUE"
69 | ntt.o: generic/ntt.c LatticeCrypto_priv.h
70 | $(CC) $(CFLAGS) generic/ntt.c
71 | else
72 | ifeq "$(ASM)" "TRUE"
73 | ntt_x64.o: AMD64/ntt_x64.c
74 | $(CC) $(CFLAGS) AMD64/ntt_x64.c
75 | ntt_x64_asm.o: AMD64/ntt_x64_asm.S
76 | $(CC) $(CFLAGS) AMD64/ntt_x64_asm.S
77 | error_asm.o: AMD64/error_asm.S
78 | $(CC) $(CFLAGS) AMD64/error_asm.S
79 | consts.o: AMD64/consts.c
80 | $(CC) $(CFLAGS) AMD64/consts.c
81 | endif
82 | endif
83 |
84 | test_extras.o: tests/test_extras.c tests/test_extras.h LatticeCrypto_priv.h
85 | $(CC) $(CFLAGS) tests/test_extras.c
86 |
87 | tests.o: tests/tests.c LatticeCrypto_priv.h
88 | $(CC) $(CFLAGS) tests/tests.c
89 |
90 | .PHONY: clean
91 |
92 | clean:
93 | rm -f test ntt.o ntt_x64.o ntt_x64_asm.o error_asm.o consts.o $(OBJECTS_ALL)
94 |
95 |
--------------------------------------------------------------------------------
/random.c:
--------------------------------------------------------------------------------
1 | /****************************************************************************************
2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
3 | *
4 | * Copyright (c) Microsoft Corporation. All rights reserved.
5 | *
6 | *
7 | * Abstract: wrappers for user-provided functions
8 | *
9 | *****************************************************************************************/
10 |
11 |
12 | #include "LatticeCrypto_priv.h"
13 |
14 |
15 | CRYPTO_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction)
16 | { // Output "nbytes" of random values.
17 | // It makes requests of random values to RandomBytesFunction. If successful, the output is given in "random_array".
18 | // The caller is responsible for providing the "RandomBytesFunction" function passing random values as octets.
19 |
20 | if (random_array == NULL || RandomBytesFunction == NULL || nbytes == 0) {
21 | return CRYPTO_ERROR_INVALID_PARAMETER;
22 | }
23 |
24 | return (RandomBytesFunction)(nbytes, random_array);
25 | }
26 |
27 |
28 | CRYPTO_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction)
29 | { // Output "array_ndigits" of values in [0, q-1] using an extendable-output function and a seed of size "seed_nbytes".
30 | // It makes requests of values to ExtendableOutputFunction. If successful, the output is given in "extended_array".
31 | // The caller is responsible for providing the "ExtendableOutputFunction" function passing values as 32-bit digits.
32 |
33 | if (seed == NULL || extended_array == NULL || ExtendableOutputFunction == NULL || seed_nbytes == 0 || array_ndigits == 0) {
34 | return CRYPTO_ERROR_INVALID_PARAMETER;
35 | }
36 |
37 | return (ExtendableOutputFunction)(seed, seed_nbytes, array_ndigits, extended_array);
38 | }
39 |
40 |
41 | CRYPTO_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction)
42 | { // Output "array_nbytes" of values using a stream cipher, a seed of size "seed_nbytes" and a nonce of size "nonce_nbytes".
43 | // It makes requests of values to StreamOutputFunction. If successful, the output is given in "stream_array".
44 | // The caller is responsible for providing the "StreamOutputFunction" function passing values as octets.
45 |
46 | if (seed == NULL || stream_array == NULL || StreamOutputFunction == NULL || seed_nbytes == 0 || nonce_nbytes == 0 || array_nbytes == 0) {
47 | return CRYPTO_ERROR_INVALID_PARAMETER;
48 | }
49 |
50 | return (StreamOutputFunction)(seed, seed_nbytes, nonce, nonce_nbytes, array_nbytes, stream_array);
51 | }
--------------------------------------------------------------------------------
/LatticeCrypto_priv.h:
--------------------------------------------------------------------------------
1 | /****************************************************************************************
2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
3 | *
4 | * Copyright (c) Microsoft Corporation. All rights reserved.
5 | *
6 | *
7 | * Abstract: internal header file
8 | *
9 | *****************************************************************************************/
10 |
11 | #ifndef __LatticeCrypto_priv_H__
12 | #define __LatticeCrypto_priv_H__
13 |
14 |
15 | // For C++
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 |
20 |
21 | #include "LatticeCrypto.h"
22 |
23 |
// Basic constants
#define PARAMETER_N 1024          // ring dimension
#define PARAMETER_Q 12289         // modulus q = 3*2^12 + 1
#define SEED_BYTES (256/8)        // parenthesized: safe inside any expression (e.g. x % SEED_BYTES)
#define ERROR_SEED_BYTES (256/8)
#define NONCE_SEED_BYTES (256/8)
#define PARAMETER_Q4 3073         // ceil(q/4)
#define PARAMETER_3Q4 9217        // ceil(3q/4)
#define PARAMETER_5Q4 15362       // ceil(5q/4)
#define PARAMETER_7Q4 21506       // ceil(7q/4)
#define PARAMETER_Q2 6145         // ceil(q/2)
#define PARAMETER_3Q2 18434       // ceil(3q/2)
36 |
37 |
38 | // Macro definitions
39 |
40 | #define NBITS_TO_NWORDS(nbits) (((nbits)+(sizeof(digit_t)*8)-1)/(sizeof(digit_t)*8)) // Conversion macro from number of bits to number of computer words
41 | #define NBYTES_TO_NWORDS(nbytes) (((nbytes)+sizeof(digit_t)-1)/sizeof(digit_t)) // Conversion macro from number of bytes to number of computer words
42 |
43 | // Macro to avoid compiler warnings when detecting unreferenced parameters
44 | #define UNREFERENCED_PARAMETER(PAR) (PAR)
45 |
46 |
47 | /******************** Function prototypes *******************/
48 | /******************* Polynomial functions *******************/
49 |
50 | // Forward NTT
51 | void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N);
52 | void NTT_CT_std2rev_12289_asm(int32_t* a, const int32_t* psi_rev, unsigned int N);
53 |
54 | // Inverse NTT
55 | void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N);
56 | void INTT_GS_rev2std_12289_asm(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N);
57 |
58 | // Reduction modulo q
59 | int32_t reduce12289(int64_t a);
60 |
61 | // Two merged reductions modulo q
62 | int32_t reduce12289_2x(int64_t a);
63 |
64 | // Two consecutive reductions modulo q
65 | void two_reduce12289(int32_t* a, unsigned int N);
66 | void two_reduce12289_asm(int32_t* a, unsigned int N);
67 |
68 | // Correction modulo q
69 | void correction(int32_t* a, int32_t p, unsigned int N);
70 |
71 | // Component-wise multiplication
72 | void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N);
73 | void pmul_asm(int32_t* a, int32_t* b, int32_t* c, unsigned int N);
74 |
75 | // Component-wise multiplication and addition
76 | void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N);
77 | void pmuladd_asm(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N);
78 |
79 | // Component-wise multiplication with scalar
80 | void smul(int32_t* a, int32_t scalar, unsigned int N);
81 |
82 | /******************* Key exchange functions *******************/
83 |
84 | // Alice's message encoding
85 | void encode_A(const uint32_t* pk, const unsigned char* seed, unsigned char* m);
86 |
87 | // Alice's message decoding
88 | void decode_A(const unsigned char* m, uint32_t *pk, unsigned char* seed);
89 |
90 | // Bob's message encoding
91 | void encode_B(const uint32_t* pk, const uint32_t* rvec, unsigned char* m);
92 |
93 | // Bob's message decoding
94 | void decode_B(unsigned char* m, uint32_t* pk, uint32_t* rvec);
95 |
96 | // Partial message encoding/decoding (assembly optimized)
97 | void encode_asm(const uint32_t* pk, unsigned char* m);
98 | void decode_asm(const unsigned char* m, uint32_t *pk);
99 |
100 | // Reconciliation helper
101 | CRYPTO_STATUS HelpRec(const uint32_t* x, uint32_t* rvec, const unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction);
102 |
103 | // Partial reconciliation helper (assembly optimized)
104 | void helprec_asm(const uint32_t* x, uint32_t* rvec, unsigned char* random_bits);
105 |
106 | // Reconciliation
107 | void Rec(const uint32_t *x, const uint32_t* rvec, unsigned char *key);
108 | void rec_asm(const uint32_t *x, const uint32_t* rvec, unsigned char *key);
109 |
110 | // Error sampling
111 | CRYPTO_STATUS get_error(int32_t* e, unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction);
112 |
113 | // Partial error sampling (assembly optimized)
114 | void error_sampling_asm(unsigned char* stream, int32_t* e);
115 |
116 | // Generation of parameter a
117 | CRYPTO_STATUS generate_a(uint32_t* a, const unsigned char* seed, ExtendableOutput ExtendableOutputFunction);
118 |
119 |
120 | #ifdef __cplusplus
121 | }
122 | #endif
123 |
124 |
125 | #endif
126 |
--------------------------------------------------------------------------------
/generic/ntt.c:
--------------------------------------------------------------------------------
1 | /****************************************************************************************
2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
3 | *
4 | * Copyright (c) Microsoft Corporation. All rights reserved.
5 | *
6 | *
7 | * Abstract: NTT functions and other polynomial operations
8 | *
9 | *****************************************************************************************/
10 |
11 | #include "../LatticeCrypto_priv.h"
12 |
const uint32_t mask12 = ((uint64_t)1 << 12) - 1;  // 0xFFF: selects the low 12 bits (q = 12289 = 3*2^12 + 1)
14 |
15 |
16 | int32_t reduce12289(int64_t a)
17 | { // Reduction modulo q
18 | int32_t c0, c1;
19 |
20 | c0 = (int32_t)(a & mask12);
21 | c1 = (int32_t)(a >> 12);
22 |
23 | return (3*c0 - c1);
24 | }
25 |
26 |
27 | int32_t reduce12289_2x(int64_t a)
28 | { // Two merged reductions modulo q
29 | int32_t c0, c1, c2;
30 |
31 | c0 = (int32_t)(a & mask12);
32 | c1 = (int32_t)((a >> 12) & mask12);
33 | c2 = (int32_t)(a >> 24);
34 |
35 | return (9*c0 - 3*c1 + c2);
36 | }
37 |
38 |
void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N)
{ // Forward NTT, Cooley-Tukey butterflies: standard-order input, bit-reversed-order output.
  // a:       coefficient array, transformed in place
  // psi_rev: table of twiddle factors (powers of psi) stored in bit-reversed order
  // N:       number of coefficients
  // NOTE(review): the stage boundaries (m < 128, the fixed i < 128 middle stage with
  // "k = 4", then m = 256 onward) hard-code the lazy-reduction schedule for N = 1024;
  // confirm before calling with any other size.
    unsigned int m, i, j, j1, j2, k = N;
    int32_t S, U, V;

    // Stages m = 1..64: butterfly with only the twiddle product reduced
    // (reduce12289 yields a value congruent to 3*product mod q); the sums and
    // differences are left unreduced and grow across stages.
    for (m = 1; m < 128; m = 2*m) {
        k = k >> 1;
        for (i = 0; i < m; i++) {
            j1 = 2*i*k;
            j2 = j1+k-1;
            S = psi_rev[m+i];
            for (j = j1; j <= j2; j++) {
                U = a[j];
                V = reduce12289((int64_t)a[j+k]*S);
                a[j] = U+V;
                a[j+k] = U-V;
            }
        }
    }

    // Stage m = 128 (butterfly distance 4): extra reductions are interleaved here to
    // pull the accumulated coefficients back into range.  NOTE(review): reduce12289
    // is congruent to 3x and reduce12289_2x to 9x mod q; the differing factors are
    // presumably compensated by the scaling baked into the psi_rev table — confirm
    // against ntt_constants.c.
    k = 4;
    for (i = 0; i < 128; i++) {
        j1 = 8*i;
        j2 = j1+3;
        S = psi_rev[i+128];
        for (j = j1; j <= j2; j++) {
            U = reduce12289((int64_t)a[j]);
            V = reduce12289_2x((int64_t)a[j+4]*S);
            a[j] = U+V;
            a[j+4] = U-V;
        }
    }

    // Stages m = 256 .. N/2: same lazily-reduced butterfly as the first loop nest.
    for (m = 256; m < N; m = 2*m) {
        k = k >> 1;
        for (i = 0; i < m; i++) {
            j1 = 2*i*k;
            j2 = j1+k-1;
            S = psi_rev[m+i];
            for (j = j1; j <= j2; j++) {
                U = a[j];
                V = reduce12289((int64_t)a[j+k]*S);
                a[j] = U+V;
                a[j+k] = U-V;
            }
        }
    }
    return;
}
88 |
89 |
void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N)
{ // Inverse NTT, Gentleman-Sande butterflies: bit-reversed-order input, standard-order output.
  // a:              coefficient array, transformed in place
  // omegainv_rev:   inverse twiddle factors in bit-reversed order
  // omegainv1N_rev: combined factor applied to the odd half in the final stage
  // Ninv:           scaling factor (related to N^-1 mod q) applied in the final stage
  // N:              number of coefficients
    unsigned int m, h, i, j, j1, j2, k = 1;
    int32_t S, U, V;
    int64_t temp;

    // All stages except the last: GS butterfly with lazy reduction.  The extra
    // reduction pass triggered at m == 32 keeps the growing coefficients in range.
    // NOTE(review): that trigger point hard-codes the schedule for N = 1024 —
    // confirm before using any other size.
    for (m = N; m > 2; m >>= 1) {
        j1 = 0;
        h = m >> 1;
        for (i = 0; i < h; i++) {
            j2 = j1+k-1;
            S = omegainv_rev[h+i];
            for (j = j1; j <= j2; j++) {
                U = a[j];
                V = a[j+k];
                a[j] = U+V;
                temp = (int64_t)(U-V)*S;
                if (m == 32) {
                    a[j] = reduce12289((int64_t)a[j]);
                    a[j+k] = reduce12289_2x(temp);
                } else {
                    a[j+k] = reduce12289(temp);
                }
            }
            j1 = j1+2*k;
        }
        k = 2*k;
    }
    // Final stage: the scaling by Ninv (even half) and omegainv1N_rev (odd half)
    // is folded directly into the last butterfly.
    for (j = 0; j < k; j++) {
        U = a[j];
        V = a[j+k];
        a[j] = reduce12289((int64_t)(U+V)*Ninv);
        a[j+k] = reduce12289((int64_t)(U-V)*omegainv1N_rev);
    }
    return;
}
126 |
127 |
128 | void two_reduce12289(int32_t* a, unsigned int N)
129 | { // Two consecutive reductions modulo q
130 | unsigned int i;
131 |
132 | for (i = 0; i < N; i++) {
133 | a[i] = reduce12289((int64_t)a[i]);
134 | a[i] = reduce12289((int64_t)a[i]);
135 | }
136 | }
137 |
138 |
139 | void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N)
140 | { // Component-wise multiplication
141 | unsigned int i;
142 |
143 | for (i = 0; i < N; i++) {
144 | c[i] = reduce12289((int64_t)a[i]*b[i]);
145 | c[i] = reduce12289((int64_t)c[i]);
146 | }
147 | }
148 |
149 |
150 | void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N)
151 | { // Component-wise multiplication and addition
152 | unsigned int i;
153 |
154 | for (i = 0; i < N; i++) {
155 | d[i] = reduce12289((int64_t)a[i]*b[i] + c[i]);
156 | d[i] = reduce12289((int64_t)d[i]);
157 | }
158 | }
159 |
160 |
161 | void smul(int32_t* a, int32_t scalar, unsigned int N)
162 | { // Component-wise multiplication with scalar
163 | unsigned int i;
164 |
165 | for (i = 0; i < N; i++) {
166 | a[i] = a[i]*scalar;
167 | }
168 | }
169 |
170 |
171 | void correction(int32_t* a, int32_t p, unsigned int N)
172 | { // Correction modulo q
173 | unsigned int i;
174 | int32_t mask;
175 |
176 | for (i = 0; i < N; i++) {
177 | mask = a[i] >> (4*sizeof(int32_t) - 1);
178 | a[i] += (p & mask) - p;
179 | mask = a[i] >> (4*sizeof(int32_t) - 1);
180 | a[i] += (p & mask);
181 | }
182 | }
183 |
--------------------------------------------------------------------------------
/tests/test_extras.c:
--------------------------------------------------------------------------------
1 | /****************************************************************************************
2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
3 | *
4 | * Copyright (c) Microsoft Corporation. All rights reserved.
5 | *
6 | *
7 | * Abstract: additional functions for testing
8 | *
9 | *****************************************************************************************/
10 |
11 |
12 | #include "../LatticeCrypto_priv.h"
13 | #include "test_extras.h"
14 | #if (OS_TARGET == OS_WIN)
15 | #include <windows.h>
16 | #include <intrin.h>
17 | #endif
18 | #if (OS_TARGET == OS_LINUX) && (TARGET == TARGET_ARM)
19 | #include <time.h>
20 | #endif
21 | #include <stdlib.h>
22 |
23 |
int64_t cpucycles(void)
{ // Access a system counter for benchmarking.
  // Returns a monotonically increasing tick count whose unit is platform-dependent:
  // CPU cycles on x86/x64 (RDTSC), nanoseconds on ARM Linux, and 0 where unsupported.
#if (OS_TARGET == OS_WIN) && (TARGET == TARGET_AMD64 || TARGET == TARGET_x86)
    return __rdtsc();                 // MSVC intrinsic, requires <intrin.h>
#elif (OS_TARGET == OS_WIN) && (TARGET == TARGET_ARM)
    return __rdpmccntr64();           // Windows-on-ARM performance counter intrinsic
#elif (OS_TARGET == OS_LINUX) && (TARGET == TARGET_AMD64 || TARGET == TARGET_x86)
    unsigned int hi, lo;

    // Inline RDTSC: EDX:EAX holds the 64-bit timestamp counter.
    asm volatile ("rdtsc\n\t" : "=a" (lo), "=d"(hi));
    return ((int64_t)lo) | (((int64_t)hi) << 32);
#elif (OS_TARGET == OS_LINUX) && (TARGET == TARGET_ARM)
    struct timespec time;

    // NOTE(review): CLOCK_REALTIME can jump with clock adjustments; CLOCK_MONOTONIC
    // is usually preferred for timing — confirm intent before changing.
    clock_gettime(CLOCK_REALTIME, &time);
    return (int64_t)(time.tv_sec*1e9 + time.tv_nsec);
#else
    return 0;                         // unsupported platform: no counter available
#endif
}
44 |
45 |
46 | CRYPTO_STATUS random_bytes_test(unsigned int nbytes, unsigned char* random_array)
47 | { // Generate "nbytes" of random values and output the result to random_array.
48 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY.
49 | unsigned int i;
50 |
51 | for (i = 0; i < nbytes; i++) {
52 | *(random_array + i) = (unsigned char)rand(); // nbytes of random values
53 | }
54 |
55 | return CRYPTO_SUCCESS;
56 | }
57 |
58 |
59 | CRYPTO_STATUS extendable_output_test(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array)
60 | { // Generate "array_ndigits" of 32-bit values and output the result to extended_array.
61 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY.
62 | unsigned int count = 0;
63 | uint32_t digit;
64 |
65 | UNREFERENCED_PARAMETER(seed);
66 | UNREFERENCED_PARAMETER(seed_nbytes);
67 | UNREFERENCED_PARAMETER(array_ndigits);
68 |
69 | srand((unsigned int)seed[0]);
70 |
71 | while (count < array_ndigits) {
72 | random_bytes_test(2, (unsigned char*)&digit); // Pull 2 bytes to get a 14-bit value
73 | digit &= 0x3FFF;
74 | if (digit < PARAMETER_Q) { // Take it if it is in [0, q-1]
75 | extended_array[count] = digit;
76 | count++;
77 | }
78 | }
79 |
80 | return CRYPTO_SUCCESS;
81 | }
82 |
83 |
CRYPTO_STATUS stream_output_test(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array)
{ // Generate "array_nbytes" of values and output the result to stream_array.
  // Test stand-in for a stream cipher: the seed and nonce are ignored and the
  // output comes straight from rand() via random_bytes_test.
  // SECURITY NOTE: TO BE USED FOR TESTING ONLY.

    UNREFERENCED_PARAMETER(seed);
    UNREFERENCED_PARAMETER(seed_nbytes);
    UNREFERENCED_PARAMETER(nonce);
    UNREFERENCED_PARAMETER(nonce_nbytes);

    random_bytes_test(array_nbytes, stream_array);

    return CRYPTO_SUCCESS;
}
97 |
98 |
void random_poly_test(int32_t* a, unsigned int p, unsigned int pbits, unsigned int N)
{ // Generate a pseudo-random polynomial a[x] over GF(p): each of the N
  // coefficients is built from two rand() bytes, masked to "pbits" bits and
  // accepted only if it lands in [0, p-1] (rejection sampling).
  // SECURITY NOTE: TO BE USED FOR TESTING ONLY.
    unsigned int i, mask = ((unsigned int)1 << pbits) - 1;

    for (i = 0; i < N; i++) {
        do {
            // Assemble the candidate from the two bytes explicitly instead of
            // aliasing a[] through a byte pointer: the previous version wrote
            // only 2 of each element's 4 bytes and then read the whole 32-bit
            // value (indeterminate upper bytes), and its result depended on
            // the platform's endianness.
            unsigned int lo = (unsigned int)(unsigned char)rand();
            unsigned int hi = (unsigned int)(unsigned char)rand();
            a[i] = (int32_t)(((hi << 8) | lo) & mask);
        } while (a[i] >= (int32_t)p);                  // reject out-of-range candidates
    }
}
113 |
114 |
int compare_poly(int32_t* a, int32_t* b, unsigned int N)
{ // Compare two N-coefficient polynomials over GF(p): returns 0 when a[x] == b[x],
  // 1 at the first differing coefficient.
  // SECURITY NOTE: TO BE USED FOR TESTING ONLY.
    unsigned int k = 0;

    while (k < N) {
        if (a[k] != b[k]) {
            return 1;
        }
        k++;
    }

    return 0;
}
128 |
129 |
int reduce(int a, int p)
{ // Reduce "a" modulo "p" into the canonical range [0, p-1].
  // SECURITY NOTE: TO BE USED FOR TESTING ONLY.
    int r = a % p;          // C remainder keeps the sign of "a"

    return (r < 0) ? r + p : r;
}
138 |
139 |
void mul_test(int32_t* a, int32_t* b, int32_t* c, uint32_t p, unsigned int N)
{ // Schoolbook polynomial multiplication c[x] = a[x]*b[x] over GF(p), reducing
  // by x^N + 1: terms whose degree wraps past N-1 enter with a sign flip.
  // N is assumed to be a power of two (mask-based index wrap).
  // SECURITY NOTE: TO BE USED FOR TESTING ONLY.
    unsigned int i, j;
    unsigned int mask = N - 1;

    for (i = 0; i < N; i++) {
        c[i] = 0;
    }

    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            unsigned int index = (i + j) & mask;
            int32_t prod = a[i] * b[j];
            // Wrapped terms are subtracted (x^N = -1), in-range terms added.
            c[index] = (i + j >= N) ? reduce(c[index] - prod, p)
                                    : reduce(c[index] + prod, p);
        }
    }
}
158 |
159 |
void add_test(int32_t* a, int32_t* b, int32_t* c, uint32_t p, unsigned int N)
{ // Coefficient-wise polynomial addition over GF(p): c[x] = a[x] + b[x].
  // SECURITY NOTE: TO BE USED FOR TESTING ONLY.
    unsigned int k = 0;

    while (k < N) {
        c[k] = reduce(a[k] + b[k], p);   // each sum reduced into [0, p-1]
        k++;
    }
}
--------------------------------------------------------------------------------
/Visual Studio/tests/tests.vcxproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | Win32
7 |
8 |
9 | Debug
10 | x64
11 |
12 |
13 | Generic
14 | Win32
15 |
16 |
17 | Generic
18 | x64
19 |
20 |
21 | Release
22 | Win32
23 |
24 |
25 | Release
26 | x64
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 | {8283dd76-e88a-4b63-abde-33f014178413}
41 |
42 |
43 |
44 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}
45 | Win32Proj
46 | fp_tests
47 | tests
48 |
49 |
50 |
51 | Application
52 | true
53 | v120
54 | Unicode
55 |
56 |
57 | Application
58 | true
59 | v120
60 | Unicode
61 |
62 |
63 | Application
64 | false
65 | v120
66 | true
67 | Unicode
68 |
69 |
70 | Application
71 | false
72 | v120
73 | true
74 | Unicode
75 |
76 |
77 | v120
78 |
79 |
80 | v120
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 | true
100 |
101 |
102 | true
103 |
104 |
105 | false
106 |
107 |
108 | false
109 |
110 |
111 |
112 |
113 |
114 | Level4
115 | Disabled
116 | __WINDOWS__; _X86_;
117 | false
118 | Default
119 | MultiThreadedDLL
120 | true
121 | ProgramDatabase
122 |
123 |
124 | Console
125 | true
126 |
127 |
128 |
129 |
130 |
131 |
132 | Level4
133 | Disabled
134 | __WINDOWS__; _AMD64_;
135 | false
136 | Default
137 | MultiThreadedDLL
138 | true
139 | ProgramDatabase
140 | AdvancedVectorExtensions
141 |
142 |
143 | Console
144 | true
145 |
146 |
147 |
148 |
149 | Level4
150 |
151 |
152 | MaxSpeed
153 | true
154 | true
155 | __WINDOWS__; _X86_;
156 | MultiThreadedDLL
157 |
158 |
159 | Console
160 | true
161 | true
162 | true
163 |
164 |
165 |
166 |
167 | Level4
168 |
169 |
170 | MaxSpeed
171 | true
172 | true
173 | __WINDOWS__; _AMD64_;
174 | MultiThreadedDLL
175 | AdvancedVectorExtensions
176 |
177 |
178 | Console
179 | true
180 | true
181 | true
182 |
183 |
184 |
185 |
186 | Level4
187 | true
188 | true
189 | __WINDOWS__; _X86_; _GENERIC_;
190 | true
191 | MaxSpeed
192 |
193 |
194 | UseLinkTimeCodeGeneration
195 | true
196 |
197 |
198 |
199 |
200 | Level4
201 | true
202 | true
203 | __WINDOWS__; _AMD64_; _GENERIC_;
204 | true
205 | AdvancedVectorExtensions
206 | MaxSpeed
207 |
208 |
209 | UseLinkTimeCodeGeneration
210 | true
211 |
212 |
213 |
214 |
215 |
216 |
--------------------------------------------------------------------------------
/LatticeCrypto.h:
--------------------------------------------------------------------------------
1 | /****************************************************************************************
2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
3 | *
4 | * Copyright (c) Microsoft Corporation. All rights reserved.
5 | *
6 | *
7 | * Abstract: main header file
8 | *
9 | *****************************************************************************************/
10 |
11 | #ifndef __LatticeCrypto_H__
12 | #define __LatticeCrypto_H__
13 |
14 |
15 | // For C++
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 |
20 |
21 | #include
22 | #include
23 | #include
24 |
25 |
26 | // Definition of operating system
27 |
28 | #define OS_WIN 1
29 | #define OS_LINUX 2
30 |
31 | #if defined(__WINDOWS__) // Microsoft Windows OS
32 | #define OS_TARGET OS_WIN
33 | #elif defined(__LINUX__) // Linux OS
34 | #define OS_TARGET OS_LINUX
35 | #else
36 | #error -- "Unsupported OS"
37 | #endif
38 |
39 |
40 | // Definition of compiler
41 |
42 | #define COMPILER_VC 1
43 | #define COMPILER_GCC 2
44 | #define COMPILER_CLANG 3
45 |
46 | #if defined(_MSC_VER) // Microsoft Visual C compiler
47 | #define COMPILER COMPILER_VC
48 | #elif defined(__GNUC__) // GNU GCC compiler
49 | #define COMPILER COMPILER_GCC
50 | #elif defined(__clang__) // Clang compiler
51 | #define COMPILER COMPILER_CLANG
52 | #else
53 | #error -- "Unsupported COMPILER"
54 | #endif
55 |
56 |
57 | // Definition of the targeted architecture and basic data types
58 |
59 | #define TARGET_AMD64 1
60 | #define TARGET_x86 2
61 | #define TARGET_ARM 3
62 |
63 | #if defined(_AMD64_)
64 | #define TARGET TARGET_AMD64
65 | #define RADIX 64
66 | typedef uint64_t digit_t; // Unsigned 64-bit digit
67 | typedef int64_t sdigit_t; // Signed 64-bit digit
68 | #elif defined(_X86_)
69 | #define TARGET TARGET_x86
70 | #define RADIX 32
71 | typedef uint32_t digit_t; // Unsigned 32-bit digit
72 | typedef int32_t sdigit_t; // Signed 32-bit digit
73 | #elif defined(_ARM_)
74 | #define TARGET TARGET_ARM
75 | #define RADIX 32
76 | typedef uint32_t digit_t; // Unsigned 32-bit digit
77 | typedef int32_t sdigit_t; // Signed 32-bit digit
78 | #else
79 | #error -- "Unsupported ARCHITECTURE"
80 | #endif
81 |
82 |
83 | // Instruction support
84 |
85 | #define NO_SIMD_SUPPORT 0
86 | #define AVX_SUPPORT 1
87 | #define AVX2_SUPPORT 2
88 |
89 | #if defined(_AVX2_)
90 | #define SIMD_SUPPORT AVX2_SUPPORT // AVX2 support selection
91 | #elif defined(_AVX_)
92 | #define SIMD_SUPPORT AVX_SUPPORT // AVX support selection
93 | #else
94 | #define SIMD_SUPPORT NO_SIMD_SUPPORT
95 | #endif
96 |
97 | #if defined(_ASM_) // Assembly support selection
98 | #define ASM_SUPPORT
99 | #endif
100 |
101 | #if defined(_GENERIC_) // Selection of generic, portable implementation
102 | #define GENERIC_IMPLEMENTATION
103 | #endif
104 |
105 |
106 | // Unsupported configurations
107 |
108 | #if defined(ASM_SUPPORT) && (OS_TARGET == OS_WIN)
109 | #error -- "Assembly is not supported on this platform"
110 | #endif
111 |
112 | #if defined(ASM_SUPPORT) && defined(GENERIC_IMPLEMENTATION)
113 | #error -- "Unsupported configuration"
114 | #endif
115 |
116 | #if (SIMD_SUPPORT != NO_SIMD_SUPPORT) && defined(GENERIC_IMPLEMENTATION)
117 | #error -- "Unsupported configuration"
118 | #endif
119 |
120 | #if (TARGET != TARGET_AMD64) && !defined(GENERIC_IMPLEMENTATION)
121 | #error -- "Unsupported configuration"
122 | #endif
123 |
124 | #if (OS_TARGET == OS_LINUX) && defined(ASM_SUPPORT) && (SIMD_SUPPORT != AVX2_SUPPORT)
125 | #error -- "Unsupported configuration"
126 | #endif
127 |
128 |
129 | // Definitions of the error-handling type and error codes
130 |
131 | typedef enum {
132 | CRYPTO_SUCCESS, // 0x00
133 | CRYPTO_ERROR, // 0x01
134 | CRYPTO_ERROR_DURING_TEST, // 0x02
135 | CRYPTO_ERROR_UNKNOWN, // 0x03
136 | CRYPTO_ERROR_NOT_IMPLEMENTED, // 0x04
137 | CRYPTO_ERROR_NO_MEMORY, // 0x05
138 | CRYPTO_ERROR_INVALID_PARAMETER, // 0x06
139 | CRYPTO_ERROR_SHARED_KEY, // 0x07
140 | CRYPTO_ERROR_TOO_MANY_ITERATIONS, // 0x08
141 | CRYPTO_ERROR_END_OF_LIST
142 | } CRYPTO_STATUS;
143 |
144 | #define CRYPTO_STATUS_TYPE_SIZE (CRYPTO_ERROR_END_OF_LIST)
145 |
146 |
147 | // Definitions of the error messages
148 | // NOTE: they must match the error codes above
149 |
150 | #define CRYPTO_MSG_SUCCESS "CRYPTO_SUCCESS"
151 | #define CRYPTO_MSG_ERROR "CRYPTO_ERROR"
152 | #define CRYPTO_MSG_ERROR_DURING_TEST "CRYPTO_ERROR_DURING_TEST"
153 | #define CRYPTO_MSG_ERROR_UNKNOWN "CRYPTO_ERROR_UNKNOWN"
154 | #define CRYPTO_MSG_ERROR_NOT_IMPLEMENTED "CRYPTO_ERROR_NOT_IMPLEMENTED"
155 | #define CRYPTO_MSG_ERROR_NO_MEMORY "CRYPTO_ERROR_NO_MEMORY"
156 | #define CRYPTO_MSG_ERROR_INVALID_PARAMETER "CRYPTO_ERROR_INVALID_PARAMETER"
157 | #define CRYPTO_MSG_ERROR_SHARED_KEY "CRYPTO_ERROR_SHARED_KEY"
158 | #define CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS "CRYPTO_ERROR_TOO_MANY_ITERATIONS"
159 |
160 |
161 | // Definition of type "RandomBytes" to implement callback function outputting "nbytes" of random values to "random_array"
162 | typedef CRYPTO_STATUS (*RandomBytes)(unsigned int nbytes, unsigned char* random_array);
163 |
164 | // Definition of type "ExtendableOutput" to implement callback function outputting 32-bit "array_ndigits" of values to "extended_array"
165 | typedef CRYPTO_STATUS (*ExtendableOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array);
166 |
167 | // Definition of type "StreamOutput" to implement callback function outputting 32-bit "array_ndigits" of values to "stream_array"
168 | typedef CRYPTO_STATUS (*StreamOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array);
169 |
170 |
171 | // Basic key-exchange constants
172 | #define PKA_BYTES 1824 // Alice's public key size
173 | #define PKB_BYTES 2048 // Bob's public key size
174 | #define SHAREDKEY_BYTES 32 // Shared key size
175 |
176 |
177 | // This data struct is initialized during setup with user-provided functions
178 | typedef struct
179 | {
180 | RandomBytes RandomBytesFunction; // Function providing random bytes
181 | ExtendableOutput ExtendableOutputFunction; // Extendable output function
182 | StreamOutput StreamOutputFunction; // Stream cipher function
183 | } LatticeCryptoStruct, *PLatticeCryptoStruct;
184 |
185 |
186 | /******************** Function prototypes *******************/
187 | /*********************** Auxiliary API **********************/
188 |
189 | // Clear digits from memory. "nwords" indicates the number of digits to be zeroed.
190 | extern void clear_words(void* mem, digit_t nwords);
191 |
192 | // Output "nbytes" of random values.
193 | // It makes requests of random values to RandomBytesFunction. If successful, the output is given in "random_array".
194 | // The caller is responsible for providing the "RandomBytesFunction" function passing random value as octets.
195 | CRYPTO_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction);
196 |
197 | // Output "array_ndigits" of values in [0, q-1] using an extendable-output function and a seed of size "seed_nbytes".
198 | // It makes requests of values to ExtendableOutputFunction. If successful, the output is given in "extended_array".
199 | // The caller is responsible for providing the "ExtendableOutputFunction" function passing values as 32-bit digits.
200 | CRYPTO_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction);
201 |
202 | // Output "array_nbytes" of values using a stream cipher, a seed of size "seed_nbytes" and a nonce of size "nonce_nbytes".
203 | // It makes requests of values to StreamOutputFunction. If successful, the output is given in "stream_array".
204 | // The caller is responsible for providing the "StreamOutputFunction" function passing values as octets.
205 | CRYPTO_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction);
206 |
207 | // Dynamic allocation of memory for LatticeCrypto structure. It should be called before initialization with LatticeCrypto_initialize(). Returns NULL on error.
208 | PLatticeCryptoStruct LatticeCrypto_allocate(void);
209 |
210 | // Initialize structure pLatticeCrypto with user-provided functions: RandomBytesFunction, ExtendableOutputFunction and StreamOutputFunction.
211 | CRYPTO_STATUS LatticeCrypto_initialize(PLatticeCryptoStruct pLatticeCrypto, RandomBytes RandomBytesFunction, ExtendableOutput ExtendableOutputFunction, StreamOutput StreamOutputFunction);
212 |
213 | // Output error/success message for a given CRYPTO_STATUS
214 | const char* LatticeCrypto_get_error_message(CRYPTO_STATUS Status);
215 |
216 | /*********************** Key exchange API ***********************/
217 |
218 | // Alice's key generation
219 | // It produces a private key SecretKeyA and computes the public key PublicKeyA.
220 | // Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
221 | // the public key PublicKeyA that occupies 1824 bytes
222 | // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
223 | CRYPTO_STATUS KeyGeneration_A(int32_t* SecretKeyA, unsigned char* PublicKeyA, PLatticeCryptoStruct pLatticeCrypto);
224 |
225 | // Bob's key generation and shared secret computation
226 | // It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes
227 | // the shared secret SharedSecretB.
228 | // Input: Alice's public key PublicKeyA that consists of 1824 bytes
229 | // Outputs: the public key PublicKeyB that occupies 2048 bytes.
230 | // the 256-bit shared secret SharedSecretB.
231 | // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
232 | CRYPTO_STATUS SecretAgreement_B(unsigned char* PublicKeyA, unsigned char* SharedSecretB, unsigned char* PublicKeyB, PLatticeCryptoStruct pLatticeCrypto);
233 |
234 | // Alice's shared secret computation
235 | // It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA.
236 | // Inputs: Bob's public key PublicKeyB that consists of 2048 bytes
237 | // the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total)
238 | // Output: the 256-bit shared secret SharedSecretA.
239 | // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize().
240 | CRYPTO_STATUS SecretAgreement_A(unsigned char* PublicKeyB, int32_t* SecretKeyA, unsigned char* SharedSecretA);
241 |
242 |
243 | #ifdef __cplusplus
244 | }
245 | #endif
246 |
247 |
248 | #endif
249 |
--------------------------------------------------------------------------------
/Visual Studio/LatticeCrypto/LatticeCrypto.vcxproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | Win32
7 |
8 |
9 | Debug
10 | x64
11 |
12 |
13 | Generic
14 | Win32
15 |
16 |
17 | Generic
18 | x64
19 |
20 |
21 | Release
22 | Win32
23 |
24 |
25 | Release
26 | x64
27 |
28 |
29 |
30 | {8283DD76-E88A-4B63-ABDE-33F014178413}
31 | Win32Proj
32 | isoECClib
33 |
34 |
35 |
36 | StaticLibrary
37 | true
38 | v120
39 | Unicode
40 |
41 |
42 | StaticLibrary
43 | true
44 | v120
45 | Unicode
46 |
47 |
48 | StaticLibrary
49 | false
50 | v120
51 | true
52 | Unicode
53 |
54 |
55 | StaticLibrary
56 | false
57 | v120
58 | true
59 | Unicode
60 |
61 |
62 | StaticLibrary
63 | false
64 | v120
65 | true
66 | Unicode
67 |
68 |
69 | StaticLibrary
70 | false
71 | v120
72 | true
73 | Unicode
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 | Level4
103 | Disabled
104 | __WINDOWS__; _X86_; _GENERIC_;
105 | ProgramDatabase
106 | false
107 | false
108 | false
109 | Default
110 | MultiThreadedDLL
111 | true
112 |
113 |
114 | Windows
115 | true
116 |
117 |
118 |
119 |
120 |
121 |
122 | Level4
123 | Disabled
124 | __WINDOWS__; _AMD64_; _GENERIC_;
125 | ProgramDatabase
126 | false
127 | false
128 | true
129 | Default
130 | MultiThreadedDLL
131 |
132 |
133 | AdvancedVectorExtensions
134 |
135 |
136 | Windows
137 | true
138 |
139 |
140 |
141 |
142 | Level4
143 |
144 |
145 | MaxSpeed
146 | true
147 | true
148 | __WINDOWS__; _X86_; _GENERIC_;
149 | MultiThreadedDLL
150 |
151 |
152 | Windows
153 | true
154 | true
155 | true
156 |
157 |
158 |
159 |
160 | Level4
161 |
162 |
163 | MaxSpeed
164 | true
165 | true
166 | __WINDOWS__; _AMD64_; _GENERIC_;
167 | MultiThreadedDLL
168 | AdvancedVectorExtensions
169 |
170 |
171 | Windows
172 | true
173 | true
174 | true
175 |
176 |
177 |
178 |
179 | Level4
180 |
181 |
182 | MaxSpeed
183 | true
184 | true
185 | __WINDOWS__; _X86_; _GENERIC_;
186 | MultiThreadedDLL
187 |
188 |
189 | Windows
190 | true
191 | true
192 | true
193 |
194 |
195 |
196 |
197 | Level4
198 |
199 |
200 | MaxSpeed
201 | true
202 | true
203 | __WINDOWS__; _AMD64_; _GENERIC_;
204 | MultiThreadedDLL
205 | AdvancedVectorExtensions
206 |
207 |
208 | Windows
209 | true
210 | true
211 | true
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
--------------------------------------------------------------------------------
/tests/tests.c:
--------------------------------------------------------------------------------
1 | /****************************************************************************************
2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
3 | *
4 | * Copyright (c) Microsoft Corporation. All rights reserved.
5 | *
6 | *
7 | * Abstract: testing code
8 | *
9 | *****************************************************************************************/
10 |
11 | #include "../LatticeCrypto_priv.h"
12 | #include "test_extras.h"
13 | #include
14 | #include
15 |
16 | extern const int32_t psi_rev_ntt1024_12289[PARAMETER_N];
17 | extern const int32_t omegainv_rev_ntt1024_12289[PARAMETER_N];
18 | extern const int32_t omegainv7N_rev_ntt1024_12289;
19 | extern const int32_t omegainv10N_rev_ntt1024_12289;
20 | extern const int32_t Ninv8_ntt1024_12289;
21 | extern const int32_t Ninv11_ntt1024_12289;
22 |
23 | // Benchmark and test parameters
24 | #define BENCH_LOOPS 1000 // Number of iterations per bench
25 | #define TEST_LOOPS 100 // Number of iterations per test
26 |
27 |
28 | bool ntt_test()
29 | { // Tests for the NTT functions
30 | bool OK = true;
31 | int n, passed;
32 | int32_t a[PARAMETER_N], b[PARAMETER_N], c[PARAMETER_N], d[PARAMETER_N], e[PARAMETER_N], f[PARAMETER_N], g[PARAMETER_N], ff[PARAMETER_N];
33 | unsigned int pbits = 14;
34 |
35 | printf("\n--------------------------------------------------------------------------------------------------------\n\n");
36 | printf("Testing NTT functions: \n\n");
37 |
38 | passed = 1;
39 | for (n=0; n m=32
35 | mov r9, 1 // m = 1
36 | mov rax, reg_p3
37 | mov r12, reg_p3
38 | shr r12, 4 // n/16
39 | vmovdqu ymm14, MASK12x8
40 | vmovdqu ymm12, PERM0246
41 | mov r14, 16
42 | mov rcx, 11
43 | loop1:
44 | shr rax, 1 // k = k/2
45 | dec rcx
46 | xor rdx, rdx // i = 0
47 | loop2:
48 | mov r10, rdx
49 | mov r11, rax
50 | dec r11
51 | shl r10, cl // j1
52 | add r11, r10 // j2
53 | mov r13, r9
54 | add r13, rdx // m+i
55 | vbroadcastss ymm11, DWORD PTR [reg_p2+4*r13] // S
56 |
57 | loop3:
58 | mov r13, r10
59 | add r13, rax // j+k
60 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r13] // a[j+k]
61 | vpmovsxdq ymm3, XMMWORD PTR [reg_p1+4*r13+16] // a[j+k]
62 | vpmovsxdq ymm5, XMMWORD PTR [reg_p1+4*r13+32] // a[j+k]
63 | vpmovsxdq ymm7, XMMWORD PTR [reg_p1+4*r13+48] // a[j+k]
64 |
65 | vpmuldq ymm1, ymm1, ymm11 // a[j+k].S
66 | vpmuldq ymm3, ymm3, ymm11
67 | vpmuldq ymm5, ymm5, ymm11
68 | vpmuldq ymm7, ymm7, ymm11
69 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j]
70 |
71 | vmovdqu ymm13, ymm1
72 | vpand ymm1, ymm14, ymm1 // c0
73 | vpsrlq ymm13, ymm13, 12 // c1
74 | vpslld ymm15, ymm1, 1 // 2*c0
75 | vpsubd ymm13, ymm1, ymm13 // c0-c1
76 | vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1
77 | vpsubd ymm1, ymm0, ymm13 // a[j+k] = U - V
78 | vpaddd ymm0, ymm0, ymm13 // a[j] = U + V
79 | vpermd ymm1, ymm12, ymm1
80 | vpermd ymm0, ymm12, ymm0
81 | vpmovsxdq ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j]
82 |
83 | vmovdqu ymm13, ymm3
84 | vpand ymm3, ymm14, ymm3 // c0
85 | vpsrlq ymm13, ymm13, 12 // c1
86 | vpslld ymm15, ymm3, 1 // 2*c0
87 | vpsubd ymm13, ymm3, ymm13 // c0-c1
88 | vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1
89 | vpsubd ymm3, ymm2, ymm13 // a[j+k] = U - V
90 | vpaddd ymm2, ymm2, ymm13 // a[j] = U + V
91 | vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0
92 | vmovdqu XMMWORD PTR [reg_p1+4*r13], xmm1
93 | vpermd ymm3, ymm12, ymm3
94 | vpermd ymm2, ymm12, ymm2
95 | vpmovsxdq ymm4, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j]
96 |
97 | vmovdqu ymm13, ymm5
98 | vpand ymm5, ymm14, ymm5 // c0
99 | vpsrlq ymm13, ymm13, 12 // c1
100 | vpslld ymm15, ymm5, 1 // 2*c0
101 | vpsubd ymm13, ymm5, ymm13 // c0-c1
102 | vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1
103 | vpsubd ymm5, ymm4, ymm13 // a[j+k] = U - V
104 | vpaddd ymm4, ymm4, ymm13 // a[j] = U + V
105 | vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2
106 | vmovdqu XMMWORD PTR [reg_p1+4*r13+16], xmm3
107 | vpermd ymm5, ymm12, ymm5
108 | vpermd ymm4, ymm12, ymm4
109 | vpmovsxdq ymm6, XMMWORD PTR [reg_p1+4*r10+48] // U = a[j]
110 |
111 | vmovdqu ymm13, ymm7
112 | vpand ymm7, ymm14, ymm7 // c0
113 | vpsrlq ymm13, ymm13, 12 // c1
114 | vpslld ymm15, ymm7, 1 // 2*c0
115 | vpsubd ymm13, ymm7, ymm13 // c0-c1
116 | vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1
117 | vpsubd ymm7, ymm6, ymm13 // a[j+k] = U - V
118 | vpaddd ymm6, ymm6, ymm13 // a[j] = U + V
119 | vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm4
120 | vmovdqu XMMWORD PTR [reg_p1+4*r13+32], xmm5
121 | vpermd ymm6, ymm12, ymm6
122 | vpermd ymm7, ymm12, ymm7
123 | vmovdqu XMMWORD PTR [reg_p1+4*r13+48], xmm7
124 | vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm6
125 |
126 | add r10, r14
127 | cmp r10, r11
128 | jl loop3
129 | inc rdx
130 | cmp rdx, r9
131 | jl loop2
132 | shl r9, 1
133 | cmp r9, r12
134 | jl loop1
135 |
136 | // Stage m=64
137 | xor rdx, rdx // i = 0
138 | xor r10, r10 // j1 = 0
139 | loop4:
140 | vbroadcastss ymm11, DWORD PTR [reg_p2+4*rdx+4*64] // S
141 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+32] // a[j+k]
142 | vpmovsxdq ymm3, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k]
143 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j]
144 | vpmovsxdq ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j]
145 | vpmuldq ymm1, ymm1, ymm11 // a[j+k].S
146 | vpmuldq ymm3, ymm3, ymm11 // a[j+k].S
147 |
148 | vmovdqu ymm13, ymm1
149 | vpand ymm1, ymm14, ymm1 // c0
150 | vpsrlq ymm13, ymm13, 12 // c1
151 | vpslld ymm15, ymm1, 1 // 2*c0
152 | vpsubd ymm13, ymm1, ymm13 // c0-c1
153 | vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1
154 |
155 | vmovdqu ymm10, ymm3
156 | vpand ymm3, ymm14, ymm3 // c0
157 | vpsrlq ymm10, ymm10, 12 // c1
158 | vpslld ymm15, ymm3, 1 // 2*c0
159 | vpsubd ymm10, ymm3, ymm10 // c0-c1
160 | vpaddd ymm10, ymm10, ymm15 // V = 3*c0-c1
161 |
162 | vpsubd ymm1, ymm0, ymm13 // a[j+k] = U - V
163 | vpaddd ymm0, ymm0, ymm13 // a[j] = U + V
164 | vpsubd ymm3, ymm2, ymm10 // a[j+k] = U - V
165 | vpaddd ymm2, ymm2, ymm10 // a[j] = U + V
166 |
167 | vpermd ymm0, ymm12, ymm0
168 | vpermd ymm1, ymm12, ymm1
169 | vpermd ymm2, ymm12, ymm2
170 | vpermd ymm3, ymm12, ymm3
171 | vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0
172 | vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm1
173 | vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2
174 | vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm3
175 |
176 | add r10, r14 // j+16
177 | inc rdx // i+1
178 | cmp rdx, r9
179 | jl loop4
180 |
181 | // Stage m=128
182 | shl r9, 1
183 | xor rdx, rdx // i = 0
184 | xor r10, r10 // j1 = 0
185 | mov r13, 8
186 | loop6:
187 | vbroadcastss ymm2, DWORD PTR [reg_p2+4*rdx+4*128] // S
188 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k]
189 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j]
190 | vpmuldq ymm1, ymm1, ymm2 // a[j+k].S
191 |
192 | vmovdqu ymm3, ymm0
193 | vpand ymm0, ymm14, ymm0 // c0
194 | vpsrad ymm3, ymm3, 12 // c1
195 | vpslld ymm4, ymm0, 1 // 2*c0
196 | vpsubd ymm3, ymm0, ymm3 // c0-c1
197 | vpaddd ymm0, ymm3, ymm4 // U = 3*c0-c1
198 |
199 | vmovdqu ymm3, ymm1
200 | vpand ymm1, ymm14, ymm1 // c0
201 | vpsrlq ymm4, ymm3, 24 // c2
202 | vpsrad ymm3, ymm3, 12 // xc1
203 | vpand ymm3, ymm14, ymm3 // c1
204 | vpslld ymm5, ymm1, 3 // 8*c0
205 | vpaddd ymm4, ymm1, ymm4 // c0+c2
206 | vpaddd ymm4, ymm4, ymm5 // 9*c0+c2
207 | vpslld ymm5, ymm3, 1 // 2*c1
208 | vpaddd ymm1, ymm0, ymm3 // U+c1
209 | vpsubd ymm0, ymm0, ymm3 // U-c1
210 | vpsubd ymm4, ymm4, ymm5 // 9*c0-2*c1+c2
211 | vpaddd ymm0, ymm0, ymm4 // U+(9*c0-3*c1+c2)
212 | vpsubd ymm1, ymm1, ymm4 // U-(9*c0-3*c1+c2)
213 | vpermd ymm0, ymm12, ymm0
214 | vpermd ymm1, ymm12, ymm1
215 | vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0
216 | vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm1
217 |
218 | add r10, r13 // j+8
219 | inc rdx // i+1
220 | cmp rdx, r9
221 | jl loop6
222 |
223 | // Stage m=256
224 | vmovdqu ymm9, PERM02134657
225 | shl r9, 1
226 | xor rdx, rdx // i = 0
227 | xor r10, r10 // j1 = 0
228 | mov r14, 32
229 | loop7:
230 | vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256] // S = psi[m+i]->psi[m+i+3]
231 | vpermq ymm8, ymm2, 0x50
232 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j]->a[j+3]
233 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k]->a[j+k+3]
234 | vpermq ymm3, ymm0, 0x4e
235 | vinserti128 ymm0, ymm0, xmm1, 1 // U
236 | vpblendd ymm1, ymm1, ymm3, 15
237 | vpmuldq ymm3, ymm1, ymm8 // a[j+k].S
238 | vmovdqu ymm4, ymm3
239 | vpand ymm3, ymm14, ymm3 // c0
240 | vpsrlq ymm4, ymm4, 12 // c1
241 | vpslld ymm5, ymm3, 1 // 2*c0
242 | vpsubd ymm4, ymm3, ymm4 // c0-c1
243 | vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1
244 | vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V
245 | vpaddd ymm0, ymm0, ymm4 // a[j] = U + V
246 | vpslldq ymm1, ymm1, 4
247 | vpblendd ymm0, ymm0, ymm1, 0xaa
248 | vpermd ymm0, ymm9, ymm0
249 | vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0
250 |
251 | vpermq ymm8, ymm2, 0xfa
252 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j]->a[j+3]
253 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k]->a[j+k+3]
254 | vpermq ymm3, ymm0, 0x4e
255 | vinserti128 ymm0, ymm0, xmm1, 1 // U
256 | vpblendd ymm1, ymm1, ymm3, 15
257 | vpmuldq ymm3, ymm1, ymm8 // a[j+k].S
258 | vmovdqu ymm4, ymm3
259 | vpand ymm3, ymm14, ymm3 // c0
260 | vpsrlq ymm4, ymm4, 12 // c1
261 | vpslld ymm5, ymm3, 1 // 2*c0
262 | vpsubd ymm4, ymm3, ymm4 // c0-c1
263 | vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1
264 | vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V
265 | vpaddd ymm0, ymm0, ymm4 // a[j] = U + V
266 | vpslldq ymm1, ymm1, 4
267 | vpblendd ymm0, ymm0, ymm1, 0xaa
268 | vpermd ymm0, ymm9, ymm0
269 | vmovdqu YMMWORD PTR [reg_p1+4*r10+32], ymm0
270 |
271 | vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256+16] // S = psi[m+i]->psi[m+i+3]
272 | vpermq ymm8, ymm2, 0x50
273 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10+64] // U = a[j]->a[j+3]
274 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+80] // a[j+k]->a[j+k+3]
275 | vpermq ymm3, ymm0, 0x4e
276 | vinserti128 ymm0, ymm0, xmm1, 1 // U
277 | vpblendd ymm1, ymm1, ymm3, 15
278 | vpmuldq ymm3, ymm1, ymm8 // a[j+k].S
279 | vmovdqu ymm4, ymm3
280 | vpand ymm3, ymm14, ymm3 // c0
281 | vpsrlq ymm4, ymm4, 12 // c1
282 | vpslld ymm5, ymm3, 1 // 2*c0
283 | vpsubd ymm4, ymm3, ymm4 // c0-c1
284 | vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1
285 | vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V
286 | vpaddd ymm0, ymm0, ymm4 // a[j] = U + V
287 | vpslldq ymm1, ymm1, 4
288 | vpblendd ymm0, ymm0, ymm1, 0xaa
289 | vpermd ymm0, ymm9, ymm0
290 | vmovdqu YMMWORD PTR [reg_p1+4*r10+64], ymm0
291 |
292 | vpermq ymm8, ymm2, 0xfa
293 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10+96] // U = a[j]->a[j+3]
294 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+112] // a[j+k]->a[j+k+3]
295 | vpermq ymm3, ymm0, 0x4e
296 | vinserti128 ymm0, ymm0, xmm1, 1 // U
297 | vpblendd ymm1, ymm1, ymm3, 15
298 | vpmuldq ymm3, ymm1, ymm8 // a[j+k].S
299 | vmovdqu ymm4, ymm3
300 | vpand ymm3, ymm14, ymm3 // c0
301 | vpsrlq ymm4, ymm4, 12 // c1
302 | vpslld ymm5, ymm3, 1 // 2*c0
303 | vpsubd ymm4, ymm3, ymm4 // c0-c1
304 | vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1
305 | vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V
306 | vpaddd ymm0, ymm0, ymm4 // a[j] = U + V
307 | vpslldq ymm1, ymm1, 4
308 | vpblendd ymm0, ymm0, ymm1, 0xaa
309 | vpermd ymm0, ymm9, ymm0
310 | vmovdqu YMMWORD PTR [reg_p1+4*r10+96], ymm0
311 |
312 | add r10, r14 // j+32
313 | add rdx, r13 // i+8
314 | cmp rdx, r9
315 | jl loop7
316 |
317 | // Stage m=512
318 | vmovdqu ymm9, PERM00224466
319 | shl r9, 1 // m = n/2
320 | xor rdx, rdx // i = 0
321 | xor r10, r10 // j1 = 0
322 | mov r14, 4
323 | loop8:
324 | vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*rdx+4*512] // S
325 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10] // U = a[j]
326 | vmovdqu ymm1, YMMWORD PTR [reg_p1+4*r10+4] // a[j+k]
327 | vpmuldq ymm3, ymm1, ymm2 // a[j+k].S
328 | vmovdqu ymm4, ymm3
329 | vpand ymm3, ymm14, ymm3 // c0
330 | vpsrlq ymm4, ymm4, 12 // c1
331 | vpslld ymm5, ymm3, 1 // 2*c0
332 | vpsubd ymm4, ymm3, ymm4 // c0-c1
333 | vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1
334 | vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V
335 | vpaddd ymm0, ymm0, ymm4 // a[j] = U + V
336 | vpermd ymm1, ymm9, ymm1
337 | vpblendd ymm0, ymm0, ymm1, 0xaa
338 | vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0
339 |
340 | add r10, r13 // j+8
341 | add rdx, r14 // i+4
342 | cmp rdx, r9
343 | jl loop8
344 |
345 | pop r14
346 | pop r13
347 | pop r12
348 | ret
349 |
350 |
351 | //***********************************************************************
352 | // Inverse NTT
353 | // Operation: a [reg_p1] <- INTT(a) [reg_p1],
354 | // [reg_p2] points to table
355 | // reg_p3 and reg_p4 point to constants for scaling and
356 | // reg_p5 contains parameter n
357 | //***********************************************************************
// Gentleman-Sande (decimation-in-frequency) butterflies, bit-reversed input
// to standard-order output.
//
// Modular reduction used throughout ("K-RED" style, q = 12289 = 3*2^12 + 1):
// split t = 2^12*c1 + c0 (c0 = t & 0xfff, c1 = t >> 12); then
//   3*c0 - c1 = 3*t - q*c1 == 3*t (mod q),
// so each pass returns a small representative of 3*t mod q. Results stay only
// partially reduced (and may be negative); full reduction is done separately
// (see two_reduce12289_asm).
//
// NOTE(review): the fixed table offsets (4*512, 4*256, 4*128, 4*64) and the
// scaling-stage bound assume n = 1024 even though n arrives in reg_p5 --
// confirm callers never pass a different n.
358 | .global INTT_GS_rev2std_12289_asm
359 | INTT_GS_rev2std_12289_asm:
360 | push r12                                   // save callee-saved registers used below
361 | push r13
362 | push r14
363 | push r15
364 | push rbx
365 | 
366 | // Stage m=1024
367 | vmovdqu ymm9, PERM00224466                 // lane permutation for interleaving reduced results
368 | vmovdqu ymm14, MASK12x8                    // 8 x 32-bit mask of the low 12 bits (0xfff)
369 | mov r12, reg_p5
370 | shr r12, 1 // n/2 = 512
371 | xor r15, r15 // i = 0
372 | xor r10, r10 // j1 = 0
373 | mov r13, 8
374 | mov r14, 4
375 | loop1b:
376 | vmovdqu ymm1, YMMWORD PTR [reg_p1+4*r10+4] // V = a[j+k]  (k=1: odd elements, loaded with 4-byte offset)
377 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10] // U = a[j]
378 | vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*r15+4*512] // S  (4 twiddles sign-extended to 64-bit lanes)
379 | vpsubd ymm3, ymm0, ymm1 // U - V
380 | vpaddd ymm0, ymm0, ymm1 // U + V
381 | vpmuldq ymm3, ymm3, ymm2 // (U - V).S   32x32->64-bit products; hence the 64-bit vpsrlq below
382 | vmovdqu ymm4, ymm3
383 | vpand ymm3, ymm14, ymm3 // c0
384 | vpsrlq ymm4, ymm4, 12 // c1
385 | vpslld ymm5, ymm3, 1 // 2*c0
386 | vpsubd ymm4, ymm3, ymm4 // c0-c1
387 | vpaddd ymm1, ymm4, ymm5 // 3*c0-c1  (K-RED of the product)
388 | vpermd ymm1, ymm9, ymm1
389 | vpblendd ymm0, ymm0, ymm1, 0xaa            // merge reduced odd lanes (mask 0xaa) with unreduced sums
390 | vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0
391 | 
392 | add r10, r13 // j+8
393 | add r15, r14 // i+4
394 | cmp r15, r12
395 | jl loop1b
396 | 
397 | // Stage m=512  (k=2; unrolled x4, 32 coefficients per iteration)
398 | vmovdqu ymm9, PERM02134657
399 | vmovdqu ymm13, PERM0145
400 | vmovdqu ymm15, PERM2367
401 | shr r12, 1 // n/4 = 256
402 | xor r15, r15 // i = 0
403 | xor r10, r10 // j1 = 0
404 | mov r14, 32
405 | loop2b:
406 | vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*r15+4*256] // S = psi[m+i]->psi[m+i+3]
407 | vpermq ymm8, ymm2, 0x50                    // duplicate twiddles 0,1 across the register
408 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10] // U = a[j]->a[j+7]
409 | vpermd ymm1, ymm15, ymm0                   // gather V operands (elements 2,3,6,7)
410 | vpermd ymm0, ymm13, ymm0                   // gather U operands (elements 0,1,4,5)
411 | vpsubd ymm3, ymm0, ymm1 // U - V
412 | vpaddd ymm0, ymm0, ymm1 // U + V
413 | vpmuldq ymm3, ymm3, ymm8 // (U - V).S
414 | vmovdqu ymm4, ymm3
415 | vpand ymm3, ymm14, ymm3 // c0
416 | vpsrlq ymm4, ymm4, 12 // c1
417 | vpslld ymm5, ymm3, 1 // 2*c0
418 | vpsubd ymm4, ymm3, ymm4 // c0-c1
419 | vpaddd ymm1, ymm4, ymm5 // 3*c0-c1
420 | vpslldq ymm1, ymm1, 4                      // align reduced results into the odd 32-bit slots
421 | vpblendd ymm0, ymm0, ymm1, 0xaa
422 | vpermd ymm0, ymm9, ymm0                    // restore in-memory element order
423 | vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0
424 | 
425 | vpermq ymm8, ymm2, 0xfa                    // duplicate twiddles 2,3 for the next 8 coefficients
426 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10+32] // U = a[j]->a[j+7]
427 | vpermd ymm1, ymm15, ymm0
428 | vpermd ymm0, ymm13, ymm0
429 | vpsubd ymm3, ymm0, ymm1 // U - V
430 | vpaddd ymm0, ymm0, ymm1 // U + V
431 | vpmuldq ymm3, ymm3, ymm8 // (U - V).S
432 | vmovdqu ymm4, ymm3
433 | vpand ymm3, ymm14, ymm3 // c0
434 | vpsrlq ymm4, ymm4, 12 // c1
435 | vpslld ymm5, ymm3, 1 // 2*c0
436 | vpsubd ymm4, ymm3, ymm4 // c0-c1
437 | vpaddd ymm1, ymm4, ymm5 // 3*c0-c1
438 | vpslldq ymm1, ymm1, 4
439 | vpblendd ymm0, ymm0, ymm1, 0xaa
440 | vpermd ymm0, ymm9, ymm0
441 | vmovdqu YMMWORD PTR [reg_p1+4*r10+32], ymm0
442 | 
443 | vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*r15+4*256+16]// S = psi[m+i]->psi[m+i+3]  (next 4 twiddles)
444 | vpermq ymm8, ymm2, 0x50
445 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10+64] // U = a[j]->a[j+7]
446 | vpermd ymm1, ymm15, ymm0
447 | vpermd ymm0, ymm13, ymm0
448 | vpsubd ymm3, ymm0, ymm1 // U - V
449 | vpaddd ymm0, ymm0, ymm1 // U + V
450 | vpmuldq ymm3, ymm3, ymm8 // (U - V).S
451 | vmovdqu ymm4, ymm3
452 | vpand ymm3, ymm14, ymm3 // c0
453 | vpsrlq ymm4, ymm4, 12 // c1
454 | vpslld ymm5, ymm3, 1 // 2*c0
455 | vpsubd ymm4, ymm3, ymm4 // c0-c1
456 | vpaddd ymm1, ymm4, ymm5 // 3*c0-c1
457 | vpslldq ymm1, ymm1, 4
458 | vpblendd ymm0, ymm0, ymm1, 0xaa
459 | vpermd ymm0, ymm9, ymm0
460 | vmovdqu YMMWORD PTR [reg_p1+4*r10+64], ymm0
461 | 
462 | vpermq ymm8, ymm2, 0xfa
463 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10+96] // U = a[j]->a[j+7]
464 | vpermd ymm1, ymm15, ymm0
465 | vpermd ymm0, ymm13, ymm0
466 | vpsubd ymm3, ymm0, ymm1 // U - V
467 | vpaddd ymm0, ymm0, ymm1 // U + V
468 | vpmuldq ymm3, ymm3, ymm8 // (U - V).S
469 | vmovdqu ymm4, ymm3
470 | vpand ymm3, ymm14, ymm3 // c0
471 | vpsrlq ymm4, ymm4, 12 // c1
472 | vpslld ymm5, ymm3, 1 // 2*c0
473 | vpsubd ymm4, ymm3, ymm4 // c0-c1
474 | vpaddd ymm1, ymm4, ymm5 // 3*c0-c1
475 | vpslldq ymm1, ymm1, 4
476 | vpblendd ymm0, ymm0, ymm1, 0xaa
477 | vpermd ymm0, ymm9, ymm0
478 | vmovdqu YMMWORD PTR [reg_p1+4*r10+96], ymm0
479 | 
480 | add r10, r14 // j+32
481 | add r15, r13 // i+8
482 | cmp r15, r12
483 | jl loop2b
484 | 
485 | // Stage m=256  (k=4; widen to 64-bit lanes, store back 4+4 packed results)
486 | vmovdqu ymm12, PERM0246                    // compress even 32-bit lanes of a 64-bit-lane vector into xmm
487 | shr r12, 1 // n/8 = 128
488 | xor r15, r15 // i = 0
489 | xor r10, r10 // j1 = 0
490 | loop3b:
491 | vbroadcastss ymm2, DWORD PTR [reg_p2+4*r15+4*128] // S  (one twiddle for all 4 butterflies)
492 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+16] // V = a[j+k]
493 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j]
494 | vpsubd ymm3, ymm0, ymm1 // U - V
495 | vpaddd ymm0, ymm0, ymm1 // U + V
496 | vpmuldq ymm3, ymm3, ymm2 // (U - V).S
497 | vmovdqu ymm4, ymm3
498 | vpand ymm3, ymm14, ymm3 // c0
499 | vpsrlq ymm4, ymm4, 12 // c1
500 | vpslld ymm5, ymm3, 1 // 2*c0
501 | vpsubd ymm4, ymm3, ymm4 // c0-c1
502 | vpaddd ymm1, ymm4, ymm5 // 3*c0-c1
503 | vpermd ymm0, ymm12, ymm0
504 | vpermd ymm1, ymm12, ymm1
505 | vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0
506 | vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm1
507 | 
508 | add r10, r13 // j+8
509 | inc r15 // i+1
510 | cmp r15, r12
511 | jl loop3b
512 | 
513 | // Stage m=128  (k=8; two butterfly quads per iteration)
514 | shr r12, 1 // n/16 = 64
515 | xor r15, r15 // i = 0
516 | xor r10, r10 // j1 = 0
517 | mov r14, 16
518 | loop4b:
519 | vbroadcastss ymm11, DWORD PTR [reg_p2+4*r15+4*64] // S
520 | vpmovsxdq ymm13, XMMWORD PTR [reg_p1+4*r10+32] // V = a[j+k]
521 | vpmovsxdq ymm15, XMMWORD PTR [reg_p1+4*r10+48] // V = a[j+k]
522 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j]
523 | vpmovsxdq ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j]
524 | vpsubd ymm1, ymm0, ymm13 // U - V
525 | vpaddd ymm0, ymm0, ymm13 // U + V
526 | vpsubd ymm3, ymm2, ymm15 // U - V
527 | vpaddd ymm2, ymm2, ymm15 // U + V
528 | vpmuldq ymm1, ymm1, ymm11 // (U - V).S
529 | vpmuldq ymm3, ymm3, ymm11 // (U - V).S
530 | 
531 | vmovdqu ymm13, ymm1                        // K-RED of first product
532 | vpand ymm1, ymm14, ymm1 // c0
533 | vpsrlq ymm13, ymm13, 12 // c1
534 | vpslld ymm15, ymm1, 1 // 2*c0
535 | vpsubd ymm13, ymm1, ymm13 // c0-c1
536 | vpaddd ymm1, ymm13, ymm15 // 3*c0-c1
537 | 
538 | vmovdqu ymm13, ymm3                        // K-RED of second product
539 | vpand ymm3, ymm14, ymm3 // c0
540 | vpsrlq ymm13, ymm13, 12 // c1
541 | vpslld ymm15, ymm3, 1 // 2*c0
542 | vpsubd ymm13, ymm3, ymm13 // c0-c1
543 | vpaddd ymm3, ymm13, ymm15 // 3*c0-c1
544 | 
545 | vpermd ymm0, ymm12, ymm0
546 | vpermd ymm1, ymm12, ymm1
547 | vpermd ymm2, ymm12, ymm2
548 | vpermd ymm3, ymm12, ymm3
549 | vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0
550 | vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm1
551 | vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2
552 | vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm3
553 | 
554 | add r10, r14 // j+16
555 | inc r15 // i+1
556 | cmp r15, r12
557 | jl loop4b
558 | 
559 | // Stages m=64 -> m=4
// Generic triple loop: r9 counts the 5 remaining stages down (5..1), rax = k
// doubles each stage, r8/r10 walk the j1/j indices, r13 = j+k.
// NOTE(review): the skip1..skip4 blocks run an extra reduction pass on both
// butterfly outputs only when r9 == rbx (rbx = 4, i.e. one specific stage),
// presumably to keep coefficient growth bounded across stages -- confirm
// against the reference C implementation.
560 | mov r9, 5 // 5 iterations
561 | mov rax, 8
562 | loop5b:
563 | shl rax, 1 // k = 2*k
564 | shr r12, 1 // m/2
565 | xor r15, r15 // i = 0
566 | xor r8, r8
567 | loop6b:
568 | mov r10, r8 // Load j1
569 | mov r11, rax
570 | dec r11
571 | add r11, r10 // j2
572 | mov r13, r12
573 | add r13, r15 // m/2+i
574 | vbroadcastss ymm9, DWORD PTR [reg_p2+4*r13] // S
575 | mov rbx, 4
576 | 
577 | loop7b:
578 | mov r13, r10
579 | add r13, rax // j+k
580 | vpmovsxdq ymm10, XMMWORD PTR [reg_p1+4*r13] // V = a[j+k]
581 | vpmovsxdq ymm11, XMMWORD PTR [reg_p1+4*r13+16] // V = a[j+k]
582 | vpmovsxdq ymm13, XMMWORD PTR [reg_p1+4*r13+32] // V = a[j+k]
583 | vpmovsxdq ymm15, XMMWORD PTR [reg_p1+4*r13+48] // V = a[j+k]
584 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j]
585 | vpmovsxdq ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j]
586 | vpmovsxdq ymm4, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j]
587 | vpmovsxdq ymm6, XMMWORD PTR [reg_p1+4*r10+48] // U = a[j]
588 | 
589 | vpsubd ymm1, ymm0, ymm10 // U - V
590 | vpaddd ymm0, ymm0, ymm10 // U + V
591 | vpsubd ymm3, ymm2, ymm11 // U - V
592 | vpaddd ymm2, ymm2, ymm11 // U + V
593 | vpsubd ymm5, ymm4, ymm13 // U - V
594 | vpaddd ymm4, ymm4, ymm13 // U + V
595 | vpsubd ymm7, ymm6, ymm15 // U - V
596 | vpaddd ymm6, ymm6, ymm15 // U + V
597 | 
598 | vpmuldq ymm1, ymm1, ymm9 // (U - V).S
599 | vpmuldq ymm3, ymm3, ymm9
600 | vpmuldq ymm5, ymm5, ymm9
601 | vpmuldq ymm7, ymm7, ymm9
602 | 
603 | vmovdqu ymm13, ymm1                        // K-RED of 64-bit product (logical shift for c1)
604 | vpand ymm1, ymm14, ymm1 // c0
605 | vpsrlq ymm13, ymm13, 12 // c1
606 | vpslld ymm15, ymm1, 1 // 2*c0
607 | vpsubd ymm13, ymm1, ymm13 // c0-c1
608 | vpaddd ymm1, ymm13, ymm15 // 3*c0-c1
609 | 
610 | cmp r9, rbx                                // extra reduction pass only when r9 == 4 (see note above)
611 | jne skip1
612 | vmovdqu ymm13, ymm0
613 | vpand ymm0, ymm14, ymm0 // c0
614 | vpsrad ymm13, ymm13, 12 // c1  (arithmetic shift: values are signed 32-bit here)
615 | vpslld ymm15, ymm0, 1 // 2*c0
616 | vpsubd ymm13, ymm0, ymm13 // c0-c1
617 | vpaddd ymm0, ymm13, ymm15 // 3*c0-c1
618 | 
619 | vmovdqu ymm13, ymm1
620 | vpand ymm1, ymm14, ymm1 // c0
621 | vpsrad ymm13, ymm13, 12 // c1
622 | vpslld ymm15, ymm1, 1 // 2*c0
623 | vpsubd ymm13, ymm1, ymm13 // c0-c1
624 | vpaddd ymm1, ymm13, ymm15 // 3*c0-c1
625 | skip1:
626 | vpermd ymm1, ymm12, ymm1
627 | vpermd ymm0, ymm12, ymm0
628 | 
629 | vmovdqu ymm13, ymm3
630 | vpand ymm3, ymm14, ymm3 // c0
631 | vpsrlq ymm13, ymm13, 12 // c1
632 | vpslld ymm15, ymm3, 1 // 2*c0
633 | vpsubd ymm13, ymm3, ymm13 // c0-c1
634 | vpaddd ymm3, ymm13, ymm15 // 3*c0-c1
635 | vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0
636 | vmovdqu XMMWORD PTR [reg_p1+4*r13], xmm1
637 | 
638 | cmp r9, rbx
639 | jne skip2
640 | vmovdqu ymm13, ymm2
641 | vpand ymm2, ymm14, ymm2 // c0
642 | vpsrad ymm13, ymm13, 12 // c1
643 | vpslld ymm15, ymm2, 1 // 2*c0
644 | vpsubd ymm13, ymm2, ymm13 // c0-c1
645 | vpaddd ymm2, ymm13, ymm15 // 3*c0-c1
646 | 
647 | vmovdqu ymm13, ymm3
648 | vpand ymm3, ymm14, ymm3 // c0
649 | vpsrad ymm13, ymm13, 12 // c1
650 | vpslld ymm15, ymm3, 1 // 2*c0
651 | vpsubd ymm13, ymm3, ymm13 // c0-c1
652 | vpaddd ymm3, ymm13, ymm15 // 3*c0-c1
653 | skip2:
654 | vpermd ymm3, ymm12, ymm3
655 | vpermd ymm2, ymm12, ymm2
656 | 
657 | vmovdqu ymm13, ymm5
658 | vpand ymm5, ymm14, ymm5 // c0
659 | vpsrlq ymm13, ymm13, 12 // c1
660 | vpslld ymm15, ymm5, 1 // 2*c0
661 | vpsubd ymm13, ymm5, ymm13 // c0-c1
662 | vpaddd ymm5, ymm13, ymm15 // 3*c0-c1
663 | vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2
664 | vmovdqu XMMWORD PTR [reg_p1+4*r13+16], xmm3
665 | 
666 | cmp r9, rbx
667 | jne skip3
668 | vmovdqu ymm13, ymm4
669 | vpand ymm4, ymm14, ymm4 // c0
670 | vpsrad ymm13, ymm13, 12 // c1
671 | vpslld ymm15, ymm4, 1 // 2*c0
672 | vpsubd ymm13, ymm4, ymm13 // c0-c1
673 | vpaddd ymm4, ymm13, ymm15 // 3*c0-c1
674 | 
675 | vmovdqu ymm13, ymm5
676 | vpand ymm5, ymm14, ymm5 // c0
677 | vpsrad ymm13, ymm13, 12 // c1
678 | vpslld ymm15, ymm5, 1 // 2*c0
679 | vpsubd ymm13, ymm5, ymm13 // c0-c1
680 | vpaddd ymm5, ymm13, ymm15 // 3*c0-c1
681 | skip3:
682 | vpermd ymm5, ymm12, ymm5
683 | vpermd ymm4, ymm12, ymm4
684 | 
685 | vmovdqu ymm13, ymm7
686 | vpand ymm7, ymm14, ymm7 // c0
687 | vpsrlq ymm13, ymm13, 12 // c1
688 | vpslld ymm15, ymm7, 1 // 2*c0
689 | vpsubd ymm13, ymm7, ymm13 // c0-c1
690 | vpaddd ymm7, ymm13, ymm15 // 3*c0-c1
691 | vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm4
692 | vmovdqu XMMWORD PTR [reg_p1+4*r13+32], xmm5
693 | 
694 | cmp r9, rbx
695 | jne skip4
696 | vmovdqu ymm13, ymm6
697 | vpand ymm6, ymm14, ymm6 // c0
698 | vpsrad ymm13, ymm13, 12 // c1
699 | vpslld ymm15, ymm6, 1 // 2*c0
700 | vpsubd ymm13, ymm6, ymm13 // c0-c1
701 | vpaddd ymm6, ymm13, ymm15 // 3*c0-c1
702 | 
703 | vmovdqu ymm13, ymm7
704 | vpand ymm7, ymm14, ymm7 // c0
705 | vpsrad ymm13, ymm13, 12 // c1
706 | vpslld ymm15, ymm7, 1 // 2*c0
707 | vpsubd ymm13, ymm7, ymm13 // c0-c1
708 | vpaddd ymm7, ymm13, ymm15 // 3*c0-c1
709 | skip4:
710 | vpermd ymm7, ymm12, ymm7
711 | vpermd ymm6, ymm12, ymm6
712 | vmovdqu XMMWORD PTR [reg_p1+4*r13+48], xmm7
713 | vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm6
714 | 
715 | add r10, r14                               // j += 16 (r14 still holds 16 from stage m=128)
716 | cmp r10, r11
717 | jl loop7b
718 | mov rbx, rax
719 | shl rbx, 1 // 2*k
720 | add r8, rbx // j1+2*k
721 | inc r15
722 | cmp r15, r12
723 | jl loop6b
724 | dec r9
725 | jnz loop5b
726 | 
727 | // Scaling step
// Final stage fused with scaling: (U+V)*Ninv and (U-V)*omegainv1N_rev,
// where the two 64-bit constants arrive by value in reg_p3/reg_p4 and are
// broadcast across all lanes.
728 | shl rax, 1 // k = 2*k = 512
729 | xor r10, r10 // j = 0
730 | mov r14, 4
731 | movq xmm0, reg_p3
732 | vbroadcastsd ymm10, xmm0 // S = omegainv1N_rev
733 | movq xmm0, reg_p4
734 | vbroadcastsd ymm11, xmm0 // T = Ninv
735 | loop8b:
736 | vpmovsxdq ymm13, XMMWORD PTR [reg_p1+4*r10+4*512] // V = a[j+k]
737 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j]
738 | vpsubd ymm1, ymm0, ymm13 // U - V
739 | vpaddd ymm0, ymm0, ymm13 // U + V
740 | vpmuldq ymm1, ymm1, ymm10 // (U - V).S
741 | vpmuldq ymm0, ymm0, ymm11 // (U + V).T
742 | 
743 | vmovdqu ymm13, ymm0
744 | vpand ymm0, ymm14, ymm0 // c0
745 | vpsrlq ymm13, ymm13, 12 // c1
746 | vpslld ymm15, ymm0, 1 // 2*c0
747 | vpsubd ymm13, ymm0, ymm13 // c0-c1
748 | vpaddd ymm0, ymm13, ymm15 // 3*c0-c1
749 | 
750 | vmovdqu ymm13, ymm1
751 | vpand ymm1, ymm14, ymm1 // c0
752 | vpsrlq ymm13, ymm13, 12 // c1
753 | vpslld ymm15, ymm1, 1 // 2*c0
754 | vpsubd ymm13, ymm1, ymm13 // c0-c1
755 | vpaddd ymm1, ymm13, ymm15 // 3*c0-c1
756 | 
757 | vpermd ymm0, ymm12, ymm0
758 | vpermd ymm1, ymm12, ymm1
759 | vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0
760 | vmovdqu XMMWORD PTR [reg_p1+4*r10+4*512], xmm1
761 | 
762 | add r10, r14 // j+4
763 | cmp r10, rax
764 | jl loop8b
765 | loop9b:                                    // NOTE(review): label is never referenced; epilogue falls through
766 | pop rbx
767 | pop r15
768 | pop r14
769 | pop r13
770 | pop r12
771 | ret
772 |
773 |
774 | //***********************************************************************
775 | // Component-wise multiplication and addition
776 | // Operation: d [reg_p4] <- a [reg_p1] * b [reg_p2] + c [reg_p3]
777 | // reg_p5 contains parameter n
778 | //***********************************************************************
// Processes 4 coefficients per iteration: 32-bit inputs are sign-extended to
// 64-bit lanes, multiplied/added in 64 bits, then reduced twice with the
// K-RED identity for q = 12289 = 3*2^12 + 1 (3*c0 - c1 == 3*t mod q, with
// t = 2^12*c1 + c0). Output is partially reduced, not canonical in [0, q).
// NOTE(review): assumes n is a multiple of 4 -- confirm against callers.
779 | .global pmuladd_asm
780 | pmuladd_asm:
781 | vmovdqu ymm5, PERM0246                     // compress even 32-bit lanes of a 64-bit-lane vector
782 | vmovdqu ymm6, MASK12x8                     // low-12-bit mask per lane
783 | xor rax, rax                               // rax = coefficient index j
784 | movq r11, 4
785 | lazo2:
786 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*rax] // a
787 | vpmovsxdq ymm1, XMMWORD PTR [reg_p2+4*rax] // b
788 | vpmovsxdq ymm2, XMMWORD PTR [reg_p3+4*rax] // c
789 | vpmuldq ymm0, ymm1, ymm0                   // a*b (64-bit products)
790 | vpaddq ymm0, ymm2, ymm0                    // a*b + c
791 | 
792 | vmovdqu ymm3, ymm0                         // first K-RED pass (64-bit value: logical shift)
793 | vpand ymm0, ymm6, ymm0 // c0
794 | vpsrlq ymm3, ymm3, 12 // c1
795 | vpslld ymm4, ymm0, 1 // 2*c0
796 | vpsubd ymm3, ymm0, ymm3 // c0-c1
797 | vpaddd ymm0, ymm3, ymm4 // 3*c0-c1
798 | 
799 | vmovdqu ymm3, ymm0                         // second K-RED pass (now signed 32-bit: arithmetic shift)
800 | vpand ymm0, ymm6, ymm0 // c0
801 | vpsrad ymm3, ymm3, 12 // c1
802 | vpslld ymm4, ymm0, 1 // 2*c0
803 | vpsubd ymm3, ymm0, ymm3 // c0-c1
804 | vpaddd ymm0, ymm3, ymm4 // 3*c0-c1
805 | 
806 | vpermd ymm0, ymm5, ymm0                    // pack the 4 results into the low 128 bits
807 | vmovdqu XMMWORD PTR [reg_p4+4*rax], xmm0
808 | 
809 | add rax, r11 // j+4
810 | cmp rax, reg_p5
811 | jl lazo2
812 | ret
813 |
814 |
815 | //***********************************************************************
816 | // Component-wise multiplication
817 | // Operation: c [reg_p3] <- a [reg_p1] * b [reg_p2]
818 | //***********************************************************************
819 | // reg_p4 contains parameter n
// Same structure as pmuladd_asm without the addend: 4 coefficients per
// iteration, 64-bit products, then two K-RED reductions for q = 12289
// (3*c0 - c1 == 3*t mod q). Output is partially reduced.
// NOTE(review): assumes n is a multiple of 4 -- confirm against callers.
820 | .global pmul_asm
821 | pmul_asm:
822 | vmovdqu ymm5, PERM0246                     // compress even 32-bit lanes into xmm for the store
823 | vmovdqu ymm6, MASK12x8                     // low-12-bit mask per lane
824 | xor rax, rax                               // rax = coefficient index j
825 | movq r11, 4
826 | lazo3:
827 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*rax] // a
828 | vpmovsxdq ymm1, XMMWORD PTR [reg_p2+4*rax] // b
829 | vpmuldq ymm0, ymm1, ymm0                   // a*b (64-bit products)
830 | 
831 | vmovdqu ymm3, ymm0                         // first K-RED pass (64-bit value: logical shift)
832 | vpand ymm0, ymm6, ymm0 // c0
833 | vpsrlq ymm3, ymm3, 12 // c1
834 | vpslld ymm4, ymm0, 1 // 2*c0
835 | vpsubd ymm3, ymm0, ymm3 // c0-c1
836 | vpaddd ymm0, ymm3, ymm4 // 3*c0-c1
837 | 
838 | vmovdqu ymm3, ymm0                         // second K-RED pass (signed 32-bit: arithmetic shift)
839 | vpand ymm0, ymm6, ymm0 // c0
840 | vpsrad ymm3, ymm3, 12 // c1
841 | vpslld ymm4, ymm0, 1 // 2*c0
842 | vpsubd ymm3, ymm0, ymm3 // c0-c1
843 | vpaddd ymm0, ymm3, ymm4 // 3*c0-c1
844 | 
845 | vpermd ymm0, ymm5, ymm0                    // pack the 4 results into the low 128 bits
846 | vmovdqu XMMWORD PTR [reg_p3+4*rax], xmm0
847 | 
848 | add rax, r11 // j+4
849 | cmp rax, reg_p4
850 | jl lazo3
851 | ret
852 |
853 |
854 | //***********************************************************************
855 | // Two consecutive reductions
856 | // Operation: c [reg_p1] <- a [reg_p1]
857 | // reg_p2 contains parameter n
858 | //***********************************************************************
// In-place normalization of n signed 32-bit coefficients modulo q = 12289:
// two K-RED passes (3*c0 - c1 == 3*t mod q) shrink the magnitude, then a
// branch-free conditional-add/subtract sequence maps the result into the
// canonical range [0, q). 8 coefficients per iteration.
// NOTE(review): assumes n is a multiple of 8 -- confirm against callers.
859 | .global two_reduce12289_asm
860 | two_reduce12289_asm:
861 | vmovdqu ymm6, MASK12x8                     // low-12-bit mask per lane
862 | vmovdqu ymm7, PRIME8x                      // 8 x q
863 | xor rax, rax                               // rax = coefficient index j
864 | movq r11, 8
865 | lazo4:
866 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // a
867 | 
868 | vmovdqu ymm3, ymm0                         // first K-RED pass
869 | vpand ymm0, ymm6, ymm0 // c0
870 | vpsrad ymm3, ymm3, 12 // c1  (arithmetic shift: inputs are signed 32-bit)
871 | vpslld ymm4, ymm0, 1 // 2*c0
872 | vpsubd ymm3, ymm0, ymm3 // c0-c1
873 | vpaddd ymm0, ymm3, ymm4 // 3*c0-c1
874 | 
875 | vmovdqu ymm3, ymm0                         // second K-RED pass
876 | vpand ymm0, ymm6, ymm0 // c0
877 | vpsrad ymm3, ymm3, 12 // c1
878 | vpslld ymm4, ymm0, 1 // 2*c0
879 | vpsubd ymm3, ymm0, ymm3 // c0-c1
880 | vpaddd ymm0, ymm3, ymm4 // 3*c0-c1
881 | 
882 | vpsrad ymm2, ymm0, 31                      // lane mask: all-ones where a < 0
883 | vpand ymm2, ymm7, ymm2                     // q where negative, 0 otherwise
884 | vpaddd ymm2, ymm0, ymm2                    // a += q if a < 0
885 | vpsubd ymm0, ymm2, ymm7                    // a -= q
886 | 
887 | vpsrad ymm2, ymm0, 31                      // repeat conditional add: final result in [0, q)
888 | vpand ymm2, ymm7, ymm2
889 | vpaddd ymm0, ymm0, ymm2
890 | 
891 | vmovdqu YMMWORD PTR [reg_p1+4*rax], ymm0
892 | 
893 | add rax, r11 // j+8
894 | cmp rax, reg_p2
895 | jl lazo4
896 | ret
897 |
898 |
899 | //***********************************************************************
900 | // Encoding
901 | // Operation: c [reg_p2] <- a [reg_p1]
902 | //***********************************************************************
// Packs 8 coefficients of up to 14 bits each (32 bytes of 32-bit values)
// into 14 output bytes per iteration, via shift/mask/or within ymm lanes and
// two overlapping 16-byte stores (offsets r10 and r10+7, advancing r10 by 14).
// The coefficient count is hard-coded: r9 = 1024 (n is not a parameter here).
// NOTE(review): the second store writes up to 16 bytes at offset r10+7, so the
// output buffer must tolerate writes past the 14 packed bytes -- confirm the
// caller's buffer size.
903 | .global encode_asm
904 | encode_asm:
905 | vmovdqu ymm6, MASK32                       // bit-selection masks for the packing shuffle
906 | vmovdqu ymm7, MASK42
907 | mov r9, 1024                               // total coefficients (hard-coded n)
908 | xor rax, rax                               // rax = input coefficient index
909 | xor r10, r10                               // r10 = output byte offset
910 | mov r11, 14                                // 14 output bytes per 8 coefficients
911 | mov rcx, 8
912 | lazo5:
913 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // a
914 | 
915 | vpsrlq ymm1, ymm0, 18
916 | vpsllq ymm2, ymm0, 4
917 | vpand ymm0, ymm0, ymm6
918 | vpsrldq ymm2, ymm2, 5
919 | vpsrlq ymm3, ymm1, 4
920 | vpand ymm1, ymm1, ymm6
921 | vpand ymm2, ymm2, ymm7
922 | vpsrldq ymm3, ymm3, 4
923 | vpor ymm0, ymm0, ymm1                      // merge the shifted/masked fields into packed bytes
924 | vpor ymm0, ymm0, ymm2
925 | vpor ymm0, ymm0, ymm3
926 | vpermq ymm1, ymm0, 0x0e                    // bring the upper 128-bit half down for the second store
927 | 
928 | vmovdqu XMMWORD PTR [reg_p2+r10], xmm0     // first 7 packed bytes (plus overlap)
929 | vmovdqu XMMWORD PTR [reg_p2+r10+7], xmm1   // second 7 packed bytes (overlapping store)
930 | 
931 | add r10, r11                               // advance output by 14 bytes
932 | add rax, rcx // j+8
933 | cmp rax, r9
934 | jl lazo5
935 | ret
936 |
937 |
938 | //***********************************************************************
939 | // Decoding
940 | // Operation: c [reg_p2] <- a [reg_p1]
941 | //***********************************************************************
// Inverse of encode_asm: expands 14 packed bytes into 8 x 32-bit coefficients
// of 14 bits each per iteration. Uses two overlapping 16-byte loads (offsets
// r10 and r10+7) combined into one ymm, then mask/shift/or to realign the
// four 14-bit fields per 128-bit half. Coefficient count hard-coded to 1024.
// NOTE(review): the loads read up to 16 bytes at offset r10+7, i.e. past the
// 14 consumed bytes -- confirm the input buffer allows this over-read.
942 | .global decode_asm
943 | decode_asm:
944 | vmovdqu ymm6, MASK14_1                     // field-extraction masks for the four 14-bit slots
945 | vmovdqu ymm7, MASK14_2
946 | vmovdqu ymm8, MASK14_3
947 | vmovdqu ymm9, MASK14_4
948 | mov r9, 1024                               // total coefficients (hard-coded n)
949 | xor rax, rax                               // rax = output coefficient index
950 | xor r10, r10                               // r10 = input byte offset
951 | mov r11, 14                                // 14 input bytes per 8 coefficients
952 | mov rcx, 8
953 | lazo6:
954 | vmovdqu xmm0, XMMWORD PTR [reg_p1+r10]
955 | vmovdqu xmm1, XMMWORD PTR [reg_p1+r10+7]   // overlapping load of the second 7-byte group
956 | vinserti128 ymm0, ymm0, xmm1, 1            // both groups side by side in one ymm
957 | 
958 | vpand ymm1, ymm0, ymm6                     // isolate each 14-bit field
959 | vpand ymm2, ymm0, ymm7
960 | vpand ymm3, ymm0, ymm8
961 | vpand ymm4, ymm0, ymm9
962 | 
963 | vpsllq ymm2, ymm2, 18                      // realign fields to 32-bit coefficient slots
964 | vpsllq ymm3, ymm3, 4
965 | vpslldq ymm3, ymm3, 4
966 | vpsrlq ymm4, ymm4, 2
967 | vpslldq ymm4, ymm4, 7
968 | 
969 | vpor ymm1, ymm1, ymm2                      // merge into 8 x 32-bit coefficients
970 | vpor ymm1, ymm1, ymm3
971 | vpor ymm1, ymm1, ymm4
972 | 
973 | vmovdqu YMMWORD PTR [reg_p2+4*rax], ymm1
974 | 
975 | add r10, r11                               // advance input by 14 bytes
976 | add rax, rcx // j+8
977 | cmp rax, r9
978 | jl lazo6
979 | ret
--------------------------------------------------------------------------------