├── kex.c ├── README.txt ├── License.txt ├── Visual Studio ├── tests │ ├── tests.vcxproj.filters │ └── tests.vcxproj └── LatticeCrypto │ ├── LatticeCrypto.vcxproj.filters │ ├── LatticeCrypto.sln │ └── LatticeCrypto.vcxproj ├── AMD64 ├── ntt_x64.c ├── consts.c ├── error_asm.S └── ntt_x64_asm.S ├── tests ├── test_extras.h ├── test_extras.c └── tests.c ├── makefile ├── random.c ├── LatticeCrypto_priv.h ├── generic └── ntt.c ├── LatticeCrypto.h └── ntt_constants.c /kex.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/b/LatticeCrypto/master/kex.c -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/b/LatticeCrypto/master/README.txt -------------------------------------------------------------------------------- /License.txt: -------------------------------------------------------------------------------- 1 | LatticeCrypto 2 | 3 | Copyright (c) Microsoft Corporation 4 | All rights reserved. 5 | 6 | MIT License 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 9 | associated documentation files (the ""Software""), to deal in the Software without restriction, 10 | including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 11 | and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, 12 | subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all copies or substantial 15 | portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT 18 | LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /Visual Studio/tests/tests.vcxproj.filters: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Header Files 20 | 21 | 22 | Header Files 23 | 24 | 25 | Header Files 26 | 27 | 28 | 29 | 30 | Source Files 31 | 32 | 33 | Source Files 34 | 35 | 36 | -------------------------------------------------------------------------------- /Visual Studio/LatticeCrypto/LatticeCrypto.vcxproj.filters: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | {d2c0b572-de10-4258-8700-245d332f161c} 18 | 19 | 20 | 21 | 22 | Source Files 23 | 24 | 25 | Source Files 26 | 27 | 28 | Source Files\generic 29 | 30 | 31 | Source Files 32 | 33 | 34 | 35 | 36 | Header Files 37 | 38 | 39 | Header Files 40 | 41 | 42 | -------------------------------------------------------------------------------- /AMD64/ntt_x64.c: -------------------------------------------------------------------------------- 1 
| /**************************************************************************************** 2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library 3 | * 4 | * Copyright (c) Microsoft Corporation. All rights reserved. 5 | * 6 | * 7 | * Abstract: NTT functions and other low-level operations 8 | * 9 | *****************************************************************************************/ 10 | 11 | #include "../LatticeCrypto_priv.h" 12 | 13 | 14 | void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N) 15 | { 16 | NTT_CT_std2rev_12289_asm(a, psi_rev, N); 17 | } 18 | 19 | 20 | void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N) 21 | { 22 | INTT_GS_rev2std_12289_asm(a, omegainv_rev, omegainv1N_rev, Ninv, N); 23 | } 24 | 25 | 26 | void two_reduce12289(int32_t* a, unsigned int N) 27 | { 28 | two_reduce12289_asm(a, N); 29 | } 30 | 31 | 32 | void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N) 33 | { 34 | pmul_asm(a, b, c, N); 35 | } 36 | 37 | 38 | void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N) 39 | { 40 | pmuladd_asm(a, b, c, d, N); 41 | } 42 | 43 | 44 | void smul(int32_t* a, int32_t scalar, unsigned int N) 45 | { 46 | unsigned int i; 47 | 48 | for (i = 0; i < N; i++) { 49 | a[i] = a[i]*scalar; 50 | } 51 | } 52 | 53 | 54 | void correction(int32_t* a, int32_t p, unsigned int N) 55 | { 56 | unsigned int i; 57 | int32_t mask; 58 | 59 | for (i = 0; i < N; i++) { 60 | mask = a[i] >> (4*sizeof(int32_t) - 1); 61 | a[i] += (p & mask) - p; 62 | mask = a[i] >> (4*sizeof(int32_t) - 1); 63 | a[i] += (p & mask); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /AMD64/consts.c: -------------------------------------------------------------------------------- 1 | /**************************************************************************************** 2 | * 
LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library 3 | * 4 | * Copyright (c) Microsoft Corporation. All rights reserved. 5 | * 6 | * 7 | * Abstract: constants for the x64 assembly implementation 8 | * 9 | *****************************************************************************************/ 10 | 11 | #include "../LatticeCrypto_priv.h" 12 | #include 13 | 14 | 15 | uint32_t PRIME8x[8] = {PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q, PARAMETER_Q}; 16 | uint8_t ONE32x[32] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}; 17 | uint32_t MASK12x8[8] = {0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff,0xfff}; 18 | uint32_t PERM0246[4] = {0,2,4,6}; 19 | uint32_t PERM00224466[8] = {0,0,2,2,4,4,6,6}; 20 | uint32_t PERM02134657[8] = {0,2,1,3,4,6,5,7}; 21 | uint64_t PERM0145[4] = {0,1,4,5}; 22 | uint64_t PERM2367[4] = {2,3,6,7}; 23 | uint64_t MASK32[4] = {0xffffffff,0,0xffffffff,0}; 24 | uint64_t MASK42[4] = {0x3fff0000000,0,0x3fff0000000,0}; 25 | 26 | uint64_t MASK14_1[4] = {0x3fff,0,0x3fff,0}; 27 | uint64_t MASK14_2[4] = {0xFFFC000,0,0xFFFC000,0}; 28 | uint64_t MASK14_3[4] = {0x3FFF0000000,0,0x3FFF0000000,0}; 29 | uint64_t MASK14_4[4] = {0xFFFC0000000000,0,0xFFFC0000000000,0}; 30 | 31 | uint32_t ONE8x[8] = {1,1,1,1,1,1,1,1}; 32 | uint32_t THREE8x[8] = {3,3,3,3,3,3,3,3}; 33 | uint32_t FOUR8x[8] = {4,4,4,4,4,4,4,4}; 34 | uint32_t PARAM_Q4x8[8] = {3073,3073,3073,3073,3073,3073,3073,3073}; 35 | uint32_t PARAM_3Q4x8[8] = {9217,9217,9217,9217,9217,9217,9217,9217}; 36 | uint32_t PARAM_5Q4x8[8] = {15362,15362,15362,15362,15362,15362,15362,15362}; 37 | uint32_t PARAM_7Q4x8[8] = {21506,21506,21506,21506,21506,21506,21506,21506}; 38 | uint32_t PARAM_Q2x8[8] = {6145,6145,6145,6145,6145,6145,6145,6145}; 39 | uint32_t PARAM_3Q2x8[8] = {18434,18434,18434,18434,18434,18434,18434,18434}; 40 | 41 | -------------------------------------------------------------------------------- /Visual 
Studio/LatticeCrypto/LatticeCrypto.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2013 4 | VisualStudioVersion = 12.0.21005.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tests", "..\tests\tests.vcxproj", "{C9639168-C3FF-4427-BC3B-D907FF11DE73}" 7 | ProjectSection(ProjectDependencies) = postProject 8 | {8283DD76-E88A-4B63-ABDE-33F014178413} = {8283DD76-E88A-4B63-ABDE-33F014178413} 9 | EndProjectSection 10 | EndProject 11 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LatticeCrypto", "LatticeCrypto.vcxproj", "{8283DD76-E88A-4B63-ABDE-33F014178413}" 12 | EndProject 13 | Global 14 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 15 | Debug|Win32 = Debug|Win32 16 | Debug|x64 = Debug|x64 17 | Generic|Win32 = Generic|Win32 18 | Generic|x64 = Generic|x64 19 | Release|Win32 = Release|Win32 20 | Release|x64 = Release|x64 21 | EndGlobalSection 22 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 23 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Debug|Win32.ActiveCfg = Debug|Win32 24 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Debug|x64.ActiveCfg = Debug|x64 25 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Generic|Win32.ActiveCfg = Generic|Win32 26 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Generic|Win32.Build.0 = Generic|Win32 27 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Generic|x64.ActiveCfg = Generic|x64 28 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Generic|x64.Build.0 = Generic|x64 29 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Release|Win32.ActiveCfg = Release|Win32 30 | {C9639168-C3FF-4427-BC3B-D907FF11DE73}.Release|x64.ActiveCfg = Release|x64 31 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Debug|Win32.ActiveCfg = Debug|Win32 32 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Debug|x64.ActiveCfg = Debug|x64 33 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Generic|Win32.ActiveCfg = 
Generic|Win32 34 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Generic|Win32.Build.0 = Generic|Win32 35 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Generic|x64.ActiveCfg = Generic|x64 36 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Generic|x64.Build.0 = Generic|x64 37 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Release|Win32.ActiveCfg = Release|Win32 38 | {8283DD76-E88A-4B63-ABDE-33F014178413}.Release|x64.ActiveCfg = Release|x64 39 | EndGlobalSection 40 | GlobalSection(SolutionProperties) = preSolution 41 | HideSolutionNode = FALSE 42 | EndGlobalSection 43 | EndGlobal 44 | -------------------------------------------------------------------------------- /tests/test_extras.h: -------------------------------------------------------------------------------- 1 | /**************************************************************************************** 2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library 3 | * 4 | * Copyright (c) Microsoft Corporation. All rights reserved. 5 | * 6 | * 7 | * Abstract: utility header file for tests 8 | * 9 | *****************************************************************************************/ 10 | 11 | #ifndef __TEST_EXTRAS_H__ 12 | #define __TEST_EXTRAS_H__ 13 | 14 | 15 | // For C++ 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | 21 | #include "../LatticeCrypto_priv.h" 22 | 23 | 24 | // Access system counter for benchmarking 25 | int64_t cpucycles(void); 26 | 27 | // Generate "nbytes" of random values and output the result to random_array. 28 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY. 29 | CRYPTO_STATUS random_bytes_test(unsigned int nbytes, unsigned char* random_array); 30 | 31 | // Generate "array_ndigits" of 32-bit values and output the result to extended_array. 32 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY. 
33 | CRYPTO_STATUS extendable_output_test(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array); 34 | 35 | // Generate "array_nbytes" of values and output the result to stream_array. 36 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY. 37 | CRYPTO_STATUS stream_output_test(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array); 38 | 39 | // Generating a pseudo-random polynomial a[x] over GF(p) 40 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY. 41 | void random_poly_test(int32_t* a, unsigned int p, unsigned int pbits, unsigned int N); 42 | 43 | // Comparing two polynomials over GF(p), a[x]=b[x]? : (0) a=b, (1) a!=b 44 | // NOTE: TO BE USED FOR TESTING ONLY. 45 | int compare_poly(int32_t* a, int32_t* b, unsigned int N); 46 | 47 | // Modular reduction 48 | // NOTE: TO BE USED FOR TESTING ONLY. 49 | int reduce(int a, int p); 50 | 51 | // Polynomial multiplication using the schoolbook method, c[x] = a[x]*b[x] 52 | // NOTE: TO BE USED FOR TESTING ONLY. 53 | void mul_test(int32_t* a, int32_t* b, int32_t* c, uint32_t p, unsigned int N); 54 | 55 | // Polynomial addition, c[x] = a[x] + b[x] 56 | // NOTE: TO BE USED FOR TESTING ONLY. 
57 | void add_test(int32_t* a, int32_t* b, int32_t* c, uint32_t p, unsigned int N); 58 | 59 | 60 | #ifdef __cplusplus 61 | } 62 | #endif 63 | 64 | 65 | #endif -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | #### Makefile for compilation on Linux #### 2 | 3 | OPT=-O3 # Optimization option by default 4 | 5 | ifeq "$(CC)" "gcc" 6 | COMPILER=gcc 7 | else ifeq "$(CC)" "clang" 8 | COMPILER=clang 9 | endif 10 | 11 | ifeq "$(ARCH)" "x64" 12 | ARCHITECTURE=_AMD64_ 13 | else ifeq "$(ARCH)" "x86" 14 | ARCHITECTURE=_X86_ 15 | else ifeq "$(ARCH)" "ARM" 16 | ARCHITECTURE=_ARM_ 17 | endif 18 | 19 | ADDITIONAL_SETTINGS= 20 | ifeq "$(SET)" "EXTENDED" 21 | ADDITIONAL_SETTINGS=-fwrapv -fomit-frame-pointer -march=native 22 | endif 23 | 24 | ifeq "$(ASM)" "TRUE" 25 | USE_ASM=-D _ASM_ 26 | endif 27 | 28 | ifeq "$(GENERIC)" "TRUE" 29 | USE_GENERIC=-D _GENERIC_ 30 | endif 31 | 32 | ifeq "$(AVX2)" "TRUE" 33 | USE_AVX2=-D _AVX2_ 34 | SIMD=-mavx2 35 | endif 36 | 37 | ifeq "$(ARCH)" "ARM" 38 | ARM_SETTING=-lrt 39 | endif 40 | 41 | cc=$(COMPILER) 42 | CFLAGS=-c $(OPT) $(ADDITIONAL_SETTINGS) $(SIMD) -D $(ARCHITECTURE) -D __LINUX__ $(USE_AVX2) $(USE_ASM) $(USE_GENERIC) 43 | LDFLAGS= 44 | ifeq "$(GENERIC)" "TRUE" 45 | OTHER_OBJECTS=ntt.o 46 | else 47 | ifeq "$(ASM)" "TRUE" 48 | OTHER_OBJECTS=ntt_x64.o consts.o 49 | ASM_OBJECTS=ntt_x64_asm.o error_asm.o 50 | endif 51 | endif 52 | OBJECTS=kex.o random.o ntt_constants.o $(ASM_OBJECTS) $(OTHER_OBJECTS) 53 | OBJECTS_TEST=tests.o test_extras.o $(OBJECTS) 54 | OBJECTS_ALL=$(OBJECTS) $(OBJECTS_TEST) 55 | 56 | test: $(OBJECTS_TEST) 57 | $(CC) -o test $(OBJECTS_TEST) $(ARM_SETTING) 58 | 59 | kex.o: kex.c LatticeCrypto_priv.h 60 | $(CC) $(CFLAGS) kex.c 61 | 62 | random.o: random.c LatticeCrypto_priv.h 63 | $(CC) $(CFLAGS) random.c 64 | 65 | ntt_constants.o: ntt_constants.c LatticeCrypto_priv.h 66 | $(CC) $(CFLAGS) ntt_constants.c 
67 | 68 | ifeq "$(GENERIC)" "TRUE" 69 | ntt.o: generic/ntt.c LatticeCrypto_priv.h 70 | $(CC) $(CFLAGS) generic/ntt.c 71 | else 72 | ifeq "$(ASM)" "TRUE" 73 | ntt_x64.o: AMD64/ntt_x64.c 74 | $(CC) $(CFLAGS) AMD64/ntt_x64.c 75 | ntt_x64_asm.o: AMD64/ntt_x64_asm.S 76 | $(CC) $(CFLAGS) AMD64/ntt_x64_asm.S 77 | error_asm.o: AMD64/error_asm.S 78 | $(CC) $(CFLAGS) AMD64/error_asm.S 79 | consts.o: AMD64/consts.c 80 | $(CC) $(CFLAGS) AMD64/consts.c 81 | endif 82 | endif 83 | 84 | test_extras.o: tests/test_extras.c tests/test_extras.h LatticeCrypto_priv.h 85 | $(CC) $(CFLAGS) tests/test_extras.c 86 | 87 | tests.o: tests/tests.c LatticeCrypto_priv.h 88 | $(CC) $(CFLAGS) tests/tests.c 89 | 90 | .PHONY: clean 91 | 92 | clean: 93 | rm -f test ntt.o ntt_x64.o ntt_x64_asm.o error_asm.o consts.o $(OBJECTS_ALL) 94 | 95 | -------------------------------------------------------------------------------- /random.c: -------------------------------------------------------------------------------- 1 | /**************************************************************************************** 2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library 3 | * 4 | * Copyright (c) Microsoft Corporation. All rights reserved. 5 | * 6 | * 7 | * Abstract: wrappers for user-provided functions 8 | * 9 | *****************************************************************************************/ 10 | 11 | 12 | #include "LatticeCrypto_priv.h" 13 | 14 | 15 | CRYPTO_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction) 16 | { // Output "nbytes" of random values. 17 | // It makes requests of random values to RandomBytesFunction. If successful, the output is given in "random_array". 18 | // The caller is responsible for providing the "RandomBytesFunction" function passing random values as octets. 
19 | 20 | if (random_array == NULL || RandomBytesFunction == NULL || nbytes == 0) { 21 | return CRYPTO_ERROR_INVALID_PARAMETER; 22 | } 23 | 24 | return (RandomBytesFunction)(nbytes, random_array); 25 | } 26 | 27 | 28 | CRYPTO_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction) 29 | { // Output "array_ndigits" of values in [0, q-1] using an extendable-output function and a seed of size "seed_nbytes". 30 | // It makes requests of values to ExtendableOutputFunction. If successful, the output is given in "extended_array". 31 | // The caller is responsible for providing the "ExtendableOutputFunction" function passing values as 32-bit digits. 32 | 33 | if (seed == NULL || extended_array == NULL || ExtendableOutputFunction == NULL || seed_nbytes == 0 || array_ndigits == 0) { 34 | return CRYPTO_ERROR_INVALID_PARAMETER; 35 | } 36 | 37 | return (ExtendableOutputFunction)(seed, seed_nbytes, array_ndigits, extended_array); 38 | } 39 | 40 | 41 | CRYPTO_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction) 42 | { // Output "array_nbytes" of values using a stream cipher, a seed of size "seed_nbytes" and a nonce of size "nonce_nbytes". 43 | // It makes requests of values to StreamOutputFunction. If successful, the output is given in "stream_array". 44 | // The caller is responsible for providing the "StreamOutputFunction" function passing values as octets. 
45 | 46 | if (seed == NULL || stream_array == NULL || StreamOutputFunction == NULL || seed_nbytes == 0 || nonce_nbytes == 0 || array_nbytes == 0) { 47 | return CRYPTO_ERROR_INVALID_PARAMETER; 48 | } 49 | 50 | return (StreamOutputFunction)(seed, seed_nbytes, nonce, nonce_nbytes, array_nbytes, stream_array); 51 | } -------------------------------------------------------------------------------- /LatticeCrypto_priv.h: -------------------------------------------------------------------------------- 1 | /**************************************************************************************** 2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library 3 | * 4 | * Copyright (c) Microsoft Corporation. All rights reserved. 5 | * 6 | * 7 | * Abstract: internal header file 8 | * 9 | *****************************************************************************************/ 10 | 11 | #ifndef __LatticeCrypto_priv_H__ 12 | #define __LatticeCrypto_priv_H__ 13 | 14 | 15 | // For C++ 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | 21 | #include "LatticeCrypto.h" 22 | 23 | 24 | // Basic constants 25 | #define PARAMETER_N 1024 26 | #define PARAMETER_Q 12289 27 | #define SEED_BYTES 256/8 28 | #define ERROR_SEED_BYTES 256/8 29 | #define NONCE_SEED_BYTES 256/8 30 | #define PARAMETER_Q4 3073 31 | #define PARAMETER_3Q4 9217 32 | #define PARAMETER_5Q4 15362 33 | #define PARAMETER_7Q4 21506 34 | #define PARAMETER_Q2 6145 35 | #define PARAMETER_3Q2 18434 36 | 37 | 38 | // Macro definitions 39 | 40 | #define NBITS_TO_NWORDS(nbits) (((nbits)+(sizeof(digit_t)*8)-1)/(sizeof(digit_t)*8)) // Conversion macro from number of bits to number of computer words 41 | #define NBYTES_TO_NWORDS(nbytes) (((nbytes)+sizeof(digit_t)-1)/sizeof(digit_t)) // Conversion macro from number of bytes to number of computer words 42 | 43 | // Macro to avoid compiler warnings when detecting unreferenced parameters 44 | #define UNREFERENCED_PARAMETER(PAR) (PAR) 45 | 46 | 47 
| /******************** Function prototypes *******************/ 48 | /******************* Polynomial functions *******************/ 49 | 50 | // Forward NTT 51 | void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N); 52 | void NTT_CT_std2rev_12289_asm(int32_t* a, const int32_t* psi_rev, unsigned int N); 53 | 54 | // Inverse NTT 55 | void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N); 56 | void INTT_GS_rev2std_12289_asm(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N); 57 | 58 | // Reduction modulo q 59 | int32_t reduce12289(int64_t a); 60 | 61 | // Two merged reductions modulo q 62 | int32_t reduce12289_2x(int64_t a); 63 | 64 | // Two consecutive reductions modulo q 65 | void two_reduce12289(int32_t* a, unsigned int N); 66 | void two_reduce12289_asm(int32_t* a, unsigned int N); 67 | 68 | // Correction modulo q 69 | void correction(int32_t* a, int32_t p, unsigned int N); 70 | 71 | // Component-wise multiplication 72 | void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N); 73 | void pmul_asm(int32_t* a, int32_t* b, int32_t* c, unsigned int N); 74 | 75 | // Component-wise multiplication and addition 76 | void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N); 77 | void pmuladd_asm(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N); 78 | 79 | // Component-wise multiplication with scalar 80 | void smul(int32_t* a, int32_t scalar, unsigned int N); 81 | 82 | /******************* Key exchange functions *******************/ 83 | 84 | // Alice's message encoding 85 | void encode_A(const uint32_t* pk, const unsigned char* seed, unsigned char* m); 86 | 87 | // Alice's message decoding 88 | void decode_A(const unsigned char* m, uint32_t *pk, unsigned char* seed); 89 | 90 | // Bob's message encoding 91 | void encode_B(const uint32_t* pk, const uint32_t* rvec, unsigned char* 
m); 92 | 93 | // Bob's message decoding 94 | void decode_B(unsigned char* m, uint32_t* pk, uint32_t* rvec); 95 | 96 | // Partial message encoding/decoding (assembly optimized) 97 | void encode_asm(const uint32_t* pk, unsigned char* m); 98 | void decode_asm(const unsigned char* m, uint32_t *pk); 99 | 100 | // Reconciliation helper 101 | CRYPTO_STATUS HelpRec(const uint32_t* x, uint32_t* rvec, const unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction); 102 | 103 | // Partial reconciliation helper (assembly optimized) 104 | void helprec_asm(const uint32_t* x, uint32_t* rvec, unsigned char* random_bits); 105 | 106 | // Reconciliation 107 | void Rec(const uint32_t *x, const uint32_t* rvec, unsigned char *key); 108 | void rec_asm(const uint32_t *x, const uint32_t* rvec, unsigned char *key); 109 | 110 | // Error sampling 111 | CRYPTO_STATUS get_error(int32_t* e, unsigned char* seed, unsigned int nonce, StreamOutput StreamOutputFunction); 112 | 113 | // Partial error sampling (assembly optimized) 114 | void error_sampling_asm(unsigned char* stream, int32_t* e); 115 | 116 | // Generation of parameter a 117 | CRYPTO_STATUS generate_a(uint32_t* a, const unsigned char* seed, ExtendableOutput ExtendableOutputFunction); 118 | 119 | 120 | #ifdef __cplusplus 121 | } 122 | #endif 123 | 124 | 125 | #endif 126 | -------------------------------------------------------------------------------- /generic/ntt.c: -------------------------------------------------------------------------------- 1 | /**************************************************************************************** 2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library 3 | * 4 | * Copyright (c) Microsoft Corporation. All rights reserved. 
/****************************************************************************************
* LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library
*
* Copyright (c) Microsoft Corporation. All rights reserved.
*
*
* Abstract: NTT functions and other polynomial operations
*
*****************************************************************************************/

// NOTE(review): this translation unit only needs the fixed-width integer types; fall
// back to <stdint.h> when the project-private header is not available (e.g. when the
// file is compiled in isolation).
#if defined(__has_include)
#  if __has_include("../LatticeCrypto_priv.h")
#    include "../LatticeCrypto_priv.h"
#  else
#    include <stdint.h>
#  endif
#else
#  include "../LatticeCrypto_priv.h"
#endif

// Low 12 bits; used to split values for lazy reduction modulo q = 12289 = 3*2^12 + 1.
const uint32_t mask12 = ((uint64_t)1 << 12) - 1;


int32_t reduce12289(int64_t a)
{ // Lazy reduction modulo q: returns a value congruent to 3*a (mod 12289).
  // Writing a = c1*2^12 + c0: 3*a = c1*(3*2^12) + 3*c0 = c1*(q - 1) + 3*c0
  // = 3*c0 - c1 (mod q).
    int32_t c0, c1;

    c0 = (int32_t)(a & mask12);
    c1 = (int32_t)(a >> 12);

    return (3*c0 - c1);
}


int32_t reduce12289_2x(int64_t a)
{ // Two merged lazy reductions modulo q: returns a value congruent to 9*a (mod 12289).
    int32_t c0, c1, c2;

    c0 = (int32_t)(a & mask12);
    c1 = (int32_t)((a >> 12) & mask12);
    c2 = (int32_t)(a >> 24);

    return (9*c0 - 3*c1 + c2);
}


void NTT_CT_std2rev_12289(int32_t* a, const int32_t* psi_rev, unsigned int N)
{ // Forward NTT (Cooley-Tukey), standard to bit-reversed ordering.
  // NOTE(review): the stage boundaries 128/256 below hard-code N = 1024; the N
  // parameter only sets the initial butterfly span -- confirm if reused with other N.
    unsigned int m, i, j, j1, j2, k = N;
    int32_t S, U, V;

    for (m = 1; m < 128; m = 2*m) {
        k = k >> 1;
        for (i = 0; i < m; i++) {
            j1 = 2*i*k;
            j2 = j1+k-1;
            S = psi_rev[m+i];
            for (j = j1; j <= j2; j++) {
                U = a[j];
                V = reduce12289((int64_t)a[j+k]*S);
                a[j] = U+V;
                a[j+k] = U-V;
            }
        }
    }

    // Middle stage: extra reduction keeps the accumulated scaling factors balanced.
    k = 4;
    for (i = 0; i < 128; i++) {
        j1 = 8*i;
        j2 = j1+3;
        S = psi_rev[i+128];
        for (j = j1; j <= j2; j++) {
            U = reduce12289((int64_t)a[j]);
            V = reduce12289_2x((int64_t)a[j+4]*S);
            a[j] = U+V;
            a[j+4] = U-V;
        }
    }

    for (m = 256; m < N; m = 2*m) {
        k = k >> 1;
        for (i = 0; i < m; i++) {
            j1 = 2*i*k;
            j2 = j1+k-1;
            S = psi_rev[m+i];
            for (j = j1; j <= j2; j++) {
                U = a[j];
                V = reduce12289((int64_t)a[j+k]*S);
                a[j] = U+V;
                a[j+k] = U-V;
            }
        }
    }
    return;
}


void INTT_GS_rev2std_12289(int32_t* a, const int32_t* omegainv_rev, const int32_t omegainv1N_rev, const int32_t Ninv, unsigned int N)
{ // Inverse NTT (Gentleman-Sande), bit-reversed to standard ordering.
  // The final loop folds in the scaling by N^-1 (and omega^-1 * N^-1 for the upper half).
  // NOTE(review): the m == 32 special case hard-codes the reduction schedule for
  // N = 1024 -- confirm if reused with other N.
    unsigned int m, h, i, j, j1, j2, k = 1;
    int32_t S, U, V;
    int64_t temp;

    for (m = N; m > 2; m >>= 1) {
        j1 = 0;
        h = m >> 1;
        for (i = 0; i < h; i++) {
            j2 = j1+k-1;
            S = omegainv_rev[h+i];
            for (j = j1; j <= j2; j++) {
                U = a[j];
                V = a[j+k];
                a[j] = U+V;
                temp = (int64_t)(U-V)*S;
                if (m == 32) {
                    a[j] = reduce12289((int64_t)a[j]);
                    a[j+k] = reduce12289_2x(temp);
                } else {
                    a[j+k] = reduce12289(temp);
                }
            }
            j1 = j1+2*k;
        }
        k = 2*k;
    }
    for (j = 0; j < k; j++) {
        U = a[j];
        V = a[j+k];
        a[j] = reduce12289((int64_t)(U+V)*Ninv);
        a[j+k] = reduce12289((int64_t)(U-V)*omegainv1N_rev);
    }
    return;
}


void two_reduce12289(int32_t* a, unsigned int N)
{ // Two consecutive lazy reductions modulo q: each a[i] becomes congruent to 9*a[i] (mod q).
    unsigned int i;

    for (i = 0; i < N; i++) {
        a[i] = reduce12289((int64_t)a[i]);
        a[i] = reduce12289((int64_t)a[i]);
    }
}


void pmul(int32_t* a, int32_t* b, int32_t* c, unsigned int N)
{ // Component-wise multiplication: c[i] becomes congruent to 9*a[i]*b[i] (mod q).
    unsigned int i;

    for (i = 0; i < N; i++) {
        c[i] = reduce12289((int64_t)a[i]*b[i]);
        c[i] = reduce12289((int64_t)c[i]);
    }
}


void pmuladd(int32_t* a, int32_t* b, int32_t* c, int32_t* d, unsigned int N)
{ // Component-wise multiply-add: d[i] becomes congruent to 9*(a[i]*b[i] + c[i]) (mod q).
    unsigned int i;

    for (i = 0; i < N; i++) {
        d[i] = reduce12289((int64_t)a[i]*b[i] + c[i]);
        d[i] = reduce12289((int64_t)d[i]);
    }
}


void smul(int32_t* a, int32_t scalar, unsigned int N)
{ // Component-wise multiplication with a scalar (no reduction is performed here).
    unsigned int i;

    for (i = 0; i < N; i++) {
        a[i] = a[i]*scalar;
    }
}


void correction(int32_t* a, int32_t p, unsigned int N)
{ // Correction modulo p: maps each a[i] from the lazy-reduction range into [0, p).
  // First step subtracts p unless a[i] is negative; second step adds p back if the
  // result went negative.
    unsigned int i;
    int32_t mask;

    for (i = 0; i < N; i++) {
        // FIX: the sign mask must shift by the bit width minus one (31), i.e.
        // 8*sizeof(int32_t) - 1. The previous 4*sizeof(int32_t) - 1 shifted by 15
        // and only worked because |a[i]| happened to stay below 2^15.
        // (Right shift of a negative value is implementation-defined but is an
        // arithmetic shift on all supported compilers, as the original relied on.)
        mask = a[i] >> (8*sizeof(int32_t) - 1);
        a[i] += (p & mask) - p;
        mask = a[i] >> (8*sizeof(int32_t) - 1);
        a[i] += (p & mask);
    }
}
48 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY. 49 | unsigned int i; 50 | 51 | for (i = 0; i < nbytes; i++) { 52 | *(random_array + i) = (unsigned char)rand(); // nbytes of random values 53 | } 54 | 55 | return CRYPTO_SUCCESS; 56 | } 57 | 58 | 59 | CRYPTO_STATUS extendable_output_test(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array) 60 | { // Generate "array_ndigits" of 32-bit values and output the result to extended_array. 61 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY. 62 | unsigned int count = 0; 63 | uint32_t digit; 64 | 65 | UNREFERENCED_PARAMETER(seed); 66 | UNREFERENCED_PARAMETER(seed_nbytes); 67 | UNREFERENCED_PARAMETER(array_ndigits); 68 | 69 | srand((unsigned int)seed[0]); 70 | 71 | while (count < array_ndigits) { 72 | random_bytes_test(2, (unsigned char*)&digit); // Pull 2 bytes to get a 14-bit value 73 | digit &= 0x3FFF; 74 | if (digit < PARAMETER_Q) { // Take it if it is in [0, q-1] 75 | extended_array[count] = digit; 76 | count++; 77 | } 78 | } 79 | 80 | return CRYPTO_SUCCESS; 81 | } 82 | 83 | 84 | CRYPTO_STATUS stream_output_test(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array) 85 | { // Generate "array_nbytes" of values and output the result to stream_array. 86 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY. 87 | 88 | UNREFERENCED_PARAMETER(seed); 89 | UNREFERENCED_PARAMETER(seed_nbytes); 90 | UNREFERENCED_PARAMETER(nonce); 91 | UNREFERENCED_PARAMETER(nonce_nbytes); 92 | 93 | random_bytes_test(array_nbytes, stream_array); 94 | 95 | return CRYPTO_SUCCESS; 96 | } 97 | 98 | 99 | void random_poly_test(int32_t* a, unsigned int p, unsigned int pbits, unsigned int N) 100 | { // Generating a pseudo-random polynomial a[x] over GF(p) 101 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY. 
102 | unsigned int i, mask = ((unsigned int)1 << pbits) - 1; 103 | unsigned char* string = (unsigned char*)a; 104 | 105 | for (i = 0; i < N; i++) { 106 | do { 107 | *(string + 4*i) = (unsigned char)rand(); // Obtain GF(p) coefficient 108 | *(string + 4*i + 1) = (unsigned char)rand(); 109 | a[i] &= mask; 110 | } while (a[i] >= (int32_t)p); 111 | } 112 | } 113 | 114 | 115 | int compare_poly(int32_t* a, int32_t* b, unsigned int N) 116 | { // Comparing two polynomials over GF(p), a[x]=b[x]? : (0) a=b, (1) a!=b 117 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY. 118 | unsigned int i; 119 | 120 | for (i = 0; i < N; i++) 121 | { 122 | if (a[i] != b[i]) 123 | return 1; 124 | } 125 | 126 | return 0; 127 | } 128 | 129 | 130 | int reduce(int a, int p) 131 | { // Modular reduction 132 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY. 133 | a %= p; 134 | if (a < 0) a += p; 135 | 136 | return a; 137 | } 138 | 139 | 140 | void mul_test(int32_t* a, int32_t* b, int32_t* c, uint32_t p, unsigned int N) 141 | { // Polynomial multiplication using the schoolbook method, c[x] = a[x]*b[x] 142 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY. 143 | unsigned int i, j, index, mask = N - 1; 144 | 145 | for (i = 0; i < N; i++) c[i] = 0; 146 | 147 | for (i = 0; i < N; i++) { 148 | for (j = 0; j < N; j++) { 149 | index = (i+j) & mask; 150 | if (i+j >= N) { 151 | c[index] = reduce(c[index] - (a[i]*b[j]), p); 152 | } else { 153 | c[index] = reduce(c[index] + (a[i]*b[j]), p); 154 | } 155 | } 156 | } 157 | } 158 | 159 | 160 | void add_test(int32_t* a, int32_t* b, int32_t* c, uint32_t p, unsigned int N) 161 | { // Polynomial addition, c[x] = a[x] + b[x] 162 | // SECURITY NOTE: TO BE USED FOR TESTING ONLY. 
163 | unsigned int i; 164 | 165 | for (i = 0; i < N; i++) { 166 | c[i] = reduce(a[i] + b[i], p); 167 | } 168 | } -------------------------------------------------------------------------------- /Visual Studio/tests/tests.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Debug 10 | x64 11 | 12 | 13 | Generic 14 | Win32 15 | 16 | 17 | Generic 18 | x64 19 | 20 | 21 | Release 22 | Win32 23 | 24 | 25 | Release 26 | x64 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | {8283dd76-e88a-4b63-abde-33f014178413} 41 | 42 | 43 | 44 | {C9639168-C3FF-4427-BC3B-D907FF11DE73} 45 | Win32Proj 46 | fp_tests 47 | tests 48 | 49 | 50 | 51 | Application 52 | true 53 | v120 54 | Unicode 55 | 56 | 57 | Application 58 | true 59 | v120 60 | Unicode 61 | 62 | 63 | Application 64 | false 65 | v120 66 | true 67 | Unicode 68 | 69 | 70 | Application 71 | false 72 | v120 73 | true 74 | Unicode 75 | 76 | 77 | v120 78 | 79 | 80 | v120 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | true 100 | 101 | 102 | true 103 | 104 | 105 | false 106 | 107 | 108 | false 109 | 110 | 111 | 112 | 113 | 114 | Level4 115 | Disabled 116 | __WINDOWS__; _X86_; 117 | false 118 | Default 119 | MultiThreadedDLL 120 | true 121 | ProgramDatabase 122 | 123 | 124 | Console 125 | true 126 | 127 | 128 | 129 | 130 | 131 | 132 | Level4 133 | Disabled 134 | __WINDOWS__; _AMD64_; 135 | false 136 | Default 137 | MultiThreadedDLL 138 | true 139 | ProgramDatabase 140 | AdvancedVectorExtensions 141 | 142 | 143 | Console 144 | true 145 | 146 | 147 | 148 | 149 | Level4 150 | 151 | 152 | MaxSpeed 153 | true 154 | true 155 | __WINDOWS__; _X86_; 156 | MultiThreadedDLL 157 | 158 | 159 | Console 160 | true 161 | true 162 | true 163 | 164 | 165 | 166 | 167 | Level4 168 | 169 | 170 | MaxSpeed 171 | true 172 | true 173 | __WINDOWS__; _AMD64_; 174 | MultiThreadedDLL 175 | AdvancedVectorExtensions 
176 | 177 | 178 | Console 179 | true 180 | true 181 | true 182 | 183 | 184 | 185 | 186 | Level4 187 | true 188 | true 189 | __WINDOWS__; _X86_; _GENERIC_; 190 | true 191 | MaxSpeed 192 | 193 | 194 | UseLinkTimeCodeGeneration 195 | true 196 | 197 | 198 | 199 | 200 | Level4 201 | true 202 | true 203 | __WINDOWS__; _AMD64_; _GENERIC_; 204 | true 205 | AdvancedVectorExtensions 206 | MaxSpeed 207 | 208 | 209 | UseLinkTimeCodeGeneration 210 | true 211 | 212 | 213 | 214 | 215 | 216 | -------------------------------------------------------------------------------- /LatticeCrypto.h: -------------------------------------------------------------------------------- 1 | /**************************************************************************************** 2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library 3 | * 4 | * Copyright (c) Microsoft Corporation. All rights reserved. 5 | * 6 | * 7 | * Abstract: main header file 8 | * 9 | *****************************************************************************************/ 10 | 11 | #ifndef __LatticeCrypto_H__ 12 | #define __LatticeCrypto_H__ 13 | 14 | 15 | // For C++ 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | 26 | // Definition of operating system 27 | 28 | #define OS_WIN 1 29 | #define OS_LINUX 2 30 | 31 | #if defined(__WINDOWS__) // Microsoft Windows OS 32 | #define OS_TARGET OS_WIN 33 | #elif defined(__LINUX__) // Linux OS 34 | #define OS_TARGET OS_LINUX 35 | #else 36 | #error -- "Unsupported OS" 37 | #endif 38 | 39 | 40 | // Definition of compiler 41 | 42 | #define COMPILER_VC 1 43 | #define COMPILER_GCC 2 44 | #define COMPILER_CLANG 3 45 | 46 | #if defined(_MSC_VER) // Microsoft Visual C compiler 47 | #define COMPILER COMPILER_VC 48 | #elif defined(__GNUC__) // GNU GCC compiler 49 | #define COMPILER COMPILER_GCC 50 | #elif defined(__clang__) // Clang compiler 51 | #define COMPILER COMPILER_CLANG 52 | #else 53 | 
#error -- "Unsupported COMPILER" 54 | #endif 55 | 56 | 57 | // Definition of the targeted architecture and basic data types 58 | 59 | #define TARGET_AMD64 1 60 | #define TARGET_x86 2 61 | #define TARGET_ARM 3 62 | 63 | #if defined(_AMD64_) 64 | #define TARGET TARGET_AMD64 65 | #define RADIX 64 66 | typedef uint64_t digit_t; // Unsigned 64-bit digit 67 | typedef int64_t sdigit_t; // Signed 64-bit digit 68 | #elif defined(_X86_) 69 | #define TARGET TARGET_x86 70 | #define RADIX 32 71 | typedef uint32_t digit_t; // Unsigned 32-bit digit 72 | typedef int32_t sdigit_t; // Signed 32-bit digit 73 | #elif defined(_ARM_) 74 | #define TARGET TARGET_ARM 75 | #define RADIX 32 76 | typedef uint32_t digit_t; // Unsigned 32-bit digit 77 | typedef int32_t sdigit_t; // Signed 32-bit digit 78 | #else 79 | #error -- "Unsupported ARCHITECTURE" 80 | #endif 81 | 82 | 83 | // Instruction support 84 | 85 | #define NO_SIMD_SUPPORT 0 86 | #define AVX_SUPPORT 1 87 | #define AVX2_SUPPORT 2 88 | 89 | #if defined(_AVX2_) 90 | #define SIMD_SUPPORT AVX2_SUPPORT // AVX2 support selection 91 | #elif defined(_AVX_) 92 | #define SIMD_SUPPORT AVX_SUPPORT // AVX support selection 93 | #else 94 | #define SIMD_SUPPORT NO_SIMD_SUPPORT 95 | #endif 96 | 97 | #if defined(_ASM_) // Assembly support selection 98 | #define ASM_SUPPORT 99 | #endif 100 | 101 | #if defined(_GENERIC_) // Selection of generic, portable implementation 102 | #define GENERIC_IMPLEMENTATION 103 | #endif 104 | 105 | 106 | // Unsupported configurations 107 | 108 | #if defined(ASM_SUPPORT) && (OS_TARGET == OS_WIN) 109 | #error -- "Assembly is not supported on this platform" 110 | #endif 111 | 112 | #if defined(ASM_SUPPORT) && defined(GENERIC_IMPLEMENTATION) 113 | #error -- "Unsupported configuration" 114 | #endif 115 | 116 | #if (SIMD_SUPPORT != NO_SIMD_SUPPORT) && defined(GENERIC_IMPLEMENTATION) 117 | #error -- "Unsupported configuration" 118 | #endif 119 | 120 | #if (TARGET != TARGET_AMD64) && !defined(GENERIC_IMPLEMENTATION) 121 | 
#error -- "Unsupported configuration" 122 | #endif 123 | 124 | #if (OS_TARGET == OS_LINUX) && defined(ASM_SUPPORT) && (SIMD_SUPPORT != AVX2_SUPPORT) 125 | #error -- "Unsupported configuration" 126 | #endif 127 | 128 | 129 | // Definitions of the error-handling type and error codes 130 | 131 | typedef enum { 132 | CRYPTO_SUCCESS, // 0x00 133 | CRYPTO_ERROR, // 0x01 134 | CRYPTO_ERROR_DURING_TEST, // 0x02 135 | CRYPTO_ERROR_UNKNOWN, // 0x03 136 | CRYPTO_ERROR_NOT_IMPLEMENTED, // 0x04 137 | CRYPTO_ERROR_NO_MEMORY, // 0x05 138 | CRYPTO_ERROR_INVALID_PARAMETER, // 0x06 139 | CRYPTO_ERROR_SHARED_KEY, // 0x07 140 | CRYPTO_ERROR_TOO_MANY_ITERATIONS, // 0x08 141 | CRYPTO_ERROR_END_OF_LIST 142 | } CRYPTO_STATUS; 143 | 144 | #define CRYPTO_STATUS_TYPE_SIZE (CRYPTO_ERROR_END_OF_LIST) 145 | 146 | 147 | // Definitions of the error messages 148 | // NOTE: they must match the error codes above 149 | 150 | #define CRYPTO_MSG_SUCCESS "CRYPTO_SUCCESS" 151 | #define CRYPTO_MSG_ERROR "CRYPTO_ERROR" 152 | #define CRYPTO_MSG_ERROR_DURING_TEST "CRYPTO_ERROR_DURING_TEST" 153 | #define CRYPTO_MSG_ERROR_UNKNOWN "CRYPTO_ERROR_UNKNOWN" 154 | #define CRYPTO_MSG_ERROR_NOT_IMPLEMENTED "CRYPTO_ERROR_NOT_IMPLEMENTED" 155 | #define CRYPTO_MSG_ERROR_NO_MEMORY "CRYPTO_ERROR_NO_MEMORY" 156 | #define CRYPTO_MSG_ERROR_INVALID_PARAMETER "CRYPTO_ERROR_INVALID_PARAMETER" 157 | #define CRYPTO_MSG_ERROR_SHARED_KEY "CRYPTO_ERROR_SHARED_KEY" 158 | #define CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS "CRYPTO_ERROR_TOO_MANY_ITERATIONS" 159 | 160 | 161 | // Definition of type "RandomBytes" to implement callback function outputting "nbytes" of random values to "random_array" 162 | typedef CRYPTO_STATUS (*RandomBytes)(unsigned int nbytes, unsigned char* random_array); 163 | 164 | // Definition of type "ExtendableOutput" to implement callback function outputting 32-bit "array_ndigits" of values to "extended_array" 165 | typedef CRYPTO_STATUS (*ExtendableOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned 
int array_ndigits, uint32_t* extended_array); 166 | 167 | // Definition of type "StreamOutput" to implement callback function outputting 32-bit "array_ndigits" of values to "stream_array" 168 | typedef CRYPTO_STATUS (*StreamOutput)(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array); 169 | 170 | 171 | // Basic key-exchange constants 172 | #define PKA_BYTES 1824 // Alice's public key size 173 | #define PKB_BYTES 2048 // Bob's public key size 174 | #define SHAREDKEY_BYTES 32 // Shared key size 175 | 176 | 177 | // This data struct is initialized during setup with user-provided functions 178 | typedef struct 179 | { 180 | RandomBytes RandomBytesFunction; // Function providing random bytes 181 | ExtendableOutput ExtendableOutputFunction; // Extendable output function 182 | StreamOutput StreamOutputFunction; // Stream cipher function 183 | } LatticeCryptoStruct, *PLatticeCryptoStruct; 184 | 185 | 186 | /******************** Function prototypes *******************/ 187 | /*********************** Auxiliary API **********************/ 188 | 189 | // Clear digits from memory. "nwords" indicates the number of digits to be zeroed. 190 | extern void clear_words(void* mem, digit_t nwords); 191 | 192 | // Output "nbytes" of random values. 193 | // It makes requests of random values to RandomBytesFunction. If successful, the output is given in "random_array". 194 | // The caller is responsible for providing the "RandomBytesFunction" function passing random value as octets. 195 | CRYPTO_STATUS random_bytes(unsigned int nbytes, unsigned char* random_array, RandomBytes RandomBytesFunction); 196 | 197 | // Output "array_ndigits" of values in [0, q-1] using an extendable-output function and a seed of size "seed_nbytes". 198 | // It makes requests of values to ExtendableOutputFunction. If successful, the output is given in "extended_array". 
199 | // The caller is responsible for providing the "ExtendableOutputFunction" function passing values as 32-bit digits. 200 | CRYPTO_STATUS extended_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned int array_ndigits, uint32_t* extended_array, ExtendableOutput ExtendableOutputFunction); 201 | 202 | // Output "array_nbytes" of values using a stream cipher, a seed of size "seed_nbytes" and a nonce of size "nonce_nbytes". 203 | // It makes requests of values to StreamOutputFunction. If successful, the output is given in "stream_array". 204 | // The caller is responsible for providing the "StreamOutputFunction" function passing values as octets. 205 | CRYPTO_STATUS stream_output(const unsigned char* seed, unsigned int seed_nbytes, unsigned char* nonce, unsigned int nonce_nbytes, unsigned int array_nbytes, unsigned char* stream_array, StreamOutput StreamOutputFunction); 206 | 207 | // Dynamic allocation of memory for LatticeCrypto structure. It should be called before initialization with LatticeCrypto_initialize(). Returns NULL on error. 208 | PLatticeCryptoStruct LatticeCrypto_allocate(void); 209 | 210 | // Initialize structure pLatticeCrypto with user-provided functions: RandomBytesFunction, ExtendableOutputFunction and StreamOutputFunction. 211 | CRYPTO_STATUS LatticeCrypto_initialize(PLatticeCryptoStruct pLatticeCrypto, RandomBytes RandomBytesFunction, ExtendableOutput ExtendableOutputFunction, StreamOutput StreamOutputFunction); 212 | 213 | // Output error/success message for a given CRYPTO_STATUS 214 | const char* LatticeCrypto_get_error_message(CRYPTO_STATUS Status); 215 | 216 | /*********************** Key exchange API ***********************/ 217 | 218 | // Alice's key generation 219 | // It produces a private key SecretKeyA and computes the public key PublicKeyA. 
220 | // Outputs: the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total) 221 | // the public key PublicKeyA that occupies 1824 bytes 222 | // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize(). 223 | CRYPTO_STATUS KeyGeneration_A(int32_t* SecretKeyA, unsigned char* PublicKeyA, PLatticeCryptoStruct pLatticeCrypto); 224 | 225 | // Bob's key generation and shared secret computation 226 | // It produces a private key and computes the public key PublicKeyB. In combination with Alice's public key PublicKeyA, it computes 227 | // the shared secret SharedSecretB. 228 | // Input: Alice's public key PublicKeyA that consists of 1824 bytes 229 | // Outputs: the public key PublicKeyB that occupies 2048 bytes. 230 | // the 256-bit shared secret SharedSecretB. 231 | // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize(). 232 | CRYPTO_STATUS SecretAgreement_B(unsigned char* PublicKeyA, unsigned char* SharedSecretB, unsigned char* PublicKeyB, PLatticeCryptoStruct pLatticeCrypto); 233 | 234 | // Alice's shared secret computation 235 | // It computes the shared secret SharedSecretA using Bob's public key PublicKeyB and Alice's private key SecretKeyA. 236 | // Inputs: Bob's public key PublicKeyB that consists of 2048 bytes 237 | // the private key SecretKeyA that consists of a 32-bit signed 1024-element array (4096 bytes in total) 238 | // Output: the 256-bit shared secret SharedSecretA. 239 | // pLatticeCrypto must be set up in advance using LatticeCrypto_initialize(). 
240 | CRYPTO_STATUS SecretAgreement_A(unsigned char* PublicKeyB, int32_t* SecretKeyA, unsigned char* SharedSecretA); 241 | 242 | 243 | #ifdef __cplusplus 244 | } 245 | #endif 246 | 247 | 248 | #endif 249 | -------------------------------------------------------------------------------- /Visual Studio/LatticeCrypto/LatticeCrypto.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Debug 10 | x64 11 | 12 | 13 | Generic 14 | Win32 15 | 16 | 17 | Generic 18 | x64 19 | 20 | 21 | Release 22 | Win32 23 | 24 | 25 | Release 26 | x64 27 | 28 | 29 | 30 | {8283DD76-E88A-4B63-ABDE-33F014178413} 31 | Win32Proj 32 | isoECClib 33 | 34 | 35 | 36 | StaticLibrary 37 | true 38 | v120 39 | Unicode 40 | 41 | 42 | StaticLibrary 43 | true 44 | v120 45 | Unicode 46 | 47 | 48 | StaticLibrary 49 | false 50 | v120 51 | true 52 | Unicode 53 | 54 | 55 | StaticLibrary 56 | false 57 | v120 58 | true 59 | Unicode 60 | 61 | 62 | StaticLibrary 63 | false 64 | v120 65 | true 66 | Unicode 67 | 68 | 69 | StaticLibrary 70 | false 71 | v120 72 | true 73 | Unicode 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | Level4 103 | Disabled 104 | __WINDOWS__; _X86_; _GENERIC_; 105 | ProgramDatabase 106 | false 107 | false 108 | false 109 | Default 110 | MultiThreadedDLL 111 | true 112 | 113 | 114 | Windows 115 | true 116 | 117 | 118 | 119 | 120 | 121 | 122 | Level4 123 | Disabled 124 | __WINDOWS__; _AMD64_; _GENERIC_; 125 | ProgramDatabase 126 | false 127 | false 128 | true 129 | Default 130 | MultiThreadedDLL 131 | 132 | 133 | AdvancedVectorExtensions 134 | 135 | 136 | Windows 137 | true 138 | 139 | 140 | 141 | 142 | Level4 143 | 144 | 145 | MaxSpeed 146 | true 147 | true 148 | __WINDOWS__; _X86_; _GENERIC_; 149 | MultiThreadedDLL 150 | 151 | 152 | Windows 153 | true 154 | true 155 | true 156 | 157 | 158 | 159 | 160 | Level4 161 | 
162 | 163 | MaxSpeed 164 | true 165 | true 166 | __WINDOWS__; _AMD64_; _GENERIC_; 167 | MultiThreadedDLL 168 | AdvancedVectorExtensions 169 | 170 | 171 | Windows 172 | true 173 | true 174 | true 175 | 176 | 177 | 178 | 179 | Level4 180 | 181 | 182 | MaxSpeed 183 | true 184 | true 185 | __WINDOWS__; _X86_; _GENERIC_; 186 | MultiThreadedDLL 187 | 188 | 189 | Windows 190 | true 191 | true 192 | true 193 | 194 | 195 | 196 | 197 | Level4 198 | 199 | 200 | MaxSpeed 201 | true 202 | true 203 | __WINDOWS__; _AMD64_; _GENERIC_; 204 | MultiThreadedDLL 205 | AdvancedVectorExtensions 206 | 207 | 208 | Windows 209 | true 210 | true 211 | true 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /tests/tests.c: -------------------------------------------------------------------------------- 1 | /**************************************************************************************** 2 | * LatticeCrypto: an efficient post-quantum Ring-Learning With Errors cryptography library 3 | * 4 | * Copyright (c) Microsoft Corporation. All rights reserved. 
5 | * 6 | * 7 | * Abstract: testing code 8 | * 9 | *****************************************************************************************/ 10 | 11 | #include "../LatticeCrypto_priv.h" 12 | #include "test_extras.h" 13 | #include 14 | #include 15 | 16 | extern const int32_t psi_rev_ntt1024_12289[PARAMETER_N]; 17 | extern const int32_t omegainv_rev_ntt1024_12289[PARAMETER_N]; 18 | extern const int32_t omegainv7N_rev_ntt1024_12289; 19 | extern const int32_t omegainv10N_rev_ntt1024_12289; 20 | extern const int32_t Ninv8_ntt1024_12289; 21 | extern const int32_t Ninv11_ntt1024_12289; 22 | 23 | // Benchmark and test parameters 24 | #define BENCH_LOOPS 1000 // Number of iterations per bench 25 | #define TEST_LOOPS 100 // Number of iterations per test 26 | 27 | 28 | bool ntt_test() 29 | { // Tests for the NTT functions 30 | bool OK = true; 31 | int n, passed; 32 | int32_t a[PARAMETER_N], b[PARAMETER_N], c[PARAMETER_N], d[PARAMETER_N], e[PARAMETER_N], f[PARAMETER_N], g[PARAMETER_N], ff[PARAMETER_N]; 33 | unsigned int pbits = 14; 34 | 35 | printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 36 | printf("Testing NTT functions: \n\n"); 37 | 38 | passed = 1; 39 | for (n=0; n m=32 35 | mov r9, 1 // m = 1 36 | mov rax, reg_p3 37 | mov r12, reg_p3 38 | shr r12, 4 // n/16 39 | vmovdqu ymm14, MASK12x8 40 | vmovdqu ymm12, PERM0246 41 | mov r14, 16 42 | mov rcx, 11 43 | loop1: 44 | shr rax, 1 // k = k/2 45 | dec rcx 46 | xor rdx, rdx // i = 0 47 | loop2: 48 | mov r10, rdx 49 | mov r11, rax 50 | dec r11 51 | shl r10, cl // j1 52 | add r11, r10 // j2 53 | mov r13, r9 54 | add r13, rdx // m+i 55 | vbroadcastss ymm11, DWORD PTR [reg_p2+4*r13] // S 56 | 57 | loop3: 58 | mov r13, r10 59 | add r13, rax // j+k 60 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r13] // a[j+k] 61 | vpmovsxdq ymm3, XMMWORD PTR [reg_p1+4*r13+16] // a[j+k] 62 | vpmovsxdq ymm5, XMMWORD PTR [reg_p1+4*r13+32] // a[j+k] 63 | vpmovsxdq ymm7, XMMWORD PTR 
[reg_p1+4*r13+48] // a[j+k] 64 | 65 | vpmuldq ymm1, ymm1, ymm11 // a[j+k].S 66 | vpmuldq ymm3, ymm3, ymm11 67 | vpmuldq ymm5, ymm5, ymm11 68 | vpmuldq ymm7, ymm7, ymm11 69 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] 70 | 71 | vmovdqu ymm13, ymm1 72 | vpand ymm1, ymm14, ymm1 // c0 73 | vpsrlq ymm13, ymm13, 12 // c1 74 | vpslld ymm15, ymm1, 1 // 2*c0 75 | vpsubd ymm13, ymm1, ymm13 // c0-c1 76 | vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 77 | vpsubd ymm1, ymm0, ymm13 // a[j+k] = U - V 78 | vpaddd ymm0, ymm0, ymm13 // a[j] = U + V 79 | vpermd ymm1, ymm12, ymm1 80 | vpermd ymm0, ymm12, ymm0 81 | vpmovsxdq ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j] 82 | 83 | vmovdqu ymm13, ymm3 84 | vpand ymm3, ymm14, ymm3 // c0 85 | vpsrlq ymm13, ymm13, 12 // c1 86 | vpslld ymm15, ymm3, 1 // 2*c0 87 | vpsubd ymm13, ymm3, ymm13 // c0-c1 88 | vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 89 | vpsubd ymm3, ymm2, ymm13 // a[j+k] = U - V 90 | vpaddd ymm2, ymm2, ymm13 // a[j] = U + V 91 | vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 92 | vmovdqu XMMWORD PTR [reg_p1+4*r13], xmm1 93 | vpermd ymm3, ymm12, ymm3 94 | vpermd ymm2, ymm12, ymm2 95 | vpmovsxdq ymm4, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j] 96 | 97 | vmovdqu ymm13, ymm5 98 | vpand ymm5, ymm14, ymm5 // c0 99 | vpsrlq ymm13, ymm13, 12 // c1 100 | vpslld ymm15, ymm5, 1 // 2*c0 101 | vpsubd ymm13, ymm5, ymm13 // c0-c1 102 | vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 103 | vpsubd ymm5, ymm4, ymm13 // a[j+k] = U - V 104 | vpaddd ymm4, ymm4, ymm13 // a[j] = U + V 105 | vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 106 | vmovdqu XMMWORD PTR [reg_p1+4*r13+16], xmm3 107 | vpermd ymm5, ymm12, ymm5 108 | vpermd ymm4, ymm12, ymm4 109 | vpmovsxdq ymm6, XMMWORD PTR [reg_p1+4*r10+48] // U = a[j] 110 | 111 | vmovdqu ymm13, ymm7 112 | vpand ymm7, ymm14, ymm7 // c0 113 | vpsrlq ymm13, ymm13, 12 // c1 114 | vpslld ymm15, ymm7, 1 // 2*c0 115 | vpsubd ymm13, ymm7, ymm13 // c0-c1 116 | vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 117 | vpsubd ymm7, ymm6, 
ymm13 // a[j+k] = U - V 118 | vpaddd ymm6, ymm6, ymm13 // a[j] = U + V 119 | vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm4 120 | vmovdqu XMMWORD PTR [reg_p1+4*r13+32], xmm5 121 | vpermd ymm6, ymm12, ymm6 122 | vpermd ymm7, ymm12, ymm7 123 | vmovdqu XMMWORD PTR [reg_p1+4*r13+48], xmm7 124 | vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm6 125 | 126 | add r10, r14 127 | cmp r10, r11 128 | jl loop3 129 | inc rdx 130 | cmp rdx, r9 131 | jl loop2 132 | shl r9, 1 133 | cmp r9, r12 134 | jl loop1 135 | 136 | // Stage m=64 137 | xor rdx, rdx // i = 0 138 | xor r10, r10 // j1 = 0 139 | loop4: 140 | vbroadcastss ymm11, DWORD PTR [reg_p2+4*rdx+4*64] // S 141 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+32] // a[j+k] 142 | vpmovsxdq ymm3, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k] 143 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] 144 | vpmovsxdq ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j] 145 | vpmuldq ymm1, ymm1, ymm11 // a[j+k].S 146 | vpmuldq ymm3, ymm3, ymm11 // a[j+k].S 147 | 148 | vmovdqu ymm13, ymm1 149 | vpand ymm1, ymm14, ymm1 // c0 150 | vpsrlq ymm13, ymm13, 12 // c1 151 | vpslld ymm15, ymm1, 1 // 2*c0 152 | vpsubd ymm13, ymm1, ymm13 // c0-c1 153 | vpaddd ymm13, ymm13, ymm15 // V = 3*c0-c1 154 | 155 | vmovdqu ymm10, ymm3 156 | vpand ymm3, ymm14, ymm3 // c0 157 | vpsrlq ymm10, ymm10, 12 // c1 158 | vpslld ymm15, ymm3, 1 // 2*c0 159 | vpsubd ymm10, ymm3, ymm10 // c0-c1 160 | vpaddd ymm10, ymm10, ymm15 // V = 3*c0-c1 161 | 162 | vpsubd ymm1, ymm0, ymm13 // a[j+k] = U - V 163 | vpaddd ymm0, ymm0, ymm13 // a[j] = U + V 164 | vpsubd ymm3, ymm2, ymm10 // a[j+k] = U - V 165 | vpaddd ymm2, ymm2, ymm10 // a[j] = U + V 166 | 167 | vpermd ymm0, ymm12, ymm0 168 | vpermd ymm1, ymm12, ymm1 169 | vpermd ymm2, ymm12, ymm2 170 | vpermd ymm3, ymm12, ymm3 171 | vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 172 | vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm1 173 | vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 174 | vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm3 175 | 176 | add r10, r14 // j+16 
177 | inc rdx // i+1 178 | cmp rdx, r9 179 | jl loop4 180 | 181 | // Stage m=128 182 | shl r9, 1 183 | xor rdx, rdx // i = 0 184 | xor r10, r10 // j1 = 0 185 | mov r13, 8 186 | loop6: 187 | vbroadcastss ymm2, DWORD PTR [reg_p2+4*rdx+4*128] // S 188 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k] 189 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] 190 | vpmuldq ymm1, ymm1, ymm2 // a[j+k].S 191 | 192 | vmovdqu ymm3, ymm0 193 | vpand ymm0, ymm14, ymm0 // c0 194 | vpsrad ymm3, ymm3, 12 // c1 195 | vpslld ymm4, ymm0, 1 // 2*c0 196 | vpsubd ymm3, ymm0, ymm3 // c0-c1 197 | vpaddd ymm0, ymm3, ymm4 // U = 3*c0-c1 198 | 199 | vmovdqu ymm3, ymm1 200 | vpand ymm1, ymm14, ymm1 // c0 201 | vpsrlq ymm4, ymm3, 24 // c2 202 | vpsrad ymm3, ymm3, 12 // xc1 203 | vpand ymm3, ymm14, ymm3 // c1 204 | vpslld ymm5, ymm1, 3 // 8*c0 205 | vpaddd ymm4, ymm1, ymm4 // c0+c2 206 | vpaddd ymm4, ymm4, ymm5 // 9*c0+c2 207 | vpslld ymm5, ymm3, 1 // 2*c1 208 | vpaddd ymm1, ymm0, ymm3 // U+c1 209 | vpsubd ymm0, ymm0, ymm3 // U-c1 210 | vpsubd ymm4, ymm4, ymm5 // 9*c0-2*c1+c2 211 | vpaddd ymm0, ymm0, ymm4 // U+(9*c0-3*c1+c2) 212 | vpsubd ymm1, ymm1, ymm4 // U-(9*c0-3*c1+c2) 213 | vpermd ymm0, ymm12, ymm0 214 | vpermd ymm1, ymm12, ymm1 215 | vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 216 | vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm1 217 | 218 | add r10, r13 // j+8 219 | inc rdx // i+1 220 | cmp rdx, r9 221 | jl loop6 222 | 223 | // Stage m=256 224 | vmovdqu ymm9, PERM02134657 225 | shl r9, 1 226 | xor rdx, rdx // i = 0 227 | xor r10, r10 // j1 = 0 228 | mov r14, 32 229 | loop7: 230 | vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256] // S = psi[m+i]->psi[m+i+3] 231 | vpermq ymm8, ymm2, 0x50 232 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j]->a[j+3] 233 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+16] // a[j+k]->a[j+k+3] 234 | vpermq ymm3, ymm0, 0x4e 235 | vinserti128 ymm0, ymm0, xmm1, 1 // U 236 | vpblendd ymm1, ymm1, ymm3, 15 237 | vpmuldq ymm3, ymm1, ymm8 // a[j+k].S 238 | 
vmovdqu ymm4, ymm3 239 | vpand ymm3, ymm14, ymm3 // c0 240 | vpsrlq ymm4, ymm4, 12 // c1 241 | vpslld ymm5, ymm3, 1 // 2*c0 242 | vpsubd ymm4, ymm3, ymm4 // c0-c1 243 | vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 244 | vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V 245 | vpaddd ymm0, ymm0, ymm4 // a[j] = U + V 246 | vpslldq ymm1, ymm1, 4 247 | vpblendd ymm0, ymm0, ymm1, 0xaa 248 | vpermd ymm0, ymm9, ymm0 249 | vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0 250 | 251 | vpermq ymm8, ymm2, 0xfa 252 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j]->a[j+3] 253 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+48] // a[j+k]->a[j+k+3] 254 | vpermq ymm3, ymm0, 0x4e 255 | vinserti128 ymm0, ymm0, xmm1, 1 // U 256 | vpblendd ymm1, ymm1, ymm3, 15 257 | vpmuldq ymm3, ymm1, ymm8 // a[j+k].S 258 | vmovdqu ymm4, ymm3 259 | vpand ymm3, ymm14, ymm3 // c0 260 | vpsrlq ymm4, ymm4, 12 // c1 261 | vpslld ymm5, ymm3, 1 // 2*c0 262 | vpsubd ymm4, ymm3, ymm4 // c0-c1 263 | vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 264 | vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V 265 | vpaddd ymm0, ymm0, ymm4 // a[j] = U + V 266 | vpslldq ymm1, ymm1, 4 267 | vpblendd ymm0, ymm0, ymm1, 0xaa 268 | vpermd ymm0, ymm9, ymm0 269 | vmovdqu YMMWORD PTR [reg_p1+4*r10+32], ymm0 270 | 271 | vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*rdx+4*256+16] // S = psi[m+i]->psi[m+i+3] 272 | vpermq ymm8, ymm2, 0x50 273 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10+64] // U = a[j]->a[j+3] 274 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+80] // a[j+k]->a[j+k+3] 275 | vpermq ymm3, ymm0, 0x4e 276 | vinserti128 ymm0, ymm0, xmm1, 1 // U 277 | vpblendd ymm1, ymm1, ymm3, 15 278 | vpmuldq ymm3, ymm1, ymm8 // a[j+k].S 279 | vmovdqu ymm4, ymm3 280 | vpand ymm3, ymm14, ymm3 // c0 281 | vpsrlq ymm4, ymm4, 12 // c1 282 | vpslld ymm5, ymm3, 1 // 2*c0 283 | vpsubd ymm4, ymm3, ymm4 // c0-c1 284 | vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 285 | vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V 286 | vpaddd ymm0, ymm0, ymm4 // a[j] = U + V 287 | vpslldq ymm1, ymm1, 4 288 | vpblendd 
ymm0, ymm0, ymm1, 0xaa 289 | vpermd ymm0, ymm9, ymm0 290 | vmovdqu YMMWORD PTR [reg_p1+4*r10+64], ymm0 291 | 292 | vpermq ymm8, ymm2, 0xfa 293 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10+96] // U = a[j]->a[j+3] 294 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+112] // a[j+k]->a[j+k+3] 295 | vpermq ymm3, ymm0, 0x4e 296 | vinserti128 ymm0, ymm0, xmm1, 1 // U 297 | vpblendd ymm1, ymm1, ymm3, 15 298 | vpmuldq ymm3, ymm1, ymm8 // a[j+k].S 299 | vmovdqu ymm4, ymm3 300 | vpand ymm3, ymm14, ymm3 // c0 301 | vpsrlq ymm4, ymm4, 12 // c1 302 | vpslld ymm5, ymm3, 1 // 2*c0 303 | vpsubd ymm4, ymm3, ymm4 // c0-c1 304 | vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 305 | vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V 306 | vpaddd ymm0, ymm0, ymm4 // a[j] = U + V 307 | vpslldq ymm1, ymm1, 4 308 | vpblendd ymm0, ymm0, ymm1, 0xaa 309 | vpermd ymm0, ymm9, ymm0 310 | vmovdqu YMMWORD PTR [reg_p1+4*r10+96], ymm0 311 | 312 | add r10, r14 // j+32 313 | add rdx, r13 // i+8 314 | cmp rdx, r9 315 | jl loop7 316 | 317 | // Stage m=512 318 | vmovdqu ymm9, PERM00224466 319 | shl r9, 1 // m = n/2 320 | xor rdx, rdx // i = 0 321 | xor r10, r10 // j1 = 0 322 | mov r14, 4 323 | loop8: 324 | vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*rdx+4*512] // S 325 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10] // U = a[j] 326 | vmovdqu ymm1, YMMWORD PTR [reg_p1+4*r10+4] // a[j+k] 327 | vpmuldq ymm3, ymm1, ymm2 // a[j+k].S 328 | vmovdqu ymm4, ymm3 329 | vpand ymm3, ymm14, ymm3 // c0 330 | vpsrlq ymm4, ymm4, 12 // c1 331 | vpslld ymm5, ymm3, 1 // 2*c0 332 | vpsubd ymm4, ymm3, ymm4 // c0-c1 333 | vpaddd ymm4, ymm4, ymm5 // V = 3*c0-c1 334 | vpsubd ymm1, ymm0, ymm4 // a[j+k] = U - V 335 | vpaddd ymm0, ymm0, ymm4 // a[j] = U + V 336 | vpermd ymm1, ymm9, ymm1 337 | vpblendd ymm0, ymm0, ymm1, 0xaa 338 | vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0 339 | 340 | add r10, r13 // j+8 341 | add rdx, r14 // i+4 342 | cmp rdx, r9 343 | jl loop8 344 | 345 | pop r14 346 | pop r13 347 | pop r12 348 | ret 349 | 350 | 351 | 
//*********************************************************************** 352 | // Inverse NTT 353 | // Operation: a [reg_p1] <- INTT(a) [reg_p1], 354 | // [reg_p2] points to table 355 | // reg_p3 and reg_p4 point to constants for scaling and 356 | // reg_p5 contains parameter n 357 | //*********************************************************************** 358 | .global INTT_GS_rev2std_12289_asm 359 | INTT_GS_rev2std_12289_asm: 360 | push r12 361 | push r13 362 | push r14 363 | push r15 364 | push rbx 365 | 366 | // Stage m=1024 367 | vmovdqu ymm9, PERM00224466 368 | vmovdqu ymm14, MASK12x8 369 | mov r12, reg_p5 370 | shr r12, 1 // n/2 = 512 371 | xor r15, r15 // i = 0 372 | xor r10, r10 // j1 = 0 373 | mov r13, 8 374 | mov r14, 4 375 | loop1b: 376 | vmovdqu ymm1, YMMWORD PTR [reg_p1+4*r10+4] // V = a[j+k] 377 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10] // U = a[j] 378 | vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*r15+4*512] // S 379 | vpsubd ymm3, ymm0, ymm1 // U - V 380 | vpaddd ymm0, ymm0, ymm1 // U + V 381 | vpmuldq ymm3, ymm3, ymm2 // (U - V).S 382 | vmovdqu ymm4, ymm3 383 | vpand ymm3, ymm14, ymm3 // c0 384 | vpsrlq ymm4, ymm4, 12 // c1 385 | vpslld ymm5, ymm3, 1 // 2*c0 386 | vpsubd ymm4, ymm3, ymm4 // c0-c1 387 | vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 388 | vpermd ymm1, ymm9, ymm1 389 | vpblendd ymm0, ymm0, ymm1, 0xaa 390 | vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0 391 | 392 | add r10, r13 // j+8 393 | add r15, r14 // i+4 394 | cmp r15, r12 395 | jl loop1b 396 | 397 | // Stage m=512 398 | vmovdqu ymm9, PERM02134657 399 | vmovdqu ymm13, PERM0145 400 | vmovdqu ymm15, PERM2367 401 | shr r12, 1 // n/4 = 256 402 | xor r15, r15 // i = 0 403 | xor r10, r10 // j1 = 0 404 | mov r14, 32 405 | loop2b: 406 | vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*r15+4*256] // S = psi[m+i]->psi[m+i+3] 407 | vpermq ymm8, ymm2, 0x50 408 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10] // U = a[j]->a[j+7] 409 | vpermd ymm1, ymm15, ymm0 410 | vpermd ymm0, ymm13, ymm0 411 | vpsubd ymm3, ymm0, ymm1 // U - V 
412 | vpaddd ymm0, ymm0, ymm1 // U + V 413 | vpmuldq ymm3, ymm3, ymm8 // (U - V).S 414 | vmovdqu ymm4, ymm3 415 | vpand ymm3, ymm14, ymm3 // c0 416 | vpsrlq ymm4, ymm4, 12 // c1 417 | vpslld ymm5, ymm3, 1 // 2*c0 418 | vpsubd ymm4, ymm3, ymm4 // c0-c1 419 | vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 420 | vpslldq ymm1, ymm1, 4 421 | vpblendd ymm0, ymm0, ymm1, 0xaa 422 | vpermd ymm0, ymm9, ymm0 423 | vmovdqu YMMWORD PTR [reg_p1+4*r10], ymm0 424 | 425 | vpermq ymm8, ymm2, 0xfa 426 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10+32] // U = a[j]->a[j+7] 427 | vpermd ymm1, ymm15, ymm0 428 | vpermd ymm0, ymm13, ymm0 429 | vpsubd ymm3, ymm0, ymm1 // U - V 430 | vpaddd ymm0, ymm0, ymm1 // U + V 431 | vpmuldq ymm3, ymm3, ymm8 // (U - V).S 432 | vmovdqu ymm4, ymm3 433 | vpand ymm3, ymm14, ymm3 // c0 434 | vpsrlq ymm4, ymm4, 12 // c1 435 | vpslld ymm5, ymm3, 1 // 2*c0 436 | vpsubd ymm4, ymm3, ymm4 // c0-c1 437 | vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 438 | vpslldq ymm1, ymm1, 4 439 | vpblendd ymm0, ymm0, ymm1, 0xaa 440 | vpermd ymm0, ymm9, ymm0 441 | vmovdqu YMMWORD PTR [reg_p1+4*r10+32], ymm0 442 | 443 | vpmovsxdq ymm2, XMMWORD PTR [reg_p2+4*r15+4*256+16]// S = psi[m+i]->psi[m+i+3] 444 | vpermq ymm8, ymm2, 0x50 445 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10+64] // U = a[j]->a[j+7] 446 | vpermd ymm1, ymm15, ymm0 447 | vpermd ymm0, ymm13, ymm0 448 | vpsubd ymm3, ymm0, ymm1 // U - V 449 | vpaddd ymm0, ymm0, ymm1 // U + V 450 | vpmuldq ymm3, ymm3, ymm8 // (U - V).S 451 | vmovdqu ymm4, ymm3 452 | vpand ymm3, ymm14, ymm3 // c0 453 | vpsrlq ymm4, ymm4, 12 // c1 454 | vpslld ymm5, ymm3, 1 // 2*c0 455 | vpsubd ymm4, ymm3, ymm4 // c0-c1 456 | vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 457 | vpslldq ymm1, ymm1, 4 458 | vpblendd ymm0, ymm0, ymm1, 0xaa 459 | vpermd ymm0, ymm9, ymm0 460 | vmovdqu YMMWORD PTR [reg_p1+4*r10+64], ymm0 461 | 462 | vpermq ymm8, ymm2, 0xfa 463 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*r10+96] // U = a[j]->a[j+7] 464 | vpermd ymm1, ymm15, ymm0 465 | vpermd ymm0, ymm13, ymm0 466 | vpsubd ymm3, 
ymm0, ymm1 // U - V 467 | vpaddd ymm0, ymm0, ymm1 // U + V 468 | vpmuldq ymm3, ymm3, ymm8 // (U - V).S 469 | vmovdqu ymm4, ymm3 470 | vpand ymm3, ymm14, ymm3 // c0 471 | vpsrlq ymm4, ymm4, 12 // c1 472 | vpslld ymm5, ymm3, 1 // 2*c0 473 | vpsubd ymm4, ymm3, ymm4 // c0-c1 474 | vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 475 | vpslldq ymm1, ymm1, 4 476 | vpblendd ymm0, ymm0, ymm1, 0xaa 477 | vpermd ymm0, ymm9, ymm0 478 | vmovdqu YMMWORD PTR [reg_p1+4*r10+96], ymm0 479 | 480 | add r10, r14 // j+32 481 | add r15, r13 // i+8 482 | cmp r15, r12 483 | jl loop2b 484 | 485 | // Stage m=256 486 | vmovdqu ymm12, PERM0246 487 | shr r12, 1 // n/8 = 128 488 | xor r15, r15 // i = 0 489 | xor r10, r10 // j1 = 0 490 | loop3b: 491 | vbroadcastss ymm2, DWORD PTR [reg_p2+4*r15+4*128] // S 492 | vpmovsxdq ymm1, XMMWORD PTR [reg_p1+4*r10+16] // V = a[j+k] 493 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] 494 | vpsubd ymm3, ymm0, ymm1 // U - V 495 | vpaddd ymm0, ymm0, ymm1 // U + V 496 | vpmuldq ymm3, ymm3, ymm2 // (U - V).S 497 | vmovdqu ymm4, ymm3 498 | vpand ymm3, ymm14, ymm3 // c0 499 | vpsrlq ymm4, ymm4, 12 // c1 500 | vpslld ymm5, ymm3, 1 // 2*c0 501 | vpsubd ymm4, ymm3, ymm4 // c0-c1 502 | vpaddd ymm1, ymm4, ymm5 // 3*c0-c1 503 | vpermd ymm0, ymm12, ymm0 504 | vpermd ymm1, ymm12, ymm1 505 | vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 506 | vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm1 507 | 508 | add r10, r13 // j+8 509 | inc r15 // i+1 510 | cmp r15, r12 511 | jl loop3b 512 | 513 | // Stage m=128 514 | shr r12, 1 // n/16 = 64 515 | xor r15, r15 // i = 0 516 | xor r10, r10 // j1 = 0 517 | mov r14, 16 518 | loop4b: 519 | vbroadcastss ymm11, DWORD PTR [reg_p2+4*r15+4*64] // S 520 | vpmovsxdq ymm13, XMMWORD PTR [reg_p1+4*r10+32] // V = a[j+k] 521 | vpmovsxdq ymm15, XMMWORD PTR [reg_p1+4*r10+48] // V = a[j+k] 522 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] 523 | vpmovsxdq ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j] 524 | vpsubd ymm1, ymm0, ymm13 // U - V 525 | vpaddd 
ymm0, ymm0, ymm13 // U + V 526 | vpsubd ymm3, ymm2, ymm15 // U - V 527 | vpaddd ymm2, ymm2, ymm15 // U + V 528 | vpmuldq ymm1, ymm1, ymm11 // (U - V).S 529 | vpmuldq ymm3, ymm3, ymm11 // (U - V).S 530 | 531 | vmovdqu ymm13, ymm1 532 | vpand ymm1, ymm14, ymm1 // c0 533 | vpsrlq ymm13, ymm13, 12 // c1 534 | vpslld ymm15, ymm1, 1 // 2*c0 535 | vpsubd ymm13, ymm1, ymm13 // c0-c1 536 | vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 537 | 538 | vmovdqu ymm13, ymm3 539 | vpand ymm3, ymm14, ymm3 // c0 540 | vpsrlq ymm13, ymm13, 12 // c1 541 | vpslld ymm15, ymm3, 1 // 2*c0 542 | vpsubd ymm13, ymm3, ymm13 // c0-c1 543 | vpaddd ymm3, ymm13, ymm15 // 3*c0-c1 544 | 545 | vpermd ymm0, ymm12, ymm0 546 | vpermd ymm1, ymm12, ymm1 547 | vpermd ymm2, ymm12, ymm2 548 | vpermd ymm3, ymm12, ymm3 549 | vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 550 | vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm1 551 | vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 552 | vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm3 553 | 554 | add r10, r14 // j+16 555 | inc r15 // i+1 556 | cmp r15, r12 557 | jl loop4b 558 | 559 | // Stages m=64 -> m=4 560 | mov r9, 5 // 5 iterations 561 | mov rax, 8 562 | loop5b: 563 | shl rax, 1 // k = 2*k 564 | shr r12, 1 // m/2 565 | xor r15, r15 // i = 0 566 | xor r8, r8 567 | loop6b: 568 | mov r10, r8 // Load j1 569 | mov r11, rax 570 | dec r11 571 | add r11, r10 // j2 572 | mov r13, r12 573 | add r13, r15 // m/2+i 574 | vbroadcastss ymm9, DWORD PTR [reg_p2+4*r13] // S 575 | mov rbx, 4 576 | 577 | loop7b: 578 | mov r13, r10 579 | add r13, rax // j+k 580 | vpmovsxdq ymm10, XMMWORD PTR [reg_p1+4*r13] // V = a[j+k] 581 | vpmovsxdq ymm11, XMMWORD PTR [reg_p1+4*r13+16] // V = a[j+k] 582 | vpmovsxdq ymm13, XMMWORD PTR [reg_p1+4*r13+32] // V = a[j+k] 583 | vpmovsxdq ymm15, XMMWORD PTR [reg_p1+4*r13+48] // V = a[j+k] 584 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] 585 | vpmovsxdq ymm2, XMMWORD PTR [reg_p1+4*r10+16] // U = a[j] 586 | vpmovsxdq ymm4, XMMWORD PTR [reg_p1+4*r10+32] // U = a[j] 587 | 
vpmovsxdq ymm6, XMMWORD PTR [reg_p1+4*r10+48] // U = a[j] 588 | 589 | vpsubd ymm1, ymm0, ymm10 // U - V 590 | vpaddd ymm0, ymm0, ymm10 // U + V 591 | vpsubd ymm3, ymm2, ymm11 // U - V 592 | vpaddd ymm2, ymm2, ymm11 // U + V 593 | vpsubd ymm5, ymm4, ymm13 // U - V 594 | vpaddd ymm4, ymm4, ymm13 // U + V 595 | vpsubd ymm7, ymm6, ymm15 // U - V 596 | vpaddd ymm6, ymm6, ymm15 // U + V 597 | 598 | vpmuldq ymm1, ymm1, ymm9 // (U - V).S 599 | vpmuldq ymm3, ymm3, ymm9 600 | vpmuldq ymm5, ymm5, ymm9 601 | vpmuldq ymm7, ymm7, ymm9 602 | 603 | vmovdqu ymm13, ymm1 604 | vpand ymm1, ymm14, ymm1 // c0 605 | vpsrlq ymm13, ymm13, 12 // c1 606 | vpslld ymm15, ymm1, 1 // 2*c0 607 | vpsubd ymm13, ymm1, ymm13 // c0-c1 608 | vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 609 | 610 | cmp r9, rbx 611 | jne skip1 612 | vmovdqu ymm13, ymm0 613 | vpand ymm0, ymm14, ymm0 // c0 614 | vpsrad ymm13, ymm13, 12 // c1 615 | vpslld ymm15, ymm0, 1 // 2*c0 616 | vpsubd ymm13, ymm0, ymm13 // c0-c1 617 | vpaddd ymm0, ymm13, ymm15 // 3*c0-c1 618 | 619 | vmovdqu ymm13, ymm1 620 | vpand ymm1, ymm14, ymm1 // c0 621 | vpsrad ymm13, ymm13, 12 // c1 622 | vpslld ymm15, ymm1, 1 // 2*c0 623 | vpsubd ymm13, ymm1, ymm13 // c0-c1 624 | vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 625 | skip1: 626 | vpermd ymm1, ymm12, ymm1 627 | vpermd ymm0, ymm12, ymm0 628 | 629 | vmovdqu ymm13, ymm3 630 | vpand ymm3, ymm14, ymm3 // c0 631 | vpsrlq ymm13, ymm13, 12 // c1 632 | vpslld ymm15, ymm3, 1 // 2*c0 633 | vpsubd ymm13, ymm3, ymm13 // c0-c1 634 | vpaddd ymm3, ymm13, ymm15 // 3*c0-c1 635 | vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 636 | vmovdqu XMMWORD PTR [reg_p1+4*r13], xmm1 637 | 638 | cmp r9, rbx 639 | jne skip2 640 | vmovdqu ymm13, ymm2 641 | vpand ymm2, ymm14, ymm2 // c0 642 | vpsrad ymm13, ymm13, 12 // c1 643 | vpslld ymm15, ymm2, 1 // 2*c0 644 | vpsubd ymm13, ymm2, ymm13 // c0-c1 645 | vpaddd ymm2, ymm13, ymm15 // 3*c0-c1 646 | 647 | vmovdqu ymm13, ymm3 648 | vpand ymm3, ymm14, ymm3 // c0 649 | vpsrad ymm13, ymm13, 12 // c1 650 | 
vpslld ymm15, ymm3, 1 // 2*c0 651 | vpsubd ymm13, ymm3, ymm13 // c0-c1 652 | vpaddd ymm3, ymm13, ymm15 // 3*c0-c1 653 | skip2: 654 | vpermd ymm3, ymm12, ymm3 655 | vpermd ymm2, ymm12, ymm2 656 | 657 | vmovdqu ymm13, ymm5 658 | vpand ymm5, ymm14, ymm5 // c0 659 | vpsrlq ymm13, ymm13, 12 // c1 660 | vpslld ymm15, ymm5, 1 // 2*c0 661 | vpsubd ymm13, ymm5, ymm13 // c0-c1 662 | vpaddd ymm5, ymm13, ymm15 // 3*c0-c1 663 | vmovdqu XMMWORD PTR [reg_p1+4*r10+16], xmm2 664 | vmovdqu XMMWORD PTR [reg_p1+4*r13+16], xmm3 665 | 666 | cmp r9, rbx 667 | jne skip3 668 | vmovdqu ymm13, ymm4 669 | vpand ymm4, ymm14, ymm4 // c0 670 | vpsrad ymm13, ymm13, 12 // c1 671 | vpslld ymm15, ymm4, 1 // 2*c0 672 | vpsubd ymm13, ymm4, ymm13 // c0-c1 673 | vpaddd ymm4, ymm13, ymm15 // 3*c0-c1 674 | 675 | vmovdqu ymm13, ymm5 676 | vpand ymm5, ymm14, ymm5 // c0 677 | vpsrad ymm13, ymm13, 12 // c1 678 | vpslld ymm15, ymm5, 1 // 2*c0 679 | vpsubd ymm13, ymm5, ymm13 // c0-c1 680 | vpaddd ymm5, ymm13, ymm15 // 3*c0-c1 681 | skip3: 682 | vpermd ymm5, ymm12, ymm5 683 | vpermd ymm4, ymm12, ymm4 684 | 685 | vmovdqu ymm13, ymm7 686 | vpand ymm7, ymm14, ymm7 // c0 687 | vpsrlq ymm13, ymm13, 12 // c1 688 | vpslld ymm15, ymm7, 1 // 2*c0 689 | vpsubd ymm13, ymm7, ymm13 // c0-c1 690 | vpaddd ymm7, ymm13, ymm15 // 3*c0-c1 691 | vmovdqu XMMWORD PTR [reg_p1+4*r10+32], xmm4 692 | vmovdqu XMMWORD PTR [reg_p1+4*r13+32], xmm5 693 | 694 | cmp r9, rbx 695 | jne skip4 696 | vmovdqu ymm13, ymm6 697 | vpand ymm6, ymm14, ymm6 // c0 698 | vpsrad ymm13, ymm13, 12 // c1 699 | vpslld ymm15, ymm6, 1 // 2*c0 700 | vpsubd ymm13, ymm6, ymm13 // c0-c1 701 | vpaddd ymm6, ymm13, ymm15 // 3*c0-c1 702 | 703 | vmovdqu ymm13, ymm7 704 | vpand ymm7, ymm14, ymm7 // c0 705 | vpsrad ymm13, ymm13, 12 // c1 706 | vpslld ymm15, ymm7, 1 // 2*c0 707 | vpsubd ymm13, ymm7, ymm13 // c0-c1 708 | vpaddd ymm7, ymm13, ymm15 // 3*c0-c1 709 | skip4: 710 | vpermd ymm7, ymm12, ymm7 711 | vpermd ymm6, ymm12, ymm6 712 | vmovdqu XMMWORD PTR [reg_p1+4*r13+48], 
xmm7 713 | vmovdqu XMMWORD PTR [reg_p1+4*r10+48], xmm6 714 | 715 | add r10, r14 716 | cmp r10, r11 717 | jl loop7b 718 | mov rbx, rax 719 | shl rbx, 1 // 2*k 720 | add r8, rbx // j1+2*k 721 | inc r15 722 | cmp r15, r12 723 | jl loop6b 724 | dec r9 725 | jnz loop5b 726 | 727 | // Scaling step 728 | shl rax, 1 // k = 2*k = 512 729 | xor r10, r10 // j = 0 730 | mov r14, 4 731 | movq xmm0, reg_p3 732 | vbroadcastsd ymm10, xmm0 // S = omegainv1N_rev 733 | movq xmm0, reg_p4 734 | vbroadcastsd ymm11, xmm0 // T = Ninv 735 | loop8b: 736 | vpmovsxdq ymm13, XMMWORD PTR [reg_p1+4*r10+4*512] // V = a[j+k] 737 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*r10] // U = a[j] 738 | vpsubd ymm1, ymm0, ymm13 // U - V 739 | vpaddd ymm0, ymm0, ymm13 // U + V 740 | vpmuldq ymm1, ymm1, ymm10 // (U - V).S 741 | vpmuldq ymm0, ymm0, ymm11 // (U + V).T 742 | 743 | vmovdqu ymm13, ymm0 744 | vpand ymm0, ymm14, ymm0 // c0 745 | vpsrlq ymm13, ymm13, 12 // c1 746 | vpslld ymm15, ymm0, 1 // 2*c0 747 | vpsubd ymm13, ymm0, ymm13 // c0-c1 748 | vpaddd ymm0, ymm13, ymm15 // 3*c0-c1 749 | 750 | vmovdqu ymm13, ymm1 751 | vpand ymm1, ymm14, ymm1 // c0 752 | vpsrlq ymm13, ymm13, 12 // c1 753 | vpslld ymm15, ymm1, 1 // 2*c0 754 | vpsubd ymm13, ymm1, ymm13 // c0-c1 755 | vpaddd ymm1, ymm13, ymm15 // 3*c0-c1 756 | 757 | vpermd ymm0, ymm12, ymm0 758 | vpermd ymm1, ymm12, ymm1 759 | vmovdqu XMMWORD PTR [reg_p1+4*r10], xmm0 760 | vmovdqu XMMWORD PTR [reg_p1+4*r10+4*512], xmm1 761 | 762 | add r10, r14 // j+4 763 | cmp r10, rax 764 | jl loop8b 765 | loop9b: 766 | pop rbx 767 | pop r15 768 | pop r14 769 | pop r13 770 | pop r12 771 | ret 772 | 773 | 774 | //*********************************************************************** 775 | // Component-wise multiplication and addition 776 | // Operation: d [reg_p4] <- a [reg_p1] * b [reg_p2] + c [reg_p3] 777 | // reg_p5 contains parameter n 778 | //*********************************************************************** 779 | .global pmuladd_asm 780 | pmuladd_asm: 781 | vmovdqu 
ymm5, PERM0246 782 | vmovdqu ymm6, MASK12x8 783 | xor rax, rax 784 | movq r11, 4 785 | lazo2: 786 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*rax] // a 787 | vpmovsxdq ymm1, XMMWORD PTR [reg_p2+4*rax] // b 788 | vpmovsxdq ymm2, XMMWORD PTR [reg_p3+4*rax] // c 789 | vpmuldq ymm0, ymm1, ymm0 790 | vpaddq ymm0, ymm2, ymm0 791 | 792 | vmovdqu ymm3, ymm0 793 | vpand ymm0, ymm6, ymm0 // c0 794 | vpsrlq ymm3, ymm3, 12 // c1 795 | vpslld ymm4, ymm0, 1 // 2*c0 796 | vpsubd ymm3, ymm0, ymm3 // c0-c1 797 | vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 798 | 799 | vmovdqu ymm3, ymm0 800 | vpand ymm0, ymm6, ymm0 // c0 801 | vpsrad ymm3, ymm3, 12 // c1 802 | vpslld ymm4, ymm0, 1 // 2*c0 803 | vpsubd ymm3, ymm0, ymm3 // c0-c1 804 | vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 805 | 806 | vpermd ymm0, ymm5, ymm0 807 | vmovdqu XMMWORD PTR [reg_p4+4*rax], xmm0 808 | 809 | add rax, r11 // j+4 810 | cmp rax, reg_p5 811 | jl lazo2 812 | ret 813 | 814 | 815 | //*********************************************************************** 816 | // Component-wise multiplication 817 | // Operation: c [reg_p3] <- a [reg_p1] * b [reg_p2] 818 | // reg_p4 contains parameter n 819 | //*********************************************************************** 820 | .global pmul_asm 821 | pmul_asm: 822 | vmovdqu ymm5, PERM0246 823 | vmovdqu ymm6, MASK12x8 824 | xor rax, rax 825 | movq r11, 4 826 | lazo3: 827 | vpmovsxdq ymm0, XMMWORD PTR [reg_p1+4*rax] // a 828 | vpmovsxdq ymm1, XMMWORD PTR [reg_p2+4*rax] // b 829 | vpmuldq ymm0, ymm1, ymm0 830 | 831 | vmovdqu ymm3, ymm0 832 | vpand ymm0, ymm6, ymm0 // c0 833 | vpsrlq ymm3, ymm3, 12 // c1 834 | vpslld ymm4, ymm0, 1 // 2*c0 835 | vpsubd ymm3, ymm0, ymm3 // c0-c1 836 | vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 837 | 838 | vmovdqu ymm3, ymm0 839 | vpand ymm0, ymm6, ymm0 // c0 840 | vpsrad ymm3, ymm3, 12 // c1 841 | vpslld ymm4, ymm0, 1 // 2*c0 842 | vpsubd ymm3, ymm0, ymm3 // c0-c1 843 | vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 844 | 845 | vpermd ymm0, ymm5, ymm0 846 | vmovdqu XMMWORD PTR 
[reg_p3+4*rax], xmm0 847 | 848 | add rax, r11 // j+4 849 | cmp rax, reg_p4 850 | jl lazo3 851 | ret 852 | 853 | 854 | //*********************************************************************** 855 | // Two consecutive reductions 856 | // Operation: c [reg_p1] <- a [reg_p1] 857 | // reg_p2 contains parameter n 858 | //*********************************************************************** 859 | .global two_reduce12289_asm 860 | two_reduce12289_asm: 861 | vmovdqu ymm6, MASK12x8 862 | vmovdqu ymm7, PRIME8x 863 | xor rax, rax 864 | movq r11, 8 865 | lazo4: 866 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // a 867 | 868 | vmovdqu ymm3, ymm0 869 | vpand ymm0, ymm6, ymm0 // c0 870 | vpsrad ymm3, ymm3, 12 // c1 871 | vpslld ymm4, ymm0, 1 // 2*c0 872 | vpsubd ymm3, ymm0, ymm3 // c0-c1 873 | vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 874 | 875 | vmovdqu ymm3, ymm0 876 | vpand ymm0, ymm6, ymm0 // c0 877 | vpsrad ymm3, ymm3, 12 // c1 878 | vpslld ymm4, ymm0, 1 // 2*c0 879 | vpsubd ymm3, ymm0, ymm3 // c0-c1 880 | vpaddd ymm0, ymm3, ymm4 // 3*c0-c1 881 | 882 | vpsrad ymm2, ymm0, 31 883 | vpand ymm2, ymm7, ymm2 884 | vpaddd ymm2, ymm0, ymm2 885 | vpsubd ymm0, ymm2, ymm7 886 | 887 | vpsrad ymm2, ymm0, 31 888 | vpand ymm2, ymm7, ymm2 889 | vpaddd ymm0, ymm0, ymm2 890 | 891 | vmovdqu YMMWORD PTR [reg_p1+4*rax], ymm0 892 | 893 | add rax, r11 // j+8 894 | cmp rax, reg_p2 895 | jl lazo4 896 | ret 897 | 898 | 899 | //*********************************************************************** 900 | // Encoding 901 | // Operation: c [reg_p2] <- a [reg_p1] 902 | //*********************************************************************** 903 | .global encode_asm 904 | encode_asm: 905 | vmovdqu ymm6, MASK32 906 | vmovdqu ymm7, MASK42 907 | mov r9, 1024 908 | xor rax, rax 909 | xor r10, r10 910 | mov r11, 14 911 | mov rcx, 8 912 | lazo5: 913 | vmovdqu ymm0, YMMWORD PTR [reg_p1+4*rax] // a 914 | 915 | vpsrlq ymm1, ymm0, 18 916 | vpsllq ymm2, ymm0, 4 917 | vpand ymm0, ymm0, ymm6 918 | vpsrldq ymm2, ymm2, 5 919 
| vpsrlq ymm3, ymm1, 4 920 | vpand ymm1, ymm1, ymm6 921 | vpand ymm2, ymm2, ymm7 922 | vpsrldq ymm3, ymm3, 4 923 | vpor ymm0, ymm0, ymm1 924 | vpor ymm0, ymm0, ymm2 925 | vpor ymm0, ymm0, ymm3 926 | vpermq ymm1, ymm0, 0x0e 927 | 928 | vmovdqu XMMWORD PTR [reg_p2+r10], xmm0 929 | vmovdqu XMMWORD PTR [reg_p2+r10+7], xmm1 930 | 931 | add r10, r11 932 | add rax, rcx // j+8 933 | cmp rax, r9 934 | jl lazo5 935 | ret 936 | 937 | 938 | //*********************************************************************** 939 | // Decoding 940 | // Operation: c [reg_p2] <- a [reg_p1] 941 | //*********************************************************************** 942 | .global decode_asm 943 | decode_asm: 944 | vmovdqu ymm6, MASK14_1 945 | vmovdqu ymm7, MASK14_2 946 | vmovdqu ymm8, MASK14_3 947 | vmovdqu ymm9, MASK14_4 948 | mov r9, 1024 949 | xor rax, rax 950 | xor r10, r10 951 | mov r11, 14 952 | mov rcx, 8 953 | lazo6: 954 | vmovdqu xmm0, XMMWORD PTR [reg_p1+r10] 955 | vmovdqu xmm1, XMMWORD PTR [reg_p1+r10+7] 956 | vinserti128 ymm0, ymm0, xmm1, 1 957 | 958 | vpand ymm1, ymm0, ymm6 959 | vpand ymm2, ymm0, ymm7 960 | vpand ymm3, ymm0, ymm8 961 | vpand ymm4, ymm0, ymm9 962 | 963 | vpsllq ymm2, ymm2, 18 964 | vpsllq ymm3, ymm3, 4 965 | vpslldq ymm3, ymm3, 4 966 | vpsrlq ymm4, ymm4, 2 967 | vpslldq ymm4, ymm4, 7 968 | 969 | vpor ymm1, ymm1, ymm2 970 | vpor ymm1, ymm1, ymm3 971 | vpor ymm1, ymm1, ymm4 972 | 973 | vmovdqu YMMWORD PTR [reg_p2+4*rax], ymm1 974 | 975 | add r10, r11 976 | add rax, rcx // j+8 977 | cmp rax, r9 978 | jl lazo6 979 | ret --------------------------------------------------------------------------------