├── src ├── p448 │ ├── arch_32 │ │ ├── arch_config.h │ │ ├── p448.h │ │ └── p448.c │ ├── arch_ref64 │ │ ├── arch_config.h │ │ ├── p448.h │ │ └── p448.c │ ├── arch_arm_32 │ │ ├── arch_config.h │ │ └── p448.h │ ├── arch_x86_64 │ │ ├── arch_config.h │ │ ├── p448.h │ │ ├── x86-64-arith.h │ │ └── p448.c │ ├── arch_neon_experimental │ │ ├── arch_config.h │ │ └── p448.h │ ├── f_field.h │ └── f_arithmetic.c ├── p521 │ ├── arch_ref64 │ │ ├── arch_config.h │ │ ├── p521.h │ │ └── p521.c │ ├── arch_x86_64_r12 │ │ ├── arch_config.h │ │ └── p521.h │ ├── f_field.h │ └── f_arithmetic.c ├── p480 │ ├── arch_x86_64 │ │ ├── arch_config.h │ │ ├── p480.h │ │ ├── x86-64-arith.h │ │ └── p480.c │ ├── f_field.h │ └── f_arithmetic.c ├── bat │ ├── api_dh.h │ ├── api_sign.h │ ├── dh.c │ └── sign.c ├── include │ ├── decaf_448_config.h │ ├── field.h │ ├── word.h │ └── constant_time.h ├── decaf_gen_tables.c └── decaf_crypto.c ├── test ├── batarch.map ├── shakesum.c ├── test_decaf.cxx └── test_decaf.sage ├── README.txt ├── LICENSE.txt ├── TODO.txt ├── aux ├── decaffeinate_curve25519.sage ├── idealized.sage └── curve.sage ├── include └── decaf_crypto.h └── Makefile /src/p448/arch_32/arch_config.h: -------------------------------------------------------------------------------- 1 | #define WORD_BITS 32 2 | -------------------------------------------------------------------------------- /src/p448/arch_ref64/arch_config.h: -------------------------------------------------------------------------------- 1 | #define WORD_BITS 64 2 | -------------------------------------------------------------------------------- /src/p521/arch_ref64/arch_config.h: -------------------------------------------------------------------------------- 1 | #define WORD_BITS 64 2 | -------------------------------------------------------------------------------- /src/p448/arch_arm_32/arch_config.h: -------------------------------------------------------------------------------- 1 | #define WORD_BITS 32 2 | 
-------------------------------------------------------------------------------- /src/p448/arch_x86_64/arch_config.h: -------------------------------------------------------------------------------- 1 | #define WORD_BITS 64 2 | -------------------------------------------------------------------------------- /src/p480/arch_x86_64/arch_config.h: -------------------------------------------------------------------------------- 1 | #define WORD_BITS 64 2 | -------------------------------------------------------------------------------- /src/p521/arch_x86_64_r12/arch_config.h: -------------------------------------------------------------------------------- 1 | #define WORD_BITS 64 2 | -------------------------------------------------------------------------------- /src/p448/arch_neon_experimental/arch_config.h: -------------------------------------------------------------------------------- 1 | #define WORD_BITS 32 2 | -------------------------------------------------------------------------------- /test/batarch.map: -------------------------------------------------------------------------------- 1 | neon arch_neon_experimental 2 | arm32 arch_arm_32 3 | 64 arch_ref64 4 | 32 arch_32 5 | amd64 arch_x86_64 6 | -------------------------------------------------------------------------------- /src/bat/api_dh.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file sizes.h 3 | * @copyright 4 | * Copyright (c) 2014 Cryptography Research, Inc. \n 5 | * Released under the MIT License. See LICENSE.txt for license information. 6 | * @author Mike Hamburg 7 | * @brief BATMAN / SUPERCOP glue for benchmarking. 
8 | */ 9 | 10 | #include 11 | #include "decaf_crypto.h" 12 | 13 | #define PUBLICKEY_BYTES (sizeof(decaf_448_public_key_t)) 14 | #define SECRETKEY_BYTES (sizeof(decaf_448_private_key_t)) 15 | #define SHAREDSECRET_BYTES 32 16 | 17 | #define CRYPTO_PUBLICKEYBYTES PUBLICKEY_BYTES 18 | #define CRYPTO_SECRETKEYBYTES SECRETKEY_BYTES 19 | #define CRYPTO_BYTES SHAREDSECRET_BYTES 20 | #define PRIVATEKEY_BYTES SECRETKEY_BYTES 21 | #define CRYPTO_VERSION "__TODAY__" 22 | 23 | #define CRYPTO_DETERMINISTIC 1 24 | 25 | -------------------------------------------------------------------------------- /src/bat/api_sign.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file sizes.h 3 | * @copyright 4 | * Copyright (c) 2014 Cryptography Research, Inc. \n 5 | * Released under the MIT License. See LICENSE.txt for license information. 6 | * @author Mike Hamburg 7 | * @brief BATMAN / SUPERCOP glue for benchmarking. 8 | */ 9 | 10 | #include 11 | #include "decaf_crypto.h" 12 | 13 | #define PUBLICKEY_BYTES (sizeof(decaf_448_public_key_t)) 14 | #define SECRETKEY_BYTES (sizeof(decaf_448_private_key_t)) 15 | #define SIGNATURE_BYTES (sizeof(decaf_448_signature_t)) 16 | 17 | #define CRYPTO_PUBLICKEYBYTES PUBLICKEY_BYTES 18 | #define CRYPTO_SECRETKEYBYTES SECRETKEY_BYTES 19 | #define CRYPTO_BYTES SIGNATURE_BYTES 20 | #define PRIVATEKEY_BYTES SECRETKEY_BYTES 21 | #define CRYPTO_VERSION "__TODAY__" 22 | 23 | #define CRYPTO_DETERMINISTIC 1 24 | 25 | -------------------------------------------------------------------------------- /src/bat/dh.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file sizes.h 3 | * @copyright 4 | * Copyright (c) 2014 Cryptography Research, Inc. \n 5 | * Released under the MIT License. See LICENSE.txt for license information. 6 | * @author Mike Hamburg 7 | * @brief BATMAN / SUPERCOP glue for benchmarking. 
8 | */ 9 | 10 | #include 11 | #include 12 | #include "api.h" 13 | #include "crypto_dh.h" 14 | #include "randombytes.h" 15 | 16 | /* SUPERCOP key generation: fills a seed with randombytes(), derives the private key into sk, then writes the matching public key to pk. Always returns 0. NOTE(review): sk is reinterpreted as decaf_448_private_key_s*, which presumes the caller's byte buffer is suitably aligned for that struct, and the seed held in proto is not wiped before returning -- confirm both are acceptable. */ int crypto_dh_keypair ( 17 | unsigned char pk[PUBLICKEY_BYTES], 18 | unsigned char sk[SECRETKEY_BYTES] 19 | ) { 20 | decaf_448_symmetric_key_t proto; 21 | randombytes(proto,sizeof(proto)); 22 | decaf_448_derive_private_key((decaf_448_private_key_s *)sk,proto); 23 | decaf_448_private_to_public(pk,(decaf_448_private_key_s *)sk); 24 | return 0; 25 | } 26 | 27 | /* SUPERCOP shared-secret entry point: writes SHAREDSECRET_BYTES bytes to s and returns the logical negation of decaf_448_shared_secret()'s result, so a nonzero result from that call becomes 0 here and a zero result becomes 1. */ int crypto_dh ( 28 | unsigned char s[SHAREDSECRET_BYTES], 29 | const unsigned char pk[PUBLICKEY_BYTES], 30 | const unsigned char sk[SECRETKEY_BYTES] 31 | ) { 32 | return !decaf_448_shared_secret (s,SHAREDSECRET_BYTES, 33 | (const decaf_448_private_key_s *)sk, pk 34 | ); 35 | } 36 | -------------------------------------------------------------------------------- /src/p448/f_field.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file f_field.h 3 | * @brief Field-specific code. 4 | * @copyright 5 | * Copyright (c) 2014 Cryptography Research, Inc. \n 6 | * Released under the MIT License. See LICENSE.txt for license information. 
7 | * @author Mike Hamburg 8 | */ 9 | #ifndef __F_FIELD_H__ 10 | #define __F_FIELD_H__ 1 11 | 12 | #include "constant_time.h" 13 | #include 14 | 15 | #include "p448.h" 16 | #define FIELD_LIT_LIMB_BITS 56 17 | #define FIELD_BITS 448 18 | #define field_t p448_t 19 | #define field_mul p448_mul 20 | #define field_sqr p448_sqr 21 | #define field_add_RAW p448_add_RAW 22 | #define field_sub_RAW p448_sub_RAW 23 | #define field_mulw p448_mulw 24 | #define field_bias p448_bias 25 | #define field_isr p448_isr 26 | #define field_inverse p448_inverse 27 | #define field_weak_reduce p448_weak_reduce 28 | #define field_strong_reduce p448_strong_reduce 29 | #define field_serialize p448_serialize 30 | #define field_deserialize p448_deserialize 31 | 32 | #endif /* __F_FIELD_H__ */ 33 | -------------------------------------------------------------------------------- /src/p480/f_field.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file f_field.h 3 | * @brief Field-specific code. 4 | * @copyright 5 | * Copyright (c) 2014 Cryptography Research, Inc. \n 6 | * Released under the MIT License. See LICENSE.txt for license information. 
7 | * @author Mike Hamburg 8 | */ 9 | #ifndef __F_FIELD_H__ 10 | #define __F_FIELD_H__ 1 11 | 12 | #include "constant_time.h" 13 | #include 14 | 15 | #include "p480.h" 16 | #define FIELD_LIT_LIMB_BITS 60 17 | #define FIELD_BITS 480 18 | #define field_t p480_t 19 | #define field_mul p480_mul 20 | #define field_sqr p480_sqr 21 | #define field_add_RAW p480_add_RAW 22 | #define field_sub_RAW p480_sub_RAW 23 | #define field_mulw p480_mulw 24 | #define field_bias p480_bias 25 | #define field_isr p480_isr 26 | #define field_inverse p480_inverse 27 | #define field_weak_reduce p480_weak_reduce 28 | #define field_strong_reduce p480_strong_reduce 29 | #define field_serialize p480_serialize 30 | #define field_deserialize p480_deserialize 31 | 32 | #endif /* __F_FIELD_H__ */ 33 | -------------------------------------------------------------------------------- /src/p521/f_field.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file f_field.h 3 | * @brief Field-specific code. 4 | * @copyright 5 | * Copyright (c) 2014 Cryptography Research, Inc. \n 6 | * Released under the MIT License. See LICENSE.txt for license information. 
7 | * @author Mike Hamburg 8 | */ 9 | #ifndef __F_FIELD_H__ 10 | #define __F_FIELD_H__ 1 11 | 12 | #include 13 | #include "constant_time.h" 14 | 15 | #include "p521.h" 16 | #define FIELD_LIT_LIMB_BITS 58 17 | #define FIELD_BITS 521 18 | #define field_t p521_t 19 | #define field_mul p521_mul 20 | #define field_sqr p521_sqr 21 | #define field_add_RAW p521_add_RAW 22 | #define field_sub_RAW p521_sub_RAW 23 | #define field_mulw p521_mulw 24 | #define field_bias p521_bias 25 | #define field_isr p521_isr 26 | #define field_inverse p521_inverse 27 | #define field_weak_reduce p521_weak_reduce 28 | #define field_strong_reduce p521_strong_reduce 29 | #define field_serialize p521_serialize 30 | #define field_deserialize p521_deserialize 31 | 32 | #endif /* __F_FIELD_H__ */ 33 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | Ed448-Goldilocks, Decaf version. 2 | 3 | This software is an experimental implementation of a new 448-bit elliptic 4 | curve called Ed448-Goldilocks, with "Decaf" cofactor removal. 5 | 6 | The source files here are all by Mike Hamburg. Most of them are (c) 7 | 2014-2015 Cryptography Research, Inc (a division of Rambus). All of these 8 | files are usable under the MIT license contained in LICENSE.txt. 9 | 10 | The Makefile is set for my 2013 MacBook Air. You can `make bench` to run 11 | a completely arbitrary set of benchmarks and tests, or `make lib` to build 12 | a stripped-down version of the library. For non-Haswell platforms, you may 13 | need to replace -mavx2 -mbmi2 by an appropriate vector declaration. 14 | 15 | I've attempted to protect against timing attacks and invalid point attacks, 16 | but as of yet no attempt to protect against power analysis. 17 | 18 | This software is incomplete, and lacks documentation. None of the APIs are 19 | yet stable, though they may be getting there. 
The software is probably not 20 | secure. Please consult TODO.txt for additional agenda items. Do not taunt 21 | happy fun ball. 22 | 23 | Cheers, 24 | -- Mike Hamburg 25 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2011 Stanford University. 4 | Copyright (c) 2014 Cryptography Research, Inc. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /src/p448/f_arithmetic.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @cond internal 3 | * @file f_arithmetic.c 4 | * @copyright 5 | * Copyright (c) 2014 Cryptography Research, Inc. \n 6 | * Released under the MIT License. See LICENSE.txt for license information. 
7 | * @author Mike Hamburg 8 | * @brief Field-specific arithmetic. 9 | */ 10 | 11 | #include "field.h" 12 | 13 | /* Fixed addition chain for the p448 field_isr. Reading field_sqr as squaring, field_mul as multiplication and field_sqrn(y,x,n) as n successive squarings (field_sqr/field_mul are defined outside this file), the chain leaves a = x^(2^446 - 2^222 - 1), i.e. x^((p-3)/4) for p = 2^448 - 2^224 - 1. L0..L2 are scratch values; the call sequence contains no branch on the input. */ void 14 | field_isr ( 15 | field_a_t a, 16 | const field_a_t x 17 | ) { 18 | field_a_t L0, L1, L2; 19 | field_sqr ( L1, x ); 20 | field_mul ( L2, x, L1 ); 21 | field_sqr ( L1, L2 ); 22 | field_mul ( L2, x, L1 ); /* L2 = x^(2^3-1) */ 23 | field_sqrn ( L1, L2, 3 ); 24 | field_mul ( L0, L2, L1 ); 25 | field_sqrn ( L1, L0, 3 ); 26 | field_mul ( L0, L2, L1 ); /* L0 = x^(2^9-1) */ 27 | field_sqrn ( L2, L0, 9 ); 28 | field_mul ( L1, L0, L2 ); /* L1 = x^(2^18-1) */ 29 | field_sqr ( L0, L1 ); 30 | field_mul ( L2, x, L0 ); 31 | field_sqrn ( L0, L2, 18 ); 32 | field_mul ( L2, L1, L0 ); /* L2 = x^(2^37-1) */ 33 | field_sqrn ( L0, L2, 37 ); 34 | field_mul ( L1, L2, L0 ); 35 | field_sqrn ( L0, L1, 37 ); 36 | field_mul ( L1, L2, L0 ); /* L1 = x^(2^111-1) */ 37 | field_sqrn ( L0, L1, 111 ); 38 | field_mul ( L2, L1, L0 ); /* L2 = x^(2^222-1) */ 39 | field_sqr ( L0, L2 ); 40 | field_mul ( L1, x, L0 ); /* L1 = x^(2^223-1) */ 41 | field_sqrn ( L0, L1, 223 ); 42 | field_mul ( a, L2, L0 ); 43 | } 44 | -------------------------------------------------------------------------------- /src/p521/f_arithmetic.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @cond internal 3 | * @file f_arithmetic.c 4 | * @copyright 5 | * Copyright (c) 2014 Cryptography Research, Inc. \n 6 | * Released under the MIT License. See LICENSE.txt for license information. 7 | * @author Mike Hamburg 8 | * @brief Field-specific arithmetic. 
9 | */ 10 | 11 | #include "field.h" 12 | 13 | /* Fixed addition chain for the p521 field_isr. Reading field_sqr as squaring, field_mul as multiplication and field_sqrn(y,x,n) as n successive squarings (field_sqr/field_mul are defined outside this file), the chain leaves a = x^(2^519 - 1), i.e. x^((p-3)/4) for p = 2^521 - 1. L0..L2 are scratch values; the call sequence contains no branch on the input. */ void 14 | field_isr ( 15 | field_a_t a, 16 | const field_a_t x 17 | ) { 18 | field_a_t L0, L1, L2; 19 | field_sqr ( L1, x ); 20 | field_mul ( L0, x, L1 ); /* L0 = x^(2^2-1) */ 21 | field_sqrn ( L2, L0, 2 ); 22 | field_mul ( L1, L0, L2 ); 23 | field_sqrn ( L2, L1, 4 ); 24 | field_mul ( L0, L1, L2 ); 25 | field_sqrn ( L2, L0, 8 ); 26 | field_mul ( L1, L0, L2 ); 27 | field_sqrn ( L2, L1, 16 ); 28 | field_mul ( L0, L1, L2 ); 29 | field_sqrn ( L2, L0, 32 ); 30 | field_mul ( L1, L0, L2 ); /* L1 = x^(2^64-1) */ 31 | field_sqr ( L2, L1 ); 32 | field_mul ( L0, x, L2 ); 33 | field_sqrn ( L2, L0, 64 ); 34 | field_mul ( L0, L1, L2 ); /* L0 = x^(2^129-1) */ 35 | field_sqrn ( L2, L0, 129 ); 36 | field_mul ( L1, L0, L2 ); /* L1 = x^(2^258-1) */ 37 | field_sqr ( L2, L1 ); 38 | field_mul ( L0, x, L2 ); 39 | field_sqrn ( L2, L0, 259 ); 40 | field_mul ( L1, L0, L2 ); /* L1 = x^(2^518-1) */ 41 | field_sqr ( L0, L1 ); 42 | field_mul ( a, x, L0 ); 43 | } 44 | -------------------------------------------------------------------------------- /src/p480/f_arithmetic.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @cond internal 3 | * @file f_arithmetic.c 4 | * @copyright 5 | * Copyright (c) 2014 Cryptography Research, Inc. \n 6 | * Released under the MIT License. See LICENSE.txt for license information. 7 | * @author Mike Hamburg 8 | * @brief Field-specific arithmetic. 
9 | */ 10 | 11 | #include "field.h" 12 | 13 | /* Fixed addition chain for the p480 field_isr. Reading field_sqr as squaring, field_mul as multiplication and field_sqrn(y,x,n) as n successive squarings (field_sqr/field_mul are defined outside this file), the chain leaves a = x^(2^478 - 2^238 - 1), i.e. x^((p-3)/4) for p = 2^480 - 2^240 - 1. L0..L3 are scratch values; the call sequence contains no branch on the input. */ void 14 | field_isr ( 15 | field_a_t a, 16 | const field_a_t x 17 | ) { 18 | field_a_t L0, L1, L2, L3; 19 | field_sqr ( L2, x ); 20 | field_mul ( L1, x, L2 ); 21 | field_sqrn ( L0, L1, 2 ); 22 | field_mul ( L2, L1, L0 ); 23 | field_sqrn ( L0, L2, 4 ); 24 | field_mul ( L1, L2, L0 ); /* L1 = x^(2^8-1) */ 25 | field_sqr ( L0, L1 ); 26 | field_mul ( L2, x, L0 ); 27 | field_sqrn ( L0, L2, 8 ); 28 | field_mul ( L2, L1, L0 ); /* L2 = x^(2^17-1) */ 29 | field_sqrn ( L0, L2, 17 ); 30 | field_mul ( L1, L2, L0 ); 31 | field_sqrn ( L0, L1, 17 ); 32 | field_mul ( L1, L2, L0 ); /* L1 = x^(2^51-1) */ 33 | field_sqrn ( L3, L1, 17 ); 34 | field_mul ( L0, L2, L3 ); 35 | field_sqrn ( L2, L0, 51 ); 36 | field_mul ( L0, L1, L2 ); /* L0 = x^(2^119-1) */ 37 | field_sqrn ( L1, L0, 119 ); 38 | field_mul ( L2, L0, L1 ); /* L2 = x^(2^238-1) */ 39 | field_sqr ( L0, L2 ); 40 | field_mul ( L1, x, L0 ); 41 | field_sqrn ( L0, L1, 239 ); 42 | field_mul ( a, L2, L0 ); 43 | } 44 | -------------------------------------------------------------------------------- /src/include/decaf_448_config.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file decaf_448_config.h 3 | * @author Mike Hamburg 4 | * 5 | * @copyright 6 | * Copyright (c) 2015 Cryptography Research, Inc. \n 7 | * Released under the MIT License. See LICENSE.txt for license information. 8 | * 9 | * @brief Configuration for decaf_fast.c 10 | */ 11 | #ifndef __DECAF_448_CONFIG_H__ 12 | #define __DECAF_448_CONFIG_H__ 1 13 | 14 | /** 15 | * Use the Montgomery ladder for direct scalarmul. 16 | * 17 | * The Montgomery ladder is faster than Edwards scalarmul, but providing 18 | * the features Decaf supports (cofactor elimination, twist rejection) 19 | * makes it complicated and adds code. Removing the ladder saves a few 20 | * kilobytes at the cost of perhaps 5-10% overhead in direct scalarmul 21 | * time. 22 | */ 23 | #define DECAF_USE_MONTGOMERY_LADDER 1 24 | 25 | /** The number of comb tables for fixed base scalarmul. 
*/ 26 | #define DECAF_COMBS_N 5 27 | 28 | /** The number of teeth per comb for fixed base scalarmul. */ 29 | #define DECAF_COMBS_T 5 30 | 31 | /** The comb spacing for fixed base scalarmul. */ 32 | #define DECAF_COMBS_S 18 33 | 34 | /** Performance tuning: the width of the fixed window for scalar mul. */ 35 | #define DECAF_WINDOW_BITS 5 36 | 37 | /** 38 | * The number of bits used for the precomputed table in variable-time 39 | * double scalarmul. 40 | */ 41 | #define DECAF_WNAF_FIXED_TABLE_BITS 5 42 | 43 | /** 44 | * Performance tuning: bits used for the variable table in variable-time 45 | * double scalarmul. 46 | */ 47 | #define DECAF_WNAF_VAR_TABLE_BITS 3 48 | 49 | 50 | #endif /* __DECAF_448_CONFIG_H__ */ 51 | -------------------------------------------------------------------------------- /src/bat/sign.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file sign.c 3 | * @copyright 4 | * Copyright (c) 2014 Cryptography Research, Inc. \n 5 | * Released under the MIT License. See LICENSE.txt for license information. 6 | * @author Mike Hamburg 7 | * @brief BATMAN / SUPERCOP glue for benchmarking. 
8 | */ 9 | 10 | #include 11 | #include 12 | #include "api.h" 13 | #include "crypto_sign.h" 14 | #include "randombytes.h" 15 | 16 | int crypto_sign_keypair ( 17 | unsigned char pk[PUBLICKEY_BYTES], 18 | unsigned char sk[SECRETKEY_BYTES] 19 | ) { 20 | decaf_448_symmetric_key_t proto; 21 | randombytes(proto,sizeof(proto)); 22 | decaf_448_derive_private_key((decaf_448_private_key_s *)sk,proto); 23 | decaf_448_private_to_public(pk, 24 | (decaf_448_private_key_s *)sk 25 | ); 26 | return 0; 27 | } 28 | 29 | int crypto_sign ( 30 | unsigned char *sm, 31 | unsigned long long *smlen, 32 | const unsigned char *m, 33 | unsigned long long mlen, 34 | const unsigned char sk[SECRETKEY_BYTES] 35 | ) { 36 | unsigned char sig[SIGNATURE_BYTES]; 37 | decaf_448_sign( 38 | sig, 39 | (const decaf_448_private_key_s *)sk, 40 | m, mlen 41 | ); 42 | memmove(sm + SIGNATURE_BYTES, m, mlen); 43 | memcpy(sm, sig, SIGNATURE_BYTES); 44 | *smlen = mlen + SIGNATURE_BYTES; 45 | return 0; 46 | } 47 | 48 | int crypto_sign_open ( 49 | unsigned char *m, 50 | unsigned long long *mlen, 51 | const unsigned char *sm, 52 | unsigned long long smlen, 53 | const unsigned char pk[PUBLICKEY_BYTES] 54 | ) { 55 | int ret = decaf_448_verify( 56 | sm,pk, 57 | sm + SIGNATURE_BYTES, smlen - SIGNATURE_BYTES 58 | ); 59 | if (ret) { 60 | *mlen = smlen - SIGNATURE_BYTES; 61 | memmove(m, sm + SIGNATURE_BYTES, *mlen); 62 | } 63 | return ret ? 0 : -1; 64 | } 65 | -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | Important work items for Ed448-Goldilocks / decaf: 2 | 3 | * Factor out hash, crandom from core library? 4 | [DONE, except for C++ headers] 5 | 6 | * Signed 32-bit NEON implementation to avoid bias/reduce after subtract 7 | 8 | * Documentation: write high-level API docs, and internal docs to help 9 | other implementors. 10 | * Partial progress on Doxygenating the code. 
11 | 12 | * Documentation: write a spec or add to Watson's 13 | 14 | * Cleanup: rename everything consistently. 15 | * namespace_op or op_namespace? namespace_op_type? 16 | * We don't have to be super-careful with the namespacing, because 17 | symbols will be scrubbed by visibility 18 | 19 | * Cleanup: unify intrinsics code 20 | * Word_t, mask_t, bigregister_t, etc. 21 | * Generate asm intrinsics with a script? 22 | 23 | * Testing: 24 | * More testing. Testing, testing and testing. 25 | * Test corner cases better. 26 | 27 | * Safety: add static analysis attributes for compilers that support them 28 | * Most functions now have warn on ignored return. 29 | * [ MOSTLY DONE ] 30 | 31 | * Safety: 32 | * Decide what to do about RNG failures 33 | * abort 34 | * return error and zeroize 35 | * return error but continue if RNG is kind of mostly OK 36 | 37 | * High-level API: [DONE] 38 | 39 | * Portability: test and make clean with other compilers 40 | * Using a fair amount of __attribute__ code. 41 | * [DONE] Should work for GCC now. 42 | 43 | * Portability: try to make the vector code as portable as possible 44 | * Currently using clang ext_vector_length. 45 | * I can't get a simple for-loop to autovectorize :-/ 46 | * SAGE tool? 47 | 48 | * [DONE] Portability: make the outer layers of the code 32-bit clean. 49 | 50 | * [DONE] Performance/flexibility: decide which parameters should be hard-coded. 51 | * Perhaps useful for comb precomputation. 52 | 53 | * Performance: Improve SHAKE. 54 | * Improve speed. (Maybe) 55 | 56 | * Clear other TODO/FIXME/HACK/PERF items in the code 57 | 58 | * Submit Decaf to SUPERCOP 59 | -------------------------------------------------------------------------------- /test/shakesum.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @cond internal 3 | * @file shakesum.c 4 | * @copyright 5 | * Copyright (c) 2015 Cryptography Research, Inc. \n 6 | * Released under the MIT License. 
See LICENSE.txt for license information. 7 | * @author Mike Hamburg 8 | * @brief SHA3 utility, to be combined with test vectors eventually... 9 | */ 10 | 11 | #include 12 | #include 13 | #include 14 | #include "shake.h" 15 | 16 | int main(int argc, char **argv) { 17 | (void)argc; (void)argv; 18 | 19 | keccak_sponge_t sponge; 20 | unsigned char buf[1024]; 21 | 22 | unsigned int outlen = 512; 23 | shake256_init(sponge); 24 | 25 | /* Sloppy. Real utility would parse --algo, --size ... */ 26 | if (argc > 1) { 27 | if (!strcmp(argv[1], "shake256") || !strcmp(argv[1], "SHAKE256")) { 28 | outlen = 512; 29 | shake256_init(sponge); 30 | } else if (!strcmp(argv[1], "shake128") || !strcmp(argv[1], "SHAKE128")) { 31 | outlen = 512; 32 | shake128_init(sponge); 33 | } else if (!strcmp(argv[1], "sha3-224") || !strcmp(argv[1], "SHA3-224")) { 34 | outlen = 224/8; 35 | sha3_224_init(sponge); 36 | } else if (!strcmp(argv[1], "sha3-256") || !strcmp(argv[1], "SHA3-256")) { 37 | outlen = 256/8; 38 | sha3_256_init(sponge); 39 | } else if (!strcmp(argv[1], "sha3-384") || !strcmp(argv[1], "SHA3-384")) { 40 | outlen = 384/8; 41 | sha3_384_init(sponge); 42 | } else if (!strcmp(argv[1], "sha3-512") || !strcmp(argv[1], "SHA3-512")) { 43 | outlen = 512/8; 44 | sha3_512_init(sponge); 45 | } 46 | } 47 | 48 | ssize_t red; 49 | do { 50 | red = read(0, buf, sizeof(buf)); 51 | if (red>0) sha3_update(sponge,buf,red); 52 | } while (red>0); 53 | 54 | sha3_output(sponge,buf,outlen); 55 | sponge_destroy(sponge); 56 | unsigned i; 57 | for (i=0; i 16 | 17 | typedef struct field_t field_a_t[1]; 18 | #define field_a_restrict_t struct field_t *__restrict__ 19 | 20 | #define is32 (GOLDI_BITS == 32 || FIELD_BITS != 448) 21 | #if (is32) 22 | #define IF32(s) (s) 23 | #else 24 | #define IF32(s) 25 | #endif 26 | 27 | /** 28 | * Returns 1/sqrt(+- x). 29 | * 30 | * The Legendre symbol of the result is the same as that of the 31 | * input. 32 | * 33 | * If x=0, returns 0. 
34 | */ 35 | void 36 | field_isr ( 37 | field_a_t a, 38 | const field_a_t x 39 | ); 40 | 41 | /** 42 | * Returns 1/x. 43 | * 44 | * If x=0, returns 0. 45 | * 46 | * TODO: this is currently unused in Decaf, but I've left a decl 47 | * for it because field_inverse is different (and simpler) than 48 | * field_isqrt for 5-mod-8 fields. 49 | */ 50 | void 51 | field_inverse ( 52 | field_a_t a, 53 | const field_a_t x 54 | ); 55 | 56 | /** 57 | * Square x, n times, leaving the result in y. Requires n > 0, which is checked only by assert(). The first step performs one squaring when n is odd and two when n is even, so the remaining count is even and the loop can alternate between tmp and y, always finishing in y. 58 | */ 59 | static __inline__ void 60 | __attribute__((unused,always_inline)) 61 | field_sqrn ( 62 | field_a_restrict_t y, 63 | const field_a_t x, 64 | int n 65 | ) { 66 | field_a_t tmp; 67 | assert(n>0); 68 | if (n&1) { 69 | field_sqr(y,x); 70 | n--; 71 | } else { 72 | field_sqr(tmp,x); 73 | field_sqr(y,tmp); 74 | n-=2; 75 | } 76 | for (; n; n-=2) { 77 | field_sqr(tmp,y); 78 | field_sqr(y,tmp); 79 | } 80 | } 81 | 82 | /* d = a - b via field_sub_RAW followed by field_bias(d, 2); the weak reduction runs only in builds where IF32() expands to its argument. */ static __inline__ void 83 | field_subx_RAW ( 84 | field_a_t d, 85 | const field_a_t a, 86 | const field_a_t b 87 | ) { 88 | field_sub_RAW ( d, a, b ); 89 | field_bias( d, 2 ); 90 | IF32( field_weak_reduce ( d ) ); 91 | } 92 | 93 | /* d = a - b: field_sub_RAW, then field_bias(d, 2), then an unconditional field_weak_reduce. */ static __inline__ void 94 | field_sub ( 95 | field_a_t d, 96 | const field_a_t a, 97 | const field_a_t b 98 | ) { 99 | field_sub_RAW ( d, a, b ); 100 | field_bias( d, 2 ); 101 | field_weak_reduce ( d ); 102 | } 103 | 104 | /* d = a + b: field_add_RAW followed by an unconditional field_weak_reduce. */ static __inline__ void 105 | field_add ( 106 | field_a_t d, 107 | const field_a_t a, 108 | const field_a_t b 109 | ) { 110 | field_add_RAW ( d, a, b ); 111 | field_weak_reduce ( d ); 112 | } 113 | 114 | /** Require the warning annotation on raw routines: the *_nr macros reference ANNOTATE___ANALYZE_THIS_ROUTINE_CAREFULLY, so they only compile where ANALYZE_THIS_ROUTINE_CAREFULLY has declared it in scope. */ 115 | #define ANALYZE_THIS_ROUTINE_CAREFULLY const int ANNOTATE___ANALYZE_THIS_ROUTINE_CAREFULLY = 0; 116 | #define MUST_BE_CAREFUL (void) ANNOTATE___ANALYZE_THIS_ROUTINE_CAREFULLY 117 | #define field_add_nr(a,b,c) { MUST_BE_CAREFUL; field_add_RAW(a,b,c); } 118 | #define field_sub_nr(a,b,c) { MUST_BE_CAREFUL; field_sub_RAW(a,b,c); } 119 | #define field_subx_nr(a,b,c) { MUST_BE_CAREFUL; 
field_subx_RAW(a,b,c); } 120 | 121 | #endif // __FIELD_H__ 122 | -------------------------------------------------------------------------------- /src/p521/arch_ref64/p521.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2014 Cryptography Research, Inc. 2 | * Released under the MIT License. See LICENSE.txt for license information. 3 | */ 4 | #ifndef __P521_H__ 5 | #define __P521_H__ 1 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include "word.h" 12 | 13 | typedef struct p521_t { 14 | uint64_t limb[9]; 15 | } p521_t; 16 | 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif 20 | 21 | static __inline__ void 22 | p521_add_RAW ( 23 | p521_t *out, 24 | const p521_t *a, 25 | const p521_t *b 26 | ) __attribute__((unused)); 27 | 28 | static __inline__ void 29 | p521_sub_RAW ( 30 | p521_t *out, 31 | const p521_t *a, 32 | const p521_t *b 33 | ) __attribute__((unused)); 34 | 35 | static __inline__ void 36 | p521_copy ( 37 | p521_t *out, 38 | const p521_t *a 39 | ) __attribute__((unused)); 40 | 41 | static __inline__ void 42 | p521_weak_reduce ( 43 | p521_t *inout 44 | ) __attribute__((unused)); 45 | 46 | void 47 | p521_strong_reduce ( 48 | p521_t *inout 49 | ); 50 | 51 | static __inline__ void 52 | p521_bias ( 53 | p521_t *inout, 54 | int amount 55 | ) __attribute__((unused)); 56 | 57 | void 58 | p521_mul ( 59 | p521_t *__restrict__ out, 60 | const p521_t *a, 61 | const p521_t *b 62 | ); 63 | 64 | void 65 | p521_mulw ( 66 | p521_t *__restrict__ out, 67 | const p521_t *a, 68 | uint64_t b 69 | ); 70 | 71 | void 72 | p521_sqr ( 73 | p521_t *__restrict__ out, 74 | const p521_t *a 75 | ); 76 | 77 | void 78 | p521_serialize ( 79 | uint8_t *serial, 80 | const struct p521_t *x 81 | ); 82 | 83 | mask_t 84 | p521_deserialize ( 85 | p521_t *x, 86 | const uint8_t serial[66] 87 | ); 88 | 89 | /* -------------- Inline functions begin here -------------- */ 90 | 91 | void 92 | p521_add_RAW ( 93 | p521_t *out, 94 | const 
p521_t *a, 95 | const p521_t *b 96 | ) { 97 | unsigned int i; 98 | for (i=0; i<9; i++) { 99 | out->limb[i] = a->limb[i] + b->limb[i]; 100 | } 101 | p521_weak_reduce(out); 102 | } 103 | 104 | /* out = a - b limb by limb, then weak-reduced. Each limb first gets a constant added: co1 = 4*(2^58-1) for limbs 0..7 and co2 = 4*(2^57-1) for limb 8, which together are the radix-2^58 limbs of 4*(2^521-1); this keeps the unsigned limb differences from wrapping as long as b's limbs do not exceed those constants. */ void 105 | p521_sub_RAW ( 106 | p521_t *out, 107 | const p521_t *a, 108 | const p521_t *b 109 | ) { 110 | unsigned int i; 111 | uint64_t co1 = ((1ull<<58)-1)*4, co2 = ((1ull<<57)-1)*4; 112 | for (i=0; i<9; i++) { 113 | out->limb[i] = a->limb[i] - b->limb[i] + ((i==8) ? co2 : co1); 114 | } 115 | p521_weak_reduce(out); 116 | } 117 | 118 | /* Copies all limbs of a into out with memcpy. */ void 119 | p521_copy ( 120 | p521_t *out, 121 | const p521_t *a 122 | ) { 123 | memcpy(out,a,sizeof(*a)); 124 | } 125 | 126 | /* No-op in this implementation: both arguments are ignored. */ void 127 | p521_bias ( 128 | p521_t *a, 129 | int amt 130 | ) { 131 | (void) a; 132 | (void) amt; 133 | } 134 | 135 | /* One carry pass over the nine limbs. Limbs 0..7 keep their low 58 bits and limb 8 its low 57 bits; each limb's excess is added to the next limb, and the excess of limb 8 (saved in tmp before the loop overwrites it) is added to limb 0, i.e. 2^521 is folded back as 1. The loop runs from the top limb down so every step reads the not-yet-masked limb below it. */ void 136 | p521_weak_reduce ( 137 | p521_t *a 138 | ) { 139 | uint64_t mask = (1ull<<58) - 1; 140 | uint64_t tmp = a->limb[8] >> 57; 141 | int i; 142 | for (i=8; i>0; i--) { 143 | a->limb[i] = (a->limb[i] & ((i==8) ? mask>>1 : mask)) + (a->limb[i-1]>>58); 144 | } 145 | a->limb[0] = (a->limb[0] & mask) + tmp; 146 | } 147 | 148 | #ifdef __cplusplus 149 | }; /* extern "C" */ 150 | #endif 151 | 152 | #endif /* __P521_H__ */ 153 | -------------------------------------------------------------------------------- /aux/decaffeinate_curve25519.sage: -------------------------------------------------------------------------------- 1 | # This is a sketch of how to decaffeinate Curve25519 2 | 3 | F = GF(2^255-19) 4 | d = -121665 5 | M = EllipticCurve(F,[0,2-4*d,0,1,0]) 6 | 7 | sqrtN1 = sqrt(F(-1)) 8 | 9 | def maybe(): return randint(0,1) 10 | 11 | def qpositive(x): 12 | return int(x) <= (2^255-19-1)/2 13 | 14 | def M_to_E(P): 15 | # P must be even 16 | (x,y) = P.xy() 17 | assert x.is_square() 18 | 19 | s = sqrt(x) 20 | if s == 0: t = 1 21 | else: t = y/s 22 | 23 | X,Y = 2*s / (1+s^2), (1-s^2) / t 24 | if maybe(): X,Y = -X,-Y 25 | if maybe(): X,Y = Y,-X 26 | # OK, have point in ed 27 | return X,Y 28 | 29 | def 
decaf_encode_from_E(X,Y): 30 | assert X^2 + Y^2 == 1 + d*X^2*Y^2 31 | if not qpositive(X*Y): X,Y = Y,-X 32 | assert qpositive(X*Y) 33 | 34 | assert (1-X^2).is_square() 35 | sx = sqrt(1-X^2) 36 | tos = -2*sx/X/Y 37 | if not qpositive(tos): sx = -sx 38 | s = (1 + sx) / X 39 | if not qpositive(s): s = -s 40 | 41 | return s 42 | 43 |

def isqrt(x):
    # Fixed square-and-multiply schedule over three slots.  Step i writes
    # slot i&1 with (other slot)^(2^sh) * st[add]; st[2] stays equal to x.
    # On exit st[0] == x^(2^252-3) and st[1] == x^(2^253-5).
    ops = [(1,2),(1,2),(3,1),(6,0),(1,2),(12,1),(25,1),(25,1),(50,0),(125,0),(2,2),(1,2)]
    st = [x,x,x]
    for i,(sh,add) in enumerate(ops):
        od = i&1
        st[od] = st[od^^1]^(2^sh)*st[add]
    # assert st[0] == x^(2^252-3)

    # st[1] selects between the two candidate roots.
    assert st[1] == 1 or st[1] == -1
    if st[1] == 1: return st[0]
    else: return st[0] * sqrtN1

def decaf_encode_from_E_c(X,Y):
    # Same encoding as decaf_encode_from_E, but on projective (X,Y,Z,T)
    # coordinates and with a single isqrt call; test() asserts that the two
    # encoders agree.
    Z = F.random_element()
    T = X*Y*Z
    X = X*Z
    Y = Y*Z
    assert X^2 + Y^2 == Z^2 + d*T^2

    # Precompute
    sd = sqrt(F(1-d))

    zx = Z^2-X^2
    TZ = T*Z
    # is_square must be called: the bare bound method is always truthy, so
    # the original assert could never fail.
    assert zx.is_square()
    ooAll = isqrt(zx*TZ^2)
    osx = ooAll * TZ
    ooTZ = ooAll * zx * osx

    floop = qpositive(T^2 * ooTZ)
    if floop:
        frob = zx * ooTZ
    else:
        frob = sd
        Y = -X

    osx *= frob

    if qpositive(-2*osx*Z) != floop: osx = -osx
    s = Y*(ooTZ*Z + osx)
    if not qpositive(s): s = -s

    return s

def is_rotation((X,Y),(x,y)):
    # True when the two points agree up to the relations tested below.
    # NOTE(review): tuple parameters are Python 2 syntax -- confirm the
    # target Sage version accepts them.
    return x*Y == X*y or x*X == -y*Y

def decaf_decode_to_E(s):
    # Reference decoder: recovers (X,Y) from s using sqrt and divisions.
    assert qpositive(s)
    t = sqrt(s^4 + (2-4*d)*s^2 + 1)
    if not qpositive(t/s): t = -t
    X,Y = 2*s / (1+s^2), (1-s^2) / t
    assert qpositive(X*Y)
    return X,Y

def decaf_decode_to_E_c(s):
    # Decoder variant built on a single isqrt call instead of sqrt and
    # divisions; test() checks it against the original point via is_rotation.
    assert qpositive(s)

    s2 = s^2
    s21 = 1+s2
    t2 = s21^2 - 4*d*s2

    alt = s21*s
    the = isqrt(t2*alt^2)
    oot = the * alt
    the *= t2
    tos = the * s21
    X = 2 * (tos-the) * oot
    Y = (1-s2) * oot

    if not qpositive(tos): Y = -Y
    assert qpositive(X*Y)

    return X,Y

118 | def test(): 119 
| P = 2*M.random_point() 120 | X,Y = M_to_E(P) 121 | s = decaf_encode_from_E(X,Y) 122 | assert s == decaf_encode_from_E_c(X,Y) 123 | XX,YY = decaf_decode_to_E(s) 124 | XX2,YY2 = decaf_decode_to_E_c(s) 125 | assert is_rotation((X,Y),(XX,YY)) 126 | assert is_rotation((X,Y),(XX2,YY2)) 127 | 128 | 129 | -------------------------------------------------------------------------------- /src/p448/arch_ref64/p448.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2014 Cryptography Research, Inc. 2 | * Released under the MIT License. See LICENSE.txt for license information. 3 | */ 4 | #ifndef __P448_H__ 5 | #define __P448_H__ 1 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include "word.h" 12 | 13 | typedef struct p448_t { 14 | uint64_t limb[8]; 15 | } __attribute__((aligned(32))) p448_t; 16 | 17 | #define LBITS 56 18 | #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}} 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | static __inline__ void 25 | p448_add_RAW ( 26 | p448_t *out, 27 | const p448_t *a, 28 | const p448_t *b 29 | ) __attribute__((unused)); 30 | 31 | static __inline__ void 32 | p448_sub_RAW ( 33 | p448_t *out, 34 | const p448_t *a, 35 | const p448_t *b 36 | ) __attribute__((unused)); 37 | 38 | static __inline__ void 39 | p448_copy ( 40 | p448_t *out, 41 | const p448_t *a 42 | ) __attribute__((unused)); 43 | 44 | static __inline__ void 45 | p448_weak_reduce ( 46 | p448_t *inout 47 | ) __attribute__((unused)); 48 | 49 | void 50 | p448_strong_reduce ( 51 | p448_t *inout 52 | ); 53 | 54 | static __inline__ void 55 | p448_bias ( 56 | p448_t *inout, 57 | int amount 58 | ) __attribute__((unused)); 59 | 60 | void 61 | p448_mul ( 62 | p448_t *__restrict__ out, 63 | const p448_t *a, 64 | const p448_t *b 65 | ); 66 | 67 | void 68 | p448_mulw ( 69 | p448_t *__restrict__ out, 70 | const p448_t *a, 71 | uint64_t b 72 | ); 73 | 74 | void 75 | p448_sqr ( 76 | p448_t *__restrict__ out, 77 | 
/**
 * out = a - b, computed limb by limb, followed by p448_weak_reduce(out).
 *
 * A constant is added to every limb as part of the subtraction:
 * 2*(2^56 - 1) everywhere except limb 4, which gets 2 less.  In radix 2^56
 * these are twice the limbs of 2^448 - 2^224 - 1, so the bias does not change
 * the value modulo that prime (presumably it is there to keep the unsigned
 * limb differences from wrapping -- confirm against the callers' bounds).
 */
void
p448_sub_RAW (
    p448_t *out,
    const p448_t *a,
    const p448_t *b
) {
    const uint64_t bias_all = ((1ull<<56)-1)*2;
    const uint64_t bias_limb4 = bias_all - 2;
    unsigned int k = 0;

    while (k < 8) {
        const uint64_t bias = (k == 4) ? bias_limb4 : bias_all;
        out->limb[k] = a->limb[k] - b->limb[k] + bias;
        k++;
    }

    p448_weak_reduce(out);
}
/* Number of value bits carried by each 32-bit limb. */
#define LBITS 28

/*
 * Split one 56-bit literal into two LBITS-bit limbs, low limb first.
 * Expands to TWO comma-separated initializers, so it is only meaningful
 * inside a brace-enclosed initializer list.
 */
#define LIMB(x) (x##ull)&((1ull<<LBITS)-1), (x##ull)>>LBITS

/*
 * Initializer for a p448_t from eight 56-bit literals (a is the least
 * significant).  Each literal becomes two 28-bit limbs, filling all 16 limbs.
 */
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
    {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}
a->limb[i] + b->limb[i]; 109 | } 110 | */ 111 | } 112 | 113 | void 114 | p448_sub_RAW ( 115 | p448_t *out, 116 | const p448_t *a, 117 | const p448_t *b 118 | ) { 119 | unsigned int i; 120 | for (i=0; ilimb[0]); i++) { 126 | out->limb[i] = a->limb[i] - b->limb[i]; 127 | } 128 | */ 129 | } 130 | 131 | void 132 | p448_copy ( 133 | p448_t *out, 134 | const p448_t *a 135 | ) { 136 | *out = *a; 137 | } 138 | 139 | void 140 | p448_bias ( 141 | p448_t *a, 142 | int amt 143 | ) { 144 | uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt; 145 | uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; 146 | uint32x4_t *aa = (uint32x4_t*) a; 147 | aa[0] += lo; 148 | aa[1] += lo; 149 | aa[2] += hi; 150 | aa[3] += lo; 151 | } 152 | 153 | void 154 | p448_weak_reduce ( 155 | p448_t *a 156 | ) { 157 | uint64_t mask = (1ull<<28) - 1; 158 | uint64_t tmp = a->limb[15] >> 28; 159 | int i; 160 | a->limb[8] += tmp; 161 | for (i=15; i>0; i--) { 162 | a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); 163 | } 164 | a->limb[0] = (a->limb[0] & mask) + tmp; 165 | } 166 | 167 | #ifdef __cplusplus 168 | }; /* extern "C" */ 169 | #endif 170 | 171 | #endif /* __P448_H__ */ 172 | -------------------------------------------------------------------------------- /src/p448/arch_arm_32/p448.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2014 Cryptography Research, Inc. 2 | * Released under the MIT License. See LICENSE.txt for license information. 
/* Number of value bits carried by each 32-bit limb. */
#define LBITS 28

/*
 * Split one 56-bit literal into two LBITS-bit limbs, low limb first.
 * Expands to TWO comma-separated initializers, so it is only meaningful
 * inside a brace-enclosed initializer list.
 */
#define LIMB(x) (x##ull)&((1ull<<LBITS)-1), (x##ull)>>LBITS

/*
 * Initializer for a p448_t from eight 56-bit literals (a is the least
 * significant).  Each literal becomes two 28-bit limbs, filling all 16 limbs.
 */
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
    {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}
a->limb[i] + b->limb[i]; 109 | } 110 | */ 111 | } 112 | 113 | void 114 | p448_sub_RAW ( 115 | p448_t *out, 116 | const p448_t *a, 117 | const p448_t *b 118 | ) { 119 | unsigned int i; 120 | for (i=0; ilimb[0]); i++) { 126 | out->limb[i] = a->limb[i] - b->limb[i]; 127 | } 128 | */ 129 | } 130 | 131 | void 132 | p448_copy ( 133 | p448_t *out, 134 | const p448_t *a 135 | ) { 136 | *out = *a; 137 | } 138 | 139 | void 140 | p448_bias ( 141 | p448_t *a, 142 | int amt 143 | ) { 144 | uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt; 145 | uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1}; 146 | uint32x4_t *aa = (uint32x4_t*) a; 147 | aa[0] += lo; 148 | aa[1] += lo; 149 | aa[2] += hi; 150 | aa[3] += lo; 151 | } 152 | 153 | void 154 | p448_weak_reduce ( 155 | p448_t *a 156 | ) { 157 | uint64_t mask = (1ull<<28) - 1; 158 | uint64_t tmp = a->limb[15] >> 28; 159 | int i; 160 | a->limb[8] += tmp; 161 | for (i=15; i>0; i--) { 162 | a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28); 163 | } 164 | a->limb[0] = (a->limb[0] & mask) + tmp; 165 | } 166 | 167 | #ifdef __cplusplus 168 | }; /* extern "C" */ 169 | #endif 170 | 171 | #endif /* __P448_H__ */ 172 | -------------------------------------------------------------------------------- /src/p448/arch_neon_experimental/p448.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2014 Cryptography Research, Inc. 2 | * Released under the MIT License. See LICENSE.txt for license information. 
/**
 * Weak (non-canonical) reduction, two limbs per step via 64-bit NEON vectors.
 *
 * With LIMBPERM(x) = ((x<<1 | x>>3) & 15), storage slot 2k holds limb k and
 * slot 2k+1 holds limb k+8 (k = 0..7), so aa[k] is the pair {limb k, limb k+8}.
 */
void
p448_weak_reduce (
    p448_t *a
) {

    /* vmask keeps the low 28 bits of each lane; vm2 selects the second lane only.
     * tmp = {carry out of limb 7, carry out of limb 15}. */
    uint32x2_t *aa = (uint32x2_t*) a, vmask = {(1ull<<28)-1, (1ull<<28)-1}, vm2 = {0,-1},
       tmp = vshr_n_u32(aa[7],28);

    int i;
    /* vsra_n_u32(x, y, 28) = x + (y >> 28): each pair keeps its low 28 bits and
     * takes the carries of the pair below.  Going downwards reads aa[i-1]
     * before it is masked. */
    for (i=7; i>=1; i--) {
        aa[i] = vsra_n_u32(aa[i] & vmask, aa[i-1], 28);
    }
    /* Wrap-around: vrev64_u32(tmp) swaps the lanes, giving limb 0 the carry of
     * limb 15 and limb 8 the carry of limb 7; (tmp & vm2) adds the carry of
     * limb 15 to limb 8 as well. */
    aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp&vm2);
}
/*
 * Storage slot of logical limb x.  For x = 0..8 this yields slots
 * 0,4,8, 1,5,9, 2,6,10: limb x lives in lane x/3 of the (x%3)-th group of four
 * 64-bit slots, and slots 3, 7 and 11 of the 12-entry limb array are never
 * produced by the mapping.
 */
#define LIMBPERM(x) (((x)%3)*4 + (x)/3)
/* Advertises the permuted layout above to code that includes this header. */
#define USE_P521_3x3_TRANSPOSE
/**
 * Returns the lane-wise sum of two swizzles of u:
 *   (u.z + u.z, u.x + u.w, u.y + u.w, u.w + u.w).
 * With the unused fourth lane equal to zero this is (2*z, x, y, 0): the lanes
 * rotated up by one position with the wrapped-around lane doubled.
 *
 * The .zxyw / .zwww component syntax is a vector-swizzle extension.
 * NOTE(review): presumably this is "multiply by the limb-group weight" for
 * the carry wrap in p521_weak_reduce -- confirm against the field layout.
 */
static inline uint64x3_t
__attribute__((unused))
timesW (
    uint64x3_t u
) {
    return u.zxyw + u.zwww;
}
3 | */ 4 | #ifndef __p480_H__ 5 | #define __p480_H__ 1 6 | 7 | #include 8 | #include 9 | 10 | #include "word.h" 11 | 12 | typedef struct p480_t { 13 | uint64_t limb[8]; 14 | } __attribute__((aligned(32))) p480_t; 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | static __inline__ void 21 | p480_add_RAW ( 22 | p480_t *out, 23 | const p480_t *a, 24 | const p480_t *b 25 | ) __attribute__((unused,always_inline)); 26 | 27 | static __inline__ void 28 | p480_sub_RAW ( 29 | p480_t *out, 30 | const p480_t *a, 31 | const p480_t *b 32 | ) __attribute__((unused,always_inline)); 33 | 34 | static __inline__ void 35 | p480_copy ( 36 | p480_t *out, 37 | const p480_t *a 38 | ) __attribute__((unused,always_inline)); 39 | 40 | static __inline__ void 41 | p480_weak_reduce ( 42 | p480_t *inout 43 | ) __attribute__((unused,always_inline)); 44 | 45 | void 46 | p480_strong_reduce ( 47 | p480_t *inout 48 | ); 49 | 50 | static __inline__ void 51 | p480_bias ( 52 | p480_t *inout, 53 | int amount 54 | ) __attribute__((unused,always_inline)); 55 | 56 | void 57 | p480_mul ( 58 | p480_t *__restrict__ out, 59 | const p480_t *a, 60 | const p480_t *b 61 | ); 62 | 63 | void 64 | p480_mulw ( 65 | p480_t *__restrict__ out, 66 | const p480_t *a, 67 | uint64_t b 68 | ); 69 | 70 | void 71 | p480_sqr ( 72 | p480_t *__restrict__ out, 73 | const p480_t *a 74 | ); 75 | 76 | void 77 | p480_serialize ( 78 | uint8_t *serial, 79 | const struct p480_t *x 80 | ); 81 | 82 | mask_t 83 | p480_deserialize ( 84 | p480_t *x, 85 | const uint8_t serial[60] 86 | ); 87 | 88 | /* -------------- Inline functions begin here -------------- */ 89 | 90 | void 91 | p480_add_RAW ( 92 | p480_t *out, 93 | const p480_t *a, 94 | const p480_t *b 95 | ) { 96 | unsigned int i; 97 | for (i=0; ilimb[0]); i++) { 103 | out->limb[i] = a->limb[i] + b->limb[i]; 104 | } 105 | */ 106 | } 107 | 108 | void 109 | p480_sub_RAW ( 110 | p480_t *out, 111 | const p480_t *a, 112 | const p480_t *b 113 | ) { 114 | unsigned int i; 115 | for 
(i=0; ilimb[0]); i++) { 121 | out->limb[i] = a->limb[i] - b->limb[i]; 122 | } 123 | */ 124 | } 125 | 126 | void 127 | p480_copy ( 128 | p480_t *out, 129 | const p480_t *a 130 | ) { 131 | unsigned int i; 132 | for (i=0; ilimb[i] += (i==4) ? co2 : co1; 160 | } 161 | #endif 162 | } 163 | 164 | void 165 | p480_weak_reduce ( 166 | p480_t *a 167 | ) { 168 | /* PERF: use pshufb/palignr if anyone cares about speed of this */ 169 | uint64_t mask = (1ull<<60) - 1; 170 | uint64_t tmp = a->limb[7] >> 60; 171 | int i; 172 | a->limb[4] += tmp; 173 | for (i=7; i>0; i--) { 174 | a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>60); 175 | } 176 | a->limb[0] = (a->limb[0] & mask) + tmp; 177 | } 178 | 179 | #ifdef __cplusplus 180 | }; /* extern "C" */ 181 | #endif 182 | 183 | #endif /* __p480_H__ */ 184 | -------------------------------------------------------------------------------- /src/p448/arch_x86_64/p448.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2014 Cryptography Research, Inc. 2 | * Released under the MIT License. See LICENSE.txt for license information. 
3 | */ 4 | #ifndef __P448_H__ 5 | #define __P448_H__ 1 6 | 7 | #include 8 | #include 9 | 10 | #include "word.h" 11 | 12 | typedef struct p448_t { 13 | uint64_t limb[8]; 14 | } __attribute__((aligned(32))) p448_t; 15 | 16 | #define LBITS 56 17 | #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}} 18 | 19 | #ifdef __cplusplus 20 | extern "C" { 21 | #endif 22 | 23 | static __inline__ void 24 | p448_add_RAW ( 25 | p448_t *out, 26 | const p448_t *a, 27 | const p448_t *b 28 | ) __attribute__((unused,always_inline)); 29 | 30 | static __inline__ void 31 | p448_sub_RAW ( 32 | p448_t *out, 33 | const p448_t *a, 34 | const p448_t *b 35 | ) __attribute__((unused,always_inline)); 36 | 37 | static __inline__ void 38 | p448_copy ( 39 | p448_t *out, 40 | const p448_t *a 41 | ) __attribute__((unused,always_inline)); 42 | 43 | static __inline__ void 44 | p448_weak_reduce ( 45 | p448_t *inout 46 | ) __attribute__((unused,always_inline)); 47 | 48 | void 49 | p448_strong_reduce ( 50 | p448_t *inout 51 | ); 52 | 53 | static __inline__ void 54 | p448_bias ( 55 | p448_t *inout, 56 | int amount 57 | ) __attribute__((unused,always_inline)); 58 | 59 | void 60 | p448_mul ( 61 | p448_t *__restrict__ out, 62 | const p448_t *a, 63 | const p448_t *b 64 | ); 65 | 66 | void 67 | p448_mulw ( 68 | p448_t *__restrict__ out, 69 | const p448_t *a, 70 | uint64_t b 71 | ); 72 | 73 | void 74 | p448_sqr ( 75 | p448_t *__restrict__ out, 76 | const p448_t *a 77 | ); 78 | 79 | void 80 | p448_serialize ( 81 | uint8_t *serial, 82 | const struct p448_t *x 83 | ); 84 | 85 | mask_t 86 | p448_deserialize ( 87 | p448_t *x, 88 | const uint8_t serial[56] 89 | ); 90 | 91 | /* -------------- Inline functions begin here -------------- */ 92 | 93 | void 94 | p448_add_RAW ( 95 | p448_t *out, 96 | const p448_t *a, 97 | const p448_t *b 98 | ) { 99 | unsigned int i; 100 | for (i=0; ilimb[0]); i++) { 106 | out->limb[i] = a->limb[i] + b->limb[i]; 107 | } 108 | */ 109 | } 110 | 111 | void 112 | p448_sub_RAW ( 113 | p448_t 
*out, 114 | const p448_t *a, 115 | const p448_t *b 116 | ) { 117 | unsigned int i; 118 | for (i=0; ilimb[0]); i++) { 124 | out->limb[i] = a->limb[i] - b->limb[i]; 125 | } 126 | */ 127 | } 128 | 129 | void 130 | p448_copy ( 131 | p448_t *out, 132 | const p448_t *a 133 | ) { 134 | unsigned int i; 135 | for (i=0; ilimb[i] += (i==4) ? co2 : co1; 163 | } 164 | #endif 165 | } 166 | 167 | void 168 | p448_weak_reduce ( 169 | p448_t *a 170 | ) { 171 | /* PERF: use pshufb/palignr if anyone cares about speed of this */ 172 | uint64_t mask = (1ull<<56) - 1; 173 | uint64_t tmp = a->limb[7] >> 56; 174 | int i; 175 | a->limb[4] += tmp; 176 | for (i=7; i>0; i--) { 177 | a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>56); 178 | } 179 | a->limb[0] = (a->limb[0] & mask) + tmp; 180 | } 181 | 182 | #ifdef __cplusplus 183 | }; /* extern "C" */ 184 | #endif 185 | 186 | #endif /* __P448_H__ */ 187 | -------------------------------------------------------------------------------- /include/decaf_crypto.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file decaf_crypto.h 3 | * @copyright 4 | * Copyright (c) 2015 Cryptography Research, Inc. \n 5 | * Released under the MIT License. See LICENSE.txt for license information. 6 | * @author Mike Hamburg 7 | * @brief Example Decaf cyrpto routines. 8 | * @warning These are merely examples, though they ought to be secure. But real 9 | * protocols will decide differently on magic numbers, formats, which items to 10 | * hash, etc. 11 | * @warning Experimental! The names, parameter orders etc are likely to change. 
/**
 * Private key: the symmetric seed together with the scalar and the encoded
 * public key stored alongside it.  The members are hidden from the generated
 * documentation by the \@cond internal section.
 */
typedef struct {
    /** @cond internal */
    /** The symmetric key from which everything is expanded */
    decaf_448_symmetric_key_t sym;
    
    /** The scalar x */
    decaf_448_scalar_t secret_scalar;
    
    /** x*Base */
    decaf_448_public_key_t pub;
    /** @endcond */
} /** Private key structure for pointers. */
  decaf_448_private_key_s,
  /** A private key (gmp array[1] style). */
  decaf_448_private_key_t[1];
/**
 * @brief Convert a private key to a public one.
 * @param [out] pub The extracted public key.
 * @param [in] priv The private key.
 */
void decaf_448_private_to_public (
    decaf_448_public_key_t pub,
    const decaf_448_private_key_t priv
) NONNULL2 API_VIS;
/**
 * @brief Verify a signed message from its SHAKE context.
 *
 * @param [in] sig The signature.
 * @param [in] pub The public key.
 * @param [in] shake A SHAKE256 context with the message.
 *
 * @return The verification result as a decaf_bool_t.  The declaration is
 * marked warn_unused_result, so callers are expected to check it.
 * NOTE(review): presumably DECAF_SUCCESS for a valid signature and
 * DECAF_FAILURE otherwise, as for decaf_448_shared_secret -- confirm against
 * the implementation.
 */
decaf_bool_t
decaf_448_verify_shake (
    const decaf_448_signature_t sig,
    const decaf_448_public_key_t pub,
    const keccak_sponge_t shake
) NONNULL3 API_VIS WARN_UNUSED;
3 | */ 4 | 5 | /** 6 | * @file decaf_precompute.c 7 | * @author Mike Hamburg 8 | * @brief Decaf global constant table precomputation. 9 | */ 10 | 11 | #define _XOPEN_SOURCE 600 /* for posix_memalign */ 12 | #include 13 | #include 14 | #include "decaf.h" 15 | #include "decaf_448_config.h" /* MAGIC */ 16 | #include "field.h" 17 | 18 | #define API_NS(_id) decaf_448_##_id 19 | #define API_NS2(_pref,_id) _pref##_decaf_448_##_id 20 | 21 | /* To satisfy linker. */ 22 | const field_t API_NS(precomputed_base_as_fe)[1]; 23 | const API_NS(scalar_t) API_NS(precomputed_scalarmul_adjustment); 24 | const API_NS(scalar_t) API_NS(point_scalarmul_adjustment); 25 | const API_NS(scalar_t) sc_r2 = {{{0}}}; 26 | const decaf_word_t MONTGOMERY_FACTOR = 0; 27 | const unsigned char base_point_ser_for_pregen[DECAF_448_SER_BYTES]; 28 | 29 | const API_NS(point_t) API_NS(point_base); 30 | 31 | struct niels_s; 32 | const field_t *API_NS(precomputed_wnaf_as_fe); 33 | extern const size_t API_NS2(sizeof,precomputed_wnafs); 34 | 35 | void API_NS(precompute_wnafs) ( 36 | struct niels_s *out, 37 | const API_NS(point_t) base 38 | ); 39 | 40 | /* TODO: use SC_LIMB? 
*/ 41 | static void scalar_print(const char *name, const API_NS(scalar_t) sc) { 42 | printf("const API_NS(scalar_t) %s = {{{\n", name); 43 | unsigned i; 44 | for (i=0; ilimb[i] ); 47 | } 48 | printf("}}};\n\n"); 49 | } 50 | 51 | static void field_print(const field_t *f) { 52 | const int FIELD_SER_BYTES = (FIELD_BITS + 7) / 8; 53 | unsigned char ser[FIELD_SER_BYTES]; 54 | field_serialize(ser,f); 55 | int b=0, i, comma=0; 56 | unsigned long long limb = 0; 57 | printf("FIELD_LITERAL("); 58 | for (i=0; i= FIELD_LIT_LIMB_BITS) { 62 | limb &= (1ull<>(8-b); 68 | } 69 | } 70 | printf(")"); 71 | assert(b<8); 72 | } 73 | 74 | int main(int argc, char **argv) { 75 | (void)argc; (void)argv; 76 | 77 | API_NS(point_t) real_point_base; 78 | int ret = API_NS(point_decode)(real_point_base,base_point_ser_for_pregen,0); 79 | if (!ret) return 1; 80 | 81 | API_NS(precomputed_s) *pre; 82 | ret = posix_memalign((void**)&pre, API_NS2(alignof,precomputed_s), API_NS2(sizeof,precomputed_s)); 83 | if (ret || !pre) return 1; 84 | API_NS(precompute)(pre, real_point_base); 85 | 86 | struct niels_s *preWnaf; 87 | ret = posix_memalign((void**)&preWnaf, API_NS2(alignof,precomputed_s), API_NS2(sizeof,precomputed_wnafs)); 88 | if (ret || !preWnaf) return 1; 89 | API_NS(precompute_wnafs)(preWnaf, real_point_base); 90 | 91 | const field_t *output; 92 | unsigned i; 93 | 94 | printf("/** @warning: this file was automatically generated. 
*/\n"); 95 | printf("#include \"field.h\"\n\n"); 96 | printf("#include \"decaf.h\"\n\n"); 97 | printf("#define API_NS(_id) decaf_448_##_id\n"); 98 | printf("#define API_NS2(_pref,_id) _pref##_decaf_448_##_id\n"); 99 | 100 | output = (const field_t *)real_point_base; 101 | printf("const API_NS(point_t) API_NS(point_base) = {{\n"); 102 | for (i=0; i < sizeof(API_NS(point_t)); i+=sizeof(field_t)) { 103 | if (i) printf(",\n "); 104 | printf("{"); 105 | field_print(output++); 106 | printf("}"); 107 | } 108 | printf("\n}};\n"); 109 | 110 | output = (const field_t *)pre; 111 | printf("const field_t API_NS(precomputed_base_as_fe)[%d]\n", 112 | (int)(API_NS2(sizeof,precomputed_s) / sizeof(field_t))); 113 | printf("__attribute__((aligned(%d),visibility(\"hidden\"))) = {\n ", (int)API_NS2(alignof,precomputed_s)); 114 | 115 | for (i=0; i < API_NS2(sizeof,precomputed_s); i+=sizeof(field_t)) { 116 | if (i) printf(",\n "); 117 | field_print(output++); 118 | } 119 | printf("\n};\n"); 120 | 121 | output = (const field_t *)preWnaf; 122 | printf("const field_t API_NS(precomputed_wnaf_as_fe)[%d]\n", 123 | (int)(API_NS2(sizeof,precomputed_wnafs) / sizeof(field_t))); 124 | printf("__attribute__((aligned(%d),visibility(\"hidden\"))) = {\n ", (int)API_NS2(alignof,precomputed_s)); 125 | for (i=0; i < API_NS2(sizeof,precomputed_wnafs); i+=sizeof(field_t)) { 126 | if (i) printf(",\n "); 127 | field_print(output++); 128 | } 129 | printf("\n};\n"); 130 | 131 | API_NS(scalar_t) smadj; 132 | API_NS(scalar_copy)(smadj,API_NS(scalar_one)); 133 | 134 | for (i=0; ilimb[0]+1; 157 | #if DECAF_WORD_BITS == 32 158 | plo |= ((unsigned long long)smadj->limb[1]) << 32; 159 | #endif 160 | for (i=0; i<6; i++) { 161 | w *= w*plo + 2; 162 | } 163 | printf("const decaf_word_t MONTGOMERY_FACTOR = (decaf_word_t)0x%016llxull;\n\n", w); 164 | 165 | return 0; 166 | } 167 | -------------------------------------------------------------------------------- /src/p448/arch_32/p448.c: 
/**
 * Field multiplication on 16 limbs of 28 bits: cs = as * bs.
 *
 * Each operand is treated as a low half (limbs 0..7) and a high half
 * (limbs 8..15).  Three families of 32x32->64 partial products are used per
 * output column -- low*low, high*high and (low+high)*(low+high) -- in the
 * manner of Karatsuba multiplication, and are combined directly into the low
 * output column c[j] (accum0) and the high output column c[j+8] (accum1).
 *
 * cs is __restrict__-qualified and is written while as/bs are still being
 * read, so it must not alias either input.
 *
 * The last carries are folded once into c[0]/c[8] and then added unmasked to
 * c[1]/c[9], so the result is only weakly reduced: those limbs may exceed
 * 28 bits.
 */
void
p448_mul (
    p448_t *__restrict__ cs,
    const p448_t *as,
    const p448_t *bs
) {
    const uint32_t *a = as->limb, *b = bs->limb;
    uint32_t *c = cs->limb;

    uint64_t accum0 = 0, accum1 = 0, accum2 = 0;
    uint32_t mask = (1<<28) - 1;

    /* Half sums: aa = a_lo + a_hi, bb = b_lo + b_hi (limb-wise, 32-bit). */
    uint32_t aa[8], bb[8];

    int i,j;
    for (i=0; i<8; i++) {
        aa[i] = a[i] + a[i+8];
        bb[i] = b[i] + b[i+8];
    }

    /* One pass per column j; accum0/accum1 carry over between columns. */
    for (j=0; j<8; j++) {
        accum2 = 0;

        /* Terms whose limb indices sum to j without wrapping. */
        for (i=0; i<=j; i++) {
            accum2 += widemul_32(a[j-i],b[i]);
            accum1 += widemul_32(aa[j-i],bb[i]);
            accum0 += widemul_32(a[8+j-i], b[8+i]);
        }

        /* accum2 holds the low*low sum: remove it from the half-sum product
         * and add it to the low column. */
        accum1 -= accum2;
        accum0 += accum2;
        accum2 = 0;

        /* Remaining terms (i > j), whose indices wrap past the half length. */
        for (; i<8; i++) {
            accum0 -= widemul_32(a[8+j-i], b[i]);
            accum2 += widemul_32(aa[8+j-i], bb[i]);
            accum1 += widemul_32(a[16+j-i], b[8+i]);
        }

        /* accum2 now holds the wrapped half-sum products; they feed both columns. */
        accum1 += accum2;
        accum0 += accum2;

        /* Emit 28 bits per column and keep the rest as carry. */
        c[j] = ((uint32_t)(accum0)) & mask;
        c[j+8] = ((uint32_t)(accum1)) & mask;

        accum0 >>= 28;
        accum1 >>= 28;
    }

    /* Fold the two leftover carries back in: the high-column carry goes to
     * both c[0] and c[8], the low-column carry to c[8]. */
    accum0 += accum1;
    accum0 += c[8];
    accum1 += c[0];
    c[8] = ((uint32_t)(accum0)) & mask;
    c[0] = ((uint32_t)(accum1)) & mask;

    /* One more carry step, left unmasked. */
    accum0 >>= 28;
    accum1 >>= 28;
    c[9] += ((uint32_t)(accum0));
    c[1] += ((uint32_t)(accum1));
}
uint64_t b 90 | ) { /* blo: low 28 bits of b; bhi: the bits above them, truncated to 32 bits */ 91 | const uint32_t bhi = b>>28, blo = b & ((1<<28)-1); 92 | 93 | const uint32_t *a = as->limb; 94 | uint32_t *c = cs->limb; 95 | 96 | uint64_t accum0, accum8; 97 | uint32_t mask = (1ull<<28)-1; 98 | 99 | int i; 100 | /* limbs 0 and 8 first: their bhi terms come from the top limb a[15] (plus a[7] for limb 8) */ 101 | accum0 = widemul_32(blo, a[0]); 102 | accum8 = widemul_32(blo, a[8]); 103 | accum0 += widemul_32(bhi, a[15]); 104 | accum8 += widemul_32(bhi, a[15] + a[7]); 105 | 106 | c[0] = accum0 & mask; accum0 >>= 28; 107 | c[8] = accum8 & mask; accum8 >>= 28; 108 | /* remaining limbs: blo times the limb itself plus bhi times the limb one position below */ 109 | for (i=1; i<8; i++) { 110 | accum0 += widemul_32(blo, a[i]); 111 | accum8 += widemul_32(blo, a[i+8]); 112 | 113 | accum0 += widemul_32(bhi, a[i-1]); 114 | accum8 += widemul_32(bhi, a[i+7]); 115 | 116 | c[i] = accum0 & mask; accum0 >>= 28; 117 | c[i+8] = accum8 & mask; accum8 >>= 28; 118 | } 119 | /* fold the two final carries back into limbs 8/9 and 0/1 */ 120 | accum0 += accum8 + c[8]; 121 | c[8] = accum0 & mask; 122 | c[9] += accum0 >> 28; 123 | 124 | accum8 += c[0]; 125 | c[0] = accum8 & mask; 126 | c[1] += accum8 >> 28; 127 | } 128 | /* cs = as * as, computed with the general multiply. */ 129 | void 130 | p448_sqr ( 131 | p448_t *__restrict__ cs, 132 | const p448_t *as 133 | ) { 134 | p448_mul(cs,as,as); /* PERF */ 135 | } 136 | /* Fully reduce a in place: clear the bits above 28 in the top limb, subtract p, then add p back when the subtraction borrowed. */ 137 | void 138 | p448_strong_reduce ( 139 | p448_t *a 140 | ) { 141 | word_t mask = (1ull<<28)-1; 142 | 143 | /* first, clear high */ 144 | a->limb[8] += a->limb[15]>>28; 145 | a->limb[0] += a->limb[15]>>28; 146 | a->limb[15] &= mask; 147 | 148 | /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */ 149 | 150 | /* compute total_value - p. No need to reduce mod p. */ 151 | 152 | dsword_t scarry = 0; 153 | int i; 154 | for (i=0; i<16; i++) { 155 | scarry = scarry + a->limb[i] - ((i==8)?mask-1:mask); 156 | a->limb[i] = scarry & mask; 157 | scarry >>= 28; 158 | } 159 | 160 | /* uncommon case: it was >= p, so now scarry = 0 and this = x 161 | * common case: it was < p, so now scarry = -1 and this = x - p + 2^448 162 | * so let's add back in p. will carry back off the top for 2^448. 
163 | */ 164 | /* the borrow out of the subtraction must be exactly 0 or -1 */ 165 | assert(is_zero(scarry) | is_zero(scarry+1)); 166 | /* scarry_mask: 0 when scarry == 0, the 28-bit limb mask when scarry == -1 */ 167 | word_t scarry_mask = scarry & mask; 168 | dword_t carry = 0; 169 | 170 | /* add it back */ 171 | for (i=0; i<16; i++) { /* limb 8 gets the mask with its low bit cleared, matching the mask-1 subtracted there earlier */ 172 | carry = carry + a->limb[i] + ((i==8)?(scarry_mask&~1):scarry_mask); 173 | a->limb[i] = carry & mask; 174 | carry >>= 28; 175 | } 176 | 177 | assert(is_zero(carry + scarry)); 178 | } 179 | /* Write the strongly reduced value of x to serial as 56 bytes: each pair of 28-bit limbs is packed into 56 bits and emitted low byte first. x itself is not modified (a copy is reduced). */ 180 | void 181 | p448_serialize ( 182 | uint8_t *serial, 183 | const struct p448_t *x 184 | ) { 185 | int i,j; 186 | p448_t red; 187 | p448_copy(&red, x); 188 | p448_strong_reduce(&red); 189 | for (i=0; i<8; i++) { 190 | uint64_t limb = red.limb[2*i] + (((uint64_t)red.limb[2*i+1])<<28); 191 | for (j=0; j<7; j++) { 192 | serial[7*i+j] = limb; 193 | limb >>= 8; 194 | } 195 | assert(limb == 0); 196 | } 197 | } 198 | /* Read 56 bytes from serial into the 16 limbs of x (7 bytes per limb pair, low byte first), then derive the returned mask from the reduction check explained below. */ 199 | mask_t 200 | p448_deserialize ( 201 | p448_t *x, 202 | const uint8_t serial[56] 203 | ) { 204 | int i,j; 205 | for (i=0; i<8; i++) { 206 | uint64_t out = 0; 207 | for (j=0; j<7; j++) { 208 | out |= ((uint64_t)serial[7*i+j])<<(8*j); 209 | } 210 | x->limb[2*i] = out & ((1ull<<28)-1); 211 | x->limb[2*i+1] = out >> 28; 212 | } 213 | 214 | /* Check for reduction. 215 | * 216 | * The idea is to create a variable ge which is all ones (rather, 28 ones) 217 | * if and only if the low $i$ words of $x$ are >= those of p. 218 | * 219 | * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) 220 | */ 221 | uint32_t ge = -1, mask = (1ull<<28)-1; 222 | for (i=0; i<8; i++) { 223 | ge &= x->limb[i]; 224 | } 225 | 226 | /* At this point, ge = 1111 iff bottom are all 1111. 
Now propagate if 1110, or set if 1111 */ 227 | ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask); 228 | 229 | /* Propagate the rest */ 230 | for (i=9; i<16; i++) { 231 | ge &= x->limb[i]; 232 | } 233 | /* ge == mask only when every comparison above held, i.e. the input was >= p; the result is then 0, and all-ones otherwise */ 234 | return ~is_zero(ge ^ mask); 235 | } 236 | 237 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2014 Cryptography Research, Inc. 2 | # Released under the MIT License. See LICENSE.txt for license information. 3 | 4 | 5 | UNAME := $(shell uname) 6 | MACHINE := $(shell uname -m) 7 | 8 | ifeq ($(UNAME),Darwin) 9 | CC = clang 10 | CXX = clang++ 11 | else 12 | CC = gcc 13 | CXX = g++ 14 | endif 15 | LD = $(CC) 16 | LDXX = $(CXX) 17 | ASM ?= $(CC) 18 | 19 | DECAF ?= decaf_fast 20 | 21 | ifneq (,$(findstring x86_64,$(MACHINE))) 22 | ARCH ?= arch_x86_64 23 | else 24 | # no i386 port yet 25 | ARCH ?= arch_arm_32 26 | endif 27 | 28 | FIELD ?= p448 29 | 30 | WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \ 31 | -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN) 32 | 33 | 34 | INCFLAGS = -Isrc/include -Iinclude -Isrc/$(FIELD) -Isrc/$(FIELD)/$(ARCH) 35 | LANGFLAGS = -std=c99 -fno-strict-aliasing 36 | LANGXXFLAGS = -fno-strict-aliasing 37 | GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC 38 | OFLAGS ?= -O3 39 | 40 | TODAY = $(shell date "+%Y-%m-%d") 41 | 42 | ifneq (,$(findstring arm,$(MACHINE))) 43 | ifneq (,$(findstring neon,$(ARCH))) 44 | ARCHFLAGS += -mfpu=neon 45 | else 46 | ARCHFLAGS += -mfpu=vfpv3-d16 47 | endif 48 | ARCHFLAGS += -mcpu=cortex-a8 # FIXME 49 | GENFLAGS += -DN_TESTS_BASE=1000 # sooooo sloooooow 50 | else 51 | ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO 52 | endif 53 | 54 | ifeq ($(CC),clang) 55 | WARNFLAGS += -Wgcc-compat 56 | endif 57 | 58 | SAGE ?= sage 59 | SAGES= $(shell ls test/*.sage) 60 | BUILDPYS= 
$(SAGES:test/%.sage=build/%.py) 61 | 62 | ARCHFLAGS += $(XARCHFLAGS) 63 | CFLAGS = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XCFLAGS) 64 | CXXFLAGS = $(LANGXXFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XCXXFLAGS) 65 | LDFLAGS = $(ARCHFLAGS) $(XLDFLAGS) 66 | ASFLAGS = $(ARCHFLAGS) $(XASFLAGS) 67 | 68 | .PHONY: clean all test bench todo doc lib bat sage sagetest scan microbench 69 | .PRECIOUS: build/%.s 70 | 71 | HEADERS= Makefile $(shell find src include test -name "*.h") $(shell find . -name "*.hxx") build/timestamp 72 | 73 | 74 | DECAFCOMPONENTS= build/$(DECAF).o build/shake.o build/decaf_crypto.o \ 75 | build/$(FIELD).o build/f_arithmetic.o # TODO 76 | ifeq ($(DECAF),decaf_fast) 77 | DECAFCOMPONENTS += build/decaf_tables.o 78 | endif 79 | 80 | BENCHCOMPONENTS = build/bench.o build/shake.o 81 | 82 | BATBASE=ed448goldilocks_decaf_bats_$(TODAY) 83 | BATNAME=build/$(BATBASE) 84 | 85 | all: lib build/test build/bench build/shakesum 86 | 87 | scan: clean 88 | scan-build --use-analyzer=`which clang` \ 89 | -enable-checker deadcode -enable-checker llvm \ 90 | -enable-checker osx -enable-checker security -enable-checker unix \ 91 | make build/bench build/test all 92 | 93 | build/test: build/test_decaf.o lib 94 | ifeq ($(UNAME),Darwin) 95 | $(LDXX) $(LDFLAGS) -o $@ $< -Lbuild -ldecaf 96 | else 97 | $(LDXX) $(LDFLAGS) -Wl,-rpath,`pwd`/build -o $@ $< -Lbuild -ldecaf 98 | endif 99 | 100 | build/bench: build/bench_decaf.o lib 101 | ifeq ($(UNAME),Darwin) 102 | $(LDXX) $(LDFLAGS) -o $@ $< -Lbuild -ldecaf 103 | else 104 | $(LDXX) $(LDFLAGS) -Wl,-rpath,`pwd`/build -o $@ $< -Lbuild -ldecaf 105 | endif 106 | 107 | build/shakesum: build/shakesum.o build/shake.o 108 | $(LD) $(LDFLAGS) -o $@ $^ 109 | 110 | lib: build/libdecaf.so 111 | 112 | build/libdecaf.so: $(DECAFCOMPONENTS) 113 | rm -f $@ 114 | ifeq ($(UNAME),Darwin) 115 | libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \ 116 | $(DECAFCOMPONENTS) 117 | else 118 | $(LD) 
$(LDFLAGS) -shared -Wl,-soname,libdecaf.so.1 -Wl,--gc-sections -o $@ $(DECAFCOMPONENTS) 119 | strip --discard-all $@ 120 | ln -sf `basename $@` build/libdecaf.so.1 121 | endif 122 | 123 | build/timestamp: 124 | mkdir -p build 125 | touch $@ 126 | 127 | build/%.o: build/%.s 128 | $(ASM) $(ASFLAGS) -c -o $@ $< 129 | 130 | build/decaf_gen_tables: build/decaf_gen_tables.o build/$(DECAF).o build/$(FIELD).o build/f_arithmetic.o 131 | $(LD) $(LDFLAGS) -o $@ $^ 132 | 133 | build/decaf_tables.c: build/decaf_gen_tables 134 | ./$< > $@ 135 | 136 | build/decaf_tables.s: build/decaf_tables.c $(HEADERS) 137 | $(CC) $(CFLAGS) -S -c -o $@ $< 138 | 139 | build/%.s: src/%.c $(HEADERS) 140 | $(CC) $(CFLAGS) -S -c -o $@ $< 141 | 142 | build/%.s: src/%.cxx $(HEADERS) 143 | $(CXX) $(CXXFLAGS) -S -c -o $@ $< 144 | 145 | build/%.s: test/%.c $(HEADERS) 146 | $(CC) $(CFLAGS) -S -c -o $@ $< 147 | 148 | build/%.s: test/%.cxx $(HEADERS) 149 | $(CXX) $(CXXFLAGS) -S -c -o $@ $< 150 | 151 | build/%.s: src/$(FIELD)/$(ARCH)/%.c $(HEADERS) 152 | $(CC) $(CFLAGS) -S -c -o $@ $< 153 | 154 | build/%.s: src/$(FIELD)/%.c $(HEADERS) 155 | $(CC) $(CFLAGS) -S -c -o $@ $< 156 | 157 | sage: $(BUILDPYS) 158 | 159 | sagetest: sage lib 160 | LD_LIBRARY_PATH=build $(SAGE) build/test_decaf.sage 161 | 162 | $(BUILDPYS): $(SAGES) build/timestamp 163 | cp -f $(SAGES) build/ 164 | $(SAGE) --preparse $(SAGES:test/%.sage=build/%.sage) 165 | # some sage versions compile to .sage.py 166 | for f in $(SAGES:test/%.sage=build/%); do \ 167 | if [ -e $$f.sage.py ]; then \ 168 | mv $$f.sage.py $$f.py; \ 169 | fi; \ 170 | done 171 | 172 | doc/timestamp: 173 | mkdir -p doc 174 | touch $@ 175 | 176 | doc: Doxyfile doc/timestamp include/*.h src/*.c src/include/*.h src/$(FIELD)/$(ARCH)/*.c src/$(FIELD)/$(ARCH)/*.h 177 | doxygen > /dev/null 178 | 179 | bat: $(BATNAME) 180 | 181 | $(BATNAME): include/* src/* src/*/* test/batarch.map build/decaf_tables.c # TODO tables some other way 182 | rm -fr $@ 183 | for prim in dh sign; do \ 184 | 
targ="$@/crypto_$$prim/ed448goldilocks_decaf"; \ 185 | (while read arch where; do \ 186 | mkdir -p $$targ/`basename $$arch`; \ 187 | cp include/*.h build/decaf_tables.c src/decaf_fast.c src/decaf_crypto.c src/shake.c src/include/*.h src/bat/$$prim.c src/p448/$$where/*.c src/p448/$$where/*.h src/p448/*.c src/p448/*.h $$targ/`basename $$arch`; \ 188 | cp src/bat/api_$$prim.h $$targ/`basename $$arch`/api.h; \ 189 | perl -p -i -e 's/SYSNAME/'`basename $(BATNAME)`_`basename $$arch`'/g' $$targ/`basename $$arch`/api.h; \ 190 | perl -p -i -e 's/__TODAY__/'$(TODAY)'/g' $$targ/`basename $$arch`/api.h; \ 191 | done \ 192 | ) < test/batarch.map; \ 193 | echo 'Mike Hamburg' > $$targ/designers; \ 194 | echo 'Ed448-Goldilocks Decaf sign and dh' > $$targ/description; \ 195 | done 196 | (cd build && tar czf $(BATBASE).tgz $(BATBASE) ) 197 | 198 | 199 | todo:: 200 | @(find * -name '*.h'; find * -name '*.c') | xargs grep -E --color=auto -w \ 201 | 'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC' 202 | @echo '=============================' 203 | @(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE MAGIC; do \ 204 | (find * -name '*.h'; find * -name '*.c') | xargs grep -E -w $$i > /dev/null || continue; \ 205 | /bin/echo -n $$i' ' | head -c 10; \ 206 | (find * -name '*.h'; find * -name '*.c') | xargs grep -E -w $$i| wc -l; \ 207 | done) 208 | @echo '=============================' 209 | @echo -n 'Total ' 210 | @(find * -name '*.h'; find * -name '*.c') | xargs grep -E -w \ 211 | 'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC' | wc -l 212 | 213 | bench: build/bench 214 | ./$< 215 | 216 | test: build/test 217 | build/test 218 | 219 | microbench: build/bench 220 | ./$< --micro 221 | 222 | clean: 223 | rm -fr build doc $(BATNAME) 224 | -------------------------------------------------------------------------------- /src/decaf_crypto.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @cond internal 3 | * @file decaf_crypto.c 4 | * @copyright 5 | * 
Copyright (c) 2015 Cryptography Research, Inc. \n 6 | * Released under the MIT License. See LICENSE.txt for license information. 7 | * @author Mike Hamburg 8 | * @brief Example Decaf crypto routines. 9 | */ 10 | 11 | #include "decaf_crypto.h" 12 | #include <string.h> 13 | 14 | static const unsigned int DECAF_448_SCALAR_OVERKILL_BYTES = DECAF_448_SCALAR_BYTES + 8; 15 | /* Expand the symmetric seed proto into a private key: hash the seed followed by the function-name string with SHAKE256, decode the output as the secret scalar, and store the seed, the scalar and the encoded public point in priv. The intermediate hash output is wiped before returning. */ 16 | void decaf_448_derive_private_key ( 17 | decaf_448_private_key_t priv, 18 | const decaf_448_symmetric_key_t proto 19 | ) { 20 | const char *magic = "decaf_448_derive_private_key"; 21 | uint8_t encoded_scalar[DECAF_448_SCALAR_OVERKILL_BYTES]; 22 | decaf_448_point_t pub; 23 | 24 | keccak_sponge_t sponge; 25 | shake256_init(sponge); 26 | shake256_update(sponge, proto, sizeof(decaf_448_symmetric_key_t)); 27 | shake256_update(sponge, (const unsigned char *)magic, strlen(magic)); 28 | shake256_final(sponge, encoded_scalar, sizeof(encoded_scalar)); 29 | shake256_destroy(sponge); 30 | 31 | memcpy(priv->sym, proto, sizeof(decaf_448_symmetric_key_t)); 32 | decaf_448_scalar_decode_long(priv->secret_scalar, encoded_scalar, sizeof(encoded_scalar)); 33 | 34 | decaf_448_precomputed_scalarmul(pub, decaf_448_precomputed_base, priv->secret_scalar); 35 | decaf_448_point_encode(priv->pub, pub); 36 | 37 | decaf_bzero(encoded_scalar, sizeof(encoded_scalar)); 38 | } 39 | /* Wipe the whole private-key structure. */ 40 | void 41 | decaf_448_destroy_private_key ( 42 | decaf_448_private_key_t priv 43 | ) { 44 | decaf_bzero((void*)priv, sizeof(decaf_448_private_key_t)); 45 | } 46 | /* Copy the public key stored inside priv into pub. */ 47 | void decaf_448_private_to_public ( 48 | decaf_448_public_key_t pub, 49 | const decaf_448_private_key_t priv 50 | ) { 51 | memcpy(pub, priv->pub, sizeof(decaf_448_public_key_t)); 52 | } 53 | /* Write shared_bytes bytes of SHAKE256 output to shared, hashing both public keys (lesser first) and then the scalar-multiplication result. Returns the flag produced by decaf_448_direct_scalarmul. */ 54 | decaf_bool_t 55 | decaf_448_shared_secret ( 56 | uint8_t *shared, 57 | size_t shared_bytes, 58 | const decaf_448_private_key_t my_privkey, 59 | const decaf_448_public_key_t your_pubkey 60 | ) { 61 | uint8_t ss_ser[DECAF_448_SER_BYTES]; 62 | const char *nope = "decaf_448_ss_invalid"; 63 | 64 | unsigned i; 
65 | /* Lexsort keys. Less will be -1 if mine is less, and 0 otherwise. */ 66 | uint16_t less = 0; 67 | for (i=0; ipub[i]; 69 | delta -= your_pubkey[i]; 70 | /* Case: 71 | * = -> delta = 0 -> hi delta-1 = -1, hi delta = 0 72 | * > -> delta > 0 -> hi delta-1 = 0, hi delta = 0 73 | * < -> delta < 0 -> hi delta-1 = (doesnt matter), hi delta = -1 74 | */ 75 | less &= delta-1; 76 | less |= delta; 77 | } 78 | less >>= 8; 79 | 80 | keccak_sponge_t sponge; 81 | shake256_init(sponge); 82 | 83 | /* update the lesser */ 84 | for (i=0; ipub[i] & less) | (your_pubkey[i] & ~less); 86 | } 87 | shake256_update(sponge, ss_ser, sizeof(ss_ser)); 88 | 89 | /* update the greater */ 90 | for (i=0; ipub[i] & ~less) | (your_pubkey[i] & less); 92 | } 93 | shake256_update(sponge, ss_ser, sizeof(ss_ser)); 94 | 95 | decaf_bool_t ret = decaf_448_direct_scalarmul(ss_ser, your_pubkey, my_privkey->secret_scalar, DECAF_FALSE, DECAF_TRUE); 96 | /* If invalid, then replace ... */ 97 | for (i=0; isym)) { 101 | ss_ser[i] |= my_privkey->sym[i] & ~ret; 102 | } else if (i - sizeof(my_privkey->sym) < strlen(nope)) { 103 | ss_ser[i] |= nope[i-sizeof(my_privkey->sym)] & ~ret; 104 | } 105 | } 106 | 107 | shake256_update(sponge, ss_ser, sizeof(ss_ser)); 108 | shake256_final(sponge, shared, shared_bytes); 109 | shake256_destroy(sponge); 110 | 111 | decaf_bzero(ss_ser, sizeof(ss_ser)); 112 | 113 | return ret; 114 | } 115 | 116 | void 117 | decaf_448_sign_shake ( 118 | decaf_448_signature_t sig, 119 | const decaf_448_private_key_t priv, 120 | const keccak_sponge_t shake 121 | ) { 122 | const char *magic = "decaf_448_sign_shake"; 123 | 124 | uint8_t overkill[DECAF_448_SCALAR_OVERKILL_BYTES], encoded[DECAF_448_SER_BYTES]; 125 | decaf_448_point_t point; 126 | decaf_448_scalar_t nonce, challenge; 127 | 128 | /* Derive nonce */ 129 | keccak_sponge_t ctx; 130 | memcpy(ctx, shake, sizeof(ctx)); 131 | shake256_update(ctx, priv->sym, sizeof(priv->sym)); 132 | shake256_update(ctx, (const unsigned char *)magic, 
strlen(magic)); 133 | shake256_final(ctx, overkill, sizeof(overkill)); 134 | /* the nonce is the hash of the transcript, the private seed and the magic string, decoded as a scalar */ 135 | decaf_448_scalar_decode_long(nonce, overkill, sizeof(overkill)); 136 | decaf_448_precomputed_scalarmul(point, decaf_448_precomputed_base, nonce); 137 | decaf_448_point_encode(encoded, point); 138 | 139 | /* Derive challenge */ 140 | memcpy(ctx, shake, sizeof(ctx)); 141 | shake256_update(ctx, priv->pub, sizeof(priv->pub)); 142 | shake256_update(ctx, encoded, sizeof(encoded)); 143 | shake256_final(ctx, overkill, sizeof(overkill)); 144 | shake256_destroy(ctx); 145 | decaf_448_scalar_decode_long(challenge, overkill, sizeof(overkill)); 146 | 147 | /* Respond */ /* nonce becomes nonce - challenge * secret_scalar */ 148 | decaf_448_scalar_mul(challenge, challenge, priv->secret_scalar); 149 | decaf_448_scalar_sub(nonce, nonce, challenge); 150 | 151 | /* Save results */ /* layout: encoded nonce point first, response scalar right after it */ 152 | memcpy(sig, encoded, sizeof(encoded)); 153 | decaf_448_scalar_encode(&sig[sizeof(encoded)], nonce); 154 | 155 | /* Clean up */ 156 | decaf_448_scalar_destroy(nonce); 157 | decaf_448_scalar_destroy(challenge); 158 | decaf_bzero(overkill,sizeof(overkill)); 159 | decaf_bzero(encoded,sizeof(encoded)); 160 | } 161 | /* Check sig against pub for the transcript already absorbed into shake; works on a copy of the caller's sponge. The returned flag combines the decode results with the final point comparison. */ 162 | decaf_bool_t 163 | decaf_448_verify_shake ( 164 | const decaf_448_signature_t sig, 165 | const decaf_448_public_key_t pub, 166 | const keccak_sponge_t shake 167 | ) { 168 | decaf_bool_t ret; 169 | 170 | uint8_t overkill[DECAF_448_SCALAR_OVERKILL_BYTES]; 171 | decaf_448_point_t point, pubpoint; 172 | decaf_448_scalar_t challenge, response; 173 | 174 | /* Derive challenge */ 175 | keccak_sponge_t ctx; 176 | memcpy(ctx, shake, sizeof(ctx)); 177 | shake256_update(ctx, pub, sizeof(decaf_448_public_key_t)); 178 | shake256_update(ctx, sig, DECAF_448_SER_BYTES); 179 | shake256_final(ctx, overkill, sizeof(overkill)); 180 | shake256_destroy(ctx); 181 | decaf_448_scalar_decode_long(challenge, overkill, sizeof(overkill)); 182 | 183 | /* Decode points. 
*/ 184 | ret = decaf_448_point_decode(point, sig, DECAF_TRUE); 185 | ret &= decaf_448_point_decode(pubpoint, pub, DECAF_FALSE); 186 | ret &= decaf_448_scalar_decode(response, &sig[DECAF_448_SER_BYTES]); 187 | /* pubpoint is overwritten with the combination computed from response, pubpoint and challenge, then compared with the point taken from the signature */ 188 | decaf_448_base_double_scalarmul_non_secret ( 189 | pubpoint, response, pubpoint, challenge 190 | ); 191 | 192 | ret &= decaf_448_point_eq(pubpoint, point); 193 | 194 | return ret; 195 | } 196 | /* Sign message: absorb it into a fresh SHAKE256 sponge and hand that sponge to decaf_448_sign_shake. */ 197 | void 198 | decaf_448_sign ( 199 | decaf_448_signature_t sig, 200 | const decaf_448_private_key_t priv, 201 | const unsigned char *message, 202 | size_t message_len 203 | ) { 204 | keccak_sponge_t ctx; 205 | shake256_init(ctx); 206 | shake256_update(ctx, message, message_len); 207 | decaf_448_sign_shake(sig, priv, ctx); 208 | shake256_destroy(ctx); 209 | } 210 | /* Verify sig over message: absorb the message into a fresh SHAKE256 sponge and return the result of decaf_448_verify_shake. */ 211 | decaf_bool_t 212 | decaf_448_verify ( 213 | const decaf_448_signature_t sig, 214 | const decaf_448_public_key_t pub, 215 | const unsigned char *message, 216 | size_t message_len 217 | ) { 218 | keccak_sponge_t ctx; 219 | shake256_init(ctx); 220 | shake256_update(ctx, message, message_len); 221 | decaf_bool_t ret = decaf_448_verify_shake(sig, pub, ctx); 222 | shake256_destroy(ctx); 223 | return ret; 224 | } 225 | -------------------------------------------------------------------------------- /src/p448/arch_x86_64/x86-64-arith.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2014 Cryptography Research, Inc. 2 | * Released under the MIT License. See LICENSE.txt for license information. 3 | */ 4 | 5 | #ifndef __X86_64_ARITH_H__ 6 | #define __X86_64_ARITH_H__ 7 | 8 | #include <stdint.h> 9 | 10 | /* TODO: non x86-64 versions of these. 
11 | * FUTURE: autogenerate 12 | */ 13 | /* 64x64 -> 128-bit product of *a and *b. In the non-BMI2 path rax is loaded before the memory operand b is read, so the rax output is marked early-clobber to keep b's address out of rax. */ 14 | static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) { 15 | #ifndef __BMI2__ 16 | uint64_t c,d; 17 | __asm__ volatile 18 | ("movq %[a], %%rax;" 19 | "mulq %[b];" 20 | : [c]"=&a"(c), [d]"=d"(d) 21 | : [b]"m"(*b), [a]"m"(*a) 22 | : "cc"); 23 | return (((__uint128_t)(d))<<64) | c; 24 | #else 25 | uint64_t c,d; 26 | __asm__ volatile 27 | ("movq %[a], %%rdx;" 28 | "mulx %[b], %[c], %[d];" 29 | : [c]"=r"(c), [d]"=r"(d) 30 | : [b]"m"(*b), [a]"m"(*a) 31 | : "rdx"); 32 | return (((__uint128_t)(d))<<64) | c; 33 | #endif 34 | } 35 | /* Same product with a passed by value; the rax output is early-clobber for the same reason as in widemul. */ 36 | static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) { 37 | #ifndef __BMI2__ 38 | uint64_t c,d; 39 | __asm__ volatile 40 | ("movq %[a], %%rax;" 41 | "mulq %[b];" 42 | : [c]"=&a"(c), [d]"=d"(d) 43 | : [b]"m"(*b), [a]"r"(a) 44 | : "cc"); 45 | return (((__uint128_t)(d))<<64) | c; 46 | #else 47 | uint64_t c,d; 48 | __asm__ volatile 49 | ("mulx %[b], %[c], %[d];" 50 | : [c]"=r"(c), [d]"=r"(d) 51 | : [b]"m"(*b), [a]"d"(a)); 52 | return (((__uint128_t)(d))<<64) | c; 53 | #endif 54 | } 55 | /* Product of (2 * *a) and *b; the doubling is done in a 64-bit register, so it wraps. The rax output is early-clobber as in widemul. */ 56 | static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) { 57 | #ifndef __BMI2__ 58 | uint64_t c,d; 59 | __asm__ volatile 60 | ("movq %[a], %%rax; " 61 | "addq %%rax, %%rax; " 62 | "mulq %[b];" 63 | : [c]"=&a"(c), [d]"=d"(d) 64 | : [b]"m"(*b), [a]"m"(*a) 65 | : "cc"); 66 | return (((__uint128_t)(d))<<64) | c; 67 | #else 68 | uint64_t c,d; 69 | __asm__ volatile 70 | ("movq %[a], %%rdx;" 71 | "leaq (,%%rdx,2), %%rdx;" 72 | "mulx %[b], %[c], %[d];" 73 | : [c]"=r"(c), [d]"=r"(d) 74 | : [b]"m"(*b), [a]"m"(*a) 75 | : "rdx"); 76 | return (((__uint128_t)(d))<<64) | c; 77 | #endif 78 | } 79 | /* *acc += (*a) * (*b), accumulated over the two 64-bit halves of *acc. */ 80 | static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { 81 | uint64_t lo = *acc, hi = *acc>>64; 82 | 83 | #ifdef __BMI2__ 84 | uint64_t c,d; 85 | __asm__ volatile 86 | ("movq %[a], %%rdx; " 87 | "mulx %[b], %[c], %[d]; " 88 | "addq %[c], 
%[lo]; " 89 | "adcq %[d], %[hi]; " 90 | : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) 91 | : [b]"m"(*b), [a]"m"(*a) 92 | : "rdx", "cc"); 93 | #else 94 | __asm__ volatile 95 | ("movq %[a], %%rax; " 96 | "mulq %[b]; " 97 | "addq %%rax, %[lo]; " 98 | "adcq %%rdx, %[hi]; " 99 | : [lo]"+r"(lo), [hi]"+r"(hi) 100 | : [b]"m"(*b), [a]"m"(*a) 101 | : "rax", "rdx", "cc"); 102 | #endif 103 | 104 | *acc = (((__uint128_t)(hi))<<64) | lo; 105 | } 106 | /* *acc += p and *acc2 += p, where p = (*a) * (*b) is computed once. */ 107 | static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) { 108 | uint64_t lo = *acc, hi = *acc>>64; 109 | uint64_t lo2 = *acc2, hi2 = *acc2>>64; 110 | 111 | #ifdef __BMI2__ 112 | uint64_t c,d; 113 | __asm__ volatile 114 | ("movq %[a], %%rdx; " 115 | "mulx %[b], %[c], %[d]; " 116 | "addq %[c], %[lo]; " 117 | "adcq %[d], %[hi]; " 118 | "addq %[c], %[lo2]; " 119 | "adcq %[d], %[hi2]; " 120 | : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) 121 | : [b]"m"(*b), [a]"m"(*a) 122 | : "rdx", "cc"); 123 | #else 124 | __asm__ volatile 125 | ("movq %[a], %%rax; " 126 | "mulq %[b]; " 127 | "addq %%rax, %[lo]; " 128 | "adcq %%rdx, %[hi]; " 129 | "addq %%rax, %[lo2]; " 130 | "adcq %%rdx, %[hi2]; " 131 | : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) 132 | : [b]"m"(*b), [a]"m"(*a) 133 | : "rax", "rdx", "cc"); 134 | #endif 135 | 136 | *acc = (((__uint128_t)(hi))<<64) | lo; 137 | *acc2 = (((__uint128_t)(hi2))<<64) | lo2; 138 | } 139 | /* *acc += a * (*b), with a passed by value (pinned to rdx in the BMI2 path). */ 140 | static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) { 141 | uint64_t lo = *acc, hi = *acc>>64; 142 | 143 | #ifdef __BMI2__ 144 | uint64_t c,d; 145 | __asm__ volatile 146 | ("mulx %[b], %[c], %[d]; " 147 | "addq %[c], %[lo]; " 148 | "adcq %[d], %[hi]; " 149 | : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) 150 | : [b]"m"(*b), [a]"d"(a) 151 | : "cc"); 152 | #else 153 | __asm__ volatile 154 | ("movq %[a], %%rax; " 155 | "mulq %[b]; " 156 | "addq %%rax, %[lo]; " 157 | "adcq 
%%rdx, %[hi]; " 158 | : [lo]"+r"(lo), [hi]"+r"(hi) 159 | : [b]"m"(*b), [a]"r"(a) 160 | : "rax", "rdx", "cc"); 161 | #endif 162 | 163 | *acc = (((__uint128_t)(hi))<<64) | lo; 164 | } 165 | /* *acc += (2 * *a) * (*b); the doubling is done in a 64-bit register, so it wraps. */ 166 | static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { 167 | uint64_t lo = *acc, hi = *acc>>64; 168 | 169 | #ifdef __BMI2__ 170 | uint64_t c,d; 171 | __asm__ volatile 172 | ("movq %[a], %%rdx; " 173 | "addq %%rdx, %%rdx; " 174 | "mulx %[b], %[c], %[d]; " 175 | "addq %[c], %[lo]; " 176 | "adcq %[d], %[hi]; " 177 | : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) 178 | : [b]"m"(*b), [a]"m"(*a) 179 | : "rdx", "cc"); 180 | #else 181 | __asm__ volatile 182 | ("movq %[a], %%rax; " 183 | "addq %%rax, %%rax; " 184 | "mulq %[b]; " 185 | "addq %%rax, %[lo]; " 186 | "adcq %%rdx, %[hi]; " 187 | : [lo]"+r"(lo), [hi]"+r"(hi) 188 | : [b]"m"(*b), [a]"m"(*a) 189 | : "rax", "rdx", "cc"); 190 | #endif 191 | 192 | *acc = (((__uint128_t)(hi))<<64) | lo; 193 | } 194 | /* *acc -= (*a) * (*b). */ 195 | static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { 196 | uint64_t lo = *acc, hi = *acc>>64; 197 | #ifdef __BMI2__ 198 | uint64_t c,d; 199 | __asm__ volatile 200 | ("movq %[a], %%rdx; " 201 | "mulx %[b], %[c], %[d]; " 202 | "subq %[c], %[lo]; " 203 | "sbbq %[d], %[hi]; " 204 | : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) 205 | : [b]"m"(*b), [a]"m"(*a) 206 | : "rdx", "cc"); 207 | #else 208 | __asm__ volatile 209 | ("movq %[a], %%rax; " 210 | "mulq %[b]; " 211 | "subq %%rax, %[lo]; " 212 | "sbbq %%rdx, %[hi]; " 213 | : [lo]"+r"(lo), [hi]"+r"(hi) 214 | : [b]"m"(*b), [a]"m"(*a) 215 | : "rax", "rdx", "cc"); 216 | #endif 217 | *acc = (((__uint128_t)(hi))<<64) | lo; 218 | } 219 | /* *acc -= (2 * *a) * (*b); the doubling is done in a 64-bit register, so it wraps. */ 220 | static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { 221 | uint64_t lo = *acc, hi = *acc>>64; 222 | #ifdef __BMI2__ 223 | uint64_t c,d; 224 | __asm__ volatile 225 | ("movq %[a], %%rdx; " 226 | "addq %%rdx, %%rdx; " 227 | "mulx %[b], %[c], %[d]; 
" 228 | "subq %[c], %[lo]; " 229 | "sbbq %[d], %[hi]; " 230 | : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) 231 | : [b]"m"(*b), [a]"m"(*a) 232 | : "rdx", "cc"); 233 | #else 234 | __asm__ volatile 235 | ("movq %[a], %%rax; " 236 | "addq %%rax, %%rax; " 237 | "mulq %[b]; " 238 | "subq %%rax, %[lo]; " 239 | "sbbq %%rdx, %[hi]; " 240 | : [lo]"+r"(lo), [hi]"+r"(hi) 241 | : [b]"m"(*b), [a]"m"(*a) 242 | : "rax", "rdx", "cc"); 243 | #endif 244 | *acc = (((__uint128_t)(hi))<<64) | lo; 245 | 246 | } 247 | /* *acc = (*a) * (*b) - *acc. The mulx sequence is used only when BMI2 is available; otherwise the same value is computed in plain C. */ 248 | static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { #ifdef __BMI2__ 249 | uint64_t c,d, lo = *acc, hi = *acc>>64; 250 | __asm__ volatile 251 | ("movq %[a], %%rdx; " 252 | "mulx %[b], %[c], %[d]; " 253 | "subq %[lo], %[c]; " 254 | "sbbq %[hi], %[d]; " 255 | : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) 256 | : [b]"m"(*b), [a]"m"(*a) 257 | : "rdx", "cc"); 258 | *acc = (((__uint128_t)(d))<<64) | c; #else *acc = ((__uint128_t)(*a)) * (*b) - *acc; #endif 259 | } 260 | /* Unsigned 64x64 -> 128-bit product in plain C. */ 261 | static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) { 262 | return ((__uint128_t)(a)) * b; 263 | } 264 | /* Signed 64x64 -> 128-bit product in plain C. */ 265 | static __inline__ __int128_t widemuls(int64_t a, int64_t b) { 266 | return ((__int128_t)(a)) * b; 267 | } 268 | /* Returns x unchanged; the empty asm with an in/out operand hides the value from the optimizer. */ 269 | static __inline__ uint64_t opacify(uint64_t x) { 270 | __asm__ volatile("" : "+r"(x)); 271 | return x; 272 | } 273 | /* All-ones when x == 0, zero otherwise: neg sets the carry flag iff x != 0, sbb spreads the carry over the register, and the result is inverted. */ 274 | static __inline__ mask_t is_zero(uint64_t x) { 275 | __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); 276 | return ~x; 277 | } 278 | 279 | #endif /* __X86_64_ARITH_H__ */ 280 | -------------------------------------------------------------------------------- /src/p480/arch_x86_64/x86-64-arith.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2014 Cryptography Research, Inc. 2 | * Released under the MIT License. See LICENSE.txt for license information. 3 | */ 4 | 5 | #ifndef __X86_64_ARITH_H__ 6 | #define __X86_64_ARITH_H__ 7 | 8 | #include <stdint.h> 9 | 10 | /* TODO: non x86-64 versions of these. 
11 | * FUTURE: autogenerate 12 | */ 13 | /* 64x64 -> 128-bit product of *a and *b. In the non-BMI2 path rax is loaded before the memory operand b is read, so the rax output is marked early-clobber to keep b's address out of rax. */ 14 | static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) { 15 | #ifndef __BMI2__ 16 | uint64_t c,d; 17 | __asm__ volatile 18 | ("movq %[a], %%rax;" 19 | "mulq %[b];" 20 | : [c]"=&a"(c), [d]"=d"(d) 21 | : [b]"m"(*b), [a]"m"(*a) 22 | : "cc"); 23 | return (((__uint128_t)(d))<<64) | c; 24 | #else 25 | uint64_t c,d; 26 | __asm__ volatile 27 | ("movq %[a], %%rdx;" 28 | "mulx %[b], %[c], %[d];" 29 | : [c]"=r"(c), [d]"=r"(d) 30 | : [b]"m"(*b), [a]"m"(*a) 31 | : "rdx"); 32 | return (((__uint128_t)(d))<<64) | c; 33 | #endif 34 | } 35 | /* Same product with a passed by value; the rax output is early-clobber for the same reason as in widemul. */ 36 | static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) { 37 | #ifndef __BMI2__ 38 | uint64_t c,d; 39 | __asm__ volatile 40 | ("movq %[a], %%rax;" 41 | "mulq %[b];" 42 | : [c]"=&a"(c), [d]"=d"(d) 43 | : [b]"m"(*b), [a]"r"(a) 44 | : "cc"); 45 | return (((__uint128_t)(d))<<64) | c; 46 | #else 47 | uint64_t c,d; 48 | __asm__ volatile 49 | ("mulx %[b], %[c], %[d];" 50 | : [c]"=r"(c), [d]"=r"(d) 51 | : [b]"m"(*b), [a]"d"(a)); 52 | return (((__uint128_t)(d))<<64) | c; 53 | #endif 54 | } 55 | /* Product of (2 * *a) and *b; the doubling is done in a 64-bit register, so it wraps. The rax output is early-clobber as in widemul. */ 56 | static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) { 57 | #ifndef __BMI2__ 58 | uint64_t c,d; 59 | __asm__ volatile 60 | ("movq %[a], %%rax; " 61 | "addq %%rax, %%rax; " 62 | "mulq %[b];" 63 | : [c]"=&a"(c), [d]"=d"(d) 64 | : [b]"m"(*b), [a]"m"(*a) 65 | : "cc"); 66 | return (((__uint128_t)(d))<<64) | c; 67 | #else 68 | uint64_t c,d; 69 | __asm__ volatile 70 | ("movq %[a], %%rdx;" 71 | "leaq (,%%rdx,2), %%rdx;" 72 | "mulx %[b], %[c], %[d];" 73 | : [c]"=r"(c), [d]"=r"(d) 74 | : [b]"m"(*b), [a]"m"(*a) 75 | : "rdx"); 76 | return (((__uint128_t)(d))<<64) | c; 77 | #endif 78 | } 79 | /* *acc += (*a) * (*b), accumulated over the two 64-bit halves of *acc. */ 80 | static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { 81 | uint64_t lo = *acc, hi = *acc>>64; 82 | 83 | #ifdef __BMI2__ 84 | uint64_t c,d; 85 | __asm__ volatile 86 | ("movq %[a], %%rdx; " 87 | "mulx %[b], %[c], %[d]; " 88 | "addq %[c], 
%[lo]; " 89 | "adcq %[d], %[hi]; " 90 | : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) 91 | : [b]"m"(*b), [a]"m"(*a) 92 | : "rdx", "cc"); 93 | #else 94 | __asm__ volatile 95 | ("movq %[a], %%rax; " 96 | "mulq %[b]; " 97 | "addq %%rax, %[lo]; " 98 | "adcq %%rdx, %[hi]; " 99 | : [lo]"+r"(lo), [hi]"+r"(hi) 100 | : [b]"m"(*b), [a]"m"(*a) 101 | : "rax", "rdx", "cc"); 102 | #endif 103 | 104 | *acc = (((__uint128_t)(hi))<<64) | lo; 105 | } 106 | /* *acc += p and *acc2 += p, where p = (*a) * (*b) is computed once. */ 107 | static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) { 108 | uint64_t lo = *acc, hi = *acc>>64; 109 | uint64_t lo2 = *acc2, hi2 = *acc2>>64; 110 | 111 | #ifdef __BMI2__ 112 | uint64_t c,d; 113 | __asm__ volatile 114 | ("movq %[a], %%rdx; " 115 | "mulx %[b], %[c], %[d]; " 116 | "addq %[c], %[lo]; " 117 | "adcq %[d], %[hi]; " 118 | "addq %[c], %[lo2]; " 119 | "adcq %[d], %[hi2]; " 120 | : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) 121 | : [b]"m"(*b), [a]"m"(*a) 122 | : "rdx", "cc"); 123 | #else 124 | __asm__ volatile 125 | ("movq %[a], %%rax; " 126 | "mulq %[b]; " 127 | "addq %%rax, %[lo]; " 128 | "adcq %%rdx, %[hi]; " 129 | "addq %%rax, %[lo2]; " 130 | "adcq %%rdx, %[hi2]; " 131 | : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) 132 | : [b]"m"(*b), [a]"m"(*a) 133 | : "rax", "rdx", "cc"); 134 | #endif 135 | 136 | *acc = (((__uint128_t)(hi))<<64) | lo; 137 | *acc2 = (((__uint128_t)(hi2))<<64) | lo2; 138 | } 139 | /* *acc += a * (*b), with a passed by value (pinned to rdx in the BMI2 path). */ 140 | static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) { 141 | uint64_t lo = *acc, hi = *acc>>64; 142 | 143 | #ifdef __BMI2__ 144 | uint64_t c,d; 145 | __asm__ volatile 146 | ("mulx %[b], %[c], %[d]; " 147 | "addq %[c], %[lo]; " 148 | "adcq %[d], %[hi]; " 149 | : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) 150 | : [b]"m"(*b), [a]"d"(a) 151 | : "cc"); 152 | #else 153 | __asm__ volatile 154 | ("movq %[a], %%rax; " 155 | "mulq %[b]; " 156 | "addq %%rax, %[lo]; " 157 | "adcq 
%%rdx, %[hi]; " 158 | : [lo]"+r"(lo), [hi]"+r"(hi) 159 | : [b]"m"(*b), [a]"r"(a) 160 | : "rax", "rdx", "cc"); 161 | #endif 162 | 163 | *acc = (((__uint128_t)(hi))<<64) | lo; 164 | } 165 | /* *acc += (2 * *a) * (*b); the doubling is done in a 64-bit register, so it wraps. */ 166 | static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { 167 | uint64_t lo = *acc, hi = *acc>>64; 168 | 169 | #ifdef __BMI2__ 170 | uint64_t c,d; 171 | __asm__ volatile 172 | ("movq %[a], %%rdx; " 173 | "addq %%rdx, %%rdx; " 174 | "mulx %[b], %[c], %[d]; " 175 | "addq %[c], %[lo]; " 176 | "adcq %[d], %[hi]; " 177 | : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) 178 | : [b]"m"(*b), [a]"m"(*a) 179 | : "rdx", "cc"); 180 | #else 181 | __asm__ volatile 182 | ("movq %[a], %%rax; " 183 | "addq %%rax, %%rax; " 184 | "mulq %[b]; " 185 | "addq %%rax, %[lo]; " 186 | "adcq %%rdx, %[hi]; " 187 | : [lo]"+r"(lo), [hi]"+r"(hi) 188 | : [b]"m"(*b), [a]"m"(*a) 189 | : "rax", "rdx", "cc"); 190 | #endif 191 | 192 | *acc = (((__uint128_t)(hi))<<64) | lo; 193 | } 194 | /* *acc -= (*a) * (*b). */ 195 | static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { 196 | uint64_t lo = *acc, hi = *acc>>64; 197 | #ifdef __BMI2__ 198 | uint64_t c,d; 199 | __asm__ volatile 200 | ("movq %[a], %%rdx; " 201 | "mulx %[b], %[c], %[d]; " 202 | "subq %[c], %[lo]; " 203 | "sbbq %[d], %[hi]; " 204 | : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) 205 | : [b]"m"(*b), [a]"m"(*a) 206 | : "rdx", "cc"); 207 | #else 208 | __asm__ volatile 209 | ("movq %[a], %%rax; " 210 | "mulq %[b]; " 211 | "subq %%rax, %[lo]; " 212 | "sbbq %%rdx, %[hi]; " 213 | : [lo]"+r"(lo), [hi]"+r"(hi) 214 | : [b]"m"(*b), [a]"m"(*a) 215 | : "rax", "rdx", "cc"); 216 | #endif 217 | *acc = (((__uint128_t)(hi))<<64) | lo; 218 | } 219 | /* *acc -= (2 * *a) * (*b); the doubling is done in a 64-bit register, so it wraps. */ 220 | static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { 221 | uint64_t lo = *acc, hi = *acc>>64; 222 | #ifdef __BMI2__ 223 | uint64_t c,d; 224 | __asm__ volatile 225 | ("movq %[a], %%rdx; " 226 | "addq %%rdx, %%rdx; " 227 | "mulx %[b], %[c], %[d]; 
" 228 | "subq %[c], %[lo]; " 229 | "sbbq %[d], %[hi]; " 230 | : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) 231 | : [b]"m"(*b), [a]"m"(*a) 232 | : "rdx", "cc"); 233 | #else 234 | __asm__ volatile 235 | ("movq %[a], %%rax; " 236 | "addq %%rax, %%rax; " 237 | "mulq %[b]; " 238 | "subq %%rax, %[lo]; " 239 | "sbbq %%rdx, %[hi]; " 240 | : [lo]"+r"(lo), [hi]"+r"(hi) 241 | : [b]"m"(*b), [a]"m"(*a) 242 | : "rax", "rdx", "cc"); 243 | #endif 244 | *acc = (((__uint128_t)(hi))<<64) | lo; 245 | 246 | } 247 | /* *acc = (*a) * (*b) - *acc. The mulx sequence is used only when BMI2 is available; otherwise the same value is computed in plain C. */ 248 | static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { #ifdef __BMI2__ 249 | uint64_t c,d, lo = *acc, hi = *acc>>64; 250 | __asm__ volatile 251 | ("movq %[a], %%rdx; " 252 | "mulx %[b], %[c], %[d]; " 253 | "subq %[lo], %[c]; " 254 | "sbbq %[hi], %[d]; " 255 | : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) 256 | : [b]"m"(*b), [a]"m"(*a) 257 | : "rdx", "cc"); 258 | *acc = (((__uint128_t)(d))<<64) | c; #else *acc = ((__uint128_t)(*a)) * (*b) - *acc; #endif 259 | } 260 | /* Unsigned 64x64 -> 128-bit product in plain C. */ 261 | static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) { 262 | return ((__uint128_t)(a)) * b; 263 | } 264 | /* Signed 64x64 -> 128-bit product in plain C. */ 265 | static __inline__ __int128_t widemuls(int64_t a, int64_t b) { 266 | return ((__int128_t)(a)) * b; 267 | } 268 | /* Returns x unchanged; the empty asm with an in/out operand hides the value from the optimizer. */ 269 | static __inline__ uint64_t opacify(uint64_t x) { 270 | __asm__ volatile("" : "+r"(x)); 271 | return x; 272 | } 273 | /* All-ones when x == 0, zero otherwise: neg sets the carry flag iff x != 0, sbb spreads the carry over the register, and the result is inverted. */ 274 | static __inline__ mask_t is_zero(uint64_t x) { 275 | __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); 276 | return ~x; 277 | } 278 | 279 | #endif /* __X86_64_ARITH_H__ */ 280 | -------------------------------------------------------------------------------- /src/include/word.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2014 Cryptography Research, Inc. 2 | * Released under the MIT License. See LICENSE.txt for license information. 
3 | */ 4 | 5 | #ifndef __WORD_H__ 6 | #define __WORD_H__ 7 | 8 | /* for posix_memalign */ 9 | #define _XOPEN_SOURCE 600 10 | 11 | #include "arch_config.h" 12 | 13 | 14 | #ifndef __APPLE__ 15 | #ifndef _BSD_SOURCE 16 | #define _BSD_SOURCE 1 17 | #endif 18 | #include 19 | #endif 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #if defined(__ARM_NEON__) 27 | #include 28 | #elif defined(__SSE2__) 29 | #include 30 | #endif 31 | 32 | #if (WORD_BITS == 64) 33 | typedef uint32_t hword_t; 34 | typedef uint64_t word_t; 35 | typedef __uint128_t dword_t; 36 | typedef int32_t hsword_t; 37 | typedef int64_t sword_t; 38 | typedef __int128_t dsword_t; 39 | #define PRIxWORD PRIx64 40 | #define PRIxWORDfull "%016" PRIx64 41 | #define PRIxWORD56 "%014" PRIx64 42 | #define PRIxWORD60 "%015" PRIx60 43 | #define U64LE(x) x##ull 44 | #define U58LE(x) x##ull 45 | #define U56LE(x) x##ull 46 | #define U60LE(x) x##ull 47 | #define letohWORD letoh64 48 | #define GOLDI_BITS 64 49 | #else 50 | typedef uint16_t hword_t; 51 | typedef uint32_t word_t; 52 | typedef uint64_t dword_t; 53 | typedef int16_t hsword_t; 54 | typedef int32_t sword_t; 55 | typedef int64_t dsword_t; 56 | #define PRIxWORD PRIx32 57 | #define PRIxWORDfull "%08" PRIx32 58 | #define PRIxWORD56 "%07" PRIx32 59 | #define U64LE(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32 60 | #define U58LE(x) (x##ull)&((1ull<<29)-1), (x##ull)>>29 61 | #define U56LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28 62 | #define U60LE(x) (x##ull)&((1ull<<30)-1), (x##ull)>>30 63 | #define letohWORD letoh32 64 | #define GOLDI_BITS 32 65 | #endif 66 | 67 | #define DIV_CEIL(_x,_y) (((_x) + (_y) - 1)/(_y)) 68 | #define ROUND_UP(_x,_y) (DIV_CEIL((_x),(_y))*(_y)) 69 | #define WORDS_FOR_BITS(_x) (DIV_CEIL((_x),WORD_BITS)) 70 | 71 | typedef word_t mask_t; 72 | static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -(mask_t)1; 73 | 74 | 75 | 76 | #ifdef __ARM_NEON__ 77 | typedef uint32x4_t vecmask_t; 78 | #elif __clang__ 79 | typedef uint64_t 
uint64x2_t __attribute__((ext_vector_type(2))); 80 | typedef int64_t int64x2_t __attribute__((ext_vector_type(2))); 81 | typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4))); 82 | typedef int64_t int64x4_t __attribute__((ext_vector_type(4))); 83 | typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4))); 84 | typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); 85 | typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2))); 86 | typedef int32_t int32x2_t __attribute__((ext_vector_type(2))); 87 | typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8))); 88 | typedef int32_t int32x8_t __attribute__((ext_vector_type(8))); 89 | typedef word_t vecmask_t __attribute__((ext_vector_type(4))); 90 | #else /* GCC-cleanliness */ 91 | typedef uint64_t uint64x2_t __attribute__((vector_size(16))); 92 | typedef int64_t int64x2_t __attribute__((vector_size(16))); 93 | typedef uint64_t uint64x4_t __attribute__((vector_size(32))); 94 | typedef int64_t int64x4_t __attribute__((vector_size(32))); 95 | typedef uint32_t uint32x4_t __attribute__((vector_size(16))); 96 | typedef int32_t int32x4_t __attribute__((vector_size(16))); 97 | typedef uint32_t uint32x2_t __attribute__((vector_size(8))); 98 | typedef int32_t int32x2_t __attribute__((vector_size(8))); 99 | typedef uint32_t uint32x8_t __attribute__((vector_size(32))); 100 | typedef int32_t int32x8_t __attribute__((vector_size(32))); 101 | typedef word_t vecmask_t __attribute__((vector_size(32))); 102 | #endif 103 | 104 | #if __AVX2__ 105 | #define VECTOR_ALIGNED __attribute__((aligned(32))) 106 | typedef uint32x8_t big_register_t; 107 | typedef uint64x4_t uint64xn_t; 108 | typedef uint32x8_t uint32xn_t; 109 | 110 | static __inline__ big_register_t 111 | br_set_to_mask(mask_t x) { 112 | uint32_t y = (uint32_t)x; 113 | big_register_t ret = {y,y,y,y,y,y,y,y}; 114 | return ret; 115 | } 116 | #elif __SSE2__ 117 | #define VECTOR_ALIGNED __attribute__((aligned(16))) 118 | typedef uint32x4_t 
big_register_t; 119 | typedef uint64x2_t uint64xn_t; 120 | typedef uint32x4_t uint32xn_t; 121 | 122 | static __inline__ big_register_t 123 | br_set_to_mask(mask_t x) { 124 | uint32_t y = x; 125 | big_register_t ret = {y,y,y,y}; 126 | return ret; 127 | } 128 | #elif __ARM_NEON__ 129 | #define VECTOR_ALIGNED __attribute__((aligned(16))) 130 | typedef uint32x4_t big_register_t; 131 | typedef uint64x2_t uint64xn_t; 132 | typedef uint32x4_t uint32xn_t; 133 | static __inline__ big_register_t 134 | br_set_to_mask(mask_t x) { 135 | return vdupq_n_u32(x); 136 | } 137 | #elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__ 138 | #define VECTOR_ALIGNED __attribute__((aligned(8))) 139 | typedef uint64_t big_register_t, uint64xn_t; 140 | 141 | typedef uint32_t uint32xn_t; 142 | static __inline__ big_register_t 143 | br_set_to_mask(mask_t x) { 144 | return (big_register_t)x; 145 | } 146 | #else 147 | #define VECTOR_ALIGNED __attribute__((aligned(4))) 148 | typedef uint64_t uint64xn_t; 149 | typedef uint32_t uint32xn_t; 150 | typedef uint32_t big_register_t; 151 | 152 | static __inline__ big_register_t 153 | br_set_to_mask(mask_t x) { 154 | return (big_register_t)x; 155 | } 156 | #endif 157 | 158 | typedef struct { 159 | uint64xn_t unaligned; 160 | } __attribute__((packed)) unaligned_uint64xn_t; 161 | 162 | typedef struct { 163 | uint32xn_t unaligned; 164 | } __attribute__((packed)) unaligned_uint32xn_t; 165 | 166 | /** 167 | * Return -1 if x==0, and 0 otherwise. 
168 | */ 169 | static __inline__ mask_t 170 | __attribute__((always_inline,unused)) 171 | word_is_zero(word_t x) { 172 | return (mask_t)((((dword_t)(x)) - 1)>>WORD_BITS); 173 | } 174 | 175 | #if __AVX2__ 176 | static __inline__ big_register_t 177 | br_is_zero(big_register_t x) { 178 | return (big_register_t)(x == br_set_to_mask(0)); 179 | } 180 | #elif __SSE2__ 181 | static __inline__ big_register_t 182 | br_is_zero(big_register_t x) { 183 | return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128()); 184 | //return (big_register_t)(x == br_set_to_mask(0)); 185 | } 186 | #elif __ARM_NEON__ 187 | static __inline__ big_register_t 188 | br_is_zero(big_register_t x) { 189 | return vceqq_u32(x,x^x); 190 | } 191 | #else 192 | static __inline__ mask_t 193 | br_is_zero(word_t x) { 194 | return (((dword_t)x) - 1)>>WORD_BITS; 195 | } 196 | #endif 197 | 198 | 199 | 200 | 201 | #ifdef __APPLE__ 202 | static inline uint64_t 203 | htobe64 (uint64_t x) { 204 | __asm__ ("bswapq %0" : "+r"(x)); 205 | return x; 206 | } 207 | static inline uint64_t 208 | htole64 (uint64_t x) { return x; } 209 | 210 | static inline uint64_t 211 | letoh64 (uint64_t x) { return x; } 212 | #endif 213 | 214 | /** 215 | * Really call memset, in a way that prevents the compiler from optimizing it out. 216 | * @param p The object to zeroize. 217 | * @param c The char to set it to (probably zero). 218 | * @param s The size of the object. 
219 | */ 220 | #if defined(__DARWIN_C_LEVEL) || defined(__STDC_LIB_EXT1__) 221 | #define HAS_MEMSET_S 222 | #endif 223 | 224 | #if !defined(__STDC_WANT_LIB_EXT1__) || __STDC_WANT_LIB_EXT1__ != 1 225 | #define NEED_MEMSET_S_EXTERN 226 | #endif 227 | 228 | #ifdef HAS_MEMSET_S 229 | #ifdef NEED_MEMSET_S_EXTERN 230 | extern int memset_s(void *, size_t, int, size_t); 231 | #endif 232 | static __inline__ void 233 | really_memset(void *p, char c, size_t s) { 234 | memset_s(p, s, c, s); 235 | } 236 | #else 237 | static __inline__ void __attribute__((always_inline,unused)) 238 | really_memset(void *p, char c, size_t s) { 239 | volatile char *pv = (volatile char *)p; 240 | size_t i; 241 | for (i=0; i 1: 58 | return str(self) 59 | for _,v in self.idealMap.iteritems(): 60 | return str(v) 61 | 62 | def prune(self): 63 | self.idealMap = {I:v for I,v in self.idealMap.iteritems() if not (I*self.R).is_one()} 64 | return self 65 | 66 | def __add__(self,other): 67 | def f(x,y): return x+y 68 | return self.op(other,f) 69 | 70 | def __radd__(self,other): 71 | def f(x,y): return y+x 72 | return self.op(other,f) 73 | 74 | def __rsub__(self,other): 75 | def f(x,y): return y-x 76 | return self.op(other,f) 77 | 78 | def __neg__(self): 79 | def f(x,y): return y-x 80 | return self.op(0,f) 81 | 82 | def __sub__(self,other): 83 | def f(x,y): return x-y 84 | return self.op(other,f) 85 | 86 | def is_square(self): 87 | for _,v in self.idealMap.iteritems(): 88 | if not is_square(v): return False 89 | return True 90 | 91 | def sqrt(self): 92 | if self._sqrt is None: 93 | s = Idealized.uvar("s") 94 | self._sqrt = s.assuming(s^2 - self) 95 | return self._sqrt 96 | 97 | def isqrt(self): 98 | if self._isqrt is None: 99 | s = Idealized.uvar("s") 100 | z = Idealized(0).assuming(Self) 101 | self._isqrt = s.assuming(s^2*self-1).union(z) 102 | return self._isqrt 103 | 104 | def __mul__(self,other): 105 | def f(x,y): return x*y 106 | return self.op(other,f) 107 | 108 | def __rmul__(self,other): 109 | def 
f(x,y): return y*x 110 | return self.op(other,f) 111 | 112 | def __pow__(self,n): 113 | if n < 0: return 1/self^(-n) 114 | if n == 0: return 1 115 | if n == 1: return self 116 | if is_even(n): return (self*self)^(n//2) 117 | if is_odd(n): return (self*self)^(n//2) * self 118 | 119 | def __div__(self,other): 120 | def f(x,y): return x/y 121 | return self.op(other,f) 122 | 123 | def __rdiv__(self,other): 124 | def f(x,y): return y/x 125 | return self.op(other,f) 126 | 127 | def union(self,other): 128 | return self.op(other,Idealized.UNION) 129 | 130 | def __eq__(self,other): 131 | return (self - other).is_zero() 132 | 133 | def __ne__(self,other): 134 | return not (self==other) 135 | 136 | def __hash__(self): 137 | return 0 138 | 139 | def assume_zero(self): 140 | out = {} 141 | for I,J in self.idealMap.iteritems(): 142 | IJ = I+J.numerator() 143 | if IJ.is_one(): continue 144 | out[IJ] = self.R(0) 145 | 146 | if len(out) == 0: 147 | raise Exception("Inconsistent assumption") 148 | 149 | return Idealized(self.R,out,self.varnames) 150 | 151 | def assuming(self,other): 152 | return self + other.assume_zero() 153 | 154 | def is_zero(self): 155 | for I,v in self.idealMap.iteritems(): 156 | if v.denominator() in I: return False 157 | if v.numerator() not in I: return False 158 | return True 159 | 160 | def op(self,other,f): 161 | if not isinstance(other,Idealized): 162 | other = Idealized(self.R,other,self.varnames) 163 | 164 | bad = False 165 | for v in self.varnames: 166 | if v not in other.varnames or self.varnames[v] != other.varnames[v]: 167 | bad = True 168 | break 169 | for v in other.varnames: 170 | if v not in self.varnames or self.varnames[v] != other.varnames[v]: 171 | bad = True 172 | break 173 | 174 | if bad: 175 | def incrVar(v): 176 | if v[-1] not in "0123456789": return v + "1" 177 | elif v[-1] == 9: return incrVar(v[:-1]) + "0" 178 | else: return v[:-1] + str(int(v[-1])+1) 179 | 180 | vars = {} 181 | names = set() 182 | for v,(name,_) in 
self.varnames.iteritems(): 183 | assert(name not in names) 184 | names.add(name) 185 | vars[v] = name 186 | subMe = {n:n for n in names} 187 | subThem = {} 188 | for v,(name,_) in other.varnames.iteritems(): 189 | if v in self.varnames: 190 | subThem[name] = self.varnames[v][0] 191 | else: 192 | oname = name 193 | while name in names: 194 | name = incrVar(name) 195 | names.add(name) 196 | subThem[oname] = name 197 | vars[v] = name 198 | 199 | R = PolynomialRing(QQ,sorted(list(names)),order="degrevlex") 200 | gd = R.gens_dict() 201 | subMe = {m:gd[n] for m,n in subMe.iteritems()} 202 | subThem = {m:gd[n] for m,n in subThem.iteritems()} 203 | 204 | vars = {v:(n,gd[n]) for v,n in vars.iteritems()} 205 | 206 | def subIdeal(I,sub): 207 | return [g(**sub) for g in I.gens()]*R 208 | idealMe = {subIdeal(I,subMe):v(**subMe) for I,v in self.idealMap.iteritems()} 209 | idealThem = {subIdeal(I,subThem):v(**subThem) for I,v in other.idealMap.iteritems()} 210 | else: 211 | R = self.R 212 | idealMe = self.idealMap 213 | idealThem = other.idealMap 214 | vars = self.varnames 215 | 216 | def consist(I,x,y): 217 | if (x-y).numerator() not in I: 218 | raise Exception("Inconsistent: %s != %s in ideal %s" % 219 | (str(x),str(y),str(I))) 220 | 221 | out = {} 222 | if f is Idealized.UNION: 223 | for I,v in idealMe.iteritems(): 224 | if I in idealThem: 225 | consist(I,v,idealThem[I]) 226 | out[I] = v 227 | for I,v in idealThem.iteritems(): 228 | if I in idealMe: 229 | consist(I,v,idealMe[I]) 230 | out[I] = v 231 | 232 | else: 233 | for I,v in idealMe.iteritems(): 234 | if I in idealThem: 235 | x = f(v,idealThem[I]) 236 | if I in out: 237 | consist(I,x,out[I]) 238 | else: out[I] = x 239 | else: 240 | for J,w in idealThem.iteritems(): 241 | IJ = I+J 242 | if not IJ.is_one(): 243 | x = f(v,w) 244 | if IJ in out: 245 | consist(IJ,x,out[IJ]) 246 | else: 247 | out[IJ] = x 248 | 249 | def gb(I): 250 | II = [0]*R 251 | for g in I.gens(): 252 | if g not in II: II = II+[g]*R 253 | return II 254 | 
255 | def red(I,v): 256 | if I.is_zero(): return v 257 | return I.reduce(R(v.numerator())) / I.reduce(R(v.denominator())) 258 | 259 | out = {gb(I):v for I,v in out.iteritems()} 260 | out = {I:red(I,v) for I,v in out.iteritems()} 261 | 262 | return Idealized(R,out,vars) 263 | 264 | def reduce(self): 265 | def red(I,v): 266 | if I.is_zero(): return v 267 | return I.reduce(R(v.numerator())) / I.reduce(R(v.denominator())) 268 | out = {I:red(I,v) for I,v in self.idealMap.iteritems()} 269 | return Idealized(self.R,out,self.vars) 270 | 271 | Idealized.INF = Idealized.uvar("inf") 272 | Idealized.ZOZ = Idealized.uvar("zoz") 273 | -------------------------------------------------------------------------------- /test/test_decaf.cxx: -------------------------------------------------------------------------------- 1 | /** 2 | * @file test_decaf.cxx 3 | * @author Mike Hamburg 4 | * 5 | * @copyright 6 | * Copyright (c) 2015 Cryptography Research, Inc. \n 7 | * Released under the MIT License. See LICENSE.txt for license information. 8 | * 9 | * @brief C++ tests, because that's easier. 
10 | */ 11 | 12 | #include "decaf.hxx" 13 | #include "shake.hxx" 14 | #include "decaf_crypto.h" 15 | #include 16 | 17 | 18 | static bool passing = true; 19 | static const long NTESTS = 10000; 20 | 21 | class Test { 22 | public: 23 | bool passing_now; 24 | Test(const char *test) { 25 | passing_now = true; 26 | printf("%s...", test); 27 | if (strlen(test) < 27) printf("%*s",int(27-strlen(test)),""); 28 | fflush(stdout); 29 | } 30 | ~Test() { 31 | if (std::uncaught_exception()) { 32 | fail(); 33 | printf(" due to uncaught exception.\n"); 34 | } 35 | if (passing_now) printf("[PASS]\n"); 36 | } 37 | void fail() { 38 | if (!passing_now) return; 39 | passing_now = passing = false; 40 | printf("[FAIL]\n"); 41 | } 42 | }; 43 | 44 | template struct Tests { 45 | 46 | typedef typename Group::Scalar Scalar; 47 | typedef typename Group::Point Point; 48 | typedef typename Group::Precomputed Precomputed; 49 | 50 | static void print(const char *name, const Scalar &x) { 51 | unsigned char buffer[Scalar::SER_BYTES]; 52 | x.encode(buffer); 53 | printf(" %s = 0x", name); 54 | for (int i=sizeof(buffer)-1; i>=0; i--) { 55 | printf("%02x", buffer[i]); 56 | } 57 | printf("\n"); 58 | } 59 | 60 | static void print(const char *name, const Point &x) { 61 | unsigned char buffer[Point::SER_BYTES]; 62 | x.encode(buffer); 63 | printf(" %s = 0x", name); 64 | for (int i=sizeof(buffer)-1; i>=0; i--) { 65 | printf("%02x", buffer[i]); 66 | } 67 | printf("\n"); 68 | } 69 | 70 | static bool arith_check( 71 | Test &test, 72 | const Scalar &x, 73 | const Scalar &y, 74 | const Scalar &z, 75 | const Scalar &r, 76 | const Scalar &l, 77 | const char *name 78 | ) { 79 | if (l == r) return true; 80 | test.fail(); 81 | printf(" %s", name); 82 | print("x", x); 83 | print("y", y); 84 | print("z", z); 85 | print("lhs", r); 86 | print("rhs", l); 87 | return false; 88 | } 89 | 90 | static bool point_check( 91 | Test &test, 92 | const Point &p, 93 | const Point &q, 94 | const Point &R, 95 | const Scalar &x, 96 | const 
Scalar &y, 97 | const Point &l, 98 | const Point &r, 99 | const char *name 100 | ) { 101 | bool good = l==r; 102 | if (!p.validate()) { good = false; printf(" p invalid\n"); } 103 | if (!q.validate()) { good = false; printf(" q invalid\n"); } 104 | if (!r.validate()) { good = false; printf(" r invalid\n"); } 105 | if (!l.validate()) { good = false; printf(" l invalid\n"); } 106 | if (good) return true; 107 | 108 | test.fail(); 109 | printf(" %s", name); 110 | print("x", x); 111 | print("y", y); 112 | print("p", p); 113 | print("q", q); 114 | print("r", R); 115 | print("lhs", r); 116 | print("rhs", l); 117 | return false; 118 | } 119 | 120 | static void test_arithmetic() { 121 | decaf::SpongeRng rng(decaf::Block("test_arithmetic")); 122 | 123 | Test test("Arithmetic"); 124 | Scalar x(0),y(0),z(0); 125 | arith_check(test,x,y,z,INT_MAX,(decaf_word_t)INT_MAX,"cast from max"); 126 | arith_check(test,x,y,z,INT_MIN,-Scalar(1+(decaf_word_t)INT_MAX),"cast from min"); 127 | 128 | for (int i=0; i=8) p.debugging_torque_in_place(); 163 | bool succ = p.invert_elligator(b1,i&7); 164 | Point q; 165 | unsigned char hint = q.set_to_hash(b1); 166 | 167 | if (succ != ((i&7) != 4) || (q != p) || (succ && (hint != (i&7)))) { 168 | test.fail(); 169 | printf("Elligator test: t=%d, h=%d->%d, q%sp, %s %02x%02x\n", 170 | i/8, i&7, hint, (q==p)?"==":"!=",succ ? "SUCC" : "FAIL", 171 | b1[0], b1[1]); 172 | } 173 | } 174 | 175 | for (int i=0; i Point::HASH_BYTES) 181 | memcpy(&b2[Point::HASH_BYTES], &b1[Point::HASH_BYTES], len-Point::HASH_BYTES); 182 | Point s; 183 | unsigned char hint = s.set_to_hash(b1); 184 | if (i&1) s.debugging_torque_in_place(); 185 | bool succ = s.invert_elligator(b2,hint); 186 | if (!succ || memcmp(b1,b2,len)) { 187 | test.fail(); 188 | printf(" Fail elligator inversion i=%d (claimed %s, hint=%d)\n", 189 | i, succ ? 
"success" : "failure", hint); 190 | } 191 | 192 | Point t(rng); 193 | point_check(test,t,t,t,0,0,t,Point::from_hash(t.steg_encode(rng)),"steg round-trip"); 194 | } 195 | } 196 | 197 | static void test_ec() { 198 | decaf::SpongeRng rng(decaf::Block("test_ec")); 199 | 200 | Test test("EC"); 201 | 202 | Point id = Point::identity(), base = Point::base(); 203 | point_check(test,id,id,id,0,0,Point::from_hash(""),id,"fh0"); 204 | point_check(test,id,id,id,0,0,Point::from_hash("\x01"),id,"fh1"); 205 | 206 | for (int i=0; i 241 | 242 | 243 | static void test_decaf() { 244 | Test test("Sample crypto"); 245 | decaf::SpongeRng rng(decaf::Block("test_decaf")); 246 | 247 | decaf_448_symmetric_key_t proto1,proto2; 248 | decaf_448_private_key_t s1,s2; 249 | decaf_448_public_key_t p1,p2; 250 | decaf_448_signature_t sig; 251 | unsigned char shared1[1234],shared2[1234]; 252 | const char *message = "Hello, world!"; 253 | 254 | for (int i=0; i::test_arithmetic(); 281 | Tests::test_elligator(); 282 | Tests::test_ec(); 283 | test_decaf(); 284 | 285 | if (passing) printf("Passed all tests.\n"); 286 | 287 | return passing ? 
0 : 1; 288 | } 289 | -------------------------------------------------------------------------------- /aux/curve.sage: -------------------------------------------------------------------------------- 1 | from idealized import Idealized 2 | from collections import namedtuple 3 | 4 | debugging = True 5 | def debug_print(foo): 6 | if debugging: print foo 7 | 8 | checkGroupLaws = True 9 | checkTorsion = True 10 | checkIsogenies = True 11 | 12 | def memoize(f): 13 | # list cache because my __hash__ hack doesn't seem to work 14 | cache = [] 15 | def ff(*args, **kwargs): 16 | key = (tuple(args),tuple(sorted(kwargs.iteritems()))) 17 | for key_,value in cache: 18 | if key == key_: return value 19 | out = f(*args,**kwargs) 20 | cache.append((key,out)) 21 | return out 22 | 23 | try: 24 | ff.__name__ = f.__name__ 25 | except AttributeError: pass 26 | return ff 27 | 28 | def EcBase(curvename,varnames,ad=()): 29 | if isinstance(ad,str) or isinstance(ad[0],str): 30 | ad = Idealized.vars(ad) 31 | 32 | class Inner(namedtuple(curvename,(v for v in varnames))): 33 | params = ad 34 | torsion_points = {} 35 | def __new__(cls,*xy): 36 | def apply_invariants(xy,x): 37 | for inv in cls.invariants(*(ad+xy)): 38 | x = x.assuming(inv) 39 | return x 40 | 41 | xy = tuple(xy) 42 | if len(xy) == 0: 43 | xy = Idealized.uvars(varnames) 44 | xy = [apply_invariants(xy,x) for x in xy] 45 | else: 46 | for i,inv in enumerate(cls.invariants(*(ad + xy))): 47 | if inv != 0: 48 | raise Exception("Invariant inv[%d] not satisfied for %s: got \n%s" % 49 | (i,curvename,str(inv))) 50 | 51 | return super(Inner,cls).__new__(cls,*xy) 52 | 53 | varnames = "xy" 54 | 55 | @classmethod 56 | def invariants(self,*args): return [] 57 | 58 | @classmethod 59 | @memoize 60 | def check_group(cls): 61 | if checkGroupLaws: 62 | debug_print("Checking group law for %s..." 
% cls.__name__) 63 | a,b,c,z = cls(),cls(),cls(),cls.basepoint 64 | if a+z != a: 65 | raise Exception("Base point is not identity!") 66 | if a-a != z: 67 | raise Exception("Subtraction doesn't work!") 68 | if a+b != b+a: 69 | raise Exception("Addition is not commutative!") 70 | #if a+(b+c) != (a+b)+c: 71 | # raise Exception("Addition is not associative!") 72 | 73 | for t,n in cls.torsion(): 74 | if checkTorsion: 75 | debug_print(" Checking %d-torsion..." % n) 76 | cls.check_torsion(t,n) 77 | #if n not in cls.torsion_points: 78 | # cls.torsion_points[n] = set() 79 | #cls.torsion_points[n].add(cls(*t(cls.basepoint))) 80 | 81 | @classmethod 82 | def check_torsion(cls,f,n): 83 | P = Q = cls() 84 | good = False 85 | for i in xrange(1,n+1): 86 | Q = cls(*f(Q)) 87 | if Q == P: 88 | if i==n: 89 | good = True 90 | break 91 | raise Exception("Claimed %d-torsion, but is actually %d-torsion" % (n,i)) 92 | if not good: raise Exception("Claimed %d-torsion, but isn't" % n) 93 | if n*P+n*cls(*f(P)) == cls.basepoint: 94 | raise Exception("Torsion operation inverts element") 95 | 96 | @classmethod 97 | def torsion(cls): 98 | return [] 99 | 100 | def __sub__(self,other): 101 | return self + (-other) 102 | 103 | def __mul__(self,other): 104 | if other==0: return self.basepoint 105 | if other < 0: return -(self*-other) 106 | if other==1: return self 107 | if is_even(other): return (self+self)*(other//2) 108 | return (self+self)*(other//2) + self 109 | 110 | def __rmul__(self,other): 111 | return self*other 112 | 113 | Inner.__name__ = curvename + "_base" 114 | return Inner 115 | 116 | class Isogeny(object): 117 | 118 | isograph = DiGraph(weighted=True) 119 | isomap = {} 120 | 121 | @classmethod 122 | def generate(cls, fro, to): 123 | path = cls.isograph.shortest_path(fro,to,by_weight=True) 124 | if len(path): 125 | iso = cls.isomap[(path[0], path[1])] 126 | for i in xrange(1,len(path)-1): 127 | iso = cls.isomap[(path[i],path[i+1])].compose(iso) 128 | return iso 129 | else: 130 | return 
None 131 | 132 | def __init__(self,c1,c2,deg,fw,rv,check=True,dual=None,add=True): 133 | self.c1 = c1 134 | self.c2 = c2 135 | self.fw = fw 136 | self.rv = rv 137 | self.deg = deg 138 | 139 | if add: 140 | Isogeny.isomap[(c1,c2)] = self 141 | Isogeny.isograph.add_edge(c1,c2,log(deg)/log(2) + 0.1) 142 | 143 | if dual is not None: 144 | self.dual = dual 145 | else: 146 | self.dual = Isogeny(c2,c1,deg,rv,fw,False,self,add) 147 | if not check: return 148 | 149 | 150 | if not checkIsogenies: return 151 | 152 | debug_print("Checking isogeny %s <-%d-> %s..." % (c1.__name__,deg,c2.__name__)) 153 | if c2(*fw(*c1.basepoint)) != c2.basepoint: 154 | raise Exception("Isogeny doesn't preserve basepoints") 155 | if c1(*fw(*c2.basepoint)) != c1.basepoint: 156 | raise Exception("Isogeny dual doesn't preserve basepoints") 157 | 158 | foo = c1() 159 | bar = c2() 160 | 161 | c2(*fw(*foo)) 162 | c1(*rv(*bar)) 163 | 164 | if c1(*rv(*c2(*fw(*foo)))) != deg*foo: 165 | raise Exception("Isogeny degree is wrong") 166 | if c2(*fw(*c1(*rv(*bar)))) != deg*bar: 167 | raise Exception("Isogeny degree is wrong") 168 | if -c2(*fw(*foo)) != c2(*fw(*(-foo))): 169 | raise Exception("Isogeny uses wrong negmap") 170 | if -c1(*rv(*bar)) != c1(*rv(*(-bar))): 171 | raise Exception("Isogeny uses wrong negmap") 172 | 173 | 174 | def __call__(self,ipt,**kwargs): 175 | return self.c2(*self.fw(*ipt,**kwargs)) 176 | 177 | def __repr__(self): return str(self) 178 | def __str__(self): 179 | out = "Isogeny %s%s <-%d-> %s%s..." 
%\ 180 | (self.c1.__name__,str(self.c1.params),self.deg, 181 | self.c2.__name__,self.c2.params) 182 | out += "\n fw: %s" % str(self(self.c1())) 183 | out += "\n rv: %s" % str(self.dual(self.c2())) 184 | return out 185 | 186 | def compose(self,other): 187 | def fw(*args): return self.fw(*other.fw(*args)) 188 | def rv(*args): return other.rv(*self.rv(*args)) 189 | return Isogeny(other.c1,self.c2,self.deg*other.deg,fw,rv,False,None,False) 190 | 191 | def ec_family(defs,vars): 192 | def inner1(CLS): 193 | @memoize 194 | def inner2(*args,**kwargs): 195 | if len(args)==0 and len(kwargs)==0: 196 | args = tuple(defs) 197 | chk = True 198 | else: 199 | chk = False 200 | 201 | class ret(CLS,EcBase(CLS.__name__,vars,args)): 202 | def __new__(cls,*args,**kwargs): 203 | return super(ret,cls).__new__(cls,*args,**kwargs) 204 | 205 | ret.__name__ = CLS.__name__ 206 | ret.basepoint = ret(*ret.get_basepoint()) 207 | 208 | if chk: ret.check_group() 209 | return ret 210 | 211 | inner2.__name__ = CLS.__name__ + "_family" 212 | inner2() 213 | return inner2 214 | 215 | return inner1 216 | 217 | #checkGroupLaws = checkTorsion = False 218 | 219 | @ec_family("ad","xy") 220 | class Edwards: 221 | @classmethod 222 | def invariants(cls,a,d,x,y): 223 | return [y^2 + a*x^2 - 1 - d*x^2*y^2] 224 | 225 | def __neg__(self): 226 | return self.__class__(-self.x,self.y) 227 | 228 | def __add__(self,other): 229 | (x,y) = self 230 | (X,Y) = other 231 | a,d = self.params 232 | dd = d*x*X*y*Y 233 | return self.__class__((x*Y+X*y)/(1+dd),(y*Y-a*x*X)/(1-dd)) 234 | 235 | @classmethod 236 | def get_basepoint(cls): return (0,1) 237 | 238 | @classmethod 239 | @memoize 240 | def torsion(cls): 241 | a,d = cls.params 242 | sa = a.sqrt() 243 | sd = d.sqrt() 244 | sad = (a*d).sqrt() 245 | def tor2_1((x,y)): return (-x,-y) 246 | def tor4_1((x,y)): return (y/sa,-x*sa) 247 | def tor4_2((x,y)): return (1/(sd*y),-1/(sd*x)) 248 | def tor2_2((x,y)): return (-1/(sad*x),-a/(sad*y)) 249 | 250 | return 
[(tor2_1,2),(tor2_2,2),(tor4_1,4),(tor4_2,4)] 251 | 252 | @ec_family("eA","st") 253 | class JacobiQuartic: 254 | @classmethod 255 | def invariants(cls,e,A,s,t): 256 | return [-t^2 + e*s^4 + 2*A*s^2 + 1] 257 | 258 | def __neg__(self): 259 | return self.__class__(-self.s,self.t) 260 | 261 | def __add__(self,other): 262 | (x,y) = self 263 | (X,Y) = other 264 | e,A = self.params 265 | dd = e*(x*X)^2 266 | YY = (1+dd)*(y*Y+2*A*x*X) + 2*e*x*X*(x^2+X^2) 267 | return self.__class__((x*Y+X*y)/(1-dd),YY/(1-dd)^2) 268 | 269 | @classmethod 270 | def get_basepoint(cls): return (0,1) 271 | 272 | @classmethod 273 | @memoize 274 | def torsion(cls): 275 | e,A = cls.params 276 | se = e.sqrt() 277 | def tor2_1((s,t)): return (-s,-t) 278 | def tor2_2((s,t)): return (1/(se*s),-t/(se*s^2)) 279 | return [(tor2_1,2),(tor2_2,2)] 280 | 281 | a,d = Idealized.vars("ad") 282 | def phi_iso(a,d): 283 | return Isogeny(Edwards(a,d),JacobiQuartic(a^2,a-2*d), 284 | 2, 285 | lambda x,y: (x/y, (2-y^2-a*x^2)/y^2), 286 | lambda s,t: (2*s/(1+a*s^2), (1-a*s^2)/t) 287 | ) 288 | 289 | print phi_iso(a,d) 290 | print phi_iso(-a,d-a) 291 | 292 | print Isogeny.generate(Edwards(a,d),Edwards(-a,d-a)) -------------------------------------------------------------------------------- /src/include/constant_time.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file constant_time.h 3 | * @copyright 4 | * Copyright (c) 2014 Cryptography Research, Inc. \n 5 | * Released under the MIT License. See LICENSE.txt for license information. 6 | * @author Mike Hamburg 7 | * 8 | * @brief Constant-time routines. 9 | */ 10 | 11 | #ifndef __CONSTANT_TIME_H__ 12 | #define __CONSTANT_TIME_H__ 1 13 | 14 | #include "word.h" 15 | #include 16 | 17 | /* 18 | * Constant-time operations on hopefully-compile-time-sized memory 19 | * regions. 
Needed for flexibility / demagication: not all fields 20 | * have sizes which are multiples of the vector width, necessitating 21 | * a change from the Ed448 versions. 22 | * 23 | * These routines would be much simpler to define at the byte level, 24 | * but if not vectorized they would be a significant fraction of the 25 | * runtime. Eg on NEON-less ARM, constant_time_lookup is like 15% of 26 | * signing time, vs 6% on Haswell with its fancy AVX2 vectors. 27 | * 28 | * If the compiler could do a good job of autovectorizing the code, 29 | * we could just leave it with the byte definition. But that's unlikely 30 | * on most deployed compilers, especially if you consider that pcmpeq[size] 31 | * is much faster than moving a scalar to the vector unit (which is what 32 | * a naive autovectorizer will do with constant_time_lookup on Intel). 33 | * 34 | * Instead, we're putting our trust in the loop unroller and unswitcher. 35 | * 36 | * TODO: verify correctness and performance on each platform, to make sure 37 | * that there are no regressions. 38 | */ 39 | 40 | 41 | /** 42 | * Unaligned big (vector?) register. 43 | */ 44 | typedef struct { 45 | big_register_t unaligned; 46 | } __attribute__((packed)) unaligned_br_t; 47 | 48 | /** 49 | * Unaligned word register, for architectures where that matters. 50 | */ 51 | typedef struct { 52 | word_t unaligned; 53 | } __attribute__((packed)) unaligned_word_t; 54 | 55 | /** 56 | * @brief Constant-time conditional swap. 57 | * 58 | * If doswap, then swap elem_bytes between *a and *b. 59 | * 60 | * *a and *b must not alias. Also, they must be at least as aligned 61 | * as their sizes, if the CPU cares about that sort of thing. 
62 | */ 63 | static __inline__ void 64 | __attribute__((unused,always_inline)) 65 | constant_time_cond_swap ( 66 | void *__restrict__ a_, 67 | void *__restrict__ b_, 68 | word_t elem_bytes, 69 | mask_t doswap 70 | ) { 71 | word_t k; 72 | unsigned char *a = (unsigned char *)a_; 73 | unsigned char *b = (unsigned char *)b_; 74 | 75 | big_register_t br_mask = br_set_to_mask(doswap); 76 | for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) { 77 | if (elem_bytes % sizeof(big_register_t)) { 78 | /* unaligned */ 79 | big_register_t xor = 80 | ((unaligned_br_t*)(&a[k]))->unaligned 81 | ^ ((unaligned_br_t*)(&b[k]))->unaligned; 82 | xor &= br_mask; 83 | ((unaligned_br_t*)(&a[k]))->unaligned ^= xor; 84 | ((unaligned_br_t*)(&b[k]))->unaligned ^= xor; 85 | } else { 86 | /* aligned */ 87 | big_register_t xor = 88 | *((big_register_t*)(&a[k])) 89 | ^ *((big_register_t*)(&b[k])); 90 | xor &= br_mask; 91 | *((big_register_t*)(&a[k])) ^= xor; 92 | *((big_register_t*)(&b[k])) ^= xor; 93 | } 94 | } 95 | 96 | if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) { 97 | for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) { 98 | if (elem_bytes % sizeof(word_t)) { 99 | /* unaligned */ 100 | word_t xor = 101 | ((unaligned_word_t*)(&a[k]))->unaligned 102 | ^ ((unaligned_word_t*)(&b[k]))->unaligned; 103 | xor &= doswap; 104 | ((unaligned_word_t*)(&a[k]))->unaligned ^= xor; 105 | ((unaligned_word_t*)(&b[k]))->unaligned ^= xor; 106 | } else { 107 | /* aligned */ 108 | word_t xor = 109 | *((word_t*)(&a[k])) 110 | ^ *((word_t*)(&b[k])); 111 | xor &= doswap; 112 | *((word_t*)(&a[k])) ^= xor; 113 | *((word_t*)(&b[k])) ^= xor; 114 | } 115 | } 116 | } 117 | 118 | if (elem_bytes % sizeof(word_t)) { 119 | for (; kunaligned 159 | |= br_mask & ((const unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned; 160 | } else { 161 | /* aligned */ 162 | *(big_register_t *)(out+k) |= br_mask & *(const big_register_t*)(&table[k+j*elem_bytes]); 163 | } 164 | } 165 | 166 | word_t mask 
= word_is_zero(idx^j); 167 | if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) { 168 | for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) { 169 | if (elem_bytes % sizeof(word_t)) { 170 | /* input unaligned, output aligned */ 171 | *(word_t *)(out+k) |= mask & ((const unaligned_word_t*)(&table[k+j*elem_bytes]))->unaligned; 172 | } else { 173 | /* aligned */ 174 | *(word_t *)(out+k) |= mask & *(const word_t*)(&table[k+j*elem_bytes]); 175 | } 176 | } 177 | } 178 | 179 | if (elem_bytes % sizeof(word_t)) { 180 | for (; kunaligned = br_mask & ((const unaligned_br_t*)(&b[k]))->unaligned; 209 | } else { 210 | /* aligned */ 211 | *(big_register_t *)(a+k) = br_mask & *(const big_register_t*)(&b[k]); 212 | } 213 | } 214 | 215 | if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) { 216 | for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) { 217 | if (elem_bytes % sizeof(word_t)) { 218 | /* unaligned */ 219 | ((unaligned_word_t*)(&a[k]))->unaligned = mask & ((const unaligned_word_t*)(&b[k]))->unaligned; 220 | } else { 221 | /* aligned */ 222 | *(word_t *)(a+k) = mask & *(const word_t*)(&b[k]); 223 | } 224 | } 225 | } 226 | 227 | if (elem_bytes % sizeof(word_t)) { 228 | for (; kunaligned = 261 | ( br_mask & ((const unaligned_br_t*)(&bTrue [k]))->unaligned) 262 | | (~br_mask & ((const unaligned_br_t*)(&bFalse[k]))->unaligned); 263 | } else { 264 | /* aligned */ 265 | *(big_register_t *)(a+k) = 266 | ( br_mask & *(const big_register_t*)(&bTrue [k])) 267 | | (~br_mask & *(const big_register_t*)(&bFalse[k])); 268 | } 269 | } 270 | 271 | if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) { 272 | for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) { 273 | if (elem_bytes % sizeof(word_t)) { 274 | /* unaligned */ 275 | ((unaligned_word_t*)(&a[k]))->unaligned = 276 | ( mask & ((const unaligned_word_t*)(&bTrue [k]))->unaligned) 277 | | (~mask & ((const unaligned_word_t*)(&bFalse[k]))->unaligned); 278 | } else { 279 | /* aligned */ 280 | *(word_t 
*)(a+k) = 281 | ( mask & *(const word_t*)(&bTrue [k])) 282 | | (~mask & *(const word_t*)(&bFalse[k])); 283 | } 284 | } 285 | } 286 | 287 | if (elem_bytes % sizeof(word_t)) { 288 | for (; k= 256^n: 54 | raise Exception("Integer too big in to_le(%d,%d)" % (x,n)) 55 | return "".join([chr(x>>(8*i) & 255) for i in xrange(n)]) 56 | 57 | class DecafScalar(): 58 | _UNDER = c_uint64 * int(7) 59 | def __init__(self,cstruct=None,scalar=None): 60 | if cstruct is None: 61 | cstruct = DecafScalar._UNDER() 62 | memmove(addressof(cstruct), 63 | DECAF.decaf_448_scalar_zero, 64 | 8*7 65 | ) 66 | if scalar is None: 67 | scalar = E(0) 68 | self.cstruct = cstruct 69 | self.scalar = scalar 70 | 71 | self._check() 72 | 73 | @staticmethod 74 | def _c_deser(str): 75 | buffer = (c_uint8*int(56)).from_buffer_copy(str) 76 | cstruct = DecafScalar._UNDER() 77 | ret = DECAF.decaf_448_scalar_decode(cstruct,buffer,c_uint64(-1)) 78 | if ret != -1: 79 | raise Exception("scalar didn't decode") 80 | return cstruct 81 | 82 | @staticmethod 83 | def _sage_deser(str): 84 | s = from_le(str) 85 | if s >= FQ.cardinality(): raise Exception("scalar didn't decode") 86 | return FQ(s) 87 | 88 | def __eq__(self,other): 89 | csays = bool(DECAF.decaf_448_scalar_eq(self.cstruct,other.cstruct)) 90 | sagesays = any([self.scalar == other.scalar + t for t in Tor]) 91 | if csays != sagesays: 92 | raise Exception("C and SAGE don't agree: %d %d" % (csays, sagesays)) 93 | return csays 94 | 95 | def __ne__(self,other): 96 | return not self==other 97 | 98 | def __add__(self,other): 99 | cstruct = DecafScalar._UNDER() 100 | DECAF.decaf_448_scalar_add(cstruct,self.cstruct,other.cstruct) 101 | return DecafScalar(cstruct,self.scalar + other.scalar) 102 | 103 | def __sub__(self,other): 104 | cstruct = DecafScalar._UNDER() 105 | DECAF.decaf_448_scalar_sub(cstruct,self.cstruct,other.cstruct) 106 | return DecafScalar(cstruct,self.scalar - other.scalar) 107 | 108 | def __mul__(self,other): 109 | if isinstance(other,DecafScalar): 110 | 
cstruct = DecafScalar._UNDER() 111 | DECAF.decaf_448_scalar_mul(cstruct,self.cstruct,other.cstruct) 112 | return DecafScalar(cstruct,self.scalar * other.scalar) 113 | elif isinstance(other,DecafPoint): 114 | cstruct = DecafPoint._UNDER() 115 | DECAF.decaf_448_point_scalarmul(cstruct,other.cstruct,self.cstruct) 116 | return DecafPoint(cstruct,int(self.scalar) * other.point) 117 | else: raise Exception("Nope") 118 | 119 | def __div__(self,other): 120 | return self / other.inverse() 121 | 122 | def inverse(self): 123 | cstruct = DecafScalar._UNDER() 124 | z = DECAF.decaf_448_scalar_invert(cstruct,self.cstruct) 125 | if bool(z) != (self.scalar == 0): 126 | raise Exception("C and SAGE don't agree") 127 | return DecafScalar(cstruct,1/self.scalar) 128 | 129 | def __neg__(self): 130 | cstruct = DecafScalar._UNDER() 131 | DECAF.decaf_448_scalar_negate(cstruct,self.cstruct) 132 | return DecafScalar(cstruct,-self.scalar) 133 | 134 | def __str__(self): 135 | return " ".join(["%02x"%ord(b) for b in self.ser()]) 136 | 137 | def __repr__(self): 138 | return "DecafScalar.fromInt(%d)" % self.scalar 139 | 140 | @classmethod 141 | def fromInt(cls,i): 142 | return cls.deser(to_le(i,56)) 143 | 144 | def to64(self): 145 | return b64encode(self.ser()) 146 | 147 | @classmethod 148 | def from64(cls,str): 149 | return cls.deser(b64decode(str)) 150 | 151 | @classmethod 152 | def deser(cls,str): 153 | good = True 154 | try: cstruct = cls._c_deser(str) 155 | except Exception: good = False 156 | 157 | good2 = True 158 | try: scalar = cls._sage_deser(str) 159 | except Exception: good2 = False 160 | 161 | if good != good2: 162 | raise Exception("C and SAGE don't agree") 163 | elif not good: 164 | raise Exception("scalar didn't decode") 165 | 166 | return cls(cstruct,scalar) 167 | 168 | @classmethod 169 | def random(cls): 170 | while True: 171 | try: return cls.deser(random_array(56)) 172 | except Exception: pass 173 | 174 | @staticmethod 175 | def _c_ser(cstruct): 176 | buffer = 
(c_uint8*int(56))() 177 | DECAF.decaf_448_scalar_encode(buffer,cstruct) 178 | return str(bytearray(buffer)) 179 | 180 | def ser(self): 181 | return self._c_ser(self.cstruct) 182 | 183 | @staticmethod 184 | def _sage_ser(P): 185 | return to_le(P,56) 186 | 187 | def _check(self): 188 | ss = self._sage_ser(self.scalar) 189 | cs = self._c_ser(self.cstruct) 190 | if ss != cs: 191 | print ss 192 | print cs 193 | raise Exception("Check failed!") 194 | return True 195 | 196 | class DecafPoint(): 197 | _UNDER = c_uint64 * int(8*4) 198 | def __init__(self,cstruct=None,point=None): 199 | if cstruct is None: 200 | cstruct = DecafPoint._UNDER() 201 | memmove(addressof(cstruct), 202 | DECAF.decaf_448_point_identity, 203 | 8*8*4 204 | ) 205 | if point is None: 206 | point = E(0) 207 | self.cstruct = cstruct 208 | self.point = point 209 | 210 | self._check() 211 | 212 | @staticmethod 213 | def _c_deser(str): 214 | buffer = (c_uint8*int(56)).from_buffer_copy(str) 215 | cstruct = DecafPoint._UNDER() 216 | ret = DECAF.decaf_448_point_decode(cstruct,buffer,c_uint64(-1)) 217 | if ret != -1: 218 | raise Exception("Point didn't decode") 219 | return cstruct 220 | 221 | @staticmethod 222 | def _sage_deser(str): 223 | s = from_le(str) 224 | if s > (F.cardinality()-1)/2: raise Exception("Point didn't decode") 225 | if (s==0): return E(0) 226 | if not E.is_x_coord(s^2): raise Exception("Point didn't decode") 227 | P = E.lift_x(s^2) 228 | t = P.xy()[1] / s 229 | if is_odd(int(2*t/s)): P = -P 230 | return P 231 | 232 | def __eq__(self,other): 233 | csays = bool(DECAF.decaf_448_point_eq(self.cstruct,other.cstruct)) 234 | sagesays = any([self.point == other.point + t for t in Tor]) 235 | if csays != sagesays: 236 | raise Exception("C and SAGE don't agree: %d %d" % (csays, sagesays)) 237 | return csays 238 | 239 | def __ne__(self,other): 240 | return not self==other 241 | 242 | def __add__(self,other): 243 | cstruct = DecafPoint._UNDER() 244 | 
DECAF.decaf_448_point_add(cstruct,self.cstruct,other.cstruct) 245 | return DecafPoint(cstruct,self.point + other.point) 246 | 247 | def __sub__(self,other): 248 | cstruct = DecafPoint._UNDER() 249 | DECAF.decaf_448_point_sub(cstruct,self.cstruct,other.cstruct) 250 | return DecafPoint(cstruct,self.point - other.point) 251 | 252 | def __mul__(self,other): 253 | if isinstance(other,DecafScalar): 254 | return other*self 255 | else: 256 | raise Exception("nope") 257 | 258 | def __div__(self,other): 259 | if isinstance(other,DecafScalar): 260 | return other.inverse()*self 261 | else: 262 | raise Exception("nope") 263 | 264 | def __neg__(self): 265 | cstruct = DecafPoint._UNDER() 266 | DECAF.decaf_448_point_negate(cstruct,self.cstruct) 267 | return DecafPoint(cstruct,-self.point) 268 | 269 | def __str__(self): 270 | return " ".join(["%02x"%ord(b) for b in self.ser()]) 271 | 272 | def __repr__(self): 273 | return "DecafPoint.from64('%s')" % self.to64() 274 | 275 | def to64(self): 276 | return b64encode(self.ser()) 277 | 278 | @classmethod 279 | def from64(cls,str): 280 | return cls.deser(b64decode(str)) 281 | 282 | @classmethod 283 | def deser(cls,str): 284 | good = True 285 | try: cstruct = cls._c_deser(str) 286 | except Exception: good = False 287 | 288 | good2 = True 289 | try: point = cls._sage_deser(str) 290 | except Exception: good2 = False 291 | 292 | if good != good2: 293 | raise Exception("C and SAGE don't agree") 294 | elif not good: 295 | raise Exception("Point didn't decode") 296 | 297 | return cls(cstruct,point) 298 | 299 | @classmethod 300 | def random(cls): 301 | while True: 302 | try: return cls.deser(random_array(56)) 303 | except Exception: pass 304 | 305 | @staticmethod 306 | def _c_ser(cstruct): 307 | buffer = (c_uint8*int(56))() 308 | DECAF.decaf_448_point_encode(buffer,cstruct) 309 | return str(bytearray(buffer)) 310 | 311 | def ser(self): 312 | return self._c_ser(self.cstruct) 313 | 314 | @staticmethod 315 | def _sage_ser(P): 316 | if P == E(0): 
return to_le(0,56) 317 | x,y = P.xy() 318 | s = sqrt(x) 319 | if s==0: return to_le(0,56) 320 | if is_odd(int(2*y/s^2)): s = 1/s 321 | if int(s) > (F.cardinality()-1)/2: s = -s 322 | return to_le(s,56) 323 | 324 | def _check(self): 325 | ss = self._sage_ser(self.point) 326 | cs = self._c_ser(self.cstruct) 327 | if ss != cs: 328 | print ss 329 | print cs 330 | raise Exception("Check failed!") 331 | return True 332 | 333 | run_all_tests() 334 | 335 | -------------------------------------------------------------------------------- /src/p448/arch_x86_64/p448.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2014 Cryptography Research, Inc. 2 | * Released under the MIT License. See LICENSE.txt for license information. 3 | */ 4 | 5 | #include "p448.h" 6 | #include "x86-64-arith.h" 7 | 8 | void 9 | p448_mul ( 10 | p448_t *__restrict__ cs, 11 | const p448_t *as, 12 | const p448_t *bs 13 | ) { 14 | const uint64_t *a = as->limb, *b = bs->limb; 15 | uint64_t *c = cs->limb; 16 | 17 | __uint128_t accum0 = 0, accum1 = 0, accum2; 18 | uint64_t mask = (1ull<<56) - 1; 19 | 20 | uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))), bbb[4] __attribute__((aligned(32))); 21 | 22 | /* For some reason clang doesn't vectorize this without prompting? 
*/ 23 | unsigned int i; 24 | for (i=0; i>= 56; 59 | accum1 >>= 56; 60 | 61 | mac(&accum0, &aa[1],&bb[3]); 62 | mac(&accum1, &a[5], &b[7]); 63 | mac(&accum0, &aa[2], &bb[2]); 64 | mac(&accum1, &a[6], &b[6]); 65 | mac(&accum0, &aa[3], &bb[1]); 66 | accum1 += accum0; 67 | 68 | accum2 = widemul(&a[0],&b[0]); 69 | accum1 -= accum2; 70 | accum0 += accum2; 71 | 72 | msb(&accum0, &a[1], &b[3]); 73 | msb(&accum0, &a[2], &b[2]); 74 | mac(&accum1, &a[7], &b[5]); 75 | msb(&accum0, &a[3], &b[1]); 76 | mac(&accum1, &aa[0], &bb[0]); 77 | mac(&accum0, &a[4], &b[4]); 78 | 79 | c[0] = ((uint64_t)(accum0)) & mask; 80 | c[4] = ((uint64_t)(accum1)) & mask; 81 | 82 | accum0 >>= 56; 83 | accum1 >>= 56; 84 | 85 | accum2 = widemul(&a[2],&b[7]); 86 | mac(&accum0, &a[6], &bb[3]); 87 | mac(&accum1, &aa[2], &bbb[3]); 88 | 89 | mac(&accum2, &a[3], &b[6]); 90 | mac(&accum0, &a[7], &bb[2]); 91 | mac(&accum1, &aa[3], &bbb[2]); 92 | 93 | mac(&accum2, &a[0],&b[1]); 94 | mac(&accum1, &aa[0], &bb[1]); 95 | mac(&accum0, &a[4], &b[5]); 96 | 97 | mac(&accum2, &a[1], &b[0]); 98 | mac(&accum1, &aa[1], &bb[0]); 99 | mac(&accum0, &a[5], &b[4]); 100 | 101 | accum1 -= accum2; 102 | accum0 += accum2; 103 | 104 | c[1] = ((uint64_t)(accum0)) & mask; 105 | c[5] = ((uint64_t)(accum1)) & mask; 106 | 107 | accum0 >>= 56; 108 | accum1 >>= 56; 109 | 110 | accum2 = widemul(&a[3],&b[7]); 111 | mac(&accum0, &a[7], &bb[3]); 112 | mac(&accum1, &aa[3], &bbb[3]); 113 | 114 | mac(&accum2, &a[0],&b[2]); 115 | mac(&accum1, &aa[0], &bb[2]); 116 | mac(&accum0, &a[4], &b[6]); 117 | 118 | mac(&accum2, &a[1], &b[1]); 119 | mac(&accum1, &aa[1], &bb[1]); 120 | mac(&accum0, &a[5], &b[5]); 121 | 122 | mac(&accum2, &a[2], &b[0]); 123 | mac(&accum1, &aa[2], &bb[0]); 124 | mac(&accum0, &a[6], &b[4]); 125 | 126 | accum1 -= accum2; 127 | accum0 += accum2; 128 | 129 | c[2] = ((uint64_t)(accum0)) & mask; 130 | c[6] = ((uint64_t)(accum1)) & mask; 131 | 132 | accum0 >>= 56; 133 | accum1 >>= 56; 134 | 135 | accum0 += c[3]; 136 | accum1 += c[7]; 
137 | c[3] = ((uint64_t)(accum0)) & mask; 138 | c[7] = ((uint64_t)(accum1)) & mask; 139 | 140 | /* we could almost stop here, but it wouldn't be stable, so... */ 141 | 142 | accum0 >>= 56; 143 | accum1 >>= 56; 144 | c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); 145 | c[0] += ((uint64_t)(accum1)); 146 | } 147 | 148 | void 149 | p448_mulw ( 150 | p448_t *__restrict__ cs, 151 | const p448_t *as, 152 | uint64_t b 153 | ) { 154 | const uint64_t *a = as->limb; 155 | uint64_t *c = cs->limb; 156 | 157 | __uint128_t accum0, accum4; 158 | uint64_t mask = (1ull<<56) - 1; 159 | 160 | accum0 = widemul_rm(b, &a[0]); 161 | accum4 = widemul_rm(b, &a[4]); 162 | 163 | c[0] = accum0 & mask; accum0 >>= 56; 164 | c[4] = accum4 & mask; accum4 >>= 56; 165 | 166 | mac_rm(&accum0, b, &a[1]); 167 | mac_rm(&accum4, b, &a[5]); 168 | 169 | c[1] = accum0 & mask; accum0 >>= 56; 170 | c[5] = accum4 & mask; accum4 >>= 56; 171 | 172 | mac_rm(&accum0, b, &a[2]); 173 | mac_rm(&accum4, b, &a[6]); 174 | 175 | c[2] = accum0 & mask; accum0 >>= 56; 176 | c[6] = accum4 & mask; accum4 >>= 56; 177 | 178 | mac_rm(&accum0, b, &a[3]); 179 | mac_rm(&accum4, b, &a[7]); 180 | 181 | c[3] = accum0 & mask; accum0 >>= 56; 182 | c[7] = accum4 & mask; accum4 >>= 56; 183 | 184 | accum0 += accum4 + c[4]; 185 | c[4] = accum0 & mask; 186 | c[5] += accum0 >> 56; 187 | 188 | accum4 += c[0]; 189 | c[0] = accum4 & mask; 190 | c[1] += accum4 >> 56; 191 | } 192 | 193 | void 194 | p448_sqr ( 195 | p448_t *__restrict__ cs, 196 | const p448_t *as 197 | ) { 198 | const uint64_t *a = as->limb; 199 | uint64_t *c = cs->limb; 200 | 201 | __uint128_t accum0 = 0, accum1 = 0, accum2; 202 | uint64_t mask = (1ull<<56) - 1; 203 | 204 | uint64_t aa[4] __attribute__((aligned(32))); 205 | 206 | /* For some reason clang doesn't vectorize this without prompting? 
*/ 207 | unsigned int i; 208 | for (i=0; i>= 55; 227 | accum1 >>= 55; 228 | 229 | mac2(&accum0, &aa[1],&aa[3]); 230 | mac2(&accum1, &a[5], &a[7]); 231 | mac(&accum0, &aa[2], &aa[2]); 232 | accum1 += accum0; 233 | 234 | msb2(&accum0, &a[1], &a[3]); 235 | mac(&accum1, &a[6], &a[6]); 236 | 237 | accum2 = widemul(&a[0],&a[0]); 238 | accum1 -= accum2; 239 | accum0 += accum2; 240 | 241 | msb(&accum0, &a[2], &a[2]); 242 | mac(&accum1, &aa[0], &aa[0]); 243 | mac(&accum0, &a[4], &a[4]); 244 | 245 | c[0] = ((uint64_t)(accum0)) & mask; 246 | c[4] = ((uint64_t)(accum1)) & mask; 247 | 248 | accum0 >>= 56; 249 | accum1 >>= 56; 250 | 251 | accum2 = widemul2(&aa[2],&aa[3]); 252 | msb2(&accum0, &a[2], &a[3]); 253 | mac2(&accum1, &a[6], &a[7]); 254 | 255 | accum1 += accum2; 256 | accum0 += accum2; 257 | 258 | accum2 = widemul2(&a[0],&a[1]); 259 | mac2(&accum1, &aa[0], &aa[1]); 260 | mac2(&accum0, &a[4], &a[5]); 261 | 262 | accum1 -= accum2; 263 | accum0 += accum2; 264 | 265 | c[1] = ((uint64_t)(accum0)) & mask; 266 | c[5] = ((uint64_t)(accum1)) & mask; 267 | 268 | accum0 >>= 56; 269 | accum1 >>= 56; 270 | 271 | accum2 = widemul(&aa[3],&aa[3]); 272 | msb(&accum0, &a[3], &a[3]); 273 | mac(&accum1, &a[7], &a[7]); 274 | 275 | accum1 += accum2; 276 | accum0 += accum2; 277 | 278 | accum2 = widemul2(&a[0],&a[2]); 279 | mac2(&accum1, &aa[0], &aa[2]); 280 | mac2(&accum0, &a[4], &a[6]); 281 | 282 | mac(&accum2, &a[1], &a[1]); 283 | mac(&accum1, &aa[1], &aa[1]); 284 | mac(&accum0, &a[5], &a[5]); 285 | 286 | accum1 -= accum2; 287 | accum0 += accum2; 288 | 289 | c[2] = ((uint64_t)(accum0)) & mask; 290 | c[6] = ((uint64_t)(accum1)) & mask; 291 | 292 | accum0 >>= 56; 293 | accum1 >>= 56; 294 | 295 | accum0 += c[3]; 296 | accum1 += c[7]; 297 | c[3] = ((uint64_t)(accum0)) & mask; 298 | c[7] = ((uint64_t)(accum1)) & mask; 299 | 300 | /* we could almost stop here, but it wouldn't be stable, so... 
*/ 301 | 302 | accum0 >>= 56; 303 | accum1 >>= 56; 304 | c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); 305 | c[0] += ((uint64_t)(accum1)); 306 | } 307 | 308 | void 309 | p448_strong_reduce ( 310 | p448_t *a 311 | ) { 312 | uint64_t mask = (1ull<<56)-1; 313 | 314 | /* first, clear high */ 315 | a->limb[4] += a->limb[7]>>56; 316 | a->limb[0] += a->limb[7]>>56; 317 | a->limb[7] &= mask; 318 | 319 | /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */ 320 | 321 | /* compute total_value - p. No need to reduce mod p. */ 322 | 323 | __int128_t scarry = 0; 324 | int i; 325 | for (i=0; i<8; i++) { 326 | scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask); 327 | a->limb[i] = scarry & mask; 328 | scarry >>= 56; 329 | } 330 | 331 | /* uncommon case: it was >= p, so now scarry = 0 and this = x 332 | * common case: it was < p, so now scarry = -1 and this = x - p + 2^448 333 | * so let's add back in p. will carry back off the top for 2^448. 334 | */ 335 | 336 | assert(is_zero(scarry) | is_zero(scarry+1)); 337 | 338 | uint64_t scarry_mask = scarry & mask; 339 | __uint128_t carry = 0; 340 | 341 | /* add it back */ 342 | for (i=0; i<8; i++) { 343 | carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask); 344 | a->limb[i] = carry & mask; 345 | carry >>= 56; 346 | } 347 | 348 | assert(is_zero(carry + scarry)); 349 | } 350 | 351 | void 352 | p448_serialize ( 353 | uint8_t *serial, 354 | const struct p448_t *x 355 | ) { 356 | int i,j; 357 | p448_t red; 358 | p448_copy(&red, x); 359 | p448_strong_reduce(&red); 360 | for (i=0; i<8; i++) { 361 | for (j=0; j<7; j++) { 362 | serial[7*i+j] = red.limb[i]; 363 | red.limb[i] >>= 8; 364 | } 365 | assert(red.limb[i] == 0); 366 | } 367 | } 368 | 369 | mask_t 370 | p448_deserialize ( 371 | p448_t *x, 372 | const uint8_t serial[56] 373 | ) { 374 | int i,j; 375 | for (i=0; i<8; i++) { 376 | word_t out = 0; 377 | for (j=0; j<7; j++) { 378 | out |= ((word_t)serial[7*i+j])<<(8*j); 379 | } 380 | x->limb[i] = out; 381 | 
} 382 | 383 | /* Check for reduction. 384 | * 385 | * The idea is to create a variable ge which is all ones (rather, 56 ones) 386 | * if and only if the low $i$ words of $x$ are >= those of p. 387 | * 388 | * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) 389 | */ 390 | word_t ge = -1, mask = (1ull<<56)-1; 391 | for (i=0; i<4; i++) { 392 | ge &= x->limb[i]; 393 | } 394 | 395 | /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ 396 | ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask); 397 | 398 | /* Propagate the rest */ 399 | for (i=5; i<8; i++) { 400 | ge &= x->limb[i]; 401 | } 402 | 403 | return ~is_zero(ge ^ mask); 404 | } 405 | 406 | -------------------------------------------------------------------------------- /src/p480/arch_x86_64/p480.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2014 Cryptography Research, Inc. 2 | * Released under the MIT License. See LICENSE.txt for license information. 3 | */ 4 | 5 | #include "p480.h" 6 | #include "x86-64-arith.h" 7 | 8 | void 9 | p480_mul ( 10 | p480_t *__restrict__ cs, 11 | const p480_t *as, 12 | const p480_t *bs 13 | ) { 14 | const uint64_t *a = as->limb, *b = bs->limb; 15 | uint64_t *c = cs->limb; 16 | 17 | __uint128_t accum0 = 0, accum1 = 0, accum2; 18 | uint64_t mask = (1ull<<60) - 1; 19 | 20 | uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))), bbb[4] __attribute__((aligned(32))); 21 | 22 | /* For some reason clang doesn't vectorize this without prompting? 
*/ 23 | unsigned int i; 24 | for (i=0; i>= 60; 59 | accum1 >>= 60; 60 | 61 | mac(&accum0, &aa[1],&bb[3]); 62 | mac(&accum1, &a[5], &b[7]); 63 | mac(&accum0, &aa[2], &bb[2]); 64 | mac(&accum1, &a[6], &b[6]); 65 | mac(&accum0, &aa[3], &bb[1]); 66 | accum1 += accum0; 67 | 68 | accum2 = widemul(&a[0],&b[0]); 69 | accum1 -= accum2; 70 | accum0 += accum2; 71 | 72 | msb(&accum0, &a[1], &b[3]); 73 | msb(&accum0, &a[2], &b[2]); 74 | mac(&accum1, &a[7], &b[5]); 75 | msb(&accum0, &a[3], &b[1]); 76 | mac(&accum1, &aa[0], &bb[0]); 77 | mac(&accum0, &a[4], &b[4]); 78 | 79 | c[0] = ((uint64_t)(accum0)) & mask; 80 | c[4] = ((uint64_t)(accum1)) & mask; 81 | 82 | accum0 >>= 60; 83 | accum1 >>= 60; 84 | 85 | accum2 = widemul(&a[2],&b[7]); 86 | mac(&accum0, &a[6], &bb[3]); 87 | mac(&accum1, &aa[2], &bbb[3]); 88 | 89 | mac(&accum2, &a[3], &b[6]); 90 | mac(&accum0, &a[7], &bb[2]); 91 | mac(&accum1, &aa[3], &bbb[2]); 92 | 93 | mac(&accum2, &a[0],&b[1]); 94 | mac(&accum1, &aa[0], &bb[1]); 95 | mac(&accum0, &a[4], &b[5]); 96 | 97 | mac(&accum2, &a[1], &b[0]); 98 | mac(&accum1, &aa[1], &bb[0]); 99 | mac(&accum0, &a[5], &b[4]); 100 | 101 | accum1 -= accum2; 102 | accum0 += accum2; 103 | 104 | c[1] = ((uint64_t)(accum0)) & mask; 105 | c[5] = ((uint64_t)(accum1)) & mask; 106 | 107 | accum0 >>= 60; 108 | accum1 >>= 60; 109 | 110 | accum2 = widemul(&a[3],&b[7]); 111 | mac(&accum0, &a[7], &bb[3]); 112 | mac(&accum1, &aa[3], &bbb[3]); 113 | 114 | mac(&accum2, &a[0],&b[2]); 115 | mac(&accum1, &aa[0], &bb[2]); 116 | mac(&accum0, &a[4], &b[6]); 117 | 118 | mac(&accum2, &a[1], &b[1]); 119 | mac(&accum1, &aa[1], &bb[1]); 120 | mac(&accum0, &a[5], &b[5]); 121 | 122 | mac(&accum2, &a[2], &b[0]); 123 | mac(&accum1, &aa[2], &bb[0]); 124 | mac(&accum0, &a[6], &b[4]); 125 | 126 | accum1 -= accum2; 127 | accum0 += accum2; 128 | 129 | c[2] = ((uint64_t)(accum0)) & mask; 130 | c[6] = ((uint64_t)(accum1)) & mask; 131 | 132 | accum0 >>= 60; 133 | accum1 >>= 60; 134 | 135 | accum0 += c[3]; 136 | accum1 += c[7]; 
137 | c[3] = ((uint64_t)(accum0)) & mask; 138 | c[7] = ((uint64_t)(accum1)) & mask; 139 | 140 | /* we could almost stop here, but it wouldn't be stable, so... */ 141 | 142 | accum0 >>= 60; 143 | accum1 >>= 60; 144 | c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); 145 | c[0] += ((uint64_t)(accum1)); 146 | } 147 | 148 | void 149 | p480_mulw ( 150 | p480_t *__restrict__ cs, 151 | const p480_t *as, 152 | uint64_t b 153 | ) { 154 | const uint64_t *a = as->limb; 155 | uint64_t *c = cs->limb; 156 | 157 | __uint128_t accum0, accum4; 158 | uint64_t mask = (1ull<<60) - 1; 159 | 160 | accum0 = widemul_rm(b, &a[0]); 161 | accum4 = widemul_rm(b, &a[4]); 162 | 163 | c[0] = accum0 & mask; accum0 >>= 60; 164 | c[4] = accum4 & mask; accum4 >>= 60; 165 | 166 | mac_rm(&accum0, b, &a[1]); 167 | mac_rm(&accum4, b, &a[5]); 168 | 169 | c[1] = accum0 & mask; accum0 >>= 60; 170 | c[5] = accum4 & mask; accum4 >>= 60; 171 | 172 | mac_rm(&accum0, b, &a[2]); 173 | mac_rm(&accum4, b, &a[6]); 174 | 175 | c[2] = accum0 & mask; accum0 >>= 60; 176 | c[6] = accum4 & mask; accum4 >>= 60; 177 | 178 | mac_rm(&accum0, b, &a[3]); 179 | mac_rm(&accum4, b, &a[7]); 180 | 181 | c[3] = accum0 & mask; accum0 >>= 60; 182 | c[7] = accum4 & mask; accum4 >>= 60; 183 | 184 | accum0 += accum4 + c[4]; 185 | c[4] = accum0 & mask; 186 | c[5] += accum0 >> 60; 187 | 188 | accum4 += c[0]; 189 | c[0] = accum4 & mask; 190 | c[1] += accum4 >> 60; 191 | } 192 | 193 | void 194 | p480_sqr ( 195 | p480_t *__restrict__ cs, 196 | const p480_t *as 197 | ) { 198 | const uint64_t *a = as->limb; 199 | uint64_t *c = cs->limb; 200 | 201 | __uint128_t accum0 = 0, accum1 = 0, accum2; 202 | uint64_t mask = (1ull<<60) - 1; 203 | 204 | uint64_t aa[4] __attribute__((aligned(32))); 205 | 206 | /* For some reason clang doesn't vectorize this without prompting? 
*/ 207 | unsigned int i; 208 | for (i=0; i>= 59; 227 | accum1 >>= 59; 228 | 229 | mac2(&accum0, &aa[1],&aa[3]); 230 | mac2(&accum1, &a[5], &a[7]); 231 | mac(&accum0, &aa[2], &aa[2]); 232 | accum1 += accum0; 233 | 234 | msb2(&accum0, &a[1], &a[3]); 235 | mac(&accum1, &a[6], &a[6]); 236 | 237 | accum2 = widemul(&a[0],&a[0]); 238 | accum1 -= accum2; 239 | accum0 += accum2; 240 | 241 | msb(&accum0, &a[2], &a[2]); 242 | mac(&accum1, &aa[0], &aa[0]); 243 | mac(&accum0, &a[4], &a[4]); 244 | 245 | c[0] = ((uint64_t)(accum0)) & mask; 246 | c[4] = ((uint64_t)(accum1)) & mask; 247 | 248 | accum0 >>= 60; 249 | accum1 >>= 60; 250 | 251 | accum2 = widemul2(&aa[2],&aa[3]); 252 | msb2(&accum0, &a[2], &a[3]); 253 | mac2(&accum1, &a[6], &a[7]); 254 | 255 | accum1 += accum2; 256 | accum0 += accum2; 257 | 258 | accum2 = widemul2(&a[0],&a[1]); 259 | mac2(&accum1, &aa[0], &aa[1]); 260 | mac2(&accum0, &a[4], &a[5]); 261 | 262 | accum1 -= accum2; 263 | accum0 += accum2; 264 | 265 | c[1] = ((uint64_t)(accum0)) & mask; 266 | c[5] = ((uint64_t)(accum1)) & mask; 267 | 268 | accum0 >>= 60; 269 | accum1 >>= 60; 270 | 271 | accum2 = widemul(&aa[3],&aa[3]); 272 | msb(&accum0, &a[3], &a[3]); 273 | mac(&accum1, &a[7], &a[7]); 274 | 275 | accum1 += accum2; 276 | accum0 += accum2; 277 | 278 | accum2 = widemul2(&a[0],&a[2]); 279 | mac2(&accum1, &aa[0], &aa[2]); 280 | mac2(&accum0, &a[4], &a[6]); 281 | 282 | mac(&accum2, &a[1], &a[1]); 283 | mac(&accum1, &aa[1], &aa[1]); 284 | mac(&accum0, &a[5], &a[5]); 285 | 286 | accum1 -= accum2; 287 | accum0 += accum2; 288 | 289 | c[2] = ((uint64_t)(accum0)) & mask; 290 | c[6] = ((uint64_t)(accum1)) & mask; 291 | 292 | accum0 >>= 60; 293 | accum1 >>= 60; 294 | 295 | accum0 += c[3]; 296 | accum1 += c[7]; 297 | c[3] = ((uint64_t)(accum0)) & mask; 298 | c[7] = ((uint64_t)(accum1)) & mask; 299 | 300 | /* we could almost stop here, but it wouldn't be stable, so... 
*/ 301 | 302 | accum0 >>= 60; 303 | accum1 >>= 60; 304 | c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); 305 | c[0] += ((uint64_t)(accum1)); 306 | } 307 | 308 | void 309 | p480_strong_reduce ( 310 | p480_t *a 311 | ) { 312 | uint64_t mask = (1ull<<60)-1; 313 | 314 | /* first, clear high */ 315 | a->limb[4] += a->limb[7]>>60; 316 | a->limb[0] += a->limb[7]>>60; 317 | a->limb[7] &= mask; 318 | 319 | /* now the total is less than 2^480 - 2^(480-60) + 2^(480-60+8) < 2p */ 320 | 321 | /* compute total_value - p. No need to reduce mod p. */ 322 | 323 | __int128_t scarry = 0; 324 | int i; 325 | for (i=0; i<8; i++) { 326 | scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask); 327 | a->limb[i] = scarry & mask; 328 | scarry >>= 60; 329 | } 330 | 331 | /* uncommon case: it was >= p, so now scarry = 0 and this = x 332 | * common case: it was < p, so now scarry = -1 and this = x - p + 2^480 333 | * so let's add back in p. will carry back off the top for 2^480. 334 | */ 335 | 336 | assert(is_zero(scarry) | is_zero(scarry+1)); 337 | 338 | uint64_t scarry_mask = scarry & mask; 339 | __uint128_t carry = 0; 340 | 341 | /* add it back */ 342 | for (i=0; i<8; i++) { 343 | carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask); 344 | a->limb[i] = carry & mask; 345 | carry >>= 60; 346 | } 347 | 348 | assert(is_zero(carry + scarry)); 349 | } 350 | 351 | void 352 | p480_serialize ( 353 | uint8_t *serial, 354 | const struct p480_t *x 355 | ) { 356 | int i,j,k=0; 357 | p480_t red; 358 | p480_copy(&red, x); 359 | p480_strong_reduce(&red); 360 | word_t r = 0; 361 | for (i=0; i<8; i+=2) { 362 | r = red.limb[i]; 363 | for (j=0; j<7; j++) { 364 | serial[k++] = r; 365 | r >>= 8; 366 | } 367 | assert(r<16); 368 | r += red.limb[i+1]<<4; 369 | for (j=0; j<8; j++) { 370 | serial[k++] = r; 371 | r >>= 8; 372 | } 373 | assert(r==0); 374 | } 375 | } 376 | 377 | mask_t 378 | p480_deserialize ( 379 | p480_t *x, 380 | const uint8_t serial[60] 381 | ) { 382 | int i,j,k=0; 383 | 384 | for 
(i=0; i<8; i+=2) { 385 | word_t r = 0; 386 | for (j=0; j<8; j++) { 387 | r |= ((word_t)serial[k++])<<(8*j); 388 | } 389 | x->limb[i] = r & ((1ull<<60)-1); 390 | r >>= 60; 391 | for (j=0; j<7; j++) { 392 | r |= ((word_t)serial[k++])<<(8*j+4); 393 | } 394 | x->limb[i+1] = r; 395 | } 396 | 397 | /* Check for reduction. 398 | * 399 | * The idea is to create a variable ge which is all ones (rather, 60 ones) 400 | * if and only if the low $i$ words of $x$ are >= those of p. 401 | * 402 | * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) 403 | */ 404 | word_t ge = -1, mask = (1ull<<60)-1; 405 | for (i=0; i<4; i++) { 406 | ge &= x->limb[i]; 407 | } 408 | 409 | /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ 410 | ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask); 411 | 412 | /* Propagate the rest */ 413 | for (i=5; i<8; i++) { 414 | ge &= x->limb[i]; 415 | } 416 | 417 | return ~is_zero(ge ^ mask); 418 | } 419 | 420 | -------------------------------------------------------------------------------- /src/p521/arch_ref64/p521.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2014 Cryptography Research, Inc. 2 | * Released under the MIT License. See LICENSE.txt for license information. 3 | */ 4 | 5 | #include "p521.h" 6 | 7 | static __inline__ __uint128_t widemul( 8 | const uint64_t a, 9 | const uint64_t b 10 | ) { 11 | return ((__uint128_t)a) * ((__uint128_t)b); 12 | } 13 | 14 | static __inline__ uint64_t is_zero(uint64_t a) { 15 | /* let's hope the compiler isn't clever enough to optimize this. 
*/ 16 | return (((__uint128_t)a)-1)>>64; 17 | } 18 | 19 | void 20 | p521_mul ( 21 | p521_t *__restrict__ cs, 22 | const p521_t *as, 23 | const p521_t *bs 24 | ) { 25 | uint64_t *c = cs->limb; 26 | const uint64_t *a = as->limb, *b = bs->limb; 27 | __uint128_t accum0, accum1; 28 | 29 | accum0 = widemul(2*a[8], b[8]); 30 | accum1 = widemul(a[0], b[7]); 31 | accum0 += widemul(a[1], b[6]); 32 | accum1 += widemul(a[2], b[5]); 33 | accum0 += widemul(a[3], b[4]); 34 | accum1 += widemul(a[4], b[3]); 35 | accum0 += widemul(a[5], b[2]); 36 | accum1 += widemul(a[6], b[1]); 37 | accum0 += widemul(a[7], b[0]); 38 | accum1 += accum0; 39 | c[7] = accum1 & ((1ull<<58)-1); 40 | accum1 >>= 58; 41 | 42 | accum0 = 0; 43 | accum1 += widemul(a[0], b[8-0]); 44 | accum0 += widemul(a[1], b[8-1]); 45 | accum1 += widemul(a[2], b[8-2]); 46 | accum0 += widemul(a[3], b[8-3]); 47 | accum1 += widemul(a[4], b[8-4]); 48 | accum0 += widemul(a[5], b[8-5]); 49 | accum1 += widemul(a[6], b[8-6]); 50 | accum0 += widemul(a[7], b[8-7]); 51 | accum1 += widemul(a[8], b[8-8]); 52 | accum1 += accum0; 53 | c[8] = accum1 & ((1ull<<57)-1); 54 | accum1 >>= 57; 55 | 56 | accum0 = 0; 57 | accum0 += widemul(a[1], b[0+9-1]); 58 | accum0 += widemul(a[2], b[0+9-2]); 59 | accum0 += widemul(a[3], b[0+9-3]); 60 | accum0 += widemul(a[4], b[0+9-4]); 61 | accum1 += widemul(a[0], b[0-0]); 62 | accum0 += widemul(a[5], b[0+9-5]); 63 | accum0 += widemul(a[6], b[0+9-6]); 64 | accum0 += widemul(a[7], b[0+9-7]); 65 | accum0 += widemul(a[8], b[0+9-8]); 66 | accum1 += accum0 << 1; 67 | c[0] = accum1 & ((1ull<<58)-1); 68 | accum1 >>= 58; 69 | 70 | accum0 = 0; 71 | accum0 += widemul(a[2], b[1+9-2]); 72 | accum0 += widemul(a[3], b[1+9-3]); 73 | accum1 += widemul(a[0], b[1-0]); 74 | accum0 += widemul(a[4], b[1+9-4]); 75 | accum0 += widemul(a[5], b[1+9-5]); 76 | accum1 += widemul(a[1], b[1-1]); 77 | accum0 += widemul(a[6], b[1+9-6]); 78 | accum0 += widemul(a[7], b[1+9-7]); 79 | accum0 += widemul(a[8], b[1+9-8]); 80 | accum1 += accum0 << 1; 
81 | c[1] = accum1 & ((1ull<<58)-1); 82 | accum1 >>= 58; 83 | 84 | accum0 = 0; 85 | accum0 += widemul(a[3], b[2+9-3]); 86 | accum1 += widemul(a[0], b[2-0]); 87 | accum0 += widemul(a[4], b[2+9-4]); 88 | accum0 += widemul(a[5], b[2+9-5]); 89 | accum1 += widemul(a[1], b[2-1]); 90 | accum0 += widemul(a[6], b[2+9-6]); 91 | accum0 += widemul(a[7], b[2+9-7]); 92 | accum1 += widemul(a[2], b[2-2]); 93 | accum0 += widemul(a[8], b[2+9-8]); 94 | accum1 += accum0 << 1; 95 | c[2] = accum1 & ((1ull<<58)-1); 96 | accum1 >>= 58; 97 | 98 | accum0 = 0; 99 | accum0 += widemul(a[4], b[3+9-4]); 100 | accum1 += widemul(a[0], b[3-0]); 101 | accum0 += widemul(a[5], b[3+9-5]); 102 | accum1 += widemul(a[1], b[3-1]); 103 | accum0 += widemul(a[6], b[3+9-6]); 104 | accum1 += widemul(a[2], b[3-2]); 105 | accum0 += widemul(a[7], b[3+9-7]); 106 | accum1 += widemul(a[3], b[3-3]); 107 | accum0 += widemul(a[8], b[3+9-8]); 108 | accum1 += accum0 << 1; 109 | c[3] = accum1 & ((1ull<<58)-1); 110 | accum1 >>= 58; 111 | 112 | accum0 = 0; 113 | accum1 += widemul(a[0], b[4-0]); 114 | accum0 += widemul(a[5], b[4+9-5]); 115 | accum1 += widemul(a[1], b[4-1]); 116 | accum0 += widemul(a[6], b[4+9-6]); 117 | accum1 += widemul(a[2], b[4-2]); 118 | accum0 += widemul(a[7], b[4+9-7]); 119 | accum1 += widemul(a[3], b[4-3]); 120 | accum0 += widemul(a[8], b[4+9-8]); 121 | accum1 += widemul(a[4], b[4-4]); 122 | accum1 += accum0 << 1; 123 | c[4] = accum1 & ((1ull<<58)-1); 124 | accum1 >>= 58; 125 | 126 | accum0 = 0; 127 | accum1 += widemul(a[0], b[5-0]); 128 | accum0 += widemul(a[6], b[5+9-6]); 129 | accum1 += widemul(a[1], b[5-1]); 130 | accum1 += widemul(a[2], b[5-2]); 131 | accum0 += widemul(a[7], b[5+9-7]); 132 | accum1 += widemul(a[3], b[5-3]); 133 | accum1 += widemul(a[4], b[5-4]); 134 | accum0 += widemul(a[8], b[5+9-8]); 135 | accum1 += widemul(a[5], b[5-5]); 136 | accum1 += accum0 << 1; 137 | c[5] = accum1 & ((1ull<<58)-1); 138 | accum1 >>= 58; 139 | 140 | accum0 = 0; 141 | accum1 += widemul(a[0], b[6-0]); 142 | 
accum1 += widemul(a[1], b[6-1]); 143 | accum0 += widemul(a[7], b[6+9-7]); 144 | accum1 += widemul(a[2], b[6-2]); 145 | accum1 += widemul(a[3], b[6-3]); 146 | accum1 += widemul(a[4], b[6-4]); 147 | accum0 += widemul(a[8], b[6+9-8]); 148 | accum1 += widemul(a[5], b[6-5]); 149 | accum1 += widemul(a[6], b[6-6]); 150 | accum1 += accum0 << 1; 151 | c[6] = accum1 & ((1ull<<58)-1); 152 | accum1 >>= 58; 153 | /* Fold the carry left over from column 6 into the c[7] value already stored, re-mask it, and add the overflow to c[8]. */ 154 | accum1 += c[7]; 155 | c[7] = accum1 & ((1ull<<58)-1); 156 | 157 | c[8] += accum1 >> 58; 158 | } 159 | /* p521_mulw: multiplies the nine limbs of as by the single word b and stores the result in cs->limb. Three carry chains (limbs 0-2, 3-5 and 6-8) advance side by side; limb 8 keeps 57 bits, every other limb 58. */ 160 | void 161 | p521_mulw ( 162 | p521_t *__restrict__ cs, 163 | const p521_t *as, 164 | uint64_t b 165 | ) { 166 | const uint64_t *a = as->limb; 167 | uint64_t *c = cs->limb; 168 | 169 | __uint128_t accum0 = 0, accum3 = 0, accum6 = 0; 170 | uint64_t mask = (1ull<<58) - 1; 171 | 172 | int i; 173 | for (i=0; i<3; i++) { 174 | accum0 += widemul(b, a[i]); 175 | accum3 += widemul(b, a[i+3]); 176 | accum6 += widemul(b, a[i+6]); 177 | c[i] = accum0 & mask; accum0 >>= 58; 178 | c[i+3] = accum3 & mask; accum3 >>= 58; 179 | if (i==2) { 180 | c[i+6] = accum6 & (mask>>1); accum6 >>= 57; 181 | } else { 182 | c[i+6] = accum6 & mask; accum6 >>= 58; 183 | } 184 | } 185 | /* Join the chains: the carry out of limb 2 enters limb 3, the carry out of limb 5 enters limb 6, and the carry out of limb 8 wraps around into limb 0. */ 186 | accum0 += c[3]; 187 | c[3] = accum0 & mask; 188 | c[4] += accum0 >> 58; 189 | 190 | accum3 += c[6]; 191 | c[6] = accum3 & mask; 192 | c[7] += accum3 >> 58; 193 | 194 | accum6 += c[0]; 195 | c[0] = accum6 & mask; 196 | c[1] += accum6 >> 58; 197 | } 198 | /* p521_sqr: computes the square of as into cs->limb, forming each symmetric cross term once and doubling it. */ 199 | void 200 | p521_sqr ( 201 | p521_t *__restrict__ cs, 202 | const p521_t *as 203 | ) { 204 | uint64_t *c = cs->limb; 205 | const uint64_t *a = as->limb; 206 | __uint128_t accum0, accum1; 207 | 208 | accum0 = widemul(a[8], a[8]); 209 | accum1 = widemul(a[0], a[7]); 210 | accum0 += widemul(a[1], a[6]); 211 | accum1 += widemul(a[2], a[5]); 212 | accum0 += widemul(a[3], a[4]); 213 | accum1 += accum0; 214 | c[7] = 2 * (accum1 & ((1ull<<57)-1)); 215 | accum1 >>= 57; 216 | /* NOTE(review): the accumulator reset below appears twice; the repeat is redundant. */ 217 | accum0 = 0; 218 | accum0 = 0; 219 | accum1 += widemul(a[4], a[4]); 220 | accum0 
+= widemul(a[1], a[7]); 221 | accum1 += widemul(2*a[2], a[6]); 222 | accum0 += widemul(a[3], a[5]); 223 | accum1 += widemul(2*a[0], a[8]); 224 | accum1 += 2*accum0; 225 | c[8] = accum1 & ((1ull<<57)-1); 226 | accum1 >>= 57; 227 | 228 | accum0 = 0; 229 | accum1 += widemul(a[0], a[0]); 230 | accum0 += widemul(a[1], a[8]); 231 | accum0 += widemul(a[2], a[7]); 232 | accum0 += widemul(a[3], a[6]); 233 | accum0 += widemul(a[4], a[5]); 234 | accum1 += accum0 << 2; 235 | c[0] = accum1 & ((1ull<<58)-1); 236 | accum1 >>= 58; 237 | 238 | accum0 = 0; 239 | accum0 += widemul(a[2], a[8]); 240 | accum0 += widemul(a[3], a[7]); 241 | accum0 += widemul(a[4], a[6]); 242 | accum0 <<= 1; 243 | accum0 += widemul(a[5], a[5]); 244 | accum0 += widemul(a[0], a[1]); 245 | accum1 += accum0 << 1; 246 | c[1] = accum1 & ((1ull<<58)-1); 247 | accum1 >>= 58; 248 | 249 | accum0 = 0; 250 | accum1 += widemul(a[1], a[1]); 251 | 252 | accum0 += widemul(a[3], a[8]); 253 | accum0 += widemul(a[4], a[7]); 254 | accum0 += widemul(a[5], a[6]); 255 | accum0 <<= 1; 256 | accum0 += widemul(a[0], a[2]); 257 | accum1 += accum0 << 1; 258 | c[2] = accum1 & ((1ull<<58)-1); 259 | accum1 >>= 58; 260 | 261 | accum0 = 0; 262 | accum0 += widemul(a[6], a[6]); 263 | accum0 += widemul(2*a[5], a[7]); 264 | accum0 += widemul(2*a[4], a[8]); 265 | accum0 += widemul(a[0], a[3]); 266 | accum0 += widemul(a[1], a[2]); 267 | accum1 += accum0 << 1; 268 | c[3] = accum1 & ((1ull<<58)-1); 269 | accum1 >>= 58; 270 | 271 | accum0 = 0; 272 | accum0 += widemul(a[6], a[7]); 273 | accum0 += widemul(a[5], a[8]); 274 | accum0 <<= 1; 275 | accum1 += widemul(a[2], a[2]); 276 | accum0 += widemul(a[0], a[4]); 277 | accum0 += widemul(a[1], a[3]); 278 | accum1 += accum0 << 1; 279 | c[4] = accum1 & ((1ull<<58)-1); 280 | accum1 >>= 58; 281 | 282 | accum0 = 0; 283 | accum0 += widemul(2*a[6], a[8]); 284 | accum0 += widemul(a[7], a[7]); 285 | accum0 += widemul(a[0], a[5]); 286 | accum0 += widemul(a[1], a[4]); 287 | accum0 += widemul(a[2], a[3]); 288 | 
accum1 += accum0 << 1; 289 | c[5] = accum1 & ((1ull<<58)-1); 290 | accum1 >>= 58; 291 | 292 | accum0 = 0; 293 | accum1 += widemul(a[3], a[3]); 294 | accum0 += widemul(a[0], a[6]); 295 | accum0 += widemul(a[1], a[5]); 296 | accum0 += widemul(2*a[7], a[8]); 297 | accum0 += widemul(a[2], a[4]); 298 | accum1 += accum0 << 1; 299 | c[6] = accum1 & ((1ull<<58)-1); 300 | accum1 >>= 58; 301 | /* Fold the carry out of column 6 into the stored c[7], then spill the overflow into c[8]. */ 302 | accum1 += c[7]; 303 | c[7] = accum1 & ((1ull<<58)-1); 304 | 305 | c[8] += accum1 >> 58; 306 | } 307 | /* p521_strong_reduce: rewrites a in place. The bits of limb 8 above bit 56 become the initial carry, the all-ones limb pattern (58 ones per limb, 57 for limb 8) is subtracted limb by limb, and the same pattern is then added back masked by the final borrow. */ 308 | void 309 | p521_strong_reduce ( 310 | p521_t *a 311 | ) { 312 | uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1; 313 | 314 | /* first, clear high */ 315 | __int128_t scarry = a->limb[8]>>57; 316 | a->limb[8] &= mask2; 317 | 318 | /* now the total is less than 2p */ 319 | 320 | /* compute total_value - p. No need to reduce mod p. */ 321 | 322 | int i; 323 | for (i=0; i<9; i++) { 324 | scarry = scarry + a->limb[i] - ((i==8) ? mask2 : mask); 325 | a->limb[i] = scarry & ((i==8) ? mask2 : mask); 326 | scarry >>= (i==8) ? 57 : 58; 327 | } 328 | 329 | /* uncommon case: it was >= p, so now scarry = 0 and this = x 330 | * common case: it was < p, so now scarry = -1 and this = x - p + 2^521 331 | * so let's add back in p. will carry back off the top for 2^521. 332 | */ 333 | 334 | assert(is_zero(scarry) | is_zero(scarry+1)); 335 | /* scarry is asserted to be 0 or -1, so scarry_mask is either zero or 58 ones. */ 336 | uint64_t scarry_mask = scarry & mask; 337 | __uint128_t carry = 0; 338 | 339 | /* add it back */ 340 | for (i=0; i<9; i++) { 341 | carry = carry + a->limb[i] + ((i==8)?(scarry_mask>>1):scarry_mask); 342 | a->limb[i] = carry & ((i==8) ? mask>>1 : mask); 343 | carry >>= (i==8) ? 
57 : 58; 344 | } 345 | /* The carry out of the add-back has to cancel the earlier borrow. */ 346 | assert(is_zero(carry + scarry)); 347 | } 348 | /* p521_serialize: copies x, strongly reduces the copy, and packs its nine limbs (58 bits of spacing each) into serial least-significant byte first; 66 bytes are written in total. */ 349 | void 350 | p521_serialize ( 351 | uint8_t *serial, 352 | const struct p521_t *x 353 | ) { 354 | int i,k=0; 355 | p521_t red; 356 | p521_copy(&red, x); 357 | p521_strong_reduce(&red); 358 | 359 | uint64_t r=0; 360 | int bits = 0; 361 | for (i=0; i<9; i++) { 362 | r |= red.limb[i] << bits; 363 | for (bits += 58; bits >= 8; bits -= 8) { 364 | serial[k++] = r; 365 | r >>= 8; 366 | } 367 | assert(bits <= 6); 368 | } 369 | assert(bits); 370 | serial[k++] = r; 371 | } 372 | /* p521_deserialize: unpacks 66 little-endian bytes from serial into the nine limbs of x (each stored masked to 58 bits) and returns a mask that stays set only when the top limb is below 2^57 and the value is not the all-ones 521-bit pattern. */ 373 | mask_t 374 | p521_deserialize ( 375 | p521_t *x, 376 | const uint8_t serial[66] 377 | ) { 378 | int i,k=0,bits=0; 379 | __uint128_t out = 0; 380 | uint64_t mask = (1ull<<58)-1; 381 | for (i=0; i<9; i++) { 382 | out >>= 58; 383 | for (; bits<58; bits+=8) { 384 | out |= ((__uint128_t)serial[k++])<<bits; 385 | } 386 | x->limb[i] = out & mask; 387 | bits -= 58; 388 | } 389 | 390 | /* Check for reduction. First, high has to be < 2^57 */ 391 | mask_t good = is_zero(out>>57); 392 | /* and+1 reaches 2^58 only when limbs 0..7 are all ones and the top limb is 2^57 - 1. */ 393 | uint64_t and = -1ull; 394 | for (i=0; i<8; i++) { 395 | and &= x->limb[i]; 396 | } 397 | and &= (2*out+1); 398 | good &= is_zero((and+1)>>58); 399 | 400 | return good; 401 | } 402 | -------------------------------------------------------------------------------- /src/p448/arch_ref64/p448.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2014 Cryptography Research, Inc. 2 | * Released under the MIT License. See LICENSE.txt for license information. 3 | */ 4 | 5 | #include "p448.h" 6 | /* widemul: full 128-bit product of two 64-bit words. */ 7 | static __inline__ __uint128_t widemul( 8 | const uint64_t a, 9 | const uint64_t b 10 | ) { 11 | return ((__uint128_t)a) * ((__uint128_t)b); 12 | } 13 | /* is_zero: returns all 64 bits set when a is 0, and 0 otherwise. */ 14 | static __inline__ uint64_t is_zero(uint64_t a) { 15 | /* let's hope the compiler isn't clever enough to optimize this. 
*/ 16 | return (((__uint128_t)a)-1)>>64; 17 | } 18 | /* p448_mul: multiplies the eight-limb values as and bs into cs->limb using 56-bit limb masks. The half sums aa = low + high and bb = low + high (with bbb = bb + high) feed three product streams, accum0, accum1 and accum2, that are combined per column; this appears to be a Karatsuba-style split (TODO confirm against the design notes). */ 19 | void 20 | p448_mul ( 21 | p448_t *__restrict__ cs, 22 | const p448_t *as, 23 | const p448_t *bs 24 | ) { 25 | const uint64_t *a = as->limb, *b = bs->limb; 26 | uint64_t *c = cs->limb; 27 | 28 | __uint128_t accum0 = 0, accum1 = 0, accum2; 29 | uint64_t mask = (1ull<<56) - 1; 30 | 31 | uint64_t aa[4], bb[4], bbb[4]; 32 | 33 | unsigned int i; 34 | for (i=0; i<4; i++) { 35 | aa[i] = a[i] + a[i+4]; 36 | bb[i] = b[i] + b[i+4]; 37 | bbb[i] = bb[i] + b[i+4]; 38 | } 39 | /* Local switch fixed at 0, so the hand-unrolled else-branch is the one that runs. */ 40 | int I_HATE_UNROLLED_LOOPS = 0; 41 | 42 | if (I_HATE_UNROLLED_LOOPS) { 43 | /* The compiler probably won't unroll this, 44 | * so it's like 80% slower. 45 | */ 46 | for (i=0; i<4; i++) { 47 | accum2 = 0; 48 | 49 | unsigned int j; 50 | for (j=0; j<=i; j++) { 51 | accum2 += widemul(a[j], b[i-j]); 52 | accum1 += widemul(aa[j], bb[i-j]); 53 | accum0 += widemul(a[j+4], b[i-j+4]); 54 | } 55 | for (; j<4; j++) { 56 | accum2 += widemul(a[j], b[i-j+8]); 57 | accum1 += widemul(aa[j], bbb[i-j+4]); 58 | accum0 += widemul(a[j+4], bb[i-j+4]); 59 | } 60 | 61 | accum1 -= accum2; 62 | accum0 += accum2; 63 | 64 | c[i] = ((uint64_t)(accum0)) & mask; 65 | c[i+4] = ((uint64_t)(accum1)) & mask; 66 | 67 | accum0 >>= 56; 68 | accum1 >>= 56; 69 | } 70 | } else { 71 | accum2 = widemul(a[0], b[0]); 72 | accum1 += widemul(aa[0], bb[0]); 73 | accum0 += widemul(a[4], b[4]); 74 | 75 | accum2 += widemul(a[1], b[7]); 76 | accum1 += widemul(aa[1], bbb[3]); 77 | accum0 += widemul(a[5], bb[3]); 78 | 79 | accum2 += widemul(a[2], b[6]); 80 | accum1 += widemul(aa[2], bbb[2]); 81 | accum0 += widemul(a[6], bb[2]); 82 | 83 | accum2 += widemul(a[3], b[5]); 84 | accum1 += widemul(aa[3], bbb[1]); 85 | accum0 += widemul(a[7], bb[1]); 86 | /* Combine the three streams and emit limbs 0 and 4. */ 87 | accum1 -= accum2; 88 | accum0 += accum2; 89 | 90 | c[0] = ((uint64_t)(accum0)) & mask; 91 | c[4] = ((uint64_t)(accum1)) & mask; 92 | 93 | accum0 >>= 56; 94 | accum1 >>= 56; 95 | 96 | accum2 = widemul(a[0], b[1]); 97 | accum1 += widemul(aa[0], bb[1]); 
98 | accum0 += widemul(a[4], b[5]); 99 | 100 | accum2 += widemul(a[1], b[0]); 101 | accum1 += widemul(aa[1], bb[0]); 102 | accum0 += widemul(a[5], b[4]); 103 | 104 | accum2 += widemul(a[2], b[7]); 105 | accum1 += widemul(aa[2], bbb[3]); 106 | accum0 += widemul(a[6], bb[3]); 107 | 108 | accum2 += widemul(a[3], b[6]); 109 | accum1 += widemul(aa[3], bbb[2]); 110 | accum0 += widemul(a[7], bb[2]); 111 | 112 | accum1 -= accum2; 113 | accum0 += accum2; 114 | 115 | c[1] = ((uint64_t)(accum0)) & mask; 116 | c[5] = ((uint64_t)(accum1)) & mask; 117 | 118 | accum0 >>= 56; 119 | accum1 >>= 56; 120 | 121 | accum2 = widemul(a[0], b[2]); 122 | accum1 += widemul(aa[0], bb[2]); 123 | accum0 += widemul(a[4], b[6]); 124 | 125 | accum2 += widemul(a[1], b[1]); 126 | accum1 += widemul(aa[1], bb[1]); 127 | accum0 += widemul(a[5], b[5]); 128 | 129 | accum2 += widemul(a[2], b[0]); 130 | accum1 += widemul(aa[2], bb[0]); 131 | accum0 += widemul(a[6], b[4]); 132 | 133 | accum2 += widemul(a[3], b[7]); 134 | accum1 += widemul(aa[3], bbb[3]); 135 | accum0 += widemul(a[7], bb[3]); 136 | 137 | accum1 -= accum2; 138 | accum0 += accum2; 139 | 140 | c[2] = ((uint64_t)(accum0)) & mask; 141 | c[6] = ((uint64_t)(accum1)) & mask; 142 | 143 | accum0 >>= 56; 144 | accum1 >>= 56; 145 | 146 | accum2 = widemul(a[0], b[3]); 147 | accum1 += widemul(aa[0], bb[3]); 148 | accum0 += widemul(a[4], b[7]); 149 | 150 | accum2 += widemul(a[1], b[2]); 151 | accum1 += widemul(aa[1], bb[2]); 152 | accum0 += widemul(a[5], b[6]); 153 | 154 | accum2 += widemul(a[2], b[1]); 155 | accum1 += widemul(aa[2], bb[1]); 156 | accum0 += widemul(a[6], b[5]); 157 | 158 | accum2 += widemul(a[3], b[0]); 159 | accum1 += widemul(aa[3], bb[0]); 160 | accum0 += widemul(a[7], b[4]); 161 | 162 | accum1 -= accum2; 163 | accum0 += accum2; 164 | 165 | c[3] = ((uint64_t)(accum0)) & mask; 166 | c[7] = ((uint64_t)(accum1)) & mask; 167 | 168 | accum0 >>= 56; 169 | accum1 >>= 56; 170 | } /* !I_HATE_UNROLLED_LOOPS */ 171 | 172 | accum0 += accum1; 173 | 
accum0 += c[4]; 174 | accum1 += c[0]; 175 | c[4] = ((uint64_t)(accum0)) & mask; 176 | c[0] = ((uint64_t)(accum1)) & mask; 177 | /* Push what is left of both carries one limb up, into c[5] and c[1]. */ 178 | accum0 >>= 56; 179 | accum1 >>= 56; 180 | 181 | c[5] += ((uint64_t)(accum0)); 182 | c[1] += ((uint64_t)(accum1)); 183 | } 184 | /* p448_mulw: multiplies the eight limbs of as by the single word b into cs->limb, running two four-limb carry chains (limbs 0-3 and 4-7) with 56-bit masks. */ 185 | void 186 | p448_mulw ( 187 | p448_t *__restrict__ cs, 188 | const p448_t *as, 189 | uint64_t b 190 | ) { 191 | const uint64_t *a = as->limb; 192 | uint64_t *c = cs->limb; 193 | 194 | __uint128_t accum0 = 0, accum4 = 0; 195 | uint64_t mask = (1ull<<56) - 1; 196 | 197 | int i; 198 | for (i=0; i<4; i++) { 199 | accum0 += widemul(b, a[i]); 200 | accum4 += widemul(b, a[i+4]); 201 | c[i] = accum0 & mask; accum0 >>= 56; 202 | c[i+4] = accum4 & mask; accum4 >>= 56; 203 | } 204 | /* Wrap-around: the carry out of the low chain enters limb 4, while the carry out of the high chain is added at both limb 4 and limb 0. */ 205 | accum0 += accum4 + c[4]; 206 | c[4] = accum0 & mask; 207 | c[5] += accum0 >> 56; 208 | 209 | accum4 += c[0]; 210 | c[0] = accum4 & mask; 211 | c[1] += accum4 >> 56; 212 | } 213 | /* p448_sqr: squares as into cs->limb, working from the half sums aa[i] = a[i] + a[i+4] and 56-bit limb masks. */ 214 | void 215 | p448_sqr ( 216 | p448_t *__restrict__ cs, 217 | const p448_t *as 218 | ) { 219 | const uint64_t *a = as->limb; 220 | uint64_t *c = cs->limb; 221 | 222 | __uint128_t accum0 = 0, accum1 = 0, accum2; 223 | uint64_t mask = (1ull<<56) - 1; 224 | 225 | uint64_t aa[4]; 226 | 227 | /* For some reason clang doesn't vectorize this without prompting? 
*/ 228 | unsigned int i; 229 | for (i=0; i<4; i++) { 230 | aa[i] = a[i] + a[i+4]; 231 | } 232 | 233 | accum2 = widemul(a[0],a[3]); 234 | accum0 = widemul(aa[0],aa[3]); 235 | accum1 = widemul(a[4],a[7]); 236 | 237 | accum2 += widemul(a[1], a[2]); 238 | accum0 += widemul(aa[1], aa[2]); 239 | accum1 += widemul(a[5], a[6]); 240 | 241 | accum0 -= accum2; 242 | accum1 += accum2; 243 | 244 | c[3] = ((uint64_t)(accum1))<<1 & mask; 245 | c[7] = ((uint64_t)(accum0))<<1 & mask; 246 | 247 | accum0 >>= 55; 248 | accum1 >>= 55; 249 | 250 | accum0 += widemul(2*aa[1],aa[3]); 251 | accum1 += widemul(2*a[5], a[7]); 252 | accum0 += widemul(aa[2], aa[2]); 253 | accum1 += accum0; 254 | 255 | accum0 -= widemul(2*a[1], a[3]); 256 | accum1 += widemul(a[6], a[6]); 257 | 258 | accum2 = widemul(a[0],a[0]); 259 | accum1 -= accum2; 260 | accum0 += accum2; 261 | 262 | accum0 -= widemul(a[2], a[2]); 263 | accum1 += widemul(aa[0], aa[0]); 264 | accum0 += widemul(a[4], a[4]); 265 | 266 | c[0] = ((uint64_t)(accum0)) & mask; 267 | c[4] = ((uint64_t)(accum1)) & mask; 268 | 269 | accum0 >>= 56; 270 | accum1 >>= 56; 271 | 272 | accum2 = widemul(2*aa[2],aa[3]); 273 | accum0 -= widemul(2*a[2], a[3]); 274 | accum1 += widemul(2*a[6], a[7]); 275 | 276 | accum1 += accum2; 277 | accum0 += accum2; 278 | 279 | accum2 = widemul(2*a[0],a[1]); 280 | accum1 += widemul(2*aa[0], aa[1]); 281 | accum0 += widemul(2*a[4], a[5]); 282 | 283 | accum1 -= accum2; 284 | accum0 += accum2; 285 | 286 | c[1] = ((uint64_t)(accum0)) & mask; 287 | c[5] = ((uint64_t)(accum1)) & mask; 288 | 289 | accum0 >>= 56; 290 | accum1 >>= 56; 291 | 292 | accum2 = widemul(aa[3],aa[3]); 293 | accum0 -= widemul(a[3], a[3]); 294 | accum1 += widemul(a[7], a[7]); 295 | 296 | accum1 += accum2; 297 | accum0 += accum2; 298 | 299 | accum2 = widemul(2*a[0],a[2]); 300 | accum1 += widemul(2*aa[0], aa[2]); 301 | accum0 += widemul(2*a[4], a[6]); 302 | 303 | accum2 += widemul(a[1], a[1]); 304 | accum1 += widemul(aa[1], aa[1]); 305 | accum0 += widemul(a[5], 
a[5]); 306 | 307 | accum1 -= accum2; 308 | accum0 += accum2; 309 | 310 | c[2] = ((uint64_t)(accum0)) & mask; 311 | c[6] = ((uint64_t)(accum1)) & mask; 312 | 313 | accum0 >>= 56; 314 | accum1 >>= 56; 315 | /* Add the column carries into the c[3] and c[7] values stored earlier, then re-mask them. */ 316 | accum0 += c[3]; 317 | accum1 += c[7]; 318 | c[3] = ((uint64_t)(accum0)) & mask; 319 | c[7] = ((uint64_t)(accum1)) & mask; 320 | 321 | /* we could almost stop here, but it wouldn't be stable, so... */ 322 | 323 | accum0 >>= 56; 324 | accum1 >>= 56; 325 | c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); 326 | c[0] += ((uint64_t)(accum1)); 327 | } 328 | /* p448_strong_reduce: rewrites a in place. The bits of limb 7 above bit 55 are folded into limbs 4 and 0, then the limb pattern of p (56 ones per limb, with the low bit of limb 4 cleared) is subtracted limb by limb and conditionally added back. */ 329 | void 330 | p448_strong_reduce ( 331 | p448_t *a 332 | ) { 333 | uint64_t mask = (1ull<<56)-1; 334 | 335 | /* first, clear high */ 336 | a->limb[4] += a->limb[7]>>56; 337 | a->limb[0] += a->limb[7]>>56; 338 | a->limb[7] &= mask; 339 | 340 | /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */ 341 | 342 | /* compute total_value - p. No need to reduce mod p. */ 343 | 344 | __int128_t scarry = 0; 345 | int i; 346 | for (i=0; i<8; i++) { 347 | scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask); 348 | a->limb[i] = scarry & mask; 349 | scarry >>= 56; 350 | } 351 | 352 | /* uncommon case: it was >= p, so now scarry = 0 and this = x 353 | * common case: it was < p, so now scarry = -1 and this = x - p + 2^448 354 | * so let's add back in p. will carry back off the top for 2^448. 
355 | */ 356 | /* The borrow left by the subtraction must be 0 or -1. */ 357 | assert(is_zero(scarry) | is_zero(scarry+1)); 358 | 359 | uint64_t scarry_mask = scarry & mask; 360 | __uint128_t carry = 0; 361 | 362 | /* add it back */ 363 | for (i=0; i<8; i++) { 364 | carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask); 365 | a->limb[i] = carry & mask; 366 | carry >>= 56; 367 | } 368 | 369 | assert(is_zero(carry + scarry)); 370 | } 371 | /* p448_serialize: copies x, strongly reduces the copy, and writes each of its eight limbs as seven bytes, least-significant first (56 bytes in total), into serial. */ 372 | void 373 | p448_serialize ( 374 | uint8_t *serial, 375 | const struct p448_t *x 376 | ) { 377 | int i,j; 378 | p448_t red; 379 | p448_copy(&red, x); 380 | p448_strong_reduce(&red); 381 | for (i=0; i<8; i++) { 382 | for (j=0; j<7; j++) { 383 | serial[7*i+j] = red.limb[i]; 384 | red.limb[i] >>= 8; 385 | } 386 | assert(red.limb[i] == 0); 387 | } 388 | } 389 | /* p448_deserialize: reads 56 bytes as eight limbs of seven little-endian bytes each, stores them in x, and returns ~is_zero(ge ^ mask): zero when ge ends up as 56 ones (value at least p), the complement otherwise. */ 390 | mask_t 391 | p448_deserialize ( 392 | p448_t *x, 393 | const uint8_t serial[56] 394 | ) { 395 | int i,j; 396 | for (i=0; i<8; i++) { 397 | uint64_t out = 0; 398 | for (j=0; j<7; j++) { 399 | out |= ((uint64_t)serial[7*i+j])<<(8*j); 400 | } 401 | x->limb[i] = out; 402 | } 403 | 404 | /* Check for reduction. 405 | * 406 | * The idea is to create a variable ge which is all ones (rather, 56 ones) 407 | * if and only if the low $i$ words of $x$ are >= those of p. 408 | * 409 | * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111) 410 | */ 411 | uint64_t ge = -1, mask = (1ull<<56)-1; 412 | for (i=0; i<4; i++) { 413 | ge &= x->limb[i]; 414 | } 415 | 416 | /* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */ 417 | ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask); 418 | 419 | /* Propagate the rest */ 420 | for (i=5; i<8; i++) { 421 | ge &= x->limb[i]; 422 | } 423 | 424 | return ~is_zero(ge ^ mask); 425 | } 426 | --------------------------------------------------------------------------------