├── src
    ├── p448
    │   ├── arch_32
    │   │   ├── arch_config.h
    │   │   ├── p448.h
    │   │   └── p448.c
    │   ├── arch_ref64
    │   │   ├── arch_config.h
    │   │   ├── p448.h
    │   │   └── p448.c
    │   ├── arch_arm_32
    │   │   ├── arch_config.h
    │   │   └── p448.h
    │   ├── arch_x86_64
    │   │   ├── arch_config.h
    │   │   ├── p448.h
    │   │   ├── x86-64-arith.h
    │   │   └── p448.c
    │   ├── arch_neon_experimental
    │   │   ├── arch_config.h
    │   │   └── p448.h
    │   ├── f_field.h
    │   └── f_arithmetic.c
    ├── p521
    │   ├── arch_ref64
    │   │   ├── arch_config.h
    │   │   ├── p521.h
    │   │   └── p521.c
    │   ├── arch_x86_64_r12
    │   │   ├── arch_config.h
    │   │   └── p521.h
    │   ├── f_field.h
    │   └── f_arithmetic.c
    ├── p480
    │   ├── arch_x86_64
    │   │   ├── arch_config.h
    │   │   ├── p480.h
    │   │   ├── x86-64-arith.h
    │   │   └── p480.c
    │   ├── f_field.h
    │   └── f_arithmetic.c
    ├── bat
    │   ├── api_dh.h
    │   ├── api_sign.h
    │   ├── dh.c
    │   └── sign.c
    ├── include
    │   ├── decaf_448_config.h
    │   ├── field.h
    │   ├── word.h
    │   └── constant_time.h
    ├── decaf_gen_tables.c
    └── decaf_crypto.c
├── test
    ├── batarch.map
    ├── shakesum.c
    ├── test_decaf.cxx
    └── test_decaf.sage
├── README.txt
├── LICENSE.txt
├── TODO.txt
├── aux
    ├── decaffeinate_curve25519.sage
    ├── idealized.sage
    └── curve.sage
├── include
    └── decaf_crypto.h
└── Makefile


/src/p448/arch_32/arch_config.h:
--------------------------------------------------------------------------------
1 | #define WORD_BITS 32
2 | 


--------------------------------------------------------------------------------
/src/p448/arch_ref64/arch_config.h:
--------------------------------------------------------------------------------
1 | #define WORD_BITS 64
2 | 


--------------------------------------------------------------------------------
/src/p521/arch_ref64/arch_config.h:
--------------------------------------------------------------------------------
1 | #define WORD_BITS 64
2 | 


--------------------------------------------------------------------------------
/src/p448/arch_arm_32/arch_config.h:
--------------------------------------------------------------------------------
1 | #define WORD_BITS 32
2 | 


--------------------------------------------------------------------------------
/src/p448/arch_x86_64/arch_config.h:
--------------------------------------------------------------------------------
1 | #define WORD_BITS 64
2 | 


--------------------------------------------------------------------------------
/src/p480/arch_x86_64/arch_config.h:
--------------------------------------------------------------------------------
1 | #define WORD_BITS 64
2 | 


--------------------------------------------------------------------------------
/src/p521/arch_x86_64_r12/arch_config.h:
--------------------------------------------------------------------------------
1 | #define WORD_BITS 64
2 | 


--------------------------------------------------------------------------------
/src/p448/arch_neon_experimental/arch_config.h:
--------------------------------------------------------------------------------
1 | #define WORD_BITS 32
2 | 


--------------------------------------------------------------------------------
/test/batarch.map:
--------------------------------------------------------------------------------
1 | neon arch_neon_experimental
2 | arm32 arch_arm_32
3 | 64 arch_ref64
4 | 32 arch_32
5 | amd64 arch_x86_64
6 | 


--------------------------------------------------------------------------------
/src/bat/api_dh.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @file api_dh.h
 3 |  * @copyright
 4 |  *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 5 |  *   Released under the MIT License.  See LICENSE.txt for license information.
 6 |  * @author Mike Hamburg
 7 |  * @brief BATMAN / SUPERCOP glue for benchmarking.
 8 |  */
 9 | 
10 | #include <string.h>
11 | #include "decaf_crypto.h"
12 | 
13 | #define PUBLICKEY_BYTES (sizeof(decaf_448_public_key_t))
14 | #define SECRETKEY_BYTES (sizeof(decaf_448_private_key_t))
15 | #define SHAREDSECRET_BYTES 32
16 | 
17 | #define CRYPTO_PUBLICKEYBYTES PUBLICKEY_BYTES
18 | #define CRYPTO_SECRETKEYBYTES SECRETKEY_BYTES
19 | #define CRYPTO_BYTES SHAREDSECRET_BYTES
20 | #define PRIVATEKEY_BYTES SECRETKEY_BYTES
21 | #define CRYPTO_VERSION "__TODAY__"
22 | 
23 | #define CRYPTO_DETERMINISTIC 1
24 | 
25 | 


--------------------------------------------------------------------------------
/src/bat/api_sign.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @file api_sign.h
 3 |  * @copyright
 4 |  *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 5 |  *   Released under the MIT License.  See LICENSE.txt for license information.
 6 |  * @author Mike Hamburg
 7 |  * @brief BATMAN / SUPERCOP glue for benchmarking.
 8 |  */
 9 | 
10 | #include <string.h>
11 | #include "decaf_crypto.h"
12 | 
13 | #define PUBLICKEY_BYTES (sizeof(decaf_448_public_key_t))
14 | #define SECRETKEY_BYTES (sizeof(decaf_448_private_key_t))
15 | #define SIGNATURE_BYTES (sizeof(decaf_448_signature_t))
16 | 
17 | #define CRYPTO_PUBLICKEYBYTES PUBLICKEY_BYTES
18 | #define CRYPTO_SECRETKEYBYTES SECRETKEY_BYTES
19 | #define CRYPTO_BYTES SIGNATURE_BYTES
20 | #define PRIVATEKEY_BYTES SECRETKEY_BYTES
21 | #define CRYPTO_VERSION "__TODAY__"
22 | 
23 | #define CRYPTO_DETERMINISTIC 1
24 | 
25 | 


--------------------------------------------------------------------------------
/src/bat/dh.c:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @file dh.c
 3 |  * @copyright
 4 |  *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 5 |  *   Released under the MIT License.  See LICENSE.txt for license information.
 6 |  * @author Mike Hamburg
 7 |  * @brief BATMAN / SUPERCOP glue for benchmarking.
 8 |  */
 9 | 
10 | #include <string.h>
11 | #include <stdlib.h>
12 | #include "api.h"
13 | #include "crypto_dh.h"
14 | #include "randombytes.h"
15 | /* SUPERCOP DH keygen glue: random seed -> sk (derive) -> pk; always returns 0. */
16 | int crypto_dh_keypair (
17 |     unsigned char pk[PUBLICKEY_BYTES], unsigned char sk[SECRETKEY_BYTES]
18 | ) {
19 |     decaf_448_private_key_s *priv = (decaf_448_private_key_s *)sk;
20 |     decaf_448_symmetric_key_t seed;
21 |     randombytes(seed, sizeof(seed));
22 |     decaf_448_derive_private_key(priv, seed);
23 |     decaf_448_private_to_public(pk, priv);
24 |     return 0;
25 | }
26 | /* SUPERCOP DH glue: asks for SHAREDSECRET_BYTES of shared secret in s and */
27 | /* returns the logical negation of decaf_448_shared_secret's result. */
28 | int crypto_dh (
29 |     unsigned char s[SHAREDSECRET_BYTES],
30 |     const unsigned char pk[PUBLICKEY_BYTES],
31 |     const unsigned char sk[SECRETKEY_BYTES]
32 | ) {
33 |     const decaf_448_private_key_s *priv = (const decaf_448_private_key_s *)sk;
34 |     return !decaf_448_shared_secret(s, SHAREDSECRET_BYTES, priv, pk);
35 | }
36 | 


--------------------------------------------------------------------------------
/src/p448/f_field.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @file f_field.h
 3 |  * @brief Field-specific code.
 4 |  * @copyright
 5 |  *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 6 |  *   Released under the MIT License.  See LICENSE.txt for license information.
 7 |  * @author Mike Hamburg
 8 |  */
 9 | #ifndef __F_FIELD_H__
10 | #define __F_FIELD_H__ 1
11 | 
12 | #include "constant_time.h"
13 | #include <string.h>
14 | 
15 | #include "p448.h"
16 | #define FIELD_LIT_LIMB_BITS  56
17 | #define FIELD_BITS           448
18 | #define field_t              p448_t
19 | #define field_mul            p448_mul
20 | #define field_sqr            p448_sqr
21 | #define field_add_RAW        p448_add_RAW
22 | #define field_sub_RAW        p448_sub_RAW
23 | #define field_mulw           p448_mulw
24 | #define field_bias           p448_bias
25 | #define field_isr            p448_isr
26 | #define field_inverse        p448_inverse
27 | #define field_weak_reduce    p448_weak_reduce
28 | #define field_strong_reduce  p448_strong_reduce
29 | #define field_serialize      p448_serialize
30 | #define field_deserialize    p448_deserialize
31 | 
32 | #endif /* __F_FIELD_H__ */
33 | 


--------------------------------------------------------------------------------
/src/p480/f_field.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @file f_field.h
 3 |  * @brief Field-specific code.
 4 |  * @copyright
 5 |  *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 6 |  *   Released under the MIT License.  See LICENSE.txt for license information.
 7 |  * @author Mike Hamburg
 8 |  */
 9 | #ifndef __F_FIELD_H__
10 | #define __F_FIELD_H__ 1
11 | 
12 | #include "constant_time.h"
13 | #include <string.h>
14 | 
15 | #include "p480.h"
16 | #define FIELD_LIT_LIMB_BITS  60
17 | #define FIELD_BITS           480
18 | #define field_t              p480_t
19 | #define field_mul            p480_mul
20 | #define field_sqr            p480_sqr
21 | #define field_add_RAW        p480_add_RAW
22 | #define field_sub_RAW        p480_sub_RAW
23 | #define field_mulw           p480_mulw
24 | #define field_bias           p480_bias
25 | #define field_isr            p480_isr
26 | #define field_inverse        p480_inverse
27 | #define field_weak_reduce    p480_weak_reduce
28 | #define field_strong_reduce  p480_strong_reduce
29 | #define field_serialize      p480_serialize
30 | #define field_deserialize    p480_deserialize
31 | 
32 | #endif /* __F_FIELD_H__ */
33 | 


--------------------------------------------------------------------------------
/src/p521/f_field.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @file f_field.h
 3 |  * @brief Field-specific code.
 4 |  * @copyright
 5 |  *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 6 |  *   Released under the MIT License.  See LICENSE.txt for license information.
 7 |  * @author Mike Hamburg
 8 |  */
 9 | #ifndef __F_FIELD_H__
10 | #define __F_FIELD_H__ 1
11 | 
12 | #include <string.h>
13 | #include "constant_time.h"
14 | 
15 | #include "p521.h"
16 | #define FIELD_LIT_LIMB_BITS  58
17 | #define FIELD_BITS           521
18 | #define field_t              p521_t
19 | #define field_mul            p521_mul
20 | #define field_sqr            p521_sqr
21 | #define field_add_RAW        p521_add_RAW
22 | #define field_sub_RAW        p521_sub_RAW
23 | #define field_mulw           p521_mulw
24 | #define field_bias           p521_bias
25 | #define field_isr            p521_isr
26 | #define field_inverse        p521_inverse
27 | #define field_weak_reduce    p521_weak_reduce
28 | #define field_strong_reduce  p521_strong_reduce
29 | #define field_serialize      p521_serialize
30 | #define field_deserialize    p521_deserialize
31 | 
32 | #endif /* __F_FIELD_H__ */
33 | 


--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
 1 | Ed448-Goldilocks, Decaf version.
 2 | 
 3 | This software is an experimental implementation of a new 448-bit elliptic
 4 | curve called Ed448-Goldilocks, with "Decaf" cofactor removal.
 5 | 
 6 | The source files here are all by Mike Hamburg. Most of them are (c)
 7 | 2014-2015 Cryptography Research, Inc (a division of Rambus). All of these
 8 | files are usable under the MIT license contained in LICENSE.txt.
 9 | 
10 | The Makefile is set for my 2013 MacBook Air. You can `make bench` to run
11 | a completely arbitrary set of benchmarks and tests, or `make lib` to build
12 | a stripped-down version of the library. For non-Haswell platforms, you may
13 | need to replace -mavx2 -mbmi2 by an appropriate vector declaration.
14 | 
15 | I've attempted to protect against timing attacks and invalid point attacks,
16 | but have as yet made no attempt to protect against power analysis.
17 | 
18 | This software is incomplete, and lacks documentation. None of the APIs are
19 | yet stable, though they may be getting there. The software is probably not
20 | secure. Please consult TODO.txt for additional agenda items. Do not taunt
21 | happy fun ball.
22 | 
23 | Cheers,
24 | -- Mike Hamburg
25 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2011 Stanford University.
 4 | Copyright (c) 2014 Cryptography Research, Inc.
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/src/p448/f_arithmetic.c:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @cond internal
 3 |  * @file f_arithmetic.c
 4 |  * @copyright
 5 |  *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 6 |  *   Released under the MIT License.  See LICENSE.txt for license information.
 7 |  * @author Mike Hamburg
 8 |  * @brief Field-specific arithmetic.
 9 |  */
10 | 
11 | #include "field.h"
12 | /* Addition chain: a = x^(2^446 - 2^222 - 1), which is x^((p-3)/4) if p = 2^448 - 2^224 - 1. */
13 | void 
14 | field_isr (
15 |     field_a_t a,                          /* output */
16 |     const field_a_t x                     /* input, only read */
17 | ) {
18 |     field_a_t L0, L1, L2;                 /* scratch; comments give each result as a power of x */
19 |     field_sqr  (   L1,     x );           /* L1 = x^2 */
20 |     field_mul  (   L2,     x,   L1 );     /* L2 = x^(2^2 - 1) */
21 |     field_sqr  (   L1,   L2 );            /* L1 = x^(2^3 - 2) */
22 |     field_mul  (   L2,     x,   L1 );     /* L2 = x^(2^3 - 1) */
23 |     field_sqrn (   L1,   L2,     3 );     /* L1 = x^(2^6 - 2^3) */
24 |     field_mul  (   L0,   L2,   L1 );      /* L0 = x^(2^6 - 1) */
25 |     field_sqrn (   L1,   L0,     3 );     /* L1 = x^(2^9 - 2^3) */
26 |     field_mul  (   L0,   L2,   L1 );      /* L0 = x^(2^9 - 1) */
27 |     field_sqrn (   L2,   L0,     9 );     /* L2 = x^(2^18 - 2^9) */
28 |     field_mul  (   L1,   L0,   L2 );      /* L1 = x^(2^18 - 1) */
29 |     field_sqr  (   L0,   L1 );            /* L0 = x^(2^19 - 2) */
30 |     field_mul  (   L2,     x,   L0 );     /* L2 = x^(2^19 - 1) */
31 |     field_sqrn (   L0,   L2,    18 );     /* L0 = x^(2^37 - 2^18) */
32 |     field_mul  (   L2,   L1,   L0 );      /* L2 = x^(2^37 - 1) */
33 |     field_sqrn (   L0,   L2,    37 );     /* L0 = x^(2^74 - 2^37) */
34 |     field_mul  (   L1,   L2,   L0 );      /* L1 = x^(2^74 - 1) */
35 |     field_sqrn (   L0,   L1,    37 );     /* L0 = x^(2^111 - 2^37) */
36 |     field_mul  (   L1,   L2,   L0 );      /* L1 = x^(2^111 - 1) */
37 |     field_sqrn (   L0,   L1,   111 );     /* L0 = x^(2^222 - 2^111) */
38 |     field_mul  (   L2,   L1,   L0 );      /* L2 = x^(2^222 - 1) */
39 |     field_sqr  (   L0,   L2 );            /* L0 = x^(2^223 - 2) */
40 |     field_mul  (   L1,     x,   L0 );     /* L1 = x^(2^223 - 1) */
41 |     field_sqrn (   L0,   L1,   223 );     /* L0 = x^(2^446 - 2^223) */
42 |     field_mul  (     a,   L2,   L0 );     /* a  = x^(2^446 - 2^222 - 1) */
43 | }
44 | 


--------------------------------------------------------------------------------
/src/p521/f_arithmetic.c:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @cond internal
 3 |  * @file f_arithmetic.c
 4 |  * @copyright
 5 |  *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 6 |  *   Released under the MIT License.  See LICENSE.txt for license information.
 7 |  * @author Mike Hamburg
 8 |  * @brief Field-specific arithmetic.
 9 |  */
10 | 
11 | #include "field.h"
12 | /* Addition chain: a = x^(2^519 - 1), which is x^((p-3)/4) if p = 2^521 - 1. */
13 | void 
14 | field_isr (
15 |     field_a_t a,                          /* output */
16 |     const field_a_t x                     /* input, only read */
17 | ) {
18 |     field_a_t L0, L1, L2;                 /* scratch; comments give each result as a power of x */
19 |     field_sqr  (   L1,     x );           /* L1 = x^2 */
20 |     field_mul  (   L0,     x,   L1 );     /* L0 = x^(2^2 - 1) */
21 |     field_sqrn (   L2,   L0,     2 );     /* L2 = x^(2^4 - 2^2) */
22 |     field_mul  (   L1,   L0,   L2 );      /* L1 = x^(2^4 - 1) */
23 |     field_sqrn (   L2,   L1,     4 );     /* L2 = x^(2^8 - 2^4) */
24 |     field_mul  (   L0,   L1,   L2 );      /* L0 = x^(2^8 - 1) */
25 |     field_sqrn (   L2,   L0,     8 );     /* L2 = x^(2^16 - 2^8) */
26 |     field_mul  (   L1,   L0,   L2 );      /* L1 = x^(2^16 - 1) */
27 |     field_sqrn (   L2,   L1,    16 );     /* L2 = x^(2^32 - 2^16) */
28 |     field_mul  (   L0,   L1,   L2 );      /* L0 = x^(2^32 - 1) */
29 |     field_sqrn (   L2,   L0,    32 );     /* L2 = x^(2^64 - 2^32) */
30 |     field_mul  (   L1,   L0,   L2 );      /* L1 = x^(2^64 - 1) */
31 |     field_sqr  (   L2,   L1 );            /* L2 = x^(2^65 - 2) */
32 |     field_mul  (   L0,     x,   L2 );     /* L0 = x^(2^65 - 1) */
33 |     field_sqrn (   L2,   L0,    64 );     /* L2 = x^(2^129 - 2^64) */
34 |     field_mul  (   L0,   L1,   L2 );      /* L0 = x^(2^129 - 1) */
35 |     field_sqrn (   L2,   L0,   129 );     /* L2 = x^(2^258 - 2^129) */
36 |     field_mul  (   L1,   L0,   L2 );      /* L1 = x^(2^258 - 1) */
37 |     field_sqr  (   L2,   L1 );            /* L2 = x^(2^259 - 2) */
38 |     field_mul  (   L0,     x,   L2 );     /* L0 = x^(2^259 - 1) */
39 |     field_sqrn (   L2,   L0,   259 );     /* L2 = x^(2^518 - 2^259) */
40 |     field_mul  (   L1,   L0,   L2 );      /* L1 = x^(2^518 - 1) */
41 |     field_sqr  (   L0,   L1 );            /* L0 = x^(2^519 - 2) */
42 |     field_mul  (     a,     x,   L0 );    /* a  = x^(2^519 - 1) */
43 | }
44 | 


--------------------------------------------------------------------------------
/src/p480/f_arithmetic.c:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @cond internal
 3 |  * @file f_arithmetic.c
 4 |  * @copyright
 5 |  *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 6 |  *   Released under the MIT License.  See LICENSE.txt for license information.
 7 |  * @author Mike Hamburg
 8 |  * @brief Field-specific arithmetic.
 9 |  */
10 | 
11 | #include "field.h"
12 | /* Addition chain: a = x^(2^478 - 2^238 - 1), which is x^((p-3)/4) if p = 2^480 - 2^240 - 1. */
13 | void 
14 | field_isr (
15 |     field_a_t a,                          /* output */
16 |     const field_a_t x                     /* input, only read */
17 | ) {
18 |     field_a_t L0, L1, L2, L3;             /* scratch; comments give each result as a power of x */
19 |     field_sqr  (   L2,     x );           /* L2 = x^2 */
20 |     field_mul  (   L1,     x,   L2 );     /* L1 = x^(2^2 - 1) */
21 |     field_sqrn (   L0,   L1,     2 );     /* L0 = x^(2^4 - 2^2) */
22 |     field_mul  (   L2,   L1,   L0 );      /* L2 = x^(2^4 - 1) */
23 |     field_sqrn (   L0,   L2,     4 );     /* L0 = x^(2^8 - 2^4) */
24 |     field_mul  (   L1,   L2,   L0 );      /* L1 = x^(2^8 - 1) */
25 |     field_sqr  (   L0,   L1 );            /* L0 = x^(2^9 - 2) */
26 |     field_mul  (   L2,     x,   L0 );     /* L2 = x^(2^9 - 1) */
27 |     field_sqrn (   L0,   L2,     8 );     /* L0 = x^(2^17 - 2^8) */
28 |     field_mul  (   L2,   L1,   L0 );      /* L2 = x^(2^17 - 1) */
29 |     field_sqrn (   L0,   L2,    17 );     /* L0 = x^(2^34 - 2^17) */
30 |     field_mul  (   L1,   L2,   L0 );      /* L1 = x^(2^34 - 1) */
31 |     field_sqrn (   L0,   L1,    17 );     /* L0 = x^(2^51 - 2^17) */
32 |     field_mul  (   L1,   L2,   L0 );      /* L1 = x^(2^51 - 1) */
33 |     field_sqrn (   L3,   L1,    17 );     /* L3 = x^(2^68 - 2^17) */
34 |     field_mul  (   L0,   L2,   L3 );      /* L0 = x^(2^68 - 1) */
35 |     field_sqrn (   L2,   L0,    51 );     /* L2 = x^(2^119 - 2^51) */
36 |     field_mul  (   L0,   L1,   L2 );      /* L0 = x^(2^119 - 1) */
37 |     field_sqrn (   L1,   L0,   119 );     /* L1 = x^(2^238 - 2^119) */
38 |     field_mul  (   L2,   L0,   L1 );      /* L2 = x^(2^238 - 1) */
39 |     field_sqr  (   L0,   L2 );            /* L0 = x^(2^239 - 2) */
40 |     field_mul  (   L1,     x,   L0 );     /* L1 = x^(2^239 - 1) */
41 |     field_sqrn (   L0,   L1,   239 );     /* L0 = x^(2^478 - 2^239) */
42 |     field_mul  (     a,   L2,   L0 );     /* a  = x^(2^478 - 2^238 - 1) */
43 | }
44 | 


--------------------------------------------------------------------------------
/src/include/decaf_448_config.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @file decaf_448_config.h
 3 |  * @author Mike Hamburg
 4 |  *
 5 |  * @copyright
 6 |  *   Copyright (c) 2015 Cryptography Research, Inc.  \n
 7 |  *   Released under the MIT License.  See LICENSE.txt for license information.
 8 |  *
 9 |  * @brief Configuration for decaf_fast.c
10 |  */
11 | #ifndef __DECAF_448_CONFIG_H__
12 | #define __DECAF_448_CONFIG_H__ 1
13 | 
14 | /**
15 |  * Use the Montgomery ladder for direct scalarmul.
16 |  *
17 |  * The Montgomery ladder is faster than Edwards scalarmul, but providing
18 |  * the features Decaf supports (cofactor elimination, twist rejection)
19 |  * makes it complicated and adds code.  Removing the ladder saves a few
20 |  * kilobytes at the cost of perhaps 5-10% overhead in direct scalarmul
21 |  * time.
22 |  */
23 | #define DECAF_USE_MONTGOMERY_LADDER 1
24 | 
25 | /** The number of comb tables for fixed base scalarmul. */
26 | #define DECAF_COMBS_N 5
27 | 
28 | /** The number of teeth per comb for fixed base scalarmul. */
29 | #define DECAF_COMBS_T 5
30 | 
31 | /** The comb spacing for fixed base scalarmul. */
32 | #define DECAF_COMBS_S 18
33 | 
34 | /** Performance tuning: the width of the fixed window for scalar mul. */
35 | #define DECAF_WINDOW_BITS 5
36 | 
37 | /**
38 |  * The number of bits used for the precomputed table in variable-time
39 |  * double scalarmul.
40 |  */
41 | #define DECAF_WNAF_FIXED_TABLE_BITS 5
42 | 
43 | /**
44 |  * Performance tuning: bits used for the variable table in variable-time
45 |  * double scalarmul.
46 |  */
47 | #define DECAF_WNAF_VAR_TABLE_BITS 3
48 | 
49 | 
50 | #endif /* __DECAF_448_CONFIG_H__ */
51 | 


--------------------------------------------------------------------------------
/src/bat/sign.c:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @file sign.c
 3 |  * @copyright
 4 |  *   Copyright (c) 2014 Cryptography Research, Inc.  \n
 5 |  *   Released under the MIT License.  See LICENSE.txt for license information.
 6 |  * @author Mike Hamburg
 7 |  * @brief BATMAN / SUPERCOP glue for benchmarking.
 8 |  */
 9 | 
10 | #include <stdlib.h>
11 | #include <string.h>
12 | #include "api.h"
13 | #include "crypto_sign.h"
14 | #include "randombytes.h"
15 | /* SUPERCOP signing keygen glue: random seed -> sk (derive) -> pk; returns 0. */
16 | int crypto_sign_keypair (
17 |     unsigned char pk[PUBLICKEY_BYTES],
18 |     unsigned char sk[SECRETKEY_BYTES]
19 | ) {
20 |     /* View the caller's sk buffer as the library's private-key struct. */
21 |     decaf_448_private_key_s *priv = (decaf_448_private_key_s *)sk;
22 |     decaf_448_symmetric_key_t seed;
23 |     randombytes(seed, sizeof(seed));
24 |     decaf_448_derive_private_key(priv, seed);
25 |     decaf_448_private_to_public(pk, priv);
26 |     return 0;
27 | }
28 | /* SUPERCOP sign glue: writes signature || message to sm and its length to *smlen. */
29 | int crypto_sign (
30 |     unsigned char *sm,
31 |     unsigned long long *smlen,
32 |     const unsigned char *m,
33 |     unsigned long long mlen,
34 |     const unsigned char sk[SECRETKEY_BYTES]
35 | ) {
36 |     const decaf_448_private_key_s *priv = (const decaf_448_private_key_s *)sk;
37 |     unsigned char signature[SIGNATURE_BYTES];
38 | 
39 |     /* Sign into a local buffer first: m is still being read here, and the */
40 |     /* memmove below tolerates m overlapping the destination inside sm. */
41 |     decaf_448_sign(signature, priv, m, mlen);
42 |     memmove(sm + SIGNATURE_BYTES, m, mlen);
43 |     memcpy(sm, signature, SIGNATURE_BYTES);
44 |     *smlen = mlen + SIGNATURE_BYTES;
45 |     return 0;
46 | }
47 | /* SUPERCOP verify glue: check the signature at the front of sm against pk. */
48 | int crypto_sign_open (
49 |     unsigned char *m,
50 |     unsigned long long *mlen,
51 |     const unsigned char *sm,
52 |     unsigned long long smlen,
53 |     const unsigned char pk[PUBLICKEY_BYTES]
54 | ) {
55 |     /* Too short to hold a signature: without this check the unsigned */
56 |     /* subtraction below wraps and a huge length reaches the verifier. */
57 |     if (smlen < SIGNATURE_BYTES) { return -1; }
58 |     int ret = decaf_448_verify(sm, pk, sm + SIGNATURE_BYTES, smlen - SIGNATURE_BYTES);
59 |     if (!ret) { return -1; }  /* rejected: m and *mlen are left untouched */
60 |     /* Accepted: hand back the message that follows the signature. */
61 |     *mlen = smlen - SIGNATURE_BYTES;
62 |     memmove(m, sm + SIGNATURE_BYTES, *mlen);
63 |     return 0;
64 | }
65 | 


--------------------------------------------------------------------------------
/TODO.txt:
--------------------------------------------------------------------------------
 1 | Important work items for Ed448-Goldilocks / decaf:
 2 | 
 3 | * Factor out hash, crandom from core library?
 4 |     [DONE, except for C++ headers]
 5 | 
 6 | * Signed 32-bit NEON implementation to avoid bias/reduce after subtract
 7 | 
 8 | * Documentation: write high-level API docs, and internal docs to help
 9 |   other implementors.
10 |     * Partial progress on Doxygenating the code.
11 | 
12 | * Documentation: write a spec or add to Watson's
13 | 
14 | * Cleanup: rename everything consistently.
15 |     * namespace_op or op_namespace?  namespace_op_type?
16 |     * We don't have to be super-careful with the namespacing, because
17 |       symbols will be scrubbed by visibility
18 | 
19 | * Cleanup: unify intrinsics code
20 |     * Word_t, mask_t, bigregister_t, etc.
21 |     * Generate asm intrinsics with a script?
22 | 
23 | * Testing:
24 |     * More testing.  Testing, testing and testing.
25 |     * Test corner cases better.
26 | 
27 | * Safety: add static analysis attributes for compilers that support them
28 |     * Most functions now have warn on ignored return.
29 |     * [ MOSTLY DONE ]
30 | 
31 | * Safety:
32 |     * Decide what to do about RNG failures
33 |         * abort
34 |         * return error and zeroize
35 |         * return error but continue if RNG is kind of mostly OK
36 | 
37 | * High-level API: [DONE]
38 | 
39 | * Portability: test and make clean with other compilers
40 |     * Using a fair amount of __attribute__ code.
41 |     * [DONE] Should work for GCC now.
42 | 
43 | * Portability: try to make the vector code as portable as possible
44 |     * Currently using clang ext_vector_length.
45 |     * I can't get a simple for-loop to autovectorize :-/
46 |     * SAGE tool?
47 | 
48 | * [DONE] Portability: make the outer layers of the code 32-bit clean.
49 | 
50 | * [DONE] Performance/flexibility: decide which parameters should be hard-coded.
51 |     * Perhaps useful for comb precomputation.
52 | 
53 | * Performance: Improve SHAKE.
54 |     * Improve speed.  (Maybe)
55 | 
56 | * Clear other TODO/FIXME/HACK/PERF items in the code
57 | 
58 | * Submit Decaf to SUPERCOP
59 | 


--------------------------------------------------------------------------------
/test/shakesum.c:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @cond internal
 3 |  * @file shakesum.c
 4 |  * @copyright
 5 |  *   Copyright (c) 2015 Cryptography Research, Inc.  \n
 6 |  *   Released under the MIT License.  See LICENSE.txt for license information.
 7 |  * @author Mike Hamburg
 8 |  * @brief SHA3 utility, to be combined with test vectors eventually...
 9 |  */
10 | 
11 | #include <stdio.h>
12 | #include <unistd.h>
13 | #include <string.h>
14 | #include "shake.h"
15 | 
16 | /**
17 |  * Hash standard input and print the digest as lowercase hex plus a newline.
18 |  *
19 |  * argv[1], when present, selects shake256, shake128, sha3-224, sha3-256,
20 |  * sha3-384 or sha3-512 (all-lowercase or all-uppercase spelling); any other
21 |  * value keeps the SHAKE256 default.  Both SHAKE variants print 512 bytes.
22 |  *
23 |  * @return 0 on success, 1 if read() on standard input reports an error.
24 |  */
25 | int main(int argc, char **argv) {
26 |     keccak_sponge_t sponge;
27 |     unsigned char buf[1024];
28 | 
29 |     unsigned int outlen = 512;
30 |     shake256_init(sponge);
31 | 
32 |     /* Sloppy.  Real utility would parse --algo, --size ... */
33 |     if (argc > 1) {
34 |         if (!strcmp(argv[1], "shake256") || !strcmp(argv[1], "SHAKE256")) {
35 |             outlen = 512;
36 |             shake256_init(sponge);
37 |         } else if (!strcmp(argv[1], "shake128") || !strcmp(argv[1], "SHAKE128")) {
38 |             outlen = 512;
39 |             shake128_init(sponge);
40 |         } else if (!strcmp(argv[1], "sha3-224") || !strcmp(argv[1], "SHA3-224")) {
41 |             outlen = 224/8;
42 |             sha3_224_init(sponge);
43 |         } else if (!strcmp(argv[1], "sha3-256") || !strcmp(argv[1], "SHA3-256")) {
44 |             outlen = 256/8;
45 |             sha3_256_init(sponge);
46 |         } else if (!strcmp(argv[1], "sha3-384") || !strcmp(argv[1], "SHA3-384")) {
47 |             outlen = 384/8;
48 |             sha3_384_init(sponge);
49 |         } else if (!strcmp(argv[1], "sha3-512") || !strcmp(argv[1], "SHA3-512")) {
50 |             outlen = 512/8;
51 |             sha3_512_init(sponge);
52 |         }
53 |     }
54 | 
55 |     ssize_t red;
56 |     do {
57 |         red = read(0, buf, sizeof(buf));
58 |         if (red>0) sha3_update(sponge,buf,red);
59 |     } while (red>0);
60 | 
61 |     if (red < 0) {
62 |         /* Without this, a failed read silently hashes truncated input. */
63 |         perror("read");
64 |         sponge_destroy(sponge);
65 |         return 1;
66 |     }
67 | 
68 |     sha3_output(sponge,buf,outlen);
69 |     sponge_destroy(sponge);
70 |     unsigned i;
71 |     for (i=0; i<outlen; i++) {
72 |         printf("%02x", buf[i]);
73 |     }
74 |     printf("\n");
75 | 
76 |     return 0;
77 | }
78 | 


--------------------------------------------------------------------------------
/src/include/field.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @file field.h
  3 |  * @brief Generic field header.
  4 |  * @copyright
  5 |  *   Copyright (c) 2014 Cryptography Research, Inc.  \n
  6 |  *   Released under the MIT License.  See LICENSE.txt for license information.
  7 |  * @author Mike Hamburg
  8 |  */
  9 | 
 10 | #ifndef __FIELD_H__
 11 | #define __FIELD_H__
 12 | 
 13 | #include "constant_time.h"
 14 | #include "f_field.h"
 15 | #include <string.h>
 16 | 
 17 | typedef struct field_t field_a_t[1];
 18 | #define field_a_restrict_t struct field_t *__restrict__
 19 | 
 20 | #define is32 (GOLDI_BITS == 32 || FIELD_BITS != 448)
 21 | #if (is32)
 22 | #define IF32(s) (s)
 23 | #else
 24 | #define IF32(s)
 25 | #endif
 26 | 
 27 | /**
 28 |  * Returns 1/sqrt(+- x).
 29 |  * 
 30 |  * The Legendre symbol of the result is the same as that of the
 31 |  * input.
 32 |  * 
 33 |  * If x=0, returns 0.
 34 |  */
 35 | void
 36 | field_isr (
 37 |     field_a_t       a,
 38 |     const field_a_t x
 39 | );
 40 | 
 41 | /**
 42 |  * Returns 1/x.
 43 |  * 
 44 |  * If x=0, returns 0.
 45 |  *
 46 |  * TODO: this is currently unused in Decaf, but I've left a decl
 47 |  * for it because field_inverse is different (and simpler) than
 48 |  * field_isr for 5-mod-8 fields.
 49 |  */
 50 | void
 51 | field_inverse (
 52 |     field_a_t       a,
 53 |     const field_a_t x
 54 | );
 55 |     
 56 | /**
 57 |  * Compute y = x^(2^n) by n squarings; n > 0 is asserted.  y is restrict-qualified.
 58 |  */
 59 | static __inline__ void __attribute__((unused,always_inline))
 60 | field_sqrn (
 61 |     field_a_restrict_t y,
 62 |     const field_a_t x,
 63 |     int n
 64 | ) {
 65 |     field_a_t scratch;
 66 |     assert(n>0);
 67 |     /* Use up 1 squaring if n is odd, else 2; the rest go in y->scratch->y pairs. */
 68 |     if (!(n&1)) {
 69 |         field_sqr(scratch,x);
 70 |         field_sqr(y,scratch);
 71 |         n -= 2;
 72 |     } else {
 73 |         field_sqr(y,x);
 74 |         n -= 1;
 75 |     }
 76 |     for (; n != 0; n -= 2) {
 77 |         field_sqr(scratch,y);
 78 |         field_sqr(y,scratch);
 79 |     }
 80 | }
 81 | /**
 82 |  * d = a - b (field_sub_RAW), biased by 2; weak-reduced only if is32 (via IF32).
 83 |  * NOTE(review): is32 tests GOLDI_BITS, but the arch_config.h files shown
 84 |  * define WORD_BITS; an undefined macro evaluates to 0 in #if -- confirm.
 85 |  */
 86 | static __inline__ void
 87 | field_subx_RAW ( field_a_t d, const field_a_t a, const field_a_t b ) {
 88 |     field_sub_RAW(d, a, b);
 89 |     field_bias(d, 2);
 90 |     IF32(field_weak_reduce(d));
 91 | }
 92 | /**
 93 |  * d = a - b: raw subtraction, then field_bias(d, 2), then a weak reduce
 94 |  * that (unlike field_subx_RAW) is never compiled out.
 95 |  */
 96 | static __inline__ void
 97 | field_sub ( field_a_t d, const field_a_t a, const field_a_t b )
 98 | {
 99 |     field_sub_RAW(d, a, b);
100 |     field_bias(d, 2);
101 |     field_weak_reduce(d);
102 | }
103 | /**
104 |  * d = a + b.
105 |  * Implemented as field_add_RAW followed by field_weak_reduce on the result.
106 |  */
107 | static __inline__ void
108 | field_add ( field_a_t d, const field_a_t a, const field_a_t b )
109 | {
110 |     field_add_RAW(d, a, b);
111 |     field_weak_reduce(d);
112 | }
113 | 
114 | /** Require the warning annotation on raw routines */
115 | #define ANALYZE_THIS_ROUTINE_CAREFULLY const int ANNOTATE___ANALYZE_THIS_ROUTINE_CAREFULLY = 0;
116 | #define MUST_BE_CAREFUL (void) ANNOTATE___ANALYZE_THIS_ROUTINE_CAREFULLY
117 | #define field_add_nr(a,b,c) { MUST_BE_CAREFUL; field_add_RAW(a,b,c); }
118 | #define field_sub_nr(a,b,c) { MUST_BE_CAREFUL; field_sub_RAW(a,b,c); }
119 | #define field_subx_nr(a,b,c) { MUST_BE_CAREFUL; field_subx_RAW(a,b,c); }
120 | 
121 | #endif // __FIELD_H__
122 | 


--------------------------------------------------------------------------------
/src/p521/arch_ref64/p521.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | #ifndef __P521_H__
  5 | #define __P521_H__ 1
  6 | 
  7 | #include <stdint.h>
  8 | #include <assert.h>
  9 | #include <string.h>
 10 | 
 11 | #include "word.h"
 12 | /* Element of the p521 field, stored as nine 64-bit limbs. */
 13 | typedef struct p521_t {
 14 |   uint64_t limb[9]; /* p521_weak_reduce masks limb 8 to 57 bits and lower limbs to 58 (8*58 + 57 = 521) */
 15 | } p521_t;
 16 | 
 17 | #ifdef __cplusplus
 18 | extern "C" {
 19 | #endif
 20 | 
 21 | static __inline__ void
 22 | p521_add_RAW (
 23 |     p521_t *out,
 24 |     const p521_t *a,
 25 |     const p521_t *b
 26 | ) __attribute__((unused));
 27 |              
 28 | static __inline__ void
 29 | p521_sub_RAW (
 30 |     p521_t *out,
 31 |     const p521_t *a,
 32 |     const p521_t *b
 33 | ) __attribute__((unused));
 34 |              
 35 | static __inline__ void
 36 | p521_copy (
 37 |     p521_t *out,
 38 |     const p521_t *a
 39 | ) __attribute__((unused));
 40 |              
 41 | static __inline__ void
 42 | p521_weak_reduce (
 43 |     p521_t *inout
 44 | ) __attribute__((unused));
 45 |              
 46 | void
 47 | p521_strong_reduce (
 48 |     p521_t *inout
 49 | );
 50 | 
 51 | static __inline__ void
 52 | p521_bias (
 53 |     p521_t *inout,
 54 |     int amount
 55 | ) __attribute__((unused));
 56 |          
 57 | void
 58 | p521_mul (
 59 |     p521_t *__restrict__ out,
 60 |     const p521_t *a,
 61 |     const p521_t *b
 62 | );
 63 | 
 64 | void
 65 | p521_mulw (
 66 |     p521_t *__restrict__ out,
 67 |     const p521_t *a,
 68 |     uint64_t b
 69 | );
 70 | 
 71 | void
 72 | p521_sqr (
 73 |     p521_t *__restrict__ out,
 74 |     const p521_t *a
 75 | );
 76 | 
 77 | void
 78 | p521_serialize (
 79 |     uint8_t *serial,
 80 |     const struct p521_t *x
 81 | );
 82 | 
 83 | mask_t
 84 | p521_deserialize (
 85 |     p521_t *x,
 86 |     const uint8_t serial[66]
 87 | );
 88 | 
 89 | /* -------------- Inline functions begin here -------------- */
 90 | 
 91 | /** out = a + b limb by limb, followed by p521_weak_reduce on the result. */
 92 | void
 93 | p521_add_RAW (p521_t *out, const p521_t *a, const p521_t *b) {
 94 |     unsigned int k = 0;
 95 |     /* Each limb is summed independently; carries are left to the reduce. */
 96 |     while (k < 9) {
 97 |         out->limb[k] = a->limb[k] + b->limb[k];
 98 |         k++;
 99 |     }
100 | 
101 |     p521_weak_reduce(out);
102 | }
103 | 
104 | /** out = a - b limb-wise plus the limbs of 4*(2^521-1), then p521_weak_reduce. */
105 | void
106 | p521_sub_RAW (p521_t *out, const p521_t *a, const p521_t *b) {
107 |     /* Per-limb share of the added constant: limbs 0..7, then the 57-bit top limb. */
108 |     const uint64_t low_bias = 4 * ((1ull<<58)-1);
109 |     const uint64_t top_bias = 4 * ((1ull<<57)-1);
110 |     unsigned int k;
111 | 
112 |     for (k=0; k<9; k++) {
113 |         out->limb[k] = a->limb[k] - b->limb[k] + ((k<8) ? low_bias : top_bias);
114 |     }
115 |     p521_weak_reduce(out);
116 | }
117 | 
118 | /** Copy the whole element *a into *out with memcpy. */
119 | void
120 | p521_copy (p521_t *out, const p521_t *a) {
121 |     const size_t nbytes = sizeof(p521_t);
122 | 
123 |     memcpy(out, a, nbytes);
124 | }
125 | 
126 | /** No-op in this backend: both arguments are ignored. */
127 | void
128 | p521_bias (p521_t *a, int amt) {
129 |     /* The void casts mark the parameters as intentionally unused. */
130 |     (void) amt;
131 |     (void) a;
132 | 
133 | }
134 | 
135 | /** Move each limb's overflow bits up one limb; the top limb's overflow wraps to limb 0. */
136 | void
137 | p521_weak_reduce (p521_t *a) {
138 |     const uint64_t mask58 = (1ull<<58) - 1, mask57 = mask58 >> 1;
139 |     const uint64_t wrap = a->limb[8] >> 57; /* read before limb 8 is masked */
140 |     int k;
141 |     for (k=8; k>=1; k--) { /* high to low: limb[k-1] is still unmodified when read */
142 |         const uint64_t keep = a->limb[k] & ((k<8) ? mask58 : mask57);
143 |         a->limb[k] = keep + (a->limb[k-1]>>58);
144 |     }
145 |     a->limb[0] = (a->limb[0] & mask58) + wrap;
146 | }
147 | 
148 | #ifdef __cplusplus
149 | }; /* extern "C" */
150 | #endif
151 | 
152 | #endif /* __P521_H__ */
153 | 


--------------------------------------------------------------------------------
/aux/decaffeinate_curve25519.sage:
--------------------------------------------------------------------------------
  1 | # This is a sketch of how to decaffeinate Curve25519
  2 | 
  3 | F = GF(2^255-19)
  4 | d = -121665
  5 | M = EllipticCurve(F,[0,2-4*d,0,1,0])
  6 |     
  7 | sqrtN1 = sqrt(F(-1))
  8 |     
  9 | def maybe(): return randint(0,1)  # coin flip: a random integer, 0 or 1
 10 | 
 11 | def qpositive(x):  # True when the integer lift of x lies in the lower half, [0, (p-1)/2]
 12 |     return not (int(x) > (2^255-19-1)/2)
 13 | 
 14 | def M_to_E(P):
 15 |     # Map a point P of M to a pair (X,Y); P must be even, so that its x is a square.
 16 |     x, y = P.xy()
 17 |     assert x.is_square()
 18 |     
 19 |     s = sqrt(x)
 20 |     t = 1 if s == 0 else y/s
 21 |     X = 2*s / (1+s^2)
 22 |     Y = (1-s^2) / t
 23 |     
 24 |     # Two coin flips: optionally negate both coordinates, then optionally map (X,Y) to (Y,-X).
 25 |     if maybe(): X,Y = -X,-Y
 26 |     if maybe(): X,Y = Y,-X
 27 |     return X,Y
 28 | 
 29 | def decaf_encode_from_E(X,Y):
 30 |     # Reference encoder: turns a curve point (X,Y) into one "positive" field element s.
 31 |     assert X^2 + Y^2 == 1 + d*X^2*Y^2
 32 |     if not qpositive(X*Y):
 33 |         X,Y = Y,-X  # switch to (Y,-X) so that X*Y becomes "positive"
 34 |     assert qpositive(X*Y)
 35 |     
 36 |     one_minus_xx = 1-X^2
 37 |     assert one_minus_xx.is_square()
 38 |     sx = sqrt(one_minus_xx)
 39 |     if not qpositive(-2*sx/X/Y): sx = -sx
 40 |     s = (1 + sx) / X
 41 |     return s if qpositive(s) else -s
 42 | 
 43 | def isqrt(x):  # returns r with r^2 * x == 1, using a fixed chain of squarings and multiplications
 44 |     ops = [(1,2),(1,2),(3,1),(6,0),(1,2),(12,1),(25,1),(25,1),(50,0),(125,0),(2,2),(1,2)]  # (squarings, st index to multiply by)
 45 |     st = [x,x,x]  # st[0] and st[1] alternate as the accumulator; st[2] always stays x
 46 |     for i,(sh,add) in enumerate(ops):
 47 |         od = i&1
 48 |         st[od] = st[od^^1]^(2^sh)*st[add]
 49 |     # now st[0] == x^(2^252-3) and st[1] == x^(2^253-5) == st[0]^2 * x
 50 |     # st[1] must be +1 or -1; when it is -1, multiplying st[0] by sqrt(-1) flips the sign of its square
 51 |     assert st[1] == 1 or st[1] == -1
 52 |     if st[1] == 1: return st[0]
 53 |     else: return st[0] * sqrtN1
 54 | 
 55 | def decaf_encode_from_E_c(X,Y):
 56 |     # Encoder without field divisions: lifts (X,Y) to (X,Y,Z,T) with a random Z, then uses one isqrt call.
 57 |     Z = F.random_element()
 58 |     T, X, Y = X*Y*Z, X*Z, Y*Z
 59 |     assert X^2 + Y^2 == Z^2 + d*T^2
 60 |     
 61 |     # Precompute
 62 |     sd = sqrt(F(1-d))
 63 |     
 64 |     zx = Z^2-X^2
 65 |     TZ = T*Z
 66 |     assert zx.is_square()    # must be called: the bare bound method is always truthy
 67 |     ooAll = isqrt(zx*TZ^2)   # ooAll^2 == 1/(zx*TZ^2)
 68 |     osx = ooAll * TZ         # osx^2 == 1/zx
 69 |     ooTZ = ooAll * zx * osx  # == 1/(T*Z)
 70 |     
 71 |     # floop is the sign test on T^2/(T*Z) == T/Z; when it fails, the else branch sets frob = sd and Y = -X.
 72 |     floop = qpositive(T^2 * ooTZ)
 73 |     if floop:
 74 |         frob = zx * ooTZ
 75 |     else:
 76 |         frob = sd
 77 |         Y = -X
 78 |         
 79 |     osx *= frob
 80 |     
 81 |     if qpositive(-2*osx*Z) != floop: osx = -osx
 82 |     s = Y*(ooTZ*Z + osx)
 83 |     if not qpositive(s): s = -s
 84 |     
 85 |     return s
 86 |     
 87 | def is_rotation((X,Y),(x,y)):  # True when (x,y) is parallel or perpendicular to (X,Y) as a vector
 88 |     return X*y == x*Y or x*X + y*Y == 0
 89 |     
 90 | def decaf_decode_to_E(s):  # reference decoder: s -> (X,Y), with the sign of t chosen so t/s is "positive"
 91 |     assert qpositive(s)
 92 |     t = sqrt(1 + (2-4*d)*s^2 + s^4)
 93 |     t = t if qpositive(t/s) else -t
 94 |     X, Y = 2*s / (s^2+1), (1-s^2) / t
 95 |     assert qpositive(X*Y)
 96 |     return X, Y
 97 |     
 98 | def decaf_decode_to_E_c(s):
 99 |     # Variant of decaf_decode_to_E without field divisions: one isqrt call replaces sqrt and the quotients.
100 |     assert qpositive(s)
101 |     
102 |     ss = s^2
103 |     ss1 = 1+ss
104 |     tt = ss1^2 - 4*d*ss            # equals s^4 + (2-4*d)*s^2 + 1, i.e. t^2
105 |     
106 |     denom = ss1*s
107 |     root  = isqrt(tt*denom^2)      # +-1/(t*denom)
108 |     inv_t = root * denom           # +-1/t
109 |     t_over_denom = root * tt       # +-t/denom
110 |     tos   = t_over_denom * ss1     # +-t/s
111 |     X = 2 * (tos-t_over_denom) * inv_t
112 |     Y = (1-ss) * inv_t
113 |     
114 |     if not qpositive(tos): Y = -Y
115 |     assert qpositive(X*Y)
116 |     return X,Y
117 | 
118 | def test():  # push one random doubled point through both encoders and both decoders
119 |     P = 2*M.random_point()
120 |     start = M_to_E(P)
121 |     X,Y = start
122 |     s = decaf_encode_from_E(X,Y)
123 |     assert decaf_encode_from_E_c(X,Y) == s
124 |     ref_pt = decaf_decode_to_E(s)
125 |     fast_pt = decaf_decode_to_E_c(s)
126 |     for got in (ref_pt, fast_pt): assert is_rotation(start, got)
127 | 
128 | 
129 |     


--------------------------------------------------------------------------------
/src/p448/arch_ref64/p448.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | #ifndef __P448_H__
  5 | #define __P448_H__ 1
  6 | 
  7 | #include <stdint.h>
  8 | #include <assert.h>
  9 | #include <string.h>
 10 | 
 11 | #include "word.h"
 12 | 
 13 | typedef struct p448_t { /* value held as eight 64-bit limbs, struct aligned to 32 bytes */
 14 |   uint64_t limb[8]; /* p448_weak_reduce masks each limb to 56 bits before adding carries */
 15 | } __attribute__((aligned(32))) p448_t;
 16 | /* LBITS matches the 56-bit masks in p448_weak_reduce. FIELD_LITERAL builds an initializer with a in limb[0] through h in limb[7]. */
 17 | #define LBITS 56
 18 | #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
 19 | 
 20 | #ifdef __cplusplus
 21 | extern "C" {
 22 | #endif
 23 | 
 24 | static __inline__ void
 25 | p448_add_RAW (
 26 |     p448_t *out,
 27 |     const p448_t *a,
 28 |     const p448_t *b
 29 | ) __attribute__((unused));
 30 |              
 31 | static __inline__ void
 32 | p448_sub_RAW (
 33 |     p448_t *out,
 34 |     const p448_t *a,
 35 |     const p448_t *b
 36 | ) __attribute__((unused));
 37 |              
 38 | static __inline__ void
 39 | p448_copy (
 40 |     p448_t *out,
 41 |     const p448_t *a
 42 | ) __attribute__((unused));
 43 |              
 44 | static __inline__ void
 45 | p448_weak_reduce (
 46 |     p448_t *inout
 47 | ) __attribute__((unused));
 48 |              
 49 | void
 50 | p448_strong_reduce (
 51 |     p448_t *inout
 52 | );
 53 | 
 54 | static __inline__ void
 55 | p448_bias (
 56 |     p448_t *inout,
 57 |     int amount
 58 | ) __attribute__((unused));
 59 |          
 60 | void
 61 | p448_mul (
 62 |     p448_t *__restrict__ out,
 63 |     const p448_t *a,
 64 |     const p448_t *b
 65 | );
 66 | 
 67 | void
 68 | p448_mulw (
 69 |     p448_t *__restrict__ out,
 70 |     const p448_t *a,
 71 |     uint64_t b
 72 | );
 73 | 
 74 | void
 75 | p448_sqr (
 76 |     p448_t *__restrict__ out,
 77 |     const p448_t *a
 78 | );
 79 | 
 80 | void
 81 | p448_serialize (
 82 |     uint8_t *serial,
 83 |     const struct p448_t *x
 84 | );
 85 | 
 86 | mask_t
 87 | p448_deserialize (
 88 |     p448_t *x,
 89 |     const uint8_t serial[56]
 90 | );
 91 | 
 92 | /* -------------- Inline functions begin here -------------- */
 93 | 
 94 | /** out = a + b limb by limb, followed by p448_weak_reduce on the result. */
 95 | void
 96 | p448_add_RAW (p448_t *out, const p448_t *a, const p448_t *b) {
 97 |     unsigned int k = 0;
 98 |     /* Each limb is summed independently; carries are left to the reduce. */
 99 |     while (k < 8) {
100 |         out->limb[k] = a->limb[k] + b->limb[k];
101 |         k++;
102 |     }
103 | 
104 |     p448_weak_reduce(out);
105 | }
106 | 
107 | /** out = a - b limb-wise plus the limbs of 2*(2^448-2^224-1), then p448_weak_reduce. */
108 | void
109 | p448_sub_RAW (p448_t *out, const p448_t *a, const p448_t *b) {
110 |     /* Share of the added constant: 2*(2^56-1) per limb, two less for limb 4. */
111 |     const uint64_t bias = 2 * ((1ull<<56)-1);
112 |     const uint64_t bias_limb4 = bias - 2;
113 |     unsigned int k;
114 | 
115 |     for (k=0; k<8; k++) {
116 |         out->limb[k] = a->limb[k] - b->limb[k] + ((k!=4) ? bias : bias_limb4);
117 |     }
118 |     p448_weak_reduce(out);
119 | }
120 | 
121 | /** Copy the whole element *a into *out with memcpy. */
122 | void
123 | p448_copy (p448_t *out, const p448_t *a) {
124 |     const size_t nbytes = sizeof(p448_t);
125 | 
126 |     memcpy(out, a, nbytes);
127 | }
128 | 
129 | /** No-op in this backend: both arguments are ignored. */
130 | void
131 | p448_bias (p448_t *a, int amt) {
132 |     /* The void casts mark the parameters as intentionally unused. */
133 |     (void) amt;
134 |     (void) a;
135 | 
136 | }
137 | 
138 | /** Push each limb's bits above 56 into the next limb; the top carry goes to limbs 4 and 0. */
139 | void
140 | p448_weak_reduce (p448_t *a) {
141 |     const uint64_t low56 = (1ull<<56) - 1;
142 |     const uint64_t top_carry = a->limb[7] >> 56;
143 |     int k = 7;
144 |     a->limb[4] += top_carry; /* applied first, so the sweep below sees it */
145 |     while (k > 0) { /* high to low: limb[k-1] is still unmodified when read */
146 |         a->limb[k] = (a->limb[k] & low56) + (a->limb[k-1]>>56);
147 |         k--;
148 |     }
149 |     a->limb[0] = (a->limb[0] & low56) + top_carry;
150 | }
151 | 
152 | #ifdef __cplusplus
153 | }; /* extern "C" */
154 | #endif
155 | 
156 | #endif /* __P448_H__ */
157 | 


--------------------------------------------------------------------------------
/src/p448/arch_32/p448.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | #ifndef __P448_H__
  5 | #define __P448_H__ 1
  6 | 
  7 | #include "word.h"
  8 | 
  9 | #include <stdint.h>
 10 | #include <assert.h>
 11 | 
 12 | typedef struct p448_t { /* value held as sixteen 32-bit limbs, struct aligned to 32 bytes */
 13 |   uint32_t limb[16]; /* p448_weak_reduce masks each limb to 28 bits before adding carries */
 14 | } __attribute__((aligned(32))) p448_t;
 15 | /* LIMB(x) pastes "ull" onto x and yields two initializers: the low LBITS bits of x, then the bits above them. */
 16 | #define LBITS 28
 17 | #define LIMB(x) (x##ull)&((1ull<<LBITS)-1), (x##ull)>>LBITS
 18 | #define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
 19 |     {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}
 20 | 
 21 | #ifdef __cplusplus
 22 | extern "C" {
 23 | #endif
 24 | 
 25 | static __inline__ void
 26 | p448_add_RAW (
 27 |     p448_t *out,
 28 |     const p448_t *a,
 29 |     const p448_t *b
 30 | ) __attribute__((unused,always_inline));
 31 |              
 32 | static __inline__ void
 33 | p448_sub_RAW (
 34 |     p448_t *out,
 35 |     const p448_t *a,
 36 |     const p448_t *b
 37 | ) __attribute__((unused,always_inline));
 38 |              
 39 | static __inline__ void
 40 | p448_copy (
 41 |     p448_t *out,
 42 |     const p448_t *a
 43 | ) __attribute__((unused,always_inline));
 44 |              
 45 | static __inline__ void
 46 | p448_weak_reduce (
 47 |     p448_t *inout
 48 | ) __attribute__((unused,always_inline));
 49 |              
 50 | void
 51 | p448_strong_reduce (
 52 |     p448_t *inout
 53 | );
 54 |              
 55 | static __inline__ void
 56 | p448_bias (
 57 |     p448_t *inout,
 58 |     int amount
 59 | ) __attribute__((unused,always_inline));
 60 | 
 61 | void
 62 | p448_mul (
 63 |     p448_t *__restrict__ out,
 64 |     const p448_t *a,
 65 |     const p448_t *b
 66 | );
 67 | 
 68 | void
 69 | p448_mulw (
 70 |     p448_t *__restrict__ out,
 71 |     const p448_t *a,
 72 |     uint64_t b
 73 | );
 74 | 
 75 | void
 76 | p448_sqr (
 77 |     p448_t *__restrict__ out,
 78 |     const p448_t *a
 79 | );
 80 | 
 81 | void
 82 | p448_serialize (
 83 |     uint8_t *serial,
 84 |     const struct p448_t *x
 85 | );
 86 | 
 87 | mask_t
 88 | p448_deserialize (
 89 |     p448_t *x,
 90 |     const uint8_t serial[56]
 91 | );
 92 | 
 93 | /* -------------- Inline functions begin here -------------- */
 94 | 
 95 | /** out = a + b, with no carry propagation or reduction. */
 96 | void
 97 | p448_add_RAW (p448_t *out, const p448_t *a, const p448_t *b) {
 98 |     /* View the operands as arrays of uint32xn_t (declared outside this header). */
 99 |     uint32xn_t *dst = (uint32xn_t*)out;
100 |     const uint32xn_t *lhs = (const uint32xn_t*)a;
101 |     const uint32xn_t *rhs = (const uint32xn_t*)b;
102 |     const unsigned int nvec = sizeof(*out)/sizeof(uint32xn_t);
103 |     unsigned int k;
104 | 
105 |     for (k=0; k<nvec; k++) {
106 |         dst[k] = lhs[k] + rhs[k];
107 |     }
108 | 
109 |     /* Scalar form kept from the earlier version, for reference:
110 |      *   out->limb[k] = a->limb[k] + b->limb[k]  for each limb k. */
111 | }
112 | 
113 | /** out = a - b, with no bias, borrow handling or reduction. */
114 | void
115 | p448_sub_RAW (p448_t *out, const p448_t *a, const p448_t *b) {
116 |     /* View the operands as arrays of uint32xn_t (declared outside this header). */
117 |     uint32xn_t *dst = (uint32xn_t*)out;
118 |     const uint32xn_t *lhs = (const uint32xn_t*)a;
119 |     const uint32xn_t *rhs = (const uint32xn_t*)b;
120 |     const unsigned int nvec = sizeof(*out)/sizeof(uint32xn_t);
121 |     unsigned int k;
122 | 
123 |     for (k=0; k<nvec; k++) {
124 |         dst[k] = lhs[k] - rhs[k];
125 |     }
126 | 
127 |     /* Scalar form kept from the earlier version, for reference:
128 |      *   out->limb[k] = a->limb[k] - b->limb[k]  for each limb k. */
129 | }
130 | 
131 | /** Copy the element *a into *out by plain struct assignment. */
132 | void
133 | p448_copy (p448_t *out, const p448_t *a) {
134 |     const p448_t *src = a;
135 |     p448_t *dst = out;
136 |     *dst = *src;
137 | }
138 | 
139 | /** Add amt*(2^28-1) to a in four uint32x4_t steps; the first entry of the third step uses amt*(2^28-2). */
140 | void
141 | p448_bias (p448_t *a, int amt) {
142 |     const uint32_t full = ((1ull<<28)-1)*amt;  /* truncated to 32 bits */
143 |     const uint32_t less = full-amt;
144 |     const uint32x4_t plain = {full,full,full,full};
145 |     const uint32x4_t with_less = {less,full,full,full};
146 |     uint32x4_t *quads = (uint32x4_t*) a;
147 |     quads[0] += plain;
148 |     quads[1] += plain;
149 |     quads[2] += with_less; /* presumably limb 8, if uint32x4_t is four 32-bit lanes -- confirm in word.h */
150 |     quads[3] += plain;
151 | }
152 | 
153 | /** Push each limb's bits above 28 into the next limb; the top carry goes to limbs 8 and 0. */
154 | void
155 | p448_weak_reduce (p448_t *a) {
156 |     const uint64_t low28 = (1ull<<28) - 1;
157 |     const uint64_t top_carry = a->limb[15] >> 28;
158 |     int k = 15;
159 |     a->limb[8] += top_carry; /* applied first, so the sweep below sees it */
160 |     while (k > 0) { /* high to low: limb[k-1] is still unmodified when read */
161 |         a->limb[k] = (a->limb[k] & low28) + (a->limb[k-1]>>28);
162 |         k--;
163 |     }
164 |     a->limb[0] = (a->limb[0] & low28) + top_carry;
165 | }
166 | 
167 | #ifdef __cplusplus
168 | }; /* extern "C" */
169 | #endif
170 | 
171 | #endif /* __P448_H__ */
172 | 


--------------------------------------------------------------------------------
/src/p448/arch_arm_32/p448.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | #ifndef __P448_H__
  5 | #define __P448_H__ 1
  6 | 
  7 | #include "word.h"
  8 | 
  9 | #include <stdint.h>
 10 | #include <assert.h>
 11 | 
 12 | typedef struct p448_t { /* value held as sixteen 32-bit limbs, struct aligned to 32 bytes */
 13 |   uint32_t limb[16]; /* p448_weak_reduce masks each limb to 28 bits before adding carries */
 14 | } __attribute__((aligned(32))) p448_t;
 15 | /* LIMB(x) pastes "ull" onto x and yields two initializers: the low LBITS bits of x, then the bits above them. */
 16 | #define LBITS 28
 17 | #define LIMB(x) (x##ull)&((1ull<<LBITS)-1), (x##ull)>>LBITS
 18 | #define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
 19 |     {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}
 20 | 
 21 | #ifdef __cplusplus
 22 | extern "C" {
 23 | #endif
 24 | 
 25 | static __inline__ void
 26 | p448_add_RAW (
 27 |     p448_t *out,
 28 |     const p448_t *a,
 29 |     const p448_t *b
 30 | ) __attribute__((unused,always_inline));
 31 |              
 32 | static __inline__ void
 33 | p448_sub_RAW (
 34 |     p448_t *out,
 35 |     const p448_t *a,
 36 |     const p448_t *b
 37 | ) __attribute__((unused,always_inline));
 38 |              
 39 | static __inline__ void
 40 | p448_copy (
 41 |     p448_t *out,
 42 |     const p448_t *a
 43 | ) __attribute__((unused,always_inline));
 44 |              
 45 | static __inline__ void
 46 | p448_weak_reduce (
 47 |     p448_t *inout
 48 | ) __attribute__((unused,always_inline));
 49 |              
 50 | void
 51 | p448_strong_reduce (
 52 |     p448_t *inout
 53 | );
 54 |              
 55 | static __inline__ void
 56 | p448_bias (
 57 |     p448_t *inout,
 58 |     int amount
 59 | ) __attribute__((unused,always_inline));
 60 | 
 61 | void
 62 | p448_mul (
 63 |     p448_t *__restrict__ out,
 64 |     const p448_t *a,
 65 |     const p448_t *b
 66 | );
 67 | 
 68 | void
 69 | p448_mulw (
 70 |     p448_t *__restrict__ out,
 71 |     const p448_t *a,
 72 |     uint64_t b
 73 | );
 74 | 
 75 | void
 76 | p448_sqr (
 77 |     p448_t *__restrict__ out,
 78 |     const p448_t *a
 79 | );
 80 | 
 81 | void
 82 | p448_serialize (
 83 |     uint8_t *serial,
 84 |     const struct p448_t *x
 85 | );
 86 | 
 87 | mask_t
 88 | p448_deserialize (
 89 |     p448_t *x,
 90 |     const uint8_t serial[56]
 91 | );
 92 | 
 93 | /* -------------- Inline functions begin here -------------- */
 94 | 
 95 | /** out = a + b, with no carry propagation or reduction. */
 96 | void
 97 | p448_add_RAW (p448_t *out, const p448_t *a, const p448_t *b) {
 98 |     /* View the operands as arrays of uint32xn_t (declared outside this header). */
 99 |     uint32xn_t *dst = (uint32xn_t*)out;
100 |     const uint32xn_t *lhs = (const uint32xn_t*)a;
101 |     const uint32xn_t *rhs = (const uint32xn_t*)b;
102 |     const unsigned int nvec = sizeof(*out)/sizeof(uint32xn_t);
103 |     unsigned int k;
104 | 
105 |     for (k=0; k<nvec; k++) {
106 |         dst[k] = lhs[k] + rhs[k];
107 |     }
108 | 
109 |     /* Scalar form kept from the earlier version, for reference:
110 |      *   out->limb[k] = a->limb[k] + b->limb[k]  for each limb k. */
111 | }
112 | 
113 | /** out = a - b, with no bias, borrow handling or reduction. */
114 | void
115 | p448_sub_RAW (p448_t *out, const p448_t *a, const p448_t *b) {
116 |     /* View the operands as arrays of uint32xn_t (declared outside this header). */
117 |     uint32xn_t *dst = (uint32xn_t*)out;
118 |     const uint32xn_t *lhs = (const uint32xn_t*)a;
119 |     const uint32xn_t *rhs = (const uint32xn_t*)b;
120 |     const unsigned int nvec = sizeof(*out)/sizeof(uint32xn_t);
121 |     unsigned int k;
122 | 
123 |     for (k=0; k<nvec; k++) {
124 |         dst[k] = lhs[k] - rhs[k];
125 |     }
126 | 
127 |     /* Scalar form kept from the earlier version, for reference:
128 |      *   out->limb[k] = a->limb[k] - b->limb[k]  for each limb k. */
129 | }
130 | 
131 | /** Copy the element *a into *out by plain struct assignment. */
132 | void
133 | p448_copy (p448_t *out, const p448_t *a) {
134 |     const p448_t *src = a;
135 |     p448_t *dst = out;
136 |     *dst = *src;
137 | }
138 | 
139 | /** Add amt*(2^28-1) to a in four uint32x4_t steps; the first entry of the third step uses amt*(2^28-2). */
140 | void
141 | p448_bias (p448_t *a, int amt) {
142 |     const uint32_t full = ((1ull<<28)-1)*amt;  /* truncated to 32 bits */
143 |     const uint32_t less = full-amt;
144 |     const uint32x4_t plain = {full,full,full,full};
145 |     const uint32x4_t with_less = {less,full,full,full};
146 |     uint32x4_t *quads = (uint32x4_t*) a;
147 |     quads[0] += plain;
148 |     quads[1] += plain;
149 |     quads[2] += with_less; /* presumably limb 8, if uint32x4_t is four 32-bit lanes -- confirm in word.h */
150 |     quads[3] += plain;
151 | }
152 | 
153 | /** Push each limb's bits above 28 into the next limb; the top carry goes to limbs 8 and 0. */
154 | void
155 | p448_weak_reduce (p448_t *a) {
156 |     const uint64_t low28 = (1ull<<28) - 1;
157 |     const uint64_t top_carry = a->limb[15] >> 28;
158 |     int k = 15;
159 |     a->limb[8] += top_carry; /* applied first, so the sweep below sees it */
160 |     while (k > 0) { /* high to low: limb[k-1] is still unmodified when read */
161 |         a->limb[k] = (a->limb[k] & low28) + (a->limb[k-1]>>28);
162 |         k--;
163 |     }
164 |     a->limb[0] = (a->limb[0] & low28) + top_carry;
165 | }
166 | 
167 | #ifdef __cplusplus
168 | }; /* extern "C" */
169 | #endif
170 | 
171 | #endif /* __P448_H__ */
172 | 


--------------------------------------------------------------------------------
/src/p448/arch_neon_experimental/p448.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | #ifndef __P448_H__
  5 | #define __P448_H__ 1
  6 | 
  7 | #include "word.h"
  8 | 
  9 | #include <stdint.h>
 10 | #include <assert.h>
 11 | 
 12 | typedef struct p448_t { /* value held as sixteen 32-bit limbs, struct aligned to 32 bytes */
 13 |   uint32_t limb[16];
 14 | } __attribute__((aligned(32))) p448_t;
 15 | /* LIMBPERM rotates a 4-bit index left by one: x -> 2x for x<8, and x -> 2(x-8)+1 for x>=8. */
 16 | #define LIMBPERM(x) (((x)<<1 | (x)>>3) & 15)
 17 | #define USE_NEON_PERM 1
 18 | #define LBITS 28
 19 | #define LIMBHI(x) ((x##ull)>>LBITS) /* bits of the literal x above the low LBITS */
 20 | #define LIMBLO(x) ((x##ull)&((1ull<<LBITS)-1)) /* low LBITS bits of the literal x */
 21 | #  define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
 22 |     {{LIMBLO(a),LIMBLO(e), LIMBHI(a),LIMBHI(e), \
 23 |       LIMBLO(b),LIMBLO(f), LIMBHI(b),LIMBHI(f), \
 24 |       LIMBLO(c),LIMBLO(g), LIMBHI(c),LIMBHI(g), \
 25 |       LIMBLO(d),LIMBLO(h), LIMBHI(d),LIMBHI(h)}}
 26 | 
 27 | #ifdef __cplusplus
 28 | extern "C" {
 29 | #endif
 30 | 
 31 | static __inline__ void
 32 | p448_add_RAW (
 33 |     p448_t *out,
 34 |     const p448_t *a,
 35 |     const p448_t *b
 36 | ) __attribute__((unused,always_inline));
 37 |              
 38 | static __inline__ void
 39 | p448_sub_RAW (
 40 |     p448_t *out,
 41 |     const p448_t *a,
 42 |     const p448_t *b
 43 | ) __attribute__((unused,always_inline));
 44 |              
 45 | static __inline__ void
 46 | p448_copy (
 47 |     p448_t *out,
 48 |     const p448_t *a
 49 | ) __attribute__((unused,always_inline));
 50 |              
 51 | static __inline__ void
 52 | p448_weak_reduce (
 53 |     p448_t *inout
 54 | ) __attribute__((unused,always_inline));
 55 |              
 56 | void
 57 | p448_strong_reduce (
 58 |     p448_t *inout
 59 | );
 60 |              
 61 | static __inline__ void
 62 | p448_bias (
 63 |     p448_t *inout,
 64 |     int amount
 65 | ) __attribute__((unused,always_inline));
 66 | 
 67 | void
 68 | p448_mul (
 69 |     p448_t *__restrict__ out,
 70 |     const p448_t *a,
 71 |     const p448_t *b
 72 | );
 73 | 
 74 | void
 75 | p448_mulw (
 76 |     p448_t *__restrict__ out,
 77 |     const p448_t *a,
 78 |     uint64_t b
 79 | );
 80 | 
 81 | void
 82 | p448_sqr (
 83 |     p448_t *__restrict__ out,
 84 |     const p448_t *a
 85 | );
 86 | 
 87 | void
 88 | p448_serialize (
 89 |     uint8_t *serial,
 90 |     const struct p448_t *x
 91 | );
 92 | 
 93 | mask_t
 94 | p448_deserialize (
 95 |     p448_t *x,
 96 |     const uint8_t serial[56]
 97 | );
 98 | 
 99 | /* -------------- Inline functions begin here -------------- */
100 | 
101 | /** out = a + b, computed one uint32xn_t chunk at a time, with no carry propagation or reduction. */
102 | void
103 | p448_add_RAW (p448_t *out, const p448_t *a, const p448_t *b) {
104 |     uint32xn_t *dst = (uint32xn_t*)out;
105 |     const uint32xn_t *lhs = (const uint32xn_t*)a, *rhs = (const uint32xn_t*)b;
106 |     const unsigned int nvec = sizeof(*out)/sizeof(uint32xn_t);
107 |     unsigned int k;
108 |     for (k=0; k<nvec; k++) {
109 |         dst[k] = lhs[k] + rhs[k];
110 |     }
111 | }
112 | 
113 | /** out = a - b, with no bias, borrow handling or reduction. */
114 | void
115 | p448_sub_RAW (p448_t *out, const p448_t *a, const p448_t *b) {
116 |     /* View the operands as arrays of uint32xn_t (declared outside this header). */
117 |     uint32xn_t *dst = (uint32xn_t*)out;
118 |     const uint32xn_t *lhs = (const uint32xn_t*)a;
119 |     const uint32xn_t *rhs = (const uint32xn_t*)b;
120 |     const unsigned int nvec = sizeof(*out)/sizeof(uint32xn_t);
121 |     unsigned int k;
122 | 
123 |     for (k=0; k<nvec; k++) {
124 |         dst[k] = lhs[k] - rhs[k];
125 |     }
126 | 
127 |     /* Scalar form kept from the earlier version, for reference:
128 |      *   out->limb[k] = a->limb[k] - b->limb[k]  for each limb k. */
129 | }
130 | 
131 | /** Copy the element *a into *out by plain struct assignment. */
132 | void
133 | p448_copy (p448_t *out, const p448_t *a) {
134 |     const p448_t *src = a;
135 |     p448_t *dst = out;
136 |     *dst = *src;
137 | }
138 | 
139 | /** Add amt*(2^28-1) to a in four uint32x4_t steps; the second entry of the first step uses amt*(2^28-2). */
140 | void
141 | p448_bias (p448_t *a, int amt) {
142 |     const uint32_t full = ((1ull<<28)-1)*amt;  /* truncated to 32 bits */
143 |     const uint32_t less = full-amt;
144 |     const uint32x4_t plain = {full,full,full,full};
145 |     const uint32x4_t with_less = {full,less,full,full};
146 |     uint32x4_t *quads = (uint32x4_t*) a;
147 |     quads[0] += with_less; /* entry 1 is slot LIMBPERM(8), assuming four 32-bit lanes -- confirm in word.h */
148 |     quads[1] += plain;
149 |     quads[2] += plain;
150 |     quads[3] += plain;
151 | }
152 | 
153 | void
154 | p448_weak_reduce (
155 |     p448_t *a
156 | ) {
157 |     /* Treats a as eight 2-lane vectors; tmp holds the bits above 28 of the last pair, read before it is masked. */
158 |     uint32x2_t *aa = (uint32x2_t*) a, vmask = {(1ull<<28)-1, (1ull<<28)-1}, vm2 = {0,-1},
159 |        tmp = vshr_n_u32(aa[7],28);
160 |     /* High to low: each pair keeps its low 28 bits and adds the previous pair's lanes shifted right by 28. */
161 |     int i;
162 |     for (i=7; i>=1; i--) {
163 |         aa[i] = vsra_n_u32(aa[i] & vmask, aa[i-1], 28);
164 |     }
165 |     aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp&vm2); /* lane 0 += tmp[1]; lane 1 += tmp[0] + tmp[1] */
166 | }
167 | 
168 | #ifdef __cplusplus
169 | }; /* extern "C" */
170 | #endif
171 | 
172 | #endif /* __P448_H__ */
173 | 


--------------------------------------------------------------------------------
/src/p521/arch_x86_64_r12/p521.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | #ifndef __P521_H__
  5 | #define __P521_H__ 1
  6 | 
  7 | #include <stdint.h>
  8 | #include <assert.h>
  9 | #include <string.h>
 10 | 
 11 | #include "word.h"
 12 | #include "constant_time.h"
 13 | 
 14 | #define LIMBPERM(x) (((x)%3)*4 + (x)/3) /* maps x = 0..8 to slots 0,4,8,1,5,9,2,6,10 of limb[] */
 15 | #define USE_P521_3x3_TRANSPOSE
 16 | /* Twelve 64-bit slots, struct aligned to 32 bytes; LIMBPERM never yields slots 3, 7 or 11 for x in 0..8. */
 17 | typedef struct p521_t {
 18 |   uint64_t limb[12];
 19 | } __attribute__((aligned(32))) p521_t;
 20 | 
 21 | #ifdef __cplusplus
 22 | extern "C" {
 23 | #endif
 24 | 
 25 | static __inline__ void
 26 | p521_add_RAW (
 27 |     p521_t *out,
 28 |     const p521_t *a,
 29 |     const p521_t *b
 30 | ) __attribute__((unused));
 31 |              
 32 | static __inline__ void
 33 | p521_sub_RAW (
 34 |     p521_t *out,
 35 |     const p521_t *a,
 36 |     const p521_t *b
 37 | ) __attribute__((unused));
 38 |              
 39 | static __inline__ void
 40 | p521_copy (
 41 |     p521_t *out,
 42 |     const p521_t *a
 43 | ) __attribute__((unused));
 44 |              
 45 | static __inline__ void
 46 | p521_weak_reduce (
 47 |     p521_t *inout
 48 | ) __attribute__((unused));
 49 |              
 50 | void
 51 | p521_strong_reduce (
 52 |     p521_t *inout
 53 | );
 54 | 
 55 | static __inline__ void
 56 | p521_bias (
 57 |     p521_t *inout,
 58 |     int amount
 59 | ) __attribute__((unused));
 60 |          
 61 | void
 62 | p521_mul (
 63 |     p521_t *__restrict__ out,
 64 |     const p521_t *a,
 65 |     const p521_t *b
 66 | );
 67 | 
 68 | void
 69 | p521_mulw (
 70 |     p521_t *__restrict__ out,
 71 |     const p521_t *a,
 72 |     uint64_t b
 73 | );
 74 | 
 75 | void
 76 | p521_sqr (
 77 |     p521_t *__restrict__ out,
 78 |     const p521_t *a
 79 | );
 80 | 
 81 | void
 82 | p521_serialize (
 83 |     uint8_t *serial,
 84 |     const struct p521_t *x
 85 | );
 86 | 
 87 | mask_t
 88 | p521_deserialize (
 89 |     p521_t *x,
 90 |     const uint8_t serial[66]
 91 | );
 92 | 
 93 | /* -------------- Inline functions begin here -------------- */
 94 | 
 95 | typedef uint64x4_t uint64x3_t; /* fit it in a vector register */
 96 | /* mask58 keeps the low 58 bits of lanes 0..2 and clears lane 3. */
 97 | static const uint64x3_t mask58 = { (1ull<<58) - 1, (1ull<<58) - 1, (1ull<<58) - 1, 0 };
 98 | /* timesW(u) returns the lanes (2*u.z, u.x + u.w, u.y + u.w, 2*u.w). */
 99 | /* Currently requires CLANG.  Sorry. */
100 | static inline uint64x3_t
101 | __attribute__((unused))
102 | timesW (
103 |   uint64x3_t u
104 | ) {
105 |   return u.zxyw + u.zwww;
106 | }
107 | 
108 | /** out = a + b, computed one uint64xn_t chunk at a time, with no carry propagation or reduction. */
109 | void
110 | p521_add_RAW (p521_t *out, const p521_t *a, const p521_t *b) {
111 |     uint64xn_t *dst = (uint64xn_t*)out;
112 |     const uint64xn_t *lhs = (const uint64xn_t*)a, *rhs = (const uint64xn_t*)b;
113 |     const unsigned int nvec = sizeof(*out)/sizeof(uint64xn_t);
114 |     unsigned int k;
115 |     for (k=0; k<nvec; k++) {
116 |         dst[k] = lhs[k] + rhs[k];
117 |     }
118 | }
119 | 
120 | /** out = a - b, computed one uint64xn_t chunk at a time, with no bias, borrow handling or reduction. */
121 | void
122 | p521_sub_RAW (p521_t *out, const p521_t *a, const p521_t *b) {
123 |     uint64xn_t *dst = (uint64xn_t*)out;
124 |     const uint64xn_t *lhs = (const uint64xn_t*)a, *rhs = (const uint64xn_t*)b;
125 |     const unsigned int nvec = sizeof(*out)/sizeof(uint64xn_t);
126 |     unsigned int k;
127 |     for (k=0; k<nvec; k++) {
128 |         dst[k] = lhs[k] - rhs[k];
129 |     }
130 | }
131 | 
132 | /** Copy the whole element *a into *out with memcpy. */
133 | void
134 | p521_copy (p521_t *out, const p521_t *a) {
135 |     const size_t nbytes = sizeof(p521_t);
136 | 
137 |     memcpy(out, a, nbytes);
138 | }
139 | 
140 | /** Add amt*(2^58-2) to the first entry of a and amt*(2^58-1) to the other entries 0..2 of each row. */
141 | void
142 | p521_bias (p521_t *a, int amt) {
143 |     const uint64_t first = ((1ull<<58)-2)*amt, rest = ((1ull<<58)-1)*amt;
144 |     /* The fourth entry of every row has 0 added, so it keeps its value. */
145 |     const uint64x4_t row0 = { first, rest, rest, 0 }, rowN = { rest, rest, rest, 0 };
146 |     uint64x4_t *rows = (uint64x4_t*)a;
147 |     rows[0] += row0;
148 |     rows[1] += rowN;
149 |     rows[2] += rowN;
150 | }
151 | 
152 | void
153 | p521_weak_reduce (
154 |     p521_t *a
155 | ) {
156 | #if 0
157 |     int i;
158 |     assert(a->limb[3] == 0 && a->limb[7] == 0 && a->limb[11] == 0);
159 |     for (i=0; i<12; i++) {
160 |         assert(a->limb[i] < 3ull<<61);
161 |     }
162 | #endif
163 |     /* Load all three rows first, so every output below is computed from the original values. */
164 |     uint64x3_t
165 |         ot0 = ((uint64x4_t*)a)[0],
166 |         ot1 = ((uint64x4_t*)a)[1],
167 |         ot2 = ((uint64x4_t*)a)[2];
168 |     /* Each row keeps its mask58 bits and gains the previous row's bits above 58; row 0 takes row 2's via timesW. */
169 |     uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58);
170 |     uint64x3_t out1 = (ot1 & mask58) + (ot0>>58);
171 |     uint64x3_t out2 = (ot2 & mask58) + (ot1>>58);
172 | 
173 |     ((uint64x4_t*)a)[0] = out0;
174 |     ((uint64x4_t*)a)[1] = out1;
175 |     ((uint64x4_t*)a)[2] = out2;
176 | }
177 | 
178 | #ifdef __cplusplus
179 | }; /* extern "C" */
180 | #endif
181 | 
182 | #endif /* __P521_H__ */
183 | 


--------------------------------------------------------------------------------
/src/p480/arch_x86_64/p480.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | #ifndef __p480_H__
  5 | #define __p480_H__ 1
  6 | 
  7 | #include <stdint.h>
  8 | #include <assert.h>
  9 | 
 10 | #include "word.h"
 11 | 
 12 | typedef struct p480_t { /* value held as eight 64-bit limbs, struct aligned to 32 bytes */
 13 |   uint64_t limb[8]; /* p480_weak_reduce masks each limb to 60 bits before adding carries */
 14 | } __attribute__((aligned(32))) p480_t;
 15 | 
 16 | #ifdef __cplusplus
 17 | extern "C" {
 18 | #endif
 19 | 
 20 | static __inline__ void
 21 | p480_add_RAW (
 22 |     p480_t *out,
 23 |     const p480_t *a,
 24 |     const p480_t *b
 25 | ) __attribute__((unused,always_inline));
 26 |              
 27 | static __inline__ void
 28 | p480_sub_RAW (
 29 |     p480_t *out,
 30 |     const p480_t *a,
 31 |     const p480_t *b
 32 | ) __attribute__((unused,always_inline));
 33 |              
 34 | static __inline__ void
 35 | p480_copy (
 36 |     p480_t *out,
 37 |     const p480_t *a
 38 | ) __attribute__((unused,always_inline));
 39 |              
 40 | static __inline__ void
 41 | p480_weak_reduce (
 42 |     p480_t *inout
 43 | ) __attribute__((unused,always_inline));
 44 |              
 45 | void
 46 | p480_strong_reduce (
 47 |     p480_t *inout
 48 | );
 49 |   
 50 | static __inline__ void
 51 | p480_bias (
 52 |     p480_t *inout,
 53 |     int amount
 54 | ) __attribute__((unused,always_inline));
 55 |          
 56 | void
 57 | p480_mul (
 58 |     p480_t *__restrict__ out,
 59 |     const p480_t *a,
 60 |     const p480_t *b
 61 | );
 62 | 
 63 | void
 64 | p480_mulw (
 65 |     p480_t *__restrict__ out,
 66 |     const p480_t *a,
 67 |     uint64_t b
 68 | );
 69 | 
 70 | void
 71 | p480_sqr (
 72 |     p480_t *__restrict__ out,
 73 |     const p480_t *a
 74 | );
 75 | 
 76 | void
 77 | p480_serialize (
 78 |     uint8_t *serial,
 79 |     const struct p480_t *x
 80 | );
 81 | 
 82 | mask_t
 83 | p480_deserialize (
 84 |     p480_t *x,
 85 |     const uint8_t serial[60]
 86 | );
 87 | 
 88 | /* -------------- Inline functions begin here -------------- */
 89 | 
 90 | /** out = a + b, with no carry propagation or reduction. */
 91 | void
 92 | p480_add_RAW (p480_t *out, const p480_t *a, const p480_t *b) {
 93 |     /* View the operands as arrays of uint64xn_t (declared outside this header). */
 94 |     uint64xn_t *dst = (uint64xn_t*)out;
 95 |     const uint64xn_t *lhs = (const uint64xn_t*)a;
 96 |     const uint64xn_t *rhs = (const uint64xn_t*)b;
 97 |     const unsigned int nvec = sizeof(*out)/sizeof(uint64xn_t);
 98 |     unsigned int k;
 99 | 
100 |     for (k=0; k<nvec; k++) {
101 |         dst[k] = lhs[k] + rhs[k];
102 |     }
103 | 
104 |     /* Scalar form kept from the earlier version, for reference:
105 |      *   out->limb[k] = a->limb[k] + b->limb[k]  for each limb k. */
106 | }
107 | 
108 | /** out = a - b, with no bias, borrow handling or reduction. */
109 | void
110 | p480_sub_RAW (p480_t *out, const p480_t *a, const p480_t *b) {
111 |     /* View the operands as arrays of uint64xn_t (declared outside this header). */
112 |     uint64xn_t *dst = (uint64xn_t*)out;
113 |     const uint64xn_t *lhs = (const uint64xn_t*)a;
114 |     const uint64xn_t *rhs = (const uint64xn_t*)b;
115 |     const unsigned int nvec = sizeof(*out)/sizeof(uint64xn_t);
116 |     unsigned int k;
117 | 
118 |     for (k=0; k<nvec; k++) {
119 |         dst[k] = lhs[k] - rhs[k];
120 |     }
121 | 
122 |     /* Scalar form kept from the earlier version, for reference:
123 |      *   out->limb[k] = a->limb[k] - b->limb[k]  for each limb k. */
124 | }
125 | 
126 | /** Copy *a into *out one big_register_t at a time. */
127 | void
128 | p480_copy (p480_t *out, const p480_t *a) {
129 |     big_register_t *dst = (big_register_t *)out;
130 |     const big_register_t *src = (const big_register_t *)a;
131 |     unsigned int k;
132 |     for (k=0; k<sizeof(*out)/sizeof(big_register_t); k++) {
133 |         dst[k] = src[k];
134 |     }
135 | }
136 | 
137 | /** Add amt*(2^60-1) to every limb of a except limb 4, which gets amt*(2^60-2) (see the scalar branch). */
138 | void
139 | p480_bias (p480_t *a, int amt) {
140 |     const uint64_t full = ((1ull<<60)-1)*amt;
141 |     const uint64_t less = full-amt;
142 |     /* Three builds of the same update, selected by the available vector extension. */
143 |     
144 | #if __AVX2__
145 |     const uint64x4_t plain = {full,full,full,full}, with_less = {less,full,full,full};
146 |     uint64x4_t *quads = (uint64x4_t*) a;
147 |     quads[0] += plain;
148 |     quads[1] += with_less;
149 | #elif __SSE2__
150 |     const uint64x2_t plain = {full,full}, with_less = {less,full};
151 |     uint64x2_t *pairs = (uint64x2_t*) a;
152 |     pairs[0] += plain;
153 |     pairs[1] += plain;
154 |     pairs[2] += with_less;
155 |     pairs[3] += plain;
156 | #else
157 |     unsigned int k;
158 |     for (k=0; k<sizeof(*a)/sizeof(uint64_t); k++) {
159 |         a->limb[k] += (k!=4) ? full : less;
160 |     }
161 | #endif
162 | }
163 | 
164 | void
165 | p480_weak_reduce (
166 |     p480_t *a
167 | ) {
168 |     /* PERF: use pshufb/palignr if anyone cares about speed of this */
169 |     const uint64_t low60 = (1ull<<60) - 1;   /* keeps the low 60 bits of a limb */
170 |     const uint64_t wrap = a->limb[7] >> 60;  /* excess bits of the top limb */
171 |     int k;
172 |     a->limb[4] += wrap;                      /* the top excess re-enters at limbs 4 and 0 */
173 |     for (k=7; k>=1; k--) {                   /* downwards: limb k-1 is still unmasked when read */
174 |         a->limb[k] = (a->limb[k-1]>>60) + (a->limb[k] & low60);
175 |     }
176 |     a->limb[0] = wrap + (a->limb[0] & low60);
177 | }
178 | 
179 | #ifdef __cplusplus
180 | } /* extern "C" -- no trailing ';': a linkage block is not a declaration that takes one */
181 | #endif
182 | 
183 | #endif /* __p480_H__ */
184 | 


--------------------------------------------------------------------------------
/src/p448/arch_x86_64/p448.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | #ifndef __P448_H__
  5 | #define __P448_H__ 1
  6 | 
  7 | #include <stdint.h>
  8 | #include <assert.h>
  9 | 
 10 | #include "word.h"
 11 | 
 12 | typedef struct p448_t {
 13 |   uint64_t limb[8]; /* eight 64-bit limbs, lowest first (carries in this file move from limb i-1 to limb i) */
 14 | } __attribute__((aligned(32))) p448_t;
 15 | /* LBITS: nominal bits per limb (the reduction code in this file masks limbs to 56 bits). */
 16 | #define LBITS 56
 17 | #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}} /* brace initializer for a p448_t from its 8 limbs */
 18 | 
 19 | #ifdef __cplusplus
 20 | extern "C" {
 21 | #endif
 22 | 
 23 | static __inline__ void
 24 | p448_add_RAW (
 25 |     p448_t *out,
 26 |     const p448_t *a,
 27 |     const p448_t *b
 28 | ) __attribute__((unused,always_inline)); /* out = a + b piecewise; no reduction */
 29 | /* out = a - b piecewise; no reduction. */
 30 | static __inline__ void
 31 | p448_sub_RAW (
 32 |     p448_t *out,
 33 |     const p448_t *a,
 34 |     const p448_t *b
 35 | ) __attribute__((unused,always_inline));
 36 | /* Copies *a into *out. */
 37 | static __inline__ void
 38 | p448_copy (
 39 |     p448_t *out,
 40 |     const p448_t *a
 41 | ) __attribute__((unused,always_inline));
 42 | /* Moves each limb's bits above bit 55 into the next limb; the top limb's excess re-enters at limbs 0 and 4. */
 43 | static __inline__ void
 44 | p448_weak_reduce (
 45 |     p448_t *inout
 46 | ) __attribute__((unused,always_inline));
 47 | /* Defined out of line (not in this header); presumably brings inout to its fully reduced form - confirm. */
 48 | void
 49 | p448_strong_reduce (
 50 |     p448_t *inout
 51 | );
 52 | /* Adds amount*(2^56-1) to every limb except limb 4, which gets amount*(2^56-2). */
 53 | static __inline__ void
 54 | p448_bias (
 55 |     p448_t *inout,
 56 |     int amount
 57 | ) __attribute__((unused,always_inline));
 58 | /* Defined out of line; presumably out = a*b in the field - confirm against the implementation. */
 59 | void
 60 | p448_mul (
 61 |     p448_t *__restrict__ out,
 62 |     const p448_t *a,
 63 |     const p448_t *b
 64 | );
 65 | /* Defined out of line; presumably out = a*b for a plain integer multiplier b - confirm. */
 66 | void
 67 | p448_mulw (
 68 |     p448_t *__restrict__ out,
 69 |     const p448_t *a,
 70 |     uint64_t b
 71 | );
 72 | /* Defined out of line; presumably out = a*a - confirm. */
 73 | void
 74 | p448_sqr (
 75 |     p448_t *__restrict__ out,
 76 |     const p448_t *a
 77 | );
 78 | /* Defined out of line; writes the byte encoding of x to serial (presumably 56 bytes, as read by p448_deserialize). */
 79 | void
 80 | p448_serialize (
 81 |     uint8_t *serial,
 82 |     const struct p448_t *x
 83 | );
 84 | /* Defined out of line; reads a 56-byte encoding into x. Meaning of the returned mask_t: see the implementation. */
 85 | mask_t
 86 | p448_deserialize (
 87 |     p448_t *x,
 88 |     const uint8_t serial[56]
 89 | );
 90 | 
 91 | /* -------------- Inline functions begin here -------------- */
 92 | 
 93 | void
 94 | p448_add_RAW (
 95 |     p448_t *out,
 96 |     const p448_t *a,
 97 |     const p448_t *b
 98 | ) {
 99 |     /* Adds a and b in uint64xn_t-sized steps (type from word.h, presumably
100 |      * a vector of 64-bit lanes); nothing here reduces the result.
101 |      * Scalar reference form: out->limb[i] = a->limb[i] + b->limb[i]. */
102 |     uint64xn_t *sum = (uint64xn_t*)out;
103 |     const uint64xn_t *lhs = (const uint64xn_t*)a;
104 |     const uint64xn_t *rhs = (const uint64xn_t*)b;
105 |     unsigned int k;
106 |     for (k=0; k<sizeof(*out)/sizeof(uint64xn_t); k++) {
107 |         sum[k] = lhs[k] + rhs[k];
108 |     }
109 | }
110 | 
111 | void
112 | p448_sub_RAW (
113 |     p448_t *out,
114 |     const p448_t *a,
115 |     const p448_t *b
116 | ) {
117 |     /* Subtracts b from a in uint64xn_t-sized steps (type from word.h,
118 |      * presumably a vector of 64-bit lanes); nothing here reduces the result.
119 |      * Scalar reference form: out->limb[i] = a->limb[i] - b->limb[i]. */
120 |     uint64xn_t *diff = (uint64xn_t*)out;
121 |     const uint64xn_t *lhs = (const uint64xn_t*)a;
122 |     const uint64xn_t *rhs = (const uint64xn_t*)b;
123 |     unsigned int k;
124 |     for (k=0; k<sizeof(*out)/sizeof(uint64xn_t); k++) {
125 |         diff[k] = lhs[k] - rhs[k];
126 |     }
127 | }
128 | 
129 | void
130 | p448_copy (
131 |     p448_t *out,
132 |     const p448_t *a
133 | ) {
134 |     unsigned int k;   /* counts big_register_t-sized pieces, not limbs */
135 |     for (k=0; k<sizeof(*out)/sizeof(big_register_t); k++) {
136 |         *((big_register_t *)out + k) = *((const big_register_t *)a + k);
137 |     }
138 | }
139 | 
140 | void
141 | p448_bias (
142 |     p448_t *a,
143 |     int amt
144 | ) {
145 |     uint64_t co1 = ((1ull<<56)-1)*amt, co2 = co1-amt;
146 |     /* Adds co1 = amt*(2^56-1) to every limb except limb 4, which gets co2 = amt*(2^56-2). */
147 | #if __AVX2__
148 |     uint64x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1};
149 |     uint64x4_t *aa = (uint64x4_t*) a;
150 |     aa[0] += lo;
151 |     aa[1] += hi; /* presumably limbs 4..7, with co2 in the lane of limb 4 - assumes 4 lanes per vector */
152 | #elif __SSE2__
153 |     uint64x2_t lo = {co1,co1}, hi = {co2,co1};
154 |     uint64x2_t *aa = (uint64x2_t*) a;
155 |     aa[0] += lo;
156 |     aa[1] += lo;
157 |     aa[2] += hi; /* presumably limbs 4..5, with co2 in the lane of limb 4 - assumes 2 lanes per vector */
158 |     aa[3] += lo;
159 | #else
160 |     unsigned int i;
161 |     for (i=0; i<sizeof(*a)/sizeof(uint64_t); i++) {
162 |         a->limb[i] += (i==4) ? co2 : co1; /* scalar fallback: the pattern spelled out per limb */
163 |     }
164 | #endif
165 | }
166 | 
167 | void
168 | p448_weak_reduce (
169 |     p448_t *a
170 | ) {
171 |     /* PERF: use pshufb/palignr if anyone cares about speed of this */
172 |     const uint64_t low56 = (1ull<<56) - 1;   /* keeps the low 56 bits of a limb */
173 |     const uint64_t wrap = a->limb[7] >> 56;  /* excess bits of the top limb */
174 |     int k;
175 |     a->limb[4] += wrap;                      /* the top excess re-enters at limbs 4 and 0 */
176 |     for (k=7; k>=1; k--) {                   /* downwards: limb k-1 is still unmasked when read */
177 |         a->limb[k] = (a->limb[k-1]>>56) + (a->limb[k] & low56);
178 |     }
179 |     a->limb[0] = wrap + (a->limb[0] & low56);
180 | }
181 | 
182 | #ifdef __cplusplus
183 | } /* extern "C" -- no trailing ';': a linkage block is not a declaration that takes one */
184 | #endif
185 | 
186 | #endif /* __P448_H__ */
187 | 


--------------------------------------------------------------------------------
/include/decaf_crypto.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @file decaf_crypto.h
  3 |  * @copyright
  4 |  *   Copyright (c) 2015 Cryptography Research, Inc.  \n
  5 |  *   Released under the MIT License.  See LICENSE.txt for license information.
  6 |  * @author Mike Hamburg
  7 |  * @brief Example Decaf crypto routines.
  8 |  * @warning These are merely examples, though they ought to be secure.  But real
  9 |  * protocols will decide differently on magic numbers, formats, which items to
 10 |  * hash, etc.
 11 |  * @warning Experimental!  The names, parameter orders etc are likely to change.
 12 |  */
 13 | 
 14 | #ifndef __DECAF_CRYPTO_H__
 15 | #define __DECAF_CRYPTO_H__ 1
 16 | 
 17 | #include "decaf.h"
 18 | #include "shake.h"
 19 | 
 20 | /** Number of bytes for a symmetric key (expanded to full key) */
 21 | #define DECAF_448_SYMMETRIC_KEY_BYTES 32
 22 | 
 23 | /** @cond internal */
 24 | #define API_VIS __attribute__((visibility("default"))) __attribute__((noinline)) // TODO: synergize with decaf.h
 25 | #define WARN_UNUSED __attribute__((warn_unused_result)) /* callers must use the return value */
 26 | #define NONNULL1 __attribute__((nonnull(1))) /* argument 1 must not be NULL */
 27 | #define NONNULL2 __attribute__((nonnull(1,2))) /* arguments 1-2 must not be NULL */
 28 | #define NONNULL3 __attribute__((nonnull(1,2,3))) /* arguments 1-3 must not be NULL */
 29 | #define NONNULL134 __attribute__((nonnull(1,3,4))) /* arguments 1, 3 and 4 must not be NULL */
 30 | #define NONNULL5 __attribute__((nonnull(1,2,3,4,5))) /* arguments 1-5 must not be NULL */
 31 | /** @endcond */
 32 | 
 33 | /** A symmetric key, the compressed point of a private key. */
 34 | typedef unsigned char decaf_448_symmetric_key_t[DECAF_448_SYMMETRIC_KEY_BYTES];
 35 | 
 36 | /** An encoded public key. */
 37 | typedef unsigned char decaf_448_public_key_t[DECAF_448_SER_BYTES];
 38 | 
 39 | /** A signature. */
 40 | typedef unsigned char decaf_448_signature_t[DECAF_448_SER_BYTES + DECAF_448_SCALAR_BYTES];
 41 | 
 42 | typedef struct {
 43 |     /** @cond internal */
 44 |     /** The symmetric key from which everything is expanded */
 45 |     decaf_448_symmetric_key_t sym;
 46 |     
 47 |     /** The secret scalar x */
 48 |     decaf_448_scalar_t secret_scalar;
 49 |     
 50 |     /** The encoded public key, x*Base */
 51 |     decaf_448_public_key_t pub;
 52 |     /** @endcond */
 53 | } /** Private key structure for pointers. */
 54 |   decaf_448_private_key_s,
 55 |   /** A private key (gmp array[1] style). */
 56 |   decaf_448_private_key_t[1];
 57 | 
 58 | #ifdef __cplusplus
 59 | extern "C" {
 60 | #endif
 61 |     
 62 | /**
 63 |  * @brief Derive a key from its compressed form.
 64 |  * @param [out] priv The derived private key.
 65 |  * @param [in] proto The compressed or proto-key, which must be 32 random bytes.
 66 |  */
 67 | void decaf_448_derive_private_key (
 68 |     decaf_448_private_key_t priv,
 69 |     const decaf_448_symmetric_key_t proto
 70 | ) NONNULL2 API_VIS;
 71 | 
 72 | /**
 73 |  * @brief Destroy a private key.
 74 |  */
 75 | void decaf_448_destroy_private_key (
 76 |     decaf_448_private_key_t priv
 77 | ) NONNULL1 API_VIS;
 78 | 
 79 | /**
 80 |  * @brief Convert a private key to a public one.
 81 |  * @param [out] pub The extracted public key.
 82 |  * @param [in] priv The private key.
 83 |  */
 84 | void decaf_448_private_to_public (
 85 |     decaf_448_public_key_t pub,
 86 |     const decaf_448_private_key_t priv
 87 | ) NONNULL2 API_VIS;
 88 |     
 89 | /**
 90 |  * @brief Compute a Diffie-Hellman shared secret.
 91 |  *
 92 |  * This is an example routine; real protocols would use something
 93 |  * protocol-specific.
 94 |  *
 95 |  * @param [out] shared A buffer to store the shared secret.
 96 |  * @param [in] shared_bytes The size of the buffer.
 97 |  * @param [in] my_privkey My private key.
 98 |  * @param [in] your_pubkey Your public key.
 99 |  *
100 |  * @retval DECAF_SUCCESS Key exchange was successful.
101 |  * @retval DECAF_FAILURE Key exchange failed.
102 |  *
103 |  * @warning This is a pretty silly shared secret computation
104 |  * and will almost definitely change in the future.
105 |  */
106 | decaf_bool_t
107 | decaf_448_shared_secret (
108 |     uint8_t *shared,
109 |     size_t shared_bytes,
110 |     const decaf_448_private_key_t my_privkey,
111 |     const decaf_448_public_key_t your_pubkey
112 | ) NONNULL134 WARN_UNUSED API_VIS;
113 |    
114 | /**
115 |  * @brief Sign a message from its SHAKE context.
116 |  *
117 |  * @param [out] sig The signature.
118 |  * @param [in] priv Your private key.
119 |  * @param [in] shake A SHAKE256 context with the message.
120 |  */ 
121 | void
122 | decaf_448_sign_shake (
123 |     decaf_448_signature_t sig,
124 |     const decaf_448_private_key_t priv,
125 |     const keccak_sponge_t shake
126 | ) NONNULL3 API_VIS;
127 | 
128 | /**
129 |  * @brief Sign a message.
130 |  *
131 |  * @param [out] sig The signature.
132 |  * @param [in] priv Your private key.
133 |  * @param [in] message The message.
134 |  * @param [in] message_len The message's length.
135 |  */ 
136 | void
137 | decaf_448_sign (
138 |     decaf_448_signature_t sig,
139 |     const decaf_448_private_key_t priv,
140 |     const unsigned char *message,
141 |     size_t message_len
142 | ) NONNULL3 API_VIS;
143 | 
144 | /**
145 |  * @brief Verify a signed message from its SHAKE context.
146 |  *
147 |  * @param [in] sig The signature.
148 |  * @param [in] pub The public key.
149 |  * @param [in] shake A SHAKE256 context with the message.
150 |  */    
151 | decaf_bool_t
152 | decaf_448_verify_shake (
153 |     const decaf_448_signature_t sig,
154 |     const decaf_448_public_key_t pub,
155 |     const keccak_sponge_t shake
156 | ) NONNULL3 API_VIS WARN_UNUSED;
157 | 
158 | /**
159 |  * @brief Verify a signed message.
160 |  *
161 |  * @param [in] sig The signature.
162 |  * @param [in] pub The public key.
163 |  * @param [in] message The message.
164 |  * @param [in] message_len The message's length.
165 |  */    
166 | decaf_bool_t
167 | decaf_448_verify (
168 |     const decaf_448_signature_t sig,
169 |     const decaf_448_public_key_t pub,
170 |     const unsigned char *message,
171 |     size_t message_len
172 | ) NONNULL3 API_VIS WARN_UNUSED;
173 |     
174 | #undef API_VIS
175 | #undef WARN_UNUSED
176 | #undef NONNULL1
177 | #undef NONNULL2
178 | #undef NONNULL3
179 | #undef NONNULL134
180 | #undef NONNULL5
181 | 
182 | #ifdef __cplusplus
183 | } /* extern "C" */
184 | #endif
185 | 
186 | #endif /* __DECAF_CRYPTO_H__ */
187 | 
188 | 
189 | 


--------------------------------------------------------------------------------
/src/decaf_gen_tables.c:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2015 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | 
  5 | /**
  6 |  * @file decaf_gen_tables.c
  7 |  * @author Mike Hamburg
  8 |  * @brief Decaf global constant table precomputation.
  9 |  */
 10 | 
 11 | #define _XOPEN_SOURCE 600 /* for posix_memalign */
 12 | #include <stdio.h>
 13 | #include <stdlib.h>
 14 | #include "decaf.h"
 15 | #include "decaf_448_config.h" /* MAGIC */
 16 | #include "field.h"
 17 | 
 18 | #define API_NS(_id) decaf_448_##_id
 19 | #define API_NS2(_pref,_id) _pref##_decaf_448_##_id
 20 | 
 21 |  /* Placeholder definitions to satisfy the linker: this generator is linked without the table file it generates. */
 22 | const field_t API_NS(precomputed_base_as_fe)[1];
 23 | const API_NS(scalar_t) API_NS(precomputed_scalarmul_adjustment);
 24 | const API_NS(scalar_t) API_NS(point_scalarmul_adjustment);
 25 | const API_NS(scalar_t) sc_r2 = {{{0}}};
 26 | const decaf_word_t MONTGOMERY_FACTOR = 0;
 27 | const unsigned char base_point_ser_for_pregen[DECAF_448_SER_BYTES];
 28 | 
 29 | const API_NS(point_t) API_NS(point_base);
 30 | 
 31 | struct niels_s;
 32 | const field_t *API_NS(precomputed_wnaf_as_fe);
 33 | extern const size_t API_NS2(sizeof,precomputed_wnafs);
 34 | 
 35 | void API_NS(precompute_wnafs) (
 36 |     struct niels_s *out,
 37 |     const API_NS(point_t) base
 38 | );
 39 | 
 40 | /* Prints sc as the C initializer of a const API_NS(scalar_t) called name.  TODO: use SC_LIMB? */
 41 | static void scalar_print(const char *name, const API_NS(scalar_t) sc) {
 42 |     const int hex_digits = (int)sizeof(decaf_word_t)*2; /* two hex digits per byte of a word */
 43 |     unsigned k;
 44 |     printf("const API_NS(scalar_t) %s = {{{\n", name);
 45 |     for (k=0; k<sizeof(API_NS(scalar_t))/sizeof(decaf_word_t); k++) {
 46 |         printf("%s0x%0*llxull", k ? ", " : "", hex_digits, (unsigned long long)sc->limb[k]);
 47 |     }
 48 |     printf("}}};\n\n");
 49 | }
 50 | 
 51 | static void field_print(const field_t *f) {
 52 |     /* Serializes f, then regroups the bytes (earlier bytes = lower bits) into
 53 |      * FIELD_LIT_LIMB_BITS-bit limbs and prints them as one FIELD_LITERAL(...). */
 54 |     const int n_bytes = (FIELD_BITS + 7) / 8;
 55 |     unsigned char bytes[n_bytes];
 56 |     unsigned long long acc = 0;
 57 |     const char *sep = "";
 58 |     int pending = 0, k; /* pending: number of not-yet-printed bits held in acc */
 59 |     field_serialize(bytes,f);
 60 |     printf("FIELD_LITERAL(");
 61 |     for (k=0; k<n_bytes; k++) {
 62 |         acc |= ((uint64_t)bytes[k])<<pending;
 63 |         pending += 8;
 64 |         if (pending < FIELD_LIT_LIMB_BITS) continue;
 65 |         pending -= FIELD_LIT_LIMB_BITS;
 66 |         printf("%s0x%016llx", sep, acc & ((1ull<<FIELD_LIT_LIMB_BITS) -1));
 67 |         sep = ",";
 68 |         acc = ((uint64_t)bytes[k])>>(8-pending); /* bits of this byte that belong to the next limb */
 69 |     }
 70 |     printf(")");
 71 |     assert(pending<8);
 72 | }
 73 | 
 74 | int main(int argc, char **argv) {
 75 |     (void)argc; (void)argv;
 76 |     /* Decode the serialized base point; exit with status 1 if decoding fails. */
 77 |     API_NS(point_t) real_point_base;
 78 |     int ret = API_NS(point_decode)(real_point_base,base_point_ser_for_pregen,0);
 79 |     if (!ret) return 1;
 80 |     /* Build the precomputed table for that point in freshly allocated aligned memory. */
 81 |     API_NS(precomputed_s) *pre;
 82 |     ret = posix_memalign((void**)&pre, API_NS2(alignof,precomputed_s), API_NS2(sizeof,precomputed_s));
 83 |     if (ret || !pre) return 1;
 84 |     API_NS(precompute)(pre, real_point_base);
 85 |     /* Same for the wNAF table (allocated with the alignment of precomputed_s). */
 86 |     struct niels_s *preWnaf;
 87 |     ret = posix_memalign((void**)&preWnaf, API_NS2(alignof,precomputed_s), API_NS2(sizeof,precomputed_wnafs));
 88 |     if (ret || !preWnaf) return 1;
 89 |     API_NS(precompute_wnafs)(preWnaf, real_point_base);
 90 | 
 91 |     const field_t *output;
 92 |     unsigned i;
 93 |     /* Everything printed from here on is the generated C source, written to stdout. */
 94 |     printf("/** @warning: this file was automatically generated. */\n");
 95 |     printf("#include \"field.h\"\n\n");
 96 |     printf("#include \"decaf.h\"\n\n");
 97 |     printf("#define API_NS(_id) decaf_448_##_id\n");
 98 |     printf("#define API_NS2(_pref,_id) _pref##_decaf_448_##_id\n");
 99 |     /* point_base: the decoded point, dumped one field element at a time. */
100 |     output = (const field_t *)real_point_base;
101 |     printf("const API_NS(point_t) API_NS(point_base) = {{\n");
102 |     for (i=0; i < sizeof(API_NS(point_t)); i+=sizeof(field_t)) {
103 |         if (i) printf(",\n  ");
104 |         printf("{");
105 |         field_print(output++);
106 |         printf("}");
107 |     }
108 |     printf("\n}};\n");
109 |     /* precomputed_base_as_fe: the precomputed table viewed as an array of field elements. */
110 |     output = (const field_t *)pre;
111 |     printf("const field_t API_NS(precomputed_base_as_fe)[%d]\n", 
112 |         (int)(API_NS2(sizeof,precomputed_s) / sizeof(field_t)));
113 |     printf("__attribute__((aligned(%d),visibility(\"hidden\"))) = {\n  ", (int)API_NS2(alignof,precomputed_s));
114 |     
115 |     for (i=0; i < API_NS2(sizeof,precomputed_s); i+=sizeof(field_t)) {
116 |         if (i) printf(",\n  ");
117 |         field_print(output++);
118 |     }
119 |     printf("\n};\n");
120 |     /* precomputed_wnaf_as_fe: likewise for the wNAF table. */
121 |     output = (const field_t *)preWnaf;
122 |     printf("const field_t API_NS(precomputed_wnaf_as_fe)[%d]\n", 
123 |         (int)(API_NS2(sizeof,precomputed_wnafs) / sizeof(field_t)));
124 |     printf("__attribute__((aligned(%d),visibility(\"hidden\"))) = {\n  ", (int)API_NS2(alignof,precomputed_s));
125 |     for (i=0; i < API_NS2(sizeof,precomputed_wnafs); i+=sizeof(field_t)) {
126 |         if (i) printf(",\n  ");
127 |         field_print(output++);
128 |     }
129 |     printf("\n};\n");
130 |     /* precomputed_scalarmul_adjustment: start at one, double N*T*S times, then subtract one. */
131 |     API_NS(scalar_t) smadj;
132 |     API_NS(scalar_copy)(smadj,API_NS(scalar_one));
133 | 
134 |     for (i=0; i<DECAF_COMBS_N*DECAF_COMBS_T*DECAF_COMBS_S; i++) {
135 |         API_NS(scalar_add)(smadj,smadj,smadj);
136 |     }
137 |     API_NS(scalar_sub)(smadj, smadj, API_NS(scalar_one));
138 |     scalar_print("API_NS(precomputed_scalarmul_adjustment)", smadj);
139 |     /* point_scalarmul_adjustment: same construction, with SCALAR_BITS-1 rounded up past the next multiple of WINDOW_BITS. */
140 |     API_NS(scalar_copy)(smadj,API_NS(scalar_one));
141 |     for (i=0; i<DECAF_448_SCALAR_BITS-1 + DECAF_WINDOW_BITS
142 |             - ((DECAF_448_SCALAR_BITS-1)%DECAF_WINDOW_BITS); i++) {
143 |         API_NS(scalar_add)(smadj,smadj,smadj);
144 |     }
145 |     API_NS(scalar_sub)(smadj, smadj, API_NS(scalar_one));
146 |     scalar_print("API_NS(point_scalarmul_adjustment)", smadj);
147 |     /* sc_r2: one doubled 2*8*sizeof(scalar_t) times, i.e. 2^(twice the bit size of scalar_t) in scalar arithmetic. */
148 |     API_NS(scalar_copy)(smadj,API_NS(scalar_one));
149 |     for (i=0; i<sizeof(API_NS(scalar_t))*8*2; i++) {
150 |         API_NS(scalar_add)(smadj,smadj,smadj);
151 |     }
152 |     scalar_print("sc_r2", smadj);
153 |     /* smadj = 0 - 1; adding 1 to its low word(s) below presumably recovers the low bits of the scalar modulus - confirm. */
154 |     API_NS(scalar_sub)(smadj,API_NS(scalar_zero),API_NS(scalar_one)); /* HACK */
155 |     /* Six rounds of w *= w*plo + 2 from w = 1 square the error each time, giving w*plo == -1 (mod 2^64) for odd plo. */
156 |     unsigned long long w = 1, plo = smadj->limb[0]+1;
157 | #if DECAF_WORD_BITS == 32
158 |     plo |= ((unsigned long long)smadj->limb[1]) << 32;
159 | #endif
160 |     for (i=0; i<6; i++) {
161 |         w *= w*plo + 2;
162 |     }
163 |     printf("const decaf_word_t MONTGOMERY_FACTOR = (decaf_word_t)0x%016llxull;\n\n", w);
164 |     
165 |     return 0;
166 | }
167 | 


--------------------------------------------------------------------------------
/src/p448/arch_32/p448.c:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | 
  5 | #include "word.h"
  6 | #include "p448.h"
  7 | 
  8 | static inline mask_t __attribute__((always_inline))
  9 | is_zero (
 10 |     word_t x
 11 | ) {
 12 |     /* x-1 at double width: the upper word ends up non-zero only for x == 0
 13 |      * (assuming dword_t is unsigned and at least 2*WORD_BITS wide). */
 14 |     return (dword_t)(((dword_t)x) - 1) >> WORD_BITS;
 15 | }
 16 | 
 17 | static uint64_t widemul_32 (
 18 |     const uint32_t a,
 19 |     const uint32_t b
 20 | ) {
 21 |     return (uint64_t)b * a; /* widen first, so the full 64-bit product is kept */
 22 | }
 23 | 
 24 | void
 25 | p448_mul (
 26 |     p448_t *__restrict__ cs,
 27 |     const p448_t *as,
 28 |     const p448_t *bs
 29 | ) { 
 30 |     const uint32_t *a = as->limb, *b = bs->limb;
 31 |     uint32_t *c = cs->limb;
 32 |     /* Multiplies as by bs into cs: operands are read as 16 words, results are stored 28 bits per word. */
 33 |     uint64_t accum0 = 0, accum1 = 0, accum2 = 0;
 34 |     uint32_t mask = (1<<28) - 1;  
 35 |     /* aa/bb hold (low half + high half) of each operand, Karatsuba style. */
 36 |     uint32_t aa[8], bb[8];
 37 |     
 38 |     int i,j;
 39 |     for (i=0; i<8; i++) {
 40 |         aa[i] = a[i] + a[i+8];
 41 |         bb[i] = b[i] + b[i+8];
 42 |     }
 43 |     /* Column j yields output words j (from accum0) and j+8 (from accum1); both carry into the next column. */
 44 |     for (j=0; j<8; j++) {
 45 |         accum2 = 0;
 46 |         /* Terms with i <= j: low*low -> accum2, sum*sum -> accum1, high*high -> accum0. */
 47 |         for (i=0; i<=j; i++) {      
 48 |             accum2 += widemul_32(a[j-i],b[i]);
 49 |             accum1 += widemul_32(aa[j-i],bb[i]);
 50 |             accum0 += widemul_32(a[8+j-i], b[8+i]);
 51 |         }
 52 |         
 53 |         accum1 -= accum2;
 54 |         accum0 += accum2;
 55 |         accum2 = 0;
 56 |         /* Terms with i > j, where the index j-i wraps past the top of a half; presumably the reduction mod p - confirm. */
 57 |         for (; i<8; i++) {
 58 |             accum0 -= widemul_32(a[8+j-i], b[i]);
 59 |             accum2 += widemul_32(aa[8+j-i], bb[i]);
 60 |             accum1 += widemul_32(a[16+j-i], b[8+i]);
 61 |         }
 62 | 
 63 |         accum1 += accum2;
 64 |         accum0 += accum2;
 65 |         /* Emit the low 28 bits of each accumulator and keep the rest as carry. */
 66 |         c[j] = ((uint32_t)(accum0)) & mask;
 67 |         c[j+8] = ((uint32_t)(accum1)) & mask;
 68 | 
 69 |         accum0 >>= 28;
 70 |         accum1 >>= 28;
 71 |     }
 72 |     /* Fold the two leftover carries back into words 8 and 0, then pass what remains to words 9 and 1. */
 73 |     accum0 += accum1;
 74 |     accum0 += c[8];
 75 |     accum1 += c[0];
 76 |     c[8] = ((uint32_t)(accum0)) & mask;
 77 |     c[0] = ((uint32_t)(accum1)) & mask;
 78 |     
 79 |     accum0 >>= 28;
 80 |     accum1 >>= 28;
 81 |     c[9] += ((uint32_t)(accum0));
 82 |     c[1] += ((uint32_t)(accum1));
 83 | }
 84 | 
 85 | void
 86 | p448_mulw (
 87 |     p448_t *__restrict__ cs,
 88 |     const p448_t *as,
 89 |     uint64_t b
 90 | ) {
 91 |     const uint32_t bhi = b>>28, blo = b & ((1<<28)-1);
 92 |     /* Multiplies as by the integer b into cs; b is split as blo + bhi*2^28 (bhi is b>>28 truncated to 32 bits). */
 93 |     const uint32_t *a = as->limb;
 94 |     uint32_t *c = cs->limb;
 95 | 
 96 |     uint64_t accum0, accum8;
 97 |     uint32_t mask = (1ull<<28)-1;  
 98 | 
 99 |     int i;
100 |     /* Words 0 and 8 first: the bhi term of word 0 comes from a[15] wrapping around, and it also feeds word 8. */
101 |     accum0 = widemul_32(blo, a[0]);
102 |     accum8 = widemul_32(blo, a[8]);
103 |     accum0 += widemul_32(bhi, a[15]);
104 |     accum8 += widemul_32(bhi, a[15] + a[7]);
105 | 
106 |     c[0] = accum0 & mask; accum0 >>= 28;
107 |     c[8] = accum8 & mask; accum8 >>= 28;
108 |     /* Remaining words: accum0 walks words 1..7 while accum8 walks words 9..15. */
109 |     for (i=1; i<8; i++) {
110 |         accum0 += widemul_32(blo, a[i]);
111 |         accum8 += widemul_32(blo, a[i+8]);
112 |         
113 |         accum0 += widemul_32(bhi, a[i-1]);
114 |         accum8 += widemul_32(bhi, a[i+7]);
115 | 
116 |         c[i] = accum0 & mask; accum0 >>= 28;
117 |         c[i+8] = accum8 & mask; accum8 >>= 28;
118 |     }
119 |     /* Final carries: the carry out of word 7 and the carry out of word 15 both land on word 8 ... */
120 |     accum0 += accum8 + c[8];
121 |     c[8] = accum0 & mask;
122 |     c[9] += accum0 >> 28;
123 |     /* ... and the carry out of word 15 is also added at word 0. */
124 |     accum8 += c[0];
125 |     c[0] = accum8 & mask;
126 |     c[1] += accum8 >> 28;
127 | }
128 | 
129 | void
130 | p448_sqr (
131 |     p448_t *__restrict__ cs,
132 |     const p448_t *as
133 | ) {
134 |     p448_mul(cs,as,as); /* PERF: no dedicated squaring here; reuses the general multiply with both operands = as */
135 | }
136 | 
137 | void
138 | p448_strong_reduce (
139 |     p448_t *a
140 | ) {
141 |     word_t mask = (1ull<<28)-1;
142 |     /* Subtracts p once and adds it back if that went negative, using masks instead of branches. */
143 |     /* first, clear high: the excess of limb 15 re-enters at limbs 8 and 0 */
144 |     a->limb[8] += a->limb[15]>>28;
145 |     a->limb[0] += a->limb[15]>>28;
146 |     a->limb[15] &= mask;
147 | 
148 |     /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */
149 |     /* NOTE(review): that bound is phrased in terms of 56-bit limbs; confirm it for this 28-bit layout. */
150 |     /* compute total_value - p.  No need to reduce mod p. */
151 |     /* p's limbs are all mask, except limb 8 which is mask-1. */
152 |     dsword_t scarry = 0;
153 |     int i;
154 |     for (i=0; i<16; i++) {
155 |         scarry = scarry + a->limb[i] - ((i==8)?mask-1:mask);
156 |         a->limb[i] = scarry & mask;
157 |         scarry >>= 28;
158 |     }
159 | 
160 |     /* uncommon case: it was >= p, so now scarry = 0 and this = x - p
161 |     * common case: it was < p, so now scarry = -1 and this = x - p + 2^448
162 |     * so let's add back in p.  will carry back off the top for 2^448.
163 |     */
164 | 
165 |     assert(is_zero(scarry) | is_zero(scarry+1));
166 |     /* scarry_mask is 0 when nothing has to be added back, and mask when p has to be re-added. */
167 |     word_t scarry_mask = scarry & mask;
168 |     dword_t carry = 0;
169 | 
170 |     /* add it back */
171 |     for (i=0; i<16; i++) {
172 |         carry = carry + a->limb[i] + ((i==8)?(scarry_mask&~1):scarry_mask);
173 |         a->limb[i] = carry & mask;
174 |         carry >>= 28;
175 |     }
176 |     /* the carry out of the top limb must cancel the borrow recorded in scarry */
177 |     assert(is_zero(carry + scarry));
178 | }
179 | 
180 | void
181 | p448_serialize (
182 |     uint8_t *serial,
183 |     const struct p448_t *x
184 | ) {
185 |     /* Strongly reduce a copy of x, then write each pair of 28-bit limbs as 7 bytes, low byte first. */
186 |     p448_t canon;
187 |     int pair, byte;
188 |     p448_copy(&canon, x);
189 |     p448_strong_reduce(&canon);
190 |     for (pair=0; pair<8; pair++) {
191 |         uint64_t word56 = canon.limb[2*pair] + (((uint64_t)canon.limb[2*pair+1])<<28);
192 |         for (byte=0; byte<7; byte++, word56 >>= 8) {
193 |             serial[7*pair+byte] = word56;
194 |         }
195 |         assert(word56 == 0);
196 |     }
197 | }
198 | 
199 | mask_t
200 | p448_deserialize (
201 |     p448_t *x,
202 |     const uint8_t serial[56]
203 | ) {
204 |     const uint32_t mask = (1ull<<28)-1;
205 |     uint32_t ge = -1;
206 |     int pair, k;
207 | 
208 |     /* Unpack: every 7 input bytes (low byte first) form 56 bits, which
209 |      * are split into two consecutive 28-bit limbs. */
210 |     for (pair=0; pair<8; pair++) {
211 |         uint64_t word56 = 0;
212 |         for (k=0; k<7; k++) {
213 |             word56 |= ((uint64_t)serial[7*pair+k])<<(8*k);
214 |         }
215 |         x->limb[2*pair] = word56 & ((1ull<<28)-1);
216 |         x->limb[2*pair+1] = word56 >> 28;
217 |     }
218 | 
219 |     /* Branch-free range check.  p has every limb equal to 2^28-1 except
220 |      * limb 8, which is 2^28-2.  ge stays equal to mask (28 ones) exactly
221 |      * while the limbs seen so far are >= the matching limbs of p.
222 |      */
223 |     for (k=0; k<8; k++) {
224 |         ge &= x->limb[k];
225 |     }
226 | 
227 |     /* Limb 8: keep ge if the limb is 2^28-2, force it on if it is 2^28-1. */
228 |     ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask);
229 | 
230 |     for (k=9; k<16; k++) {
231 |         ge &= x->limb[k];
232 |     }
233 | 
234 |     return ~is_zero(ge ^ mask);   /* complement of "ge == mask", i.e. of "value >= p" */
235 | }
236 | 
237 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2014 Cryptography Research, Inc.
  2 | # Released under the MIT License.  See LICENSE.txt for license information.
  3 | 
  4 | 
  5 | UNAME := $(shell uname)
  6 | MACHINE := $(shell uname -m)
  7 | 
  8 | ifeq ($(UNAME),Darwin)
  9 | CC = clang
 10 | CXX = clang++
 11 | else
 12 | CC = gcc
 13 | CXX = g++
 14 | endif
 15 | LD = $(CC)
 16 | LDXX = $(CXX)
 17 | ASM ?= $(CC)
 18 | 
 19 | DECAF ?= decaf_fast
 20 | 
 21 | ifneq (,$(findstring x86_64,$(MACHINE)))
 22 | ARCH ?= arch_x86_64
 23 | else
 24 | # no i386 port yet
 25 | ARCH ?= arch_arm_32
 26 | endif
 27 | 
 28 | FIELD ?= p448
 29 | 
 30 | WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
 31 | 	 -Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)
 32 | 	 
 33 | 	 
 34 | INCFLAGS = -Isrc/include -Iinclude -Isrc/$(FIELD) -Isrc/$(FIELD)/$(ARCH)
 35 | LANGFLAGS = -std=c99 -fno-strict-aliasing
 36 | LANGXXFLAGS = -fno-strict-aliasing
 37 | GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC
 38 | OFLAGS ?= -O3
 39 | 
 40 | TODAY = $(shell date "+%Y-%m-%d")
 41 | 
 42 | ifneq (,$(findstring arm,$(MACHINE)))
 43 | ifneq (,$(findstring neon,$(ARCH)))
 44 | ARCHFLAGS += -mfpu=neon
 45 | else
 46 | ARCHFLAGS += -mfpu=vfpv3-d16
 47 | endif
 48 | ARCHFLAGS += -mcpu=cortex-a8 # FIXME
 49 | GENFLAGS += -DN_TESTS_BASE=1000 # sooooo sloooooow
 50 | else
 51 | ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO
 52 | endif
 53 | 
 54 | ifeq ($(CC),clang)
 55 | WARNFLAGS += -Wgcc-compat
 56 | endif
 57 | 
 58 | SAGE ?= sage
 59 | SAGES= $(shell ls test/*.sage)
 60 | BUILDPYS= $(SAGES:test/%.sage=build/%.py)
 61 | 
 62 | ARCHFLAGS += $(XARCHFLAGS)
 63 | CFLAGS  = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XCFLAGS)
 64 | CXXFLAGS = $(LANGXXFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XCXXFLAGS) 
 65 | LDFLAGS = $(ARCHFLAGS) $(XLDFLAGS)
 66 | ASFLAGS = $(ARCHFLAGS) $(XASFLAGS)
 67 | 
 68 | .PHONY: clean all test bench microbench scan todo doc lib bat sage sagetest # command targets, never files
 69 | .PRECIOUS: build/%.s
 70 | 
 71 | HEADERS= Makefile $(shell find src include test -name "*.h") $(shell find . -name "*.hxx") build/timestamp
 72 | 
 73 | 
 74 | DECAFCOMPONENTS= build/$(DECAF).o build/shake.o build/decaf_crypto.o \
 75 | 	build/$(FIELD).o build/f_arithmetic.o # TODO
 76 | ifeq ($(DECAF),decaf_fast)
 77 | DECAFCOMPONENTS += build/decaf_tables.o
 78 | endif
 79 | 
 80 | BENCHCOMPONENTS = build/bench.o build/shake.o
 81 | 
 82 | BATBASE=ed448goldilocks_decaf_bats_$(TODAY)
 83 | BATNAME=build/$(BATBASE)
 84 | 
 85 | all: lib  build/test build/bench build/shakesum
 86 | 
 87 | scan: clean # full rebuild under the clang static analyzer; $(MAKE) keeps flags and jobserver of the parent make
 88 | 	scan-build --use-analyzer=`which clang` \
 89 | 		 -enable-checker deadcode -enable-checker llvm \
 90 | 		 -enable-checker osx -enable-checker security -enable-checker unix \
 91 | 		$(MAKE) build/bench build/test all
 92 | 
 93 | build/test: build/test_decaf.o lib
 94 | ifeq ($(UNAME),Darwin)
 95 | 	$(LDXX) $(LDFLAGS) -o $@ $< -Lbuild -ldecaf
 96 | else
 97 | 	$(LDXX) $(LDFLAGS) -Wl,-rpath,`pwd`/build -o $@ $< -Lbuild -ldecaf
 98 | endif
 99 | 
100 | build/bench: build/bench_decaf.o lib
101 | ifeq ($(UNAME),Darwin)
102 | 	$(LDXX) $(LDFLAGS) -o $@ $< -Lbuild -ldecaf
103 | else
104 | 	$(LDXX) $(LDFLAGS) -Wl,-rpath,`pwd`/build -o $@ $< -Lbuild -ldecaf
105 | endif
106 | 	
107 | build/shakesum: build/shakesum.o build/shake.o
108 | 	$(LD) $(LDFLAGS) -o $@ $^
109 | 
110 | lib: build/libdecaf.so
111 | 
112 | build/libdecaf.so: $(DECAFCOMPONENTS)
113 | 	rm -f $@
114 | ifeq ($(UNAME),Darwin)
115 | 	libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \
116 | 		  $(DECAFCOMPONENTS)
117 | else
118 | 	$(LD) $(LDFLAGS) -shared -Wl,-soname,libdecaf.so.1 -Wl,--gc-sections -o $@ $(DECAFCOMPONENTS)
119 | 	strip --discard-all $@
120 | 	ln -sf `basename $@` build/libdecaf.so.1
121 | endif
122 | 
123 | build/timestamp:
124 | 	mkdir -p build
125 | 	touch $@
126 | # Objects are assembled from .s files; the rules below first compile every source to assembly.
127 | build/%.o: build/%.s
128 | 	$(ASM) $(ASFLAGS) -c -o $@ $<
129 | # The table generator is linked without decaf_tables.o, because it is what produces that file's source.
130 | build/decaf_gen_tables: build/decaf_gen_tables.o build/$(DECAF).o build/$(FIELD).o build/f_arithmetic.o
131 | 	$(LD) $(LDFLAGS) -o $@ $^
132 | # Running the generator writes the table source to build/decaf_tables.c.
133 | build/decaf_tables.c: build/decaf_gen_tables
134 | 	./$< > $@
135 | # The generated source lives in build/, so it needs its own compile rule.
136 | build/decaf_tables.s: build/decaf_tables.c $(HEADERS)
137 | 	$(CC) $(CFLAGS) -S -c -o $@ $<
138 | # Pattern rules: one per directory that can hold the source for build/<name>.s.
139 | build/%.s: src/%.c $(HEADERS)
140 | 	$(CC) $(CFLAGS) -S -c -o $@ $<
141 | 	
142 | build/%.s: src/%.cxx $(HEADERS)
143 | 	$(CXX) $(CXXFLAGS) -S -c -o $@ $<
144 | 
145 | build/%.s: test/%.c $(HEADERS)
146 | 	$(CC) $(CFLAGS) -S -c -o $@ $<
147 | 
148 | build/%.s: test/%.cxx $(HEADERS)
149 | 	$(CXX) $(CXXFLAGS) -S -c -o $@ $<
150 | # Field arithmetic: architecture-specific sources first, then the field's shared sources.
151 | build/%.s: src/$(FIELD)/$(ARCH)/%.c $(HEADERS)
152 | 	$(CC) $(CFLAGS) -S -c -o $@ $<
153 | 
154 | build/%.s: src/$(FIELD)/%.c $(HEADERS)
155 | 	$(CC) $(CFLAGS) -S -c -o $@ $<
156 | 	
157 | sage: $(BUILDPYS)
158 | 
159 | sagetest: sage lib # run the Sage test script against the freshly built library, honouring the SAGE override
160 | 	LD_LIBRARY_PATH=build $(SAGE) build/test_decaf.sage
161 | 
162 | $(BUILDPYS): $(SAGES) build/timestamp
163 | 	cp -f $(SAGES) build/
164 | 	$(SAGE) --preparse $(SAGES:test/%.sage=build/%.sage)
165 | 	# some sage versions compile to .sage.py
166 | 	for f in $(SAGES:test/%.sage=build/%); do \
167 | 		 if [ -e $$f.sage.py ]; then \
168 | 		 	 mv $$f.sage.py $$f.py; \
169 | 		 fi; \
170 | 	  done
171 | 
172 | doc/timestamp:
173 | 	mkdir -p doc
174 | 	touch $@
175 | 
176 | doc: Doxyfile doc/timestamp include/*.h src/*.c src/include/*.h src/$(FIELD)/$(ARCH)/*.c src/$(FIELD)/$(ARCH)/*.h
177 | 	doxygen > /dev/null
178 | 
179 | bat: $(BATNAME)
180 | # Rebuild the $(BATNAME) tree from scratch: for dh and sign, one directory per arch in test/batarch.map with the sources copied in and api.h patched (SYSNAME, __TODAY__); then tar it up.
181 | $(BATNAME): include/* src/* src/*/* test/batarch.map build/decaf_tables.c # TODO tables some other way
182 | 	rm -fr $@
183 | 	for prim in dh sign; do \
184 |           targ="$@/crypto_$$prim/ed448goldilocks_decaf"; \
185 | 	  (while read arch where; do \
186 | 	    mkdir -p $$targ/`basename $$arch`; \
187 | 	    cp include/*.h build/decaf_tables.c src/decaf_fast.c src/decaf_crypto.c src/shake.c src/include/*.h src/bat/$$prim.c src/p448/$$where/*.c src/p448/$$where/*.h src/p448/*.c src/p448/*.h $$targ/`basename $$arch`; \
188 | 	    cp src/bat/api_$$prim.h $$targ/`basename $$arch`/api.h; \
189 | 	    perl -p -i -e 's/SYSNAME/'`basename $(BATNAME)`_`basename $$arch`'/g' $$targ/`basename $$arch`/api.h;  \
190 | 	    perl -p -i -e 's/__TODAY__/'$(TODAY)'/g' $$targ/`basename $$arch`/api.h;  \
191 | 	    done \
192 | 	  ) < test/batarch.map; \
193 | 	  echo 'Mike Hamburg' > $$targ/designers; \
194 | 	  echo 'Ed448-Goldilocks Decaf sign and dh' > $$targ/description; \
195 |         done
196 | 	(cd build && tar czf $(BATBASE).tgz $(BATBASE) )
197 | 	
198 | # Print every marker word (TODO, FIXME, ...) found in *.h/*.c files, then a per-marker line count and a grand total.
199 | todo::
200 | 	@(find * -name '*.h'; find * -name '*.c') | xargs egrep --color=auto -w \
201 | 		'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC'
202 | 	@echo '============================='
203 | 	@(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE MAGIC; do \
204 | 	  (find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i > /dev/null || continue; \
205 | 	  /bin/echo -n $$i'       ' | head -c 10; \
206 | 	  (find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i| wc -l; \
207 | 	done)
208 | 	@echo '============================='
209 | 	@echo -n 'Total     '
210 | 	@(find * -name '*.h'; find * -name '*.c') | xargs egrep -w \
211 | 		'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE|MAGIC' | wc -l
212 | # Build build/bench if needed and run it with no arguments.
213 | bench: build/bench
214 | 	./$<
215 | # Build and run build/test. NOTE(review): a test/ directory exists in the tree, so this target presumably needs to be .PHONY -- confirm it is declared above.
216 | test: build/test
217 | 	build/test
218 | # Same binary as "bench", run with the --micro flag.
219 | microbench: build/bench
220 | 	./$< --micro
221 | # Remove all generated output: the build/ and doc/ directories and the $(BATNAME) tree.
222 | clean:
223 | 	rm -fr build doc $(BATNAME)
224 | 


--------------------------------------------------------------------------------
/src/decaf_crypto.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @cond internal
  3 |  * @file decaf_crypto.c
  4 |  * @copyright
  5 |  *   Copyright (c) 2015 Cryptography Research, Inc.  \n
  6 |  *   Released under the MIT License.  See LICENSE.txt for license information.
  7 |  * @author Mike Hamburg
  8 |  * @brief Example Decaf crypto routines.
  9 |  */
 10 | 
 11 | #include "decaf_crypto.h"
 12 | #include <string.h>
 13 | /**
    |  * Byte length of the hash outputs that this file reduces to scalars with
    |  * decaf_448_scalar_decode_long(): 8 bytes more than an encoded scalar.
    |  * Presumably the extra width is there to shrink the bias of the modular
    |  * reduction -- confirm against the scalar code.
    |  *
    |  * Note: this is a const object, not a C constant expression, so the local
    |  * arrays sized by it below are variable-length arrays in C.
    |  */
 14 | static const unsigned int DECAF_448_SCALAR_OVERKILL_BYTES = DECAF_448_SCALAR_BYTES + 8;
 15 | /**
    |  * Expand a symmetric proto-key into a complete private key.
    |  *
    |  * SHAKE256 is run over the proto-key followed by a fixed domain string and
    |  * the output is decoded into priv->secret_scalar.  The proto-key itself is
    |  * copied to priv->sym, and the encoding of (secret scalar * precomputed
    |  * base) is stored in priv->pub.  The intermediate hash output is zeroed.
    |  *
    |  * @param priv   Private key structure to fill in.
    |  * @param proto  Symmetric key to derive from.
    |  */
 16 | void decaf_448_derive_private_key (
 17 |     decaf_448_private_key_t priv,
 18 |     const decaf_448_symmetric_key_t proto
 19 | ) {
 20 |     static const char domain[] = "decaf_448_derive_private_key";
 21 |     uint8_t hash_out[DECAF_448_SCALAR_OVERKILL_BYTES];
 22 |     decaf_448_point_t pub_point;
 23 |     keccak_sponge_t ctx;
 24 | 
 25 |     /* hash_out = SHAKE256(proto || domain), domain without its NUL. */
 26 |     shake256_init(ctx);
 27 |     shake256_update(ctx, proto, sizeof(decaf_448_symmetric_key_t));
 28 |     shake256_update(ctx, (const unsigned char *)domain, sizeof(domain) - 1);
 29 |     shake256_final(ctx, hash_out, sizeof(hash_out));
 30 |     shake256_destroy(ctx);
 31 |     /* Keep the seed, then reduce the hash output to the secret scalar. */
 32 |     memcpy(priv->sym, proto, sizeof(decaf_448_symmetric_key_t));
 33 |     decaf_448_scalar_decode_long(priv->secret_scalar, hash_out, sizeof(hash_out));
 34 |     /* Public key = encoding of secret_scalar * base. */
 35 |     decaf_448_precomputed_scalarmul(pub_point, decaf_448_precomputed_base, priv->secret_scalar);
 36 |     decaf_448_point_encode(priv->pub, pub_point);
 37 |     decaf_bzero(hash_out, sizeof(hash_out));
 38 | }
 39 | /**
    |  * Erase a private key by zeroing the whole structure with decaf_bzero().
    |  *
    |  * @param priv  Private key to wipe.
    |  */
 40 | void decaf_448_destroy_private_key (decaf_448_private_key_t priv)
 41 | {
 42 |     void *key_mem = priv;
 43 | 
 44 |     decaf_bzero(key_mem, sizeof(decaf_448_private_key_t));
 45 | }
 46 | /**
    |  * Copy the encoded public key stored inside a private key.
    |  *
    |  * @param pub   Receives sizeof(decaf_448_public_key_t) bytes from priv->pub.
    |  * @param priv  Private key to read from.
    |  */
 47 | void decaf_448_private_to_public (decaf_448_public_key_t pub, const decaf_448_private_key_t priv)
 48 | {
 49 |     const size_t pub_len = sizeof(decaf_448_public_key_t);
 50 | 
 51 |     memcpy(pub, priv->pub, pub_len);
 52 | }
 53 | /**
    |  * Derive a shared secret from my private key and the peer's public key.
    |  *
    |  * The output is SHAKE256(lesser pubkey || greater pubkey || dh), where the
    |  * two encoded public keys are ordered by a branch-free comparison and dh is
    |  * the result of decaf_448_direct_scalarmul().  If that call reports failure,
    |  * dh is replaced by my symmetric key followed by "decaf_448_ss_invalid"
    |  * (remaining bytes zero), so an output is still produced.
    |  *
    |  * @param shared        Output buffer.
    |  * @param shared_bytes  Number of output bytes to squeeze.
    |  * @param my_privkey    My private key.
    |  * @param your_pubkey   The peer's encoded public key.
    |  * @return The value returned by decaf_448_direct_scalarmul(); the masking
    |  *         below relies on it being all-zero or all-one bits.
    |  */
 54 | decaf_bool_t
 55 | decaf_448_shared_secret (
 56 |     uint8_t *shared,
 57 |     size_t shared_bytes,
 58 |     const decaf_448_private_key_t my_privkey,
 59 |     const decaf_448_public_key_t your_pubkey
 60 | ) {
 61 |     static const char invalid_tag[] = "decaf_448_ss_invalid";
 62 |     const size_t sym_len = sizeof(my_privkey->sym);
 63 |     const size_t tag_len = sizeof(invalid_tag) - 1;
 64 |     uint8_t buf[DECAF_448_SER_BYTES];
 65 |     keccak_sponge_t sponge;
 66 |     decaf_bool_t ok;
 67 |     uint16_t mine_less = 0;
 68 |     uint8_t pick_mine, pick_yours;
 69 |     unsigned i;
 70 | 
 71 |     /*
 72 |      * Branch-free lexicographic comparison of the two encoded public keys.
 73 |      * Per byte, with diff = mine - yours reduced mod 2^16:
 74 |      *   equal   -> accumulator unchanged;
 75 |      *   greater -> high byte cleared;
 76 |      *   less    -> high byte set to 0xff.
 77 |      * The last differing byte therefore decides the result.
 78 |      */
 79 |     for (i = 0; i < DECAF_448_SER_BYTES; i++) {
 80 |         uint16_t diff = (uint16_t)(my_privkey->pub[i] - your_pubkey[i]);
 81 |         mine_less = (uint16_t)((mine_less & (diff - 1)) | diff);
 82 |     }
 83 |     pick_mine = (uint8_t)(mine_less >> 8);  /* 0xff iff my key is the lesser */
 84 |     pick_yours = (uint8_t)~pick_mine;
 85 |     shake256_init(sponge);
 86 | 
 87 |     /* Absorb the lesser key first ... */
 88 |     for (i = 0; i < sizeof(buf); i++) {
 89 |         buf[i] = (uint8_t)((my_privkey->pub[i] & pick_mine) | (your_pubkey[i] & pick_yours));
 90 |     }
 91 |     shake256_update(sponge, buf, sizeof(buf));
 92 |     /* ... then the greater one. */
 93 |     for (i = 0; i < sizeof(buf); i++) {
 94 |         buf[i] = (uint8_t)((my_privkey->pub[i] & pick_yours) | (your_pubkey[i] & pick_mine));
 95 |     }
 96 |     shake256_update(sponge, buf, sizeof(buf));
 97 | 
 98 |     ok = decaf_448_direct_scalarmul(buf, your_pubkey, my_privkey->secret_scalar, DECAF_FALSE, DECAF_TRUE);
 99 |     /* If the scalarmul failed (ok == 0), substitute sym || tag, selected by mask. */
100 |     for (i = 0; i < sizeof(buf); i++) {
101 |         uint8_t fallback = 0;
102 |         if (i < sym_len) {
103 |             fallback = my_privkey->sym[i];
104 |         } else if (i - sym_len < tag_len) {
105 |             fallback = (uint8_t)invalid_tag[i - sym_len];
106 |         }
107 |         buf[i] = (uint8_t)((buf[i] & ok) | (fallback & ~ok));
108 |     }
109 | 
110 |     shake256_update(sponge, buf, sizeof(buf));
111 |     shake256_final(sponge, shared, shared_bytes);
112 |     shake256_destroy(sponge);
113 |     decaf_bzero(buf, sizeof(buf));
114 |     return ok;
115 | }
115 | /**
    |  * Sign the message already absorbed into a SHAKE256 sponge.
    |  *
    |  * Two copies of the caller's sponge state are used: one, extended with
    |  * priv->sym and a fixed tag, yields the nonce; the other, extended with
    |  * priv->pub and the encoded nonce point, yields the challenge.  The
    |  * signature is the encoded nonce point followed by the encoded scalar
    |  * nonce - challenge * secret_scalar.  The caller's sponge is not modified.
    |  *
    |  * @param sig    Output signature.
    |  * @param priv   Signing key.
    |  * @param shake  Sponge state holding the message.
    |  */
116 | void
117 | decaf_448_sign_shake (
118 |     decaf_448_signature_t sig,
119 |     const decaf_448_private_key_t priv,
120 |     const keccak_sponge_t shake
121 | ) {
122 |     static const char nonce_tag[] = "decaf_448_sign_shake";
123 |     uint8_t wide[DECAF_448_SCALAR_OVERKILL_BYTES];
124 |     uint8_t r_bytes[DECAF_448_SER_BYTES];
125 |     decaf_448_point_t r_point;
126 |     decaf_448_scalar_t k, c;
127 |     keccak_sponge_t work;
128 | 
129 |     /* Nonce k from (sponge state || sym || tag); R = k * base, encoded. */
130 |     memcpy(work, shake, sizeof(work));
131 |     shake256_update(work, priv->sym, sizeof(priv->sym));
132 |     shake256_update(work, (const unsigned char *)nonce_tag, sizeof(nonce_tag) - 1);
133 |     shake256_final(work, wide, sizeof(wide));
134 | 
135 |     decaf_448_scalar_decode_long(k, wide, sizeof(wide));
136 |     decaf_448_precomputed_scalarmul(r_point, decaf_448_precomputed_base, k);
137 |     decaf_448_point_encode(r_bytes, r_point);
138 | 
139 |     /* Challenge c from (sponge state || pub || R). */
140 |     memcpy(work, shake, sizeof(work));
141 |     shake256_update(work, priv->pub, sizeof(priv->pub));
142 |     shake256_update(work, r_bytes, sizeof(r_bytes));
143 |     shake256_final(work, wide, sizeof(wide));
144 |     shake256_destroy(work);
145 |     decaf_448_scalar_decode_long(c, wide, sizeof(wide));
146 | 
147 |     /* Response k - c * secret_scalar, left in k. */
148 |     decaf_448_scalar_mul(c, c, priv->secret_scalar);
149 |     decaf_448_scalar_sub(k, k, c);
150 | 
151 |     /* Signature = R || response. */
152 |     memcpy(sig, r_bytes, sizeof(r_bytes));
153 |     decaf_448_scalar_encode(sig + sizeof(r_bytes), k);
154 | 
155 |     /* Wipe the temporaries. */
156 |     decaf_448_scalar_destroy(k);
157 |     decaf_448_scalar_destroy(c);
158 |     decaf_bzero(wide, sizeof(wide));
159 |     decaf_bzero(r_bytes, sizeof(r_bytes));
160 | }
161 | /**
    |  * Verify a signature over the message already absorbed into a sponge.
    |  *
    |  * The challenge is recomputed from a copy of the caller's sponge state
    |  * extended with the public key and the first DECAF_448_SER_BYTES of the
    |  * signature.  The signature point, public key and response scalar are then
    |  * decoded, and the point produced by the double scalar multiplication is
    |  * compared with the signature point.
    |  *
    |  * @param sig    Signature to check.
    |  * @param pub    Signer's encoded public key.
    |  * @param shake  Sponge state holding the message.
    |  * @return The AND of all decode results and the final point comparison.
    |  */
162 | decaf_bool_t
163 | decaf_448_verify_shake (
164 |     const decaf_448_signature_t sig,
165 |     const decaf_448_public_key_t pub,
166 |     const keccak_sponge_t shake
167 | ) {
168 |     uint8_t wide[DECAF_448_SCALAR_OVERKILL_BYTES];
169 |     decaf_448_point_t r_point, check_point;
170 |     decaf_448_scalar_t c, s;
171 |     keccak_sponge_t work;
172 |     decaf_bool_t ok;
173 | 
174 |     /* Challenge c from (sponge state || pub || first half of sig). */
175 |     memcpy(work, shake, sizeof(work));
176 |     shake256_update(work, pub, sizeof(decaf_448_public_key_t));
177 |     shake256_update(work, sig, DECAF_448_SER_BYTES);
178 |     shake256_final(work, wide, sizeof(wide));
179 |     shake256_destroy(work);
180 |     decaf_448_scalar_decode_long(c, wide, sizeof(wide));
181 | 
182 |     /* Decode the signature point, the public key and the response scalar. */
183 |     ok = decaf_448_point_decode(r_point, sig, DECAF_TRUE);
184 |     ok &= decaf_448_point_decode(check_point, pub, DECAF_FALSE);
185 |     ok &= decaf_448_scalar_decode(s, sig + DECAF_448_SER_BYTES);
186 | 
187 |     /* Presumably check_point <- s * base + c * pubkey (going by the callee's
188 |      * name and arguments; confirm against its declaration). */
189 |     decaf_448_base_double_scalarmul_non_secret(check_point, s, check_point, c);
190 | 
191 |     /* Valid only if that point equals the one carried in the signature. */
192 |     ok &= decaf_448_point_eq(check_point, r_point);
193 | 
194 |     return ok;
195 | }
196 | /**
    |  * Sign a byte string: absorb it into a fresh SHAKE256 sponge and hand the
    |  * sponge to decaf_448_sign_shake().
    |  *
    |  * @param sig          Output signature.
    |  * @param priv         Signing key.
    |  * @param message      Message bytes.
    |  * @param message_len  Length of message in bytes.
    |  */
197 | void decaf_448_sign (
198 |     decaf_448_signature_t sig,
199 |     const decaf_448_private_key_t priv,
200 |     const unsigned char *message,
201 |     size_t message_len
202 | ) {
203 |     keccak_sponge_t msg_hash;
204 |     /* Absorb the message, then let the sponge-based signer do the rest. */
205 |     shake256_init(msg_hash);
206 |     shake256_update(msg_hash, message, message_len);
207 |     decaf_448_sign_shake(sig, priv, msg_hash);
208 |     shake256_destroy(msg_hash);
209 | }
210 | /**
    |  * Verify a signature over a byte string: absorb the message into a fresh
    |  * SHAKE256 sponge and hand the sponge to decaf_448_verify_shake().
    |  *
    |  * @param sig          Signature to check.
    |  * @param pub          Signer's encoded public key.
    |  * @param message      Message bytes.
    |  * @param message_len  Length of message in bytes.
    |  * @return The result of decaf_448_verify_shake().
    |  */
211 | decaf_bool_t decaf_448_verify (
212 |     const decaf_448_signature_t sig,
213 |     const decaf_448_public_key_t pub,
214 |     const unsigned char *message,
215 |     size_t message_len
216 | ) {
217 |     keccak_sponge_t msg_hash;
218 |     decaf_bool_t ok;
219 |     shake256_init(msg_hash);
220 |     shake256_update(msg_hash, message, message_len);
221 |     ok = decaf_448_verify_shake(sig, pub, msg_hash);
222 |     shake256_destroy(msg_hash);
223 |     return ok;
224 | }
225 | 


--------------------------------------------------------------------------------
/src/p448/arch_x86_64/x86-64-arith.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | 
  5 | #ifndef __X86_64_ARITH_H__
  6 | #define __X86_64_ARITH_H__
  7 | 
  8 | #include <stdint.h>
  9 | 
 10 | /* TODO: non x86-64 versions of these.
 11 |  * FUTURE: autogenerate
 12 |  */
 13 | /**
    |  * Full 64x64 -> 128-bit unsigned multiply of two memory operands.
    |  *
    |  * Without BMI2 this is movq + mulq with the product in rdx:rax; with BMI2
    |  * it is mulx with *a loaded into rdx.
    |  *
    |  * @param a  Pointer to the first factor.
    |  * @param b  Pointer to the second factor.
    |  * @return   The 128-bit product (*a) * (*b).
    |  */
 14 | static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
 15 |   #ifndef __BMI2__
 16 |   uint64_t c,d;
    |   /* rax is overwritten by the movq before mulq reads *b, so the rax output
    |    * must be early-clobber ("=&a"); otherwise the compiler is free to
    |    * address *b through rax. */
 17 |   __asm__ volatile
 18 |       ("movq %[a], %%rax;"
 19 |        "mulq %[b];"
 20 |        : [c]"=&a"(c), [d]"=d"(d)
 21 |        : [b]"m"(*b), [a]"m"(*a)
 22 |        : "cc");
 23 |   return (((__uint128_t)(d))<<64) | c;
 24 |   #else
 25 |   uint64_t c,d;
 26 |   __asm__ volatile
 27 |       ("movq %[a], %%rdx;"
 28 |        "mulx %[b], %[c], %[d];"
 29 |        : [c]"=r"(c), [d]"=r"(d)
 30 |        : [b]"m"(*b), [a]"m"(*a)
 31 |        : "rdx");
 32 |   return (((__uint128_t)(d))<<64) | c;
 33 |   #endif
 34 | }
 35 | /**
    |  * Full 64x64 -> 128-bit unsigned multiply of a value by a memory operand.
    |  *
    |  * @param a  First factor, passed by value.
    |  * @param b  Pointer to the second factor.
    |  * @return   The 128-bit product a * (*b).
    |  */
 36 | static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
 37 |   #ifndef __BMI2__
 38 |   uint64_t c,d;
    |   /* rax is overwritten by the movq before mulq reads *b, so the rax output
    |    * must be early-clobber ("=&a"); otherwise the compiler is free to
    |    * address *b through rax. */
 39 |   __asm__ volatile
 40 |       ("movq %[a], %%rax;"
 41 |        "mulq %[b];"
 42 |        : [c]"=&a"(c), [d]"=d"(d)
 43 |        : [b]"m"(*b), [a]"r"(a)
 44 |        : "cc");
 45 |   return (((__uint128_t)(d))<<64) | c;
 46 |   #else
 47 |   uint64_t c,d;
    |   /* mulx takes its implicit factor from rdx, hence the "d" constraint on a. */
 48 |   __asm__ volatile
 49 |       ("mulx %[b], %[c], %[d];"
 50 |        : [c]"=r"(c), [d]"=r"(d)
 51 |        : [b]"m"(*b), [a]"d"(a));
 52 |   return (((__uint128_t)(d))<<64) | c;
 53 |   #endif
 54 | }
 55 | /**
    |  * Doubled wide multiply: (2 * (*a)) * (*b) as a 128-bit value.
    |  *
    |  * The doubling is done in a 64-bit register (addq / leaq), so 2 * (*a) is
    |  * taken modulo 2^64 before the multiplication.
    |  *
    |  * @param a  Pointer to the factor that is doubled.
    |  * @param b  Pointer to the other factor.
    |  * @return   The 128-bit product.
    |  */
 56 | static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
 57 |   #ifndef __BMI2__
 58 |   uint64_t c,d;
    |   /* rax is written by movq/addq before mulq reads *b, so the rax output
    |    * must be early-clobber ("=&a"); otherwise the compiler is free to
    |    * address *b through rax. */
 59 |   __asm__ volatile
 60 |       ("movq %[a], %%rax; "
 61 |        "addq %%rax, %%rax; "
 62 |        "mulq %[b];"
 63 |        : [c]"=&a"(c), [d]"=d"(d)
 64 |        : [b]"m"(*b), [a]"m"(*a)
 65 |        : "cc");
 66 |   return (((__uint128_t)(d))<<64) | c;
 67 |   #else
 68 |   uint64_t c,d;
 69 |   __asm__ volatile
 70 |       ("movq %[a], %%rdx;"
 71 |        "leaq (,%%rdx,2), %%rdx;"
 72 |        "mulx %[b], %[c], %[d];"
 73 |        : [c]"=r"(c), [d]"=r"(d)
 74 |        : [b]"m"(*b), [a]"m"(*a)
 75 |        : "rdx");
 76 |   return (((__uint128_t)(d))<<64) | c;
 77 |   #endif
 78 | }
 79 | /**
    |  * Multiply-accumulate: *acc += (*a) * (*b).
    |  *
    |  * The accumulator is split into 64-bit halves, the 128-bit product is added
    |  * with addq/adcq (carry from the low into the high half), and the halves
    |  * are recombined; a carry out of the high half is dropped.
    |  *
    |  * @param acc  128-bit accumulator, updated in place.
    |  * @param a    Pointer to the first factor.
    |  * @param b    Pointer to the second factor.
    |  */
 80 | static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
 81 |   uint64_t lo = *acc, hi = *acc>>64;
 82 |   
 83 |   #ifdef __BMI2__
 84 |   uint64_t c,d;
 85 |   __asm__ volatile
 86 |       ("movq %[a], %%rdx; "
 87 |        "mulx %[b], %[c], %[d]; "
 88 |        "addq %[c], %[lo]; "
 89 |        "adcq %[d], %[hi]; "
 90 |        : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
 91 |        : [b]"m"(*b), [a]"m"(*a)
 92 |        : "rdx", "cc");
 93 |   #else
 94 |   __asm__ volatile
 95 |       ("movq %[a], %%rax; "
 96 |        "mulq %[b]; "
 97 |        "addq %%rax, %[lo]; "
 98 |        "adcq %%rdx, %[hi]; "
 99 |        : [lo]"+r"(lo), [hi]"+r"(hi)
100 |        : [b]"m"(*b), [a]"m"(*a)
101 |        : "rax", "rdx", "cc");
102 |   #endif
103 |   
104 |   *acc = (((__uint128_t)(hi))<<64) | lo;
105 | }
106 | /**
    |  * Double multiply-accumulate: the single product (*a) * (*b) is added to
    |  * both *acc and *acc2.
    |  *
    |  * Each accumulator is handled as two 64-bit halves with addq/adcq; a carry
    |  * out of a high half is dropped.
    |  *
    |  * @param acc   First 128-bit accumulator, updated in place.
    |  * @param acc2  Second 128-bit accumulator, updated in place.
    |  * @param a     Pointer to the first factor.
    |  * @param b     Pointer to the second factor.
    |  */
107 | static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
108 |   uint64_t lo = *acc, hi = *acc>>64;
109 |   uint64_t lo2 = *acc2, hi2 = *acc2>>64;
110 |   
111 |   #ifdef __BMI2__
112 |   uint64_t c,d;
113 |   __asm__ volatile
114 |       ("movq %[a], %%rdx; "
115 |        "mulx %[b], %[c], %[d]; "
116 |        "addq %[c], %[lo]; "
117 |        "adcq %[d], %[hi]; "
118 |        "addq %[c], %[lo2]; "
119 |        "adcq %[d], %[hi2]; "
120 |        : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
121 |        : [b]"m"(*b), [a]"m"(*a)
122 |        : "rdx", "cc");
123 |   #else
124 |   __asm__ volatile
125 |       ("movq %[a], %%rax; "
126 |        "mulq %[b]; "
127 |        "addq %%rax, %[lo]; "
128 |        "adcq %%rdx, %[hi]; "
129 |        "addq %%rax, %[lo2]; "
130 |        "adcq %%rdx, %[hi2]; "
131 |        : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
132 |        : [b]"m"(*b), [a]"m"(*a)
133 |        : "rax", "rdx", "cc");
134 |   #endif
135 |   
136 |   *acc = (((__uint128_t)(hi))<<64) | lo;
137 |   *acc2 = (((__uint128_t)(hi2))<<64) | lo2;
138 | }
139 | /**
    |  * Multiply-accumulate with a by-value factor: *acc += a * (*b).
    |  *
    |  * In the BMI2 path a is bound to rdx ("d" constraint), the implicit source
    |  * operand of mulx; otherwise it is moved into rax for mulq.
    |  *
    |  * @param acc  128-bit accumulator, updated in place.
    |  * @param a    First factor, passed by value.
    |  * @param b    Pointer to the second factor.
    |  */
140 | static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
141 |   uint64_t lo = *acc, hi = *acc>>64;
142 |   
143 |   #ifdef __BMI2__
144 |   uint64_t c,d;
145 |   __asm__ volatile
146 |       ("mulx %[b], %[c], %[d]; "
147 |        "addq %[c], %[lo]; "
148 |        "adcq %[d], %[hi]; "
149 |        : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
150 |        : [b]"m"(*b), [a]"d"(a)
151 |        : "cc");
152 |   #else
153 |   __asm__ volatile
154 |       ("movq %[a], %%rax; "
155 |        "mulq %[b]; "
156 |        "addq %%rax, %[lo]; "
157 |        "adcq %%rdx, %[hi]; "
158 |        : [lo]"+r"(lo), [hi]"+r"(hi)
159 |        : [b]"m"(*b), [a]"r"(a)
160 |        : "rax", "rdx", "cc");
161 |   #endif
162 |   
163 |   *acc = (((__uint128_t)(hi))<<64) | lo;
164 | }
165 | /**
    |  * Doubled multiply-accumulate: *acc += (2 * (*a)) * (*b).
    |  *
    |  * The doubling is an addq of a 64-bit register to itself, so 2 * (*a) is
    |  * taken modulo 2^64 before the multiplication.
    |  *
    |  * @param acc  128-bit accumulator, updated in place.
    |  * @param a    Pointer to the factor that is doubled.
    |  * @param b    Pointer to the other factor.
    |  */
166 | static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
167 |   uint64_t lo = *acc, hi = *acc>>64;
168 |   
169 |   #ifdef __BMI2__
170 |   uint64_t c,d;
171 |   __asm__ volatile
172 |       ("movq %[a], %%rdx; "
173 |        "addq %%rdx, %%rdx; "
174 |        "mulx %[b], %[c], %[d]; "
175 |        "addq %[c], %[lo]; "
176 |        "adcq %[d], %[hi]; "
177 |        : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
178 |        : [b]"m"(*b), [a]"m"(*a)
179 |        : "rdx", "cc");
180 |   #else
181 |   __asm__ volatile
182 |       ("movq %[a], %%rax; "
183 |        "addq %%rax, %%rax; "
184 |        "mulq %[b]; "
185 |        "addq %%rax, %[lo]; "
186 |        "adcq %%rdx, %[hi]; "
187 |        : [lo]"+r"(lo), [hi]"+r"(hi)
188 |        : [b]"m"(*b), [a]"m"(*a)
189 |        : "rax", "rdx", "cc");
190 |   #endif
191 |   
192 |   *acc = (((__uint128_t)(hi))<<64) | lo;
193 | }
194 | /**
    |  * Multiply-subtract: *acc -= (*a) * (*b).
    |  *
    |  * The 128-bit product is subtracted from the accumulator halves with
    |  * subq/sbbq (borrow from the low into the high half); a borrow out of the
    |  * high half is dropped.
    |  *
    |  * @param acc  128-bit accumulator, updated in place.
    |  * @param a    Pointer to the first factor.
    |  * @param b    Pointer to the second factor.
    |  */
195 | static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
196 |   uint64_t lo = *acc, hi = *acc>>64;
197 |   #ifdef __BMI2__
198 |   uint64_t c,d;
199 |   __asm__ volatile
200 |       ("movq %[a], %%rdx; "
201 |        "mulx %[b], %[c], %[d]; "
202 |        "subq %[c], %[lo]; "
203 |        "sbbq %[d], %[hi]; "
204 |        : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
205 |        : [b]"m"(*b), [a]"m"(*a)
206 |        : "rdx", "cc");
207 |   #else
208 |   __asm__ volatile
209 |       ("movq %[a], %%rax; "
210 |        "mulq %[b]; "
211 |        "subq %%rax, %[lo]; "
212 |        "sbbq %%rdx, %[hi]; "
213 |        : [lo]"+r"(lo), [hi]"+r"(hi)
214 |        : [b]"m"(*b), [a]"m"(*a)
215 |        : "rax", "rdx", "cc");
216 |   #endif
217 |   *acc = (((__uint128_t)(hi))<<64) | lo;
218 | }
219 | /**
    |  * Doubled multiply-subtract: *acc -= (2 * (*a)) * (*b).
    |  *
    |  * The doubling is an addq of a 64-bit register to itself, so 2 * (*a) is
    |  * taken modulo 2^64 before the multiplication.
    |  *
    |  * @param acc  128-bit accumulator, updated in place.
    |  * @param a    Pointer to the factor that is doubled.
    |  * @param b    Pointer to the other factor.
    |  */
220 | static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
221 |   uint64_t lo = *acc, hi = *acc>>64;
222 |   #ifdef __BMI2__
223 |   uint64_t c,d;
224 |   __asm__ volatile
225 |       ("movq %[a], %%rdx; "
226 |        "addq %%rdx, %%rdx; "
227 |        "mulx %[b], %[c], %[d]; "
228 |        "subq %[c], %[lo]; "
229 |        "sbbq %[d], %[hi]; "
230 |        : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
231 |        : [b]"m"(*b), [a]"m"(*a)
232 |        : "rdx", "cc");
233 |   #else
234 |   __asm__ volatile
235 |       ("movq %[a], %%rax; "
236 |        "addq %%rax, %%rax; "
237 |        "mulq %[b]; "
238 |        "subq %%rax, %[lo]; "
239 |        "sbbq %%rdx, %[hi]; "
240 |        : [lo]"+r"(lo), [hi]"+r"(hi)
241 |        : [b]"m"(*b), [a]"m"(*a)
242 |        : "rax", "rdx", "cc");
243 |   #endif
244 |   *acc = (((__uint128_t)(hi))<<64) | lo;
245 |   
246 | }
247 | 
248 | static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
249 |   uint64_t c,d, lo = *acc, hi = *acc>>64;
250 |   __asm__ volatile
251 |       ("movq %[a], %%rdx; "
252 |        "mulx %[b], %[c], %[d]; "
253 |        "subq %[lo], %[c]; "
254 |        "sbbq %[hi], %[d]; "
255 |        : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
256 |        : [b]"m"(*b), [a]"m"(*a)
257 |        : "rdx", "cc");
258 |   *acc = (((__uint128_t)(d))<<64) | c;
259 | }
260 | /**
    |  * Unsigned 64x64 -> 128-bit multiply of two values, written in plain C.
    |  *
    |  * @return The full 128-bit product a * b.
    |  */
261 | static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
262 |   __uint128_t product = a;  /* widen first so the multiply is 128-bit */
    |   product *= b;
    |   return product;
263 | }
264 | /**
    |  * Signed 64x64 -> 128-bit multiply of two values, written in plain C.
    |  *
    |  * @return The full 128-bit signed product a * b.
    |  */
265 | static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
266 |   __int128_t product = a;  /* widen first so the multiply is 128-bit */
    |   product *= b;
    |   return product;
267 | }
268 | /**
    |  * Return x unchanged after passing it through an empty asm statement with
    |  * a "+r" constraint, so the compiler has to treat the value as possibly
    |  * modified and cannot reason about it across this call.
    |  */
269 | static __inline__ uint64_t opacify(uint64_t x) {
270 |   __asm__ volatile("" : "+r"(x));
271 |   return x;
272 | }
273 | /**
    |  * Branch-free zero test: returns all-one bits when x == 0, else 0.
    |  *
    |  * neg sets the carry flag exactly when x is nonzero, and sbb of a register
    |  * with itself then yields 0 - carry; the final complement flips that.
    |  * mask_t is not declared in this header and must come from the includer.
    |  */
274 | static __inline__ mask_t is_zero(uint64_t x) {
275 |   __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
276 |   return ~x;
277 | }
278 | 
279 | #endif /* __X86_64_ARITH_H__ */
280 | 


--------------------------------------------------------------------------------
/src/p480/arch_x86_64/x86-64-arith.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | 
  5 | #ifndef __X86_64_ARITH_H__
  6 | #define __X86_64_ARITH_H__
  7 | 
  8 | #include <stdint.h>
  9 | 
 10 | /* TODO: non x86-64 versions of these.
 11 |  * FUTURE: autogenerate
 12 |  */
 13 | /**
    |  * Full 64x64 -> 128-bit unsigned multiply of two memory operands.
    |  *
    |  * Without BMI2 this is movq + mulq with the product in rdx:rax; with BMI2
    |  * it is mulx with *a loaded into rdx.
    |  *
    |  * @param a  Pointer to the first factor.
    |  * @param b  Pointer to the second factor.
    |  * @return   The 128-bit product (*a) * (*b).
    |  */
 14 | static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
 15 |   #ifndef __BMI2__
 16 |   uint64_t c,d;
    |   /* rax is overwritten by the movq before mulq reads *b, so the rax output
    |    * must be early-clobber ("=&a"); otherwise the compiler is free to
    |    * address *b through rax. */
 17 |   __asm__ volatile
 18 |       ("movq %[a], %%rax;"
 19 |        "mulq %[b];"
 20 |        : [c]"=&a"(c), [d]"=d"(d)
 21 |        : [b]"m"(*b), [a]"m"(*a)
 22 |        : "cc");
 23 |   return (((__uint128_t)(d))<<64) | c;
 24 |   #else
 25 |   uint64_t c,d;
 26 |   __asm__ volatile
 27 |       ("movq %[a], %%rdx;"
 28 |        "mulx %[b], %[c], %[d];"
 29 |        : [c]"=r"(c), [d]"=r"(d)
 30 |        : [b]"m"(*b), [a]"m"(*a)
 31 |        : "rdx");
 32 |   return (((__uint128_t)(d))<<64) | c;
 33 |   #endif
 34 | }
 35 | /**
    |  * Full 64x64 -> 128-bit unsigned multiply of a value by a memory operand.
    |  *
    |  * @param a  First factor, passed by value.
    |  * @param b  Pointer to the second factor.
    |  * @return   The 128-bit product a * (*b).
    |  */
 36 | static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
 37 |   #ifndef __BMI2__
 38 |   uint64_t c,d;
    |   /* rax is overwritten by the movq before mulq reads *b, so the rax output
    |    * must be early-clobber ("=&a"); otherwise the compiler is free to
    |    * address *b through rax. */
 39 |   __asm__ volatile
 40 |       ("movq %[a], %%rax;"
 41 |        "mulq %[b];"
 42 |        : [c]"=&a"(c), [d]"=d"(d)
 43 |        : [b]"m"(*b), [a]"r"(a)
 44 |        : "cc");
 45 |   return (((__uint128_t)(d))<<64) | c;
 46 |   #else
 47 |   uint64_t c,d;
    |   /* mulx takes its implicit factor from rdx, hence the "d" constraint on a. */
 48 |   __asm__ volatile
 49 |       ("mulx %[b], %[c], %[d];"
 50 |        : [c]"=r"(c), [d]"=r"(d)
 51 |        : [b]"m"(*b), [a]"d"(a));
 52 |   return (((__uint128_t)(d))<<64) | c;
 53 |   #endif
 54 | }
 55 | /**
    |  * Doubled wide multiply: (2 * (*a)) * (*b) as a 128-bit value.
    |  *
    |  * The doubling is done in a 64-bit register (addq / leaq), so 2 * (*a) is
    |  * taken modulo 2^64 before the multiplication.
    |  *
    |  * @param a  Pointer to the factor that is doubled.
    |  * @param b  Pointer to the other factor.
    |  * @return   The 128-bit product.
    |  */
 56 | static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
 57 |   #ifndef __BMI2__
 58 |   uint64_t c,d;
    |   /* rax is written by movq/addq before mulq reads *b, so the rax output
    |    * must be early-clobber ("=&a"); otherwise the compiler is free to
    |    * address *b through rax. */
 59 |   __asm__ volatile
 60 |       ("movq %[a], %%rax; "
 61 |        "addq %%rax, %%rax; "
 62 |        "mulq %[b];"
 63 |        : [c]"=&a"(c), [d]"=d"(d)
 64 |        : [b]"m"(*b), [a]"m"(*a)
 65 |        : "cc");
 66 |   return (((__uint128_t)(d))<<64) | c;
 67 |   #else
 68 |   uint64_t c,d;
 69 |   __asm__ volatile
 70 |       ("movq %[a], %%rdx;"
 71 |        "leaq (,%%rdx,2), %%rdx;"
 72 |        "mulx %[b], %[c], %[d];"
 73 |        : [c]"=r"(c), [d]"=r"(d)
 74 |        : [b]"m"(*b), [a]"m"(*a)
 75 |        : "rdx");
 76 |   return (((__uint128_t)(d))<<64) | c;
 77 |   #endif
 78 | }
 79 | /**
    |  * Multiply-accumulate: *acc += (*a) * (*b).
    |  *
    |  * The accumulator is split into 64-bit halves, the 128-bit product is added
    |  * with addq/adcq (carry from the low into the high half), and the halves
    |  * are recombined; a carry out of the high half is dropped.
    |  *
    |  * @param acc  128-bit accumulator, updated in place.
    |  * @param a    Pointer to the first factor.
    |  * @param b    Pointer to the second factor.
    |  */
 80 | static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
 81 |   uint64_t lo = *acc, hi = *acc>>64;
 82 |   
 83 |   #ifdef __BMI2__
 84 |   uint64_t c,d;
 85 |   __asm__ volatile
 86 |       ("movq %[a], %%rdx; "
 87 |        "mulx %[b], %[c], %[d]; "
 88 |        "addq %[c], %[lo]; "
 89 |        "adcq %[d], %[hi]; "
 90 |        : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
 91 |        : [b]"m"(*b), [a]"m"(*a)
 92 |        : "rdx", "cc");
 93 |   #else
 94 |   __asm__ volatile
 95 |       ("movq %[a], %%rax; "
 96 |        "mulq %[b]; "
 97 |        "addq %%rax, %[lo]; "
 98 |        "adcq %%rdx, %[hi]; "
 99 |        : [lo]"+r"(lo), [hi]"+r"(hi)
100 |        : [b]"m"(*b), [a]"m"(*a)
101 |        : "rax", "rdx", "cc");
102 |   #endif
103 |   
104 |   *acc = (((__uint128_t)(hi))<<64) | lo;
105 | }
106 | /**
    |  * Double multiply-accumulate: the single product (*a) * (*b) is added to
    |  * both *acc and *acc2.
    |  *
    |  * Each accumulator is handled as two 64-bit halves with addq/adcq; a carry
    |  * out of a high half is dropped.
    |  *
    |  * @param acc   First 128-bit accumulator, updated in place.
    |  * @param acc2  Second 128-bit accumulator, updated in place.
    |  * @param a     Pointer to the first factor.
    |  * @param b     Pointer to the second factor.
    |  */
107 | static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
108 |   uint64_t lo = *acc, hi = *acc>>64;
109 |   uint64_t lo2 = *acc2, hi2 = *acc2>>64;
110 |   
111 |   #ifdef __BMI2__
112 |   uint64_t c,d;
113 |   __asm__ volatile
114 |       ("movq %[a], %%rdx; "
115 |        "mulx %[b], %[c], %[d]; "
116 |        "addq %[c], %[lo]; "
117 |        "adcq %[d], %[hi]; "
118 |        "addq %[c], %[lo2]; "
119 |        "adcq %[d], %[hi2]; "
120 |        : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
121 |        : [b]"m"(*b), [a]"m"(*a)
122 |        : "rdx", "cc");
123 |   #else
124 |   __asm__ volatile
125 |       ("movq %[a], %%rax; "
126 |        "mulq %[b]; "
127 |        "addq %%rax, %[lo]; "
128 |        "adcq %%rdx, %[hi]; "
129 |        "addq %%rax, %[lo2]; "
130 |        "adcq %%rdx, %[hi2]; "
131 |        : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
132 |        : [b]"m"(*b), [a]"m"(*a)
133 |        : "rax", "rdx", "cc");
134 |   #endif
135 |   
136 |   *acc = (((__uint128_t)(hi))<<64) | lo;
137 |   *acc2 = (((__uint128_t)(hi2))<<64) | lo2;
138 | }
139 | /**
    |  * Multiply-accumulate with a by-value factor: *acc += a * (*b).
    |  *
    |  * In the BMI2 path a is bound to rdx ("d" constraint), the implicit source
    |  * operand of mulx; otherwise it is moved into rax for mulq.
    |  *
    |  * @param acc  128-bit accumulator, updated in place.
    |  * @param a    First factor, passed by value.
    |  * @param b    Pointer to the second factor.
    |  */
140 | static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
141 |   uint64_t lo = *acc, hi = *acc>>64;
142 |   
143 |   #ifdef __BMI2__
144 |   uint64_t c,d;
145 |   __asm__ volatile
146 |       ("mulx %[b], %[c], %[d]; "
147 |        "addq %[c], %[lo]; "
148 |        "adcq %[d], %[hi]; "
149 |        : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
150 |        : [b]"m"(*b), [a]"d"(a)
151 |        : "cc");
152 |   #else
153 |   __asm__ volatile
154 |       ("movq %[a], %%rax; "
155 |        "mulq %[b]; "
156 |        "addq %%rax, %[lo]; "
157 |        "adcq %%rdx, %[hi]; "
158 |        : [lo]"+r"(lo), [hi]"+r"(hi)
159 |        : [b]"m"(*b), [a]"r"(a)
160 |        : "rax", "rdx", "cc");
161 |   #endif
162 |   
163 |   *acc = (((__uint128_t)(hi))<<64) | lo;
164 | }
165 | /**
    |  * Doubled multiply-accumulate: *acc += (2 * (*a)) * (*b).
    |  *
    |  * The doubling is an addq of a 64-bit register to itself, so 2 * (*a) is
    |  * taken modulo 2^64 before the multiplication.
    |  *
    |  * @param acc  128-bit accumulator, updated in place.
    |  * @param a    Pointer to the factor that is doubled.
    |  * @param b    Pointer to the other factor.
    |  */
166 | static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
167 |   uint64_t lo = *acc, hi = *acc>>64;
168 |   
169 |   #ifdef __BMI2__
170 |   uint64_t c,d;
171 |   __asm__ volatile
172 |       ("movq %[a], %%rdx; "
173 |        "addq %%rdx, %%rdx; "
174 |        "mulx %[b], %[c], %[d]; "
175 |        "addq %[c], %[lo]; "
176 |        "adcq %[d], %[hi]; "
177 |        : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
178 |        : [b]"m"(*b), [a]"m"(*a)
179 |        : "rdx", "cc");
180 |   #else
181 |   __asm__ volatile
182 |       ("movq %[a], %%rax; "
183 |        "addq %%rax, %%rax; "
184 |        "mulq %[b]; "
185 |        "addq %%rax, %[lo]; "
186 |        "adcq %%rdx, %[hi]; "
187 |        : [lo]"+r"(lo), [hi]"+r"(hi)
188 |        : [b]"m"(*b), [a]"m"(*a)
189 |        : "rax", "rdx", "cc");
190 |   #endif
191 |   
192 |   *acc = (((__uint128_t)(hi))<<64) | lo;
193 | }
194 | /**
    |  * Multiply-subtract: *acc -= (*a) * (*b).
    |  *
    |  * The 128-bit product is subtracted from the accumulator halves with
    |  * subq/sbbq (borrow from the low into the high half); a borrow out of the
    |  * high half is dropped.
    |  *
    |  * @param acc  128-bit accumulator, updated in place.
    |  * @param a    Pointer to the first factor.
    |  * @param b    Pointer to the second factor.
    |  */
195 | static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
196 |   uint64_t lo = *acc, hi = *acc>>64;
197 |   #ifdef __BMI2__
198 |   uint64_t c,d;
199 |   __asm__ volatile
200 |       ("movq %[a], %%rdx; "
201 |        "mulx %[b], %[c], %[d]; "
202 |        "subq %[c], %[lo]; "
203 |        "sbbq %[d], %[hi]; "
204 |        : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
205 |        : [b]"m"(*b), [a]"m"(*a)
206 |        : "rdx", "cc");
207 |   #else
208 |   __asm__ volatile
209 |       ("movq %[a], %%rax; "
210 |        "mulq %[b]; "
211 |        "subq %%rax, %[lo]; "
212 |        "sbbq %%rdx, %[hi]; "
213 |        : [lo]"+r"(lo), [hi]"+r"(hi)
214 |        : [b]"m"(*b), [a]"m"(*a)
215 |        : "rax", "rdx", "cc");
216 |   #endif
217 |   *acc = (((__uint128_t)(hi))<<64) | lo;
218 | }
219 | /**
    |  * Doubled multiply-subtract: *acc -= (2 * (*a)) * (*b).
    |  *
    |  * The doubling is an addq of a 64-bit register to itself, so 2 * (*a) is
    |  * taken modulo 2^64 before the multiplication.
    |  *
    |  * @param acc  128-bit accumulator, updated in place.
    |  * @param a    Pointer to the factor that is doubled.
    |  * @param b    Pointer to the other factor.
    |  */
220 | static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
221 |   uint64_t lo = *acc, hi = *acc>>64;
222 |   #ifdef __BMI2__
223 |   uint64_t c,d;
224 |   __asm__ volatile
225 |       ("movq %[a], %%rdx; "
226 |        "addq %%rdx, %%rdx; "
227 |        "mulx %[b], %[c], %[d]; "
228 |        "subq %[c], %[lo]; "
229 |        "sbbq %[d], %[hi]; "
230 |        : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
231 |        : [b]"m"(*b), [a]"m"(*a)
232 |        : "rdx", "cc");
233 |   #else
234 |   __asm__ volatile
235 |       ("movq %[a], %%rax; "
236 |        "addq %%rax, %%rax; "
237 |        "mulq %[b]; "
238 |        "subq %%rax, %[lo]; "
239 |        "sbbq %%rdx, %[hi]; "
240 |        : [lo]"+r"(lo), [hi]"+r"(hi)
241 |        : [b]"m"(*b), [a]"m"(*a)
242 |        : "rax", "rdx", "cc");
243 |   #endif
244 |   *acc = (((__uint128_t)(hi))<<64) | lo;
245 |   
246 | }
247 | 
248 | static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
249 |   uint64_t c,d, lo = *acc, hi = *acc>>64;
250 |   __asm__ volatile
251 |       ("movq %[a], %%rdx; "
252 |        "mulx %[b], %[c], %[d]; "
253 |        "subq %[lo], %[c]; "
254 |        "sbbq %[hi], %[d]; "
255 |        : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
256 |        : [b]"m"(*b), [a]"m"(*a)
257 |        : "rdx", "cc");
258 |   *acc = (((__uint128_t)(d))<<64) | c;
259 | }
260 | /**
    |  * Unsigned 64x64 -> 128-bit multiply of two values, written in plain C.
    |  *
    |  * @return The full 128-bit product a * b.
    |  */
261 | static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
262 |   __uint128_t product = a;  /* widen first so the multiply is 128-bit */
    |   product *= b;
    |   return product;
263 | }
264 | /**
    |  * Signed 64x64 -> 128-bit multiply of two values, written in plain C.
    |  *
    |  * @return The full 128-bit signed product a * b.
    |  */
265 | static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
266 |   __int128_t product = a;  /* widen first so the multiply is 128-bit */
    |   product *= b;
    |   return product;
267 | }
268 | /**
    |  * Return x unchanged after passing it through an empty asm statement with
    |  * a "+r" constraint, so the compiler has to treat the value as possibly
    |  * modified and cannot reason about it across this call.
    |  */
269 | static __inline__ uint64_t opacify(uint64_t x) {
270 |   __asm__ volatile("" : "+r"(x));
271 |   return x;
272 | }
273 | /**
    |  * Branch-free zero test: returns all-one bits when x == 0, else 0.
    |  *
    |  * neg sets the carry flag exactly when x is nonzero, and sbb of a register
    |  * with itself then yields 0 - carry; the final complement flips that.
    |  * mask_t is not declared in this header and must come from the includer.
    |  */
274 | static __inline__ mask_t is_zero(uint64_t x) {
275 |   __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
276 |   return ~x;
277 | }
278 | 
279 | #endif /* __X86_64_ARITH_H__ */
280 | 


--------------------------------------------------------------------------------
/src/include/word.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | 
  5 | #ifndef __WORD_H__
  6 | #define __WORD_H__
  7 | 
  8 | /* for posix_memalign */
  9 | #define _XOPEN_SOURCE 600
 10 | 
 11 | #include "arch_config.h"
 12 | 
 13 | 
 14 | #ifndef __APPLE__
 15 | #ifndef _BSD_SOURCE
 16 | #define _BSD_SOURCE 1
 17 | #endif
 18 | #include <endian.h>
 19 | #endif
 20 | 
 21 | #include <stdint.h>
 22 | #include <stdlib.h>
 23 | #include <sys/types.h>
 24 | #include <inttypes.h>
 25 | 
 26 | #if defined(__ARM_NEON__)
 27 | #include <arm_neon.h>
 28 | #elif defined(__SSE2__)
 29 | #include <immintrin.h>
 30 | #endif
 31 | /*
    |  * Word-size dependent types and helpers, selected by WORD_BITS from
    |  * arch_config.h.
    |  *
    |  *   hword_t / word_t / dword_t      unsigned half, full and double word
    |  *   hsword_t / sword_t / dsword_t   the signed counterparts
    |  *   PRIxWORD                        bare hex conversion for a word_t
    |  *   PRIxWORDfull, PRIxWORD56/60     complete "%0N..." zero-padded formats
    |  *   UxxLE(x)                        constant x as word-sized limb(s): one
    |  *                                   value with 64-bit words, a low/high
    |  *                                   pair of values with 32-bit words
    |  *
    |  * PRIxWORD60 exists only for 64-bit words and prints through PRIx64
    |  * (<inttypes.h> has no PRIx60).
    |  */
 32 | #if (WORD_BITS == 64)
 33 | typedef uint32_t hword_t;
 34 | typedef uint64_t word_t;
 35 | typedef __uint128_t dword_t;
 36 | typedef int32_t hsword_t;
 37 | typedef int64_t sword_t;
 38 | typedef __int128_t dsword_t;
 39 | #define PRIxWORD PRIx64
 40 | #define PRIxWORDfull "%016" PRIx64
 41 | #define PRIxWORD56   "%014" PRIx64
 42 | #define PRIxWORD60   "%015" PRIx64
 43 | #define U64LE(x) x##ull
 44 | #define U58LE(x) x##ull
 45 | #define U56LE(x) x##ull
 46 | #define U60LE(x) x##ull
 47 | #define letohWORD letoh64
 48 | #define GOLDI_BITS 64
 49 | #else
 50 | typedef uint16_t hword_t;
 51 | typedef uint32_t word_t;
 52 | typedef uint64_t dword_t;
 53 | typedef int16_t hsword_t;
 54 | typedef int32_t sword_t;
 55 | typedef int64_t dsword_t;
 56 | #define PRIxWORD PRIx32
 57 | #define PRIxWORDfull "%08" PRIx32
 58 | #define PRIxWORD56   "%07" PRIx32
 59 | #define U64LE(x) (x##ull)&((1ull<<32)-1), (x##ull)>>32
 60 | #define U58LE(x) (x##ull)&((1ull<<29)-1), (x##ull)>>29
 61 | #define U56LE(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
 62 | #define U60LE(x) (x##ull)&((1ull<<30)-1), (x##ull)>>30
 63 | #define letohWORD letoh32
 64 | #define GOLDI_BITS 32
 65 | #endif
 66 | /* Integer helpers: DIV_CEIL is the quotient rounded up, ROUND_UP the next multiple of _y at or above _x, WORDS_FOR_BITS the words needed for _x bits. _y is evaluated more than once. */
 67 | #define DIV_CEIL(_x,_y) (((_x) + (_y) - 1)/(_y))
 68 | #define ROUND_UP(_x,_y) (DIV_CEIL((_x),(_y))*(_y))
 69 | #define WORDS_FOR_BITS(_x) (DIV_CEIL((_x),WORD_BITS))
 70 | /* Word-sized boolean mask: MASK_SUCCESS is all-one bits, MASK_FAILURE is zero. */
 71 | typedef word_t mask_t;
 72 | static const mask_t MASK_FAILURE = 0, MASK_SUCCESS = -(mask_t)1;
 73 | 
 74 | 
 75 | /*
    |  * Vector types.  With NEON only vecmask_t is defined here (the uintNxM_t
    |  * names presumably come from <arm_neon.h> -- confirm).  Otherwise the same
    |  * names are declared with clang's ext_vector_type (element counts) or
    |  * GCC's vector_size (byte sizes).
    |  * NOTE(review): vecmask_t is 4 word_t elements under clang but 32 bytes
    |  * under GCC, which is 8 elements when word_t is 32 bits wide.
    |  */
 76 | #ifdef __ARM_NEON__
 77 | typedef uint32x4_t vecmask_t;
 78 | #elif __clang__
 79 | typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
 80 | typedef int64_t  int64x2_t __attribute__((ext_vector_type(2)));
 81 | typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
 82 | typedef int64_t  int64x4_t __attribute__((ext_vector_type(4)));
 83 | typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
 84 | typedef int32_t  int32x4_t __attribute__((ext_vector_type(4)));
 85 | typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
 86 | typedef int32_t  int32x2_t __attribute__((ext_vector_type(2)));
 87 | typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
 88 | typedef int32_t  int32x8_t __attribute__((ext_vector_type(8)));
 89 | typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
 90 | #else /* GCC-cleanliness */
 91 | typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
 92 | typedef int64_t  int64x2_t __attribute__((vector_size(16)));
 93 | typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
 94 | typedef int64_t  int64x4_t __attribute__((vector_size(32)));
 95 | typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
 96 | typedef int32_t  int32x4_t __attribute__((vector_size(16)));
 97 | typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
 98 | typedef int32_t  int32x2_t __attribute__((vector_size(8)));
 99 | typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
100 | typedef int32_t  int32x8_t __attribute__((vector_size(32)));
101 | typedef word_t vecmask_t __attribute__((vector_size(32)));
102 | #endif
103 | 
104 | #if __AVX2__ /* 256-bit registers: big_register_t is eight 32-bit lanes */
105 |     #define VECTOR_ALIGNED __attribute__((aligned(32)))
106 |     typedef uint32x8_t big_register_t;
107 |     typedef uint64x4_t uint64xn_t;
108 |     typedef uint32x8_t uint32xn_t;
109 | 
110 |     static __inline__ big_register_t
111 |     br_set_to_mask(mask_t x) {
112 |         uint32_t y = (uint32_t)x; /* keep only the low 32 bits of the mask, then copy them to every lane */
113 |         big_register_t ret = {y,y,y,y,y,y,y,y};
114 |         return ret;
115 |     }
116 | #elif __SSE2__ /* 128-bit registers: four 32-bit lanes */
117 |     #define VECTOR_ALIGNED __attribute__((aligned(16)))
118 |     typedef uint32x4_t big_register_t;
119 |     typedef uint64x2_t uint64xn_t;
120 |     typedef uint32x4_t uint32xn_t;
121 | 
122 |     static __inline__ big_register_t
123 |     br_set_to_mask(mask_t x) {
124 |         uint32_t y = x; /* implicit narrowing when mask_t is wider than 32 bits (the AVX2 branch casts explicitly) */
125 |         big_register_t ret = {y,y,y,y};
126 |         return ret;
127 |     }
128 | #elif __ARM_NEON__ /* 128-bit NEON registers: four 32-bit lanes */
129 |     #define VECTOR_ALIGNED __attribute__((aligned(16)))
130 |     typedef uint32x4_t big_register_t;
131 |     typedef uint64x2_t uint64xn_t;
132 |     typedef uint32x4_t uint32xn_t;
133 |     static __inline__ big_register_t
134 |     br_set_to_mask(mask_t x) {
135 |         return vdupq_n_u32(x);
136 |     }
137 | #elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__ /* NOTE(review): compilers usually spell it __x86_64__ -- confirm the upper-case form is intended */
138 |     #define VECTOR_ALIGNED __attribute__((aligned(8)))
139 |     typedef uint64_t big_register_t, uint64xn_t;
140 | 
141 |     typedef uint32_t uint32xn_t;
142 |     static __inline__ big_register_t
143 |     br_set_to_mask(mask_t x) {
144 |         return (big_register_t)x;
145 |     }
146 | #else /* scalar fallback: big_register_t is a plain 32-bit integer */
147 |     #define VECTOR_ALIGNED __attribute__((aligned(4)))
148 |     typedef uint64_t uint64xn_t;
149 |     typedef uint32_t uint32xn_t;
150 |     typedef uint32_t big_register_t;
151 | 
152 |     static __inline__ big_register_t
153 |     br_set_to_mask(mask_t x) {
154 |         return (big_register_t)x;
155 |     }
156 | #endif
157 | 
158 | typedef struct { /* packed wrapper around a uint64xn_t; presumably for loads/stores at unaligned addresses -- confirm */
159 |     uint64xn_t unaligned;
160 | } __attribute__((packed)) unaligned_uint64xn_t;
161 | 
162 | typedef struct { /* same wrapper for uint32xn_t */
163 |     uint32xn_t unaligned;
164 | } __attribute__((packed)) unaligned_uint32xn_t;
165 |     
166 | /**
167 |  * Branch-free zero test: all-ones mask if x==0, all-zeros mask otherwise.
168 |  */
169 | static __inline__ mask_t __attribute__((always_inline,unused))
170 | word_is_zero(word_t x) {
171 |     const dword_t widened = ((dword_t)(x)) - 1; /* borrows into the upper half only when x == 0 */
172 |     return (mask_t)(widened >> WORD_BITS);
173 | }
174 | 
175 | #if __AVX2__ /* lane-wise test: each lane becomes all-ones where x is zero */
176 | static __inline__ big_register_t
177 | br_is_zero(big_register_t x) {
178 |     return (big_register_t)(x == br_set_to_mask(0));
179 | }
180 | #elif __SSE2__
181 | static __inline__ big_register_t
182 | br_is_zero(big_register_t x) {
183 |     return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
184 |     //return (big_register_t)(x == br_set_to_mask(0));
185 | }
186 | #elif __ARM_NEON__
187 | static __inline__ big_register_t
188 | br_is_zero(big_register_t x) {
189 |     return vceqq_u32(x,x^x); /* x^x is an all-zero vector to compare against */
190 | }
191 | #else /* scalar: same borrow trick as word_is_zero */
192 | static __inline__ mask_t
193 | br_is_zero(word_t x) { /* NOTE(review): takes word_t, not big_register_t -- confirm the two always have the same width in this branch */
194 |     return (((dword_t)x) - 1)>>WORD_BITS;
195 | }
196 | #endif
197 | 
198 | 
199 | 
200 | 
201 | #ifdef __APPLE__ /* local 64-bit endian helpers; all three assume a little-endian host */
202 | static inline uint64_t
203 | htobe64 (uint64_t x) {
204 |     /* Compiler builtin rather than x86-64 "bswapq" asm, so ARM Apple targets assemble too. */
205 |     return __builtin_bswap64(x);
206 | }
207 | static inline uint64_t
208 | htole64 (uint64_t x) { return x; } /* identity on a little-endian host */
209 | 
210 | static inline uint64_t
211 | letoh64 (uint64_t x) { return x; } /* identity on a little-endian host */
212 | #endif
213 | 
214 | /**
215 |  * Really call memset, in a way that prevents the compiler from optimizing it out.
216 |  * @param p The object to zeroize.
217 |  * @param c The char to set it to (probably zero).
218 |  * @param s The size of the object in bytes.
219 |  */
220 | #if defined(__DARWIN_C_LEVEL) || defined(__STDC_LIB_EXT1__)
221 | #define HAS_MEMSET_S
222 | #endif
223 | 
224 | #if !defined(__STDC_WANT_LIB_EXT1__) || __STDC_WANT_LIB_EXT1__ != 1
225 | #define NEED_MEMSET_S_EXTERN /* memset_s was not requested from the headers, so declare it below */
226 | #endif
227 | 
228 | #ifdef HAS_MEMSET_S
229 | #ifdef NEED_MEMSET_S_EXTERN
230 | extern int memset_s(void *, size_t, int, size_t);
231 | #endif
232 | static __inline__ void __attribute__((always_inline,unused)) /* same attributes as the fallback below */
233 | really_memset(void *p, char c, size_t s) {
234 |     (void)memset_s(p, s, c, s); /* status deliberately discarded: this function returns void */
235 | }
236 | #else
237 | static __inline__ void __attribute__((always_inline,unused))
238 | really_memset(void *p, char c, size_t s) {
239 |     volatile char *pv = (volatile char *)p; /* volatile stores may not be elided by the optimizer */
240 |     size_t i;
241 |     for (i=0; i<s; i++) pv[i] = c;
242 | }
243 | #endif
244 | 
245 | /**
246 |  * Allocate memory which is sufficiently aligned to be used for the
247 |  * largest vector on the system (for now that's a big_register_t).
248 |  * The alignment is never below sizeof(void *), which posix_memalign() requires.
249 |  * The malloc man page says that it does this, but at least for AVX2 on
250 |  * Mac OS X, it's lying.
251 |  *
252 |  * @param size The size of the region to allocate.
253 |  * @return A suitable pointer, which can be free'd with free(),
254 |  * or NULL if the allocation fails.
255 |  */
256 | static __inline__ void *
257 | malloc_vector (
258 |     size_t size
259 | ) __attribute__((always_inline, unused));
260 | 
261 | void *
262 | malloc_vector(size_t size) {
263 |     /* A 4-byte big_register_t on a 64-bit target would otherwise make every call fail with EINVAL. */
264 |     const size_t align = sizeof(big_register_t) < sizeof(void *)
265 |         ? sizeof(void *) : sizeof(big_register_t);
266 |     void *out = NULL;
267 | 
268 |     if (posix_memalign(&out, align, size) != 0) {
269 |         return NULL; /* do not trust the out pointer after a failure */
270 |     }
271 |     return out;
272 | }
273 | 
274 | #endif /* __WORD_H__ */
275 | 


--------------------------------------------------------------------------------
/aux/idealized.sage:
--------------------------------------------------------------------------------
  1 | class Unique(object):
  2 |     def __init__(self,name):
  3 |         self.name = name
  4 |     
  5 |     def __str__(self):
  6 |         return self.name
  7 |     
  8 |     def __repr__(self):
  9 |         return "Unique(\"%s\")" % self.name
 10 | 
 11 | class Idealized(object):
 12 |     UNION = ["UNION"]
 13 |     
 14 |     def __init__(self, R, idealMap = 0, vars = {}):
 15 |         self.varnames = vars
 16 |         if not isinstance(idealMap,dict):
 17 |             idealMap = {()*R:idealMap}
 18 |         self.idealMap = idealMap
 19 |         self.R = R
 20 |         self._sqrt = None
 21 |         self._isqrt = None
 22 |     
 23 |     @staticmethod
 24 |     def uvar(x):
 25 |         return Idealized.var(Unique(x))
 26 |     
 27 |     @staticmethod
 28 |     def var(x):
 29 |         name = str(x)
 30 |         R = PolynomialRing(QQ,[name])
 31 |         rx = R.gens()[0]
 32 |         return Idealized(R,rx,{x:(name,rx)})
 33 |     
 34 |     @staticmethod
 35 |     def vars(xs):
 36 |         return tuple((Idealized.var(x) for x in xs))
 37 |     
 38 |     @staticmethod
 39 |     def uvars(xs):
 40 |         return tuple((Idealized.uvar(x) for x in xs))
 41 |     
 42 |     def __str__(self):
 43 |         def rep(I,x):
 44 |             x = str(x)
 45 |             gs = I.gens()
 46 |             gs = [g for g in gs if g != 0]
 47 |             if len(gs) == 0: return x
 48 |             else:
 49 |                 g = ", ".join(["(%s)" % str(gen) for gen in gs])
 50 |                 return g + ": " + x
 51 |         return "\n".join([rep(I,self.idealMap[I]) for I in self.idealMap])
 52 |     
 53 |     def __repr__(self):
 54 |         # HACK!
 55 |         if len(self.idealMap) == 0:
 56 |             return "undef"
 57 |         if len(self.idealMap) > 1:
 58 |             return str(self)
 59 |         for _,v in self.idealMap.iteritems():
 60 |             return str(v)
 61 |         
 62 |     def prune(self):
 63 |         self.idealMap = {I:v for I,v in self.idealMap.iteritems() if not (I*self.R).is_one()}
 64 |         return self
 65 |         
 66 |     def __add__(self,other):
 67 |         def f(x,y): return x+y
 68 |         return self.op(other,f)
 69 |         
 70 |     def __radd__(self,other):
 71 |         def f(x,y): return y+x
 72 |         return self.op(other,f)
 73 |         
 74 |     def __rsub__(self,other):
 75 |         def f(x,y): return y-x
 76 |         return self.op(other,f)
 77 |         
 78 |     def __neg__(self):
 79 |         def f(x,y): return y-x
 80 |         return self.op(0,f)
 81 |         
 82 |     def __sub__(self,other):
 83 |         def f(x,y): return x-y
 84 |         return self.op(other,f)
 85 |         
 86 |     def is_square(self):
 87 |         for _,v in self.idealMap.iteritems():
 88 |             if not is_square(v): return False
 89 |         return True
 90 |         
 91 |     def sqrt(self):
 92 |         if self._sqrt is None:
 93 |             s = Idealized.uvar("s")
 94 |             self._sqrt = s.assuming(s^2 - self)
 95 |         return self._sqrt
 96 |         
 97 |     def isqrt(self):
 98 |         if self._isqrt is None:
 99 |             s = Idealized.uvar("s")
100 |             z = Idealized(0).assuming(Self)
101 |             self._isqrt = s.assuming(s^2*self-1).union(z)
102 |         return self._isqrt
103 |         
104 |     def __mul__(self,other):
105 |         def f(x,y): return x*y
106 |         return self.op(other,f)
107 |         
108 |     def __rmul__(self,other):
109 |         def f(x,y): return y*x
110 |         return self.op(other,f)
111 |     
112 |     def __pow__(self,n):
113 |         if n < 0: return 1/self^(-n)
114 |         if n == 0: return 1
115 |         if n == 1: return self
116 |         if is_even(n): return (self*self)^(n//2)
117 |         if is_odd(n): return (self*self)^(n//2) * self
118 |         
119 |     def __div__(self,other):
120 |         def f(x,y): return x/y
121 |         return self.op(other,f)
122 |         
123 |     def __rdiv__(self,other):
124 |         def f(x,y): return y/x
125 |         return self.op(other,f)
126 |         
127 |     def union(self,other):
128 |         return self.op(other,Idealized.UNION)
129 |         
130 |     def __eq__(self,other):
131 |         return (self - other).is_zero()
132 |         
133 |     def __ne__(self,other):
134 |         return not (self==other)
135 |         
136 |     def __hash__(self):
137 |         return 0
138 | 
139 |     def assume_zero(self):
140 |         out = {}
141 |         for I,J in self.idealMap.iteritems():
142 |             IJ = I+J.numerator()
143 |             if IJ.is_one(): continue
144 |             out[IJ] = self.R(0)
145 |         
146 |         if len(out) == 0:
147 |             raise Exception("Inconsistent assumption")
148 |         
149 |         return Idealized(self.R,out,self.varnames)
150 |     
151 |     def assuming(self,other):
152 |         return self + other.assume_zero()
153 |     
154 |     def is_zero(self):
155 |         for I,v in self.idealMap.iteritems():
156 |             if v.denominator() in I: return False
157 |             if v.numerator() not in I: return False
158 |         return True
159 |     
160 |     def op(self,other,f):
161 |         if not isinstance(other,Idealized):
162 |             other = Idealized(self.R,other,self.varnames)
163 |         
164 |         bad = False
165 |         for v in self.varnames:
166 |             if v not in other.varnames or self.varnames[v] != other.varnames[v]:
167 |                 bad = True
168 |                 break
169 |         for v in other.varnames:
170 |             if v not in self.varnames or self.varnames[v] != other.varnames[v]:
171 |                 bad = True
172 |                 break
173 |                 
174 |         if bad:
175 |             def incrVar(v):
176 |                 if v[-1] not in "0123456789": return v + "1"
177 |                 elif v[-1] == 9: return incrVar(v[:-1]) + "0"
178 |                 else: return v[:-1] + str(int(v[-1])+1)
179 |         
180 |             vars = {}
181 |             names = set()
182 |             for v,(name,_) in self.varnames.iteritems():
183 |                 assert(name not in names)
184 |                 names.add(name)
185 |                 vars[v] = name
186 |             subMe = {n:n for n in names}
187 |             subThem = {}
188 |             for v,(name,_) in other.varnames.iteritems():
189 |                 if v in self.varnames:
190 |                     subThem[name] = self.varnames[v][0]
191 |                 else:
192 |                     oname = name
193 |                     while name in names:
194 |                         name = incrVar(name)
195 |                     names.add(name)
196 |                     subThem[oname] = name
197 |                     vars[v] = name
198 |             
199 |             R = PolynomialRing(QQ,sorted(list(names)),order="degrevlex")
200 |             gd = R.gens_dict()
201 |             subMe = {m:gd[n] for m,n in subMe.iteritems()}
202 |             subThem = {m:gd[n] for m,n in subThem.iteritems()}
203 |         
204 |             vars = {v:(n,gd[n]) for v,n in vars.iteritems()}
205 |         
206 |             def subIdeal(I,sub):
207 |                 return [g(**sub) for g in I.gens()]*R
208 |             idealMe = {subIdeal(I,subMe):v(**subMe) for I,v in self.idealMap.iteritems()}
209 |             idealThem = {subIdeal(I,subThem):v(**subThem) for I,v in other.idealMap.iteritems()}
210 |         else:
211 |             R = self.R
212 |             idealMe = self.idealMap
213 |             idealThem = other.idealMap
214 |             vars = self.varnames
215 |         
216 |         def consist(I,x,y):
217 |             if (x-y).numerator() not in I:
218 |                 raise Exception("Inconsistent: %s != %s in ideal %s" %
219 |                     (str(x),str(y),str(I)))
220 |             
221 |         out = {}
222 |         if f is Idealized.UNION:
223 |             for I,v in idealMe.iteritems():
224 |                 if I in idealThem:
225 |                     consist(I,v,idealThem[I])
226 |                 out[I] = v
227 |             for I,v in idealThem.iteritems():
228 |                 if I in idealMe:
229 |                     consist(I,v,idealMe[I])
230 |                 out[I] = v
231 |         
232 |         else:
233 |             for I,v in idealMe.iteritems():
234 |                 if I in idealThem:
235 |                     x = f(v,idealThem[I])
236 |                     if I in out:
237 |                         consist(I,x,out[I])
238 |                     else: out[I] = x
239 |                 else:
240 |                     for J,w in idealThem.iteritems():
241 |                         IJ = I+J
242 |                         if not IJ.is_one():
243 |                             x = f(v,w)
244 |                             if IJ in out:
245 |                                 consist(IJ,x,out[IJ])
246 |                             else:
247 |                                 out[IJ] = x
248 |         
249 |         def gb(I):
250 |             II = [0]*R
251 |             for g in I.gens():
252 |                 if g not in II: II = II+[g]*R
253 |             return II
254 | 
255 |         def red(I,v):
256 |             if I.is_zero(): return v
257 |             return I.reduce(R(v.numerator())) / I.reduce(R(v.denominator()))
258 |             
259 |         out = {gb(I):v for I,v in out.iteritems()}
260 |         out = {I:red(I,v) for I,v in out.iteritems()}
261 |         
262 |         return Idealized(R,out,vars)
263 |     
264 |     def reduce(self):
265 |         def red(I,v):
266 |             if I.is_zero(): return v
267 |             return I.reduce(R(v.numerator())) / I.reduce(R(v.denominator()))
268 |         out = {I:red(I,v) for I,v in self.idealMap.iteritems()}
269 |         return Idealized(self.R,out,self.vars)
270 | 
271 | Idealized.INF = Idealized.uvar("inf")  # class-level symbolic variable named "inf" (presumably a point at infinity -- confirm)
272 | Idealized.ZOZ = Idealized.uvar("zoz")  # class-level symbolic variable named "zoz" (presumably "zero over zero" -- confirm)
273 |     


--------------------------------------------------------------------------------
/test/test_decaf.cxx:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @file test_decaf.cxx
  3 |  * @author Mike Hamburg
  4 |  *
  5 |  * @copyright
  6 |  *   Copyright (c) 2015 Cryptography Research, Inc.  \n
  7 |  *   Released under the MIT License.  See LICENSE.txt for license information.
  8 |  *
  9 |  * @brief C++ tests, because that's easier.
 10 |  */
 11 | 
 12 | #include "decaf.hxx"
 13 | #include "shake.hxx"
 14 | #include "decaf_crypto.h"
 15 | #include <stdio.h>
 16 | 
 17 | 
 18 | static bool passing = true;
 19 | static const long NTESTS = 10000;
 20 | 
 21 | class Test {
 22 | public:
 23 |     bool passing_now; /* false once fail() has run for this test */
 24 |     Test(const char *test) : passing_now(true) {
 25 |         const size_t width = 27, len = strlen(test); /* short names are padded out to a common column */
 26 |         printf("%s...", test);
 27 |         if (len < width) printf("%*s",int(width-len),"");
 28 |         fflush(stdout);
 29 |     }
 30 |     ~Test() {
 31 |         if (std::uncaught_exception()) {
 32 |             fail();
 33 |             printf("  due to uncaught exception.\n");
 34 |         }
 35 |         if (passing_now) printf("[PASS]\n");
 36 |     }
 37 |     void fail() { /* idempotent: only the first failure prints and clears the global flag */
 38 |         if (passing_now) {
 39 |             passing_now = passing = false; printf("[FAIL]\n");
 40 |         }
 41 |     }
 42 | };
 43 | 
 44 | template<typename Group> struct Tests { /* randomized self-checks for one group's Scalar/Point arithmetic */
 45 | 
 46 | typedef typename Group::Scalar Scalar;
 47 | typedef typename Group::Point Point;
 48 | typedef typename Group::Precomputed Precomputed;
 49 | 
 50 | static void print(const char *name, const Scalar &x) { /* hex dump of the encoding, last byte first */
 51 |     unsigned char buffer[Scalar::SER_BYTES];
 52 |     x.encode(buffer);
 53 |     printf("  %s = 0x", name);
 54 |     for (int i=sizeof(buffer)-1; i>=0; i--) {
 55 |         printf("%02x", buffer[i]);
 56 |     }
 57 |     printf("\n");
 58 | }
 59 | 
 60 | static void print(const char *name, const Point &x) { /* hex dump of the encoding, last byte first */
 61 |     unsigned char buffer[Point::SER_BYTES];
 62 |     x.encode(buffer);
 63 |     printf("  %s = 0x", name);
 64 |     for (int i=sizeof(buffer)-1; i>=0; i--) {
 65 |         printf("%02x", buffer[i]);
 66 |     }
 67 |     printf("\n");
 68 | }
 69 | 
 70 | static bool arith_check( /* true if lhs == rhs; otherwise fails the test and dumps the operands */
 71 |     Test &test,
 72 |     const Scalar &x,
 73 |     const Scalar &y,
 74 |     const Scalar &z,
 75 |     const Scalar &lhs,
 76 |     const Scalar &rhs,
 77 |     const char *name
 78 | ) {
 79 |     if (rhs == lhs) return true;
 80 |     test.fail();
 81 |     printf("  %s", name);
 82 |     print("x", x);
 83 |     print("y", y);
 84 |     print("z", z);
 85 |     print("lhs", lhs);
 86 |     print("rhs", rhs);
 87 |     return false;
 88 | }
 89 | 
 90 | static bool point_check( /* true if lhs == rhs and p, q, lhs, rhs all validate; otherwise fails and dumps */
 91 |     Test &test,
 92 |     const Point &p,
 93 |     const Point &q,
 94 |     const Point &R,
 95 |     const Scalar &x,
 96 |     const Scalar &y,
 97 |     const Point &lhs,
 98 |     const Point &rhs,
 99 |     const char *name
100 | ) {
101 |     bool good = lhs==rhs;
102 |     if (!p.validate()) { good = false; printf("  p invalid\n"); }
103 |     if (!q.validate()) { good = false; printf("  q invalid\n"); }
104 |     if (!rhs.validate()) { good = false; printf("  r invalid\n"); }
105 |     if (!lhs.validate()) { good = false; printf("  l invalid\n"); }
106 |     if (good) return true;
107 |     
108 |     test.fail();
109 |     printf("  %s", name);
110 |     print("x", x);
111 |     print("y", y);
112 |     print("p", p);
113 |     print("q", q);
114 |     print("r", R);
115 |     print("lhs", lhs); /* these two labels used to be attached to the opposite operands */
116 |     print("rhs", rhs);
117 |     return false;
118 | }
119 | 
120 | static void test_arithmetic() {
121 |     decaf::SpongeRng rng(decaf::Block("test_arithmetic"));
122 |     
123 |     Test test("Arithmetic");
124 |     Scalar x(0),y(0),z(0);
125 |     arith_check(test,x,y,z,INT_MAX,(decaf_word_t)INT_MAX,"cast from max");
126 |     arith_check(test,x,y,z,INT_MIN,-Scalar(1+(decaf_word_t)INT_MAX),"cast from min");
127 |         
128 |     for (int i=0; i<NTESTS*10 && test.passing_now; i++) {
129 |         /* TODO: pathological cases */
130 |         size_t sob = DECAF_448_SCALAR_BYTES + 8 - (i%16); /* NOTE(review): 448-specific constant inside a Group template -- confirm */
131 |         Scalar x(rng.read(sob));
132 |         Scalar y(rng.read(sob));
133 |         Scalar z(rng.read(sob));
134 |         
135 | 
136 |         arith_check(test,x,y,z,x+y,y+x,"commute add");
137 |         arith_check(test,x,y,z,x,x+0,"ident add");
138 |         arith_check(test,x,y,z,x,x-0,"ident sub");
139 |         arith_check(test,x,y,z,x+(y+z),(x+y)+z,"assoc add");
140 |         arith_check(test,x,y,z,x*(y+z),x*y + x*z,"distributive mul/add");
141 |         arith_check(test,x,y,z,x*(y-z),x*y - x*z,"distributive mul/sub");
142 |         arith_check(test,x,y,z,x*(y*z),(x*y)*z,"assoc mul");
143 |         arith_check(test,x,y,z,x*y,y*x,"commute mul");
144 |         arith_check(test,x,y,z,x,x*1,"ident mul");
145 |         arith_check(test,x,y,z,0,x*0,"mul by 0");
146 |         arith_check(test,x,y,z,-x,x*-1,"mul by -1");
147 |         arith_check(test,x,y,z,x+x,x*2,"mul by 2");
148 |         
149 |         if (i%20) continue; /* the division checks run on every 20th iteration only */
150 |         if (y!=0) arith_check(test,x,y,z,x*y/y,x,"invert");
151 |         arith_check(test,x,y,z,x/0,0,"invert0");
152 |     }
153 | }
154 | 
155 | static void test_elligator() {
156 |     decaf::SpongeRng rng(decaf::Block("test_elligator"));
157 |     Test test("Elligator");
158 |     
159 |     for (int i=0; i<16; i++) { /* identity point, all 8 hints, plain (i<8) and torqued (i>=8) */
160 |         decaf::SecureBuffer b1(Point::HASH_BYTES);
161 |         Point p = Point::identity();
162 |         if (i>=8) p.debugging_torque_in_place();
163 |         bool succ = p.invert_elligator(b1,i&7);
164 |         Point q;
165 |         unsigned char hint = q.set_to_hash(b1);
166 |         
167 |         if (succ != ((i&7) != 4) || (q != p) || (succ && (hint != (i&7)))) {
168 |             test.fail();
169 |             printf("Elligator test: t=%d, h=%d->%d, q%sp, %s %02x%02x\n",
170 |                 i/8, i&7, hint, (q==p)?"==":"!=",succ ? "SUCC" : "FAIL",
171 |                 b1[0], b1[1]);
172 |         }
173 |     }
174 | 
175 |     for (int i=0; i<NTESTS && test.passing_now; i++) { /* hash random input, then invert and compare */
176 |         size_t len = (i % (2*Point::HASH_BYTES + 3));
177 |         decaf::SecureBuffer b1(len), b2(len);
178 |         rng.read(b1);
179 |         if (i==1) b1[0] = 1; /* special case test */
180 |         if (len > Point::HASH_BYTES)
181 |             memcpy(&b2[Point::HASH_BYTES], &b1[Point::HASH_BYTES], len-Point::HASH_BYTES);
182 |         Point s;
183 |         unsigned char hint = s.set_to_hash(b1);
184 |         if (i&1) s.debugging_torque_in_place();
185 |         bool succ = s.invert_elligator(b2,hint);
186 |         if (!succ || memcmp(b1,b2,len)) {
187 |             test.fail();
188 |             printf("    Fail elligator inversion i=%d (claimed %s, hint=%d)\n",
189 |                 i, succ ? "success" : "failure", hint);
190 |         }
191 |         
192 |         Point t(rng);
193 |         point_check(test,t,t,t,0,0,t,Point::from_hash(t.steg_encode(rng)),"steg round-trip");
194 |     }
195 | }
196 | 
197 | static void test_ec() {
198 |     decaf::SpongeRng rng(decaf::Block("test_ec"));
199 |     
200 |     Test test("EC");
201 | 
202 |     Point id = Point::identity(), base = Point::base();
203 |     point_check(test,id,id,id,0,0,Point::from_hash(""),id,"fh0");
204 |     point_check(test,id,id,id,0,0,Point::from_hash("\x01"),id,"fh1");
205 |     
206 |     for (int i=0; i<NTESTS && test.passing_now; i++) {
207 |         /* TODO: pathological cases */
208 |         Scalar x(rng);
209 |         Scalar y(rng);
210 |         Point p(rng);
211 |         Point q(rng);
212 |         
213 |         decaf::SecureBuffer buffer(2*Point::HASH_BYTES);
214 |         rng.read(buffer);
215 |         Point r = Point::from_hash(buffer);
216 |         
217 |         point_check(test,p,q,r,0,0,p,Point((decaf::SecureBuffer)p),"round-trip");
218 |         point_check(test,p,q,r,0,0,p+q,q+p,"commute add");
219 |         point_check(test,p,q,r,0,0,(p-q)+q,p,"correct sub");
220 |         point_check(test,p,q,r,0,0,p+(q+r),(p+q)+r,"assoc add");
221 |         point_check(test,p,q,r,0,0,p.times_two(),p+p,"dbl add");
222 |         
223 |         if (i%10) continue; /* the scalar-multiplication checks run on every 10th iteration only */
224 |         point_check(test,p,q,r,x,0,x*(p+q),x*p+x*q,"distr mul");
225 |         point_check(test,p,q,r,x,y,(x*y)*p,x*(y*p),"assoc mul");
226 |         point_check(test,p,q,r,x,y,x*p+y*q,Point::double_scalarmul(x,p,y,q),"ds mul");
227 |         point_check(test,base,q,r,x,y,x*base+y*q,q.non_secret_combo_with_base(y,x),"ds vt mul");
228 |         point_check(test,p,q,r,x,0,Precomputed(p)*x,p*x,"precomp mul");
229 |         point_check(test,p,q,r,0,0,r,
230 |             Point::from_hash(buffer.slice(0,Point::HASH_BYTES))
231 |             + Point::from_hash(buffer.slice(Point::HASH_BYTES,Point::HASH_BYTES)),
232 |             "unih = hash+add"
233 |         );
234 |             
235 | 
236 |         point_check(test,p,q,r,x,0,Point(x.direct_scalarmul(decaf::SecureBuffer(p))),x*p,"direct mul");
237 |     }
238 | }
239 | 
240 | }; // template<typename Group> struct Tests
241 | 
242 | 
243 | static void test_decaf() { /* DH agreement in both directions, then sign/verify, on NTESTS random key pairs */
244 |     Test test("Sample crypto");
245 |     decaf::SpongeRng rng(decaf::Block("test_decaf"));
246 | 
247 |     decaf_448_symmetric_key_t proto1,proto2;
248 |     decaf_448_private_key_t s1,s2;
249 |     decaf_448_public_key_t p1,p2;
250 |     decaf_448_signature_t sig;
251 |     unsigned char shared1[1234],shared2[1234];
252 |     const char *message = "Hello, world!";
253 |     const size_t message_len = strlen(message);
254 |     for (int i=0; i<NTESTS && test.passing_now; i++) {
255 |         rng.read(decaf::TmpBuffer(proto1,sizeof(proto1)));
256 |         rng.read(decaf::TmpBuffer(proto2,sizeof(proto2)));
257 |         decaf_448_derive_private_key(s1,proto1);
258 |         decaf_448_private_to_public(p1,s1);
259 |         decaf_448_derive_private_key(s2,proto2);
260 |         decaf_448_private_to_public(p2,s2);
261 |         if (!decaf_448_shared_secret (shared1,sizeof(shared1),s1,p2)) {
262 |             test.fail(); printf("Fail ss12\n");
263 |         }
264 |         if (!decaf_448_shared_secret (shared2,sizeof(shared2),s2,p1)) {
265 |             test.fail(); printf("Fail ss21\n");
266 |         }
267 |         if (memcmp(shared1,shared2,sizeof(shared1)) != 0) { /* both sides must derive the same bytes */
268 |             test.fail(); printf("Fail ss21 == ss12\n");
269 |         }
270 |         decaf_448_sign (sig,s1,(const unsigned char *)message,message_len);
271 |         if (!decaf_448_verify (sig,p1,(const unsigned char *)message,message_len)) {
272 |             test.fail(); printf("Fail sig ver\n");
273 |         }
274 |     }
275 | }
276 | 
277 | int main(int argc, char **argv) { /* runs every suite; exit status is 0 only if all of them passed */
278 |     (void) argc; (void) argv; /* unused */
279 |     typedef Tests<decaf::Ed448> Ed448Tests;
280 |     Ed448Tests::test_arithmetic();
281 |     Ed448Tests::test_elligator();
282 |     Ed448Tests::test_ec();
283 |     test_decaf();
284 |     
285 |     if (!passing) return 1;
286 |     printf("Passed all tests.\n");
287 |     return 0;
288 | }
289 | 


--------------------------------------------------------------------------------
/aux/curve.sage:
--------------------------------------------------------------------------------
  1 | from idealized import Idealized
  2 | from collections import namedtuple
  3 | 
  4 | debugging = True
  5 | def debug_print(foo):
  6 |     if debugging: print foo
  7 | 
  8 | checkGroupLaws = True
  9 | checkTorsion = True
 10 | checkIsogenies = True
 11 | 
 12 | def memoize(f):
 13 |     # list cache because my __hash__ hack doesn't seem to work
 14 |     cache = []
 15 |     def ff(*args, **kwargs):
 16 |         key = (tuple(args),tuple(sorted(kwargs.iteritems())))
 17 |         for key_,value in cache:
 18 |             if key == key_: return value
 19 |         out = f(*args,**kwargs)
 20 |         cache.append((key,out))
 21 |         return out   
 22 |         
 23 |     try:
 24 |         ff.__name__ = f.__name__
 25 |     except AttributeError: pass
 26 |     return ff
 27 | 
 28 | def EcBase(curvename,varnames,ad=()):
 29 |     if isinstance(ad,str) or isinstance(ad[0],str):
 30 |         ad = Idealized.vars(ad)
 31 |     
 32 |     class Inner(namedtuple(curvename,(v for v in varnames))):
 33 |         params = ad
 34 |         torsion_points = {}
 35 |         def __new__(cls,*xy):
 36 |             def apply_invariants(xy,x):
 37 |                 for inv in cls.invariants(*(ad+xy)):
 38 |                     x = x.assuming(inv)
 39 |                 return x
 40 |             
 41 |             xy = tuple(xy)
 42 |             if len(xy) == 0:
 43 |                 xy = Idealized.uvars(varnames)
 44 |                 xy = [apply_invariants(xy,x) for x in xy]
 45 |             else:
 46 |                 for i,inv in enumerate(cls.invariants(*(ad + xy))):
 47 |                     if inv != 0:
 48 |                         raise Exception("Invariant inv[%d] not satisfied for %s: got \n%s" %
 49 |                              (i,curvename,str(inv)))
 50 | 
 51 |             return super(Inner,cls).__new__(cls,*xy)
 52 |                     
 53 |         varnames = "xy"
 54 |         
 55 |         @classmethod
 56 |         def invariants(self,*args): return []
 57 |         
 58 |         @classmethod
 59 |         @memoize
 60 |         def check_group(cls):
 61 |             if checkGroupLaws:
 62 |                 debug_print("Checking group law for %s..." % cls.__name__)
 63 |                 a,b,c,z = cls(),cls(),cls(),cls.basepoint
 64 |                 if a+z != a:
 65 |                     raise Exception("Base point is not identity!")
 66 |                 if a-a != z:
 67 |                     raise Exception("Subtraction doesn't work!")
 68 |                 if a+b != b+a:
 69 |                     raise Exception("Addition is not commutative!")
 70 |                 #if a+(b+c) != (a+b)+c:
 71 |                 #    raise Exception("Addition is not associative!")
 72 |             
 73 |             for t,n in cls.torsion():
 74 |                 if checkTorsion:
 75 |                     debug_print("  Checking %d-torsion..." % n)
 76 |                     cls.check_torsion(t,n)
 77 |                 #if n not in cls.torsion_points:
 78 |                 #    cls.torsion_points[n] = set()
 79 |                 #cls.torsion_points[n].add(cls(*t(cls.basepoint)))
 80 |                 
 81 |         @classmethod
 82 |         def check_torsion(cls,f,n):
 83 |             P = Q = cls()
 84 |             good = False
 85 |             for i in xrange(1,n+1):
 86 |                 Q = cls(*f(Q))
 87 |                 if Q == P:
 88 |                     if i==n:
 89 |                         good = True
 90 |                         break
 91 |                     raise Exception("Claimed %d-torsion, but is actually %d-torsion" % (n,i))
 92 |             if not good: raise Exception("Claimed %d-torsion, but isn't" % n)
 93 |             if n*P+n*cls(*f(P)) == cls.basepoint:
 94 |                 raise Exception("Torsion operation inverts element")
 95 |         
 96 |         @classmethod
 97 |         def torsion(cls):
 98 |             return []
 99 |         
100 |         def __sub__(self,other):
101 |             return self + (-other)
102 |             
103 |         def __mul__(self,other):
104 |             if other==0: return self.basepoint
105 |             if other < 0: return -(self*-other)
106 |             if other==1: return self
107 |             if is_even(other): return (self+self)*(other//2)
108 |             return (self+self)*(other//2) + self
109 |         
110 |         def __rmul__(self,other):
111 |             return self*other
112 |             
113 |     Inner.__name__ = curvename + "_base"
114 |     return Inner
115 | 
class Isogeny(object):
    # A pair of maps (fw: c1 -> c2, rv: c2 -> c1) between two curve classes,
    # together with a claimed degree.  Instances created with add=True are
    # registered in a class-wide weighted digraph so that generate() can
    # compose a path between any two registered curves.

    # Registered curve classes are vertices; each registered isogeny is an edge.
    isograph = DiGraph(weighted=True)
    # Maps (c1,c2) -> the Isogeny registered for that edge.
    isomap = {}
    
    @classmethod
    def generate(cls, fro, to):
        # Compose registered isogenies along the lightest path fro -> to.
        # Returns None when the path has no edge to traverse: either no path
        # exists (empty list) or fro and to are the same vertex (one-element
        # path, which previously raised IndexError on path[1]).
        path = cls.isograph.shortest_path(fro,to,by_weight=True)
        if len(path) < 2:
            return None
        iso = cls.isomap[(path[0], path[1])]
        for i in xrange(1,len(path)-1):
            iso = cls.isomap[(path[i],path[i+1])].compose(iso)
        return iso
    
    def __init__(self,c1,c2,deg,fw,rv,check=True,dual=None,add=True):
        # c1, c2 : source and target curve classes
        # deg    : claimed degree (used for the edge weight and the checks)
        # fw, rv : coordinate maps c1 -> c2 and c2 -> c1
        # check  : run the self-checks below (also gated by the module-level
        #          checkIsogenies flag)
        # dual   : existing dual Isogeny; if None, one is built from (rv,fw)
        # add    : register this isogeny in isomap / isograph
        self.c1 = c1
        self.c2 = c2
        self.fw = fw
        self.rv = rv
        self.deg = deg
        
        if add:
            Isogeny.isomap[(c1,c2)] = self
            # Weight grows with log2(deg); the 0.1 makes every hop cost something.
            Isogeny.isograph.add_edge(c1,c2,log(deg)/log(2) + 0.1)
        
        if dual is not None:
            self.dual = dual
        else:
            # The dual is built unchecked and points back at self.
            self.dual = Isogeny(c2,c1,deg,rv,fw,False,self,add)
        if not check: return
        
        
        if not checkIsogenies: return
        
        debug_print("Checking isogeny %s <-%d-> %s..." % (c1.__name__,deg,c2.__name__))
        if c2(*fw(*c1.basepoint)) != c2.basepoint:
            raise Exception("Isogeny doesn't preserve basepoints")
        # The dual direction must be checked with rv (c2 -> c1), not fw.
        if c1(*rv(*c2.basepoint)) != c1.basepoint:
            raise Exception("Isogeny dual doesn't preserve basepoints")
            
        foo = c1()
        bar = c2()
        
        # Constructing the images is itself a check: the results are discarded.
        c2(*fw(*foo))
        c1(*rv(*bar))
        
        # rv o fw and fw o rv must both be multiplication by deg.
        if c1(*rv(*c2(*fw(*foo)))) != deg*foo:
            raise Exception("Isogeny degree is wrong")
        if c2(*fw(*c1(*rv(*bar)))) != deg*bar:
            raise Exception("Isogeny degree is wrong")
        # Both maps must commute with negation.
        if -c2(*fw(*foo)) != c2(*fw(*(-foo))):
            raise Exception("Isogeny uses wrong negmap")
        if -c1(*rv(*bar)) != c1(*rv(*(-bar))):
            raise Exception("Isogeny uses wrong negmap")
            
        
    def __call__(self,ipt,**kwargs):
        # Apply the forward map to a point of c1 and wrap the result as a c2 point.
        return self.c2(*self.fw(*ipt,**kwargs))
        
    def __repr__(self): return str(self)
    def __str__(self):
        # Header line plus the images of one default-constructed point in each direction.
        out = "Isogeny %s%s <-%d-> %s%s..." %\
            (self.c1.__name__,str(self.c1.params),self.deg,
                self.c2.__name__,self.c2.params)
        out += "\n  fw: %s" % str(self(self.c1()))
        out += "\n  rv: %s" % str(self.dual(self.c2()))
        return out
        
    def compose(self,other):
        # Return self o other (apply other first), as an unchecked and
        # unregistered Isogeny whose degree is the product of the two.
        def fw(*args): return self.fw(*other.fw(*args))
        def rv(*args): return other.rv(*self.rv(*args))
        return Isogeny(other.c1,self.c2,self.deg*other.deg,fw,rv,False,None,False)
190 | 
def ec_family(defs,vars):
    # Class decorator factory.  Decorating CLS replaces it with a memoized
    # function that, given curve parameters, builds a concrete curve class
    # deriving from CLS and from EcBase(CLS.__name__, vars, params).
    #   defs: default parameters, used when the family is called with no arguments
    #   vars: coordinate names handed to EcBase
    def decorate(CLS):
        @memoize
        def family(*args,**kwargs):
            # Only the default instantiation gets its group laws checked.
            # kwargs are accepted but only consulted for this emptiness test.
            use_defaults = (len(args)==0 and len(kwargs)==0)
            if use_defaults:
                args = tuple(defs)
            
            class ret(CLS,EcBase(CLS.__name__,vars,args)):
                def __new__(cls,*args,**kwargs):
                    return super(ret,cls).__new__(cls,*args,**kwargs)
                
            ret.__name__ = CLS.__name__
            ret.basepoint = ret(*ret.get_basepoint())
                
            if use_defaults:
                ret.check_group()
            return ret
            
        family.__name__ = CLS.__name__ + "_family"
        # Build (and thereby check) the default member right away.
        family()
        return family
        
    return decorate
216 | 
217 | #checkGroupLaws = checkTorsion = False
218 | 
@ec_family("ad","xy")
class Edwards:
    # Curve family with parameters (a,d) and coordinates (x,y), defined by
    #     a*x^2 + y^2 = 1 + d*x^2*y^2.
    @classmethod
    def invariants(cls,a,d,x,y):
        # Expressions that must vanish for (x,y) to lie on the curve.
        curve_eq = y^2 + a*x^2 - 1 - d*x^2*y^2
        return [curve_eq]
        
    def __neg__(self):
        # Negation flips the sign of x only.
        flipped_x = -self.x
        return self.__class__(flipped_x,self.y)
    
    def __add__(self,other):
        # Addition law; the denominators share the cross term d*x1*x2*y1*y2.
        (x1,y1) = self
        (x2,y2) = other
        a,d = self.params
        cross = d*x1*x2*y1*y2
        x3 = (x1*y2+x2*y1)/(1+cross)
        y3 = (y1*y2-a*x1*x2)/(1-cross)
        return self.__class__(x3,y3)

    @classmethod
    def get_basepoint(cls):
        # Coordinates used for the class basepoint.
        return (0,1)

    @classmethod
    @memoize
    def torsion(cls):
        # Returns [(map, claimed order), ...]: coordinate maps together with
        # the order that check_torsion should verify for each.
        a,d = cls.params
        root_a = a.sqrt()
        root_d = d.sqrt()
        root_ad = (a*d).sqrt()
        def tor2_1((x,y)): return (-x,-y)
        def tor4_1((x,y)): return (y/root_a,-x*root_a)
        def tor4_2((x,y)): return (1/(root_d*y),-1/(root_d*x))
        def tor2_2((x,y)): return (-1/(root_ad*x),-a/(root_ad*y))
        
        maps = [(tor2_1,2),(tor2_2,2),(tor4_1,4),(tor4_2,4)]
        return maps
251 | 
@ec_family("eA","st")
class JacobiQuartic:
    # Curve family with parameters (e,A) and coordinates (s,t), defined by
    #     t^2 = e*s^4 + 2*A*s^2 + 1.
    @classmethod
    def invariants(cls,e,A,s,t):
        # Expressions that must vanish for (s,t) to lie on the curve.
        curve_eq = -t^2 + e*s^4 + 2*A*s^2 + 1
        return [curve_eq]
        
    def __neg__(self):
        # Negation flips the sign of s only.
        flipped_s = -self.s
        return self.__class__(flipped_s,self.t)
    
    def __add__(self,other):
        # Addition law; both output coordinates share the term e*(s1*s2)^2.
        (s1,t1) = self
        (s2,t2) = other
        e,A = self.params
        cross = e*(s1*s2)^2
        t_num = (1+cross)*(t1*t2+2*A*s1*s2) + 2*e*s1*s2*(s1^2+s2^2)
        s3 = (s1*t2+s2*t1)/(1-cross)
        t3 = t_num/(1-cross)^2
        return self.__class__(s3,t3)

    @classmethod
    def get_basepoint(cls):
        # Coordinates used for the class basepoint.
        return (0,1)

    @classmethod
    @memoize
    def torsion(cls):
        # Returns [(map, claimed order), ...] for check_torsion to verify.
        e,A = cls.params
        root_e = e.sqrt()
        def tor2_1((s,t)): return (-s,-t)
        def tor2_2((s,t)): return (1/(root_e*s),-t/(root_e*s^2))
        maps = [(tor2_1,2),(tor2_2,2)]
        return maps
280 | 
a,d = Idealized.vars("ad")
def phi_iso(a,d):
    # Build (and register) the degree-2 isogeny between Edwards(a,d) and
    # JacobiQuartic(a^2, a-2*d), with explicit forward and reverse maps.
    def edwards_to_quartic(x,y):
        return (x/y, (2-y^2-a*x^2)/y^2)
    def quartic_to_edwards(s,t):
        return (2*s/(1+a*s^2), (1-a*s^2)/t)
    source = Edwards(a,d)
    target = JacobiQuartic(a^2,a-2*d)
    return Isogeny(source,target,2,edwards_to_quartic,quartic_to_edwards)
288 | 
# Build and print the isogeny for the symbolic parameters (a,d) and for
# (-a,d-a).  Both calls target JacobiQuartic(a^2, a-2*d), since
# (-a)^2 = a^2 and -a-2*(d-a) = a-2*d.
print phi_iso(a,d)
print phi_iso(-a,d-a)

# Ask the isogeny graph for a composed map between the two Edwards curves.
print Isogeny.generate(Edwards(a,d),Edwards(-a,d-a))


--------------------------------------------------------------------------------
/src/include/constant_time.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @file constant_time.h
  3 |  * @copyright
  4 |  *   Copyright (c) 2014 Cryptography Research, Inc.  \n
  5 |  *   Released under the MIT License.  See LICENSE.txt for license information.
  6 |  * @author Mike Hamburg
  7 |  *
  8 |  * @brief Constant-time routines.
  9 |  */
 10 | 
 11 | #ifndef __CONSTANT_TIME_H__
 12 | #define __CONSTANT_TIME_H__ 1
 13 | 
 14 | #include "word.h"
 15 | #include <string.h>
 16 | 
 17 | /*
 18 |  * Constant-time operations on hopefully-compile-time-sized memory
 19 |  * regions.  Needed for flexibility / demagication: not all fields
 20 |  * have sizes which are multiples of the vector width, necessitating
 21 |  * a change from the Ed448 versions.
 22 |  *
 23 |  * These routines would be much simpler to define at the byte level,
 24 |  * but if not vectorized they would be a significant fraction of the
 25 |  * runtime.  Eg on NEON-less ARM, constant_time_lookup is like 15% of
 26 |  * signing time, vs 6% on Haswell with its fancy AVX2 vectors.
 27 |  *
 28 |  * If the compiler could do a good job of autovectorizing the code,
 29 |  * we could just leave it with the byte definition.  But that's unlikely
 30 |  * on most deployed compilers, especially if you consider that pcmpeq[size]
 31 |  * is much faster than moving a scalar to the vector unit (which is what
 32 |  * a naive autovectorizer will do with constant_time_lookup on Intel).
 33 |  *
 34 |  * Instead, we're putting our trust in the loop unroller and unswitcher.
 35 |  * 
 36 |  * TODO: verify correctness and performance on each platform, to make sure
 37 |  * that there are no regressions.
 38 |  */
 39 | 
 40 | 
/**
 * Unaligned big (vector?) register.
 *
 * Wrapping big_register_t in a packed struct lets the routines in this file
 * read and write one through a pointer that is not naturally aligned; they
 * use it on their "unaligned" branches.
 */
typedef struct {
    big_register_t unaligned;
} __attribute__((packed)) unaligned_br_t;
 47 | 
/**
 * Unaligned word register, for architectures where that matters.
 *
 * Same packed-struct wrapper as unaligned_br_t, but for a single word_t;
 * used on the word-sized "unaligned" branches of the routines in this file.
 */
typedef struct {
    word_t unaligned;
} __attribute__((packed)) unaligned_word_t;
 54 | 
 55 | /**
 56 |  * @brief Constant-time conditional swap.
 57 |  *
 58 |  * If doswap, then swap elem_bytes between *a and *b.
 59 |  *
 60 |  * *a and *b must not alias.  Also, they must be at least as aligned
 61 |  * as their sizes, if the CPU cares about that sort of thing.
 62 |  */
 63 | static __inline__ void
 64 | __attribute__((unused,always_inline))
 65 | constant_time_cond_swap (
 66 |     void *__restrict__ a_,
 67 |     void *__restrict__ b_,
 68 |     word_t elem_bytes,
 69 |     mask_t doswap
 70 | ) {
 71 |     word_t k;
 72 |     unsigned char *a = (unsigned char *)a_;
 73 |     unsigned char *b = (unsigned char *)b_;
 74 |     
 75 |     big_register_t br_mask = br_set_to_mask(doswap);
 76 |     for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) {
 77 |         if (elem_bytes % sizeof(big_register_t)) {
 78 |             /* unaligned */
 79 |             big_register_t xor =
 80 |                 ((unaligned_br_t*)(&a[k]))->unaligned
 81 |               ^ ((unaligned_br_t*)(&b[k]))->unaligned;
 82 |             xor &= br_mask;
 83 |             ((unaligned_br_t*)(&a[k]))->unaligned ^= xor;
 84 |             ((unaligned_br_t*)(&b[k]))->unaligned ^= xor;
 85 |         } else {
 86 |             /* aligned */
 87 |             big_register_t xor =
 88 |                 *((big_register_t*)(&a[k]))
 89 |               ^ *((big_register_t*)(&b[k]));
 90 |             xor &= br_mask;
 91 |             *((big_register_t*)(&a[k])) ^= xor;
 92 |             *((big_register_t*)(&b[k])) ^= xor;
 93 |         }
 94 |     }
 95 | 
 96 |     if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
 97 |         for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) {
 98 |             if (elem_bytes % sizeof(word_t)) {
 99 |                 /* unaligned */
100 |                 word_t xor =
101 |                     ((unaligned_word_t*)(&a[k]))->unaligned
102 |                   ^ ((unaligned_word_t*)(&b[k]))->unaligned;
103 |                 xor &= doswap;
104 |                 ((unaligned_word_t*)(&a[k]))->unaligned ^= xor;
105 |                 ((unaligned_word_t*)(&b[k]))->unaligned ^= xor;
106 |             } else {
107 |                 /* aligned */
108 |                 word_t xor =
109 |                     *((word_t*)(&a[k]))
110 |                   ^ *((word_t*)(&b[k]));
111 |                 xor &= doswap;
112 |                 *((word_t*)(&a[k])) ^= xor;
113 |                 *((word_t*)(&b[k])) ^= xor;
114 |             }
115 |         }
116 |     }
117 |     
118 |     if (elem_bytes % sizeof(word_t)) {
119 |         for (; k<elem_bytes; k+=1) {
120 |             unsigned char xor = a[k] ^ b[k];
121 |             xor &= doswap;
122 |             a[k] ^= xor;
123 |             b[k] ^= xor;
124 |         }
125 |     }
126 | }
127 | 
128 | /**
129 |  * @brief Constant-time equivalent of memcpy(out, table + elem_bytes*idx, elem_bytes);
130 |  *
131 |  * The table must be at least as aligned as elem_bytes.  The output must be word aligned,
132 |  * and if the input size is vector aligned it must also be vector aligned.
133 |  *
134 |  * The table and output must not alias.
135 |  */
136 | static __inline__ void
137 | __attribute__((unused,always_inline))
138 | constant_time_lookup (
139 |     void *__restrict__ out_,
140 |     const void *table_,
141 |     word_t elem_bytes,
142 |     word_t n_table,
143 |     word_t idx
144 | ) {
145 |     big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx);
146 |     
147 |     /* Can't do pointer arithmetic on void* */
148 |     unsigned char *out = (unsigned char *)out_;
149 |     const unsigned char *table = (const unsigned char *)table_;
150 |     word_t j,k;
151 |     
152 |     memset(out, 0, elem_bytes);
153 |     for (j=0; j<n_table; j++, big_i-=big_one) {        
154 |         big_register_t br_mask = br_is_zero(big_i);
155 |         for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) {
156 |             if (elem_bytes % sizeof(big_register_t)) {
157 |                 /* unaligned */
158 |                 ((unaligned_br_t *)(out+k))->unaligned
159 | 			|= br_mask & ((const unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned;
160 |             } else {
161 |                 /* aligned */
162 |                 *(big_register_t *)(out+k) |= br_mask & *(const big_register_t*)(&table[k+j*elem_bytes]);
163 |             }
164 |         }
165 | 
166 |         word_t mask = word_is_zero(idx^j);
167 |         if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
168 |             for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) {
169 |                 if (elem_bytes % sizeof(word_t)) {
170 |                     /* input unaligned, output aligned */
171 |                     *(word_t *)(out+k) |= mask & ((const unaligned_word_t*)(&table[k+j*elem_bytes]))->unaligned;
172 |                 } else {
173 |                     /* aligned */
174 |                     *(word_t *)(out+k) |= mask & *(const word_t*)(&table[k+j*elem_bytes]);
175 |                 }
176 |             }
177 |         }
178 |         
179 |         if (elem_bytes % sizeof(word_t)) {
180 |             for (; k<elem_bytes; k+=1) {
181 |                 out[k] |= mask & table[k+j*elem_bytes];
182 |             }
183 |         }
184 |     }
185 | }
186 | 
187 | /**
188 |  * @brief Constant-time a = b&mask.
189 |  *
190 |  * The input and output must be at least as aligned as elem_bytes.
191 |  */
192 | static __inline__ void
193 | __attribute__((unused,always_inline))
194 | constant_time_mask (
195 |     void * a_,
196 |     const void *b_,
197 |     word_t elem_bytes,
198 |     mask_t mask
199 | ) {
200 |     unsigned char *a = (unsigned char *)a_;
201 |     const unsigned char *b = (const unsigned char *)b_;
202 |     
203 |     word_t k;
204 |     big_register_t br_mask = br_set_to_mask(mask);
205 |     for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) {
206 |         if (elem_bytes % sizeof(big_register_t)) {
207 |             /* unaligned */
208 |             ((unaligned_br_t*)(&a[k]))->unaligned = br_mask & ((const unaligned_br_t*)(&b[k]))->unaligned;
209 |         } else {
210 |             /* aligned */
211 |             *(big_register_t *)(a+k) = br_mask & *(const big_register_t*)(&b[k]);
212 |         }
213 |     }
214 | 
215 |     if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
216 |         for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) {
217 |             if (elem_bytes % sizeof(word_t)) {
218 |                 /* unaligned */
219 |                 ((unaligned_word_t*)(&a[k]))->unaligned = mask & ((const unaligned_word_t*)(&b[k]))->unaligned;
220 |             } else {
221 |                 /* aligned */
222 |                 *(word_t *)(a+k) = mask & *(const word_t*)(&b[k]);
223 |             }
224 |         }
225 |     }
226 |     
227 |     if (elem_bytes % sizeof(word_t)) {
228 |         for (; k<elem_bytes; k+=1) {
229 |             a[k] = mask & b[k];
230 |         }
231 |     }
232 | }
233 | 
234 | /**
235 |  * @brief Constant-time a = mask ? bTrue : bFalse.
236 |  *
237 |  * The input and output must be at least as aligned as elem_bytes.
238 |  *
239 |  * Note that the output is not __restrict__, but if it overlaps either
240 |  * input, it must be equal and not partially overlap.
241 |  */
242 | static __inline__ void
243 | __attribute__((unused,always_inline))
244 | constant_time_select (
245 |     void *a_,
246 |     const void *bTrue_,
247 |     const void *bFalse_,
248 |     word_t elem_bytes,
249 |     mask_t mask
250 | ) {
251 |     unsigned char *a = (unsigned char *)a_;
252 |     const unsigned char *bTrue = (const unsigned char *)bTrue_;
253 |     const unsigned char *bFalse = (const unsigned char *)bFalse_;
254 |     
255 |     word_t k;
256 |     big_register_t br_mask = br_set_to_mask(mask);
257 |     for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) {
258 |         if (elem_bytes % sizeof(big_register_t)) {
259 |             /* unaligned */
260 |             ((unaligned_br_t*)(&a[k]))->unaligned =
261 | 		  ( br_mask & ((const unaligned_br_t*)(&bTrue [k]))->unaligned)
262 | 		| (~br_mask & ((const unaligned_br_t*)(&bFalse[k]))->unaligned);
263 |         } else {
264 |             /* aligned */
265 |             *(big_register_t *)(a+k) =
266 | 		  ( br_mask & *(const big_register_t*)(&bTrue [k]))
267 | 		| (~br_mask & *(const big_register_t*)(&bFalse[k]));
268 |         }
269 |     }
270 | 
271 |     if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
272 |         for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) {
273 |             if (elem_bytes % sizeof(word_t)) {
274 |                 /* unaligned */
275 |                 ((unaligned_word_t*)(&a[k]))->unaligned =
276 | 		    ( mask & ((const unaligned_word_t*)(&bTrue [k]))->unaligned)
277 | 		  | (~mask & ((const unaligned_word_t*)(&bFalse[k]))->unaligned);
278 |             } else {
279 |                 /* aligned */
280 |                 *(word_t *)(a+k) =
281 | 		    ( mask & *(const word_t*)(&bTrue [k]))
282 | 		  | (~mask & *(const word_t*)(&bFalse[k]));
283 |             }
284 |         }
285 |     }
286 |     
287 |     if (elem_bytes % sizeof(word_t)) {
288 |         for (; k<elem_bytes; k+=1) {
289 |             a[k] = ( mask & bTrue[k]) | (~mask & bFalse[k]);
290 |         }
291 |     }
292 | }
293 | 
294 | #endif /* __CONSTANT_TIME_H__ */
295 | 


--------------------------------------------------------------------------------
/test/test_decaf.sage:
--------------------------------------------------------------------------------
  1 | from ctypes import *
  2 | from base64 import *
  3 | 
# C implementation under test, loaded through ctypes.
DECAF = CDLL("libdecaf.so")

# Reference model: the field F, and the curve E given by the coefficient list
# [a1,a2,a3,a4,a6] = [0, 2-4*d, 0, 1, 0], i.e. y^2 = x^3 + (2-4*d)*x^2 + x.
F = GF(2^448-2^224-1)
d = -39081
E = EllipticCurve(F,[0,2-4*d,0,1,0])
# A point with x = -1 and its multiples 0..3.  DecafPoint.__eq__ treats points
# that differ by an element of Tor as equal.  (Presumably a 4-torsion point,
# going by the name -- TODO confirm.)
p_tor4 = E.lift_x(-1)
Tor = [p_tor4 * i for i in xrange(4)]
# Modulus of the scalar field used by the DecafScalar reference model.
q = 2^446-0x8335dc163bb124b65129c96fde933d8d723a70aadc873d6d54a7bb0d
FQ = GF(q)

# Cleared by youfail() when any test fails.
passing = True
 15 | 
 16 | # TODO: pathological cases
 17 | # TODO: Elligator
 18 | # TODO: double scalar mul
 19 | 
 20 | def random_array(length):
 21 |     answer = "".join([chr(randint(0,255)) for i in xrange(length)])
 22 |     return answer
 23 | 
 24 | def from_le(buf):
 25 |     return sum([256^i * ord(x) for i,x in enumerate(buf)])
 26 | 
 27 | def youfail(why,n):
 28 |     print ("Fail on test %d!"%n), why
 29 |     global passing
 30 |     passing = False
 31 |     
 32 | def run_test(i):
 33 |     try:
 34 |         s = DecafScalar.random()
 35 |         t = DecafScalar.random()
 36 |         p = DecafPoint.random()
 37 |         q = DecafPoint.random()
 38 |         s*p + t*q
 39 |         if s*(t*p) != (s*t)*p:
 40 |             raise Exception("Mul doesn't work")
 41 |         (p+q-p-q).ser() # i guess...
 42 |     except Exception, e:
 43 |         youfail(e,i)
 44 | 
 45 | def run_all_tests(n = 100):
 46 |     for testno in xrange(n):
 47 |         run_test(testno)
 48 |     if passing:
 49 |         print "Passed all %d tests." % n
 50 | 
 51 | def to_le(x,n):
 52 |     x = int(x)
 53 |     if x >= 256^n:
 54 |         raise Exception("Integer too big in to_le(%d,%d)" % (x,n))
 55 |     return "".join([chr(x>>(8*i) & 255) for i in xrange(n)])
 56 | 
 57 | class DecafScalar():
 58 |     _UNDER = c_uint64 * int(7)
 59 |     def __init__(self,cstruct=None,scalar=None):
 60 |         if cstruct is None:
 61 |             cstruct = DecafScalar._UNDER()
 62 |             memmove(addressof(cstruct),
 63 |                 DECAF.decaf_448_scalar_zero,
 64 |                 8*7
 65 |             )
 66 |         if scalar is None:
 67 |             scalar = E(0)
 68 |         self.cstruct = cstruct
 69 |         self.scalar = scalar
 70 |         
 71 |         self._check()
 72 |     
 73 |     @staticmethod
 74 |     def _c_deser(str):
 75 |         buffer = (c_uint8*int(56)).from_buffer_copy(str)
 76 |         cstruct = DecafScalar._UNDER()
 77 |         ret = DECAF.decaf_448_scalar_decode(cstruct,buffer,c_uint64(-1))
 78 |         if ret != -1:
 79 |             raise Exception("scalar didn't decode")
 80 |         return cstruct
 81 |     
 82 |     @staticmethod
 83 |     def _sage_deser(str):
 84 |         s = from_le(str)
 85 |         if s >= FQ.cardinality(): raise Exception("scalar didn't decode")
 86 |         return FQ(s)
 87 |     
 88 |     def __eq__(self,other):
 89 |         csays = bool(DECAF.decaf_448_scalar_eq(self.cstruct,other.cstruct))
 90 |         sagesays = any([self.scalar == other.scalar + t for t in Tor])
 91 |         if csays != sagesays:
 92 |             raise Exception("C and SAGE don't agree: %d %d" % (csays, sagesays))
 93 |         return csays
 94 |         
 95 |     def __ne__(self,other):
 96 |         return not self==other
 97 |     
 98 |     def __add__(self,other):
 99 |         cstruct = DecafScalar._UNDER()
100 |         DECAF.decaf_448_scalar_add(cstruct,self.cstruct,other.cstruct)
101 |         return DecafScalar(cstruct,self.scalar + other.scalar)
102 |     
103 |     def __sub__(self,other):
104 |         cstruct = DecafScalar._UNDER()
105 |         DECAF.decaf_448_scalar_sub(cstruct,self.cstruct,other.cstruct)
106 |         return DecafScalar(cstruct,self.scalar - other.scalar)
107 |     
108 |     def __mul__(self,other):
109 |         if isinstance(other,DecafScalar):
110 |             cstruct = DecafScalar._UNDER()
111 |             DECAF.decaf_448_scalar_mul(cstruct,self.cstruct,other.cstruct)
112 |             return DecafScalar(cstruct,self.scalar * other.scalar)
113 |         elif isinstance(other,DecafPoint):
114 |             cstruct = DecafPoint._UNDER()
115 |             DECAF.decaf_448_point_scalarmul(cstruct,other.cstruct,self.cstruct)
116 |             return DecafPoint(cstruct,int(self.scalar) * other.point)
117 |         else: raise Exception("Nope")
118 |     
119 |     def __div__(self,other):
120 |         return self / other.inverse()
121 |     
122 |     def inverse(self):
123 |         cstruct = DecafScalar._UNDER()
124 |         z = DECAF.decaf_448_scalar_invert(cstruct,self.cstruct)
125 |         if bool(z) != (self.scalar == 0):
126 |             raise Exception("C and SAGE don't agree")
127 |         return DecafScalar(cstruct,1/self.scalar)
128 |     
129 |     def __neg__(self):
130 |         cstruct = DecafScalar._UNDER()
131 |         DECAF.decaf_448_scalar_negate(cstruct,self.cstruct)
132 |         return DecafScalar(cstruct,-self.scalar)
133 |         
134 |     def __str__(self):
135 |         return " ".join(["%02x"%ord(b) for b in self.ser()])
136 |     
137 |     def __repr__(self):
138 |         return "DecafScalar.fromInt(%d)" % self.scalar
139 |     
140 |     @classmethod
141 |     def fromInt(cls,i):
142 |         return cls.deser(to_le(i,56))
143 |         
144 |     def to64(self):
145 |         return b64encode(self.ser())
146 |     
147 |     @classmethod
148 |     def from64(cls,str):
149 |         return cls.deser(b64decode(str))
150 |     
151 |     @classmethod
152 |     def deser(cls,str):
153 |         good = True
154 |         try: cstruct = cls._c_deser(str)
155 |         except Exception: good = False
156 |         
157 |         good2 = True
158 |         try: scalar = cls._sage_deser(str)
159 |         except Exception: good2 = False
160 |         
161 |         if good != good2:
162 |             raise Exception("C and SAGE don't agree")
163 |         elif not good:
164 |             raise Exception("scalar didn't decode")
165 |         
166 |         return cls(cstruct,scalar)
167 |         
168 |     @classmethod
169 |     def random(cls):
170 |         while True:
171 |             try: return cls.deser(random_array(56))
172 |             except Exception: pass
173 | 
174 |     @staticmethod
175 |     def _c_ser(cstruct):
176 |         buffer = (c_uint8*int(56))()
177 |         DECAF.decaf_448_scalar_encode(buffer,cstruct)
178 |         return str(bytearray(buffer))
179 |     
180 |     def ser(self):
181 |         return self._c_ser(self.cstruct)
182 | 
183 |     @staticmethod
184 |     def _sage_ser(P):
185 |         return to_le(P,56)
186 |         
187 |     def _check(self):
188 |         ss = self._sage_ser(self.scalar)
189 |         cs = self._c_ser(self.cstruct)
190 |         if ss != cs:
191 |             print ss
192 |             print cs
193 |             raise Exception("Check failed!")
194 |         return True
195 | 
# A group element held twice: as the C library's representation (cstruct) and
# as a point on the reference curve E (point).  Every operation is performed
# on both sides, and _check() compares their serializations whenever an
# instance is built.
class DecafPoint():
    # C-side storage: 8*4 words of 64 bits.
    _UNDER = c_uint64 * int(8*4)
    def __init__(self,cstruct=None,point=None):
        # cstruct: C-side value; defaults to a copy of decaf_448_point_identity
        # point  : Sage-side value; defaults to E(0)
        if cstruct is None:
            cstruct = DecafPoint._UNDER()
            memmove(addressof(cstruct),
                DECAF.decaf_448_point_identity,
                8*8*4
            )
        if point is None:
            point = E(0)
        self.cstruct = cstruct
        self.point = point
        
        self._check()
    
    @staticmethod
    def _c_deser(str):
        # Decode 56 bytes with the C library; raises if it reports failure.
        buffer = (c_uint8*int(56)).from_buffer_copy(str)
        cstruct = DecafPoint._UNDER()
        ret = DECAF.decaf_448_point_decode(cstruct,buffer,c_uint64(-1))
        if ret != -1:
            raise Exception("Point didn't decode")
        return cstruct
    
    @staticmethod
    def _sage_deser(str):
        # Reference decoder.  The encoding is a little-endian integer s that
        # must lie in the lower half of the field; the decoded point has
        # x-coordinate s^2.
        s = from_le(str)
        if s > (F.cardinality()-1)/2: raise Exception("Point didn't decode")
        if (s==0): return E(0)
        if not E.is_x_coord(s^2): raise Exception("Point didn't decode")
        P = E.lift_x(s^2)
        t = P.xy()[1] / s
        # Choose between P and -P by the parity of 2*t/s.
        if is_odd(int(2*t/s)): P = -P
        return P
    
    def __eq__(self,other):
        csays = bool(DECAF.decaf_448_point_eq(self.cstruct,other.cstruct))
        # The reference side treats points that differ by an element of Tor
        # as equal.
        sagesays = any([self.point == other.point + t for t in Tor])
        if csays != sagesays:
            raise Exception("C and SAGE don't agree: %d %d" % (csays, sagesays))
        return csays
                
    def __ne__(self,other):
        return not self==other
    
    def __add__(self,other):
        cstruct = DecafPoint._UNDER()
        DECAF.decaf_448_point_add(cstruct,self.cstruct,other.cstruct)
        return DecafPoint(cstruct,self.point + other.point)
    
    def __sub__(self,other):
        cstruct = DecafPoint._UNDER()
        DECAF.decaf_448_point_sub(cstruct,self.cstruct,other.cstruct)
        return DecafPoint(cstruct,self.point - other.point)
        
    def __mul__(self,other):
        # point*scalar is delegated to DecafScalar.__mul__.
        if isinstance(other,DecafScalar):
            return other*self
        else:
            raise Exception("nope")
        
    def __div__(self,other):
        # point/scalar multiplies by the scalar's inverse.
        if isinstance(other,DecafScalar):
            return other.inverse()*self
        else:
            raise Exception("nope")
    
    def __neg__(self):
        cstruct = DecafPoint._UNDER()
        DECAF.decaf_448_point_negate(cstruct,self.cstruct)
        return DecafPoint(cstruct,-self.point)
        
    def __str__(self):
        # Space-separated hex dump of the serialized form.
        return " ".join(["%02x"%ord(b) for b in self.ser()])
    
    def __repr__(self):
        return "DecafPoint.from64('%s')" % self.to64()
        
    def to64(self):
        return b64encode(self.ser())
    
    @classmethod
    def from64(cls,str):
        return cls.deser(b64decode(str))
    
    @classmethod
    def deser(cls,str):
        # Decode with both implementations; they must agree on whether the
        # input is valid.  Raises if they disagree or if both reject it.
        good = True
        try: cstruct = cls._c_deser(str)
        except Exception: good = False
        
        good2 = True
        try: point = cls._sage_deser(str)
        except Exception: good2 = False
        
        if good != good2:
            raise Exception("C and SAGE don't agree")
        elif not good:
            raise Exception("Point didn't decode")
        
        return cls(cstruct,point)
        
    @classmethod
    def random(cls):
        # Rejection sampling: retry until 56 random bytes decode.
        while True:
            try: return cls.deser(random_array(56))
            except Exception: pass

    @staticmethod
    def _c_ser(cstruct):
        buffer = (c_uint8*int(56))()
        DECAF.decaf_448_point_encode(buffer,cstruct)
        return str(bytearray(buffer))
    
    def ser(self):
        return self._c_ser(self.cstruct)

    @staticmethod
    def _sage_ser(P):
        # Reference encoder.  E(0) and points with x == 0 encode as zero.
        # Otherwise start from s = sqrt(x), replace s by 1/s depending on the
        # parity of 2*y/s^2, then negate s if it lies in the upper half of
        # the field.
        if P == E(0): return to_le(0,56)
        x,y = P.xy()
        s = sqrt(x)
        if s==0: return to_le(0,56)
        if is_odd(int(2*y/s^2)): s = 1/s
        if int(s) > (F.cardinality()-1)/2: s = -s
        return to_le(s,56)
        
    def _check(self):
        # The two representations must serialize to the same 56 bytes.
        ss = self._sage_ser(self.point)
        cs = self._c_ser(self.cstruct)
        if ss != cs:
            print ss
            print cs
            raise Exception("Check failed!")
        return True
332 |         
# Run the default 100 randomized rounds as soon as the script is loaded.
run_all_tests()
334 |     
335 |     


--------------------------------------------------------------------------------
/src/p448/arch_x86_64/p448.c:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | 
  5 | #include "p448.h"
  6 | #include "x86-64-arith.h"
  7 | /* p448_mul: field multiplication.  Stores the product of *as and *bs in *cs as eight radix-2^56 limbs. */
  8 | void
  9 | p448_mul (
 10 |     p448_t *__restrict__ cs, /* out: product limbs; __restrict__-qualified, so it must not alias as or bs */
 11 |     const p448_t *as, /* in: first factor (limb[0..7] are read) */
 12 |     const p448_t *bs /* in: second factor (limb[0..7] are read) */
 13 | ) {
 14 |     const uint64_t *a = as->limb, *b = bs->limb;
 15 |     uint64_t *c = cs->limb;
 16 |     /* NOTE(review): widemul/mac/msb and uint64xn_t are not defined in this file; the comments below assume widemul(x,y) is the 128-bit product (*x)*(*y) and that mac adds / msb subtracts such a product - confirm in the headers. */
 17 |     __uint128_t accum0 = 0, accum1 = 0, accum2;
 18 |     uint64_t mask = (1ull<<56) - 1;  
 19 |     /* aa = lo(a)+hi(a), bb = lo(b)+hi(b), bbb = bb+hi(b), where lo = limbs 0..3 and hi = limbs 4..7. */
 20 |     uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))), bbb[4] __attribute__((aligned(32)));
 21 | 
 22 |     /* For some reason clang doesn't vectorize this without prompting? */
 23 |     unsigned int i;
 24 |     for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
 25 |         ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
 26 |         ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i]; 
 27 |         ((uint64xn_t*)bbb)[i] = ((const uint64xn_t*)bb)[i] + ((const uint64xn_t*)(&b[4]))[i];     
 28 |     }
 29 |     /*
 30 |     Scalar form of the loop above, assuming uint64xn_t is a vector of uint64_t lanes:
 31 |     for (int i=0; i<4; i++) {
 32 |     aa[i] = a[i] + a[i+4];
 33 |     bb[i] = b[i] + b[i+4];  bbb[i] = bb[i] + b[i+4];  }
 34 |     */
 35 |     /* Column 3 of three half-size products: lo*lo in accum2, (lo+hi)*(lo+hi) in accum0, hi*hi in accum1. */
 36 |     accum2  = widemul(&a[0],&b[3]);
 37 |     accum0  = widemul(&aa[0],&bb[3]);
 38 |     accum1  = widemul(&a[4],&b[7]);
 39 | 
 40 |     mac(&accum2, &a[1], &b[2]);
 41 |     mac(&accum0, &aa[1], &bb[2]);
 42 |     mac(&accum1, &a[5], &b[6]);
 43 | 
 44 |     mac(&accum2, &a[2], &b[1]);
 45 |     mac(&accum0, &aa[2], &bb[1]);
 46 |     mac(&accum1, &a[6], &b[5]);
 47 | 
 48 |     mac(&accum2, &a[3], &b[0]);
 49 |     mac(&accum0, &aa[3], &bb[0]);
 50 |     mac(&accum1, &a[7], &b[4]);
 51 |     /* For this column: accum0 = (lo+hi)*(lo+hi) - lo*lo and accum1 = hi*hi + lo*lo. */
 52 |     accum0 -= accum2;
 53 |     accum1 += accum2;
 54 |     /* Provisional limbs 3 and 7 (low 56 bits); the carries out of limbs 2 and 6 are added near the end. */
 55 |     c[3] = ((uint64_t)(accum1)) & mask;
 56 |     c[7] = ((uint64_t)(accum0)) & mask;
 57 |     /* The bits above 56 carry into the next pair of columns. */
 58 |     accum0 >>= 56;
 59 |     accum1 >>= 56;
 60 |     /* Limbs 0 and 4: accum0 is stored as limb 0 and accum1 as limb 4. */
 61 |     mac(&accum0, &aa[1],&bb[3]);
 62 |     mac(&accum1, &a[5], &b[7]);
 63 |     mac(&accum0, &aa[2], &bb[2]);
 64 |     mac(&accum1, &a[6], &b[6]);
 65 |     mac(&accum0, &aa[3], &bb[1]);
 66 |     accum1 += accum0;
 67 | 
 68 |     accum2 = widemul(&a[0],&b[0]);
 69 |     accum1 -= accum2;
 70 |     accum0 += accum2;
 71 |     
 72 |     msb(&accum0, &a[1], &b[3]);
 73 |     msb(&accum0, &a[2], &b[2]);
 74 |     mac(&accum1, &a[7], &b[5]);
 75 |     msb(&accum0, &a[3], &b[1]);
 76 |     mac(&accum1, &aa[0], &bb[0]);
 77 |     mac(&accum0, &a[4], &b[4]);
 78 | 
 79 |     c[0] = ((uint64_t)(accum0)) & mask;
 80 |     c[4] = ((uint64_t)(accum1)) & mask;
 81 | 
 82 |     accum0 >>= 56;
 83 |     accum1 >>= 56;
 84 |     /* Limbs 1 and 5: accum0 is stored as limb 1 and accum1 as limb 5. */
 85 |     accum2  = widemul(&a[2],&b[7]);
 86 |     mac(&accum0, &a[6], &bb[3]);
 87 |     mac(&accum1, &aa[2], &bbb[3]);
 88 | 
 89 |     mac(&accum2, &a[3], &b[6]);
 90 |     mac(&accum0, &a[7], &bb[2]);
 91 |     mac(&accum1, &aa[3], &bbb[2]);
 92 | 
 93 |     mac(&accum2, &a[0],&b[1]);
 94 |     mac(&accum1, &aa[0], &bb[1]);
 95 |     mac(&accum0, &a[4], &b[5]);
 96 | 
 97 |     mac(&accum2, &a[1], &b[0]);
 98 |     mac(&accum1, &aa[1], &bb[0]);
 99 |     mac(&accum0, &a[5], &b[4]);
100 | 
101 |     accum1 -= accum2;
102 |     accum0 += accum2;
103 | 
104 |     c[1] = ((uint64_t)(accum0)) & mask;
105 |     c[5] = ((uint64_t)(accum1)) & mask;
106 | 
107 |     accum0 >>= 56;
108 |     accum1 >>= 56;
109 |     /* Limbs 2 and 6: accum0 is stored as limb 2 and accum1 as limb 6. */
110 |     accum2  = widemul(&a[3],&b[7]);
111 |     mac(&accum0, &a[7], &bb[3]);
112 |     mac(&accum1, &aa[3], &bbb[3]);
113 | 
114 |     mac(&accum2, &a[0],&b[2]);
115 |     mac(&accum1, &aa[0], &bb[2]);
116 |     mac(&accum0, &a[4], &b[6]);
117 | 
118 |     mac(&accum2, &a[1], &b[1]);
119 |     mac(&accum1, &aa[1], &bb[1]);
120 |     mac(&accum0, &a[5], &b[5]);
121 | 
122 |     mac(&accum2, &a[2], &b[0]);
123 |     mac(&accum1, &aa[2], &bb[0]);
124 |     mac(&accum0, &a[6], &b[4]);
125 | 
126 |     accum1 -= accum2;
127 |     accum0 += accum2;
128 | 
129 |     c[2] = ((uint64_t)(accum0)) & mask;
130 |     c[6] = ((uint64_t)(accum1)) & mask;
131 | 
132 |     accum0 >>= 56;
133 |     accum1 >>= 56;
134 |     /* Add the carries out of limbs 2 and 6 to the provisional limbs 3 and 7 stored earlier. */
135 |     accum0 += c[3];
136 |     accum1 += c[7];
137 |     c[3] = ((uint64_t)(accum0)) & mask;
138 |     c[7] = ((uint64_t)(accum1)) & mask;
139 | 
140 |     /* we could almost stop here, but it wouldn't be stable, so... */
141 |     /* Wrap-around: the carry out of limb 3 goes to limb 4; the carry out of limb 7 goes to limbs 4 and 0 (limbs 0 and 4 are not re-masked). */
142 |     accum0 >>= 56;
143 |     accum1 >>= 56;
144 |     c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
145 |     c[0] += ((uint64_t)(accum1));
146 | }
147 | /* p448_mulw: multiply the field element *as by the 64-bit word b and store the result in *cs (radix 2^56 limbs). */
148 | void
149 | p448_mulw (
150 |     p448_t *__restrict__ cs, /* out: result limbs; __restrict__-qualified, so it must not alias as */
151 |     const p448_t *as, /* in: field element (limb[0..7] are read) */
152 |     uint64_t b /* in: word-sized multiplier */
153 | ) {
154 |     const uint64_t *a = as->limb;
155 |     uint64_t *c = cs->limb;
156 |     /* Two independent carry chains: accum0 walks limbs 0..3 and accum4 walks limbs 4..7. */
157 |     __uint128_t accum0, accum4;
158 |     uint64_t mask = (1ull<<56) - 1;  
159 |     /* NOTE(review): widemul_rm/mac_rm are not defined in this file; assumed to compute b*(*p) and acc += b*(*p) - confirm. */
160 |     accum0 = widemul_rm(b, &a[0]);
161 |     accum4 = widemul_rm(b, &a[4]);
162 |     /* Each step stores the low 56 bits as a limb and keeps the rest as carry for the next limb. */
163 |     c[0] = accum0 & mask; accum0 >>= 56;
164 |     c[4] = accum4 & mask; accum4 >>= 56;
165 | 
166 |     mac_rm(&accum0, b, &a[1]);
167 |     mac_rm(&accum4, b, &a[5]);
168 | 
169 |     c[1] = accum0 & mask; accum0 >>= 56;
170 |     c[5] = accum4 & mask; accum4 >>= 56;
171 | 
172 |     mac_rm(&accum0, b, &a[2]);
173 |     mac_rm(&accum4, b, &a[6]);
174 | 
175 |     c[2] = accum0 & mask; accum0 >>= 56;
176 |     c[6] = accum4 & mask; accum4 >>= 56;
177 | 
178 |     mac_rm(&accum0, b, &a[3]);
179 |     mac_rm(&accum4, b, &a[7]);
180 | 
181 |     c[3] = accum0 & mask; accum0 >>= 56;
182 |     c[7] = accum4 & mask; accum4 >>= 56;
183 |     /* The carry out of limb 3 and the carry out of limb 7 both land in limb 4; the overflow continues into limb 5. */
184 |     accum0 += accum4 + c[4];
185 |     c[4] = accum0 & mask;
186 |     c[5] += accum0 >> 56;
187 |     /* The carry out of limb 7 also wraps to limb 0; its overflow continues into limb 1. */
188 |     accum4 += c[0];
189 |     c[0] = accum4 & mask;
190 |     c[1] += accum4 >> 56;
191 | }
192 | /* p448_sqr: field squaring.  Stores the square of *as in *cs as eight radix-2^56 limbs. */
193 | void
194 | p448_sqr (
195 |     p448_t *__restrict__ cs, /* out: result limbs; __restrict__-qualified, so it must not alias as */
196 |     const p448_t *as /* in: value to square (limb[0..7] are read) */
197 | ) {
198 |     const uint64_t *a = as->limb;
199 |     uint64_t *c = cs->limb;
200 |     /* NOTE(review): widemul/mac/msb, their *2 variants and uint64xn_t are not defined in this file; mac2/msb2/widemul2 are presumably the doubled forms - confirm in the headers. */
201 |     __uint128_t accum0 = 0, accum1 = 0, accum2;
202 |     uint64_t mask = (1ull<<56) - 1;  
203 |     /* aa = lo(a) + hi(a), where lo = limbs 0..3 and hi = limbs 4..7. */
204 |     uint64_t aa[4] __attribute__((aligned(32)));
205 | 
206 |     /* For some reason clang doesn't vectorize this without prompting? */
207 |     unsigned int i;
208 |     for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
209 |       ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
210 |     }
211 |     /* Column 3 has only cross terms, so only half of each sum is accumulated here; it is doubled on output. */
212 |     accum2  = widemul(&a[0],&a[3]);
213 |     accum0  = widemul(&aa[0],&aa[3]);
214 |     accum1  = widemul(&a[4],&a[7]);
215 | 
216 |     mac(&accum2, &a[1], &a[2]);
217 |     mac(&accum0, &aa[1], &aa[2]);
218 |     mac(&accum1, &a[5], &a[6]);
219 | 
220 |     accum0 -= accum2;
221 |     accum1 += accum2;
222 |     /* Store twice the sum (<<1) as provisional limbs 3 and 7, and shift the carry by 55 = 56 - 1 bits to account for the doubling. */
223 |     c[3] = ((uint64_t)(accum1))<<1 & mask;
224 |     c[7] = ((uint64_t)(accum0))<<1 & mask;
225 | 
226 |     accum0 >>= 55;
227 |     accum1 >>= 55;
228 |     /* Limbs 0 and 4: accum0 is stored as limb 0 and accum1 as limb 4. */
229 |     mac2(&accum0, &aa[1],&aa[3]);
230 |     mac2(&accum1, &a[5], &a[7]);
231 |     mac(&accum0, &aa[2], &aa[2]);
232 |     accum1 += accum0;
233 | 
234 |     msb2(&accum0, &a[1], &a[3]);
235 |     mac(&accum1, &a[6], &a[6]);
236 |     
237 |     accum2 = widemul(&a[0],&a[0]);
238 |     accum1 -= accum2;
239 |     accum0 += accum2;
240 | 
241 |     msb(&accum0, &a[2], &a[2]);
242 |     mac(&accum1, &aa[0], &aa[0]);
243 |     mac(&accum0, &a[4], &a[4]);
244 | 
245 |     c[0] = ((uint64_t)(accum0)) & mask;
246 |     c[4] = ((uint64_t)(accum1)) & mask;
247 | 
248 |     accum0 >>= 56;
249 |     accum1 >>= 56;
250 |     /* Limbs 1 and 5: accum0 is stored as limb 1 and accum1 as limb 5. */
251 |     accum2  = widemul2(&aa[2],&aa[3]);
252 |     msb2(&accum0, &a[2], &a[3]);
253 |     mac2(&accum1, &a[6], &a[7]);
254 | 
255 |     accum1 += accum2;
256 |     accum0 += accum2;
257 | 
258 |     accum2  = widemul2(&a[0],&a[1]);
259 |     mac2(&accum1, &aa[0], &aa[1]);
260 |     mac2(&accum0, &a[4], &a[5]);
261 | 
262 |     accum1 -= accum2;
263 |     accum0 += accum2;
264 | 
265 |     c[1] = ((uint64_t)(accum0)) & mask;
266 |     c[5] = ((uint64_t)(accum1)) & mask;
267 | 
268 |     accum0 >>= 56;
269 |     accum1 >>= 56;
270 |     /* Limbs 2 and 6: accum0 is stored as limb 2 and accum1 as limb 6. */
271 |     accum2  = widemul(&aa[3],&aa[3]);
272 |     msb(&accum0, &a[3], &a[3]);
273 |     mac(&accum1, &a[7], &a[7]);
274 | 
275 |     accum1 += accum2;
276 |     accum0 += accum2;
277 | 
278 |     accum2  = widemul2(&a[0],&a[2]);
279 |     mac2(&accum1, &aa[0], &aa[2]);
280 |     mac2(&accum0, &a[4], &a[6]);
281 | 
282 |     mac(&accum2, &a[1], &a[1]);
283 |     mac(&accum1, &aa[1], &aa[1]);
284 |     mac(&accum0, &a[5], &a[5]);
285 | 
286 |     accum1 -= accum2;
287 |     accum0 += accum2;
288 | 
289 |     c[2] = ((uint64_t)(accum0)) & mask;
290 |     c[6] = ((uint64_t)(accum1)) & mask;
291 | 
292 |     accum0 >>= 56;
293 |     accum1 >>= 56;
294 |     /* Add the carries out of limbs 2 and 6 to the provisional limbs 3 and 7 stored earlier. */
295 |     accum0 += c[3];
296 |     accum1 += c[7];
297 |     c[3] = ((uint64_t)(accum0)) & mask;
298 |     c[7] = ((uint64_t)(accum1)) & mask;
299 | 
300 |     /* we could almost stop here, but it wouldn't be stable, so... */
301 |     /* Wrap-around: the carry out of limb 3 goes to limb 4; the carry out of limb 7 goes to limbs 4 and 0 (limbs 0 and 4 are not re-masked). */
302 |     accum0 >>= 56;
303 |     accum1 >>= 56;
304 |     c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
305 |     c[0] += ((uint64_t)(accum1));
306 | }
307 | /* p448_strong_reduce: reduce *a in place by subtracting p and conditionally adding it back; the only conditional is on the loop index. */
308 | void
309 | p448_strong_reduce (
310 |     p448_t *a /* in/out: all eight limbs are rewritten, each ending below 2^56 */
311 | ) {
312 |     uint64_t mask = (1ull<<56)-1;
313 | 
314 |     /* First fold the bits of limb 7 above bit 56 into limbs 4 and 0, then clear them from limb 7. */
315 |     a->limb[4] += a->limb[7]>>56;
316 |     a->limb[0] += a->limb[7]>>56;
317 |     a->limb[7] &= mask;
318 | 
319 |     /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */
320 | 
321 |     /* compute total_value - p.  No need to reduce mod p. */
322 |     /* The subtracted limbs of p are all equal to mask, except limb 4 which is mask-1; scarry is the signed running borrow. */
323 |     __int128_t scarry = 0;
324 |     int i;
325 |     for (i=0; i<8; i++) {
326 |         scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask);
327 |         a->limb[i] = scarry & mask;
328 |         scarry >>= 56;
329 |     }
330 | 
331 |     /* uncommon case: the value was >= p, so now scarry = 0 and the limbs hold value - p
332 |     * common case: the value was < p, so now scarry = -1 and the limbs hold value - p + 2^448
333 |     * so add p back in that case; the carry off the top then cancels the 2^448.
334 |     */
335 | 
336 |     assert(is_zero(scarry) | is_zero(scarry+1));
337 |     /* Given the assertion above, scarry_mask is 56 one-bits when a borrow occurred and zero otherwise. */
338 |     uint64_t scarry_mask = scarry & mask;
339 |     __uint128_t carry = 0;
340 | 
341 |     /* Add back p when scarry_mask is set, or 0 otherwise; limb 4 of p has its low bit clear. */
342 |     for (i=0; i<8; i++) {
343 |         carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask);
344 |         a->limb[i] = carry & mask;
345 |         carry >>= 56;
346 |     }
347 |     /* The final carry must cancel the borrow: carry + scarry == 0. */
348 |     assert(is_zero(carry + scarry));
349 | }
350 | /* p448_serialize: strongly reduce a copy of *x and write it to serial as 56 bytes, least-significant byte first. */
351 | void
352 | p448_serialize (
353 |     uint8_t *serial, /* out: receives 8*7 = 56 bytes */
354 |     const struct p448_t *x /* in: not written here; the work is done on the local red */
355 | ) {
356 |     int i,j;
357 |     p448_t red;
358 |     p448_copy(&red, x); /* p448_copy is defined elsewhere; presumably copies *x into red */
359 |     p448_strong_reduce(&red);
360 |     for (i=0; i<8; i++) { /* each limb contributes 7 bytes */
361 |         for (j=0; j<7; j++) {
362 |             serial[7*i+j] = red.limb[i]; /* the conversion to uint8_t keeps the low 8 bits */
363 |             red.limb[i] >>= 8;
364 |         }
365 |         assert(red.limb[i] == 0); /* nothing may remain above 56 bits in a reduced limb */
366 |     }
367 | }
368 | /* p448_deserialize: unpack 56 little-endian bytes into the eight 56-bit limbs of *x and return a mask telling whether the value is below p. */
369 | mask_t
370 | p448_deserialize (
371 |     p448_t *x, /* out: limbs are always written, whatever the outcome of the range check */
372 |     const uint8_t serial[56] /* in: 7 bytes per limb, low byte first */
373 | ) {
374 |     int i,j;
375 |     for (i=0; i<8; i++) {
376 |         word_t out = 0;
377 |         for (j=0; j<7; j++) {
378 |             out |= ((word_t)serial[7*i+j])<<(8*j);
379 |         }
380 |         x->limb[i] = out;
381 |     }
382 |     
383 |     /* Check for reduction.
384 |      *
385 |      * The idea is to create a variable ge which is all ones (rather, 56 ones)
386 |      * if and only if the low $i$ words of $x$ are >= those of p.
387 |      *
388 |      * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
389 |      */
390 |     word_t ge = -1, mask = (1ull<<56)-1;
391 |     for (i=0; i<4; i++) {
392 |         ge &= x->limb[i];
393 |     }
394 |     
395 |     /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
396 |     ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
397 |     
398 |     /* Propagate the rest */
399 |     for (i=5; i<8; i++) {
400 |         ge &= x->limb[i];
401 |     }
402 |     /* NOTE(review): is_zero is not defined in this file; assuming it yields all ones for 0 and 0 otherwise, the result is all ones when x < p and 0 when x >= p - confirm. */
403 |     return ~is_zero(ge ^ mask);
404 | }
405 | 
406 | 


--------------------------------------------------------------------------------
/src/p480/arch_x86_64/p480.c:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | 
  5 | #include "p480.h"
  6 | #include "x86-64-arith.h"
  7 | /* p480_mul: field multiplication.  Stores the product of *as and *bs in *cs as eight radix-2^60 limbs. */
  8 | void
  9 | p480_mul (
 10 |     p480_t *__restrict__ cs, /* out: product limbs; __restrict__-qualified, so it must not alias as or bs */
 11 |     const p480_t *as, /* in: first factor (limb[0..7] are read) */
 12 |     const p480_t *bs /* in: second factor (limb[0..7] are read) */
 13 | ) {
 14 |     const uint64_t *a = as->limb, *b = bs->limb;
 15 |     uint64_t *c = cs->limb;
 16 |     /* NOTE(review): widemul/mac/msb and uint64xn_t are not defined in this file; the comments below assume widemul(x,y) is the 128-bit product (*x)*(*y) and that mac adds / msb subtracts such a product - confirm in the headers. */
 17 |     __uint128_t accum0 = 0, accum1 = 0, accum2;
 18 |     uint64_t mask = (1ull<<60) - 1;  
 19 |     /* aa = lo(a)+hi(a), bb = lo(b)+hi(b), bbb = bb+hi(b), where lo = limbs 0..3 and hi = limbs 4..7. */
 20 |     uint64_t aa[4] __attribute__((aligned(32))), bb[4] __attribute__((aligned(32))), bbb[4] __attribute__((aligned(32)));
 21 | 
 22 |     /* For some reason clang doesn't vectorize this without prompting? */
 23 |     unsigned int i;
 24 |     for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
 25 |         ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
 26 |         ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i]; 
 27 |         ((uint64xn_t*)bbb)[i] = ((const uint64xn_t*)bb)[i] + ((const uint64xn_t*)(&b[4]))[i];     
 28 |     }
 29 |     /*
 30 |     Scalar form of the loop above, assuming uint64xn_t is a vector of uint64_t lanes:
 31 |     for (int i=0; i<4; i++) {
 32 |     aa[i] = a[i] + a[i+4];
 33 |     bb[i] = b[i] + b[i+4];  bbb[i] = bb[i] + b[i+4];  }
 34 |     */
 35 |     /* Column 3 of three half-size products: lo*lo in accum2, (lo+hi)*(lo+hi) in accum0, hi*hi in accum1. */
 36 |     accum2  = widemul(&a[0],&b[3]);
 37 |     accum0  = widemul(&aa[0],&bb[3]);
 38 |     accum1  = widemul(&a[4],&b[7]);
 39 | 
 40 |     mac(&accum2, &a[1], &b[2]);
 41 |     mac(&accum0, &aa[1], &bb[2]);
 42 |     mac(&accum1, &a[5], &b[6]);
 43 | 
 44 |     mac(&accum2, &a[2], &b[1]);
 45 |     mac(&accum0, &aa[2], &bb[1]);
 46 |     mac(&accum1, &a[6], &b[5]);
 47 | 
 48 |     mac(&accum2, &a[3], &b[0]);
 49 |     mac(&accum0, &aa[3], &bb[0]);
 50 |     mac(&accum1, &a[7], &b[4]);
 51 |     /* For this column: accum0 = (lo+hi)*(lo+hi) - lo*lo and accum1 = hi*hi + lo*lo. */
 52 |     accum0 -= accum2;
 53 |     accum1 += accum2;
 54 |     /* Provisional limbs 3 and 7 (low 60 bits); the carries out of limbs 2 and 6 are added near the end. */
 55 |     c[3] = ((uint64_t)(accum1)) & mask;
 56 |     c[7] = ((uint64_t)(accum0)) & mask;
 57 |     /* The bits above 60 carry into the next pair of columns. */
 58 |     accum0 >>= 60;
 59 |     accum1 >>= 60;
 60 |     /* Limbs 0 and 4: accum0 is stored as limb 0 and accum1 as limb 4. */
 61 |     mac(&accum0, &aa[1],&bb[3]);
 62 |     mac(&accum1, &a[5], &b[7]);
 63 |     mac(&accum0, &aa[2], &bb[2]);
 64 |     mac(&accum1, &a[6], &b[6]);
 65 |     mac(&accum0, &aa[3], &bb[1]);
 66 |     accum1 += accum0;
 67 | 
 68 |     accum2 = widemul(&a[0],&b[0]);
 69 |     accum1 -= accum2;
 70 |     accum0 += accum2;
 71 |     
 72 |     msb(&accum0, &a[1], &b[3]);
 73 |     msb(&accum0, &a[2], &b[2]);
 74 |     mac(&accum1, &a[7], &b[5]);
 75 |     msb(&accum0, &a[3], &b[1]);
 76 |     mac(&accum1, &aa[0], &bb[0]);
 77 |     mac(&accum0, &a[4], &b[4]);
 78 | 
 79 |     c[0] = ((uint64_t)(accum0)) & mask;
 80 |     c[4] = ((uint64_t)(accum1)) & mask;
 81 | 
 82 |     accum0 >>= 60;
 83 |     accum1 >>= 60;
 84 |     /* Limbs 1 and 5: accum0 is stored as limb 1 and accum1 as limb 5. */
 85 |     accum2  = widemul(&a[2],&b[7]);
 86 |     mac(&accum0, &a[6], &bb[3]);
 87 |     mac(&accum1, &aa[2], &bbb[3]);
 88 | 
 89 |     mac(&accum2, &a[3], &b[6]);
 90 |     mac(&accum0, &a[7], &bb[2]);
 91 |     mac(&accum1, &aa[3], &bbb[2]);
 92 | 
 93 |     mac(&accum2, &a[0],&b[1]);
 94 |     mac(&accum1, &aa[0], &bb[1]);
 95 |     mac(&accum0, &a[4], &b[5]);
 96 | 
 97 |     mac(&accum2, &a[1], &b[0]);
 98 |     mac(&accum1, &aa[1], &bb[0]);
 99 |     mac(&accum0, &a[5], &b[4]);
100 | 
101 |     accum1 -= accum2;
102 |     accum0 += accum2;
103 | 
104 |     c[1] = ((uint64_t)(accum0)) & mask;
105 |     c[5] = ((uint64_t)(accum1)) & mask;
106 | 
107 |     accum0 >>= 60;
108 |     accum1 >>= 60;
109 |     /* Limbs 2 and 6: accum0 is stored as limb 2 and accum1 as limb 6. */
110 |     accum2  = widemul(&a[3],&b[7]);
111 |     mac(&accum0, &a[7], &bb[3]);
112 |     mac(&accum1, &aa[3], &bbb[3]);
113 | 
114 |     mac(&accum2, &a[0],&b[2]);
115 |     mac(&accum1, &aa[0], &bb[2]);
116 |     mac(&accum0, &a[4], &b[6]);
117 | 
118 |     mac(&accum2, &a[1], &b[1]);
119 |     mac(&accum1, &aa[1], &bb[1]);
120 |     mac(&accum0, &a[5], &b[5]);
121 | 
122 |     mac(&accum2, &a[2], &b[0]);
123 |     mac(&accum1, &aa[2], &bb[0]);
124 |     mac(&accum0, &a[6], &b[4]);
125 | 
126 |     accum1 -= accum2;
127 |     accum0 += accum2;
128 | 
129 |     c[2] = ((uint64_t)(accum0)) & mask;
130 |     c[6] = ((uint64_t)(accum1)) & mask;
131 | 
132 |     accum0 >>= 60;
133 |     accum1 >>= 60;
134 |     /* Add the carries out of limbs 2 and 6 to the provisional limbs 3 and 7 stored earlier. */
135 |     accum0 += c[3];
136 |     accum1 += c[7];
137 |     c[3] = ((uint64_t)(accum0)) & mask;
138 |     c[7] = ((uint64_t)(accum1)) & mask;
139 | 
140 |     /* we could almost stop here, but it wouldn't be stable, so... */
141 |     /* Wrap-around: the carry out of limb 3 goes to limb 4; the carry out of limb 7 goes to limbs 4 and 0 (limbs 0 and 4 are not re-masked). */
142 |     accum0 >>= 60;
143 |     accum1 >>= 60;
144 |     c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
145 |     c[0] += ((uint64_t)(accum1));
146 | }
147 | /* p480_mulw: multiply the field element *as by the 64-bit word b and store the result in *cs (radix 2^60 limbs). */
148 | void
149 | p480_mulw (
150 |     p480_t *__restrict__ cs, /* out: result limbs; __restrict__-qualified, so it must not alias as */
151 |     const p480_t *as, /* in: field element (limb[0..7] are read) */
152 |     uint64_t b /* in: word-sized multiplier */
153 | ) {
154 |     const uint64_t *a = as->limb;
155 |     uint64_t *c = cs->limb;
156 |     /* Two independent carry chains: accum0 walks limbs 0..3 and accum4 walks limbs 4..7. */
157 |     __uint128_t accum0, accum4;
158 |     uint64_t mask = (1ull<<60) - 1;  
159 |     /* NOTE(review): widemul_rm/mac_rm are not defined in this file; assumed to compute b*(*p) and acc += b*(*p) - confirm. */
160 |     accum0 = widemul_rm(b, &a[0]);
161 |     accum4 = widemul_rm(b, &a[4]);
162 |     /* Each step stores the low 60 bits as a limb and keeps the rest as carry for the next limb. */
163 |     c[0] = accum0 & mask; accum0 >>= 60;
164 |     c[4] = accum4 & mask; accum4 >>= 60;
165 | 
166 |     mac_rm(&accum0, b, &a[1]);
167 |     mac_rm(&accum4, b, &a[5]);
168 | 
169 |     c[1] = accum0 & mask; accum0 >>= 60;
170 |     c[5] = accum4 & mask; accum4 >>= 60;
171 | 
172 |     mac_rm(&accum0, b, &a[2]);
173 |     mac_rm(&accum4, b, &a[6]);
174 | 
175 |     c[2] = accum0 & mask; accum0 >>= 60;
176 |     c[6] = accum4 & mask; accum4 >>= 60;
177 | 
178 |     mac_rm(&accum0, b, &a[3]);
179 |     mac_rm(&accum4, b, &a[7]);
180 | 
181 |     c[3] = accum0 & mask; accum0 >>= 60;
182 |     c[7] = accum4 & mask; accum4 >>= 60;
183 |     /* The carry out of limb 3 and the carry out of limb 7 both land in limb 4; the overflow continues into limb 5. */
184 |     accum0 += accum4 + c[4];
185 |     c[4] = accum0 & mask;
186 |     c[5] += accum0 >> 60;
187 |     /* The carry out of limb 7 also wraps to limb 0; its overflow continues into limb 1. */
188 |     accum4 += c[0];
189 |     c[0] = accum4 & mask;
190 |     c[1] += accum4 >> 60;
191 | }
192 | /* p480_sqr: field squaring.  Stores the square of *as in *cs as eight radix-2^60 limbs. */
193 | void
194 | p480_sqr (
195 |     p480_t *__restrict__ cs, /* out: result limbs; __restrict__-qualified, so it must not alias as */
196 |     const p480_t *as /* in: value to square (limb[0..7] are read) */
197 | ) {
198 |     const uint64_t *a = as->limb;
199 |     uint64_t *c = cs->limb;
200 |     /* NOTE(review): widemul/mac/msb, their *2 variants and uint64xn_t are not defined in this file; mac2/msb2/widemul2 are presumably the doubled forms - confirm in the headers. */
201 |     __uint128_t accum0 = 0, accum1 = 0, accum2;
202 |     uint64_t mask = (1ull<<60) - 1;  
203 |     /* aa = lo(a) + hi(a), where lo = limbs 0..3 and hi = limbs 4..7. */
204 |     uint64_t aa[4] __attribute__((aligned(32)));
205 | 
206 |     /* For some reason clang doesn't vectorize this without prompting? */
207 |     unsigned int i;
208 |     for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
209 |       ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
210 |     }
211 |     /* Column 3 has only cross terms, so only half of each sum is accumulated here; it is doubled on output. */
212 |     accum2  = widemul(&a[0],&a[3]);
213 |     accum0  = widemul(&aa[0],&aa[3]);
214 |     accum1  = widemul(&a[4],&a[7]);
215 | 
216 |     mac(&accum2, &a[1], &a[2]);
217 |     mac(&accum0, &aa[1], &aa[2]);
218 |     mac(&accum1, &a[5], &a[6]);
219 | 
220 |     accum0 -= accum2;
221 |     accum1 += accum2;
222 |     /* Store twice the sum (<<1) as provisional limbs 3 and 7, and shift the carry by 59 = 60 - 1 bits to account for the doubling. */
223 |     c[3] = ((uint64_t)(accum1))<<1 & mask;
224 |     c[7] = ((uint64_t)(accum0))<<1 & mask;
225 | 
226 |     accum0 >>= 59;
227 |     accum1 >>= 59;
228 |     /* Limbs 0 and 4: accum0 is stored as limb 0 and accum1 as limb 4. */
229 |     mac2(&accum0, &aa[1],&aa[3]);
230 |     mac2(&accum1, &a[5], &a[7]);
231 |     mac(&accum0, &aa[2], &aa[2]);
232 |     accum1 += accum0;
233 | 
234 |     msb2(&accum0, &a[1], &a[3]);
235 |     mac(&accum1, &a[6], &a[6]);
236 |     
237 |     accum2 = widemul(&a[0],&a[0]);
238 |     accum1 -= accum2;
239 |     accum0 += accum2;
240 | 
241 |     msb(&accum0, &a[2], &a[2]);
242 |     mac(&accum1, &aa[0], &aa[0]);
243 |     mac(&accum0, &a[4], &a[4]);
244 | 
245 |     c[0] = ((uint64_t)(accum0)) & mask;
246 |     c[4] = ((uint64_t)(accum1)) & mask;
247 | 
248 |     accum0 >>= 60;
249 |     accum1 >>= 60;
250 |     /* Limbs 1 and 5: accum0 is stored as limb 1 and accum1 as limb 5. */
251 |     accum2  = widemul2(&aa[2],&aa[3]);
252 |     msb2(&accum0, &a[2], &a[3]);
253 |     mac2(&accum1, &a[6], &a[7]);
254 | 
255 |     accum1 += accum2;
256 |     accum0 += accum2;
257 | 
258 |     accum2  = widemul2(&a[0],&a[1]);
259 |     mac2(&accum1, &aa[0], &aa[1]);
260 |     mac2(&accum0, &a[4], &a[5]);
261 | 
262 |     accum1 -= accum2;
263 |     accum0 += accum2;
264 | 
265 |     c[1] = ((uint64_t)(accum0)) & mask;
266 |     c[5] = ((uint64_t)(accum1)) & mask;
267 | 
268 |     accum0 >>= 60;
269 |     accum1 >>= 60;
270 |     /* Limbs 2 and 6: accum0 is stored as limb 2 and accum1 as limb 6. */
271 |     accum2  = widemul(&aa[3],&aa[3]);
272 |     msb(&accum0, &a[3], &a[3]);
273 |     mac(&accum1, &a[7], &a[7]);
274 | 
275 |     accum1 += accum2;
276 |     accum0 += accum2;
277 | 
278 |     accum2  = widemul2(&a[0],&a[2]);
279 |     mac2(&accum1, &aa[0], &aa[2]);
280 |     mac2(&accum0, &a[4], &a[6]);
281 | 
282 |     mac(&accum2, &a[1], &a[1]);
283 |     mac(&accum1, &aa[1], &aa[1]);
284 |     mac(&accum0, &a[5], &a[5]);
285 | 
286 |     accum1 -= accum2;
287 |     accum0 += accum2;
288 | 
289 |     c[2] = ((uint64_t)(accum0)) & mask;
290 |     c[6] = ((uint64_t)(accum1)) & mask;
291 | 
292 |     accum0 >>= 60;
293 |     accum1 >>= 60;
294 |     /* Add the carries out of limbs 2 and 6 to the provisional limbs 3 and 7 stored earlier. */
295 |     accum0 += c[3];
296 |     accum1 += c[7];
297 |     c[3] = ((uint64_t)(accum0)) & mask;
298 |     c[7] = ((uint64_t)(accum1)) & mask;
299 | 
300 |     /* we could almost stop here, but it wouldn't be stable, so... */
301 |     /* Wrap-around: the carry out of limb 3 goes to limb 4; the carry out of limb 7 goes to limbs 4 and 0 (limbs 0 and 4 are not re-masked). */
302 |     accum0 >>= 60;
303 |     accum1 >>= 60;
304 |     c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
305 |     c[0] += ((uint64_t)(accum1));
306 | }
307 | /* p480_strong_reduce: reduce *a in place by subtracting p and conditionally adding it back; the only conditional is on the loop index. */
308 | void
309 | p480_strong_reduce (
310 |     p480_t *a /* in/out: all eight limbs are rewritten, each ending below 2^60 */
311 | ) {
312 |     uint64_t mask = (1ull<<60)-1;
313 | 
314 |     /* First fold the bits of limb 7 above bit 60 into limbs 4 and 0, then clear them from limb 7. */
315 |     a->limb[4] += a->limb[7]>>60;
316 |     a->limb[0] += a->limb[7]>>60;
317 |     a->limb[7] &= mask;
318 | 
319 |     /* now the total is less than 2^480 - 2^(480-60) + 2^(480-60+8) < 2p */
320 | 
321 |     /* compute total_value - p.  No need to reduce mod p. */
322 |     /* The subtracted limbs of p are all equal to mask, except limb 4 which is mask-1; scarry is the signed running borrow. */
323 |     __int128_t scarry = 0;
324 |     int i;
325 |     for (i=0; i<8; i++) {
326 |         scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask);
327 |         a->limb[i] = scarry & mask;
328 |         scarry >>= 60;
329 |     }
330 | 
331 |     /* uncommon case: the value was >= p, so now scarry = 0 and the limbs hold value - p
332 |     * common case: the value was < p, so now scarry = -1 and the limbs hold value - p + 2^480
333 |     * so add p back in that case; the carry off the top then cancels the 2^480.
334 |     */
335 | 
336 |     assert(is_zero(scarry) | is_zero(scarry+1));
337 |     /* Given the assertion above, scarry_mask is 60 one-bits when a borrow occurred and zero otherwise. */
338 |     uint64_t scarry_mask = scarry & mask;
339 |     __uint128_t carry = 0;
340 | 
341 |     /* Add back p when scarry_mask is set, or 0 otherwise; limb 4 of p has its low bit clear. */
342 |     for (i=0; i<8; i++) {
343 |         carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask);
344 |         a->limb[i] = carry & mask;
345 |         carry >>= 60;
346 |     }
347 |     /* The final carry must cancel the borrow: carry + scarry == 0. */
348 |     assert(is_zero(carry + scarry));
349 | }
350 | /* p480_serialize: strongly reduce a copy of *x and write it to serial as 60 bytes, least-significant byte first. */
351 | void
352 | p480_serialize (
353 |     uint8_t *serial, /* out: receives 4*15 = 60 bytes */
354 |     const struct p480_t *x /* in: not written here; the work is done on the local red */
355 | ) {
356 |     int i,j,k=0; /* k is the running output byte index */
357 |     p480_t red;
358 |     p480_copy(&red, x); /* p480_copy is defined elsewhere; presumably copies *x into red */
359 |     p480_strong_reduce(&red);
360 |     word_t r = 0;
361 |     for (i=0; i<8; i+=2) { /* two 60-bit limbs = 120 bits = 15 bytes per iteration */
362 |         r = red.limb[i];
363 |         for (j=0; j<7; j++) { /* emit the low 56 bits of the even limb */
364 |             serial[k++] = r;
365 |             r >>= 8;
366 |         }
367 |         assert(r<16); /* only the top 4 bits of the even limb may remain */
368 |         r += red.limb[i+1]<<4; /* place the odd limb above those 4 bits */
369 |         for (j=0; j<8; j++) { /* emit the remaining 4 + 60 = 64 bits */
370 |             serial[k++] = r;
371 |             r >>= 8;
372 |         }
373 |         assert(r==0);
374 |     }
375 | }
376 | /* p480_deserialize: unpack 60 little-endian bytes into the eight 60-bit limbs of *x and return a mask telling whether the value is below p. */
377 | mask_t
378 | p480_deserialize (
379 |     p480_t *x, /* out: limbs are always written, whatever the outcome of the range check */
380 |     const uint8_t serial[60] /* in: 15 bytes per pair of limbs, low byte first */
381 | ) {
382 |     int i,j,k=0; /* k is the running input byte index */
383 | 
384 |     for (i=0; i<8; i+=2) {
385 |         word_t r = 0;
386 |         for (j=0; j<8; j++) { /* read 8 bytes: 60 bits for the even limb plus 4 bits of the odd limb */
387 |             r |= ((word_t)serial[k++])<<(8*j);
388 |         }
389 |         x->limb[i] = r & ((1ull<<60)-1);
390 |         r >>= 60; /* keep the 4 leftover bits as the bottom of the odd limb */
391 |         for (j=0; j<7; j++) { /* 7 more bytes supply the upper 56 bits of the odd limb */
392 |             r |= ((word_t)serial[k++])<<(8*j+4);
393 |         }
394 |         x->limb[i+1] = r;
395 |     }
396 |     
397 |     /* Check for reduction.
398 |      *
399 |      * The idea is to create a variable ge which is all ones (rather, 60 ones)
400 |      * if and only if the low $i$ words of $x$ are >= those of p.
401 |      *
402 |      * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
403 |      */
404 |     word_t ge = -1, mask = (1ull<<60)-1;
405 |     for (i=0; i<4; i++) {
406 |         ge &= x->limb[i];
407 |     }
408 |     
409 |     /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
410 |     ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
411 |     
412 |     /* Propagate the rest */
413 |     for (i=5; i<8; i++) {
414 |         ge &= x->limb[i];
415 |     }
416 |     /* NOTE(review): is_zero is not defined in this file; assuming it yields all ones for 0 and 0 otherwise, the result is all ones when x < p and 0 when x >= p - confirm. */
417 |     return ~is_zero(ge ^ mask);
418 | }
419 | 
420 | 


--------------------------------------------------------------------------------
/src/p521/arch_ref64/p521.c:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | 
  5 | #include "p521.h"
  6 | /* widemul: full 64x64 -> 128-bit unsigned product of a and b. */
  7 | static __inline__ __uint128_t widemul(
  8 |     const uint64_t a,
  9 |     const uint64_t b
 10 | ) {
 11 |     return ((__uint128_t)a) * ((__uint128_t)b); /* both operands are widened first, so no high bits are lost */
 12 | }
 13 | /* is_zero: returns all ones (2^64 - 1) when a == 0 and 0 otherwise, written as a single branch-free expression. */
 14 | static __inline__ uint64_t is_zero(uint64_t a) {
 15 |     /* let's hope the compiler isn't clever enough to optimize this. */
 16 |     return (((__uint128_t)a)-1)>>64; /* a-1 wraps to 2^128-1 only for a == 0, so the upper 64 bits are all ones exactly then */
 17 | }
 18 | /* p521_mul: field multiplication.  Stores the product of *as and *bs in *cs; limbs 0..7 are masked to 58 bits, limb 8 to 57 bits before it receives the last carry. */
 19 | void
 20 | p521_mul (
 21 |     p521_t *__restrict__ cs, /* out: product limbs; __restrict__-qualified, so it must not alias as or bs */
 22 |     const p521_t *as, /* in: first factor (limb[0..8] are read) */
 23 |     const p521_t *bs /* in: second factor (limb[0..8] are read) */
 24 | ) {
 25 |     uint64_t *c = cs->limb;
 26 |     const uint64_t *a = as->limb, *b = bs->limb;
 27 |     __uint128_t accum0, accum1;
 28 |     /* Limb 7 (provisional): products with i+j == 7, plus 2*a[8]*b[8] wrapping around from the top. */
 29 |     accum0  = widemul(2*a[8], b[8]);
 30 |     accum1  = widemul(a[0], b[7]);
 31 |     accum0 += widemul(a[1], b[6]);
 32 |     accum1 += widemul(a[2], b[5]);
 33 |     accum0 += widemul(a[3], b[4]);
 34 |     accum1 += widemul(a[4], b[3]);
 35 |     accum0 += widemul(a[5], b[2]);
 36 |     accum1 += widemul(a[6], b[1]);
 37 |     accum0 += widemul(a[7], b[0]);
 38 |     accum1 += accum0;
 39 |     c[7] = accum1 & ((1ull<<58)-1);
 40 |     accum1 >>= 58;
 41 |     /* Limb 8: products with i+j == 8; only 57 bits are kept, the rest carries on. */
 42 |     accum0 = 0;
 43 |     accum1 += widemul(a[0], b[8-0]);
 44 |     accum0 += widemul(a[1], b[8-1]);
 45 |     accum1 += widemul(a[2], b[8-2]);
 46 |     accum0 += widemul(a[3], b[8-3]);
 47 |     accum1 += widemul(a[4], b[8-4]);
 48 |     accum0 += widemul(a[5], b[8-5]);
 49 |     accum1 += widemul(a[6], b[8-6]);
 50 |     accum0 += widemul(a[7], b[8-7]);
 51 |     accum1 += widemul(a[8], b[8-8]);
 52 |     accum1 += accum0;
 53 |     c[8] = accum1 & ((1ull<<57)-1);
 54 |     accum1 >>= 57;
 55 |     /* Limbs k = 0..6: accum1 takes the products with i+j == k, accum0 those with i+j == k+9, which are counted twice (<< 1). */
 56 |     accum0 = 0;
 57 |     accum0 += widemul(a[1], b[0+9-1]);
 58 |     accum0 += widemul(a[2], b[0+9-2]);
 59 |     accum0 += widemul(a[3], b[0+9-3]);
 60 |     accum0 += widemul(a[4], b[0+9-4]);
 61 |     accum1 += widemul(a[0], b[0-0]);
 62 |     accum0 += widemul(a[5], b[0+9-5]);
 63 |     accum0 += widemul(a[6], b[0+9-6]);
 64 |     accum0 += widemul(a[7], b[0+9-7]);
 65 |     accum0 += widemul(a[8], b[0+9-8]);
 66 |     accum1 += accum0 << 1;
 67 |     c[0] = accum1 & ((1ull<<58)-1);
 68 |     accum1 >>= 58;
 69 | 
 70 |     accum0 = 0;
 71 |     accum0 += widemul(a[2], b[1+9-2]);
 72 |     accum0 += widemul(a[3], b[1+9-3]);
 73 |     accum1 += widemul(a[0], b[1-0]);
 74 |     accum0 += widemul(a[4], b[1+9-4]);
 75 |     accum0 += widemul(a[5], b[1+9-5]);
 76 |     accum1 += widemul(a[1], b[1-1]);
 77 |     accum0 += widemul(a[6], b[1+9-6]);
 78 |     accum0 += widemul(a[7], b[1+9-7]);
 79 |     accum0 += widemul(a[8], b[1+9-8]);
 80 |     accum1 += accum0 << 1;
 81 |     c[1] = accum1 & ((1ull<<58)-1);
 82 |     accum1 >>= 58;
 83 | 
 84 |     accum0 = 0;
 85 |     accum0 += widemul(a[3], b[2+9-3]);
 86 |     accum1 += widemul(a[0], b[2-0]);
 87 |     accum0 += widemul(a[4], b[2+9-4]);
 88 |     accum0 += widemul(a[5], b[2+9-5]);
 89 |     accum1 += widemul(a[1], b[2-1]);
 90 |     accum0 += widemul(a[6], b[2+9-6]);
 91 |     accum0 += widemul(a[7], b[2+9-7]);
 92 |     accum1 += widemul(a[2], b[2-2]);
 93 |     accum0 += widemul(a[8], b[2+9-8]);
 94 |     accum1 += accum0 << 1;
 95 |     c[2] = accum1 & ((1ull<<58)-1);
 96 |     accum1 >>= 58;
 97 | 
 98 |     accum0 = 0;
 99 |     accum0 += widemul(a[4], b[3+9-4]);
100 |     accum1 += widemul(a[0], b[3-0]);
101 |     accum0 += widemul(a[5], b[3+9-5]);
102 |     accum1 += widemul(a[1], b[3-1]);
103 |     accum0 += widemul(a[6], b[3+9-6]);
104 |     accum1 += widemul(a[2], b[3-2]);
105 |     accum0 += widemul(a[7], b[3+9-7]);
106 |     accum1 += widemul(a[3], b[3-3]);
107 |     accum0 += widemul(a[8], b[3+9-8]);
108 |     accum1 += accum0 << 1;
109 |     c[3] = accum1 & ((1ull<<58)-1);
110 |     accum1 >>= 58;
111 | 
112 |     accum0 = 0;
113 |     accum1 += widemul(a[0], b[4-0]);
114 |     accum0 += widemul(a[5], b[4+9-5]);
115 |     accum1 += widemul(a[1], b[4-1]);
116 |     accum0 += widemul(a[6], b[4+9-6]);
117 |     accum1 += widemul(a[2], b[4-2]);
118 |     accum0 += widemul(a[7], b[4+9-7]);
119 |     accum1 += widemul(a[3], b[4-3]);
120 |     accum0 += widemul(a[8], b[4+9-8]);
121 |     accum1 += widemul(a[4], b[4-4]);
122 |     accum1 += accum0 << 1;
123 |     c[4] = accum1 & ((1ull<<58)-1);
124 |     accum1 >>= 58;
125 | 
126 |     accum0 = 0;
127 |     accum1 += widemul(a[0], b[5-0]);
128 |     accum0 += widemul(a[6], b[5+9-6]);
129 |     accum1 += widemul(a[1], b[5-1]);
130 |     accum1 += widemul(a[2], b[5-2]);
131 |     accum0 += widemul(a[7], b[5+9-7]);
132 |     accum1 += widemul(a[3], b[5-3]);
133 |     accum1 += widemul(a[4], b[5-4]);
134 |     accum0 += widemul(a[8], b[5+9-8]);
135 |     accum1 += widemul(a[5], b[5-5]);
136 |     accum1 += accum0 << 1;
137 |     c[5] = accum1 & ((1ull<<58)-1);
138 |     accum1 >>= 58;
139 | 
140 |     accum0 = 0;
141 |     accum1 += widemul(a[0], b[6-0]);
142 |     accum1 += widemul(a[1], b[6-1]);
143 |     accum0 += widemul(a[7], b[6+9-7]);
144 |     accum1 += widemul(a[2], b[6-2]);
145 |     accum1 += widemul(a[3], b[6-3]);
146 |     accum1 += widemul(a[4], b[6-4]);
147 |     accum0 += widemul(a[8], b[6+9-8]);
148 |     accum1 += widemul(a[5], b[6-5]);
149 |     accum1 += widemul(a[6], b[6-6]);
150 |     accum1 += accum0 << 1;
151 |     c[6] = accum1 & ((1ull<<58)-1);
152 |     accum1 >>= 58;
153 |     /* Add the carry out of limb 6 to the provisional limb 7 stored at the start. */
154 |     accum1 += c[7];
155 |     c[7] = accum1 & ((1ull<<58)-1);
156 |     /* The last carry is added to limb 8, which is not re-masked afterwards. */
157 |     c[8] += accum1 >> 58;
158 | }
159 | /* p521_mulw: multiply the field element *as by the 64-bit word b and store the result in *cs. */
160 | void
161 | p521_mulw (
162 |     p521_t *__restrict__ cs, /* out: result limbs; __restrict__-qualified, so it must not alias as */
163 |     const p521_t *as, /* in: field element (limb[0..8] are read) */
164 |     uint64_t b /* in: word-sized multiplier */
165 | ) {
166 |     const uint64_t *a = as->limb;
167 |     uint64_t *c = cs->limb;
168 |     /* Three independent carry chains: accum0 over limbs 0..2, accum3 over limbs 3..5, accum6 over limbs 6..8. */
169 |     __uint128_t accum0 = 0, accum3 = 0, accum6 = 0;
170 |     uint64_t mask = (1ull<<58) - 1;  
171 | 
172 |     int i;
173 |     for (i=0; i<3; i++) {
174 |         accum0 += widemul(b, a[i]);
175 |         accum3 += widemul(b, a[i+3]);
176 |         accum6 += widemul(b, a[i+6]);
177 |         c[i]   = accum0 & mask; accum0 >>= 58;
178 |         c[i+3] = accum3 & mask; accum3 >>= 58;
179 |         if (i==2) { /* limb 8 is the top limb and keeps only 57 bits */
180 |             c[i+6] = accum6 & (mask>>1); accum6 >>= 57;
181 |         } else {
182 |             c[i+6] = accum6 & mask; accum6 >>= 58;
183 |         }
184 |     }
185 |     /* Carry out of limb 2 goes into limb 3, overflowing into limb 4. */
186 |     accum0 += c[3];
187 |     c[3] = accum0 & mask;
188 |     c[4] += accum0 >> 58;
189 |     /* Carry out of limb 5 goes into limb 6, overflowing into limb 7. */
190 |     accum3 += c[6];
191 |     c[6] = accum3 & mask;
192 |     c[7] += accum3 >> 58;
193 |     /* Carry out of limb 8 wraps around into limb 0, overflowing into limb 1. */
194 |     accum6 += c[0];
195 |     c[0] = accum6 & mask;
196 |     c[1] += accum6 >> 58;
197 | }
198 | /* p521_sqr: field squaring.  Stores the square of *as in *cs; limbs 0..7 are masked to 58 bits, limb 8 to 57 bits before it receives the last carry. */
199 | void
200 | p521_sqr (
201 |     p521_t *__restrict__ cs, /* out: result limbs; __restrict__-qualified, so it must not alias as */
202 |     const p521_t *as /* in: value to square (limb[0..8] are read) */
203 | ) {
204 |     uint64_t *c = cs->limb;
205 |     const uint64_t *a = as->limb;
206 |     __uint128_t accum0, accum1;
207 |     /* Limb 7 (provisional): every term counts twice (cross terms with i+j == 7, and the wrapped a[8]^2), so sum once and double on output. */
208 |     accum0  = widemul(a[8], a[8]);
209 |     accum1  = widemul(a[0], a[7]);
210 |     accum0 += widemul(a[1], a[6]);
211 |     accum1 += widemul(a[2], a[5]);
212 |     accum0 += widemul(a[3], a[4]);
213 |     accum1 += accum0;
214 |     c[7] = 2 * (accum1 & ((1ull<<57)-1));
215 |     accum1 >>= 57;
216 |     /* Limb 8: a[4]^2 plus the doubled cross terms with i+j == 8; only 57 bits are kept. */
217 |     accum0 = 0;
218 |     /* accum0 gathers the cross terms that are doubled together below. */
219 |     accum1 += widemul(a[4], a[4]);
220 |     accum0 += widemul(a[1], a[7]);
221 |     accum1 += widemul(2*a[2], a[6]);
222 |     accum0 += widemul(a[3], a[5]);
223 |     accum1 += widemul(2*a[0], a[8]);
224 |     accum1 += 2*accum0;
225 |     c[8] = accum1 & ((1ull<<57)-1);
226 |     accum1 >>= 57;
227 |     /* Limb 0: a[0]^2 plus the wrapped cross terms (i+j == 9), doubled once as cross terms and once for the wrap, hence << 2. */
228 |     accum0 = 0;
229 |     accum1 += widemul(a[0], a[0]);
230 |     accum0 += widemul(a[1], a[8]);
231 |     accum0 += widemul(a[2], a[7]);
232 |     accum0 += widemul(a[3], a[6]);
233 |     accum0 += widemul(a[4], a[5]);
234 |     accum1 += accum0 << 2;
235 |     c[0] = accum1 & ((1ull<<58)-1);
236 |     accum1 >>= 58;
237 |     /* Limbs 1..6: wrapped cross terms get an extra doubling; wrapped squares and unwrapped cross terms share the final << 1; unwrapped squares go straight into accum1. */
238 |     accum0 = 0;
239 |     accum0 += widemul(a[2], a[8]);
240 |     accum0 += widemul(a[3], a[7]);
241 |     accum0 += widemul(a[4], a[6]);
242 |     accum0 <<= 1;
243 |     accum0 += widemul(a[5], a[5]);
244 |     accum0 += widemul(a[0], a[1]);
245 |     accum1 += accum0 << 1;
246 |     c[1] = accum1 & ((1ull<<58)-1);
247 |     accum1 >>= 58;
248 | 
249 |     accum0 = 0;
250 |     accum1 += widemul(a[1], a[1]);
251 | 
252 |     accum0 += widemul(a[3], a[8]);
253 |     accum0 += widemul(a[4], a[7]);
254 |     accum0 += widemul(a[5], a[6]);
255 |     accum0 <<= 1;
256 |     accum0 += widemul(a[0], a[2]);
257 |     accum1 += accum0 << 1;
258 |     c[2] = accum1 & ((1ull<<58)-1);
259 |     accum1 >>= 58;
260 | 
261 |     accum0 = 0;
262 |     accum0 += widemul(a[6], a[6]);
263 |     accum0 += widemul(2*a[5], a[7]);
264 |     accum0 += widemul(2*a[4], a[8]);
265 |     accum0 += widemul(a[0], a[3]);
266 |     accum0 += widemul(a[1], a[2]);
267 |     accum1 += accum0 << 1;
268 |     c[3] = accum1 & ((1ull<<58)-1);
269 |     accum1 >>= 58;
270 | 
271 |     accum0 = 0;
272 |     accum0 += widemul(a[6], a[7]);
273 |     accum0 += widemul(a[5], a[8]);
274 |     accum0 <<= 1;
275 |     accum1 += widemul(a[2], a[2]);
276 |     accum0 += widemul(a[0], a[4]);
277 |     accum0 += widemul(a[1], a[3]);
278 |     accum1 += accum0 << 1;
279 |     c[4] = accum1 & ((1ull<<58)-1);
280 |     accum1 >>= 58;
281 | 
282 |     accum0 = 0;
283 |     accum0 += widemul(2*a[6], a[8]);
284 |     accum0 += widemul(a[7], a[7]);
285 |     accum0 += widemul(a[0], a[5]);
286 |     accum0 += widemul(a[1], a[4]);
287 |     accum0 += widemul(a[2], a[3]);
288 |     accum1 += accum0 << 1;
289 |     c[5] = accum1 & ((1ull<<58)-1);
290 |     accum1 >>= 58;
291 | 
292 |     accum0 = 0;
293 |     accum1 += widemul(a[3], a[3]);
294 |     accum0 += widemul(a[0], a[6]);
295 |     accum0 += widemul(a[1], a[5]);
296 |     accum0 += widemul(2*a[7], a[8]);
297 |     accum0 += widemul(a[2], a[4]);
298 |     accum1 += accum0 << 1;
299 |     c[6] = accum1 & ((1ull<<58)-1);
300 |     accum1 >>= 58;
301 |     /* Add the carry out of limb 6 to the provisional limb 7 stored at the start. */
302 |     accum1 += c[7];
303 |     c[7] = accum1 & ((1ull<<58)-1);
304 |     /* The last carry is added to limb 8, which is not re-masked afterwards. */
305 |     c[8] += accum1 >> 58;
306 | }
307 | 
308 | void
309 | p521_strong_reduce (
310 |     p521_t *a
311 | ) {
312 |     uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1;
313 | 
314 |     /* first, clear high */
315 |     __int128_t scarry = a->limb[8]>>57;
316 |     a->limb[8] &= mask2;
317 | 
318 |     /* now the total is less than 2p */
319 | 
320 |     /* compute total_value - p.  No need to reduce mod p. */
321 | 
322 |     int i;
323 |     for (i=0; i<9; i++) {
324 |         scarry = scarry + a->limb[i] - ((i==8) ? mask2 : mask);
325 |         a->limb[i] = scarry & ((i==8) ? mask2 : mask);
326 |         scarry >>= (i==8) ? 57 : 58;
327 |     }
328 | 
329 |     /* uncommon case: it was >= p, so now scarry = 0 and this = x
330 |     * common case: it was < p, so now scarry = -1 and this = x - p + 2^521
331 |     * so let's add back in p.  will carry back off the top for 2^521.
332 |     */
333 | 
334 |     assert(is_zero(scarry) | is_zero(scarry+1));
335 | 
336 |     uint64_t scarry_mask = scarry & mask;
337 |     __uint128_t carry = 0;
338 | 
339 |     /* add it back */
340 |     for (i=0; i<9; i++) {
341 |         carry = carry + a->limb[i] + ((i==8)?(scarry_mask>>1):scarry_mask);
342 |         a->limb[i] = carry & ((i==8) ? mask>>1 : mask);
343 |         carry >>= (i==8) ? 57 : 58;
344 |     }
345 | 
346 |     assert(is_zero(carry + scarry));
347 | }
348 | 
349 | void
350 | p521_serialize (
351 |     uint8_t *serial,
352 |     const struct p521_t *x
353 | ) {
354 |     int i,k=0;
355 |     p521_t red;
356 |     p521_copy(&red, x);
357 |     p521_strong_reduce(&red);
358 |     
359 |     uint64_t r=0;
360 |     int bits = 0;
361 |     for (i=0; i<9; i++) {
362 |         r |= red.limb[i] << bits;
363 |         for (bits += 58; bits >= 8; bits -= 8) {
364 |             serial[k++] = r;
365 |             r >>= 8;
366 |         }
367 |         assert(bits <= 6);
368 |     }
369 |     assert(bits);
370 |     serial[k++] = r;
371 | }
372 | 
373 | mask_t
374 | p521_deserialize (
375 |     p521_t *x,
376 |     const uint8_t serial[66]
377 | ) {
378 |     int i,k=0,bits=0;
379 |     __uint128_t out = 0;
380 |     uint64_t mask = (1ull<<58)-1;
381 |     for (i=0; i<9; i++) {
382 |         out >>= 58;
383 |         for (; bits<58; bits+=8) {
384 |             out |= ((__uint128_t)serial[k++])<<bits;
385 |         }
386 |         x->limb[i] = out & mask;
387 |         bits -= 58;
388 |     }
389 |     
390 |     /* Check for reduction.  First, high has to be < 2^57 */
391 |     mask_t good = is_zero(out>>57);
392 |     
393 |     uint64_t and = -1ull;
394 |     for (i=0; i<8; i++) {
395 |         and &= x->limb[i];
396 |     }
397 |     and &= (2*out+1);
398 |     good &= is_zero((and+1)>>58);
399 |     
400 |     return good;
401 | }
402 | 


--------------------------------------------------------------------------------
/src/p448/arch_ref64/p448.c:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2014 Cryptography Research, Inc.
  2 |  * Released under the MIT License.  See LICENSE.txt for license information.
  3 |  */
  4 | 
  5 | #include "p448.h"
  6 | 
  7 | static __inline__ __uint128_t widemul(
  8 |     const uint64_t a,
  9 |     const uint64_t b
 10 | ) {
 11 |     return ((__uint128_t)a) * ((__uint128_t)b);
 12 | }
 13 | 
 14 | static __inline__ uint64_t is_zero(uint64_t a) {
 15 |     /* let's hope the compiler isn't clever enough to optimize this. */
 16 |     return (((__uint128_t)a)-1)>>64;
 17 | }
 18 | 
/**
 * Multiply two field elements: cs = as * bs, partially reduced.
 *
 * Operands are eight limbs in radix 2^56.  Each is split into a low half
 * (limbs 0..3) and a high half (limbs 4..7), and three families of
 * products are accumulated per output column:
 *   accum2: low  * low
 *   accum1: (low+high) * (low+high), using the precomputed sums aa/bb
 *   accum0: high * high
 * After each column, accum1 -= accum2 and accum0 += accum2 combine them
 * into the upper-half limb c[i+4] and the lower-half limb c[i].
 * NOTE(review): this looks like a Karatsuba-style split adapted to a
 * modulus of the form 2^448 - 2^224 - 1 (the limb pattern subtracted in
 * p448_strong_reduce) -- confirm against the field definition.
 *
 * The accumulators are unsigned 128-bit, so the subtraction relies on
 * modular wrap-around cancelling once all terms of a column are summed.
 *
 * @param cs  Output; must not alias the inputs (it is __restrict__).
 *            Limbs are masked to 56 bits except c[1] and c[5], which
 *            absorb the final carries unmasked.
 * @param as  First operand.
 * @param bs  Second operand.
 */
void
p448_mul (
    p448_t *__restrict__ cs,
    const p448_t *as,
    const p448_t *bs
) {
    const uint64_t *a = as->limb, *b = bs->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<56) - 1;  

    /* aa = a_lo + a_hi, bb = b_lo + b_hi, bbb = b_lo + 2*b_hi.
     * bb and bbb are used below for products whose column wraps around. */
    uint64_t aa[4], bb[4], bbb[4];

    unsigned int i;
    for (i=0; i<4; i++) {
        aa[i]  = a[i] + a[i+4];
        bb[i]  = b[i] + b[i+4];
        bbb[i] = bb[i] + b[i+4];
    }

    /* Always 0 here, so the rolled variant below is reference code only. */
    int I_HATE_UNROLLED_LOOPS = 0;

    if (I_HATE_UNROLLED_LOOPS) {
        /* The compiler probably won't unroll this,
         * so it's like 80% slower.
         */
        for (i=0; i<4; i++) {
            accum2 = 0;

            unsigned int j;
            /* Terms with j <= i land directly in column i. */
            for (j=0; j<=i; j++) {
                accum2 += widemul(a[j],   b[i-j]);
                accum1 += widemul(aa[j], bb[i-j]);
                accum0 += widemul(a[j+4], b[i-j+4]);
            }
            /* Terms with j > i wrap around; i-j underflows as unsigned,
             * and the +8 / +4 offsets bring the index back into range. */
            for (; j<4; j++) {
                accum2 += widemul(a[j],   b[i-j+8]);
                accum1 += widemul(aa[j], bbb[i-j+4]);
                accum0 += widemul(a[j+4], bb[i-j+4]);
            }

            accum1 -= accum2;
            accum0 += accum2;

            c[i]   = ((uint64_t)(accum0)) & mask;
            c[i+4] = ((uint64_t)(accum1)) & mask;

            accum0 >>= 56;
            accum1 >>= 56;
        }
    } else {
        /* Columns 0 and 4 (i = 0): one direct term, three wrapped terms. */
        accum2  = widemul(a[0],  b[0]);
        accum1 += widemul(aa[0], bb[0]);
        accum0 += widemul(a[4],  b[4]);

        accum2 += widemul(a[1],  b[7]);
        accum1 += widemul(aa[1], bbb[3]);
        accum0 += widemul(a[5],  bb[3]);

        accum2 += widemul(a[2],  b[6]);
        accum1 += widemul(aa[2], bbb[2]);
        accum0 += widemul(a[6],  bb[2]);

        accum2 += widemul(a[3],  b[5]);
        accum1 += widemul(aa[3], bbb[1]);
        accum0 += widemul(a[7],  bb[1]);

        accum1 -= accum2;
        accum0 += accum2;

        c[0] = ((uint64_t)(accum0)) & mask;
        c[4] = ((uint64_t)(accum1)) & mask;

        accum0 >>= 56;
        accum1 >>= 56;

        /* Columns 1 and 5 (i = 1): two direct terms, two wrapped terms. */
        accum2  = widemul(a[0],  b[1]);
        accum1 += widemul(aa[0], bb[1]);
        accum0 += widemul(a[4],  b[5]);

        accum2 += widemul(a[1],  b[0]);
        accum1 += widemul(aa[1], bb[0]);
        accum0 += widemul(a[5],  b[4]);

        accum2 += widemul(a[2],  b[7]);
        accum1 += widemul(aa[2], bbb[3]);
        accum0 += widemul(a[6],  bb[3]);

        accum2 += widemul(a[3],  b[6]);
        accum1 += widemul(aa[3], bbb[2]);
        accum0 += widemul(a[7],  bb[2]);

        accum1 -= accum2;
        accum0 += accum2;

        c[1] = ((uint64_t)(accum0)) & mask;
        c[5] = ((uint64_t)(accum1)) & mask;

        accum0 >>= 56;
        accum1 >>= 56;

        /* Columns 2 and 6 (i = 2): three direct terms, one wrapped term. */
        accum2  = widemul(a[0],  b[2]);
        accum1 += widemul(aa[0], bb[2]);
        accum0 += widemul(a[4],  b[6]);

        accum2 += widemul(a[1],  b[1]);
        accum1 += widemul(aa[1], bb[1]);
        accum0 += widemul(a[5],  b[5]);

        accum2 += widemul(a[2],  b[0]);
        accum1 += widemul(aa[2], bb[0]);
        accum0 += widemul(a[6],  b[4]);

        accum2 += widemul(a[3],  b[7]);
        accum1 += widemul(aa[3], bbb[3]);
        accum0 += widemul(a[7],  bb[3]);

        accum1 -= accum2;
        accum0 += accum2;

        c[2] = ((uint64_t)(accum0)) & mask;
        c[6] = ((uint64_t)(accum1)) & mask;

        accum0 >>= 56;
        accum1 >>= 56;

        /* Columns 3 and 7 (i = 3): four direct terms, nothing wraps. */
        accum2  = widemul(a[0],  b[3]);
        accum1 += widemul(aa[0], bb[3]);
        accum0 += widemul(a[4],  b[7]);

        accum2 += widemul(a[1],  b[2]);
        accum1 += widemul(aa[1], bb[2]);
        accum0 += widemul(a[5],  b[6]);

        accum2 += widemul(a[2],  b[1]);
        accum1 += widemul(aa[2], bb[1]);
        accum0 += widemul(a[6],  b[5]);

        accum2 += widemul(a[3],  b[0]);
        accum1 += widemul(aa[3], bb[0]);
        accum0 += widemul(a[7],  b[4]);

        accum1 -= accum2;
        accum0 += accum2;

        c[3] = ((uint64_t)(accum0)) & mask;
        c[7] = ((uint64_t)(accum1)) & mask;

        accum0 >>= 56;
        accum1 >>= 56;
    } /* !I_HATE_UNROLLED_LOOPS */

    /* Fold the two outgoing carries back in: the carry out of limb 3
     * (accum0) continues into limb 4, while the carry out of limb 7
     * (accum1) wraps around into both limb 0 and limb 4. */
    accum0 += accum1;
    accum0 += c[4];
    accum1 += c[0];
    c[4] = ((uint64_t)(accum0)) & mask;
    c[0] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* One more carry step; c[5] and c[1] are left unmasked here. */
    c[5] += ((uint64_t)(accum0));
    c[1] += ((uint64_t)(accum1));
}
184 | 
185 | void
186 | p448_mulw (
187 |     p448_t *__restrict__ cs,
188 |     const p448_t *as,
189 |     uint64_t b
190 | ) {
191 |     const uint64_t *a = as->limb;
192 |     uint64_t *c = cs->limb;
193 | 
194 |     __uint128_t accum0 = 0, accum4 = 0;
195 |     uint64_t mask = (1ull<<56) - 1;  
196 | 
197 |     int i;
198 |     for (i=0; i<4; i++) {
199 |         accum0 += widemul(b, a[i]);
200 |         accum4 += widemul(b, a[i+4]);
201 |         c[i]   = accum0 & mask; accum0 >>= 56;
202 |         c[i+4] = accum4 & mask; accum4 >>= 56;
203 |     }
204 |     
205 |     accum0 += accum4 + c[4];
206 |     c[4] = accum0 & mask;
207 |     c[5] += accum0 >> 56;
208 | 
209 |     accum4 += c[0];
210 |     c[0] = accum4 & mask;
211 |     c[1] += accum4 >> 56;
212 | }
213 | 
/**
 * Square a field element: cs = as * as, partially reduced.
 *
 * Same limb layout as p448_mul (eight limbs, radix 2^56, split into low
 * and high halves with aa = low + high), but symmetric cross products are
 * computed once and doubled.  Here accum0 feeds the lower-half limbs
 * c[0..3] and accum1 the upper-half limbs c[4..7], with one exception:
 * in the first stanza the roles are swapped (accum1 -> c[3],
 * accum0 -> c[7]).
 *
 * Columns are produced in the order 3/7 (provisional), 0/4, 1/5, 2/6, and
 * then 3/7 is finished with the carries out of 2/6.
 *
 * The accumulators are unsigned 128-bit; the `-=` steps rely on modular
 * wrap-around cancelling before each value is masked or shifted, so the
 * statement order around each store matters.
 *
 * @param cs  Output; must not alias the input (it is __restrict__).
 *            c[0] and c[4] absorb the final carries unmasked.
 * @param as  Input element.
 */
void
p448_sqr (
    p448_t *__restrict__ cs,
    const p448_t *as
) {
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ull<<56) - 1;  

    uint64_t aa[4];

    /* For some reason clang doesn't vectorize this without prompting? */
    unsigned int i;
    for (i=0; i<4; i++) {
        aa[i] = a[i] + a[i+4];
    }

    /* Columns 3 and 7, provisional: only cross terms occur, each twice.
     * Sum them once, double on store (<<1), and shift the carry by 55
     * rather than 56 so that it is doubled too. */
    accum2  = widemul(a[0],a[3]);
    accum0  = widemul(aa[0],aa[3]);
    accum1  = widemul(a[4],a[7]);

    accum2 += widemul(a[1], a[2]);
    accum0 += widemul(aa[1], aa[2]);
    accum1 += widemul(a[5], a[6]);

    accum0 -= accum2;
    accum1 += accum2;

    c[3] = ((uint64_t)(accum1))<<1 & mask;
    c[7] = ((uint64_t)(accum0))<<1 & mask;

    accum0 >>= 55;
    accum1 >>= 55;

    /* Columns 0 and 4.  The carries just produced wrap around into this
     * column; `accum1 += accum0` folds the column-7 carry into both. */
    accum0 += widemul(2*aa[1],aa[3]);
    accum1 += widemul(2*a[5], a[7]);
    accum0 += widemul(aa[2], aa[2]);
    accum1 += accum0;

    accum0 -= widemul(2*a[1], a[3]);
    accum1 += widemul(a[6], a[6]);
    
    accum2 = widemul(a[0],a[0]);
    accum1 -= accum2;
    accum0 += accum2;

    accum0 -= widemul(a[2], a[2]);
    accum1 += widemul(aa[0], aa[0]);
    accum0 += widemul(a[4], a[4]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Columns 1 and 5: wrapped terms first, then the direct terms. */
    accum2  = widemul(2*aa[2],aa[3]);
    accum0 -= widemul(2*a[2], a[3]);
    accum1 += widemul(2*a[6], a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul(2*a[0],a[1]);
    accum1 += widemul(2*aa[0], aa[1]);
    accum0 += widemul(2*a[4], a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Columns 2 and 6: wrapped terms first, then the direct terms. */
    accum2  = widemul(aa[3],aa[3]);
    accum0 -= widemul(a[3], a[3]);
    accum1 += widemul(a[7], a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2  = widemul(2*a[0],a[2]);
    accum1 += widemul(2*aa[0], aa[2]);
    accum0 += widemul(2*a[4], a[6]);

    accum2 += widemul(a[1], a[1]);
    accum1 += widemul(aa[1], aa[1]);
    accum0 += widemul(a[5], a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Finish columns 3 and 7 by adding the carries out of columns 2/6 to
     * the provisional values stored at the top of the function. */
    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */

    /* Last carries: out of limb 3 into limb 4, and out of limb 7 wrapped
     * into both limb 0 and limb 4.  c[4] and c[0] are left unmasked. */
    accum0 >>= 56;
    accum1 >>= 56;
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
}
328 | 
329 | void
330 | p448_strong_reduce (
331 |     p448_t *a
332 | ) {
333 |     uint64_t mask = (1ull<<56)-1;
334 | 
335 |     /* first, clear high */
336 |     a->limb[4] += a->limb[7]>>56;
337 |     a->limb[0] += a->limb[7]>>56;
338 |     a->limb[7] &= mask;
339 | 
340 |     /* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */
341 | 
342 |     /* compute total_value - p.  No need to reduce mod p. */
343 | 
344 |     __int128_t scarry = 0;
345 |     int i;
346 |     for (i=0; i<8; i++) {
347 |         scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask);
348 |         a->limb[i] = scarry & mask;
349 |         scarry >>= 56;
350 |     }
351 | 
352 |     /* uncommon case: it was >= p, so now scarry = 0 and this = x
353 |     * common case: it was < p, so now scarry = -1 and this = x - p + 2^448
354 |     * so let's add back in p.  will carry back off the top for 2^448.
355 |     */
356 | 
357 |     assert(is_zero(scarry) | is_zero(scarry+1));
358 | 
359 |     uint64_t scarry_mask = scarry & mask;
360 |     __uint128_t carry = 0;
361 | 
362 |     /* add it back */
363 |     for (i=0; i<8; i++) {
364 |         carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask);
365 |         a->limb[i] = carry & mask;
366 |         carry >>= 56;
367 |     }
368 | 
369 |     assert(is_zero(carry + scarry));
370 | }
371 | 
372 | void
373 | p448_serialize (
374 |     uint8_t *serial,
375 |     const struct p448_t *x
376 | ) {
377 |     int i,j;
378 |     p448_t red;
379 |     p448_copy(&red, x);
380 |     p448_strong_reduce(&red);
381 |     for (i=0; i<8; i++) {
382 |         for (j=0; j<7; j++) {
383 |             serial[7*i+j] = red.limb[i];
384 |             red.limb[i] >>= 8;
385 |         }
386 |         assert(red.limb[i] == 0);
387 |     }
388 | }
389 | 
390 | mask_t
391 | p448_deserialize (
392 |     p448_t *x,
393 |     const uint8_t serial[56]
394 | ) {
395 |     int i,j;
396 |     for (i=0; i<8; i++) {
397 |         uint64_t out = 0;
398 |         for (j=0; j<7; j++) {
399 |             out |= ((uint64_t)serial[7*i+j])<<(8*j);
400 |         }
401 |         x->limb[i] = out;
402 |     }
403 |     
404 |     /* Check for reduction.
405 |      *
406 |      * The idea is to create a variable ge which is all ones (rather, 56 ones)
407 |      * if and only if the low $i$ words of $x$ are >= those of p.
408 |      *
409 |      * Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
410 |      */
411 |     uint64_t ge = -1, mask = (1ull<<56)-1;
412 |     for (i=0; i<4; i++) {
413 |         ge &= x->limb[i];
414 |     }
415 |     
416 |     /* At this point, ge = 1111 iff bottom are all 1111.  Now propagate if 1110, or set if 1111 */
417 |     ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
418 |     
419 |     /* Propagate the rest */
420 |     for (i=5; i<8; i++) {
421 |         ge &= x->limb[i];
422 |     }
423 |     
424 |     return ~is_zero(ge ^ mask);
425 | }
426 | 


--------------------------------------------------------------------------------