├── LICENSE
├── avx2
    ├── Makefile
    ├── api.h
    ├── include
    │   ├── comb.c
    │   ├── comb.h
    │   ├── constant_time.c
    │   ├── constant_time.h
    │   ├── curve.c
    │   ├── curve.h
    │   ├── f11_260.c
    │   ├── f11_260.h
    │   ├── gen.c
    │   ├── gen.h
    │   ├── scalar.c
    │   ├── scalar.h
    │   └── sign.h
    └── src
    │   ├── api.c.supercop_only
    │   ├── main.c
    │   └── sign.c
├── avx512
    ├── include
    └── src
    │   ├── f11_260.c
    │   ├── main.c
    │   ├── scalar.c
    │   └── sign.c
└── ref
    ├── Makefile
    ├── api.h
    ├── include
        ├── comb.c
        ├── comb.h
        ├── constant_time.c
        ├── constant_time.h
        ├── curve.c
        ├── curve.h
        ├── f11_260.c
        ├── f11_260.h
        ├── gen.c
        ├── gen.h
        ├── scalar.c
        ├── scalar.h
        └── sign.h
    └── src
        ├── api.c.supercop_only
        ├── main.c
        └── sign.c


/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2019, Pyrofex Inc.
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | 1. Redistributions of source code must retain the above copyright notice, this
 8 |    list of conditions and the following disclaimer.
 9 | 2. Redistributions in binary form must reproduce the above copyright notice,
10 |    this list of conditions and the following disclaimer in the documentation
11 |    and/or other materials provided with the distribution.
12 | 
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 | 
24 | The views and conclusions contained in the software and documentation are those
25 | of the authors and should not be interpreted as representing official policies,
26 | either expressed or implied, of the p11_260 project.
27 | 


--------------------------------------------------------------------------------
/avx2/Makefile:
--------------------------------------------------------------------------------
1 | ../ref/Makefile


--------------------------------------------------------------------------------
/avx2/api.h:
--------------------------------------------------------------------------------
1 | #define CRYPTO_SECRETKEYBYTES 66
2 | #define CRYPTO_PUBLICKEYBYTES 33
3 | #define CRYPTO_BYTES 65
4 | #define CRYPTO_VERSION "1.0"
5 | 


--------------------------------------------------------------------------------
/avx2/include/comb.h:
--------------------------------------------------------------------------------
 1 | #ifndef COMB_H
 2 | #define COMB_H
 3 | 
 4 | #include "curve.h"
 5 | #include "scalar.h"
 6 | 
 7 | #define COMB_TABLE_SIZE 16
 8 | #define COMB_TEETH 5
 9 | #define COMB_COUNT 4
10 | #define COMB_SEPARATION 13
11 | #define COMB_LOOKUP_MASK 0xf
12 | 
13 | // A single comb table.
14 | typedef struct sabs_single_comb {
15 |   extended_affine_pt_readd_narrow_t table[COMB_TABLE_SIZE];
16 | } sabs_single_comb_t;
17 | 
18 | // A single wide comb table. Used in computing a narrow comb table.
19 | typedef struct sabs_single_comb_wide {
20 |   projective_pt_wide_t table[COMB_TABLE_SIZE];
21 | } sabs_single_comb_wide_t;
22 | 
23 | // A comb set. There is a precomputed comb set for the base point, but for
24 | // verifications of several signatures from the same key, it would be
25 | // advantageous to precompute a comb.
26 | typedef struct sabs_comb_set {
27 |   sabs_single_comb_t combs[COMB_COUNT];
28 | } sabs_comb_set_t;
29 | 
30 | // An unreduced comb set. Used just to separate the logic of comb computation
31 | // from comb reduction.
32 | typedef struct sabs_comb_set_wide {
33 |   sabs_single_comb_wide_t combs[COMB_COUNT];
34 | } sabs_comb_set_wide_t;
35 | 
36 | // used for computing the entries in the comb table.
37 | typedef struct teeth_set {
38 |   // We don't need the lowest tooth to compute the entries, because for signed
39 |   // all bits set, to change the bit, you add or subtract a value of 2*bit.
40 |   extended_pt_readd_wide_t teeth[COMB_TEETH - 1];
41 | } teeth_set_t;
42 | 
43 | // The base comb used for fast signatures.
44 | sabs_comb_set_t base_comb;
45 | 
46 | // Compute a comb set for a given point.
47 | void compute_comb_set(
48 |   sabs_comb_set_t *result, const affine_pt_narrow_t *base_pt);
49 | 
50 | // Helper function used to compute a comb set.
51 | void reduce_comb_set(sabs_comb_set_t *result, sabs_comb_set_wide_t *source);
52 | 
53 | // Constant time multiplication of a scalar times a point given the point's
54 | // comb.
55 | void scalar_comb_multiply(
56 |   projective_pt_wide_t *result, const sabs_comb_set_t * __restrict comb,
57 |   const scalar_t * __restrict n);
58 | 
59 | // Non-Constant time multiplication of a scalar times a point given the point's
60 | // comb. Can be safely used during signature verification because there are no
61 | // secrets during verification.
62 | void scalar_comb_multiply_unsafe(
63 |   projective_pt_wide_t *result, const sabs_comb_set_t * __restrict comb,
64 |   const scalar_t * __restrict n);
65 | #endif
66 | 


--------------------------------------------------------------------------------
/avx2/include/constant_time.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | #include "f11_260.h"
  3 | #include "curve.h"
  4 | 
  5 | #include "emmintrin.h"
  6 | #include "immintrin.h"
  7 | 
  8 | static inline void mask_copy_narrow(
  9 |   int32_t mask, residue_narrow_t *result,
 10 |   residue_narrow_t *x) {
 11 | 
 12 |   #pragma clang loop unroll(full)
 13 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
 14 |     result->limbs[i+1] |= x->limbs[i] & mask;
 15 |   }
 16 | }
 17 | 
 18 | // 12 * 32 * 4 = 6 * 256
 19 | void constant_time_extended_narrow_lookup(
 20 |   extended_pt_readd_narrow_t *result, int i, int n,
 21 |   const extended_pt_readd_narrow_t *table) {
 22 | 
 23 |   __m256i accum[6];
 24 |   __m256i big_i = _mm256_set1_epi32(i);
 25 |   __m256i big_one = _mm256_set1_epi32(1);
 26 |   #pragma clang loop unroll(full)
 27 |   for (int j = 0; j < 6; ++j) {
 28 |     accum[j] = _mm256_setzero_si256();
 29 |   }
 30 |   for (int j = 0; j < n; ++j) {
 31 |     __m256i mask = _mm256_cmpeq_epi32(big_i, _mm256_setzero_si256());
 32 |     #pragma clang loop unroll(full)
 33 |     for (int k = 0; k < 6; ++k) {
 34 |       __m256i temp = _mm256_load_si256(((__m256i*) &table[j]) + k);
 35 |       temp = _mm256_and_si256(temp, mask);
 36 |       accum[k] = _mm256_or_si256(accum[k], temp);
 37 |     }
 38 |     big_i = _mm256_sub_epi64(big_i, big_one);
 39 |   }
 40 |   for (int j = 0; j < 6; ++j) {
 41 |     _mm256_store_si256(((__m256i*) result) + j, accum[j]);
 42 |   }
 43 | }
 44 | 
 45 | void constant_time_extended_affine_narrow_lookup(
 46 |   extended_affine_pt_readd_narrow_t *result, int i, int n,
 47 |   const extended_affine_pt_readd_narrow_t *table) {
 48 | 
 49 |   __m256i accum[5];
 50 |   __m256i big_i = _mm256_set1_epi32(i);
 51 |   __m256i big_one = _mm256_set1_epi32(1);
 52 |   #pragma clang loop unroll(full)
 53 |   for (int j = 0; j < 5; ++j) {
 54 |     accum[j] = _mm256_setzero_si256();
 55 |   }
 56 |   for (int j = 0; j < n; ++j) {
 57 |     __m256i mask = _mm256_cmpeq_epi32(big_i, _mm256_setzero_si256());
 58 |     #pragma clang loop unroll(full)
 59 |     for (int k = 0; k < 5; ++k) {
 60 |       __m256i temp = _mm256_load_si256(((__m256i*) &table[j]) + k);
 61 |       temp = _mm256_and_si256(temp, mask);
 62 |       accum[k] = _mm256_or_si256(accum[k], temp);
 63 |     }
 64 |     big_i = _mm256_sub_epi64(big_i, big_one);
 65 |   }
 66 |   for (int j = 0; j < 5; ++j) {
 67 |     _mm256_store_si256(((__m256i*) result) + j, accum[j]);
 68 |   }
 69 | }
 70 | 
 71 | void constant_time_cond_extended_negate(
 72 |   extended_pt_readd_narrow_t *x, int32_t mask32) {
 73 |   __m256i zero = _mm256_setzero_si256();
 74 |   __m256i mask = _mm256_set1_epi32(mask32);
 75 |   __m256i not_mask = _mm256_set1_epi32(~mask32);
 76 | 
 77 |   #pragma clang loop unroll(full)
 78 |   for (int i = 0; i < 3; ++i) {
 79 |     __m256i temp = _mm256_load_si256(((__m256i*) x) + i);
 80 |     __m256i neg_temp = _mm256_sub_epi32(zero, temp);
 81 |     temp = _mm256_and_si256(not_mask, temp);
 82 |     neg_temp = _mm256_and_si256(mask, neg_temp);
 83 |     temp = _mm256_or_si256(temp, neg_temp);
 84 |     _mm256_store_si256(((__m256i*) x) + i, temp);
 85 |   }
 86 | }
 87 | 
 88 | void constant_time_cond_extended_affine_negate(
 89 |   extended_affine_pt_readd_narrow_t *x, int32_t mask32) {
 90 |   __m256i zero = _mm256_setzero_si256();
 91 |   __m256i mask = _mm256_set1_epi32(mask32);
 92 |   __m256i not_mask = _mm256_set1_epi32(~mask32);
 93 | 
 94 |   #pragma clang loop unroll(full)
 95 |   for (int i = 0; i < 3; ++i) {
 96 |     __m256i temp = _mm256_load_si256(((__m256i*) x) + i);
 97 |     __m256i neg_temp = _mm256_sub_epi32(zero, temp);
 98 |     temp = _mm256_and_si256(not_mask, temp);
 99 |     neg_temp = _mm256_and_si256(mask, neg_temp);
100 |     temp = _mm256_or_si256(temp, neg_temp);
101 |     _mm256_store_si256(((__m256i*) x) + i, temp);
102 |   }
103 | }
104 | 


--------------------------------------------------------------------------------
/avx2/include/constant_time.h:
--------------------------------------------------------------------------------
 1 | #ifndef CONSTANT_TIME_H
 2 | #define CONSTANT_TIME_H
 3 | #include <stdint.h>
 4 | #include "f11_260.h"
 5 | #include "curve.h"
 6 | 
 7 | void constant_time_extended_narrow_lookup(
 8 |   extended_pt_readd_narrow_t *result, int i, int n,
 9 |   const extended_pt_readd_narrow_t *table);
10 | 
11 | void constant_time_extended_affine_narrow_lookup(
12 |   extended_affine_pt_readd_narrow_t *result, int i, int n,
13 |   const extended_affine_pt_readd_narrow_t *table);
14 | 
15 | void constant_time_cond_extended_negate(
16 |   extended_pt_readd_narrow_t *x, int32_t mask);
17 | 
18 | void constant_time_cond_extended_affine_negate(
19 |   extended_affine_pt_readd_narrow_t *x, int32_t mask);
20 | #endif
21 | 


--------------------------------------------------------------------------------
/avx2/include/curve.c:
--------------------------------------------------------------------------------
  1 | #include <string.h>
  2 | #include "f11_260.h"
  3 | #include "scalar.h"
  4 | #include "curve.h"
  5 | #include "constant_time.h"
  6 | 
  7 | __attribute__((__aligned__(32)))
  8 | const affine_pt_narrow_t B = {
  9 |   .x = {
 10 |     .limbs = {
 11 |       0, 0x2862b8b, 0x0f08ed2, 0x06e65ee, 0x0c05991, 0x2b12b17,
 12 |       0x0049432, 0x33a3707, 0x16e5186, 0x2947e71, 0x0ed9bab, 0,
 13 |     },
 14 |   },
 15 |   .y = {
 16 |     .limbs = {
 17 |       0x0, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
 18 |     },
 19 |   },
 20 | };
 21 | 
 22 | void copy_projective_pt_wide(
 23 |   projective_pt_wide_t *result, const projective_pt_wide_t *source) {
 24 | 
 25 |   for(int i = 0; i < NLIMBS; ++i) {
 26 |     result->x.limbs[i] = source->x.limbs[i];
 27 |     result->y.limbs[i] = source->y.limbs[i];
 28 |     result->z.limbs[i] = source->z.limbs[i];
 29 |   }
 30 | }
 31 | 
 32 | void copy_extended_pt_wide(
 33 |   extended_pt_wide_t *result,
 34 |   const extended_pt_wide_t *source) {
 35 | 
 36 |   for(int i = 0; i < NLIMBS; ++i) {
 37 |     result->x.limbs[i] = source->x.limbs[i];
 38 |     result->y.limbs[i] = source->y.limbs[i];
 39 |     result->t.limbs[i] = source->t.limbs[i];
 40 |     result->z.limbs[i] = source->z.limbs[i];
 41 |   }
 42 | }
 43 | 
 44 | void copy_extended_pt_readd_wide(
 45 |   extended_pt_readd_wide_t *result,
 46 |   const extended_pt_readd_wide_t *source) {
 47 | 
 48 |   for(int i = 0; i < NLIMBS; ++i) {
 49 |     result->x.limbs[i] = source->x.limbs[i];
 50 |     result->y.limbs[i] = source->y.limbs[i];
 51 |     result->dt.limbs[i] = source->dt.limbs[i];
 52 |     result->z.limbs[i] = source->z.limbs[i];
 53 |   }
 54 | }
 55 | 
 56 | void copy_extended_pt_readd_narrow(
 57 |   extended_pt_readd_narrow_t *result,
 58 |   const extended_pt_readd_narrow_t *source) {
 59 |   for(int i = 0; i < NLIMBS; ++i) {
 60 |     result->x.limbs[i] = source->x.limbs[i];
 61 |     result->y.limbs[i] = source->y.limbs[i];
 62 |     result->dt.limbs[i] = source->dt.limbs[i];
 63 |     result->z.limbs[i] = source->z.limbs[i];
 64 |   }
 65 | }
 66 | 
 67 | void copy_extended_affine_pt_readd_narrow(
 68 |   extended_affine_pt_readd_narrow_t *result,
 69 |   const extended_affine_pt_readd_narrow_t *source) {
 70 |   for(int i = 0; i < NLIMBS; ++i) {
 71 |     result->x.limbs[i] = source->x.limbs[i];
 72 |     result->y.limbs[i] = source->y.limbs[i];
 73 |     result->dt.limbs[i] = source->dt.limbs[i];
 74 |   }
 75 | }
 76 | 
 77 | void negate_extended_pt_readd_wide(
 78 |   extended_pt_readd_wide_t *result,
 79 |   const extended_pt_readd_wide_t *source) {
 80 |   for(int i = 0; i < NLIMBS; ++i) {
 81 |     result->x.limbs[i] = -source->x.limbs[i];
 82 |     result->y.limbs[i] = source->y.limbs[i];
 83 |     result->dt.limbs[i] = -source->dt.limbs[i];
 84 |     result->z.limbs[i] = source->z.limbs[i];
 85 |   }
 86 | }
 87 | 
 88 | void negate_extended_affine_pt_readd_narrow(
 89 |   extended_affine_pt_readd_narrow_t *result,
 90 |   const extended_affine_pt_readd_narrow_t *source) {
 91 |   for(int i = 0; i < NLIMBS; ++i) {
 92 |     result->x.limbs[i] = -source->x.limbs[i];
 93 |     result->dt.limbs[i] = -source->dt.limbs[i];
 94 |     result->y.limbs[i] = source->y.limbs[i];
 95 |   }
 96 | }
 97 | 
 98 | void negate_extended_pt_readd_narrow(
 99 |   extended_pt_readd_narrow_t *result,
100 |   const extended_pt_readd_narrow_t *source) {
101 |   for(int i = 0; i < NLIMBS; ++i) {
102 |     result->x.limbs[i] = -source->x.limbs[i];
103 |     result->dt.limbs[i] = -source->dt.limbs[i];
104 |     result->y.limbs[i] = source->y.limbs[i];
105 |     result->z.limbs[i] = source->z.limbs[i];
106 |   }
107 | }
108 | 
109 | void affine_narrow_to_extended(
110 |   extended_pt_wide_t *result,
111 |   const affine_pt_narrow_t * __restrict x) {
112 | 
113 |   for(int i = 0; i < NLIMBS; ++i) {
114 |     result->x.limbs[i] = x->x.limbs[i];
115 |     result->y.limbs[i] = x->y.limbs[i];
116 |     result->z.limbs[i] = 0;
117 |   }
118 |   result->z.limbs[1] = 1;
119 |   mul_wide(&result->t, &result->x, &result->y);
120 | }
121 | 
122 | void extended_to_projective_wide(
123 |   projective_pt_wide_t *result, const extended_pt_wide_t * __restrict x) {
124 |   for(int i = 0; i < NLIMBS; ++i) {
125 |     result->x.limbs[i] = x->x.limbs[i];
126 |     result->y.limbs[i] = x->y.limbs[i];
127 |     result->z.limbs[i] = x->z.limbs[i];
128 |   }
129 | }
130 | 
131 | void affine_to_readd_narrow(
132 |   extended_pt_readd_narrow_t *result,
133 |   const affine_pt_narrow_t * __restrict x) {
134 | 
135 |   for(int i = 0; i < NLIMBS; ++i) {
136 |     result->x.limbs[i] = x->x.limbs[i];
137 |     result->y.limbs[i] = x->y.limbs[i];
138 |     result->z.limbs[i] = 0;
139 |   }
140 |   result->z.limbs[1] = 1;
141 | 
142 |   residue_wide_t xy;
143 |   residue_wide_t dt_wide;
144 |   mul_narrow(&xy, &x->x, &x->y);
145 |   mul_wide_const(&dt_wide, &xy, D);
146 |   narrow(&result->dt, &dt_wide);
147 | }
148 | 
149 | void affine_to_readd_wide(
150 |   extended_pt_readd_wide_t *result,
151 |   const affine_pt_narrow_t * __restrict x) {
152 | 
153 |   for(int i = 0; i < NLIMBS; ++i) {
154 |     result->x.limbs[i] = x->x.limbs[i];
155 |     result->y.limbs[i] = x->y.limbs[i];
156 |     result->z.limbs[i] = 0;
157 |   }
158 |   result->z.limbs[1] = 1;
159 | 
160 |   residue_wide_t xy;
161 |   mul_narrow(&xy, &x->x, &x->y);
162 |   mul_wide_const(&result->dt, &xy, D);
163 | }
164 | 
165 | void extended_to_readd_wide_neg(
166 |   extended_pt_readd_wide_t *result,
167 |   const extended_pt_wide_t * __restrict x) {
168 | 
169 |   for(int i = 0; i < NLIMBS; ++i) {
170 |     result->x.limbs[i] = -(x->x.limbs[i]);
171 |     result->y.limbs[i] = x->y.limbs[i];
172 |     result->z.limbs[i] = x->z.limbs[i];
173 |   }
174 |   mul_wide_const(&result->dt, &x->t, -D);
175 | }
176 | 
177 | void affine_double(
178 |   projective_pt_wide_t *result,
179 |   const affine_pt_narrow_t * __restrict x) {
180 | 
181 |   residue_narrow_t x_plus_y;
182 |   residue_wide_t a, b, e, e_tmp, g, g_minus_2, h;
183 |   square_narrow(&a, &x->x);
184 |   square_narrow(&b, &x->y);
185 | 
186 |   add_narrow(&x_plus_y, &x->x, &x->y);
187 | 
188 |   square_narrow(&e, &x_plus_y);
189 |   sub_wide(&e_tmp, &e, &a);
190 |   sub_wide(&e, &e_tmp, &b);
191 |   add_wide(&g, &a, &b);
192 | 
193 |   for (int i = 0; i < NLIMBS; ++i) {
194 |     g_minus_2.limbs[i] = g.limbs[i];
195 |   }
196 |   g_minus_2.limbs[1] -= 2;
197 | 
198 |   sub_wide(&h, &a, &b);
199 |   mul_wide(&result->x, &e, &g_minus_2);
200 |   mul_wide(&result->y, &g, &h);
201 |   mul_wide(&result->z, &g, &g_minus_2);
202 | }
203 | 
204 | void affine_double_extended(
205 |   extended_pt_wide_t *result, const affine_pt_narrow_t * __restrict x) {
206 | 
207 |   residue_narrow_t x_plus_y;
208 |   residue_wide_t a, b, e, e_tmp, g, g_minus_2, h;
209 |   square_narrow(&a, &x->x);
210 |   square_narrow(&b, &x->y);
211 | 
212 |   add_narrow(&x_plus_y, &x->x, &x->y);
213 |   square_narrow(&e, &x_plus_y);
214 |   sub_wide(&e_tmp, &e, &a);
215 |   sub_wide(&e, &e_tmp, &b);
216 |   add_wide(&g, &a, &b);
217 | 
218 |   for (int i = 0; i < NLIMBS; ++i) {
219 |     g_minus_2.limbs[i] = g.limbs[i];
220 |   }
221 |   g_minus_2.limbs[1] -= 2;
222 | 
223 |   sub_wide(&h, &a, &b);
224 |   mul_wide(&result->x, &e, &g_minus_2);
225 |   mul_wide(&result->y, &g, &h);
226 |   mul_wide(&result->t, &e, &h);
227 |   mul_wide(&result->z, &g, &g_minus_2);
228 | }
229 | 
230 | void projective_double(
231 |   projective_pt_wide_t *result, const projective_pt_wide_t *x) {
232 | 
233 |   residue_wide_t x_plus_y;
234 |   residue_wide_t a, b, c, c_temp, e, e_tmp, f, g, h;
235 |   add_wide(&x_plus_y, &x->x, &x->y);
236 |   square_wide(&a, &x->x);
237 |   square_wide(&b, &x->y);
238 |   square_wide(&c_temp, &x->z);
239 |   double_wide(&c, &c_temp);
240 | 
241 |   square_wide(&e, &x_plus_y);
242 |   sub_wide(&e_tmp, &e, &a);
243 |   sub_wide(&e, &e_tmp, &b);
244 |   add_wide(&g, &a, &b);
245 |   sub_wide(&f, &g, &c);
246 |   sub_wide(&h, &a, &b);
247 | 
248 |   mul_wide(&result->x, &e, &f);
249 |   mul_wide(&result->y, &g, &h);
250 |   mul_wide(&result->z, &f, &g);
251 | }
252 | 
253 | void projective_double_extended(
254 |   extended_pt_wide_t *result, const projective_pt_wide_t * __restrict x) {
255 | 
256 |   residue_wide_t x_plus_y;
257 |   residue_wide_t a, b, c, c_temp, e, e_tmp, f, g, h;
258 |   add_wide(&x_plus_y, &x->x, &x->y);
259 |   square_wide(&a, &x->x);
260 |   square_wide(&b, &x->y);
261 |   square_wide(&c_temp, &x->z);
262 |   double_wide(&c, &c_temp);
263 | 
264 |   square_wide(&e, &x_plus_y);
265 |   sub_wide(&e_tmp, &e, &a);
266 |   sub_wide(&e, &e_tmp, &b);
267 |   add_wide(&g, &a, &b);
268 |   sub_wide(&f, &g, &c);
269 |   sub_wide(&h, &a, &b);
270 | 
271 |   mul_wide(&result->x, &e, &f);
272 |   mul_wide(&result->y, &g, &h);
273 |   mul_wide(&result->t, &e, &h);
274 |   mul_wide(&result->z, &f, &g);
275 | }
276 | 
277 | void extended_double_extended(
278 |   extended_pt_wide_t *result, const extended_pt_wide_t *x) {
279 | 
280 |   residue_wide_t x_plus_y;
281 |   residue_wide_t a, b, c, c_temp, e, e_tmp, f, g, h;
282 |   add_wide(&x_plus_y, &x->x, &x->y);
283 |   square_wide(&a, &x->x);
284 |   square_wide(&b, &x->y);
285 |   square_wide(&c_temp, &x->z);
286 |   double_wide(&c, &c_temp);
287 | 
288 |   square_wide(&e, &x_plus_y);
289 |   sub_wide(&e_tmp, &e, &a);
290 |   sub_wide(&e, &e_tmp, &b);
291 |   add_wide(&g, &a, &b);
292 |   sub_wide(&f, &g, &c);
293 |   sub_wide(&h, &a, &b);
294 | 
295 |   mul_wide(&result->x, &e, &f);
296 |   mul_wide(&result->z, &f, &g);
297 |   mul_wide(&result->y, &g, &h);
298 |   mul_wide(&result->t, &e, &h);
299 | }
300 | 
301 | void projective_add(
302 |   projective_pt_wide_t *result, const projective_pt_wide_t * __restrict x1,
303 |   const projective_pt_wide_t * __restrict x2) {
304 | 
305 |   residue_wide_t x1_plus_y1, x2_plus_y2;
306 |   residue_wide_t a, b, c, d, e, e_temp, f, g, t1, t2;
307 | 
308 |   mul_wide(&a, &x1->z, &x2->z);
309 |   square_wide(&b, &a);
310 |   mul_wide(&c, &x1->x, &x2->x);
311 |   mul_wide(&d, &x1->y, &x2->y);
312 |   mul_wide_const(&e_temp, &c, D);
313 |   mul_wide(&e, &e_temp, &d);
314 | 
315 |   sub_wide(&f, &b, &e);
316 |   add_wide(&g, &b, &e);
317 |   add_wide(&x1_plus_y1, &x1->x, &x1->y);
318 |   add_wide(&x2_plus_y2, &x2->x, &x2->y);
319 | 
320 |   mul_wide(&t1, &x1_plus_y1, &x2_plus_y2);
321 |   sub_wide(&t2, &t1, &c);
322 |   sub_wide(&t1, &t2, &d);
323 |   mul_wide(&t2, &t1, &f);
324 |   mul_wide(&result->x, &t2, &a);
325 | 
326 |   sub_wide(&t1, &d, &c);
327 |   mul_wide(&t2, &t1, &g);
328 |   mul_wide(&result->y, &t2, &a);
329 | 
330 |   mul_wide(&result->z, &f, &g);
331 | }
332 | 
333 | void extended_add(
334 |   projective_pt_wide_t *result, const extended_pt_wide_t * __restrict x1,
335 |   const extended_pt_wide_t * __restrict x2) {
336 | 
337 |   residue_wide_t x1_plus_y1, x2_plus_y2;
338 |   residue_wide_t a, b, c, c_temp, d, e, e_temp, f, g, h;
339 | 
340 |   mul_wide(&a, &x1->x, &x2->x);
341 |   mul_wide(&b, &x1->y, &x2->y);
342 |   mul_wide_const(&c_temp, &x1->t, D);
343 |   mul_wide(&c, &c_temp, &x2->t);
344 |   mul_wide(&d, &x1->z, &x2->z);
345 | 
346 |   add_wide(&x1_plus_y1, &x1->x, &x1->y);
347 |   add_wide(&x2_plus_y2, &x2->x, &x2->y);
348 |   mul_wide(&e, &x1_plus_y1, &x2_plus_y2);
349 |   sub_wide(&e_temp, &e, &a);
350 |   sub_wide(&e, &e_temp, &b);
351 |   sub_wide(&f, &d, &c);
352 |   add_wide(&g, &d, &c);
353 |   sub_wide(&h, &b, &a);
354 | 
355 |   mul_wide(&result->x, &e, &f);
356 |   mul_wide(&result->z, &f, &g);
357 |   mul_wide(&result->y, &g, &h);
358 | }
359 | 
360 | void extended_add_extended(
361 |   extended_pt_wide_t *result, const extended_pt_wide_t *x1,
362 |   const extended_pt_wide_t *x2) {
363 | 
364 |   residue_wide_t x1_plus_y1, x2_plus_y2;
365 |   residue_wide_t a, b, c, c_temp, d, e, e_temp, f, g, h;
366 | 
367 |   mul_wide(&a, &x1->x, &x2->x);
368 |   mul_wide(&b, &x1->y, &x2->y);
369 |   mul_wide_const(&c_temp, &x1->t, D);
370 |   mul_wide(&c, &c_temp, &x2->t);
371 |   mul_wide(&d, &x1->z, &x2->z);
372 | 
373 |   add_wide(&x1_plus_y1, &x1->x, &x1->y);
374 |   add_wide(&x2_plus_y2, &x2->x, &x2->y);
375 |   mul_wide(&e, &x1_plus_y1, &x2_plus_y2);
376 |   sub_wide(&e_temp, &e, &a);
377 |   sub_wide(&e, &e_temp, &b);
378 |   sub_wide(&f, &d, &c);
379 |   add_wide(&g, &d, &c);
380 |   sub_wide(&h, &b, &a);
381 | 
382 |   mul_wide(&result->x, &e, &f);
383 |   mul_wide(&result->z, &f, &g);
384 |   mul_wide(&result->y, &g, &h);
385 |   mul_wide(&result->t, &e, &h);
386 | }
387 | 
388 | void extended_readd_wide_extended(
389 |   extended_pt_wide_t *result,
390 |   const extended_pt_wide_t *x1,
391 |   const extended_pt_readd_wide_t * __restrict x2) {
392 | 
393 |   residue_wide_t x1_plus_y1, x2_plus_y2;
394 |   residue_wide_t a, b, c, d, e, e_temp, f, g, h;
395 | 
396 |   mul_wide(&a, &x1->x, &x2->x);
397 |   mul_wide(&b, &x1->y, &x2->y);
398 |   mul_wide(&c, &x1->t, &x2->dt);
399 |   mul_wide(&d, &x1->z, &x2->z);
400 | 
401 |   add_wide(&x1_plus_y1, &x1->x, &x1->y);
402 |   add_wide(&x2_plus_y2, &x2->x, &x2->y);
403 |   mul_wide(&e, &x1_plus_y1, &x2_plus_y2);
404 |   sub_wide(&e_temp, &e, &a);
405 |   sub_wide(&e, &e_temp, &b);
406 |   sub_wide(&f, &d, &c);
407 |   add_wide(&g, &d, &c);
408 |   sub_wide(&h, &b, &a);
409 | 
410 |   mul_wide(&result->x, &e, &f);
411 |   mul_wide(&result->z, &f, &g);
412 |   mul_wide(&result->y, &g, &h);
413 |   mul_wide(&result->t, &e, &h);
414 | }
415 | 
416 | void extended_readd_narrow_extended(
417 |   extended_pt_wide_t *result, const extended_pt_wide_t * __restrict x1,
418 |   const extended_pt_readd_narrow_t * __restrict x2) {
419 | 
420 |   residue_wide_t x1_plus_y1;
421 |   residue_narrow_t x2_plus_y2;
422 |   residue_wide_t a, b, c, d, e, e_temp, f, g, h;
423 | 
424 |   mul_wide_narrow(&a, &x1->x, &x2->x);
425 |   mul_wide_narrow(&b, &x1->y, &x2->y);
426 |   mul_wide_narrow(&c, &x1->t, &x2->dt);
427 |   mul_wide_narrow(&d, &x1->z, &x2->z);
428 | 
429 |   add_wide(&x1_plus_y1, &x1->x, &x1->y);
430 |   add_narrow(&x2_plus_y2, &x2->x, &x2->y);
431 |   mul_wide_narrow(&e, &x1_plus_y1, &x2_plus_y2);
432 |   sub_wide(&e_temp, &e, &a);
433 |   sub_wide(&e, &e_temp, &b);
434 |   sub_wide(&f, &d, &c);
435 |   add_wide(&g, &d, &c);
436 |   sub_wide(&h, &b, &a);
437 | 
438 |   mul_wide(&result->x, &e, &f);
439 |   mul_wide(&result->z, &f, &g);
440 |   mul_wide(&result->y, &g, &h);
441 |   mul_wide(&result->t, &e, &h);
442 | }
443 | 
444 | void extended_readd_narrow(
445 |   projective_pt_wide_t *result, const extended_pt_wide_t * __restrict x1,
446 |   const extended_pt_readd_narrow_t * __restrict x2) {
447 | 
448 |   residue_wide_t x1_plus_y1;
449 |   residue_narrow_t x2_plus_y2;
450 |   residue_wide_t a, b, c, d, e, e_temp, f, g, h;
451 | 
452 |   mul_wide_narrow(&a, &x1->x, &x2->x);
453 |   mul_wide_narrow(&b, &x1->y, &x2->y);
454 |   mul_wide_narrow(&c, &x1->t, &x2->dt);
455 |   mul_wide_narrow(&d, &x1->z, &x2->z);
456 | 
457 |   add_wide(&x1_plus_y1, &x1->x, &x1->y);
458 |   add_narrow(&x2_plus_y2, &x2->x, &x2->y);
459 |   mul_wide_narrow(&e, &x1_plus_y1, &x2_plus_y2);
460 |   sub_wide(&e_temp, &e, &a);
461 |   sub_wide(&e, &e_temp, &b);
462 |   sub_wide(&f, &d, &c);
463 |   add_wide(&g, &d, &c);
464 |   sub_wide(&h, &b, &a);
465 | 
466 |   mul_wide(&result->x, &e, &f);
467 |   mul_wide(&result->z, &f, &g);
468 |   mul_wide(&result->y, &g, &h);
469 | }
470 | 
471 | void extended_readd_affine_narrow_extended(
472 |   extended_pt_wide_t *result, const extended_pt_wide_t *x1,
473 |   const extended_affine_pt_readd_narrow_t * __restrict x2) {
474 | 
475 |   residue_wide_t x1_plus_y1;
476 |   residue_narrow_t x2_plus_y2;
477 |   residue_wide_t a, b, c, e, e_temp, f, g, h;
478 | 
479 |   mul_wide_narrow(&a, &x1->x, &x2->x);
480 |   mul_wide_narrow(&b, &x1->y, &x2->y);
481 |   mul_wide_narrow(&c, &x1->t, &x2->dt);
482 | 
483 |   add_wide(&x1_plus_y1, &x1->x, &x1->y);
484 |   add_narrow(&x2_plus_y2, &x2->x, &x2->y);
485 |   mul_wide_narrow(&e, &x1_plus_y1, &x2_plus_y2);
486 |   sub_wide(&e_temp, &e, &a);
487 |   sub_wide(&e, &e_temp, &b);
488 |   sub_wide(&f, &x1->z, &c);
489 |   add_wide(&g, &x1->z, &c);
490 |   sub_wide(&h, &b, &a);
491 | 
492 |   mul_wide(&result->x, &e, &f);
493 |   mul_wide(&result->z, &f, &g);
494 |   mul_wide(&result->y, &g, &h);
495 |   mul_wide(&result->t, &e, &h);
496 | }
497 | 
498 | void extended_readd_readd_narrow(
499 |   extended_pt_readd_narrow_t *result,
500 |   const extended_pt_wide_t * __restrict x1,
501 |   const extended_pt_readd_narrow_t * __restrict x2) {
502 | 
503 |   residue_wide_t x1_plus_y1;
504 |   residue_narrow_t x2_plus_y2;
505 |   residue_wide_t a, b, c, d, e, e_temp, f, g, h, x3, y3, t3, dt3, z3;
506 | 
507 |   mul_wide_narrow(&a, &x1->x, &x2->x);
508 |   mul_wide_narrow(&b, &x1->y, &x2->y);
509 |   mul_wide_narrow(&c, &x1->t, &x2->dt);
510 |   mul_wide_narrow(&d, &x1->z, &x2->z);
511 | 
512 |   add_wide(&x1_plus_y1, &x1->x, &x1->y);
513 |   add_narrow(&x2_plus_y2, &x2->x, &x2->y);
514 |   mul_wide_narrow(&e, &x1_plus_y1, &x2_plus_y2);
515 |   sub_wide(&e_temp, &e, &a);
516 |   sub_wide(&e, &e_temp, &b);
517 |   sub_wide(&f, &d, &c);
518 |   add_wide(&g, &d, &c);
519 |   sub_wide(&h, &b, &a);
520 | 
521 |   mul_wide(&x3, &e, &f);
522 |   mul_wide(&z3, &f, &g);
523 |   mul_wide(&y3, &g, &h);
524 |   mul_wide(&t3, &e, &h);
525 | 
526 |   narrow(&result->x, &x3);
527 |   narrow(&result->y, &y3);
528 |   mul_wide_const(&dt3, &t3, D);
529 |   narrow(&result->dt, &dt3);
530 |   narrow(&result->z, &z3);
531 | }
532 | 
533 | void readd_to_projective(
534 |   projective_pt_wide_t *result,
535 |   const extended_pt_readd_narrow_t * __restrict x) {
536 | 
537 |   widen(&result->x, &x->x);
538 |   widen(&result->y, &x->y);
539 |   widen(&result->z, &x->z);
540 | }
541 | 
542 | void affine_readd_to_extended(
543 |   extended_pt_wide_t *result,
544 |   const extended_affine_pt_readd_narrow_t * __restrict x) {
545 | 
546 |   widen(&result->x, &x->x);
547 |   widen(&result->y, &x->y);
548 |   mul_narrow(&result->t, &x->x, &x->y);
549 |   for (int i = 0; i < NLIMBS; ++i) {
550 |     result->z.limbs[i] = 0;
551 |   }
552 |   result->z.limbs[1] = 1;
553 | }
554 | 
555 | void scalar_multiply(
556 |   projective_pt_wide_t *result, const affine_pt_narrow_t * __restrict x,
557 |   const scalar_t * __restrict n) {
558 | 
559 |   scalar_t sabs_n;
560 |   convert_to_sabs(&sabs_n, n);
561 | 
562 |   const int WINDOW_BITS = 5;
563 |   const uint32_t WINDOW_MASK = (1 << WINDOW_BITS) - 1;
564 |   const uint32_t LOOKUP_MASK = WINDOW_MASK >> 1;
565 |   const int TABLE_SIZE = 16;
566 |   extended_pt_readd_narrow_t table[TABLE_SIZE];
567 | 
568 |   extended_pt_wide_t x2;
569 |   affine_double_extended(&x2, x);
570 |   affine_to_readd_narrow(&table[0], x);
571 |   for (int i = 1; i < TABLE_SIZE; ++i) {
572 |     extended_readd_readd_narrow(&table[i], &x2, &table[i-1]);
573 |   }
574 | 
575 |   int i;
576 |   int first = 1;
577 |   // Set i to the highest i such that
578 |   // a) i < SCALAR_BITS
579 |   // b) i % WINDOW_BITS = 0
580 | 
581 |   projective_pt_wide_t temp;
582 |   extended_pt_wide_t temp_ext;
583 |   extended_pt_readd_narrow_t window_pt;
584 | 
585 |   i = SCALAR_BITS - ((SCALAR_BITS - 1) % WINDOW_BITS) - 1;
586 |   for (; i >= 0; i -= WINDOW_BITS) {
587 |     uint32_t bits = sabs_n.limbs[i/SCALAR_LIMB_BITS] >> (i % SCALAR_LIMB_BITS);
588 |     if (i % SCALAR_LIMB_BITS > (SCALAR_LIMB_BITS - WINDOW_BITS) &&
589 |         i / SCALAR_LIMB_BITS < SCALAR_LIMBS - 1) {
590 | 
591 |       bits |= sabs_n.limbs[i/SCALAR_LIMB_BITS + 1] <<
592 |         (SCALAR_LIMB_BITS - i % SCALAR_LIMB_BITS);
593 |     }
594 | 
595 |     bits &= WINDOW_MASK;
596 |     int32_t invert = (bits >> (WINDOW_BITS - 1)) - 1;
597 |     bits ^= invert;
598 | 
599 |     constant_time_extended_narrow_lookup(
600 |       &window_pt, bits & LOOKUP_MASK, TABLE_SIZE, table);
601 |     constant_time_cond_extended_negate(&window_pt, invert);
602 | 
603 |     if (first) {
604 |       readd_to_projective(&temp, &window_pt);
605 |       first = 0;
606 |     } else {
607 |       for (int i = 0; i < WINDOW_BITS - 1; ++i) {
608 |         projective_double(&temp, &temp);
609 |       }
610 |       projective_double_extended(&temp_ext, &temp);
611 |       extended_readd_narrow(&temp, &temp_ext, &window_pt);
612 |     }
613 |   }
614 | 
615 |   copy_projective_pt_wide(result, &temp);
616 |   explicit_bzero(&sabs_n, sizeof(sabs_n));
617 |   explicit_bzero(&window_pt, sizeof(window_pt));
618 |   explicit_bzero(table, sizeof(table));
619 |   explicit_bzero(&temp, sizeof(temp));
620 |   explicit_bzero(&temp_ext, sizeof(temp_ext));
621 | }
622 | 
623 | void scalar_multiply_unsafe(
624 |   projective_pt_wide_t *result, const affine_pt_narrow_t * __restrict x,
625 |   const scalar_t * __restrict n) {
626 | 
627 |   scalar_t sabs_n;
628 |   convert_to_sabs(&sabs_n, n);
629 | 
630 |   const int WINDOW_BITS = 5;
631 |   const uint32_t WINDOW_MASK = (1 << WINDOW_BITS) - 1;
632 |   const uint32_t LOOKUP_MASK = WINDOW_MASK >> 1;
633 |   const int TABLE_SIZE = 16;
634 |   extended_pt_readd_narrow_t table[TABLE_SIZE];
635 | 
636 |   extended_pt_wide_t x2;
637 |   affine_double_extended(&x2, x);
638 |   affine_to_readd_narrow(&table[0], x);
639 |   for (int i = 1; i < TABLE_SIZE; ++i) {
640 |     extended_readd_readd_narrow(&table[i], &x2, &table[i-1]);
641 |   }
642 | 
643 |   int i;
644 |   int first = 1;
645 |   // Set i to the highest i such that
646 |   // a) i < SCALAR_BITS
647 |   // b) i % WINDOW_BITS = 0
648 | 
649 |   projective_pt_wide_t temp;
650 |   extended_pt_wide_t temp_ext;
651 |   extended_pt_readd_narrow_t window_pt;
652 | 
653 |   i = SCALAR_BITS - ((SCALAR_BITS - 1) % WINDOW_BITS) - 1;
654 |   for (; i >= 0; i -= WINDOW_BITS) {
655 |     uint32_t bits = sabs_n.limbs[i/SCALAR_LIMB_BITS] >> (i % SCALAR_LIMB_BITS);
656 |     if (i % SCALAR_LIMB_BITS > (SCALAR_LIMB_BITS - WINDOW_BITS) &&
657 |         i / SCALAR_LIMB_BITS < SCALAR_LIMBS - 1) {
658 | 
659 |       bits |= sabs_n.limbs[i/SCALAR_LIMB_BITS + 1] <<
660 |         (SCALAR_LIMB_BITS - i % SCALAR_LIMB_BITS);
661 |     }
662 | 
663 |     bits &= WINDOW_MASK;
664 |     int32_t invert = (bits >> (WINDOW_BITS - 1)) - 1;
665 |     bits ^= invert;
666 | 
667 |     copy_extended_pt_readd_narrow(&window_pt, &table[bits & LOOKUP_MASK]);
668 |     if (invert) {
669 |       negate_extended_pt_readd_narrow(&window_pt, &window_pt);
670 |     }
671 | 
672 |     if (first) {
673 |       readd_to_projective(&temp, &window_pt);
674 |       first = 0;
675 |     } else {
676 |       for (int i = 0; i < WINDOW_BITS - 1; ++i) {
677 |         projective_double(&temp, &temp);
678 |       }
679 |       projective_double_extended(&temp_ext, &temp);
680 |       extended_readd_narrow(&temp, &temp_ext, &window_pt);
681 |     }
682 |   }
683 | 
684 |   copy_projective_pt_wide(result, &temp);
685 | }
686 | 
687 | int point_decompress(
688 |   affine_pt_narrow_t *result,
689 |   residue_narrow_reduced_t *y, int low_bit) {
690 | 
691 |   residue_narrow_t y_n;
692 | 
693 |   residue_wide_t u;
694 |   residue_wide_t v;
695 | 
696 |   residue_wide_t y2;
697 |   residue_narrow_reduced_t temp;
698 |   residue_wide_t x_wide;
699 | 
700 |   unnarrow_reduce(&y_n, y);
701 |   square_narrow(&y2, &y_n);
702 |   copy_narrow(&result->y, &y_n);
703 | 
704 |   sub_wide(&u, &one_wide, &y2);
705 |   mul_wide_const(&y2, &y2, D);
706 |   sub_wide(&v, &one_wide, &y2);
707 | 
708 |   if (sqrt_inv_wide(&x_wide, &u, &v)) {
709 |     narrow(&result->x, &x_wide);
710 |     narrow_partial_complete(&temp, &result->x);
711 | 
712 |     int x_is_odd = is_odd(&temp);
713 |     if ((x_is_odd && !low_bit) || (low_bit && !x_is_odd)) {
714 |       negate_narrow(&result->x, &result->x);
715 |     }
716 | 
717 |     return 1;
718 |   }
719 | 
720 |   return 0;
721 | }
722 | 


--------------------------------------------------------------------------------
/avx2/include/curve.h:
--------------------------------------------------------------------------------
  1 | #ifndef CURVE_H
  2 | #define CURVE_H
  3 | #include "f11_260.h"
  4 | #include "scalar.h"
  5 | 
  6 | typedef struct affine_pt_narrow {
  7 |   residue_narrow_t x;
  8 |   residue_narrow_t y;
  9 | } affine_pt_narrow_t;
 10 | 
 11 | typedef struct extended_pt_readd_narrow {
 12 |   __attribute__((__aligned__(32)))
 13 |   residue_narrow_t x;
 14 |   residue_narrow_t dt;
 15 |   residue_narrow_t y;
 16 |   residue_narrow_t z;
 17 | } extended_pt_readd_narrow_t;
 18 | 
 19 | typedef struct extended_pt_readd_wide {
 20 |   residue_wide_t x;
 21 |   residue_wide_t dt;
 22 |   residue_wide_t y;
 23 |   residue_wide_t z;
 24 | } extended_pt_readd_wide_t;
 25 | 
 26 | typedef struct extended_affine_pt_readd_narrow {
 27 |   __attribute__((__aligned__(32)))
 28 |   residue_narrow_t x;
 29 |   residue_narrow_t dt;
 30 |   residue_narrow_t y;
 31 |   uint32_t pad[4]; // So that it takes an even 5 vector
 32 |   // loads to load the structure.
 33 | } extended_affine_pt_readd_narrow_t;
 34 | 
 35 | // For use in doubling.
 36 | typedef struct projective_pt_wide {
 37 |   residue_wide_t x;
 38 |   residue_wide_t y;
 39 |   residue_wide_t z;
 40 | } projective_pt_wide_t;
 41 | 
 42 | // For use in addition.
 43 | typedef struct extended_pt_wide {
 44 |   residue_wide_t x;
 45 |   residue_wide_t y;
 46 |   residue_wide_t t;
 47 |   residue_wide_t z;
 48 | } extended_pt_wide_t;
 49 | 
 50 | #define D (-49142)
 51 | 
 52 | __attribute__((__aligned__(32)))
 53 | const affine_pt_narrow_t B;
 54 | 
 55 | void copy_projective_pt_wide(
 56 |   projective_pt_wide_t *result, const projective_pt_wide_t *source);
 57 | 
 58 | void copy_extended_pt_wide(
 59 |   extended_pt_wide_t *result, const extended_pt_wide_t *source);
 60 | 
 61 | void copy_extended_pt_readd_wide(
 62 |   extended_pt_readd_wide_t *result, const extended_pt_readd_wide_t *source);
 63 | 
 64 | void copy_extended_pt_readd_narrow(
 65 |   extended_pt_readd_narrow_t *result, const extended_pt_readd_narrow_t *source);
 66 | 
 67 | void copy_extended_affine_pt_readd_narrow(
 68 |   extended_affine_pt_readd_narrow_t *result,
 69 |   const extended_affine_pt_readd_narrow_t *source);
 70 | 
 71 | void negate_extended_pt_readd_wide(
 72 |   extended_pt_readd_wide_t *result,
 73 |   const extended_pt_readd_wide_t *source);
 74 | 
 75 | void negate_extended_affine_pt_readd_narrow(
 76 |   extended_affine_pt_readd_narrow_t *result,
 77 |   const extended_affine_pt_readd_narrow_t *source);
 78 | 
 79 | void affine_narrow_to_extended(
 80 |   extended_pt_wide_t *result,
 81 |   const affine_pt_narrow_t * __restrict x);
 82 | 
 83 | void affine_to_projective(
 84 |   projective_pt_wide_t *result,
 85 |   const affine_pt_narrow_t * __restrict x);
 86 | 
 87 | void affine_to_readd_wide(
 88 |   extended_pt_readd_wide_t *result,
 89 |   const affine_pt_narrow_t * __restrict x);
 90 | 
 91 | void extended_to_readd_wide_neg(
 92 |   extended_pt_readd_wide_t *result,
 93 |   const extended_pt_wide_t * __restrict x);
 94 | 
 95 | void affine_to_readd_narrow(
 96 |   extended_pt_readd_narrow_t *result,
 97 |   const affine_pt_narrow_t * __restrict x);
 98 | 
 99 | void projective_to_extended_wide(
100 |   extended_pt_wide_t *result, projective_pt_wide_t * __restrict x);
101 | 
102 | void extended_to_projective_wide(
103 |   projective_pt_wide_t *result, const extended_pt_wide_t * __restrict x);
104 | 
105 | void readd_to_projective(
106 |   projective_pt_wide_t *result,
107 |   const extended_pt_readd_narrow_t * __restrict x);
108 | 
109 | void affine_readd_to_extended(
110 |   extended_pt_wide_t *result,
111 |   const extended_affine_pt_readd_narrow_t * __restrict x);
112 | 
113 | void negate_extended_affine_pt_readd_narrow(
114 |   extended_affine_pt_readd_narrow_t *result,
115 |   const extended_affine_pt_readd_narrow_t *source);
116 | 
117 | void affine_double(
118 |   projective_pt_wide_t *result,
119 |   const affine_pt_narrow_t * __restrict x);
120 | 
121 | void affine_double_extended(
122 |   extended_pt_wide_t *result, const affine_pt_narrow_t * __restrict x);
123 | 
124 | void projective_double(
125 |   projective_pt_wide_t *result, const projective_pt_wide_t *x);
126 | 
127 | void projective_double_extended(
128 |   extended_pt_wide_t *result, const projective_pt_wide_t * __restrict x);
129 | 
130 | void extended_double_extended(
131 |   extended_pt_wide_t *result, const extended_pt_wide_t *x);
132 | 
133 | void projective_add(
134 |   projective_pt_wide_t *result, const projective_pt_wide_t * __restrict x1,
135 |   const projective_pt_wide_t * __restrict x2);
136 | 
137 | void extended_add(
138 |   projective_pt_wide_t *result, const extended_pt_wide_t * __restrict x,
139 |   const extended_pt_wide_t * __restrict y);
140 | 
141 | void extended_add_extended(
142 |   extended_pt_wide_t *result, const extended_pt_wide_t * __restrict x,
143 |   const extended_pt_wide_t * __restrict y);
144 | 
145 | void extended_readd_narrow(
146 |   projective_pt_wide_t *result, const extended_pt_wide_t * __restrict x,
147 |   const extended_pt_readd_narrow_t * __restrict y);
148 | 
149 | void extended_readd_narrow_extended(
150 |   extended_pt_wide_t *result, const extended_pt_wide_t * __restrict x,
151 |   const extended_pt_readd_narrow_t * __restrict y);
152 | 
153 | void extended_readd_affine_narrow_extended(
154 |   extended_pt_wide_t *result, const extended_pt_wide_t * __restrict x,
155 |   const extended_affine_pt_readd_narrow_t * __restrict y);
156 | 
157 | void extended_add_extended(
158 |   extended_pt_wide_t *result, const extended_pt_wide_t * __restrict x,
159 |   const extended_pt_wide_t * __restrict y);
160 | 
161 | void extended_readd_readd_narrow(
162 |   extended_pt_readd_narrow_t *result,
163 |   const extended_pt_wide_t * __restrict x,
164 |   const extended_pt_readd_narrow_t * __restrict y);
165 | 
166 | void extended_readd_wide_extended(
167 |   extended_pt_wide_t *result,
168 |   const extended_pt_wide_t *x1,
169 |   const extended_pt_readd_wide_t * __restrict x2);
170 | 
171 | void scalar_multiply(
172 |   projective_pt_wide_t *result, const affine_pt_narrow_t * __restrict x,
173 |   const scalar_t * __restrict n);
174 | 
175 | void scalar_multiply_unsafe(
176 |   projective_pt_wide_t *result, const affine_pt_narrow_t * __restrict x,
177 |   const scalar_t * __restrict n);
178 | 
179 | int point_decompress(
180 |   affine_pt_narrow_t *result, residue_narrow_reduced_t *y, int low_bit);
181 | #endif
182 | 


--------------------------------------------------------------------------------
/avx2/include/f11_260.h:
--------------------------------------------------------------------------------
  1 | // Types and functions for manipulating field elements
  2 | 
  3 | #ifndef F11_260_H
  4 | #define F11_260_H
  5 | #include <stdint.h>
  6 | 
  7 | #define NLIMBS_REDUCED 10
  8 | #define NLIMBS 12
  9 | #define T ((1 << 26) - 15)
 10 | #define TBITS 26
 11 | #define TMASK ((1 << 26) - 1)
 12 | #define T_CBITS 4
 13 | #define RESIDUE_LENGTH_BYTES 33
 14 | 
 15 | // Reduced to 10 limbs. For final results.
 16 | typedef struct residue_narrow_reduced {
 17 |   __attribute__((__aligned__(8)))
 18 |   int32_t limbs[10];
 19 | } residue_narrow_reduced_t;
 20 | 
 21 | // 11 limbs. Limb 10 is placed in slot 0, and slot 11.
 22 | typedef struct residue_narrow {
 23 |   __attribute__((__aligned__(16)))
 24 |   int32_t limbs[12];
 25 | } residue_narrow_t;
 26 | 
 27 | // 11 limbs. Limb 10 is placed in slot 0 and slot 11. Wider for vector
 28 | // compatibility.
 29 | typedef struct residue_wide {
 30 |   __attribute__((__aligned__(32)))
 31 |   int64_t limbs[12];
 32 | } residue_wide_t;
 33 | 
 34 | residue_wide_t zero_wide;
 35 | residue_wide_t one_wide;
 36 | residue_narrow_t zero_narrow;
 37 | residue_narrow_t one_narrow;
 38 | 
 39 | // Shrink to 32 bits. Assumes reduction has already occurred, and wide storage
 40 | // is being used for vector compatibility.
 41 | void narrow(residue_narrow_t *result, const residue_wide_t * __restrict w);
 42 | 
 43 | // Reduce to 10 limbs. Useful for debugging
 44 | void narrow_reduce(
 45 |   residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w);
 46 | 
 47 | // Reduce to unique representative.
 48 | // This is expensive. Only used for final signature or DH Key
 49 | void narrow_complete(
 50 |   residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w);
 51 | 
 52 | // Reduce to mostly unique representative.
 53 | // All coefficients are reduced to 0 <= xi <= t
 54 | // Unique up to carries (xi == t) => (xi = 0; x[i+1] += 1);
 55 | // This is sufficient to determine if x is even or odd.
 56 | // Still pretty expensive. Used in point compression.
 57 | void narrow_partial_complete(
 58 |   residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w);
 59 | 
 60 | int is_odd(residue_narrow_reduced_t *x);
 61 | 
 62 | // Produce a 32-bit entry with 11 limbs
 63 | static inline void unnarrow_reduce(
 64 |   residue_narrow_t *result, const residue_narrow_reduced_t * __restrict x) {
 65 | 
 66 |   result->limbs[0] = result->limbs[NLIMBS - 1] = 0;
 67 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
 68 |     result->limbs[i+1] = x->limbs[i];
 69 |   }
 70 | }
 71 | 
 72 | // Produce a 64-bit residue
 73 | void widen(
 74 |   residue_wide_t *result, const residue_narrow_t * __restrict x);
 75 | 
 76 | // Copy a 64-bit residue
 77 | void copy_wide(
 78 |   residue_wide_t *result, const residue_wide_t * __restrict x);
 79 | 
 80 | // Copy a 32-bit residue
 81 | void copy_narrow(
 82 |   residue_narrow_t *result, const residue_narrow_t * __restrict x);
 83 | 
 84 | void copy_narrow_reduced(
 85 |   residue_narrow_reduced_t *result,
 86 |   const residue_narrow_reduced_t * __restrict x);
 87 | 
 88 | // Subtract 2 12x64-bit residues.
 89 | void sub_wide(
 90 |   residue_wide_t *result, const residue_wide_t * __restrict x,
 91 |   const residue_wide_t * __restrict y);
 92 | 
 93 | void negate_wide(residue_wide_t *result, const residue_wide_t *x);
 94 | 
 95 | void negate_narrow(residue_narrow_t *result, const residue_narrow_t *x);
 96 | 
 97 | // Add 2 12x32-bit residues.
 98 | void add_narrow(
 99 |   residue_narrow_t *result, const residue_narrow_t * __restrict x,
100 |   const residue_narrow_t * __restrict y);
101 | 
102 | // Add 2 12x64-bit residues.
103 | void add_wide(
104 |   residue_wide_t *result, const residue_wide_t * __restrict x,
105 |   const residue_wide_t * __restrict y);
106 | 
107 | // Scale a wide residue by 2.
108 | void double_wide(
109 |   residue_wide_t *result, const residue_wide_t * __restrict x);
110 | 
111 | // Multiply two wide residues, and produce a wide result. The result is reduced
112 | // to 32 bits, but not narrowed for performance reasons.
113 | void mul_wide(
114 |   residue_wide_t *result, const residue_wide_t *x,
115 |   const residue_wide_t *y);
116 | // Multiply a wide residues by a narrow and produce a wide result. The result is
117 | // reduced to 32 bits, but not narrowed for performance reasons.
118 | void mul_wide_narrow(
119 |   residue_wide_t *result, const residue_wide_t *x,
120 |   const residue_narrow_t *y);
121 | // Multiply two narrow residues and produce a wide result. The result is reduced
122 | // to 32 bits, but not narrowed for performance reasons.
123 | void mul_narrow(
124 |   residue_wide_t *result, const residue_narrow_t *x,
125 |   const residue_narrow_t *y);
126 | 
127 | // Multiply a wide residue by a constant.
128 | void mul_wide_const(
129 |   residue_wide_t *result, const residue_wide_t * __restrict x, int32_t d);
130 | 
131 | // Multiply a narrow residue by a constant, producing a wide result
132 | void mul_narrow_const(
133 |   residue_wide_t *result, const residue_narrow_t *x, int32_t d);
134 | 
135 | // Square a wide residue and produce a wide result. The result is reduced to 32
136 | // bits but not narrowed for performance reasons.
137 | void square_wide(
138 |   residue_wide_t *result, const residue_wide_t *x);
139 | 
140 | // Square a narrow residue and produce a wide result. The result is reduced to
141 | // 32 bits but not narrowed for performance reasons.
142 | void square_narrow(
143 |   residue_wide_t *result, const residue_narrow_t *x);
144 | 
145 | // Approximately divide each coefficient by t. Carry the results.
146 | void reduce_step_narrow(
147 |   residue_narrow_t *result, const residue_narrow_t *x);
148 | 
149 | // Approximately divide each coefficient by t. Carry the results.
150 | void reduce_step_wide(
151 |   residue_wide_t *result, const residue_wide_t *x);
152 | 
153 | // Invert via fermat's theorem
154 | void invert_wide(
155 |   residue_wide_t *result, const residue_wide_t * __restrict x);
156 | 
157 | // Compute combined inverse and square root
158 | // returns true if x/y was a quadratic residue, and false otherwise.
159 | int sqrt_inv_wide(
160 |   residue_wide_t *result, const residue_wide_t * __restrict x,
161 |   const residue_wide_t * __restrict y);
162 | 
163 | // Returns true if x == y. Computes in constant time.
164 | int equal_wide(const residue_wide_t * x, const residue_wide_t * y);
165 | 
166 | int equal_narrow_reduced(
167 |   const residue_narrow_reduced_t * x, const residue_narrow_reduced_t * y);
168 | 
169 | void encode(uint8_t *out, const residue_narrow_reduced_t * __restrict x);
170 | void encode_compressed(
171 |   uint8_t *out, const residue_narrow_reduced_t * __restrict x, int is_odd);
172 | 
173 | void decode(residue_narrow_reduced_t *out, const uint8_t *in);
174 | #endif
175 | 


--------------------------------------------------------------------------------
/avx2/include/gen.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <string.h>
 3 | #include "comb.h"
 4 | #include "curve.h"
 5 | #include "gen.h"
 6 | #include "scalar.h"
 7 | 
 8 | void gen_key(scalar_t * __restrict priv_key,
 9 |              affine_pt_narrow_t * __restrict pub_key) {
10 |   scalar_hash_t large_key;
11 |   char *large_key_ptr = (char *) &large_key;
12 |   arc4random_buf(large_key_ptr, sizeof(large_key));
13 | 
14 |   // It's just as random to use montgomery reduction as to correct for the
15 |   // montgomery factor.
16 |   mont_reduce_hash_mod_l(priv_key, &large_key);
17 | 
18 |   projective_pt_wide_t result_pt;
19 |   scalar_comb_multiply(&result_pt, &base_comb, priv_key);
20 | 
21 |   residue_wide_t z_inv;
22 | 
23 |   invert_wide(&z_inv, &result_pt.z);
24 |   mul_wide(&result_pt.x, &result_pt.x, &z_inv);
25 |   mul_wide(&result_pt.y, &result_pt.y, &z_inv);
26 | 
27 |   residue_narrow_t temp_narrow;
28 |   narrow(&pub_key->x, &result_pt.x);
29 | 
30 |   narrow(&pub_key->y, &result_pt.y);
31 | 
32 |   explicit_bzero(&large_key, sizeof(large_key));
33 |   explicit_bzero(&result_pt, sizeof(result_pt));
34 |   explicit_bzero(&z_inv, sizeof(z_inv));
35 |   explicit_bzero(&temp_narrow, sizeof(temp_narrow));
36 | }
37 | 
38 | void encode_pub_key(uint8_t *result, const affine_pt_narrow_t *pub_key) {
39 |   residue_narrow_reduced_t y_reduced;
40 |   residue_narrow_reduced_t x_reduced;
41 |   narrow_complete(&y_reduced, &pub_key->y);
42 |   narrow_partial_complete(&x_reduced, &pub_key->x);
43 | 
44 |   y_reduced.limbs[NLIMBS_REDUCED - 1] |= is_odd(&x_reduced) << TBITS;
45 |   encode(result, &y_reduced);
46 | }
47 | 
48 | int decode_pub_key(affine_pt_narrow_t *result, const uint8_t *encoded_key) {
49 |   residue_narrow_reduced_t y_decoded;
50 |   decode(&y_decoded, encoded_key);
51 |   int is_odd = y_decoded.limbs[NLIMBS_REDUCED - 1] >> TBITS;
52 |   y_decoded.limbs[NLIMBS_REDUCED - 1] &= TMASK;
53 |   return point_decompress(result, &y_decoded, is_odd);
54 | }
55 | 


--------------------------------------------------------------------------------
/avx2/include/gen.h:
--------------------------------------------------------------------------------
 1 | #ifndef GEN_H
 2 | #define GEN_H
 3 | 
 4 | #include "scalar.h"
 5 | #include "curve.h"
 6 | 
 7 | void gen_key(scalar_t * __restrict priv_key,
 8 |              affine_pt_narrow_t * __restrict pub_key);
 9 | void encode_pub_key(uint8_t *result, const affine_pt_narrow_t *pub_key);
10 | int decode_pub_key(affine_pt_narrow_t *result, const uint8_t *encoded_key);
11 | #endif
12 | 


--------------------------------------------------------------------------------
/avx2/include/scalar.c:
--------------------------------------------------------------------------------
  1 | #include <string.h>
  2 | #include <stdint.h>
  3 | #include "f11_260.h"
  4 | #include "scalar.h"
  5 | 
  6 | // Plenty of inspiration for this file was taken from Mike Hamburg's
  7 | // Ed448 code.
  8 | 
  9 | // Constants:
 10 | __attribute__((__aligned__(32)))
 11 | const scalar_t l_bits = {
 12 |   .limbs = {0x28ad9c41, 0xe6dcf7e8, 0x34b804af, 0x5af91169,
 13 |             0x5cf68f2f, 0x125277f4, 0x9c1bf9f, 0xffff6b00, 0x3,},
 14 | };
 15 | 
 16 | __attribute__((__aligned__(32)))
 17 | const scalar_t signed_bits_set_adjustment = {
 18 |   .limbs = {0x5d498efb, 0x648c205f, 0x2d1fed40, 0x941bba5b,
 19 |             0x8c25c342, 0xb6b6202e, 0xd8f90183, 0x000253ff, 0x0,},
 20 | };
 21 | 
 22 | __attribute__((__aligned__(32)))
 23 | const scalar_t SCALAR_MONT_R2 = {
 24 |   .limbs = {0x30ba45c7, 0xf3422093, 0x054bbbf6, 0x017ab264,
 25 |             0x914ee18b, 0x250f1097, 0xf6bc1224, 0x5e97c70e, 0x2,},
 26 | };
 27 | 
 28 | const uint32_t SCALAR_MONT_N_PRIME = 0xb3138c3f;
 29 | 
 30 | __attribute__((__aligned__(32)))
 31 | const scalar_t SCALAR_MONT_R2_HASH = {
 32 |   .limbs = {
 33 |             0x202dd8e7, 0xcb1bf7be, 0xd219daf6, 0xb85aba0a,
 34 |             0xdc8da05f, 0xbd23bfce, 0xb7642c95, 0xbb13e4ad, 0x0,},
 35 | };
 36 | 
 37 | __attribute__((__aligned__(32)))
 38 | const scalar_t SCALAR_MONT_R2_HASH_MUL = {
 39 |   .limbs = {0x8b9c7a13, 0x37bb3081, 0xe4f0c2b0, 0x99b4a8b2,
 40 |             0xb4538c55, 0x34c9db2a, 0x2ade0e63, 0xa7cb6782, 0x1,},
 41 | };
 42 | 
 43 | void divide_by_2_mod_l(
 44 |   scalar_t *result, const scalar_t *x) {
 45 | 
 46 |   uint32_t mask = -(x->limbs[0] & 1);
 47 | 
 48 |   uint64_t chain = 0;
 49 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
 50 |     chain = (chain + x->limbs[i]) + (mask & l_bits.limbs[i]);
 51 |     result->limbs[i] = chain;
 52 |     chain >>= SCALAR_LIMB_BITS;
 53 |   }
 54 | 
 55 |   int i;
 56 |   for (i = 0; i < SCALAR_LIMBS - 1; ++i) {
 57 |     result->limbs[i] = result->limbs[i] >> 1 |
 58 |       (result->limbs[i+1] << (SCALAR_LIMB_BITS - 1));
 59 |   }
 60 |   result->limbs[i] >>= 1;
 61 | }
 62 | 
 63 | void add_mod_l(
 64 |   scalar_t *result, const scalar_t *x,
 65 |   const scalar_t * __restrict y) {
 66 | 
 67 |   uint64_t chain = 0;
 68 |   int i;
 69 |   for (i = 0; i < SCALAR_LIMBS; ++i) {
 70 |     chain = (chain + x->limbs[i]) + y->limbs[i];
 71 |     result->limbs[i] = chain;
 72 |     chain >>= SCALAR_LIMB_BITS;
 73 |   }
 74 | 
 75 |   sub_mod_l(result, result, &l_bits);
 76 | }
 77 | 
 78 | void sub_mod_l(
 79 |   scalar_t *result, const scalar_t *x,
 80 |   const scalar_t *y) {
 81 |   sub_mod_l_accum(result, x->limbs, y);
 82 | }
 83 | 
 84 | // x is a pointer and not a scalar_t so that this function can be used to reduce
 85 | // accumulators after multiplication.
 86 | void sub_mod_l_accum(
 87 |   scalar_t *result, const uint32_t *x,
 88 |   const scalar_t *y) {
 89 | 
 90 |   int64_t chain = 0;
 91 |   int i;
 92 |   for (i = 0; i < SCALAR_LIMBS; ++i) {
 93 |     chain = (chain + x[i]) - y->limbs[i];
 94 |     result->limbs[i] = chain;
 95 |     chain >>= SCALAR_LIMB_BITS;
 96 |   }
 97 | 
 98 |   //Should be 0 or -1 (to function as a mask)
 99 |   int32_t borrow = chain;
100 | 
101 |   chain = 0;
102 |   for (i = 0; i < SCALAR_LIMBS; ++i) {
103 |     chain = (chain + result->limbs[i]) + (l_bits.limbs[i] & borrow);
104 |     result->limbs[i] = chain;
105 |     chain >>= SCALAR_LIMB_BITS;
106 |   }
107 | }
108 | 
109 | void convert_to_sabs(
110 |   scalar_t *result, const scalar_t *x) {
111 |   add_mod_l(result, x, &signed_bits_set_adjustment);
112 |   divide_by_2_mod_l(result, result);
113 | }
114 | 
115 | void mont_reduce_hash_mod_l(
116 |   scalar_t *result, const scalar_hash_t * __restrict x) {
117 |   uint32_t accum[HASH_LIMBS];
118 | 
119 |   for (int i = 0; i < HASH_LIMBS; ++i) {
120 |     accum[i] = x->limbs[i];
121 |   }
122 | 
123 |   uint64_t chain = 0;
124 |   for (int i = 0; i <= HASH_LIMBS - SCALAR_LIMBS; ++i) {
125 |     uint32_t q = accum[0] * SCALAR_MONT_N_PRIME;
126 |     for (int j = 0; j < SCALAR_LIMBS; ++j) {
127 |       chain += accum[j] + ((uint64_t) q) * l_bits.limbs[j];
128 |       if (j > 0) {
129 |         accum[j - 1] = chain;
130 |       }
131 |       chain >>= SCALAR_LIMB_BITS;
132 |     }
133 |     int j;
134 |     for (j = SCALAR_LIMBS; j < HASH_LIMBS - i; ++j) {
135 |       chain += accum[j];
136 |       accum[j - 1] = chain;
137 |       chain >>= SCALAR_LIMB_BITS;
138 |     }
139 |     accum[j - 1] = chain;
140 |   }
141 | 
142 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
143 |     result->limbs[i] = accum[i];
144 |   }
145 |   explicit_bzero(accum, sizeof(accum));
146 | }
147 | 
148 | void reduce_hash_mod_l(scalar_t *result, const scalar_hash_t * __restrict x) {
149 |   mont_reduce_hash_mod_l(result, x);
150 |   mont_mult_mod_l(result, result, &SCALAR_MONT_R2_HASH);
151 | }
152 | 
153 | void mont_mult_mod_l(scalar_t *result, const scalar_t *x,
154 |                      const scalar_t *y) {
155 |   uint32_t accum[SCALAR_LIMBS + 1] = {0};
156 | 
157 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
158 |     uint32_t x_limb = x->limbs[i];
159 | 
160 |     uint64_t chain = 0;
161 |     int j;
162 |     for (j = 0; j < SCALAR_LIMBS; ++j) {
163 |       chain += accum[j] + ((uint64_t) y->limbs[j]) * x_limb;
164 |       accum[j] = chain;
165 |       chain >>= SCALAR_LIMB_BITS;
166 |     }
167 | 
168 |     // 2 bit value
169 |     accum[j] = chain;
170 | 
171 |     uint32_t q = accum[0] * SCALAR_MONT_N_PRIME;
172 |     chain = 0;
173 |     for (int j = 0; j < SCALAR_LIMBS; ++j) {
174 |       chain += accum[j] + ((uint64_t) l_bits.limbs[j]) * q;
175 |       if (j > 0) {
176 |         accum[j - 1] = chain;
177 |       }
178 |       chain >>= SCALAR_LIMB_BITS;
179 |     }
180 | 
181 |     // chain is a 2-bit value with a possible carry.
182 |     // result is a 3 bit value
183 |     chain += accum[j];
184 |     accum[j - 1] = chain;
185 |   }
186 | 
187 |   sub_mod_l_accum(result, accum, &l_bits);
188 |   explicit_bzero(accum, sizeof(accum));
189 | }
190 | 
191 | void mult_mod_l(scalar_t *result, const scalar_t * __restrict x,
192 |                 const scalar_t * __restrict y) {
193 |   scalar_t temp;
194 |   mont_mult_mod_l(&temp, x, y);
195 |   mont_mult_mod_l(result, &temp, &SCALAR_MONT_R2);
196 |   explicit_bzero(&temp, sizeof(temp));
197 | }
198 | 


--------------------------------------------------------------------------------
/avx2/include/scalar.h:
--------------------------------------------------------------------------------
 1 | #ifndef SCALAR_H
 2 | #define SCALAR_H
 3 | #include <stdint.h>
 4 | #include "f11_260.h"
 5 | 
 6 | typedef struct scalar {
 7 |   uint32_t limbs[9];
 8 | } scalar_t;
 9 | 
10 | typedef struct scalar_hash {
11 |   uint32_t limbs[16];
12 | } scalar_hash_t;
13 | 
14 | // const int SCALAR_LIMBS = 9;
15 | #define HASH_LIMBS 16
16 | #define SCALAR_LIMBS 9
17 | #define SCALAR_BITS 258
18 | #define SCALAR_BYTES 33
19 | #define SCALAR_LIMB_BITS 32
20 | #define SCALAR_LAST_LIMB_BITS 2
21 | #define SCALAR_LAST_LIMB_MASK 0x3
22 | 
23 | // Constants
24 | // A scalar representing l, the order of the prime subgroup.
25 | const scalar_t l_bits;
26 | // For converting to SABS representation
27 | const scalar_t signed_bits_set_adjustment;
28 | // l * N' is congruent to -1 mod 2^32
29 | const uint32_t SCALAR_MONT_N_PRIME;
30 | // (2 ^ 32)^18 mod l. Used to convert to montgomery domain.
31 | // Or to fix the result of a single multiply via a 2nd multiply.
32 | const scalar_t SCALAR_MONT_R2;
33 | // (2 ^ 32)^17 mod l.
34 | // Used to fix the result of a hash reduction via a multiply
35 | // A hash is reduced from HASH_LIMBS to SCALAR_LIMBS via
36 | // HASH_LIMBS - SCALAR_LIMBS + 1 divisions by 2^32. So a hash reduction produces
37 | // h * (2^32)^-8 mod l. Montgomery multiplying by (2^32)^17 mod l produces h mod
38 | // l
39 | const scalar_t SCALAR_MONT_R2_HASH;
40 | // (2 ^ 32)^26 mod l.
41 | // Used to fix the result of a hash reduction followed by a multiply.
42 | // By similar logic we need to get rid of a factor of (2^32)^-17
43 | const scalar_t SCALAR_MONT_R2_HASH_MUL;
44 | 
45 | // Functions for manipulating scalars. May need more for ECDSA.
46 | 
47 | // This is used to convert to SABS representation.
48 | void divide_by_2_mod_l(scalar_t *result, const scalar_t * __restrict x);
49 | 
50 | void add_mod_l(scalar_t *result, const scalar_t * __restrict x,
51 |                const scalar_t * __restrict y);
52 | 
53 | void sub_mod_l(scalar_t *result, const scalar_t * __restrict x,
54 |                const scalar_t * __restrict y);
55 | 
56 | void sub_mod_l_accum(scalar_t *result, const uint32_t * __restrict x,
57 |                      const scalar_t * __restrict y);
58 | 
59 | void mont_mult_mod_l(scalar_t *result, const scalar_t * __restrict x,
60 |                      const scalar_t * __restrict y);
61 | 
62 | void mult_mod_l(scalar_t *result, const scalar_t * __restrict x,
63 |                 const scalar_t * __restrict y);
64 | 
65 | void mont_reduce_hash_mod_l(
66 |   scalar_t *result, const scalar_hash_t * __restrict x);
67 | void reduce_hash_mod_l(scalar_t *result, const scalar_hash_t * __restrict x);
68 | 
69 | void convert_to_sabs(scalar_t *result, const scalar_t * __restrict x);
70 | #endif
71 | 


--------------------------------------------------------------------------------
/avx2/include/sign.h:
--------------------------------------------------------------------------------
 1 | #ifndef SIGN_H
 2 | #define SIGN_H
 3 | #include "curve.h"
 4 | #include "scalar.h"
 5 | 
 6 | #define SIG_LENGTH 65
 7 | 
 8 | typedef struct signature {
 9 |   residue_narrow_reduced_t y;
10 |   scalar_t s;
11 | } signature_t;
12 | 
13 | void sign(signature_t *result, scalar_t *priv_key,
14 |   const uint8_t *pub_key, const uint8_t *msg, size_t msg_len);
15 | 
16 | int verify(
17 |   const signature_t *sig, const uint8_t *r_bytes, const uint8_t *pub_key_bytes,
18 |   const affine_pt_narrow_t *pub_key_pt, const uint8_t *msg,
19 |   size_t msg_len);
20 | 
21 | void encode_sig(uint8_t *result, const signature_t *sig);
22 | void decode_sig(signature_t *result, const uint8_t *encoded_sig);
23 | #endif
24 | 


--------------------------------------------------------------------------------
/avx2/src/api.c.supercop_only:
--------------------------------------------------------------------------------
1 | ../../ref/src/api.c


--------------------------------------------------------------------------------
/avx2/src/main.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <stdio.h>
  3 | #include <stdint.h>
  4 | #include <stdlib.h>
  5 | #include <string.h>
  6 | #include "comb.h"
  7 | #include "curve.h"
  8 | #include "f11_260.h"
  9 | #include "gen.h"
 10 | #include "scalar.h"
 11 | #include "sign.h"
 12 | 
 13 | int main(int _argc, char **argv) {
 14 |   residue_narrow_t x = {
 15 |     .limbs = {
 16 |       0x14e8b6e, 0x3553e74, 0x0464e4c, 0x61de408,
 17 |       0x006a30e, 0x6e9b25b, 0x3e6f39e, 0x19ec754,
 18 |       0x5c71cc3, 0x2bc1c0e, 0x554338e, 0x14e8b6e,
 19 |     },
 20 |   };
 21 | 
 22 |   residue_wide_t two = {
 23 |     .limbs = {
 24 |       0x0, 0x2, 0x0, 0x0,
 25 |       0x0, 0x0, 0x0, 0x0,
 26 |       0x0, 0x0, 0x0, 0x0,
 27 |     },
 28 |   };
 29 | 
 30 |   residue_wide_t x_plus_two;
 31 | 
 32 |   residue_narrow_reduced_t x_narrow_reduced = {
 33 |     .limbs = {
 34 |       0x206b305, 0x2f7c2ce, 0x0cf58a7, 0x2b81791, 0x19b26fa,
 35 |       0x2986830, 0x0503be5, 0x0789163, 0x16d90a0, 0x005a82e,
 36 |     },
 37 |   };
 38 | 
 39 |   residue_wide_t x_wide;
 40 | 
 41 |   residue_narrow_t y = {
 42 |     .limbs = {
 43 |       0x56ed38e, 0x5f5b0e1, 0x4668277, 0x0f7d85a,
 44 |       0x4515e42, 0x00cb559, 0x3f8a910, 0x6655708,
 45 |       0x3085b4d, 0x581ceff, 0x3324c03, 0x56ed38e,
 46 |     },
 47 |   };
 48 | 
 49 |   residue_narrow_reduced_t y_narrow_reduced = {
 50 |     .limbs = {
 51 |       0x086dd54, 0x2f7aedb, 0x38904ae, 0x2e28aa4, 0x29de1ad,
 52 |       0x289d572, 0x0f6837a, 0x19987b1, 0x012fb71, 0x1c37867,
 53 |     },
 54 |   };
 55 | 
 56 |   residue_wide_t y_wide;
 57 | 
 58 |   residue_wide_t mul_expected = {
 59 |     .limbs = {
 60 |       0x06e9e1d, 0x1c508c4, 0x3eeb85d, 0x04bc914,
 61 |       0x0a57e1c, 0x1f13f9a, 0x2d8aa7d, 0x232cce3,
 62 |       0x31e92c4, 0x04fb073, 0x2582507, 0x06e9e1d,
 63 |     },
 64 |   };
 65 | 
 66 |   residue_wide_t square_expected = {
 67 |     .limbs = {
 68 |       0x3088d3c, 0x2073353, 0x18e5de4, 0x320a4ab,
 69 |       0x3ee123a, 0x2d88419, 0x3d1ae13, 0x02b3dcf,
 70 |       0x2997027, 0x3d550a2, 0x220a052, 0x3088d3c,
 71 |     },
 72 |   };
 73 | 
 74 |   residue_narrow_t negative_one_redundant = {
 75 |     .limbs = {
 76 |       0x000000e, 0x3ffffff, 0x3ffffff, 0x3ffffff,
 77 |       0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff,
 78 |       0x3ffffff, 0x3ffffff, 0x3ffffff, 0x000000e,
 79 |     },
 80 |   };
 81 | 
 82 |   residue_narrow_t negative_t2_plus_one = {
 83 |     .limbs = {
 84 |       0x000000e,
 85 |       0x3ffffff, 0x000000e, 0x3ffffff,
 86 |       0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff,
 87 |       0x3ffffff, 0x3ffffff, 0x3ffffff, 0x000000e,
 88 |     },
 89 |   };
 90 | 
 91 |   residue_narrow_reduced_t negative_t2_plus_one_partial = {
 92 |     .limbs = {
 93 |       0x3fffff1, 0x0000000, 0x3fffff1, 0x3fffff1,
 94 |       0x3fffff1, 0x3fffff1, 0x3fffff1, 0x3fffff1,
 95 |       0x3fffff1, 0x3fffff1,
 96 |     },
 97 |   };
 98 | 
 99 |   residue_narrow_reduced_t negative_t2_plus_one_complete = {
100 |     .limbs = {
101 |       0x0000000, 0x0000001, 0x3fffff1, 0x3fffff1,
102 |       0x3fffff1, 0x3fffff1, 0x3fffff1, 0x3fffff1,
103 |       0x3fffff1, 0x3fffff1,
104 |     },
105 |   };
106 | 
107 |   residue_wide_t sqrt_x_plus_2_over_y = {
108 |     .limbs = {
109 |       0x040bbb0, 0x3fa8549, 0x0706e5c, 0x3b33dc9,
110 |       0x3401712, 0x3a58fb3, 0x076ec4f, 0x3347ad0,
111 |       0x16ca1b0, 0x26ed559, 0x06033f0, 0x040bbb0,
112 |     },
113 |   };
114 | 
115 |   residue_wide_t x_inverse = {
116 |     .limbs = {
117 |       0x09fd09b, 0x17a9f53, 0x22e2983, 0x0f09456,
118 |       0x11fb41e, 0x1e47b3f, 0x37dd25f, 0x3bc6938,
119 |       0x2b654cd, 0x233a0b2, 0x3f8c25b, 0x09fd09b,
120 |     },
121 |   };
122 | 
123 |   #if 1
124 |   residue_wide_t result;
125 |   residue_narrow_t result_narrow;
126 |   residue_narrow_reduced_t result_narrow_reduced;
127 | 
128 |   mul_narrow(&result, &x, &y);
129 |   for (int i = 0; i < NLIMBS; ++i) {
130 |     assert(mul_expected.limbs[i] == result.limbs[i]);
131 |   }
132 | 
133 |   widen(&x_wide, &x);
134 |   for (int i = 0; i < NLIMBS; ++i) {
135 |     assert(x.limbs[i] == x_wide.limbs[i]);
136 |   }
137 | 
138 |   mul_wide_narrow(&result, &x_wide, &y);
139 |   for (int i = 0; i < NLIMBS; ++i) {
140 |     assert(mul_expected.limbs[i] == result.limbs[i]);
141 |   }
142 | 
143 |   widen(&y_wide, &y);
144 |   mul_wide(&result, &x_wide, &y_wide);
145 |   for (int i = 0; i < NLIMBS; ++i) {
146 |     assert(mul_expected.limbs[i] == result.limbs[i]);
147 |   }
148 | 
149 |   square_narrow(&result, &x);
150 |   for (int i = 0; i < NLIMBS; ++i) {
151 |     assert(square_expected.limbs[i] == result.limbs[i]);
152 |   }
153 | 
154 |   square_wide(&result, &x_wide);
155 |   for (int i = 0; i < NLIMBS; ++i) {
156 |     assert(square_expected.limbs[i] == result.limbs[i]);
157 |   }
158 | 
159 |   // The reduction function doesn't reduce this redundant version of negative
160 |   // one any more.
161 |   reduce_step_narrow(&result_narrow, &negative_one_redundant);
162 |   for (int i = 0; i < NLIMBS; ++i) {
163 |     assert(negative_one_redundant.limbs[i] == result_narrow.limbs[i]);
164 |   }
165 | 
166 |   reduce_step_narrow(&result_narrow, &negative_t2_plus_one);
167 |   for (int i = 0; i < NLIMBS; ++i) {
168 |     assert(negative_t2_plus_one.limbs[i] == result_narrow.limbs[i]);
169 |   }
170 | 
171 |   narrow_partial_complete(&result_narrow_reduced, &negative_t2_plus_one);
172 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
173 |     assert(negative_t2_plus_one_partial.limbs[i] ==
174 |         result_narrow_reduced.limbs[i]);
175 |   }
176 | 
177 |   narrow_complete(&result_narrow_reduced, &negative_t2_plus_one);
178 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
179 |     assert(negative_t2_plus_one_complete.limbs[i] ==
180 |         result_narrow_reduced.limbs[i]);
181 |   }
182 | 
183 |   scalar_t scalar_result;
184 |   scalar_t scalar_x = {
185 |     .limbs = {
186 |       0xa46168f9, 0x4cbf07a5, 0x62cf2928, 0xfd04242b, 0x3b12d23f,
187 |       0x355e9e63, 0xc22e849e, 0x6331c34a, 0x1,
188 |     },
189 |   };
190 |   scalar_t scalar_y = {
191 |     .limbs = {
192 |       0x148b9452, 0xaca9b6bb, 0xe0eeb33d, 0x7e64c899, 0xd61c602a,
193 |       0x96dcbb6b, 0x6a037c88, 0x39fbbaf0, 0x0,
194 |     },
195 |   };
196 |   scalar_t scalar_x_plus_y = {
197 |     .limbs = {
198 |       0xb8ecfd4b, 0xf968be60, 0x43bddc65, 0x7b68ecc5, 0x112f326a,
199 |       0xcc3b59cf, 0x2c320126, 0x9d2d7e3b, 0x1,
200 |     },
201 |   };
202 |   scalar_t scalar_x_plus_x = {
203 |     .limbs = {
204 |       0x48c2d1f2, 0x997e0f4b, 0xc59e5250, 0xfa084856, 0x7625a47f,
205 |       0x6abd3cc6, 0x845d093c, 0xc6638695, 0x2,
206 |     },
207 |   };
208 |   scalar_t scalar_x_plus_x_plus_x_plus_y = {
209 |     .limbs = {
210 |       0xd90232fc, 0xac09d5c3, 0xd4a42a06, 0x1a7823b2, 0x2a5e47bb,
211 |       0x24a61ea1, 0xa6cd4ac4, 0x639199d0, 0x0,
212 |     },
213 |   };
214 |   scalar_t scalar_x_minus_y = {
215 |     .limbs = {
216 |       0x8fd5d4a7, 0xa01550ea, 0x81e075ea, 0x7e9f5b91, 0x64f67215,
217 |       0x9e81e2f7, 0x582b0815, 0x2936085a, 0x1,
218 |     },
219 |   };
220 |   scalar_t scalar_y_minus_x = {
221 |     .limbs = {
222 |       0x98d7c79a, 0x46c7a6fd, 0xb2d78ec5, 0xdc59b5d7, 0xf8001d19,
223 |       0x73d094fc, 0xb196b789, 0xd6c962a5, 0x2,
224 |     },
225 |   };
226 |   scalar_t scalar_x_times_y = {
227 |     .limbs = {
228 |       0x30b3d35a, 0x9ca90acf, 0x6926efdd, 0x80620b0a, 0x52e190e7,
229 |       0x8011b9b8, 0x8c7d8f43, 0x90491703, 0x3,
230 |     },
231 |   };
232 |   scalar_t scalar_x_sabs = {
233 |     .limbs = {
234 |       0x80d57bfa, 0x58a59402, 0x47f78b34, 0x488fef43, 0xe39c4ac1,
235 |       0xf60a5f48, 0x4d93c310, 0xb19a0ba5, 0x0,
236 |     },
237 |   };
238 |   scalar_t scalar_y_sabs = {
239 |     .limbs = {
240 |       0x4d415fc7, 0xfc096781, 0x21635296, 0x36bcca2f, 0x5f9c594e,
241 |       0xaff2a9c7, 0x265f1ed5, 0x1cfebcf8, 0x2,
242 |     },
243 |   };
244 |   scalar_hash_t scalar_hash_val = {
245 |     .limbs = {
246 |       0xcbbc3de7, 0xa212405d, 0x5c85f47c, 0x79aa991c,
247 |       0xfe310944, 0x54075530, 0xd5ef6878, 0x72e57186,
248 |       0x36dcac18, 0xb72461e2, 0x5405caca, 0x4e9e0bff,
249 |       0x8d67a990, 0xf62f262c, 0x6df205dd, 0x24d78573,
250 |     },
251 |   };
252 |   scalar_t reduced_hash_val = {
253 |     .limbs = {
254 |       0xef1d4f9d, 0xd832a3a5, 0xdf1682be, 0x8d257e79, 0x41b1f2ca,
255 |       0x5be9564c, 0x320d4cb6, 0x108f8d04, 0x3,
256 |     },
257 |   };
258 | 
259 |   uint8_t buffer[33];
260 |   uint8_t encode_x[33] = {
261 |     0x05, 0xb3, 0x06, 0x3a, 0x0b, 0xdf,
262 |     0x7b, 0x8a, 0xf5, 0x4c, 0xe4, 0x05, 0xae,
263 |     0xfa, 0x26, 0x9b, 0xc1, 0xa0, 0x61,
264 |     0x5a, 0xbe, 0x03, 0xc5, 0x58, 0x24, 0x1e,
265 |     0xa0, 0x90, 0x6d, 0xb9, 0xa0, 0x16, 0x00,
266 |   };
267 |   uint8_t encode_y[33] = {
268 |     0x54, 0xdd, 0x86, 0x6c, 0xbb, 0xde,
269 |     0xeb, 0x4a, 0x90, 0x38, 0xa9, 0xa2, 0xb8,
270 |     0xad, 0xe1, 0x9d, 0xca, 0x55, 0x27,
271 |     0xaa, 0x37, 0x68, 0x4f, 0xec, 0x61, 0x66,
272 |     0x71, 0xfb, 0x12, 0x9c, 0xe1, 0x0d, 0x07,
273 |   };
274 | 
275 |   add_mod_l(&scalar_result, &scalar_x, &scalar_y);
276 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
277 |     assert(scalar_x_plus_y.limbs[i] == scalar_result.limbs[i]);
278 |   }
279 |   add_mod_l(&scalar_result, &scalar_x, &scalar_x);
280 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
281 |     assert(scalar_x_plus_x.limbs[i] == scalar_result.limbs[i]);
282 |   }
283 |   add_mod_l(&scalar_result, &scalar_x_plus_x, &scalar_x_plus_y);
284 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
285 |     assert(scalar_x_plus_x_plus_x_plus_y.limbs[i] == scalar_result.limbs[i]);
286 |   }
287 |   sub_mod_l(&scalar_result, &scalar_x, &scalar_y);
288 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
289 |     assert(scalar_x_minus_y.limbs[i] == scalar_result.limbs[i]);
290 |   }
291 | 
292 |   sub_mod_l(&scalar_result, &scalar_y, &scalar_x);
293 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
294 |     assert(scalar_y_minus_x.limbs[i] == scalar_result.limbs[i]);
295 |   }
296 | 
297 |   mult_mod_l(&scalar_result, &scalar_x, &scalar_y);
298 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
299 |     assert(scalar_x_times_y.limbs[i] == scalar_result.limbs[i]);
300 |   }
301 | 
302 |   convert_to_sabs(&scalar_result, &scalar_x);
303 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
304 |     assert(scalar_x_sabs.limbs[i] == scalar_result.limbs[i]);
305 |   }
306 | 
307 |   convert_to_sabs(&scalar_result, &scalar_y);
308 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
309 |     assert(scalar_y_sabs.limbs[i] == scalar_result.limbs[i]);
310 |   }
311 | 
312 |   encode(buffer, &x_narrow_reduced);
313 |   for (int i = 0; i < 33; ++i) {
314 |     assert(encode_x[i] == buffer[i]);
315 |   }
316 | 
317 |   encode(buffer, &y_narrow_reduced);
318 |   for (int i = 0; i < 33; ++i) {
319 |     assert(encode_y[i] == buffer[i]);
320 |   }
321 | 
322 |   decode(&result_narrow_reduced, encode_x);
323 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
324 |     assert(x_narrow_reduced.limbs[i] == result_narrow_reduced.limbs[i]);
325 |   }
326 | 
327 |   decode(&result_narrow_reduced, encode_y);
328 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
329 |     assert(y_narrow_reduced.limbs[i] == result_narrow_reduced.limbs[i]);
330 |   }
331 | 
332 |   //x/y is not a quadratic residue, but (x+2)/y is.
333 |   assert(!sqrt_inv_wide(&result, &x_wide, &y_wide));
334 |   add_wide(&x_plus_two, &x_wide, &two);
335 |   assert(sqrt_inv_wide(&result, &x_plus_two, &y_wide));
336 |   for (int i = 0; i < NLIMBS; ++i) {
337 |     assert(sqrt_x_plus_2_over_y.limbs[i] == result.limbs[i]);
338 |   }
339 | 
340 |   invert_wide(&result, &x_wide);
341 |   assert(equal_wide(&result, &x_inverse));
342 | 
343 | 
344 |   reduce_hash_mod_l(&scalar_result, &scalar_hash_val);
345 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
346 |     assert(reduced_hash_val.limbs[i] == scalar_result.limbs[i]);
347 |   }
348 | 
349 |   scalar_t mult_scalar = {
350 |     .limbs = {
351 |       0x55f0b9a3, 0x82b106c5, 0xcb2e2b7d, 0x30735cbc,
352 |       0xa512a8ba, 0x4c5cd391, 0xe9d0c788, 0x92bb2562, 0x3,
353 |     },
354 |   };
355 |   projective_pt_wide_t expected_scalar_mult = {
356 |     .x = {
357 |       .limbs = {
358 |         0x0350abe, 0x1267d8d, 0x39a3cd3, 0x09e1275, 0x2d21378, 0x24771d9,
359 |         0x3558a1d, 0x3bdca9b, 0x0dd862d, 0x0bb230a, 0x1668292, 0x0350abe,
360 |       },
361 |     },
362 |     .y = {
363 |       .limbs = {
364 |         0x0b090d6, 0x04d69fd, 0x03e739d, 0x36ce258, 0x0b6464b, 0x19dab22,
365 |         0x249c1a8, 0x1d28c7d, 0x1591dbc, 0x085ebab, 0x0e8274f, 0x0b090d6,
366 |       },
367 |     },
368 |     .z = {
369 |       .limbs = {0, 0x1},
370 |     },
371 |   };
372 |   projective_pt_wide_t result_pt;
373 | 
374 |   for (int i = 0; i<1; ++i) {
375 |     scalar_multiply(&result_pt, &B, &mult_scalar);
376 |   }
377 |   {
378 |     residue_wide_t tmp;
379 |     mul_wide(&tmp, &expected_scalar_mult.x, &result_pt.z);
380 |     assert(equal_wide(&tmp, &result_pt.x));
381 |     mul_wide(&tmp, &expected_scalar_mult.y, &result_pt.z);
382 |     assert(equal_wide(&tmp, &result_pt.y));
383 |   }
384 | 
385 |   affine_pt_narrow_t expected_everything0 = {
386 |     .x = {
387 |       .limbs = {
388 |         0, 0x20eef1a, 0x3c30e66, 0x0d710f0, 0x248a6fa, 0x30c967f,
389 |         0x3ce302c, 0x0ccd1f2, 0x197e993, 0x2ebaef3, 0x0f2f019, 0,
390 |       },
391 |     },
392 |     .y = {
393 |       .limbs = {
394 |         0, 0x3017cc0, 0x02a5110, 0x06d37e5, 0x283a64a, 0x01484b5,
395 |         0x196f37b, 0x13de2d2, 0x0da32d1, 0x392e0fc, 0x221d742, 0,
396 |       },
397 |     },
398 |   };
399 | 
400 |   affine_pt_narrow_t expected_everything1 = {
401 |     .x = {
402 |       .limbs = {
403 |         0, 0x0e35d45, 0x038f90c, 0x0283483, 0x01ee50a, 0x1e364f9,
404 |         0x362414c, 0x156b1ed, 0x006fff6, 0x271f9ed, 0x0ffa45d, 0,
405 |       },
406 |     },
407 |     .y = {
408 |       .limbs = {
409 |         0, 0x156ae67, 0x27941ab, 0x19a3000, 0x3572ab5, 0x2b90ce3,
410 |         0x136156c, 0x0727496, 0x0edae82, 0x0fa5dfd, 0x16f293c, 0,
411 |       },
412 |     },
413 |   };
414 | 
415 |   affine_pt_narrow_t expected_everything2 = {
416 |     .x = {
417 |       .limbs = {
418 |         0, 0x37fcb1b, 0x16004b9, 0x1d18743, 0x0bce648, 0x0d78db6,
419 |         0x35b1d65, 0x23bb620, 0x2fbc323, 0x1a9a586, 0x3b22577, 0,
420 |       },
421 |     },
422 |     .y = {
423 |       .limbs = {
424 |         0, 0x082fb15, 0x03487d6, 0x3d1c2c9, 0x2c9e7ad, 0x187be10,
425 |         0x2e9b6ba, 0x15b8f89, 0x243ae4c, 0x328bb11, 0x00b12a9, 0,
426 |       },
427 |     },
428 |   };
429 | 
430 | 
431 |   affine_pt_narrow_t expected_everything3 = {
432 |     .x = {
433 |       .limbs = {
434 |         0, 0x3e79b25, 0x2ca71b7, 0x2b2ea3c, 0x0de7ac4, 0x3026d10,
435 |         0x2bce79e, 0x1153866, 0x03e5a80, 0x22b9a37, 0x03e9c59, 0,
436 |       },
437 |     },
438 |     .y = {
439 |       .limbs = {
440 |         0, 0x20100d6, 0x2330974, 0x3402585, 0x172cfd6, 0x275a21c,
441 |         0x213e87c, 0x29989f2, 0x155e437, 0x096a378, 0x3a674eb, 0,
442 |       },
443 |     },
444 |   };
445 | 
446 |   affine_pt_narrow_t expected_gray_code_end0 = {
447 |     .x = {
448 |       .limbs = {
449 |         0, 0x14dd884, 0x12c9e33, 0x2d42122, 0x26f0b14, 0x1b9ea17,
450 |         0x3779e94, 0x2562a88, 0x0be34f0, 0x192ead9, 0x089ec45, 0,
451 |       },
452 |     },
453 |     .y = {
454 |       .limbs = {
455 |         0, 0x1de5221, 0x172f820, 0x28c1b33, 0x08003c6, 0x0e65926,
456 |         0x188cd49, 0x3bb39fd, 0x1b9d8d7, 0x03d5020, 0x045742b, 0,
457 |       },
458 |     },
459 |   };
460 | 
461 |   affine_pt_narrow_t expected_gray_code_end1 = {
462 |     .x = {
463 |       .limbs = {
464 |         0, 0x1d1cf29, 0x2e289d7, 0x1a83709, 0x2252d11, 0x3d6411c,
465 |         0x3fd73ad, 0x2737d9c, 0x2ca9eba, 0x058f290, 0x3879a7c, 0,
466 |       },
467 |     },
468 |     .y = {
469 |       .limbs = {
470 |         0, 0x357399d, 0x0276752, 0x0d5199f, 0x1bbd3a0, 0x39044f1,
471 |         0x0c5e83a, 0x1a99cdd, 0x0dcb61f, 0x35b7272, 0x1184cff, 0,
472 |       },
473 |     },
474 |   };
475 | 
476 |   affine_pt_narrow_t expected_gray_code_end2 = {
477 |     .x = {
478 |       .limbs = {
479 |         0, 0x1ea3c19, 0x081dc9e, 0x1a0b337, 0x1d7f3f4, 0x295a0aa,
480 |         0x1ebff45, 0x0956bf0, 0x17aae80, 0x05d8632, 0x3082c9a, 0,
481 |       },
482 |     },
483 |     .y = {
484 |       .limbs = {
485 |         0, 0x22ad91f, 0x1ffcc65, 0x37b4f5c, 0x29c51ab, 0x3f9bd02,
486 |         0x296aaf9, 0x2a58b82, 0x2c54e16, 0x2a7672c, 0x21486e2, 0,
487 |       },
488 |     },
489 |   };
490 | 
491 |   affine_pt_narrow_t expected_gray_code_end3 = {
492 |     .x = {
493 |       .limbs = {
494 |         0, 0x06b9c9d, 0x3d00674, 0x10a73fc, 0x30fda83, 0x139185c,
495 |         0x043e082, 0x3c67915, 0x208192a, 0x025e451, 0x258a566, 0,
496 |       },
497 |     },
498 |     .y = {
499 |       .limbs = {
500 |         0, 0x3d2a04f, 0x1314c36, 0x131c7a3, 0x1882ef3, 0x1a0a5e8,
501 |         0x1919356, 0x0a5616a, 0x1eea31d, 0x2c216b3, 0x18ba4aa, 0,
502 |       },
503 |     },
504 |   };
505 | 
506 |   sabs_comb_set_t computed_base_comb;
507 |   compute_comb_set(&computed_base_comb, &B);
508 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
509 |     //assert(computed_base_comb.combs[0].table[COMB_TABLE_SIZE - 1].x.limbs[i] ==
510 |       //expected_everything0.x.limbs[i]);
511 |     //assert(computed_base_comb.combs[0].table[COMB_TABLE_SIZE - 1].y.limbs[i] ==
512 |       //expected_everything0.y.limbs[i]);
513 |     //assert(computed_base_comb.combs[1].table[COMB_TABLE_SIZE - 1].x.limbs[i] ==
514 |       //expected_everything1.x.limbs[i]);
515 |     //assert(computed_base_comb.combs[1].table[COMB_TABLE_SIZE - 1].y.limbs[i] ==
516 |       //expected_everything1.y.limbs[i]);
517 |     //assert(computed_base_comb.combs[2].table[COMB_TABLE_SIZE - 1].x.limbs[i] ==
518 |       //expected_everything2.x.limbs[i]);
519 |     //assert(computed_base_comb.combs[2].table[COMB_TABLE_SIZE - 1].y.limbs[i] ==
520 |       //expected_everything2.y.limbs[i]);
521 |     //assert(computed_base_comb.combs[3].table[COMB_TABLE_SIZE - 1].x.limbs[i] ==
522 |       //expected_everything3.x.limbs[i]);
523 |     //assert(computed_base_comb.combs[3].table[COMB_TABLE_SIZE - 1].y.limbs[i] ==
524 |       //expected_everything3.y.limbs[i]);
525 |   }
526 | 
527 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
528 |     //assert(computed_base_comb.combs[0].table[7].x.limbs[i] ==
529 |       //expected_gray_code_end0.x.limbs[i]);
530 |     //assert(computed_base_comb.combs[0].table[7].y.limbs[i] ==
531 |       //expected_gray_code_end0.y.limbs[i]);
532 |     //assert(computed_base_comb.combs[1].table[7].x.limbs[i] ==
533 |       //expected_gray_code_end1.x.limbs[i]);
534 |     //assert(computed_base_comb.combs[1].table[7].y.limbs[i] ==
535 |       //expected_gray_code_end1.y.limbs[i]);
536 |     //assert(computed_base_comb.combs[2].table[7].x.limbs[i] ==
537 |       //expected_gray_code_end2.x.limbs[i]);
538 |     //assert(computed_base_comb.combs[2].table[7].y.limbs[i] ==
539 |       //expected_gray_code_end2.y.limbs[i]);
540 |     //assert(computed_base_comb.combs[3].table[7].x.limbs[i] ==
541 |       //expected_gray_code_end3.x.limbs[i]);
542 |     //assert(computed_base_comb.combs[3].table[7].y.limbs[i] ==
543 |       //expected_gray_code_end3.y.limbs[i]);
544 |   }
545 |   #endif
546 | 
547 |   #if 1
548 |   for (int i = 0; i<1; ++i) {
549 |     scalar_comb_multiply(&result_pt, &base_comb, &mult_scalar);
550 |   }
551 |   {
552 |     residue_wide_t tmp;
553 |     mul_wide(&tmp, &expected_scalar_mult.x, &result_pt.z);
554 |     assert(equal_wide(&tmp, &result_pt.x));
555 |     mul_wide(&tmp, &expected_scalar_mult.y, &result_pt.z);
556 |     assert(equal_wide(&tmp, &result_pt.y));
557 |   }
558 |   #endif
559 |   #if 0
560 |   for (int i = 0; i<1; ++i) {
561 |     scalar_t priv_key;
562 |     affine_pt_narrow_reduced_t pub_key;
563 |     gen_key(&priv_key, &pub_key);
564 |   }
565 |   #endif
566 |   for (int i = 0; i < 1; ++i) {
567 |     uint8_t encoded_sk[66];
568 |     uint8_t encoded_sig[65];
569 |     const uint8_t *msg = (uint8_t *) "Hello World!";
570 |     const size_t msglen = 13;
571 |     scalar_t priv_key;
572 |     scalar_t priv_key_decoded;
573 |     affine_pt_narrow_t pub_key;
574 |     affine_pt_narrow_t pub_key_decoded;
575 |     gen_key(&priv_key, &pub_key);
576 |     memcpy(encoded_sk, &priv_key, SCALAR_BYTES);
577 |     encode_pub_key(encoded_sk + SCALAR_BYTES, &pub_key);
578 |     priv_key_decoded.limbs[SCALAR_LIMBS - 1] = 0;
579 |     memcpy(&priv_key_decoded, encoded_sk, SCALAR_BYTES);
580 |     for (int j = 0; j < SCALAR_LIMBS; ++j) {
581 |       assert(priv_key.limbs[j] == priv_key_decoded.limbs[j]);
582 |     }
583 |     signature_t result;
584 |     sign(&result, &priv_key_decoded, encoded_sk + SCALAR_BYTES, msg, msglen);
585 |     encode_sig(encoded_sig, &result);
586 |     signature_t result_decoded;
587 |     decode_sig(&result_decoded, encoded_sig);
588 |     for (int j = 0; j < SCALAR_LIMBS; ++j) {
589 |       assert(result.s.limbs[j] == result_decoded.s.limbs[j]);
590 |     }
591 |     for (int j = 0; j < NLIMBS_REDUCED; ++j) {
592 |       assert(result.y.limbs[j] == result_decoded.y.limbs[j]);
593 |     }
594 |     assert(decode_pub_key(&pub_key_decoded, encoded_sk + SCALAR_BYTES));
595 | 
596 |     uint8_t y_buf[RESIDUE_LENGTH_BYTES];
597 |     encode(y_buf, &result_decoded.y);
598 |     if(!verify(&result, y_buf, encoded_sk + SCALAR_BYTES, &pub_key_decoded, msg,
599 |                msglen)) {
600 |       printf("verification failed\n");
601 |       exit(1);
602 |     }
603 |   }
604 | }
605 | 


--------------------------------------------------------------------------------
/avx2/src/sign.c:
--------------------------------------------------------------------------------
  1 | #define _DEFAULT_SOURCE
  2 | #include <blake2.h>
  3 | #include <stdlib.h>
  4 | #include <string.h>
  5 | 
  6 | #include "comb.h"
  7 | #include "curve.h"
  8 | #include "scalar.h"
  9 | 
 10 | #include "sign.h"
 11 | 
 12 | #include "f11_260.c"
 13 | #include "curve.c"
 14 | #include "scalar.c"
 15 | #include "gen.c"
 16 | #include "constant_time.c"
 17 | #include "comb.c"
 18 | 
 19 | void sign(signature_t *result, scalar_t *priv_key,
 20 |   const uint8_t *pub_key, const uint8_t *msg, size_t msg_len) {
 21 |   blake2b_state hash_ctxt;
 22 | 
 23 |   char session_key_wash[16];
 24 | 
 25 |   scalar_hash_t scalar_large;
 26 |   scalar_t session_key;
 27 | 
 28 |   arc4random_buf(session_key_wash, sizeof(session_key_wash));
 29 |   blake2b_init_key(&hash_ctxt, 64, session_key_wash, sizeof(session_key_wash));
 30 |   blake2b_update(&hash_ctxt, (uint8_t *) priv_key, SCALAR_BYTES);
 31 |   blake2b_update(&hash_ctxt, (uint8_t *) msg, msg_len);
 32 |   blake2b_final(&hash_ctxt, (uint8_t *) &scalar_large, sizeof(scalar_hash_t));
 33 | 
 34 |   reduce_hash_mod_l(&session_key, &scalar_large);
 35 | 
 36 |   projective_pt_wide_t result_pt;
 37 |   scalar_comb_multiply(&result_pt, &base_comb, &session_key);
 38 |   residue_wide_t z_inv;
 39 | 
 40 |   invert_wide(&z_inv, &result_pt.z);
 41 |   mul_wide(&result_pt.x, &result_pt.x, &z_inv);
 42 |   mul_wide(&result_pt.y, &result_pt.y, &z_inv);
 43 | 
 44 |   residue_narrow_t temp_narrow;
 45 |   narrow(&temp_narrow, &result_pt.y);
 46 |   narrow_complete(&result->y, &temp_narrow);
 47 | 
 48 |   residue_narrow_reduced_t temp_narrow_reduced;
 49 |   narrow(&temp_narrow, &result_pt.x);
 50 |   narrow_partial_complete(&temp_narrow_reduced, &temp_narrow);
 51 |   result->y.limbs[NLIMBS_REDUCED - 1] |=
 52 |       is_odd(&temp_narrow_reduced) << (TBITS);
 53 | 
 54 |   uint8_t y_buf[RESIDUE_LENGTH_BYTES];
 55 |   encode(y_buf, &result->y);
 56 | 
 57 |   blake2b_init(&hash_ctxt, 64);
 58 |   blake2b_update(&hash_ctxt, y_buf, RESIDUE_LENGTH_BYTES);
 59 |   blake2b_update(&hash_ctxt, pub_key, RESIDUE_LENGTH_BYTES);
 60 |   blake2b_update(&hash_ctxt, msg, msg_len);
 61 |   blake2b_final(&hash_ctxt, (uint8_t *) &scalar_large, sizeof(scalar_hash_t));
 62 | 
 63 |   scalar_t hash_scalar;
 64 |   mont_reduce_hash_mod_l(&hash_scalar, &scalar_large);
 65 |   mont_mult_mod_l(&hash_scalar, &hash_scalar, priv_key);
 66 |   mont_mult_mod_l(&hash_scalar, &hash_scalar, &SCALAR_MONT_R2_HASH_MUL);
 67 |   sub_mod_l(&result->s, &session_key, &hash_scalar);
 68 | 
 69 |   explicit_bzero(&session_key, sizeof(session_key));
 70 |   explicit_bzero(&hash_scalar, sizeof(hash_scalar));
 71 |   explicit_bzero(&session_key_wash, sizeof(session_key_wash));
 72 | }
 73 | 
 74 | int verify(
 75 |   const signature_t *sig, const uint8_t *r_bytes, const uint8_t *pub_key_bytes,
 76 |   const affine_pt_narrow_t *pub_key_pt, const uint8_t *msg,
 77 |   size_t msg_len) {
 78 | 
 79 |   projective_pt_wide_t sB;
 80 |   projective_pt_wide_t hA;
 81 |   projective_pt_wide_t result_pt;
 82 |   residue_narrow_reduced_t result_y;
 83 | 
 84 |   scalar_hash_t scalar_large;
 85 |   blake2b_state hash_ctxt;
 86 |   blake2b_init(&hash_ctxt, 64);
 87 |   blake2b_update(&hash_ctxt, r_bytes, RESIDUE_LENGTH_BYTES);
 88 |   blake2b_update(&hash_ctxt, pub_key_bytes, RESIDUE_LENGTH_BYTES);
 89 |   blake2b_update(&hash_ctxt, msg, msg_len);
 90 |   blake2b_final(&hash_ctxt, (uint8_t *) &scalar_large, sizeof(scalar_hash_t));
 91 | 
 92 |   scalar_t hash_scalar;
 93 |   reduce_hash_mod_l(&hash_scalar, &scalar_large);
 94 | 
 95 |   // Can use non-const version for both of these.
 96 |   scalar_comb_multiply_unsafe(&sB, &base_comb, &sig->s);
 97 |   scalar_multiply_unsafe(&hA, pub_key_pt, &hash_scalar);
 98 |   projective_add(&result_pt, &sB, &hA);
 99 | 
100 |   // Everything below except the comparison should eventually be in helper
101 |   // functions: Point affinization, and point compression bit-for-bit.
102 |   // Same applies for the signing.
103 |   residue_wide_t z_inv;
104 | 
105 |   invert_wide(&z_inv, &result_pt.z);
106 |   mul_wide(&result_pt.x, &result_pt.x, &z_inv);
107 |   mul_wide(&result_pt.y, &result_pt.y, &z_inv);
108 | 
109 |   residue_narrow_t temp_narrow;
110 |   narrow(&temp_narrow, &result_pt.y);
111 |   narrow_complete(&result_y, &temp_narrow);
112 | 
113 |   residue_narrow_reduced_t temp_narrow_reduced;
114 |   narrow(&temp_narrow, &result_pt.x);
115 |   narrow_partial_complete(&temp_narrow_reduced, &temp_narrow);
116 |   result_y.limbs[NLIMBS_REDUCED - 1] |=
117 |       is_odd(&temp_narrow_reduced) << TBITS;
118 | 
119 |   return equal_narrow_reduced(&sig->y, &result_y);
120 | }
121 | 
122 | void encode_sig(uint8_t *result, const signature_t *sig) {
123 |   residue_narrow_reduced_t pack;
124 | 
125 |   memcpy(&pack, &sig->y, sizeof(residue_narrow_reduced_t));
126 |   // Save the upper two bits in the uppermost part of the 33rd byte
127 |   pack.limbs[NLIMBS_REDUCED - 1] |=
128 |     (sig->s.limbs[SCALAR_LIMBS - 1] & 0x3) << 28;
129 |   encode(result, &pack);
130 |   memcpy(result + RESIDUE_LENGTH_BYTES,
131 |          &sig->s, sizeof(uint32_t) * (SCALAR_LIMBS - 1));
132 | }
133 | 
134 | void decode_sig(signature_t *result, const uint8_t *encoded_sig) {
135 |   decode(&result->y, encoded_sig);
136 |   result->s.limbs[SCALAR_LIMBS - 1] = result->y.limbs[NLIMBS_REDUCED - 1] >> 28;
137 |   // We leave an extra bit for the sign bit from compression.
138 |   result->y.limbs[NLIMBS_REDUCED - 1] &= ((1 << (TBITS + 1)) - 1);
139 |   memcpy(&result->s, encoded_sig + RESIDUE_LENGTH_BYTES,
140 |          sizeof(uint32_t) * (SCALAR_LIMBS - 1));
141 | }
142 | 


--------------------------------------------------------------------------------
/avx512/include:
--------------------------------------------------------------------------------
1 | ../ref/include


--------------------------------------------------------------------------------
/avx512/src/f11_260.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | #include "f11_260.h"
  3 | #include "mul_inline.h"
  4 | #include "emmintrin.h"
  5 | #include "immintrin.h"
  6 | 
  7 | residue_narrow_t zero_narrow = {0};
  8 | residue_narrow_t one_narrow = {
  9 |   .limbs = {1},
 10 | };
 11 | 
 12 | #define NVECTORS 3
 13 | #define VECTWIDTH 4
 14 | 
 15 | __attribute__((__aligned__(32)))
 16 | static const int32_t COLLAPSE[8] = { 0, 2, 4, 6, 4, 5, 6, 7 };
 17 | 
 18 | // Shrink to 32 bits. Assumes reduction has already occurred, and wide storage
 19 | // is being used for vector compatibility.
 20 | void narrow(residue_narrow_t *result, const residue_wide_t * __restrict w) {
 21 |   __m256i collapse_perm = _mm256_load_si256((__m256i*) COLLAPSE);
 22 |   __m128i packed_result;
 23 |   #pragma clang loop unroll(full)
 24 |   for (int i = 0; i < NVECTORS; ++i) {
 25 |     __m256i x = _mm256_load_si256((__m256i*) (&w->limbs[i * VECTWIDTH]));
 26 |     packed_result = _mm256_castsi256_si128(
 27 |         _mm256_permutevar8x32_epi32(x, collapse_perm));
 28 |     _mm_store_si128((__m128i*) &result->limbs[i * VECTWIDTH], packed_result);
 29 |   }
 30 | }
 31 | 
 32 | // Reduce to 10 limbs. Useful for debugging.
 33 | void narrow_reduce(
 34 |   residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w) {
 35 |   residue_narrow_t temp;
 36 | 
 37 |   __m128i x = _mm_load_si128((__m128i *) (&w->limbs[0 * VECTWIDTH]));
 38 |   __m128i x10 = _mm_broadcastd_epi32(x);
 39 |   x = _mm_sub_epi32(x, x10);
 40 |   _mm_store_si128((__m128i *) &temp.limbs[0 * VECTWIDTH], x);
 41 |   x = _mm_load_si128((__m128i *) (&w->limbs[1 * VECTWIDTH]));
 42 |   x = _mm_sub_epi32(x, x10);
 43 |   _mm_store_si128((__m128i *) &temp.limbs[1 * VECTWIDTH], x);
 44 |   x = _mm_load_si128((__m128i *) (&w->limbs[2 * VECTWIDTH]));
 45 |   x = _mm_sub_epi32(x, x10);
 46 |   _mm_store_si128((__m128i *) &temp.limbs[2 * VECTWIDTH], x);
 47 | 
 48 |   reduce_step_narrow(&temp, &temp);
 49 | 
 50 |   // May want to use vpalignr here.
 51 |   #pragma clang loop unroll(full)
 52 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
 53 |     result->limbs[i] = temp.limbs[i] - temp.limbs[10];
 54 |   }
 55 | }
 56 | 
 57 | // Reduce to unique representative.
 58 | // This is expensive. Only used for final signature or DH Key
 59 | void narrow_complete(
 60 |   residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w) {
 61 | 
 62 |   residue_narrow_t temp;
 63 |   for (int i = 0; i < NLIMBS; ++i) {
 64 |     temp.limbs[i] = w->limbs[i] - w->limbs[10];
 65 |   }
 66 | 
 67 |   // This may be combined with the final reduction from a multiply.
 68 |   reduce_step_narrow(&temp, &temp);
 69 | 
 70 |   int gt_mask = 0;
 71 |   int lt_mask = 0;
 72 |   int32_t limit[NLIMBS];
 73 |   for (int i = 0; i < NLIMBS; ++i) {
 74 |     temp.limbs[i] = temp.limbs[i] - temp.limbs[10];
 75 |     temp.limbs[i] += 1 & gt_mask;
 76 |     temp.limbs[i] -= 1 & lt_mask;
 77 |     gt_mask = -(temp.limbs[i] > T);
 78 |     lt_mask = -(temp.limbs[i] < 0);
 79 |     temp.limbs[i] -= (T & gt_mask);
 80 |     temp.limbs[i] += (T & lt_mask);
 81 |   }
 82 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
 83 |     temp.limbs[i] -= temp.limbs[10];
 84 |     limit[i] = T;
 85 |   }
 86 |   int64_t all_t = -1;
 87 |   for (int i = NLIMBS_REDUCED - 2; i >= 0; --i) {
 88 |     all_t &= -(temp.limbs[i+1] == T);
 89 |     limit[i] -= 1 & (~all_t);
 90 |   }
 91 |   gt_mask = 0;
 92 |   lt_mask = 0;
 93 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
 94 |     temp.limbs[i] += 1 & gt_mask;
 95 |     temp.limbs[i] -= 1 & lt_mask;
 96 |     gt_mask = -(temp.limbs[i] > limit[i]);
 97 |     lt_mask = -(temp.limbs[i] < 0);
 98 |     temp.limbs[i] -= (T & gt_mask);
 99 |     temp.limbs[i] += (T & lt_mask);
100 |     result->limbs[i] = temp.limbs[i];
101 |   }
102 | }
103 | 
104 | // Reduce to mostly unique representative.
105 | // All coefficients are reduced to 0 <= xi <= t
106 | // Unique up to carries (xi == t) => (xi = 0; x[i+1] += 1);
107 | // This is sufficient to determine if x is even or odd.
108 | // Still pretty expensive. Used in point compression.
109 | void narrow_partial_complete(
110 |   residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w) {
111 | 
112 |   residue_narrow_t temp;
113 |   for (int i = 0; i < NLIMBS; ++i) {
114 |     temp.limbs[i] = w->limbs[i] - w->limbs[10];
115 |   }
116 | 
117 |   // This may be combined with the final reduction from a multiply.
118 |   reduce_step_narrow(&temp, &temp);
119 | 
120 |   int gt_mask = 0;
121 |   int lt_mask = 0;
122 |   for (int i = 0; i < NLIMBS; ++i) {
123 |     temp.limbs[i] = temp.limbs[i] - temp.limbs[10];
124 |     temp.limbs[i] += 1 & gt_mask;
125 |     temp.limbs[i] -= 1 & lt_mask;
126 |     gt_mask = -(temp.limbs[i] > T);
127 |     lt_mask = -(temp.limbs[i] < 0);
128 |     temp.limbs[i] -= (T & gt_mask);
129 |     temp.limbs[i] += (T & lt_mask);
130 |   }
131 |   for (int i = 0; i < NLIMBS - 1; ++i) {
132 |     temp.limbs[i] -= temp.limbs[10];
133 |   }
134 |   gt_mask = 0;
135 |   lt_mask = 0;
136 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
137 |     temp.limbs[i] += 1 & gt_mask;
138 |     temp.limbs[i] -= 1 & lt_mask;
139 |     gt_mask = -(temp.limbs[i] > T);
140 |     lt_mask = -(temp.limbs[i] < 0);
141 |     temp.limbs[i] -= (T & gt_mask);
142 |     temp.limbs[i] += (T & lt_mask);
143 |     result->limbs[i] = temp.limbs[i];
144 |   }
145 | }
146 | 
147 | int is_odd(residue_narrow_reduced_t *x) {
148 |   int result = 0;
149 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
150 |     result ^= x->limbs[i] & 0x1;
151 |   }
152 |   return result;
153 | }
154 | 
155 | // Copy a 12x32-bit residue
156 | void copy_narrow(
157 |   residue_narrow_t *result, const residue_narrow_t * __restrict x) {
158 | 
159 |   for (int i = 0; i < NLIMBS; ++i) {
160 |     result->limbs[i] = x->limbs[i];
161 |   }
162 | }
163 | 
164 | // Copy a 10x32-bit residue
165 | void copy_narrow_reduced(
166 |   residue_narrow_reduced_t *result,
167 |   const residue_narrow_reduced_t * __restrict x) {
168 | 
169 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
170 |     result->limbs[i] = x->limbs[i];
171 |   }
172 | }
173 | 
174 | static inline __m256i load_extend_32_64(__m128i *x) {
175 |   return _mm256_cvtepi32_epi64(_mm_load_si128(x));
176 | }
177 | 
178 | static inline __m256i loadu_extend_32_64(__m128i *x) {
179 |   return _mm256_cvtepi32_epi64(_mm_loadu_si128(x));
180 | }
181 | 
182 | static inline __m512i load512_extend_32_64(__m256i *x) {
183 |   return _mm512_cvtepi32_epi64(_mm256_load_si256(x));
184 | }
185 | 
186 | static inline __m512i loadu512_extend_32_64(__m256i *x) {
187 |   return _mm512_cvtepi32_epi64(_mm256_loadu_si256(x));
188 | }
189 | 
190 | static inline __m512i loadu512_mask_extend_32_64(__m256i *x, __mmask8 k) {
191 |   return _mm512_cvtepi32_epi64(
192 |     _mm256_mask_loadu_epi32(_mm256_setzero_si256(), k, x));
193 | }
194 | 
195 | void reduce_step_narrow(
196 |   residue_narrow_t *result, const residue_narrow_t *x) {
197 |   return reduce_step_narrow_i(result, x);
198 | }
199 | 
200 | void reduce_step_wide(
201 |   residue_wide_t *result, const residue_wide_t *x) {
202 |   return reduce_step_wide_i(result, x);
203 | }
204 | 
205 | void mul_narrow(
206 |   residue_narrow_t *result, const residue_narrow_t *x,
207 |   const residue_narrow_t *y) {
208 |   return mul_narrow_i(result, x, y);
209 | }
210 | 
211 | void square_narrow(
212 |   residue_narrow_t *result, const residue_narrow_t *x) {
213 |   return square_narrow_i(result, x);
214 | }
215 | 
216 | // Produce a 64-bit residue
217 | void widen(
218 |   residue_wide_t *result, const residue_narrow_t * __restrict x) {
219 |   __m256i wide10 = loadu_extend_32_64((__m128i *) x);
220 |   __m512i wide3 = loadu512_extend_32_64((__m256i *) &x[4]);
221 |   _mm256_store_si256((__m256i*) &result->limbs[0], wide10);
222 |   _mm512_storeu_si512((__m512i*) &result->limbs[4], wide3);
223 | }
224 | 
225 | // Subtract 2 12x32-bit residues.
226 | void sub_narrow(
227 |   residue_narrow_t *result, const residue_narrow_t * __restrict x,
228 |   const residue_narrow_t * __restrict y) {
229 | 
230 |   __m512i lhs = _mm512_load_si512((__m512i*) &x->limbs[0]);
231 |   __m512i rhs = _mm512_load_si512((__m512i*) &y->limbs[0]);
232 |   __m512i sub = _mm512_sub_epi32(lhs, rhs);
233 |   _mm512_store_si512((__m512i*) &result->limbs[0], sub);
234 | }
235 | 
236 | // negate a 12x64-bit residue.
237 | void negate_wide(residue_wide_t *result, const residue_wide_t *x) {
238 | 
239 |   __m256i zero = _mm256_setzero_si256();
240 |   #pragma clang loop unroll(full)
241 |   for (int i = 0; i < NVECTORS; ++i) {
242 |     __m256i xv = _mm256_load_si256((__m256i*) (&x->limbs[i * VECTWIDTH]));
243 |     xv = _mm256_sub_epi64(zero, xv);
244 |     _mm256_store_si256((__m256i*) &result->limbs[i * VECTWIDTH], xv);
245 |   }
246 | }
247 | 
248 | // negate a 12x32-bit residue.
249 | void negate_narrow(
250 |   residue_narrow_t *result, const residue_narrow_t *x) {
251 | 
252 |   __m512i lhs = _mm512_load_si512((__m512i*) &x->limbs[0]);
253 |   __m512i zero = _mm512_setzero();
254 |   __m512i neg = _mm512_sub_epi32(zero, lhs);
255 |   _mm512_store_si512((__m512i*) &result->limbs[0], neg);
256 | }
257 | 
258 | // Add 2 12x32-bit residues.
259 | void add_narrow(
260 |   residue_narrow_t *result, const residue_narrow_t * __restrict x,
261 |   const residue_narrow_t * __restrict y) {
262 | 
263 |   __m512i lhs = _mm512_load_si512((__m512i*) &x->limbs[0]);
264 |   __m512i rhs = _mm512_load_si512((__m512i*) &y->limbs[0]);
265 |   __m512i add = _mm512_add_epi32(lhs, rhs);
266 |   _mm512_store_si512((__m512i*) &result->limbs[0], add);
267 | }
268 | 
269 | // Scale a narrow residue by 2.
270 | void double_narrow(
271 |   residue_narrow_t *result, const residue_narrow_t *x) {
272 | 
273 |   __m512i lhs = _mm512_load_si512((__m512i*) &x->limbs[0]);
274 |   __m512i dub = _mm512_slli_epi32(lhs, 1);
275 |   _mm512_store_si512((__m512i*) &result->limbs[0], dub);
276 | }
277 | 
278 | // Scale a wide residue by 2.
279 | void double_wide(
280 |   residue_wide_t *result, const residue_wide_t *x) {
281 | 
282 |   for (int i = 0; i < NLIMBS; ++i) {
283 |     result->limbs[i] = x->limbs[i] << 1;
284 |   }
285 | }
286 | 
287 | #include <stdio.h>
288 | #include <string.h>
289 | // static void print4x64(__m256i x, const char * preamble) {
290 | //   uint64_t x_vals[4];
291 | //   memcpy(x_vals, &x, sizeof(x_vals));
292 | //   printf("%s\n", preamble);
293 | //   for (int i = 0; i < 4; ++i) {
294 | //     printf("%#lx\n", x_vals[i]);
295 | //   }
296 | // }
297 | 
298 | // static void print8x64(__m512i x, const char * preamble) {
299 | //   uint64_t x_vals[8];
300 | //   memcpy(x_vals, &x, sizeof(x_vals));
301 | //   printf("%s\n", preamble);
302 | //   for (int i = 0; i < 8; ++i) {
303 | //     printf("%#lx\n", x_vals[i]);
304 | //   }
305 | // }
306 | 
307 | // static void print16x32(__m512i x, const char * preamble) {
308 | //   uint32_t x_vals[16];
309 | //   memcpy(x_vals, &x, sizeof(x_vals));
310 | //   printf("%s\n", preamble);
311 | //   for (int i = 0; i < 16; ++i) {
312 | //     printf("%#x\n", x_vals[i]);
313 | //   }
314 | // }
315 | 
316 | // The swaps below trade 32 bit words within 128 bit lanes
317 | // in low endian order: 01 00 11 10
318 | // in big endian order 10 11 00 01 = 0xb1
319 | 
320 | static const int32_t permute_final_result[16] = {
321 |   0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
322 | };
323 | 
324 | // Multiply a narrow residue by a small constant. The result is reduced to 32
325 | // bits.
326 | void mul_narrow_const(
327 |   residue_narrow_t *result, const residue_narrow_t *x, int32_t d) {
328 | 
329 |   residue_wide_t temp;
330 |   for (int i = 0; i < NLIMBS; ++i) {
331 |     temp.limbs[i] = ((uint64_t) x->limbs[i]) * d;
332 |   }
333 |   reduce_step_wide(&temp, &temp);
334 |   __m512i _permute_final_result = _mm512_load_si512(permute_final_result);
335 |   __m512i accum0 = _mm512_load_si512((__m512i*) &temp.limbs[0]);
336 |   __m256i accum8 = _mm256_load_si256((__m256i*) &temp.limbs[8]);
337 |   __m512i final_result = _mm512_permutex2var_epi32(
338 |     accum0, _permute_final_result, _mm512_castsi256_si512(accum8));
339 |   _mm512_store_si512((__m512i*) &result->limbs[0], final_result);
340 | }
341 | 
342 | // Takes advantage of the fact that if a residue z *is zero* then after setting
343 | // one coefficient to T/2, all the remaining coefficients should be near to
344 | // T/2. They should therefore resolve all carries in a single step, and all be
345 | // equal to the same value. Some other value may not reduce completely, but this
346 | // is fine, we will know it is not zero.
347 | int equal_narrow(const residue_narrow_t *x, const residue_narrow_t *y) {
348 |   residue_narrow_t temp;
349 | 
350 |   sub_narrow(&temp, x, y);
351 |   int32_t delta = -temp.limbs[0] + (T / 2);
352 |   for (int i = 0; i < NLIMBS; ++i) {
353 |     temp.limbs[i] += delta;
354 |   }
355 | 
356 |   reduce_step_narrow(&temp, &temp);
357 | 
358 |   delta = temp.limbs[0];
359 |   int result = 0;
360 |   for (int i = 1; i < NLIMBS; ++i) {
361 |     result |= (temp.limbs[i] ^ delta);
362 |   }
363 | 
364 |   return !result;
365 | }
366 | 
367 | int equal_narrow_reduced(
368 |   const residue_narrow_reduced_t * x, const residue_narrow_reduced_t * y) {
369 | 
370 |   int result = 0;
371 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
372 |     result |= (x->limbs[i] ^ y->limbs[i]);
373 |   }
374 | 
375 |   return !result;
376 | }
377 | 
378 | static inline void nsquare_narrow(
379 |   residue_narrow_t *result, const residue_narrow_t *x, int n) {
380 | 
381 |   square_narrow_i(result, x);
382 |   for (int i = 1; i < n; ++i) {
383 |     square_narrow_i(result, result);
384 |   }
385 | }
386 | 
387 | static void raise_to_t(
388 |   residue_narrow_t *result, const residue_narrow_t *x) {
389 |   // zi = z^(2^i - 1), z1 = x
390 |   residue_narrow_t z2;
391 |   residue_narrow_t z3;
392 |   residue_narrow_t z5;
393 |   residue_narrow_t z10;
394 |   residue_narrow_t z11;
395 |   residue_narrow_t z22;
396 |   residue_narrow_t result_t;
397 | 
398 |   square_narrow_i(&z2, x);
399 |   mul_narrow_i(&z2, &z2, x);
400 |   square_narrow_i(&z3, &z2);
401 |   mul_narrow_i(&z3, &z3, x);
402 |   nsquare_narrow(&z5, &z3, 2);
403 |   mul_narrow_i(&z5, &z5, &z2);
404 |   nsquare_narrow(&z10, &z5, 5);
405 |   mul_narrow_i(&z10, &z10, &z5);
406 |   square_narrow_i(&z11, &z10);
407 |   mul_narrow_i(&z11, &z11, x);
408 |   nsquare_narrow(&z22, &z11, 11);
409 |   mul_narrow_i(&z22, &z22, &z11);
410 |   nsquare_narrow(&result_t, &z22, 4);
411 |   mul_narrow_i(result, &result_t, x);
412 | }
413 | 
414 | static void raise_to_t2(
415 |   residue_narrow_t *result, const residue_narrow_t *x) {
416 |   // t^2 = 0xfffff880000e1
417 |   // zi = z^(2^i - 1), z1 = x
418 |   residue_narrow_t z2;
419 |   residue_narrow_t z3;
420 |   residue_narrow_t z5;
421 |   residue_narrow_t z10;
422 |   residue_narrow_t z20;
423 |   residue_narrow_t result_t;
424 | 
425 |   square_narrow_i(&z2, x);
426 |   mul_narrow_i(&z2, &z2, x);
427 |   square_narrow_i(&z3, &z2);
428 |   mul_narrow_i(&z3, &z3, x);
429 |   nsquare_narrow(&z5, &z3, 2);
430 |   mul_narrow_i(&z5, &z5, &z2);
431 |   nsquare_narrow(&z10, &z5, 5);
432 |   mul_narrow_i(&z10, &z10, &z5);
433 |   nsquare_narrow(&z20, &z10, 10);
434 |   mul_narrow_i(&z20, &z20, &z10);
435 |   square_narrow_i(&result_t, &z20);
436 |   mul_narrow_i(&result_t, &result_t, x);
437 |   nsquare_narrow(&result_t, &result_t, 4);
438 |   mul_narrow_i(&result_t, &result_t, x);
439 |   // 22 = 3 for zeros in 8, 16 for zeros in 0000, 3 to make room for e.
440 |   nsquare_narrow(&result_t, &result_t, 22);
441 |   mul_narrow_i(&result_t, &result_t, &z3);
442 |   nsquare_narrow(&result_t, &result_t, 5);
443 |   mul_narrow_i(result, &result_t, x);
444 | }
445 | 
446 | static void raise_to_phi_t(
447 |   residue_narrow_t *result, const residue_narrow_t *x, int n) {
448 |   residue_narrow_t temp;
449 | 
450 |   raise_to_t(&temp, x);
451 | 
452 |   for (int i = 1; i < n; ++i) {
453 |     mul_narrow(&temp, &temp, x);
454 |     raise_to_t(&temp, &temp);
455 |   }
456 | 
457 |   mul_narrow(result, &temp, x);
458 | }
459 | 
460 | static void raise_to_t_minus_1_over_4(
461 |   residue_narrow_t *result, const residue_narrow_t *x) {
462 |   // zi = z^(2^i - 1), z1 = x
463 |   residue_narrow_t z2;
464 |   residue_narrow_t z3;
465 |   residue_narrow_t z5;
466 |   residue_narrow_t z10;
467 |   residue_narrow_t z11;
468 |   residue_narrow_t z22;
469 | 
470 |   square_narrow_i(&z2, x);
471 |   mul_narrow_i(&z2, &z2, x);
472 |   square_narrow_i(&z3, &z2);
473 |   mul_narrow_i(&z3, &z3, x);
474 |   nsquare_narrow(&z5, &z3, 2);
475 |   mul_narrow_i(&z5, &z5, &z2);
476 |   nsquare_narrow(&z10, &z5, 5);
477 |   mul_narrow_i(&z10, &z10, &z5);
478 |   square_narrow_i(&z11, &z10);
479 |   mul_narrow_i(&z11, &z11, x);
480 |   nsquare_narrow(&z22, &z11, 11);
481 |   mul_narrow_i(&z22, &z22, &z11);
482 |   nsquare_narrow(result, &z22, 2);
483 | }
484 | 
485 | static void raise_to_p_minus_3_over_4(
486 |   residue_narrow_t *result, const residue_narrow_t *x) {
487 | 
488 |   residue_narrow_t z4; //z to (t-1)/4
489 |   residue_narrow_t z2; //z to (t-1)/2
490 |   residue_narrow_t z3_4; //z to (3t+1)/4
491 |   residue_narrow_t y_small;
492 |   residue_narrow_t y, y_t4_y;
493 |   residue_narrow_t raised;
494 | 
495 |   raise_to_t_minus_1_over_4(&z4, x);
496 |   square_narrow(&z2, &z4);
497 |   mul_narrow(&z3_4, &z2, &z4);
498 |   mul_narrow(&z3_4, &z3_4, x);
499 |   raise_to_t(&raised, &z4);
500 |   mul_narrow(&y_small, &z2, &raised);
501 |   raise_to_t(&raised, &y_small);
502 |   mul_narrow(&y, &z3_4, &raised);
503 |   raise_to_t(&raised, &y);
504 |   raise_to_t(&raised, &raised);
505 |   raise_to_t(&raised, &raised);
506 |   raise_to_t(&raised, &raised);
507 |   mul_narrow(&y_t4_y, &raised, &y);
508 |   raise_to_t(&raised, &y_t4_y);
509 |   raise_to_t(&raised, &raised);
510 |   raise_to_t(&raised, &raised);
511 |   mul_narrow(result, &raised, &y_small);
512 | }
513 | 
514 | int sqrt_inv_narrow(
515 |   residue_narrow_t *result, const residue_narrow_t * __restrict x,
516 |   const residue_narrow_t * __restrict y) {
517 |   residue_narrow_t xy;
518 |   residue_narrow_t y2;
519 |   residue_narrow_t xy3;
520 |   residue_narrow_t xy3_p_3_over_4;
521 |   residue_narrow_t cand2;
522 |   residue_narrow_t should_be_x;
523 | 
524 |   square_narrow(&y2, y);
525 |   mul_narrow(&xy, x, y);
526 |   mul_narrow(&xy3, &xy, &y2);
527 |   raise_to_p_minus_3_over_4(&xy3_p_3_over_4, &xy3);
528 |   mul_narrow(result, &xy, &xy3_p_3_over_4);
529 |   square_narrow(&cand2, result);
530 |   mul_narrow(&should_be_x, y, &cand2);
531 | 
532 |   return equal_narrow(&should_be_x, x);
533 | }
534 | 
535 | void invert_narrow(
536 |   residue_narrow_t *result, const residue_narrow_t * __restrict x) {
537 | 
538 |   residue_narrow_t x_t_minus_1_over_4;
539 |   residue_narrow_t x_t_minus_1;
540 |   // x^2 (trades a multiply for a square)
541 |   residue_narrow_t x2;
542 |   // rho_k = x^((t^k - 1)/(t - 1))
543 |   // rho_1 = x
544 |   residue_narrow_t rho_2, rho_4, rho_8, rho_9;
545 |   residue_narrow_t result_t;
546 | 
547 |   raise_to_t_minus_1_over_4(&x_t_minus_1_over_4, x);
548 |   nsquare_narrow(&x_t_minus_1, &x_t_minus_1_over_4, 2);
549 |   square_narrow_i(&x2, x);
550 |   mul_narrow_i(&rho_2, &x_t_minus_1, &x2);
551 |   raise_to_t2(&rho_4, &rho_2);
552 |   mul_narrow_i(&rho_4, &rho_4, &rho_2);
553 |   raise_to_t2(&rho_8, &rho_4);
554 |   raise_to_t2(&rho_8, &rho_8);
555 |   mul_narrow_i(&rho_8, &rho_8, &rho_4);
556 |   raise_to_t(&rho_9, &rho_8);
557 |   mul_narrow_i(&rho_9, &rho_9, x);
558 |   raise_to_t2(&result_t, &rho_9);
559 |   mul_narrow_i(result, &result_t, &x_t_minus_1);
560 | }
561 | 
562 | void encode(uint8_t *out, const residue_narrow_reduced_t * __restrict x) {
563 |   uint32_t collect = x->limbs[0];
564 | 
565 |   int space = 32 - TBITS;
566 |   int i = 1;
567 |   int bits_remaining = TBITS * NLIMBS_REDUCED;
568 |   while (bits_remaining > 0) {
569 |     *out++ = collect & 0xff;
570 |     collect >>= 8;
571 |     space += 8;
572 |     bits_remaining -= 8;
573 |     if (space >= TBITS && i < NLIMBS_REDUCED) {
574 |       collect |= x->limbs[i] << (32 - space);
575 |       space -= TBITS;
576 |       ++i;
577 |     }
578 |   }
579 | }
580 | 
581 | void decode(residue_narrow_reduced_t *out, const uint8_t *in) {
582 |   uint32_t collect = 0;
583 | 
584 |   int shift = 0;
585 |   int i = 0;
586 |   int bits_remaining = TBITS * NLIMBS_REDUCED;
587 |   while (bits_remaining > 0) {
588 |     collect |= (*in++) << shift;
589 |     shift += 8;
590 |     bits_remaining -= 8;
591 |     if (shift >= TBITS) {
592 |       if (bits_remaining > 0) {
593 |         out->limbs[i] = collect & TMASK;
594 |         collect >>= 26;
595 |         shift -= 26;
596 |         ++i;
597 |       } else {
598 |         out->limbs[i] = collect;
599 |       }
600 |     }
601 |   }
602 | }
603 | 


--------------------------------------------------------------------------------
/avx512/src/main.c:
--------------------------------------------------------------------------------
1 | ../../ref/src/main.c


--------------------------------------------------------------------------------
/avx512/src/scalar.c:
--------------------------------------------------------------------------------
1 | ../../ref/src/scalar.c


--------------------------------------------------------------------------------
/avx512/src/sign.c:
--------------------------------------------------------------------------------
1 | ../../ref/src/sign.c


--------------------------------------------------------------------------------
/ref/Makefile:
--------------------------------------------------------------------------------
  1 | #### PROJECT SETTINGS ####
  2 | # The name of the executable to be created
  3 | BIN_NAME := p11_260_test
  4 | # Compiler used
  5 | CC = clang-10
  6 | # Extension of source files used in the project
  7 | SRC_EXT = c
  8 | # Path to the source directory, relative to the makefile
  9 | SRC_PATH = src
 10 | # Space-separated pkg-config libraries used by this project
 11 | LIBS =
 12 | # General compiler flags
 13 | COMPILE_FLAGS = -march=haswell -std=c11 -Wall -Wextra
 14 | # Additional release-specific flags
 15 | RCOMPILE_FLAGS = -O2 -D DEBUG -g
 16 | # Additional debug-specific flags
 17 | DCOMPILE_FLAGS = -g -D DEBUG
 18 | # Add additional include paths
 19 | INCLUDES = -Iinclude -isystem /usr/include/bsd -DLIBBSD_OVERLAY
 20 | # General linker settings
 21 | LINK_FLAGS = -lbsd -lb2
 22 | # Additional release-specific linker settings
 23 | RLINK_FLAGS =
 24 | # Additional debug-specific linker settings
 25 | DLINK_FLAGS =
 26 | # Destination directory, like a jail or mounted system
 27 | DESTDIR = /
 28 | # Install path (bin/ is appended automatically)
 29 | INSTALL_PREFIX = home/kyle/.local
 30 | #### END PROJECT SETTINGS ####
 31 | 
 32 | # Optionally you may move the section above to a separate config.mk file, and
 33 | # uncomment the line below
 34 | # include config.mk
 35 | 
 36 | # Generally should not need to edit below this line
 37 | 
 38 | # Obtains the OS type, either 'Darwin' (OS X) or 'Linux'
 39 | UNAME_S:=$(shell uname -s)
 40 | 
 41 | # Function used to check variables. Use on the command line:
 42 | # make print-VARNAME
 43 | # Useful for debugging and adding features
 44 | print-%: ; @echo $*=$($*)
 45 | 
 46 | # Shell used in this makefile
 47 | # bash is used for 'echo -en'
 48 | SHELL = /bin/bash
 49 | # Clear built-in rules
 50 | .SUFFIXES:
 51 | # Programs for installation
 52 | INSTALL = install
 53 | INSTALL_PROGRAM = $(INSTALL)
 54 | INSTALL_DATA = $(INSTALL) -m 644
 55 | 
 56 | # Append pkg-config specific libraries if need be
 57 | ifneq ($(LIBS),)
 58 | 	COMPILE_FLAGS += $(shell pkg-config --cflags $(LIBS))
 59 | 	LINK_FLAGS += $(shell pkg-config --libs $(LIBS))
 60 | endif
 61 | 
 62 | # Verbose option, to output compile and link commands
 63 | export V := false
 64 | export CMD_PREFIX := @
 65 | ifeq ($(V),true)
 66 | 	CMD_PREFIX :=
 67 | endif
 68 | 
 69 | # Combine compiler and linker flags
 70 | release: export CFLAGS := $(CFLAGS) $(COMPILE_FLAGS) $(RCOMPILE_FLAGS)
 71 | release: export LDFLAGS := $(LDFLAGS) $(LINK_FLAGS) $(RLINK_FLAGS)
 72 | debug: export CFLAGS := $(CFLAGS) $(COMPILE_FLAGS) $(DCOMPILE_FLAGS)
 73 | debug: export LDFLAGS := $(LDFLAGS) $(LINK_FLAGS) $(DLINK_FLAGS)
 74 | 
 75 | # Build and output paths
 76 | release: export BUILD_PATH := build/release
 77 | release: export BIN_PATH := bin/release
 78 | debug: export BUILD_PATH := build/debug
 79 | debug: export BIN_PATH := bin/debug
 80 | install: export BIN_PATH := bin/release
 81 | 
 82 | # Find all source files in the source directory, sorted by most
 83 | # recently modified
 84 | ifeq ($(UNAME_S),Darwin)
 85 | 	SOURCES = $(shell find $(SRC_PATH) -name '*.$(SRC_EXT)' | sort -k 1nr | cut -f2-)
 86 | else
 87 | 	SOURCES = $(shell find $(SRC_PATH) -name '*.$(SRC_EXT)' -printf '%T@\t%p\n' \
 88 | 						| sort -k 1nr | cut -f2-)
 89 | endif
 90 | 
 91 | # fallback in case the above fails
 92 | rwildcard = $(foreach d, $(wildcard $1*), $(call rwildcard,$d/,$2) \
 93 | 						$(filter $(subst *,%,$2), $d))
 94 | ifeq ($(SOURCES),)
 95 | 	SOURCES := $(call rwildcard, $(SRC_PATH), *.$(SRC_EXT))
 96 | endif
 97 | 
 98 | # Set the object file names, with the source directory stripped
 99 | # from the path, and the build path prepended in its place
100 | OBJECTS = $(SOURCES:$(SRC_PATH)/%.$(SRC_EXT)=$(BUILD_PATH)/%.o)
101 | # Set the dependency files that will be used to add header dependencies
102 | DEPS = $(OBJECTS:.o=.d)
103 | 
104 | # Macros for timing compilation
105 | ifeq ($(UNAME_S),Darwin)
106 | 	CUR_TIME = awk 'BEGIN{srand(); print srand()}'
107 | 	TIME_FILE = $(dir $@).$(notdir $@)_time
108 | 	START_TIME = $(CUR_TIME) > $(TIME_FILE)
109 | 	END_TIME = read st < $(TIME_FILE) ; \
110 | 		$(RM) $(TIME_FILE) ; \
111 | 		st=$$((`$(CUR_TIME)` - $$st)) ; \
112 | 		echo $$st
113 | else
114 | 	TIME_FILE = $(dir $@).$(notdir $@)_time
115 | 	START_TIME = date '+%s' > $(TIME_FILE)
116 | 	END_TIME = read st < $(TIME_FILE) ; \
117 | 		$(RM) $(TIME_FILE) ; \
118 | 		st=$$((`date '+%s'` - $$st - 86400)) ; \
119 | 		echo `date -u -d @$$st '+%H:%M:%S'`
120 | endif
121 | 
122 | # Version macros
123 | # Comment/remove this section to remove versioning
124 | USE_VERSION := false
125 | # If this isn't a git repo or the repo has no tags, git describe will return non-zero
126 | ifeq ($(shell git describe > /dev/null 2>&1 ; echo $$?), 0)
127 | 	USE_VERSION := true
128 | 	VERSION := $(shell git describe --tags --long --dirty --always | \
129 | 		sed 's/v\([0-9]*\)\.\([0-9]*\)\.\([0-9]*\)-\?.*-\([0-9]*\)-\(.*\)/\1 \2 \3 \4 \5/g')
130 | 	VERSION_MAJOR := $(word 1, $(VERSION))
131 | 	VERSION_MINOR := $(word 2, $(VERSION))
132 | 	VERSION_PATCH := $(word 3, $(VERSION))
133 | 	VERSION_REVISION := $(word 4, $(VERSION))
134 | 	VERSION_HASH := $(word 5, $(VERSION))
135 | 	VERSION_STRING := \
136 | 		"$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH).$(VERSION_REVISION)-$(VERSION_HASH)"
137 | 	override CFLAGS := $(CFLAGS) \
138 | 		-D VERSION_MAJOR=$(VERSION_MAJOR) \
139 | 		-D VERSION_MINOR=$(VERSION_MINOR) \
140 | 		-D VERSION_PATCH=$(VERSION_PATCH) \
141 | 		-D VERSION_REVISION=$(VERSION_REVISION) \
142 | 		-D VERSION_HASH=\"$(VERSION_HASH)\"
143 | endif
144 | 
145 | # Standard, non-optimized release build
146 | .PHONY: release
147 | release: dirs
148 | ifeq ($(USE_VERSION), true)
149 | 	@echo "Beginning release build v$(VERSION_STRING)"
150 | else
151 | 	@echo "Beginning release build"
152 | endif
153 | 	@$(MAKE) all --no-print-directory
154 | 
155 | # Debug build for gdb debugging
156 | .PHONY: debug
157 | debug: dirs
158 | ifeq ($(USE_VERSION), true)
159 | 	@echo "Beginning debug build v$(VERSION_STRING)"
160 | else
161 | 	@echo "Beginning debug build"
162 | endif
163 | 	@$(MAKE) all --no-print-directory
164 | 
165 | # Create the directories used in the build
166 | .PHONY: dirs
167 | dirs:
168 | 	@echo "Creating directories"
169 | 	@mkdir -p $(dir $(OBJECTS))
170 | 	@mkdir -p $(BIN_PATH)
171 | 
172 | # Installs to the set path
173 | .PHONY: install
174 | install:
175 | 	@echo "Installing to $(DESTDIR)$(INSTALL_PREFIX)/bin"
176 | 	@$(INSTALL_PROGRAM) $(BIN_PATH)/$(BIN_NAME) $(DESTDIR)$(INSTALL_PREFIX)/bin
177 | 
178 | # Uninstalls the program
179 | .PHONY: uninstall
180 | uninstall:
181 | 	@echo "Removing $(DESTDIR)$(INSTALL_PREFIX)/bin/$(BIN_NAME)"
182 | 	@$(RM) $(DESTDIR)$(INSTALL_PREFIX)/bin/$(BIN_NAME)
183 | 
184 | # Removes all build files
185 | .PHONY: clean
186 | clean:
187 | 	@echo "Deleting $(BIN_NAME) symlink"
188 | 	@$(RM) $(BIN_NAME)
189 | 	@echo "Deleting directories"
190 | 	@$(RM) -r build
191 | 	@$(RM) -r bin
192 | 
193 | # Main rule, checks the executable and symlinks to the output
194 | all: $(BIN_PATH)/$(BIN_NAME)
195 | 	@echo "Making symlink: $(BIN_NAME) -> $<"
196 | 	@$(RM) $(BIN_NAME)
197 | 	@ln -s $(BIN_PATH)/$(BIN_NAME) $(BIN_NAME)
198 | 
199 | # Link the executable
200 | $(BIN_PATH)/$(BIN_NAME): $(OBJECTS)
201 | 	@echo "Linking: $@"
202 | 	@$(START_TIME)
203 | 	$(CMD_PREFIX)$(CC) $(OBJECTS) $(LDFLAGS) -o $@
204 | 	@echo -en "\t Link time: "
205 | 	@$(END_TIME)
206 | 
207 | # Add dependency files, if they exist
208 | -include $(DEPS)
209 | 
210 | # Source file rules
211 | # After the first compilation they will be joined with the rules from the
212 | # dependency files to provide header dependencies
213 | $(BUILD_PATH)/%.o: $(SRC_PATH)/%.$(SRC_EXT)
214 | 	@echo "Compiling: $< -> $@"
215 | 	$(CMD_PREFIX)$(CC) $(CFLAGS) $(INCLUDES) -MP -MMD -c $< -o $@
216 | 


--------------------------------------------------------------------------------
/ref/api.h:
--------------------------------------------------------------------------------
1 | #define CRYPTO_SECRETKEYBYTES 66
2 | #define CRYPTO_PUBLICKEYBYTES 33
3 | #define CRYPTO_BYTES 65
4 | #define CRYPTO_VERSION "1.0"
5 | 


--------------------------------------------------------------------------------
/ref/include/comb.h:
--------------------------------------------------------------------------------
 1 | #ifndef COMB_H
 2 | #define COMB_H
 3 | 
 4 | #include "curve.h"
 5 | #include "scalar.h"
 6 | 
 7 | #define COMB_TABLE_SIZE 16
 8 | #define COMB_TEETH 5
 9 | #define COMB_COUNT 4
10 | #define COMB_SEPARATION 13
11 | #define COMB_LOOKUP_MASK 0xf
12 | 
13 | // A single comb table.
14 | typedef struct sabs_single_comb {
15 |   extended_affine_pt_readd_narrow_t table[COMB_TABLE_SIZE];
16 | } sabs_single_comb_t;
17 | 
18 | // A single narrow comb table. Used in computing a narrow comb table.
19 | typedef struct sabs_single_comb_narrow {
20 |   projective_pt_narrow_t table[COMB_TABLE_SIZE];
21 | } sabs_single_comb_narrow_t;
22 | 
23 | // A comb set. There is a precomputed comb set for the base point, but for
24 | // verifications of several signatures from the same key, it would be
25 | // advantageous to precompute a comb.
26 | typedef struct sabs_comb_set {
27 |   sabs_single_comb_t combs[COMB_COUNT];
28 | } sabs_comb_set_t;
29 | 
30 | // An unreduced comb set. Used just to separate the logic of comb computation
31 | // from comb reduction.
32 | typedef struct sabs_comb_set_narrow {
33 |   sabs_single_comb_narrow_t combs[COMB_COUNT];
34 | } sabs_comb_set_narrow_t;
35 | 
36 | // used for computing the entries in the comb table.
37 | typedef struct teeth_set {
38 |   // We don't need the lowest tooth to compute the entries, because for signed
39 |   // all bits set, to change the bit, you add or subtract a value of 2*bit.
40 |   extended_pt_readd_narrow_t teeth[COMB_TEETH - 1];
41 | } teeth_set_t;
42 | 
43 | // The base comb used for fast signatures.
44 | sabs_comb_set_t base_comb;
45 | 
46 | // Compute a comb set for a given point.
47 | void compute_comb_set(
48 |   sabs_comb_set_t *result, const affine_pt_narrow_t *base_pt);
49 | 
50 | // Helper function used to compute a comb set.
51 | void reduce_comb_set(sabs_comb_set_t *result, sabs_comb_set_narrow_t *source);
52 | 
53 | // Constant time multiplication of a scalar times a point given the point's
54 | // comb.
55 | void scalar_comb_multiply(
56 |   projective_pt_narrow_t *result, const sabs_comb_set_t * __restrict comb,
57 |   const scalar_t * __restrict n);
58 | 
59 | // Non-Constant time multiplication of a scalar times a point given the point's
60 | // comb. Can be safely used during signature verification because there are no
61 | // secrets during verification.
62 | void scalar_comb_multiply_unsafe(
63 |   projective_pt_narrow_t *result, const sabs_comb_set_t * __restrict comb,
64 |   const scalar_t * __restrict n);
65 | #endif
66 | 


--------------------------------------------------------------------------------
/ref/include/constant_time.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include "f11_260.h"
 3 | #include "curve.h"
 4 | 
 5 | static inline void mask_copy_narrow(
 6 |   int32_t mask, residue_narrow_t *result,
 7 |   const residue_narrow_t *x) {
 8 | 
 9 |   #pragma clang loop unroll(full)
10 |   for (int i = 0; i < NLIMBS; ++i) {
11 |     result->limbs[i] |= x->limbs[i] & mask;
12 |   }
13 | }
14 | 
15 | void constant_time_extended_narrow_lookup(
16 |   extended_pt_readd_narrow_t *result, int i, int n,
17 |   const extended_pt_readd_narrow_t *table) {
18 | 
19 |   #pragma clang loop unroll(full)
20 |   for (int i = 0; i < NLIMBS; ++i) {
21 |     result->x.limbs[i] = 0;
22 |     result->dt.limbs[i] = 0;
23 |     result->y.limbs[i] = 0;
24 |     result->z.limbs[i] = 0;
25 |   }
26 |   for (int j = 0; j < n; ++j) {
27 |     int32_t mask = -(i == j);
28 | 
29 |     mask_copy_narrow(mask, &result->x, &table[j].x);
30 |     mask_copy_narrow(mask, &result->dt, &table[j].dt);
31 |     mask_copy_narrow(mask, &result->y, &table[j].y);
32 |     mask_copy_narrow(mask, &result->z, &table[j].z);
33 |   }
34 | }
35 | 
36 | void constant_time_extended_affine_narrow_lookup(
37 |   extended_affine_pt_readd_narrow_t *result, int i, int n,
38 |   const extended_affine_pt_readd_narrow_t *table) {
39 | 
40 |   #pragma clang loop unroll(full)
41 |   for (int i = 0; i < NLIMBS; ++i) {
42 |     result->x.limbs[i] = 0;
43 |     result->dt.limbs[i] = 0;
44 |     result->y.limbs[i] = 0;
45 |   }
46 | 
47 |   for (int j = 0; j < n; ++j) {
48 |     int32_t mask = -(i == j);
49 | 
50 |     mask_copy_narrow(mask, &result->x, &table[j].x);
51 |     mask_copy_narrow(mask, &result->dt, &table[j].dt);
52 |     mask_copy_narrow(mask, &result->y, &table[j].y);
53 |   }
54 | }
55 | 
56 | void constant_time_cond_extended_negate(
57 |   extended_pt_readd_narrow_t *x, int32_t mask) {
58 |   #pragma clang loop unroll(full)
59 |   for (int i = 0; i < NLIMBS; ++i) {
60 |     x->x.limbs[i] = (x->x.limbs[i] & ~mask) | ((-x->x.limbs[i]) & mask);
61 |   }
62 |   #pragma clang loop unroll(full)
63 |   for (int i = 0; i < NLIMBS; ++i) {
64 |     x->dt.limbs[i] = (x->dt.limbs[i] & ~mask) | ((-x->dt.limbs[i]) & mask);
65 |   }
66 | }
67 | 
68 | void constant_time_cond_extended_affine_negate(
69 |   extended_affine_pt_readd_narrow_t *x, int32_t mask) {
70 |   for (int i = 0; i < NLIMBS; ++i) {
71 |     x->x.limbs[i] = (x->x.limbs[i] & ~mask) | ((-x->x.limbs[i]) & mask);
72 |   }
73 |   for (int i = 0; i < NLIMBS; ++i) {
74 |     x->dt.limbs[i] = (x->dt.limbs[i] & ~mask) | ((-x->dt.limbs[i]) & mask);
75 |   }
76 | }
77 | 


--------------------------------------------------------------------------------
/ref/include/constant_time.h:
--------------------------------------------------------------------------------
 1 | #ifndef CONSTANT_TIME_H
 2 | #define CONSTANT_TIME_H
 3 | #include <stdint.h>
 4 | #include "f11_260.h"
 5 | #include "curve.h"
 6 | 
 7 | inline void constant_time_extended_narrow_lookup(
 8 |   extended_pt_readd_narrow_t *result, int i, int n,
 9 |   const extended_pt_readd_narrow_t *table);
10 | 
11 | inline void constant_time_extended_affine_narrow_lookup(
12 |   extended_affine_pt_readd_narrow_t *result, int i, int n,
13 |   const extended_affine_pt_readd_narrow_t *table);
14 | 
15 | inline void constant_time_cond_extended_negate(
16 |   extended_pt_readd_narrow_t *x, int32_t mask);
17 | 
18 | inline void constant_time_cond_extended_affine_negate(
19 |   extended_affine_pt_readd_narrow_t *x, int32_t mask);
20 | #endif
21 | 


--------------------------------------------------------------------------------
/ref/include/curve.c:
--------------------------------------------------------------------------------
  1 | #include <string.h>
  2 | #include "f11_260.h"
  3 | #include "scalar.h"
  4 | #include "curve.h"
  5 | #include "constant_time.h"
  6 | 
  7 | __attribute__((__aligned__(64)))
  8 | const affine_pt_narrow_t B = {
  9 |   .x = {
 10 |     .limbs = {
 11 |       0x2862b8b, 0x0f08ed2, 0x06e65ee, 0x0c05991, 0x2b12b17,
 12 |       0x0049432, 0x33a3707, 0x16e5186, 0x2947e71, 0x0ed9bab,
 13 |       0,
 14 |     },
 15 |   },
 16 |   .y = {
 17 |     .limbs = {
 18 |       0x4,
 19 |     },
 20 |   },
 21 | };
 22 | 
 23 | void copy_projective_pt_narrow(
 24 |   projective_pt_narrow_t *result, const projective_pt_narrow_t *source) {
 25 | 
 26 |   for(int i = 0; i < NLIMBS; ++i) {
 27 |     result->x.limbs[i] = source->x.limbs[i];
 28 |     result->y.limbs[i] = source->y.limbs[i];
 29 |     result->z.limbs[i] = source->z.limbs[i];
 30 |   }
 31 | }
 32 | 
 33 | void copy_extended_pt_narrow(
 34 |   extended_pt_narrow_t *result,
 35 |   const extended_pt_narrow_t *source) {
 36 | 
 37 |   for(int i = 0; i < NLIMBS; ++i) {
 38 |     result->x.limbs[i] = source->x.limbs[i];
 39 |     result->y.limbs[i] = source->y.limbs[i];
 40 |     result->t.limbs[i] = source->t.limbs[i];
 41 |     result->z.limbs[i] = source->z.limbs[i];
 42 |   }
 43 | }
 44 | 
 45 | void copy_extended_pt_readd_narrow(
 46 |   extended_pt_readd_narrow_t *result,
 47 |   const extended_pt_readd_narrow_t *source) {
 48 |   for(int i = 0; i < NLIMBS; ++i) {
 49 |     result->x.limbs[i] = source->x.limbs[i];
 50 |     result->y.limbs[i] = source->y.limbs[i];
 51 |     result->dt.limbs[i] = source->dt.limbs[i];
 52 |     result->z.limbs[i] = source->z.limbs[i];
 53 |   }
 54 | }
 55 | 
 56 | void copy_extended_affine_pt_readd_narrow(
 57 |   extended_affine_pt_readd_narrow_t *result,
 58 |   const extended_affine_pt_readd_narrow_t *source) {
 59 |   for(int i = 0; i < NLIMBS; ++i) {
 60 |     result->x.limbs[i] = source->x.limbs[i];
 61 |     result->y.limbs[i] = source->y.limbs[i];
 62 |     result->dt.limbs[i] = source->dt.limbs[i];
 63 |   }
 64 | }
 65 | 
 66 | void negate_extended_pt_readd_narrow(
 67 |   extended_pt_readd_narrow_t *result,
 68 |   const extended_pt_readd_narrow_t *source) {
 69 |   for(int i = 0; i < NLIMBS; ++i) {
 70 |     result->x.limbs[i] = -source->x.limbs[i];
 71 |     result->y.limbs[i] = source->y.limbs[i];
 72 |     result->dt.limbs[i] = -source->dt.limbs[i];
 73 |     result->z.limbs[i] = source->z.limbs[i];
 74 |   }
 75 | }
 76 | 
 77 | void negate_extended_affine_pt_readd_narrow(
 78 |   extended_affine_pt_readd_narrow_t *result,
 79 |   const extended_affine_pt_readd_narrow_t *source) {
 80 |   for(int i = 0; i < NLIMBS; ++i) {
 81 |     result->x.limbs[i] = -source->x.limbs[i];
 82 |     result->dt.limbs[i] = -source->dt.limbs[i];
 83 |     result->y.limbs[i] = source->y.limbs[i];
 84 |   }
 85 | }
 86 | 
 87 | void affine_narrow_to_extended(
 88 |   extended_pt_narrow_t *result,
 89 |   const affine_pt_narrow_t * __restrict x) {
 90 | 
 91 |   for(int i = 0; i < NLIMBS; ++i) {
 92 |     result->x.limbs[i] = x->x.limbs[i];
 93 |     result->y.limbs[i] = x->y.limbs[i];
 94 |     result->z.limbs[i] = 0;
 95 |   }
 96 |   result->z.limbs[0] = 1;
 97 |   mul_narrow(&result->t, &result->x, &result->y);
 98 | }
 99 | 
100 | void extended_to_projective_narrow(
101 |   projective_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x) {
102 |   for(int i = 0; i < NLIMBS; ++i) {
103 |     result->x.limbs[i] = x->x.limbs[i];
104 |     result->y.limbs[i] = x->y.limbs[i];
105 |     result->z.limbs[i] = x->z.limbs[i];
106 |   }
107 | }
108 | 
109 | void affine_to_readd_narrow(
110 |   extended_pt_readd_narrow_t *result,
111 |   const affine_pt_narrow_t * __restrict x) {
112 | 
113 |   for(int i = 0; i < NLIMBS; ++i) {
114 |     result->x.limbs[i] = x->x.limbs[i];
115 |     result->y.limbs[i] = x->y.limbs[i];
116 |     result->z.limbs[i] = 0;
117 |   }
118 |   result->z.limbs[0] = 1;
119 | 
120 |   residue_narrow_t xy;
121 |   mul_narrow(&xy, &x->x, &x->y);
122 |   mul_narrow_const(&result->dt, &xy, D);
123 | }
124 | 
125 | void extended_to_readd_narrow_neg(
126 |   extended_pt_readd_narrow_t *result,
127 |   const extended_pt_narrow_t * __restrict x) {
128 | 
129 |   for(int i = 0; i < NLIMBS; ++i) {
130 |     result->x.limbs[i] = -(x->x.limbs[i]);
131 |     result->y.limbs[i] = x->y.limbs[i];
132 |     result->z.limbs[i] = x->z.limbs[i];
133 |   }
134 |   mul_narrow_const(&result->dt, &x->t, -D);
135 | }
136 | 
137 | void affine_double(
138 |   projective_pt_narrow_t *result,
139 |   const affine_pt_narrow_t * __restrict x) {
140 | 
141 |   residue_narrow_t x_plus_y;
142 |   residue_narrow_t a, b, e, e_tmp, g, g_minus_2, h;
143 |   square_narrow(&a, &x->x);
144 |   square_narrow(&b, &x->y);
145 | 
146 |   add_narrow(&x_plus_y, &x->x, &x->y);
147 | 
148 |   square_narrow(&e, &x_plus_y);
149 |   sub_narrow(&e_tmp, &e, &a);
150 |   sub_narrow(&e, &e_tmp, &b);
151 |   add_narrow(&g, &a, &b);
152 | 
153 |   for (int i = 0; i < NLIMBS; ++i) {
154 |     g_minus_2.limbs[i] = g.limbs[i];
155 |   }
156 |   g_minus_2.limbs[0] -= 2;
157 | 
158 |   sub_narrow(&h, &a, &b);
159 |   mul_narrow(&result->x, &e, &g_minus_2);
160 |   mul_narrow(&result->y, &g, &h);
161 |   mul_narrow(&result->z, &g, &g_minus_2);
162 | }
163 | 
164 | void affine_double_extended(
165 |   extended_pt_narrow_t *result, const affine_pt_narrow_t * __restrict x) {
166 | 
167 |   residue_narrow_t x_plus_y;
168 |   residue_narrow_t a, b, e, e_tmp, g, g_minus_2, h;
169 |   square_narrow(&a, &x->x);
170 |   square_narrow(&b, &x->y);
171 | 
172 |   add_narrow(&x_plus_y, &x->x, &x->y);
173 |   square_narrow(&e, &x_plus_y);
174 |   sub_narrow(&e_tmp, &e, &a);
175 |   sub_narrow(&e, &e_tmp, &b);
176 |   add_narrow(&g, &a, &b);
177 | 
178 |   for (int i = 0; i < NLIMBS; ++i) {
179 |     g_minus_2.limbs[i] = g.limbs[i];
180 |   }
181 |   g_minus_2.limbs[0] -= 2;
182 | 
183 |   sub_narrow(&h, &a, &b);
184 |   mul_narrow(&result->x, &e, &g_minus_2);
185 |   mul_narrow(&result->y, &g, &h);
186 |   mul_narrow(&result->t, &e, &h);
187 |   mul_narrow(&result->z, &g, &g_minus_2);
188 | }
189 | 
190 | void projective_double(
191 |   projective_pt_narrow_t *result, const projective_pt_narrow_t *x) {
192 | 
193 |   residue_narrow_t x_plus_y;
194 |   residue_narrow_t a, b, c, c_temp, e, e_tmp, f, g, h;
195 |   add_narrow(&x_plus_y, &x->x, &x->y);
196 |   square_narrow(&a, &x->x);
197 |   square_narrow(&b, &x->y);
198 |   square_narrow(&c_temp, &x->z);
199 |   double_narrow(&c, &c_temp);
200 | 
201 |   square_narrow(&e, &x_plus_y);
202 |   sub_narrow(&e_tmp, &e, &a);
203 |   sub_narrow(&e, &e_tmp, &b);
204 |   add_narrow(&g, &a, &b);
205 |   sub_narrow(&f, &g, &c);
206 |   sub_narrow(&h, &a, &b);
207 | 
208 |   mul_narrow(&result->x, &e, &f);
209 |   mul_narrow(&result->y, &g, &h);
210 |   mul_narrow(&result->z, &f, &g);
211 | }
212 | 
213 | void projective_double_extended(
214 |   extended_pt_narrow_t *result, const projective_pt_narrow_t * __restrict x) {
215 | 
216 |   residue_narrow_t x_plus_y;
217 |   residue_narrow_t a, b, c, c_temp, e, e_tmp, f, g, h;
218 |   add_narrow(&x_plus_y, &x->x, &x->y);
219 |   square_narrow(&a, &x->x);
220 |   square_narrow(&b, &x->y);
221 |   square_narrow(&c_temp, &x->z);
222 |   double_narrow(&c, &c_temp);
223 | 
224 |   square_narrow(&e, &x_plus_y);
225 |   sub_narrow(&e_tmp, &e, &a);
226 |   sub_narrow(&e, &e_tmp, &b);
227 |   add_narrow(&g, &a, &b);
228 |   sub_narrow(&f, &g, &c);
229 |   sub_narrow(&h, &a, &b);
230 | 
231 |   mul_narrow(&result->x, &e, &f);
232 |   mul_narrow(&result->y, &g, &h);
233 |   mul_narrow(&result->t, &e, &h);
234 |   mul_narrow(&result->z, &f, &g);
235 | }
236 | 
237 | void extended_double_extended(
238 |   extended_pt_narrow_t *result, const extended_pt_narrow_t *x) {
239 | 
240 |   residue_narrow_t x_plus_y;
241 |   residue_narrow_t a, b, c, c_temp, e, e_tmp, f, g, h;
242 |   add_narrow(&x_plus_y, &x->x, &x->y);
243 |   square_narrow(&a, &x->x);
244 |   square_narrow(&b, &x->y);
245 |   square_narrow(&c_temp, &x->z);
246 |   double_narrow(&c, &c_temp);
247 | 
248 |   square_narrow(&e, &x_plus_y);
249 |   sub_narrow(&e_tmp, &e, &a);
250 |   sub_narrow(&e, &e_tmp, &b);
251 |   add_narrow(&g, &a, &b);
252 |   sub_narrow(&f, &g, &c);
253 |   sub_narrow(&h, &a, &b);
254 | 
255 |   mul_narrow(&result->x, &e, &f);
256 |   mul_narrow(&result->z, &f, &g);
257 |   mul_narrow(&result->y, &g, &h);
258 |   mul_narrow(&result->t, &e, &h);
259 | }
260 | 
261 | void projective_add(
262 |   projective_pt_narrow_t *result, const projective_pt_narrow_t * __restrict x1,
263 |   const projective_pt_narrow_t * __restrict x2) {
264 | 
265 |   residue_narrow_t x1_plus_y1, x2_plus_y2;
266 |   residue_narrow_t a, b, c, d, e, e_temp, f, g, t1, t2;
267 | 
268 |   mul_narrow(&a, &x1->z, &x2->z);
269 |   square_narrow(&b, &a);
270 |   mul_narrow(&c, &x1->x, &x2->x);
271 |   mul_narrow(&d, &x1->y, &x2->y);
272 |   mul_narrow_const(&e_temp, &c, D);
273 |   mul_narrow(&e, &e_temp, &d);
274 | 
275 |   sub_narrow(&f, &b, &e);
276 |   add_narrow(&g, &b, &e);
277 |   add_narrow(&x1_plus_y1, &x1->x, &x1->y);
278 |   add_narrow(&x2_plus_y2, &x2->x, &x2->y);
279 | 
280 |   mul_narrow(&t1, &x1_plus_y1, &x2_plus_y2);
281 |   sub_narrow(&t2, &t1, &c);
282 |   sub_narrow(&t1, &t2, &d);
283 |   mul_narrow(&t2, &t1, &f);
284 |   mul_narrow(&result->x, &t2, &a);
285 | 
286 |   sub_narrow(&t1, &d, &c);
287 |   mul_narrow(&t2, &t1, &g);
288 |   mul_narrow(&result->y, &t2, &a);
289 | 
290 |   mul_narrow(&result->z, &f, &g);
291 | }
292 | 
293 | void extended_add(
294 |   projective_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x1,
295 |   const extended_pt_narrow_t * __restrict x2) {
296 | 
297 |   residue_narrow_t x1_plus_y1, x2_plus_y2;
298 |   residue_narrow_t a, b, c, c_temp, d, e, e_temp, f, g, h;
299 | 
300 |   mul_narrow(&a, &x1->x, &x2->x);
301 |   mul_narrow(&b, &x1->y, &x2->y);
302 |   mul_narrow_const(&c_temp, &x1->t, D);
303 |   mul_narrow(&c, &c_temp, &x2->t);
304 |   mul_narrow(&d, &x1->z, &x2->z);
305 | 
306 |   add_narrow(&x1_plus_y1, &x1->x, &x1->y);
307 |   add_narrow(&x2_plus_y2, &x2->x, &x2->y);
308 |   mul_narrow(&e, &x1_plus_y1, &x2_plus_y2);
309 |   sub_narrow(&e_temp, &e, &a);
310 |   sub_narrow(&e, &e_temp, &b);
311 |   sub_narrow(&f, &d, &c);
312 |   add_narrow(&g, &d, &c);
313 |   sub_narrow(&h, &b, &a);
314 | 
315 |   mul_narrow(&result->x, &e, &f);
316 |   mul_narrow(&result->z, &f, &g);
317 |   mul_narrow(&result->y, &g, &h);
318 | }
319 | 
320 | void extended_add_extended(
321 |   extended_pt_narrow_t *result, const extended_pt_narrow_t *x1,
322 |   const extended_pt_narrow_t *x2) {
323 | 
324 |   residue_narrow_t x1_plus_y1, x2_plus_y2;
325 |   residue_narrow_t a, b, c, c_temp, d, e, e_temp, f, g, h;
326 | 
327 |   mul_narrow(&a, &x1->x, &x2->x);
328 |   mul_narrow(&b, &x1->y, &x2->y);
329 |   mul_narrow_const(&c_temp, &x1->t, D);
330 |   mul_narrow(&c, &c_temp, &x2->t);
331 |   mul_narrow(&d, &x1->z, &x2->z);
332 | 
333 |   add_narrow(&x1_plus_y1, &x1->x, &x1->y);
334 |   add_narrow(&x2_plus_y2, &x2->x, &x2->y);
335 |   mul_narrow(&e, &x1_plus_y1, &x2_plus_y2);
336 |   sub_narrow(&e_temp, &e, &a);
337 |   sub_narrow(&e, &e_temp, &b);
338 |   sub_narrow(&f, &d, &c);
339 |   add_narrow(&g, &d, &c);
340 |   sub_narrow(&h, &b, &a);
341 | 
342 |   mul_narrow(&result->x, &e, &f);
343 |   mul_narrow(&result->z, &f, &g);
344 |   mul_narrow(&result->y, &g, &h);
345 |   mul_narrow(&result->t, &e, &h);
346 | }
347 | 
348 | void extended_readd_narrow_extended(
349 |   extended_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x1,
350 |   const extended_pt_readd_narrow_t * __restrict x2) {
351 | 
352 |   residue_narrow_t x1_plus_y1;
353 |   residue_narrow_t x2_plus_y2;
354 |   residue_narrow_t a, b, c, d, e, e_temp, f, g, h;
355 | 
356 |   mul_narrow(&a, &x1->x, &x2->x);
357 |   mul_narrow(&b, &x1->y, &x2->y);
358 |   mul_narrow(&c, &x1->t, &x2->dt);
359 |   mul_narrow(&d, &x1->z, &x2->z);
360 | 
361 |   add_narrow(&x1_plus_y1, &x1->x, &x1->y);
362 |   add_narrow(&x2_plus_y2, &x2->x, &x2->y);
363 |   mul_narrow(&e, &x1_plus_y1, &x2_plus_y2);
364 |   sub_narrow(&e_temp, &e, &a);
365 |   sub_narrow(&e, &e_temp, &b);
366 |   sub_narrow(&f, &d, &c);
367 |   add_narrow(&g, &d, &c);
368 |   sub_narrow(&h, &b, &a);
369 | 
370 |   mul_narrow(&result->x, &e, &f);
371 |   mul_narrow(&result->z, &f, &g);
372 |   mul_narrow(&result->y, &g, &h);
373 |   mul_narrow(&result->t, &e, &h);
374 | }
375 | 
376 | void extended_readd_narrow(
377 |   projective_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x1,
378 |   const extended_pt_readd_narrow_t * __restrict x2) {
379 | 
380 |   residue_narrow_t x1_plus_y1;
381 |   residue_narrow_t x2_plus_y2;
382 |   residue_narrow_t a, b, c, d, e, e_temp, f, g, h;
383 | 
384 |   mul_narrow(&a, &x1->x, &x2->x);
385 |   mul_narrow(&b, &x1->y, &x2->y);
386 |   mul_narrow(&c, &x1->t, &x2->dt);
387 |   mul_narrow(&d, &x1->z, &x2->z);
388 | 
389 |   add_narrow(&x1_plus_y1, &x1->x, &x1->y);
390 |   add_narrow(&x2_plus_y2, &x2->x, &x2->y);
391 |   mul_narrow(&e, &x1_plus_y1, &x2_plus_y2);
392 |   sub_narrow(&e_temp, &e, &a);
393 |   sub_narrow(&e, &e_temp, &b);
394 |   sub_narrow(&f, &d, &c);
395 |   add_narrow(&g, &d, &c);
396 |   sub_narrow(&h, &b, &a);
397 | 
398 |   mul_narrow(&result->x, &e, &f);
399 |   mul_narrow(&result->z, &f, &g);
400 |   mul_narrow(&result->y, &g, &h);
401 | }
402 | 
403 | #include <stdio.h>
404 | // static void print_narrow(const residue_narrow_t *x, const char *prefix) {
405 | //   printf("%s\n", prefix);
406 | //   for (int i = 0; i < NLIMBS; ++i) {
407 | //     printf("%#x\n", x->limbs[i]);
408 | //   }
409 | // }
410 | 
411 | void extended_readd_affine_narrow_extended(
412 |   extended_pt_narrow_t *result, const extended_pt_narrow_t *x1,
413 |   const extended_affine_pt_readd_narrow_t * __restrict x2) {
414 | 
415 |   residue_narrow_t x1_plus_y1;
416 |   residue_narrow_t x2_plus_y2;
417 |   residue_narrow_t a, b, c, e, e_temp, f, g, h;
418 | 
419 |   mul_narrow(&a, &x1->x, &x2->x);
420 |   mul_narrow(&b, &x1->y, &x2->y);
421 |   mul_narrow(&c, &x1->t, &x2->dt);
422 | 
423 |   add_narrow(&x1_plus_y1, &x1->x, &x1->y);
424 |   add_narrow(&x2_plus_y2, &x2->x, &x2->y);
425 |   mul_narrow(&e, &x1_plus_y1, &x2_plus_y2);
426 |   sub_narrow(&e_temp, &e, &a);
427 |   sub_narrow(&e, &e_temp, &b);
428 |   sub_narrow(&f, &x1->z, &c);
429 |   add_narrow(&g, &x1->z, &c);
430 |   sub_narrow(&h, &b, &a);
431 | 
432 |   mul_narrow(&result->x, &e, &f);
433 |   mul_narrow(&result->z, &f, &g);
434 |   mul_narrow(&result->y, &g, &h);
435 |   mul_narrow(&result->t, &e, &h);
436 | }
437 | 
438 | void extended_readd_readd_narrow(
439 |   extended_pt_readd_narrow_t *result,
440 |   const extended_pt_narrow_t * __restrict x1,
441 |   const extended_pt_readd_narrow_t * __restrict x2) {
442 | 
443 |   residue_narrow_t x1_plus_y1;
444 |   residue_narrow_t x2_plus_y2;
445 |   residue_narrow_t a, b, c, d, e, e_temp, f, g, h, t3;
446 | 
447 |   mul_narrow(&a, &x1->x, &x2->x);
448 |   mul_narrow(&b, &x1->y, &x2->y);
449 |   mul_narrow(&c, &x1->t, &x2->dt);
450 |   mul_narrow(&d, &x1->z, &x2->z);
451 | 
452 |   add_narrow(&x1_plus_y1, &x1->x, &x1->y);
453 |   add_narrow(&x2_plus_y2, &x2->x, &x2->y);
454 |   mul_narrow(&e, &x1_plus_y1, &x2_plus_y2);
455 |   sub_narrow(&e_temp, &e, &a);
456 |   sub_narrow(&e, &e_temp, &b);
457 |   sub_narrow(&f, &d, &c);
458 |   add_narrow(&g, &d, &c);
459 |   sub_narrow(&h, &b, &a);
460 | 
461 |   mul_narrow(&result->x, &e, &f);
462 |   mul_narrow(&result->z, &f, &g);
463 |   mul_narrow(&result->y, &g, &h);
464 |   mul_narrow(&t3, &e, &h);
465 | 
466 |   mul_narrow_const(&result->dt, &t3, D);
467 | }
468 | 
469 | void readd_to_projective(
470 |   projective_pt_narrow_t *result,
471 |   const extended_pt_readd_narrow_t * __restrict x) {
472 | 
473 |   copy_narrow(&result->x, &x->x);
474 |   copy_narrow(&result->y, &x->y);
475 |   copy_narrow(&result->z, &x->z);
476 | }
477 | 
478 | void affine_readd_to_extended(
479 |   extended_pt_narrow_t *result,
480 |   const extended_affine_pt_readd_narrow_t * __restrict x) {
481 | 
482 |   copy_narrow(&result->x, &x->x);
483 |   copy_narrow(&result->y, &x->y);
484 |   mul_narrow(&result->t, &x->x, &x->y);
485 |   for (int i = 0; i < NLIMBS; ++i) {
486 |     result->z.limbs[i] = 0;
487 |   }
488 |   result->z.limbs[0] = 1;
489 | }
490 | 
491 | void scalar_multiply(
492 |   projective_pt_narrow_t *result, const affine_pt_narrow_t * __restrict x,
493 |   const scalar_t * __restrict n) {
494 | 
495 |   scalar_t sabs_n;
496 |   convert_to_sabs(&sabs_n, n);
497 | 
498 |   const int WINDOW_BITS = 5;
499 |   const uint32_t WINDOW_MASK = (1 << WINDOW_BITS) - 1;
500 |   const uint32_t LOOKUP_MASK = WINDOW_MASK >> 1;
501 |   const int TABLE_SIZE = 16;
502 |   extended_pt_readd_narrow_t table[TABLE_SIZE];
503 | 
504 |   extended_pt_narrow_t x2;
505 |   affine_double_extended(&x2, x);
506 |   affine_to_readd_narrow(&table[0], x);
507 |   for (int i = 1; i < TABLE_SIZE; ++i) {
508 |     extended_readd_readd_narrow(&table[i], &x2, &table[i-1]);
509 |   }
510 | 
511 |   int i;
512 |   int first = 1;
513 |   // Set i to the highest i such that
514 |   // a) i < SCALAR_BITS
515 |   // b) i % WINDOW_BITS = 0
516 | 
517 |   projective_pt_narrow_t temp;
518 |   extended_pt_narrow_t temp_ext;
519 |   extended_pt_readd_narrow_t window_pt;
520 | 
521 |   i = SCALAR_BITS - ((SCALAR_BITS - 1) % WINDOW_BITS) - 1;
522 |   for (; i >= 0; i -= WINDOW_BITS) {
523 |     uint32_t bits = sabs_n.limbs[i/SCALAR_LIMB_BITS] >> (i % SCALAR_LIMB_BITS);
524 |     if (i % SCALAR_LIMB_BITS > (SCALAR_LIMB_BITS - WINDOW_BITS) &&
525 |         i / SCALAR_LIMB_BITS < SCALAR_LIMBS - 1) {
526 | 
527 |       bits |= sabs_n.limbs[i/SCALAR_LIMB_BITS + 1] <<
528 |         (SCALAR_LIMB_BITS - i % SCALAR_LIMB_BITS);
529 |     }
530 | 
531 |     bits &= WINDOW_MASK;
532 |     int32_t invert = (bits >> (WINDOW_BITS - 1)) - 1;
533 |     bits ^= invert;
534 | 
535 |     constant_time_extended_narrow_lookup(
536 |       &window_pt, bits & LOOKUP_MASK, TABLE_SIZE, table);
537 |     constant_time_cond_extended_negate(&window_pt, invert);
538 | 
539 |     if (first) {
540 |       readd_to_projective(&temp, &window_pt);
541 |       first = 0;
542 |     } else {
543 |       for (int i = 0; i < WINDOW_BITS - 1; ++i) {
544 |         projective_double(&temp, &temp);
545 |       }
546 |       projective_double_extended(&temp_ext, &temp);
547 |       extended_readd_narrow(&temp, &temp_ext, &window_pt);
548 |     }
549 |   }
550 | 
551 |   copy_projective_pt_narrow(result, &temp);
552 |   explicit_bzero(&sabs_n, sizeof(sabs_n));
553 |   explicit_bzero(&window_pt, sizeof(window_pt));
554 |   explicit_bzero(table, sizeof(table));
555 |   explicit_bzero(&temp, sizeof(temp));
556 |   explicit_bzero(&temp_ext, sizeof(temp_ext));
557 | }
558 | 
559 | void scalar_multiply_unsafe(
560 |   projective_pt_narrow_t *result, const affine_pt_narrow_t * __restrict x,
561 |   const scalar_t * __restrict n) {
562 | 
563 |   scalar_t sabs_n;
564 |   convert_to_sabs(&sabs_n, n);
565 | 
566 |   const int WINDOW_BITS = 5;
567 |   const uint32_t WINDOW_MASK = (1 << WINDOW_BITS) - 1;
568 |   const uint32_t LOOKUP_MASK = WINDOW_MASK >> 1;
569 |   const int TABLE_SIZE = 16;
570 |   extended_pt_readd_narrow_t table[TABLE_SIZE];
571 | 
572 |   extended_pt_narrow_t x2;
573 |   affine_double_extended(&x2, x);
574 |   affine_to_readd_narrow(&table[0], x);
575 |   for (int i = 1; i < TABLE_SIZE; ++i) {
576 |     extended_readd_readd_narrow(&table[i], &x2, &table[i-1]);
577 |   }
578 | 
579 |   int i;
580 |   int first = 1;
581 |   // Set i to the highest i such that
582 |   // a) i < SCALAR_BITS
583 |   // b) i % WINDOW_BITS = 0
584 | 
585 |   projective_pt_narrow_t temp;
586 |   extended_pt_narrow_t temp_ext;
587 |   extended_pt_readd_narrow_t window_pt;
588 | 
589 |   i = SCALAR_BITS - ((SCALAR_BITS - 1) % WINDOW_BITS) - 1;
590 |   for (; i >= 0; i -= WINDOW_BITS) {
591 |     uint32_t bits = sabs_n.limbs[i/SCALAR_LIMB_BITS] >> (i % SCALAR_LIMB_BITS);
592 |     if (i % SCALAR_LIMB_BITS > (SCALAR_LIMB_BITS - WINDOW_BITS) &&
593 |         i / SCALAR_LIMB_BITS < SCALAR_LIMBS - 1) {
594 | 
595 |       bits |= sabs_n.limbs[i/SCALAR_LIMB_BITS + 1] <<
596 |         (SCALAR_LIMB_BITS - i % SCALAR_LIMB_BITS);
597 |     }
598 | 
599 |     bits &= WINDOW_MASK;
600 |     int32_t invert = (bits >> (WINDOW_BITS - 1)) - 1;
601 |     bits ^= invert;
602 | 
603 |     copy_extended_pt_readd_narrow(&window_pt, &table[bits & LOOKUP_MASK]);
604 |     if (invert) {
605 |       negate_extended_pt_readd_narrow(&window_pt, &window_pt);
606 |     }
607 | 
608 |     if (first) {
609 |       readd_to_projective(&temp, &window_pt);
610 |       first = 0;
611 |     } else {
612 |       for (int i = 0; i < WINDOW_BITS - 1; ++i) {
613 |         projective_double(&temp, &temp);
614 |       }
615 |       projective_double_extended(&temp_ext, &temp);
616 |       extended_readd_narrow(&temp, &temp_ext, &window_pt);
617 |     }
618 |   }
619 | 
620 |   copy_projective_pt_narrow(result, &temp);
621 | }
622 | 
623 | int point_decompress(
624 |   affine_pt_narrow_t *result,
625 |   residue_narrow_reduced_t *y, int low_bit) {
626 | 
627 |   residue_narrow_t y_n;
628 | 
629 |   residue_narrow_t u;
630 |   residue_narrow_t v;
631 | 
632 |   residue_narrow_t y2;
633 |   residue_narrow_reduced_t temp;
634 | 
635 |   unnarrow_reduce(&y_n, y);
636 |   square_narrow(&y2, &y_n);
637 |   copy_narrow(&result->y, &y_n);
638 | 
639 |   sub_narrow(&u, &one_narrow, &y2);
640 |   mul_narrow_const(&y2, &y2, D);
641 |   sub_narrow(&v, &one_narrow, &y2);
642 | 
643 |   if (sqrt_inv_narrow(&result->x, &u, &v)) {
644 |     narrow_partial_complete(&temp, &result->x);
645 | 
646 |     int x_is_odd = is_odd(&temp);
647 |     if ((x_is_odd && !low_bit) || (low_bit && !x_is_odd)) {
648 |       negate_narrow(&result->x, &result->x);
649 |     }
650 | 
651 |     return 1;
652 |   }
653 | 
654 |   return 0;
655 | }
656 | 


--------------------------------------------------------------------------------
/ref/include/curve.h:
--------------------------------------------------------------------------------
  1 | #ifndef CURVE_H
  2 | #define CURVE_H
  3 | #include "f11_260.h"
  4 | #include "scalar.h"
  5 | 
  6 | typedef struct affine_pt_narrow {
  7 |   residue_narrow_t x;
  8 |   residue_narrow_t y;
  9 | } affine_pt_narrow_t;
 10 | 
 11 | typedef struct extended_pt_readd_narrow {
 12 |   __attribute__((__aligned__(64)))
 13 |   residue_narrow_t x;
 14 |   residue_narrow_t dt;
 15 |   residue_narrow_t y;
 16 |   residue_narrow_t z;
 17 | } extended_pt_readd_narrow_t;
 18 | 
 19 | typedef struct extended_affine_pt_readd_narrow {
 20 |   __attribute__((__aligned__(64)))
 21 |   residue_narrow_t x;
 22 |   residue_narrow_t dt;
 23 |   residue_narrow_t y;
 24 | } extended_affine_pt_readd_narrow_t;
 25 | 
 26 | // For use in doubling.
 27 | typedef struct projective_pt_narrow {
 28 |   residue_narrow_t x;
 29 |   residue_narrow_t y;
 30 |   residue_narrow_t z;
 31 | } projective_pt_narrow_t;
 32 | 
 33 | // For use in addition.
 34 | typedef struct extended_pt_narrow {
 35 |   residue_narrow_t x;
 36 |   residue_narrow_t y;
 37 |   residue_narrow_t t;
 38 |   residue_narrow_t z;
 39 | } extended_pt_narrow_t;
 40 | 
 41 | #define D (-49142)
 42 | 
 43 | __attribute__((__aligned__(32)))
 44 | const affine_pt_narrow_t B;
 45 | 
 46 | void copy_projective_pt_narrow(
 47 |   projective_pt_narrow_t *result, const projective_pt_narrow_t *source);
 48 | 
 49 | void copy_extended_pt_narrow(
 50 |   extended_pt_narrow_t *result, const extended_pt_narrow_t *source);
 51 | 
 52 | void copy_extended_pt_readd_narrow(
 53 |   extended_pt_readd_narrow_t *result, const extended_pt_readd_narrow_t *source);
 54 | 
 55 | void copy_extended_pt_readd_narrow(
 56 |   extended_pt_readd_narrow_t *result, const extended_pt_readd_narrow_t *source);
 57 | 
 58 | void copy_extended_affine_pt_readd_narrow(
 59 |   extended_affine_pt_readd_narrow_t *result,
 60 |   const extended_affine_pt_readd_narrow_t *source);
 61 | 
 62 | void negate_extended_pt_readd_narrow(
 63 |   extended_pt_readd_narrow_t *result,
 64 |   const extended_pt_readd_narrow_t *source);
 65 | 
 66 | void negate_extended_affine_pt_readd_narrow(
 67 |   extended_affine_pt_readd_narrow_t *result,
 68 |   const extended_affine_pt_readd_narrow_t *source);
 69 | 
 70 | void affine_narrow_to_extended(
 71 |   extended_pt_narrow_t *result,
 72 |   const affine_pt_narrow_t * __restrict x);
 73 | 
 74 | void affine_to_projective(
 75 |   projective_pt_narrow_t *result,
 76 |   const affine_pt_narrow_t * __restrict x);
 77 | 
 78 | void affine_to_readd_narrow(
 79 |   extended_pt_readd_narrow_t *result,
 80 |   const affine_pt_narrow_t * __restrict x);
 81 | 
 82 | void extended_to_readd_narrow_neg(
 83 |   extended_pt_readd_narrow_t *result,
 84 |   const extended_pt_narrow_t * __restrict x);
 85 | 
 86 | void affine_to_readd_narrow(
 87 |   extended_pt_readd_narrow_t *result,
 88 |   const affine_pt_narrow_t * __restrict x);
 89 | 
 90 | void projective_to_extended_narrow(
 91 |   extended_pt_narrow_t *result, projective_pt_narrow_t * __restrict x);
 92 | 
 93 | void extended_to_projective_narrow(
 94 |   projective_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x);
 95 | 
 96 | void readd_to_projective(
 97 |   projective_pt_narrow_t *result,
 98 |   const extended_pt_readd_narrow_t * __restrict x);
 99 | 
100 | void affine_readd_to_extended(
101 |   extended_pt_narrow_t *result,
102 |   const extended_affine_pt_readd_narrow_t * __restrict x);
103 | 
104 | void negate_extended_affine_pt_readd_narrow(
105 |   extended_affine_pt_readd_narrow_t *result,
106 |   const extended_affine_pt_readd_narrow_t *source);
107 | 
108 | void affine_double(
109 |   projective_pt_narrow_t *result,
110 |   const affine_pt_narrow_t * __restrict x);
111 | 
112 | void affine_double_extended(
113 |   extended_pt_narrow_t *result, const affine_pt_narrow_t * __restrict x);
114 | 
115 | void projective_double(
116 |   projective_pt_narrow_t *result, const projective_pt_narrow_t *x);
117 | 
118 | void projective_double_extended(
119 |   extended_pt_narrow_t *result, const projective_pt_narrow_t * __restrict x);
120 | 
121 | void extended_double_extended(
122 |   extended_pt_narrow_t *result, const extended_pt_narrow_t *x);
123 | 
124 | void projective_add(
125 |   projective_pt_narrow_t *result, const projective_pt_narrow_t * __restrict x1,
126 |   const projective_pt_narrow_t * __restrict x2);
127 | 
128 | void extended_add(
129 |   projective_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x,
130 |   const extended_pt_narrow_t * __restrict y);
131 | 
132 | void extended_add_extended(
133 |   extended_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x,
134 |   const extended_pt_narrow_t * __restrict y);
135 | 
136 | void extended_readd_narrow(
137 |   projective_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x,
138 |   const extended_pt_readd_narrow_t * __restrict y);
139 | 
140 | void extended_readd_narrow_extended(
141 |   extended_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x,
142 |   const extended_pt_readd_narrow_t * __restrict y);
143 | 
144 | void extended_readd_affine_narrow_extended(
145 |   extended_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x,
146 |   const extended_affine_pt_readd_narrow_t * __restrict y);
147 | 
148 | void extended_add_extended(
149 |   extended_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x,
150 |   const extended_pt_narrow_t * __restrict y);
151 | 
152 | void extended_readd_readd_narrow(
153 |   extended_pt_readd_narrow_t *result,
154 |   const extended_pt_narrow_t * __restrict x,
155 |   const extended_pt_readd_narrow_t * __restrict y);
156 | 
157 | void extended_readd_narrow_extended(
158 |   extended_pt_narrow_t *result,
159 |   const extended_pt_narrow_t *x1,
160 |   const extended_pt_readd_narrow_t * __restrict x2);
161 | 
162 | void scalar_multiply(
163 |   projective_pt_narrow_t *result, const affine_pt_narrow_t * __restrict x,
164 |   const scalar_t * __restrict n);
165 | 
166 | void scalar_multiply_unsafe(
167 |   projective_pt_narrow_t *result, const affine_pt_narrow_t * __restrict x,
168 |   const scalar_t * __restrict n);
169 | 
170 | int point_decompress(
171 |   affine_pt_narrow_t *result, residue_narrow_reduced_t *y, int low_bit);
172 | #endif
173 | 


--------------------------------------------------------------------------------
/ref/include/f11_260.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | #include "f11_260.h"
  3 | 
  4 | residue_narrow_t zero_narrow = {0};
  5 | residue_narrow_t one_narrow = {
  6 |   .limbs = {1},
  7 | };
  8 | 
  9 | // Shrink to 32 bits. Assumes reduction has already occurred, and wide storage
 10 | // is being used for vector compatibility.
 11 | void narrow(residue_narrow_t *result, const residue_wide_t * __restrict w) {
 12 |   for (int i = 0; i < NLIMBS; ++i) {
 13 |     result->limbs[i] = w->limbs[i];
 14 |   }
 15 | }
 16 | 
 17 | // Reduce to 10 limbs. Useful for debugging
 18 | void narrow_reduce(
 19 |   residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w) {
 20 |   residue_narrow_t temp;
 21 |   for (int i = 0; i < NLIMBS; ++i) {
 22 |     temp.limbs[i] = w->limbs[i] - w->limbs[10];
 23 |   }
 24 | 
 25 |   reduce_step_narrow(&temp, &temp);
 26 | 
 27 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
 28 |     result->limbs[i] = temp.limbs[i] - temp.limbs[10];
 29 |   }
 30 | }
 31 | 
 32 | // Reduce to unique representative.
 33 | // This is expensive. Only used for final signature or DH Key
 34 | void narrow_complete(
 35 |   residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w) {
 36 | 
 37 |   residue_narrow_t temp;
 38 |   for (int i = 0; i < NLIMBS; ++i) {
 39 |     temp.limbs[i] = w->limbs[i] - w->limbs[10];
 40 |   }
 41 | 
 42 |   // This may be combined with the final reduction from a multiply.
 43 |   reduce_step_narrow(&temp, &temp);
 44 | 
 45 |   int gt_mask = 0;
 46 |   int lt_mask = 0;
 47 |   int32_t limit[NLIMBS];
 48 |   for (int i = 0; i < NLIMBS; ++i) {
 49 |     temp.limbs[i] = temp.limbs[i] - temp.limbs[10];
 50 |     temp.limbs[i] += 1 & gt_mask;
 51 |     temp.limbs[i] -= 1 & lt_mask;
 52 |     gt_mask = -(temp.limbs[i] > T);
 53 |     lt_mask = -(temp.limbs[i] < 0);
 54 |     temp.limbs[i] -= (T & gt_mask);
 55 |     temp.limbs[i] += (T & lt_mask);
 56 |   }
 57 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
 58 |     temp.limbs[i] -= temp.limbs[10];
 59 |     limit[i] = T;
 60 |   }
 61 |   int64_t all_t = -1;
 62 |   for (int i = NLIMBS_REDUCED - 2; i >= 0; --i) {
 63 |     all_t &= -(temp.limbs[i+1] == T);
 64 |     limit[i] -= 1 & (~all_t);
 65 |   }
 66 |   gt_mask = 0;
 67 |   lt_mask = 0;
 68 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
 69 |     temp.limbs[i] += 1 & gt_mask;
 70 |     temp.limbs[i] -= 1 & lt_mask;
 71 |     gt_mask = -(temp.limbs[i] > limit[i]);
 72 |     lt_mask = -(temp.limbs[i] < 0);
 73 |     temp.limbs[i] -= (T & gt_mask);
 74 |     temp.limbs[i] += (T & lt_mask);
 75 |     result->limbs[i] = temp.limbs[i];
 76 |   }
 77 | }
 78 | 
 79 | // Reduce to mostly unique representative.
 80 | // All coefficients are reduced to 0 <= xi <= t
 81 | // Unique up to carries (xi == t) => (xi = 0; x[i+1] += 1);
 82 | // This is sufficient to determine if x is even or odd.
 83 | // Still pretty expensive. Used in point compression.
 84 | void narrow_partial_complete(
 85 |   residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w) {
 86 | 
 87 |   residue_narrow_t temp;
 88 |   for (int i = 0; i < NLIMBS; ++i) {
 89 |     temp.limbs[i] = w->limbs[i] - w->limbs[10];
 90 |   }
 91 | 
 92 |   // This may be combined with the final reduction from a multiply.
 93 |   reduce_step_narrow(&temp, &temp);
 94 | 
 95 |   int gt_mask = 0;
 96 |   int lt_mask = 0;
 97 |   for (int i = 0; i < NLIMBS - 1; ++i) {
 98 |     temp.limbs[i] = temp.limbs[i] - temp.limbs[10];
 99 |     temp.limbs[i] += 1 & gt_mask;
100 |     temp.limbs[i] -= 1 & lt_mask;
101 |     gt_mask = -(temp.limbs[i] > T);
102 |     lt_mask = -(temp.limbs[i] < 0);
103 |     temp.limbs[i] -= (T & gt_mask);
104 |     temp.limbs[i] += (T & lt_mask);
105 |   }
106 |   for (int i = 0; i < NLIMBS - 1; ++i) {
107 |     temp.limbs[i] -= temp.limbs[10];
108 |   }
109 |   gt_mask = 0;
110 |   lt_mask = 0;
111 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
112 |     temp.limbs[i] += 1 & gt_mask;
113 |     temp.limbs[i] -= 1 & lt_mask;
114 |     gt_mask = -(temp.limbs[i] > T);
115 |     lt_mask = -(temp.limbs[i] < 0);
116 |     temp.limbs[i] -= (T & gt_mask);
117 |     temp.limbs[i] += (T & lt_mask);
118 |     result->limbs[i] = temp.limbs[i];
119 |   }
120 | }
121 | 
122 | int is_odd(residue_narrow_reduced_t *x) {
123 |   int result = 0;
124 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
125 |     result ^= x->limbs[i] & 0x1;
126 |   }
127 |   return result;
128 | }
129 | 
130 | // Copy a 12x32-bit residue
131 | void copy_narrow(
132 |   residue_narrow_t *result, const residue_narrow_t * __restrict x) {
133 | 
134 |   for (int i = 0; i < NLIMBS; ++i) {
135 |     result->limbs[i] = x->limbs[i];
136 |   }
137 | }
138 | 
139 | // Copy a 10x32-bit residue
140 | void copy_narrow_reduced(
141 |   residue_narrow_reduced_t *result,
142 |   const residue_narrow_reduced_t * __restrict x) {
143 | 
144 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
145 |     result->limbs[i] = x->limbs[i];
146 |   }
147 | }
148 | 
149 | // Subtract 2 12x32-bit residues.
150 | void sub_narrow(
151 |   residue_narrow_t *result, const residue_narrow_t * __restrict x,
152 |   const residue_narrow_t * __restrict y) {
153 | 
154 |   for (int i = 0; i < NLIMBS; ++i) {
155 |     result->limbs[i] = x->limbs[i] - y->limbs[i];
156 |   }
157 | }
158 | 
159 | // negate a 12x32-bit residue.
160 | void negate_narrow(
161 |   residue_narrow_t *result, const residue_narrow_t *x) {
162 | 
163 |   for (int i = 0; i < NLIMBS; ++i) {
164 |     result->limbs[i] = -(x->limbs[i]);
165 |   }
166 | }
167 | 
168 | // Add 2 12x32-bit residues.
169 | void add_narrow(
170 |   residue_narrow_t *result, const residue_narrow_t * __restrict x,
171 |   const residue_narrow_t * __restrict y) {
172 | 
173 |   for (int i = 0; i < NLIMBS; ++i) {
174 |     result->limbs[i] = x->limbs[i] + y->limbs[i];
175 |   }
176 | }
177 | 
178 | // Scale a narrow residue by 2.
179 | void double_narrow(
180 |   residue_narrow_t *result, const residue_narrow_t *x) {
181 | 
182 |   for (int i = 0; i < NLIMBS; ++i) {
183 |     result->limbs[i] = x->limbs[i] << 1;
184 |   }
185 | }
186 | 
187 | // Scale a wide residue by 2.
188 | void double_wide(
189 |   residue_wide_t *result, const residue_wide_t *x) {
190 | 
191 |   for (int i = 0; i < NLIMBS; ++i) {
192 |     result->limbs[i] = x->limbs[i] << 1;
193 |   }
194 | }
195 | 
196 | #define wrap(x) (((x + NLIMBS) % NLIMBS))
197 | // Multiply two wide residues, and produce a wide result. The result is reduced
198 | // to 32 bits, but not narrowed for performance reasons.
199 | void mul_wide(
200 |   residue_wide_t *result, const residue_wide_t *x, const residue_wide_t *y) {
201 | 
202 |   residue_wide_t temp;
203 |   for (int i = 0; i < NLIMBS; ++i) {
204 |     temp.limbs[i] = 0;
205 |     int i_2 = (i + (-(i & 1) & NLIMBS)) >> 1;
206 |     for (int j = 1; j <= NLIMBS / 2; ++ j) {
207 |       temp.limbs[i] +=
208 |         (x->limbs[wrap(i_2 + j)] - x->limbs[wrap(i_2 - j)]) *
209 |         (y->limbs[wrap(i_2 - j)] - y->limbs[wrap(i_2 + j)]);
210 |     }
211 |   }
212 |   reduce_step_wide(&temp, &temp);
213 |   reduce_step_wide(result, &temp);
214 | }
215 | 
216 | // Multiply a wide residues by a narrow and produce a wide result. The result is
217 | // reduced to 32 bits, but not narrowed for performance reasons.
218 | void mul_wide_narrow(
219 |   residue_wide_t *result, const residue_wide_t *x, const residue_narrow_t *y) {
220 | 
221 |   residue_wide_t temp;
222 |   for (int i = 0; i < NLIMBS; ++i) {
223 |     temp.limbs[i] = 0;
224 |     int i_2 = (i + (-(i & 1) & NLIMBS)) >> 1;
225 |     for (int j = 1; j <= NLIMBS / 2; ++j) {
226 |       temp.limbs[i] +=
227 |         (x->limbs[wrap(i_2 + j)] - x->limbs[wrap(i_2 - j)]) *
228 |         ((int64_t) (y->limbs[wrap(i_2 - j)] - y->limbs[wrap(i_2 + j)]));
229 |     }
230 |   }
231 |   reduce_step_wide(&temp, &temp);
232 |   reduce_step_wide(result, &temp);
233 | }
234 | 
235 | // Multiply two narrow residues and produce a narrow result.
236 | void mul_narrow(
237 |   residue_narrow_t *result, const residue_narrow_t *x,
238 |   const residue_narrow_t *y) {
239 | 
240 |   residue_wide_t temp;
241 |   for (int i = 0; i < NLIMBS; ++i) {
242 |     temp.limbs[i] = 0;
243 |     int i_2 = (i + (-(i & 1) & NLIMBS)) >> 1;
244 |     for (int j = 1; j <= NLIMBS / 2; ++ j) {
245 |       temp.limbs[i] +=
246 |         ((int64_t) (x->limbs[wrap(i_2 + j)] - x->limbs[wrap(i_2 - j)])) *
247 |         ((int64_t) (y->limbs[wrap(i_2 - j)] - y->limbs[wrap(i_2 + j)]));
248 |     }
249 |   }
250 |   reduce_step_wide(&temp, &temp);
251 |   reduce_step_wide(&temp, &temp);
252 |   narrow(result, &temp);
253 | }
254 | 
255 | // Multiply a narrow residue by a small constant. The result is reduced to 32
256 | // bits, but not narrowed for performance reasons.
257 | void mul_narrow_const(
258 |   residue_narrow_t *result, const residue_narrow_t *x, int32_t d) {
259 | 
260 |   residue_wide_t temp;
261 |   for (int i = 0; i < NLIMBS; ++i) {
262 |     temp.limbs[i] = ((uint64_t) x->limbs[i]) * d;
263 |   }
264 |   reduce_step_wide(&temp, &temp);
265 |   narrow(result, &temp);
266 | }
267 | 
268 | 
269 | // Square a narrow residue and produce a wide result. The result is reduced to
270 | // 32 bits but not narrowed for performance reasons.
271 | void square_narrow(
272 |   residue_narrow_t *result, const residue_narrow_t *x) {
273 | 
274 |   residue_wide_t temp;
275 |   for (int i = 0; i < NLIMBS; ++i) {
276 |     temp.limbs[i] = 0;
277 |     int i_2 = (i + (-(i & 1) & NLIMBS)) >> 1;
278 |     for (int j = 1; j <= NLIMBS / 2; ++ j) {
279 |       temp.limbs[i] -=
280 |         ((int64_t) (x->limbs[wrap(i_2 + j)] - x->limbs[wrap(i_2 - j)])) *
281 |         ((int64_t) (x->limbs[wrap(i_2 + j)] - x->limbs[wrap(i_2 - j)]));
282 |     }
283 |   }
284 |   reduce_step_wide(&temp, &temp);
285 |   reduce_step_wide(&temp, &temp);
286 |   narrow(result, &temp);
287 | }
288 | 
289 | // Approximately divide each coefficient by t. Carry the results.
290 | void reduce_step_narrow(
291 |   residue_narrow_t *result, const residue_narrow_t *x) {
292 | 
293 |   int32_t carries[NLIMBS];
294 | 
295 |   for (int i = 0; i < NLIMBS; ++i) {
296 |     carries[i] = x->limbs[i] >> TBITS;
297 |     result->limbs[i] = (x->limbs[i] & TMASK) +
298 |       (carries[i] << T_CBITS) - carries[i];
299 |   }
300 | 
301 |   for (int i = 1; i < NLIMBS; ++i) {
302 |     result->limbs[i] += carries[i - 1];
303 |   }
304 |   result->limbs[0] += carries[NLIMBS - 1];
305 | }
306 | 
307 | // Approximately divide each coefficient by t. Carry the results.
308 | void reduce_step_wide(
309 |   residue_wide_t *result, const residue_wide_t *x) {
310 | 
311 |   int64_t carries[NLIMBS];
312 | 
313 |   for (int i = 0; i < NLIMBS; ++i) {
314 |     carries[i] = x->limbs[i] >> TBITS;
315 |     result->limbs[i] = (x->limbs[i] & TMASK) +
316 |       (carries[i] << T_CBITS) - carries[i];
317 |   }
318 | 
319 |   for (int i = 1; i < NLIMBS; ++i) {
320 |     result->limbs[i] += carries[i - 1];
321 |   }
322 |   result->limbs[0] += carries[NLIMBS - 1];
323 | }
324 | 
325 | // Takes advantage of the fact that if a residue z *is zero* then after setting
326 | // one coefficient to T/2, all the remaining coefficients should be near to
327 | // T/2. They should therefore resolve all carries in a single step, and all be
328 | // equal to the same value. Some other value may not reduce completely, but this
329 | // is fine, we will know it is not zero.
330 | int equal_narrow(const residue_narrow_t *x, const residue_narrow_t *y) {
331 |   residue_narrow_t temp;
332 | 
333 |   sub_narrow(&temp, x, y);
334 |   int32_t delta = -temp.limbs[0] + (T / 2);
335 |   for (int i = 0; i < NLIMBS; ++i) {
336 |     temp.limbs[i] += delta;
337 |   }
338 | 
339 |   reduce_step_narrow(&temp, &temp);
340 | 
341 |   delta = temp.limbs[0];
342 |   int result = 0;
343 |   for (int i = 1; i < NLIMBS; ++i) {
344 |     result |= (temp.limbs[i] ^ delta);
345 |   }
346 | 
347 |   return !result;
348 | }
349 | 
350 | int equal_narrow_reduced(
351 |   const residue_narrow_reduced_t * x, const residue_narrow_reduced_t * y) {
352 | 
353 |   int result = 0;
354 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
355 |     result |= (x->limbs[i] ^ y->limbs[i]);
356 |   }
357 | 
358 |   return !result;
359 | }
360 | 
361 | static inline void nsquare_narrow(
362 |   residue_narrow_t *result, const residue_narrow_t *x, int n) {
363 | 
364 |   square_narrow(result, x);
365 |   for (int i = 1; i < n; ++i) {
366 |     square_narrow(result, result);
367 |   }
368 | }
369 | 
370 | static void raise_to_t(
371 |   residue_narrow_t *result, const residue_narrow_t *x) {
372 |   // zi = z^(2^i - 1), z1 = x
373 |   residue_narrow_t z2;
374 |   residue_narrow_t z3;
375 |   residue_narrow_t z5;
376 |   residue_narrow_t z10;
377 |   residue_narrow_t z11;
378 |   residue_narrow_t z22;
379 |   residue_narrow_t result_t;
380 | 
381 |   square_narrow(&z2, x);
382 |   mul_narrow(&z2, &z2, x);
383 |   square_narrow(&z3, &z2);
384 |   mul_narrow(&z3, &z3, x);
385 |   nsquare_narrow(&z5, &z3, 2);
386 |   mul_narrow(&z5, &z5, &z2);
387 |   nsquare_narrow(&z10, &z5, 5);
388 |   mul_narrow(&z10, &z10, &z5);
389 |   square_narrow(&z11, &z10);
390 |   mul_narrow(&z11, &z11, x);
391 |   nsquare_narrow(&z22, &z11, 11);
392 |   mul_narrow(&z22, &z22, &z11);
393 |   nsquare_narrow(&result_t, &z22, 4);
394 |   mul_narrow(result, &result_t, x);
395 | }
396 | 
397 | static void raise_to_phi_t(
398 |   residue_narrow_t *result, const residue_narrow_t *x, int n) {
399 |   residue_narrow_t temp;
400 | 
401 |   raise_to_t(&temp, x);
402 | 
403 |   for (int i = 1; i < n; ++i) {
404 |     mul_narrow(&temp, &temp, x);
405 |     raise_to_t(&temp, &temp);
406 |   }
407 | 
408 |   mul_narrow(result, &temp, x);
409 | }
410 | 
411 | static void raise_to_t_minus_1_over_4(
412 |   residue_narrow_t *result, const residue_narrow_t *x) {
413 |   // zi = z^(2^i - 1), z1 = x
414 |   residue_narrow_t z2;
415 |   residue_narrow_t z3;
416 |   residue_narrow_t z5;
417 |   residue_narrow_t z10;
418 |   residue_narrow_t z11;
419 |   residue_narrow_t z22;
420 | 
421 |   square_narrow(&z2, x);
422 |   mul_narrow(&z2, &z2, x);
423 |   square_narrow(&z3, &z2);
424 |   mul_narrow(&z3, &z3, x);
425 |   nsquare_narrow(&z5, &z3, 2);
426 |   mul_narrow(&z5, &z5, &z2);
427 |   nsquare_narrow(&z10, &z5, 5);
428 |   mul_narrow(&z10, &z10, &z5);
429 |   square_narrow(&z11, &z10);
430 |   mul_narrow(&z11, &z11, x);
431 |   nsquare_narrow(&z22, &z11, 11);
432 |   mul_narrow(&z22, &z22, &z11);
433 |   nsquare_narrow(result, &z22, 2);
434 | }
435 | 
436 | static void raise_to_p_minus_3_over_4(
437 |   residue_narrow_t *result, const residue_narrow_t *x) {
438 | 
439 |   residue_narrow_t z4; //z to (t-1)/4
440 |   residue_narrow_t z2; //z to (t-1)/2
441 |   residue_narrow_t z3_4; //z to (3t+1)/4
442 |   residue_narrow_t y_small;
443 |   residue_narrow_t y, y_t4_y;
444 |   residue_narrow_t raised;
445 | 
446 |   raise_to_t_minus_1_over_4(&z4, x);
447 |   square_narrow(&z2, &z4);
448 |   mul_narrow(&z3_4, &z2, &z4);
449 |   mul_narrow(&z3_4, &z3_4, x);
450 |   raise_to_t(&raised, &z4);
451 |   mul_narrow(&y_small, &z2, &raised);
452 |   raise_to_t(&raised, &y_small);
453 |   mul_narrow(&y, &z3_4, &raised);
454 |   raise_to_t(&raised, &y);
455 |   raise_to_t(&raised, &raised);
456 |   raise_to_t(&raised, &raised);
457 |   raise_to_t(&raised, &raised);
458 |   mul_narrow(&y_t4_y, &raised, &y);
459 |   raise_to_t(&raised, &y_t4_y);
460 |   raise_to_t(&raised, &raised);
461 |   raise_to_t(&raised, &raised);
462 |   mul_narrow(result, &raised, &y_small);
463 | }
464 | 
465 | int sqrt_inv_narrow(
466 |   residue_narrow_t *result, const residue_narrow_t * __restrict x,
467 |   const residue_narrow_t * __restrict y) {
468 |   residue_narrow_t xy;
469 |   residue_narrow_t y2;
470 |   residue_narrow_t xy3;
471 |   residue_narrow_t xy3_p_3_over_4;
472 |   residue_narrow_t cand2;
473 |   residue_narrow_t should_be_x;
474 | 
475 |   square_narrow(&y2, y);
476 |   mul_narrow(&xy, x, y);
477 |   mul_narrow(&xy3, &xy, &y2);
478 |   raise_to_p_minus_3_over_4(&xy3_p_3_over_4, &xy3);
479 |   mul_narrow(result, &xy, &xy3_p_3_over_4);
480 |   square_narrow(&cand2, result);
481 |   mul_narrow(&should_be_x, y, &cand2);
482 | 
483 |   return equal_narrow(&should_be_x, x);
484 | }
485 | 
486 | void invert_narrow(
487 |   residue_narrow_t *result, const residue_narrow_t * __restrict x) {
488 | 
489 |   residue_narrow_t x_t_minus_1_over_4;
490 |   residue_narrow_t x_t_minus_1;
491 |   residue_narrow_t x_t;
492 |   residue_narrow_t phi_8_x_t;
493 |   residue_narrow_t phi_8_x_t_t;
494 | 
495 |   raise_to_t_minus_1_over_4(&x_t_minus_1_over_4, x);
496 |   nsquare_narrow(&x_t_minus_1, &x_t_minus_1_over_4, 2);
497 |   mul_narrow(&x_t, &x_t_minus_1, x);
498 |   raise_to_phi_t(&phi_8_x_t, &x_t, 8);
499 |   raise_to_t(&phi_8_x_t_t, &phi_8_x_t);
500 |   mul_narrow(result, &phi_8_x_t_t, &x_t_minus_1);
501 | }
502 | 
503 | void encode(uint8_t *out, const residue_narrow_reduced_t * __restrict x) {
504 |   uint32_t collect = x->limbs[0];
505 | 
506 |   int space = 32 - TBITS;
507 |   int i = 1;
508 |   int bits_remaining = TBITS * NLIMBS_REDUCED;
509 |   while (bits_remaining > 0) {
510 |     *out++ = collect & 0xff;
511 |     collect >>= 8;
512 |     space += 8;
513 |     bits_remaining -= 8;
514 |     if (space >= TBITS && i < NLIMBS_REDUCED) {
515 |       collect |= x->limbs[i] << (32 - space);
516 |       space -= TBITS;
517 |       ++i;
518 |     }
519 |   }
520 | }
521 | 
522 | void decode(residue_narrow_reduced_t *out, const uint8_t *in) {
523 |   uint32_t collect = 0;
524 | 
525 |   int shift = 0;
526 |   int i = 0;
527 |   int bits_remaining = TBITS * NLIMBS_REDUCED;
528 |   while (bits_remaining > 0) {
529 |     collect |= (*in++) << shift;
530 |     shift += 8;
531 |     bits_remaining -= 8;
532 |     if (shift >= TBITS) {
533 |       if (bits_remaining > 0) {
534 |         out->limbs[i] = collect & TMASK;
535 |         collect >>= 26;
536 |         shift -= 26;
537 |         ++i;
538 |       } else {
539 |         out->limbs[i] = collect;
540 |       }
541 |     }
542 |   }
543 | }
544 | 


--------------------------------------------------------------------------------
/ref/include/f11_260.h:
--------------------------------------------------------------------------------
  1 | // Types and functions for manipulating field elements
  2 | 
  3 | #ifndef F11_260_H
  4 | #define F11_260_H
  5 | #include <stdint.h>
  6 | 
  7 | #define NLIMBS_REDUCED 10
  8 | #define NLIMBS 11
  9 | #define T ((1 << 26) - 15)
 10 | #define TBITS 26
 11 | #define TMASK ((1 << 26) - 1)
 12 | #define T_CBITS 4
 13 | #define RESIDUE_LENGTH_BYTES 33
 14 | 
 15 | // Reduced to 10 limbs. For final results.
 16 | typedef struct residue_narrow_reduced {
 17 |   __attribute__((__aligned__(8)))
 18 |   int32_t limbs[NLIMBS_REDUCED];
 19 | } residue_narrow_reduced_t;
 20 | 
 21 | // 11 limbs.
 22 | typedef struct residue_narrow {
 23 |   __attribute__((__aligned__(64)))
 24 |   int32_t limbs[NLIMBS];
 25 |   int32_t pad[16 - NLIMBS];
 26 | } residue_narrow_t;
 27 | 
 28 | // 11 limbs.
 29 | // compatibility.
 30 | typedef struct residue_wide {
 31 |   __attribute__((__aligned__(64)))
 32 |   int64_t limbs[NLIMBS];
 33 |   int64_t pad[16 - NLIMBS];
 34 | } residue_wide_t;
 35 | 
 36 | residue_wide_t zero_wide;
 37 | residue_wide_t one_wide;
 38 | residue_narrow_t zero_narrow;
 39 | residue_narrow_t one_narrow;
 40 | 
 41 | // Shrink to 32 bits. Assumes reduction has already occurred, and wide storage
 42 | // is being used for vector compatibility.
 43 | void narrow(residue_narrow_t *result, const residue_wide_t * __restrict w);
 44 | 
 45 | // Reduce to 10 limbs. Useful for debugging
 46 | void narrow_reduce(
 47 |   residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w);
 48 | 
 49 | // Reduce to unique representative.
 50 | // This is expensive. Only used for final signature or DH Key
 51 | void narrow_complete(
 52 |   residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w);
 53 | 
 54 | // Reduce to mostly unique representative.
 55 | // All coefficients are reduced to 0 <= xi <= t
 56 | // Unique up to carries (xi == t) => (xi = 0; x[i+1] += 1);
 57 | // This is sufficient to determine if x is even or odd.
 58 | // Still pretty expensive. Used in point compression.
 59 | void narrow_partial_complete(
 60 |   residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w);
 61 | 
 62 | int is_odd(residue_narrow_reduced_t *x);
 63 | 
 64 | // Produce a 32-bit entry with 11 limbs
 65 | static inline void unnarrow_reduce(
 66 |   residue_narrow_t *result, const residue_narrow_reduced_t * __restrict x) {
 67 | 
 68 |   result->limbs[10] = 0;
 69 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
 70 |     result->limbs[i] = x->limbs[i];
 71 |   }
 72 | }
 73 | 
 74 | // Produce a 64-bit residue
 75 | void widen(
 76 |   residue_wide_t *result, const residue_narrow_t * __restrict x);
 77 | 
 78 | // Copy a 64-bit residue
 79 | void copy_wide(
 80 |   residue_wide_t *result, const residue_wide_t * __restrict x);
 81 | 
 82 | // Copy a 32-bit residue
 83 | void copy_narrow(
 84 |   residue_narrow_t *result, const residue_narrow_t * __restrict x);
 85 | 
 86 | void copy_narrow_reduced(
 87 |   residue_narrow_reduced_t *result,
 88 |   const residue_narrow_reduced_t * __restrict x);
 89 | 
 90 | // Subtract 2 11x32-bit residues.
 91 | void sub_narrow(
 92 |   residue_narrow_t *result, const residue_narrow_t * __restrict x,
 93 |   const residue_narrow_t * __restrict y);
 94 | 
 95 | void negate_wide(residue_wide_t *result, const residue_wide_t *x);
 96 | 
 97 | void negate_narrow(residue_narrow_t *result, const residue_narrow_t *x);
 98 | 
 99 | // Add 2 11x32-bit residues.
100 | void add_narrow(
101 |   residue_narrow_t *result, const residue_narrow_t * __restrict x,
102 |   const residue_narrow_t * __restrict y);
103 | 
104 | // Add 2 11x64-bit residues.
105 | void add_wide(
106 |   residue_wide_t *result, const residue_wide_t * __restrict x,
107 |   const residue_wide_t * __restrict y);
108 | 
109 | // Scale a wide residue by 2.
110 | void double_narrow(
111 |   residue_narrow_t *result, const residue_narrow_t * __restrict x);
112 | 
113 | // Multiply two narrow residues and produce a wide result. The result is reduced
114 | // to 32 bits.
115 | void mul_narrow(
116 |   residue_narrow_t *result, const residue_narrow_t *x,
117 |   const residue_narrow_t *y);
118 | 
119 | // Multiply a residue by a constant.
120 | void mul_narrow_const(
121 |   residue_narrow_t *result, const residue_narrow_t * __restrict x, int32_t d);
122 | 
123 | // Square a narrow residue and produce a narrow result. The result is reduced to
124 | // 32 bits.
125 | void square_narrow(
126 |   residue_narrow_t *result, const residue_narrow_t *x);
127 | 
128 | // Approximately divide each coefficient by t. Carry the results.
129 | void reduce_step_narrow(
130 |   residue_narrow_t *result, const residue_narrow_t *x);
131 | 
132 | // Approximately divide each coefficient by t. Carry the results.
133 | void reduce_step_wide(
134 |   residue_wide_t *result, const residue_wide_t *x);
135 | 
136 | // Invert via fermat's theorem
137 | void invert_narrow(
138 |   residue_narrow_t *result, const residue_narrow_t * __restrict x);
139 | 
140 | // Compute combined inverse and square root
141 | // returns true if x/y was a quadratic residue, and false otherwise.
142 | int sqrt_inv_narrow(
143 |   residue_narrow_t *result, const residue_narrow_t * __restrict x,
144 |   const residue_narrow_t * __restrict y);
145 | 
146 | // Returns true if x == y. Computes in constant time.
147 | int equal_narrow(const residue_narrow_t * x, const residue_narrow_t * y);
148 | 
149 | int equal_narrow_reduced(
150 |   const residue_narrow_reduced_t * x, const residue_narrow_reduced_t * y);
151 | 
152 | void encode(uint8_t *out, const residue_narrow_reduced_t * __restrict x);
153 | void encode_compressed(
154 |   uint8_t *out, const residue_narrow_reduced_t * __restrict x, int is_odd);
155 | 
156 | void decode(residue_narrow_reduced_t *out, const uint8_t *in);
157 | #endif
158 | 


--------------------------------------------------------------------------------
/ref/include/gen.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <string.h>
 3 | #include "comb.h"
 4 | #include "curve.h"
 5 | #include "gen.h"
 6 | #include "scalar.h"
 7 | 
 8 | void gen_key(scalar_t * __restrict priv_key,
 9 |              affine_pt_narrow_t * __restrict pub_key) {
10 |   scalar_hash_t large_key;
11 |   char *large_key_ptr = (char *) &large_key;
12 |   arc4random_buf(large_key_ptr, sizeof(large_key));
13 | 
14 |   // It's just as random to use montgomery reduction as to correct for the
15 |   // montgomery factor.
16 |   mont_reduce_hash_mod_l(priv_key, &large_key);
17 | 
18 |   projective_pt_narrow_t result_pt;
19 |   scalar_comb_multiply(&result_pt, &base_comb, priv_key);
20 | 
21 |   residue_narrow_t z_inv;
22 | 
23 |   invert_narrow(&z_inv, &result_pt.z);
24 |   mul_narrow(&result_pt.x, &result_pt.x, &z_inv);
25 |   mul_narrow(&result_pt.y, &result_pt.y, &z_inv);
26 | 
27 |   residue_narrow_t temp_narrow;
28 |   copy_narrow(&pub_key->x, &result_pt.x);
29 | 
30 |   copy_narrow(&pub_key->y, &result_pt.y);
31 | 
32 |   // explicit_bzero(&large_key, sizeof(large_key));
33 |   explicit_bzero(&result_pt, sizeof(result_pt));
34 |   explicit_bzero(&z_inv, sizeof(z_inv));
35 |   explicit_bzero(&temp_narrow, sizeof(temp_narrow));
36 | }
37 | 
38 | void encode_pub_key(uint8_t *result, const affine_pt_narrow_t *pub_key) {
39 |   residue_narrow_reduced_t y_reduced;
40 |   residue_narrow_reduced_t x_reduced;
41 |   narrow_complete(&y_reduced, &pub_key->y);
42 |   narrow_partial_complete(&x_reduced, &pub_key->x);
43 | 
44 |   y_reduced.limbs[NLIMBS_REDUCED - 1] |= is_odd(&x_reduced) << TBITS;
45 |   encode(result, &y_reduced);
46 | }
47 | 
48 | int decode_pub_key(affine_pt_narrow_t *result, const uint8_t *encoded_key) {
49 |   residue_narrow_reduced_t y_decoded;
50 |   decode(&y_decoded, encoded_key);
51 |   int is_odd = y_decoded.limbs[NLIMBS_REDUCED - 1] >> TBITS;
52 |   y_decoded.limbs[NLIMBS_REDUCED - 1] &= TMASK;
53 |   return point_decompress(result, &y_decoded, is_odd);
54 | }
55 | 


--------------------------------------------------------------------------------
/ref/include/gen.h:
--------------------------------------------------------------------------------
 1 | #ifndef GEN_H
 2 | #define GEN_H
 3 | 
 4 | #include "scalar.h"
 5 | #include "curve.h"
 6 | 
 7 | void gen_key(scalar_t * __restrict priv_key,
 8 |              affine_pt_narrow_t * __restrict pub_key);
 9 | void encode_pub_key(uint8_t *result, const affine_pt_narrow_t *pub_key);
10 | int decode_pub_key(affine_pt_narrow_t *result, const uint8_t *encoded_key);
11 | #endif
12 | 


--------------------------------------------------------------------------------
/ref/include/scalar.c:
--------------------------------------------------------------------------------
  1 | #include <string.h>
  2 | #include <stdint.h>
  3 | #include "f11_260.h"
  4 | #include "scalar.h"
  5 | 
  6 | // Plenty of inspiration for this file was taken from Mike Hamburg's
  7 | // Ed448 code.
  8 | 
  9 | // Constants:
 10 | __attribute__((__aligned__(32)))
 11 | const scalar_t l_bits = {
 12 |   .limbs = {0x28ad9c41, 0xe6dcf7e8, 0x34b804af, 0x5af91169,
 13 |             0x5cf68f2f, 0x125277f4, 0x9c1bf9f, 0xffff6b00, 0x3,},
 14 | };
 15 | 
 16 | __attribute__((__aligned__(32)))
 17 | const scalar_t signed_bits_set_adjustment = {
 18 |   .limbs = {0x5d498efb, 0x648c205f, 0x2d1fed40, 0x941bba5b,
 19 |             0x8c25c342, 0xb6b6202e, 0xd8f90183, 0x000253ff, 0x0,},
 20 | };
 21 | 
 22 | __attribute__((__aligned__(32)))
 23 | const scalar_t SCALAR_MONT_R2 = {
 24 |   .limbs = {0x30ba45c7, 0xf3422093, 0x054bbbf6, 0x017ab264,
 25 |             0x914ee18b, 0x250f1097, 0xf6bc1224, 0x5e97c70e, 0x2,},
 26 | };
 27 | 
 28 | const uint32_t SCALAR_MONT_N_PRIME = 0xb3138c3f;
 29 | 
 30 | __attribute__((__aligned__(32)))
 31 | const scalar_t SCALAR_MONT_R2_HASH = {
 32 |   .limbs = {
 33 |             0x202dd8e7, 0xcb1bf7be, 0xd219daf6, 0xb85aba0a,
 34 |             0xdc8da05f, 0xbd23bfce, 0xb7642c95, 0xbb13e4ad, 0x0,},
 35 | };
 36 | 
 37 | __attribute__((__aligned__(32)))
 38 | const scalar_t SCALAR_MONT_R2_HASH_MUL = {
 39 |   .limbs = {0x8b9c7a13, 0x37bb3081, 0xe4f0c2b0, 0x99b4a8b2,
 40 |             0xb4538c55, 0x34c9db2a, 0x2ade0e63, 0xa7cb6782, 0x1,},
 41 | };
 42 | 
 43 | void divide_by_2_mod_l(
 44 |   scalar_t *result, const scalar_t *x) {
 45 | 
 46 |   uint32_t mask = -(x->limbs[0] & 1);
 47 | 
 48 |   uint64_t chain = 0;
 49 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
 50 |     chain = (chain + x->limbs[i]) + (mask & l_bits.limbs[i]);
 51 |     result->limbs[i] = chain;
 52 |     chain >>= SCALAR_LIMB_BITS;
 53 |   }
 54 | 
 55 |   int i;
 56 |   for (i = 0; i < SCALAR_LIMBS - 1; ++i) {
 57 |     result->limbs[i] = result->limbs[i] >> 1 |
 58 |       (result->limbs[i+1] << (SCALAR_LIMB_BITS - 1));
 59 |   }
 60 |   result->limbs[i] >>= 1;
 61 | }
 62 | 
 63 | void add_mod_l(
 64 |   scalar_t *result, const scalar_t *x,
 65 |   const scalar_t * __restrict y) {
 66 | 
 67 |   uint64_t chain = 0;
 68 |   int i;
 69 |   for (i = 0; i < SCALAR_LIMBS; ++i) {
 70 |     chain = (chain + x->limbs[i]) + y->limbs[i];
 71 |     result->limbs[i] = chain;
 72 |     chain >>= SCALAR_LIMB_BITS;
 73 |   }
 74 | 
 75 |   sub_mod_l(result, result, &l_bits);
 76 | }
 77 | 
 78 | void sub_mod_l(
 79 |   scalar_t *result, const scalar_t *x,
 80 |   const scalar_t *y) {
 81 |   sub_mod_l_accum(result, x->limbs, y);
 82 | }
 83 | 
 84 | // x is a pointer and not a scalar_t so that this function can be used to reduce
 85 | // accumulators after multiplication.
 86 | void sub_mod_l_accum(
 87 |   scalar_t *result, const uint32_t *x,
 88 |   const scalar_t *y) {
 89 | 
 90 |   int64_t chain = 0;
 91 |   int i;
 92 |   for (i = 0; i < SCALAR_LIMBS; ++i) {
 93 |     chain = (chain + x[i]) - y->limbs[i];
 94 |     result->limbs[i] = chain;
 95 |     chain >>= SCALAR_LIMB_BITS;
 96 |   }
 97 | 
 98 |   //Should be 0 or -1 (to function as a mask)
 99 |   int32_t borrow = chain;
100 | 
101 |   chain = 0;
102 |   for (i = 0; i < SCALAR_LIMBS; ++i) {
103 |     chain = (chain + result->limbs[i]) + (l_bits.limbs[i] & borrow);
104 |     result->limbs[i] = chain;
105 |     chain >>= SCALAR_LIMB_BITS;
106 |   }
107 | }
108 | 
109 | void convert_to_sabs(
110 |   scalar_t *result, const scalar_t *x) {
111 |   add_mod_l(result, x, &signed_bits_set_adjustment);
112 |   divide_by_2_mod_l(result, result);
113 | }
114 | 
115 | void mont_reduce_hash_mod_l(
116 |   scalar_t *result, const scalar_hash_t * __restrict x) {
117 |   uint32_t accum[HASH_LIMBS];
118 | 
119 |   for (int i = 0; i < HASH_LIMBS; ++i) {
120 |     accum[i] = x->limbs[i];
121 |   }
122 | 
123 |   uint64_t chain = 0;
124 |   for (int i = 0; i <= HASH_LIMBS - SCALAR_LIMBS; ++i) {
125 |     uint32_t q = accum[0] * SCALAR_MONT_N_PRIME;
126 |     for (int j = 0; j < SCALAR_LIMBS; ++j) {
127 |       chain += accum[j] + ((uint64_t) q) * l_bits.limbs[j];
128 |       if (j > 0) {
129 |         accum[j - 1] = chain;
130 |       }
131 |       chain >>= SCALAR_LIMB_BITS;
132 |     }
133 |     int j;
134 |     for (j = SCALAR_LIMBS; j < HASH_LIMBS - i; ++j) {
135 |       chain += accum[j];
136 |       accum[j - 1] = chain;
137 |       chain >>= SCALAR_LIMB_BITS;
138 |     }
139 |     accum[j - 1] = chain;
140 |   }
141 | 
142 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
143 |     result->limbs[i] = accum[i];
144 |   }
145 |   explicit_bzero(accum, sizeof(accum));
146 | }
147 | 
148 | void reduce_hash_mod_l(scalar_t *result, const scalar_hash_t * __restrict x) {
149 |   mont_reduce_hash_mod_l(result, x);
150 |   mont_mult_mod_l(result, result, &SCALAR_MONT_R2_HASH);
151 | }
152 | 
153 | void mont_mult_mod_l(scalar_t *result, const scalar_t *x,
154 |                      const scalar_t *y) {
155 |   uint32_t accum[SCALAR_LIMBS + 1] = {0};
156 | 
157 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
158 |     uint32_t x_limb = x->limbs[i];
159 | 
160 |     uint64_t chain = 0;
161 |     int j;
162 |     for (j = 0; j < SCALAR_LIMBS; ++j) {
163 |       chain += accum[j] + ((uint64_t) y->limbs[j]) * x_limb;
164 |       accum[j] = chain;
165 |       chain >>= SCALAR_LIMB_BITS;
166 |     }
167 | 
168 |     // 2 bit value
169 |     accum[j] = chain;
170 | 
171 |     uint32_t q = accum[0] * SCALAR_MONT_N_PRIME;
172 |     chain = 0;
173 |     for (int j = 0; j < SCALAR_LIMBS; ++j) {
174 |       chain += accum[j] + ((uint64_t) l_bits.limbs[j]) * q;
175 |       if (j > 0) {
176 |         accum[j - 1] = chain;
177 |       }
178 |       chain >>= SCALAR_LIMB_BITS;
179 |     }
180 | 
181 |     // chain is a 2-bit value with a possible carry.
182 |     // result is a 3 bit value
183 |     chain += accum[j];
184 |     accum[j - 1] = chain;
185 |   }
186 | 
187 |   sub_mod_l_accum(result, accum, &l_bits);
188 |   explicit_bzero(accum, sizeof(accum));
189 | }
190 | 
191 | void mult_mod_l(scalar_t *result, const scalar_t * __restrict x,
192 |                 const scalar_t * __restrict y) {
193 |   scalar_t temp;
194 |   mont_mult_mod_l(&temp, x, y);
195 |   mont_mult_mod_l(result, &temp, &SCALAR_MONT_R2);
196 |   explicit_bzero(&temp, sizeof(temp));
197 | }
198 | 


--------------------------------------------------------------------------------
/ref/include/scalar.h:
--------------------------------------------------------------------------------
 1 | #ifndef SCALAR_H
 2 | #define SCALAR_H
 3 | #include <stdint.h>
 4 | #include "f11_260.h"
 5 | 
 6 | typedef struct scalar {
 7 |   uint32_t limbs[9];
 8 | } scalar_t;
 9 | 
10 | typedef struct scalar_hash {
11 |   uint32_t limbs[16];
12 | } scalar_hash_t;
13 | 
14 | // const int SCALAR_LIMBS = 9;
15 | #define HASH_LIMBS 16
16 | #define SCALAR_LIMBS 9
17 | #define SCALAR_BITS 258
18 | #define SCALAR_BYTES 33
19 | #define SCALAR_LIMB_BITS 32
20 | #define SCALAR_LAST_LIMB_BITS 2
21 | #define SCALAR_LAST_LIMB_MASK 0x3
22 | 
23 | // Constants
24 | // A scalar representing l, the order of the prime subgroup.
25 | const scalar_t l_bits;
26 | // For converting to SABS representation
27 | const scalar_t signed_bits_set_adjustment;
28 | // l * N' is congruent to -1 mod 2^32
29 | const uint32_t SCALAR_MONT_N_PRIME;
30 | // (2 ^ 32)^18 mod l. Used to convert to montgomery domain.
31 | // Or to fix the result of a single multiply via a 2nd multiply.
32 | const scalar_t SCALAR_MONT_R2;
33 | // (2 ^ 32)^17 mod l.
34 | // Used to fix the result of a hash reduction via a multiply
35 | // A hash is reduced from HASH_LIMBS to SCALAR_LIMBS via
36 | // HASH_LIMBS - SCALAR_LIMBS + 1 divisions by 2^32. So a hash reduction produces
37 | // h * (2^32)^-8 mod l. Montgomery multiplying by (2^32)^17 mod l produces h mod
38 | // l
39 | const scalar_t SCALAR_MONT_R2_HASH;
40 | // (2 ^ 32)^26 mod l.
41 | // Used to fix the result of a hash reduction followed by a multiply.
42 | // By similar logic we need to get rid of a factor of (2^32)^-17
43 | const scalar_t SCALAR_MONT_R2_HASH_MUL;
44 | 
45 | // Functions for manipulating scalars. May need more for ECDSA.
46 | 
47 | // This is used to convert to SABS representation.
48 | void divide_by_2_mod_l(scalar_t *result, const scalar_t * __restrict x);
49 | 
50 | void add_mod_l(scalar_t *result, const scalar_t * __restrict x,
51 |                const scalar_t * __restrict y);
52 | 
53 | void sub_mod_l(scalar_t *result, const scalar_t * __restrict x,
54 |                const scalar_t * __restrict y);
55 | 
56 | void sub_mod_l_accum(scalar_t *result, const uint32_t * __restrict x,
57 |                      const scalar_t * __restrict y);
58 | 
59 | void mont_mult_mod_l(scalar_t *result, const scalar_t * __restrict x,
60 |                      const scalar_t * __restrict y);
61 | 
62 | void mult_mod_l(scalar_t *result, const scalar_t * __restrict x,
63 |                 const scalar_t * __restrict y);
64 | 
65 | void mont_reduce_hash_mod_l(
66 |   scalar_t *result, const scalar_hash_t * __restrict x);
67 | void reduce_hash_mod_l(scalar_t *result, const scalar_hash_t * __restrict x);
68 | 
69 | void convert_to_sabs(scalar_t *result, const scalar_t * __restrict x);
70 | #endif
71 | 


--------------------------------------------------------------------------------
/ref/include/sign.h:
--------------------------------------------------------------------------------
 1 | #ifndef SIGN_H
 2 | #define SIGN_H
 3 | #include "curve.h"
 4 | #include "scalar.h"
 5 | 
 6 | #define SIG_LENGTH 65
 7 | 
 8 | typedef struct signature {
 9 |   residue_narrow_reduced_t y;
10 |   scalar_t s;
11 | } signature_t;
12 | 
13 | void sign(signature_t *result, scalar_t *priv_key,
14 |   const uint8_t *pub_key, const uint8_t *msg, size_t msg_len);
15 | 
16 | int verify(
17 |   const signature_t *sig, const uint8_t *r_bytes, const uint8_t *pub_key_bytes,
18 |   const affine_pt_narrow_t *pub_key_pt, const uint8_t *msg,
19 |   size_t msg_len);
20 | 
21 | void encode_sig(uint8_t *result, const signature_t *sig);
22 | void decode_sig(signature_t *result, const uint8_t *encoded_sig);
23 | #endif
24 | 


--------------------------------------------------------------------------------
/ref/src/api.c.supercop_only:
--------------------------------------------------------------------------------
 1 | #define _DEFAULT_SOURCE
 2 | #include <string.h>
 3 | #include "crypto_sign.h"
 4 | #include "curve.h"
 5 | #include "gen.h"
 6 | #include "scalar.h"
 7 | #include "sign.h"
 8 | 
 9 | int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
10 |   affine_pt_narrow_t pub_key_pt;
11 |   scalar_t priv_key;
12 |   gen_key(&priv_key, &pub_key_pt);
13 |   encode_pub_key(pk, &pub_key_pt);
14 |   memcpy(sk, &priv_key, SCALAR_BYTES);
15 |   memcpy(sk + SCALAR_BYTES, pk, RESIDUE_LENGTH_BYTES);
16 |   explicit_bzero(&priv_key, sizeof(priv_key));
17 |   return 0;
18 | }
19 | 
20 | int crypto_sign(
21 |     unsigned char *sm,unsigned long long *smlen,
22 |     const unsigned char *m,unsigned long long mlen,
23 |     const unsigned char *sk) {
24 |   signature_t sig_struct;
25 |   scalar_t priv_key;
26 |   priv_key.limbs[SCALAR_LIMBS - 1] = 0;
27 |   memcpy(&priv_key, sk, SCALAR_BYTES);
28 |   sign(&sig_struct, &priv_key, sk + SCALAR_BYTES, m, mlen);
29 | 
30 |   *smlen = mlen + SIG_LENGTH;
31 |   encode_sig(sm, &sig_struct);
32 |   memcpy(sm + SIG_LENGTH, m, mlen);
33 |   return 0;
34 | }
35 | 
36 | int crypto_sign_open(
37 |     unsigned char *m,unsigned long long *mlen,
38 |     const unsigned char *sm,unsigned long long smlen,
39 |     const unsigned char *pk) {
40 |   signature_t sig_struct;
41 |   decode_sig(&sig_struct, sm);
42 |   affine_pt_narrow_t pub_key_pt;
43 |   if (!decode_pub_key(&pub_key_pt, pk)) {
44 |     return -1;
45 |   }
46 | 
47 |   uint8_t y_buf[RESIDUE_LENGTH_BYTES];
48 |   encode(y_buf, &sig_struct.y);
49 | 
50 |   if (!verify(&sig_struct, y_buf, pk, &pub_key_pt, sm + SIG_LENGTH,
51 |               smlen - SIG_LENGTH)) {
52 |     return -2;
53 |   }
54 |   *mlen = smlen - SIG_LENGTH;
55 |   memcpy(m, sm + SIG_LENGTH, smlen - SIG_LENGTH);
56 |   return 0;
57 | }
58 | 


--------------------------------------------------------------------------------
/ref/src/main.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <stdio.h>
  3 | #include <stdint.h>
  4 | #include <stdlib.h>
  5 | #include <string.h>
  6 | #include "comb.h"
  7 | #include "curve.h"
  8 | #include "f11_260.h"
  9 | #include "gen.h"
 10 | #include "scalar.h"
 11 | #include "sign.h"
 12 | 
 13 | #include <unistd.h>
 14 | 
 15 | int main(int _argc, char **argv) {
 16 |   residue_narrow_t x = {
 17 |     .limbs = {
 18 |       0x3553e74, 0x0464e4c, 0x61de408, 0x006a30e,
 19 |       0x6e9b25b, 0x3e6f39e, 0x19ec754, 0x5c71cc3,
 20 |       0x2bc1c0e, 0x554338e, 0x14e8b6e,
 21 |     },
 22 |   };
 23 | 
 24 |   residue_narrow_t two = {
 25 |     .limbs = {0x2},
 26 |   };
 27 | 
 28 |   residue_narrow_t x_plus_two;
 29 | 
 30 |   residue_narrow_reduced_t x_narrow_reduced = {
 31 |     .limbs = {
 32 |       0x206b305, 0x2f7c2ce, 0x0cf58a7, 0x2b81791, 0x19b26fa,
 33 |       0x2986830, 0x0503be5, 0x0789163, 0x16d90a0, 0x005a82e,
 34 |     },
 35 |   };
 36 | 
 37 |   residue_narrow_t y = {
 38 |     .limbs = {
 39 |       0x5f5b0e1, 0x4668277, 0x0f7d85a, 0x4515e42,
 40 |       0x00cb559, 0x3f8a910, 0x6655708, 0x3085b4d,
 41 |       0x581ceff, 0x3324c03, 0x56ed38e,
 42 |     },
 43 |   };
 44 | 
 45 |   residue_narrow_reduced_t y_narrow_reduced = {
 46 |     .limbs = {
 47 |       0x086dd54, 0x2f7aedb, 0x38904ae, 0x2e28aa4, 0x29de1ad,
 48 |       0x289d572, 0x0f6837a, 0x19987b1, 0x012fb71, 0x1c37867,
 49 |     },
 50 |   };
 51 | 
 52 |   residue_wide_t mul_expected = {
 53 |     .limbs = {
 54 |       0x1c508c4, 0x3eeb85d, 0x04bc914, 0x0a57e1c,
 55 |       0x1f13f9a, 0x2d8aa7d, 0x232cce3, 0x31e92c4,
 56 |       0x04fb073, 0x2582507, 0x06e9e1d,
 57 |     },
 58 |   };
 59 | 
 60 |   residue_wide_t square_expected = {
 61 |     .limbs = {
 62 |       0x2073353, 0x18e5de4, 0x320a4ab, 0x3ee123a,
 63 |       0x2d88419, 0x3d1ae13, 0x02b3dcf, 0x2997027,
 64 |       0x3d550a2, 0x220a052, 0x3088d3c,
 65 |     },
 66 |   };
 67 | 
 68 |   residue_narrow_t negative_one_redundant = {
 69 |     .limbs = {
 70 |       0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff,
 71 |       0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff,
 72 |       0x3ffffff, 0x3ffffff, 0x000000e,
 73 |     },
 74 |   };
 75 | 
 76 |   residue_narrow_t negative_t2_plus_one = {
 77 |     .limbs = {
 78 |       0x3ffffff, 0x000000e, 0x3ffffff, 0x3ffffff,
 79 |       0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff,
 80 |       0x3ffffff, 0x3ffffff, 0x000000e,
 81 |     },
 82 |   };
 83 | 
 84 |   residue_narrow_reduced_t negative_t2_plus_one_partial = {
 85 |     .limbs = {
 86 |       0x3fffff1, 0x0000000, 0x3fffff1, 0x3fffff1,
 87 |       0x3fffff1, 0x3fffff1, 0x3fffff1, 0x3fffff1,
 88 |       0x3fffff1, 0x3fffff1,
 89 |     },
 90 |   };
 91 | 
 92 |   residue_narrow_reduced_t negative_t2_plus_one_complete = {
 93 |     .limbs = {
 94 |       0x0000000, 0x0000001, 0x3fffff1, 0x3fffff1,
 95 |       0x3fffff1, 0x3fffff1, 0x3fffff1, 0x3fffff1,
 96 |       0x3fffff1, 0x3fffff1,
 97 |     },
 98 |   };
 99 | 
100 |   residue_narrow_t sqrt_x_plus_2_over_y = {
101 |     .limbs = {
102 |       0x3fa8549, 0x0706e5c, 0x3b33dc9, 0x3401712,
103 |       0x3a58fb3, 0x076ec4f, 0x3347ad0, 0x16ca1b0,
104 |       0x26ed559, 0x06033f0, 0x040bbb0,
105 |     },
106 |   };
107 | 
108 |   residue_narrow_t x_inverse = {
109 |     .limbs = {
110 |       0x17a9f53, 0x22e2983, 0x0f09456, 0x11fb41e,
111 |       0x1e47b3f, 0x37dd25f, 0x3bc6938, 0x2b654cd,
112 |       0x233a0b2, 0x3f8c25b, 0x09fd09b,
113 |     },
114 |   };
115 | 
116 |   #if 1
117 |   residue_narrow_t result;
118 |   residue_narrow_reduced_t result_narrow_reduced;
119 | 
120 |   mul_narrow(&result, &x, &y);
121 |   for (int i = 0; i < NLIMBS; ++i) {
122 |     assert(mul_expected.limbs[i] == result.limbs[i]);
123 |   }
124 | 
125 |   square_narrow(&result, &x);
126 |   for (int i = 0; i < NLIMBS; ++i) {
127 |     assert(square_expected.limbs[i] == result.limbs[i]);
128 |   }
129 | 
130 |   // The reduction function doesn't reduce this redundant version of negative
131 |   // one any more.
132 |   reduce_step_narrow(&result, &negative_one_redundant);
133 |   for (int i = 0; i < NLIMBS; ++i) {
134 |     assert(negative_one_redundant.limbs[i] == result.limbs[i]);
135 |   }
136 | 
137 |   reduce_step_narrow(&result, &negative_t2_plus_one);
138 |   for (int i = 0; i < NLIMBS; ++i) {
139 |     assert(negative_t2_plus_one.limbs[i] == result.limbs[i]);
140 |   }
141 | 
142 |   narrow_partial_complete(&result_narrow_reduced, &negative_t2_plus_one);
143 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
144 |     assert(negative_t2_plus_one_partial.limbs[i] ==
145 |         result_narrow_reduced.limbs[i]);
146 |   }
147 | 
148 |   narrow_complete(&result_narrow_reduced, &negative_t2_plus_one);
149 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
150 |     assert(negative_t2_plus_one_complete.limbs[i] ==
151 |         result_narrow_reduced.limbs[i]);
152 |   }
153 | 
154 |   scalar_t scalar_result;
155 |   scalar_t scalar_x = {
156 |     .limbs = {
157 |       0xa46168f9, 0x4cbf07a5, 0x62cf2928, 0xfd04242b, 0x3b12d23f,
158 |       0x355e9e63, 0xc22e849e, 0x6331c34a, 0x1,
159 |     },
160 |   };
161 |   scalar_t scalar_y = {
162 |     .limbs = {
163 |       0x148b9452, 0xaca9b6bb, 0xe0eeb33d, 0x7e64c899, 0xd61c602a,
164 |       0x96dcbb6b, 0x6a037c88, 0x39fbbaf0, 0x0,
165 |     },
166 |   };
167 |   scalar_t scalar_x_plus_y = {
168 |     .limbs = {
169 |       0xb8ecfd4b, 0xf968be60, 0x43bddc65, 0x7b68ecc5, 0x112f326a,
170 |       0xcc3b59cf, 0x2c320126, 0x9d2d7e3b, 0x1,
171 |     },
172 |   };
173 |   scalar_t scalar_x_plus_x = {
174 |     .limbs = {
175 |       0x48c2d1f2, 0x997e0f4b, 0xc59e5250, 0xfa084856, 0x7625a47f,
176 |       0x6abd3cc6, 0x845d093c, 0xc6638695, 0x2,
177 |     },
178 |   };
179 |   scalar_t scalar_x_plus_x_plus_x_plus_y = {
180 |     .limbs = {
181 |       0xd90232fc, 0xac09d5c3, 0xd4a42a06, 0x1a7823b2, 0x2a5e47bb,
182 |       0x24a61ea1, 0xa6cd4ac4, 0x639199d0, 0x0,
183 |     },
184 |   };
185 |   scalar_t scalar_x_minus_y = {
186 |     .limbs = {
187 |       0x8fd5d4a7, 0xa01550ea, 0x81e075ea, 0x7e9f5b91, 0x64f67215,
188 |       0x9e81e2f7, 0x582b0815, 0x2936085a, 0x1,
189 |     },
190 |   };
191 |   scalar_t scalar_y_minus_x = {
192 |     .limbs = {
193 |       0x98d7c79a, 0x46c7a6fd, 0xb2d78ec5, 0xdc59b5d7, 0xf8001d19,
194 |       0x73d094fc, 0xb196b789, 0xd6c962a5, 0x2,
195 |     },
196 |   };
197 |   scalar_t scalar_x_times_y = {
198 |     .limbs = {
199 |       0x30b3d35a, 0x9ca90acf, 0x6926efdd, 0x80620b0a, 0x52e190e7,
200 |       0x8011b9b8, 0x8c7d8f43, 0x90491703, 0x3,
201 |     },
202 |   };
203 |   scalar_t scalar_x_sabs = {
204 |     .limbs = {
205 |       0x80d57bfa, 0x58a59402, 0x47f78b34, 0x488fef43, 0xe39c4ac1,
206 |       0xf60a5f48, 0x4d93c310, 0xb19a0ba5, 0x0,
207 |     },
208 |   };
209 |   scalar_t scalar_y_sabs = {
210 |     .limbs = {
211 |       0x4d415fc7, 0xfc096781, 0x21635296, 0x36bcca2f, 0x5f9c594e,
212 |       0xaff2a9c7, 0x265f1ed5, 0x1cfebcf8, 0x2,
213 |     },
214 |   };
215 |   scalar_hash_t scalar_hash_val = {
216 |     .limbs = {
217 |       0xcbbc3de7, 0xa212405d, 0x5c85f47c, 0x79aa991c,
218 |       0xfe310944, 0x54075530, 0xd5ef6878, 0x72e57186,
219 |       0x36dcac18, 0xb72461e2, 0x5405caca, 0x4e9e0bff,
220 |       0x8d67a990, 0xf62f262c, 0x6df205dd, 0x24d78573,
221 |     },
222 |   };
223 |   scalar_t reduced_hash_val = {
224 |     .limbs = {
225 |       0xef1d4f9d, 0xd832a3a5, 0xdf1682be, 0x8d257e79, 0x41b1f2ca,
226 |       0x5be9564c, 0x320d4cb6, 0x108f8d04, 0x3,
227 |     },
228 |   };
229 | 
230 |   uint8_t buffer[33];
231 |   uint8_t encode_x[33] = {
232 |     0x05, 0xb3, 0x06, 0x3a, 0x0b, 0xdf,
233 |     0x7b, 0x8a, 0xf5, 0x4c, 0xe4, 0x05, 0xae,
234 |     0xfa, 0x26, 0x9b, 0xc1, 0xa0, 0x61,
235 |     0x5a, 0xbe, 0x03, 0xc5, 0x58, 0x24, 0x1e,
236 |     0xa0, 0x90, 0x6d, 0xb9, 0xa0, 0x16, 0x00,
237 |   };
238 |   uint8_t encode_y[33] = {
239 |     0x54, 0xdd, 0x86, 0x6c, 0xbb, 0xde,
240 |     0xeb, 0x4a, 0x90, 0x38, 0xa9, 0xa2, 0xb8,
241 |     0xad, 0xe1, 0x9d, 0xca, 0x55, 0x27,
242 |     0xaa, 0x37, 0x68, 0x4f, 0xec, 0x61, 0x66,
243 |     0x71, 0xfb, 0x12, 0x9c, 0xe1, 0x0d, 0x07,
244 |   };
245 | 
246 |   add_mod_l(&scalar_result, &scalar_x, &scalar_y);
247 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
248 |     assert(scalar_x_plus_y.limbs[i] == scalar_result.limbs[i]);
249 |   }
250 |   add_mod_l(&scalar_result, &scalar_x, &scalar_x);
251 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
252 |     assert(scalar_x_plus_x.limbs[i] == scalar_result.limbs[i]);
253 |   }
254 |   add_mod_l(&scalar_result, &scalar_x_plus_x, &scalar_x_plus_y);
255 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
256 |     assert(scalar_x_plus_x_plus_x_plus_y.limbs[i] == scalar_result.limbs[i]);
257 |   }
258 |   sub_mod_l(&scalar_result, &scalar_x, &scalar_y);
259 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
260 |     assert(scalar_x_minus_y.limbs[i] == scalar_result.limbs[i]);
261 |   }
262 | 
263 |   sub_mod_l(&scalar_result, &scalar_y, &scalar_x);
264 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
265 |     assert(scalar_y_minus_x.limbs[i] == scalar_result.limbs[i]);
266 |   }
267 | 
268 |   mult_mod_l(&scalar_result, &scalar_x, &scalar_y);
269 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
270 |     assert(scalar_x_times_y.limbs[i] == scalar_result.limbs[i]);
271 |   }
272 | 
273 |   convert_to_sabs(&scalar_result, &scalar_x);
274 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
275 |     assert(scalar_x_sabs.limbs[i] == scalar_result.limbs[i]);
276 |   }
277 | 
278 |   convert_to_sabs(&scalar_result, &scalar_y);
279 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
280 |     assert(scalar_y_sabs.limbs[i] == scalar_result.limbs[i]);
281 |   }
282 | 
283 |   encode(buffer, &x_narrow_reduced);
284 |   for (int i = 0; i < 33; ++i) {
285 |     assert(encode_x[i] == buffer[i]);
286 |   }
287 | 
288 |   encode(buffer, &y_narrow_reduced);
289 |   for (int i = 0; i < 33; ++i) {
290 |     assert(encode_y[i] == buffer[i]);
291 |   }
292 | 
293 |   decode(&result_narrow_reduced, encode_x);
294 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
295 |     assert(x_narrow_reduced.limbs[i] == result_narrow_reduced.limbs[i]);
296 |   }
297 | 
298 |   decode(&result_narrow_reduced, encode_y);
299 |   for (int i = 0; i < NLIMBS_REDUCED; ++i) {
300 |     assert(y_narrow_reduced.limbs[i] == result_narrow_reduced.limbs[i]);
301 |   }
302 | 
303 |   //x/y is not a quadratic residue, but (x+2)/y is.
304 |   assert(!sqrt_inv_narrow(&result, &x, &y));
305 |   add_narrow(&x_plus_two, &x, &two);
306 |   assert(sqrt_inv_narrow(&result, &x_plus_two, &y));
307 |   for (int i = 0; i < NLIMBS; ++i) {
308 |     assert(sqrt_x_plus_2_over_y.limbs[i] == result.limbs[i]);
309 |   }
310 | 
311 |   invert_narrow(&result, &x);
312 |   assert(equal_narrow(&result, &x_inverse));
313 | 
314 |   reduce_hash_mod_l(&scalar_result, &scalar_hash_val);
315 |   for (int i = 0; i < SCALAR_LIMBS; ++i) {
316 |     assert(reduced_hash_val.limbs[i] == scalar_result.limbs[i]);
317 |   }
318 | 
319 |   scalar_t mult_scalar = {
320 |     .limbs = {
321 |       0x55f0b9a3, 0x82b106c5, 0xcb2e2b7d, 0x30735cbc,
322 |       0xa512a8ba, 0x4c5cd391, 0xe9d0c788, 0x92bb2562, 0x3,
323 |     },
324 |   };
325 |   projective_pt_narrow_t expected_scalar_mult = {
326 |     .x = {
327 |       .limbs = {
328 |         0x1267d8d, 0x39a3cd3, 0x09e1275, 0x2d21378, 0x24771d9, 0x3558a1d,
329 |         0x3bdca9b, 0x0dd862d, 0x0bb230a, 0x1668292, 0x0350abe,
330 |       },
331 |     },
332 |     .y = {
333 |       .limbs = {
334 |         0x04d69fd, 0x03e739d, 0x36ce258, 0x0b6464b, 0x19dab22, 0x249c1a8,
335 |         0x1d28c7d, 0x1591dbc, 0x085ebab, 0x0e8274f, 0x0b090d6,
336 |       },
337 |     },
338 |     .z = {
339 |       .limbs = {0x1},
340 |     },
341 |   };
342 |   projective_pt_narrow_t result_pt;
343 | 
344 |   for (int i = 0; i<1; ++i) {
345 |     scalar_multiply(&result_pt, &B, &mult_scalar);
346 |   }
347 |   {
348 |     residue_narrow_t tmp;
349 |     mul_narrow(&tmp, &expected_scalar_mult.x, &result_pt.z);
350 |     assert(equal_narrow(&tmp, &result_pt.x));
351 |     mul_narrow(&tmp, &expected_scalar_mult.y, &result_pt.z);
352 |     assert(equal_narrow(&tmp, &result_pt.y));
353 |   }
354 | 
355 |   affine_pt_narrow_t expected_everything0 = {
356 |     .x = {
357 |       .limbs = {
358 |         0x20eef1a, 0x3c30e66, 0x0d710f0, 0x248a6fa, 0x30c967f, 0x3ce302c,
359 |         0x0ccd1f2, 0x197e993, 0x2ebaef3, 0x0f2f019, 0,
360 |       },
361 |     },
362 |     .y = {
363 |       .limbs = {
364 |         0x3017cc0, 0x02a5110, 0x06d37e5, 0x283a64a, 0x01484b5, 0x196f37b,
365 |         0x13de2d2, 0x0da32d1, 0x392e0fc, 0x221d742, 0,
366 |       },
367 |     },
368 |   };
369 | 
370 |   affine_pt_narrow_t expected_everything1 = {
371 |     .x = {
372 |       .limbs = {
373 |         0x0e35d45, 0x038f90c, 0x0283483, 0x01ee50a, 0x1e364f9, 0x362414c,
374 |         0x156b1ed, 0x006fff6, 0x271f9ed, 0x0ffa45d, 0,
375 |       },
376 |     },
377 |     .y = {
378 |       .limbs = {
379 |         0x156ae67, 0x27941ab, 0x19a3000, 0x3572ab5, 0x2b90ce3, 0x136156c,
380 |         0x0727496, 0x0edae82, 0x0fa5dfd, 0x16f293c, 0,
381 |       },
382 |     },
383 |   };
384 | 
385 |   affine_pt_narrow_t expected_everything2 = {
386 |     .x = {
387 |       .limbs = {
388 |         0x37fcb1b, 0x16004b9, 0x1d18743, 0x0bce648, 0x0d78db6, 0x35b1d65,
389 |         0x23bb620, 0x2fbc323, 0x1a9a586, 0x3b22577, 0,
390 |       },
391 |     },
392 |     .y = {
393 |       .limbs = {
394 |         0x082fb15, 0x03487d6, 0x3d1c2c9, 0x2c9e7ad, 0x187be10, 0x2e9b6ba,
395 |         0x15b8f89, 0x243ae4c, 0x328bb11, 0x00b12a9, 0,
396 |       },
397 |     },
398 |   };
399 | 
400 | 
401 |   affine_pt_narrow_t expected_everything3 = {
402 |     .x = {
403 |       .limbs = {
404 |         0x3e79b25, 0x2ca71b7, 0x2b2ea3c, 0x0de7ac4, 0x3026d10, 0x2bce79e,
405 |         0x1153866, 0x03e5a80, 0x22b9a37, 0x03e9c59, 0,
406 |       },
407 |     },
408 |     .y = {
409 |       .limbs = {
410 |         0x20100d6, 0x2330974, 0x3402585, 0x172cfd6, 0x275a21c, 0x213e87c,
411 |         0x29989f2, 0x155e437, 0x096a378, 0x3a674eb, 0,
412 |       },
413 |     },
414 |   };
415 | 
416 |   affine_pt_narrow_t expected_gray_code_end0 = {
417 |     .x = {
418 |       .limbs = {
419 |         0x14dd884, 0x12c9e33, 0x2d42122, 0x26f0b14, 0x1b9ea17, 0x3779e94,
420 |         0x2562a88, 0x0be34f0, 0x192ead9, 0x089ec45, 0,
421 |       },
422 |     },
423 |     .y = {
424 |       .limbs = {
425 |         0x1de5221, 0x172f820, 0x28c1b33, 0x08003c6, 0x0e65926, 0x188cd49,
426 |         0x3bb39fd, 0x1b9d8d7, 0x03d5020, 0x045742b, 0,
427 |       },
428 |     },
429 |   };
430 | 
431 |   affine_pt_narrow_t expected_gray_code_end1 = {
432 |     .x = {
433 |       .limbs = {
434 |         0x1d1cf29, 0x2e289d7, 0x1a83709, 0x2252d11, 0x3d6411c, 0x3fd73ad,
435 |         0x2737d9c, 0x2ca9eba, 0x058f290, 0x3879a7c, 0,
436 |       },
437 |     },
438 |     .y = {
439 |       .limbs = {
440 |         0x357399d, 0x0276752, 0x0d5199f, 0x1bbd3a0, 0x39044f1, 0x0c5e83a,
441 |         0x1a99cdd, 0x0dcb61f, 0x35b7272, 0x1184cff, 0,
442 |       },
443 |     },
444 |   };
445 | 
446 |   affine_pt_narrow_t expected_gray_code_end2 = {
447 |     .x = {
448 |       .limbs = {
449 |         0x1ea3c19, 0x081dc9e, 0x1a0b337, 0x1d7f3f4, 0x295a0aa, 0x1ebff45,
450 |         0x0956bf0, 0x17aae80, 0x05d8632, 0x3082c9a, 0,
451 |       },
452 |     },
453 |     .y = {
454 |       .limbs = {
455 |         0x22ad91f, 0x1ffcc65, 0x37b4f5c, 0x29c51ab, 0x3f9bd02, 0x296aaf9,
456 |         0x2a58b82, 0x2c54e16, 0x2a7672c, 0x21486e2, 0,
457 |       },
458 |     },
459 |   };
460 | 
461 |   affine_pt_narrow_t expected_gray_code_end3 = {
462 |     .x = {
463 |       .limbs = {
464 |         0x06b9c9d, 0x3d00674, 0x10a73fc, 0x30fda83, 0x139185c, 0x043e082,
465 |         0x3c67915, 0x208192a, 0x025e451, 0x258a566, 0,
466 |       },
467 |     },
468 |     .y = {
469 |       .limbs = {
470 |         0x3d2a04f, 0x1314c36, 0x131c7a3, 0x1882ef3, 0x1a0a5e8, 0x1919356,
471 |         0x0a5616a, 0x1eea31d, 0x2c216b3, 0x18ba4aa, 0,
472 |       },
473 |     },
474 |   };
475 | 
476 |   sabs_comb_set_t computed_base_comb;
477 |   compute_comb_set(&computed_base_comb, &B);
478 |   assert(equal_narrow(&computed_base_comb.combs[0].table[COMB_TABLE_SIZE - 1].x,
479 |     &expected_everything0.x));
480 |   assert(equal_narrow(&computed_base_comb.combs[0].table[COMB_TABLE_SIZE - 1].y,
481 |     &expected_everything0.y));
482 |   assert(equal_narrow(&computed_base_comb.combs[1].table[COMB_TABLE_SIZE - 1].x,
483 |     &expected_everything1.x));
484 |   assert(equal_narrow(&computed_base_comb.combs[1].table[COMB_TABLE_SIZE - 1].y,
485 |     &expected_everything1.y));
486 |   assert(equal_narrow(&computed_base_comb.combs[2].table[COMB_TABLE_SIZE - 1].x,
487 |     &expected_everything2.x));
488 |   assert(equal_narrow(&computed_base_comb.combs[2].table[COMB_TABLE_SIZE - 1].y,
489 |     &expected_everything2.y));
490 |   assert(equal_narrow(&computed_base_comb.combs[3].table[COMB_TABLE_SIZE - 1].x,
491 |     &expected_everything3.x));
492 |   assert(equal_narrow(&computed_base_comb.combs[3].table[COMB_TABLE_SIZE - 1].y,
493 |     &expected_everything3.y));
494 | 
495 | 
496 |   assert(equal_narrow(&computed_base_comb.combs[0].table[7].x,
497 |     &expected_gray_code_end0.x));
498 |   assert(equal_narrow(&computed_base_comb.combs[0].table[7].y,
499 |     &expected_gray_code_end0.y));
500 |   assert(equal_narrow(&computed_base_comb.combs[1].table[7].x,
501 |     &expected_gray_code_end1.x));
502 |   assert(equal_narrow(&computed_base_comb.combs[1].table[7].y,
503 |     &expected_gray_code_end1.y));
504 |   assert(equal_narrow(&computed_base_comb.combs[2].table[7].x,
505 |     &expected_gray_code_end2.x));
506 |   assert(equal_narrow(&computed_base_comb.combs[2].table[7].y,
507 |     &expected_gray_code_end2.y));
508 |   assert(equal_narrow(&computed_base_comb.combs[3].table[7].x,
509 |     &expected_gray_code_end3.x));
510 |   assert(equal_narrow(&computed_base_comb.combs[3].table[7].y,
511 |     &expected_gray_code_end3.y));
512 |   #endif
513 | 
514 |   #if 1
515 |   for (int i = 0; i<1; ++i) {
516 |     scalar_comb_multiply(&result_pt, &base_comb, &mult_scalar);
517 |   }
518 |   {
519 |     residue_narrow_t tmp;
520 |     mul_narrow(&tmp, &expected_scalar_mult.x, &result_pt.z);
521 |     assert(equal_narrow(&tmp, &result_pt.x));
522 |     mul_narrow(&tmp, &expected_scalar_mult.y, &result_pt.z);
523 |     assert(equal_narrow(&tmp, &result_pt.y));
524 |   }
525 |   #endif
526 |   #if 0
527 |   for (int i = 0; i<100000; ++i) {
528 |     scalar_t priv_key;
529 |     affine_pt_narrow_reduced_t pub_key;
530 |     gen_key(&priv_key, &pub_key);
531 |   }
532 |   #endif
533 |   #if 1
534 |   uint8_t encoded_sk[66];
535 |   scalar_t priv_key;
536 |   scalar_t priv_key_decoded;
537 |   affine_pt_narrow_t pub_key;
538 |   affine_pt_narrow_t pub_key_decoded;
539 |   gen_key(&priv_key, &pub_key);
540 |   memcpy(encoded_sk, &priv_key, SCALAR_BYTES);
541 |   encode_pub_key(encoded_sk + SCALAR_BYTES, &pub_key);
542 |   #if 1
543 |   priv_key_decoded.limbs[SCALAR_LIMBS - 1] = 0;
544 |   memcpy(&priv_key_decoded, encoded_sk, SCALAR_BYTES);
545 |   for (int j = 0; j < SCALAR_LIMBS; ++j) {
546 |     assert(priv_key.limbs[j] == priv_key_decoded.limbs[j]);
547 |   }
548 |   #endif
549 |   for (int i = 0; i < 1; ++i) {
550 |     uint8_t encoded_sig[65];
551 |     const uint8_t *msg = (uint8_t *) "Hello World!";
552 |     const size_t msglen = 13;
553 |     signature_t result;
554 |     sign(&result, &priv_key_decoded, encoded_sk + SCALAR_BYTES, msg, msglen);
555 |     encode_sig(encoded_sig, &result);
556 |     #if 1
557 |     if (1) {
558 |       signature_t result_decoded;
559 |       decode_sig(&result_decoded, encoded_sig);
560 |       for (int j = 0; j < SCALAR_LIMBS; ++j) {
561 |         assert(result.s.limbs[j] == result_decoded.s.limbs[j]);
562 |       }
563 |       for (int j = 0; j < NLIMBS_REDUCED; ++j) {
564 |         assert(result.y.limbs[j] == result_decoded.y.limbs[j]);
565 |       }
566 |       assert(decode_pub_key(&pub_key_decoded, encoded_sk + SCALAR_BYTES));
567 | 
568 |       uint8_t y_buf[RESIDUE_LENGTH_BYTES];
569 |       encode(y_buf, &result_decoded.y);
570 |       if(!verify(&result, y_buf, encoded_sk + SCALAR_BYTES, &pub_key_decoded, msg,
571 |                  msglen)) {
572 |         printf("verification failed\n");
573 |         exit(1);
574 |       }
575 |     }
576 |     #endif
577 |   }
578 |   #endif
579 | }
580 | 


--------------------------------------------------------------------------------
/ref/src/sign.c:
--------------------------------------------------------------------------------
  1 | #define _DEFAULT_SOURCE
  2 | #include <blake2.h>
  3 | #include <stdlib.h>
  4 | #include <string.h>
  5 | 
  6 | #include "comb.h"
  7 | #include "curve.h"
  8 | #include "scalar.h"
  9 | 
 10 | #include "sign.h"
 11 | 
 12 | #include "f11_260.c"
 13 | #include "curve.c"
 14 | #include "scalar.c"
 15 | #include "gen.c"
 16 | #include "constant_time.c"
 17 | #include "comb.c"
 18 | 
 19 | void sign(signature_t *result, scalar_t *priv_key,
 20 |   const uint8_t *pub_key, const uint8_t *msg, size_t msg_len) {
 21 |   blake2b_state hash_ctxt;
 22 | 
 23 |   char session_key_wash[16];
 24 | 
 25 |   scalar_hash_t scalar_large;
 26 |   scalar_t session_key;
 27 | 
 28 |   arc4random_buf(session_key_wash, sizeof(session_key_wash));
 29 |   blake2b_init_key(&hash_ctxt, 64, session_key_wash, sizeof(session_key_wash));
 30 |   blake2b_update(&hash_ctxt, (uint8_t *) priv_key, SCALAR_BYTES);
 31 |   blake2b_update(&hash_ctxt, (uint8_t *) msg, msg_len);
 32 |   blake2b_final(&hash_ctxt, (uint8_t *) &scalar_large, sizeof(scalar_hash_t));
 33 | 
 34 |   reduce_hash_mod_l(&session_key, &scalar_large);
 35 | 
 36 |   projective_pt_narrow_t result_pt;
 37 |   scalar_comb_multiply(&result_pt, &base_comb, &session_key);
 38 |   residue_narrow_t z_inv;
 39 | 
 40 |   invert_narrow(&z_inv, &result_pt.z);
 41 |   mul_narrow(&result_pt.x, &result_pt.x, &z_inv);
 42 |   mul_narrow(&result_pt.y, &result_pt.y, &z_inv);
 43 | 
 44 |   narrow_complete(&result->y, &result_pt.y);
 45 | 
 46 |   residue_narrow_reduced_t temp_narrow_reduced;
 47 |   narrow_partial_complete(&temp_narrow_reduced, &result_pt.x);
 48 |   result->y.limbs[NLIMBS_REDUCED - 1] |=
 49 |       is_odd(&temp_narrow_reduced) << (TBITS);
 50 | 
 51 |   uint8_t y_buf[RESIDUE_LENGTH_BYTES];
 52 |   encode(y_buf, &result->y);
 53 | 
 54 |   blake2b_init(&hash_ctxt, 64);
 55 |   blake2b_update(&hash_ctxt, y_buf, RESIDUE_LENGTH_BYTES);
 56 |   blake2b_update(&hash_ctxt, pub_key, RESIDUE_LENGTH_BYTES);
 57 |   blake2b_update(&hash_ctxt, msg, msg_len);
 58 |   blake2b_final(&hash_ctxt, (uint8_t *) &scalar_large, sizeof(scalar_hash_t));
 59 | 
 60 |   scalar_t hash_scalar;
 61 |   mont_reduce_hash_mod_l(&hash_scalar, &scalar_large);
 62 |   mont_mult_mod_l(&hash_scalar, &hash_scalar, priv_key);
 63 |   mont_mult_mod_l(&hash_scalar, &hash_scalar, &SCALAR_MONT_R2_HASH_MUL);
 64 |   sub_mod_l(&result->s, &session_key, &hash_scalar);
 65 | 
 66 |   explicit_bzero(&session_key, sizeof(session_key));
 67 |   explicit_bzero(&hash_scalar, sizeof(hash_scalar));
 68 |   explicit_bzero(&session_key_wash, sizeof(session_key_wash));
 69 | }
 70 | 
 71 | int verify(
 72 |   const signature_t *sig, const uint8_t *r_bytes, const uint8_t *pub_key_bytes,
 73 |   const affine_pt_narrow_t *pub_key_pt, const uint8_t *msg,
 74 |   size_t msg_len) {
 75 | 
 76 |   projective_pt_narrow_t sB;
 77 |   projective_pt_narrow_t hA;
 78 |   projective_pt_narrow_t result_pt;
 79 |   residue_narrow_reduced_t result_y;
 80 | 
 81 |   scalar_hash_t scalar_large;
 82 |   blake2b_state hash_ctxt;
 83 |   blake2b_init(&hash_ctxt, 64);
 84 |   blake2b_update(&hash_ctxt, r_bytes, RESIDUE_LENGTH_BYTES);
 85 |   blake2b_update(&hash_ctxt, pub_key_bytes, RESIDUE_LENGTH_BYTES);
 86 |   blake2b_update(&hash_ctxt, msg, msg_len);
 87 |   blake2b_final(&hash_ctxt, (uint8_t *) &scalar_large, sizeof(scalar_hash_t));
 88 | 
 89 |   scalar_t hash_scalar;
 90 |   reduce_hash_mod_l(&hash_scalar, &scalar_large);
 91 | 
 92 |   // Can use non-const version for both of these.
 93 |   scalar_comb_multiply_unsafe(&sB, &base_comb, &sig->s);
 94 |   scalar_multiply_unsafe(&hA, pub_key_pt, &hash_scalar);
 95 |   projective_add(&result_pt, &sB, &hA);
 96 | 
 97 |   // Everything below except the comparison should eventually be in helper
 98 |   // functions: Point affinization, and point compression bit-for-bit.
 99 |   // Same applies for the signing.
100 |   residue_narrow_t z_inv;
101 | 
102 |   invert_narrow(&z_inv, &result_pt.z);
103 |   mul_narrow(&result_pt.x, &result_pt.x, &z_inv);
104 |   mul_narrow(&result_pt.y, &result_pt.y, &z_inv);
105 | 
106 |   narrow_complete(&result_y, &result_pt.y);
107 | 
108 |   residue_narrow_reduced_t temp_narrow_reduced;
109 |   narrow_partial_complete(&temp_narrow_reduced, &result_pt.x);
110 |   result_y.limbs[NLIMBS_REDUCED - 1] |=
111 |       is_odd(&temp_narrow_reduced) << TBITS;
112 | 
113 |   return equal_narrow_reduced(&sig->y, &result_y);
114 | }
115 | 
116 | void encode_sig(uint8_t *result, const signature_t *sig) {
117 |   residue_narrow_reduced_t pack;
118 | 
119 |   memcpy(&pack, &sig->y, sizeof(residue_narrow_reduced_t));
120 |   // Save the upper two bits in the uppermost part of the 33rd byte
121 |   pack.limbs[NLIMBS_REDUCED - 1] |=
122 |     (sig->s.limbs[SCALAR_LIMBS - 1] & 0x3) << 28;
123 |   encode(result, &pack);
124 |   memcpy(result + RESIDUE_LENGTH_BYTES,
125 |          &sig->s, sizeof(uint32_t) * (SCALAR_LIMBS - 1));
126 | }
127 | 
128 | void decode_sig(signature_t *result, const uint8_t *encoded_sig) {
129 |   decode(&result->y, encoded_sig);
130 |   result->s.limbs[SCALAR_LIMBS - 1] = result->y.limbs[NLIMBS_REDUCED - 1] >> 28;
131 |   // We leave an extra bit for the sign bit from compression.
132 |   result->y.limbs[NLIMBS_REDUCED - 1] &= ((1 << (TBITS + 1)) - 1);
133 |   memcpy(&result->s, encoded_sig + RESIDUE_LENGTH_BYTES,
134 |          sizeof(uint32_t) * (SCALAR_LIMBS - 1));
135 | }
136 | 


--------------------------------------------------------------------------------