├── LICENSE ├── avx2 ├── Makefile ├── api.h ├── include │ ├── comb.c │ ├── comb.h │ ├── constant_time.c │ ├── constant_time.h │ ├── curve.c │ ├── curve.h │ ├── f11_260.c │ ├── f11_260.h │ ├── gen.c │ ├── gen.h │ ├── scalar.c │ ├── scalar.h │ └── sign.h └── src │ ├── api.c.supercop_only │ ├── main.c │ └── sign.c ├── avx512 ├── include └── src │ ├── f11_260.c │ ├── main.c │ ├── scalar.c │ └── sign.c └── ref ├── Makefile ├── api.h ├── include ├── comb.c ├── comb.h ├── constant_time.c ├── constant_time.h ├── curve.c ├── curve.h ├── f11_260.c ├── f11_260.h ├── gen.c ├── gen.h ├── scalar.c ├── scalar.h └── sign.h └── src ├── api.c.supercop_only ├── main.c └── sign.c /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019, Pyrofex Inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the p11_260 project. 27 | -------------------------------------------------------------------------------- /avx2/Makefile: -------------------------------------------------------------------------------- 1 | ../ref/Makefile -------------------------------------------------------------------------------- /avx2/api.h: -------------------------------------------------------------------------------- 1 | #define CRYPTO_SECRETKEYBYTES 66 2 | #define CRYPTO_PUBLICKEYBYTES 33 3 | #define CRYPTO_BYTES 65 4 | #define CRYPTO_VERSION "1.0" 5 | -------------------------------------------------------------------------------- /avx2/include/comb.h: -------------------------------------------------------------------------------- 1 | #ifndef COMB_H 2 | #define COMB_H 3 | 4 | #include "curve.h" 5 | #include "scalar.h" 6 | 7 | #define COMB_TABLE_SIZE 16 8 | #define COMB_TEETH 5 9 | #define COMB_COUNT 4 10 | #define COMB_SEPARATION 13 11 | #define COMB_LOOKUP_MASK 0xf 12 | 13 | // A single comb table. 14 | typedef struct sabs_single_comb { 15 | extended_affine_pt_readd_narrow_t table[COMB_TABLE_SIZE]; 16 | } sabs_single_comb_t; 17 | 18 | // A single wide comb table. Used in computing a narrow comb table. 19 | typedef struct sabs_single_comb_wide { 20 | projective_pt_wide_t table[COMB_TABLE_SIZE]; 21 | } sabs_single_comb_wide_t; 22 | 23 | // A comb set. There is a precomputed comb set for the base point, but for 24 | // verifications of several signatures from the same key, it would be 25 | // advantageous to precompute a comb. 26 | typedef struct sabs_comb_set { 27 | sabs_single_comb_t combs[COMB_COUNT]; 28 | } sabs_comb_set_t; 29 | 30 | // An unreduced comb set. Used just to separate the logic of comb computation 31 | // from comb reduction. 32 | typedef struct sabs_comb_set_wide { 33 | sabs_single_comb_wide_t combs[COMB_COUNT]; 34 | } sabs_comb_set_wide_t; 35 | 36 | // used for computing the entries in the comb table. 37 | typedef struct teeth_set { 38 | // We don't need the lowest tooth to compute the entries, because for signed 39 | // all bits set, to change the bit, you add or subtract a value of 2*bit. 40 | extended_pt_readd_wide_t teeth[COMB_TEETH - 1]; 41 | } teeth_set_t; 42 | 43 | // The base comb used for fast signatures. 44 | sabs_comb_set_t base_comb; 45 | 46 | // Compute a comb set for a given point. 47 | void compute_comb_set( 48 | sabs_comb_set_t *result, const affine_pt_narrow_t *base_pt); 49 | 50 | // Helper function used to compute a comb set. 51 | void reduce_comb_set(sabs_comb_set_t *result, sabs_comb_set_wide_t *source); 52 | 53 | // Constant time multiplication of a scalar times a point given the point's 54 | // comb. 55 | void scalar_comb_multiply( 56 | projective_pt_wide_t *result, const sabs_comb_set_t * __restrict comb, 57 | const scalar_t * __restrict n); 58 | 59 | // Non-Constant time multiplication of a scalar times a point given the point's 60 | // comb. Can be safely used during signature verification because there are no 61 | // secrets during verification. 62 | void scalar_comb_multiply_unsafe( 63 | projective_pt_wide_t *result, const sabs_comb_set_t * __restrict comb, 64 | const scalar_t * __restrict n); 65 | #endif 66 | -------------------------------------------------------------------------------- /avx2/include/constant_time.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "f11_260.h" 3 | #include "curve.h" 4 | 5 | #include "emmintrin.h" 6 | #include "immintrin.h" 7 | 8 | static inline void mask_copy_narrow( 9 | int32_t mask, residue_narrow_t *result, 10 | residue_narrow_t *x) { 11 | 12 | #pragma clang loop unroll(full) 13 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 14 | result->limbs[i+1] |= x->limbs[i] & mask; 15 | } 16 | } 17 | 18 | // 12 * 32 * 4 = 6 * 256 19 | void constant_time_extended_narrow_lookup( 20 | extended_pt_readd_narrow_t *result, int i, int n, 21 | const extended_pt_readd_narrow_t *table) { 22 | 23 | __m256i accum[6]; 24 | __m256i big_i = _mm256_set1_epi32(i); 25 | __m256i big_one = _mm256_set1_epi32(1); 26 | #pragma clang loop unroll(full) 27 | for (int j = 0; j < 6; ++j) { 28 | accum[j] = _mm256_setzero_si256(); 29 | } 30 | for (int j = 0; j < n; ++j) { 31 | __m256i mask = _mm256_cmpeq_epi32(big_i, _mm256_setzero_si256()); 32 | #pragma clang loop unroll(full) 33 | for (int k = 0; k < 6; ++k) { 34 | __m256i temp = _mm256_load_si256(((__m256i*) &table[j]) + k); 35 | temp = _mm256_and_si256(temp, mask); 36 | accum[k] = _mm256_or_si256(accum[k], temp); 37 | } 38 | big_i = _mm256_sub_epi64(big_i, big_one); 39 | } 40 | for (int j = 0; j < 6; ++j) { 41 | _mm256_store_si256(((__m256i*) result) + j, accum[j]); 42 | } 43 | } 44 | 45 | void constant_time_extended_affine_narrow_lookup( 46 | extended_affine_pt_readd_narrow_t *result, int i, int n, 47 | const extended_affine_pt_readd_narrow_t *table) { 48 | 49 | __m256i accum[5]; 50 | __m256i big_i = _mm256_set1_epi32(i); 51 | __m256i big_one = _mm256_set1_epi32(1); 52 | #pragma clang loop unroll(full) 53 | for (int j = 0; j < 5; ++j) { 54 | accum[j] = _mm256_setzero_si256(); 55 | } 56 | for (int j = 0; j < n; ++j) { 57 | __m256i mask = _mm256_cmpeq_epi32(big_i, _mm256_setzero_si256()); 58 | #pragma clang loop unroll(full) 59 | for (int k = 0; k < 5; ++k) { 60 | __m256i temp = _mm256_load_si256(((__m256i*) &table[j]) + k); 61 | temp = _mm256_and_si256(temp, mask); 62 | accum[k] = _mm256_or_si256(accum[k], temp); 63 | } 64 | big_i = _mm256_sub_epi64(big_i, big_one); 65 | } 66 | for (int j = 0; j < 5; ++j) { 67 | _mm256_store_si256(((__m256i*) result) + j, accum[j]); 68 | } 69 | } 70 | 71 | void constant_time_cond_extended_negate( 72 | extended_pt_readd_narrow_t *x, int32_t mask32) { 73 | __m256i zero = _mm256_setzero_si256(); 74 | __m256i mask = _mm256_set1_epi32(mask32); 75 | __m256i not_mask = _mm256_set1_epi32(~mask32); 76 | 77 | #pragma clang loop unroll(full) 78 | for (int i = 0; i < 3; ++i) { 79 | __m256i temp = _mm256_load_si256(((__m256i*) x) + i); 80 | __m256i neg_temp = _mm256_sub_epi32(zero, temp); 81 | temp = _mm256_and_si256(not_mask, temp); 82 | neg_temp = _mm256_and_si256(mask, neg_temp); 83 | temp = _mm256_or_si256(temp, neg_temp); 84 | _mm256_store_si256(((__m256i*) x) + i, temp); 85 | } 86 | } 87 | 88 | void constant_time_cond_extended_affine_negate( 89 | extended_affine_pt_readd_narrow_t *x, int32_t mask32) { 90 | __m256i zero = _mm256_setzero_si256(); 91 | __m256i mask = _mm256_set1_epi32(mask32); 92 | __m256i not_mask = _mm256_set1_epi32(~mask32); 93 | 94 | #pragma clang loop unroll(full) 95 | for (int i = 0; i < 3; ++i) { 96 | __m256i temp = _mm256_load_si256(((__m256i*) x) + i); 97 | __m256i neg_temp = _mm256_sub_epi32(zero, temp); 98 | temp = _mm256_and_si256(not_mask, temp); 99 | neg_temp = _mm256_and_si256(mask, neg_temp); 100 | temp = _mm256_or_si256(temp, neg_temp); 101 | _mm256_store_si256(((__m256i*) x) + i, temp); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /avx2/include/constant_time.h: -------------------------------------------------------------------------------- 1 | #ifndef CONSTANT_TIME_H 2 | #define CONSTANT_TIME_H 3 | #include 4 | #include "f11_260.h" 5 | #include "curve.h" 6 | 7 | void constant_time_extended_narrow_lookup( 8 | extended_pt_readd_narrow_t *result, int i, int n, 9 | const extended_pt_readd_narrow_t *table); 10 | 11 | void constant_time_extended_affine_narrow_lookup( 12 | extended_affine_pt_readd_narrow_t *result, int i, int n, 13 | const extended_affine_pt_readd_narrow_t *table); 14 | 15 | void constant_time_cond_extended_negate( 16 | extended_pt_readd_narrow_t *x, int32_t mask); 17 | 18 | void constant_time_cond_extended_affine_negate( 19 | extended_affine_pt_readd_narrow_t *x, int32_t mask); 20 | #endif 21 | -------------------------------------------------------------------------------- /avx2/include/curve.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "f11_260.h" 3 | #include "scalar.h" 4 | #include "curve.h" 5 | #include "constant_time.h" 6 | 7 | __attribute__((__aligned__(32))) 8 | const affine_pt_narrow_t B = { 9 | .x = { 10 | .limbs = { 11 | 0, 0x2862b8b, 0x0f08ed2, 0x06e65ee, 0x0c05991, 0x2b12b17, 12 | 0x0049432, 0x33a3707, 0x16e5186, 0x2947e71, 0x0ed9bab, 0, 13 | }, 14 | }, 15 | .y = { 16 | .limbs = { 17 | 0x0, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 18 | }, 19 | }, 20 | }; 21 | 22 | void copy_projective_pt_wide( 23 | projective_pt_wide_t *result, const projective_pt_wide_t *source) { 24 | 25 | for(int i = 0; i < NLIMBS; ++i) { 26 | result->x.limbs[i] = source->x.limbs[i]; 27 | result->y.limbs[i] = source->y.limbs[i]; 28 | result->z.limbs[i] = source->z.limbs[i]; 29 | } 30 | } 31 | 32 | void copy_extended_pt_wide( 33 | extended_pt_wide_t *result, 34 | const extended_pt_wide_t *source) { 35 | 36 | for(int i = 0; i < NLIMBS; ++i) { 37 | result->x.limbs[i] = source->x.limbs[i]; 38 | result->y.limbs[i] = source->y.limbs[i]; 39 | result->t.limbs[i] = source->t.limbs[i]; 40 | result->z.limbs[i] = source->z.limbs[i]; 41 | } 42 | } 43 | 44 | void copy_extended_pt_readd_wide( 45 | extended_pt_readd_wide_t *result, 46 | const extended_pt_readd_wide_t *source) { 47 | 48 | for(int i = 0; i < NLIMBS; ++i) { 49 | result->x.limbs[i] = source->x.limbs[i]; 50 | result->y.limbs[i] = source->y.limbs[i]; 51 | result->dt.limbs[i] = source->dt.limbs[i]; 52 | result->z.limbs[i] = source->z.limbs[i]; 53 | } 54 | } 55 | 56 | void copy_extended_pt_readd_narrow( 57 | extended_pt_readd_narrow_t *result, 58 | const extended_pt_readd_narrow_t *source) { 59 | for(int i = 0; i < NLIMBS; ++i) { 60 | result->x.limbs[i] = source->x.limbs[i]; 61 | result->y.limbs[i] = source->y.limbs[i]; 62 | result->dt.limbs[i] = source->dt.limbs[i]; 63 | result->z.limbs[i] = source->z.limbs[i]; 64 | } 65 | } 66 | 67 | void copy_extended_affine_pt_readd_narrow( 68 | extended_affine_pt_readd_narrow_t *result, 69 | const extended_affine_pt_readd_narrow_t *source) { 70 | for(int i = 0; i < NLIMBS; ++i) { 71 | result->x.limbs[i] = source->x.limbs[i]; 72 | result->y.limbs[i] = source->y.limbs[i]; 73 | result->dt.limbs[i] = source->dt.limbs[i]; 74 | } 75 | } 76 | 77 | void negate_extended_pt_readd_wide( 78 | extended_pt_readd_wide_t *result, 79 | const extended_pt_readd_wide_t *source) { 80 | for(int i = 0; i < NLIMBS; ++i) { 81 | result->x.limbs[i] = -source->x.limbs[i]; 82 | result->y.limbs[i] = source->y.limbs[i]; 83 | result->dt.limbs[i] = -source->dt.limbs[i]; 84 | result->z.limbs[i] = source->z.limbs[i]; 85 | } 86 | } 87 | 88 | void negate_extended_affine_pt_readd_narrow( 89 | extended_affine_pt_readd_narrow_t *result, 90 | const extended_affine_pt_readd_narrow_t *source) { 91 | for(int i = 0; i < NLIMBS; ++i) { 92 | result->x.limbs[i] = -source->x.limbs[i]; 93 | result->dt.limbs[i] = -source->dt.limbs[i]; 94 | result->y.limbs[i] = source->y.limbs[i]; 95 | } 96 | } 97 | 98 | void negate_extended_pt_readd_narrow( 99 | extended_pt_readd_narrow_t *result, 100 | const extended_pt_readd_narrow_t *source) { 101 | for(int i = 0; i < NLIMBS; ++i) { 102 | result->x.limbs[i] = -source->x.limbs[i]; 103 | result->dt.limbs[i] = -source->dt.limbs[i]; 104 | result->y.limbs[i] = source->y.limbs[i]; 105 | result->z.limbs[i] = source->z.limbs[i]; 106 | } 107 | } 108 | 109 | void affine_narrow_to_extended( 110 | extended_pt_wide_t *result, 111 | const affine_pt_narrow_t * __restrict x) { 112 | 113 | for(int i = 0; i < NLIMBS; ++i) { 114 | result->x.limbs[i] = x->x.limbs[i]; 115 | result->y.limbs[i] = x->y.limbs[i]; 116 | result->z.limbs[i] = 0; 117 | } 118 | result->z.limbs[1] = 1; 119 | mul_wide(&result->t, &result->x, &result->y); 120 | } 121 | 122 | void extended_to_projective_wide( 123 | projective_pt_wide_t *result, const extended_pt_wide_t * __restrict x) { 124 | for(int i = 0; i < NLIMBS; ++i) { 125 | result->x.limbs[i] = x->x.limbs[i]; 126 | result->y.limbs[i] = x->y.limbs[i]; 127 | result->z.limbs[i] = x->z.limbs[i]; 128 | } 129 | } 130 | 131 | void affine_to_readd_narrow( 132 | extended_pt_readd_narrow_t *result, 133 | const affine_pt_narrow_t * __restrict x) { 134 | 135 | for(int i = 0; i < NLIMBS; ++i) { 136 | result->x.limbs[i] = x->x.limbs[i]; 137 | result->y.limbs[i] = x->y.limbs[i]; 138 | result->z.limbs[i] = 0; 139 | } 140 | result->z.limbs[1] = 1; 141 | 142 | residue_wide_t xy; 143 | residue_wide_t dt_wide; 144 | mul_narrow(&xy, &x->x, &x->y); 145 | mul_wide_const(&dt_wide, &xy, D); 146 | narrow(&result->dt, &dt_wide); 147 | } 148 | 149 | void affine_to_readd_wide( 150 | extended_pt_readd_wide_t *result, 151 | const affine_pt_narrow_t * __restrict x) { 152 | 153 | for(int i = 0; i < NLIMBS; ++i) { 154 | result->x.limbs[i] = x->x.limbs[i]; 155 | result->y.limbs[i] = x->y.limbs[i]; 156 | result->z.limbs[i] = 0; 157 | } 158 | result->z.limbs[1] = 1; 159 | 160 | residue_wide_t xy; 161 | mul_narrow(&xy, &x->x, &x->y); 162 | mul_wide_const(&result->dt, &xy, D); 163 | } 164 | 165 | void extended_to_readd_wide_neg( 166 | extended_pt_readd_wide_t *result, 167 | const extended_pt_wide_t * __restrict x) { 168 | 169 | for(int i = 0; i < NLIMBS; ++i) { 170 | result->x.limbs[i] = -(x->x.limbs[i]); 171 | result->y.limbs[i] = x->y.limbs[i]; 172 | result->z.limbs[i] = x->z.limbs[i]; 173 | } 174 | mul_wide_const(&result->dt, &x->t, -D); 175 | } 176 | 177 | void affine_double( 178 | projective_pt_wide_t *result, 179 | const affine_pt_narrow_t * __restrict x) { 180 | 181 | residue_narrow_t x_plus_y; 182 | residue_wide_t a, b, e, e_tmp, g, g_minus_2, h; 183 | square_narrow(&a, &x->x); 184 | square_narrow(&b, &x->y); 185 | 186 | add_narrow(&x_plus_y, &x->x, &x->y); 187 | 188 | square_narrow(&e, &x_plus_y); 189 | sub_wide(&e_tmp, &e, &a); 190 | sub_wide(&e, &e_tmp, &b); 191 | add_wide(&g, &a, &b); 192 | 193 | for (int i = 0; i < NLIMBS; ++i) { 194 | g_minus_2.limbs[i] = g.limbs[i]; 195 | } 196 | g_minus_2.limbs[1] -= 2; 197 | 198 | sub_wide(&h, &a, &b); 199 | mul_wide(&result->x, &e, &g_minus_2); 200 | mul_wide(&result->y, &g, &h); 201 | mul_wide(&result->z, &g, &g_minus_2); 202 | } 203 | 204 | void affine_double_extended( 205 | extended_pt_wide_t *result, const affine_pt_narrow_t * __restrict x) { 206 | 207 | residue_narrow_t x_plus_y; 208 | residue_wide_t a, b, e, e_tmp, g, g_minus_2, h; 209 | square_narrow(&a, &x->x); 210 | square_narrow(&b, &x->y); 211 | 212 | add_narrow(&x_plus_y, &x->x, &x->y); 213 | square_narrow(&e, &x_plus_y); 214 | sub_wide(&e_tmp, &e, &a); 215 | sub_wide(&e, &e_tmp, &b); 216 | add_wide(&g, &a, &b); 217 | 218 | for (int i = 0; i < NLIMBS; ++i) { 219 | g_minus_2.limbs[i] = g.limbs[i]; 220 | } 221 | g_minus_2.limbs[1] -= 2; 222 | 223 | sub_wide(&h, &a, &b); 224 | mul_wide(&result->x, &e, &g_minus_2); 225 | mul_wide(&result->y, &g, &h); 226 | mul_wide(&result->t, &e, &h); 227 | mul_wide(&result->z, &g, &g_minus_2); 228 | } 229 | 230 | void projective_double( 231 | projective_pt_wide_t *result, const projective_pt_wide_t *x) { 232 | 233 | residue_wide_t x_plus_y; 234 | residue_wide_t a, b, c, c_temp, e, e_tmp, f, g, h; 235 | add_wide(&x_plus_y, &x->x, &x->y); 236 | square_wide(&a, &x->x); 237 | square_wide(&b, &x->y); 238 | square_wide(&c_temp, &x->z); 239 | double_wide(&c, &c_temp); 240 | 241 | square_wide(&e, &x_plus_y); 242 | sub_wide(&e_tmp, &e, &a); 243 | sub_wide(&e, &e_tmp, &b); 244 | add_wide(&g, &a, &b); 245 | sub_wide(&f, &g, &c); 246 | sub_wide(&h, &a, &b); 247 | 248 | mul_wide(&result->x, &e, &f); 249 | mul_wide(&result->y, &g, &h); 250 | mul_wide(&result->z, &f, &g); 251 | } 252 | 253 | void projective_double_extended( 254 | extended_pt_wide_t *result, const projective_pt_wide_t * __restrict x) { 255 | 256 | residue_wide_t x_plus_y; 257 | residue_wide_t a, b, c, c_temp, e, e_tmp, f, g, h; 258 | add_wide(&x_plus_y, &x->x, &x->y); 259 | square_wide(&a, &x->x); 260 | square_wide(&b, &x->y); 261 | square_wide(&c_temp, &x->z); 262 | double_wide(&c, &c_temp); 263 | 264 | square_wide(&e, &x_plus_y); 265 | sub_wide(&e_tmp, &e, &a); 266 | sub_wide(&e, &e_tmp, &b); 267 | add_wide(&g, &a, &b); 268 | sub_wide(&f, &g, &c); 269 | sub_wide(&h, &a, &b); 270 | 271 | mul_wide(&result->x, &e, &f); 272 | mul_wide(&result->y, &g, &h); 273 | mul_wide(&result->t, &e, &h); 274 | mul_wide(&result->z, &f, &g); 275 | } 276 | 277 | void extended_double_extended( 278 | extended_pt_wide_t *result, const extended_pt_wide_t *x) { 279 | 280 | residue_wide_t x_plus_y; 281 | residue_wide_t a, b, c, c_temp, e, e_tmp, f, g, h; 282 | add_wide(&x_plus_y, &x->x, &x->y); 283 | square_wide(&a, &x->x); 284 | square_wide(&b, &x->y); 285 | square_wide(&c_temp, &x->z); 286 | double_wide(&c, &c_temp); 287 | 288 | square_wide(&e, &x_plus_y); 289 | sub_wide(&e_tmp, &e, &a); 290 | sub_wide(&e, &e_tmp, &b); 291 | add_wide(&g, &a, &b); 292 | sub_wide(&f, &g, &c); 293 | sub_wide(&h, &a, &b); 294 | 295 | mul_wide(&result->x, &e, &f); 296 | mul_wide(&result->z, &f, &g); 297 | mul_wide(&result->y, &g, &h); 298 | mul_wide(&result->t, &e, &h); 299 | } 300 | 301 | void projective_add( 302 | projective_pt_wide_t *result, const projective_pt_wide_t * __restrict x1, 303 | const projective_pt_wide_t * __restrict x2) { 304 | 305 | residue_wide_t x1_plus_y1, x2_plus_y2; 306 | residue_wide_t a, b, c, d, e, e_temp, f, g, t1, t2; 307 | 308 | mul_wide(&a, &x1->z, &x2->z); 309 | square_wide(&b, &a); 310 | mul_wide(&c, &x1->x, &x2->x); 311 | mul_wide(&d, &x1->y, &x2->y); 312 | mul_wide_const(&e_temp, &c, D); 313 | mul_wide(&e, &e_temp, &d); 314 | 315 | sub_wide(&f, &b, &e); 316 | add_wide(&g, &b, &e); 317 | add_wide(&x1_plus_y1, &x1->x, &x1->y); 318 | add_wide(&x2_plus_y2, &x2->x, &x2->y); 319 | 320 | mul_wide(&t1, &x1_plus_y1, &x2_plus_y2); 321 | sub_wide(&t2, &t1, &c); 322 | sub_wide(&t1, &t2, &d); 323 | mul_wide(&t2, &t1, &f); 324 | mul_wide(&result->x, &t2, &a); 325 | 326 | sub_wide(&t1, &d, &c); 327 | mul_wide(&t2, &t1, &g); 328 | mul_wide(&result->y, &t2, &a); 329 | 330 | mul_wide(&result->z, &f, &g); 331 | } 332 | 333 | void extended_add( 334 | projective_pt_wide_t *result, const extended_pt_wide_t * __restrict x1, 335 | const extended_pt_wide_t * __restrict x2) { 336 | 337 | residue_wide_t x1_plus_y1, x2_plus_y2; 338 | residue_wide_t a, b, c, c_temp, d, e, e_temp, f, g, h; 339 | 340 | mul_wide(&a, &x1->x, &x2->x); 341 | mul_wide(&b, &x1->y, &x2->y); 342 | mul_wide_const(&c_temp, &x1->t, D); 343 | mul_wide(&c, &c_temp, &x2->t); 344 | mul_wide(&d, &x1->z, &x2->z); 345 | 346 | add_wide(&x1_plus_y1, &x1->x, &x1->y); 347 | add_wide(&x2_plus_y2, &x2->x, &x2->y); 348 | mul_wide(&e, &x1_plus_y1, &x2_plus_y2); 349 | sub_wide(&e_temp, &e, &a); 350 | sub_wide(&e, &e_temp, &b); 351 | sub_wide(&f, &d, &c); 352 | add_wide(&g, &d, &c); 353 | sub_wide(&h, &b, &a); 354 | 355 | mul_wide(&result->x, &e, &f); 356 | mul_wide(&result->z, &f, &g); 357 | mul_wide(&result->y, &g, &h); 358 | } 359 | 360 | void extended_add_extended( 361 | extended_pt_wide_t *result, const extended_pt_wide_t *x1, 362 | const extended_pt_wide_t *x2) { 363 | 364 | residue_wide_t x1_plus_y1, x2_plus_y2; 365 | residue_wide_t a, b, c, c_temp, d, e, e_temp, f, g, h; 366 | 367 | mul_wide(&a, &x1->x, &x2->x); 368 | mul_wide(&b, &x1->y, &x2->y); 369 | mul_wide_const(&c_temp, &x1->t, D); 370 | mul_wide(&c, &c_temp, &x2->t); 371 | mul_wide(&d, &x1->z, &x2->z); 372 | 373 | add_wide(&x1_plus_y1, &x1->x, &x1->y); 374 | add_wide(&x2_plus_y2, &x2->x, &x2->y); 375 | mul_wide(&e, &x1_plus_y1, &x2_plus_y2); 376 | sub_wide(&e_temp, &e, &a); 377 | sub_wide(&e, &e_temp, &b); 378 | sub_wide(&f, &d, &c); 379 | add_wide(&g, &d, &c); 380 | sub_wide(&h, &b, &a); 381 | 382 | mul_wide(&result->x, &e, &f); 383 | mul_wide(&result->z, &f, &g); 384 | mul_wide(&result->y, &g, &h); 385 | mul_wide(&result->t, &e, &h); 386 | } 387 | 388 | void extended_readd_wide_extended( 389 | extended_pt_wide_t *result, 390 | const extended_pt_wide_t *x1, 391 | const extended_pt_readd_wide_t * __restrict x2) { 392 | 393 | residue_wide_t x1_plus_y1, x2_plus_y2; 394 | residue_wide_t a, b, c, d, e, e_temp, f, g, h; 395 | 396 | mul_wide(&a, &x1->x, &x2->x); 397 | mul_wide(&b, &x1->y, &x2->y); 398 | mul_wide(&c, &x1->t, &x2->dt); 399 | mul_wide(&d, &x1->z, &x2->z); 400 | 401 | add_wide(&x1_plus_y1, &x1->x, &x1->y); 402 | add_wide(&x2_plus_y2, &x2->x, &x2->y); 403 | mul_wide(&e, &x1_plus_y1, &x2_plus_y2); 404 | sub_wide(&e_temp, &e, &a); 405 | sub_wide(&e, &e_temp, &b); 406 | sub_wide(&f, &d, &c); 407 | add_wide(&g, &d, &c); 408 | sub_wide(&h, &b, &a); 409 | 410 | mul_wide(&result->x, &e, &f); 411 | mul_wide(&result->z, &f, &g); 412 | mul_wide(&result->y, &g, &h); 413 | mul_wide(&result->t, &e, &h); 414 | } 415 | 416 | void extended_readd_narrow_extended( 417 | extended_pt_wide_t *result, const extended_pt_wide_t * __restrict x1, 418 | const extended_pt_readd_narrow_t * __restrict x2) { 419 | 420 | residue_wide_t x1_plus_y1; 421 | residue_narrow_t x2_plus_y2; 422 | residue_wide_t a, b, c, d, e, e_temp, f, g, h; 423 | 424 | mul_wide_narrow(&a, &x1->x, &x2->x); 425 | mul_wide_narrow(&b, &x1->y, &x2->y); 426 | mul_wide_narrow(&c, &x1->t, &x2->dt); 427 | mul_wide_narrow(&d, &x1->z, &x2->z); 428 | 429 | add_wide(&x1_plus_y1, &x1->x, &x1->y); 430 | add_narrow(&x2_plus_y2, &x2->x, &x2->y); 431 | mul_wide_narrow(&e, &x1_plus_y1, &x2_plus_y2); 432 | sub_wide(&e_temp, &e, &a); 433 | sub_wide(&e, &e_temp, &b); 434 | sub_wide(&f, &d, &c); 435 | add_wide(&g, &d, &c); 436 | sub_wide(&h, &b, &a); 437 | 438 | mul_wide(&result->x, &e, &f); 439 | mul_wide(&result->z, &f, &g); 440 | mul_wide(&result->y, &g, &h); 441 | mul_wide(&result->t, &e, &h); 442 | } 443 | 444 | void extended_readd_narrow( 445 | projective_pt_wide_t *result, const extended_pt_wide_t * __restrict x1, 446 | const extended_pt_readd_narrow_t * __restrict x2) { 447 | 448 | residue_wide_t x1_plus_y1; 449 | residue_narrow_t x2_plus_y2; 450 | residue_wide_t a, b, c, d, e, e_temp, f, g, h; 451 | 452 | mul_wide_narrow(&a, &x1->x, &x2->x); 453 | mul_wide_narrow(&b, &x1->y, &x2->y); 454 | mul_wide_narrow(&c, &x1->t, &x2->dt); 455 | mul_wide_narrow(&d, &x1->z, &x2->z); 456 | 457 | add_wide(&x1_plus_y1, &x1->x, &x1->y); 458 | add_narrow(&x2_plus_y2, &x2->x, &x2->y); 459 | mul_wide_narrow(&e, &x1_plus_y1, &x2_plus_y2); 460 | sub_wide(&e_temp, &e, &a); 461 | sub_wide(&e, &e_temp, &b); 462 | sub_wide(&f, &d, &c); 463 | add_wide(&g, &d, &c); 464 | sub_wide(&h, &b, &a); 465 | 466 | mul_wide(&result->x, &e, &f); 467 | mul_wide(&result->z, &f, &g); 468 | mul_wide(&result->y, &g, &h); 469 | } 470 | 471 | void extended_readd_affine_narrow_extended( 472 | extended_pt_wide_t *result, const extended_pt_wide_t *x1, 473 | const extended_affine_pt_readd_narrow_t * __restrict x2) { 474 | 475 | residue_wide_t x1_plus_y1; 476 | residue_narrow_t x2_plus_y2; 477 | residue_wide_t a, b, c, e, e_temp, f, g, h; 478 | 479 | mul_wide_narrow(&a, &x1->x, &x2->x); 480 | mul_wide_narrow(&b, &x1->y, &x2->y); 481 | mul_wide_narrow(&c, &x1->t, &x2->dt); 482 | 483 | add_wide(&x1_plus_y1, &x1->x, &x1->y); 484 | add_narrow(&x2_plus_y2, &x2->x, &x2->y); 485 | mul_wide_narrow(&e, &x1_plus_y1, &x2_plus_y2); 486 | sub_wide(&e_temp, &e, &a); 487 | sub_wide(&e, &e_temp, &b); 488 | sub_wide(&f, &x1->z, &c); 489 | add_wide(&g, &x1->z, &c); 490 | sub_wide(&h, &b, &a); 491 | 492 | mul_wide(&result->x, &e, &f); 493 | mul_wide(&result->z, &f, &g); 494 | mul_wide(&result->y, &g, &h); 495 | mul_wide(&result->t, &e, &h); 496 | } 497 | 498 | void extended_readd_readd_narrow( 499 | extended_pt_readd_narrow_t *result, 500 | const extended_pt_wide_t * __restrict x1, 501 | const extended_pt_readd_narrow_t * __restrict x2) { 502 | 503 | residue_wide_t x1_plus_y1; 504 | residue_narrow_t x2_plus_y2; 505 | residue_wide_t a, b, c, d, e, e_temp, f, g, h, x3, y3, t3, dt3, z3; 506 | 507 | mul_wide_narrow(&a, &x1->x, &x2->x); 508 | mul_wide_narrow(&b, &x1->y, &x2->y); 509 | mul_wide_narrow(&c, &x1->t, &x2->dt); 510 | mul_wide_narrow(&d, &x1->z, &x2->z); 511 | 512 | add_wide(&x1_plus_y1, &x1->x, &x1->y); 513 | add_narrow(&x2_plus_y2, &x2->x, &x2->y); 514 | mul_wide_narrow(&e, &x1_plus_y1, &x2_plus_y2); 515 | sub_wide(&e_temp, &e, &a); 516 | sub_wide(&e, &e_temp, &b); 517 | sub_wide(&f, &d, &c); 518 | add_wide(&g, &d, &c); 519 | sub_wide(&h, &b, &a); 520 | 521 | mul_wide(&x3, &e, &f); 522 | mul_wide(&z3, &f, &g); 523 | mul_wide(&y3, &g, &h); 524 | mul_wide(&t3, &e, &h); 525 | 526 | narrow(&result->x, &x3); 527 | narrow(&result->y, &y3); 528 | mul_wide_const(&dt3, &t3, D); 529 | narrow(&result->dt, &dt3); 530 | narrow(&result->z, &z3); 531 | } 532 | 533 | void readd_to_projective( 534 | projective_pt_wide_t *result, 535 | const extended_pt_readd_narrow_t * __restrict x) { 536 | 537 | widen(&result->x, &x->x); 538 | widen(&result->y, &x->y); 539 | widen(&result->z, &x->z); 540 | } 541 | 542 | void affine_readd_to_extended( 543 | extended_pt_wide_t *result, 544 | const extended_affine_pt_readd_narrow_t * __restrict x) { 545 | 546 | widen(&result->x, &x->x); 547 | widen(&result->y, &x->y); 548 | mul_narrow(&result->t, &x->x, &x->y); 549 | for (int i = 0; i < NLIMBS; ++i) { 550 | result->z.limbs[i] = 0; 551 | } 552 | result->z.limbs[1] = 1; 553 | } 554 | 555 | void scalar_multiply( 556 | projective_pt_wide_t *result, const affine_pt_narrow_t * __restrict x, 557 | const scalar_t * __restrict n) { 558 | 559 | scalar_t sabs_n; 560 | convert_to_sabs(&sabs_n, n); 561 | 562 | const int WINDOW_BITS = 5; 563 | const uint32_t WINDOW_MASK = (1 << WINDOW_BITS) - 1; 564 | const uint32_t LOOKUP_MASK = WINDOW_MASK >> 1; 565 | const int TABLE_SIZE = 16; 566 | extended_pt_readd_narrow_t table[TABLE_SIZE]; 567 | 568 | extended_pt_wide_t x2; 569 | affine_double_extended(&x2, x); 570 | affine_to_readd_narrow(&table[0], x); 571 | for (int i = 1; i < TABLE_SIZE; ++i) { 572 | extended_readd_readd_narrow(&table[i], &x2, &table[i-1]); 573 | } 574 | 575 | int i; 576 | int first = 1; 577 | // Set i to the highest i such that 578 | // a) i < SCALAR_BITS 579 | // b) i % WINDOW_BITS = 0 580 | 581 | projective_pt_wide_t temp; 582 | extended_pt_wide_t temp_ext; 583 | extended_pt_readd_narrow_t window_pt; 584 | 585 | i = SCALAR_BITS - ((SCALAR_BITS - 1) % WINDOW_BITS) - 1; 586 | for (; i >= 0; i -= WINDOW_BITS) { 587 | uint32_t bits = sabs_n.limbs[i/SCALAR_LIMB_BITS] >> (i % SCALAR_LIMB_BITS); 588 | if (i % SCALAR_LIMB_BITS > (SCALAR_LIMB_BITS - WINDOW_BITS) && 589 | i / SCALAR_LIMB_BITS < SCALAR_LIMBS - 1) { 590 | 591 | bits |= sabs_n.limbs[i/SCALAR_LIMB_BITS + 1] << 592 | (SCALAR_LIMB_BITS - i % SCALAR_LIMB_BITS); 593 | } 594 | 595 | bits &= WINDOW_MASK; 596 | int32_t invert = (bits >> (WINDOW_BITS - 1)) - 1; 597 | bits ^= invert; 598 | 599 | constant_time_extended_narrow_lookup( 600 | &window_pt, bits & LOOKUP_MASK, TABLE_SIZE, table); 601 | constant_time_cond_extended_negate(&window_pt, invert); 602 | 603 | if (first) { 604 | readd_to_projective(&temp, &window_pt); 605 | first = 0; 606 | } else { 607 | for (int i = 0; i < WINDOW_BITS - 1; ++i) { 608 | projective_double(&temp, &temp); 609 | } 610 | projective_double_extended(&temp_ext, &temp); 611 | extended_readd_narrow(&temp, &temp_ext, &window_pt); 612 | } 613 | } 614 | 615 | copy_projective_pt_wide(result, &temp); 616 | explicit_bzero(&sabs_n, sizeof(sabs_n)); 617 | explicit_bzero(&window_pt, sizeof(window_pt)); 618 | explicit_bzero(table, sizeof(table)); 619 | explicit_bzero(&temp, sizeof(temp)); 620 | explicit_bzero(&temp_ext, sizeof(temp_ext)); 621 | } 622 | 623 | void scalar_multiply_unsafe( 624 | projective_pt_wide_t *result, const affine_pt_narrow_t * __restrict x, 625 | const scalar_t * __restrict n) { 626 | 627 | scalar_t sabs_n; 628 | convert_to_sabs(&sabs_n, n); 629 | 630 | const int WINDOW_BITS = 5; 631 | const uint32_t WINDOW_MASK = (1 << WINDOW_BITS) - 1; 632 | const uint32_t LOOKUP_MASK = WINDOW_MASK >> 1; 633 | const int TABLE_SIZE = 16; 634 | extended_pt_readd_narrow_t table[TABLE_SIZE]; 635 | 636 | extended_pt_wide_t x2; 637 | affine_double_extended(&x2, x); 638 | affine_to_readd_narrow(&table[0], x); 639 | for (int i = 1; i < TABLE_SIZE; ++i) { 640 | extended_readd_readd_narrow(&table[i], &x2, &table[i-1]); 641 | } 642 | 643 | int i; 644 | int first = 1; 645 | // Set i to the highest i such that 646 | // a) i < SCALAR_BITS 647 | // b) i % WINDOW_BITS = 0 648 | 649 | projective_pt_wide_t temp; 650 | extended_pt_wide_t temp_ext; 651 | extended_pt_readd_narrow_t window_pt; 652 | 653 | i = SCALAR_BITS - ((SCALAR_BITS - 1) % WINDOW_BITS) - 1; 654 | for (; i >= 0; i -= WINDOW_BITS) { 655 | uint32_t bits = sabs_n.limbs[i/SCALAR_LIMB_BITS] >> (i % SCALAR_LIMB_BITS); 656 | if (i % SCALAR_LIMB_BITS > (SCALAR_LIMB_BITS - WINDOW_BITS) && 657 | i / SCALAR_LIMB_BITS < SCALAR_LIMBS - 1) { 658 | 659 | bits |= sabs_n.limbs[i/SCALAR_LIMB_BITS + 1] << 660 | (SCALAR_LIMB_BITS - i % SCALAR_LIMB_BITS); 661 | } 662 | 663 | bits &= WINDOW_MASK; 664 | int32_t invert = (bits >> (WINDOW_BITS - 1)) - 1; 665 | bits ^= invert; 666 | 667 | copy_extended_pt_readd_narrow(&window_pt, &table[bits & LOOKUP_MASK]); 668 | if (invert) { 669 | negate_extended_pt_readd_narrow(&window_pt, &window_pt); 670 | } 671 | 672 | if (first) { 673 | readd_to_projective(&temp, &window_pt); 674 | first = 0; 675 | } else { 676 | for (int i = 0; i < WINDOW_BITS - 1; ++i) { 677 | projective_double(&temp, &temp); 678 | } 679 | projective_double_extended(&temp_ext, &temp); 680 | extended_readd_narrow(&temp, &temp_ext, &window_pt); 681 | } 682 | } 683 | 684 | copy_projective_pt_wide(result, &temp); 685 | } 686 | 687 | int point_decompress( 688 | affine_pt_narrow_t *result, 689 | residue_narrow_reduced_t *y, int low_bit) { 690 | 691 | residue_narrow_t y_n; 692 | 693 | residue_wide_t u; 694 | residue_wide_t v; 695 | 696 | residue_wide_t y2; 697 | residue_narrow_reduced_t temp; 698 | residue_wide_t x_wide; 699 | 700 | unnarrow_reduce(&y_n, y); 701 | square_narrow(&y2, &y_n); 702 | copy_narrow(&result->y, &y_n); 703 | 704 | sub_wide(&u, &one_wide, &y2); 705 | mul_wide_const(&y2, &y2, D); 706 | sub_wide(&v, &one_wide, &y2); 707 | 708 | if (sqrt_inv_wide(&x_wide, &u, &v)) { 709 | narrow(&result->x, &x_wide); 710 | narrow_partial_complete(&temp, &result->x); 711 | 712 | int x_is_odd = is_odd(&temp); 713 | if ((x_is_odd && !low_bit) || (low_bit && !x_is_odd)) { 714 | negate_narrow(&result->x, &result->x); 715 | } 716 | 717 | return 1; 718 | } 719 | 720 | return 0; 721 | } 722 | -------------------------------------------------------------------------------- /avx2/include/curve.h: -------------------------------------------------------------------------------- 1 | #ifndef CURVE_H 2 | #define CURVE_H 3 | #include "f11_260.h" 4 | #include "scalar.h" 5 | 6 | typedef struct affine_pt_narrow { 7 | residue_narrow_t x; 8 | residue_narrow_t y; 9 | } affine_pt_narrow_t; 10 | 11 | typedef struct extended_pt_readd_narrow { 12 | __attribute__((__aligned__(32))) 13 | residue_narrow_t x; 14 | residue_narrow_t dt; 15 | residue_narrow_t y; 16 | residue_narrow_t z; 17 | } extended_pt_readd_narrow_t; 18 | 19 | typedef struct extended_pt_readd_wide { 20 | residue_wide_t x; 21 | residue_wide_t dt; 22 | residue_wide_t y; 23 | residue_wide_t z; 24 | } extended_pt_readd_wide_t; 25 | 26 | typedef struct extended_affine_pt_readd_narrow { 27 | __attribute__((__aligned__(32))) 28 | residue_narrow_t x; 29 | residue_narrow_t dt; 30 | residue_narrow_t y; 31 | uint32_t pad[4]; // So that it takes an even 5 vector 32 | // loads to load the structure. 33 | } extended_affine_pt_readd_narrow_t; 34 | 35 | // For use in doubling. 36 | typedef struct projective_pt_wide { 37 | residue_wide_t x; 38 | residue_wide_t y; 39 | residue_wide_t z; 40 | } projective_pt_wide_t; 41 | 42 | // For use in addition. 43 | typedef struct extended_pt_wide { 44 | residue_wide_t x; 45 | residue_wide_t y; 46 | residue_wide_t t; 47 | residue_wide_t z; 48 | } extended_pt_wide_t; 49 | 50 | #define D (-49142) 51 | 52 | __attribute__((__aligned__(32))) 53 | const affine_pt_narrow_t B; 54 | 55 | void copy_projective_pt_wide( 56 | projective_pt_wide_t *result, const projective_pt_wide_t *source); 57 | 58 | void copy_extended_pt_wide( 59 | extended_pt_wide_t *result, const extended_pt_wide_t *source); 60 | 61 | void copy_extended_pt_readd_wide( 62 | extended_pt_readd_wide_t *result, const extended_pt_readd_wide_t *source); 63 | 64 | void copy_extended_pt_readd_narrow( 65 | extended_pt_readd_narrow_t *result, const extended_pt_readd_narrow_t *source); 66 | 67 | void copy_extended_affine_pt_readd_narrow( 68 | extended_affine_pt_readd_narrow_t *result, 69 | const extended_affine_pt_readd_narrow_t *source); 70 | 71 | void negate_extended_pt_readd_wide( 72 | extended_pt_readd_wide_t *result, 73 | const extended_pt_readd_wide_t *source); 74 | 75 | void negate_extended_affine_pt_readd_narrow( 76 | extended_affine_pt_readd_narrow_t *result, 77 | const extended_affine_pt_readd_narrow_t *source); 78 | 79 | void affine_narrow_to_extended( 80 | extended_pt_wide_t *result, 81 | const affine_pt_narrow_t * __restrict x); 82 | 83 | void affine_to_projective( 84 | projective_pt_wide_t *result, 85 | const affine_pt_narrow_t * __restrict x); 86 | 87 | void affine_to_readd_wide( 88 | extended_pt_readd_wide_t *result, 89 | const affine_pt_narrow_t * __restrict x); 90 | 91 | void extended_to_readd_wide_neg( 92 | extended_pt_readd_wide_t *result, 93 | const extended_pt_wide_t * __restrict x); 94 | 95 | void affine_to_readd_narrow( 96 | extended_pt_readd_narrow_t *result, 97 | const affine_pt_narrow_t * __restrict x); 98 | 99 | void projective_to_extended_wide( 100 | extended_pt_wide_t *result, projective_pt_wide_t * __restrict x); 101 | 102 | void extended_to_projective_wide( 103 | projective_pt_wide_t *result, const extended_pt_wide_t * __restrict x); 104 | 105 | void readd_to_projective( 106 | projective_pt_wide_t *result, 107 | const extended_pt_readd_narrow_t * __restrict x); 108 | 109 | void affine_readd_to_extended( 110 | extended_pt_wide_t *result, 111 | const extended_affine_pt_readd_narrow_t * __restrict x); 112 | 113 | void negate_extended_affine_pt_readd_narrow( 114 | extended_affine_pt_readd_narrow_t *result, 115 | const extended_affine_pt_readd_narrow_t *source); 116 | 117 | void affine_double( 118 | projective_pt_wide_t *result, 119 | const affine_pt_narrow_t * __restrict x); 120 | 121 | void affine_double_extended( 122 | extended_pt_wide_t *result, const affine_pt_narrow_t * __restrict x); 123 | 124 | void projective_double( 125 | projective_pt_wide_t *result, const projective_pt_wide_t *x); 126 | 127 | void projective_double_extended( 128 | extended_pt_wide_t *result, const projective_pt_wide_t * __restrict x); 129 | 130 | void extended_double_extended( 131 | extended_pt_wide_t *result, const extended_pt_wide_t *x); 132 | 133 | void projective_add( 134 | projective_pt_wide_t *result, const projective_pt_wide_t * __restrict x1, 135 | const projective_pt_wide_t * __restrict x2); 136 | 137 | void extended_add( 138 | projective_pt_wide_t *result, const extended_pt_wide_t * __restrict x, 139 | const extended_pt_wide_t * __restrict y); 140 | 141 | void extended_add_extended( 142 | extended_pt_wide_t *result, const extended_pt_wide_t * __restrict x, 143 | const extended_pt_wide_t * __restrict y); 144 | 145 | void extended_readd_narrow( 146 | projective_pt_wide_t *result, const extended_pt_wide_t * __restrict x, 147 | const extended_pt_readd_narrow_t * __restrict y); 148 | 149 | void extended_readd_narrow_extended( 150 | extended_pt_wide_t *result, const extended_pt_wide_t * __restrict x, 151 | const extended_pt_readd_narrow_t * __restrict y); 152 | 153 | void extended_readd_affine_narrow_extended( 154 | extended_pt_wide_t *result, const extended_pt_wide_t * __restrict x, 155 | const extended_affine_pt_readd_narrow_t * __restrict y); 156 | 157 | void extended_add_extended( 158 | extended_pt_wide_t *result, const extended_pt_wide_t * __restrict x, 159 | const extended_pt_wide_t * __restrict y); 160 | 161 | void extended_readd_readd_narrow( 162 | extended_pt_readd_narrow_t *result, 163 | const extended_pt_wide_t * __restrict x, 164 | const extended_pt_readd_narrow_t * __restrict y); 165 | 166 | void extended_readd_wide_extended( 167 | extended_pt_wide_t *result, 168 | const extended_pt_wide_t *x1, 169 | const extended_pt_readd_wide_t * __restrict x2); 170 | 171 | void scalar_multiply( 172 | projective_pt_wide_t *result, const affine_pt_narrow_t * __restrict x, 173 | const scalar_t * __restrict n); 174 | 175 | void scalar_multiply_unsafe( 176 | projective_pt_wide_t *result, const affine_pt_narrow_t * __restrict x, 177 | const scalar_t * __restrict n); 178 | 179 | int point_decompress( 180 | affine_pt_narrow_t *result, residue_narrow_reduced_t *y, int low_bit); 181 | #endif 182 | -------------------------------------------------------------------------------- /avx2/include/f11_260.h: -------------------------------------------------------------------------------- 1 | // Types and functions for manipulating field elements 2 | 3 | #ifndef F11_260_H 4 | #define F11_260_H 5 | #include 6 | 7 | #define NLIMBS_REDUCED 10 8 | #define NLIMBS 12 9 | #define T ((1 << 26) - 15) 10 | #define TBITS 26 11 | #define TMASK ((1 << 26) - 1) 12 | #define T_CBITS 4 13 | #define RESIDUE_LENGTH_BYTES 33 14 | 15 | // Reduced to 10 limbs. For final results. 16 | typedef struct residue_narrow_reduced { 17 | __attribute__((__aligned__(8))) 18 | int32_t limbs[10]; 19 | } residue_narrow_reduced_t; 20 | 21 | // 11 limbs. Limb 10 is placed in slot 0, and slot 11. 22 | typedef struct residue_narrow { 23 | __attribute__((__aligned__(16))) 24 | int32_t limbs[12]; 25 | } residue_narrow_t; 26 | 27 | // 11 limbs. Limb 10 is placed in slot 0 and slot 11. Wider for vector 28 | // compatibility. 29 | typedef struct residue_wide { 30 | __attribute__((__aligned__(32))) 31 | int64_t limbs[12]; 32 | } residue_wide_t; 33 | 34 | residue_wide_t zero_wide; 35 | residue_wide_t one_wide; 36 | residue_narrow_t zero_narrow; 37 | residue_narrow_t one_narrow; 38 | 39 | // Shrink to 32 bits. Assumes reduction has already occurred, and wide storage 40 | // is being used for vector compatibility. 41 | void narrow(residue_narrow_t *result, const residue_wide_t * __restrict w); 42 | 43 | // Reduce to 10 limbs. Useful for debugging 44 | void narrow_reduce( 45 | residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w); 46 | 47 | // Reduce to unique representative. 48 | // This is expensive. Only used for final signature or DH Key 49 | void narrow_complete( 50 | residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w); 51 | 52 | // Reduce to mostly unique representative. 53 | // All coefficients are reduced to 0 <= xi <= t 54 | // Unique up to carries (xi == t) => (xi = 0; x[i+1] += 1); 55 | // This is sufficient to determine if x is even or odd. 56 | // Still pretty expensive. Used in point compression. 57 | void narrow_partial_complete( 58 | residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w); 59 | 60 | int is_odd(residue_narrow_reduced_t *x); 61 | 62 | // Produce a 32-bit entry with 11 limbs 63 | static inline void unnarrow_reduce( 64 | residue_narrow_t *result, const residue_narrow_reduced_t * __restrict x) { 65 | 66 | result->limbs[0] = result->limbs[NLIMBS - 1] = 0; 67 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 68 | result->limbs[i+1] = x->limbs[i]; 69 | } 70 | } 71 | 72 | // Produce a 64-bit residue 73 | void widen( 74 | residue_wide_t *result, const residue_narrow_t * __restrict x); 75 | 76 | // Copy a 64-bit residue 77 | void copy_wide( 78 | residue_wide_t *result, const residue_wide_t * __restrict x); 79 | 80 | // Copy a 32-bit residue 81 | void copy_narrow( 82 | residue_narrow_t *result, const residue_narrow_t * __restrict x); 83 | 84 | void copy_narrow_reduced( 85 | residue_narrow_reduced_t *result, 86 | const residue_narrow_reduced_t * __restrict x); 87 | 88 | // Subtract 2 12x64-bit residues. 89 | void sub_wide( 90 | residue_wide_t *result, const residue_wide_t * __restrict x, 91 | const residue_wide_t * __restrict y); 92 | 93 | void negate_wide(residue_wide_t *result, const residue_wide_t *x); 94 | 95 | void negate_narrow(residue_narrow_t *result, const residue_narrow_t *x); 96 | 97 | // Add 2 12x32-bit residues. 98 | void add_narrow( 99 | residue_narrow_t *result, const residue_narrow_t * __restrict x, 100 | const residue_narrow_t * __restrict y); 101 | 102 | // Add 2 12x64-bit residues. 103 | void add_wide( 104 | residue_wide_t *result, const residue_wide_t * __restrict x, 105 | const residue_wide_t * __restrict y); 106 | 107 | // Scale a wide residue by 2. 108 | void double_wide( 109 | residue_wide_t *result, const residue_wide_t * __restrict x); 110 | 111 | // Multiply two wide residues, and produce a wide result. The result is reduced 112 | // to 32 bits, but not narrowed for performance reasons. 113 | void mul_wide( 114 | residue_wide_t *result, const residue_wide_t *x, 115 | const residue_wide_t *y); 116 | // Multiply a wide residues by a narrow and produce a wide result. The result is 117 | // reduced to 32 bits, but not narrowed for performance reasons. 118 | void mul_wide_narrow( 119 | residue_wide_t *result, const residue_wide_t *x, 120 | const residue_narrow_t *y); 121 | // Multiply two narrow residues and produce a wide result. The result is reduced 122 | // to 32 bits, but not narrowed for performance reasons. 123 | void mul_narrow( 124 | residue_wide_t *result, const residue_narrow_t *x, 125 | const residue_narrow_t *y); 126 | 127 | // Multiply a wide residue by a constant. 128 | void mul_wide_const( 129 | residue_wide_t *result, const residue_wide_t * __restrict x, int32_t d); 130 | 131 | // Multiply a narrow residue by a constant, producing a wide result 132 | void mul_narrow_const( 133 | residue_wide_t *result, const residue_narrow_t *x, int32_t d); 134 | 135 | // Square a wide residue and produce a wide result. The result is reduced to 32 136 | // bits but not narrowed for performance reasons. 137 | void square_wide( 138 | residue_wide_t *result, const residue_wide_t *x); 139 | 140 | // Square a narrow residue and produce a wide result. The result is reduced to 141 | // 32 bits but not narrowed for performance reasons. 142 | void square_narrow( 143 | residue_wide_t *result, const residue_narrow_t *x); 144 | 145 | // Approximately divide each coefficient by t. Carry the results. 146 | void reduce_step_narrow( 147 | residue_narrow_t *result, const residue_narrow_t *x); 148 | 149 | // Approximately divide each coefficient by t. Carry the results. 150 | void reduce_step_wide( 151 | residue_wide_t *result, const residue_wide_t *x); 152 | 153 | // Invert via fermat's theorem 154 | void invert_wide( 155 | residue_wide_t *result, const residue_wide_t * __restrict x); 156 | 157 | // Compute combined inverse and square root 158 | // returns true if x/y was a quadratic residue, and false otherwise. 159 | int sqrt_inv_wide( 160 | residue_wide_t *result, const residue_wide_t * __restrict x, 161 | const residue_wide_t * __restrict y); 162 | 163 | // Returns true if x == y. Computes in constant time. 164 | int equal_wide(const residue_wide_t * x, const residue_wide_t * y); 165 | 166 | int equal_narrow_reduced( 167 | const residue_narrow_reduced_t * x, const residue_narrow_reduced_t * y); 168 | 169 | void encode(uint8_t *out, const residue_narrow_reduced_t * __restrict x); 170 | void encode_compressed( 171 | uint8_t *out, const residue_narrow_reduced_t * __restrict x, int is_odd); 172 | 173 | void decode(residue_narrow_reduced_t *out, const uint8_t *in); 174 | #endif 175 | -------------------------------------------------------------------------------- /avx2/include/gen.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "comb.h" 4 | #include "curve.h" 5 | #include "gen.h" 6 | #include "scalar.h" 7 | 8 | void gen_key(scalar_t * __restrict priv_key, 9 | affine_pt_narrow_t * __restrict pub_key) { 10 | scalar_hash_t large_key; 11 | char *large_key_ptr = (char *) &large_key; 12 | arc4random_buf(large_key_ptr, sizeof(large_key)); 13 | 14 | // It's just as random to use montgomery reduction as to correct for the 15 | // montgomery factor. 16 | mont_reduce_hash_mod_l(priv_key, &large_key); 17 | 18 | projective_pt_wide_t result_pt; 19 | scalar_comb_multiply(&result_pt, &base_comb, priv_key); 20 | 21 | residue_wide_t z_inv; 22 | 23 | invert_wide(&z_inv, &result_pt.z); 24 | mul_wide(&result_pt.x, &result_pt.x, &z_inv); 25 | mul_wide(&result_pt.y, &result_pt.y, &z_inv); 26 | 27 | residue_narrow_t temp_narrow; 28 | narrow(&pub_key->x, &result_pt.x); 29 | 30 | narrow(&pub_key->y, &result_pt.y); 31 | 32 | explicit_bzero(&large_key, sizeof(large_key)); 33 | explicit_bzero(&result_pt, sizeof(result_pt)); 34 | explicit_bzero(&z_inv, sizeof(z_inv)); 35 | explicit_bzero(&temp_narrow, sizeof(temp_narrow)); 36 | } 37 | 38 | void encode_pub_key(uint8_t *result, const affine_pt_narrow_t *pub_key) { 39 | residue_narrow_reduced_t y_reduced; 40 | residue_narrow_reduced_t x_reduced; 41 | narrow_complete(&y_reduced, &pub_key->y); 42 | narrow_partial_complete(&x_reduced, &pub_key->x); 43 | 44 | y_reduced.limbs[NLIMBS_REDUCED - 1] |= is_odd(&x_reduced) << TBITS; 45 | encode(result, &y_reduced); 46 | } 47 | 48 | int decode_pub_key(affine_pt_narrow_t *result, const uint8_t *encoded_key) { 49 | residue_narrow_reduced_t y_decoded; 50 | decode(&y_decoded, encoded_key); 51 | int is_odd = y_decoded.limbs[NLIMBS_REDUCED - 1] >> TBITS; 52 | y_decoded.limbs[NLIMBS_REDUCED - 1] &= TMASK; 53 | return point_decompress(result, &y_decoded, is_odd); 54 | } 55 | -------------------------------------------------------------------------------- /avx2/include/gen.h: -------------------------------------------------------------------------------- 1 | #ifndef GEN_H 2 | #define GEN_H 3 | 4 | #include "scalar.h" 5 | #include "curve.h" 6 | 7 | void gen_key(scalar_t * __restrict priv_key, 8 | affine_pt_narrow_t * __restrict pub_key); 9 | void encode_pub_key(uint8_t *result, const affine_pt_narrow_t *pub_key); 10 | int decode_pub_key(affine_pt_narrow_t *result, const uint8_t *encoded_key); 11 | #endif 12 | -------------------------------------------------------------------------------- /avx2/include/scalar.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "f11_260.h" 4 | #include "scalar.h" 5 | 6 | // Plenty of inspiration for this file was taken from Mike Hamburg's 7 | // Ed448 code. 8 | 9 | // Constants: 10 | __attribute__((__aligned__(32))) 11 | const scalar_t l_bits = { 12 | .limbs = {0x28ad9c41, 0xe6dcf7e8, 0x34b804af, 0x5af91169, 13 | 0x5cf68f2f, 0x125277f4, 0x9c1bf9f, 0xffff6b00, 0x3,}, 14 | }; 15 | 16 | __attribute__((__aligned__(32))) 17 | const scalar_t signed_bits_set_adjustment = { 18 | .limbs = {0x5d498efb, 0x648c205f, 0x2d1fed40, 0x941bba5b, 19 | 0x8c25c342, 0xb6b6202e, 0xd8f90183, 0x000253ff, 0x0,}, 20 | }; 21 | 22 | __attribute__((__aligned__(32))) 23 | const scalar_t SCALAR_MONT_R2 = { 24 | .limbs = {0x30ba45c7, 0xf3422093, 0x054bbbf6, 0x017ab264, 25 | 0x914ee18b, 0x250f1097, 0xf6bc1224, 0x5e97c70e, 0x2,}, 26 | }; 27 | 28 | const uint32_t SCALAR_MONT_N_PRIME = 0xb3138c3f; 29 | 30 | __attribute__((__aligned__(32))) 31 | const scalar_t SCALAR_MONT_R2_HASH = { 32 | .limbs = { 33 | 0x202dd8e7, 0xcb1bf7be, 0xd219daf6, 0xb85aba0a, 34 | 0xdc8da05f, 0xbd23bfce, 0xb7642c95, 0xbb13e4ad, 0x0,}, 35 | }; 36 | 37 | __attribute__((__aligned__(32))) 38 | const scalar_t SCALAR_MONT_R2_HASH_MUL = { 39 | .limbs = {0x8b9c7a13, 0x37bb3081, 0xe4f0c2b0, 0x99b4a8b2, 40 | 0xb4538c55, 0x34c9db2a, 0x2ade0e63, 0xa7cb6782, 0x1,}, 41 | }; 42 | 43 | void divide_by_2_mod_l( 44 | scalar_t *result, const scalar_t *x) { 45 | 46 | uint32_t mask = -(x->limbs[0] & 1); 47 | 48 | uint64_t chain = 0; 49 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 50 | chain = (chain + x->limbs[i]) + (mask & l_bits.limbs[i]); 51 | result->limbs[i] = chain; 52 | chain >>= SCALAR_LIMB_BITS; 53 | } 54 | 55 | int i; 56 | for (i = 0; i < SCALAR_LIMBS - 1; ++i) { 57 | result->limbs[i] = result->limbs[i] >> 1 | 58 | (result->limbs[i+1] << (SCALAR_LIMB_BITS - 1)); 59 | } 60 | result->limbs[i] >>= 1; 61 | } 62 | 63 | void add_mod_l( 64 | scalar_t *result, const scalar_t *x, 65 | const scalar_t * __restrict y) { 66 | 67 | uint64_t chain = 0; 68 | int i; 69 | for (i = 0; i < SCALAR_LIMBS; ++i) { 70 | chain = (chain + x->limbs[i]) + y->limbs[i]; 71 | result->limbs[i] = chain; 72 | chain >>= SCALAR_LIMB_BITS; 73 | } 74 | 75 | sub_mod_l(result, result, &l_bits); 76 | } 77 | 78 | void sub_mod_l( 79 | scalar_t *result, const scalar_t *x, 80 | const scalar_t *y) { 81 | sub_mod_l_accum(result, x->limbs, y); 82 | } 83 | 84 | // x is a pointer and not a scalar_t so that this function can be used to reduce 85 | // accumulators after multiplication. 86 | void sub_mod_l_accum( 87 | scalar_t *result, const uint32_t *x, 88 | const scalar_t *y) { 89 | 90 | int64_t chain = 0; 91 | int i; 92 | for (i = 0; i < SCALAR_LIMBS; ++i) { 93 | chain = (chain + x[i]) - y->limbs[i]; 94 | result->limbs[i] = chain; 95 | chain >>= SCALAR_LIMB_BITS; 96 | } 97 | 98 | //Should be 0 or -1 (to function as a mask) 99 | int32_t borrow = chain; 100 | 101 | chain = 0; 102 | for (i = 0; i < SCALAR_LIMBS; ++i) { 103 | chain = (chain + result->limbs[i]) + (l_bits.limbs[i] & borrow); 104 | result->limbs[i] = chain; 105 | chain >>= SCALAR_LIMB_BITS; 106 | } 107 | } 108 | 109 | void convert_to_sabs( 110 | scalar_t *result, const scalar_t *x) { 111 | add_mod_l(result, x, &signed_bits_set_adjustment); 112 | divide_by_2_mod_l(result, result); 113 | } 114 | 115 | void mont_reduce_hash_mod_l( 116 | scalar_t *result, const scalar_hash_t * __restrict x) { 117 | uint32_t accum[HASH_LIMBS]; 118 | 119 | for (int i = 0; i < HASH_LIMBS; ++i) { 120 | accum[i] = x->limbs[i]; 121 | } 122 | 123 | uint64_t chain = 0; 124 | for (int i = 0; i <= HASH_LIMBS - SCALAR_LIMBS; ++i) { 125 | uint32_t q = accum[0] * SCALAR_MONT_N_PRIME; 126 | for (int j = 0; j < SCALAR_LIMBS; ++j) { 127 | chain += accum[j] + ((uint64_t) q) * l_bits.limbs[j]; 128 | if (j > 0) { 129 | accum[j - 1] = chain; 130 | } 131 | chain >>= SCALAR_LIMB_BITS; 132 | } 133 | int j; 134 | for (j = SCALAR_LIMBS; j < HASH_LIMBS - i; ++j) { 135 | chain += accum[j]; 136 | accum[j - 1] = chain; 137 | chain >>= SCALAR_LIMB_BITS; 138 | } 139 | accum[j - 1] = chain; 140 | } 141 | 142 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 143 | result->limbs[i] = accum[i]; 144 | } 145 | explicit_bzero(accum, sizeof(accum)); 146 | } 147 | 148 | void reduce_hash_mod_l(scalar_t *result, const scalar_hash_t * __restrict x) { 149 | mont_reduce_hash_mod_l(result, x); 150 | mont_mult_mod_l(result, result, &SCALAR_MONT_R2_HASH); 151 | } 152 | 153 | void mont_mult_mod_l(scalar_t *result, const scalar_t *x, 154 | const scalar_t *y) { 155 | uint32_t accum[SCALAR_LIMBS + 1] = {0}; 156 | 157 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 158 | uint32_t x_limb = x->limbs[i]; 159 | 160 | uint64_t chain = 0; 161 | int j; 162 | for (j = 0; j < SCALAR_LIMBS; ++j) { 163 | chain += accum[j] + ((uint64_t) y->limbs[j]) * x_limb; 164 | accum[j] = chain; 165 | chain >>= SCALAR_LIMB_BITS; 166 | } 167 | 168 | // 2 bit value 169 | accum[j] = chain; 170 | 171 | uint32_t q = accum[0] * SCALAR_MONT_N_PRIME; 172 | chain = 0; 173 | for (int j = 0; j < SCALAR_LIMBS; ++j) { 174 | chain += accum[j] + ((uint64_t) l_bits.limbs[j]) * q; 175 | if (j > 0) { 176 | accum[j - 1] = chain; 177 | } 178 | chain >>= SCALAR_LIMB_BITS; 179 | } 180 | 181 | // chain is a 2-bit value with a possible carry. 182 | // result is a 3 bit value 183 | chain += accum[j]; 184 | accum[j - 1] = chain; 185 | } 186 | 187 | sub_mod_l_accum(result, accum, &l_bits); 188 | explicit_bzero(accum, sizeof(accum)); 189 | } 190 | 191 | void mult_mod_l(scalar_t *result, const scalar_t * __restrict x, 192 | const scalar_t * __restrict y) { 193 | scalar_t temp; 194 | mont_mult_mod_l(&temp, x, y); 195 | mont_mult_mod_l(result, &temp, &SCALAR_MONT_R2); 196 | explicit_bzero(&temp, sizeof(temp)); 197 | } 198 | -------------------------------------------------------------------------------- /avx2/include/scalar.h: -------------------------------------------------------------------------------- 1 | #ifndef SCALAR_H 2 | #define SCALAR_H 3 | #include 4 | #include "f11_260.h" 5 | 6 | typedef struct scalar { 7 | uint32_t limbs[9]; 8 | } scalar_t; 9 | 10 | typedef struct scalar_hash { 11 | uint32_t limbs[16]; 12 | } scalar_hash_t; 13 | 14 | // const int SCALAR_LIMBS = 9; 15 | #define HASH_LIMBS 16 16 | #define SCALAR_LIMBS 9 17 | #define SCALAR_BITS 258 18 | #define SCALAR_BYTES 33 19 | #define SCALAR_LIMB_BITS 32 20 | #define SCALAR_LAST_LIMB_BITS 2 21 | #define SCALAR_LAST_LIMB_MASK 0x3 22 | 23 | // Constants 24 | // A scalar representing l, the order of the prime subgroup. 25 | const scalar_t l_bits; 26 | // For converting to SABS representation 27 | const scalar_t signed_bits_set_adjustment; 28 | // l * N' is congruent to -1 mod 2^32 29 | const uint32_t SCALAR_MONT_N_PRIME; 30 | // (2 ^ 32)^18 mod l. Used to convert to montgomery domain. 31 | // Or to fix the result of a single multiply via a 2nd multiply. 32 | const scalar_t SCALAR_MONT_R2; 33 | // (2 ^ 32)^17 mod l. 34 | // Used to fix the result of a hash reduction via a multiply 35 | // A hash is reduced from HASH_LIMBS to SCALAR_LIMBS via 36 | // HASH_LIMBS - SCALAR_LIMBS + 1 divisions by 2^32. So a hash reduction produces 37 | // h * (2^32)^-8 mod l. Montgomery multiplying by (2^32)^17 mod l produces h mod 38 | // l 39 | const scalar_t SCALAR_MONT_R2_HASH; 40 | // (2 ^ 32)^26 mod l. 41 | // Used to fix the result of a hash reduction followed by a multiply. 42 | // By similar logic we need to get rid of a factor of (2^32)^-17 43 | const scalar_t SCALAR_MONT_R2_HASH_MUL; 44 | 45 | // Functions for manipulating scalars. May need more for ECDSA. 46 | 47 | // This is used to convert to SABS representation. 48 | void divide_by_2_mod_l(scalar_t *result, const scalar_t * __restrict x); 49 | 50 | void add_mod_l(scalar_t *result, const scalar_t * __restrict x, 51 | const scalar_t * __restrict y); 52 | 53 | void sub_mod_l(scalar_t *result, const scalar_t * __restrict x, 54 | const scalar_t * __restrict y); 55 | 56 | void sub_mod_l_accum(scalar_t *result, const uint32_t * __restrict x, 57 | const scalar_t * __restrict y); 58 | 59 | void mont_mult_mod_l(scalar_t *result, const scalar_t * __restrict x, 60 | const scalar_t * __restrict y); 61 | 62 | void mult_mod_l(scalar_t *result, const scalar_t * __restrict x, 63 | const scalar_t * __restrict y); 64 | 65 | void mont_reduce_hash_mod_l( 66 | scalar_t *result, const scalar_hash_t * __restrict x); 67 | void reduce_hash_mod_l(scalar_t *result, const scalar_hash_t * __restrict x); 68 | 69 | void convert_to_sabs(scalar_t *result, const scalar_t * __restrict x); 70 | #endif 71 | -------------------------------------------------------------------------------- /avx2/include/sign.h: -------------------------------------------------------------------------------- 1 | #ifndef SIGN_H 2 | #define SIGN_H 3 | #include "curve.h" 4 | #include "scalar.h" 5 | 6 | #define SIG_LENGTH 65 7 | 8 | typedef struct signature { 9 | residue_narrow_reduced_t y; 10 | scalar_t s; 11 | } signature_t; 12 | 13 | void sign(signature_t *result, scalar_t *priv_key, 14 | const uint8_t *pub_key, const uint8_t *msg, size_t msg_len); 15 | 16 | int verify( 17 | const signature_t *sig, const uint8_t *r_bytes, const uint8_t *pub_key_bytes, 18 | const affine_pt_narrow_t *pub_key_pt, const uint8_t *msg, 19 | size_t msg_len); 20 | 21 | void encode_sig(uint8_t *result, const signature_t *sig); 22 | void decode_sig(signature_t *result, const uint8_t *encoded_sig); 23 | #endif 24 | -------------------------------------------------------------------------------- /avx2/src/api.c.supercop_only: -------------------------------------------------------------------------------- 1 | ../../ref/src/api.c -------------------------------------------------------------------------------- /avx2/src/main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "comb.h" 7 | #include "curve.h" 8 | #include "f11_260.h" 9 | #include "gen.h" 10 | #include "scalar.h" 11 | #include "sign.h" 12 | 13 | int main(int _argc, char **argv) { 14 | residue_narrow_t x = { 15 | .limbs = { 16 | 0x14e8b6e, 0x3553e74, 0x0464e4c, 0x61de408, 17 | 0x006a30e, 0x6e9b25b, 0x3e6f39e, 0x19ec754, 18 | 0x5c71cc3, 0x2bc1c0e, 0x554338e, 0x14e8b6e, 19 | }, 20 | }; 21 | 22 | residue_wide_t two = { 23 | .limbs = { 24 | 0x0, 0x2, 0x0, 0x0, 25 | 0x0, 0x0, 0x0, 0x0, 26 | 0x0, 0x0, 0x0, 0x0, 27 | }, 28 | }; 29 | 30 | residue_wide_t x_plus_two; 31 | 32 | residue_narrow_reduced_t x_narrow_reduced = { 33 | .limbs = { 34 | 0x206b305, 0x2f7c2ce, 0x0cf58a7, 0x2b81791, 0x19b26fa, 35 | 0x2986830, 0x0503be5, 0x0789163, 0x16d90a0, 0x005a82e, 36 | }, 37 | }; 38 | 39 | residue_wide_t x_wide; 40 | 41 | residue_narrow_t y = { 42 | .limbs = { 43 | 0x56ed38e, 0x5f5b0e1, 0x4668277, 0x0f7d85a, 44 | 0x4515e42, 0x00cb559, 0x3f8a910, 0x6655708, 45 | 0x3085b4d, 0x581ceff, 0x3324c03, 0x56ed38e, 46 | }, 47 | }; 48 | 49 | residue_narrow_reduced_t y_narrow_reduced = { 50 | .limbs = { 51 | 0x086dd54, 0x2f7aedb, 0x38904ae, 0x2e28aa4, 0x29de1ad, 52 | 0x289d572, 0x0f6837a, 0x19987b1, 0x012fb71, 0x1c37867, 53 | }, 54 | }; 55 | 56 | residue_wide_t y_wide; 57 | 58 | residue_wide_t mul_expected = { 59 | .limbs = { 60 | 0x06e9e1d, 0x1c508c4, 0x3eeb85d, 0x04bc914, 61 | 0x0a57e1c, 0x1f13f9a, 0x2d8aa7d, 0x232cce3, 62 | 0x31e92c4, 0x04fb073, 0x2582507, 0x06e9e1d, 63 | }, 64 | }; 65 | 66 | residue_wide_t square_expected = { 67 | .limbs = { 68 | 0x3088d3c, 0x2073353, 0x18e5de4, 0x320a4ab, 69 | 0x3ee123a, 0x2d88419, 0x3d1ae13, 0x02b3dcf, 70 | 0x2997027, 0x3d550a2, 0x220a052, 0x3088d3c, 71 | }, 72 | }; 73 | 74 | residue_narrow_t negative_one_redundant = { 75 | .limbs = { 76 | 0x000000e, 0x3ffffff, 0x3ffffff, 0x3ffffff, 77 | 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 78 | 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x000000e, 79 | }, 80 | }; 81 | 82 | residue_narrow_t negative_t2_plus_one = { 83 | .limbs = { 84 | 0x000000e, 85 | 0x3ffffff, 0x000000e, 0x3ffffff, 86 | 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 87 | 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x000000e, 88 | }, 89 | }; 90 | 91 | residue_narrow_reduced_t negative_t2_plus_one_partial = { 92 | .limbs = { 93 | 0x3fffff1, 0x0000000, 0x3fffff1, 0x3fffff1, 94 | 0x3fffff1, 0x3fffff1, 0x3fffff1, 0x3fffff1, 95 | 0x3fffff1, 0x3fffff1, 96 | }, 97 | }; 98 | 99 | residue_narrow_reduced_t negative_t2_plus_one_complete = { 100 | .limbs = { 101 | 0x0000000, 0x0000001, 0x3fffff1, 0x3fffff1, 102 | 0x3fffff1, 0x3fffff1, 0x3fffff1, 0x3fffff1, 103 | 0x3fffff1, 0x3fffff1, 104 | }, 105 | }; 106 | 107 | residue_wide_t sqrt_x_plus_2_over_y = { 108 | .limbs = { 109 | 0x040bbb0, 0x3fa8549, 0x0706e5c, 0x3b33dc9, 110 | 0x3401712, 0x3a58fb3, 0x076ec4f, 0x3347ad0, 111 | 0x16ca1b0, 0x26ed559, 0x06033f0, 0x040bbb0, 112 | }, 113 | }; 114 | 115 | residue_wide_t x_inverse = { 116 | .limbs = { 117 | 0x09fd09b, 0x17a9f53, 0x22e2983, 0x0f09456, 118 | 0x11fb41e, 0x1e47b3f, 0x37dd25f, 0x3bc6938, 119 | 0x2b654cd, 0x233a0b2, 0x3f8c25b, 0x09fd09b, 120 | }, 121 | }; 122 | 123 | #if 1 124 | residue_wide_t result; 125 | residue_narrow_t result_narrow; 126 | residue_narrow_reduced_t result_narrow_reduced; 127 | 128 | mul_narrow(&result, &x, &y); 129 | for (int i = 0; i < NLIMBS; ++i) { 130 | assert(mul_expected.limbs[i] == result.limbs[i]); 131 | } 132 | 133 | widen(&x_wide, &x); 134 | for (int i = 0; i < NLIMBS; ++i) { 135 | assert(x.limbs[i] == x_wide.limbs[i]); 136 | } 137 | 138 | mul_wide_narrow(&result, &x_wide, &y); 139 | for (int i = 0; i < NLIMBS; ++i) { 140 | assert(mul_expected.limbs[i] == result.limbs[i]); 141 | } 142 | 143 | widen(&y_wide, &y); 144 | mul_wide(&result, &x_wide, &y_wide); 145 | for (int i = 0; i < NLIMBS; ++i) { 146 | assert(mul_expected.limbs[i] == result.limbs[i]); 147 | } 148 | 149 | square_narrow(&result, &x); 150 | for (int i = 0; i < NLIMBS; ++i) { 151 | assert(square_expected.limbs[i] == result.limbs[i]); 152 | } 153 | 154 | square_wide(&result, &x_wide); 155 | for (int i = 0; i < NLIMBS; ++i) { 156 | assert(square_expected.limbs[i] == result.limbs[i]); 157 | } 158 | 159 | // The reduction function doesn't reduce this redundant version of negative 160 | // one any more. 161 | reduce_step_narrow(&result_narrow, &negative_one_redundant); 162 | for (int i = 0; i < NLIMBS; ++i) { 163 | assert(negative_one_redundant.limbs[i] == result_narrow.limbs[i]); 164 | } 165 | 166 | reduce_step_narrow(&result_narrow, &negative_t2_plus_one); 167 | for (int i = 0; i < NLIMBS; ++i) { 168 | assert(negative_t2_plus_one.limbs[i] == result_narrow.limbs[i]); 169 | } 170 | 171 | narrow_partial_complete(&result_narrow_reduced, &negative_t2_plus_one); 172 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 173 | assert(negative_t2_plus_one_partial.limbs[i] == 174 | result_narrow_reduced.limbs[i]); 175 | } 176 | 177 | narrow_complete(&result_narrow_reduced, &negative_t2_plus_one); 178 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 179 | assert(negative_t2_plus_one_complete.limbs[i] == 180 | result_narrow_reduced.limbs[i]); 181 | } 182 | 183 | scalar_t scalar_result; 184 | scalar_t scalar_x = { 185 | .limbs = { 186 | 0xa46168f9, 0x4cbf07a5, 0x62cf2928, 0xfd04242b, 0x3b12d23f, 187 | 0x355e9e63, 0xc22e849e, 0x6331c34a, 0x1, 188 | }, 189 | }; 190 | scalar_t scalar_y = { 191 | .limbs = { 192 | 0x148b9452, 0xaca9b6bb, 0xe0eeb33d, 0x7e64c899, 0xd61c602a, 193 | 0x96dcbb6b, 0x6a037c88, 0x39fbbaf0, 0x0, 194 | }, 195 | }; 196 | scalar_t scalar_x_plus_y = { 197 | .limbs = { 198 | 0xb8ecfd4b, 0xf968be60, 0x43bddc65, 0x7b68ecc5, 0x112f326a, 199 | 0xcc3b59cf, 0x2c320126, 0x9d2d7e3b, 0x1, 200 | }, 201 | }; 202 | scalar_t scalar_x_plus_x = { 203 | .limbs = { 204 | 0x48c2d1f2, 0x997e0f4b, 0xc59e5250, 0xfa084856, 0x7625a47f, 205 | 0x6abd3cc6, 0x845d093c, 0xc6638695, 0x2, 206 | }, 207 | }; 208 | scalar_t scalar_x_plus_x_plus_x_plus_y = { 209 | .limbs = { 210 | 0xd90232fc, 0xac09d5c3, 0xd4a42a06, 0x1a7823b2, 0x2a5e47bb, 211 | 0x24a61ea1, 0xa6cd4ac4, 0x639199d0, 0x0, 212 | }, 213 | }; 214 | scalar_t scalar_x_minus_y = { 215 | .limbs = { 216 | 0x8fd5d4a7, 0xa01550ea, 0x81e075ea, 0x7e9f5b91, 0x64f67215, 217 | 0x9e81e2f7, 0x582b0815, 0x2936085a, 0x1, 218 | }, 219 | }; 220 | scalar_t scalar_y_minus_x = { 221 | .limbs = { 222 | 0x98d7c79a, 0x46c7a6fd, 0xb2d78ec5, 0xdc59b5d7, 0xf8001d19, 223 | 0x73d094fc, 0xb196b789, 0xd6c962a5, 0x2, 224 | }, 225 | }; 226 | scalar_t scalar_x_times_y = { 227 | .limbs = { 228 | 0x30b3d35a, 0x9ca90acf, 0x6926efdd, 0x80620b0a, 0x52e190e7, 229 | 0x8011b9b8, 0x8c7d8f43, 0x90491703, 0x3, 230 | }, 231 | }; 232 | scalar_t scalar_x_sabs = { 233 | .limbs = { 234 | 0x80d57bfa, 0x58a59402, 0x47f78b34, 0x488fef43, 0xe39c4ac1, 235 | 0xf60a5f48, 0x4d93c310, 0xb19a0ba5, 0x0, 236 | }, 237 | }; 238 | scalar_t scalar_y_sabs = { 239 | .limbs = { 240 | 0x4d415fc7, 0xfc096781, 0x21635296, 0x36bcca2f, 0x5f9c594e, 241 | 0xaff2a9c7, 0x265f1ed5, 0x1cfebcf8, 0x2, 242 | }, 243 | }; 244 | scalar_hash_t scalar_hash_val = { 245 | .limbs = { 246 | 0xcbbc3de7, 0xa212405d, 0x5c85f47c, 0x79aa991c, 247 | 0xfe310944, 0x54075530, 0xd5ef6878, 0x72e57186, 248 | 0x36dcac18, 0xb72461e2, 0x5405caca, 0x4e9e0bff, 249 | 0x8d67a990, 0xf62f262c, 0x6df205dd, 0x24d78573, 250 | }, 251 | }; 252 | scalar_t reduced_hash_val = { 253 | .limbs = { 254 | 0xef1d4f9d, 0xd832a3a5, 0xdf1682be, 0x8d257e79, 0x41b1f2ca, 255 | 0x5be9564c, 0x320d4cb6, 0x108f8d04, 0x3, 256 | }, 257 | }; 258 | 259 | uint8_t buffer[33]; 260 | uint8_t encode_x[33] = { 261 | 0x05, 0xb3, 0x06, 0x3a, 0x0b, 0xdf, 262 | 0x7b, 0x8a, 0xf5, 0x4c, 0xe4, 0x05, 0xae, 263 | 0xfa, 0x26, 0x9b, 0xc1, 0xa0, 0x61, 264 | 0x5a, 0xbe, 0x03, 0xc5, 0x58, 0x24, 0x1e, 265 | 0xa0, 0x90, 0x6d, 0xb9, 0xa0, 0x16, 0x00, 266 | }; 267 | uint8_t encode_y[33] = { 268 | 0x54, 0xdd, 0x86, 0x6c, 0xbb, 0xde, 269 | 0xeb, 0x4a, 0x90, 0x38, 0xa9, 0xa2, 0xb8, 270 | 0xad, 0xe1, 0x9d, 0xca, 0x55, 0x27, 271 | 0xaa, 0x37, 0x68, 0x4f, 0xec, 0x61, 0x66, 272 | 0x71, 0xfb, 0x12, 0x9c, 0xe1, 0x0d, 0x07, 273 | }; 274 | 275 | add_mod_l(&scalar_result, &scalar_x, &scalar_y); 276 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 277 | assert(scalar_x_plus_y.limbs[i] == scalar_result.limbs[i]); 278 | } 279 | add_mod_l(&scalar_result, &scalar_x, &scalar_x); 280 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 281 | assert(scalar_x_plus_x.limbs[i] == scalar_result.limbs[i]); 282 | } 283 | add_mod_l(&scalar_result, &scalar_x_plus_x, &scalar_x_plus_y); 284 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 285 | assert(scalar_x_plus_x_plus_x_plus_y.limbs[i] == scalar_result.limbs[i]); 286 | } 287 | sub_mod_l(&scalar_result, &scalar_x, &scalar_y); 288 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 289 | assert(scalar_x_minus_y.limbs[i] == scalar_result.limbs[i]); 290 | } 291 | 292 | sub_mod_l(&scalar_result, &scalar_y, &scalar_x); 293 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 294 | assert(scalar_y_minus_x.limbs[i] == scalar_result.limbs[i]); 295 | } 296 | 297 | mult_mod_l(&scalar_result, &scalar_x, &scalar_y); 298 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 299 | assert(scalar_x_times_y.limbs[i] == scalar_result.limbs[i]); 300 | } 301 | 302 | convert_to_sabs(&scalar_result, &scalar_x); 303 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 304 | assert(scalar_x_sabs.limbs[i] == scalar_result.limbs[i]); 305 | } 306 | 307 | convert_to_sabs(&scalar_result, &scalar_y); 308 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 309 | assert(scalar_y_sabs.limbs[i] == scalar_result.limbs[i]); 310 | } 311 | 312 | encode(buffer, &x_narrow_reduced); 313 | for (int i = 0; i < 33; ++i) { 314 | assert(encode_x[i] == buffer[i]); 315 | } 316 | 317 | encode(buffer, &y_narrow_reduced); 318 | for (int i = 0; i < 33; ++i) { 319 | assert(encode_y[i] == buffer[i]); 320 | } 321 | 322 | decode(&result_narrow_reduced, encode_x); 323 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 324 | assert(x_narrow_reduced.limbs[i] == result_narrow_reduced.limbs[i]); 325 | } 326 | 327 | decode(&result_narrow_reduced, encode_y); 328 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 329 | assert(y_narrow_reduced.limbs[i] == result_narrow_reduced.limbs[i]); 330 | } 331 | 332 | //x/y is not a quadratic residue, but (x+2)/y is. 333 | assert(!sqrt_inv_wide(&result, &x_wide, &y_wide)); 334 | add_wide(&x_plus_two, &x_wide, &two); 335 | assert(sqrt_inv_wide(&result, &x_plus_two, &y_wide)); 336 | for (int i = 0; i < NLIMBS; ++i) { 337 | assert(sqrt_x_plus_2_over_y.limbs[i] == result.limbs[i]); 338 | } 339 | 340 | invert_wide(&result, &x_wide); 341 | assert(equal_wide(&result, &x_inverse)); 342 | 343 | 344 | reduce_hash_mod_l(&scalar_result, &scalar_hash_val); 345 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 346 | assert(reduced_hash_val.limbs[i] == scalar_result.limbs[i]); 347 | } 348 | 349 | scalar_t mult_scalar = { 350 | .limbs = { 351 | 0x55f0b9a3, 0x82b106c5, 0xcb2e2b7d, 0x30735cbc, 352 | 0xa512a8ba, 0x4c5cd391, 0xe9d0c788, 0x92bb2562, 0x3, 353 | }, 354 | }; 355 | projective_pt_wide_t expected_scalar_mult = { 356 | .x = { 357 | .limbs = { 358 | 0x0350abe, 0x1267d8d, 0x39a3cd3, 0x09e1275, 0x2d21378, 0x24771d9, 359 | 0x3558a1d, 0x3bdca9b, 0x0dd862d, 0x0bb230a, 0x1668292, 0x0350abe, 360 | }, 361 | }, 362 | .y = { 363 | .limbs = { 364 | 0x0b090d6, 0x04d69fd, 0x03e739d, 0x36ce258, 0x0b6464b, 0x19dab22, 365 | 0x249c1a8, 0x1d28c7d, 0x1591dbc, 0x085ebab, 0x0e8274f, 0x0b090d6, 366 | }, 367 | }, 368 | .z = { 369 | .limbs = {0, 0x1}, 370 | }, 371 | }; 372 | projective_pt_wide_t result_pt; 373 | 374 | for (int i = 0; i<1; ++i) { 375 | scalar_multiply(&result_pt, &B, &mult_scalar); 376 | } 377 | { 378 | residue_wide_t tmp; 379 | mul_wide(&tmp, &expected_scalar_mult.x, &result_pt.z); 380 | assert(equal_wide(&tmp, &result_pt.x)); 381 | mul_wide(&tmp, &expected_scalar_mult.y, &result_pt.z); 382 | assert(equal_wide(&tmp, &result_pt.y)); 383 | } 384 | 385 | affine_pt_narrow_t expected_everything0 = { 386 | .x = { 387 | .limbs = { 388 | 0, 0x20eef1a, 0x3c30e66, 0x0d710f0, 0x248a6fa, 0x30c967f, 389 | 0x3ce302c, 0x0ccd1f2, 0x197e993, 0x2ebaef3, 0x0f2f019, 0, 390 | }, 391 | }, 392 | .y = { 393 | .limbs = { 394 | 0, 0x3017cc0, 0x02a5110, 0x06d37e5, 0x283a64a, 0x01484b5, 395 | 0x196f37b, 0x13de2d2, 0x0da32d1, 0x392e0fc, 0x221d742, 0, 396 | }, 397 | }, 398 | }; 399 | 400 | affine_pt_narrow_t expected_everything1 = { 401 | .x = { 402 | .limbs = { 403 | 0, 0x0e35d45, 0x038f90c, 0x0283483, 0x01ee50a, 0x1e364f9, 404 | 0x362414c, 0x156b1ed, 0x006fff6, 0x271f9ed, 0x0ffa45d, 0, 405 | }, 406 | }, 407 | .y = { 408 | .limbs = { 409 | 0, 0x156ae67, 0x27941ab, 0x19a3000, 0x3572ab5, 0x2b90ce3, 410 | 0x136156c, 0x0727496, 0x0edae82, 0x0fa5dfd, 0x16f293c, 0, 411 | }, 412 | }, 413 | }; 414 | 415 | affine_pt_narrow_t expected_everything2 = { 416 | .x = { 417 | .limbs = { 418 | 0, 0x37fcb1b, 0x16004b9, 0x1d18743, 0x0bce648, 0x0d78db6, 419 | 0x35b1d65, 0x23bb620, 0x2fbc323, 0x1a9a586, 0x3b22577, 0, 420 | }, 421 | }, 422 | .y = { 423 | .limbs = { 424 | 0, 0x082fb15, 0x03487d6, 0x3d1c2c9, 0x2c9e7ad, 0x187be10, 425 | 0x2e9b6ba, 0x15b8f89, 0x243ae4c, 0x328bb11, 0x00b12a9, 0, 426 | }, 427 | }, 428 | }; 429 | 430 | 431 | affine_pt_narrow_t expected_everything3 = { 432 | .x = { 433 | .limbs = { 434 | 0, 0x3e79b25, 0x2ca71b7, 0x2b2ea3c, 0x0de7ac4, 0x3026d10, 435 | 0x2bce79e, 0x1153866, 0x03e5a80, 0x22b9a37, 0x03e9c59, 0, 436 | }, 437 | }, 438 | .y = { 439 | .limbs = { 440 | 0, 0x20100d6, 0x2330974, 0x3402585, 0x172cfd6, 0x275a21c, 441 | 0x213e87c, 0x29989f2, 0x155e437, 0x096a378, 0x3a674eb, 0, 442 | }, 443 | }, 444 | }; 445 | 446 | affine_pt_narrow_t expected_gray_code_end0 = { 447 | .x = { 448 | .limbs = { 449 | 0, 0x14dd884, 0x12c9e33, 0x2d42122, 0x26f0b14, 0x1b9ea17, 450 | 0x3779e94, 0x2562a88, 0x0be34f0, 0x192ead9, 0x089ec45, 0, 451 | }, 452 | }, 453 | .y = { 454 | .limbs = { 455 | 0, 0x1de5221, 0x172f820, 0x28c1b33, 0x08003c6, 0x0e65926, 456 | 0x188cd49, 0x3bb39fd, 0x1b9d8d7, 0x03d5020, 0x045742b, 0, 457 | }, 458 | }, 459 | }; 460 | 461 | affine_pt_narrow_t expected_gray_code_end1 = { 462 | .x = { 463 | .limbs = { 464 | 0, 0x1d1cf29, 0x2e289d7, 0x1a83709, 0x2252d11, 0x3d6411c, 465 | 0x3fd73ad, 0x2737d9c, 0x2ca9eba, 0x058f290, 0x3879a7c, 0, 466 | }, 467 | }, 468 | .y = { 469 | .limbs = { 470 | 0, 0x357399d, 0x0276752, 0x0d5199f, 0x1bbd3a0, 0x39044f1, 471 | 0x0c5e83a, 0x1a99cdd, 0x0dcb61f, 0x35b7272, 0x1184cff, 0, 472 | }, 473 | }, 474 | }; 475 | 476 | affine_pt_narrow_t expected_gray_code_end2 = { 477 | .x = { 478 | .limbs = { 479 | 0, 0x1ea3c19, 0x081dc9e, 0x1a0b337, 0x1d7f3f4, 0x295a0aa, 480 | 0x1ebff45, 0x0956bf0, 0x17aae80, 0x05d8632, 0x3082c9a, 0, 481 | }, 482 | }, 483 | .y = { 484 | .limbs = { 485 | 0, 0x22ad91f, 0x1ffcc65, 0x37b4f5c, 0x29c51ab, 0x3f9bd02, 486 | 0x296aaf9, 0x2a58b82, 0x2c54e16, 0x2a7672c, 0x21486e2, 0, 487 | }, 488 | }, 489 | }; 490 | 491 | affine_pt_narrow_t expected_gray_code_end3 = { 492 | .x = { 493 | .limbs = { 494 | 0, 0x06b9c9d, 0x3d00674, 0x10a73fc, 0x30fda83, 0x139185c, 495 | 0x043e082, 0x3c67915, 0x208192a, 0x025e451, 0x258a566, 0, 496 | }, 497 | }, 498 | .y = { 499 | .limbs = { 500 | 0, 0x3d2a04f, 0x1314c36, 0x131c7a3, 0x1882ef3, 0x1a0a5e8, 501 | 0x1919356, 0x0a5616a, 0x1eea31d, 0x2c216b3, 0x18ba4aa, 0, 502 | }, 503 | }, 504 | }; 505 | 506 | sabs_comb_set_t computed_base_comb; 507 | compute_comb_set(&computed_base_comb, &B); 508 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 509 | //assert(computed_base_comb.combs[0].table[COMB_TABLE_SIZE - 1].x.limbs[i] == 510 | //expected_everything0.x.limbs[i]); 511 | //assert(computed_base_comb.combs[0].table[COMB_TABLE_SIZE - 1].y.limbs[i] == 512 | //expected_everything0.y.limbs[i]); 513 | //assert(computed_base_comb.combs[1].table[COMB_TABLE_SIZE - 1].x.limbs[i] == 514 | //expected_everything1.x.limbs[i]); 515 | //assert(computed_base_comb.combs[1].table[COMB_TABLE_SIZE - 1].y.limbs[i] == 516 | //expected_everything1.y.limbs[i]); 517 | //assert(computed_base_comb.combs[2].table[COMB_TABLE_SIZE - 1].x.limbs[i] == 518 | //expected_everything2.x.limbs[i]); 519 | //assert(computed_base_comb.combs[2].table[COMB_TABLE_SIZE - 1].y.limbs[i] == 520 | //expected_everything2.y.limbs[i]); 521 | //assert(computed_base_comb.combs[3].table[COMB_TABLE_SIZE - 1].x.limbs[i] == 522 | //expected_everything3.x.limbs[i]); 523 | //assert(computed_base_comb.combs[3].table[COMB_TABLE_SIZE - 1].y.limbs[i] == 524 | //expected_everything3.y.limbs[i]); 525 | } 526 | 527 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 528 | //assert(computed_base_comb.combs[0].table[7].x.limbs[i] == 529 | //expected_gray_code_end0.x.limbs[i]); 530 | //assert(computed_base_comb.combs[0].table[7].y.limbs[i] == 531 | //expected_gray_code_end0.y.limbs[i]); 532 | //assert(computed_base_comb.combs[1].table[7].x.limbs[i] == 533 | //expected_gray_code_end1.x.limbs[i]); 534 | //assert(computed_base_comb.combs[1].table[7].y.limbs[i] == 535 | //expected_gray_code_end1.y.limbs[i]); 536 | //assert(computed_base_comb.combs[2].table[7].x.limbs[i] == 537 | //expected_gray_code_end2.x.limbs[i]); 538 | //assert(computed_base_comb.combs[2].table[7].y.limbs[i] == 539 | //expected_gray_code_end2.y.limbs[i]); 540 | //assert(computed_base_comb.combs[3].table[7].x.limbs[i] == 541 | //expected_gray_code_end3.x.limbs[i]); 542 | //assert(computed_base_comb.combs[3].table[7].y.limbs[i] == 543 | //expected_gray_code_end3.y.limbs[i]); 544 | } 545 | #endif 546 | 547 | #if 1 548 | for (int i = 0; i<1; ++i) { 549 | scalar_comb_multiply(&result_pt, &base_comb, &mult_scalar); 550 | } 551 | { 552 | residue_wide_t tmp; 553 | mul_wide(&tmp, &expected_scalar_mult.x, &result_pt.z); 554 | assert(equal_wide(&tmp, &result_pt.x)); 555 | mul_wide(&tmp, &expected_scalar_mult.y, &result_pt.z); 556 | assert(equal_wide(&tmp, &result_pt.y)); 557 | } 558 | #endif 559 | #if 0 560 | for (int i = 0; i<1; ++i) { 561 | scalar_t priv_key; 562 | affine_pt_narrow_reduced_t pub_key; 563 | gen_key(&priv_key, &pub_key); 564 | } 565 | #endif 566 | for (int i = 0; i < 1; ++i) { 567 | uint8_t encoded_sk[66]; 568 | uint8_t encoded_sig[65]; 569 | const uint8_t *msg = (uint8_t *) "Hello World!"; 570 | const size_t msglen = 13; 571 | scalar_t priv_key; 572 | scalar_t priv_key_decoded; 573 | affine_pt_narrow_t pub_key; 574 | affine_pt_narrow_t pub_key_decoded; 575 | gen_key(&priv_key, &pub_key); 576 | memcpy(encoded_sk, &priv_key, SCALAR_BYTES); 577 | encode_pub_key(encoded_sk + SCALAR_BYTES, &pub_key); 578 | priv_key_decoded.limbs[SCALAR_LIMBS - 1] = 0; 579 | memcpy(&priv_key_decoded, encoded_sk, SCALAR_BYTES); 580 | for (int j = 0; j < SCALAR_LIMBS; ++j) { 581 | assert(priv_key.limbs[j] == priv_key_decoded.limbs[j]); 582 | } 583 | signature_t result; 584 | sign(&result, &priv_key_decoded, encoded_sk + SCALAR_BYTES, msg, msglen); 585 | encode_sig(encoded_sig, &result); 586 | signature_t result_decoded; 587 | decode_sig(&result_decoded, encoded_sig); 588 | for (int j = 0; j < SCALAR_LIMBS; ++j) { 589 | assert(result.s.limbs[j] == result_decoded.s.limbs[j]); 590 | } 591 | for (int j = 0; j < NLIMBS_REDUCED; ++j) { 592 | assert(result.y.limbs[j] == result_decoded.y.limbs[j]); 593 | } 594 | assert(decode_pub_key(&pub_key_decoded, encoded_sk + SCALAR_BYTES)); 595 | 596 | uint8_t y_buf[RESIDUE_LENGTH_BYTES]; 597 | encode(y_buf, &result_decoded.y); 598 | if(!verify(&result, y_buf, encoded_sk + SCALAR_BYTES, &pub_key_decoded, msg, 599 | msglen)) { 600 | printf("verification failed\n"); 601 | exit(1); 602 | } 603 | } 604 | } 605 | -------------------------------------------------------------------------------- /avx2/src/sign.c: -------------------------------------------------------------------------------- 1 | #define _DEFAULT_SOURCE 2 | #include 3 | #include 4 | #include 5 | 6 | #include "comb.h" 7 | #include "curve.h" 8 | #include "scalar.h" 9 | 10 | #include "sign.h" 11 | 12 | #include "f11_260.c" 13 | #include "curve.c" 14 | #include "scalar.c" 15 | #include "gen.c" 16 | #include "constant_time.c" 17 | #include "comb.c" 18 | 19 | void sign(signature_t *result, scalar_t *priv_key, 20 | const uint8_t *pub_key, const uint8_t *msg, size_t msg_len) { 21 | blake2b_state hash_ctxt; 22 | 23 | char session_key_wash[16]; 24 | 25 | scalar_hash_t scalar_large; 26 | scalar_t session_key; 27 | 28 | arc4random_buf(session_key_wash, sizeof(session_key_wash)); 29 | blake2b_init_key(&hash_ctxt, 64, session_key_wash, sizeof(session_key_wash)); 30 | blake2b_update(&hash_ctxt, (uint8_t *) priv_key, SCALAR_BYTES); 31 | blake2b_update(&hash_ctxt, (uint8_t *) msg, msg_len); 32 | blake2b_final(&hash_ctxt, (uint8_t *) &scalar_large, sizeof(scalar_hash_t)); 33 | 34 | reduce_hash_mod_l(&session_key, &scalar_large); 35 | 36 | projective_pt_wide_t result_pt; 37 | scalar_comb_multiply(&result_pt, &base_comb, &session_key); 38 | residue_wide_t z_inv; 39 | 40 | invert_wide(&z_inv, &result_pt.z); 41 | mul_wide(&result_pt.x, &result_pt.x, &z_inv); 42 | mul_wide(&result_pt.y, &result_pt.y, &z_inv); 43 | 44 | residue_narrow_t temp_narrow; 45 | narrow(&temp_narrow, &result_pt.y); 46 | narrow_complete(&result->y, &temp_narrow); 47 | 48 | residue_narrow_reduced_t temp_narrow_reduced; 49 | narrow(&temp_narrow, &result_pt.x); 50 | narrow_partial_complete(&temp_narrow_reduced, &temp_narrow); 51 | result->y.limbs[NLIMBS_REDUCED - 1] |= 52 | is_odd(&temp_narrow_reduced) << (TBITS); 53 | 54 | uint8_t y_buf[RESIDUE_LENGTH_BYTES]; 55 | encode(y_buf, &result->y); 56 | 57 | blake2b_init(&hash_ctxt, 64); 58 | blake2b_update(&hash_ctxt, y_buf, RESIDUE_LENGTH_BYTES); 59 | blake2b_update(&hash_ctxt, pub_key, RESIDUE_LENGTH_BYTES); 60 | blake2b_update(&hash_ctxt, msg, msg_len); 61 | blake2b_final(&hash_ctxt, (uint8_t *) &scalar_large, sizeof(scalar_hash_t)); 62 | 63 | scalar_t hash_scalar; 64 | mont_reduce_hash_mod_l(&hash_scalar, &scalar_large); 65 | mont_mult_mod_l(&hash_scalar, &hash_scalar, priv_key); 66 | mont_mult_mod_l(&hash_scalar, &hash_scalar, &SCALAR_MONT_R2_HASH_MUL); 67 | sub_mod_l(&result->s, &session_key, &hash_scalar); 68 | 69 | explicit_bzero(&session_key, sizeof(session_key)); 70 | explicit_bzero(&hash_scalar, sizeof(hash_scalar)); 71 | explicit_bzero(&session_key_wash, sizeof(session_key_wash)); 72 | } 73 | 74 | int verify( 75 | const signature_t *sig, const uint8_t *r_bytes, const uint8_t *pub_key_bytes, 76 | const affine_pt_narrow_t *pub_key_pt, const uint8_t *msg, 77 | size_t msg_len) { 78 | 79 | projective_pt_wide_t sB; 80 | projective_pt_wide_t hA; 81 | projective_pt_wide_t result_pt; 82 | residue_narrow_reduced_t result_y; 83 | 84 | scalar_hash_t scalar_large; 85 | blake2b_state hash_ctxt; 86 | blake2b_init(&hash_ctxt, 64); 87 | blake2b_update(&hash_ctxt, r_bytes, RESIDUE_LENGTH_BYTES); 88 | blake2b_update(&hash_ctxt, pub_key_bytes, RESIDUE_LENGTH_BYTES); 89 | blake2b_update(&hash_ctxt, msg, msg_len); 90 | blake2b_final(&hash_ctxt, (uint8_t *) &scalar_large, sizeof(scalar_hash_t)); 91 | 92 | scalar_t hash_scalar; 93 | reduce_hash_mod_l(&hash_scalar, &scalar_large); 94 | 95 | // Can use non-const version for both of these. 96 | scalar_comb_multiply_unsafe(&sB, &base_comb, &sig->s); 97 | scalar_multiply_unsafe(&hA, pub_key_pt, &hash_scalar); 98 | projective_add(&result_pt, &sB, &hA); 99 | 100 | // Everything below except the comparison should eventually be in helper 101 | // functions: Point affinization, and point compression bit-for-bit. 102 | // Same applies for the signing. 103 | residue_wide_t z_inv; 104 | 105 | invert_wide(&z_inv, &result_pt.z); 106 | mul_wide(&result_pt.x, &result_pt.x, &z_inv); 107 | mul_wide(&result_pt.y, &result_pt.y, &z_inv); 108 | 109 | residue_narrow_t temp_narrow; 110 | narrow(&temp_narrow, &result_pt.y); 111 | narrow_complete(&result_y, &temp_narrow); 112 | 113 | residue_narrow_reduced_t temp_narrow_reduced; 114 | narrow(&temp_narrow, &result_pt.x); 115 | narrow_partial_complete(&temp_narrow_reduced, &temp_narrow); 116 | result_y.limbs[NLIMBS_REDUCED - 1] |= 117 | is_odd(&temp_narrow_reduced) << TBITS; 118 | 119 | return equal_narrow_reduced(&sig->y, &result_y); 120 | } 121 | 122 | void encode_sig(uint8_t *result, const signature_t *sig) { 123 | residue_narrow_reduced_t pack; 124 | 125 | memcpy(&pack, &sig->y, sizeof(residue_narrow_reduced_t)); 126 | // Save the upper two bits in the uppermost part of the 33rd byte 127 | pack.limbs[NLIMBS_REDUCED - 1] |= 128 | (sig->s.limbs[SCALAR_LIMBS - 1] & 0x3) << 28; 129 | encode(result, &pack); 130 | memcpy(result + RESIDUE_LENGTH_BYTES, 131 | &sig->s, sizeof(uint32_t) * (SCALAR_LIMBS - 1)); 132 | } 133 | 134 | void decode_sig(signature_t *result, const uint8_t *encoded_sig) { 135 | decode(&result->y, encoded_sig); 136 | result->s.limbs[SCALAR_LIMBS - 1] = result->y.limbs[NLIMBS_REDUCED - 1] >> 28; 137 | // We leave an extra bit for the sign bit from compression. 138 | result->y.limbs[NLIMBS_REDUCED - 1] &= ((1 << (TBITS + 1)) - 1); 139 | memcpy(&result->s, encoded_sig + RESIDUE_LENGTH_BYTES, 140 | sizeof(uint32_t) * (SCALAR_LIMBS - 1)); 141 | } 142 | -------------------------------------------------------------------------------- /avx512/include: -------------------------------------------------------------------------------- 1 | ../ref/include -------------------------------------------------------------------------------- /avx512/src/f11_260.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "f11_260.h" 3 | #include "mul_inline.h" 4 | #include "emmintrin.h" 5 | #include "immintrin.h" 6 | 7 | residue_narrow_t zero_narrow = {0}; 8 | residue_narrow_t one_narrow = { 9 | .limbs = {1}, 10 | }; 11 | 12 | #define NVECTORS 3 13 | #define VECTWIDTH 4 14 | 15 | __attribute__((__aligned__(32))) 16 | static const int32_t COLLAPSE[8] = { 0, 2, 4, 6, 4, 5, 6, 7 }; 17 | 18 | // Shrink to 32 bits. Assumes reduction has already occurred, and wide storage 19 | // is being used for vector compatibility. 20 | void narrow(residue_narrow_t *result, const residue_wide_t * __restrict w) { 21 | __m256i collapse_perm = _mm256_load_si256((__m256i*) COLLAPSE); 22 | __m128i packed_result; 23 | #pragma clang loop unroll(full) 24 | for (int i = 0; i < NVECTORS; ++i) { 25 | __m256i x = _mm256_load_si256((__m256i*) (&w->limbs[i * VECTWIDTH])); 26 | packed_result = _mm256_castsi256_si128( 27 | _mm256_permutevar8x32_epi32(x, collapse_perm)); 28 | _mm_store_si128((__m128i*) &result->limbs[i * VECTWIDTH], packed_result); 29 | } 30 | } 31 | 32 | // Reduce to 10 limbs. Useful for debugging. 33 | void narrow_reduce( 34 | residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w) { 35 | residue_narrow_t temp; 36 | 37 | __m128i x = _mm_load_si128((__m128i *) (&w->limbs[0 * VECTWIDTH])); 38 | __m128i x10 = _mm_broadcastd_epi32(x); 39 | x = _mm_sub_epi32(x, x10); 40 | _mm_store_si128((__m128i *) &temp.limbs[0 * VECTWIDTH], x); 41 | x = _mm_load_si128((__m128i *) (&w->limbs[1 * VECTWIDTH])); 42 | x = _mm_sub_epi32(x, x10); 43 | _mm_store_si128((__m128i *) &temp.limbs[1 * VECTWIDTH], x); 44 | x = _mm_load_si128((__m128i *) (&w->limbs[2 * VECTWIDTH])); 45 | x = _mm_sub_epi32(x, x10); 46 | _mm_store_si128((__m128i *) &temp.limbs[2 * VECTWIDTH], x); 47 | 48 | reduce_step_narrow(&temp, &temp); 49 | 50 | // May want to use vpalignr here. 51 | #pragma clang loop unroll(full) 52 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 53 | result->limbs[i] = temp.limbs[i] - temp.limbs[10]; 54 | } 55 | } 56 | 57 | // Reduce to unique representative. 58 | // This is expensive. Only used for final signature or DH Key 59 | void narrow_complete( 60 | residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w) { 61 | 62 | residue_narrow_t temp; 63 | for (int i = 0; i < NLIMBS; ++i) { 64 | temp.limbs[i] = w->limbs[i] - w->limbs[10]; 65 | } 66 | 67 | // This may be combined with the final reduction from a multiply. 68 | reduce_step_narrow(&temp, &temp); 69 | 70 | int gt_mask = 0; 71 | int lt_mask = 0; 72 | int32_t limit[NLIMBS]; 73 | for (int i = 0; i < NLIMBS; ++i) { 74 | temp.limbs[i] = temp.limbs[i] - temp.limbs[10]; 75 | temp.limbs[i] += 1 & gt_mask; 76 | temp.limbs[i] -= 1 & lt_mask; 77 | gt_mask = -(temp.limbs[i] > T); 78 | lt_mask = -(temp.limbs[i] < 0); 79 | temp.limbs[i] -= (T & gt_mask); 80 | temp.limbs[i] += (T & lt_mask); 81 | } 82 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 83 | temp.limbs[i] -= temp.limbs[10]; 84 | limit[i] = T; 85 | } 86 | int64_t all_t = -1; 87 | for (int i = NLIMBS_REDUCED - 2; i >= 0; --i) { 88 | all_t &= -(temp.limbs[i+1] == T); 89 | limit[i] -= 1 & (~all_t); 90 | } 91 | gt_mask = 0; 92 | lt_mask = 0; 93 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 94 | temp.limbs[i] += 1 & gt_mask; 95 | temp.limbs[i] -= 1 & lt_mask; 96 | gt_mask = -(temp.limbs[i] > limit[i]); 97 | lt_mask = -(temp.limbs[i] < 0); 98 | temp.limbs[i] -= (T & gt_mask); 99 | temp.limbs[i] += (T & lt_mask); 100 | result->limbs[i] = temp.limbs[i]; 101 | } 102 | } 103 | 104 | // Reduce to mostly unique representative. 105 | // All coefficients are reduced to 0 <= xi <= t 106 | // Unique up to carries (xi == t) => (xi = 0; x[i+1] += 1); 107 | // This is sufficient to determine if x is even or odd. 108 | // Still pretty expensive. Used in point compression. 109 | void narrow_partial_complete( 110 | residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w) { 111 | 112 | residue_narrow_t temp; 113 | for (int i = 0; i < NLIMBS; ++i) { 114 | temp.limbs[i] = w->limbs[i] - w->limbs[10]; 115 | } 116 | 117 | // This may be combined with the final reduction from a multiply. 118 | reduce_step_narrow(&temp, &temp); 119 | 120 | int gt_mask = 0; 121 | int lt_mask = 0; 122 | for (int i = 0; i < NLIMBS; ++i) { 123 | temp.limbs[i] = temp.limbs[i] - temp.limbs[10]; 124 | temp.limbs[i] += 1 & gt_mask; 125 | temp.limbs[i] -= 1 & lt_mask; 126 | gt_mask = -(temp.limbs[i] > T); 127 | lt_mask = -(temp.limbs[i] < 0); 128 | temp.limbs[i] -= (T & gt_mask); 129 | temp.limbs[i] += (T & lt_mask); 130 | } 131 | for (int i = 0; i < NLIMBS - 1; ++i) { 132 | temp.limbs[i] -= temp.limbs[10]; 133 | } 134 | gt_mask = 0; 135 | lt_mask = 0; 136 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 137 | temp.limbs[i] += 1 & gt_mask; 138 | temp.limbs[i] -= 1 & lt_mask; 139 | gt_mask = -(temp.limbs[i] > T); 140 | lt_mask = -(temp.limbs[i] < 0); 141 | temp.limbs[i] -= (T & gt_mask); 142 | temp.limbs[i] += (T & lt_mask); 143 | result->limbs[i] = temp.limbs[i]; 144 | } 145 | } 146 | 147 | int is_odd(residue_narrow_reduced_t *x) { 148 | int result = 0; 149 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 150 | result ^= x->limbs[i] & 0x1; 151 | } 152 | return result; 153 | } 154 | 155 | // Copy a 12x32-bit residue 156 | void copy_narrow( 157 | residue_narrow_t *result, const residue_narrow_t * __restrict x) { 158 | 159 | for (int i = 0; i < NLIMBS; ++i) { 160 | result->limbs[i] = x->limbs[i]; 161 | } 162 | } 163 | 164 | // Copy a 10x32-bit residue 165 | void copy_narrow_reduced( 166 | residue_narrow_reduced_t *result, 167 | const residue_narrow_reduced_t * __restrict x) { 168 | 169 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 170 | result->limbs[i] = x->limbs[i]; 171 | } 172 | } 173 | 174 | static inline __m256i load_extend_32_64(__m128i *x) { 175 | return _mm256_cvtepi32_epi64(_mm_load_si128(x)); 176 | } 177 | 178 | static inline __m256i loadu_extend_32_64(__m128i *x) { 179 | return _mm256_cvtepi32_epi64(_mm_loadu_si128(x)); 180 | } 181 | 182 | static inline __m512i load512_extend_32_64(__m256i *x) { 183 | return _mm512_cvtepi32_epi64(_mm256_load_si256(x)); 184 | } 185 | 186 | static inline __m512i loadu512_extend_32_64(__m256i *x) { 187 | return _mm512_cvtepi32_epi64(_mm256_loadu_si256(x)); 188 | } 189 | 190 | static inline __m512i loadu512_mask_extend_32_64(__m256i *x, __mmask8 k) { 191 | return _mm512_cvtepi32_epi64( 192 | _mm256_mask_loadu_epi32(_mm256_setzero_si256(), k, x)); 193 | } 194 | 195 | void reduce_step_narrow( 196 | residue_narrow_t *result, const residue_narrow_t *x) { 197 | return reduce_step_narrow_i(result, x); 198 | } 199 | 200 | void reduce_step_wide( 201 | residue_wide_t *result, const residue_wide_t *x) { 202 | return reduce_step_wide_i(result, x); 203 | } 204 | 205 | void mul_narrow( 206 | residue_narrow_t *result, const residue_narrow_t *x, 207 | const residue_narrow_t *y) { 208 | return mul_narrow_i(result, x, y); 209 | } 210 | 211 | void square_narrow( 212 | residue_narrow_t *result, const residue_narrow_t *x) { 213 | return square_narrow_i(result, x); 214 | } 215 | 216 | // Produce a 64-bit residue 217 | void widen( 218 | residue_wide_t *result, const residue_narrow_t * __restrict x) { 219 | __m256i wide10 = loadu_extend_32_64((__m128i *) x); 220 | __m512i wide3 = loadu512_extend_32_64((__m256i *) &x[4]); 221 | _mm256_store_si256((__m256i*) &result->limbs[0], wide10); 222 | _mm512_storeu_si512((__m512i*) &result->limbs[4], wide3); 223 | } 224 | 225 | // Subtract 2 12x32-bit residues. 226 | void sub_narrow( 227 | residue_narrow_t *result, const residue_narrow_t * __restrict x, 228 | const residue_narrow_t * __restrict y) { 229 | 230 | __m512i lhs = _mm512_load_si512((__m512i*) &x->limbs[0]); 231 | __m512i rhs = _mm512_load_si512((__m512i*) &y->limbs[0]); 232 | __m512i sub = _mm512_sub_epi32(lhs, rhs); 233 | _mm512_store_si512((__m512i*) &result->limbs[0], sub); 234 | } 235 | 236 | // negate a 12x64-bit residue. 237 | void negate_wide(residue_wide_t *result, const residue_wide_t *x) { 238 | 239 | __m256i zero = _mm256_setzero_si256(); 240 | #pragma clang loop unroll(full) 241 | for (int i = 0; i < NVECTORS; ++i) { 242 | __m256i xv = _mm256_load_si256((__m256i*) (&x->limbs[i * VECTWIDTH])); 243 | xv = _mm256_sub_epi64(zero, xv); 244 | _mm256_store_si256((__m256i*) &result->limbs[i * VECTWIDTH], xv); 245 | } 246 | } 247 | 248 | // negate a 12x32-bit residue. 249 | void negate_narrow( 250 | residue_narrow_t *result, const residue_narrow_t *x) { 251 | 252 | __m512i lhs = _mm512_load_si512((__m512i*) &x->limbs[0]); 253 | __m512i zero = _mm512_setzero(); 254 | __m512i neg = _mm512_sub_epi32(zero, lhs); 255 | _mm512_store_si512((__m512i*) &result->limbs[0], neg); 256 | } 257 | 258 | // Add 2 12x32-bit residues. 259 | void add_narrow( 260 | residue_narrow_t *result, const residue_narrow_t * __restrict x, 261 | const residue_narrow_t * __restrict y) { 262 | 263 | __m512i lhs = _mm512_load_si512((__m512i*) &x->limbs[0]); 264 | __m512i rhs = _mm512_load_si512((__m512i*) &y->limbs[0]); 265 | __m512i add = _mm512_add_epi32(lhs, rhs); 266 | _mm512_store_si512((__m512i*) &result->limbs[0], add); 267 | } 268 | 269 | // Scale a narrow residue by 2. 270 | void double_narrow( 271 | residue_narrow_t *result, const residue_narrow_t *x) { 272 | 273 | __m512i lhs = _mm512_load_si512((__m512i*) &x->limbs[0]); 274 | __m512i dub = _mm512_slli_epi32(lhs, 1); 275 | _mm512_store_si512((__m512i*) &result->limbs[0], dub); 276 | } 277 | 278 | // Scale a wide residue by 2. 279 | void double_wide( 280 | residue_wide_t *result, const residue_wide_t *x) { 281 | 282 | for (int i = 0; i < NLIMBS; ++i) { 283 | result->limbs[i] = x->limbs[i] << 1; 284 | } 285 | } 286 | 287 | #include 288 | #include 289 | // static void print4x64(__m256i x, const char * preamble) { 290 | // uint64_t x_vals[4]; 291 | // memcpy(x_vals, &x, sizeof(x_vals)); 292 | // printf("%s\n", preamble); 293 | // for (int i = 0; i < 4; ++i) { 294 | // printf("%#lx\n", x_vals[i]); 295 | // } 296 | // } 297 | 298 | // static void print8x64(__m512i x, const char * preamble) { 299 | // uint64_t x_vals[8]; 300 | // memcpy(x_vals, &x, sizeof(x_vals)); 301 | // printf("%s\n", preamble); 302 | // for (int i = 0; i < 8; ++i) { 303 | // printf("%#lx\n", x_vals[i]); 304 | // } 305 | // } 306 | 307 | // static void print16x32(__m512i x, const char * preamble) { 308 | // uint32_t x_vals[16]; 309 | // memcpy(x_vals, &x, sizeof(x_vals)); 310 | // printf("%s\n", preamble); 311 | // for (int i = 0; i < 16; ++i) { 312 | // printf("%#x\n", x_vals[i]); 313 | // } 314 | // } 315 | 316 | // The swaps below trade 32 bit words within 128 bit lanes 317 | // in low endian order: 01 00 11 10 318 | // in big endian order 10 11 00 01 = 0xb1 319 | 320 | static const int32_t permute_final_result[16] = { 321 | 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 322 | }; 323 | 324 | // Multiply a narrow residue by a small constant. The result is reduced to 32 325 | // bits. 326 | void mul_narrow_const( 327 | residue_narrow_t *result, const residue_narrow_t *x, int32_t d) { 328 | 329 | residue_wide_t temp; 330 | for (int i = 0; i < NLIMBS; ++i) { 331 | temp.limbs[i] = ((uint64_t) x->limbs[i]) * d; 332 | } 333 | reduce_step_wide(&temp, &temp); 334 | __m512i _permute_final_result = _mm512_load_si512(permute_final_result); 335 | __m512i accum0 = _mm512_load_si512((__m512i*) &temp.limbs[0]); 336 | __m256i accum8 = _mm256_load_si256((__m256i*) &temp.limbs[8]); 337 | __m512i final_result = _mm512_permutex2var_epi32( 338 | accum0, _permute_final_result, _mm512_castsi256_si512(accum8)); 339 | _mm512_store_si512((__m512i*) &result->limbs[0], final_result); 340 | } 341 | 342 | // Takes advantage of the fact that if a residue z *is zero* then after setting 343 | // one coefficient to T/2, all the remaining coefficients should be near to 344 | // T/2. They should therefore resolve all carries in a single step, and all be 345 | // equal to the same value. Some other value may not reduce completely, but this 346 | // is fine, we will know it is not zero. 347 | int equal_narrow(const residue_narrow_t *x, const residue_narrow_t *y) { 348 | residue_narrow_t temp; 349 | 350 | sub_narrow(&temp, x, y); 351 | int32_t delta = -temp.limbs[0] + (T / 2); 352 | for (int i = 0; i < NLIMBS; ++i) { 353 | temp.limbs[i] += delta; 354 | } 355 | 356 | reduce_step_narrow(&temp, &temp); 357 | 358 | delta = temp.limbs[0]; 359 | int result = 0; 360 | for (int i = 1; i < NLIMBS; ++i) { 361 | result |= (temp.limbs[i] ^ delta); 362 | } 363 | 364 | return !result; 365 | } 366 | 367 | int equal_narrow_reduced( 368 | const residue_narrow_reduced_t * x, const residue_narrow_reduced_t * y) { 369 | 370 | int result = 0; 371 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 372 | result |= (x->limbs[i] ^ y->limbs[i]); 373 | } 374 | 375 | return !result; 376 | } 377 | 378 | static inline void nsquare_narrow( 379 | residue_narrow_t *result, const residue_narrow_t *x, int n) { 380 | 381 | square_narrow_i(result, x); 382 | for (int i = 1; i < n; ++i) { 383 | square_narrow_i(result, result); 384 | } 385 | } 386 | 387 | static void raise_to_t( 388 | residue_narrow_t *result, const residue_narrow_t *x) { 389 | // zi = z^(2^i - 1), z1 = x 390 | residue_narrow_t z2; 391 | residue_narrow_t z3; 392 | residue_narrow_t z5; 393 | residue_narrow_t z10; 394 | residue_narrow_t z11; 395 | residue_narrow_t z22; 396 | residue_narrow_t result_t; 397 | 398 | square_narrow_i(&z2, x); 399 | mul_narrow_i(&z2, &z2, x); 400 | square_narrow_i(&z3, &z2); 401 | mul_narrow_i(&z3, &z3, x); 402 | nsquare_narrow(&z5, &z3, 2); 403 | mul_narrow_i(&z5, &z5, &z2); 404 | nsquare_narrow(&z10, &z5, 5); 405 | mul_narrow_i(&z10, &z10, &z5); 406 | square_narrow_i(&z11, &z10); 407 | mul_narrow_i(&z11, &z11, x); 408 | nsquare_narrow(&z22, &z11, 11); 409 | mul_narrow_i(&z22, &z22, &z11); 410 | nsquare_narrow(&result_t, &z22, 4); 411 | mul_narrow_i(result, &result_t, x); 412 | } 413 | 414 | static void raise_to_t2( 415 | residue_narrow_t *result, const residue_narrow_t *x) { 416 | // t^2 = 0xfffff880000e1 417 | // zi = z^(2^i - 1), z1 = x 418 | residue_narrow_t z2; 419 | residue_narrow_t z3; 420 | residue_narrow_t z5; 421 | residue_narrow_t z10; 422 | residue_narrow_t z20; 423 | residue_narrow_t result_t; 424 | 425 | square_narrow_i(&z2, x); 426 | mul_narrow_i(&z2, &z2, x); 427 | square_narrow_i(&z3, &z2); 428 | mul_narrow_i(&z3, &z3, x); 429 | nsquare_narrow(&z5, &z3, 2); 430 | mul_narrow_i(&z5, &z5, &z2); 431 | nsquare_narrow(&z10, &z5, 5); 432 | mul_narrow_i(&z10, &z10, &z5); 433 | nsquare_narrow(&z20, &z10, 10); 434 | mul_narrow_i(&z20, &z20, &z10); 435 | square_narrow_i(&result_t, &z20); 436 | mul_narrow_i(&result_t, &result_t, x); 437 | nsquare_narrow(&result_t, &result_t, 4); 438 | mul_narrow_i(&result_t, &result_t, x); 439 | // 22 = 3 for zeros in 8, 16 for zeros in 0000, 3 to make room for e. 440 | nsquare_narrow(&result_t, &result_t, 22); 441 | mul_narrow_i(&result_t, &result_t, &z3); 442 | nsquare_narrow(&result_t, &result_t, 5); 443 | mul_narrow_i(result, &result_t, x); 444 | } 445 | 446 | static void raise_to_phi_t( 447 | residue_narrow_t *result, const residue_narrow_t *x, int n) { 448 | residue_narrow_t temp; 449 | 450 | raise_to_t(&temp, x); 451 | 452 | for (int i = 1; i < n; ++i) { 453 | mul_narrow(&temp, &temp, x); 454 | raise_to_t(&temp, &temp); 455 | } 456 | 457 | mul_narrow(result, &temp, x); 458 | } 459 | 460 | static void raise_to_t_minus_1_over_4( 461 | residue_narrow_t *result, const residue_narrow_t *x) { 462 | // zi = z^(2^i - 1), z1 = x 463 | residue_narrow_t z2; 464 | residue_narrow_t z3; 465 | residue_narrow_t z5; 466 | residue_narrow_t z10; 467 | residue_narrow_t z11; 468 | residue_narrow_t z22; 469 | 470 | square_narrow_i(&z2, x); 471 | mul_narrow_i(&z2, &z2, x); 472 | square_narrow_i(&z3, &z2); 473 | mul_narrow_i(&z3, &z3, x); 474 | nsquare_narrow(&z5, &z3, 2); 475 | mul_narrow_i(&z5, &z5, &z2); 476 | nsquare_narrow(&z10, &z5, 5); 477 | mul_narrow_i(&z10, &z10, &z5); 478 | square_narrow_i(&z11, &z10); 479 | mul_narrow_i(&z11, &z11, x); 480 | nsquare_narrow(&z22, &z11, 11); 481 | mul_narrow_i(&z22, &z22, &z11); 482 | nsquare_narrow(result, &z22, 2); 483 | } 484 | 485 | static void raise_to_p_minus_3_over_4( 486 | residue_narrow_t *result, const residue_narrow_t *x) { 487 | 488 | residue_narrow_t z4; //z to (t-1)/4 489 | residue_narrow_t z2; //z to (t-1)/2 490 | residue_narrow_t z3_4; //z to (3t+1)/4 491 | residue_narrow_t y_small; 492 | residue_narrow_t y, y_t4_y; 493 | residue_narrow_t raised; 494 | 495 | raise_to_t_minus_1_over_4(&z4, x); 496 | square_narrow(&z2, &z4); 497 | mul_narrow(&z3_4, &z2, &z4); 498 | mul_narrow(&z3_4, &z3_4, x); 499 | raise_to_t(&raised, &z4); 500 | mul_narrow(&y_small, &z2, &raised); 501 | raise_to_t(&raised, &y_small); 502 | mul_narrow(&y, &z3_4, &raised); 503 | raise_to_t(&raised, &y); 504 | raise_to_t(&raised, &raised); 505 | raise_to_t(&raised, &raised); 506 | raise_to_t(&raised, &raised); 507 | mul_narrow(&y_t4_y, &raised, &y); 508 | raise_to_t(&raised, &y_t4_y); 509 | raise_to_t(&raised, &raised); 510 | raise_to_t(&raised, &raised); 511 | mul_narrow(result, &raised, &y_small); 512 | } 513 | 514 | int sqrt_inv_narrow( 515 | residue_narrow_t *result, const residue_narrow_t * __restrict x, 516 | const residue_narrow_t * __restrict y) { 517 | residue_narrow_t xy; 518 | residue_narrow_t y2; 519 | residue_narrow_t xy3; 520 | residue_narrow_t xy3_p_3_over_4; 521 | residue_narrow_t cand2; 522 | residue_narrow_t should_be_x; 523 | 524 | square_narrow(&y2, y); 525 | mul_narrow(&xy, x, y); 526 | mul_narrow(&xy3, &xy, &y2); 527 | raise_to_p_minus_3_over_4(&xy3_p_3_over_4, &xy3); 528 | mul_narrow(result, &xy, &xy3_p_3_over_4); 529 | square_narrow(&cand2, result); 530 | mul_narrow(&should_be_x, y, &cand2); 531 | 532 | return equal_narrow(&should_be_x, x); 533 | } 534 | 535 | void invert_narrow( 536 | residue_narrow_t *result, const residue_narrow_t * __restrict x) { 537 | 538 | residue_narrow_t x_t_minus_1_over_4; 539 | residue_narrow_t x_t_minus_1; 540 | // x^2 (trades a multiply for a square) 541 | residue_narrow_t x2; 542 | // rho_k = x^((t^k - 1)/(t - 1)) 543 | // rho_1 = x 544 | residue_narrow_t rho_2, rho_4, rho_8, rho_9; 545 | residue_narrow_t result_t; 546 | 547 | raise_to_t_minus_1_over_4(&x_t_minus_1_over_4, x); 548 | nsquare_narrow(&x_t_minus_1, &x_t_minus_1_over_4, 2); 549 | square_narrow_i(&x2, x); 550 | mul_narrow_i(&rho_2, &x_t_minus_1, &x2); 551 | raise_to_t2(&rho_4, &rho_2); 552 | mul_narrow_i(&rho_4, &rho_4, &rho_2); 553 | raise_to_t2(&rho_8, &rho_4); 554 | raise_to_t2(&rho_8, &rho_8); 555 | mul_narrow_i(&rho_8, &rho_8, &rho_4); 556 | raise_to_t(&rho_9, &rho_8); 557 | mul_narrow_i(&rho_9, &rho_9, x); 558 | raise_to_t2(&result_t, &rho_9); 559 | mul_narrow_i(result, &result_t, &x_t_minus_1); 560 | } 561 | 562 | void encode(uint8_t *out, const residue_narrow_reduced_t * __restrict x) { 563 | uint32_t collect = x->limbs[0]; 564 | 565 | int space = 32 - TBITS; 566 | int i = 1; 567 | int bits_remaining = TBITS * NLIMBS_REDUCED; 568 | while (bits_remaining > 0) { 569 | *out++ = collect & 0xff; 570 | collect >>= 8; 571 | space += 8; 572 | bits_remaining -= 8; 573 | if (space >= TBITS && i < NLIMBS_REDUCED) { 574 | collect |= x->limbs[i] << (32 - space); 575 | space -= TBITS; 576 | ++i; 577 | } 578 | } 579 | } 580 | 581 | void decode(residue_narrow_reduced_t *out, const uint8_t *in) { 582 | uint32_t collect = 0; 583 | 584 | int shift = 0; 585 | int i = 0; 586 | int bits_remaining = TBITS * NLIMBS_REDUCED; 587 | while (bits_remaining > 0) { 588 | collect |= (*in++) << shift; 589 | shift += 8; 590 | bits_remaining -= 8; 591 | if (shift >= TBITS) { 592 | if (bits_remaining > 0) { 593 | out->limbs[i] = collect & TMASK; 594 | collect >>= 26; 595 | shift -= 26; 596 | ++i; 597 | } else { 598 | out->limbs[i] = collect; 599 | } 600 | } 601 | } 602 | } 603 | -------------------------------------------------------------------------------- /avx512/src/main.c: -------------------------------------------------------------------------------- 1 | ../../ref/src/main.c -------------------------------------------------------------------------------- /avx512/src/scalar.c: -------------------------------------------------------------------------------- 1 | ../../ref/src/scalar.c -------------------------------------------------------------------------------- /avx512/src/sign.c: -------------------------------------------------------------------------------- 1 | ../../ref/src/sign.c -------------------------------------------------------------------------------- /ref/Makefile: -------------------------------------------------------------------------------- 1 | #### PROJECT SETTINGS #### 2 | # The name of the executable to be created 3 | BIN_NAME := p11_260_test 4 | # Compiler used 5 | CC = clang-10 6 | # Extension of source files used in the project 7 | SRC_EXT = c 8 | # Path to the source directory, relative to the makefile 9 | SRC_PATH = src 10 | # Space-separated pkg-config libraries used by this project 11 | LIBS = 12 | # General compiler flags 13 | COMPILE_FLAGS = -march=haswell -std=c11 -Wall -Wextra 14 | # Additional release-specific flags 15 | RCOMPILE_FLAGS = -O2 -D DEBUG -g 16 | # Additional debug-specific flags 17 | DCOMPILE_FLAGS = -g -D DEBUG 18 | # Add additional include paths 19 | INCLUDES = -Iinclude -isystem /usr/include/bsd -DLIBBSD_OVERLAY 20 | # General linker settings 21 | LINK_FLAGS = -lbsd -lb2 22 | # Additional release-specific linker settings 23 | RLINK_FLAGS = 24 | # Additional debug-specific linker settings 25 | DLINK_FLAGS = 26 | # Destination directory, like a jail or mounted system 27 | DESTDIR = / 28 | # Install path (bin/ is appended automatically) 29 | INSTALL_PREFIX = home/kyle/.local 30 | #### END PROJECT SETTINGS #### 31 | 32 | # Optionally you may move the section above to a separate config.mk file, and 33 | # uncomment the line below 34 | # include config.mk 35 | 36 | # Generally should not need to edit below this line 37 | 38 | # Obtains the OS type, either 'Darwin' (OS X) or 'Linux' 39 | UNAME_S:=$(shell uname -s) 40 | 41 | # Function used to check variables. Use on the command line: 42 | # make print-VARNAME 43 | # Useful for debugging and adding features 44 | print-%: ; @echo $*=$($*) 45 | 46 | # Shell used in this makefile 47 | # bash is used for 'echo -en' 48 | SHELL = /bin/bash 49 | # Clear built-in rules 50 | .SUFFIXES: 51 | # Programs for installation 52 | INSTALL = install 53 | INSTALL_PROGRAM = $(INSTALL) 54 | INSTALL_DATA = $(INSTALL) -m 644 55 | 56 | # Append pkg-config specific libraries if need be 57 | ifneq ($(LIBS),) 58 | COMPILE_FLAGS += $(shell pkg-config --cflags $(LIBS)) 59 | LINK_FLAGS += $(shell pkg-config --libs $(LIBS)) 60 | endif 61 | 62 | # Verbose option, to output compile and link commands 63 | export V := false 64 | export CMD_PREFIX := @ 65 | ifeq ($(V),true) 66 | CMD_PREFIX := 67 | endif 68 | 69 | # Combine compiler and linker flags 70 | release: export CFLAGS := $(CFLAGS) $(COMPILE_FLAGS) $(RCOMPILE_FLAGS) 71 | release: export LDFLAGS := $(LDFLAGS) $(LINK_FLAGS) $(RLINK_FLAGS) 72 | debug: export CFLAGS := $(CFLAGS) $(COMPILE_FLAGS) $(DCOMPILE_FLAGS) 73 | debug: export LDFLAGS := $(LDFLAGS) $(LINK_FLAGS) $(DLINK_FLAGS) 74 | 75 | # Build and output paths 76 | release: export BUILD_PATH := build/release 77 | release: export BIN_PATH := bin/release 78 | debug: export BUILD_PATH := build/debug 79 | debug: export BIN_PATH := bin/debug 80 | install: export BIN_PATH := bin/release 81 | 82 | # Find all source files in the source directory, sorted by most 83 | # recently modified 84 | ifeq ($(UNAME_S),Darwin) 85 | SOURCES = $(shell find $(SRC_PATH) -name '*.$(SRC_EXT)' | sort -k 1nr | cut -f2-) 86 | else 87 | SOURCES = $(shell find $(SRC_PATH) -name '*.$(SRC_EXT)' -printf '%T@\t%p\n' \ 88 | | sort -k 1nr | cut -f2-) 89 | endif 90 | 91 | # fallback in case the above fails 92 | rwildcard = $(foreach d, $(wildcard $1*), $(call rwildcard,$d/,$2) \ 93 | $(filter $(subst *,%,$2), $d)) 94 | ifeq ($(SOURCES),) 95 | SOURCES := $(call rwildcard, $(SRC_PATH), *.$(SRC_EXT)) 96 | endif 97 | 98 | # Set the object file names, with the source directory stripped 99 | # from the path, and the build path prepended in its place 100 | OBJECTS = $(SOURCES:$(SRC_PATH)/%.$(SRC_EXT)=$(BUILD_PATH)/%.o) 101 | # Set the dependency files that will be used to add header dependencies 102 | DEPS = $(OBJECTS:.o=.d) 103 | 104 | # Macros for timing compilation 105 | ifeq ($(UNAME_S),Darwin) 106 | CUR_TIME = awk 'BEGIN{srand(); print srand()}' 107 | TIME_FILE = $(dir $@).$(notdir $@)_time 108 | START_TIME = $(CUR_TIME) > $(TIME_FILE) 109 | END_TIME = read st < $(TIME_FILE) ; \ 110 | $(RM) $(TIME_FILE) ; \ 111 | st=$$((`$(CUR_TIME)` - $$st)) ; \ 112 | echo $$st 113 | else 114 | TIME_FILE = $(dir $@).$(notdir $@)_time 115 | START_TIME = date '+%s' > $(TIME_FILE) 116 | END_TIME = read st < $(TIME_FILE) ; \ 117 | $(RM) $(TIME_FILE) ; \ 118 | st=$$((`date '+%s'` - $$st - 86400)) ; \ 119 | echo `date -u -d @$$st '+%H:%M:%S'` 120 | endif 121 | 122 | # Version macros 123 | # Comment/remove this section to remove versioning 124 | USE_VERSION := false 125 | # If this isn't a git repo or the repo has no tags, git describe will return non-zero 126 | ifeq ($(shell git describe > /dev/null 2>&1 ; echo $$?), 0) 127 | USE_VERSION := true 128 | VERSION := $(shell git describe --tags --long --dirty --always | \ 129 | sed 's/v\([0-9]*\)\.\([0-9]*\)\.\([0-9]*\)-\?.*-\([0-9]*\)-\(.*\)/\1 \2 \3 \4 \5/g') 130 | VERSION_MAJOR := $(word 1, $(VERSION)) 131 | VERSION_MINOR := $(word 2, $(VERSION)) 132 | VERSION_PATCH := $(word 3, $(VERSION)) 133 | VERSION_REVISION := $(word 4, $(VERSION)) 134 | VERSION_HASH := $(word 5, $(VERSION)) 135 | VERSION_STRING := \ 136 | "$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH).$(VERSION_REVISION)-$(VERSION_HASH)" 137 | override CFLAGS := $(CFLAGS) \ 138 | -D VERSION_MAJOR=$(VERSION_MAJOR) \ 139 | -D VERSION_MINOR=$(VERSION_MINOR) \ 140 | -D VERSION_PATCH=$(VERSION_PATCH) \ 141 | -D VERSION_REVISION=$(VERSION_REVISION) \ 142 | -D VERSION_HASH=\"$(VERSION_HASH)\" 143 | endif 144 | 145 | # Standard, non-optimized release build 146 | .PHONY: release 147 | release: dirs 148 | ifeq ($(USE_VERSION), true) 149 | @echo "Beginning release build v$(VERSION_STRING)" 150 | else 151 | @echo "Beginning release build" 152 | endif 153 | @$(MAKE) all --no-print-directory 154 | 155 | # Debug build for gdb debugging 156 | .PHONY: debug 157 | debug: dirs 158 | ifeq ($(USE_VERSION), true) 159 | @echo "Beginning debug build v$(VERSION_STRING)" 160 | else 161 | @echo "Beginning debug build" 162 | endif 163 | @$(MAKE) all --no-print-directory 164 | 165 | # Create the directories used in the build 166 | .PHONY: dirs 167 | dirs: 168 | @echo "Creating directories" 169 | @mkdir -p $(dir $(OBJECTS)) 170 | @mkdir -p $(BIN_PATH) 171 | 172 | # Installs to the set path 173 | .PHONY: install 174 | install: 175 | @echo "Installing to $(DESTDIR)$(INSTALL_PREFIX)/bin" 176 | @$(INSTALL_PROGRAM) $(BIN_PATH)/$(BIN_NAME) $(DESTDIR)$(INSTALL_PREFIX)/bin 177 | 178 | # Uninstalls the program 179 | .PHONY: uninstall 180 | uninstall: 181 | @echo "Removing $(DESTDIR)$(INSTALL_PREFIX)/bin/$(BIN_NAME)" 182 | @$(RM) $(DESTDIR)$(INSTALL_PREFIX)/bin/$(BIN_NAME) 183 | 184 | # Removes all build files 185 | .PHONY: clean 186 | clean: 187 | @echo "Deleting $(BIN_NAME) symlink" 188 | @$(RM) $(BIN_NAME) 189 | @echo "Deleting directories" 190 | @$(RM) -r build 191 | @$(RM) -r bin 192 | 193 | # Main rule, checks the executable and symlinks to the output 194 | all: $(BIN_PATH)/$(BIN_NAME) 195 | @echo "Making symlink: $(BIN_NAME) -> $<" 196 | @$(RM) $(BIN_NAME) 197 | @ln -s $(BIN_PATH)/$(BIN_NAME) $(BIN_NAME) 198 | 199 | # Link the executable 200 | $(BIN_PATH)/$(BIN_NAME): $(OBJECTS) 201 | @echo "Linking: $@" 202 | @$(START_TIME) 203 | $(CMD_PREFIX)$(CC) $(OBJECTS) $(LDFLAGS) -o $@ 204 | @echo -en "\t Link time: " 205 | @$(END_TIME) 206 | 207 | # Add dependency files, if they exist 208 | -include $(DEPS) 209 | 210 | # Source file rules 211 | # After the first compilation they will be joined with the rules from the 212 | # dependency files to provide header dependencies 213 | $(BUILD_PATH)/%.o: $(SRC_PATH)/%.$(SRC_EXT) 214 | @echo "Compiling: $< -> $@" 215 | $(CMD_PREFIX)$(CC) $(CFLAGS) $(INCLUDES) -MP -MMD -c $< -o $@ 216 | -------------------------------------------------------------------------------- /ref/api.h: -------------------------------------------------------------------------------- 1 | #define CRYPTO_SECRETKEYBYTES 66 2 | #define CRYPTO_PUBLICKEYBYTES 33 3 | #define CRYPTO_BYTES 65 4 | #define CRYPTO_VERSION "1.0" 5 | -------------------------------------------------------------------------------- /ref/include/comb.h: -------------------------------------------------------------------------------- 1 | #ifndef COMB_H 2 | #define COMB_H 3 | 4 | #include "curve.h" 5 | #include "scalar.h" 6 | 7 | #define COMB_TABLE_SIZE 16 8 | #define COMB_TEETH 5 9 | #define COMB_COUNT 4 10 | #define COMB_SEPARATION 13 11 | #define COMB_LOOKUP_MASK 0xf 12 | 13 | // A single comb table. 14 | typedef struct sabs_single_comb { 15 | extended_affine_pt_readd_narrow_t table[COMB_TABLE_SIZE]; 16 | } sabs_single_comb_t; 17 | 18 | // A single narrow comb table. Used in computing a narrow comb table. 19 | typedef struct sabs_single_comb_narrow { 20 | projective_pt_narrow_t table[COMB_TABLE_SIZE]; 21 | } sabs_single_comb_narrow_t; 22 | 23 | // A comb set. There is a precomputed comb set for the base point, but for 24 | // verifications of several signatures from the same key, it would be 25 | // advantageous to precompute a comb. 26 | typedef struct sabs_comb_set { 27 | sabs_single_comb_t combs[COMB_COUNT]; 28 | } sabs_comb_set_t; 29 | 30 | // An unreduced comb set. Used just to separate the logic of comb computation 31 | // from comb reduction. 32 | typedef struct sabs_comb_set_narrow { 33 | sabs_single_comb_narrow_t combs[COMB_COUNT]; 34 | } sabs_comb_set_narrow_t; 35 | 36 | // used for computing the entries in the comb table. 37 | typedef struct teeth_set { 38 | // We don't need the lowest tooth to compute the entries, because for signed 39 | // all bits set, to change the bit, you add or subtract a value of 2*bit. 40 | extended_pt_readd_narrow_t teeth[COMB_TEETH - 1]; 41 | } teeth_set_t; 42 | 43 | // The base comb used for fast signatures. 44 | sabs_comb_set_t base_comb; 45 | 46 | // Compute a comb set for a given point. 47 | void compute_comb_set( 48 | sabs_comb_set_t *result, const affine_pt_narrow_t *base_pt); 49 | 50 | // Helper function used to compute a comb set. 51 | void reduce_comb_set(sabs_comb_set_t *result, sabs_comb_set_narrow_t *source); 52 | 53 | // Constant time multiplication of a scalar times a point given the point's 54 | // comb. 55 | void scalar_comb_multiply( 56 | projective_pt_narrow_t *result, const sabs_comb_set_t * __restrict comb, 57 | const scalar_t * __restrict n); 58 | 59 | // Non-Constant time multiplication of a scalar times a point given the point's 60 | // comb. Can be safely used during signature verification because there are no 61 | // secrets during verification. 62 | void scalar_comb_multiply_unsafe( 63 | projective_pt_narrow_t *result, const sabs_comb_set_t * __restrict comb, 64 | const scalar_t * __restrict n); 65 | #endif 66 | -------------------------------------------------------------------------------- /ref/include/constant_time.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "f11_260.h" 3 | #include "curve.h" 4 | 5 | static inline void mask_copy_narrow( 6 | int32_t mask, residue_narrow_t *result, 7 | const residue_narrow_t *x) { 8 | 9 | #pragma clang loop unroll(full) 10 | for (int i = 0; i < NLIMBS; ++i) { 11 | result->limbs[i] |= x->limbs[i] & mask; 12 | } 13 | } 14 | 15 | void constant_time_extended_narrow_lookup( 16 | extended_pt_readd_narrow_t *result, int i, int n, 17 | const extended_pt_readd_narrow_t *table) { 18 | 19 | #pragma clang loop unroll(full) 20 | for (int i = 0; i < NLIMBS; ++i) { 21 | result->x.limbs[i] = 0; 22 | result->dt.limbs[i] = 0; 23 | result->y.limbs[i] = 0; 24 | result->z.limbs[i] = 0; 25 | } 26 | for (int j = 0; j < n; ++j) { 27 | int32_t mask = -(i == j); 28 | 29 | mask_copy_narrow(mask, &result->x, &table[j].x); 30 | mask_copy_narrow(mask, &result->dt, &table[j].dt); 31 | mask_copy_narrow(mask, &result->y, &table[j].y); 32 | mask_copy_narrow(mask, &result->z, &table[j].z); 33 | } 34 | } 35 | 36 | void constant_time_extended_affine_narrow_lookup( 37 | extended_affine_pt_readd_narrow_t *result, int i, int n, 38 | const extended_affine_pt_readd_narrow_t *table) { 39 | 40 | #pragma clang loop unroll(full) 41 | for (int i = 0; i < NLIMBS; ++i) { 42 | result->x.limbs[i] = 0; 43 | result->dt.limbs[i] = 0; 44 | result->y.limbs[i] = 0; 45 | } 46 | 47 | for (int j = 0; j < n; ++j) { 48 | int32_t mask = -(i == j); 49 | 50 | mask_copy_narrow(mask, &result->x, &table[j].x); 51 | mask_copy_narrow(mask, &result->dt, &table[j].dt); 52 | mask_copy_narrow(mask, &result->y, &table[j].y); 53 | } 54 | } 55 | 56 | void constant_time_cond_extended_negate( 57 | extended_pt_readd_narrow_t *x, int32_t mask) { 58 | #pragma clang loop unroll(full) 59 | for (int i = 0; i < NLIMBS; ++i) { 60 | x->x.limbs[i] = (x->x.limbs[i] & ~mask) | ((-x->x.limbs[i]) & mask); 61 | } 62 | #pragma clang loop unroll(full) 63 | for (int i = 0; i < NLIMBS; ++i) { 64 | x->dt.limbs[i] = (x->dt.limbs[i] & ~mask) | ((-x->dt.limbs[i]) & mask); 65 | } 66 | } 67 | 68 | void constant_time_cond_extended_affine_negate( 69 | extended_affine_pt_readd_narrow_t *x, int32_t mask) { 70 | for (int i = 0; i < NLIMBS; ++i) { 71 | x->x.limbs[i] = (x->x.limbs[i] & ~mask) | ((-x->x.limbs[i]) & mask); 72 | } 73 | for (int i = 0; i < NLIMBS; ++i) { 74 | x->dt.limbs[i] = (x->dt.limbs[i] & ~mask) | ((-x->dt.limbs[i]) & mask); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /ref/include/constant_time.h: -------------------------------------------------------------------------------- 1 | #ifndef CONSTANT_TIME_H 2 | #define CONSTANT_TIME_H 3 | #include 4 | #include "f11_260.h" 5 | #include "curve.h" 6 | 7 | inline void constant_time_extended_narrow_lookup( 8 | extended_pt_readd_narrow_t *result, int i, int n, 9 | const extended_pt_readd_narrow_t *table); 10 | 11 | inline void constant_time_extended_affine_narrow_lookup( 12 | extended_affine_pt_readd_narrow_t *result, int i, int n, 13 | const extended_affine_pt_readd_narrow_t *table); 14 | 15 | inline void constant_time_cond_extended_negate( 16 | extended_pt_readd_narrow_t *x, int32_t mask); 17 | 18 | inline void constant_time_cond_extended_affine_negate( 19 | extended_affine_pt_readd_narrow_t *x, int32_t mask); 20 | #endif 21 | -------------------------------------------------------------------------------- /ref/include/curve.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "f11_260.h" 3 | #include "scalar.h" 4 | #include "curve.h" 5 | #include "constant_time.h" 6 | 7 | __attribute__((__aligned__(64))) 8 | const affine_pt_narrow_t B = { 9 | .x = { 10 | .limbs = { 11 | 0x2862b8b, 0x0f08ed2, 0x06e65ee, 0x0c05991, 0x2b12b17, 12 | 0x0049432, 0x33a3707, 0x16e5186, 0x2947e71, 0x0ed9bab, 13 | 0, 14 | }, 15 | }, 16 | .y = { 17 | .limbs = { 18 | 0x4, 19 | }, 20 | }, 21 | }; 22 | 23 | void copy_projective_pt_narrow( 24 | projective_pt_narrow_t *result, const projective_pt_narrow_t *source) { 25 | 26 | for(int i = 0; i < NLIMBS; ++i) { 27 | result->x.limbs[i] = source->x.limbs[i]; 28 | result->y.limbs[i] = source->y.limbs[i]; 29 | result->z.limbs[i] = source->z.limbs[i]; 30 | } 31 | } 32 | 33 | void copy_extended_pt_narrow( 34 | extended_pt_narrow_t *result, 35 | const extended_pt_narrow_t *source) { 36 | 37 | for(int i = 0; i < NLIMBS; ++i) { 38 | result->x.limbs[i] = source->x.limbs[i]; 39 | result->y.limbs[i] = source->y.limbs[i]; 40 | result->t.limbs[i] = source->t.limbs[i]; 41 | result->z.limbs[i] = source->z.limbs[i]; 42 | } 43 | } 44 | 45 | void copy_extended_pt_readd_narrow( 46 | extended_pt_readd_narrow_t *result, 47 | const extended_pt_readd_narrow_t *source) { 48 | for(int i = 0; i < NLIMBS; ++i) { 49 | result->x.limbs[i] = source->x.limbs[i]; 50 | result->y.limbs[i] = source->y.limbs[i]; 51 | result->dt.limbs[i] = source->dt.limbs[i]; 52 | result->z.limbs[i] = source->z.limbs[i]; 53 | } 54 | } 55 | 56 | void copy_extended_affine_pt_readd_narrow( 57 | extended_affine_pt_readd_narrow_t *result, 58 | const extended_affine_pt_readd_narrow_t *source) { 59 | for(int i = 0; i < NLIMBS; ++i) { 60 | result->x.limbs[i] = source->x.limbs[i]; 61 | result->y.limbs[i] = source->y.limbs[i]; 62 | result->dt.limbs[i] = source->dt.limbs[i]; 63 | } 64 | } 65 | 66 | void negate_extended_pt_readd_narrow( 67 | extended_pt_readd_narrow_t *result, 68 | const extended_pt_readd_narrow_t *source) { 69 | for(int i = 0; i < NLIMBS; ++i) { 70 | result->x.limbs[i] = -source->x.limbs[i]; 71 | result->y.limbs[i] = source->y.limbs[i]; 72 | result->dt.limbs[i] = -source->dt.limbs[i]; 73 | result->z.limbs[i] = source->z.limbs[i]; 74 | } 75 | } 76 | 77 | void negate_extended_affine_pt_readd_narrow( 78 | extended_affine_pt_readd_narrow_t *result, 79 | const extended_affine_pt_readd_narrow_t *source) { 80 | for(int i = 0; i < NLIMBS; ++i) { 81 | result->x.limbs[i] = -source->x.limbs[i]; 82 | result->dt.limbs[i] = -source->dt.limbs[i]; 83 | result->y.limbs[i] = source->y.limbs[i]; 84 | } 85 | } 86 | 87 | void affine_narrow_to_extended( 88 | extended_pt_narrow_t *result, 89 | const affine_pt_narrow_t * __restrict x) { 90 | 91 | for(int i = 0; i < NLIMBS; ++i) { 92 | result->x.limbs[i] = x->x.limbs[i]; 93 | result->y.limbs[i] = x->y.limbs[i]; 94 | result->z.limbs[i] = 0; 95 | } 96 | result->z.limbs[0] = 1; 97 | mul_narrow(&result->t, &result->x, &result->y); 98 | } 99 | 100 | void extended_to_projective_narrow( 101 | projective_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x) { 102 | for(int i = 0; i < NLIMBS; ++i) { 103 | result->x.limbs[i] = x->x.limbs[i]; 104 | result->y.limbs[i] = x->y.limbs[i]; 105 | result->z.limbs[i] = x->z.limbs[i]; 106 | } 107 | } 108 | 109 | void affine_to_readd_narrow( 110 | extended_pt_readd_narrow_t *result, 111 | const affine_pt_narrow_t * __restrict x) { 112 | 113 | for(int i = 0; i < NLIMBS; ++i) { 114 | result->x.limbs[i] = x->x.limbs[i]; 115 | result->y.limbs[i] = x->y.limbs[i]; 116 | result->z.limbs[i] = 0; 117 | } 118 | result->z.limbs[0] = 1; 119 | 120 | residue_narrow_t xy; 121 | mul_narrow(&xy, &x->x, &x->y); 122 | mul_narrow_const(&result->dt, &xy, D); 123 | } 124 | 125 | void extended_to_readd_narrow_neg( 126 | extended_pt_readd_narrow_t *result, 127 | const extended_pt_narrow_t * __restrict x) { 128 | 129 | for(int i = 0; i < NLIMBS; ++i) { 130 | result->x.limbs[i] = -(x->x.limbs[i]); 131 | result->y.limbs[i] = x->y.limbs[i]; 132 | result->z.limbs[i] = x->z.limbs[i]; 133 | } 134 | mul_narrow_const(&result->dt, &x->t, -D); 135 | } 136 | 137 | void affine_double( 138 | projective_pt_narrow_t *result, 139 | const affine_pt_narrow_t * __restrict x) { 140 | 141 | residue_narrow_t x_plus_y; 142 | residue_narrow_t a, b, e, e_tmp, g, g_minus_2, h; 143 | square_narrow(&a, &x->x); 144 | square_narrow(&b, &x->y); 145 | 146 | add_narrow(&x_plus_y, &x->x, &x->y); 147 | 148 | square_narrow(&e, &x_plus_y); 149 | sub_narrow(&e_tmp, &e, &a); 150 | sub_narrow(&e, &e_tmp, &b); 151 | add_narrow(&g, &a, &b); 152 | 153 | for (int i = 0; i < NLIMBS; ++i) { 154 | g_minus_2.limbs[i] = g.limbs[i]; 155 | } 156 | g_minus_2.limbs[0] -= 2; 157 | 158 | sub_narrow(&h, &a, &b); 159 | mul_narrow(&result->x, &e, &g_minus_2); 160 | mul_narrow(&result->y, &g, &h); 161 | mul_narrow(&result->z, &g, &g_minus_2); 162 | } 163 | 164 | void affine_double_extended( 165 | extended_pt_narrow_t *result, const affine_pt_narrow_t * __restrict x) { 166 | 167 | residue_narrow_t x_plus_y; 168 | residue_narrow_t a, b, e, e_tmp, g, g_minus_2, h; 169 | square_narrow(&a, &x->x); 170 | square_narrow(&b, &x->y); 171 | 172 | add_narrow(&x_plus_y, &x->x, &x->y); 173 | square_narrow(&e, &x_plus_y); 174 | sub_narrow(&e_tmp, &e, &a); 175 | sub_narrow(&e, &e_tmp, &b); 176 | add_narrow(&g, &a, &b); 177 | 178 | for (int i = 0; i < NLIMBS; ++i) { 179 | g_minus_2.limbs[i] = g.limbs[i]; 180 | } 181 | g_minus_2.limbs[0] -= 2; 182 | 183 | sub_narrow(&h, &a, &b); 184 | mul_narrow(&result->x, &e, &g_minus_2); 185 | mul_narrow(&result->y, &g, &h); 186 | mul_narrow(&result->t, &e, &h); 187 | mul_narrow(&result->z, &g, &g_minus_2); 188 | } 189 | 190 | void projective_double( 191 | projective_pt_narrow_t *result, const projective_pt_narrow_t *x) { 192 | 193 | residue_narrow_t x_plus_y; 194 | residue_narrow_t a, b, c, c_temp, e, e_tmp, f, g, h; 195 | add_narrow(&x_plus_y, &x->x, &x->y); 196 | square_narrow(&a, &x->x); 197 | square_narrow(&b, &x->y); 198 | square_narrow(&c_temp, &x->z); 199 | double_narrow(&c, &c_temp); 200 | 201 | square_narrow(&e, &x_plus_y); 202 | sub_narrow(&e_tmp, &e, &a); 203 | sub_narrow(&e, &e_tmp, &b); 204 | add_narrow(&g, &a, &b); 205 | sub_narrow(&f, &g, &c); 206 | sub_narrow(&h, &a, &b); 207 | 208 | mul_narrow(&result->x, &e, &f); 209 | mul_narrow(&result->y, &g, &h); 210 | mul_narrow(&result->z, &f, &g); 211 | } 212 | 213 | void projective_double_extended( 214 | extended_pt_narrow_t *result, const projective_pt_narrow_t * __restrict x) { 215 | 216 | residue_narrow_t x_plus_y; 217 | residue_narrow_t a, b, c, c_temp, e, e_tmp, f, g, h; 218 | add_narrow(&x_plus_y, &x->x, &x->y); 219 | square_narrow(&a, &x->x); 220 | square_narrow(&b, &x->y); 221 | square_narrow(&c_temp, &x->z); 222 | double_narrow(&c, &c_temp); 223 | 224 | square_narrow(&e, &x_plus_y); 225 | sub_narrow(&e_tmp, &e, &a); 226 | sub_narrow(&e, &e_tmp, &b); 227 | add_narrow(&g, &a, &b); 228 | sub_narrow(&f, &g, &c); 229 | sub_narrow(&h, &a, &b); 230 | 231 | mul_narrow(&result->x, &e, &f); 232 | mul_narrow(&result->y, &g, &h); 233 | mul_narrow(&result->t, &e, &h); 234 | mul_narrow(&result->z, &f, &g); 235 | } 236 | 237 | void extended_double_extended( 238 | extended_pt_narrow_t *result, const extended_pt_narrow_t *x) { 239 | 240 | residue_narrow_t x_plus_y; 241 | residue_narrow_t a, b, c, c_temp, e, e_tmp, f, g, h; 242 | add_narrow(&x_plus_y, &x->x, &x->y); 243 | square_narrow(&a, &x->x); 244 | square_narrow(&b, &x->y); 245 | square_narrow(&c_temp, &x->z); 246 | double_narrow(&c, &c_temp); 247 | 248 | square_narrow(&e, &x_plus_y); 249 | sub_narrow(&e_tmp, &e, &a); 250 | sub_narrow(&e, &e_tmp, &b); 251 | add_narrow(&g, &a, &b); 252 | sub_narrow(&f, &g, &c); 253 | sub_narrow(&h, &a, &b); 254 | 255 | mul_narrow(&result->x, &e, &f); 256 | mul_narrow(&result->z, &f, &g); 257 | mul_narrow(&result->y, &g, &h); 258 | mul_narrow(&result->t, &e, &h); 259 | } 260 | 261 | void projective_add( 262 | projective_pt_narrow_t *result, const projective_pt_narrow_t * __restrict x1, 263 | const projective_pt_narrow_t * __restrict x2) { 264 | 265 | residue_narrow_t x1_plus_y1, x2_plus_y2; 266 | residue_narrow_t a, b, c, d, e, e_temp, f, g, t1, t2; 267 | 268 | mul_narrow(&a, &x1->z, &x2->z); 269 | square_narrow(&b, &a); 270 | mul_narrow(&c, &x1->x, &x2->x); 271 | mul_narrow(&d, &x1->y, &x2->y); 272 | mul_narrow_const(&e_temp, &c, D); 273 | mul_narrow(&e, &e_temp, &d); 274 | 275 | sub_narrow(&f, &b, &e); 276 | add_narrow(&g, &b, &e); 277 | add_narrow(&x1_plus_y1, &x1->x, &x1->y); 278 | add_narrow(&x2_plus_y2, &x2->x, &x2->y); 279 | 280 | mul_narrow(&t1, &x1_plus_y1, &x2_plus_y2); 281 | sub_narrow(&t2, &t1, &c); 282 | sub_narrow(&t1, &t2, &d); 283 | mul_narrow(&t2, &t1, &f); 284 | mul_narrow(&result->x, &t2, &a); 285 | 286 | sub_narrow(&t1, &d, &c); 287 | mul_narrow(&t2, &t1, &g); 288 | mul_narrow(&result->y, &t2, &a); 289 | 290 | mul_narrow(&result->z, &f, &g); 291 | } 292 | 293 | void extended_add( 294 | projective_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x1, 295 | const extended_pt_narrow_t * __restrict x2) { 296 | 297 | residue_narrow_t x1_plus_y1, x2_plus_y2; 298 | residue_narrow_t a, b, c, c_temp, d, e, e_temp, f, g, h; 299 | 300 | mul_narrow(&a, &x1->x, &x2->x); 301 | mul_narrow(&b, &x1->y, &x2->y); 302 | mul_narrow_const(&c_temp, &x1->t, D); 303 | mul_narrow(&c, &c_temp, &x2->t); 304 | mul_narrow(&d, &x1->z, &x2->z); 305 | 306 | add_narrow(&x1_plus_y1, &x1->x, &x1->y); 307 | add_narrow(&x2_plus_y2, &x2->x, &x2->y); 308 | mul_narrow(&e, &x1_plus_y1, &x2_plus_y2); 309 | sub_narrow(&e_temp, &e, &a); 310 | sub_narrow(&e, &e_temp, &b); 311 | sub_narrow(&f, &d, &c); 312 | add_narrow(&g, &d, &c); 313 | sub_narrow(&h, &b, &a); 314 | 315 | mul_narrow(&result->x, &e, &f); 316 | mul_narrow(&result->z, &f, &g); 317 | mul_narrow(&result->y, &g, &h); 318 | } 319 | 320 | void extended_add_extended( 321 | extended_pt_narrow_t *result, const extended_pt_narrow_t *x1, 322 | const extended_pt_narrow_t *x2) { 323 | 324 | residue_narrow_t x1_plus_y1, x2_plus_y2; 325 | residue_narrow_t a, b, c, c_temp, d, e, e_temp, f, g, h; 326 | 327 | mul_narrow(&a, &x1->x, &x2->x); 328 | mul_narrow(&b, &x1->y, &x2->y); 329 | mul_narrow_const(&c_temp, &x1->t, D); 330 | mul_narrow(&c, &c_temp, &x2->t); 331 | mul_narrow(&d, &x1->z, &x2->z); 332 | 333 | add_narrow(&x1_plus_y1, &x1->x, &x1->y); 334 | add_narrow(&x2_plus_y2, &x2->x, &x2->y); 335 | mul_narrow(&e, &x1_plus_y1, &x2_plus_y2); 336 | sub_narrow(&e_temp, &e, &a); 337 | sub_narrow(&e, &e_temp, &b); 338 | sub_narrow(&f, &d, &c); 339 | add_narrow(&g, &d, &c); 340 | sub_narrow(&h, &b, &a); 341 | 342 | mul_narrow(&result->x, &e, &f); 343 | mul_narrow(&result->z, &f, &g); 344 | mul_narrow(&result->y, &g, &h); 345 | mul_narrow(&result->t, &e, &h); 346 | } 347 | 348 | void extended_readd_narrow_extended( 349 | extended_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x1, 350 | const extended_pt_readd_narrow_t * __restrict x2) { 351 | 352 | residue_narrow_t x1_plus_y1; 353 | residue_narrow_t x2_plus_y2; 354 | residue_narrow_t a, b, c, d, e, e_temp, f, g, h; 355 | 356 | mul_narrow(&a, &x1->x, &x2->x); 357 | mul_narrow(&b, &x1->y, &x2->y); 358 | mul_narrow(&c, &x1->t, &x2->dt); 359 | mul_narrow(&d, &x1->z, &x2->z); 360 | 361 | add_narrow(&x1_plus_y1, &x1->x, &x1->y); 362 | add_narrow(&x2_plus_y2, &x2->x, &x2->y); 363 | mul_narrow(&e, &x1_plus_y1, &x2_plus_y2); 364 | sub_narrow(&e_temp, &e, &a); 365 | sub_narrow(&e, &e_temp, &b); 366 | sub_narrow(&f, &d, &c); 367 | add_narrow(&g, &d, &c); 368 | sub_narrow(&h, &b, &a); 369 | 370 | mul_narrow(&result->x, &e, &f); 371 | mul_narrow(&result->z, &f, &g); 372 | mul_narrow(&result->y, &g, &h); 373 | mul_narrow(&result->t, &e, &h); 374 | } 375 | 376 | void extended_readd_narrow( 377 | projective_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x1, 378 | const extended_pt_readd_narrow_t * __restrict x2) { 379 | 380 | residue_narrow_t x1_plus_y1; 381 | residue_narrow_t x2_plus_y2; 382 | residue_narrow_t a, b, c, d, e, e_temp, f, g, h; 383 | 384 | mul_narrow(&a, &x1->x, &x2->x); 385 | mul_narrow(&b, &x1->y, &x2->y); 386 | mul_narrow(&c, &x1->t, &x2->dt); 387 | mul_narrow(&d, &x1->z, &x2->z); 388 | 389 | add_narrow(&x1_plus_y1, &x1->x, &x1->y); 390 | add_narrow(&x2_plus_y2, &x2->x, &x2->y); 391 | mul_narrow(&e, &x1_plus_y1, &x2_plus_y2); 392 | sub_narrow(&e_temp, &e, &a); 393 | sub_narrow(&e, &e_temp, &b); 394 | sub_narrow(&f, &d, &c); 395 | add_narrow(&g, &d, &c); 396 | sub_narrow(&h, &b, &a); 397 | 398 | mul_narrow(&result->x, &e, &f); 399 | mul_narrow(&result->z, &f, &g); 400 | mul_narrow(&result->y, &g, &h); 401 | } 402 | 403 | #include 404 | // static void print_narrow(const residue_narrow_t *x, const char *prefix) { 405 | // printf("%s\n", prefix); 406 | // for (int i = 0; i < NLIMBS; ++i) { 407 | // printf("%#x\n", x->limbs[i]); 408 | // } 409 | // } 410 | 411 | void extended_readd_affine_narrow_extended( 412 | extended_pt_narrow_t *result, const extended_pt_narrow_t *x1, 413 | const extended_affine_pt_readd_narrow_t * __restrict x2) { 414 | 415 | residue_narrow_t x1_plus_y1; 416 | residue_narrow_t x2_plus_y2; 417 | residue_narrow_t a, b, c, e, e_temp, f, g, h; 418 | 419 | mul_narrow(&a, &x1->x, &x2->x); 420 | mul_narrow(&b, &x1->y, &x2->y); 421 | mul_narrow(&c, &x1->t, &x2->dt); 422 | 423 | add_narrow(&x1_plus_y1, &x1->x, &x1->y); 424 | add_narrow(&x2_plus_y2, &x2->x, &x2->y); 425 | mul_narrow(&e, &x1_plus_y1, &x2_plus_y2); 426 | sub_narrow(&e_temp, &e, &a); 427 | sub_narrow(&e, &e_temp, &b); 428 | sub_narrow(&f, &x1->z, &c); 429 | add_narrow(&g, &x1->z, &c); 430 | sub_narrow(&h, &b, &a); 431 | 432 | mul_narrow(&result->x, &e, &f); 433 | mul_narrow(&result->z, &f, &g); 434 | mul_narrow(&result->y, &g, &h); 435 | mul_narrow(&result->t, &e, &h); 436 | } 437 | 438 | void extended_readd_readd_narrow( 439 | extended_pt_readd_narrow_t *result, 440 | const extended_pt_narrow_t * __restrict x1, 441 | const extended_pt_readd_narrow_t * __restrict x2) { 442 | 443 | residue_narrow_t x1_plus_y1; 444 | residue_narrow_t x2_plus_y2; 445 | residue_narrow_t a, b, c, d, e, e_temp, f, g, h, t3; 446 | 447 | mul_narrow(&a, &x1->x, &x2->x); 448 | mul_narrow(&b, &x1->y, &x2->y); 449 | mul_narrow(&c, &x1->t, &x2->dt); 450 | mul_narrow(&d, &x1->z, &x2->z); 451 | 452 | add_narrow(&x1_plus_y1, &x1->x, &x1->y); 453 | add_narrow(&x2_plus_y2, &x2->x, &x2->y); 454 | mul_narrow(&e, &x1_plus_y1, &x2_plus_y2); 455 | sub_narrow(&e_temp, &e, &a); 456 | sub_narrow(&e, &e_temp, &b); 457 | sub_narrow(&f, &d, &c); 458 | add_narrow(&g, &d, &c); 459 | sub_narrow(&h, &b, &a); 460 | 461 | mul_narrow(&result->x, &e, &f); 462 | mul_narrow(&result->z, &f, &g); 463 | mul_narrow(&result->y, &g, &h); 464 | mul_narrow(&t3, &e, &h); 465 | 466 | mul_narrow_const(&result->dt, &t3, D); 467 | } 468 | 469 | void readd_to_projective( 470 | projective_pt_narrow_t *result, 471 | const extended_pt_readd_narrow_t * __restrict x) { 472 | 473 | copy_narrow(&result->x, &x->x); 474 | copy_narrow(&result->y, &x->y); 475 | copy_narrow(&result->z, &x->z); 476 | } 477 | 478 | void affine_readd_to_extended( 479 | extended_pt_narrow_t *result, 480 | const extended_affine_pt_readd_narrow_t * __restrict x) { 481 | 482 | copy_narrow(&result->x, &x->x); 483 | copy_narrow(&result->y, &x->y); 484 | mul_narrow(&result->t, &x->x, &x->y); 485 | for (int i = 0; i < NLIMBS; ++i) { 486 | result->z.limbs[i] = 0; 487 | } 488 | result->z.limbs[0] = 1; 489 | } 490 | 491 | void scalar_multiply( 492 | projective_pt_narrow_t *result, const affine_pt_narrow_t * __restrict x, 493 | const scalar_t * __restrict n) { 494 | 495 | scalar_t sabs_n; 496 | convert_to_sabs(&sabs_n, n); 497 | 498 | const int WINDOW_BITS = 5; 499 | const uint32_t WINDOW_MASK = (1 << WINDOW_BITS) - 1; 500 | const uint32_t LOOKUP_MASK = WINDOW_MASK >> 1; 501 | const int TABLE_SIZE = 16; 502 | extended_pt_readd_narrow_t table[TABLE_SIZE]; 503 | 504 | extended_pt_narrow_t x2; 505 | affine_double_extended(&x2, x); 506 | affine_to_readd_narrow(&table[0], x); 507 | for (int i = 1; i < TABLE_SIZE; ++i) { 508 | extended_readd_readd_narrow(&table[i], &x2, &table[i-1]); 509 | } 510 | 511 | int i; 512 | int first = 1; 513 | // Set i to the highest i such that 514 | // a) i < SCALAR_BITS 515 | // b) i % WINDOW_BITS = 0 516 | 517 | projective_pt_narrow_t temp; 518 | extended_pt_narrow_t temp_ext; 519 | extended_pt_readd_narrow_t window_pt; 520 | 521 | i = SCALAR_BITS - ((SCALAR_BITS - 1) % WINDOW_BITS) - 1; 522 | for (; i >= 0; i -= WINDOW_BITS) { 523 | uint32_t bits = sabs_n.limbs[i/SCALAR_LIMB_BITS] >> (i % SCALAR_LIMB_BITS); 524 | if (i % SCALAR_LIMB_BITS > (SCALAR_LIMB_BITS - WINDOW_BITS) && 525 | i / SCALAR_LIMB_BITS < SCALAR_LIMBS - 1) { 526 | 527 | bits |= sabs_n.limbs[i/SCALAR_LIMB_BITS + 1] << 528 | (SCALAR_LIMB_BITS - i % SCALAR_LIMB_BITS); 529 | } 530 | 531 | bits &= WINDOW_MASK; 532 | int32_t invert = (bits >> (WINDOW_BITS - 1)) - 1; 533 | bits ^= invert; 534 | 535 | constant_time_extended_narrow_lookup( 536 | &window_pt, bits & LOOKUP_MASK, TABLE_SIZE, table); 537 | constant_time_cond_extended_negate(&window_pt, invert); 538 | 539 | if (first) { 540 | readd_to_projective(&temp, &window_pt); 541 | first = 0; 542 | } else { 543 | for (int i = 0; i < WINDOW_BITS - 1; ++i) { 544 | projective_double(&temp, &temp); 545 | } 546 | projective_double_extended(&temp_ext, &temp); 547 | extended_readd_narrow(&temp, &temp_ext, &window_pt); 548 | } 549 | } 550 | 551 | copy_projective_pt_narrow(result, &temp); 552 | explicit_bzero(&sabs_n, sizeof(sabs_n)); 553 | explicit_bzero(&window_pt, sizeof(window_pt)); 554 | explicit_bzero(table, sizeof(table)); 555 | explicit_bzero(&temp, sizeof(temp)); 556 | explicit_bzero(&temp_ext, sizeof(temp_ext)); 557 | } 558 | 559 | void scalar_multiply_unsafe( 560 | projective_pt_narrow_t *result, const affine_pt_narrow_t * __restrict x, 561 | const scalar_t * __restrict n) { 562 | 563 | scalar_t sabs_n; 564 | convert_to_sabs(&sabs_n, n); 565 | 566 | const int WINDOW_BITS = 5; 567 | const uint32_t WINDOW_MASK = (1 << WINDOW_BITS) - 1; 568 | const uint32_t LOOKUP_MASK = WINDOW_MASK >> 1; 569 | const int TABLE_SIZE = 16; 570 | extended_pt_readd_narrow_t table[TABLE_SIZE]; 571 | 572 | extended_pt_narrow_t x2; 573 | affine_double_extended(&x2, x); 574 | affine_to_readd_narrow(&table[0], x); 575 | for (int i = 1; i < TABLE_SIZE; ++i) { 576 | extended_readd_readd_narrow(&table[i], &x2, &table[i-1]); 577 | } 578 | 579 | int i; 580 | int first = 1; 581 | // Set i to the highest i such that 582 | // a) i < SCALAR_BITS 583 | // b) i % WINDOW_BITS = 0 584 | 585 | projective_pt_narrow_t temp; 586 | extended_pt_narrow_t temp_ext; 587 | extended_pt_readd_narrow_t window_pt; 588 | 589 | i = SCALAR_BITS - ((SCALAR_BITS - 1) % WINDOW_BITS) - 1; 590 | for (; i >= 0; i -= WINDOW_BITS) { 591 | uint32_t bits = sabs_n.limbs[i/SCALAR_LIMB_BITS] >> (i % SCALAR_LIMB_BITS); 592 | if (i % SCALAR_LIMB_BITS > (SCALAR_LIMB_BITS - WINDOW_BITS) && 593 | i / SCALAR_LIMB_BITS < SCALAR_LIMBS - 1) { 594 | 595 | bits |= sabs_n.limbs[i/SCALAR_LIMB_BITS + 1] << 596 | (SCALAR_LIMB_BITS - i % SCALAR_LIMB_BITS); 597 | } 598 | 599 | bits &= WINDOW_MASK; 600 | int32_t invert = (bits >> (WINDOW_BITS - 1)) - 1; 601 | bits ^= invert; 602 | 603 | copy_extended_pt_readd_narrow(&window_pt, &table[bits & LOOKUP_MASK]); 604 | if (invert) { 605 | negate_extended_pt_readd_narrow(&window_pt, &window_pt); 606 | } 607 | 608 | if (first) { 609 | readd_to_projective(&temp, &window_pt); 610 | first = 0; 611 | } else { 612 | for (int i = 0; i < WINDOW_BITS - 1; ++i) { 613 | projective_double(&temp, &temp); 614 | } 615 | projective_double_extended(&temp_ext, &temp); 616 | extended_readd_narrow(&temp, &temp_ext, &window_pt); 617 | } 618 | } 619 | 620 | copy_projective_pt_narrow(result, &temp); 621 | } 622 | 623 | int point_decompress( 624 | affine_pt_narrow_t *result, 625 | residue_narrow_reduced_t *y, int low_bit) { 626 | 627 | residue_narrow_t y_n; 628 | 629 | residue_narrow_t u; 630 | residue_narrow_t v; 631 | 632 | residue_narrow_t y2; 633 | residue_narrow_reduced_t temp; 634 | 635 | unnarrow_reduce(&y_n, y); 636 | square_narrow(&y2, &y_n); 637 | copy_narrow(&result->y, &y_n); 638 | 639 | sub_narrow(&u, &one_narrow, &y2); 640 | mul_narrow_const(&y2, &y2, D); 641 | sub_narrow(&v, &one_narrow, &y2); 642 | 643 | if (sqrt_inv_narrow(&result->x, &u, &v)) { 644 | narrow_partial_complete(&temp, &result->x); 645 | 646 | int x_is_odd = is_odd(&temp); 647 | if ((x_is_odd && !low_bit) || (low_bit && !x_is_odd)) { 648 | negate_narrow(&result->x, &result->x); 649 | } 650 | 651 | return 1; 652 | } 653 | 654 | return 0; 655 | } 656 | -------------------------------------------------------------------------------- /ref/include/curve.h: -------------------------------------------------------------------------------- 1 | #ifndef CURVE_H 2 | #define CURVE_H 3 | #include "f11_260.h" 4 | #include "scalar.h" 5 | 6 | typedef struct affine_pt_narrow { 7 | residue_narrow_t x; 8 | residue_narrow_t y; 9 | } affine_pt_narrow_t; 10 | 11 | typedef struct extended_pt_readd_narrow { 12 | __attribute__((__aligned__(64))) 13 | residue_narrow_t x; 14 | residue_narrow_t dt; 15 | residue_narrow_t y; 16 | residue_narrow_t z; 17 | } extended_pt_readd_narrow_t; 18 | 19 | typedef struct extended_affine_pt_readd_narrow { 20 | __attribute__((__aligned__(64))) 21 | residue_narrow_t x; 22 | residue_narrow_t dt; 23 | residue_narrow_t y; 24 | } extended_affine_pt_readd_narrow_t; 25 | 26 | // For use in doubling. 27 | typedef struct projective_pt_narrow { 28 | residue_narrow_t x; 29 | residue_narrow_t y; 30 | residue_narrow_t z; 31 | } projective_pt_narrow_t; 32 | 33 | // For use in addition. 34 | typedef struct extended_pt_narrow { 35 | residue_narrow_t x; 36 | residue_narrow_t y; 37 | residue_narrow_t t; 38 | residue_narrow_t z; 39 | } extended_pt_narrow_t; 40 | 41 | #define D (-49142) 42 | 43 | __attribute__((__aligned__(32))) 44 | const affine_pt_narrow_t B; 45 | 46 | void copy_projective_pt_narrow( 47 | projective_pt_narrow_t *result, const projective_pt_narrow_t *source); 48 | 49 | void copy_extended_pt_narrow( 50 | extended_pt_narrow_t *result, const extended_pt_narrow_t *source); 51 | 52 | void copy_extended_pt_readd_narrow( 53 | extended_pt_readd_narrow_t *result, const extended_pt_readd_narrow_t *source); 54 | 55 | void copy_extended_pt_readd_narrow( 56 | extended_pt_readd_narrow_t *result, const extended_pt_readd_narrow_t *source); 57 | 58 | void copy_extended_affine_pt_readd_narrow( 59 | extended_affine_pt_readd_narrow_t *result, 60 | const extended_affine_pt_readd_narrow_t *source); 61 | 62 | void negate_extended_pt_readd_narrow( 63 | extended_pt_readd_narrow_t *result, 64 | const extended_pt_readd_narrow_t *source); 65 | 66 | void negate_extended_affine_pt_readd_narrow( 67 | extended_affine_pt_readd_narrow_t *result, 68 | const extended_affine_pt_readd_narrow_t *source); 69 | 70 | void affine_narrow_to_extended( 71 | extended_pt_narrow_t *result, 72 | const affine_pt_narrow_t * __restrict x); 73 | 74 | void affine_to_projective( 75 | projective_pt_narrow_t *result, 76 | const affine_pt_narrow_t * __restrict x); 77 | 78 | void affine_to_readd_narrow( 79 | extended_pt_readd_narrow_t *result, 80 | const affine_pt_narrow_t * __restrict x); 81 | 82 | void extended_to_readd_narrow_neg( 83 | extended_pt_readd_narrow_t *result, 84 | const extended_pt_narrow_t * __restrict x); 85 | 86 | void affine_to_readd_narrow( 87 | extended_pt_readd_narrow_t *result, 88 | const affine_pt_narrow_t * __restrict x); 89 | 90 | void projective_to_extended_narrow( 91 | extended_pt_narrow_t *result, projective_pt_narrow_t * __restrict x); 92 | 93 | void extended_to_projective_narrow( 94 | projective_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x); 95 | 96 | void readd_to_projective( 97 | projective_pt_narrow_t *result, 98 | const extended_pt_readd_narrow_t * __restrict x); 99 | 100 | void affine_readd_to_extended( 101 | extended_pt_narrow_t *result, 102 | const extended_affine_pt_readd_narrow_t * __restrict x); 103 | 104 | void negate_extended_affine_pt_readd_narrow( 105 | extended_affine_pt_readd_narrow_t *result, 106 | const extended_affine_pt_readd_narrow_t *source); 107 | 108 | void affine_double( 109 | projective_pt_narrow_t *result, 110 | const affine_pt_narrow_t * __restrict x); 111 | 112 | void affine_double_extended( 113 | extended_pt_narrow_t *result, const affine_pt_narrow_t * __restrict x); 114 | 115 | void projective_double( 116 | projective_pt_narrow_t *result, const projective_pt_narrow_t *x); 117 | 118 | void projective_double_extended( 119 | extended_pt_narrow_t *result, const projective_pt_narrow_t * __restrict x); 120 | 121 | void extended_double_extended( 122 | extended_pt_narrow_t *result, const extended_pt_narrow_t *x); 123 | 124 | void projective_add( 125 | projective_pt_narrow_t *result, const projective_pt_narrow_t * __restrict x1, 126 | const projective_pt_narrow_t * __restrict x2); 127 | 128 | void extended_add( 129 | projective_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x, 130 | const extended_pt_narrow_t * __restrict y); 131 | 132 | void extended_add_extended( 133 | extended_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x, 134 | const extended_pt_narrow_t * __restrict y); 135 | 136 | void extended_readd_narrow( 137 | projective_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x, 138 | const extended_pt_readd_narrow_t * __restrict y); 139 | 140 | void extended_readd_narrow_extended( 141 | extended_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x, 142 | const extended_pt_readd_narrow_t * __restrict y); 143 | 144 | void extended_readd_affine_narrow_extended( 145 | extended_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x, 146 | const extended_affine_pt_readd_narrow_t * __restrict y); 147 | 148 | void extended_add_extended( 149 | extended_pt_narrow_t *result, const extended_pt_narrow_t * __restrict x, 150 | const extended_pt_narrow_t * __restrict y); 151 | 152 | void extended_readd_readd_narrow( 153 | extended_pt_readd_narrow_t *result, 154 | const extended_pt_narrow_t * __restrict x, 155 | const extended_pt_readd_narrow_t * __restrict y); 156 | 157 | void extended_readd_narrow_extended( 158 | extended_pt_narrow_t *result, 159 | const extended_pt_narrow_t *x1, 160 | const extended_pt_readd_narrow_t * __restrict x2); 161 | 162 | void scalar_multiply( 163 | projective_pt_narrow_t *result, const affine_pt_narrow_t * __restrict x, 164 | const scalar_t * __restrict n); 165 | 166 | void scalar_multiply_unsafe( 167 | projective_pt_narrow_t *result, const affine_pt_narrow_t * __restrict x, 168 | const scalar_t * __restrict n); 169 | 170 | int point_decompress( 171 | affine_pt_narrow_t *result, residue_narrow_reduced_t *y, int low_bit); 172 | #endif 173 | -------------------------------------------------------------------------------- /ref/include/f11_260.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "f11_260.h" 3 | 4 | residue_narrow_t zero_narrow = {0}; 5 | residue_narrow_t one_narrow = { 6 | .limbs = {1}, 7 | }; 8 | 9 | // Shrink to 32 bits. Assumes reduction has already occurred, and wide storage 10 | // is being used for vector compatibility. 11 | void narrow(residue_narrow_t *result, const residue_wide_t * __restrict w) { 12 | for (int i = 0; i < NLIMBS; ++i) { 13 | result->limbs[i] = w->limbs[i]; 14 | } 15 | } 16 | 17 | // Reduce to 10 limbs. Useful for debugging 18 | void narrow_reduce( 19 | residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w) { 20 | residue_narrow_t temp; 21 | for (int i = 0; i < NLIMBS; ++i) { 22 | temp.limbs[i] = w->limbs[i] - w->limbs[10]; 23 | } 24 | 25 | reduce_step_narrow(&temp, &temp); 26 | 27 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 28 | result->limbs[i] = temp.limbs[i] - temp.limbs[10]; 29 | } 30 | } 31 | 32 | // Reduce to unique representative. 33 | // This is expensive. Only used for final signature or DH Key 34 | void narrow_complete( 35 | residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w) { 36 | 37 | residue_narrow_t temp; 38 | for (int i = 0; i < NLIMBS; ++i) { 39 | temp.limbs[i] = w->limbs[i] - w->limbs[10]; 40 | } 41 | 42 | // This may be combined with the final reduction from a multiply. 43 | reduce_step_narrow(&temp, &temp); 44 | 45 | int gt_mask = 0; 46 | int lt_mask = 0; 47 | int32_t limit[NLIMBS]; 48 | for (int i = 0; i < NLIMBS; ++i) { 49 | temp.limbs[i] = temp.limbs[i] - temp.limbs[10]; 50 | temp.limbs[i] += 1 & gt_mask; 51 | temp.limbs[i] -= 1 & lt_mask; 52 | gt_mask = -(temp.limbs[i] > T); 53 | lt_mask = -(temp.limbs[i] < 0); 54 | temp.limbs[i] -= (T & gt_mask); 55 | temp.limbs[i] += (T & lt_mask); 56 | } 57 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 58 | temp.limbs[i] -= temp.limbs[10]; 59 | limit[i] = T; 60 | } 61 | int64_t all_t = -1; 62 | for (int i = NLIMBS_REDUCED - 2; i >= 0; --i) { 63 | all_t &= -(temp.limbs[i+1] == T); 64 | limit[i] -= 1 & (~all_t); 65 | } 66 | gt_mask = 0; 67 | lt_mask = 0; 68 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 69 | temp.limbs[i] += 1 & gt_mask; 70 | temp.limbs[i] -= 1 & lt_mask; 71 | gt_mask = -(temp.limbs[i] > limit[i]); 72 | lt_mask = -(temp.limbs[i] < 0); 73 | temp.limbs[i] -= (T & gt_mask); 74 | temp.limbs[i] += (T & lt_mask); 75 | result->limbs[i] = temp.limbs[i]; 76 | } 77 | } 78 | 79 | // Reduce to mostly unique representative. 80 | // All coefficients are reduced to 0 <= xi <= t 81 | // Unique up to carries (xi == t) => (xi = 0; x[i+1] += 1); 82 | // This is sufficient to determine if x is even or odd. 83 | // Still pretty expensive. Used in point compression. 84 | void narrow_partial_complete( 85 | residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w) { 86 | 87 | residue_narrow_t temp; 88 | for (int i = 0; i < NLIMBS; ++i) { 89 | temp.limbs[i] = w->limbs[i] - w->limbs[10]; 90 | } 91 | 92 | // This may be combined with the final reduction from a multiply. 93 | reduce_step_narrow(&temp, &temp); 94 | 95 | int gt_mask = 0; 96 | int lt_mask = 0; 97 | for (int i = 0; i < NLIMBS - 1; ++i) { 98 | temp.limbs[i] = temp.limbs[i] - temp.limbs[10]; 99 | temp.limbs[i] += 1 & gt_mask; 100 | temp.limbs[i] -= 1 & lt_mask; 101 | gt_mask = -(temp.limbs[i] > T); 102 | lt_mask = -(temp.limbs[i] < 0); 103 | temp.limbs[i] -= (T & gt_mask); 104 | temp.limbs[i] += (T & lt_mask); 105 | } 106 | for (int i = 0; i < NLIMBS - 1; ++i) { 107 | temp.limbs[i] -= temp.limbs[10]; 108 | } 109 | gt_mask = 0; 110 | lt_mask = 0; 111 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 112 | temp.limbs[i] += 1 & gt_mask; 113 | temp.limbs[i] -= 1 & lt_mask; 114 | gt_mask = -(temp.limbs[i] > T); 115 | lt_mask = -(temp.limbs[i] < 0); 116 | temp.limbs[i] -= (T & gt_mask); 117 | temp.limbs[i] += (T & lt_mask); 118 | result->limbs[i] = temp.limbs[i]; 119 | } 120 | } 121 | 122 | int is_odd(residue_narrow_reduced_t *x) { 123 | int result = 0; 124 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 125 | result ^= x->limbs[i] & 0x1; 126 | } 127 | return result; 128 | } 129 | 130 | // Copy a 12x32-bit residue 131 | void copy_narrow( 132 | residue_narrow_t *result, const residue_narrow_t * __restrict x) { 133 | 134 | for (int i = 0; i < NLIMBS; ++i) { 135 | result->limbs[i] = x->limbs[i]; 136 | } 137 | } 138 | 139 | // Copy a 10x32-bit residue 140 | void copy_narrow_reduced( 141 | residue_narrow_reduced_t *result, 142 | const residue_narrow_reduced_t * __restrict x) { 143 | 144 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 145 | result->limbs[i] = x->limbs[i]; 146 | } 147 | } 148 | 149 | // Subtract 2 12x32-bit residues. 150 | void sub_narrow( 151 | residue_narrow_t *result, const residue_narrow_t * __restrict x, 152 | const residue_narrow_t * __restrict y) { 153 | 154 | for (int i = 0; i < NLIMBS; ++i) { 155 | result->limbs[i] = x->limbs[i] - y->limbs[i]; 156 | } 157 | } 158 | 159 | // negate a 12x32-bit residue. 160 | void negate_narrow( 161 | residue_narrow_t *result, const residue_narrow_t *x) { 162 | 163 | for (int i = 0; i < NLIMBS; ++i) { 164 | result->limbs[i] = -(x->limbs[i]); 165 | } 166 | } 167 | 168 | // Add 2 12x32-bit residues. 169 | void add_narrow( 170 | residue_narrow_t *result, const residue_narrow_t * __restrict x, 171 | const residue_narrow_t * __restrict y) { 172 | 173 | for (int i = 0; i < NLIMBS; ++i) { 174 | result->limbs[i] = x->limbs[i] + y->limbs[i]; 175 | } 176 | } 177 | 178 | // Scale a narrow residue by 2. 179 | void double_narrow( 180 | residue_narrow_t *result, const residue_narrow_t *x) { 181 | 182 | for (int i = 0; i < NLIMBS; ++i) { 183 | result->limbs[i] = x->limbs[i] << 1; 184 | } 185 | } 186 | 187 | // Scale a wide residue by 2. 188 | void double_wide( 189 | residue_wide_t *result, const residue_wide_t *x) { 190 | 191 | for (int i = 0; i < NLIMBS; ++i) { 192 | result->limbs[i] = x->limbs[i] << 1; 193 | } 194 | } 195 | 196 | #define wrap(x) (((x + NLIMBS) % NLIMBS)) 197 | // Multiply two wide residues, and produce a wide result. The result is reduced 198 | // to 32 bits, but not narrowed for performance reasons. 199 | void mul_wide( 200 | residue_wide_t *result, const residue_wide_t *x, const residue_wide_t *y) { 201 | 202 | residue_wide_t temp; 203 | for (int i = 0; i < NLIMBS; ++i) { 204 | temp.limbs[i] = 0; 205 | int i_2 = (i + (-(i & 1) & NLIMBS)) >> 1; 206 | for (int j = 1; j <= NLIMBS / 2; ++ j) { 207 | temp.limbs[i] += 208 | (x->limbs[wrap(i_2 + j)] - x->limbs[wrap(i_2 - j)]) * 209 | (y->limbs[wrap(i_2 - j)] - y->limbs[wrap(i_2 + j)]); 210 | } 211 | } 212 | reduce_step_wide(&temp, &temp); 213 | reduce_step_wide(result, &temp); 214 | } 215 | 216 | // Multiply a wide residues by a narrow and produce a wide result. The result is 217 | // reduced to 32 bits, but not narrowed for performance reasons. 218 | void mul_wide_narrow( 219 | residue_wide_t *result, const residue_wide_t *x, const residue_narrow_t *y) { 220 | 221 | residue_wide_t temp; 222 | for (int i = 0; i < NLIMBS; ++i) { 223 | temp.limbs[i] = 0; 224 | int i_2 = (i + (-(i & 1) & NLIMBS)) >> 1; 225 | for (int j = 1; j <= NLIMBS / 2; ++j) { 226 | temp.limbs[i] += 227 | (x->limbs[wrap(i_2 + j)] - x->limbs[wrap(i_2 - j)]) * 228 | ((int64_t) (y->limbs[wrap(i_2 - j)] - y->limbs[wrap(i_2 + j)])); 229 | } 230 | } 231 | reduce_step_wide(&temp, &temp); 232 | reduce_step_wide(result, &temp); 233 | } 234 | 235 | // Multiply two narrow residues and produce a narrow result. 236 | void mul_narrow( 237 | residue_narrow_t *result, const residue_narrow_t *x, 238 | const residue_narrow_t *y) { 239 | 240 | residue_wide_t temp; 241 | for (int i = 0; i < NLIMBS; ++i) { 242 | temp.limbs[i] = 0; 243 | int i_2 = (i + (-(i & 1) & NLIMBS)) >> 1; 244 | for (int j = 1; j <= NLIMBS / 2; ++ j) { 245 | temp.limbs[i] += 246 | ((int64_t) (x->limbs[wrap(i_2 + j)] - x->limbs[wrap(i_2 - j)])) * 247 | ((int64_t) (y->limbs[wrap(i_2 - j)] - y->limbs[wrap(i_2 + j)])); 248 | } 249 | } 250 | reduce_step_wide(&temp, &temp); 251 | reduce_step_wide(&temp, &temp); 252 | narrow(result, &temp); 253 | } 254 | 255 | // Multiply a narrow residue by a small constant. The result is reduced to 32 256 | // bits, but not narrowed for performance reasons. 257 | void mul_narrow_const( 258 | residue_narrow_t *result, const residue_narrow_t *x, int32_t d) { 259 | 260 | residue_wide_t temp; 261 | for (int i = 0; i < NLIMBS; ++i) { 262 | temp.limbs[i] = ((uint64_t) x->limbs[i]) * d; 263 | } 264 | reduce_step_wide(&temp, &temp); 265 | narrow(result, &temp); 266 | } 267 | 268 | 269 | // Square a narrow residue and produce a wide result. The result is reduced to 270 | // 32 bits but not narrowed for performance reasons. 271 | void square_narrow( 272 | residue_narrow_t *result, const residue_narrow_t *x) { 273 | 274 | residue_wide_t temp; 275 | for (int i = 0; i < NLIMBS; ++i) { 276 | temp.limbs[i] = 0; 277 | int i_2 = (i + (-(i & 1) & NLIMBS)) >> 1; 278 | for (int j = 1; j <= NLIMBS / 2; ++ j) { 279 | temp.limbs[i] -= 280 | ((int64_t) (x->limbs[wrap(i_2 + j)] - x->limbs[wrap(i_2 - j)])) * 281 | ((int64_t) (x->limbs[wrap(i_2 + j)] - x->limbs[wrap(i_2 - j)])); 282 | } 283 | } 284 | reduce_step_wide(&temp, &temp); 285 | reduce_step_wide(&temp, &temp); 286 | narrow(result, &temp); 287 | } 288 | 289 | // Approximately divide each coefficient by t. Carry the results. 290 | void reduce_step_narrow( 291 | residue_narrow_t *result, const residue_narrow_t *x) { 292 | 293 | int32_t carries[NLIMBS]; 294 | 295 | for (int i = 0; i < NLIMBS; ++i) { 296 | carries[i] = x->limbs[i] >> TBITS; 297 | result->limbs[i] = (x->limbs[i] & TMASK) + 298 | (carries[i] << T_CBITS) - carries[i]; 299 | } 300 | 301 | for (int i = 1; i < NLIMBS; ++i) { 302 | result->limbs[i] += carries[i - 1]; 303 | } 304 | result->limbs[0] += carries[NLIMBS - 1]; 305 | } 306 | 307 | // Approximately divide each coefficient by t. Carry the results. 308 | void reduce_step_wide( 309 | residue_wide_t *result, const residue_wide_t *x) { 310 | 311 | int64_t carries[NLIMBS]; 312 | 313 | for (int i = 0; i < NLIMBS; ++i) { 314 | carries[i] = x->limbs[i] >> TBITS; 315 | result->limbs[i] = (x->limbs[i] & TMASK) + 316 | (carries[i] << T_CBITS) - carries[i]; 317 | } 318 | 319 | for (int i = 1; i < NLIMBS; ++i) { 320 | result->limbs[i] += carries[i - 1]; 321 | } 322 | result->limbs[0] += carries[NLIMBS - 1]; 323 | } 324 | 325 | // Takes advantage of the fact that if a residue z *is zero* then after setting 326 | // one coefficient to T/2, all the remaining coefficients should be near to 327 | // T/2. They should therefore resolve all carries in a single step, and all be 328 | // equal to the same value. Some other value may not reduce completely, but this 329 | // is fine, we will know it is not zero. 330 | int equal_narrow(const residue_narrow_t *x, const residue_narrow_t *y) { 331 | residue_narrow_t temp; 332 | 333 | sub_narrow(&temp, x, y); 334 | int32_t delta = -temp.limbs[0] + (T / 2); 335 | for (int i = 0; i < NLIMBS; ++i) { 336 | temp.limbs[i] += delta; 337 | } 338 | 339 | reduce_step_narrow(&temp, &temp); 340 | 341 | delta = temp.limbs[0]; 342 | int result = 0; 343 | for (int i = 1; i < NLIMBS; ++i) { 344 | result |= (temp.limbs[i] ^ delta); 345 | } 346 | 347 | return !result; 348 | } 349 | 350 | int equal_narrow_reduced( 351 | const residue_narrow_reduced_t * x, const residue_narrow_reduced_t * y) { 352 | 353 | int result = 0; 354 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 355 | result |= (x->limbs[i] ^ y->limbs[i]); 356 | } 357 | 358 | return !result; 359 | } 360 | 361 | static inline void nsquare_narrow( 362 | residue_narrow_t *result, const residue_narrow_t *x, int n) { 363 | 364 | square_narrow(result, x); 365 | for (int i = 1; i < n; ++i) { 366 | square_narrow(result, result); 367 | } 368 | } 369 | 370 | static void raise_to_t( 371 | residue_narrow_t *result, const residue_narrow_t *x) { 372 | // zi = z^(2^i - 1), z1 = x 373 | residue_narrow_t z2; 374 | residue_narrow_t z3; 375 | residue_narrow_t z5; 376 | residue_narrow_t z10; 377 | residue_narrow_t z11; 378 | residue_narrow_t z22; 379 | residue_narrow_t result_t; 380 | 381 | square_narrow(&z2, x); 382 | mul_narrow(&z2, &z2, x); 383 | square_narrow(&z3, &z2); 384 | mul_narrow(&z3, &z3, x); 385 | nsquare_narrow(&z5, &z3, 2); 386 | mul_narrow(&z5, &z5, &z2); 387 | nsquare_narrow(&z10, &z5, 5); 388 | mul_narrow(&z10, &z10, &z5); 389 | square_narrow(&z11, &z10); 390 | mul_narrow(&z11, &z11, x); 391 | nsquare_narrow(&z22, &z11, 11); 392 | mul_narrow(&z22, &z22, &z11); 393 | nsquare_narrow(&result_t, &z22, 4); 394 | mul_narrow(result, &result_t, x); 395 | } 396 | 397 | static void raise_to_phi_t( 398 | residue_narrow_t *result, const residue_narrow_t *x, int n) { 399 | residue_narrow_t temp; 400 | 401 | raise_to_t(&temp, x); 402 | 403 | for (int i = 1; i < n; ++i) { 404 | mul_narrow(&temp, &temp, x); 405 | raise_to_t(&temp, &temp); 406 | } 407 | 408 | mul_narrow(result, &temp, x); 409 | } 410 | 411 | static void raise_to_t_minus_1_over_4( 412 | residue_narrow_t *result, const residue_narrow_t *x) { 413 | // zi = z^(2^i - 1), z1 = x 414 | residue_narrow_t z2; 415 | residue_narrow_t z3; 416 | residue_narrow_t z5; 417 | residue_narrow_t z10; 418 | residue_narrow_t z11; 419 | residue_narrow_t z22; 420 | 421 | square_narrow(&z2, x); 422 | mul_narrow(&z2, &z2, x); 423 | square_narrow(&z3, &z2); 424 | mul_narrow(&z3, &z3, x); 425 | nsquare_narrow(&z5, &z3, 2); 426 | mul_narrow(&z5, &z5, &z2); 427 | nsquare_narrow(&z10, &z5, 5); 428 | mul_narrow(&z10, &z10, &z5); 429 | square_narrow(&z11, &z10); 430 | mul_narrow(&z11, &z11, x); 431 | nsquare_narrow(&z22, &z11, 11); 432 | mul_narrow(&z22, &z22, &z11); 433 | nsquare_narrow(result, &z22, 2); 434 | } 435 | 436 | static void raise_to_p_minus_3_over_4( 437 | residue_narrow_t *result, const residue_narrow_t *x) { 438 | 439 | residue_narrow_t z4; //z to (t-1)/4 440 | residue_narrow_t z2; //z to (t-1)/2 441 | residue_narrow_t z3_4; //z to (3t+1)/4 442 | residue_narrow_t y_small; 443 | residue_narrow_t y, y_t4_y; 444 | residue_narrow_t raised; 445 | 446 | raise_to_t_minus_1_over_4(&z4, x); 447 | square_narrow(&z2, &z4); 448 | mul_narrow(&z3_4, &z2, &z4); 449 | mul_narrow(&z3_4, &z3_4, x); 450 | raise_to_t(&raised, &z4); 451 | mul_narrow(&y_small, &z2, &raised); 452 | raise_to_t(&raised, &y_small); 453 | mul_narrow(&y, &z3_4, &raised); 454 | raise_to_t(&raised, &y); 455 | raise_to_t(&raised, &raised); 456 | raise_to_t(&raised, &raised); 457 | raise_to_t(&raised, &raised); 458 | mul_narrow(&y_t4_y, &raised, &y); 459 | raise_to_t(&raised, &y_t4_y); 460 | raise_to_t(&raised, &raised); 461 | raise_to_t(&raised, &raised); 462 | mul_narrow(result, &raised, &y_small); 463 | } 464 | 465 | int sqrt_inv_narrow( 466 | residue_narrow_t *result, const residue_narrow_t * __restrict x, 467 | const residue_narrow_t * __restrict y) { 468 | residue_narrow_t xy; 469 | residue_narrow_t y2; 470 | residue_narrow_t xy3; 471 | residue_narrow_t xy3_p_3_over_4; 472 | residue_narrow_t cand2; 473 | residue_narrow_t should_be_x; 474 | 475 | square_narrow(&y2, y); 476 | mul_narrow(&xy, x, y); 477 | mul_narrow(&xy3, &xy, &y2); 478 | raise_to_p_minus_3_over_4(&xy3_p_3_over_4, &xy3); 479 | mul_narrow(result, &xy, &xy3_p_3_over_4); 480 | square_narrow(&cand2, result); 481 | mul_narrow(&should_be_x, y, &cand2); 482 | 483 | return equal_narrow(&should_be_x, x); 484 | } 485 | 486 | void invert_narrow( 487 | residue_narrow_t *result, const residue_narrow_t * __restrict x) { 488 | 489 | residue_narrow_t x_t_minus_1_over_4; 490 | residue_narrow_t x_t_minus_1; 491 | residue_narrow_t x_t; 492 | residue_narrow_t phi_8_x_t; 493 | residue_narrow_t phi_8_x_t_t; 494 | 495 | raise_to_t_minus_1_over_4(&x_t_minus_1_over_4, x); 496 | nsquare_narrow(&x_t_minus_1, &x_t_minus_1_over_4, 2); 497 | mul_narrow(&x_t, &x_t_minus_1, x); 498 | raise_to_phi_t(&phi_8_x_t, &x_t, 8); 499 | raise_to_t(&phi_8_x_t_t, &phi_8_x_t); 500 | mul_narrow(result, &phi_8_x_t_t, &x_t_minus_1); 501 | } 502 | 503 | void encode(uint8_t *out, const residue_narrow_reduced_t * __restrict x) { 504 | uint32_t collect = x->limbs[0]; 505 | 506 | int space = 32 - TBITS; 507 | int i = 1; 508 | int bits_remaining = TBITS * NLIMBS_REDUCED; 509 | while (bits_remaining > 0) { 510 | *out++ = collect & 0xff; 511 | collect >>= 8; 512 | space += 8; 513 | bits_remaining -= 8; 514 | if (space >= TBITS && i < NLIMBS_REDUCED) { 515 | collect |= x->limbs[i] << (32 - space); 516 | space -= TBITS; 517 | ++i; 518 | } 519 | } 520 | } 521 | 522 | void decode(residue_narrow_reduced_t *out, const uint8_t *in) { 523 | uint32_t collect = 0; 524 | 525 | int shift = 0; 526 | int i = 0; 527 | int bits_remaining = TBITS * NLIMBS_REDUCED; 528 | while (bits_remaining > 0) { 529 | collect |= (*in++) << shift; 530 | shift += 8; 531 | bits_remaining -= 8; 532 | if (shift >= TBITS) { 533 | if (bits_remaining > 0) { 534 | out->limbs[i] = collect & TMASK; 535 | collect >>= 26; 536 | shift -= 26; 537 | ++i; 538 | } else { 539 | out->limbs[i] = collect; 540 | } 541 | } 542 | } 543 | } 544 | -------------------------------------------------------------------------------- /ref/include/f11_260.h: -------------------------------------------------------------------------------- 1 | // Types and functions for manipulating field elements 2 | 3 | #ifndef F11_260_H 4 | #define F11_260_H 5 | #include 6 | 7 | #define NLIMBS_REDUCED 10 8 | #define NLIMBS 11 9 | #define T ((1 << 26) - 15) 10 | #define TBITS 26 11 | #define TMASK ((1 << 26) - 1) 12 | #define T_CBITS 4 13 | #define RESIDUE_LENGTH_BYTES 33 14 | 15 | // Reduced to 10 limbs. For final results. 16 | typedef struct residue_narrow_reduced { 17 | __attribute__((__aligned__(8))) 18 | int32_t limbs[NLIMBS_REDUCED]; 19 | } residue_narrow_reduced_t; 20 | 21 | // 11 limbs. 22 | typedef struct residue_narrow { 23 | __attribute__((__aligned__(64))) 24 | int32_t limbs[NLIMBS]; 25 | int32_t pad[16 - NLIMBS]; 26 | } residue_narrow_t; 27 | 28 | // 11 limbs. 29 | // compatibility. 30 | typedef struct residue_wide { 31 | __attribute__((__aligned__(64))) 32 | int64_t limbs[NLIMBS]; 33 | int64_t pad[16 - NLIMBS]; 34 | } residue_wide_t; 35 | 36 | residue_wide_t zero_wide; 37 | residue_wide_t one_wide; 38 | residue_narrow_t zero_narrow; 39 | residue_narrow_t one_narrow; 40 | 41 | // Shrink to 32 bits. Assumes reduction has already occurred, and wide storage 42 | // is being used for vector compatibility. 43 | void narrow(residue_narrow_t *result, const residue_wide_t * __restrict w); 44 | 45 | // Reduce to 10 limbs. Useful for debugging 46 | void narrow_reduce( 47 | residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w); 48 | 49 | // Reduce to unique representative. 50 | // This is expensive. Only used for final signature or DH Key 51 | void narrow_complete( 52 | residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w); 53 | 54 | // Reduce to mostly unique representative. 55 | // All coefficients are reduced to 0 <= xi <= t 56 | // Unique up to carries (xi == t) => (xi = 0; x[i+1] += 1); 57 | // This is sufficient to determine if x is even or odd. 58 | // Still pretty expensive. Used in point compression. 59 | void narrow_partial_complete( 60 | residue_narrow_reduced_t *result, const residue_narrow_t * __restrict w); 61 | 62 | int is_odd(residue_narrow_reduced_t *x); 63 | 64 | // Produce a 32-bit entry with 11 limbs 65 | static inline void unnarrow_reduce( 66 | residue_narrow_t *result, const residue_narrow_reduced_t * __restrict x) { 67 | 68 | result->limbs[10] = 0; 69 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 70 | result->limbs[i] = x->limbs[i]; 71 | } 72 | } 73 | 74 | // Produce a 64-bit residue 75 | void widen( 76 | residue_wide_t *result, const residue_narrow_t * __restrict x); 77 | 78 | // Copy a 64-bit residue 79 | void copy_wide( 80 | residue_wide_t *result, const residue_wide_t * __restrict x); 81 | 82 | // Copy a 32-bit residue 83 | void copy_narrow( 84 | residue_narrow_t *result, const residue_narrow_t * __restrict x); 85 | 86 | void copy_narrow_reduced( 87 | residue_narrow_reduced_t *result, 88 | const residue_narrow_reduced_t * __restrict x); 89 | 90 | // Subtract 2 11x32-bit residues. 91 | void sub_narrow( 92 | residue_narrow_t *result, const residue_narrow_t * __restrict x, 93 | const residue_narrow_t * __restrict y); 94 | 95 | void negate_wide(residue_wide_t *result, const residue_wide_t *x); 96 | 97 | void negate_narrow(residue_narrow_t *result, const residue_narrow_t *x); 98 | 99 | // Add 2 11x32-bit residues. 100 | void add_narrow( 101 | residue_narrow_t *result, const residue_narrow_t * __restrict x, 102 | const residue_narrow_t * __restrict y); 103 | 104 | // Add 2 11x64-bit residues. 105 | void add_wide( 106 | residue_wide_t *result, const residue_wide_t * __restrict x, 107 | const residue_wide_t * __restrict y); 108 | 109 | // Scale a wide residue by 2. 110 | void double_narrow( 111 | residue_narrow_t *result, const residue_narrow_t * __restrict x); 112 | 113 | // Multiply two narrow residues and produce a wide result. The result is reduced 114 | // to 32 bits. 115 | void mul_narrow( 116 | residue_narrow_t *result, const residue_narrow_t *x, 117 | const residue_narrow_t *y); 118 | 119 | // Multiply a residue by a constant. 120 | void mul_narrow_const( 121 | residue_narrow_t *result, const residue_narrow_t * __restrict x, int32_t d); 122 | 123 | // Square a narrow residue and produce a narrow result. The result is reduced to 124 | // 32 bits. 125 | void square_narrow( 126 | residue_narrow_t *result, const residue_narrow_t *x); 127 | 128 | // Approximately divide each coefficient by t. Carry the results. 129 | void reduce_step_narrow( 130 | residue_narrow_t *result, const residue_narrow_t *x); 131 | 132 | // Approximately divide each coefficient by t. Carry the results. 133 | void reduce_step_wide( 134 | residue_wide_t *result, const residue_wide_t *x); 135 | 136 | // Invert via fermat's theorem 137 | void invert_narrow( 138 | residue_narrow_t *result, const residue_narrow_t * __restrict x); 139 | 140 | // Compute combined inverse and square root 141 | // returns true if x/y was a quadratic residue, and false otherwise. 142 | int sqrt_inv_narrow( 143 | residue_narrow_t *result, const residue_narrow_t * __restrict x, 144 | const residue_narrow_t * __restrict y); 145 | 146 | // Returns true if x == y. Computes in constant time. 147 | int equal_narrow(const residue_narrow_t * x, const residue_narrow_t * y); 148 | 149 | int equal_narrow_reduced( 150 | const residue_narrow_reduced_t * x, const residue_narrow_reduced_t * y); 151 | 152 | void encode(uint8_t *out, const residue_narrow_reduced_t * __restrict x); 153 | void encode_compressed( 154 | uint8_t *out, const residue_narrow_reduced_t * __restrict x, int is_odd); 155 | 156 | void decode(residue_narrow_reduced_t *out, const uint8_t *in); 157 | #endif 158 | -------------------------------------------------------------------------------- /ref/include/gen.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "comb.h" 4 | #include "curve.h" 5 | #include "gen.h" 6 | #include "scalar.h" 7 | 8 | void gen_key(scalar_t * __restrict priv_key, 9 | affine_pt_narrow_t * __restrict pub_key) { 10 | scalar_hash_t large_key; 11 | char *large_key_ptr = (char *) &large_key; 12 | arc4random_buf(large_key_ptr, sizeof(large_key)); 13 | 14 | // It's just as random to use montgomery reduction as to correct for the 15 | // montgomery factor. 16 | mont_reduce_hash_mod_l(priv_key, &large_key); 17 | 18 | projective_pt_narrow_t result_pt; 19 | scalar_comb_multiply(&result_pt, &base_comb, priv_key); 20 | 21 | residue_narrow_t z_inv; 22 | 23 | invert_narrow(&z_inv, &result_pt.z); 24 | mul_narrow(&result_pt.x, &result_pt.x, &z_inv); 25 | mul_narrow(&result_pt.y, &result_pt.y, &z_inv); 26 | 27 | residue_narrow_t temp_narrow; 28 | copy_narrow(&pub_key->x, &result_pt.x); 29 | 30 | copy_narrow(&pub_key->y, &result_pt.y); 31 | 32 | // explicit_bzero(&large_key, sizeof(large_key)); 33 | explicit_bzero(&result_pt, sizeof(result_pt)); 34 | explicit_bzero(&z_inv, sizeof(z_inv)); 35 | explicit_bzero(&temp_narrow, sizeof(temp_narrow)); 36 | } 37 | 38 | void encode_pub_key(uint8_t *result, const affine_pt_narrow_t *pub_key) { 39 | residue_narrow_reduced_t y_reduced; 40 | residue_narrow_reduced_t x_reduced; 41 | narrow_complete(&y_reduced, &pub_key->y); 42 | narrow_partial_complete(&x_reduced, &pub_key->x); 43 | 44 | y_reduced.limbs[NLIMBS_REDUCED - 1] |= is_odd(&x_reduced) << TBITS; 45 | encode(result, &y_reduced); 46 | } 47 | 48 | int decode_pub_key(affine_pt_narrow_t *result, const uint8_t *encoded_key) { 49 | residue_narrow_reduced_t y_decoded; 50 | decode(&y_decoded, encoded_key); 51 | int is_odd = y_decoded.limbs[NLIMBS_REDUCED - 1] >> TBITS; 52 | y_decoded.limbs[NLIMBS_REDUCED - 1] &= TMASK; 53 | return point_decompress(result, &y_decoded, is_odd); 54 | } 55 | -------------------------------------------------------------------------------- /ref/include/gen.h: -------------------------------------------------------------------------------- 1 | #ifndef GEN_H 2 | #define GEN_H 3 | 4 | #include "scalar.h" 5 | #include "curve.h" 6 | 7 | void gen_key(scalar_t * __restrict priv_key, 8 | affine_pt_narrow_t * __restrict pub_key); 9 | void encode_pub_key(uint8_t *result, const affine_pt_narrow_t *pub_key); 10 | int decode_pub_key(affine_pt_narrow_t *result, const uint8_t *encoded_key); 11 | #endif 12 | -------------------------------------------------------------------------------- /ref/include/scalar.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "f11_260.h" 4 | #include "scalar.h" 5 | 6 | // Plenty of inspiration for this file was taken from Mike Hamburg's 7 | // Ed448 code. 8 | 9 | // Constants: 10 | __attribute__((__aligned__(32))) 11 | const scalar_t l_bits = { 12 | .limbs = {0x28ad9c41, 0xe6dcf7e8, 0x34b804af, 0x5af91169, 13 | 0x5cf68f2f, 0x125277f4, 0x9c1bf9f, 0xffff6b00, 0x3,}, 14 | }; 15 | 16 | __attribute__((__aligned__(32))) 17 | const scalar_t signed_bits_set_adjustment = { 18 | .limbs = {0x5d498efb, 0x648c205f, 0x2d1fed40, 0x941bba5b, 19 | 0x8c25c342, 0xb6b6202e, 0xd8f90183, 0x000253ff, 0x0,}, 20 | }; 21 | 22 | __attribute__((__aligned__(32))) 23 | const scalar_t SCALAR_MONT_R2 = { 24 | .limbs = {0x30ba45c7, 0xf3422093, 0x054bbbf6, 0x017ab264, 25 | 0x914ee18b, 0x250f1097, 0xf6bc1224, 0x5e97c70e, 0x2,}, 26 | }; 27 | 28 | const uint32_t SCALAR_MONT_N_PRIME = 0xb3138c3f; 29 | 30 | __attribute__((__aligned__(32))) 31 | const scalar_t SCALAR_MONT_R2_HASH = { 32 | .limbs = { 33 | 0x202dd8e7, 0xcb1bf7be, 0xd219daf6, 0xb85aba0a, 34 | 0xdc8da05f, 0xbd23bfce, 0xb7642c95, 0xbb13e4ad, 0x0,}, 35 | }; 36 | 37 | __attribute__((__aligned__(32))) 38 | const scalar_t SCALAR_MONT_R2_HASH_MUL = { 39 | .limbs = {0x8b9c7a13, 0x37bb3081, 0xe4f0c2b0, 0x99b4a8b2, 40 | 0xb4538c55, 0x34c9db2a, 0x2ade0e63, 0xa7cb6782, 0x1,}, 41 | }; 42 | 43 | void divide_by_2_mod_l( 44 | scalar_t *result, const scalar_t *x) { 45 | 46 | uint32_t mask = -(x->limbs[0] & 1); 47 | 48 | uint64_t chain = 0; 49 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 50 | chain = (chain + x->limbs[i]) + (mask & l_bits.limbs[i]); 51 | result->limbs[i] = chain; 52 | chain >>= SCALAR_LIMB_BITS; 53 | } 54 | 55 | int i; 56 | for (i = 0; i < SCALAR_LIMBS - 1; ++i) { 57 | result->limbs[i] = result->limbs[i] >> 1 | 58 | (result->limbs[i+1] << (SCALAR_LIMB_BITS - 1)); 59 | } 60 | result->limbs[i] >>= 1; 61 | } 62 | 63 | void add_mod_l( 64 | scalar_t *result, const scalar_t *x, 65 | const scalar_t * __restrict y) { 66 | 67 | uint64_t chain = 0; 68 | int i; 69 | for (i = 0; i < SCALAR_LIMBS; ++i) { 70 | chain = (chain + x->limbs[i]) + y->limbs[i]; 71 | result->limbs[i] = chain; 72 | chain >>= SCALAR_LIMB_BITS; 73 | } 74 | 75 | sub_mod_l(result, result, &l_bits); 76 | } 77 | 78 | void sub_mod_l( 79 | scalar_t *result, const scalar_t *x, 80 | const scalar_t *y) { 81 | sub_mod_l_accum(result, x->limbs, y); 82 | } 83 | 84 | // x is a pointer and not a scalar_t so that this function can be used to reduce 85 | // accumulators after multiplication. 86 | void sub_mod_l_accum( 87 | scalar_t *result, const uint32_t *x, 88 | const scalar_t *y) { 89 | 90 | int64_t chain = 0; 91 | int i; 92 | for (i = 0; i < SCALAR_LIMBS; ++i) { 93 | chain = (chain + x[i]) - y->limbs[i]; 94 | result->limbs[i] = chain; 95 | chain >>= SCALAR_LIMB_BITS; 96 | } 97 | 98 | //Should be 0 or -1 (to function as a mask) 99 | int32_t borrow = chain; 100 | 101 | chain = 0; 102 | for (i = 0; i < SCALAR_LIMBS; ++i) { 103 | chain = (chain + result->limbs[i]) + (l_bits.limbs[i] & borrow); 104 | result->limbs[i] = chain; 105 | chain >>= SCALAR_LIMB_BITS; 106 | } 107 | } 108 | 109 | void convert_to_sabs( 110 | scalar_t *result, const scalar_t *x) { 111 | add_mod_l(result, x, &signed_bits_set_adjustment); 112 | divide_by_2_mod_l(result, result); 113 | } 114 | 115 | void mont_reduce_hash_mod_l( 116 | scalar_t *result, const scalar_hash_t * __restrict x) { 117 | uint32_t accum[HASH_LIMBS]; 118 | 119 | for (int i = 0; i < HASH_LIMBS; ++i) { 120 | accum[i] = x->limbs[i]; 121 | } 122 | 123 | uint64_t chain = 0; 124 | for (int i = 0; i <= HASH_LIMBS - SCALAR_LIMBS; ++i) { 125 | uint32_t q = accum[0] * SCALAR_MONT_N_PRIME; 126 | for (int j = 0; j < SCALAR_LIMBS; ++j) { 127 | chain += accum[j] + ((uint64_t) q) * l_bits.limbs[j]; 128 | if (j > 0) { 129 | accum[j - 1] = chain; 130 | } 131 | chain >>= SCALAR_LIMB_BITS; 132 | } 133 | int j; 134 | for (j = SCALAR_LIMBS; j < HASH_LIMBS - i; ++j) { 135 | chain += accum[j]; 136 | accum[j - 1] = chain; 137 | chain >>= SCALAR_LIMB_BITS; 138 | } 139 | accum[j - 1] = chain; 140 | } 141 | 142 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 143 | result->limbs[i] = accum[i]; 144 | } 145 | explicit_bzero(accum, sizeof(accum)); 146 | } 147 | 148 | void reduce_hash_mod_l(scalar_t *result, const scalar_hash_t * __restrict x) { 149 | mont_reduce_hash_mod_l(result, x); 150 | mont_mult_mod_l(result, result, &SCALAR_MONT_R2_HASH); 151 | } 152 | 153 | void mont_mult_mod_l(scalar_t *result, const scalar_t *x, 154 | const scalar_t *y) { 155 | uint32_t accum[SCALAR_LIMBS + 1] = {0}; 156 | 157 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 158 | uint32_t x_limb = x->limbs[i]; 159 | 160 | uint64_t chain = 0; 161 | int j; 162 | for (j = 0; j < SCALAR_LIMBS; ++j) { 163 | chain += accum[j] + ((uint64_t) y->limbs[j]) * x_limb; 164 | accum[j] = chain; 165 | chain >>= SCALAR_LIMB_BITS; 166 | } 167 | 168 | // 2 bit value 169 | accum[j] = chain; 170 | 171 | uint32_t q = accum[0] * SCALAR_MONT_N_PRIME; 172 | chain = 0; 173 | for (int j = 0; j < SCALAR_LIMBS; ++j) { 174 | chain += accum[j] + ((uint64_t) l_bits.limbs[j]) * q; 175 | if (j > 0) { 176 | accum[j - 1] = chain; 177 | } 178 | chain >>= SCALAR_LIMB_BITS; 179 | } 180 | 181 | // chain is a 2-bit value with a possible carry. 182 | // result is a 3 bit value 183 | chain += accum[j]; 184 | accum[j - 1] = chain; 185 | } 186 | 187 | sub_mod_l_accum(result, accum, &l_bits); 188 | explicit_bzero(accum, sizeof(accum)); 189 | } 190 | 191 | void mult_mod_l(scalar_t *result, const scalar_t * __restrict x, 192 | const scalar_t * __restrict y) { 193 | scalar_t temp; 194 | mont_mult_mod_l(&temp, x, y); 195 | mont_mult_mod_l(result, &temp, &SCALAR_MONT_R2); 196 | explicit_bzero(&temp, sizeof(temp)); 197 | } 198 | -------------------------------------------------------------------------------- /ref/include/scalar.h: -------------------------------------------------------------------------------- 1 | #ifndef SCALAR_H 2 | #define SCALAR_H 3 | #include 4 | #include "f11_260.h" 5 | 6 | typedef struct scalar { 7 | uint32_t limbs[9]; 8 | } scalar_t; 9 | 10 | typedef struct scalar_hash { 11 | uint32_t limbs[16]; 12 | } scalar_hash_t; 13 | 14 | // const int SCALAR_LIMBS = 9; 15 | #define HASH_LIMBS 16 16 | #define SCALAR_LIMBS 9 17 | #define SCALAR_BITS 258 18 | #define SCALAR_BYTES 33 19 | #define SCALAR_LIMB_BITS 32 20 | #define SCALAR_LAST_LIMB_BITS 2 21 | #define SCALAR_LAST_LIMB_MASK 0x3 22 | 23 | // Constants 24 | // A scalar representing l, the order of the prime subgroup. 25 | const scalar_t l_bits; 26 | // For converting to SABS representation 27 | const scalar_t signed_bits_set_adjustment; 28 | // l * N' is congruent to -1 mod 2^32 29 | const uint32_t SCALAR_MONT_N_PRIME; 30 | // (2 ^ 32)^18 mod l. Used to convert to montgomery domain. 31 | // Or to fix the result of a single multiply via a 2nd multiply. 32 | const scalar_t SCALAR_MONT_R2; 33 | // (2 ^ 32)^17 mod l. 34 | // Used to fix the result of a hash reduction via a multiply 35 | // A hash is reduced from HASH_LIMBS to SCALAR_LIMBS via 36 | // HASH_LIMBS - SCALAR_LIMBS + 1 divisions by 2^32. So a hash reduction produces 37 | // h * (2^32)^-8 mod l. Montgomery multiplying by (2^32)^17 mod l produces h mod 38 | // l 39 | const scalar_t SCALAR_MONT_R2_HASH; 40 | // (2 ^ 32)^26 mod l. 41 | // Used to fix the result of a hash reduction followed by a multiply. 42 | // By similar logic we need to get rid of a factor of (2^32)^-17 43 | const scalar_t SCALAR_MONT_R2_HASH_MUL; 44 | 45 | // Functions for manipulating scalars. May need more for ECDSA. 46 | 47 | // This is used to convert to SABS representation. 48 | void divide_by_2_mod_l(scalar_t *result, const scalar_t * __restrict x); 49 | 50 | void add_mod_l(scalar_t *result, const scalar_t * __restrict x, 51 | const scalar_t * __restrict y); 52 | 53 | void sub_mod_l(scalar_t *result, const scalar_t * __restrict x, 54 | const scalar_t * __restrict y); 55 | 56 | void sub_mod_l_accum(scalar_t *result, const uint32_t * __restrict x, 57 | const scalar_t * __restrict y); 58 | 59 | void mont_mult_mod_l(scalar_t *result, const scalar_t * __restrict x, 60 | const scalar_t * __restrict y); 61 | 62 | void mult_mod_l(scalar_t *result, const scalar_t * __restrict x, 63 | const scalar_t * __restrict y); 64 | 65 | void mont_reduce_hash_mod_l( 66 | scalar_t *result, const scalar_hash_t * __restrict x); 67 | void reduce_hash_mod_l(scalar_t *result, const scalar_hash_t * __restrict x); 68 | 69 | void convert_to_sabs(scalar_t *result, const scalar_t * __restrict x); 70 | #endif 71 | -------------------------------------------------------------------------------- /ref/include/sign.h: -------------------------------------------------------------------------------- 1 | #ifndef SIGN_H 2 | #define SIGN_H 3 | #include "curve.h" 4 | #include "scalar.h" 5 | 6 | #define SIG_LENGTH 65 7 | 8 | typedef struct signature { 9 | residue_narrow_reduced_t y; 10 | scalar_t s; 11 | } signature_t; 12 | 13 | void sign(signature_t *result, scalar_t *priv_key, 14 | const uint8_t *pub_key, const uint8_t *msg, size_t msg_len); 15 | 16 | int verify( 17 | const signature_t *sig, const uint8_t *r_bytes, const uint8_t *pub_key_bytes, 18 | const affine_pt_narrow_t *pub_key_pt, const uint8_t *msg, 19 | size_t msg_len); 20 | 21 | void encode_sig(uint8_t *result, const signature_t *sig); 22 | void decode_sig(signature_t *result, const uint8_t *encoded_sig); 23 | #endif 24 | -------------------------------------------------------------------------------- /ref/src/api.c.supercop_only: -------------------------------------------------------------------------------- 1 | #define _DEFAULT_SOURCE 2 | #include 3 | #include "crypto_sign.h" 4 | #include "curve.h" 5 | #include "gen.h" 6 | #include "scalar.h" 7 | #include "sign.h" 8 | 9 | int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) { 10 | affine_pt_narrow_t pub_key_pt; 11 | scalar_t priv_key; 12 | gen_key(&priv_key, &pub_key_pt); 13 | encode_pub_key(pk, &pub_key_pt); 14 | memcpy(sk, &priv_key, SCALAR_BYTES); 15 | memcpy(sk + SCALAR_BYTES, pk, RESIDUE_LENGTH_BYTES); 16 | explicit_bzero(&priv_key, sizeof(priv_key)); 17 | return 0; 18 | } 19 | 20 | int crypto_sign( 21 | unsigned char *sm,unsigned long long *smlen, 22 | const unsigned char *m,unsigned long long mlen, 23 | const unsigned char *sk) { 24 | signature_t sig_struct; 25 | scalar_t priv_key; 26 | priv_key.limbs[SCALAR_LIMBS - 1] = 0; 27 | memcpy(&priv_key, sk, SCALAR_BYTES); 28 | sign(&sig_struct, &priv_key, sk + SCALAR_BYTES, m, mlen); 29 | 30 | *smlen = mlen + SIG_LENGTH; 31 | encode_sig(sm, &sig_struct); 32 | memcpy(sm + SIG_LENGTH, m, mlen); 33 | return 0; 34 | } 35 | 36 | int crypto_sign_open( 37 | unsigned char *m,unsigned long long *mlen, 38 | const unsigned char *sm,unsigned long long smlen, 39 | const unsigned char *pk) { 40 | signature_t sig_struct; 41 | decode_sig(&sig_struct, sm); 42 | affine_pt_narrow_t pub_key_pt; 43 | if (!decode_pub_key(&pub_key_pt, pk)) { 44 | return -1; 45 | } 46 | 47 | uint8_t y_buf[RESIDUE_LENGTH_BYTES]; 48 | encode(y_buf, &sig_struct.y); 49 | 50 | if (!verify(&sig_struct, y_buf, pk, &pub_key_pt, sm + SIG_LENGTH, 51 | smlen - SIG_LENGTH)) { 52 | return -2; 53 | } 54 | *mlen = smlen - SIG_LENGTH; 55 | memcpy(m, sm + SIG_LENGTH, smlen - SIG_LENGTH); 56 | return 0; 57 | } 58 | -------------------------------------------------------------------------------- /ref/src/main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "comb.h" 7 | #include "curve.h" 8 | #include "f11_260.h" 9 | #include "gen.h" 10 | #include "scalar.h" 11 | #include "sign.h" 12 | 13 | #include 14 | 15 | int main(int _argc, char **argv) { 16 | residue_narrow_t x = { 17 | .limbs = { 18 | 0x3553e74, 0x0464e4c, 0x61de408, 0x006a30e, 19 | 0x6e9b25b, 0x3e6f39e, 0x19ec754, 0x5c71cc3, 20 | 0x2bc1c0e, 0x554338e, 0x14e8b6e, 21 | }, 22 | }; 23 | 24 | residue_narrow_t two = { 25 | .limbs = {0x2}, 26 | }; 27 | 28 | residue_narrow_t x_plus_two; 29 | 30 | residue_narrow_reduced_t x_narrow_reduced = { 31 | .limbs = { 32 | 0x206b305, 0x2f7c2ce, 0x0cf58a7, 0x2b81791, 0x19b26fa, 33 | 0x2986830, 0x0503be5, 0x0789163, 0x16d90a0, 0x005a82e, 34 | }, 35 | }; 36 | 37 | residue_narrow_t y = { 38 | .limbs = { 39 | 0x5f5b0e1, 0x4668277, 0x0f7d85a, 0x4515e42, 40 | 0x00cb559, 0x3f8a910, 0x6655708, 0x3085b4d, 41 | 0x581ceff, 0x3324c03, 0x56ed38e, 42 | }, 43 | }; 44 | 45 | residue_narrow_reduced_t y_narrow_reduced = { 46 | .limbs = { 47 | 0x086dd54, 0x2f7aedb, 0x38904ae, 0x2e28aa4, 0x29de1ad, 48 | 0x289d572, 0x0f6837a, 0x19987b1, 0x012fb71, 0x1c37867, 49 | }, 50 | }; 51 | 52 | residue_wide_t mul_expected = { 53 | .limbs = { 54 | 0x1c508c4, 0x3eeb85d, 0x04bc914, 0x0a57e1c, 55 | 0x1f13f9a, 0x2d8aa7d, 0x232cce3, 0x31e92c4, 56 | 0x04fb073, 0x2582507, 0x06e9e1d, 57 | }, 58 | }; 59 | 60 | residue_wide_t square_expected = { 61 | .limbs = { 62 | 0x2073353, 0x18e5de4, 0x320a4ab, 0x3ee123a, 63 | 0x2d88419, 0x3d1ae13, 0x02b3dcf, 0x2997027, 64 | 0x3d550a2, 0x220a052, 0x3088d3c, 65 | }, 66 | }; 67 | 68 | residue_narrow_t negative_one_redundant = { 69 | .limbs = { 70 | 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 71 | 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 72 | 0x3ffffff, 0x3ffffff, 0x000000e, 73 | }, 74 | }; 75 | 76 | residue_narrow_t negative_t2_plus_one = { 77 | .limbs = { 78 | 0x3ffffff, 0x000000e, 0x3ffffff, 0x3ffffff, 79 | 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 80 | 0x3ffffff, 0x3ffffff, 0x000000e, 81 | }, 82 | }; 83 | 84 | residue_narrow_reduced_t negative_t2_plus_one_partial = { 85 | .limbs = { 86 | 0x3fffff1, 0x0000000, 0x3fffff1, 0x3fffff1, 87 | 0x3fffff1, 0x3fffff1, 0x3fffff1, 0x3fffff1, 88 | 0x3fffff1, 0x3fffff1, 89 | }, 90 | }; 91 | 92 | residue_narrow_reduced_t negative_t2_plus_one_complete = { 93 | .limbs = { 94 | 0x0000000, 0x0000001, 0x3fffff1, 0x3fffff1, 95 | 0x3fffff1, 0x3fffff1, 0x3fffff1, 0x3fffff1, 96 | 0x3fffff1, 0x3fffff1, 97 | }, 98 | }; 99 | 100 | residue_narrow_t sqrt_x_plus_2_over_y = { 101 | .limbs = { 102 | 0x3fa8549, 0x0706e5c, 0x3b33dc9, 0x3401712, 103 | 0x3a58fb3, 0x076ec4f, 0x3347ad0, 0x16ca1b0, 104 | 0x26ed559, 0x06033f0, 0x040bbb0, 105 | }, 106 | }; 107 | 108 | residue_narrow_t x_inverse = { 109 | .limbs = { 110 | 0x17a9f53, 0x22e2983, 0x0f09456, 0x11fb41e, 111 | 0x1e47b3f, 0x37dd25f, 0x3bc6938, 0x2b654cd, 112 | 0x233a0b2, 0x3f8c25b, 0x09fd09b, 113 | }, 114 | }; 115 | 116 | #if 1 117 | residue_narrow_t result; 118 | residue_narrow_reduced_t result_narrow_reduced; 119 | 120 | mul_narrow(&result, &x, &y); 121 | for (int i = 0; i < NLIMBS; ++i) { 122 | assert(mul_expected.limbs[i] == result.limbs[i]); 123 | } 124 | 125 | square_narrow(&result, &x); 126 | for (int i = 0; i < NLIMBS; ++i) { 127 | assert(square_expected.limbs[i] == result.limbs[i]); 128 | } 129 | 130 | // The reduction function doesn't reduce this redundant version of negative 131 | // one any more. 132 | reduce_step_narrow(&result, &negative_one_redundant); 133 | for (int i = 0; i < NLIMBS; ++i) { 134 | assert(negative_one_redundant.limbs[i] == result.limbs[i]); 135 | } 136 | 137 | reduce_step_narrow(&result, &negative_t2_plus_one); 138 | for (int i = 0; i < NLIMBS; ++i) { 139 | assert(negative_t2_plus_one.limbs[i] == result.limbs[i]); 140 | } 141 | 142 | narrow_partial_complete(&result_narrow_reduced, &negative_t2_plus_one); 143 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 144 | assert(negative_t2_plus_one_partial.limbs[i] == 145 | result_narrow_reduced.limbs[i]); 146 | } 147 | 148 | narrow_complete(&result_narrow_reduced, &negative_t2_plus_one); 149 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 150 | assert(negative_t2_plus_one_complete.limbs[i] == 151 | result_narrow_reduced.limbs[i]); 152 | } 153 | 154 | scalar_t scalar_result; 155 | scalar_t scalar_x = { 156 | .limbs = { 157 | 0xa46168f9, 0x4cbf07a5, 0x62cf2928, 0xfd04242b, 0x3b12d23f, 158 | 0x355e9e63, 0xc22e849e, 0x6331c34a, 0x1, 159 | }, 160 | }; 161 | scalar_t scalar_y = { 162 | .limbs = { 163 | 0x148b9452, 0xaca9b6bb, 0xe0eeb33d, 0x7e64c899, 0xd61c602a, 164 | 0x96dcbb6b, 0x6a037c88, 0x39fbbaf0, 0x0, 165 | }, 166 | }; 167 | scalar_t scalar_x_plus_y = { 168 | .limbs = { 169 | 0xb8ecfd4b, 0xf968be60, 0x43bddc65, 0x7b68ecc5, 0x112f326a, 170 | 0xcc3b59cf, 0x2c320126, 0x9d2d7e3b, 0x1, 171 | }, 172 | }; 173 | scalar_t scalar_x_plus_x = { 174 | .limbs = { 175 | 0x48c2d1f2, 0x997e0f4b, 0xc59e5250, 0xfa084856, 0x7625a47f, 176 | 0x6abd3cc6, 0x845d093c, 0xc6638695, 0x2, 177 | }, 178 | }; 179 | scalar_t scalar_x_plus_x_plus_x_plus_y = { 180 | .limbs = { 181 | 0xd90232fc, 0xac09d5c3, 0xd4a42a06, 0x1a7823b2, 0x2a5e47bb, 182 | 0x24a61ea1, 0xa6cd4ac4, 0x639199d0, 0x0, 183 | }, 184 | }; 185 | scalar_t scalar_x_minus_y = { 186 | .limbs = { 187 | 0x8fd5d4a7, 0xa01550ea, 0x81e075ea, 0x7e9f5b91, 0x64f67215, 188 | 0x9e81e2f7, 0x582b0815, 0x2936085a, 0x1, 189 | }, 190 | }; 191 | scalar_t scalar_y_minus_x = { 192 | .limbs = { 193 | 0x98d7c79a, 0x46c7a6fd, 0xb2d78ec5, 0xdc59b5d7, 0xf8001d19, 194 | 0x73d094fc, 0xb196b789, 0xd6c962a5, 0x2, 195 | }, 196 | }; 197 | scalar_t scalar_x_times_y = { 198 | .limbs = { 199 | 0x30b3d35a, 0x9ca90acf, 0x6926efdd, 0x80620b0a, 0x52e190e7, 200 | 0x8011b9b8, 0x8c7d8f43, 0x90491703, 0x3, 201 | }, 202 | }; 203 | scalar_t scalar_x_sabs = { 204 | .limbs = { 205 | 0x80d57bfa, 0x58a59402, 0x47f78b34, 0x488fef43, 0xe39c4ac1, 206 | 0xf60a5f48, 0x4d93c310, 0xb19a0ba5, 0x0, 207 | }, 208 | }; 209 | scalar_t scalar_y_sabs = { 210 | .limbs = { 211 | 0x4d415fc7, 0xfc096781, 0x21635296, 0x36bcca2f, 0x5f9c594e, 212 | 0xaff2a9c7, 0x265f1ed5, 0x1cfebcf8, 0x2, 213 | }, 214 | }; 215 | scalar_hash_t scalar_hash_val = { 216 | .limbs = { 217 | 0xcbbc3de7, 0xa212405d, 0x5c85f47c, 0x79aa991c, 218 | 0xfe310944, 0x54075530, 0xd5ef6878, 0x72e57186, 219 | 0x36dcac18, 0xb72461e2, 0x5405caca, 0x4e9e0bff, 220 | 0x8d67a990, 0xf62f262c, 0x6df205dd, 0x24d78573, 221 | }, 222 | }; 223 | scalar_t reduced_hash_val = { 224 | .limbs = { 225 | 0xef1d4f9d, 0xd832a3a5, 0xdf1682be, 0x8d257e79, 0x41b1f2ca, 226 | 0x5be9564c, 0x320d4cb6, 0x108f8d04, 0x3, 227 | }, 228 | }; 229 | 230 | uint8_t buffer[33]; 231 | uint8_t encode_x[33] = { 232 | 0x05, 0xb3, 0x06, 0x3a, 0x0b, 0xdf, 233 | 0x7b, 0x8a, 0xf5, 0x4c, 0xe4, 0x05, 0xae, 234 | 0xfa, 0x26, 0x9b, 0xc1, 0xa0, 0x61, 235 | 0x5a, 0xbe, 0x03, 0xc5, 0x58, 0x24, 0x1e, 236 | 0xa0, 0x90, 0x6d, 0xb9, 0xa0, 0x16, 0x00, 237 | }; 238 | uint8_t encode_y[33] = { 239 | 0x54, 0xdd, 0x86, 0x6c, 0xbb, 0xde, 240 | 0xeb, 0x4a, 0x90, 0x38, 0xa9, 0xa2, 0xb8, 241 | 0xad, 0xe1, 0x9d, 0xca, 0x55, 0x27, 242 | 0xaa, 0x37, 0x68, 0x4f, 0xec, 0x61, 0x66, 243 | 0x71, 0xfb, 0x12, 0x9c, 0xe1, 0x0d, 0x07, 244 | }; 245 | 246 | add_mod_l(&scalar_result, &scalar_x, &scalar_y); 247 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 248 | assert(scalar_x_plus_y.limbs[i] == scalar_result.limbs[i]); 249 | } 250 | add_mod_l(&scalar_result, &scalar_x, &scalar_x); 251 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 252 | assert(scalar_x_plus_x.limbs[i] == scalar_result.limbs[i]); 253 | } 254 | add_mod_l(&scalar_result, &scalar_x_plus_x, &scalar_x_plus_y); 255 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 256 | assert(scalar_x_plus_x_plus_x_plus_y.limbs[i] == scalar_result.limbs[i]); 257 | } 258 | sub_mod_l(&scalar_result, &scalar_x, &scalar_y); 259 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 260 | assert(scalar_x_minus_y.limbs[i] == scalar_result.limbs[i]); 261 | } 262 | 263 | sub_mod_l(&scalar_result, &scalar_y, &scalar_x); 264 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 265 | assert(scalar_y_minus_x.limbs[i] == scalar_result.limbs[i]); 266 | } 267 | 268 | mult_mod_l(&scalar_result, &scalar_x, &scalar_y); 269 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 270 | assert(scalar_x_times_y.limbs[i] == scalar_result.limbs[i]); 271 | } 272 | 273 | convert_to_sabs(&scalar_result, &scalar_x); 274 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 275 | assert(scalar_x_sabs.limbs[i] == scalar_result.limbs[i]); 276 | } 277 | 278 | convert_to_sabs(&scalar_result, &scalar_y); 279 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 280 | assert(scalar_y_sabs.limbs[i] == scalar_result.limbs[i]); 281 | } 282 | 283 | encode(buffer, &x_narrow_reduced); 284 | for (int i = 0; i < 33; ++i) { 285 | assert(encode_x[i] == buffer[i]); 286 | } 287 | 288 | encode(buffer, &y_narrow_reduced); 289 | for (int i = 0; i < 33; ++i) { 290 | assert(encode_y[i] == buffer[i]); 291 | } 292 | 293 | decode(&result_narrow_reduced, encode_x); 294 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 295 | assert(x_narrow_reduced.limbs[i] == result_narrow_reduced.limbs[i]); 296 | } 297 | 298 | decode(&result_narrow_reduced, encode_y); 299 | for (int i = 0; i < NLIMBS_REDUCED; ++i) { 300 | assert(y_narrow_reduced.limbs[i] == result_narrow_reduced.limbs[i]); 301 | } 302 | 303 | //x/y is not a quadratic residue, but (x+2)/y is. 304 | assert(!sqrt_inv_narrow(&result, &x, &y)); 305 | add_narrow(&x_plus_two, &x, &two); 306 | assert(sqrt_inv_narrow(&result, &x_plus_two, &y)); 307 | for (int i = 0; i < NLIMBS; ++i) { 308 | assert(sqrt_x_plus_2_over_y.limbs[i] == result.limbs[i]); 309 | } 310 | 311 | invert_narrow(&result, &x); 312 | assert(equal_narrow(&result, &x_inverse)); 313 | 314 | reduce_hash_mod_l(&scalar_result, &scalar_hash_val); 315 | for (int i = 0; i < SCALAR_LIMBS; ++i) { 316 | assert(reduced_hash_val.limbs[i] == scalar_result.limbs[i]); 317 | } 318 | 319 | scalar_t mult_scalar = { 320 | .limbs = { 321 | 0x55f0b9a3, 0x82b106c5, 0xcb2e2b7d, 0x30735cbc, 322 | 0xa512a8ba, 0x4c5cd391, 0xe9d0c788, 0x92bb2562, 0x3, 323 | }, 324 | }; 325 | projective_pt_narrow_t expected_scalar_mult = { 326 | .x = { 327 | .limbs = { 328 | 0x1267d8d, 0x39a3cd3, 0x09e1275, 0x2d21378, 0x24771d9, 0x3558a1d, 329 | 0x3bdca9b, 0x0dd862d, 0x0bb230a, 0x1668292, 0x0350abe, 330 | }, 331 | }, 332 | .y = { 333 | .limbs = { 334 | 0x04d69fd, 0x03e739d, 0x36ce258, 0x0b6464b, 0x19dab22, 0x249c1a8, 335 | 0x1d28c7d, 0x1591dbc, 0x085ebab, 0x0e8274f, 0x0b090d6, 336 | }, 337 | }, 338 | .z = { 339 | .limbs = {0x1}, 340 | }, 341 | }; 342 | projective_pt_narrow_t result_pt; 343 | 344 | for (int i = 0; i<1; ++i) { 345 | scalar_multiply(&result_pt, &B, &mult_scalar); 346 | } 347 | { 348 | residue_narrow_t tmp; 349 | mul_narrow(&tmp, &expected_scalar_mult.x, &result_pt.z); 350 | assert(equal_narrow(&tmp, &result_pt.x)); 351 | mul_narrow(&tmp, &expected_scalar_mult.y, &result_pt.z); 352 | assert(equal_narrow(&tmp, &result_pt.y)); 353 | } 354 | 355 | affine_pt_narrow_t expected_everything0 = { 356 | .x = { 357 | .limbs = { 358 | 0x20eef1a, 0x3c30e66, 0x0d710f0, 0x248a6fa, 0x30c967f, 0x3ce302c, 359 | 0x0ccd1f2, 0x197e993, 0x2ebaef3, 0x0f2f019, 0, 360 | }, 361 | }, 362 | .y = { 363 | .limbs = { 364 | 0x3017cc0, 0x02a5110, 0x06d37e5, 0x283a64a, 0x01484b5, 0x196f37b, 365 | 0x13de2d2, 0x0da32d1, 0x392e0fc, 0x221d742, 0, 366 | }, 367 | }, 368 | }; 369 | 370 | affine_pt_narrow_t expected_everything1 = { 371 | .x = { 372 | .limbs = { 373 | 0x0e35d45, 0x038f90c, 0x0283483, 0x01ee50a, 0x1e364f9, 0x362414c, 374 | 0x156b1ed, 0x006fff6, 0x271f9ed, 0x0ffa45d, 0, 375 | }, 376 | }, 377 | .y = { 378 | .limbs = { 379 | 0x156ae67, 0x27941ab, 0x19a3000, 0x3572ab5, 0x2b90ce3, 0x136156c, 380 | 0x0727496, 0x0edae82, 0x0fa5dfd, 0x16f293c, 0, 381 | }, 382 | }, 383 | }; 384 | 385 | affine_pt_narrow_t expected_everything2 = { 386 | .x = { 387 | .limbs = { 388 | 0x37fcb1b, 0x16004b9, 0x1d18743, 0x0bce648, 0x0d78db6, 0x35b1d65, 389 | 0x23bb620, 0x2fbc323, 0x1a9a586, 0x3b22577, 0, 390 | }, 391 | }, 392 | .y = { 393 | .limbs = { 394 | 0x082fb15, 0x03487d6, 0x3d1c2c9, 0x2c9e7ad, 0x187be10, 0x2e9b6ba, 395 | 0x15b8f89, 0x243ae4c, 0x328bb11, 0x00b12a9, 0, 396 | }, 397 | }, 398 | }; 399 | 400 | 401 | affine_pt_narrow_t expected_everything3 = { 402 | .x = { 403 | .limbs = { 404 | 0x3e79b25, 0x2ca71b7, 0x2b2ea3c, 0x0de7ac4, 0x3026d10, 0x2bce79e, 405 | 0x1153866, 0x03e5a80, 0x22b9a37, 0x03e9c59, 0, 406 | }, 407 | }, 408 | .y = { 409 | .limbs = { 410 | 0x20100d6, 0x2330974, 0x3402585, 0x172cfd6, 0x275a21c, 0x213e87c, 411 | 0x29989f2, 0x155e437, 0x096a378, 0x3a674eb, 0, 412 | }, 413 | }, 414 | }; 415 | 416 | affine_pt_narrow_t expected_gray_code_end0 = { 417 | .x = { 418 | .limbs = { 419 | 0x14dd884, 0x12c9e33, 0x2d42122, 0x26f0b14, 0x1b9ea17, 0x3779e94, 420 | 0x2562a88, 0x0be34f0, 0x192ead9, 0x089ec45, 0, 421 | }, 422 | }, 423 | .y = { 424 | .limbs = { 425 | 0x1de5221, 0x172f820, 0x28c1b33, 0x08003c6, 0x0e65926, 0x188cd49, 426 | 0x3bb39fd, 0x1b9d8d7, 0x03d5020, 0x045742b, 0, 427 | }, 428 | }, 429 | }; 430 | 431 | affine_pt_narrow_t expected_gray_code_end1 = { 432 | .x = { 433 | .limbs = { 434 | 0x1d1cf29, 0x2e289d7, 0x1a83709, 0x2252d11, 0x3d6411c, 0x3fd73ad, 435 | 0x2737d9c, 0x2ca9eba, 0x058f290, 0x3879a7c, 0, 436 | }, 437 | }, 438 | .y = { 439 | .limbs = { 440 | 0x357399d, 0x0276752, 0x0d5199f, 0x1bbd3a0, 0x39044f1, 0x0c5e83a, 441 | 0x1a99cdd, 0x0dcb61f, 0x35b7272, 0x1184cff, 0, 442 | }, 443 | }, 444 | }; 445 | 446 | affine_pt_narrow_t expected_gray_code_end2 = { 447 | .x = { 448 | .limbs = { 449 | 0x1ea3c19, 0x081dc9e, 0x1a0b337, 0x1d7f3f4, 0x295a0aa, 0x1ebff45, 450 | 0x0956bf0, 0x17aae80, 0x05d8632, 0x3082c9a, 0, 451 | }, 452 | }, 453 | .y = { 454 | .limbs = { 455 | 0x22ad91f, 0x1ffcc65, 0x37b4f5c, 0x29c51ab, 0x3f9bd02, 0x296aaf9, 456 | 0x2a58b82, 0x2c54e16, 0x2a7672c, 0x21486e2, 0, 457 | }, 458 | }, 459 | }; 460 | 461 | affine_pt_narrow_t expected_gray_code_end3 = { 462 | .x = { 463 | .limbs = { 464 | 0x06b9c9d, 0x3d00674, 0x10a73fc, 0x30fda83, 0x139185c, 0x043e082, 465 | 0x3c67915, 0x208192a, 0x025e451, 0x258a566, 0, 466 | }, 467 | }, 468 | .y = { 469 | .limbs = { 470 | 0x3d2a04f, 0x1314c36, 0x131c7a3, 0x1882ef3, 0x1a0a5e8, 0x1919356, 471 | 0x0a5616a, 0x1eea31d, 0x2c216b3, 0x18ba4aa, 0, 472 | }, 473 | }, 474 | }; 475 | 476 | sabs_comb_set_t computed_base_comb; 477 | compute_comb_set(&computed_base_comb, &B); 478 | assert(equal_narrow(&computed_base_comb.combs[0].table[COMB_TABLE_SIZE - 1].x, 479 | &expected_everything0.x)); 480 | assert(equal_narrow(&computed_base_comb.combs[0].table[COMB_TABLE_SIZE - 1].y, 481 | &expected_everything0.y)); 482 | assert(equal_narrow(&computed_base_comb.combs[1].table[COMB_TABLE_SIZE - 1].x, 483 | &expected_everything1.x)); 484 | assert(equal_narrow(&computed_base_comb.combs[1].table[COMB_TABLE_SIZE - 1].y, 485 | &expected_everything1.y)); 486 | assert(equal_narrow(&computed_base_comb.combs[2].table[COMB_TABLE_SIZE - 1].x, 487 | &expected_everything2.x)); 488 | assert(equal_narrow(&computed_base_comb.combs[2].table[COMB_TABLE_SIZE - 1].y, 489 | &expected_everything2.y)); 490 | assert(equal_narrow(&computed_base_comb.combs[3].table[COMB_TABLE_SIZE - 1].x, 491 | &expected_everything3.x)); 492 | assert(equal_narrow(&computed_base_comb.combs[3].table[COMB_TABLE_SIZE - 1].y, 493 | &expected_everything3.y)); 494 | 495 | 496 | assert(equal_narrow(&computed_base_comb.combs[0].table[7].x, 497 | &expected_gray_code_end0.x)); 498 | assert(equal_narrow(&computed_base_comb.combs[0].table[7].y, 499 | &expected_gray_code_end0.y)); 500 | assert(equal_narrow(&computed_base_comb.combs[1].table[7].x, 501 | &expected_gray_code_end1.x)); 502 | assert(equal_narrow(&computed_base_comb.combs[1].table[7].y, 503 | &expected_gray_code_end1.y)); 504 | assert(equal_narrow(&computed_base_comb.combs[2].table[7].x, 505 | &expected_gray_code_end2.x)); 506 | assert(equal_narrow(&computed_base_comb.combs[2].table[7].y, 507 | &expected_gray_code_end2.y)); 508 | assert(equal_narrow(&computed_base_comb.combs[3].table[7].x, 509 | &expected_gray_code_end3.x)); 510 | assert(equal_narrow(&computed_base_comb.combs[3].table[7].y, 511 | &expected_gray_code_end3.y)); 512 | #endif 513 | 514 | #if 1 515 | for (int i = 0; i<1; ++i) { 516 | scalar_comb_multiply(&result_pt, &base_comb, &mult_scalar); 517 | } 518 | { 519 | residue_narrow_t tmp; 520 | mul_narrow(&tmp, &expected_scalar_mult.x, &result_pt.z); 521 | assert(equal_narrow(&tmp, &result_pt.x)); 522 | mul_narrow(&tmp, &expected_scalar_mult.y, &result_pt.z); 523 | assert(equal_narrow(&tmp, &result_pt.y)); 524 | } 525 | #endif 526 | #if 0 527 | for (int i = 0; i<100000; ++i) { 528 | scalar_t priv_key; 529 | affine_pt_narrow_reduced_t pub_key; 530 | gen_key(&priv_key, &pub_key); 531 | } 532 | #endif 533 | #if 1 534 | uint8_t encoded_sk[66]; 535 | scalar_t priv_key; 536 | scalar_t priv_key_decoded; 537 | affine_pt_narrow_t pub_key; 538 | affine_pt_narrow_t pub_key_decoded; 539 | gen_key(&priv_key, &pub_key); 540 | memcpy(encoded_sk, &priv_key, SCALAR_BYTES); 541 | encode_pub_key(encoded_sk + SCALAR_BYTES, &pub_key); 542 | #if 1 543 | priv_key_decoded.limbs[SCALAR_LIMBS - 1] = 0; 544 | memcpy(&priv_key_decoded, encoded_sk, SCALAR_BYTES); 545 | for (int j = 0; j < SCALAR_LIMBS; ++j) { 546 | assert(priv_key.limbs[j] == priv_key_decoded.limbs[j]); 547 | } 548 | #endif 549 | for (int i = 0; i < 1; ++i) { 550 | uint8_t encoded_sig[65]; 551 | const uint8_t *msg = (uint8_t *) "Hello World!"; 552 | const size_t msglen = 13; 553 | signature_t result; 554 | sign(&result, &priv_key_decoded, encoded_sk + SCALAR_BYTES, msg, msglen); 555 | encode_sig(encoded_sig, &result); 556 | #if 1 557 | if (1) { 558 | signature_t result_decoded; 559 | decode_sig(&result_decoded, encoded_sig); 560 | for (int j = 0; j < SCALAR_LIMBS; ++j) { 561 | assert(result.s.limbs[j] == result_decoded.s.limbs[j]); 562 | } 563 | for (int j = 0; j < NLIMBS_REDUCED; ++j) { 564 | assert(result.y.limbs[j] == result_decoded.y.limbs[j]); 565 | } 566 | assert(decode_pub_key(&pub_key_decoded, encoded_sk + SCALAR_BYTES)); 567 | 568 | uint8_t y_buf[RESIDUE_LENGTH_BYTES]; 569 | encode(y_buf, &result_decoded.y); 570 | if(!verify(&result, y_buf, encoded_sk + SCALAR_BYTES, &pub_key_decoded, msg, 571 | msglen)) { 572 | printf("verification failed\n"); 573 | exit(1); 574 | } 575 | } 576 | #endif 577 | } 578 | #endif 579 | } 580 | -------------------------------------------------------------------------------- /ref/src/sign.c: -------------------------------------------------------------------------------- 1 | #define _DEFAULT_SOURCE 2 | #include 3 | #include 4 | #include 5 | 6 | #include "comb.h" 7 | #include "curve.h" 8 | #include "scalar.h" 9 | 10 | #include "sign.h" 11 | 12 | #include "f11_260.c" 13 | #include "curve.c" 14 | #include "scalar.c" 15 | #include "gen.c" 16 | #include "constant_time.c" 17 | #include "comb.c" 18 | 19 | void sign(signature_t *result, scalar_t *priv_key, 20 | const uint8_t *pub_key, const uint8_t *msg, size_t msg_len) { 21 | blake2b_state hash_ctxt; 22 | 23 | char session_key_wash[16]; 24 | 25 | scalar_hash_t scalar_large; 26 | scalar_t session_key; 27 | 28 | arc4random_buf(session_key_wash, sizeof(session_key_wash)); 29 | blake2b_init_key(&hash_ctxt, 64, session_key_wash, sizeof(session_key_wash)); 30 | blake2b_update(&hash_ctxt, (uint8_t *) priv_key, SCALAR_BYTES); 31 | blake2b_update(&hash_ctxt, (uint8_t *) msg, msg_len); 32 | blake2b_final(&hash_ctxt, (uint8_t *) &scalar_large, sizeof(scalar_hash_t)); 33 | 34 | reduce_hash_mod_l(&session_key, &scalar_large); 35 | 36 | projective_pt_narrow_t result_pt; 37 | scalar_comb_multiply(&result_pt, &base_comb, &session_key); 38 | residue_narrow_t z_inv; 39 | 40 | invert_narrow(&z_inv, &result_pt.z); 41 | mul_narrow(&result_pt.x, &result_pt.x, &z_inv); 42 | mul_narrow(&result_pt.y, &result_pt.y, &z_inv); 43 | 44 | narrow_complete(&result->y, &result_pt.y); 45 | 46 | residue_narrow_reduced_t temp_narrow_reduced; 47 | narrow_partial_complete(&temp_narrow_reduced, &result_pt.x); 48 | result->y.limbs[NLIMBS_REDUCED - 1] |= 49 | is_odd(&temp_narrow_reduced) << (TBITS); 50 | 51 | uint8_t y_buf[RESIDUE_LENGTH_BYTES]; 52 | encode(y_buf, &result->y); 53 | 54 | blake2b_init(&hash_ctxt, 64); 55 | blake2b_update(&hash_ctxt, y_buf, RESIDUE_LENGTH_BYTES); 56 | blake2b_update(&hash_ctxt, pub_key, RESIDUE_LENGTH_BYTES); 57 | blake2b_update(&hash_ctxt, msg, msg_len); 58 | blake2b_final(&hash_ctxt, (uint8_t *) &scalar_large, sizeof(scalar_hash_t)); 59 | 60 | scalar_t hash_scalar; 61 | mont_reduce_hash_mod_l(&hash_scalar, &scalar_large); 62 | mont_mult_mod_l(&hash_scalar, &hash_scalar, priv_key); 63 | mont_mult_mod_l(&hash_scalar, &hash_scalar, &SCALAR_MONT_R2_HASH_MUL); 64 | sub_mod_l(&result->s, &session_key, &hash_scalar); 65 | 66 | explicit_bzero(&session_key, sizeof(session_key)); 67 | explicit_bzero(&hash_scalar, sizeof(hash_scalar)); 68 | explicit_bzero(&session_key_wash, sizeof(session_key_wash)); 69 | } 70 | 71 | int verify( 72 | const signature_t *sig, const uint8_t *r_bytes, const uint8_t *pub_key_bytes, 73 | const affine_pt_narrow_t *pub_key_pt, const uint8_t *msg, 74 | size_t msg_len) { 75 | 76 | projective_pt_narrow_t sB; 77 | projective_pt_narrow_t hA; 78 | projective_pt_narrow_t result_pt; 79 | residue_narrow_reduced_t result_y; 80 | 81 | scalar_hash_t scalar_large; 82 | blake2b_state hash_ctxt; 83 | blake2b_init(&hash_ctxt, 64); 84 | blake2b_update(&hash_ctxt, r_bytes, RESIDUE_LENGTH_BYTES); 85 | blake2b_update(&hash_ctxt, pub_key_bytes, RESIDUE_LENGTH_BYTES); 86 | blake2b_update(&hash_ctxt, msg, msg_len); 87 | blake2b_final(&hash_ctxt, (uint8_t *) &scalar_large, sizeof(scalar_hash_t)); 88 | 89 | scalar_t hash_scalar; 90 | reduce_hash_mod_l(&hash_scalar, &scalar_large); 91 | 92 | // Can use non-const version for both of these. 93 | scalar_comb_multiply_unsafe(&sB, &base_comb, &sig->s); 94 | scalar_multiply_unsafe(&hA, pub_key_pt, &hash_scalar); 95 | projective_add(&result_pt, &sB, &hA); 96 | 97 | // Everything below except the comparison should eventually be in helper 98 | // functions: Point affinization, and point compression bit-for-bit. 99 | // Same applies for the signing. 100 | residue_narrow_t z_inv; 101 | 102 | invert_narrow(&z_inv, &result_pt.z); 103 | mul_narrow(&result_pt.x, &result_pt.x, &z_inv); 104 | mul_narrow(&result_pt.y, &result_pt.y, &z_inv); 105 | 106 | narrow_complete(&result_y, &result_pt.y); 107 | 108 | residue_narrow_reduced_t temp_narrow_reduced; 109 | narrow_partial_complete(&temp_narrow_reduced, &result_pt.x); 110 | result_y.limbs[NLIMBS_REDUCED - 1] |= 111 | is_odd(&temp_narrow_reduced) << TBITS; 112 | 113 | return equal_narrow_reduced(&sig->y, &result_y); 114 | } 115 | 116 | void encode_sig(uint8_t *result, const signature_t *sig) { 117 | residue_narrow_reduced_t pack; 118 | 119 | memcpy(&pack, &sig->y, sizeof(residue_narrow_reduced_t)); 120 | // Save the upper two bits in the uppermost part of the 33rd byte 121 | pack.limbs[NLIMBS_REDUCED - 1] |= 122 | (sig->s.limbs[SCALAR_LIMBS - 1] & 0x3) << 28; 123 | encode(result, &pack); 124 | memcpy(result + RESIDUE_LENGTH_BYTES, 125 | &sig->s, sizeof(uint32_t) * (SCALAR_LIMBS - 1)); 126 | } 127 | 128 | void decode_sig(signature_t *result, const uint8_t *encoded_sig) { 129 | decode(&result->y, encoded_sig); 130 | result->s.limbs[SCALAR_LIMBS - 1] = result->y.limbs[NLIMBS_REDUCED - 1] >> 28; 131 | // We leave an extra bit for the sign bit from compression. 132 | result->y.limbs[NLIMBS_REDUCED - 1] &= ((1 << (TBITS + 1)) - 1); 133 | memcpy(&result->s, encoded_sig + RESIDUE_LENGTH_BYTES, 134 | sizeof(uint32_t) * (SCALAR_LIMBS - 1)); 135 | } 136 | --------------------------------------------------------------------------------