├── python-src └── curve25519 │ ├── test │ ├── __init__.py │ ├── test_speed.py │ └── test_curve25519.py │ ├── __init__.py │ ├── keys.py │ └── curve25519module.c ├── fuzz ├── curve25519-donna.c ├── curve25519-donna-sse2.c ├── curve25519-ref10.h ├── curve25519-donna.h ├── build-nix.php ├── README.md ├── fuzz-curve25519.c └── curve25519-ref10.c ├── .gitignore ├── curve25519.h ├── curve25519-donna.h ├── curve25519.c ├── test-ticks.h ├── setup.py ├── curve25519-donna-common.h ├── curve25519-donna-scalarmult-base.h ├── curve25519-donna-scalarmult-sse2.h ├── curve25519-donna-portable-identify.h ├── test.c ├── README.md ├── curve25519-donna-portable.h ├── curve25519-optimizations-32bit.md ├── curve25519-donna-64bit.h ├── curve25519-donna-32bit.h └── curve25519-donna-sse2.h /python-src/curve25519/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fuzz/curve25519-donna.c: -------------------------------------------------------------------------------- 1 | #include "../curve25519.c" 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | *.o 3 | *.pyc 4 | /dist 5 | /MANIFEST 6 | -------------------------------------------------------------------------------- /fuzz/curve25519-donna-sse2.c: -------------------------------------------------------------------------------- 1 | #define CURVE25519_SSE2 2 | #define CURVE25519_SUFFIX _sse2 3 | #include "../curve25519.c" 4 | -------------------------------------------------------------------------------- /python-src/curve25519/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .keys import Private, Public 3 | 4 | hush_pyflakes = [Private, Public]; del hush_pyflakes 5 | -------------------------------------------------------------------------------- /fuzz/curve25519-ref10.h: -------------------------------------------------------------------------------- 1 | #ifndef CURVE25519_REF10_H 2 | #define CURVE25519_REF10_H 3 | 4 | int crypto_scalarmult_base_ref10(unsigned char *q,const unsigned char *n); 5 | int crypto_scalarmult_ref10(unsigned char *q, const unsigned char *n, const unsigned char *p); 6 | 7 | #endif /* CURVE25519_REF10_H */ 8 | 9 | -------------------------------------------------------------------------------- /curve25519.h: -------------------------------------------------------------------------------- 1 | #ifndef CURVE25519_H 2 | #define CURVE25519_H 3 | 4 | typedef unsigned char curve25519_key[32]; 5 | 6 | void curve25519_donna(curve25519_key mypublic, const curve25519_key secret, const curve25519_key basepoint); 7 | void curve25519_donna_basepoint(curve25519_key mypublic, const curve25519_key secret); 8 | 9 | #endif /* CURVE25519_H */ 10 | 11 | -------------------------------------------------------------------------------- /curve25519-donna.h: -------------------------------------------------------------------------------- 1 | #include "curve25519.h" 2 | #include "curve25519-donna-portable.h" 3 | 4 | #if defined(CURVE25519_SSE2) 5 | #else 6 | #if defined(HAVE_UINT128) && !defined(CURVE25519_FORCE_32BIT) 7 | #define CURVE25519_64BIT 8 | #else 9 | #define CURVE25519_32BIT 10 | #endif 11 | #endif 12 | 13 | #if !defined(CURVE25519_NO_INLINE_ASM) 14 | #endif 15 | 16 | 17 | #if defined(CURVE25519_SSE2) 18 | #include 
"curve25519-donna-sse2.h" 19 | #elif defined(CURVE25519_64BIT) 20 | #include "curve25519-donna-64bit.h" 21 | #else 22 | #include "curve25519-donna-32bit.h" 23 | #endif 24 | 25 | #include "curve25519-donna-common.h" 26 | 27 | #if defined(CURVE25519_SSE2) 28 | #include "curve25519-donna-scalarmult-sse2.h" 29 | #else 30 | #include "curve25519-donna-scalarmult-base.h" 31 | #endif 32 | 33 | -------------------------------------------------------------------------------- /fuzz/curve25519-donna.h: -------------------------------------------------------------------------------- 1 | #ifndef CURVE25519_H 2 | #define CURVE25519_H 3 | 4 | typedef unsigned char curve25519_key[32]; 5 | 6 | void curve25519_donna(curve25519_key mypublic, const curve25519_key secret, const curve25519_key basepoint); 7 | void curve25519_donna_basepoint(curve25519_key mypublic, const curve25519_key secret); 8 | void curve25519_donna_raw(curve25519_key mypublic, const curve25519_key secret, const curve25519_key basepoint); 9 | 10 | #if defined(CURVE25519_SSE2) 11 | void curve25519_donna_sse2(curve25519_key mypublic, const curve25519_key secret, const curve25519_key basepoint); 12 | void curve25519_donna_basepoint_sse2(curve25519_key mypublic, const curve25519_key secret); 13 | void curve25519_donna_raw_sse2(curve25519_key mypublic, const curve25519_key secret, const curve25519_key basepoint); 14 | #endif 15 | 16 | #endif /* CURVE25519_H */ 17 | 18 | -------------------------------------------------------------------------------- /curve25519.c: -------------------------------------------------------------------------------- 1 | #include "curve25519-donna.h" 2 | 3 | #if !defined(CURVE25519_SUFFIX) 4 | #define CURVE25519_SUFFIX 5 | #endif 6 | 7 | #define CURVE25519_FN3(fn,suffix) fn##suffix 8 | #define CURVE25519_FN2(fn,suffix) CURVE25519_FN3(fn,suffix) 9 | #define CURVE25519_FN(fn) CURVE25519_FN2(fn,CURVE25519_SUFFIX) 10 | 11 | void 12 | CURVE25519_FN(curve25519_donna) (curve25519_key mypublic, const curve25519_key secret, const curve25519_key basepoint) { 13 | curve25519_key e; 14 | size_t i; 15 | 16 | for (i = 0;i < 32;++i) e[i] = secret[i]; 17 | e[0] &= 0xf8; 18 | e[31] &= 0x7f; 19 | e[31] |= 0x40; 20 | curve25519_scalarmult_donna(mypublic, e, basepoint); 21 | } 22 | 23 | void 24 | CURVE25519_FN(curve25519_donna_basepoint) (curve25519_key mypublic, const curve25519_key secret) { 25 | static const curve25519_key basepoint = {9}; 26 | CURVE25519_FN(curve25519_donna)(mypublic, secret, basepoint); 27 | } 28 | -------------------------------------------------------------------------------- /python-src/curve25519/test/test_speed.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | 3 | from time import time 4 | from curve25519 import Private 5 | 6 | count = 10000 7 | elapsed_get_public = 0.0 8 | elapsed_get_shared = 0.0 9 | 10 | def abbreviate_time(data): 11 | # 1.23s, 790ms, 132us 12 | if data is None: 13 | return "" 14 | s = float(data) 15 | if s >= 10: 16 | #return abbreviate.abbreviate_time(data) 17 | return "%d" % s 18 | if s >= 1.0: 19 | return "%.2fs" % s 20 | if s >= 0.01: 21 | return "%dms" % (1000*s) 22 | if s >= 0.001: 23 | return "%.1fms" % (1000*s) 24 | if s >= 0.000001: 25 | return "%.1fus" % (1000000*s) 26 | return "%dns" % (1000000000*s) 27 | 28 | def nohash(key): return key 29 | 30 | for i in range(count): 31 | p = Private() 32 | start = time() 33 | pub = p.get_public() 34 | elapsed_get_public += time() - start 35 | pub2 = Private().get_public() 36 | start = time() 37 | shared = p.get_shared_key(pub2) #, hashfunc=nohash) 38 | elapsed_get_shared += time() - start 39 | 40 | print("get_public: %s" % abbreviate_time(elapsed_get_public / count)) 41 | print("get_shared: %s" % abbreviate_time(elapsed_get_shared / count)) 42 | 43 | # these take about 560us-570us each (with the default compiler settings, -Os) 44 | # on my laptop, same with -O2 45 | # of which the python overhead is about 5us 46 | # and the get_shared_key() hash step adds about 5us 47 | -------------------------------------------------------------------------------- /test-ticks.h: -------------------------------------------------------------------------------- 1 | #include "curve25519-donna-portable-identify.h" 2 | 3 | /* ticks - not tested on anything other than x86 */ 4 | static uint64_t 5 | get_ticks(void) { 6 | #if defined(CPU_X86) || defined(CPU_X86_64) 7 | #if defined(COMPILER_INTEL) 8 | return _rdtsc(); 9 | #elif defined(COMPILER_MSVC) 10 | return __rdtsc(); 11 | #elif defined(COMPILER_GCC) 12 | uint32_t lo, hi; 13 | __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi)); 14 | return ((uint64_t)lo | ((uint64_t)hi << 32)); 15 | #else 16 | need rdtsc for this compiler 17 | #endif 18 | #elif defined(OS_SOLARIS) 19 | return (uint64_t)gethrtime(); 20 | #elif defined(CPU_SPARC) && !defined(OS_OPENBSD) 21 | uint64_t t; 22 | __asm__ __volatile__("rd %%tick, %0" : "=r" (t)); 23 | return t; 24 | #elif defined(CPU_PPC) 25 | uint32_t lo = 0, hi = 0; 26 | __asm__ __volatile__("mftbu %0; mftb %1" : "=r" (hi), "=r" (lo)); 27 | return ((uint64_t)lo | ((uint64_t)hi << 32)); 28 | #elif defined(CPU_IA64) 29 | uint64_t t; 30 | __asm__ __volatile__("mov %0=ar.itc" : "=r" (t)); 31 | return t; 32 | #elif defined(OS_NIX) 33 | timeval t2; 34 | gettimeofday(&t2, NULL); 35 | t = ((uint64_t)t2.tv_usec << 32) | (uint64_t)t2.tv_sec; 36 | return t; 37 | #else 38 | need ticks for this platform 39 | #endif 40 | } 41 | 42 | #define timeit(x,minvar) \ 43 | ticks = get_ticks(); \ 44 | x; \ 45 | ticks = get_ticks() - ticks; \ 46 | if (ticks < minvar) \ 47 | minvar = ticks; 48 | 49 | #define maxticks 0xffffffffffffffffull 50 | 51 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | 3 | from subprocess import Popen, PIPE 4 | from distutils.core import setup, Extension 5 | 6 | version = Popen(["git", "describe", "--tags"], stdout=PIPE).communicate()[0]\ 7 | .strip().decode("utf8") 8 | 9 | ext_modules = [Extension("curve25519._curve25519", 10 | ["python-src/curve25519/curve25519module.c", 11 | "curve25519.c"], 12 | )] 13 | 14 | short_description="Python wrapper for the Curve25519 cryptographic library" 15 | long_description="""\ 16 | Curve25519 is a fast elliptic-curve key-agreement protocol, in which two 17 | parties Alice and Bob each generate a (public,private) keypair, exchange 18 | public keys, and can then compute the same shared key. Specifically, Alice 19 | computes F(Aprivate, Bpublic), Bob computes F(Bprivate, Apublic), and both 20 | get the same value (and nobody else can guess that shared value, even if they 21 | know Apublic and Bpublic). 22 | 23 | This is a Python wrapper for the portable 'curve25519-donna' implementation 24 | of this algorithm, written by Adam Langley, hosted at 25 | http://code.google.com/p/curve25519-donna/ 26 | """ 27 | 28 | setup(name="curve25519-donna", 29 | version=version, 30 | description=short_description, 31 | long_description=long_description, 32 | author="Brian Warner", 33 | author_email="warner-pycurve25519-donna@lothar.com", 34 | license="BSD", 35 | packages=["curve25519", "curve25519.test"], 36 | package_dir={"curve25519": "python-src/curve25519"}, 37 | ext_modules=ext_modules, 38 | ) 39 | -------------------------------------------------------------------------------- /curve25519-donna-common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * In: b = 2^5 - 2^0 3 | * Out: b = 2^250 - 2^0 4 | */ 5 | static void 6 | curve25519_pow_two5mtwo0_two250mtwo0(bignum25519 b) { 7 | bignum25519 ALIGN(16) t0,c; 8 | 9 | /* 2^5 - 2^0 */ /* b */ 10 | /* 2^10 - 2^5 */ curve25519_square_times(t0, b, 5); 11 | /* 2^10 - 2^0 */ curve25519_mul(b, t0, b); 12 | /* 2^20 - 2^10 */ curve25519_square_times(t0, b, 10); 13 | /* 2^20 - 2^0 */ curve25519_mul(c, t0, b); 14 | /* 2^40 - 2^20 */ curve25519_square_times(t0, c, 20); 15 | /* 2^40 - 2^0 */ curve25519_mul(t0, t0, c); 16 | /* 2^50 - 2^10 */ curve25519_square_times(t0, t0, 10); 17 | /* 2^50 - 2^0 */ curve25519_mul(b, t0, b); 18 | /* 2^100 - 2^50 */ curve25519_square_times(t0, b, 50); 19 | /* 2^100 - 2^0 */ curve25519_mul(c, t0, b); 20 | /* 2^200 - 2^100 */ curve25519_square_times(t0, c, 100); 21 | /* 2^200 - 2^0 */ curve25519_mul(t0, t0, c); 22 | /* 2^250 - 2^50 */ curve25519_square_times(t0, t0, 50); 23 | /* 2^250 - 2^0 */ curve25519_mul(b, t0, b); 24 | } 25 | 26 | /* 27 | * z^(p - 2) = z(2^255 - 21) 28 | */ 29 | static void 30 | curve25519_recip(bignum25519 out, const bignum25519 z) { 31 | bignum25519 ALIGN(16) a,t0,b; 32 | 33 | /* 2 */ curve25519_square(a, z); /* a = 2 */ 34 | /* 8 */ curve25519_square_times(t0, a, 2); 35 | /* 9 */ curve25519_mul(b, t0, z); /* b = 9 */ 36 | /* 11 */ curve25519_mul(a, b, a); /* a = 11 */ 37 | /* 22 */ curve25519_square(t0, a); 38 | /* 2^5 - 2^0 = 31 */ curve25519_mul(b, t0, b); 39 | /* 2^250 - 2^0 */ curve25519_pow_two5mtwo0_two250mtwo0(b); 40 | /* 2^255 - 2^5 */ curve25519_square_times(b, b, 5); 41 | /* 2^255 - 21 */ curve25519_mul(out, b, a); 42 | } 43 | 44 | -------------------------------------------------------------------------------- /python-src/curve25519/keys.py: -------------------------------------------------------------------------------- 1 | from . 
import _curve25519 2 | from hashlib import sha256 3 | import os 4 | 5 | # the curve25519 functions are really simple, and could be used without an 6 | # OOP layer, but it's a bit too easy to accidentally swap the private and 7 | # public keys that way. 8 | 9 | def _hash_shared(shared): 10 | return sha256(b"curve25519-shared:"+shared).digest() 11 | 12 | class Private: 13 | def __init__(self, secret=None, seed=None): 14 | if secret is None: 15 | if seed is None: 16 | secret = os.urandom(32) 17 | else: 18 | secret = sha256(b"curve25519-private:"+seed).digest() 19 | else: 20 | assert seed is None, "provide secret, seed, or neither, not both" 21 | if not isinstance(secret, bytes) or len(secret) != 32: 22 | raise TypeError("secret= must be 32-byte string") 23 | self.private = _curve25519.make_private(secret) 24 | 25 | def serialize(self): 26 | return self.private 27 | 28 | def get_public(self): 29 | return Public(_curve25519.make_public(self.private)) 30 | 31 | def get_shared_key(self, public, hashfunc=None): 32 | if not isinstance(public, Public): 33 | raise ValueError("'public' must be an instance of Public") 34 | if hashfunc is None: 35 | hashfunc = _hash_shared 36 | shared = _curve25519.make_shared(self.private, public.public) 37 | return hashfunc(shared) 38 | 39 | class Public: 40 | def __init__(self, public): 41 | assert isinstance(public, bytes) 42 | assert len(public) == 32 43 | self.public = public 44 | 45 | def serialize(self): 46 | return self.public 47 | -------------------------------------------------------------------------------- /curve25519-donna-scalarmult-base.h: -------------------------------------------------------------------------------- 1 | /* Calculates nQ where Q is the x-coordinate of a point on the curve 2 | * 3 | * mypublic: the packed little endian x coordinate of the resulting curve point 4 | * n: a little endian, 32-byte number 5 | * basepoint: a packed little endian point of the curve 6 | */ 7 | 8 | static void 9 | curve25519_scalarmult_donna(curve25519_key mypublic, const curve25519_key n, const curve25519_key basepoint) { 10 | bignum25519 nqpqx = {1}, nqpqz = {0}, nqz = {1}, nqx; 11 | bignum25519 q, qx, qpqx, qqx, zzz, zmone; 12 | size_t bit, lastbit; 13 | int32_t i; 14 | 15 | curve25519_expand(q, basepoint); 16 | curve25519_copy(nqx, q); 17 | 18 | /* bit 255 is always 0, and bit 254 is always 1, so skip bit 255 and 19 | start pre-swapped on bit 254 */ 20 | lastbit = 1; 21 | 22 | /* we are doing bits 254..3 in the loop, but are swapping in bits 253..2 */ 23 | for (i = 253; i >= 2; i--) { 24 | curve25519_add(qx, nqx, nqz); 25 | curve25519_sub(nqz, nqx, nqz); 26 | curve25519_add(qpqx, nqpqx, nqpqz); 27 | curve25519_sub(nqpqz, nqpqx, nqpqz); 28 | curve25519_mul(nqpqx, qpqx, nqz); 29 | curve25519_mul(nqpqz, qx, nqpqz); 30 | curve25519_add(qqx, nqpqx, nqpqz); 31 | curve25519_sub(nqpqz, nqpqx, nqpqz); 32 | curve25519_square(nqpqz, nqpqz); 33 | curve25519_square(nqpqx, qqx); 34 | curve25519_mul(nqpqz, nqpqz, q); 35 | curve25519_square(qx, qx); 36 | curve25519_square(nqz, nqz); 37 | curve25519_mul(nqx, qx, nqz); 38 | curve25519_sub(nqz, qx, nqz); 39 | curve25519_scalar_product(zzz, nqz, 121665); 40 | curve25519_add(zzz, zzz, qx); 41 | curve25519_mul(nqz, nqz, zzz); 42 | 43 | bit = (n[i/8] >> (i & 7)) & 1; 44 | curve25519_swap_conditional(nqx, nqpqx, bit ^ lastbit); 45 | curve25519_swap_conditional(nqz, nqpqz, bit ^ lastbit); 46 | lastbit = bit; 47 | } 48 | 49 | /* the final 3 bits are always zero, so we only need to double */ 50 | for (i = 0; i < 3; i++) { 51 | 
curve25519_add(qx, nqx, nqz); 52 | curve25519_sub(nqz, nqx, nqz); 53 | curve25519_square(qx, qx); 54 | curve25519_square(nqz, nqz); 55 | curve25519_mul(nqx, qx, nqz); 56 | curve25519_sub(nqz, qx, nqz); 57 | curve25519_scalar_product(zzz, nqz, 121665); 58 | curve25519_add(zzz, zzz, qx); 59 | curve25519_mul(nqz, nqz, zzz); 60 | } 61 | 62 | curve25519_recip(zmone, nqz); 63 | curve25519_mul(nqz, nqx, zmone); 64 | curve25519_contract(mypublic, nqz); 65 | } 66 | 67 | -------------------------------------------------------------------------------- /curve25519-donna-scalarmult-sse2.h: -------------------------------------------------------------------------------- 1 | 2 | /* Calculates nQ where Q is the x-coordinate of a point on the curve 3 | * 4 | * mypublic: the packed little endian x coordinate of the resulting curve point 5 | * n: a little endian, 32-byte number 6 | * basepoint: a packed little endian point of the curve 7 | */ 8 | static void 9 | curve25519_scalarmult_donna(curve25519_key mypublic, const curve25519_key n, const curve25519_key basepoint) { 10 | bignum25519 ALIGN(16) nqx = {1}, nqpqz = {1}, nqz = {0}, nqpqx, zmone; 11 | packed32bignum25519 qx, qz, pqz, pqx; 12 | packed64bignum25519 nq, sq, sqscalar, prime, primex, primez, nqpq; 13 | bignum25519mulprecomp preq; 14 | size_t bit, lastbit, i; 15 | 16 | curve25519_expand(nqpqx, basepoint); 17 | curve25519_mul_precompute(&preq, nqpqx); 18 | 19 | /* do bits 254..3 */ 20 | for (i = 254, lastbit = 0; i >= 3; i--) { 21 | bit = (n[i/8] >> (i & 7)) & 1; 22 | curve25519_swap_conditional(nqx, nqpqx, bit ^ lastbit); 23 | curve25519_swap_conditional(nqz, nqpqz, bit ^ lastbit); 24 | lastbit = bit; 25 | 26 | curve25519_tangle32(qx, nqx, nqpqx); /* qx = [nqx,nqpqx] */ 27 | curve25519_tangle32(qz, nqz, nqpqz); /* qz = [nqz,nqpqz] */ 28 | 29 | curve25519_add_packed32(pqx, qx, qz); /* pqx = [nqx+nqz,nqpqx+nqpqz] */ 30 | curve25519_sub_packed32(pqz, qx, qz); /* pqz = [nqx-nqz,nqpqx-nqpqz] */ 31 | 32 | curve25519_make_nqpq(primex, primez, pqx, pqz); /* primex = [nqx+nqz,nqpqx+nqpqz], primez = [nqpqx-nqpqz,nqx-nqz] */ 33 | curve25519_mul_packed64(prime, primex, primez); /* prime = [nqx+nqz,nqpqx+nqpqz] * [nqpqx-nqpqz,nqx-nqz] */ 34 | curve25519_addsub_packed64(prime); /* prime = [prime.x+prime.z,prime.x-prime.z] */ 35 | curve25519_square_packed64(nqpq, prime); /* nqpq = prime^2 */ 36 | curve25519_untangle64(nqpqx, nqpqz, nqpq); 37 | curve25519_mul_precomputed(nqpqz, nqpqz, &preq); /* nqpqz = nqpqz * q */ 38 | 39 | /* (((sq.x-sq.z)*121665)+sq.x) * (sq.x-sq.z) is equivalent to (sq.x*121666-sq.z*121665) * (sq.x-sq.z) */ 40 | curve25519_make_nq(nq, pqx, pqz); /* nq = [nqx+nqz,nqx-nqz] */ 41 | curve25519_square_packed64(sq, nq); /* sq = nq^2 */ 42 | curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */ 43 | curve25519_final_nq(nq, sq, sqscalar); /* nq = [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */ 44 | curve25519_untangle64(nqx, nqz, nq); 45 | }; 46 | 47 | /* it's possible to get rid of this swap with the swap in the above loop 48 | at the bottom instead of the top, but compilers seem to optimize better this way */ 49 | curve25519_swap_conditional(nqx, nqpqx, bit); 50 | curve25519_swap_conditional(nqz, nqpqz, bit); 51 | 52 | /* do bits 2..0 */ 53 | for (i = 0; i < 3; i++) { 54 | curve25519_compute_nq(nq, nqx, nqz); 55 | curve25519_square_packed64(sq, nq); /* sq = nq^2 */ 56 | curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */ 57 | curve25519_final_nq(nq, sq, sqscalar); /* nq = 
[sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */ 58 | curve25519_untangle64(nqx, nqz, nq); 59 | } 60 | 61 | curve25519_recip(zmone, nqz); 62 | curve25519_mul(nqz, nqx, zmone); 63 | curve25519_contract(mypublic, nqz); 64 | } 65 | 66 | -------------------------------------------------------------------------------- /fuzz/build-nix.php: -------------------------------------------------------------------------------- 1 | set = false; 44 | 45 | for ($i = 1; $i < $argc; $i++) { 46 | if (!preg_match("!--".$flag."=(.*)!", $argv[$i], $m)) 47 | continue; 48 | $this->value = $m[1]; 49 | $this->set = true; 50 | return; 51 | } 52 | } 53 | } 54 | 55 | class multiargument extends anyargument { 56 | function multiargument($flag, $legal_values) { 57 | parent::anyargument($flag); 58 | 59 | if (!$this->set) 60 | return; 61 | 62 | $map = array(); 63 | foreach($legal_values as $value) 64 | $map[$value] = true; 65 | 66 | if (!isset($map[$this->value])) { 67 | usage("{$this->value} is not a valid parameter to --{$flag}!"); 68 | exit(1); 69 | } 70 | } 71 | } 72 | 73 | class flag extends argument { 74 | function flag($flag) { 75 | global $argc, $argv; 76 | 77 | $this->set = false; 78 | 79 | $flag = "--{$flag}"; 80 | for ($i = 1; $i < $argc; $i++) { 81 | if ($argv[$i] !== $flag) 82 | continue; 83 | $this->value = true; 84 | $this->set = true; 85 | return; 86 | } 87 | } 88 | } 89 | 90 | $bits = new multiargument("bits", array("32", "64")); 91 | $compiler = new multiargument("compiler", array("gcc", "clang", "icc")); 92 | $with_sse2 = new flag("with-sse2"); 93 | $out = new anyargument("out"); 94 | 95 | $err = ""; 96 | if (!$bits->set) 97 | $err .= "--bits not set\n"; 98 | 99 | if ($err !== "") { 100 | usage($err); 101 | exit; 102 | } 103 | 104 | $compile = ($compiler->set) ? $compiler->value : "gcc"; 105 | $filename = ($out->set) ? 
$out->value : "fuzz-curve25519"; 106 | $link = ""; 107 | $flags = "-O3 -m{$bits->value}"; 108 | $ret = 0; 109 | 110 | 111 | runcmd("building ref10..", "{$compile} {$flags} curve25519-ref10.c -c -o curve25519-ref10.o"); 112 | runcmd("building curve25519..", "{$compile} {$flags} curve25519-donna.c -c -o curve25519-donna.o"); 113 | if ($with_sse2->set) { 114 | runcmd("building curve25519-sse2..", "{$compile} {$flags} curve25519-donna-sse2.c -c -o curve25519-donna-sse2.o -msse2"); 115 | $link .= " curve25519-donna-sse2.o -DCURVE25519_SSE2"; 116 | } 117 | runcmd("linking..", "{$compile} {$flags} {$link} fuzz-curve25519.c curve25519-donna.o curve25519-ref10.o -o {$filename}"); 118 | echoln("{$filename} built."); 119 | 120 | 121 | cleanup(); 122 | ?> 123 | -------------------------------------------------------------------------------- /curve25519-donna-portable-identify.h: -------------------------------------------------------------------------------- 1 | /* os */ 2 | #if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__) 3 | #define OS_WINDOWS 4 | #elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__) 5 | #define OS_SOLARIS 6 | #else 7 | #include /* need this to define BSD */ 8 | #define OS_NIX 9 | #if defined(__linux__) 10 | #define OS_LINUX 11 | #elif defined(BSD) 12 | #define OS_BSD 13 | #if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__)) 14 | #define OS_OSX 15 | #elif defined(macintosh) || defined(Macintosh) 16 | #define OS_MAC 17 | #elif defined(__OpenBSD__) 18 | #define OS_OPENBSD 19 | #endif 20 | #endif 21 | #endif 22 | 23 | 24 | /* compiler */ 25 | #if defined(_MSC_VER) 26 | #define COMPILER_MSVC 27 | #endif 28 | #if defined(__ICC) 29 | #define COMPILER_INTEL 30 | #endif 31 | #if defined(__GNUC__) 32 | #if (__GNUC__ >= 3) 33 | #define COMPILER_GCC ((__GNUC__ * 10000) + (__GNUC_MINOR__ * 100) + (__GNUC_PATCHLEVEL__)) 34 | #else 35 | #define COMPILER_GCC ((__GNUC__ * 10000) + (__GNUC_MINOR__ * 100) ) 36 | #endif 37 | #endif 38 | #if defined(__PATHCC__) 39 | #define COMPILER_PATHCC 40 | #endif 41 | #if defined(__clang__) 42 | #define COMPILER_CLANG ((__clang_major__ * 10000) + (__clang_minor__ * 100) + (__clang_patchlevel__)) 43 | #endif 44 | 45 | 46 | 47 | /* cpu */ 48 | #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64) 49 | #define CPU_X86_64 50 | #elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500)) 51 | #define CPU_X86 500 52 | #elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400)) 53 | #define CPU_X86 400 54 | #elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__) 55 | #define CPU_X86 300 56 | #elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64) 57 | #define CPU_IA64 58 | #endif 59 | 60 | #if defined(__sparc__) || defined(__sparc) || defined(__sparcv9) 61 | #define CPU_SPARC 62 | #if defined(__sparcv9) 63 | #define CPU_SPARC64 64 | #endif 65 | #endif 66 | 67 | #if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC) 68 | #define CPU_PPC 69 | #if defined(_ARCH_PWR7) 70 | #define CPU_POWER7 71 | #elif defined(__64BIT__) 72 | #define CPU_PPC64 73 | #else 74 | #define CPU_PPC32 75 | #endif 76 | #endif 77 | 78 | #if defined(__hppa__) || defined(__hppa) 79 | #define CPU_HPPA 80 | #endif 81 | 82 | #if defined(__alpha__) 
|| defined(__alpha) || defined(_M_ALPHA) 83 | #define CPU_ALPHA 84 | #endif 85 | 86 | /* 64 bit cpu */ 87 | #if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64)) 88 | #define CPU_64BITS 89 | #endif 90 | 91 | #if defined(COMPILER_MSVC) 92 | typedef signed char int8_t; 93 | typedef unsigned char uint8_t; 94 | typedef signed short int16_t; 95 | typedef unsigned short uint16_t; 96 | typedef signed int int32_t; 97 | typedef unsigned int uint32_t; 98 | typedef signed __int64 int64_t; 99 | typedef unsigned __int64 uint64_t; 100 | #else 101 | #include 102 | #endif 103 | 104 | -------------------------------------------------------------------------------- /python-src/curve25519/curve25519module.c: -------------------------------------------------------------------------------- 1 | /* tell python that PyArg_ParseTuple(t#) means Py_ssize_t, not int */ 2 | #define PY_SSIZE_T_CLEAN 3 | #include 4 | #if (PY_VERSION_HEX < 0x02050000) 5 | typedef int Py_ssize_t; 6 | #endif 7 | 8 | /* This is required for compatibility with Python 2. */ 9 | #if PY_MAJOR_VERSION >= 3 10 | #include 11 | #define y "y" 12 | #else 13 | #define PyBytes_FromStringAndSize PyString_FromStringAndSize 14 | #define y "t" 15 | #endif 16 | 17 | int curve25519_donna(char *mypublic, 18 | const char *secret, const char *basepoint); 19 | 20 | static PyObject * 21 | pycurve25519_makeprivate(PyObject *self, PyObject *args) 22 | { 23 | char *in1; 24 | Py_ssize_t in1len; 25 | if (!PyArg_ParseTuple(args, y"#:clamp", &in1, &in1len)) 26 | return NULL; 27 | if (in1len != 32) { 28 | PyErr_SetString(PyExc_ValueError, "input must be 32-byte string"); 29 | return NULL; 30 | } 31 | in1[0] &= 248; 32 | in1[31] &= 127; 33 | in1[31] |= 64; 34 | return PyBytes_FromStringAndSize((char *)in1, 32); 35 | } 36 | 37 | static PyObject * 38 | pycurve25519_makepublic(PyObject *self, PyObject *args) 39 | { 40 | const char *private; 41 | char mypublic[32]; 42 | char basepoint[32] = {9}; 43 | Py_ssize_t privatelen; 44 | if (!PyArg_ParseTuple(args, y"#:makepublic", &private, &privatelen)) 45 | return NULL; 46 | if (privatelen != 32) { 47 | PyErr_SetString(PyExc_ValueError, "input must be 32-byte string"); 48 | return NULL; 49 | } 50 | curve25519_donna(mypublic, private, basepoint); 51 | return PyBytes_FromStringAndSize((char *)mypublic, 32); 52 | } 53 | 54 | static PyObject * 55 | pycurve25519_makeshared(PyObject *self, PyObject *args) 56 | { 57 | const char *myprivate, *theirpublic; 58 | char shared_key[32]; 59 | Py_ssize_t myprivatelen, theirpubliclen; 60 | if (!PyArg_ParseTuple(args, y"#"y"#:generate", 61 | &myprivate, &myprivatelen, &theirpublic, &theirpubliclen)) 62 | return NULL; 63 | if (myprivatelen != 32) { 64 | PyErr_SetString(PyExc_ValueError, "input must be 32-byte string"); 65 | return NULL; 66 | } 67 | if (theirpubliclen != 32) { 68 | PyErr_SetString(PyExc_ValueError, "input must be 32-byte string"); 69 | return NULL; 70 | } 71 | curve25519_donna(shared_key, myprivate, theirpublic); 72 | return PyBytes_FromStringAndSize((char *)shared_key, 32); 73 | } 74 | 75 | 76 | static PyMethodDef 77 | curve25519_functions[] = { 78 | {"make_private", pycurve25519_makeprivate, METH_VARARGS, "data->private"}, 79 | {"make_public", pycurve25519_makepublic, METH_VARARGS, "private->public"}, 80 | {"make_shared", pycurve25519_makeshared, METH_VARARGS, "private+public->shared"}, 81 | {NULL, NULL, 0, NULL}, 82 | }; 83 | 84 | #if PY_MAJOR_VERSION >= 3 
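/* Python 3 loads the extension through a PyModuleDef plus the PyInit__curve25519
   entry point below; the Python 2 branch further down uses the legacy
   Py_InitModule call instead, so both interpreters expose the same
   curve25519_functions method table. */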
85 | static struct PyModuleDef 86 | curve25519_module = { 87 | PyModuleDef_HEAD_INIT, 88 | "_curve25519", 89 | NULL, 90 | NULL, 91 | curve25519_functions, 92 | }; 93 | 94 | PyObject * 95 | PyInit__curve25519(void) 96 | { 97 | return PyModule_Create(&curve25519_module); 98 | } 99 | #else 100 | PyMODINIT_FUNC 101 | init_curve25519(void) 102 | { 103 | (void)Py_InitModule("_curve25519", curve25519_functions); 104 | } 105 | #endif -------------------------------------------------------------------------------- /test.c: -------------------------------------------------------------------------------- 1 | /* 2 | */ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "curve25519.h" 9 | 10 | #include "test-ticks.h" 11 | 12 | static void 13 | curveassert_die(const unsigned char *a, const unsigned char *b, size_t len, int round, const char *failreason) { 14 | size_t i; 15 | if (round > 0) 16 | printf("round %d, %s\n", round, failreason); 17 | else 18 | printf("%s\n", failreason); 19 | printf("want: "); for (i = 0; i < len; i++) printf("%02x,", a[i]); printf("\n"); 20 | printf("got : "); for (i = 0; i < len; i++) printf("%02x,", b[i]); printf("\n"); 21 | printf("diff: "); for (i = 0; i < len; i++) if (a[i] ^ b[i]) printf("%02x,", a[i] ^ b[i]); else printf(" ,"); printf("\n\n"); 22 | exit(1); 23 | } 24 | 25 | static void 26 | curveassert_equal(const unsigned char *a, const unsigned char *b, size_t len, const char *failreason) { 27 | if (memcmp(a, b, len) == 0) 28 | return; 29 | curveassert_die(a, b, len, -1, failreason); 30 | } 31 | 32 | /* result of the curve25519 scalarmult |((|max| * |max|) * |max|)... 1024 times| * basepoint */ 33 | 34 | /* 35 | static const curve25519_key curve25519_expected = { 36 | 0x8e,0x74,0xac,0x44,0x38,0xa6,0x87,0x54, 37 | 0xc8,0xc6,0x1b,0xa0,0x8b,0xd2,0xf7,0x7b, 38 | 0xbb,0xc6,0x26,0xd5,0x24,0xb3,0xbe,0xa0, 39 | 0x38,0x30,0x1d,0xec,0x2d,0x92,0xe7,0x51 40 | }; 41 | */ 42 | 43 | /* this is the result if the 256th bit of a point is ignored:*/ 44 | const curve25519_key curve25519_expected = { 45 | 0x1e,0x61,0x8e,0xc0,0x2f,0x25,0x1b,0x8d, 46 | 0x62,0xed,0x0e,0x57,0x3c,0x83,0x11,0x49, 47 | 0x7b,0xa5,0x85,0x40,0x1a,0xcf,0xd4,0x3e, 48 | 0x5b,0xeb,0xa8,0xb5,0xae,0x75,0x96,0x2d 49 | }; 50 | 51 | 52 | /* shared key resulting from the private keys |max| and |mid| */ 53 | static const curve25519_key curve25519_shared = { 54 | 0x78,0x0e,0x63,0xa6,0x58,0x5c,0x6d,0x56, 55 | 0xf1,0xa0,0x18,0x2d,0xec,0xe6,0x96,0x3b, 56 | 0x5b,0x4d,0x63,0x08,0x7b,0xf9,0x19,0x0e, 57 | 0x3a,0x77,0xf5,0x27,0x9c,0xd7,0x8b,0x44 58 | }; 59 | 60 | 61 | static void 62 | test_main(void) { 63 | int i; 64 | static const curve25519_key max = { 65 | 255,255,255,255,255,255,255,255, 66 | 255,255,255,255,255,255,255,255, 67 | 255,255,255,255,255,255,255,255, 68 | 255,255,255,255,255,255,255,255 69 | }; 70 | static const curve25519_key mid = { 71 | 127,127,127,127,127,127,127,127, 72 | 127,127,127,127,127,127,127,127, 73 | 127,127,127,127,127,127,127,127, 74 | 127,127,127,127,127,127,127,127 75 | }; 76 | curve25519_key pk[2]; 77 | curve25519_key shared[2]; 78 | uint64_t ticks, curveticks = maxticks; 79 | 80 | curve25519_donna(pk[0], max, max); 81 | for (i = 0; i < 1023; i++) 82 | curve25519_donna(pk[(i & 1) ^ 1], pk[i & 1], max); 83 | curve25519_donna_basepoint(pk[0], pk[1]); 84 | curveassert_equal(curve25519_expected, pk[0], sizeof(curve25519_key), "curve25519 sanity test failed to generate correct value"); 85 | 86 | curve25519_donna_basepoint(pk[0], max); 87 | curve25519_donna_basepoint(pk[1], mid); 88 | 
curve25519_donna(shared[0], max, pk[1]); 89 | curve25519_donna(shared[1], mid, pk[0]); 90 | curveassert_equal(curve25519_shared, shared[0], sizeof(curve25519_key), "curve25519 failed to generate the same shared key (1)"); 91 | curveassert_equal(curve25519_shared, shared[1], sizeof(curve25519_key), "curve25519 failed to generate the same shared key (2)"); 92 | 93 | for (i = 0; i < 2048; i++) { 94 | timeit(curve25519_donna(pk[1], pk[0], max), curveticks); 95 | } 96 | 97 | printf("%.0f ticks/curve25519 scalarmult\n", (double)curveticks); 98 | } 99 | 100 | int 101 | main(void) { 102 | test_main(); 103 | return 0; 104 | } 105 | 106 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [curve25519](http://cr.yp.to/ecdh.html) is an elliptic curve, developed by 2 | [Dan Bernstein](http://cr.yp.to/djb.html), for fast 3 | [Diffie-Hellman](http://en.wikipedia.org/wiki/Diffie-Hellman) key agreement. 4 | DJB's [original implementation](http://cr.yp.to/ecdh.html) was written in a 5 | language of his own devising called [qhasm](http://cr.yp.to/qhasm.html). 6 | The original qhasm source isn't available, only the x86 32-bit assembly output. 7 | 8 | This project provides performant, portable 32-bit & 64-bit implementations. 9 | All implementations are of course constant time in regard to secret data. 10 | 11 | #### Performance 12 | 13 | Compilers versions are gcc 4.6.3, icc 13.1.1, clang 3.4-1~exp1. 14 | 15 | Counts are in thousands of cycles. 16 | 17 | Note that SSE2 performance may be less impressive on AMD & older CPUs with slower SSE ops! 18 | 19 | ##### E5200 @ 2.5ghz, march=core2 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
| Version     | gcc   | icc  | clang |
|-------------|-------|------|-------|
| 64-bit SSE2 | 278k  | 265k | 302k  |
| 64-bit      | 273k  | 271k | 377k  |
| 32-bit SSE2 | 304k  | 289k | 317k  |
| 32-bit      | 1417k | 845k | 981k  |
##### E3-1270 @ 3.4ghz, march=corei7-avx

| Version     | gcc   | icc  | clang |
|-------------|-------|------|-------|
| 64-bit      | 201k  | 192k | 233k  |
| 64-bit SSE2 | 201k  | 201k | 261k  |
| 32-bit SSE2 | 238k  | 225k | 250k  |
| 32-bit      | 1293k | 822k | 848k  |
42 | 43 | #### Compilation 44 | 45 | No configuration is needed. 46 | 47 | ##### 32-bit 48 | 49 | gcc curve25519.c -m32 -O3 -c 50 | 51 | ##### 64-bit 52 | 53 | gcc curve25519.c -m64 -O3 -c 54 | 55 | ##### SSE2 56 | 57 | gcc curve25519.c -m32 -O3 -c -DCURVE25519_SSE2 -msse2 58 | gcc curve25519.c -m64 -O3 -c -DCURVE25519_SSE2 59 | 60 | clang, icc, and msvc are also supported 61 | 62 | ##### Named Versions 63 | 64 | Define CURVE25519_SUFFIX to append a suffix to public functions, e.g. 65 | `-DCURVE25519_SUFFIX=_sse2` to create curve25519_donna_sse2 and 66 | curve25519_donna_basepoint_sse2. 67 | 68 | #### Usage 69 | 70 | To use the code, link against `curve25519.o` and: 71 | 72 | #include "curve25519.h" 73 | 74 | To generate a private/secret key, generate 32 cryptographically random bytes: 75 | 76 | curve25519_key sk; 77 | randombytes(sk, sizeof(curve25519_key)); 78 | 79 | Manual clamping is not needed, and it is actually not possible to use unclamped 80 | keys due to the code taking advantage of the clamped bits internally. 81 | 82 | To generate the public key from the private/secret key: 83 | 84 | curve25519_key pk; 85 | curve25519_donna_basepoint(pk, sk); 86 | 87 | To generate a shared key with your private/secret key and someone elses public key: 88 | 89 | curve25519_key shared; 90 | curve25519_donna(shared, mysk, yourpk); 91 | 92 | And hash `shared` with a cryptographic hash before using, or e.g. pass `shared` through 93 | HSalsa20/HChacha as NaCl does. 94 | 95 | #### Testing 96 | 97 | Fuzzing against a reference implemenation is now available. See [fuzz/README](fuzz/README.md). 98 | 99 | Building `curve25519.c` and linking with `test.c` will run basic sanity tests and benchmark curve25519_donna. 100 | 101 | #### Papers 102 | 103 | [djb's curve25519 paper](http://cr.yp.to/ecdh/curve25519-20060209.pdf) 104 | 105 | #### License 106 | 107 | Public Domain, or MIT -------------------------------------------------------------------------------- /curve25519-donna-portable.h: -------------------------------------------------------------------------------- 1 | #include "curve25519-donna-portable-identify.h" 2 | 3 | #define mul32x32_64(a,b) (((uint64_t)(a))*(b)) 4 | 5 | /* platform */ 6 | #if defined(COMPILER_MSVC) 7 | #include 8 | #if !defined(_DEBUG) 9 | #undef mul32x32_64 10 | #define mul32x32_64(a,b) __emulu(a,b) 11 | #endif 12 | #undef inline 13 | #define inline __forceinline 14 | #define DONNA_INLINE __forceinline 15 | #define DONNA_NOINLINE __declspec(noinline) 16 | #define ALIGN(x) __declspec(align(x)) 17 | #define ROTL32(a,b) _rotl(a,b) 18 | #define ROTR32(a,b) _rotr(a,b) 19 | #else 20 | #include 21 | #define DONNA_INLINE inline __attribute__((always_inline)) 22 | #define DONNA_NOINLINE __attribute__((noinline)) 23 | #define ALIGN(x) __attribute__((aligned(x))) 24 | #define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) 25 | #define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) 26 | #endif 27 | 28 | /* uint128_t */ 29 | #if defined(CPU_64BITS) && !defined(ED25519_FORCE_32BIT) 30 | #if defined(COMPILER_CLANG) && (COMPILER_CLANG >= 30100) 31 | #define HAVE_NATIVE_UINT128 32 | typedef unsigned __int128 uint128_t; 33 | #elif defined(COMPILER_MSVC) 34 | #define HAVE_UINT128 35 | typedef struct uint128_t { 36 | uint64_t lo, hi; 37 | } uint128_t; 38 | #define mul64x64_128(out,a,b) out.lo = _umul128(a,b,&out.hi); 39 | #define shr128_pair(out,hi,lo,shift) out = __shiftright128(lo, hi, shift); 40 | #define shl128_pair(out,hi,lo,shift) out = __shiftleft128(lo, hi, shift); 41 | #define 
shr128(out,in,shift) shr128_pair(out, in.hi, in.lo, shift) 42 | #define shl128(out,in,shift) shl128_pair(out, in.hi, in.lo, shift) 43 | #define add128(a,b) { uint64_t p = a.lo; a.lo += b.lo; a.hi += b.hi + (a.lo < p); } 44 | #define add128_64(a,b) { uint64_t p = a.lo; a.lo += b; a.hi += (a.lo < p); } 45 | #define lo128(a) (a.lo) 46 | #define hi128(a) (a.hi) 47 | #elif defined(COMPILER_GCC) && !defined(HAVE_NATIVE_UINT128) 48 | #if defined(__SIZEOF_INT128__) 49 | #define HAVE_NATIVE_UINT128 50 | typedef unsigned __int128 uint128_t; 51 | #elif (COMPILER_GCC >= 40400) 52 | #define HAVE_NATIVE_UINT128 53 | typedef unsigned uint128_t __attribute__((mode(TI))); 54 | #elif defined(CPU_X86_64) 55 | #define HAVE_UINT128 56 | typedef struct uint128_t { 57 | uint64_t lo, hi; 58 | } uint128_t; 59 | #define mul64x64_128(out,a,b) __asm__ ("mulq %3" : "=a" (out.lo), "=d" (out.hi) : "a" (a), "rm" (b)); 60 | #define shr128_pair(out,hi,lo,shift) __asm__ ("shrdq %2,%1,%0" : "+r" (lo) : "r" (hi), "J" (shift)); out = lo; 61 | #define shl128_pair(out,hi,lo,shift) __asm__ ("shldq %2,%1,%0" : "+r" (hi) : "r" (lo), "J" (shift)); out = hi; 62 | #define shr128(out,in,shift) shr128_pair(out,in.hi, in.lo, shift) 63 | #define shl128(out,in,shift) shl128_pair(out,in.hi, in.lo, shift) 64 | #define add128(a,b) __asm__ ("addq %4,%2; adcq %5,%3" : "=r" (a.hi), "=r" (a.lo) : "1" (a.lo), "0" (a.hi), "rm" (b.lo), "rm" (b.hi) : "cc"); 65 | #define add128_64(a,b) __asm__ ("addq %4,%2; adcq $0,%3" : "=r" (a.hi), "=r" (a.lo) : "1" (a.lo), "0" (a.hi), "rm" (b) : "cc"); 66 | #define lo128(a) (a.lo) 67 | #define hi128(a) (a.hi) 68 | #endif 69 | #endif 70 | 71 | #if defined(HAVE_NATIVE_UINT128) 72 | #define HAVE_UINT128 73 | #define mul64x64_128(out,a,b) out = (uint128_t)a * b; 74 | #define shr128_pair(out,hi,lo,shift) out = (uint64_t)((((uint128_t)hi << 64) | lo) >> (shift)); 75 | #define shl128_pair(out,hi,lo,shift) out = (uint64_t)(((((uint128_t)hi << 64) | lo) << (shift)) >> 64); 76 | #define shr128(out,in,shift) out = (uint64_t)(in >> (shift)); 77 | #define shl128(out,in,shift) out = (uint64_t)((in << shift) >> 64); 78 | #define add128(a,b) a += b; 79 | #define add128_64(a,b) a += (uint64_t)b; 80 | #define lo128(a) ((uint64_t)a) 81 | #define hi128(a) ((uint64_t)(a >> 64)) 82 | #endif 83 | 84 | #if !defined(HAVE_UINT128) 85 | #error Need a uint128_t implementation! 86 | #endif 87 | #endif 88 | 89 | #include 90 | #include 91 | 92 | 93 | -------------------------------------------------------------------------------- /curve25519-optimizations-32bit.md: -------------------------------------------------------------------------------- 1 | Partial Reductions for multiplications 2 | -------------------------------------- 3 | 4 | It is possible to get away with partial reductions for multiplications 5 | instead of fully reducing everything. The largest input to square/mult 6 | will come from an unreduced add, which will double the element values. 7 | Test values are 1 bit larger than actual maximum values. 8 | 9 | max27 = (1 << 27) - 1 10 | max26 = (1 << 26) - 1 11 | 12 | Largest values from an add of full bit values (max27,max26,max27,max26..) 
13 | 14 | m0 0x1f1fffea8000042c 15 | m1 0x133ffff190000268 16 | m2 0x185fffef00000354 17 | m3 0x0ebffff4f00001d8 18 | m4 0x119ffff38000027c 19 | m5 0x0a3ffff850000148 20 | m6 0x0adffff8000001a4 21 | m7 0x05bffffbb00000b8 22 | m8 0x041ffffc800000cc 23 | m9 0x013fffff10000028 24 | 25 | Carry values from reducing sums 26 | 27 | c0 0x00000007c7fffaa0 28 | c1 0x000000099ffffcab 29 | c2 0x0000000617fffe27 30 | c3 0x000000075ffffd83 31 | c4 0x0000000467fffeb7 32 | c5 0x000000051ffffe5b 33 | c6 0x00000002b7ffff47 34 | c7 0x00000002dfffff33 35 | c8 0x0000000107ffffd7 36 | c9 0xa000000b 37 | c0 0x000002f8 38 | 39 | 40 | The largest carried value r1 could receive is 0x2f8, with everything else 41 | fitting in 25 or 26 bits. Assuming full values for everything, with 0x2f8 42 | added to r1 (max27,maxr1,max27,max26..): 43 | 44 | max27 = (1 << 27) - 1 45 | max26 = (1 << 26) - 1 46 | maxr1 = (((1 << 25) - 1) + 0x2f8) * 2 47 | 48 | m0 0x1f2006f77ffc7dac 49 | m1 0x134000508fffeaa8 50 | m2 0x1860004e004655d4 51 | m3 0x0ec00053efffea18 52 | m4 0x11a000527fffd2fc 53 | m5 0x0a4000574fffe988 54 | m6 0x0ae00056ffffd224 55 | m7 0x05c0005aafffe8f8 56 | m8 0x0420005b7fffd14c 57 | m9 0x0140005e0fffe868 58 | 59 | Carry values 60 | 61 | c0 0x00000007c801bddf 62 | c1 0x00000009a0002c2c 63 | c2 0x00000006180015e8 64 | c3 0x0000000760002d04 65 | c4 0x0000000468001678 66 | c5 0x0000000520002ddc 67 | c6 0x00000002b8001708 68 | c7 0x00000002e0002eb4 69 | c8 0x0000000108001798 70 | c9 0xa0002f8c 71 | c0 0x000002f9 72 | 73 | The largest carried value is now 0x2f9 (max27,maxr1b,max27,max26..) 74 | 75 | max27 = (1 << 27) - 1 76 | max26 = (1 << 26) - 1 77 | maxr1b = (((1 << 25) - 1) + 0x2f9) * 2 78 | 79 | m0 0x1f2006f9dffc7c7c 80 | m1 0x13400050afffeaa0 81 | m2 0x1860004e2046854c 82 | m3 0x0ec000540fffea10 83 | m4 0x11a000529fffd2ec 84 | m5 0x0a4000576fffe980 85 | m6 0x0ae000571fffd214 86 | m7 0x05c0005acfffe8f0 87 | m8 0x0420005b9fffd13c 88 | m9 0x0140005e2fffe860 89 | 90 | Carry values 91 | 92 | c0 0x00000007c801be77 93 | c1 0x00000009a0002c3c 94 | c2 0x00000006180015f0 95 | c3 0x0000000760002d14 96 | c4 0x0000000468001680 97 | c5 0x0000000520002dec 98 | c6 0x00000002b8001710 99 | c7 0x00000002e0002ec4 100 | c8 0x00000001080017a0 101 | c9 0xa0002f9c 102 | c0 0x000002f9 103 | 104 | The largest carried value is fixed at 0x2f9. Subtracting the largest values 105 | from 0 will result in r0 exceeding 26 bits, but r0-r4 are safe for 106 | multiplications up to 30 bits, so partial reductions throughout the entire 107 | calculation should be safe to chain. This especially helps with speeding up 108 | the SSE2 version by freeing it from large serial carry chains. Testing of 109 | course continues, but no problems as of yet have shown up. 110 | 111 | 112 | Subtraction 113 | ----------- 114 | Subtraction with unsigned elements is done using Emilia Kasper's trick, via 115 | agl: http://www.imperialviolet.org/2010/12/04/ecc.html 116 | 117 | Adding a large enough value that is equivalent to 0 mod p before subracting 118 | ensures no elements underflow. 119 | 120 | Compiler 121 | -------- 122 | gcc (as of 4.4.5) has a difficult time optimizing the 32 bit C version properly. 123 | icc produces code that is roughly 40% faster. -------------------------------------------------------------------------------- /python-src/curve25519/test/test_curve25519.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | 3 | import unittest 4 | 5 | from curve25519 import Private, Public 6 | from hashlib import sha1, sha256 7 | from binascii import hexlify 8 | 9 | class Basic(unittest.TestCase): 10 | def test_basic(self): 11 | secret1 = b"abcdefghijklmnopqrstuvwxyz123456" 12 | self.assertEqual(len(secret1), 32) 13 | 14 | secret2 = b"654321zyxwvutsrqponmlkjihgfedcba" 15 | self.assertEqual(len(secret2), 32) 16 | priv1 = Private(secret=secret1) 17 | pub1 = priv1.get_public() 18 | priv2 = Private(secret=secret2) 19 | pub2 = priv2.get_public() 20 | shared12 = priv1.get_shared_key(pub2) 21 | e = b"b0818125eab42a8ac1af5e8b9b9c15ed2605c2bbe9675de89e5e6e7f442b9598" 22 | self.assertEqual(hexlify(shared12), e) 23 | shared21 = priv2.get_shared_key(pub1) 24 | self.assertEqual(shared12, shared21) 25 | 26 | pub2a = Public(pub2.serialize()) 27 | shared12a = priv1.get_shared_key(pub2a) 28 | self.assertEqual(hexlify(shared12a), e) 29 | 30 | def test_errors(self): 31 | priv1 = Private() 32 | self.assertRaises(ValueError, priv1.get_shared_key, priv1) 33 | 34 | def test_seed(self): 35 | # use 32-byte secret 36 | self.assertRaises(TypeError, Private, secret=123) 37 | self.assertRaises(TypeError, Private, secret=b"too short") 38 | secret1 = b"abcdefghijklmnopqrstuvwxyz123456" 39 | assert len(secret1) == 32 40 | priv1 = Private(secret=secret1) 41 | priv1a = Private(secret=secret1) 42 | priv1b = Private(priv1.serialize()) 43 | self.assertEqual(priv1.serialize(), priv1a.serialize()) 44 | self.assertEqual(priv1.serialize(), priv1b.serialize()) 45 | e = b"6062636465666768696a6b6c6d6e6f707172737475767778797a313233343576" 46 | self.assertEqual(hexlify(priv1.serialize()), e) 47 | 48 | # the private key is a clamped form of the secret, so they won't 49 | # quite be the same 50 | p = Private(secret=b"\x00"*32) 51 | self.assertEqual(hexlify(p.serialize()), b"00"*31+b"40") 52 | p = Private(secret=b"\xff"*32) 53 | self.assertEqual(hexlify(p.serialize()), b"f8"+b"ff"*30+b"7f") 54 | 55 | # use arbitrary-length seed 56 | self.assertRaises(TypeError, Private, seed=123) 57 | priv1 = Private(seed=b"abc") 58 | priv1a = Private(seed=b"abc") 59 | priv1b = Private(priv1.serialize()) 60 | self.assertEqual(priv1.serialize(), priv1a.serialize()) 61 | self.assertEqual(priv1.serialize(), priv1b.serialize()) 62 | self.assertRaises(AssertionError, Private, seed=b"abc", secret=b"no") 63 | 64 | priv1 = Private(seed=b"abc") 65 | priv1a = Private(priv1.serialize()) 66 | self.assertEqual(priv1.serialize(), priv1a.serialize()) 67 | self.assertRaises(AssertionError, Private, seed=b"abc", secret=b"no") 68 | 69 | # use built-in os.urandom 70 | priv2 = Private() 71 | priv2a = Private(priv2.private) 72 | self.assertEqual(priv2.serialize(), priv2a.serialize()) 73 | 74 | # attempt to use both secret= and seed=, not allowed 75 | self.assertRaises(AssertionError, Private, seed=b"abc", secret=b"no") 76 | 77 | def test_hashfunc(self): 78 | priv1 = Private(seed=b"abc") 79 | priv2 = Private(seed=b"def") 80 | shared_sha256 = priv1.get_shared_key(priv2.get_public()) 81 | e = b"da959ffe77ebeb4757fe5ba310e28ede425ae0d0ff5ec9c884e2d08f311cf5e5" 82 | self.assertEqual(hexlify(shared_sha256), e) 83 | 84 | # confirm the hash function remains what we think it is 85 | def myhash(shared_key): 86 | return sha256(b"curve25519-shared:"+shared_key).digest() 87 | shared_myhash = priv1.get_shared_key(priv2.get_public(), myhash) 88 | self.assertEqual(hexlify(shared_myhash), e) 89 | 90 | def hexhash(shared_key): 91 | return sha1(shared_key).hexdigest().encode() 92 | 
shared_hexhash = priv1.get_shared_key(priv2.get_public(), hexhash) 93 | self.assertEqual(shared_hexhash, 94 | b"80eec98222c8edc4324fb9477a3c775ce7c6c93a") 95 | 96 | 97 | if __name__ == "__main__": 98 | unittest.main() 99 | 100 | -------------------------------------------------------------------------------- /fuzz/README.md: -------------------------------------------------------------------------------- 1 | This code fuzzes curve25519-donna (and optionally curve25519-donna-sse2) against the ref10 implementation of 2 | [curve25519](https://github.com/floodyberry/supercop/tree/master/crypto_scalarmult/curve25519/ref10). 3 | 4 | # Building 5 | 6 | ## *nix + PHP 7 | 8 | `php build-nix.php (required parameters) (optional parameters)` 9 | 10 | Required parameters: 11 | 12 | * `--bits=[32,64]` 13 | 14 | Optional parameters: 15 | 16 | * `--with-sse2` 17 | 18 | Also fuzz against ed25519-donna-sse2 19 | 20 | * `--compiler=[gcc,clang,icc]` 21 | 22 | Default: gcc 23 | 24 | * `--out=filename` 25 | 26 | Filename to write to. Default is fuzz-curve25519 27 | 28 | example: 29 | 30 | php build-nix.php --bits=64 --with-sse2 --compiler=icc 31 | 32 | ## Windows 33 | 34 | Create a project with access to the curve25519 files. 35 | 36 | Add the following files to the project: 37 | 38 | * `fuzz/curve25519-ref10.c` 39 | * `fuzz/curve25519-donna.c` 40 | * `fuzz/curve25519-donna-sse2.c` (optional) 41 | * `fuzz-curve25519.c` 42 | 43 | If you are also fuzzing against curve25519-donna-sse2, add the `CURVE25519_SSE2` define for `fuzz-curve25519.c` under 44 | its "Properties/Preprocessor/Preprocessor Definitions" option. 45 | 46 | # Running 47 | 48 | If everything agrees, the program will only output occasional status dots (every 0x100 passes) 49 | and a 64bit progress count (every 0x2000 passes): 50 | 51 | fuzzing: ref10 curve25519 curve25519-sse2 52 | 53 | ................................ [0000000000020000] 54 | ................................ [0000000000040000] 55 | ................................ [0000000000060000] 56 | ................................ [0000000000080000] 57 | ................................ [00000000000a0000] 58 | ................................ [00000000000c0000] 59 | 60 | If any of the implementations do not agree with the ref10 implementation, the program will dump 61 | the random data that was used, the data generated by the ref10 implementation, and diffs of the 62 | curve25519-donna data against the ref10 data. 63 | 64 | ## Example errors 65 | 66 | These are example error dumps (with intentionally introduced errors). 
67 | 68 | ### Curve25519 69 | 70 | Random data: 71 | 72 | * sk, or Secret Key 73 | 74 | Generated data: 75 | 76 | * pk, or Public Key 77 | * shared, or Derived Shared Key 78 | 79 | #### Public Key Mismatch 80 | 81 | sk: 82 | 0x51,0x24,0xb5,0xdf,0x10,0xbe,0x6e,0xb9,0x34,0x32,0x14,0x2d,0xed,0x34,0x85,0x9f, 83 | 0xd6,0xa5,0xf0,0x19,0x8f,0x12,0xa3,0x3e,0x3e,0xcf,0xf2,0x28,0x44,0xfc,0x63,0xea, 84 | 85 | 86 | ref10 pk: 87 | 0x7c,0x1d,0xe7,0x34,0xf8,0x23,0x9c,0x17,0x8e,0x0a,0xa3,0xa8,0xf9,0xe3,0x1a,0x0b, 88 | 0x19,0x65,0x59,0x98,0x41,0x0c,0x08,0x28,0xfc,0xed,0x70,0x76,0x5f,0x4a,0x06,0x0d, 89 | 90 | 91 | curve25519 pk diff: 92 | 0xb4,0x1a,0x17,0x34,0x95,0xfa,0xbd,0x62,0x96,0x94,0x04,0xdf,0xf3,0x4b,0x65,0x4b, 93 | 0x06,0x45,0xdf,0x25,0x0a,0x55,0xcc,0x4f,0xe7,0x89,0xf5,0x64,0xd9,0xb5,0x37,0x24, 94 | 95 | 96 | curve25519-sse2 pk diff: 97 | ____,____,____,____,____,____,____,____,____,____,____,____,____,____,____,____, 98 | ____,____,____,____,____,____,____,____,____,____,____,____,____,____,____,____, 99 | 100 | 101 | In this case, curve25519 is totally wrong, while curve25519-sse2 matches the reference 102 | implementation. 103 | 104 | #### Derived Shared Key Mismatch 105 | 106 | sk: 107 | 0xaf,0xd1,0x4f,0xce,0x36,0x5d,0x4d,0xb1,0x0d,0xb5,0x1e,0xe8,0x3f,0x35,0x82,0x40, 108 | 0x8d,0x3c,0x98,0x75,0x8a,0x5d,0xd0,0xda,0xe0,0xfe,0x94,0x8e,0x9f,0xd5,0x9f,0x71, 109 | 110 | 111 | pk: 112 | 0x45,0x52,0x5b,0xa3,0x3a,0x0d,0xe7,0xaf,0x55,0xeb,0x7e,0x97,0xc8,0xfb,0x32,0x3a, 113 | 0x8d,0xea,0xae,0x04,0x9a,0xc8,0x76,0x75,0xcf,0xa4,0xe3,0x12,0x95,0x03,0xc4,0x2a, 114 | 115 | 116 | ref10 shared: 117 | 0x07,0xb8,0x00,0xb1,0x9c,0xbd,0xa0,0x82,0x76,0x98,0xb3,0x02,0x0d,0x59,0xc6,0x13, 118 | 0x27,0xeb,0x5d,0x05,0x74,0x83,0x78,0x64,0x65,0x5b,0xd5,0x41,0xe1,0x32,0xe8,0x0b, 119 | 120 | 121 | curve25519 shared diff: 122 | ____,____,____,____,____,____,____,____,____,____,____,____,____,____,____,____, 123 | ____,____,____,____,____,____,____,____,____,____,____,____,____,____,____,____, 124 | 125 | 126 | curve25519-sse2 shared diff: 127 | 0xa0,0xa1,0x6f,0x72,0xd9,0x9a,0xbb,0xb3,0xef,0xb7,0xb2,____,0xa3,0xd0,0x6a,0x1e, 128 | 0x04,0x46,0x71,0xc8,0x37,0x85,0xea,0x33,0x68,0x0f,0xc2,0xf7,0xed,0xc7,0xea,0x76, 129 | 130 | This time curve25519-sse2 is off, while curve25519 matches the reference implementation. 
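For reference, a manual 64-bit, non-SSE2 build without the PHP wrapper amounts to roughly the commands below (a sketch of what `build-nix.php`'s `runcmd` calls expand to; swap in `-m32`, another compiler, or add the SSE2 object plus `-DCURVE25519_SSE2` as needed):

    gcc -O3 -m64 curve25519-ref10.c -c -o curve25519-ref10.o
    gcc -O3 -m64 curve25519-donna.c -c -o curve25519-donna.o
    gcc -O3 -m64 fuzz-curve25519.c curve25519-donna.o curve25519-ref10.o -o fuzz-curve25519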
-------------------------------------------------------------------------------- /fuzz/fuzz-curve25519.c: -------------------------------------------------------------------------------- 1 | #if defined(_WIN32) 2 | #include 3 | #include 4 | typedef unsigned int uint32_t; 5 | typedef unsigned __int64 uint64_t; 6 | #else 7 | #include 8 | #endif 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #include "curve25519-donna.h" 15 | #include "curve25519-ref10.h" 16 | 17 | static void 18 | print_diff(const char *desc, const unsigned char *a, const unsigned char *b, size_t len) { 19 | size_t p = 0; 20 | unsigned char diff; 21 | printf("%s diff:\n", desc); 22 | while (len--) { 23 | diff = *a++ ^ *b++; 24 | if (!diff) 25 | printf("____,"); 26 | else 27 | printf("0x%02x,", diff); 28 | if ((++p & 15) == 0) 29 | printf("\n"); 30 | } 31 | printf("\n\n"); 32 | } 33 | 34 | static void 35 | print_bytes(const char *desc, const unsigned char *bytes, size_t len) { 36 | size_t p = 0; 37 | printf("%s:\n", desc); 38 | while (len--) { 39 | printf("0x%02x,", *bytes++); 40 | if ((++p & 15) == 0) 41 | printf("\n"); 42 | } 43 | printf("\n\n"); 44 | } 45 | 46 | 47 | /* chacha20/12 prng */ 48 | void 49 | prng(unsigned char *out, size_t bytes) { 50 | static uint32_t state[16]; 51 | static int init = 0; 52 | uint32_t x[16], t; 53 | size_t i; 54 | 55 | if (!init) { 56 | #if defined(_WIN32) 57 | HCRYPTPROV csp; 58 | if (!CryptAcquireContext(&csp, 0, 0, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT)) { 59 | printf("CryptAcquireContext failed\n"); 60 | exit(1); 61 | } 62 | if (!CryptGenRandom(csp, (DWORD)sizeof(state), (BYTE*)state)) { 63 | printf("CryptGenRandom failed\n"); 64 | exit(1); 65 | } 66 | CryptReleaseContext(csp, 0); 67 | #else 68 | FILE *f = NULL; 69 | f = fopen("/dev/urandom", "rb"); 70 | if (!f) { 71 | printf("failed to open /dev/urandom\n"); 72 | exit(1); 73 | } 74 | if (fread(state, sizeof(state), 1, f) != 1) { 75 | printf("read error on /dev/urandom\n"); 76 | exit(1); 77 | } 78 | #endif 79 | init = 1; 80 | } 81 | 82 | while (bytes) { 83 | for (i = 0; i < 16; i++) x[i] = state[i]; 84 | 85 | #define rotl32(x,k) ((x << k) | (x >> (32 - k))) 86 | #define quarter(a,b,c,d) \ 87 | x[a] += x[b]; t = x[d]^x[a]; x[d] = rotl32(t,16); \ 88 | x[c] += x[d]; t = x[b]^x[c]; x[b] = rotl32(t,12); \ 89 | x[a] += x[b]; t = x[d]^x[a]; x[d] = rotl32(t, 8); \ 90 | x[c] += x[d]; t = x[b]^x[c]; x[b] = rotl32(t, 7); 91 | 92 | for (i = 0; i < 12; i += 2) { 93 | quarter( 0, 4, 8,12) 94 | quarter( 1, 5, 9,13) 95 | quarter( 2, 6,10,14) 96 | quarter( 3, 7,11,15) 97 | quarter( 0, 5,10,15) 98 | quarter( 1, 6,11,12) 99 | quarter( 2, 7, 8,13) 100 | quarter( 3, 4, 9,14) 101 | }; 102 | 103 | if (bytes <= 64) { 104 | memcpy(out, x, bytes); 105 | bytes = 0; 106 | } else { 107 | memcpy(out, x, 64); 108 | bytes -= 64; 109 | out += 64; 110 | } 111 | 112 | /* don't need a nonce, so last 4 words are the counter. 
2^136 bytes can be generated */ 113 | if (!++state[12]) if (!++state[13]) if (!++state[14]) ++state[15]; 114 | } 115 | } 116 | 117 | 118 | 119 | int main() { 120 | #define max_impls 32 121 | const size_t skmax = 1024; 122 | static curve25519_key sk[1024][2]; 123 | static curve25519_key pk[max_impls][2]; 124 | static curve25519_key shared[max_impls][2]; 125 | size_t ski, pki, sharedi, i, j; 126 | uint64_t ctr; 127 | 128 | printf("fuzzing: "); 129 | printf(" ref10"); 130 | printf(" curve25519"); 131 | #if defined(CURVE25519_SSE2) 132 | printf(" curve25519-sse2"); 133 | #endif 134 | printf("\n\n"); 135 | 136 | for (ctr = 0, ski = skmax;;ctr++,ski++) { 137 | if (ski == skmax) { 138 | prng((unsigned char *)sk, sizeof(sk)); 139 | ski = 0; 140 | } 141 | 142 | /* derive two public keys from two secret keys */ 143 | pki = 0; 144 | crypto_scalarmult_base_ref10(pk[pki][0], sk[ski][0]); 145 | crypto_scalarmult_base_ref10(pk[pki][1], sk[ski][1]); pki++; 146 | curve25519_donna_basepoint(pk[pki][0], sk[ski][0]); 147 | curve25519_donna_basepoint(pk[pki][1], sk[ski][1]); pki++; 148 | #if defined(CURVE25519_SSE2) 149 | curve25519_donna_basepoint_sse2(pk[pki][0], sk[ski][0]); 150 | curve25519_donna_basepoint_sse2(pk[pki][1], sk[ski][1]); pki++; 151 | #endif 152 | 153 | /* make sure all public keys match */ 154 | for (i = 1; i < pki; i++) { 155 | for (j = 0; j < 2; j++) { 156 | if (memcmp(pk[0][j], pk[i][j], 32) != 0) { 157 | printf("public key mismatch:" "\n\n"); 158 | print_bytes("sk", sk[ski][j], 32); 159 | print_bytes("ref10 pk", pk[0][j], 32); 160 | pki = 1; 161 | print_diff("curve25519 pk", pk[0][j], pk[pki][j], 32); pki++; 162 | #if defined(CURVE25519_SSE2) 163 | print_diff("curve25519-sse2 pk", pk[0][j], pk[pki][j], 32); pki++; 164 | #endif 165 | exit(1); 166 | } 167 | } 168 | } 169 | 170 | /* derive the shared secret between the two secret keys */ 171 | sharedi = 0; 172 | crypto_scalarmult_ref10(shared[sharedi][0], sk[ski][0], pk[sharedi][1]); 173 | crypto_scalarmult_ref10(shared[sharedi][1], sk[ski][1], pk[sharedi][0]); sharedi++; 174 | curve25519_donna(shared[sharedi][0], sk[ski][0], pk[sharedi][1]); 175 | curve25519_donna(shared[sharedi][1], sk[ski][1], pk[sharedi][0]); sharedi++; 176 | #if defined(CURVE25519_SSE2) 177 | curve25519_donna_sse2(shared[sharedi][0], sk[ski][0], pk[sharedi][1]); 178 | curve25519_donna_sse2(shared[sharedi][1], sk[ski][1], pk[sharedi][0]); sharedi++; 179 | #endif 180 | 181 | /* make sure all shared keys match */ 182 | for (i = 1; i < sharedi; i++) { 183 | for (j = 0; j < 2; j++) { 184 | if (memcmp(shared[0][j], shared[i][j], 32) != 0) { 185 | printf("shared key mismatch:" "\n\n"); 186 | print_bytes("sk", sk[ski][j], 32); 187 | print_bytes("pk", pk[0][j], 32); 188 | print_bytes("ref10 shared", shared[0][j], 32); sharedi++; 189 | sharedi = 1; 190 | print_diff("curve25519 shared", shared[0][j], shared[sharedi][j], 32); sharedi++; 191 | #if defined(CURVE25519_SSE2) 192 | print_diff("curve25519-sse2 shared", shared[0][j], shared[sharedi][j], 32); sharedi++; 193 | #endif 194 | exit(1); 195 | } 196 | } 197 | } 198 | 199 | if (ctr && (ctr % 0x100 == 0)) { 200 | printf("."); 201 | if ((ctr % 0x2000) == 0) { 202 | printf(" ["); 203 | for (i = 0; i < 8; i++) 204 | printf("%02x", (unsigned char)(ctr >> ((7 - i) * 8))); 205 | printf("]\n"); 206 | } 207 | } 208 | } 209 | } 210 | 211 | -------------------------------------------------------------------------------- /curve25519-donna-64bit.h: -------------------------------------------------------------------------------- 1 | typedef 
uint64_t bignum25519[5]; 2 | 3 | static const uint64_t reduce_mask_51 = ((uint64_t)1 << 51) - 1; 4 | static const uint64_t reduce_mask_52 = ((uint64_t)1 << 52) - 1; 5 | 6 | /* out = in */ 7 | DONNA_INLINE static void 8 | curve25519_copy(bignum25519 out, const bignum25519 in) { 9 | out[0] = in[0]; 10 | out[1] = in[1]; 11 | out[2] = in[2]; 12 | out[3] = in[3]; 13 | out[4] = in[4]; 14 | } 15 | 16 | /* out = a + b */ 17 | DONNA_INLINE static void 18 | curve25519_add(bignum25519 out, const bignum25519 a, const bignum25519 b) { 19 | out[0] = a[0] + b[0]; 20 | out[1] = a[1] + b[1]; 21 | out[2] = a[2] + b[2]; 22 | out[3] = a[3] + b[3]; 23 | out[4] = a[4] + b[4]; 24 | } 25 | 26 | static const uint64_t two54m152 = (((uint64_t)1) << 54) - 152; 27 | static const uint64_t two54m8 = (((uint64_t)1) << 54) - 8; 28 | 29 | /* out = a - b */ 30 | DONNA_INLINE static void 31 | curve25519_sub(bignum25519 out, const bignum25519 a, const bignum25519 b) { 32 | out[0] = a[0] + two54m152 - b[0]; 33 | out[1] = a[1] + two54m8 - b[1]; 34 | out[2] = a[2] + two54m8 - b[2]; 35 | out[3] = a[3] + two54m8 - b[3]; 36 | out[4] = a[4] + two54m8 - b[4]; 37 | } 38 | 39 | 40 | /* out = (in * scalar) */ 41 | DONNA_INLINE static void 42 | curve25519_scalar_product(bignum25519 out, const bignum25519 in, const uint64_t scalar) { 43 | uint128_t a; 44 | uint64_t c; 45 | 46 | #if defined(HAVE_NATIVE_UINT128) 47 | a = ((uint128_t) in[0]) * scalar; out[0] = (uint64_t)a & reduce_mask_51; c = (uint64_t)(a >> 51); 48 | a = ((uint128_t) in[1]) * scalar + c; out[1] = (uint64_t)a & reduce_mask_51; c = (uint64_t)(a >> 51); 49 | a = ((uint128_t) in[2]) * scalar + c; out[2] = (uint64_t)a & reduce_mask_51; c = (uint64_t)(a >> 51); 50 | a = ((uint128_t) in[3]) * scalar + c; out[3] = (uint64_t)a & reduce_mask_51; c = (uint64_t)(a >> 51); 51 | a = ((uint128_t) in[4]) * scalar + c; out[4] = (uint64_t)a & reduce_mask_51; c = (uint64_t)(a >> 51); 52 | out[0] += c * 19; 53 | #else 54 | mul64x64_128(a, in[0], scalar) out[0] = lo128(a) & reduce_mask_51; shr128(c, a, 51); 55 | mul64x64_128(a, in[1], scalar) add128_64(a, c) out[1] = lo128(a) & reduce_mask_51; shr128(c, a, 51); 56 | mul64x64_128(a, in[2], scalar) add128_64(a, c) out[2] = lo128(a) & reduce_mask_51; shr128(c, a, 51); 57 | mul64x64_128(a, in[3], scalar) add128_64(a, c) out[3] = lo128(a) & reduce_mask_51; shr128(c, a, 51); 58 | mul64x64_128(a, in[4], scalar) add128_64(a, c) out[4] = lo128(a) & reduce_mask_51; shr128(c, a, 51); 59 | out[0] += c * 19; 60 | #endif 61 | } 62 | 63 | /* out = a * b */ 64 | DONNA_INLINE static void 65 | curve25519_mul(bignum25519 out, const bignum25519 a, const bignum25519 b) { 66 | #if !defined(HAVE_NATIVE_UINT128) 67 | uint128_t mul; 68 | #endif 69 | uint128_t t[5]; 70 | uint64_t r0,r1,r2,r3,r4,s0,s1,s2,s3,s4,c; 71 | 72 | r0 = b[0]; 73 | r1 = b[1]; 74 | r2 = b[2]; 75 | r3 = b[3]; 76 | r4 = b[4]; 77 | 78 | s0 = a[0]; 79 | s1 = a[1]; 80 | s2 = a[2]; 81 | s3 = a[3]; 82 | s4 = a[4]; 83 | 84 | #if defined(HAVE_NATIVE_UINT128) 85 | t[0] = ((uint128_t) r0) * s0; 86 | t[1] = ((uint128_t) r0) * s1 + ((uint128_t) r1) * s0; 87 | t[2] = ((uint128_t) r0) * s2 + ((uint128_t) r2) * s0 + ((uint128_t) r1) * s1; 88 | t[3] = ((uint128_t) r0) * s3 + ((uint128_t) r3) * s0 + ((uint128_t) r1) * s2 + ((uint128_t) r2) * s1; 89 | t[4] = ((uint128_t) r0) * s4 + ((uint128_t) r4) * s0 + ((uint128_t) r3) * s1 + ((uint128_t) r1) * s3 + ((uint128_t) r2) * s2; 90 | #else 91 | mul64x64_128(t[0], r0, s0) 92 | mul64x64_128(t[1], r0, s1) mul64x64_128(mul, r1, s0) add128(t[1], mul) 93 | mul64x64_128(t[2], 
r0, s2) mul64x64_128(mul, r2, s0) add128(t[2], mul) mul64x64_128(mul, r1, s1) add128(t[2], mul) 94 | mul64x64_128(t[3], r0, s3) mul64x64_128(mul, r3, s0) add128(t[3], mul) mul64x64_128(mul, r1, s2) add128(t[3], mul) mul64x64_128(mul, r2, s1) add128(t[3], mul) 95 | mul64x64_128(t[4], r0, s4) mul64x64_128(mul, r4, s0) add128(t[4], mul) mul64x64_128(mul, r3, s1) add128(t[4], mul) mul64x64_128(mul, r1, s3) add128(t[4], mul) mul64x64_128(mul, r2, s2) add128(t[4], mul) 96 | #endif 97 | 98 | r1 *= 19; 99 | r2 *= 19; 100 | r3 *= 19; 101 | r4 *= 19; 102 | 103 | #if defined(HAVE_NATIVE_UINT128) 104 | t[0] += ((uint128_t) r4) * s1 + ((uint128_t) r1) * s4 + ((uint128_t) r2) * s3 + ((uint128_t) r3) * s2; 105 | t[1] += ((uint128_t) r4) * s2 + ((uint128_t) r2) * s4 + ((uint128_t) r3) * s3; 106 | t[2] += ((uint128_t) r4) * s3 + ((uint128_t) r3) * s4; 107 | t[3] += ((uint128_t) r4) * s4; 108 | #else 109 | mul64x64_128(mul, r4, s1) add128(t[0], mul) mul64x64_128(mul, r1, s4) add128(t[0], mul) mul64x64_128(mul, r2, s3) add128(t[0], mul) mul64x64_128(mul, r3, s2) add128(t[0], mul) 110 | mul64x64_128(mul, r4, s2) add128(t[1], mul) mul64x64_128(mul, r2, s4) add128(t[1], mul) mul64x64_128(mul, r3, s3) add128(t[1], mul) 111 | mul64x64_128(mul, r4, s3) add128(t[2], mul) mul64x64_128(mul, r3, s4) add128(t[2], mul) 112 | mul64x64_128(mul, r4, s4) add128(t[3], mul) 113 | #endif 114 | 115 | r0 = lo128(t[0]) & reduce_mask_51; shr128(c, t[0], 51); 116 | add128_64(t[1], c) r1 = lo128(t[1]) & reduce_mask_51; shr128(c, t[1], 51); 117 | add128_64(t[2], c) r2 = lo128(t[2]) & reduce_mask_51; shr128(c, t[2], 51); 118 | add128_64(t[3], c) r3 = lo128(t[3]) & reduce_mask_51; shr128(c, t[3], 51); 119 | add128_64(t[4], c) r4 = lo128(t[4]) & reduce_mask_51; shr128(c, t[4], 51); 120 | r0 += c * 19; c = r0 >> 51; r0 = r0 & reduce_mask_51; 121 | r1 += c; 122 | 123 | out[0] = r0; 124 | out[1] = r1; 125 | out[2] = r2; 126 | out[3] = r3; 127 | out[4] = r4; 128 | } 129 | 130 | /* out = in^(2 * count) */ 131 | DONNA_INLINE static void 132 | curve25519_square_times(bignum25519 out, const bignum25519 in, uint64_t count) { 133 | #if !defined(HAVE_NATIVE_UINT128) 134 | uint128_t mul; 135 | #endif 136 | uint128_t t[5]; 137 | uint64_t r0,r1,r2,r3,r4,c; 138 | uint64_t d0,d1,d2,d4,d419; 139 | 140 | r0 = in[0]; 141 | r1 = in[1]; 142 | r2 = in[2]; 143 | r3 = in[3]; 144 | r4 = in[4]; 145 | 146 | do { 147 | d0 = r0 * 2; 148 | d1 = r1 * 2; 149 | d2 = r2 * 2 * 19; 150 | d419 = r4 * 19; 151 | d4 = d419 * 2; 152 | 153 | #if defined(HAVE_NATIVE_UINT128) 154 | t[0] = ((uint128_t) r0) * r0 + ((uint128_t) d4) * r1 + (((uint128_t) d2) * (r3 )); 155 | t[1] = ((uint128_t) d0) * r1 + ((uint128_t) d4) * r2 + (((uint128_t) r3) * (r3 * 19)); 156 | t[2] = ((uint128_t) d0) * r2 + ((uint128_t) r1) * r1 + (((uint128_t) d4) * (r3 )); 157 | t[3] = ((uint128_t) d0) * r3 + ((uint128_t) d1) * r2 + (((uint128_t) r4) * (d419 )); 158 | t[4] = ((uint128_t) d0) * r4 + ((uint128_t) d1) * r3 + (((uint128_t) r2) * (r2 )); 159 | #else 160 | mul64x64_128(t[0], r0, r0) mul64x64_128(mul, d4, r1) add128(t[0], mul) mul64x64_128(mul, d2, r3) add128(t[0], mul) 161 | mul64x64_128(t[1], d0, r1) mul64x64_128(mul, d4, r2) add128(t[1], mul) mul64x64_128(mul, r3, r3 * 19) add128(t[1], mul) 162 | mul64x64_128(t[2], d0, r2) mul64x64_128(mul, r1, r1) add128(t[2], mul) mul64x64_128(mul, d4, r3) add128(t[2], mul) 163 | mul64x64_128(t[3], d0, r3) mul64x64_128(mul, d1, r2) add128(t[3], mul) mul64x64_128(mul, r4, d419) add128(t[3], mul) 164 | mul64x64_128(t[4], d0, r4) mul64x64_128(mul, d1, r3) 
add128(t[4], mul) mul64x64_128(mul, r2, r2) add128(t[4], mul) 165 | #endif 166 | 167 | r0 = lo128(t[0]) & reduce_mask_51; shr128(c, t[0], 51); 168 | add128_64(t[1], c) r1 = lo128(t[1]) & reduce_mask_51; shr128(c, t[1], 51); 169 | add128_64(t[2], c) r2 = lo128(t[2]) & reduce_mask_51; shr128(c, t[2], 51); 170 | add128_64(t[3], c) r3 = lo128(t[3]) & reduce_mask_51; shr128(c, t[3], 51); 171 | add128_64(t[4], c) r4 = lo128(t[4]) & reduce_mask_51; shr128(c, t[4], 51); 172 | r0 += c * 19; c = r0 >> 51; r0 = r0 & reduce_mask_51; 173 | r1 += c; 174 | } while(--count); 175 | 176 | out[0] = r0; 177 | out[1] = r1; 178 | out[2] = r2; 179 | out[3] = r3; 180 | out[4] = r4; 181 | } 182 | 183 | DONNA_INLINE static void 184 | curve25519_square(bignum25519 out, const bignum25519 in) { 185 | #if !defined(HAVE_NATIVE_UINT128) 186 | uint128_t mul; 187 | #endif 188 | uint128_t t[5]; 189 | uint64_t r0,r1,r2,r3,r4,c; 190 | uint64_t d0,d1,d2,d4,d419; 191 | 192 | r0 = in[0]; 193 | r1 = in[1]; 194 | r2 = in[2]; 195 | r3 = in[3]; 196 | r4 = in[4]; 197 | 198 | d0 = r0 * 2; 199 | d1 = r1 * 2; 200 | d2 = r2 * 2 * 19; 201 | d419 = r4 * 19; 202 | d4 = d419 * 2; 203 | 204 | #if defined(HAVE_NATIVE_UINT128) 205 | t[0] = ((uint128_t) r0) * r0 + ((uint128_t) d4) * r1 + (((uint128_t) d2) * (r3 )); 206 | t[1] = ((uint128_t) d0) * r1 + ((uint128_t) d4) * r2 + (((uint128_t) r3) * (r3 * 19)); 207 | t[2] = ((uint128_t) d0) * r2 + ((uint128_t) r1) * r1 + (((uint128_t) d4) * (r3 )); 208 | t[3] = ((uint128_t) d0) * r3 + ((uint128_t) d1) * r2 + (((uint128_t) r4) * (d419 )); 209 | t[4] = ((uint128_t) d0) * r4 + ((uint128_t) d1) * r3 + (((uint128_t) r2) * (r2 )); 210 | #else 211 | mul64x64_128(t[0], r0, r0) mul64x64_128(mul, d4, r1) add128(t[0], mul) mul64x64_128(mul, d2, r3) add128(t[0], mul) 212 | mul64x64_128(t[1], d0, r1) mul64x64_128(mul, d4, r2) add128(t[1], mul) mul64x64_128(mul, r3, r3 * 19) add128(t[1], mul) 213 | mul64x64_128(t[2], d0, r2) mul64x64_128(mul, r1, r1) add128(t[2], mul) mul64x64_128(mul, d4, r3) add128(t[2], mul) 214 | mul64x64_128(t[3], d0, r3) mul64x64_128(mul, d1, r2) add128(t[3], mul) mul64x64_128(mul, r4, d419) add128(t[3], mul) 215 | mul64x64_128(t[4], d0, r4) mul64x64_128(mul, d1, r3) add128(t[4], mul) mul64x64_128(mul, r2, r2) add128(t[4], mul) 216 | #endif 217 | 218 | r0 = lo128(t[0]) & reduce_mask_51; shr128(c, t[0], 51); 219 | add128_64(t[1], c) r1 = lo128(t[1]) & reduce_mask_51; shr128(c, t[1], 51); 220 | add128_64(t[2], c) r2 = lo128(t[2]) & reduce_mask_51; shr128(c, t[2], 51); 221 | add128_64(t[3], c) r3 = lo128(t[3]) & reduce_mask_51; shr128(c, t[3], 51); 222 | add128_64(t[4], c) r4 = lo128(t[4]) & reduce_mask_51; shr128(c, t[4], 51); 223 | r0 += c * 19; c = r0 >> 51; r0 = r0 & reduce_mask_51; 224 | r1 += c; 225 | 226 | out[0] = r0; 227 | out[1] = r1; 228 | out[2] = r2; 229 | out[3] = r3; 230 | out[4] = r4; 231 | } 232 | 233 | 234 | /* Take a little-endian, 32-byte number and expand it into polynomial form */ 235 | DONNA_INLINE static void 236 | curve25519_expand(bignum25519 out, const unsigned char *in) { 237 | static const union { uint8_t b[2]; uint16_t s; } endian_check = {{1,0}}; 238 | uint64_t x0,x1,x2,x3; 239 | 240 | if (endian_check.s == 1) { 241 | x0 = *(uint64_t *)(in + 0); 242 | x1 = *(uint64_t *)(in + 8); 243 | x2 = *(uint64_t *)(in + 16); 244 | x3 = *(uint64_t *)(in + 24); 245 | } else { 246 | #define F(s) \ 247 | ((((uint64_t)in[s + 0]) ) | \ 248 | (((uint64_t)in[s + 1]) << 8) | \ 249 | (((uint64_t)in[s + 2]) << 16) | \ 250 | (((uint64_t)in[s + 3]) << 24) | \ 251 | (((uint64_t)in[s + 4]) 
<< 32) | \ 252 | (((uint64_t)in[s + 5]) << 40) | \ 253 | (((uint64_t)in[s + 6]) << 48) | \ 254 | (((uint64_t)in[s + 7]) << 56)) 255 | 256 | x0 = F(0); 257 | x1 = F(8); 258 | x2 = F(16); 259 | x3 = F(24); 260 | } 261 | 262 | out[0] = x0 & reduce_mask_51; x0 = (x0 >> 51) | (x1 << 13); 263 | out[1] = x0 & reduce_mask_51; x1 = (x1 >> 38) | (x2 << 26); 264 | out[2] = x1 & reduce_mask_51; x2 = (x2 >> 25) | (x3 << 39); 265 | out[3] = x2 & reduce_mask_51; x3 = (x3 >> 12); 266 | out[4] = x3 & reduce_mask_51; /* ignore the top bit */ 267 | } 268 | 269 | /* Take a fully reduced polynomial form number and contract it into a 270 | * little-endian, 32-byte array 271 | */ 272 | DONNA_INLINE static void 273 | curve25519_contract(unsigned char *out, const bignum25519 input) { 274 | uint64_t t[5]; 275 | uint64_t f, i; 276 | 277 | t[0] = input[0]; 278 | t[1] = input[1]; 279 | t[2] = input[2]; 280 | t[3] = input[3]; 281 | t[4] = input[4]; 282 | 283 | #define curve25519_contract_carry() \ 284 | t[1] += t[0] >> 51; t[0] &= reduce_mask_51; \ 285 | t[2] += t[1] >> 51; t[1] &= reduce_mask_51; \ 286 | t[3] += t[2] >> 51; t[2] &= reduce_mask_51; \ 287 | t[4] += t[3] >> 51; t[3] &= reduce_mask_51; 288 | 289 | #define curve25519_contract_carry_full() curve25519_contract_carry() \ 290 | t[0] += 19 * (t[4] >> 51); t[4] &= reduce_mask_51; 291 | 292 | #define curve25519_contract_carry_final() curve25519_contract_carry() \ 293 | t[4] &= reduce_mask_51; 294 | 295 | curve25519_contract_carry_full() 296 | curve25519_contract_carry_full() 297 | 298 | /* now t is between 0 and 2^255-1, properly carried. */ 299 | /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */ 300 | t[0] += 19; 301 | curve25519_contract_carry_full() 302 | 303 | /* now between 19 and 2^255-1 in both cases, and offset by 19. */ 304 | t[0] += 0x8000000000000 - 19; 305 | t[1] += 0x8000000000000 - 1; 306 | t[2] += 0x8000000000000 - 1; 307 | t[3] += 0x8000000000000 - 1; 308 | t[4] += 0x8000000000000 - 1; 309 | 310 | /* now between 2^255 and 2^256-20, and offset by 2^255. 
*/ 311 | curve25519_contract_carry_final() 312 | 313 | #define write51full(n,shift) \ 314 | f = ((t[n] >> shift) | (t[n+1] << (51 - shift))); \ 315 | for (i = 0; i < 8; i++, f >>= 8) *out++ = (unsigned char)f; 316 | #define write51(n) write51full(n,13*n) 317 | 318 | write51(0) 319 | write51(1) 320 | write51(2) 321 | write51(3) 322 | 323 | #undef curve25519_contract_carry 324 | #undef curve25519_contract_carry_full 325 | #undef curve25519_contract_carry_final 326 | #undef write51full 327 | #undef write51 328 | } 329 | 330 | /* 331 | * Swap the contents of [qx] and [qpx] iff @swap is non-zero 332 | */ 333 | DONNA_INLINE static void 334 | curve25519_swap_conditional(bignum25519 x, bignum25519 qpx, uint64_t iswap) { 335 | const uint64_t swap = (uint64_t)(-(int64_t)iswap); 336 | uint64_t x0,x1,x2,x3,x4; 337 | 338 | x0 = swap & (x[0] ^ qpx[0]); x[0] ^= x0; qpx[0] ^= x0; 339 | x1 = swap & (x[1] ^ qpx[1]); x[1] ^= x1; qpx[1] ^= x1; 340 | x2 = swap & (x[2] ^ qpx[2]); x[2] ^= x2; qpx[2] ^= x2; 341 | x3 = swap & (x[3] ^ qpx[3]); x[3] ^= x3; qpx[3] ^= x3; 342 | x4 = swap & (x[4] ^ qpx[4]); x[4] ^= x4; qpx[4] ^= x4; 343 | 344 | } 345 | 346 | -------------------------------------------------------------------------------- /curve25519-donna-32bit.h: -------------------------------------------------------------------------------- 1 | typedef uint32_t bignum25519[10]; 2 | 3 | static const uint32_t reduce_mask_26 = (1 << 26) - 1; 4 | static const uint32_t reduce_mask_25 = (1 << 25) - 1; 5 | 6 | /* out = in */ 7 | DONNA_INLINE static void 8 | curve25519_copy(bignum25519 out, const bignum25519 in) { 9 | out[0] = in[0]; 10 | out[1] = in[1]; 11 | out[2] = in[2]; 12 | out[3] = in[3]; 13 | out[4] = in[4]; 14 | out[5] = in[5]; 15 | out[6] = in[6]; 16 | out[7] = in[7]; 17 | out[8] = in[8]; 18 | out[9] = in[9]; 19 | } 20 | 21 | /* out = a + b */ 22 | DONNA_INLINE static void 23 | curve25519_add(bignum25519 out, const bignum25519 a, const bignum25519 b) { 24 | out[0] = a[0] + b[0]; 25 | out[1] = a[1] + b[1]; 26 | out[2] = a[2] + b[2]; 27 | out[3] = a[3] + b[3]; 28 | out[4] = a[4] + b[4]; 29 | out[5] = a[5] + b[5]; 30 | out[6] = a[6] + b[6]; 31 | out[7] = a[7] + b[7]; 32 | out[8] = a[8] + b[8]; 33 | out[9] = a[9] + b[9]; 34 | } 35 | 36 | /* out = a - b */ 37 | DONNA_INLINE static void 38 | curve25519_sub(bignum25519 out, const bignum25519 a, const bignum25519 b) { 39 | uint32_t c; 40 | out[0] = 0x7ffffda + a[0] - b[0] ; c = (out[0] >> 26); out[0] &= reduce_mask_26; 41 | out[1] = 0x3fffffe + a[1] - b[1] + c; c = (out[1] >> 25); out[1] &= reduce_mask_25; 42 | out[2] = 0x7fffffe + a[2] - b[2] + c; c = (out[2] >> 26); out[2] &= reduce_mask_26; 43 | out[3] = 0x3fffffe + a[3] - b[3] + c; c = (out[3] >> 25); out[3] &= reduce_mask_25; 44 | out[4] = 0x7fffffe + a[4] - b[4] + c; c = (out[4] >> 26); out[4] &= reduce_mask_26; 45 | out[5] = 0x3fffffe + a[5] - b[5] + c; c = (out[5] >> 25); out[5] &= reduce_mask_25; 46 | out[6] = 0x7fffffe + a[6] - b[6] + c; c = (out[6] >> 26); out[6] &= reduce_mask_26; 47 | out[7] = 0x3fffffe + a[7] - b[7] + c; c = (out[7] >> 25); out[7] &= reduce_mask_25; 48 | out[8] = 0x7fffffe + a[8] - b[8] + c; c = (out[8] >> 26); out[8] &= reduce_mask_26; 49 | out[9] = 0x3fffffe + a[9] - b[9] + c; c = (out[9] >> 25); out[9] &= reduce_mask_25; 50 | out[0] += 19 * c; 51 | } 52 | 53 | /* out = in * scalar */ 54 | DONNA_INLINE static void 55 | curve25519_scalar_product(bignum25519 out, const bignum25519 in, const uint32_t scalar) { 56 | uint64_t a; 57 | uint32_t c; 58 | a = mul32x32_64(in[0], scalar); out[0] = 
(uint32_t)a & reduce_mask_26; c = (uint32_t)(a >> 26); 59 | a = mul32x32_64(in[1], scalar) + c; out[1] = (uint32_t)a & reduce_mask_25; c = (uint32_t)(a >> 25); 60 | a = mul32x32_64(in[2], scalar) + c; out[2] = (uint32_t)a & reduce_mask_26; c = (uint32_t)(a >> 26); 61 | a = mul32x32_64(in[3], scalar) + c; out[3] = (uint32_t)a & reduce_mask_25; c = (uint32_t)(a >> 25); 62 | a = mul32x32_64(in[4], scalar) + c; out[4] = (uint32_t)a & reduce_mask_26; c = (uint32_t)(a >> 26); 63 | a = mul32x32_64(in[5], scalar) + c; out[5] = (uint32_t)a & reduce_mask_25; c = (uint32_t)(a >> 25); 64 | a = mul32x32_64(in[6], scalar) + c; out[6] = (uint32_t)a & reduce_mask_26; c = (uint32_t)(a >> 26); 65 | a = mul32x32_64(in[7], scalar) + c; out[7] = (uint32_t)a & reduce_mask_25; c = (uint32_t)(a >> 25); 66 | a = mul32x32_64(in[8], scalar) + c; out[8] = (uint32_t)a & reduce_mask_26; c = (uint32_t)(a >> 26); 67 | a = mul32x32_64(in[9], scalar) + c; out[9] = (uint32_t)a & reduce_mask_25; c = (uint32_t)(a >> 25); 68 | out[0] += c * 19; 69 | } 70 | 71 | /* out = a * b */ 72 | DONNA_INLINE static void 73 | curve25519_mul(bignum25519 out, const bignum25519 a, const bignum25519 b) { 74 | uint32_t r0,r1,r2,r3,r4,r5,r6,r7,r8,r9; 75 | uint32_t s0,s1,s2,s3,s4,s5,s6,s7,s8,s9; 76 | uint64_t m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,c; 77 | uint32_t p; 78 | 79 | r0 = b[0]; 80 | r1 = b[1]; 81 | r2 = b[2]; 82 | r3 = b[3]; 83 | r4 = b[4]; 84 | r5 = b[5]; 85 | r6 = b[6]; 86 | r7 = b[7]; 87 | r8 = b[8]; 88 | r9 = b[9]; 89 | 90 | s0 = a[0]; 91 | s1 = a[1]; 92 | s2 = a[2]; 93 | s3 = a[3]; 94 | s4 = a[4]; 95 | s5 = a[5]; 96 | s6 = a[6]; 97 | s7 = a[7]; 98 | s8 = a[8]; 99 | s9 = a[9]; 100 | 101 | m1 = mul32x32_64(r0, s1) + mul32x32_64(r1, s0); 102 | m3 = mul32x32_64(r0, s3) + mul32x32_64(r1, s2) + mul32x32_64(r2, s1) + mul32x32_64(r3, s0); 103 | m5 = mul32x32_64(r0, s5) + mul32x32_64(r1, s4) + mul32x32_64(r2, s3) + mul32x32_64(r3, s2) + mul32x32_64(r4, s1) + mul32x32_64(r5, s0); 104 | m7 = mul32x32_64(r0, s7) + mul32x32_64(r1, s6) + mul32x32_64(r2, s5) + mul32x32_64(r3, s4) + mul32x32_64(r4, s3) + mul32x32_64(r5, s2) + mul32x32_64(r6, s1) + mul32x32_64(r7, s0); 105 | m9 = mul32x32_64(r0, s9) + mul32x32_64(r1, s8) + mul32x32_64(r2, s7) + mul32x32_64(r3, s6) + mul32x32_64(r4, s5) + mul32x32_64(r5, s4) + mul32x32_64(r6, s3) + mul32x32_64(r7, s2) + mul32x32_64(r8, s1) + mul32x32_64(r9, s0); 106 | 107 | r1 *= 2; 108 | r3 *= 2; 109 | r5 *= 2; 110 | r7 *= 2; 111 | 112 | m0 = mul32x32_64(r0, s0); 113 | m2 = mul32x32_64(r0, s2) + mul32x32_64(r1, s1) + mul32x32_64(r2, s0); 114 | m4 = mul32x32_64(r0, s4) + mul32x32_64(r1, s3) + mul32x32_64(r2, s2) + mul32x32_64(r3, s1) + mul32x32_64(r4, s0); 115 | m6 = mul32x32_64(r0, s6) + mul32x32_64(r1, s5) + mul32x32_64(r2, s4) + mul32x32_64(r3, s3) + mul32x32_64(r4, s2) + mul32x32_64(r5, s1) + mul32x32_64(r6, s0); 116 | m8 = mul32x32_64(r0, s8) + mul32x32_64(r1, s7) + mul32x32_64(r2, s6) + mul32x32_64(r3, s5) + mul32x32_64(r4, s4) + mul32x32_64(r5, s3) + mul32x32_64(r6, s2) + mul32x32_64(r7, s1) + mul32x32_64(r8, s0); 117 | 118 | r1 *= 19; 119 | r2 *= 19; 120 | r3 = (r3 / 2) * 19; 121 | r4 *= 19; 122 | r5 = (r5 / 2) * 19; 123 | r6 *= 19; 124 | r7 = (r7 / 2) * 19; 125 | r8 *= 19; 126 | r9 *= 19; 127 | 128 | m1 += (mul32x32_64(r9, s2) + mul32x32_64(r8, s3) + mul32x32_64(r7, s4) + mul32x32_64(r6, s5) + mul32x32_64(r5, s6) + mul32x32_64(r4, s7) + mul32x32_64(r3, s8) + mul32x32_64(r2, s9)); 129 | m3 += (mul32x32_64(r9, s4) + mul32x32_64(r8, s5) + mul32x32_64(r7, s6) + mul32x32_64(r6, s7) + mul32x32_64(r5, s8) + mul32x32_64(r4, 
s9)); 130 | m5 += (mul32x32_64(r9, s6) + mul32x32_64(r8, s7) + mul32x32_64(r7, s8) + mul32x32_64(r6, s9)); 131 | m7 += (mul32x32_64(r9, s8) + mul32x32_64(r8, s9)); 132 | 133 | r3 *= 2; 134 | r5 *= 2; 135 | r7 *= 2; 136 | r9 *= 2; 137 | 138 | m0 += (mul32x32_64(r9, s1) + mul32x32_64(r8, s2) + mul32x32_64(r7, s3) + mul32x32_64(r6, s4) + mul32x32_64(r5, s5) + mul32x32_64(r4, s6) + mul32x32_64(r3, s7) + mul32x32_64(r2, s8) + mul32x32_64(r1, s9)); 139 | m2 += (mul32x32_64(r9, s3) + mul32x32_64(r8, s4) + mul32x32_64(r7, s5) + mul32x32_64(r6, s6) + mul32x32_64(r5, s7) + mul32x32_64(r4, s8) + mul32x32_64(r3, s9)); 140 | m4 += (mul32x32_64(r9, s5) + mul32x32_64(r8, s6) + mul32x32_64(r7, s7) + mul32x32_64(r6, s8) + mul32x32_64(r5, s9)); 141 | m6 += (mul32x32_64(r9, s7) + mul32x32_64(r8, s8) + mul32x32_64(r7, s9)); 142 | m8 += (mul32x32_64(r9, s9)); 143 | 144 | r0 = (uint32_t)m0 & reduce_mask_26; c = (m0 >> 26); 145 | m1 += c; r1 = (uint32_t)m1 & reduce_mask_25; c = (m1 >> 25); 146 | m2 += c; r2 = (uint32_t)m2 & reduce_mask_26; c = (m2 >> 26); 147 | m3 += c; r3 = (uint32_t)m3 & reduce_mask_25; c = (m3 >> 25); 148 | m4 += c; r4 = (uint32_t)m4 & reduce_mask_26; c = (m4 >> 26); 149 | m5 += c; r5 = (uint32_t)m5 & reduce_mask_25; c = (m5 >> 25); 150 | m6 += c; r6 = (uint32_t)m6 & reduce_mask_26; c = (m6 >> 26); 151 | m7 += c; r7 = (uint32_t)m7 & reduce_mask_25; c = (m7 >> 25); 152 | m8 += c; r8 = (uint32_t)m8 & reduce_mask_26; c = (m8 >> 26); 153 | m9 += c; r9 = (uint32_t)m9 & reduce_mask_25; p = (uint32_t)(m9 >> 25); 154 | m0 = r0 + mul32x32_64(p,19); r0 = (uint32_t)m0 & reduce_mask_26; p = (uint32_t)(m0 >> 26); 155 | r1 += p; 156 | 157 | out[0] = r0; 158 | out[1] = r1; 159 | out[2] = r2; 160 | out[3] = r3; 161 | out[4] = r4; 162 | out[5] = r5; 163 | out[6] = r6; 164 | out[7] = r7; 165 | out[8] = r8; 166 | out[9] = r9; 167 | } 168 | 169 | /* out = in * in */ 170 | DONNA_INLINE static void 171 | curve25519_square(bignum25519 out, const bignum25519 in) { 172 | uint32_t r0,r1,r2,r3,r4,r5,r6,r7,r8,r9; 173 | uint32_t d6,d7,d8,d9; 174 | uint64_t m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,c; 175 | uint32_t p; 176 | 177 | r0 = in[0]; 178 | r1 = in[1]; 179 | r2 = in[2]; 180 | r3 = in[3]; 181 | r4 = in[4]; 182 | r5 = in[5]; 183 | r6 = in[6]; 184 | r7 = in[7]; 185 | r8 = in[8]; 186 | r9 = in[9]; 187 | 188 | 189 | m0 = mul32x32_64(r0, r0); 190 | r0 *= 2; 191 | m1 = mul32x32_64(r0, r1); 192 | m2 = mul32x32_64(r0, r2) + mul32x32_64(r1, r1 * 2); 193 | r1 *= 2; 194 | m3 = mul32x32_64(r0, r3) + mul32x32_64(r1, r2 ); 195 | m4 = mul32x32_64(r0, r4) + mul32x32_64(r1, r3 * 2) + mul32x32_64(r2, r2); 196 | r2 *= 2; 197 | m5 = mul32x32_64(r0, r5) + mul32x32_64(r1, r4 ) + mul32x32_64(r2, r3); 198 | m6 = mul32x32_64(r0, r6) + mul32x32_64(r1, r5 * 2) + mul32x32_64(r2, r4) + mul32x32_64(r3, r3 * 2); 199 | r3 *= 2; 200 | m7 = mul32x32_64(r0, r7) + mul32x32_64(r1, r6 ) + mul32x32_64(r2, r5) + mul32x32_64(r3, r4 ); 201 | m8 = mul32x32_64(r0, r8) + mul32x32_64(r1, r7 * 2) + mul32x32_64(r2, r6) + mul32x32_64(r3, r5 * 2) + mul32x32_64(r4, r4 ); 202 | m9 = mul32x32_64(r0, r9) + mul32x32_64(r1, r8 ) + mul32x32_64(r2, r7) + mul32x32_64(r3, r6 ) + mul32x32_64(r4, r5 * 2); 203 | 204 | d6 = r6 * 19; 205 | d7 = r7 * 2 * 19; 206 | d8 = r8 * 19; 207 | d9 = r9 * 2 * 19; 208 | 209 | m0 += (mul32x32_64(d9, r1 ) + mul32x32_64(d8, r2 ) + mul32x32_64(d7, r3 ) + mul32x32_64(d6, r4 * 2) + mul32x32_64(r5, r5 * 2 * 19)); 210 | m1 += (mul32x32_64(d9, r2 / 2) + mul32x32_64(d8, r3 ) + mul32x32_64(d7, r4 ) + mul32x32_64(d6, r5 * 2)); 211 | m2 += (mul32x32_64(d9, r3 ) + 
mul32x32_64(d8, r4 * 2) + mul32x32_64(d7, r5 * 2) + mul32x32_64(d6, r6 )); 212 | m3 += (mul32x32_64(d9, r4 ) + mul32x32_64(d8, r5 * 2) + mul32x32_64(d7, r6 )); 213 | m4 += (mul32x32_64(d9, r5 * 2) + mul32x32_64(d8, r6 * 2) + mul32x32_64(d7, r7 )); 214 | m5 += (mul32x32_64(d9, r6 ) + mul32x32_64(d8, r7 * 2)); 215 | m6 += (mul32x32_64(d9, r7 * 2) + mul32x32_64(d8, r8 )); 216 | m7 += (mul32x32_64(d9, r8 )); 217 | m8 += (mul32x32_64(d9, r9 )); 218 | 219 | r0 = (uint32_t)m0 & reduce_mask_26; c = (m0 >> 26); 220 | m1 += c; r1 = (uint32_t)m1 & reduce_mask_25; c = (m1 >> 25); 221 | m2 += c; r2 = (uint32_t)m2 & reduce_mask_26; c = (m2 >> 26); 222 | m3 += c; r3 = (uint32_t)m3 & reduce_mask_25; c = (m3 >> 25); 223 | m4 += c; r4 = (uint32_t)m4 & reduce_mask_26; c = (m4 >> 26); 224 | m5 += c; r5 = (uint32_t)m5 & reduce_mask_25; c = (m5 >> 25); 225 | m6 += c; r6 = (uint32_t)m6 & reduce_mask_26; c = (m6 >> 26); 226 | m7 += c; r7 = (uint32_t)m7 & reduce_mask_25; c = (m7 >> 25); 227 | m8 += c; r8 = (uint32_t)m8 & reduce_mask_26; c = (m8 >> 26); 228 | m9 += c; r9 = (uint32_t)m9 & reduce_mask_25; p = (uint32_t)(m9 >> 25); 229 | m0 = r0 + mul32x32_64(p,19); r0 = (uint32_t)m0 & reduce_mask_26; p = (uint32_t)(m0 >> 26); 230 | r1 += p; 231 | 232 | out[0] = r0; 233 | out[1] = r1; 234 | out[2] = r2; 235 | out[3] = r3; 236 | out[4] = r4; 237 | out[5] = r5; 238 | out[6] = r6; 239 | out[7] = r7; 240 | out[8] = r8; 241 | out[9] = r9; 242 | } 243 | 244 | /* out = in^(2 * count) */ 245 | static void 246 | curve25519_square_times(bignum25519 out, const bignum25519 in, int count) { 247 | uint32_t r0,r1,r2,r3,r4,r5,r6,r7,r8,r9; 248 | uint32_t d6,d7,d8,d9; 249 | uint64_t m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,c; 250 | uint32_t p; 251 | 252 | r0 = in[0]; 253 | r1 = in[1]; 254 | r2 = in[2]; 255 | r3 = in[3]; 256 | r4 = in[4]; 257 | r5 = in[5]; 258 | r6 = in[6]; 259 | r7 = in[7]; 260 | r8 = in[8]; 261 | r9 = in[9]; 262 | 263 | do { 264 | m0 = mul32x32_64(r0, r0); 265 | r0 *= 2; 266 | m1 = mul32x32_64(r0, r1); 267 | m2 = mul32x32_64(r0, r2) + mul32x32_64(r1, r1 * 2); 268 | r1 *= 2; 269 | m3 = mul32x32_64(r0, r3) + mul32x32_64(r1, r2 ); 270 | m4 = mul32x32_64(r0, r4) + mul32x32_64(r1, r3 * 2) + mul32x32_64(r2, r2); 271 | r2 *= 2; 272 | m5 = mul32x32_64(r0, r5) + mul32x32_64(r1, r4 ) + mul32x32_64(r2, r3); 273 | m6 = mul32x32_64(r0, r6) + mul32x32_64(r1, r5 * 2) + mul32x32_64(r2, r4) + mul32x32_64(r3, r3 * 2); 274 | r3 *= 2; 275 | m7 = mul32x32_64(r0, r7) + mul32x32_64(r1, r6 ) + mul32x32_64(r2, r5) + mul32x32_64(r3, r4 ); 276 | m8 = mul32x32_64(r0, r8) + mul32x32_64(r1, r7 * 2) + mul32x32_64(r2, r6) + mul32x32_64(r3, r5 * 2) + mul32x32_64(r4, r4 ); 277 | m9 = mul32x32_64(r0, r9) + mul32x32_64(r1, r8 ) + mul32x32_64(r2, r7) + mul32x32_64(r3, r6 ) + mul32x32_64(r4, r5 * 2); 278 | 279 | d6 = r6 * 19; 280 | d7 = r7 * 2 * 19; 281 | d8 = r8 * 19; 282 | d9 = r9 * 2 * 19; 283 | 284 | m0 += (mul32x32_64(d9, r1 ) + mul32x32_64(d8, r2 ) + mul32x32_64(d7, r3 ) + mul32x32_64(d6, r4 * 2) + mul32x32_64(r5, r5 * 2 * 19)); 285 | m1 += (mul32x32_64(d9, r2 / 2) + mul32x32_64(d8, r3 ) + mul32x32_64(d7, r4 ) + mul32x32_64(d6, r5 * 2)); 286 | m2 += (mul32x32_64(d9, r3 ) + mul32x32_64(d8, r4 * 2) + mul32x32_64(d7, r5 * 2) + mul32x32_64(d6, r6 )); 287 | m3 += (mul32x32_64(d9, r4 ) + mul32x32_64(d8, r5 * 2) + mul32x32_64(d7, r6 )); 288 | m4 += (mul32x32_64(d9, r5 * 2) + mul32x32_64(d8, r6 * 2) + mul32x32_64(d7, r7 )); 289 | m5 += (mul32x32_64(d9, r6 ) + mul32x32_64(d8, r7 * 2)); 290 | m6 += (mul32x32_64(d9, r7 * 2) + mul32x32_64(d8, r8 )); 291 | m7 += 
(mul32x32_64(d9, r8 )); 292 | m8 += (mul32x32_64(d9, r9 )); 293 | 294 | r0 = (uint32_t)m0 & reduce_mask_26; c = (m0 >> 26); 295 | m1 += c; r1 = (uint32_t)m1 & reduce_mask_25; c = (m1 >> 25); 296 | m2 += c; r2 = (uint32_t)m2 & reduce_mask_26; c = (m2 >> 26); 297 | m3 += c; r3 = (uint32_t)m3 & reduce_mask_25; c = (m3 >> 25); 298 | m4 += c; r4 = (uint32_t)m4 & reduce_mask_26; c = (m4 >> 26); 299 | m5 += c; r5 = (uint32_t)m5 & reduce_mask_25; c = (m5 >> 25); 300 | m6 += c; r6 = (uint32_t)m6 & reduce_mask_26; c = (m6 >> 26); 301 | m7 += c; r7 = (uint32_t)m7 & reduce_mask_25; c = (m7 >> 25); 302 | m8 += c; r8 = (uint32_t)m8 & reduce_mask_26; c = (m8 >> 26); 303 | m9 += c; r9 = (uint32_t)m9 & reduce_mask_25; p = (uint32_t)(m9 >> 25); 304 | m0 = r0 + mul32x32_64(p,19); r0 = (uint32_t)m0 & reduce_mask_26; p = (uint32_t)(m0 >> 26); 305 | r1 += p; 306 | } while (--count); 307 | 308 | out[0] = r0; 309 | out[1] = r1; 310 | out[2] = r2; 311 | out[3] = r3; 312 | out[4] = r4; 313 | out[5] = r5; 314 | out[6] = r6; 315 | out[7] = r7; 316 | out[8] = r8; 317 | out[9] = r9; 318 | } 319 | 320 | 321 | /* Take a little-endian, 32-byte number and expand it into polynomial form */ 322 | static void 323 | curve25519_expand(bignum25519 out, const unsigned char in[32]) { 324 | static const union { uint8_t b[2]; uint16_t s; } endian_check = {{1,0}}; 325 | uint32_t x0,x1,x2,x3,x4,x5,x6,x7; 326 | 327 | if (endian_check.s == 1) { 328 | x0 = *(uint32_t *)(in + 0); 329 | x1 = *(uint32_t *)(in + 4); 330 | x2 = *(uint32_t *)(in + 8); 331 | x3 = *(uint32_t *)(in + 12); 332 | x4 = *(uint32_t *)(in + 16); 333 | x5 = *(uint32_t *)(in + 20); 334 | x6 = *(uint32_t *)(in + 24); 335 | x7 = *(uint32_t *)(in + 28); 336 | } else { 337 | #define F(s) \ 338 | ((((uint32_t)in[s + 0]) ) | \ 339 | (((uint32_t)in[s + 1]) << 8) | \ 340 | (((uint32_t)in[s + 2]) << 16) | \ 341 | (((uint32_t)in[s + 3]) << 24)) 342 | x0 = F(0); 343 | x1 = F(4); 344 | x2 = F(8); 345 | x3 = F(12); 346 | x4 = F(16); 347 | x5 = F(20); 348 | x6 = F(24); 349 | x7 = F(28); 350 | #undef F 351 | } 352 | 353 | out[0] = ( x0 ) & reduce_mask_26; 354 | out[1] = ((((uint64_t)x1 << 32) | x0) >> 26) & reduce_mask_25; 355 | out[2] = ((((uint64_t)x2 << 32) | x1) >> 19) & reduce_mask_26; 356 | out[3] = ((((uint64_t)x3 << 32) | x2) >> 13) & reduce_mask_25; 357 | out[4] = (( x3) >> 6) & reduce_mask_26; 358 | out[5] = ( x4 ) & reduce_mask_25; 359 | out[6] = ((((uint64_t)x5 << 32) | x4) >> 25) & reduce_mask_26; 360 | out[7] = ((((uint64_t)x6 << 32) | x5) >> 19) & reduce_mask_25; 361 | out[8] = ((((uint64_t)x7 << 32) | x6) >> 12) & reduce_mask_26; 362 | out[9] = (( x7) >> 6) & reduce_mask_25; /* ignore the top bit */ 363 | } 364 | 365 | /* Take a fully reduced polynomial form number and contract it into a little-endian, 32-byte array */ 366 | static void 367 | curve25519_contract(unsigned char out[32], const bignum25519 in) { 368 | bignum25519 f; 369 | curve25519_copy(f, in); 370 | 371 | #define carry_pass() \ 372 | f[1] += f[0] >> 26; f[0] &= reduce_mask_26; \ 373 | f[2] += f[1] >> 25; f[1] &= reduce_mask_25; \ 374 | f[3] += f[2] >> 26; f[2] &= reduce_mask_26; \ 375 | f[4] += f[3] >> 25; f[3] &= reduce_mask_25; \ 376 | f[5] += f[4] >> 26; f[4] &= reduce_mask_26; \ 377 | f[6] += f[5] >> 25; f[5] &= reduce_mask_25; \ 378 | f[7] += f[6] >> 26; f[6] &= reduce_mask_26; \ 379 | f[8] += f[7] >> 25; f[7] &= reduce_mask_25; \ 380 | f[9] += f[8] >> 26; f[8] &= reduce_mask_26; 381 | 382 | #define carry_pass_full() \ 383 | carry_pass() \ 384 | f[0] += 19 * (f[9] >> 25); f[9] &= reduce_mask_25; 385 
| 386 | #define carry_pass_final() \ 387 | carry_pass() \ 388 | f[9] &= reduce_mask_25; 389 | 390 | carry_pass_full() 391 | carry_pass_full() 392 | 393 | /* now t is between 0 and 2^255-1, properly carried. */ 394 | /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */ 395 | f[0] += 19; 396 | carry_pass_full() 397 | 398 | /* now between 19 and 2^255-1 in both cases, and offset by 19. */ 399 | f[0] += (1 << 26) - 19; 400 | f[1] += (1 << 25) - 1; 401 | f[2] += (1 << 26) - 1; 402 | f[3] += (1 << 25) - 1; 403 | f[4] += (1 << 26) - 1; 404 | f[5] += (1 << 25) - 1; 405 | f[6] += (1 << 26) - 1; 406 | f[7] += (1 << 25) - 1; 407 | f[8] += (1 << 26) - 1; 408 | f[9] += (1 << 25) - 1; 409 | 410 | /* now between 2^255 and 2^256-20, and offset by 2^255. */ 411 | carry_pass_final() 412 | 413 | #undef carry_pass 414 | #undef carry_full 415 | #undef carry_final 416 | 417 | f[1] <<= 2; 418 | f[2] <<= 3; 419 | f[3] <<= 5; 420 | f[4] <<= 6; 421 | f[6] <<= 1; 422 | f[7] <<= 3; 423 | f[8] <<= 4; 424 | f[9] <<= 6; 425 | 426 | #define F(i, s) \ 427 | out[s+0] |= (unsigned char )(f[i] & 0xff); \ 428 | out[s+1] = (unsigned char )((f[i] >> 8) & 0xff); \ 429 | out[s+2] = (unsigned char )((f[i] >> 16) & 0xff); \ 430 | out[s+3] = (unsigned char )((f[i] >> 24) & 0xff); 431 | 432 | out[0] = 0; 433 | out[16] = 0; 434 | F(0,0); 435 | F(1,3); 436 | F(2,6); 437 | F(3,9); 438 | F(4,12); 439 | F(5,16); 440 | F(6,19); 441 | F(7,22); 442 | F(8,25); 443 | F(9,28); 444 | #undef F 445 | } 446 | 447 | /* 448 | * Swap the contents of [qx] and [qpx] iff @swap is non-zero 449 | */ 450 | DONNA_INLINE static void 451 | curve25519_swap_conditional(bignum25519 x, bignum25519 qpx, uint32_t iswap) { 452 | const uint32_t swap = (uint32_t)(-(int32_t)iswap); 453 | uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9; 454 | 455 | x0 = swap & (x[0] ^ qpx[0]); x[0] ^= x0; qpx[0] ^= x0; 456 | x1 = swap & (x[1] ^ qpx[1]); x[1] ^= x1; qpx[1] ^= x1; 457 | x2 = swap & (x[2] ^ qpx[2]); x[2] ^= x2; qpx[2] ^= x2; 458 | x3 = swap & (x[3] ^ qpx[3]); x[3] ^= x3; qpx[3] ^= x3; 459 | x4 = swap & (x[4] ^ qpx[4]); x[4] ^= x4; qpx[4] ^= x4; 460 | x5 = swap & (x[5] ^ qpx[5]); x[5] ^= x5; qpx[5] ^= x5; 461 | x6 = swap & (x[6] ^ qpx[6]); x[6] ^= x6; qpx[6] ^= x6; 462 | x7 = swap & (x[7] ^ qpx[7]); x[7] ^= x7; qpx[7] ^= x7; 463 | x8 = swap & (x[8] ^ qpx[8]); x[8] ^= x8; qpx[8] ^= x8; 464 | x9 = swap & (x[9] ^ qpx[9]); x[9] ^= x9; qpx[9] ^= x9; 465 | } 466 | 467 | -------------------------------------------------------------------------------- /fuzz/curve25519-ref10.c: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | 3 | typedef int32_t crypto_int32; 4 | typedef int64_t crypto_int64; 5 | typedef uint64_t crypto_uint64; 6 | 7 | typedef crypto_int32 fe[10]; 8 | 9 | /* 10 | h = 0 11 | */ 12 | 13 | void fe_0(fe h) 14 | { 15 | h[0] = 0; 16 | h[1] = 0; 17 | h[2] = 0; 18 | h[3] = 0; 19 | h[4] = 0; 20 | h[5] = 0; 21 | h[6] = 0; 22 | h[7] = 0; 23 | h[8] = 0; 24 | h[9] = 0; 25 | } 26 | 27 | /* 28 | h = 1 29 | */ 30 | 31 | void fe_1(fe h) 32 | { 33 | h[0] = 1; 34 | h[1] = 0; 35 | h[2] = 0; 36 | h[3] = 0; 37 | h[4] = 0; 38 | h[5] = 0; 39 | h[6] = 0; 40 | h[7] = 0; 41 | h[8] = 0; 42 | h[9] = 0; 43 | } 44 | 45 | /* 46 | h = f + g 47 | Can overlap h with f or g. 48 | 49 | Preconditions: 50 | |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. 51 | |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. 52 | 53 | Postconditions: 54 | |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
55 | */ 56 | 57 | void fe_add(fe h,fe f,fe g) 58 | { 59 | crypto_int32 f0 = f[0]; 60 | crypto_int32 f1 = f[1]; 61 | crypto_int32 f2 = f[2]; 62 | crypto_int32 f3 = f[3]; 63 | crypto_int32 f4 = f[4]; 64 | crypto_int32 f5 = f[5]; 65 | crypto_int32 f6 = f[6]; 66 | crypto_int32 f7 = f[7]; 67 | crypto_int32 f8 = f[8]; 68 | crypto_int32 f9 = f[9]; 69 | crypto_int32 g0 = g[0]; 70 | crypto_int32 g1 = g[1]; 71 | crypto_int32 g2 = g[2]; 72 | crypto_int32 g3 = g[3]; 73 | crypto_int32 g4 = g[4]; 74 | crypto_int32 g5 = g[5]; 75 | crypto_int32 g6 = g[6]; 76 | crypto_int32 g7 = g[7]; 77 | crypto_int32 g8 = g[8]; 78 | crypto_int32 g9 = g[9]; 79 | crypto_int32 h0 = f0 + g0; 80 | crypto_int32 h1 = f1 + g1; 81 | crypto_int32 h2 = f2 + g2; 82 | crypto_int32 h3 = f3 + g3; 83 | crypto_int32 h4 = f4 + g4; 84 | crypto_int32 h5 = f5 + g5; 85 | crypto_int32 h6 = f6 + g6; 86 | crypto_int32 h7 = f7 + g7; 87 | crypto_int32 h8 = f8 + g8; 88 | crypto_int32 h9 = f9 + g9; 89 | h[0] = h0; 90 | h[1] = h1; 91 | h[2] = h2; 92 | h[3] = h3; 93 | h[4] = h4; 94 | h[5] = h5; 95 | h[6] = h6; 96 | h[7] = h7; 97 | h[8] = h8; 98 | h[9] = h9; 99 | } 100 | 101 | /* 102 | h = f 103 | */ 104 | 105 | void fe_copy(fe h,fe f) 106 | { 107 | crypto_int32 f0 = f[0]; 108 | crypto_int32 f1 = f[1]; 109 | crypto_int32 f2 = f[2]; 110 | crypto_int32 f3 = f[3]; 111 | crypto_int32 f4 = f[4]; 112 | crypto_int32 f5 = f[5]; 113 | crypto_int32 f6 = f[6]; 114 | crypto_int32 f7 = f[7]; 115 | crypto_int32 f8 = f[8]; 116 | crypto_int32 f9 = f[9]; 117 | h[0] = f0; 118 | h[1] = f1; 119 | h[2] = f2; 120 | h[3] = f3; 121 | h[4] = f4; 122 | h[5] = f5; 123 | h[6] = f6; 124 | h[7] = f7; 125 | h[8] = f8; 126 | h[9] = f9; 127 | } 128 | 129 | 130 | /* 131 | Replace (f,g) with (g,f) if b == 1; 132 | replace (f,g) with (f,g) if b == 0. 133 | 134 | Preconditions: b in {0,1}. 
135 | */ 136 | 137 | void fe_cswap(fe f,fe g,unsigned int b) 138 | { 139 | crypto_int32 f0 = f[0]; 140 | crypto_int32 f1 = f[1]; 141 | crypto_int32 f2 = f[2]; 142 | crypto_int32 f3 = f[3]; 143 | crypto_int32 f4 = f[4]; 144 | crypto_int32 f5 = f[5]; 145 | crypto_int32 f6 = f[6]; 146 | crypto_int32 f7 = f[7]; 147 | crypto_int32 f8 = f[8]; 148 | crypto_int32 f9 = f[9]; 149 | crypto_int32 g0 = g[0]; 150 | crypto_int32 g1 = g[1]; 151 | crypto_int32 g2 = g[2]; 152 | crypto_int32 g3 = g[3]; 153 | crypto_int32 g4 = g[4]; 154 | crypto_int32 g5 = g[5]; 155 | crypto_int32 g6 = g[6]; 156 | crypto_int32 g7 = g[7]; 157 | crypto_int32 g8 = g[8]; 158 | crypto_int32 g9 = g[9]; 159 | crypto_int32 x0 = f0 ^ g0; 160 | crypto_int32 x1 = f1 ^ g1; 161 | crypto_int32 x2 = f2 ^ g2; 162 | crypto_int32 x3 = f3 ^ g3; 163 | crypto_int32 x4 = f4 ^ g4; 164 | crypto_int32 x5 = f5 ^ g5; 165 | crypto_int32 x6 = f6 ^ g6; 166 | crypto_int32 x7 = f7 ^ g7; 167 | crypto_int32 x8 = f8 ^ g8; 168 | crypto_int32 x9 = f9 ^ g9; 169 | b = -b; 170 | x0 &= b; 171 | x1 &= b; 172 | x2 &= b; 173 | x3 &= b; 174 | x4 &= b; 175 | x5 &= b; 176 | x6 &= b; 177 | x7 &= b; 178 | x8 &= b; 179 | x9 &= b; 180 | f[0] = f0 ^ x0; 181 | f[1] = f1 ^ x1; 182 | f[2] = f2 ^ x2; 183 | f[3] = f3 ^ x3; 184 | f[4] = f4 ^ x4; 185 | f[5] = f5 ^ x5; 186 | f[6] = f6 ^ x6; 187 | f[7] = f7 ^ x7; 188 | f[8] = f8 ^ x8; 189 | f[9] = f9 ^ x9; 190 | g[0] = g0 ^ x0; 191 | g[1] = g1 ^ x1; 192 | g[2] = g2 ^ x2; 193 | g[3] = g3 ^ x3; 194 | g[4] = g4 ^ x4; 195 | g[5] = g5 ^ x5; 196 | g[6] = g6 ^ x6; 197 | g[7] = g7 ^ x7; 198 | g[8] = g8 ^ x8; 199 | g[9] = g9 ^ x9; 200 | } 201 | 202 | static crypto_uint64 load_3(const unsigned char *in) 203 | { 204 | crypto_uint64 result; 205 | result = (crypto_uint64) in[0]; 206 | result |= ((crypto_uint64) in[1]) << 8; 207 | result |= ((crypto_uint64) in[2]) << 16; 208 | return result; 209 | } 210 | 211 | static crypto_uint64 load_4(const unsigned char *in) 212 | { 213 | crypto_uint64 result; 214 | result = (crypto_uint64) in[0]; 215 | result |= ((crypto_uint64) in[1]) << 8; 216 | result |= ((crypto_uint64) in[2]) << 16; 217 | result |= ((crypto_uint64) in[3]) << 24; 218 | return result; 219 | } 220 | 221 | void fe_frombytes(fe h,const unsigned char *s) 222 | { 223 | crypto_int64 h0 = load_4(s); 224 | crypto_int64 h1 = load_3(s + 4) << 6; 225 | crypto_int64 h2 = load_3(s + 7) << 5; 226 | crypto_int64 h3 = load_3(s + 10) << 3; 227 | crypto_int64 h4 = load_3(s + 13) << 2; 228 | crypto_int64 h5 = load_4(s + 16); 229 | crypto_int64 h6 = load_3(s + 20) << 7; 230 | crypto_int64 h7 = load_3(s + 23) << 5; 231 | crypto_int64 h8 = load_3(s + 26) << 4; 232 | crypto_int64 h9 = load_3(s + 29) << 2; 233 | crypto_int64 carry0; 234 | crypto_int64 carry1; 235 | crypto_int64 carry2; 236 | crypto_int64 carry3; 237 | crypto_int64 carry4; 238 | crypto_int64 carry5; 239 | crypto_int64 carry6; 240 | crypto_int64 carry7; 241 | crypto_int64 carry8; 242 | crypto_int64 carry9; 243 | 244 | carry9 = (h9 + (crypto_int64) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25; 245 | carry1 = (h1 + (crypto_int64) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25; 246 | carry3 = (h3 + (crypto_int64) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25; 247 | carry5 = (h5 + (crypto_int64) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25; 248 | carry7 = (h7 + (crypto_int64) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25; 249 | 250 | carry0 = (h0 + (crypto_int64) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26; 251 | carry2 = (h2 + (crypto_int64) (1<<25)) >> 26; h3 += carry2; h2 -= 
carry2 << 26; 252 | carry4 = (h4 + (crypto_int64) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26; 253 | carry6 = (h6 + (crypto_int64) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26; 254 | carry8 = (h8 + (crypto_int64) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26; 255 | 256 | h[0] = h0; 257 | h[1] = h1; 258 | h[2] = h2; 259 | h[3] = h3; 260 | h[4] = h4; 261 | h[5] = h5; 262 | h[6] = h6; 263 | h[7] = h7; 264 | h[8] = h8; 265 | h[9] = h9; 266 | } 267 | 268 | 269 | /* 270 | h = f * g 271 | Can overlap h with f or g. 272 | 273 | Preconditions: 274 | |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc. 275 | |g| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc. 276 | 277 | Postconditions: 278 | |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. 279 | */ 280 | 281 | /* 282 | Notes on implementation strategy: 283 | 284 | Using schoolbook multiplication. 285 | Karatsuba would save a little in some cost models. 286 | 287 | Most multiplications by 2 and 19 are 32-bit precomputations; 288 | cheaper than 64-bit postcomputations. 289 | 290 | There is one remaining multiplication by 19 in the carry chain; 291 | one *19 precomputation can be merged into this, 292 | but the resulting data flow is considerably less clean. 293 | 294 | There are 12 carries below. 295 | 10 of them are 2-way parallelizable and vectorizable. 296 | Can get away with 11 carries, but then data flow is much deeper. 297 | 298 | With tighter constraints on inputs can squeeze carries into int32. 299 | */ 300 | 301 | void fe_mul(fe h,fe f,fe g) 302 | { 303 | crypto_int32 f0 = f[0]; 304 | crypto_int32 f1 = f[1]; 305 | crypto_int32 f2 = f[2]; 306 | crypto_int32 f3 = f[3]; 307 | crypto_int32 f4 = f[4]; 308 | crypto_int32 f5 = f[5]; 309 | crypto_int32 f6 = f[6]; 310 | crypto_int32 f7 = f[7]; 311 | crypto_int32 f8 = f[8]; 312 | crypto_int32 f9 = f[9]; 313 | crypto_int32 g0 = g[0]; 314 | crypto_int32 g1 = g[1]; 315 | crypto_int32 g2 = g[2]; 316 | crypto_int32 g3 = g[3]; 317 | crypto_int32 g4 = g[4]; 318 | crypto_int32 g5 = g[5]; 319 | crypto_int32 g6 = g[6]; 320 | crypto_int32 g7 = g[7]; 321 | crypto_int32 g8 = g[8]; 322 | crypto_int32 g9 = g[9]; 323 | crypto_int32 g1_19 = 19 * g1; /* 1.4*2^29 */ 324 | crypto_int32 g2_19 = 19 * g2; /* 1.4*2^30; still ok */ 325 | crypto_int32 g3_19 = 19 * g3; 326 | crypto_int32 g4_19 = 19 * g4; 327 | crypto_int32 g5_19 = 19 * g5; 328 | crypto_int32 g6_19 = 19 * g6; 329 | crypto_int32 g7_19 = 19 * g7; 330 | crypto_int32 g8_19 = 19 * g8; 331 | crypto_int32 g9_19 = 19 * g9; 332 | crypto_int32 f1_2 = 2 * f1; 333 | crypto_int32 f3_2 = 2 * f3; 334 | crypto_int32 f5_2 = 2 * f5; 335 | crypto_int32 f7_2 = 2 * f7; 336 | crypto_int32 f9_2 = 2 * f9; 337 | crypto_int64 f0g0 = f0 * (crypto_int64) g0; 338 | crypto_int64 f0g1 = f0 * (crypto_int64) g1; 339 | crypto_int64 f0g2 = f0 * (crypto_int64) g2; 340 | crypto_int64 f0g3 = f0 * (crypto_int64) g3; 341 | crypto_int64 f0g4 = f0 * (crypto_int64) g4; 342 | crypto_int64 f0g5 = f0 * (crypto_int64) g5; 343 | crypto_int64 f0g6 = f0 * (crypto_int64) g6; 344 | crypto_int64 f0g7 = f0 * (crypto_int64) g7; 345 | crypto_int64 f0g8 = f0 * (crypto_int64) g8; 346 | crypto_int64 f0g9 = f0 * (crypto_int64) g9; 347 | crypto_int64 f1g0 = f1 * (crypto_int64) g0; 348 | crypto_int64 f1g1_2 = f1_2 * (crypto_int64) g1; 349 | crypto_int64 f1g2 = f1 * (crypto_int64) g2; 350 | crypto_int64 f1g3_2 = f1_2 * (crypto_int64) g3; 351 | crypto_int64 f1g4 = f1 * (crypto_int64) g4; 352 | crypto_int64 f1g5_2 = f1_2 * (crypto_int64) g5; 353 | crypto_int64 f1g6 = f1 * (crypto_int64) g6; 354 | 
crypto_int64 f1g7_2 = f1_2 * (crypto_int64) g7; 355 | crypto_int64 f1g8 = f1 * (crypto_int64) g8; 356 | crypto_int64 f1g9_38 = f1_2 * (crypto_int64) g9_19; 357 | crypto_int64 f2g0 = f2 * (crypto_int64) g0; 358 | crypto_int64 f2g1 = f2 * (crypto_int64) g1; 359 | crypto_int64 f2g2 = f2 * (crypto_int64) g2; 360 | crypto_int64 f2g3 = f2 * (crypto_int64) g3; 361 | crypto_int64 f2g4 = f2 * (crypto_int64) g4; 362 | crypto_int64 f2g5 = f2 * (crypto_int64) g5; 363 | crypto_int64 f2g6 = f2 * (crypto_int64) g6; 364 | crypto_int64 f2g7 = f2 * (crypto_int64) g7; 365 | crypto_int64 f2g8_19 = f2 * (crypto_int64) g8_19; 366 | crypto_int64 f2g9_19 = f2 * (crypto_int64) g9_19; 367 | crypto_int64 f3g0 = f3 * (crypto_int64) g0; 368 | crypto_int64 f3g1_2 = f3_2 * (crypto_int64) g1; 369 | crypto_int64 f3g2 = f3 * (crypto_int64) g2; 370 | crypto_int64 f3g3_2 = f3_2 * (crypto_int64) g3; 371 | crypto_int64 f3g4 = f3 * (crypto_int64) g4; 372 | crypto_int64 f3g5_2 = f3_2 * (crypto_int64) g5; 373 | crypto_int64 f3g6 = f3 * (crypto_int64) g6; 374 | crypto_int64 f3g7_38 = f3_2 * (crypto_int64) g7_19; 375 | crypto_int64 f3g8_19 = f3 * (crypto_int64) g8_19; 376 | crypto_int64 f3g9_38 = f3_2 * (crypto_int64) g9_19; 377 | crypto_int64 f4g0 = f4 * (crypto_int64) g0; 378 | crypto_int64 f4g1 = f4 * (crypto_int64) g1; 379 | crypto_int64 f4g2 = f4 * (crypto_int64) g2; 380 | crypto_int64 f4g3 = f4 * (crypto_int64) g3; 381 | crypto_int64 f4g4 = f4 * (crypto_int64) g4; 382 | crypto_int64 f4g5 = f4 * (crypto_int64) g5; 383 | crypto_int64 f4g6_19 = f4 * (crypto_int64) g6_19; 384 | crypto_int64 f4g7_19 = f4 * (crypto_int64) g7_19; 385 | crypto_int64 f4g8_19 = f4 * (crypto_int64) g8_19; 386 | crypto_int64 f4g9_19 = f4 * (crypto_int64) g9_19; 387 | crypto_int64 f5g0 = f5 * (crypto_int64) g0; 388 | crypto_int64 f5g1_2 = f5_2 * (crypto_int64) g1; 389 | crypto_int64 f5g2 = f5 * (crypto_int64) g2; 390 | crypto_int64 f5g3_2 = f5_2 * (crypto_int64) g3; 391 | crypto_int64 f5g4 = f5 * (crypto_int64) g4; 392 | crypto_int64 f5g5_38 = f5_2 * (crypto_int64) g5_19; 393 | crypto_int64 f5g6_19 = f5 * (crypto_int64) g6_19; 394 | crypto_int64 f5g7_38 = f5_2 * (crypto_int64) g7_19; 395 | crypto_int64 f5g8_19 = f5 * (crypto_int64) g8_19; 396 | crypto_int64 f5g9_38 = f5_2 * (crypto_int64) g9_19; 397 | crypto_int64 f6g0 = f6 * (crypto_int64) g0; 398 | crypto_int64 f6g1 = f6 * (crypto_int64) g1; 399 | crypto_int64 f6g2 = f6 * (crypto_int64) g2; 400 | crypto_int64 f6g3 = f6 * (crypto_int64) g3; 401 | crypto_int64 f6g4_19 = f6 * (crypto_int64) g4_19; 402 | crypto_int64 f6g5_19 = f6 * (crypto_int64) g5_19; 403 | crypto_int64 f6g6_19 = f6 * (crypto_int64) g6_19; 404 | crypto_int64 f6g7_19 = f6 * (crypto_int64) g7_19; 405 | crypto_int64 f6g8_19 = f6 * (crypto_int64) g8_19; 406 | crypto_int64 f6g9_19 = f6 * (crypto_int64) g9_19; 407 | crypto_int64 f7g0 = f7 * (crypto_int64) g0; 408 | crypto_int64 f7g1_2 = f7_2 * (crypto_int64) g1; 409 | crypto_int64 f7g2 = f7 * (crypto_int64) g2; 410 | crypto_int64 f7g3_38 = f7_2 * (crypto_int64) g3_19; 411 | crypto_int64 f7g4_19 = f7 * (crypto_int64) g4_19; 412 | crypto_int64 f7g5_38 = f7_2 * (crypto_int64) g5_19; 413 | crypto_int64 f7g6_19 = f7 * (crypto_int64) g6_19; 414 | crypto_int64 f7g7_38 = f7_2 * (crypto_int64) g7_19; 415 | crypto_int64 f7g8_19 = f7 * (crypto_int64) g8_19; 416 | crypto_int64 f7g9_38 = f7_2 * (crypto_int64) g9_19; 417 | crypto_int64 f8g0 = f8 * (crypto_int64) g0; 418 | crypto_int64 f8g1 = f8 * (crypto_int64) g1; 419 | crypto_int64 f8g2_19 = f8 * (crypto_int64) g2_19; 420 | crypto_int64 f8g3_19 = f8 * 
(crypto_int64) g3_19; 421 | crypto_int64 f8g4_19 = f8 * (crypto_int64) g4_19; 422 | crypto_int64 f8g5_19 = f8 * (crypto_int64) g5_19; 423 | crypto_int64 f8g6_19 = f8 * (crypto_int64) g6_19; 424 | crypto_int64 f8g7_19 = f8 * (crypto_int64) g7_19; 425 | crypto_int64 f8g8_19 = f8 * (crypto_int64) g8_19; 426 | crypto_int64 f8g9_19 = f8 * (crypto_int64) g9_19; 427 | crypto_int64 f9g0 = f9 * (crypto_int64) g0; 428 | crypto_int64 f9g1_38 = f9_2 * (crypto_int64) g1_19; 429 | crypto_int64 f9g2_19 = f9 * (crypto_int64) g2_19; 430 | crypto_int64 f9g3_38 = f9_2 * (crypto_int64) g3_19; 431 | crypto_int64 f9g4_19 = f9 * (crypto_int64) g4_19; 432 | crypto_int64 f9g5_38 = f9_2 * (crypto_int64) g5_19; 433 | crypto_int64 f9g6_19 = f9 * (crypto_int64) g6_19; 434 | crypto_int64 f9g7_38 = f9_2 * (crypto_int64) g7_19; 435 | crypto_int64 f9g8_19 = f9 * (crypto_int64) g8_19; 436 | crypto_int64 f9g9_38 = f9_2 * (crypto_int64) g9_19; 437 | crypto_int64 h0 = f0g0+f1g9_38+f2g8_19+f3g7_38+f4g6_19+f5g5_38+f6g4_19+f7g3_38+f8g2_19+f9g1_38; 438 | crypto_int64 h1 = f0g1+f1g0 +f2g9_19+f3g8_19+f4g7_19+f5g6_19+f6g5_19+f7g4_19+f8g3_19+f9g2_19; 439 | crypto_int64 h2 = f0g2+f1g1_2 +f2g0 +f3g9_38+f4g8_19+f5g7_38+f6g6_19+f7g5_38+f8g4_19+f9g3_38; 440 | crypto_int64 h3 = f0g3+f1g2 +f2g1 +f3g0 +f4g9_19+f5g8_19+f6g7_19+f7g6_19+f8g5_19+f9g4_19; 441 | crypto_int64 h4 = f0g4+f1g3_2 +f2g2 +f3g1_2 +f4g0 +f5g9_38+f6g8_19+f7g7_38+f8g6_19+f9g5_38; 442 | crypto_int64 h5 = f0g5+f1g4 +f2g3 +f3g2 +f4g1 +f5g0 +f6g9_19+f7g8_19+f8g7_19+f9g6_19; 443 | crypto_int64 h6 = f0g6+f1g5_2 +f2g4 +f3g3_2 +f4g2 +f5g1_2 +f6g0 +f7g9_38+f8g8_19+f9g7_38; 444 | crypto_int64 h7 = f0g7+f1g6 +f2g5 +f3g4 +f4g3 +f5g2 +f6g1 +f7g0 +f8g9_19+f9g8_19; 445 | crypto_int64 h8 = f0g8+f1g7_2 +f2g6 +f3g5_2 +f4g4 +f5g3_2 +f6g2 +f7g1_2 +f8g0 +f9g9_38; 446 | crypto_int64 h9 = f0g9+f1g8 +f2g7 +f3g6 +f4g5 +f5g4 +f6g3 +f7g2 +f8g1 +f9g0 ; 447 | crypto_int64 carry0; 448 | crypto_int64 carry1; 449 | crypto_int64 carry2; 450 | crypto_int64 carry3; 451 | crypto_int64 carry4; 452 | crypto_int64 carry5; 453 | crypto_int64 carry6; 454 | crypto_int64 carry7; 455 | crypto_int64 carry8; 456 | crypto_int64 carry9; 457 | 458 | /* 459 | |h0| <= (1.1*1.1*2^52*(1+19+19+19+19)+1.1*1.1*2^50*(38+38+38+38+38)) 460 | i.e. |h0| <= 1.2*2^59; narrower ranges for h2, h4, h6, h8 461 | |h1| <= (1.1*1.1*2^51*(1+1+19+19+19+19+19+19+19+19)) 462 | i.e. 
|h1| <= 1.5*2^58; narrower ranges for h3, h5, h7, h9 463 | */ 464 | 465 | carry0 = (h0 + (crypto_int64) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26; 466 | carry4 = (h4 + (crypto_int64) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26; 467 | /* |h0| <= 2^25 */ 468 | /* |h4| <= 2^25 */ 469 | /* |h1| <= 1.51*2^58 */ 470 | /* |h5| <= 1.51*2^58 */ 471 | 472 | carry1 = (h1 + (crypto_int64) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25; 473 | carry5 = (h5 + (crypto_int64) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25; 474 | /* |h1| <= 2^24; from now on fits into int32 */ 475 | /* |h5| <= 2^24; from now on fits into int32 */ 476 | /* |h2| <= 1.21*2^59 */ 477 | /* |h6| <= 1.21*2^59 */ 478 | 479 | carry2 = (h2 + (crypto_int64) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26; 480 | carry6 = (h6 + (crypto_int64) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26; 481 | /* |h2| <= 2^25; from now on fits into int32 unchanged */ 482 | /* |h6| <= 2^25; from now on fits into int32 unchanged */ 483 | /* |h3| <= 1.51*2^58 */ 484 | /* |h7| <= 1.51*2^58 */ 485 | 486 | carry3 = (h3 + (crypto_int64) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25; 487 | carry7 = (h7 + (crypto_int64) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25; 488 | /* |h3| <= 2^24; from now on fits into int32 unchanged */ 489 | /* |h7| <= 2^24; from now on fits into int32 unchanged */ 490 | /* |h4| <= 1.52*2^33 */ 491 | /* |h8| <= 1.52*2^33 */ 492 | 493 | carry4 = (h4 + (crypto_int64) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26; 494 | carry8 = (h8 + (crypto_int64) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26; 495 | /* |h4| <= 2^25; from now on fits into int32 unchanged */ 496 | /* |h8| <= 2^25; from now on fits into int32 unchanged */ 497 | /* |h5| <= 1.01*2^24 */ 498 | /* |h9| <= 1.51*2^58 */ 499 | 500 | carry9 = (h9 + (crypto_int64) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25; 501 | /* |h9| <= 2^24; from now on fits into int32 unchanged */ 502 | /* |h0| <= 1.8*2^37 */ 503 | 504 | carry0 = (h0 + (crypto_int64) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26; 505 | /* |h0| <= 2^25; from now on fits into int32 unchanged */ 506 | /* |h1| <= 1.01*2^24 */ 507 | 508 | h[0] = h0; 509 | h[1] = h1; 510 | h[2] = h2; 511 | h[3] = h3; 512 | h[4] = h4; 513 | h[5] = h5; 514 | h[6] = h6; 515 | h[7] = h7; 516 | h[8] = h8; 517 | h[9] = h9; 518 | } 519 | 520 | /* 521 | h = f * 121666 522 | Can overlap h with f. 523 | 524 | Preconditions: 525 | |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc. 526 | 527 | Postconditions: 528 | |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. 
529 | */ 530 | 531 | void fe_mul121666(fe h,fe f) 532 | { 533 | crypto_int32 f0 = f[0]; 534 | crypto_int32 f1 = f[1]; 535 | crypto_int32 f2 = f[2]; 536 | crypto_int32 f3 = f[3]; 537 | crypto_int32 f4 = f[4]; 538 | crypto_int32 f5 = f[5]; 539 | crypto_int32 f6 = f[6]; 540 | crypto_int32 f7 = f[7]; 541 | crypto_int32 f8 = f[8]; 542 | crypto_int32 f9 = f[9]; 543 | crypto_int64 h0 = f0 * (crypto_int64) 121666; 544 | crypto_int64 h1 = f1 * (crypto_int64) 121666; 545 | crypto_int64 h2 = f2 * (crypto_int64) 121666; 546 | crypto_int64 h3 = f3 * (crypto_int64) 121666; 547 | crypto_int64 h4 = f4 * (crypto_int64) 121666; 548 | crypto_int64 h5 = f5 * (crypto_int64) 121666; 549 | crypto_int64 h6 = f6 * (crypto_int64) 121666; 550 | crypto_int64 h7 = f7 * (crypto_int64) 121666; 551 | crypto_int64 h8 = f8 * (crypto_int64) 121666; 552 | crypto_int64 h9 = f9 * (crypto_int64) 121666; 553 | crypto_int64 carry0; 554 | crypto_int64 carry1; 555 | crypto_int64 carry2; 556 | crypto_int64 carry3; 557 | crypto_int64 carry4; 558 | crypto_int64 carry5; 559 | crypto_int64 carry6; 560 | crypto_int64 carry7; 561 | crypto_int64 carry8; 562 | crypto_int64 carry9; 563 | 564 | carry9 = (h9 + (crypto_int64) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25; 565 | carry1 = (h1 + (crypto_int64) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25; 566 | carry3 = (h3 + (crypto_int64) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25; 567 | carry5 = (h5 + (crypto_int64) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25; 568 | carry7 = (h7 + (crypto_int64) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25; 569 | 570 | carry0 = (h0 + (crypto_int64) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26; 571 | carry2 = (h2 + (crypto_int64) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26; 572 | carry4 = (h4 + (crypto_int64) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26; 573 | carry6 = (h6 + (crypto_int64) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26; 574 | carry8 = (h8 + (crypto_int64) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26; 575 | 576 | h[0] = h0; 577 | h[1] = h1; 578 | h[2] = h2; 579 | h[3] = h3; 580 | h[4] = h4; 581 | h[5] = h5; 582 | h[6] = h6; 583 | h[7] = h7; 584 | h[8] = h8; 585 | h[9] = h9; 586 | } 587 | 588 | /* 589 | h = f * f 590 | Can overlap h with f. 591 | 592 | Preconditions: 593 | |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc. 594 | 595 | Postconditions: 596 | |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. 597 | */ 598 | 599 | /* 600 | See fe_mul.c for discussion of implementation strategy. 
601 | */ 602 | 603 | void fe_sq(fe h,fe f) 604 | { 605 | crypto_int32 f0 = f[0]; 606 | crypto_int32 f1 = f[1]; 607 | crypto_int32 f2 = f[2]; 608 | crypto_int32 f3 = f[3]; 609 | crypto_int32 f4 = f[4]; 610 | crypto_int32 f5 = f[5]; 611 | crypto_int32 f6 = f[6]; 612 | crypto_int32 f7 = f[7]; 613 | crypto_int32 f8 = f[8]; 614 | crypto_int32 f9 = f[9]; 615 | crypto_int32 f0_2 = 2 * f0; 616 | crypto_int32 f1_2 = 2 * f1; 617 | crypto_int32 f2_2 = 2 * f2; 618 | crypto_int32 f3_2 = 2 * f3; 619 | crypto_int32 f4_2 = 2 * f4; 620 | crypto_int32 f5_2 = 2 * f5; 621 | crypto_int32 f6_2 = 2 * f6; 622 | crypto_int32 f7_2 = 2 * f7; 623 | crypto_int32 f5_38 = 38 * f5; /* 1.31*2^30 */ 624 | crypto_int32 f6_19 = 19 * f6; /* 1.31*2^30 */ 625 | crypto_int32 f7_38 = 38 * f7; /* 1.31*2^30 */ 626 | crypto_int32 f8_19 = 19 * f8; /* 1.31*2^30 */ 627 | crypto_int32 f9_38 = 38 * f9; /* 1.31*2^30 */ 628 | crypto_int64 f0f0 = f0 * (crypto_int64) f0; 629 | crypto_int64 f0f1_2 = f0_2 * (crypto_int64) f1; 630 | crypto_int64 f0f2_2 = f0_2 * (crypto_int64) f2; 631 | crypto_int64 f0f3_2 = f0_2 * (crypto_int64) f3; 632 | crypto_int64 f0f4_2 = f0_2 * (crypto_int64) f4; 633 | crypto_int64 f0f5_2 = f0_2 * (crypto_int64) f5; 634 | crypto_int64 f0f6_2 = f0_2 * (crypto_int64) f6; 635 | crypto_int64 f0f7_2 = f0_2 * (crypto_int64) f7; 636 | crypto_int64 f0f8_2 = f0_2 * (crypto_int64) f8; 637 | crypto_int64 f0f9_2 = f0_2 * (crypto_int64) f9; 638 | crypto_int64 f1f1_2 = f1_2 * (crypto_int64) f1; 639 | crypto_int64 f1f2_2 = f1_2 * (crypto_int64) f2; 640 | crypto_int64 f1f3_4 = f1_2 * (crypto_int64) f3_2; 641 | crypto_int64 f1f4_2 = f1_2 * (crypto_int64) f4; 642 | crypto_int64 f1f5_4 = f1_2 * (crypto_int64) f5_2; 643 | crypto_int64 f1f6_2 = f1_2 * (crypto_int64) f6; 644 | crypto_int64 f1f7_4 = f1_2 * (crypto_int64) f7_2; 645 | crypto_int64 f1f8_2 = f1_2 * (crypto_int64) f8; 646 | crypto_int64 f1f9_76 = f1_2 * (crypto_int64) f9_38; 647 | crypto_int64 f2f2 = f2 * (crypto_int64) f2; 648 | crypto_int64 f2f3_2 = f2_2 * (crypto_int64) f3; 649 | crypto_int64 f2f4_2 = f2_2 * (crypto_int64) f4; 650 | crypto_int64 f2f5_2 = f2_2 * (crypto_int64) f5; 651 | crypto_int64 f2f6_2 = f2_2 * (crypto_int64) f6; 652 | crypto_int64 f2f7_2 = f2_2 * (crypto_int64) f7; 653 | crypto_int64 f2f8_38 = f2_2 * (crypto_int64) f8_19; 654 | crypto_int64 f2f9_38 = f2 * (crypto_int64) f9_38; 655 | crypto_int64 f3f3_2 = f3_2 * (crypto_int64) f3; 656 | crypto_int64 f3f4_2 = f3_2 * (crypto_int64) f4; 657 | crypto_int64 f3f5_4 = f3_2 * (crypto_int64) f5_2; 658 | crypto_int64 f3f6_2 = f3_2 * (crypto_int64) f6; 659 | crypto_int64 f3f7_76 = f3_2 * (crypto_int64) f7_38; 660 | crypto_int64 f3f8_38 = f3_2 * (crypto_int64) f8_19; 661 | crypto_int64 f3f9_76 = f3_2 * (crypto_int64) f9_38; 662 | crypto_int64 f4f4 = f4 * (crypto_int64) f4; 663 | crypto_int64 f4f5_2 = f4_2 * (crypto_int64) f5; 664 | crypto_int64 f4f6_38 = f4_2 * (crypto_int64) f6_19; 665 | crypto_int64 f4f7_38 = f4 * (crypto_int64) f7_38; 666 | crypto_int64 f4f8_38 = f4_2 * (crypto_int64) f8_19; 667 | crypto_int64 f4f9_38 = f4 * (crypto_int64) f9_38; 668 | crypto_int64 f5f5_38 = f5 * (crypto_int64) f5_38; 669 | crypto_int64 f5f6_38 = f5_2 * (crypto_int64) f6_19; 670 | crypto_int64 f5f7_76 = f5_2 * (crypto_int64) f7_38; 671 | crypto_int64 f5f8_38 = f5_2 * (crypto_int64) f8_19; 672 | crypto_int64 f5f9_76 = f5_2 * (crypto_int64) f9_38; 673 | crypto_int64 f6f6_19 = f6 * (crypto_int64) f6_19; 674 | crypto_int64 f6f7_38 = f6 * (crypto_int64) f7_38; 675 | crypto_int64 f6f8_38 = f6_2 * (crypto_int64) f8_19; 676 | crypto_int64 
f6f9_38 = f6 * (crypto_int64) f9_38; 677 | crypto_int64 f7f7_38 = f7 * (crypto_int64) f7_38; 678 | crypto_int64 f7f8_38 = f7_2 * (crypto_int64) f8_19; 679 | crypto_int64 f7f9_76 = f7_2 * (crypto_int64) f9_38; 680 | crypto_int64 f8f8_19 = f8 * (crypto_int64) f8_19; 681 | crypto_int64 f8f9_38 = f8 * (crypto_int64) f9_38; 682 | crypto_int64 f9f9_38 = f9 * (crypto_int64) f9_38; 683 | crypto_int64 h0 = f0f0 +f1f9_76+f2f8_38+f3f7_76+f4f6_38+f5f5_38; 684 | crypto_int64 h1 = f0f1_2+f2f9_38+f3f8_38+f4f7_38+f5f6_38; 685 | crypto_int64 h2 = f0f2_2+f1f1_2 +f3f9_76+f4f8_38+f5f7_76+f6f6_19; 686 | crypto_int64 h3 = f0f3_2+f1f2_2 +f4f9_38+f5f8_38+f6f7_38; 687 | crypto_int64 h4 = f0f4_2+f1f3_4 +f2f2 +f5f9_76+f6f8_38+f7f7_38; 688 | crypto_int64 h5 = f0f5_2+f1f4_2 +f2f3_2 +f6f9_38+f7f8_38; 689 | crypto_int64 h6 = f0f6_2+f1f5_4 +f2f4_2 +f3f3_2 +f7f9_76+f8f8_19; 690 | crypto_int64 h7 = f0f7_2+f1f6_2 +f2f5_2 +f3f4_2 +f8f9_38; 691 | crypto_int64 h8 = f0f8_2+f1f7_4 +f2f6_2 +f3f5_4 +f4f4 +f9f9_38; 692 | crypto_int64 h9 = f0f9_2+f1f8_2 +f2f7_2 +f3f6_2 +f4f5_2; 693 | crypto_int64 carry0; 694 | crypto_int64 carry1; 695 | crypto_int64 carry2; 696 | crypto_int64 carry3; 697 | crypto_int64 carry4; 698 | crypto_int64 carry5; 699 | crypto_int64 carry6; 700 | crypto_int64 carry7; 701 | crypto_int64 carry8; 702 | crypto_int64 carry9; 703 | 704 | carry0 = (h0 + (crypto_int64) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26; 705 | carry4 = (h4 + (crypto_int64) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26; 706 | 707 | carry1 = (h1 + (crypto_int64) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25; 708 | carry5 = (h5 + (crypto_int64) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25; 709 | 710 | carry2 = (h2 + (crypto_int64) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26; 711 | carry6 = (h6 + (crypto_int64) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26; 712 | 713 | carry3 = (h3 + (crypto_int64) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25; 714 | carry7 = (h7 + (crypto_int64) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25; 715 | 716 | carry4 = (h4 + (crypto_int64) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26; 717 | carry8 = (h8 + (crypto_int64) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26; 718 | 719 | carry9 = (h9 + (crypto_int64) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25; 720 | 721 | carry0 = (h0 + (crypto_int64) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26; 722 | 723 | h[0] = h0; 724 | h[1] = h1; 725 | h[2] = h2; 726 | h[3] = h3; 727 | h[4] = h4; 728 | h[5] = h5; 729 | h[6] = h6; 730 | h[7] = h7; 731 | h[8] = h8; 732 | h[9] = h9; 733 | } 734 | 735 | /* 736 | h = f - g 737 | Can overlap h with f or g. 738 | 739 | Preconditions: 740 | |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. 741 | |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. 742 | 743 | Postconditions: 744 | |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc. 
745 | */ 746 | 747 | void fe_sub(fe h,fe f,fe g) 748 | { 749 | crypto_int32 f0 = f[0]; 750 | crypto_int32 f1 = f[1]; 751 | crypto_int32 f2 = f[2]; 752 | crypto_int32 f3 = f[3]; 753 | crypto_int32 f4 = f[4]; 754 | crypto_int32 f5 = f[5]; 755 | crypto_int32 f6 = f[6]; 756 | crypto_int32 f7 = f[7]; 757 | crypto_int32 f8 = f[8]; 758 | crypto_int32 f9 = f[9]; 759 | crypto_int32 g0 = g[0]; 760 | crypto_int32 g1 = g[1]; 761 | crypto_int32 g2 = g[2]; 762 | crypto_int32 g3 = g[3]; 763 | crypto_int32 g4 = g[4]; 764 | crypto_int32 g5 = g[5]; 765 | crypto_int32 g6 = g[6]; 766 | crypto_int32 g7 = g[7]; 767 | crypto_int32 g8 = g[8]; 768 | crypto_int32 g9 = g[9]; 769 | crypto_int32 h0 = f0 - g0; 770 | crypto_int32 h1 = f1 - g1; 771 | crypto_int32 h2 = f2 - g2; 772 | crypto_int32 h3 = f3 - g3; 773 | crypto_int32 h4 = f4 - g4; 774 | crypto_int32 h5 = f5 - g5; 775 | crypto_int32 h6 = f6 - g6; 776 | crypto_int32 h7 = f7 - g7; 777 | crypto_int32 h8 = f8 - g8; 778 | crypto_int32 h9 = f9 - g9; 779 | h[0] = h0; 780 | h[1] = h1; 781 | h[2] = h2; 782 | h[3] = h3; 783 | h[4] = h4; 784 | h[5] = h5; 785 | h[6] = h6; 786 | h[7] = h7; 787 | h[8] = h8; 788 | h[9] = h9; 789 | } 790 | 791 | /* 792 | Preconditions: 793 | |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. 794 | 795 | Write p=2^255-19; q=floor(h/p). 796 | Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))). 797 | 798 | Proof: 799 | Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4. 800 | Also have |h-2^230 h9|<2^230 so |19 2^(-255)(h-2^230 h9)|<1/4. 801 | 802 | Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9). 803 | Then 0> 25; 841 | q = (h0 + q) >> 26; 842 | q = (h1 + q) >> 25; 843 | q = (h2 + q) >> 26; 844 | q = (h3 + q) >> 25; 845 | q = (h4 + q) >> 26; 846 | q = (h5 + q) >> 25; 847 | q = (h6 + q) >> 26; 848 | q = (h7 + q) >> 25; 849 | q = (h8 + q) >> 26; 850 | q = (h9 + q) >> 25; 851 | 852 | /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */ 853 | h0 += 19 * q; 854 | /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */ 855 | 856 | carry0 = h0 >> 26; h1 += carry0; h0 -= carry0 << 26; 857 | carry1 = h1 >> 25; h2 += carry1; h1 -= carry1 << 25; 858 | carry2 = h2 >> 26; h3 += carry2; h2 -= carry2 << 26; 859 | carry3 = h3 >> 25; h4 += carry3; h3 -= carry3 << 25; 860 | carry4 = h4 >> 26; h5 += carry4; h4 -= carry4 << 26; 861 | carry5 = h5 >> 25; h6 += carry5; h5 -= carry5 << 25; 862 | carry6 = h6 >> 26; h7 += carry6; h6 -= carry6 << 26; 863 | carry7 = h7 >> 25; h8 += carry7; h7 -= carry7 << 25; 864 | carry8 = h8 >> 26; h9 += carry8; h8 -= carry8 << 26; 865 | carry9 = h9 >> 25; h9 -= carry9 << 25; 866 | /* h10 = carry9 */ 867 | 868 | /* 869 | Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20. 870 | Have h0+...+2^230 h9 between 0 and 2^255-1; 871 | evidently 2^255 h10-2^255 q = 0. 872 | Goal: Output h0+...+2^230 h9. 
873 | */ 874 | 875 | s[0] = h0 >> 0; 876 | s[1] = h0 >> 8; 877 | s[2] = h0 >> 16; 878 | s[3] = (h0 >> 24) | (h1 << 2); 879 | s[4] = h1 >> 6; 880 | s[5] = h1 >> 14; 881 | s[6] = (h1 >> 22) | (h2 << 3); 882 | s[7] = h2 >> 5; 883 | s[8] = h2 >> 13; 884 | s[9] = (h2 >> 21) | (h3 << 5); 885 | s[10] = h3 >> 3; 886 | s[11] = h3 >> 11; 887 | s[12] = (h3 >> 19) | (h4 << 6); 888 | s[13] = h4 >> 2; 889 | s[14] = h4 >> 10; 890 | s[15] = h4 >> 18; 891 | s[16] = h5 >> 0; 892 | s[17] = h5 >> 8; 893 | s[18] = h5 >> 16; 894 | s[19] = (h5 >> 24) | (h6 << 1); 895 | s[20] = h6 >> 7; 896 | s[21] = h6 >> 15; 897 | s[22] = (h6 >> 23) | (h7 << 3); 898 | s[23] = h7 >> 5; 899 | s[24] = h7 >> 13; 900 | s[25] = (h7 >> 21) | (h8 << 4); 901 | s[26] = h8 >> 4; 902 | s[27] = h8 >> 12; 903 | s[28] = (h8 >> 20) | (h9 << 6); 904 | s[29] = h9 >> 2; 905 | s[30] = h9 >> 10; 906 | s[31] = h9 >> 18; 907 | } 908 | 909 | void fe_invert(fe out,fe z) 910 | { 911 | fe t0; 912 | fe t1; 913 | fe t2; 914 | fe t3; 915 | int i; 916 | 917 | 918 | /* qhasm: fe z1 */ 919 | 920 | /* qhasm: fe z2 */ 921 | 922 | /* qhasm: fe z8 */ 923 | 924 | /* qhasm: fe z9 */ 925 | 926 | /* qhasm: fe z11 */ 927 | 928 | /* qhasm: fe z22 */ 929 | 930 | /* qhasm: fe z_5_0 */ 931 | 932 | /* qhasm: fe z_10_5 */ 933 | 934 | /* qhasm: fe z_10_0 */ 935 | 936 | /* qhasm: fe z_20_10 */ 937 | 938 | /* qhasm: fe z_20_0 */ 939 | 940 | /* qhasm: fe z_40_20 */ 941 | 942 | /* qhasm: fe z_40_0 */ 943 | 944 | /* qhasm: fe z_50_10 */ 945 | 946 | /* qhasm: fe z_50_0 */ 947 | 948 | /* qhasm: fe z_100_50 */ 949 | 950 | /* qhasm: fe z_100_0 */ 951 | 952 | /* qhasm: fe z_200_100 */ 953 | 954 | /* qhasm: fe z_200_0 */ 955 | 956 | /* qhasm: fe z_250_50 */ 957 | 958 | /* qhasm: fe z_250_0 */ 959 | 960 | /* qhasm: fe z_255_5 */ 961 | 962 | /* qhasm: fe z_255_21 */ 963 | 964 | /* qhasm: enter pow225521 */ 965 | 966 | /* qhasm: z2 = z1^2^1 */ 967 | /* asm 1: fe_sq(>z2=fe#1,z2=fe#1,>z2=fe#1); */ 968 | /* asm 2: fe_sq(>z2=t0,z2=t0,>z2=t0); */ 969 | fe_sq(t0,z); for (i = 1;i < 1;++i) fe_sq(t0,t0); 970 | 971 | /* qhasm: z8 = z2^2^2 */ 972 | /* asm 1: fe_sq(>z8=fe#2,z8=fe#2,>z8=fe#2); */ 973 | /* asm 2: fe_sq(>z8=t1,z8=t1,>z8=t1); */ 974 | fe_sq(t1,t0); for (i = 1;i < 2;++i) fe_sq(t1,t1); 975 | 976 | /* qhasm: z9 = z1*z8 */ 977 | /* asm 1: fe_mul(>z9=fe#2,z9=t1,z11=fe#1,z11=t0,z22=fe#3,z22=fe#3,>z22=fe#3); */ 988 | /* asm 2: fe_sq(>z22=t2,z22=t2,>z22=t2); */ 989 | fe_sq(t2,t0); for (i = 1;i < 1;++i) fe_sq(t2,t2); 990 | 991 | /* qhasm: z_5_0 = z9*z22 */ 992 | /* asm 1: fe_mul(>z_5_0=fe#2,z_5_0=t1,z_10_5=fe#3,z_10_5=fe#3,>z_10_5=fe#3); */ 998 | /* asm 2: fe_sq(>z_10_5=t2,z_10_5=t2,>z_10_5=t2); */ 999 | fe_sq(t2,t1); for (i = 1;i < 5;++i) fe_sq(t2,t2); 1000 | 1001 | /* qhasm: z_10_0 = z_10_5*z_5_0 */ 1002 | /* asm 1: fe_mul(>z_10_0=fe#2,z_10_0=t1,z_20_10=fe#3,z_20_10=fe#3,>z_20_10=fe#3); */ 1008 | /* asm 2: fe_sq(>z_20_10=t2,z_20_10=t2,>z_20_10=t2); */ 1009 | fe_sq(t2,t1); for (i = 1;i < 10;++i) fe_sq(t2,t2); 1010 | 1011 | /* qhasm: z_20_0 = z_20_10*z_10_0 */ 1012 | /* asm 1: fe_mul(>z_20_0=fe#3,z_20_0=t2,z_40_20=fe#4,z_40_20=fe#4,>z_40_20=fe#4); */ 1018 | /* asm 2: fe_sq(>z_40_20=t3,z_40_20=t3,>z_40_20=t3); */ 1019 | fe_sq(t3,t2); for (i = 1;i < 20;++i) fe_sq(t3,t3); 1020 | 1021 | /* qhasm: z_40_0 = z_40_20*z_20_0 */ 1022 | /* asm 1: fe_mul(>z_40_0=fe#3,z_40_0=t2,z_50_10=fe#3,z_50_10=fe#3,>z_50_10=fe#3); */ 1028 | /* asm 2: fe_sq(>z_50_10=t2,z_50_10=t2,>z_50_10=t2); */ 1029 | fe_sq(t2,t2); for (i = 1;i < 10;++i) fe_sq(t2,t2); 1030 | 1031 | /* qhasm: z_50_0 = z_50_10*z_10_0 */ 1032 | /* asm 1: 
fe_mul(>z_50_0=fe#2,z_50_0=t1,z_100_50=fe#3,z_100_50=fe#3,>z_100_50=fe#3); */ 1038 | /* asm 2: fe_sq(>z_100_50=t2,z_100_50=t2,>z_100_50=t2); */ 1039 | fe_sq(t2,t1); for (i = 1;i < 50;++i) fe_sq(t2,t2); 1040 | 1041 | /* qhasm: z_100_0 = z_100_50*z_50_0 */ 1042 | /* asm 1: fe_mul(>z_100_0=fe#3,z_100_0=t2,z_200_100=fe#4,z_200_100=fe#4,>z_200_100=fe#4); */ 1048 | /* asm 2: fe_sq(>z_200_100=t3,z_200_100=t3,>z_200_100=t3); */ 1049 | fe_sq(t3,t2); for (i = 1;i < 100;++i) fe_sq(t3,t3); 1050 | 1051 | /* qhasm: z_200_0 = z_200_100*z_100_0 */ 1052 | /* asm 1: fe_mul(>z_200_0=fe#3,z_200_0=t2,z_250_50=fe#3,z_250_50=fe#3,>z_250_50=fe#3); */ 1058 | /* asm 2: fe_sq(>z_250_50=t2,z_250_50=t2,>z_250_50=t2); */ 1059 | fe_sq(t2,t2); for (i = 1;i < 50;++i) fe_sq(t2,t2); 1060 | 1061 | /* qhasm: z_250_0 = z_250_50*z_50_0 */ 1062 | /* asm 1: fe_mul(>z_250_0=fe#2,z_250_0=t1,z_255_5=fe#2,z_255_5=fe#2,>z_255_5=fe#2); */ 1068 | /* asm 2: fe_sq(>z_255_5=t1,z_255_5=t1,>z_255_5=t1); */ 1069 | fe_sq(t1,t1); for (i = 1;i < 5;++i) fe_sq(t1,t1); 1070 | 1071 | /* qhasm: z_255_21 = z_255_5*z11 */ 1072 | /* asm 1: fe_mul(>z_255_21=fe#12,z_255_21=out,= 0;--pos) { 1111 | b = e[pos / 8] >> (pos & 7); 1112 | b &= 1; 1113 | swap ^= b; 1114 | fe_cswap(x2,x3,swap); 1115 | fe_cswap(z2,z3,swap); 1116 | swap = b; 1117 | /* qhasm: fe X2 */ 1118 | 1119 | /* qhasm: fe Z2 */ 1120 | 1121 | /* qhasm: fe X3 */ 1122 | 1123 | /* qhasm: fe Z3 */ 1124 | 1125 | /* qhasm: fe X4 */ 1126 | 1127 | /* qhasm: fe Z4 */ 1128 | 1129 | /* qhasm: fe X5 */ 1130 | 1131 | /* qhasm: fe Z5 */ 1132 | 1133 | /* qhasm: fe A */ 1134 | 1135 | /* qhasm: fe B */ 1136 | 1137 | /* qhasm: fe C */ 1138 | 1139 | /* qhasm: fe D */ 1140 | 1141 | /* qhasm: fe E */ 1142 | 1143 | /* qhasm: fe AA */ 1144 | 1145 | /* qhasm: fe BB */ 1146 | 1147 | /* qhasm: fe DA */ 1148 | 1149 | /* qhasm: fe CB */ 1150 | 1151 | /* qhasm: fe t0 */ 1152 | 1153 | /* qhasm: fe t1 */ 1154 | 1155 | /* qhasm: fe t2 */ 1156 | 1157 | /* qhasm: fe t3 */ 1158 | 1159 | /* qhasm: fe t4 */ 1160 | 1161 | /* qhasm: enter ladder */ 1162 | 1163 | /* qhasm: D = X3-Z3 */ 1164 | /* asm 1: fe_sub(>D=fe#5,D=tmp0,B=fe#6,B=tmp1,A=fe#1,A=x2,C=fe#2,C=z2,DA=fe#4,DA=z3,CB=fe#2,CB=z2,BB=fe#5,BB=tmp0,AA=fe#6,AA=tmp1,t0=fe#3,t0=x3,t1=fe#2,t1=z2,X4=fe#1,X4=x2,E=fe#6,E=tmp1,t2=fe#2,t2=z2,t3=fe#4,t3=z3,X5=fe#3,X5=x3,t4=fe#5,t4=tmp0,Z5=fe#4,x1,Z5=z3,x1,Z4=fe#2,Z4=z2, 2 | typedef __m128i xmmi; 3 | 4 | typedef union packedelem8_t { 5 | unsigned char u[16]; 6 | xmmi v; 7 | } packedelem8; 8 | 9 | typedef union packedelem32_t { 10 | uint32_t u[4]; 11 | xmmi v; 12 | } packedelem32; 13 | 14 | typedef union packedelem64_t { 15 | uint64_t u[2]; 16 | xmmi v; 17 | } packedelem64; 18 | 19 | /* 10 elements + an extra 2 to fit in 3 xmm registers */ 20 | typedef uint32_t bignum25519[10+2]; 21 | typedef packedelem32 packed32bignum25519[5]; 22 | typedef packedelem64 packed64bignum25519[10]; 23 | 24 | static const uint32_t reduce_mask_26 = (1 << 26) - 1; 25 | static const uint32_t reduce_mask_25 = (1 << 25) - 1; 26 | 27 | static const packedelem32 sse2_bot32bitmask = {{0xffffffff, 0x00000000, 0xffffffff, 0x00000000}}; 28 | static const packedelem32 sse2_top32bitmask = {{0x00000000, 0xffffffff, 0x00000000, 0xffffffff}}; 29 | static const packedelem32 sse2_top64bitmask = {{0x00000000, 0x00000000, 0xffffffff, 0xffffffff}}; 30 | static const packedelem32 sse2_bot64bitmask = {{0xffffffff, 0xffffffff, 0x00000000, 0x00000000}}; 31 | 32 | /* reduction masks */ 33 | static const packedelem64 packedmask26 = {{0x03ffffff, 0x03ffffff}}; 34 | static const 
packedelem64 packedmask25 = {{0x01ffffff, 0x01ffffff}}; 35 | static const packedelem32 packedmask2625 = {{0x3ffffff,0,0x1ffffff,0}}; 36 | static const packedelem32 packedmask26262626 = {{0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff}}; 37 | static const packedelem32 packedmask25252525 = {{0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}}; 38 | 39 | /* multipliers */ 40 | static const packedelem64 packednineteen = {{19, 19}}; 41 | static const packedelem64 packednineteenone = {{19, 1}}; 42 | static const packedelem64 packedthirtyeight = {{38, 38}}; 43 | static const packedelem64 packed3819 = {{19*2,19}}; 44 | static const packedelem64 packed9638 = {{19*4,19*2}}; 45 | 46 | /* 121666,121665 */ 47 | static const packedelem64 packed121666121665 = {{121666, 121665}}; 48 | 49 | /* 2*(2^255 - 19) = 0 mod p */ 50 | static const packedelem32 packed2p0 = {{0x7ffffda,0x3fffffe,0x7fffffe,0x3fffffe}}; 51 | static const packedelem32 packed2p1 = {{0x7fffffe,0x3fffffe,0x7fffffe,0x3fffffe}}; 52 | static const packedelem32 packed2p2 = {{0x7fffffe,0x3fffffe,0x0000000,0x0000000}}; 53 | 54 | static const packedelem32 packed32zeromodp0 = {{0x7ffffda,0x7ffffda,0x3fffffe,0x3fffffe}}; 55 | static const packedelem32 packed32zeromodp1 = {{0x7fffffe,0x7fffffe,0x3fffffe,0x3fffffe}}; 56 | 57 | /* Copy a bignum to another: out = in */ 58 | DONNA_INLINE static void 59 | curve25519_copy(bignum25519 out, const bignum25519 in) { 60 | xmmi x0,x1,x2; 61 | x0 = _mm_load_si128((xmmi*)in + 0); 62 | x1 = _mm_load_si128((xmmi*)in + 1); 63 | x2 = _mm_load_si128((xmmi*)in + 2); 64 | _mm_store_si128((xmmi*)out + 0, x0); 65 | _mm_store_si128((xmmi*)out + 1, x1); 66 | _mm_store_si128((xmmi*)out + 2, x2); 67 | } 68 | 69 | /* Take a little-endian, 32-byte number and expand it into polynomial form */ 70 | DONNA_INLINE static void 71 | curve25519_expand(bignum25519 out, const unsigned char in[32]) { 72 | uint32_t x0,x1,x2,x3,x4,x5,x6,x7; 73 | 74 | x0 = *(uint32_t *)(in + 0); 75 | x1 = *(uint32_t *)(in + 4); 76 | x2 = *(uint32_t *)(in + 8); 77 | x3 = *(uint32_t *)(in + 12); 78 | x4 = *(uint32_t *)(in + 16); 79 | x5 = *(uint32_t *)(in + 20); 80 | x6 = *(uint32_t *)(in + 24); 81 | x7 = *(uint32_t *)(in + 28); 82 | 83 | out[0] = ( x0 ) & reduce_mask_26; 84 | out[1] = ((((uint64_t)x1 << 32) | x0) >> 26) & reduce_mask_25; 85 | out[2] = ((((uint64_t)x2 << 32) | x1) >> 19) & reduce_mask_26; 86 | out[3] = ((((uint64_t)x3 << 32) | x2) >> 13) & reduce_mask_25; 87 | out[4] = (( x3) >> 6) & reduce_mask_26; 88 | out[5] = ( x4 ) & reduce_mask_25; 89 | out[6] = ((((uint64_t)x5 << 32) | x4) >> 25) & reduce_mask_26; 90 | out[7] = ((((uint64_t)x6 << 32) | x5) >> 19) & reduce_mask_25; 91 | out[8] = ((((uint64_t)x7 << 32) | x6) >> 12) & reduce_mask_26; 92 | out[9] = (( x7) >> 6) & reduce_mask_25; /* ignore the top bit */ 93 | 94 | out[10] = 0; 95 | out[11] = 0; 96 | } 97 | 98 | /* Take a fully reduced polynomial form number and contract it into a 99 | * little-endian, 32-byte array 100 | */ 101 | DONNA_INLINE static void 102 | curve25519_contract(unsigned char out[32], const bignum25519 in) { 103 | bignum25519 ALIGN(16) f; 104 | 105 | curve25519_copy(f, in); 106 | 107 | #define carry_pass() \ 108 | f[1] += f[0] >> 26; f[0] &= reduce_mask_26; \ 109 | f[2] += f[1] >> 25; f[1] &= reduce_mask_25; \ 110 | f[3] += f[2] >> 26; f[2] &= reduce_mask_26; \ 111 | f[4] += f[3] >> 25; f[3] &= reduce_mask_25; \ 112 | f[5] += f[4] >> 26; f[4] &= reduce_mask_26; \ 113 | f[6] += f[5] >> 25; f[5] &= reduce_mask_25; \ 114 | f[7] += f[6] >> 26; f[6] &= reduce_mask_26; \ 115 | f[8] 
+= f[7] >> 25; f[7] &= reduce_mask_25; \ 116 | f[9] += f[8] >> 26; f[8] &= reduce_mask_26; 117 | 118 | #define carry_pass_full() \ 119 | carry_pass() \ 120 | f[0] += 19 * (f[9] >> 25); f[9] &= reduce_mask_25; 121 | 122 | #define carry_pass_final() \ 123 | carry_pass() \ 124 | f[9] &= reduce_mask_25; 125 | 126 | carry_pass_full() 127 | carry_pass_full() 128 | 129 | /* now t is between 0 and 2^255-1, properly carried. */ 130 | /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */ 131 | f[0] += 19; 132 | carry_pass_full() 133 | 134 | /* now between 19 and 2^255-1 in both cases, and offset by 19. */ 135 | f[0] += (1 << 26) - 19; 136 | f[1] += (1 << 25) - 1; 137 | f[2] += (1 << 26) - 1; 138 | f[3] += (1 << 25) - 1; 139 | f[4] += (1 << 26) - 1; 140 | f[5] += (1 << 25) - 1; 141 | f[6] += (1 << 26) - 1; 142 | f[7] += (1 << 25) - 1; 143 | f[8] += (1 << 26) - 1; 144 | f[9] += (1 << 25) - 1; 145 | 146 | /* now between 2^255 and 2^256-20, and offset by 2^255. */ 147 | carry_pass_final() 148 | 149 | #undef carry_pass 150 | #undef carry_full 151 | #undef carry_final 152 | 153 | *(uint32_t *)(out + 0) = ((f[0] ) | (f[1] << 26)); 154 | *(uint32_t *)(out + 4) = ((f[1] >> 6) | (f[2] << 19)); 155 | *(uint32_t *)(out + 8) = ((f[2] >> 13) | (f[3] << 13)); 156 | *(uint32_t *)(out + 12) = ((f[3] >> 19) | (f[4] << 6)); 157 | *(uint32_t *)(out + 16) = ((f[5] ) | (f[6] << 25)); 158 | *(uint32_t *)(out + 20) = ((f[6] >> 7) | (f[7] << 19)); 159 | *(uint32_t *)(out + 24) = ((f[7] >> 13) | (f[8] << 12)); 160 | *(uint32_t *)(out + 28) = ((f[8] >> 20) | (f[9] << 6)); 161 | } 162 | 163 | /* 164 | * Maybe swap the contents of two felem arrays (@a and @b), each 5 elements 165 | * long. Perform the swap iff @swap is non-zero. 166 | */ 167 | DONNA_INLINE static void 168 | curve25519_swap_conditional(bignum25519 a, bignum25519 b, uint32_t iswap) { 169 | const uint32_t swap = (uint32_t)(-(int32_t)iswap); 170 | xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2; 171 | xmmi mask = _mm_cvtsi32_si128(swap); 172 | mask = _mm_shuffle_epi32(mask, 0); 173 | a0 = _mm_load_si128((xmmi *)a + 0); 174 | a1 = _mm_load_si128((xmmi *)a + 1); 175 | a2 = _mm_load_si128((xmmi *)a + 2); 176 | b0 = _mm_load_si128((xmmi *)b + 0); 177 | b1 = _mm_load_si128((xmmi *)b + 1); 178 | b2 = _mm_load_si128((xmmi *)b + 2); 179 | b0 = _mm_xor_si128(a0, b0); 180 | b1 = _mm_xor_si128(a1, b1); 181 | b2 = _mm_xor_si128(a2, b2); 182 | x0 = _mm_and_si128(b0, mask); 183 | x1 = _mm_and_si128(b1, mask); 184 | x2 = _mm_and_si128(b2, mask); 185 | x0 = _mm_xor_si128(x0, a0); 186 | x1 = _mm_xor_si128(x1, a1); 187 | x2 = _mm_xor_si128(x2, a2); 188 | a0 = _mm_xor_si128(x0, b0); 189 | a1 = _mm_xor_si128(x1, b1); 190 | a2 = _mm_xor_si128(x2, b2); 191 | _mm_store_si128((xmmi *)a + 0, x0); 192 | _mm_store_si128((xmmi *)a + 1, x1); 193 | _mm_store_si128((xmmi *)a + 2, x2); 194 | _mm_store_si128((xmmi *)b + 0, a0); 195 | _mm_store_si128((xmmi *)b + 1, a1); 196 | _mm_store_si128((xmmi *)b + 2, a2); 197 | } 198 | 199 | /* interleave two bignums */ 200 | DONNA_INLINE static void 201 | curve25519_tangle32(packedelem32 *out, const bignum25519 x, const bignum25519 z) { 202 | xmmi x0,x1,x2,z0,z1,z2; 203 | 204 | x0 = _mm_load_si128((xmmi *)(x + 0)); 205 | x1 = _mm_load_si128((xmmi *)(x + 4)); 206 | x2 = _mm_load_si128((xmmi *)(x + 8)); 207 | z0 = _mm_load_si128((xmmi *)(z + 0)); 208 | z1 = _mm_load_si128((xmmi *)(z + 4)); 209 | z2 = _mm_load_si128((xmmi *)(z + 8)); 210 | 211 | out[0].v = _mm_unpacklo_epi32(x0, z0); 212 | out[1].v = _mm_unpackhi_epi32(x0, z0); 213 | out[2].v = 
_mm_unpacklo_epi32(x1, z1); 214 | out[3].v = _mm_unpackhi_epi32(x1, z1); 215 | out[4].v = _mm_unpacklo_epi32(x2, z2); 216 | } 217 | 218 | /* split a packed bignum in to it's two parts */ 219 | DONNA_INLINE static void 220 | curve25519_untangle64(bignum25519 x, bignum25519 z, const packedelem64 *in) { 221 | _mm_store_si128((xmmi *)(x + 0), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[0].v, in[1].v), _mm_unpacklo_epi32(in[2].v, in[3].v))); 222 | _mm_store_si128((xmmi *)(x + 4), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[4].v, in[5].v), _mm_unpacklo_epi32(in[6].v, in[7].v))); 223 | _mm_store_si128((xmmi *)(x + 8), _mm_unpacklo_epi32(in[8].v, in[9].v) ); 224 | _mm_store_si128((xmmi *)(z + 0), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[0].v, in[1].v), _mm_unpackhi_epi32(in[2].v, in[3].v))); 225 | _mm_store_si128((xmmi *)(z + 4), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[4].v, in[5].v), _mm_unpackhi_epi32(in[6].v, in[7].v))); 226 | _mm_store_si128((xmmi *)(z + 8), _mm_unpackhi_epi32(in[8].v, in[9].v) ); 227 | } 228 | 229 | /* add two packed bignums */ 230 | DONNA_INLINE static void 231 | curve25519_add_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) { 232 | out[0].v = _mm_add_epi32(r[0].v, s[0].v); 233 | out[1].v = _mm_add_epi32(r[1].v, s[1].v); 234 | out[2].v = _mm_add_epi32(r[2].v, s[2].v); 235 | out[3].v = _mm_add_epi32(r[3].v, s[3].v); 236 | out[4].v = _mm_add_epi32(r[4].v, s[4].v); 237 | } 238 | 239 | /* subtract two packed bignums */ 240 | DONNA_INLINE static void 241 | curve25519_sub_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) { 242 | xmmi r0,r1,r2,r3,r4; 243 | xmmi s0,s1,s2,s3; 244 | xmmi c1,c2; 245 | 246 | r0 = _mm_add_epi32(r[0].v, packed32zeromodp0.v); 247 | r1 = _mm_add_epi32(r[1].v, packed32zeromodp1.v); 248 | r2 = _mm_add_epi32(r[2].v, packed32zeromodp1.v); 249 | r3 = _mm_add_epi32(r[3].v, packed32zeromodp1.v); 250 | r4 = _mm_add_epi32(r[4].v, packed32zeromodp1.v); 251 | r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */ 252 | r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */ 253 | r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */ 254 | r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */ 255 | r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */ 256 | 257 | s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */ 258 | s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */ 259 | s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */ 260 | s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */ 261 | 262 | c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2); 263 | c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); r4 = _mm_add_epi32(r4, _mm_srli_si128(c2, 8)); s0 = _mm_add_epi32(s0, _mm_slli_si128(c2, 8)); 264 | 265 | out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */ 266 | out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */ 267 | out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */ 268 | out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */ 269 | out[4].v = r4; /* 88 99 */ 270 | } 271 | 272 | /* multiply two packed bignums */ 273 | DONNA_INLINE static void 274 | curve25519_mul_packed64(packedelem64 *out, const packedelem64 *r, const packedelem64 *s) { 275 | xmmi r1,r2,r3,r4,r5,r6,r7,r8,r9; 276 | xmmi r1_2,r3_2,r5_2,r7_2,r9_2; 277 | xmmi c1,c2; 278 | 279 | out[0].v = _mm_mul_epu32(r[0].v, s[0].v); 280 | out[1].v = 
_mm_add_epi64(_mm_mul_epu32(r[0].v, s[1].v), _mm_mul_epu32(r[1].v, s[0].v)); 281 | r1_2 = _mm_slli_epi32(r[1].v, 1); 282 | out[2].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[1].v), _mm_mul_epu32(r[2].v, s[0].v))); 283 | out[3].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[1].v), _mm_mul_epu32(r[3].v, s[0].v)))); 284 | r3_2 = _mm_slli_epi32(r[3].v, 1); 285 | out[4].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[1].v), _mm_mul_epu32(r[4].v, s[0].v))))); 286 | out[5].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[1].v), _mm_mul_epu32(r[5].v, s[0].v)))))); 287 | r5_2 = _mm_slli_epi32(r[5].v, 1); 288 | out[6].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[1].v), _mm_mul_epu32(r[6].v, s[0].v))))))); 289 | out[7].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[1].v), _mm_mul_epu32(r[7].v , s[0].v)))))))); 290 | r7_2 = _mm_slli_epi32(r[7].v, 1); 291 | out[8].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[7].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2 , s[1].v), _mm_mul_epu32(r[8].v, s[0].v))))))))); 292 | out[9].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[9].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[7].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[8].v, s[1].v), _mm_mul_epu32(r[9].v, s[0].v)))))))))); 293 | 294 | r1 = _mm_mul_epu32(r[1].v, packednineteen.v); 295 | r2 = _mm_mul_epu32(r[2].v, packednineteen.v); 296 | r1_2 = _mm_slli_epi32(r1, 1); 297 | r3 = _mm_mul_epu32(r[3].v, packednineteen.v); 298 | r4 = _mm_mul_epu32(r[4].v, packednineteen.v); 299 | r3_2 = _mm_slli_epi32(r3, 1); 300 | r5 = _mm_mul_epu32(r[5].v, packednineteen.v); 301 | r6 = _mm_mul_epu32(r[6].v, packednineteen.v); 302 | r5_2 = _mm_slli_epi32(r5, 1); 303 | r7 = _mm_mul_epu32(r[7].v, packednineteen.v); 304 | r8 = _mm_mul_epu32(r[8].v, packednineteen.v); 305 | r7_2 = _mm_slli_epi32(r7, 1); 306 | r9 = _mm_mul_epu32(r[9].v, packednineteen.v); 307 | r9_2 = _mm_slli_epi32(r9, 1); 308 | 309 | out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[1].v), _mm_add_epi64(_mm_mul_epu32(r8, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r6, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[5].v), 
_mm_add_epi64(_mm_mul_epu32(r4, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r2, s[8].v), _mm_mul_epu32(r1_2, s[9].v)))))))))); 310 | out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[2].v), _mm_add_epi64(_mm_mul_epu32(r8, s[3].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r6, s[5].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r4, s[7].v), _mm_add_epi64(_mm_mul_epu32(r3 , s[8].v), _mm_mul_epu32(r2, s[9].v))))))))); 311 | out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r8, s[4].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r6, s[6].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r4, s[8].v), _mm_mul_epu32(r3_2, s[9].v)))))))); 312 | out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r8, s[5].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r6, s[7].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[8].v), _mm_mul_epu32(r4, s[9].v))))))); 313 | out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r8, s[6].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r6, s[8].v), _mm_mul_epu32(r5_2, s[9].v)))))); 314 | out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r8, s[7].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[8].v), _mm_mul_epu32(r6, s[9].v))))); 315 | out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r8, s[8].v), _mm_mul_epu32(r7_2, s[9].v)))); 316 | out[7].v = _mm_add_epi64(out[7].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[8].v), _mm_mul_epu32(r8, s[9].v))); 317 | out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(r9_2, s[9].v)); 318 | 319 | c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2); 320 | c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2); 321 | c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2); 322 | c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2); 323 | c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2); 324 | c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v)); 325 | c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2); 326 | } 327 | 328 | /* 
multiply a bignum */ 329 | static void 330 | curve25519_mul(bignum25519 out, const bignum25519 r, const bignum25519 s) { 331 | xmmi m01,m23,m45,m67,m89; 332 | xmmi m0123,m4567; 333 | xmmi s0123,s4567; 334 | xmmi s01,s23,s45,s67,s89; 335 | xmmi s12,s34,s56,s78,s9; 336 | xmmi r0,r2,r4,r6,r8; 337 | xmmi r1,r3,r5,r7,r9; 338 | xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919; 339 | xmmi c1,c2,c3; 340 | 341 | s0123 = _mm_load_si128((xmmi*)s + 0); 342 | s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0)); 343 | s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1)); 344 | s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2)); 345 | s4567 = _mm_load_si128((xmmi*)s + 1); 346 | s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567); 347 | s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0)); 348 | s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1)); 349 | s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2)); 350 | s89 = _mm_load_si128((xmmi*)s + 2); 351 | s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89); 352 | s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0)); 353 | s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2)); 354 | 355 | r0 = _mm_load_si128((xmmi*)r + 0); 356 | r1 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(1,1,1,1)); 357 | r1 = _mm_add_epi64(r1, _mm_and_si128(r1, sse2_top64bitmask.v)); 358 | r2 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2,2,2,2)); 359 | r3 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,3,3,3)); 360 | r3 = _mm_add_epi64(r3, _mm_and_si128(r3, sse2_top64bitmask.v)); 361 | r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0,0,0,0)); 362 | r4 = _mm_load_si128((xmmi*)r + 1); 363 | r5 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(1,1,1,1)); 364 | r5 = _mm_add_epi64(r5, _mm_and_si128(r5, sse2_top64bitmask.v)); 365 | r6 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(2,2,2,2)); 366 | r7 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(3,3,3,3)); 367 | r7 = _mm_add_epi64(r7, _mm_and_si128(r7, sse2_top64bitmask.v)); 368 | r4 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(0,0,0,0)); 369 | r8 = _mm_load_si128((xmmi*)r + 2); 370 | r9 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,1,3,1)); 371 | r9 = _mm_add_epi64(r9, _mm_and_si128(r9, sse2_top64bitmask.v)); 372 | r8 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,0,3,0)); 373 | 374 | m01 = _mm_mul_epu32(r1,s01); 375 | m23 = _mm_mul_epu32(r1,s23); 376 | m45 = _mm_mul_epu32(r1,s45); 377 | m67 = _mm_mul_epu32(r1,s67); 378 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r3,s01)); 379 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r3,s23)); 380 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r3,s45)); 381 | m89 = _mm_mul_epu32(r1,s89); 382 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r5,s01)); 383 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r5,s23)); 384 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r3,s67)); 385 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r7,s01)); 386 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r5,s45)); 387 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r7,s23)); 388 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r9,s01)); 389 | 390 | /* shift up */ 391 | m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8)); 392 | m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8)); 393 | m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8)); 394 | m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8)); 395 | m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8)); 396 | 397 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r0,s01)); 398 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r0,s23)); 399 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r0,s45)); 400 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r0,s67)); 401 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r2,s01)); 402 | m45 = 
_mm_add_epi64(m45,_mm_mul_epu32(r2,s23)); 403 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r4,s23)); 404 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r0,s89)); 405 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r4,s01)); 406 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r2,s45)); 407 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r2,s67)); 408 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r6,s01)); 409 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r4,s45)); 410 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r6,s23)); 411 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r8,s01)); 412 | 413 | r219 = _mm_mul_epu32(r2, packednineteen.v); 414 | r419 = _mm_mul_epu32(r4, packednineteen.v); 415 | r619 = _mm_mul_epu32(r6, packednineteen.v); 416 | r819 = _mm_mul_epu32(r8, packednineteen.v); 417 | r119 = _mm_shuffle_epi32(r1,_MM_SHUFFLE(0,0,2,2)); r119 = _mm_mul_epu32(r119, packednineteen.v); 418 | r319 = _mm_shuffle_epi32(r3,_MM_SHUFFLE(0,0,2,2)); r319 = _mm_mul_epu32(r319, packednineteen.v); 419 | r519 = _mm_shuffle_epi32(r5,_MM_SHUFFLE(0,0,2,2)); r519 = _mm_mul_epu32(r519, packednineteen.v); 420 | r719 = _mm_shuffle_epi32(r7,_MM_SHUFFLE(0,0,2,2)); r719 = _mm_mul_epu32(r719, packednineteen.v); 421 | r919 = _mm_shuffle_epi32(r9,_MM_SHUFFLE(0,0,2,2)); r919 = _mm_mul_epu32(r919, packednineteen.v); 422 | 423 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r919,s12)); 424 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r919,s34)); 425 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r919,s56)); 426 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r919,s78)); 427 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r719,s34)); 428 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r719,s56)); 429 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r719,s78)); 430 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r719,s9)); 431 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r519,s56)); 432 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r519,s78)); 433 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r519,s9)); 434 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r819,s89)); 435 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r319,s78)); 436 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r319,s9)); 437 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r619,s89)); 438 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r919,s9)); 439 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r819,s23)); 440 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r819,s45)); 441 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r819,s67)); 442 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r619,s45)); 443 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r619,s67)); 444 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r419,s67)); 445 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r419,s89)); 446 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r219,s89)); 447 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r119,s9)); 448 | 449 | r0 = _mm_unpacklo_epi64(m01, m45); 450 | r1 = _mm_unpackhi_epi64(m01, m45); 451 | r2 = _mm_unpacklo_epi64(m23, m67); 452 | r3 = _mm_unpackhi_epi64(m23, m67); 453 | r4 = _mm_unpacklo_epi64(m89, m89); 454 | r5 = _mm_unpackhi_epi64(m89, m89); 455 | 456 | c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2); 457 | c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8); 458 | c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1); 459 | c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, 
_mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3)); 460 | c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2); 461 | 462 | m0123 = _mm_unpacklo_epi32(r0, r1); 463 | m4567 = _mm_unpackhi_epi32(r0, r1); 464 | m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3)); 465 | m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3)); 466 | m89 = _mm_unpackhi_epi32(r4, r5); 467 | 468 | _mm_store_si128((xmmi*)out + 0, m0123); 469 | _mm_store_si128((xmmi*)out + 1, m4567); 470 | _mm_store_si128((xmmi*)out + 2, m89); 471 | } 472 | 473 | typedef struct bignum25519mulprecomp_t { 474 | xmmi r0,r2,r4,r6,r8; 475 | xmmi r1,r3,r5,r7,r9; 476 | xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919; 477 | } bignum25519mulprecomp; 478 | 479 | /* precompute a constant to multiply by */ 480 | DONNA_INLINE static void 481 | curve25519_mul_precompute(bignum25519mulprecomp *pre, const bignum25519 r) { 482 | pre->r0 = _mm_load_si128((xmmi*)r + 0); 483 | pre->r1 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(1,1,1,1)); 484 | pre->r1 = _mm_add_epi64(pre->r1, _mm_and_si128(pre->r1, sse2_top64bitmask.v)); 485 | pre->r2 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(2,2,2,2)); 486 | pre->r3 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(3,3,3,3)); 487 | pre->r3 = _mm_add_epi64(pre->r3, _mm_and_si128(pre->r3, sse2_top64bitmask.v)); 488 | pre->r0 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(0,0,0,0)); 489 | pre->r4 = _mm_load_si128((xmmi*)r + 1); 490 | pre->r5 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(1,1,1,1)); 491 | pre->r5 = _mm_add_epi64(pre->r5, _mm_and_si128(pre->r5, sse2_top64bitmask.v)); 492 | pre->r6 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(2,2,2,2)); 493 | pre->r7 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(3,3,3,3)); 494 | pre->r7 = _mm_add_epi64(pre->r7, _mm_and_si128(pre->r7, sse2_top64bitmask.v)); 495 | pre->r4 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(0,0,0,0)); 496 | pre->r8 = _mm_load_si128((xmmi*)r + 2); 497 | pre->r9 = _mm_shuffle_epi32(pre->r8, _MM_SHUFFLE(3,1,3,1)); 498 | pre->r9 = _mm_add_epi64(pre->r9, _mm_and_si128(pre->r9, sse2_top64bitmask.v)); 499 | pre->r8 = _mm_shuffle_epi32(pre->r8, _MM_SHUFFLE(3,0,3,0)); 500 | 501 | pre->r219 = _mm_mul_epu32(pre->r2, packednineteen.v); 502 | pre->r419 = _mm_mul_epu32(pre->r4, packednineteen.v); 503 | pre->r619 = _mm_mul_epu32(pre->r6, packednineteen.v); 504 | pre->r819 = _mm_mul_epu32(pre->r8, packednineteen.v); 505 | pre->r119 = _mm_shuffle_epi32(pre->r1,_MM_SHUFFLE(0,0,2,2)); pre->r119 = _mm_mul_epu32(pre->r119, packednineteen.v); 506 | pre->r319 = _mm_shuffle_epi32(pre->r3,_MM_SHUFFLE(0,0,2,2)); pre->r319 = _mm_mul_epu32(pre->r319, packednineteen.v); 507 | pre->r519 = _mm_shuffle_epi32(pre->r5,_MM_SHUFFLE(0,0,2,2)); pre->r519 = _mm_mul_epu32(pre->r519, packednineteen.v); 508 | pre->r719 = _mm_shuffle_epi32(pre->r7,_MM_SHUFFLE(0,0,2,2)); pre->r719 = _mm_mul_epu32(pre->r719, packednineteen.v); 509 | pre->r919 = _mm_shuffle_epi32(pre->r9,_MM_SHUFFLE(0,0,2,2)); pre->r919 = _mm_mul_epu32(pre->r919, packednineteen.v); 510 | } 511 | 512 | 513 | /* multiply a bignum by a pre-computed constant */ 514 | DONNA_INLINE static void 515 | curve25519_mul_precomputed(bignum25519 out, const bignum25519 s, const bignum25519mulprecomp *r) { 516 | xmmi m01,m23,m45,m67,m89; 517 | xmmi m0123,m4567; 518 | xmmi s0123,s4567; 519 | xmmi s01,s23,s45,s67,s89; 520 | xmmi s12,s34,s56,s78,s9; 521 | xmmi r0,r1,r2,r3,r4,r5; 522 | xmmi c1,c2,c3; 523 | 524 | s0123 = 
_mm_load_si128((xmmi*)s + 0); 525 | s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0)); 526 | s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1)); 527 | s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2)); 528 | s4567 = _mm_load_si128((xmmi*)s + 1); 529 | s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567); 530 | s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0)); 531 | s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1)); 532 | s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2)); 533 | s89 = _mm_load_si128((xmmi*)s + 2); 534 | s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89); 535 | s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0)); 536 | s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2)); 537 | 538 | m01 = _mm_mul_epu32(r->r1,s01); 539 | m23 = _mm_mul_epu32(r->r1,s23); 540 | m45 = _mm_mul_epu32(r->r1,s45); 541 | m67 = _mm_mul_epu32(r->r1,s67); 542 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r3,s01)); 543 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r3,s23)); 544 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r3,s45)); 545 | m89 = _mm_mul_epu32(r->r1,s89); 546 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r5,s01)); 547 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r5,s23)); 548 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r3,s67)); 549 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r7,s01)); 550 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r5,s45)); 551 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r7,s23)); 552 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r9,s01)); 553 | 554 | /* shift up */ 555 | m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8)); 556 | m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8)); 557 | m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8)); 558 | m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8)); 559 | m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8)); 560 | 561 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r0,s01)); 562 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r0,s23)); 563 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r0,s45)); 564 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r0,s67)); 565 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r2,s01)); 566 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r2,s23)); 567 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r4,s23)); 568 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r0,s89)); 569 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r4,s01)); 570 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r2,s45)); 571 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r2,s67)); 572 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r6,s01)); 573 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r4,s45)); 574 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r6,s23)); 575 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r8,s01)); 576 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r919,s12)); 577 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r919,s34)); 578 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r919,s56)); 579 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r919,s78)); 580 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r719,s34)); 581 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r719,s56)); 582 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r719,s78)); 583 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r719,s9)); 584 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r519,s56)); 585 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r519,s78)); 586 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r519,s9)); 587 | m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r819,s89)); 588 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r319,s78)); 589 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r319,s9)); 590 | m45 = 
_mm_add_epi64(m45,_mm_mul_epu32(r->r619,s89)); 591 | m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r919,s9)); 592 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r819,s23)); 593 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r819,s45)); 594 | m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r819,s67)); 595 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r619,s45)); 596 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r619,s67)); 597 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r419,s67)); 598 | m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r419,s89)); 599 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r219,s89)); 600 | m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r119,s9)); 601 | 602 | r0 = _mm_unpacklo_epi64(m01, m45); 603 | r1 = _mm_unpackhi_epi64(m01, m45); 604 | r2 = _mm_unpacklo_epi64(m23, m67); 605 | r3 = _mm_unpackhi_epi64(m23, m67); 606 | r4 = _mm_unpacklo_epi64(m89, m89); 607 | r5 = _mm_unpackhi_epi64(m89, m89); 608 | 609 | c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2); 610 | c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8); 611 | c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1); 612 | c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3)); 613 | c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2); 614 | 615 | m0123 = _mm_unpacklo_epi32(r0, r1); 616 | m4567 = _mm_unpackhi_epi32(r0, r1); 617 | m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3)); 618 | m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3)); 619 | m89 = _mm_unpackhi_epi32(r4, r5); 620 | 621 | _mm_store_si128((xmmi*)out + 0, m0123); 622 | _mm_store_si128((xmmi*)out + 1, m4567); 623 | _mm_store_si128((xmmi*)out + 2, m89); 624 | } 625 | 626 | /* square a bignum 'count' times */ 627 | #define curve25519_square(r,x) curve25519_square_times(r,x,1) 628 | 629 | static void 630 | curve25519_square_times(bignum25519 r, const bignum25519 in, int count) { 631 | xmmi m01,m23,m45,m67,m89; 632 | xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9; 633 | xmmi r0a,r1a,r2a,r3a,r7a,r9a; 634 | xmmi r0123,r4567; 635 | xmmi r01,r23,r45,r67,r6x,r89,r8x; 636 | xmmi r12,r34,r56,r78,r9x; 637 | xmmi r5619; 638 | xmmi c1,c2,c3; 639 | 640 | r0123 = _mm_load_si128((xmmi*)in + 0); 641 | r01 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,1,2,0)); 642 | r23 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,3,2,2)); 643 | r4567 = _mm_load_si128((xmmi*)in + 1); 644 | r45 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,1,2,0)); 645 | r67 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,3,2,2)); 646 | r89 = _mm_load_si128((xmmi*)in + 2); 647 | r89 = _mm_shuffle_epi32(r89,_MM_SHUFFLE(3,1,2,0)); 648 | 649 | do { 650 | r12 = _mm_unpackhi_epi64(r01, _mm_slli_si128(r23, 8)); 651 | r0 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(0,0,0,0)); 652 | r0 = _mm_add_epi64(r0, _mm_and_si128(r0, sse2_top64bitmask.v)); 653 | r0a = _mm_shuffle_epi32(r0,_MM_SHUFFLE(3,2,1,2)); 654 | r1 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(2,2,2,2)); 655 | r2 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(0,0,0,0)); 656 | r2 = _mm_add_epi64(r2, _mm_and_si128(r2, sse2_top64bitmask.v)); 657 | r2a 
= _mm_shuffle_epi32(r2,_MM_SHUFFLE(3,2,1,2)); 658 | r3 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,2,2,2)); 659 | r34 = _mm_unpackhi_epi64(r23, _mm_slli_si128(r45, 8)); 660 | r4 = _mm_shuffle_epi32(r45, _MM_SHUFFLE(0,0,0,0)); 661 | r4 = _mm_add_epi64(r4, _mm_and_si128(r4, sse2_top64bitmask.v)); 662 | r56 = _mm_unpackhi_epi64(r45, _mm_slli_si128(r67, 8)); 663 | r5619 = _mm_mul_epu32(r56, packednineteen.v); 664 | r5 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(1,1,1,0)); 665 | r6 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(3,2,3,2)); 666 | r78 = _mm_unpackhi_epi64(r67, _mm_slli_si128(r89, 8)); 667 | r6x = _mm_unpacklo_epi64(r67, _mm_setzero_si128()); 668 | r7 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,2,2,2)); 669 | r7 = _mm_mul_epu32(r7, packed3819.v); 670 | r7a = _mm_shuffle_epi32(r7, _MM_SHUFFLE(3,3,3,2)); 671 | r8x = _mm_unpacklo_epi64(r89, _mm_setzero_si128()); 672 | r8 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(0,0,0,0)); 673 | r8 = _mm_mul_epu32(r8, packednineteen.v); 674 | r9 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(2,2,2,2)); 675 | r9x = _mm_slli_epi32(_mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,3,2)), 1); 676 | r9 = _mm_mul_epu32(r9, packed3819.v); 677 | r9a = _mm_shuffle_epi32(r9, _MM_SHUFFLE(2,2,2,2)); 678 | 679 | m01 = _mm_mul_epu32(r01, r0); 680 | m23 = _mm_mul_epu32(r23, r0a); 681 | m45 = _mm_mul_epu32(r45, r0a); 682 | m45 = _mm_add_epi64(m45, _mm_mul_epu32(r23, r2)); 683 | r23 = _mm_slli_epi32(r23, 1); 684 | m67 = _mm_mul_epu32(r67, r0a); 685 | m67 = _mm_add_epi64(m67, _mm_mul_epu32(r45, r2a)); 686 | m89 = _mm_mul_epu32(r89, r0a); 687 | m89 = _mm_add_epi64(m89, _mm_mul_epu32(r67, r2a)); 688 | r67 = _mm_slli_epi32(r67, 1); 689 | m89 = _mm_add_epi64(m89, _mm_mul_epu32(r45, r4)); 690 | r45 = _mm_slli_epi32(r45, 1); 691 | 692 | r1 = _mm_slli_epi32(r1, 1); 693 | r3 = _mm_slli_epi32(r3, 1); 694 | r1a = _mm_add_epi64(r1, _mm_and_si128(r1, sse2_bot64bitmask.v)); 695 | r3a = _mm_add_epi64(r3, _mm_and_si128(r3, sse2_bot64bitmask.v)); 696 | 697 | m23 = _mm_add_epi64(m23, _mm_mul_epu32(r12, r1)); 698 | m45 = _mm_add_epi64(m45, _mm_mul_epu32(r34, r1a)); 699 | m67 = _mm_add_epi64(m67, _mm_mul_epu32(r56, r1a)); 700 | m67 = _mm_add_epi64(m67, _mm_mul_epu32(r34, r3)); 701 | r34 = _mm_slli_epi32(r34, 1); 702 | m89 = _mm_add_epi64(m89, _mm_mul_epu32(r78, r1a)); 703 | r78 = _mm_slli_epi32(r78, 1); 704 | m89 = _mm_add_epi64(m89, _mm_mul_epu32(r56, r3a)); 705 | r56 = _mm_slli_epi32(r56, 1); 706 | 707 | m01 = _mm_add_epi64(m01, _mm_mul_epu32(_mm_slli_epi32(r12, 1), r9)); 708 | m01 = _mm_add_epi64(m01, _mm_mul_epu32(r34, r7)); 709 | m23 = _mm_add_epi64(m23, _mm_mul_epu32(r34, r9)); 710 | m01 = _mm_add_epi64(m01, _mm_mul_epu32(r56, r5)); 711 | m23 = _mm_add_epi64(m23, _mm_mul_epu32(r56, r7)); 712 | m45 = _mm_add_epi64(m45, _mm_mul_epu32(r56, r9)); 713 | m01 = _mm_add_epi64(m01, _mm_mul_epu32(r23, r8)); 714 | m01 = _mm_add_epi64(m01, _mm_mul_epu32(r45, r6)); 715 | m23 = _mm_add_epi64(m23, _mm_mul_epu32(r45, r8)); 716 | m23 = _mm_add_epi64(m23, _mm_mul_epu32(r6x, r6)); 717 | m45 = _mm_add_epi64(m45, _mm_mul_epu32(r78, r7a)); 718 | m67 = _mm_add_epi64(m67, _mm_mul_epu32(r78, r9)); 719 | m45 = _mm_add_epi64(m45, _mm_mul_epu32(r67, r8)); 720 | m67 = _mm_add_epi64(m67, _mm_mul_epu32(r8x, r8)); 721 | m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9x, r9a)); 722 | 723 | r0 = _mm_unpacklo_epi64(m01, m45); 724 | r1 = _mm_unpackhi_epi64(m01, m45); 725 | r2 = _mm_unpacklo_epi64(m23, m67); 726 | r3 = _mm_unpackhi_epi64(m23, m67); 727 | r4 = _mm_unpacklo_epi64(m89, m89); 728 | r5 = _mm_unpackhi_epi64(m89, m89); 729 | 730 | c1 = _mm_srli_epi64(r0, 26); c2 
= _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2); 731 | c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8); 732 | c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1); 733 | c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3)); 734 | c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2); 735 | 736 | r01 = _mm_unpacklo_epi64(r0, r1); 737 | r45 = _mm_unpackhi_epi64(r0, r1); 738 | r23 = _mm_unpacklo_epi64(r2, r3); 739 | r67 = _mm_unpackhi_epi64(r2, r3); 740 | r89 = _mm_unpackhi_epi64(r4, r5); 741 | } while (--count); 742 | 743 | r0123 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,0,3,3)); 744 | r4567 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,0,3,3)); 745 | r0123 = _mm_or_si128(r0123, _mm_shuffle_epi32(r01, _MM_SHUFFLE(3,3,2,0))); 746 | r4567 = _mm_or_si128(r4567, _mm_shuffle_epi32(r45, _MM_SHUFFLE(3,3,2,0))); 747 | r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,2,0)); 748 | 749 | _mm_store_si128((xmmi*)r + 0, r0123); 750 | _mm_store_si128((xmmi*)r + 1, r4567); 751 | _mm_store_si128((xmmi*)r + 2, r89); 752 | } 753 | 754 | /* square two packed bignums */ 755 | DONNA_INLINE static void 756 | curve25519_square_packed64(packedelem64 *out, const packedelem64 *r) { 757 | xmmi r0,r1,r2,r3; 758 | xmmi r1_2,r3_2,r4_2,r5_2,r6_2,r7_2; 759 | xmmi d5,d6,d7,d8,d9; 760 | xmmi c1,c2; 761 | 762 | r0 = r[0].v; 763 | r1 = r[1].v; 764 | r2 = r[2].v; 765 | r3 = r[3].v; 766 | 767 | out[0].v = _mm_mul_epu32(r0, r0); 768 | r0 = _mm_slli_epi32(r0, 1); 769 | out[1].v = _mm_mul_epu32(r0, r1); 770 | r1_2 = _mm_slli_epi32(r1, 1); 771 | out[2].v = _mm_add_epi64(_mm_mul_epu32(r0, r2 ), _mm_mul_epu32(r1, r1_2)); 772 | r1 = r1_2; 773 | out[3].v = _mm_add_epi64(_mm_mul_epu32(r0, r3 ), _mm_mul_epu32(r1, r2 )); 774 | r3_2 = _mm_slli_epi32(r3, 1); 775 | out[4].v = _mm_add_epi64(_mm_mul_epu32(r0, r[4].v), _mm_add_epi64(_mm_mul_epu32(r1, r3_2 ), _mm_mul_epu32(r2, r2))); 776 | r2 = _mm_slli_epi32(r2, 1); 777 | out[5].v = _mm_add_epi64(_mm_mul_epu32(r0, r[5].v), _mm_add_epi64(_mm_mul_epu32(r1, r[4].v), _mm_mul_epu32(r2, r3))); 778 | r5_2 = _mm_slli_epi32(r[5].v, 1); 779 | out[6].v = _mm_add_epi64(_mm_mul_epu32(r0, r[6].v), _mm_add_epi64(_mm_mul_epu32(r1, r5_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[4].v), _mm_mul_epu32(r3, r3_2 )))); 780 | r3 = r3_2; 781 | out[7].v = _mm_add_epi64(_mm_mul_epu32(r0, r[7].v), _mm_add_epi64(_mm_mul_epu32(r1, r[6].v), _mm_add_epi64(_mm_mul_epu32(r2, r[5].v), _mm_mul_epu32(r3, r[4].v)))); 782 | r7_2 = _mm_slli_epi32(r[7].v, 1); 783 | out[8].v = _mm_add_epi64(_mm_mul_epu32(r0, r[8].v), _mm_add_epi64(_mm_mul_epu32(r1, r7_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[6].v), _mm_add_epi64(_mm_mul_epu32(r3, r5_2 ), _mm_mul_epu32(r[4].v, r[4].v))))); 784 | out[9].v = _mm_add_epi64(_mm_mul_epu32(r0, r[9].v), _mm_add_epi64(_mm_mul_epu32(r1, r[8].v), _mm_add_epi64(_mm_mul_epu32(r2, r[7].v), _mm_add_epi64(_mm_mul_epu32(r3, r[6].v), _mm_mul_epu32(r[4].v, r5_2 ))))); 785 | 786 | d5 = _mm_mul_epu32(r[5].v, packedthirtyeight.v); 787 | d6 = _mm_mul_epu32(r[6].v, packednineteen.v); 788 | 
d7 = _mm_mul_epu32(r[7].v, packedthirtyeight.v); 789 | d8 = _mm_mul_epu32(r[8].v, packednineteen.v); 790 | d9 = _mm_mul_epu32(r[9].v, packedthirtyeight.v); 791 | 792 | r4_2 = _mm_slli_epi32(r[4].v, 1); 793 | r6_2 = _mm_slli_epi32(r[6].v, 1); 794 | out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(d9, r1 ), _mm_add_epi64(_mm_mul_epu32(d8, r2 ), _mm_add_epi64(_mm_mul_epu32(d7, r3 ), _mm_add_epi64(_mm_mul_epu32(d6, r4_2), _mm_mul_epu32(d5, r[5].v)))))); 795 | out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(d9, _mm_srli_epi32(r2, 1)), _mm_add_epi64(_mm_mul_epu32(d8, r3 ), _mm_add_epi64(_mm_mul_epu32(d7, r[4].v), _mm_mul_epu32(d6, r5_2 ))))); 796 | out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(d9, r3 ), _mm_add_epi64(_mm_mul_epu32(d8, r4_2), _mm_add_epi64(_mm_mul_epu32(d7, r5_2 ), _mm_mul_epu32(d6, r[6].v))))); 797 | out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(d9, r[4].v ), _mm_add_epi64(_mm_mul_epu32(d8, r5_2), _mm_mul_epu32(d7, r[6].v)))); 798 | out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(d9, r5_2 ), _mm_add_epi64(_mm_mul_epu32(d8, r6_2), _mm_mul_epu32(d7, r[7].v)))); 799 | out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(d9, r[6].v ), _mm_mul_epu32(d8, r7_2 ))); 800 | out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(d9, r7_2 ), _mm_mul_epu32(d8, r[8].v))); 801 | out[7].v = _mm_add_epi64(out[7].v, _mm_mul_epu32(d9, r[8].v)); 802 | out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(d9, r[9].v)); 803 | 804 | c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2); 805 | c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2); 806 | c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2); 807 | c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2); 808 | c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2); 809 | c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v)); 810 | c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2); 811 | } 812 | 813 | /* make [nqx+nqz,nqpqx+nqpqz], [nqpqx-nqpqz,nqx-nqz] from [nqx+nqz,nqpqx+nqpqz], [nqx-nqz,nqpqx-nqpqz] */ 814 | DONNA_INLINE static void 815 | curve25519_make_nqpq(packedelem64 *primex, packedelem64 *primez, const packedelem32 *pqx, const packedelem32 *pqz) { 816 | primex[0].v = _mm_shuffle_epi32(pqx[0].v, _MM_SHUFFLE(1,1,0,0)); 817 | primex[1].v = _mm_shuffle_epi32(pqx[0].v, _MM_SHUFFLE(3,3,2,2)); 818 
| primex[2].v = _mm_shuffle_epi32(pqx[1].v, _MM_SHUFFLE(1,1,0,0)); 819 | primex[3].v = _mm_shuffle_epi32(pqx[1].v, _MM_SHUFFLE(3,3,2,2)); 820 | primex[4].v = _mm_shuffle_epi32(pqx[2].v, _MM_SHUFFLE(1,1,0,0)); 821 | primex[5].v = _mm_shuffle_epi32(pqx[2].v, _MM_SHUFFLE(3,3,2,2)); 822 | primex[6].v = _mm_shuffle_epi32(pqx[3].v, _MM_SHUFFLE(1,1,0,0)); 823 | primex[7].v = _mm_shuffle_epi32(pqx[3].v, _MM_SHUFFLE(3,3,2,2)); 824 | primex[8].v = _mm_shuffle_epi32(pqx[4].v, _MM_SHUFFLE(1,1,0,0)); 825 | primex[9].v = _mm_shuffle_epi32(pqx[4].v, _MM_SHUFFLE(3,3,2,2)); 826 | primez[0].v = _mm_shuffle_epi32(pqz[0].v, _MM_SHUFFLE(0,0,1,1)); 827 | primez[1].v = _mm_shuffle_epi32(pqz[0].v, _MM_SHUFFLE(2,2,3,3)); 828 | primez[2].v = _mm_shuffle_epi32(pqz[1].v, _MM_SHUFFLE(0,0,1,1)); 829 | primez[3].v = _mm_shuffle_epi32(pqz[1].v, _MM_SHUFFLE(2,2,3,3)); 830 | primez[4].v = _mm_shuffle_epi32(pqz[2].v, _MM_SHUFFLE(0,0,1,1)); 831 | primez[5].v = _mm_shuffle_epi32(pqz[2].v, _MM_SHUFFLE(2,2,3,3)); 832 | primez[6].v = _mm_shuffle_epi32(pqz[3].v, _MM_SHUFFLE(0,0,1,1)); 833 | primez[7].v = _mm_shuffle_epi32(pqz[3].v, _MM_SHUFFLE(2,2,3,3)); 834 | primez[8].v = _mm_shuffle_epi32(pqz[4].v, _MM_SHUFFLE(0,0,1,1)); 835 | primez[9].v = _mm_shuffle_epi32(pqz[4].v, _MM_SHUFFLE(2,2,3,3)); 836 | } 837 | 838 | /* make [nqx+nqz,nqx-nqz] from [nqx+nqz,nqpqx+nqpqz], [nqx-nqz,nqpqx-nqpqz] */ 839 | DONNA_INLINE static void 840 | curve25519_make_nq(packedelem64 *nq, const packedelem32 *pqx, const packedelem32 *pqz) { 841 | nq[0].v = _mm_unpacklo_epi64(pqx[0].v, pqz[0].v); 842 | nq[1].v = _mm_unpackhi_epi64(pqx[0].v, pqz[0].v); 843 | nq[2].v = _mm_unpacklo_epi64(pqx[1].v, pqz[1].v); 844 | nq[3].v = _mm_unpackhi_epi64(pqx[1].v, pqz[1].v); 845 | nq[4].v = _mm_unpacklo_epi64(pqx[2].v, pqz[2].v); 846 | nq[5].v = _mm_unpackhi_epi64(pqx[2].v, pqz[2].v); 847 | nq[6].v = _mm_unpacklo_epi64(pqx[3].v, pqz[3].v); 848 | nq[7].v = _mm_unpackhi_epi64(pqx[3].v, pqz[3].v); 849 | nq[8].v = _mm_unpacklo_epi64(pqx[4].v, pqz[4].v); 850 | nq[9].v = _mm_unpackhi_epi64(pqx[4].v, pqz[4].v); 851 | } 852 | 853 | /* compute [nqx+nqz,nqx-nqz] from nqx, nqz */ 854 | DONNA_INLINE static void 855 | curve25519_compute_nq(packedelem64 *nq, const bignum25519 nqx, const bignum25519 nqz) { 856 | xmmi x0,x1,x2; 857 | xmmi z0,z1,z2; 858 | xmmi a0,a1,a2; 859 | xmmi s0,s1,s2; 860 | xmmi r0,r1; 861 | xmmi c1,c2; 862 | x0 = _mm_load_si128((xmmi*)nqx + 0); 863 | x1 = _mm_load_si128((xmmi*)nqx + 1); 864 | x2 = _mm_load_si128((xmmi*)nqx + 2); 865 | z0 = _mm_load_si128((xmmi*)nqz + 0); 866 | z1 = _mm_load_si128((xmmi*)nqz + 1); 867 | z2 = _mm_load_si128((xmmi*)nqz + 2); 868 | a0 = _mm_add_epi32(x0, z0); 869 | a1 = _mm_add_epi32(x1, z1); 870 | a2 = _mm_add_epi32(x2, z2); 871 | s0 = _mm_add_epi32(x0, packed2p0.v); 872 | s1 = _mm_add_epi32(x1, packed2p1.v); 873 | s2 = _mm_add_epi32(x2, packed2p2.v); 874 | s0 = _mm_sub_epi32(s0, z0); 875 | s1 = _mm_sub_epi32(s1, z1); 876 | s2 = _mm_sub_epi32(s2, z2); 877 | r0 = _mm_and_si128(_mm_shuffle_epi32(s0, _MM_SHUFFLE(2,2,0,0)), sse2_bot32bitmask.v); 878 | r1 = _mm_and_si128(_mm_shuffle_epi32(s0, _MM_SHUFFLE(3,3,1,1)), sse2_bot32bitmask.v); 879 | c1 = _mm_srli_epi32(r0, 26); 880 | c2 = _mm_srli_epi32(r1, 25); 881 | r0 = _mm_and_si128(r0, packedmask26.v); 882 | r1 = _mm_and_si128(r1, packedmask25.v); 883 | r0 = _mm_add_epi32(r0, _mm_slli_si128(c2, 8)); 884 | r1 = _mm_add_epi32(r1, c1); 885 | s0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpackhi_epi32(r0, r1)); 886 | s1 = _mm_add_epi32(s1, _mm_srli_si128(c2, 8)); 887 | nq[0].v = 
_mm_unpacklo_epi64(a0, s0); 888 | nq[2].v = _mm_unpackhi_epi64(a0, s0); 889 | nq[4].v = _mm_unpacklo_epi64(a1, s1); 890 | nq[6].v = _mm_unpackhi_epi64(a1, s1); 891 | nq[8].v = _mm_unpacklo_epi64(a2, s2); 892 | nq[1].v = _mm_shuffle_epi32(nq[0].v, _MM_SHUFFLE(3,3,1,1)); 893 | nq[3].v = _mm_shuffle_epi32(nq[2].v, _MM_SHUFFLE(3,3,1,1)); 894 | nq[5].v = _mm_shuffle_epi32(nq[4].v, _MM_SHUFFLE(3,3,1,1)); 895 | nq[7].v = _mm_shuffle_epi32(nq[6].v, _MM_SHUFFLE(3,3,1,1)); 896 | nq[9].v = _mm_shuffle_epi32(nq[8].v, _MM_SHUFFLE(3,3,1,1)); 897 | } 898 | 899 | 900 | /* compute [x+z,x-z] from [x,z] */ 901 | DONNA_INLINE static void 902 | curve25519_addsub_packed64(packedelem64 *r) { 903 | packed32bignum25519 x,z,add,sub; 904 | 905 | x[0].v = _mm_unpacklo_epi64(r[0].v, r[1].v); 906 | z[0].v = _mm_unpackhi_epi64(r[0].v, r[1].v); 907 | x[1].v = _mm_unpacklo_epi64(r[2].v, r[3].v); 908 | z[1].v = _mm_unpackhi_epi64(r[2].v, r[3].v); 909 | x[2].v = _mm_unpacklo_epi64(r[4].v, r[5].v); 910 | z[2].v = _mm_unpackhi_epi64(r[4].v, r[5].v); 911 | x[3].v = _mm_unpacklo_epi64(r[6].v, r[7].v); 912 | z[3].v = _mm_unpackhi_epi64(r[6].v, r[7].v); 913 | x[4].v = _mm_unpacklo_epi64(r[8].v, r[9].v); 914 | z[4].v = _mm_unpackhi_epi64(r[8].v, r[9].v); 915 | 916 | curve25519_add_packed32(add, x, z); 917 | curve25519_sub_packed32(sub, x, z); 918 | 919 | r[0].v = _mm_unpacklo_epi64(add[0].v, sub[0].v); 920 | r[1].v = _mm_unpackhi_epi64(add[0].v, sub[0].v); 921 | r[2].v = _mm_unpacklo_epi64(add[1].v, sub[1].v); 922 | r[3].v = _mm_unpackhi_epi64(add[1].v, sub[1].v); 923 | r[4].v = _mm_unpacklo_epi64(add[2].v, sub[2].v); 924 | r[5].v = _mm_unpackhi_epi64(add[2].v, sub[2].v); 925 | r[6].v = _mm_unpacklo_epi64(add[3].v, sub[3].v); 926 | r[7].v = _mm_unpackhi_epi64(add[3].v, sub[3].v); 927 | r[8].v = _mm_unpacklo_epi64(add[4].v, sub[4].v); 928 | r[9].v = _mm_unpackhi_epi64(add[4].v, sub[4].v); 929 | } 930 | 931 | /* compute [x,z] * [121666,121665] */ 932 | DONNA_INLINE static void 933 | curve25519_121665_packed64(packedelem64 *out, const packedelem64 *in) { 934 | xmmi c1,c2; 935 | 936 | out[0].v = _mm_mul_epu32(in[0].v, packed121666121665.v); 937 | out[1].v = _mm_mul_epu32(in[1].v, packed121666121665.v); 938 | out[2].v = _mm_mul_epu32(in[2].v, packed121666121665.v); 939 | out[3].v = _mm_mul_epu32(in[3].v, packed121666121665.v); 940 | out[4].v = _mm_mul_epu32(in[4].v, packed121666121665.v); 941 | out[5].v = _mm_mul_epu32(in[5].v, packed121666121665.v); 942 | out[6].v = _mm_mul_epu32(in[6].v, packed121666121665.v); 943 | out[7].v = _mm_mul_epu32(in[7].v, packed121666121665.v); 944 | out[8].v = _mm_mul_epu32(in[8].v, packed121666121665.v); 945 | out[9].v = _mm_mul_epu32(in[9].v, packed121666121665.v); 946 | 947 | c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2); 948 | c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2); 949 | c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2); 950 | c1 = _mm_srli_epi64(out[3].v, 25); c2 = 
_mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2); 951 | c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2); 952 | c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v)); 953 | c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2); 954 | } 955 | 956 | /* compute [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */ 957 | DONNA_INLINE static void 958 | curve25519_final_nq(packedelem64 *nq, const packedelem64 *sq, const packedelem64 *sq121665) { 959 | packed32bignum25519 x, z, sub; 960 | packed64bignum25519 t, nqa, nqb; 961 | 962 | x[0].v = _mm_or_si128(_mm_unpacklo_epi64(sq[0].v, sq[1].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[0].v, sq121665[1].v), 4)); 963 | z[0].v = _mm_or_si128(_mm_unpackhi_epi64(sq[0].v, sq[1].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[0].v, sq121665[1].v), 4)); 964 | x[1].v = _mm_or_si128(_mm_unpacklo_epi64(sq[2].v, sq[3].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[2].v, sq121665[3].v), 4)); 965 | z[1].v = _mm_or_si128(_mm_unpackhi_epi64(sq[2].v, sq[3].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[2].v, sq121665[3].v), 4)); 966 | x[2].v = _mm_or_si128(_mm_unpacklo_epi64(sq[4].v, sq[5].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[4].v, sq121665[5].v), 4)); 967 | z[2].v = _mm_or_si128(_mm_unpackhi_epi64(sq[4].v, sq[5].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[4].v, sq121665[5].v), 4)); 968 | x[3].v = _mm_or_si128(_mm_unpacklo_epi64(sq[6].v, sq[7].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[6].v, sq121665[7].v), 4)); 969 | z[3].v = _mm_or_si128(_mm_unpackhi_epi64(sq[6].v, sq[7].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[6].v, sq121665[7].v), 4)); 970 | x[4].v = _mm_or_si128(_mm_unpacklo_epi64(sq[8].v, sq[9].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[8].v, sq121665[9].v), 4)); 971 | z[4].v = _mm_or_si128(_mm_unpackhi_epi64(sq[8].v, sq[9].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[8].v, sq121665[9].v), 4)); 972 | 973 | curve25519_sub_packed32(sub, x, z); 974 | 975 | t[0].v = _mm_shuffle_epi32(sub[0].v, _MM_SHUFFLE(1,1,0,0)); 976 | t[1].v = _mm_shuffle_epi32(sub[0].v, _MM_SHUFFLE(3,3,2,2)); 977 | t[2].v = _mm_shuffle_epi32(sub[1].v, _MM_SHUFFLE(1,1,0,0)); 978 | t[3].v = _mm_shuffle_epi32(sub[1].v, _MM_SHUFFLE(3,3,2,2)); 979 | t[4].v = _mm_shuffle_epi32(sub[2].v, _MM_SHUFFLE(1,1,0,0)); 980 | t[5].v = _mm_shuffle_epi32(sub[2].v, _MM_SHUFFLE(3,3,2,2)); 981 | t[6].v = _mm_shuffle_epi32(sub[3].v, _MM_SHUFFLE(1,1,0,0)); 982 | t[7].v = _mm_shuffle_epi32(sub[3].v, _MM_SHUFFLE(3,3,2,2)); 983 | t[8].v = _mm_shuffle_epi32(sub[4].v, _MM_SHUFFLE(1,1,0,0)); 984 | t[9].v = _mm_shuffle_epi32(sub[4].v, _MM_SHUFFLE(3,3,2,2)); 985 | 986 | nqa[0].v = _mm_unpacklo_epi64(sq[0].v, t[0].v); 987 | nqb[0].v = _mm_unpackhi_epi64(sq[0].v, t[0].v); 988 | nqa[1].v = _mm_unpacklo_epi64(sq[1].v, t[1].v); 989 | nqb[1].v = _mm_unpackhi_epi64(sq[1].v, t[1].v); 990 | nqa[2].v = _mm_unpacklo_epi64(sq[2].v, t[2].v); 991 | nqb[2].v = _mm_unpackhi_epi64(sq[2].v, t[2].v); 992 | nqa[3].v = _mm_unpacklo_epi64(sq[3].v, t[3].v); 993 | nqb[3].v = 
_mm_unpackhi_epi64(sq[3].v, t[3].v); 994 | nqa[4].v = _mm_unpacklo_epi64(sq[4].v, t[4].v); 995 | nqb[4].v = _mm_unpackhi_epi64(sq[4].v, t[4].v); 996 | nqa[5].v = _mm_unpacklo_epi64(sq[5].v, t[5].v); 997 | nqb[5].v = _mm_unpackhi_epi64(sq[5].v, t[5].v); 998 | nqa[6].v = _mm_unpacklo_epi64(sq[6].v, t[6].v); 999 | nqb[6].v = _mm_unpackhi_epi64(sq[6].v, t[6].v); 1000 | nqa[7].v = _mm_unpacklo_epi64(sq[7].v, t[7].v); 1001 | nqb[7].v = _mm_unpackhi_epi64(sq[7].v, t[7].v); 1002 | nqa[8].v = _mm_unpacklo_epi64(sq[8].v, t[8].v); 1003 | nqb[8].v = _mm_unpackhi_epi64(sq[8].v, t[8].v); 1004 | nqa[9].v = _mm_unpacklo_epi64(sq[9].v, t[9].v); 1005 | nqb[9].v = _mm_unpackhi_epi64(sq[9].v, t[9].v); 1006 | 1007 | curve25519_mul_packed64(nq, nqa, nqb); 1008 | } 1009 | 1010 | --------------------------------------------------------------------------------
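
A note on the carry steps above: the interleaved _mm_srli_epi64 / _mm_and_si128 / _mm_add_epi64 sequences that close several of these routines (for example the end of curve25519_121665_packed64) are a two-lane carry reduction over the 10-limb, alternating 26/25-bit representation of elements mod 2^255-19. Below is a minimal scalar sketch of one SIMD lane, for orientation only; carry_reduce_scalar is a hypothetical name and not part of the library, 0x3ffffff and 0x1ffffff stand in for packedmask26 and packedmask25, and the multiply by 19 corresponds to packednineteen (since 2^255 is congruent to 19 mod 2^255-19).

#include <stdint.h>

/* Illustrative scalar sketch of one lane of the packed carry chain:
 * limbs alternate 26 and 25 bits, the two halves (limbs 0..4 and 4..8)
 * are carried side by side just as c1/c2 are in the SSE2 code, and the
 * overflow out of limb 9 wraps into limb 0 multiplied by 19. */
static void
carry_reduce_scalar(uint64_t out[10]) {
	uint64_t c1, c2;
	c1 = out[0] >> 26; c2 = out[4] >> 26; out[0] &= 0x3ffffff; out[4] &= 0x3ffffff; out[1] += c1; out[5] += c2;
	c1 = out[1] >> 25; c2 = out[5] >> 25; out[1] &= 0x1ffffff; out[5] &= 0x1ffffff; out[2] += c1; out[6] += c2;
	c1 = out[2] >> 26; c2 = out[6] >> 26; out[2] &= 0x3ffffff; out[6] &= 0x3ffffff; out[3] += c1; out[7] += c2;
	c1 = out[3] >> 25; c2 = out[7] >> 25; out[3] &= 0x1ffffff; out[7] &= 0x1ffffff; out[4] += c1; out[8] += c2;
	c2 = out[8] >> 26; out[8] &= 0x3ffffff; out[9] += c2;
	c2 = out[9] >> 25; out[9] &= 0x1ffffff; out[0] += c2 * 19;
	c1 = out[0] >> 26; c2 = out[4] >> 26; out[0] &= 0x3ffffff; out[4] &= 0x3ffffff; out[1] += c1; out[5] += c2;
}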
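
On the [121666,121665] multiply and curve25519_final_nq: assuming the caller places AA = (x+z)^2 and BB = (x-z)^2 in the two lanes of sq, as in the usual Montgomery-ladder doubling (the scalarmult driver lives in curve25519-donna-scalarmult-sse2.h and is not shown here), the doubling formulas are

    x2 = AA * BB
    z2 = E * (AA + 121665*E),   where E = AA - BB

and 121665 = (486662 - 2)/4 is the standard a24 constant for curve25519. Scaling the lanes by [121666,121665] gives [121666*AA, 121665*BB], and

    121666*AA - 121665*BB = AA + 121665*(AA - BB) = AA + 121665*E,

so the single packed multiplication performed by curve25519_final_nq, [sq.x, sq121665.x - sq121665.z] * [sq.z, sq.x - sq.z], yields [x2, z2] in one pass. This is an interpretation of the code above, not text taken from the library.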