├── LICENSE.txt
├── README.md
├── convert-keil-to-gcc.sh
├── nrf52_tests_main.c
├── p256-cortex-m4-asm-gcc.S
├── p256-cortex-m4-asm-keil.s
├── p256-cortex-m4-config.h
├── p256-cortex-m4.c
├── p256-cortex-m4.h
└── testgen.js


/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2017-2021 Emil Lenngren
 2 | Copyright (c) 2021 Shortcut Labs AB
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | of this software and associated documentation files (the "Software"), to deal
 6 | in the Software without restriction, including without limitation the rights
 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | copies of the Software, and to permit persons to whom the Software is
 9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in all
12 | copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 | SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # P256-Cortex-M4
  2 | P256 ECDH and ECDSA for Cortex-M4, Cortex-M33 and other 32-bit ARM processors
  3 | 
  4 | This library implements highly optimimzed assembler versions for the NIST P-256 (secp256r1) elliptic curve for Cortex-M4/Cortex-M33. While optimized for these processors, it works on other newer 32-bit ARM processors as well.
  5 | 
  6 | The DSP extension CPU feature is required for Cortex-M33.
  7 | 
  8 | ### API
  9 | 
 10 | For full API documentation, see the header file `p256-cortex-m4.h`.
 11 | 
 12 | ### How to use
 13 | 
 14 | To use it in your project, add the following files to your project: `p256-cortex-m4.h`, `p256-cortex-m4-config.h`, `p256-cortex-m4.c`. Then add _only_ one of the asm files that suits you best as a compilation unit to your project. If you use Keil, add `p256-cortex-m4-asm-keil.s` as a source file and add `--cpreproc` to "Misc Controls" under Options -> Asm for the file. If you use GCC, add `p256-cortex-m4-asm-gcc.S` to your Makefile just like any other C source file.
 15 | 
 16 | To only compile in the features needed, the file `p256-cortex-m4-config.h` can be modified to include only specific algorithms. If used on a Cortex-A processor, the `has_d_cache` setting shall also be enabled in order to prevent side-channel attacks. There are also optimization options to trade code space for performance. The same options can also be defined directly at the command line when compiling, using e.g. `-Dinclude_p256_sign=0` to omit the code for creating a signature.
 17 | 
 18 | ### Examples
 19 | 
 20 | The library does not include a hash implementation (used during sign and verify), nor does it include a secure random number generator (used during keygen and sign). These functions must be implemented externally. Note that the secure random number generator must be for cryptographic purposes. In particular, `rand()` from the C standard library must not be used, while `/dev/urandom`, as can be found on many Unix systems, is compliant.
 21 | 
 22 | Note: all `uint32_t` arrays represent 256-bit integers in little-endian byte order (native to the CPU), located at a 4-byte alignment byte boundary. The `uint8_t` arrays either represent pure byte strings, or integers in big-endian byte order (no alignment requirements). When interacting with other libraries, make sure to carefully understand the data format used by those libraries. Some data conversion routines for easier interopability are included in the API.
 23 | 
 24 | #### ECDSA/ECDH Keygen
 25 | 
 26 | Generate a key pair for either ECDSA or ECDH (a key pair should not be used for both purposes).
 27 | 
 28 | ```C
 29 | uint32_t pubkey_x[8], pubkey_y[8], privkey[8];
 30 | do {
 31 |     generate_secure_random_data(privkey, sizeof(privkey));
 32 | } while (!p256_keygen(pubkey_x, pubkey_y, privkey));
 33 | ```
 34 | 
 35 | The result will now be contained in `pubkey_x`, `pubkey_y` and `privkey` (little-endian).
 36 | 
 37 | #### ECDSA Sign
 38 | 
 39 | In this example, SHA-256 is used as hash algorithm.
 40 | 
 41 | ```C
 42 | // Input values
 43 | uint8_t message[] = ...;
 44 | size_t message_len = ...;
 45 | uint32_t privkey[8] = ...;
 46 | 
 47 | // Output values (the signature)
 48 | uint32_t signature_r[8], signature_s[8];
 49 | 
 50 | uint8_t hash[32];
 51 | sha256_hash(message, message_len, hash);
 52 | 
 53 | uint32_t k[8]; // must be kept secret
 54 | do {
 55 |     generate_secure_random_data(k, sizeof(k));
 56 | } while (!p256_sign(signature_r, signature_s, hash, sizeof(hash), privkey, k));
 57 | ```
 58 | 
 59 | #### ECDSA Verify
 60 | 
 61 | In this example, SHA-256 is used as hash algorithm.
 62 | 
 63 | ```C
 64 | // Input values
 65 | uint8_t message[] = ...;
 66 | size_t message_len = ...;
 67 | uint32_t pubkey_x[8] = ..., pubkey_y[8] = ...;
 68 | uint32_t signature_r[8] = ..., signature_s[8] = ...;
 69 | 
 70 | uint8_t hash[32];
 71 | sha256_hash(message, message_len, hash);
 72 | 
 73 | if (p256_verify(pubkey_x, pubkey_y, hash, sizeof(hash), signature_r, signature_s)) {
 74 |     // Signature is valid
 75 | } else {
 76 |     // Signature is invalid
 77 | }
 78 | ```
 79 | 
 80 | #### ECDH Shared secret
 81 | 
 82 | After both parties have generated their key pair and exchanged their public keys, the shared secret can be generated. Both parties execute the following code.
 83 | 
 84 | ```C
 85 | // Input values
 86 | uint32_t others_public_key_x[8] = ..., others_public_key_y[8] = ...; // Received from remote party
 87 | uint32_t my_private_key[8] = ...; // Generated locally earlier during keygen
 88 | 
 89 | // Output value
 90 | uint8_t shared_secret[32];
 91 | 
 92 | if (!p256_ecdh_calc_shared_secret(shared_secret, my_private_key, others_public_key_x, others_public_key_y)) {
 93 |     // The other part sent an invalid public key, so abort and take actions
 94 |     // The shared_secret will at this point contain an undefined value, and should hence not be read
 95 | } else {
 96 |     // The shared_secret is now the same for both parts and may be used for cryptographic purposes
 97 | }
 98 | ```
 99 | 
100 | #### Endianness conversion
101 | 
102 | If you are receiving or sending 32-byte long `uint8_t` arrays representing 256-bit integers in big-endian byte order, you may convert them to or from `uint32_t` arrays in little-endian byte order (which are commonly used in this library) using `p256_convert_endianness`.
103 | 
104 | For example, before validating a signature, call:
105 | 
106 | ```C
107 | // Input values
108 | uint8_t signature_r_in[32] = ..., signature_s_in[32] = ...;
109 | 
110 | // Output values
111 | uint32_t signature_r[8], signature_s[8];
112 | 
113 | p256_convert_endianness(signature_r, signature_r_in, 32);
114 | p256_convert_endianness(signature_s, signature_s_in, 32);
115 | ```
116 | 
117 | After generating a signature, call:
118 | 
119 | ```C
120 | // Input values
121 | uint32_t signature_r[8] = ..., signature_s[8] = ...; // from p256_sign
122 | 
123 | // Output values
124 | uint8_t signature_r_out[32], signature_s_out[32];
125 | 
126 | p256_convert_endianness(signature_r_out, signature_r, 32);
127 | p256_convert_endianness(signature_s_out, signature_s, 32);
128 | ```
129 | 
130 | The same technique can be used for public keys.
131 | 
132 | ### Testing
133 | 
134 | The library has been tested against test vectors from Project Wycheproof (https://github.com/google/wycheproof). To run the tests, first execute `node testgen.js > tests.c` using Node >= 10.4. Then add the project files according to "How to use" plus `tests.c` and `nrf52_tests_main.c` to a new clean nRF52840 project using e.g. Segger Embedded Studio or Keil µVision. Compile and run and make sure all tests pass, by verifying that `main` returns 0.
135 | 
136 | Currently the work has been tested successfully on nRF52840, nRF5340 and MAX32670.
137 | 
138 | ### Performance
139 | The following numbers were obtained on a nRF52840 with ICACHE turned on, using GCC as compiler with `-O2` optimization.
140 | 
141 | Operation | Cycles | Time at 64 MHz
142 | --- | --- | ---
143 | Key generation ECDH/ECDSA | 327k | 5.1 ms
144 | Sign ECDSA | 375k | 5.9 ms
145 | Verify ECDSA | 976k | 15.3 ms
146 | Shared secret ECDH | 906k | 14.2 ms
147 | Point decompression | 48k | 0.75 ms
148 | 
149 | With all features enabled, the full library takes 8.9 kB in compiled form. 1.5 kB can be saved by enabling options that trades code space for performance.
150 | 
151 | The stack usage is at most 2 kB.
152 | 
153 | ### Security
154 | The implementation runs in constant time (unless input values are invalid) and uses a constant code memory access pattern, regardless of the scalar/private key in order to protect against side channel attacks. If desired, in particular when the processor has a data cache (like Cortex-A processors), the `has_d_cache` option can be enabled which also causes the RAM access pattern to be constant, at the expense of ~10% performance decrease.
155 | 
156 | ### Code
157 | The code is written in Keil's assembler format but was converted to GCC's assembler syntax using the included script `convert-keil-to-gcc.sh` (reads from stdin and writes to stdout).
158 | 
159 | ### Copying
160 | The code is licensed under the MIT license.
161 | 
162 | ### Thanks
163 | Thanks to ASSA ABLOY PPI for funding this work!
164 | 
165 | https://github.com/assaabloy-ppi
166 | 
167 | https://assaabloy.com
168 | 


--------------------------------------------------------------------------------
/convert-keil-to-gcc.sh:
--------------------------------------------------------------------------------
1 | echo "	.syntax unified"
2 | echo "	.thumb"
3 | perl -0777 -pe 's/\r?\n/\n/g;s/;thumb_func/.thumb_func/g;s/;/\/\//g;s/export/\.global/g;s/(([a-zA-Z0-9_]+) proc\n[\W\w]+?)endp/\1\.size \2, \.-\2/g;s/([a-zA-Z0-9_]+) proc\n/\t\.type \1, %function\n\1:\n/g;s/(\n)(\d+)(\n)/\1\2:\3/g;s/%b(\d+)/\1b/g;s/%f(\d+)/\1f/g;s/(frame[\W\w]+?\n)/\/\/\1/g;s/area \|([^\|]+)\|[^\n]*\n/\1\n/g;s/align 2/.align 1/g;s/align 4/.align 2/g;s/\n([a-zA-Z0-9_]+)(\n\s)dcd/\n\1:\2.word/g;s/\n([a-zA-Z0-9_]+)(\n\s)(.global [a-zA-Z0-9_]+\n\s)dcd/\n\t.type \1, \%object\n\1:\n\t.global \1\2.word/g;s/dcd/.word/g;s/\/\/ end ([a-zA-Z0-9_]+)/.size \1, .-\1/g;s/\n([a-zA-Z0-9_]+)(\n\s)dcw/\n\1:\2.hword/g;s/dcw/.hword/g;s/\n([a-zA-Z0-9_]+)(\n\s)dcb/\n\1:\2.byte/g;s/dcb/.byte/g;s/(X\d+) RN (\d+)/\t\1 .req r\2/g;s/ltorg/.ltorg/g;s/end(\n)/.end\1/g;s/(adcs|adc|sbcs|sbc) ([r|X]\d+|lr),([^,]+?)(\n| +?\/\/)/\1 \2,\2,\3\4/g;s/ +?\/\/label definition/: \/\//g'
4 | 


--------------------------------------------------------------------------------
/nrf52_tests_main.c:
--------------------------------------------------------------------------------
 1 | #include <stdbool.h>
 2 | #include <assert.h>
 3 | #include <nrf52.h>
 4 | #include <nrf52_bitfields.h>
 5 | 
 6 | // run "node testgen.js > tests.c" first, with a nodejs version >= 10.4
 7 | 
 8 | bool run_tests(void);
 9 | 
10 | int main() {
11 |     NRF_NVMC->ICACHECNF = NVMC_ICACHECNF_CACHEEN_Enabled << NVMC_ICACHECNF_CACHEEN_Pos;
12 |     bool res = run_tests();
13 |     assert(res);
14 |     return res ? 0 : 1;
15 | }
16 | 


--------------------------------------------------------------------------------
/p256-cortex-m4-asm-gcc.S:
--------------------------------------------------------------------------------
   1 | 	.syntax unified
   2 | 	.thumb
   3 | // Copyright (c) 2017-2021 Emil Lenngren
   4 | // Copyright (c) 2021 Shortcut Labs AB
   5 | //
   6 | // Permission is hereby granted, free of charge, to any person obtaining a copy
   7 | // of this software and associated documentation files (the "Software"), to deal
   8 | // in the Software without restriction, including without limitation the rights
   9 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 | // copies of the Software, and to permit persons to whom the Software is
  11 | // furnished to do so, subject to the following conditions:
  12 | //
  13 | // The above copyright notice and this permission notice shall be included in all
  14 | // copies or substantial portions of the Software.
  15 | //
  16 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 | // SOFTWARE.
  23 | 
  24 | // P256-Cortex-M4
  25 | 
  26 | #include "p256-cortex-m4-config.h"
  27 | 
  28 | // This is an armv7 implementation of P-256.
  29 | //
  30 | // When secret data is processed, the implementation runs in constant time,
  31 | // and no conditional branches depend on secret data.
  32 | 
  33 | 	.text
  34 | 	.align 2
  35 | 
  36 | #if (include_p256_basemult || include_p256_varmult) && has_d_cache
  37 | // Selects one of many values
  38 | // *r0 = output, *r1 = table, r2 = num coordinates, r3 = index to choose [0..7]
  39 | // 547 cycles for affine coordinates
  40 | 	.type P256_select_point, %function
  41 | P256_select_point:
  42 | 	.global P256_select_point
  43 | 	push {r0,r2,r3,r4-r11,lr}
  44 | 	//frame push {r4-r11,lr}
  45 | 	//frame address sp,48
  46 | 	
  47 | 	subs r2,#1
  48 | 	lsls r2,#5
  49 | 
  50 | 0:
  51 | 	rsbs r3,#0
  52 | 	sbcs r3,r3,r3
  53 | 	mvns r3,r3
  54 | 	
  55 | 	ldm r1!,{r6-r12,lr}
  56 | 	ands r6,r3
  57 | 	ands r7,r3
  58 | 	and r8,r3
  59 | 	and r9,r3
  60 | 	and r10,r3
  61 | 	and r11,r3
  62 | 	and r12,r3
  63 | 	and lr,r3
  64 | 	
  65 | 	adds r1,r2
  66 | 	
  67 | 	movs r3,#1
  68 | 1:
  69 | 	ldr r0,[sp,#8]
  70 | 	eors r0,r3
  71 | 	mrs r0,apsr
  72 | 	lsrs r0,#30
  73 | 	
  74 | 	ldm r1!,{r4,r5}
  75 | 	umlal r6,r2,r0,r4
  76 | 	umlal r7,r3,r0,r5
  77 | 	ldm r1!,{r4,r5}
  78 | 	umlal r8,r2,r0,r4
  79 | 	umlal r9,r3,r0,r5
  80 | 	ldm r1!,{r4,r5}
  81 | 	umlal r10,r2,r0,r4
  82 | 	umlal r11,r3,r0,r5
  83 | 	ldm r1!,{r4,r5}
  84 | 	umlal r12,r2,r0,r4
  85 | 	umlal lr,r3,r0,r5
  86 | 	
  87 | 	adds r1,r2
  88 | 	adds r3,#1
  89 | 	cmp r3,#8
  90 | 	bne 1b
  91 | 	
  92 | 	ldm sp,{r0,r4}
  93 | 	stm r0!,{r6-r12,lr}
  94 | 	str r0,[sp]
  95 | 	
  96 | 	sub r1,r1,r2, lsl #3
  97 | 	subs r1,#224
  98 | 	
  99 | 	subs r4,#1
 100 | 	str r4,[sp,#4]
 101 | 	ldr r3,[sp,#8]
 102 | 	bne 0b
 103 | 	
 104 | 	add sp,#12
 105 | 	//frame address sp,36
 106 | 	pop {r4-r11,pc}
 107 | 	.size P256_select_point, .-P256_select_point
 108 | #endif
 109 | 	
 110 | #if include_p256_verify || include_p256_sign
 111 | // in: *r0 = out, *r1 = a, *r2 = b
 112 | // quite slow, so only used in code not critical for performance
 113 | 	.type mul288x288, %function
 114 | mul288x288:
 115 | 	push {r4-r11,lr}
 116 | 	//frame push {r4-r11,lr}
 117 | 	
 118 | 	mov r4,r0
 119 | 	mov r5,r2
 120 | 	mov r6,r1
 121 | 	
 122 | 	movs r1,#72
 123 | 	bl setzero
 124 | 	
 125 | 	ldm r5,{r0-r2,r8-r12,lr}
 126 | 
 127 | 	movs r7,#9
 128 | 0:
 129 | 	ldm r6!,{r5}
 130 | 	push {r6,r7}
 131 | 	//frame address sp,44
 132 | 	movs r3,#0
 133 | 	ldm r4,{r6,r7}
 134 | 	umaal r6,r3,r5,r0
 135 | 	umaal r7,r3,r5,r1
 136 | 	stm r4!,{r6,r7}
 137 | 	ldm r4,{r6,r7}
 138 | 	umaal r6,r3,r5,r2
 139 | 	umaal r7,r3,r5,r8
 140 | 	stm r4!,{r6,r7}
 141 | 	ldm r4,{r6,r7}
 142 | 	umaal r6,r3,r5,r9
 143 | 	umaal r7,r3,r5,r10
 144 | 	stm r4!,{r6,r7}
 145 | 	ldm r4,{r6,r7}
 146 | 	umaal r6,r3,r5,r11
 147 | 	umaal r7,r3,r5,r12
 148 | 	stm r4!,{r6,r7}
 149 | 	ldm r4,{r6}
 150 | 	umaal r3,r6,r5,lr
 151 | 	stm r4!,{r3,r6}
 152 | 	
 153 | 	subs r4,r4,#36
 154 | 	pop {r6,r7}
 155 | 	//frame address sp,36
 156 | 	subs r7,r7,#1
 157 | 	bne 0b
 158 | 	
 159 | 	pop {r4-r11,pc}
 160 | 	.size mul288x288, .-mul288x288
 161 | // in: r0 = address, r1 = num bytes (> 0, must be multiple of 8)
 162 | 	.type setzero, %function
 163 | setzero:
 164 | 	movs r2,#0
 165 | 	movs r3,#0
 166 | 0:
 167 | 	stm r0!,{r2,r3}
 168 | 	subs r1,r1,#8
 169 | 	bne 0b
 170 | 	bx lr
 171 | 	.size setzero, .-setzero
 172 | #endif
 173 | 	
 174 | 	
 175 | // Field arithmetics for the prime field where p = 2^256 - 2^224 + 2^192 + 2^96 - 1
 176 | // Multiplication and Squaring use Montgomery Modular Multiplication where R = 2^256
 177 | // To convert a value to Montgomery class, use P256_mulmod(value, R^512 mod p)
 178 | // To convert a value from Montgomery class to standard form, use P256_mulmod(value, 1)
 179 | 
 180 | #if include_p256_mult || include_p256_decompress_point || include_p256_decode_point
 181 | #if use_mul_for_sqr
 182 | 	.type P256_sqrmod, %function
 183 | P256_sqrmod:
 184 | 	push {r0-r7,lr}
 185 | 	//frame push {lr}
 186 | 	//frame address sp,36
 187 | 	mov r1,sp
 188 | 	mov r2,sp
 189 | 	bl P256_mulmod
 190 | 	add sp,sp,#32
 191 | 	//frame address sp,4
 192 | 	pop {pc}
 193 | 	.size P256_sqrmod, .-P256_sqrmod
 194 | #endif
 195 | 	
 196 | #if has_fpu
 197 | // If inputs are A*R mod p and B*R mod p, computes AB*R mod p
 198 | // *r1 = in1, *r2 = in2
 199 | // out: r0-r7
 200 | // clobbers all other registers
 201 | 	.type P256_mulmod, %function
 202 | P256_mulmod:
 203 | 	push {lr}
 204 | 	//frame push {lr}
 205 | 	
 206 | 	vmov s4,r2
 207 | 	vldm r1,{s8-s15}
 208 | 	
 209 | 	ldm r2,{r2,r3,r4,r5}
 210 | 	
 211 | 	vmov r0,r10,s8,s9
 212 | 	umull r6,r1,r2,r0
 213 | 	
 214 | 	umull r7,r12,r3,r0
 215 | 	umaal r7,r1,r2,r10
 216 | 	
 217 | 	vmov s0,s1,r6,r7
 218 | 	
 219 | 	umull r8,r6,r4,r0
 220 | 	umaal r8,r1,r3,r10
 221 | 	
 222 | 	umull r9,r7,r5,r0
 223 | 	umaal r9,r1,r4,r10
 224 | 	
 225 | 	umaal r1,r7,r5,r10
 226 | 	
 227 | 	vmov lr,r0,s10,s11
 228 | 	
 229 | 	umaal r8,r12,r2,lr
 230 | 	umaal r9,r12,r3,lr
 231 | 	umaal r1,r12,r4,lr
 232 | 	umaal r12,r7,r5,lr
 233 | 	
 234 | 	umaal r9,r6,r2,r0
 235 | 	umaal r1,r6,r3,r0
 236 | 	umaal r12,r6,r4,r0
 237 | 	umaal r6,r7,r5,r0
 238 | 	
 239 | 	vmov s2,s3,r8,r9
 240 | 	
 241 | 	vmov r10,lr,s12,s13
 242 | 	
 243 | 	mov r9,#0
 244 | 	umaal r1,r9,r2,r10
 245 | 	umaal r12,r9,r3,r10
 246 | 	umaal r6,r9,r4,r10
 247 | 	umaal r7,r9,r5,r10
 248 | 	
 249 | 	mov r10,#0
 250 | 	umaal r12,r10,r2,lr
 251 | 	umaal r6,r10,r3,lr
 252 | 	umaal r7,r10,r4,lr
 253 | 	umaal r9,r10,r5,lr
 254 | 	
 255 | 	vmov r8,s14
 256 | 	mov lr,#0
 257 | 	umaal lr,r6,r2,r8
 258 | 	umaal r7,r6,r3,r8
 259 | 	umaal r9,r6,r4,r8
 260 | 	umaal r10,r6,r5,r8
 261 | 	
 262 | 	//_ _ _ _ _ 6 10 9| 7 | lr 12 1 _ _ _ _
 263 | 	
 264 | 	vmov r8,s15
 265 | 	mov r11,#0
 266 | 	umaal r7,r11,r2,r8
 267 | 	umaal r9,r11,r3,r8
 268 | 	umaal r10,r11,r4,r8
 269 | 	umaal r6,r11,r5,r8
 270 | 	
 271 | 	//_ _ _ _ 11 6 10 9| 7 | lr 12 1 _ _ _ _
 272 | 	
 273 | 	vmov r2,s4
 274 | 	adds r2,r2,#16
 275 | 	ldm r2,{r2,r3,r4,r5}
 276 | 	
 277 | 	vmov r8,s8
 278 | 	movs r0,#0
 279 | 	umaal r1,r0,r2,r8
 280 | 	vmov s4,r1
 281 | 	umaal r12,r0,r3,r8
 282 | 	umaal lr,r0,r4,r8
 283 | 	umaal r0,r7,r5,r8 // 7=carry for 9
 284 | 	
 285 | 	//_ _ _ _ 11 6 10 9+7| 0 | lr 12 _ _ _ _ _
 286 | 	
 287 | 	vmov r8,s9
 288 | 	movs r1,#0
 289 | 	umaal r12,r1,r2,r8
 290 | 	vmov s5,r12
 291 | 	umaal lr,r1,r3,r8
 292 | 	umaal r0,r1,r4,r8
 293 | 	umaal r1,r7,r5,r8 // 7=carry for 10
 294 | 	
 295 | 	//_ _ _ _ 11 6 10+7 9+1| 0 | lr _ _ _ _ _ _
 296 | 	
 297 | 	vmov r8,s10
 298 | 	mov r12,#0
 299 | 	umaal lr,r12,r2,r8
 300 | 	vmov s6,lr
 301 | 	umaal r0,r12,r3,r8
 302 | 	umaal r1,r12,r4,r8
 303 | 	umaal r10,r12,r5,r8 // 12=carry for 6
 304 | 	
 305 | 	//_ _ _ _ 11 6+12 10+7 9+1| 0 | _ _ _ _ _ _ _
 306 | 	
 307 | 	vmov r8,s11
 308 | 	mov lr,#0
 309 | 	umaal r0,lr,r2,r8
 310 | 	vmov s7,r0
 311 | 	umaal r1,lr,r3,r8
 312 | 	umaal r10,lr,r4,r8
 313 | 	umaal r6,lr,r5,r8 // lr=carry for saved
 314 | 	
 315 | 	//_ _ _ _ 11+lr 6+12 10+7 9+1| _ | _ _ _ _ _ _ _
 316 | 	
 317 | 	vmov r0,r8,s12,s13
 318 | 	umaal r1,r9,r2,r0
 319 | 	vmov s8,r1
 320 | 	umaal r9,r10,r3,r0
 321 | 	umaal r10,r6,r4,r0
 322 | 	umaal r11,r6,r5,r0 // 6=carry for next
 323 | 	
 324 | 	//_ _ _ 6 11+lr 10+12 9+7 _ | _ | _ _ _ _ _ _ _
 325 | 	
 326 | 	umaal r9,r7,r2,r8
 327 | 	umaal r10,r7,r3,r8
 328 | 	umaal r11,r7,r4,r8
 329 | 	umaal r6,r7,r5,r8
 330 | 	
 331 | 	vmov r0,r8,s14,s15
 332 | 	umaal r10,r12,r2,r0
 333 | 	umaal r11,r12,r3,r0
 334 | 	umaal r6,r12,r4,r0
 335 | 	umaal r7,r12,r5,r0
 336 | 	
 337 | 	umaal r11,lr,r2,r8
 338 | 	umaal lr,r6,r3,r8
 339 | 	umaal r6,r7,r4,r8
 340 | 	umaal r7,r12,r5,r8
 341 | 	
 342 | 	// 12 7 6 lr 11 10 9 s8 s7 s6 s5 s4 s3 s2 s1 s0
 343 | 	
 344 | 	//now reduce
 345 | 	vmov s13,s14,r6,r7
 346 | 	vmov s15,r12
 347 | 	
 348 | 	vmov r0,r1,s0,s1
 349 | 	vmov r2,r3,s2,s3
 350 | 	vmov r4,r5,s4,s5
 351 | 	vmov r6,r7,s6,s7
 352 | 	vmov r8,s8
 353 | 	
 354 | 	mov r12,#0
 355 | 
 356 | 	adds r3,r0
 357 | 	adcs r4,r4,r1
 358 | 	adcs r5,r5,r2
 359 | 	adcs r6,r6,r0
 360 | 	adcs r7,r7,r1
 361 | 	adcs r8,r8,r0
 362 | 	adcs r9,r9,r1
 363 | 	adcs r10,r10,#0
 364 | 	adcs r11,r11,#0
 365 | 	adcs r12,r12,#0
 366 | 
 367 | 	adds r6,r3
 368 | 	adcs r7,r7,r4 // r4 instead of 0
 369 | 	adcs r8,r8,r2
 370 | 	adcs r9,r9,r3
 371 | 	adcs r10,r10,r2
 372 | 	adcs r11,r11,r3
 373 | 	adcs r12,r12,#0
 374 | 
 375 | 	subs r7,r0
 376 | 	sbcs r8,r8,r1
 377 | 	sbcs r9,r9,r2
 378 | 	sbcs r10,r10,r3
 379 | 	sbcs r11,r11,#0
 380 | 	sbcs r12,r12,#0 // r12 is between 0 and 2
 381 | 
 382 | 	vmov r1,r2,s13,s14
 383 | 	vmov r3,s15
 384 | 
 385 | 	adds r0,lr,r12
 386 | 	adcs r1,r1,#0
 387 | 	mov r12,#0
 388 | 	adcs r12,r12,#0
 389 | 
 390 | 	//adds r7,r4 (added above instead)
 391 | 	adcs r8,r8,r5
 392 | 	adcs r9,r9,r6
 393 | 	adcs r10,r10,r4
 394 | 	adcs r11,r11,r5
 395 | 	adcs r0,r0,r4
 396 | 	adcs r1,r1,r5
 397 | 	adcs r2,r2,r12
 398 | 	adcs r3,r3,#0
 399 | 	mov r12,#0
 400 | 	adcs r12,r12,#0
 401 | 
 402 | 	adcs r10,r10,r7
 403 | 	adcs r11,r11,#0
 404 | 	adcs r0,r0,r6
 405 | 	adcs r1,r1,r7
 406 | 	adcs r2,r2,r6
 407 | 	adcs r3,r3,r7
 408 | 	adcs r12,r12,#0
 409 | 
 410 | 	subs r11,r4
 411 | 	sbcs r0,r0,r5
 412 | 	sbcs r1,r1,r6
 413 | 	sbcs r2,r2,r7
 414 | 	sbcs r3,r3,#0
 415 | 	sbcs r12,r12,#0
 416 | 	
 417 | 	// now (T + mN) / R is
 418 | 	// 8 9 10 11 0 1 2 3 12 (lsb -> msb)
 419 | 	
 420 | 	subs r8,r8,#0xffffffff
 421 | 	sbcs r9,r9,#0xffffffff
 422 | 	sbcs r10,r10,#0xffffffff
 423 | 	sbcs r11,r11,#0
 424 | 	sbcs r4,r0,#0
 425 | 	sbcs r5,r1,#0
 426 | 	sbcs r6,r2,#1
 427 | 	sbcs r7,r3,#0xffffffff
 428 | 	sbc r12,r12,#0
 429 | 	
 430 | 	adds r0,r8,r12
 431 | 	adcs r1,r9,r12
 432 | 	adcs r2,r10,r12
 433 | 	adcs r3,r11,#0
 434 | 	adcs r4,r4,#0
 435 | 	adcs r5,r5,#0
 436 | 	adcs r6,r6,r12, lsr #31
 437 | 	adcs r7,r7,r12
 438 | 	
 439 | 	pop {pc}
 440 | 	.size P256_mulmod, .-P256_mulmod
 441 | 	
 442 | #if !use_mul_for_sqr
 443 | // If input is A*R mod p, computes A^2*R mod p
 444 | // in/out: r0-r7
 445 | // clobbers all other registers
 446 | 	.type P256_sqrmod, %function
 447 | P256_sqrmod:
 448 | 	push {lr}
 449 | 	//frame push {lr}
 450 | 	
 451 | 	//mul 01, 00
 452 | 	umull r9,r10,r0,r0
 453 | 	umull r11,r12,r0,r1
 454 | 	adds r11,r11,r11
 455 | 	mov lr,#0
 456 | 	umaal r10,r11,lr,lr
 457 | 	
 458 | 	//r9 r10 done
 459 | 	//r12 carry for 3rd before col
 460 | 	//r11+C carry for 3rd final col
 461 | 	
 462 | 	vmov s0,s1,r9,r10
 463 | 	
 464 | 	//mul 02, 11
 465 | 	mov r8,#0
 466 | 	umaal r8,r12,r0,r2
 467 | 	adcs r8,r8,r8
 468 | 	umaal r8,r11,r1,r1
 469 | 	
 470 | 	//r8 done (3rd col)
 471 | 	//r12 carry for 4th before col
 472 | 	//r11+C carry for 4th final col
 473 | 	
 474 | 	//mul 03, 12
 475 | 	umull r9,r10,r0,r3
 476 | 	umaal r9,r12,r1,r2
 477 | 	adcs r9,r9,r9
 478 | 	umaal r9,r11,lr,lr
 479 | 	
 480 | 	//r9 done (4th col)
 481 | 	//r10+r12 carry for 5th before col
 482 | 	//r11+C carry for 5th final col
 483 | 	
 484 | 	vmov s2,s3,r8,r9
 485 | 	
 486 | 	//mul 04, 13, 22
 487 | 	mov r9,#0
 488 | 	umaal r9,r10,r0,r4
 489 | 	umaal r9,r12,r1,r3
 490 | 	adcs r9,r9,r9
 491 | 	umaal r9,r11,r2,r2
 492 | 	
 493 | 	//r9 done (5th col)
 494 | 	//r10+r12 carry for 6th before col
 495 | 	//r11+C carry for 6th final col
 496 | 	
 497 | 	vmov s4,r9
 498 | 	
 499 | 	//mul 05, 14, 23
 500 | 	umull r9,r8,r0,r5
 501 | 	umaal r9,r10,r1,r4
 502 | 	umaal r9,r12,r2,r3
 503 | 	adcs r9,r9,r9
 504 | 	umaal r9,r11,lr,lr
 505 | 	
 506 | 	//r9 done (6th col)
 507 | 	//r10+r12+r8 carry for 7th before col
 508 | 	//r11+C carry for 7th final col
 509 | 	
 510 | 	vmov s5,r9
 511 | 	
 512 | 	//mul 06, 15, 24, 33
 513 | 	mov r9,#0
 514 | 	umaal r9,r8,r1,r5
 515 | 	umaal r9,r12,r2,r4
 516 | 	umaal r9,r10,r0,r6
 517 | 	adcs r9,r9,r9
 518 | 	umaal r9,r11,r3,r3
 519 | 	
 520 | 	//r9 done (7th col)
 521 | 	//r8+r10+r12 carry for 8th before col
 522 | 	//r11+C carry for 8th final col
 523 | 	
 524 | 	vmov s6,r9
 525 | 	
 526 | 	//mul 07, 16, 25, 34
 527 | 	umull r0,r9,r0,r7
 528 | 	umaal r0,r10,r1,r6
 529 | 	umaal r0,r12,r2,r5
 530 | 	umaal r0,r8,r3,r4
 531 | 	adcs r0,r0,r0
 532 | 	umaal r0,r11,lr,lr
 533 | 	
 534 | 	//r0 done (8th col)
 535 | 	//r9+r8+r10+r12 carry for 9th before col
 536 | 	//r11+C carry for 9th final col
 537 | 	
 538 | 	//mul 17, 26, 35, 44
 539 | 	umaal r9,r8,r1,r7 //r1 is now dead
 540 | 	umaal r9,r10,r2,r6
 541 | 	umaal r12,r9,r3,r5
 542 | 	adcs r12,r12,r12
 543 | 	umaal r11,r12,r4,r4
 544 | 	
 545 | 	//r11 done (9th col)
 546 | 	//r8+r10+r9 carry for 10th before col
 547 | 	//r12+C carry for 10th final col
 548 | 	
 549 | 	//mul 27, 36, 45
 550 | 	umaal r9,r8,r2,r7 //r2 is now dead
 551 | 	umaal r10,r9,r3,r6
 552 | 	movs r2,#0
 553 | 	umaal r10,r2,r4,r5
 554 | 	adcs r10,r10,r10
 555 | 	umaal r12,r10,lr,lr
 556 | 	
 557 | 	//r12 done (10th col)
 558 | 	//r8+r9+r2 carry for 11th before col
 559 | 	//r10+C carry for 11th final col
 560 | 	
 561 | 	//mul 37, 46, 55
 562 | 	umaal r2,r8,r3,r7 //r3 is now dead
 563 | 	umaal r9,r2,r4,r6
 564 | 	adcs r9,r9,r9
 565 | 	umaal r10,r9,r5,r5
 566 | 	
 567 | 	//r10 done (11th col)
 568 | 	//r8+r2 carry for 12th before col
 569 | 	//r9+C carry for 12th final col
 570 | 	
 571 | 	//mul 47, 56
 572 | 	movs r3,#0
 573 | 	umaal r3,r8,r4,r7 //r4 is now dead
 574 | 	umaal r3,r2,r5,r6
 575 | 	adcs r3,r3,r3
 576 | 	umaal r9,r3,lr,lr
 577 | 	
 578 | 	//r9 done (12th col)
 579 | 	//r8+r2 carry for 13th before col
 580 | 	//r3+C carry for 13th final col
 581 | 	
 582 | 	//mul 57, 66
 583 | 	umaal r8,r2,r5,r7 //r5 is now dead
 584 | 	adcs r8,r8,r8
 585 | 	umaal r3,r8,r6,r6
 586 | 	
 587 | 	//r3 done (13th col)
 588 | 	//r2 carry for 14th before col
 589 | 	//r8+C carry for 14th final col
 590 | 	
 591 | 	//mul 67
 592 | 	umull r4,r5,lr,lr // set 0
 593 | 	umaal r4,r2,r6,r7
 594 | 	adcs r4,r4,r4
 595 | 	umaal r4,r8,lr,lr
 596 | 	
 597 | 	//r4 done (14th col)
 598 | 	//r2 carry for 15th before col
 599 | 	//r8+C carry for 15th final col
 600 | 	
 601 | 	//mul 77
 602 | 	adcs r2,r2,r2
 603 | 	umaal r8,r2,r7,r7
 604 | 	adcs r2,r2,lr
 605 | 	
 606 | 	//r8 done (15th col)
 607 | 	//r2 done (16th col)
 608 | 	
 609 | 	//msb -> lsb: r2 r8 r4 r3 r9 r10 r12 r11 r0 s6 s5 s4 s3 s2 s1 s0
 610 | 	//lr: 0
 611 | 	//now do reduction
 612 | 	
 613 | 	vmov s13,s14,r4,r8
 614 | 	vmov s15,r2 //s15
 615 | 	
 616 | 	vmov r1,r2,s0,s1
 617 | 	vmov r8,r7,s2,s3
 618 | 	vmov r6,r5,s4,s5
 619 | 	vmov r4,s6
 620 | 	//lr is already 0
 621 | 	X0 .req r1
 622 | 	X1 .req r2
 623 | 	X2 .req r8
 624 | 	X3 .req r7
 625 | 	X4 .req r6
 626 | 	X5 .req r5
 627 | 	X6 .req r4
 628 | 	X7 .req r0
 629 | 	X8 .req r11
 630 | 	X9 .req r12
 631 | 	X10 .req r10
 632 | 	X11 .req r9
 633 | 	X12 .req r3
 634 | 
 635 | 	X13 .req r7
 636 | 	X14 .req r8
 637 | 	X15 .req r2
 638 | 
 639 | 	adcs X3,X3,X0
 640 | 	adcs X4,X4,X1
 641 | 	adcs X5,X5,X2
 642 | 	adcs X6,X6,X0
 643 | 	adcs X7,X7,X1
 644 | 	adcs X8,X8,X0
 645 | 	adcs X9,X9,X1
 646 | 	adcs X10,X10,#0
 647 | 	adcs X11,X11,#0
 648 | 	adcs lr,lr,#0
 649 | 
 650 | 	adds X6,X3
 651 | 	adcs X7,X7,X4 // X4 instead of 0
 652 | 	adcs X8,X8,X2
 653 | 	adcs X9,X9,X3
 654 | 	adcs X10,X10,X2
 655 | 	adcs X11,X11,X3
 656 | 	adcs lr,lr,#0
 657 | 
 658 | 	subs X7,X0
 659 | 	sbcs X8,X8,X1
 660 | 	sbcs X9,X9,X2
 661 | 	sbcs X10,X10,X3
 662 | 	sbcs X11,X11,#0
 663 | 	sbcs lr,lr,#0 // lr is between 0 and 2
 664 | 	
 665 | 	vmov X13,X14,s13,s14
 666 | 	vmov X15,s15
 667 | 
 668 | 	adds X0,X12,lr
 669 | 	adcs X13,X13,#0
 670 | 	mov lr,#0
 671 | 	adcs lr,lr,#0
 672 | 
 673 | 	//adds X7,X4 (added above instead)
 674 | 	adcs X8,X8,X5
 675 | 	adcs X9,X9,X6
 676 | 	adcs X10,X10,X4
 677 | 	adcs X11,X11,X5
 678 | 	adcs X0,X0,X4
 679 | 	adcs X13,X13,X5
 680 | 	adcs X14,X14,lr
 681 | 	adcs X15,X15,#0
 682 | 	mov lr,#0
 683 | 	adcs lr,lr,#0
 684 | 
 685 | 	adcs X10,X10,X7
 686 | 	adcs X11,X11,#0
 687 | 	adcs X0,X0,X6
 688 | 	adcs X13,X13,X7
 689 | 	adcs X14,X14,X6
 690 | 	adcs X15,X15,X7
 691 | 	adcs lr,lr,#0
 692 | 
 693 | 	subs X11,X4
 694 | 	sbcs X0,X0,X5
 695 | 	sbcs X13,X13,X6
 696 | 	sbcs X14,X14,X7
 697 | 	sbcs X15,X15,#0
 698 | 	sbcs lr,lr,#0
 699 | 	
 700 | 	// now (T + mN) / R is
 701 | 	// X8 X9 X10 X11 X0 X13 X14 X15 lr (lsb -> msb)
 702 | 	// r11 r12 r10 r9 r1 r7 r8 r2 lr
 703 | 	
 704 | 	subs r0,r11,#0xffffffff
 705 | 	sbcs r12,r12,#0xffffffff
 706 | 	sbcs r4,r10,#0xffffffff
 707 | 	sbcs r9,r9,#0
 708 | 	sbcs r6,r1,#0
 709 | 	sbcs r5,r7,#0
 710 | 	sbcs r10,r8,#1
 711 | 	sbcs r8,r2,#0xffffffff
 712 | 	sbcs r7,lr,#0
 713 | 	
 714 | 	adds r0,r0,r7
 715 | 	adcs r1,r12,r7
 716 | 	adcs r2,r4,r7
 717 | 	adcs r3,r9,#0
 718 | 	adcs r4,r6,#0
 719 | 	adcs r5,r5,#0
 720 | 	adcs r6,r10,r7, lsr #31
 721 | 	adcs r7,r8,r7
 722 | 	
 723 | 	pop {pc}
 724 | 	.size P256_sqrmod, .-P256_sqrmod
 725 | #endif
 726 | 	
 727 | #else
 728 | // If inputs are A*R mod p and B*R mod p, computes AB*R mod p
 729 | // *r1 = in1, *r2 = in2
 730 | // out: r0-r7
 731 | // clobbers all other registers
 732 | // cycles: 231
 733 | 	.type P256_mulmod, %function
 734 | P256_mulmod:
 735 | 	push {r2,lr}
 736 | 	//frame push {lr}
 737 | 	//frame address sp,8
 738 | 	
 739 | 	sub sp,#28
 740 | 	//frame address sp,36
 741 | 	ldm r2,{r2,r3,r4,r5}
 742 | 	
 743 | 	ldm r1!,{r0,r10,lr}
 744 | 	umull r6,r11,r2,r0
 745 | 	
 746 | 	umull r7,r12,r3,r0
 747 | 	umaal r7,r11,r2,r10
 748 | 	
 749 | 	push {r6,r7}
 750 | 	//frame address sp,44
 751 | 	
 752 | 	umull r8,r6,r4,r0
 753 | 	umaal r8,r11,r3,r10
 754 | 	
 755 | 	umull r9,r7,r5,r0
 756 | 	umaal r9,r11,r4,r10
 757 | 	
 758 | 	umaal r11,r7,r5,r10
 759 | 	
 760 | 	umaal r8,r12,r2,lr
 761 | 	umaal r9,r12,r3,lr
 762 | 	umaal r11,r12,r4,lr
 763 | 	umaal r12,r7,r5,lr
 764 | 	
 765 | 	ldm r1!,{r0,r10,lr}
 766 | 	
 767 | 	umaal r9,r6,r2,r0
 768 | 	umaal r11,r6,r3,r0
 769 | 	umaal r12,r6,r4,r0
 770 | 	umaal r6,r7,r5,r0
 771 | 	
 772 | 	strd r8,r9,[sp,#8]
 773 | 	
 774 | 	mov r9,#0
 775 | 	umaal r11,r9,r2,r10
 776 | 	umaal r12,r9,r3,r10
 777 | 	umaal r6,r9,r4,r10
 778 | 	umaal r7,r9,r5,r10
 779 | 	
 780 | 	mov r10,#0
 781 | 	umaal r12,r10,r2,lr
 782 | 	umaal r6,r10,r3,lr
 783 | 	umaal r7,r10,r4,lr
 784 | 	umaal r9,r10,r5,lr
 785 | 	
 786 | 	ldr r8,[r1],#4
 787 | 	mov lr,#0
 788 | 	umaal lr,r6,r2,r8
 789 | 	umaal r7,r6,r3,r8
 790 | 	umaal r9,r6,r4,r8
 791 | 	umaal r10,r6,r5,r8
 792 | 	
 793 | 	//_ _ _ _ _ 6 10 9| 7 | lr 12 11 _ _ _ _
 794 | 	
 795 | 	ldr r8,[r1],#-28
 796 | 	mov r0,#0
 797 | 	umaal r7,r0,r2,r8
 798 | 	umaal r9,r0,r3,r8
 799 | 	umaal r10,r0,r4,r8
 800 | 	umaal r6,r0,r5,r8
 801 | 	
 802 | 	push {r0}
 803 | 	//frame address sp,48
 804 | 	
 805 | 	//_ _ _ _ s 6 10 9| 7 | lr 12 11 _ _ _ _
 806 | 	
 807 | 	ldr r2,[sp,#40]
 808 | 	adds r2,r2,#16
 809 | 	ldm r2,{r2,r3,r4,r5}
 810 | 	
 811 | 	ldr r8,[r1],#4
 812 | 	mov r0,#0
 813 | 	umaal r11,r0,r2,r8
 814 | 	str r11,[sp,#16+4]
 815 | 	umaal r12,r0,r3,r8
 816 | 	umaal lr,r0,r4,r8
 817 | 	umaal r0,r7,r5,r8 // 7=carry for 9
 818 | 	
 819 | 	//_ _ _ _ s 6 10 9+7| 0 | lr 12 _ _ _ _ _
 820 | 	
 821 | 	ldr r8,[r1],#4
 822 | 	mov r11,#0
 823 | 	umaal r12,r11,r2,r8
 824 | 	str r12,[sp,#20+4]
 825 | 	umaal lr,r11,r3,r8
 826 | 	umaal r0,r11,r4,r8
 827 | 	umaal r11,r7,r5,r8 // 7=carry for 10
 828 | 	
 829 | 	//_ _ _ _ s 6 10+7 9+11| 0 | lr _ _ _ _ _ _
 830 | 	
 831 | 	ldr r8,[r1],#4
 832 | 	mov r12,#0
 833 | 	umaal lr,r12,r2,r8
 834 | 	str lr,[sp,#24+4]
 835 | 	umaal r0,r12,r3,r8
 836 | 	umaal r11,r12,r4,r8
 837 | 	umaal r10,r12,r5,r8 // 12=carry for 6
 838 | 	
 839 | 	//_ _ _ _ s 6+12 10+7 9+11| 0 | _ _ _ _ _ _ _
 840 | 	
 841 | 	ldr r8,[r1],#4
 842 | 	mov lr,#0
 843 | 	umaal r0,lr,r2,r8
 844 | 	str r0,[sp,#28+4]
 845 | 	umaal r11,lr,r3,r8
 846 | 	umaal r10,lr,r4,r8
 847 | 	umaal r6,lr,r5,r8 // lr=carry for saved
 848 | 	
 849 | 	//_ _ _ _ s+lr 6+12 10+7 9+11| _ | _ _ _ _ _ _ _
 850 | 	
 851 | 	ldm r1!,{r0,r8}
 852 | 	umaal r11,r9,r2,r0
 853 | 	str r11,[sp,#32+4]
 854 | 	umaal r9,r10,r3,r0
 855 | 	umaal r10,r6,r4,r0
 856 | 	pop {r11}
 857 | 	//frame address sp,44
 858 | 	umaal r11,r6,r5,r0 // 6=carry for next
 859 | 	
 860 | 	//_ _ _ 6 11+lr 10+12 9+7 _ | _ | _ _ _ _ _ _ _
 861 | 	
 862 | 	umaal r9,r7,r2,r8
 863 | 	umaal r10,r7,r3,r8
 864 | 	umaal r11,r7,r4,r8
 865 | 	umaal r6,r7,r5,r8
 866 | 	
 867 | 	ldm r1!,{r0,r8}
 868 | 	umaal r10,r12,r2,r0
 869 | 	umaal r11,r12,r3,r0
 870 | 	umaal r6,r12,r4,r0
 871 | 	umaal r7,r12,r5,r0
 872 | 	
 873 | 	umaal r11,lr,r2,r8
 874 | 	umaal lr,r6,r3,r8
 875 | 	umaal r6,r7,r4,r8
 876 | 	umaal r7,r12,r5,r8
 877 | 	
 878 | 	// 12 7 6 lr 11 10 9 stack*9
 879 | 	push {r6,r7,r12}
 880 | 	//frame address sp,56
 881 | 	add r7,sp,#12
 882 | 	ldm r7,{r0-r8}
 883 | 	
 884 | 	mov r12,#0
 885 | 
 886 | 	adds r3,r0
 887 | 	adcs r4,r4,r1
 888 | 	adcs r5,r5,r2
 889 | 	adcs r6,r6,r0
 890 | 	adcs r7,r7,r1
 891 | 	adcs r8,r8,r0
 892 | 	adcs r9,r9,r1
 893 | 	adcs r10,r10,#0
 894 | 	adcs r11,r11,#0
 895 | 	adcs r12,r12,#0
 896 | 
 897 | 	adds r6,r3
 898 | 	adcs r7,r7,r4 // r4 instead of 0
 899 | 	adcs r8,r8,r2
 900 | 	adcs r9,r9,r3
 901 | 	adcs r10,r10,r2
 902 | 	adcs r11,r11,r3
 903 | 	adcs r12,r12,#0
 904 | 
 905 | 	subs r7,r0
 906 | 	sbcs r8,r8,r1
 907 | 	sbcs r9,r9,r2
 908 | 	sbcs r10,r10,r3
 909 | 	sbcs r11,r11,#0
 910 | 	sbcs r12,r12,#0 // r12 is between 0 and 2
 911 | 
 912 | 	pop {r1-r3}
 913 | 	//frame address sp,44
 914 | 
 915 | 	adds r0,lr,r12
 916 | 	adcs r1,r1,#0
 917 | 	mov r12,#0
 918 | 	adcs r12,r12,#0
 919 | 
 920 | 	//adds r7,r4 (added above instead)
 921 | 	adcs r8,r8,r5
 922 | 	adcs r9,r9,r6
 923 | 	adcs r10,r10,r4
 924 | 	adcs r11,r11,r5
 925 | 	adcs r0,r0,r4
 926 | 	adcs r1,r1,r5
 927 | 	adcs r2,r2,r12
 928 | 	adcs r3,r3,#0
 929 | 	mov r12,#0
 930 | 	adcs r12,r12,#0
 931 | 
 932 | 	adcs r10,r10,r7
 933 | 	adcs r11,r11,#0
 934 | 	adcs r0,r0,r6
 935 | 	adcs r1,r1,r7
 936 | 	adcs r2,r2,r6
 937 | 	adcs r3,r3,r7
 938 | 	adcs r12,r12,#0
 939 | 
 940 | 	subs r11,r4
 941 | 	sbcs r0,r0,r5
 942 | 	sbcs r1,r1,r6
 943 | 	sbcs r2,r2,r7
 944 | 	sbcs r3,r3,#0
 945 | 	sbcs r12,r12,#0
 946 | 	
 947 | 	// now (T + mN) / R is
 948 | 	// 8 9 10 11 0 1 2 3 12 (lsb -> msb)
 949 | 	
 950 | 	subs r8,r8,#0xffffffff
 951 | 	sbcs r9,r9,#0xffffffff
 952 | 	sbcs r10,r10,#0xffffffff
 953 | 	sbcs r11,r11,#0
 954 | 	sbcs r4,r0,#0
 955 | 	sbcs r5,r1,#0
 956 | 	sbcs r6,r2,#1
 957 | 	sbcs r7,r3,#0xffffffff
 958 | 	sbc r12,r12,#0
 959 | 	
 960 | 	adds r0,r8,r12
 961 | 	adcs r1,r9,r12
 962 | 	adcs r2,r10,r12
 963 | 	adcs r3,r11,#0
 964 | 	adcs r4,r4,#0
 965 | 	adcs r5,r5,#0
 966 | 	adcs r6,r6,r12, lsr #31
 967 | 	adcs r7,r7,r12
 968 | 	
 969 | 	add sp,sp,#40
 970 | 	//frame address sp,4
 971 | 	
 972 | 	pop {pc}
 973 | 	
 974 | 	.size P256_mulmod, .-P256_mulmod
 975 | 
 976 | #if !use_mul_for_sqr
 977 | // 173 cycles
 978 | // If input is A*R mod p, computes A^2*R mod p
 979 | // in/out: r0-r7
 980 | // clobbers all other registers
 981 | 	.type P256_sqrmod, %function
 982 | P256_sqrmod:
 983 | 	push {lr}
 984 | 	//frame push {lr}
 985 | 	
 986 | 	//mul 01, 00
 987 | 	umull r9,r10,r0,r0
 988 | 	umull r11,r12,r0,r1
 989 | 	adds r11,r11,r11
 990 | 	mov lr,#0
 991 | 	umaal r10,r11,lr,lr
 992 | 	
 993 | 	//r10 r9 done
 994 | 	//r12 carry for 3rd before col
 995 | 	//r11+C carry for 3rd final col
 996 | 	
 997 | 	push {r9,r10}
 998 | 	//frame address sp,12
 999 | 	
1000 | 	//mul 02, 11
1001 | 	mov r9,#0
1002 | 	umaal r9,r12,r0,r2
1003 | 	adcs r9,r9,r9
1004 | 	umaal r9,r11,r1,r1
1005 | 	
1006 | 	//r9 done (3rd col)
1007 | 	//r12 carry for 4th before col
1008 | 	//r11+C carry for 4th final col
1009 | 	
1010 | 	push {r9}
1011 | 	//frame address sp,16
1012 | 	
1013 | 	//mul 03, 12
1014 | 	umull r9,r10,r0,r3
1015 | 	umaal r9,r12,r1,r2
1016 | 	adcs r9,r9,r9
1017 | 	umaal r9,r11,lr,lr
1018 | 	
1019 | 	//r9 done (4th col)
1020 | 	//r10+r12 carry for 5th before col
1021 | 	//r11+C carry for 5th final col
1022 | 	
1023 | 	push {r9}
1024 | 	//frame address sp,20
1025 | 	
1026 | 	//mul 04, 13, 22
1027 | 	mov r9,#0
1028 | 	umaal r9,r10,r0,r4
1029 | 	umaal r9,r12,r1,r3
1030 | 	adcs r9,r9,r9
1031 | 	umaal r9,r11,r2,r2
1032 | 	
1033 | 	//r9 done (5th col)
1034 | 	//r10+r12 carry for 6th before col
1035 | 	//r11+C carry for 6th final col
1036 | 	
1037 | 	push {r9}
1038 | 	//frame address sp,24
1039 | 	
1040 | 	//mul 05, 14, 23
1041 | 	umull r9,r8,r0,r5
1042 | 	umaal r9,r10,r1,r4
1043 | 	umaal r9,r12,r2,r3
1044 | 	adcs r9,r9,r9
1045 | 	umaal r9,r11,lr,lr
1046 | 	
1047 | 	//r9 done (6th col)
1048 | 	//r10+r12+r8 carry for 7th before col
1049 | 	//r11+C carry for 7th final col
1050 | 	
1051 | 	push {r9}
1052 | 	//frame address sp,28
1053 | 	
1054 | 	//mul 06, 15, 24, 33
1055 | 	mov r9,#0
1056 | 	umaal r9,r8,r1,r5
1057 | 	umaal r9,r12,r2,r4
1058 | 	umaal r9,r10,r0,r6
1059 | 	adcs r9,r9,r9
1060 | 	umaal r9,r11,r3,r3
1061 | 	
1062 | 	//r9 done (7th col)
1063 | 	//r8+r10+r12 carry for 8th before col
1064 | 	//r11+C carry for 8th final col
1065 | 	
1066 | 	push {r9}
1067 | 	//frame address sp,32
1068 | 	
1069 | 	//mul 07, 16, 25, 34
1070 | 	umull r9,r0,r0,r7
1071 | 	umaal r9,r10,r1,r6
1072 | 	umaal r9,r12,r2,r5
1073 | 	umaal r9,r8,r3,r4
1074 | 	adcs r9,r9,r9
1075 | 	//push {r12}
1076 | 	////frame address sp,36
1077 | 	umaal r9,r11,lr,lr
1078 | 	
1079 | 	//r9 done (8th col)
1080 | 	//r0+r8+r10+r12 carry for 9th before col
1081 | 	//r11+C carry for 9th final col
1082 | 	
1083 | 	//mul 17, 26, 35, 44
1084 | 	umaal r0,r8,r1,r7 //r1 is now dead
1085 | 	umaal r0,r10,r2,r6
1086 | 	//pop {r1}
1087 | 	////frame address sp,32
1088 | 	umaal r0,r12,r3,r5
1089 | 	adcs r0,r0,r0
1090 | 	umaal r11,r0,r4,r4
1091 | 	
1092 | 	//r11 done (9th col)
1093 | 	//r8+r10+r12 carry for 10th before col
1094 | 	//r0+C carry for 10th final col
1095 | 	
1096 | 	//mul 27, 36, 45
1097 | 	umaal r12,r8,r2,r7 //r2 is now dead
1098 | 	umaal r12,r10,r3,r6
1099 | 	movs r2,#0
1100 | 	umaal r12,r2,r4,r5
1101 | 	adcs r1,r12,r12
1102 | 	umaal r0,r1,lr,lr
1103 | 	
1104 | 	//r0 done (10th col)
1105 | 	//r8+r10+r2 carry for 11th before col
1106 | 	//r1+C carry for 11th final col
1107 | 	
1108 | 	//mul 37, 46, 55
1109 | 	umaal r2,r8,r3,r7 //r3 is now dead
1110 | 	umaal r2,r10,r4,r6
1111 | 	adcs r2,r2,r2
1112 | 	umaal r1,r2,r5,r5
1113 | 	
1114 | 	//r1 done (11th col)
1115 | 	//r8+r10 carry for 12th before col
1116 | 	//r2+C carry for 12th final col
1117 | 	
1118 | 	//mul 47, 56
1119 | 	movs r3,#0
1120 | 	umaal r3,r8,r4,r7 //r4 is now dead
1121 | 	umaal r3,r10,r5,r6
1122 | 	adcs r3,r3,r3
1123 | 	umaal r2,r3,lr,lr
1124 | 	
1125 | 	//r2 done (12th col)
1126 | 	//r8+r10 carry for 13th before col
1127 | 	//r3+C carry for 13th final col
1128 | 	
1129 | 	//mul 57, 66
1130 | 	umaal r8,r10,r5,r7 //r5 is now dead
1131 | 	adcs r8,r8,r8
1132 | 	umaal r3,r8,r6,r6
1133 | 	
1134 | 	//r3 done (13th col)
1135 | 	//r10 carry for 14th before col
1136 | 	//r8+C carry for 14th final col
1137 | 	
1138 | 	//mul 67
1139 | 	umull r4,r5,lr,lr // set 0
1140 | 	umaal r4,r10,r6,r7
1141 | 	adcs r4,r4,r4
1142 | 	umaal r4,r8,lr,lr
1143 | 	
1144 | 	//r4 done (14th col)
1145 | 	//r10 carry for 15th before col
1146 | 	//r8+C carry for 15th final col
1147 | 	
1148 | 	//mul 77
1149 | 	adcs r10,r10,r10
1150 | 	umaal r8,r10,r7,r7
1151 | 	adcs r10,r10,lr
1152 | 	
1153 | 	//r8 done (15th col)
1154 | 	//r10 done (16th col)
1155 | 	
1156 | 	//msb -> lsb: r10 r8 r4 r3 r2 r1 r0 r11 r9 sp sp+4 sp+8 sp+12 sp+16 sp+24 sp+20
1157 | 	//now do reduction
1158 | 	
1159 | 	push {r4,r8,r10}
1160 | 	//frame address sp,44
1161 | 	add r4,sp,#12
1162 | 	ldm r4,{r4-r8,r10,r12}
1163 | 	//lr is already 0
1164 | 	X0 .req r10
1165 | 	X1 .req r12
1166 | 	X2 .req r8
1167 | 	X3 .req r7
1168 | 	X4 .req r6
1169 | 	X5 .req r5
1170 | 	X6 .req r4
1171 | 	X7 .req r9
1172 | 	X8 .req r11
1173 | 	X9 .req r0
1174 | 	X10 .req r1
1175 | 	X11 .req r2
1176 | 	X12 .req r3
1177 | 
1178 | 	X13 .req r7
1179 | 	X14 .req r8
1180 | 	X15 .req r12
1181 | 
1182 | 	adcs X3,X3,X0
1183 | 	adcs X4,X4,X1
1184 | 	adcs X5,X5,X2
1185 | 	adcs X6,X6,X0
1186 | 	adcs X7,X7,X1
1187 | 	adcs X8,X8,X0
1188 | 	adcs X9,X9,X1
1189 | 	adcs X10,X10,#0
1190 | 	adcs X11,X11,#0
1191 | 	adcs lr,lr,#0
1192 | 
1193 | 	adds X6,X3
1194 | 	adcs X7,X7,X4 // X4 instead of 0
1195 | 	adcs X8,X8,X2
1196 | 	adcs X9,X9,X3
1197 | 	adcs X10,X10,X2
1198 | 	adcs X11,X11,X3
1199 | 	adcs lr,lr,#0
1200 | 
1201 | 	subs X7,X0
1202 | 	sbcs X8,X8,X1
1203 | 	sbcs X9,X9,X2
1204 | 	sbcs X10,X10,X3
1205 | 	sbcs X11,X11,#0
1206 | 	sbcs lr,lr,#0 // lr is between 0 and 2
1207 | 	
1208 | 	pop {X13,X14,X15}
1209 | 	//frame address sp,32
1210 | 
1211 | 	adds X0,X12,lr
1212 | 	adcs X13,X13,#0
1213 | 	mov lr,#0
1214 | 	adcs lr,lr,#0
1215 | 
1216 | 	//adds X7,X4 (added above instead)
1217 | 	adcs X8,X8,X5
1218 | 	adcs X9,X9,X6
1219 | 	adcs X10,X10,X4
1220 | 	adcs X11,X11,X5
1221 | 	adcs X0,X0,X4
1222 | 	adcs X13,X13,X5
1223 | 	adcs X14,X14,lr
1224 | 	adcs X15,X15,#0
1225 | 	mov lr,#0
1226 | 	adcs lr,lr,#0
1227 | 
1228 | 	adcs X10,X10,X7
1229 | 	adcs X11,X11,#0
1230 | 	adcs X0,X0,X6
1231 | 	adcs X13,X13,X7
1232 | 	adcs X14,X14,X6
1233 | 	adcs X15,X15,X7
1234 | 	adcs lr,lr,#0
1235 | 
1236 | 	subs X11,X4
1237 | 	sbcs X0,X0,X5
1238 | 	sbcs X13,X13,X6
1239 | 	sbcs X14,X14,X7
1240 | 	sbcs X15,X15,#0
1241 | 	sbcs lr,lr,#0
1242 | 	
1243 | 	// now (T + mN) / R is
1244 | 	// X8 X9 X10 X11 X0 X13 X14 X15 lr (lsb -> msb)
1245 | 	// r11 r0 r1 r2 r10 r7 r8 r12 lr
1246 | 	
1247 | 	subs r11,r11,#0xffffffff
1248 | 	sbcs r9,r0,#0xffffffff
1249 | 	sbcs r4,r1,#0xffffffff
1250 | 	sbcs r3,r2,#0
1251 | 	sbcs r6,r10,#0
1252 | 	sbcs r5,r7,#0
1253 | 	sbcs r10,r8,#1
1254 | 	sbcs r8,r12,#0xffffffff
1255 | 	sbcs r7,lr,#0
1256 | 	
1257 | 	adds r0,r11,r7
1258 | 	adcs r1,r9,r7
1259 | 	adcs r2,r4,r7
1260 | 	adcs r3,r3,#0
1261 | 	adcs r4,r6,#0
1262 | 	adcs r5,r5,#0
1263 | 	adcs r6,r10,r7, lsr #31
1264 | 	adcs r7,r8,r7
1265 | 	
1266 | 	add sp,#28
1267 | 	//frame address sp,4
1268 | 	pop {pc}
1269 | 	
1270 | 	.size P256_sqrmod, .-P256_sqrmod
1271 | #endif
1272 | #endif
1273 | 
1274 | // 42 cycles
1275 | // Computes A - B mod p, assumes A, B < p
1276 | // in: *r1, *r2
1277 | // out: r0-r7
1278 | // clobbers all other registers
1279 | 	.type P256_submod, %function
1280 | P256_submod:
1281 | 	ldm r1,{r3-r10}
1282 | 	ldm r2!,{r0,r1,r11,r12}
1283 | 	subs r3,r0
1284 | 	sbcs r4,r4,r1
1285 | 	sbcs r5,r5,r11
1286 | 	sbcs r6,r6,r12
1287 | 	ldm r2,{r0,r1,r11,r12}
1288 | 	sbcs r7,r7,r0
1289 | 	sbcs r8,r8,r1
1290 | 	sbcs r9,r9,r11
1291 | 	sbcs r10,r10,r12
1292 | 	
1293 | 	sbcs r11,r11,r11
1294 | 	
1295 | 	adds r0,r3,r11
1296 | 	adcs r1,r4,r11
1297 | 	adcs r2,r5,r11
1298 | 	adcs r3,r6,#0
1299 | 	adcs r4,r7,#0
1300 | 	adcs r5,r8,#0
1301 | 	adcs r6,r9,r11, lsr #31
1302 | 	adcs r7,r10,r11
1303 | 	
1304 | 	bx lr
1305 | 	
1306 | 	.size P256_submod, .-P256_submod
1307 | #endif
1308 | 
1309 | #if include_p256_mult || include_p256_decompress_point
1310 | // 52 cycles
1311 | // Computes A + B mod p, assumes A, B < p
1312 | // in: *r1, *r2
1313 | // out: r0-r7
1314 | // clobbers all other registers
1315 | 	.type P256_addmod, %function
1316 | P256_addmod:
1317 | 	ldm r2,{r2-r9}
1318 | 	ldm r1!,{r0,r10,r11,r12}
1319 | 	adds r2,r0
1320 | 	adcs r3,r3,r10
1321 | 	adcs r4,r4,r11
1322 | 	adcs r5,r5,r12
1323 | 	ldm r1,{r0,r1,r11,r12}
1324 | 	adcs r6,r6,r0
1325 | 	adcs r7,r7,r1
1326 | 	adcs r8,r8,r11
1327 | 	adcs r9,r9,r12
1328 | 	movs r10,#0
1329 | 	adcs r10,r10,r10
1330 | 	
1331 | 	subs r2,#0xffffffff
1332 | 	sbcs r3,r3,#0xffffffff
1333 | 	sbcs r4,r4,#0xffffffff
1334 | 	sbcs r5,r5,#0
1335 | 	sbcs r6,r6,#0
1336 | 	sbcs r7,r7,#0
1337 | 	sbcs r8,r8,#1
1338 | 	sbcs r9,r9,#0xffffffff
1339 | 	sbcs r10,r10,#0
1340 | 	
1341 | 	adds r0,r2,r10
1342 | 	adcs r1,r3,r10
1343 | 	adcs r2,r4,r10
1344 | 	adcs r3,r5,#0
1345 | 	adcs r4,r6,#0
1346 | 	adcs r5,r7,#0
1347 | 	adcs r6,r8,r10, lsr #31
1348 | 	adcs r7,r9,r10
1349 | 	
1350 | 	bx lr
1351 | 	
1352 | 	.size P256_addmod, .-P256_addmod
1353 | #endif
1354 | 	
1355 | #if include_p256_mult || include_p256_decompress_point
1356 | // cycles: 19 + 181*n
1357 | 	.type P256_sqrmod_many, %function
1358 | P256_sqrmod_many:
1359 | 	// in: r0-r7, count: r8
1360 | 	// out: r0-r7
1361 | 	push {r8,lr}
1362 | 	//frame push {r8,lr}
1363 | 0:
1364 | 	bl P256_sqrmod
1365 | 	
1366 | 	ldr r8,[sp,#0]
1367 | 	subs r8,r8,#1
1368 | 	str r8,[sp,#0]
1369 | 	bne 0b
1370 | 	
1371 | 	pop {r8,pc}
1372 | 	.size P256_sqrmod_many, .-P256_sqrmod_many
1373 | 
1374 | // in/out: r0-r7, r8: count, *r9: operand for final multiplication
1375 | 	.type P256_sqrmod_many_and_mulmod, %function
1376 | P256_sqrmod_many_and_mulmod:
1377 | 	push {r9,lr}
1378 | 	//frame push {r9,lr}
1379 | 	bl P256_sqrmod_many
1380 | 	push {r0-r7}
1381 | 	//frame address sp,40
1382 | 	mov r1,sp
1383 | 	ldr r2,[sp,#32]
1384 | 	bl P256_mulmod
1385 | 	add sp,#36
1386 | 	//frame address sp,4
1387 | 	pop {pc}
1388 | 	.size P256_sqrmod_many_and_mulmod, .-P256_sqrmod_many_and_mulmod
1389 | 	
1390 | 
1391 | // in: r0-r7 = value, r8 = 0 for modinv and 1 for sqrt
1392 | // out: r0-r7
1393 | // for modinv, call input a, then if a = A * R % p, then it calculates A^-1 * R % p = (a/R)^-1 * R % p = R^2 / a % p
1394 | // for sqrt, call input a, then if a = A * R % p, then it calculates sqrt(A) * R % p
1395 | 	.type P256_modinv_sqrt, %function
1396 | P256_modinv_sqrt:
1397 | 	push {r0-r8,lr}
1398 | 	
1399 | 	// t = a^2*a
1400 | 	mov r8,#1
1401 | 	mov r9,sp
1402 | 	bl P256_sqrmod_many_and_mulmod
1403 | 	push {r0-r7}
1404 | 	
1405 | 	// a4_2 = a2_0^(2^2)
1406 | 	bl P256_sqrmod
1407 | 	bl P256_sqrmod
1408 | 	push {r0-r7}
1409 | 	
1410 | 	// a4_0 = a4_2*a2_0
1411 | 	mov r1,sp
1412 | 	add r2,sp,#32
1413 | 	bl P256_mulmod
1414 | 	add r8,sp,#32
1415 | 	stm r8,{r0-r7}
1416 | 	
1417 | 	// a8_0 = a4_0^(2^(8-4))*a4_0
1418 | 	mov r8,#8-4
1419 | 	add r9,sp,#32
1420 | 	bl P256_sqrmod_many_and_mulmod
1421 | 	push {r0-r7}
1422 | 	
1423 | 	// a16_0 = a8_0^(2^(16-8))*a8_0
1424 | 	mov r8,#16-8
1425 | 	mov r9,sp
1426 | 	bl P256_sqrmod_many_and_mulmod
1427 | 	push {r0-r7}
1428 | 	
1429 | 	// a32_0 = a16_0^(2^(32-16))*a16_0
1430 | 	mov r8,#16
1431 | 	mov r9,sp
1432 | 	bl P256_sqrmod_many_and_mulmod
1433 | 	push {r0-r7}
1434 | 	
1435 | 	// t = a32_0^(2^(64-32))*a
1436 | 	mov r8,#32
1437 | 	add r9,sp,#5*32
1438 | 	bl P256_sqrmod_many_and_mulmod
1439 | 	
1440 | 	ldr r8,[sp,#6*32]
1441 | 	cmp r8,#0
1442 | 	bne 0f
1443 | 	
1444 | 	// t = t^(2^(192-64))*a32_0
1445 | 	mov r8,#192-64
1446 | 	mov r9,sp
1447 | 	bl P256_sqrmod_many_and_mulmod
1448 | 	
1449 | 	// t = t^(2^(224-192))*a32_0
1450 | 	mov r8,#224-192
1451 | 	mov r9,sp
1452 | 	bl P256_sqrmod_many_and_mulmod
1453 | 	
1454 | 	// t = t^(2^(240-224))*a16_0
1455 | 	mov r8,#240-224
1456 | 	add r9,sp,#32
1457 | 	bl P256_sqrmod_many_and_mulmod
1458 | 	
1459 | 	// t = t^(2^(248-240))*a8_0
1460 | 	mov r8,#248-240
1461 | 	add r9,sp,#64
1462 | 	bl P256_sqrmod_many_and_mulmod
1463 | 	
1464 | 	// t = t^(2^(252-248))*a4_0
1465 | 	mov r8,#252-248
1466 | 	add r9,sp,#128
1467 | 	bl P256_sqrmod_many_and_mulmod
1468 | 	
1469 | 	// t = t^(2^(256-252))*a4_2
1470 | 	mov r8,#256-252
1471 | 	add r9,sp,#96
1472 | 	bl P256_sqrmod_many_and_mulmod
1473 | 	stm sp,{r0-r7}
1474 | 	
1475 | 	// r = t*a
1476 | 	mov r1,sp
1477 | 	add r2,sp,#5*32
1478 | 	bl P256_mulmod
1479 | 	b 1f
1480 | 
1481 | 0:
1482 | 	// t = t^(2^(160-64))*a
1483 | 	mov r8,#160-64
1484 | 	add r9,sp,#5*32
1485 | 	bl P256_sqrmod_many_and_mulmod
1486 | 	
1487 | 	// t = t^(2^(254-160))
1488 | 	mov r8,#254-160
1489 | 	bl P256_sqrmod_many
1490 | 1:
1491 | 
1492 | 	add sp,#6*32+4
1493 | 	
1494 | 	pop {pc}
1495 | 	
1496 | 	.size P256_modinv_sqrt, .-P256_modinv_sqrt
1497 | #endif
1498 | 
1499 | #if include_p256_mult
1500 | // 33 cycles
1501 | // in: r0-r7
1502 | 	.type P256_times2, %function
1503 | P256_times2:
1504 | 	adds r0,r0
1505 | 	adcs r1,r1,r1
1506 | 	adcs r2,r2,r2
1507 | 	adcs r3,r3,r3
1508 | 	adcs r4,r4,r4
1509 | 	adcs r5,r5,r5
1510 | 	adcs r6,r6,r6
1511 | 	adcs r7,r7,r7
1512 | 	mov r8,#0
1513 | 	adcs r8,r8,r8
1514 | 	
1515 | 	subs r0,#0xffffffff
1516 | 	sbcs r1,r1,#0xffffffff
1517 | 	sbcs r2,r2,#0xffffffff
1518 | 	sbcs r3,r3,#0
1519 | 	sbcs r4,r4,#0
1520 | 	sbcs r5,r5,#0
1521 | 	sbcs r6,r6,#1
1522 | 	sbcs r7,r7,#0xffffffff
1523 | 	sbcs r8,r8,#0
1524 | 	
1525 | 	adds r0,r8
1526 | 	adcs r1,r1,r8
1527 | 	adcs r2,r2,r8
1528 | 	adcs r3,r3,#0
1529 | 	adcs r4,r4,#0
1530 | 	adcs r5,r5,#0
1531 | 	adcs r6,r6,r8, lsr #31
1532 | 	adcs r7,r7,r8
1533 | 	
1534 | 	bx lr
1535 | 	.size P256_times2, .-P256_times2
1536 | #endif
1537 | 
1538 | #if include_p256_verify || include_p256_varmult || include_p256_decompress_point
1539 | 	.align 2
1540 | 	// (2^256)^2 mod p
1541 | R2_mod_p:
1542 | 	.word 3
1543 | 	.word 0
1544 | 	.word 0xffffffff
1545 | 	.word 0xfffffffb
1546 | 	.word 0xfffffffe
1547 | 	.word 0xffffffff
1548 | 	.word 0xfffffffd
1549 | 	.word 4
1550 | 
1551 | // in: *r1
1552 | // out: *r0
1553 | 	.type P256_to_montgomery, %function
1554 | P256_to_montgomery:
1555 | 	.global P256_to_montgomery
1556 | 	push {r0,r4-r11,lr}
1557 | 	//frame push {r4-r11,lr}
1558 | 	//frame address sp,40
1559 | 	adr r2,R2_mod_p
1560 | 	bl P256_mulmod
1561 | 	pop {r8}
1562 | 	//frame address sp,36
1563 | 	stm r8,{r0-r7}
1564 | 	pop {r4-r11,pc}
1565 | 	.size P256_to_montgomery, .-P256_to_montgomery
1566 | #endif
1567 | 
1568 | #if include_p256_basemult || include_p256_varmult || include_p256_decompress_point
1569 | // in: *r1
1570 | // out: *r0
1571 | 	.type P256_from_montgomery, %function
1572 | P256_from_montgomery:
1573 | 	.global P256_from_montgomery
1574 | 	push {r0,r4-r11,lr}
1575 | 	//frame push {r4-r11,lr}
1576 | 	//frame address sp,40
1577 | 	movs r2,#0
1578 | 	movs r3,#0
1579 | 	push {r2-r3}
1580 | 	//frame address sp,48
1581 | 	push {r2-r3}
1582 | 	//frame address sp,56
1583 | 	push {r2-r3}
1584 | 	//frame address sp,64
1585 | 	movs r2,#1
1586 | 	push {r2-r3}
1587 | 	//frame address sp,72
1588 | 	mov r2,sp
1589 | 	bl P256_mulmod
1590 | 	add sp,#32
1591 | 	//frame address sp,40
1592 | 	pop {r8}
1593 | 	//frame address sp,36
1594 | 	stm r8,{r0-r7}
1595 | 	pop {r4-r11,pc}
1596 | 	.size P256_from_montgomery, .-P256_from_montgomery
1597 | #endif
1598 | 
1599 | #if include_p256_verify || include_p256_varmult || include_p256_decompress_point || include_p256_decode_point
1600 | // Checks whether the input number is within [0,p-1]
1601 | // in: *r0
1602 | // out: r0 = 1 if ok, else 0
1603 | 	.type P256_check_range_p, %function
1604 | P256_check_range_p:
1605 | 	.global P256_check_range_p
1606 | 	push {r4-r8,lr}
1607 | 	//frame push {r4-r8,lr}
1608 | 	
1609 | 	ldm r0,{r1-r8}
1610 | 	
1611 | 	movs r0,#0xffffffff
1612 | 	
1613 | 	subs r1,r0
1614 | 	sbcs r2,r2,r0
1615 | 	sbcs r3,r3,r0
1616 | 	sbcs r4,r4,#0
1617 | 	sbcs r5,r5,#0
1618 | 	sbcs r6,r6,#0
1619 | 	sbcs r7,r7,#1
1620 | 	sbcs r8,r8,r0
1621 | 	
1622 | 	sbcs r0,r0,r0
1623 | 	lsrs r0,#31
1624 | 	
1625 | 	pop {r4-r8,pc}
1626 | 	
1627 | 	.size P256_check_range_p, .-P256_check_range_p
1628 | #endif
1629 | 
1630 | 
1631 | // Arithmetics for the group order n =
1632 | // 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551
1633 | 
1634 | #if include_p256_verify || include_p256_sign
1635 | 	.align 2
1636 | P256_order_mu:
1637 | 	.word 0xeedf9bfe
1638 | 	.word 0x012ffd85
1639 | 	.word 0xdf1a6c21
1640 | 	.word 0x43190552
1641 | 	.word 0xffffffff
1642 | 	.word 0xfffffffe
1643 | 	.word 0xffffffff
1644 | 	.word 0x0
1645 | 	.word 0x1
1646 | 
1647 | // in: r0-r8 = value
1648 | // out: r0-r8
1649 | // returns input - n if input >= n, else input
1650 | // clobbers all other registers
1651 | 	.type P256_reduce_mod_n_once, %function
1652 | P256_reduce_mod_n_once:
1653 | 	push {lr}
1654 | 	//frame push {lr}
1655 | 	
1656 | 	adr r10,P256_order
1657 | 	ldm r10,{r10,r11,r12,lr}
1658 | 	subs r0,r10
1659 | 	sbcs r1,r1,r11
1660 | 	sbcs r2,r2,r12
1661 | 	sbcs r3,r3,lr
1662 | 	sbcs r4,r4,#0xffffffff
1663 | 	sbcs r5,r5,#0xffffffff
1664 | 	sbcs r6,r6,#0
1665 | 	sbcs r7,r7,#0xffffffff
1666 | 	sbcs r8,r8,#0
1667 | 	
1668 | 	sbc r9,r9,r9 // sets r9 to -1 if input < n, else 0
1669 | 	and r10,r9
1670 | 	and r11,r9
1671 | 	and r12,r9
1672 | 	and lr,r9
1673 | 	
1674 | 	adds r0,r10
1675 | 	adcs r1,r1,r11
1676 | 	adcs r2,r2,r12
1677 | 	adcs r3,r3,lr
1678 | 	adcs r4,r4,r9
1679 | 	adcs r5,r5,r9
1680 | 	adcs r6,r6,#0
1681 | 	adcs r7,r7,r9
1682 | 	adcs r8,r8,#0
1683 | 	
1684 | 	pop {pc}
1685 | 	.size P256_reduce_mod_n_once, .-P256_reduce_mod_n_once
1686 | 
1687 | // *r0 = out, *r1 = in
1688 | // uses Barrett Reduction
1689 | 	.type P256_reduce_mod_n_64bytes, %function
1690 | P256_reduce_mod_n_64bytes:
1691 | 	push {r0,r4-r11,lr}
1692 | 	//frame push {r4-r11,lr}
1693 | 	sub sp,sp,#108
1694 | 	//frame address sp,148
1695 | 	
1696 | 	mov r10,r1
1697 | 	
1698 | 	add r0,sp,#36
1699 | 	adds r1,r1,#28
1700 | 	adr r2,P256_order_mu
1701 | 	bl mul288x288
1702 | 	
1703 | 	mov r0,sp
1704 | 	add r1,sp,#72
1705 | 	adr r2,P256_order
1706 | 	bl mul288x288
1707 | 	
1708 | 	ldm r10,{r0-r8}
1709 | 	pop {r9-r12}
1710 | 	//frame address sp,132
1711 | 	subs r0,r0,r9
1712 | 	sbcs r1,r1,r10
1713 | 	sbcs r2,r2,r11
1714 | 	sbcs r3,r3,r12
1715 | 	pop {r9-r12,lr}
1716 | 	//frame address sp,112
1717 | 	sbcs r4,r4,r9
1718 | 	sbcs r5,r5,r10
1719 | 	sbcs r6,r6,r11
1720 | 	sbcs r7,r7,r12
1721 | 	sbcs r8,r8,lr
1722 | 	
1723 | 	bl P256_reduce_mod_n_once
1724 | 	bl P256_reduce_mod_n_once
1725 | 	add sp,sp,#72
1726 | 	//frame address sp,40
1727 | 	pop {r9}
1728 | 	//frame address sp,36
1729 | 	
1730 | 	stm r9,{r0-r7}
1731 | 	
1732 | 	pop {r4-r11,pc}
1733 | 	.size P256_reduce_mod_n_64bytes, .-P256_reduce_mod_n_64bytes
1734 | #endif
1735 | 
1736 | #if include_p256_sign
1737 | // in: *r0 = out, *r1 = in
1738 | 	.type P256_reduce_mod_n_32bytes, %function
1739 | P256_reduce_mod_n_32bytes:
1740 | 	.global P256_reduce_mod_n_32bytes
1741 | 	push {r0,r4-r11,lr}
1742 | 	//frame push {r4-r11,lr}
1743 | 	//frame address sp,40
1744 | 	ldm r1,{r0-r7}
1745 | 	mov r8,#0
1746 | 	bl P256_reduce_mod_n_once
1747 | 	pop {r8}
1748 | 	//frame address sp,36
1749 | 	stm r8,{r0-r7}
1750 | 	pop {r4-r11,pc}
1751 | 	.size P256_reduce_mod_n_32bytes, .-P256_reduce_mod_n_32bytes
1752 | 
1753 | 
1754 | // Adds two numbers mod n, both inputs can be any 256-bit numbers
1755 | // out and in may overlap
1756 | // in: *r1, *r2
1757 | // out: *r0
1758 | 	.type P256_add_mod_n, %function
1759 | P256_add_mod_n:
1760 | 	.global P256_add_mod_n
1761 | 	push {r0,r4-r11,lr}
1762 | 	//frame push {r4-r11,lr}
1763 | 	//frame address sp,40
1764 | 	
1765 | 	mov r12,r1
1766 | 	
1767 | 	ldm r2,{r4-r11}
1768 | 	ldm r12!,{r0-r3}
1769 | 	adds r0,r4
1770 | 	adcs r1,r1,r5
1771 | 	adcs r2,r2,r6
1772 | 	adcs r3,r3,r7
1773 | 	ldm r12,{r4-r7}
1774 | 	adcs r4,r4,r8
1775 | 	adcs r5,r5,r9
1776 | 	adcs r6,r6,r10
1777 | 	adcs r7,r7,r11
1778 | 	movs r8,#0
1779 | 	adcs r8,r8,r8
1780 | 	
1781 | 	bl P256_reduce_mod_n_once
1782 | 	bl P256_reduce_mod_n_once
1783 | 	pop {r8}
1784 | 	//frame address sp,36
1785 | 	stm r8,{r0-r7}
1786 | 	
1787 | 	pop {r4-r11,pc}
1788 | 	
1789 | 	.size P256_add_mod_n, .-P256_add_mod_n
1790 | #endif
1791 | 	
1792 | #if include_p256_verify || include_p256_sign
1793 | // Multiplies two numbers in the range [0,2^256-1] mod n
1794 | // out and in may overlap
1795 | // in: *r1, *r2
1796 | // out: *r0
1797 | 	.type P256_mul_mod_n, %function
1798 | P256_mul_mod_n:
1799 | 	.global P256_mul_mod_n
1800 | 	movs r3,#0
1801 | 	push {r3-r10,lr}
1802 | 	//frame push {r4-r10,lr}
1803 | 	//frame address sp,36
1804 | 	
1805 | 	mov r4,r0
1806 | 	
1807 | 	ldm r1,{r1,r3,r5-r10}
1808 | 	push {r1,r3,r5-r10}
1809 | 	//frame address sp,68
1810 | 	
1811 | 	movs r1,#0
1812 | 	push {r1}
1813 | 	//frame address sp,72
1814 | 	ldm r2,{r1,r3,r5-r10}
1815 | 	push {r1,r3,r5-r10}
1816 | 	//frame address sp,104
1817 | 	
1818 | 	sub sp,#72
1819 | 	//frame address sp,176
1820 | 	
1821 | 	mov r0,sp
1822 | 	add r1,sp,#72
1823 | 	add r2,sp,#108
1824 | 	bl mul288x288 // just reuse the 288x288-bit multiplier rather than also writing a 256x256
1825 | 	
1826 | 	mov r0,r4
1827 | 	mov r1,sp
1828 | 	bl P256_reduce_mod_n_64bytes
1829 | 	
1830 | 	add sp,#144
1831 | 	//frame address sp,32
1832 | 	pop {r4-r10,pc}
1833 | 	
1834 | 	.size P256_mul_mod_n, .-P256_mul_mod_n
1835 | 
1836 | #if include_p256_sign
1837 | // r0: delta (also returned)
1838 | // r1: f
1839 | // r2: g
1840 | // r3: dest
1841 | 	.type P256_divsteps2_31, %function
1842 | P256_divsteps2_31:
1843 | 	.global P256_divsteps2_31
1844 | 	push {r3,r4-r8,lr}
1845 | 	//frame push {r4-r8,lr}
1846 | 	//frame address sp,28
1847 | 	
1848 | 	// u,v,q,r
1849 | 	movs r4,#1
1850 | 	movs r5,#0
1851 | 	movs r6,#0
1852 | 	movs r7,#1
1853 | 	
1854 | 	// counter
1855 | 	mov lr,#31
1856 | 	
1857 | 0:
1858 | 	subs r3,r0,#1
1859 | 	lsl r12,r2,#31
1860 | 	bic r3,r12,r3
1861 | 	asrs r3,r3,#31 // mask
1862 | 	lsr r8,r3,#31 // b
1863 | 	
1864 | 	// conditionally negate delta
1865 | 	eors r0,r0,r3
1866 | 	subs r0,r0,r3
1867 | 	
1868 | 	mul r12,r1,r3 // t = f * -b (= f * m)
1869 | 	bics r1,r1,r3 // f &= ~m
1870 | 	umlal r1,r12,r2,r8 // f += g * b
1871 | 	umaal r2,r12,r2,r3 // g += t + g * -b (= g * m)
1872 | 	
1873 | 	mul r12,r4,r3
1874 | 	bics r4,r4,r3
1875 | 	umlal r4,r12,r6,r8
1876 | 	umaal r6,r12,r6,r3
1877 | 	
1878 | 	mul r12,r5,r3
1879 | 	bics r5,r5,r3
1880 | 	umlal r5,r12,r7,r8
1881 | 	umaal r7,r12,r7,r3
1882 | 	
1883 | 	ands r12,r2,#1 // g0 = g & 1
1884 | 	adds r0,r0,#1 // delta += 1
1885 | 	
1886 | 	// g = (g + g0 * f) / 2
1887 | 	mul r3,r12,r1
1888 | 	adds r2,r2,r3
1889 | 	lsrs r2,r2,#1 // we don't need the MSB
1890 | 	
1891 | 	umlal r6,r8,r12,r4 // q += g0 * u
1892 | 	umlal r7,r8,r12,r5 // r += g0 * v
1893 | 	
1894 | 	adds r4,r4,r4 // u *= 2
1895 | 	adds r5,r5,r5 // v *= 2
1896 | 	
1897 | 	subs lr,lr,#1
1898 | 	bne 0b
1899 | 	
1900 | 	pop {r3}
1901 | 	stm r3!,{r4-r7}
1902 | 	
1903 | 	pop {r4-r8,pc}
1904 | 	.size P256_divsteps2_31, .-P256_divsteps2_31
1905 | 
1906 | // r0: a, r1: b
1907 | // *r2: f,g
1908 | // *r3: out
1909 | // cycles: 132
1910 | 	.type P256_matrix_mul_fg_9, %function
1911 | P256_matrix_mul_fg_9:
1912 | 	.global P256_matrix_mul_fg_9
1913 | 	push {r4-r11,lr}
1914 | 	//frame push {r4-r11,lr}
1915 | 	
1916 | 	// this function calculates (a * f + b * g) / 2^31, which shall be an integer
1917 | 	
1918 | 	// the range is [-2^30, 2^31], so if negative, the top 2 bits are both 1s
1919 | 	// convert to absolute value and sign
1920 | 	and r4,r0,r0,lsl #1
1921 | 	asrs r4,r4,#31
1922 | 	eors r0,r0,r4
1923 | 	subs r0,r0,r4
1924 | 	
1925 | 	and r5,r1,r1,lsl #1
1926 | 	asrs r5,r5,#31
1927 | 	eors r1,r1,r5
1928 | 	subs r1,r1,r5
1929 | 	
1930 | 	ldm r2!,{r6} // f sign
1931 | 	ldr r7,[r2,#36] // g sign
1932 | 	
1933 | 	// compute the resulting sign, which will be negative if exactly one of g'sign and b's sign is negative
1934 | 	eors r4,r4,r6 // combine f's sign and a's sign
1935 | 	eors r5,r5,r7 // combine g's sign and b's sign
1936 | 	eors r4,r4,r5 // mask for negating a * f before adding to b * g
1937 | 	stm r3!,{r5}
1938 | 	push {r1,r2,r3}
1939 | 	//frame address sp,48
1940 | 	
1941 | 	// load f, which is stored as a signed 257-bit number (sign extended to 288 bits) and initially conditionally negated through r6
1942 | 	// now conditionally negate it depending on the r4 mask
1943 | 	ldm r2!,{r1,r3,r5-r11}
1944 | 	eors r1,r1,r4
1945 | 	eors r3,r3,r4
1946 | 	eors r5,r5,r4
1947 | 	eors r6,r6,r4
1948 | 	eors r7,r7,r4
1949 | 	eor r8,r8,r4
1950 | 	eor r9,r9,r4
1951 | 	eor r10,r10,r4
1952 | 	
1953 | 	subs r1,r1,r4
1954 | 	sbcs r3,r3,r4
1955 | 	sbcs r5,r5,r4
1956 | 	sbcs r6,r6,r4
1957 | 	sbcs r7,r7,r4
1958 | 	sbcs r8,r8,r4
1959 | 	sbcs r9,r9,r4
1960 | 	sbcs r10,r10,r4
1961 | 	// f is never 0, so we can skip last sbcs (for r11), since we know carry flag would be 0
1962 | 	eor r4,r4,r11
1963 | 	
1964 | 	// multiply the signed 257-bit value by |a| (|a| <= 2^31), to get a signed 288-bit result
1965 | 	umull r1,lr,r0,r1
1966 | 	movs r2,#0
1967 | 	umull r11,r12,r2,r2
1968 | 	umaal r2,lr,r0,r3
1969 | 	umaal r11,lr,r0,r5
1970 | 	umull r3,r5,r12,r12
1971 | 	umaal r3,lr,r0,r6
1972 | 	umaal r5,lr,r0,r7
1973 | 	umull r6,r7,r12,r12
1974 | 	umaal r6,lr,r0,r8
1975 | 	umaal r7,lr,r0,r9
1976 | 	umaal r12,lr,r0,r10
1977 | 	mla lr,r0,r4,lr
1978 | 	// result: r1, r2, r11, r3, r5, r6, r7, r12, lr
1979 | 	
1980 | 	// add b*g (which also fits in a signed 288-bit value) and divide by 2^31 (the low 31 bits will all be zero before div)
1981 | 	pop {r0,r4}
1982 | 	//frame address sp,40
1983 | 	adds r4,r4,#40
1984 | 	ldm r4!,{r8,r9}
1985 | 	mov r10,#0
1986 | 	umaal r1,r10,r0,r8
1987 | 	umaal r2,r10,r0,r9
1988 | 	adds r1,r1,r1
1989 | 	adcs r2,r2,r2
1990 | 	ldm r4!,{r1,r8,r9}
1991 | 	umaal r10,r11,r0,r1
1992 | 	umaal r11,r3,r0,r8
1993 | 	umaal r3,r5,r0,r9
1994 | 	adcs r10,r10,r10
1995 | 	adcs r11,r11,r11
1996 | 	adcs r3,r3,r3
1997 | 	ldm r4,{r1,r4,r8,r9}
1998 | 	umaal r5,r6,r0,r1
1999 | 	umaal r6,r7,r0,r4
2000 | 	umaal r7,r12,r0,r8
2001 | 	umaal r12,lr,r0,r9 // by divsteps2 invariant, lr will now be 0 since both f and g each fits in a signed 257-bit value
2002 | 	adcs r5,r5,r5
2003 | 	adcs r6,r6,r6
2004 | 	adcs r7,r7,r7
2005 | 	adcs r12,r12,r12
2006 | 	sbcs lr,lr,lr // extract the sign bit and sign-extend it
2007 | 	mvn lr,lr
2008 | 	pop {r1}
2009 | 	//frame address sp,36
2010 | 	stm r1!,{r2,r10,r11}
2011 | 	stm r1!,{r3,r5,r6,r7,r12,lr}
2012 | 	
2013 | 	pop {r4-r11,pc}
2014 | 	.size P256_matrix_mul_fg_9, .-P256_matrix_mul_fg_9
2015 | 
2016 | // r0: a, r1: b
2017 | // *r2: x,y
2018 | // *r3: out
2019 | // cycles: 184
2020 | 	.align 2
2021 | 	.type P256_matrix_mul_mod_n, %function
2022 | P256_matrix_mul_mod_n:
2023 | 	.global P256_matrix_mul_mod_n
2024 | 	push {r4-r11,lr}
2025 | 	//frame push {r4-r11,lr}
2026 | 	
2027 | 	// this function calculates a * x + b * y mod N (where N is the order of the P-256 curve)
2028 | 	
2029 | 	// the range is [-2^30, 2^31], so if negative, the top 2 bits are both 1s
2030 | 	// convert to absolute value and sign
2031 | 	and r4,r0,r0,lsl #1
2032 | 	asrs r4,r4,#31
2033 | 	eors r0,r0,r4
2034 | 	subs r0,r0,r4
2035 | 	
2036 | 	and r5,r1,r1,lsl #1
2037 | 	asrs r5,r5,#31
2038 | 	eors r1,r1,r5
2039 | 	subs r1,r1,r5
2040 | 	
2041 | 	ldm r2!,{r6} // x sign
2042 | 	ldr r7,[r2,#32] // y sign
2043 | 	
2044 | 	// compute the resulting sign, which will be negative if exactly one of x'sign and y's sign is negative
2045 | 	eors r4,r4,r6 // combine x's sign and a's sign
2046 | 	eors r5,r5,r7 // combine y's sign and b's sign
2047 | 	eors r4,r4,r5 // mask for negating a * x before adding to b * y
2048 | 	stm r3!,{r5}
2049 | 	push {r1,r2,r3}
2050 | 	//frame address sp,48
2051 | 	
2052 | 	// load x, which is stored as an unsigned 256-bit integer and initially conditionally negated through r6
2053 | 	// now conditionally negate it depending on the r4 mask
2054 | 	ldm r2,{r1-r3,r5-r9}
2055 | 	eors r1,r1,r4
2056 | 	eors r2,r2,r4
2057 | 	eors r3,r3,r4
2058 | 	eors r5,r5,r4
2059 | 	eors r6,r6,r4
2060 | 	eors r7,r7,r4
2061 | 	eor r8,r8,r4
2062 | 	eor r9,r9,r4
2063 | 	
2064 | 	subs r1,r1,r4
2065 | 	sbcs r2,r2,r4
2066 | 	sbcs r3,r3,r4
2067 | 	sbcs r5,r5,r4
2068 | 	sbcs r6,r6,r4
2069 | 	sbcs r7,r7,r4
2070 | 	sbcs r8,r8,r4
2071 | 	sbcs r9,r9,r4
2072 | 	
2073 | 	sbcs r4,r4,r4 // if the value is nonzero, r4 will now contain -1 and we will add N to make it positive
2074 | 	
2075 | 	lsrs lr,r4,#31
2076 | 	mov r12,#0
2077 | 	ldrd r10,r11,P256_order_local
2078 | 	umaal r1,r12,lr,r10
2079 | 	umaal r2,r12,lr,r11
2080 | 	ldrd r10,r11,P256_order_local+8
2081 | 	umaal r3,r12,lr,r10
2082 | 	umaal r5,r12,lr,r11
2083 | 	umaal r6,r12,lr,r4
2084 | 	umaal r7,r12,lr,r4
2085 | 	mov r10,#0
2086 | 	umaal r8,r12,lr,r10
2087 | 	umaal r9,r12,lr,r4
2088 | 	
2089 | 	// calculate a * x, the result fits in 287 bits
2090 | 	umull r11,lr,r10,r10
2091 | 	umull r10,lr,r0,r1
2092 | 	umull r1,r12,r11,r11
2093 | 	umaal r11,lr,r0,r2
2094 | 	umaal r1,lr,r0,r3
2095 | 	umull r2,r3,r12,r12
2096 | 	umaal r2,lr,r0,r5
2097 | 	umaal r3,lr,r0,r6
2098 | 	umull r4,r5,r12,r12
2099 | 	umaal r4,lr,r0,r7
2100 | 	umaal r5,lr,r0,r8
2101 | 	umaal r12,lr,r0,r9
2102 | 	
2103 | 	// add b*y, the result will fit in 288 bits
2104 | 	pop {r0,r6}
2105 | 	//frame address sp,40
2106 | 	adds r6,r6,#36
2107 | 	ldm r6!,{r8,r9}
2108 | 	movs r7,#0
2109 | 	umaal r10,r7,r0,r8
2110 | 	umaal r11,r7,r0,r9
2111 | 	ldm r6!,{r8,r9}
2112 | 	umaal r1,r7,r0,r8
2113 | 	umaal r2,r7,r0,r9
2114 | 	ldm r6!,{r8,r9}
2115 | 	umaal r3,r7,r0,r8
2116 | 	umaal r4,r7,r0,r9
2117 | 	ldm r6!,{r8,r9}
2118 | 	umaal r5,r7,r0,r8
2119 | 	umaal r12,r7,r0,r9
2120 | 	add lr,lr,r7
2121 | 	
2122 | 	// reduce modulo N using montgomery redc algorithm
2123 | 	ldr r0,=0xee00bc4f // montgomery multiplication factor N' (when R = 2^32), N*N' = -1 mod R
2124 | 	mul r0,r10,r0 // m = ((T mod R)N') mod R
2125 | 	movs r6,#0				// need 4-byte alignment on next instruction
2126 | 	ldrd r8,r9,P256_order_local
2127 | 	umaal r10,r6,r0,r8 // t = (T + mN) / R
2128 | 	umaal r11,r6,r0,r9
2129 | 	subs r11,r11,r8 // conditionally subtract by N unless we later find out the result becomes negative
2130 | 	ldrd r8,r10,P256_order_local+8
2131 | 	umaal r1,r6,r0,r8
2132 | 	sbcs r1,r1,r9
2133 | 	umaal r2,r6,r0,r10
2134 | 	mov r9,#-1
2135 | 	umaal r3,r6,r0,r9
2136 | 	umaal r4,r6,r0,r9
2137 | 	movs r7,#0
2138 | 	umaal r5,r6,r0,r7
2139 | 	umaal r12,r6,r0,r9
2140 | 	umaal lr,r6,r7,r7
2141 | 	sbcs r2,r2,r8
2142 | 	sbcs r3,r3,r10
2143 | 	sbcs r4,r4,r9
2144 | 	sbcs r5,r5,r9
2145 | 	sbcs r12,r12,r7
2146 | 	sbcs lr,lr,r9
2147 | 	sbcs r6,r6,r7 // if the result becomes negative, r6 becomes -1
2148 | 	
2149 | 	// conditionally add back N
2150 | 	ldrd r0,r9,P256_order_local
2151 | 	lsrs r6,r6,#31
2152 | 	umaal r7,r11,r6,r0
2153 | 	umaal r1,r11,r6,r9
2154 | 	umaal r2,r11,r6,r8
2155 | 	umaal r3,r11,r6,r10
2156 | 	rsbs r0,r6,#0
2157 | 	umaal r4,r11,r6,r0
2158 | 	umaal r5,r11,r6,r0
2159 | 	mov r8,#0
2160 | 	umaal r11,r12,r6,r8
2161 | 	umaal r12,lr,r6,r0
2162 | 	
2163 | 	pop {r6}
2164 | 	//frame address sp,36
2165 | 	stm r6!,{r7}
2166 | 	stm r6!,{r1,r2,r3,r4,r5,r11,r12}
2167 | 	
2168 | 	pop {r4-r11,pc}
2169 | 	
2170 | 	.ltorg
2171 | 	.size P256_matrix_mul_mod_n, .-P256_matrix_mul_mod_n
2172 | #else
2173 | // *r0=u
2174 | // *r1=x1
2175 | 	.type mod_inv_vartime_inner_n, %function
2176 | mod_inv_vartime_inner_n:
2177 | 	adr r11,P256_order
2178 | 	ldm r0,{r2-r9}
2179 | 	cmp r2,#1
2180 | 	bne 1f
2181 | 
2182 | 	orrs r10,r3,r4
2183 | 	orrs r10,r5
2184 | 	orrs r10,r6
2185 | 	orrs r10,r7
2186 | 	orrs r10,r8
2187 | 	orrs r10,r9
2188 | 	itt eq
2189 | 	moveq r0,#1
2190 | 	bxeq lr
2191 | 
2192 | 1:
2193 | 	tst r2,#1
2194 | 	itt ne
2195 | 	movne r0,#0
2196 | 	bxne lr
2197 | 2:
2198 | 	lsrs r9,#1
2199 | 	rrxs r8,r8
2200 | 	rrxs r7,r7
2201 | 	rrxs r6,r6
2202 | 	rrxs r5,r5
2203 | 	rrxs r4,r4
2204 | 	rrxs r3,r3
2205 | 	rrxs r2,r2
2206 | 	stm r0,{r2-r9}
2207 | 	ldm r1,{r3-r10}
2208 | 	tst r3,#1
2209 | 	beq 3f
2210 | 	ldr r12,[r11,#0]
2211 | 	adds r3,r12
2212 | 	ldr r12,[r11,#4]
2213 | 	adcs r4,r4,r12
2214 | 	ldr r12,[r11,#8]
2215 | 	adcs r5,r5,r12
2216 | 	ldr r12,[r11,#12]
2217 | 	adcs r6,r6,r12
2218 | 	adcs r7,r7,#0xffffffff
2219 | 	adcs r8,r8,#0xffffffff
2220 | 	adcs r9,r9,#0
2221 | 	adcs r10,r10,#0xffffffff
2222 | 3:
2223 | 	rrxs r10,r10
2224 | 	rrxs r9,r9
2225 | 	rrxs r8,r8
2226 | 	rrxs r7,r7
2227 | 	rrxs r6,r6
2228 | 	rrxs r5,r5
2229 | 	rrxs r4,r4
2230 | 	rrx r3,r3
2231 | 	stm r1,{r3-r10}
2232 | 	tst r2,#1
2233 | 	itt ne
2234 | 	movne r0,#0
2235 | 	bxne lr
2236 | 	ldm r0,{r2-r9}
2237 | 	b 2b
2238 | 	
2239 | 	.size mod_inv_vartime_inner_n, .-mod_inv_vartime_inner_n
2240 | 
2241 | // *r0 = result
2242 | // *r1 = input
2243 | 	.type P256_mod_n_inv_vartime, %function
2244 | P256_mod_n_inv_vartime:
2245 | 	.global P256_mod_n_inv_vartime
2246 | 	push {r0,r4-r11,lr}
2247 | 	//frame push {r4-r11,lr}
2248 | 	//frame address sp,40
2249 | 	sub sp,#128
2250 | 	//frame address sp,168
2251 | 	mov r0,sp
2252 | 	
2253 | 	// stack: u x1 v x2
2254 | 	// init: u=*r1, v=p, x1=1, x2=0
2255 | 
2256 | 	ldm r1,{r1-r8}
2257 | 	stm r0!,{r1-r8}
2258 | 
2259 | 	movs r1,#1
2260 | 	movs r2,#0
2261 | 	umull r3,r4,r2,r2
2262 | 	umull r5,r6,r2,r2
2263 | 	umull r7,r8,r2,r2
2264 | 	mov r9,#0
2265 | 
2266 | 	stm r0,{r1-r8}
2267 | 	add r0,sp,#96
2268 | 	stm r0,{r2-r9}
2269 | 	adr r2,P256_order
2270 | 	ldm r2,{r2-r9}
2271 | 	add r0,sp,#64
2272 | 	stm r0,{r2-r9}
2273 | 
2274 | 0:
2275 | 	mov r0,sp
2276 | 	add r1,sp,#32
2277 | 	bl mod_inv_vartime_inner_n
2278 | 	cmp r0,#0
2279 | 	it ne
2280 | 	addne r0,sp,#32
2281 | 	bne 2f
2282 | 
2283 | 	add r0,sp,#64
2284 | 	add r1,sp,#96
2285 | 	bl mod_inv_vartime_inner_n
2286 | 	cmp r0,#0
2287 | 	it ne
2288 | 	addne r0,sp,#96
2289 | 	bne 2f
2290 | 
2291 | 	ldm sp,{r0-r7}
2292 | 	add lr,sp,#64
2293 | 	ldm lr!,{r8-r11}
2294 | 	subs r0,r8
2295 | 	sbcs r1,r1,r9
2296 | 	sbcs r2,r2,r10
2297 | 	sbcs r3,r3,r11
2298 | 	ldm lr!,{r8-r11}
2299 | 	sbcs r4,r4,r8
2300 | 	sbcs r5,r5,r9
2301 | 	sbcs r6,r6,r10
2302 | 	sbcs r7,r7,r11
2303 | 
2304 | 	bcc 1f
2305 | 	stm sp,{r0-r7}
2306 | 	add r0,sp,#32
2307 | 	add r1,sp,#32
2308 | 	add r2,sp,#96
2309 | 3:
2310 | 	// submod here
2311 | 	ldm r1,{r1,r3-r9}
2312 | 	ldm r2!,{r10,r11,r12,lr}
2313 | 	subs r1,r10
2314 | 	sbcs r3,r3,r11
2315 | 	sbcs r4,r4,r12
2316 | 	sbcs r5,r5,lr
2317 | 	ldm r2!,{r10,r11,r12,lr}
2318 | 	sbcs r6,r6,r10
2319 | 	sbcs r7,r7,r11
2320 | 	sbcs r8,r8,r12
2321 | 	sbcs r9,r9,lr
2322 | 	
2323 | 	sbcs r10,r10,r10
2324 | 	adr r11,P256_order
2325 | 	ldm r11,{r2,r11,r12,lr}
2326 | 	and r2,r10
2327 | 	and r11,r10
2328 | 	and r12,r10
2329 | 	and lr,r10
2330 | 	adds r1,r2
2331 | 	adcs r3,r3,r11
2332 | 	adcs r4,r4,r12
2333 | 	adcs r5,r5,lr
2334 | 	adcs r6,r6,r10
2335 | 	adcs r7,r7,r10
2336 | 	adcs r8,r8,#0
2337 | 	adcs r9,r9,r10
2338 | 	stm r0,{r1,r3-r9}
2339 | 	b 0b
2340 | 1:
2341 | 	movs r8,#0
2342 | 	subs r0,r8,r0
2343 | 	sbcs r1,r8,r1
2344 | 	sbcs r2,r8,r2
2345 | 	sbcs r3,r8,r3
2346 | 	sbcs r4,r8,r4
2347 | 	sbcs r5,r8,r5
2348 | 	sbcs r6,r8,r6
2349 | 	sbcs r7,r8,r7
2350 | 	add r8,sp,#64
2351 | 	stm r8,{r0-r7}
2352 | 	add r0,sp,#96
2353 | 	add r1,sp,#96
2354 | 	add r2,sp,#32
2355 | 	b 3b
2356 | 
2357 | 2:
2358 | 	ldm r0,{r0-r7}
2359 | 	add sp,#128
2360 | 	//frame address sp,40
2361 | 	pop {r8}
2362 | 	//frame address sp,36
2363 | 	stm r8,{r0-r7}
2364 | 	pop {r4-r11,pc}
2365 | 	
2366 | 	.size P256_mod_n_inv_vartime, .-P256_mod_n_inv_vartime
2367 | #endif
2368 | #endif
2369 | 
2370 | #if include_p256_mult
2371 | 	.align 2
2372 | P256_order_local: // (arm clang assembler is broken for ldrd global labels defined in the same file)
2373 | 	.type P256_order, %object
2374 | P256_order:
2375 | 	.global P256_order
2376 | 	.word 0xFC632551
2377 | 	.word 0xF3B9CAC2
2378 | 	.word 0xA7179E84
2379 | 	.word 0xBCE6FAAD
2380 | 	.word 0xFFFFFFFF
2381 | 	.word 0xFFFFFFFF
2382 | 	.word 0
2383 | 	.word 0xFFFFFFFF
2384 | 	.word 0
2385 | 	.size P256_order, .-P256_order
2386 | #endif
2387 | 
2388 | #if include_p256_verify || include_p256_basemult || include_p256_raw_scalarmult_generic
2389 | // Checks whether the input number is within [1,n-1]
2390 | // in: *r0
2391 | // out: r0 = 1 if ok, else 0
2392 | 	.type P256_check_range_n, %function
2393 | P256_check_range_n:
2394 | 	.global P256_check_range_n
2395 | 	push {r4-r11,lr}
2396 | 	//frame push {r4-r11,lr}
2397 | 	ldm r0,{r1-r8}
2398 | 	orrs r0,r1,r2
2399 | 	orrs r0,r3
2400 | 	orrs r0,r4
2401 | 	orrs r0,r5
2402 | 	orrs r0,r6
2403 | 	orrs r0,r7
2404 | 	orrs r0,r8
2405 | 	beq 0f
2406 | 	
2407 | 	adr r0,P256_order
2408 | 	ldm r0!,{r9-r12}
2409 | 	subs r1,r9
2410 | 	sbcs r2,r2,r10
2411 | 	sbcs r3,r3,r11
2412 | 	sbcs r4,r4,r12
2413 | 	ldm r0,{r0-r3}
2414 | 	sbcs r5,r5,r0
2415 | 	sbcs r6,r6,r1
2416 | 	sbcs r7,r7,r2
2417 | 	sbcs r8,r8,r3
2418 | 	
2419 | 	sbcs r0,r0,r0
2420 | 	lsrs r0,#31
2421 | 0:
2422 | 	pop {r4-r11,pc}
2423 | 	
2424 | 	.size P256_check_range_n, .-P256_check_range_n
2425 | #endif
2426 | 	
2427 | 
2428 | // Elliptic curve operations on the NIST curve P-256
2429 | 
2430 | #if include_p256_verify || include_p256_varmult || include_p256_decompress_point || include_p256_decode_point
2431 | 	.align 2
2432 | b_mont:
2433 | 	.word 0x29c4bddf
2434 | 	.word 0xd89cdf62
2435 | 	.word 0x78843090
2436 | 	.word 0xacf005cd
2437 | 	.word 0xf7212ed6
2438 | 	.word 0xe5a220ab
2439 | 	.word 0x04874834
2440 | 	.word 0xdc30061d
2441 | three_mont:
2442 | 	.word 0x3
2443 | 	.word 0x0
2444 | 	.word 0x0
2445 | 	.word 0xfffffffd
2446 | 	.word 0xffffffff
2447 | 	.word 0xffffffff
2448 | 	.word 0xfffffffc
2449 | 	.word 0x2
2450 | #endif
2451 | 
2452 | #if include_p256_verify || include_p256_varmult || include_p256_decode_point
2453 | // Checks if a point is on curve
2454 | // in: *r0 = x, *r1 = y, in Montgomery form
2455 | // out: r0 = 1 if on curve, else 0
2456 | 	.type P256_point_is_on_curve, %function
2457 | P256_point_is_on_curve:
2458 | 	.global P256_point_is_on_curve
2459 | 	push {r0,r4-r11,lr}
2460 | 	//frame push {r4-r11,lr}
2461 | 	//frame address sp,40
2462 | 	
2463 | 	// We verify y^2 - x(x^2 - 3) = b
2464 | 	
2465 | 	// y^2
2466 | 	ldm r1,{r0-r7}
2467 | 	bl P256_sqrmod
2468 | 	push {r0-r7}
2469 | 	//frame address sp,72
2470 | 	
2471 | 	// x^2
2472 | 	ldr r0,[sp,#32]
2473 | 	ldm r0,{r0-r7}
2474 | 	bl P256_sqrmod
2475 | 	push {r0-r7}
2476 | 	//frame address sp,104
2477 | 	
2478 | 	// x^2 - 3
2479 | 	mov r1,sp
2480 | 	adr r2,three_mont
2481 | 	bl P256_submod
2482 | 	stm sp,{r0-r7}
2483 | 	
2484 | 	// x(x^2 - 3)
2485 | 	ldr r1,[sp,#64]
2486 | 	mov r2,sp
2487 | 	bl P256_mulmod
2488 | 	stm sp,{r0-r7}
2489 | 	
2490 | 	// y^2 - x(x^2 - 3)
2491 | 	add r1,sp,#32
2492 | 	mov r2,sp
2493 | 	bl P256_submod
2494 | 	
2495 | 	// compare with b
2496 | 	adr r8,b_mont
2497 | 	ldm r8!,{r9-r12}
2498 | 	eors r0,r9
2499 | 	ittt eq
2500 | 	eorseq r1,r10
2501 | 	eorseq r2,r11
2502 | 	eorseq r3,r12
2503 | 	ldm r8,{r9-r12}
2504 | 	itttt eq
2505 | 	eorseq r4,r9
2506 | 	eorseq r5,r10
2507 | 	eorseq r6,r11
2508 | 	eorseq r7,r12
2509 | 	mov r0,#0
2510 | 	it eq
2511 | 	moveq r0,#1
2512 | 	
2513 | 	add sp,#68
2514 | 	//frame address sp,36
2515 | 	
2516 | 	pop {r4-r11,pc}
2517 | 	
2518 | 	.size P256_point_is_on_curve, .-P256_point_is_on_curve
2519 | #endif
2520 | 
2521 | #if include_p256_basemult || include_p256_varmult || include_p256_decompress_point
2522 | 	.align 2
2523 | P256_p:
2524 | 	.word 0xffffffff
2525 | 	.word 0xffffffff
2526 | 	.word 0xffffffff
2527 | 	.word 0
2528 | 	.word 0
2529 | 	.word 0
2530 | 	.word 1
2531 | 	.word 0xffffffff
2532 | #endif
2533 | 
2534 | #if include_p256_decompress_point
2535 | // in: r0 = output location for y, *r1 = x, r2 = parity bit for y
2536 | // out: r0 = 1 if ok, 0 if invalid x
2537 | 	.type P256_decompress_point, %function
2538 | P256_decompress_point:
2539 | 	.global P256_decompress_point
2540 | 	push {r0,r2,r4-r11,lr}
2541 | 	//frame push {r4-r11,lr}
2542 | 	//frame address sp,44
2543 | 	sub sp,#32
2544 | 	//frame address sp,76
2545 | 	
2546 | 	mov r0,sp
2547 | 	bl P256_to_montgomery
2548 | 	ldm sp,{r0-r7}
2549 | 	
2550 | 	bl P256_sqrmod
2551 | 	push {r0-r7}
2552 | 	
2553 | 	mov r1,sp
2554 | 	adr r2,three_mont
2555 | 	bl P256_submod
2556 | 	stm sp,{r0-r7}
2557 | 	//frame address sp,108
2558 | 	
2559 | 	add r1,sp,#32
2560 | 	mov r2,sp
2561 | 	bl P256_mulmod
2562 | 	stm sp,{r0-r7}
2563 | 	
2564 | 	mov r1,sp
2565 | 	adr r2,b_mont
2566 | 	bl P256_addmod
2567 | 	stm sp,{r0-r7}
2568 | 	
2569 | 	mov r8,#1
2570 | 	bl P256_modinv_sqrt
2571 | 	add r8,sp,#32
2572 | 	stm r8,{r0-r7}
2573 | 	
2574 | 	bl P256_sqrmod
2575 | 	
2576 | 	pop {r8-r11}
2577 | 	//frame address sp,92
2578 | 	eors r8,r0
2579 | 	ittt eq
2580 | 	eorseq r9,r1
2581 | 	eorseq r10,r2
2582 | 	eorseq r11,r3
2583 | 	pop {r8-r11}
2584 | 	//frame address sp,76
2585 | 	itttt eq
2586 | 	eorseq r8,r4
2587 | 	eorseq r9,r5
2588 | 	eorseq r10,r6
2589 | 	eorseq r11,r7
2590 | 	it ne
2591 | 	movne r0,#0
2592 | 	bne 1f
2593 | 	
2594 | 	mov r0,sp
2595 | 	mov r1,sp
2596 | 	bl P256_from_montgomery
2597 | 	
2598 | 	ldr r3,[sp]
2599 | 	ldrd r0,r1,[sp,#32]
2600 | 	and r2,r3,#1
2601 | 	eors r2,r1
2602 | 	mov r1,sp
2603 | 	adr r3,P256_p
2604 | 	bl P256_negate_mod_m_if
2605 | 	movs r0,#1
2606 | 1:
2607 | 	add sp,#32+8
2608 | 	//frame address sp,36
2609 | 	pop {r4-r11,pc}
2610 | 	
2611 | 	.size P256_decompress_point, .-P256_decompress_point
2612 | #endif
2613 | 
2614 | #if include_p256_basemult || include_p256_varmult
2615 | // *r0 = output affine montgomery x
2616 | // *r1 = output affine montgomery y
2617 | // *r2 = input jacobian montgomery
2618 | 	.type P256_jacobian_to_affine, %function
2619 | P256_jacobian_to_affine:
2620 | 	.global P256_jacobian_to_affine
2621 | 	push {r0,r1,r2,r4-r11,lr}
2622 | 	//frame push {r4-r11,lr}
2623 | 	//frame address sp,48
2624 | 	
2625 | 	adds r2,#64
2626 | 	ldm r2,{r0-r7}
2627 | 	mov r8,#0
2628 | 	bl P256_modinv_sqrt
2629 | 	push {r0-r7}
2630 | 	//frame address sp,80
2631 | 	
2632 | 	bl P256_sqrmod
2633 | 	push {r0-r7}
2634 | 	//frame address sp,112
2635 | 	
2636 | 	add r1,sp,#32
2637 | 	mov r2,sp
2638 | 	bl P256_mulmod
2639 | 	add r8,sp,#32
2640 | 	stm r8,{r0-r7}
2641 | 	
2642 | 	mov r1,sp
2643 | 	ldr r2,[sp,#72]
2644 | 	bl P256_mulmod
2645 | 	ldr r8,[sp,#64]
2646 | 	stm r8,{r0-r7}
2647 | 	
2648 | 	ldr r2,[sp,#72]
2649 | 	add r1,sp,#32
2650 | 	adds r2,r2,#32
2651 | 	bl P256_mulmod
2652 | 	ldr r8,[sp,#68]
2653 | 	stm r8,{r0-r7}
2654 | 	
2655 | 	add sp,#76
2656 | 	//frame address sp,36
2657 | 	
2658 | 	pop {r4-r11,pc}
2659 | 	.size P256_jacobian_to_affine, .-P256_jacobian_to_affine
2660 | #endif
2661 | 	
2662 | #if include_p256_mult
2663 | // Doubles the point in Jacobian form (integers are in Montgomery form)
2664 | // *r0 = out, *r1 = in
2665 | 	.type P256_double_j, %function
2666 | P256_double_j:
2667 | 	.global P256_double_j
2668 | 	push {r0,r1,r4-r11,lr}
2669 | 	//frame push {r4-r11,lr}
2670 | 	//frame address sp,44
2671 | 	
2672 | 	// https://eprint.iacr.org/2014/130.pdf, algorithm 10
2673 | 	
2674 | 	// t1 = Z1^2
2675 | 	adds r1,#64
2676 | 	ldm r1,{r0-r7}
2677 | 	bl P256_sqrmod
2678 | 	push {r0-r7}
2679 | 	//frame address sp,76
2680 | 	
2681 | 	// Z2 = Y1 * Z1
2682 | 	ldr r1,[sp,#36]
2683 | 	adds r1,#32
2684 | 	add r2,r1,#32
2685 | 	bl P256_mulmod
2686 | 	ldr r8,[sp,#32]
2687 | 	add r8,#64
2688 | 	stm r8,{r0-r7}
2689 | 	
2690 | 	// t2 = X1 + t1
2691 | 	ldr r1,[sp,#36]
2692 | 	mov r2,sp
2693 | 	bl P256_addmod
2694 | 	push {r0-r7}
2695 | 	//frame address sp,108
2696 | 	
2697 | 	// t1 = X1 - t1
2698 | 	ldr r1,[sp,#68]
2699 | 	add r2,sp,#32
2700 | 	bl P256_submod
2701 | 	add r8,sp,#32
2702 | 	stm r8,{r0-r7}
2703 | 	
2704 | 	// t1 = t1 * t2
2705 | 	add r1,sp,#32
2706 | 	mov r2,sp
2707 | 	bl P256_mulmod
2708 | 	add r8,sp,#32
2709 | 	stm r8,{r0-r7}
2710 | 	
2711 | 	// t2 = t1 / 2
2712 | 	lsl r8,r0,#31
2713 | 	adds r0,r0,r8, asr #31
2714 | 	adcs r1,r1,r8, asr #31
2715 | 	adcs r2,r2,r8, asr #31
2716 | 	adcs r3,r3,#0
2717 | 	adcs r4,r4,#0
2718 | 	adcs r5,r5,#0
2719 | 	adcs r6,r6,r8, lsr #31
2720 | 	adcs r7,r7,r8, asr #31
2721 | 	rrxs r7,r7
2722 | 	rrxs r6,r6
2723 | 	rrxs r5,r5
2724 | 	rrxs r4,r4
2725 | 	rrxs r3,r3
2726 | 	rrxs r2,r2
2727 | 	rrxs r1,r1
2728 | 	rrx r0,r0
2729 | 	stm sp,{r0-r7}
2730 | 	
2731 | 	// t1 = t1 + t2
2732 | 	add r1,sp,#32
2733 | 	mov r2,sp
2734 | 	bl P256_addmod
2735 | 	add r8,sp,#32
2736 | 	stm r8,{r0-r7}
2737 | 	
2738 | 	// t2 = t1^2
2739 | 	bl P256_sqrmod
2740 | 	stm sp,{r0-r7}
2741 | 	
2742 | 	// Y2 = Y1^2
2743 | 	ldr r0,[sp,#68]
2744 | 	adds r0,#32
2745 | 	ldm r0,{r0-r7}
2746 | 	bl P256_sqrmod
2747 | 	ldr r8,[sp,#64]
2748 | 	add r8,#32
2749 | 	stm r8,{r0-r7}
2750 | 	
2751 | 	// t3 = Y2^2
2752 | 	bl P256_sqrmod
2753 | 	push {r0-r7}
2754 | 	//frame address sp,140
2755 | 	
2756 | 	// Y2 = X1 * Y2
2757 | 	ldrd r0,r1,[sp,#96]
2758 | 	add r2,r0,#32
2759 | 	bl P256_mulmod
2760 | 	ldr r8,[sp,#96]
2761 | 	add r8,#32
2762 | 	stm r8,{r0-r7}
2763 | 	
2764 | 	// X2 = 2 * Y2
2765 | 	bl P256_times2
2766 | 	ldr r8,[sp,#96]
2767 | 	stm r8,{r0-r7}
2768 | 	
2769 | 	// X2 = t2 - X2
2770 | 	add r1,sp,#32
2771 | 	mov r2,r8
2772 | 	bl P256_submod
2773 | 	ldr r8,[sp,#96]
2774 | 	stm r8,{r0-r7}
2775 | 	
2776 | 	// t2 = Y2 - X2
2777 | 	mov r2,r8
2778 | 	add r1,r2,#32
2779 | 	bl P256_submod
2780 | 	add r8,sp,#32
2781 | 	stm r8,{r0-r7}
2782 | 	
2783 | 	// t1 = t1 * t2
2784 | 	add r1,sp,#64
2785 | 	add r2,sp,#32
2786 | 	bl P256_mulmod
2787 | 	add r8,sp,#64
2788 | 	stm r8,{r0-r7}
2789 | 	
2790 | 	// Y2 = t1 - t3
2791 | 	add r1,sp,#64
2792 | 	mov r2,sp
2793 | 	bl P256_submod
2794 | 	ldr r8,[sp,#96]
2795 | 	add r8,#32
2796 | 	stm r8,{r0-r7}
2797 | 	
2798 | 	add sp,#104
2799 | 	//frame address sp,36
2800 | 	
2801 | 	pop {r4-r11,pc}
2802 | 	.size P256_double_j, .-P256_double_j
2803 | 
2804 | // sets the jacobian *r0 point to *r1
2805 | // if r2=1, then Y will be negated
2806 | // if r3=1, then Z will be set to 1
2807 | // clobbers all registers
2808 | 	.type add_sub_helper, %function
2809 | add_sub_helper:
2810 | 	push {lr}
2811 | 	//frame push {lr}
2812 | 	ldm r1!,{r5-r12}
2813 | 	stm r0!,{r5-r12}
2814 | 	ldm r1!,{r5-r12}
2815 | 	cbz r2,0f
2816 | 	// note that Y is never 0 for a valid point
2817 | 	mov lr,#0
2818 | 	rsbs r4,r2,#0
2819 | 	subs r5,r4,r5
2820 | 	sbcs r6,r4,r6
2821 | 	sbcs r7,r4,r7
2822 | 	sbcs r8,lr,r8
2823 | 	sbcs r9,lr,r9
2824 | 	sbcs r10,lr,r10
2825 | 	sbcs r11,r2,r11
2826 | 	sbcs r12,r4,r12
2827 | 0:
2828 | 	stm r0!,{r5-r12}
2829 | 	cbnz r3,1f
2830 | 	ldm r1,{r5-r12}
2831 | 	stm r0,{r5-r12}
2832 | 	b 2f
2833 | 1:
2834 | 	// Set Z3 to 1 in Montgomery form
2835 | 	movs r4,#0
2836 | 	umull r5,r10,r4,r4
2837 | 	mvns r6,r4
2838 | 	mvns r7,r4
2839 | 	mov r8,#0xffffffff
2840 | 	mov r9,#0xfffffffe
2841 | 	
2842 | 	stm r0,{r3-r10}
2843 | 2:
2844 | 	pop {pc}
2845 | 	
2846 | 	.size add_sub_helper, .-add_sub_helper
2847 | 
2848 | // Adds or subtracts points in Jacobian form (integers are in Montgomery form)
2849 | // The first operand is located in *r0, the second in *r1 (may not overlap)
2850 | // The result is stored at *r0
2851 | // r2 = 0 if add, 1 if sub
2852 | // r3 = 1 if the second point's Z point is 1 and therefore not loaded
2853 | //
2854 | // This function assumes the second operand is not the point at infinity,
2855 | // otherwise it handles all inputs.
2856 | // The first operand is treated at the point at infinity as long as its Z coordinate is 0.
2857 | 	.type P256_add_sub_j, %function
2858 | P256_add_sub_j:
2859 | 	.global P256_add_sub_j
2860 | 	push {r0-r11,lr}
2861 | 	//frame push {r4-r11,lr}
2862 | 	//frame address sp,52
2863 | 	
2864 | 	//ldr r4,[r0,#64]
2865 | 	//cbnz r4,2f
2866 | 	add r4,r0,#64
2867 | 	ldm r4,{r4-r11}
2868 | 	orrs r4,r5
2869 | 	orrs r4,r6
2870 | 	orrs r4,r7
2871 | 	orrs r4,r8
2872 | 	orrs r4,r9
2873 | 	orrs r4,r10
2874 | 	orrs r4,r11
2875 | 	bne 2f
2876 | 	
2877 | 	// First point is 0, so just set result to (-) the other point
2878 | 	bl add_sub_helper
2879 | 	add sp,#16
2880 | 	//frame address sp,36
2881 | 	pop {r4-r11,pc}
2882 | 2:
2883 | 	//frame address sp,52
2884 | 	// Here a variant of
2885 | 	// https://www.hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-3/addition/add-1998-cmo-2.op3
2886 | 	// is used, but rearranged and uses less temporaries.
2887 | 	// The first operand to the function is both (X3,Y3,Z3) and (X2,Y2,Z2).
2888 | 	// The second operand to the function is (X1,Y1,Z1)
2889 | 	
2890 | 	cbnz r3,100f
2891 | 	
2892 | 	// Z1Z1 = Z1^2
2893 | 	adds r1,#64
2894 | 	ldm r1,{r0-r7}
2895 | 	bl P256_sqrmod
2896 | 	push {r0-r7}
2897 | 	//frame address sp,84
2898 | 	
2899 | 	// U2 = X2*Z1Z1
2900 | 	ldr r1,[sp,#32]
2901 | 	mov r2,sp
2902 | 	bl P256_mulmod
2903 | 	ldr r8,[sp,#32]
2904 | 	stm r8,{r0-r7}
2905 | 	
2906 | 	// t1 = Z1*Z1Z1
2907 | 	ldr r1,[sp,#36]
2908 | 	adds r1,#64
2909 | 	mov r2,sp
2910 | 	bl P256_mulmod
2911 | 	stm sp,{r0-r7}
2912 | 	
2913 | 	// S2 = Y2*t1
2914 | 	ldr r1,[sp,#32]
2915 | 	adds r1,#32
2916 | 	mov r2,sp
2917 | 	bl P256_mulmod
2918 | 	ldr r8,[sp,#32]
2919 | 	add r8,#32
2920 | 	stm r8,{r0-r7}
2921 | 	b 101f
2922 | 100:
2923 | 	sub sp,#32
2924 | 	//frame address sp,84
2925 | 101:
2926 | 	
2927 | 	// Z2Z2 = Z2^2
2928 | 	ldr r1,[sp,#32]
2929 | 	adds r1,#64
2930 | 	ldm r1,{r0-r7}
2931 | 	bl P256_sqrmod
2932 | 	push {r0-r7}
2933 | 	//frame address sp,116
2934 | 	
2935 | 	// U1 = X1*Z2Z2
2936 | 	ldr r1,[sp,#68]
2937 | 	mov r2,sp
2938 | 	bl P256_mulmod
2939 | 	add r8,sp,#32
2940 | 	stm r8,{r0-r7}
2941 | 	
2942 | 	// t2 = Z2*Z2Z2
2943 | 	ldr r1,[sp,#64]
2944 | 	adds r1,#64
2945 | 	mov r2,sp
2946 | 	bl P256_mulmod
2947 | 	stm sp,{r0-r7}
2948 | 	
2949 | 	// S1 = Y1*t2
2950 | 	ldr r1,[sp,#68]
2951 | 	adds r1,#32
2952 | 	mov r2,sp
2953 | 	bl P256_mulmod
2954 | 	stm sp,{r0-r7}
2955 | 	
2956 | 	
2957 | 	// H = U2-U1
2958 | 	ldr r1,[sp,#64]
2959 | 	add r2,sp,#32
2960 | 	bl P256_submod
2961 | 	ldr r8,[sp,#64]
2962 | 	stm r8,{r0-r7}
2963 | 	
2964 | 	// HH = H^2
2965 | 	bl P256_sqrmod
2966 | 	push {r0-r7}
2967 | 	//frame address sp,148
2968 | 	
2969 | 	// Z3 = Z2*H
2970 | 	ldr r2,[sp,#96]
2971 | 	add r1,r2,#64
2972 | 	bl P256_mulmod
2973 | 	ldr r8,[sp,#96]
2974 | 	add r8,#64
2975 | 	stm r8,{r0-r7}
2976 | 	
2977 | 	// Z3 = Z1*Z3
2978 | 	ldr r1,[sp,#108]
2979 | 	cbnz r1,102f
2980 | 	ldr r1,[sp,#100]
2981 | 	adds r1,#64
2982 | 	mov r2,r8
2983 | 	bl P256_mulmod
2984 | 	ldr r8,[sp,#96]
2985 | 	add r8,#64
2986 | 	stm r8,{r0-r7}
2987 | 102:
2988 | 	
2989 | 	// HHH = H*HH
2990 | 	ldr r1,[sp,#96]
2991 | 	mov r2,sp
2992 | 	bl P256_mulmod
2993 | 	ldr r8,[sp,#96]
2994 | 	stm r8,{r0-r7}
2995 | 	
2996 | 	//cbnz r0,3f
2997 | 	orrs r1,r0 ////
2998 | 	orrs r1,r2
2999 | 	orrs r1,r3
3000 | 	orrs r1,r4
3001 | 	orrs r1,r5
3002 | 	orrs r1,r6
3003 | 	orrs r0,r1,r7
3004 | 3:
3005 | 	push {r0} // if r0 == 0: HHH is 0, which means the two input points have the same affine x coordinates
3006 | 	//frame address sp,152
3007 | 	
3008 | 	// r = S2-+S1
3009 | 	ldr r1,[sp,#100]
3010 | 	adds r1,#32
3011 | 	add r2,sp,#36
3012 | 	ldr r3,[sp,#108]
3013 | 	cbz r3,4f
3014 | 	bl P256_addmod
3015 | 	b 5f
3016 | 4:
3017 | 	bl P256_submod
3018 | 5:
3019 | 	ldr r8,[sp,#100]
3020 | 	add r8,#32
3021 | 	stm r8,{r0-r7}
3022 | 	
3023 | 	// check r == 0 && HHH == 0
3024 | 	pop {r8}
3025 | 	//frame address sp,148
3026 | 	//cbnz r0,6f
3027 | 	orrs r1,r0 ////
3028 | 	orrs r1,r2
3029 | 	orrs r1,r3
3030 | 	orrs r1,r4
3031 | 	orrs r1,r5
3032 | 	orrs r1,r6
3033 | 	orrs r1,r7
3034 | 	orrs r1,r8
3035 | 	bne 6f
3036 | 	// Points should be doubled since addition formula can't handle this case
3037 | 	// Since we have already overwritten the first point,
3038 | 	// we must copy the second point after possibly negating it
3039 | 	add sp,#96
3040 | 	//frame address sp,52
3041 | 	ldm sp,{r0-r3}
3042 | 	bl add_sub_helper
3043 | 	
3044 | 	ldr r0,[sp,#0]
3045 | 	mov r1,r0
3046 | 	add sp,#16
3047 | 	//frame address sp,36
3048 | 	bl P256_double_j
3049 | 	pop {r4-r11,pc}
3050 | 6:
3051 | 	//frame address sp,148
3052 | 	
3053 | 	// V = U1*HH
3054 | 	add r1,sp,#64
3055 | 	mov r2,sp
3056 | 	bl P256_mulmod
3057 | 	add r8,sp,#64
3058 | 	stm r8,{r0-r7}
3059 | 	
3060 | 	// t3 = r^2
3061 | 	ldr r0,[sp,#96]
3062 | 	adds r0,#32
3063 | 	ldm r0,{r0-r7}
3064 | 	bl P256_sqrmod
3065 | 	stm sp,{r0-r7}
3066 | 	
3067 | 	// t2 = S1*HHH
3068 | 	add r1,sp,#32
3069 | 	ldr r2,[sp,#96]
3070 | 	bl P256_mulmod
3071 | 	add r8,sp,#32
3072 | 	stm r8,{r0-r7}
3073 | 	
3074 | 	// X3 = t3-HHH
3075 | 	mov r1,sp
3076 | 	ldr r2,[sp,#96]
3077 | 	bl P256_submod
3078 | 	ldr r8,[sp,#96]
3079 | 	stm r8,{r0-r7}
3080 | 	
3081 | 	// t3 = 2*V
3082 | 	add r0,sp,#64
3083 | 	ldm r0,{r0-r7}
3084 | 	bl P256_times2
3085 | 	stm sp,{r0-r7}
3086 | 	
3087 | 	// X3 = X3-t3
3088 | 	ldr r1,[sp,#96]
3089 | 	mov r2,sp
3090 | 	bl P256_submod
3091 | 	ldr r8,[sp,#96]
3092 | 	stm r8,{r0-r7}
3093 | 	
3094 | 	// t3 = V-X3
3095 | 	add r1,sp,#64
3096 | 	ldr r2,[sp,#96]
3097 | 	bl P256_submod
3098 | 	stm sp,{r0-r7}
3099 | 	
3100 | 	// t3 = r*t3
3101 | 	ldr r1,[sp,#96]
3102 | 	adds r1,#32
3103 | 	mov r2,sp
3104 | 	bl P256_mulmod
3105 | 	stm sp,{r0-r7}
3106 | 	
3107 | 	// Y3 = t3-+t2
3108 | 	ldr r0,[sp,#104]
3109 | 	mov r1,sp
3110 | 	add r2,sp,#32
3111 | 	cbz r0,7f
3112 | 	bl P256_addmod
3113 | 	b 8f
3114 | 7:
3115 | 	bl P256_submod
3116 | 8:
3117 | 	ldr r8,[sp,#96]
3118 | 	add r8,#32
3119 | 	stm r8,{r0-r7}
3120 | 	
3121 | 	add sp,#112
3122 | 	//frame address sp,36
3123 | 	
3124 | 	pop {r4-r11,pc}
3125 | 	.size P256_add_sub_j, .-P256_add_sub_j
3126 | #endif
3127 | 
3128 | #if include_p256_verify
3129 | // Determines whether r = x (mod n)
3130 | // in: *r0 = r, *r1 = the result of the double scalarmult in jacobian form (Montgomery form)
3131 | // out: r0 will contain 1 if valid, else 0
3132 | 	.type P256_verify_last_step, %function
3133 | P256_verify_last_step:
3134 | 	.global P256_verify_last_step
3135 | 	push {r0,r1,r4-r11,lr}
3136 | 	//frame push {r4-r11,lr}
3137 | 	//frame address sp,44
3138 | 	sub sp,#32
3139 | 	//frame address sp,76
3140 | 	
3141 | 	// Instead of doing an expensive field inversion and checking r = (X/Z^2 % p) (mod n),
3142 | 	// accept the signature iff r*Z^2 % p = X OR (r+n<p AND (r+n)*Z^2 % p = X).
3143 | 	// Proof that this is correct:
3144 | 	//   if we use the standard approach, that would mean we check that
3145 | 	//   r = (X/Z^2 % p) (mod n)
3146 | 	//   which is the same as r+k*n = (X/Z^2 % p) for any integer k,
3147 | 	//   but since the RHS is less than p and 2n > p, we only need to check for k=0,1
3148 | 	//   which means checking r = (X/Z^2 % p) OR r+n = (X/Z^2 % p)
3149 | 	//   For r = (X/Z^2 % p) we have that r < p and so we can instead check r*Z^2 % p = X
3150 | 	//   For r+n = (X/Z^2 % p) we must first check that r+n < p and can then check (r+n)*Z^2 % p = X
3151 | 	//
3152 | 	// Note that since p-n is around sqrt(n), it is extremely unlikely that r+n<p
3153 | 	//
3154 | 	// Note that X and Z are in Montgomery form but not r,
3155 | 	// so we must convert r to Montgomery form when it's time to do the multiplications
3156 | 	
3157 | 	// Calculate Z^2
3158 | 	add r1,#64
3159 | 	ldm r1,{r0-r7}
3160 | 	bl P256_sqrmod
3161 | 	push {r0-r7}
3162 | 	//frame address sp,108
3163 | 	
3164 | 	// Check if Z^2 if 0, if so reject
3165 | 	orrs r0,r1
3166 | 	orrs r0,r2
3167 | 	orrs r0,r3
3168 | 	orrs r0,r4
3169 | 	orrs r0,r5
3170 | 	orrs r0,r6
3171 | 	orrs r0,r7
3172 | 	beq 0f
3173 | 	
3174 | 	// Convert r to Montgomery form
3175 | 	ldr r1,[sp,#64]
3176 | 2:
3177 | 	add r0,sp,#32
3178 | 	bl P256_to_montgomery
3179 | 	
3180 | 	// Calculate r*Z^2
3181 | 	add r1,sp,#32
3182 | 	mov r2,sp
3183 | 	bl P256_mulmod
3184 | 	
3185 | 	// Now we will check if r*Z^2 = X
3186 | 	ldr r8,[sp,#68]
3187 | 	ldm r8!,{r9-r12}
3188 | 	eors r0,r9
3189 | 	ittt eq
3190 | 	eorseq r1,r10
3191 | 	eorseq r2,r11
3192 | 	eorseq r3,r12
3193 | 	ldm r8!,{r9-r12}
3194 | 	itttt eq
3195 | 	eorseq r4,r9
3196 | 	eorseq r5,r10
3197 | 	eorseq r6,r11
3198 | 	eorseq r7,r12
3199 | 	mov r0,#1
3200 | 	beq 1f
3201 | 	
3202 | 	// The check may fail if r < p-n, so also check for r' = r+n
3203 | 	adr r0,P256_order
3204 | 	ldm r0,{r8-r11}
3205 | 	ldr r0,[sp,#64]
3206 | 	cbz r0,0f // if we already tried once, abort
3207 | 	ldm r0,{r0-r7}
3208 | 	adds r0,r8
3209 | 	adcs r1,r1,r9
3210 | 	adcs r2,r2,r10
3211 | 	adcs r3,r3,r11
3212 | 	adcs r4,r4,#0xffffffff
3213 | 	adcs r5,r5,#0xffffffff
3214 | 	adcs r6,r6,#0
3215 | 	adcs r7,r7,#0xffffffff
3216 | 	bcs 0f // reject if r+n >= 2^256 (which is >= p)
3217 | 	
3218 | 	subs r8,r0,#0xffffffff
3219 | 	sbcs r8,r1,#0xffffffff
3220 | 	sbcs r8,r2,#0xffffffff
3221 | 	sbcs r8,r3,#0
3222 | 	sbcs r8,r4,#0
3223 | 	sbcs r8,r5,#0
3224 | 	sbcs r8,r6,#1
3225 | 	sbcs r8,r7,#0xffffffff
3226 | 	bcs 0f // reject if r+n >= p
3227 | 	
3228 | 	add r8,sp,#32
3229 | 	stm r8,{r0-r7}
3230 | 	movs r2,#0
3231 | 	str r2,[sp,#64] // set r variable to NULL to avoid yet another try
3232 | 	
3233 | 	mov r1,r8
3234 | 	b 2b
3235 | 
3236 | 0:
3237 | 	movs r0,#0
3238 | 1:
3239 | 	add sp,#72
3240 | 	//frame address sp,36
3241 | 	pop {r4-r11,pc}
3242 | 	
3243 | 	.size P256_verify_last_step, .-P256_verify_last_step
3244 | #endif
3245 | 
3246 | #if include_p256_basemult || include_p256_varmult || include_p256_decompress_point
3247 | // in: *r0 = output location, *r1 = input, *r2 = 0/1, *r3 = m
3248 | // if r2 = 0, then *r0 is set to *r1
3249 | // if r2 = 1, then *r0 is set to m - *r1
3250 | // note that *r1 should be in the range [1,m-1]
3251 | // out: r0 and r1 will have advanced 32 bytes, r2 will remain as the input
3252 | 	.type P256_negate_mod_m_if, %function
3253 | P256_negate_mod_m_if:
3254 | 	push {r4-r8,lr}
3255 | 	//frame push {r4-r8,lr}
3256 | 	rsb r8,r2,#1
3257 | 	movs r6,#8
3258 | 	subs r7,r7 // set r7=0 and C=1
3259 | 0:
3260 | 	ldm r1!,{r4,r12}
3261 | 	ldm r3!,{r5,lr}
3262 | 	sbcs r5,r5,r4
3263 | 	umull r4,r7,r8,r4
3264 | 	umaal r4,r7,r2,r5
3265 | 	sbcs lr,lr,r12
3266 | 	umull r12,r7,r8,r12
3267 | 	umaal r12,r7,r2,lr
3268 | 	stm r0!,{r4,r12}
3269 | 	sub r6,#2
3270 | 	cbz r6,1f
3271 | 	b 0b
3272 | 1:
3273 | 	pop {r4-r8,pc}
3274 | 	.size P256_negate_mod_m_if, .-P256_negate_mod_m_if
3275 | #endif
3276 | 
3277 | #if include_p256_basemult || include_p256_varmult
3278 | 	.type P256_negate_mod_n_if, %function
3279 | P256_negate_mod_n_if:
3280 | 	.global P256_negate_mod_n_if
3281 | 	ldr r3,=P256_order
3282 | 	b P256_negate_mod_m_if
3283 | 	.size P256_negate_mod_n_if, .-P256_negate_mod_n_if
3284 | 
3285 | 	.type P256_negate_mod_p_if, %function
3286 | P256_negate_mod_p_if:
3287 | 	.global P256_negate_mod_p_if
3288 | 	adr r3,P256_p
3289 | 	b P256_negate_mod_m_if
3290 | 	.size P256_negate_mod_p_if, .-P256_negate_mod_p_if
3291 | #endif
3292 | 
3293 | 	.align 2
3294 | 	.end
3295 | 


--------------------------------------------------------------------------------
/p256-cortex-m4-asm-keil.s:
--------------------------------------------------------------------------------
   1 | ; Copyright (c) 2017-2021 Emil Lenngren
   2 | ; Copyright (c) 2021 Shortcut Labs AB
   3 | ;
   4 | ; Permission is hereby granted, free of charge, to any person obtaining a copy
   5 | ; of this software and associated documentation files (the "Software"), to deal
   6 | ; in the Software without restriction, including without limitation the rights
   7 | ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   8 | ; copies of the Software, and to permit persons to whom the Software is
   9 | ; furnished to do so, subject to the following conditions:
  10 | ;
  11 | ; The above copyright notice and this permission notice shall be included in all
  12 | ; copies or substantial portions of the Software.
  13 | ;
  14 | ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15 | ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16 | ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17 | ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18 | ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19 | ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  20 | ; SOFTWARE.
  21 | 
  22 | ; P256-Cortex-M4
  23 | 
  24 | #include "p256-cortex-m4-config.h"
  25 | 
  26 | ; This is an armv7 implementation of P-256.
  27 | ;
  28 | ; When secret data is processed, the implementation runs in constant time,
  29 | ; and no conditional branches depend on secret data.
  30 | 
  31 | 	area |.text|, code, readonly
  32 | 	align 4
  33 | 
  34 | #if (include_p256_basemult || include_p256_varmult) && has_d_cache
  35 | ; Selects one of many values
  36 | ; *r0 = output, *r1 = table, r2 = num coordinates, r3 = index to choose [0..7]
  37 | ; 547 cycles for affine coordinates
  38 | P256_select_point proc
  39 | 	export P256_select_point
  40 | 	push {r0,r2,r3,r4-r11,lr}
  41 | 	frame push {r4-r11,lr}
  42 | 	frame address sp,48
  43 | 	
  44 | 	subs r2,#1
  45 | 	lsls r2,#5
  46 | 
  47 | 0
  48 | 	rsbs r3,#0
  49 | 	sbcs r3,r3
  50 | 	mvns r3,r3
  51 | 	
  52 | 	ldm r1!,{r6-r12,lr}
  53 | 	ands r6,r3
  54 | 	ands r7,r3
  55 | 	and r8,r3
  56 | 	and r9,r3
  57 | 	and r10,r3
  58 | 	and r11,r3
  59 | 	and r12,r3
  60 | 	and lr,r3
  61 | 	
  62 | 	adds r1,r2
  63 | 	
  64 | 	movs r3,#1
  65 | 1
  66 | 	ldr r0,[sp,#8]
  67 | 	eors r0,r3
  68 | 	mrs r0,apsr
  69 | 	lsrs r0,#30
  70 | 	
  71 | 	ldm r1!,{r4,r5}
  72 | 	umlal r6,r2,r0,r4
  73 | 	umlal r7,r3,r0,r5
  74 | 	ldm r1!,{r4,r5}
  75 | 	umlal r8,r2,r0,r4
  76 | 	umlal r9,r3,r0,r5
  77 | 	ldm r1!,{r4,r5}
  78 | 	umlal r10,r2,r0,r4
  79 | 	umlal r11,r3,r0,r5
  80 | 	ldm r1!,{r4,r5}
  81 | 	umlal r12,r2,r0,r4
  82 | 	umlal lr,r3,r0,r5
  83 | 	
  84 | 	adds r1,r2
  85 | 	adds r3,#1
  86 | 	cmp r3,#8
  87 | 	bne %b1
  88 | 	
  89 | 	ldm sp,{r0,r4}
  90 | 	stm r0!,{r6-r12,lr}
  91 | 	str r0,[sp]
  92 | 	
  93 | 	sub r1,r1,r2, lsl #3
  94 | 	subs r1,#224
  95 | 	
  96 | 	subs r4,#1
  97 | 	str r4,[sp,#4]
  98 | 	ldr r3,[sp,#8]
  99 | 	bne %b0
 100 | 	
 101 | 	add sp,#12
 102 | 	frame address sp,36
 103 | 	pop {r4-r11,pc}
 104 | 	endp
 105 | #endif
 106 | 	
 107 | #if include_p256_verify || include_p256_sign
 108 | ; in: *r0 = out, *r1 = a, *r2 = b
 109 | ; quite slow, so only used in code not critical for performance
 110 | mul288x288 proc
 111 | 	push {r4-r11,lr}
 112 | 	frame push {r4-r11,lr}
 113 | 	
 114 | 	mov r4,r0
 115 | 	mov r5,r2
 116 | 	mov r6,r1
 117 | 	
 118 | 	movs r1,#72
 119 | 	bl setzero
 120 | 	
 121 | 	ldm r5,{r0-r2,r8-r12,lr}
 122 | 
 123 | 	movs r7,#9
 124 | 0
 125 | 	ldm r6!,{r5}
 126 | 	push {r6,r7}
 127 | 	frame address sp,44
 128 | 	movs r3,#0
 129 | 	ldm r4,{r6,r7}
 130 | 	umaal r6,r3,r5,r0
 131 | 	umaal r7,r3,r5,r1
 132 | 	stm r4!,{r6,r7}
 133 | 	ldm r4,{r6,r7}
 134 | 	umaal r6,r3,r5,r2
 135 | 	umaal r7,r3,r5,r8
 136 | 	stm r4!,{r6,r7}
 137 | 	ldm r4,{r6,r7}
 138 | 	umaal r6,r3,r5,r9
 139 | 	umaal r7,r3,r5,r10
 140 | 	stm r4!,{r6,r7}
 141 | 	ldm r4,{r6,r7}
 142 | 	umaal r6,r3,r5,r11
 143 | 	umaal r7,r3,r5,r12
 144 | 	stm r4!,{r6,r7}
 145 | 	ldm r4,{r6}
 146 | 	umaal r3,r6,r5,lr
 147 | 	stm r4!,{r3,r6}
 148 | 	
 149 | 	subs r4,r4,#36
 150 | 	pop {r6,r7}
 151 | 	frame address sp,36
 152 | 	subs r7,r7,#1
 153 | 	bne %b0
 154 | 	
 155 | 	pop {r4-r11,pc}
 156 | 	endp
 157 | ; in: r0 = address, r1 = num bytes (> 0, must be multiple of 8)
 158 | setzero proc
 159 | 	movs r2,#0
 160 | 	movs r3,#0
 161 | 0
 162 | 	stm r0!,{r2,r3}
 163 | 	subs r1,r1,#8
 164 | 	bne %b0
 165 | 	bx lr
 166 | 	endp
 167 | #endif
 168 | 	
 169 | 	
 170 | ; Field arithmetics for the prime field where p = 2^256 - 2^224 + 2^192 + 2^96 - 1
 171 | ; Multiplication and Squaring use Montgomery Modular Multiplication where R = 2^256
 172 | ; To convert a value to Montgomery class, use P256_mulmod(value, R^512 mod p)
 173 | ; To convert a value from Montgomery class to standard form, use P256_mulmod(value, 1)
 174 | 
 175 | #if include_p256_mult || include_p256_decompress_point || include_p256_decode_point
 176 | #if use_mul_for_sqr
 177 | P256_sqrmod proc
 178 | 	push {r0-r7,lr}
 179 | 	frame push {lr}
 180 | 	frame address sp,36
 181 | 	mov r1,sp
 182 | 	mov r2,sp
 183 | 	bl P256_mulmod
 184 | 	add sp,sp,#32
 185 | 	frame address sp,4
 186 | 	pop {pc}
 187 | 	endp
 188 | #endif
 189 | 	
 190 | #if has_fpu
 191 | ; If inputs are A*R mod p and B*R mod p, computes AB*R mod p
 192 | ; *r1 = in1, *r2 = in2
 193 | ; out: r0-r7
 194 | ; clobbers all other registers
 195 | P256_mulmod proc
 196 | 	push {lr}
 197 | 	frame push {lr}
 198 | 	
 199 | 	vmov s4,r2
 200 | 	vldm r1,{s8-s15}
 201 | 	
 202 | 	ldm r2,{r2,r3,r4,r5}
 203 | 	
 204 | 	vmov r0,r10,s8,s9
 205 | 	umull r6,r1,r2,r0
 206 | 	
 207 | 	umull r7,r12,r3,r0
 208 | 	umaal r7,r1,r2,r10
 209 | 	
 210 | 	vmov s0,s1,r6,r7
 211 | 	
 212 | 	umull r8,r6,r4,r0
 213 | 	umaal r8,r1,r3,r10
 214 | 	
 215 | 	umull r9,r7,r5,r0
 216 | 	umaal r9,r1,r4,r10
 217 | 	
 218 | 	umaal r1,r7,r5,r10
 219 | 	
 220 | 	vmov lr,r0,s10,s11
 221 | 	
 222 | 	umaal r8,r12,r2,lr
 223 | 	umaal r9,r12,r3,lr
 224 | 	umaal r1,r12,r4,lr
 225 | 	umaal r12,r7,r5,lr
 226 | 	
 227 | 	umaal r9,r6,r2,r0
 228 | 	umaal r1,r6,r3,r0
 229 | 	umaal r12,r6,r4,r0
 230 | 	umaal r6,r7,r5,r0
 231 | 	
 232 | 	vmov s2,s3,r8,r9
 233 | 	
 234 | 	vmov r10,lr,s12,s13
 235 | 	
 236 | 	mov r9,#0
 237 | 	umaal r1,r9,r2,r10
 238 | 	umaal r12,r9,r3,r10
 239 | 	umaal r6,r9,r4,r10
 240 | 	umaal r7,r9,r5,r10
 241 | 	
 242 | 	mov r10,#0
 243 | 	umaal r12,r10,r2,lr
 244 | 	umaal r6,r10,r3,lr
 245 | 	umaal r7,r10,r4,lr
 246 | 	umaal r9,r10,r5,lr
 247 | 	
 248 | 	vmov r8,s14
 249 | 	mov lr,#0
 250 | 	umaal lr,r6,r2,r8
 251 | 	umaal r7,r6,r3,r8
 252 | 	umaal r9,r6,r4,r8
 253 | 	umaal r10,r6,r5,r8
 254 | 	
 255 | 	;_ _ _ _ _ 6 10 9| 7 | lr 12 1 _ _ _ _
 256 | 	
 257 | 	vmov r8,s15
 258 | 	mov r11,#0
 259 | 	umaal r7,r11,r2,r8
 260 | 	umaal r9,r11,r3,r8
 261 | 	umaal r10,r11,r4,r8
 262 | 	umaal r6,r11,r5,r8
 263 | 	
 264 | 	;_ _ _ _ 11 6 10 9| 7 | lr 12 1 _ _ _ _
 265 | 	
 266 | 	vmov r2,s4
 267 | 	adds r2,r2,#16
 268 | 	ldm r2,{r2,r3,r4,r5}
 269 | 	
 270 | 	vmov r8,s8
 271 | 	movs r0,#0
 272 | 	umaal r1,r0,r2,r8
 273 | 	vmov s4,r1
 274 | 	umaal r12,r0,r3,r8
 275 | 	umaal lr,r0,r4,r8
 276 | 	umaal r0,r7,r5,r8 ; 7=carry for 9
 277 | 	
 278 | 	;_ _ _ _ 11 6 10 9+7| 0 | lr 12 _ _ _ _ _
 279 | 	
 280 | 	vmov r8,s9
 281 | 	movs r1,#0
 282 | 	umaal r12,r1,r2,r8
 283 | 	vmov s5,r12
 284 | 	umaal lr,r1,r3,r8
 285 | 	umaal r0,r1,r4,r8
 286 | 	umaal r1,r7,r5,r8 ; 7=carry for 10
 287 | 	
 288 | 	;_ _ _ _ 11 6 10+7 9+1| 0 | lr _ _ _ _ _ _
 289 | 	
 290 | 	vmov r8,s10
 291 | 	mov r12,#0
 292 | 	umaal lr,r12,r2,r8
 293 | 	vmov s6,lr
 294 | 	umaal r0,r12,r3,r8
 295 | 	umaal r1,r12,r4,r8
 296 | 	umaal r10,r12,r5,r8 ; 12=carry for 6
 297 | 	
 298 | 	;_ _ _ _ 11 6+12 10+7 9+1| 0 | _ _ _ _ _ _ _
 299 | 	
 300 | 	vmov r8,s11
 301 | 	mov lr,#0
 302 | 	umaal r0,lr,r2,r8
 303 | 	vmov s7,r0
 304 | 	umaal r1,lr,r3,r8
 305 | 	umaal r10,lr,r4,r8
 306 | 	umaal r6,lr,r5,r8 ; lr=carry for saved
 307 | 	
 308 | 	;_ _ _ _ 11+lr 6+12 10+7 9+1| _ | _ _ _ _ _ _ _
 309 | 	
 310 | 	vmov r0,r8,s12,s13
 311 | 	umaal r1,r9,r2,r0
 312 | 	vmov s8,r1
 313 | 	umaal r9,r10,r3,r0
 314 | 	umaal r10,r6,r4,r0
 315 | 	umaal r11,r6,r5,r0 ; 6=carry for next
 316 | 	
 317 | 	;_ _ _ 6 11+lr 10+12 9+7 _ | _ | _ _ _ _ _ _ _
 318 | 	
 319 | 	umaal r9,r7,r2,r8
 320 | 	umaal r10,r7,r3,r8
 321 | 	umaal r11,r7,r4,r8
 322 | 	umaal r6,r7,r5,r8
 323 | 	
 324 | 	vmov r0,r8,s14,s15
 325 | 	umaal r10,r12,r2,r0
 326 | 	umaal r11,r12,r3,r0
 327 | 	umaal r6,r12,r4,r0
 328 | 	umaal r7,r12,r5,r0
 329 | 	
 330 | 	umaal r11,lr,r2,r8
 331 | 	umaal lr,r6,r3,r8
 332 | 	umaal r6,r7,r4,r8
 333 | 	umaal r7,r12,r5,r8
 334 | 	
 335 | 	; 12 7 6 lr 11 10 9 s8 s7 s6 s5 s4 s3 s2 s1 s0
 336 | 	
 337 | 	;now reduce
 338 | 	vmov s13,s14,r6,r7
 339 | 	vmov s15,r12
 340 | 	
 341 | 	vmov r0,r1,s0,s1
 342 | 	vmov r2,r3,s2,s3
 343 | 	vmov r4,r5,s4,s5
 344 | 	vmov r6,r7,s6,s7
 345 | 	vmov r8,s8
 346 | 	
 347 | 	mov r12,#0
 348 | 
 349 | 	adds r3,r0
 350 | 	adcs r4,r1
 351 | 	adcs r5,r2
 352 | 	adcs r6,r0
 353 | 	adcs r7,r1
 354 | 	adcs r8,r0
 355 | 	adcs r9,r1
 356 | 	adcs r10,#0
 357 | 	adcs r11,#0
 358 | 	adcs r12,#0
 359 | 
 360 | 	adds r6,r3
 361 | 	adcs r7,r4 ; r4 instead of 0
 362 | 	adcs r8,r2
 363 | 	adcs r9,r3
 364 | 	adcs r10,r2
 365 | 	adcs r11,r3
 366 | 	adcs r12,#0
 367 | 
 368 | 	subs r7,r0
 369 | 	sbcs r8,r1
 370 | 	sbcs r9,r2
 371 | 	sbcs r10,r3
 372 | 	sbcs r11,#0
 373 | 	sbcs r12,#0 ; r12 is between 0 and 2
 374 | 
 375 | 	vmov r1,r2,s13,s14
 376 | 	vmov r3,s15
 377 | 
 378 | 	adds r0,lr,r12
 379 | 	adcs r1,#0
 380 | 	mov r12,#0
 381 | 	adcs r12,#0
 382 | 
 383 | 	;adds r7,r4 (added above instead)
 384 | 	adcs r8,r5
 385 | 	adcs r9,r6
 386 | 	adcs r10,r4
 387 | 	adcs r11,r5
 388 | 	adcs r0,r4
 389 | 	adcs r1,r5
 390 | 	adcs r2,r12
 391 | 	adcs r3,#0
 392 | 	mov r12,#0
 393 | 	adcs r12,#0
 394 | 
 395 | 	adcs r10,r7
 396 | 	adcs r11,#0
 397 | 	adcs r0,r6
 398 | 	adcs r1,r7
 399 | 	adcs r2,r6
 400 | 	adcs r3,r7
 401 | 	adcs r12,#0
 402 | 
 403 | 	subs r11,r4
 404 | 	sbcs r0,r5
 405 | 	sbcs r1,r6
 406 | 	sbcs r2,r7
 407 | 	sbcs r3,#0
 408 | 	sbcs r12,#0
 409 | 	
 410 | 	; now (T + mN) / R is
 411 | 	; 8 9 10 11 0 1 2 3 12 (lsb -> msb)
 412 | 	
 413 | 	subs r8,r8,#0xffffffff
 414 | 	sbcs r9,r9,#0xffffffff
 415 | 	sbcs r10,r10,#0xffffffff
 416 | 	sbcs r11,r11,#0
 417 | 	sbcs r4,r0,#0
 418 | 	sbcs r5,r1,#0
 419 | 	sbcs r6,r2,#1
 420 | 	sbcs r7,r3,#0xffffffff
 421 | 	sbc r12,r12,#0
 422 | 	
 423 | 	adds r0,r8,r12
 424 | 	adcs r1,r9,r12
 425 | 	adcs r2,r10,r12
 426 | 	adcs r3,r11,#0
 427 | 	adcs r4,r4,#0
 428 | 	adcs r5,r5,#0
 429 | 	adcs r6,r6,r12, lsr #31
 430 | 	adcs r7,r7,r12
 431 | 	
 432 | 	pop {pc}
 433 | 	endp
 434 | 	
 435 | #if !use_mul_for_sqr
 436 | ; If input is A*R mod p, computes A^2*R mod p
 437 | ; in/out: r0-r7
 438 | ; clobbers all other registers
 439 | P256_sqrmod proc
 440 | 	push {lr}
 441 | 	frame push {lr}
 442 | 	
 443 | 	;mul 01, 00
 444 | 	umull r9,r10,r0,r0
 445 | 	umull r11,r12,r0,r1
 446 | 	adds r11,r11,r11
 447 | 	mov lr,#0
 448 | 	umaal r10,r11,lr,lr
 449 | 	
 450 | 	;r9 r10 done
 451 | 	;r12 carry for 3rd before col
 452 | 	;r11+C carry for 3rd final col
 453 | 	
 454 | 	vmov s0,s1,r9,r10
 455 | 	
 456 | 	;mul 02, 11
 457 | 	mov r8,#0
 458 | 	umaal r8,r12,r0,r2
 459 | 	adcs r8,r8,r8
 460 | 	umaal r8,r11,r1,r1
 461 | 	
 462 | 	;r8 done (3rd col)
 463 | 	;r12 carry for 4th before col
 464 | 	;r11+C carry for 4th final col
 465 | 	
 466 | 	;mul 03, 12
 467 | 	umull r9,r10,r0,r3
 468 | 	umaal r9,r12,r1,r2
 469 | 	adcs r9,r9,r9
 470 | 	umaal r9,r11,lr,lr
 471 | 	
 472 | 	;r9 done (4th col)
 473 | 	;r10+r12 carry for 5th before col
 474 | 	;r11+C carry for 5th final col
 475 | 	
 476 | 	vmov s2,s3,r8,r9
 477 | 	
 478 | 	;mul 04, 13, 22
 479 | 	mov r9,#0
 480 | 	umaal r9,r10,r0,r4
 481 | 	umaal r9,r12,r1,r3
 482 | 	adcs r9,r9,r9
 483 | 	umaal r9,r11,r2,r2
 484 | 	
 485 | 	;r9 done (5th col)
 486 | 	;r10+r12 carry for 6th before col
 487 | 	;r11+C carry for 6th final col
 488 | 	
 489 | 	vmov s4,r9
 490 | 	
 491 | 	;mul 05, 14, 23
 492 | 	umull r9,r8,r0,r5
 493 | 	umaal r9,r10,r1,r4
 494 | 	umaal r9,r12,r2,r3
 495 | 	adcs r9,r9,r9
 496 | 	umaal r9,r11,lr,lr
 497 | 	
 498 | 	;r9 done (6th col)
 499 | 	;r10+r12+r8 carry for 7th before col
 500 | 	;r11+C carry for 7th final col
 501 | 	
 502 | 	vmov s5,r9
 503 | 	
 504 | 	;mul 06, 15, 24, 33
 505 | 	mov r9,#0
 506 | 	umaal r9,r8,r1,r5
 507 | 	umaal r9,r12,r2,r4
 508 | 	umaal r9,r10,r0,r6
 509 | 	adcs r9,r9,r9
 510 | 	umaal r9,r11,r3,r3
 511 | 	
 512 | 	;r9 done (7th col)
 513 | 	;r8+r10+r12 carry for 8th before col
 514 | 	;r11+C carry for 8th final col
 515 | 	
 516 | 	vmov s6,r9
 517 | 	
 518 | 	;mul 07, 16, 25, 34
 519 | 	umull r0,r9,r0,r7
 520 | 	umaal r0,r10,r1,r6
 521 | 	umaal r0,r12,r2,r5
 522 | 	umaal r0,r8,r3,r4
 523 | 	adcs r0,r0,r0
 524 | 	umaal r0,r11,lr,lr
 525 | 	
 526 | 	;r0 done (8th col)
 527 | 	;r9+r8+r10+r12 carry for 9th before col
 528 | 	;r11+C carry for 9th final col
 529 | 	
 530 | 	;mul 17, 26, 35, 44
 531 | 	umaal r9,r8,r1,r7 ;r1 is now dead
 532 | 	umaal r9,r10,r2,r6
 533 | 	umaal r12,r9,r3,r5
 534 | 	adcs r12,r12,r12
 535 | 	umaal r11,r12,r4,r4
 536 | 	
 537 | 	;r11 done (9th col)
 538 | 	;r8+r10+r9 carry for 10th before col
 539 | 	;r12+C carry for 10th final col
 540 | 	
 541 | 	;mul 27, 36, 45
 542 | 	umaal r9,r8,r2,r7 ;r2 is now dead
 543 | 	umaal r10,r9,r3,r6
 544 | 	movs r2,#0
 545 | 	umaal r10,r2,r4,r5
 546 | 	adcs r10,r10,r10
 547 | 	umaal r12,r10,lr,lr
 548 | 	
 549 | 	;r12 done (10th col)
 550 | 	;r8+r9+r2 carry for 11th before col
 551 | 	;r10+C carry for 11th final col
 552 | 	
 553 | 	;mul 37, 46, 55
 554 | 	umaal r2,r8,r3,r7 ;r3 is now dead
 555 | 	umaal r9,r2,r4,r6
 556 | 	adcs r9,r9,r9
 557 | 	umaal r10,r9,r5,r5
 558 | 	
 559 | 	;r10 done (11th col)
 560 | 	;r8+r2 carry for 12th before col
 561 | 	;r9+C carry for 12th final col
 562 | 	
 563 | 	;mul 47, 56
 564 | 	movs r3,#0
 565 | 	umaal r3,r8,r4,r7 ;r4 is now dead
 566 | 	umaal r3,r2,r5,r6
 567 | 	adcs r3,r3,r3
 568 | 	umaal r9,r3,lr,lr
 569 | 	
 570 | 	;r9 done (12th col)
 571 | 	;r8+r2 carry for 13th before col
 572 | 	;r3+C carry for 13th final col
 573 | 	
 574 | 	;mul 57, 66
 575 | 	umaal r8,r2,r5,r7 ;r5 is now dead
 576 | 	adcs r8,r8,r8
 577 | 	umaal r3,r8,r6,r6
 578 | 	
 579 | 	;r3 done (13th col)
 580 | 	;r2 carry for 14th before col
 581 | 	;r8+C carry for 14th final col
 582 | 	
 583 | 	;mul 67
 584 | 	umull r4,r5,lr,lr ; set 0
 585 | 	umaal r4,r2,r6,r7
 586 | 	adcs r4,r4,r4
 587 | 	umaal r4,r8,lr,lr
 588 | 	
 589 | 	;r4 done (14th col)
 590 | 	;r2 carry for 15th before col
 591 | 	;r8+C carry for 15th final col
 592 | 	
 593 | 	;mul 77
 594 | 	adcs r2,r2,r2
 595 | 	umaal r8,r2,r7,r7
 596 | 	adcs r2,r2,lr
 597 | 	
 598 | 	;r8 done (15th col)
 599 | 	;r2 done (16th col)
 600 | 	
 601 | 	;msb -> lsb: r2 r8 r4 r3 r9 r10 r12 r11 r0 s6 s5 s4 s3 s2 s1 s0
 602 | 	;lr: 0
 603 | 	;now do reduction
 604 | 	
 605 | 	vmov s13,s14,r4,r8
 606 | 	vmov s15,r2 ;s15
 607 | 	
 608 | 	vmov r1,r2,s0,s1
 609 | 	vmov r8,r7,s2,s3
 610 | 	vmov r6,r5,s4,s5
 611 | 	vmov r4,s6
 612 | 	;lr is already 0
 613 | X0 RN 1
 614 | X1 RN 2
 615 | X2 RN 8
 616 | X3 RN 7
 617 | X4 RN 6
 618 | X5 RN 5
 619 | X6 RN 4
 620 | X7 RN 0
 621 | X8 RN 11
 622 | X9 RN 12
 623 | X10 RN 10
 624 | X11 RN 9
 625 | X12 RN 3
 626 | 
 627 | X13 RN 7
 628 | X14 RN 8
 629 | X15 RN 2
 630 | 
 631 | 	adcs X3,X0
 632 | 	adcs X4,X1
 633 | 	adcs X5,X2
 634 | 	adcs X6,X0
 635 | 	adcs X7,X1
 636 | 	adcs X8,X0
 637 | 	adcs X9,X1
 638 | 	adcs X10,#0
 639 | 	adcs X11,#0
 640 | 	adcs lr,#0
 641 | 
 642 | 	adds X6,X3
 643 | 	adcs X7,X4 ; X4 instead of 0
 644 | 	adcs X8,X2
 645 | 	adcs X9,X3
 646 | 	adcs X10,X2
 647 | 	adcs X11,X3
 648 | 	adcs lr,#0
 649 | 
 650 | 	subs X7,X0
 651 | 	sbcs X8,X1
 652 | 	sbcs X9,X2
 653 | 	sbcs X10,X3
 654 | 	sbcs X11,#0
 655 | 	sbcs lr,#0 ; lr is between 0 and 2
 656 | 	
 657 | 	vmov X13,X14,s13,s14
 658 | 	vmov X15,s15
 659 | 
 660 | 	adds X0,X12,lr
 661 | 	adcs X13,#0
 662 | 	mov lr,#0
 663 | 	adcs lr,#0
 664 | 
 665 | 	;adds X7,X4 (added above instead)
 666 | 	adcs X8,X5
 667 | 	adcs X9,X6
 668 | 	adcs X10,X4
 669 | 	adcs X11,X5
 670 | 	adcs X0,X4
 671 | 	adcs X13,X5
 672 | 	adcs X14,lr
 673 | 	adcs X15,#0
 674 | 	mov lr,#0
 675 | 	adcs lr,#0
 676 | 
 677 | 	adcs X10,X7
 678 | 	adcs X11,#0
 679 | 	adcs X0,X6
 680 | 	adcs X13,X7
 681 | 	adcs X14,X6
 682 | 	adcs X15,X7
 683 | 	adcs lr,#0
 684 | 
 685 | 	subs X11,X4
 686 | 	sbcs X0,X5
 687 | 	sbcs X13,X6
 688 | 	sbcs X14,X7
 689 | 	sbcs X15,#0
 690 | 	sbcs lr,#0
 691 | 	
 692 | 	; now (T + mN) / R is
 693 | 	; X8 X9 X10 X11 X0 X13 X14 X15 lr (lsb -> msb)
 694 | 	; r11 r12 r10 r9 r1 r7 r8 r2 lr
 695 | 	
 696 | 	subs r0,r11,#0xffffffff
 697 | 	sbcs r12,r12,#0xffffffff
 698 | 	sbcs r4,r10,#0xffffffff
 699 | 	sbcs r9,r9,#0
 700 | 	sbcs r6,r1,#0
 701 | 	sbcs r5,r7,#0
 702 | 	sbcs r10,r8,#1
 703 | 	sbcs r8,r2,#0xffffffff
 704 | 	sbcs r7,lr,#0
 705 | 	
 706 | 	adds r0,r0,r7
 707 | 	adcs r1,r12,r7
 708 | 	adcs r2,r4,r7
 709 | 	adcs r3,r9,#0
 710 | 	adcs r4,r6,#0
 711 | 	adcs r5,r5,#0
 712 | 	adcs r6,r10,r7, lsr #31
 713 | 	adcs r7,r8,r7
 714 | 	
 715 | 	pop {pc}
 716 | 	endp
 717 | #endif
 718 | 	
 719 | #else
 720 | ; If inputs are A*R mod p and B*R mod p, computes AB*R mod p
 721 | ; *r1 = in1, *r2 = in2
 722 | ; out: r0-r7
 723 | ; clobbers all other registers
 724 | ; cycles: 231
 725 | P256_mulmod proc
 726 | 	push {r2,lr}
 727 | 	frame push {lr}
 728 | 	frame address sp,8
 729 | 	
 730 | 	sub sp,#28
 731 | 	frame address sp,36
 732 | 	ldm r2,{r2,r3,r4,r5}
 733 | 	
 734 | 	ldm r1!,{r0,r10,lr}
 735 | 	umull r6,r11,r2,r0
 736 | 	
 737 | 	umull r7,r12,r3,r0
 738 | 	umaal r7,r11,r2,r10
 739 | 	
 740 | 	push {r6,r7}
 741 | 	frame address sp,44
 742 | 	
 743 | 	umull r8,r6,r4,r0
 744 | 	umaal r8,r11,r3,r10
 745 | 	
 746 | 	umull r9,r7,r5,r0
 747 | 	umaal r9,r11,r4,r10
 748 | 	
 749 | 	umaal r11,r7,r5,r10
 750 | 	
 751 | 	umaal r8,r12,r2,lr
 752 | 	umaal r9,r12,r3,lr
 753 | 	umaal r11,r12,r4,lr
 754 | 	umaal r12,r7,r5,lr
 755 | 	
 756 | 	ldm r1!,{r0,r10,lr}
 757 | 	
 758 | 	umaal r9,r6,r2,r0
 759 | 	umaal r11,r6,r3,r0
 760 | 	umaal r12,r6,r4,r0
 761 | 	umaal r6,r7,r5,r0
 762 | 	
 763 | 	strd r8,r9,[sp,#8]
 764 | 	
 765 | 	mov r9,#0
 766 | 	umaal r11,r9,r2,r10
 767 | 	umaal r12,r9,r3,r10
 768 | 	umaal r6,r9,r4,r10
 769 | 	umaal r7,r9,r5,r10
 770 | 	
 771 | 	mov r10,#0
 772 | 	umaal r12,r10,r2,lr
 773 | 	umaal r6,r10,r3,lr
 774 | 	umaal r7,r10,r4,lr
 775 | 	umaal r9,r10,r5,lr
 776 | 	
 777 | 	ldr r8,[r1],#4
 778 | 	mov lr,#0
 779 | 	umaal lr,r6,r2,r8
 780 | 	umaal r7,r6,r3,r8
 781 | 	umaal r9,r6,r4,r8
 782 | 	umaal r10,r6,r5,r8
 783 | 	
 784 | 	;_ _ _ _ _ 6 10 9| 7 | lr 12 11 _ _ _ _
 785 | 	
 786 | 	ldr r8,[r1],#-28
 787 | 	mov r0,#0
 788 | 	umaal r7,r0,r2,r8
 789 | 	umaal r9,r0,r3,r8
 790 | 	umaal r10,r0,r4,r8
 791 | 	umaal r6,r0,r5,r8
 792 | 	
 793 | 	push {r0}
 794 | 	frame address sp,48
 795 | 	
 796 | 	;_ _ _ _ s 6 10 9| 7 | lr 12 11 _ _ _ _
 797 | 	
 798 | 	ldr r2,[sp,#40]
 799 | 	adds r2,r2,#16
 800 | 	ldm r2,{r2,r3,r4,r5}
 801 | 	
 802 | 	ldr r8,[r1],#4
 803 | 	mov r0,#0
 804 | 	umaal r11,r0,r2,r8
 805 | 	str r11,[sp,#16+4]
 806 | 	umaal r12,r0,r3,r8
 807 | 	umaal lr,r0,r4,r8
 808 | 	umaal r0,r7,r5,r8 ; 7=carry for 9
 809 | 	
 810 | 	;_ _ _ _ s 6 10 9+7| 0 | lr 12 _ _ _ _ _
 811 | 	
 812 | 	ldr r8,[r1],#4
 813 | 	mov r11,#0
 814 | 	umaal r12,r11,r2,r8
 815 | 	str r12,[sp,#20+4]
 816 | 	umaal lr,r11,r3,r8
 817 | 	umaal r0,r11,r4,r8
 818 | 	umaal r11,r7,r5,r8 ; 7=carry for 10
 819 | 	
 820 | 	;_ _ _ _ s 6 10+7 9+11| 0 | lr _ _ _ _ _ _
 821 | 	
 822 | 	ldr r8,[r1],#4
 823 | 	mov r12,#0
 824 | 	umaal lr,r12,r2,r8
 825 | 	str lr,[sp,#24+4]
 826 | 	umaal r0,r12,r3,r8
 827 | 	umaal r11,r12,r4,r8
 828 | 	umaal r10,r12,r5,r8 ; 12=carry for 6
 829 | 	
 830 | 	;_ _ _ _ s 6+12 10+7 9+11| 0 | _ _ _ _ _ _ _
 831 | 	
 832 | 	ldr r8,[r1],#4
 833 | 	mov lr,#0
 834 | 	umaal r0,lr,r2,r8
 835 | 	str r0,[sp,#28+4]
 836 | 	umaal r11,lr,r3,r8
 837 | 	umaal r10,lr,r4,r8
 838 | 	umaal r6,lr,r5,r8 ; lr=carry for saved
 839 | 	
 840 | 	;_ _ _ _ s+lr 6+12 10+7 9+11| _ | _ _ _ _ _ _ _
 841 | 	
 842 | 	ldm r1!,{r0,r8}
 843 | 	umaal r11,r9,r2,r0
 844 | 	str r11,[sp,#32+4]
 845 | 	umaal r9,r10,r3,r0
 846 | 	umaal r10,r6,r4,r0
 847 | 	pop {r11}
 848 | 	frame address sp,44
 849 | 	umaal r11,r6,r5,r0 ; 6=carry for next
 850 | 	
 851 | 	;_ _ _ 6 11+lr 10+12 9+7 _ | _ | _ _ _ _ _ _ _
 852 | 	
 853 | 	umaal r9,r7,r2,r8
 854 | 	umaal r10,r7,r3,r8
 855 | 	umaal r11,r7,r4,r8
 856 | 	umaal r6,r7,r5,r8
 857 | 	
 858 | 	ldm r1!,{r0,r8}
 859 | 	umaal r10,r12,r2,r0
 860 | 	umaal r11,r12,r3,r0
 861 | 	umaal r6,r12,r4,r0
 862 | 	umaal r7,r12,r5,r0
 863 | 	
 864 | 	umaal r11,lr,r2,r8
 865 | 	umaal lr,r6,r3,r8
 866 | 	umaal r6,r7,r4,r8
 867 | 	umaal r7,r12,r5,r8
 868 | 	
 869 | 	; 12 7 6 lr 11 10 9 stack*9
 870 | 	push {r6,r7,r12}
 871 | 	frame address sp,56
 872 | 	add r7,sp,#12
 873 | 	ldm r7,{r0-r8}
 874 | 	
 875 | 	mov r12,#0
 876 | 
 877 | 	adds r3,r0
 878 | 	adcs r4,r1
 879 | 	adcs r5,r2
 880 | 	adcs r6,r0
 881 | 	adcs r7,r1
 882 | 	adcs r8,r0
 883 | 	adcs r9,r1
 884 | 	adcs r10,#0
 885 | 	adcs r11,#0
 886 | 	adcs r12,#0
 887 | 
 888 | 	adds r6,r3
 889 | 	adcs r7,r4 ; r4 instead of 0
 890 | 	adcs r8,r2
 891 | 	adcs r9,r3
 892 | 	adcs r10,r2
 893 | 	adcs r11,r3
 894 | 	adcs r12,#0
 895 | 
 896 | 	subs r7,r0
 897 | 	sbcs r8,r1
 898 | 	sbcs r9,r2
 899 | 	sbcs r10,r3
 900 | 	sbcs r11,#0
 901 | 	sbcs r12,#0 ; r12 is between 0 and 2
 902 | 
 903 | 	pop {r1-r3}
 904 | 	frame address sp,44
 905 | 
 906 | 	adds r0,lr,r12
 907 | 	adcs r1,#0
 908 | 	mov r12,#0
 909 | 	adcs r12,#0
 910 | 
 911 | 	;adds r7,r4 (added above instead)
 912 | 	adcs r8,r5
 913 | 	adcs r9,r6
 914 | 	adcs r10,r4
 915 | 	adcs r11,r5
 916 | 	adcs r0,r4
 917 | 	adcs r1,r5
 918 | 	adcs r2,r12
 919 | 	adcs r3,#0
 920 | 	mov r12,#0
 921 | 	adcs r12,#0
 922 | 
 923 | 	adcs r10,r7
 924 | 	adcs r11,#0
 925 | 	adcs r0,r6
 926 | 	adcs r1,r7
 927 | 	adcs r2,r6
 928 | 	adcs r3,r7
 929 | 	adcs r12,#0
 930 | 
 931 | 	subs r11,r4
 932 | 	sbcs r0,r5
 933 | 	sbcs r1,r6
 934 | 	sbcs r2,r7
 935 | 	sbcs r3,#0
 936 | 	sbcs r12,#0
 937 | 	
 938 | 	; now (T + mN) / R is
 939 | 	; 8 9 10 11 0 1 2 3 12 (lsb -> msb)
 940 | 	
 941 | 	subs r8,r8,#0xffffffff
 942 | 	sbcs r9,r9,#0xffffffff
 943 | 	sbcs r10,r10,#0xffffffff
 944 | 	sbcs r11,r11,#0
 945 | 	sbcs r4,r0,#0
 946 | 	sbcs r5,r1,#0
 947 | 	sbcs r6,r2,#1
 948 | 	sbcs r7,r3,#0xffffffff
 949 | 	sbc r12,r12,#0
 950 | 	
 951 | 	adds r0,r8,r12
 952 | 	adcs r1,r9,r12
 953 | 	adcs r2,r10,r12
 954 | 	adcs r3,r11,#0
 955 | 	adcs r4,r4,#0
 956 | 	adcs r5,r5,#0
 957 | 	adcs r6,r6,r12, lsr #31
 958 | 	adcs r7,r7,r12
 959 | 	
 960 | 	add sp,sp,#40
 961 | 	frame address sp,4
 962 | 	
 963 | 	pop {pc}
 964 | 	
 965 | 	endp
 966 | 
 967 | #if !use_mul_for_sqr
 968 | ; 173 cycles
 969 | ; If input is A*R mod p, computes A^2*R mod p
 970 | ; in/out: r0-r7
 971 | ; clobbers all other registers
 972 | P256_sqrmod proc
 973 | 	push {lr}
 974 | 	frame push {lr}
 975 | 	
 976 | 	;mul 01, 00
 977 | 	umull r9,r10,r0,r0
 978 | 	umull r11,r12,r0,r1
 979 | 	adds r11,r11,r11
 980 | 	mov lr,#0
 981 | 	umaal r10,r11,lr,lr
 982 | 	
 983 | 	;r10 r9 done
 984 | 	;r12 carry for 3rd before col
 985 | 	;r11+C carry for 3rd final col
 986 | 	
 987 | 	push {r9,r10}
 988 | 	frame address sp,12
 989 | 	
 990 | 	;mul 02, 11
 991 | 	mov r9,#0
 992 | 	umaal r9,r12,r0,r2
 993 | 	adcs r9,r9,r9
 994 | 	umaal r9,r11,r1,r1
 995 | 	
 996 | 	;r9 done (3rd col)
 997 | 	;r12 carry for 4th before col
 998 | 	;r11+C carry for 4th final col
 999 | 	
1000 | 	push {r9}
1001 | 	frame address sp,16
1002 | 	
1003 | 	;mul 03, 12
1004 | 	umull r9,r10,r0,r3
1005 | 	umaal r9,r12,r1,r2
1006 | 	adcs r9,r9,r9
1007 | 	umaal r9,r11,lr,lr
1008 | 	
1009 | 	;r9 done (4th col)
1010 | 	;r10+r12 carry for 5th before col
1011 | 	;r11+C carry for 5th final col
1012 | 	
1013 | 	push {r9}
1014 | 	frame address sp,20
1015 | 	
1016 | 	;mul 04, 13, 22
1017 | 	mov r9,#0
1018 | 	umaal r9,r10,r0,r4
1019 | 	umaal r9,r12,r1,r3
1020 | 	adcs r9,r9,r9
1021 | 	umaal r9,r11,r2,r2
1022 | 	
1023 | 	;r9 done (5th col)
1024 | 	;r10+r12 carry for 6th before col
1025 | 	;r11+C carry for 6th final col
1026 | 	
1027 | 	push {r9}
1028 | 	frame address sp,24
1029 | 	
1030 | 	;mul 05, 14, 23
1031 | 	umull r9,r8,r0,r5
1032 | 	umaal r9,r10,r1,r4
1033 | 	umaal r9,r12,r2,r3
1034 | 	adcs r9,r9,r9
1035 | 	umaal r9,r11,lr,lr
1036 | 	
1037 | 	;r9 done (6th col)
1038 | 	;r10+r12+r8 carry for 7th before col
1039 | 	;r11+C carry for 7th final col
1040 | 	
1041 | 	push {r9}
1042 | 	frame address sp,28
1043 | 	
1044 | 	;mul 06, 15, 24, 33
1045 | 	mov r9,#0
1046 | 	umaal r9,r8,r1,r5
1047 | 	umaal r9,r12,r2,r4
1048 | 	umaal r9,r10,r0,r6
1049 | 	adcs r9,r9,r9
1050 | 	umaal r9,r11,r3,r3
1051 | 	
1052 | 	;r9 done (7th col)
1053 | 	;r8+r10+r12 carry for 8th before col
1054 | 	;r11+C carry for 8th final col
1055 | 	
1056 | 	push {r9}
1057 | 	frame address sp,32
1058 | 	
1059 | 	;mul 07, 16, 25, 34
1060 | 	umull r9,r0,r0,r7
1061 | 	umaal r9,r10,r1,r6
1062 | 	umaal r9,r12,r2,r5
1063 | 	umaal r9,r8,r3,r4
1064 | 	adcs r9,r9,r9
1065 | 	;push {r12}
1066 | 	;frame address sp,36
1067 | 	umaal r9,r11,lr,lr
1068 | 	
1069 | 	;r9 done (8th col)
1070 | 	;r0+r8+r10+r12 carry for 9th before col
1071 | 	;r11+C carry for 9th final col
1072 | 	
1073 | 	;mul 17, 26, 35, 44
1074 | 	umaal r0,r8,r1,r7 ;r1 is now dead
1075 | 	umaal r0,r10,r2,r6
1076 | 	;pop {r1}
1077 | 	;frame address sp,32
1078 | 	umaal r0,r12,r3,r5
1079 | 	adcs r0,r0,r0
1080 | 	umaal r11,r0,r4,r4
1081 | 	
1082 | 	;r11 done (9th col)
1083 | 	;r8+r10+r12 carry for 10th before col
1084 | 	;r0+C carry for 10th final col
1085 | 	
1086 | 	;mul 27, 36, 45
1087 | 	umaal r12,r8,r2,r7 ;r2 is now dead
1088 | 	umaal r12,r10,r3,r6
1089 | 	movs r2,#0
1090 | 	umaal r12,r2,r4,r5
1091 | 	adcs r1,r12,r12
1092 | 	umaal r0,r1,lr,lr
1093 | 	
1094 | 	;r0 done (10th col)
1095 | 	;r8+r10+r2 carry for 11th before col
1096 | 	;r1+C carry for 11th final col
1097 | 	
1098 | 	;mul 37, 46, 55
1099 | 	umaal r2,r8,r3,r7 ;r3 is now dead
1100 | 	umaal r2,r10,r4,r6
1101 | 	adcs r2,r2,r2
1102 | 	umaal r1,r2,r5,r5
1103 | 	
1104 | 	;r1 done (11th col)
1105 | 	;r8+r10 carry for 12th before col
1106 | 	;r2+C carry for 12th final col
1107 | 	
1108 | 	;mul 47, 56
1109 | 	movs r3,#0
1110 | 	umaal r3,r8,r4,r7 ;r4 is now dead
1111 | 	umaal r3,r10,r5,r6
1112 | 	adcs r3,r3,r3
1113 | 	umaal r2,r3,lr,lr
1114 | 	
1115 | 	;r2 done (12th col)
1116 | 	;r8+r10 carry for 13th before col
1117 | 	;r3+C carry for 13th final col
1118 | 	
1119 | 	;mul 57, 66
1120 | 	umaal r8,r10,r5,r7 ;r5 is now dead
1121 | 	adcs r8,r8,r8
1122 | 	umaal r3,r8,r6,r6
1123 | 	
1124 | 	;r3 done (13th col)
1125 | 	;r10 carry for 14th before col
1126 | 	;r8+C carry for 14th final col
1127 | 	
1128 | 	;mul 67
1129 | 	umull r4,r5,lr,lr ; set 0
1130 | 	umaal r4,r10,r6,r7
1131 | 	adcs r4,r4,r4
1132 | 	umaal r4,r8,lr,lr
1133 | 	
1134 | 	;r4 done (14th col)
1135 | 	;r10 carry for 15th before col
1136 | 	;r8+C carry for 15th final col
1137 | 	
1138 | 	;mul 77
1139 | 	adcs r10,r10,r10
1140 | 	umaal r8,r10,r7,r7
1141 | 	adcs r10,r10,lr
1142 | 	
1143 | 	;r8 done (15th col)
1144 | 	;r10 done (16th col)
1145 | 	
1146 | 	;msb -> lsb: r10 r8 r4 r3 r2 r1 r0 r11 r9 sp sp+4 sp+8 sp+12 sp+16 sp+24 sp+20
1147 | 	;now do reduction
1148 | 	
1149 | 	push {r4,r8,r10}
1150 | 	frame address sp,44
1151 | 	add r4,sp,#12
1152 | 	ldm r4,{r4-r8,r10,r12}
1153 | 	;lr is already 0
1154 | X0 RN 10
1155 | X1 RN 12
1156 | X2 RN 8
1157 | X3 RN 7
1158 | X4 RN 6
1159 | X5 RN 5
1160 | X6 RN 4
1161 | X7 RN 9
1162 | X8 RN 11
1163 | X9 RN 0
1164 | X10 RN 1
1165 | X11 RN 2
1166 | X12 RN 3
1167 | 
1168 | X13 RN 7
1169 | X14 RN 8
1170 | X15 RN 12
1171 | 
1172 | 	adcs X3,X0
1173 | 	adcs X4,X1
1174 | 	adcs X5,X2
1175 | 	adcs X6,X0
1176 | 	adcs X7,X1
1177 | 	adcs X8,X0
1178 | 	adcs X9,X1
1179 | 	adcs X10,#0
1180 | 	adcs X11,#0
1181 | 	adcs lr,#0
1182 | 
1183 | 	adds X6,X3
1184 | 	adcs X7,X4 ; X4 instead of 0
1185 | 	adcs X8,X2
1186 | 	adcs X9,X3
1187 | 	adcs X10,X2
1188 | 	adcs X11,X3
1189 | 	adcs lr,#0
1190 | 
1191 | 	subs X7,X0
1192 | 	sbcs X8,X1
1193 | 	sbcs X9,X2
1194 | 	sbcs X10,X3
1195 | 	sbcs X11,#0
1196 | 	sbcs lr,#0 ; lr is between 0 and 2
1197 | 	
1198 | 	pop {X13,X14,X15}
1199 | 	frame address sp,32
1200 | 
1201 | 	adds X0,X12,lr
1202 | 	adcs X13,#0
1203 | 	mov lr,#0
1204 | 	adcs lr,#0
1205 | 
1206 | 	;adds X7,X4 (added above instead)
1207 | 	adcs X8,X5
1208 | 	adcs X9,X6
1209 | 	adcs X10,X4
1210 | 	adcs X11,X5
1211 | 	adcs X0,X4
1212 | 	adcs X13,X5
1213 | 	adcs X14,lr
1214 | 	adcs X15,#0
1215 | 	mov lr,#0
1216 | 	adcs lr,#0
1217 | 
1218 | 	adcs X10,X7
1219 | 	adcs X11,#0
1220 | 	adcs X0,X6
1221 | 	adcs X13,X7
1222 | 	adcs X14,X6
1223 | 	adcs X15,X7
1224 | 	adcs lr,#0
1225 | 
1226 | 	subs X11,X4
1227 | 	sbcs X0,X5
1228 | 	sbcs X13,X6
1229 | 	sbcs X14,X7
1230 | 	sbcs X15,#0
1231 | 	sbcs lr,#0
1232 | 	
1233 | 	; now (T + mN) / R is
1234 | 	; X8 X9 X10 X11 X0 X13 X14 X15 lr (lsb -> msb)
1235 | 	; r11 r0 r1 r2 r10 r7 r8 r12 lr
1236 | 	
1237 | 	subs r11,r11,#0xffffffff
1238 | 	sbcs r9,r0,#0xffffffff
1239 | 	sbcs r4,r1,#0xffffffff
1240 | 	sbcs r3,r2,#0
1241 | 	sbcs r6,r10,#0
1242 | 	sbcs r5,r7,#0
1243 | 	sbcs r10,r8,#1
1244 | 	sbcs r8,r12,#0xffffffff
1245 | 	sbcs r7,lr,#0
1246 | 	
1247 | 	adds r0,r11,r7
1248 | 	adcs r1,r9,r7
1249 | 	adcs r2,r4,r7
1250 | 	adcs r3,r3,#0
1251 | 	adcs r4,r6,#0
1252 | 	adcs r5,r5,#0
1253 | 	adcs r6,r10,r7, lsr #31
1254 | 	adcs r7,r8,r7
1255 | 	
1256 | 	add sp,#28
1257 | 	frame address sp,4
1258 | 	pop {pc}
1259 | 	
1260 | 	endp
1261 | #endif
1262 | #endif
1263 | 
1264 | ; 42 cycles
1265 | ; Computes A - B mod p, assumes A, B < p
1266 | ; in: *r1, *r2
1267 | ; out: r0-r7
1268 | ; clobbers all other registers
1269 | P256_submod proc
1270 | 	ldm r1,{r3-r10}
1271 | 	ldm r2!,{r0,r1,r11,r12}
1272 | 	subs r3,r0
1273 | 	sbcs r4,r1
1274 | 	sbcs r5,r11
1275 | 	sbcs r6,r12
1276 | 	ldm r2,{r0,r1,r11,r12}
1277 | 	sbcs r7,r0
1278 | 	sbcs r8,r1
1279 | 	sbcs r9,r11
1280 | 	sbcs r10,r12
1281 | 	
1282 | 	sbcs r11,r11
1283 | 	
1284 | 	adds r0,r3,r11
1285 | 	adcs r1,r4,r11
1286 | 	adcs r2,r5,r11
1287 | 	adcs r3,r6,#0
1288 | 	adcs r4,r7,#0
1289 | 	adcs r5,r8,#0
1290 | 	adcs r6,r9,r11, lsr #31
1291 | 	adcs r7,r10,r11
1292 | 	
1293 | 	bx lr
1294 | 	
1295 | 	endp
1296 | #endif
1297 | 
1298 | #if include_p256_mult || include_p256_decompress_point
1299 | ; 52 cycles
1300 | ; Computes A + B mod p, assumes A, B < p
1301 | ; in: *r1, *r2
1302 | ; out: r0-r7
1303 | ; clobbers all other registers
1304 | P256_addmod proc
1305 | 	ldm r2,{r2-r9}
1306 | 	ldm r1!,{r0,r10,r11,r12}
1307 | 	adds r2,r0
1308 | 	adcs r3,r10
1309 | 	adcs r4,r11
1310 | 	adcs r5,r12
1311 | 	ldm r1,{r0,r1,r11,r12}
1312 | 	adcs r6,r0
1313 | 	adcs r7,r1
1314 | 	adcs r8,r11
1315 | 	adcs r9,r12
1316 | 	movs r10,#0
1317 | 	adcs r10,r10
1318 | 	
1319 | 	subs r2,#0xffffffff
1320 | 	sbcs r3,#0xffffffff
1321 | 	sbcs r4,#0xffffffff
1322 | 	sbcs r5,#0
1323 | 	sbcs r6,#0
1324 | 	sbcs r7,#0
1325 | 	sbcs r8,#1
1326 | 	sbcs r9,#0xffffffff
1327 | 	sbcs r10,#0
1328 | 	
1329 | 	adds r0,r2,r10
1330 | 	adcs r1,r3,r10
1331 | 	adcs r2,r4,r10
1332 | 	adcs r3,r5,#0
1333 | 	adcs r4,r6,#0
1334 | 	adcs r5,r7,#0
1335 | 	adcs r6,r8,r10, lsr #31
1336 | 	adcs r7,r9,r10
1337 | 	
1338 | 	bx lr
1339 | 	
1340 | 	endp
1341 | #endif
1342 | 	
1343 | #if include_p256_mult || include_p256_decompress_point
1344 | ; cycles: 19 + 181*n
1345 | P256_sqrmod_many proc
1346 | 	; in: r0-r7, count: r8
1347 | 	; out: r0-r7
1348 | 	push {r8,lr}
1349 | 	frame push {r8,lr}
1350 | 0
1351 | 	bl P256_sqrmod
1352 | 	
1353 | 	ldr r8,[sp,#0]
1354 | 	subs r8,r8,#1
1355 | 	str r8,[sp,#0]
1356 | 	bne %b0
1357 | 	
1358 | 	pop {r8,pc}
1359 | 	endp
1360 | 
1361 | ; in/out: r0-r7, r8: count, *r9: operand for final multiplication
1362 | P256_sqrmod_many_and_mulmod proc
1363 | 	push {r9,lr}
1364 | 	frame push {r9,lr}
1365 | 	bl P256_sqrmod_many
1366 | 	push {r0-r7}
1367 | 	frame address sp,40
1368 | 	mov r1,sp
1369 | 	ldr r2,[sp,#32]
1370 | 	bl P256_mulmod
1371 | 	add sp,#36
1372 | 	frame address sp,4
1373 | 	pop {pc}
1374 | 	endp
1375 | 	
1376 | 
1377 | ; in: r0-r7 = value, r8 = 0 for modinv and 1 for sqrt
1378 | ; out: r0-r7
1379 | ; for modinv, call input a, then if a = A * R % p, then it calculates A^-1 * R % p = (a/R)^-1 * R % p = R^2 / a % p
1380 | ; for sqrt, call input a, then if a = A * R % p, then it calculates sqrt(A) * R % p
1381 | P256_modinv_sqrt proc
1382 | 	push {r0-r8,lr}
1383 | 	
1384 | 	; t = a^2*a
1385 | 	mov r8,#1
1386 | 	mov r9,sp
1387 | 	bl P256_sqrmod_many_and_mulmod
1388 | 	push {r0-r7}
1389 | 	
1390 | 	; a4_2 = a2_0^(2^2)
1391 | 	bl P256_sqrmod
1392 | 	bl P256_sqrmod
1393 | 	push {r0-r7}
1394 | 	
1395 | 	; a4_0 = a4_2*a2_0
1396 | 	mov r1,sp
1397 | 	add r2,sp,#32
1398 | 	bl P256_mulmod
1399 | 	add r8,sp,#32
1400 | 	stm r8,{r0-r7}
1401 | 	
1402 | 	; a8_0 = a4_0^(2^(8-4))*a4_0
1403 | 	mov r8,#8-4
1404 | 	add r9,sp,#32
1405 | 	bl P256_sqrmod_many_and_mulmod
1406 | 	push {r0-r7}
1407 | 	
1408 | 	; a16_0 = a8_0^(2^(16-8))*a8_0
1409 | 	mov r8,#16-8
1410 | 	mov r9,sp
1411 | 	bl P256_sqrmod_many_and_mulmod
1412 | 	push {r0-r7}
1413 | 	
1414 | 	; a32_0 = a16_0^(2^(32-16))*a16_0
1415 | 	mov r8,#16
1416 | 	mov r9,sp
1417 | 	bl P256_sqrmod_many_and_mulmod
1418 | 	push {r0-r7}
1419 | 	
1420 | 	; t = a32_0^(2^(64-32))*a
1421 | 	mov r8,#32
1422 | 	add r9,sp,#5*32
1423 | 	bl P256_sqrmod_many_and_mulmod
1424 | 	
1425 | 	ldr r8,[sp,#6*32]
1426 | 	cmp r8,#0
1427 | 	bne %f0
1428 | 	
1429 | 	; t = t^(2^(192-64))*a32_0
1430 | 	mov r8,#192-64
1431 | 	mov r9,sp
1432 | 	bl P256_sqrmod_many_and_mulmod
1433 | 	
1434 | 	; t = t^(2^(224-192))*a32_0
1435 | 	mov r8,#224-192
1436 | 	mov r9,sp
1437 | 	bl P256_sqrmod_many_and_mulmod
1438 | 	
1439 | 	; t = t^(2^(240-224))*a16_0
1440 | 	mov r8,#240-224
1441 | 	add r9,sp,#32
1442 | 	bl P256_sqrmod_many_and_mulmod
1443 | 	
1444 | 	; t = t^(2^(248-240))*a8_0
1445 | 	mov r8,#248-240
1446 | 	add r9,sp,#64
1447 | 	bl P256_sqrmod_many_and_mulmod
1448 | 	
1449 | 	; t = t^(2^(252-248))*a4_0
1450 | 	mov r8,#252-248
1451 | 	add r9,sp,#128
1452 | 	bl P256_sqrmod_many_and_mulmod
1453 | 	
1454 | 	; t = t^(2^(256-252))*a4_2
1455 | 	mov r8,#256-252
1456 | 	add r9,sp,#96
1457 | 	bl P256_sqrmod_many_and_mulmod
1458 | 	stm sp,{r0-r7}
1459 | 	
1460 | 	; r = t*a
1461 | 	mov r1,sp
1462 | 	add r2,sp,#5*32
1463 | 	bl P256_mulmod
1464 | 	b %f1
1465 | 
1466 | 0
1467 | 	; t = t^(2^(160-64))*a
1468 | 	mov r8,#160-64
1469 | 	add r9,sp,#5*32
1470 | 	bl P256_sqrmod_many_and_mulmod
1471 | 	
1472 | 	; t = t^(2^(254-160))
1473 | 	mov r8,#254-160
1474 | 	bl P256_sqrmod_many
1475 | 1
1476 | 
1477 | 	add sp,#6*32+4
1478 | 	
1479 | 	pop {pc}
1480 | 	
1481 | 	endp
1482 | #endif
1483 | 
1484 | #if include_p256_mult
1485 | ; 33 cycles
1486 | ; in: r0-r7
1487 | P256_times2 proc
1488 | 	adds r0,r0
1489 | 	adcs r1,r1
1490 | 	adcs r2,r2
1491 | 	adcs r3,r3
1492 | 	adcs r4,r4
1493 | 	adcs r5,r5
1494 | 	adcs r6,r6
1495 | 	adcs r7,r7
1496 | 	mov r8,#0
1497 | 	adcs r8,r8
1498 | 	
1499 | 	subs r0,#0xffffffff
1500 | 	sbcs r1,#0xffffffff
1501 | 	sbcs r2,#0xffffffff
1502 | 	sbcs r3,#0
1503 | 	sbcs r4,#0
1504 | 	sbcs r5,#0
1505 | 	sbcs r6,#1
1506 | 	sbcs r7,#0xffffffff
1507 | 	sbcs r8,#0
1508 | 	
1509 | 	adds r0,r8
1510 | 	adcs r1,r8
1511 | 	adcs r2,r8
1512 | 	adcs r3,#0
1513 | 	adcs r4,#0
1514 | 	adcs r5,#0
1515 | 	adcs r6,r6,r8, lsr #31
1516 | 	adcs r7,r8
1517 | 	
1518 | 	bx lr
1519 | 	endp
1520 | #endif
1521 | 
1522 | #if include_p256_verify || include_p256_varmult || include_p256_decompress_point
1523 | 	align 4
1524 | 	; (2^256)^2 mod p
1525 | R2_mod_p
1526 | 	dcd 3
1527 | 	dcd 0
1528 | 	dcd 0xffffffff
1529 | 	dcd 0xfffffffb
1530 | 	dcd 0xfffffffe
1531 | 	dcd 0xffffffff
1532 | 	dcd 0xfffffffd
1533 | 	dcd 4
1534 | 
1535 | ; in: *r1
1536 | ; out: *r0
1537 | P256_to_montgomery proc
1538 | 	export P256_to_montgomery
1539 | 	push {r0,r4-r11,lr}
1540 | 	frame push {r4-r11,lr}
1541 | 	frame address sp,40
1542 | 	adr r2,R2_mod_p
1543 | 	bl P256_mulmod
1544 | 	pop {r8}
1545 | 	frame address sp,36
1546 | 	stm r8,{r0-r7}
1547 | 	pop {r4-r11,pc}
1548 | 	endp
1549 | #endif
1550 | 
1551 | #if include_p256_basemult || include_p256_varmult || include_p256_decompress_point
1552 | ; in: *r1
1553 | ; out: *r0
1554 | P256_from_montgomery proc
1555 | 	export P256_from_montgomery
1556 | 	push {r0,r4-r11,lr}
1557 | 	frame push {r4-r11,lr}
1558 | 	frame address sp,40
1559 | 	movs r2,#0
1560 | 	movs r3,#0
1561 | 	push {r2-r3}
1562 | 	frame address sp,48
1563 | 	push {r2-r3}
1564 | 	frame address sp,56
1565 | 	push {r2-r3}
1566 | 	frame address sp,64
1567 | 	movs r2,#1
1568 | 	push {r2-r3}
1569 | 	frame address sp,72
1570 | 	mov r2,sp
1571 | 	bl P256_mulmod
1572 | 	add sp,#32
1573 | 	frame address sp,40
1574 | 	pop {r8}
1575 | 	frame address sp,36
1576 | 	stm r8,{r0-r7}
1577 | 	pop {r4-r11,pc}
1578 | 	endp
1579 | #endif
1580 | 
1581 | #if include_p256_verify || include_p256_varmult || include_p256_decompress_point || include_p256_decode_point
1582 | ; Checks whether the input number is within [0,p-1]
1583 | ; in: *r0
1584 | ; out: r0 = 1 if ok, else 0
1585 | P256_check_range_p proc
1586 | 	export P256_check_range_p
1587 | 	push {r4-r8,lr}
1588 | 	frame push {r4-r8,lr}
1589 | 	
1590 | 	ldm r0,{r1-r8}
1591 | 	
1592 | 	movs r0,#0xffffffff
1593 | 	
1594 | 	subs r1,r0
1595 | 	sbcs r2,r0
1596 | 	sbcs r3,r0
1597 | 	sbcs r4,#0
1598 | 	sbcs r5,#0
1599 | 	sbcs r6,#0
1600 | 	sbcs r7,#1
1601 | 	sbcs r8,r0
1602 | 	
1603 | 	sbcs r0,r0
1604 | 	lsrs r0,#31
1605 | 	
1606 | 	pop {r4-r8,pc}
1607 | 	
1608 | 	endp
1609 | #endif
1610 | 
1611 | 
1612 | ; Arithmetics for the group order n =
1613 | ; 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551
1614 | 
1615 | #if include_p256_verify || include_p256_sign
1616 | 	align 4
1617 | P256_order_mu
1618 | 	dcd 0xeedf9bfe
1619 | 	dcd 0x012ffd85
1620 | 	dcd 0xdf1a6c21
1621 | 	dcd 0x43190552
1622 | 	dcd 0xffffffff
1623 | 	dcd 0xfffffffe
1624 | 	dcd 0xffffffff
1625 | 	dcd 0x0
1626 | 	dcd 0x1
1627 | 
1628 | ; in: r0-r8 = value
1629 | ; out: r0-r8
1630 | ; returns input - n if input >= n, else input
1631 | ; clobbers all other registers
1632 | P256_reduce_mod_n_once proc
1633 | 	push {lr}
1634 | 	frame push {lr}
1635 | 	
1636 | 	adr r10,P256_order
1637 | 	ldm r10,{r10,r11,r12,lr}
1638 | 	subs r0,r10
1639 | 	sbcs r1,r11
1640 | 	sbcs r2,r12
1641 | 	sbcs r3,lr
1642 | 	sbcs r4,#0xffffffff
1643 | 	sbcs r5,#0xffffffff
1644 | 	sbcs r6,#0
1645 | 	sbcs r7,#0xffffffff
1646 | 	sbcs r8,#0
1647 | 	
1648 | 	sbc r9,r9 ; sets r9 to -1 if input < n, else 0
1649 | 	and r10,r9
1650 | 	and r11,r9
1651 | 	and r12,r9
1652 | 	and lr,r9
1653 | 	
1654 | 	adds r0,r10
1655 | 	adcs r1,r11
1656 | 	adcs r2,r12
1657 | 	adcs r3,lr
1658 | 	adcs r4,r9
1659 | 	adcs r5,r9
1660 | 	adcs r6,#0
1661 | 	adcs r7,r9
1662 | 	adcs r8,#0
1663 | 	
1664 | 	pop {pc}
1665 | 	endp
1666 | 
1667 | ; *r0 = out, *r1 = in
1668 | ; uses Barrett Reduction
1669 | P256_reduce_mod_n_64bytes proc
1670 | 	push {r0,r4-r11,lr}
1671 | 	frame push {r4-r11,lr}
1672 | 	sub sp,sp,#108
1673 | 	frame address sp,148
1674 | 	
1675 | 	mov r10,r1
1676 | 	
1677 | 	add r0,sp,#36
1678 | 	adds r1,r1,#28
1679 | 	adr r2,P256_order_mu
1680 | 	bl mul288x288
1681 | 	
1682 | 	mov r0,sp
1683 | 	add r1,sp,#72
1684 | 	adr r2,P256_order
1685 | 	bl mul288x288
1686 | 	
1687 | 	ldm r10,{r0-r8}
1688 | 	pop {r9-r12}
1689 | 	frame address sp,132
1690 | 	subs r0,r0,r9
1691 | 	sbcs r1,r1,r10
1692 | 	sbcs r2,r2,r11
1693 | 	sbcs r3,r3,r12
1694 | 	pop {r9-r12,lr}
1695 | 	frame address sp,112
1696 | 	sbcs r4,r4,r9
1697 | 	sbcs r5,r5,r10
1698 | 	sbcs r6,r6,r11
1699 | 	sbcs r7,r7,r12
1700 | 	sbcs r8,r8,lr
1701 | 	
1702 | 	bl P256_reduce_mod_n_once
1703 | 	bl P256_reduce_mod_n_once
1704 | 	add sp,sp,#72
1705 | 	frame address sp,40
1706 | 	pop {r9}
1707 | 	frame address sp,36
1708 | 	
1709 | 	stm r9,{r0-r7}
1710 | 	
1711 | 	pop {r4-r11,pc}
1712 | 	endp
1713 | #endif
1714 | 
1715 | #if include_p256_sign
1716 | ; in: *r0 = out, *r1 = in
1717 | P256_reduce_mod_n_32bytes proc
1718 | 	export P256_reduce_mod_n_32bytes
1719 | 	push {r0,r4-r11,lr}
1720 | 	frame push {r4-r11,lr}
1721 | 	frame address sp,40
1722 | 	ldm r1,{r0-r7}
1723 | 	mov r8,#0
1724 | 	bl P256_reduce_mod_n_once
1725 | 	pop {r8}
1726 | 	frame address sp,36
1727 | 	stm r8,{r0-r7}
1728 | 	pop {r4-r11,pc}
1729 | 	endp
1730 | 
1731 | 
1732 | ; Adds two numbers mod n, both inputs can be any 256-bit numbers
1733 | ; out and in may overlap
1734 | ; in: *r1, *r2
1735 | ; out: *r0
1736 | P256_add_mod_n proc
1737 | 	export P256_add_mod_n
1738 | 	push {r0,r4-r11,lr}
1739 | 	frame push {r4-r11,lr}
1740 | 	frame address sp,40
1741 | 	
1742 | 	mov r12,r1
1743 | 	
1744 | 	ldm r2,{r4-r11}
1745 | 	ldm r12!,{r0-r3}
1746 | 	adds r0,r4
1747 | 	adcs r1,r5
1748 | 	adcs r2,r6
1749 | 	adcs r3,r7
1750 | 	ldm r12,{r4-r7}
1751 | 	adcs r4,r8
1752 | 	adcs r5,r9
1753 | 	adcs r6,r10
1754 | 	adcs r7,r11
1755 | 	movs r8,#0
1756 | 	adcs r8,r8
1757 | 	
1758 | 	bl P256_reduce_mod_n_once
1759 | 	bl P256_reduce_mod_n_once
1760 | 	pop {r8}
1761 | 	frame address sp,36
1762 | 	stm r8,{r0-r7}
1763 | 	
1764 | 	pop {r4-r11,pc}
1765 | 	
1766 | 	endp
1767 | #endif
1768 | 	
1769 | #if include_p256_verify || include_p256_sign
1770 | ; Multiplies two numbers in the range [0,2^256-1] mod n
1771 | ; out and in may overlap
1772 | ; in: *r1, *r2
1773 | ; out: *r0
1774 | P256_mul_mod_n proc
1775 | 	export P256_mul_mod_n
1776 | 	movs r3,#0
1777 | 	push {r3-r10,lr}
1778 | 	frame push {r4-r10,lr}
1779 | 	frame address sp,36
1780 | 	
1781 | 	mov r4,r0
1782 | 	
1783 | 	ldm r1,{r1,r3,r5-r10}
1784 | 	push {r1,r3,r5-r10}
1785 | 	frame address sp,68
1786 | 	
1787 | 	movs r1,#0
1788 | 	push {r1}
1789 | 	frame address sp,72
1790 | 	ldm r2,{r1,r3,r5-r10}
1791 | 	push {r1,r3,r5-r10}
1792 | 	frame address sp,104
1793 | 	
1794 | 	sub sp,#72
1795 | 	frame address sp,176
1796 | 	
1797 | 	mov r0,sp
1798 | 	add r1,sp,#72
1799 | 	add r2,sp,#108
1800 | 	bl mul288x288 ; just reuse the 288x288-bit multiplier rather than also writing a 256x256
1801 | 	
1802 | 	mov r0,r4
1803 | 	mov r1,sp
1804 | 	bl P256_reduce_mod_n_64bytes
1805 | 	
1806 | 	add sp,#144
1807 | 	frame address sp,32
1808 | 	pop {r4-r10,pc}
1809 | 	
1810 | 	endp
1811 | 
1812 | #if include_p256_sign
1813 | ; r0: delta (also returned)
1814 | ; r1: f
1815 | ; r2: g
1816 | ; r3: dest
1817 | P256_divsteps2_31 proc
1818 | 	export P256_divsteps2_31
1819 | 	push {r3,r4-r8,lr}
1820 | 	frame push {r4-r8,lr}
1821 | 	frame address sp,28
1822 | 	
1823 | 	; u,v,q,r
1824 | 	movs r4,#1
1825 | 	movs r5,#0
1826 | 	movs r6,#0
1827 | 	movs r7,#1
1828 | 	
1829 | 	; counter
1830 | 	mov lr,#31
1831 | 	
1832 | 0
1833 | 	subs r3,r0,#1
1834 | 	lsl r12,r2,#31
1835 | 	bic r3,r12,r3
1836 | 	asrs r3,r3,#31 ; mask
1837 | 	lsr r8,r3,#31 ; b
1838 | 	
1839 | 	; conditionally negate delta
1840 | 	eors r0,r0,r3
1841 | 	subs r0,r0,r3
1842 | 	
1843 | 	mul r12,r1,r3 ; t = f * -b (= f * m)
1844 | 	bics r1,r1,r3 ; f &= ~m
1845 | 	umlal r1,r12,r2,r8 ; f += g * b
1846 | 	umaal r2,r12,r2,r3 ; g += t + g * -b (= g * m)
1847 | 	
1848 | 	mul r12,r4,r3
1849 | 	bics r4,r4,r3
1850 | 	umlal r4,r12,r6,r8
1851 | 	umaal r6,r12,r6,r3
1852 | 	
1853 | 	mul r12,r5,r3
1854 | 	bics r5,r5,r3
1855 | 	umlal r5,r12,r7,r8
1856 | 	umaal r7,r12,r7,r3
1857 | 	
1858 | 	ands r12,r2,#1 ; g0 = g & 1
1859 | 	adds r0,r0,#1 ; delta += 1
1860 | 	
1861 | 	; g = (g + g0 * f) / 2
1862 | 	mul r3,r12,r1
1863 | 	adds r2,r2,r3
1864 | 	lsrs r2,r2,#1 ; we don't need the MSB
1865 | 	
1866 | 	umlal r6,r8,r12,r4 ; q += g0 * u
1867 | 	umlal r7,r8,r12,r5 ; r += g0 * v
1868 | 	
1869 | 	adds r4,r4,r4 ; u *= 2
1870 | 	adds r5,r5,r5 ; v *= 2
1871 | 	
1872 | 	subs lr,lr,#1
1873 | 	bne %b0
1874 | 	
1875 | 	pop {r3}
1876 | 	stm r3!,{r4-r7}
1877 | 	
1878 | 	pop {r4-r8,pc}
1879 | 	endp
1880 | 
1881 | ; r0: a, r1: b
1882 | ; *r2: f,g
1883 | ; *r3: out
1884 | ; cycles: 132
1885 | P256_matrix_mul_fg_9 proc
1886 | 	export P256_matrix_mul_fg_9
1887 | 	push {r4-r11,lr}
1888 | 	frame push {r4-r11,lr}
1889 | 	
1890 | 	; this function calculates (a * f + b * g) / 2^31, which shall be an integer
1891 | 	
1892 | 	; the range is [-2^30, 2^31], so if negative, the top 2 bits are both 1s
1893 | 	; convert to absolute value and sign
1894 | 	and r4,r0,r0,lsl #1
1895 | 	asrs r4,r4,#31
1896 | 	eors r0,r0,r4
1897 | 	subs r0,r0,r4
1898 | 	
1899 | 	and r5,r1,r1,lsl #1
1900 | 	asrs r5,r5,#31
1901 | 	eors r1,r1,r5
1902 | 	subs r1,r1,r5
1903 | 	
1904 | 	ldm r2!,{r6} ; f sign
1905 | 	ldr r7,[r2,#36] ; g sign
1906 | 	
1907 | 	; compute the resulting sign, which will be negative if exactly one of g'sign and b's sign is negative
1908 | 	eors r4,r4,r6 ; combine f's sign and a's sign
1909 | 	eors r5,r5,r7 ; combine g's sign and b's sign
1910 | 	eors r4,r4,r5 ; mask for negating a * f before adding to b * g
1911 | 	stm r3!,{r5}
1912 | 	push {r1,r2,r3}
1913 | 	frame address sp,48
1914 | 	
1915 | 	; load f, which is stored as a signed 257-bit number (sign extended to 288 bits) and initially conditionally negated through r6
1916 | 	; now conditionally negate it depending on the r4 mask
1917 | 	ldm r2!,{r1,r3,r5-r11}
1918 | 	eors r1,r1,r4
1919 | 	eors r3,r3,r4
1920 | 	eors r5,r5,r4
1921 | 	eors r6,r6,r4
1922 | 	eors r7,r7,r4
1923 | 	eor r8,r8,r4
1924 | 	eor r9,r9,r4
1925 | 	eor r10,r10,r4
1926 | 	
1927 | 	subs r1,r1,r4
1928 | 	sbcs r3,r3,r4
1929 | 	sbcs r5,r5,r4
1930 | 	sbcs r6,r6,r4
1931 | 	sbcs r7,r7,r4
1932 | 	sbcs r8,r8,r4
1933 | 	sbcs r9,r9,r4
1934 | 	sbcs r10,r10,r4
1935 | 	; f is never 0, so we can skip last sbcs (for r11), since we know carry flag would be 0
1936 | 	eor r4,r4,r11
1937 | 	
1938 | 	; multiply the signed 257-bit value by |a| (|a| <= 2^31), to get a signed 288-bit result
1939 | 	umull r1,lr,r0,r1
1940 | 	movs r2,#0
1941 | 	umull r11,r12,r2,r2
1942 | 	umaal r2,lr,r0,r3
1943 | 	umaal r11,lr,r0,r5
1944 | 	umull r3,r5,r12,r12
1945 | 	umaal r3,lr,r0,r6
1946 | 	umaal r5,lr,r0,r7
1947 | 	umull r6,r7,r12,r12
1948 | 	umaal r6,lr,r0,r8
1949 | 	umaal r7,lr,r0,r9
1950 | 	umaal r12,lr,r0,r10
1951 | 	mla lr,r0,r4,lr
1952 | 	; result: r1, r2, r11, r3, r5, r6, r7, r12, lr
1953 | 	
1954 | 	; add b*g (which also fits in a signed 288-bit value) and divide by 2^31 (the low 31 bits will all be zero before div)
1955 | 	pop {r0,r4}
1956 | 	frame address sp,40
1957 | 	adds r4,r4,#40
1958 | 	ldm r4!,{r8,r9}
1959 | 	mov r10,#0
1960 | 	umaal r1,r10,r0,r8
1961 | 	umaal r2,r10,r0,r9
1962 | 	adds r1,r1,r1
1963 | 	adcs r2,r2,r2
1964 | 	ldm r4!,{r1,r8,r9}
1965 | 	umaal r10,r11,r0,r1
1966 | 	umaal r11,r3,r0,r8
1967 | 	umaal r3,r5,r0,r9
1968 | 	adcs r10,r10,r10
1969 | 	adcs r11,r11,r11
1970 | 	adcs r3,r3,r3
1971 | 	ldm r4,{r1,r4,r8,r9}
1972 | 	umaal r5,r6,r0,r1
1973 | 	umaal r6,r7,r0,r4
1974 | 	umaal r7,r12,r0,r8
1975 | 	umaal r12,lr,r0,r9 ; by divsteps2 invariant, lr will now be 0 since both f and g each fits in a signed 257-bit value
1976 | 	adcs r5,r5,r5
1977 | 	adcs r6,r6,r6
1978 | 	adcs r7,r7,r7
1979 | 	adcs r12,r12,r12
1980 | 	sbcs lr,lr,lr ; extract the sign bit and sign-extend it
1981 | 	mvn lr,lr
1982 | 	pop {r1}
1983 | 	frame address sp,36
1984 | 	stm r1!,{r2,r10,r11}
1985 | 	stm r1!,{r3,r5,r6,r7,r12,lr}
1986 | 	
1987 | 	pop {r4-r11,pc}
1988 | 	endp
1989 | 
1990 | ; r0: a, r1: b
1991 | ; *r2: x,y
1992 | ; *r3: out
1993 | ; cycles: 184
1994 | 	align 4
1995 | P256_matrix_mul_mod_n proc
1996 | 	export P256_matrix_mul_mod_n
1997 | 	push {r4-r11,lr}
1998 | 	frame push {r4-r11,lr}
1999 | 	
2000 | 	; this function calculates a * x + b * y mod N (where N is the order of the P-256 curve)
2001 | 	
2002 | 	; the range is [-2^30, 2^31], so if negative, the top 2 bits are both 1s
2003 | 	; convert to absolute value and sign
2004 | 	and r4,r0,r0,lsl #1
2005 | 	asrs r4,r4,#31
2006 | 	eors r0,r0,r4
2007 | 	subs r0,r0,r4
2008 | 	
2009 | 	and r5,r1,r1,lsl #1
2010 | 	asrs r5,r5,#31
2011 | 	eors r1,r1,r5
2012 | 	subs r1,r1,r5
2013 | 	
2014 | 	ldm r2!,{r6} ; x sign
2015 | 	ldr r7,[r2,#32] ; y sign
2016 | 	
2017 | 	; compute the resulting sign, which will be negative if exactly one of x'sign and y's sign is negative
2018 | 	eors r4,r4,r6 ; combine x's sign and a's sign
2019 | 	eors r5,r5,r7 ; combine y's sign and b's sign
2020 | 	eors r4,r4,r5 ; mask for negating a * x before adding to b * y
2021 | 	stm r3!,{r5}
2022 | 	push {r1,r2,r3}
2023 | 	frame address sp,48
2024 | 	
2025 | 	; load x, which is stored as an unsigned 256-bit integer and initially conditionally negated through r6
2026 | 	; now conditionally negate it depending on the r4 mask
2027 | 	ldm r2,{r1-r3,r5-r9}
2028 | 	eors r1,r1,r4
2029 | 	eors r2,r2,r4
2030 | 	eors r3,r3,r4
2031 | 	eors r5,r5,r4
2032 | 	eors r6,r6,r4
2033 | 	eors r7,r7,r4
2034 | 	eor r8,r8,r4
2035 | 	eor r9,r9,r4
2036 | 	
2037 | 	subs r1,r1,r4
2038 | 	sbcs r2,r2,r4
2039 | 	sbcs r3,r3,r4
2040 | 	sbcs r5,r5,r4
2041 | 	sbcs r6,r6,r4
2042 | 	sbcs r7,r7,r4
2043 | 	sbcs r8,r8,r4
2044 | 	sbcs r9,r9,r4
2045 | 	
2046 | 	sbcs r4,r4,r4 ; if the value is nonzero, r4 will now contain -1 and we will add N to make it positive
2047 | 	
2048 | 	lsrs lr,r4,#31
2049 | 	mov r12,#0
2050 | 	ldrd r10,r11,P256_order_local
2051 | 	umaal r1,r12,lr,r10
2052 | 	umaal r2,r12,lr,r11
2053 | 	ldrd r10,r11,P256_order_local+8
2054 | 	umaal r3,r12,lr,r10
2055 | 	umaal r5,r12,lr,r11
2056 | 	umaal r6,r12,lr,r4
2057 | 	umaal r7,r12,lr,r4
2058 | 	mov r10,#0
2059 | 	umaal r8,r12,lr,r10
2060 | 	umaal r9,r12,lr,r4
2061 | 	
2062 | 	; calculate a * x, the result fits in 287 bits
2063 | 	umull r11,lr,r10,r10
2064 | 	umull r10,lr,r0,r1
2065 | 	umull r1,r12,r11,r11
2066 | 	umaal r11,lr,r0,r2
2067 | 	umaal r1,lr,r0,r3
2068 | 	umull r2,r3,r12,r12
2069 | 	umaal r2,lr,r0,r5
2070 | 	umaal r3,lr,r0,r6
2071 | 	umull r4,r5,r12,r12
2072 | 	umaal r4,lr,r0,r7
2073 | 	umaal r5,lr,r0,r8
2074 | 	umaal r12,lr,r0,r9
2075 | 	
2076 | 	; add b*y, the result will fit in 288 bits
2077 | 	pop {r0,r6}
2078 | 	frame address sp,40
2079 | 	adds r6,r6,#36
2080 | 	ldm r6!,{r8,r9}
2081 | 	movs r7,#0
2082 | 	umaal r10,r7,r0,r8
2083 | 	umaal r11,r7,r0,r9
2084 | 	ldm r6!,{r8,r9}
2085 | 	umaal r1,r7,r0,r8
2086 | 	umaal r2,r7,r0,r9
2087 | 	ldm r6!,{r8,r9}
2088 | 	umaal r3,r7,r0,r8
2089 | 	umaal r4,r7,r0,r9
2090 | 	ldm r6!,{r8,r9}
2091 | 	umaal r5,r7,r0,r8
2092 | 	umaal r12,r7,r0,r9
2093 | 	add lr,lr,r7
2094 | 	
2095 | 	; reduce modulo N using montgomery redc algorithm
2096 | 	ldr r0,=0xee00bc4f ; montgomery multiplication factor N' (when R = 2^32), N*N' = -1 mod R
2097 | 	mul r0,r10,r0 ; m = ((T mod R)N') mod R
2098 | 	movs r6,#0				; need 4-byte alignment on next instruction
2099 | 	ldrd r8,r9,P256_order_local
2100 | 	umaal r10,r6,r0,r8 ; t = (T + mN) / R
2101 | 	umaal r11,r6,r0,r9
2102 | 	subs r11,r11,r8 ; conditionally subtract by N unless we later find out the result becomes negative
2103 | 	ldrd r8,r10,P256_order_local+8
2104 | 	umaal r1,r6,r0,r8
2105 | 	sbcs r1,r1,r9
2106 | 	umaal r2,r6,r0,r10
2107 | 	mov r9,#-1
2108 | 	umaal r3,r6,r0,r9
2109 | 	umaal r4,r6,r0,r9
2110 | 	movs r7,#0
2111 | 	umaal r5,r6,r0,r7
2112 | 	umaal r12,r6,r0,r9
2113 | 	umaal lr,r6,r7,r7
2114 | 	sbcs r2,r2,r8
2115 | 	sbcs r3,r3,r10
2116 | 	sbcs r4,r4,r9
2117 | 	sbcs r5,r5,r9
2118 | 	sbcs r12,r12,r7
2119 | 	sbcs lr,lr,r9
2120 | 	sbcs r6,r6,r7 ; if the result becomes negative, r6 becomes -1
2121 | 	
2122 | 	; conditionally add back N
2123 | 	ldrd r0,r9,P256_order_local
2124 | 	lsrs r6,r6,#31
2125 | 	umaal r7,r11,r6,r0
2126 | 	umaal r1,r11,r6,r9
2127 | 	umaal r2,r11,r6,r8
2128 | 	umaal r3,r11,r6,r10
2129 | 	rsbs r0,r6,#0
2130 | 	umaal r4,r11,r6,r0
2131 | 	umaal r5,r11,r6,r0
2132 | 	mov r8,#0
2133 | 	umaal r11,r12,r6,r8
2134 | 	umaal r12,lr,r6,r0
2135 | 	
2136 | 	pop {r6}
2137 | 	frame address sp,36
2138 | 	stm r6!,{r7}
2139 | 	stm r6!,{r1,r2,r3,r4,r5,r11,r12}
2140 | 	
2141 | 	pop {r4-r11,pc}
2142 | 	
2143 | 	ltorg
2144 | 	endp
2145 | #else
2146 | ; *r0=u
2147 | ; *r1=x1
2148 | mod_inv_vartime_inner_n proc
2149 | 	adr r11,P256_order
2150 | 	ldm r0,{r2-r9}
2151 | 	cmp r2,#1
2152 | 	bne %f1
2153 | 
2154 | 	orrs r10,r3,r4
2155 | 	orrs r10,r5
2156 | 	orrs r10,r6
2157 | 	orrs r10,r7
2158 | 	orrs r10,r8
2159 | 	orrs r10,r9
2160 | 	itt eq
2161 | 	moveq r0,#1
2162 | 	bxeq lr
2163 | 
2164 | 1
2165 | 	tst r2,#1
2166 | 	itt ne
2167 | 	movne r0,#0
2168 | 	bxne lr
2169 | 2
2170 | 	lsrs r9,#1
2171 | 	rrxs r8,r8
2172 | 	rrxs r7,r7
2173 | 	rrxs r6,r6
2174 | 	rrxs r5,r5
2175 | 	rrxs r4,r4
2176 | 	rrxs r3,r3
2177 | 	rrxs r2,r2
2178 | 	stm r0,{r2-r9}
2179 | 	ldm r1,{r3-r10}
2180 | 	tst r3,#1
2181 | 	beq %f3
2182 | 	ldr r12,[r11,#0]
2183 | 	adds r3,r12
2184 | 	ldr r12,[r11,#4]
2185 | 	adcs r4,r12
2186 | 	ldr r12,[r11,#8]
2187 | 	adcs r5,r12
2188 | 	ldr r12,[r11,#12]
2189 | 	adcs r6,r12
2190 | 	adcs r7,#0xffffffff
2191 | 	adcs r8,#0xffffffff
2192 | 	adcs r9,#0
2193 | 	adcs r10,#0xffffffff
2194 | 3
2195 | 	rrxs r10,r10
2196 | 	rrxs r9,r9
2197 | 	rrxs r8,r8
2198 | 	rrxs r7,r7
2199 | 	rrxs r6,r6
2200 | 	rrxs r5,r5
2201 | 	rrxs r4,r4
2202 | 	rrx r3,r3
2203 | 	stm r1,{r3-r10}
2204 | 	tst r2,#1
2205 | 	itt ne
2206 | 	movne r0,#0
2207 | 	bxne lr
2208 | 	ldm r0,{r2-r9}
2209 | 	b %b2
2210 | 	
2211 | 	endp
2212 | 
2213 | ; *r0 = result
2214 | ; *r1 = input
2215 | P256_mod_n_inv_vartime proc
2216 | 	export P256_mod_n_inv_vartime
2217 | 	push {r0,r4-r11,lr}
2218 | 	frame push {r4-r11,lr}
2219 | 	frame address sp,40
2220 | 	sub sp,#128
2221 | 	frame address sp,168
2222 | 	mov r0,sp
2223 | 	
2224 | 	; stack: u x1 v x2
2225 | 	; init: u=*r1, v=p, x1=1, x2=0
2226 | 
2227 | 	ldm r1,{r1-r8}
2228 | 	stm r0!,{r1-r8}
2229 | 
2230 | 	movs r1,#1
2231 | 	movs r2,#0
2232 | 	umull r3,r4,r2,r2
2233 | 	umull r5,r6,r2,r2
2234 | 	umull r7,r8,r2,r2
2235 | 	mov r9,#0
2236 | 
2237 | 	stm r0,{r1-r8}
2238 | 	add r0,sp,#96
2239 | 	stm r0,{r2-r9}
2240 | 	adr r2,P256_order
2241 | 	ldm r2,{r2-r9}
2242 | 	add r0,sp,#64
2243 | 	stm r0,{r2-r9}
2244 | 
2245 | 0
2246 | 	mov r0,sp
2247 | 	add r1,sp,#32
2248 | 	bl mod_inv_vartime_inner_n
2249 | 	cmp r0,#0
2250 | 	it ne
2251 | 	addne r0,sp,#32
2252 | 	bne %f2
2253 | 
2254 | 	add r0,sp,#64
2255 | 	add r1,sp,#96
2256 | 	bl mod_inv_vartime_inner_n
2257 | 	cmp r0,#0
2258 | 	it ne
2259 | 	addne r0,sp,#96
2260 | 	bne %f2
2261 | 
2262 | 	ldm sp,{r0-r7}
2263 | 	add lr,sp,#64
2264 | 	ldm lr!,{r8-r11}
2265 | 	subs r0,r8
2266 | 	sbcs r1,r9
2267 | 	sbcs r2,r10
2268 | 	sbcs r3,r11
2269 | 	ldm lr!,{r8-r11}
2270 | 	sbcs r4,r8
2271 | 	sbcs r5,r9
2272 | 	sbcs r6,r10
2273 | 	sbcs r7,r11
2274 | 
2275 | 	bcc %f1
2276 | 	stm sp,{r0-r7}
2277 | 	add r0,sp,#32
2278 | 	add r1,sp,#32
2279 | 	add r2,sp,#96
2280 | 3
2281 | 	; submod here
2282 | 	ldm r1,{r1,r3-r9}
2283 | 	ldm r2!,{r10,r11,r12,lr}
2284 | 	subs r1,r10
2285 | 	sbcs r3,r11
2286 | 	sbcs r4,r12
2287 | 	sbcs r5,lr
2288 | 	ldm r2!,{r10,r11,r12,lr}
2289 | 	sbcs r6,r10
2290 | 	sbcs r7,r11
2291 | 	sbcs r8,r12
2292 | 	sbcs r9,lr
2293 | 	
2294 | 	sbcs r10,r10,r10
2295 | 	adr r11,P256_order
2296 | 	ldm r11,{r2,r11,r12,lr}
2297 | 	and r2,r10
2298 | 	and r11,r10
2299 | 	and r12,r10
2300 | 	and lr,r10
2301 | 	adds r1,r2
2302 | 	adcs r3,r11
2303 | 	adcs r4,r12
2304 | 	adcs r5,lr
2305 | 	adcs r6,r10
2306 | 	adcs r7,r10
2307 | 	adcs r8,#0
2308 | 	adcs r9,r10
2309 | 	stm r0,{r1,r3-r9}
2310 | 	b %b0
2311 | 1
2312 | 	movs r8,#0
2313 | 	subs r0,r8,r0
2314 | 	sbcs r1,r8,r1
2315 | 	sbcs r2,r8,r2
2316 | 	sbcs r3,r8,r3
2317 | 	sbcs r4,r8,r4
2318 | 	sbcs r5,r8,r5
2319 | 	sbcs r6,r8,r6
2320 | 	sbcs r7,r8,r7
2321 | 	add r8,sp,#64
2322 | 	stm r8,{r0-r7}
2323 | 	add r0,sp,#96
2324 | 	add r1,sp,#96
2325 | 	add r2,sp,#32
2326 | 	b %b3
2327 | 
2328 | 2
2329 | 	ldm r0,{r0-r7}
2330 | 	add sp,#128
2331 | 	frame address sp,40
2332 | 	pop {r8}
2333 | 	frame address sp,36
2334 | 	stm r8,{r0-r7}
2335 | 	pop {r4-r11,pc}
2336 | 	
2337 | 	endp
2338 | #endif
2339 | #endif
2340 | 
2341 | #if include_p256_mult
2342 | 	align 4
2343 | P256_order_local ;label definition (arm clang assembler is broken for ldrd global labels defined in the same file)
2344 | P256_order
2345 | 	export P256_order
2346 | 	dcd 0xFC632551
2347 | 	dcd 0xF3B9CAC2
2348 | 	dcd 0xA7179E84
2349 | 	dcd 0xBCE6FAAD
2350 | 	dcd 0xFFFFFFFF
2351 | 	dcd 0xFFFFFFFF
2352 | 	dcd 0
2353 | 	dcd 0xFFFFFFFF
2354 | 	dcd 0
2355 | 	; end P256_order
2356 | #endif
2357 | 
2358 | #if include_p256_verify || include_p256_basemult || include_p256_raw_scalarmult_generic
2359 | ; Checks whether the input number is within [1,n-1]
2360 | ; in: *r0
2361 | ; out: r0 = 1 if ok, else 0
2362 | P256_check_range_n proc
2363 | 	export P256_check_range_n
2364 | 	push {r4-r11,lr}
2365 | 	frame push {r4-r11,lr}
2366 | 	ldm r0,{r1-r8}
2367 | 	orrs r0,r1,r2
2368 | 	orrs r0,r3
2369 | 	orrs r0,r4
2370 | 	orrs r0,r5
2371 | 	orrs r0,r6
2372 | 	orrs r0,r7
2373 | 	orrs r0,r8
2374 | 	beq %f0
2375 | 	
2376 | 	adr r0,P256_order
2377 | 	ldm r0!,{r9-r12}
2378 | 	subs r1,r9
2379 | 	sbcs r2,r10
2380 | 	sbcs r3,r11
2381 | 	sbcs r4,r12
2382 | 	ldm r0,{r0-r3}
2383 | 	sbcs r5,r0
2384 | 	sbcs r6,r1
2385 | 	sbcs r7,r2
2386 | 	sbcs r8,r3
2387 | 	
2388 | 	sbcs r0,r0
2389 | 	lsrs r0,#31
2390 | 0
2391 | 	pop {r4-r11,pc}
2392 | 	
2393 | 	endp
2394 | #endif
2395 | 	
2396 | 
2397 | ; Elliptic curve operations on the NIST curve P-256
2398 | 
2399 | #if include_p256_verify || include_p256_varmult || include_p256_decompress_point || include_p256_decode_point
2400 | 	align 4
2401 | b_mont
2402 | 	dcd 0x29c4bddf
2403 | 	dcd 0xd89cdf62
2404 | 	dcd 0x78843090
2405 | 	dcd 0xacf005cd
2406 | 	dcd 0xf7212ed6
2407 | 	dcd 0xe5a220ab
2408 | 	dcd 0x04874834
2409 | 	dcd 0xdc30061d
2410 | three_mont
2411 | 	dcd 0x3
2412 | 	dcd 0x0
2413 | 	dcd 0x0
2414 | 	dcd 0xfffffffd
2415 | 	dcd 0xffffffff
2416 | 	dcd 0xffffffff
2417 | 	dcd 0xfffffffc
2418 | 	dcd 0x2
2419 | #endif
2420 | 
2421 | #if include_p256_verify || include_p256_varmult || include_p256_decode_point
2422 | ; Checks if a point is on curve
2423 | ; in: *r0 = x, *r1 = y, in Montgomery form
2424 | ; out: r0 = 1 if on curve, else 0
2425 | P256_point_is_on_curve proc
2426 | 	export P256_point_is_on_curve
2427 | 	push {r0,r4-r11,lr}
2428 | 	frame push {r4-r11,lr}
2429 | 	frame address sp,40
2430 | 	
2431 | 	; We verify y^2 - x(x^2 - 3) = b
2432 | 	
2433 | 	; y^2
2434 | 	ldm r1,{r0-r7}
2435 | 	bl P256_sqrmod
2436 | 	push {r0-r7}
2437 | 	frame address sp,72
2438 | 	
2439 | 	; x^2
2440 | 	ldr r0,[sp,#32]
2441 | 	ldm r0,{r0-r7}
2442 | 	bl P256_sqrmod
2443 | 	push {r0-r7}
2444 | 	frame address sp,104
2445 | 	
2446 | 	; x^2 - 3
2447 | 	mov r1,sp
2448 | 	adr r2,three_mont
2449 | 	bl P256_submod
2450 | 	stm sp,{r0-r7}
2451 | 	
2452 | 	; x(x^2 - 3)
2453 | 	ldr r1,[sp,#64]
2454 | 	mov r2,sp
2455 | 	bl P256_mulmod
2456 | 	stm sp,{r0-r7}
2457 | 	
2458 | 	; y^2 - x(x^2 - 3)
2459 | 	add r1,sp,#32
2460 | 	mov r2,sp
2461 | 	bl P256_submod
2462 | 	
2463 | 	; compare with b
2464 | 	adr r8,b_mont
2465 | 	ldm r8!,{r9-r12}
2466 | 	eors r0,r9
2467 | 	ittt eq
2468 | 	eorseq r1,r10
2469 | 	eorseq r2,r11
2470 | 	eorseq r3,r12
2471 | 	ldm r8,{r9-r12}
2472 | 	itttt eq
2473 | 	eorseq r4,r9
2474 | 	eorseq r5,r10
2475 | 	eorseq r6,r11
2476 | 	eorseq r7,r12
2477 | 	mov r0,#0
2478 | 	it eq
2479 | 	moveq r0,#1
2480 | 	
2481 | 	add sp,#68
2482 | 	frame address sp,36
2483 | 	
2484 | 	pop {r4-r11,pc}
2485 | 	
2486 | 	endp
2487 | #endif
2488 | 
2489 | #if include_p256_basemult || include_p256_varmult || include_p256_decompress_point
2490 | 	align 4
2491 | P256_p
2492 | 	dcd 0xffffffff
2493 | 	dcd 0xffffffff
2494 | 	dcd 0xffffffff
2495 | 	dcd 0
2496 | 	dcd 0
2497 | 	dcd 0
2498 | 	dcd 1
2499 | 	dcd 0xffffffff
2500 | #endif
2501 | 
2502 | #if include_p256_decompress_point
2503 | ; in: r0 = output location for y, *r1 = x, r2 = parity bit for y
2504 | ; out: r0 = 1 if ok, 0 if invalid x
2505 | P256_decompress_point proc
2506 | 	export P256_decompress_point
2507 | 	push {r0,r2,r4-r11,lr}
2508 | 	frame push {r4-r11,lr}
2509 | 	frame address sp,44
2510 | 	sub sp,#32
2511 | 	frame address sp,76
2512 | 	
2513 | 	mov r0,sp
2514 | 	bl P256_to_montgomery
2515 | 	ldm sp,{r0-r7}
2516 | 	
2517 | 	bl P256_sqrmod
2518 | 	push {r0-r7}
2519 | 	
2520 | 	mov r1,sp
2521 | 	adr r2,three_mont
2522 | 	bl P256_submod
2523 | 	stm sp,{r0-r7}
2524 | 	frame address sp,108
2525 | 	
2526 | 	add r1,sp,#32
2527 | 	mov r2,sp
2528 | 	bl P256_mulmod
2529 | 	stm sp,{r0-r7}
2530 | 	
2531 | 	mov r1,sp
2532 | 	adr r2,b_mont
2533 | 	bl P256_addmod
2534 | 	stm sp,{r0-r7}
2535 | 	
2536 | 	mov r8,#1
2537 | 	bl P256_modinv_sqrt
2538 | 	add r8,sp,#32
2539 | 	stm r8,{r0-r7}
2540 | 	
2541 | 	bl P256_sqrmod
2542 | 	
2543 | 	pop {r8-r11}
2544 | 	frame address sp,92
2545 | 	eors r8,r0
2546 | 	ittt eq
2547 | 	eorseq r9,r1
2548 | 	eorseq r10,r2
2549 | 	eorseq r11,r3
2550 | 	pop {r8-r11}
2551 | 	frame address sp,76
2552 | 	itttt eq
2553 | 	eorseq r8,r4
2554 | 	eorseq r9,r5
2555 | 	eorseq r10,r6
2556 | 	eorseq r11,r7
2557 | 	it ne
2558 | 	movne r0,#0
2559 | 	bne %f1
2560 | 	
2561 | 	mov r0,sp
2562 | 	mov r1,sp
2563 | 	bl P256_from_montgomery
2564 | 	
2565 | 	ldr r3,[sp]
2566 | 	ldrd r0,r1,[sp,#32]
2567 | 	and r2,r3,#1
2568 | 	eors r2,r1
2569 | 	mov r1,sp
2570 | 	adr r3,P256_p
2571 | 	bl P256_negate_mod_m_if
2572 | 	movs r0,#1
2573 | 1
2574 | 	add sp,#32+8
2575 | 	frame address sp,36
2576 | 	pop {r4-r11,pc}
2577 | 	
2578 | 	endp
2579 | #endif
2580 | 
2581 | #if include_p256_basemult || include_p256_varmult
2582 | ; *r0 = output affine montgomery x
2583 | ; *r1 = output affine montgomery y
2584 | ; *r2 = input jacobian montgomery
2585 | P256_jacobian_to_affine proc
2586 | 	export P256_jacobian_to_affine
2587 | 	push {r0,r1,r2,r4-r11,lr}
2588 | 	frame push {r4-r11,lr}
2589 | 	frame address sp,48
2590 | 	
2591 | 	adds r2,#64
2592 | 	ldm r2,{r0-r7}
2593 | 	mov r8,#0
2594 | 	bl P256_modinv_sqrt
2595 | 	push {r0-r7}
2596 | 	frame address sp,80
2597 | 	
2598 | 	bl P256_sqrmod
2599 | 	push {r0-r7}
2600 | 	frame address sp,112
2601 | 	
2602 | 	add r1,sp,#32
2603 | 	mov r2,sp
2604 | 	bl P256_mulmod
2605 | 	add r8,sp,#32
2606 | 	stm r8,{r0-r7}
2607 | 	
2608 | 	mov r1,sp
2609 | 	ldr r2,[sp,#72]
2610 | 	bl P256_mulmod
2611 | 	ldr r8,[sp,#64]
2612 | 	stm r8,{r0-r7}
2613 | 	
2614 | 	ldr r2,[sp,#72]
2615 | 	add r1,sp,#32
2616 | 	adds r2,r2,#32
2617 | 	bl P256_mulmod
2618 | 	ldr r8,[sp,#68]
2619 | 	stm r8,{r0-r7}
2620 | 	
2621 | 	add sp,#76
2622 | 	frame address sp,36
2623 | 	
2624 | 	pop {r4-r11,pc}
2625 | 	endp
2626 | #endif
2627 | 	
2628 | #if include_p256_mult
2629 | ; Doubles the point in Jacobian form (integers are in Montgomery form)
2630 | ; *r0 = out, *r1 = in
2631 | P256_double_j proc
2632 | 	export P256_double_j
2633 | 	push {r0,r1,r4-r11,lr}
2634 | 	frame push {r4-r11,lr}
2635 | 	frame address sp,44
2636 | 	
2637 | 	; https://eprint.iacr.org/2014/130.pdf, algorithm 10
2638 | 	
2639 | 	; t1 = Z1^2
2640 | 	adds r1,#64
2641 | 	ldm r1,{r0-r7}
2642 | 	bl P256_sqrmod
2643 | 	push {r0-r7}
2644 | 	frame address sp,76
2645 | 	
2646 | 	; Z2 = Y1 * Z1
2647 | 	ldr r1,[sp,#36]
2648 | 	adds r1,#32
2649 | 	add r2,r1,#32
2650 | 	bl P256_mulmod
2651 | 	ldr r8,[sp,#32]
2652 | 	add r8,#64
2653 | 	stm r8,{r0-r7}
2654 | 	
2655 | 	; t2 = X1 + t1
2656 | 	ldr r1,[sp,#36]
2657 | 	mov r2,sp
2658 | 	bl P256_addmod
2659 | 	push {r0-r7}
2660 | 	frame address sp,108
2661 | 	
2662 | 	; t1 = X1 - t1
2663 | 	ldr r1,[sp,#68]
2664 | 	add r2,sp,#32
2665 | 	bl P256_submod
2666 | 	add r8,sp,#32
2667 | 	stm r8,{r0-r7}
2668 | 	
2669 | 	; t1 = t1 * t2
2670 | 	add r1,sp,#32
2671 | 	mov r2,sp
2672 | 	bl P256_mulmod
2673 | 	add r8,sp,#32
2674 | 	stm r8,{r0-r7}
2675 | 	
2676 | 	; t2 = t1 / 2
2677 | 	lsl r8,r0,#31
2678 | 	adds r0,r0,r8, asr #31
2679 | 	adcs r1,r1,r8, asr #31
2680 | 	adcs r2,r2,r8, asr #31
2681 | 	adcs r3,#0
2682 | 	adcs r4,#0
2683 | 	adcs r5,#0
2684 | 	adcs r6,r6,r8, lsr #31
2685 | 	adcs r7,r7,r8, asr #31
2686 | 	rrxs r7,r7
2687 | 	rrxs r6,r6
2688 | 	rrxs r5,r5
2689 | 	rrxs r4,r4
2690 | 	rrxs r3,r3
2691 | 	rrxs r2,r2
2692 | 	rrxs r1,r1
2693 | 	rrx r0,r0
2694 | 	stm sp,{r0-r7}
2695 | 	
2696 | 	; t1 = t1 + t2
2697 | 	add r1,sp,#32
2698 | 	mov r2,sp
2699 | 	bl P256_addmod
2700 | 	add r8,sp,#32
2701 | 	stm r8,{r0-r7}
2702 | 	
2703 | 	; t2 = t1^2
2704 | 	bl P256_sqrmod
2705 | 	stm sp,{r0-r7}
2706 | 	
2707 | 	; Y2 = Y1^2
2708 | 	ldr r0,[sp,#68]
2709 | 	adds r0,#32
2710 | 	ldm r0,{r0-r7}
2711 | 	bl P256_sqrmod
2712 | 	ldr r8,[sp,#64]
2713 | 	add r8,#32
2714 | 	stm r8,{r0-r7}
2715 | 	
2716 | 	; t3 = Y2^2
2717 | 	bl P256_sqrmod
2718 | 	push {r0-r7}
2719 | 	frame address sp,140
2720 | 	
2721 | 	; Y2 = X1 * Y2
2722 | 	ldrd r0,r1,[sp,#96]
2723 | 	add r2,r0,#32
2724 | 	bl P256_mulmod
2725 | 	ldr r8,[sp,#96]
2726 | 	add r8,#32
2727 | 	stm r8,{r0-r7}
2728 | 	
2729 | 	; X2 = 2 * Y2
2730 | 	bl P256_times2
2731 | 	ldr r8,[sp,#96]
2732 | 	stm r8,{r0-r7}
2733 | 	
2734 | 	; X2 = t2 - X2
2735 | 	add r1,sp,#32
2736 | 	mov r2,r8
2737 | 	bl P256_submod
2738 | 	ldr r8,[sp,#96]
2739 | 	stm r8,{r0-r7}
2740 | 	
2741 | 	; t2 = Y2 - X2
2742 | 	mov r2,r8
2743 | 	add r1,r2,#32
2744 | 	bl P256_submod
2745 | 	add r8,sp,#32
2746 | 	stm r8,{r0-r7}
2747 | 	
2748 | 	; t1 = t1 * t2
2749 | 	add r1,sp,#64
2750 | 	add r2,sp,#32
2751 | 	bl P256_mulmod
2752 | 	add r8,sp,#64
2753 | 	stm r8,{r0-r7}
2754 | 	
2755 | 	; Y2 = t1 - t3
2756 | 	add r1,sp,#64
2757 | 	mov r2,sp
2758 | 	bl P256_submod
2759 | 	ldr r8,[sp,#96]
2760 | 	add r8,#32
2761 | 	stm r8,{r0-r7}
2762 | 	
2763 | 	add sp,#104
2764 | 	frame address sp,36
2765 | 	
2766 | 	pop {r4-r11,pc}
2767 | 	endp
2768 | 
2769 | ; sets the jacobian *r0 point to *r1
2770 | ; if r2=1, then Y will be negated
2771 | ; if r3=1, then Z will be set to 1
2772 | ; clobbers all registers
2773 | add_sub_helper proc
2774 | 	push {lr}
2775 | 	frame push {lr}
2776 | 	ldm r1!,{r5-r12}
2777 | 	stm r0!,{r5-r12}
2778 | 	ldm r1!,{r5-r12}
2779 | 	cbz r2,%f0
2780 | 	; note that Y is never 0 for a valid point
2781 | 	mov lr,#0
2782 | 	rsbs r4,r2,#0
2783 | 	subs r5,r4,r5
2784 | 	sbcs r6,r4,r6
2785 | 	sbcs r7,r4,r7
2786 | 	sbcs r8,lr,r8
2787 | 	sbcs r9,lr,r9
2788 | 	sbcs r10,lr,r10
2789 | 	sbcs r11,r2,r11
2790 | 	sbcs r12,r4,r12
2791 | 0
2792 | 	stm r0!,{r5-r12}
2793 | 	cbnz r3,%f1
2794 | 	ldm r1,{r5-r12}
2795 | 	stm r0,{r5-r12}
2796 | 	b %f2
2797 | 1
2798 | 	; Set Z3 to 1 in Montgomery form
2799 | 	movs r4,#0
2800 | 	umull r5,r10,r4,r4
2801 | 	mvns r6,r4
2802 | 	mvns r7,r4
2803 | 	mov r8,#0xffffffff
2804 | 	mov r9,#0xfffffffe
2805 | 	
2806 | 	stm r0,{r3-r10}
2807 | 2
2808 | 	pop {pc}
2809 | 	
2810 | 	endp
2811 | 
2812 | ; Adds or subtracts points in Jacobian form (integers are in Montgomery form)
2813 | ; The first operand is located in *r0, the second in *r1 (may not overlap)
2814 | ; The result is stored at *r0
2815 | ; r2 = 0 if add, 1 if sub
2816 | ; r3 = 1 if the second point's Z point is 1 and therefore not loaded
2817 | ;
2818 | ; This function assumes the second operand is not the point at infinity,
2819 | ; otherwise it handles all inputs.
2820 | ; The first operand is treated at the point at infinity as long as its Z coordinate is 0.
2821 | P256_add_sub_j proc
2822 | 	export P256_add_sub_j
2823 | 	push {r0-r11,lr}
2824 | 	frame push {r4-r11,lr}
2825 | 	frame address sp,52
2826 | 	
2827 | 	;ldr r4,[r0,#64]
2828 | 	;cbnz r4,%f2
2829 | 	add r4,r0,#64
2830 | 	ldm r4,{r4-r11}
2831 | 	orrs r4,r5
2832 | 	orrs r4,r6
2833 | 	orrs r4,r7
2834 | 	orrs r4,r8
2835 | 	orrs r4,r9
2836 | 	orrs r4,r10
2837 | 	orrs r4,r11
2838 | 	bne %f2
2839 | 	
2840 | 	; First point is 0, so just set result to (-) the other point
2841 | 	bl add_sub_helper
2842 | 	add sp,#16
2843 | 	frame address sp,36
2844 | 	pop {r4-r11,pc}
2845 | 2
2846 | 	frame address sp,52
2847 | 	; Here a variant of
2848 | 	; https://www.hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-3/addition/add-1998-cmo-2.op3
2849 | 	; is used, but rearranged and uses less temporaries.
2850 | 	; The first operand to the function is both (X3,Y3,Z3) and (X2,Y2,Z2).
2851 | 	; The second operand to the function is (X1,Y1,Z1)
2852 | 	
2853 | 	cbnz r3,%f100
2854 | 	
2855 | 	; Z1Z1 = Z1^2
2856 | 	adds r1,#64
2857 | 	ldm r1,{r0-r7}
2858 | 	bl P256_sqrmod
2859 | 	push {r0-r7}
2860 | 	frame address sp,84
2861 | 	
2862 | 	; U2 = X2*Z1Z1
2863 | 	ldr r1,[sp,#32]
2864 | 	mov r2,sp
2865 | 	bl P256_mulmod
2866 | 	ldr r8,[sp,#32]
2867 | 	stm r8,{r0-r7}
2868 | 	
2869 | 	; t1 = Z1*Z1Z1
2870 | 	ldr r1,[sp,#36]
2871 | 	adds r1,#64
2872 | 	mov r2,sp
2873 | 	bl P256_mulmod
2874 | 	stm sp,{r0-r7}
2875 | 	
2876 | 	; S2 = Y2*t1
2877 | 	ldr r1,[sp,#32]
2878 | 	adds r1,#32
2879 | 	mov r2,sp
2880 | 	bl P256_mulmod
2881 | 	ldr r8,[sp,#32]
2882 | 	add r8,#32
2883 | 	stm r8,{r0-r7}
2884 | 	b %f101
2885 | 100
2886 | 	sub sp,#32
2887 | 	frame address sp,84
2888 | 101
2889 | 	
2890 | 	; Z2Z2 = Z2^2
2891 | 	ldr r1,[sp,#32]
2892 | 	adds r1,#64
2893 | 	ldm r1,{r0-r7}
2894 | 	bl P256_sqrmod
2895 | 	push {r0-r7}
2896 | 	frame address sp,116
2897 | 	
2898 | 	; U1 = X1*Z2Z2
2899 | 	ldr r1,[sp,#68]
2900 | 	mov r2,sp
2901 | 	bl P256_mulmod
2902 | 	add r8,sp,#32
2903 | 	stm r8,{r0-r7}
2904 | 	
2905 | 	; t2 = Z2*Z2Z2
2906 | 	ldr r1,[sp,#64]
2907 | 	adds r1,#64
2908 | 	mov r2,sp
2909 | 	bl P256_mulmod
2910 | 	stm sp,{r0-r7}
2911 | 	
2912 | 	; S1 = Y1*t2
2913 | 	ldr r1,[sp,#68]
2914 | 	adds r1,#32
2915 | 	mov r2,sp
2916 | 	bl P256_mulmod
2917 | 	stm sp,{r0-r7}
2918 | 	
2919 | 	
2920 | 	; H = U2-U1
2921 | 	ldr r1,[sp,#64]
2922 | 	add r2,sp,#32
2923 | 	bl P256_submod
2924 | 	ldr r8,[sp,#64]
2925 | 	stm r8,{r0-r7}
2926 | 	
2927 | 	; HH = H^2
2928 | 	bl P256_sqrmod
2929 | 	push {r0-r7}
2930 | 	frame address sp,148
2931 | 	
2932 | 	; Z3 = Z2*H
2933 | 	ldr r2,[sp,#96]
2934 | 	add r1,r2,#64
2935 | 	bl P256_mulmod
2936 | 	ldr r8,[sp,#96]
2937 | 	add r8,#64
2938 | 	stm r8,{r0-r7}
2939 | 	
2940 | 	; Z3 = Z1*Z3
2941 | 	ldr r1,[sp,#108]
2942 | 	cbnz r1,%f102
2943 | 	ldr r1,[sp,#100]
2944 | 	adds r1,#64
2945 | 	mov r2,r8
2946 | 	bl P256_mulmod
2947 | 	ldr r8,[sp,#96]
2948 | 	add r8,#64
2949 | 	stm r8,{r0-r7}
2950 | 102
2951 | 	
2952 | 	; HHH = H*HH
2953 | 	ldr r1,[sp,#96]
2954 | 	mov r2,sp
2955 | 	bl P256_mulmod
2956 | 	ldr r8,[sp,#96]
2957 | 	stm r8,{r0-r7}
2958 | 	
2959 | 	;cbnz r0,%f3
2960 | 	orrs r1,r0 ;;
2961 | 	orrs r1,r2
2962 | 	orrs r1,r3
2963 | 	orrs r1,r4
2964 | 	orrs r1,r5
2965 | 	orrs r1,r6
2966 | 	orrs r0,r1,r7
2967 | 3
2968 | 	push {r0} ; if r0 == 0: HHH is 0, which means the two input points have the same affine x coordinates
2969 | 	frame address sp,152
2970 | 	
2971 | 	; r = S2-+S1
2972 | 	ldr r1,[sp,#100]
2973 | 	adds r1,#32
2974 | 	add r2,sp,#36
2975 | 	ldr r3,[sp,#108]
2976 | 	cbz r3,%f4
2977 | 	bl P256_addmod
2978 | 	b %f5
2979 | 4
2980 | 	bl P256_submod
2981 | 5
2982 | 	ldr r8,[sp,#100]
2983 | 	add r8,#32
2984 | 	stm r8,{r0-r7}
2985 | 	
2986 | 	; check r == 0 && HHH == 0
2987 | 	pop {r8}
2988 | 	frame address sp,148
2989 | 	;cbnz r0,%f6
2990 | 	orrs r1,r0 ;;
2991 | 	orrs r1,r2
2992 | 	orrs r1,r3
2993 | 	orrs r1,r4
2994 | 	orrs r1,r5
2995 | 	orrs r1,r6
2996 | 	orrs r1,r7
2997 | 	orrs r1,r8
2998 | 	bne %f6
2999 | 	; Points should be doubled since addition formula can't handle this case
3000 | 	; Since we have already overwritten the first point,
3001 | 	; we must copy the second point after possibly negating it
3002 | 	add sp,#96
3003 | 	frame address sp,52
3004 | 	ldm sp,{r0-r3}
3005 | 	bl add_sub_helper
3006 | 	
3007 | 	ldr r0,[sp,#0]
3008 | 	mov r1,r0
3009 | 	add sp,#16
3010 | 	frame address sp,36
3011 | 	bl P256_double_j
3012 | 	pop {r4-r11,pc}
3013 | 6
3014 | 	frame address sp,148
3015 | 	
3016 | 	; V = U1*HH
3017 | 	add r1,sp,#64
3018 | 	mov r2,sp
3019 | 	bl P256_mulmod
3020 | 	add r8,sp,#64
3021 | 	stm r8,{r0-r7}
3022 | 	
3023 | 	; t3 = r^2
3024 | 	ldr r0,[sp,#96]
3025 | 	adds r0,#32
3026 | 	ldm r0,{r0-r7}
3027 | 	bl P256_sqrmod
3028 | 	stm sp,{r0-r7}
3029 | 	
3030 | 	; t2 = S1*HHH
3031 | 	add r1,sp,#32
3032 | 	ldr r2,[sp,#96]
3033 | 	bl P256_mulmod
3034 | 	add r8,sp,#32
3035 | 	stm r8,{r0-r7}
3036 | 	
3037 | 	; X3 = t3-HHH
3038 | 	mov r1,sp
3039 | 	ldr r2,[sp,#96]
3040 | 	bl P256_submod
3041 | 	ldr r8,[sp,#96]
3042 | 	stm r8,{r0-r7}
3043 | 	
3044 | 	; t3 = 2*V
3045 | 	add r0,sp,#64
3046 | 	ldm r0,{r0-r7}
3047 | 	bl P256_times2
3048 | 	stm sp,{r0-r7}
3049 | 	
3050 | 	; X3 = X3-t3
3051 | 	ldr r1,[sp,#96]
3052 | 	mov r2,sp
3053 | 	bl P256_submod
3054 | 	ldr r8,[sp,#96]
3055 | 	stm r8,{r0-r7}
3056 | 	
3057 | 	; t3 = V-X3
3058 | 	add r1,sp,#64
3059 | 	ldr r2,[sp,#96]
3060 | 	bl P256_submod
3061 | 	stm sp,{r0-r7}
3062 | 	
3063 | 	; t3 = r*t3
3064 | 	ldr r1,[sp,#96]
3065 | 	adds r1,#32
3066 | 	mov r2,sp
3067 | 	bl P256_mulmod
3068 | 	stm sp,{r0-r7}
3069 | 	
3070 | 	; Y3 = t3-+t2
3071 | 	ldr r0,[sp,#104]
3072 | 	mov r1,sp
3073 | 	add r2,sp,#32
3074 | 	cbz r0,%f7
3075 | 	bl P256_addmod
3076 | 	b %f8
3077 | 7
3078 | 	bl P256_submod
3079 | 8
3080 | 	ldr r8,[sp,#96]
3081 | 	add r8,#32
3082 | 	stm r8,{r0-r7}
3083 | 	
3084 | 	add sp,#112
3085 | 	frame address sp,36
3086 | 	
3087 | 	pop {r4-r11,pc}
3088 | 	endp
3089 | #endif
3090 | 
3091 | #if include_p256_verify
3092 | ; Determines whether r = x (mod n)
3093 | ; in: *r0 = r, *r1 = the result of the double scalarmult in jacobian form (Montgomery form)
3094 | ; out: r0 will contain 1 if valid, else 0
3095 | P256_verify_last_step proc
3096 | 	export P256_verify_last_step
3097 | 	push {r0,r1,r4-r11,lr}
3098 | 	frame push {r4-r11,lr}
3099 | 	frame address sp,44
3100 | 	sub sp,#32
3101 | 	frame address sp,76
3102 | 	
3103 | 	; Instead of doing an expensive field inversion and checking r = (X/Z^2 % p) (mod n),
3104 | 	; accept the signature iff r*Z^2 % p = X OR (r+n<p AND (r+n)*Z^2 % p = X).
3105 | 	; Proof that this is correct:
3106 | 	;   if we use the standard approach, that would mean we check that
3107 | 	;   r = (X/Z^2 % p) (mod n)
3108 | 	;   which is the same as r+k*n = (X/Z^2 % p) for any integer k,
3109 | 	;   but since the RHS is less than p and 2n > p, we only need to check for k=0,1
3110 | 	;   which means checking r = (X/Z^2 % p) OR r+n = (X/Z^2 % p)
3111 | 	;   For r = (X/Z^2 % p) we have that r < p and so we can instead check r*Z^2 % p = X
3112 | 	;   For r+n = (X/Z^2 % p) we must first check that r+n < p and can then check (r+n)*Z^2 % p = X
3113 | 	;
3114 | 	; Note that since p-n is around sqrt(n), it is extremely unlikely that r+n<p
3115 | 	;
3116 | 	; Note that X and Z are in Montgomery form but not r,
3117 | 	; so we must convert r to Montgomery form when it's time to do the multiplications
3118 | 	
3119 | 	; Calculate Z^2
3120 | 	add r1,#64
3121 | 	ldm r1,{r0-r7}
3122 | 	bl P256_sqrmod
3123 | 	push {r0-r7}
3124 | 	frame address sp,108
3125 | 	
3126 | 	; Check if Z^2 if 0, if so reject
3127 | 	orrs r0,r1
3128 | 	orrs r0,r2
3129 | 	orrs r0,r3
3130 | 	orrs r0,r4
3131 | 	orrs r0,r5
3132 | 	orrs r0,r6
3133 | 	orrs r0,r7
3134 | 	beq %f0
3135 | 	
3136 | 	; Convert r to Montgomery form
3137 | 	ldr r1,[sp,#64]
3138 | 2
3139 | 	add r0,sp,#32
3140 | 	bl P256_to_montgomery
3141 | 	
3142 | 	; Calculate r*Z^2
3143 | 	add r1,sp,#32
3144 | 	mov r2,sp
3145 | 	bl P256_mulmod
3146 | 	
3147 | 	; Now we will check if r*Z^2 = X
3148 | 	ldr r8,[sp,#68]
3149 | 	ldm r8!,{r9-r12}
3150 | 	eors r0,r9
3151 | 	ittt eq
3152 | 	eorseq r1,r10
3153 | 	eorseq r2,r11
3154 | 	eorseq r3,r12
3155 | 	ldm r8!,{r9-r12}
3156 | 	itttt eq
3157 | 	eorseq r4,r9
3158 | 	eorseq r5,r10
3159 | 	eorseq r6,r11
3160 | 	eorseq r7,r12
3161 | 	mov r0,#1
3162 | 	beq %f1
3163 | 	
3164 | 	; The check may fail if r < p-n, so also check for r' = r+n
3165 | 	adr r0,P256_order
3166 | 	ldm r0,{r8-r11}
3167 | 	ldr r0,[sp,#64]
3168 | 	cbz r0,%f0 ; if we already tried once, abort
3169 | 	ldm r0,{r0-r7}
3170 | 	adds r0,r8
3171 | 	adcs r1,r9
3172 | 	adcs r2,r10
3173 | 	adcs r3,r11
3174 | 	adcs r4,#0xffffffff
3175 | 	adcs r5,#0xffffffff
3176 | 	adcs r6,#0
3177 | 	adcs r7,#0xffffffff
3178 | 	bcs %f0 ; reject if r+n >= 2^256 (which is >= p)
3179 | 	
3180 | 	subs r8,r0,#0xffffffff
3181 | 	sbcs r8,r1,#0xffffffff
3182 | 	sbcs r8,r2,#0xffffffff
3183 | 	sbcs r8,r3,#0
3184 | 	sbcs r8,r4,#0
3185 | 	sbcs r8,r5,#0
3186 | 	sbcs r8,r6,#1
3187 | 	sbcs r8,r7,#0xffffffff
3188 | 	bcs %f0 ; reject if r+n >= p
3189 | 	
3190 | 	add r8,sp,#32
3191 | 	stm r8,{r0-r7}
3192 | 	movs r2,#0
3193 | 	str r2,[sp,#64] ; set r variable to NULL to avoid yet another try
3194 | 	
3195 | 	mov r1,r8
3196 | 	b %b2
3197 | 
3198 | 0
3199 | 	movs r0,#0
3200 | 1
3201 | 	add sp,#72
3202 | 	frame address sp,36
3203 | 	pop {r4-r11,pc}
3204 | 	
3205 | 	endp
3206 | #endif
3207 | 
3208 | #if include_p256_basemult || include_p256_varmult || include_p256_decompress_point
3209 | ; in: *r0 = output location, *r1 = input, *r2 = 0/1, *r3 = m
3210 | ; if r2 = 0, then *r0 is set to *r1
3211 | ; if r2 = 1, then *r0 is set to m - *r1
3212 | ; note that *r1 should be in the range [1,m-1]
3213 | ; out: r0 and r1 will have advanced 32 bytes, r2 will remain as the input
3214 | P256_negate_mod_m_if proc
3215 | 	push {r4-r8,lr}
3216 | 	frame push {r4-r8,lr}
3217 | 	rsb r8,r2,#1
3218 | 	movs r6,#8
3219 | 	subs r7,r7 ; set r7=0 and C=1
3220 | 0
3221 | 	ldm r1!,{r4,r12}
3222 | 	ldm r3!,{r5,lr}
3223 | 	sbcs r5,r4
3224 | 	umull r4,r7,r8,r4
3225 | 	umaal r4,r7,r2,r5
3226 | 	sbcs lr,r12
3227 | 	umull r12,r7,r8,r12
3228 | 	umaal r12,r7,r2,lr
3229 | 	stm r0!,{r4,r12}
3230 | 	sub r6,#2
3231 | 	cbz r6,%f1
3232 | 	b %b0
3233 | 1
3234 | 	pop {r4-r8,pc}
3235 | 	endp
3236 | #endif
3237 | 
3238 | #if include_p256_basemult || include_p256_varmult
3239 | P256_negate_mod_n_if proc
3240 | 	export P256_negate_mod_n_if
3241 | 	ldr r3,=P256_order
3242 | 	b P256_negate_mod_m_if
3243 | 	endp
3244 | 
3245 | P256_negate_mod_p_if proc
3246 | 	export P256_negate_mod_p_if
3247 | 	adr r3,P256_p
3248 | 	b P256_negate_mod_m_if
3249 | 	endp
3250 | #endif
3251 | 
3252 | 	align 4
3253 | 	end
3254 | 


--------------------------------------------------------------------------------
/p256-cortex-m4-config.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2017-2021 Emil Lenngren
  3 |  * Copyright (c) 2021 Shortcut Labs AB
  4 |  *
  5 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  6 |  * of this software and associated documentation files (the "Software"), to deal
  7 |  * in the Software without restriction, including without limitation the rights
  8 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 |  * copies of the Software, and to permit persons to whom the Software is
 10 |  * furnished to do so, subject to the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included in all
 13 |  * copies or substantial portions of the Software.
 14 |  *
 15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 |  * SOFTWARE.
 22 |  */
 23 | 
 24 | #ifndef P256_CORTEX_M4_CONFIG
 25 | #define P256_CORTEX_M4_CONFIG
 26 | 
 27 | // To enable a flag, define to 1. To disable, define to 0.
 28 | 
 29 | // Algorithm inclusion settings
 30 | 
 31 | // See p256-cortex-m4.h to understand what functions are included for each flag.
 32 | 
 33 | #ifndef include_p256_verify
 34 | #define include_p256_verify 1
 35 | #endif
 36 | 
 37 | #ifndef include_p256_sign
 38 | #define include_p256_sign 1
 39 | #endif
 40 | 
 41 | #ifndef include_p256_keygen
 42 | #define include_p256_keygen 1
 43 | #endif
 44 | 
 45 | #ifndef include_p256_ecdh
 46 | #define include_p256_ecdh 1
 47 | #endif
 48 | 
 49 | #ifndef include_p256_raw_scalarmult_generic
 50 | #define include_p256_raw_scalarmult_generic 1
 51 | #endif
 52 | 
 53 | #ifndef include_p256_raw_scalarmult_base
 54 | #define include_p256_raw_scalarmult_base 1
 55 | #endif
 56 | 
 57 | #ifndef include_p256_to_octet_string_uncompressed
 58 | #define include_p256_to_octet_string_uncompressed 1
 59 | #endif
 60 | 
 61 | #ifndef include_p256_to_octet_string_compressed
 62 | #define include_p256_to_octet_string_compressed 1
 63 | #endif
 64 | 
 65 | #ifndef include_p256_to_octet_string_hybrid
 66 | #define include_p256_to_octet_string_hybrid 1
 67 | #endif
 68 | 
 69 | #ifndef include_p256_decompress_point
 70 | #define include_p256_decompress_point 1
 71 | #endif
 72 | 
 73 | #ifndef include_p256_decode_point
 74 | #define include_p256_decode_point 1
 75 | #endif
 76 | 
 77 | 
 78 | // Target settings
 79 | 
 80 | /**
 81 |  * Enables the use of FPU instructions (vmov, vldm).
 82 |  * This will only work if the CPU has an FPU, such as Cortex-M4F.
 83 |  * It will not work on Cortex-M4 (without FPU).
 84 |  * If enabled, the code will run faster.
 85 |  * The default is to use the __ARM_FP macro that the compiler defines.
 86 |  */
 87 | #ifndef has_fpu
 88 | #ifdef __ARM_FP
 89 | #define has_fpu 1
 90 | #else
 91 | #define has_fpu 0
 92 | #endif
 93 | #endif
 94 | 
 95 | /**
 96 |  * If 0, the implementation conditionally loads data from different RAM locations depending
 97 |  * on secret data, so 0 should not be used on CPUs that have data cache, such as Cortex-A53.
 98 |  *
 99 |  * 0 is suited for embedded devices running CPUs like Cortex-M4 and Cortex-M33, which don't
100 |  * have any data cache.
101 |  *
102 |  * 1 is suited for Cortex-A processors.
103 |  */
104 | #ifndef has_d_cache
105 | #define has_d_cache 0
106 | #endif
107 | 
108 | // Optimization settings
109 | 
110 | /**
111 |  * If enabled, keygen and sign uses a specialized scalar multiplication routine when multiplying by
112 |  * the base point, which dramatically improves performance, at the expense of using more code space.
113 |  * If disabled, keygen and sign will use the generic variable base scalar multiplication routine.
114 |  */
115 | #ifndef use_fast_p256_basemult
116 | #define use_fast_p256_basemult 1
117 | #endif
118 | 
119 | /**
120 |  * Enable this to save some code space, at expense of performance.
121 |  * When disabled, a specialized field squaring routine will be used rather
122 |  * than re-using the multiplication routine.
123 |  */
124 | #ifndef use_mul_for_sqr
125 | #define use_mul_for_sqr 0
126 | #endif
127 | 
128 | // Derived settings (do not modify)
129 | #define include_p256_basemult (include_p256_keygen || include_p256_sign || include_p256_raw_scalarmult_base)
130 | #define include_fast_p256_basemult (use_fast_p256_basemult && include_p256_basemult)
131 | #define include_p256_varmult (include_p256_ecdh || include_p256_raw_scalarmult_generic)
132 | #define include_p256_mult (include_p256_verify || include_p256_basemult || include_p256_varmult)
133 | 
134 | #endif
135 | 


--------------------------------------------------------------------------------
/p256-cortex-m4.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2017-2021 Emil Lenngren
  3 |  * Copyright (c) 2021 Shortcut Labs AB
  4 |  *
  5 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  6 |  * of this software and associated documentation files (the "Software"), to deal
  7 |  * in the Software without restriction, including without limitation the rights
  8 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 |  * copies of the Software, and to permit persons to whom the Software is
 10 |  * furnished to do so, subject to the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included in all
 13 |  * copies or substantial portions of the Software.
 14 |  *
 15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 |  * SOFTWARE.
 22 |  */
 23 | 
 24 | #include <stdint.h>
 25 | #include <stdbool.h>
 26 | #include <string.h>
 27 | 
 28 | #include "p256-cortex-m4-config.h"
 29 | #include "p256-cortex-m4.h"
 30 | 
 31 | typedef const uint32_t (*constarr)[8];
 32 | 
 33 | struct FGInteger {
 34 |     // To get the value this struct represents,
 35 |     // interpret signed_value as a two's complement 288-bit little endian integer,
 36 |     // and negate if flip_sign is -1
 37 |     int flip_sign; // 0 or -1
 38 |     uint32_t signed_value[9]; // of 288 bits, 257 are useful (top 31 bits are sign-extended from bit 256)
 39 | };
 40 | 
 41 | struct XYInteger {
 42 |     // To get the value this struct represents,
 43 |     // interpret signed_value as an unsigned 288-bit little endian integer,
 44 |     // and negate if flip_sign is -1
 45 |     int flip_sign; // 0 or -1
 46 |     uint32_t value[8]; // unsigned value, 0 <= value < P256_order
 47 | };
 48 | 
 49 | int P256_divsteps2_31(int delta, uint32_t f, uint32_t g, uint32_t res_matrix[4]);
 50 | void P256_matrix_mul_fg_9(uint32_t a, uint32_t b, const struct FGInteger fg[2], struct FGInteger *res);
 51 | void P256_matrix_mul_mod_n(uint32_t a, uint32_t b, const struct XYInteger xy[2], struct XYInteger *res);
 52 | 
 53 | void P256_to_montgomery(uint32_t aR[8], const uint32_t a[8]);
 54 | void P256_from_montgomery(uint32_t a[8], const uint32_t aR[8]);
 55 | bool P256_check_range_p(const uint32_t a[8]);
 56 | 
 57 | bool P256_check_range_n(const uint32_t a[8]);
 58 | void P256_mul_mod_n(uint32_t res[8], const uint32_t a[8], const uint32_t b[8]);
 59 | void P256_add_mod_n(uint32_t res[8], const uint32_t a[8], const uint32_t b[8]);
 60 | void P256_mod_n_inv_vartime(uint32_t res[8], const uint32_t a[8]);
 61 | void P256_reduce_mod_n_32bytes(uint32_t res[8], const uint32_t a[8]);
 62 | 
 63 | void P256_select_point(uint32_t (*output)[8], uint32_t* table, uint32_t num_coordinates, uint32_t index);
 64 | 
 65 | void P256_jacobian_to_affine(uint32_t affine_mont_x[8], uint32_t affine_mont_y[8], const uint32_t jacobian_mont[3][8]);
 66 | bool P256_point_is_on_curve(const uint32_t x_mont[8], const uint32_t y_mont[8]);
 67 | bool P256_decompress_point(uint32_t y[8], const uint32_t x[8], uint32_t y_parity);
 68 | void P256_double_j(uint32_t jacobian_point_out[3][8], const uint32_t jacobian_point_in[3][8]);
 69 | void P256_add_sub_j(uint32_t jacobian_point1[3][8], const uint32_t (*point2)[8], bool is_sub, bool p2_is_affine);
 70 | bool P256_verify_last_step(const uint32_t r[8], const uint32_t jacobian_point[3][8]);
 71 | 
 72 | void P256_negate_mod_p_if(uint32_t out[8], const uint32_t in[8], uint32_t should_negate);
 73 | void P256_negate_mod_n_if(uint32_t out[8], const uint32_t in[8], uint32_t should_negate);
 74 | 
 75 | extern uint32_t P256_order[9];
 76 | 
 77 | #if include_p256_mult
 78 | static const uint32_t one_montgomery[8] = {1, 0, 0, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0};
 79 | #endif
 80 | 
 81 | #if include_p256_verify
 82 | // This table contains 1G, 3G, 5G, ... 15G in affine coordinates in montgomery form
 83 | static const uint32_t p256_basepoint_precomp[8][2][8] = {
 84 | {{0x18a9143c, 0x79e730d4, 0x5fedb601, 0x75ba95fc, 0x77622510, 0x79fb732b, 0xa53755c6, 0x18905f76},
 85 | {0xce95560a, 0xddf25357, 0xba19e45c, 0x8b4ab8e4, 0xdd21f325, 0xd2e88688, 0x25885d85, 0x8571ff18}},
 86 | {{0x4eebc127, 0xffac3f90, 0x87d81fb, 0xb027f84a, 0x87cbbc98, 0x66ad77dd, 0xb6ff747e, 0x26936a3f},
 87 | {0xc983a7eb, 0xb04c5c1f, 0x861fe1a, 0x583e47ad, 0x1a2ee98e, 0x78820831, 0xe587cc07, 0xd5f06a29}},
 88 | {{0xc45c61f5, 0xbe1b8aae, 0x94b9537d, 0x90ec649a, 0xd076c20c, 0x941cb5aa, 0x890523c8, 0xc9079605},
 89 | {0xe7ba4f10, 0xeb309b4a, 0xe5eb882b, 0x73c568ef, 0x7e7a1f68, 0x3540a987, 0x2dd1e916, 0x73a076bb}},
 90 | {{0xa0173b4f, 0x746354e, 0xd23c00f7, 0x2bd20213, 0xc23bb08, 0xf43eaab5, 0xc3123e03, 0x13ba5119},
 91 | {0x3f5b9d4d, 0x2847d030, 0x5da67bdd, 0x6742f2f2, 0x77c94195, 0xef933bdc, 0x6e240867, 0xeaedd915}},
 92 | {{0x264e20e8, 0x75c96e8f, 0x59a7a841, 0xabe6bfed, 0x44c8eb00, 0x2cc09c04, 0xf0c4e16b, 0xe05b3080},
 93 | {0xa45f3314, 0x1eb7777a, 0xce5d45e3, 0x56af7bed, 0x88b12f1a, 0x2b6e019a, 0xfd835f9b, 0x86659cd}},
 94 | {{0x6245e404, 0xea7d260a, 0x6e7fdfe0, 0x9de40795, 0x8dac1ab5, 0x1ff3a415, 0x649c9073, 0x3e7090f1},
 95 | {0x2b944e88, 0x1a768561, 0xe57f61c8, 0x250f939e, 0x1ead643d, 0xc0daa89, 0xe125b88e, 0x68930023}},
 96 | {{0x4b2ed709, 0xccc42563, 0x856fd30d, 0xe356769, 0x559e9811, 0xbcbcd43f, 0x5395b759, 0x738477ac},
 97 | {0xc00ee17f, 0x35752b90, 0x742ed2e3, 0x68748390, 0xbd1f5bc1, 0x7cd06422, 0xc9e7b797, 0xfbc08769}},
 98 | {{0xbc60055b, 0x72bcd8b7, 0x56e27e4b, 0x3cc23ee, 0xe4819370, 0xee337424, 0xad3da09, 0xe2aa0e43},
 99 | {0x6383c45d, 0x40b8524f, 0x42a41b25, 0xd7663554, 0x778a4797, 0x64efa6de, 0x7079adf4, 0x2042170a}}
100 | };
101 | #endif
102 | 
103 | #if include_fast_p256_basemult
104 | // This contains two tables, 8 points each in affine coordinates in montgomery form
105 | // The first table contains these points:
106 | // (2^192 - 2^128 - 2^64 - 1)G
107 | // (2^192 - 2^128 - 2^64 + 1)G
108 | // (2^192 - 2^128 + 2^64 - 1)G
109 | // (2^192 - 2^128 + 2^64 + 1)G
110 | // (2^192 + 2^128 - 2^64 - 1)G
111 | // (2^192 + 2^128 - 2^64 + 1)G
112 | // (2^192 + 2^128 + 2^64 - 1)G
113 | // (2^192 + 2^128 + 2^64 + 1)G
114 | // The second table contains the same points multiplied by 2^32
115 | static const uint32_t p256_basepoint_precomp2[2][8][2][8] =
116 | {
117 | {
118 | {{0x670844e0, 0x52d8a7c9, 0xef68a29d, 0xe33bdc, 0x4bdb7361, 0xf3d2848, 0x91c5304d, 0x5222c821},
119 | {0xdf73fc25, 0xea6d2944, 0x255c81b, 0xa04c0f55, 0xefe488a8, 0x29acdc97, 0x80a560de, 0xbe2e158f}},
120 | {{0x2b13e673, 0xfc8511ee, 0xd103ed24, 0xffc58dee, 0xea7e99b8, 0x1022523a, 0x4afc8a17, 0x8f43ea39},
121 | {0xc5f33d0b, 0x8f4e2dbc, 0xd0aa1681, 0x3bc099fa, 0x79ff9df1, 0xffbb7b41, 0xd58b57c4, 0x180de09d}},
122 | {{0x8bd1cda5, 0x56430752, 0x8e05eda5, 0x1807577f, 0x956896e9, 0x99c699b, 0xf1f0efb5, 0x83d6093d},
123 | {0xed97061c, 0xef5af17e, 0x30d4c3c, 0x35b977b8, 0x49229439, 0x81fa75a2, 0xa0b6d35d, 0xf5a22070}},
124 | {{0x74f81cf1, 0x814c5365, 0x120065b, 0xe30baff7, 0x15132621, 0x80ae1256, 0x36a80788, 0x16d2b8cb},
125 | {0xecc50bca, 0x33d14697, 0x17aedd21, 0x19a9dfb0, 0xedc3f766, 0x523fbcc7, 0xb2cf5afd, 0x9c4de6dd}},
126 | {{0xcf0d9f6d, 0x5305a9e6, 0x81a9b021, 0x5839172f, 0x75c687cf, 0xcca7a4dd, 0x844be22f, 0x36d59b3e},
127 | {0x111a53e9, 0xcace7e62, 0xf063f3a1, 0x91c843d4, 0xda812da, 0xbf77e5f0, 0x437f3176, 0xe64af9c}},
128 | {{0xcf07517d, 0xdbd568bb, 0xba6830b9, 0x2f1afba2, 0xe6c4c2a6, 0x15b6807c, 0xe4966aef, 0x91c7eabc},
129 | {0xd6b2b6e6, 0x716dea1b, 0x19f85b4b, 0x248c43d1, 0x4a315e2a, 0x16dcfd60, 0xc72b3d0b, 0x15fdd303}},
130 | {{0x42b7dfd5, 0xe40bf9f4, 0x2d934f2a, 0x673689f3, 0x30a6f50b, 0x8314beb4, 0x976ec64e, 0xd17af2bc},
131 | {0x1ee7ddf1, 0x39f66c4f, 0x68ea373c, 0x7f68e18b, 0x53d0b186, 0x5166c1f2, 0x7be58f14, 0x95dda601}},
132 | {{0x42913074, 0xd5ae356, 0x48a542b1, 0x55491b27, 0xb310732a, 0x469ca665, 0x5f1a4cc1, 0x29591d52},
133 | {0xb84f983f, 0xe76f5b6b, 0x9f5f84e1, 0xbe7eef41, 0x80baa189, 0x1200d496, 0x18ef332c, 0x6376551f}}
134 | },
135 | {
136 | {{0x7c4e54f5, 0xb9e5cbc0, 0xe1410e34, 0xc53a1a17, 0xec454425, 0x3e199130, 0x1700902e, 0xb029c97e},
137 | {0x786423b6, 0x2de66e11, 0xb41a95be, 0x262dc914, 0x451b683, 0x51766abd, 0x85bb6fb1, 0x55ad5f34}},
138 | {{0x9066cb79, 0x74f4f1c, 0x30c8b94e, 0x1ab31bd6, 0xd74275b3, 0x6d3f012f, 0x9ddcce40, 0xa214d0b1},
139 | {0xd165050a, 0x24aedf74, 0xe0e5dc3e, 0x95f17ece, 0xd9224456, 0x6ada9cda, 0x2dd60eea, 0x1fadb2d1}},
140 | {{0xe20cfb9b, 0xa3d83091, 0xba76e0cb, 0xae79c975, 0xc8858a6e, 0xa5f2a588, 0x874a3168, 0xe897a5f4},
141 | {0x7d48f096, 0xf6c1ef40, 0xc35b132c, 0x1f9c516b, 0x53c479fd, 0xe1040f91, 0x9df06743, 0x60e881f}},
142 | {{0x52a90e51, 0x9e0ad72, 0x38c50a96, 0xb7e66ea3, 0x7d997770, 0xab32ad05, 0x445671cb, 0xceaffe2},
143 | {0x5d37cc99, 0xdfbe753c, 0xe0fea2d5, 0x95d068cc, 0x4dd77cb6, 0x1e37cdda, 0x55530688, 0x88c5a4bb}},
144 | {{0xc7744f1, 0x3413f033, 0xbc816702, 0x23c05c89, 0x1192b5ac, 0x2322ee9a, 0x373180bb, 0xc1636a0},
145 | {0xbdde0207, 0xfe2f3d4, 0xc23578d8, 0xe1a093a, 0xc888ead, 0x6e5f0d1, 0x52a2b660, 0x9ca285a5}},
146 | {{0xce923964, 0xdae76995, 0xa34c7993, 0xcc96493a, 0xea73d9e7, 0xd19b5144, 0x311e6e34, 0x4a5c263},
147 | {0xd9a2a443, 0x7db5b32b, 0x2cfd960c, 0x3754bd33, 0xa430f15, 0xc5bcc98, 0xd9a94574, 0x5651201f}},
148 | {{0xfc0418fe, 0xebdd8921, 0x34e20036, 0x37015b39, 0xdf03a353, 0xcf4fcd8f, 0xf12cab16, 0xdc2de6e1},
149 | {0xd071df14, 0x9c17cc1a, 0x63415530, 0xd7c5e6a3, 0x68f3fb1e, 0xb5301660, 0x18269301, 0xb5f70bc9}},
150 | {{0x79ec1a0f, 0x2d8daefd, 0xceb39c97, 0x3bbcd6fd, 0x58f61a95, 0xf5575ffc, 0xadf7b420, 0xdbd986c4},
151 | {0x15f39eb7, 0x81aa8814, 0xb98d976c, 0x6ee2fcf5, 0xcf2f717d, 0x5465475d, 0x6860bbd0, 0x8e24d3c4}}
152 | }
153 | };
154 | #endif
155 | 
156 | #if include_p256_verify || include_p256_sign
157 | // Takes the leftmost 256 bits in hash (treated as big endian),
158 | // and converts to little endian integer z.
159 | static void hash_to_z(uint32_t z[8], const uint8_t* hash, uint32_t hashlen) {
160 |     if (hashlen > 32) {
161 |         hashlen = 32;
162 |     }
163 |     for (uint32_t i = 0; i < hashlen; i++) {
164 |         ((uint8_t*)z)[i] = hash[hashlen - 1 - i];
165 |     }
166 |     for (uint32_t i = hashlen; i < 32; i++) {
167 |         ((uint8_t*)z)[i] = 0;
168 |     }
169 | }
170 | #endif
171 | 
172 | #if include_p256_verify
173 | // Creates a representation of a (little endian integer),
174 | // so that r[0] + 2*r[1] + 2^2*r[2] + 2^3*r[3] + ... = a,
175 | // where each r[i] is -15, -13, ..., 11, 13, 15 or 0.
176 | // Only around 1/5.5 of the r[i] will be non-zero.
177 | static void slide_257(signed char r[257], const uint8_t a[32]) {
178 |     for (int i = 0; i < 256; ++i) {
179 |         r[i] = 1 & (a[i >> 3] >> (i & 7));
180 |     }
181 |     r[256] = 0;
182 | 
183 |     for (int i = 0; i < 256; i++) {
184 |         if (r[i] != 0) {
185 |             for (int b = 1; b <= 4 && i + b < 256; b++) {
186 |                 if (r[i + b] != 0) {
187 |                     if (r[i] + (r[i + b] << b) <= 15) {
188 |                         r[i] += r[i + b] << b; r[i + b] = 0;
189 |                     } else if (r[i] - (r[i + b] << b) >= -15) {
190 |                         r[i] -= r[i + b] << b;
191 |                         for (;;) {
192 |                             r[i + b] = 0;
193 |                             b++;
194 |                             if (!r[i + b]) {
195 |                                 r[i + b] = 1;
196 |                                 b--; // Will be added back after loop footer b++
197 |                                 break;
198 |                             }
199 |                         }
200 |                     } else {
201 |                         break;
202 |                     }
203 |                 }
204 |             }
205 |         }
206 |     }
207 | }
208 | #endif
209 | 
210 | #if include_p256_sign
211 | void P256_mod_n_inv(uint32_t out[8], const uint32_t in[8]) {
212 |     // This function follows the algorithm in section 12.1 of https://gcd.cr.yp.to/safegcd-20190413.pdf.
213 |     // It has been altered in the following ways:
214 |     //   1. Due to 32-bit cpu, we use 24 * 31 iterations instead of 12 * 62.
215 |     //   2. P-256 modulus instead of 2^255-19.
216 |     //      744 iterations are still enough and slightly more than the required 741 (floor((49*256+57)/17)).
217 |     //   3. Step 5 has been corrected to go back to step 2 instead of step 3.
218 |     //   4. The order of the matrix multiplications in step 6 has been changed to (T24*(T23*(T22*(...*(T1*[0, 1]))))),
219 |     //      where [0, 1] is a column vector to make it possible to be able to extract the "top-right corner", v, of T24*T23*...*T1.
220 |     //      The result v will then be contained in the first element of the resulting column vector.
221 |     
222 |     struct {
223 |         struct FGInteger fg[2]; // f and g
224 |         struct XYInteger xy[2]; // x and y
225 |     } state[2]; // "current" and "next"
226 |     
227 |     state[0].fg[0].flip_sign = 0; // non-negative f
228 |     memcpy(&state[0].fg[0].signed_value, P256_order, 36); // f
229 |     state[0].fg[1].flip_sign = 0; // non-negative g
230 |     memcpy(&state[0].fg[1].signed_value, in, 32); // g
231 |     state[0].fg[1].signed_value[8] = 0; // upper bits of g are 0
232 |     memset(&state[0].xy, 0, sizeof(state[0].xy));
233 |     // We later need a factor 2^-744. The montgomery multiplication gives 2^(24*-32)=2^-768, so multiply the init value (1) by 2^24 here.
234 |     state[0].xy[1].value[0] = 1U << 24;
235 |     
236 |     int delta = 1;
237 |     for (int i = 0; i < 24; i++) {
238 |         // Scaled translation matrix Ti
239 |         uint32_t matrix[4]; // element range: [-2^30, 2^31] (negative numbers are stored in two's complement form)
240 |         
241 |         // Decode f and g into two's complement representation and use the lowest 32 bits in the P256_divsteps2_31 calculation
242 |         uint32_t negate_f = state[i % 2].fg[0].flip_sign;
243 |         uint32_t negate_g = state[i % 2].fg[1].flip_sign;
244 |         delta = P256_divsteps2_31(delta, (state[i % 2].fg[0].signed_value[0] ^ negate_f) - negate_f, (state[i % 2].fg[1].signed_value[0] ^ negate_g) - negate_g, matrix);
245 |         
246 |         // "Jump step", calculates the new f and g values that applies after 31 divstep2 iterations
247 |         P256_matrix_mul_fg_9(matrix[0], matrix[1], state[i % 2].fg, &state[(i + 1) % 2].fg[0]);
248 |         P256_matrix_mul_fg_9(matrix[2], matrix[3], state[i % 2].fg, &state[(i + 1) % 2].fg[1]);
249 |         
250 |         // Iterate the result vector
251 |         // Due to montgomery multiplication inside this function, each step also adds a 2^-32 factor
252 |         P256_matrix_mul_mod_n(matrix[0], matrix[1], state[i % 2].xy, &state[(i + 1) % 2].xy[0]);
253 |         P256_matrix_mul_mod_n(matrix[2], matrix[3], state[i % 2].xy, &state[(i + 1) % 2].xy[1]);
254 |     }
255 |     // Calculates val^-1 = sgn(f) * v * 2^-744, where v is the "top-right corner" of the resulting T24*T23*...*T1 matrix.
256 |     // In this implementation, at this point x contains v * 2^-744.
257 |     P256_negate_mod_n_if(out, &state[0].xy[0].value[0], (state[0].xy[0].flip_sign ^ state[0].fg[0].flip_sign ^ state[0].fg[0].signed_value[8]) & 1);
258 | }
259 | #endif
260 | 
261 | #if include_p256_varmult || (include_p256_basemult && !use_fast_p256_basemult)
262 | // Constant time abs
263 | static inline uint32_t abs_int(int8_t a) {
264 |     uint32_t a_u = (uint32_t)(int32_t)a;
265 |     uint32_t mask = a_u >> 31;
266 |     mask |= mask << 1;
267 |     mask |= mask << 2;
268 |     uint32_t result = (-a) & mask;
269 |     result |= a & (mask ^ 0xf);
270 |     return result;
271 | }
272 | 
273 | // Calculates scalar*P in constant time (except for the scalars 2 and n-2, for which the results take a few extra cycles to compute)
274 | static void scalarmult_variable_base(uint32_t output_mont_x[8], uint32_t output_mont_y[8], const uint32_t input_mont_x[8], const uint32_t input_mont_y[8], const uint32_t scalar[8]) {
275 |     // Based on https://eprint.iacr.org/2014/130.pdf, Algorithm 1.
276 |     
277 |     uint32_t scalar2[8];
278 |     int8_t e[64];
279 |     
280 |     // The algorithm used requires the scalar to be odd. If even, negate the scalar modulo p to make it odd, and later negate the end result.
281 |     bool even = (scalar[0] & 1) ^ 1;
282 |     P256_negate_mod_n_if(scalar2, scalar, even);
283 |     
284 |     // Rewrite the scalar as e[0] + 2^4*e[1] + 2^8*e[2] + ... + 2^252*e[63], where each e[i] is an odd number and -15 <= e[i] <= 15.
285 |     e[0] = scalar2[0] & 0xf;
286 |     for (int i = 1; i < 64; i++) {
287 |         // Extract 4 bits
288 |         e[i] = (scalar2[i / 8] >> ((i % 8) * 4)) & 0xf;
289 |         // If even, subtract 2^4 from e[i - 1] and add 1 to e[i]
290 |         e[i - 1] -= ((e[i] & 1) ^ 1) << 4;
291 |         e[i] |= 1;
292 |     }
293 |     
294 |     // Create a table of P, 3P, 5P, ... 15P.
295 |     uint32_t table[8][3][8];
296 |     memcpy(table[0][0], input_mont_x, 32);
297 |     memcpy(table[0][1], input_mont_y, 32);
298 |     memcpy(table[0][2], one_montgomery, 32);
299 |     P256_double_j(table[7], (constarr)table[0]);
300 |     for (int i = 1; i < 8; i++) {
301 |         memcpy(table[i], table[7], 96);
302 |         P256_add_sub_j(table[i], (constarr)table[i - 1], 0, 0);
303 |     }
304 |     
305 |     // Calculate the result as (((((((((e[63]*G)*2^4)+e[62])*2^4)+e[61])*2^4)...)+e[1])*2^4)+e[0] = (2^252*e[63] + 2^248*e[62] + ... + e[0])*G.
306 |     
307 |     uint32_t current_point[3][8];
308 |     
309 |     // e[63] is never negative
310 |     #if has_d_cache
311 |     P256_select_point(current_point, (uint32_t*)table, 3, e[63] >> 1);
312 |     #else
313 |     memcpy(current_point, table[e[63] >> 1], 96);
314 |     #endif
315 |     
316 |     for (uint32_t i = 63; i --> 0;) {
317 |         for (int j = 3; j >= 0; j--) {
318 |             P256_double_j(current_point, (constarr)current_point);
319 |         }
320 |         uint32_t selected_point[3][8];
321 |         #if has_d_cache
322 |         P256_select_point(selected_point, (uint32_t*)table, 3, abs_int(e[i]) >> 1);
323 |         #else
324 |         memcpy(selected_point, table[abs_int(e[i]) >> 1], 96);
325 |         #endif
326 |         P256_negate_mod_p_if(selected_point[1], selected_point[1], (uint8_t)e[i] >> 7);
327 |         
328 |         // There is (only) one odd input scalar that leads to an exception when i == 0: n-2,
329 |         // in that case current_point will be equal to selected_point and hence a doubling
330 |         // will occur instead. We don't bother fixing the same constant time for that case since
331 |         // the probability of that random value to be generated is around 1/2^255 and an
332 |         // attacker could easily test this case anyway.
333 |         P256_add_sub_j(current_point, (constarr)selected_point, false, false);
334 |     }
335 |     P256_jacobian_to_affine(output_mont_x, output_mont_y, (constarr)current_point);
336 |     
337 |     // If the scalar was initially even, we now negate the result to get the correct result, since -(scalar*G) = (-scalar*G).
338 |     // This is done by negating y, since -(x,y) = (x,-y).
339 |     P256_negate_mod_p_if(output_mont_y, output_mont_y, even);
340 | }
341 | #endif
342 | 
343 | #define get_bit(arr, i) ((arr[(i) / 32] >> ((i) % 32)) & 1)
344 | 
345 | #if include_p256_basemult
346 | #if include_fast_p256_basemult
347 | // Calculates scalar*G in constant time
348 | static void scalarmult_fixed_base(uint32_t output_mont_x[8], uint32_t output_mont_y[8], const uint32_t scalar[8]) {
349 |     uint32_t scalar2[8];
350 |     
351 |     // Just as with the algorithm used in variable base scalar multiplication, this algorithm requires the scalar to be odd.
352 |     bool even = (scalar[0] & 1) ^ 1;
353 |     P256_negate_mod_n_if(scalar2, scalar, even);
354 |     
355 |     // This algorithm conceptually rewrites the odd scalar as s[0] + 2^1*s[1] + 2^2*s[2] + ... + 2^255*s[255], where each s[i] is -1 or 1.
356 |     // By initially setting s[i] to the corresponding bit S[i] in the original odd scalar S, we go from lsb to msb, and whenever a value s[i] is 0,
357 |     // increase s[i] by 1 and decrease s[i-1] by 2.
358 |     // This will result in that s[i] = S[i+1] == 1 ? 1 : -1 for i < 255, and s[255] = 1.
359 |     
360 |     // We then form the scalars abs(s[j] + s[j+64]*2^64 + s[j+128]*2^128 + s[j+192]*2^192)*(2^32 * floor(j / 32)) for different 0 <= j < 64.
361 |     // Each scalar times G has already been precomputed in p256_basepoint_precomp2.
362 |     // That way we only need 31 point doublings and 63 point additions.
363 |     
364 |     uint32_t current_point[3][8];
365 |     uint32_t selected_point[2][8];
366 |     
367 |     #if !has_d_cache
368 |     // Load table into RAM, for example if the the table lies on external memory mapped flash, which can easily be intercepted.
369 |     uint32_t precomp[2][8][2][8];
370 |     memcpy(precomp, p256_basepoint_precomp2, sizeof(p256_basepoint_precomp2));
371 |     #endif
372 |     
373 |     for (uint32_t i = 32; i --> 0;) {
374 |         {
375 |             uint32_t mask = get_bit(scalar2, i + 32 + 1) | (get_bit(scalar2, i + 64 + 32 + 1) << 1) | (get_bit(scalar2, i + 2 * 64 + 32 + 1) << 2);
376 |             if (i == 31) {
377 |                 #if has_d_cache
378 |                 P256_select_point(current_point, (uint32_t*)p256_basepoint_precomp2[1], 2, mask);
379 |                 #else
380 |                 memcpy(current_point, precomp[1][mask], 64);
381 |                 #endif
382 |                 memcpy(current_point[2], one_montgomery, 32);
383 |             } else {
384 |                 P256_double_j(current_point, (constarr)current_point);
385 |                 
386 |                 uint32_t sign = get_bit(scalar2, i + 3 * 64 + 32 + 1) - 1; // positive: 0, negative: -1
387 |                 mask = (mask ^ sign) & 7;
388 |                 #if has_d_cache
389 |                 P256_select_point(selected_point, (uint32_t*)p256_basepoint_precomp2[1], 2, mask);
390 |                 #else
391 |                 memcpy(selected_point, precomp[1][mask], 64);
392 |                 #endif
393 |                 P256_negate_mod_p_if(selected_point[1], selected_point[1], sign & 1);
394 |                 P256_add_sub_j(current_point, (constarr)selected_point, false, true);
395 |             }
396 |         }
397 |         {
398 |             uint32_t mask = get_bit(scalar2, i + 1) | (get_bit(scalar2, i + 64 + 1) << 1) | (get_bit(scalar2, i + 2 * 64 + 1) << 2);
399 |             uint32_t sign = get_bit(scalar2, i + 3 * 64 + 1) - 1; // positive: 0, negative: -1
400 |             mask = (mask ^ sign) & 7;
401 |             #if has_d_cache
402 |             P256_select_point(selected_point, (uint32_t*)p256_basepoint_precomp2[0], 2, mask);
403 |             #else
404 |             memcpy(selected_point, precomp[0][mask], 64);
405 |             #endif
406 |             P256_negate_mod_p_if(selected_point[1], selected_point[1], sign & 1);
407 |             P256_add_sub_j(current_point, (constarr)selected_point, false, true);
408 |         }
409 |     }
410 |     P256_jacobian_to_affine(output_mont_x, output_mont_y, (constarr)current_point);
411 |     
412 |     // Negate final result if the scalar was initially even.
413 |     P256_negate_mod_p_if(output_mont_y, output_mont_y, even);
414 | }
415 | #else
416 | static void scalarmult_fixed_base(uint32_t output_mont_x[8], uint32_t output_mont_y[8], const uint32_t scalar[8]) {
417 |     #if !include_p256_verify
418 |     static const uint32_t p[2][8] =
419 |     {{0x18a9143c, 0x79e730d4, 0x5fedb601, 0x75ba95fc, 0x77622510, 0x79fb732b, 0xa53755c6, 0x18905f76},
420 |     {0xce95560a, 0xddf25357, 0xba19e45c, 0x8b4ab8e4, 0xdd21f325, 0xd2e88688, 0x25885d85, 0x8571ff18}};
421 |     scalarmult_variable_base(output_mont_x, output_mont_y, p[0], p[1], scalar);
422 |     #else
423 |     scalarmult_variable_base(output_mont_x, output_mont_y, p256_basepoint_precomp[0][0], p256_basepoint_precomp[0][1], scalar);
424 |     #endif
425 | }
426 | #endif
427 | #endif
428 | 
429 | void p256_convert_endianness(void* output, const void* input, size_t byte_len) {
430 |     for (size_t i = 0; i < byte_len / 2; i++) {
431 |         uint8_t t = ((uint8_t*)input)[byte_len - 1 - i];
432 |         ((uint8_t*)output)[byte_len - 1 - i] = ((uint8_t*)input)[i];
433 |         ((uint8_t*)output)[i] = t;
434 |     }
435 | }
436 | 
437 | #if include_p256_verify
438 | bool p256_verify(const uint32_t public_key_x[8], const uint32_t public_key_y[8], const uint8_t* hash, uint32_t hashlen_in_bytes, const uint32_t r[8], const uint32_t s[8]) {
439 |     if (!P256_check_range_n(r) || !P256_check_range_n(s)) {
440 |         return false;
441 |     }
442 |     
443 |     if (!P256_check_range_p(public_key_x) || !P256_check_range_p(public_key_y)) {
444 |         return false;
445 |     }
446 |     
447 |     uint32_t pk_table[8][3][8];
448 |     P256_to_montgomery(pk_table[0][0], public_key_x);
449 |     P256_to_montgomery(pk_table[0][1], public_key_y);
450 |     memcpy(pk_table[0][2], one_montgomery, 32);
451 |     
452 |     if (!P256_point_is_on_curve(pk_table[0][0], pk_table[0][1])) {
453 |         return false;
454 |     }
455 |     
456 |     // Create a table of P, 3P, 5P, ..., 15P, where P is the public key.
457 |     P256_double_j(pk_table[7], (constarr)pk_table[0]);
458 |     for (int i = 1; i < 8; i++) {
459 |         memcpy(pk_table[i], pk_table[7], 96);
460 |         P256_add_sub_j(pk_table[i], (constarr)pk_table[i - 1], 0, 0);
461 |     }
462 |     
463 |     uint32_t z[8], w[8], u1[8], u2[8];
464 |     
465 |     hash_to_z(z, hash, hashlen_in_bytes);
466 |     
467 |     #if include_p256_sign
468 |     P256_mod_n_inv(w, s);
469 |     #else
470 |     // Use smaller implementation if we don't need constant time version
471 |     P256_mod_n_inv_vartime(w, s);
472 |     #endif
473 |     
474 |     P256_mul_mod_n(u1, z, w);
475 |     P256_mul_mod_n(u2, r, w);
476 |     
477 |     // Each value in these arrays will be an odd integer v, so that -15 <= v <= 15.
478 |     // Around 1/5.5 of them will be non-zero.
479 |     signed char slide_bp[257], slide_pk[257];
480 |     slide_257(slide_bp, (uint8_t*)u1);
481 |     slide_257(slide_pk, (uint8_t*)u2);
482 |     
483 |     uint32_t cp[3][8] = {0};
484 |     
485 |     for (int i = 256; i >= 0; i--) {
486 |         P256_double_j(cp, (constarr)cp);
487 |         if (slide_bp[i] > 0) {
488 |             P256_add_sub_j(cp, p256_basepoint_precomp[slide_bp[i]/2], 0, 1);
489 |         } else if (slide_bp[i] < 0) {
490 |             P256_add_sub_j(cp, p256_basepoint_precomp[(-slide_bp[i])/2], 1, 1);
491 |         }
492 |         if (slide_pk[i] > 0) {
493 |             P256_add_sub_j(cp, (constarr)pk_table[slide_pk[i]/2], 0, 0);
494 |         } else if (slide_pk[i] < 0) {
495 |             P256_add_sub_j(cp, (constarr)pk_table[(-slide_pk[i])/2], 1, 0);
496 |         }
497 |     }
498 |     
499 |     return P256_verify_last_step(r, (constarr)cp);
500 | }
501 | #endif
502 | 
503 | #if include_p256_sign
504 | bool p256_sign_step1(struct SignPrecomp *result, const uint32_t k[8]) {
505 |     do {
506 |         uint32_t point_res[2][8];
507 |         if (!P256_check_range_n(k)) {
508 |             break;
509 |         }
510 |         scalarmult_fixed_base(point_res[0], point_res[1], k);
511 |         P256_mod_n_inv(result->k_inv, k);
512 |         P256_from_montgomery(result->r, point_res[0]);
513 |         P256_reduce_mod_n_32bytes(result->r, result->r);
514 |         
515 |         uint32_t r_sum = 0;
516 |         for (int i = 0; i < 8; i++) {
517 |             r_sum |= result->r[i];
518 |         }
519 |         if (r_sum == 0) {
520 |             break;
521 |         }
522 |         return true;
523 |     } while (false);
524 |     
525 |     memset(result, 0, sizeof(struct SignPrecomp));
526 |     return false;
527 | }
528 | 
529 | bool p256_sign_step2(uint32_t r[8], uint32_t s[8], const uint8_t* hash, uint32_t hashlen_in_bytes, const uint32_t private_key[8], struct SignPrecomp *sign_precomp) {
530 |     do {
531 |         if (!P256_check_range_n(sign_precomp->k_inv) || !P256_check_range_n(sign_precomp->r)) { // just make sure user did not input an obviously invalid precomp
532 |             break;
533 |         }
534 |         uint32_t *const z = r;
535 |         hash_to_z(z, hash, hashlen_in_bytes);
536 |         P256_mul_mod_n(s, sign_precomp->r, private_key);
537 |         P256_add_mod_n(s, z, s);
538 |         P256_mul_mod_n(s, sign_precomp->k_inv, s);
539 |         
540 |         memcpy(r, sign_precomp->r, 32);
541 |         
542 |         uint32_t s_sum = 0;
543 |         for (int i = 0; i < 8; i++) {
544 |             s_sum |= s[i];
545 |         }
546 |         if (s_sum == 0) {
547 |             break;
548 |         }
549 |         memset(sign_precomp, 0, sizeof(*sign_precomp));
550 |         return true;
551 |     } while (false);
552 |     
553 |     memset(r, 0, 32);
554 |     memset(s, 0, 32);
555 |     return false;
556 | }
557 | 
558 | bool p256_sign(uint32_t r[8], uint32_t s[8], const uint8_t* hash, uint32_t hashlen_in_bytes, const uint32_t private_key[8], const uint32_t k[8]) {
559 |     struct SignPrecomp t;
560 |     if (!p256_sign_step1(&t, k)) {
561 |         memset(r, 0, 32);
562 |         memset(s, 0, 32);
563 |         return false;
564 |     }
565 |     return p256_sign_step2(r, s, hash, hashlen_in_bytes, private_key, &t);
566 | }
567 | #endif
568 | 
569 | #if include_p256_keygen || include_p256_raw_scalarmult_base
570 | bool p256_scalarmult_base(uint32_t result_x[8], uint32_t result_y[8], const uint32_t scalar[8]) {
571 |     if (!P256_check_range_n(scalar)) {
572 |         return false;
573 |     }
574 |     scalarmult_fixed_base(result_x, result_y, scalar);
575 |     P256_from_montgomery(result_x, result_x);
576 |     P256_from_montgomery(result_y, result_y);
577 |     return true;
578 |     
579 | }
580 | 
581 | #if include_p256_keygen
582 | bool p256_keygen(uint32_t public_key_x[8], uint32_t public_key_y[8], const uint32_t private_key[8]) {
583 |     return p256_scalarmult_base(public_key_x, public_key_y, private_key);
584 | }
585 | #endif
586 | #endif
587 | 
588 | 
589 | #if include_p256_varmult
590 | static bool p256_scalarmult_generic_no_scalar_check(uint32_t output_mont_x[8], uint32_t output_mont_y[8], const uint32_t scalar[8], const uint32_t in_x[8], const uint32_t in_y[8]) {
591 |     if (!P256_check_range_p(in_x) || !P256_check_range_p(in_y)) {
592 |         return false;
593 |     }
594 |     
595 |     P256_to_montgomery(output_mont_x, in_x);
596 |     P256_to_montgomery(output_mont_y, in_y);
597 |     
598 |     if (!P256_point_is_on_curve(output_mont_x, output_mont_y)) {
599 |         return false;
600 |     }
601 |     
602 |     scalarmult_variable_base(output_mont_x, output_mont_y, output_mont_x, output_mont_y, scalar);
603 |     return true;
604 | }
605 | 
606 | #if include_p256_raw_scalarmult_generic
607 | bool p256_scalarmult_generic(uint32_t result_x[8], uint32_t result_y[8], const uint32_t scalar[8], const uint32_t in_x[8], const uint32_t in_y[8]) {
608 |     if (!P256_check_range_n(scalar) || !p256_scalarmult_generic_no_scalar_check(result_x, result_y, scalar, in_x, in_y)) {
609 |         return false;
610 |     }
611 |     P256_from_montgomery(result_x, result_x);
612 |     P256_from_montgomery(result_y, result_y);
613 |     return true;
614 | }
615 | #endif
616 | 
617 | #if include_p256_ecdh
618 | bool p256_ecdh_calc_shared_secret(uint8_t shared_secret[32], const uint32_t private_key[8], const uint32_t others_public_key_x[8], const uint32_t others_public_key_y[8]) {
619 |     uint32_t result_x[8], result_y[8];
620 |     if (!p256_scalarmult_generic_no_scalar_check(result_x, result_y, private_key, others_public_key_x, others_public_key_y)) {
621 |         return false;
622 |     }
623 |     P256_from_montgomery(result_x, result_x);
624 |     p256_convert_endianness(shared_secret, result_x, 32);
625 |     return true;
626 | }
627 | #endif
628 | #endif
629 | 
630 | #if include_p256_to_octet_string_uncompressed
631 | void p256_point_to_octet_string_uncompressed(uint8_t out[65], const uint32_t x[8], const uint32_t y[8]) {
632 |     out[0] = 4;
633 |     p256_convert_endianness(out + 1, x, 32);
634 |     p256_convert_endianness(out + 33, y, 32);
635 | }
636 | #endif
637 | 
638 | #if include_p256_to_octet_string_compressed
639 | void p256_point_to_octet_string_compressed(uint8_t out[33], const uint32_t x[8], const uint32_t y[8]) {
640 |     out[0] = 2 + (y[0] & 1);
641 |     p256_convert_endianness(out + 1, x, 32);
642 | }
643 | #endif
644 | 
645 | #if include_p256_to_octet_string_hybrid
646 | void p256_point_to_octet_string_hybrid(uint8_t out[65], const uint32_t x[8], const uint32_t y[8]) {
647 |     out[0] = 6 + (y[0] & 1);
648 |     p256_convert_endianness(out + 1, x, 32);
649 |     p256_convert_endianness(out + 33, y, 32);
650 | }
651 | #endif
652 | 
653 | #if include_p256_decode_point || include_p256_decompress_point
654 | bool p256_octet_string_to_point(uint32_t x[8], uint32_t y[8], const uint8_t* input, uint32_t input_len_in_bytes) {
655 |     if (input_len_in_bytes < 33) return false;
656 |     p256_convert_endianness(x, input + 1, 32);
657 |     if (!P256_check_range_p(x)) {
658 |         return false;
659 |     }
660 |     #if include_p256_decode_point
661 |     if ((input[0] == 4 || (input[0] >> 1) == 3) && input_len_in_bytes == 65) {
662 |         p256_convert_endianness(y, input + 33, 32);
663 |         if (!P256_check_range_p(y)) {
664 |             return false;
665 |         }
666 |         if ((input[0] >> 1) == 3 && (input[0] & 1) != (y[0] & 1)) {
667 |             return false;
668 |         }
669 |         uint32_t x_mont[8], y_mont[8];
670 |         P256_to_montgomery(x_mont, x);
671 |         P256_to_montgomery(y_mont, y);
672 |         return P256_point_is_on_curve(x_mont, y_mont);
673 |     }
674 |     #endif
675 |     #if include_p256_decompress_point
676 |     if ((input[0] >> 1) == 1 && input_len_in_bytes == 33) {
677 |         return P256_decompress_point(y, x, input[0] & 1);
678 |     }
679 |     #endif
680 |     return false;
681 | }
682 | #endif
683 | 


--------------------------------------------------------------------------------
/p256-cortex-m4.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2017-2021 Emil Lenngren
  3 |  * Copyright (c) 2021 Shortcut Labs AB
  4 |  *
  5 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  6 |  * of this software and associated documentation files (the "Software"), to deal
  7 |  * in the Software without restriction, including without limitation the rights
  8 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 |  * copies of the Software, and to permit persons to whom the Software is
 10 |  * furnished to do so, subject to the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included in all
 13 |  * copies or substantial portions of the Software.
 14 |  *
 15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 |  * SOFTWARE.
 22 |  */
 23 | 
 24 | #ifndef P256_CORTEX_M4_H
 25 | #define P256_CORTEX_M4_H
 26 | 
 27 | #include <stdint.h>
 28 | #include <stdbool.h>
 29 | #include <stddef.h>
 30 | 
 31 | #include "p256-cortex-m4-config.h"
 32 | 
 33 | /*
 34 | 
 35 | Implementation of P-256 Elliptic Curve operations for 32-bit ARMv7E-M processors or later.
 36 | 
 37 | The functions below have the following conventions:
 38 | - Arrays of type uint32_t represent 256-bit integers, stored using little endian byte order on a 4-byte alignment.
 39 | - Pointer/array parameters are input parameters when they are const and output parameters when they are not
 40 |     const, unless otherwise stated.
 41 | - All functions that take a public key as parameter will validate that the public key corresponds to a valid
 42 |     point and return false if validation fails.
 43 | - When a function returns false, the output parameters, if any, will not contain valid data and should hence
 44 |     not be inspected.
 45 | - There are no checks for null pointers, unaligned uint32_t-pointers, invalid private keys etc.
 46 | 
 47 | If there is a need to convert a big endian byte string to an array of little endian uint32_t integers or vice
 48 | versa, the p256_convert_endianness function may be used for this purpose.
 49 | 
 50 | Note: code that processes secret data runs in constant time, in order to mitigate side channel attacks.
 51 | 
 52 | */
 53 | 
 54 | /**
 55 |  * Converts endianness by reversing the input value.
 56 |  *
 57 |  * The output and input pointers may refer to the same location and have no alignment requirements.
 58 |  */
 59 | void p256_convert_endianness(void* output, const void* input, size_t byte_len);
 60 | 
 61 | #if include_p256_verify
 62 | /**
 63 |  * Verifies an ECDSA signature.
 64 |  *
 65 |  * Returns true if the signature is valid for the given input, otherwise false.
 66 |  */
 67 | bool p256_verify(const uint32_t public_key_x[8], const uint32_t public_key_y[8],
 68 |                  const uint8_t* hash, uint32_t hashlen_in_bytes,
 69 |                  const uint32_t r[8], const uint32_t s[8])
 70 |                  __attribute__((warn_unused_result));
 71 | #endif
 72 | 
 73 | #if include_p256_sign
 74 | /**
 75 |  * Creates an ECDSA signature.
 76 |  *
 77 |  * The parameter "k" shall consist of a 256-bit random integer value. This random value MUST be generated from
 78 |  * a cryptographically secure random number generator, and MUST be unique for every pair of message hash and
 79 |  * private key.
 80 |  *
 81 |  * With a small probability (~ 2^-32), this function will fail and return false for the given "k" and this
 82 |  * function MUST in that case be called again with a new random "k", until true is returned. This is in line
 83 |  * with the ECDSA standard.
 84 |  *
 85 |  * As an alternative to using a random "k", "k" might be derived deterministically from the input, using a
 86 |  * sophisticated hash construction such as RFC 6979, or e.g. by hashing the private key, message hash and a
 87 |  * retry counter, using a secure hash function such as SHA-256.
 88 |  */
 89 | bool p256_sign(uint32_t r[8], uint32_t s[8],
 90 |                const uint8_t* hash, uint32_t hashlen_in_bytes,
 91 |                const uint32_t private_key[8], const uint32_t k[8])
 92 |                __attribute__((warn_unused_result));
 93 | 
 94 | /**
 95 |  * Sign precomputation state.
 96 |  *
 97 |  * The content shall be treated as opaque to the API user and shall not be inspected or modified.
 98 |  */
 99 | struct SignPrecomp {
100 |     uint32_t r[8];
101 |     uint32_t k_inv[8];
102 | };
103 | 
104 | /**
105 |  * Creates an ECDSA signature, using a two-step procedure.
106 |  *
107 |  * This function performs the first of two steps, and accounts for 99% of the time spent for generating an
108 |  * ECDSA signature.
109 |  *
110 |  * By splitting up into two steps, most of the work could be spent before deciding what message to sign, or
111 |  * which private key to use.
112 |  *
113 |  * The parameter "k" shall consist of a 256-bit random integer value. This random value MUST be generated from
114 |  * a cryptographically secure random number generator, and MUST be unique for every pair of message hash and
115 |  * private key.
116 |  *
117 |  * With a small probability (~ 2^-32), this function will fail and return false for the given "k" and this
118 |  * function MUST in that case be called again with a new random "k", until true is returned. This is in line
119 |  * with the ECDSA standard.
120 |  *
121 |  * As an alternative to using a random "k", "k" might be derived deterministically from the input, using a
122 |  * sophisticated hash construction such as RFC 6979, or e.g. by hashing the private key, message hash and a
123 |  * retry counter, using a secure hash function such as SHA-256.
124 |  *
125 |  * The "result" parameter will contain the computed state, that is later to be passed to p256_sign_step2.
126 |  * A result state MUST NOT be reused for generating multiple signatures.
127 |  */
128 | bool p256_sign_step1(struct SignPrecomp *result, const uint32_t k[8]) __attribute__((warn_unused_result));
129 | 
130 | /**
131 |  * Second step of creating an ECDSA signature, using a two-step procedure.
132 |  *
133 |  * This function performs the second of two steps, and accounts for the last 1% of the time spent for generating
134 |  * an ECDSA signature.
135 |  *
136 |  * The "sign_precomp" parameter shall contain a pointer to a state generated by p256_sign_step1.
137 |  *
138 |  * With a small probability (~ 2^-256), this function will fail, due to the given "k" from the first step is
139 |  * not compatible with the rest of the input, and return false. In this case, the procedure MUST be started
140 |  * over from step 1 with a new random "k".  This is in line with the ECDSA standard. Otherwise true is returned
141 |  * and the signature is placed in "r" and "s".
142 |  *
143 |  * When this function returns, "sign_precomp" is also zeroed out and may hence not be reused.
144 |  */
145 | bool p256_sign_step2(uint32_t r[8], uint32_t s[8], const uint8_t* hash, uint32_t hashlen_in_bytes,
146 |                      const uint32_t private_key[8], struct SignPrecomp *sign_precomp)
147 |                      __attribute__((warn_unused_result));
148 | #endif
149 | 
150 | #if include_p256_keygen
151 | /**
152 |  * Calculates the public key from a given private key for use by either ECDSA or ECDH.
153 |  *
154 |  * The private key shall be taken from a random value that MUST have been generated by a cryptographically
155 |  * secure random number generator that generates 256 random bits. This function validates that the private key
156 |  * lies in the accepted range 1 to n-1, where n is the order of the elliptic curve, and returns true only if
157 |  * this validation succeeds. If random value is out of that range, false is returned and in this case a new
158 |  * random value needs to be generated and this function MUST be called again until true is returned.
159 |  *
160 |  * The public key is created by performing a scalar multiplication of the private key and the base point of
161 |  * the curve.
162 |  *
163 |  * Only use a keypair for either ECDSA or ECDH, not both, and don't use the private key for any other purposes.
164 |  */
165 | bool p256_keygen(uint32_t public_key_x[8], uint32_t public_key_y[8],
166 |                  const uint32_t private_key[8])
167 |                  __attribute__((warn_unused_result));
168 | #endif
169 | 
170 | #if include_p256_ecdh
171 | /**
172 |  * Generates the shared secret according to the ECDH standard.
173 |  *
174 |  * The shared secret parameter will contain the big endian encoding for the x coordinate of the scalar
175 |  * multiplication of the private key and the input point (other's public key), if the function succeeds.
176 |  *
177 |  * If the other's public key point does not lie on the curve, this function fails and false is returned.
178 |  * Otherwise, shared secret is calculated and true is returned.
179 |  *
180 |  * NOTE: The return value MUST be checked since the other's public key point cannot generally be trusted.
181 |  */
182 | bool p256_ecdh_calc_shared_secret(uint8_t shared_secret[32], const uint32_t private_key[8],
183 |                                   const uint32_t others_public_key_x[8], const uint32_t others_public_key_y[8])
184 |                                   __attribute__((warn_unused_result));
185 | #endif
186 | 
187 | #if include_p256_raw_scalarmult_base
188 | /**
189 |  * Raw scalar multiplication by the base point of the elliptic curve.
190 |  *
191 |  * This function can be used to implement custom algorithms using the P-256 curve.
192 |  *
193 |  * This function validates that the scalar lies in the accepted range 1 to n-1, where n is the order of the
194 |  * elliptic curve, and returns true only if this validation succeeds. Otherwise false is returned.
195 |  */
196 | bool p256_scalarmult_base(uint32_t result_x[8], uint32_t result_y[8], const uint32_t scalar[8]);
197 | #endif
198 | 
199 | #if include_p256_raw_scalarmult_generic
200 | /**
201 |  * Raw scalar multiplication by any point on the elliptic curve.
202 |  *
203 |  * This function can be used to implement custom algorithms using the P-256 curve.
204 |  *
205 |  * This function validates all inputs and proceeds only if the scalar is within the range 1 to n-1, where n
206 |  * is the order of the elliptic curve, and the input point's coordinates are each less than the order of
207 |  * the prime field. If validation succeeds, true is returned. Otherwise false is returned.
208 |  */
209 | bool p256_scalarmult_generic(uint32_t result_x[8], uint32_t result_y[8],
210 | 						     const uint32_t scalar[8], const uint32_t in_x[8], const uint32_t in_y[8]);
211 | #endif
212 | 
213 | // These functions create a big endian octet string representation of a point according to the X.92 standard.
214 | 
215 | #if include_p256_to_octet_string_uncompressed
216 | /**
217 |  * Uncompressed encoding: "04 || Px || Py".
218 |  */
219 | void p256_point_to_octet_string_uncompressed(uint8_t out[65], const uint32_t x[8], const uint32_t y[8]);
220 | #endif
221 | 
222 | #if include_p256_to_octet_string_compressed
223 | /**
224 |  * Compressed encoding: "02 || Px" if Py is even and "03 || Px" if Py is odd.
225 |  */
226 | void p256_point_to_octet_string_compressed(uint8_t out[33], const uint32_t x[8], const uint32_t y[8]);
227 | #endif
228 | 
229 | #if include_p256_to_octet_string_hybrid
230 | /**
231 |  * Hybrid encoding: "06 || Px || Py" if Py is even and "07 || Px || Py" if Py is odd (a pretty useless encoding).
232 |  */
233 | void p256_point_to_octet_string_hybrid(uint8_t out[65], const uint32_t x[8], const uint32_t y[8]);
234 | #endif
235 | 
236 | #if include_p256_decode_point || include_p256_decompress_point
237 | /**
238 |  * Decodes a point according to the three encodings above.
239 |  *
240 |  * include_p256_decode_point: first byte is "04", "06" or "07" and input length is 65 bytes
241 |  * include_p256_decompress_point: first byte is "02" or "03" and input length is 33 bytes
242 |  *
243 |  * Returns true if the input string confirms to a valid encoding and the point lies on the curve,
244 |  * otherwise false.
245 |  *
246 |  * NOTE: The return value MUST be checked in case the point is not guaranteed to lie on the curve (e.g. if it
247 |  * is received from an untrusted party).
248 |  */
249 | bool p256_octet_string_to_point(uint32_t x[8], uint32_t y[8],
250 |                                 const uint8_t* input, uint32_t input_len_in_bytes)
251 |                                 __attribute__((warn_unused_result));
252 | #endif
253 | 
254 | #endif
255 | 


--------------------------------------------------------------------------------
/testgen.js:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2021 Shortcut Labs AB
  3 |  *
  4 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  5 |  * of this software and associated documentation files (the "Software"), to deal
  6 |  * in the Software without restriction, including without limitation the rights
  7 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 |  * copies of the Software, and to permit persons to whom the Software is
  9 |  * furnished to do so, subject to the following conditions:
 10 |  *
 11 |  * The above copyright notice and this permission notice shall be included in all
 12 |  * copies or substantial portions of the Software.
 13 |  *
 14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 17 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 20 |  * SOFTWARE.
 21 |  */
 22 | 
 23 | // execute "node testgen.js > tests.c", with a nodejs version >= 10.4
 24 | 
 25 | const assert = require('assert');
 26 | const https = require('https');
 27 | const crypto = require('crypto');
 28 | 
 29 | // Simple ECDSA implementation that is correct, but slow and not side channel safe. Only used to generate test data.
 30 | q = 2n**256n - 2n**224n + 2n**192n + 2n**96n - 1n
 31 | G = {x: 0x6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296n, y: 0x4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5n}
 32 | n = 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551n
 33 | 
 34 | function div2(v, mod) {
 35 | 	return (v & 1n) ? (v + mod) >> 1n : v >> 1n;
 36 | }
 37 | 
 38 | // https://eprint.iacr.org/2014/130.pdf, algorithm 10
 39 | function pointDbl(p) {
 40 | 	t1 = p.z**2n
 41 | 	t2 = p.x + t1
 42 | 	t1 = p.x - t1
 43 | 	t1 = t1 * t2
 44 | 	t2 = div2(t1, q)
 45 | 	t1 = t1 + t2
 46 | 	t2 = p.y**2n
 47 | 	t3 = p.x * t2
 48 | 	t4 = t1**2n
 49 | 	t4 = t4 - t3
 50 | 	x = t4 - t3
 51 | 	z = p.y * p.z
 52 | 	t2 = t2**2n
 53 | 	t4 = t3 - x
 54 | 	t1 = t1 * t4
 55 | 	y = t1 - t2
 56 | 	return {x: x % q, y: y % q, z: z % q}
 57 | }
 58 | 
 59 | // https://eprint.iacr.org/2014/130.pdf, algorithm 13
 60 | function pointAdd(p1, p2) {
 61 | 	if (p1.z === 0n) {
 62 | 		return {x: p2.x, y: p2.y, z: 1n}
 63 | 	}
 64 | 	t1 = p1.z**2n
 65 | 	t2 = p1.z * t1
 66 | 	t1 = t1 * p2.x
 67 | 	t2 = t2 * p2.y
 68 | 	t1 = t1 - p1.x
 69 | 	t2 = t2 - p1.y
 70 | 	t1 %= q
 71 | 	t2 %= q
 72 | 	if (t1 === 0n) {
 73 | 		if (t2 === 0n) {
 74 | 			return pointDbl(p1)
 75 | 		} else {
 76 | 			return {x: 0n, y: 0n, z: 0n}
 77 | 		}
 78 | 	}
 79 | 	z = p1.z * t1
 80 | 	t3 = t1**2n
 81 | 	t4 = t1 * t3
 82 | 	t3 = p1.x * t3
 83 | 	t1 = t3 + t3
 84 | 	x = t2**2n
 85 | 	x = x - t1
 86 | 	x = x - t4
 87 | 	t3 = t3 - x
 88 | 	t3 = t2 * t3
 89 | 	t4 = t4 * p1.y
 90 | 	y = t3 - t4
 91 | 	return {x: x % q, y: y % q, z: z % q}
 92 | }
 93 | 
 94 | // "D. Hankerson, A. Menezes, and S. Vanstone, Guide to
 95 | // Elliptic Curve Cryptography, 2004" Algorithm 2.22
 96 | // (Extended Stein's GCD algorithm)
 97 | function modInv(val, mod) {
 98 | 	if (val === 0n) {
 99 | 		return 0n;
100 | 	}
101 | 	let u = val, v = mod;
102 | 	let x1 = 1n, x2 = 0n;
103 | 	while (u !== 1n && v !== 1n) {
104 | 		while ((u & 1n) === 0n) {
105 | 			u = u >> 1n;
106 | 			x1 = div2(x1, mod);
107 | 		}
108 | 		while ((v & 1n) === 0n) {
109 | 			v = v >> 1n;
110 | 			x2 = div2(x2, mod);
111 | 		}
112 | 		if (u >= v) {
113 | 			u = u - v;
114 | 			x1 = x1 - x2;
115 | 			if (x1 < 0n) {
116 | 				x1 = x1 + mod;
117 | 			}
118 | 		} else {
119 | 			v = v - u;
120 | 			x2 = x2 - x1;
121 | 			if (x2 < 0n) {
122 | 				x2 = x2 + mod;
123 | 			}
124 | 		}
125 | 	}
126 | 	return u === 1n ? x1 : x2;
127 | }
128 | 
129 | function scalarmult(scalar, p) {
130 | 	// this implementation does not verify that p lies on the curve
131 | 	if (scalar === 0n) {
132 | 		return {x: 0n, y: 0n};
133 | 	}
134 | 	let neg = scalar < 0n;
135 | 	if (neg) {
136 | 		scalar = -scalar;
137 | 	}
138 | 	let result = {x: 0n, y: 0n, z: 0n};
139 | 	const numBits = scalar.toString(2).length;
140 | 	for (let i = numBits - 1; i >= 0; i--) {
141 | 		result = pointDbl(result);
142 | 		if (scalar & (1n << BigInt(i))) {
143 | 			result = pointAdd(result, p);
144 | 		}
145 | 	}
146 | 	if (neg) {
147 | 		result.y = -result.y;
148 | 	}
149 | 	result = {x: (result.x + q) % q, y: (result.y + q) % q, z: (result.z + q) % q};
150 | 	const zInv = modInv(result.z, q);
151 | 	return {x: (result.x * zInv**2n) % q, y: (result.y * zInv**3n) % q};
152 | }
153 | 
154 | function sign(z, privKey, k) {
155 | 	if ((k < 1n || k >= n) || (privKey < 1n || privKey >= n)) {
156 | 		return null;
157 | 	}
158 | 	const point = scalarmult(k, G);
159 | 	const r = point.x % n;
160 | 	if (r === 0n) {
161 | 		return null;
162 | 	}
163 | 	const s = modInv(k, n) * (z + r * privKey) % n;
164 | 	if (s === 0n) {
165 | 		return null;
166 | 	}
167 | 	return {r: r, s: s};
168 | }
169 | 
170 | function bufferToBigInt(b) {
171 | 	return BigInt('0x' + b.toString('hex'));
172 | }
173 | 
174 | function bigIntToBuffer(v, len) {
175 | 	let str = v.toString(16);
176 | 	if (str.length & 1) {
177 | 		str = '0' + str;
178 | 	}
179 | 	let buf = Buffer.from(str, 'hex');
180 | 	if (buf.length < len) {
181 | 		buf = Buffer.concat([Buffer.alloc(len - buf.length), buf]);
182 | 	}
183 | 	return buf;
184 | }
185 | 
186 | function sha256(v) {
187 | 	return crypto.createHash('sha256').update(v).digest();
188 | }
189 | 
190 | function httpsRequest(url) {
191 | 	return new Promise((resolve, reject) => {
192 | 		let data = '';
193 | 
194 | 		https.get(url, (res) => {
195 | 			res.on('data', (chunk) => {
196 | 				data += chunk;
197 | 			});
198 | 			res.on('end', () => {
199 | 				resolve(data);
200 | 			});
201 | 			res.on('error', (error) => {
202 | 				reject(error);
203 | 			});
204 | 		});
205 | 	});
206 | }
207 | 
208 | function toUIntArr(v, size, resizeLen) {
209 | 	const len = v.length / size;
210 | 	let res = [];
211 | 	for (let i = 0; i < len; i++) {
212 | 		res.push('0x' + v.slice(i * size, (i + 1) * size).toString('hex'));
213 | 	}
214 | 	for (let i = 0; i < res.length / resizeLen; i++) {
215 | 		const start = i * resizeLen;
216 | 		for (let j = 0; j < resizeLen / 2; j++) {
217 | 			const v1 = res[start + j];
218 | 			const v2 = res[start + resizeLen - 1 - j];
219 | 			res[start + j] = v2;
220 | 			res[start + resizeLen - 1 - j] = v1;
221 | 		}
222 | 	}
223 | 	
224 | 	return '{' + res.join(',') + '}';
225 | }
226 | 
227 | async function ecdhTests() {
228 | 	let data = await httpsRequest('https://raw.githubusercontent.com/google/wycheproof/master/testvectors/ecdh_secp256r1_test.json');
229 | 	
230 | 	data = JSON.parse(data);
231 | 	const testGroups = data.testGroups;
232 | 	const newTests = [];
233 | 	testGroups.forEach((testGroup) => {
234 | 		testGroup.tests.forEach((test) => {
235 | 			let pub = Buffer.from(test.public, 'hex');
236 | 			const prefix = Buffer.from('301306072a8648ce3d020106082a8648ce3d03010703', 'hex');
237 | 			if (pub[0] !== 0x30 || pub[1] !== pub.length - 2 || !pub.slice(2, 24).equals(prefix) || pub[24] !== pub.length - 25 || pub[25] !== 0x00 || (pub[24] !== 66 && pub[24] !== 34)) {
238 | 				if (test.flags.some(f => f === 'UnnamedCurve' || f === 'InvalidAsn')) {
239 | 					return;
240 | 				}
241 | 				assert.equal(test.result, 'invalid', test.tcId);
242 | 				//console.log(test.tcId + ' ' + test.comment);
243 | 				return;
244 | 			}
245 | 			pub = pub.slice(26);
246 | 			
247 | 			if (!((pub.length == 65 && pub[0] == 0x04) || (pub.length == 66 && pub[0] != 0x04))) {
248 | 				test.result = 'invalid';
249 | 				test.shared = '';
250 | 			}
251 | 			if (pub.equals(Buffer.from('042998705a9a71c783e1cf4397dbed9375a44e4cb88053594b0ea982203b6363b063d0af4971d1c3813db3c7799f9f9324cbe1b90054c81b510ff6297160add6eb', 'hex'))) {
252 | 				// Bug in test case #454, should not be accepted, but invalid, since point does not lie on curve
253 | 				test.result = 'invalid';
254 | 				test.shared = '';
255 | 			}
256 | 			
257 | 			
258 | 			let priv = Buffer.from(test.private, 'hex');
259 | 			if (priv.length > 32) {
260 | 				if (priv.length !== 33 || priv[0] !== 0x00 || priv[1] < 0x80) {
261 | 					assert.equal(test.result, 'invalid', test.tcId);
262 | 					//console.log(test.tcId + ' ' + test.comment);
263 | 					return;
264 | 				}
265 | 				priv = priv.slice(1);
266 | 			} else if (priv.length < 32) {
267 | 				priv = Buffer.concat([Buffer.alloc(32 - priv.length), priv]);
268 | 			}
269 | 			
270 | 			const shared = Buffer.from(test.shared, 'hex');
271 | 			assert.equal(shared.length === 32, test.result !== 'invalid');
272 | 			
273 | 			newTests.push({tcId: test.tcId, pub: pub, priv: priv, shared: shared, result: test.result !== 'invalid'});
274 | 		});
275 | 	});
276 | 	const pubs = {}; let pubsSize = 0; const pubsArr = [];
277 | 	const privs = {}; let privsSize = 0; const privsArr = [];
278 | 	const shareds = {}; let sharedsSize = 0; const sharedsArr = [];
279 | 	newTests.forEach((test) => {
280 | 		if (!(test.pub.toString('hex') in pubs)) {pubs[test.pub.toString('hex')] = pubsSize++; pubsArr.push(test.pub);}
281 | 		if (!(test.priv.toString('hex') in privs)) {privs[test.priv.toString('hex')] = privsSize++; privsArr.push(test.priv);}
282 | 		if (test.shared.length && !(test.shared.toString('hex') in shareds)) {shareds[test.shared.toString('hex')] = sharedsSize++; sharedsArr.push(test.shared);}
283 | 	});
284 | 	const resArr = newTests.map((test) => {
285 | 		return ['pub_' + pubs[test.pub.toString('hex')], 'priv_' + privs[test.priv.toString('hex')], test.shared.length ? 'shared_' + shareds[test.shared.toString('hex')] : 'NULL', test.pub.length, test.result ? 1 : 0 /*, test.tcId*/];
286 | 	});
287 | 	
288 | 	const invalidScalars = [0n, n, n + 1n, n + 2n, 2n**256n - 2n, 2n**256n - 1n].map(v => toUIntArr(bigIntToBuffer(v, 32), 4, 8));
289 | 	pubsArr.forEach((v, i) => console.log('static const uint8_t pub_' + i + '[] = ' + toUIntArr(v, 1, 1) + ';'));
290 | 	privsArr.forEach((v, i) => console.log('static const uint32_t priv_' + i + '[] = ' + toUIntArr(v, 4, 8) + ';'));
291 | 	sharedsArr.forEach((v, i) => console.log('static const uint8_t shared_' + i + '[] = ' + toUIntArr(v, 1, 1) + ';'));
292 | 	privsArr.forEach((v, i) => {
293 | 		const pub = scalarmult(bufferToBigInt(v), G);
294 | 		console.log('static const uint32_t pub_for_priv_' + i + '[] = ' + toUIntArr(Buffer.concat([bigIntToBuffer(pub.x, 32), bigIntToBuffer(pub.y, 32)]), 4, 8) + ';');
295 | 	});
296 | 	console.log('static const struct EcdhTest ecdh_tests[] = {' + resArr.map((test) => '{' + test.join(',') + '}').join(',\n') + '};\n');
297 | 	console.log('static const struct KeygenTest keygen_tests_ok[] = {' + privsArr.map((v, i) => '{priv_' + i + ', pub_for_priv_' + i + '}').join(',\n') + '};\n');
298 | 	console.log('static const uint32_t keygen_tests_fail[][8] = {' + invalidScalars.join(',\n') + '};\n');
299 | 	
300 | 	const invalidSignData = [
301 | 		{k: 1n, z: n - G.x, priv: 1n},
302 | 		{k: 0n, z: 0n, priv: 1n},
303 | 		{k: n, z: 0n, priv: 1n}
304 | 	];
305 | 	const validSignData = [
306 | 		{k: 1n, z: 0n, priv: 1n},
307 | 		{k: n - 1n, z: 2n**256n - 1n, priv: n - 1n},
308 | 		{k: bufferToBigInt(sha256('test0k')), z: bufferToBigInt(sha256('test0z')), priv: bufferToBigInt(sha256('test0p'))},
309 | 		{k: bufferToBigInt(sha256('test1k')), z: bufferToBigInt(sha256('test1z')), priv: bufferToBigInt(sha256('test1p'))},
310 | 		{k: bufferToBigInt(sha256('test2k')), z: bufferToBigInt(sha256('test2z')), priv: bufferToBigInt(sha256('test2p'))},
311 | 		{k: bufferToBigInt(sha256('test3k')), z: bufferToBigInt(sha256('test3z')), priv: bufferToBigInt(sha256('test3p'))},
312 | 		{k: bufferToBigInt(sha256('test4k')), z: bufferToBigInt(sha256('test4z')), priv: bufferToBigInt(sha256('test4p'))},
313 | 	].map(d => {
314 | 		const rs = sign(d.z, d.priv, d.k);
315 | 		return {k: d.k, z: d.z, priv: d.priv, r: rs.r, s: rs.s};
316 | 	});
317 | 	console.log('static const struct InvalidSign invalid_signs[] = {' + invalidSignData.map(d =>
318 | 		'{' + toUIntArr(bigIntToBuffer(d.k, 32), 4, 8) + ',\n' + toUIntArr(bigIntToBuffer(d.z, 32), 1, 1) + ',\n' + toUIntArr(bigIntToBuffer(d.priv, 32), 4, 8) + '}'
319 | 	).join(',\n') + '};');
320 | 	console.log('static const struct ValidSign valid_signs[] = {' + validSignData.map(d =>
321 | 		'{' + toUIntArr(bigIntToBuffer(d.k, 32), 4, 8) + ',\n' + toUIntArr(bigIntToBuffer(d.z, 32), 1, 1) + ',\n' + toUIntArr(bigIntToBuffer(d.priv, 32), 4, 8) + ',\n' +
322 | 		toUIntArr(Buffer.concat([bigIntToBuffer(d.r, 32), bigIntToBuffer(d.s, 32)]), 4, 8) + '}'
323 | 	).join(',\n') + '};');
324 | }
325 | 
326 | async function ecdsaVerifyTests() {
327 | 	let data = await httpsRequest('https://raw.githubusercontent.com/google/wycheproof/master/testvectors/ecdsa_secp256r1_sha256_test.json');
328 | 	
329 | 	data = JSON.parse(data);
330 | 	const testGroups = data.testGroups;
331 | 	const newTests = [];
332 | 	testGroups.forEach((testGroup) => {
333 | 		const publicKey = Buffer.from(testGroup.key.uncompressed.slice(2), 'hex');
334 | 		testGroup.tests.forEach((test) => {
335 | 			const msg = crypto.createHash('sha256').update(test.msg, 'hex').digest();
336 | 			const sig = Buffer.from(test.sig, 'hex');
337 | 			if (sig[0] !== 0x30 || sig[1] !== sig.length - 2) {
338 | 				assert.equal(test.result, 'invalid');
339 | 				//console.log(test.tcId + ' ' + test.comment);
340 | 				return;
341 | 			}
342 | 			let rs = [];
343 | 			let pos = 2;
344 | 			for (let i = 0; i < 2; i++) {
345 | 				const len = sig[pos + 1];
346 | 				if (sig[pos] !== 0x02 || len > 33 || len > sig.length - pos - 2) {
347 | 					assert.equal(test.result, 'invalid', test.tcId);
348 | 					//console.log(test.tcId + ' ' + test.comment);
349 | 					return;
350 | 				}
351 | 				let num = sig.slice(pos + 2, pos + 2 + len);
352 | 				if (num.length == 33) {
353 | 					if (num[0] !== 0) {
354 | 						assert.equal(test.result, 'invalid');
355 | 						//console.log(test.tcId + ' ' + test.comment);
356 | 						return;
357 | 					}
358 | 					num = num.slice(1);
359 | 				}
360 | 				num = Buffer.concat([Buffer.alloc(32 - num.length), num]);
361 | 				rs.push(num);
362 | 				pos += 2 + len;
363 | 			}
364 | 			if (pos !== sig.length) {
365 | 				assert.equal(test.result, 'invalid');
366 | 				return;
367 | 			}
368 | 			newTests.push({tcId: test.tcId, key: publicKey, msg: msg, sig: Buffer.concat(rs), result: test.result === 'valid' || test.result === 'acceptable'});
369 | 		});
370 | 	});
371 | 	const keys = {}; let keysSize = 0; const keysArr = [];
372 | 	const msgs = {}; let msgsSize = 0; const msgsArr = [];
373 | 	const sigs = {}; let sigsSize = 0; const sigsArr = [];
374 | 	newTests.forEach((test) => {
375 | 		if (!(test.key in keys)) {keys[test.key] = keysSize++; keysArr.push(test.key);}
376 | 		if (!(test.msg in msgs)) {msgs[test.msg] = msgsSize++; msgsArr.push(test.msg);}
377 | 		if (!(test.sig in sigs)) {sigs[test.sig] = sigsSize++; sigsArr.push(test.sig);}
378 | 	});
379 | 	const resArr = newTests.map((test) => {
380 | 		return {key: keys[test.key], msg: msgs[test.msg], sig: sigs[test.sig], result: test.result, tcId: test.tcId};
381 | 	}).map((test) => {
382 | 		return ['key_' + test.key, 'msg_' + test.msg, 'sig_' + test.sig, test.result ? 1 : 0 /*, test.tcId*/];
383 | 	});
384 | 	keysArr.forEach((v, i) => console.log('static const uint32_t key_' + i + '[] = ' + toUIntArr(v, 4, 8) + ';'));
385 | 	msgsArr.forEach((v, i) => console.log('static const uint8_t msg_' + i + '[] = ' + toUIntArr(v, 1, 1) + ';'));
386 | 	sigsArr.forEach((v, i) => console.log('static const uint32_t sig_' + i + '[] = ' + toUIntArr(v, 4, 8) + ';'));
387 | 	console.log('static const struct VerifyTest verify_tests[] = {' + resArr.map((test) => '{' + test.join(',') + '}').join(',\n') + '};\n');
388 | }
389 | 
390 | (async () => {
391 | 	console.log(`#include <string.h>
392 | #include <stdint.h>
393 | #include <stdbool.h>
394 | #include "p256-cortex-m4.h"
395 | #define COUNTOF(a) (sizeof(a) / sizeof((a)[0]))
396 | struct VerifyTest {const uint32_t* key; const uint8_t* msg; const uint32_t* sig; bool result;};
397 | struct EcdhTest {const uint8_t* pub; const uint32_t* priv; const uint8_t* shared; uint8_t publen; bool valid;};
398 | struct KeygenTest {const uint32_t* priv; const uint32_t* pub;};
399 | struct InvalidSign {const uint32_t k[8]; const uint8_t z[32]; const uint32_t priv[8];};
400 | struct ValidSign {const uint32_t k[8]; const uint8_t z[32]; const uint32_t priv[8]; const uint32_t sig[16];};
401 | `)
402 | 	await ecdhTests();
403 | 	await ecdsaVerifyTests();
404 | 	console.log(`
405 | bool run_tests(void) {
406 | 	for (int i = 0; i < COUNTOF(verify_tests); i++) {
407 | 		const struct VerifyTest* t = &verify_tests[i];
408 | 		if (p256_verify(t->key, t->key + 8, t->msg, 32, t->sig, t->sig + 8) != t->result) {
409 | 			return false;
410 | 		}
411 | 	}
412 | 	for (int i = 0; i < COUNTOF(ecdh_tests); i++) {
413 | 		const struct EcdhTest* t = &ecdh_tests[i];
414 | 		uint32_t x[8], y[8];
415 | 		uint8_t shared[32];
416 | 		if ((p256_octet_string_to_point(x, y, t->pub, t->publen) && p256_ecdh_calc_shared_secret(shared, t->priv, x, y) && memcmp(shared, t->shared, 32) == 0) != t->valid) {
417 | 			return false;
418 | 		}
419 | 	}
420 | 	for (int i = 0; i < COUNTOF(keygen_tests_ok); i++) {
421 | 		const struct KeygenTest* t = &keygen_tests_ok[i];
422 | 		uint32_t pub[16];
423 | 		if (!p256_keygen(pub, pub + 8, t->priv) || memcmp(pub, t->pub, 64) != 0) {
424 | 			return false;
425 | 		}
426 | 	}
427 | 	for (int i = 0; i < COUNTOF(keygen_tests_fail); i++) {
428 | 		uint32_t x[8], y[8];
429 | 		if (p256_keygen(x, y, keygen_tests_fail[i])) {
430 | 			return false;
431 | 		}
432 | 	}
433 | 	for (int i = 0; i < COUNTOF(invalid_signs); i++) {
434 | 		const struct InvalidSign* t = &invalid_signs[i];
435 | 		uint32_t sig[16];
436 | 		if (p256_sign(sig, sig + 8, t->z, 32, t->priv, t->k)) {
437 | 			return false;
438 | 		}
439 | 	}
440 | 	for (int i = 0; i < COUNTOF(valid_signs); i++) {
441 | 		const struct ValidSign* t = &valid_signs[i];
442 | 		uint32_t sig[16];
443 | 		if (!p256_sign(sig, sig + 8, t->z, 32, t->priv, t->k) || memcmp(sig, t->sig, 64) != 0) {
444 | 			return false;
445 | 		}
446 | 	}
447 | 	return true;
448 | }
449 | `);
450 | 	
451 | })();
452 | 


--------------------------------------------------------------------------------