├── LICENSE
├── README.md
└── src
    ├── Makefile
    ├── Makefile.win32
    ├── api.c
    ├── api_128_256.c
    ├── api_257_512.c
    ├── api_769_1024.c
    ├── bat.h
    ├── blake2.h
    ├── blake2b.c
    ├── blake2s.c
    ├── codec.c
    ├── fft.c
    ├── fnr.c
    ├── inner.h
    ├── kem128.c
    ├── kem257.c
    ├── kem769.c
    ├── keygen.c
    ├── modgen.c
    ├── modqp.c
    ├── prng.c
    ├── speed_bat.c
    └── test_bat.c


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Thomas Pornin
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BAT
2 | BAT KEM/Signature Reference Implementation
3 | 


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
 1 | .POSIX:
 2 | 
 3 | CC = clang
 4 | 
 5 | CFLAGS = -Wall -Wextra -Wshadow -Wundef -O3
 6 | # To allow clang to optimize for the local machine, add:
 7 | #    -march=native
 8 | # (this will activate automatic vectorization by clang, if the local machine
 9 | # can do it; but the binary may fail to run on older CPUs)
10 | #
11 | # To use the AVX2-specific code, add:
12 | #    -DBAT_AVX2
13 | # (this will use all the code with explicit AVX2 intrinsic functions; the
14 | # binary won't run on a machine without AVX2)
15 | #
16 | # You can use -march=native and -DBAT_AVX2 at the same time, for the
17 | # highest optimization:
18 | # CFLAGS = -Wall -Wextra -Wshadow -Wundef -O3 -march=native -DBAT_AVX2
19 | 
20 | LD = clang
21 | LDFLAGS =
22 | LIBS =
23 | # Objects linked into both programs.
24 | OBJ = api_128_256.o api_257_512.o api_769_1024.o codec.o fft.o fnr.o kem128.o kem257.o kem769.o keygen.o modqp.o prng.o blake2b.o blake2s.o
25 | # Default target (first rule of the file): build both executables.
26 | all: test_bat speed_bat
27 | # Remove every build product; the leading '-' makes make ignore a failing rm.
28 | clean:
29 | 	-rm -f $(OBJ) test_bat test_bat.o speed_bat speed_bat.o
30 | # Link rules: each program is its own object file plus $(OBJ).
31 | test_bat: $(OBJ) test_bat.o
32 | 	$(LD) $(LDFLAGS) -o test_bat test_bat.o $(OBJ) $(LIBS)
33 | 
34 | speed_bat: speed_bat.o $(OBJ)
35 | 	$(LD) $(LDFLAGS) -o speed_bat speed_bat.o $(OBJ) $(LIBS)
36 | # Compile rules, one per object; prerequisites are listed by hand (api.c is #include'd, modgen.c presumably too).
37 | api_128_256.o: api_128_256.c api.c bat.h inner.h blake2.h
38 | 	$(CC) $(CFLAGS) -c -o api_128_256.o api_128_256.c
39 | 
40 | api_257_512.o: api_257_512.c api.c bat.h inner.h blake2.h
41 | 	$(CC) $(CFLAGS) -c -o api_257_512.o api_257_512.c
42 | 
43 | api_769_1024.o: api_769_1024.c api.c bat.h inner.h blake2.h
44 | 	$(CC) $(CFLAGS) -c -o api_769_1024.o api_769_1024.c
45 | 
46 | codec.o: codec.c inner.h blake2.h
47 | 	$(CC) $(CFLAGS) -c -o codec.o codec.c
48 | 
49 | fft.o: fft.c inner.h blake2.h
50 | 	$(CC) $(CFLAGS) -c -o fft.o fft.c
51 | 
52 | fnr.o: fnr.c inner.h blake2.h
53 | 	$(CC) $(CFLAGS) -c -o fnr.o fnr.c
54 | 
55 | kem128.o: kem128.c inner.h blake2.h
56 | 	$(CC) $(CFLAGS) -c -o kem128.o kem128.c
57 | 
58 | kem257.o: kem257.c modgen.c inner.h blake2.h
59 | 	$(CC) $(CFLAGS) -c -o kem257.o kem257.c
60 | 
61 | kem769.o: kem769.c modgen.c inner.h blake2.h
62 | 	$(CC) $(CFLAGS) -c -o kem769.o kem769.c
63 | 
64 | keygen.o: keygen.c inner.h blake2.h
65 | 	$(CC) $(CFLAGS) -c -o keygen.o keygen.c
66 | 
67 | modqp.o: modqp.c modgen.c inner.h blake2.h
68 | 	$(CC) $(CFLAGS) -c -o modqp.o modqp.c
69 | 
70 | prng.o: prng.c inner.h blake2.h
71 | 	$(CC) $(CFLAGS) -c -o prng.o prng.c
72 | 
73 | blake2b.o: blake2b.c inner.h blake2.h
74 | 	$(CC) $(CFLAGS) -c -o blake2b.o blake2b.c
75 | 
76 | blake2s.o: blake2s.c inner.h blake2.h
77 | 	$(CC) $(CFLAGS) -c -o blake2s.o blake2s.c
78 | 
79 | speed_bat.o: speed_bat.c bat.h inner.h blake2.h
80 | 	$(CC) $(CFLAGS) -c -o speed_bat.o speed_bat.c
81 | 
82 | test_bat.o: test_bat.c bat.h inner.h blake2.h
83 | 	$(CC) $(CFLAGS) -c -o test_bat.o test_bat.c
84 | 


--------------------------------------------------------------------------------
/src/Makefile.win32:
--------------------------------------------------------------------------------
 1 | # Makefile specific for MSVC.
 2 | # Use in a Visual C command window, with:
 3 | #    nmake -f Makefile.win32
 4 | 
 5 | CC = cl
 6 | 
 7 | CFLAGS = /nologo /W4 /O2
 8 | # To use the AVX2-specific code, add:
 9 | #    /arch:AVX2 /DBAT_AVX2
10 | # to the options above.
11 | 
12 | LD = cl
13 | LDFLAGS = /nologo
14 | LIBS =
15 | # Objects linked into both executables.
16 | OBJ = api_128_256.obj api_257_512.obj api_769_1024.obj codec.obj fft.obj fnr.obj kem128.obj kem257.obj kem769.obj keygen.obj modqp.obj prng.obj blake2b.obj blake2s.obj
17 | # Default target (first rule of the file): build both executables.
18 | all: test_bat.exe speed_bat.exe
19 | # Remove every build product; the leading '-' makes nmake ignore the exit code of del.
20 | clean:
21 | 	-del /Q $(OBJ) test_bat.exe test_bat.obj speed_bat.exe speed_bat.obj
22 | # Link rules: /Fe names the output executable.
23 | test_bat.exe: $(OBJ) test_bat.obj
24 | 	$(LD) $(LDFLAGS) /Fe:test_bat.exe test_bat.obj $(OBJ) $(LIBS)
25 | 
26 | speed_bat.exe: $(OBJ) speed_bat.obj
27 | 	$(LD) $(LDFLAGS) /Fe:speed_bat.exe speed_bat.obj $(OBJ) $(LIBS)
28 | # Compile rules, one per object: /c compiles without linking, /Fo names the object file.
29 | api_128_256.obj: api_128_256.c api.c bat.h inner.h blake2.h
30 | 	$(CC) $(CFLAGS) /c /Fo:api_128_256.obj api_128_256.c
31 | 
32 | api_257_512.obj: api_257_512.c api.c bat.h inner.h blake2.h
33 | 	$(CC) $(CFLAGS) /c /Fo:api_257_512.obj api_257_512.c
34 | 
35 | api_769_1024.obj: api_769_1024.c api.c bat.h inner.h blake2.h
36 | 	$(CC) $(CFLAGS) /c /Fo:api_769_1024.obj api_769_1024.c
37 | 
38 | codec.obj: codec.c inner.h blake2.h
39 | 	$(CC) $(CFLAGS) /c /Fo:codec.obj codec.c
40 | 
41 | fft.obj: fft.c inner.h blake2.h
42 | 	$(CC) $(CFLAGS) /c /Fo:fft.obj fft.c
43 | 
44 | fnr.obj: fnr.c inner.h blake2.h
45 | 	$(CC) $(CFLAGS) /c /Fo:fnr.obj fnr.c
46 | 
47 | kem128.obj: kem128.c inner.h blake2.h
48 | 	$(CC) $(CFLAGS) /c /Fo:kem128.obj kem128.c
49 | 
50 | kem257.obj: kem257.c modgen.c inner.h blake2.h
51 | 	$(CC) $(CFLAGS) /c /Fo:kem257.obj kem257.c
52 | 
53 | kem769.obj: kem769.c modgen.c inner.h blake2.h
54 | 	$(CC) $(CFLAGS) /c /Fo:kem769.obj kem769.c
55 | 
56 | keygen.obj: keygen.c inner.h blake2.h
57 | 	$(CC) $(CFLAGS) /c /Fo:keygen.obj keygen.c
58 | 
59 | modqp.obj: modqp.c modgen.c inner.h blake2.h
60 | 	$(CC) $(CFLAGS) /c /Fo:modqp.obj modqp.c
61 | 
62 | prng.obj: prng.c inner.h blake2.h
63 | 	$(CC) $(CFLAGS) /c /Fo:prng.obj prng.c
64 | 
65 | blake2b.obj: blake2b.c inner.h blake2.h
66 | 	$(CC) $(CFLAGS) /c /Fo:blake2b.obj blake2b.c
67 | 
68 | blake2s.obj: blake2s.c inner.h blake2.h
69 | 	$(CC) $(CFLAGS) /c /Fo:blake2s.obj blake2s.c
70 | 
71 | speed_bat.obj: speed_bat.c bat.h inner.h blake2.h
72 | 	$(CC) $(CFLAGS) /c /Fo:speed_bat.obj speed_bat.c
73 | 
74 | test_bat.obj: test_bat.c bat.h inner.h blake2.h
75 | 	$(CC) $(CFLAGS) /c /Fo:test_bat.obj test_bat.c
76 | 


--------------------------------------------------------------------------------
/src/api.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is not meant to be compiled independently, but to be
  3 |  * included (with #include) by another C file. The includer shall
  4 |  * first define the Q, N, LOGN and LVLBYTES macros to relevant values
  5 |  * (decimal literal constants only).
  6 |  */
  7 | 
  8 | #if !defined Q || !defined N || !defined LOGN || !defined LVLBYTES
  9 | #error This module must not be compiled separately.
 10 | #endif
 11 | 
 12 | #include "bat.h"
 13 | #include "inner.h"
 14 | /* Two-level helpers, so that macro arguments are expanded before ## and #. */
 15 | #define XCAT(x, y)    XCAT_(x, y)
 16 | #define XCAT_(x, y)   x ## y
 17 | #define XSTR(x)       XSTR_(x)
 18 | #define XSTR_(x)      #x
 19 | /* Zn(x) yields bat_<Q>_<N>_x and ZN(x) yields BAT_<Q>_<N>_x. */
 20 | #define Zn(name)   XCAT(XCAT(XCAT(bat_, Q), XCAT(_, N)), XCAT(_, name))
 21 | #define ZN(name)   XCAT(XCAT(XCAT(BAT_, Q), XCAT(_, N)), XCAT(_, name))
 22 | 
 23 | /*
 24 |  * LOGN <= 9 (degrees up to 512) selects BLAKE2s; degree 1024 uses BLAKE2b.
 25 |  */
 26 | #if LOGN <= 9
 27 | #define HASH           blake2s
 28 | #define HASH_context   blake2s_context
 29 | #define HASH_init      blake2s_init
 30 | #define HASH_update    blake2s_update
 31 | #define HASH_final     blake2s_final
 32 | #define EXPAND         blake2s_expand
 33 | #else
 34 | #define HASH           blake2b
 35 | #define HASH_context   blake2b_context
 36 | #define HASH_init      blake2b_init
 37 | #define HASH_update    blake2b_update
 38 | #define HASH_final     blake2b_final
 39 | #define EXPAND         blake2b_expand
 40 | #endif
 41 | 
 42 | /*
 43 |  * Round 'tmp' up to the next aligned address (a multiple of 32 bytes when
 44 |  * built with BAT_AVX2, of 8 bytes otherwise). Returns NULL if tmp is NULL
 45 |  * or if fewer than min_tmp_len of the tmp_len bytes remain afterwards.
 46 |  */
 47 | static void *
 48 | tmp_align(void *tmp, size_t tmp_len, size_t min_tmp_len)
 49 | {
 50 | #if BAT_AVX2
 51 | 	const unsigned mask = 31u;
 52 | #else
 53 | 	const unsigned mask = 7u;
 54 | #endif
 55 | 	unsigned pad;
 56 | 
 57 | 	if (tmp == NULL) {
 58 | 		return NULL;
 59 | 	}
 60 | 	pad = (0u - (unsigned)(uintptr_t)tmp) & mask;
 61 | 	if (tmp_len >= pad && (tmp_len - pad) >= min_tmp_len) {
 62 | 		return (void *)((uintptr_t)tmp + pad);
 63 | 	}
 64 | 	return NULL;
 65 | }
 66 | 
 67 | /*
 68 |  * Derive the additional secret value rr by expanding the private key seed.
 69 |  */
 70 | static void
 71 | make_rr(Zn(private_key) *sk)
 72 | {
 73 | 	const uint32_t tag = 0x72000000 | ((uint32_t)LOGN << 16) | (uint32_t)Q;
 74 | 	EXPAND(sk->rr, sizeof sk->rr, sk->seed, sizeof sk->seed, tag);
 75 | }
 76 | 
 77 | /*
 78 |  * Hash_m(): hash the encoded plaintext polynomial 's' (sbuf) into
 79 |  * LVLBYTES bytes written to dst; the output size thus matches the
 80 |  * security level.
 81 |  */
 82 | static void
 83 | hash_m(void *dst, const void *sbuf, size_t sbuf_len)
 84 | {
 85 | 	/*
 86 | 	 * A single raw BLAKE2 call is used, since in practice sbuf_len
 87 | 	 * is exactly the BLAKE2 block length and we want to stick to
 88 | 	 * one invocation of the primitive.
 89 | 	 *
 90 | 	 * The requested output size is at most 16 bytes with BLAKE2s
 91 | 	 * (N <= 512) or 32 bytes with BLAKE2b (N = 1024), i.e. strictly
 92 | 	 * below the natural output size. That size is part of the BLAKE2
 93 | 	 * parameter block, which already separates this call from the
 94 | 	 * BLAKE2 invocations made by the expand() calls in this file.
 95 | 	 */
 96 | 	const size_t out_len = LVLBYTES;
 97 | 	HASH(dst, out_len, NULL, 0, sbuf, sbuf_len);
 98 | }
 99 | 
100 | /*
101 |  * Hash_s() and Sample_s() rolled into one: nominally the input is hashed
102 |  * into a seed which a KDF then stretches to sbuf_len bytes. That seed
103 |  * serves no other purpose and the input is guaranteed to be small (at
104 |  * most 32 bytes), so a single call to the hash expand function does
105 |  * both steps.
106 |  */
107 | static void
108 | hash_and_sample_s(void *sbuf, size_t sbuf_len, const void *m, size_t m_len)
109 | {
110 | 	const uint32_t tag = 0x73000000 | ((uint32_t)LOGN << 16) | (uint32_t)Q;
111 | 	EXPAND(sbuf, sbuf_len, m, m_len, tag);
112 | }
113 | 
114 | /*
115 |  * F() of the BAT specification: build the alternate key derivation seed
116 |  * used on decapsulation failure, from rr and the received ciphertext.
117 |  */
118 | static void
119 | make_kdf_seed_bad(void *m, size_t m_len,
120 | 	const Zn(private_key) *sk, const Zn(ciphertext) *ct)
121 | {
122 | 	uint8_t hdr[8];
123 | 	HASH_context ctx;
124 | 
125 | 	HASH_init(&ctx, m_len);
126 | 	enc64le(hdr, (uint32_t)Q | ((uint32_t)LOGN << 16) | 0x62000000);
127 | 	HASH_update(&ctx, hdr, sizeof hdr);
128 | 	HASH_update(&ctx, sk->rr, sizeof sk->rr);
129 | 	HASH_update(&ctx, ct->c, sizeof ct->c);
130 | 	HASH_update(&ctx, ct->c2, sizeof ct->c2);
131 | 	HASH_final(&ctx, m);
132 | }
133 | 
134 | /*
135 |  * Expand the seed m into the shared secret. 'good' is 1 for a normal
136 |  * derivation, 0 for the fake one done after a decapsulation failure.
137 |  */
138 | static void
139 | make_secret(void *secret, size_t secret_len,
140 | 	const void *m, size_t m_len, uint32_t good)
141 | {
142 | 	const uint32_t tag = ((0x66 + good) << 24)
143 | 		| ((uint32_t)LOGN << 16) | (uint32_t)Q;
144 | 	EXPAND(secret, secret_len, m, m_len, tag);
145 | }
146 | 
147 | /* see bat.h */
148 | int
149 | Zn(keygen)(Zn(private_key) *sk, void *tmp, size_t tmp_len)
150 | {
151 | 	uint8_t rng_seed[32];
152 | 	prng_context rng;
153 | 	void *work;
154 | 
155 | 	work = tmp_align(tmp, tmp_len, ZN(TMP_KEYGEN) - 31);
156 | 	if (work == NULL) {
157 | 		return BAT_ERR_NOSPACE;
158 | 	}
159 | 	if (!bat_get_seed(rng_seed, sizeof rng_seed)) {
160 | 		return BAT_ERR_RANDOM;
161 | 	}
162 | 	prng_init(&rng, rng_seed, sizeof rng_seed, 0);
163 | 
164 | 	/*
165 | 	 * Draw a fresh key seed from the PRNG and attempt each generation
166 | 	 * step in turn; the && chain stops at the first step that fails,
167 | 	 * in which case we start over with a new seed.
168 | 	 */
169 | 	for (;;) {
170 | 		prng_get_bytes(&rng, sk->seed, sizeof sk->seed);
171 | 		if (bat_keygen_make_fg(sk->f, sk->g,
172 | 				(uint16_t *)sk->h, Q, LOGN,
173 | 				sk->seed, sizeof sk->seed, work)
174 | 			&& bat_keygen_solve_FG(sk->F, sk->G, sk->f, sk->g,
175 | 				Q, LOGN, work)
176 | 			&& bat_keygen_compute_w(sk->w,
177 | 				sk->f, sk->g, sk->F, sk->G, Q, LOGN, work))
178 | 		{
179 | 			make_rr(sk);
180 | 			return 0;
181 | 		}
182 | 	}
183 | }
184 | 
185 | /* see bat.h: copies the public key h out of the private key structure. */
186 | void
187 | Zn(get_public_key)(Zn(public_key) *pk, const Zn(private_key) *sk)
188 | {
189 | 	memmove(pk->h, sk->h, sizeof sk->h);
190 | }
191 | 
192 | /* Encoded size (in bytes) of the private key, in short or long format. */
193 | static size_t
194 | get_privkey_length(const Zn(private_key) *sk, int short_format)
195 | {
196 | 	size_t len;
197 | 
198 | 	/* Both formats start with one tag byte followed by the seed. */
199 | 	len = 1 + sizeof(sk->seed);
200 | 	if (short_format) {
201 | 		return len + bat_trim_i8_encode(
202 | 			NULL, 0, NULL, LOGN, bat_max_FG_bits[LOGN]);
203 | 	}
204 | 	len += sizeof(sk->rr);
205 | 	len += bat_trim_i8_encode(NULL, 0, sk->f, LOGN, bat_max_fg_bits[LOGN]);
206 | 	len += bat_trim_i8_encode(NULL, 0, sk->g, LOGN, bat_max_fg_bits[LOGN]);
207 | 	len += bat_trim_i8_encode(NULL, 0, sk->F, LOGN, bat_max_FG_bits[LOGN]);
208 | 	len += bat_trim_i8_encode(NULL, 0, sk->G, LOGN, bat_max_FG_bits[LOGN]);
209 | 	len += bat_trim_i32_encode(NULL, 0, sk->w, LOGN, bat_max_w_bits[LOGN]);
210 | 	len += XCAT(bat_encode_, Q)(NULL, 0, sk->h, LOGN);
211 | 	return len;
212 | }
213 | 
214 | /* see bat.h */
215 | size_t
216 | Zn(encode_private_key)(void *out, size_t max_out_len,
217 | 	const Zn(private_key) *sk, int short_format)
218 | {
219 | 	uint8_t *dst;
220 | 	size_t total, pos, n;
221 | 
222 | 	/*
223 | 	 * Size query (out == NULL) and capacity check come first, so that
224 | 	 * nothing is written when the output buffer is too small.
225 | 	 */
226 | 	total = get_privkey_length(sk, short_format);
227 | 	if (out == NULL) {
228 | 		return total;
229 | 	}
230 | 	if (max_out_len < total) {
231 | 		return 0;
232 | 	}
233 | 
234 | 	/*
235 | 	 * Common header: format tag byte, then the key generation seed.
236 | 	 */
237 | 	dst = out;
238 | 	dst[0] = short_format ? ZN(TAG_PRIVKEY_SHORT) : ZN(TAG_PRIVKEY_LONG);
239 | 	memmove(&dst[1], sk->seed, sizeof sk->seed);
240 | 	pos = 1 + sizeof sk->seed;
241 | 
242 | 	if (short_format) {
243 | 		/*
244 | 		 * Short format: F is the only other element.
245 | 		 */
246 | 		n = bat_trim_i8_encode(dst + pos, total - pos,
247 | 			sk->F, LOGN, bat_max_FG_bits[LOGN]);
248 | 		return (n == 0) ? 0 : pos + n;
249 | 	}
250 | 
251 | 	/*
252 | 	 * Long format: rr, f, g, F, G, w and h follow, in that order. None
253 | 	 * of the encoders below should ever fail in practice; a zero
254 | 	 * return is still reported to the caller as a failure.
255 | 	 */
256 | 	memmove(&dst[pos], sk->rr, sizeof sk->rr);
257 | 	pos += sizeof sk->rr;
258 | 
259 | 	n = bat_trim_i8_encode(dst + pos, total - pos,
260 | 		sk->f, LOGN, bat_max_fg_bits[LOGN]);
261 | 	if (n == 0) {
262 | 		return 0;
263 | 	}
264 | 	pos += n;
265 | 	n = bat_trim_i8_encode(dst + pos, total - pos,
266 | 		sk->g, LOGN, bat_max_fg_bits[LOGN]);
267 | 	if (n == 0) {
268 | 		return 0;
269 | 	}
270 | 	pos += n;
271 | 	n = bat_trim_i8_encode(dst + pos, total - pos,
272 | 		sk->F, LOGN, bat_max_FG_bits[LOGN]);
273 | 	if (n == 0) {
274 | 		return 0;
275 | 	}
276 | 	pos += n;
277 | 	n = bat_trim_i8_encode(dst + pos, total - pos,
278 | 		sk->G, LOGN, bat_max_FG_bits[LOGN]);
279 | 	if (n == 0) {
280 | 		return 0;
281 | 	}
282 | 	pos += n;
283 | 	n = bat_trim_i32_encode(dst + pos, total - pos,
284 | 		sk->w, LOGN, bat_max_w_bits[LOGN]);
285 | 	if (n == 0) {
286 | 		return 0;
287 | 	}
288 | 	pos += n;
289 | 	n = XCAT(bat_encode_, Q)(dst + pos, total - pos,
290 | 		sk->h, LOGN);
291 | 	return (n == 0) ? 0 : pos + n;
292 | }
293 | 
294 | /* see bat.h; returns the number of input bytes consumed, or 0 on any failure. */
295 | size_t
296 | Zn(decode_private_key)(Zn(private_key) *sk, const void *in, size_t max_in_len,
297 | 	void *tmp, size_t tmp_len)
298 | {
299 | 	const uint8_t *buf;
300 | 	size_t off, len;
301 | 
302 | 	if (in == NULL || max_in_len == 0) {
303 | 		return 0;
304 | 	}
305 | 	buf = in;
306 | 	switch (buf[0]) { /* the first byte is the format tag */
307 | 	case ZN(TAG_PRIVKEY_SHORT):
308 | 		if (max_in_len < get_privkey_length(sk, 1)) {
309 | 			return 0;
310 | 		}
311 | 		memmove(sk->seed, buf + 1, sizeof sk->seed);
312 | 		off = 1 + sizeof sk->seed;
313 | 		len = bat_trim_i8_decode(sk->F, LOGN, bat_max_FG_bits[LOGN],
314 | 			buf + off, max_in_len - off);
315 | 		if (len == 0) {
316 | 			return 0;
317 | 		}
318 | 		off += len;
319 | 		tmp = tmp_align(tmp, tmp_len, ZN(TMP_DECODE_PRIV) - 31); /* tmp is used by this (short) format only */
320 | 		if (tmp == NULL) {
321 | 			return 0;
322 | 		}
323 | 		if (!bat_keygen_make_fg(sk->f, sk->g, /* f, g and h are not in the input: presumably rebuilt from the seed */
324 | 			(uint16_t *)sk->h, Q, LOGN,
325 | 			sk->seed, sizeof sk->seed, tmp))
326 | 		{
327 | 			return 0;
328 | 		}
329 | 		if (!bat_keygen_rebuild_G(sk->G, sk->f, sk->g, sk->F,
330 | 			Q, LOGN, tmp))
331 | 		{
332 | 			return 0;
333 | 		}
334 | 		if (!bat_keygen_compute_w(sk->w,
335 | 			sk->f, sk->g, sk->F, sk->G, Q, LOGN, tmp))
336 | 		{
337 | 			return 0;
338 | 		}
339 | 		make_rr(sk); /* rr is not stored in the short format either */
340 | 		return off;
341 | 	case ZN(TAG_PRIVKEY_LONG):
342 | 		if (max_in_len < get_privkey_length(sk, 0)) { /* NOTE(review): the size query is handed sk fields that are not decoded yet; confirm the encoders ignore the data when out == NULL */
343 | 			return 0;
344 | 		}
345 | 		memmove(sk->seed, buf + 1, sizeof sk->seed);
346 | 		off = 1 + sizeof sk->seed;
347 | 		memmove(sk->rr, buf + off, sizeof sk->rr);
348 | 		off += sizeof sk->rr;
349 | 		len = bat_trim_i8_decode(sk->f, LOGN, bat_max_fg_bits[LOGN],
350 | 			buf + off, max_in_len - off);
351 | 		if (len == 0) {
352 | 			return 0;
353 | 		}
354 | 		off += len;
355 | 		len = bat_trim_i8_decode(sk->g, LOGN, bat_max_fg_bits[LOGN],
356 | 			buf + off, max_in_len - off);
357 | 		if (len == 0) {
358 | 			return 0;
359 | 		}
360 | 		off += len;
361 | 		len = bat_trim_i8_decode(sk->F, LOGN, bat_max_FG_bits[LOGN],
362 | 			buf + off, max_in_len - off);
363 | 		if (len == 0) {
364 | 			return 0;
365 | 		}
366 | 		off += len;
367 | 		len = bat_trim_i8_decode(sk->G, LOGN, bat_max_FG_bits[LOGN],
368 | 			buf + off, max_in_len - off);
369 | 		if (len == 0) {
370 | 			return 0;
371 | 		}
372 | 		off += len;
373 | 		len = bat_trim_i32_decode(sk->w, LOGN, bat_max_w_bits[LOGN],
374 | 			buf + off, max_in_len - off);
375 | 		if (len == 0) {
376 | 			return 0;
377 | 		}
378 | 		off += len;
379 | 		len = XCAT(bat_decode_, Q)(sk->h, LOGN,
380 | 			buf + off, max_in_len - off);
381 | 		if (len == 0) {
382 | 			return 0;
383 | 		}
384 | 		off += len;
385 | 		return off;
386 | 	default: /* unknown tag, or a key for another parameter set */
387 | 		return 0;
388 | 	}
389 | }
390 | 
391 | /* see bat.h */
392 | size_t
393 | Zn(encode_public_key)(void *out, size_t max_out_len, const Zn(public_key) *pk)
394 | {
395 | 	uint8_t *dst = out;
396 | 	size_t need, n;
397 | 
398 | 	/* Layout: one tag byte, then the encoding of h. */
399 | 	need = 1 + XCAT(bat_encode_, Q)(NULL, 0, pk->h, LOGN);
400 | 	if (dst == NULL) {
401 | 		return need;
402 | 	}
403 | 	if (need > max_out_len) {
404 | 		return 0;
405 | 	}
406 | 	dst[0] = ZN(TAG_PUBKEY);
407 | 	n = XCAT(bat_encode_, Q)(dst + 1, max_out_len - 1, pk->h, LOGN);
408 | 	if (n == 0) {
409 | 		return 0;
410 | 	}
411 | 	return 1 + n;
412 | }
413 | 
414 | /* see bat.h; a NULL or empty input, a wrong tag or a bad encoding yields 0. */
415 | size_t
416 | Zn(decode_public_key)(Zn(public_key) *pk, const void *in, size_t max_in_len)
417 | {
418 | 	const uint8_t *buf = in;
419 | 	size_t len;
420 | 
421 | 	/* Never dereference a missing or empty input buffer. */
422 | 	if (buf == NULL || max_in_len == 0) {
423 | 		return 0;
424 | 	}
425 | 	if (buf[0] != ZN(TAG_PUBKEY)) {
426 | 		return 0;
427 | 	}
428 | 	len = XCAT(bat_decode_, Q)(pk->h, LOGN, buf + 1, max_in_len - 1);
429 | 	if (len == 0) {
430 | 		return 0;
431 | 	}
432 | 	return 1 + len; /* tag byte plus the encoded h */
433 | }
434 | 
435 | /* see bat.h */
436 | size_t
437 | Zn(encode_ciphertext)(void *out, size_t max_out_len, const Zn(ciphertext) *ct)
438 | {
439 | 	uint8_t *dst = out;
440 | 	size_t need, n;
441 | 
442 | 	/* Layout: tag byte, encoded polynomial c, then the raw bytes of c2. */
443 | 	need = 1 + XCAT(bat_encode_ciphertext_, Q)(NULL, 0, ct->c, LOGN)
444 | 		+ sizeof ct->c2;
445 | 	if (dst == NULL) {
446 | 		return need;
447 | 	}
448 | 	if (need > max_out_len) {
449 | 		return 0;
450 | 	}
451 | 
452 | 	dst[0] = ZN(TAG_CIPHERTEXT);
453 | 	n = XCAT(bat_encode_ciphertext_, Q)(
454 | 		dst + 1, max_out_len - 1, ct->c, LOGN);
455 | 	if (n == 0) {
456 | 		return 0;
457 | 	}
458 | 
459 | 	/* c2 is stored as is, right after the encoded polynomial. */
460 | 	memcpy(dst + 1 + n, ct->c2, sizeof ct->c2);
461 | 	return 1 + n + sizeof ct->c2;
462 | }
463 | 
464 | /* see bat.h; a NULL, truncated or wrongly tagged input yields 0. */
465 | size_t
466 | Zn(decode_ciphertext)(Zn(ciphertext) *ct, const void *in, size_t max_in_len)
467 | {
468 | 	const uint8_t *buf = in;
469 | 	size_t off, len;
470 | 
471 | 	/* Never dereference a missing or empty input buffer. */
472 | 	if (buf == NULL || max_in_len < 1) {
473 | 		return 0;
474 | 	}
475 | 	if (buf[0] != ZN(TAG_CIPHERTEXT)) {
476 | 		return 0;
477 | 	}
478 | 	off = 1;
479 | 	len = XCAT(bat_decode_ciphertext_, Q)(
480 | 		ct->c, LOGN, buf + off, max_in_len - off);
481 | 	if (len == 0) {
482 | 		return 0;
483 | 	}
484 | 	off += len;
485 | 	if (max_in_len - off < sizeof ct->c2) { /* the raw c2 bytes must fit in what remains */
486 | 		return 0;
487 | 	}
488 | 	memcpy(ct->c2, buf + off, sizeof ct->c2);
489 | 	off += sizeof ct->c2;
490 | 	return off;
491 | }
492 | 
493 | /* see bat.h */
494 | int
495 | Zn(encapsulate)(void *secret, size_t secret_len,
496 | 	Zn(ciphertext) *ct, const Zn(public_key) *pk,
497 | 	void *tmp, size_t tmp_len)
498 | {
499 | 	uint8_t m[LVLBYTES], sbuf[SBUF_LEN(LOGN)];
500 | 	size_t i;
501 | 
502 | 	tmp = tmp_align(tmp, tmp_len, ZN(TMP_ENCAPS) - 31);
503 | 	if (tmp == NULL) {
504 | 		return BAT_ERR_NOSPACE;
505 | 	}
506 | 
507 | 	/*
508 | 	 * In theory, encryption fails when the resulting vector norm
509 | 	 * exceeds a specific bound. This is very rare (and cannot
510 | 	 * happen at all for q = 257), so the loop below is expected
511 | 	 * to run only once; it is therefore more efficient to take
512 | 	 * each random seed directly from the OS.
513 | 	 */
514 | 	do {
515 | 		/*
516 | 		 * Random message m, obtained from the OS.
517 | 		 */
518 | 		if (!bat_get_seed(m, sizeof m)) {
519 | 			return BAT_ERR_RANDOM;
520 | 		}
521 | 
522 | 		/*
523 | 		 * s is sampled by hashing m.
524 | 		 */
525 | 		hash_and_sample_s(sbuf, sizeof sbuf, m, sizeof m);
526 | #if N < 8
527 | 		/* Toy versions with N < 8 do not fill a whole byte:
528 | 		   the unused bits must be cleared. */
529 | 		sbuf[0] &= (1u << N) - 1u;
530 | #endif
531 | 
532 | 		/*
533 | 		 * c1 is computed in the loop condition just below.
534 | 		 * A failure (rare, and only possible for q = 769)
535 | 		 * starts a new attempt with a fresh random m.
536 | 		 */
537 | 	} while (!XCAT(bat_encrypt_, Q)(ct->c, sbuf, pk->h, LOGN, tmp));
538 | 
539 | 	/*
540 | 	 * c2 = Hash_m(s) XOR m.
541 | 	 */
542 | 	hash_m(ct->c2, sbuf, sizeof sbuf);
543 | 	for (i = 0; i < sizeof m; i ++) {
544 | 		ct->c2[i] ^= m[i];
545 | 	}
546 | 
547 | 	/*
548 | 	 * The shared secret (output of a successful key exchange)
549 | 	 * is derived from m.
550 | 	 */
551 | 	make_secret(secret, secret_len, m, sizeof m, 1);
552 | 
553 | 	/* Success. */
554 | 	return 0;
555 | }
556 | 
557 | /* see bat.h */
558 | int
559 | Zn(encapsulate_explicit_seed)(void *secret, size_t secret_len,
560 | 	Zn(ciphertext) *ct, const Zn(public_key) *pk,
561 | 	const void *m, void *tmp, size_t tmp_len)
562 | {
563 | 	uint8_t m2[LVLBYTES]; /* holds a locally generated or re-hashed seed */
564 | 
565 | 	tmp = tmp_align(tmp, tmp_len, ZN(TMP_ENCAPS) - 31);
566 | 	if (tmp == NULL) {
567 | 		return BAT_ERR_NOSPACE;
568 | 	}
569 | 
570 | 	for (;;) {
571 | 		uint8_t sbuf[SBUF_LEN(LOGN)];
572 | 		size_t u;
573 | 
574 | 		/*
575 | 		 * If no seed is provided, then generate one randomly.
576 | 		 */
577 | 		if (m == NULL) {
578 | 			if (!bat_get_seed(m2, sizeof m2)) {
579 | 				return BAT_ERR_RANDOM;
580 | 			}
581 | 			m = m2;
582 | 		}
583 | 
584 | 		/*
585 | 		 * Hash m to sample s. The seed is read as LVLBYTES bytes.
586 | 		 */
587 | 		hash_and_sample_s(sbuf, sizeof sbuf, m, LVLBYTES);
588 | #if N < 8
589 | 		/* For very reduced toy versions, we don't even have a
590 | 		   full byte, and we must clear the unused bits. */
591 | 		sbuf[0] &= (1u << N) - 1u;
592 | #endif
593 | 
594 | 		/*
595 | 		 * Compute c1. This may fail (very rarely!) only for q = 769; in
596 | 		 * that case the current seed is replaced with its own hash (kept
597 | 		 * in m2) and we try again. Since this occurrence is very rare in
598 | 		 * practice, this process does not induce any non-negligible bias.
599 | 		 */
600 | 		if (!XCAT(bat_encrypt_, Q)(ct->c, sbuf, pk->h, LOGN, tmp)) {
601 | 			blake2s(m2, LVLBYTES, NULL, 0, m, LVLBYTES); /* NOTE(review): always BLAKE2s, whatever HASH is; m may already point to m2 here - confirm both are intended */
602 | 			m = m2;
603 | 			continue;
604 | 		}
605 | 
606 | 		/*
607 | 		 * Make c2 = Hash_m(s) XOR m.
608 | 		 */
609 | 		hash_m(ct->c2, sbuf, sizeof sbuf);
610 | 		for (u = 0; u < LVLBYTES; u ++) {
611 | 			ct->c2[u] ^= ((const uint8_t *)m)[u];
612 | 		}
613 | 
614 | 		/*
615 | 		 * Produce the shared secret (output of a successful key
616 | 		 * exchange).
617 | 		 */
618 | 		make_secret(secret, secret_len, m, LVLBYTES, 1);
619 | 
620 | 		return 0;
621 | 	}
622 | }
623 | 
624 | /* see bat.h */
625 | int
626 | Zn(decapsulate)(void *secret, size_t secret_len,
627 | 	const Zn(ciphertext) *ct, const Zn(private_key) *sk,
628 | 	void *tmp, size_t tmp_len)
629 | {
630 | 	uint8_t sbuf[SBUF_LEN(LOGN)], m[LVLBYTES], m_alt[LVLBYTES];
631 | 	uint8_t sbuf_alt[SBUF_LEN(LOGN)];
632 | 	int8_t *c_alt;
633 | 	size_t u;
634 | 	uint32_t d;
635 | 
636 | 	tmp = tmp_align(tmp, tmp_len, ZN(TMP_DECAPS) - 31);
637 | 	if (tmp == NULL) {
638 | 		return BAT_ERR_NOSPACE;
639 | 	}
640 | 
641 | 	/*
642 | 	 * Inner decryption never fails (at least, it never reports
643 | 	 * a failure).
644 | 	 */
645 | 	XCAT(bat_decrypt_, Q)(sbuf, ct->c,
646 | 		sk->f, sk->g, sk->F, sk->G, sk->w, LOGN, tmp);
647 | 
648 | 	/*
649 | 	 * From sbuf, we derive the mask that allows recovery of m
650 | 	 * out of the second ciphertext half (c2).
651 | 	 */
652 | 	hash_m(m, sbuf, sizeof sbuf);
653 | 	for (u = 0; u < sizeof m; u ++) {
654 | 		m[u] ^= ct->c2[u];
655 | 	}
656 | 
657 | 	/*
658 | 	 * Decryption is valid if and only if we can reencrypt the
659 | 	 * obtained message m and get the exact same polynomial s
660 | 	 * and ciphertext c1.
661 | 	 */
662 | 	hash_and_sample_s(sbuf_alt, sizeof sbuf_alt, m, sizeof m);
663 | #if N < 8
664 | 	sbuf_alt[0] &= (1u << N) - 1u;
665 | #endif
666 | 	c_alt = tmp; /* the recomputed c1 takes the first N bytes of tmp */
667 | 	tmp = tmp_align((void *)(c_alt + N), ZN(TMP_DECAPS) - N,
668 | 		ZN(TMP_ENCAPS) - 31); /* NOTE(review): the length passed is TMP_DECAPS - N, not the space really left in the caller's buffer - confirm */
669 | 	if (tmp == NULL) {
670 | 		/* This should never happen in practice. */
671 | 		return BAT_ERR_NOSPACE;
672 | 	}
673 | 	d = XCAT(bat_encrypt_, Q)(c_alt, sbuf_alt, sk->h, LOGN, tmp);
674 | 	d --; /* 1 becomes 0; assumes the encrypt function returns exactly 1 on success - TODO confirm */
675 | 	for (u = 0; u < sizeof sbuf; u ++) {
676 | 		d |= sbuf[u] ^ sbuf_alt[u];
677 | 	}
678 | 	for (u = 0; u < sizeof ct->c; u ++) {
679 | 		d |= (uint32_t)(ct->c[u] - c_alt[u]);
680 | 	}
681 | 
682 | 	/*
683 | 	 * If encapsulation worked AND yielded the same ciphertext as
684 | 	 * received, then d == 0 at this point, and we want to produce
685 | 	 * the secret key as a hash of m. Otherwise, d != 0, and we
686 | 	 * must produce the secret as a hash of the received ciphertext
687 | 	 * and the secret value r (stored in sk->rr). We MUST NOT leak
688 | 	 * which was the case, and therefore we must always compute
689 | 	 * both hashes and perform constant-time conditional replacement.
690 | 	 */
691 | 
692 | 	make_kdf_seed_bad(m_alt, sizeof m, sk, ct);
693 | 	d = -((uint32_t)(d | -d) >> 31); /* d stays 0 if it was 0, and becomes 0xFFFFFFFF otherwise */
694 | 	for (u = 0; u < sizeof m; u ++) {
695 | 		m[u] ^= d & (m[u] ^ m_alt[u]); /* keeps m[u] when d == 0, takes m_alt[u] when d is all-ones */
696 | 	}
697 | 	make_secret(secret, secret_len, m, sizeof m, d + 1); /* d + 1 is 1 when valid, 0 (wrap-around) otherwise */
698 | 	return 0;
699 | }
700 | 


--------------------------------------------------------------------------------
/src/api_128_256.c:
--------------------------------------------------------------------------------
1 | #define Q          128 /* modulus q */
2 | #define N          256 /* polynomial degree n */
3 | #define LOGN       8 /* base-2 logarithm of N */
4 | #define LVLBYTES   10 /* size in bytes of the seed m and of the Hash_m() output */
5 | /* Instantiate the generic implementation for these parameters. */
6 | #include "api.c"
7 | 


--------------------------------------------------------------------------------
/src/api_257_512.c:
--------------------------------------------------------------------------------
1 | #define Q          257 /* modulus q */
2 | #define N          512 /* polynomial degree n */
3 | #define LOGN       9 /* base-2 logarithm of N */
4 | #define LVLBYTES   16 /* size in bytes of the seed m and of the Hash_m() output */
5 | /* Instantiate the generic implementation for these parameters. */
6 | #include "api.c"
7 | 


--------------------------------------------------------------------------------
/src/api_769_1024.c:
--------------------------------------------------------------------------------
1 | #define Q          769 /* modulus q */
2 | #define N          1024 /* polynomial degree n */
3 | #define LOGN       10 /* base-2 logarithm of N */
4 | #define LVLBYTES   32 /* size in bytes of the seed m and of the Hash_m() output */
5 | /* Instantiate the generic implementation for these parameters. */
6 | #include "api.c"
7 | 


--------------------------------------------------------------------------------
/src/bat.h:
--------------------------------------------------------------------------------
  1 | #ifndef BAT_H__
  2 | #define BAT_H__
  3 | 
  4 | #include <stddef.h>
  5 | #include <stdint.h>
  6 | 
  7 | /*
  8 |  * For modulus qqq and degree nnn, the following types and functions are
  9 |  * defined:
 10 |  *
 11 |  *   bat_qqq_nnn_private_key
 12 |  *
 13 |  *      Private key structure; contains all private key elements, including
 14 |  *      a copy of the public key.
 15 |  *
 16 |  *   bat_qqq_nnn_public_key
 17 |  *
 18 |  *      Public key structure. Contains only the public key.
 19 |  *
 20 |  *   bat_qqq_nnn_ciphertext
 21 |  *
 22 |  *      Ciphertext structure. Contains the ciphertext polynomial and
 23 |  *      the FO tag.
 24 |  *
 25 |  *   int bat_qqq_nnn_keygen(
 26 |  *           bat_qqq_nnn_private_key *sk, void *tmp, size_t tmp_len);
 27 |  *
 28 |  *      Generate a new key pair. Returned value is 0 on success, a negative
 29 |  *      value on error. Buffer tmp (tmp_len bytes) should be large enough
 30 |  *      (see the BAT_qqq_nnn_TMP_KEYGEN macro).
 31 |  *
 32 |  *   void bat_qqq_nnn_get_public_key(
 33 |  *           bat_qqq_nnn_public_key *pk, const bat_qqq_nnn_private_key *sk);
 34 |  *
 35 |  *      Get a copy of the public key from the private key.
 36 |  *
 37 |  *   size_t bat_qqq_nnn_encode_private_key(
 38 |  *           void *out, size_t max_out_len,
 39 |  *           const bat_qqq_nnn_private_key *sk, int short_format);
 40 |  *
 41 |  *      Encode the private key into bytes. If short_format is zero, then
 42 |  *      the "long format" is used (encoding contains the generation seed,
 43 |  *      the secret value rr, f, g, F, G, w, and the public key h). If
 44 |  *      short_format is non-zero, then the "short format" is used
 45 |  *      (encoding contains only the seed and F). The short format is much
 46 |  *      smaller, but requires more CPU and temporary RAM when decoding.
 47 |  *
 48 |  *      If out is NULL, then max_out_len is ignored, and the function
 49 |  *      returns the size (in bytes) that the encoded private key would have.
 50 |  *      Otherwise, if the encoded private key would be longer than
 51 |  *      max_out_len, then the function returns 0 and encodes nothing.
 52 |  *      Otherwise, the encoded private key is written into out, and its
 53 |  *      size (in bytes) is returned.
 54 |  *
 55 |  *   size_t bat_qqq_nnn_decode_private_key(
 56 |  *           bat_qqq_nnn_private_key *sk,
 57 |  *           const void *in, size_t max_in_len,
 58 |  *           void *tmp, size_t tmp_len);
 59 |  *
 60 |  *      Decode the private key from bytes. If the incoming bytes are
 61 |  *      invalid, or relate to a different set of parameters, or max_in_len
 62 |  *      is shorter than the private key size (i.e. it was truncated),
 63 |  *      then this function returns 0. Otherwise, it returns the actual
 64 |  *      size (in bytes) of the encoded private key (which is not greater
 65 |  *      than max_in_len, but may be lower than max_in_len).
 66 |  *
 67 |  *      If the encoded key uses the long format, then tmp and tmp_len
 68 |  *      are ignored. If the encoded key uses the short format, then
 69 |  *      tmp (of size tmp_len bytes) is used for temporary storage; in
 70 |  *      that case, if the buffer is too short, then the function fails
 71 |  *      and returns 0. The BAT_qqq_nnn_TMP_DECODE_PRIV macro evaluates to
 72 |  *      the required minimal size.
 73 |  *
 74 |  *   size_t bat_qqq_nnn_encode_public_key(
 75 |  *           void *out, size_t max_out_len,
 76 |  *           const bat_qqq_nnn_public_key *pk);
 77 |  *
 78 |  *      Encode the public key into bytes.
 79 |  *
 80 |  *      If out is NULL, then max_out_len is ignored, and the function
 81 |  *      returns the size (in bytes) that the encoded public key would have.
 82 |  *      Otherwise, if the encoded public key would be longer than
 83 |  *      max_out_len, then the function returns 0 and encodes nothing.
 84 |  *      Otherwise, the encoded public key is written into out, and its
 85 |  *      size (in bytes) is returned.
 86 |  *
 87 |  *   size_t bat_qqq_nnn_decode_public_key(
 88 |  *           bat_qqq_nnn_public_key *pk,
 89 |  *           const void *in, size_t max_in_len);
 90 |  *
 91 |  *      Decode the public key from bytes. If the incoming bytes are
 92 |  *      invalid, or relate to a different set of parameters, or max_in_len
 93 |  *      is shorter than the public key size (i.e. it was truncated),
 94 |  *      then this function returns 0. Otherwise, it returns the actual
 95 |  *      size (in bytes) of the encoded public key (which is not greater
 96 |  *      than max_in_len, but may be lower than max_in_len).
 97 |  *
 98 |  *   size_t bat_qqq_nnn_encode_ciphertext(
 99 |  *           void *out, size_t max_out_len,
100 |  *           const bat_qqq_nnn_ciphertext *ct);
101 |  *
102 |  *      Encode the ciphertext into bytes.
103 |  *
104 |  *      If out is NULL, then max_out_len is ignored, and the function
105 |  *      returns the size (in bytes) that the encoded ciphertext would have.
106 |  *      Otherwise, if the encoded ciphertext would be longer than
107 |  *      max_out_len, then the function returns 0 and encodes nothing.
108 |  *      Otherwise, the encoded ciphertext is written into out, and its
109 |  *      size (in bytes) is returned.
110 |  *
111 |  *   size_t bat_qqq_nnn_decode_ciphertext(
112 |  *           bat_qqq_nnn_ciphertext *ct,
113 |  *           const void *in, size_t max_in_len);
114 |  *
115 |  *      Decode the ciphertext from bytes. If the incoming bytes are
116 |  *      invalid, or relate to a different set of parameters, or max_in_len
117 |  *      is shorter than the ciphertext size (i.e. it was truncated),
118 |  *      then this function returns 0. Otherwise, it returns the actual
119 |  *      size (in bytes) of the encoded ciphertext (which is not greater
120 |  *      than max_in_len, but may be lower than max_in_len).
121 |  *
122 |  *   int bat_qqq_nnn_encapsulate(
123 |  *           void *secret, size_t secret_len,
124 |  *           bat_qqq_nnn_ciphertext *ct,
125 |  *           const bat_qqq_nnn_public_key *pk,
126 |  *           void *tmp, size_t tmp_len);
127 |  *
128 |  *      Perform a key encapsulation with the provided public key. The
129 |  *      resulting shared secret is written into secret[], while the
130 |  *      ciphertext is written into *ct. The shared secret length is
131 |  *      arbitrary (it internally comes from a BLAKE2-based KDF) but
132 |  *      of course the sender and receiver should agree on the length to
133 |  *      use, depending on what the secret is for.
134 |  *
135 |  *      On success, 0 is returned; on error, a negative error code is
136 |  *      returned and the secret value is not produced. If the provided
137 |  *      temporary buffer (tmp, of size tmp_len bytes) is too small, then
138 |  *      BAT_ERR_NOSPACE is returned (see BAT_qqq_nnn_TMP_ENCAPS).
139 |  *
140 |  *   int bat_qqq_nnn_encapsulate_explicit_seed(
141 |  *           void *secret, size_t secret_len,
142 |  *           bat_qqq_nnn_ciphertext *ct,
143 |  *           const bat_qqq_nnn_public_key *pk,
144 |  *           const void *m, void *tmp, size_t tmp_len);
145 |  *
146 |  *      This is a variant of bat_qqq_nnn_encapsulate(), in which the
147 |  *      random seed (m[] value) is provided explicitly. This function
148 |  *      is meant mostly for benchmarks and reproducible test vectors,
149 |  *      to avoid the overhead and unpredictability of the OS-provided
150 |  *      random generator; in general, bat_qqq_nnn_encapsulate() SHOULD
151 |  *      be used instead. If m is NULL, then the OS RNG is used to create
152 |  *      the seed. When m is not NULL, then it MUST be generated as a
153 |  *      uniform unpredictable sequence of bytes of the right length for
154 |  *      the target BAT version (10, 16 or 32 bytes, for BAT-128-256,
155 |  *      BAT-257-512 and BAT-769-1024, respectively).
156 |  *
157 |  *   int bat_qqq_nnn_decapsulate(
158 |  *           void *secret, size_t secret_len,
159 |  *           const bat_qqq_nnn_ciphertext *ct,
160 |  *           const bat_qqq_nnn_private_key *sk,
161 |  *           void *tmp, size_t tmp_len);
162 |  *
163 |  *      Perform a key decapsulation with the provided ciphertext and
164 |  *      private key. The resulting shared secret is written into
165 |  *      secret[]. The shared secret length is arbitrary (it internally
166 |  *      comes from a BLAKE2-based KDF) but of course the sender and
167 |  *      receiver should agree on the length to use, depending on what
168 |  *      the secret is for.
169 |  *
170 |  *      On success, 0 is returned; on error, a negative error code is
171 |  *      returned and the secret value is not produced. Such errors are
172 |  *      reported only for local technical reasons unrelated to the
173 |  *      received ciphertext; e.g. BAT_ERR_NOSPACE is returned if the
174 |  *      tmp[] buffer (of size tmp_len bytes) is too small. By
175 |  *      construction of the algorithm, invalid ciphertext values lead to
176 |  *      a recovered shared secret which is deterministic from the
177 |  *      ciphertext and private key, but unpredictable by third parties;
178 |  *      in such cases, this function reports a success (0).
179 |  */
180 | 
/*
 * Declaration generator. BAT_MK(q, n, lvl_bytes, htype) emits, for the
 * parameter set with modulus q and degree n:
 *   - the private key, public key and ciphertext structure types;
 *   - the prototypes of the functions described in the comment above.
 * htype is the element type of the h[] array (the public key);
 * lvl_bytes is the size, in bytes, of the c2[] field of the ciphertext.
 *
 * BAT_ID(q, n, suffix) builds the identifier bat_<q>_<n>_<suffix>.
 * Both macros are private to this header and are undefined after use.
 */
#define BAT_ID(q, n, suffix)   bat_ ## q ## _ ## n ## _ ## suffix

#define BAT_MK(q, n, lvl_bytes, htype) \
	typedef struct { \
		int8_t f[n]; \
		int8_t g[n]; \
		int8_t F[n]; \
		int8_t G[n]; \
		int32_t w[n]; \
		htype h[n]; \
		uint8_t rr[32]; \
		uint8_t seed[32]; \
	} BAT_ID(q, n, private_key); \
	typedef struct { \
		htype h[n]; \
	} BAT_ID(q, n, public_key); \
	typedef struct { \
		int8_t c[n]; \
		uint8_t c2[lvl_bytes]; \
	} BAT_ID(q, n, ciphertext); \
	int BAT_ID(q, n, keygen)( \
		BAT_ID(q, n, private_key) *sk, \
		void *tmp, size_t tmp_len); \
	void BAT_ID(q, n, get_public_key)( \
		BAT_ID(q, n, public_key) *pk, \
		const BAT_ID(q, n, private_key) *sk); \
	size_t BAT_ID(q, n, encode_private_key)( \
		void *out, size_t max_out_len, \
		const BAT_ID(q, n, private_key) *sk, int short_format); \
	size_t BAT_ID(q, n, decode_private_key)( \
		BAT_ID(q, n, private_key) *sk, \
		const void *in, size_t max_in_len, \
		void *tmp, size_t tmp_len); \
	size_t BAT_ID(q, n, encode_public_key)( \
		void *out, size_t max_out_len, \
		const BAT_ID(q, n, public_key) *pk); \
	size_t BAT_ID(q, n, decode_public_key)( \
		BAT_ID(q, n, public_key) *pk, \
		const void *in, size_t max_in_len); \
	size_t BAT_ID(q, n, encode_ciphertext)( \
		void *out, size_t max_out_len, \
		const BAT_ID(q, n, ciphertext) *ct); \
	size_t BAT_ID(q, n, decode_ciphertext)( \
		BAT_ID(q, n, ciphertext) *ct, \
		const void *in, size_t max_in_len); \
	int BAT_ID(q, n, encapsulate)( \
		void *secret, size_t secret_len, \
		BAT_ID(q, n, ciphertext) *ct, \
		const BAT_ID(q, n, public_key) *pk, \
		void *tmp, size_t tmp_len); \
	int BAT_ID(q, n, encapsulate_explicit_seed)( \
		void *secret, size_t secret_len, \
		BAT_ID(q, n, ciphertext) *ct, \
		const BAT_ID(q, n, public_key) *pk, \
		const void *m, void *tmp, size_t tmp_len); \
	int BAT_ID(q, n, decapsulate)( \
		void *secret, size_t secret_len, \
		const BAT_ID(q, n, ciphertext) *ct, \
		const BAT_ID(q, n, private_key) *sk, \
		void *tmp, size_t tmp_len);

/* One instantiation per supported parameter set. */
BAT_MK(128, 256, 10, uint8_t)
BAT_MK(257, 512, 16, uint16_t)
BAT_MK(769, 1024, 32, uint16_t)

#undef BAT_MK
#undef BAT_ID
244 | 
/*
 * Sizes (in bytes) of the temporary buffers (tmp[]) to hand to the
 * functions declared above.
 *
 * Each value is spelled "base + 31": the trailing 31 bytes are the
 * allowance for internal alignment adjustment.
 */
#define BAT_128_256_TMP_KEYGEN         (6144 + 31)
#define BAT_128_256_TMP_DECODE_PRIV    (6144 + 31)
#define BAT_128_256_TMP_ENCAPS         (768 + 31)
#define BAT_128_256_TMP_DECAPS         (2048 + 31)

#define BAT_257_512_TMP_KEYGEN         (12288 + 31)
#define BAT_257_512_TMP_DECODE_PRIV    (12288 + 31)
#define BAT_257_512_TMP_ENCAPS         (2048 + 31)
#define BAT_257_512_TMP_DECAPS         (4096 + 31)

#define BAT_769_1024_TMP_KEYGEN        (24576 + 31)
#define BAT_769_1024_TMP_DECODE_PRIV   (24576 + 31)
#define BAT_769_1024_TMP_ENCAPS        (4096 + 31)
#define BAT_769_1024_TMP_DECAPS        (8192 + 31)
265 | 
/*
 * Error codes. All of them are negative; functions returning an int
 * report success with 0. Values are parenthesized so that each macro
 * expands to a single primary expression.
 */

/* Decapsulation failed. */
#define BAT_ERR_DECAPS_FAILED   (-1)

/* The provided key or ciphertext was made for another parameter set
   (modulus and/or degree) than the one the called function handles. */
#define BAT_ERR_WRONG_PARAMS    (-2)

/* The provided key or ciphertext is not validly encoded. */
#define BAT_ERR_BAD_ENCODING    (-3)

/* The temporary buffer is too short for the requested operation. */
#define BAT_ERR_NOSPACE         (-4)

/* The operating system could not provide random seeding. */
#define BAT_ERR_RANDOM          (-5)
286 | 
/*
 * Tag bytes. The first byte of every encoded public key, private key or
 * ciphertext identifies the object type and the parameter set. Bit
 * layout, from most to least significant:
 *
 *    t t q q n n n n
 *
 *  - tt   (bits 7..6): 0 = private key (long format), 1 = private key
 *         (short format), 2 = public key, 3 = ciphertext.
 *  - qq   (bits 5..4): 0 for q = 128, 1 for q = 257, 2 for q = 769.
 *  - nnnn (bits 3..0): log2(n), n being the degree (power of 2, up to
 *         1024).
 *
 * Each constant below is written as (tt << 6) | (qq << 4) | log2(n).
 */
#define BAT_128_256_TAG_PRIVKEY_LONG     ((0 << 6) | (0 << 4) |  8)
#define BAT_128_256_TAG_PRIVKEY_SHORT    ((1 << 6) | (0 << 4) |  8)
#define BAT_128_256_TAG_PUBKEY           ((2 << 6) | (0 << 4) |  8)
#define BAT_128_256_TAG_CIPHERTEXT       ((3 << 6) | (0 << 4) |  8)

#define BAT_257_512_TAG_PRIVKEY_LONG     ((0 << 6) | (1 << 4) |  9)
#define BAT_257_512_TAG_PRIVKEY_SHORT    ((1 << 6) | (1 << 4) |  9)
#define BAT_257_512_TAG_PUBKEY           ((2 << 6) | (1 << 4) |  9)
#define BAT_257_512_TAG_CIPHERTEXT       ((3 << 6) | (1 << 4) |  9)

#define BAT_769_1024_TAG_PRIVKEY_LONG    ((0 << 6) | (2 << 4) | 10)
#define BAT_769_1024_TAG_PRIVKEY_SHORT   ((1 << 6) | (2 << 4) | 10)
#define BAT_769_1024_TAG_PUBKEY          ((2 << 6) | (2 << 4) | 10)
#define BAT_769_1024_TAG_CIPHERTEXT      ((3 << 6) | (2 << 4) | 10)
315 | 
316 | #endif
317 | 


--------------------------------------------------------------------------------
/src/blake2.h:
--------------------------------------------------------------------------------
 1 | #ifndef BLAKE2_H__
 2 | #define BLAKE2_H__
 3 | 
 4 | #include <stddef.h>
 5 | #include <stdint.h>
 6 | 
 7 | #ifdef __cplusplus
 8 | extern "C" {
 9 | #endif
10 | 
/*
 * BLAKE2s streaming context.
 * NOTE(review): the field roles below are presumed to mirror those of
 * blake2b_context (see the "State rules" comment in blake2b.c) -- confirm
 * against blake2s.c.
 */
typedef struct {
	uint8_t buf[64];   /* buffered input, one 64-byte block */
	uint32_t h[8];     /* chaining state (eight 32-bit words) */
	uint64_t ctr;      /* presumably: number of bytes injected so far */
	size_t out_len;    /* presumably: requested output length, in bytes */
} blake2s_context;
17 | 
18 | void blake2s_init(blake2s_context *bc, size_t out_len);
19 | 
20 | void blake2s_init_key(blake2s_context *bc, size_t out_len,
21 | 	const void *key, size_t key_len);
22 | 
23 | void blake2s_update(blake2s_context *bc, const void *data, size_t len);
24 | 
25 | void blake2s_final(blake2s_context *bc, void *dst);
26 | 
27 | void blake2s(void *dst, size_t dst_len, const void *key, size_t key_len,
28 | 	const void *src, size_t src_len);
29 | 
30 | /*
31 |  * Use BLAKE2s as a PRNG: for a given seed, compute the concatenation:
32 |  *   H(label || 0 || seed) || H(label || 1 || seed) || ...
33 |  * with:
34 |  *   H = BLAKE2s with a 32-byte output
35 |  *   seed = provided seed (length MUST be at most 48 bytes)
36 |  *   label = provided value (64-bit, little-endian)
37 |  *   0, 1,... = block counter (64-bit, little-endian)
38 |  * The concatenation output is truncated to dst_len and written in dst[].
39 |  * The seed and dst buffers may overlap arbitrarily.
40 |  */
41 | void blake2s_expand(void *dst, size_t dst_len,
42 | 	const void *seed, size_t seed_len, uint64_t label);
43 | 
/*
 * BLAKE2b streaming context, set up by blake2b_init() or
 * blake2b_init_key(), fed with blake2b_update() and closed with
 * blake2b_final().
 */
typedef struct {
	uint8_t buf[128];  /* buffered data; may hold a full block whose processing is delayed */
	uint64_t h[8];     /* current state */
	uint64_t ctr;      /* number of bytes injected so far (a key counts as one full block) */
	size_t out_len;    /* output length given at init; blake2b_final() writes that many bytes */
} blake2b_context;
50 | 
51 | void blake2b_init(blake2b_context *bc, size_t out_len);
52 | 
53 | void blake2b_init_key(blake2b_context *bc, size_t out_len,
54 | 	const void *key, size_t key_len);
55 | 
56 | void blake2b_update(blake2b_context *bc, const void *data, size_t len);
57 | 
58 | void blake2b_final(blake2b_context *bc, void *dst);
59 | 
60 | void blake2b(void *dst, size_t dst_len, const void *key, size_t key_len,
61 | 	const void *src, size_t src_len);
62 | 
63 | /*
64 |  * Use BLAKE2b as a PRNG: for a given seed, compute the concatenation:
65 |  *   H(label || 0 || seed) || H(label || 1 || seed) || ...
66 |  * with:
67 |  *   H = BLAKE2b with a 64-byte output
68 |  *   seed = provided seed (length MUST be at most 112 bytes)
69 |  *   label = provided value (64-bit, little-endian)
70 |  *   0, 1,... = block counter (64-bit, little-endian)
71 |  * The concatenation output is truncated to dst_len and written in dst[].
72 |  * The seed and dst buffers may overlap arbitrarily.
73 |  */
74 | void blake2b_expand(void *dst, size_t dst_len,
75 | 	const void *seed, size_t seed_len, uint64_t label);
76 | 
77 | #ifdef __cplusplus
78 | }
79 | #endif
80 | 
81 | #endif
82 | 


--------------------------------------------------------------------------------
/src/blake2b.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * BLAKE2b implementation (API declared in blake2.h).
  3 |  */
  4 | 
  5 | /* ====================================================================== */
  6 | 
  7 | #include <stdint.h>
  8 | #include <string.h>
  9 | 
 10 | #include "blake2.h"
 11 | 
 12 | #include "inner.h"
 13 | #define BLAKE2_AVX2        BAT_AVX2
 14 | #define BLAKE2_LE          BAT_LE
 15 | #define BLAKE2_UNALIGNED   BAT_UNALIGNED
 16 | 
 17 | static const uint64_t IV[] = {
 18 | 	0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
 19 | 	0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
 20 | 	0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
 21 | 	0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
 22 | };
 23 | 
 24 | static void
 25 | process_block(uint64_t *h, const uint8_t *data, uint64_t t, int f)
 26 | {
 27 | 	uint64_t v[16], m[16];
 28 | 	int i;
 29 | 
 30 | 	memcpy(v, h, 8 * sizeof(uint64_t));
 31 | 	memcpy(v + 8, IV, sizeof IV);
 32 | 	v[12] ^= t;
 33 | 	if (f) {
 34 | 		v[14] = ~v[14];
 35 | 	}
 36 | 
 37 | #if BLAKE2_LE
 38 | 	memcpy(m, data, sizeof m);
 39 | #else
 40 | 	for (i = 0; i < 16; i ++) {
 41 | 		m[i] = dec64le(data + (i << 3));
 42 | 	}
 43 | #endif
 44 | 
 45 | #define ROR(x, n)   (((x) << (64 - (n))) | ((x) >> (n)))
 46 | 
 47 | #define G(a, b, c, d, x, y)   do { \
 48 | 		v[a] += v[b] + (x); \
 49 | 		v[d] = ROR(v[d] ^ v[a], 32); \
 50 | 		v[c] += v[d]; \
 51 | 		v[b] = ROR(v[b] ^ v[c], 24); \
 52 | 		v[a] += v[b] + (y); \
 53 | 		v[d] = ROR(v[d] ^ v[a], 16); \
 54 | 		v[c] += v[d]; \
 55 | 		v[b] = ROR(v[b] ^ v[c], 63); \
 56 | 	} while (0)
 57 | 
 58 | #define ROUND(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF) \
 59 | 	do { \
 60 | 		G(0, 4,  8, 12, m[s0], m[s1]); \
 61 | 		G(1, 5,  9, 13, m[s2], m[s3]); \
 62 | 		G(2, 6, 10, 14, m[s4], m[s5]); \
 63 | 		G(3, 7, 11, 15, m[s6], m[s7]); \
 64 | 		G(0, 5, 10, 15, m[s8], m[s9]); \
 65 | 		G(1, 6, 11, 12, m[sA], m[sB]); \
 66 | 		G(2, 7,  8, 13, m[sC], m[sD]); \
 67 | 		G(3, 4,  9, 14, m[sE], m[sF]); \
 68 | 	} while (0)
 69 | 
 70 | 	ROUND( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
 71 | 	ROUND(14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3);
 72 | 	ROUND(11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4);
 73 | 	ROUND( 7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8);
 74 | 	ROUND( 9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13);
 75 | 	ROUND( 2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9);
 76 | 	ROUND(12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11);
 77 | 	ROUND(13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10);
 78 | 	ROUND( 6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5);
 79 | 	ROUND(10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0);
 80 | 	ROUND( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
 81 | 	ROUND(14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3);
 82 | 
 83 | #undef ROR
 84 | #undef G
 85 | #undef ROUND
 86 | 
 87 | 	for (i = 0; i < 8; i ++) {
 88 | 		h[i] ^= v[i] ^ v[i + 8];
 89 | 	}
 90 | }
 91 | 
 92 | /*
 93 |  * State rules:
 94 |  *
 95 |  *   buf    buffered data
 96 |  *   h      current state
 97 |  *   ctr    number of bytes injected so far
 98 |  *
 99 |  * Initially, ctr == 0 and h contains the XOR of IV and parameter block;
100 |  * buf[] is empty. For any ctr > 0, buf[] is non-empty; it might contain
101 |  * a full block worth of data (processing of the block is delayed until
102 |  * we know whether this is the final block or not).
103 |  *
104 |  * If a key is injected, then it counts as a first full block.
105 |  */
106 | 
107 | /* see blake2.h */
108 | void
109 | blake2b_init(blake2b_context *bc, size_t out_len)
110 | {
111 | 	memcpy(bc->h, IV, sizeof bc->h);
112 | 	bc->h[0] ^= 0x01010000 ^ (uint64_t)out_len;
113 | 	bc->ctr = 0;
114 | 	bc->out_len = out_len;
115 | }
116 | 
117 | /* see blake2.h */
118 | void
119 | blake2b_init_key(blake2b_context *bc, size_t out_len,
120 | 	const void *key, size_t key_len)
121 | {
122 | 	blake2b_init(bc, out_len);
123 | 	if (key_len > 0) {
124 | 		bc->h[0] ^= (uint64_t)key_len << 8;
125 | 		memcpy(bc->buf, key, key_len);
126 | 		memset(bc->buf + key_len, 0, (sizeof bc->buf) - key_len);
127 | 		bc->ctr = sizeof bc->buf;
128 | 	}
129 | }
130 | 
131 | /* see blake2.h */
132 | void
133 | blake2b_update(blake2b_context *bc, const void *data, size_t len)
134 | {
135 | 	uint64_t ctr;
136 | 	size_t p;
137 | 
138 | 	/* Special case: if no input data, return immediately. */
139 | 	if (len == 0) {
140 | 		return;
141 | 	}
142 | 
143 | 	ctr = bc->ctr;
144 | 
145 | 	/* First complete the current block, if not already full. */
146 | 	p = (size_t)ctr & ((sizeof bc->buf) - 1);
147 | 	if (ctr == 0 || p != 0) {
148 | 		/* buffer is not full */
149 | 		size_t clen;
150 | 
151 | 		clen = sizeof bc->buf - p;
152 | 		if (clen >= len) {
153 | 			memcpy(bc->buf + p, data, len);
154 | 			bc->ctr = ctr + len;
155 | 			return;
156 | 		}
157 | 		memcpy(bc->buf + p, data, clen);
158 | 		ctr += clen;
159 | 		data = (const uint8_t *)data + clen;
160 | 		len -= clen;
161 | 	}
162 | 
163 | 	/* Process the buffered block. */
164 | 	process_block(bc->h, bc->buf, ctr, 0);
165 | 
166 | 	/* Process all subsequent full blocks, except the last. */
167 | 	while (len > sizeof bc->buf) {
168 | 		ctr += sizeof bc->buf;
169 | 		process_block(bc->h, data, ctr, 0);
170 | 		data = (const uint8_t *)data + sizeof bc->buf;
171 | 		len -= sizeof bc->buf;
172 | 	}
173 | 
174 | 	/* Copy the last block (possibly partial) into the buffer. */
175 | 	memcpy(bc->buf, data, len);
176 | 	bc->ctr = ctr + len;
177 | }
178 | 
179 | /* see blake2.h */
180 | void
181 | blake2b_final(blake2b_context *bc, void *dst)
182 | {
183 | #if !BLAKE2_LE
184 | 	int i;
185 | 	uint8_t tmp[64];
186 | #endif
187 | 	size_t p;
188 | 
189 | 	/* Pad the current block with zeros, if not full. If the
190 | 	   buffer is empty (no key, no data) then fill it with zeros
191 | 	   as well. */
192 | 	p = (size_t)bc->ctr & ((sizeof bc->buf) - 1);
193 | 	if (bc->ctr == 0 || p != 0) {
194 | 		memset(bc->buf + p, 0, (sizeof bc->buf) - p);
195 | 	}
196 | 
197 | 	process_block(bc->h, bc->buf, bc->ctr, 1);
198 | #if BLAKE2_LE
199 | 	memcpy(dst, bc->h, bc->out_len);
200 | #else
201 | 	for (i = 0; i < 8; i ++) {
202 | 		enc64le(tmp + (i << 3), bc->h[i]);
203 | 	}
204 | 	memcpy(dst, tmp, bc->out_len);
205 | #endif
206 | }
207 | 
208 | /* see blake2.h */
209 | void
210 | blake2b(void *dst, size_t dst_len, const void *key, size_t key_len,
211 | 	const void *src, size_t src_len)
212 | {
213 | 	blake2b_context bc;
214 | 
215 | 	blake2b_init_key(&bc, dst_len, key, key_len);
216 | 	blake2b_update(&bc, src, src_len);
217 | 	blake2b_final(&bc, dst);
218 | }
219 | 
220 | /* see blake2.h */
221 | void
222 | blake2b_expand(void *dst, size_t dst_len,
223 | 	const void *seed, size_t seed_len, uint64_t label)
224 | {
225 | 	uint64_t h[8];
226 | 	uint8_t buf[128];
227 | 	size_t in_len;
228 | 	uint64_t num;
229 | 
230 | 	in_len = 16 + seed_len;
231 | 	enc64le(buf, label);
232 | 	memset(buf + 8, 0, 8);
233 | 	memcpy(buf + 16, seed, seed_len);
234 | 	memset(buf + in_len, 0, (sizeof buf) - in_len);
235 | 	num = 0;
236 | 	while (dst_len > 0) {
237 | 		size_t clen;
238 | #if !BLAKE2_LE
239 | 		uint8_t tmp[64];
240 | 		int i;
241 | #endif
242 | 
243 | 		memcpy(h, IV, sizeof h);
244 | 		h[0] ^= 0x01010000 ^ (sizeof h);
245 | 		enc64le(buf + 8, num ++);
246 | 		process_block(h, buf, in_len, 1);
247 | 		clen = dst_len < (sizeof h) ? dst_len : (sizeof h);
248 | #if BLAKE2_LE
249 | 		memcpy(dst, h, clen);
250 | #else
251 | 		for (i = 0; i < 8; i ++) {
252 | 			enc64le(tmp + (i << 3), h[i]);
253 | 		}
254 | 		memcpy(dst, tmp, clen);
255 | #endif
256 | 		dst_len -= clen;
257 | 		dst = (uint8_t *)dst + clen;
258 | 	}
259 | }
260 | 


--------------------------------------------------------------------------------
/src/blake2s.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * BLAKE2s implementation (API declared in blake2.h).
  3 |  */
  4 | 
  5 | /* ====================================================================== */
  6 | 
  7 | #include <stdint.h>
  8 | #include <string.h>
  9 | 
 10 | #include "blake2.h"
 11 | 
 12 | #include "inner.h"
 13 | #define BLAKE2_AVX2        BAT_AVX2
 14 | #define BLAKE2_LE          BAT_LE
 15 | #define BLAKE2_UNALIGNED   BAT_UNALIGNED
 16 | 
 17 | ALIGNED_AVX2
 18 | static const uint32_t IV[] = {
 19 | 	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
 20 | 	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
 21 | };
 22 | 
 23 | #if BLAKE2_AVX2
 24 | 
 25 | TARGET_AVX2
 26 | static void
 27 | process_block(uint32_t *h, const uint8_t *data, uint64_t t, int f)
 28 | {
 29 | 	__m128i xh0, xh1, xv0, xv1, xv2, xv3;
 30 | 	__m128i xm0, xm1, xm2, xm3, xn0, xn1, xn2, xn3;
 31 | 	__m128i xt0, xt1, xt2, xt3, xt4, xt5, xt6, xt7, xt8, xt9;
 32 | 	__m128i xror8, xror16;
 33 | 
 34 | 	xror8 = _mm_setr_epi8(
 35 | 		1, 2, 3, 0, 5, 6, 7, 4,
 36 | 		9, 10, 11, 8, 13, 14, 15, 12);
 37 | 	xror16 = _mm_setr_epi8(
 38 | 		2, 3, 0, 1, 6, 7, 4, 5,
 39 | 		10, 11, 8, 9, 14, 15, 12, 13);
 40 | 
 41 | 	/* Initialize state. */
 42 | 	xh0 = _mm_loadu_si128((const void *)(h + 0));
 43 | 	xh1 = _mm_loadu_si128((const void *)(h + 4));
 44 | 	xv0 = xh0;
 45 | 	xv1 = xh1;
 46 | 	xv2 = _mm_loadu_si128((const void *)(IV + 0));
 47 | 	xv3 = _mm_loadu_si128((const void *)(IV + 4));
 48 | 	xv3 = _mm_xor_si128(xv3, _mm_setr_epi32(
 49 | 		(int32_t)(uint32_t)t, (int32_t)(uint32_t)(t >> 32),
 50 | 		-f, 0));
 51 | 
 52 | 	/* Load data and move it into the proper order for the first round:
 53 | 	     xm0:  0  2  4  6
 54 | 	     xm1:  1  3  5  7
 55 | 	     xm2:  8 10 12 14
 56 | 	     xm3:  9 11 13 15 */
 57 | 	xm0 = _mm_loadu_si128((const void *)(data +  0));
 58 | 	xm1 = _mm_loadu_si128((const void *)(data + 16));
 59 | 	xm2 = _mm_loadu_si128((const void *)(data + 32));
 60 | 	xm3 = _mm_loadu_si128((const void *)(data + 48));
 61 | 
 62 | 	xn0 = _mm_shuffle_epi32(xm0, 0xD8);
 63 | 	xn1 = _mm_shuffle_epi32(xm1, 0xD8);
 64 | 	xm0 = _mm_unpacklo_epi64(xn0, xn1);
 65 | 	xm1 = _mm_unpackhi_epi64(xn0, xn1);
 66 | 
 67 | 	xn2 = _mm_shuffle_epi32(xm2, 0xD8);
 68 | 	xn3 = _mm_shuffle_epi32(xm3, 0xD8);
 69 | 	xm2 = _mm_unpacklo_epi64(xn2, xn3);
 70 | 	xm3 = _mm_unpackhi_epi64(xn2, xn3);
 71 | 
 72 | #define G4(xx, xy)   do { \
 73 | 		__m128i xtg; \
 74 | 		xv0 = _mm_add_epi32(xv0, _mm_add_epi32(xv1, xx)); \
 75 | 		xv3 = _mm_shuffle_epi8(_mm_xor_si128(xv0, xv3), xror16); \
 76 | 		xv2 = _mm_add_epi32(xv2, xv3); \
 77 | 		xtg = _mm_xor_si128(xv1, xv2); \
 78 | 		xv1 = _mm_or_si128( \
 79 | 			_mm_srli_epi32(xtg, 12), _mm_slli_epi32(xtg, 20)); \
 80 | 		xv0 = _mm_add_epi32(xv0, _mm_add_epi32(xv1, xy)); \
 81 | 		xv3 = _mm_shuffle_epi8(_mm_xor_si128(xv0, xv3), xror8); \
 82 | 		xv2 = _mm_add_epi32(xv2, xv3); \
 83 | 		xtg = _mm_xor_si128(xv1, xv2); \
 84 | 		xv1 = _mm_or_si128( \
 85 | 			_mm_srli_epi32(xtg, 7), _mm_slli_epi32(xtg, 25)); \
 86 | 	} while (0)
 87 | 
 88 | #define ROUND(i0, i1, i2, i3)   do { \
 89 | 		G4(i0, i1); \
 90 | 		xv1 = _mm_shuffle_epi32(xv1, 0x39); \
 91 | 		xv2 = _mm_shuffle_epi32(xv2, 0x4E); \
 92 | 		xv3 = _mm_shuffle_epi32(xv3, 0x93); \
 93 | 		G4(i2, i3); \
 94 | 		xv1 = _mm_shuffle_epi32(xv1, 0x93); \
 95 | 		xv2 = _mm_shuffle_epi32(xv2, 0x4E); \
 96 | 		xv3 = _mm_shuffle_epi32(xv3, 0x39); \
 97 | 	} while (0)
 98 | 
 99 | 	/* round 0 */
100 | 	ROUND(xm0, xm1, xm2, xm3);
101 | 
102 | 	/* round 1 */
103 | 	xt0 = _mm_shuffle_epi32(xm0, 0x00);
104 | 	xt1 = _mm_shuffle_epi32(xm0, 0xC8);
105 | 	xt2 = _mm_shuffle_epi32(xm1, 0x70);
106 | 	xt3 = _mm_shuffle_epi32(xm1, 0x80);
107 | 	xt4 = _mm_shuffle_epi32(xm2, 0x01);
108 | 	xt5 = _mm_shuffle_epi32(xm2, 0x02);
109 | 	xt6 = _mm_shuffle_epi32(xm2, 0x03);
110 | 	xt7 = _mm_shuffle_epi32(xm3, 0x80);
111 | 	xt8 = _mm_shuffle_epi32(xm3, 0x10);
112 | 	xt9 = _mm_shuffle_epi32(xm3, 0x30);
113 | 	xn0 = _mm_blend_epi32(
114 | 		_mm_blend_epi32(xt6, xt1, 0x02),
115 | 		xt7, 0x0C);
116 | 	xn1 = _mm_blend_epi32(
117 | 		_mm_blend_epi32(xt4, xt9, 0x04),
118 | 		xt1, 0x08);
119 | 	xn2 = _mm_blend_epi32(
120 | 		_mm_blend_epi32(xt3, xt0, 0x02),
121 | 		xt8, 0x04);
122 | 	xn3 = _mm_blend_epi32(
123 | 		_mm_blend_epi32(xt5, xm0, 0x02),
124 | 		xt2, 0x0C);
125 | 	ROUND(xn0, xn1, xn2, xn3);
126 | 
127 | 	/* round 2 */
128 | 	xt0 = _mm_shuffle_epi32(xn0, 0x40);
129 | 	xt1 = _mm_shuffle_epi32(xn0, 0x80);
130 | 	xt2 = _mm_shuffle_epi32(xn1, 0x80);
131 | 	xt3 = _mm_shuffle_epi32(xn1, 0x0D);
132 | 	xt4 = _mm_shuffle_epi32(xn2, 0x04);
133 | 	xt5 = _mm_shuffle_epi32(xn2, 0x32);
134 | 	xt6 = _mm_shuffle_epi32(xn3, 0x10);
135 | 	xt7 = _mm_shuffle_epi32(xn3, 0x2C);
136 | 	xm0 = _mm_blend_epi32(
137 | 		_mm_blend_epi32(xt5, xt6, 0x02),
138 | 		xt2, 0x08);
139 | 	xm1 = _mm_blend_epi32(
140 | 		_mm_blend_epi32(xt3, xt4, 0x02),
141 | 		_mm_blend_epi32(xt6, xn0, 0x08), 0x0C);
142 | 	xm2 = _mm_blend_epi32(
143 | 		_mm_blend_epi32(xt2, xt7, 0x06),
144 | 		xt1, 0x08);
145 | 	xm3 = _mm_blend_epi32(
146 | 		_mm_blend_epi32(xt0, xt3, 0x02),
147 | 		xt4, 0x04);
148 | 	ROUND(xm0, xm1, xm2, xm3);
149 | 
150 | 	/* round 3 */
151 | 	xt0 = _mm_shuffle_epi32(xm0, 0x10);
152 | 	xt1 = _mm_shuffle_epi32(xm0, 0xC8);
153 | 	xt2 = _mm_shuffle_epi32(xm1, 0x10);
154 | 	xt3 = _mm_shuffle_epi32(xm1, 0x32);
155 | 	xt4 = _mm_shuffle_epi32(xm2, 0x03);
156 | 	xt5 = _mm_shuffle_epi32(xm2, 0x06);
157 | 	xt6 = _mm_shuffle_epi32(xm3, 0x39);
158 | 	xn0 = _mm_blend_epi32(
159 | 		_mm_blend_epi32(xt5, xt3, 0x04),
160 | 		xt0, 0x08);
161 | 	xn1 = _mm_blend_epi32(
162 | 		_mm_blend_epi32(xt4, xt6, 0x0A),
163 | 		xt0, 0x04);
164 | 	xn2 = _mm_blend_epi32(
165 | 		_mm_blend_epi32(xt3, xt1, 0x0A),
166 | 		xt6, 0x04);
167 | 	xn3 = _mm_blend_epi32(
168 | 		_mm_blend_epi32(xt6, xt4, 0x02),
169 | 		xt2, 0x0C);
170 | 	ROUND(xn0, xn1, xn2, xn3);
171 | 
172 | 	/* round 4 */
173 | 	xt0 = _mm_shuffle_epi32(xn0, 0x80);
174 | 	xt1 = _mm_shuffle_epi32(xn0, 0x4C);
175 | 	xt2 = _mm_shuffle_epi32(xn1, 0x09);
176 | 	xt3 = _mm_shuffle_epi32(xn1, 0x03);
177 | 	xt4 = _mm_shuffle_epi32(xn2, 0x04);
178 | 	xt5 = _mm_shuffle_epi32(xn3, 0x40);
179 | 	xt6 = _mm_shuffle_epi32(xn3, 0x32);
180 | 	xm0 = _mm_blend_epi32(
181 | 		_mm_blend_epi32(xn1, xt4, 0x06),
182 | 		xt5, 0x08);
183 | 	xm1 = _mm_blend_epi32(
184 | 		_mm_blend_epi32(xt6, xt0, 0x02),
185 | 		xn2, 0x0C);
186 | 	xm2 = _mm_blend_epi32(
187 | 		_mm_blend_epi32(xt3, xt1, 0x0A),
188 | 		xt5, 0x04);
189 | 	xm3 = _mm_blend_epi32(
190 | 		_mm_blend_epi32(xt2, xt6, 0x04),
191 | 		xt0, 0x08);
192 | 	ROUND(xm0, xm1, xm2, xm3);
193 | 
194 | 	/* round 5 */
195 | 	xt0 = _mm_shuffle_epi32(xm0, 0x04);
196 | 	xt1 = _mm_shuffle_epi32(xm0, 0x0E);
197 | 	xt2 = _mm_shuffle_epi32(xm1, 0x04);
198 | 	xt3 = _mm_shuffle_epi32(xm1, 0x32);
199 | 	xt4 = _mm_shuffle_epi32(xm2, 0x08);
200 | 	xt5 = _mm_shuffle_epi32(xm2, 0xD0);
201 | 	xt6 = _mm_shuffle_epi32(xm3, 0x01);
202 | 	xt7 = _mm_shuffle_epi32(xm3, 0x83);
203 | 	xn0 = _mm_blend_epi32(
204 | 		_mm_blend_epi32(xt1, xt4, 0x02),
205 | 		_mm_blend_epi32(xt2, xt7, 0x08), 0x0C);
206 | 	xn1 = _mm_blend_epi32(
207 | 		_mm_blend_epi32(xt6, xt1, 0x02),
208 | 		xt5, 0x0C);
209 | 	xn2 = _mm_blend_epi32(
210 | 		_mm_blend_epi32(xt3, xt2, 0x02),
211 | 		xt6, 0x08);
212 | 	xn3 = _mm_blend_epi32(
213 | 		_mm_blend_epi32(xt7, xt0, 0x0A),
214 | 		xt4, 0x04);
215 | 	ROUND(xn0, xn1, xn2, xn3);
216 | 
217 | 	/* round 6 */
218 | 	xt0 = _mm_shuffle_epi32(xn0, 0xC6);
219 | 	xt1 = _mm_shuffle_epi32(xn1, 0x40);
220 | 	xt2 = _mm_shuffle_epi32(xn1, 0x8C);
221 | 	xt3 = _mm_shuffle_epi32(xn2, 0x09);
222 | 	xt4 = _mm_shuffle_epi32(xn2, 0x0C);
223 | 	xt5 = _mm_shuffle_epi32(xn3, 0x01);
224 | 	xt6 = _mm_shuffle_epi32(xn3, 0x30);
225 | 	xm0 = _mm_blend_epi32(
226 | 		_mm_blend_epi32(xt1, xt4, 0x0A),
227 | 		xn3, 0x04);
228 | 	xm1 = _mm_blend_epi32(
229 | 		_mm_blend_epi32(xt5, xt3, 0x02),
230 | 		xt1, 0x08);
231 | 	xm2 = _mm_blend_epi32(xt0, xt6, 0x04);
232 | 	xm3 = _mm_blend_epi32(
233 | 		_mm_blend_epi32(xt3, xt2, 0x0A),
234 | 		xt0, 0x04);
235 | 	ROUND(xm0, xm1, xm2, xm3);
236 | 
237 | 	/* round 7 */
238 | 	xt0 = _mm_shuffle_epi32(xm0, 0x0C);
239 | 	xt1 = _mm_shuffle_epi32(xm0, 0x18);
240 | 	xt2 = _mm_shuffle_epi32(xm1, 0xC2);
241 | 	xt3 = _mm_shuffle_epi32(xm2, 0x10);
242 | 	xt4 = _mm_shuffle_epi32(xm2, 0xB0);
243 | 	xt5 = _mm_shuffle_epi32(xm3, 0x40);
244 | 	xt6 = _mm_shuffle_epi32(xm3, 0x83);
245 | 	xn0 = _mm_blend_epi32(
246 | 		_mm_blend_epi32(xt2, xt5, 0x0A),
247 | 		xt0, 0x04);
248 | 	xn1 = _mm_blend_epi32(
249 | 		_mm_blend_epi32(xt6, xt1, 0x06),
250 | 		xt4, 0x08);
251 | 	xn2 = _mm_blend_epi32(
252 | 		_mm_blend_epi32(xm1, xt4, 0x04),
253 | 		xt6, 0x08);
254 | 	xn3 = _mm_blend_epi32(
255 | 		_mm_blend_epi32(xt3, xt0, 0x02),
256 | 		xt2, 0x08);
257 | 	ROUND(xn0, xn1, xn2, xn3);
258 | 
259 | 	/* round 8 */
260 | 	xt0 = _mm_shuffle_epi32(xn0, 0x02);
261 | 	xt1 = _mm_shuffle_epi32(xn0, 0x34);
262 | 	xt2 = _mm_shuffle_epi32(xn1, 0x0C);
263 | 	xt3 = _mm_shuffle_epi32(xn2, 0x03);
264 | 	xt4 = _mm_shuffle_epi32(xn2, 0x81);
265 | 	xt5 = _mm_shuffle_epi32(xn3, 0x02);
266 | 	xt6 = _mm_shuffle_epi32(xn3, 0xD0);
267 | 	xm0 = _mm_blend_epi32(
268 | 		_mm_blend_epi32(xt5, xn1, 0x02),
269 | 		xt2, 0x04);
270 | 	xm1 = _mm_blend_epi32(
271 | 		_mm_blend_epi32(xt4, xt2, 0x02),
272 | 		xt1, 0x04);
273 | 	xm2 = _mm_blend_epi32(
274 | 		_mm_blend_epi32(xt0, xn1, 0x04),
275 | 		xt6, 0x08);
276 | 	xm3 = _mm_blend_epi32(
277 | 		_mm_blend_epi32(xt3, xt1, 0x02),
278 | 		xt6, 0x04);
279 | 	ROUND(xm0, xm1, xm2, xm3);
280 | 
281 | 	/* round 9 */
282 | 	xt0 = _mm_shuffle_epi32(xm0, 0xC6);
283 | 	xt1 = _mm_shuffle_epi32(xm1, 0x2C);
284 | 	xt2 = _mm_shuffle_epi32(xm2, 0x40);
285 | 	xt3 = _mm_shuffle_epi32(xm2, 0x83);
286 | 	xt4 = _mm_shuffle_epi32(xm3, 0xD8);
287 | 	xn0 = _mm_blend_epi32(
288 | 		_mm_blend_epi32(xt3, xt1, 0x02),
289 | 		xt4, 0x04);
290 | 	xn1 = _mm_blend_epi32(xt4, xt0, 0x04);
291 | 	xn2 = _mm_blend_epi32(
292 | 		_mm_blend_epi32(xm1, xt1, 0x04),
293 | 		xt2, 0x08);
294 | 	xn3 = _mm_blend_epi32(xt0, xt2, 0x04);
295 | 	ROUND(xn0, xn1, xn2, xn3);
296 | 
297 | #undef G4
298 | #undef ROUND
299 | 
300 | 	xh0 = _mm_xor_si128(xh0, _mm_xor_si128(xv0, xv2));
301 | 	xh1 = _mm_xor_si128(xh1, _mm_xor_si128(xv1, xv3));
302 | 	_mm_storeu_si128((void *)(h + 0), xh0);
303 | 	_mm_storeu_si128((void *)(h + 4), xh1);
304 | }
305 | 
/*
 * Optimized AVX2 implementation for blake2s_expand().
 *
 * Input buffer data_x2[] has size 128 bytes; it is filled with two
 * interlaced instances of the label, initial block counter and seed in
 * their proper positions, and zero elsewhere. "Interlaced" means that
 * each 16-byte chunk of the 64-byte message appears twice in a row
 * (once per instance), so that a 32-byte load yields the same four
 * message words for both instances, one instance per 128-bit lane.
 *
 * In data_x2[], the counter fields are supposed to be already set for the
 * two first blocks.
 *
 * 'data_len' is the message length (16 + length of seed, not the length
 * of the duplicated buffer data_x2).
 *
 * This function produces dst_len bytes. dst_len MUST be non-zero,
 * and a multiple of 64 bytes (each loop iteration writes 64 bytes, and
 * the loop only stops when the remaining length is exactly zero).
 *
 * The function internally increments the block counters over 32 bits only,
 * without carry propagation. The caller is responsible for calling this
 * function with initial counter and dst_len values that ensure that no
 * carry propagation is missed.
 */
TARGET_AVX2
static void
expand_inner_x2(uint8_t *dst, size_t dst_len,
	const uint8_t *data_x2, size_t data_len)
{
	/* Initial value, duplicated for AVX2 parallelism. */
	ALIGNED_AVX2
	static const uint32_t IV_x2[] = {
		0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
		0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
		0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
		0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
	};

	/* Initial state, duplicated for AVX2 parallelism, with a
	   personalization block for output 32 bytes. The constant
	   0x01010020 is 0x01010000 ^ 32, i.e. the same value that
	   blake2s_init() XORs into h[0] for out_len = 32. */
	ALIGNED_AVX2
	static const uint32_t hinit_out32_x2[] = {
		0x6A09E667 ^ 0x01010020, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
		0x6A09E667 ^ 0x01010020, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
		0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
		0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
	};

	__m256i xh0, xh1, xv0, xv1, xv2, xv3;
	__m256i xm0, xm1, xm2, xm3, xn0, xn1, xn2, xn3;
	__m256i xt0, xt1, xt2, xt3, xt4, xt5, xt6, xt7, xt8, xt9;
	__m256i xror8, xror16, xca2;

	/* Byte-shuffle pattern that rotates each 32-bit word right by
	   8 bits (used in G4 where the scalar code rotates by 8). */
	xror8 = _mm256_setr_epi8(
		1, 2, 3, 0, 5, 6, 7, 4,
		9, 10, 11, 8, 13, 14, 15, 12,
		1, 2, 3, 0, 5, 6, 7, 4,
		9, 10, 11, 8, 13, 14, 15, 12);
	/* Byte-shuffle pattern that rotates each 32-bit word right by
	   16 bits. */
	xror16 = _mm256_setr_epi8(
		2, 3, 0, 1, 6, 7, 4, 5,
		10, 11, 8, 9, 14, 15, 12, 13,
		2, 3, 0, 1, 6, 7, 4, 5,
		10, 11, 8, 9, 14, 15, 12, 13);
	/* Per-iteration counter increment: +2 on the second 32-bit word
	   of each 128-bit lane of xm0 (see end of the loop below). */
	xca2 = _mm256_setr_epi32(0, 2, 0, 0, 0, 2, 0, 0);

	/* Initialize state. */
	xh0 = _mm256_loadu_si256((const void *)(hinit_out32_x2 + 0));
	xh1 = _mm256_loadu_si256((const void *)(hinit_out32_x2 + 8));

	/* Load data and move it into the proper order for the first round:
	     xm0:  0  2  4  6
	     xm1:  1  3  5  7
	     xm2:  8 10 12 14
	     xm3:  9 11 13 15 */
	xm0 = _mm256_loadu_si256((const void *)(data_x2 +  0));
	xm1 = _mm256_loadu_si256((const void *)(data_x2 + 32));
	xm2 = _mm256_loadu_si256((const void *)(data_x2 + 64));
	xm3 = _mm256_loadu_si256((const void *)(data_x2 + 96));

	xn0 = _mm256_shuffle_epi32(xm0, 0xD8);
	xn1 = _mm256_shuffle_epi32(xm1, 0xD8);
	xm0 = _mm256_unpacklo_epi64(xn0, xn1);
	xm1 = _mm256_unpackhi_epi64(xn0, xn1);

	xn2 = _mm256_shuffle_epi32(xm2, 0xD8);
	xn3 = _mm256_shuffle_epi32(xm3, 0xD8);
	xm2 = _mm256_unpacklo_epi64(xn2, xn3);
	xm3 = _mm256_unpackhi_epi64(xn2, xn3);

	for (;;) {
		/* Each loop iteration computes two BLAKE2s in parallel,
		   in the low and high lanes, respectively. */

		/* Initialize round state. The XOR into xv3 injects the
		   byte counter (data_len) into the first two words of
		   each lane, and inverts the third word (0xFFFFFFFF),
		   which is what the scalar process_block() does when
		   called with t = data_len and a non-zero 'f' flag. */
		xv0 = xh0;
		xv1 = xh1;
		xv2 = _mm256_loadu_si256((const void *)(IV_x2 + 0));
		xv3 = _mm256_loadu_si256((const void *)(IV_x2 + 8));
		xv3 = _mm256_xor_si256(xv3,
			_mm256_setr_epi64x(
				data_len, 0xFFFFFFFF,
				data_len, 0xFFFFFFFF));

		/* G4 applies the G mixing function to four columns at
		   once (per lane); xx and xy hold the message words
		   injected in the first and second half of G. The
		   rotations by 16 and 8 use byte shuffles; the rotations
		   by 12 and 7 use a shift pair. */
#define G4(xx, xy)   do { \
		__m256i xtg; \
		xv0 = _mm256_add_epi32(xv0, _mm256_add_epi32(xv1, xx)); \
		xv3 = _mm256_shuffle_epi8(_mm256_xor_si256(xv0, xv3), xror16); \
		xv2 = _mm256_add_epi32(xv2, xv3); \
		xtg = _mm256_xor_si256(xv1, xv2); \
		xv1 = _mm256_or_si256( \
			_mm256_srli_epi32(xtg, 12), \
			_mm256_slli_epi32(xtg, 20)); \
		xv0 = _mm256_add_epi32(xv0, _mm256_add_epi32(xv1, xy)); \
		xv3 = _mm256_shuffle_epi8(_mm256_xor_si256(xv0, xv3), xror8); \
		xv2 = _mm256_add_epi32(xv2, xv3); \
		xtg = _mm256_xor_si256(xv1, xv2); \
		xv1 = _mm256_or_si256( \
			_mm256_srli_epi32(xtg, 7), \
			_mm256_slli_epi32(xtg, 25)); \
	} while (0)

		/* ROUND performs one full round: a column step, a
		   rotation of rows xv1..xv3 so that diagonals become
		   columns, a second G4 step, and the inverse rotation. */
#define ROUND(i0, i1, i2, i3)   do { \
		G4(i0, i1); \
		xv1 = _mm256_shuffle_epi32(xv1, 0x39); \
		xv2 = _mm256_shuffle_epi32(xv2, 0x4E); \
		xv3 = _mm256_shuffle_epi32(xv3, 0x93); \
		G4(i2, i3); \
		xv1 = _mm256_shuffle_epi32(xv1, 0x93); \
		xv2 = _mm256_shuffle_epi32(xv2, 0x4E); \
		xv3 = _mm256_shuffle_epi32(xv3, 0x39); \
	} while (0)

		/* For rounds 1 to 9, the message words for the round are
		   obtained by permuting the words used in the previous
		   round; the xm* and xn* register sets are used
		   alternately as source and destination. */

		/* round 0 */
		ROUND(xm0, xm1, xm2, xm3);

		/* round 1 */
		xt0 = _mm256_shuffle_epi32(xm0, 0x00);
		xt1 = _mm256_shuffle_epi32(xm0, 0xC8);
		xt2 = _mm256_shuffle_epi32(xm1, 0x70);
		xt3 = _mm256_shuffle_epi32(xm1, 0x80);
		xt4 = _mm256_shuffle_epi32(xm2, 0x01);
		xt5 = _mm256_shuffle_epi32(xm2, 0x02);
		xt6 = _mm256_shuffle_epi32(xm2, 0x03);
		xt7 = _mm256_shuffle_epi32(xm3, 0x80);
		xt8 = _mm256_shuffle_epi32(xm3, 0x10);
		xt9 = _mm256_shuffle_epi32(xm3, 0x30);
		xn0 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt6, xt1, 0x22),
			xt7, 0xCC);
		xn1 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt4, xt9, 0x44),
			xt1, 0x88);
		xn2 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt3, xt0, 0x22),
			xt8, 0x44);
		xn3 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt5, xm0, 0x22),
			xt2, 0xCC);
		ROUND(xn0, xn1, xn2, xn3);

		/* round 2 */
		xt0 = _mm256_shuffle_epi32(xn0, 0x40);
		xt1 = _mm256_shuffle_epi32(xn0, 0x80);
		xt2 = _mm256_shuffle_epi32(xn1, 0x80);
		xt3 = _mm256_shuffle_epi32(xn1, 0x0D);
		xt4 = _mm256_shuffle_epi32(xn2, 0x04);
		xt5 = _mm256_shuffle_epi32(xn2, 0x32);
		xt6 = _mm256_shuffle_epi32(xn3, 0x10);
		xt7 = _mm256_shuffle_epi32(xn3, 0x2C);
		xm0 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt5, xt6, 0x22),
			xt2, 0x88);
		xm1 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt3, xt4, 0x22),
			_mm256_blend_epi32(xt6, xn0, 0x88), 0xCC);
		xm2 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt2, xt7, 0x66),
			xt1, 0x88);
		xm3 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt0, xt3, 0x22),
			xt4, 0x44);
		ROUND(xm0, xm1, xm2, xm3);

		/* round 3 */
		xt0 = _mm256_shuffle_epi32(xm0, 0x10);
		xt1 = _mm256_shuffle_epi32(xm0, 0xC8);
		xt2 = _mm256_shuffle_epi32(xm1, 0x10);
		xt3 = _mm256_shuffle_epi32(xm1, 0x32);
		xt4 = _mm256_shuffle_epi32(xm2, 0x03);
		xt5 = _mm256_shuffle_epi32(xm2, 0x06);
		xt6 = _mm256_shuffle_epi32(xm3, 0x39);
		xn0 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt5, xt3, 0x44),
			xt0, 0x88);
		xn1 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt4, xt6, 0xAA),
			xt0, 0x44);
		xn2 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt3, xt1, 0xAA),
			xt6, 0x44);
		xn3 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt6, xt4, 0x22),
			xt2, 0xCC);
		ROUND(xn0, xn1, xn2, xn3);

		/* round 4 */
		xt0 = _mm256_shuffle_epi32(xn0, 0x80);
		xt1 = _mm256_shuffle_epi32(xn0, 0x4C);
		xt2 = _mm256_shuffle_epi32(xn1, 0x09);
		xt3 = _mm256_shuffle_epi32(xn1, 0x03);
		xt4 = _mm256_shuffle_epi32(xn2, 0x04);
		xt5 = _mm256_shuffle_epi32(xn3, 0x40);
		xt6 = _mm256_shuffle_epi32(xn3, 0x32);
		xm0 = _mm256_blend_epi32(
			_mm256_blend_epi32(xn1, xt4, 0x66),
			xt5, 0x88);
		xm1 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt6, xt0, 0x22),
			xn2, 0xCC);
		xm2 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt3, xt1, 0xAA),
			xt5, 0x44);
		xm3 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt2, xt6, 0x44),
			xt0, 0x88);
		ROUND(xm0, xm1, xm2, xm3);

		/* round 5 */
		xt0 = _mm256_shuffle_epi32(xm0, 0x04);
		xt1 = _mm256_shuffle_epi32(xm0, 0x0E);
		xt2 = _mm256_shuffle_epi32(xm1, 0x04);
		xt3 = _mm256_shuffle_epi32(xm1, 0x32);
		xt4 = _mm256_shuffle_epi32(xm2, 0x08);
		xt5 = _mm256_shuffle_epi32(xm2, 0xD0);
		xt6 = _mm256_shuffle_epi32(xm3, 0x01);
		xt7 = _mm256_shuffle_epi32(xm3, 0x83);
		xn0 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt1, xt4, 0x22),
			_mm256_blend_epi32(xt2, xt7, 0x88), 0xCC);
		xn1 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt6, xt1, 0x22),
			xt5, 0xCC);
		xn2 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt3, xt2, 0x22),
			xt6, 0x88);
		xn3 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt7, xt0, 0xAA),
			xt4, 0x44);
		ROUND(xn0, xn1, xn2, xn3);

		/* round 6 */
		xt0 = _mm256_shuffle_epi32(xn0, 0xC6);
		xt1 = _mm256_shuffle_epi32(xn1, 0x40);
		xt2 = _mm256_shuffle_epi32(xn1, 0x8C);
		xt3 = _mm256_shuffle_epi32(xn2, 0x09);
		xt4 = _mm256_shuffle_epi32(xn2, 0x0C);
		xt5 = _mm256_shuffle_epi32(xn3, 0x01);
		xt6 = _mm256_shuffle_epi32(xn3, 0x30);
		xm0 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt1, xt4, 0xAA),
			xn3, 0x44);
		xm1 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt5, xt3, 0x22),
			xt1, 0x88);
		xm2 = _mm256_blend_epi32(xt0, xt6, 0x44);
		xm3 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt3, xt2, 0xAA),
			xt0, 0x44);
		ROUND(xm0, xm1, xm2, xm3);

		/* round 7 */
		xt0 = _mm256_shuffle_epi32(xm0, 0x0C);
		xt1 = _mm256_shuffle_epi32(xm0, 0x18);
		xt2 = _mm256_shuffle_epi32(xm1, 0xC2);
		xt3 = _mm256_shuffle_epi32(xm2, 0x10);
		xt4 = _mm256_shuffle_epi32(xm2, 0xB0);
		xt5 = _mm256_shuffle_epi32(xm3, 0x40);
		xt6 = _mm256_shuffle_epi32(xm3, 0x83);
		xn0 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt2, xt5, 0xAA),
			xt0, 0x44);
		xn1 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt6, xt1, 0x66),
			xt4, 0x88);
		xn2 = _mm256_blend_epi32(
			_mm256_blend_epi32(xm1, xt4, 0x44),
			xt6, 0x88);
		xn3 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt3, xt0, 0x22),
			xt2, 0x88);
		ROUND(xn0, xn1, xn2, xn3);

		/* round 8 */
		xt0 = _mm256_shuffle_epi32(xn0, 0x02);
		xt1 = _mm256_shuffle_epi32(xn0, 0x34);
		xt2 = _mm256_shuffle_epi32(xn1, 0x0C);
		xt3 = _mm256_shuffle_epi32(xn2, 0x03);
		xt4 = _mm256_shuffle_epi32(xn2, 0x81);
		xt5 = _mm256_shuffle_epi32(xn3, 0x02);
		xt6 = _mm256_shuffle_epi32(xn3, 0xD0);
		xm0 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt5, xn1, 0x22),
			xt2, 0x44);
		xm1 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt4, xt2, 0x22),
			xt1, 0x44);
		xm2 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt0, xn1, 0x44),
			xt6, 0x88);
		xm3 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt3, xt1, 0x22),
			xt6, 0x44);
		ROUND(xm0, xm1, xm2, xm3);

		/* round 9 */
		xt0 = _mm256_shuffle_epi32(xm0, 0xC6);
		xt1 = _mm256_shuffle_epi32(xm1, 0x2C);
		xt2 = _mm256_shuffle_epi32(xm2, 0x40);
		xt3 = _mm256_shuffle_epi32(xm2, 0x83);
		xt4 = _mm256_shuffle_epi32(xm3, 0xD8);
		xn0 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt3, xt1, 0x22),
			xt4, 0x44);
		xn1 = _mm256_blend_epi32(xt4, xt0, 0x44);
		xn2 = _mm256_blend_epi32(
			_mm256_blend_epi32(xm1, xt1, 0x44),
			xt2, 0x88);
		xn3 = _mm256_blend_epi32(xt0, xt2, 0x44);
		ROUND(xn0, xn1, xn2, xn3);
#undef G4
#undef ROUND

		/* Finalize computation and store output. The output must
		   be deinterlaced since output blocks are supposed to
		   be consecutive: the two low lanes form the 32-byte
		   output of the first instance, the two high lanes the
		   output of the second instance. */
		xt0 = _mm256_xor_si256(xh0, _mm256_xor_si256(xv0, xv2));
		xt1 = _mm256_xor_si256(xh1, _mm256_xor_si256(xv1, xv3));
		xt2 = _mm256_permute2x128_si256(xt0, xt1, 0x20);
		xt3 = _mm256_permute2x128_si256(xt0, xt1, 0x31);
		_mm256_storeu_si256((void *)(dst +  0), xt2);
		_mm256_storeu_si256((void *)(dst + 32), xt3);

		dst += 64;
		dst_len -= 64;
		if (dst_len == 0) {
			break;
		}

		/* Put back message words in initial order (i.e. undo the
		   round 9 permutation, so that xm0..xm3 are again laid
		   out as expected by round 0). */
		xt0 = _mm256_shuffle_epi32(xn0, 0x01);
		xt1 = _mm256_shuffle_epi32(xn0, 0x83);
		xt2 = _mm256_shuffle_epi32(xn1, 0x10);
		xt3 = _mm256_shuffle_epi32(xn1, 0xB0);
		xt4 = _mm256_shuffle_epi32(xn2, 0x39);
		xt5 = _mm256_shuffle_epi32(xn3, 0x63);
		xm0 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt5, xt2, 0x66),
			xt3, 0x88);
		xm1 = _mm256_blend_epi32(
			_mm256_blend_epi32(xt1, xt4, 0x22),
			xt3, 0x44);
		xm2 = _mm256_blend_epi32(xt0, xt5, 0xCC);
		xm3 = _mm256_blend_epi32(xt4, xt5, 0x22);

		/* Increment block counter in the message.
		   Nominally, the counter is 64 bits, but we only
		   increment the low 32 bits; the caller is responsible
		   for setting the high half and calling us with values
		   that won't overflow. The increment is 2 because each
		   iteration consumes two consecutive counter values
		   (one per lane). */
		xm0 = _mm256_add_epi32(xm0, xca2);
	}
}
675 | 
676 | #else
677 | 
678 | static void
679 | process_block(uint32_t *h, const uint8_t *data, uint64_t t, int f)
680 | {
681 | 	uint32_t v[16], m[16];
682 | 	int i;
683 | 
684 | 	memcpy(v, h, 8 * sizeof(uint32_t));
685 | 	memcpy(v + 8, IV, sizeof IV);
686 | 	v[12] ^= (uint32_t)t;
687 | 	v[13] ^= (uint32_t)(t >> 32);
688 | 	if (f) {
689 | 		v[14] = ~v[14];
690 | 	}
691 | 
692 | #if BLAKE2_LE
693 | 	memcpy(m, data, sizeof m);
694 | #else
695 | 	for (i = 0; i < 16; i ++) {
696 | 		m[i] = dec32le(data + (i << 2));
697 | 	}
698 | #endif
699 | 
700 | #define ROR(x, n)   (((x) << (32 - (n))) | ((x) >> (n)))
701 | 
702 | #define G(a, b, c, d, x, y)   do { \
703 | 		v[a] += v[b] + (x); \
704 | 		v[d] = ROR(v[d] ^ v[a], 16); \
705 | 		v[c] += v[d]; \
706 | 		v[b] = ROR(v[b] ^ v[c], 12); \
707 | 		v[a] += v[b] + (y); \
708 | 		v[d] = ROR(v[d] ^ v[a], 8); \
709 | 		v[c] += v[d]; \
710 | 		v[b] = ROR(v[b] ^ v[c], 7); \
711 | 	} while (0)
712 | 
713 | #define ROUND(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF) \
714 | 	do { \
715 | 		G(0, 4,  8, 12, m[s0], m[s1]); \
716 | 		G(1, 5,  9, 13, m[s2], m[s3]); \
717 | 		G(2, 6, 10, 14, m[s4], m[s5]); \
718 | 		G(3, 7, 11, 15, m[s6], m[s7]); \
719 | 		G(0, 5, 10, 15, m[s8], m[s9]); \
720 | 		G(1, 6, 11, 12, m[sA], m[sB]); \
721 | 		G(2, 7,  8, 13, m[sC], m[sD]); \
722 | 		G(3, 4,  9, 14, m[sE], m[sF]); \
723 | 	} while (0)
724 | 
725 | 	ROUND( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
726 | 	ROUND(14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3);
727 | 	ROUND(11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4);
728 | 	ROUND( 7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8);
729 | 	ROUND( 9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13);
730 | 	ROUND( 2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9);
731 | 	ROUND(12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11);
732 | 	ROUND(13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10);
733 | 	ROUND( 6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5);
734 | 	ROUND(10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0);
735 | 
736 | #undef ROR
737 | #undef G
738 | #undef ROUND
739 | 
740 | 	for (i = 0; i < 8; i ++) {
741 | 		h[i] ^= v[i] ^ v[i + 8];
742 | 	}
743 | }
744 | 
745 | #endif
746 | 
747 | /*
748 |  * State rules:
749 |  *
750 |  *   buf    buffered data
751 |  *   h      current state
752 |  *   ctr    number of bytes injected so far
753 |  *
754 |  * Initially, ctr == 0 and h contains the XOR of IV and parameter block;
755 |  * buf[] is empty. For any ctr > 0, buf[] is non-empty; it might contain
756 |  * a full block worth of data (processing of the block is delayed until
757 |  * we know whether this is the final block or not).
758 |  *
759 |  * If a key is injected, then it counts as a first full block.
760 |  */
761 | 
762 | /* see blake2.h */
763 | void
764 | blake2s_init(blake2s_context *bc, size_t out_len)
765 | {
766 | 	memcpy(bc->h, IV, sizeof bc->h);
767 | 	bc->h[0] ^= 0x01010000 ^ (uint32_t)out_len;
768 | 	bc->ctr = 0;
769 | 	bc->out_len = out_len;
770 | }
771 | 
772 | /* see blake2.h */
773 | void
774 | blake2s_init_key(blake2s_context *bc, size_t out_len,
775 | 	const void *key, size_t key_len)
776 | {
777 | 	blake2s_init(bc, out_len);
778 | 	if (key_len > 0) {
779 | 		bc->h[0] ^= (uint32_t)key_len << 8;
780 | 		memcpy(bc->buf, key, key_len);
781 | 		memset(bc->buf + key_len, 0, (sizeof bc->buf) - key_len);
782 | 		bc->ctr = sizeof bc->buf;
783 | 	}
784 | }
785 | 
786 | /* see blake2.h */
787 | void
788 | blake2s_update(blake2s_context *bc, const void *data, size_t len)
789 | {
790 | 	uint64_t ctr;
791 | 	size_t p;
792 | 
793 | 	/* Special case: if no input data, return immediately. */
794 | 	if (len == 0) {
795 | 		return;
796 | 	}
797 | 
798 | 	ctr = bc->ctr;
799 | 
800 | 	/* First complete the current block, if not already full. */
801 | 	p = (size_t)ctr & ((sizeof bc->buf) - 1);
802 | 	if (ctr == 0 || p != 0) {
803 | 		/* buffer is not full */
804 | 		size_t clen;
805 | 
806 | 		clen = sizeof bc->buf - p;
807 | 		if (clen >= len) {
808 | 			memcpy(bc->buf + p, data, len);
809 | 			bc->ctr = ctr + len;
810 | 			return;
811 | 		}
812 | 		memcpy(bc->buf + p, data, clen);
813 | 		ctr += clen;
814 | 		data = (const uint8_t *)data + clen;
815 | 		len -= clen;
816 | 	}
817 | 
818 | 	/* Process the buffered block. */
819 | 	process_block(bc->h, bc->buf, ctr, 0);
820 | 
821 | 	/* Process all subsequent full blocks, except the last. */
822 | 	while (len > sizeof bc->buf) {
823 | 		ctr += sizeof bc->buf;
824 | 		process_block(bc->h, data, ctr, 0);
825 | 		data = (const uint8_t *)data + sizeof bc->buf;
826 | 		len -= sizeof bc->buf;
827 | 	}
828 | 
829 | 	/* Copy the last block (possibly partial) into the buffer. */
830 | 	memcpy(bc->buf, data, len);
831 | 	bc->ctr = ctr + len;
832 | }
833 | 
834 | /* see blake2.h */
835 | void
836 | blake2s_final(blake2s_context *bc, void *dst)
837 | {
838 | #if !BLAKE2_LE
839 | 	int i;
840 | 	uint8_t tmp[32];
841 | #endif
842 | 	size_t p;
843 | 
844 | 	/* Pad the current block with zeros, if not full. If the
845 | 	   buffer is empty (no key, no data) then fill it with zeros
846 | 	   as well. */
847 | 	p = (size_t)bc->ctr & ((sizeof bc->buf) - 1);
848 | 	if (bc->ctr == 0 || p != 0) {
849 | 		memset(bc->buf + p, 0, (sizeof bc->buf) - p);
850 | 	}
851 | 
852 | 	process_block(bc->h, bc->buf, bc->ctr, 1);
853 | #if BLAKE2_LE
854 | 	memcpy(dst, bc->h, bc->out_len);
855 | #else
856 | 	for (i = 0; i < 8; i ++) {
857 | 		enc32le(tmp + (i << 2), bc->h[i]);
858 | 	}
859 | 	memcpy(dst, tmp, bc->out_len);
860 | #endif
861 | }
862 | 
863 | /* see blake2.h */
864 | void
865 | blake2s(void *dst, size_t dst_len, const void *key, size_t key_len,
866 | 	const void *src, size_t src_len)
867 | {
868 | 	blake2s_context bc;
869 | 
870 | 	blake2s_init_key(&bc, dst_len, key, key_len);
871 | 	blake2s_update(&bc, src, src_len);
872 | 	blake2s_final(&bc, dst);
873 | }
874 | 
875 | /* see blake2.h */
876 | void
877 | blake2s_expand(void *dst, size_t dst_len,
878 | 	const void *seed, size_t seed_len, uint64_t label)
879 | {
880 | 	uint32_t h[8];
881 | 	uint8_t buf[64];
882 | 	size_t in_len;
883 | 	uint64_t num;
884 | 
885 | 	in_len = 16 + seed_len;
886 | 	enc64le(buf, label);
887 | 	memset(buf + 8, 0, 8);
888 | 	memcpy(buf + 16, seed, seed_len);
889 | 	memset(buf + in_len, 0, (sizeof buf) - in_len);
890 | 	num = 0;
891 | #if BLAKE2_AVX2
892 | 	if (dst_len >= 64) {
893 | 		uint8_t buf_x2[128];
894 | 
895 | 		memcpy(buf_x2 +   0, buf +  0, 16);
896 | 		memcpy(buf_x2 +  16, buf +  0, 16);
897 | 		memcpy(buf_x2 +  32, buf + 16, 16);
898 | 		memcpy(buf_x2 +  48, buf + 16, 16);
899 | 		memcpy(buf_x2 +  64, buf + 32, 16);
900 | 		memcpy(buf_x2 +  80, buf + 32, 16);
901 | 		memcpy(buf_x2 +  96, buf + 48, 16);
902 | 		memcpy(buf_x2 + 112, buf + 48, 16);
903 | 		buf_x2[24] = 1;
904 | 		while (dst_len >= 64) {
905 | 			/* We compute all full pairs of blocks, but
906 | 			   only at most 2^31 pairs at a time, since
907 | 			   expand_inner_x2() cannot propagate block
908 | 			   counter carries. */
909 | 			uint64_t tnum;
910 | 			size_t tlen;
911 | 
912 | 			tnum = (uint64_t)dst_len >> 6;
913 | 			if (tnum >= 0x80000000) {
914 | 				tnum = 0x80000000;
915 | 			}
916 | 			enc32le(buf_x2 + 12, (uint32_t)(num >> 32));
917 | 			enc32le(buf_x2 + 28, (uint32_t)(num >> 32));
918 | 			tlen = (size_t)tnum << 6;
919 | 			expand_inner_x2(dst, tlen, buf_x2, in_len);
920 | 			dst = (uint8_t *)dst + tlen;
921 | 			dst_len -= tlen;
922 | 			num += tnum << 1;
923 | 		}
924 | 	}
925 | #endif
926 | 	while (dst_len > 0) {
927 | 		size_t clen;
928 | #if !BLAKE2_LE
929 | 		uint8_t tmp[32];
930 | 		int i;
931 | #endif
932 | 
933 | 		memcpy(h, IV, sizeof h);
934 | 		h[0] ^= 0x01010000 ^ (sizeof h);
935 | 		enc64le(buf + 8, num ++);
936 | 		process_block(h, buf, in_len, 1);
937 | 		clen = dst_len < (sizeof h) ? dst_len : (sizeof h);
938 | #if BLAKE2_LE
939 | 		memcpy(dst, h, clen);
940 | #else
941 | 		for (i = 0; i < 8; i ++) {
942 | 			enc32le(tmp + (i << 2), h[i]);
943 | 		}
944 | 		memcpy(dst, tmp, clen);
945 | #endif
946 | 		dst_len -= clen;
947 | 		dst = (uint8_t *)dst + clen;
948 | 	}
949 | }
950 | 


--------------------------------------------------------------------------------
/src/codec.c:
--------------------------------------------------------------------------------
  1 | #include "inner.h"
  2 | 
  3 | /* see inner.h */
  4 | const uint8_t bat_max_fg_bits[] = {
  5 | 	0, /* unused */
  6 | 	6,
  7 | 	6,
  8 | 	6,
  9 | 	6,
 10 | 	6,
 11 | 	5,
 12 | 	5,
 13 | 	4,
 14 | 	4,
 15 | 	4
 16 | };
 17 | 
 18 | /* see inner.h */
 19 | const uint8_t bat_max_FG_bits[] = {
 20 | 	0, /* unused */
 21 | 	6,
 22 | 	6,
 23 | 	6,
 24 | 	6,
 25 | 	6,
 26 | 	6,
 27 | 	6,
 28 | 	6,
 29 | 	6,
 30 | 	6
 31 | };
 32 | 
 33 | /* see inner.h */
 34 | const uint8_t bat_max_w_bits[] = {
 35 | 	0, /* unused */
 36 | 	17,
 37 | 	17,
 38 | 	17,
 39 | 	17,
 40 | 	17,
 41 | 	17,
 42 | 	17,
 43 | 	17,
 44 | 	17,
 45 | 	17
 46 | };
 47 | 
 48 | /* see inner.h */
 49 | size_t
 50 | bat_trim_i32_encode(
 51 | 	void *out, size_t max_out_len,
 52 | 	const int32_t *x, unsigned logn, unsigned bits)
 53 | {
 54 | 	size_t n, u, out_len;
 55 | 	uint8_t *buf;
 56 | 	uint32_t acc, mask;
 57 | 	unsigned acc_len;
 58 | 
 59 | 	n = (size_t)1 << logn;
 60 | 	out_len = ((n * bits) + 7) >> 3;
 61 | 	if (out == NULL) {
 62 | 		return out_len;
 63 | 	}
 64 | 	if (out_len > max_out_len) {
 65 | 		return 0;
 66 | 	}
 67 | 	buf = out;
 68 | 	acc = 0;
 69 | 	acc_len = 0;
 70 | 	mask = ((uint32_t)1 << bits) - 1;
 71 | 	for (u = 0; u < n; u ++) {
 72 | 		acc = (acc << bits) | ((uint32_t)x[u] & mask);
 73 | 		acc_len += bits;
 74 | 		while (acc_len >= 8) {
 75 | 			acc_len -= 8;
 76 | 			*buf ++ = (uint8_t)(acc >> acc_len);
 77 | 		}
 78 | 	}
 79 | 	if (acc_len > 0) {
 80 | 		*buf ++ = (uint8_t)(acc << (8 - acc_len));
 81 | 	}
 82 | 	return out_len;
 83 | }
 84 | 
 85 | /* see inner.h */
 86 | size_t
 87 | bat_trim_i32_decode(
 88 | 	int32_t *x, unsigned logn, unsigned bits,
 89 | 	const void *in, size_t max_in_len)
 90 | {
 91 | 	size_t n, in_len;
 92 | 	const uint8_t *buf;
 93 | 	size_t u;
 94 | 	uint32_t acc, mask1, mask2;
 95 | 	unsigned acc_len;
 96 | 	uint32_t r;
 97 | 
 98 | 	n = (size_t)1 << logn;
 99 | 	in_len = ((n * bits) + 7) >> 3;
100 | 	if (in_len > max_in_len) {
101 | 		return 0;
102 | 	}
103 | 	buf = in;
104 | 	u = 0;
105 | 	acc = 0;
106 | 	acc_len = 0;
107 | 	mask1 = ((uint32_t)1 << bits) - 1;
108 | 	mask2 = (uint32_t)1 << (bits - 1);
109 | 	r = (uint32_t)-1;
110 | 	while (u < n) {
111 | 		acc = (acc << 8) | *buf ++;
112 | 		acc_len += 8;
113 | 		while (acc_len >= bits && u < n) {
114 | 			uint32_t w, q;
115 | 
116 | 			acc_len -= bits;
117 | 			w = (acc >> acc_len) & mask1;
118 | 			w |= -(w & mask2);
119 | 			x[u ++] = *(int32_t *)&w;
120 | 
121 | 			/* Value w == -mask2 is forbidden. */
122 | 			q = w + mask2;
123 | 			r &= q | -q;
124 | 		}
125 | 	}
126 | 
127 | 	/* Extra bits in the last byte must be zero. */
128 | 	acc &= (((uint32_t)1 << acc_len) - 1);
129 | 	r &= ~(acc | -acc);
130 | 
131 | 	return in_len & -(size_t)(r >> 31);
132 | }
133 | 
/* see inner.h */
/*
 * Pack the 2^logn values x[] into 'out', keeping only the low 'bits'
 * bits of each value, in big-endian bit order. Returns the encoded
 * length in bytes; if 'out' is NULL, only that length is computed;
 * if the length exceeds max_out_len, 0 is returned and nothing is
 * written. Unused bits of the last byte are set to zero.
 */
size_t
bat_trim_i8_encode(
	void *out, size_t max_out_len,
	const int8_t *x, unsigned logn, unsigned bits)
{
	const int8_t *end;
	uint8_t *dst;
	size_t count, need;
	uint32_t window, keep;
	unsigned pending;

	count = (size_t)1 << logn;
	need = (count * bits + 7) >> 3;
	if (out == NULL) {
		return need;
	}
	if (max_out_len < need) {
		return 0;
	}

	dst = out;
	keep = ((uint32_t)1 << bits) - 1;
	window = 0;
	pending = 0;
	for (end = x + count; x != end; x ++) {
		/* Append the truncated value to the bit window, then
		   flush every complete byte. */
		window = (window << bits) | ((uint32_t)(uint8_t)*x & keep);
		for (pending += bits; pending >= 8; ) {
			pending -= 8;
			*dst ++ = (uint8_t)(window >> pending);
		}
	}

	/* Leftover bits go into the top of a final, zero-padded byte. */
	if (pending != 0) {
		*dst = (uint8_t)(window << (8 - pending));
	}
	return need;
}
170 | 
/* see inner.h */
/*
 * Unpack 2^logn signed values of 'bits' bits each (big-endian bit
 * order) from 'in' into x[]. Returns the number of bytes read, or 0
 * if the input is too short, if a value equals -2^(bits-1), or if
 * the unused bits of the last byte are not all zero. The validity
 * checks are accumulated in a mask, without data-dependent branches.
 */
size_t
bat_trim_i8_decode(
	int8_t *x, unsigned logn, unsigned bits,
	const void *in, size_t max_in_len)
{
	const uint8_t *src;
	size_t count, need, k;
	uint32_t window, low_mask, sign_bit, ok;
	unsigned avail;

	count = (size_t)1 << logn;
	need = (count * bits + 7) >> 3;
	if (need > max_in_len) {
		return 0;
	}

	src = in;
	low_mask = ((uint32_t)1 << bits) - 1;
	sign_bit = (uint32_t)1 << (bits - 1);
	window = 0;
	avail = 0;
	ok = (uint32_t)-1;
	for (k = 0; k < count; k ++) {
		uint32_t v, z;

		/* Refill the bit window until it holds a full value. */
		while (avail < bits) {
			window = (window << 8) | *src ++;
			avail += 8;
		}

		/* Extract the next value and sign-extend it. */
		avail -= bits;
		v = (window >> avail) & low_mask;
		v |= -(v & sign_bit);
		x[k] = (int8_t)*(int32_t *)&v;

		/* The value -sign_bit is forbidden: the top bit of
		   (z | -z) is set if and only if z is non-zero. */
		z = v + sign_bit;
		ok &= z | -z;
	}

	/* Bits left over in the last byte must all be zero. */
	window &= ((uint32_t)1 << avail) - 1;
	ok &= ~(window | -window);

	return need & -(size_t)(ok >> 31);
}
219 | 


--------------------------------------------------------------------------------
/src/fnr.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Fixed-point division.
 3 |  */
 4 | 
 5 | #include "inner.h"
 6 | 
 7 | /* see inner.h */
 8 | uint64_t
 9 | bat_fnr_div(uint64_t x, uint64_t y)
10 | {
11 | 	uint64_t sx, sy, q, b, num;
12 | 	int i;
13 | 
14 | 	/*
15 | 	 * Get absolute values and signs. From now on, we can suppose
16 | 	 * that x and y fit on 63 bits (we ignore edge conditions).
17 | 	 */
18 | 	sx = x >> 63;
19 | 	x = (x ^ -sx) + sx;
20 | 	sy = y >> 63;
21 | 	y = (y ^ -sy) + sy;
22 | 
23 | 	/*
24 | 	 * Do a bit by bit division, assuming that the quotient fits.
25 | 	 * The numerator starts at x*2^31, and is shifted one bit a time.
26 | 	 */
27 | 	q = 0;
28 | 	num = x >> 31;
29 | 	for (i = 63; i >= 0; i --) {
30 | 		b = 1 - ((num - y) >> 63);
31 | 		q |= b << i;
32 | 		num -= y & -b;
33 | 		num <<= 1;
34 | 		if (i >= 33) {
35 | 			num |= (x >> (i - 33)) & 1;
36 | 		}
37 | 	}
38 | 
39 | 	/*
40 | 	 * Rounding: if the remainder is at least y/2 (scaled), we add
41 | 	 * 2^(-32) to the quotient.
42 | 	 */
43 | 	b = 1 - ((num - y) >> 63);
44 | 	q += b;
45 | 
46 | 	/*
47 | 	 * Sign management: if the original x and y had different signs,
48 | 	 * then we must negate the quotient.
49 | 	 */
50 | 	sx ^= sy;
51 | 	q = (q ^ -sx) + sx;
52 | 
53 | 	return q;
54 | }
55 | 


--------------------------------------------------------------------------------
/src/inner.h:
--------------------------------------------------------------------------------
   1 | #ifndef BAT_INNER_H__
   2 | #define BAT_INNER_H__
   3 | 
   4 | /*
   5 |  * Internal functions for BAT.
   6 |  */
   7 | 
   8 | /* ====================================================================== */
   9 | 
  10 | #include <stdint.h>
  11 | #include <stdlib.h>
  12 | #include <string.h>
  13 | 
  14 | #include "blake2.h"
  15 | 
/*
 * Optional AVX2 support, selected by defining BAT_AVX2 to a non-zero
 * value before this header is processed.
 */
#if defined BAT_AVX2 && BAT_AVX2
/*
 * This implementation uses AVX2 intrinsics.
 */
#include <immintrin.h>
/*
 * With AVX2 enabled, BAT_LE and BAT_UNALIGNED default to 1 (an explicit
 * earlier definition is kept).
 */
#ifndef BAT_LE
#define BAT_LE   1
#endif
#ifndef BAT_UNALIGNED
#define BAT_UNALIGNED   1
#endif
/*
 * TARGET_AVX2 and ALIGNED_AVX2 expand to attributes only when __GNUC__
 * is defined; for MSVC a warning is disabled instead and the macros get
 * their empty fallback definitions below.
 */
#if defined __GNUC__
#define TARGET_AVX2    __attribute__((target("avx2")))
#define ALIGNED_AVX2   __attribute__((aligned(32)))
#elif defined _MSC_VER && _MSC_VER
#pragma warning( disable : 4752 )
#endif
#endif

/*
 * Fallbacks: when not set above, both macros expand to nothing.
 */
#ifndef TARGET_AVX2
#define TARGET_AVX2
#endif
#ifndef ALIGNED_AVX2
#define ALIGNED_AVX2
#endif
  41 | 
/*
 * Disable warning on applying unary minus on an unsigned type.
 *
 * NOTE(review): that description covers only one of the four warnings
 * disabled below; the others are presumably MSVC's implicit-conversion
 * and shift-width warnings -- confirm against the MSVC warning reference.
 */
#if defined _MSC_VER && _MSC_VER
#pragma warning( disable : 4146 )
#pragma warning( disable : 4244 )
#pragma warning( disable : 4267 )
#pragma warning( disable : 4334 )
#endif
  51 | 
/*
 * Auto-detect 64-bit architectures.
 *
 * BAT_64 is set to 1 when at least one of the predefined macros listed
 * below is present, and to 0 otherwise. A definition of BAT_64 made
 * before this point takes precedence over the detection.
 */
#ifndef BAT_64
#if defined __x86_64__ || defined _M_X64 \
	|| defined __ia64 || defined __itanium__ || defined _M_IA64 \
	|| defined __powerpc64__ || defined __ppc64__ || defined __PPC64__ \
	|| defined __64BIT__ || defined _LP64 || defined __LP64__ \
	|| defined __sparc64__ \
	|| defined __aarch64__ || defined _M_ARM64 \
	|| defined __mips64
#define BAT_64   1
#else
#define BAT_64   0
#endif
#endif
  68 | 
  69 | /*
  70 |  * Auto-detect endianness and support of unaligned accesses.
  71 |  */
  72 | #if defined __i386__ || defined _M_IX86 \
  73 | 	|| defined __x86_64__ || defined _M_X64 \
  74 | 	|| (defined _ARCH_PWR8 \
  75 | 		&& (defined __LITTLE_ENDIAN || defined __LITTLE_ENDIAN__))
  76 | 
  77 | #ifndef BAT_LE
  78 | #define BAT_LE   1
  79 | #endif
  80 | #ifndef BAT_UNALIGNED
  81 | #define BAT_UNALIGNED   1
  82 | #endif
  83 | 
  84 | #elif (defined __LITTLE_ENDIAN && __LITTLE_ENDIAN__) \
  85 | 	|| (defined __BYTE_ORDER__ && defined __ORDER_LITTLE_ENDIAN__ \
  86 | 		&& __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
  87 | 
  88 | #ifndef BAT_LE
  89 | #define BAT_LE   1
  90 | #endif
  91 | #ifndef BAT_UNALIGNED
  92 | #define BAT_UNALIGNED   0
  93 | #endif
  94 | 
  95 | #else
  96 | 
  97 | #ifndef BAT_LE
  98 | #define BAT_LE   0
  99 | #endif
 100 | #ifndef BAT_UNALIGNED
 101 | #define BAT_UNALIGNED   0
 102 | #endif
 103 | 
 104 | #endif
 105 | 
/*
 * For seed generation:
 *
 *  - On Linux (glibc-2.25+), FreeBSD 12+ and OpenBSD, use getentropy().
 *  - On other Unix-like systems, use /dev/urandom (also a fallback for
 *    failed getentropy() calls).
 *  - On Windows, use CryptGenRandom().
 *
 * Each BAT_RAND_* macro below is set to 0 or 1 independently of the
 * others, and only if it was not already defined; several of them can
 * be 1 on the same platform (e.g. OpenBSD appears in both of the first
 * two lists).
 */

/* getentropy() availability. */
#ifndef BAT_RAND_GETENTROPY
#if (defined __linux && defined __GLIBC__ \
	&& (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25))) \
	|| (defined __FreeBSD__ && __FreeBSD__ >= 12) \
	|| defined __OpenBSD__
#define BAT_RAND_GETENTROPY   1
#else
#define BAT_RAND_GETENTROPY   0
#endif
#endif

/* /dev/urandom availability (Unix-like systems). */
#ifndef BAT_RAND_URANDOM
#if defined _AIX \
	|| defined __ANDROID__ \
	|| defined __FreeBSD__ \
	|| defined __NetBSD__ \
	|| defined __OpenBSD__ \
	|| defined __DragonFly__ \
	|| defined __linux__ \
	|| (defined __sun && (defined __SVR4 || defined __svr4__)) \
	|| (defined __APPLE__ && defined __MACH__)
#define BAT_RAND_URANDOM   1
#else
#define BAT_RAND_URANDOM   0
#endif
#endif

/* Windows random source. */
#ifndef BAT_RAND_WIN32
#if defined _WIN32 || defined _WIN64
#define BAT_RAND_WIN32   1
#else
#define BAT_RAND_WIN32   0
#endif
#endif
 149 | 
 150 | /*
 151 |  * Ensure all macros are defined, to avoid warnings with -Wundef.
 152 |  */
 153 | #ifndef BAT_AVX2
 154 | #define BAT_AVX2   0
 155 | #endif
 156 | 
 157 | /*
 * MSVC 2015 does not know the C99 keyword 'restrict'.
 159 |  */
 160 | #if defined _MSC_VER && _MSC_VER
 161 | #ifndef restrict
 162 | #define restrict   __restrict
 163 | #endif
 164 | #endif
 165 | 
 166 | /* ====================================================================== */
 167 | /*
 168 |  * Fixed-point numbers.
 169 |  *
 170 |  * For FFT and other computations with approximations, we use a fixed-point
 171 |  * format over 64 bits; the top 32 bits are the integral part, and the low
 172 |  * 32 bits are the fractional part.
 173 |  */
 174 | 
 175 | /*
 176 |  * We wrap the type into a struct in order to detect any attempt at using
 177 |  * arithmetic operators on values directly. Since all functions are inline,
 178 |  * the compiler will be able to remove the wrapper, which will then have
 179 |  * no runtime cost.
 180 |  */
typedef struct {
	/*
	 * Raw fixed-point value: integral part in the top 32 bits,
	 * fractional part in the low 32 bits.
	 */
	uint64_t v;
} fnr;
 184 | 
 185 | static inline fnr
 186 | fnr_of(int32_t j)
 187 | {
 188 | 	fnr x;
 189 | 
 190 | 	x.v = (uint64_t)j << 32;
 191 | 	return x;
 192 | }
 193 | 
 194 | static inline fnr
 195 | fnr_of_scaled32(uint64_t t)
 196 | {
 197 | 	fnr x;
 198 | 
 199 | 	x.v = t;
 200 | 	return x;
 201 | }
 202 | 
 203 | static inline fnr
 204 | fnr_add(fnr x, fnr y)
 205 | {
 206 | 	x.v += y.v;
 207 | 	return x;
 208 | }
 209 | 
 210 | static inline fnr
 211 | fnr_sub(fnr x, fnr y)
 212 | {
 213 | 	x.v -= y.v;
 214 | 	return x;
 215 | }
 216 | 
 217 | static inline fnr
 218 | fnr_double(fnr x)
 219 | {
 220 | 	x.v <<= 1;
 221 | 	return x;
 222 | }
 223 | 
 224 | static inline fnr
 225 | fnr_neg(fnr x)
 226 | {
 227 | 	x.v = (uint64_t)0 - x.v;
 228 | 	return x;
 229 | }
 230 | 
 231 | static inline fnr
 232 | fnr_abs(fnr x)
 233 | {
 234 | 	x.v -= (x.v << 1) & -(uint64_t)(x.v >> 63);
 235 | 	return x;
 236 | }
 237 | 
 238 | static inline fnr
 239 | fnr_mul(fnr x, fnr y)
 240 | {
 241 | #if defined __GNUC__ && defined __x86_64__
 242 | 	__int128 z;
 243 | 
 244 | 	z = (__int128)*(int64_t *)&x.v * (__int128)*(int64_t *)&y.v;
 245 | 	x.v = (uint64_t)(z >> 32);
 246 | 	return x;
 247 | #else
 248 | 	int32_t xh, yh;
 249 | 	uint32_t xl, yl;
 250 | 	uint64_t z0, z1, z2, z3;
 251 | 
 252 | 	xl = (uint32_t)x.v;
 253 | 	yl = (uint32_t)y.v;
 254 | 	xh = (int32_t)(*(int64_t *)&x.v >> 32);
 255 | 	yh = (int32_t)(*(int64_t *)&y.v >> 32);
 256 | 	z0 = ((uint64_t)xl * (uint64_t)yl + 0x80000000ul) >> 32;
 257 | 	z1 = (uint64_t)((int64_t)xl * (int64_t)yh);
 258 | 	z2 = (uint64_t)((int64_t)yl * (int64_t)xh);
 259 | 	z3 = (uint64_t)((int64_t)xh * (int64_t)yh) << 32;
 260 | 	x.v = z0 + z1 + z2 + z3;
 261 | 	return x;
 262 | #endif
 263 | }
 264 | 
 265 | static inline fnr
 266 | fnr_sqr(fnr x)
 267 | {
 268 | #if defined __GNUC__ && defined __x86_64__
 269 | 	int64_t t;
 270 | 	__int128 z;
 271 | 
 272 | 	t = *(int64_t *)&x.v;
 273 | 	z = (__int128)t * (__int128)t;
 274 | 	x.v = (uint64_t)(z >> 32);
 275 | 	return x;
 276 | #else
 277 | 	int32_t xh;
 278 | 	uint32_t xl;
 279 | 	uint64_t z0, z1, z3;
 280 | 
 281 | 	xl = (uint32_t)x.v;
 282 | 	xh = (int32_t)(*(int64_t *)&x.v >> 32);
 283 | 	z0 = ((uint64_t)xl * (uint64_t)xl + 0x80000000ul) >> 32;
 284 | 	z1 = (uint64_t)((int64_t)xl * (int64_t)xh);
 285 | 	z3 = (uint64_t)((int64_t)xh * (int64_t)xh) << 32;
 286 | 	x.v = z0 + (z1 << 1) + z3;
 287 | 	return x;
 288 | #endif
 289 | }
 290 | 
 291 | static inline int32_t
 292 | fnr_round(fnr x)
 293 | {
 294 | 	x.v += 0x80000000ul;
 295 | 	return (int32_t)(*(int64_t *)&x.v >> 32);
 296 | }
 297 | 
 298 | static inline fnr
 299 | fnr_div_2e(fnr x, unsigned n)
 300 | {
 301 | 	int64_t v;
 302 | 
 303 | 	v = *(int64_t *)&x.v;
 304 | 	x.v = (uint64_t)((v + (((int64_t)1 << n) >> 1)) >> n);
 305 | 	return x;
 306 | }
 307 | 
 308 | static inline fnr
 309 | fnr_mul_2e(fnr x, unsigned n)
 310 | {
 311 | 	x.v <<= n;
 312 | 	return x;
 313 | }
 314 | 
 315 | uint64_t bat_fnr_div(uint64_t x, uint64_t y);
 316 | 
 317 | static inline fnr
 318 | fnr_inv(fnr x)
 319 | {
 320 | 	x.v = bat_fnr_div((uint64_t)1 << 32, x.v);
 321 | 	return x;
 322 | }
 323 | 
 324 | static inline fnr
 325 | fnr_div(fnr x, fnr y)
 326 | {
 327 | 	x.v = bat_fnr_div(x.v, y.v);
 328 | 	return x;
 329 | }
 330 | 
 331 | static inline int
 332 | fnr_lt(fnr x, fnr y)
 333 | {
 334 | 	return *(int64_t *)&x.v < *(int64_t *)&y.v;
 335 | }
 336 | 
/* The fixed-point value 0. */
static const fnr fnr_zero = { 0 };
/* Square root of 2, rounded: 6074001000 / 2^32 = 1.41421356... */
static const fnr fnr_sqrt2 = { 6074001000ull };
 339 | 
 340 | /* ====================================================================== */
 341 | /*
 342 |  * Apply FFT on a vector.
 343 |  */
 344 | void bat_FFT(fnr *f, unsigned logn);
 345 | 
 346 | /*
 347 |  * Apply inverse FFT on a vector.
 348 |  */
 349 | void bat_iFFT(fnr *f, unsigned logn);
 350 | 
 351 | /*
 352 |  * Add polynomial b to polynomial a (works in FFT and non-FFT). The two
 353 |  * polynomial arrays must be distinct.
 354 |  */
 355 | void bat_poly_add(fnr *restrict a, const fnr *restrict b, unsigned logn);
 356 | 
 357 | /*
 358 |  * Subtract polynomial b from polynomial a (works in FFT and non-FFT). The two
 359 |  * polynomial arrays must be distinct.
 360 |  */
 361 | void bat_poly_sub(fnr *restrict a, const fnr *restrict b, unsigned logn);
 362 | 
 363 | /*
 364 |  * Negate polynomial a (works in FFT and non-FFT).
 365 |  */
 366 | void bat_poly_neg(fnr *a, unsigned logn);
 367 | 
 368 | /*
 369 |  * Multiply polynomial a by constant c.
 370 |  */
 371 | void bat_poly_mulconst(fnr *a, fnr c, unsigned logn);
 372 | 
 373 | /*
 374 |  * Multiply polynomial a by polynomial b (FFT representation only). The two
 375 |  * polynomial arrays must be distinct.
 376 |  */
 377 | void bat_poly_mul_fft(fnr *restrict a, const fnr *restrict b, unsigned logn);
 378 | 
 379 | /*
 380 |  * Compute the adjoint of a polynomial in FFT representation.
 381 |  */
 382 | void bat_poly_adj_fft(fnr *a, unsigned logn);
 383 | 
 384 | /*
 385 |  * Scale a polynomial down by a factor 2^e.
 386 |  */
 387 | void bat_poly_div_2e(fnr *a, unsigned e, unsigned logn);
 388 | 
 389 | /*
 390 |  * Multiply polynomial a by polynomial b (FFT representation only). The two
 391 |  * polynomial arrays must be distinct. The polynomial b must be auto-adjoint,
 392 |  * i.e. all its coefficients in FFT representation are real numbers (the
 393 |  * polynomial has half-length; the imaginary values of the coefficients,
 394 |  * assumed to be zero and located in the second half, are not accessed).
 395 |  */
 396 | void bat_poly_mul_autoadj_fft(fnr *restrict a,
 397 | 	const fnr *restrict b, unsigned logn);
 398 | 
 399 | /*
 400 |  * Divide polynomial a by polynomial b (FFT representation only). The two
 401 |  * polynomial arrays must be distinct. The polynomial b must be auto-adjoint,
 402 |  * i.e. all its coefficients in FFT representation are real numbers (the
 403 |  * polynomial has half-length; the imaginary values of the coefficients,
 404 |  * assumed to be zero and located in the second half, are not accessed).
 405 |  */
 406 | void bat_poly_div_autoadj_fft(fnr *restrict a,
 407 | 	const fnr *restrict b, unsigned logn);
 408 | 
 409 | /*
 410 |  * Compute (2^e)/(a*adj(a)+b*adj(b)) into d[]. Polynomials are in FFT
 * representation. d[] is a half-size polynomial because the imaginary
 * parts of all its FFT coefficients are zero (they are not set by this
 * function). Parameter e can be 0.
 414 |  */
 415 | void bat_poly_invnorm_fft(fnr *restrict d,
 416 | 	const fnr *restrict a, const fnr *restrict b,
 417 | 	unsigned e, unsigned logn);
 418 | 
 419 | /* ====================================================================== */
 420 | 
 421 | /*
 422 |  * Max size in bits for elements of (f,g), indexed by log(N). Size includes
 423 |  * the sign bit.
 424 |  */
 425 | extern const uint8_t bat_max_fg_bits[];
 426 | 
 427 | /*
 428 |  * Max size in bits for elements of (F,G), indexed by log(N). Size includes
 429 |  * the sign bit.
 430 |  */
 431 | extern const uint8_t bat_max_FG_bits[];
 432 | 
 433 | /*
 434 |  * Max size in bits for elements of w, indexed by log(N). Size includes
 435 |  * the sign bit.
 436 |  */
 437 | extern const uint8_t bat_max_w_bits[];
 438 | 
 439 | /* ====================================================================== */
 440 | 
 441 | /*
 442 |  * Key pair generation, first step: given a seed, candidate polynomials
 443 |  * f and g are generated. The following properties are checked:
 444 |  *  - All coefficients of f and g are within the expected bounds.
 445 |  *  - Res(f, x^n+1) == 1 mod 2.
 446 |  *  - Res(g, x^n+1) == 1 mod 2.
 447 |  *  - The (f,g) vector has an acceptable norm, both in normal and in
 448 |  *    orthogonalized representations.
 449 |  *  - f is invertible modulo x^n+1 modulo q.
 450 |  * If any of these properties is not met, then a failure is reported
 451 |  * (returned value is 0) and the contents of f[] and g[] are indeterminate.
 452 |  * Otherwise, success (1) is returned.
 453 |  *
 454 |  * If h != NULL, then the public key h = g/f mod x^n+1 mod q is returned
 455 |  * in that array. Note that h is always internally computed, regardless
 456 |  * of whether h == NULL or not.
 457 |  *
 458 |  * Size of tmp[]: 6*n elements (24*n bytes).
 459 |  * tmp[] MUST be 64-bit aligned.
 460 |  *
 461 |  * The seed length MUST NOT exceed 48 bytes.
 462 |  */
 463 | int bat_keygen_make_fg(int8_t *f, int8_t *g, uint16_t *h,
 464 | 	uint32_t q, unsigned logn,
 465 | 	const void *seed, size_t seed_len, uint32_t *tmp);
 466 | 
 467 | /*
 468 |  * Given polynomials f and g, solve the NTRU equation for F and G. This
 469 |  * may fail if there is no solution, or if some intermediate value exceeds
 470 |  * an internal heuristic threshold. Returned value is 1 on success, 0
 471 |  * on failure. On failure, contents of F and G are indeterminate.
 472 |  *
 473 |  * Size of tmp[]: 6*n elements (24*n bytes).
 474 |  * tmp[] MUST be 64-bit aligned.
 475 |  */
 476 | int bat_keygen_solve_FG(int8_t *F, int8_t *G,
 477 | 	const int8_t *f, const int8_t *g,
 478 | 	uint32_t q, unsigned logn, uint32_t *tmp);
 479 | 
 480 | /*
 481 |  * Given polynomials f, g and F, rebuild the polynomial G that completes
 482 |  * the NTRU equation g*F - f*G = q. Returned value is 1 on success, 0 on
 483 |  * failure. A failure is reported if the rebuilt solution has
 484 |  * coefficients outside of the expected maximum range, or f is not
 485 |  * invertible modulo x^n+1 modulo q. This function does NOT fully verify
 486 |  * that f, g, F, G is a solution to the NTRU equation.
 487 |  *
 488 |  * Size of tmp[]: n elements (4*n bytes).
 489 |  */
 490 | int bat_keygen_rebuild_G(int8_t *G,
 491 | 	const int8_t *f, const int8_t *g, const int8_t *F,
 492 | 	uint32_t q, unsigned logn, uint32_t *tmp);
 493 | 
 494 | /*
 495 |  * Verify that the given f, g, F, G fulfill the NTRU equation g*F - f*G = q.
 496 |  * Returned value is 1 on success, 0 on error.
 497 |  *
 498 |  * This function may be called when decoding a private key of unsure
 499 |  * provenance. It is implicitly called by bat_keygen_solve_FG().
 500 |  *
 501 |  * Size of tmp[]: 4*n elements (16*n bytes).
 502 |  */
 503 | int bat_keygen_verify_FG(
 504 | 	const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
 505 | 	uint32_t q, unsigned logn, uint32_t *tmp);
 506 | 
 507 | /*
 508 |  * Compute the w vector. Returned value is 1 on success, 0 on error. An
 509 |  * error is reported if the w vector has coefficients that do not fit
 510 |  * in signed 16-bit integer, or if the norm of (gamma*F_d, G_d) exceeds
 511 |  * the prescribed limit.
 512 |  *
 513 |  * Size of tmp[]: 6*n elements (24*n bytes).
 514 |  * tmp[] MUST be 64-bit aligned.
 515 |  */
 516 | int bat_keygen_compute_w(int32_t *w,
 517 | 	const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
 518 | 	uint32_t q, unsigned logn, uint32_t *tmp);
 519 | 
 520 | /*
 521 |  * Compute the public key h = g/f. Returned value is 1 on success, 0 on
 522 |  * error. An error is reported if f is not invertible modulo X^n+1.
 523 |  * This function is for q = 128 and 1 <= logn <= 8.
 524 |  * CAUTION: for q = 128, public key is in an array of uint8_t, not uint16_t.
 525 |  *
 526 |  * Size of tmp[]: 3*n/4 elements (3*n bytes).
 527 |  */
 528 | int bat_make_public_128(uint8_t *h, const int8_t *f, const int8_t *g,
 529 | 	unsigned logn, uint32_t *tmp);
 530 | 
 531 | /*
 532 |  * Compute the public key h = g/f. Returned value is 1 on success, 0 on
 533 |  * error. An error is reported if f is not invertible modulo X^n+1.
 534 |  * This function is for q = 257 and 1 <= logn <= 9.
 535 |  *
 536 |  * Size of tmp[]: n elements (4*n bytes).
 537 |  */
 538 | int bat_make_public_257(uint16_t *h, const int8_t *f, const int8_t *g,
 539 | 	unsigned logn, uint32_t *tmp);
 540 | 
 541 | /*
 542 |  * Compute the public key h = g/f. Returned value is 1 on success, 0 on
 543 |  * error. An error is reported if f is not invertible modulo X^n+1.
 544 |  * This function is for q = 769 and 1 <= logn <= 10.
 545 |  *
 546 |  * Size of tmp[]: n elements (4*n bytes).
 547 |  */
 548 | int bat_make_public_769(uint16_t *h, const int8_t *f, const int8_t *g,
 549 | 	unsigned logn, uint32_t *tmp);
 550 | 
 551 | /*
 552 |  * Given f, g and F, rebuild G, for the case q = 128. This function
 553 |  * reports a failure if (q,logn) are not supported parameters, if f is
 554 |  * not invertible modulo x^n+1 and modulo q, or if the rebuilt value G
 555 |  * has coefficients that exceed the expected maximum size.
 556 |  *
 557 |  * This function does NOT check that the returned G matches the NTRU
 558 |  * equation.
 559 |  *
 560 |  * Size of tmp[]: 3*n/4 elements (3*n bytes).
 561 |  */
 562 | int bat_rebuild_G_128(int8_t *G,
 563 | 	const int8_t *f, const int8_t *g, const int8_t *F,
 564 | 	unsigned logn, uint32_t *tmp);
 565 | 
 566 | /*
 567 |  * Given f, g and F, rebuild G, for the case q = 257. This function
 568 |  * reports a failure if (q,logn) are not supported parameters, if f is
 569 |  * not invertible modulo x^n+1 and modulo q, or if the rebuilt value G
 570 |  * has coefficients that exceed the expected maximum size.
 571 |  *
 572 |  * This function does NOT check that the returned G matches the NTRU
 573 |  * equation.
 574 |  *
 575 |  * Size of tmp[]: n elements (4*n bytes).
 576 |  */
 577 | int bat_rebuild_G_257(int8_t *G,
 578 | 	const int8_t *f, const int8_t *g, const int8_t *F,
 579 | 	unsigned logn, uint32_t *tmp);
 580 | 
 581 | /*
 582 |  * Given f, g and F, rebuild G, for the case q = 769. This function
 583 |  * reports a failure if (q,logn) are not supported parameters, if f is
 584 |  * not invertible modulo x^n+1 and modulo q, or if the rebuilt value G
 585 |  * has coefficients that exceed the expected maximum size.
 586 |  *
 587 |  * This function does NOT check that the returned G matches the NTRU
 588 |  * equation.
 589 |  *
 590 |  * Size of tmp[]: n elements (4*n bytes).
 591 |  */
 592 | int bat_rebuild_G_769(int8_t *G,
 593 | 	const int8_t *f, const int8_t *g, const int8_t *F,
 594 | 	unsigned logn, uint32_t *tmp);
 595 | 
 596 | /* ====================================================================== */
 597 | 
 598 | /*
 599 |  * Get the length of sbuf, for a given degree n, with n = 2^logn.
 600 |  * The logn parameter must be between 1 and 10, inclusive. Returned length
 601 |  * is in bytes, between 1 and 128, inclusive.
 602 |  */
 603 | #define SBUF_LEN(logn)   (((1 << (logn)) + 7) >> 3)
 604 | 
 605 | /*
 606 |  * Encrypt: given public key (in h) and secret polynomial s (in sbuf[]),
 607 |  * produce ciphertext c1 (in c).
 608 |  *
 609 |  * This function is for q = 128, with logn = 1 to 8. Ciphertext elements
 610 |  * are in the -31..+32 range. The function cannot fail, hence it always
 611 |  * returns 1.
 612 |  * CAUTION: for q = 128, public key is in an array of uint8_t, not uint16_t.
 613 |  *
 614 |  * Size of tmp[]: 3*n/4 elements (3*n bytes)
 615 |  */
 616 | uint32_t bat_encrypt_128(int8_t *c, const uint8_t *sbuf,
 617 | 	const uint8_t *h, unsigned logn, uint32_t *tmp);
 618 | 
 619 | /*
 620 |  * Encrypt: given public key (in h) and secret polynomial s (in sbuf[]),
 621 |  * produce ciphertext c1 (in c).
 622 |  *
 623 |  * This function is for q = 257, with logn = 1 to 9. Ciphertext elements
 624 |  * are in the -64..+64 range. The function cannot fail, hence it always
 625 |  * returns 1.
 626 |  *
 627 |  * Size of tmp[]: n elements (4*n bytes).
 628 |  */
 629 | uint32_t bat_encrypt_257(int8_t *c, const uint8_t *sbuf,
 630 | 	const uint16_t *h, unsigned logn, uint32_t *tmp);
 631 | 
 632 | /*
 633 |  * Encrypt: given public key (in h) and secret polynomial s (in sbuf[]),
 634 |  * produce ciphertext c1 (in c).
 635 |  *
 636 |  * This function is for q = 769, with logn = 1 to 10. Ciphertext elements
 637 |  * are in the -96..+96 range.
 638 |  *
 639 |  * The function may fail, if the norm of the result is too high, in which
 640 |  * case the caller should start again with a new seed (this is uncommon).
 641 |  * On failure, this function returns 0; on success, it returns 1.
 642 |  *
 643 |  * Size of tmp[]: 3*n/4 elements (3*n bytes).
 644 |  */
 645 | uint32_t bat_encrypt_769(int8_t *c, const uint8_t *sbuf,
 646 | 	const uint16_t *h, unsigned logn, uint32_t *tmp);
 647 | 
 648 | /*
 649 |  * Decrypt: given private key (f,g,F,G,w) and ciphertext c1, extract
 650 |  * secret s. The polynomial s has length n bits (with n = 2^logn); it
 651 |  * is returned in sbuf[] (ceil(n/8) bytes; for toy versions with logn <
 652 |  * 3, the upper bits of the incomplete byte are set to zero).
 653 |  *
 654 |  * This function is for q = 128. Ciphertext elements are in the -31..+32
 655 |  * range.
 656 |  *
 657 |  * Size of tmp[]: 2*n elements (8*n bytes).
 658 |  *
 659 |  * This function never fails; for proper security, the caller must obtain
 660 |  * the message m (using the second ciphertext element c2) and check that
 661 |  * encryption of m would indeed yield exactly ciphertext c1.
 662 |  */
 663 | void bat_decrypt_128(uint8_t *sbuf, const int8_t *c,
 664 | 	const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
 665 | 	const int32_t *w, unsigned logn, uint32_t *tmp);
 666 | 
 667 | /*
 668 |  * Decrypt: given private key (f,g,F,G,w) and ciphertext c1, extract
 669 |  * secret s. The polynomial s has length n bits (with n = 2^logn); it
 670 |  * is returned in sbuf[] (ceil(n/8) bytes; for toy versions with logn <
 671 |  * 3, the upper bits of the incomplete byte are set to zero).
 672 |  *
 673 |  * This function is for q = 257. Ciphertext elements are in the -64..+64
 674 |  * range.
 675 |  *
 676 |  * Size of tmp[]: 2*n elements (8*n bytes).
 677 |  *
 678 |  * This function never fails; for proper security, the caller must obtain
 679 |  * the message m (using the second ciphertext element c2) and check that
 680 |  * encryption of m would indeed yield exactly ciphertext c1.
 681 |  */
 682 | void bat_decrypt_257(uint8_t *sbuf, const int8_t *c,
 683 | 	const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
 684 | 	const int32_t *w, unsigned logn, uint32_t *tmp);
 685 | 
 686 | /*
 687 |  * Decrypt: given private key (f,g,F,G,w) and ciphertext c1, extract
 688 |  * secret s. The polynomial s has length n bits (with n = 2^logn); it
 689 |  * is returned in sbuf[] (ceil(n/8) bytes; for toy versions with logn <
 690 |  * 3, the upper bits of the incomplete byte are set to zero).
 691 |  *
 692 |  * This function is for q = 769. Ciphertext elements are in the -96..+96
 693 |  * range.
 694 |  *
 695 |  * Size of tmp[]: 2*n elements (8*n bytes).
 696 |  *
 697 |  * This function never fails; for proper security, the caller must obtain
 698 |  * the message m (using the second ciphertext element c2) and check that
 699 |  * encryption of m would indeed yield exactly ciphertext c1.
 700 |  */
 701 | void bat_decrypt_769(uint8_t *sbuf, const int8_t *c,
 702 | 	const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
 703 | 	const int32_t *w, unsigned logn, uint32_t *tmp);
 704 | 
 705 | /*
 706 |  * Second phase of decapsulation, performed modulo 769.
 707 |  * Given c', c'', f, F and w, this function computes:
 708 |  *    Fd = q'*F - f*w
 709 |  *    q*q'*Q*s' = Fd*c' - f*c''
 710 |  *
 711 |  * On input, cp[] and cs[] must contain c' and c'', respectively, in
 712 |  * Montgomery representation modulo 769. On output, polynomial q*q'*Q*s'
 713 |  * is returned in cp[], in Montgomery representation modulo 769 (since
 714 |  * coefficients of s' can have only a few specific values, this is enough
 715 |  * to recover s'). cs[] is consumed. tmp[] must have room for 4*n bytes
 716 |  * (n 32-bit elements).
 717 |  *
 718 |  * Size of tmp[]: n elements (4*n bytes).
 719 |  */
 720 | void bat_finish_decapsulate_769(uint16_t *cp, uint16_t *cs,
 721 | 	const int8_t *f, const int8_t *F, const int32_t *w, unsigned logn,
 722 | 	uint32_t *tmp);
 723 | 
 724 | /*
 725 |  * Second phase of decapsulation, performed modulo 257.
 726 |  * Given c', c'', f, F and w, this function computes:
 727 |  *    Fd = q'*F - f*w
 728 |  *    q*q'*Q*s' = Fd*c' - f*c''
 729 |  *
 730 |  * On input, cp[] and cs[] must contain c' and c'', respectively, in
 731 |  * Montgomery representation modulo 257. On output, polynomial q*q'*Q*s'
 732 |  * is returned in cp[], in Montgomery representation modulo 257 (since
 733 |  * coefficients of s' can have only a few specific values, this is enough
 734 |  * to recover s'). cs[] is consumed. tmp[] must have room for 4*n bytes
 735 |  * (n 32-bit elements).
 736 |  *
 737 |  * Size of tmp[]: n elements (4*n bytes).
 738 |  */
 739 | void bat_finish_decapsulate_257(uint16_t *cp, uint16_t *cs,
 740 | 	const int8_t *f, const int8_t *F, const int32_t *w, unsigned logn,
 741 | 	uint32_t *tmp);
 742 | 
 743 | /*
 744 |  * Explicit reduction and conversion to Montgomery representation modulo
 745 |  * 257. This works for inputs x in range 0..4278190336.
 746 |  */
 747 | static inline uint32_t
 748 | m257_tomonty(uint32_t x)
 749 | {
 750 | 	x *= 16711935;
 751 | 	x = (x >> 16) * 257;
 752 | 	return (x >> 16) + 1;
 753 | }
 754 | 
 755 | /*
 756 |  * Explicit reduction and conversion to Montgomery representation modulo
 757 |  * 769. This works for inputs x in range 0..4244636416.
 758 |  */
 759 | static inline uint32_t
 760 | m769_tomonty(uint32_t x)
 761 | {
 762 | 	x *= 452395775;
 763 | 	x = (x >> 16) * 769;
 764 | 	x = (x >> 16) + 1;
 765 | 	x *= 2016233021;
 766 | 	x = (x >> 16) * 769;
 767 | 	return (x >> 16) + 1;
 768 | }
 769 | 
 770 | /* ====================================================================== */
 771 | /*
 772 |  * Computations on polynomials modulo q' = 64513.
 773 |  */
 774 | 
 775 | /*
 776 |  * Compute d = -a*b mod X^n+1 mod q'
 777 |  * Coefficients of source values are plain integers (for value b, they must
 778 |  * be in the -503109..+503109 range). Coefficients of output values are
 779 |  * normalized in -32256..+32256.
 780 |  *
 781 |  * Array d[] may overlap, partially or totally, with a[]; however, it
 782 |  * MUST NOT overlap with b[].
 783 |  *
 784 |  * Size of tmp[]: n/2 elements (2*n bytes).
 785 |  */
 786 | void bat_polyqp_mulneg(int16_t *d, const int16_t *a, const int32_t *b,
 787 | 	unsigned logn, uint32_t *tmp);
 788 | 
 789 | /* ====================================================================== */
 790 | /*
 791 |  * Encoding/decoding functions.
 792 |  */
 793 | 
 794 | #if BAT_LE && BAT_UNALIGNED
 795 | 
 796 | static inline unsigned
 797 | dec16le(const void *src)
 798 | {
 799 | 	return *(const uint16_t *)src;
 800 | }
 801 | 
 802 | static inline void
 803 | enc16le(void *dst, unsigned x)
 804 | {
 805 | 	*(uint16_t *)dst = x;
 806 | }
 807 | 
 808 | static inline uint32_t
 809 | dec32le(const void *src)
 810 | {
 811 | 	return *(const uint32_t *)src;
 812 | }
 813 | 
 814 | static inline void
 815 | enc32le(void *dst, uint32_t x)
 816 | {
 817 | 	*(uint32_t *)dst = x;
 818 | }
 819 | 
 820 | static inline uint64_t
 821 | dec64le(const void *src)
 822 | {
 823 | 	return *(const uint64_t *)src;
 824 | }
 825 | 
 826 | static inline void
 827 | enc64le(void *dst, uint64_t x)
 828 | {
 829 | 	*(uint64_t *)dst = x;
 830 | }
 831 | 
 832 | #else
 833 | 
 834 | static inline unsigned
 835 | dec16le(const void *src)
 836 | {
 837 | 	const uint8_t *buf;
 838 | 
 839 | 	buf = src;
 840 | 	return (unsigned)buf[0]
 841 | 		| ((unsigned)buf[1] << 8);
 842 | }
 843 | 
 844 | static inline void
 845 | enc16le(void *dst, unsigned x)
 846 | {
 847 | 	uint8_t *buf;
 848 | 
 849 | 	buf = dst;
 850 | 	buf[0] = (uint8_t)x;
 851 | 	buf[1] = (uint8_t)(x >> 8);
 852 | }
 853 | 
 854 | static inline uint32_t
 855 | dec32le(const void *src)
 856 | {
 857 | 	const uint8_t *buf;
 858 | 
 859 | 	buf = src;
 860 | 	return (uint32_t)buf[0]
 861 | 		| ((uint32_t)buf[1] << 8)
 862 | 		| ((uint32_t)buf[2] << 16)
 863 | 		| ((uint32_t)buf[3] << 24);
 864 | }
 865 | 
 866 | static inline void
 867 | enc32le(void *dst, uint32_t x)
 868 | {
 869 | 	uint8_t *buf;
 870 | 
 871 | 	buf = dst;
 872 | 	buf[0] = (uint8_t)x;
 873 | 	buf[1] = (uint8_t)(x >> 8);
 874 | 	buf[2] = (uint8_t)(x >> 16);
 875 | 	buf[3] = (uint8_t)(x >> 24);
 876 | }
 877 | 
 878 | static inline uint64_t
 879 | dec64le(const void *src)
 880 | {
 881 | 	const uint8_t *buf;
 882 | 
 883 | 	buf = src;
 884 | 	return (uint64_t)buf[0]
 885 | 		| ((uint64_t)buf[1] << 8)
 886 | 		| ((uint64_t)buf[2] << 16)
 887 | 		| ((uint64_t)buf[3] << 24)
 888 | 		| ((uint64_t)buf[4] << 32)
 889 | 		| ((uint64_t)buf[5] << 40)
 890 | 		| ((uint64_t)buf[6] << 48)
 891 | 		| ((uint64_t)buf[7] << 56);
 892 | }
 893 | 
 894 | static inline void
 895 | enc64le(void *dst, uint64_t x)
 896 | {
 897 | 	uint8_t *buf;
 898 | 
 899 | 	buf = dst;
 900 | 	buf[0] = (uint64_t)x;
 901 | 	buf[1] = (uint64_t)(x >> 8);
 902 | 	buf[2] = (uint64_t)(x >> 16);
 903 | 	buf[3] = (uint64_t)(x >> 24);
 904 | 	buf[4] = (uint64_t)(x >> 32);
 905 | 	buf[5] = (uint64_t)(x >> 40);
 906 | 	buf[6] = (uint64_t)(x >> 48);
 907 | 	buf[7] = (uint64_t)(x >> 56);
 908 | }
 909 | 
 910 | #endif
 911 | 
 912 | static inline uint32_t
 913 | dec24le(const void *src)
 914 | {
 915 | 	const uint8_t *buf;
 916 | 
 917 | 	buf = src;
 918 | 	return (uint32_t)buf[0]
 919 | 		| ((uint32_t)buf[1] << 8)
 920 | 		| ((uint32_t)buf[2] << 16);
 921 | }
 922 | 
 923 | static inline void
 924 | enc24le(void *dst, uint32_t x)
 925 | {
 926 | 	uint8_t *buf;
 927 | 
 928 | 	buf = dst;
 929 | 	buf[0] = (uint8_t)x;
 930 | 	buf[1] = (uint8_t)(x >> 8);
 931 | 	buf[2] = (uint8_t)(x >> 16);
 932 | }
 933 | 
 934 | /*
 935 |  * bat_trim_i32_encode() and bat_trim_i32_decode() encode and decode
 936 |  * polynomials with signed coefficients (int32_t), using the specified
 937 |  * number of bits for each coefficient. The number of bits includes the
 938 |  * sign bit. Each coefficient x must be such that |x| < 2^(bits-1) (the
 939 |  * value -2^(bits-1), though conceptually encodable with two's
 940 |  * complement representation, is forbidden).
 941 |  *
 942 |  * bat_trim_i8_encode() and bat_trim_i8_decode() do the same work for
 943 |  * polynomials whose coefficients are held in slots of type int8_t.
 944 |  *
 945 |  * Encoding API:
 946 |  *
 947 |  *   Output buffer (out[]) has max length max_out_len (in bytes). If
 948 |  *   that length is not large enough, then no encoding occurs and the
 949 |  *   function returns 0; otherwise, the function returns the number of
 950 |  *   bytes which have been written into out[]. If out == NULL, then
 951 |  *   max_out_len is ignored, and no output is produced, but the function
 952 |  *   returns how many bytes it would produce.
 953 |  *
 954 |  *   Encoding functions assume that the input is valid (all values in
 955 |  *   the encodable range).
 956 |  *
 957 |  * Decoding API:
 958 |  *
 959 |  *   Input buffer (in[]) has maximum length max_in_len (in bytes). If
 960 |  *   the input length is not enough for the expected polynomial, then
 961 |  *   no decoding occurs and the function returns 0. Otherwise, the values
 962 |  *   are decoded and the number of processed input bytes is returned.
 963 |  *
 964 |  *   If the input is invalid in some way (a decoded coefficient has
 965 |  *   value -2^(bits-1), or some of the ignored bits in the last byte
 966 |  *   are non-zero), then the function fails and returns 0; the contents
 967 |  *   of the output array are then indeterminate.
 968 |  *
 969 |  * Both encoding and decoding are constant-time with regards to the
 970 |  * values and bits.
 971 |  */
 972 | 
 973 | size_t bat_trim_i32_encode(void *out, size_t max_out_len,
 974 | 	const int32_t *x, unsigned logn, unsigned bits);
 975 | size_t bat_trim_i32_decode(int32_t *x, unsigned logn, unsigned bits,
 976 | 	const void *in, size_t max_in_len);
 977 | size_t bat_trim_i8_encode(void *out, size_t max_out_len,
 978 | 	const int8_t *x, unsigned logn, unsigned bits);
 979 | size_t bat_trim_i8_decode(int8_t *x, unsigned logn, unsigned bits,
 980 | 	const void *in, size_t max_in_len);
 981 | 
 982 | /*
 983 |  * Encode a polynomial with coefficients modulo 128. This is used for
 984 |  * public keys with q = 128.
 985 |  *
 986 |  * If out == NULL, then max_out_len is ignored and the function returns
 987 |  * the size of the output it could produce (in bytes).
 988 |  * If out != NULL, then max_out_len is compared with the expected output
 989 |  * size. If max_out_len is lower, then no output is produced, and the
 990 |  * function returns 0; otherwise, the output is produced and its length
 991 |  * (in bytes) is returned.
 992 |  */
 993 | size_t bat_encode_128(void *out, size_t max_out_len,
 994 | 	const uint8_t *x, unsigned logn);
 995 | 
 996 | /*
 997 |  * Decode a polynomial with coefficients modulo 128. This is used for
 998 |  * public keys with q = 128.
 999 |  *
1000 |  * Input buffer (in[]) has maximum length max_in_len (in bytes). If
1001 |  * the input length is not enough for the expected polynomial, then
1002 |  * no decoding occurs and the function returns 0. Otherwise, the values
1003 |  * are decoded and the number of processed input bytes is returned.
1004 |  *
1005 |  * If the input is invalid in some way (a decoded coefficient is out of
1006 |  * the expected range, or some ignored bit is non-zero), then the
1007 |  * function fails and returns 0; the contents of the output array are
1008 |  * then indeterminate.
1009 |  *
1010 |  * Decoding is constant-time as long as no failure occurs.
1011 |  */
1012 | size_t bat_decode_128(uint8_t *x, unsigned logn,
1013 | 	const void *in, size_t max_in_len);
1014 | 
1015 | /*
1016 |  * Encode a ciphertext polynomial, for q = 128; coefficients are in -31..+32.
1017 |  *
1018 |  * If out == NULL, then max_out_len is ignored and the function returns
1019 |  * the size of the output it could produce (in bytes).
1020 |  * If out != NULL, then max_out_len is compared with the expected output
1021 |  * size. If max_out_len is lower, then no output is produced, and the
1022 |  * function returns 0; otherwise, the output is produced and its length
1023 |  * (in bytes) is returned.
1024 |  */
1025 | size_t bat_encode_ciphertext_128(void *out, size_t max_out_len,
1026 | 	const int8_t *c, unsigned logn);
1027 | /*
1028 |  * Decode a ciphertext polynomial, for q = 128; coefficients are in -31..+32.
1029 |  *
1030 |  * Input buffer (in[]) has maximum length max_in_len (in bytes). If
1031 |  * the input length is not enough for the expected polynomial, then
1032 |  * no decoding occurs and the function returns 0. Otherwise, the values
1033 |  * are decoded and the number of processed input bytes is returned.
1034 |  *
1035 |  * If the input is invalid in some way (a decoded coefficient is out of
1036 |  * the expected range, or some ignored bit is non-zero), then the
1037 |  * function fails and returns 0; the contents of the output array are
1038 |  * then indeterminate.
1039 |  *
1040 |  * Decoding is constant-time with regard to the coefficient values.
1041 |  */
1042 | size_t bat_decode_ciphertext_128(int8_t *c, unsigned logn,
1043 | 	const void *in, size_t max_in_len);
1044 | 
1045 | /*
1046 |  * Encode a polynomial with coefficients modulo 257. This is used for
1047 |  * public keys with q = 257.
1048 |  *
1049 |  * If out == NULL, then max_out_len is ignored and the function returns
1050 |  * the size of the output it could produce (in bytes).
1051 |  * If out != NULL, then max_out_len is compared with the expected output
1052 |  * size. If max_out_len is lower, then no output is produced, and the
1053 |  * function returns 0; otherwise, the output is produced and its length
1054 |  * (in bytes) is returned.
1055 |  */
1056 | size_t bat_encode_257(void *out, size_t max_out_len,
1057 | 	const uint16_t *x, unsigned logn);
1058 | 
1059 | /*
1060 |  * Decode a polynomial with coefficients modulo 257. This is used for
1061 |  * public keys with q = 257.
1062 |  *
1063 |  * Input buffer (in[]) has maximum length max_in_len (in bytes). If
1064 |  * the input length is not enough for the expected polynomial, then
1065 |  * no decoding occurs and the function returns 0. Otherwise, the values
1066 |  * are decoded and the number of processed input bytes is returned.
1067 |  *
1068 |  * If the input is invalid in some way (a decoded coefficient is out of
1069 |  * the expected range, or some ignored bit is non-zero), then the
1070 |  * function fails and returns 0; the contents of the output array are
1071 |  * then indeterminate.
1072 |  *
1073 |  * Decoding is constant-time as long as no failure occurs.
1074 |  */
1075 | size_t bat_decode_257(uint16_t *x, unsigned logn,
1076 | 	const void *in, size_t max_in_len);
1077 | 
1078 | /*
1079 |  * Encode a ciphertext polynomial, for q = 257; coefficients are in -64..+64.
1080 |  *
1081 |  * If out == NULL, then max_out_len is ignored and the function returns
1082 |  * the size of the output it could produce (in bytes).
1083 |  * If out != NULL, then max_out_len is compared with the expected output
1084 |  * size. If max_out_len is lower, then no output is produced, and the
1085 |  * function returns 0; otherwise, the output is produced and its length
1086 |  * (in bytes) is returned.
1087 |  */
1088 | size_t bat_encode_ciphertext_257(void *out, size_t max_out_len,
1089 | 	const int8_t *c, unsigned logn);
1090 | /*
1091 |  * Decode a ciphertext polynomial, for q = 257; coefficients are in -64..+64.
1092 |  *
1093 |  * Input buffer (in[]) has maximum length max_in_len (in bytes). If
1094 |  * the input length is not enough for the expected polynomial, then
1095 |  * no decoding occurs and the function returns 0. Otherwise, the values
1096 |  * are decoded and the number of processed input bytes is returned.
1097 |  *
1098 |  * If the input is invalid in some way (a decoded coefficient is out of
1099 |  * the expected range, or some ignored bit is non-zero), then the
1100 |  * function fails and returns 0; the contents of the output array are
1101 |  * then indeterminate.
1102 |  *
1103 |  * Decoding is constant-time with regard to the coefficient values.
1104 |  */
1105 | size_t bat_decode_ciphertext_257(int8_t *c, unsigned logn,
1106 | 	const void *in, size_t max_in_len);
1107 | 
1108 | /*
1109 |  * Encode a polynomial with coefficients modulo 769. This is used for
1110 |  * public keys with q = 769.
1111 |  *
1112 |  * If out == NULL, then max_out_len is ignored and the function returns
1113 |  * the size of the output it could produce (in bytes).
1114 |  * If out != NULL, then max_out_len is compared with the expected output
1115 |  * size. If max_out_len is lower, then no output is produced, and the
1116 |  * function returns 0; otherwise, the output is produced and its length
1117 |  * (in bytes) is returned.
1118 |  */
1119 | size_t bat_encode_769(void *out, size_t max_out_len,
1120 | 	const uint16_t *x, unsigned logn);
1121 | 
1122 | /*
1123 |  * Decode a polynomial with coefficients modulo 769. This is used for
1124 |  * public keys with q = 769.
1125 |  *
1126 |  * Input buffer (in[]) has maximum length max_in_len (in bytes). If
1127 |  * the input length is not enough for the expected polynomial, then
1128 |  * no decoding occurs and the function returns 0. Otherwise, the values
1129 |  * are decoded and the number of processed input bytes is returned.
1130 |  *
1131 |  * If the input is invalid in some way (a decoded coefficient is out of
1132 |  * the expected range, or some ignored bit is non-zero), then the
1133 |  * function fails and returns 0; the contents of the output array are
1134 |  * then indeterminate.
1135 |  *
1136 |  * Decoding is constant-time with regard to the coefficient values.
1137 |  */
1138 | size_t bat_decode_769(uint16_t *x, unsigned logn,
1139 | 	const void *in, size_t max_in_len);
1140 | 
1141 | /*
1142 |  * Encode a ciphertext polynomial, for q = 769; coefficients are in -96..+96.
1143 |  *
1144 |  * If out == NULL, then max_out_len is ignored and the function returns
1145 |  * the size of the output it could produce (in bytes).
1146 |  * If out != NULL, then max_out_len is compared with the expected output
1147 |  * size. If max_out_len is lower, then no output is produced, and the
1148 |  * function returns 0; otherwise, the output is produced and its length
1149 |  * (in bytes) is returned.
1150 |  */
1151 | size_t bat_encode_ciphertext_769(void *out, size_t max_out_len,
1152 | 	const int8_t *c, unsigned logn);
1153 | /*
1154 |  * Decode a ciphertext polynomial, for q = 769; coefficients are in -96..+96.
1155 |  *
1156 |  * Input buffer (in[]) has maximum length max_in_len (in bytes). If
1157 |  * the input length is not enough for the expected polynomial, then
1158 |  * no decoding occurs and the function returns 0. Otherwise, the values
1159 |  * are decoded and the number of processed input bytes is returned.
1160 |  *
1161 |  * If the input is invalid in some way (a decoded coefficient is out of
1162 |  * the expected range, or some ignored bit is non-zero), then the
1163 |  * function fails and returns 0; the contents of the output array are
1164 |  * then indeterminate.
1165 |  *
1166 |  * Decoding is constant-time with regard to the coefficient values.
1167 |  */
1168 | size_t bat_decode_ciphertext_769(int8_t *c, unsigned logn,
1169 | 	const void *in, size_t max_in_len);
1170 | 
1171 | /* ====================================================================== */
1172 | 
1173 | /*
1174 |  * Obtain a random seed from the system RNG. Maximum allowed seed length
1175 |  * is 2048 bits (256 bytes).
1176 |  *
1177 |  * Returned value is 1 on success, 0 on error.
1178 |  */
1179 | int bat_get_seed(void *seed, size_t len);
1180 | 
1181 | /*
1182 |  * Custom PRNG that outputs 64-bit integers. It is based on BLAKE2s.
1183 |  */
typedef struct {
	/* Buffered output block; refilled and consumed by prng_get_u64(). */
	uint8_t buf[128];
	/* Key written by prng_init(); fed to every blake2s_expand() call. */
	uint8_t key[32];
	/* Counter passed to blake2s_expand(); incremented on each use. */
	uint64_t ctr;
	/* Read offset into buf[]; equal to sizeof buf when buf[] is exhausted. */
	size_t ptr;
} prng_context;
1190 | 
1191 | /*
1192 |  * Initialize the PRNG from the provided seed and an extra 64-bit integer.
1193 |  * The seed length MUST NOT exceed 48 bytes.
1194 |  */
1195 | static inline void
1196 | prng_init(prng_context *p, const void *seed, size_t seed_len, uint64_t label)
1197 | {
1198 | 	blake2s_expand(p->key, sizeof p->key, seed, seed_len, label);
1199 | 	p->ctr = 0;
1200 | 	p->ptr = sizeof p->buf;
1201 | }
1202 | 
1203 | /*
1204 |  * Get a 64-bit integer out of a PRNG.
1205 |  */
1206 | static inline uint64_t
1207 | prng_get_u64(prng_context *p)
1208 | {
1209 | 	uint64_t x;
1210 | 
1211 | 	if (p->ptr == sizeof p->buf) {
1212 | 		blake2s_expand(p->buf, sizeof p->buf,
1213 | 			p->key, sizeof p->key, p->ctr ++);
1214 | 		p->ptr = 0;
1215 | 	}
1216 | 	x = dec64le(p->buf + p->ptr);
1217 | 	p->ptr += 8;
1218 | 	return x;
1219 | }
1220 | 
1221 | /*
1222 |  * Get arbitrary bytes out of a PRNG.
1223 |  */
1224 | static inline void
1225 | prng_get_bytes(prng_context *p, void *dst, size_t len)
1226 | {
1227 | 	blake2s_expand(dst, len, p->key, sizeof p->key, p->ctr ++);
1228 | }
1229 | 
1230 | /* ====================================================================== */
1231 | 
1232 | #endif
1233 | 


--------------------------------------------------------------------------------
/src/kem128.c:
--------------------------------------------------------------------------------
  1 | #include "inner.h"
  2 | 
  3 | /*
  4 |  * We use computations modulo 256 (usually implicitly through the use
  5 |  * of uint8_t as a storage type). When a value is needed modulo 128,
  6 |  * we apply a mask explicitly.
  7 |  *
  8 |  * We can use int8_t* and uint8_t* interchangeably, since the C standard
  9 |  * guarantees two's-complement and compatibility of formats for
 10 |  * exact-width types.
 11 |  *
 12 |  * Note that invertibility modulo 256 is equivalent to invertibility
 13 |  * modulo 128, since this boils down to the parity of the value at the
 14 |  * deepest recursion level (see mq_poly_inv_inner() for details). In BAT,
 15 |  * the keygen makes f with odd parity only (this is required for the
 16 |  * NTRU solving algorithm), thus f is always invertible modulo 128 (and
 17 |  * 256).
 18 |  *
 19 |  * Functions use a logarithm "stride" for access: successive elements of
 20 |  * polynomial a[] are a[0], a[1 << ls], a[2 << ls], ...
 21 |  */
 22 | 
 23 | static void
 24 | mq_poly_add_inner(uint8_t *d,
 25 | 	const uint8_t *a, const uint8_t *b, int ls, unsigned logn)
 26 | {
 27 | 	size_t u, n;
 28 | 
 29 | 	n = (size_t)1 << logn;
 30 | 	for (u = 0; u < n; u ++) {
 31 | 		d[u << ls] = a[u << ls] + b[u << ls];
 32 | 	}
 33 | }
 34 | 
 35 | static void
 36 | mq_poly_sub_inner(uint8_t *d,
 37 | 	const uint8_t *a, const uint8_t *b, int ls, unsigned logn)
 38 | {
 39 | 	size_t u, n;
 40 | 
 41 | 	n = (size_t)1 << logn;
 42 | 	for (u = 0; u < n; u ++) {
 43 | 		d[u << ls] = a[u << ls] - b[u << ls];
 44 | 	}
 45 | }
 46 | 
 47 | static void
 48 | mq_poly_neg_inner(uint8_t *d, const uint8_t *a, int ls, unsigned logn)
 49 | {
 50 | 	size_t u, n;
 51 | 
 52 | 	n = (size_t)1 << logn;
 53 | 	for (u = 0; u < n; u ++) {
 54 | 		d[u << ls] = -a[u << ls];
 55 | 	}
 56 | }
 57 | 
 58 | /*
 59 |  * d <- a + X*b
 60 |  */
 61 | static void
 62 | mq_poly_add_mulX_inner(uint8_t *d,
 63 | 	const uint8_t *a, const uint8_t *b, int ls, unsigned logn)
 64 | {
 65 | 	/*
 66 | 	 * We must take care to perform the loop in a way that does not
 67 | 	 * break when d == b.
 68 | 	 */
 69 | 	size_t u, n;
 70 | 	int t;
 71 | 
 72 | 	n = (size_t)1 << logn;
 73 | 	t = -b[(n - 1) << ls];
 74 | 	for (u = 0; u < n; u ++) {
 75 | 		int tn;
 76 | 
 77 | 		tn = b[u << ls];
 78 | 		d[u << ls] = a[u << ls] + t;
 79 | 		t = tn;
 80 | 	}
 81 | }
 82 | 
 83 | /*
 84 |  * d <- a - X*b
 85 |  */
 86 | static void
 87 | mq_poly_sub_mulX_inner(uint8_t *d,
 88 | 	const uint8_t *a, const uint8_t *b, int ls, unsigned logn)
 89 | {
 90 | 	/*
 91 | 	 * We must take care to perform the loop in a way that does not
 92 | 	 * break when d == b.
 93 | 	 */
 94 | 	size_t u, n;
 95 | 	int t;
 96 | 
 97 | 	n = (size_t)1 << logn;
 98 | 	t = -b[(n - 1) << ls];
 99 | 	for (u = 0; u < n; u ++) {
100 | 		int tn;
101 | 
102 | 		tn = b[u << ls];
103 | 		d[u << ls] = a[u << ls] - t;
104 | 		t = tn;
105 | 	}
106 | }
107 | 
108 | /*
109 |  * For multiplications, we use Karatsuba, with even/odd splits:
110 |  *
111 |  *   a = a_e(X^2) + X*a_o(X^2)
112 |  *   b = b_e(X^2) + X*b_o(X^2)
113 |  *   a*b = (a_e*b_e + X*a_o*b_o)(X^2) + X*(a_e*b_o + a_o*b_e)(X^2)
114 |  *   (a_e*b_o + a_o*b_e) = (a_e + a_o)*(b_e + b_o) - a_e*b_e - a_o*b_o
115 |  *
116 |  * Size of tmp[]: 2*n bytes (with n = top-level degree).
117 |  */
static void
mq_poly_mul_inner(uint8_t *d, const uint8_t *a, const uint8_t *b,
	int ls, unsigned logn, uint8_t *tmp)
{
	const size_t step = (size_t)1 << ls;
	uint8_t *cross, *odd_prod;

	/*
	 * Degrees 2 and 4 are handled with direct schoolbook formulas
	 * modulo X^n+1. All inputs are read before any output is
	 * written, so d[] may overlap a[] and/or b[].
	 */
	if (logn == 1) {
		unsigned a0, a1, b0, b1;

		a0 = a[0];
		a1 = a[step];
		b0 = b[0];
		b1 = b[step];
		d[0] = (uint8_t)(a0 * b0 - a1 * b1);
		d[step] = (uint8_t)(a0 * b1 + a1 * b0);
		return;
	}
	if (logn == 2) {
		unsigned av[4], bv[4];
		size_t k;

		for (k = 0; k < 4; k ++) {
			av[k] = a[k * step];
			bv[k] = b[k * step];
		}
		d[0] = (uint8_t)(av[0] * bv[0] - av[1] * bv[3]
			- av[2] * bv[2] - av[3] * bv[1]);
		d[step] = (uint8_t)(av[0] * bv[1] + av[1] * bv[0]
			- av[2] * bv[3] - av[3] * bv[2]);
		d[2 * step] = (uint8_t)(av[0] * bv[2] + av[1] * bv[1]
			+ av[2] * bv[0] - av[3] * bv[3]);
		d[3 * step] = (uint8_t)(av[0] * bv[3] + av[1] * bv[2]
			+ av[2] * bv[1] + av[3] * bv[0]);
		return;
	}

	/*
	 * Even coefficients (a_e) start at a[0] and odd ones (a_o) at
	 * a[step], both with log-stride ls + 1.
	 *
	 * The two temporaries only use slots with log-stride ls + 1,
	 * starting at offset 'step' in tmp[]: these are the odd-indexed
	 * slots at this level. The even-indexed slots remain available
	 * to the deeper recursion levels, which receive the same tmp[].
	 */
	cross = tmp + step;
	odd_prod = cross + ((size_t)1 << (logn + ls));

	/*
	 * cross    <- a_e + a_o
	 * odd_prod <- b_e + b_o
	 */
	mq_poly_add_inner(cross, a, a + step, ls + 1, logn - 1);
	mq_poly_add_inner(odd_prod, b, b + step, ls + 1, logn - 1);

	/*
	 * cross <- (a_e + a_o)*(b_e + b_o)
	 */
	mq_poly_mul_inner(cross, cross, odd_prod, ls + 1, logn - 1, tmp);

	/*
	 * odd_prod <- a_o*b_o
	 * d_e      <- a_e*b_e
	 * From this point on a[] and b[] are no longer read, so d_e may
	 * be overwritten even when d[] overlaps one or both operands.
	 */
	mq_poly_mul_inner(odd_prod,
		a + step, b + step, ls + 1, logn - 1, tmp);
	mq_poly_mul_inner(d, a, b, ls + 1, logn - 1, tmp);

	/*
	 * d_o <- cross - odd_prod - d_e = a_e*b_o + a_o*b_e
	 */
	mq_poly_sub_inner(cross, cross, odd_prod, ls + 1, logn - 1);
	mq_poly_sub_inner(d + step, cross, d, ls + 1, logn - 1);

	/*
	 * d_e <- d_e + X*odd_prod = a_e*b_e + X*a_o*b_o
	 */
	mq_poly_add_mulX_inner(d, d, odd_prod, ls + 1, logn - 1);
}
201 | 
202 | /*
203 |  * TODO: make a specialized squaring function, which could be faster
204 |  * than the plain multiplication routine.
205 |  */
/*
 * d <- a*a. Currently a plain call to mq_poly_mul_inner() with both
 * operands equal to a[]; ls, logn and tmp[] are passed through unchanged.
 */
static void
mq_poly_sqr_inner(uint8_t *d, const uint8_t *a,
	int ls, unsigned logn, uint8_t *tmp)
{
	mq_poly_mul_inner(d, a, a, ls, logn, tmp);
}
212 | 
213 | /*
214 |  * Polynomial inversion: we split a into even and odd coefficients:
215 |  *    a = a_e(X^2) + X*a_o(X^2)
216 |  * with a_e and a_o being half-degree. We define:
217 |  *    adj(a) = a_e(X^2) - X*a_o(X^2)
218 |  * Then:
219 |  *    a*adj(a) = a_e^2(X^2) - X^2*a_o^2(X^2)
220 |  *             = (a_e^2 - X*a_o^2)(X^2)
221 |  * which is a half-degree polynomial.
222 |  *
223 |  * Thus:
224 |  *    1/a = adj(a)*(1 / (a*adj(a)))
225 |  * so we reduced the problem of inverting a of degree n into inverting
226 |  * a*adj(a) of degree n/2. We just apply the process recursively
227 |  * until we reach degree 1.
228 |  *
229 |  * Note that 1/(a*adj(a)) is really half-degree, so the multiplication
230 |  * by adj(a) can be done with two half-degree multiplications.
231 |  *
232 |  * Size of tmp[]: 2*n bytes (with n = top-level degree).
233 |  *
234 |  * Return value: 1 on success, 0 on error. On error (a[] is not invertible),
235 |  * contents of d[] are unpredictable.
236 |  */
static int
mq_poly_inv_inner(uint8_t *d, const uint8_t *a, int ls,
	unsigned logn, uint8_t *tmp)
{
	const size_t step = (size_t)1 << ls;
	unsigned a0, a1, x, y;
	int ok;

	if (logn != 1) {
		uint8_t *norm, *odd_sq;

		/*
		 * Temporaries use the odd-indexed slots of tmp[] at
		 * this level; deeper levels use the remaining slots.
		 */
		norm = tmp + step;
		odd_sq = norm + ((size_t)1 << (logn + ls));

		/*
		 * norm <- a*adj(a) = a_e^2 - X*a_o^2 (half-degree)
		 */
		mq_poly_sqr_inner(norm, a, ls + 1, logn - 1, tmp);
		mq_poly_sqr_inner(odd_sq, a + step, ls + 1, logn - 1, tmp);
		mq_poly_sub_mulX_inner(norm, norm, odd_sq, ls + 1, logn - 1);

		/*
		 * norm <- 1/(a*adj(a)), by recursion on the half-degree
		 * polynomial.
		 */
		ok = mq_poly_inv_inner(norm, norm, ls + 1, logn - 1, tmp);

		/*
		 * 1/a = adj(a)/(a*adj(a)):
		 *   d_e <- norm*a_e
		 *   d_o <- -norm*a_o
		 */
		mq_poly_mul_inner(d, a, norm, ls + 1, logn - 1, tmp);
		mq_poly_mul_inner(d + step,
			a + step, norm, ls + 1, logn - 1, tmp);
		mq_poly_neg_inner(d + step, d + step, ls + 1, logn - 1);
		return ok;
	}

	/*
	 * Degree 2: working modulo X^2+1,
	 *   1/(a0 + X*a1) = (a0 - X*a1) / (a0^2 - X^2*a1^2)
	 *                 = (a0 - X*a1) / (a0^2 + a1^2)
	 */
	a0 = a[0];
	a1 = a[step];
	x = a0 * a0 + a1 * a1;

	/*
	 * x is invertible modulo 256 if and only if it is odd.
	 */
	ok = (int)(x & 1);

	/*
	 * Inverse of x modulo 256. If x*y = 1 + u*2^k, then:
	 *   x*(y*(2 - x*y)) = (1 + u*2^k)*(1 - u*2^k)
	 *                   = 1 - (u^2)*2^(2*k)
	 *                   = 1 mod 2^(2*k)
	 * An odd x is its own inverse modulo 4 (1*1 = 1, 3*3 = 1 mod 4),
	 * so starting from y = x and applying the rule twice yields an
	 * inverse modulo 16, then modulo 256.
	 */
	y = x;
	y = y * (2 - (x * y));
	y = y * (2 - (x * y));

	d[0] = (uint8_t)(a0 * y);
	d[step] = (uint8_t)((0u - a1) * y);
	return ok;
}
307 | 
308 | /*
 309 |  * Wrappers for the case of ls = 0 (minimal stride: contiguous elements).
310 |  */
311 | 
/*
 * d <- a + b, for polynomials of 2^logn contiguous coefficients
 * (calls mq_poly_add_inner() with log-stride 0).
 */
static inline void
mq_poly_add(uint8_t *d, const uint8_t *a, const uint8_t *b, unsigned logn)
{
	mq_poly_add_inner(d, a, b, 0, logn);
}
317 | 
/*
 * d <- a - b, for polynomials of 2^logn contiguous coefficients
 * (calls mq_poly_sub_inner() with log-stride 0).
 */
static inline void
mq_poly_sub(uint8_t *d, const uint8_t *a, const uint8_t *b, unsigned logn)
{
	mq_poly_sub_inner(d, a, b, 0, logn);
}
323 | 
/*
 * d <- a*b, for polynomials of 2^logn contiguous coefficients
 * (calls mq_poly_mul_inner() with log-stride 0). tmp[] is passed
 * through as the scratch area of mq_poly_mul_inner().
 */
static inline void
mq_poly_mul(uint8_t *d, const uint8_t *a, const uint8_t *b, unsigned logn,
	uint8_t *tmp)
{
	mq_poly_mul_inner(d, a, b, 0, logn, tmp);
}
330 | 
/*
 * d <- 1/a, for polynomials of 2^logn contiguous coefficients
 * (calls mq_poly_inv_inner() with log-stride 0). The return value is
 * the one from mq_poly_inv_inner(): 1 on success, 0 if a[] is not
 * invertible.
 */
static inline int
mq_poly_inv(uint8_t *d, const uint8_t *a, unsigned logn, uint8_t *tmp)
{
	return mq_poly_inv_inner(d, a, 0, logn, tmp);
}
336 | 
337 | /*
338 |  * Multiply polynomial a[] by 1+X+X^2+X^3+...+X^(n-1).
339 |  *
340 |  *   d[0]   = a[0] - a[1] - a[2] - a[3] - ... - a[n - 1]
341 |  *   d[1]   = a[0] + a[1] - a[2] - a[3] - ... - a[n - 1]
342 |  *   d[2]   = a[0] + a[1] + a[2] - a[3] - ... - a[n - 1]
343 |  *     ...
344 |  *   d[n-1] = a[0] + a[1] + a[2] + a[3] + ... + a[n - 1]
345 |  *
346 |  * Thus, d[n - 1] is the sum of all a[i], and:
347 |  *   d[i] = d[i + 1] - 2*a[i + 1]
348 |  * Equivalently:
349 |  *   d[i] = d[i - 1] + 2*a[i]
350 |  * for all i >= 1.
351 |  *
352 |  * This allows efficient computation, in O(n) operations and with no
353 |  * need for extra storage.
354 |  */
static void
mq_poly_mul_ones(uint8_t *d, const uint8_t *a, unsigned logn)
{
	size_t i, count;
	unsigned acc;

	count = (size_t)1 << logn;

	/*
	 * acc <- a[0] - (a[1] + a[2] + ... + a[n - 1]); this is d[0].
	 * All of a[] is read before d[0] is written.
	 */
	acc = a[0];
	i = count;
	while (i > 1) {
		i --;
		acc -= a[i];
	}
	d[0] = (uint8_t)acc;

	/*
	 * d[i] = d[i - 1] + 2*a[i]. a[i] is read before d[i] is written,
	 * so the computation may be done in place (d == a).
	 */
	for (i = 1; i < count; i ++) {
		acc += 2u * a[i];
		d[i] = (uint8_t)acc;
	}
}
372 | 
373 | /*
374 |  * Multiply a polynomial by a constant.
375 |  */
/*
 * d <- c*a (coefficient-wise, modulo 256), over 2^logn contiguous
 * coefficients.
 */
static void
mq_poly_mulconst(uint8_t *d, const uint8_t *a, unsigned c, unsigned logn)
{
	size_t remaining;

	for (remaining = (size_t)1 << logn; remaining > 0; remaining --) {
		unsigned v;

		v = *a ++;
		*d ++ = (uint8_t)(v * c);
	}
}
386 | 
387 | /* see inner.h */
int
bat_make_public_128(uint8_t *h, const int8_t *f, const int8_t *g,
	unsigned logn, uint32_t *tmp)
{
	/*
	 * tmp[] is used as bytes: the first n bytes receive 1/f, and
	 * the rest is the scratch area handed to the inversion and
	 * multiplication routines.
	 */
	const size_t n = (size_t)1 << logn;
	uint8_t *inv_f = (uint8_t *)tmp;
	uint8_t *scratch = inv_f + n;
	size_t i;
	int ok;

	/*
	 * inv_f <- 1/f (modulo 256); ok is 0 if f is not invertible.
	 */
	ok = mq_poly_inv(inv_f, (const uint8_t *)f, logn, scratch);

	/*
	 * h <- g/f, computed even if the inversion failed; the caller
	 * must check the returned status.
	 */
	mq_poly_mul(h, inv_f, (const uint8_t *)g, logn, scratch);

	/*
	 * Keep only the low 7 bits of each coefficient (modulo 128).
	 */
	for (i = 0; i < n; i ++) {
		h[i] &= 0x7F;
	}
	return ok;
}
418 | 
419 | /* see inner.h */
uint32_t
bat_encrypt_128(int8_t *c, const uint8_t *sbuf,
	const uint8_t *h, unsigned logn, uint32_t *tmp)
{
	const size_t n = (size_t)1 << logn;
	uint8_t *prod = (uint8_t *)tmp;
	uint8_t *scratch = prod + n;
	size_t i;

	/*
	 * Unpack the bits of sbuf[] (least significant bit first within
	 * each byte) into one 0/1 coefficient per byte: polynomial s.
	 */
	for (i = 0; i < n; i ++) {
		unsigned byte, bit;

		byte = sbuf[i >> 3];
		bit = (unsigned)i & 7;
		prod[i] = (uint8_t)((byte >> bit) & 1);
	}

	/*
	 * prod <- h*s
	 */
	mq_poly_mul(prod, h, prod, logn, scratch);

	/*
	 * c = round((h*s) / 2), rounding toward +inf. Each coefficient
	 * of h*s is first reduced modulo 128 into -63..+64; adding 63
	 * before masking and subtracting 31 after the halving does the
	 * centering, so the result is in -31..+32.
	 */
	for (i = 0; i < n; i ++) {
		unsigned shifted;

		shifted = (prod[i] + 63u) & 0x7F;
		c[i] = (int8_t)((int)(shifted >> 1) - 31);
	}

	/*
	 * The centered error vector e' and the centered secret s' both
	 * have all coefficients in {-1/2,+1/2}, so the norm of
	 * (gamma*s',e') is always sqrt((gamma^2 + 1)*(n/4)) = sqrt(n/2):
	 * there is no failure case, and the function always reports
	 * success.
	 */
	return 1;
}
460 | 
461 | /* see inner.h */
void
bat_decrypt_128(uint8_t *sbuf, const int8_t *c,
	const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
	const int32_t *w, unsigned logn, uint32_t *tmp)
{
	/*
	 * q = 128, Q = 2, q' = 64513, k = 2.
	 *
	 * Decapsulation algorithm:
	 *
	 *   c <- k*c
	 *   c' <- (Q*f*c - f*ones - g*ones) mod q*Q
	 *   c'' <- (q'*Q*F*c - q'*F*ones - q'*G*ones - c'*w) mod q*q'*Q
	 *   e' = (-Gd*c' + g*c'') / (q*q'*Q)
	 *   s' = (Fd*c' - f*c'') / (q*q'*Q)
	 *   e = e' + (1/2)*ones
	 *   s = s' + (1/2)*ones
	 *
	 * We don't need to recompute e, only s. q*Q = 256, which is
	 * natively supported by the code in this file.
	 *
	 * Output: sbuf[] receives the n = 2^logn recovered bits of s,
	 * packed least significant bit first ((n + 7) >> 3 bytes).
	 */
	size_t u, n;
	uint8_t *t1, *t2, *t3, *t4;
	uint16_t *tw1, *tw3, *tw4;

	/*
	 * Layout inside tmp[] (byte offsets):
	 *   t1: 0, t2: n, t3: 2*n, t4: 3*n   (n bytes each)
	 *   tw1: same address as t1 (n 16-bit words, covers t1 and t2)
	 *   tw3: same address as t3 (n 16-bit words, covers t3 and t4)
	 *   tw4: 4*n (also passed as scratch space to the helpers
	 *        called below)
	 */
	n = (size_t)1 << logn;
	t1 = (uint8_t *)tmp;
	t2 = t1 + n;
	t3 = t2 + n;
	t4 = t3 + n;

	tw1 = (uint16_t *)t1;
	tw3 = (uint16_t *)t3;
	tw4 = tw3 + n;

	/*
	 * c <- k*c (implicit)
	 * t2 <- Q*c
	 *
	 * Coefficients of c may be negative, and left-shifting a
	 * negative signed value is undefined behaviour in C. The value
	 * is therefore converted to unsigned before the shift; the
	 * result modulo 256 is the same.
	 */
	for (u = 0; u < n; u ++) {
		t2[u] = (uint8_t)((unsigned)c[u] << 2);
	}

	/*
	 * t1 <- c' = (Q*f*c - f*ones - g*ones) mod q*Q
	 */
	mq_poly_mul(t1, t2, (const uint8_t *)f, logn, t3);
	mq_poly_add(t3, (const uint8_t *)f, (const uint8_t *)g, logn);
	mq_poly_mul_ones(t3, t3, logn);
	mq_poly_sub(t1, t1, t3, logn);

	/*
	 * t2 <- (q'*Q*F*c - q'*F*ones - q'*G*ones - c'*w) mod q*Q
	 */
	mq_poly_mul(t2, t2, (const uint8_t *)F, logn, t4);
	mq_poly_add(t3, (const uint8_t *)F, (const uint8_t *)G, logn);
	mq_poly_mul_ones(t3, t3, logn);
	mq_poly_sub(t2, t2, t3, logn);
	mq_poly_mulconst(t2, t2, 64513 & 0xFF, logn);
	for (u = 0; u < n; u ++) {
		t3[u] = (uint8_t)w[u];
	}
	mq_poly_mul(t3, t1, t3, logn, t4);
	mq_poly_sub(t2, t2, t3, logn);

	/*
	 * tw3 <- -c'*w mod q'
	 * This involves rebuilding c' mod q' in tw3 first: each
	 * coefficient of c' (0..255) is normalized to -127..+128.
	 */
	for (u = 0; u < n; u ++) {
		*(int16_t *)&tw3[u] = (int)((t1[u] + 127) & 0xFF) - 127;
	}
	bat_polyqp_mulneg((int16_t *)tw3, (int16_t *)tw3, w,
		logn, (uint32_t *)tw4);

	/*
	 * At that point, we have:
	 *   t1:  c' mod q*Q   (0..255)
	 *   t2:  c'' mod q*Q  (0..255)
	 *   tw3: c'' mod q'   (-32256..+32256)
	 *
	 * We now want to assemble c'' mod 257 in tw3. We do so by
	 * applying the CRT between t2 and tw3.
	 */
	for (u = 0; u < n; u ++) {
		uint16_t z;
		uint32_t x0, x1, x;
		int32_t y;

		/*
		 * We ensure that we get a positive value by adding
		 * q' = 64513 to the coefficient from c''.
		 */
		x0 = t2[u];
		z = tw3[u];
		x1 = (uint32_t)(*(int16_t *)&z + 64513);

		/*
		 * CRT reconstruction: If:
		 *   x = x0 mod q*Q
		 *   x = x1 mod q'
		 * then:
		 *   x = ((1/q') * (x0 - x1) mod q*Q) * q' + x1
		 * We have q*Q = 256; since 64513 = 252*256 + 1, the value
		 * 1/q' mod q*Q is trivial.
		 */
		x = x1 + ((x0 - x1) & 0xFF) * 64513;

		/*
		 * x is a positive representative of c'' modulo
		 * q*q'*Q = 16515328. We normalize it around zero by
		 * subtracting 16515328 whenever x exceeds 8257664
		 * (the mask is computed without a branch, from the
		 * top bit of 8257664 - x).
		 */
		y = (int32_t)x - (int32_t)(16515328
			& -((uint32_t)(8257664 - x) >> 31));

		/*
		 * For reduction modulo 257, we ensure a positive value by
		 * adding 32131*257 = 8257667.
		 */
		tw3[u] = m257_tomonty((uint32_t)(y + 8257667));
	}

	/*
	 * We assemble c' mod 257 in tw4 (then moved to tw1).
	 * tw1 overlaps t1, which is still being read here, hence the
	 * detour through tw4.
	 */
	for (u = 0; u < n; u ++) {
		/*
		 * For x in 0..255, normalization to -127..+128 is done
		 * by computing ((x + 127) % 256) - 127. But we then want
		 * to add 257 to get a positive value for reduction modulo
		 * 257.
		 */
		tw4[u] = m257_tomonty(((t1[u] + 127) & 0xFF) + 130);
	}
	memcpy(tw1, tw4, n * sizeof *tw4);

	/*
	 * We have c' mod 257 in tw1, and c'' mod 257 in tw3. We
	 * use the mod 257 code to obtain q*q'*Q*s'.
	 */
	bat_finish_decapsulate_257(tw1, tw3, f, F, w, logn, (uint32_t *)tw4);

	/*
	 * If the ciphertext is correct and the decapsulation worked well,
	 * then s' has coefficients in {-1/2,+1/2} and the coefficients of
	 * s are obtained by adding 1/2. We have the coefficients of
	 * q*q'*Q*s' in tw1[], in Montgomery representation modulo 257:
	 *
	 *    s'   s   tw1[]
	 *  -1/2   0      3     (-q*q'*Q/2 = 3 mod 257)
	 *  +1/2   1    254     (+q*q'*Q/2 = 254 mod 257)
	 *
	 * Thus, we just need to look at the least significant bit of each
	 * value in tw1[] to get the coefficients of s.
	 */
	memset(sbuf, 0, (n + 7) >> 3);
	for (u = 0; u < n; u ++) {
		sbuf[u >> 3] |= (1 - (tw1[u] & 1)) << (u & 7);
	}
}
622 | 
623 | /* see inner.h */
624 | int
625 | bat_rebuild_G_128(int8_t *G,
626 | 	const int8_t *f, const int8_t *g, const int8_t *F,
627 | 	unsigned logn, uint32_t *tmp)
628 | {
629 | 	size_t u, n;
630 | 	uint8_t *t1, *t2, *t3;
631 | 	int lim;
632 | 
633 | 	n = (size_t)1 << logn;
634 | 	t1 = (uint8_t *)tmp;
635 | 	t2 = t1 + n;
636 | 	t3 = t2 + n;
637 | 
638 | 	/*
639 | 	 * We have g*F - f*G = q; therefore: G = (g*F - q) / f
640 | 	 *
641 | 	 * We compute modulo 256; note that if f is invertible modulo
642 | 	 * 128, it will be invertible modulo 256, and vice versa.
643 | 	 */
644 | 	if (!mq_poly_inv(t1, (const uint8_t *)f, logn, t3)) {
645 | 		return 0;
646 | 	}
647 | 	mq_poly_mul(t2, (const uint8_t *)g, (const uint8_t *)F, logn, t3);
648 | 	t2[0] -= 128;
649 | 	mq_poly_mul(t1, t1, t2, logn, t3);
650 | 
651 | 	/*
652 | 	 * Normalize coefficients of G around 0, and check that they
653 | 	 * are within the expected bounds.
654 | 	 */
655 | 	lim = (1 << (bat_max_FG_bits[logn] - 1)) - 1;
656 | 	for (u = 0; u < n; u ++) {
657 | 		int x;
658 | 
659 | 		x = ((t1[u] + 127) & 0xFF) - 127;
660 | 		if (x < -lim || x > +lim) {
661 | 			return 0;
662 | 		}
663 | 		G[u] = (int8_t)x;
664 | 	}
665 | 	return 1;
666 | }
667 | 
668 | /*
669 |  * Values modulo q are in 0..127, which naturally encodes over exactly
670 |  * 7 bits.
671 |  */
672 | 
673 | /* see inner.h */
/*
 * Encode a polynomial with 7-bit coefficients (little-endian bit
 * packing).
 *
 * If out is NULL, the required output length ceil(7*n/8) is returned
 * and nothing is written. If max_out_len is too small, 0 is returned.
 * Otherwise the number of bytes written is returned.
 */
size_t
bat_encode_128(void *out, size_t max_out_len,
	const uint8_t *x, unsigned logn)
{
	const size_t n = (size_t)1 << logn;
	const size_t need = ((7 * n) + 7) >> 3;
	uint8_t *dst;
	size_t i, off;
	unsigned j;

	if (out == NULL) {
		return need;
	}
	if (max_out_len < need) {
		return 0;
	}
	dst = out;

	/*
	 * Small degrees: all coefficients fit in a single 16-bit
	 * (n = 2) or 32-bit (n = 4) word.
	 */
	if (n == 2 || n == 4) {
		uint32_t w = 0;

		for (j = 0; j < n; j ++) {
			w |= (uint32_t)x[j] << (7 * j);
		}
		if (n == 2) {
			enc16le(dst, w);
		} else {
			enc32le(dst, w);
		}
		return n;
	}

	/*
	 * General case: each group of 8 coefficients yields 56 bits,
	 * written as a 32-bit word followed by a 24-bit word.
	 */
	off = 0;
	for (i = 0; (i + 8) <= n; i += 8) {
		uint64_t acc = 0;

		for (j = 0; j < 8; j ++) {
			acc |= (uint64_t)x[i + j] << (7 * j);
		}
		enc32le(dst + off, (uint32_t)acc);
		enc24le(dst + off + 4, (uint32_t)(acc >> 32));
		off += 7;
	}
	return off;
}
726 | 
727 | /* see inner.h */
/*
 * Decode a polynomial with 7-bit coefficients; this is the reverse of
 * bat_encode_128().
 *
 * Returned value is the number of bytes read from in[], or 0 if
 * max_in_len is too short or if the unused padding bits (which exist
 * only for n = 2 and n = 4) are not all zero.
 */
size_t
bat_decode_128(uint8_t *x, unsigned logn,
	const void *in, size_t max_in_len)
{
	const size_t n = (size_t)1 << logn;
	const uint8_t *src = in;
	size_t i, used;
	unsigned j;
	uint32_t ok;

	if (max_in_len < (((7 * n) + 7) >> 3)) {
		return 0;
	}

	if (n == 2 || n == 4) {
		uint32_t w;

		if (n == 2) {
			w = dec16le(src);
		} else {
			w = dec32le(src);
		}
		for (j = 0; j < n; j ++) {
			x[j] = (w >> (7 * j)) & 0x7F;
		}

		/*
		 * ok is 1 if and only if every bit above the last
		 * coefficient is zero.
		 */
		ok = (uint32_t)((w >> (7 * (unsigned)n)) - 1) >> 31;
		used = n;
	} else {
		used = 0;
		for (i = 0; (i + 8) <= n; i += 8) {
			uint64_t acc;

			/*
			 * 56 bits per group of 8 coefficients: a 32-bit
			 * word followed by a 24-bit word.
			 */
			acc = (uint64_t)dec32le(src + used);
			acc |= (uint64_t)dec24le(src + used + 4) << 32;
			used += 7;
			for (j = 0; j < 8; j ++) {
				x[i + j] = (acc >> (7 * j)) & 0x7F;
			}
		}
		ok = 1;
	}

	/* Turn the length into 0 when the padding check failed. */
	return used & -(size_t)ok;
}
781 | 
782 | /*
783 |  * Ciphertext values are in -31..+32 range; for value v, we encode v+31
784 |  * over 6 bits (v+31 is in 0..63).
785 |  */
786 | 
787 | /* see inner.h */
/*
 * Encode a ciphertext polynomial: each coefficient c is stored as c+31
 * over 6 bits (little-endian bit packing).
 *
 * If out is NULL, the required output length ceil(6*n/8) is returned
 * and nothing is written. If max_out_len is too small, 0 is returned.
 * Otherwise the number of bytes written is returned.
 */
size_t
bat_encode_ciphertext_128(void *out, size_t max_out_len,
	const int8_t *c, unsigned logn)
{
	const size_t n = (size_t)1 << logn;
	const size_t need = ((6 * n) + 7) >> 3;
	uint8_t *dst;
	size_t i, off;
	unsigned j;

	if (out == NULL) {
		return need;
	}
	if (max_out_len < need) {
		return 0;
	}
	dst = out;

	/* Groups of 4 coefficients make exactly 24 bits. */
	off = 0;
	i = 0;
	while ((i + 4) <= n) {
		uint32_t acc = 0;

		for (j = 0; j < 4; j ++) {
			acc |= (uint32_t)(c[i + j] + 31) << (6 * j);
		}
		enc24le(dst + off, acc);
		off += 3;
		i += 4;
	}

	/* Leftover pair of coefficients, written over 16 bits. */
	if (i < n) {
		uint32_t acc;

		acc = (uint32_t)(c[i] + 31);
		acc |= (uint32_t)(c[i + 1] + 31) << 6;
		enc16le(dst + off, acc);
		off += 2;
	}
	return off;
}
825 | 
826 | /* see inner.h */
/*
 * Decode a ciphertext polynomial; this is the reverse of
 * bat_encode_ciphertext_128(): each 6-bit field v yields the
 * coefficient v-31.
 *
 * Returned value is the number of bytes read from in[], or 0 if
 * max_in_len is too short or if the padding bits of a trailing 16-bit
 * word are not all zero.
 */
size_t
bat_decode_ciphertext_128(int8_t *c, unsigned logn,
	const void *in, size_t max_in_len)
{
	const size_t n = (size_t)1 << logn;
	const uint8_t *src = in;
	size_t i, used;
	unsigned j;
	uint32_t ok;

	if (max_in_len < (((6 * n) + 7) >> 3)) {
		return 0;
	}

	ok = 1;
	used = 0;
	i = 0;
	while ((i + 4) <= n) {
		uint32_t w = dec24le(src + used);

		used += 3;
		for (j = 0; j < 4; j ++) {
			c[i + j] = (int)((w >> (6 * j)) & 0x3F) - 31;
		}
		i += 4;
	}
	if (i < n) {
		uint32_t w = dec16le(src + used);

		used += 2;
		c[i + 0] = (int)(w & 0x3F) - 31;
		c[i + 1] = (int)((w >> 6) & 0x3F) - 31;

		/* The top four bits of the 16-bit word must be zero. */
		ok &= (uint32_t)((w >> 12) - 1) >> 31;
	}

	/* Turn the length into 0 when the padding check failed. */
	return used & -(size_t)ok;
}
864 | 


--------------------------------------------------------------------------------
/src/modqp.c:
--------------------------------------------------------------------------------
 1 | #include "inner.h"
 2 | 
 3 | #define Q   64513
 4 | #include "modgen.c"
 5 | 
 6 | /* see inner.h */
 7 | void
 8 | bat_polyqp_mulneg(int16_t *d, const int16_t *a, const int32_t *b,
 9 | 	unsigned logn, uint32_t *tmp)
10 | {
11 | 	size_t u, n;
12 | 	uint16_t *t1, *t2;
13 | 
14 | 	n = (size_t)1 << logn;
15 | 
16 | 	/*
17 | 	 * In order to save memory, we use the destination array for
18 | 	 * intermediate computations as well. Since d may partially
19 | 	 * overlap with a, we first do a memmove().
20 | 	 */
21 | 	if (d != a) {
22 | 		memmove(d, a, n * sizeof *a);
23 | 	}
24 | 	t1 = (uint16_t *)d;
25 | 	t2 = (uint16_t *)tmp;
26 | 	for (u = 0; u < n; u ++) {
27 | 		t1[u] = mq_set(*(int16_t *)&t1[u]);
28 | 		t2[u] = mq_set(-b[u]);
29 | 	}
30 | 	NTT(t1, t1, logn);
31 | 	NTT(t2, t2, logn);
32 | 	mq_poly_mul_ntt(t1, t1, t2, logn);
33 | 	iNTT(t1, t1, logn);
34 | 	for (u = 0; u < n; u ++) {
35 | 		*(int16_t *)&t1[u] = mq_snorm(t1[u]);
36 | 	}
37 | }
38 | 


--------------------------------------------------------------------------------
/src/prng.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * PRNG and interface to the system RNG.
 3 |  */
 4 | 
 5 | #include <assert.h>
 6 | 
 7 | #include "inner.h"
 8 | 
 9 | /*
10 |  * Include relevant system header files. For Win32, this will also need
11 |  * linking with advapi32.dll, which we trigger with an appropriate #pragma.
12 |  */
13 | #if BAT_RAND_GETENTROPY
14 | #include <unistd.h>
15 | #endif
16 | #if BAT_RAND_URANDOM
17 | #include <sys/types.h>
18 | #if !BAT_RAND_GETENTROPY
19 | #include <unistd.h>
20 | #endif
21 | #include <fcntl.h>
22 | #include <errno.h>
23 | #endif
24 | #if BAT_RAND_WIN32
25 | #include <windows.h>
26 | #define SystemFunction036   NTAPI SystemFunction036
27 | #include <NTSecAPI.h>
28 | #undef SystemFunction036
29 | #pragma comment(lib, "advapi32")
30 | #endif
31 | 
32 | /* see inner.h */
/*
 * Obtain len bytes from the operating system RNG and write them into
 * seed[].
 *
 * The sources enabled at compile time (BAT_RAND_GETENTROPY,
 * BAT_RAND_URANDOM, BAT_RAND_WIN32) are tried in that order; the first
 * one that delivers all requested bytes wins.
 *
 * Returned value is 1 on success, 0 on failure (no source enabled, or
 * all enabled sources failed). A request for zero bytes always
 * succeeds.
 */
int
bat_get_seed(void *seed, size_t len)
{
	/* seed is unused when no RNG source is compiled in. */
	(void)seed;
	if (len == 0) {
		return 1;
	}
#if BAT_RAND_GETENTROPY
	if (getentropy(seed, len) == 0) {
		return 1;
	}
#endif
#if BAT_RAND_URANDOM
	/*
	 * We could try to optimize this code with some caching of the
	 * file descriptor, but this raises extra difficulties (this is
	 * hard to make thread-safe without dabbling with a mutex). It
	 * is simpler to assume that any Unix-like platform for which it
	 * is worth optimizing performance will also have a recent
	 * enough OS to use getentropy() (possibly as a wrapper around
	 * getrandom()).
	 */
	{
		int f;

		f = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
		if (f >= 0) {
			while (len > 0) {
				ssize_t rlen;

				rlen = read(f, seed, len);
				if (rlen < 0) {
					/* Interrupted by a signal: retry. */
					if (errno == EINTR) {
						continue;
					}
					break;
				}
				if (rlen == 0) {
					/*
					 * End-of-file: no more bytes will
					 * come, so looping again would
					 * never terminate. Report this as
					 * a failure of this source.
					 */
					break;
				}
				seed = (uint8_t *)seed + rlen;
				len -= (size_t)rlen;
			}
			close(f);
			if (len == 0) {
				return 1;
			}
		}
	}
#endif
#if BAT_RAND_WIN32
	/*
	 * Nominally, a "Win32" implementation should use CryptoAPI
	 * (CryptAcquireContext(), then CryptGenRandom()) but this is
	 * quite inefficient and error prone. Since Windows XP and
	 * Windows Server 2003, the RtlGenRandom() function (from
	 * advapi32.dll) offers a much more direct road to the OS RNG.
	 */
	if (RtlGenRandom(seed, len)) {
		return 1;
	}
#endif
	return 0;
}
94 | 


--------------------------------------------------------------------------------
/src/speed_bat.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Speed benchmark code for BAT implementation.
  3 |  */
  4 | 
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #include <string.h>
  8 | #include <time.h>
  9 | 
 10 | #include "bat.h"
 11 | #include "inner.h"
 12 | 
 13 | #ifndef DO_BENCH86
 14 | #if defined __i386__ || defined _M_IX86 || defined __x86_64__ || defined _M_X64
 15 | #define DO_BENCH86   1
 16 | #else
 17 | #define DO_BENCH86   0
 18 | #endif
 19 | #endif
 20 | 
 21 | #if DO_BENCH86
 22 | #include <immintrin.h>
 23 | 
 24 | static inline uint64_t
 25 | core_cycles(void)
 26 | {
 27 | #if defined __GNUC__ && !defined __clang__
 28 | 	uint32_t hi, lo;
 29 | 
 30 | 	_mm_lfence();
 31 | 	__asm__ __volatile__ ("rdtsc" : "=d" (hi), "=a" (lo) : : );
 32 | 	return ((uint64_t)hi << 32) | (uint64_t)lo;
 33 | #else
 34 | 	_mm_lfence();
 35 | 	return __rdtsc();
 36 | #endif
 37 | }
 38 | 
 39 | #endif
 40 | 
 41 | static void *
 42 | xmalloc(size_t len)
 43 | {
 44 | 	void *buf;
 45 | 
 46 | 	if (len == 0) {
 47 | 		return NULL;
 48 | 	}
 49 | 	buf = malloc(len);
 50 | 	if (buf == NULL) {
 51 | 		fprintf(stderr, "memory allocation error\n");
 52 | 		exit(EXIT_FAILURE);
 53 | 	}
 54 | 	return buf;
 55 | }
 56 | 
 57 | static void
 58 | xfree(void *buf)
 59 | {
 60 | 	if (buf != NULL) {
 61 | 		free(buf);
 62 | 	}
 63 | }
 64 | 
 65 | /*
 66 |  * Benchmark function takes an opaque context and an iteration count;
 67 |  * it returns 0 on success, a negative error code on error.
 68 |  */
 69 | typedef int (*bench_fun)(void *ctx, unsigned long num);
 70 | 
 71 | /*
 72 |  * Returned value is the time per iteration in nanoseconds.
 73 |  * WARNING: ON x86, VALUES ARE RETURNED IN CLOCK CYCLES, NOT NANOSECONDS;
 74 |  * THRESHOLD IS IN BILLIONS OF CYCLES.
 75 |  *
 76 |  * If the benchmark function reports an error, 0.0 is returned.
 77 |  */
 78 | static double
 79 | do_bench(bench_fun bf, void *ctx, double threshold)
 80 | {
 81 | 	unsigned long num;
 82 | 	int r;
 83 | 
 84 | 	/*
 85 | 	 * Alsways do a few blank runs to "train" the caches and branch
 86 | 	 * prediction.
 87 | 	 */
 88 | 	r = bf(ctx, 5);
 89 | 	if (r != 0) {
 90 | 		fprintf(stderr, "ERR: %d\n", r);
 91 | 		return 0.0;
 92 | 	}
 93 | 
 94 | 	num = 1;
 95 | 	for (;;) {
 96 | #if DO_BENCH86
 97 | 		uint64_t begin, end;
 98 | #else
 99 | 		clock_t begin, end;
100 | #endif
101 | 		double tt;
102 | 
103 | #if DO_BENCH86
104 | 		begin = core_cycles();
105 | #else
106 | 		begin = clock();
107 | #endif
108 | 		r = bf(ctx, num);
109 | #if DO_BENCH86
110 | 		end = core_cycles();
111 | #else
112 | 		end = clock();
113 | #endif
114 | 		if (r != 0) {
115 | 			fprintf(stderr, "ERR: %d\n", r);
116 | 			return 0.0;
117 | 		}
118 | #if DO_BENCH86
119 | 		tt = (double)(end - begin) / (double)1000000000.0;
120 | #else
121 | 		tt = (double)(end - begin) / (double)CLOCKS_PER_SEC;
122 | #endif
123 | 		if (tt >= threshold) {
124 | 			return tt * 1000000000.0 / (double)num;
125 | 		}
126 | 
127 | 		/*
128 | 		 * If the function ran for less than 0.1 seconds then
129 | 		 * we simply double the iteration number; otherwise, we
130 | 		 * use the run time to try to get a "correct" number of
131 | 		 * iterations quickly.
132 | 		 */
133 | 		if (tt < 0.1) {
134 | 			num <<= 1;
135 | 		} else {
136 | 			unsigned long num2;
137 | 
138 | 			num2 = (unsigned long)((double)num
139 | 				* (threshold * 1.1) / tt);
140 | 			if (num2 <= num) {
141 | 				num2 = num + 1;
142 | 			}
143 | 			num = num2;
144 | 		}
145 | 	}
146 | }
147 | 
/*
 * Token-pasting helpers used to build the names of the API functions
 * and of the benchmark wrappers for a given (q, n) parameter set.
 *
 * XCAT() goes through XCAT_() so that its arguments are macro-expanded
 * before being pasted together.
 *
 *   Zn(q, n, name)   expands to   bat_<q>_<n>_<name>
 *   Bn(q, n, name)   expands to   bench_<q>_<n>_<name>
 */
#define XCAT(x, y)       XCAT_(x, y)
#define XCAT_(x, y)      x ## y
#define Zn(q, n, name)   XCAT(XCAT(XCAT(bat_, q), XCAT(_, n)), XCAT(_, name))
#define Bn(q, n, name)   XCAT(XCAT(XCAT(bench_, q), XCAT(_, n)), XCAT(_, name))
152 | 
/*
 * MK_BENCH_FUNS(q, n) defines, for the parameter set (q, n):
 *
 *  - the context type bench_<q>_<n>_context, which holds the key pair,
 *    a ciphertext, the buffers for their encoded forms, a temporary
 *    area, and the random inputs used by the encapsulation functions;
 *
 *  - one static wrapper bench_<q>_<n>_<op>(ctx, num) per operation,
 *    with the bench_fun signature: the operation is run num times, and
 *    the wrapper returns 0 on success, or -1 as soon as the underlying
 *    call reports a failure (the decapsulate_nofo wrapper does not
 *    check anything and always returns 0);
 *
 *  - bench_<q>_<n>_all(threshold), which allocates the buffers, gets
 *    the random inputs from bat_get_seed(), runs PRINT_BENCHS(q, n)
 *    and finally releases the buffers.
 *
 * Buffer lengths are obtained by invoking each encoding function with
 * null arguments (NOTE(review): presumably the length-query convention
 * of bat.h -- confirm). The ciphertext buffer length comes from the
 * ciphertext encoder, not from the public key encoder.
 */
#define MK_BENCH_FUNS(q, n) \
 \
typedef struct { \
	Zn(q, n, private_key) sk; \
	Zn(q, n, public_key) pk; \
	Zn(q, n, ciphertext) ct; \
	uint8_t *enc_sk; \
	size_t enc_sk_len; \
	uint8_t *enc_pk; \
	size_t enc_pk_len; \
	uint8_t *enc_ct; \
	size_t enc_ct_len; \
	uint8_t *tmp; \
	size_t tmp_len; \
	uint8_t secret[32]; \
	unsigned logn; \
	uint8_t *sbuf; \
	uint8_t randm[32]; \
} Bn(q, n, context); \
 \
static int \
Bn(q, n, keygen)(void *ctx, unsigned long num) \
{ \
	Bn(q, n, context) *bc; \
 \
	bc = ctx; \
	while (num -- > 0) { \
		if (Zn(q, n, keygen)(&bc->sk, bc->tmp, bc->tmp_len) != 0) { \
			return -1; \
		} \
	} \
	return 0; \
} \
 \
static int \
Bn(q, n, encode_private_key_short)(void *ctx, unsigned long num) \
{ \
	Bn(q, n, context) *bc; \
 \
	bc = ctx; \
	while (num -- > 0) { \
		if (Zn(q, n, encode_private_key)( \
			bc->enc_sk, bc->enc_sk_len, &bc->sk, 1) == 0) \
		{ \
			return -1; \
		} \
	} \
	return 0; \
} \
 \
static int \
Bn(q, n, encode_private_key_long)(void *ctx, unsigned long num) \
{ \
	Bn(q, n, context) *bc; \
 \
	bc = ctx; \
	while (num -- > 0) { \
		if (Zn(q, n, encode_private_key)( \
			bc->enc_sk, bc->enc_sk_len, &bc->sk, 0) == 0) \
		{ \
			return -1; \
		} \
	} \
	return 0; \
} \
 \
static int \
Bn(q, n, decode_private_key)(void *ctx, unsigned long num) \
{ \
	Bn(q, n, context) *bc; \
 \
	bc = ctx; \
	while (num -- > 0) { \
		if (Zn(q, n, decode_private_key)( \
			&bc->sk, bc->enc_sk, bc->enc_sk_len, \
			bc->tmp, bc->tmp_len) == 0) \
		{ \
			return -1; \
		} \
	} \
	return 0; \
} \
 \
static int \
Bn(q, n, encode_public_key)(void *ctx, unsigned long num) \
{ \
	Bn(q, n, context) *bc; \
 \
	bc = ctx; \
	while (num -- > 0) { \
		if (Zn(q, n, encode_public_key)( \
			bc->enc_pk, bc->enc_pk_len, &bc->pk) == 0) \
		{ \
			return -1; \
		} \
	} \
	return 0; \
} \
 \
static int \
Bn(q, n, decode_public_key)(void *ctx, unsigned long num) \
{ \
	Bn(q, n, context) *bc; \
 \
	bc = ctx; \
	while (num -- > 0) { \
		if (Zn(q, n, decode_public_key)( \
			&bc->pk, bc->enc_pk, bc->enc_pk_len) == 0) \
		{ \
			return -1; \
		} \
	} \
	return 0; \
} \
 \
static int \
Bn(q, n, encode_ciphertext)(void *ctx, unsigned long num) \
{ \
	Bn(q, n, context) *bc; \
 \
	bc = ctx; \
	while (num -- > 0) { \
		if (Zn(q, n, encode_ciphertext)( \
			bc->enc_ct, bc->enc_ct_len, &bc->ct) == 0) \
		{ \
			return -1; \
		} \
	} \
	return 0; \
} \
 \
static int \
Bn(q, n, decode_ciphertext)(void *ctx, unsigned long num) \
{ \
	Bn(q, n, context) *bc; \
 \
	bc = ctx; \
	while (num -- > 0) { \
		if (Zn(q, n, decode_ciphertext)( \
			&bc->ct, bc->enc_ct, bc->enc_ct_len) == 0) \
		{ \
			return -1; \
		} \
	} \
	return 0; \
} \
 \
static int \
Bn(q, n, encapsulate)(void *ctx, unsigned long num) \
{ \
	Bn(q, n, context) *bc; \
 \
	bc = ctx; \
	while (num -- > 0) { \
		if (Zn(q, n, encapsulate_explicit_seed)( \
			bc->secret, sizeof bc->secret, \
			&bc->ct, &bc->pk, bc->randm, \
			bc->tmp, bc->tmp_len) != 0) \
		{ \
			return -1; \
		} \
	} \
	return 0; \
} \
 \
static int \
Bn(q, n, decapsulate)(void *ctx, unsigned long num) \
{ \
	Bn(q, n, context) *bc; \
 \
	bc = ctx; \
	while (num -- > 0) { \
		if (Zn(q, n, decapsulate)( \
			bc->secret, sizeof bc->secret, \
			&bc->ct, &bc->sk, bc->tmp, bc->tmp_len) != 0) \
		{ \
			return -1; \
		} \
	} \
	return 0; \
} \
 \
static int \
Bn(q, n, encapsulate_nofo)(void *ctx, unsigned long num) \
{ \
	Bn(q, n, context) *bc; \
 \
	bc = ctx; \
	while (num -- > 0) { \
		if (!XCAT(bat_encrypt_, q)(bc->ct.c, bc->sbuf, \
			bc->pk.h, bc->logn, (uint32_t *)bc->tmp)) \
		{ \
			return -1; \
		} \
	} \
	return 0; \
} \
 \
static int \
Bn(q, n, decapsulate_nofo)(void *ctx, unsigned long num) \
{ \
	Bn(q, n, context) *bc; \
 \
	bc = ctx; \
	while (num -- > 0) { \
		XCAT(bat_decrypt_, q)(bc->sbuf, bc->ct.c, \
			bc->sk.f, bc->sk.g, bc->sk.F, bc->sk.G, \
			bc->sk.w, bc->logn, (uint32_t *)bc->tmp); \
		XCAT(bat_encrypt_, q)(bc->ct.c, bc->sbuf, \
			bc->pk.h, bc->logn, (uint32_t *)bc->tmp); \
	} \
	return 0; \
} \
 \
static void \
Bn(q, n, all)(double threshold) \
{ \
	Bn(q, n, context) bc; \
 \
	printf("q=%3u, n=%4u:", (unsigned)q, (unsigned)n); \
	fflush(stdout); \
 \
	bc.enc_sk_len = Zn(q, n, encode_private_key(0, 0, 0, 0)); \
	bc.enc_pk_len = Zn(q, n, encode_public_key(0, 0, 0)); \
	bc.enc_ct_len = Zn(q, n, encode_ciphertext(0, 0, 0)); \
	bc.enc_sk = xmalloc(bc.enc_sk_len); \
	bc.enc_pk = xmalloc(bc.enc_pk_len); \
	bc.enc_ct = xmalloc(bc.enc_ct_len); \
	bc.tmp_len = 24 * n + 31; \
	bc.tmp = xmalloc(bc.tmp_len); \
	for (bc.logn = 1; (1u << bc.logn) < n; bc.logn ++); \
	bc.sbuf = xmalloc(SBUF_LEN(bc.logn)); \
	if (!bat_get_seed(bc.sbuf, SBUF_LEN(bc.logn))) { \
		fprintf(stderr, "ERR: bat_get_seed() failed\n"); \
		exit(EXIT_FAILURE); \
	} \
	if (!bat_get_seed(bc.randm, sizeof bc.randm)) { \
		fprintf(stderr, "ERR: bat_get_seed() failed\n"); \
		exit(EXIT_FAILURE); \
	} \
 \
	PRINT_BENCHS(q, n); \
 \
	xfree(bc.enc_sk); \
	xfree(bc.enc_pk); \
	xfree(bc.enc_ct); \
	xfree(bc.tmp); \
	xfree(bc.sbuf); \
}
402 | 
/*
 * Output format and unit conversion applied to the values returned by
 * do_bench():
 *  - on x86, values are clock cycles; keygen is shown in thousands of
 *    cycles (with a 'k' suffix), everything else as raw cycle counts;
 *  - elsewhere, values are nanoseconds; keygen is shown in
 *    milliseconds, everything else in microseconds.
 */
#if DO_BENCH86
#define BENCH_KEYGEN_FMT       " %7.0fk"
#define BENCH_KEYGEN_UNIT(v)   ((v) / 1000.0)
#define BENCH_OP_FMT           " %8.0f"
#define BENCH_OP_UNIT(v)       (v)
#else
#define BENCH_KEYGEN_FMT       " %8.2f"
#define BENCH_KEYGEN_UNIT(v)   ((v) / 1000000.0)
#define BENCH_OP_FMT           " %8.2f"
#define BENCH_OP_UNIT(v)       ((v) / 1000.0)
#endif

/*
 * Run one benchmark and print its column. These macros expect the
 * local variables 'bc' (benchmark context) and 'threshold' to be in
 * scope at the expansion site.
 */
#define PRINT_KEYGEN_COL(q, n, name) \
	printf(BENCH_KEYGEN_FMT, BENCH_KEYGEN_UNIT( \
		do_bench(&Bn(q, n, name), &bc, threshold)))
#define PRINT_OP_COL(q, n, name) \
	printf(BENCH_OP_FMT, BENCH_OP_UNIT( \
		do_bench(&Bn(q, n, name), &bc, threshold)))

/*
 * Run all benchmarks for the (q, n) parameter set and print one line
 * of results; columns come in the same order as the header line
 * printed by main(). The public key is extracted from the private key
 * right before the public key benchmarks.
 */
#define PRINT_BENCHS(q, n) \
	do { \
		PRINT_KEYGEN_COL(q, n, keygen); \
		fflush(stdout); \
		PRINT_OP_COL(q, n, encode_private_key_short); \
		fflush(stdout); \
		PRINT_OP_COL(q, n, decode_private_key); \
		fflush(stdout); \
		PRINT_OP_COL(q, n, encode_private_key_long); \
		fflush(stdout); \
		PRINT_OP_COL(q, n, decode_private_key); \
		fflush(stdout); \
		Zn(q, n, get_public_key)(&bc.pk, &bc.sk); \
		PRINT_OP_COL(q, n, encode_public_key); \
		fflush(stdout); \
		PRINT_OP_COL(q, n, decode_public_key); \
		fflush(stdout); \
		PRINT_OP_COL(q, n, encapsulate_nofo); \
		fflush(stdout); \
		PRINT_OP_COL(q, n, encapsulate); \
		fflush(stdout); \
		PRINT_OP_COL(q, n, decapsulate_nofo); \
		fflush(stdout); \
		PRINT_OP_COL(q, n, decapsulate); \
		fflush(stdout); \
		PRINT_OP_COL(q, n, encode_ciphertext); \
		fflush(stdout); \
		PRINT_OP_COL(q, n, decode_ciphertext); \
		printf("\n"); \
		fflush(stdout); \
	} while (0)
520 | 
/*
 * Instantiate the benchmark context type and functions for each of the
 * three (q, n) parameter sets; this defines bench_128_256_all(),
 * bench_257_512_all() and bench_769_1024_all(), which main() calls.
 */
MK_BENCH_FUNS(128, 256)
MK_BENCH_FUNS(257, 512)
MK_BENCH_FUNS(769, 1024)
524 | 
/*
 * Benchmark entry point.
 *
 * The optional single argument is the minimum duration of a bench run
 * (default: 2.0); see do_bench() for its unit. With an out-of-range or
 * unparsable value, or with more than one argument, a usage message is
 * printed and the process exits with a failure status.
 */
int
main(int argc, char *argv[])
{
	double threshold;

	if (argc < 2) {
		threshold = 2.0;
	} else if (argc == 2) {
		threshold = atof(argv[1]);
	} else {
		threshold = -1.0;
	}

	/*
	 * The range test is written in positive form so that a NaN
	 * (which atof() yields for an argument such as "nan") is
	 * rejected as well: every comparison with a NaN is false, and a
	 * NaN threshold would make do_bench() loop forever.
	 */
	if (!(threshold > 0.0 && threshold <= 60.0)) {
		fprintf(stderr,
"usage: speed [ threshold ]\n"
"'threshold' is the minimum time for a bench run, in seconds (must be\n"
"positive and less than 60).\n");
		exit(EXIT_FAILURE);
	}
#if DO_BENCH86
	printf("time threshold = %.4f Gcyc\n", threshold);
#else
	printf("time threshold = %.4f s\n", threshold);
#endif

	printf("esk / dsk = encode / decode private key (s = short format, l = long format)\n");
	printf("epk / dpk = encode / decode public key\n");
	printf("ect / dct = encode / decode ciphertext\n");
	printf("ecp = encapsulate, dcp = decapsulate (nofo = without Fujisaki-Okamoto)\n");
#if DO_BENCH86
	printf("x86 PLATFORM, USING TSC; VALUES IN CLOCK CYCLES\n");
#else
	printf("keygen in milliseconds, all other times in microseconds\n");
#endif

	/*
	 * Column headers; the leading blank area has the width of the
	 * "q=..., n=...:" prefix printed at the start of each result line.
	 */
	printf("              "
		"   keygen"
		"    esk-s"
		"    dsk-s"
		"    esk-l"
		"    dsk-l"
		"      epk"
		"      dpk"
		" ecp-nofo"
		"   ecp-fo"
		" dcp-nofo"
		"   dcp-fo"
		"      ect"
		"      dct"
		"\n");
	bench_128_256_all(threshold);
	bench_257_512_all(threshold);
	bench_769_1024_all(threshold);
	return 0;
}
579 | 


--------------------------------------------------------------------------------
/src/test_bat.c:
--------------------------------------------------------------------------------
   1 | #include <stdio.h>
   2 | #include <stdlib.h>
   3 | #include <string.h>
   4 | #include <stdint.h>
   5 | 
   6 | #include "bat.h"
   7 | #include "inner.h"
   8 | 
   9 | static void
  10 | check_equals(const void *a1, const void *a2, size_t len, const char *msg)
  11 | {
  12 | 	const uint8_t *b1, *b2;
  13 | 	size_t u;
  14 | 
  15 | 	if (memcmp(a1, a2, len) == 0) {
  16 | 		return;
  17 | 	}
  18 | 	fprintf(stderr, "ERR: %s\n", msg);
  19 | 	b1 = a1;
  20 | 	b2 = a2;
  21 | 	fprintf(stderr, "a1 = ");
  22 | 	for (u = 0; u < len; u ++) {
  23 | 		fprintf(stderr, "%02x", b1[u]);
  24 | 	}
  25 | 	fprintf(stderr, "\n");
  26 | 	fprintf(stderr, "a2 = ");
  27 | 	for (u = 0; u < len; u ++) {
  28 | 		fprintf(stderr, "%02x", b2[u]);
  29 | 	}
  30 | 	fprintf(stderr, "\n");
  31 | 	exit(EXIT_FAILURE);
  32 | }
  33 | 
  34 | static void
  35 | selftest_seq(uint8_t *out, size_t len, uint32_t seed)
  36 | {
  37 | 	size_t i;
  38 | 	uint32_t t, a, b;
  39 | 
  40 | 	a = 0xDEAD4BAD * seed;
  41 | 	b = 1;
  42 | 
  43 | 	for (i = 0; i < len; i++) {
  44 | 		t = a + b;
  45 | 		a = b;
  46 | 		b = t;
  47 | 		out[i] = (t >> 24) & 0xFF;
  48 | 	}
  49 | }
  50 | 
  51 | static void
  52 | test_BLAKE2s_self()
  53 | {
  54 | 	/*
  55 | 	 * This code is from RFC 7693 (appendix E).
  56 | 	 */
  57 | 
  58 | 	// Grand hash of hash results.
  59 | 	static const uint8_t blake2s_res[32] = {
  60 | 		0x6A, 0x41, 0x1F, 0x08, 0xCE, 0x25, 0xAD, 0xCD,
  61 | 		0xFB, 0x02, 0xAB, 0xA6, 0x41, 0x45, 0x1C, 0xEC,
  62 | 		0x53, 0xC5, 0x98, 0xB2, 0x4F, 0x4F, 0xC7, 0x87,
  63 | 		0xFB, 0xDC, 0x88, 0x79, 0x7F, 0x4C, 0x1D, 0xFE
  64 | 	};
  65 | 	// Parameter sets.
  66 | 	static const size_t b2s_md_len[4] = { 16, 20, 28, 32 };
  67 | 	static const size_t b2s_in_len[6] = { 0, 3, 64, 65, 255, 1024 };
  68 | 
  69 | 	size_t i, j, outlen, inlen;
  70 | 	uint8_t in[1024], md[32], key[32];
  71 | 	blake2s_context ctx;
  72 | 
  73 | 	printf("Test BLAKE2s selftest: ");
  74 | 	fflush(stdout);
  75 | 
  76 | 	blake2s_init(&ctx, 32);
  77 | 
  78 | 	for (i = 0; i < 4; i ++) {
  79 | 		outlen = b2s_md_len[i];
  80 | 		for (j = 0; j < 6; j++) {
  81 | 			inlen = b2s_in_len[j];
  82 | 
  83 | 			selftest_seq(in, inlen, inlen);
  84 | 			blake2s(md, outlen, NULL, 0, in, inlen);
  85 | 			blake2s_update(&ctx, md, outlen);
  86 | 
  87 | 			selftest_seq(key, outlen, outlen);
  88 | 			blake2s(md, outlen, key, outlen, in, inlen);
  89 | 			blake2s_update(&ctx, md, outlen);
  90 | 		}
  91 | 		printf(".");
  92 | 		fflush(stdout);
  93 | 	}
  94 | 
  95 | 	blake2s_final(&ctx, md);
  96 | 	check_equals(md, blake2s_res, sizeof blake2s_res, "KAT");
  97 | 
  98 | 	printf(" done.\n");
  99 | 	fflush(stdout);
 100 | }
 101 | 
 102 | static void
 103 | test_BLAKE2s_expand(void)
 104 | {
 105 | 	size_t u;
 106 | 
 107 | 	printf("Test BLAKE2s expand: ");
 108 | 	fflush(stdout);
 109 | 
 110 | 	/* Test vector generated with python3 hashlib.blake2s()
 111 | 	   implementation. */
 112 | 	static const uint8_t seed[] = {
 113 | 		0x4B, 0xFC, 0xB2, 0x19, 0x96, 0xAC, 0xE1, 0xE2,
 114 | 		0xA1, 0xD5, 0x38, 0xC5, 0x4D, 0x10, 0x99, 0xBF,
 115 | 		0x53, 0x20, 0x82, 0x62
 116 | 	};
 117 | 
 118 | 	uint64_t label = 0x4A1BE6AC1347378C;
 119 | 
 120 | 	static const uint8_t ref[] = {
 121 | 		0x78, 0x3D, 0xAA, 0x23, 0xB5, 0x2A, 0xDE, 0x32,
 122 | 		0x8C, 0x44, 0xB5, 0xBF, 0x68, 0xB3, 0x8E, 0xA3,
 123 | 		0x47, 0x49, 0xDB, 0x98, 0x96, 0xB4, 0xD8, 0x84,
 124 | 		0xA0, 0xEB, 0xB0, 0x0B, 0x84, 0x91, 0x66, 0xBD,
 125 | 		0x49, 0x56, 0x50, 0xEC, 0x3E, 0x89, 0x46, 0xF3,
 126 | 		0x45, 0x26, 0xBF, 0xEA, 0x28, 0x63, 0xE3, 0x83,
 127 | 		0x31, 0x64, 0xFE, 0x30, 0xE2, 0x89, 0x71, 0xFC,
 128 | 		0x34, 0x0C, 0x13, 0x05, 0xBA, 0x0D, 0x51, 0x39,
 129 | 		0x63, 0xD7, 0x41, 0x41, 0xCB, 0x4D, 0x74, 0xE8,
 130 | 		0x3F, 0x62, 0x74, 0xA2, 0xE4, 0x12, 0xB3, 0x25,
 131 | 		0x48, 0xC9, 0x3E, 0x57, 0xD3, 0x9E, 0xDD, 0xD7,
 132 | 		0x7B, 0x35, 0xC4, 0xE8, 0x54, 0x2E, 0x78, 0x44,
 133 | 		0xEF, 0xF0, 0x98, 0xCF, 0x82, 0x6B, 0xD0, 0x92,
 134 | 		0x2E, 0xF6, 0x9E, 0xFA, 0xB3, 0x38, 0x83, 0x3B,
 135 | 		0x96, 0x0C, 0xCF, 0xEA, 0xA8, 0x5E, 0xBE, 0x14,
 136 | 		0x64, 0xD6, 0x35, 0xE3, 0xA8, 0x60, 0x40, 0xE5,
 137 | 		0xF5, 0xEB, 0xDC, 0x55, 0xC8, 0x74, 0xEB, 0x21,
 138 | 		0x19, 0x43, 0x98, 0x46, 0xCD, 0xBE, 0x22, 0x0A,
 139 | 		0x0A, 0xF9, 0x07, 0xEA, 0xF0, 0xDC, 0x26, 0x80,
 140 | 		0x43, 0x42, 0xA6, 0xEE, 0x4F, 0x73, 0xA3, 0x0E,
 141 | 		0xB4, 0xDB, 0x10, 0x2D, 0x48, 0x8A, 0x43, 0xA9,
 142 | 		0xC0, 0x8B, 0x31, 0x4F, 0x2B, 0x52, 0xBA, 0xE2,
 143 | 		0x33, 0xBC, 0x32, 0xEA, 0xB7, 0xBB, 0x64, 0x2D,
 144 | 		0x31, 0xDA, 0x42, 0x24, 0x7B, 0x7D, 0x34, 0x61,
 145 | 		0xE3, 0x90, 0x2B, 0xA4, 0x93, 0x4A, 0x9D, 0x60,
 146 | 		0x4C, 0x48, 0xA4, 0x9E, 0x27, 0x04, 0x7C, 0xE6,
 147 | 		0x53, 0x12, 0x53, 0xD2, 0x8B, 0xC9, 0xCD, 0x4D,
 148 | 		0x74, 0xF4, 0x96, 0x5D, 0x02, 0x37, 0xB4, 0x2D,
 149 | 		0xBC, 0xAB, 0xDA, 0xEC, 0x4C, 0xE3, 0xF0, 0x57,
 150 | 		0x12, 0x7F, 0xB9, 0xFD, 0xB7, 0x3A, 0xDE, 0x37,
 151 | 		0xEF, 0x1B, 0x84, 0x5B, 0xFE, 0x1D, 0xEB, 0xC4,
 152 | 		0x0C, 0xF9, 0xC7, 0xA7, 0xE0, 0xB6, 0xC7, 0xAB
 153 | 	};
 154 | 
 155 | 	for (u = 0; u < sizeof ref; u ++) {
 156 | 		uint8_t out[1 + (sizeof ref)];
 157 | 
 158 | 		out[u] = 0xFF;
 159 | 		blake2s_expand(out, u, seed, sizeof seed, label);
 160 | 		if (out[u] != 0xFF) {
 161 | 			fprintf(stderr, "Output buffer overflow");
 162 | 			exit(EXIT_FAILURE);
 163 | 		}
 164 | 		check_equals(out, ref, u, "KAT");
 165 | 		printf(".");
 166 | 		fflush(stdout);
 167 | 	}
 168 | 
 169 | 	printf(" done.\n");
 170 | 	fflush(stdout);
 171 | }
 172 | 
 173 | static void
 174 | test_BLAKE2b_self()
 175 | {
 176 | 	/*
 177 | 	 * This code is from RFC 7693 (appendix E).
 178 | 	 */
 179 | 
 180 | 	// Grand hash of hash results.
 181 | 	static const uint8_t blake2b_res[32] = {
 182 | 		0xC2, 0x3A, 0x78, 0x00, 0xD9, 0x81, 0x23, 0xBD,
 183 | 		0x10, 0xF5, 0x06, 0xC6, 0x1E, 0x29, 0xDA, 0x56,
 184 | 		0x03, 0xD7, 0x63, 0xB8, 0xBB, 0xAD, 0x2E, 0x73,
 185 | 		0x7F, 0x5E, 0x76, 0x5A, 0x7B, 0xCC, 0xD4, 0x75
 186 | 	};
 187 | 	// Parameter sets.
 188 | 	static const size_t b2b_md_len[4] = { 20, 32, 48, 64 };
 189 | 	static const size_t b2b_in_len[6] = { 0, 3, 128, 129, 255, 1024 };
 190 | 
 191 | 	size_t i, j, outlen, inlen;
 192 | 	uint8_t in[1024], md[64], key[64];
 193 | 	blake2b_context ctx;
 194 | 
 195 | 	printf("Test BLAKE2b selftest: ");
 196 | 	fflush(stdout);
 197 | 
 198 | 	blake2b_init(&ctx, 32);
 199 | 
 200 | 	for (i = 0; i < 4; i ++) {
 201 | 		outlen = b2b_md_len[i];
 202 | 		for (j = 0; j < 6; j++) {
 203 | 			inlen = b2b_in_len[j];
 204 | 
 205 | 			selftest_seq(in, inlen, inlen);
 206 | 			blake2b(md, outlen, NULL, 0, in, inlen);
 207 | 			blake2b_update(&ctx, md, outlen);
 208 | 
 209 | 			selftest_seq(key, outlen, outlen);
 210 | 			blake2b(md, outlen, key, outlen, in, inlen);
 211 | 			blake2b_update(&ctx, md, outlen);
 212 | 		}
 213 | 		printf(".");
 214 | 		fflush(stdout);
 215 | 	}
 216 | 
 217 | 	blake2b_final(&ctx, md);
 218 | 	check_equals(md, blake2b_res, sizeof blake2b_res, "KAT");
 219 | 
 220 | 	printf(" done.\n");
 221 | 	fflush(stdout);
 222 | }
 223 | 
 224 | static void
 225 | test_BLAKE2b_expand(void)
 226 | {
 227 | 	size_t u;
 228 | 
 229 | 	printf("Test BLAKE2b expand: ");
 230 | 	fflush(stdout);
 231 | 
 232 | 	/* Test vector generated with python3 hashlib.blake2b()
 233 | 	   implementation. */
 234 | 	static const uint8_t seed[] = {
 235 | 		0x4B, 0xFC, 0xB2, 0x19, 0x96, 0xAC, 0xE1, 0xE2,
 236 | 		0xA1, 0xD5, 0x38, 0xC5, 0x4D, 0x10, 0x99, 0xBF,
 237 | 		0x53, 0x20, 0x82, 0x62
 238 | 	};
 239 | 
 240 | 	uint64_t label = 0x4A1BE6AC1347378C;
 241 | 
 242 | 	static const uint8_t ref[] = {
 243 | 		0xF7, 0x50, 0xF1, 0x35, 0x88, 0x0B, 0x7F, 0xBD,
 244 | 		0x1E, 0x01, 0x54, 0x42, 0x21, 0x6C, 0xAC, 0xCA,
 245 | 		0x6A, 0x19, 0xF4, 0xFE, 0x76, 0xB1, 0x69, 0xF8,
 246 | 		0x2B, 0xA1, 0x99, 0x14, 0x13, 0xF5, 0xB1, 0x87,
 247 | 		0xD9, 0xF8, 0xA0, 0x49, 0x47, 0xF6, 0x94, 0x26,
 248 | 		0x4E, 0x91, 0xF0, 0x63, 0x36, 0x56, 0x56, 0x9C,
 249 | 		0x3D, 0xF2, 0xD9, 0x8D, 0x7D, 0x6D, 0x07, 0xF6,
 250 | 		0x64, 0xB1, 0x25, 0x14, 0xB0, 0x80, 0xF6, 0x08,
 251 | 		0x59, 0x70, 0xB0, 0xE2, 0x18, 0x2A, 0x0C, 0x9B,
 252 | 		0xA6, 0x51, 0xE2, 0x73, 0xE8, 0xBF, 0x0A, 0x2F,
 253 | 		0x3E, 0xD1, 0x65, 0x34, 0x95, 0x5F, 0xF1, 0x0C,
 254 | 		0xB3, 0x0A, 0x45, 0xF5, 0x90, 0x71, 0x71, 0x72,
 255 | 		0xCA, 0x5D, 0x58, 0x46, 0xF1, 0xDA, 0xC7, 0xE4,
 256 | 		0xD4, 0x5B, 0xAE, 0x92, 0xBD, 0x6B, 0x0B, 0xA6,
 257 | 		0xBF, 0xDD, 0x90, 0x24, 0x8B, 0x8B, 0xF7, 0x02,
 258 | 		0x4F, 0xDB, 0x99, 0xA8, 0x42, 0x2D, 0x58, 0x51,
 259 | 		0x55, 0xD5, 0xD4, 0xEA, 0x08, 0x94, 0x19, 0x99,
 260 | 		0x5B, 0x25, 0xEB, 0x24, 0x48, 0x56, 0xDE, 0xEA,
 261 | 		0xA7, 0x66, 0x02, 0xD8, 0x40, 0x2B, 0x3B, 0xCC,
 262 | 		0x2B, 0x98, 0xA1, 0x9B, 0xEE, 0x59, 0xD2, 0x42,
 263 | 		0x60, 0xF2, 0x80, 0x95, 0x4D, 0x3E, 0x93, 0xD9,
 264 | 		0x17, 0x2B, 0xAF, 0x11, 0xD4, 0xE1, 0x40, 0x60,
 265 | 		0x5F, 0xC9, 0x2D, 0x1D, 0xFA, 0x7F, 0x21, 0xAB,
 266 | 		0x0C, 0xA2, 0xFE, 0x90, 0xD9, 0x23, 0x65, 0x52,
 267 | 		0xA7, 0xE5, 0x33, 0xB6, 0xC3, 0xEA, 0xE4, 0xC0,
 268 | 		0x91, 0xBA, 0x1C, 0xB5, 0x4B, 0x81, 0xAC, 0xBF,
 269 | 		0xC3, 0x55, 0x82, 0xE7, 0xF9, 0x56, 0x0B, 0xD1,
 270 | 		0x9F, 0x74, 0x18, 0xEB, 0x49, 0xEE, 0x55, 0x48,
 271 | 		0xE6, 0x6F, 0xE6, 0x01, 0x69, 0x6A, 0x7C, 0x59,
 272 | 		0x8D, 0xD0, 0x45, 0x1C, 0x14, 0x28, 0x44, 0x74,
 273 | 		0x24, 0x95, 0xE0, 0xEB, 0x0A, 0x21, 0x82, 0x8D,
 274 | 		0x99, 0x35, 0xC5, 0x1C, 0x68, 0x98, 0x51, 0x3A,
 275 | 		0xF9, 0x7F, 0x09, 0xE7, 0xA8, 0xAB, 0x20, 0x80,
 276 | 		0xCD, 0x2D, 0x46, 0x25, 0xCB, 0x7A, 0xC6, 0xC5,
 277 | 		0xDC, 0xF5, 0xAC, 0x76, 0x00, 0xA0, 0xC0, 0xDA,
 278 | 		0x29, 0x41, 0x5C, 0x2A, 0x0D, 0x0A, 0xE4, 0x18,
 279 | 		0x73, 0x35, 0xD2, 0x8B, 0x46, 0xAA, 0x04, 0x8E,
 280 | 		0x32, 0xB4, 0xA3, 0x79, 0x95, 0x0A, 0x9F, 0x4C,
 281 | 		0x9F, 0x0D, 0xED, 0x67, 0xA8, 0x97, 0xEB, 0xB0,
 282 | 		0xCA, 0xD9, 0xF1, 0xBB, 0x88, 0x7F, 0x14, 0xD0,
 283 | 		0xD0, 0xCD, 0x7F, 0xEC, 0xAC, 0xDB, 0x7C, 0x81,
 284 | 		0x3F, 0x19, 0x6C, 0x56, 0x16, 0x26, 0x4A, 0xA7,
 285 | 		0xD8, 0x75, 0xC0, 0x91, 0xDA, 0x8A, 0x35, 0xDB,
 286 | 		0x75, 0x34, 0x9F, 0x60, 0x57, 0x0A, 0xFD, 0xBD,
 287 | 		0xBA, 0x43, 0x64, 0xB6, 0xF9, 0x63, 0x8C, 0x39,
 288 | 		0x0C, 0xFF, 0x07, 0x09, 0xBB, 0xD8, 0x85, 0x19,
 289 | 		0x0C, 0x2B, 0xDF, 0xF1, 0x97, 0xD7, 0xC2, 0x38,
 290 | 		0x15, 0x89, 0x7A, 0x54, 0x6E, 0x6E, 0x30, 0xFC,
 291 | 		0xA8, 0xD0, 0xCD, 0xC0, 0x82, 0x37, 0x0B, 0x6A,
 292 | 		0x21, 0x24, 0x48, 0x85, 0x9F, 0xB3, 0xEA, 0x1B,
 293 | 		0x12, 0xAF, 0x17, 0xD3, 0x20, 0x31, 0xE3, 0x35,
 294 | 		0xB8, 0x78, 0xF7, 0x7B, 0x2C, 0x07, 0xAD, 0xEF,
 295 | 		0x26, 0xEF, 0xCB, 0xC3, 0x59, 0x01, 0x9F, 0x73,
 296 | 		0x5C, 0x88, 0xB3, 0x61, 0x6D, 0x77, 0x52, 0x30,
 297 | 		0x04, 0x71, 0x28, 0xB8, 0x94, 0xF3, 0xA0, 0x30,
 298 | 		0x05, 0xCD, 0x51, 0x2F, 0x90, 0x8B, 0xF1, 0x1F,
 299 | 		0x52, 0xBC, 0x2B, 0x20, 0xD2, 0x52, 0xAE, 0x41,
 300 | 		0x70, 0x56, 0x07, 0x84, 0x90, 0xAF, 0x3B, 0xE6,
 301 | 		0xAD, 0x25, 0x11, 0x07, 0x36, 0x86, 0xFC, 0xD5,
 302 | 		0xA5, 0x4A, 0xE7, 0x09, 0xBF, 0x02, 0x10, 0x82,
 303 | 		0x52, 0xDB, 0x01, 0x77, 0x77, 0x2A, 0xAA, 0x3A,
 304 | 		0xFD, 0x0F, 0x9E, 0x6E, 0x86, 0x0B, 0x6F, 0x77,
 305 | 		0x7A, 0x5B, 0x1A, 0xD0, 0x9F, 0xFB, 0x49, 0x4B,
 306 | 		0x79, 0x8D, 0x5C, 0x59, 0x9D, 0x5A, 0x0D, 0x51
 307 | 	};
 308 | 
 309 | 	for (u = 0; u < sizeof ref; u ++) {
 310 | 		uint8_t out[1 + (sizeof ref)];
 311 | 
 312 | 		out[u] = 0xFF;
 313 | 		blake2b_expand(out, u, seed, sizeof seed, label);
 314 | 		if (out[u] != 0xFF) {
 315 | 			fprintf(stderr, "Output buffer overflow");
 316 | 			exit(EXIT_FAILURE);
 317 | 		}
 318 | 		check_equals(out, ref, u, "KAT");
 319 | 		printf(".");
 320 | 		fflush(stdout);
 321 | 	}
 322 | 
 323 | 	printf(" done.\n");
 324 | 	fflush(stdout);
 325 | }
 326 | 
 327 | /*
 328 |  * Initialize a PRNG with a given seed and extra label.
 329 |  */
 330 | static void
 331 | rand_init(prng_context *rng, const char *seed, uint64_t x)
 332 | {
 333 | 	prng_init(rng, seed, strlen(seed), x);
 334 | }
 335 | 
 336 | /*
 337 |  * Generate a random polynomial with integer coefficients (coefficients
 338 |  * are signed and selected uniformly over num_bits).
 339 |  */
 340 | static void
 341 | rand_poly_32(prng_context *rng, int32_t *f, unsigned logn, unsigned num_bits)
 342 | {
 343 | 	size_t u, n;
 344 | 
 345 | 	n = (size_t)1 << logn;
 346 | 	for (u = 0; u < n; u ++) {
 347 | 		uint32_t x;
 348 | 
 349 | 		x = (uint32_t)prng_get_u64(rng);
 350 | 		f[u] = *(int32_t *)&x >> (32 - num_bits);
 351 | 	}
 352 | }
 353 | 
 354 | static void
 355 | poly_add(int32_t *d, const int32_t *a, const int32_t *b, unsigned logn)
 356 | {
 357 | 	size_t u, n;
 358 | 
 359 | 	n = (size_t)1 << logn;
 360 | 	for (u = 0; u < n; u ++) {
 361 | 		d[u] = a[u] + b[u];
 362 | 	}
 363 | }
 364 | 
 365 | static void
 366 | poly_sub(int32_t *d, const int32_t *a, const int32_t *b, unsigned logn)
 367 | {
 368 | 	size_t u, n;
 369 | 
 370 | 	n = (size_t)1 << logn;
 371 | 	for (u = 0; u < n; u ++) {
 372 | 		d[u] = a[u] - b[u];
 373 | 	}
 374 | }
 375 | 
 376 | static void
 377 | poly_neg(int32_t *d, const int32_t *a, unsigned logn)
 378 | {
 379 | 	size_t u, n;
 380 | 
 381 | 	n = (size_t)1 << logn;
 382 | 	for (u = 0; u < n; u ++) {
 383 | 		d[u] = -a[u];
 384 | 	}
 385 | }
 386 | 
 387 | static void
 388 | poly_mul(int32_t *d, const int32_t *a, const int32_t *b, unsigned logn)
 389 | {
 390 | 	int32_t t[1024];
 391 | 	size_t u, v, n;
 392 | 
 393 | 	n = (size_t)1 << logn;
 394 | 	memset(t, 0, sizeof t);
 395 | 	for (u = 0; u < n; u ++) {
 396 | 		for (v = 0; v < n; v ++) {
 397 | 			int32_t m;
 398 | 
 399 | 			m = a[u] * b[v];
 400 | 			if ((u + v) < n) {
 401 | 				t[u + v] += m;
 402 | 			} else {
 403 | 				t[u + v - n] -= m;
 404 | 			}
 405 | 		}
 406 | 	}
 407 | 	memcpy(d, t, n * sizeof *d);
 408 | }
 409 | 
 410 | static inline void
 411 | print_fnr(fnr x)
 412 | {
 413 | 	fprintf(stderr, "%ld(%08lX)",
 414 | 		(long)(*(int64_t *)&x.v >> 32),
 415 | 		(unsigned long)(uint32_t)x.v);
 416 | }
 417 | 
 418 | static void
 419 | print_poly_i32(const char *name, const int32_t *a, unsigned logn)
 420 | {
 421 | 	size_t u, n;
 422 | 
 423 | 	n = (size_t)1 << logn;
 424 | 	fprintf(stderr, "%s =", name);
 425 | 	for (u = 0; u < n; u ++) {
 426 | 		fprintf(stderr, " %ld", (long)a[u]);
 427 | 	}
 428 | 	fprintf(stderr, "\n");
 429 | }
 430 | 
 431 | static void
 432 | print_poly_fnr(const char *name, const fnr *f, unsigned logn)
 433 | {
 434 | 	size_t u, n;
 435 | 
 436 | 	n = (size_t)1 << logn;
 437 | 	fprintf(stderr, "%s =", name);
 438 | 	for (u = 0; u < n; u ++) {
 439 | 		fprintf(stderr, " ");
 440 | 		print_fnr(f[u]);
 441 | 	}
 442 | 	fprintf(stderr, "\n");
 443 | }
 444 | 
 445 | static void
 446 | check_poly_eq_round(const char *banner,
 447 | 	const int32_t *a, const fnr *f, unsigned logn)
 448 | {
 449 | 	size_t u, n;
 450 | 
 451 | 	n = (size_t)1 << logn;
 452 | 	for (u = 0; u < n; u ++) {
 453 | 		if (fnr_round(f[u]) != a[u]) {
 454 | 			break;
 455 | 		}
 456 | 	}
 457 | 	if (u == n) {
 458 | 		return;
 459 | 	}
 460 | 	fprintf(stderr, "ERR: %s (not equal on %zu)\n", banner, u);
 461 | 	print_poly_i32("a", a, logn);
 462 | 	print_poly_fnr("f", f, logn);
 463 | 	fprintf(stderr, "a[%zu] = %ld\n", u, (long)a[u]);
 464 | 	fprintf(stderr, "f[%zu] = ", u);
 465 | 	print_fnr(f[u]);
 466 | 	fprintf(stderr, "\n");
 467 | 	exit(EXIT_FAILURE);
 468 | }
 469 | 
 470 | static void
 471 | test_FFT(void)
 472 | {
 473 | 	unsigned logn;
 474 | 
 475 | 	printf("Test FFT: ");
 476 | 	fflush(stdout);
 477 | 
 478 | 	for (logn = 1; logn <= 10; logn ++) {
 479 | 		prng_context rng;
 480 | 		int32_t a[1024], b[1024], c[1024];
 481 | 		fnr fa[1024], fb[1024], fc[1024];
 482 | 		unsigned num_bits;
 483 | 		size_t u, n;
 484 | 		int i;
 485 | 
 486 | 		rand_init(&rng, "test_FFT", logn);
 487 | 		n = (size_t)1 << logn;
 488 | 
 489 | 		/*
 490 | 		 * If source coefficients are over k+1 bits (including
 491 | 		 * sign bit), then product coefficients are at most
 492 | 		 * 1+2*k+logn bits, and FFT coefficients will fit in
 493 | 		 * 1+2*k+2*logn bits. We need this value to be at most 32.
 494 | 		 */
 495 | 		num_bits = (32 - 1 - 2 * logn) >> 1;
 496 | 
 497 | 		for (i = 0; i < 100; i ++) {
 498 | 			rand_poly_32(&rng, a, logn, 32 - logn);
 499 | 			for (u = 0; u < n; u ++) {
 500 | 				fa[u] = fnr_of(a[u]);
 501 | 			}
 502 | 			bat_FFT(fa, logn);
 503 | 			bat_iFFT(fa, logn);
 504 | 			check_poly_eq_round("FFT", a, fa, logn);
 505 | 
 506 | 			rand_poly_32(&rng, a, logn, num_bits);
 507 | 			rand_poly_32(&rng, b, logn, num_bits);
 508 | 			for (u = 0; u < n; u ++) {
 509 | 				fa[u] = fnr_of(a[u]);
 510 | 				fb[u] = fnr_of(b[u]);
 511 | 			}
 512 | 			bat_FFT(fa, logn);
 513 | 			bat_FFT(fb, logn);
 514 | 			memcpy(fc, fa, n * sizeof(fnr));
 515 | 			bat_poly_add(fc, fb, logn);
 516 | 			bat_iFFT(fc, logn);
 517 | 			poly_add(c, a, b, logn);
 518 | 			check_poly_eq_round("add", c, fc, logn);
 519 | 
 520 | 			memcpy(fc, fa, n * sizeof(fnr));
 521 | 			bat_poly_sub(fc, fb, logn);
 522 | 			bat_iFFT(fc, logn);
 523 | 			poly_sub(c, a, b, logn);
 524 | 			check_poly_eq_round("sub", c, fc, logn);
 525 | 
 526 | 			memcpy(fc, fa, n * sizeof(fnr));
 527 | 			bat_poly_neg(fc, logn);
 528 | 			bat_iFFT(fc, logn);
 529 | 			poly_neg(c, a, logn);
 530 | 			check_poly_eq_round("neg", c, fc, logn);
 531 | 
 532 | 			memcpy(fc, fa, n * sizeof(fnr));
 533 | 			bat_poly_mul_fft(fc, fb, logn);
 534 | 			bat_iFFT(fc, logn);
 535 | 			poly_mul(c, a, b, logn);
 536 | 			check_poly_eq_round("mul", c, fc, logn);
 537 | 		}
 538 | 
 539 | 		printf(".");
 540 | 		fflush(stdout);
 541 | 	}
 542 | 
 543 | 	printf(" done.\n");
 544 | 	fflush(stdout);
 545 | }
 546 | 
 547 | static void
 548 | prep_tmp(void *tmp, size_t tmp_len, int i)
 549 | {
 550 | 	memset(tmp, i & 0xFF, tmp_len);
 551 | }
 552 | 
 553 | static size_t
 554 | get_tmp_used(const void *tmp, size_t tmp_len, int i)
 555 | {
 556 | 	const uint8_t *buf;
 557 | 	size_t u;
 558 | 
 559 | 	buf = tmp;
 560 | 	i &= 0xFF;
 561 | 	for (u = tmp_len; u > 0; u --) {
 562 | 		if (buf[u - 1] != i) {
 563 | 			return u;
 564 | 		}
 565 | 	}
 566 | 	return 0;
 567 | }
 568 | 
 569 | static void
 570 | check_tmp_used(const char *name,
 571 | 	const void *tmp, size_t tmp_len, int i, size_t max_len)
 572 | {
 573 | 	size_t used_len;
 574 | 
 575 | 	used_len = get_tmp_used(tmp, tmp_len, i);
 576 | 	if (used_len > max_len) {
 577 | 		fprintf(stderr, "ERR: %s: tmp usage exceeded allowance"
 578 | 			" (%lu vs %lu bytes)\n",
 579 | 			name,
 580 | 			(unsigned long)used_len, (unsigned long)max_len);
 581 | 		exit(EXIT_FAILURE);
 582 | 	}
 583 | }
 584 | 
 585 | static void
 586 | test_kem_inner_spec(uint32_t q, unsigned logn)
 587 | {
 588 | 	int i;
 589 | 	union {
 590 | 		uint8_t b[24 * 1024 + 8];
 591 | 		uint32_t w[6 * 1024 + 2];
 592 | 		uint64_t d;
 593 | 	} tmp;
 594 | 	int8_t f[1024], g[1024], F[1024], G[1024], G2[1024], c[1024];
 595 | 	uint16_t h[1024];
 596 | 	int32_t w[1024];
 597 | 	int enc_fail;
 598 | 
 599 | 	printf("[%u-%u]", (unsigned)q, 1u << logn);
 600 | 	fflush(stdout);
 601 | 
 602 | 	enc_fail = 0;
 603 | 
 604 | 	for (i = 0; i < 100; i ++) {
 605 | 		prng_context rng;
 606 | 		uint8_t kg_seed[32];
 607 | 		int j;
 608 | 
 609 | 		rand_init(&rng, "kem_inner",
 610 | 			((uint64_t)(q << 4 | logn) << 32) | (uint64_t)i);
 611 | 
 612 | 		/*
 613 | 		 * Generate a new key pair.
 614 | 		 */
 615 | 		for (;;) {
 616 | 			int r;
 617 | 
 618 | 			prep_tmp(tmp.w, sizeof tmp.w, i);
 619 | 			prng_get_bytes(&rng, kg_seed, sizeof kg_seed);
 620 | 			r = bat_keygen_make_fg(f, g, h, q, logn,
 621 | 				kg_seed, sizeof kg_seed, tmp.w);
 622 | 			check_tmp_used("bat_keygen_make_fg",
 623 | 				tmp.w, sizeof tmp.w, i, 24u << logn);
 624 | 			if (!r) {
 625 | 				continue;
 626 | 			}
 627 | 
 628 | 			prep_tmp(tmp.w, sizeof tmp.w, i);
 629 | 			r = bat_keygen_solve_FG(F, G, f, g, q, logn, tmp.w);
 630 | 			check_tmp_used("bat_keygen_solve_FG",
 631 | 				tmp.w, sizeof tmp.w, i, 24u << logn);
 632 | 			if (!r) {
 633 | 				continue;
 634 | 			}
 635 | 
 636 | 			prep_tmp(tmp.w, sizeof tmp.w, i);
 637 | 			r = bat_keygen_compute_w(w, f, g, F, G, q, logn, tmp.w);
 638 | 			check_tmp_used("bat_keygen_compute_w",
 639 | 				tmp.w, sizeof tmp.w, i, 24u << logn);
 640 | 			if (!r) {
 641 | 				continue;
 642 | 			}
 643 | 
 644 | 			break;
 645 | 		}
 646 | 
 647 | 		/*
 648 | 		 * Verify the key pair is behaving properly.
 649 | 		 */
 650 | 		prep_tmp(tmp.w, sizeof tmp.w, i);
 651 | 		if (!bat_keygen_verify_FG(f, g, F, G, q, logn, tmp.w)) {
 652 | 			fprintf(stderr, "bat_keygen_verify_FG() failed\n");
 653 | 			exit(EXIT_FAILURE);
 654 | 		}
 655 | 		check_tmp_used("bat_keygen_verify_FG",
 656 | 			tmp.w, sizeof tmp.w, i, 16u << logn);
 657 | 
 658 | 		prep_tmp(tmp.w, sizeof tmp.w, i);
 659 | 		if (!bat_keygen_rebuild_G(G2, f, g, F, q, logn, tmp.w)) {
 660 | 			fprintf(stderr, "bat_keygen_rebuild_G() failed\n");
 661 | 			exit(EXIT_FAILURE);
 662 | 		}
 663 | 		check_tmp_used("bat_keygen_rebuild_G",
 664 | 			tmp.w, sizeof tmp.w, i, 4u << logn);
 665 | 		check_equals(G, G2, 1u << logn, "rebuild G");
 666 | 
 667 | 		/*
 668 | 		 * Do some encapsulation / decapsulation.
 669 | 		 */
 670 | 		for (j = 0; j < 100; j ++) {
 671 | 			uint8_t sbuf[128], sbuf2[128];
 672 | 			int r;
 673 | 
 674 | 			rand_init(&rng, "kem_inner_encaps",
 675 | 				((uint64_t)(q << 4 | logn) << 32)
 676 | 				| (uint64_t)i | ((uint64_t)j << 16));
 677 | 
 678 | 			for (;;) {
 679 | 				prng_get_bytes(&rng, sbuf, SBUF_LEN(logn));
 680 | 				if (logn < 3) {
 681 | 					sbuf[0] &= (1u << (1u << logn)) - 1u;
 682 | 				}
 683 | 
 684 | 				prep_tmp(tmp.w, sizeof tmp.w, i);
 685 | 				switch (q) {
 686 | 				case 128:
 687 | 					r = bat_encrypt_128(
 688 | 						c, sbuf, (const uint8_t *)h,
 689 | 						logn, tmp.w);
 690 | 					break;
 691 | 				case 257:
 692 | 					r = bat_encrypt_257(
 693 | 						c, sbuf, h, logn, tmp.w);
 694 | 					break;
 695 | 				case 769:
 696 | 					r = bat_encrypt_769(
 697 | 						c, sbuf, h, logn, tmp.w);
 698 | 					break;
 699 | 				default:
 700 | 					fprintf(stderr,
 701 | 						"Unknown q: %u\n", (unsigned)q);
 702 | 					exit(EXIT_FAILURE);
 703 | 				}
 704 | 				check_tmp_used("bat_encrypt",
 705 | 					tmp.w, sizeof tmp.w, i,
 706 | 					(q == 128 ? 3u : 4u) << logn);
 707 | 				if (!r) {
 708 | 					/*
 709 | 					 * This may happen with q = 769, but
 710 | 					 * not with q = 128 or q = 257.
 711 | 					 */
 712 | 					if (q == 769) {
 713 | 						enc_fail ++;
 714 | 						continue;
 715 | 					}
 716 | 					fprintf(stderr,
 717 | 						"bat_encrypt() failed\n");
 718 | 					exit(EXIT_FAILURE);
 719 | 				}
 720 | 				break;
 721 | 			}
 722 | 
 723 | 			prep_tmp(tmp.w, sizeof tmp.w, i);
 724 | 			switch (q) {
 725 | 			case 128:
 726 | 				bat_decrypt_128(sbuf2,
 727 | 					c, f, g, F, G, w, logn, tmp.w);
 728 | 				break;
 729 | 			case 257:
 730 | 				bat_decrypt_257(sbuf2,
 731 | 					c, f, g, F, G, w, logn, tmp.w);
 732 | 				break;
 733 | 			case 769:
 734 | 				bat_decrypt_769(sbuf2,
 735 | 					c, f, g, F, G, w, logn, tmp.w);
 736 | 				break;
 737 | 			default:
 738 | 				fprintf(stderr, "Unknown q: %u\n", (unsigned)q);
 739 | 				exit(EXIT_FAILURE);
 740 | 			}
 741 | 			check_tmp_used("bat_decrypt",
 742 | 				tmp.w, sizeof tmp.w, i, 8u << logn);
 743 | 
 744 | 			check_equals(sbuf, sbuf2, SBUF_LEN(logn),
 745 | 				"KEM enc/dec");
 746 | 		}
 747 | 
 748 | 		printf(".");
 749 | 		fflush(stdout);
 750 | 	}
 751 | 
 752 | 	printf("(%d)", enc_fail);
 753 | 	fflush(stdout);
 754 | }
 755 | 
 756 | static void
 757 | test_kem_inner(void)
 758 | {
 759 | 	unsigned logn;
 760 | 
 761 | 	printf("Test KEM (inner):\n   ");
 762 | 	fflush(stdout);
 763 | 	for (logn = 1; logn <= 8; logn ++) {
 764 | 		test_kem_inner_spec(128, logn);
 765 | 	}
 766 | 	printf("\n   ");
 767 | 	fflush(stdout);
 768 | 	for (logn = 1; logn <= 9; logn ++) {
 769 | 		test_kem_inner_spec(257, logn);
 770 | 	}
 771 | 	printf("\n   ");
 772 | 	fflush(stdout);
 773 | 	for (logn = 1; logn <= 10; logn ++) {
 774 | 		test_kem_inner_spec(769, logn);
 775 | 	}
 776 | 	printf("\n");
 777 | }
 778 | 
 779 | #define CC(x)   do { \
 780 | 		int cc_err = (x); \
 781 | 		if (cc_err != 0) { \
 782 | 			fprintf(stderr, "%s failed with error %d\n", \
 783 | 				#x, cc_err); \
 784 | 			exit(EXIT_FAILURE); \
 785 | 		} \
 786 | 	} while (0)
 787 | 
 788 | static void
 789 | test_kem_128_256(void)
 790 | {
 791 | 	int i;
 792 | 	bat_128_256_private_key sk, sk2;
 793 | 	bat_128_256_public_key pk, pk2;
 794 | 	bat_128_256_ciphertext ct, ct2;
 795 | 	uint8_t tmp[BAT_128_256_TMP_KEYGEN], buf[33 + 8 * 256];
 796 | 	size_t len, len2;
 797 | 
 798 | 	printf("Test KEM-128-256: ");
 799 | 	fflush(stdout);
 800 | 
 801 | 	for (i = 0; i < 100; i ++) {
 802 | 		int j;
 803 | 
 804 | 		CC(bat_128_256_keygen(&sk, tmp, sizeof tmp));
 805 | 
 806 | 		len = bat_128_256_encode_private_key(NULL, 0, &sk, 0);
 807 | 		if (len > sizeof buf) {
 808 | 			fprintf(stderr, "oversized private key encoding\n");
 809 | 			exit(EXIT_FAILURE);
 810 | 		}
 811 | 		len2 = bat_128_256_encode_private_key(buf, sizeof buf, &sk, 0);
 812 | 		if (len2 != len) {
 813 | 			fprintf(stderr, "private key encoding size mismatch\n");
 814 | 			exit(EXIT_FAILURE);
 815 | 		}
 816 | 
 817 | 		memset(&sk2, 0, sizeof sk2);
 818 | 		len2 = bat_128_256_decode_private_key(
 819 | 			&sk2, buf, sizeof buf, NULL, 0);
 820 | 		if (len2 != len) {
 821 | 			fprintf(stderr, "private key decoding size mismatch"
 822 | 				"(%zu / %zu)\n", len, len2);
 823 | 			exit(EXIT_FAILURE);
 824 | 		}
 825 | 		check_equals(sk.seed, sk2.seed, sizeof sk.seed, "sk seed");
 826 | 		check_equals(sk.f, sk2.f, sizeof sk.f, "sk f");
 827 | 		check_equals(sk.g, sk2.g, sizeof sk.g, "sk g");
 828 | 		check_equals(sk.F, sk2.F, sizeof sk.F, "sk F");
 829 | 		check_equals(sk.G, sk2.G, sizeof sk.G, "sk G");
 830 | 		check_equals(sk.w, sk2.w, sizeof sk.w, "sk w");
 831 | 		check_equals(sk.h, sk2.h, sizeof sk.h, "sk h");
 832 | 
 833 | 		len = bat_128_256_encode_private_key(NULL, 0, &sk, 1);
 834 | 		if (len > sizeof buf) {
 835 | 			fprintf(stderr, "oversized private key encoding"
 836 | 				" (short form)\n");
 837 | 			exit(EXIT_FAILURE);
 838 | 		}
 839 | 		len2 = bat_128_256_encode_private_key(buf, sizeof buf, &sk, 1);
 840 | 		if (len2 != len) {
 841 | 			fprintf(stderr, "private key encoding size mismatch"
 842 | 				" (short form) (%zu vs %zu)\n", len, len2);
 843 | 			exit(EXIT_FAILURE);
 844 | 		}
 845 | 
 846 | 		memset(&sk2, 0, sizeof sk2);
 847 | 		len2 = bat_128_256_decode_private_key(
 848 | 			&sk2, buf, sizeof buf, tmp, sizeof tmp);
 849 | 		if (len2 != len) {
 850 | 			fprintf(stderr, "private key decoding size mismatch"
 851 | 				" (short form)\n");
 852 | 			exit(EXIT_FAILURE);
 853 | 		}
 854 | 		check_equals(sk.seed, sk2.seed, sizeof sk.seed, "sk seed");
 855 | 		check_equals(sk.f, sk2.f, sizeof sk.f, "sk f");
 856 | 		check_equals(sk.g, sk2.g, sizeof sk.g, "sk g");
 857 | 		check_equals(sk.F, sk2.F, sizeof sk.F, "sk F");
 858 | 		check_equals(sk.G, sk2.G, sizeof sk.G, "sk G");
 859 | 		check_equals(sk.w, sk2.w, sizeof sk.w, "sk w");
 860 | 		check_equals(sk.h, sk2.h, sizeof sk.h, "sk h");
 861 | 
 862 | 		bat_128_256_get_public_key(&pk, &sk);
 863 | 
 864 | 		len = bat_128_256_encode_public_key(NULL, 0, &pk);
 865 | 		if (len > sizeof buf) {
 866 | 			fprintf(stderr, "oversized public key encoding\n");
 867 | 			exit(EXIT_FAILURE);
 868 | 		}
 869 | 		len2 = bat_128_256_encode_public_key(buf, sizeof buf, &pk);
 870 | 		if (len2 != len) {
 871 | 			fprintf(stderr, "public key encoding size mismatch"
 872 | 				" (%zu vs %zu)\n", len, len2);
 873 | 			exit(EXIT_FAILURE);
 874 | 		}
 875 | 
 876 | 		memset(&pk2, 0, sizeof pk2);
 877 | 		len2 = bat_128_256_decode_public_key(&pk2, buf, sizeof buf);
 878 | 		if (len2 != len) {
 879 | 			fprintf(stderr, "public key decoding size mismatch"
 880 | 				" (%zu vs %zu)\n", len, len2);
 881 | 			exit(EXIT_FAILURE);
 882 | 		}
 883 | 		check_equals(pk.h, pk2.h, sizeof pk.h, "pk h");
 884 | 
 885 | 		for (j = 0; j < 100; j ++) {
 886 | 			uint8_t secret[48], secret2[48];
 887 | 
 888 | 			CC(bat_128_256_encapsulate(secret, sizeof secret,
 889 | 				&ct, &pk, tmp, sizeof tmp));
 890 | 
 891 | 			len = bat_128_256_encode_ciphertext(NULL, 0, &ct);
 892 | 			if (len > sizeof buf) {
 893 | 				fprintf(stderr,
 894 | 					"oversized ciphertext encoding\n");
 895 | 				exit(EXIT_FAILURE);
 896 | 			}
 897 | 			len2 = bat_128_256_encode_ciphertext(
 898 | 				buf, sizeof buf, &ct);
 899 | 			if (len2 != len) {
 900 | 				fprintf(stderr,
 901 | 					"ciphertext encoding size mismatch"
 902 | 					" (%zu vs %zu)\n", len, len2);
 903 | 				exit(EXIT_FAILURE);
 904 | 			}
 905 | 
 906 | 			memset(&ct2, 0, sizeof ct2);
 907 | 			len2 = bat_128_256_decode_ciphertext(
 908 | 				&ct2, buf, sizeof buf);
 909 | 			if (len2 != len) {
 910 | 				fprintf(stderr,
 911 | 					"ciphertext decoding size mismatch"
 912 | 					" (%zu vs %zu)\n", len, len2);
 913 | 				exit(EXIT_FAILURE);
 914 | 			}
 915 | 			check_equals(ct.c, ct2.c, sizeof ct.c, "ct c");
 916 | 
 917 | 			CC(bat_128_256_decapsulate(secret2, sizeof secret2,
 918 | 				&ct, &sk, tmp, sizeof tmp));
 919 | 			check_equals(secret, secret2, sizeof secret, "secret");
 920 | 		}
 921 | 
 922 | 		printf(".");
 923 | 		fflush(stdout);
 924 | 	}
 925 | 
 926 | 	printf(" done.\n");
 927 | 	fflush(stdout);
 928 | }
 929 | 
 930 | static void
 931 | test_kem_257_512(void)
 932 | {
 933 | 	int i;
 934 | 	bat_257_512_private_key sk, sk2;
 935 | 	bat_257_512_public_key pk, pk2;
 936 | 	bat_257_512_ciphertext ct, ct2;
 937 | 	uint8_t tmp[BAT_257_512_TMP_KEYGEN], buf[33 + 8 * 512];
 938 | 	size_t len, len2;
 939 | 
 940 | 	printf("Test KEM-257-512: ");
 941 | 	fflush(stdout);
 942 | 
 943 | 	for (i = 0; i < 100; i ++) {
 944 | 		int j;
 945 | 
 946 | 		CC(bat_257_512_keygen(&sk, tmp, sizeof tmp));
 947 | 
 948 | 		len = bat_257_512_encode_private_key(NULL, 0, &sk, 0);
 949 | 		if (len > sizeof buf) {
 950 | 			fprintf(stderr, "oversized private key encoding\n");
 951 | 			exit(EXIT_FAILURE);
 952 | 		}
 953 | 		len2 = bat_257_512_encode_private_key(buf, sizeof buf, &sk, 0);
 954 | 		if (len2 != len) {
 955 | 			fprintf(stderr, "private key encoding size mismatch\n");
 956 | 			exit(EXIT_FAILURE);
 957 | 		}
 958 | 
 959 | 		memset(&sk2, 0, sizeof sk2);
 960 | 		len2 = bat_257_512_decode_private_key(
 961 | 			&sk2, buf, sizeof buf, NULL, 0);
 962 | 		if (len2 != len) {
 963 | 			fprintf(stderr, "private key decoding size mismatch"
 964 | 				"(%zu / %zu)\n", len, len2);
 965 | 			exit(EXIT_FAILURE);
 966 | 		}
 967 | 		check_equals(sk.seed, sk2.seed, sizeof sk.seed, "sk seed");
 968 | 		check_equals(sk.f, sk2.f, sizeof sk.f, "sk f");
 969 | 		check_equals(sk.g, sk2.g, sizeof sk.g, "sk g");
 970 | 		check_equals(sk.F, sk2.F, sizeof sk.F, "sk F");
 971 | 		check_equals(sk.G, sk2.G, sizeof sk.G, "sk G");
 972 | 		check_equals(sk.w, sk2.w, sizeof sk.w, "sk w");
 973 | 		check_equals(sk.h, sk2.h, sizeof sk.h, "sk h");
 974 | 
 975 | 		len = bat_257_512_encode_private_key(NULL, 0, &sk, 1);
 976 | 		if (len > sizeof buf) {
 977 | 			fprintf(stderr, "oversized private key encoding"
 978 | 				" (short form)\n");
 979 | 			exit(EXIT_FAILURE);
 980 | 		}
 981 | 		len2 = bat_257_512_encode_private_key(buf, sizeof buf, &sk, 1);
 982 | 		if (len2 != len) {
 983 | 			fprintf(stderr, "private key encoding size mismatch"
 984 | 				" (short form) (%zu vs %zu)\n", len, len2);
 985 | 			exit(EXIT_FAILURE);
 986 | 		}
 987 | 
 988 | 		memset(&sk2, 0, sizeof sk2);
 989 | 		len2 = bat_257_512_decode_private_key(
 990 | 			&sk2, buf, sizeof buf, tmp, sizeof tmp);
 991 | 		if (len2 != len) {
 992 | 			fprintf(stderr, "private key decoding size mismatch"
 993 | 				" (short form)\n");
 994 | 			exit(EXIT_FAILURE);
 995 | 		}
 996 | 		check_equals(sk.seed, sk2.seed, sizeof sk.seed, "sk seed");
 997 | 		check_equals(sk.f, sk2.f, sizeof sk.f, "sk f");
 998 | 		check_equals(sk.g, sk2.g, sizeof sk.g, "sk g");
 999 | 		check_equals(sk.F, sk2.F, sizeof sk.F, "sk F");
1000 | 		check_equals(sk.G, sk2.G, sizeof sk.G, "sk G");
1001 | 		check_equals(sk.w, sk2.w, sizeof sk.w, "sk w");
1002 | 		check_equals(sk.h, sk2.h, sizeof sk.h, "sk h");
1003 | 
1004 | 		bat_257_512_get_public_key(&pk, &sk);
1005 | 
1006 | 		len = bat_257_512_encode_public_key(NULL, 0, &pk);
1007 | 		if (len > sizeof buf) {
1008 | 			fprintf(stderr, "oversized public key encoding\n");
1009 | 			exit(EXIT_FAILURE);
1010 | 		}
1011 | 		len2 = bat_257_512_encode_public_key(buf, sizeof buf, &pk);
1012 | 		if (len2 != len) {
1013 | 			fprintf(stderr, "public key encoding size mismatch"
1014 | 				" (%zu vs %zu)\n", len, len2);
1015 | 			exit(EXIT_FAILURE);
1016 | 		}
1017 | 
1018 | 		memset(&pk2, 0, sizeof pk2);
1019 | 		len2 = bat_257_512_decode_public_key(&pk2, buf, sizeof buf);
1020 | 		if (len2 != len) {
1021 | 			fprintf(stderr, "public key decoding size mismatch"
1022 | 				" (%zu vs %zu)\n", len, len2);
1023 | 			exit(EXIT_FAILURE);
1024 | 		}
1025 | 		check_equals(pk.h, pk2.h, sizeof pk.h, "pk h");
1026 | 
1027 | 		for (j = 0; j < 100; j ++) {
1028 | 			uint8_t secret[48], secret2[48];
1029 | 
1030 | 			CC(bat_257_512_encapsulate(secret, sizeof secret,
1031 | 				&ct, &pk, tmp, sizeof tmp));
1032 | 
1033 | 			len = bat_257_512_encode_ciphertext(NULL, 0, &ct);
1034 | 			if (len > sizeof buf) {
1035 | 				fprintf(stderr,
1036 | 					"oversized ciphertext encoding\n");
1037 | 				exit(EXIT_FAILURE);
1038 | 			}
1039 | 			len2 = bat_257_512_encode_ciphertext(
1040 | 				buf, sizeof buf, &ct);
1041 | 			if (len2 != len) {
1042 | 				fprintf(stderr,
1043 | 					"ciphertext encoding size mismatch"
1044 | 					" (%zu vs %zu)\n", len, len2);
1045 | 				exit(EXIT_FAILURE);
1046 | 			}
1047 | 
1048 | 			memset(&ct2, 0, sizeof ct2);
1049 | 			len2 = bat_257_512_decode_ciphertext(
1050 | 				&ct2, buf, sizeof buf);
1051 | 			if (len2 != len) {
1052 | 				fprintf(stderr,
1053 | 					"ciphertext decoding size mismatch"
1054 | 					" (%zu vs %zu)\n", len, len2);
1055 | 				exit(EXIT_FAILURE);
1056 | 			}
1057 | 			check_equals(ct.c, ct2.c, sizeof ct.c, "ct c");
1058 | 
1059 | 			CC(bat_257_512_decapsulate(secret2, sizeof secret2,
1060 | 				&ct, &sk, tmp, sizeof tmp));
1061 | 			check_equals(secret, secret2, sizeof secret, "secret");
1062 | 		}
1063 | 
1064 | 		printf(".");
1065 | 		fflush(stdout);
1066 | 	}
1067 | 
1068 | 	printf(" done.\n");
1069 | 	fflush(stdout);
1070 | }
1071 | 
1072 | static void
1073 | test_kem_769_1024(void)
1074 | {
1075 | 	int i;
1076 | 	bat_769_1024_private_key sk, sk2;
1077 | 	bat_769_1024_public_key pk, pk2;
1078 | 	bat_769_1024_ciphertext ct, ct2;
1079 | 	uint8_t tmp[BAT_769_1024_TMP_KEYGEN], buf[33 + 8 * 1024];
1080 | 	size_t len, len2;
1081 | 
1082 | 	printf("Test KEM-769-1024: ");
1083 | 	fflush(stdout);
1084 | 
1085 | 	for (i = 0; i < 100; i ++) {
1086 | 		int j;
1087 | 
1088 | 		CC(bat_769_1024_keygen(&sk, tmp, sizeof tmp));
1089 | 
1090 | 		len = bat_769_1024_encode_private_key(NULL, 0, &sk, 0);
1091 | 		if (len > sizeof buf) {
1092 | 			fprintf(stderr, "oversized private key encoding\n");
1093 | 			exit(EXIT_FAILURE);
1094 | 		}
1095 | 		len2 = bat_769_1024_encode_private_key(buf, sizeof buf, &sk, 0);
1096 | 		if (len2 != len) {
1097 | 			fprintf(stderr, "private key encoding size mismatch\n");
1098 | 			exit(EXIT_FAILURE);
1099 | 		}
1100 | 
1101 | 		memset(&sk2, 0, sizeof sk2);
1102 | 		len2 = bat_769_1024_decode_private_key(
1103 | 			&sk2, buf, sizeof buf, NULL, 0);
1104 | 		if (len2 != len) {
1105 | 			fprintf(stderr, "private key decoding size mismatch"
1106 | 				"(%zu / %zu)\n", len, len2);
1107 | 			exit(EXIT_FAILURE);
1108 | 		}
1109 | 		check_equals(sk.seed, sk2.seed, sizeof sk.seed, "sk seed");
1110 | 		check_equals(sk.f, sk2.f, sizeof sk.f, "sk f");
1111 | 		check_equals(sk.g, sk2.g, sizeof sk.g, "sk g");
1112 | 		check_equals(sk.F, sk2.F, sizeof sk.F, "sk F");
1113 | 		check_equals(sk.G, sk2.G, sizeof sk.G, "sk G");
1114 | 		check_equals(sk.w, sk2.w, sizeof sk.w, "sk w");
1115 | 		check_equals(sk.h, sk2.h, sizeof sk.h, "sk h");
1116 | 
1117 | 		len = bat_769_1024_encode_private_key(NULL, 0, &sk, 1);
1118 | 		if (len > sizeof buf) {
1119 | 			fprintf(stderr, "oversized private key encoding"
1120 | 				" (short form)\n");
1121 | 			exit(EXIT_FAILURE);
1122 | 		}
1123 | 		len2 = bat_769_1024_encode_private_key(buf, sizeof buf, &sk, 1);
1124 | 		if (len2 != len) {
1125 | 			fprintf(stderr, "private key encoding size mismatch"
1126 | 				" (short form) (%zu vs %zu)\n", len, len2);
1127 | 			exit(EXIT_FAILURE);
1128 | 		}
1129 | 
1130 | 		memset(&sk2, 0, sizeof sk2);
1131 | 		len2 = bat_769_1024_decode_private_key(
1132 | 			&sk2, buf, sizeof buf, tmp, sizeof tmp);
1133 | 		if (len2 != len) {
1134 | 			fprintf(stderr, "private key decoding size mismatch"
1135 | 				" (short form)\n");
1136 | 			exit(EXIT_FAILURE);
1137 | 		}
1138 | 		check_equals(sk.seed, sk2.seed, sizeof sk.seed, "sk seed");
1139 | 		check_equals(sk.f, sk2.f, sizeof sk.f, "sk f");
1140 | 		check_equals(sk.g, sk2.g, sizeof sk.g, "sk g");
1141 | 		check_equals(sk.F, sk2.F, sizeof sk.F, "sk F");
1142 | 		check_equals(sk.G, sk2.G, sizeof sk.G, "sk G");
1143 | 		check_equals(sk.w, sk2.w, sizeof sk.w, "sk w");
1144 | 		check_equals(sk.h, sk2.h, sizeof sk.h, "sk h");
1145 | 
1146 | 		bat_769_1024_get_public_key(&pk, &sk);
1147 | 
1148 | 		len = bat_769_1024_encode_public_key(NULL, 0, &pk);
1149 | 		if (len > sizeof buf) {
1150 | 			fprintf(stderr, "oversized public key encoding\n");
1151 | 			exit(EXIT_FAILURE);
1152 | 		}
1153 | 		len2 = bat_769_1024_encode_public_key(buf, sizeof buf, &pk);
1154 | 		if (len2 != len) {
1155 | 			fprintf(stderr, "public key encoding size mismatch"
1156 | 				" (%zu vs %zu)\n", len, len2);
1157 | 			exit(EXIT_FAILURE);
1158 | 		}
1159 | 
1160 | 		memset(&pk2, 0, sizeof pk2);
1161 | 		len2 = bat_769_1024_decode_public_key(&pk2, buf, sizeof buf);
1162 | 		if (len2 != len) {
1163 | 			fprintf(stderr, "public key decoding size mismatch"
1164 | 				" (%zu vs %zu)\n", len, len2);
1165 | 			exit(EXIT_FAILURE);
1166 | 		}
1167 | 		check_equals(pk.h, pk2.h, sizeof pk.h, "pk h");
1168 | 
1169 | 		for (j = 0; j < 100; j ++) {
1170 | 			uint8_t secret[48], secret2[48];
1171 | 
1172 | 			CC(bat_769_1024_encapsulate(secret, sizeof secret,
1173 | 				&ct, &pk, tmp, sizeof tmp));
1174 | 
1175 | 			len = bat_769_1024_encode_ciphertext(NULL, 0, &ct);
1176 | 			if (len > sizeof buf) {
1177 | 				fprintf(stderr,
1178 | 					"oversized ciphertext encoding\n");
1179 | 				exit(EXIT_FAILURE);
1180 | 			}
1181 | 			len2 = bat_769_1024_encode_ciphertext(
1182 | 				buf, sizeof buf, &ct);
1183 | 			if (len2 != len) {
1184 | 				fprintf(stderr,
1185 | 					"ciphertext encoding size mismatch"
1186 | 					" (%zu vs %zu)\n", len, len2);
1187 | 				exit(EXIT_FAILURE);
1188 | 			}
1189 | 
1190 | 			memset(&ct2, 0, sizeof ct2);
1191 | 			len2 = bat_769_1024_decode_ciphertext(
1192 | 				&ct2, buf, sizeof buf);
1193 | 			if (len2 != len) {
1194 | 				fprintf(stderr,
1195 | 					"ciphertext decoding size mismatch"
1196 | 					" (%zu vs %zu)\n", len, len2);
1197 | 				exit(EXIT_FAILURE);
1198 | 			}
1199 | 			check_equals(ct.c, ct2.c, sizeof ct.c, "ct c");
1200 | 
1201 | 			CC(bat_769_1024_decapsulate(secret2, sizeof secret2,
1202 | 				&ct, &sk, tmp, sizeof tmp));
1203 | 			check_equals(secret, secret2, sizeof secret, "secret");
1204 | 		}
1205 | 
1206 | 		printf(".");
1207 | 		fflush(stdout);
1208 | 	}
1209 | 
1210 | 	printf(" done.\n");
1211 | 	fflush(stdout);
1212 | }
1213 | 
#if 0
/*
 * Debugging aid (compiled out by default): generate key pairs and dump
 * them on stdout in text format, for analysis with external tools. One
 * line is written per key pair; it is a list of four lists holding the
 * coefficients of f, g, Fd and Gd, in that order.
 */

/* defined in keygen.c */
void bat_make_Fd(int32_t *Fd, const int8_t *f, const int8_t *F,
	const int32_t *w, unsigned qp, unsigned logn, uint32_t *tmp);

/*
 * Write the n values c[0..n-1] on stdout, in decimal, separated
 * with ", " (8-bit coefficients).
 */
static void
print_coeffs_i8(const int8_t *c, size_t n)
{
	size_t k;

	for (k = 0; k < n; k ++) {
		if (k > 0) {
			printf(", ");
		}
		printf("%d", c[k]);
	}
}

/*
 * Write the n values c[0..n-1] on stdout, in decimal, separated
 * with ", " (32-bit coefficients).
 */
static void
print_coeffs_i32(const int32_t *c, size_t n)
{
	size_t k;

	for (k = 0; k < n; k ++) {
		if (k > 0) {
			printf(", ");
		}
		printf("%d", c[k]);
	}
}

/*
 * Generate 'num' key pairs for modulus q and degree 2^logn, and print
 * each of them (f, g, Fd, Gd) as one text line on stdout. A dot is
 * written on stderr for each generated key pair, and a final newline
 * once all key pairs have been produced.
 *
 * The local arrays hold 1024 coefficients each, so logn must not
 * exceed 10.
 */
static void
make_keys(uint32_t q, unsigned logn, int num)
{
	/*
	 * Scratch area handed to the key generation functions (through
	 * its 32-bit view).
	 */
	union {
		uint8_t b[24 * 1024 + 8];
		uint32_t w[6 * 1024 + 2];
		uint64_t d;
	} tmp;
	int8_t f[1024], g[1024], F[1024], G[1024];
	uint16_t h[1024];
	int32_t w[1024], Fd[1024], Gd[1024];
	size_t n;
	int idx;

	n = 1u << logn;
	for (idx = 0; idx < num; idx ++) {
		prng_context rng;
		uint8_t kg_seed[32];
		uint64_t stream_id;

		/*
		 * Each key pair uses its own PRNG instance, whose
		 * initialization value combines q, logn and the key index.
		 */
		stream_id = (uint64_t)(q << 4 | logn) << 32;
		stream_id |= (uint64_t)idx;
		rand_init(&rng, "make_keys", stream_id);

		/*
		 * Generate a new key pair: draw fresh seeds until all
		 * three key generation steps report success. The steps
		 * are evaluated in order, and a failing step skips the
		 * following ones.
		 */
		do {
			prng_get_bytes(&rng, kg_seed, sizeof kg_seed);
		} while (!bat_keygen_make_fg(f, g, h, q, logn,
				kg_seed, sizeof kg_seed, tmp.w)
			|| !bat_keygen_solve_FG(F, G, f, g, q, logn, tmp.w)
			|| !bat_keygen_compute_w(w, f, g, F, G,
				q, logn, tmp.w));

		bat_make_Fd(Fd, f, F, w, 64513, logn, tmp.w);
		bat_make_Fd(Gd, g, G, w, 64513, logn, tmp.w);

		fprintf(stderr, ".");
		fflush(stderr);

		printf("[[");
		print_coeffs_i8(f, n);
		printf("], [");
		print_coeffs_i8(g, n);
		printf("], [");
		print_coeffs_i32(Fd, n);
		printf("], [");
		print_coeffs_i32(Gd, n);
		printf("]]\n");
		fflush(stdout);
	}

	fprintf(stderr, "\n");
	fflush(stderr);
}
#endif
1312 | 
#if 0
/*
 * Sample code to generate key pairs and export them in a custom binary
 * format for external analysis. Polynomials f, g, F and G use one byte
 * per coefficient; w uses 4 bytes per coefficient (little-endian). Keys
 * are written one after the other in the specified file.
 */

/*
 * Write exactly len bytes from data into kf. A short write is reported
 * with perror() and terminates the process, so that an I/O error (e.g.
 * full disk) cannot silently yield a truncated key file.
 */
static void
write_or_die(FILE *kf, const void *data, size_t len)
{
	if (fwrite(data, 1, len, kf) != len) {
		perror("fwrite");
		exit(EXIT_FAILURE);
	}
}

/*
 * Generate 'num' key pairs for modulus q and degree 2^logn, and write
 * them into the file 'fname' (created or truncated). For each key pair,
 * f, g, F and G are written first (2^logn bytes each), followed by w
 * (4 bytes per coefficient, encoded with enc32le()). A dot is printed
 * on stderr every 100 key pairs, and a newline at the end.
 *
 * Any failure to open, write or close the file is reported on stderr
 * and terminates the process with EXIT_FAILURE.
 *
 * The local arrays hold 1024 coefficients each, so logn must not
 * exceed 10.
 */
static void
make_keys_bin(const char *fname, uint32_t q, unsigned logn, int num)
{
	int i;
	/*
	 * Scratch area handed to the key generation functions (through
	 * its 32-bit view).
	 */
	union {
		uint8_t b[24 * 1024 + 8];
		uint32_t w[6 * 1024 + 2];
		uint64_t d;
	} tmp;
	int8_t f[1024], g[1024], F[1024], G[1024];
	uint16_t h[1024];
	int32_t w[1024];
	FILE *kf;

	kf = fopen(fname, "wb");
	if (kf == NULL) {
		perror("fopen");
		exit(EXIT_FAILURE);
	}
	for (i = 0; i < num; i ++) {
		prng_context rng;
		uint8_t kg_seed[32];
		size_t u, n;

		/*
		 * Each key pair uses its own PRNG instance, whose
		 * initialization value combines q, logn and the key index.
		 */
		rand_init(&rng, "make_keys_bin",
			((uint64_t)(q << 4 | logn) << 32) | (uint64_t)i);

		/*
		 * Generate a new key pair. If any step fails, start again
		 * with a fresh seed.
		 */
		for (;;) {
			int r;

			prng_get_bytes(&rng, kg_seed, sizeof kg_seed);
			r = bat_keygen_make_fg(f, g, h, q, logn,
				kg_seed, sizeof kg_seed, tmp.w);
			if (!r) {
				continue;
			}

			r = bat_keygen_solve_FG(F, G, f, g, q, logn, tmp.w);
			if (!r) {
				continue;
			}

			r = bat_keygen_compute_w(w, f, g, F, G, q, logn, tmp.w);
			if (!r) {
				continue;
			}

			break;
		}

		/*
		 * Export the key pair; every write is checked.
		 */
		n = 1u << logn;
		write_or_die(kf, f, n);
		write_or_die(kf, g, n);
		write_or_die(kf, F, n);
		write_or_die(kf, G, n);
		for (u = 0; u < n; u ++) {
			uint8_t tbuf[4];

			enc32le(tbuf, w[u]);
			write_or_die(kf, tbuf, sizeof tbuf);
		}

		if ((i + 1) % 100 == 0) {
			fprintf(stderr, ".");
			fflush(stderr);
		}
	}
	fprintf(stderr, "\n");
	fflush(stderr);

	/*
	 * Buffered data is flushed by fclose(), so a write error may
	 * show up only at this point.
	 */
	if (fclose(kf) != 0) {
		perror("fclose");
		exit(EXIT_FAILURE);
	}
}
#endif
1397 | 
/*
 * Test program entry point: run the hash function tests, then the FFT
 * and KEM tests, and finally print a table with the sizes reported by
 * the encoding functions of each parameter set. Test failures are
 * handled inside the test functions themselves; this function always
 * returns 0.
 */
int
main(void)
{
	size_t pub_len, ct_len, priv_short_len, priv_long_len;

	/* BLAKE2 tests. */
	test_BLAKE2s_self();
	test_BLAKE2s_expand();
	test_BLAKE2b_self();
	test_BLAKE2b_expand();

	/* FFT and KEM tests. */
	test_FFT();
	test_kem_inner();
	test_kem_128_256();
	test_kem_257_512();
	test_kem_769_1024();

	/*
	 * Size table. The encoding functions are called with a null
	 * output buffer and a null source object; their return values
	 * are reported as the encoded sizes. For private keys, the last
	 * argument is 1 for the "short" column and 0 for the "long" one.
	 */
	printf("Sizes:            pub     ct   priv (short/long)\n");

	pub_len = bat_128_256_encode_public_key(NULL, 0, NULL);
	ct_len = bat_128_256_encode_ciphertext(NULL, 0, NULL);
	priv_short_len = bat_128_256_encode_private_key(NULL, 0, NULL, 1);
	priv_long_len = bat_128_256_encode_private_key(NULL, 0, NULL, 0);
	printf("BAT-128-256:     %4zu   %4zu     %3zu / %4zu\n",
		pub_len, ct_len, priv_short_len, priv_long_len);

	pub_len = bat_257_512_encode_public_key(NULL, 0, NULL);
	ct_len = bat_257_512_encode_ciphertext(NULL, 0, NULL);
	priv_short_len = bat_257_512_encode_private_key(NULL, 0, NULL, 1);
	priv_long_len = bat_257_512_encode_private_key(NULL, 0, NULL, 0);
	printf("BAT-257-512:     %4zu   %4zu     %3zu / %4zu\n",
		pub_len, ct_len, priv_short_len, priv_long_len);

	pub_len = bat_769_1024_encode_public_key(NULL, 0, NULL);
	ct_len = bat_769_1024_encode_ciphertext(NULL, 0, NULL);
	priv_short_len = bat_769_1024_encode_private_key(NULL, 0, NULL, 1);
	priv_long_len = bat_769_1024_encode_private_key(NULL, 0, NULL, 0);
	printf("BAT-769-1024:    %4zu   %4zu     %3zu / %4zu\n",
		pub_len, ct_len, priv_short_len, priv_long_len);

	return 0;
}
1431 | 


--------------------------------------------------------------------------------