├── .editorconfig
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── aes_otf_saes64.c
├── aes_otf_saes64.h
├── aes_saes32.c
├── aes_saes32.h
├── aes_saes64.c
├── aes_saes64.h
├── aes_test.c
├── aes_wrap.c
├── aes_wrap.h
├── asm
│   ├── README.md
│   ├── saes32_c0.h
│   ├── saes32_dec.S
│   ├── saes32_enc.S
│   ├── saes32_wrap.h
│   ├── sm4_encdec.S
│   └── sm4_encdec.h
├── bitmanip.c
├── bitmanip.h
├── doc
│   ├── NIST.FIPS.197.pdf
│   ├── gmt0002-2012sm4.pdf
│   ├── lwaes.pdf
│   ├── sm4en.pdf
│   ├── sp800-38d.pdf
│   └── test_gcm_ossl.c
├── gcm_gfmul.h
├── gcm_rv32b_gfmul.c
├── gcm_rv64b_gfmul.c
├── gcm_test.c
├── gcm_wrap.c
├── gcm_wrap.h
├── hdl
│   ├── Makefile
│   ├── README.md
│   ├── saes32.v
│   ├── saes32_tb.v
│   ├── sboxes.v
│   ├── synth.ys
│   ├── tbref.txt
│   └── yoparse.py
├── rv_endian.h
├── saes32.c
├── saes32.h
├── saes64.c
├── saes64.h
├── sboxes.c
├── sboxes.h
├── sm4_ssm4.c
├── sm4_test.c
├── sm4_wrap.h
├── test_hex.c
├── test_hex.h
└── test_main.c

--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
# .editorconfig
# 2019-09-02  Markku-Juhani O. Saarinen
# Works on GitHub. See: https://EditorConfig.org

root = true

[*]
end_of_line = lf
insert_final_newline = true

[*.{c,h,s,S,v}]
charset = latin1
indent_style = tab
indent_size = 4

[*.{txt,md,tex}]
charset = utf-8
indent_style = space
indent_size = 4

[Makefile]
indent_style = tab
indent_size = 4

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# project specific
xtest

firmware.*
config.h
*.vvp

# Prerequisites
*.d

# Object files
*.o
*.ko
*.obj
*.elf

# Linker output
*.ilk
*.map
*.exp

# Precompiled Headers
*.gch
*.pch

# Libraries
*.lib
*.a
*.la
*.lo

# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib

# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex

# Debug files
*.dSYM/
*.su
*.idb
*.pdb

# Kernel Module Compile Results
*.mod*
*.cmd
.tmp_versions/
modules.order
Module.symvers
Mkfile.old
dkms.conf

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 2-Clause License (RISC-V)

Copyright (c) 2020, Markku-Juhani O. Saarinen, PQShield Ltd.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Makefile
# 2020-01-22  Markku-Juhani O. Saarinen
# Copyright (c) 2020, PQShield Ltd. All rights reserved.

# export all variables to sub-makefiles
export

BIN	= xtest
CSRC	= $(wildcard *.c)
OBJS	= $(CSRC:.c=.o)
CC	= gcc
CFLAGS	= -Wall -Wextra -Wshadow -fsanitize=address,undefined -O2 -g
#CFLAGS	= -Wall -march=native -O3
LIBS	+=

$(BIN): $(OBJS)
	$(CC) $(CFLAGS) -o $(BIN) $(OBJS) $(LIBS)

%.o: %.[cS]
	$(CC) $(CFLAGS) -c $^ -o $@

clean:
	rm -rf $(OBJS) $(BIN) *~
	cd hdl && $(MAKE) clean

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# A Lightweight (RISC-V) ISA Extension for AES and SM4

**[HISTORICAL]** *This personal repo was used for work-in-progress
contributions to the RISC-V Cryptographic Extensions Task Group in 2020
and is no longer updated.*
See [riscv-crypto](https://github.com/riscv/riscv-crypto/) for
up-to-date information.

January 22, 2020  Markku-Juhani O. Saarinen

**Updated** April 23, 2020: Renamed ENC1S to SAES32 (and SSM4), as per the
current draft spec where the proposal now resides.


## Description

A lightweight ISA extension proposal supporting:

*	AES (Advanced Encryption Standard) with a 128/192/256-bit secret key,
	as defined in [FIPS 197](doc/NIST.FIPS.197.pdf).

*	SM4, the Chinese encryption algorithm [GM/T 0002-2012](doc/gmt0002-2012sm4.pdf)
	[(English spec)](doc/sm4en.pdf), also defined in GB/T 32907-2016 and ISO/IEC
	18033-3:2010/DAmd 2. SM4 has only one key size, 128 bits.

A single instruction, SAES32, is used for encryption, decryption, and key
schedule for both ciphers. For design rationale and some analysis, see the
short report
[A Lightweight ISA Extension for AES and SM4](https://arxiv.org/abs/2002.07041)
(to appear at SECRISC-V 2020); note that the same instruction is called
"ENC1S" there.

A more complex ISA extension is appropriate for higher-end CPUs. The
primary goal of SAES32 / lwaes is to eliminate timing side-channel
vulnerabilities. Speed-up over pure software table-based implementations
is roughly 500%.

**NOTE** After the ENC1S proposal was adopted into the
[Crypto TG Draft](https://github.com/scarv/riscv-crypto) as SAES32, I'm
trying to keep this code up to date with it. There is also initial
emulation pseudocode for the 64-bit SAES64 instructions; no real assembler
or HDL yet.

## Software and Hardware Source Code

This directory contains an "emulator" C implementation of the instruction
together with runnable pseudocode for full encryption, decryption, and
key schedule of AES-128/192/256 and SM4-128. These are intended for
instruction counts, test vector generation, and other such evaluation.
Real assembler listings for the same functions (using a seriously hacky
macro instruction encoding) can be found under the [asm](asm) directory.

The assembler and C code use essentially the same AES and SM4 API
(specified in [saes32_wrap.h](saes32_wrap.h)), so the same test code
can be used with both.

The [hdl](hdl) directory contains Verilog combinatorial logic for the core
instruction. Simulator and basic CMOS gate-count synthesis scripts are
provided for the Icarus Verilog and Yosys open source tools. The same
assembler and HDL have additionally been tested with PQShield's proprietary
RISC-V emulator and the "Pluto" core on a live FPGA target, although source
code for those is not provided here.


## Technical Details

The instruction is encapsulated in a single emulator function in
[saes32.c](saes32.c):
```C
uint32_t saes32(uint32_t rs1, uint32_t rs2, int fn);
```
The file [hdl/saes32.v](hdl/saes32.v) contains Verilog combinatorial
logic for the instruction that can be used in a RISC-V core.
```verilog
module saes32(
	output	[31:0]	rd,		// output register (wire!)
	input	[31:0]	rs1,	// input register 1
	input	[31:0]	rs2,	// input register 2
	input	[4:0]	fn		// 5-bit function specifier
);
```

The `fn` immediate "constant" is currently 5 bits, covering encryption,
decryption, and key schedule for both algorithms. Bits `fn[1:0]` specify
the input byte and output rotation, while `fn[4:2]` specify the operation.
Appropriate pseudo-instruction names for the code points can be proposed;
the current identifiers defined in [saes32.h](saes32.h) are:

| **Identifier**    | **fn[4:2]** | **Description or Use**                   |
|-------------------|:-----------:|------------------------------------------|
| `SAES32_ENCSM`    | 0           | AES Encrypt main body with *MixColumns*. |
| `SAES32_ENCS`     | 1           | AES Encrypt final round / Key Schedule.  |
| `SAES32_DECSM`    | 2           | AES Decrypt main body with *MixColumns*. |
| `SAES32_DECS`     | 3           | AES Decrypt final round.                 |
| `SSM4_ED`         | 4           | SM4 Encrypt and Decrypt.                 |
| `SSM4_KS`         | 5           | SM4 Key Schedule.                        |
|                   | 6-7         | *Unused. 4x6=24 points currently used.*  |

For AES the instruction selects a byte from `rs2`, performs a single S-box
lookup (*SubBytes* or its inverse), evaluates a part of the MDS matrix
(*MixColumns*), rotates the result by a multiple of 8 bits (*ShiftRows*),
and exclusive-ors the result with `rs1` (*AddRoundKey*). Despite this
complex description, the hardware implementation of the instruction is
quite compact and the resulting software implementation is fast.

For SM4 the instruction has exactly the same data path with byte selection
and S-Box lookup, but different linear operations, depending on whether
encryption/decryption or key scheduling is being performed.
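
As a concrete illustration of the AES data path just described, the
following minimal sketch mirrors what the emulator computes for the
`SAES32_ENCSM` code points. The `aes_sbox[]` table name is an assumption
(a forward S-box table as in [sboxes.c](sboxes.c)); [saes32.c](saes32.c)
remains the authoritative emulator code.

```C
#include <stdint.h>

extern const uint8_t aes_sbox[256];		// forward S-box (assumed name)

static inline uint32_t aes_xtime(uint32_t x)	// multiply by 0x02 in GF(2^8)
{
	return (x << 1) ^ ((x & 0x80) ? 0x11B : 0x00);
}

// Sketch of the SAES32_ENCSM data path; not the reference emulator.
uint32_t saes32_encsm_sketch(uint32_t rs1, uint32_t rs2, int fn)
{
	uint32_t x, x2;
	int sh = 8 * (fn & 3);				// fn[1:0]: byte select / rotation

	x = (rs2 >> sh) & 0xFF;				// select one byte from rs2
	x = aes_sbox[x];					// SubBytes: single S-box lookup
	x2 = aes_xtime(x);					// MixColumns column (2,1,1,3):
	x = ((x ^ x2) << 24) | (x << 16) |	//   top byte 3*x, middle bytes x,
		(x << 8) | x2;					//   low byte 2*x
	if (sh != 0)						// ShiftRows: rotate by fn[1:0] * 8
		x = (x << sh) | (x >> (32 - sh));
	return rs1 ^ x;						// AddRoundKey: XOR with rs1
}
```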
107 | 108 | 109 | ## Galois/Counter Mode (GCM): AES-GCM with Bitmanip 110 | 111 | The Galois/Counter Mode (GCM) specified in 112 | [NIST SP 800-38D](https://doi.org/10.6028/NIST.SP.800-38D) is a prominent 113 | Authenticated Encryption with Associated Data (AEAD) mechanism. It is 114 | the only cipher mode mandated as "MUST" for all 115 | [TLS 1.3](https://www.rfc-editor.org/rfc/rfc8446.html) implementations. 116 | 117 | Here I'll briefly discuss implementation aspects 118 | of AES-GCM using the [bitmanip](https://github.com/riscv/riscv-bitmanip) 119 | (B) extension. Pseudocode for a relevant subset of instructions is contained 120 | in source file [bitmanip.c](bitmanip.c), with prototypes in 121 | [bitmanip.h](bitmanip.h). These are almost directly lifted from the current 122 | draft specification. The instructions relevant to GCM are the Carry-Less 123 | Multiply instructions `CMUL[H][W]` and also the Generalized Reverse `GREV[W]`. 124 | The `[W]` suffix indicates a 32-bit word size variant on RV64. 125 | 126 | The low-level functions that use these instructions are emulated by 127 | [gcm_rv32b_gfmul.c](gcm_rv32b_gfmul.c) and 128 | [gcm_rv64b_gfmul.c](gcm_rv64b_gfmul.c). 129 | Their correctness can be verified against the full AES-GCM test vectors 130 | contained in the framework. There may be further room for improvement -- I 131 | use such code to draft the final assembly implementations. 132 | 133 | An attempt has been made to pair `CMULH` immediately followed by `CMUL`, 134 | as is done with `MULH`/`MUL`, although there is less of a performance 135 | advantage in this case. 136 | 137 | 138 | #### Finite Field Arithmetic 139 | 140 | While message confidentiality in GCM is provided by a block cipher (AES) 141 | in counter mode (a CTR variant), authentication is based on a GHASH, a 142 | universal hash defined over the binary field GF(2128). 143 | Without custom instruction support GCM, just like AES itself, is either 144 | very slow or susceptible to cache timing attacks. 145 | 146 | Whether or not authenticating ciphertext or associated data, the main 147 | operation of GCM is the GHASH multiplication between a block of 148 | authentication data and a secret generator "H". The addition in the 149 | field is trivial; just two or four XORs, depending on whether RV32 or RV64 150 | implementation is used. 151 | 152 | The finite field is defined to be the ring of binary polynomials modulo 153 | the primitive pentanomial 154 | R(x) = x128 + x7 + x2 + x + 1. 155 | The field encoding is slightly unusual, with the multiplicative identity 156 | (i.e. one -- "1") being encoded as a byte sequence `0x80, 0x00, .., 0x00`. 157 | Converting to little-endian encoding involves inverting bits in each byte; 158 | the `GREV[W]` instruction with constant 7 (pseudo-instruction `rev`) 159 | accomplishes this. 160 | 161 | The multiplication itself can be asymptotically sped up with the Karatsuba 162 | method, which works even better in binary fields than it does with integers. 163 | This reduces the number of `CMUL`/`CMULH` pairs on RV64 from 4 to 3 and 164 | the on RV32 from 16 to 9, with the cost of many XORs. 165 | 166 | 167 | #### Reduction via Shifts or via Multiplication 168 | 169 | The second arithmetic step to consider is the polynomial reduction of the 170 | 255-bit ring product down to 128 bits (the field) again. The best way of 171 | doing reduction depends on *how fast* the carry-less multiplication 172 | instructions `CMUL[H][W]` are in relation to shifts and XORs. 
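
To make the Karatsuba structure above concrete, here is a minimal sketch
of the 128 x 128 -> 255-bit carry-less multiplication on RV64B. The
`rv64b_clmul`/`rv64b_clmulh` helper names are assumptions of this sketch
(emulation functions in the spirit of [bitmanip.c](bitmanip.c)); the
reduction step is intentionally left to the caller.

```C
#include <stdint.h>

uint64_t rv64b_clmul(uint64_t a, uint64_t b);	// low 64 bits (assumed)
uint64_t rv64b_clmulh(uint64_t a, uint64_t b);	// high 64 bits (assumed)

// z[0..3] = x * y as a 255-bit carry-less product, little-endian limbs.
// Karatsuba: 3 CLMUL/CLMULH pairs instead of 4, at the cost of XORs.
void ghash_mul128_karatsuba(uint64_t z[4],
							const uint64_t x[2], const uint64_t y[2])
{
	uint64_t lo0, lo1, hi0, hi1, mi0, mi1, tx, ty;

	lo0 = rv64b_clmul(x[0], y[0]);		// low  half: x0 * y0
	lo1 = rv64b_clmulh(x[0], y[0]);
	hi0 = rv64b_clmul(x[1], y[1]);		// high half: x1 * y1
	hi1 = rv64b_clmulh(x[1], y[1]);
	tx = x[0] ^ x[1];					// middle term:
	ty = y[0] ^ y[1];					// (x0^x1)*(y0^y1) ^ lo ^ hi
	mi0 = rv64b_clmul(tx, ty) ^ lo0 ^ hi0;
	mi1 = rv64b_clmulh(tx, ty) ^ lo1 ^ hi1;

	z[0] = lo0;							// assemble the 255-bit result
	z[1] = lo1 ^ mi0;
	z[2] = hi0 ^ mi1;
	z[3] = hi1;
}
```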
173 | 174 | I'll call these *shift reduction* (based on the low Hamming weight of the 175 | polynomial R) and *multiplication reduction* (which is analogous to 176 | Montgomery and Barrett methods -- albeit simpler because we're working 177 | in characteristic 2.) 178 | 179 | 180 | #### Estimating the Fastest Method 181 | 182 | Examining the multiplication implementations in 183 | [gcm_rv32b_gfmul.c](gcm_rv32b_gfmul.c) and 184 | [gcm_rv64b_gfmul.c](gcm_rv64b_gfmul.c) we obtain the following 185 | arithmetic counts: 186 | 187 | | **Arch** | **Karatsuba** | **Reduce** | `GREV` | `XOR` | `S[L/R]L` | `CLMUL` | `CLMULH` | 188 | |:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:| 189 | | RV32B | no | mul | 4 | 36 | 0 | 20 | 20 | 190 | | RV32B | no | shift | 4 | 56 | 24 | 16 | 16 | 191 | | RV32B | yes | mul | 4 | 52 | 0 | 13 | 13 | 192 | | RV32B | yes | shift | 4 | 72 | 24 | 9 | 9 | 193 | | RV64B | no | mul | 2 | 10 | 0 | 6 | 6 | 194 | | RV64B | no | shift | 2 | 20 | 12 | 4 | 4 | 195 | | RV64B | yes | mul | 2 | 14 | 0 | 5 | 5 | 196 | | RV64B | yes | shift | 2 | 24 | 12 | 3 | 3 | 197 | 198 | 199 | We can see that the best selection of algorithms depends on the relative 200 | cost of multiplication. Assuming that other instructions have unit cost 201 | and ignoring loops etc, we have: 202 | 203 | | **Arch** | **Karatsuba** | **Reduce** | **MUL=1** | **MUL=2** | **MUL=3** | **MUL=6** | 204 | |:-----:|:-----:|:-----:|:---------:|:---------:|:---------:|:---------:| 205 | | RV32B | no | mul | **80** | 120 | 160 | 280 | 206 | | RV32B | no | shift | 116 | 148 | 180 | 276 | 207 | | RV32B | yes | mul | 82 | **108** | **134** | 212 | 208 | | RV32B | yes | shift | 118 | 136 | 154 | **208** | 209 | | RV64B | no | mul | **24** | **36** | 48 | 84 | 210 | | RV64B | no | shift | 42 | 50 | 58 | 82 | 211 | | RV64B | yes | mul | 26 | **36** | **46** | 76 | 212 | | RV64B | yes | shift | 44 | 50 | 56 | **74** | 213 | 214 | We see that if `CLMUL[H][W]` takes twice the time of XOR and shifts, 215 | or more, then Karatsuba is worthwhile. If these multiplication instructions 216 | are six times slower, or more, then it is worthwhile to convert the reduction multiplications to shifts and XORs. 217 | 218 | 219 | ## AES Notes 220 | 221 | * AES code density is 16 instructions per round (+ round key fetch), despite 222 | only requiring a single S-box in hardware. The initial 223 | [RISC-V Crypto proposal](https://github.com/scarv/riscv-crypto) 224 | (Section 4.4, "Lightweight AES Acceleration") contains an instruction for 225 | four parallel S-Box lookups. Without additional helper instructions, this 226 | will result in a slower round function. Furthermore, the circuit size is 227 | dominated by the S-Box, so the hardware size of this proposal is lower. 228 | * In addition to being 500+% faster than plain software implementation 229 | (depending on table lookup speed), the most important feature of this 230 | implementation is that it is constant time and resistant to 231 | [Cache-timing attacks on AES](http://cr.yp.to/antiforgery/cachetiming-20050414.pdf). 232 | Constant-time implementations of AES are possible in pure software but 233 | are exceedingly slow. 234 | * The instructions also support the key schedule; it is possible to compute 235 | the round keys "on the fly" without committing them to RAM. This may be 236 | helpful in some types of security applications. 
*	Many applications do not actually require the AES inverse function;
	even full TLS implementations may be implemented without it, since
	the AES-GCM mode is based on CTR -- essentially a stream cipher.
*	Mathematically the AES computation is organized as in the well-known
	"T-Tables" technique, which is more than 20 years old in the context of
	AES. If there are patents for this specific way of organizing the
	computation, they are likely to have expired.
	Other approaches have been considered
	[in the literature](https://iacr.org/archive/ches2006/22/22.pdf).
*	In a hardware implementation the AES S-Box and its inverse share much of
	their circuitry. For an example of gate-optimized logic for this purpose,
	see e.g. [Boyar and Peralta](https://eprint.iacr.org/2011/332.pdf).
	We've expanded this to SM4, as can be seen in the reference combinatorial
	logic in [hdl/sboxes.v](hdl/sboxes.v).
*	The SM4 S-Box is mathematically very close to the AES S-Box, as both are
	based on finite field inversion in GF(256). This property also makes the
	inverse S-Box required by AES self-similar to the forward S-Box. Even
	though AES and SM4 use different polynomial bases, the finite fields are
	affine equivalent, so much of the circuitry of the three S-Boxes is
	shared. SM4 does not need an inverse S-Box for decryption.


### Testing

Only a C compiler is required to test; RISC-V instruction counts can be
seen from the source code. A [Makefile](Makefile) is provided, and the file
[test_main.c](test_main.c) contains a minimal unit test with some standard
test vectors.

```console
$ make
gcc -c test_main.c -o test_main.o
[..]
gcc -o xtest aes_enc.o sm4....
$ ./xtest
< .. TEST TEST TEST .. >
[INFO] === AES using SAES32 ===
[PASS] AES-128 Enc 69C4E0D86A7B0430D8CDB78070B4C55A
[PASS] AES-128 Dec 00112233445566778899AABBCCDDEEFF
[PASS] AES-192 Enc DDA97CA4864CDFE06EAF70A0EC0D7191
[PASS] AES-192 Dec 00112233445566778899AABBCCDDEEFF
[PASS] AES-256 Enc 8EA2B7CA516745BFEAFC49904B496089
[PASS] AES-256 Dec 00112233445566778899AABBCCDDEEFF
[PASS] AES-128 Enc 3AD77BB40D7A3660A89ECAF32466EF97
[PASS] AES-128 Dec 6BC1BEE22E409F96E93D7E117393172A
[PASS] AES-192 Enc 974104846D0AD3AD7734ECB3ECEE4EEF
[PASS] AES-192 Dec AE2D8A571E03AC9C9EB76FAC45AF8E51
[PASS] AES-256 Enc B6ED21B99CA6F4F9F153E7B1BEAFED1D
[PASS] AES-256 Dec 30C81C46A35CE411E5FBC1191A0A52EF
< .. GCM tests, SAES64 tests, etc here .. >
[PASS] all tests passed.
$
```

**Disclaimer and Status**

*	[PQShield](https://pqshield.com) offers no warranty or specific claims
	of standards compliance, nor does it endorse this proposal above other
	proposals. PQShield may or may not implement AES and SM4 according to
	this proposal in the future.
*	Despite being proposed in a personal capacity, this proposal
	constitutes a "contribution" as defined in Section 1.4 of the
	RISC-V foundation membership agreement.

Cheers,
-	markku

--------------------------------------------------------------------------------
/aes_otf_saes64.c:
--------------------------------------------------------------------------------
// aes_otf_saes64.c
// 2020-05-06  Markku-Juhani O. Saarinen
// Copyright (c) 2020, PQShield Ltd.
All rights reserved. 4 | 5 | // AES Encryption with on-the-fly key expansion 6 | 7 | #include 8 | 9 | #include "aes_wrap.h" 10 | #include "saes64.h" 11 | 12 | // === AES-128 round with on-the-fly key schedule === 13 | 14 | // 2 x SAES64.ENCS[M], 1 x SAES64.KS1, 2 x SAES64.KS2, 2 x XOR 15 | 16 | #define SAES64_OTF128A(i) { \ 17 | u0 = saes64_encsm(t0, t1); \ 18 | u1 = saes64_encsm(t1, t0); \ 19 | ks = saes64_ks1(k1, i); \ 20 | k0 = saes64_ks2(ks, k0); \ 21 | k1 = saes64_ks2(k0, k1); \ 22 | u0 = u0 ^ k0; \ 23 | u1 = u1 ^ k1; } 24 | 25 | #define SAES64_OTF128B(i) { \ 26 | t0 = saes64_encsm(u0, u1); \ 27 | t1 = saes64_encsm(u1, u0); \ 28 | ks = saes64_ks1(k1, i); \ 29 | k0 = saes64_ks2(ks, k0); \ 30 | k1 = saes64_ks2(k0, k1); \ 31 | t0 = t0 ^ k0; \ 32 | t1 = t1 ^ k1; } 33 | 34 | void aes128_enc_otf_saes64(uint8_t ct[16], const uint8_t pt[16], 35 | const uint32_t * rk) 36 | { 37 | uint64_t t0, t1, u0, u1, k0, k1, ks; 38 | 39 | k0 = ((const uint64_t *) rk)[0]; // load key 40 | k1 = ((const uint64_t *) rk)[1]; 41 | 42 | t0 = ((const uint64_t *) pt)[0]; // get plaintext 43 | t1 = ((const uint64_t *) pt)[1]; 44 | 45 | t0 = t0 ^ k0; 46 | t1 = t1 ^ k1; 47 | 48 | SAES64_OTF128A(0); // first round 49 | SAES64_OTF128B(1); // # 2 50 | SAES64_OTF128A(2); // # 3 51 | SAES64_OTF128B(3); // # 4 52 | SAES64_OTF128A(4); // # 5 53 | SAES64_OTF128B(5); // # 6 54 | SAES64_OTF128A(6); // # 7 55 | SAES64_OTF128B(7); // # 8 56 | SAES64_OTF128A(8); // # 9 57 | t0 = saes64_encs(u0, u1); // last round 58 | t1 = saes64_encs(u1, u0); 59 | ks = saes64_ks1(k1, 9); 60 | k0 = saes64_ks2(ks, k0); 61 | k1 = saes64_ks2(k0, k1); 62 | t0 = t0 ^ k0; 63 | t1 = t1 ^ k1; 64 | 65 | ((uint64_t *) ct)[0] = t0; // store ciphertext 66 | ((uint64_t *) ct)[1] = t1; 67 | } 68 | 69 | // === AES-192 round with on-the-fly key schedule === 70 | 71 | // 3 rounds has: 2 x SAES64.KS1, 6 x SAES64.KS2, 6 x AES64.ENCSM, 6 x XOR 72 | 73 | #define SAES64_OTF192K(i) { \ 74 | ks = saes64_ks1(k2, i); \ 75 | k0 = saes64_ks2(ks, k0); \ 76 | k1 = saes64_ks2(k0, k1); \ 77 | k2 = saes64_ks2(k1, k2); } 78 | 79 | #define SAES64_OTF192A { \ 80 | t0 = t0 ^ k0; \ 81 | t1 = t1 ^ k1; \ 82 | u0 = saes64_encsm(t0, t1); \ 83 | u1 = saes64_encsm(t1, t0); } 84 | 85 | #define SAES64_OTF192B(i) { \ 86 | u0 = u0 ^ k2; \ 87 | SAES64_OTF192K(i); \ 88 | u1 = u1 ^ k0; \ 89 | v0 = saes64_encsm(u0, u1); \ 90 | v1 = saes64_encsm(u1, u0); } 91 | 92 | #define SAES64_OTF192C(i) { \ 93 | v0 = v0 ^ k1; \ 94 | v1 = v1 ^ k2; \ 95 | SAES64_OTF192K(i); \ 96 | t0 = saes64_encsm(v0, v1); \ 97 | t1 = saes64_encsm(v1, v0); } 98 | 99 | void aes192_enc_otf_saes64(uint8_t ct[16], const uint8_t pt[16], 100 | const uint32_t * rk) 101 | { 102 | uint64_t t0, t1, u0, u1, v0, v1, k0, k1, k2, ks; 103 | 104 | k0 = ((const uint64_t *) rk)[0]; // load key 105 | k1 = ((const uint64_t *) rk)[1]; 106 | k2 = ((const uint64_t *) rk)[2]; 107 | 108 | t0 = ((const uint64_t *) pt)[0]; // get plaintext 109 | t1 = ((const uint64_t *) pt)[1]; 110 | 111 | SAES64_OTF192A; // first round 112 | SAES64_OTF192B(0); // # 2 113 | SAES64_OTF192C(1); // # 3 114 | SAES64_OTF192A; // # 4 115 | SAES64_OTF192B(2); // # 5 116 | SAES64_OTF192C(3); // # 6 117 | SAES64_OTF192A; // # 7 118 | SAES64_OTF192B(4); // # 8 119 | SAES64_OTF192C(5); // # 9 120 | SAES64_OTF192A; // # 10 121 | SAES64_OTF192B(6); // # 11 122 | 123 | v0 = v0 ^ k1; // last round 124 | v1 = v1 ^ k2; 125 | ks = saes64_ks1(k2, 7); // different because .. 126 | k0 = saes64_ks2(ks, k0); 127 | k1 = saes64_ks2(k0, k1); // .. 
no need to compute k2 128 | t0 = saes64_encs(v0, v1); // different function 129 | t1 = saes64_encs(v1, v0); 130 | t0 = t0 ^ k0; // final AddRoundKey 131 | t1 = t1 ^ k1; 132 | 133 | ((uint64_t *) ct)[0] = t0; // store ciphertext 134 | ((uint64_t *) ct)[1] = t1; 135 | } 136 | 137 | 138 | // === AES-256 round with on-the-fly key schedule === 139 | 140 | // 2 x saes64_encs[m], 1 x saes64_KS1, 2 x SAES64.KS2, 2 x XOR 141 | 142 | #define SAES64_OTF256A(i) { \ 143 | u0 = saes64_encsm(t0, t1); \ 144 | u1 = saes64_encsm(t1, t0); \ 145 | ks = saes64_ks1(k3, i); \ 146 | k0 = saes64_ks2(ks, k0); \ 147 | k1 = saes64_ks2(k0, k1); \ 148 | u0 = u0 ^ k2; \ 149 | u1 = u1 ^ k3; } 150 | 151 | #define SAES64_OTF256B(i) { \ 152 | t0 = saes64_encsm(u0, u1); \ 153 | t1 = saes64_encsm(u1, u0); \ 154 | ks = saes64_ks1(k1, i); \ 155 | k2 = saes64_ks2(ks, k2); \ 156 | k3 = saes64_ks2(k2, k3); \ 157 | t0 = t0 ^ k0; \ 158 | t1 = t1 ^ k1; } 159 | 160 | 161 | void aes256_enc_otf_saes64(uint8_t ct[16], const uint8_t pt[16], 162 | const uint32_t * rk) 163 | { 164 | uint64_t t0, t1, u0, u1, k0, k1, k2, k3, ks; 165 | 166 | k0 = ((const uint64_t *) rk)[0]; // load key 167 | k1 = ((const uint64_t *) rk)[1]; 168 | k2 = ((const uint64_t *) rk)[2]; 169 | k3 = ((const uint64_t *) rk)[3]; 170 | 171 | t0 = ((const uint64_t *) pt)[0]; // get plaintext 172 | t1 = ((const uint64_t *) pt)[1]; 173 | 174 | t0 = t0 ^ k0; 175 | t1 = t1 ^ k1; 176 | 177 | SAES64_OTF256A(0); // first round 178 | SAES64_OTF256B(10); // # 2 179 | SAES64_OTF256A(1); // # 3 180 | SAES64_OTF256B(10); // # 4 181 | SAES64_OTF256A(2); // # 5 182 | SAES64_OTF256B(10); // # 6 183 | SAES64_OTF256A(3); // # 7 184 | SAES64_OTF256B(10); // # 8 185 | SAES64_OTF256A(4); // # 9 186 | SAES64_OTF256B(10); // # 10 187 | SAES64_OTF256A(5); // # 11 188 | SAES64_OTF256B(10); // # 12 189 | SAES64_OTF256A(6); // # 13 190 | t0 = saes64_encs(u0, u1); // last round 191 | t1 = saes64_encs(u1, u0); 192 | t0 = t0 ^ k0; 193 | t1 = t1 ^ k1; 194 | 195 | ((uint64_t *) ct)[0] = t0; // store ciphertext 196 | ((uint64_t *) ct)[1] = t1; 197 | } 198 | -------------------------------------------------------------------------------- /aes_otf_saes64.h: -------------------------------------------------------------------------------- 1 | // aes_otf_saes64.h 2 | // 2020-05-06 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // AES Encryption with on-the-fly key expansion. 6 | // *rk can point to expanded key or just the key. 7 | 8 | #ifndef _AES_OTF_SAES64_H_ 9 | #define _AES_OTF_SAES64_H_ 10 | 11 | #include 12 | 13 | void aes128_enc_otf_saes64(uint8_t ct[16], const uint8_t pt[16], 14 | const uint32_t * rk); 15 | 16 | void aes192_enc_otf_saes64(uint8_t ct[16], const uint8_t pt[16], 17 | const uint32_t * rk); 18 | 19 | void aes256_enc_otf_saes64(uint8_t ct[16], const uint8_t pt[16], 20 | const uint32_t * rk); 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /aes_saes32.c: -------------------------------------------------------------------------------- 1 | // aes_saes32.c 2 | // 2020-01-22 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // "Running pseudocode" for full AES-128/192/256 encryption and decryption 6 | // using SAES32.xxx instructions. 7 | 8 | #include 9 | 10 | #include "saes32.h" 11 | #include "aes_wrap.h" 12 | #include "bitmanip.h" 13 | #include "rv_endian.h" 14 | #include "sboxes.h" 15 | 16 | // === ENCRYPTION === 17 | 18 | // Encrypt rounds. 
Implements AES-128/192/256 depending on nr = {10,12,14} 19 | 20 | void aes_enc_rounds_saes32(uint8_t ct[16], const uint8_t pt[16], 21 | const uint32_t rk[], int nr) 22 | { 23 | uint32_t t0, t1, t2, t3; // even round state registers 24 | uint32_t u0, u1, u2, u3; // odd round state registers 25 | const uint32_t *kp = &rk[4 * nr]; // key pointer as loop condition 26 | 27 | t0 = rk[0]; // fetch even subkey 28 | t1 = rk[1]; 29 | t2 = rk[2]; 30 | t3 = rk[3]; 31 | 32 | t0 ^= get32u_le(pt); // xor with plaintext block 33 | t1 ^= get32u_le(pt + 4); 34 | t2 ^= get32u_le(pt + 8); 35 | t3 ^= get32u_le(pt + 12); 36 | 37 | while (1) { // double round 38 | 39 | u0 = rk[4]; // fetch odd subkey 40 | u1 = rk[5]; 41 | u2 = rk[6]; 42 | u3 = rk[7]; 43 | 44 | u0 = saes32_encsm(u0, t0, 0); // AES round, 16 instructions 45 | u0 = saes32_encsm(u0, t1, 1); 46 | u0 = saes32_encsm(u0, t2, 2); 47 | u0 = saes32_encsm(u0, t3, 3); 48 | 49 | u1 = saes32_encsm(u1, t1, 0); 50 | u1 = saes32_encsm(u1, t2, 1); 51 | u1 = saes32_encsm(u1, t3, 2); 52 | u1 = saes32_encsm(u1, t0, 3); 53 | 54 | u2 = saes32_encsm(u2, t2, 0); 55 | u2 = saes32_encsm(u2, t3, 1); 56 | u2 = saes32_encsm(u2, t0, 2); 57 | u2 = saes32_encsm(u2, t1, 3); 58 | 59 | u3 = saes32_encsm(u3, t3, 0); 60 | u3 = saes32_encsm(u3, t0, 1); 61 | u3 = saes32_encsm(u3, t1, 2); 62 | u3 = saes32_encsm(u3, t2, 3); 63 | 64 | t0 = rk[8]; // fetch even subkey 65 | t1 = rk[9]; 66 | t2 = rk[10]; 67 | t3 = rk[11]; 68 | 69 | rk += 8; // step key pointer 70 | if (rk == kp) // final round ? 71 | break; 72 | 73 | t0 = saes32_encsm(t0, u0, 0); // AES round, 16 instructions 74 | t0 = saes32_encsm(t0, u1, 1); 75 | t0 = saes32_encsm(t0, u2, 2); 76 | t0 = saes32_encsm(t0, u3, 3); 77 | 78 | t1 = saes32_encsm(t1, u1, 0); 79 | t1 = saes32_encsm(t1, u2, 1); 80 | t1 = saes32_encsm(t1, u3, 2); 81 | t1 = saes32_encsm(t1, u0, 3); 82 | 83 | t2 = saes32_encsm(t2, u2, 0); 84 | t2 = saes32_encsm(t2, u3, 1); 85 | t2 = saes32_encsm(t2, u0, 2); 86 | t2 = saes32_encsm(t2, u1, 3); 87 | 88 | t3 = saes32_encsm(t3, u3, 0); 89 | t3 = saes32_encsm(t3, u0, 1); 90 | t3 = saes32_encsm(t3, u1, 2); 91 | t3 = saes32_encsm(t3, u2, 3); 92 | } 93 | 94 | t0 = saes32_encs(t0, u0, 0); // final round is different 95 | t0 = saes32_encs(t0, u1, 1); 96 | t0 = saes32_encs(t0, u2, 2); 97 | t0 = saes32_encs(t0, u3, 3); 98 | 99 | t1 = saes32_encs(t1, u1, 0); 100 | t1 = saes32_encs(t1, u2, 1); 101 | t1 = saes32_encs(t1, u3, 2); 102 | t1 = saes32_encs(t1, u0, 3); 103 | 104 | t2 = saes32_encs(t2, u2, 0); 105 | t2 = saes32_encs(t2, u3, 1); 106 | t2 = saes32_encs(t2, u0, 2); 107 | t2 = saes32_encs(t2, u1, 3); 108 | 109 | t3 = saes32_encs(t3, u3, 0); 110 | t3 = saes32_encs(t3, u0, 1); 111 | t3 = saes32_encs(t3, u1, 2); 112 | t3 = saes32_encs(t3, u2, 3); 113 | 114 | put32u_le(ct, t0); // write ciphertext block 115 | put32u_le(ct + 4, t1); 116 | put32u_le(ct + 8, t2); 117 | put32u_le(ct + 12, t3); 118 | } 119 | 120 | // Wrappers 121 | 122 | void aes128_enc_ecb_saes32(uint8_t ct[16], const uint8_t pt[16], 123 | const uint32_t rk[AES128_RK_WORDS]) 124 | { 125 | aes_enc_rounds_saes32(ct, pt, rk, AES128_ROUNDS); 126 | } 127 | 128 | void aes192_enc_ecb_saes32(uint8_t ct[16], const uint8_t pt[16], 129 | const uint32_t rk[AES192_RK_WORDS]) 130 | { 131 | aes_enc_rounds_saes32(ct, pt, rk, AES192_ROUNDS); 132 | } 133 | 134 | void aes256_enc_ecb_saes32(uint8_t ct[16], const uint8_t pt[16], 135 | const uint32_t rk[AES256_RK_WORDS]) 136 | { 137 | aes_enc_rounds_saes32(ct, pt, rk, AES256_ROUNDS); 138 | } 139 | 140 | // Key schedule for AES-128 Encryption. 
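// (Note: the loop below follows the FIPS 197 recurrence
//    w[i] = w[i-4] ^ SubWord(RotWord(w[i-1])) ^ rcon[i/4]   when i % 4 == 0
//    w[i] = w[i-4] ^ w[i-1]                                 otherwise,
// with SubWord() evaluated as four SAES32_ENCS byte steps; RotWord()
// becomes an 8-bit rotate because the words are held little-endian.)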
141 | 142 | void aes128_enc_key_saes32(uint32_t rk[44], const uint8_t key[16]) 143 | { 144 | uint32_t t0, t1, t2, t3, tr; // subkey registers 145 | const uint32_t *rke = &rk[44 - 4]; // end pointer 146 | const uint8_t *rc = aes_rcon; // round constants 147 | 148 | t0 = get32u_le(key); // load secret key 149 | t1 = get32u_le(key + 4); 150 | t2 = get32u_le(key + 8); 151 | t3 = get32u_le(key + 12); 152 | 153 | while (1) { 154 | 155 | rk[0] = t0; // store subkey 156 | rk[1] = t1; 157 | rk[2] = t2; 158 | rk[3] = t3; 159 | 160 | if (rk == rke) // end condition 161 | return; 162 | rk += 4; // step pointer by one subkey 163 | 164 | t0 ^= (uint32_t) * rc++; // round constant 165 | tr = rv32b_ror(t3, 8); // rotate 8 bits (little endian!) 166 | t0 = saes32_encs(t0, tr, 0); // SubWord() 167 | t0 = saes32_encs(t0, tr, 1); 168 | t0 = saes32_encs(t0, tr, 2); 169 | t0 = saes32_encs(t0, tr, 3); 170 | t1 ^= t0; 171 | t2 ^= t1; 172 | t3 ^= t2; 173 | } 174 | } 175 | 176 | // Key schedule for AES-192 encryption. 177 | 178 | void aes192_enc_key_saes32(uint32_t rk[52], const uint8_t key[24]) 179 | { 180 | uint32_t t0, t1, t2, t3, t4, t5, tr; // subkey registers 181 | const uint32_t *rke = &rk[52 - 4]; // end pointer 182 | const uint8_t *rc = aes_rcon; // round constants 183 | 184 | t0 = get32u_le(key); // load secret key 185 | t1 = get32u_le(key + 4); 186 | t2 = get32u_le(key + 8); 187 | t3 = get32u_le(key + 12); 188 | t4 = get32u_le(key + 16); 189 | t5 = get32u_le(key + 20); 190 | 191 | while (1) { 192 | 193 | rk[0] = t0; // store subkey (or part) 194 | rk[1] = t1; 195 | rk[2] = t2; 196 | rk[3] = t3; 197 | if (rk == rke) // end condition 198 | return; 199 | rk[4] = t4; 200 | rk[5] = t5; 201 | rk += 6; // step pointer by 1.5 subkeys 202 | 203 | t0 ^= (uint32_t) * rc++; // round constant 204 | tr = rv32b_ror(t5, 8); // rotate 8 bits (little endian!) 205 | t0 = saes32_encs(t0, tr, 0); // SubWord() 206 | t0 = saes32_encs(t0, tr, 1); 207 | t0 = saes32_encs(t0, tr, 2); 208 | t0 = saes32_encs(t0, tr, 3); 209 | 210 | t1 ^= t0; 211 | t2 ^= t1; 212 | t3 ^= t2; 213 | t4 ^= t3; 214 | t5 ^= t4; 215 | } 216 | } 217 | 218 | // Key schedule for AES-256 encryption. 219 | 220 | void aes256_enc_key_saes32(uint32_t rk[60], const uint8_t key[32]) 221 | { 222 | uint32_t t0, t1, t2, t3, t4, t5, t6, t7, tr; // subkey registers 223 | const uint32_t *rke = &rk[60 - 4]; // end pointer 224 | const uint8_t *rc = aes_rcon; // round constants 225 | 226 | t0 = get32u_le(key); 227 | t1 = get32u_le(key + 4); 228 | t2 = get32u_le(key + 8); 229 | t3 = get32u_le(key + 12); 230 | t4 = get32u_le(key + 16); 231 | t5 = get32u_le(key + 20); 232 | t6 = get32u_le(key + 24); 233 | t7 = get32u_le(key + 28); 234 | 235 | rk[0] = t0; // store first subkey 236 | rk[1] = t1; 237 | rk[2] = t2; 238 | rk[3] = t3; 239 | 240 | while (1) { 241 | 242 | rk[4] = t4; // store odd subkey 243 | rk[5] = t5; 244 | rk[6] = t6; 245 | rk[7] = t7; 246 | rk += 8; // step pointer by 2 subkeys 247 | 248 | t0 ^= (uint32_t) * rc++; // round constant 249 | tr = rv32b_ror(t7, 8); // rotate 8 bits (little endian!) 
250 | t0 = saes32_encs(t0, tr, 0); // SubWord() 251 | t0 = saes32_encs(t0, tr, 1); 252 | t0 = saes32_encs(t0, tr, 2); 253 | t0 = saes32_encs(t0, tr, 3); 254 | t1 ^= t0; 255 | t2 ^= t1; 256 | t3 ^= t2; 257 | 258 | rk[0] = t0; // store even subkey 259 | rk[1] = t1; 260 | rk[2] = t2; 261 | rk[3] = t3; 262 | if (rk == rke) // end condition 263 | return; 264 | 265 | t4 = saes32_encs(t4, t3, 0); // SubWord() - NO rotation 266 | t4 = saes32_encs(t4, t3, 1); 267 | t4 = saes32_encs(t4, t3, 2); 268 | t4 = saes32_encs(t4, t3, 3); 269 | t5 ^= t4; 270 | t6 ^= t5; 271 | t7 ^= t6; 272 | } 273 | } 274 | 275 | // === DECRYPTION === 276 | 277 | // Decrypt rounds. Implements AES-128/192/256 depending on nr = {10,12,14} 278 | 279 | void aes_dec_rounds_saes32(uint8_t pt[16], const uint8_t ct[16], 280 | const uint32_t rk[], int nr) 281 | { 282 | uint32_t t0, t1, t2, t3; // even round state registers 283 | uint32_t u0, u1, u2, u3; // odd round state registers 284 | const uint32_t *kp = &rk[4 * nr]; // key pointer 285 | 286 | t0 = kp[0]; // fetch last subkey 287 | t1 = kp[1]; 288 | t2 = kp[2]; 289 | t3 = kp[3]; 290 | kp -= 8; 291 | 292 | t0 ^= get32u_le(ct); // xor with ciphertext block 293 | t1 ^= get32u_le(ct + 4); 294 | t2 ^= get32u_le(ct + 8); 295 | t3 ^= get32u_le(ct + 12); 296 | 297 | while (1) { 298 | u0 = kp[4]; // fetch odd subkey 299 | u1 = kp[5]; 300 | u2 = kp[6]; 301 | u3 = kp[7]; 302 | 303 | u0 = saes32_decsm(u0, t0, 0); // AES decryption round, 16 instr 304 | u0 = saes32_decsm(u0, t3, 1); 305 | u0 = saes32_decsm(u0, t2, 2); 306 | u0 = saes32_decsm(u0, t1, 3); 307 | 308 | u1 = saes32_decsm(u1, t1, 0); 309 | u1 = saes32_decsm(u1, t0, 1); 310 | u1 = saes32_decsm(u1, t3, 2); 311 | u1 = saes32_decsm(u1, t2, 3); 312 | 313 | u2 = saes32_decsm(u2, t2, 0); 314 | u2 = saes32_decsm(u2, t1, 1); 315 | u2 = saes32_decsm(u2, t0, 2); 316 | u2 = saes32_decsm(u2, t3, 3); 317 | 318 | u3 = saes32_decsm(u3, t3, 0); 319 | u3 = saes32_decsm(u3, t2, 1); 320 | u3 = saes32_decsm(u3, t1, 2); 321 | u3 = saes32_decsm(u3, t0, 3); 322 | 323 | t0 = kp[0]; // fetch even subkey 324 | t1 = kp[1]; 325 | t2 = kp[2]; 326 | t3 = kp[3]; 327 | 328 | if (kp == rk) // final round 329 | break; 330 | kp -= 8; 331 | 332 | t0 = saes32_decsm(t0, u0, 0); // AES decryption round, 16 instr 333 | t0 = saes32_decsm(t0, u3, 1); 334 | t0 = saes32_decsm(t0, u2, 2); 335 | t0 = saes32_decsm(t0, u1, 3); 336 | 337 | t1 = saes32_decsm(t1, u1, 0); 338 | t1 = saes32_decsm(t1, u0, 1); 339 | t1 = saes32_decsm(t1, u3, 2); 340 | t1 = saes32_decsm(t1, u2, 3); 341 | 342 | t2 = saes32_decsm(t2, u2, 0); 343 | t2 = saes32_decsm(t2, u1, 1); 344 | t2 = saes32_decsm(t2, u0, 2); 345 | t2 = saes32_decsm(t2, u3, 3); 346 | 347 | t3 = saes32_decsm(t3, u3, 0); 348 | t3 = saes32_decsm(t3, u2, 1); 349 | t3 = saes32_decsm(t3, u1, 2); 350 | t3 = saes32_decsm(t3, u0, 3); 351 | } 352 | 353 | t0 = saes32_decs(t0, u0, 0); // final decryption round, 16 ins. 
354 | t0 = saes32_decs(t0, u3, 1); 355 | t0 = saes32_decs(t0, u2, 2); 356 | t0 = saes32_decs(t0, u1, 3); 357 | 358 | t1 = saes32_decs(t1, u1, 0); 359 | t1 = saes32_decs(t1, u0, 1); 360 | t1 = saes32_decs(t1, u3, 2); 361 | t1 = saes32_decs(t1, u2, 3); 362 | 363 | t2 = saes32_decs(t2, u2, 0); 364 | t2 = saes32_decs(t2, u1, 1); 365 | t2 = saes32_decs(t2, u0, 2); 366 | t2 = saes32_decs(t2, u3, 3); 367 | 368 | t3 = saes32_decs(t3, u3, 0); 369 | t3 = saes32_decs(t3, u2, 1); 370 | t3 = saes32_decs(t3, u1, 2); 371 | t3 = saes32_decs(t3, u0, 3); 372 | 373 | put32u_le(pt, t0); // write plaintext block 374 | put32u_le(pt + 4, t1); 375 | put32u_le(pt + 8, t2); 376 | put32u_le(pt + 12, t3); 377 | } 378 | 379 | // Wrappers 380 | 381 | void aes128_dec_ecb_saes32(uint8_t pt[16], const uint8_t ct[16], 382 | const uint32_t rk[AES128_RK_WORDS]) 383 | { 384 | aes_dec_rounds_saes32(pt, ct, rk, AES128_ROUNDS); 385 | } 386 | 387 | void aes192_dec_ecb_saes32(uint8_t pt[16], const uint8_t ct[16], 388 | const uint32_t rk[AES192_RK_WORDS]) 389 | { 390 | aes_dec_rounds_saes32(pt, ct, rk, AES192_ROUNDS); 391 | } 392 | 393 | void aes256_dec_ecb_saes32(uint8_t pt[16], const uint8_t ct[16], 394 | const uint32_t rk[AES256_RK_WORDS]) 395 | { 396 | aes_dec_rounds_saes32(pt, ct, rk, AES256_ROUNDS); 397 | } 398 | 399 | // Helper: apply inverse mixcolumns to a vector 400 | 401 | void saes32_dec_invmc(uint32_t * v, size_t len) 402 | { 403 | size_t i; 404 | uint32_t x, y; 405 | 406 | for (i = 0; i < len; i++) { 407 | x = v[i]; 408 | 409 | y = saes32_encs(0, x, 0); // SubWord() 410 | y = saes32_encs(y, x, 1); 411 | y = saes32_encs(y, x, 2); 412 | y = saes32_encs(y, x, 3); 413 | 414 | x = saes32_decsm(0, y, 0); // Just want inv MixCol() 415 | x = saes32_decsm(x, y, 1); 416 | x = saes32_decsm(x, y, 2); 417 | x = saes32_decsm(x, y, 3); 418 | 419 | v[i] = x; 420 | } 421 | } 422 | 423 | // Key schedule for AES-128 decryption. 424 | 425 | void aes128_dec_key_saes32(uint32_t rk[44], const uint8_t key[16]) 426 | { 427 | // create an encryption key and modify middle rounds 428 | aes128_enc_key(rk, key); 429 | saes32_dec_invmc(rk + 4, AES128_RK_WORDS - 8); 430 | } 431 | 432 | // Key schedule for AES-192 decryption. 433 | 434 | void aes192_dec_key_saes32(uint32_t rk[52], const uint8_t key[24]) 435 | { 436 | // create an encryption key and modify middle rounds 437 | aes192_enc_key(rk, key); 438 | saes32_dec_invmc(rk + 4, AES192_RK_WORDS - 8); 439 | } 440 | 441 | // Key schedule for AES-256 decryption. 442 | 443 | void aes256_dec_key_saes32(uint32_t rk[60], const uint8_t key[32]) 444 | { 445 | // create an encryption key and modify middle rounds 446 | aes256_enc_key(rk, key); 447 | saes32_dec_invmc(rk + 4, AES256_RK_WORDS - 8); 448 | } 449 | -------------------------------------------------------------------------------- /aes_saes32.h: -------------------------------------------------------------------------------- 1 | // aes_saes32.h 2 | // 2020-05-05 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 
4 | 5 | // Implementation prototypes for aes_saes32.c 6 | 7 | #ifndef _AES_SAES32_H_ 8 | #define _AES_SAES32_H_ 9 | 10 | #include 11 | 12 | // Set encryption key 13 | 14 | void aes128_enc_key_saes32(uint32_t rk[AES128_RK_WORDS], 15 | const uint8_t key[16]); 16 | 17 | void aes192_enc_key_saes32(uint32_t rk[AES192_RK_WORDS], 18 | const uint8_t key[24]); 19 | 20 | void aes256_enc_key_saes32(uint32_t rk[AES256_RK_WORDS], 21 | const uint8_t key[32]); 22 | 23 | // Encrypt a block 24 | 25 | void aes128_enc_ecb_saes32(uint8_t ct[16], const uint8_t pt[16], 26 | const uint32_t rk[AES128_RK_WORDS]); 27 | 28 | void aes192_enc_ecb_saes32(uint8_t ct[16], const uint8_t pt[16], 29 | const uint32_t rk[AES192_RK_WORDS]); 30 | 31 | void aes256_enc_ecb_saes32(uint8_t ct[16], const uint8_t pt[16], 32 | const uint32_t rk[AES256_RK_WORDS]); 33 | 34 | 35 | // Set decryption key 36 | 37 | void aes128_dec_key_saes32(uint32_t rk[AES128_RK_WORDS], 38 | const uint8_t key[16]); 39 | 40 | void aes192_dec_key_saes32(uint32_t rk[AES192_RK_WORDS], 41 | const uint8_t key[24]); 42 | 43 | void aes256_dec_key_saes32(uint32_t rk[AES256_RK_WORDS], 44 | const uint8_t key[32]); 45 | 46 | // Decrypt a block 47 | 48 | void aes128_dec_ecb_saes32(uint8_t pt[16], const uint8_t ct[16], 49 | const uint32_t rk[AES128_RK_WORDS]); 50 | 51 | void aes192_dec_ecb_saes32(uint8_t pt[16], const uint8_t ct[16], 52 | const uint32_t rk[AES192_RK_WORDS]); 53 | 54 | void aes256_dec_ecb_saes32(uint8_t pt[16], const uint8_t ct[16], 55 | const uint32_t rk[AES256_RK_WORDS]); 56 | 57 | #endif // _AES_SAES32_H_ 58 | -------------------------------------------------------------------------------- /aes_saes64.c: -------------------------------------------------------------------------------- 1 | // aes_saes64.c 2 | // 2020-05-03 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // "Running pseudocode" for full AES-128/192/256 encryption and decryption 6 | // using SAES64.xxx instructions. 7 | 8 | #include 9 | 10 | #include "aes_wrap.h" 11 | #include "saes64.h" 12 | #include "rv_endian.h" 13 | 14 | // Encrypt rounds. 
Implements AES-128/192/256 depending on nr = {10,12,14} 15 | 16 | // Per round: 2 * ENCSM, 2 * load, 2 * XOR 17 | 18 | #define SAES64_ENC_ROUND(r0, r1, s0, s1, i) { \ 19 | r0 = saes64_encsm(s0, s1); \ 20 | r1 = saes64_encsm(s1, s0); \ 21 | k0 = kp[2 * i]; \ 22 | k1 = kp[2 * i + 1]; \ 23 | r0 = r0 ^ k0; \ 24 | r1 = r1 ^ k1; } 25 | 26 | void aes_enc_rounds_saes64(uint8_t ct[16], const uint8_t pt[16], 27 | const uint32_t rk[], int nr) 28 | { 29 | // key pointer 30 | const uint64_t *kp = (const uint64_t *) rk; 31 | 32 | uint64_t t0, t1, u0, u1, k0, k1; 33 | 34 | t0 = ((const uint64_t *) pt)[0]; // get plaintext 35 | t1 = ((const uint64_t *) pt)[1]; 36 | 37 | k0 = kp[0]; // load first round 38 | k1 = kp[1]; 39 | t0 = t0 ^ k0; 40 | t1 = t1 ^ k1; 41 | 42 | SAES64_ENC_ROUND(u0, u1, t0, t1, 1); // 6 insn / round 43 | SAES64_ENC_ROUND(t0, t1, u0, u1, 2); 44 | SAES64_ENC_ROUND(u0, u1, t0, t1, 3); 45 | SAES64_ENC_ROUND(t0, t1, u0, u1, 4); 46 | SAES64_ENC_ROUND(u0, u1, t0, t1, 5); 47 | SAES64_ENC_ROUND(t0, t1, u0, u1, 6); 48 | SAES64_ENC_ROUND(u0, u1, t0, t1, 7); 49 | SAES64_ENC_ROUND(t0, t1, u0, u1, 8); 50 | SAES64_ENC_ROUND(u0, u1, t0, t1, 9); 51 | 52 | // In reality we would entirely inline these for all 128/192/256 versions 53 | 54 | if (nr >= 12) { // AES-192, AES-256 55 | SAES64_ENC_ROUND(t0, t1, u0, u1, 10); 56 | SAES64_ENC_ROUND(u0, u1, t0, t1, 11); 57 | if (nr > 12) { 58 | SAES64_ENC_ROUND(t0, t1, u0, u1, 12); 59 | SAES64_ENC_ROUND(u0, u1, t0, t1, 13); 60 | k0 = kp[2 * 14]; // AES-256 last round key 61 | k1 = kp[2 * 14 + 1]; 62 | } else { 63 | k0 = kp[2 * 12]; // AES-192 last round key 64 | k1 = kp[2 * 12 + 1]; 65 | } 66 | } else { 67 | k0 = kp[2 * 10]; // AES-128 last round key 68 | k1 = kp[2 * 10 + 1]; 69 | } 70 | 71 | t0 = saes64_encs(u0, u1); // Final round; ENCS not ENCSM 72 | t1 = saes64_encs(u1, u0); 73 | t0 = t0 ^ k0; // last round key 74 | t1 = t1 ^ k1; 75 | 76 | ((uint64_t *) ct)[0] = t0; // store ciphertext 77 | ((uint64_t *) ct)[1] = t1; 78 | } 79 | 80 | // Wrappers 81 | 82 | void aes128_enc_ecb_saes64(uint8_t ct[16], const uint8_t pt[16], 83 | const uint32_t rk[AES128_RK_WORDS]) 84 | { 85 | aes_enc_rounds_saes64(ct, pt, rk, AES128_ROUNDS); 86 | } 87 | 88 | void aes192_enc_ecb_saes64(uint8_t ct[16], const uint8_t pt[16], 89 | const uint32_t rk[AES192_RK_WORDS]) 90 | { 91 | aes_enc_rounds_saes64(ct, pt, rk, AES192_ROUNDS); 92 | } 93 | 94 | void aes256_enc_ecb_saes64(uint8_t ct[16], const uint8_t pt[16], 95 | const uint32_t rk[AES256_RK_WORDS]) 96 | { 97 | aes_enc_rounds_saes64(ct, pt, rk, AES256_ROUNDS); 98 | } 99 | 100 | // Key schedule for AES-128 Encryption. 
101 | // For each round 1 * SAES64.KS1, 2 * SAES64.KS2 and 2 * store 102 | 103 | #define SAES64_KEY128_STEP(i) { \ 104 | kp[2 * i] = k0; \ 105 | kp[2 * i + 1] = k1; \ 106 | ks = saes64_ks1(k1, i); \ 107 | k0 = saes64_ks2(ks, k0); \ 108 | k1 = saes64_ks2(k0, k1); } 109 | 110 | void aes128_enc_key_saes64(uint32_t rk[44], const uint8_t key[16]) 111 | { 112 | uint64_t *kp = (uint64_t *) rk; // key pointer 113 | uint64_t k0, k1, ks; 114 | 115 | k0 = get64u_le(key); // load secret key 116 | k1 = get64u_le(key + 8); 117 | SAES64_KEY128_STEP(0); // 5 insn each, unrolled 118 | SAES64_KEY128_STEP(1); 119 | SAES64_KEY128_STEP(2); 120 | SAES64_KEY128_STEP(3); 121 | SAES64_KEY128_STEP(4); 122 | SAES64_KEY128_STEP(5); 123 | SAES64_KEY128_STEP(6); 124 | SAES64_KEY128_STEP(7); 125 | SAES64_KEY128_STEP(8); 126 | SAES64_KEY128_STEP(9); // (10 steps, 10 rounds) 127 | kp[20] = k0; // last round key 128 | kp[21] = k1; 129 | } 130 | 131 | // Key schedule for AES-192 encryption. 132 | // For each 1.5 rounds 1 * SAES64.KS1, 3 * SAES64.KS2 and 3 * store 133 | 134 | #define SAES64_KEY192_STEP(i) { \ 135 | kp[3 * i] = k0; \ 136 | kp[3 * i + 1] = k1; \ 137 | kp[3 * i + 2] = k2; \ 138 | ks = saes64_ks1(k2, i); \ 139 | k0 = saes64_ks2(ks, k0); \ 140 | k1 = saes64_ks2(k0, k1); \ 141 | k2 = saes64_ks2(k1, k2); } 142 | 143 | void aes192_enc_key_saes64(uint32_t rk[52], const uint8_t key[24]) 144 | { 145 | uint64_t *kp = (uint64_t *) rk; // key pointer 146 | uint64_t k0, k1, k2, ks; 147 | 148 | k0 = get64u_le(key); // load secret key 149 | k1 = get64u_le(key + 8); 150 | k2 = get64u_le(key + 16); 151 | SAES64_KEY192_STEP(0); // two steps is 3 rounds 152 | SAES64_KEY192_STEP(1); // 14/3 = 4.7 insn/round 153 | SAES64_KEY192_STEP(2); 154 | SAES64_KEY192_STEP(3); 155 | SAES64_KEY192_STEP(4); 156 | SAES64_KEY192_STEP(5); 157 | SAES64_KEY192_STEP(6); 158 | kp[21] = k0; // last full state 159 | kp[22] = k1; 160 | kp[23] = k2; 161 | ks = saes64_ks1(k2, 7); // (8 steps, 12 rounds) 162 | k0 = saes64_ks2(ks, k0); 163 | k1 = saes64_ks2(k0, k1); // no need for k2 164 | kp[24] = k0; // last round key 165 | kp[25] = k1; 166 | } 167 | 168 | // Key schedule for AES-256 encryption. 169 | // For each 2 rounds: 2 * SAES64.KS1, 4 * SAES64.KS2 and 4 * store 170 | 171 | #define SAES64_KEY256_STEP(i) { \ 172 | kp[4 * i] = k0; \ 173 | kp[4 * i + 1] = k1; \ 174 | kp[4 * i + 2] = k2; \ 175 | kp[4 * i + 3] = k3; \ 176 | ks = saes64_ks1(k3, i); \ 177 | k0 = saes64_ks2(ks, k0); \ 178 | k1 = saes64_ks2(k0, k1); \ 179 | ks = saes64_ks1(k1, 10); \ 180 | k2 = saes64_ks2(ks, k2); \ 181 | k3 = saes64_ks2(k2, k3); } 182 | 183 | void aes256_enc_key_saes64(uint32_t rk[60], const uint8_t key[32]) 184 | { 185 | uint64_t *kp = (uint64_t *) rk; // key pointer 186 | uint64_t k0, k1, k2, k3, ks; 187 | 188 | k0 = get64u_le(key); // load secret key 189 | k1 = get64u_le(key + 8); 190 | k2 = get64u_le(key + 16); 191 | k3 = get64u_le(key + 24); 192 | SAES64_KEY256_STEP(0); // 1 steps is 2 rounds 193 | SAES64_KEY256_STEP(1); // 10/2 = 5 insn/round 194 | SAES64_KEY256_STEP(2); 195 | SAES64_KEY256_STEP(3); 196 | SAES64_KEY256_STEP(4); 197 | SAES64_KEY256_STEP(5); 198 | kp[24] = k0; // store last full state 199 | kp[25] = k1; 200 | kp[26] = k2; 201 | kp[27] = k3; 202 | ks = saes64_ks1(k3, 6); // no need for k2, k3 203 | k0 = saes64_ks2(ks, k0); 204 | k1 = saes64_ks2(k0, k1); 205 | kp[28] = k0; // store last round key 206 | kp[29] = k1; 207 | } 208 | 209 | // Decrypt rounds. 
Implements AES-128/192/256 depending on nr = {10,12,14} 210 | 211 | // Per round: 2 * load, 2 * XOR, 2 * DECSM 212 | 213 | #define SAES64_DEC_ROUND(r0, r1, s0, s1, i) { \ 214 | k0 = kp[2 * i + 2]; \ 215 | k1 = kp[2 * i + 3]; \ 216 | s0 = s0 ^ k0; \ 217 | s1 = s1 ^ k1; \ 218 | r0 = saes64_decsm(s0, s1); \ 219 | r1 = saes64_decsm(s1, s0); } 220 | 221 | 222 | void aes_dec_rounds_saes64(uint8_t pt[16], const uint8_t ct[16], 223 | const uint32_t rk[], int nr) 224 | { 225 | // key pointer (just a cast) 226 | const uint64_t *kp = (const uint64_t *) rk; 227 | 228 | uint64_t t0, t1, u0, u1, k0, k1; 229 | 230 | t0 = ((const uint64_t *) ct)[0]; // get ciphertext 231 | t1 = ((const uint64_t *) ct)[1]; 232 | 233 | // In reality we would entirely inline these for all 128/192/256 versions 234 | 235 | if (nr >= 12) { 236 | if (nr > 12) { // AES-256 237 | SAES64_DEC_ROUND(u0, u1, t0, t1, 13); 238 | SAES64_DEC_ROUND(t0, t1, u0, u1, 12); 239 | } // AES-192, AES-192 240 | SAES64_DEC_ROUND(u0, u1, t0, t1, 11); 241 | SAES64_DEC_ROUND(t0, t1, u0, u1, 10); 242 | } 243 | 244 | SAES64_DEC_ROUND(u0, u1, t0, t1, 9); // 6 insn / round 245 | SAES64_DEC_ROUND(t0, t1, u0, u1, 8); 246 | SAES64_DEC_ROUND(u0, u1, t0, t1, 7); 247 | SAES64_DEC_ROUND(t0, t1, u0, u1, 6); 248 | SAES64_DEC_ROUND(u0, u1, t0, t1, 5); 249 | SAES64_DEC_ROUND(t0, t1, u0, u1, 4); 250 | SAES64_DEC_ROUND(u0, u1, t0, t1, 3); 251 | SAES64_DEC_ROUND(t0, t1, u0, u1, 2); 252 | SAES64_DEC_ROUND(u0, u1, t0, t1, 1); 253 | 254 | k0 = kp[2]; // final decrypt round 255 | k1 = kp[3]; 256 | u0 = u0 ^ k0; 257 | u1 = u1 ^ k1; 258 | t0 = saes64_decs(u0, u1); // DECS instead of DECSM 259 | t1 = saes64_decs(u1, u0); 260 | k0 = kp[0]; // first round key 261 | k1 = kp[1]; 262 | t0 = t0 ^ k0; 263 | t1 = t1 ^ k1; 264 | 265 | ((uint64_t *) pt)[0] = t0; // store plaintext 266 | ((uint64_t *) pt)[1] = t1; 267 | 268 | return; 269 | 270 | } 271 | 272 | // Wrappers 273 | 274 | void aes128_dec_ecb_saes64(uint8_t pt[16], const uint8_t ct[16], 275 | const uint32_t rk[AES128_RK_WORDS]) 276 | { 277 | aes_dec_rounds_saes64(pt, ct, rk, AES128_ROUNDS); 278 | } 279 | 280 | void aes192_dec_ecb_saes64(uint8_t pt[16], const uint8_t ct[16], 281 | const uint32_t rk[AES192_RK_WORDS]) 282 | { 283 | aes_dec_rounds_saes64(pt, ct, rk, AES192_ROUNDS); 284 | } 285 | 286 | void aes256_dec_ecb_saes64(uint8_t pt[16], const uint8_t ct[16], 287 | const uint32_t rk[AES256_RK_WORDS]) 288 | { 289 | aes_dec_rounds_saes64(pt, ct, rk, AES256_ROUNDS); 290 | } 291 | 292 | // Helper: apply inverse mixcolumns to a vector 293 | 294 | static inline void saes64_dec_invmc(uint64_t * v, size_t len) 295 | { 296 | size_t i; 297 | 298 | for (i = 0; i < len; i++) { 299 | v[i] = saes64_imix(v[i]); 300 | } 301 | } 302 | 303 | // Key schedule for AES-128 decryption. 304 | 305 | void aes128_dec_key_saes64(uint32_t rk[44], const uint8_t key[16]) 306 | { 307 | // create an encryption key and modify middle rounds 308 | aes128_enc_key_saes64(rk, key); 309 | saes64_dec_invmc(((uint64_t *) rk) + 2, AES128_RK_WORDS / 2 - 4); 310 | } 311 | 312 | // Key schedule for AES-192 decryption. 313 | 314 | void aes192_dec_key_saes64(uint32_t rk[52], const uint8_t key[24]) 315 | { 316 | // create an encryption key and modify middle rounds 317 | aes192_enc_key_saes64(rk, key); 318 | saes64_dec_invmc(((uint64_t *) rk) + 2, AES192_RK_WORDS / 2 - 4); 319 | } 320 | 321 | // Key schedule for AES-256 decryption. 
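// (This is the "equivalent inverse cipher" construction of FIPS 197:
// running InvMixColumns over the middle round keys -- via the
// saes64_imix() helper above -- lets decryption reuse the same round
// structure as encryption, with the round keys consumed in reverse.)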
322 | 323 | void aes256_dec_key_saes64(uint32_t rk[60], const uint8_t key[32]) 324 | { 325 | // create an encryption key and modify middle rounds 326 | aes256_enc_key_saes64(rk, key); 327 | saes64_dec_invmc(((uint64_t *) rk) + 2, AES256_RK_WORDS / 2 - 4); 328 | } 329 | -------------------------------------------------------------------------------- /aes_saes64.h: -------------------------------------------------------------------------------- 1 | // aes_saes64.h 2 | // 2020-05-05 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // Implementation prototypes for aes_saes64.c 6 | 7 | #ifndef _AES_SAES64_H_ 8 | #define _AES_SAES64_H_ 9 | 10 | #include 11 | 12 | // Set encryption key 13 | 14 | void aes128_enc_key_saes64(uint32_t rk[AES128_RK_WORDS], 15 | const uint8_t key[16]); 16 | 17 | void aes192_enc_key_saes64(uint32_t rk[AES192_RK_WORDS], 18 | const uint8_t key[24]); 19 | 20 | void aes256_enc_key_saes64(uint32_t rk[AES256_RK_WORDS], 21 | const uint8_t key[32]); 22 | 23 | // Encrypt a block 24 | 25 | void aes128_enc_ecb_saes64(uint8_t ct[16], const uint8_t pt[16], 26 | const uint32_t rk[AES128_RK_WORDS]); 27 | 28 | void aes192_enc_ecb_saes64(uint8_t ct[16], const uint8_t pt[16], 29 | const uint32_t rk[AES192_RK_WORDS]); 30 | 31 | void aes256_enc_ecb_saes64(uint8_t ct[16], const uint8_t pt[16], 32 | const uint32_t rk[AES256_RK_WORDS]); 33 | 34 | 35 | // Set decryption key 36 | 37 | void aes128_dec_key_saes64(uint32_t rk[AES128_RK_WORDS], 38 | const uint8_t key[16]); 39 | 40 | void aes192_dec_key_saes64(uint32_t rk[AES192_RK_WORDS], 41 | const uint8_t key[24]); 42 | 43 | void aes256_dec_key_saes64(uint32_t rk[AES256_RK_WORDS], 44 | const uint8_t key[32]); 45 | 46 | // Decrypt a block 47 | 48 | void aes128_dec_ecb_saes64(uint8_t pt[16], const uint8_t ct[16], 49 | const uint32_t rk[AES128_RK_WORDS]); 50 | 51 | void aes192_dec_ecb_saes64(uint8_t pt[16], const uint8_t ct[16], 52 | const uint32_t rk[AES192_RK_WORDS]); 53 | 54 | void aes256_dec_ecb_saes64(uint8_t pt[16], const uint8_t ct[16], 55 | const uint32_t rk[AES256_RK_WORDS]); 56 | 57 | #endif // _AES_SAES64_H_ 58 | -------------------------------------------------------------------------------- /aes_test.c: -------------------------------------------------------------------------------- 1 | // aes_test.c 2 | // 2020-03-21 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 
4 | 5 | // Unit tests for AES-128/192/256 6 | 7 | #include "test_hex.h" 8 | #include "aes_wrap.h" 9 | 10 | // Test AES 11 | 12 | int test_aes() 13 | { 14 | uint8_t pt[16], ct[16], xt[16], key[32]; 15 | uint32_t rk[AES256_RK_WORDS]; 16 | int fail = 0; 17 | 18 | // FIPS 197 test vectors 19 | readhex(pt, sizeof(pt), "00112233445566778899AABBCCDDEEFF"); 20 | readhex(key, sizeof(key), 21 | "000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F"); 22 | aes128_enc_key(rk, key); 23 | aes128_enc_ecb(ct, pt, rk); 24 | 25 | fail += chkhex("AES-128 Enc", ct, 16, "69C4E0D86A7B0430D8CDB78070B4C55A"); 26 | aes128_dec_key(rk, key); 27 | aes128_dec_ecb(xt, ct, rk); 28 | fail += chkhex("AES-128 Dec", xt, 16, "00112233445566778899AABBCCDDEEFF"); 29 | 30 | aes192_enc_key(rk, key); 31 | aes192_enc_ecb(ct, pt, rk); 32 | fail += chkhex("AES-192 Enc", ct, 16, "DDA97CA4864CDFE06EAF70A0EC0D7191"); 33 | 34 | aes192_dec_key(rk, key); 35 | aes192_dec_ecb(xt, ct, rk); 36 | fail += chkhex("AES-192 Dec", xt, 16, "00112233445566778899AABBCCDDEEFF"); 37 | 38 | aes256_enc_key(rk, key); 39 | aes256_enc_ecb(ct, pt, rk); 40 | fail += chkhex("AES-256 Enc", ct, 16, "8EA2B7CA516745BFEAFC49904B496089"); 41 | 42 | aes256_dec_key(rk, key); 43 | aes256_dec_ecb(xt, ct, rk); 44 | fail += chkhex("AES-256 Dec", xt, 16, "00112233445566778899AABBCCDDEEFF"); 45 | 46 | // another test vector set (picked from SP800-38A) 47 | readhex(key, sizeof(key), "2B7E151628AED2A6ABF7158809CF4F3C"); 48 | aes128_enc_key(rk, key); 49 | readhex(pt, sizeof(pt), "6BC1BEE22E409F96E93D7E117393172A"); 50 | aes128_enc_ecb(ct, pt, rk); 51 | fail += chkhex("AES-128 Enc", ct, 16, "3AD77BB40D7A3660A89ECAF32466EF97"); 52 | 53 | aes128_dec_key(rk, key); 54 | aes128_dec_ecb(xt, ct, rk); 55 | fail += chkhex("AES-128 Dec", xt, 16, "6BC1BEE22E409F96E93D7E117393172A"); 56 | 57 | readhex(key, sizeof(key), 58 | "8E73B0F7DA0E6452C810F32B809079E562F8EAD2522C6B7B"); 59 | aes192_enc_key(rk, key); 60 | readhex(pt, sizeof(pt), "AE2D8A571E03AC9C9EB76FAC45AF8E51"); 61 | aes192_enc_ecb(ct, pt, rk); 62 | fail += chkhex("AES-192 Enc", ct, 16, "974104846D0AD3AD7734ECB3ECEE4EEF"); 63 | 64 | aes192_dec_key(rk, key); 65 | aes192_dec_ecb(xt, ct, rk); 66 | fail += chkhex("AES-192 Dec", xt, 16, "AE2D8A571E03AC9C9EB76FAC45AF8E51"); 67 | 68 | readhex(key, sizeof(key), 69 | "603DEB1015CA71BE2B73AEF0857D77811F352C073B6108D72D9810A30914DFF4"); 70 | aes256_enc_key(rk, key); 71 | readhex(pt, sizeof(pt), "30C81C46A35CE411E5FBC1191A0A52EF"); 72 | aes256_enc_ecb(ct, pt, rk); 73 | fail += chkhex("AES-256 Enc", ct, 16, "B6ED21B99CA6F4F9F153E7B1BEAFED1D"); 74 | 75 | aes256_dec_key(rk, key); 76 | aes256_dec_ecb(xt, ct, rk); 77 | fail += chkhex("AES-256 Dec", xt, 16, "30C81C46A35CE411E5FBC1191A0A52EF"); 78 | 79 | return fail; 80 | } 81 | -------------------------------------------------------------------------------- /aes_wrap.c: -------------------------------------------------------------------------------- 1 | // aes_wrap.c 2 | // 2020-04-23 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 
4 | 5 | // AES 128/192/256 block encryption and decryption 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #include "aes_wrap.h" 11 | #include "aes_saes32.h" 12 | 13 | static void key_undef(uint32_t * rk, const uint8_t * key) 14 | { 15 | (void) rk; 16 | (void) key; 17 | 18 | fprintf(stderr, "[DEAD] key_undef()\n"); 19 | abort(); 20 | } 21 | 22 | static void ciph_undef(uint8_t * d, const uint8_t * s, const uint32_t * rk) 23 | { 24 | (void) d; 25 | (void) s; 26 | (void) rk; 27 | 28 | fprintf(stderr, "[DEAD] ciph_undef()\n"); 29 | abort(); 30 | } 31 | 32 | // == Externally visible pointers == 33 | 34 | // Set encryption key 35 | 36 | void (*aes128_enc_key)(uint32_t rk[AES128_RK_WORDS], 37 | const uint8_t key[16]) = key_undef; 38 | 39 | void (*aes192_enc_key)(uint32_t rk[AES192_RK_WORDS], 40 | const uint8_t key[24]) = key_undef; 41 | 42 | void (*aes256_enc_key)(uint32_t rk[AES256_RK_WORDS], 43 | const uint8_t key[32]) = key_undef; 44 | 45 | // Encrypt a block 46 | 47 | 48 | void (*aes128_enc_ecb)(uint8_t ct[16], const uint8_t pt[16], 49 | const uint32_t rk[AES128_RK_WORDS]) = ciph_undef; 50 | 51 | void (*aes192_enc_ecb)(uint8_t ct[16], const uint8_t pt[16], 52 | const uint32_t rk[AES192_RK_WORDS]) = ciph_undef; 53 | 54 | void (*aes256_enc_ecb)(uint8_t ct[16], const uint8_t pt[16], 55 | const uint32_t rk[AES256_RK_WORDS]) = ciph_undef; 56 | 57 | // Set decryption key 58 | 59 | void (*aes128_dec_key)(uint32_t rk[AES128_RK_WORDS], 60 | const uint8_t key[16]) = key_undef; 61 | void (*aes192_dec_key)(uint32_t rk[AES192_RK_WORDS], 62 | const uint8_t key[24]) = key_undef; 63 | void (*aes256_dec_key)(uint32_t rk[AES256_RK_WORDS], 64 | const uint8_t key[32]) = key_undef; 65 | 66 | // Decrypt a block 67 | 68 | void (*aes128_dec_ecb)(uint8_t pt[16], const uint8_t ct[16], 69 | const uint32_t rk[AES128_RK_WORDS]) = ciph_undef; 70 | 71 | void (*aes192_dec_ecb)(uint8_t pt[16], const uint8_t ct[16], 72 | const uint32_t rk[AES192_RK_WORDS]) = ciph_undef; 73 | 74 | void (*aes256_dec_ecb)(uint8_t pt[16], const uint8_t ct[16], 75 | const uint32_t rk[AES256_RK_WORDS]) = ciph_undef; 76 | -------------------------------------------------------------------------------- /aes_wrap.h: -------------------------------------------------------------------------------- 1 | // aes_wrap.h 2 | // 2019-10-23 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2019, PQShield Ltd. All rights reserved. 4 | 5 | // Wrapper for AES 128/192/256 block encryption and decryption. 6 | // These provide function pointers to the UUT (unit under test).
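// Usage sketch (illustrative only; assumes the pointers declared below
// have already been bound to an implementation):
//
//   uint32_t rk[AES128_RK_WORDS];
//   uint8_t ct[16];
//   aes128_enc_key(rk, key);     // one-time key expansion
//   aes128_enc_ecb(ct, pt, rk);  // encrypt a single 16-byte block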
7 | 8 | #ifndef _AES_WRAP_H_ 9 | #define _AES_WRAP_H_ 10 | 11 | #include <stdint.h> 12 | 13 | // number of rounds 14 | #define AES128_ROUNDS 10 15 | #define AES192_ROUNDS 12 16 | #define AES256_ROUNDS 14 17 | 18 | // expanded key size 19 | #define AES128_RK_WORDS (4 * (AES128_ROUNDS + 1)) 20 | #define AES192_RK_WORDS (4 * (AES192_ROUNDS + 1)) 21 | #define AES256_RK_WORDS (4 * (AES256_ROUNDS + 1)) 22 | 23 | // Set encryption key 24 | 25 | extern void (*aes128_enc_key)(uint32_t rk[AES128_RK_WORDS], 26 | const uint8_t key[16]); 27 | 28 | extern void (*aes192_enc_key)(uint32_t rk[AES192_RK_WORDS], 29 | const uint8_t key[24]); 30 | 31 | extern void (*aes256_enc_key)(uint32_t rk[AES256_RK_WORDS], 32 | const uint8_t key[32]); 33 | 34 | // Encrypt a block 35 | 36 | 37 | extern void (*aes128_enc_ecb)(uint8_t ct[16], const uint8_t pt[16], 38 | const uint32_t rk[AES128_RK_WORDS]); 39 | 40 | extern void (*aes192_enc_ecb)(uint8_t ct[16], const uint8_t pt[16], 41 | const uint32_t rk[AES192_RK_WORDS]); 42 | 43 | extern void (*aes256_enc_ecb)(uint8_t ct[16], const uint8_t pt[16], 44 | const uint32_t rk[AES256_RK_WORDS]); 45 | 46 | // Set decryption key 47 | 48 | extern void (*aes128_dec_key)(uint32_t rk[AES128_RK_WORDS], 49 | const uint8_t key[16]); 50 | extern void (*aes192_dec_key)(uint32_t rk[AES192_RK_WORDS], 51 | const uint8_t key[24]); 52 | extern void (*aes256_dec_key)(uint32_t rk[AES256_RK_WORDS], 53 | const uint8_t key[32]); 54 | 55 | // Decrypt a block 56 | 57 | extern void (*aes128_dec_ecb)(uint8_t pt[16], const uint8_t ct[16], 58 | const uint32_t rk[AES128_RK_WORDS]); 59 | 60 | extern void (*aes192_dec_ecb)(uint8_t pt[16], const uint8_t ct[16], 61 | const uint32_t rk[AES192_RK_WORDS]); 62 | 63 | extern void (*aes256_dec_ecb)(uint8_t pt[16], const uint8_t ct[16], 64 | const uint32_t rk[AES256_RK_WORDS]); 65 | 66 | #endif // _AES_WRAP_H_ 67 | -------------------------------------------------------------------------------- /asm/README.md: -------------------------------------------------------------------------------- 1 | # Assembler AES / SM4 using SAES32 2 | 3 | 2020-02-16 Markku-Juhani O. Saarinen 4 | 5 | Assembler implementations of the AES and SM4 block ciphers using the 6 | SAES32 instructions -- these have the same prototypes and features as the 7 | C-language APIs (see parent), so the same unit tests work too. 8 | 9 | The functions assume word-aligned input. Typically such low-level "ECB" 10 | primitives do not work directly on plaintext or ciphertext but are 11 | wrapped in a function that implements an encryption mode such as 12 | CTR, CCM, SIV, or GCM and operates on buffers provided by the wrapper. 13 | 14 | This is definitely not the prettiest way of using the (custom-0) SAES32 15 | instructions; hacky macros in [saes32_c0.h](saes32_c0.h) are used for 16 | encoding. Requires the C preprocessor; tested with RISC-V GCC 9.2.0. 17 | 18 | Cheers, 19 | - markku 20 | -------------------------------------------------------------------------------- /asm/saes32_c0.h: -------------------------------------------------------------------------------- 1 | // saes32_c0.h 2 | // 2020-02-16 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // Raw encoding macros for ENC1S as custom-0 -- pretty ugly.
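// The cust0r macro below packs a standard RISC-V R-type word: opcode
// custom-0 (0x0B) in bits 6:0, rd in bits 11:7, funct3 in bits 14:12,
// rs1 in bits 19:15, rs2 in bits 24:20, and funct7 in bits 31:25.
// Worked example (hand-checked, for illustration):
//
//   saes32_encsm a0, a0, a1, 0   // fn3 = 0, fn7 = 0, rd = rs1 = 10, rs2 = 11
//                                // emits .word 0x00B5050B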
6 | 7 | #ifndef _SAES32_C0_H_ 8 | #define _SAES32_C0_H_ 9 | 10 | // custom-0 r-type instruction encoding macro 11 | 12 | .macro cust0r fn3, fn7, rd, rs1, rs2 13 | .word(0x0B + ((\fn3) << 12) + ((\fn7) << 25) + ((\rd) << 7) + ((\rs1) << 15) + ((\rs2) << 20)) 14 | .endm 15 | 16 | 17 | // function codes 18 | #define SAES32_ENCSM_FN 0 19 | #define SAES32_ENCS_FN 1 20 | #define SAES32_DECSM_FN 2 21 | #define SAES32_DECS_FN 3 22 | #define SSM4_ED_FN 4 23 | #define SSM4_KS_FN 5 24 | 25 | // SAES32 as funct3=0 -- with a fn in funct7 26 | 27 | .macro saes32 rd, rs1, rs2, fn 28 | cust0r 0, \fn, \rd, \rs1, \rs2 29 | .endm 30 | 31 | // Pseudo-ops for AES and SM4 32 | 33 | .macro saes32_encsm rd, rs1, rs2, bs 34 | saes32 \rd, \rs1, \rs2, ((SAES32_ENCSM_FN << 2) | (\bs)) 35 | .endm 36 | 37 | .macro saes32_encs rd, rs1, rs2, bs 38 | saes32 \rd, \rs1, \rs2, ((SAES32_ENCS_FN << 2) | (\bs)) 39 | .endm 40 | 41 | .macro saes32_decsm rd, rs1, rs2, bs 42 | saes32 \rd, \rs1, \rs2, ((SAES32_DECSM_FN << 2) | (\bs)) 43 | .endm 44 | 45 | .macro saes32_decs rd, rs1, rs2, bs 46 | saes32 \rd, \rs1, \rs2, ((SAES32_DECS_FN << 2) | (\bs)) 47 | .endm 48 | 49 | 50 | .macro ssm4_ed rd, rs1, rs2, bs 51 | saes32 \rd, \rs1, \rs2, ((SSM4_ED_FN << 2) | (\bs)) 52 | .endm 53 | 54 | .macro ssm4_ks rd, rs1, rs2, bs 55 | saes32 \rd, \rs1, \rs2, ((SSM4_KS_FN << 2) | (\bs)) 56 | .endm 57 | 58 | 59 | // numbered registers 60 | #define X0 0 61 | #define RA 1 62 | #define SP 2 63 | #define GP 3 64 | #define TP 4 65 | #define T0 5 66 | #define T1 6 67 | #define T2 7 68 | #define S0 8 69 | #define S1 9 70 | #define A0 10 71 | #define A1 11 72 | #define A2 12 73 | #define A3 13 74 | #define A4 14 75 | #define A5 15 76 | #define A6 16 77 | #define A7 17 78 | #define S2 18 79 | #define S3 19 80 | #define S4 20 81 | #define S5 21 82 | #define S6 22 83 | #define S7 23 84 | #define S8 24 85 | #define S9 25 86 | #define S10 26 87 | #define S11 27 88 | #define T3 28 89 | #define T4 29 90 | #define T5 30 91 | #define T6 31 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /asm/saes32_dec.S: -------------------------------------------------------------------------------- 1 | // saes32_dec.S 2 | // 2020-02-16 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // AES Decryption and Key Schedule using "lwaes" instructions. 6 | 7 | // macro definitions for the custom instruction 8 | #include "saes32_c0.h" 9 | 10 | .option nopic 11 | .text 12 | .align 2 13 | 14 | 15 | // Decrypt rounds. 
Implements AES-128/192/256 depending on nr = {10,12,14} 16 | // void aes_dec_rounds(uint8_t pt[16], const uint8_t ct[16], 17 | // const uint32_t rk[], int nr) 18 | // where: a0 = pt, a1 = ct, a2 = rk, a3 = nr 19 | 20 | .globl aes_dec_rounds 21 | .type aes_dec_rounds, @function 22 | 23 | aes_dec_rounds: 24 | 25 | lw t4, 0(a1) // load ct 26 | lw t5, 4(a1) 27 | lw t6, 8(a1) 28 | lw a7, 12(a1) 29 | 30 | slli a3, a3, 4 // final pointer 31 | add a3, a3, a2 32 | 33 | lw t0, 0(a3) // load rk 34 | lw t1, 4(a3) 35 | lw t2, 8(a3) 36 | lw t3, 12(a3) 37 | 38 | xor t0, t0, t4 // ct ^ rk 39 | xor t1, t1, t5 40 | xor t2, t2, t6 41 | xor t3, t3, a7 42 | 43 | j .ent // enter loop in middle 44 | 45 | .loop: 46 | // even round 47 | saes32_decsm T0, T0, T4, 0 48 | saes32_decsm T0, T0, A7, 1 49 | saes32_decsm T0, T0, T6, 2 50 | saes32_decsm T0, T0, T5, 3 51 | 52 | saes32_decsm T1, T1, T5, 0 53 | saes32_decsm T1, T1, T4, 1 54 | saes32_decsm T1, T1, A7, 2 55 | saes32_decsm T1, T1, T6, 3 56 | 57 | saes32_decsm T2, T2, T6, 0 58 | saes32_decsm T2, T2, T5, 1 59 | saes32_decsm T2, T2, T4, 2 60 | saes32_decsm T2, T2, A7, 3 61 | 62 | saes32_decsm T3, T3, A7, 0 63 | saes32_decsm T3, T3, T6, 1 64 | saes32_decsm T3, T3, T5, 2 65 | saes32_decsm T3, T3, T4, 3 66 | 67 | .ent: 68 | addi a3, a3, -32 69 | lw t4, 16(a3) // load round key 70 | lw t5, 20(a3) 71 | lw t6, 24(a3) 72 | lw a7, 28(a3) 73 | 74 | // odd round 75 | saes32_decsm T4, T4, T0, 0 76 | saes32_decsm T4, T4, T3, 1 77 | saes32_decsm T4, T4, T2, 2 78 | saes32_decsm T4, T4, T1, 3 79 | 80 | saes32_decsm T5, T5, T1, 0 81 | saes32_decsm T5, T5, T0, 1 82 | saes32_decsm T5, T5, T3, 2 83 | saes32_decsm T5, T5, T2, 3 84 | 85 | saes32_decsm T6, T6, T2, 0 86 | saes32_decsm T6, T6, T1, 1 87 | saes32_decsm T6, T6, T0, 2 88 | saes32_decsm T6, T6, T3, 3 89 | 90 | saes32_decsm A7, A7, T3, 0 91 | saes32_decsm A7, A7, T2, 1 92 | saes32_decsm A7, A7, T1, 2 93 | saes32_decsm A7, A7, T0, 3 94 | 95 | lw t0, 0(a3) // load round key 96 | lw t1, 4(a3) 97 | lw t2, 8(a3) 98 | lw t3, 12(a3) 99 | bne a3, a2, .loop 100 | 101 | // final (output) round 102 | saes32_decs T0, T0, T4, 0 103 | saes32_decs T0, T0, A7, 1 104 | saes32_decs T0, T0, T6, 2 105 | saes32_decs T0, T0, T5, 3 106 | 107 | saes32_decs T1, T1, T5, 0 108 | saes32_decs T1, T1, T4, 1 109 | saes32_decs T1, T1, A7, 2 110 | saes32_decs T1, T1, T6, 3 111 | 112 | saes32_decs T2, T2, T6, 0 113 | saes32_decs T2, T2, T5, 1 114 | saes32_decs T2, T2, T4, 2 115 | saes32_decs T2, T2, A7, 3 116 | 117 | saes32_decs T3, T3, A7, 0 118 | saes32_decs T3, T3, T6, 1 119 | saes32_decs T3, T3, T5, 2 120 | saes32_decs T3, T3, T4, 3 121 | 122 | sw t0, 0(a0) // store pt 123 | sw t1, 4(a0) 124 | sw t2, 8(a0) 125 | sw t3, 12(a0) 126 | 127 | jr ra 128 | .size aes_dec_rounds, .-aes_dec_rounds 129 | 130 | // Helper: apply inverse mixcolumns to a vector 131 | 132 | .type .invmc, @function 133 | 134 | .invmc: 135 | lw t0, 0(a0) 136 | 137 | saes32_encs T1, 0, T0, 0 138 | saes32_encs T1, T1, T0, 1 139 | saes32_encs T1, T1, T0, 2 140 | saes32_encs T1, T1, T0, 3 141 | 142 | saes32_decsm T0, 0, T1, 0 143 | saes32_decsm T0, T0, T1, 1 144 | saes32_decsm T0, T0, T1, 2 145 | saes32_decsm T0, T0, T1, 3 146 | 147 | sw t0, 0(a0) 148 | addi a0, a0, 4 149 | bne a0, a1, .invmc 150 | jr ra 151 | 152 | .size .invmc, .-.invmc 153 | .align 2 154 | 155 | 156 | // Key schedule for AES-128 decryption. 
157 | // void aes128_dec_key(uint32_t rk[44], const uint8_t key[16]); 158 | // where: a0 = rk, a1 = key 159 | 160 | .globl aes128_dec_key 161 | .type aes128_dec_key, @function 162 | 163 | aes128_dec_key: 164 | addi sp, sp, -8 165 | sw s0, 0(sp) 166 | sw ra, 4(sp) 167 | mv s0, a0 168 | call aes128_enc_key 169 | addi a0, s0, 16 170 | lw s0, 0(sp) 171 | lw ra, 4(sp) 172 | addi a1, a0, 4 * 36 173 | addi sp, sp, 8 174 | tail .invmc 175 | 176 | .size aes128_dec_key, .-aes128_dec_key 177 | .align 2 178 | 179 | 180 | // Key schedule for AES-192 decryption. 181 | // void aes192_dec_key(uint32_t rk[52], const uint8_t key[24]); 182 | // where: a0 = rk, a1 = key 183 | 184 | .globl aes192_dec_key 185 | .type aes192_dec_key, @function 186 | 187 | aes192_dec_key: 188 | addi sp, sp, -8 189 | sw s0, 0(sp) 190 | sw ra, 4(sp) 191 | mv s0, a0 192 | call aes192_enc_key 193 | addi a0, s0, 16 194 | lw s0, 0(sp) 195 | lw ra, 4(sp) 196 | addi a1, a0, 4 * 44 197 | addi sp, sp, 8 198 | tail .invmc 199 | 200 | .size aes192_dec_key, .-aes192_dec_key 201 | .align 2 202 | 203 | 204 | // Key schedule for AES-256 decryption. 205 | // void aes256_dec_key(uint32_t rk[60], const uint8_t key[32]); 206 | // where: a0 = rk, a1 = key 207 | 208 | .globl aes256_dec_key 209 | .type aes256_dec_key, @function 210 | 211 | aes256_dec_key: 212 | addi sp, sp, -8 213 | sw s0, 0(sp) 214 | sw ra, 4(sp) 215 | mv s0, a0 216 | call aes256_enc_key 217 | addi a0, s0, 16 218 | lw s0, 0(sp) 219 | lw ra, 4(sp) 220 | addi a1, a0, 4 * 52 221 | addi sp, sp, 8 222 | tail .invmc 223 | 224 | .size aes256_dec_key, .-aes256_dec_key 225 | 226 | -------------------------------------------------------------------------------- /asm/saes32_enc.S: -------------------------------------------------------------------------------- 1 | // saes32_enc.S 2 | // 2020-02-16 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // AES Encryption and Key Schedule using "lwaes" instructions. 6 | 7 | // macro definitions for the custom instruction 8 | #include "saes32_c0.h" 9 | 10 | .option nopic 11 | .text 12 | .align 2 13 | 14 | 15 | // Encrypt rounds. 
Implements AES-128/192/256 depending on nr = {10,12,14} 16 | // void aes_enc_rounds(uint8_t ct[16], const uint8_t pt[16], 17 | // const uint32_t rk[], int nr) 18 | // where: a0 = ct, a1 = pt, a2 = rk, a3 = nr 19 | 20 | .globl aes_enc_rounds 21 | .type aes_enc_rounds, @function 22 | 23 | aes_enc_rounds: 24 | 25 | lw t4, 0(a1) // load pt 26 | lw t5, 4(a1) 27 | lw t6, 8(a1) 28 | lw a7, 12(a1) 29 | 30 | lw t0, 0(a2) // load rk 31 | lw t1, 4(a2) 32 | lw t2, 8(a2) 33 | lw t3, 12(a2) 34 | 35 | xor t0, t0, t4 // pt ^ rk 36 | xor t1, t1, t5 37 | xor t2, t2, t6 38 | xor t3, t3, a7 39 | 40 | slli a3, a3, 4 // final pointer 41 | add a3, a3, a2 42 | 43 | j .ent // enter loop in the middle 44 | 45 | .loop: 46 | // odd round 47 | saes32_encsm T0, T0, T4, 0 48 | saes32_encsm T0, T0, T5, 1 49 | saes32_encsm T0, T0, T6, 2 50 | saes32_encsm T0, T0, A7, 3 51 | 52 | saes32_encsm T1, T1, T5, 0 53 | saes32_encsm T1, T1, T6, 1 54 | saes32_encsm T1, T1, A7, 2 55 | saes32_encsm T1, T1, T4, 3 56 | 57 | saes32_encsm T2, T2, T6, 0 58 | saes32_encsm T2, T2, A7, 1 59 | saes32_encsm T2, T2, T4, 2 60 | saes32_encsm T2, T2, T5, 3 61 | 62 | saes32_encsm T3, T3, A7, 0 63 | saes32_encsm T3, T3, T4, 1 64 | saes32_encsm T3, T3, T5, 2 65 | saes32_encsm T3, T3, T6, 3 66 | 67 | .ent: 68 | lw t4, 16(a2) // load round key 69 | lw t5, 20(a2) 70 | lw t6, 24(a2) 71 | lw a7, 28(a2) 72 | 73 | // even round 74 | saes32_encsm T4, T4, T0, 0 75 | saes32_encsm T4, T4, T1, 1 76 | saes32_encsm T4, T4, T2, 2 77 | saes32_encsm T4, T4, T3, 3 78 | 79 | saes32_encsm T5, T5, T1, 0 80 | saes32_encsm T5, T5, T2, 1 81 | saes32_encsm T5, T5, T3, 2 82 | saes32_encsm T5, T5, T0, 3 83 | 84 | saes32_encsm T6, T6, T2, 0 85 | saes32_encsm T6, T6, T3, 1 86 | saes32_encsm T6, T6, T0, 2 87 | saes32_encsm T6, T6, T1, 3 88 | 89 | saes32_encsm A7, A7, T3, 0 90 | saes32_encsm A7, A7, T0, 1 91 | saes32_encsm A7, A7, T1, 2 92 | saes32_encsm A7, A7, T2, 3 93 | 94 | addi a2, a2, 32 95 | lw t0, 0(a2) // load round key 96 | lw t1, 4(a2) 97 | lw t2, 8(a2) 98 | lw t3, 12(a2) 99 | bne a3, a2, .loop 100 | 101 | // final (output) round 102 | saes32_encs T0, T0, T4, 0 103 | saes32_encs T0, T0, T5, 1 104 | saes32_encs T0, T0, T6, 2 105 | saes32_encs T0, T0, A7, 3 106 | 107 | saes32_encs T1, T1, T5, 0 108 | saes32_encs T1, T1, T6, 1 109 | saes32_encs T1, T1, A7, 2 110 | saes32_encs T1, T1, T4, 3 111 | 112 | saes32_encs T2, T2, T6, 0 113 | saes32_encs T2, T2, A7, 1 114 | saes32_encs T2, T2, T4, 2 115 | saes32_encs T2, T2, T5, 3 116 | 117 | saes32_encs T3, T3, A7, 0 118 | saes32_encs T3, T3, T4, 1 119 | saes32_encs T3, T3, T5, 2 120 | saes32_encs T3, T3, T6, 3 121 | 122 | sw t0, 0(a0) // store ct 123 | sw t1, 4(a0) 124 | sw t2, 8(a0) 125 | sw t3, 12(a0) 126 | 127 | jr ra 128 | .size aes_enc_rounds, .-aes_enc_rounds 129 | 130 | 131 | // Key schedule for AES-128 Encryption. 
132 | // void aes128_enc_key(uint32_t rk[44], const uint8_t key[16]) 133 | // where: a0 = rk, a1 = key 134 | 135 | .align 2 136 | .globl aes128_enc_key 137 | .type aes128_enc_key, @function 138 | 139 | aes128_enc_key: 140 | 141 | lui a2, %hi(.rcon) // rcon pointer 142 | addi a2, a2, %lo(.rcon) 143 | 144 | addi a3, a0, 16 * 10 // end pointer 145 | 146 | lw t0, 0(a1) // input key 147 | lw t1, 4(a1) 148 | lw t2, 8(a1) 149 | lw t3, 12(a1) 150 | 151 | sw t0, 0(a0) // first round key 152 | sw t1, 4(a0) 153 | sw t2, 8(a0) 154 | sw t3, 12(a0) 155 | 156 | xori t0, t0, 1 // first round constant 157 | j .nl0 158 | 159 | .ekl0: 160 | addi a2, a2, 1 161 | lbu a1, 0(a2) // round constant 162 | xor t0, a1, t0 163 | 164 | .nl0: 165 | slli a4, t3, 24 // rotate 166 | srli a1, t3, 8 167 | or a1, a1, a4 168 | 169 | saes32_encs T0, T0, A1, 0 170 | saes32_encs T0, T0, A1, 1 171 | saes32_encs T0, T0, A1, 2 172 | saes32_encs T0, T0, A1, 3 173 | 174 | xor t1, t1, t0 175 | xor t2, t2, t1 176 | xor t3, t3, t2 177 | 178 | addi a0, a0, 16 179 | sw t0, 0(a0) // store round key 180 | sw t1, 4(a0) 181 | sw t2, 8(a0) 182 | sw t3, 12(a0) 183 | 184 | bne a0, a3, .ekl0 185 | 186 | jr ra 187 | .size aes128_enc_key, .-aes128_enc_key 188 | 189 | 190 | // Key schedule for AES-192 Encryption. 191 | // void aes192_enc_key(uint32_t rk[52], const uint8_t key[24]) 192 | // where: a0 = rk, a1 = key 193 | 194 | .align 2 195 | .globl aes192_enc_key 196 | .type aes192_enc_key, @function 197 | 198 | aes192_enc_key: 199 | 200 | lui a2, %hi(.rcon) // rcon pointer 201 | addi a2, a2, %lo(.rcon) 202 | 203 | addi a3, a0, 16 * 12 // end pointer 204 | 205 | lw t0, 0(a1) // input key 206 | lw t1, 4(a1) 207 | lw t2, 8(a1) 208 | lw t3, 12(a1) 209 | lw t4, 16(a1) 210 | lw t5, 20(a1) 211 | 212 | sw t0, 0(a0) // first round key 213 | sw t1, 4(a0) 214 | sw t2, 8(a0) 215 | sw t3, 12(a0) 216 | 217 | xori t0, t0, 1 // first round constant 218 | j .nl1 219 | 220 | .ekl1: addi a2, a2, 1 221 | lbu a1, 0(a2) // round constant 222 | xor t0, a1, t0 223 | 224 | .nl1: sw t4, 16(a0) // high part of round key 225 | sw t5, 20(a0) 226 | 227 | slli a4, t5, 24 // rotate 228 | srli a1, t5, 8 229 | or a1, a1, a4 230 | 231 | saes32_encs T0, T0, A1, 0 232 | saes32_encs T0, T0, A1, 1 233 | saes32_encs T0, T0, A1, 2 234 | saes32_encs T0, T0, A1, 3 235 | 236 | xor t1, t1, t0 237 | xor t2, t2, t1 238 | xor t3, t3, t2 239 | xor t4, t4, t3 240 | xor t5, t5, t4 241 | 242 | addi a0, a0, 24 243 | sw t0, 0(a0) // store round key 244 | sw t1, 4(a0) 245 | sw t2, 8(a0) 246 | sw t3, 12(a0) 247 | 248 | bne a0, a3, .ekl1 249 | 250 | jr ra 251 | .size aes192_enc_key, .-aes192_enc_key 252 | 253 | 254 | // Key schedule for AES-256 Encryption.
255 | // void aes256_enc_key(uint32_t rk[60], const uint8_t key[32]) 256 | // where: a0 = rk, a1 = key 257 | 258 | .align 2 259 | .globl aes256_enc_key 260 | .type aes256_enc_key, @function 261 | 262 | aes256_enc_key: 263 | 264 | lui a2, %hi(.rcon) // rcon pointer 265 | addi a2, a2, %lo(.rcon) 266 | 267 | addi a3, a0, 16 * 14 // end pointer 268 | 269 | lw t0, 0(a1) // input key 270 | lw t1, 4(a1) 271 | lw t2, 8(a1) 272 | lw t3, 12(a1) 273 | lw t4, 16(a1) 274 | lw t5, 20(a1) 275 | lw t6, 24(a1) 276 | lw a7, 28(a1) 277 | 278 | sw t0, 0(a0) // first round key 279 | sw t1, 4(a0) 280 | sw t2, 8(a0) 281 | sw t3, 12(a0) 282 | 283 | xori t0, t0, 1 // first round constant 284 | j .nl2 285 | 286 | .ekl2: saes32_encs T4, T4, T3, 0 // no rotate 287 | saes32_encs T4, T4, T3, 1 288 | saes32_encs T4, T4, T3, 2 289 | saes32_encs T4, T4, T3, 3 290 | 291 | xor t5, t5, t4 292 | xor t6, t6, t5 293 | xor a7, a7, t6 294 | 295 | addi a2, a2, 1 296 | lbu a1, 0(a2) // round constant 297 | xor t0, a1, t0 298 | 299 | .nl2: sw t4, 16(a0) // store upper part of rk 300 | sw t5, 20(a0) 301 | sw t6, 24(a0) 302 | sw a7, 28(a0) 303 | 304 | slli a4, a7, 24 // rotate 305 | srli a1, a7, 8 306 | or a1, a1, a4 307 | 308 | saes32_encs T0, T0, A1, 0 309 | saes32_encs T0, T0, A1, 1 310 | saes32_encs T0, T0, A1, 2 311 | saes32_encs T0, T0, A1, 3 312 | 313 | xor t1, t1, t0 314 | xor t2, t2, t1 315 | xor t3, t3, t2 316 | 317 | addi a0, a0, 32 318 | 319 | sw t0, 0(a0) // store round key 320 | sw t1, 4(a0) 321 | sw t2, 8(a0) 322 | sw t3, 12(a0) 323 | 324 | bne a0, a3, .ekl2 // final rk ? 325 | 326 | jr ra 327 | .size aes256_enc_key, .-aes256_enc_key 328 | 329 | // round constants 330 | 331 | .type .rcon, @object 332 | .rcon: 333 | .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 334 | .size .rcon, 10 335 | 336 | -------------------------------------------------------------------------------- /asm/saes32_wrap.h: -------------------------------------------------------------------------------- 1 | // saes32_wrap.h 2 | // 2019-10-23 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2019, PQShield Ltd. All rights reserved.
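// The ECB aliases defined below simply forward to the shared round
// functions with the right round count; for example (illustration only),
// aes128_enc_ecb(ct, pt, rk) expands to aes_enc_rounds(ct, pt, rk, 10).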
4 | 5 | // AES 128/192/256 block encryption and decryption (no dependencies) 6 | 7 | #ifndef _AES_WRAP_H_ 8 | #define _AES_WRAP_H_ 9 | 10 | #include <stdint.h> 11 | #include <stddef.h> 12 | 13 | // number of rounds 14 | #define AES128_ROUNDS 10 15 | #define AES192_ROUNDS 12 16 | #define AES256_ROUNDS 14 17 | 18 | // expanded key size 19 | #define AES128_RK_WORDS (4 * (AES128_ROUNDS + 1)) 20 | #define AES192_RK_WORDS (4 * (AES192_ROUNDS + 1)) 21 | #define AES256_RK_WORDS (4 * (AES256_ROUNDS + 1)) 22 | 23 | // === ENCRYPT === 24 | 25 | // set encryption key 26 | void aes128_enc_key(uint32_t rk[AES128_RK_WORDS], const uint8_t key[16]); 27 | void aes192_enc_key(uint32_t rk[AES192_RK_WORDS], const uint8_t key[24]); 28 | void aes256_enc_key(uint32_t rk[AES256_RK_WORDS], const uint8_t key[32]); 29 | 30 | // implementation 31 | void aes_enc_rounds(uint8_t ct[16], const uint8_t pt[16], 32 | const uint32_t rk[], int nr); 33 | 34 | // aliases 35 | #define aes128_enc_ecb(ct, pt, rk) aes_enc_rounds(ct, pt, rk, AES128_ROUNDS) 36 | #define aes192_enc_ecb(ct, pt, rk) aes_enc_rounds(ct, pt, rk, AES192_ROUNDS) 37 | #define aes256_enc_ecb(ct, pt, rk) aes_enc_rounds(ct, pt, rk, AES256_ROUNDS) 38 | 39 | // === DECRYPT === 40 | 41 | // set decryption key 42 | void aes128_dec_key(uint32_t rk[AES128_RK_WORDS], const uint8_t key[16]); 43 | void aes192_dec_key(uint32_t rk[AES192_RK_WORDS], const uint8_t key[24]); 44 | void aes256_dec_key(uint32_t rk[AES256_RK_WORDS], const uint8_t key[32]); 45 | 46 | void aes_dec_rounds(uint8_t pt[16], const uint8_t ct[16], 47 | const uint32_t rk[], int nr); 48 | 49 | // aliases 50 | #define aes128_dec_ecb(pt, ct, rk) aes_dec_rounds(pt, ct, rk, AES128_ROUNDS) 51 | #define aes192_dec_ecb(pt, ct, rk) aes_dec_rounds(pt, ct, rk, AES192_ROUNDS) 52 | #define aes256_dec_ecb(pt, ct, rk) aes_dec_rounds(pt, ct, rk, AES256_ROUNDS) 53 | 54 | #endif // _AES_WRAP_H_ 55 | -------------------------------------------------------------------------------- /asm/sm4_encdec.S: -------------------------------------------------------------------------------- 1 | // sm4_encdec.S 2 | // 2020-02-16 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // SM4 encryption, decryption, and key schedule using "lwaes" instructions. 6 | 7 | // macro definitions for the custom instruction 8 | #include "saes32_c0.h" 9 | 10 | .option nopic 11 | .text 12 | .align 2 13 | 14 | 15 | // Encrypt or decrypt a block, depending on round key ordering.
16 | // void sm4_encdec(uint8_t out[16], const uint8_t in[16], 17 | // const uint32_t rk[SM4_RK_WORDS]) 18 | // where: a0 = out, a1 = in, a2 = rk 19 | 20 | .globl sm4_encdec 21 | .type sm4_encdec, @function 22 | 23 | sm4_encdec: 24 | lw t0, 0(a1) 25 | lw t1, 4(a1) 26 | lw t2, 8(a1) 27 | lw t3, 12(a1) 28 | 29 | addi a3, a2, 128 30 | 31 | .loop: 32 | xor t4, t2, t3 33 | 34 | lw a1, 0(a2) 35 | xor a1, a1, t1 36 | xor a1, a1, t4 37 | 38 | ssm4_ed T0, T0, A1, 0 39 | ssm4_ed T0, T0, A1, 1 40 | ssm4_ed T0, T0, A1, 2 41 | ssm4_ed T0, T0, A1, 3 42 | 43 | lw a1, 4(a2) 44 | xor a1, a1, t0 45 | xor a1, a1, t4 46 | 47 | ssm4_ed T1, T1, A1, 0 48 | ssm4_ed T1, T1, A1, 1 49 | ssm4_ed T1, T1, A1, 2 50 | ssm4_ed T1, T1, A1, 3 51 | 52 | xor t4, t0, t1 53 | 54 | lw a1, 8(a2) 55 | xor a1, a1, t3 56 | xor a1, a1, t4 57 | 58 | ssm4_ed T2, T2, A1, 0 59 | ssm4_ed T2, T2, A1, 1 60 | ssm4_ed T2, T2, A1, 2 61 | ssm4_ed T2, T2, A1, 3 62 | 63 | lw a1, 12(a2) 64 | xor a1, a1, t2 65 | xor a1, a1, t4 66 | 67 | ssm4_ed T3, T3, A1, 0 68 | ssm4_ed T3, T3, A1, 1 69 | ssm4_ed T3, T3, A1, 2 70 | ssm4_ed T3, T3, A1, 3 71 | 72 | addi a2, a2, 16 73 | bne a3, a2, .loop 74 | 75 | sw t3, 0(a0) 76 | sw t2, 4(a0) 77 | sw t1, 8(a0) 78 | sw t0, 12(a0) 79 | 80 | jr ra 81 | .size sm4_encdec, .-sm4_encdec 82 | .align 2 83 | 84 | 85 | // Set key for encryption. 86 | // void sm4_enc_key_asm(uint32_t rk[32], const uint8_t key[16]) 87 | // where: a0 = rk, a1 = key 88 | 89 | .globl sm4_enc_key 90 | .type sm4_enc_key, @function 91 | sm4_enc_key: 92 | lw t0, 0(a1) 93 | lw t1, 4(a1) 94 | lw t2, 8(a1) 95 | lw t3, 12(a1) 96 | 97 | addi a2, a0, 128 98 | 99 | // "fk" constant 100 | 101 | li a1, 0xC6BAB000 102 | addi a1, a1, 0x1A3 103 | xor t0, t0, a1 104 | 105 | li a1, 0x5033A800 106 | addi a1, a1, 0x256 107 | xor t1, t1, a1 108 | 109 | li a1, 0x97917800 110 | addi a1, a1, 0x567 111 | xor t2, t2, a1 112 | 113 | li a1, 0xDC227000 114 | addi a1, a1, 0x0B2 115 | xor t3, t3, a1 116 | 117 | // constants for "ck" generation 118 | 119 | li a3, 0x140E0000 120 | addi a3, a3, 0x600 121 | 122 | li a4, 0x1C1C1800 123 | addi a4, a4, 0x41C 124 | 125 | li a5, 0xFEFEF800 126 | addi a5, a5, 0x6FE 127 | 128 | li a6, 0x01000000 129 | addi a6, a6, 0x100 130 | 131 | .ekl: 132 | xor t4, t2, t3 133 | xor a1, t1, a3 134 | xor a1, a1, t4 135 | xor a1, a1, a6 136 | 137 | add a3, a3, a4 138 | and a3, a3, a5 139 | 140 | ssm4_ks T0, T0, A1, 0 141 | ssm4_ks T0, T0, A1, 1 142 | ssm4_ks T0, T0, A1, 2 143 | ssm4_ks T0, T0, A1, 3 144 | sw t0, 0(a0) 145 | 146 | xor a1, a3, t0 147 | xor a1, a1, t4 148 | xor a1, a1, a6 149 | 150 | add a3, a3, a4 151 | and a3, a3, a5 152 | 153 | ssm4_ks T1, T1, A1, 0 154 | ssm4_ks T1, T1, A1, 1 155 | ssm4_ks T1, T1, A1, 2 156 | ssm4_ks T1, T1, A1, 3 157 | sw t1, 4(a0) 158 | 159 | xor t4, t0, t1 160 | xor a1, t3, a3 161 | xor a1, a1, t4 162 | xor a1, a1, a6 163 | 164 | add a3, a3, a4 165 | and a3, a3, a5 166 | 167 | ssm4_ks T2, T2, A1, 0 168 | ssm4_ks T2, T2, A1, 1 169 | ssm4_ks T2, T2, A1, 2 170 | ssm4_ks T2, T2, A1, 3 171 | sw t2, 8(a0) 172 | 173 | xor a1, a3, t2 174 | xor a1, a1, t4 175 | xor a1, a1, a6 176 | 177 | add a3, a3, a4 178 | and a3, a3, a5 179 | 180 | ssm4_ks T3, T3, A1, 0 181 | ssm4_ks T3, T3, A1, 1 182 | ssm4_ks T3, T3, A1, 2 183 | ssm4_ks T3, T3, A1, 3 184 | sw t3, 12(a0) 185 | 186 | addi a0, a0, 16 187 | 188 | bne a2, a0, .ekl 189 | 190 | jr ra 191 | .size sm4_enc_key, .-sm4_enc_key 192 | .align 2 193 | 194 | 195 | // Set key for decryption. 
196 | // void sm4_dec_key(uint32_t rk[32], const uint8_t key[16]) 197 | // where: a0 = rk, a1 = key 198 | 199 | .globl sm4_dec_key 200 | .type sm4_dec_key, @function 201 | 202 | sm4_dec_key: 203 | addi sp, sp, -8 // generate a forward key 204 | sw s0, 0(sp) 205 | sw ra, 4(sp) 206 | mv s0, a0 207 | call sm4_enc_key 208 | mv a0, s0 209 | lw s0, 0(sp) 210 | lw ra, 4(sp) 211 | addi sp, sp, 8 212 | 213 | addi a5, a0, 124 // flip order 214 | addi a2, a0, 64 215 | .dkl: 216 | lw a3, 0(a5) 217 | lw a4, 0(a0) 218 | addi a0, a0, 4 219 | addi a5, a5, -4 220 | sw a3, -4(a0) 221 | sw a4, 4(a5) 222 | bne a0, a2, .dkl 223 | 224 | jr ra 225 | .size sm4_dec_key, .-sm4_dec_key 226 | 227 | -------------------------------------------------------------------------------- /asm/sm4_encdec.h: -------------------------------------------------------------------------------- 1 | // sm4_encdec.h 2 | // 2020-01-24 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // Prototypes for SM4 (Chinese Encryption Standard) Encryption. 6 | 7 | // The decryption function is the same as encryption, with the difference 8 | // of having a reversed key schedule. Hence we define both functions here. 9 | 10 | #ifndef _SM4_ENCDEC_H_ 11 | #define _SM4_ENCDEC_H_ 12 | 13 | #include <stdint.h> 14 | 15 | // Size of the expanded key. 16 | #define SM4_RK_WORDS 32 17 | 18 | // encrypt/decrypt a block, depending on ordering of rk 19 | void sm4_encdec(uint8_t out[16], const uint8_t in[16], 20 | const uint32_t rk[SM4_RK_WORDS]); 21 | 22 | // expand a secret key for encryption 23 | void sm4_enc_key(uint32_t rk[SM4_RK_WORDS], const uint8_t key[16]); 24 | 25 | // expand a secret key for decryption 26 | void sm4_dec_key(uint32_t rk[SM4_RK_WORDS], const uint8_t key[16]); 27 | 28 | // aliases 29 | #define sm4_enc_ecb(ct, pt, rk) sm4_encdec(ct, pt, rk) 30 | #define sm4_dec_ecb(pt, ct, rk) sm4_encdec(pt, ct, rk) 31 | 32 | #endif /* _SM4_ENCDEC_H_ */ 33 | -------------------------------------------------------------------------------- /bitmanip.c: -------------------------------------------------------------------------------- 1 | // bitmanip.c 2 | // 2020-03-07 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
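// Carryless multiplication works like integer multiplication with XOR in
// place of addition, i.e. polynomial multiplication over GF(2). A small
// worked example: clmul(0b101, 0b011) = 0b101 ^ (0b101 << 1) = 0b1111,
// which is (x^2 + 1)(x + 1) = x^3 + x^2 + x + 1. clmulh returns the high
// half of the double-width product; clmulr returns the product shifted
// right by XLEN-1 bits (the "reversed" variant).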
4 | 5 | // instruction emulation code -- these are all from bitmanip 6 | 7 | #include "bitmanip.h" 8 | 9 | // carryless multiply 10 | 11 | uint32_t rv32b_clmul(uint32_t rs1, uint32_t rs2) 12 | { 13 | uint32_t x = 0; 14 | for (int i = 0; i < 32; i++) 15 | if ((rs2 >> i) & 1) 16 | x ^= rs1 << i; 17 | return x; 18 | } 19 | 20 | uint32_t rv32b_clmulh(uint32_t rs1, uint32_t rs2) 21 | { 22 | uint32_t x = 0; 23 | for (int i = 1; i < 32; i++) 24 | if ((rs2 >> i) & 1) 25 | x ^= rs1 >> (32 - i); 26 | return x; 27 | } 28 | 29 | uint32_t rv32b_clmulr(uint32_t rs1, uint32_t rs2) 30 | { 31 | uint32_t x = 0; 32 | for (int i = 0; i < 32; i++) 33 | if ((rs2 >> i) & 1) 34 | x ^= rs1 >> (32 - i - 1); 35 | return x; 36 | } 37 | 38 | // 64-bit 39 | 40 | uint64_t rv64b_clmul(uint64_t rs1, uint64_t rs2) 41 | { 42 | uint64_t x = 0; 43 | for (int i = 0; i < 64; i++) 44 | if ((rs2 >> i) & 1) 45 | x ^= rs1 << i; 46 | return x; 47 | } 48 | 49 | uint64_t rv64b_clmulh(uint64_t rs1, uint64_t rs2) 50 | { 51 | uint64_t x = 0; 52 | for (int i = 1; i < 64; i++) 53 | if ((rs2 >> i) & 1) 54 | x ^= rs1 >> (64 - i); 55 | return x; 56 | } 57 | 58 | uint64_t rv64b_clmulr(uint64_t rs1, uint64_t rs2) 59 | { 60 | uint64_t x = 0; 61 | for (int i = 0; i < 64; i++) 62 | if ((rs2 >> i) & 1) 63 | x ^= rs1 >> (64 - i - 1); 64 | return x; 65 | } 66 | 67 | // rotate right ROR / RORI 68 | 69 | uint32_t rv32b_ror(uint32_t rs1, uint32_t rs2) 70 | { 71 | int shamt = rs2 & (32 - 1); 72 | return (rs1 >> shamt) | (rs1 << ((32 - shamt) & (32 - 1))); 73 | } 74 | 75 | uint64_t rv64b_ror(uint64_t rs1, uint64_t rs2) 76 | { 77 | int shamt = rs2 & (64 - 1); 78 | return (rs1 >> shamt) | (rs1 << ((64 - shamt) & (64 - 1))); 79 | } 80 | 81 | // and with negate ANDN 82 | 83 | uint64_t rv32b_andn(uint32_t rs1, uint32_t rs2) 84 | { 85 | return rs1 & ~rs2; 86 | } 87 | 88 | uint64_t rv64b_andn(uint64_t rs1, uint64_t rs2) 89 | { 90 | return rs1 & ~rs2; 91 | } 92 | 93 | // generalized reverse GREV / GREVI 94 | 95 | uint32_t rv32b_grev(uint32_t rs1, uint32_t rs2) 96 | { 97 | uint32_t x = rs1; 98 | int shamt = rs2 & 31; 99 | if (shamt & 1) 100 | x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1); 101 | if (shamt & 2) 102 | x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2); 103 | if (shamt & 4) 104 | x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4); 105 | if (shamt & 8) 106 | x = ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8); 107 | if (shamt & 16) 108 | x = ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16); 109 | return x; 110 | } 111 | 112 | uint64_t rv64b_grev(uint64_t rs1, uint64_t rs2) 113 | { 114 | uint64_t x = rs1; 115 | int shamt = rs2 & 63; 116 | if (shamt & 1) 117 | x = ((x & 0x5555555555555555LL) << 1) | 118 | ((x & 0xAAAAAAAAAAAAAAAALL) >> 1); 119 | if (shamt & 2) 120 | x = ((x & 0x3333333333333333LL) << 2) | 121 | ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2); 122 | if (shamt & 4) 123 | x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) | 124 | ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4); 125 | if (shamt & 8) 126 | x = ((x & 0x00FF00FF00FF00FFLL) << 8) | 127 | ((x & 0xFF00FF00FF00FF00LL) >> 8); 128 | if (shamt & 16) 129 | x = ((x & 0x0000FFFF0000FFFFLL) << 16) | 130 | ((x & 0xFFFF0000FFFF0000LL) >> 16); 131 | if (shamt & 32) 132 | x = ((x & 0x00000000FFFFFFFFLL) << 32) | 133 | ((x & 0xFFFFFFFF00000000LL) >> 32); 134 | return x; 135 | } 136 | 137 | // 32-bit helper for SHFL/UNSHFL 138 | 139 | static inline uint32_t shuffle32_stage(uint32_t src, uint32_t ml, 140 | uint32_t mr, int n) 141 | { 142 | uint32_t x = src & ~(ml | mr); 143 | x |= ((src << n) & ml) | ((src >> n) & 
mr); 144 | return x; 145 | } 146 | 147 | // generalized shuffle SHFL / SHFLI 148 | 149 | uint32_t rv32b_shfl(uint32_t rs1, uint32_t rs2) 150 | { 151 | uint32_t x = rs1; 152 | int shamt = rs2 & 15; 153 | 154 | if (shamt & 8) 155 | x = shuffle32_stage(x, 0x00FF0000, 0x0000FF00, 8); 156 | if (shamt & 4) 157 | x = shuffle32_stage(x, 0x0F000F00, 0x00F000F0, 4); 158 | if (shamt & 2) 159 | x = shuffle32_stage(x, 0x30303030, 0x0C0C0C0C, 2); 160 | if (shamt & 1) 161 | x = shuffle32_stage(x, 0x44444444, 0x22222222, 1); 162 | 163 | return x; 164 | } 165 | 166 | // generalized unshuffle UNSHFL / UNSHFLI 167 | 168 | uint32_t rv32b_unshfl(uint32_t rs1, uint32_t rs2) 169 | { 170 | uint32_t x = rs1; 171 | int shamt = rs2 & 15; 172 | 173 | if (shamt & 1) 174 | x = shuffle32_stage(x, 0x44444444, 0x22222222, 1); 175 | if (shamt & 2) 176 | x = shuffle32_stage(x, 0x30303030, 0x0C0C0C0C, 2); 177 | if (shamt & 4) 178 | x = shuffle32_stage(x, 0x0F000F00, 0x00F000F0, 4); 179 | if (shamt & 8) 180 | x = shuffle32_stage(x, 0x00FF0000, 0x0000FF00, 8); 181 | 182 | return x; 183 | } 184 | 185 | 186 | // 64-bit helper for SHFLW/UNSHFLW 187 | 188 | static inline uint64_t shuffle64_stage(uint64_t src, uint64_t ml, 189 | uint64_t mr, int n) 190 | { 191 | uint64_t x = src & ~(ml | mr); 192 | x |= ((src << n) & ml) | ((src >> n) & mr); 193 | return x; 194 | } 195 | 196 | // generalized shuffle SHFLW 197 | 198 | uint64_t rv64b_shfl(uint64_t rs1, uint64_t rs2) 199 | { 200 | uint64_t x = rs1; 201 | int shamt = rs2 & 31; 202 | 203 | if (shamt & 16) 204 | x = shuffle64_stage(x, 0x0000FFFF00000000LL, 0x00000000FFFF0000LL, 16); 205 | if (shamt & 8) 206 | x = shuffle64_stage(x, 0x00FF000000FF0000LL, 0x0000FF000000FF00LL, 8); 207 | if (shamt & 4) 208 | x = shuffle64_stage(x, 0x0F000F000F000F00LL, 0x00F000F000F000F0LL, 4); 209 | if (shamt & 2) 210 | x = shuffle64_stage(x, 0x3030303030303030LL, 0x0C0C0C0C0C0C0C0CLL, 2); 211 | if (shamt & 1) 212 | x = shuffle64_stage(x, 0x4444444444444444LL, 0x2222222222222222LL, 1); 213 | 214 | return x; 215 | } 216 | 217 | // generalized unshuffle UNSHFLW 218 | 219 | uint64_t rv64b_unshfl(uint64_t rs1, uint64_t rs2) 220 | { 221 | uint64_t x = rs1; 222 | int shamt = rs2 & 31; 223 | 224 | if (shamt & 1) 225 | x = shuffle64_stage(x, 0x4444444444444444LL, 0x2222222222222222LL, 1); 226 | if (shamt & 2) 227 | x = shuffle64_stage(x, 0x3030303030303030LL, 0x0C0C0C0C0C0C0C0CLL, 2); 228 | if (shamt & 4) 229 | x = shuffle64_stage(x, 0x0F000F000F000F00LL, 0x00F000F000F000F0LL, 4); 230 | if (shamt & 8) 231 | x = shuffle64_stage(x, 0x00FF000000FF0000LL, 0x0000FF000000FF00LL, 8); 232 | if (shamt & 16) 233 | x = shuffle64_stage(x, 0x0000FFFF00000000LL, 0x00000000FFFF0000LL, 16); 234 | 235 | return x; 236 | } 237 | -------------------------------------------------------------------------------- /bitmanip.h: -------------------------------------------------------------------------------- 1 | // bitmanip.h 2 | // 2020-03-07 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // Prototypes for bitmanip instruction emulation code. 6 | // -- intended to be replaced with intrinsics. 
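// On a core that implements these instructions, each emulation routine
// below collapses to a single instruction. A hypothetical inline-assembly
// binding (toolchain mnemonic support assumed; not part of this repo)
// might look like:
//
//   static inline uint32_t rv32b_ror(uint32_t rs1, uint32_t rs2)
//   {
//       uint32_t rd;
//       __asm__ ("ror %0, %1, %2" : "=r" (rd) : "r" (rs1), "r" (rs2));
//       return rd;
//   }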
7 | 8 | #ifndef _BITMANIP_H_ 9 | #define _BITMANIP_H_ 10 | 11 | #include <stdint.h> 12 | 13 | // rotate right ROR / RORI 14 | uint32_t rv32b_ror(uint32_t rs1, uint32_t rs2); 15 | uint64_t rv64b_ror(uint64_t rs1, uint64_t rs2); 16 | 17 | // and with negate ANDN 18 | uint64_t rv32b_andn(uint32_t rs1, uint32_t rs2); 19 | uint64_t rv64b_andn(uint64_t rs1, uint64_t rs2); 20 | 21 | // generalized reverse GREV / GREVI 22 | uint32_t rv32b_grev(uint32_t rs1, uint32_t rs2); 23 | uint64_t rv64b_grev(uint64_t rs1, uint64_t rs2); 24 | 25 | // generalized shuffle SHFL / SHFLI 26 | uint32_t rv32b_shfl(uint32_t rs1, uint32_t rs2); 27 | uint64_t rv64b_shfl(uint64_t rs1, uint64_t rs2); 28 | 29 | // generalized unshuffle UNSHFL / UNSHFLI 30 | uint32_t rv32b_unshfl(uint32_t rs1, uint32_t rs2); 31 | uint64_t rv64b_unshfl(uint64_t rs1, uint64_t rs2); 32 | 33 | // carryless multiply 34 | uint32_t rv32b_clmul(uint32_t rs1, uint32_t rs2); 35 | uint32_t rv32b_clmulh(uint32_t rs1, uint32_t rs2); 36 | uint32_t rv32b_clmulr(uint32_t rs1, uint32_t rs2); 37 | 38 | uint64_t rv64b_clmul(uint64_t rs1, uint64_t rs2); 39 | uint64_t rv64b_clmulh(uint64_t rs1, uint64_t rs2); 40 | uint64_t rv64b_clmulr(uint64_t rs1, uint64_t rs2); 41 | 42 | #endif // _BITMANIP_H_ 43 | -------------------------------------------------------------------------------- /doc/NIST.FIPS.197.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjosaarinen/lwaes_isa/75277ed816e7b11fd6f4f4df62ba44993fb1f94f/doc/NIST.FIPS.197.pdf -------------------------------------------------------------------------------- /doc/gmt0002-2012sm4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjosaarinen/lwaes_isa/75277ed816e7b11fd6f4f4df62ba44993fb1f94f/doc/gmt0002-2012sm4.pdf -------------------------------------------------------------------------------- /doc/lwaes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjosaarinen/lwaes_isa/75277ed816e7b11fd6f4f4df62ba44993fb1f94f/doc/lwaes.pdf -------------------------------------------------------------------------------- /doc/sm4en.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjosaarinen/lwaes_isa/75277ed816e7b11fd6f4f4df62ba44993fb1f94f/doc/sm4en.pdf -------------------------------------------------------------------------------- /doc/sp800-38d.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjosaarinen/lwaes_isa/75277ed816e7b11fd6f4f4df62ba44993fb1f94f/doc/sp800-38d.pdf -------------------------------------------------------------------------------- /doc/test_gcm_ossl.c: -------------------------------------------------------------------------------- 1 | // test_gcm_ossl.c 2 | // 2020-03-24 Markku-Juhani O.
Saarinen 3 | 4 | // test GCM against OpenSSL (to increase coverage) 5 | 6 | #include <stdio.h> 7 | #include <stdlib.h> 8 | #include <string.h> 9 | #include <time.h> 10 | 11 | #include <openssl/evp.h> 12 | #include "aes_gcm.h" 13 | 14 | 15 | static int gcm_encrypt(uint8_t * plaintext, int plaintext_len, 16 | // uint8_t * aad, int aad_len, 17 | uint8_t * key, 18 | uint8_t * iv, int iv_len, uint8_t * ciphertext, 19 | uint8_t * tag) 20 | { 21 | EVP_CIPHER_CTX *ctx; 22 | 23 | int len; 24 | int ciphertext_len; 25 | 26 | if (!(ctx = EVP_CIPHER_CTX_new())) 27 | return 0; 28 | 29 | if (1 != EVP_EncryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL)) 30 | return 0; 31 | 32 | if (1 != EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL)) 33 | return 0; 34 | 35 | if (1 != EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv)) 36 | return 0; 37 | 38 | /* 39 | if (1 != EVP_EncryptUpdate(ctx, NULL, &len, aad, aad_len)) 40 | return 0; 41 | */ 42 | if (1 != 43 | EVP_EncryptUpdate(ctx, ciphertext, &len, plaintext, plaintext_len)) 44 | return 0; 45 | ciphertext_len = len; 46 | 47 | if (1 != EVP_EncryptFinal_ex(ctx, ciphertext + len, &len)) 48 | return 0; 49 | ciphertext_len += len; 50 | 51 | if (1 != EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, 16, tag)) 52 | return 0; 53 | 54 | EVP_CIPHER_CTX_free(ctx); 55 | 56 | return ciphertext_len; 57 | } 58 | 59 | static void hexvar(const uint8_t * v, size_t len, const char *lab) 60 | { 61 | size_t i; 62 | 63 | printf("%s", lab); 64 | for (i = 0; i < len; i++) 65 | printf("%02X", v[i]); 66 | printf("\n"); 67 | } 68 | 69 | static void rndvar(uint8_t * v, size_t len) 70 | { 71 | size_t i; 72 | 73 | for (i = 0; i < len; i++) 74 | v[i] = random(); 75 | 76 | } 77 | 78 | int test_gcm_ossl() 79 | { 80 | int l, l1; 81 | uint8_t k[16], iv[12], p[1024], c1[1024], c2[1024]; 82 | int fail = 0; 83 | 84 | srandom(time(NULL)); 85 | 86 | for (l = 0; l < 1000; l++) { 87 | 88 | putchar('.'); 89 | 90 | memset(c1, 0, l + 16); 91 | memset(c2, 0, l + 16); 92 | 93 | rndvar(k, 16); 94 | rndvar(iv, 12); 95 | rndvar(p, l); 96 | 97 | l1 = gcm_encrypt(p, l, k, iv, 12, c1, c1 + l); 98 | aes128_enc_gcm(c2, p, l, k, iv); 99 | 100 | if (l1 != l || memcmp(c1, c2, l + 16) != 0) { 101 | printf(" [FAIL] l=%d\n", l); 102 | hexvar(k, 16, "K\t"); 103 | hexvar(iv, 12, "IV\t"); 104 | hexvar(p, l, "P\t"); 105 | hexvar(c1, l1 + 16, "C1\t"); 106 | hexvar(c2, l + 16, "C2\t"); 107 | fail++; 108 | } 109 | } 110 | printf("\n"); 111 | 112 | return fail; 113 | } 114 | -------------------------------------------------------------------------------- /gcm_gfmul.h: -------------------------------------------------------------------------------- 1 | // gcm_gfmul.h 2 | // 2020-03-23 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
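// Background: GHASH arithmetic is in GF(2^128) reduced by the polynomial
// x^128 + x^7 + x^2 + x + 1 (NIST SP 800-38D), with bits in reflected
// order -- hence the bit-reversal helpers below and the constant 0x87
// (= x^7 + x^2 + x + 1) used for reduction in the multipliers.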
4 | 5 | // a minimal interface to core GHASH finite field operations 6 | 7 | #ifndef _GCM_GFMUL_H_ 8 | #define _GCM_GFMUL_H_ 9 | 10 | #include <stdint.h> 11 | 12 | // A GF(2^128) element type -- just for alignment and to avoid casts 13 | 14 | typedef union { 15 | uint8_t b[16]; 16 | uint32_t w[4]; 17 | uint64_t d[2]; 18 | } gf128_t; 19 | 20 | // bit reversal, 32-bit variants (rv32_ghash.c) 21 | void rv32_ghash_rev(gf128_t * z); 22 | 23 | // 32-bit compact version (rv32_ghash.c) 24 | void rv32_ghash_mul(gf128_t * z, const gf128_t * x, const gf128_t * h); 25 | 26 | // 32-bit Karatsuba version (rv32_ghash.c) 27 | void rv32_ghash_mul_kar(gf128_t * z, const gf128_t * x, const gf128_t * h); 28 | 29 | // bit reversal, 64-bit variant (rv64_ghash.c) 30 | void rv64_ghash_rev(gf128_t * z); 31 | 32 | // 64-bit version (Karatsuba optional) (rv64_ghash.c) 33 | void rv64_ghash_mul(gf128_t * z, const gf128_t * x, const gf128_t * h); 34 | 35 | // Function pointers so that different versions can be tested. (aes_gcm.c) 36 | 37 | // reverse bits in bytes of a 128-bit block; do this for h and final value 38 | extern void (*ghash_rev)(gf128_t * z); 39 | 40 | // finite field multiply z = ( z ^ rev(x) ) * h 41 | extern void (*ghash_mul)(gf128_t * z, const gf128_t * x, const gf128_t * h); 42 | 43 | #endif // _GCM_GFMUL_H_ 44 | -------------------------------------------------------------------------------- /gcm_rv32b_gfmul.c: -------------------------------------------------------------------------------- 1 | // gcm_rv32b_gfmul.c 2 | // 2020-03-23 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // 32-bit GHASH bit-reverse and multiplication for GCM 6 | 7 | #include "gcm_gfmul.h" 8 | #include "bitmanip.h" 9 | 10 | // disable shift reduction 11 | #define NO_SHIFTRED 12 | 13 | // reverse bits in bytes of a 128-bit block; do this for h and final value 14 | 15 | void rv32_ghash_rev(gf128_t * z) 16 | { 17 | z->w[0] = rv32b_grev(z->w[0], 7); 18 | z->w[1] = rv32b_grev(z->w[1], 7); 19 | z->w[2] = rv32b_grev(z->w[2], 7); 20 | z->w[3] = rv32b_grev(z->w[3], 7); 21 | } 22 | 23 | // multiply z = ( z ^ rev(x) ) * h 24 | // 32-bit compact loop version 25 | 26 | void rv32_ghash_mul(gf128_t * z, const gf128_t * x, const gf128_t * h) 27 | { 28 | int i; 29 | uint32_t x0, x1, x2, x3, y; 30 | uint32_t z0, z1, z2, z3, z4; 31 | uint32_t t0, t1, t2; 32 | 33 | x0 = x->w[0]; // new data 34 | x1 = x->w[1]; 35 | x2 = x->w[2]; 36 | x3 = x->w[3]; 37 | 38 | z0 = z->w[0]; // inline to avoid these loads 39 | z1 = z->w[1]; 40 | z2 = z->w[2]; 41 | z3 = z->w[3]; 42 | 43 | // 4 x GREV 44 | x0 = rv32b_grev(x0, 7); // reverse input x only 45 | x1 = rv32b_grev(x1, 7); 46 | x2 = rv32b_grev(x2, 7); 47 | x3 = rv32b_grev(x3, 7); 48 | 49 | // 4 x XOR 50 | x0 = x0 ^ z0; // z is kept unreversed 51 | x1 = x1 ^ z1; 52 | x2 = x2 ^ z2; 53 | x3 = x3 ^ z3; 54 | 55 | // 4 x CLMULH, 4 x CLMUL, 3 x XOR 56 | y = h->w[3]; // start from highest word 57 | z4 = rv32b_clmulh(x3, y); 58 | z3 = rv32b_clmul(x3, y); 59 | t1 = rv32b_clmulh(x2, y); 60 | z2 = rv32b_clmul(x2, y); 61 | z3 = z3 ^ t1; 62 | t1 = rv32b_clmulh(x1, y); 63 | z1 = rv32b_clmul(x1, y); 64 | z2 = z2 ^ t1; 65 | t1 = rv32b_clmulh(x0, y); 66 | z0 = rv32b_clmul(x0, y); 67 | z1 = z1 ^ t1; 68 | 69 | #ifdef NO_SHIFTRED 70 | // Mul reduction: 1 x CLMULH, 1 x CLMUL, 2 x XOR 71 | t1 = rv32b_clmulh(z4, 0x87); 72 | t0 = rv32b_clmul(z4, 0x87); 73 | z1 = z1 ^ t1; 74 | z0 = z0 ^ t0; 75 | #else 76 | // Shift reduction: 6 x SHIFT, 7 x XOR 77 | z1 = z1 ^ (z4 >> 31) ^ (z4 >> 30) ^ (z4 >> 25); 78
| z0 = z0 ^ z4 ^ (z4 << 1) ^ (z4 << 2) ^ (z4 << 7); 79 | #endif 80 | 81 | // repeat 3 times 82 | for (i = 2; i >= 0; i--) { // towards less significant 83 | 84 | y = h->w[i]; // unroll this if you like 85 | 86 | // 4 x CLMULH, 4 x CLMUL, 7 x XOR 87 | t1 = rv32b_clmulh(x3, y); 88 | t0 = rv32b_clmul(x3, y); 89 | z4 = z3 ^ t1; 90 | t1 = rv32b_clmulh(x2, y); 91 | t2 = rv32b_clmul(x2, y); 92 | z3 = z2 ^ t0 ^ t1; 93 | t1 = rv32b_clmulh(x1, y); 94 | t0 = rv32b_clmul(x1, y); 95 | z2 = z1 ^ t1 ^ t2; 96 | t1 = rv32b_clmulh(x0, y); 97 | t2 = rv32b_clmul(x0, y); 98 | z1 = z0 ^ t0 ^ t1; 99 | 100 | #ifdef NO_SHIFTRED 101 | // Mul reduction: 1 x CLMULH, 1 x CLMUL, 2 x XOR 102 | t1 = rv32b_clmulh(z4, 0x87); 103 | t0 = rv32b_clmul(z4, 0x87); 104 | z1 = z1 ^ t1; 105 | z0 = t2 ^ t0; 106 | #else 107 | // Shift reduction: 6 x SHIFT, 7 x XOR 108 | z1 = z1 ^ (z4 >> 31) ^ (z4 >> 30) ^ (z4 >> 25); 109 | z0 = t2 ^ z4 ^ (z4 << 1) ^ (z4 << 2) ^ (z4 << 7); 110 | #endif 111 | 112 | } 113 | 114 | z->w[0] = z0; // inline to remove store 115 | z->w[1] = z1; 116 | z->w[2] = z2; 117 | z->w[3] = z3; 118 | } 119 | 120 | // multiply z = ( z ^ rev(x) ) * h 121 | // 32-bit Karatsuba version 122 | 123 | void rv32_ghash_mul_kar(gf128_t * z, const gf128_t * x, const gf128_t * h) 124 | { 125 | uint32_t x0, x1, x2, x3, y0, y1, y2, y3; 126 | uint32_t z0, z1, z2, z3, z4, z5, z6, z7; 127 | uint32_t t0, t1, t2, t3; 128 | 129 | x0 = x->w[0]; // load new data 130 | x1 = x->w[1]; 131 | x2 = x->w[2]; 132 | x3 = x->w[3]; 133 | 134 | z0 = z->w[0]; // inline to avoid these loads 135 | z1 = z->w[1]; 136 | z2 = z->w[2]; 137 | z3 = z->w[3]; 138 | 139 | y0 = h->w[0]; // y is untouched 140 | y1 = h->w[1]; 141 | y2 = h->w[2]; 142 | y3 = h->w[3]; 143 | 144 | // 4 x GREV 145 | x0 = rv32b_grev(x0, 7); // reverse input x only 146 | x1 = rv32b_grev(x1, 7); 147 | x2 = rv32b_grev(x2, 7); 148 | x3 = rv32b_grev(x3, 7); 149 | 150 | // 4 x XOR 151 | x0 = x0 ^ z0; // z is updated 152 | x1 = x1 ^ z1; 153 | x2 = x2 ^ z2; 154 | x3 = x3 ^ z3; 155 | 156 | // 2-level Karatsuba multiplication 157 | // 9 x CLMULH, 9 x CLMUL, 40 x XOR 158 | 159 | z7 = rv32b_clmulh(x3, y3); // high pair 160 | z6 = rv32b_clmul(x3, y3); 161 | z5 = rv32b_clmulh(x2, y2); 162 | z4 = rv32b_clmul(x2, y2); 163 | t0 = x2 ^ x3; 164 | t2 = y2 ^ y3; 165 | t1 = rv32b_clmulh(t0, t2); 166 | t0 = rv32b_clmul(t0, t2); 167 | t1 = t1 ^ z5 ^ z7; 168 | t0 = t0 ^ z4 ^ z6; 169 | z6 = z6 ^ t1; 170 | z5 = z5 ^ t0; 171 | 172 | z3 = rv32b_clmulh(x1, y1); // low pair 173 | z2 = rv32b_clmul(x1, y1); 174 | z1 = rv32b_clmulh(x0, y0); 175 | z0 = rv32b_clmul(x0, y0); 176 | t0 = x0 ^ x1; 177 | t2 = y0 ^ y1; 178 | t1 = rv32b_clmulh(t0, t2); 179 | t0 = rv32b_clmul(t0, t2); 180 | t1 = t1 ^ z1 ^ z3; 181 | t0 = t0 ^ z0 ^ z2; 182 | z2 = z2 ^ t1; 183 | z1 = z1 ^ t0; 184 | 185 | t3 = y1 ^ y3; // split 186 | t2 = y0 ^ y2; 187 | t1 = x1 ^ x3; 188 | t0 = x0 ^ x2; 189 | 190 | x3 = rv32b_clmulh(t1, t3); // middle 191 | x2 = rv32b_clmul(t1, t3); 192 | x1 = rv32b_clmulh(t0, t2); 193 | x0 = rv32b_clmul(t0, t2); 194 | 195 | t0 = t0 ^ t1; 196 | t2 = t2 ^ t3; 197 | t1 = rv32b_clmulh(t0, t2); 198 | t0 = rv32b_clmul(t0, t2); 199 | t1 = t1 ^ x1 ^ x3; 200 | t0 = t0 ^ x0 ^ x2; 201 | x2 = x2 ^ t1; 202 | x1 = x1 ^ t0; 203 | 204 | x3 = x3 ^ z3 ^ z7; // finalize 205 | x2 = x2 ^ z2 ^ z6; 206 | x1 = x1 ^ z1 ^ z5; 207 | x0 = x0 ^ z0 ^ z4; 208 | z5 = z5 ^ x3; 209 | z4 = z4 ^ x2; 210 | z3 = z3 ^ x1; 211 | z2 = z2 ^ x0; 212 | 213 | // == REDUCTION == 214 | 215 | #ifdef NO_SHIFTRED 216 | // Mul reduction: 4 x CLMULH, 4 x CLMUL, 8 x XOR 217 | t1 = 
rv32b_clmulh(z7, 0x87); 218 | t0 = rv32b_clmul(z7, 0x87); 219 | z4 = z4 ^ t1; 220 | z3 = z3 ^ t0; 221 | t1 = rv32b_clmulh(z6, 0x87); 222 | t0 = rv32b_clmul(z6, 0x87); 223 | z3 = z3 ^ t1; 224 | z2 = z2 ^ t0; 225 | t1 = rv32b_clmulh(z5, 0x87); 226 | t0 = rv32b_clmul(z5, 0x87); 227 | z2 = z2 ^ t1; 228 | z1 = z1 ^ t0; 229 | t1 = rv32b_clmulh(z4, 0x87); 230 | t0 = rv32b_clmul(z4, 0x87); 231 | z1 = z1 ^ t1; 232 | z0 = z0 ^ t0; 233 | #else 234 | // Shift reduction: 24 x SHIFT, 28 x XOR 235 | z4 = z4 ^ (z7 >> 31) ^ (z7 >> 30) ^ (z7 >> 25); 236 | z3 = z3 ^ z7 ^ (z7 << 1) ^ (z7 << 2) ^ (z7 << 7) ^ 237 | (z6 >> 31) ^ (z6 >> 30) ^ (z6 >> 25); 238 | z2 = z2 ^ z6 ^ (z6 << 1) ^ (z6 << 2) ^ (z6 << 7) ^ 239 | (z5 >> 31) ^ (z5 >> 30) ^ (z5 >> 25); 240 | z1 = z1 ^ z5 ^ (z5 << 1) ^ (z5 << 2) ^ (z5 << 7) ^ 241 | (z4 >> 31) ^ (z4 >> 30) ^ (z4 >> 25); 242 | z0 = z0 ^ z4 ^ (z4 << 1) ^ (z4 << 2) ^ (z4 << 7); 243 | #endif 244 | 245 | z->w[0] = z0; // inline to remove store 246 | z->w[1] = z1; 247 | z->w[2] = z2; 248 | z->w[3] = z3; 249 | } 250 | -------------------------------------------------------------------------------- /gcm_rv64b_gfmul.c: -------------------------------------------------------------------------------- 1 | // gcm_rv64b_gfmul.c 2 | // 2020-03-23 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // 64-bit GHASH bit-reverse and multiplication for GCM 6 | 7 | #include "gcm_gfmul.h" 8 | #include "bitmanip.h" 9 | 10 | // disable shift reduction 11 | //#define NO_SHIFTRED 12 | // disable karatsuba multiplication 13 | //#define NO_KARATSUBA 14 | 15 | // reverse bits in bytes of a 128-bit block; do this for h and final value 16 | 17 | void rv64_ghash_rev(gf128_t * z) 18 | { 19 | z->d[0] = rv64b_grev(z->d[0], 7); 20 | z->d[1] = rv64b_grev(z->d[1], 7); 21 | } 22 | 23 | // multiply z = ( z ^ rev(x) ) * h 24 | 25 | void rv64_ghash_mul(gf128_t * z, const gf128_t * x, const gf128_t * h) 26 | { 27 | uint64_t x0, x1, y0, y1; 28 | uint64_t z0, z1, z2, z3, t0, t1, t2; 29 | 30 | x0 = x->d[0]; // new input 31 | x1 = x->d[1]; 32 | 33 | z0 = z->d[0]; // inline to avoid these loads 34 | z1 = z->d[1]; 35 | 36 | y0 = h->d[0]; // h value already reversed 37 | y1 = h->d[1]; 38 | 39 | // 2 x GREVW, 2 x XOR 40 | x0 = rv64b_grev(x0, 7); // reverse input x only 41 | x1 = rv64b_grev(x1, 7); 42 | x0 = x0 ^ z0; // z is updated 43 | x1 = x1 ^ z1; 44 | 45 | #ifdef NO_KARATSUBA 46 | 47 | (void) t2; // unused 48 | 49 | // Without Karatsuba; 4 x CLMULH, 4 x CLMUL, 4 x XOR 50 | z3 = rv64b_clmulh(x1, y1); 51 | z2 = rv64b_clmul(x1, y1); 52 | t1 = rv64b_clmulh(x0, y1); 53 | z1 = rv64b_clmul(x0, y1); 54 | z2 = z2 ^ t1; 55 | t1 = rv64b_clmulh(x1, y0); 56 | t0 = rv64b_clmul(x1, y0); 57 | z2 = z2 ^ t1; 58 | z1 = z1 ^ t0; 59 | t1 = rv64b_clmulh(x0, y0); 60 | z0 = rv64b_clmul(x0, y0); 61 | z1 = z1 ^ t1; 62 | 63 | #else 64 | 65 | // With Karatsuba; 3 x CLMULH, 3 x CLMUL, 8 x XOR 66 | z3 = rv64b_clmulh(x1, y1); 67 | z2 = rv64b_clmul(x1, y1); 68 | z1 = rv64b_clmulh(x0, y0); 69 | z0 = rv64b_clmul(x0, y0); 70 | t0 = x0 ^ x1; 71 | t2 = y0 ^ y1; 72 | t1 = rv64b_clmulh(t0, t2); 73 | t0 = rv64b_clmul(t0, t2); 74 | t1 = t1 ^ z1 ^ z3; 75 | t0 = t0 ^ z0 ^ z2; 76 | z2 = z2 ^ t1; 77 | z1 = z1 ^ t0; 78 | 79 | #endif 80 | 81 | #ifdef NO_SHIFTRED 82 | 83 | // Mul reduction: 2 x CLMULH, 2 x CLMUL, 4 x XOR 84 | t1 = rv64b_clmulh(z3, 0x87); 85 | t0 = rv64b_clmul(z3, 0x87); 86 | z2 = z2 ^ t1; 87 | z1 = z1 ^ t0; 88 | t1 = rv64b_clmulh(z2, 0x87); 89 | t0 = rv64b_clmul(z2, 0x87); 90 | z1 = z1 ^ t1; 91 | z0 = z0 ^ t0; 92 
| 93 | #else 94 | 95 | // Shift reduction: 12 x SHIFT, 14 x XOR 96 | z2 = z2 ^ (z3 >> 63) ^ (z3 >> 62) ^ (z3 >> 57); 97 | z1 = z1 ^ z3 ^ (z3 << 1) ^ (z3 << 2) ^ (z3 << 7) ^ 98 | (z2 >> 63) ^ (z2 >> 62) ^ (z2 >> 57); 99 | z0 = z0 ^ z2 ^ (z2 << 1) ^ (z2 << 2) ^ (z2 << 7); 100 | 101 | #endif 102 | 103 | z->d[0] = z0; // inline to avoid these stores 104 | z->d[1] = z1; 105 | } 106 | -------------------------------------------------------------------------------- /gcm_test.c: -------------------------------------------------------------------------------- 1 | // gcm_test.c 2 | // 2020-03-21 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // Unit tests for GCM AES-128/192/256 in simple mode. Selected from 6 | // https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Algorithm-Validation-Program/documents/mac/gcmtestvectors.zip 7 | 8 | #include <stdio.h> 9 | #include <stdlib.h> 10 | #include <string.h> 11 | 12 | #include "test_hex.h" 13 | #include "gcm_wrap.h" 14 | #include "gcm_gfmul.h" 15 | 16 | // A GCM test 17 | 18 | int test_gcm() 19 | { 20 | uint8_t pt[100], ct[100], xt[100], k[32], iv[12]; 21 | size_t mlen, clen; 22 | int flag, fail = 0; 23 | 24 | // GCM AES-128, one-block message 25 | 26 | readhex(k, sizeof(k), "7FDDB57453C241D03EFBED3AC44E371C"); 27 | readhex(iv, sizeof(iv), "EE283A3FC75575E33EFD4887"); 28 | mlen = readhex(pt, sizeof(pt), "D5DE42B461646C255C87BD2962D3B9A2"); 29 | clen = mlen + 16; 30 | memset(ct, 0, clen); 31 | aes128_enc_gcm(ct, pt, mlen, k, iv); 32 | fail += chkhex("GCM AES-128", ct, clen, 33 | "2CCDA4A5415CB91E135C2A0F78C9B2FD" 34 | "B36D1DF9B9D5E596F83E8B7F52971CB3"); 35 | 36 | memset(xt, 0, mlen); 37 | flag = aes128_dec_vfy_gcm(xt, ct, clen, k, iv) || 38 | memcmp(xt, pt, mlen) != 0; 39 | 40 | ct[rand() % clen] ^= 1 << (rand() & 7); // corrupt random bit 41 | 42 | flag |= !(aes128_dec_vfy_gcm(xt, ct, clen, k, iv) || 43 | memcmp(xt, pt, mlen) != 0); 44 | printf("[%s] GCM AES-128 verify / corrupt test\n", flag ? "FAIL" : "PASS"); 45 | if (flag) 46 | fail++; 47 | 48 | // GCM AES-192, two-block message 49 | 50 | readhex(k, sizeof(k), "165C4AA5D78EE15F297D5D2EAE39EAAC" 51 | "3480FC50A6D9A98E"); 52 | readhex(iv, sizeof(iv), "0E321E714C4A262350FC50FC"); 53 | mlen = readhex(pt, sizeof(pt), 54 | "5AFA41EFE94C0193FC9FE62FD6CFACC8" 55 | "868725AB4965A5C9132D74179F0AEE72"); 56 | clen = mlen + 16; 57 | memset(ct, 0, clen); 58 | aes192_enc_gcm(ct, pt, mlen, k, iv); 59 | fail += chkhex("GCM AES-192", ct, clen, 60 | "5AB8AC904E7D4A627EE327B4629B6863" 61 | "19936ABC709E8C0FB6817CB16D0C4F76" 62 | "62BFEA782D6A05CD04030C433639B969"); 63 | 64 | memset(xt, 0, mlen); 65 | flag = aes192_dec_vfy_gcm(xt, ct, clen, k, iv) || 66 | memcmp(xt, pt, mlen) != 0; 67 | 68 | ct[rand() % clen] ^= 1 << (rand() & 7); // corrupt random bit 69 | 70 | flag |= !(aes192_dec_vfy_gcm(xt, ct, clen, k, iv) || 71 | memcmp(xt, pt, mlen) != 0); 72 | printf("[%s] GCM AES-192 verify / corrupt test\n", flag ?
"FAIL" : "PASS"); 73 | if (flag) 74 | fail++; 75 | 76 | // GCM AES-256, 51-byte message 77 | 78 | readhex(k, sizeof(k), "1FDED32D5999DE4A76E0F8082108823A" 79 | "EF60417E1896CF4218A2FA90F632EC8A"); 80 | readhex(iv, sizeof(iv), "1F3AFA4711E9474F32E70462"); 81 | mlen = readhex(pt, sizeof(pt), 82 | "06B2C75853DF9AEB17BEFD33CEA81C63" 83 | "0B0FC53667FF45199C629C8E15DCE41E" 84 | "530AA792F796B8138EEAB2E86C7B7BEE" "1D40B0"); 85 | clen = mlen + 16; 86 | memset(ct, 0, clen); 87 | aes256_enc_gcm(ct, pt, mlen, k, iv); 88 | fail += chkhex("GCM AES-256", ct, clen, 89 | "91FBD061DDC5A7FCC9513FCDFDC9C3A7" 90 | "C5D4D64CEDF6A9C24AB8A77C36EEFBF1" 91 | "C5DC00BC50121B96456C8CD8B6FF1F8B" 92 | "3E480F" "30096D340F3D5C42D82A6F475DEF23EB"); 93 | memset(xt, 0, mlen); 94 | flag = aes256_dec_vfy_gcm(xt, ct, clen, k, iv) || 95 | memcmp(xt, pt, mlen) != 0; 96 | 97 | ct[rand() % clen] ^= 1 << (rand() & 7); // corrupt random bit 98 | 99 | flag |= !(aes256_dec_vfy_gcm(xt, ct, clen, k, iv) || 100 | memcmp(xt, pt, mlen) != 0); 101 | printf("[%s] GCM AES-256 verify / corrupt test\n", flag ? "FAIL" : "PASS"); 102 | if (flag) 103 | fail++; 104 | 105 | return fail; 106 | } 107 | -------------------------------------------------------------------------------- /gcm_wrap.c: -------------------------------------------------------------------------------- 1 | // gcm_wrap.c 2 | // 2020-03-21 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // A basic (limited!) AES-GCM interface for testing purposes. 6 | 7 | #include 8 | 9 | #include "bitmanip.h" 10 | #include "aes_wrap.h" 11 | #include "gcm_wrap.h" 12 | #include "gcm_gfmul.h" 13 | 14 | // function pointers are here 15 | 16 | void (*ghash_rev)(gf128_t *) = rv64_ghash_rev; 17 | void (*ghash_mul)(gf128_t *, const gf128_t *, const gf128_t *) = 18 | rv64_ghash_mul; 19 | 20 | // the same "body" for encryption/decryption and various key lengths 21 | 22 | static void aes_gcm_body(uint8_t * dst, uint8_t tag[16], 23 | const uint8_t * src, size_t len, 24 | const uint8_t iv[12], const uint32_t rk[], 25 | void (*enc_ecb)(uint8_t * ct, const uint8_t * pt, 26 | const uint32_t * rk), int enc_flag) 27 | { 28 | size_t i, ctr; 29 | gf128_t b, c, z, h, t, p; 30 | 31 | h.d[0] = 0; // h = AES_k(0) 32 | h.d[1] = 0; 33 | enc_ecb(h.b, h.b, rk); 34 | ghash_rev(&h); 35 | 36 | ctr = 0; // counter value 37 | memcpy(p.b, iv, 12); // J0 38 | p.w[3] = rv32b_grev(++ctr, 0x18); // big-endian counter 39 | enc_ecb(t.b, p.b, rk); // first AES_k(IV | 1) for tag 40 | 41 | z.d[0] = 0; // initialize GHASH result 42 | z.d[1] = 0; 43 | 44 | if (enc_flag) { // == encrypt / generate tag == 45 | 46 | i = len; 47 | while (i >= 16) { // full block 48 | p.w[3] = rv32b_grev(++ctr, 0x18); // rev8.w; big-endian counter 49 | enc_ecb(c.b, p.b, rk); 50 | memcpy(b.b, src, 16); // load plaintext 51 | c.d[0] ^= b.d[0]; 52 | c.d[1] ^= b.d[1]; 53 | memcpy(dst, c.b, 16); // store ciphertext 54 | ghash_mul(&z, &c, &h); // GHASH the block 55 | src += 16; 56 | dst += 16; 57 | i -= 16; 58 | } 59 | 60 | if (i > 0) { // partial block 61 | p.w[3] = rv32b_grev(++ctr, 0x18); // rev8.w; big-endian counter 62 | enc_ecb(c.b, p.b, rk); 63 | memcpy(b.b, src, i); // load plaintext 64 | c.d[0] ^= b.d[0]; 65 | c.d[1] ^= b.d[1]; 66 | memcpy(dst, c.b, i); 67 | memset(&c.b[i], 0, 16 - i); // zero pad input 68 | ghash_mul(&z, &c, &h); // GHASH last block 69 | } 70 | 71 | } else { // == decrypt / verify tag == 72 | 73 | i = len; 74 | while (i >= 16) { // full block 75 | p.w[3] = rv32b_grev(++ctr, 0x18); // rev8.w; 
76 | enc_ecb(b.b, p.b, rk);
77 | memcpy(c.b, src, 16); // load ciphertext
78 | b.d[0] ^= c.d[0];
79 | b.d[1] ^= c.d[1];
80 | memcpy(dst, b.b, 16); // store plaintext
81 | ghash_mul(&z, &c, &h); // GHASH the block
82 | src += 16;
83 | dst += 16;
84 | i -= 16;
85 | }
86 | 
87 | if (i > 0) { // partial block
88 | p.w[3] = rv32b_grev(++ctr, 0x18); // rev8.w; big-endian counter
89 | enc_ecb(b.b, p.b, rk);
90 | memcpy(c.b, src, i);
91 | b.d[0] ^= c.d[0];
92 | b.d[1] ^= c.d[1];
93 | memcpy(dst, b.b, i);
94 | memset(&c.b[i], 0, 16 - i); // zero pad input
95 | ghash_mul(&z, &c, &h); // GHASH last block
96 | }
97 | }
98 | 
99 | c.d[0] = 0; // pad with bit length
100 | c.w[2] = rv32b_grev(len >> 29, 0x18);
101 | c.w[3] = rv32b_grev(len << 3, 0x18);
102 | ghash_mul(&z, &c, &h); // last GHASH block
103 | ghash_rev(&z); // flip result bits
104 | t.d[0] = t.d[0] ^ z.d[0]; // XOR with AES_k(IV | 1)
105 | t.d[1] = t.d[1] ^ z.d[1];
106 | memcpy(tag, t.b, 16); // write tag
107 | }
108 | 
109 | // decrypt and verify the tag (constant-time compare); nonzero on failure
110 | 
111 | static int aes_gcm_vfy(uint8_t * m,
112 | const uint8_t * c, size_t clen,
113 | const uint8_t iv[12], const uint32_t rk[],
114 | void (*enc_ecb)(uint8_t * ct, const uint8_t * pt,
115 | const uint32_t * rk))
116 | {
117 | size_t i;
118 | uint8_t tag[16], x;
119 | 
120 | if (clen < 16)
121 | return -1;
122 | 
123 | aes_gcm_body(m, tag, c, clen - 16, iv, rk, enc_ecb, 0);
124 | x = 0;
125 | for (i = 0; i < 16; i++) {
126 | x |= tag[i] ^ c[clen - 16 + i];
127 | }
128 | 
129 | return x == 0 ? 0 : 1;
130 | }
131 | 
132 | // AES128-GCM
133 | 
134 | void aes128_enc_gcm(uint8_t * c, const uint8_t * m, size_t mlen,
135 | const uint8_t * key, const uint8_t iv[12])
136 | {
137 | uint32_t rk[AES128_RK_WORDS];
138 | 
139 | aes128_enc_key(rk, key);
140 | aes_gcm_body(c, c + mlen, m, mlen, iv, rk, aes128_enc_ecb, 1);
141 | }
142 | 
143 | int aes128_dec_vfy_gcm(uint8_t * m, const uint8_t * c, size_t clen,
144 | const uint8_t * key, const uint8_t iv[12])
145 | {
146 | uint32_t rk[AES128_RK_WORDS];
147 | 
148 | aes128_enc_key(rk, key);
149 | return aes_gcm_vfy(m, c, clen, iv, rk, aes128_enc_ecb);
150 | }
151 | 
152 | 
153 | // AES192-GCM
154 | 
155 | void aes192_enc_gcm(uint8_t * c, const uint8_t * m, size_t mlen,
156 | const uint8_t * key, const uint8_t iv[12])
157 | {
158 | uint32_t rk[AES192_RK_WORDS];
159 | 
160 | aes192_enc_key(rk, key);
161 | aes_gcm_body(c, c + mlen, m, mlen, iv, rk, aes192_enc_ecb, 1);
162 | }
163 | 
164 | int aes192_dec_vfy_gcm(uint8_t * m, const uint8_t * c, size_t clen,
165 | const uint8_t * key, const uint8_t iv[12])
166 | {
167 | uint32_t rk[AES192_RK_WORDS];
168 | 
169 | aes192_enc_key(rk, key);
170 | return aes_gcm_vfy(m, c, clen, iv, rk, aes192_enc_ecb);
171 | }
172 | 
173 | // AES256-GCM
174 | 
175 | void aes256_enc_gcm(uint8_t * c, const uint8_t * m, size_t mlen,
176 | const uint8_t * key, const uint8_t iv[12])
177 | {
178 | uint32_t rk[AES256_RK_WORDS];
179 | 
180 | aes256_enc_key(rk, key);
181 | aes_gcm_body(c, c + mlen, m, mlen, iv, rk, aes256_enc_ecb, 1);
182 | }
183 | 
184 | int aes256_dec_vfy_gcm(uint8_t * m, const uint8_t * c, size_t clen,
185 | const uint8_t * key, const uint8_t iv[12])
186 | {
187 | uint32_t rk[AES256_RK_WORDS];
188 | 
189 | aes256_enc_key(rk, key);
190 | return aes_gcm_vfy(m, c, clen, iv, rk, aes256_enc_ecb);
191 | }
192 | 
--------------------------------------------------------------------------------
/gcm_wrap.h:
--------------------------------------------------------------------------------
1 | // gcm_wrap.h
2 | // 2020-03-21 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Basic AES-GCM; 96-bit IV, no AAD, 128-bit auth tag appended at the end.
6 | // Ciphertext is always 16 bytes larger than plaintext.
7 | // Decrypt/verify routines (aesxxx_dec_vfy_gcm) return nonzero on failure.
8 | 
9 | #ifndef _GCM_WRAP_H_
10 | #define _GCM_WRAP_H_
11 | 
12 | #include <stddef.h>
13 | #include <stdint.h>
14 | 
15 | // AES-GCM-128 Encrypt / Decrypt & Verify
16 | 
17 | void aes128_enc_gcm(uint8_t * c, const uint8_t * m, size_t mlen,
18 | const uint8_t * key, const uint8_t iv[12]);
19 | int aes128_dec_vfy_gcm(uint8_t * m, const uint8_t * c, size_t clen,
20 | const uint8_t * key, const uint8_t iv[12]);
21 | 
22 | // AES-GCM-192 Encrypt / Decrypt & Verify
23 | 
24 | void aes192_enc_gcm(uint8_t * c, const uint8_t * m, size_t mlen,
25 | const uint8_t * key, const uint8_t iv[12]);
26 | int aes192_dec_vfy_gcm(uint8_t * m, const uint8_t * c, size_t clen,
27 | const uint8_t * key, const uint8_t iv[12]);
28 | 
29 | // AES-GCM-256 Encrypt / Decrypt & Verify
30 | 
31 | void aes256_enc_gcm(uint8_t * c, const uint8_t * m, size_t mlen,
32 | const uint8_t * key, const uint8_t iv[12]);
33 | int aes256_dec_vfy_gcm(uint8_t * m, const uint8_t * c, size_t clen,
34 | const uint8_t * key, const uint8_t iv[12]);
35 | 
36 | #endif // _GCM_WRAP_H_
37 | 
--------------------------------------------------------------------------------
/hdl/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile
2 | # 2020-01-29 Markku-Juhani O. Saarinen
3 | # Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | # Minimal makefile for Icarus Verilog
6 | 
7 | HDL = $(wildcard *.v)
8 | SIM = sim.vvp
9 | 
10 | # simulate
11 | 
12 | sim: $(SIM)
13 | vvp -N $(SIM)
14 | 
15 | $(SIM): $(HDL)
16 | iverilog -o sim.vvp $(HDL)
17 | 
18 | # synthesis and reporting
19 | 
20 | synth.tmp: $(HDL) synth.ys
21 | rm -f synth.tmp
22 | yosys -v1 synth.ys
23 | 
24 | rep: synth.tmp
25 | ./yoparse.py synth.tmp
26 | 
27 | # prints differing lines
28 | 
29 | test: $(SIM)
30 | vvp -n $(SIM) | grep "[TB]" | diff - tbref.txt
31 | 
32 | clean:
33 | rm -f $(SIM) synth.tmp
34 | 
--------------------------------------------------------------------------------
/hdl/README.md:
--------------------------------------------------------------------------------
1 | # HDL for the AES / SM4 instruction
2 | 
3 | 2020-01-29 Markku-Juhani O. Saarinen
4 | 
5 | 2020-02-28 Updated with gate counts.
6 | 
7 | The main instruction is in [saes32.v](saes32.v), while [sboxes.v](sboxes.v)
8 | has S-box implementations for AES and SM4. As can be seen, the entire thing
9 | is only about 100 lines + sboxes. Timing can be significantly further
10 | improved.
11 | 
12 | If your design doesn't need both AES and SM4, or you just need the forward
13 | AES, you can use macros `SAES32_NO_AES`, `SAES32_NO_AESI`, or `SAES32_NO_SM4`
14 | to disable forward AES, inverse AES, or SM4, respectively.
15 | 
16 | A note about [sboxes.v](sboxes.v): I created linear SM4 "top" and "bottom"
17 | layers for the [Boyar-Peralta](https://eprint.iacr.org/2011/332.pdf) AES
18 | S-Box to demonstrate that all three S-box types can share circuitry.
19 | The [sboxes.v](sboxes.v) file has some commentary on this.
20 | 
21 | Currently the code does not mux the middle layer, which would reduce gate
22 | count. Also note that the 18->8 bit bottom layers (which are linear)
23 | can be merged ("collapsed") into the 8->32 bit output layers since they are
24 | also linear. This would reduce timing and possibly gate count too. The
25 | present code prioritizes readability over these considerations.
26 | 
27 | There's a simple [Makefile](Makefile) and a testbench for Icarus
28 | Verilog (which is freely available for Debian/Ubuntu, etc.).
29 | 
30 | I have also tested this on Xilinx xsim and Vivado with the C and assembler
31 | language test suites (see parent directory). PQShield's Pluto RV32 core
32 | (on an Artix-7 FPGA) was used, although build files are not provided for
33 | that.
34 | 
35 | 
36 | ## CMOS Area and Latency Estimate
37 | 
38 | There's a Yosys script to make area estimates against a mock CMOS ASIC
39 | cell library. Running `make rep` will perform synthesis and report gate
40 | and transistor counts on four separate "feature sets" of the instruction:
41 | 
42 | | **Target** | **Gate Equivalents** | **Transistors** | **LTP** |
43 | |----------------------|--------:|-------:|----:|
44 | | AES Encrypt (only) | 642.0 | 2568 | 25 |
45 | | AES | 1240.0 | 4960 | 28 |
46 | | SM4 | 766.5 | 3066 | 25 |
47 | | AES + SM4 (full) | 1678.5 | 6714 | 28 |
48 | 
49 | LTP is the reported *Longest Topological Path*, a circuit depth /
50 | gate delay measure.
51 | 
52 | (Currently the weights are such that transistors = 4*GE, but this can be
53 | tuned in the [yoparse.py](yoparse.py) script.)
54 | 
55 | [Yosys](http://www.clifford.at/yosys/) version:
56 | `Yosys 0.9+1706 (git sha1 cd60f079, clang 6.0.0-1ubuntu2 -fPIC -Os)`
57 | 
58 | 
59 | ## Testing with a Simulator
60 | 
61 | No output from `make test` means that the simulator output matches
62 | [tbref.txt](tbref.txt). More test cases can be generated using the
63 | C emulator code (in parent directory). Matching [saes32_tb.v](saes32_tb.v)
64 | output is generated with the argument `./xtest tb`. Just expand the
65 | `test_hwtb()` function in [../test_main.c](../test_main.c) to your needs.
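For reference, a complete `make` build-and-simulate run, followed by a silent
`make test` check, looks like this: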
66 | 67 | ```console 68 | $ make 69 | iverilog -o sim.vvp saes32_tb.v sboxes.v saes32.v 70 | vvp -N sim.vvp 71 | [TB] rd=a56363c6 rs1=00000000 rs2=00000000 fn=00 72 | [TB] rd=6e6edcb2 rs1=00000000 rs2=01234567 fn=01 73 | [TB] rd=5ab4ee5a rs1=00000000 rs2=02468ace fn=02 74 | [TB] rd=f68d7b7b rs1=00000000 rs2=0369d035 fn=03 75 | [TB] rd=000000de rs1=00000000 rs2=048d159c fn=04 76 | [TB] rd=00003900 rs1=00000000 rs2=05b05b03 fn=05 77 | [TB] rd=00660000 rs1=00000000 rs2=06d3a06a fn=06 78 | [TB] rd=c5000000 rs1=00000000 rs2=07f6e5d1 fn=07 79 | [TB] rd=0728ebb2 rs1=00000000 rs2=091a2b38 fn=08 80 | [TB] rd=670a0cb1 rs1=00000000 rs2=0a3d709f fn=09 81 | [TB] rd=7ca1470a rs1=00000000 rs2=0b60b606 fn=0a 82 | [TB] rd=4ffcd7e5 rs1=00000000 rs2=0c83fb6d fn=0b 83 | [TB] rd=00000019 rs1=00000000 rs2=0da740d4 fn=0c 84 | [TB] rd=0000dc00 rs1=00000000 rs2=0eca863b fn=0d 85 | [TB] rd=00530000 rs1=00000000 rs2=0fedcba2 fn=0e 86 | [TB] rd=e3000000 rs1=00000000 rs2=11111109 fn=0f 87 | [TB] rd=5353d784 rs1=00000000 rs2=12345670 fn=10 88 | [TB] rd=c030f0c0 rs1=00000000 rs2=13579bd7 fn=11 89 | [TB] rd=020a0808 rs1=00000000 rs2=147ae13e fn=12 90 | [TB] rd=46fafabc rs1=00000000 rs2=159e26a5 fn=13 91 | [TB] rd=00051428 rs1=00000000 rs2=16c16c0c fn=14 92 | [TB] rd=9b6ddb60 rs1=00000000 rs2=17e4b173 fn=15 93 | [TB] rd=5bb7e096 rs1=00000000 rs2=1907f6da fn=16 94 | [TB] rd=13608209 rs1=00000000 rs2=1a2b3c41 fn=17 95 | 96 | $ make test 97 | vvp -n sim.vvp | grep "[TB]" | diff - tbref.txt 98 | $ 99 | ``` 100 | 101 | [Icarus Verilog](https://github.com/steveicarus/iverilog) versions: 102 | `Icarus Verilog Parser/Elaborator version 11.0 (devel) (s20150603-796-g875431a3)` 103 | `Icarus Verilog runtime version 11.0 (devel) (s20150603-796-g875431a3)` 104 | 105 | Cheers, 106 | - markku 107 | 108 | -------------------------------------------------------------------------------- /hdl/saes32.v: -------------------------------------------------------------------------------- 1 | // saes32.v 2 | // 2020-01-29 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // Proposed SAES32 instruction for lightweight AES, AES^-1, and SM4 (RV32). 6 | 7 | // Multiply by 0x02 in AES's GF(256) - LFSR style 8 | 9 | module aes_xtime( output [7:0] out, input [7:0] in ); 10 | assign out = { in[6:0], 1'b0 } ^ ( in[7] ? 8'h1B : 8'h00 ); 11 | endmodule 12 | 13 | // aes encrypt 14 | 15 | `ifndef SAES32_NO_AES 16 | 17 | module aes_t( output [31:0] out, input [7:0] in, input f ); 18 | 19 | wire [7:0] x; 20 | wire [7:0] x2; 21 | 22 | aes_sbox sbox ( x, in ); 23 | aes_xtime lfsr1 ( x2, x ); 24 | 25 | // NOP / MixColumns MDS Matrix 26 | 27 | assign out = f ? { 24'b0, x } : { x ^ x2, x, x, x2 } ; 28 | 29 | endmodule 30 | 31 | `endif 32 | 33 | // aes decrypt 34 | 35 | `ifndef SAES32_NO_AESI 36 | 37 | module aesi_t( output [31:0] out, input [7:0] in, input f ); 38 | 39 | wire [7:0] x; 40 | wire [7:0] x2; 41 | wire [7:0] x4; 42 | wire [7:0] x8; 43 | 44 | aesi_sbox sbox ( x, in ); 45 | aes_xtime lfsr1 ( x2, x ); // todo: reduce circuit depth 46 | aes_xtime lfsr2 ( x4, x2 ); 47 | aes_xtime lfsr3 ( x8, x4 ); 48 | 49 | // NOP / Inverse MixColumns MDS Matrix 50 | 51 | assign out = f ? 
{ 24'b0, x } : 52 | { x ^ x2 ^ x8, x ^ x4 ^ x8, x ^ x8, x2 ^ x4 ^ x8 }; 53 | 54 | endmodule 55 | 56 | `endif 57 | 58 | // sm4 encrypt / decrypt 59 | 60 | `ifndef SAES32_NO_SM4 61 | 62 | module sm4_t( output [31:0] out, input [7:0] in, input f ); 63 | 64 | wire [7:0] x; 65 | 66 | sm4_sbox sbox ( x, in ); 67 | 68 | // Either L' or L linear layers (for keying and encrypt / decrypt) 69 | // ( this looks slightly odd due to the use of little-endian byte order ) 70 | assign out = f ? { x[2:0], 5'b0, x[0], 2'b0 ,x[7:3], 1'b0, x[7:1], x } : 71 | { x[5:0], x, x[7:6], x[7:2], x[1:0] ^ x[7:6], x[7:2] ^ x[5:0], x[1:0] }; 72 | 73 | endmodule 74 | 75 | `endif 76 | 77 | // Combinatorial logic for the SAES32 instruction itself 78 | 79 | module saes32( 80 | output [31:0] rd, // output register (wire!) 81 | input [31:0] rs1, // input register 1 82 | input [31:0] rs2, // input register 2 83 | input [4:0] fn // 5-bit function specifier 84 | ); 85 | 86 | // select input byte from rs2 according to fn[1:0] 87 | 88 | wire [7:0] x = fn[1:0] == 2'b00 ? rs2[ 7: 0] : 89 | fn[1:0] == 2'b01 ? rs2[15: 8] : 90 | fn[1:0] == 2'b10 ? rs2[23:16] : 91 | rs2[31:24]; 92 | 93 | // expand to 32 bits 94 | 95 | `ifndef SAES32_NO_AES 96 | wire [31:0] aes_32; 97 | aes_t aes ( aes_32, x, fn[2] ); 98 | `endif 99 | 100 | `ifndef SAES32_NO_AESI 101 | wire [31:0] aesi_32; 102 | aesi_t aesi ( aesi_32, x, fn[2] ); 103 | `endif 104 | 105 | `ifndef SAES32_NO_SM4 106 | wire [31:0] sm4_32; 107 | sm4_t sm4 ( sm4_32, x, fn[2] ); 108 | `endif 109 | 110 | wire [31:0] y = 111 | `ifndef SAES32_NO_AES 112 | fn[4:3] == 2'b00 ? aes_32 : 113 | `endif 114 | `ifndef SAES32_NO_AESI 115 | fn[4:3] == 2'b01 ? aesi_32 : 116 | `endif 117 | `ifndef SAES32_NO_SM4 118 | fn[4:3] == 2'b10 ? sm4_32 : 119 | `endif 120 | 32'h00000000; 121 | 122 | // rotate output 123 | 124 | wire [31:0] z = fn[1:0] == 2'b00 ? y : 125 | fn[1:0] == 2'b01 ? { y[23: 0], y[31:24] } : 126 | fn[1:0] == 2'b10 ? { y[15: 0], y[31:16] } : 127 | { y[ 7: 0], y[31: 8] }; 128 | 129 | // XOR the result with rs1 130 | 131 | assign rd = z ^ rs1; 132 | 133 | endmodule 134 | 135 | -------------------------------------------------------------------------------- /hdl/saes32_tb.v: -------------------------------------------------------------------------------- 1 | // saes32_tb.v 2 | // 2020-01-29 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // test bench for the AES / SM4 instruction 6 | 7 | `timescale 1 ns / 1 ps 8 | 9 | module saes32_tb; 10 | 11 | // clock generator 12 | reg clk = 1; 13 | always #5 clk = ~clk; 14 | 15 | reg [31:0] cnt = 0; 16 | 17 | reg [31:0] rs1 = 32'h00000000; 18 | reg [31:0] rs2 = 32'h00000000; 19 | reg [4:0] fn = 0; 20 | wire [31:0] rd; 21 | 22 | wire [7:0] box; 23 | 24 | // test instance 25 | saes32 uut ( rd, rs1, rs2, fn ); 26 | 27 | always @(posedge clk) begin 28 | 29 | $display("[TB] rd=%h rs1=%h rs2=%h fn=%h", rd, rs1, rs2, fn ); 30 | 31 | fn <= fn + 1; 32 | rs2 <= rs2 + 32'h01234567; 33 | 34 | if (cnt == 23) begin 35 | $finish; 36 | end 37 | cnt <= cnt + 1; 38 | end 39 | 40 | 41 | endmodule 42 | 43 | -------------------------------------------------------------------------------- /hdl/sboxes.v: -------------------------------------------------------------------------------- 1 | // sboxes.v 2 | // 2020-01-29 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | /* 6 | 7 | Non-hardened combinatorial logic for AES, inverse AES, and SM4 S-Boxes. 
8 | 
9 | Each S-Box has a nonlinear middle layer sandwiched between linear
10 | top and bottom layers. In this version the top ("inner") layer expands
11 | 8 bits to 21 bits while the bottom layer compresses 18 bits back to 8.
12 | 
13 | Overall structure and the AES and AES^-1 circuits slightly modified from [BoPe12].
14 | SM4 top and bottom layers by Markku-Juhani O. Saarinen, January 2020.
15 | 
16 | The middle layer is common to all three; whether muxing it is beneficial
17 | depends on the target. Currently we are not doing it.
18 | 
19 | Why does sharing work? All of these are "Nyberg S-boxes" [Ny93]: each is
20 | built from a multiplicative inverse in GF(256), so they are affine isomorphic.
21 | 
22 | [BoPe12] Boyar J., Peralta R. "A Small Depth-16 Circuit for the AES
23 | S-Box." Proc. SEC 2012. IFIP AICT 376. Springer, pp. 287-298 (2012)
24 | DOI: https://doi.org/10.1007/978-3-642-30436-1_24
25 | Preprint: https://eprint.iacr.org/2011/332.pdf
26 | 
27 | [Ny93] Nyberg K., "Differentially Uniform Mappings for Cryptography",
28 | Proc. EUROCRYPT '93, LNCS 765, Springer, pp. 55-64 (1993)
29 | DOI: https://doi.org/10.1007/3-540-48285-7_6
30 | 
31 | */
32 | 
33 | // The shared non-linear middle part for AES, AES^-1, and SM4.
34 | 
35 | module sbox_inv_mid( output [17:0] y, input [20:0] x );
36 | 
37 | wire [45:0] t;
38 | 
39 | assign t[ 0] = x[ 3] ^ x[12];
40 | assign t[ 1] = x[ 9] & x[ 5];
41 | assign t[ 2] = x[17] & x[ 6];
42 | assign t[ 3] = x[10] ^ t[ 1];
43 | assign t[ 4] = x[14] & x[ 0];
44 | assign t[ 5] = t[ 4] ^ t[ 1];
45 | assign t[ 6] = x[ 3] & x[12];
46 | assign t[ 7] = x[16] & x[ 7];
47 | assign t[ 8] = t[ 0] ^ t[ 6];
48 | assign t[ 9] = x[15] & x[13];
49 | assign t[10] = t[ 9] ^ t[ 6];
50 | assign t[11] = x[ 1] & x[11];
51 | assign t[12] = x[ 4] & x[20];
52 | assign t[13] = t[12] ^ t[11];
53 | assign t[14] = x[ 2] & x[ 8];
54 | assign t[15] = t[14] ^ t[11];
55 | assign t[16] = t[ 3] ^ t[ 2];
56 | assign t[17] = t[ 5] ^ x[18];
57 | assign t[18] = t[ 8] ^ t[ 7];
58 | assign t[19] = t[10] ^ t[15];
59 | assign t[20] = t[16] ^ t[13];
60 | assign t[21] = t[17] ^ t[15];
61 | assign t[22] = t[18] ^ t[13];
62 | assign t[23] = t[19] ^ x[19];
63 | assign t[24] = t[22] ^ t[23];
64 | assign t[25] = t[22] & t[20];
65 | assign t[26] = t[21] ^ t[25];
66 | assign t[27] = t[20] ^ t[21];
67 | assign t[28] = t[23] ^ t[25];
68 | assign t[29] = t[28] & t[27];
69 | assign t[30] = t[26] & t[24];
70 | assign t[31] = t[20] & t[23];
71 | assign t[32] = t[27] & t[31];
72 | assign t[33] = t[27] ^ t[25];
73 | assign t[34] = t[21] & t[22];
74 | assign t[35] = t[24] & t[34];
75 | assign t[36] = t[24] ^ t[25];
76 | assign t[37] = t[21] ^ t[29];
77 | assign t[38] = t[32] ^ t[33];
78 | assign t[39] = t[23] ^ t[30];
79 | assign t[40] = t[35] ^ t[36];
80 | assign t[41] = t[38] ^ t[40];
81 | assign t[42] = t[37] ^ t[39];
82 | assign t[43] = t[37] ^ t[38];
83 | assign t[44] = t[39] ^ t[40];
84 | assign t[45] = t[42] ^ t[41];
85 | assign y[ 0] = t[38] & x[ 7];
86 | assign y[ 1] = t[37] & x[13];
87 | assign y[ 2] = t[42] & x[11];
88 | assign y[ 3] = t[45] & x[20];
89 | assign y[ 4] = t[41] & x[ 8];
90 | assign y[ 5] = t[44] & x[ 9];
91 | assign y[ 6] = t[40] & x[17];
92 | assign y[ 7] = t[39] & x[14];
93 | assign y[ 8] = t[43] & x[ 3];
94 | assign y[ 9] = t[38] & x[16];
95 | assign y[10] = t[37] & x[15];
96 | assign y[11] = t[42] & x[ 1];
97 | assign y[12] = t[45] & x[ 4];
98 | assign y[13] = t[41] & x[ 2];
99 | assign y[14] = t[44] & x[ 5];
100 | assign y[15] = t[40] & x[ 6];
101 | assign y[16] = t[39] & x[ 0];
102 | assign y[17] = t[43] & x[12];
103 | 
104 | endmodule 105 | 106 | // === AES (Forward) === 107 | 108 | `ifndef SAES32_NO_AES 109 | 110 | // top (inner) linear layer for AES 111 | 112 | module sbox_aes_top( output [20:0] y, input [7:0] x); 113 | 114 | wire [5:0] t; 115 | 116 | assign y[ 0] = x[ 0]; 117 | assign y[ 1] = x[ 7] ^ x[ 4]; 118 | assign y[ 2] = x[ 7] ^ x[ 2]; 119 | assign y[ 3] = x[ 7] ^ x[ 1]; 120 | assign y[ 4] = x[ 4] ^ x[ 2]; 121 | assign t[ 0] = x[ 3] ^ x[ 1]; 122 | assign y[ 5] = y[ 1] ^ t[ 0]; 123 | assign t[ 1] = x[ 6] ^ x[ 5]; 124 | assign y[ 6] = x[ 0] ^ y[ 5]; 125 | assign y[ 7] = x[ 0] ^ t[ 1]; 126 | assign y[ 8] = y[ 5] ^ t[ 1]; 127 | assign t[ 2] = x[ 6] ^ x[ 2]; 128 | assign t[ 3] = x[ 5] ^ x[ 2]; 129 | assign y[ 9] = y[ 3] ^ y[ 4]; 130 | assign y[10] = y[ 5] ^ t[ 2]; 131 | assign y[11] = t[ 0] ^ t[ 2]; 132 | assign y[12] = t[ 0] ^ t[ 3]; 133 | assign y[13] = y[ 7] ^ y[12]; 134 | assign t[ 4] = x[ 4] ^ x[ 0]; 135 | assign y[14] = t[ 1] ^ t[ 4]; 136 | assign y[15] = y[ 1] ^ y[14]; 137 | assign t[ 5] = x[ 1] ^ x[ 0]; 138 | assign y[16] = t[ 1] ^ t[ 5]; 139 | assign y[17] = y[ 2] ^ y[16]; 140 | assign y[18] = y[ 2] ^ y[ 8]; 141 | assign y[19] = y[15] ^ y[13]; 142 | assign y[20] = y[ 1] ^ t[ 3]; 143 | 144 | endmodule 145 | 146 | // bottom (outer) linear layer for AES 147 | 148 | module sbox_aes_out( output [7:0] y, input [17:0] x); 149 | 150 | wire [29:0] t; 151 | 152 | assign t[ 0] = x[11] ^ x[12]; 153 | assign t[ 1] = x[ 0] ^ x[ 6]; 154 | assign t[ 2] = x[14] ^ x[16]; 155 | assign t[ 3] = x[15] ^ x[ 5]; 156 | assign t[ 4] = x[ 4] ^ x[ 8]; 157 | assign t[ 5] = x[17] ^ x[11]; 158 | assign t[ 6] = x[12] ^ t[ 5]; 159 | assign t[ 7] = x[14] ^ t[ 3]; 160 | assign t[ 8] = x[ 1] ^ x[ 9]; 161 | assign t[ 9] = x[ 2] ^ x[ 3]; 162 | assign t[10] = x[ 3] ^ t[ 4]; 163 | assign t[11] = x[10] ^ t[ 2]; 164 | assign t[12] = x[16] ^ x[ 1]; 165 | assign t[13] = x[ 0] ^ t[ 0]; 166 | assign t[14] = x[ 2] ^ x[11]; 167 | assign t[15] = x[ 5] ^ t[ 1]; 168 | assign t[16] = x[ 6] ^ t[ 0]; 169 | assign t[17] = x[ 7] ^ t[ 1]; 170 | assign t[18] = x[ 8] ^ t[ 8]; 171 | assign t[19] = x[13] ^ t[ 4]; 172 | assign t[20] = t[ 0] ^ t[ 1]; 173 | assign t[21] = t[ 1] ^ t[ 7]; 174 | assign t[22] = t[ 3] ^ t[12]; 175 | assign t[23] = t[18] ^ t[ 2]; 176 | assign t[24] = t[15] ^ t[ 9]; 177 | assign t[25] = t[ 6] ^ t[10]; 178 | assign t[26] = t[ 7] ^ t[ 9]; 179 | assign t[27] = t[ 8] ^ t[10]; 180 | assign t[28] = t[11] ^ t[14]; 181 | assign t[29] = t[11] ^ t[17]; 182 | assign y[ 0] = t[ 6] ^~ t[23]; 183 | assign y[ 1] = t[13] ^~ t[27]; 184 | assign y[ 2] = t[25] ^ t[29]; 185 | assign y[ 3] = t[20] ^ t[22]; 186 | assign y[ 4] = t[ 6] ^ t[21]; 187 | assign y[ 5] = t[19] ^~ t[28]; 188 | assign y[ 6] = t[16] ^~ t[26]; 189 | assign y[ 7] = t[ 6] ^ t[24]; 190 | 191 | endmodule 192 | 193 | // AES s-box 194 | 195 | module aes_sbox( output [7:0] fx, input [7:0] in ); 196 | 197 | wire [20:0] t1; 198 | wire [17:0] t2; 199 | 200 | sbox_aes_top top ( t1, in ); 201 | sbox_inv_mid mid ( t2, t1 ); 202 | sbox_aes_out out ( fx, t2 ); 203 | 204 | endmodule 205 | 206 | `endif 207 | 208 | 209 | // === AES^-1 (Inverse) === 210 | 211 | `ifndef SAES32_NO_AESI 212 | 213 | // top (inner) linear layer for AES^-1 214 | 215 | module sbox_aesi_top( output [20:0] y, input [7:0] x); 216 | 217 | wire [4:0] t; 218 | 219 | assign y[17] = x[ 7] ^ x[ 4]; 220 | assign y[16] = x[ 6] ^~ x[ 4]; 221 | assign y[ 2] = x[ 7] ^~ x[ 6]; 222 | assign y[ 1] = x[ 4] ^ x[ 3]; 223 | assign y[18] = x[ 3] ^~ x[ 0]; 224 | assign t[ 0] = x[ 1] ^ x[ 0]; 225 | assign y[ 6] = x[ 6] ^~ y[17]; 226 | assign 
y[14] = y[16] ^ t[ 0]; 227 | assign y[ 7] = x[ 0] ^~ y[ 1]; 228 | assign y[ 8] = y[ 2] ^ y[18]; 229 | assign y[ 9] = y[ 2] ^ t[ 0]; 230 | assign y[ 3] = y[ 1] ^ t[ 0]; 231 | assign y[19] = x[ 5] ^~ y[ 1]; 232 | assign t[ 1] = x[ 6] ^ x[ 1]; 233 | assign y[13] = x[ 5] ^~ y[14]; 234 | assign y[15] = y[18] ^ t[ 1]; 235 | assign y[ 4] = x[ 3] ^ y[ 6]; 236 | assign t[ 2] = x[ 5] ^~ x[ 2]; 237 | assign t[ 3] = x[ 2] ^~ x[ 1]; 238 | assign t[ 4] = x[ 5] ^~ x[ 3]; 239 | assign y[ 5] = y[16] ^ t[ 2]; 240 | assign y[12] = t[ 1] ^ t[ 4]; 241 | assign y[20] = y[ 1] ^ t[ 3]; 242 | assign y[11] = y[ 8] ^ y[20]; 243 | assign y[10] = y[ 8] ^ t[ 3]; 244 | assign y[ 0] = x[ 7] ^ t[ 2]; 245 | 246 | endmodule 247 | 248 | // bottom (outer) linear layer for AES^-1 249 | 250 | module sbox_aesi_out( output [7:0] y, input [17:0] x); 251 | 252 | wire [29:0] t; 253 | 254 | assign t[ 0] = x[ 2] ^ x[11]; 255 | assign t[ 1] = x[ 8] ^ x[ 9]; 256 | assign t[ 2] = x[ 4] ^ x[12]; 257 | assign t[ 3] = x[15] ^ x[ 0]; 258 | assign t[ 4] = x[16] ^ x[ 6]; 259 | assign t[ 5] = x[14] ^ x[ 1]; 260 | assign t[ 6] = x[17] ^ x[10]; 261 | assign t[ 7] = t[ 0] ^ t[ 1]; 262 | assign t[ 8] = x[ 0] ^ x[ 3]; 263 | assign t[ 9] = x[ 5] ^ x[13]; 264 | assign t[10] = x[ 7] ^ t[ 4]; 265 | assign t[11] = t[ 0] ^ t[ 3]; 266 | assign t[12] = x[14] ^ x[16]; 267 | assign t[13] = x[17] ^ x[ 1]; 268 | assign t[14] = x[17] ^ x[12]; 269 | assign t[15] = x[ 4] ^ x[ 9]; 270 | assign t[16] = x[ 7] ^ x[11]; 271 | assign t[17] = x[ 8] ^ t[ 2]; 272 | assign t[18] = x[13] ^ t[ 5]; 273 | assign t[19] = t[ 2] ^ t[ 3]; 274 | assign t[20] = t[ 4] ^ t[ 6]; 275 | assign t[22] = t[ 2] ^ t[ 7]; 276 | assign t[23] = t[ 7] ^ t[ 8]; 277 | assign t[24] = t[ 5] ^ t[ 7]; 278 | assign t[25] = t[ 6] ^ t[10]; 279 | assign t[26] = t[ 9] ^ t[11]; 280 | assign t[27] = t[10] ^ t[18]; 281 | assign t[28] = t[11] ^ t[25]; 282 | assign t[29] = t[15] ^ t[20]; 283 | assign y[ 0] = t[ 9] ^ t[16]; 284 | assign y[ 1] = t[14] ^ t[23]; 285 | assign y[ 2] = t[19] ^ t[24]; 286 | assign y[ 3] = t[23] ^ t[27]; 287 | assign y[ 4] = t[12] ^ t[22]; 288 | assign y[ 5] = t[17] ^ t[28]; 289 | assign y[ 6] = t[26] ^ t[29]; 290 | assign y[ 7] = t[13] ^ t[22]; 291 | 292 | endmodule 293 | 294 | // AES inverse S-box 295 | 296 | module aesi_sbox( output [7:0] fx, input [7:0] in ); 297 | 298 | wire [20:0] t1; 299 | wire [17:0] t2; 300 | 301 | sbox_aesi_top top ( t1, in ); 302 | sbox_inv_mid mid ( t2, t1 ); 303 | sbox_aesi_out out ( fx, t2 ); 304 | 305 | endmodule 306 | 307 | `endif 308 | 309 | // === SM4 === 310 | 311 | `ifndef SAES32_NO_SM4 312 | 313 | // top (inner) linear layer for SM4 314 | 315 | 316 | module sbox_sm4_top( output [20:0] y, input [7:0] x); 317 | 318 | wire [6:0] t; 319 | 320 | assign y[18] = x[ 2] ^ x[ 6]; 321 | assign t[ 0] = x[ 3] ^ x[ 4]; 322 | assign t[ 1] = x[ 2] ^ x[ 7]; 323 | assign t[ 2] = x[ 7] ^ y[18]; 324 | assign t[ 3] = x[ 1] ^ t[ 1]; 325 | assign t[ 4] = x[ 6] ^ x[ 7]; 326 | assign t[ 5] = x[ 0] ^ y[18]; 327 | assign t[ 6] = x[ 3] ^ x[ 6]; 328 | assign y[10] = x[ 1] ^ y[18]; 329 | assign y[ 0] = x[ 5] ^~ y[10]; 330 | assign y[ 1] = t[ 0] ^ t[ 3]; 331 | assign y[ 2] = x[ 0] ^ t[ 0]; 332 | assign y[ 4] = x[ 0] ^ t[ 3]; 333 | assign y[ 3] = x[ 3] ^ y[ 4]; 334 | assign y[ 5] = x[ 5] ^ t[ 5]; 335 | assign y[ 6] = x[ 0] ^~ x[ 1]; 336 | assign y[ 7] = t[ 0] ^~ y[10]; 337 | assign y[ 8] = t[ 0] ^ t[ 5]; 338 | assign y[ 9] = x[ 3]; 339 | assign y[11] = t[ 0] ^ t[ 4]; 340 | assign y[12] = x[ 5] ^ t[ 4]; 341 | assign y[13] = x[ 5] ^~ y[ 1]; 342 | assign y[14] = x[ 4] ^~ t[ 2]; 343 | 
assign y[15] = x[ 1] ^~ t[ 6]; 344 | assign y[16] = x[ 0] ^~ t[ 2]; 345 | assign y[17] = t[ 0] ^~ t[ 2]; 346 | assign y[19] = x[ 5] ^~ y[14]; 347 | assign y[20] = x[ 0] ^ t[ 1]; 348 | 349 | endmodule 350 | 351 | // bottom (outer) linear layer for SM4 352 | 353 | module sbox_sm4_out( output [7:0] y, input [17:0] x); 354 | 355 | wire [29:0] t; 356 | 357 | assign t[ 0] = x[ 4] ^ x[ 7]; 358 | assign t[ 1] = x[13] ^ x[15]; 359 | assign t[ 2] = x[ 2] ^ x[16]; 360 | assign t[ 3] = x[ 6] ^ t[ 0]; 361 | assign t[ 4] = x[12] ^ t[ 1]; 362 | assign t[ 5] = x[ 9] ^ x[10]; 363 | assign t[ 6] = x[11] ^ t[ 2]; 364 | assign t[ 7] = x[ 1] ^ t[ 4]; 365 | assign t[ 8] = x[ 0] ^ x[17]; 366 | assign t[ 9] = x[ 3] ^ x[17]; 367 | assign t[10] = x[ 8] ^ t[ 3]; 368 | assign t[11] = t[ 2] ^ t[ 5]; 369 | assign t[12] = x[14] ^ t[ 6]; 370 | assign t[13] = t[ 7] ^ t[ 9]; 371 | assign t[14] = x[ 0] ^ x[ 6]; 372 | assign t[15] = x[ 7] ^ x[16]; 373 | assign t[16] = x[ 5] ^ x[13]; 374 | assign t[17] = x[ 3] ^ x[15]; 375 | assign t[18] = x[10] ^ x[12]; 376 | assign t[19] = x[ 9] ^ t[ 1]; 377 | assign t[20] = x[ 4] ^ t[ 4]; 378 | assign t[21] = x[14] ^ t[ 3]; 379 | assign t[22] = x[16] ^ t[ 5]; 380 | assign t[23] = t[ 7] ^ t[14]; 381 | assign t[24] = t[ 8] ^ t[11]; 382 | assign t[25] = t[ 0] ^ t[12]; 383 | assign t[26] = t[17] ^ t[ 3]; 384 | assign t[27] = t[18] ^ t[10]; 385 | assign t[28] = t[19] ^ t[ 6]; 386 | assign t[29] = t[ 8] ^ t[10]; 387 | assign y[ 0] = t[11] ^~ t[13]; 388 | assign y[ 1] = t[15] ^~ t[23]; 389 | assign y[ 2] = t[20] ^ t[24]; 390 | assign y[ 3] = t[16] ^ t[25]; 391 | assign y[ 4] = t[26] ^~ t[22]; 392 | assign y[ 5] = t[21] ^ t[13]; 393 | assign y[ 6] = t[27] ^~ t[12]; 394 | assign y[ 7] = t[28] ^~ t[29]; 395 | 396 | endmodule 397 | 398 | // SM4 S-box (there is no need for inverse) 399 | 400 | module sm4_sbox( output [7:0] fx, input [7:0] in ); 401 | 402 | wire [20:0] t1; 403 | wire [17:0] t2; 404 | 405 | sbox_sm4_top top ( t1, in ); 406 | sbox_inv_mid mid ( t2, t1 ); 407 | sbox_sm4_out out ( fx, t2 ); 408 | 409 | endmodule 410 | 411 | `endif 412 | -------------------------------------------------------------------------------- /hdl/synth.ys: -------------------------------------------------------------------------------- 1 | design -reset 2 | read_verilog -D SAES32_NO_AESI -D SAES32_NO_SM4 -defer saes32.v sboxes.v 3 | hierarchy -top saes32 4 | rename saes32 saes32_aes_enc_only 5 | synth -flatten; abc -dff -g cmos; opt -fast 6 | tee -a synth.tmp stat -tech cmos 7 | tee -a synth.tmp ltp -noff 8 | 9 | design -reset 10 | read_verilog -D SAES32_NO_SM4 -defer saes32.v sboxes.v 11 | hierarchy -top saes32 12 | rename saes32 saes32_aes_encdec 13 | synth -flatten; abc -dff -g cmos; opt -fast 14 | tee -a synth.tmp stat -tech cmos 15 | tee -a synth.tmp ltp -noff 16 | 17 | design -reset 18 | read_verilog -D SAES32_NO_AES -D SAES32_NO_AESI -defer saes32.v sboxes.v 19 | hierarchy -top saes32 20 | rename saes32 saes32_sm4_only 21 | synth -flatten; abc -dff -g cmos; opt -fast 22 | tee -a synth.tmp stat -tech cmos 23 | tee -a synth.tmp ltp -noff 24 | 25 | design -reset 26 | read_verilog -defer saes32.v sboxes.v 27 | hierarchy -top saes32 28 | rename saes32 saes32_full 29 | synth -flatten; abc -dff -g cmos; opt -fast 30 | tee -a synth.tmp stat -tech cmos 31 | tee -a synth.tmp ltp -noff 32 | 33 | -------------------------------------------------------------------------------- /hdl/tbref.txt: -------------------------------------------------------------------------------- 1 | [TB] rd=a56363c6 rs1=00000000 rs2=00000000 
fn=00 2 | [TB] rd=6e6edcb2 rs1=00000000 rs2=01234567 fn=01 3 | [TB] rd=5ab4ee5a rs1=00000000 rs2=02468ace fn=02 4 | [TB] rd=f68d7b7b rs1=00000000 rs2=0369d035 fn=03 5 | [TB] rd=000000de rs1=00000000 rs2=048d159c fn=04 6 | [TB] rd=00003900 rs1=00000000 rs2=05b05b03 fn=05 7 | [TB] rd=00660000 rs1=00000000 rs2=06d3a06a fn=06 8 | [TB] rd=c5000000 rs1=00000000 rs2=07f6e5d1 fn=07 9 | [TB] rd=0728ebb2 rs1=00000000 rs2=091a2b38 fn=08 10 | [TB] rd=670a0cb1 rs1=00000000 rs2=0a3d709f fn=09 11 | [TB] rd=7ca1470a rs1=00000000 rs2=0b60b606 fn=0a 12 | [TB] rd=4ffcd7e5 rs1=00000000 rs2=0c83fb6d fn=0b 13 | [TB] rd=00000019 rs1=00000000 rs2=0da740d4 fn=0c 14 | [TB] rd=0000dc00 rs1=00000000 rs2=0eca863b fn=0d 15 | [TB] rd=00530000 rs1=00000000 rs2=0fedcba2 fn=0e 16 | [TB] rd=e3000000 rs1=00000000 rs2=11111109 fn=0f 17 | [TB] rd=5353d784 rs1=00000000 rs2=12345670 fn=10 18 | [TB] rd=c030f0c0 rs1=00000000 rs2=13579bd7 fn=11 19 | [TB] rd=020a0808 rs1=00000000 rs2=147ae13e fn=12 20 | [TB] rd=46fafabc rs1=00000000 rs2=159e26a5 fn=13 21 | [TB] rd=00051428 rs1=00000000 rs2=16c16c0c fn=14 22 | [TB] rd=9b6ddb60 rs1=00000000 rs2=17e4b173 fn=15 23 | [TB] rd=5bb7e096 rs1=00000000 rs2=1907f6da fn=16 24 | [TB] rd=13608209 rs1=00000000 rs2=1a2b3c41 fn=17 25 | -------------------------------------------------------------------------------- /hdl/yoparse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # yoparse.py 4 | # 2020-02-27 Markku-Juhani O. Saarinen 5 | 6 | # parse the synthesis output 7 | 8 | import sys 9 | 10 | # "For evaluation purposes we [use] the following mockup ASIC cell library:" 11 | 12 | wt = {} 13 | wt["$_NOT_"] = 0.5 14 | wt["$_NAND_"] = 1.0 15 | wt["$_NOR_"] = 1.0 16 | wt["$_XOR_"] = 3.0 17 | wt["$_XNOR_"] = 3.0 18 | wt["$_DFF_P_"] = 4.0 19 | wt["$_AOI3_"] = 1.5 20 | wt["$_OAI3_"] = 1.5 21 | wt["$_AOI4_"] = 2.0 22 | wt["$_OAI4_"] = 2.0 23 | wt["$_NMUX_"] = 2.5 24 | wt["$_MUX_"] = 3.0 25 | 26 | # parse input files 27 | 28 | for fn in sys.argv[1:]: 29 | 30 | print(f"=== Summary for {fn} ===") 31 | 32 | with open(fn, 'r') as f: 33 | lns = f.readlines() 34 | 35 | tb = {} 36 | li = 0 37 | targ = "" 38 | ge = 0.0 39 | tr = 0 40 | ltp = 0 41 | 42 | for lin in lns: 43 | 44 | li = li + 1 45 | lv = lin.split(); 46 | ll = len(lv) 47 | 48 | if ll == 3 and lv[0] == "===": 49 | targ = lv[1] 50 | ge = 0.0 51 | tr = 0 52 | ltp = 0 53 | 54 | if ll == 5 and lv[3] == "transistors:": 55 | tr = int(lv[4]) 56 | 57 | if ll == 6 and lv[1] == "topological": 58 | tmp = lv[5][8:] 59 | ltp = int(tmp[:-2]) 60 | 61 | if ll == 2 and lv[0][:2] == "$_": 62 | if lv[0] in wt: 63 | ge = ge + float(lv[1]) * wt[lv[0]] 64 | else: 65 | print(f"{fn}:{li} unknown gate {lv[0]}") 66 | 67 | # update it 68 | if targ != "": 69 | tb[targ] = ( ge, tr, ltp ) 70 | 71 | # print the counts 72 | 73 | for x in tb: 74 | print(f"{x:20} ge={tb[x][0]:7} tr={tb[x][1]:5} ltp={tb[x][2]:3}") 75 | 76 | -------------------------------------------------------------------------------- /rv_endian.h: -------------------------------------------------------------------------------- 1 | // rv_endian.h 2 | // 2020-04-30 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 
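// For example, on a little-endian host GREV_BE32(0x00112233) evaluates to
// 0x33221100; with the RISC-V Bitmanip extension each GREV_BE* macro below
// collapses to a single grev/rev8 instruction.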
4 | 
5 | // RISC-V specific endianness support would be here (via intrinsics)
6 | 
7 | #ifndef _RV_ENDIAN_H_
8 | #define _RV_ENDIAN_H_
9 | 
10 | // byte-reverse if the target is not big-endian
11 | 
12 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
13 | #define GREV_BE32(x) (x)
14 | #else
15 | // grev(x, 0x18) or rev8
16 | #define GREV_BE32(x) ( \
17 | (((x) & 0xFF000000) >> 24) | (((x) & 0x00FF0000) >> 8) | \
18 | (((x) & 0x0000FF00) << 8) | (((x) & 0x000000FF) << 24))
19 | #endif
20 | 
21 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
22 | #define GREV_BE64(x) (x)
23 | #else
24 | // RISC-V: grev(x, 0x38) or rev8(x)
25 | #define GREV_BE64(x) ( \
26 | (((x) & 0xFF00000000000000LL) >> 56) | \
27 | (((x) & 0x00FF000000000000LL) >> 40) | \
28 | (((x) & 0x0000FF0000000000LL) >> 24) | \
29 | (((x) & 0x000000FF00000000LL) >> 8) | \
30 | (((x) & 0x00000000FF000000LL) << 8) | \
31 | (((x) & 0x0000000000FF0000LL) << 24) | \
32 | (((x) & 0x000000000000FF00LL) << 40) | \
33 | (((x) & 0x00000000000000FFLL) << 56))
34 | #endif
35 | 
36 | // rotate left (for 1 <= n <= 31; n == 0 would shift by the full width)
37 | static inline uint32_t rol32(uint32_t x, uint32_t n)
38 | {
39 | return ((x) << n) | ((x) >> (32 - n));
40 | }
41 | 
42 | // little-endian loads and stores (unaligned)
43 | 
44 | static inline uint32_t get32u_le(const uint8_t * v)
45 | {
46 | return ((uint32_t) v[0]) | (((uint32_t) v[1]) << 8) |
47 | (((uint32_t) v[2]) << 16) | (((uint32_t) v[3]) << 24);
48 | }
49 | 
50 | static inline void put32u_le(uint8_t * v, uint32_t x)
51 | {
52 | v[0] = x;
53 | v[1] = x >> 8;
54 | v[2] = x >> 16;
55 | v[3] = x >> 24;
56 | }
57 | 
58 | static inline uint64_t get64u_le(const uint8_t * v)
59 | {
60 | return ((uint64_t) v[0]) | (((uint64_t) v[1]) << 8) |
61 | (((uint64_t) v[2]) << 16) | (((uint64_t) v[3]) << 24) |
62 | (((uint64_t) v[4]) << 32) | (((uint64_t) v[5]) << 40) |
63 | (((uint64_t) v[6]) << 48) | (((uint64_t) v[7]) << 56);
64 | }
65 | 
66 | static inline void put64u_le(uint8_t * v, uint64_t x)
67 | {
68 | v[0] = x;
69 | v[1] = x >> 8;
70 | v[2] = x >> 16;
71 | v[3] = x >> 24;
72 | v[4] = x >> 32;
73 | v[5] = x >> 40;
74 | v[6] = x >> 48;
75 | v[7] = x >> 56;
76 | }
77 | 
78 | 
79 | // big-endian loads and stores (unaligned)
80 | 
81 | static inline uint32_t get32u_be(const uint8_t * v)
82 | {
83 | return (((uint32_t) v[0]) << 24) | (((uint32_t) v[1]) << 16) |
84 | (((uint32_t) v[2]) << 8) | ((uint32_t) v[3]);
85 | }
86 | 
87 | static inline void put32u_be(uint8_t * v, uint32_t x)
88 | {
89 | v[0] = x >> 24;
90 | v[1] = x >> 16;
91 | v[2] = x >> 8;
92 | v[3] = x;
93 | }
94 | 
95 | static inline uint64_t get64u_be(const uint8_t * v)
96 | {
97 | return (((uint64_t) v[0]) << 56) | (((uint64_t) v[1]) << 48) |
98 | (((uint64_t) v[2]) << 40) | (((uint64_t) v[3]) << 32) |
99 | (((uint64_t) v[4]) << 24) | (((uint64_t) v[5]) << 16) |
100 | (((uint64_t) v[6]) << 8) | ((uint64_t) v[7]);
101 | }
102 | 
103 | static inline void put64u_be(uint8_t * v, uint64_t x)
104 | {
105 | v[0] = x >> 56;
106 | v[1] = x >> 48;
107 | v[2] = x >> 40;
108 | v[3] = x >> 32;
109 | v[4] = x >> 24;
110 | v[5] = x >> 16;
111 | v[6] = x >> 8;
112 | v[7] = x;
113 | }
114 | 
115 | #endif
116 | 
--------------------------------------------------------------------------------
/saes32.c:
--------------------------------------------------------------------------------
1 | // saes32.c
2 | // 2020-01-24 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Running pseudocode for SAES32 (and ENC4S) AES/SM4 instruction.
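//
// Usage sketch (illustrative only; the variable names x0..x3 and t0 are
// hypothetical -- see aes_saes32.c in this repo for the real round code):
// one output column of an AES encryption round is accumulated into a
// round key word with four SAES32 operations,
//
//   t0 = saes32_encsm(rk[0], x0, 0);
//   t0 = saes32_encsm(t0, x1, 1);
//   t0 = saes32_encsm(t0, x2, 2);
//   t0 = saes32_encsm(t0, x3, 3);
//
// where the byte-select indices 0..3 pick bytes from four different state
// words, implementing ShiftRows without any extra instructions.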
6 | 
7 | #include "saes32.h"
8 | #include "sboxes.h"
9 | 
10 | // Function codes
11 | 
12 | #define SAES32_ENCSM 0
13 | #define SAES32_ENCS 1
14 | #define SAES32_DECSM 2
15 | #define SAES32_DECS 3
16 | #define SSM4_ED 4
17 | #define SSM4_KS 5
18 | 
19 | // Multiply by 0x02 in AES's GF(256) - LFSR style
20 | 
21 | static inline uint8_t aes_xtime(uint8_t x)
22 | {
23 | return (x << 1) ^ ((x & 0x80) ? 0x11B : 0x00);
24 | }
25 | 
26 | // === THIS IS THE SINGLE LIGHTWEIGHT INSTRUCTION FOR AES AND SM4 ===
27 | 
28 | // SAES32: Instruction for a byte select, single S-box, and linear operation.
29 | 
30 | uint32_t saes32(uint32_t rs1, uint32_t rs2, int fn)
31 | {
32 | uint32_t fa, fb, x, x2, x4, x8;
33 | 
34 | fa = 8 * (fn & 3); // [1:0] byte select / rotate
35 | fb = (fn >> 2) & 7; // [4:2] cipher select
36 | 
37 | // select input byte
38 | 
39 | x = (rs2 >> fa) & 0xFF; // select byte
40 | 
41 | // 8->8 bit s-box
42 | 
43 | switch (fb) {
44 | 
45 | case SAES32_ENCSM: // 0 : AES Forward + MC
46 | case SAES32_ENCS: // 1 : AES Forward "key"
47 | x = aes_sbox[x];
48 | break;
49 | 
50 | case SAES32_DECSM: // 2 : AES Inverse + MC
51 | case SAES32_DECS: // 3 : AES Inverse "key"
52 | x = aes_isbox[x];
53 | break;
54 | 
55 | case SSM4_ED: // 4 : SM4 encrypt/decrypt
56 | case SSM4_KS: // 5 : SM4 key schedule
57 | x = sm4_sbox[x];
58 | break;
59 | 
60 | default: // none
61 | break;
62 | }
63 | 
64 | // 8->32 bit linear transforms expressed as little-endian
65 | 
66 | switch (fb) {
67 | 
68 | case SAES32_ENCSM: // 0 : AES Forward MixCol
69 | x2 = aes_xtime(x); // double x
70 | x = ((x ^ x2) << 24) | // 0x03 MixCol MDS Matrix
71 | (x << 16) | // 0x01
72 | (x << 8) | // 0x01
73 | x2; // 0x02
74 | break;
75 | 
76 | case SAES32_DECSM: // 2 : AES Inverse MixCol
77 | x2 = aes_xtime(x); // double x
78 | x4 = aes_xtime(x2); // double to 4*x
79 | x8 = aes_xtime(x4); // double to 8*x
80 | x = ((x ^ x2 ^ x8) << 24) | // 0x0B Inv MixCol MDS Matrix
81 | ((x ^ x4 ^ x8) << 16) | // 0x0D
82 | ((x ^ x8) << 8) | // 0x09
83 | (x2 ^ x4 ^ x8); // 0x0E
84 | break;
85 | 
86 | case SSM4_ED: // 4 : SM4 linear transform L
87 | x = x ^ (x << 8) ^ (x << 2) ^ (x << 18) ^
88 | ((x & 0x3F) << 26) ^ ((x & 0xC0) << 10);
89 | break;
90 | 
91 | case SSM4_KS: // 5 : SM4 transform L' (key)
92 | x = x ^ ((x & 0x07) << 29) ^ ((x & 0xFE) << 7) ^
93 | ((x & 1) << 23) ^ ((x & 0xF8) << 13);
94 | break;
95 | 
96 | default: // none
97 | break;
98 | 
99 | }
100 | 
101 | // rotate output left by fa bits
102 | 
103 | if (fa != 0) {
104 | x = (x << fa) | (x >> (32 - fa));
105 | }
106 | 
107 | return x ^ rs1; // XOR with rs1
108 | }
109 | 
110 | // === PSEUDO OPS ===
111 | 
112 | // AES Encryption
113 | 
114 | uint32_t saes32_encsm(uint32_t rs1, uint32_t rs2, int bs)
115 | {
116 | return saes32(rs1, rs2, (SAES32_ENCSM << 2) | bs);
117 | }
118 | 
119 | uint32_t saes32_encs(uint32_t rs1, uint32_t rs2, int bs)
120 | {
121 | return saes32(rs1, rs2, (SAES32_ENCS << 2) | bs);
122 | }
123 | 
124 | // AES Decryption
125 | 
126 | uint32_t saes32_decsm(uint32_t rs1, uint32_t rs2, int bs)
127 | {
128 | return saes32(rs1, rs2, (SAES32_DECSM << 2) | bs);
129 | }
130 | 
131 | uint32_t saes32_decs(uint32_t rs1, uint32_t rs2, int bs)
132 | {
133 | return saes32(rs1, rs2, (SAES32_DECS << 2) | bs);
134 | }
135 | 
136 | // SM4 Encryption, Decryption and Key Schedule
137 | 
138 | uint32_t ssm4_ed(uint32_t rs1, uint32_t rs2, int bs)
139 | {
140 | return saes32(rs1, rs2, (SSM4_ED << 2) | bs);
141 | }
142 | 
143 | uint32_t ssm4_ks(uint32_t rs1, uint32_t rs2, int bs)
144 | {
145 | return saes32(rs1, rs2, (SSM4_KS << 2) | bs);
146 | }
147 | 
--------------------------------------------------------------------------------
/saes32.h:
--------------------------------------------------------------------------------
1 | // saes32.h
2 | // 2020-01-27 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Prototypes for SAES32 -- replace with intrinsics.
6 | 
7 | #ifndef _SAES32_H_
8 | #define _SAES32_H_
9 | 
10 | #include <stdint.h>
11 | 
12 | // Hardware simulation:
13 | // SAES32: Instruction for a byte select, single S-box, and linear operation.
14 | 
15 | uint32_t saes32(uint32_t rs1, uint32_t rs2, int fn);
16 | 
17 | // === (Pseudo) Instructions ===
18 | 
19 | // AES Encryption
20 | 
21 | uint32_t saes32_encsm(uint32_t rs1, uint32_t rs2, int bs);
22 | uint32_t saes32_encs(uint32_t rs1, uint32_t rs2, int bs);
23 | 
24 | // AES Decryption
25 | 
26 | uint32_t saes32_decsm(uint32_t rs1, uint32_t rs2, int bs);
27 | uint32_t saes32_decs(uint32_t rs1, uint32_t rs2, int bs);
28 | 
29 | // SM4 Encryption, Decryption and Key Schedule
30 | 
31 | uint32_t ssm4_ed(uint32_t rs1, uint32_t rs2, int bs);
32 | uint32_t ssm4_ks(uint32_t rs1, uint32_t rs2, int bs);
33 | 
34 | #endif // _SAES32_H_
35 | 
--------------------------------------------------------------------------------
/saes64.c:
--------------------------------------------------------------------------------
1 | // saes64.c
2 | // 2020-05-03 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Emulation code for SAES64
6 | 
7 | #include "saes64.h"
8 | #include "sboxes.h"
9 | 
10 | // ( Multiply by 0x02 in AES's GF(256) - LFSR style )
11 | 
12 | static inline uint8_t aes_xtime(uint8_t x)
13 | {
14 | return (x << 1) ^ ((x & 0x80) ? 0x11B : 0x00);
15 | }
16 | 
17 | // ( MixColumns functions )
18 | 
19 | static inline uint32_t saes64_mc8(uint32_t x)
20 | {
21 | uint32_t x2;
22 | 
23 | x2 = aes_xtime(x); // double x
24 | x = ((x ^ x2) << 24) | // 0x03 MixCol MDS Matrix
25 | (x << 16) | // 0x01
26 | (x << 8) | // 0x01
27 | x2; // 0x02
28 | 
29 | return x;
30 | }
31 | 
32 | static uint32_t saes64_mc32(uint32_t x)
33 | {
34 | uint32_t y;
35 | 
36 | y = saes64_mc8((x >> 24) & 0xFF);
37 | y = (y << 8) | (y >> 24);
38 | y ^= saes64_mc8((x >> 16) & 0xFF);
39 | y = (y << 8) | (y >> 24);
40 | y ^= saes64_mc8((x >> 8) & 0xFF);
41 | y = (y << 8) | (y >> 24);
42 | y ^= saes64_mc8(x & 0xFF);
43 | 
44 | return y;
45 | }
46 | 
47 | // SAES64.ENCS: Half of ShiftRows and SubBytes (last round)
48 | 
49 | uint64_t saes64_encs(uint64_t rs1, uint64_t rs2)
50 | {
51 | return ((uint64_t) aes_sbox[rs1 & 0xFF]) |
52 | (((uint64_t) aes_sbox[(rs1 >> 40) & 0xFF]) << 8) |
53 | (((uint64_t) aes_sbox[(rs2 >> 16) & 0xFF]) << 16) |
54 | (((uint64_t) aes_sbox[(rs2 >> 56) & 0xFF]) << 24) |
55 | (((uint64_t) aes_sbox[(rs1 >> 32) & 0xFF]) << 32) |
56 | (((uint64_t) aes_sbox[(rs2 >> 8) & 0xFF]) << 40) |
57 | (((uint64_t) aes_sbox[(rs2 >> 48) & 0xFF]) << 48) |
58 | (((uint64_t) aes_sbox[(rs1 >> 24) & 0xFF]) << 56);
59 | }
60 | 
61 | // SAES64.ENCSM: Half of ShiftRows, SubBytes, and MixColumns
62 | 
63 | uint64_t saes64_encsm(uint64_t rs1, uint64_t rs2)
64 | {
65 | uint64_t x;
66 | 
67 | // ShiftRows and SubBytes
68 | x = saes64_encs(rs1, rs2);
69 | 
70 | // MixColumns
71 | x = ((uint64_t) saes64_mc32(x)) |
72 | (((uint64_t) saes64_mc32(x >> 32)) << 32);
73 | 
74 | return x;
75 | }
76 | 
77 | // SAES64.DECS: Half of Inverse ShiftRows and SubBytes (last round)
78 | 
79 | uint64_t saes64_decs(uint64_t rs1, uint64_t rs2)
80 | {
81 | return ((uint64_t) aes_isbox[rs1 & 0xFF]) |
82 | (((uint64_t) aes_isbox[(rs2 >> 40) & 0xFF]) << 8) |
83 | (((uint64_t) aes_isbox[(rs2 >> 16) & 0xFF]) << 16) |
84 | (((uint64_t) aes_isbox[(rs1 >> 56) & 0xFF]) << 24) |
85 | (((uint64_t) aes_isbox[(rs1 >> 32) & 0xFF]) << 32) |
86 | (((uint64_t) aes_isbox[(rs1 >> 8) & 0xFF]) << 40) |
87 | (((uint64_t) aes_isbox[(rs2 >> 48) & 0xFF]) << 48) |
88 | (((uint64_t) aes_isbox[(rs2 >> 24) & 0xFF]) << 56);
89 | }
90 | 
91 | // SAES64.DECSM: Half of Inverse ShiftRows, SubBytes, and MixColumns
92 | 
93 | uint64_t saes64_decsm(uint64_t rs1, uint64_t rs2)
94 | {
95 | uint64_t x;
96 | 
97 | x = saes64_decs(rs1, rs2); // Inverse ShiftRows, SubBytes
98 | x = saes64_imix(x); // Inverse MixColumns
99 | 
100 | return x;
101 | }
102 | 
103 | // ( Inverse MixColumns functions )
104 | 
105 | static inline uint32_t saes64_imc8(uint32_t x)
106 | {
107 | uint32_t x2, x4, x8;
108 | 
109 | x2 = aes_xtime(x); // double x
110 | x4 = aes_xtime(x2); // double to 4*x
111 | x8 = aes_xtime(x4); // double to 8*x
112 | 
113 | x = ((x ^ x2 ^ x8) << 24) | // 0x0B Inv MixCol MDS Matrix
114 | ((x ^ x4 ^ x8) << 16) | // 0x0D
115 | ((x ^ x8) << 8) | // 0x09
116 | (x2 ^ x4 ^ x8); // 0x0E
117 | 
118 | return x;
119 | }
120 | 
121 | static uint32_t saes64_imc32(uint32_t x)
122 | {
123 | uint32_t y;
124 | 
125 | y = saes64_imc8((x >> 24) & 0xFF);
126 | y = (y << 8) | (y >> 24);
127 | y ^= saes64_imc8((x >> 16) & 0xFF);
128 | y = (y << 8) | (y >> 24);
129 | y ^= saes64_imc8((x >> 8) & 0xFF);
130 | y = (y << 8) | (y >> 24);
131 | y ^= saes64_imc8(x & 0xFF);
132 | 
133 | return y;
134 | }
135 | 
136 | // SAES64.IMIX: Inverse MixColumns for decryption key schedule
137 | 
138 | uint64_t saes64_imix(uint64_t rs1)
139 | {
140 | return ((uint64_t) saes64_imc32(rs1)) |
141 | (((uint64_t) saes64_imc32(rs1 >> 32)) << 32);
142 | }
143 | 
144 | // SAES64.KS1: Key Schedule 1 -- SubWord and opt. rotation, round const
145 | 
146 | uint64_t saes64_ks1(uint64_t rs1, uint8_t i)
147 | {
148 | uint32_t t, rc;
149 | 
150 | t = rs1 >> 32;
151 | rc = 0;
152 | 
153 | if (i < 10) { // i == 10: skip rotation and round constant
154 | t = (t >> 8) | (t << 24); // t = ROR(t, 8)
155 | rc = aes_rcon[i]; // round constant
156 | }
157 | // SubWord
158 | t = ((uint32_t) aes_sbox[t & 0xFF]) |
159 | (((uint32_t) aes_sbox[(t >> 8) & 0xFF]) << 8) |
160 | (((uint32_t) aes_sbox[(t >> 16) & 0xFF]) << 16) |
161 | (((uint32_t) aes_sbox[(t >> 24) & 0xFF]) << 24);
162 | 
163 | t ^= rc;
164 | 
165 | return ((uint64_t) t) | (((uint64_t) t) << 32);
166 | }
167 | 
168 | // SAES64.KS2: Key Schedule 2 -- Linear expansion
169 | 
170 | uint64_t saes64_ks2(uint64_t rs1, uint64_t rs2)
171 | {
172 | uint32_t t;
173 | 
174 | t = (rs1 >> 32) ^ (rs2 & 0xFFFFFFFF); // 32 bits
175 | 
176 | return ((uint64_t) t) ^ // low 32 bits
177 | (((uint64_t) t) << 32) ^ (rs2 & 0xFFFFFFFF00000000LL);
178 | }
179 | 
--------------------------------------------------------------------------------
/saes64.h:
--------------------------------------------------------------------------------
1 | // saes64.h
2 | // 2020-05-02 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Prototypes for SAES64 -- replace with intrinsics.
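//
// Usage sketch (illustrative only; q0/q1 hold the 128-bit state and k0/k1
// a round key -- these names are not part of the API): after the initial
// AddRoundKey, one middle encryption round of AES is
//
//   t0 = saes64_encsm(q0, q1);
//   t1 = saes64_encsm(q1, q0);
//   q0 = t0 ^ k0;
//   q1 = t1 ^ k1;
//
// with saes64_encs() substituted in the final round (no MixColumns).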
6 | 
7 | #ifndef _SAES64_H_
8 | #define _SAES64_H_
9 | 
10 | #include <stdint.h>
11 | 
12 | // === (Pseudo) Instructions ===
13 | 
14 | // SAES64.ENCSM: Half of ShiftRows, SubBytes, and MixColumns
15 | uint64_t saes64_encsm(uint64_t rs1, uint64_t rs2);
16 | 
17 | // SAES64.ENCS: Half of ShiftRows and SubBytes (last round)
18 | uint64_t saes64_encs(uint64_t rs1, uint64_t rs2);
19 | 
20 | // SAES64.DECSM: Half of Inverse ShiftRows, SubBytes, and MixColumns
21 | uint64_t saes64_decsm(uint64_t rs1, uint64_t rs2);
22 | 
23 | // SAES64.DECS: Half of Inverse ShiftRows and SubBytes (last round)
24 | uint64_t saes64_decs(uint64_t rs1, uint64_t rs2);
25 | 
26 | // SAES64.IMIX: Inverse MixColumns for decryption key schedule
27 | uint64_t saes64_imix(uint64_t rs1);
28 | 
29 | // SAES64.KS1: Key Schedule 1 -- SubWord and opt. rotation, round const
30 | uint64_t saes64_ks1(uint64_t rs1, uint8_t i);
31 | 
32 | // SAES64.KS2: Key Schedule 2 -- Linear expansion
33 | uint64_t saes64_ks2(uint64_t rs1, uint64_t rs2);
34 | 
35 | #endif // _SAES64_H_
36 | 
--------------------------------------------------------------------------------
/sboxes.c:
--------------------------------------------------------------------------------
1 | // sboxes.c
2 | // 2020-05-05 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Data for AES and SM4.
6 | 
7 | #include "sboxes.h"
8 | 
9 | // AES Round Constants
10 | 
11 | const uint8_t aes_rcon[] = {
12 | 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
13 | };
14 | 
15 | // AES Forward S-Box
16 | 
17 | const uint8_t aes_sbox[256] = {
18 | 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B,
19 | 0xFE, 0xD7, 0xAB, 0x76, 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0,
20 | 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, 0xB7, 0xFD, 0x93, 0x26,
21 | 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
22 | 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2,
23 | 0xEB, 0x27, 0xB2, 0x75, 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0,
24 | 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, 0x53, 0xD1, 0x00, 0xED,
25 | 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
26 | 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F,
27 | 0x50, 0x3C, 0x9F, 0xA8, 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5,
28 | 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, 0xCD, 0x0C, 0x13, 0xEC,
29 | 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
30 | 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14,
31 | 0xDE, 0x5E, 0x0B, 0xDB, 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C,
32 | 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, 0xE7, 0xC8, 0x37, 0x6D,
33 | 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
34 | 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F,
35 | 0x4B, 0xBD, 0x8B, 0x8A, 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E,
36 | 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, 0xE1, 0xF8, 0x98, 0x11,
37 | 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
38 | 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F,
39 | 0xB0, 0x54, 0xBB, 0x16
40 | };
41 | 
42 | // AES Inverse S-Box
43 | 
44 | const uint8_t aes_isbox[256] = {
45 | 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E,
46 | 0x81, 0xF3, 0xD7, 0xFB, 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87,
47 | 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB,
0x54, 0x7B, 0x94, 0x32, 48 | 0xA6, 0xC2, 0x23, 0x3D, 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E, 49 | 0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, 0x76, 0x5B, 0xA2, 0x49, 50 | 0x6D, 0x8B, 0xD1, 0x25, 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, 51 | 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, 0x6C, 0x70, 0x48, 0x50, 52 | 0xFD, 0xED, 0xB9, 0xDA, 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84, 53 | 0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, 0xF7, 0xE4, 0x58, 0x05, 54 | 0xB8, 0xB3, 0x45, 0x06, 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, 55 | 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, 0x3A, 0x91, 0x11, 0x41, 56 | 0x4F, 0x67, 0xDC, 0xEA, 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73, 57 | 0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, 0xE2, 0xF9, 0x37, 0xE8, 58 | 0x1C, 0x75, 0xDF, 0x6E, 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, 59 | 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, 0xFC, 0x56, 0x3E, 0x4B, 60 | 0xC6, 0xD2, 0x79, 0x20, 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4, 61 | 0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, 0xB1, 0x12, 0x10, 0x59, 62 | 0x27, 0x80, 0xEC, 0x5F, 0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, 63 | 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, 0xA0, 0xE0, 0x3B, 0x4D, 64 | 0xAE, 0x2A, 0xF5, 0xB0, 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61, 65 | 0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, 0xE1, 0x69, 0x14, 0x63, 66 | 0x55, 0x21, 0x0C, 0x7D 67 | }; 68 | 69 | // SM4 Forward S-Box (there is no need for an inverse S-Box) 70 | 71 | const uint8_t sm4_sbox[256] = { 72 | 0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 73 | 0x28, 0xFB, 0x2C, 0x05, 0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 74 | 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, 0x9C, 0x42, 0x50, 0xF4, 75 | 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, 76 | 0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 77 | 0x75, 0x8F, 0x3F, 0xA6, 0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 78 | 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, 0x68, 0x6B, 0x81, 0xB2, 79 | 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, 80 | 0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 81 | 0x01, 0x21, 0x78, 0x87, 0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 82 | 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, 0xEA, 0xBF, 0x8A, 0xD2, 83 | 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, 84 | 0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 85 | 0xF5, 0x8C, 0xB1, 0xE3, 0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 86 | 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, 0xD5, 0xDB, 0x37, 0x45, 87 | 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, 88 | 0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 89 | 0x1F, 0x10, 0x5A, 0xD8, 0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 90 | 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, 0x89, 0x69, 0x97, 0x4A, 91 | 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, 92 | 0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 93 | 0xD7, 0xCB, 0x39, 0x48 94 | }; 95 | -------------------------------------------------------------------------------- /sboxes.h: -------------------------------------------------------------------------------- 1 | // sboxes.h 2 | // 2020-05-05 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // Data for AES and SM4. 
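// (The round constants aes_rcon[i] are x^i in AES's GF(256), i = 0..9,
// per FIPS 197.)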
6 | 
7 | #ifndef _SBOXES_H_
8 | #define _SBOXES_H_
9 | 
10 | #include <stdint.h>
11 | 
12 | // AES Round Constants
13 | extern const uint8_t aes_rcon[];
14 | 
15 | // AES Forward S-Box
16 | extern const uint8_t aes_sbox[256];
17 | 
18 | // AES Inverse S-Box
19 | extern const uint8_t aes_isbox[256];
20 | 
21 | // SM4 Forward S-Box (there is no need for an inverse S-Box)
22 | extern const uint8_t sm4_sbox[256];
23 | 
24 | #endif // _SBOXES_H_
25 | 
--------------------------------------------------------------------------------
/sm4_ssm4.c:
--------------------------------------------------------------------------------
1 | // sm4_ssm4.c
2 | // 2020-01-27 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // SM4 (Chinese Encryption Standard) Encryption and Decryption.
6 | 
7 | #include "sm4_wrap.h"
8 | #include "saes32.h"
9 | #include "rv_endian.h"
10 | 
11 | // SSM4_ED_X4 is a block of four ssm4.ed instructions:
12 | 
13 | #define SSM4_ED_X4(rs1, rs2) { \
14 | rs1 = ssm4_ed(rs1, rs2, 0); \
15 | rs1 = ssm4_ed(rs1, rs2, 1); \
16 | rs1 = ssm4_ed(rs1, rs2, 2); \
17 | rs1 = ssm4_ed(rs1, rs2, 3); \
18 | }
19 | 
20 | // SSM4_KS_X4 is a block of four ssm4.ks instructions:
21 | 
22 | #define SSM4_KS_X4(rs1, rs2) { \
23 | rs1 = ssm4_ks(rs1, rs2, 0); \
24 | rs1 = ssm4_ks(rs1, rs2, 1); \
25 | rs1 = ssm4_ks(rs1, rs2, 2); \
26 | rs1 = ssm4_ks(rs1, rs2, 3); \
27 | }
28 | 
29 | // encrypt or decrypt a block, depending on round key ordering
30 | 
31 | void sm4_encdec(uint8_t out[16], const uint8_t in[16],
32 | const uint32_t rk[SM4_RK_WORDS])
33 | {
34 | uint32_t x0, x1, x2, x3, t, u;
35 | const uint32_t *kp = &rk[SM4_RK_WORDS];
36 | 
37 | x0 = get32u_le(in); // little endian (native)
38 | x1 = get32u_le(in + 4);
39 | x2 = get32u_le(in + 8);
40 | x3 = get32u_le(in + 12);
41 | 
42 | do {
43 | 
44 | u = x2 ^ x3; // 10 XORs total per round
45 | 
46 | t = rk[0]; // subkeys can be inline
47 | t ^= u;
48 | t ^= x1;
49 | SSM4_ED_X4(x0, t); // 4 x SSM4.ED
50 | 
51 | t = rk[1];
52 | t ^= u;
53 | t ^= x0;
54 | SSM4_ED_X4(x1, t); // 4 x SSM4.ED
55 | u = x0 ^ x1;
56 | 
57 | t = rk[2];
58 | t ^= u;
59 | t ^= x3;
60 | SSM4_ED_X4(x2, t); // 4 x SSM4.ED
61 | 
62 | t = rk[3];
63 | t ^= u;
64 | t ^= x2;
65 | SSM4_ED_X4(x3, t); // 4 x SSM4.ED
66 | 
67 | rk += 4; // unroll?
68 | 
69 | } while (rk != kp);
70 | 
71 | put32u_le(out, x3);
72 | put32u_le(out + 4, x2);
73 | put32u_le(out + 8, x1);
74 | put32u_le(out + 12, x0);
75 | }
76 | 
77 | // set key for encryption
78 | 
79 | void sm4_enc_key(uint32_t rk[SM4_RK_WORDS], const uint8_t key[16])
80 | {
81 | const uint32_t *kp = &rk[SM4_RK_WORDS];
82 | uint32_t x0, x1, x2, x3;
83 | uint32_t t, u, ck;
84 | 
85 | x0 = get32u_le(key); // fetch key words
86 | x1 = get32u_le(key + 4);
87 | x2 = get32u_le(key + 8);
88 | x3 = get32u_le(key + 12);
89 | 
90 | x0 ^= 0xC6BAB1A3; // "FK" constants, little-endian
91 | x1 ^= 0x5033AA56; // (note: seems pointless?)
92 | x2 ^= 0x97917D67;
93 | x3 ^= 0xDC2270B2;
94 | 
95 | ck = 0x140E0600; // 0x150E0700 with LSBs masked
96 | 
97 | do {
98 | /*
99 | "CK" Discussion:
100 | 
101 | The SM4 "CK" round constants are a sequence of bytes 7*i (mod 256) with
102 | i = 0..127, interpreted as 32-bit words. Often these words are stored in
103 | a constant table. However, many ISAs have a "SIMD" addition that adds 4 or
104 | more bytes in parallel, which is faster than a table look-up. Even some
105 | low-end embedded targets such as Cortex M4 (Armv7E-M/DSP) support this
106 | (SADD8) and its introduction as a RISC-V extension should be considered.
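As a worked example: the first round below computes t = 0x140E0600 ^
0x01000100 = 0x150E0700, whose little-endian bytes 00 07 0E 15 are exactly
CK[0] = 0x00070E15; the masked addition of 0x1C1C1C1C then advances every
byte by 4*7 = 28 to form the next constant word.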
107 |       Meanwhile, we can perform the same function with three simple arithmetic
108 |       ops, which is likely to still be faster than fetching from a table
109 |       (with its associated address arithmetic). This implementation is
            certainly smaller.
110 |       */
111 |       t = ck ^ 0x01000100;            // these constants in registers
112 |       ck += 0x1C1C1C1C;               // if we have "SADD8", then
113 |       ck &= 0xFEFEFEFE;               // -> 4 x "SADD8" per round.
114 | 
115 |       u = x2 ^ x3;                    // 10 XORs per round
116 |       t = t ^ u;
117 |       t = t ^ x1;
118 |       SSM4_KS_X4(x0, t);              // 4 x SSM4.KS
119 | 
120 |       rk[0] = x0;                     // four stores per round
121 | 
122 |       t = ck ^ 0x01000100;
123 |       ck += 0x1C1C1C1C;
124 |       ck &= 0xFEFEFEFE;
125 | 
126 |       t = t ^ u;
127 |       t = t ^ x0;
128 |       SSM4_KS_X4(x1, t);              // 4 x SSM4.KS
129 |       rk[1] = x1;
130 | 
131 |       t = ck ^ 0x01000100;
132 |       ck += 0x1C1C1C1C;
133 |       ck &= 0xFEFEFEFE;
134 | 
135 |       u = x0 ^ x1;
136 |       t ^= u;
137 |       t ^= x3;
138 |       SSM4_KS_X4(x2, t);              // 4 x SSM4.KS
139 |       rk[2] = x2;
140 | 
141 |       t = ck ^ 0x01000100;
142 |       ck += 0x1C1C1C1C;
143 |       ck &= 0xFEFEFEFE;
144 | 
145 |       t ^= u;
146 |       t ^= x2;
147 |       SSM4_KS_X4(x3, t);              // 4 x SSM4.KS
148 |       rk[3] = x3;
149 | 
150 |       rk += 4;
151 | 
152 |   } while (rk != kp);
153 | }
154 | 
155 | // set key for decryption
156 | 
157 | void sm4_dec_key(uint32_t rk[SM4_RK_WORDS], const uint8_t key[16])
158 | {
159 |     uint32_t t;
160 |     int i, j;
161 | 
162 |     sm4_enc_key(rk, key);               // encryption expansion
163 | 
164 |     // decryption round keys = encryption round keys in reverse order
165 |     for (i = 0, j = SM4_RK_WORDS - 1; i < j; i++, j--) {
166 |         t = rk[i];
167 |         rk[i] = rk[j];
168 |         rk[j] = t;
169 |     }
170 | }
--------------------------------------------------------------------------------
/sm4_test.c:
--------------------------------------------------------------------------------
1 | // sm4_test.c
2 | // 2020-03-21 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Unit tests for SM4
6 | 
7 | #include "test_hex.h"
8 | #include "sm4_wrap.h"
9 | 
10 | // Test SM4
11 | 
12 | int test_sm4()
13 | {
14 |     uint8_t pt[16], ct[16], xt[16], key[16];
15 |     uint32_t rk[SM4_RK_WORDS];
16 |     int fail = 0;
17 | 
18 |     // the sole test vector in the standard itself
19 |     readhex(key, sizeof(key), "0123456789ABCDEFFEDCBA9876543210");
20 |     sm4_enc_key(rk, key);
21 |     readhex(pt, sizeof(pt), "0123456789ABCDEFFEDCBA9876543210");
22 |     sm4_enc_ecb(ct, pt, rk);
23 |     fail += chkhex("SM4 Encrypt", ct, 16, "681EDF34D206965E86B3E94F536E4246");
24 |     sm4_dec_key(rk, key);
25 |     sm4_enc_ecb(xt, ct, rk);            // decrypt via the encrypt alias
                                            // (both map to sm4_encdec)
26 |     fail += chkhex("SM4 Decrypt", xt, 16, "0123456789ABCDEFFEDCBA9876543210");
27 | 
28 |     // additional test vectors from various sources..
29 |    readhex(key, sizeof(key), "FEDCBA98765432100123456789ABCDEF");
30 |    sm4_enc_key(rk, key);
31 |    readhex(pt, sizeof(pt), "000102030405060708090A0B0C0D0E0F");
32 |    sm4_enc_ecb(ct, pt, rk);
33 |    fail += chkhex("SM4 Encrypt", ct, 16, "F766678F13F01ADEAC1B3EA955ADB594");
34 |    sm4_dec_key(rk, key);
35 |    sm4_dec_ecb(xt, ct, rk);
36 |    fail += chkhex("SM4 Decrypt", xt, 16, "000102030405060708090A0B0C0D0E0F");
37 | 
38 |    readhex(key, sizeof(key), "EB23ADD6454757555747395B76661C9A");
39 |    sm4_enc_key(rk, key);
40 |    readhex(pt, sizeof(pt), "D294D879A1F02C7C5906D6C2D0C54D9F");
41 |    sm4_enc_ecb(ct, pt, rk);
42 |    fail += chkhex("SM4 Encrypt", ct, 16, "865DE90D6B6E99273E2D44859D9C16DF");
43 |    sm4_dec_key(rk, key);
44 |    sm4_dec_ecb(xt, ct, rk);
45 |    fail += chkhex("SM4 Decrypt", xt, 16, "D294D879A1F02C7C5906D6C2D0C54D9F");
46 | 
47 |    readhex(key, sizeof(key), "F11235535318FA844A3CBE643169F59E");
48 |    sm4_enc_key(rk, key);
49 |    readhex(pt, sizeof(pt), "A27EE076E48E6F389710EC7B5E8A3BE5");
50 |    sm4_enc_ecb(ct, pt, rk);
51 |    fail += chkhex("SM4 Encrypt", ct, 16, "94CFE3F59E8507FEC41DBE738CCD53E1");
52 |    sm4_dec_key(rk, key);
53 |    sm4_dec_ecb(xt, ct, rk);
54 |    fail += chkhex("SM4 Decrypt", xt, 16, "A27EE076E48E6F389710EC7B5E8A3BE5");
55 | 
56 |    return fail;
57 | }
--------------------------------------------------------------------------------
/sm4_wrap.h:
--------------------------------------------------------------------------------
1 | // sm4_wrap.h
2 | // 2020-01-24 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Prototypes for SM4 (Chinese Encryption Standard) Encryption.
6 | 
7 | // The decryption function is the same as encryption, with the difference
8 | // of having a reversed key schedule. Hence we define both functions here.
9 | 
10 | #ifndef _SM4_WRAP_H_
11 | #define _SM4_WRAP_H_
12 | 
13 | #include <stdint.h>
14 | 
15 | // Size of the expanded key.
16 | #define SM4_RK_WORDS 32
17 | 
18 | // encrypt/decrypt a block, depending on ordering of rk
19 | void sm4_encdec(uint8_t out[16], const uint8_t in[16],
20 |                 const uint32_t rk[SM4_RK_WORDS]);
21 | 
22 | // expand a secret key for encryption
23 | void sm4_enc_key(uint32_t rk[SM4_RK_WORDS], const uint8_t key[16]);
24 | 
25 | // expand a secret key for decryption
26 | void sm4_dec_key(uint32_t rk[SM4_RK_WORDS], const uint8_t key[16]);
27 | 
28 | // aliases
29 | #define sm4_enc_ecb(ct, pt, rk) sm4_encdec(ct, pt, rk)
30 | #define sm4_dec_ecb(pt, ct, rk) sm4_encdec(pt, ct, rk)
31 | 
32 | #endif // _SM4_WRAP_H_
--------------------------------------------------------------------------------
/test_hex.c:
--------------------------------------------------------------------------------
1 | // test_hex.c
2 | // 2020-03-07 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
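//
// (Typical usage, as in sm4_test.c above: readhex() parses a hex test
//  vector into a byte buffer, and chkhex() compares computed output
//  against a reference hex string, printing a [PASS]/[FAIL] line and
//  returning nonzero on mismatch:
//
//      readhex(key, sizeof(key), "0123456789ABCDEFFEDCBA9876543210");
//      fail += chkhex("SM4 Encrypt", ct, 16, "681EDF34...");
//  )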
4 | 
5 | // functions to facilitate simple runtime tests
6 | 
7 | #include "test_hex.h"
8 | 
9 | // single hex digit
10 | 
11 | static int hexdigit(char ch)
12 | {
13 |     if (ch >= '0' && ch <= '9')
14 |         return ch - '0';
15 |     if (ch >= 'A' && ch <= 'F')
16 |         return ch - 'A' + 10;
17 |     if (ch >= 'a' && ch <= 'f')
18 |         return ch - 'a' + 10;
19 |     return -1;
20 | }
21 | 
22 | // read a hex string of "maxbytes", return byte length
23 | 
24 | size_t readhex(uint8_t * buf, size_t maxbytes, const char *str)
25 | {
26 |     size_t i;
27 |     int h, l;
28 | 
29 |     for (i = 0; i < maxbytes; i++) {
30 |         h = hexdigit(str[2 * i]);
31 |         if (h < 0)
32 |             return i;
33 |         l = hexdigit(str[2 * i + 1]);
34 |         if (l < 0)
35 |             return i;
36 |         buf[i] = (h << 4) + l;
37 |     }
38 | 
39 |     return i;
40 | }
41 | 
42 | // print hexadecimal "data", length "len", with label "lab"
43 | 
44 | void prthex(const char *lab, const void *data, size_t len)
45 | {
46 |     size_t i;
47 |     uint8_t x;
48 | 
49 |     printf("[TEST] %s ", lab);
50 |     const char hex[] = "0123456789ABCDEF";
51 | 
52 |     for (i = 0; i < len; i++) {
53 |         x = ((const uint8_t *) data)[i];
54 |         putchar(hex[(x >> 4) & 0xF]);
55 |         putchar(hex[x & 0xF]);
56 |     }
57 |     putchar('\n');
58 | }
59 | 
60 | // check "data" of "len" bytes against a hexadecimal test vector "ref"
61 | 
62 | int chkhex(const char *lab, const void *data, size_t len, const char *ref)
63 | {
64 |     size_t i;
65 |     uint8_t x;
66 |     int fail = 0;
67 | 
68 |     // check equivalence
69 |     for (i = 0; i < len; i++) {
70 |         x = ((const uint8_t *) data)[i];
71 |         if (hexdigit(ref[2 * i]) != ((x >> 4) & 0xF) ||
72 |             hexdigit(ref[2 * i + 1]) != (x & 0x0F)) {
73 |             fail = 1;
74 |             break;
75 |         }
76 |     }
77 | 
78 |     // also fail if "ref" continues past "len" bytes
79 |     if (i == len && hexdigit(ref[2 * len]) >= 0) {
80 |         fail = 1;
81 |     }
82 | 
83 |     printf("[%s] %s %s\n", fail ? "FAIL" : "PASS", lab, ref);
84 | 
85 |     if (fail) {
86 |         prthex(lab, data, len);
87 |     }
88 | 
89 |     return fail;
90 | }
91 | 
92 | // boolean return value check
93 | 
94 | int chkret(const char *lab, int want, int have)
95 | {
96 |     printf("[%s] %s WANT=%d HAVE=%d\n",
97 |            want != have ? "FAIL" : "PASS", lab, want, have);
98 | 
99 |     return want != have ? 1 : 0;
100 | }
--------------------------------------------------------------------------------
/test_hex.h:
--------------------------------------------------------------------------------
1 | // test_hex.h
2 | // 2020-03-07 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // functions to facilitate simple runtime tests
6 | 
7 | #ifndef _TEST_HEX_H_
8 | #define _TEST_HEX_H_
9 | 
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | #include <string.h>
13 | #include <stdint.h>
14 | 
15 | // read a hex string of "maxbytes", return byte length
16 | size_t readhex(uint8_t * buf, size_t maxbytes, const char *str);
17 | 
18 | // print hexadecimal "data", length "len", with label "lab"
19 | void prthex(const char *lab, const void *data, size_t len);
20 | 
21 | // check "data" of "len" bytes against a hexadecimal test vector "ref"
22 | int chkhex(const char *lab, const void *data, size_t len, const char *ref);
23 | 
24 | // boolean return value check
25 | int chkret(const char *lab, int want, int have);
26 | 
27 | #endif
28 | 
--------------------------------------------------------------------------------
/test_main.c:
--------------------------------------------------------------------------------
1 | // test_main.c
2 | // 2020-01-23 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
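//
// (Orientation note: the AES block/key functions and the GHASH multiply
//  are reached through function pointers -- aes128_enc_key, ghash_mul,
//  etc., from the wrapper headers -- so main() below can re-run the same
//  test vectors with each implementation (SAES32, SAES64, on-the-fly
//  SAES64 keying, and the GHASH variants) as the unit under test.)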
4 | 
5 | // Minimal unit tests for AES-128/192/256 (FIPS 197) and SM4 (GM/T 0002-2012).
6 | 
7 | #include <stdio.h>
8 | #include <stdlib.h>
9 | #include <string.h>
10 | #include <stdint.h>
11 | 
12 | #include "aes_wrap.h"
13 | #include "saes32.h"
14 | #include "aes_saes32.h"
15 | #include "aes_saes64.h"
16 | #include "aes_otf_saes64.h"
17 | 
18 | #include "gcm_wrap.h"
19 | #include "gcm_gfmul.h"
20 | 
21 | 
22 | // unit tests
23 | 
24 | int test_aes();                         // aes_test.c
25 | int test_sm4();                         // sm4_test.c
26 | int test_gcm();                         // gcm_test.c
27 | 
28 | // generate "reference" hw testbench data for the instruction
29 | // output should match hdl/saes32_tb.v
30 | 
31 | int test_hwtb()
32 | {
33 |     uint32_t rd, rs1, rs2, fn;
34 | 
35 |     rs1 = 0x00000000;
36 |     rs2 = 0x00000000;
37 | 
38 |     for (fn = 0; fn < 24; fn++) {
39 | 
40 |         rd = saes32(rs1, rs2, fn);
41 | 
42 |         printf("[TB] rd=%08x rs1=%08x rs2=%08x fn=%02x\n", rd, rs1, rs2, fn);
43 | 
44 |         rs2 += 0x01234567;
45 |     }
46 | 
47 |     return 0;
48 | }
49 | 
50 | // stub main: run unit tests
51 | 
52 | int main(int argc, char **argv)
53 | {
54 |     int fail = 0;
55 | 
56 |     // generate hardware testbench data ?
57 |     if (argc > 1 && strcmp(argv[1], "tb") == 0) {
58 |         return test_hwtb();
59 |     }
60 |     // algorithm tests
61 | 
62 |     printf("[INFO] === AES using SAES32 ===\n");
63 | 
64 |     aes128_enc_key = aes128_enc_key_saes32;     // set encryption key
65 |     aes192_enc_key = aes192_enc_key_saes32;
66 |     aes256_enc_key = aes256_enc_key_saes32;
67 | 
68 |     aes128_enc_ecb = aes128_enc_ecb_saes32;     // encrypt a block
69 |     aes192_enc_ecb = aes192_enc_ecb_saes32;
70 |     aes256_enc_ecb = aes256_enc_ecb_saes32;
71 | 
72 |     aes128_dec_key = aes128_dec_key_saes32;     // set decryption key
73 |     aes192_dec_key = aes192_dec_key_saes32;
74 |     aes256_dec_key = aes256_dec_key_saes32;
75 | 
76 |     aes128_dec_ecb = aes128_dec_ecb_saes32;     // decrypt a block
77 |     aes192_dec_ecb = aes192_dec_ecb_saes32;
78 |     aes256_dec_ecb = aes256_dec_ecb_saes32;
79 | 
80 |     fail += test_aes();                 // run tests with UUT = SAES32
81 | 
82 |     printf("[INFO] === AES using SAES64 / On-the-fly keying ===\n");
83 | 
84 |     aes128_enc_ecb = aes128_enc_otf_saes64;
85 |     aes192_enc_ecb = aes192_enc_otf_saes64;
86 |     aes256_enc_ecb = aes256_enc_otf_saes64;
87 | 
88 |     fail += test_aes();                 // run tests with UUT = OTF/64
89 | 
90 |     printf("[INFO] === AES using SAES64 ===\n");
91 | 
92 |     aes128_enc_key = aes128_enc_key_saes64;     // set encryption key
93 |     aes192_enc_key = aes192_enc_key_saes64;
94 |     aes256_enc_key = aes256_enc_key_saes64;
95 | 
96 |     aes128_enc_ecb = aes128_enc_ecb_saes64;     // encrypt a block
97 |     aes192_enc_ecb = aes192_enc_ecb_saes64;
98 |     aes256_enc_ecb = aes256_enc_ecb_saes64;
99 | 
100 |    aes128_dec_key = aes128_dec_key_saes64;     // set decryption key
101 |    aes192_dec_key = aes192_dec_key_saes64;
102 |    aes256_dec_key = aes256_dec_key_saes64;
103 | 
104 |    aes128_dec_ecb = aes128_dec_ecb_saes64;     // decrypt a block
105 |    aes192_dec_ecb = aes192_dec_ecb_saes64;
106 |    aes256_dec_ecb = aes256_dec_ecb_saes64;
107 | 
108 |    fail += test_aes();                  // run tests with UUT = SAES64
109 | 
110 | 
111 | 
112 |    printf("[INFO] === GCM using rv64_ghash_mul() ===\n");
113 |    ghash_rev = rv64_ghash_rev;
114 |    ghash_mul = rv64_ghash_mul;
115 |    fail += test_gcm();
116 | 
117 |    printf("[INFO] === GCM using rv32_ghash_mul() ===\n");
118 |    ghash_rev = rv32_ghash_rev;
119 |    ghash_mul = rv32_ghash_mul;
120 |    fail += test_gcm();
121 | 
122 |    printf("[INFO] === GCM using rv32_ghash_mul_kar() ===\n");
123 |    ghash_rev = rv32_ghash_rev;
124 |    ghash_mul = rv32_ghash_mul_kar;
125 |    fail += test_gcm();
126 | 
127 |    printf("[INFO] === SM4 test ===\n");
===\n"); 128 | fail += test_sm4(); 129 | 130 | if (fail == 0) { 131 | printf("[PASS] all tests passed.\n"); 132 | } else { 133 | printf("[FAIL] %d test(s) failed.\n", fail); 134 | } 135 | 136 | return fail; 137 | } 138 | --------------------------------------------------------------------------------