├── .editorconfig
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── aes_otf_saes64.c
├── aes_otf_saes64.h
├── aes_saes32.c
├── aes_saes32.h
├── aes_saes64.c
├── aes_saes64.h
├── aes_test.c
├── aes_wrap.c
├── aes_wrap.h
├── asm
│   ├── README.md
│   ├── saes32_c0.h
│   ├── saes32_dec.S
│   ├── saes32_enc.S
│   ├── saes32_wrap.h
│   ├── sm4_encdec.S
│   └── sm4_encdec.h
├── bitmanip.c
├── bitmanip.h
├── doc
│   ├── NIST.FIPS.197.pdf
│   ├── gmt0002-2012sm4.pdf
│   ├── lwaes.pdf
│   ├── sm4en.pdf
│   ├── sp800-38d.pdf
│   └── test_gcm_ossl.c
├── gcm_gfmul.h
├── gcm_rv32b_gfmul.c
├── gcm_rv64b_gfmul.c
├── gcm_test.c
├── gcm_wrap.c
├── gcm_wrap.h
├── hdl
│   ├── Makefile
│   ├── README.md
│   ├── saes32.v
│   ├── saes32_tb.v
│   ├── sboxes.v
│   ├── synth.ys
│   ├── tbref.txt
│   └── yoparse.py
├── rv_endian.h
├── saes32.c
├── saes32.h
├── saes64.c
├── saes64.h
├── sboxes.c
├── sboxes.h
├── sm4_ssm4.c
├── sm4_test.c
├── sm4_wrap.h
├── test_hex.c
├── test_hex.h
└── test_main.c

--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
# .editorconfig
# 2019-09-02  Markku-Juhani O. Saarinen
# Works on GitHub. See: https://EditorConfig.org

root = true

[*]
end_of_line = lf
insert_final_newline = true

[*.{c,h,s,S,v}]
charset = latin1
indent_style = tab
indent_size = 4

[*.{txt,md,tex}]
charset = utf-8
indent_style = space
indent_size = 4

[Makefile]
indent_style = tab
indent_size = 4

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# project specific
xtest

firmware.*
config.h
*.vvp

# Prerequisites
*.d

# Object files
*.o
*.ko
*.obj
*.elf

# Linker output
*.ilk
*.map
*.exp

# Precompiled Headers
*.gch
*.pch

# Libraries
*.lib
*.a
*.la
*.lo

# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib

# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex

# Debug files
*.dSYM/
*.su
*.idb
*.pdb

# Kernel Module Compile Results
*.mod*
*.cmd
.tmp_versions/
modules.order
Module.symvers
Mkfile.old
dkms.conf

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 2-Clause License (RISC-V)

Copyright (c) 2020, Markku-Juhani O. Saarinen, PQShield Ltd.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Makefile
# 2020-01-22  Markku-Juhani O. Saarinen
# Copyright (c) 2020, PQShield Ltd. All rights reserved.

# export all variables to sub-makefiles
export

BIN	= xtest
CSRC	= $(wildcard *.c)
OBJS	= $(CSRC:.c=.o)
CC	= gcc
CFLAGS	= -Wall -Wextra -Wshadow -fsanitize=address,undefined -O2 -g
#CFLAGS	= -Wall -march=native -O3
LIBS	+=

$(BIN): $(OBJS)
	$(CC) $(CFLAGS) -o $(BIN) $(OBJS) $(LIBS)

%.o: %.[cS]
	$(CC) $(CFLAGS) -c $^ -o $@

clean:
	rm -rf $(OBJS) $(BIN) *~
	cd hdl && $(MAKE) clean

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# A Lightweight (RISC-V) ISA Extension for AES and SM4

**[HISTORICAL]** *This personal repo was used for work-in-progress
contributions to the RISC-V Cryptographic Extensions Task Group in 2020
and is no longer updated.*
See [riscv-crypto](https://github.com/riscv/riscv-crypto/) for
up-to-date information.

January 22, 2020  Markku-Juhani O. Saarinen

**Updated** April 23, 2020: Renamed ENC1S to SAES32 (and SSM4), as per the
current draft spec where the proposal now resides.


## Description

A lightweight ISA extension proposal supporting:

*	AES (Advanced Encryption Standard) with a 128/192/256-bit secret key,
	as defined in [FIPS 197](doc/NIST.FIPS.197.pdf).

*	SM4, the Chinese encryption algorithm [GM/T 0002-2012](doc/gmt0002-2012sm4.pdf)
	[(English spec)](doc/sm4en.pdf), also defined in GB/T 32907-2016 and ISO/IEC
	18033-3:2010/DAmd 2. SM4 has only one key size, 128 bits.

A single instruction, SAES32, is used for encryption, decryption, and key
schedule for both ciphers. For design rationale and some analysis, see the
short report
[A Lightweight ISA Extension for AES and SM4](https://arxiv.org/abs/2002.07041)
(to appear at SECRISC-V 2020); note that the same instruction is called
"ENC1S" there.

A more complex ISA extension is appropriate for higher-end CPUs. The
primary goal of SAES32 / lwaes is to eliminate timing side-channel
vulnerabilities. Speed-up over pure software table-based implementations
is roughly 500%.

**NOTE** After the ENC1S proposal was adopted into the
[Crypto TG Draft](https://github.com/scarv/riscv-crypto) as SAES32, I'm
trying to keep this code up to date with it. There is also initial
emulation pseudocode for the 64-bit SAES64 instructions; no real assembler
or HDL yet.

## Software and Hardware Source Code

This directory contains an "emulator" C implementation of the instruction
together with runnable pseudocode for full encryption, decryption, and
key schedule of AES-128/192/256 and SM4-128. These are intended for
instruction counts, test vector generation, and other such evaluation.
Real assembler listings for the same functions (using a seriously hacky
macro instruction encoding) can be found under the [asm](asm) directory.

The assembler and C code use essentially the same AES and SM4 API
(specified in [saes32_wrap.h](saes32_wrap.h)), so the same test code
can be used with both.

The [hdl](hdl) directory contains Verilog combinatorial logic for the core
instruction. Simulator and basic CMOS gate-count synthesis scripts are
provided for the Icarus Verilog and Yosys open source tools. The same
assembler and HDL have additionally been tested with PQShield's proprietary
RISC-V emulator and the "Pluto" core on a live FPGA target, although source
code for those is not provided here.


## Technical Details

The instruction is encapsulated in a single emulator function in
[saes32.c](saes32.c):
```C
uint32_t saes32(uint32_t rs1, uint32_t rs2, int fn);
```
The file [hdl/saes32.v](hdl/saes32.v) contains Verilog combinatorial
logic for the instruction that can be used in a RISC-V core.
```verilog
module saes32(
	output	[31:0]	rd,		// output register (wire!)
	input	[31:0]	rs1,	// input register 1
	input	[31:0]	rs2,	// input register 2
	input	[4:0]	fn		// 5-bit function specifier
);
```

The `fn` immediate "constant" is currently 5 bits, covering encryption,
decryption, and key schedule for both algorithms. Bits `fn[1:0]` specify
the input byte and output rotation, while `fn[4:2]` specify the operation.
Appropriate pseudo-instruction names for the code points can be proposed;
the current identifiers defined in [saes32.h](saes32.h) are:

| **Identifier**    | **fn[4:2]** | **Description or Use**                   |
|-------------------|:-----------:|------------------------------------------|
| `SAES32_ENCSM`    | 0           | AES Encrypt main body with *MixColumns*. |
| `SAES32_ENCS`     | 1           | AES Encrypt final round / Key Schedule.  |
| `SAES32_DECSM`    | 2           | AES Decrypt main body with *MixColumns*. |
| `SAES32_DECS`     | 3           | AES Decrypt final round.                 |
| `SSM4_ED`         | 4           | SM4 Encrypt and Decrypt.                 |
| `SSM4_KS`         | 5           | SM4 Key Schedule.                        |
|                   | 6-7         | *Unused. 4x6=24 points currently used.*  |

For AES the instruction selects a byte from `rs2`, performs a single S-box
lookup (*SubBytes* or its inverse), evaluates a part of the MDS matrix
(*MixColumns*), rotates the result by a multiple of 8 bits (*ShiftRows*),
and exclusive-ors the result with `rs1` (*AddRoundKey*). Despite this
complex description, the hardware implementation of the instruction is
quite compact and the resulting software implementation is fast.

For SM4 the instruction has exactly the same data path with byte selection
and S-Box lookup, but different linear operations, depending on whether
encryption/decryption or key scheduling is being performed.
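
As a concrete illustration of the AES data path just described, the
following minimal sketch mirrors what the emulator computes for the
`SAES32_ENCSM` code points. The `aes_sbox[]` table name is an assumption
(a forward S-box table as in [sboxes.c](sboxes.c)); [saes32.c](saes32.c)
remains the authoritative emulator code.

```C
#include <stdint.h>

extern const uint8_t aes_sbox[256];		// forward S-box (assumed name)

static inline uint32_t aes_xtime(uint32_t x)	// multiply by 0x02 in GF(2^8)
{
	return (x << 1) ^ ((x & 0x80) ? 0x11B : 0x00);
}

// Sketch of the SAES32_ENCSM data path; not the reference emulator.
uint32_t saes32_encsm_sketch(uint32_t rs1, uint32_t rs2, int fn)
{
	uint32_t x, x2;
	int sh = 8 * (fn & 3);				// fn[1:0]: byte select / rotation

	x = (rs2 >> sh) & 0xFF;				// select one byte from rs2
	x = aes_sbox[x];					// SubBytes: single S-box lookup
	x2 = aes_xtime(x);					// MixColumns column (2,1,1,3):
	x = ((x ^ x2) << 24) | (x << 16) |	//   top byte 3*x, middle bytes x,
		(x << 8) | x2;					//   low byte 2*x
	if (sh != 0)						// ShiftRows: rotate by fn[1:0] * 8
		x = (x << sh) | (x >> (32 - sh));
	return rs1 ^ x;						// AddRoundKey: XOR with rs1
}
```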
107 | 108 | 109 | ## Galois/Counter Mode (GCM): AES-GCM with Bitmanip 110 | 111 | The Galois/Counter Mode (GCM) specified in 112 | [NIST SP 800-38D](https://doi.org/10.6028/NIST.SP.800-38D) is a prominent 113 | Authenticated Encryption with Associated Data (AEAD) mechanism. It is 114 | the only cipher mode mandated as "MUST" for all 115 | [TLS 1.3](https://www.rfc-editor.org/rfc/rfc8446.html) implementations. 116 | 117 | Here I'll briefly discuss implementation aspects 118 | of AES-GCM using the [bitmanip](https://github.com/riscv/riscv-bitmanip) 119 | (B) extension. Pseudocode for a relevant subset of instructions is contained 120 | in source file [bitmanip.c](bitmanip.c), with prototypes in 121 | [bitmanip.h](bitmanip.h). These are almost directly lifted from the current 122 | draft specification. The instructions relevant to GCM are the Carry-Less 123 | Multiply instructions `CMUL[H][W]` and also the Generalized Reverse `GREV[W]`. 124 | The `[W]` suffix indicates a 32-bit word size variant on RV64. 125 | 126 | The low-level functions that use these instructions are emulated by 127 | [gcm_rv32b_gfmul.c](gcm_rv32b_gfmul.c) and 128 | [gcm_rv64b_gfmul.c](gcm_rv64b_gfmul.c). 129 | Their correctness can be verified against the full AES-GCM test vectors 130 | contained in the framework. There may be further room for improvement -- I 131 | use such code to draft the final assembly implementations. 132 | 133 | An attempt has been made to pair `CMULH` immediately followed by `CMUL`, 134 | as is done with `MULH`/`MUL`, although there is less of a performance 135 | advantage in this case. 136 | 137 | 138 | #### Finite Field Arithmetic 139 | 140 | While message confidentiality in GCM is provided by a block cipher (AES) 141 | in counter mode (a CTR variant), authentication is based on a GHASH, a 142 | universal hash defined over the binary field GF(2128). 143 | Without custom instruction support GCM, just like AES itself, is either 144 | very slow or susceptible to cache timing attacks. 145 | 146 | Whether or not authenticating ciphertext or associated data, the main 147 | operation of GCM is the GHASH multiplication between a block of 148 | authentication data and a secret generator "H". The addition in the 149 | field is trivial; just two or four XORs, depending on whether RV32 or RV64 150 | implementation is used. 151 | 152 | The finite field is defined to be the ring of binary polynomials modulo 153 | the primitive pentanomial 154 | R(x) = x128 + x7 + x2 + x + 1. 155 | The field encoding is slightly unusual, with the multiplicative identity 156 | (i.e. one -- "1") being encoded as a byte sequence `0x80, 0x00, .., 0x00`. 157 | Converting to little-endian encoding involves inverting bits in each byte; 158 | the `GREV[W]` instruction with constant 7 (pseudo-instruction `rev`) 159 | accomplishes this. 160 | 161 | The multiplication itself can be asymptotically sped up with the Karatsuba 162 | method, which works even better in binary fields than it does with integers. 163 | This reduces the number of `CMUL`/`CMULH` pairs on RV64 from 4 to 3 and 164 | the on RV32 from 16 to 9, with the cost of many XORs. 165 | 166 | 167 | #### Reduction via Shifts or via Multiplication 168 | 169 | The second arithmetic step to consider is the polynomial reduction of the 170 | 255-bit ring product down to 128 bits (the field) again. The best way of 171 | doing reduction depends on *how fast* the carry-less multiplication 172 | instructions `CMUL[H][W]` are in relation to shifts and XORs. 
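
To make the Karatsuba structure above concrete, here is a minimal sketch
of the 128 x 128 -> 255-bit carry-less multiplication on RV64B. The
`rv64b_clmul`/`rv64b_clmulh` helper names are assumptions of this sketch
(emulation functions in the spirit of [bitmanip.c](bitmanip.c)); the
reduction step is intentionally left to the caller.

```C
#include <stdint.h>

uint64_t rv64b_clmul(uint64_t a, uint64_t b);	// low 64 bits (assumed)
uint64_t rv64b_clmulh(uint64_t a, uint64_t b);	// high 64 bits (assumed)

// z[0..3] = x * y as a 255-bit carry-less product, little-endian limbs.
// Karatsuba: 3 CLMUL/CLMULH pairs instead of 4, at the cost of XORs.
void ghash_mul128_karatsuba(uint64_t z[4],
							const uint64_t x[2], const uint64_t y[2])
{
	uint64_t lo0, lo1, hi0, hi1, mi0, mi1, tx, ty;

	lo0 = rv64b_clmul(x[0], y[0]);		// low  half: x0 * y0
	lo1 = rv64b_clmulh(x[0], y[0]);
	hi0 = rv64b_clmul(x[1], y[1]);		// high half: x1 * y1
	hi1 = rv64b_clmulh(x[1], y[1]);
	tx = x[0] ^ x[1];					// middle term:
	ty = y[0] ^ y[1];					// (x0^x1)*(y0^y1) ^ lo ^ hi
	mi0 = rv64b_clmul(tx, ty) ^ lo0 ^ hi0;
	mi1 = rv64b_clmulh(tx, ty) ^ lo1 ^ hi1;

	z[0] = lo0;							// assemble the 255-bit result
	z[1] = lo1 ^ mi0;
	z[2] = hi0 ^ mi1;
	z[3] = hi1;
}
```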
173 | 174 | I'll call these *shift reduction* (based on the low Hamming weight of the 175 | polynomial R) and *multiplication reduction* (which is analogous to 176 | Montgomery and Barrett methods -- albeit simpler because we're working 177 | in characteristic 2.) 178 | 179 | 180 | #### Estimating the Fastest Method 181 | 182 | Examining the multiplication implementations in 183 | [gcm_rv32b_gfmul.c](gcm_rv32b_gfmul.c) and 184 | [gcm_rv64b_gfmul.c](gcm_rv64b_gfmul.c) we obtain the following 185 | arithmetic counts: 186 | 187 | | **Arch** | **Karatsuba** | **Reduce** | `GREV` | `XOR` | `S[L/R]L` | `CLMUL` | `CLMULH` | 188 | |:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:| 189 | | RV32B | no | mul | 4 | 36 | 0 | 20 | 20 | 190 | | RV32B | no | shift | 4 | 56 | 24 | 16 | 16 | 191 | | RV32B | yes | mul | 4 | 52 | 0 | 13 | 13 | 192 | | RV32B | yes | shift | 4 | 72 | 24 | 9 | 9 | 193 | | RV64B | no | mul | 2 | 10 | 0 | 6 | 6 | 194 | | RV64B | no | shift | 2 | 20 | 12 | 4 | 4 | 195 | | RV64B | yes | mul | 2 | 14 | 0 | 5 | 5 | 196 | | RV64B | yes | shift | 2 | 24 | 12 | 3 | 3 | 197 | 198 | 199 | We can see that the best selection of algorithms depends on the relative 200 | cost of multiplication. Assuming that other instructions have unit cost 201 | and ignoring loops etc, we have: 202 | 203 | | **Arch** | **Karatsuba** | **Reduce** | **MUL=1** | **MUL=2** | **MUL=3** | **MUL=6** | 204 | |:-----:|:-----:|:-----:|:---------:|:---------:|:---------:|:---------:| 205 | | RV32B | no | mul | **80** | 120 | 160 | 280 | 206 | | RV32B | no | shift | 116 | 148 | 180 | 276 | 207 | | RV32B | yes | mul | 82 | **108** | **134** | 212 | 208 | | RV32B | yes | shift | 118 | 136 | 154 | **208** | 209 | | RV64B | no | mul | **24** | **36** | 48 | 84 | 210 | | RV64B | no | shift | 42 | 50 | 58 | 82 | 211 | | RV64B | yes | mul | 26 | **36** | **46** | 76 | 212 | | RV64B | yes | shift | 44 | 50 | 56 | **74** | 213 | 214 | We see that if `CLMUL[H][W]` takes twice the time of XOR and shifts, 215 | or more, then Karatsuba is worthwhile. If these multiplication instructions 216 | are six times slower, or more, then it is worthwhile to convert the reduction multiplications to shifts and XORs. 217 | 218 | 219 | ## AES Notes 220 | 221 | * AES code density is 16 instructions per round (+ round key fetch), despite 222 | only requiring a single S-box in hardware. The initial 223 | [RISC-V Crypto proposal](https://github.com/scarv/riscv-crypto) 224 | (Section 4.4, "Lightweight AES Acceleration") contains an instruction for 225 | four parallel S-Box lookups. Without additional helper instructions, this 226 | will result in a slower round function. Furthermore, the circuit size is 227 | dominated by the S-Box, so the hardware size of this proposal is lower. 228 | * In addition to being 500+% faster than plain software implementation 229 | (depending on table lookup speed), the most important feature of this 230 | implementation is that it is constant time and resistant to 231 | [Cache-timing attacks on AES](http://cr.yp.to/antiforgery/cachetiming-20050414.pdf). 232 | Constant-time implementations of AES are possible in pure software but 233 | are exceedingly slow. 234 | * The instructions also support the key schedule; it is possible to compute 235 | the round keys "on the fly" without committing them to RAM. This may be 236 | helpful in some types of security applications. 
*	Many applications do not actually require the AES inverse function;
	even full TLS implementations may be implemented without it, since
	the AES-GCM mode is based on CTR -- essentially a stream cipher.
*	Mathematically the AES computation is organized as in the well-known
	"T-Tables" technique, which is more than 20 years old in the context of
	AES. If there are patents for this specific way of organizing the
	computation, they are likely to have expired.
	Other approaches have been considered
	[in the literature](https://iacr.org/archive/ches2006/22/22.pdf).
*	In a hardware implementation the AES S-Box and its inverse share much of
	their circuitry. For an example of gate-optimized logic for this purpose,
	see e.g. [Boyar and Peralta](https://eprint.iacr.org/2011/332.pdf).
	We've expanded this to SM4, as can be seen in the reference combinatorial
	logic in [hdl/sboxes.v](hdl/sboxes.v).
*	The SM4 S-Box is mathematically very close to the AES S-Box, as both are
	based on finite field inversion in GF(256). This property also makes the
	inverse S-Box required by AES self-similar to the forward S-Box. Even
	though AES and SM4 use different polynomial bases, the finite fields are
	affine equivalent, so much of the circuitry of the three S-Boxes is
	shared. SM4 does not need an inverse S-Box for decryption.


### Testing

Only a C compiler is required to test; RISC-V instruction counts can be
seen from the source code. A [Makefile](Makefile) is provided, and the file
[test_main.c](test_main.c) contains a minimal unit test with some standard
test vectors.

```console
$ make
gcc -c test_main.c -o test_main.o
[..]
gcc -o xtest aes_enc.o sm4....
$ ./xtest
< .. TEST TEST TEST .. >
[INFO] === AES using SAES32 ===
[PASS] AES-128 Enc 69C4E0D86A7B0430D8CDB78070B4C55A
[PASS] AES-128 Dec 00112233445566778899AABBCCDDEEFF
[PASS] AES-192 Enc DDA97CA4864CDFE06EAF70A0EC0D7191
[PASS] AES-192 Dec 00112233445566778899AABBCCDDEEFF
[PASS] AES-256 Enc 8EA2B7CA516745BFEAFC49904B496089
[PASS] AES-256 Dec 00112233445566778899AABBCCDDEEFF
[PASS] AES-128 Enc 3AD77BB40D7A3660A89ECAF32466EF97
[PASS] AES-128 Dec 6BC1BEE22E409F96E93D7E117393172A
[PASS] AES-192 Enc 974104846D0AD3AD7734ECB3ECEE4EEF
[PASS] AES-192 Dec AE2D8A571E03AC9C9EB76FAC45AF8E51
[PASS] AES-256 Enc B6ED21B99CA6F4F9F153E7B1BEAFED1D
[PASS] AES-256 Dec 30C81C46A35CE411E5FBC1191A0A52EF
< .. GCM tests, SAES64 tests, etc here .. >
[PASS] all tests passed.
$
```

**Disclaimer and Status**

*	[PQShield](https://pqshield.com) offers no warranty or specific claims
	of standards compliance, nor does it endorse this proposal above other
	proposals. PQShield may or may not implement AES and SM4 according to
	this proposal in the future.
*	Despite being proposed in a personal capacity, this proposal
	constitutes a "contribution" as defined in Section 1.4 of the
	RISC-V foundation membership agreement.

Cheers,
-	markku

--------------------------------------------------------------------------------
/aes_otf_saes64.c:
--------------------------------------------------------------------------------
// aes_otf_saes64.c
// 2020-05-06  Markku-Juhani O. Saarinen
// Copyright (c) 2020, PQShield Ltd.
All rights reserved. 4 | 5 | // AES Encryption with on-the-fly key expansion 6 | 7 | #include 8 | 9 | #include "aes_wrap.h" 10 | #include "saes64.h" 11 | 12 | // === AES-128 round with on-the-fly key schedule === 13 | 14 | // 2 x SAES64.ENCS[M], 1 x SAES64.KS1, 2 x SAES64.KS2, 2 x XOR 15 | 16 | #define SAES64_OTF128A(i) { \ 17 | u0 = saes64_encsm(t0, t1); \ 18 | u1 = saes64_encsm(t1, t0); \ 19 | ks = saes64_ks1(k1, i); \ 20 | k0 = saes64_ks2(ks, k0); \ 21 | k1 = saes64_ks2(k0, k1); \ 22 | u0 = u0 ^ k0; \ 23 | u1 = u1 ^ k1; } 24 | 25 | #define SAES64_OTF128B(i) { \ 26 | t0 = saes64_encsm(u0, u1); \ 27 | t1 = saes64_encsm(u1, u0); \ 28 | ks = saes64_ks1(k1, i); \ 29 | k0 = saes64_ks2(ks, k0); \ 30 | k1 = saes64_ks2(k0, k1); \ 31 | t0 = t0 ^ k0; \ 32 | t1 = t1 ^ k1; } 33 | 34 | void aes128_enc_otf_saes64(uint8_t ct[16], const uint8_t pt[16], 35 | const uint32_t * rk) 36 | { 37 | uint64_t t0, t1, u0, u1, k0, k1, ks; 38 | 39 | k0 = ((const uint64_t *) rk)[0]; // load key 40 | k1 = ((const uint64_t *) rk)[1]; 41 | 42 | t0 = ((const uint64_t *) pt)[0]; // get plaintext 43 | t1 = ((const uint64_t *) pt)[1]; 44 | 45 | t0 = t0 ^ k0; 46 | t1 = t1 ^ k1; 47 | 48 | SAES64_OTF128A(0); // first round 49 | SAES64_OTF128B(1); // # 2 50 | SAES64_OTF128A(2); // # 3 51 | SAES64_OTF128B(3); // # 4 52 | SAES64_OTF128A(4); // # 5 53 | SAES64_OTF128B(5); // # 6 54 | SAES64_OTF128A(6); // # 7 55 | SAES64_OTF128B(7); // # 8 56 | SAES64_OTF128A(8); // # 9 57 | t0 = saes64_encs(u0, u1); // last round 58 | t1 = saes64_encs(u1, u0); 59 | ks = saes64_ks1(k1, 9); 60 | k0 = saes64_ks2(ks, k0); 61 | k1 = saes64_ks2(k0, k1); 62 | t0 = t0 ^ k0; 63 | t1 = t1 ^ k1; 64 | 65 | ((uint64_t *) ct)[0] = t0; // store ciphertext 66 | ((uint64_t *) ct)[1] = t1; 67 | } 68 | 69 | // === AES-192 round with on-the-fly key schedule === 70 | 71 | // 3 rounds has: 2 x SAES64.KS1, 6 x SAES64.KS2, 6 x AES64.ENCSM, 6 x XOR 72 | 73 | #define SAES64_OTF192K(i) { \ 74 | ks = saes64_ks1(k2, i); \ 75 | k0 = saes64_ks2(ks, k0); \ 76 | k1 = saes64_ks2(k0, k1); \ 77 | k2 = saes64_ks2(k1, k2); } 78 | 79 | #define SAES64_OTF192A { \ 80 | t0 = t0 ^ k0; \ 81 | t1 = t1 ^ k1; \ 82 | u0 = saes64_encsm(t0, t1); \ 83 | u1 = saes64_encsm(t1, t0); } 84 | 85 | #define SAES64_OTF192B(i) { \ 86 | u0 = u0 ^ k2; \ 87 | SAES64_OTF192K(i); \ 88 | u1 = u1 ^ k0; \ 89 | v0 = saes64_encsm(u0, u1); \ 90 | v1 = saes64_encsm(u1, u0); } 91 | 92 | #define SAES64_OTF192C(i) { \ 93 | v0 = v0 ^ k1; \ 94 | v1 = v1 ^ k2; \ 95 | SAES64_OTF192K(i); \ 96 | t0 = saes64_encsm(v0, v1); \ 97 | t1 = saes64_encsm(v1, v0); } 98 | 99 | void aes192_enc_otf_saes64(uint8_t ct[16], const uint8_t pt[16], 100 | const uint32_t * rk) 101 | { 102 | uint64_t t0, t1, u0, u1, v0, v1, k0, k1, k2, ks; 103 | 104 | k0 = ((const uint64_t *) rk)[0]; // load key 105 | k1 = ((const uint64_t *) rk)[1]; 106 | k2 = ((const uint64_t *) rk)[2]; 107 | 108 | t0 = ((const uint64_t *) pt)[0]; // get plaintext 109 | t1 = ((const uint64_t *) pt)[1]; 110 | 111 | SAES64_OTF192A; // first round 112 | SAES64_OTF192B(0); // # 2 113 | SAES64_OTF192C(1); // # 3 114 | SAES64_OTF192A; // # 4 115 | SAES64_OTF192B(2); // # 5 116 | SAES64_OTF192C(3); // # 6 117 | SAES64_OTF192A; // # 7 118 | SAES64_OTF192B(4); // # 8 119 | SAES64_OTF192C(5); // # 9 120 | SAES64_OTF192A; // # 10 121 | SAES64_OTF192B(6); // # 11 122 | 123 | v0 = v0 ^ k1; // last round 124 | v1 = v1 ^ k2; 125 | ks = saes64_ks1(k2, 7); // different because .. 126 | k0 = saes64_ks2(ks, k0); 127 | k1 = saes64_ks2(k0, k1); // .. 
no need to compute k2 128 | t0 = saes64_encs(v0, v1); // different function 129 | t1 = saes64_encs(v1, v0); 130 | t0 = t0 ^ k0; // final AddRoundKey 131 | t1 = t1 ^ k1; 132 | 133 | ((uint64_t *) ct)[0] = t0; // store ciphertext 134 | ((uint64_t *) ct)[1] = t1; 135 | } 136 | 137 | 138 | // === AES-256 round with on-the-fly key schedule === 139 | 140 | // 2 x saes64_encs[m], 1 x saes64_KS1, 2 x SAES64.KS2, 2 x XOR 141 | 142 | #define SAES64_OTF256A(i) { \ 143 | u0 = saes64_encsm(t0, t1); \ 144 | u1 = saes64_encsm(t1, t0); \ 145 | ks = saes64_ks1(k3, i); \ 146 | k0 = saes64_ks2(ks, k0); \ 147 | k1 = saes64_ks2(k0, k1); \ 148 | u0 = u0 ^ k2; \ 149 | u1 = u1 ^ k3; } 150 | 151 | #define SAES64_OTF256B(i) { \ 152 | t0 = saes64_encsm(u0, u1); \ 153 | t1 = saes64_encsm(u1, u0); \ 154 | ks = saes64_ks1(k1, i); \ 155 | k2 = saes64_ks2(ks, k2); \ 156 | k3 = saes64_ks2(k2, k3); \ 157 | t0 = t0 ^ k0; \ 158 | t1 = t1 ^ k1; } 159 | 160 | 161 | void aes256_enc_otf_saes64(uint8_t ct[16], const uint8_t pt[16], 162 | const uint32_t * rk) 163 | { 164 | uint64_t t0, t1, u0, u1, k0, k1, k2, k3, ks; 165 | 166 | k0 = ((const uint64_t *) rk)[0]; // load key 167 | k1 = ((const uint64_t *) rk)[1]; 168 | k2 = ((const uint64_t *) rk)[2]; 169 | k3 = ((const uint64_t *) rk)[3]; 170 | 171 | t0 = ((const uint64_t *) pt)[0]; // get plaintext 172 | t1 = ((const uint64_t *) pt)[1]; 173 | 174 | t0 = t0 ^ k0; 175 | t1 = t1 ^ k1; 176 | 177 | SAES64_OTF256A(0); // first round 178 | SAES64_OTF256B(10); // # 2 179 | SAES64_OTF256A(1); // # 3 180 | SAES64_OTF256B(10); // # 4 181 | SAES64_OTF256A(2); // # 5 182 | SAES64_OTF256B(10); // # 6 183 | SAES64_OTF256A(3); // # 7 184 | SAES64_OTF256B(10); // # 8 185 | SAES64_OTF256A(4); // # 9 186 | SAES64_OTF256B(10); // # 10 187 | SAES64_OTF256A(5); // # 11 188 | SAES64_OTF256B(10); // # 12 189 | SAES64_OTF256A(6); // # 13 190 | t0 = saes64_encs(u0, u1); // last round 191 | t1 = saes64_encs(u1, u0); 192 | t0 = t0 ^ k0; 193 | t1 = t1 ^ k1; 194 | 195 | ((uint64_t *) ct)[0] = t0; // store ciphertext 196 | ((uint64_t *) ct)[1] = t1; 197 | } 198 | -------------------------------------------------------------------------------- /aes_otf_saes64.h: -------------------------------------------------------------------------------- 1 | // aes_otf_saes64.h 2 | // 2020-05-06 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // AES Encryption with on-the-fly key expansion. 6 | // *rk can point to expanded key or just the key. 7 | 8 | #ifndef _AES_OTF_SAES64_H_ 9 | #define _AES_OTF_SAES64_H_ 10 | 11 | #include 12 | 13 | void aes128_enc_otf_saes64(uint8_t ct[16], const uint8_t pt[16], 14 | const uint32_t * rk); 15 | 16 | void aes192_enc_otf_saes64(uint8_t ct[16], const uint8_t pt[16], 17 | const uint32_t * rk); 18 | 19 | void aes256_enc_otf_saes64(uint8_t ct[16], const uint8_t pt[16], 20 | const uint32_t * rk); 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /aes_saes32.c: -------------------------------------------------------------------------------- 1 | // aes_saes32.c 2 | // 2020-01-22 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // "Running pseudocode" for full AES-128/192/256 encryption and decryption 6 | // using SAES32.xxx instructions. 7 | 8 | #include 9 | 10 | #include "saes32.h" 11 | #include "aes_wrap.h" 12 | #include "bitmanip.h" 13 | #include "rv_endian.h" 14 | #include "sboxes.h" 15 | 16 | // === ENCRYPTION === 17 | 18 | // Encrypt rounds. 
Implements AES-128/192/256 depending on nr = {10,12,14} 19 | 20 | void aes_enc_rounds_saes32(uint8_t ct[16], const uint8_t pt[16], 21 | const uint32_t rk[], int nr) 22 | { 23 | uint32_t t0, t1, t2, t3; // even round state registers 24 | uint32_t u0, u1, u2, u3; // odd round state registers 25 | const uint32_t *kp = &rk[4 * nr]; // key pointer as loop condition 26 | 27 | t0 = rk[0]; // fetch even subkey 28 | t1 = rk[1]; 29 | t2 = rk[2]; 30 | t3 = rk[3]; 31 | 32 | t0 ^= get32u_le(pt); // xor with plaintext block 33 | t1 ^= get32u_le(pt + 4); 34 | t2 ^= get32u_le(pt + 8); 35 | t3 ^= get32u_le(pt + 12); 36 | 37 | while (1) { // double round 38 | 39 | u0 = rk[4]; // fetch odd subkey 40 | u1 = rk[5]; 41 | u2 = rk[6]; 42 | u3 = rk[7]; 43 | 44 | u0 = saes32_encsm(u0, t0, 0); // AES round, 16 instructions 45 | u0 = saes32_encsm(u0, t1, 1); 46 | u0 = saes32_encsm(u0, t2, 2); 47 | u0 = saes32_encsm(u0, t3, 3); 48 | 49 | u1 = saes32_encsm(u1, t1, 0); 50 | u1 = saes32_encsm(u1, t2, 1); 51 | u1 = saes32_encsm(u1, t3, 2); 52 | u1 = saes32_encsm(u1, t0, 3); 53 | 54 | u2 = saes32_encsm(u2, t2, 0); 55 | u2 = saes32_encsm(u2, t3, 1); 56 | u2 = saes32_encsm(u2, t0, 2); 57 | u2 = saes32_encsm(u2, t1, 3); 58 | 59 | u3 = saes32_encsm(u3, t3, 0); 60 | u3 = saes32_encsm(u3, t0, 1); 61 | u3 = saes32_encsm(u3, t1, 2); 62 | u3 = saes32_encsm(u3, t2, 3); 63 | 64 | t0 = rk[8]; // fetch even subkey 65 | t1 = rk[9]; 66 | t2 = rk[10]; 67 | t3 = rk[11]; 68 | 69 | rk += 8; // step key pointer 70 | if (rk == kp) // final round ? 71 | break; 72 | 73 | t0 = saes32_encsm(t0, u0, 0); // AES round, 16 instructions 74 | t0 = saes32_encsm(t0, u1, 1); 75 | t0 = saes32_encsm(t0, u2, 2); 76 | t0 = saes32_encsm(t0, u3, 3); 77 | 78 | t1 = saes32_encsm(t1, u1, 0); 79 | t1 = saes32_encsm(t1, u2, 1); 80 | t1 = saes32_encsm(t1, u3, 2); 81 | t1 = saes32_encsm(t1, u0, 3); 82 | 83 | t2 = saes32_encsm(t2, u2, 0); 84 | t2 = saes32_encsm(t2, u3, 1); 85 | t2 = saes32_encsm(t2, u0, 2); 86 | t2 = saes32_encsm(t2, u1, 3); 87 | 88 | t3 = saes32_encsm(t3, u3, 0); 89 | t3 = saes32_encsm(t3, u0, 1); 90 | t3 = saes32_encsm(t3, u1, 2); 91 | t3 = saes32_encsm(t3, u2, 3); 92 | } 93 | 94 | t0 = saes32_encs(t0, u0, 0); // final round is different 95 | t0 = saes32_encs(t0, u1, 1); 96 | t0 = saes32_encs(t0, u2, 2); 97 | t0 = saes32_encs(t0, u3, 3); 98 | 99 | t1 = saes32_encs(t1, u1, 0); 100 | t1 = saes32_encs(t1, u2, 1); 101 | t1 = saes32_encs(t1, u3, 2); 102 | t1 = saes32_encs(t1, u0, 3); 103 | 104 | t2 = saes32_encs(t2, u2, 0); 105 | t2 = saes32_encs(t2, u3, 1); 106 | t2 = saes32_encs(t2, u0, 2); 107 | t2 = saes32_encs(t2, u1, 3); 108 | 109 | t3 = saes32_encs(t3, u3, 0); 110 | t3 = saes32_encs(t3, u0, 1); 111 | t3 = saes32_encs(t3, u1, 2); 112 | t3 = saes32_encs(t3, u2, 3); 113 | 114 | put32u_le(ct, t0); // write ciphertext block 115 | put32u_le(ct + 4, t1); 116 | put32u_le(ct + 8, t2); 117 | put32u_le(ct + 12, t3); 118 | } 119 | 120 | // Wrappers 121 | 122 | void aes128_enc_ecb_saes32(uint8_t ct[16], const uint8_t pt[16], 123 | const uint32_t rk[AES128_RK_WORDS]) 124 | { 125 | aes_enc_rounds_saes32(ct, pt, rk, AES128_ROUNDS); 126 | } 127 | 128 | void aes192_enc_ecb_saes32(uint8_t ct[16], const uint8_t pt[16], 129 | const uint32_t rk[AES192_RK_WORDS]) 130 | { 131 | aes_enc_rounds_saes32(ct, pt, rk, AES192_ROUNDS); 132 | } 133 | 134 | void aes256_enc_ecb_saes32(uint8_t ct[16], const uint8_t pt[16], 135 | const uint32_t rk[AES256_RK_WORDS]) 136 | { 137 | aes_enc_rounds_saes32(ct, pt, rk, AES256_ROUNDS); 138 | } 139 | 140 | // Key schedule for AES-128 Encryption. 
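// (Note: the loop below follows the FIPS 197 recurrence
//    w[i] = w[i-4] ^ SubWord(RotWord(w[i-1])) ^ rcon[i/4]   when i % 4 == 0
//    w[i] = w[i-4] ^ w[i-1]                                 otherwise,
// with SubWord() evaluated as four SAES32_ENCS byte steps; RotWord()
// becomes an 8-bit rotate because the words are held little-endian.)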
141 | 142 | void aes128_enc_key_saes32(uint32_t rk[44], const uint8_t key[16]) 143 | { 144 | uint32_t t0, t1, t2, t3, tr; // subkey registers 145 | const uint32_t *rke = &rk[44 - 4]; // end pointer 146 | const uint8_t *rc = aes_rcon; // round constants 147 | 148 | t0 = get32u_le(key); // load secret key 149 | t1 = get32u_le(key + 4); 150 | t2 = get32u_le(key + 8); 151 | t3 = get32u_le(key + 12); 152 | 153 | while (1) { 154 | 155 | rk[0] = t0; // store subkey 156 | rk[1] = t1; 157 | rk[2] = t2; 158 | rk[3] = t3; 159 | 160 | if (rk == rke) // end condition 161 | return; 162 | rk += 4; // step pointer by one subkey 163 | 164 | t0 ^= (uint32_t) * rc++; // round constant 165 | tr = rv32b_ror(t3, 8); // rotate 8 bits (little endian!) 166 | t0 = saes32_encs(t0, tr, 0); // SubWord() 167 | t0 = saes32_encs(t0, tr, 1); 168 | t0 = saes32_encs(t0, tr, 2); 169 | t0 = saes32_encs(t0, tr, 3); 170 | t1 ^= t0; 171 | t2 ^= t1; 172 | t3 ^= t2; 173 | } 174 | } 175 | 176 | // Key schedule for AES-192 encryption. 177 | 178 | void aes192_enc_key_saes32(uint32_t rk[52], const uint8_t key[24]) 179 | { 180 | uint32_t t0, t1, t2, t3, t4, t5, tr; // subkey registers 181 | const uint32_t *rke = &rk[52 - 4]; // end pointer 182 | const uint8_t *rc = aes_rcon; // round constants 183 | 184 | t0 = get32u_le(key); // load secret key 185 | t1 = get32u_le(key + 4); 186 | t2 = get32u_le(key + 8); 187 | t3 = get32u_le(key + 12); 188 | t4 = get32u_le(key + 16); 189 | t5 = get32u_le(key + 20); 190 | 191 | while (1) { 192 | 193 | rk[0] = t0; // store subkey (or part) 194 | rk[1] = t1; 195 | rk[2] = t2; 196 | rk[3] = t3; 197 | if (rk == rke) // end condition 198 | return; 199 | rk[4] = t4; 200 | rk[5] = t5; 201 | rk += 6; // step pointer by 1.5 subkeys 202 | 203 | t0 ^= (uint32_t) * rc++; // round constant 204 | tr = rv32b_ror(t5, 8); // rotate 8 bits (little endian!) 205 | t0 = saes32_encs(t0, tr, 0); // SubWord() 206 | t0 = saes32_encs(t0, tr, 1); 207 | t0 = saes32_encs(t0, tr, 2); 208 | t0 = saes32_encs(t0, tr, 3); 209 | 210 | t1 ^= t0; 211 | t2 ^= t1; 212 | t3 ^= t2; 213 | t4 ^= t3; 214 | t5 ^= t4; 215 | } 216 | } 217 | 218 | // Key schedule for AES-256 encryption. 219 | 220 | void aes256_enc_key_saes32(uint32_t rk[60], const uint8_t key[32]) 221 | { 222 | uint32_t t0, t1, t2, t3, t4, t5, t6, t7, tr; // subkey registers 223 | const uint32_t *rke = &rk[60 - 4]; // end pointer 224 | const uint8_t *rc = aes_rcon; // round constants 225 | 226 | t0 = get32u_le(key); 227 | t1 = get32u_le(key + 4); 228 | t2 = get32u_le(key + 8); 229 | t3 = get32u_le(key + 12); 230 | t4 = get32u_le(key + 16); 231 | t5 = get32u_le(key + 20); 232 | t6 = get32u_le(key + 24); 233 | t7 = get32u_le(key + 28); 234 | 235 | rk[0] = t0; // store first subkey 236 | rk[1] = t1; 237 | rk[2] = t2; 238 | rk[3] = t3; 239 | 240 | while (1) { 241 | 242 | rk[4] = t4; // store odd subkey 243 | rk[5] = t5; 244 | rk[6] = t6; 245 | rk[7] = t7; 246 | rk += 8; // step pointer by 2 subkeys 247 | 248 | t0 ^= (uint32_t) * rc++; // round constant 249 | tr = rv32b_ror(t7, 8); // rotate 8 bits (little endian!) 
250 | t0 = saes32_encs(t0, tr, 0); // SubWord() 251 | t0 = saes32_encs(t0, tr, 1); 252 | t0 = saes32_encs(t0, tr, 2); 253 | t0 = saes32_encs(t0, tr, 3); 254 | t1 ^= t0; 255 | t2 ^= t1; 256 | t3 ^= t2; 257 | 258 | rk[0] = t0; // store even subkey 259 | rk[1] = t1; 260 | rk[2] = t2; 261 | rk[3] = t3; 262 | if (rk == rke) // end condition 263 | return; 264 | 265 | t4 = saes32_encs(t4, t3, 0); // SubWord() - NO rotation 266 | t4 = saes32_encs(t4, t3, 1); 267 | t4 = saes32_encs(t4, t3, 2); 268 | t4 = saes32_encs(t4, t3, 3); 269 | t5 ^= t4; 270 | t6 ^= t5; 271 | t7 ^= t6; 272 | } 273 | } 274 | 275 | // === DECRYPTION === 276 | 277 | // Decrypt rounds. Implements AES-128/192/256 depending on nr = {10,12,14} 278 | 279 | void aes_dec_rounds_saes32(uint8_t pt[16], const uint8_t ct[16], 280 | const uint32_t rk[], int nr) 281 | { 282 | uint32_t t0, t1, t2, t3; // even round state registers 283 | uint32_t u0, u1, u2, u3; // odd round state registers 284 | const uint32_t *kp = &rk[4 * nr]; // key pointer 285 | 286 | t0 = kp[0]; // fetch last subkey 287 | t1 = kp[1]; 288 | t2 = kp[2]; 289 | t3 = kp[3]; 290 | kp -= 8; 291 | 292 | t0 ^= get32u_le(ct); // xor with ciphertext block 293 | t1 ^= get32u_le(ct + 4); 294 | t2 ^= get32u_le(ct + 8); 295 | t3 ^= get32u_le(ct + 12); 296 | 297 | while (1) { 298 | u0 = kp[4]; // fetch odd subkey 299 | u1 = kp[5]; 300 | u2 = kp[6]; 301 | u3 = kp[7]; 302 | 303 | u0 = saes32_decsm(u0, t0, 0); // AES decryption round, 16 instr 304 | u0 = saes32_decsm(u0, t3, 1); 305 | u0 = saes32_decsm(u0, t2, 2); 306 | u0 = saes32_decsm(u0, t1, 3); 307 | 308 | u1 = saes32_decsm(u1, t1, 0); 309 | u1 = saes32_decsm(u1, t0, 1); 310 | u1 = saes32_decsm(u1, t3, 2); 311 | u1 = saes32_decsm(u1, t2, 3); 312 | 313 | u2 = saes32_decsm(u2, t2, 0); 314 | u2 = saes32_decsm(u2, t1, 1); 315 | u2 = saes32_decsm(u2, t0, 2); 316 | u2 = saes32_decsm(u2, t3, 3); 317 | 318 | u3 = saes32_decsm(u3, t3, 0); 319 | u3 = saes32_decsm(u3, t2, 1); 320 | u3 = saes32_decsm(u3, t1, 2); 321 | u3 = saes32_decsm(u3, t0, 3); 322 | 323 | t0 = kp[0]; // fetch even subkey 324 | t1 = kp[1]; 325 | t2 = kp[2]; 326 | t3 = kp[3]; 327 | 328 | if (kp == rk) // final round 329 | break; 330 | kp -= 8; 331 | 332 | t0 = saes32_decsm(t0, u0, 0); // AES decryption round, 16 instr 333 | t0 = saes32_decsm(t0, u3, 1); 334 | t0 = saes32_decsm(t0, u2, 2); 335 | t0 = saes32_decsm(t0, u1, 3); 336 | 337 | t1 = saes32_decsm(t1, u1, 0); 338 | t1 = saes32_decsm(t1, u0, 1); 339 | t1 = saes32_decsm(t1, u3, 2); 340 | t1 = saes32_decsm(t1, u2, 3); 341 | 342 | t2 = saes32_decsm(t2, u2, 0); 343 | t2 = saes32_decsm(t2, u1, 1); 344 | t2 = saes32_decsm(t2, u0, 2); 345 | t2 = saes32_decsm(t2, u3, 3); 346 | 347 | t3 = saes32_decsm(t3, u3, 0); 348 | t3 = saes32_decsm(t3, u2, 1); 349 | t3 = saes32_decsm(t3, u1, 2); 350 | t3 = saes32_decsm(t3, u0, 3); 351 | } 352 | 353 | t0 = saes32_decs(t0, u0, 0); // final decryption round, 16 ins. 
354 | t0 = saes32_decs(t0, u3, 1); 355 | t0 = saes32_decs(t0, u2, 2); 356 | t0 = saes32_decs(t0, u1, 3); 357 | 358 | t1 = saes32_decs(t1, u1, 0); 359 | t1 = saes32_decs(t1, u0, 1); 360 | t1 = saes32_decs(t1, u3, 2); 361 | t1 = saes32_decs(t1, u2, 3); 362 | 363 | t2 = saes32_decs(t2, u2, 0); 364 | t2 = saes32_decs(t2, u1, 1); 365 | t2 = saes32_decs(t2, u0, 2); 366 | t2 = saes32_decs(t2, u3, 3); 367 | 368 | t3 = saes32_decs(t3, u3, 0); 369 | t3 = saes32_decs(t3, u2, 1); 370 | t3 = saes32_decs(t3, u1, 2); 371 | t3 = saes32_decs(t3, u0, 3); 372 | 373 | put32u_le(pt, t0); // write plaintext block 374 | put32u_le(pt + 4, t1); 375 | put32u_le(pt + 8, t2); 376 | put32u_le(pt + 12, t3); 377 | } 378 | 379 | // Wrappers 380 | 381 | void aes128_dec_ecb_saes32(uint8_t pt[16], const uint8_t ct[16], 382 | const uint32_t rk[AES128_RK_WORDS]) 383 | { 384 | aes_dec_rounds_saes32(pt, ct, rk, AES128_ROUNDS); 385 | } 386 | 387 | void aes192_dec_ecb_saes32(uint8_t pt[16], const uint8_t ct[16], 388 | const uint32_t rk[AES192_RK_WORDS]) 389 | { 390 | aes_dec_rounds_saes32(pt, ct, rk, AES192_ROUNDS); 391 | } 392 | 393 | void aes256_dec_ecb_saes32(uint8_t pt[16], const uint8_t ct[16], 394 | const uint32_t rk[AES256_RK_WORDS]) 395 | { 396 | aes_dec_rounds_saes32(pt, ct, rk, AES256_ROUNDS); 397 | } 398 | 399 | // Helper: apply inverse mixcolumns to a vector 400 | 401 | void saes32_dec_invmc(uint32_t * v, size_t len) 402 | { 403 | size_t i; 404 | uint32_t x, y; 405 | 406 | for (i = 0; i < len; i++) { 407 | x = v[i]; 408 | 409 | y = saes32_encs(0, x, 0); // SubWord() 410 | y = saes32_encs(y, x, 1); 411 | y = saes32_encs(y, x, 2); 412 | y = saes32_encs(y, x, 3); 413 | 414 | x = saes32_decsm(0, y, 0); // Just want inv MixCol() 415 | x = saes32_decsm(x, y, 1); 416 | x = saes32_decsm(x, y, 2); 417 | x = saes32_decsm(x, y, 3); 418 | 419 | v[i] = x; 420 | } 421 | } 422 | 423 | // Key schedule for AES-128 decryption. 424 | 425 | void aes128_dec_key_saes32(uint32_t rk[44], const uint8_t key[16]) 426 | { 427 | // create an encryption key and modify middle rounds 428 | aes128_enc_key(rk, key); 429 | saes32_dec_invmc(rk + 4, AES128_RK_WORDS - 8); 430 | } 431 | 432 | // Key schedule for AES-192 decryption. 433 | 434 | void aes192_dec_key_saes32(uint32_t rk[52], const uint8_t key[24]) 435 | { 436 | // create an encryption key and modify middle rounds 437 | aes192_enc_key(rk, key); 438 | saes32_dec_invmc(rk + 4, AES192_RK_WORDS - 8); 439 | } 440 | 441 | // Key schedule for AES-256 decryption. 442 | 443 | void aes256_dec_key_saes32(uint32_t rk[60], const uint8_t key[32]) 444 | { 445 | // create an encryption key and modify middle rounds 446 | aes256_enc_key(rk, key); 447 | saes32_dec_invmc(rk + 4, AES256_RK_WORDS - 8); 448 | } 449 | -------------------------------------------------------------------------------- /aes_saes32.h: -------------------------------------------------------------------------------- 1 | // aes_saes32.h 2 | // 2020-05-05 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 
4 | 5 | // Implementation prototypes for aes_saes32.c 6 | 7 | #ifndef _AES_SAES32_H_ 8 | #define _AES_SAES32_H_ 9 | 10 | #include 11 | 12 | // Set encryption key 13 | 14 | void aes128_enc_key_saes32(uint32_t rk[AES128_RK_WORDS], 15 | const uint8_t key[16]); 16 | 17 | void aes192_enc_key_saes32(uint32_t rk[AES192_RK_WORDS], 18 | const uint8_t key[24]); 19 | 20 | void aes256_enc_key_saes32(uint32_t rk[AES256_RK_WORDS], 21 | const uint8_t key[32]); 22 | 23 | // Encrypt a block 24 | 25 | void aes128_enc_ecb_saes32(uint8_t ct[16], const uint8_t pt[16], 26 | const uint32_t rk[AES128_RK_WORDS]); 27 | 28 | void aes192_enc_ecb_saes32(uint8_t ct[16], const uint8_t pt[16], 29 | const uint32_t rk[AES192_RK_WORDS]); 30 | 31 | void aes256_enc_ecb_saes32(uint8_t ct[16], const uint8_t pt[16], 32 | const uint32_t rk[AES256_RK_WORDS]); 33 | 34 | 35 | // Set decryption key 36 | 37 | void aes128_dec_key_saes32(uint32_t rk[AES128_RK_WORDS], 38 | const uint8_t key[16]); 39 | 40 | void aes192_dec_key_saes32(uint32_t rk[AES192_RK_WORDS], 41 | const uint8_t key[24]); 42 | 43 | void aes256_dec_key_saes32(uint32_t rk[AES256_RK_WORDS], 44 | const uint8_t key[32]); 45 | 46 | // Decrypt a block 47 | 48 | void aes128_dec_ecb_saes32(uint8_t pt[16], const uint8_t ct[16], 49 | const uint32_t rk[AES128_RK_WORDS]); 50 | 51 | void aes192_dec_ecb_saes32(uint8_t pt[16], const uint8_t ct[16], 52 | const uint32_t rk[AES192_RK_WORDS]); 53 | 54 | void aes256_dec_ecb_saes32(uint8_t pt[16], const uint8_t ct[16], 55 | const uint32_t rk[AES256_RK_WORDS]); 56 | 57 | #endif // _AES_SAES32_H_ 58 | -------------------------------------------------------------------------------- /aes_saes64.c: -------------------------------------------------------------------------------- 1 | // aes_saes64.c 2 | // 2020-05-03 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // "Running pseudocode" for full AES-128/192/256 encryption and decryption 6 | // using SAES64.xxx instructions. 7 | 8 | #include 9 | 10 | #include "aes_wrap.h" 11 | #include "saes64.h" 12 | #include "rv_endian.h" 13 | 14 | // Encrypt rounds. 
Implements AES-128/192/256 depending on nr = {10,12,14} 15 | 16 | // Per round: 2 * ENCSM, 2 * load, 2 * XOR 17 | 18 | #define SAES64_ENC_ROUND(r0, r1, s0, s1, i) { \ 19 | r0 = saes64_encsm(s0, s1); \ 20 | r1 = saes64_encsm(s1, s0); \ 21 | k0 = kp[2 * i]; \ 22 | k1 = kp[2 * i + 1]; \ 23 | r0 = r0 ^ k0; \ 24 | r1 = r1 ^ k1; } 25 | 26 | void aes_enc_rounds_saes64(uint8_t ct[16], const uint8_t pt[16], 27 | const uint32_t rk[], int nr) 28 | { 29 | // key pointer 30 | const uint64_t *kp = (const uint64_t *) rk; 31 | 32 | uint64_t t0, t1, u0, u1, k0, k1; 33 | 34 | t0 = ((const uint64_t *) pt)[0]; // get plaintext 35 | t1 = ((const uint64_t *) pt)[1]; 36 | 37 | k0 = kp[0]; // load first round 38 | k1 = kp[1]; 39 | t0 = t0 ^ k0; 40 | t1 = t1 ^ k1; 41 | 42 | SAES64_ENC_ROUND(u0, u1, t0, t1, 1); // 6 insn / round 43 | SAES64_ENC_ROUND(t0, t1, u0, u1, 2); 44 | SAES64_ENC_ROUND(u0, u1, t0, t1, 3); 45 | SAES64_ENC_ROUND(t0, t1, u0, u1, 4); 46 | SAES64_ENC_ROUND(u0, u1, t0, t1, 5); 47 | SAES64_ENC_ROUND(t0, t1, u0, u1, 6); 48 | SAES64_ENC_ROUND(u0, u1, t0, t1, 7); 49 | SAES64_ENC_ROUND(t0, t1, u0, u1, 8); 50 | SAES64_ENC_ROUND(u0, u1, t0, t1, 9); 51 | 52 | // In reality we would entirely inline these for all 128/192/256 versions 53 | 54 | if (nr >= 12) { // AES-192, AES-256 55 | SAES64_ENC_ROUND(t0, t1, u0, u1, 10); 56 | SAES64_ENC_ROUND(u0, u1, t0, t1, 11); 57 | if (nr > 12) { 58 | SAES64_ENC_ROUND(t0, t1, u0, u1, 12); 59 | SAES64_ENC_ROUND(u0, u1, t0, t1, 13); 60 | k0 = kp[2 * 14]; // AES-256 last round key 61 | k1 = kp[2 * 14 + 1]; 62 | } else { 63 | k0 = kp[2 * 12]; // AES-192 last round key 64 | k1 = kp[2 * 12 + 1]; 65 | } 66 | } else { 67 | k0 = kp[2 * 10]; // AES-128 last round key 68 | k1 = kp[2 * 10 + 1]; 69 | } 70 | 71 | t0 = saes64_encs(u0, u1); // Final round; ENCS not ENCSM 72 | t1 = saes64_encs(u1, u0); 73 | t0 = t0 ^ k0; // last round key 74 | t1 = t1 ^ k1; 75 | 76 | ((uint64_t *) ct)[0] = t0; // store ciphertext 77 | ((uint64_t *) ct)[1] = t1; 78 | } 79 | 80 | // Wrappers 81 | 82 | void aes128_enc_ecb_saes64(uint8_t ct[16], const uint8_t pt[16], 83 | const uint32_t rk[AES128_RK_WORDS]) 84 | { 85 | aes_enc_rounds_saes64(ct, pt, rk, AES128_ROUNDS); 86 | } 87 | 88 | void aes192_enc_ecb_saes64(uint8_t ct[16], const uint8_t pt[16], 89 | const uint32_t rk[AES192_RK_WORDS]) 90 | { 91 | aes_enc_rounds_saes64(ct, pt, rk, AES192_ROUNDS); 92 | } 93 | 94 | void aes256_enc_ecb_saes64(uint8_t ct[16], const uint8_t pt[16], 95 | const uint32_t rk[AES256_RK_WORDS]) 96 | { 97 | aes_enc_rounds_saes64(ct, pt, rk, AES256_ROUNDS); 98 | } 99 | 100 | // Key schedule for AES-128 Encryption. 
101 | // For each round 1 * SAES64.KS1, 2 * SAES64.KS2 and 2 * store 102 | 103 | #define SAES64_KEY128_STEP(i) { \ 104 | kp[2 * i] = k0; \ 105 | kp[2 * i + 1] = k1; \ 106 | ks = saes64_ks1(k1, i); \ 107 | k0 = saes64_ks2(ks, k0); \ 108 | k1 = saes64_ks2(k0, k1); } 109 | 110 | void aes128_enc_key_saes64(uint32_t rk[44], const uint8_t key[16]) 111 | { 112 | uint64_t *kp = (uint64_t *) rk; // key pointer 113 | uint64_t k0, k1, ks; 114 | 115 | k0 = get64u_le(key); // load secret key 116 | k1 = get64u_le(key + 8); 117 | SAES64_KEY128_STEP(0); // 5 insn each, unrolled 118 | SAES64_KEY128_STEP(1); 119 | SAES64_KEY128_STEP(2); 120 | SAES64_KEY128_STEP(3); 121 | SAES64_KEY128_STEP(4); 122 | SAES64_KEY128_STEP(5); 123 | SAES64_KEY128_STEP(6); 124 | SAES64_KEY128_STEP(7); 125 | SAES64_KEY128_STEP(8); 126 | SAES64_KEY128_STEP(9); // (10 steps, 10 rounds) 127 | kp[20] = k0; // last round key 128 | kp[21] = k1; 129 | } 130 | 131 | // Key schedule for AES-192 encryption. 132 | // For each 1.5 rounds 1 * SAES64.KS1, 3 * SAES64.KS2 and 3 * store 133 | 134 | #define SAES64_KEY192_STEP(i) { \ 135 | kp[3 * i] = k0; \ 136 | kp[3 * i + 1] = k1; \ 137 | kp[3 * i + 2] = k2; \ 138 | ks = saes64_ks1(k2, i); \ 139 | k0 = saes64_ks2(ks, k0); \ 140 | k1 = saes64_ks2(k0, k1); \ 141 | k2 = saes64_ks2(k1, k2); } 142 | 143 | void aes192_enc_key_saes64(uint32_t rk[52], const uint8_t key[24]) 144 | { 145 | uint64_t *kp = (uint64_t *) rk; // key pointer 146 | uint64_t k0, k1, k2, ks; 147 | 148 | k0 = get64u_le(key); // load secret key 149 | k1 = get64u_le(key + 8); 150 | k2 = get64u_le(key + 16); 151 | SAES64_KEY192_STEP(0); // two steps is 3 rounds 152 | SAES64_KEY192_STEP(1); // 14/3 = 4.7 insn/round 153 | SAES64_KEY192_STEP(2); 154 | SAES64_KEY192_STEP(3); 155 | SAES64_KEY192_STEP(4); 156 | SAES64_KEY192_STEP(5); 157 | SAES64_KEY192_STEP(6); 158 | kp[21] = k0; // last full state 159 | kp[22] = k1; 160 | kp[23] = k2; 161 | ks = saes64_ks1(k2, 7); // (8 steps, 12 rounds) 162 | k0 = saes64_ks2(ks, k0); 163 | k1 = saes64_ks2(k0, k1); // no need for k2 164 | kp[24] = k0; // last round key 165 | kp[25] = k1; 166 | } 167 | 168 | // Key schedule for AES-256 encryption. 169 | // For each 2 rounds: 2 * SAES64.KS1, 4 * SAES64.KS2 and 4 * store 170 | 171 | #define SAES64_KEY256_STEP(i) { \ 172 | kp[4 * i] = k0; \ 173 | kp[4 * i + 1] = k1; \ 174 | kp[4 * i + 2] = k2; \ 175 | kp[4 * i + 3] = k3; \ 176 | ks = saes64_ks1(k3, i); \ 177 | k0 = saes64_ks2(ks, k0); \ 178 | k1 = saes64_ks2(k0, k1); \ 179 | ks = saes64_ks1(k1, 10); \ 180 | k2 = saes64_ks2(ks, k2); \ 181 | k3 = saes64_ks2(k2, k3); } 182 | 183 | void aes256_enc_key_saes64(uint32_t rk[60], const uint8_t key[32]) 184 | { 185 | uint64_t *kp = (uint64_t *) rk; // key pointer 186 | uint64_t k0, k1, k2, k3, ks; 187 | 188 | k0 = get64u_le(key); // load secret key 189 | k1 = get64u_le(key + 8); 190 | k2 = get64u_le(key + 16); 191 | k3 = get64u_le(key + 24); 192 | SAES64_KEY256_STEP(0); // 1 steps is 2 rounds 193 | SAES64_KEY256_STEP(1); // 10/2 = 5 insn/round 194 | SAES64_KEY256_STEP(2); 195 | SAES64_KEY256_STEP(3); 196 | SAES64_KEY256_STEP(4); 197 | SAES64_KEY256_STEP(5); 198 | kp[24] = k0; // store last full state 199 | kp[25] = k1; 200 | kp[26] = k2; 201 | kp[27] = k3; 202 | ks = saes64_ks1(k3, 6); // no need for k2, k3 203 | k0 = saes64_ks2(ks, k0); 204 | k1 = saes64_ks2(k0, k1); 205 | kp[28] = k0; // store last round key 206 | kp[29] = k1; 207 | } 208 | 209 | // Decrypt rounds. 
Implements AES-128/192/256 depending on nr = {10,12,14} 210 | 211 | // Per round: 2 * load, 2 * XOR, 2 * DECSM 212 | 213 | #define SAES64_DEC_ROUND(r0, r1, s0, s1, i) { \ 214 | k0 = kp[2 * i + 2]; \ 215 | k1 = kp[2 * i + 3]; \ 216 | s0 = s0 ^ k0; \ 217 | s1 = s1 ^ k1; \ 218 | r0 = saes64_decsm(s0, s1); \ 219 | r1 = saes64_decsm(s1, s0); } 220 | 221 | 222 | void aes_dec_rounds_saes64(uint8_t pt[16], const uint8_t ct[16], 223 | const uint32_t rk[], int nr) 224 | { 225 | // key pointer (just a cast) 226 | const uint64_t *kp = (const uint64_t *) rk; 227 | 228 | uint64_t t0, t1, u0, u1, k0, k1; 229 | 230 | t0 = ((const uint64_t *) ct)[0]; // get ciphertext 231 | t1 = ((const uint64_t *) ct)[1]; 232 | 233 | // In reality we would entirely inline these for all 128/192/256 versions 234 | 235 | if (nr >= 12) { 236 | if (nr > 12) { // AES-256 237 | SAES64_DEC_ROUND(u0, u1, t0, t1, 13); 238 | SAES64_DEC_ROUND(t0, t1, u0, u1, 12); 239 | } // AES-192, AES-192 240 | SAES64_DEC_ROUND(u0, u1, t0, t1, 11); 241 | SAES64_DEC_ROUND(t0, t1, u0, u1, 10); 242 | } 243 | 244 | SAES64_DEC_ROUND(u0, u1, t0, t1, 9); // 6 insn / round 245 | SAES64_DEC_ROUND(t0, t1, u0, u1, 8); 246 | SAES64_DEC_ROUND(u0, u1, t0, t1, 7); 247 | SAES64_DEC_ROUND(t0, t1, u0, u1, 6); 248 | SAES64_DEC_ROUND(u0, u1, t0, t1, 5); 249 | SAES64_DEC_ROUND(t0, t1, u0, u1, 4); 250 | SAES64_DEC_ROUND(u0, u1, t0, t1, 3); 251 | SAES64_DEC_ROUND(t0, t1, u0, u1, 2); 252 | SAES64_DEC_ROUND(u0, u1, t0, t1, 1); 253 | 254 | k0 = kp[2]; // final decrypt round 255 | k1 = kp[3]; 256 | u0 = u0 ^ k0; 257 | u1 = u1 ^ k1; 258 | t0 = saes64_decs(u0, u1); // DECS instead of DECSM 259 | t1 = saes64_decs(u1, u0); 260 | k0 = kp[0]; // first round key 261 | k1 = kp[1]; 262 | t0 = t0 ^ k0; 263 | t1 = t1 ^ k1; 264 | 265 | ((uint64_t *) pt)[0] = t0; // store plaintext 266 | ((uint64_t *) pt)[1] = t1; 267 | 268 | return; 269 | 270 | } 271 | 272 | // Wrappers 273 | 274 | void aes128_dec_ecb_saes64(uint8_t pt[16], const uint8_t ct[16], 275 | const uint32_t rk[AES128_RK_WORDS]) 276 | { 277 | aes_dec_rounds_saes64(pt, ct, rk, AES128_ROUNDS); 278 | } 279 | 280 | void aes192_dec_ecb_saes64(uint8_t pt[16], const uint8_t ct[16], 281 | const uint32_t rk[AES192_RK_WORDS]) 282 | { 283 | aes_dec_rounds_saes64(pt, ct, rk, AES192_ROUNDS); 284 | } 285 | 286 | void aes256_dec_ecb_saes64(uint8_t pt[16], const uint8_t ct[16], 287 | const uint32_t rk[AES256_RK_WORDS]) 288 | { 289 | aes_dec_rounds_saes64(pt, ct, rk, AES256_ROUNDS); 290 | } 291 | 292 | // Helper: apply inverse mixcolumns to a vector 293 | 294 | static inline void saes64_dec_invmc(uint64_t * v, size_t len) 295 | { 296 | size_t i; 297 | 298 | for (i = 0; i < len; i++) { 299 | v[i] = saes64_imix(v[i]); 300 | } 301 | } 302 | 303 | // Key schedule for AES-128 decryption. 304 | 305 | void aes128_dec_key_saes64(uint32_t rk[44], const uint8_t key[16]) 306 | { 307 | // create an encryption key and modify middle rounds 308 | aes128_enc_key_saes64(rk, key); 309 | saes64_dec_invmc(((uint64_t *) rk) + 2, AES128_RK_WORDS / 2 - 4); 310 | } 311 | 312 | // Key schedule for AES-192 decryption. 313 | 314 | void aes192_dec_key_saes64(uint32_t rk[52], const uint8_t key[24]) 315 | { 316 | // create an encryption key and modify middle rounds 317 | aes192_enc_key_saes64(rk, key); 318 | saes64_dec_invmc(((uint64_t *) rk) + 2, AES192_RK_WORDS / 2 - 4); 319 | } 320 | 321 | // Key schedule for AES-256 decryption. 
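// (This is the "equivalent inverse cipher" construction of FIPS 197:
// running InvMixColumns over the middle round keys -- via the
// saes64_imix() helper above -- lets decryption reuse the same round
// structure as encryption, with the round keys consumed in reverse.)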
322 | 323 | void aes256_dec_key_saes64(uint32_t rk[60], const uint8_t key[32]) 324 | { 325 | // create an encryption key and modify middle rounds 326 | aes256_enc_key_saes64(rk, key); 327 | saes64_dec_invmc(((uint64_t *) rk) + 2, AES256_RK_WORDS / 2 - 4); 328 | } 329 | -------------------------------------------------------------------------------- /aes_saes64.h: -------------------------------------------------------------------------------- 1 | // aes_saes64.h 2 | // 2020-05-05 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // Implementation prototypes for aes_saes64.c 6 | 7 | #ifndef _AES_SAES64_H_ 8 | #define _AES_SAES64_H_ 9 | 10 | #include 11 | 12 | // Set encryption key 13 | 14 | void aes128_enc_key_saes64(uint32_t rk[AES128_RK_WORDS], 15 | const uint8_t key[16]); 16 | 17 | void aes192_enc_key_saes64(uint32_t rk[AES192_RK_WORDS], 18 | const uint8_t key[24]); 19 | 20 | void aes256_enc_key_saes64(uint32_t rk[AES256_RK_WORDS], 21 | const uint8_t key[32]); 22 | 23 | // Encrypt a block 24 | 25 | void aes128_enc_ecb_saes64(uint8_t ct[16], const uint8_t pt[16], 26 | const uint32_t rk[AES128_RK_WORDS]); 27 | 28 | void aes192_enc_ecb_saes64(uint8_t ct[16], const uint8_t pt[16], 29 | const uint32_t rk[AES192_RK_WORDS]); 30 | 31 | void aes256_enc_ecb_saes64(uint8_t ct[16], const uint8_t pt[16], 32 | const uint32_t rk[AES256_RK_WORDS]); 33 | 34 | 35 | // Set decryption key 36 | 37 | void aes128_dec_key_saes64(uint32_t rk[AES128_RK_WORDS], 38 | const uint8_t key[16]); 39 | 40 | void aes192_dec_key_saes64(uint32_t rk[AES192_RK_WORDS], 41 | const uint8_t key[24]); 42 | 43 | void aes256_dec_key_saes64(uint32_t rk[AES256_RK_WORDS], 44 | const uint8_t key[32]); 45 | 46 | // Decrypt a block 47 | 48 | void aes128_dec_ecb_saes64(uint8_t pt[16], const uint8_t ct[16], 49 | const uint32_t rk[AES128_RK_WORDS]); 50 | 51 | void aes192_dec_ecb_saes64(uint8_t pt[16], const uint8_t ct[16], 52 | const uint32_t rk[AES192_RK_WORDS]); 53 | 54 | void aes256_dec_ecb_saes64(uint8_t pt[16], const uint8_t ct[16], 55 | const uint32_t rk[AES256_RK_WORDS]); 56 | 57 | #endif // _AES_SAES64_H_ 58 | -------------------------------------------------------------------------------- /aes_test.c: -------------------------------------------------------------------------------- 1 | // aes_test.c 2 | // 2020-03-21 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 
4 | 5 | // Unit tests for AES-128/192/256 6 | 7 | #include "test_hex.h" 8 | #include "aes_wrap.h" 9 | 10 | // Test AES 11 | 12 | int test_aes() 13 | { 14 | uint8_t pt[16], ct[16], xt[16], key[32]; 15 | uint32_t rk[AES256_RK_WORDS]; 16 | int fail = 0; 17 | 18 | // FIPS 197 test vectors 19 | readhex(pt, sizeof(pt), "00112233445566778899AABBCCDDEEFF"); 20 | readhex(key, sizeof(key), 21 | "000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F"); 22 | aes128_enc_key(rk, key); 23 | aes128_enc_ecb(ct, pt, rk); 24 | 25 | fail += chkhex("AES-128 Enc", ct, 16, "69C4E0D86A7B0430D8CDB78070B4C55A"); 26 | aes128_dec_key(rk, key); 27 | aes128_dec_ecb(xt, ct, rk); 28 | fail += chkhex("AES-128 Dec", xt, 16, "00112233445566778899AABBCCDDEEFF"); 29 | 30 | aes192_enc_key(rk, key); 31 | aes192_enc_ecb(ct, pt, rk); 32 | fail += chkhex("AES-192 Enc", ct, 16, "DDA97CA4864CDFE06EAF70A0EC0D7191"); 33 | 34 | aes192_dec_key(rk, key); 35 | aes192_dec_ecb(xt, ct, rk); 36 | fail += chkhex("AES-192 Dec", xt, 16, "00112233445566778899AABBCCDDEEFF"); 37 | 38 | aes256_enc_key(rk, key); 39 | aes256_enc_ecb(ct, pt, rk); 40 | fail += chkhex("AES-256 Enc", ct, 16, "8EA2B7CA516745BFEAFC49904B496089"); 41 | 42 | aes256_dec_key(rk, key); 43 | aes256_dec_ecb(xt, ct, rk); 44 | fail += chkhex("AES-256 Dec", xt, 16, "00112233445566778899AABBCCDDEEFF"); 45 | 46 | // another test vector set (picked from SP800-38A) 47 | readhex(key, sizeof(key), "2B7E151628AED2A6ABF7158809CF4F3C"); 48 | aes128_enc_key(rk, key); 49 | readhex(pt, sizeof(pt), "6BC1BEE22E409F96E93D7E117393172A"); 50 | aes128_enc_ecb(ct, pt, rk); 51 | fail += chkhex("AES-128 Enc", ct, 16, "3AD77BB40D7A3660A89ECAF32466EF97"); 52 | 53 | aes128_dec_key(rk, key); 54 | aes128_dec_ecb(xt, ct, rk); 55 | fail += chkhex("AES-128 Dec", xt, 16, "6BC1BEE22E409F96E93D7E117393172A"); 56 | 57 | readhex(key, sizeof(key), 58 | "8E73B0F7DA0E6452C810F32B809079E562F8EAD2522C6B7B"); 59 | aes192_enc_key(rk, key); 60 | readhex(pt, sizeof(pt), "AE2D8A571E03AC9C9EB76FAC45AF8E51"); 61 | aes192_enc_ecb(ct, pt, rk); 62 | fail += chkhex("AES-192 Enc", ct, 16, "974104846D0AD3AD7734ECB3ECEE4EEF"); 63 | 64 | aes192_dec_key(rk, key); 65 | aes192_dec_ecb(xt, ct, rk); 66 | fail += chkhex("AES-192 Dec", xt, 16, "AE2D8A571E03AC9C9EB76FAC45AF8E51"); 67 | 68 | readhex(key, sizeof(key), 69 | "603DEB1015CA71BE2B73AEF0857D77811F352C073B6108D72D9810A30914DFF4"); 70 | aes256_enc_key(rk, key); 71 | readhex(pt, sizeof(pt), "30C81C46A35CE411E5FBC1191A0A52EF"); 72 | aes256_enc_ecb(ct, pt, rk); 73 | fail += chkhex("AES-256 Enc", ct, 16, "B6ED21B99CA6F4F9F153E7B1BEAFED1D"); 74 | 75 | aes256_dec_key(rk, key); 76 | aes256_dec_ecb(xt, ct, rk); 77 | fail += chkhex("AES-256 Dec", xt, 16, "30C81C46A35CE411E5FBC1191A0A52EF"); 78 | 79 | return fail; 80 | } 81 | -------------------------------------------------------------------------------- /aes_wrap.c: -------------------------------------------------------------------------------- 1 | // aes_wrap.c 2 | // 2020-04-23 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 
4 | 5 | // AES 128/192/256 block encryption and decryption 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #include "aes_wrap.h" 11 | #include "aes_saes32.h" 12 | 13 | static void key_undef(uint32_t * rk, const uint8_t * key) 14 | { 15 | (void) rk; 16 | (void) key; 17 | 18 | fprintf(stderr, "[DEAD] key_undef()\n"); 19 | abort(); 20 | } 21 | 22 | static void ciph_undef(uint8_t * d, const uint8_t * s, const uint32_t * rk) 23 | { 24 | (void) d; 25 | (void) s; 26 | (void) rk; 27 | 28 | fprintf(stderr, "[DEAD] ciph_undef()\n"); 29 | abort(); 30 | } 31 | 32 | // == Externally visible pointers == 33 | 34 | // Set encryption key 35 | 36 | void (*aes128_enc_key)(uint32_t rk[AES128_RK_WORDS], 37 | const uint8_t key[16]) = key_undef; 38 | 39 | void (*aes192_enc_key)(uint32_t rk[AES192_RK_WORDS], 40 | const uint8_t key[24]) = key_undef; 41 | 42 | void (*aes256_enc_key)(uint32_t rk[AES256_RK_WORDS], 43 | const uint8_t key[32]) = key_undef; 44 | 45 | // Encrypt a block 46 | 47 | 48 | void (*aes128_enc_ecb)(uint8_t ct[16], const uint8_t pt[16], 49 | const uint32_t rk[AES128_RK_WORDS]) = ciph_undef; 50 | 51 | void (*aes192_enc_ecb)(uint8_t ct[16], const uint8_t pt[16], 52 | const uint32_t rk[AES192_RK_WORDS]) = ciph_undef; 53 | 54 | void (*aes256_enc_ecb)(uint8_t ct[16], const uint8_t pt[16], 55 | const uint32_t rk[AES256_RK_WORDS]) = ciph_undef; 56 | 57 | // Set decryption key 58 | 59 | void (*aes128_dec_key)(uint32_t rk[AES128_RK_WORDS], 60 | const uint8_t key[16]) = key_undef; 61 | void (*aes192_dec_key)(uint32_t rk[AES192_RK_WORDS], 62 | const uint8_t key[24]) = key_undef; 63 | void (*aes256_dec_key)(uint32_t rk[AES256_RK_WORDS], 64 | const uint8_t key[32]) = key_undef; 65 | 66 | // Decrypt a block 67 | 68 | void (*aes128_dec_ecb)(uint8_t pt[16], const uint8_t ct[16], 69 | const uint32_t rk[AES128_RK_WORDS]) = ciph_undef; 70 | 71 | void (*aes192_dec_ecb)(uint8_t pt[16], const uint8_t ct[16], 72 | const uint32_t rk[AES192_RK_WORDS]) = ciph_undef; 73 | 74 | void (*aes256_dec_ecb)(uint8_t pt[16], const uint8_t ct[16], 75 | const uint32_t rk[AES256_RK_WORDS]) = ciph_undef; 76 | -------------------------------------------------------------------------------- /aes_wrap.h: -------------------------------------------------------------------------------- 1 | // aes_wrap.h 2 | // 2019-10-23 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2019, PQShield Ltd. All rights reserved. 4 | 5 | // Wrapper for AES 128/192/256 block encryption and decryption. 6 | // These provide function pointers to the UUT (unit under test).
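// Usage sketch (illustrative only; assumes the pointers declared below
// have already been bound to an implementation):
//
//   uint32_t rk[AES128_RK_WORDS];
//   uint8_t ct[16];
//   aes128_enc_key(rk, key);     // one-time key expansion
//   aes128_enc_ecb(ct, pt, rk);  // encrypt a single 16-byte block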
7 | 8 | #ifndef _AES_WRAP_H_ 9 | #define _AES_WRAP_H_ 10 | 11 | #include <stdint.h> 12 | 13 | // number of rounds 14 | #define AES128_ROUNDS 10 15 | #define AES192_ROUNDS 12 16 | #define AES256_ROUNDS 14 17 | 18 | // expanded key size 19 | #define AES128_RK_WORDS (4 * (AES128_ROUNDS + 1)) 20 | #define AES192_RK_WORDS (4 * (AES192_ROUNDS + 1)) 21 | #define AES256_RK_WORDS (4 * (AES256_ROUNDS + 1)) 22 | 23 | // Set encryption key 24 | 25 | extern void (*aes128_enc_key)(uint32_t rk[AES128_RK_WORDS], 26 | const uint8_t key[16]); 27 | 28 | extern void (*aes192_enc_key)(uint32_t rk[AES192_RK_WORDS], 29 | const uint8_t key[24]); 30 | 31 | extern void (*aes256_enc_key)(uint32_t rk[AES256_RK_WORDS], 32 | const uint8_t key[32]); 33 | 34 | // Encrypt a block 35 | 36 | 37 | extern void (*aes128_enc_ecb)(uint8_t ct[16], const uint8_t pt[16], 38 | const uint32_t rk[AES128_RK_WORDS]); 39 | 40 | extern void (*aes192_enc_ecb)(uint8_t ct[16], const uint8_t pt[16], 41 | const uint32_t rk[AES192_RK_WORDS]); 42 | 43 | extern void (*aes256_enc_ecb)(uint8_t ct[16], const uint8_t pt[16], 44 | const uint32_t rk[AES256_RK_WORDS]); 45 | 46 | // Set decryption key 47 | 48 | extern void (*aes128_dec_key)(uint32_t rk[AES128_RK_WORDS], 49 | const uint8_t key[16]); 50 | extern void (*aes192_dec_key)(uint32_t rk[AES192_RK_WORDS], 51 | const uint8_t key[24]); 52 | extern void (*aes256_dec_key)(uint32_t rk[AES256_RK_WORDS], 53 | const uint8_t key[32]); 54 | 55 | // Decrypt a block 56 | 57 | extern void (*aes128_dec_ecb)(uint8_t pt[16], const uint8_t ct[16], 58 | const uint32_t rk[AES128_RK_WORDS]); 59 | 60 | extern void (*aes192_dec_ecb)(uint8_t pt[16], const uint8_t ct[16], 61 | const uint32_t rk[AES192_RK_WORDS]); 62 | 63 | extern void (*aes256_dec_ecb)(uint8_t pt[16], const uint8_t ct[16], 64 | const uint32_t rk[AES256_RK_WORDS]); 65 | 66 | #endif // _AES_WRAP_H_ 67 | -------------------------------------------------------------------------------- /asm/README.md: -------------------------------------------------------------------------------- 1 | # Assembler AES / SM4 using SAES32 2 | 3 | 2020-02-16 Markku-Juhani O. Saarinen 4 | 5 | Assembler implementations of the AES and SM4 block ciphers using the 6 | SAES32 instructions -- these have the same prototypes and features as the 7 | C-language APIs (see parent), so the same unit tests work too. 8 | 9 | The functions assume word-aligned input. Typically such low-level "ECB" 10 | primitives do not work directly on plaintext or ciphertext but are 11 | wrapped in a function that implements an encryption mode such as 12 | CTR, CCM, SIV, or GCM and operates on buffers provided by the wrapper. 13 | 14 | This is definitely not the prettiest way of using the (custom-0) SAES32 15 | instructions; hacky macros in [saes32_c0.h](saes32_c0.h) are used for 16 | encoding. Requires the C preprocessor; tested with RISC-V GCC 9.2.0. 17 | 18 | Cheers, 19 | - markku 20 | -------------------------------------------------------------------------------- /asm/saes32_c0.h: -------------------------------------------------------------------------------- 1 | // saes32_c0.h 2 | // 2020-02-16 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // Raw encoding macros for ENC1S as custom-0 -- pretty ugly.
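// The cust0r macro below packs a standard RISC-V R-type word: opcode
// custom-0 (0x0B) in bits 6:0, rd in bits 11:7, funct3 in bits 14:12,
// rs1 in bits 19:15, rs2 in bits 24:20, and funct7 in bits 31:25.
// Worked example (hand-checked, for illustration):
//
//   saes32_encsm a0, a0, a1, 0   // fn3 = 0, fn7 = 0, rd = rs1 = 10, rs2 = 11
//                                // emits .word 0x00B5050B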
6 | 7 | #ifndef _SAES32_C0_H_ 8 | #define _SAES32_C0_H_ 9 | 10 | // custom-0 r-type instruction encoding macro 11 | 12 | .macro cust0r fn3, fn7, rd, rs1, rs2 13 | .word(0x0B + ((\fn3) << 12) + ((\fn7) << 25) + ((\rd) << 7) + ((\rs1) << 15) + ((\rs2) << 20)) 14 | .endm 15 | 16 | 17 | // function codes 18 | #define SAES32_ENCSM_FN 0 19 | #define SAES32_ENCS_FN 1 20 | #define SAES32_DECSM_FN 2 21 | #define SAES32_DECS_FN 3 22 | #define SSM4_ED_FN 4 23 | #define SSM4_KS_FN 5 24 | 25 | // SAES32 as funct3=0 -- with a fn in funct7 26 | 27 | .macro saes32 rd, rs1, rs2, fn 28 | cust0r 0, \fn, \rd, \rs1, \rs2 29 | .endm 30 | 31 | // Pseudo-ops for AES and SM4 32 | 33 | .macro saes32_encsm rd, rs1, rs2, bs 34 | saes32 \rd, \rs1, \rs2, ((SAES32_ENCSM_FN << 2) | (\bs)) 35 | .endm 36 | 37 | .macro saes32_encs rd, rs1, rs2, bs 38 | saes32 \rd, \rs1, \rs2, ((SAES32_ENCS_FN << 2) | (\bs)) 39 | .endm 40 | 41 | .macro saes32_decsm rd, rs1, rs2, bs 42 | saes32 \rd, \rs1, \rs2, ((SAES32_DECSM_FN << 2) | (\bs)) 43 | .endm 44 | 45 | .macro saes32_decs rd, rs1, rs2, bs 46 | saes32 \rd, \rs1, \rs2, ((SAES32_DECS_FN << 2) | (\bs)) 47 | .endm 48 | 49 | 50 | .macro ssm4_ed rd, rs1, rs2, bs 51 | saes32 \rd, \rs1, \rs2, ((SSM4_ED_FN << 2) | (\bs)) 52 | .endm 53 | 54 | .macro ssm4_ks rd, rs1, rs2, bs 55 | saes32 \rd, \rs1, \rs2, ((SSM4_KS_FN << 2) | (\bs)) 56 | .endm 57 | 58 | 59 | // numbered registers 60 | #define X0 0 61 | #define RA 1 62 | #define SP 2 63 | #define GP 3 64 | #define TP 4 65 | #define T0 5 66 | #define T1 6 67 | #define T2 7 68 | #define S0 8 69 | #define S1 9 70 | #define A0 10 71 | #define A1 11 72 | #define A2 12 73 | #define A3 13 74 | #define A4 14 75 | #define A5 15 76 | #define A6 16 77 | #define A7 17 78 | #define S2 18 79 | #define S3 19 80 | #define S4 20 81 | #define S5 21 82 | #define S6 22 83 | #define S7 23 84 | #define S8 24 85 | #define S9 25 86 | #define S10 26 87 | #define S11 27 88 | #define T3 28 89 | #define T4 29 90 | #define T5 30 91 | #define T6 31 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /asm/saes32_dec.S: -------------------------------------------------------------------------------- 1 | // saes32_dec.S 2 | // 2020-02-16 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // AES Decryption and Key Schedule using "lwaes" instructions. 6 | 7 | // macro definitions for the custom instruction 8 | #include "saes32_c0.h" 9 | 10 | .option nopic 11 | .text 12 | .align 2 13 | 14 | 15 | // Decrypt rounds. 
Implements AES-128/192/256 depending on nr = {10,12,14} 16 | // void aes_dec_rounds(uint8_t pt[16], const uint8_t ct[16], 17 | // const uint32_t rk[], int nr) 18 | // where: a0 = pt, a1 = ct, a2 = rk, a3 = nr 19 | 20 | .globl aes_dec_rounds 21 | .type aes_dec_rounds, @function 22 | 23 | aes_dec_rounds: 24 | 25 | lw t4, 0(a1) // load ct 26 | lw t5, 4(a1) 27 | lw t6, 8(a1) 28 | lw a7, 12(a1) 29 | 30 | slli a3, a3, 4 // final pointer 31 | add a3, a3, a2 32 | 33 | lw t0, 0(a3) // load rk 34 | lw t1, 4(a3) 35 | lw t2, 8(a3) 36 | lw t3, 12(a3) 37 | 38 | xor t0, t0, t4 // ct ^ rk 39 | xor t1, t1, t5 40 | xor t2, t2, t6 41 | xor t3, t3, a7 42 | 43 | j .ent // enter loop in middle 44 | 45 | .loop: 46 | // even round 47 | saes32_decsm T0, T0, T4, 0 48 | saes32_decsm T0, T0, A7, 1 49 | saes32_decsm T0, T0, T6, 2 50 | saes32_decsm T0, T0, T5, 3 51 | 52 | saes32_decsm T1, T1, T5, 0 53 | saes32_decsm T1, T1, T4, 1 54 | saes32_decsm T1, T1, A7, 2 55 | saes32_decsm T1, T1, T6, 3 56 | 57 | saes32_decsm T2, T2, T6, 0 58 | saes32_decsm T2, T2, T5, 1 59 | saes32_decsm T2, T2, T4, 2 60 | saes32_decsm T2, T2, A7, 3 61 | 62 | saes32_decsm T3, T3, A7, 0 63 | saes32_decsm T3, T3, T6, 1 64 | saes32_decsm T3, T3, T5, 2 65 | saes32_decsm T3, T3, T4, 3 66 | 67 | .ent: 68 | addi a3, a3, -32 69 | lw t4, 16(a3) // load round key 70 | lw t5, 20(a3) 71 | lw t6, 24(a3) 72 | lw a7, 28(a3) 73 | 74 | // odd round 75 | saes32_decsm T4, T4, T0, 0 76 | saes32_decsm T4, T4, T3, 1 77 | saes32_decsm T4, T4, T2, 2 78 | saes32_decsm T4, T4, T1, 3 79 | 80 | saes32_decsm T5, T5, T1, 0 81 | saes32_decsm T5, T5, T0, 1 82 | saes32_decsm T5, T5, T3, 2 83 | saes32_decsm T5, T5, T2, 3 84 | 85 | saes32_decsm T6, T6, T2, 0 86 | saes32_decsm T6, T6, T1, 1 87 | saes32_decsm T6, T6, T0, 2 88 | saes32_decsm T6, T6, T3, 3 89 | 90 | saes32_decsm A7, A7, T3, 0 91 | saes32_decsm A7, A7, T2, 1 92 | saes32_decsm A7, A7, T1, 2 93 | saes32_decsm A7, A7, T0, 3 94 | 95 | lw t0, 0(a3) // load round key 96 | lw t1, 4(a3) 97 | lw t2, 8(a3) 98 | lw t3, 12(a3) 99 | bne a3, a2, .loop 100 | 101 | // final (output) round 102 | saes32_decs T0, T0, T4, 0 103 | saes32_decs T0, T0, A7, 1 104 | saes32_decs T0, T0, T6, 2 105 | saes32_decs T0, T0, T5, 3 106 | 107 | saes32_decs T1, T1, T5, 0 108 | saes32_decs T1, T1, T4, 1 109 | saes32_decs T1, T1, A7, 2 110 | saes32_decs T1, T1, T6, 3 111 | 112 | saes32_decs T2, T2, T6, 0 113 | saes32_decs T2, T2, T5, 1 114 | saes32_decs T2, T2, T4, 2 115 | saes32_decs T2, T2, A7, 3 116 | 117 | saes32_decs T3, T3, A7, 0 118 | saes32_decs T3, T3, T6, 1 119 | saes32_decs T3, T3, T5, 2 120 | saes32_decs T3, T3, T4, 3 121 | 122 | sw t0, 0(a0) // store pt 123 | sw t1, 4(a0) 124 | sw t2, 8(a0) 125 | sw t3, 12(a0) 126 | 127 | jr ra 128 | .size aes_dec_rounds, .-aes_dec_rounds 129 | 130 | // Helper: apply inverse mixcolumns to a vector 131 | 132 | .type .invmc, @function 133 | 134 | .invmc: 135 | lw t0, 0(a0) 136 | 137 | saes32_encs T1, 0, T0, 0 138 | saes32_encs T1, T1, T0, 1 139 | saes32_encs T1, T1, T0, 2 140 | saes32_encs T1, T1, T0, 3 141 | 142 | saes32_decsm T0, 0, T1, 0 143 | saes32_decsm T0, T0, T1, 1 144 | saes32_decsm T0, T0, T1, 2 145 | saes32_decsm T0, T0, T1, 3 146 | 147 | sw t0, 0(a0) 148 | addi a0, a0, 4 149 | bne a0, a1, .invmc 150 | jr ra 151 | 152 | .size .invmc, .-.invmc 153 | .align 2 154 | 155 | 156 | // Key schedule for AES-128 decryption. 
157 | // void aes128_dec_key(uint32_t rk[44], const uint8_t key[16]); 158 | // where: a0 = rk, a1 = key 159 | 160 | .globl aes128_dec_key 161 | .type aes128_dec_key, @function 162 | 163 | aes128_dec_key: 164 | addi sp, sp, -8 165 | sw s0, 0(sp) 166 | sw ra, 4(sp) 167 | mv s0, a0 168 | call aes128_enc_key 169 | addi a0, s0, 16 170 | lw s0, 0(sp) 171 | lw ra, 4(sp) 172 | addi a1, a0, 4 * 36 173 | addi sp, sp, 8 174 | tail .invmc 175 | 176 | .size aes128_dec_key, .-aes128_dec_key 177 | .align 2 178 | 179 | 180 | // Key schedule for AES-192 decryption. 181 | // void aes192_dec_key(uint32_t rk[52], const uint8_t key[24]); 182 | // where: a0 = rk, a1 = key 183 | 184 | .globl aes192_dec_key 185 | .type aes192_dec_key, @function 186 | 187 | aes192_dec_key: 188 | addi sp, sp, -8 189 | sw s0, 0(sp) 190 | sw ra, 4(sp) 191 | mv s0, a0 192 | call aes192_enc_key 193 | addi a0, s0, 16 194 | lw s0, 0(sp) 195 | lw ra, 4(sp) 196 | addi a1, a0, 4 * 44 197 | addi sp, sp, 8 198 | tail .invmc 199 | 200 | .size aes192_dec_key, .-aes192_dec_key 201 | .align 2 202 | 203 | 204 | // Key schedule for AES-256 decryption. 205 | // void aes256_dec_key(uint32_t rk[60], const uint8_t key[32]); 206 | // where: a0 = rk, a1 = key 207 | 208 | .globl aes256_dec_key 209 | .type aes256_dec_key, @function 210 | 211 | aes256_dec_key: 212 | addi sp, sp, -8 213 | sw s0, 0(sp) 214 | sw ra, 4(sp) 215 | mv s0, a0 216 | call aes256_enc_key 217 | addi a0, s0, 16 218 | lw s0, 0(sp) 219 | lw ra, 4(sp) 220 | addi a1, a0, 4 * 52 221 | addi sp, sp, 8 222 | tail .invmc 223 | 224 | .size aes256_dec_key, .-aes256_dec_key 225 | 226 | -------------------------------------------------------------------------------- /asm/saes32_enc.S: -------------------------------------------------------------------------------- 1 | // saes32_enc.S 2 | // 2020-02-16 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // AES Encryption and Key Schedule using "lwaes" instructions. 6 | 7 | // macro definitions for the custom instruction 8 | #include "saes32_c0.h" 9 | 10 | .option nopic 11 | .text 12 | .align 2 13 | 14 | 15 | // Encrypt rounds. 
Implements AES-128/192/256 depending on nr = {10,12,14} 16 | // void aes_enc_rounds(uint8_t ct[16], const uint8_t pt[16], 17 | // const uint32_t rk[], int nr) 18 | // where: a0 = ct, a1 = pt, a2 = rk, a3 = nr 19 | 20 | .globl aes_enc_rounds 21 | .type aes_enc_rounds, @function 22 | 23 | aes_enc_rounds: 24 | 25 | lw t4, 0(a1) // load pt 26 | lw t5, 4(a1) 27 | lw t6, 8(a1) 28 | lw a7, 12(a1) 29 | 30 | lw t0, 0(a2) // load rk 31 | lw t1, 4(a2) 32 | lw t2, 8(a2) 33 | lw t3, 12(a2) 34 | 35 | xor t0, t0, t4 // pt ^ rk 36 | xor t1, t1, t5 37 | xor t2, t2, t6 38 | xor t3, t3, a7 39 | 40 | slli a3, a3, 4 // final pointer 41 | add a3, a3, a2 42 | 43 | j .ent // enter loop in the middle 44 | 45 | .loop: 46 | // odd round 47 | saes32_encsm T0, T0, T4, 0 48 | saes32_encsm T0, T0, T5, 1 49 | saes32_encsm T0, T0, T6, 2 50 | saes32_encsm T0, T0, A7, 3 51 | 52 | saes32_encsm T1, T1, T5, 0 53 | saes32_encsm T1, T1, T6, 1 54 | saes32_encsm T1, T1, A7, 2 55 | saes32_encsm T1, T1, T4, 3 56 | 57 | saes32_encsm T2, T2, T6, 0 58 | saes32_encsm T2, T2, A7, 1 59 | saes32_encsm T2, T2, T4, 2 60 | saes32_encsm T2, T2, T5, 3 61 | 62 | saes32_encsm T3, T3, A7, 0 63 | saes32_encsm T3, T3, T4, 1 64 | saes32_encsm T3, T3, T5, 2 65 | saes32_encsm T3, T3, T6, 3 66 | 67 | .ent: 68 | lw t4, 16(a2) // load round key 69 | lw t5, 20(a2) 70 | lw t6, 24(a2) 71 | lw a7, 28(a2) 72 | 73 | // even round 74 | saes32_encsm T4, T4, T0, 0 75 | saes32_encsm T4, T4, T1, 1 76 | saes32_encsm T4, T4, T2, 2 77 | saes32_encsm T4, T4, T3, 3 78 | 79 | saes32_encsm T5, T5, T1, 0 80 | saes32_encsm T5, T5, T2, 1 81 | saes32_encsm T5, T5, T3, 2 82 | saes32_encsm T5, T5, T0, 3 83 | 84 | saes32_encsm T6, T6, T2, 0 85 | saes32_encsm T6, T6, T3, 1 86 | saes32_encsm T6, T6, T0, 2 87 | saes32_encsm T6, T6, T1, 3 88 | 89 | saes32_encsm A7, A7, T3, 0 90 | saes32_encsm A7, A7, T0, 1 91 | saes32_encsm A7, A7, T1, 2 92 | saes32_encsm A7, A7, T2, 3 93 | 94 | addi a2, a2, 32 95 | lw t0, 0(a2) // load round key 96 | lw t1, 4(a2) 97 | lw t2, 8(a2) 98 | lw t3, 12(a2) 99 | bne a3, a2, .loop 100 | 101 | // final (output) round 102 | saes32_encs T0, T0, T4, 0 103 | saes32_encs T0, T0, T5, 1 104 | saes32_encs T0, T0, T6, 2 105 | saes32_encs T0, T0, A7, 3 106 | 107 | saes32_encs T1, T1, T5, 0 108 | saes32_encs T1, T1, T6, 1 109 | saes32_encs T1, T1, A7, 2 110 | saes32_encs T1, T1, T4, 3 111 | 112 | saes32_encs T2, T2, T6, 0 113 | saes32_encs T2, T2, A7, 1 114 | saes32_encs T2, T2, T4, 2 115 | saes32_encs T2, T2, T5, 3 116 | 117 | saes32_encs T3, T3, A7, 0 118 | saes32_encs T3, T3, T4, 1 119 | saes32_encs T3, T3, T5, 2 120 | saes32_encs T3, T3, T6, 3 121 | 122 | sw t0, 0(a0) // store ct 123 | sw t1, 4(a0) 124 | sw t2, 8(a0) 125 | sw t3, 12(a0) 126 | 127 | jr ra 128 | .size aes_enc_rounds, .-aes_enc_rounds 129 | 130 | 131 | // Key schedule for AES-128 Encryption. 
132 | // void aes128_enc_key(uint32_t rk[44], const uint8_t key[16]) 133 | // where: a0 = rk, a1 = key 134 | 135 | .align 2 136 | .globl aes128_enc_key 137 | .type aes128_enc_key, @function 138 | 139 | aes128_enc_key: 140 | 141 | lui a2, %hi(.rcon) // rcon pointer 142 | addi a2, a2, %lo(.rcon) 143 | 144 | addi a3, a0, 16 * 10 // end pointer 145 | 146 | lw t0, 0(a1) // input key 147 | lw t1, 4(a1) 148 | lw t2, 8(a1) 149 | lw t3, 12(a1) 150 | 151 | sw t0, 0(a0) // first round key 152 | sw t1, 4(a0) 153 | sw t2, 8(a0) 154 | sw t3, 12(a0) 155 | 156 | xori t0, t0, 1 // first round constant 157 | j .nl0 158 | 159 | .ekl0: 160 | addi a2, a2, 1 161 | lbu a1, 0(a2) // round constant 162 | xor t0, a1, t0 163 | 164 | .nl0: 165 | slli a4, t3, 24 // rotate 166 | srli a1, t3, 8 167 | or a1, a1, a4 168 | 169 | saes32_encs T0, T0, A1, 0 170 | saes32_encs T0, T0, A1, 1 171 | saes32_encs T0, T0, A1, 2 172 | saes32_encs T0, T0, A1, 3 173 | 174 | xor t1, t1, t0 175 | xor t2, t2, t1 176 | xor t3, t3, t2 177 | 178 | addi a0, a0, 16 179 | sw t0, 0(a0) // store round key 180 | sw t1, 4(a0) 181 | sw t2, 8(a0) 182 | sw t3, 12(a0) 183 | 184 | bne a0, a3, .ekl0 185 | 186 | jr ra 187 | .size aes128_enc_key, .-aes128_enc_key 188 | 189 | 190 | // Key schedule for AES-192 Encryption. 191 | // void aes192_enc_key(uint32_t rk[52], const uint8_t key[24]) 192 | // where: a0 = rk, a1 = key 193 | 194 | .align 2 195 | .globl aes192_enc_key 196 | .type aes192_enc_key, @function 197 | 198 | aes192_enc_key: 199 | 200 | lui a2, %hi(.rcon) // rcon pointer 201 | addi a2, a2, %lo(.rcon) 202 | 203 | addi a3, a0, 16 * 12 // end pointer 204 | 205 | lw t0, 0(a1) // input key 206 | lw t1, 4(a1) 207 | lw t2, 8(a1) 208 | lw t3, 12(a1) 209 | lw t4, 16(a1) 210 | lw t5, 20(a1) 211 | 212 | sw t0, 0(a0) // first round key 213 | sw t1, 4(a0) 214 | sw t2, 8(a0) 215 | sw t3, 12(a0) 216 | 217 | xori t0, t0, 1 // first round constant 218 | j .nl1 219 | 220 | .ekl1: addi a2, a2, 1 221 | lbu a1, 0(a2) // round constant 222 | xor t0, a1, t0 223 | 224 | .nl1: sw t4, 16(a0) // high part of round key 225 | sw t5, 20(a0) 226 | 227 | slli a4, t5, 24 // rotate 228 | srli a1, t5, 8 229 | or a1, a1, a4 230 | 231 | saes32_encs T0, T0, A1, 0 232 | saes32_encs T0, T0, A1, 1 233 | saes32_encs T0, T0, A1, 2 234 | saes32_encs T0, T0, A1, 3 235 | 236 | xor t1, t1, t0 237 | xor t2, t2, t1 238 | xor t3, t3, t2 239 | xor t4, t4, t3 240 | xor t5, t5, t4 241 | 242 | addi a0, a0, 24 243 | sw t0, 0(a0) // store round key 244 | sw t1, 4(a0) 245 | sw t2, 8(a0) 246 | sw t3, 12(a0) 247 | 248 | bne a0, a3, .ekl1 249 | 250 | jr ra 251 | .size aes192_enc_key, .-aes192_enc_key 252 | 253 | 254 | // Key schedule for AES-256 Encryption.
255 | // void aes256_enc_key(uint32_t rk[60], const uint8_t key[32]) 256 | // where: a0 = rk, a1 = key 257 | 258 | .align 2 259 | .globl aes256_enc_key 260 | .type aes256_enc_key, @function 261 | 262 | aes256_enc_key: 263 | 264 | lui a2, %hi(.rcon) // rcon pointer 265 | addi a2, a2, %lo(.rcon) 266 | 267 | addi a3, a0, 16 * 14 // end pointer 268 | 269 | lw t0, 0(a1) // input key 270 | lw t1, 4(a1) 271 | lw t2, 8(a1) 272 | lw t3, 12(a1) 273 | lw t4, 16(a1) 274 | lw t5, 20(a1) 275 | lw t6, 24(a1) 276 | lw a7, 28(a1) 277 | 278 | sw t0, 0(a0) // first round key 279 | sw t1, 4(a0) 280 | sw t2, 8(a0) 281 | sw t3, 12(a0) 282 | 283 | xori t0, t0, 1 // first round constant 284 | j .nl2 285 | 286 | .ekl2: saes32_encs T4, T4, T3, 0 // no rotate 287 | saes32_encs T4, T4, T3, 1 288 | saes32_encs T4, T4, T3, 2 289 | saes32_encs T4, T4, T3, 3 290 | 291 | xor t5, t5, t4 292 | xor t6, t6, t5 293 | xor a7, a7, t6 294 | 295 | addi a2, a2, 1 296 | lbu a1, 0(a2) // round constant 297 | xor t0, a1, t0 298 | 299 | .nl2: sw t4, 16(a0) // store upper part of rk 300 | sw t5, 20(a0) 301 | sw t6, 24(a0) 302 | sw a7, 28(a0) 303 | 304 | slli a4, a7, 24 // rotate 305 | srli a1, a7, 8 306 | or a1, a1, a4 307 | 308 | saes32_encs T0, T0, A1, 0 309 | saes32_encs T0, T0, A1, 1 310 | saes32_encs T0, T0, A1, 2 311 | saes32_encs T0, T0, A1, 3 312 | 313 | xor t1, t1, t0 314 | xor t2, t2, t1 315 | xor t3, t3, t2 316 | 317 | addi a0, a0, 32 318 | 319 | sw t0, 0(a0) // store round key 320 | sw t1, 4(a0) 321 | sw t2, 8(a0) 322 | sw t3, 12(a0) 323 | 324 | bne a0, a3, .ekl2 // final rk ? 325 | 326 | jr ra 327 | .size aes256_enc_key, .-aes256_enc_key 328 | 329 | // round constants 330 | 331 | .type .rcon, @object 332 | .rcon: 333 | .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 334 | .size .rcon, 10 335 | 336 | -------------------------------------------------------------------------------- /asm/saes32_wrap.h: -------------------------------------------------------------------------------- 1 | // saes32_wrap.h 2 | // 2019-10-23 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2019, PQShield Ltd. All rights reserved.
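// The ECB aliases defined below simply forward to the shared round
// functions with the right round count; for example (illustration only),
// aes128_enc_ecb(ct, pt, rk) expands to aes_enc_rounds(ct, pt, rk, 10).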
4 | 5 | // AES 128/192/256 block encryption and decryption (no dependencies) 6 | 7 | #ifndef _AES_WRAP_H_ 8 | #define _AES_WRAP_H_ 9 | 10 | #include <stdint.h> 11 | #include <stddef.h> 12 | 13 | // number of rounds 14 | #define AES128_ROUNDS 10 15 | #define AES192_ROUNDS 12 16 | #define AES256_ROUNDS 14 17 | 18 | // expanded key size 19 | #define AES128_RK_WORDS (4 * (AES128_ROUNDS + 1)) 20 | #define AES192_RK_WORDS (4 * (AES192_ROUNDS + 1)) 21 | #define AES256_RK_WORDS (4 * (AES256_ROUNDS + 1)) 22 | 23 | // === ENCRYPT === 24 | 25 | // set encryption key 26 | void aes128_enc_key(uint32_t rk[AES128_RK_WORDS], const uint8_t key[16]); 27 | void aes192_enc_key(uint32_t rk[AES192_RK_WORDS], const uint8_t key[24]); 28 | void aes256_enc_key(uint32_t rk[AES256_RK_WORDS], const uint8_t key[32]); 29 | 30 | // implementation 31 | void aes_enc_rounds(uint8_t ct[16], const uint8_t pt[16], 32 | const uint32_t rk[], int nr); 33 | 34 | // aliases 35 | #define aes128_enc_ecb(ct, pt, rk) aes_enc_rounds(ct, pt, rk, AES128_ROUNDS) 36 | #define aes192_enc_ecb(ct, pt, rk) aes_enc_rounds(ct, pt, rk, AES192_ROUNDS) 37 | #define aes256_enc_ecb(ct, pt, rk) aes_enc_rounds(ct, pt, rk, AES256_ROUNDS) 38 | 39 | // === DECRYPT === 40 | 41 | // set decryption key 42 | void aes128_dec_key(uint32_t rk[AES128_RK_WORDS], const uint8_t key[16]); 43 | void aes192_dec_key(uint32_t rk[AES192_RK_WORDS], const uint8_t key[24]); 44 | void aes256_dec_key(uint32_t rk[AES256_RK_WORDS], const uint8_t key[32]); 45 | 46 | void aes_dec_rounds(uint8_t pt[16], const uint8_t ct[16], 47 | const uint32_t rk[], int nr); 48 | 49 | // aliases 50 | #define aes128_dec_ecb(pt, ct, rk) aes_dec_rounds(pt, ct, rk, AES128_ROUNDS) 51 | #define aes192_dec_ecb(pt, ct, rk) aes_dec_rounds(pt, ct, rk, AES192_ROUNDS) 52 | #define aes256_dec_ecb(pt, ct, rk) aes_dec_rounds(pt, ct, rk, AES256_ROUNDS) 53 | 54 | #endif // _AES_WRAP_H_ 55 | -------------------------------------------------------------------------------- /asm/sm4_encdec.S: -------------------------------------------------------------------------------- 1 | // sm4_encdec.S 2 | // 2020-02-16 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // SM4 encryption, decryption, and key schedule using "lwaes" instructions. 6 | 7 | // macro definitions for the custom instruction 8 | #include "saes32_c0.h" 9 | 10 | .option nopic 11 | .text 12 | .align 2 13 | 14 | 15 | // Encrypt or decrypt a block, depending on round key ordering.
16 | // void sm4_encdec(uint8_t out[16], const uint8_t in[16], 17 | // const uint32_t rk[SM4_RK_WORDS]) 18 | // where: a0 = out, a1 = in, a2 = rk 19 | 20 | .globl sm4_encdec 21 | .type sm4_encdec, @function 22 | 23 | sm4_encdec: 24 | lw t0, 0(a1) 25 | lw t1, 4(a1) 26 | lw t2, 8(a1) 27 | lw t3, 12(a1) 28 | 29 | addi a3, a2, 128 30 | 31 | .loop: 32 | xor t4, t2, t3 33 | 34 | lw a1, 0(a2) 35 | xor a1, a1, t1 36 | xor a1, a1, t4 37 | 38 | ssm4_ed T0, T0, A1, 0 39 | ssm4_ed T0, T0, A1, 1 40 | ssm4_ed T0, T0, A1, 2 41 | ssm4_ed T0, T0, A1, 3 42 | 43 | lw a1, 4(a2) 44 | xor a1, a1, t0 45 | xor a1, a1, t4 46 | 47 | ssm4_ed T1, T1, A1, 0 48 | ssm4_ed T1, T1, A1, 1 49 | ssm4_ed T1, T1, A1, 2 50 | ssm4_ed T1, T1, A1, 3 51 | 52 | xor t4, t0, t1 53 | 54 | lw a1, 8(a2) 55 | xor a1, a1, t3 56 | xor a1, a1, t4 57 | 58 | ssm4_ed T2, T2, A1, 0 59 | ssm4_ed T2, T2, A1, 1 60 | ssm4_ed T2, T2, A1, 2 61 | ssm4_ed T2, T2, A1, 3 62 | 63 | lw a1, 12(a2) 64 | xor a1, a1, t2 65 | xor a1, a1, t4 66 | 67 | ssm4_ed T3, T3, A1, 0 68 | ssm4_ed T3, T3, A1, 1 69 | ssm4_ed T3, T3, A1, 2 70 | ssm4_ed T3, T3, A1, 3 71 | 72 | addi a2, a2, 16 73 | bne a3, a2, .loop 74 | 75 | sw t3, 0(a0) 76 | sw t2, 4(a0) 77 | sw t1, 8(a0) 78 | sw t0, 12(a0) 79 | 80 | jr ra 81 | .size sm4_encdec, .-sm4_encdec 82 | .align 2 83 | 84 | 85 | // Set key for encryption. 86 | // void sm4_enc_key_asm(uint32_t rk[32], const uint8_t key[16]) 87 | // where: a0 = rk, a1 = key 88 | 89 | .globl sm4_enc_key 90 | .type sm4_enc_key, @function 91 | sm4_enc_key: 92 | lw t0, 0(a1) 93 | lw t1, 4(a1) 94 | lw t2, 8(a1) 95 | lw t3, 12(a1) 96 | 97 | addi a2, a0, 128 98 | 99 | // "fk" constant 100 | 101 | li a1, 0xC6BAB000 102 | addi a1, a1, 0x1A3 103 | xor t0, t0, a1 104 | 105 | li a1, 0x5033A800 106 | addi a1, a1, 0x256 107 | xor t1, t1, a1 108 | 109 | li a1, 0x97917800 110 | addi a1, a1, 0x567 111 | xor t2, t2, a1 112 | 113 | li a1, 0xDC227000 114 | addi a1, a1, 0x0B2 115 | xor t3, t3, a1 116 | 117 | // constants for "ck" generation 118 | 119 | li a3, 0x140E0000 120 | addi a3, a3, 0x600 121 | 122 | li a4, 0x1C1C1800 123 | addi a4, a4, 0x41C 124 | 125 | li a5, 0xFEFEF800 126 | addi a5, a5, 0x6FE 127 | 128 | li a6, 0x01000000 129 | addi a6, a6, 0x100 130 | 131 | .ekl: 132 | xor t4, t2, t3 133 | xor a1, t1, a3 134 | xor a1, a1, t4 135 | xor a1, a1, a6 136 | 137 | add a3, a3, a4 138 | and a3, a3, a5 139 | 140 | ssm4_ks T0, T0, A1, 0 141 | ssm4_ks T0, T0, A1, 1 142 | ssm4_ks T0, T0, A1, 2 143 | ssm4_ks T0, T0, A1, 3 144 | sw t0, 0(a0) 145 | 146 | xor a1, a3, t0 147 | xor a1, a1, t4 148 | xor a1, a1, a6 149 | 150 | add a3, a3, a4 151 | and a3, a3, a5 152 | 153 | ssm4_ks T1, T1, A1, 0 154 | ssm4_ks T1, T1, A1, 1 155 | ssm4_ks T1, T1, A1, 2 156 | ssm4_ks T1, T1, A1, 3 157 | sw t1, 4(a0) 158 | 159 | xor t4, t0, t1 160 | xor a1, t3, a3 161 | xor a1, a1, t4 162 | xor a1, a1, a6 163 | 164 | add a3, a3, a4 165 | and a3, a3, a5 166 | 167 | ssm4_ks T2, T2, A1, 0 168 | ssm4_ks T2, T2, A1, 1 169 | ssm4_ks T2, T2, A1, 2 170 | ssm4_ks T2, T2, A1, 3 171 | sw t2, 8(a0) 172 | 173 | xor a1, a3, t2 174 | xor a1, a1, t4 175 | xor a1, a1, a6 176 | 177 | add a3, a3, a4 178 | and a3, a3, a5 179 | 180 | ssm4_ks T3, T3, A1, 0 181 | ssm4_ks T3, T3, A1, 1 182 | ssm4_ks T3, T3, A1, 2 183 | ssm4_ks T3, T3, A1, 3 184 | sw t3, 12(a0) 185 | 186 | addi a0, a0, 16 187 | 188 | bne a2, a0, .ekl 189 | 190 | jr ra 191 | .size sm4_enc_key, .-sm4_enc_key 192 | .align 2 193 | 194 | 195 | // Set key for decryption. 
196 | // void sm4_dec_key(uint32_t rk[32], const uint8_t key[16]) 197 | // where: a0 = rk, a1 = key 198 | 199 | .globl sm4_dec_key 200 | .type sm4_dec_key, @function 201 | 202 | sm4_dec_key: 203 | addi sp, sp, -8 // generate a forward key 204 | sw s0, 0(sp) 205 | sw ra, 4(sp) 206 | mv s0, a0 207 | call sm4_enc_key 208 | mv a0, s0 209 | lw s0, 0(sp) 210 | lw ra, 4(sp) 211 | addi sp, sp, 8 212 | 213 | addi a5, a0, 124 // flip order 214 | addi a2, a0, 64 215 | .dkl: 216 | lw a3, 0(a5) 217 | lw a4, 0(a0) 218 | addi a0, a0, 4 219 | addi a5, a5, -4 220 | sw a3, -4(a0) 221 | sw a4, 4(a5) 222 | bne a0, a2, .dkl 223 | 224 | jr ra 225 | .size sm4_dec_key, .-sm4_dec_key 226 | 227 | -------------------------------------------------------------------------------- /asm/sm4_encdec.h: -------------------------------------------------------------------------------- 1 | // sm4_encdec.h 2 | // 2020-01-24 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // Prototypes for SM4 (Chinese Encryption Standard) Encryption. 6 | 7 | // The decryption function is the same as encryption, with the difference 8 | // of having a reversed key schedule. Hence we define both functions here. 9 | 10 | #ifndef _SM4_ENCDEC_H_ 11 | #define _SM4_ENCDEC_H_ 12 | 13 | #include <stdint.h> 14 | 15 | // Size of the expanded key. 16 | #define SM4_RK_WORDS 32 17 | 18 | // encrypt/decrypt a block, depending on ordering of rk 19 | void sm4_encdec(uint8_t out[16], const uint8_t in[16], 20 | const uint32_t rk[SM4_RK_WORDS]); 21 | 22 | // expand a secret key for encryption 23 | void sm4_enc_key(uint32_t rk[SM4_RK_WORDS], const uint8_t key[16]); 24 | 25 | // expand a secret key for decryption 26 | void sm4_dec_key(uint32_t rk[SM4_RK_WORDS], const uint8_t key[16]); 27 | 28 | // aliases 29 | #define sm4_enc_ecb(ct, pt, rk) sm4_encdec(ct, pt, rk) 30 | #define sm4_dec_ecb(pt, ct, rk) sm4_encdec(pt, ct, rk) 31 | 32 | #endif /* _SM4_ENCDEC_H_ */ 33 | -------------------------------------------------------------------------------- /bitmanip.c: -------------------------------------------------------------------------------- 1 | // bitmanip.c 2 | // 2020-03-07 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
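// Carryless multiplication works like integer multiplication with XOR in
// place of addition, i.e. polynomial multiplication over GF(2). A small
// worked example: clmul(0b101, 0b011) = 0b101 ^ (0b101 << 1) = 0b1111,
// which is (x^2 + 1)(x + 1) = x^3 + x^2 + x + 1. clmulh returns the high
// half of the double-width product; clmulr returns the product shifted
// right by XLEN-1 bits (the "reversed" variant).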
4 | 5 | // instruction emulation code -- these are all from bitmanip 6 | 7 | #include "bitmanip.h" 8 | 9 | // carryless multiply 10 | 11 | uint32_t rv32b_clmul(uint32_t rs1, uint32_t rs2) 12 | { 13 | uint32_t x = 0; 14 | for (int i = 0; i < 32; i++) 15 | if ((rs2 >> i) & 1) 16 | x ^= rs1 << i; 17 | return x; 18 | } 19 | 20 | uint32_t rv32b_clmulh(uint32_t rs1, uint32_t rs2) 21 | { 22 | uint32_t x = 0; 23 | for (int i = 1; i < 32; i++) 24 | if ((rs2 >> i) & 1) 25 | x ^= rs1 >> (32 - i); 26 | return x; 27 | } 28 | 29 | uint32_t rv32b_clmulr(uint32_t rs1, uint32_t rs2) 30 | { 31 | uint32_t x = 0; 32 | for (int i = 0; i < 32; i++) 33 | if ((rs2 >> i) & 1) 34 | x ^= rs1 >> (32 - i - 1); 35 | return x; 36 | } 37 | 38 | // 64-bit 39 | 40 | uint64_t rv64b_clmul(uint64_t rs1, uint64_t rs2) 41 | { 42 | uint64_t x = 0; 43 | for (int i = 0; i < 64; i++) 44 | if ((rs2 >> i) & 1) 45 | x ^= rs1 << i; 46 | return x; 47 | } 48 | 49 | uint64_t rv64b_clmulh(uint64_t rs1, uint64_t rs2) 50 | { 51 | uint64_t x = 0; 52 | for (int i = 1; i < 64; i++) 53 | if ((rs2 >> i) & 1) 54 | x ^= rs1 >> (64 - i); 55 | return x; 56 | } 57 | 58 | uint64_t rv64b_clmulr(uint64_t rs1, uint64_t rs2) 59 | { 60 | uint64_t x = 0; 61 | for (int i = 0; i < 64; i++) 62 | if ((rs2 >> i) & 1) 63 | x ^= rs1 >> (64 - i - 1); 64 | return x; 65 | } 66 | 67 | // rotate right ROR / RORI 68 | 69 | uint32_t rv32b_ror(uint32_t rs1, uint32_t rs2) 70 | { 71 | int shamt = rs2 & (32 - 1); 72 | return (rs1 >> shamt) | (rs1 << ((32 - shamt) & (32 - 1))); 73 | } 74 | 75 | uint64_t rv64b_ror(uint64_t rs1, uint64_t rs2) 76 | { 77 | int shamt = rs2 & (64 - 1); 78 | return (rs1 >> shamt) | (rs1 << ((64 - shamt) & (64 - 1))); 79 | } 80 | 81 | // and with negate ANDN 82 | 83 | uint64_t rv32b_andn(uint32_t rs1, uint32_t rs2) 84 | { 85 | return rs1 & ~rs2; 86 | } 87 | 88 | uint64_t rv64b_andn(uint64_t rs1, uint64_t rs2) 89 | { 90 | return rs1 & ~rs2; 91 | } 92 | 93 | // generalized reverse GREV / GREVI 94 | 95 | uint32_t rv32b_grev(uint32_t rs1, uint32_t rs2) 96 | { 97 | uint32_t x = rs1; 98 | int shamt = rs2 & 31; 99 | if (shamt & 1) 100 | x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1); 101 | if (shamt & 2) 102 | x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2); 103 | if (shamt & 4) 104 | x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4); 105 | if (shamt & 8) 106 | x = ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8); 107 | if (shamt & 16) 108 | x = ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16); 109 | return x; 110 | } 111 | 112 | uint64_t rv64b_grev(uint64_t rs1, uint64_t rs2) 113 | { 114 | uint64_t x = rs1; 115 | int shamt = rs2 & 63; 116 | if (shamt & 1) 117 | x = ((x & 0x5555555555555555LL) << 1) | 118 | ((x & 0xAAAAAAAAAAAAAAAALL) >> 1); 119 | if (shamt & 2) 120 | x = ((x & 0x3333333333333333LL) << 2) | 121 | ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2); 122 | if (shamt & 4) 123 | x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) | 124 | ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4); 125 | if (shamt & 8) 126 | x = ((x & 0x00FF00FF00FF00FFLL) << 8) | 127 | ((x & 0xFF00FF00FF00FF00LL) >> 8); 128 | if (shamt & 16) 129 | x = ((x & 0x0000FFFF0000FFFFLL) << 16) | 130 | ((x & 0xFFFF0000FFFF0000LL) >> 16); 131 | if (shamt & 32) 132 | x = ((x & 0x00000000FFFFFFFFLL) << 32) | 133 | ((x & 0xFFFFFFFF00000000LL) >> 32); 134 | return x; 135 | } 136 | 137 | // 32-bit helper for SHFL/UNSHFL 138 | 139 | static inline uint32_t shuffle32_stage(uint32_t src, uint32_t ml, 140 | uint32_t mr, int n) 141 | { 142 | uint32_t x = src & ~(ml | mr); 143 | x |= ((src << n) & ml) | ((src >> n) & 
mr); 144 | return x; 145 | } 146 | 147 | // generalized shuffle SHFL / SHFLI 148 | 149 | uint32_t rv32b_shfl(uint32_t rs1, uint32_t rs2) 150 | { 151 | uint32_t x = rs1; 152 | int shamt = rs2 & 15; 153 | 154 | if (shamt & 8) 155 | x = shuffle32_stage(x, 0x00FF0000, 0x0000FF00, 8); 156 | if (shamt & 4) 157 | x = shuffle32_stage(x, 0x0F000F00, 0x00F000F0, 4); 158 | if (shamt & 2) 159 | x = shuffle32_stage(x, 0x30303030, 0x0C0C0C0C, 2); 160 | if (shamt & 1) 161 | x = shuffle32_stage(x, 0x44444444, 0x22222222, 1); 162 | 163 | return x; 164 | } 165 | 166 | // generalized unshuffle UNSHFL / UNSHFLI 167 | 168 | uint32_t rv32b_unshfl(uint32_t rs1, uint32_t rs2) 169 | { 170 | uint32_t x = rs1; 171 | int shamt = rs2 & 15; 172 | 173 | if (shamt & 1) 174 | x = shuffle32_stage(x, 0x44444444, 0x22222222, 1); 175 | if (shamt & 2) 176 | x = shuffle32_stage(x, 0x30303030, 0x0C0C0C0C, 2); 177 | if (shamt & 4) 178 | x = shuffle32_stage(x, 0x0F000F00, 0x00F000F0, 4); 179 | if (shamt & 8) 180 | x = shuffle32_stage(x, 0x00FF0000, 0x0000FF00, 8); 181 | 182 | return x; 183 | } 184 | 185 | 186 | // 64-bit helper for SHFLW/UNSHFLW 187 | 188 | static inline uint64_t shuffle64_stage(uint64_t src, uint64_t ml, 189 | uint64_t mr, int n) 190 | { 191 | uint64_t x = src & ~(ml | mr); 192 | x |= ((src << n) & ml) | ((src >> n) & mr); 193 | return x; 194 | } 195 | 196 | // generalized shuffle SHFLW 197 | 198 | uint64_t rv64b_shfl(uint64_t rs1, uint64_t rs2) 199 | { 200 | uint64_t x = rs1; 201 | int shamt = rs2 & 31; 202 | 203 | if (shamt & 16) 204 | x = shuffle64_stage(x, 0x0000FFFF00000000LL, 0x00000000FFFF0000LL, 16); 205 | if (shamt & 8) 206 | x = shuffle64_stage(x, 0x00FF000000FF0000LL, 0x0000FF000000FF00LL, 8); 207 | if (shamt & 4) 208 | x = shuffle64_stage(x, 0x0F000F000F000F00LL, 0x00F000F000F000F0LL, 4); 209 | if (shamt & 2) 210 | x = shuffle64_stage(x, 0x3030303030303030LL, 0x0C0C0C0C0C0C0C0CLL, 2); 211 | if (shamt & 1) 212 | x = shuffle64_stage(x, 0x4444444444444444LL, 0x2222222222222222LL, 1); 213 | 214 | return x; 215 | } 216 | 217 | // generalized unshuffle UNSHFLW 218 | 219 | uint64_t rv64b_unshfl(uint64_t rs1, uint64_t rs2) 220 | { 221 | uint64_t x = rs1; 222 | int shamt = rs2 & 31; 223 | 224 | if (shamt & 1) 225 | x = shuffle64_stage(x, 0x4444444444444444LL, 0x2222222222222222LL, 1); 226 | if (shamt & 2) 227 | x = shuffle64_stage(x, 0x3030303030303030LL, 0x0C0C0C0C0C0C0C0CLL, 2); 228 | if (shamt & 4) 229 | x = shuffle64_stage(x, 0x0F000F000F000F00LL, 0x00F000F000F000F0LL, 4); 230 | if (shamt & 8) 231 | x = shuffle64_stage(x, 0x00FF000000FF0000LL, 0x0000FF000000FF00LL, 8); 232 | if (shamt & 16) 233 | x = shuffle64_stage(x, 0x0000FFFF00000000LL, 0x00000000FFFF0000LL, 16); 234 | 235 | return x; 236 | } 237 | -------------------------------------------------------------------------------- /bitmanip.h: -------------------------------------------------------------------------------- 1 | // bitmanip.h 2 | // 2020-03-07 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // Prototypes for bitmanip instruction emulation code. 6 | // -- intended to be replaced with intrinsics. 
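// On a core that implements these instructions, each emulation routine
// below collapses to a single instruction. A hypothetical inline-assembly
// binding (toolchain mnemonic support assumed; not part of this repo)
// might look like:
//
//   static inline uint32_t rv32b_ror(uint32_t rs1, uint32_t rs2)
//   {
//       uint32_t rd;
//       __asm__ ("ror %0, %1, %2" : "=r" (rd) : "r" (rs1), "r" (rs2));
//       return rd;
//   }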
7 | 8 | #ifndef _BITMANIP_H_ 9 | #define _BITMANIP_H_ 10 | 11 | #include <stdint.h> 12 | 13 | // rotate right ROR / RORI 14 | uint32_t rv32b_ror(uint32_t rs1, uint32_t rs2); 15 | uint64_t rv64b_ror(uint64_t rs1, uint64_t rs2); 16 | 17 | // and with negate ANDN 18 | uint64_t rv32b_andn(uint32_t rs1, uint32_t rs2); 19 | uint64_t rv64b_andn(uint64_t rs1, uint64_t rs2); 20 | 21 | // generalized reverse GREV / GREVI 22 | uint32_t rv32b_grev(uint32_t rs1, uint32_t rs2); 23 | uint64_t rv64b_grev(uint64_t rs1, uint64_t rs2); 24 | 25 | // generalized shuffle SHFL / SHFLI 26 | uint32_t rv32b_shfl(uint32_t rs1, uint32_t rs2); 27 | uint64_t rv64b_shfl(uint64_t rs1, uint64_t rs2); 28 | 29 | // generalized unshuffle UNSHFL / UNSHFLI 30 | uint32_t rv32b_unshfl(uint32_t rs1, uint32_t rs2); 31 | uint64_t rv64b_unshfl(uint64_t rs1, uint64_t rs2); 32 | 33 | // carryless multiply 34 | uint32_t rv32b_clmul(uint32_t rs1, uint32_t rs2); 35 | uint32_t rv32b_clmulh(uint32_t rs1, uint32_t rs2); 36 | uint32_t rv32b_clmulr(uint32_t rs1, uint32_t rs2); 37 | 38 | uint64_t rv64b_clmul(uint64_t rs1, uint64_t rs2); 39 | uint64_t rv64b_clmulh(uint64_t rs1, uint64_t rs2); 40 | uint64_t rv64b_clmulr(uint64_t rs1, uint64_t rs2); 41 | 42 | #endif // _BITMANIP_H_ 43 | -------------------------------------------------------------------------------- /doc/NIST.FIPS.197.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjosaarinen/lwaes_isa/75277ed816e7b11fd6f4f4df62ba44993fb1f94f/doc/NIST.FIPS.197.pdf -------------------------------------------------------------------------------- /doc/gmt0002-2012sm4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjosaarinen/lwaes_isa/75277ed816e7b11fd6f4f4df62ba44993fb1f94f/doc/gmt0002-2012sm4.pdf -------------------------------------------------------------------------------- /doc/lwaes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjosaarinen/lwaes_isa/75277ed816e7b11fd6f4f4df62ba44993fb1f94f/doc/lwaes.pdf -------------------------------------------------------------------------------- /doc/sm4en.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjosaarinen/lwaes_isa/75277ed816e7b11fd6f4f4df62ba44993fb1f94f/doc/sm4en.pdf -------------------------------------------------------------------------------- /doc/sp800-38d.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjosaarinen/lwaes_isa/75277ed816e7b11fd6f4f4df62ba44993fb1f94f/doc/sp800-38d.pdf -------------------------------------------------------------------------------- /doc/test_gcm_ossl.c: -------------------------------------------------------------------------------- 1 | // test_gcm_ossl.c 2 | // 2020-03-24 Markku-Juhani O.
Saarinen 3 | 4 | // test GCM against OpenSSL (to increase coverage) 5 | 6 | #include <stdio.h> 7 | #include <stdlib.h> 8 | #include <string.h> 9 | #include <time.h> 10 | 11 | #include <openssl/evp.h> 12 | #include "aes_gcm.h" 13 | 14 | 15 | static int gcm_encrypt(uint8_t * plaintext, int plaintext_len, 16 | // uint8_t * aad, int aad_len, 17 | uint8_t * key, 18 | uint8_t * iv, int iv_len, uint8_t * ciphertext, 19 | uint8_t * tag) 20 | { 21 | EVP_CIPHER_CTX *ctx; 22 | 23 | int len; 24 | int ciphertext_len; 25 | 26 | if (!(ctx = EVP_CIPHER_CTX_new())) 27 | return 0; 28 | 29 | if (1 != EVP_EncryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL)) 30 | return 0; 31 | 32 | if (1 != EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL)) 33 | return 0; 34 | 35 | if (1 != EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv)) 36 | return 0; 37 | 38 | /* 39 | if (1 != EVP_EncryptUpdate(ctx, NULL, &len, aad, aad_len)) 40 | return 0; 41 | */ 42 | if (1 != 43 | EVP_EncryptUpdate(ctx, ciphertext, &len, plaintext, plaintext_len)) 44 | return 0; 45 | ciphertext_len = len; 46 | 47 | if (1 != EVP_EncryptFinal_ex(ctx, ciphertext + len, &len)) 48 | return 0; 49 | ciphertext_len += len; 50 | 51 | if (1 != EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, 16, tag)) 52 | return 0; 53 | 54 | EVP_CIPHER_CTX_free(ctx); 55 | 56 | return ciphertext_len; 57 | } 58 | 59 | static void hexvar(const uint8_t * v, size_t len, const char *lab) 60 | { 61 | size_t i; 62 | 63 | printf("%s", lab); 64 | for (i = 0; i < len; i++) 65 | printf("%02X", v[i]); 66 | printf("\n"); 67 | } 68 | 69 | static void rndvar(uint8_t * v, size_t len) 70 | { 71 | size_t i; 72 | 73 | for (i = 0; i < len; i++) 74 | v[i] = random(); 75 | 76 | } 77 | 78 | int test_gcm_ossl() 79 | { 80 | int l, l1; 81 | uint8_t k[16], iv[12], p[1024], c1[1024], c2[1024]; 82 | int fail = 0; 83 | 84 | srandom(time(NULL)); 85 | 86 | for (l = 0; l < 1000; l++) { 87 | 88 | putchar('.'); 89 | 90 | memset(c1, 0, l + 16); 91 | memset(c2, 0, l + 16); 92 | 93 | rndvar(k, 16); 94 | rndvar(iv, 12); 95 | rndvar(p, l); 96 | 97 | l1 = gcm_encrypt(p, l, k, iv, 12, c1, c1 + l); 98 | aes128_enc_gcm(c2, p, l, k, iv); 99 | 100 | if (l1 != l || memcmp(c1, c2, l + 16) != 0) { 101 | printf(" [FAIL] l=%d\n", l); 102 | hexvar(k, 16, "K\t"); 103 | hexvar(iv, 12, "IV\t"); 104 | hexvar(p, l, "P\t"); 105 | hexvar(c1, l1 + 16, "C1\t"); 106 | hexvar(c2, l + 16, "C2\t"); 107 | fail++; 108 | } 109 | } 110 | printf("\n"); 111 | 112 | return fail; 113 | } 114 | -------------------------------------------------------------------------------- /gcm_gfmul.h: -------------------------------------------------------------------------------- 1 | // gcm_gfmul.h 2 | // 2020-03-23 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
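// Background: GHASH arithmetic is in GF(2^128) reduced by the polynomial
// x^128 + x^7 + x^2 + x + 1 (NIST SP 800-38D), with bits in reflected
// order -- hence the bit-reversal helpers below and the constant 0x87
// (= x^7 + x^2 + x + 1) used for reduction in the multipliers.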
4 | 5 | // a minimal interface to core GHASH finite field operations 6 | 7 | #ifndef _GCM_GFMUL_H_ 8 | #define _GCM_GFMUL_H_ 9 | 10 | #include <stdint.h> 11 | 12 | // A GF(2^128) element type -- just for alignment and to avoid casts 13 | 14 | typedef union { 15 | uint8_t b[16]; 16 | uint32_t w[4]; 17 | uint64_t d[2]; 18 | } gf128_t; 19 | 20 | // bit reversal, 32-bit variants (rv32_ghash.c) 21 | void rv32_ghash_rev(gf128_t * z); 22 | 23 | // 32-bit compact version (rv32_ghash.c) 24 | void rv32_ghash_mul(gf128_t * z, const gf128_t * x, const gf128_t * h); 25 | 26 | // 32-bit Karatsuba version (rv32_ghash.c) 27 | void rv32_ghash_mul_kar(gf128_t * z, const gf128_t * x, const gf128_t * h); 28 | 29 | // bit reversal, 64-bit variant (rv64_ghash.c) 30 | void rv64_ghash_rev(gf128_t * z); 31 | 32 | // 64-bit version (Karatsuba optional) (rv64_ghash.c) 33 | void rv64_ghash_mul(gf128_t * z, const gf128_t * x, const gf128_t * h); 34 | 35 | // Function pointers so that different versions can be tested. (aes_gcm.c) 36 | 37 | // reverse bits in bytes of a 128-bit block; do this for h and final value 38 | extern void (*ghash_rev)(gf128_t * z); 39 | 40 | // finite field multiply z = ( z ^ rev(x) ) * h 41 | extern void (*ghash_mul)(gf128_t * z, const gf128_t * x, const gf128_t * h); 42 | 43 | #endif // _GCM_GFMUL_H_ 44 | -------------------------------------------------------------------------------- /gcm_rv32b_gfmul.c: -------------------------------------------------------------------------------- 1 | // gcm_rv32b_gfmul.c 2 | // 2020-03-23 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // 32-bit GHASH bit-reverse and multiplication for GCM 6 | 7 | #include "gcm_gfmul.h" 8 | #include "bitmanip.h" 9 | 10 | // disable shift reduction 11 | #define NO_SHIFTRED 12 | 13 | // reverse bits in bytes of a 128-bit block; do this for h and final value 14 | 15 | void rv32_ghash_rev(gf128_t * z) 16 | { 17 | z->w[0] = rv32b_grev(z->w[0], 7); 18 | z->w[1] = rv32b_grev(z->w[1], 7); 19 | z->w[2] = rv32b_grev(z->w[2], 7); 20 | z->w[3] = rv32b_grev(z->w[3], 7); 21 | } 22 | 23 | // multiply z = ( z ^ rev(x) ) * h 24 | // 32-bit compact loop version 25 | 26 | void rv32_ghash_mul(gf128_t * z, const gf128_t * x, const gf128_t * h) 27 | { 28 | int i; 29 | uint32_t x0, x1, x2, x3, y; 30 | uint32_t z0, z1, z2, z3, z4; 31 | uint32_t t0, t1, t2; 32 | 33 | x0 = x->w[0]; // new data 34 | x1 = x->w[1]; 35 | x2 = x->w[2]; 36 | x3 = x->w[3]; 37 | 38 | z0 = z->w[0]; // inline to avoid these loads 39 | z1 = z->w[1]; 40 | z2 = z->w[2]; 41 | z3 = z->w[3]; 42 | 43 | // 4 x GREV 44 | x0 = rv32b_grev(x0, 7); // reverse input x only 45 | x1 = rv32b_grev(x1, 7); 46 | x2 = rv32b_grev(x2, 7); 47 | x3 = rv32b_grev(x3, 7); 48 | 49 | // 4 x XOR 50 | x0 = x0 ^ z0; // z is kept unreversed 51 | x1 = x1 ^ z1; 52 | x2 = x2 ^ z2; 53 | x3 = x3 ^ z3; 54 | 55 | // 4 x CLMULH, 4 x CLMUL, 3 x XOR 56 | y = h->w[3]; // start from highest word 57 | z4 = rv32b_clmulh(x3, y); 58 | z3 = rv32b_clmul(x3, y); 59 | t1 = rv32b_clmulh(x2, y); 60 | z2 = rv32b_clmul(x2, y); 61 | z3 = z3 ^ t1; 62 | t1 = rv32b_clmulh(x1, y); 63 | z1 = rv32b_clmul(x1, y); 64 | z2 = z2 ^ t1; 65 | t1 = rv32b_clmulh(x0, y); 66 | z0 = rv32b_clmul(x0, y); 67 | z1 = z1 ^ t1; 68 | 69 | #ifdef NO_SHIFTRED 70 | // Mul reduction: 1 x CLMULH, 1 x CLMUL, 2 x XOR 71 | t1 = rv32b_clmulh(z4, 0x87); 72 | t0 = rv32b_clmul(z4, 0x87); 73 | z1 = z1 ^ t1; 74 | z0 = z0 ^ t0; 75 | #else 76 | // Shift reduction: 6 x SHIFT, 7 x XOR 77 | z1 = z1 ^ (z4 >> 31) ^ (z4 >> 30) ^ (z4 >> 25); 78
| z0 = z0 ^ z4 ^ (z4 << 1) ^ (z4 << 2) ^ (z4 << 7); 79 | #endif 80 | 81 | // repeat 3 times 82 | for (i = 2; i >= 0; i--) { // towards less significant 83 | 84 | y = h->w[i]; // unroll this if you like 85 | 86 | // 4 x CLMULH, 4 x CLMUL, 7 x XOR 87 | t1 = rv32b_clmulh(x3, y); 88 | t0 = rv32b_clmul(x3, y); 89 | z4 = z3 ^ t1; 90 | t1 = rv32b_clmulh(x2, y); 91 | t2 = rv32b_clmul(x2, y); 92 | z3 = z2 ^ t0 ^ t1; 93 | t1 = rv32b_clmulh(x1, y); 94 | t0 = rv32b_clmul(x1, y); 95 | z2 = z1 ^ t1 ^ t2; 96 | t1 = rv32b_clmulh(x0, y); 97 | t2 = rv32b_clmul(x0, y); 98 | z1 = z0 ^ t0 ^ t1; 99 | 100 | #ifdef NO_SHIFTRED 101 | // Mul reduction: 1 x CLMULH, 1 x CLMUL, 2 x XOR 102 | t1 = rv32b_clmulh(z4, 0x87); 103 | t0 = rv32b_clmul(z4, 0x87); 104 | z1 = z1 ^ t1; 105 | z0 = t2 ^ t0; 106 | #else 107 | // Shift reduction: 6 x SHIFT, 7 x XOR 108 | z1 = z1 ^ (z4 >> 31) ^ (z4 >> 30) ^ (z4 >> 25); 109 | z0 = t2 ^ z4 ^ (z4 << 1) ^ (z4 << 2) ^ (z4 << 7); 110 | #endif 111 | 112 | } 113 | 114 | z->w[0] = z0; // inline to remove store 115 | z->w[1] = z1; 116 | z->w[2] = z2; 117 | z->w[3] = z3; 118 | } 119 | 120 | // multiply z = ( z ^ rev(x) ) * h 121 | // 32-bit Karatsuba version 122 | 123 | void rv32_ghash_mul_kar(gf128_t * z, const gf128_t * x, const gf128_t * h) 124 | { 125 | uint32_t x0, x1, x2, x3, y0, y1, y2, y3; 126 | uint32_t z0, z1, z2, z3, z4, z5, z6, z7; 127 | uint32_t t0, t1, t2, t3; 128 | 129 | x0 = x->w[0]; // load new data 130 | x1 = x->w[1]; 131 | x2 = x->w[2]; 132 | x3 = x->w[3]; 133 | 134 | z0 = z->w[0]; // inline to avoid these loads 135 | z1 = z->w[1]; 136 | z2 = z->w[2]; 137 | z3 = z->w[3]; 138 | 139 | y0 = h->w[0]; // y is untouched 140 | y1 = h->w[1]; 141 | y2 = h->w[2]; 142 | y3 = h->w[3]; 143 | 144 | // 4 x GREV 145 | x0 = rv32b_grev(x0, 7); // reverse input x only 146 | x1 = rv32b_grev(x1, 7); 147 | x2 = rv32b_grev(x2, 7); 148 | x3 = rv32b_grev(x3, 7); 149 | 150 | // 4 x XOR 151 | x0 = x0 ^ z0; // z is updated 152 | x1 = x1 ^ z1; 153 | x2 = x2 ^ z2; 154 | x3 = x3 ^ z3; 155 | 156 | // 2-level Karatsuba multiplication 157 | // 9 x CLMULH, 9 x CLMUL, 40 x XOR 158 | 159 | z7 = rv32b_clmulh(x3, y3); // high pair 160 | z6 = rv32b_clmul(x3, y3); 161 | z5 = rv32b_clmulh(x2, y2); 162 | z4 = rv32b_clmul(x2, y2); 163 | t0 = x2 ^ x3; 164 | t2 = y2 ^ y3; 165 | t1 = rv32b_clmulh(t0, t2); 166 | t0 = rv32b_clmul(t0, t2); 167 | t1 = t1 ^ z5 ^ z7; 168 | t0 = t0 ^ z4 ^ z6; 169 | z6 = z6 ^ t1; 170 | z5 = z5 ^ t0; 171 | 172 | z3 = rv32b_clmulh(x1, y1); // low pair 173 | z2 = rv32b_clmul(x1, y1); 174 | z1 = rv32b_clmulh(x0, y0); 175 | z0 = rv32b_clmul(x0, y0); 176 | t0 = x0 ^ x1; 177 | t2 = y0 ^ y1; 178 | t1 = rv32b_clmulh(t0, t2); 179 | t0 = rv32b_clmul(t0, t2); 180 | t1 = t1 ^ z1 ^ z3; 181 | t0 = t0 ^ z0 ^ z2; 182 | z2 = z2 ^ t1; 183 | z1 = z1 ^ t0; 184 | 185 | t3 = y1 ^ y3; // split 186 | t2 = y0 ^ y2; 187 | t1 = x1 ^ x3; 188 | t0 = x0 ^ x2; 189 | 190 | x3 = rv32b_clmulh(t1, t3); // middle 191 | x2 = rv32b_clmul(t1, t3); 192 | x1 = rv32b_clmulh(t0, t2); 193 | x0 = rv32b_clmul(t0, t2); 194 | 195 | t0 = t0 ^ t1; 196 | t2 = t2 ^ t3; 197 | t1 = rv32b_clmulh(t0, t2); 198 | t0 = rv32b_clmul(t0, t2); 199 | t1 = t1 ^ x1 ^ x3; 200 | t0 = t0 ^ x0 ^ x2; 201 | x2 = x2 ^ t1; 202 | x1 = x1 ^ t0; 203 | 204 | x3 = x3 ^ z3 ^ z7; // finalize 205 | x2 = x2 ^ z2 ^ z6; 206 | x1 = x1 ^ z1 ^ z5; 207 | x0 = x0 ^ z0 ^ z4; 208 | z5 = z5 ^ x3; 209 | z4 = z4 ^ x2; 210 | z3 = z3 ^ x1; 211 | z2 = z2 ^ x0; 212 | 213 | // == REDUCTION == 214 | 215 | #ifdef NO_SHIFTRED 216 | // Mul reduction: 4 x CLMULH, 4 x CLMUL, 8 x XOR 217 | t1 = 
rv32b_clmulh(z7, 0x87); 218 | t0 = rv32b_clmul(z7, 0x87); 219 | z4 = z4 ^ t1; 220 | z3 = z3 ^ t0; 221 | t1 = rv32b_clmulh(z6, 0x87); 222 | t0 = rv32b_clmul(z6, 0x87); 223 | z3 = z3 ^ t1; 224 | z2 = z2 ^ t0; 225 | t1 = rv32b_clmulh(z5, 0x87); 226 | t0 = rv32b_clmul(z5, 0x87); 227 | z2 = z2 ^ t1; 228 | z1 = z1 ^ t0; 229 | t1 = rv32b_clmulh(z4, 0x87); 230 | t0 = rv32b_clmul(z4, 0x87); 231 | z1 = z1 ^ t1; 232 | z0 = z0 ^ t0; 233 | #else 234 | // Shift reduction: 24 x SHIFT, 28 x XOR 235 | z4 = z4 ^ (z7 >> 31) ^ (z7 >> 30) ^ (z7 >> 25); 236 | z3 = z3 ^ z7 ^ (z7 << 1) ^ (z7 << 2) ^ (z7 << 7) ^ 237 | (z6 >> 31) ^ (z6 >> 30) ^ (z6 >> 25); 238 | z2 = z2 ^ z6 ^ (z6 << 1) ^ (z6 << 2) ^ (z6 << 7) ^ 239 | (z5 >> 31) ^ (z5 >> 30) ^ (z5 >> 25); 240 | z1 = z1 ^ z5 ^ (z5 << 1) ^ (z5 << 2) ^ (z5 << 7) ^ 241 | (z4 >> 31) ^ (z4 >> 30) ^ (z4 >> 25); 242 | z0 = z0 ^ z4 ^ (z4 << 1) ^ (z4 << 2) ^ (z4 << 7); 243 | #endif 244 | 245 | z->w[0] = z0; // inline to remove store 246 | z->w[1] = z1; 247 | z->w[2] = z2; 248 | z->w[3] = z3; 249 | } 250 | -------------------------------------------------------------------------------- /gcm_rv64b_gfmul.c: -------------------------------------------------------------------------------- 1 | // gcm_rv64b_gfmul.c 2 | // 2020-03-23 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // 64-bit GHASH bit-reverse and multiplication for GCM 6 | 7 | #include "gcm_gfmul.h" 8 | #include "bitmanip.h" 9 | 10 | // disable shift reduction 11 | //#define NO_SHIFTRED 12 | // disable karatsuba multiplication 13 | //#define NO_KARATSUBA 14 | 15 | // reverse bits in bytes of a 128-bit block; do this for h and final value 16 | 17 | void rv64_ghash_rev(gf128_t * z) 18 | { 19 | z->d[0] = rv64b_grev(z->d[0], 7); 20 | z->d[1] = rv64b_grev(z->d[1], 7); 21 | } 22 | 23 | // multiply z = ( z ^ rev(x) ) * h 24 | 25 | void rv64_ghash_mul(gf128_t * z, const gf128_t * x, const gf128_t * h) 26 | { 27 | uint64_t x0, x1, y0, y1; 28 | uint64_t z0, z1, z2, z3, t0, t1, t2; 29 | 30 | x0 = x->d[0]; // new input 31 | x1 = x->d[1]; 32 | 33 | z0 = z->d[0]; // inline to avoid these loads 34 | z1 = z->d[1]; 35 | 36 | y0 = h->d[0]; // h value already reversed 37 | y1 = h->d[1]; 38 | 39 | // 2 x GREVW, 2 x XOR 40 | x0 = rv64b_grev(x0, 7); // reverse input x only 41 | x1 = rv64b_grev(x1, 7); 42 | x0 = x0 ^ z0; // z is updated 43 | x1 = x1 ^ z1; 44 | 45 | #ifdef NO_KARATSUBA 46 | 47 | (void) t2; // unused 48 | 49 | // Without Karatsuba; 4 x CLMULH, 4 x CLMUL, 4 x XOR 50 | z3 = rv64b_clmulh(x1, y1); 51 | z2 = rv64b_clmul(x1, y1); 52 | t1 = rv64b_clmulh(x0, y1); 53 | z1 = rv64b_clmul(x0, y1); 54 | z2 = z2 ^ t1; 55 | t1 = rv64b_clmulh(x1, y0); 56 | t0 = rv64b_clmul(x1, y0); 57 | z2 = z2 ^ t1; 58 | z1 = z1 ^ t0; 59 | t1 = rv64b_clmulh(x0, y0); 60 | z0 = rv64b_clmul(x0, y0); 61 | z1 = z1 ^ t1; 62 | 63 | #else 64 | 65 | // With Karatsuba; 3 x CLMULH, 3 x CLMUL, 8 x XOR 66 | z3 = rv64b_clmulh(x1, y1); 67 | z2 = rv64b_clmul(x1, y1); 68 | z1 = rv64b_clmulh(x0, y0); 69 | z0 = rv64b_clmul(x0, y0); 70 | t0 = x0 ^ x1; 71 | t2 = y0 ^ y1; 72 | t1 = rv64b_clmulh(t0, t2); 73 | t0 = rv64b_clmul(t0, t2); 74 | t1 = t1 ^ z1 ^ z3; 75 | t0 = t0 ^ z0 ^ z2; 76 | z2 = z2 ^ t1; 77 | z1 = z1 ^ t0; 78 | 79 | #endif 80 | 81 | #ifdef NO_SHIFTRED 82 | 83 | // Mul reduction: 2 x CLMULH, 2 x CLMUL, 4 x XOR 84 | t1 = rv64b_clmulh(z3, 0x87); 85 | t0 = rv64b_clmul(z3, 0x87); 86 | z2 = z2 ^ t1; 87 | z1 = z1 ^ t0; 88 | t1 = rv64b_clmulh(z2, 0x87); 89 | t0 = rv64b_clmul(z2, 0x87); 90 | z1 = z1 ^ t1; 91 | z0 = z0 ^ t0; 92 
| 93 | #else 94 | 95 | // Shift reduction: 12 x SHIFT, 14 x XOR 96 | z2 = z2 ^ (z3 >> 63) ^ (z3 >> 62) ^ (z3 >> 57); 97 | z1 = z1 ^ z3 ^ (z3 << 1) ^ (z3 << 2) ^ (z3 << 7) ^ 98 | (z2 >> 63) ^ (z2 >> 62) ^ (z2 >> 57); 99 | z0 = z0 ^ z2 ^ (z2 << 1) ^ (z2 << 2) ^ (z2 << 7); 100 | 101 | #endif 102 | 103 | z->d[0] = z0; // inline to avoid these stores 104 | z->d[1] = z1; 105 | } 106 | -------------------------------------------------------------------------------- /gcm_test.c: -------------------------------------------------------------------------------- 1 | // gcm_test.c 2 | // 2020-03-21 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // Unit tests for GCM AES-128/192/256 in simple mode. Selected from 6 | // https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Algorithm-Validation-Program/documents/mac/gcmtestvectors.zip 7 | 8 | #include <stdio.h> 9 | #include <stdlib.h> 10 | #include <string.h> 11 | 12 | #include "test_hex.h" 13 | #include "gcm_wrap.h" 14 | #include "gcm_gfmul.h" 15 | 16 | // A GCM test 17 | 18 | int test_gcm() 19 | { 20 | uint8_t pt[100], ct[100], xt[100], k[32], iv[12]; 21 | size_t mlen, clen; 22 | int flag, fail = 0; 23 | 24 | // GCM AES-128, one-block message 25 | 26 | readhex(k, sizeof(k), "7FDDB57453C241D03EFBED3AC44E371C"); 27 | readhex(iv, sizeof(iv), "EE283A3FC75575E33EFD4887"); 28 | mlen = readhex(pt, sizeof(pt), "D5DE42B461646C255C87BD2962D3B9A2"); 29 | clen = mlen + 16; 30 | memset(ct, 0, clen); 31 | aes128_enc_gcm(ct, pt, mlen, k, iv); 32 | fail += chkhex("GCM AES-128", ct, clen, 33 | "2CCDA4A5415CB91E135C2A0F78C9B2FD" 34 | "B36D1DF9B9D5E596F83E8B7F52971CB3"); 35 | 36 | memset(xt, 0, mlen); 37 | flag = aes128_dec_vfy_gcm(xt, ct, clen, k, iv) || 38 | memcmp(xt, pt, mlen) != 0; 39 | 40 | ct[rand() % clen] ^= 1 << (rand() & 7); // corrupt random bit 41 | 42 | flag |= !(aes128_dec_vfy_gcm(xt, ct, clen, k, iv) || 43 | memcmp(xt, pt, mlen) != 0); 44 | printf("[%s] GCM AES-128 verify / corrupt test\n", flag ? "FAIL" : "PASS"); 45 | if (flag) 46 | fail++; 47 | 48 | // GCM AES-192, two-block message 49 | 50 | readhex(k, sizeof(k), "165C4AA5D78EE15F297D5D2EAE39EAAC" 51 | "3480FC50A6D9A98E"); 52 | readhex(iv, sizeof(iv), "0E321E714C4A262350FC50FC"); 53 | mlen = readhex(pt, sizeof(pt), 54 | "5AFA41EFE94C0193FC9FE62FD6CFACC8" 55 | "868725AB4965A5C9132D74179F0AEE72"); 56 | clen = mlen + 16; 57 | memset(ct, 0, clen); 58 | aes192_enc_gcm(ct, pt, mlen, k, iv); 59 | fail += chkhex("GCM AES-192", ct, clen, 60 | "5AB8AC904E7D4A627EE327B4629B6863" 61 | "19936ABC709E8C0FB6817CB16D0C4F76" 62 | "62BFEA782D6A05CD04030C433639B969"); 63 | 64 | memset(xt, 0, mlen); 65 | flag = aes192_dec_vfy_gcm(xt, ct, clen, k, iv) || 66 | memcmp(xt, pt, mlen) != 0; 67 | 68 | ct[rand() % clen] ^= 1 << (rand() & 7); // corrupt random bit 69 | 70 | flag |= !(aes192_dec_vfy_gcm(xt, ct, clen, k, iv) || 71 | memcmp(xt, pt, mlen) != 0); 72 | printf("[%s] GCM AES-192 verify / corrupt test\n", flag ?
"FAIL" : "PASS"); 73 | if (flag) 74 | fail++; 75 | 76 | // GCM AES-256, 51-byte message 77 | 78 | readhex(k, sizeof(k), "1FDED32D5999DE4A76E0F8082108823A" 79 | "EF60417E1896CF4218A2FA90F632EC8A"); 80 | readhex(iv, sizeof(iv), "1F3AFA4711E9474F32E70462"); 81 | mlen = readhex(pt, sizeof(pt), 82 | "06B2C75853DF9AEB17BEFD33CEA81C63" 83 | "0B0FC53667FF45199C629C8E15DCE41E" 84 | "530AA792F796B8138EEAB2E86C7B7BEE" "1D40B0"); 85 | clen = mlen + 16; 86 | memset(ct, 0, clen); 87 | aes256_enc_gcm(ct, pt, mlen, k, iv); 88 | fail += chkhex("GCM AES-256", ct, clen, 89 | "91FBD061DDC5A7FCC9513FCDFDC9C3A7" 90 | "C5D4D64CEDF6A9C24AB8A77C36EEFBF1" 91 | "C5DC00BC50121B96456C8CD8B6FF1F8B" 92 | "3E480F" "30096D340F3D5C42D82A6F475DEF23EB"); 93 | memset(xt, 0, mlen); 94 | flag = aes256_dec_vfy_gcm(xt, ct, clen, k, iv) || 95 | memcmp(xt, pt, mlen) != 0; 96 | 97 | ct[rand() % clen] ^= 1 << (rand() & 7); // corrupt random bit 98 | 99 | flag |= !(aes256_dec_vfy_gcm(xt, ct, clen, k, iv) || 100 | memcmp(xt, pt, mlen) != 0); 101 | printf("[%s] GCM AES-256 verify / corrupt test\n", flag ? "FAIL" : "PASS"); 102 | if (flag) 103 | fail++; 104 | 105 | return fail; 106 | } 107 | -------------------------------------------------------------------------------- /gcm_wrap.c: -------------------------------------------------------------------------------- 1 | // gcm_wrap.c 2 | // 2020-03-21 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // A basic (limited!) AES-GCM interface for testing purposes. 6 | 7 | #include 8 | 9 | #include "bitmanip.h" 10 | #include "aes_wrap.h" 11 | #include "gcm_wrap.h" 12 | #include "gcm_gfmul.h" 13 | 14 | // function pointers are here 15 | 16 | void (*ghash_rev)(gf128_t *) = rv64_ghash_rev; 17 | void (*ghash_mul)(gf128_t *, const gf128_t *, const gf128_t *) = 18 | rv64_ghash_mul; 19 | 20 | // the same "body" for encryption/decryption and various key lengths 21 | 22 | static void aes_gcm_body(uint8_t * dst, uint8_t tag[16], 23 | const uint8_t * src, size_t len, 24 | const uint8_t iv[12], const uint32_t rk[], 25 | void (*enc_ecb)(uint8_t * ct, const uint8_t * pt, 26 | const uint32_t * rk), int enc_flag) 27 | { 28 | size_t i, ctr; 29 | gf128_t b, c, z, h, t, p; 30 | 31 | h.d[0] = 0; // h = AES_k(0) 32 | h.d[1] = 0; 33 | enc_ecb(h.b, h.b, rk); 34 | ghash_rev(&h); 35 | 36 | ctr = 0; // counter value 37 | memcpy(p.b, iv, 12); // J0 38 | p.w[3] = rv32b_grev(++ctr, 0x18); // big-endian counter 39 | enc_ecb(t.b, p.b, rk); // first AES_k(IV | 1) for tag 40 | 41 | z.d[0] = 0; // initialize GHASH result 42 | z.d[1] = 0; 43 | 44 | if (enc_flag) { // == encrypt / generate tag == 45 | 46 | i = len; 47 | while (i >= 16) { // full block 48 | p.w[3] = rv32b_grev(++ctr, 0x18); // rev8.w; big-endian counter 49 | enc_ecb(c.b, p.b, rk); 50 | memcpy(b.b, src, 16); // load plaintext 51 | c.d[0] ^= b.d[0]; 52 | c.d[1] ^= b.d[1]; 53 | memcpy(dst, c.b, 16); // store ciphertext 54 | ghash_mul(&z, &c, &h); // GHASH the block 55 | src += 16; 56 | dst += 16; 57 | i -= 16; 58 | } 59 | 60 | if (i > 0) { // partial block 61 | p.w[3] = rv32b_grev(++ctr, 0x18); // rev8.w; big-endian counter 62 | enc_ecb(c.b, p.b, rk); 63 | memcpy(b.b, src, i); // load plaintext 64 | c.d[0] ^= b.d[0]; 65 | c.d[1] ^= b.d[1]; 66 | memcpy(dst, c.b, i); 67 | memset(&c.b[i], 0, 16 - i); // zero pad input 68 | ghash_mul(&z, &c, &h); // GHASH last block 69 | } 70 | 71 | } else { // == decrypt / verify tag == 72 | 73 | i = len; 74 | while (i >= 16) { // full block 75 | p.w[3] = rv32b_grev(++ctr, 0x18); // rev8.w; 
76 | enc_ecb(b.b, p.b, rk);
77 | memcpy(c.b, src, 16); // load ciphertext
78 | b.d[0] ^= c.d[0];
79 | b.d[1] ^= c.d[1];
80 | memcpy(dst, b.b, 16); // store plaintext
81 | ghash_mul(&z, &c, &h); // GHASH the block
82 | src += 16;
83 | dst += 16;
84 | i -= 16;
85 | }
86 | 
87 | if (i > 0) { // partial block
88 | p.w[3] = rv32b_grev(++ctr, 0x18); // rev8.w; big-endian counter
89 | enc_ecb(b.b, p.b, rk);
90 | memcpy(c.b, src, i);
91 | b.d[0] ^= c.d[0];
92 | b.d[1] ^= c.d[1];
93 | memcpy(dst, b.b, i);
94 | memset(&c.b[i], 0, 16 - i); // zero pad input
95 | ghash_mul(&z, &c, &h); // GHASH last block
96 | }
97 | }
98 | 
99 | c.d[0] = 0; // pad with bit length
100 | c.w[2] = rv32b_grev(len >> 29, 0x18);
101 | c.w[3] = rv32b_grev(len << 3, 0x18);
102 | ghash_mul(&z, &c, &h); // last GHASH block
103 | ghash_rev(&z); // flip result bits
104 | t.d[0] = t.d[0] ^ z.d[0]; // XOR with AES_k(IV | 1)
105 | t.d[1] = t.d[1] ^ z.d[1];
106 | memcpy(tag, t.b, 16); // write tag
107 | }
108 | 
109 | // decrypt and verify the tag (constant-time compare); nonzero on failure
110 | 
111 | static int aes_gcm_vfy(uint8_t * m,
112 | const uint8_t * c, size_t clen,
113 | const uint8_t iv[12], const uint32_t rk[],
114 | void (*enc_ecb)(uint8_t * ct, const uint8_t * pt,
115 | const uint32_t * rk))
116 | {
117 | size_t i;
118 | uint8_t tag[16], x;
119 | 
120 | if (clen < 16)
121 | return -1;
122 | 
123 | aes_gcm_body(m, tag, c, clen - 16, iv, rk, enc_ecb, 0);
124 | x = 0;
125 | for (i = 0; i < 16; i++) {
126 | x |= tag[i] ^ c[clen - 16 + i];
127 | }
128 | 
129 | return x == 0 ? 0 : 1;
130 | }
131 | 
132 | // AES128-GCM
133 | 
134 | void aes128_enc_gcm(uint8_t * c, const uint8_t * m, size_t mlen,
135 | const uint8_t * key, const uint8_t iv[12])
136 | {
137 | uint32_t rk[AES128_RK_WORDS];
138 | 
139 | aes128_enc_key(rk, key);
140 | aes_gcm_body(c, c + mlen, m, mlen, iv, rk, aes128_enc_ecb, 1);
141 | }
142 | 
143 | int aes128_dec_vfy_gcm(uint8_t * m, const uint8_t * c, size_t clen,
144 | const uint8_t * key, const uint8_t iv[12])
145 | {
146 | uint32_t rk[AES128_RK_WORDS];
147 | 
148 | aes128_enc_key(rk, key);
149 | return aes_gcm_vfy(m, c, clen, iv, rk, aes128_enc_ecb);
150 | }
151 | 
152 | 
153 | // AES192-GCM
154 | 
155 | void aes192_enc_gcm(uint8_t * c, const uint8_t * m, size_t mlen,
156 | const uint8_t * key, const uint8_t iv[12])
157 | {
158 | uint32_t rk[AES192_RK_WORDS];
159 | 
160 | aes192_enc_key(rk, key);
161 | aes_gcm_body(c, c + mlen, m, mlen, iv, rk, aes192_enc_ecb, 1);
162 | }
163 | 
164 | int aes192_dec_vfy_gcm(uint8_t * m, const uint8_t * c, size_t clen,
165 | const uint8_t * key, const uint8_t iv[12])
166 | {
167 | uint32_t rk[AES192_RK_WORDS];
168 | 
169 | aes192_enc_key(rk, key);
170 | return aes_gcm_vfy(m, c, clen, iv, rk, aes192_enc_ecb);
171 | }
172 | 
173 | // AES256-GCM
174 | 
175 | void aes256_enc_gcm(uint8_t * c, const uint8_t * m, size_t mlen,
176 | const uint8_t * key, const uint8_t iv[12])
177 | {
178 | uint32_t rk[AES256_RK_WORDS];
179 | 
180 | aes256_enc_key(rk, key);
181 | aes_gcm_body(c, c + mlen, m, mlen, iv, rk, aes256_enc_ecb, 1);
182 | }
183 | 
184 | int aes256_dec_vfy_gcm(uint8_t * m, const uint8_t * c, size_t clen,
185 | const uint8_t * key, const uint8_t iv[12])
186 | {
187 | uint32_t rk[AES256_RK_WORDS];
188 | 
189 | aes256_enc_key(rk, key);
190 | return aes_gcm_vfy(m, c, clen, iv, rk, aes256_enc_ecb);
191 | }
192 | 
--------------------------------------------------------------------------------
/gcm_wrap.h:
--------------------------------------------------------------------------------
1 | // gcm_wrap.h
2 | // 2020-03-21 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Basic AES-GCM; 96-bit IV, no AAD, 128-bit auth tag appended at the end.
6 | // Ciphertext is always 16 bytes larger than plaintext.
7 | // Decrypt/verify routines (aesxxx_dec_vfy_gcm) return nonzero on failure.
8 | 
9 | #ifndef _GCM_WRAP_H_
10 | #define _GCM_WRAP_H_
11 | 
12 | #include <stddef.h>
13 | #include <stdint.h>
14 | 
15 | // AES-GCM-128 Encrypt / Decrypt & Verify
16 | 
17 | void aes128_enc_gcm(uint8_t * c, const uint8_t * m, size_t mlen,
18 | const uint8_t * key, const uint8_t iv[12]);
19 | int aes128_dec_vfy_gcm(uint8_t * m, const uint8_t * c, size_t clen,
20 | const uint8_t * key, const uint8_t iv[12]);
21 | 
22 | // AES-GCM-192 Encrypt / Decrypt & Verify
23 | 
24 | void aes192_enc_gcm(uint8_t * c, const uint8_t * m, size_t mlen,
25 | const uint8_t * key, const uint8_t iv[12]);
26 | int aes192_dec_vfy_gcm(uint8_t * m, const uint8_t * c, size_t clen,
27 | const uint8_t * key, const uint8_t iv[12]);
28 | 
29 | // AES-GCM-256 Encrypt / Decrypt & Verify
30 | 
31 | void aes256_enc_gcm(uint8_t * c, const uint8_t * m, size_t mlen,
32 | const uint8_t * key, const uint8_t iv[12]);
33 | int aes256_dec_vfy_gcm(uint8_t * m, const uint8_t * c, size_t clen,
34 | const uint8_t * key, const uint8_t iv[12]);
35 | 
36 | #endif // _GCM_WRAP_H_
37 | 
--------------------------------------------------------------------------------
/hdl/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile
2 | # 2020-01-29 Markku-Juhani O. Saarinen
3 | # Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | # Minimal makefile for Icarus Verilog
6 | 
7 | HDL = $(wildcard *.v)
8 | SIM = sim.vvp
9 | 
10 | # simulate
11 | 
12 | sim: $(SIM)
13 | vvp -N $(SIM)
14 | 
15 | $(SIM): $(HDL)
16 | iverilog -o sim.vvp $(HDL)
17 | 
18 | # synthesis and reporting
19 | 
20 | synth.tmp: $(HDL) synth.ys
21 | rm -f synth.tmp
22 | yosys -v1 synth.ys
23 | 
24 | rep: synth.tmp
25 | ./yoparse.py synth.tmp
26 | 
27 | # prints differing lines
28 | 
29 | test: $(SIM)
30 | vvp -n $(SIM) | grep "[TB]" | diff - tbref.txt
31 | 
32 | clean:
33 | rm -f $(SIM) synth.tmp
34 | 
--------------------------------------------------------------------------------
/hdl/README.md:
--------------------------------------------------------------------------------
1 | # HDL for the AES / SM4 instruction
2 | 
3 | 2020-01-29 Markku-Juhani O. Saarinen
4 | 
5 | 2020-02-28 Updated with gate counts.
6 | 
7 | The main instruction is in [saes32.v](saes32.v), while [sboxes.v](sboxes.v)
8 | has S-box implementations for AES and SM4. As can be seen, the entire thing
9 | is only about 100 lines + sboxes. Timing can be significantly further
10 | improved.
11 | 
12 | If your design doesn't need both AES and SM4, or you just need the forward
13 | AES, you can use macros `SAES32_NO_AES`, `SAES32_NO_AESI`, or `SAES32_NO_SM4`
14 | to disable forward AES, inverse AES, or SM4, respectively.
15 | 
16 | A note about [sboxes.v](sboxes.v): I created linear SM4 "top" and "bottom"
17 | layers for the [Boyar-Peralta](https://eprint.iacr.org/2011/332.pdf) AES
18 | S-Box to demonstrate that all three S-box types can share circuitry.
19 | The [sboxes.v](sboxes.v) file has some commentary on this.
20 | 
21 | Currently the code does not mux the middle layer, which would reduce gate
22 | count. Also note that the 18->8 bit bottom layers (which are linear)
23 | can be merged ("collapsed") into the 8->32 bit output layers since they are
24 | also linear. This would reduce timing and possibly gate count too. The
25 | present code prioritizes readability over these considerations.
26 | 
27 | There's a simple [Makefile](Makefile) and a testbench for Icarus
28 | Verilog (which is freely available for Debian/Ubuntu, etc.).
29 | 
30 | I have also tested this on Xilinx xsim and Vivado with the C and assembler
31 | language test suites (see parent directory). PQShield's Pluto RV32 core
32 | (on an Artix-7 FPGA) was used, although build files are not provided for
33 | that.
34 | 
35 | 
36 | ## CMOS Area and Latency Estimate
37 | 
38 | There's a Yosys script to make area estimates against a mock CMOS ASIC
39 | cell library. Running `make rep` will perform synthesis and report gate
40 | and transistor counts on four separate "feature sets" of the instruction:
41 | 
42 | | **Target** | **Gate Equivalents** | **Transistors** | **LTP** |
43 | |----------------------|--------:|-------:|----:|
44 | | AES Encrypt (only) | 642.0 | 2568 | 25 |
45 | | AES | 1240.0 | 4960 | 28 |
46 | | SM4 | 766.5 | 3066 | 25 |
47 | | AES + SM4 (full) | 1678.5 | 6714 | 28 |
48 | 
49 | LTP is the reported *Longest Topological Path*, a circuit depth /
50 | gate delay measure.
51 | 
52 | (Currently the weights are such that transistors = 4*GE, but this can be
53 | tuned in the [yoparse.py](yoparse.py) script.)
54 | 
55 | [Yosys](http://www.clifford.at/yosys/) version:
56 | `Yosys 0.9+1706 (git sha1 cd60f079, clang 6.0.0-1ubuntu2 -fPIC -Os)`
57 | 
58 | 
59 | ## Testing with a Simulator
60 | 
61 | No output from `make test` means that the simulator output matches
62 | [tbref.txt](tbref.txt). More test cases can be generated using the
63 | C emulator code (in parent directory). Matching [saes32_tb.v](saes32_tb.v)
64 | output is generated with the argument `./xtest tb`. Just expand the
65 | `test_hwtb()` function in [../test_main.c](../test_main.c) to your needs.
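For reference, a complete `make` build-and-simulate run, followed by a silent
`make test` check, looks like this: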
66 | 67 | ```console 68 | $ make 69 | iverilog -o sim.vvp saes32_tb.v sboxes.v saes32.v 70 | vvp -N sim.vvp 71 | [TB] rd=a56363c6 rs1=00000000 rs2=00000000 fn=00 72 | [TB] rd=6e6edcb2 rs1=00000000 rs2=01234567 fn=01 73 | [TB] rd=5ab4ee5a rs1=00000000 rs2=02468ace fn=02 74 | [TB] rd=f68d7b7b rs1=00000000 rs2=0369d035 fn=03 75 | [TB] rd=000000de rs1=00000000 rs2=048d159c fn=04 76 | [TB] rd=00003900 rs1=00000000 rs2=05b05b03 fn=05 77 | [TB] rd=00660000 rs1=00000000 rs2=06d3a06a fn=06 78 | [TB] rd=c5000000 rs1=00000000 rs2=07f6e5d1 fn=07 79 | [TB] rd=0728ebb2 rs1=00000000 rs2=091a2b38 fn=08 80 | [TB] rd=670a0cb1 rs1=00000000 rs2=0a3d709f fn=09 81 | [TB] rd=7ca1470a rs1=00000000 rs2=0b60b606 fn=0a 82 | [TB] rd=4ffcd7e5 rs1=00000000 rs2=0c83fb6d fn=0b 83 | [TB] rd=00000019 rs1=00000000 rs2=0da740d4 fn=0c 84 | [TB] rd=0000dc00 rs1=00000000 rs2=0eca863b fn=0d 85 | [TB] rd=00530000 rs1=00000000 rs2=0fedcba2 fn=0e 86 | [TB] rd=e3000000 rs1=00000000 rs2=11111109 fn=0f 87 | [TB] rd=5353d784 rs1=00000000 rs2=12345670 fn=10 88 | [TB] rd=c030f0c0 rs1=00000000 rs2=13579bd7 fn=11 89 | [TB] rd=020a0808 rs1=00000000 rs2=147ae13e fn=12 90 | [TB] rd=46fafabc rs1=00000000 rs2=159e26a5 fn=13 91 | [TB] rd=00051428 rs1=00000000 rs2=16c16c0c fn=14 92 | [TB] rd=9b6ddb60 rs1=00000000 rs2=17e4b173 fn=15 93 | [TB] rd=5bb7e096 rs1=00000000 rs2=1907f6da fn=16 94 | [TB] rd=13608209 rs1=00000000 rs2=1a2b3c41 fn=17 95 | 96 | $ make test 97 | vvp -n sim.vvp | grep "[TB]" | diff - tbref.txt 98 | $ 99 | ``` 100 | 101 | [Icarus Verilog](https://github.com/steveicarus/iverilog) versions: 102 | `Icarus Verilog Parser/Elaborator version 11.0 (devel) (s20150603-796-g875431a3)` 103 | `Icarus Verilog runtime version 11.0 (devel) (s20150603-796-g875431a3)` 104 | 105 | Cheers, 106 | - markku 107 | 108 | -------------------------------------------------------------------------------- /hdl/saes32.v: -------------------------------------------------------------------------------- 1 | // saes32.v 2 | // 2020-01-29 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // Proposed SAES32 instruction for lightweight AES, AES^-1, and SM4 (RV32). 6 | 7 | // Multiply by 0x02 in AES's GF(256) - LFSR style 8 | 9 | module aes_xtime( output [7:0] out, input [7:0] in ); 10 | assign out = { in[6:0], 1'b0 } ^ ( in[7] ? 8'h1B : 8'h00 ); 11 | endmodule 12 | 13 | // aes encrypt 14 | 15 | `ifndef SAES32_NO_AES 16 | 17 | module aes_t( output [31:0] out, input [7:0] in, input f ); 18 | 19 | wire [7:0] x; 20 | wire [7:0] x2; 21 | 22 | aes_sbox sbox ( x, in ); 23 | aes_xtime lfsr1 ( x2, x ); 24 | 25 | // NOP / MixColumns MDS Matrix 26 | 27 | assign out = f ? { 24'b0, x } : { x ^ x2, x, x, x2 } ; 28 | 29 | endmodule 30 | 31 | `endif 32 | 33 | // aes decrypt 34 | 35 | `ifndef SAES32_NO_AESI 36 | 37 | module aesi_t( output [31:0] out, input [7:0] in, input f ); 38 | 39 | wire [7:0] x; 40 | wire [7:0] x2; 41 | wire [7:0] x4; 42 | wire [7:0] x8; 43 | 44 | aesi_sbox sbox ( x, in ); 45 | aes_xtime lfsr1 ( x2, x ); // todo: reduce circuit depth 46 | aes_xtime lfsr2 ( x4, x2 ); 47 | aes_xtime lfsr3 ( x8, x4 ); 48 | 49 | // NOP / Inverse MixColumns MDS Matrix 50 | 51 | assign out = f ? 
{ 24'b0, x } : 52 | { x ^ x2 ^ x8, x ^ x4 ^ x8, x ^ x8, x2 ^ x4 ^ x8 }; 53 | 54 | endmodule 55 | 56 | `endif 57 | 58 | // sm4 encrypt / decrypt 59 | 60 | `ifndef SAES32_NO_SM4 61 | 62 | module sm4_t( output [31:0] out, input [7:0] in, input f ); 63 | 64 | wire [7:0] x; 65 | 66 | sm4_sbox sbox ( x, in ); 67 | 68 | // Either L' or L linear layers (for keying and encrypt / decrypt) 69 | // ( this looks slightly odd due to the use of little-endian byte order ) 70 | assign out = f ? { x[2:0], 5'b0, x[0], 2'b0 ,x[7:3], 1'b0, x[7:1], x } : 71 | { x[5:0], x, x[7:6], x[7:2], x[1:0] ^ x[7:6], x[7:2] ^ x[5:0], x[1:0] }; 72 | 73 | endmodule 74 | 75 | `endif 76 | 77 | // Combinatorial logic for the SAES32 instruction itself 78 | 79 | module saes32( 80 | output [31:0] rd, // output register (wire!) 81 | input [31:0] rs1, // input register 1 82 | input [31:0] rs2, // input register 2 83 | input [4:0] fn // 5-bit function specifier 84 | ); 85 | 86 | // select input byte from rs2 according to fn[1:0] 87 | 88 | wire [7:0] x = fn[1:0] == 2'b00 ? rs2[ 7: 0] : 89 | fn[1:0] == 2'b01 ? rs2[15: 8] : 90 | fn[1:0] == 2'b10 ? rs2[23:16] : 91 | rs2[31:24]; 92 | 93 | // expand to 32 bits 94 | 95 | `ifndef SAES32_NO_AES 96 | wire [31:0] aes_32; 97 | aes_t aes ( aes_32, x, fn[2] ); 98 | `endif 99 | 100 | `ifndef SAES32_NO_AESI 101 | wire [31:0] aesi_32; 102 | aesi_t aesi ( aesi_32, x, fn[2] ); 103 | `endif 104 | 105 | `ifndef SAES32_NO_SM4 106 | wire [31:0] sm4_32; 107 | sm4_t sm4 ( sm4_32, x, fn[2] ); 108 | `endif 109 | 110 | wire [31:0] y = 111 | `ifndef SAES32_NO_AES 112 | fn[4:3] == 2'b00 ? aes_32 : 113 | `endif 114 | `ifndef SAES32_NO_AESI 115 | fn[4:3] == 2'b01 ? aesi_32 : 116 | `endif 117 | `ifndef SAES32_NO_SM4 118 | fn[4:3] == 2'b10 ? sm4_32 : 119 | `endif 120 | 32'h00000000; 121 | 122 | // rotate output 123 | 124 | wire [31:0] z = fn[1:0] == 2'b00 ? y : 125 | fn[1:0] == 2'b01 ? { y[23: 0], y[31:24] } : 126 | fn[1:0] == 2'b10 ? { y[15: 0], y[31:16] } : 127 | { y[ 7: 0], y[31: 8] }; 128 | 129 | // XOR the result with rs1 130 | 131 | assign rd = z ^ rs1; 132 | 133 | endmodule 134 | 135 | -------------------------------------------------------------------------------- /hdl/saes32_tb.v: -------------------------------------------------------------------------------- 1 | // saes32_tb.v 2 | // 2020-01-29 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // test bench for the AES / SM4 instruction 6 | 7 | `timescale 1 ns / 1 ps 8 | 9 | module saes32_tb; 10 | 11 | // clock generator 12 | reg clk = 1; 13 | always #5 clk = ~clk; 14 | 15 | reg [31:0] cnt = 0; 16 | 17 | reg [31:0] rs1 = 32'h00000000; 18 | reg [31:0] rs2 = 32'h00000000; 19 | reg [4:0] fn = 0; 20 | wire [31:0] rd; 21 | 22 | wire [7:0] box; 23 | 24 | // test instance 25 | saes32 uut ( rd, rs1, rs2, fn ); 26 | 27 | always @(posedge clk) begin 28 | 29 | $display("[TB] rd=%h rs1=%h rs2=%h fn=%h", rd, rs1, rs2, fn ); 30 | 31 | fn <= fn + 1; 32 | rs2 <= rs2 + 32'h01234567; 33 | 34 | if (cnt == 23) begin 35 | $finish; 36 | end 37 | cnt <= cnt + 1; 38 | end 39 | 40 | 41 | endmodule 42 | 43 | -------------------------------------------------------------------------------- /hdl/sboxes.v: -------------------------------------------------------------------------------- 1 | // sboxes.v 2 | // 2020-01-29 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | /* 6 | 7 | Non-hardened combinatorial logic for AES, inverse AES, and SM4 S-Boxes. 
8 | 
9 | Each S-Box has a nonlinear middle layer sandwiched between linear
10 | top and bottom layers. In this version the top ("inner") layer expands
11 | 8 bits to 21 bits while the bottom layer compresses 18 bits back to 8.
12 | 
13 | Overall structure and the AES and AES^-1 circuits slightly modified from [BoPe12].
14 | SM4 top and bottom layers by Markku-Juhani O. Saarinen, January 2020.
15 | 
16 | The middle layer is common to all three; whether muxing it is beneficial
17 | depends on the target. Currently we are not doing it.
18 | 
19 | Why does sharing work? All of these are "Nyberg S-boxes" [Ny93]: each is
20 | built from a multiplicative inverse in GF(256), so they are affine isomorphic.
21 | 
22 | [BoPe12] Boyar J., Peralta R. "A Small Depth-16 Circuit for the AES
23 | S-Box." Proc. SEC 2012. IFIP AICT 376. Springer, pp. 287-298 (2012)
24 | DOI: https://doi.org/10.1007/978-3-642-30436-1_24
25 | Preprint: https://eprint.iacr.org/2011/332.pdf
26 | 
27 | [Ny93] Nyberg K., "Differentially Uniform Mappings for Cryptography",
28 | Proc. EUROCRYPT '93, LNCS 765, Springer, pp. 55-64 (1993)
29 | DOI: https://doi.org/10.1007/3-540-48285-7_6
30 | 
31 | */
32 | 
33 | // The shared non-linear middle part for AES, AES^-1, and SM4.
34 | 
35 | module sbox_inv_mid( output [17:0] y, input [20:0] x );
36 | 
37 | wire [45:0] t;
38 | 
39 | assign t[ 0] = x[ 3] ^ x[12];
40 | assign t[ 1] = x[ 9] & x[ 5];
41 | assign t[ 2] = x[17] & x[ 6];
42 | assign t[ 3] = x[10] ^ t[ 1];
43 | assign t[ 4] = x[14] & x[ 0];
44 | assign t[ 5] = t[ 4] ^ t[ 1];
45 | assign t[ 6] = x[ 3] & x[12];
46 | assign t[ 7] = x[16] & x[ 7];
47 | assign t[ 8] = t[ 0] ^ t[ 6];
48 | assign t[ 9] = x[15] & x[13];
49 | assign t[10] = t[ 9] ^ t[ 6];
50 | assign t[11] = x[ 1] & x[11];
51 | assign t[12] = x[ 4] & x[20];
52 | assign t[13] = t[12] ^ t[11];
53 | assign t[14] = x[ 2] & x[ 8];
54 | assign t[15] = t[14] ^ t[11];
55 | assign t[16] = t[ 3] ^ t[ 2];
56 | assign t[17] = t[ 5] ^ x[18];
57 | assign t[18] = t[ 8] ^ t[ 7];
58 | assign t[19] = t[10] ^ t[15];
59 | assign t[20] = t[16] ^ t[13];
60 | assign t[21] = t[17] ^ t[15];
61 | assign t[22] = t[18] ^ t[13];
62 | assign t[23] = t[19] ^ x[19];
63 | assign t[24] = t[22] ^ t[23];
64 | assign t[25] = t[22] & t[20];
65 | assign t[26] = t[21] ^ t[25];
66 | assign t[27] = t[20] ^ t[21];
67 | assign t[28] = t[23] ^ t[25];
68 | assign t[29] = t[28] & t[27];
69 | assign t[30] = t[26] & t[24];
70 | assign t[31] = t[20] & t[23];
71 | assign t[32] = t[27] & t[31];
72 | assign t[33] = t[27] ^ t[25];
73 | assign t[34] = t[21] & t[22];
74 | assign t[35] = t[24] & t[34];
75 | assign t[36] = t[24] ^ t[25];
76 | assign t[37] = t[21] ^ t[29];
77 | assign t[38] = t[32] ^ t[33];
78 | assign t[39] = t[23] ^ t[30];
79 | assign t[40] = t[35] ^ t[36];
80 | assign t[41] = t[38] ^ t[40];
81 | assign t[42] = t[37] ^ t[39];
82 | assign t[43] = t[37] ^ t[38];
83 | assign t[44] = t[39] ^ t[40];
84 | assign t[45] = t[42] ^ t[41];
85 | assign y[ 0] = t[38] & x[ 7];
86 | assign y[ 1] = t[37] & x[13];
87 | assign y[ 2] = t[42] & x[11];
88 | assign y[ 3] = t[45] & x[20];
89 | assign y[ 4] = t[41] & x[ 8];
90 | assign y[ 5] = t[44] & x[ 9];
91 | assign y[ 6] = t[40] & x[17];
92 | assign y[ 7] = t[39] & x[14];
93 | assign y[ 8] = t[43] & x[ 3];
94 | assign y[ 9] = t[38] & x[16];
95 | assign y[10] = t[37] & x[15];
96 | assign y[11] = t[42] & x[ 1];
97 | assign y[12] = t[45] & x[ 4];
98 | assign y[13] = t[41] & x[ 2];
99 | assign y[14] = t[44] & x[ 5];
100 | assign y[15] = t[40] & x[ 6];
101 | assign y[16] = t[39] & x[ 0];
102 | assign y[17] = t[43] & x[12];
103 | 
104 | endmodule 105 | 106 | // === AES (Forward) === 107 | 108 | `ifndef SAES32_NO_AES 109 | 110 | // top (inner) linear layer for AES 111 | 112 | module sbox_aes_top( output [20:0] y, input [7:0] x); 113 | 114 | wire [5:0] t; 115 | 116 | assign y[ 0] = x[ 0]; 117 | assign y[ 1] = x[ 7] ^ x[ 4]; 118 | assign y[ 2] = x[ 7] ^ x[ 2]; 119 | assign y[ 3] = x[ 7] ^ x[ 1]; 120 | assign y[ 4] = x[ 4] ^ x[ 2]; 121 | assign t[ 0] = x[ 3] ^ x[ 1]; 122 | assign y[ 5] = y[ 1] ^ t[ 0]; 123 | assign t[ 1] = x[ 6] ^ x[ 5]; 124 | assign y[ 6] = x[ 0] ^ y[ 5]; 125 | assign y[ 7] = x[ 0] ^ t[ 1]; 126 | assign y[ 8] = y[ 5] ^ t[ 1]; 127 | assign t[ 2] = x[ 6] ^ x[ 2]; 128 | assign t[ 3] = x[ 5] ^ x[ 2]; 129 | assign y[ 9] = y[ 3] ^ y[ 4]; 130 | assign y[10] = y[ 5] ^ t[ 2]; 131 | assign y[11] = t[ 0] ^ t[ 2]; 132 | assign y[12] = t[ 0] ^ t[ 3]; 133 | assign y[13] = y[ 7] ^ y[12]; 134 | assign t[ 4] = x[ 4] ^ x[ 0]; 135 | assign y[14] = t[ 1] ^ t[ 4]; 136 | assign y[15] = y[ 1] ^ y[14]; 137 | assign t[ 5] = x[ 1] ^ x[ 0]; 138 | assign y[16] = t[ 1] ^ t[ 5]; 139 | assign y[17] = y[ 2] ^ y[16]; 140 | assign y[18] = y[ 2] ^ y[ 8]; 141 | assign y[19] = y[15] ^ y[13]; 142 | assign y[20] = y[ 1] ^ t[ 3]; 143 | 144 | endmodule 145 | 146 | // bottom (outer) linear layer for AES 147 | 148 | module sbox_aes_out( output [7:0] y, input [17:0] x); 149 | 150 | wire [29:0] t; 151 | 152 | assign t[ 0] = x[11] ^ x[12]; 153 | assign t[ 1] = x[ 0] ^ x[ 6]; 154 | assign t[ 2] = x[14] ^ x[16]; 155 | assign t[ 3] = x[15] ^ x[ 5]; 156 | assign t[ 4] = x[ 4] ^ x[ 8]; 157 | assign t[ 5] = x[17] ^ x[11]; 158 | assign t[ 6] = x[12] ^ t[ 5]; 159 | assign t[ 7] = x[14] ^ t[ 3]; 160 | assign t[ 8] = x[ 1] ^ x[ 9]; 161 | assign t[ 9] = x[ 2] ^ x[ 3]; 162 | assign t[10] = x[ 3] ^ t[ 4]; 163 | assign t[11] = x[10] ^ t[ 2]; 164 | assign t[12] = x[16] ^ x[ 1]; 165 | assign t[13] = x[ 0] ^ t[ 0]; 166 | assign t[14] = x[ 2] ^ x[11]; 167 | assign t[15] = x[ 5] ^ t[ 1]; 168 | assign t[16] = x[ 6] ^ t[ 0]; 169 | assign t[17] = x[ 7] ^ t[ 1]; 170 | assign t[18] = x[ 8] ^ t[ 8]; 171 | assign t[19] = x[13] ^ t[ 4]; 172 | assign t[20] = t[ 0] ^ t[ 1]; 173 | assign t[21] = t[ 1] ^ t[ 7]; 174 | assign t[22] = t[ 3] ^ t[12]; 175 | assign t[23] = t[18] ^ t[ 2]; 176 | assign t[24] = t[15] ^ t[ 9]; 177 | assign t[25] = t[ 6] ^ t[10]; 178 | assign t[26] = t[ 7] ^ t[ 9]; 179 | assign t[27] = t[ 8] ^ t[10]; 180 | assign t[28] = t[11] ^ t[14]; 181 | assign t[29] = t[11] ^ t[17]; 182 | assign y[ 0] = t[ 6] ^~ t[23]; 183 | assign y[ 1] = t[13] ^~ t[27]; 184 | assign y[ 2] = t[25] ^ t[29]; 185 | assign y[ 3] = t[20] ^ t[22]; 186 | assign y[ 4] = t[ 6] ^ t[21]; 187 | assign y[ 5] = t[19] ^~ t[28]; 188 | assign y[ 6] = t[16] ^~ t[26]; 189 | assign y[ 7] = t[ 6] ^ t[24]; 190 | 191 | endmodule 192 | 193 | // AES s-box 194 | 195 | module aes_sbox( output [7:0] fx, input [7:0] in ); 196 | 197 | wire [20:0] t1; 198 | wire [17:0] t2; 199 | 200 | sbox_aes_top top ( t1, in ); 201 | sbox_inv_mid mid ( t2, t1 ); 202 | sbox_aes_out out ( fx, t2 ); 203 | 204 | endmodule 205 | 206 | `endif 207 | 208 | 209 | // === AES^-1 (Inverse) === 210 | 211 | `ifndef SAES32_NO_AESI 212 | 213 | // top (inner) linear layer for AES^-1 214 | 215 | module sbox_aesi_top( output [20:0] y, input [7:0] x); 216 | 217 | wire [4:0] t; 218 | 219 | assign y[17] = x[ 7] ^ x[ 4]; 220 | assign y[16] = x[ 6] ^~ x[ 4]; 221 | assign y[ 2] = x[ 7] ^~ x[ 6]; 222 | assign y[ 1] = x[ 4] ^ x[ 3]; 223 | assign y[18] = x[ 3] ^~ x[ 0]; 224 | assign t[ 0] = x[ 1] ^ x[ 0]; 225 | assign y[ 6] = x[ 6] ^~ y[17]; 226 | assign 
y[14] = y[16] ^ t[ 0]; 227 | assign y[ 7] = x[ 0] ^~ y[ 1]; 228 | assign y[ 8] = y[ 2] ^ y[18]; 229 | assign y[ 9] = y[ 2] ^ t[ 0]; 230 | assign y[ 3] = y[ 1] ^ t[ 0]; 231 | assign y[19] = x[ 5] ^~ y[ 1]; 232 | assign t[ 1] = x[ 6] ^ x[ 1]; 233 | assign y[13] = x[ 5] ^~ y[14]; 234 | assign y[15] = y[18] ^ t[ 1]; 235 | assign y[ 4] = x[ 3] ^ y[ 6]; 236 | assign t[ 2] = x[ 5] ^~ x[ 2]; 237 | assign t[ 3] = x[ 2] ^~ x[ 1]; 238 | assign t[ 4] = x[ 5] ^~ x[ 3]; 239 | assign y[ 5] = y[16] ^ t[ 2]; 240 | assign y[12] = t[ 1] ^ t[ 4]; 241 | assign y[20] = y[ 1] ^ t[ 3]; 242 | assign y[11] = y[ 8] ^ y[20]; 243 | assign y[10] = y[ 8] ^ t[ 3]; 244 | assign y[ 0] = x[ 7] ^ t[ 2]; 245 | 246 | endmodule 247 | 248 | // bottom (outer) linear layer for AES^-1 249 | 250 | module sbox_aesi_out( output [7:0] y, input [17:0] x); 251 | 252 | wire [29:0] t; 253 | 254 | assign t[ 0] = x[ 2] ^ x[11]; 255 | assign t[ 1] = x[ 8] ^ x[ 9]; 256 | assign t[ 2] = x[ 4] ^ x[12]; 257 | assign t[ 3] = x[15] ^ x[ 0]; 258 | assign t[ 4] = x[16] ^ x[ 6]; 259 | assign t[ 5] = x[14] ^ x[ 1]; 260 | assign t[ 6] = x[17] ^ x[10]; 261 | assign t[ 7] = t[ 0] ^ t[ 1]; 262 | assign t[ 8] = x[ 0] ^ x[ 3]; 263 | assign t[ 9] = x[ 5] ^ x[13]; 264 | assign t[10] = x[ 7] ^ t[ 4]; 265 | assign t[11] = t[ 0] ^ t[ 3]; 266 | assign t[12] = x[14] ^ x[16]; 267 | assign t[13] = x[17] ^ x[ 1]; 268 | assign t[14] = x[17] ^ x[12]; 269 | assign t[15] = x[ 4] ^ x[ 9]; 270 | assign t[16] = x[ 7] ^ x[11]; 271 | assign t[17] = x[ 8] ^ t[ 2]; 272 | assign t[18] = x[13] ^ t[ 5]; 273 | assign t[19] = t[ 2] ^ t[ 3]; 274 | assign t[20] = t[ 4] ^ t[ 6]; 275 | assign t[22] = t[ 2] ^ t[ 7]; 276 | assign t[23] = t[ 7] ^ t[ 8]; 277 | assign t[24] = t[ 5] ^ t[ 7]; 278 | assign t[25] = t[ 6] ^ t[10]; 279 | assign t[26] = t[ 9] ^ t[11]; 280 | assign t[27] = t[10] ^ t[18]; 281 | assign t[28] = t[11] ^ t[25]; 282 | assign t[29] = t[15] ^ t[20]; 283 | assign y[ 0] = t[ 9] ^ t[16]; 284 | assign y[ 1] = t[14] ^ t[23]; 285 | assign y[ 2] = t[19] ^ t[24]; 286 | assign y[ 3] = t[23] ^ t[27]; 287 | assign y[ 4] = t[12] ^ t[22]; 288 | assign y[ 5] = t[17] ^ t[28]; 289 | assign y[ 6] = t[26] ^ t[29]; 290 | assign y[ 7] = t[13] ^ t[22]; 291 | 292 | endmodule 293 | 294 | // AES inverse S-box 295 | 296 | module aesi_sbox( output [7:0] fx, input [7:0] in ); 297 | 298 | wire [20:0] t1; 299 | wire [17:0] t2; 300 | 301 | sbox_aesi_top top ( t1, in ); 302 | sbox_inv_mid mid ( t2, t1 ); 303 | sbox_aesi_out out ( fx, t2 ); 304 | 305 | endmodule 306 | 307 | `endif 308 | 309 | // === SM4 === 310 | 311 | `ifndef SAES32_NO_SM4 312 | 313 | // top (inner) linear layer for SM4 314 | 315 | 316 | module sbox_sm4_top( output [20:0] y, input [7:0] x); 317 | 318 | wire [6:0] t; 319 | 320 | assign y[18] = x[ 2] ^ x[ 6]; 321 | assign t[ 0] = x[ 3] ^ x[ 4]; 322 | assign t[ 1] = x[ 2] ^ x[ 7]; 323 | assign t[ 2] = x[ 7] ^ y[18]; 324 | assign t[ 3] = x[ 1] ^ t[ 1]; 325 | assign t[ 4] = x[ 6] ^ x[ 7]; 326 | assign t[ 5] = x[ 0] ^ y[18]; 327 | assign t[ 6] = x[ 3] ^ x[ 6]; 328 | assign y[10] = x[ 1] ^ y[18]; 329 | assign y[ 0] = x[ 5] ^~ y[10]; 330 | assign y[ 1] = t[ 0] ^ t[ 3]; 331 | assign y[ 2] = x[ 0] ^ t[ 0]; 332 | assign y[ 4] = x[ 0] ^ t[ 3]; 333 | assign y[ 3] = x[ 3] ^ y[ 4]; 334 | assign y[ 5] = x[ 5] ^ t[ 5]; 335 | assign y[ 6] = x[ 0] ^~ x[ 1]; 336 | assign y[ 7] = t[ 0] ^~ y[10]; 337 | assign y[ 8] = t[ 0] ^ t[ 5]; 338 | assign y[ 9] = x[ 3]; 339 | assign y[11] = t[ 0] ^ t[ 4]; 340 | assign y[12] = x[ 5] ^ t[ 4]; 341 | assign y[13] = x[ 5] ^~ y[ 1]; 342 | assign y[14] = x[ 4] ^~ t[ 2]; 343 | 
assign y[15] = x[ 1] ^~ t[ 6]; 344 | assign y[16] = x[ 0] ^~ t[ 2]; 345 | assign y[17] = t[ 0] ^~ t[ 2]; 346 | assign y[19] = x[ 5] ^~ y[14]; 347 | assign y[20] = x[ 0] ^ t[ 1]; 348 | 349 | endmodule 350 | 351 | // bottom (outer) linear layer for SM4 352 | 353 | module sbox_sm4_out( output [7:0] y, input [17:0] x); 354 | 355 | wire [29:0] t; 356 | 357 | assign t[ 0] = x[ 4] ^ x[ 7]; 358 | assign t[ 1] = x[13] ^ x[15]; 359 | assign t[ 2] = x[ 2] ^ x[16]; 360 | assign t[ 3] = x[ 6] ^ t[ 0]; 361 | assign t[ 4] = x[12] ^ t[ 1]; 362 | assign t[ 5] = x[ 9] ^ x[10]; 363 | assign t[ 6] = x[11] ^ t[ 2]; 364 | assign t[ 7] = x[ 1] ^ t[ 4]; 365 | assign t[ 8] = x[ 0] ^ x[17]; 366 | assign t[ 9] = x[ 3] ^ x[17]; 367 | assign t[10] = x[ 8] ^ t[ 3]; 368 | assign t[11] = t[ 2] ^ t[ 5]; 369 | assign t[12] = x[14] ^ t[ 6]; 370 | assign t[13] = t[ 7] ^ t[ 9]; 371 | assign t[14] = x[ 0] ^ x[ 6]; 372 | assign t[15] = x[ 7] ^ x[16]; 373 | assign t[16] = x[ 5] ^ x[13]; 374 | assign t[17] = x[ 3] ^ x[15]; 375 | assign t[18] = x[10] ^ x[12]; 376 | assign t[19] = x[ 9] ^ t[ 1]; 377 | assign t[20] = x[ 4] ^ t[ 4]; 378 | assign t[21] = x[14] ^ t[ 3]; 379 | assign t[22] = x[16] ^ t[ 5]; 380 | assign t[23] = t[ 7] ^ t[14]; 381 | assign t[24] = t[ 8] ^ t[11]; 382 | assign t[25] = t[ 0] ^ t[12]; 383 | assign t[26] = t[17] ^ t[ 3]; 384 | assign t[27] = t[18] ^ t[10]; 385 | assign t[28] = t[19] ^ t[ 6]; 386 | assign t[29] = t[ 8] ^ t[10]; 387 | assign y[ 0] = t[11] ^~ t[13]; 388 | assign y[ 1] = t[15] ^~ t[23]; 389 | assign y[ 2] = t[20] ^ t[24]; 390 | assign y[ 3] = t[16] ^ t[25]; 391 | assign y[ 4] = t[26] ^~ t[22]; 392 | assign y[ 5] = t[21] ^ t[13]; 393 | assign y[ 6] = t[27] ^~ t[12]; 394 | assign y[ 7] = t[28] ^~ t[29]; 395 | 396 | endmodule 397 | 398 | // SM4 S-box (there is no need for inverse) 399 | 400 | module sm4_sbox( output [7:0] fx, input [7:0] in ); 401 | 402 | wire [20:0] t1; 403 | wire [17:0] t2; 404 | 405 | sbox_sm4_top top ( t1, in ); 406 | sbox_inv_mid mid ( t2, t1 ); 407 | sbox_sm4_out out ( fx, t2 ); 408 | 409 | endmodule 410 | 411 | `endif 412 | -------------------------------------------------------------------------------- /hdl/synth.ys: -------------------------------------------------------------------------------- 1 | design -reset 2 | read_verilog -D SAES32_NO_AESI -D SAES32_NO_SM4 -defer saes32.v sboxes.v 3 | hierarchy -top saes32 4 | rename saes32 saes32_aes_enc_only 5 | synth -flatten; abc -dff -g cmos; opt -fast 6 | tee -a synth.tmp stat -tech cmos 7 | tee -a synth.tmp ltp -noff 8 | 9 | design -reset 10 | read_verilog -D SAES32_NO_SM4 -defer saes32.v sboxes.v 11 | hierarchy -top saes32 12 | rename saes32 saes32_aes_encdec 13 | synth -flatten; abc -dff -g cmos; opt -fast 14 | tee -a synth.tmp stat -tech cmos 15 | tee -a synth.tmp ltp -noff 16 | 17 | design -reset 18 | read_verilog -D SAES32_NO_AES -D SAES32_NO_AESI -defer saes32.v sboxes.v 19 | hierarchy -top saes32 20 | rename saes32 saes32_sm4_only 21 | synth -flatten; abc -dff -g cmos; opt -fast 22 | tee -a synth.tmp stat -tech cmos 23 | tee -a synth.tmp ltp -noff 24 | 25 | design -reset 26 | read_verilog -defer saes32.v sboxes.v 27 | hierarchy -top saes32 28 | rename saes32 saes32_full 29 | synth -flatten; abc -dff -g cmos; opt -fast 30 | tee -a synth.tmp stat -tech cmos 31 | tee -a synth.tmp ltp -noff 32 | 33 | -------------------------------------------------------------------------------- /hdl/tbref.txt: -------------------------------------------------------------------------------- 1 | [TB] rd=a56363c6 rs1=00000000 rs2=00000000 
fn=00 2 | [TB] rd=6e6edcb2 rs1=00000000 rs2=01234567 fn=01 3 | [TB] rd=5ab4ee5a rs1=00000000 rs2=02468ace fn=02 4 | [TB] rd=f68d7b7b rs1=00000000 rs2=0369d035 fn=03 5 | [TB] rd=000000de rs1=00000000 rs2=048d159c fn=04 6 | [TB] rd=00003900 rs1=00000000 rs2=05b05b03 fn=05 7 | [TB] rd=00660000 rs1=00000000 rs2=06d3a06a fn=06 8 | [TB] rd=c5000000 rs1=00000000 rs2=07f6e5d1 fn=07 9 | [TB] rd=0728ebb2 rs1=00000000 rs2=091a2b38 fn=08 10 | [TB] rd=670a0cb1 rs1=00000000 rs2=0a3d709f fn=09 11 | [TB] rd=7ca1470a rs1=00000000 rs2=0b60b606 fn=0a 12 | [TB] rd=4ffcd7e5 rs1=00000000 rs2=0c83fb6d fn=0b 13 | [TB] rd=00000019 rs1=00000000 rs2=0da740d4 fn=0c 14 | [TB] rd=0000dc00 rs1=00000000 rs2=0eca863b fn=0d 15 | [TB] rd=00530000 rs1=00000000 rs2=0fedcba2 fn=0e 16 | [TB] rd=e3000000 rs1=00000000 rs2=11111109 fn=0f 17 | [TB] rd=5353d784 rs1=00000000 rs2=12345670 fn=10 18 | [TB] rd=c030f0c0 rs1=00000000 rs2=13579bd7 fn=11 19 | [TB] rd=020a0808 rs1=00000000 rs2=147ae13e fn=12 20 | [TB] rd=46fafabc rs1=00000000 rs2=159e26a5 fn=13 21 | [TB] rd=00051428 rs1=00000000 rs2=16c16c0c fn=14 22 | [TB] rd=9b6ddb60 rs1=00000000 rs2=17e4b173 fn=15 23 | [TB] rd=5bb7e096 rs1=00000000 rs2=1907f6da fn=16 24 | [TB] rd=13608209 rs1=00000000 rs2=1a2b3c41 fn=17 25 | -------------------------------------------------------------------------------- /hdl/yoparse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # yoparse.py 4 | # 2020-02-27 Markku-Juhani O. Saarinen 5 | 6 | # parse the synthesis output 7 | 8 | import sys 9 | 10 | # "For evaluation purposes we [use] the following mockup ASIC cell library:" 11 | 12 | wt = {} 13 | wt["$_NOT_"] = 0.5 14 | wt["$_NAND_"] = 1.0 15 | wt["$_NOR_"] = 1.0 16 | wt["$_XOR_"] = 3.0 17 | wt["$_XNOR_"] = 3.0 18 | wt["$_DFF_P_"] = 4.0 19 | wt["$_AOI3_"] = 1.5 20 | wt["$_OAI3_"] = 1.5 21 | wt["$_AOI4_"] = 2.0 22 | wt["$_OAI4_"] = 2.0 23 | wt["$_NMUX_"] = 2.5 24 | wt["$_MUX_"] = 3.0 25 | 26 | # parse input files 27 | 28 | for fn in sys.argv[1:]: 29 | 30 | print(f"=== Summary for {fn} ===") 31 | 32 | with open(fn, 'r') as f: 33 | lns = f.readlines() 34 | 35 | tb = {} 36 | li = 0 37 | targ = "" 38 | ge = 0.0 39 | tr = 0 40 | ltp = 0 41 | 42 | for lin in lns: 43 | 44 | li = li + 1 45 | lv = lin.split(); 46 | ll = len(lv) 47 | 48 | if ll == 3 and lv[0] == "===": 49 | targ = lv[1] 50 | ge = 0.0 51 | tr = 0 52 | ltp = 0 53 | 54 | if ll == 5 and lv[3] == "transistors:": 55 | tr = int(lv[4]) 56 | 57 | if ll == 6 and lv[1] == "topological": 58 | tmp = lv[5][8:] 59 | ltp = int(tmp[:-2]) 60 | 61 | if ll == 2 and lv[0][:2] == "$_": 62 | if lv[0] in wt: 63 | ge = ge + float(lv[1]) * wt[lv[0]] 64 | else: 65 | print(f"{fn}:{li} unknown gate {lv[0]}") 66 | 67 | # update it 68 | if targ != "": 69 | tb[targ] = ( ge, tr, ltp ) 70 | 71 | # print the counts 72 | 73 | for x in tb: 74 | print(f"{x:20} ge={tb[x][0]:7} tr={tb[x][1]:5} ltp={tb[x][2]:3}") 75 | 76 | -------------------------------------------------------------------------------- /rv_endian.h: -------------------------------------------------------------------------------- 1 | // rv_endian.h 2 | // 2020-04-30 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 
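// For example, on a little-endian host GREV_BE32(0x00112233) evaluates to
// 0x33221100; with the RISC-V Bitmanip extension each GREV_BE* macro below
// collapses to a single grev/rev8 instruction.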
4 | 
5 | // RISC-V specific endianness support would be here (via intrinsics)
6 | 
7 | #ifndef _RV_ENDIAN_H_
8 | #define _RV_ENDIAN_H_
9 | 
10 | // byte-reverse if the target is not big-endian
11 | 
12 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
13 | #define GREV_BE32(x) (x)
14 | #else
15 | // grev(x, 0x18) or rev8
16 | #define GREV_BE32(x) ( \
17 | (((x) & 0xFF000000) >> 24) | (((x) & 0x00FF0000) >> 8) | \
18 | (((x) & 0x0000FF00) << 8) | (((x) & 0x000000FF) << 24))
19 | #endif
20 | 
21 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
22 | #define GREV_BE64(x) (x)
23 | #else
24 | // RISC-V: grev(x, 0x38) or rev8(x)
25 | #define GREV_BE64(x) ( \
26 | (((x) & 0xFF00000000000000LL) >> 56) | \
27 | (((x) & 0x00FF000000000000LL) >> 40) | \
28 | (((x) & 0x0000FF0000000000LL) >> 24) | \
29 | (((x) & 0x000000FF00000000LL) >> 8) | \
30 | (((x) & 0x00000000FF000000LL) << 8) | \
31 | (((x) & 0x0000000000FF0000LL) << 24) | \
32 | (((x) & 0x000000000000FF00LL) << 40) | \
33 | (((x) & 0x00000000000000FFLL) << 56))
34 | #endif
35 | 
36 | // rotate left (for 1 <= n <= 31; n == 0 would shift by the full width)
37 | static inline uint32_t rol32(uint32_t x, uint32_t n)
38 | {
39 | return ((x) << n) | ((x) >> (32 - n));
40 | }
41 | 
42 | // little-endian loads and stores (unaligned)
43 | 
44 | static inline uint32_t get32u_le(const uint8_t * v)
45 | {
46 | return ((uint32_t) v[0]) | (((uint32_t) v[1]) << 8) |
47 | (((uint32_t) v[2]) << 16) | (((uint32_t) v[3]) << 24);
48 | }
49 | 
50 | static inline void put32u_le(uint8_t * v, uint32_t x)
51 | {
52 | v[0] = x;
53 | v[1] = x >> 8;
54 | v[2] = x >> 16;
55 | v[3] = x >> 24;
56 | }
57 | 
58 | static inline uint64_t get64u_le(const uint8_t * v)
59 | {
60 | return ((uint64_t) v[0]) | (((uint64_t) v[1]) << 8) |
61 | (((uint64_t) v[2]) << 16) | (((uint64_t) v[3]) << 24) |
62 | (((uint64_t) v[4]) << 32) | (((uint64_t) v[5]) << 40) |
63 | (((uint64_t) v[6]) << 48) | (((uint64_t) v[7]) << 56);
64 | }
65 | 
66 | static inline void put64u_le(uint8_t * v, uint64_t x)
67 | {
68 | v[0] = x;
69 | v[1] = x >> 8;
70 | v[2] = x >> 16;
71 | v[3] = x >> 24;
72 | v[4] = x >> 32;
73 | v[5] = x >> 40;
74 | v[6] = x >> 48;
75 | v[7] = x >> 56;
76 | }
77 | 
78 | 
79 | // big-endian loads and stores (unaligned)
80 | 
81 | static inline uint32_t get32u_be(const uint8_t * v)
82 | {
83 | return (((uint32_t) v[0]) << 24) | (((uint32_t) v[1]) << 16) |
84 | (((uint32_t) v[2]) << 8) | ((uint32_t) v[3]);
85 | }
86 | 
87 | static inline void put32u_be(uint8_t * v, uint32_t x)
88 | {
89 | v[0] = x >> 24;
90 | v[1] = x >> 16;
91 | v[2] = x >> 8;
92 | v[3] = x;
93 | }
94 | 
95 | static inline uint64_t get64u_be(const uint8_t * v)
96 | {
97 | return (((uint64_t) v[0]) << 56) | (((uint64_t) v[1]) << 48) |
98 | (((uint64_t) v[2]) << 40) | (((uint64_t) v[3]) << 32) |
99 | (((uint64_t) v[4]) << 24) | (((uint64_t) v[5]) << 16) |
100 | (((uint64_t) v[6]) << 8) | ((uint64_t) v[7]);
101 | }
102 | 
103 | static inline void put64u_be(uint8_t * v, uint64_t x)
104 | {
105 | v[0] = x >> 56;
106 | v[1] = x >> 48;
107 | v[2] = x >> 40;
108 | v[3] = x >> 32;
109 | v[4] = x >> 24;
110 | v[5] = x >> 16;
111 | v[6] = x >> 8;
112 | v[7] = x;
113 | }
114 | 
115 | #endif
116 | 
--------------------------------------------------------------------------------
/saes32.c:
--------------------------------------------------------------------------------
1 | // saes32.c
2 | // 2020-01-24 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Running pseudocode for SAES32 (and ENC4S) AES/SM4 instruction.
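//
// Usage sketch (illustrative only; the variable names x0..x3 and t0 are
// hypothetical -- see aes_saes32.c in this repo for the real round code):
// one output column of an AES encryption round is accumulated into a
// round key word with four SAES32 operations,
//
//   t0 = saes32_encsm(rk[0], x0, 0);
//   t0 = saes32_encsm(t0, x1, 1);
//   t0 = saes32_encsm(t0, x2, 2);
//   t0 = saes32_encsm(t0, x3, 3);
//
// where the byte-select indices 0..3 pick bytes from four different state
// words, implementing ShiftRows without any extra instructions.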
6 | 
7 | #include "saes32.h"
8 | #include "sboxes.h"
9 | 
10 | // Function codes
11 | 
12 | #define SAES32_ENCSM 0
13 | #define SAES32_ENCS 1
14 | #define SAES32_DECSM 2
15 | #define SAES32_DECS 3
16 | #define SSM4_ED 4
17 | #define SSM4_KS 5
18 | 
19 | // Multiply by 0x02 in AES's GF(256) - LFSR style
20 | 
21 | static inline uint8_t aes_xtime(uint8_t x)
22 | {
23 | return (x << 1) ^ ((x & 0x80) ? 0x11B : 0x00);
24 | }
25 | 
26 | // === THIS IS THE SINGLE LIGHTWEIGHT INSTRUCTION FOR AES AND SM4 ===
27 | 
28 | // SAES32: Instruction for a byte select, single S-box, and linear operation.
29 | 
30 | uint32_t saes32(uint32_t rs1, uint32_t rs2, int fn)
31 | {
32 | uint32_t fa, fb, x, x2, x4, x8;
33 | 
34 | fa = 8 * (fn & 3); // [1:0] byte select / rotate
35 | fb = (fn >> 2) & 7; // [4:2] cipher select
36 | 
37 | // select input byte
38 | 
39 | x = (rs2 >> fa) & 0xFF; // select byte
40 | 
41 | // 8->8 bit s-box
42 | 
43 | switch (fb) {
44 | 
45 | case SAES32_ENCSM: // 0 : AES Forward + MC
46 | case SAES32_ENCS: // 1 : AES Forward "key"
47 | x = aes_sbox[x];
48 | break;
49 | 
50 | case SAES32_DECSM: // 2 : AES Inverse + MC
51 | case SAES32_DECS: // 3 : AES Inverse "key"
52 | x = aes_isbox[x];
53 | break;
54 | 
55 | case SSM4_ED: // 4 : SM4 encrypt/decrypt
56 | case SSM4_KS: // 5 : SM4 key schedule
57 | x = sm4_sbox[x];
58 | break;
59 | 
60 | default: // none
61 | break;
62 | }
63 | 
64 | // 8->32 bit linear transforms expressed as little-endian
65 | 
66 | switch (fb) {
67 | 
68 | case SAES32_ENCSM: // 0 : AES Forward MixCol
69 | x2 = aes_xtime(x); // double x
70 | x = ((x ^ x2) << 24) | // 0x03 MixCol MDS Matrix
71 | (x << 16) | // 0x01
72 | (x << 8) | // 0x01
73 | x2; // 0x02
74 | break;
75 | 
76 | case SAES32_DECSM: // 2 : AES Inverse MixCol
77 | x2 = aes_xtime(x); // double x
78 | x4 = aes_xtime(x2); // double to 4*x
79 | x8 = aes_xtime(x4); // double to 8*x
80 | x = ((x ^ x2 ^ x8) << 24) | // 0x0B Inv MixCol MDS Matrix
81 | ((x ^ x4 ^ x8) << 16) | // 0x0D
82 | ((x ^ x8) << 8) | // 0x09
83 | (x2 ^ x4 ^ x8); // 0x0E
84 | break;
85 | 
86 | case SSM4_ED: // 4 : SM4 linear transform L
87 | x = x ^ (x << 8) ^ (x << 2) ^ (x << 18) ^
88 | ((x & 0x3F) << 26) ^ ((x & 0xC0) << 10);
89 | break;
90 | 
91 | case SSM4_KS: // 5 : SM4 transform L' (key)
92 | x = x ^ ((x & 0x07) << 29) ^ ((x & 0xFE) << 7) ^
93 | ((x & 1) << 23) ^ ((x & 0xF8) << 13);
94 | break;
95 | 
96 | default: // none
97 | break;
98 | 
99 | }
100 | 
101 | // rotate output left by fa bits
102 | 
103 | if (fa != 0) {
104 | x = (x << fa) | (x >> (32 - fa));
105 | }
106 | 
107 | return x ^ rs1; // XOR with rs1
108 | }
109 | 
110 | // === PSEUDO OPS ===
111 | 
112 | // AES Encryption
113 | 
114 | uint32_t saes32_encsm(uint32_t rs1, uint32_t rs2, int bs)
115 | {
116 | return saes32(rs1, rs2, (SAES32_ENCSM << 2) | bs);
117 | }
118 | 
119 | uint32_t saes32_encs(uint32_t rs1, uint32_t rs2, int bs)
120 | {
121 | return saes32(rs1, rs2, (SAES32_ENCS << 2) | bs);
122 | }
123 | 
124 | // AES Decryption
125 | 
126 | uint32_t saes32_decsm(uint32_t rs1, uint32_t rs2, int bs)
127 | {
128 | return saes32(rs1, rs2, (SAES32_DECSM << 2) | bs);
129 | }
130 | 
131 | uint32_t saes32_decs(uint32_t rs1, uint32_t rs2, int bs)
132 | {
133 | return saes32(rs1, rs2, (SAES32_DECS << 2) | bs);
134 | }
135 | 
136 | // SM4 Encryption, Decryption and Key Schedule
137 | 
138 | uint32_t ssm4_ed(uint32_t rs1, uint32_t rs2, int bs)
139 | {
140 | return saes32(rs1, rs2, (SSM4_ED << 2) | bs);
141 | }
142 | 
143 | uint32_t ssm4_ks(uint32_t rs1, uint32_t rs2, int bs)
144 | {
145 | return saes32(rs1, rs2, (SSM4_KS << 2) | bs);
146 | }
147 | 
--------------------------------------------------------------------------------
/saes32.h:
--------------------------------------------------------------------------------
1 | // saes32.h
2 | // 2020-01-27 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Prototypes for SAES32 -- replace with intrinsics.
6 | 
7 | #ifndef _SAES32_H_
8 | #define _SAES32_H_
9 | 
10 | #include <stdint.h>
11 | 
12 | // Hardware simulation:
13 | // SAES32: Instruction for a byte select, single S-box, and linear operation.
14 | 
15 | uint32_t saes32(uint32_t rs1, uint32_t rs2, int fn);
16 | 
17 | // === (Pseudo) Instructions ===
18 | 
19 | // AES Encryption
20 | 
21 | uint32_t saes32_encsm(uint32_t rs1, uint32_t rs2, int bs);
22 | uint32_t saes32_encs(uint32_t rs1, uint32_t rs2, int bs);
23 | 
24 | // AES Decryption
25 | 
26 | uint32_t saes32_decsm(uint32_t rs1, uint32_t rs2, int bs);
27 | uint32_t saes32_decs(uint32_t rs1, uint32_t rs2, int bs);
28 | 
29 | // SM4 Encryption, Decryption and Key Schedule
30 | 
31 | uint32_t ssm4_ed(uint32_t rs1, uint32_t rs2, int bs);
32 | uint32_t ssm4_ks(uint32_t rs1, uint32_t rs2, int bs);
33 | 
34 | #endif // _SAES32_H_
35 | 
--------------------------------------------------------------------------------
/saes64.c:
--------------------------------------------------------------------------------
1 | // saes64.c
2 | // 2020-05-03 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Emulation code for SAES64
6 | 
7 | #include "saes64.h"
8 | #include "sboxes.h"
9 | 
10 | // ( Multiply by 0x02 in AES's GF(256) - LFSR style )
11 | 
12 | static inline uint8_t aes_xtime(uint8_t x)
13 | {
14 | return (x << 1) ^ ((x & 0x80) ? 0x11B : 0x00);
15 | }
16 | 
17 | // ( MixColumns functions )
18 | 
19 | static inline uint32_t saes64_mc8(uint32_t x)
20 | {
21 | uint32_t x2;
22 | 
23 | x2 = aes_xtime(x); // double x
24 | x = ((x ^ x2) << 24) | // 0x03 MixCol MDS Matrix
25 | (x << 16) | // 0x01
26 | (x << 8) | // 0x01
27 | x2; // 0x02
28 | 
29 | return x;
30 | }
31 | 
32 | static uint32_t saes64_mc32(uint32_t x)
33 | {
34 | uint32_t y;
35 | 
36 | y = saes64_mc8((x >> 24) & 0xFF);
37 | y = (y << 8) | (y >> 24);
38 | y ^= saes64_mc8((x >> 16) & 0xFF);
39 | y = (y << 8) | (y >> 24);
40 | y ^= saes64_mc8((x >> 8) & 0xFF);
41 | y = (y << 8) | (y >> 24);
42 | y ^= saes64_mc8(x & 0xFF);
43 | 
44 | return y;
45 | }
46 | 
47 | // SAES64.ENCS: Half of ShiftRows and SubBytes (last round)
48 | 
49 | uint64_t saes64_encs(uint64_t rs1, uint64_t rs2)
50 | {
51 | return ((uint64_t) aes_sbox[rs1 & 0xFF]) |
52 | (((uint64_t) aes_sbox[(rs1 >> 40) & 0xFF]) << 8) |
53 | (((uint64_t) aes_sbox[(rs2 >> 16) & 0xFF]) << 16) |
54 | (((uint64_t) aes_sbox[(rs2 >> 56) & 0xFF]) << 24) |
55 | (((uint64_t) aes_sbox[(rs1 >> 32) & 0xFF]) << 32) |
56 | (((uint64_t) aes_sbox[(rs2 >> 8) & 0xFF]) << 40) |
57 | (((uint64_t) aes_sbox[(rs2 >> 48) & 0xFF]) << 48) |
58 | (((uint64_t) aes_sbox[(rs1 >> 24) & 0xFF]) << 56);
59 | }
60 | 
61 | // SAES64.ENCSM: Half of ShiftRows, SubBytes, and MixColumns
62 | 
63 | uint64_t saes64_encsm(uint64_t rs1, uint64_t rs2)
64 | {
65 | uint64_t x;
66 | 
67 | // ShiftRows and SubBytes
68 | x = saes64_encs(rs1, rs2);
69 | 
70 | // MixColumns
71 | x = ((uint64_t) saes64_mc32(x)) |
72 | (((uint64_t) saes64_mc32(x >> 32)) << 32);
73 | 
74 | return x;
75 | }
76 | 
77 | // SAES64.DECS: Half of Inverse ShiftRows and SubBytes (last round)
78 | 
79 | uint64_t saes64_decs(uint64_t rs1, uint64_t rs2)
80 | {
81 | return ((uint64_t) aes_isbox[rs1 & 0xFF]) |
82 | (((uint64_t) aes_isbox[(rs2 >> 40) & 0xFF]) << 8) |
83 | (((uint64_t) aes_isbox[(rs2 >> 16) & 0xFF]) << 16) |
84 | (((uint64_t) aes_isbox[(rs1 >> 56) & 0xFF]) << 24) |
85 | (((uint64_t) aes_isbox[(rs1 >> 32) & 0xFF]) << 32) |
86 | (((uint64_t) aes_isbox[(rs1 >> 8) & 0xFF]) << 40) |
87 | (((uint64_t) aes_isbox[(rs2 >> 48) & 0xFF]) << 48) |
88 | (((uint64_t) aes_isbox[(rs2 >> 24) & 0xFF]) << 56);
89 | }
90 | 
91 | // SAES64.DECSM: Half of Inverse ShiftRows, SubBytes, and MixColumns
92 | 
93 | uint64_t saes64_decsm(uint64_t rs1, uint64_t rs2)
94 | {
95 | uint64_t x;
96 | 
97 | x = saes64_decs(rs1, rs2); // Inverse ShiftRows, SubBytes
98 | x = saes64_imix(x); // Inverse MixColumns
99 | 
100 | return x;
101 | }
102 | 
103 | // ( Inverse MixColumns functions )
104 | 
105 | static inline uint32_t saes64_imc8(uint32_t x)
106 | {
107 | uint32_t x2, x4, x8;
108 | 
109 | x2 = aes_xtime(x); // double x
110 | x4 = aes_xtime(x2); // double to 4*x
111 | x8 = aes_xtime(x4); // double to 8*x
112 | 
113 | x = ((x ^ x2 ^ x8) << 24) | // 0x0B Inv MixCol MDS Matrix
114 | ((x ^ x4 ^ x8) << 16) | // 0x0D
115 | ((x ^ x8) << 8) | // 0x09
116 | (x2 ^ x4 ^ x8); // 0x0E
117 | 
118 | return x;
119 | }
120 | 
121 | static uint32_t saes64_imc32(uint32_t x)
122 | {
123 | uint32_t y;
124 | 
125 | y = saes64_imc8((x >> 24) & 0xFF);
126 | y = (y << 8) | (y >> 24);
127 | y ^= saes64_imc8((x >> 16) & 0xFF);
128 | y = (y << 8) | (y >> 24);
129 | y ^= saes64_imc8((x >> 8) & 0xFF);
130 | y = (y << 8) | (y >> 24);
131 | y ^= saes64_imc8(x & 0xFF);
132 | 
133 | return y;
134 | }
135 | 
136 | // SAES64.IMIX: Inverse MixColumns for decryption key schedule
137 | 
138 | uint64_t saes64_imix(uint64_t rs1)
139 | {
140 | return ((uint64_t) saes64_imc32(rs1)) |
141 | (((uint64_t) saes64_imc32(rs1 >> 32)) << 32);
142 | }
143 | 
144 | // SAES64.KS1: Key Schedule 1 -- SubWord and opt. rotation, round const
145 | 
146 | uint64_t saes64_ks1(uint64_t rs1, uint8_t i)
147 | {
148 | uint32_t t, rc;
149 | 
150 | t = rs1 >> 32;
151 | rc = 0;
152 | 
153 | if (i < 10) { // i == 10: skip rotation and round constant
154 | t = (t >> 8) | (t << 24); // t = ROR(t, 8)
155 | rc = aes_rcon[i]; // round constant
156 | }
157 | // SubWord
158 | t = ((uint32_t) aes_sbox[t & 0xFF]) |
159 | (((uint32_t) aes_sbox[(t >> 8) & 0xFF]) << 8) |
160 | (((uint32_t) aes_sbox[(t >> 16) & 0xFF]) << 16) |
161 | (((uint32_t) aes_sbox[(t >> 24) & 0xFF]) << 24);
162 | 
163 | t ^= rc;
164 | 
165 | return ((uint64_t) t) | (((uint64_t) t) << 32);
166 | }
167 | 
168 | // SAES64.KS2: Key Schedule 2 -- Linear expansion
169 | 
170 | uint64_t saes64_ks2(uint64_t rs1, uint64_t rs2)
171 | {
172 | uint32_t t;
173 | 
174 | t = (rs1 >> 32) ^ (rs2 & 0xFFFFFFFF); // 32 bits
175 | 
176 | return ((uint64_t) t) ^ // low 32 bits
177 | (((uint64_t) t) << 32) ^ (rs2 & 0xFFFFFFFF00000000LL);
178 | }
179 | 
--------------------------------------------------------------------------------
/saes64.h:
--------------------------------------------------------------------------------
1 | // saes64.h
2 | // 2020-05-02 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Prototypes for SAES64 -- replace with intrinsics.
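//
// Usage sketch (illustrative only; q0/q1 hold the 128-bit state and k0/k1
// a round key -- these names are not part of the API): after the initial
// AddRoundKey, one middle encryption round of AES is
//
//   t0 = saes64_encsm(q0, q1);
//   t1 = saes64_encsm(q1, q0);
//   q0 = t0 ^ k0;
//   q1 = t1 ^ k1;
//
// with saes64_encs() substituted in the final round (no MixColumns).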
6 | 
7 | #ifndef _SAES64_H_
8 | #define _SAES64_H_
9 | 
10 | #include <stdint.h>
11 | 
12 | // === (Pseudo) Instructions ===
13 | 
14 | // SAES64.ENCSM: Half of ShiftRows, SubBytes, and MixColumns
15 | uint64_t saes64_encsm(uint64_t rs1, uint64_t rs2);
16 | 
17 | // SAES64.ENCS: Half of ShiftRows and SubBytes (last round)
18 | uint64_t saes64_encs(uint64_t rs1, uint64_t rs2);
19 | 
20 | // SAES64.DECSM: Half of Inverse ShiftRows, SubBytes, and MixColumns
21 | uint64_t saes64_decsm(uint64_t rs1, uint64_t rs2);
22 | 
23 | // SAES64.DECS: Half of Inverse ShiftRows and SubBytes (last round)
24 | uint64_t saes64_decs(uint64_t rs1, uint64_t rs2);
25 | 
26 | // SAES64.IMIX: Inverse MixColumns for decryption key schedule
27 | uint64_t saes64_imix(uint64_t rs1);
28 | 
29 | // SAES64.KS1: Key Schedule 1 -- SubWord and opt. rotation, round const
30 | uint64_t saes64_ks1(uint64_t rs1, uint8_t i);
31 | 
32 | // SAES64.KS2: Key Schedule 2 -- Linear expansion
33 | uint64_t saes64_ks2(uint64_t rs1, uint64_t rs2);
34 | 
35 | #endif // _SAES64_H_
36 | 
--------------------------------------------------------------------------------
/sboxes.c:
--------------------------------------------------------------------------------
1 | // sboxes.c
2 | // 2020-05-05 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Data for AES and SM4.
6 | 
7 | #include "sboxes.h"
8 | 
9 | // AES Round Constants
10 | 
11 | const uint8_t aes_rcon[] = {
12 | 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
13 | };
14 | 
15 | // AES Forward S-Box
16 | 
17 | const uint8_t aes_sbox[256] = {
18 | 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B,
19 | 0xFE, 0xD7, 0xAB, 0x76, 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0,
20 | 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, 0xB7, 0xFD, 0x93, 0x26,
21 | 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
22 | 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2,
23 | 0xEB, 0x27, 0xB2, 0x75, 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0,
24 | 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, 0x53, 0xD1, 0x00, 0xED,
25 | 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
26 | 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F,
27 | 0x50, 0x3C, 0x9F, 0xA8, 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5,
28 | 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, 0xCD, 0x0C, 0x13, 0xEC,
29 | 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
30 | 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14,
31 | 0xDE, 0x5E, 0x0B, 0xDB, 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C,
32 | 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, 0xE7, 0xC8, 0x37, 0x6D,
33 | 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
34 | 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F,
35 | 0x4B, 0xBD, 0x8B, 0x8A, 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E,
36 | 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, 0xE1, 0xF8, 0x98, 0x11,
37 | 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
38 | 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F,
39 | 0xB0, 0x54, 0xBB, 0x16
40 | };
41 | 
42 | // AES Inverse S-Box
43 | 
44 | const uint8_t aes_isbox[256] = {
45 | 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E,
46 | 0x81, 0xF3, 0xD7, 0xFB, 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87,
47 | 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB,
0x54, 0x7B, 0x94, 0x32, 48 | 0xA6, 0xC2, 0x23, 0x3D, 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E, 49 | 0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, 0x76, 0x5B, 0xA2, 0x49, 50 | 0x6D, 0x8B, 0xD1, 0x25, 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, 51 | 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, 0x6C, 0x70, 0x48, 0x50, 52 | 0xFD, 0xED, 0xB9, 0xDA, 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84, 53 | 0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, 0xF7, 0xE4, 0x58, 0x05, 54 | 0xB8, 0xB3, 0x45, 0x06, 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, 55 | 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, 0x3A, 0x91, 0x11, 0x41, 56 | 0x4F, 0x67, 0xDC, 0xEA, 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73, 57 | 0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, 0xE2, 0xF9, 0x37, 0xE8, 58 | 0x1C, 0x75, 0xDF, 0x6E, 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, 59 | 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, 0xFC, 0x56, 0x3E, 0x4B, 60 | 0xC6, 0xD2, 0x79, 0x20, 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4, 61 | 0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, 0xB1, 0x12, 0x10, 0x59, 62 | 0x27, 0x80, 0xEC, 0x5F, 0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, 63 | 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, 0xA0, 0xE0, 0x3B, 0x4D, 64 | 0xAE, 0x2A, 0xF5, 0xB0, 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61, 65 | 0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, 0xE1, 0x69, 0x14, 0x63, 66 | 0x55, 0x21, 0x0C, 0x7D 67 | }; 68 | 69 | // SM4 Forward S-Box (there is no need for an inverse S-Box) 70 | 71 | const uint8_t sm4_sbox[256] = { 72 | 0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 73 | 0x28, 0xFB, 0x2C, 0x05, 0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 74 | 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, 0x9C, 0x42, 0x50, 0xF4, 75 | 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, 76 | 0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 77 | 0x75, 0x8F, 0x3F, 0xA6, 0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 78 | 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, 0x68, 0x6B, 0x81, 0xB2, 79 | 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, 80 | 0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 81 | 0x01, 0x21, 0x78, 0x87, 0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 82 | 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, 0xEA, 0xBF, 0x8A, 0xD2, 83 | 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, 84 | 0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 85 | 0xF5, 0x8C, 0xB1, 0xE3, 0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 86 | 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, 0xD5, 0xDB, 0x37, 0x45, 87 | 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, 88 | 0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 89 | 0x1F, 0x10, 0x5A, 0xD8, 0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 90 | 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, 0x89, 0x69, 0x97, 0x4A, 91 | 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, 92 | 0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 93 | 0xD7, 0xCB, 0x39, 0x48 94 | }; 95 | -------------------------------------------------------------------------------- /sboxes.h: -------------------------------------------------------------------------------- 1 | // sboxes.h 2 | // 2020-05-05 Markku-Juhani O. Saarinen 3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved. 4 | 5 | // Data for AES and SM4. 
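// (The round constants aes_rcon[i] are x^i in AES's GF(256), i = 0..9,
// per FIPS 197.)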
6 | 
7 | #ifndef _SBOXES_H_
8 | #define _SBOXES_H_
9 | 
10 | #include <stdint.h>
11 | 
12 | // AES Round Constants
13 | extern const uint8_t aes_rcon[];
14 | 
15 | // AES Forward S-Box
16 | extern const uint8_t aes_sbox[256];
17 | 
18 | // AES Inverse S-Box
19 | extern const uint8_t aes_isbox[256];
20 | 
21 | // SM4 Forward S-Box (there is no need for an inverse S-Box)
22 | extern const uint8_t sm4_sbox[256];
23 | 
24 | #endif // _SBOXES_H_
25 | 
--------------------------------------------------------------------------------
/sm4_ssm4.c:
--------------------------------------------------------------------------------
1 | // sm4_ssm4.c
2 | // 2020-01-27 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // SM4 (Chinese Encryption Standard) Encryption and Decryption.
6 | 
7 | #include "sm4_wrap.h"
8 | #include "saes32.h"
9 | #include "rv_endian.h"
10 | 
11 | // SSM4_ED_X4 is a block of four ssm4.ed instructions:
12 | 
13 | #define SSM4_ED_X4(rs1, rs2) { \
14 | rs1 = ssm4_ed(rs1, rs2, 0); \
15 | rs1 = ssm4_ed(rs1, rs2, 1); \
16 | rs1 = ssm4_ed(rs1, rs2, 2); \
17 | rs1 = ssm4_ed(rs1, rs2, 3); \
18 | }
19 | 
20 | // SSM4_KS_X4 is a block of four ssm4.ks instructions:
21 | 
22 | #define SSM4_KS_X4(rs1, rs2) { \
23 | rs1 = ssm4_ks(rs1, rs2, 0); \
24 | rs1 = ssm4_ks(rs1, rs2, 1); \
25 | rs1 = ssm4_ks(rs1, rs2, 2); \
26 | rs1 = ssm4_ks(rs1, rs2, 3); \
27 | }
28 | 
29 | // encrypt or decrypt a block, depending on round key ordering
30 | 
31 | void sm4_encdec(uint8_t out[16], const uint8_t in[16],
32 | const uint32_t rk[SM4_RK_WORDS])
33 | {
34 | uint32_t x0, x1, x2, x3, t, u;
35 | const uint32_t *kp = &rk[SM4_RK_WORDS];
36 | 
37 | x0 = get32u_le(in); // little endian (native)
38 | x1 = get32u_le(in + 4);
39 | x2 = get32u_le(in + 8);
40 | x3 = get32u_le(in + 12);
41 | 
42 | do {
43 | 
44 | u = x2 ^ x3; // 10 XORs total per round
45 | 
46 | t = rk[0]; // subkeys can be inline
47 | t ^= u;
48 | t ^= x1;
49 | SSM4_ED_X4(x0, t); // 4 x SSM4.ED
50 | 
51 | t = rk[1];
52 | t ^= u;
53 | t ^= x0;
54 | SSM4_ED_X4(x1, t); // 4 x SSM4.ED
55 | u = x0 ^ x1;
56 | 
57 | t = rk[2];
58 | t ^= u;
59 | t ^= x3;
60 | SSM4_ED_X4(x2, t); // 4 x SSM4.ED
61 | 
62 | t = rk[3];
63 | t ^= u;
64 | t ^= x2;
65 | SSM4_ED_X4(x3, t); // 4 x SSM4.ED
66 | 
67 | rk += 4; // unroll?
68 | 
69 | } while (rk != kp);
70 | 
71 | put32u_le(out, x3);
72 | put32u_le(out + 4, x2);
73 | put32u_le(out + 8, x1);
74 | put32u_le(out + 12, x0);
75 | }
76 | 
77 | // set key for encryption
78 | 
79 | void sm4_enc_key(uint32_t rk[SM4_RK_WORDS], const uint8_t key[16])
80 | {
81 | const uint32_t *kp = &rk[SM4_RK_WORDS];
82 | uint32_t x0, x1, x2, x3;
83 | uint32_t t, u, ck;
84 | 
85 | x0 = get32u_le(key); // fetch key words
86 | x1 = get32u_le(key + 4);
87 | x2 = get32u_le(key + 8);
88 | x3 = get32u_le(key + 12);
89 | 
90 | x0 ^= 0xC6BAB1A3; // "FK" constants, little-endian
91 | x1 ^= 0x5033AA56; // (note: seems pointless?)
92 | x2 ^= 0x97917D67;
93 | x3 ^= 0xDC2270B2;
94 | 
95 | ck = 0x140E0600; // 0x150E0700 with LSBs masked
96 | 
97 | do {
98 | /*
99 | "CK" Discussion:
100 | 
101 | The SM4 "CK" round constants are a sequence of bytes 7*i (mod 256) with
102 | i = 0..127, interpreted as 32-bit words. Often these words are stored in
103 | a constant table. However, many ISAs have a "SIMD" addition that adds 4 or
104 | more bytes in parallel, which is faster than a table look-up. Even some
105 | low-end embedded targets such as Cortex M4 (Armv7E-M/DSP) support this
106 | (SADD8) and its introduction as a RISC-V extension should be considered.
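As a worked example: the first round below computes t = 0x140E0600 ^
0x01000100 = 0x150E0700, whose little-endian bytes 00 07 0E 15 are exactly
CK[0] = 0x00070E15; the masked addition of 0x1C1C1C1C then advances every
byte by 4*7 = 28 to form the next constant word.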
107 |       Meanwhile, we can perform the same function with three simple arithmetic
108 |       ops, which is likely to still be faster than fetching from a table
109 |       (with its associated address arithmetic). This implementation is
            certainly smaller.
110 |       */
111 |       t = ck ^ 0x01000100;            // these constants in registers
112 |       ck += 0x1C1C1C1C;               // if we have "SADD8", then
113 |       ck &= 0xFEFEFEFE;               // -> 4 x "SADD8" per round.
114 | 
115 |       u = x2 ^ x3;                    // 10 XORs per round
116 |       t = t ^ u;
117 |       t = t ^ x1;
118 |       SSM4_KS_X4(x0, t);              // 4 x SSM4.KS
119 | 
120 |       rk[0] = x0;                     // four stores per round
121 | 
122 |       t = ck ^ 0x01000100;
123 |       ck += 0x1C1C1C1C;
124 |       ck &= 0xFEFEFEFE;
125 | 
126 |       t = t ^ u;
127 |       t = t ^ x0;
128 |       SSM4_KS_X4(x1, t);              // 4 x SSM4.KS
129 |       rk[1] = x1;
130 | 
131 |       t = ck ^ 0x01000100;
132 |       ck += 0x1C1C1C1C;
133 |       ck &= 0xFEFEFEFE;
134 | 
135 |       u = x0 ^ x1;
136 |       t ^= u;
137 |       t ^= x3;
138 |       SSM4_KS_X4(x2, t);              // 4 x SSM4.KS
139 |       rk[2] = x2;
140 | 
141 |       t = ck ^ 0x01000100;
142 |       ck += 0x1C1C1C1C;
143 |       ck &= 0xFEFEFEFE;
144 | 
145 |       t ^= u;
146 |       t ^= x2;
147 |       SSM4_KS_X4(x3, t);              // 4 x SSM4.KS
148 |       rk[3] = x3;
149 | 
150 |       rk += 4;
151 | 
152 |   } while (rk != kp);
153 | }
154 | 
155 | // set key for decryption
156 | 
157 | void sm4_dec_key(uint32_t rk[SM4_RK_WORDS], const uint8_t key[16])
158 | {
159 |     uint32_t t;
160 |     int i, j;
161 | 
162 |     sm4_enc_key(rk, key);               // encryption expansion
163 | 
164 |     // decryption round keys = encryption round keys in reverse order
165 |     for (i = 0, j = SM4_RK_WORDS - 1; i < j; i++, j--) {
166 |         t = rk[i];
167 |         rk[i] = rk[j];
168 |         rk[j] = t;
169 |     }
170 | }
--------------------------------------------------------------------------------
/sm4_test.c:
--------------------------------------------------------------------------------
1 | // sm4_test.c
2 | // 2020-03-21 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Unit tests for SM4
6 | 
7 | #include "test_hex.h"
8 | #include "sm4_wrap.h"
9 | 
10 | // Test SM4
11 | 
12 | int test_sm4()
13 | {
14 |     uint8_t pt[16], ct[16], xt[16], key[16];
15 |     uint32_t rk[SM4_RK_WORDS];
16 |     int fail = 0;
17 | 
18 |     // the sole test vector in the standard itself
19 |     readhex(key, sizeof(key), "0123456789ABCDEFFEDCBA9876543210");
20 |     sm4_enc_key(rk, key);
21 |     readhex(pt, sizeof(pt), "0123456789ABCDEFFEDCBA9876543210");
22 |     sm4_enc_ecb(ct, pt, rk);
23 |     fail += chkhex("SM4 Encrypt", ct, 16, "681EDF34D206965E86B3E94F536E4246");
24 |     sm4_dec_key(rk, key);
25 |     sm4_enc_ecb(xt, ct, rk);            // decrypt via the encrypt alias
                                            // (both map to sm4_encdec)
26 |     fail += chkhex("SM4 Decrypt", xt, 16, "0123456789ABCDEFFEDCBA9876543210");
27 | 
28 |     // additional test vectors from various sources..
29 |    readhex(key, sizeof(key), "FEDCBA98765432100123456789ABCDEF");
30 |    sm4_enc_key(rk, key);
31 |    readhex(pt, sizeof(pt), "000102030405060708090A0B0C0D0E0F");
32 |    sm4_enc_ecb(ct, pt, rk);
33 |    fail += chkhex("SM4 Encrypt", ct, 16, "F766678F13F01ADEAC1B3EA955ADB594");
34 |    sm4_dec_key(rk, key);
35 |    sm4_dec_ecb(xt, ct, rk);
36 |    fail += chkhex("SM4 Decrypt", xt, 16, "000102030405060708090A0B0C0D0E0F");
37 | 
38 |    readhex(key, sizeof(key), "EB23ADD6454757555747395B76661C9A");
39 |    sm4_enc_key(rk, key);
40 |    readhex(pt, sizeof(pt), "D294D879A1F02C7C5906D6C2D0C54D9F");
41 |    sm4_enc_ecb(ct, pt, rk);
42 |    fail += chkhex("SM4 Encrypt", ct, 16, "865DE90D6B6E99273E2D44859D9C16DF");
43 |    sm4_dec_key(rk, key);
44 |    sm4_dec_ecb(xt, ct, rk);
45 |    fail += chkhex("SM4 Decrypt", xt, 16, "D294D879A1F02C7C5906D6C2D0C54D9F");
46 | 
47 |    readhex(key, sizeof(key), "F11235535318FA844A3CBE643169F59E");
48 |    sm4_enc_key(rk, key);
49 |    readhex(pt, sizeof(pt), "A27EE076E48E6F389710EC7B5E8A3BE5");
50 |    sm4_enc_ecb(ct, pt, rk);
51 |    fail += chkhex("SM4 Encrypt", ct, 16, "94CFE3F59E8507FEC41DBE738CCD53E1");
52 |    sm4_dec_key(rk, key);
53 |    sm4_dec_ecb(xt, ct, rk);
54 |    fail += chkhex("SM4 Decrypt", xt, 16, "A27EE076E48E6F389710EC7B5E8A3BE5");
55 | 
56 |    return fail;
57 | }
--------------------------------------------------------------------------------
/sm4_wrap.h:
--------------------------------------------------------------------------------
1 | // sm4_wrap.h
2 | // 2020-01-24 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // Prototypes for SM4 (Chinese Encryption Standard) Encryption.
6 | 
7 | // The decryption function is the same as encryption, with the difference
8 | // of having a reversed key schedule. Hence we define both functions here.
9 | 
10 | #ifndef _SM4_WRAP_H_
11 | #define _SM4_WRAP_H_
12 | 
13 | #include <stdint.h>
14 | 
15 | // Size of the expanded key.
16 | #define SM4_RK_WORDS 32
17 | 
18 | // encrypt/decrypt a block, depending on ordering of rk
19 | void sm4_encdec(uint8_t out[16], const uint8_t in[16],
20 |                 const uint32_t rk[SM4_RK_WORDS]);
21 | 
22 | // expand a secret key for encryption
23 | void sm4_enc_key(uint32_t rk[SM4_RK_WORDS], const uint8_t key[16]);
24 | 
25 | // expand a secret key for decryption
26 | void sm4_dec_key(uint32_t rk[SM4_RK_WORDS], const uint8_t key[16]);
27 | 
28 | // aliases
29 | #define sm4_enc_ecb(ct, pt, rk) sm4_encdec(ct, pt, rk)
30 | #define sm4_dec_ecb(pt, ct, rk) sm4_encdec(pt, ct, rk)
31 | 
32 | #endif // _SM4_WRAP_H_
--------------------------------------------------------------------------------
/test_hex.c:
--------------------------------------------------------------------------------
1 | // test_hex.c
2 | // 2020-03-07 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
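//
// (Typical usage, as in sm4_test.c above: readhex() parses a hex test
//  vector into a byte buffer, and chkhex() compares computed output
//  against a reference hex string, printing a [PASS]/[FAIL] line and
//  returning nonzero on mismatch:
//
//      readhex(key, sizeof(key), "0123456789ABCDEFFEDCBA9876543210");
//      fail += chkhex("SM4 Encrypt", ct, 16, "681EDF34...");
//  )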
4 | 
5 | // functions to facilitate simple runtime tests
6 | 
7 | #include "test_hex.h"
8 | 
9 | // single hex digit
10 | 
11 | static int hexdigit(char ch)
12 | {
13 |     if (ch >= '0' && ch <= '9')
14 |         return ch - '0';
15 |     if (ch >= 'A' && ch <= 'F')
16 |         return ch - 'A' + 10;
17 |     if (ch >= 'a' && ch <= 'f')
18 |         return ch - 'a' + 10;
19 |     return -1;
20 | }
21 | 
22 | // read a hex string of "maxbytes", return byte length
23 | 
24 | size_t readhex(uint8_t * buf, size_t maxbytes, const char *str)
25 | {
26 |     size_t i;
27 |     int h, l;
28 | 
29 |     for (i = 0; i < maxbytes; i++) {
30 |         h = hexdigit(str[2 * i]);
31 |         if (h < 0)
32 |             return i;
33 |         l = hexdigit(str[2 * i + 1]);
34 |         if (l < 0)
35 |             return i;
36 |         buf[i] = (h << 4) + l;
37 |     }
38 | 
39 |     return i;
40 | }
41 | 
42 | // print hexadecimal "data", length "len", with label "lab"
43 | 
44 | void prthex(const char *lab, const void *data, size_t len)
45 | {
46 |     size_t i;
47 |     uint8_t x;
48 | 
49 |     printf("[TEST] %s ", lab);
50 |     const char hex[] = "0123456789ABCDEF";
51 | 
52 |     for (i = 0; i < len; i++) {
53 |         x = ((const uint8_t *) data)[i];
54 |         putchar(hex[(x >> 4) & 0xF]);
55 |         putchar(hex[x & 0xF]);
56 |     }
57 |     putchar('\n');
58 | }
59 | 
60 | // check "data" of "len" bytes against a hexadecimal test vector "ref"
61 | 
62 | int chkhex(const char *lab, const void *data, size_t len, const char *ref)
63 | {
64 |     size_t i;
65 |     uint8_t x;
66 |     int fail = 0;
67 | 
68 |     // check equivalence
69 |     for (i = 0; i < len; i++) {
70 |         x = ((const uint8_t *) data)[i];
71 |         if (hexdigit(ref[2 * i]) != ((x >> 4) & 0xF) ||
72 |             hexdigit(ref[2 * i + 1]) != (x & 0x0F)) {
73 |             fail = 1;
74 |             break;
75 |         }
76 |     }
77 | 
78 |     // also fail if "ref" continues past "len" bytes
79 |     if (i == len && hexdigit(ref[2 * len]) >= 0) {
80 |         fail = 1;
81 |     }
82 | 
83 |     printf("[%s] %s %s\n", fail ? "FAIL" : "PASS", lab, ref);
84 | 
85 |     if (fail) {
86 |         prthex(lab, data, len);
87 |     }
88 | 
89 |     return fail;
90 | }
91 | 
92 | // boolean return value check
93 | 
94 | int chkret(const char *lab, int want, int have)
95 | {
96 |     printf("[%s] %s WANT=%d HAVE=%d\n",
97 |            want != have ? "FAIL" : "PASS", lab, want, have);
98 | 
99 |     return want != have ? 1 : 0;
100 | }
--------------------------------------------------------------------------------
/test_hex.h:
--------------------------------------------------------------------------------
1 | // test_hex.h
2 | // 2020-03-07 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
4 | 
5 | // functions to facilitate simple runtime tests
6 | 
7 | #ifndef _TEST_HEX_H_
8 | #define _TEST_HEX_H_
9 | 
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | #include <string.h>
13 | #include <stdint.h>
14 | 
15 | // read a hex string of "maxbytes", return byte length
16 | size_t readhex(uint8_t * buf, size_t maxbytes, const char *str);
17 | 
18 | // print hexadecimal "data", length "len", with label "lab"
19 | void prthex(const char *lab, const void *data, size_t len);
20 | 
21 | // check "data" of "len" bytes against a hexadecimal test vector "ref"
22 | int chkhex(const char *lab, const void *data, size_t len, const char *ref);
23 | 
24 | // boolean return value check
25 | int chkret(const char *lab, int want, int have);
26 | 
27 | #endif
28 | 
--------------------------------------------------------------------------------
/test_main.c:
--------------------------------------------------------------------------------
1 | // test_main.c
2 | // 2020-01-23 Markku-Juhani O. Saarinen
3 | // Copyright (c) 2020, PQShield Ltd. All rights reserved.
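//
// (Orientation note: the AES block/key functions and the GHASH multiply
//  are reached through function pointers -- aes128_enc_key, ghash_mul,
//  etc., from the wrapper headers -- so main() below can re-run the same
//  test vectors with each implementation (SAES32, SAES64, on-the-fly
//  SAES64 keying, and the GHASH variants) as the unit under test.)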
4 | 
5 | // Minimal unit tests for AES-128/192/256 (FIPS 197) and SM4 (GM/T 0002-2012).
6 | 
7 | #include <stdio.h>
8 | #include <stdlib.h>
9 | #include <string.h>
10 | #include <stdint.h>
11 | 
12 | #include "aes_wrap.h"
13 | #include "saes32.h"
14 | #include "aes_saes32.h"
15 | #include "aes_saes64.h"
16 | #include "aes_otf_saes64.h"
17 | 
18 | #include "gcm_wrap.h"
19 | #include "gcm_gfmul.h"
20 | 
21 | 
22 | // unit tests
23 | 
24 | int test_aes();                         // aes_test.c
25 | int test_sm4();                         // sm4_test.c
26 | int test_gcm();                         // gcm_test.c
27 | 
28 | // generate "reference" hw testbench data for the instruction
29 | // output should match hdl/saes32_tb.v
30 | 
31 | int test_hwtb()
32 | {
33 |     uint32_t rd, rs1, rs2, fn;
34 | 
35 |     rs1 = 0x00000000;
36 |     rs2 = 0x00000000;
37 | 
38 |     for (fn = 0; fn < 24; fn++) {
39 | 
40 |         rd = saes32(rs1, rs2, fn);
41 | 
42 |         printf("[TB] rd=%08x rs1=%08x rs2=%08x fn=%02x\n", rd, rs1, rs2, fn);
43 | 
44 |         rs2 += 0x01234567;
45 |     }
46 | 
47 |     return 0;
48 | }
49 | 
50 | // stub main: run unit tests
51 | 
52 | int main(int argc, char **argv)
53 | {
54 |     int fail = 0;
55 | 
56 |     // generate hardware testbench data ?
57 |     if (argc > 1 && strcmp(argv[1], "tb") == 0) {
58 |         return test_hwtb();
59 |     }
60 |     // algorithm tests
61 | 
62 |     printf("[INFO] === AES using SAES32 ===\n");
63 | 
64 |     aes128_enc_key = aes128_enc_key_saes32;     // set encryption key
65 |     aes192_enc_key = aes192_enc_key_saes32;
66 |     aes256_enc_key = aes256_enc_key_saes32;
67 | 
68 |     aes128_enc_ecb = aes128_enc_ecb_saes32;     // encrypt a block
69 |     aes192_enc_ecb = aes192_enc_ecb_saes32;
70 |     aes256_enc_ecb = aes256_enc_ecb_saes32;
71 | 
72 |     aes128_dec_key = aes128_dec_key_saes32;     // set decryption key
73 |     aes192_dec_key = aes192_dec_key_saes32;
74 |     aes256_dec_key = aes256_dec_key_saes32;
75 | 
76 |     aes128_dec_ecb = aes128_dec_ecb_saes32;     // decrypt a block
77 |     aes192_dec_ecb = aes192_dec_ecb_saes32;
78 |     aes256_dec_ecb = aes256_dec_ecb_saes32;
79 | 
80 |     fail += test_aes();                 // run tests with UUT = SAES32
81 | 
82 |     printf("[INFO] === AES using SAES64 / On-the-fly keying ===\n");
83 | 
84 |     aes128_enc_ecb = aes128_enc_otf_saes64;
85 |     aes192_enc_ecb = aes192_enc_otf_saes64;
86 |     aes256_enc_ecb = aes256_enc_otf_saes64;
87 | 
88 |     fail += test_aes();                 // run tests with UUT = OTF/64
89 | 
90 |     printf("[INFO] === AES using SAES64 ===\n");
91 | 
92 |     aes128_enc_key = aes128_enc_key_saes64;     // set encryption key
93 |     aes192_enc_key = aes192_enc_key_saes64;
94 |     aes256_enc_key = aes256_enc_key_saes64;
95 | 
96 |     aes128_enc_ecb = aes128_enc_ecb_saes64;     // encrypt a block
97 |     aes192_enc_ecb = aes192_enc_ecb_saes64;
98 |     aes256_enc_ecb = aes256_enc_ecb_saes64;
99 | 
100 |    aes128_dec_key = aes128_dec_key_saes64;     // set decryption key
101 |    aes192_dec_key = aes192_dec_key_saes64;
102 |    aes256_dec_key = aes256_dec_key_saes64;
103 | 
104 |    aes128_dec_ecb = aes128_dec_ecb_saes64;     // decrypt a block
105 |    aes192_dec_ecb = aes192_dec_ecb_saes64;
106 |    aes256_dec_ecb = aes256_dec_ecb_saes64;
107 | 
108 |    fail += test_aes();                  // run tests with UUT = SAES64
109 | 
110 | 
111 | 
112 |    printf("[INFO] === GCM using rv64_ghash_mul() ===\n");
113 |    ghash_rev = rv64_ghash_rev;
114 |    ghash_mul = rv64_ghash_mul;
115 |    fail += test_gcm();
116 | 
117 |    printf("[INFO] === GCM using rv32_ghash_mul() ===\n");
118 |    ghash_rev = rv32_ghash_rev;
119 |    ghash_mul = rv32_ghash_mul;
120 |    fail += test_gcm();
121 | 
122 |    printf("[INFO] === GCM using rv32_ghash_mul_kar() ===\n");
123 |    ghash_rev = rv32_ghash_rev;
124 |    ghash_mul = rv32_ghash_mul_kar;
125 |    fail += test_gcm();
126 | 
127 |    printf("[INFO] === SM4 test ===\n");
===\n"); 128 | fail += test_sm4(); 129 | 130 | if (fail == 0) { 131 | printf("[PASS] all tests passed.\n"); 132 | } else { 133 | printf("[FAIL] %d test(s) failed.\n", fail); 134 | } 135 | 136 | return fail; 137 | } 138 | --------------------------------------------------------------------------------