├── README.md ├── clmul-x86.c ├── aes-arm.c ├── clmul-arm.c ├── README-p8.md └── aes-p8.c /README.md: -------------------------------------------------------------------------------- 1 | # AES-Intrinsics 2 | 3 | This GitHub repository contains source code for AES encryption using Intel AES and ARMv8 AES intrinsics, and Power8 built-ins. The source files should be portable across toolchains which support the Intel and ARMv8 AES extensions. 4 | 5 | Only the AES encryption function is provided. The function operates on full blocks. Users must set the key, and users must pad the last block. The small sample program included with each source file does both on an empty message. 6 | 7 | ## Intel AES 8 | 9 | The GitHub does not have an Intel AES implementation. Intel has an excellent document at [Intel Advanced Encryption Standard (AES) New Instructions Set](https://www.intel.com/content/dam/doc/white-paper/advanced-encryption-standard-new-instructions-set-paper.pdf). 10 | 11 | If you want to test the programs but don't have a capable machine on hand, then you can use the Intel Software Development Emulator. You can find it at http://software.intel.com/en-us/articles/intel-software-development-emulator. 12 | 13 | ## ARM AES 14 | 15 | To compile the ARM sources on an ARMv8 machine, be sure your CFLAGS include `-march=armv8-a+crc+crypto`. Apple iOS CFLAGS should include `-arch arm64` and a system root like `-isysroot /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS8.2.sdk`. 16 | 17 | The ARM source files are based on code from ARM, and code by Johannes Schneiders, Skip Hovsmith and Barry O'Rourke for the mbedTLS project. You can find the mbedTLS GitHub at http://github.com/ARMmbed/mbedtls. Prior to ARM's implementation, Critical Blue provided the source code and pull request at http://github.com/CriticalBlue/mbedtls. 
18 | 19 | If you want to test the programs but don't have a capable machine on hand, then you can use the ARM Fixed Virtual Platforms. You can find it at https://developer.arm.com/products/system-design/fixed-virtual-platforms. 20 | 21 | ## Power8 AES 22 | 23 | To compile the Power8 sources on a PPC machine with GCC, be sure your CFLAGS include `-mcpu=power8 -maltivec`. If using IBM XL C/C++, then use `-qarch=pwr8 -qaltivec`. 24 | 25 | The Power8 source files are written from scratch. IBM's documentation sucks. Namely, there is none. 26 | 27 | # Benchmarks 28 | 29 | To be determined. -------------------------------------------------------------------------------- /clmul-x86.c: -------------------------------------------------------------------------------- 1 | /* clmul-x86.c - Intel Carryless Multiply using C intrinsics */ 2 | /* Written and placed in public domain by Jeffrey Walton */ 3 | /* Based on code from Intel CLMUL guide */ 4 | 5 | /* Include the GCC super header */ 6 | #if defined(__GNUC__) 7 | # include 8 | # include 9 | #endif 10 | 11 | /* Microsoft supports clmul extensions as of Visual Studio VS2008 */ 12 | #if defined(_MSC_VER) 13 | # include 14 | # define WIN32_LEAN_AND_MEAN 15 | # include 16 | typedef UINT8 uint8_t; 17 | #endif 18 | 19 | /* Perform the multiplication and reduction in GF(2^128) */ 20 | void clmul_x86(uint8_t r[16], const uint8_t a[16], const uint8_t b[16]) 21 | { 22 | const __m128i MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 23 | 24 | __m128i a1 = _mm_loadu_si128((const __m128i*)a); 25 | __m128i b1 = _mm_loadu_si128((const __m128i*)b); 26 | 27 | a1 = _mm_shuffle_epi8(a1, MASK); 28 | b1 = _mm_shuffle_epi8(b1, MASK); 29 | 30 | __m128i T0, T1, T2, T3, T4, T5; 31 | 32 | T0 = _mm_clmulepi64_si128(a1, b1, 0x00); 33 | T1 = _mm_clmulepi64_si128(a1, b1, 0x01); 34 | T2 = _mm_clmulepi64_si128(a1, b1, 0x10); 35 | T3 = _mm_clmulepi64_si128(a1, b1, 0x11); 36 | 37 | T1 = _mm_xor_si128(T1, T2); 38 | T2 = 
_mm_slli_si128(T1, 8); 39 | T1 = _mm_srli_si128(T1, 8); 40 | T0 = _mm_xor_si128(T0, T2); 41 | T3 = _mm_xor_si128(T3, T1); 42 | 43 | T4 = _mm_srli_epi32(T0, 31); 44 | T0 = _mm_slli_epi32(T0, 1); 45 | 46 | T5 = _mm_srli_epi32(T3, 31); 47 | T3 = _mm_slli_epi32(T3, 1); 48 | 49 | T2 = _mm_srli_si128(T4, 12); 50 | T5 = _mm_slli_si128(T5, 4); 51 | T4 = _mm_slli_si128(T4, 4); 52 | T0 = _mm_or_si128(T0, T4); 53 | T3 = _mm_or_si128(T3, T5); 54 | T3 = _mm_or_si128(T3, T2); 55 | 56 | T4 = _mm_slli_epi32(T0, 31); 57 | T5 = _mm_slli_epi32(T0, 30); 58 | T2 = _mm_slli_epi32(T0, 25); 59 | 60 | T4 = _mm_xor_si128(T4, T5); 61 | T4 = _mm_xor_si128(T4, T2); 62 | T5 = _mm_srli_si128(T4, 4); 63 | T3 = _mm_xor_si128(T3, T5); 64 | T4 = _mm_slli_si128(T4, 12); 65 | T0 = _mm_xor_si128(T0, T4); 66 | T3 = _mm_xor_si128(T3, T0); 67 | 68 | T4 = _mm_srli_epi32(T0, 1); 69 | T1 = _mm_srli_epi32(T0, 2); 70 | T2 = _mm_srli_epi32(T0, 7); 71 | T3 = _mm_xor_si128(T3, T1); 72 | T3 = _mm_xor_si128(T3, T2); 73 | T3 = _mm_xor_si128(T3, T4); 74 | 75 | T3 = _mm_shuffle_epi8(T3, MASK); 76 | 77 | _mm_storeu_si128((__m128i*)r, T3); 78 | } 79 | 80 | #if defined(TEST_MAIN) 81 | 82 | #include 83 | #include 84 | int main(int argc, char* argv[]) 85 | { 86 | /* A's high nibble is 0x01, B's high nibble is 0x02 */ 87 | uint8_t a[16] = {0x1f,0x1e,0x1d,0x1c,0x1b,0x1a,0x19,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10}; 88 | uint8_t b[16] = {0x2f,0x2e,0x2d,0x2c,0x2b,0x2a,0x29,0x28,0x27,0x26,0x25,0x24,0x23,0x22,0x21,0x20}; 89 | uint8_t r[16]; 90 | 91 | clmul_x86(r, a, b); 92 | 93 | /* 020BBEB352AEAE16... 
*/ 94 | printf("GHASH of message: "); 95 | printf("%02X%02X%02X%02X%02X%02X%02X%02X...\n", 96 | r[0] & 0xFF, r[1] & 0xFF, r[2] & 0xFF, r[3] & 0xFF, 97 | r[4] & 0xFF, r[5] & 0xFF, r[6] & 0xFF, r[7] & 0xFF); 98 | 99 | int success = (r[0] == 0x02 && r[1] == 0x0B && r[2] == 0xBE && r[3] == 0xB3 && 100 | r[4] == 0x52 && r[5] == 0xAE && r[6] == 0xAE && r[7] == 0x16); 101 | 102 | if (success) 103 | printf("Success!\n"); 104 | else 105 | printf("Failure!\n"); 106 | 107 | return (success != 0 ? 0 : 1); 108 | } 109 | 110 | #endif 111 | -------------------------------------------------------------------------------- /aes-arm.c: -------------------------------------------------------------------------------- 1 | /* aes-arm.c - ARMv8 AES extensions using C intrinsics */ 2 | /* Written and placed in public domain by Jeffrey Walton */ 3 | /* Based on code from ARM, and by Johannes Schneiders, Skip */ 4 | /* Hovsmith and Barry O'Rourke for the mbedTLS project. */ 5 | 6 | /* gcc -std=c99 -march=armv8-a+crypto aes-arm.c -o aes-arm.exe */ 7 | 8 | #if defined(__arm__) || defined(__aarch32__) || defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64) 9 | # if defined(__GNUC__) 10 | # include 11 | # endif 12 | # if defined(__ARM_NEON) || defined(_MSC_VER) 13 | # include 14 | # endif 15 | /* GCC and LLVM Clang, but not Apple Clang */ 16 | # if defined(__GNUC__) && !defined(__apple_build_version__) 17 | # if defined(__ARM_ACLE) || defined(__ARM_FEATURE_CRYPTO) 18 | # include 19 | # endif 20 | # endif 21 | #endif /* ARM Headers */ 22 | 23 | void aes_process_arm(const uint8_t key[], const uint8_t subkeys[], uint32_t rounds, 24 | const uint8_t input[], uint8_t output[], uint32_t length) 25 | { 26 | while (length >= 16) 27 | { 28 | uint8x16_t block = vld1q_u8(input); 29 | 30 | // AES single round encryption 31 | block = vaeseq_u8(block, vld1q_u8(key)); 32 | // AES mix columns 33 | block = vaesmcq_u8(block); 34 | 35 | // AES single round encryption 36 | block = 
vaeseq_u8(block, vld1q_u8(subkeys)); 37 | // AES mix columns 38 | block = vaesmcq_u8(block); 39 | 40 | for (unsigned int i=1; i 63 | #include 64 | 65 | int main(int argc, char* argv[]) 66 | { 67 | /* FIPS 197, Appendix B input */ 68 | const uint8_t input[16] = { /* user input, unaligned buffer */ 69 | 0x32, 0x43, 0xf6, 0xa8, 0x88, 0x5a, 0x30, 0x8d, 0x31, 0x31, 0x98, 0xa2, 0xe0, 0x37, 0x07, 0x34 70 | }; 71 | 72 | /* FIPS 197, Appendix B key */ 73 | const uint8_t key[16] = { /* user input, unaligned buffer */ 74 | 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x9 , 0xcf, 0x4f, 0x3c 75 | }; 76 | 77 | /* FIPS 197, Appendix B expanded subkeys */ 78 | #ifdef _MSC_VER 79 | __declspec(align(16)) 80 | #else 81 | __attribute__((aligned(4))) 82 | #endif 83 | const uint8_t subkeys[10][16] = { /* library controlled, aligned buffer */ 84 | {0xA0, 0xFA, 0xFE, 0x17, 0x88, 0x54, 0x2c, 0xb1, 0x23, 0xa3, 0x39, 0x39, 0x2a, 0x6c, 0x76, 0x05}, 85 | {0xF2, 0xC2, 0x95, 0xF2, 0x7a, 0x96, 0xb9, 0x43, 0x59, 0x35, 0x80, 0x7a, 0x73, 0x59, 0xf6, 0x7f}, 86 | {0x3D, 0x80, 0x47, 0x7D, 0x47, 0x16, 0xFE, 0x3E, 0x1E, 0x23, 0x7E, 0x44, 0x6D, 0x7A, 0x88, 0x3B}, 87 | {0xEF, 0x44, 0xA5, 0x41, 0xA8, 0x52, 0x5B, 0x7F, 0xB6, 0x71, 0x25, 0x3B, 0xDB, 0x0B, 0xAD, 0x00}, 88 | {0xD4, 0xD1, 0xC6, 0xF8, 0x7C, 0x83, 0x9D, 0x87, 0xCA, 0xF2, 0xB8, 0xBC, 0x11, 0xF9, 0x15, 0xBC}, 89 | {0x6D, 0x88, 0xA3, 0x7A, 0x11, 0x0B, 0x3E, 0xFD, 0xDB, 0xF9, 0x86, 0x41, 0xCA, 0x00, 0x93, 0xFD}, 90 | {0x4E, 0x54, 0xF7, 0x0E, 0x5F, 0x5F, 0xC9, 0xF3, 0x84, 0xA6, 0x4F, 0xB2, 0x4E, 0xA6, 0xDC, 0x4F}, 91 | {0xEA, 0xD2, 0x73, 0x21, 0xB5, 0x8D, 0xBA, 0xD2, 0x31, 0x2B, 0xF5, 0x60, 0x7F, 0x8D, 0x29, 0x2F}, 92 | {0xAC, 0x77, 0x66, 0xF3, 0x19, 0xFA, 0xDC, 0x21, 0x28, 0xD1, 0x29, 0x41, 0x57, 0x5c, 0x00, 0x6E}, 93 | {0xD0, 0x14, 0xF9, 0xA8, 0xC9, 0xEE, 0x25, 0x89, 0xE1, 0x3F, 0x0c, 0xC8, 0xB6, 0x63, 0x0C, 0xA6} 94 | }; 95 | 96 | /* Result */ 97 | uint8_t result[19] = { 0 }; 98 | 99 | aes_process_arm((const uint8_t*)key, 
(const uint8_t*)subkeys, 10, input, result+3, 16); 100 | 101 | printf("Input: "); 102 | for (unsigned int i=0; i<16; ++i) 103 | printf("%02X ", input[i]); 104 | printf("\n"); 105 | 106 | printf("Key: "); 107 | for (unsigned int i=0; i<16; ++i) 108 | printf("%02X ", key[i]); 109 | printf("\n"); 110 | 111 | printf("Output: "); 112 | for (unsigned int i=3; i<19; ++i) 113 | printf("%02X ", result[i]); 114 | printf("\n"); 115 | 116 | /* FIPS 197, Appendix B output */ 117 | const uint8_t exp[16] = { 118 | 0x39, 0x25, 0x84, 0x1D, 0x02, 0xDC, 0x09, 0xFB, 0xDC, 0x11, 0x85, 0x97, 0x19, 0x6A, 0x0B, 0x32 119 | }; 120 | 121 | /* Exit status reflects the self-test result (0 = match, 1 = mismatch) so scripts can detect failure */ 122 | const int success = (0 == memcmp(result+3, exp, 16)); 123 | if (success) 124 | printf("SUCCESS!!!\n"); 125 | else 126 | printf("FAILURE!!!\n"); 127 | 128 | return (success != 0 ? 0 : 1); 129 | } 130 | 131 | #endif 132 | -------------------------------------------------------------------------------- /clmul-arm.c: -------------------------------------------------------------------------------- 1 | /* clmul-arm.c - ARMv8 Carryless Multiply using C intrinsics */ 2 | /* Written and placed in public domain by Jeffrey Walton */ 3 | /* Based on code from ARM, and by Johannes Schneiders, Skip */ 4 | /* Hovsmith and Barry O'Rourke for the mbedTLS project. */ 5 | 6 | /* Visual Studio 2017 and above supports ARMv8, but it's not clear how to detect */ 7 | /* it or use it at the moment. 
Also see http://stackoverflow.com/q/37244202, */ 8 | /* http://stackoverflow.com/q/41646026, and http://stackoverflow.com/q/41688101 */ 9 | #if defined(__arm64__) || defined(__aarch64__) 10 | # if defined(__GNUC__) 11 | # include <stdint.h> 12 | # endif 13 | # if defined(__ARM_NEON) || (defined(__GNUC__) && (defined(__arm64__) || defined(__aarch64__))) 14 | # include <arm_neon.h> 15 | # endif 16 | /* GCC and LLVM Clang, but not Apple Clang */ 17 | # if defined(__GNUC__) && !defined(__apple_build_version__) 18 | # if defined(__ARM_ACLE) || defined(__ARM_FEATURE_CRYPTO) 19 | # include <arm_acle.h> 20 | # endif 21 | # endif 22 | #endif /* ARM Headers */ 23 | 24 | /********************************/ 25 | /* GCC and compatible compilers */ 26 | /********************************/ 27 | #if defined(__GNUC__) 28 | #if defined(__GNUC_STDC_INLINE__) || defined(__INLINE__) 29 | # define MAYBE_INLINE static inline /* C99 'inline' alone provides no external definition, so a call that is not inlined (e.g. at -O0) would fail to link; 'static inline' always has one */ 30 | #else 31 | # define MAYBE_INLINE 32 | #endif 33 | 34 | /* Schneiders, Hovsmith and O'Rourke discovered this trick. */ 35 | /* It results in much better code generation in production code */ 36 | /* by avoiding D-register spills when using vgetq_lane_u64. The */ 37 | /* problem does not surface under minimal test cases. 
*/ 38 | /* PMULL_LOW: carry-less (polynomial) multiply of the low 64-bit lanes of a and b with a single PMULL instruction; the 128-bit product is returned as a byte vector. */ MAYBE_INLINE uint8x16_t PMULL_LOW(const uint8x16_t a, const uint8x16_t b) 39 | { 40 | uint8x16_t r; 41 | __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" 42 | :"=w" (r) : "w" (a), "w" (b) ); 43 | return r; 44 | } 45 | /* PMULL_HIGH: same as PMULL_LOW, but PMULL2 multiplies the upper 64-bit lanes of a and b. */ 46 | MAYBE_INLINE uint8x16_t PMULL_HIGH(const uint8x16_t a, const uint8x16_t b) 47 | { 48 | uint8x16_t r; 49 | __asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t" 50 | :"=w" (r) : "w" (a), "w" (b) ); 51 | return r; 52 | } 53 | #endif /* GCC and compatibles */ 54 | 55 | /**************************************/ 56 | /* Microsoft and compatible compilers */ 57 | /**************************************/ 58 | #if defined(_MSC_VER) /* Intrinsic forms of the two helpers: vmull_p64 on lane 0 (low) of each operand ... */ 59 | inline uint8x16_t PMULL_LOW(const uint8x16_t a, const uint8x16_t b) 60 | { 61 | return (uint8x16_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0), 62 | vgetq_lane_u64(vreinterpretq_u64_u8(b),0))); 63 | } 64 | /* ... and on lane 1 (high) of each operand. */ 65 | inline uint8x16_t PMULL_HIGH(const uint8x16_t a, const uint8x16_t b) 66 | { 67 | return (uint8x16_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1), 68 | vgetq_lane_u64(vreinterpretq_u64_u8(b),1))); 69 | } 70 | #endif /* Microsoft and compatibles */ 71 | 72 | /*********************************************************/ 73 | /* Perform the multiplication and reduction in GF(2^128) */ 74 | /*********************************************************/ 75 | /* a, b: 16-byte inputs; r: 16-byte output. The bits of every input byte are reversed on load (vrbitq_u8) and the result is bit-reversed again before it is stored to r. */ void clmul_arm(uint8_t r[16], const uint8_t a[16], const uint8_t b[16]) 76 | { 77 | uint8x16_t a8, b8, c8; 78 | uint8x16_t z, p; 79 | uint8x16_t r0, r1; 80 | uint8x16_t t0, t1; 81 | 82 | a8 = vrbitq_u8(vld1q_u8(a)); 83 | b8 = vrbitq_u8(vld1q_u8(b)); 84 | 85 | /* polynomial multiply */ 86 | z = vdupq_n_u8(0); /* all-zero vector, used with vextq_u8 to shift in zero bytes */ 87 | r0 = PMULL_LOW(a8, b8); /* a.low x b.low: lower 128 bits of the product */ 88 | r1 = PMULL_HIGH(a8, b8); /* a.high x b.high: upper 128 bits of the product */ 89 | t0 = vextq_u8(b8, b8, 8); /* b8 with its two 64-bit halves swapped */ 90 | t1 = PMULL_LOW(a8, t0); /* a.low x b.high */ 91 | t0 = PMULL_HIGH(a8, t0); /* a.high x b.low */ 92 | t0 = veorq_u8(t0, t1); /* sum of the two cross terms */ 93 | t1 = vextq_u8(z, t0, 8); /* low half of the cross terms moved to the upper half */ 94 | r0 = veorq_u8(r0, t1); 95 | t1 = vextq_u8(t0, z, 8); /* high half of the cross terms moved to the lower half */ 96 | r1 = veorq_u8(r1, t1); 97 | 98 | /* polynomial reduction */ 99 | p = 
vreinterpretq_u8_u64(vdupq_n_u64(0x0000000000000087)); 100 | t0 = PMULL_HIGH(r1, p); 101 | t1 = vextq_u8(t0, z, 8); 102 | r1 = veorq_u8(r1, t1); 103 | t1 = vextq_u8(z, t0, 8); 104 | r0 = veorq_u8(r0, t1); 105 | t0 = PMULL_LOW(r1, p); 106 | c8 = veorq_u8(r0, t0); 107 | 108 | vst1q_u8(r, vrbitq_u8(c8)); 109 | } 110 | 111 | #if defined(TEST_MAIN) 112 | 113 | #include 114 | #include 115 | int main(int argc, char* argv[]) 116 | { 117 | /* A's high nibble is 0x01, B's high nibble is 0x02 */ 118 | uint8_t a[16] = {0x1f,0x1e,0x1d,0x1c,0x1b,0x1a,0x19,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10}; 119 | uint8_t b[16] = {0x2f,0x2e,0x2d,0x2c,0x2b,0x2a,0x29,0x28,0x27,0x26,0x25,0x24,0x23,0x22,0x21,0x20}; 120 | uint8_t r[16]; 121 | 122 | clmul_arm(r, a, b); 123 | 124 | /* 020BBEB352AEAE16... */ 125 | printf("GHASH of message: "); 126 | printf("%02X%02X%02X%02X%02X%02X%02X%02X...\n", 127 | r[0] & 0xFF, r[1] & 0xFF, r[2] & 0xFF, r[3] & 0xFF, 128 | r[4] & 0xFF, r[5] & 0xFF, r[6] & 0xFF, r[7] & 0xFF); 129 | 130 | int success = (r[0] == 0x02 && r[1] == 0x0B && r[2] == 0xBE && r[3] == 0xB3 && 131 | r[4] == 0x52 && r[5] == 0xAE && r[6] == 0xAE && r[7] == 0x16); 132 | 133 | if (success) 134 | printf("Success!\n"); 135 | else 136 | printf("Failure!\n"); 137 | 138 | return (success != 0 ? 0 : 1); 139 | } 140 | 141 | #endif 142 | -------------------------------------------------------------------------------- /README-p8.md: -------------------------------------------------------------------------------- 1 | # AES-Power8 2 | 3 | This is a test implementation of Power 8's in-core crypto using xlC and GCC built-in's. 4 | 5 | The test implementation side steps key scheduling by using a pre-expanded "golden" key from FIPS 197, Appendix B. The golden key is the big-endian byte array `2b 7e 15 16 28 ae d2 a6 ab f7 15 88 09 cf 4f 3c`, and it produces the key schedule hard-coded in the program. 6 | 7 | The GCC Compile Farm (http://gcc.gnu.org/wiki/CompileFarm) offers two test machines. 
To test on a Power 8 little-endian machine use GCC112. To test on a big-endian machine use GCC119. 8 | 9 | According to data from GCC112, the naive implementation provided by `fips197-p8.c` achieves about 6 cycles-per-byte (cpb). It is mostly dull, but it's still better than 20 to 30 cpb for C and C++. Running 4 or 8 blocks in parallel will increase performance to around 1 to 1.5 cpb. 10 | 11 | ## Compiling 12 | 13 | To compile the source file using GCC: 14 | 15 | gcc -std=c99 -mcpu=power8 fips197-p8.c -o fips197-p8.exe 16 | 17 | To compile the source file using IBM XL C/C++: 18 | 19 | xlc -qarch=pwr8 -qaltivec fips197-p8.c -o fips197-p8.exe 20 | 21 | ## Decryption 22 | 23 | The decryption routines are mostly a copy and paste of the encryption routines using the appropriate inverse function. However, you must build the key table using the algorithm discussed in FIPS 197, Sections 5.3.1 through 5.3.4 (pp. 20-23). You cannot use the "Equivalent Inverse Cipher" from Section 5.3.5 (p. 23). 24 | 25 | If you use the same key table as built for encryption, then you should index the subkey table in reverse order. That is, start with index `rounds`, then `rounds-1`, ..., then index `1`, and finally index `0`. (Remember, there are `N+1` subkeys for `N` rounds of AES). 26 | 27 | ## Byte Order 28 | 29 | The VSX unit only operates on big-endian data. However, the CPU will load the VSX register in little-endian format on a little-endian machine by default. On little-endian machines each 16-byte buffer must be byte reversed before loading. Conversely, the data needs to be stored in little endian format on little endian machines when moving from a VSX register to memory. You have two options when reversing the data to ensure it is properly loaded into a VSX register or saved from a VSX register. First, you can reverse the in-memory byte buffer. Second, you can load the byte buffer and then permute the vector. 
30 | 31 | A derivative of the test program used the first strategy for the subkey table. The subkey table is converted to big endian once so each subkey does not need a permute after loading. It was an optimization that benefited multiple encryptions under the same key. The test program used the second strategy on user data like input and output buffers. 32 | 33 | For general reading on byte ordering, see "Targeting your applications - what little endian and big endian IBM XL C/C++ compiler differences mean to you" (http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html). 34 | 35 | ## Optimizations 36 | 37 | There are at least two optimizations available that your program should take. The first optimization is to perform the byte reversal on little-endian machines for the subkey table once after it is built. You will still need to perform the endian conversions on user-supplied input and output buffers as the data is streamed into the program. 38 | 39 | The second optimization your program should take is to run 4 or 8 blocks of encryption or decryption in parallel. The VSX unit has 32 full-size registers, so you should be able to raise the number of simultaneous transformations to 12 if desired. 
40 | 41 | As an example, instead of a single loop operating on a a single block: 42 | 43 | ``` 44 | VectorType s = VectorLoad(input); 45 | VectorType k = VectorLoadKey(subkeys); 46 | 47 | s = VectorXor(s, k); 48 | for (size_t i=1; i 15 | #include 16 | #include 17 | #include 18 | 19 | #if defined(__ALTIVEC__) 20 | # include 21 | # undef vector 22 | # undef pixel 23 | # undef bool 24 | #endif 25 | 26 | #if defined(__xlc__) || defined(__xlC__) 27 | # define TEST_AES_XLC 1 28 | #elif defined(__clang__) 29 | # define TEST_AES_CLANG 1 30 | #elif defined(__GNUC__) 31 | # define TEST_AES_GCC 1 32 | #endif 33 | 34 | #if defined(__LITTLE_ENDIAN__) 35 | # define TEST_AES_LITTLE_ENDIAN 1 36 | #endif 37 | 38 | typedef __vector unsigned char uint8x16_p8; 39 | typedef __vector unsigned long long uint64x2_p8; 40 | /* Reverse8x16: returns src with its 16 bytes in reverse order. The permute mask lists indices 15..0, so only src is selected; the zero vector merely fills the second vec_perm operand. */ 41 | uint8x16_p8 Reverse8x16(const uint8x16_p8 src) 42 | { 43 | const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; 44 | const uint8x16_p8 zero = {0}; 45 | return vec_perm(src, zero, mask); 46 | } 47 | /* Reverse64x2: the same 16-byte reversal applied to a uint64x2_p8 value, done by casting to and from the byte-vector type. */ 48 | uint64x2_p8 Reverse64x2(const uint64x2_p8 src) 49 | { 50 | const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; 51 | const uint8x16_p8 zero = {0}; 52 | return (uint64x2_p8)vec_perm((uint8x16_p8)src, zero, mask); 53 | } 54 | 55 | /* Load from big-endian format. Perform endian conversion as necessary */ 56 | /* XL C builds use vec_xl_be; other compilers use vec_vsx_ld and add a Reverse8x16 when TEST_AES_LITTLE_ENDIAN is defined. */ uint8x16_p8 Load8x16(const uint8_t src[16]) 57 | { 58 | #if defined(TEST_AES_XLC) 59 | /* http://stackoverflow.com/q/46124383/608639 */ 60 | return vec_xl_be(0, (uint8_t*)src); 61 | #else 62 | /* GCC, Clang, etc */ 63 | # if defined(TEST_AES_LITTLE_ENDIAN) 64 | return Reverse8x16(vec_vsx_ld(0, src)); 65 | # else 66 | return vec_vsx_ld(0, src); 67 | # endif 68 | #endif 69 | } 70 | 71 | /* Store in big-endian format. 
Perform endian conversion as necessary */ 72 | void Store8x16(const uint8x16_p8 src, uint8_t dest[16]) 73 | { 74 | #if defined(TEST_AES_XLC) 75 | /* http://stackoverflow.com/q/46124383/608639 */ 76 | vec_xst_be(src, 0, (uint8_t*)dest); 77 | #else 78 | /* GCC, Clang, etc */ 79 | # if defined(TEST_AES_LITTLE_ENDIAN) 80 | vec_vsx_st(Reverse8x16(src), 0, dest); 81 | # else 82 | vec_vsx_st(src, 0, dest); 83 | # endif 84 | #endif 85 | } 86 | 87 | /* Load from big-endian format. Perform endian conversion as necessary */ 88 | uint64x2_p8 Load64x2(const uint8_t src[16]) 89 | { 90 | #if defined(TEST_AES_XLC) 91 | /* http://stackoverflow.com/q/46124383/608639 */ 92 | return (uint64x2_p8)vec_xl_be(0, (uint8_t*)src); 93 | #else 94 | /* GCC, Clang, etc */ 95 | # if defined(TEST_AES_LITTLE_ENDIAN) 96 | return (uint64x2_p8)Reverse8x16(vec_vsx_ld(0, src)); 97 | # else 98 | return (uint64x2_p8)vec_vsx_ld(0, src); 99 | # endif 100 | #endif 101 | } 102 | 103 | /* Store in big-endian format. Perform endian conversion as necessary */ 104 | void Store64x2(const uint64x2_p8 src, uint8_t dest[16]) 105 | { 106 | #if defined(TEST_AES_XLC) 107 | /* http://stackoverflow.com/q/46124383/608639 */ 108 | vec_xst_be((uint8x16_p8)src, 0, (uint8_t*)dest); 109 | #else 110 | /* GCC, Clang, etc */ 111 | # if defined(TEST_AES_LITTLE_ENDIAN) 112 | vec_vsx_st(Reverse8x16((uint8x16_p8)src), 0, dest); 113 | # else 114 | vec_vsx_st((uint8x16_p8)src, 0, dest); 115 | # endif 116 | #endif 117 | } 118 | 119 | int main(int argc, char* argv[]) 120 | { 121 | /* FIPS 197, Appendix B input */ 122 | const uint8_t input[17] = { /* user input, unaligned buffer */ 123 | 0xff /*-1*/, 0x32, 0x43, 0xf6, 0xa8, 0x88, 0x5a, 0x30, 0x8d, 0x31, 0x31, 0x98, 0xa2, 0xe0, 0x37, 0x07, 0x34 124 | }; 125 | 126 | /* FIPS 197, Appendix B key */ 127 | const uint8_t key[18] = { /* user input, unaligned buffer */ 128 | 0xff /*-1*/, 0xff /*-1*/, 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x9 , 0xcf, 0x4f, 0x3c 129 
| }; 130 | 131 | /* FIPS 197, Appendix B expanded subkeys */ 132 | __attribute__((aligned(16))) 133 | const uint8_t subkeys[10][16] = { /* library controlled, aligned buffer */ 134 | {0xA0, 0xFA, 0xFE, 0x17, 0x88, 0x54, 0x2c, 0xb1, 0x23, 0xa3, 0x39, 0x39, 0x2a, 0x6c, 0x76, 0x05}, 135 | {0xF2, 0xC2, 0x95, 0xF2, 0x7a, 0x96, 0xb9, 0x43, 0x59, 0x35, 0x80, 0x7a, 0x73, 0x59, 0xf6, 0x7f}, 136 | {0x3D, 0x80, 0x47, 0x7D, 0x47, 0x16, 0xFE, 0x3E, 0x1E, 0x23, 0x7E, 0x44, 0x6D, 0x7A, 0x88, 0x3B}, 137 | {0xEF, 0x44, 0xA5, 0x41, 0xA8, 0x52, 0x5B, 0x7F, 0xB6, 0x71, 0x25, 0x3B, 0xDB, 0x0B, 0xAD, 0x00}, 138 | {0xD4, 0xD1, 0xC6, 0xF8, 0x7C, 0x83, 0x9D, 0x87, 0xCA, 0xF2, 0xB8, 0xBC, 0x11, 0xF9, 0x15, 0xBC}, 139 | {0x6D, 0x88, 0xA3, 0x7A, 0x11, 0x0B, 0x3E, 0xFD, 0xDB, 0xF9, 0x86, 0x41, 0xCA, 0x00, 0x93, 0xFD}, 140 | {0x4E, 0x54, 0xF7, 0x0E, 0x5F, 0x5F, 0xC9, 0xF3, 0x84, 0xA6, 0x4F, 0xB2, 0x4E, 0xA6, 0xDC, 0x4F}, 141 | {0xEA, 0xD2, 0x73, 0x21, 0xB5, 0x8D, 0xBA, 0xD2, 0x31, 0x2B, 0xF5, 0x60, 0x7F, 0x8D, 0x29, 0x2F}, 142 | {0xAC, 0x77, 0x66, 0xF3, 0x19, 0xFA, 0xDC, 0x21, 0x28, 0xD1, 0x29, 0x41, 0x57, 0x5c, 0x00, 0x6E}, 143 | {0xD0, 0x14, 0xF9, 0xA8, 0xC9, 0xEE, 0x25, 0x89, 0xE1, 0x3F, 0x0c, 0xC8, 0xB6, 0x63, 0x0C, 0xA6} 144 | }; 145 | 146 | /* Result */ 147 | uint8_t result[19] = { /* user output, unaligned buffer */ 148 | 0xff /*-1*/, 0xff /*-1*/, 0xff /*-1*/ 149 | }; 150 | 151 | #if defined(TEST_AES_XLC) 152 | 153 | /* Ensure we are exercising unaligned user buffers */ 154 | uint8x16_p8 s = Load8x16(input+1); 155 | uint8x16_p8 k = Load8x16(key+2); 156 | s = vec_xor(s, k); 157 | 158 | k = Load8x16(subkeys[0]); 159 | s = __vcipher(s, k); 160 | 161 | k = Load8x16(subkeys[1]); 162 | s = __vcipher(s, k); 163 | 164 | k = Load8x16(subkeys[2]); 165 | s = __vcipher(s, k); 166 | 167 | k = Load8x16(subkeys[3]); 168 | s = __vcipher(s, k); 169 | 170 | k = Load8x16(subkeys[4]); 171 | s = __vcipher(s, k); 172 | 173 | k = Load8x16(subkeys[5]); 174 | s = __vcipher(s, k); 175 | 176 | k = 
Load8x16(subkeys[6]); 177 | s = __vcipher(s, k); 178 | 179 | k = Load8x16(subkeys[7]); 180 | s = __vcipher(s, k); 181 | 182 | k = Load8x16(subkeys[8]); 183 | s = __vcipher(s, k); 184 | 185 | k = Load8x16(subkeys[9]); 186 | s = __vcipherlast(s, k); 187 | 188 | /* Ensure we are exercising unaligned user buffers */ 189 | Store8x16(s, result+3); 190 | 191 | #elif defined(TEST_AES_GCC) 192 | 193 | /* Ensure we are exercising unaligned user buffers */ 194 | uint64x2_p8 s = Load64x2(input+1); 195 | uint64x2_p8 k = Load64x2(key+2); 196 | s = vec_xor(s, k); 197 | 198 | k = Load64x2(subkeys[0]); 199 | s = __builtin_crypto_vcipher(s, k); 200 | 201 | k = Load64x2(subkeys[1]); 202 | s = __builtin_crypto_vcipher(s, k); 203 | 204 | k = Load64x2(subkeys[2]); 205 | s = __builtin_crypto_vcipher(s, k); 206 | 207 | k = Load64x2(subkeys[3]); 208 | s = __builtin_crypto_vcipher(s, k); 209 | 210 | k = Load64x2(subkeys[4]); 211 | s = __builtin_crypto_vcipher(s, k); 212 | 213 | k = Load64x2(subkeys[5]); 214 | s = __builtin_crypto_vcipher(s, k); 215 | 216 | k = Load64x2(subkeys[6]); 217 | s = __builtin_crypto_vcipher(s, k); 218 | 219 | k = Load64x2(subkeys[7]); 220 | s = __builtin_crypto_vcipher(s, k); 221 | 222 | k = Load64x2(subkeys[8]); 223 | s = __builtin_crypto_vcipher(s, k); 224 | 225 | k = Load64x2(subkeys[9]); 226 | s = __builtin_crypto_vcipherlast(s, k); 227 | 228 | /* Ensure we are exercising unaligned user buffers */ 229 | Store64x2(s, result+3); 230 | 231 | #endif 232 | 233 | printf("Input: "); 234 | for (unsigned int i=1; i<17; ++i) 235 | printf("%02X ", input[i]); 236 | printf("\n"); 237 | 238 | printf("Key: "); 239 | for (unsigned int i=2; i<18; ++i) 240 | printf("%02X ", key[i]); 241 | printf("\n"); 242 | 243 | printf("Output: "); 244 | for (unsigned int i=3; i<19; ++i) 245 | printf("%02X ", result[i]); 246 | printf("\n"); 247 | 248 | /* FIPS 197, Appendix B output */ 249 | const uint8_t exp[16] = { 250 | 0x39, 0x25, 0x84, 0x1D, 0x02, 0xDC, 0x09, 0xFB, 0xDC, 0x11, 0x85, 0x97, 
0x19, 0x6A, 0x0B, 0x32 251 | }; 252 | 253 | /* Exit status reflects the self-test result (0 = match, 1 = mismatch) so scripts can detect failure */ 254 | const int success = (0 == memcmp(result+3, exp, 16)); 255 | if (success) 256 | printf("SUCCESS!!!\n"); 257 | else 258 | printf("FAILURE!!!\n"); 259 | 260 | return (success != 0 ? 0 : 1); 261 | } 262 | --------------------------------------------------------------------------------