├── README.md
├── clmul-x86.c
├── aes-arm.c
├── clmul-arm.c
├── README-p8.md
└── aes-p8.c


/README.md:
--------------------------------------------------------------------------------
 1 | # AES-Intrinsics
 2 | 
 3 | This GitHub repository contains source code for AES encryption using Intel AES and ARMv8 AES intrinsics, and Power8 built-ins. The source files should be portable across toolchains which support the Intel and ARMv8 AES extensions.
 4 | 
 5 | Only the AES encryption function is provided. The function operates on full blocks. Users must set the key, and users must pad the last block. The small sample program included with each source file does both on an empty message.
 6 | 
 7 | ## Intel AES
 8 | 
 9 | This repository does not include an Intel AES implementation. Intel has an excellent document at [Intel Advanced Encryption Standard (AES) New Instructions Set](https://www.intel.com/content/dam/doc/white-paper/advanced-encryption-standard-new-instructions-set-paper.pdf).
10 | 
11 | If you want to test the programs but don't have a capable machine on hand, then you can use the Intel Software Development Emulator. You can find it at http://software.intel.com/en-us/articles/intel-software-development-emulator.
12 | 
13 | ## ARM AES
14 | 
15 | To compile the ARM sources on an ARMv8 machine, be sure your CFLAGS include `-march=armv8-a+crc+crypto`. Apple iOS CFLAGS should include `-arch arm64` and a system root like `-isysroot  /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS8.2.sdk`.
16 | 
17 | The ARM source files are based on code from ARM, and code by Johannes Schneiders, Skip Hovsmith and Barry O'Rourke for the mbedTLS project. You can find the mbedTLS GitHub at http://github.com/ARMmbed/mbedtls. Prior to ARM's implementation, Critical Blue provided the source code and pull request at http://github.com/CriticalBlue/mbedtls.
18 | 
19 | If you want to test the programs but don't have a capable machine on hand, then you can use the ARM Fixed Virtual Platforms. You can find them at https://developer.arm.com/products/system-design/fixed-virtual-platforms.
20 | 
21 | ## Power8 AES
22 | 
23 | To compile the Power8 sources on a PPC machine with GCC, be sure your CFLAGS include `-mcpu=power8 -maltivec`. If using IBM XL C/C++ then use `-qarch=pwr8 -qaltivec`.
24 | 
25 | The Power8 source files are written from scratch. IBM's documentation sucks. Namely, there is none.
26 | 
27 | # Benchmarks
28 | 
29 | To be determined.


--------------------------------------------------------------------------------
/clmul-x86.c:
--------------------------------------------------------------------------------
  1 | /* clmul-x86.c - Intel Carryless Multiply using C intrinsics  */
  2 | /*   Written and placed in public domain by Jeffrey Walton    */
  3 | /*   Based on code from Intel CLMUL guide                     */
  4 | 
  5 | /* Include the GCC super header */
  6 | #if defined(__GNUC__)
  7 | # include <stdint.h>
  8 | # include <x86intrin.h>
  9 | #endif
 10 | 
 11 | /* Microsoft supports clmul extensions as of Visual Studio VS2008 */
 12 | #if defined(_MSC_VER)
 13 | # include <immintrin.h>
 14 | # define WIN32_LEAN_AND_MEAN
 15 | # include <Windows.h>
 16 | typedef UINT8 uint8_t;
 17 | #endif
 18 | 
 19 | /* Multiply a by b in GF(2^128), reduce, and write the 16-byte result to r */
 20 | void clmul_x86(uint8_t r[16], const uint8_t a[16], const uint8_t b[16])
 21 | {
 22 |     const __m128i MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 23 |     /* MASK is a byte-shuffle control that reverses the order of all 16 bytes */
 24 |     __m128i a1 = _mm_loadu_si128((const __m128i*)a);
 25 |     __m128i b1 = _mm_loadu_si128((const __m128i*)b);
 26 |     /* Operands were read with unaligned loads; byte-reverse both before multiplying */
 27 |     a1 = _mm_shuffle_epi8(a1, MASK);
 28 |     b1 = _mm_shuffle_epi8(b1, MASK);
 29 | 
 30 |     __m128i T0, T1, T2, T3, T4, T5;
 31 |     /* Four 64x64-bit carry-less products: low*low, the two cross terms, high*high */
 32 |     T0 = _mm_clmulepi64_si128(a1, b1, 0x00);
 33 |     T1 = _mm_clmulepi64_si128(a1, b1, 0x01);
 34 |     T2 = _mm_clmulepi64_si128(a1, b1, 0x10);
 35 |     T3 = _mm_clmulepi64_si128(a1, b1, 0x11);
 36 |     /* Sum the cross terms and fold them into the 256-bit product held in T3:T0 */
 37 |     T1 = _mm_xor_si128(T1, T2);
 38 |     T2 = _mm_slli_si128(T1, 8);
 39 |     T1 = _mm_srli_si128(T1, 8);
 40 |     T0 = _mm_xor_si128(T0, T2);
 41 |     T3 = _mm_xor_si128(T3, T1);
 42 |     /* Shift T3:T0 left by one bit: save each 32-bit lane's top bit, then shift the lanes */
 43 |     T4 = _mm_srli_epi32(T0, 31);
 44 |     T0 = _mm_slli_epi32(T0, 1);
 45 | 
 46 |     T5 = _mm_srli_epi32(T3, 31);
 47 |     T3 = _mm_slli_epi32(T3, 1);
 48 |     /* Move the saved bits up one lane and OR them back in; T2 carries T0's top bit into T3 */
 49 |     T2 = _mm_srli_si128(T4, 12);
 50 |     T5 = _mm_slli_si128(T5, 4);
 51 |     T4 = _mm_slli_si128(T4, 4);
 52 |     T0 = _mm_or_si128(T0, T4);
 53 |     T3 = _mm_or_si128(T3, T5);
 54 |     T3 = _mm_or_si128(T3, T2);
 55 |     /* Reduction, first phase: left shifts of the low half by 31, 30 and 25 bits */
 56 |     T4 = _mm_slli_epi32(T0, 31);
 57 |     T5 = _mm_slli_epi32(T0, 30);
 58 |     T2 = _mm_slli_epi32(T0, 25);
 59 | 
 60 |     T4 = _mm_xor_si128(T4, T5);
 61 |     T4 = _mm_xor_si128(T4, T2);
 62 |     T5 = _mm_srli_si128(T4, 4);
 63 |     T3 = _mm_xor_si128(T3, T5);
 64 |     T4 = _mm_slli_si128(T4, 12);
 65 |     T0 = _mm_xor_si128(T0, T4);
 66 |     T3 = _mm_xor_si128(T3, T0);
 67 |     /* Reduction, second phase: right shifts of T0 by 1, 2 and 7 bits, accumulated into T3 */
 68 |     T4 = _mm_srli_epi32(T0, 1);
 69 |     T1 = _mm_srli_epi32(T0, 2);
 70 |     T2 = _mm_srli_epi32(T0, 7);
 71 |     T3 = _mm_xor_si128(T3, T1);
 72 |     T3 = _mm_xor_si128(T3, T2);
 73 |     T3 = _mm_xor_si128(T3, T4);
 74 |     /* Undo the byte reversal applied to the inputs */
 75 |     T3 = _mm_shuffle_epi8(T3, MASK);
 76 |     /* Unaligned store: r needs no particular alignment */
 77 |     _mm_storeu_si128((__m128i*)r, T3);
 78 | }
 79 | 
 80 | #if defined(TEST_MAIN)
 81 | 
 82 | #include <stdio.h>
 83 | #include <string.h>
 84 | int main(int argc, char* argv[])
 85 | {
 86 |     /* Operands: every byte of lhs has high nibble 1, every byte of rhs has high nibble 2 */
 87 |     uint8_t lhs[16] = {0x1f,0x1e,0x1d,0x1c,0x1b,0x1a,0x19,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10};
 88 |     uint8_t rhs[16] = {0x2f,0x2e,0x2d,0x2c,0x2b,0x2a,0x29,0x28,0x27,0x26,0x25,0x24,0x23,0x22,0x21,0x20};
 89 |     /* Leading eight bytes the product must match: 020BBEB352AEAE16 */
 90 |     const uint8_t want[8] = {0x02,0x0B,0xBE,0xB3,0x52,0xAE,0xAE,0x16};
 91 |     uint8_t out[16];
 92 |     int ok;
 93 | 
 94 |     clmul_x86(out, lhs, rhs);
 95 | 
 96 |     /* Show the first eight result bytes as upper-case hex */
 97 |     printf("GHASH of message: ");
 98 |     printf("%02X%02X%02X%02X%02X%02X%02X%02X...\n",
 99 |         out[0] & 0xFF, out[1] & 0xFF, out[2] & 0xFF, out[3] & 0xFF,
100 |         out[4] & 0xFF, out[5] & 0xFF, out[6] & 0xFF, out[7] & 0xFF);
101 | 
102 |     /* Only the eight-byte prefix is checked, not the full 16-byte product */
103 |     ok = (0 == memcmp(out, want, sizeof(want)));
104 |     printf(ok ? "Success!\n" : "Failure!\n");
105 | 
106 |     /* Exit status 0 on a match, 1 otherwise */
107 |     return ok ? 0 : 1;
108 | }
109 | 
110 | #endif
111 | 


--------------------------------------------------------------------------------
/aes-arm.c:
--------------------------------------------------------------------------------
  1 | /* aes-arm.c - ARMv8 AES extensions using C intrinsics         */
  2 | /*   Written and placed in public domain by Jeffrey Walton     */
  3 | /*   Based on code from ARM, and by Johannes Schneiders, Skip  */
  4 | /*   Hovsmith and Barry O'Rourke for the mbedTLS project.      */
  5 | 
  6 | /* gcc -std=c99 -march=armv8-a+crypto aes-arm.c -o aes-arm.exe */
  7 | 
  8 | #if defined(__arm__) || defined(__aarch32__) || defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64)
  9 | # if defined(__GNUC__)
 10 | #  include <stdint.h>
 11 | # endif
 12 | # if defined(__ARM_NEON) || defined(_MSC_VER)
 13 | #  include <arm_neon.h>
 14 | # endif
 15 | /* GCC and LLVM Clang, but not Apple Clang */
 16 | # if defined(__GNUC__) && !defined(__apple_build_version__)
 17 | #  if defined(__ARM_ACLE) || defined(__ARM_FEATURE_CRYPTO)
 18 | #   include <arm_acle.h>
 19 | #  endif
 20 | # endif
 21 | #endif  /* ARM Headers */
 22 | /* Encrypt whole 16-byte blocks from input to output; key is the first round key and subkeys holds the following `rounds` 16-byte round keys */
 23 | void aes_process_arm(const uint8_t key[], const uint8_t subkeys[], uint32_t rounds,
 24 |                      const uint8_t input[], uint8_t output[], uint32_t length)
 25 | {
 26 | 	while (length >= 16) /* a trailing partial block (fewer than 16 bytes) is not processed */
 27 | 	{
 28 | 		uint8x16_t block = vld1q_u8(input);
 29 | 		// NOTE(review): rounds-2 below is unsigned arithmetic, so rounds is presumably always >= 2 (AES uses 10, 12 or 14) -- confirm callers
 30 | 		// Round using the caller's key (vaeseq_u8 takes the round key as its second operand)
 31 | 		block = vaeseq_u8(block, vld1q_u8(key));
 32 | 		// AES mix columns
 33 | 		block = vaesmcq_u8(block);
 34 | 
 35 | 		// Round using the first expanded round key, stored at subkeys[0..15]
 36 | 		block = vaeseq_u8(block, vld1q_u8(subkeys));
 37 | 		// AES mix columns
 38 | 		block = vaesmcq_u8(block);
 39 | 
 40 | 		for (unsigned int i=1; i<rounds-2; ++i)
 41 | 		{
 42 | 			// Middle rounds: round key i starts at byte offset i*16
 43 | 			block = vaeseq_u8(block, vld1q_u8(subkeys+i*16));
 44 | 			// AES mix columns
 45 | 			block = vaesmcq_u8(block);
 46 | 		}
 47 | 
 48 | 		// Last vaeseq_u8 step uses round key rounds-2 and is not followed by mix columns
 49 | 		block = vaeseq_u8(block, vld1q_u8(subkeys+(rounds-2)*16));
 50 | 		// Final Add (bitwise Xor) of the last round key, index rounds-1
 51 | 		block = veorq_u8(block, vld1q_u8(subkeys+(rounds-1)*16));
 52 | 		// Write the finished block, then advance both buffers by one block
 53 | 		vst1q_u8(output, block);
 54 | 
 55 | 		input += 16; output += 16;
 56 | 		length -= 16;
 57 | 	}
 58 | }
 59 | 
 60 | #if defined(TEST_MAIN)
 61 | 
 62 | #include <stdio.h>
 63 | #include <string.h>
 64 | /* Known-answer test: encrypt the FIPS 197 Appendix B block and compare; exit status is 0 on a match and 1 otherwise */
 65 | int main(int argc, char* argv[])
 66 | {
 67 | 	/* FIPS 197, Appendix B input */
 68 | 	const uint8_t input[16] = { /* user input, unaligned buffer */
 69 | 		0x32, 0x43, 0xf6, 0xa8, 0x88, 0x5a, 0x30, 0x8d, 0x31, 0x31, 0x98, 0xa2, 0xe0, 0x37, 0x07, 0x34
 70 | 	};
 71 | 
 72 | 	/* FIPS 197, Appendix B key */
 73 | 	const uint8_t key[16] = { /* user input, unaligned buffer */
 74 | 		0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x9 , 0xcf, 0x4f, 0x3c
 75 | 	};
 76 | 
 77 | 	/* FIPS 197, Appendix B expanded subkeys */
 78 | #ifdef _MSC_VER
 79 | 	__declspec(align(16))
 80 | #else
 81 | 	__attribute__((aligned(4)))
 82 | #endif
 83 | 	const uint8_t subkeys[10][16] = { /* library controlled, aligned buffer */
 84 | 		{0xA0, 0xFA, 0xFE, 0x17, 0x88, 0x54, 0x2c, 0xb1, 0x23, 0xa3, 0x39, 0x39, 0x2a, 0x6c, 0x76, 0x05},
 85 | 		{0xF2, 0xC2, 0x95, 0xF2, 0x7a, 0x96, 0xb9, 0x43, 0x59, 0x35, 0x80, 0x7a, 0x73, 0x59, 0xf6, 0x7f},
 86 | 		{0x3D, 0x80, 0x47, 0x7D, 0x47, 0x16, 0xFE, 0x3E, 0x1E, 0x23, 0x7E, 0x44, 0x6D, 0x7A, 0x88, 0x3B},
 87 | 		{0xEF, 0x44, 0xA5, 0x41, 0xA8, 0x52, 0x5B, 0x7F, 0xB6, 0x71, 0x25, 0x3B, 0xDB, 0x0B, 0xAD, 0x00},
 88 | 		{0xD4, 0xD1, 0xC6, 0xF8, 0x7C, 0x83, 0x9D, 0x87, 0xCA, 0xF2, 0xB8, 0xBC, 0x11, 0xF9, 0x15, 0xBC},
 89 | 		{0x6D, 0x88, 0xA3, 0x7A, 0x11, 0x0B, 0x3E, 0xFD, 0xDB, 0xF9, 0x86, 0x41, 0xCA, 0x00, 0x93, 0xFD},
 90 | 		{0x4E, 0x54, 0xF7, 0x0E, 0x5F, 0x5F, 0xC9, 0xF3, 0x84, 0xA6, 0x4F, 0xB2, 0x4E, 0xA6, 0xDC, 0x4F},
 91 | 		{0xEA, 0xD2, 0x73, 0x21, 0xB5, 0x8D, 0xBA, 0xD2, 0x31, 0x2B, 0xF5, 0x60, 0x7F, 0x8D, 0x29, 0x2F},
 92 | 		{0xAC, 0x77, 0x66, 0xF3, 0x19, 0xFA, 0xDC, 0x21, 0x28, 0xD1, 0x29, 0x41, 0x57, 0x5c, 0x00, 0x6E},
 93 | 		{0xD0, 0x14, 0xF9, 0xA8, 0xC9, 0xEE, 0x25, 0x89, 0xE1, 0x3F, 0x0c, 0xC8, 0xB6, 0x63, 0x0C, 0xA6}
 94 | 	};
 95 | 
 96 | 	/* Result: the ciphertext lands at offset 3, presumably to exercise an unaligned output buffer */
 97 | 	uint8_t result[19] = { 0 };
 98 | 	/* One 16-byte block, 10 rounds, using the key and the ten subkeys above */
 99 | 	aes_process_arm((const uint8_t*)key, (const uint8_t*)subkeys, 10, input, result+3, 16);
100 | 
101 | 	printf("Input: ");
102 | 	for (unsigned int i=0; i<16; ++i)
103 | 		printf("%02X ", input[i]);
104 | 	printf("\n");
105 | 
106 | 	printf("Key: ");
107 | 	for (unsigned int i=0; i<16; ++i)
108 | 		printf("%02X ", key[i]);
109 | 	printf("\n");
110 | 
111 | 	printf("Output: ");
112 | 	for (unsigned int i=3; i<19; ++i)
113 | 		printf("%02X ", result[i]);
114 | 	printf("\n");
115 | 
116 | 	/* FIPS 197, Appendix B output */
117 | 	const uint8_t exp[16] = {
118 | 		0x39, 0x25, 0x84, 0x1D, 0x02, 0xDC, 0x09, 0xFB, 0xDC, 0x11, 0x85, 0x97, 0x19, 0x6A, 0x0B, 0x32
119 | 	};
120 | 	const int matched = (0 == memcmp(result+3, exp, 16));
121 | 	if (matched)
122 | 		printf("SUCCESS!!!\n");
123 | 	else
124 | 		printf("FAILURE!!!\n");
125 | 	/* Report a mismatch through the exit status so scripts can detect it */
126 | 	return matched ? 0 : 1;
127 | }
128 | 
129 | #endif
130 | 


--------------------------------------------------------------------------------
/clmul-arm.c:
--------------------------------------------------------------------------------
  1 | /* clmul-arm.c - ARMv8 Carryless Multiply using C intrinsics  */
  2 | /*   Written and placed in public domain by Jeffrey Walton    */
  3 | /*   Based on code from ARM, and by Johannes Schneiders, Skip */
  4 | /*   Hovsmith and Barry O'Rourke for the mbedTLS project.     */
  5 | 
  6 | /* Visual Studio 2017 and above supports ARMv8, but it's not clear how to detect */
  7 | /* it or use it at the moment. Also see http://stackoverflow.com/q/37244202,    */
  8 | /* http://stackoverflow.com/q/41646026, and http://stackoverflow.com/q/41688101 */
  9 | #if defined(__arm64__) || defined(__aarch64__)
 10 | # if defined(__GNUC__)
 11 | #  include <stdint.h>
 12 | # endif
 13 | # if defined(__ARM_NEON) || (defined(__GNUC__) && (defined(__aarch64__) || defined(__aarch64__)))
 14 | #  include <arm_neon.h>
 15 | # endif
 16 | /* GCC and LLVM Clang, but not Apple Clang */
 17 | # if defined(__GNUC__) && !defined(__apple_build_version__)
 18 | #  if defined(__ARM_ACLE) || defined(__ARM_FEATURE_CRYPTO)
 19 | #   include <arm_acle.h>
 20 | #  endif
 21 | # endif
 22 | #endif  /* ARM Headers */
 23 | 
 24 | /********************************/
 25 | /* GCC and compatible compilers */
 26 | /********************************/
 27 | #if defined(__GNUC__)
 28 | #if defined(__GNUC_STDC_INLINE__) || defined(__INLINE__)
 29 | # define MAYBE_INLINE static inline
 30 | #else
 31 | # define MAYBE_INLINE
 32 | #endif
 33 | /* static: a plain C99 `inline` definition emits no out-of-line copy, so an un-inlined call (e.g. at -O0) would fail to link */
 34 | /* Schneiders, Hovsmith and O'Rourke discovered this trick.     */
 35 | /* It results in much better code generation in production code */
 36 | /* by avoiding D-register spills when using vgetq_lane_u64. The */
 37 | /* problem does not surface under minimal test cases.           */
 38 | MAYBE_INLINE uint8x16_t PMULL_LOW(const uint8x16_t a, const uint8x16_t b)
 39 | {
 40 |     uint8x16_t r;  /* PMULL: 128-bit polynomial product of the low 64-bit lanes of a and b */
 41 |     __asm __volatile("pmull    %0.1q, %1.1d, %2.1d \n\t"
 42 |         :"=w" (r) : "w" (a), "w" (b) );
 43 |     return r;
 44 | }
 45 | /* PMULL2: same operation on the upper 64-bit lanes of a and b */
 46 | MAYBE_INLINE uint8x16_t PMULL_HIGH(const uint8x16_t a, const uint8x16_t b)
 47 | {
 48 |     uint8x16_t r;
 49 |     __asm __volatile("pmull2   %0.1q, %1.2d, %2.2d \n\t"
 50 |         :"=w" (r) : "w" (a), "w" (b) );
 51 |     return r;
 52 | }
 53 | #endif /* GCC and compatibles */
 54 | 
 55 | /**************************************/
 56 | /* Microsoft and compatible compilers */
 57 | /**************************************/
 58 | #if defined(_MSC_VER)  /* same helper names as the GCC section, built on vmull_p64 instead of inline asm */
 59 | inline uint8x16_t PMULL_LOW(const uint8x16_t a, const uint8x16_t b)
 60 | {
 61 |     return (uint8x16_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0),  /* lane 0 of a */
 62 |                                   vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));  /* lane 0 of b */
 63 | }
 64 | /* As PMULL_LOW, but multiplies lane 1 of each operand */
 65 | inline uint8x16_t PMULL_HIGH(const uint8x16_t a, const uint8x16_t b)
 66 | {
 67 |     return (uint8x16_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
 68 |                                   vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
 69 | }
 70 | #endif /* Microsoft and compatibles */
 71 | 
 72 | /*********************************************************/
 73 | /* Perform the multiplication and reduction in GF(2^128) */
 74 | /*********************************************************/
 75 | void clmul_arm(uint8_t r[16], const uint8_t a[16], const uint8_t b[16])
 76 | {
 77 |     uint8x16_t a8, b8, c8;
 78 |     uint8x16_t z, p;
 79 |     uint8x16_t r0, r1;
 80 |     uint8x16_t t0, t1;
 81 |     /* Load both 16-byte operands and reverse the bit order inside every byte */
 82 |     a8 = vrbitq_u8(vld1q_u8(a));
 83 |     b8 = vrbitq_u8(vld1q_u8(b));
 84 | 
 85 |     /* polynomial multiply: r1:r0 accumulates the 256-bit product, t0/t1 hold the cross terms */
 86 |     z = vdupq_n_u8(0);
 87 |     r0 = PMULL_LOW(a8, b8);
 88 |     r1 = PMULL_HIGH(a8, b8);
 89 |     t0 = vextq_u8(b8, b8, 8);  /* b8 with its two 64-bit halves swapped */
 90 |     t1 = PMULL_LOW(a8, t0);
 91 |     t0 = PMULL_HIGH(a8, t0);
 92 |     t0 = veorq_u8(t0, t1);
 93 |     t1 = vextq_u8(z, t0, 8);  /* low 8 bytes of the cross term, moved into the upper 8 bytes */
 94 |     r0 = veorq_u8(r0, t1);
 95 |     t1 = vextq_u8(t0, z, 8);  /* high 8 bytes of the cross term, moved into the lower 8 bytes */
 96 |     r1 = veorq_u8(r1, t1);
 97 | 
 98 |     /* polynomial reduction: fold the high half r1 into r0 using the constant 0x87 (bits 7, 2, 1, 0) */
 99 |     p = vreinterpretq_u8_u64(vdupq_n_u64(0x0000000000000087));
100 |     t0 = PMULL_HIGH(r1, p);
101 |     t1 = vextq_u8(t0, z, 8);
102 |     r1 = veorq_u8(r1, t1);
103 |     t1 = vextq_u8(z, t0, 8);
104 |     r0 = veorq_u8(r0, t1);
105 |     t0 = PMULL_LOW(r1, p);
106 |     c8 = veorq_u8(r0, t0);
107 |     /* Undo the per-byte bit reversal and store the 16-byte result to r */
108 |     vst1q_u8(r, vrbitq_u8(c8));
109 | }
110 | 
111 | #if defined(TEST_MAIN)
112 | 
113 | #include <stdio.h>
114 | #include <string.h>
115 | int main(int argc, char* argv[])
116 | {
117 |     /* Operands: every byte of lhs has high nibble 1, every byte of rhs has high nibble 2 */
118 |     uint8_t lhs[16] = {0x1f,0x1e,0x1d,0x1c,0x1b,0x1a,0x19,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10};
119 |     uint8_t rhs[16] = {0x2f,0x2e,0x2d,0x2c,0x2b,0x2a,0x29,0x28,0x27,0x26,0x25,0x24,0x23,0x22,0x21,0x20};
120 |     /* Leading eight bytes the product must match: 020BBEB352AEAE16 */
121 |     const uint8_t want[8] = {0x02,0x0B,0xBE,0xB3,0x52,0xAE,0xAE,0x16};
122 |     uint8_t out[16];
123 |     int ok;
124 | 
125 |     clmul_arm(out, lhs, rhs);
126 | 
127 |     /* Show the first eight result bytes as upper-case hex */
128 |     printf("GHASH of message: ");
129 |     printf("%02X%02X%02X%02X%02X%02X%02X%02X...\n",
130 |         out[0] & 0xFF, out[1] & 0xFF, out[2] & 0xFF, out[3] & 0xFF,
131 |         out[4] & 0xFF, out[5] & 0xFF, out[6] & 0xFF, out[7] & 0xFF);
132 | 
133 |     /* Only the eight-byte prefix is checked, not the full 16-byte product */
134 |     ok = (0 == memcmp(out, want, sizeof(want)));
135 |     printf(ok ? "Success!\n" : "Failure!\n");
136 | 
137 |     /* Exit status 0 on a match, 1 otherwise */
138 |     return ok ? 0 : 1;
139 | }
140 | 
141 | #endif
142 | 


--------------------------------------------------------------------------------
/README-p8.md:
--------------------------------------------------------------------------------
 1 | # AES-Power8
 2 | 
 3 | This is a test implementation of Power 8's in-core crypto using xlC and GCC built-ins.
 4 | 
 5 | The test implementation side steps key scheduling by using a pre-expanded "golden" key from FIPS 197, Appendix B. The golden key is the big-endian byte array `2b 7e 15 16 28 ae d2 a6 ab f7 15 88 09 cf 4f 3c`, and it produces the key schedule hard-coded in the program.
 6 | 
 7 | The GCC Compile Farm (http://gcc.gnu.org/wiki/CompileFarm) offers two test machines. To test on a Power 8 little-endian machine use GCC112. To test on a big-endian machine use GCC119.
 8 | 
 9 | According to data from GCC112, the naive implementation provided by `fips197-p8.c` achieves about 6 cycles-per-byte (cpb). It is mostly dull, but it's still better than 20 to 30 cpb for C and C++. Running 4 or 8 blocks in parallel will increase performance to around 1 to 1.5 cpb.
10 | 
11 | ## Compiling
12 | 
13 | To compile the source file using GCC:
14 | 
15 |     gcc -std=c99 -mcpu=power8 fips197-p8.c -o fips197-p8.exe
16 | 
17 | To compile the source file using IBM XL C/C++:
18 | 
19 |     xlc -qarch=pwr8 -qaltivec fips197-p8.c -o fips197-p8.exe
20 | 
21 | ## Decryption
22 | 
23 | The decryption routines are mostly a copy and paste of the encryption routines using the appropriate inverse function. However, you must build the key table using the algorithm discussed in FIPS 197, Sections 5.3.1 through 5.3.4 (pp. 20-23). You cannot use the "Equivalent Inverse Cipher" from Section 5.3.5 (p.23).
24 | 
25 | If you use the same key table as built for encryption, then you should index the subkey table in reverse order. That is, start with index `rounds`, then `rounds-1`, ..., then index `1`, and finally index `0`. (Remember, there are `N+1` subkeys for `N` rounds of AES).
26 | 
27 | ## Byte Order
28 | 
29 | The VSX unit only operates on big-endian data. However, the CPU will load the VSX register in little-endian format on a little-endian machine by default. On little-endian machines each 16-byte buffer must be byte reversed before loading. Conversely, the data needs to be stored in little endian format on little endian machines when moving from a VSX register to memory. You have two options when reversing the data to ensure it is properly loaded into a VSX register or saved from a VSX register. First you can reverse the in-memory byte buffer. Second, you can load the byte buffer and then permute the vector.
30 | 
31 | A derivative of the test program used the first strategy for the subkey table. The subkey table is converted to big endian once so each subkey does not need a permute after loading. It was an optimization that benefited multiple encryptions under the same key. The test program used the second strategy on user data like input and output buffers.
32 | 
33 | For general reading on byte ordering, see "Targeting your applications - what little endian and big endian IBM XL C/C++ compiler differences mean to you" (http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html).
34 | 
35 | ## Optimizations
36 | 
37 | There are at least two optimizations available that your program should take. The first optimization is to perform the byte reversal on little-endian machines for the subkey table once after it is built. You will still need to perform the endian conversions on user supplied input and output buffers as the data is streamed into the program.
38 | 
39 | The second optimization your program should take is to run 4 or 8 blocks of encryption or decryption in parallel. The VSX unit has 32 full size registers, so you should be able to raise the number of simultaneous transformations to 12 if desired.
40 | 
41 | As an example, instead of a single loop operating on a single block:
42 | 
43 | ```
44 | VectorType s = VectorLoad(input);
45 | VectorType k = VectorLoadKey(subkeys);
46 | 
47 | s = VectorXor(s, k);
48 | for (size_t i=1; i<rounds-1; i+=2)
49 | {
50 |     s = VectorEncrypt(s, VectorLoadKey(  i*16,   subkeys));
51 |     s = VectorEncrypt(s, VectorLoadKey((i+1)*16, subkeys));
52 | }
53 | 
54 | s = VectorEncrypt(s, VectorLoadKey((rounds-1)*16, subkeys));
55 | s = VectorEncryptLast(s, VectorLoadKey(rounds*16, subkeys));
56 | ```
57 | 
58 | Run multiple transformations simultaneously:
59 | 
60 | ```
61 | VectorType k = VectorLoadKey(subkeys);
62 | VectorType s0 = VectorLoad( 0, input);
63 | VectorType s1 = VectorLoad(16, input);
64 | VectorType s2 = VectorLoad(32, input);
65 | VectorType s3 = VectorLoad(48, input);
66 | 
67 | s0 = VectorXor(s0, k);
68 | s1 = VectorXor(s1, k);
69 | s2 = VectorXor(s2, k);
70 | s3 = VectorXor(s3, k);
71 | 
72 | for (size_t i=1; i<rounds; ++i)
73 | {
74 |      k = VectorLoadKey(i*16, subkeys);
75 |     s0 = VectorEncrypt(s0, k);
76 |     s1 = VectorEncrypt(s1, k);
77 |     s2 = VectorEncrypt(s2, k);
78 |     s3 = VectorEncrypt(s3, k);
79 | }
80 | 
81 |  k = VectorLoadKey(rounds*16, subkeys);
82 | s0 = VectorEncryptLast(s0, k);
83 | s1 = VectorEncryptLast(s1, k);
84 | s2 = VectorEncryptLast(s2, k);
85 | s3 = VectorEncryptLast(s3, k);
86 | ```
87 | 
88 | ## Field Implementations
89 | 
90 | Both Botan and Crypto++ used `fips197-p8.c` as a proof of concept. You can find the Botan issue to track the cut-in at Issue 1206, Add Power8 AES Encryption (http://github.com/randombit/botan/issues/1206). The issue to track the cut-in for Crypto++ can be found at Issue 497, Add Power8 AES Encryption (http://github.com/weidai11/cryptopp/issues/497).
91 | 
92 | ## Acknowledgements
93 | 
94 | Thanks to Bill Schmidt, George Wilson, and Michael Strosaker from the IBM Linux Technology Center for help with the implementation.
95 | 
96 | Many thanks to Andy Polyakov for comments, helpful suggestions and answering questions about his ASM implementation of Power 8 AES. Andy's implementation is lightning fast and available in the OpenSSL project and the Linux kernel. Andy's code and license terms can be found at http://www.openssl.org/~appro/cryptogams/.
97 | 


--------------------------------------------------------------------------------
/aes-p8.c:
--------------------------------------------------------------------------------
  1 | /* Written and placed in public domain by Jeffrey Walton         */
  2 | /*  aes-p8.c tests Power 8 AES using GCC and XL C/C++ built-ins. */
  3 | 
  4 | /* xlc -qarch=pwr8 -qaltivec aes-p8.c -o aes-p8.exe              */
  5 | /* gcc -std=c99 -mcpu=power8 aes-p8.c -o aes-p8.exe              */
  6 | 
  7 | /* To test on an AltiVec/Power 8 little-endian machine use       */
  8 | /* GCC112. To test on a big-endian machine use GCC119.           */
  9 | 
 10 | /* Many thanks to Andy Polyakov for comments, helpful            */
 11 | /* suggestions and answering questions about his ASM             */
 12 | /* implementation of Power 8 AES.                                */
 13 | 
 14 | #include <stdio.h>
 15 | #include <stdlib.h>
 16 | #include <string.h>
 17 | #include <stdint.h>
 18 | 
 19 | #if defined(__ALTIVEC__)
 20 | # include <altivec.h>
 21 | # undef vector
 22 | # undef pixel
 23 | # undef bool
 24 | #endif
 25 | 
 26 | #if defined(__xlc__) || defined(__xlC__)
 27 | # define TEST_AES_XLC 1
 28 | #elif defined(__clang__)
 29 | # define TEST_AES_CLANG 1
 30 | #elif defined(__GNUC__)
 31 | # define TEST_AES_GCC 1
 32 | #endif
 33 | 
 34 | #if defined(__LITTLE_ENDIAN__)
 35 | # define TEST_AES_LITTLE_ENDIAN 1
 36 | #endif
 37 | 
 38 | typedef __vector unsigned char uint8x16_p8;  /* vector of unsigned char elements */
 39 | typedef __vector unsigned long long uint64x2_p8;  /* vector of unsigned long long elements */
 40 | /* Return src with its 16 bytes in reverse order */
 41 | uint8x16_p8 Reverse8x16(const uint8x16_p8 src)
 42 | {
 43 | 	const uint8x16_p8 filler = {0};
 44 | 	const uint8x16_p8 byte_order = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
 45 | 	return vec_perm(src, filler, byte_order);
 46 | }
 47 | /* Return src with all 16 of its bytes in reverse order, keeping the 64x2 vector type */
 48 | uint64x2_p8 Reverse64x2(const uint64x2_p8 src)
 49 | {
 50 | 	const uint8x16_p8 filler = {0};
 51 | 	const uint8x16_p8 byte_order = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
 52 | 	return (uint64x2_p8)vec_perm((uint8x16_p8)src, filler, byte_order);
 53 | }
 54 | 
 55 | /* Load 16 bytes from src as a big-endian vector, converting when the target requires it */
 56 | uint8x16_p8 Load8x16(const uint8_t src[16])
 57 | {
 58 | #if defined(TEST_AES_XLC)
 59 | 	/* XL C/C++ path, see http://stackoverflow.com/q/46124383/608639 */
 60 | 	return vec_xl_be(0, (uint8_t*)src);
 61 | #elif defined(TEST_AES_LITTLE_ENDIAN)
 62 | 	/* GCC, Clang, etc on little-endian: load first, then reverse the bytes */
 63 | 	const uint8x16_p8 loaded = vec_vsx_ld(0, src);
 64 | 	return Reverse8x16(loaded);
 65 | #else
 66 | 	/* GCC, Clang, etc on big-endian: no conversion needed */
 67 | 	return vec_vsx_ld(0, src);
 68 | #endif
 69 | }
 70 | 
 71 | /* Store src to dest in big-endian byte order, converting when the target requires it */
 72 | void Store8x16(const uint8x16_p8 src, uint8_t dest[16])
 73 | {
 74 | #if defined(TEST_AES_XLC)
 75 | 	/* XL C/C++ path, see http://stackoverflow.com/q/46124383/608639 */
 76 | 	vec_xst_be(src, 0, (uint8_t*)dest);
 77 | #elif defined(TEST_AES_LITTLE_ENDIAN)
 78 | 	/* GCC, Clang, etc on little-endian: reverse the bytes, then store */
 79 | 	const uint8x16_p8 swapped = Reverse8x16(src);
 80 | 	vec_vsx_st(swapped, 0, dest);
 81 | #else
 82 | 	/* GCC, Clang, etc on big-endian: store unchanged */
 83 | 	vec_vsx_st(src, 0, dest);
 84 | #endif
 85 | }
 86 | 
 87 | /* Load 16 bytes from src as a big-endian vector typed as 64x2, converting when the target requires it */
 88 | uint64x2_p8 Load64x2(const uint8_t src[16])
 89 | {
 90 | #if defined(TEST_AES_XLC)
 91 | 	/* XL C/C++ path, see http://stackoverflow.com/q/46124383/608639 */
 92 | 	return (uint64x2_p8)vec_xl_be(0, (uint8_t*)src);
 93 | #elif defined(TEST_AES_LITTLE_ENDIAN)
 94 | 	/* GCC, Clang, etc on little-endian: load first, then reverse the bytes */
 95 | 	const uint8x16_p8 loaded = vec_vsx_ld(0, src);
 96 | 	return (uint64x2_p8)Reverse8x16(loaded);
 97 | #else
 98 | 	/* GCC, Clang, etc on big-endian: no conversion needed */
 99 | 	return (uint64x2_p8)vec_vsx_ld(0, src);
100 | #endif
101 | }
102 | 
103 | /* Store the 64x2 vector src to dest in big-endian byte order, converting when the target requires it */
104 | void Store64x2(const uint64x2_p8 src, uint8_t dest[16])
105 | {
106 | #if defined(TEST_AES_XLC)
107 | 	/* XL C/C++ path, see http://stackoverflow.com/q/46124383/608639 */
108 | 	vec_xst_be((uint8x16_p8)src, 0, (uint8_t*)dest);
109 | #elif defined(TEST_AES_LITTLE_ENDIAN)
110 | 	/* GCC, Clang, etc on little-endian: reverse the bytes, then store */
111 | 	const uint8x16_p8 swapped = Reverse8x16((uint8x16_p8)src);
112 | 	vec_vsx_st(swapped, 0, dest);
113 | #else
114 | 	/* GCC, Clang, etc on big-endian: store unchanged */
115 | 	vec_vsx_st((uint8x16_p8)src, 0, dest);
116 | #endif
117 | }
118 | /* Known-answer test: encrypt the FIPS 197 Appendix B block and compare; exit status is 0 on a match and 1 otherwise */
119 | int main(int argc, char* argv[])
120 | {
121 | 	/* FIPS 197, Appendix B input */
122 | 	const uint8_t input[17] = { /* user input, unaligned buffer */
123 | 		0xff /*-1*/, 0x32, 0x43, 0xf6, 0xa8, 0x88, 0x5a, 0x30, 0x8d, 0x31, 0x31, 0x98, 0xa2, 0xe0, 0x37, 0x07, 0x34
124 | 	};
125 | 
126 | 	/* FIPS 197, Appendix B key */
127 | 	const uint8_t key[18] = { /* user input, unaligned buffer */
128 | 		0xff /*-1*/, 0xff /*-1*/, 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x9 , 0xcf, 0x4f, 0x3c
129 | 	};
130 | 
131 | 	/* FIPS 197, Appendix B expanded subkeys */
132 | 	__attribute__((aligned(16)))
133 | 	const uint8_t subkeys[10][16] = { /* library controlled, aligned buffer */
134 | 		{0xA0, 0xFA, 0xFE, 0x17, 0x88, 0x54, 0x2c, 0xb1, 0x23, 0xa3, 0x39, 0x39, 0x2a, 0x6c, 0x76, 0x05},
135 | 		{0xF2, 0xC2, 0x95, 0xF2, 0x7a, 0x96, 0xb9, 0x43, 0x59, 0x35, 0x80, 0x7a, 0x73, 0x59, 0xf6, 0x7f},
136 | 		{0x3D, 0x80, 0x47, 0x7D, 0x47, 0x16, 0xFE, 0x3E, 0x1E, 0x23, 0x7E, 0x44, 0x6D, 0x7A, 0x88, 0x3B},
137 | 		{0xEF, 0x44, 0xA5, 0x41, 0xA8, 0x52, 0x5B, 0x7F, 0xB6, 0x71, 0x25, 0x3B, 0xDB, 0x0B, 0xAD, 0x00},
138 | 		{0xD4, 0xD1, 0xC6, 0xF8, 0x7C, 0x83, 0x9D, 0x87, 0xCA, 0xF2, 0xB8, 0xBC, 0x11, 0xF9, 0x15, 0xBC},
139 | 		{0x6D, 0x88, 0xA3, 0x7A, 0x11, 0x0B, 0x3E, 0xFD, 0xDB, 0xF9, 0x86, 0x41, 0xCA, 0x00, 0x93, 0xFD},
140 | 		{0x4E, 0x54, 0xF7, 0x0E, 0x5F, 0x5F, 0xC9, 0xF3, 0x84, 0xA6, 0x4F, 0xB2, 0x4E, 0xA6, 0xDC, 0x4F},
141 | 		{0xEA, 0xD2, 0x73, 0x21, 0xB5, 0x8D, 0xBA, 0xD2, 0x31, 0x2B, 0xF5, 0x60, 0x7F, 0x8D, 0x29, 0x2F},
142 | 		{0xAC, 0x77, 0x66, 0xF3, 0x19, 0xFA, 0xDC, 0x21, 0x28, 0xD1, 0x29, 0x41, 0x57, 0x5c, 0x00, 0x6E},
143 | 		{0xD0, 0x14, 0xF9, 0xA8, 0xC9, 0xEE, 0x25, 0x89, 0xE1, 0x3F, 0x0c, 0xC8, 0xB6, 0x63, 0x0C, 0xA6}
144 | 	};
145 | 
146 | 	/* Result */
147 | 	uint8_t result[19] = { /* user output, unaligned buffer */
148 | 		0xff /*-1*/, 0xff /*-1*/, 0xff /*-1*/
149 | 	};
150 | 	/* The leading 0xff bytes in input, key and result are padding; the data starts at offsets 1, 2 and 3 */
151 | #if defined(TEST_AES_XLC)
152 | 
153 | 	/* Ensure we are exercising unaligned user buffers */
154 | 	uint8x16_p8 s = Load8x16(input+1);
155 | 	uint8x16_p8 k = Load8x16(key+2);
156 | 	s = vec_xor(s, k);
157 | 
158 | 	k = Load8x16(subkeys[0]);
159 | 	s = __vcipher(s, k);
160 | 
161 | 	k = Load8x16(subkeys[1]);
162 | 	s = __vcipher(s, k);
163 | 
164 | 	k = Load8x16(subkeys[2]);
165 | 	s = __vcipher(s, k);
166 | 
167 | 	k = Load8x16(subkeys[3]);
168 | 	s = __vcipher(s, k);
169 | 
170 | 	k = Load8x16(subkeys[4]);
171 | 	s = __vcipher(s, k);
172 | 
173 | 	k = Load8x16(subkeys[5]);
174 | 	s = __vcipher(s, k);
175 | 
176 | 	k = Load8x16(subkeys[6]);
177 | 	s = __vcipher(s, k);
178 | 
179 | 	k = Load8x16(subkeys[7]);
180 | 	s = __vcipher(s, k);
181 | 
182 | 	k = Load8x16(subkeys[8]);
183 | 	s = __vcipher(s, k);
184 | 
185 | 	k = Load8x16(subkeys[9]);
186 | 	s = __vcipherlast(s, k);
187 | 
188 | 	/* Ensure we are exercising unaligned user buffers */
189 | 	Store8x16(s, result+3);
190 | 
191 | #elif defined(TEST_AES_GCC)
192 | 
193 | 	/* Ensure we are exercising unaligned user buffers */
194 | 	uint64x2_p8 s = Load64x2(input+1);
195 | 	uint64x2_p8 k = Load64x2(key+2);
196 | 	s = vec_xor(s, k);
197 | 
198 | 	k = Load64x2(subkeys[0]);
199 | 	s = __builtin_crypto_vcipher(s, k);
200 | 
201 | 	k = Load64x2(subkeys[1]);
202 | 	s = __builtin_crypto_vcipher(s, k);
203 | 
204 | 	k = Load64x2(subkeys[2]);
205 | 	s = __builtin_crypto_vcipher(s, k);
206 | 
207 | 	k = Load64x2(subkeys[3]);
208 | 	s = __builtin_crypto_vcipher(s, k);
209 | 
210 | 	k = Load64x2(subkeys[4]);
211 | 	s = __builtin_crypto_vcipher(s, k);
212 | 
213 | 	k = Load64x2(subkeys[5]);
214 | 	s = __builtin_crypto_vcipher(s, k);
215 | 
216 | 	k = Load64x2(subkeys[6]);
217 | 	s = __builtin_crypto_vcipher(s, k);
218 | 
219 | 	k = Load64x2(subkeys[7]);
220 | 	s = __builtin_crypto_vcipher(s, k);
221 | 
222 | 	k = Load64x2(subkeys[8]);
223 | 	s = __builtin_crypto_vcipher(s, k);
224 | 
225 | 	k = Load64x2(subkeys[9]);
226 | 	s = __builtin_crypto_vcipherlast(s, k);
227 | 
228 | 	/* Ensure we are exercising unaligned user buffers */
229 | 	Store64x2(s, result+3);
230 | 
231 | #endif /* with any other compiler nothing is encrypted, so the comparison below reports FAILURE */
232 | 
233 | 	printf("Input: ");
234 | 	for (unsigned int i=1; i<17; ++i)
235 | 		printf("%02X ", input[i]);
236 | 	printf("\n");
237 | 
238 | 	printf("Key: ");
239 | 	for (unsigned int i=2; i<18; ++i)
240 | 		printf("%02X ", key[i]);
241 | 	printf("\n");
242 | 
243 | 	printf("Output: ");
244 | 	for (unsigned int i=3; i<19; ++i)
245 | 		printf("%02X ", result[i]);
246 | 	printf("\n");
247 | 
248 | 	/* FIPS 197, Appendix B output */
249 | 	const uint8_t exp[16] = {
250 | 		0x39, 0x25, 0x84, 0x1D, 0x02, 0xDC, 0x09, 0xFB, 0xDC, 0x11, 0x85, 0x97, 0x19, 0x6A, 0x0B, 0x32
251 | 	};
252 | 	const int matched = (0 == memcmp(result+3, exp, 16));
253 | 	if (matched)
254 | 		printf("SUCCESS!!!\n");
255 | 	else
256 | 		printf("FAILURE!!!\n");
257 | 	/* Report a mismatch through the exit status so scripts can detect it */
258 | 	return matched ? 0 : 1;
259 | }
260 | 


--------------------------------------------------------------------------------