├── README.md ├── AES_final.cu ├── 192-ctr.cuh ├── 256-ctr.cuh ├── 256-es.cuh ├── LICENSE ├── 192-es.cuh ├── 128-ctr.cuh ├── file-encryption.cuh ├── AES_final.h └── 128-es.cuh /README.md: -------------------------------------------------------------------------------- 1 | **Breakthrough AES Performance on GPUs** 2 | 3 | These are CUDA optimizations of a T-table based implementation of AES that has zero shared-memory bank conflicts. 4 | 5 | We achieved 6 | 7 | **315.2** Gbps AES-128 encryption on a **GTX 970**
8 | **878.6** Gbps AES-128 encryption on an **RTX 2070 Super** 9 | 10 | These results are published in https://ieeexplore.ieee.org/document/9422754 11 | 12 | In science, reproducibility of experiments is crucial, but almost none of the GPU optimizations of AES are publicly available. This is why we publish our code here. 13 | 14 | Moreover, comparing different optimization results on different GPUs is almost impossible. If you have a different kind of optimization and want to compare it with ours, please run this code on the same GPU you used for your own code. 15 | 16 | 17 | **Cihangir Tezcan**, PhD
18 | _Director of Cyber Security Center_
19 | _Head of Department of Cyber Security, Informatics Institute_
20 | _Middle East Technical University_
_Ankara, Turkey_

--------------------------------------------------------------------------------
/AES_final.cu:
--------------------------------------------------------------------------------
// NOTE(review): the angle-bracket header names in this file were stripped in
// the copy under review; they are restored here from what the code uses
// (printf, clock, CUDA runtime). Confirm against the upstream repository.

// System includes
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctime>

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
//#include <helper_functions.h>
//#include <helper_cuda.h>

#include <device_launch_parameters.h>
//#include <math.h>

// Custom header
#include "AES_final.h"
//
#include "128-es.cuh"
#include "128-ctr.cuh"
#include "192-es.cuh"
#include "192-ctr.cuh"
#include "256-es.cuh"
#include "256-ctr.cuh"
//#include "small.cuh"
//#include "silent.cuh"
#include "file-encryption.cuh"

// Runs the benchmark that corresponds to the menu number printed by main().
// Choices 1, 11, 2 and 22 are forwarded unchanged to main128ExhaustiveSearch,
// which selects the AES-128 kernel variant itself. Choice 8 runs one
// benchmark per key size and mode. Any other value prints "Wrong selection".
void selection(int choice) {
    switch (choice) {
    case 1:
    case 11:
    case 2:
    case 22:
        main128ExhaustiveSearch(choice);
        break;
    case 3:
        main128Ctr();
        break;
    case 4:
        main192ExhaustiveSearch();
        break;
    case 5:
        main192Ctr();
        break;
    case 6:
        main256ExhaustiveSearch();
        break;
    case 7:
        main256Ctr();
        break;
    case 8:
        main128ExhaustiveSearch(1);
        main128Ctr();
        main192ExhaustiveSearch();
        main192Ctr();
        main256ExhaustiveSearch();
        main256Ctr();
        break;
    default:
        printf("Wrong selection\n");
        break;
    }
}

// Program entry point: selects GPU 0, prints the benchmark menu, reads one
// integer from stdin and dispatches it to selection().
// Returns 0 after a dispatched run, 1 when no integer could be read.
int main() {
    cudaSetDevice(0);

    // Initialised so that a failed read can never hand an indeterminate
    // value to selection().
    int choice = 0;
    printf(
        "(1) AES-128 Exhaustive Search (no bank conflict, byteperm)\n"
        "(11) AES-128 Exhaustive Search (no bank conflict, arithmetic shift)\n"
        "(2) AES-128 Exhaustive Search (conflicting S-box, arithmetic shift)\n"
        "(22) AES-128 Exhaustive Search (conflicting S-box, byteperm)\n"
        "(3) AES-128 CTR \n"
        "(4) AES-192 Exhaustive Search\n"
        "(5) AES-192 CTR\n"
        "(6) AES-256 Exhaustive Search\n"
        "(7) AES-256 CTR\n"
        "(8) ALL\n"
        "Choice: ");
    if (scanf_s("%d", &choice) != 1) {
        // Non-numeric input or EOF: report it the same way as an unknown number.
        printf("Wrong selection\n");
        return 1;
    }
    selection(choice);

    // Individual entry points, kept for reference (all reachable via the menu
    // except the last three, whose headers may be commented out above):
    // main128ExhaustiveSearch();   // AES-128 Exhaustive Search
    // main128Ctr();                // AES-128 Counter Mode
    // main192ExhaustiveSearch();   // AES-192 Exhaustive Search
    // main192Ctr();                // AES-192 Counter Mode
    // main256ExhaustiveSearch();   // AES-256 Exhaustive Search
    // main256Ctr();                // AES-256 Counter Mode
    // mainSmall();                 // Small AES probability calculation
    // mainSilent();                // Silent
    // mainFileEncryption();        // File Encryption
    return 0;
}

--------------------------------------------------------------------------------
/192-ctr.cuh:
--------------------------------------------------------------------------------
// NOTE(review): angle-bracket header names were stripped in the copy under
// review and are restored from usage; confirm against upstream.

// System includes
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctime>

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
//#include <helper_functions.h>
//#include <helper_cuda.h>

#include <device_launch_parameters.h>
//#include <math.h>

// Custom header
//#include "kernel.h"


// AES-192 key expansion on the host.
//   key : 6 input key words (read as key[0..5])
//   rk  : output round-key words; with ROUND_COUNT_192 == 8 this fills
//         rk[0..51] (presumably AES_192_KEY_SIZE_INT == 52 — confirm in AES_final.h)
// Uses the host tables T4_0..T4_3 and RCON32 declared elsewhere.
__host__ void keyExpansion192(u32* key, u32* rk) {

    u32 rk0 = key[0];
    u32 rk1 = key[1];
    u32 rk2 = key[2];
    u32 rk3 = key[3];
    u32 rk4 = key[4];
    u32 rk5 = key[5];

    rk[0] = rk0;
    rk[1] = rk1;
    rk[2] = rk2;
    rk[3] = rk3;
    rk[4] = rk4;
    rk[5] = rk5;

    for (u8 roundCount = 0; roundCount < ROUND_COUNT_192; roundCount++) {
        // The four T4_x lookups combine the bytes of the previous last word
        // in rotated order; the round constant is XORed in afterwards.
        u32 temp = rk5;
        rk0 = rk0 ^ T4_3[(temp >> 16) & 0xff] ^ T4_2[(temp >> 8) & 0xff] ^ T4_1[(temp) & 0xff] ^ T4_0[(temp >> 24)] ^ RCON32[roundCount];
        rk1 = rk1 ^ rk0;
        rk2 = rk2 ^ rk1;
        rk3 = rk3 ^ rk2;
        rk4 = rk4 ^ rk3;
        rk5 = rk5 ^ rk4;

        u32 base = roundCount * 6;
        rk[base + 6] = rk0;
        rk[base + 7] = rk1;
        rk[base + 8] = rk2;
        rk[base + 9] = rk3;
        if (roundCount == 7) {
            // Only four words of the last 6-word group are stored
            // (highest index written is 7 * 6 + 9 == 51).
            break;
        }
        rk[base + 10] = rk4;
        rk[base + 11] = rk5;
    }

    // Print keys
    //for (int i = 0;i < 52;i++) {
    //	printf("%08x ", rk[i]);
    //	if ((i+1) % 4 == 0) {
    //		printf("Round: %d\n", i / 4);
    //	}
    //}
}

// AES-192 CTR benchmark kernel.
//
// Shared memory layout:
//   t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE] : every T0 entry replicated across a row;
//       each thread only reads column (threadIdx.x & 31), so lanes of a warp
//       use distinct columns (assumes SHARED_MEM_BANK_SIZE >= 32 — confirm).
//   t4S[TABLE_SIZE][S_BOX_BANK_SIZE]      : T4 (S-box) table, partly replicated.
//   rkS[AES_192_KEY_SIZE_INT]             : expanded round keys.
//
// Parameters:
//   pt    : 4-word initial counter block; pt[2]:pt[3] is treated as a 64-bit counter
//   rk    : expanded round keys (AES_192_KEY_SIZE_INT words)
//   t0G   : T0 table in global/managed memory (TABLE_SIZE words)
//   t4G   : T4 table in global/managed memory (TABLE_SIZE words)
//   range : number of consecutive counter blocks each thread encrypts
//           (read as u32, exactly as before)
//
// Ciphertexts are not stored; only global thread 1048575 prints its final
// counter value and last ciphertext (that index is the last thread of a
// 2^20-thread launch).
__global__ void counter192WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* rk, u32* t0G, u32* t4G, u64* range) {

    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int warpThreadIndex = threadIdx.x & 31;
    int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE;

    // <SHARED MEMORY>
    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
    __shared__ u32 rkS[AES_192_KEY_SIZE_INT];

    // Block-strided fill: identical to the former "threadIdx.x < TABLE_SIZE"
    // guard when blockDim.x >= TABLE_SIZE, and still fills every row when the
    // block is smaller than the table.
    for (int row = threadIdx.x; row < TABLE_SIZE; row += blockDim.x) {
        u32 t0Value = t0G[row];
        for (int bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[row][bankIndex] = t0Value;
        }

        u32 t4Value = t4G[row];
        for (int bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) {
            t4S[row][bankIndex] = t4Value;
        }
    }
    for (int word = threadIdx.x; word < AES_192_KEY_SIZE_INT; word += blockDim.x) {
        rkS[word] = rk[word];
    }
    // </SHARED MEMORY>

    // Wait until every thread is ready
    __syncthreads();

    u32 pt0Init, pt1Init, pt2Init, pt3Init;
    // Zero-initialised so the final printf is well defined even if *range == 0.
    u32 s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    pt0Init = pt[0];
    pt1Init = pt[1];
    pt2Init = pt[2];
    pt3Init = pt[3];

    // Advance the 64-bit counter (pt2:pt3) to this thread's first block.
    u32 threadRange = *range;
    u64 threadRangeStart = pt2Init;
    threadRangeStart = threadRangeStart << 32;
    threadRangeStart ^= pt3Init;
    threadRangeStart += (u64)threadIndex * threadRange;
    pt2Init = threadRangeStart >> 32;
    pt3Init = threadRangeStart & 0xFFFFFFFF;

    for (u32 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        // Create plaintext as 32 bit unsigned integers
        s0 = pt0Init;
        s1 = pt1Init;
        s2 = pt2Init;
        s3 = pt3Init;

        // First round just XORs input with key.
        s0 = s0 ^ rkS[0];
        s1 = s1 ^ rkS[1];
        s2 = s2 ^ rkS[2];
        s3 = s3 ^ rkS[3];

        u32 t0, t1, t2, t3;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1_192; roundCount++) {

            // Table based round function: one T0 lookup per state byte; the
            // other three lookups are passed through
            // arithmeticRightShiftBytePerm (defined elsewhere) with
            // SHIFT_1/2/3_RIGHT instead of using separate T1..T3 tables.
            u32 rkStart = roundCount * 4 + 4;
            t0 = t0S[s0 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart];
            t1 = t0S[s1 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 1];
            t2 = t0S[s2 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 2];
            t3 = t0S[s3 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 3];

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;

        }

        // Last round uses s-box directly and XORs to produce output.
        s0 = (t4S[t0 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(t1 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(t2 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[(t3) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[48];
        s1 = (t4S[t1 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(t2 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(t3 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[(t0) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[49];
        s2 = (t4S[t2 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(t3 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(t0 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[(t1) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[50];
        s3 = (t4S[t3 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(t0 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(t1 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[(t2) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[51];

        /*if (threadIndex == 0 && rangeCount == 0) {
            printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3);
        }*/

        // Increment the 64-bit counter: carry into pt2 when pt3 wraps.
        if (pt3Init == MAX_U32) {
            pt2Init++;
        }
        pt3Init++;
    }

    if (threadIndex == 1048575) {
        printf("Plaintext : %08x %08x %08x %08x\n", pt0Init, pt1Init, pt2Init, pt3Init);
        printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3);
        printf("-------------------------------\n");
    }

}

// Host driver for the AES-192 CTR benchmark: sets up the counter block, key
// and tables in managed memory, expands the key on the host, launches the
// kernel once and prints the elapsed wall-clock time.
// Only buffers the kernel actually receives are allocated; the formerly
// allocated but never used ct, rcon, t1..t3 and t4_0..t4_3 copies are gone.
// Always returns 0.
__host__ int main192Ctr() {
    printf("\n");
    printf("########## AES-192 Counter Mode Implementation ##########\n");
    printf("\n");

    // Allocate plaintext and every round key
    u32 *pt, *rk192, *roundKeys192;
    gpuErrorCheck(cudaMallocManaged(&pt, 4 * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&rk192, 6 * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&roundKeys192, AES_192_KEY_SIZE_INT * sizeof(u32)));

    pt[0] = 0x6bc1bee2U;
    pt[1] = 0x2e409f96U;
    pt[2] = 0xe93d7e11U;
    pt[3] = 0x7393172aU;

    rk192[0] = 0x8e73b0f7U;
    rk192[1] = 0xda0e6452U;
    rk192[2] = 0xc810f32bU;
    rk192[3] = 0x809079e5U;
    rk192[4] = 0x62f8ead2U;
    rk192[5] = 0x522c6b7bU;

    // Allocate the two tables the kernel reads
    u32 *t0, *t4;
    gpuErrorCheck(cudaMallocManaged(&t0, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t4, TABLE_SIZE * sizeof(u32)));
    for (int i = 0; i < TABLE_SIZE; i++) {
        t0[i] = T0[i];
        t4[i] = T4[i];
    }

    printf("-------------------------------\n");
    u64* range = calculateRange();
    /*	printf("Initial Plaintext : %08x %08x %08x %08x\n", pt[0], pt[1], pt[2], pt[3]);
        printf("Initial Key : %08x %08x %08x %08x %08x %08x\n", rk192[0], rk192[1], rk192[2], rk192[3], rk192[4], rk192[5]);
        printf("-------------------------------\n");*/

    // Key expansion
    keyExpansion192(rk192, roundKeys192);

    clock_t beginTime = clock();
    // Kernels
    // NOTE(review): the launch configuration was garbled to "<<> >" in the
    // copy under review. BLOCKS and THREADS are assumed to be the launch
    // constants from AES_final.h (the kernel's 1048575 check implies 2^20
    // threads in total) — confirm against upstream.
    counter192WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<<BLOCKS, THREADS>>>(pt, roundKeys192, t0, t4, range);

    cudaDeviceSynchronize();
    printf("Time elapsed: %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);
    printf("-------------------------------\n");
    printLastCUDAError();

    // Free allocated arrays
    cudaFree(range);
    cudaFree(pt);
    cudaFree(rk192);
    cudaFree(roundKeys192);
    cudaFree(t0);
    cudaFree(t4);

    return 0;
}

--------------------------------------------------------------------------------
/256-ctr.cuh:
--------------------------------------------------------------------------------
// NOTE(review): angle-bracket header names were stripped in the copy under
// review and are restored from usage; confirm against upstream.

// System includes
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctime>

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
//#include <helper_functions.h>
//#include <helper_cuda.h>

#include <device_launch_parameters.h>
//#include <math.h>

// Custom header
//#include "kernel.h"


// AES-256 key expansion on the host.
//   key : 8 input key words (read as key[0..7])
//   rk  : output round-key words; with ROUND_COUNT_256 == 7 this fills
//         rk[0..59] (presumably AES_256_KEY_SIZE_INT == 60 — confirm in AES_final.h)
// Uses the host tables T4_0..T4_3 and RCON32 declared elsewhere.
__host__ void keyExpansion256(u32* key, u32* rk) {

    u32 rk0 = key[0];
    u32 rk1 = key[1];
    u32 rk2 = key[2];
    u32 rk3 = key[3];
    u32 rk4 = key[4];
    u32 rk5 = key[5];
    u32 rk6 = key[6];
    u32 rk7 = key[7];

    rk[0] = rk0;
    rk[1] = rk1;
    rk[2] = rk2;
    rk[3] = rk3;
    rk[4] = rk4;
    rk[5] = rk5;
    rk[6] = rk6;
    rk[7] = rk7;

    for (u8 roundCount = 0; roundCount < ROUND_COUNT_256; roundCount++) {
        // First half: table lookups on the rotated bytes of the previous
        // last word, plus the round constant.
        u32 temp = rk7;
        rk0 = rk0 ^ T4_3[(temp >> 16) & 0xff] ^ T4_2[(temp >> 8) & 0xff] ^ T4_1[(temp) & 0xff] ^ T4_0[(temp >> 24)] ^ RCON32[roundCount];
        rk1 = rk1 ^ rk0;
        rk2 = rk2 ^ rk1;
        rk3 = rk3 ^ rk2;
        // Second half: table lookups on rk3 without rotation and without
        // a round constant.
        rk4 = rk4 ^ T4_3[(rk3 >> 24) & 0xff] ^ T4_2[(rk3 >> 16) & 0xff] ^ T4_1[(rk3 >> 8) & 0xff] ^ T4_0[rk3 & 0xff];
        rk5 = rk5 ^ rk4;
        rk6 = rk6 ^ rk5;
        rk7 = rk7 ^ rk6;

        u32 base = roundCount * 8;
        rk[base + 8] = rk0;
        rk[base + 9] = rk1;
        rk[base + 10] = rk2;
        rk[base + 11] = rk3;
        if (roundCount == 6) {
            // Only four words of the last 8-word group are stored
            // (highest index written is 6 * 8 + 11 == 59).
            break;
        }
        rk[base + 12] = rk4;
        rk[base + 13] = rk5;
        rk[base + 14] = rk6;
        rk[base + 15] = rk7;

    }

    //for (int i = 0; i < 60; i++) {
    //	printf("%08x ", rk[i]);
    //	if ((i + 1) % 4 == 0) {
    //		printf("Round: %d\n", i / 4);
    //	}
    //}
}

// AES-256 CTR benchmark kernel.
//
// Same structure as the AES-192 CTR kernel: the T0 table is replicated across
// the columns of t0S and each thread reads column (threadIdx.x & 31); the T4
// table is replicated across S_BOX_BANK_SIZE columns; the expanded key is
// cached in rkS.
//
// Parameters:
//   pt    : 4-word initial counter block; pt[2]:pt[3] is treated as a 64-bit counter
//   rk    : expanded round keys (AES_256_KEY_SIZE_INT words)
//   t0G   : T0 table in global/managed memory (TABLE_SIZE words)
//   t4G   : T4 table in global/managed memory (TABLE_SIZE words)
//   range : number of consecutive counter blocks each thread encrypts
//           (read as u32, exactly as before)
//
// Ciphertexts are not stored; only global thread 1048575 prints its final
// counter value and last ciphertext.
__global__ void counter256WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* rk, u32* t0G, u32* t4G, u64* range) {

    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int warpThreadIndex = threadIdx.x & 31;
    int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE;

    // <SHARED MEMORY>
    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
    __shared__ u32 rkS[AES_256_KEY_SIZE_INT];

    // Block-strided fill: identical to the former "threadIdx.x < TABLE_SIZE"
    // guard when blockDim.x >= TABLE_SIZE, and still fills every row when the
    // block is smaller than the table.
    for (int row = threadIdx.x; row < TABLE_SIZE; row += blockDim.x) {
        u32 t0Value = t0G[row];
        for (int bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[row][bankIndex] = t0Value;
        }

        u32 t4Value = t4G[row];
        for (int bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) {
            t4S[row][bankIndex] = t4Value;
        }
    }
    for (int word = threadIdx.x; word < AES_256_KEY_SIZE_INT; word += blockDim.x) {
        rkS[word] = rk[word];
    }
    // </SHARED MEMORY>

    // Wait until every thread is ready
    __syncthreads();

    u32 pt0Init, pt1Init, pt2Init, pt3Init;
    // Zero-initialised so the final printf is well defined even if *range == 0.
    u32 s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    pt0Init = pt[0];
    pt1Init = pt[1];
    pt2Init = pt[2];
    pt3Init = pt[3];

    // Advance the 64-bit counter (pt2:pt3) to this thread's first block.
    u32 threadRange = *range;
    u64 threadRangeStart = pt2Init;
    threadRangeStart = threadRangeStart << 32;
    threadRangeStart ^= pt3Init;
    threadRangeStart += (u64)threadIndex * threadRange;
    pt2Init = threadRangeStart >> 32;
    pt3Init = threadRangeStart & 0xFFFFFFFF;

    for (u32 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        // Create plaintext as 32 bit unsigned integers
        s0 = pt0Init;
        s1 = pt1Init;
        s2 = pt2Init;
        s3 = pt3Init;

        // First round just XORs input with key.
        s0 = s0 ^ rkS[0];
        s1 = s1 ^ rkS[1];
        s2 = s2 ^ rkS[2];
        s3 = s3 ^ rkS[3];

        u32 t0, t1, t2, t3;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1_256; roundCount++) {

            // Table based round function (single T0 table; the other three
            // lookups go through arithmeticRightShiftBytePerm).
            u32 rkStart = roundCount * 4 + 4;
            t0 = t0S[s0 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart];
            t1 = t0S[s1 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 1];
            t2 = t0S[s2 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 2];
            t3 = t0S[s3 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 3];

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;

        }

        // Last round uses s-box directly and XORs to produce output.
        s0 = (t4S[t0 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(t1 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(t2 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[(t3) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[56];
        s1 = (t4S[t1 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(t2 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(t3 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[(t0) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[57];
        s2 = (t4S[t2 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(t3 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(t0 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[(t1) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[58];
        s3 = (t4S[t3 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(t0 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(t1 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[(t2) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[59];

        //if (threadIndex == 0 && rangeCount == 0) {
        //printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3);
        //}

        // Increment the 64-bit counter: carry into pt2 when pt3 wraps.
        if (pt3Init == MAX_U32) {
            pt2Init++;
        }
        pt3Init++;
    }

    if (threadIndex == 1048575) {
        printf("Plaintext : %08x %08x %08x %08x\n", pt0Init, pt1Init, pt2Init, pt3Init);
        printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3);
        printf("-------------------------------\n");
    }

}

// Host driver for the AES-256 CTR benchmark: sets up the counter block, key
// and tables in managed memory, expands the key on the host, launches the
// kernel once and prints the elapsed wall-clock time.
// Only buffers the kernel actually receives are allocated; the formerly
// allocated but never used ct, rcon, t1..t3 and t4_0..t4_3 copies are gone.
// Always returns 0.
__host__ int main256Ctr() {
    printf("\n");
    printf("########## AES-256 Counter Mode Implementation ##########\n");
    printf("\n");

    // Allocate plaintext and every round key
    u32 *pt, *rk256, *roundKeys256;
    gpuErrorCheck(cudaMallocManaged(&pt, 4 * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&rk256, 8 * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&roundKeys256, AES_256_KEY_SIZE_INT * sizeof(u32)));

    pt[0] = 0x6bc1bee2U;
    pt[1] = 0x2e409f96U;
    pt[2] = 0xe93d7e11U;
    pt[3] = 0x7393172aU;

    rk256[0] = 0x603deb10U;
    rk256[1] = 0x15ca71beU;
    rk256[2] = 0x2b73aef0U;
    rk256[3] = 0x857d7781U;
    rk256[4] = 0x1f352c07U;
    rk256[5] = 0x3b6108d7U;
    rk256[6] = 0x2d9810a3U;
    rk256[7] = 0x0914dff4U;

    // Allocate the two tables the kernel reads
    u32 *t0, *t4;
    gpuErrorCheck(cudaMallocManaged(&t0, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t4, TABLE_SIZE * sizeof(u32)));
    for (int i = 0; i < TABLE_SIZE; i++) {
        t0[i] = T0[i];
        t4[i] = T4[i];
    }

    printf("-------------------------------\n");
    u64* range = calculateRange();
    /*	printf("Initial Plaintext : %08x %08x %08x %08x\n", pt[0], pt[1], pt[2], pt[3]);
        printf("Initial Key : %08x %08x %08x %08x %08x %08x %08x %08x\n", rk256[0], rk256[1], rk256[2], rk256[3], rk256[4], rk256[5], rk256[6], rk256[7]);
        printf("-------------------------------\n");*/

    keyExpansion256(rk256, roundKeys256);
    clock_t beginTime = clock();
    // Kernels
    // NOTE(review): the launch configuration was garbled to "<<>>>" in the
    // copy under review. BLOCKS and THREADS are assumed to be the launch
    // constants from AES_final.h — confirm against upstream.
    counter256WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<<BLOCKS, THREADS>>>(pt, roundKeys256, t0, t4, range);

    cudaDeviceSynchronize();
    printf("Time elapsed: %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);
    printf("-------------------------------\n");
    printLastCUDAError();

    // Free allocated arrays
    cudaFree(range);
    cudaFree(pt);
    cudaFree(rk256);
    cudaFree(roundKeys256);
    cudaFree(t0);
    cudaFree(t4);

    return 0;
}

--------------------------------------------------------------------------------
/256-es.cuh:
--------------------------------------------------------------------------------
// NOTE(review): angle-bracket header names were stripped in the copy under
// review and are restored from usage; confirm against upstream.

// System includes
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctime>

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
//#include <helper_functions.h>
//#include <helper_cuda.h>

#include <device_launch_parameters.h>
//#include <math.h>

// Custom header
//#include "kernel.h"


// AES-256 exhaustive key search benchmark kernel.
//
// Every thread tries *range consecutive candidate keys. A candidate is the
// 8-word key rk[0..7] with its low 64 bits (rk[6]:rk[7]) advanced by
// threadIndex * *range + i. For each candidate the kernel encrypts pt,
// generating the round keys on the fly, and compares the result word by word
// against ct; a full match prints the candidate key.
//
// Shared memory layout:
//   t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE] : every T0 entry replicated across a row;
//       each thread only reads column (threadIdx.x & 31)
//       (assumes SHARED_MEM_BANK_SIZE >= 32 — confirm).
//   t4S[TABLE_SIZE][S_BOX_BANK_SIZE]      : T4 (S-box) table, partly replicated.
//   rconS[RCON_SIZE], ctS[U32_SIZE]       : round constants and target ciphertext.
//
// Parameters:
//   pt    : 4-word known plaintext
//   ct    : target ciphertext (U32_SIZE words are cached, ct[0..3] compared)
//   rk    : 8-word starting key
//   t0G   : T0 table (TABLE_SIZE words)
//   t4G   : T4 table (TABLE_SIZE words)
//   rconG : round constants (RCON_SIZE words)
//   range : number of candidate keys per thread (read as u32, exactly as before)
__global__ void exhaustiveSearch256WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range) {

    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int warpThreadIndex = threadIdx.x & 31;
    int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE;

    // <SHARED MEMORY>
    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
    __shared__ u32 rconS[RCON_SIZE];
    __shared__ u32 ctS[U32_SIZE];

    // Block-strided fill: identical to the former "threadIdx.x < TABLE_SIZE"
    // guard when blockDim.x >= TABLE_SIZE, and still fills every entry when
    // the block is smaller than the table.
    for (int row = threadIdx.x; row < TABLE_SIZE; row += blockDim.x) {
        u32 t0Value = t0G[row];
        for (int bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[row][bankIndex] = t0Value;
        }

        u32 t4Value = t4G[row];
        for (int bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) {
            t4S[row][bankIndex] = t4Value;
        }
    }
    for (int i = threadIdx.x; i < RCON_SIZE; i += blockDim.x) {
        rconS[i] = rconG[i];
    }
    for (int i = threadIdx.x; i < U32_SIZE; i += blockDim.x) {
        ctS[i] = ct[i];
    }
    // </SHARED MEMORY>

    // Wait until every thread is ready
    __syncthreads();

    u32 rk0Init, rk1Init, rk2Init, rk3Init, rk4Init, rk5Init, rk6Init, rk7Init;
    rk0Init = rk[0];
    rk1Init = rk[1];
    rk2Init = rk[2];
    rk3Init = rk[3];
    rk4Init = rk[4];
    rk5Init = rk[5];
    rk6Init = rk[6];
    rk7Init = rk[7];

    u32 pt0Init, pt1Init, pt2Init, pt3Init;
    pt0Init = pt[0];
    pt1Init = pt[1];
    pt2Init = pt[2];
    pt3Init = pt[3];

    // Move the low 64 bits of the key (rk6:rk7) to this thread's first
    // candidate with a single 64-bit addition, so the carry out of rk7
    // reaches rk6. The previous "/ MAX_U32" and "% MAX_U32" split divided by
    // 2^32 - 1 instead of 2^32 and dropped that carry, which made thread
    // ranges overlap or leave gaps once the offset reached 2^32 - 1.
    u32 threadRange = *range;
    u64 keyLow = ((u64)rk6Init << 32) | rk7Init;
    keyLow += (u64)threadIndex * threadRange;
    rk6Init = (u32)(keyLow >> 32);
    rk7Init = (u32)(keyLow & 0xFFFFFFFF);

    for (u32 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        // Working copy of the candidate key; updated in place by the
        // on-the-fly key schedule below.
        u32 rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7;
        rk0 = rk0Init;
        rk1 = rk1Init;
        rk2 = rk2Init;
        rk3 = rk3Init;
        rk4 = rk4Init;
        rk5 = rk5Init;
        rk6 = rk6Init;
        rk7 = rk7Init;

        // Create plaintext as 32 bit unsigned integers
        u32 s0, s1, s2, s3;
        s0 = pt0Init;
        s1 = pt1Init;
        s2 = pt2Init;
        s3 = pt3Init;

        // First round just XORs input with key.
        s0 = s0 ^ rk0;
        s1 = s1 ^ rk1;
        s2 = s2 ^ rk2;
        s3 = s3 ^ rk3;

        u32 t0, t1, t2, t3;
        u8 rconIndex = 0;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1_256; roundCount++) {
            // Table based round function (single T0 table; the other three
            // lookups go through arithmeticRightShiftBytePerm, defined elsewhere).
            t0 = t0S[s0 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);
            t1 = t0S[s1 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);
            t2 = t0S[s2 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);
            t3 = t0S[s3 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);

            // Add round key
            if (roundCount % 2 == 0) {
                // Even iterations use the second half (rk4..rk7) of the
                // current 8-word key-schedule state.
                t0 = t0 ^ rk4;
                t1 = t1 ^ rk5;
                t2 = t2 ^ rk6;
                t3 = t3 ^ rk7;
            } else {
                // Odd iterations advance the key schedule by one full 8-word
                // step and use the new first half (rk0..rk3).
                u32 temp = rk7;
                rk0 = rk0 ^
                    (t4S[(temp >> 16) & 0xff][warpThreadIndexSBox] & 0xff000000) ^
                    (t4S[(temp >> 8) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^
                    (t4S[(temp) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^
                    (t4S[(temp >> 24)][warpThreadIndexSBox] & 0x000000ff) ^
                    rconS[rconIndex++];
                rk1 = rk1 ^ rk0;
                rk2 = rk2 ^ rk1;
                rk3 = rk3 ^ rk2;
                rk4 = rk4 ^
                    (t4S[(rk3 >> 24) & 0xff][warpThreadIndexSBox] & 0xff000000) ^
                    (t4S[(rk3 >> 16) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^
                    (t4S[(rk3 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^
                    (t4S[(rk3) & 0xff][warpThreadIndexSBox] & 0x000000ff);
                rk5 = rk5 ^ rk4;
                rk6 = rk6 ^ rk5;
                rk7 = rk7 ^ rk6;

                t0 = t0 ^ rk0;
                t1 = t1 ^ rk1;
                t2 = t2 ^ rk2;
                t3 = t3 ^ rk3;
            }

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;
        }

        // Calculate the last round key. Only rk0 is derived up front; rk1..rk3
        // are derived lazily below, each only if the previous ciphertext word
        // already matched.
        u32 temp = rk7;
        rk0 = rk0 ^
            (t4S[(temp >> 16) & 0xff][warpThreadIndexSBox] & 0xff000000) ^
            (t4S[(temp >> 8) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^
            (t4S[(temp) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^
            (t4S[(temp >> 24)][warpThreadIndexSBox] & 0x000000ff) ^
            rconS[rconIndex++];

        // Last round uses s-box directly and XORs to produce output.
        s0 = (t4S[t0 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(t1 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(t2 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[(t3) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rk0;
        if (s0 == ctS[0]) {
            rk1 = rk1 ^ rk0;
            s1 = (t4S[t1 >> 24][warpThreadIndexSBox] & 0xFF000000)
                ^ (t4S[(t2 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
                ^ (t4S[(t3 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
                ^ (t4S[(t0) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
                ^ rk1;
            if (s1 == ctS[1]) {
                rk2 = rk2 ^ rk1;
                s2 = (t4S[t2 >> 24][warpThreadIndexSBox] & 0xFF000000)
                    ^ (t4S[(t3 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
                    ^ (t4S[(t0 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
                    ^ (t4S[(t1) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
                    ^ rk2;
                if (s2 == ctS[2]) {
                    rk3 = rk2 ^ rk3;
                    s3 = (t4S[t3 >> 24][warpThreadIndexSBox] & 0xFF000000)
                        ^ (t4S[(t0 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
                        ^ (t4S[(t1 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
                        ^ (t4S[(t2) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
                        ^ rk3;
                    if (s3 == ctS[3]) {
                        printf("! Found key : %08x %08x %08x %08x %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init, rk4Init, rk5Init, rk6Init, rk7Init);
                        printf("-------------------------------\n");
                    }
                }
            }
        }

        // Next candidate key: increment rk7 and carry into rk6 on wrap-around.
        if (rk7Init == MAX_U32) {
            rk6Init++;
        }
        rk7Init++;
    }
}

// Host driver for the AES-256 exhaustive-search benchmark: places the known
// plaintext/ciphertext pair, the starting key, the round constants and the
// two tables the kernel reads in managed memory, launches the kernel once and
// prints the elapsed wall-clock time.
// The formerly allocated but never used t1..t3 and t4_0..t4_3 copies are gone.
// Always returns 0.
__host__ int main256ExhaustiveSearch() {
    printf("\n");
    printf("########## AES-256 Exhaustive Search Implementation ##########\n");
    printf("\n");

    // Allocate plaintext, ciphertext and initial round key
    u32 *pt, *ct, *rk256;
    gpuErrorCheck(cudaMallocManaged(&pt, 4 * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&ct, 4 * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&rk256, 8 * sizeof(u32)));

    pt[0] = 0x6bc1bee2U;
    pt[1] = 0x2e409f96U;
    pt[2] = 0xe93d7e11U;
    pt[3] = 0x7393172aU;

    ct[0] = 0xF3EED1BDU;
    ct[1] = 0xB5D2A03CU;
    ct[2] = 0x064B5A7EU;
    ct[3] = 0x3DB181F8U;

    rk256[0] = 0x603deb10U;
    rk256[1] = 0x15ca71beU;
    rk256[2] = 0x2b73aef0U;
    rk256[3] = 0x857d7781U;
    rk256[4] = 0x1f352c07U;
    rk256[5] = 0x3b6108d7U;
    rk256[6] = 0x2d9810a3U;
    rk256[7] = 0x0914dff4U;

    // Allocate RCON values
    u32* rcon;
    gpuErrorCheck(cudaMallocManaged(&rcon, RCON_SIZE * sizeof(u32)));
    for (int i = 0; i < RCON_SIZE; i++) {
        rcon[i] = RCON32[i];
    }

    // Allocate the two tables the kernel reads
    u32 *t0, *t4;
    gpuErrorCheck(cudaMallocManaged(&t0, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t4, TABLE_SIZE * sizeof(u32)));
    for (int i = 0; i < TABLE_SIZE; i++) {
        t0[i] = T0[i];
        t4[i] = T4[i];
    }

    printf("-------------------------------\n");
    u64* range = calculateRange();
    /*	printf("Plaintext : %08x %08x %08x %08x\n", pt[0], pt[1], pt[2], pt[3]);
        printf("Ciphertext : %08x %08x %08x %08x\n", ct[0], ct[1], ct[2], ct[3]);
        printf("Initial Key : %08x %08x %08x %08x %08x %08x %08x %08x\n", rk256[0], rk256[1], rk256[2], rk256[3], rk256[4], rk256[5], rk256[6], rk256[7]);
        printf("-------------------------------\n");*/

    clock_t beginTime = clock();
    // Kernels
    // NOTE(review): the launch configuration was garbled to "<<>>>" in the
    // copy under review. BLOCKS and THREADS are assumed to be the launch
    // constants from AES_final.h — confirm against upstream.
    exhaustiveSearch256WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<<BLOCKS, THREADS>>>(pt, ct, rk256, t0, t4, rcon, range);

    cudaDeviceSynchronize();
    printf("Time elapsed: %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);
    printf("-------------------------------\n");
    printLastCUDAError();

    // Free allocated arrays
    cudaFree(range);
    cudaFree(pt);
    cudaFree(ct);
    cudaFree(rk256);
    cudaFree(t0);
    cudaFree(t4);
    cudaFree(rcon);

    return 0;
}

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2020] [Cihangir Tezcan] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /192-es.cuh: -------------------------------------------------------------------------------- 1 | // System includes 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // CUDA runtime 8 | #include 9 | 10 | // Helper functions and utilities to work with CUDA 11 | //#include 12 | //#include 13 | 14 | #include 15 | //#include 16 | 17 | // Custom header 18 | //#include "kernel.h" 19 | 20 | 21 | 
// Exhaustive search with one table extended as 32 columns
// 1 Table [256][32] -> arithmetic shift: __byte_perm function
// SBox[256] is partly expanded
//
// AES-192 key search. Every thread encrypts the plaintext block pt[0..3]
// under `threadRange` consecutive candidate keys and prints any key whose
// ciphertext equals ct[0..3].
//
//   pt, ct  : one 4-word plaintext block and the 4-word ciphertext to match.
//   rk      : 6-word starting key. Words 0..3 are the same for every
//             candidate; words 4 and 5 are treated as one 64-bit counter
//             (rk[4] high, rk[5] low).
//   t0G     : TABLE_SIZE-entry round table; the other three round tables are
//             derived from it with arithmeticRightShiftBytePerm.
//   t4G     : TABLE_SIZE-entry table used for the S-box lookups of the key
//             schedule and of the final round.
//   rconG   : RCON_SIZE round constants.
//   range   : *range is the number of keys per thread. It is narrowed to
//             32 bits, as before.
//
// Thread i starts at (rk[4]:rk[5]) + i * threadRange.
//
// The previous code split the thread offset with `/ MAX_U32` and `% MAX_U32`.
// MAX_U32 is the largest 32-bit value (see the carry test at the bottom of the
// loop), so that is the wrong radix, and the low-word addition dropped its
// carry. The offset is now added with 64-bit arithmetic, the same way the CTR
// kernels compute their per-thread counter.
//
// The shared tables are filled with block-strided loops, so the kernel no
// longer depends on blockDim.x being at least TABLE_SIZE.
__global__ void exhaustiveSearch192WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range) {

    const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const int warpThreadIndex = threadIdx.x & 31;
    const int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE;

    // Per-block copies of the lookup data. Each table row is replicated across
    // the second dimension and every thread reads its own column.
    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
    __shared__ u32 rconS[RCON_SIZE];
    __shared__ u32 ctS[U32_SIZE];

    for (u32 row = threadIdx.x; row < TABLE_SIZE; row += blockDim.x) {
        const u32 roundEntry = t0G[row];
        for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[row][bankIndex] = roundEntry;
        }

        const u32 sboxEntry = t4G[row];
        for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) {
            t4S[row][bankIndex] = sboxEntry;
        }
    }
    for (u32 i = threadIdx.x; i < RCON_SIZE; i += blockDim.x) {
        rconS[i] = rconG[i];
    }
    for (u32 i = threadIdx.x; i < U32_SIZE; i += blockDim.x) {
        ctS[i] = ct[i];
    }

    // Every thread must see the fully populated shared tables.
    __syncthreads();

    const u32 rk0Init = rk[0];
    const u32 rk1Init = rk[1];
    const u32 rk2Init = rk[2];
    const u32 rk3Init = rk[3];
    u32 rk4Init = rk[4];
    u32 rk5Init = rk[5];

    const u32 pt0Init = pt[0];
    const u32 pt1Init = pt[1];
    const u32 pt2Init = pt[2];
    const u32 pt3Init = pt[3];

    // Move this thread to the start of its slice of the key space. Words 4 and
    // 5 form a 64-bit counter, so the carry out of the low word reaches rk4Init.
    const u32 threadRange = *range;
    u64 keyCounter = ((u64)rk4Init << 32) | (u64)rk5Init;
    keyCounter += (u64)threadIndex * threadRange;
    rk4Init = (u32)(keyCounter >> 32);
    rk5Init = (u32)(keyCounter & 0xFFFFFFFFULL);

    for (u32 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        // Working copy of the key words; the schedule is expanded on the fly.
        u32 rk0 = rk0Init;
        u32 rk1 = rk1Init;
        u32 rk2 = rk2Init;
        u32 rk3 = rk3Init;
        u32 rk4 = rk4Init;
        u32 rk5 = rk5Init;

        // First round just XORs input with key.
        u32 s0 = pt0Init ^ rk0;
        u32 s1 = pt1Init ^ rk1;
        u32 s2 = pt2Init ^ rk2;
        u32 s3 = pt3Init ^ rk3;

        u32 t0, t1, t2, t3;
        u8 rconIndex = 0;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1_192; roundCount++) {
            // Table based round function
            t0 = t0S[s0 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);
            t1 = t0S[s1 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);
            t2 = t0S[s2 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);
            t3 = t0S[s3 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);

            // Add round key. The six key words supply one and a half 4-word
            // round keys, so the pattern repeats every three rounds.
            if (roundCount % 3 == 0) {
                // Remaining two words of the current key state ...
                t0 = t0 ^ rk4;
                t1 = t1 ^ rk5;
                // ... then advance the key state and take its first two words.
                u32 temp = rk5;
                rk0 = rk0 ^
                    (t4S[(temp >> 16) & 0xff][warpThreadIndexSBox] & 0xff000000) ^
                    (t4S[(temp >> 8) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^
                    (t4S[(temp) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^
                    (t4S[(temp >> 24)][warpThreadIndexSBox] & 0x000000ff) ^
                    rconS[rconIndex++];
                rk1 = rk1 ^ rk0;
                rk2 = rk2 ^ rk1;
                rk3 = rk3 ^ rk2;
                rk4 = rk4 ^ rk3;
                rk5 = rk5 ^ rk4;

                t2 = t2 ^ rk0;
                t3 = t3 ^ rk1;
            } else if (roundCount % 3 == 1) {
                // Last four words of the current key state; no update needed.
                t0 = t0 ^ rk2;
                t1 = t1 ^ rk3;
                t2 = t2 ^ rk4;
                t3 = t3 ^ rk5;
            } else {
                // Advance the key state and use its first four words.
                u32 temp = rk5;
                rk0 = rk0 ^
                    (t4S[(temp >> 16) & 0xff][warpThreadIndexSBox] & 0xff000000) ^
                    (t4S[(temp >> 8) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^
                    (t4S[(temp) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^
                    (t4S[(temp >> 24)][warpThreadIndexSBox] & 0x000000ff) ^
                    rconS[rconIndex++];
                rk1 = rk1 ^ rk0;
                rk2 = rk2 ^ rk1;
                rk3 = rk3 ^ rk2;
                rk4 = rk4 ^ rk3;
                rk5 = rk5 ^ rk4;

                t0 = t0 ^ rk0;
                t1 = t1 ^ rk1;
                t2 = t2 ^ rk2;
                t3 = t3 ^ rk3;
            }

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;
        }

        // Calculate the last round key. Only its first word is produced here;
        // the remaining words are derived below, and only if the preceding
        // ciphertext word already matched.
        u32 temp = rk5;
        rk0 = rk0 ^
            (t4S[(temp >> 16) & 0xff][warpThreadIndexSBox] & 0xff000000) ^
            (t4S[(temp >> 8) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^
            (t4S[(temp) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^
            (t4S[(temp >> 24)][warpThreadIndexSBox] & 0x000000ff) ^
            rconS[rconIndex];

        // Last round uses s-box directly and XORs to produce output.
        s0 = (t4S[t0 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t1 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t2 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t3) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rk0;
        if (s0 == ctS[0]) {
            rk1 = rk1 ^ rk0;
            s1 = (t4S[t1 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t2 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t3 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t0) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rk1;
            if (s1 == ctS[1]) {
                rk2 = rk2 ^ rk1;
                s2 = (t4S[t2 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t3 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t0 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t1) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rk2;
                if (s2 == ctS[2]) {
                    rk3 = rk2 ^ rk3;
                    s3 = (t4S[t3 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t0 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t1 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t2) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rk3;
                    if (s3 == ctS[3]) {
                        printf("! Found key : %08x %08x %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init, rk4Init, rk5Init);
                        printf("-------------------------------\n");
                    }
                }
            }
        }

        // Step to the next candidate key, carrying out of the low word.
        if (rk5Init == MAX_U32) {
            rk4Init++;
        }
        rk5Init++;
    }
}
200 | __global__ void exhaustiveSearch192WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range, u8* SAES) { 201 | int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; 202 | int warpThreadIndex = threadIdx.x & 31; 203 | // int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE; 204 | // 205 | __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE]; 206 | // __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE]; 207 | __shared__ u8 Sbox[64][32][4]; 208 | __shared__ u32 rconS[RCON_SIZE]; 209 | __shared__ u32 ctS[U32_SIZE]; 210 | if (threadIdx.x < TABLE_SIZE) { 211 | for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { t0S[threadIdx.x][bankIndex] = t0G[threadIdx.x]; } 212 | // for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) { t4S[threadIdx.x][bankIndex] = t4G[threadIdx.x]; } 213 | for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) { Sbox[threadIdx.x / 4][bankIndex][threadIdx.x % 4] = SAES[threadIdx.x]; } 214 | if (threadIdx.x < RCON_SIZE) { rconS[threadIdx.x] = rconG[threadIdx.x]; } 215 | if (threadIdx.x < U32_SIZE) { ctS[threadIdx.x] = ct[threadIdx.x]; } 216 | } 217 | // 218 | // Wait until every thread is ready 219 | __syncthreads(); 220 | u32 rk0Init, rk1Init, rk2Init, rk3Init, rk4Init, rk5Init; 221 | rk0Init = rk[0]; rk1Init = rk[1]; rk2Init = rk[2]; rk3Init = rk[3]; rk4Init = rk[4]; rk5Init = rk[5]; 222 | u32 pt0Init, pt1Init, pt2Init, pt3Init; 223 | pt0Init = pt[0]; pt1Init = pt[1]; pt2Init = pt[2]; pt3Init = pt[3]; 224 | u32 threadRange = *range; 225 | u64 threadRangeStart = 
(u64)threadIndex * threadRange; 226 | rk4Init = rk4Init + threadRangeStart / MAX_U32; 227 | rk5Init = rk5Init + threadRangeStart % MAX_U32; 228 | for (u32 rangeCount = 0; rangeCount < threadRange; rangeCount++) { 229 | // Calculate round keys 230 | u32 rk0, rk1, rk2, rk3, rk4, rk5; 231 | rk0 = rk0Init; rk1 = rk1Init; rk2 = rk2Init; rk3 = rk3Init; rk4 = rk4Init; rk5 = rk5Init; 232 | // Create plaintext as 32 bit unsigned integers 233 | u32 s0, s1, s2, s3; 234 | s0 = pt0Init; s1 = pt1Init; s2 = pt2Init; s3 = pt3Init; 235 | // First round just XORs input with key. 236 | s0 = s0 ^ rk0; s1 = s1 ^ rk1; s2 = s2 ^ rk2; s3 = s3 ^ rk3; 237 | u32 t0, t1, t2, t3; 238 | u8 rconIndex = 0; 239 | for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1_192; roundCount++) { 240 | // Table based round function 241 | t0 = t0S[s0 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT); 242 | t1 = t0S[s1 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT); 243 | t2 = t0S[s2 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT); 244 | t3 = t0S[s3 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT); 245 | // Add 
round key 246 | if (roundCount % 3 == 0) { 247 | t0 = t0 ^ rk4; t1 = t1 ^ rk5; 248 | // Calculate round key 249 | u32 temp = rk5; 250 | rk0 = rk0 ^ 251 | arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16) & 0xff) % 4], SHIFT_1_RIGHT) ^ 252 | arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8) & 0xff) % 4], SHIFT_2_RIGHT) ^ 253 | arithmeticRightShiftBytePerm((u64)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp) & 0xff) % 4], SHIFT_3_RIGHT) ^ 254 | ((u64)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^ 255 | rconS[rconIndex++]; 256 | rk1 = rk1 ^ rk0; rk2 = rk2 ^ rk1; rk3 = rk3 ^ rk2; rk4 = rk4 ^ rk3; rk5 = rk5 ^ rk4; 257 | t2 = t2 ^ rk0; 258 | t3 = t3 ^ rk1; 259 | } 260 | else if (roundCount % 3 == 1) { t0 = t0 ^ rk2; t1 = t1 ^ rk3; t2 = t2 ^ rk4; t3 = t3 ^ rk5; } 261 | else { 262 | // Calculate round key 263 | u32 temp = rk5; 264 | rk0 = rk0 ^ 265 | arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16) & 0xff) % 4], SHIFT_1_RIGHT) ^ 266 | arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8) & 0xff) % 4], SHIFT_2_RIGHT) ^ 267 | arithmeticRightShiftBytePerm((u64)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp) & 0xff) % 4], SHIFT_3_RIGHT) ^ 268 | ((u64)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^ 269 | rconS[rconIndex++]; 270 | rk1 = rk1 ^ rk0; rk2 = rk2 ^ rk1; rk3 = rk3 ^ rk2; rk4 = rk4 ^ rk3; rk5 = rk5 ^ rk4; 271 | t0 = t0 ^ rk0; t1 = t1 ^ rk1; t2 = t2 ^ rk2; t3 = t3 ^ rk3; 272 | } 273 | s0 = t0; s1 = t1; s2 = t2; s3 = t3; 274 | } 275 | // Calculate the last round key 276 | u32 temp = rk5; 277 | rk0 = rk0 ^ 278 | arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16) & 0xff) % 4], SHIFT_1_RIGHT) ^ 279 | arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8) & 0xff) % 4], 
SHIFT_2_RIGHT) ^ 280 | arithmeticRightShiftBytePerm((u64)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp) & 0xff) % 4], SHIFT_3_RIGHT) ^ 281 | ((u64)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^ 282 | rconS[rconIndex]; 283 | 284 | // Last round uses s-box directly and XORs to produce output. 285 | s0 = arithmeticRightShiftBytePerm((u64)Sbox[((t0 >> 24)) / 4][warpThreadIndex][((t0 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t1 >> 16) & 0xff) / 4][warpThreadIndex][((t1 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t2 >> 8) & 0xFF) / 4][warpThreadIndex][((t2 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u64)Sbox[((t3 & 0xFF) / 4)][warpThreadIndex][((t3 & 0xFF) % 4)]) ^ rk0; 286 | if (s0 == ctS[0]) { 287 | rk1 = rk1 ^ rk0; 288 | s1 = arithmeticRightShiftBytePerm((u64)Sbox[((t1 >> 24)) / 4][warpThreadIndex][((t1 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t2 >> 16) & 0xff) / 4][warpThreadIndex][((t2 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t3 >> 8) & 0xFF) / 4][warpThreadIndex][((t3 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u64)Sbox[((t0 & 0xFF) / 4)][warpThreadIndex][((t0 & 0xFF) % 4)]) ^ rk1; 289 | if (s1 == ctS[1]) { 290 | rk2 = rk2 ^ rk1; 291 | s2 = arithmeticRightShiftBytePerm((u64)Sbox[((t2 >> 24)) / 4][warpThreadIndex][((t2 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t3 >> 16) & 0xff) / 4][warpThreadIndex][((t3 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t0 >> 8) & 0xFF) / 4][warpThreadIndex][((t0 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u64)Sbox[((t1 & 0xFF) / 4)][warpThreadIndex][((t1 & 0xFF) % 4)]) ^ rk2; 292 | if (s2 == ctS[2]) { 293 | rk3 = rk2 ^ rk3; 294 | s3 = arithmeticRightShiftBytePerm((u64)Sbox[((t3 >> 24)) / 4][warpThreadIndex][((t3 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t0 >> 16) & 0xff) / 4][warpThreadIndex][((t0 >> 16)) % 4], SHIFT_2_RIGHT) ^ 
arithmeticRightShiftBytePerm((u64)Sbox[((t1 >> 8) & 0xFF) / 4][warpThreadIndex][((t1 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u64)Sbox[((t2 & 0xFF) / 4)][warpThreadIndex][((t2 & 0xFF) % 4)]) ^ rk3; 295 | if (s3 == ctS[3]) { 296 | printf("! Found key : %08x %08x %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init, rk4Init, rk5Init); 297 | printf("-------------------------------\n"); 298 | } 299 | } 300 | } 301 | } 302 | if (rk5Init == MAX_U32) { rk4Init++; }// Overflow 303 | rk5Init++; // Create key as 32 bit unsigned integers 304 | } 305 | } 306 | 
// Host driver for the AES-192 exhaustive-search benchmark.
//
// Allocates managed buffers for one plaintext block, the expected ciphertext
// block and the 6-word starting key, copies the host-side RCON32, T-tables and
// byte S-box (SAES) into managed memory, launches the search kernel, waits for
// it, prints the elapsed clock() time and the last CUDA error, and releases
// the buffers. Always returns 0.
//
// Fix: SAES_d was allocated with cudaMallocManaged but never released, so
// every call leaked it; it is now freed together with the other buffers.
__host__ int main192ExhaustiveSearch() {
    printf("\n"); printf("########## AES-192 Exhaustive Search Implementation ##########\n"); printf("\n");

    // Allocate plaintext, ciphertext and initial round key
    u32 *pt, *ct, *rk192;
    gpuErrorCheck(cudaMallocManaged(&pt, 4 * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&ct, 4 * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&rk192, 6 * sizeof(u32)));
    pt[0] = 0x6bc1bee2U; pt[1] = 0x2e409f96U; pt[2] = 0xe93d7e11U; pt[3] = 0x7393172aU;
    ct[0] = 0xBD334F1DU; ct[1] = 0x6E45F25FU; ct[2] = 0xF712A214U; ct[3] = 0x571FA5CCU;
    rk192[0] = 0x8e73b0f7U; rk192[1] = 0xda0e6452U; rk192[2] = 0xc810f32bU;
    rk192[3] = 0x809079e5U; rk192[4] = 0x62f8ead2U; rk192[5] = 0x522c6b70U;

    // Allocate RCON values
    u32* rcon;
    gpuErrorCheck(cudaMallocManaged(&rcon, RCON_SIZE * sizeof(u32)));
    for (int i = 0; i < RCON_SIZE; i++) { rcon[i] = RCON32[i]; }

    // Allocate tables. Only t0 and t4 are passed to the kernel launched below;
    // SAES_d is consumed only by the alternative kernel in the disabled launch.
    u32 *t0, *t1, *t2, *t3, *t4, *t4_0, *t4_1, *t4_2, *t4_3;
    u8* SAES_d;
    gpuErrorCheck(cudaMallocManaged(&t0, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t1, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t2, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t3, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t4, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t4_0, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t4_1, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t4_2, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t4_3, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&SAES_d, 256 * sizeof(u8)));
    for (int i = 0; i < TABLE_SIZE; i++) {
        t0[i] = T0[i]; t1[i] = T1[i]; t2[i] = T2[i]; t3[i] = T3[i]; t4[i] = T4[i];
        t4_0[i] = T4_0[i]; t4_1[i] = T4_1[i]; t4_2[i] = T4_2[i]; t4_3[i] = T4_3[i];
    }
    for (int i = 0; i < 256; i++) SAES_d[i] = SAES[i];

    printf("-------------------------------\n");
    u64* range = calculateRange();
    /* printf("Plaintext : %08x %08x %08x %08x\n", pt[0], pt[1], pt[2], pt[3]);
    printf("Ciphertext : %08x %08x %08x %08x\n", ct[0], ct[1], ct[2], ct[3]);
    printf("Initial Key : %08x %08x %08x %08x %08x %08x\n", rk192[0], rk192[1], rk192[2], rk192[3], rk192[4], rk192[5]);
    printf("-------------------------------\n");*/

    clock_t beginTime = clock();
    // Kernels
    // NOTE(review): the execution configuration between the chevrons is empty
    // in this copy of the file (it looks like it was stripped during
    // extraction); restore the grid/block dimensions before building.
    exhaustiveSearch192WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<>>(pt, ct, rk192, t0, t4, rcon, range);
    // exhaustiveSearch192WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir << > > (pt, ct, rk192, t0, t4, rcon, range, SAES_d);

    // The launch is asynchronous; wait so the timing covers the whole kernel.
    cudaDeviceSynchronize();
    printf("Time elapsed: %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);
    printf("-------------------------------\n");
    printLastCUDAError();

    // Free allocated arrays
    cudaFree(range); cudaFree(pt); cudaFree(ct); cudaFree(rk192);
    cudaFree(t0); cudaFree(t1); cudaFree(t2); cudaFree(t3); cudaFree(t4);
    cudaFree(t4_0); cudaFree(t4_1); cudaFree(t4_2); cudaFree(t4_3);
    cudaFree(SAES_d);
    cudaFree(rcon);
    return 0;
}
-------------------------------------------------------------------------------- /128-ctr.cuh: 
-------------------------------------------------------------------------------- 1 | // System includes 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // CUDA runtime 8 | #include 9 | 10 | // Helper functions and utilities to work with CUDA 11 | //#include 12 | //#include 13 | 14 | #include 15 | //#include 16 | 17 | // Custom header 18 | //#include "kernel.h" 19 | 20 |
// Expands a 4-word cipher key into the full round-key schedule.
//
// key: the four 32-bit words of the cipher key (all four are read before rk is written).
// rk : output schedule; receives 4 * (ROUND_COUNT + 1) words. The first four are the
//      key itself, and every following group of four is derived from the previous group.
//
// The first word of each new group is the previous first word XORed with four table
// lookups on the bytes of the previous last word and with the round constant; the other
// three words are a running XOR chain.
// NOTE(review): T4_0..T4_3 presumably hold the S-box placed in byte lanes 0..3 - confirm
// against the table definitions.
__host__ void keyExpansion(u32* key, u32* rk) {
    // Running copy of the most recent four schedule words.
    u32 word[4] = { key[0], key[1], key[2], key[3] };
    for (int i = 0; i < 4; i++) {
        rk[i] = word[i];
    }

    u32* next = rk + 4;
    for (u8 round = 0; round < ROUND_COUNT; round++) {
        const u32 last = word[3];
        word[0] ^= T4_3[(last >> 16) & 0xff]
                 ^ T4_2[(last >> 8) & 0xff]
                 ^ T4_1[last & 0xff]
                 ^ T4_0[last >> 24]
                 ^ RCON32[round];
        // Each remaining word absorbs the freshly updated word before it.
        for (int i = 1; i < 4; i++) {
            word[i] ^= word[i - 1];
        }
        for (int i = 0; i < 4; i++) {
            *next++ = word[i];
        }
    }
}
49 | // CTR encryption with one table extended as 32 columns 50 | // 1 Table [256][32] -> arithmetic shift: __byte_perm function 51 | // SBox[256] is partly expanded 52 | __global__ void counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* rk, u32* t0G, u32* t4G, u64* range) { 53 | 54 | int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; 55 | int warpThreadIndex = threadIdx.x & 31; 56 | int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE; 57 | 58 | // 59 | __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE]; 60 | __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE]; 61 | __shared__ u32 rkS[AES_128_KEY_SIZE_INT]; 62 | 63 | if (threadIdx.x < TABLE_SIZE) { 64 | for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { 65 | t0S[threadIdx.x][bankIndex] = t0G[threadIdx.x]; 66 | } 67 | 68 | for (u8
bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) { 69 | t4S[threadIdx.x][bankIndex] = t4G[threadIdx.x]; 70 | } 71 | 72 | if (threadIdx.x < AES_128_KEY_SIZE_INT) { 73 | rkS[threadIdx.x] = rk[threadIdx.x]; 74 | } 75 | 76 | } 77 | // 78 | 79 | // Wait until every thread is ready 80 | __syncthreads(); 81 | 82 | u32 pt0Init, pt1Init, pt2Init, pt3Init; 83 | u32 s0, s1, s2, s3; 84 | pt0Init = pt[0]; 85 | pt1Init = pt[1]; 86 | pt2Init = pt[2]; 87 | pt3Init = pt[3]; 88 | 89 | u32 threadRange = *range; 90 | u64 threadRangeStart = pt2Init; 91 | threadRangeStart = threadRangeStart << 32; 92 | threadRangeStart ^= pt3Init; 93 | threadRangeStart += (u64)threadIndex * threadRange; 94 | pt2Init = threadRangeStart >> 32; 95 | pt3Init = threadRangeStart & 0xFFFFFFFF; 96 | 97 | for (u32 rangeCount = 0; rangeCount < threadRange; rangeCount++) { 98 | 99 | // Create plaintext as 32 bit unsigned integers 100 | s0 = pt0Init; 101 | s1 = pt1Init; 102 | s2 = pt2Init; 103 | s3 = pt3Init; 104 | 105 | // First round just XORs input with key. 
106 | s0 = s0 ^ rkS[0]; 107 | s1 = s1 ^ rkS[1]; 108 | s2 = s2 ^ rkS[2]; 109 | s3 = s3 ^ rkS[3]; 110 | 111 | u32 t0, t1, t2, t3; 112 | for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) { 113 | 114 | // Table based round function 115 | u32 rkStart = roundCount * 4 + 4; 116 | t0 = t0S[s0 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rkS[rkStart]; 117 | t1 = t0S[s1 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rkS[rkStart + 1]; 118 | t2 = t0S[s2 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rkS[rkStart + 2]; 119 | t3 = t0S[s3 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rkS[rkStart + 3]; 120 | 121 | s0 = t0; 122 | s1 = t1; 123 | s2 = t2; 124 | s3 = t3; 125 | 126 | } 127 | 128 | // Calculate the last round key 129 | // Last round uses s-box directly and XORs to produce output. 
130 | s0 = (t4S[t0 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t1 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t2 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t3) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rkS[40]; 131 | s1 = (t4S[t1 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t2 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t3 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t0) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rkS[41]; 132 | s2 = (t4S[t2 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t3 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t0 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t1) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rkS[42]; 133 | s3 = (t4S[t3 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t0 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t1 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t2) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rkS[43]; 134 | 135 | // Overflow 136 | if (pt3Init == MAX_U32) { 137 | pt2Init++; 138 | } 139 | 140 | pt3Init++; 141 | 142 | } 143 | 144 | if (threadIndex == 1048575) { 145 | printf("threadIndex : %d\n", threadIndex); 146 | printf("Plaintext : %08x %08x %08x %08x\n", pt0Init, pt1Init, pt2Init, pt3Init); 147 | printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3); 148 | printf("-------------------------------\n"); 149 | } 150 | /* pt[0] ^= s0; 151 | pt[1] ^= s0; 152 | pt[2] ^= s0; 153 | pt[3] ^= s0;*/ 154 | }
// AES-128 CTR benchmark kernel: one replicated T-table plus a byte S-box kept in shared
// memory for the last round.
//
// pt    : 4 words; pt[0], pt[1] are used unchanged, pt[2]:pt[3] form the 64-bit starting counter.
// rk    : expanded key schedule, AES_128_KEY_SIZE_INT words.
// t0G   : T-table, TABLE_SIZE words.
// t4G   : unused by this variant (kept so the signature matches the sibling kernels).
// range : *range is the number of consecutive counter values each thread encrypts.
// SAES  : byte S-box, one byte per table entry.
//
// Thread i starts at counter + i * (*range). Nothing is written back to global memory;
// the thread whose global index is 1048575 prints its final counter and last ciphertext.
//
// Every T-table entry is copied into SHARED_MEM_BANK_SIZE columns and each lane only reads
// its own column (threadIdx.x & 31); per the project README this layout is what removes
// shared-memory bank conflicts.
// NOTE(review): Sbox has a fixed middle dimension of 32, so this assumes
// SHARED_MEM_BANK_SIZE <= 32 and TABLE_SIZE <= 256 - confirm against the header.
__global__ void counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir(u32* pt, u32* rk, u32* t0G, u32* t4G, u64* range, u8* SAES) {
    // Widen before multiplying so large grids cannot wrap in 32 bits.
    const u64 threadIndex = (u64)blockIdx.x * blockDim.x + threadIdx.x;
    const int warpThreadIndex = threadIdx.x & 31;

    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u8 Sbox[64][32][4];
    __shared__ u32 rkS[AES_128_KEY_SIZE_INT];

    // Block-stride loads: identical to the former `threadIdx.x < TABLE_SIZE` guard when the
    // block has at least TABLE_SIZE threads, and still fills every entry when it has fewer.
    for (u32 entry = threadIdx.x; entry < TABLE_SIZE; entry += blockDim.x) {
        for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[entry][bankIndex] = t0G[entry];
            // Four consecutive S-box bytes share one 4-byte group per column.
            Sbox[entry / 4][bankIndex][entry % 4] = SAES[entry];
        }
    }
    for (u32 word = threadIdx.x; word < AES_128_KEY_SIZE_INT; word += blockDim.x) {
        rkS[word] = rk[word];
    }
    __syncthreads();

    const u32 pt0Init = pt[0];
    const u32 pt1Init = pt[1];
    u32 pt2Init = pt[2];
    u32 pt3Init = pt[3];

    // Move this thread's counter to the start of its own sub-range.
    const u64 threadRange = *range;
    u64 threadRangeStart = pt2Init;
    threadRangeStart = threadRangeStart << 32;
    threadRangeStart ^= pt3Init;
    threadRangeStart += threadIndex * threadRange;
    pt2Init = threadRangeStart >> 32;
    pt3Init = threadRangeStart & 0xFFFFFFFF;

    // Zero-initialised so the debug print below is well defined even for an empty range.
    u32 s0 = 0, s1 = 0, s2 = 0, s3 = 0;

    // 64-bit counter: a 32-bit one could never reach a range of 2^32 or more.
    for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {
        // Initial AddRoundKey on the current counter block.
        s0 = pt0Init ^ rkS[0];
        s1 = pt1Init ^ rkS[1];
        s2 = pt2Init ^ rkS[2];
        s3 = pt3Init ^ rkS[3];

        u32 t0, t1, t2, t3;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {
            // Table based round function; the byte-rotated lookups reuse the single T-table.
            const u32 rkStart = roundCount * 4 + 4;
            t0 = t0S[s0 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart];
            t1 = t0S[s1 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 1];
            t2 = t0S[s2 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 2];
            t3 = t0S[s3 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 3];
            s0 = t0; s1 = t1; s2 = t2; s3 = t3;
        }

        // Last round: byte S-box lookups moved into their byte lanes, then the final four
        // schedule words (40..43).
        s0 = arithmeticRightShiftBytePerm((u32)Sbox[((t0 >> 24)) / 4][warpThreadIndex][((t0 >> 24)) % 4], SHIFT_1_RIGHT)
            ^ arithmeticRightShiftBytePerm((u32)Sbox[((t1 >> 16) & 0xff) / 4][warpThreadIndex][((t1 >> 16)) % 4], SHIFT_2_RIGHT)
            ^ arithmeticRightShiftBytePerm((u32)Sbox[((t2 >> 8) & 0xFF) / 4][warpThreadIndex][((t2 >> 8)) % 4], SHIFT_3_RIGHT)
            ^ ((u32)Sbox[((t3 & 0xFF) / 4)][warpThreadIndex][((t3 & 0xFF) % 4)])
            ^ rkS[40];
        s1 = arithmeticRightShiftBytePerm((u32)Sbox[((t1 >> 24)) / 4][warpThreadIndex][((t1 >> 24)) % 4], SHIFT_1_RIGHT)
            ^ arithmeticRightShiftBytePerm((u32)Sbox[((t2 >> 16) & 0xff) / 4][warpThreadIndex][((t2 >> 16)) % 4], SHIFT_2_RIGHT)
            ^ arithmeticRightShiftBytePerm((u32)Sbox[((t3 >> 8) & 0xFF) / 4][warpThreadIndex][((t3 >> 8)) % 4], SHIFT_3_RIGHT)
            ^ ((u32)Sbox[((t0 & 0xFF) / 4)][warpThreadIndex][((t0 & 0xFF) % 4)])
            ^ rkS[41];
        s2 = arithmeticRightShiftBytePerm((u32)Sbox[((t2 >> 24)) / 4][warpThreadIndex][((t2 >> 24)) % 4], SHIFT_1_RIGHT)
            ^ arithmeticRightShiftBytePerm((u32)Sbox[((t3 >> 16) & 0xff) / 4][warpThreadIndex][((t3 >> 16)) % 4], SHIFT_2_RIGHT)
            ^ arithmeticRightShiftBytePerm((u32)Sbox[((t0 >> 8) & 0xFF) / 4][warpThreadIndex][((t0 >> 8)) % 4], SHIFT_3_RIGHT)
            ^ ((u32)Sbox[((t1 & 0xFF) / 4)][warpThreadIndex][((t1 & 0xFF) % 4)])
            ^ rkS[42];
        s3 = arithmeticRightShiftBytePerm((u32)Sbox[((t3 >> 24)) / 4][warpThreadIndex][((t3 >> 24)) % 4], SHIFT_1_RIGHT)
            ^ arithmeticRightShiftBytePerm((u32)Sbox[((t0 >> 16) & 0xff) / 4][warpThreadIndex][((t0 >> 16)) % 4], SHIFT_2_RIGHT)
            ^ arithmeticRightShiftBytePerm((u32)Sbox[((t1 >> 8) & 0xFF) / 4][warpThreadIndex][((t1 >> 8)) % 4], SHIFT_3_RIGHT)
            ^ ((u32)Sbox[((t2 & 0xFF) / 4)][warpThreadIndex][((t2 & 0xFF) % 4)])
            ^ rkS[43];

        // Advance the 64-bit counter held in pt2Init:pt3Init (carry into the high word).
        if (pt3Init == MAX_U32) { pt2Init++; }
        pt3Init++;
    }

    if (threadIndex == 1048575) {
        // threadIndex is 64-bit: print it with a matching conversion.
        printf("threadIndex : %llu\n", (unsigned long long)threadIndex);
        printf("Plaintext : %08x %08x %08x %08x\n", pt0Init, pt1Init, pt2Init, pt3Init);
        printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3);
        printf("-------------------------------\n");
    }
}
218 | __global__ void counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir2(u32* pt, u32* rk, u32* t0G, u32* t4G, u64* range, u8* SAES) { 219 | u64 threadIndex = blockIdx.x * blockDim.x + threadIdx.x; 220 | int warpThreadIndex = threadIdx.x & 31; 221 | __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE]; 222 | __shared__ u8 Sbox[64][32][4]; 223 | // __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE]; 224 | __shared__ u32 rkS[AES_128_KEY_SIZE_INT]; 225 | if (threadIdx.x < TABLE_SIZE) { 226 | for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { 227 | t0S[threadIdx.x][bankIndex] = t0G[threadIdx.x]; 228 | Sbox[threadIdx.x / 4][bankIndex][threadIdx.x % 4] = SAES[threadIdx.x]; 229 | } 230 | // for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) { t4S[threadIdx.x][bankIndex] = t4G[threadIdx.x]; } 231 | //
for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { Sbox[threadIdx.x / 4][bankIndex][threadIdx.x % 4] = SAES[threadIdx.x]; } 232 | if (threadIdx.x < AES_128_KEY_SIZE_INT) { rkS[threadIdx.x] = rk[threadIdx.x]; } 233 | } 234 | __syncthreads(); 235 | u32 pt0Init, pt1Init, pt2Init, pt3Init; 236 | u32 s0, s1, s2, s3; 237 | pt0Init = pt[0]; pt1Init = pt[1]; pt2Init = pt[2]; pt3Init = pt[3]; 238 | u64 threadRange = *range; 239 | u64 threadRangeStart = pt2Init; 240 | threadRangeStart = threadRangeStart << 32; 241 | threadRangeStart ^= pt3Init; 242 | threadRangeStart += threadIndex * threadRange; 243 | pt2Init = threadRangeStart >> 32; 244 | pt3Init = threadRangeStart & 0xFFFFFFFF; 245 | for (u32 rangeCount = 0; rangeCount < threadRange; rangeCount++) { 246 | // Create plaintext as 32 bit unsigned integers 247 | s0 = pt0Init; s1 = pt1Init; s2 = pt2Init; s3 = pt3Init; 248 | // First round just XORs input with key. 249 | s0 = s0 ^ rkS[0]; s1 = s1 ^ rkS[1]; s2 = s2 ^ rkS[2]; s3 = s3 ^ rkS[3]; 250 | u32 t0, t1, t2, t3; 251 | for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) { 252 | // Table based round function 253 | u32 rkStart = roundCount * 4 + 4; 254 | t0 = t0S[s0 >> 24][warpThreadIndex] ^ arithmeticRightShift(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s3 & 0xFF][warpThreadIndex], 24) ^ rkS[rkStart]; 255 | t1 = t0S[s1 >> 24][warpThreadIndex] ^ arithmeticRightShift(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s0 & 0xFF][warpThreadIndex], 24) ^ rkS[rkStart + 1]; 256 | t2 = t0S[s2 >> 24][warpThreadIndex] ^ arithmeticRightShift(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s1 & 0xFF][warpThreadIndex], 24) ^ rkS[rkStart + 2]; 257 | t3 = t0S[s3 >> 
24][warpThreadIndex] ^ arithmeticRightShift(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s2 & 0xFF][warpThreadIndex], 24) ^ rkS[rkStart + 3]; 258 | s0 = t0; s1 = t1; s2 = t2; s3 = t3; 259 | } 260 | // Calculate the last round key 261 | // Last round uses s-box directly and XORs to produce output. 262 | /* s0 = (t4S[t0 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t1 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t2 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t3) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rkS[40]; 263 | s1 = (t4S[t1 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t2 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t3 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t0) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rkS[41]; 264 | s2 = (t4S[t2 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t3 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t0 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t1) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rkS[42]; 265 | s3 = (t4S[t3 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t0 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t1 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t2) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rkS[43];*/ 266 | s0 = arithmeticRightShift((u64)Sbox[((t0 >> 24)) / 4][warpThreadIndex][((t0 >> 24)) % 4], 8) ^ arithmeticRightShift((u64)Sbox[((t1 >> 16) & 0xff) / 4][warpThreadIndex][((t1 >> 16)) % 4], 16) ^ arithmeticRightShift((u64)Sbox[((t2 >> 8) & 0xFF) / 4][warpThreadIndex][((t2 >> 8)) % 4], 24) ^ ((u64)Sbox[((t3 & 0xFF) / 4)][warpThreadIndex][((t3 & 0xFF) % 4)]) ^ rkS[40]; 267 | s1 = arithmeticRightShift((u64)Sbox[((t1 >> 24)) / 4][warpThreadIndex][((t1 >> 24)) % 4], 8) ^ arithmeticRightShift((u64)Sbox[((t2 >> 16) & 0xff) / 4][warpThreadIndex][((t2 >> 16)) % 4], 16) ^ arithmeticRightShift((u64)Sbox[((t3 >> 
8) & 0xFF) / 4][warpThreadIndex][((t3 >> 8)) % 4], 24) ^ ((u64)Sbox[((t0 & 0xFF) / 4)][warpThreadIndex][((t0 & 0xFF) % 4)]) ^ rkS[41]; 268 | s2 = arithmeticRightShift((u64)Sbox[((t2 >> 24)) / 4][warpThreadIndex][((t2 >> 24)) % 4], 8) ^ arithmeticRightShift((u64)Sbox[((t3 >> 16) & 0xff) / 4][warpThreadIndex][((t3 >> 16)) % 4], 16) ^ arithmeticRightShift((u64)Sbox[((t0 >> 8) & 0xFF) / 4][warpThreadIndex][((t0 >> 8)) % 4], 24) ^ ((u64)Sbox[((t1 & 0xFF) / 4)][warpThreadIndex][((t1 & 0xFF) % 4)]) ^ rkS[42]; 269 | s3 = arithmeticRightShift((u64)Sbox[((t3 >> 24)) / 4][warpThreadIndex][((t3 >> 24)) % 4], 8) ^ arithmeticRightShift((u64)Sbox[((t0 >> 16) & 0xff) / 4][warpThreadIndex][((t0 >> 16)) % 4], 16) ^ arithmeticRightShift((u64)Sbox[((t1 >> 8) & 0xFF) / 4][warpThreadIndex][((t1 >> 8)) % 4], 24) ^ ((u64)Sbox[((t2 & 0xFF) / 4)][warpThreadIndex][((t2 & 0xFF) % 4)]) ^ rkS[43]; 270 | // Overflow 271 | if (pt3Init == MAX_U32) { pt2Init++; } 272 | pt3Init++; 273 | } 274 | if (threadIndex == 1048575) { 275 | printf("threadIndex : %I64d\n", threadIndex); 276 | printf("Plaintext : %08x %08x %08x %08x\n", pt0Init, pt1Init, pt2Init, pt3Init); 277 | printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3); 278 | printf("-------------------------------\n"); 279 | } 280 | }
// CTR encryption with one table extended as 32 columns
// 1 Table [256][32] -> arithmetic shift: __byte_perm function
// 4 S-box, each shifted
//
// pt         : 4 words; pt[0], pt[1] are used unchanged, pt[2]:pt[3] form the 64-bit starting counter.
// rk         : expanded key schedule, AES_128_KEY_SIZE_INT words.
// t0G        : T-table, TABLE_SIZE words, replicated per column in shared memory.
// t4_0G..t4_3G : last-round tables, TABLE_SIZE words each, copied once (not replicated).
// range      : *range is the number of consecutive counter values each thread encrypts.
//
// Thread i starts at counter + i * (*range). Nothing is written back to global memory;
// the thread whose global index is 1048575 prints its final counter and last ciphertext.
// NOTE(review): the t4_k tables presumably hold the S-box pre-shifted into byte lane k -
// confirm against the table definitions.
__global__ void counterWithOneTableExtendedSharedMemoryBytePerm4ShiftedSbox(u32* pt, u32* rk, u32* t0G, u32* t4_0G, u32* t4_1G, u32* t4_2G, u32* t4_3G, u64* range) {

    const u32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const int warpThreadIndex = threadIdx.x & 31;

    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4_0S[TABLE_SIZE];
    __shared__ u32 t4_1S[TABLE_SIZE];
    __shared__ u32 t4_2S[TABLE_SIZE];
    __shared__ u32 t4_3S[TABLE_SIZE];
    __shared__ u32 rkS[AES_128_KEY_SIZE_INT];

    // Block-stride loads: identical to the former `threadIdx.x < TABLE_SIZE` guard when the
    // block has at least TABLE_SIZE threads, and still fills every entry when it has fewer.
    for (u32 entry = threadIdx.x; entry < TABLE_SIZE; entry += blockDim.x) {
        t4_0S[entry] = t4_0G[entry];
        t4_1S[entry] = t4_1G[entry];
        t4_2S[entry] = t4_2G[entry];
        t4_3S[entry] = t4_3G[entry];
        for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[entry][bankIndex] = t0G[entry];
        }
    }
    for (u32 word = threadIdx.x; word < AES_128_KEY_SIZE_INT; word += blockDim.x) {
        rkS[word] = rk[word];
    }

    // Wait until every thread is ready
    __syncthreads();

    const u32 pt0Init = pt[0];
    const u32 pt1Init = pt[1];
    u32 pt2Init = pt[2];
    u32 pt3Init = pt[3];

    // Keep the full 64-bit range; it used to be truncated to 32 bits on load.
    const u64 threadRange = *range;
    u64 threadRangeStart = pt2Init;
    threadRangeStart = threadRangeStart << 32;
    threadRangeStart ^= pt3Init;
    threadRangeStart += (u64)threadIndex * threadRange;
    pt2Init = threadRangeStart >> 32;
    pt3Init = threadRangeStart & 0xFFFFFFFF;

    // Zero-initialised so the debug print below is well defined even for an empty range.
    u32 s0 = 0, s1 = 0, s2 = 0, s3 = 0;

    for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        // Initial AddRoundKey on the current counter block.
        s0 = pt0Init ^ rkS[0];
        s1 = pt1Init ^ rkS[1];
        s2 = pt2Init ^ rkS[2];
        s3 = pt3Init ^ rkS[3];

        u32 t0, t1, t2, t3;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {

            // Table based round function
            const u32 rkStart = roundCount * 4 + 4;
            t0 = t0S[s0 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart];
            t1 = t0S[s1 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 1];
            t2 = t0S[s2 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 2];
            t3 = t0S[s3 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 3];

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;
        }

        // Last round: one lookup per byte in the four last-round tables, then the final
        // four schedule words (40..43).
        s0 = t4_3S[t0 >> 24] ^ t4_2S[(t1 >> 16) & 0xff] ^ t4_1S[(t2 >> 8) & 0xff] ^ t4_0S[(t3) & 0xFF] ^ rkS[40];
        s1 = t4_3S[t1 >> 24] ^ t4_2S[(t2 >> 16) & 0xff] ^ t4_1S[(t3 >> 8) & 0xff] ^ t4_0S[(t0) & 0xFF] ^ rkS[41];
        s2 = t4_3S[t2 >> 24] ^ t4_2S[(t3 >> 16) & 0xff] ^ t4_1S[(t0 >> 8) & 0xff] ^ t4_0S[(t1) & 0xFF] ^ rkS[42];
        s3 = t4_3S[t3 >> 24] ^ t4_2S[(t0 >> 16) & 0xff] ^ t4_1S[(t1 >> 8) & 0xff] ^ t4_0S[(t2) & 0xFF] ^ rkS[43];

        // Advance the 64-bit counter held in pt2Init:pt3Init (carry into the high word).
        if (pt3Init == MAX_U32) {
            pt2Init++;
        }
        pt3Init++;
    }

    if (threadIndex == 1048575) {
        printf("Plaintext : %08x %08x %08x %08x\n", pt0Init, pt1Init, pt2Init, pt3Init);
        printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3);
        printf("-------------------------------\n");
    }
}
385 | __host__ int main128Ctr() { 386 | printf("\n"); 387 | printf("########## AES-128 Counter Mode Implementation ##########\n"); 388 | printf("\n"); 389 | 390 | // Allocate plaintext and every round key 391 | u32 *pt, *rk, *roundKeys; 392 | gpuErrorCheck(cudaMallocManaged(&pt, 4 * sizeof(u32))); 393 | gpuErrorCheck(cudaMallocManaged(&rk, 4 * sizeof(u32))); 394 | gpuErrorCheck(cudaMallocManaged(&roundKeys, AES_128_KEY_SIZE_INT * sizeof(u32))); 395 | 396 | pt[0] = 0x3243F6A8U; 397 | pt[1] = 0x885A308DU; 398 | pt[2] = 0x313198A2U; 399 | pt[3] = 0x00000000U; 400 | 401 | rk[0] = 0x2B7E1516U; 402 | rk[1] = 0x28AED2A6U; 403 | rk[2] = 0xABF71588U; 404 | rk[3] = 0x09CF4F3CU; 405 | 406 | // Allocate RCON values 407 | u32* rcon; 408 | gpuErrorCheck(cudaMallocManaged(&rcon, RCON_SIZE * sizeof(u32))); 409 | for (int i = 0; i < RCON_SIZE; i++) { 410 | rcon[i] = RCON32[i]; 411 | } 412 | 413 | // Allocate Tables 414 | u32 *t0, *t1, *t2, *t3, *t4, *t4_0, *t4_1, *t4_2, *t4_3; 415 | u8* SAES_d; // Cihangir 416 | gpuErrorCheck(cudaMallocManaged(&t0, TABLE_SIZE * sizeof(u32))); 417 | gpuErrorCheck(cudaMallocManaged(&t1,
TABLE_SIZE * sizeof(u32))); 418 | gpuErrorCheck(cudaMallocManaged(&t2, TABLE_SIZE * sizeof(u32))); 419 | gpuErrorCheck(cudaMallocManaged(&t3, TABLE_SIZE * sizeof(u32))); 420 | gpuErrorCheck(cudaMallocManaged(&t4, TABLE_SIZE * sizeof(u32))); 421 | gpuErrorCheck(cudaMallocManaged(&t4_0, TABLE_SIZE * sizeof(u32))); 422 | gpuErrorCheck(cudaMallocManaged(&t4_1, TABLE_SIZE * sizeof(u32))); 423 | gpuErrorCheck(cudaMallocManaged(&t4_2, TABLE_SIZE * sizeof(u32))); 424 | gpuErrorCheck(cudaMallocManaged(&t4_3, TABLE_SIZE * sizeof(u32))); 425 | gpuErrorCheck(cudaMallocManaged(&SAES_d, 256 * sizeof(u8))); // Cihangir 426 | for (int i = 0; i < TABLE_SIZE; i++) { 427 | t0[i] = T0[i]; 428 | t1[i] = T1[i]; 429 | t2[i] = T2[i]; 430 | t3[i] = T3[i]; 431 | t4[i] = T4[i]; 432 | t4_0[i] = T4_0[i]; 433 | t4_1[i] = T4_1[i]; 434 | t4_2[i] = T4_2[i]; 435 | t4_3[i] = T4_3[i]; 436 | } 437 | for (int i = 0; i < 256; i++) SAES_d[i] = SAES[i]; // Cihangir 438 | printf("-------------------------------\n"); 439 | u64* range = calculateRange(); 440 | /* printf("Initial Plaintext : %08x %08x %08x %08x\n", pt[0], pt[1], pt[2], pt[3]); 441 | printf("Initial Key : %08x %08x %08x %08x\n", rk[0], rk[1], rk[2], rk[3]); 442 | printf("-------------------------------\n");*/ 443 | 444 | // Key expansion 445 | keyExpansion(rk, roundKeys); 446 | 447 | clock_t beginTime = clock(); 448 | // Kernels 449 | // counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<>>(pt, roundKeys, t0, t4, range); 450 | counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir << > > (pt, roundKeys, t0, t4, range, SAES_d); 451 | // counterWithOneTableExtendedSharedMemoryBytePerm4ShiftedSbox<<>>(pt, roundKeys, t0, t4_0, t4_1, t4_2, t4_3, range); 452 | // cudaMemcpy(rk, pt, 4*sizeof(u32), cudaMemcpyDeviceToHost); 453 | cudaDeviceSynchronize(); 454 | printf("Time elapsed: %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC); 455 | printf("-------------------------------\n"); 456 | printLastCUDAError(); 
457 | printf("plaintext: %x %x %x %x\n",rk[0], rk[1], rk[2], rk[3]); 458 | 459 | // Free alocated arrays 460 | cudaFree(range); 461 | cudaFree(pt); 462 | cudaFree(rk); 463 | cudaFree(roundKeys); 464 | cudaFree(t0); 465 | cudaFree(t1); 466 | cudaFree(t2); 467 | cudaFree(t3); 468 | cudaFree(t4); 469 | cudaFree(t4_0); 470 | cudaFree(t4_1); 471 | cudaFree(t4_2); 472 | cudaFree(t4_3); 473 | cudaFree(rcon); 474 | 475 | return 0; 476 | } -------------------------------------------------------------------------------- /file-encryption.cuh: -------------------------------------------------------------------------------- 1 | // System includes 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // CUDA runtime 8 | #include 9 | 10 | // Helper functions and utilities to work with CUDA 11 | //#include 12 | //#include 13 | 14 | #include 15 | //#include 16 | 17 | // Custom header 18 | //#include "kernel.h" 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | __device__ u32 fileEncryptionTotalG = 0; 25 | 26 | // CTR encryption with one table extended as 32 columns 27 | // 1 Table [256][32] -> arithmetic shift: __byte_perm function 28 | // SBox[256] is partly expanded 29 | __global__ void fileEncryption128counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, 30 | u32* encryptionCountG, u32* threadCountG) { 31 | 32 | int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; 33 | int warpThreadIndex = threadIdx.x & 31; 34 | int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE; 35 | 36 | // 37 | __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE]; 38 | __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE]; 39 | __shared__ u32 rkS[AES_128_KEY_SIZE_INT]; 40 | 41 | if (threadIdx.x < TABLE_SIZE) { 42 | for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { 43 | t0S[threadIdx.x][bankIndex] = t0G[threadIdx.x]; 44 | } 45 | 46 | for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) { 47 | 
t4S[threadIdx.x][bankIndex] = t4G[threadIdx.x]; 48 | } 49 | 50 | if (threadIdx.x < AES_128_KEY_SIZE_INT) { 51 | rkS[threadIdx.x] = rk[threadIdx.x]; 52 | } 53 | 54 | } 55 | // 56 | 57 | // Wait until every thread is ready 58 | __syncthreads(); 59 | 60 | u32 pt0Init, pt1Init, pt2Init, pt3Init, s0, s1, s2, s3; 61 | pt0Init = pt[0]; 62 | pt1Init = pt[1]; 63 | pt2Init = pt[2]; 64 | pt3Init = pt[3]; 65 | 66 | u32 pt2Max, pt3Max, threadCount = *threadCountG; 67 | u64 threadRangeStart = pt2Init; 68 | threadRangeStart = threadRangeStart << 32; 69 | threadRangeStart ^= pt3Init; 70 | threadRangeStart += *encryptionCountG; 71 | pt2Max = threadRangeStart >> 32; 72 | pt3Max = threadRangeStart & 0xFFFFFFFF; 73 | 74 | // Initialize plaintext 75 | pt3Init += threadIndex; 76 | if (pt3Init < threadIndex) { 77 | pt2Init++; 78 | } 79 | 80 | if (pt2Init >= pt2Max && pt3Init >= pt3Max) { 81 | return; 82 | } 83 | 84 | // Initialize ciphertext index 85 | u64 ctIndex = threadIndex*4; 86 | 87 | //if (threadIndex == 0) { 88 | // printf("Boundry: %08x %08x\n", pt2Max, pt3Max); 89 | // printf("threadCount: %08x\n", threadCount); 90 | // printf("encryptionCountG: %08x\n", *encryptionCountG); 91 | //} 92 | 93 | for (;;) { 94 | 95 | // Create plaintext as 32 bit unsigned integers 96 | s0 = pt0Init; 97 | s1 = pt1Init; 98 | s2 = pt2Init; 99 | s3 = pt3Init; 100 | 101 | // First round just XORs input with key. 
102 | s0 = s0 ^ rkS[0]; 103 | s1 = s1 ^ rkS[1]; 104 | s2 = s2 ^ rkS[2]; 105 | s3 = s3 ^ rkS[3]; 106 | 107 | u32 t0, t1, t2, t3; 108 | for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) { 109 | 110 | // Table based round function 111 | u32 rkStart = roundCount * 4 + 4; 112 | t0 = t0S[s0 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rkS[rkStart]; 113 | t1 = t0S[s1 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rkS[rkStart + 1]; 114 | t2 = t0S[s2 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rkS[rkStart + 2]; 115 | t3 = t0S[s3 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rkS[rkStart + 3]; 116 | 117 | s0 = t0; 118 | s1 = t1; 119 | s2 = t2; 120 | s3 = t3; 121 | 122 | } 123 | 124 | // Calculate the last round key 125 | // Last round uses s-box directly and XORs to produce output. 
126 | s0 = (t4S[t0 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t1 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t2 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t3) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rkS[40]; 127 | s1 = (t4S[t1 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t2 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t3 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t0) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rkS[41]; 128 | s2 = (t4S[t2 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t3 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t0 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t1) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rkS[42]; 129 | s3 = (t4S[t3 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t0 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t1 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t2) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rkS[43]; 130 | 131 | // Allocate ciphertext 132 | ct[ctIndex ] = s0; 133 | ct[ctIndex + 1] = s1; 134 | ct[ctIndex + 2] = s2; 135 | ct[ctIndex + 3] = s3; 136 | 137 | //if (pt3Init+1 == 0x05ea2a80) { 138 | // printf("-------------------------------\n"); 139 | // printf("threadIndex : %d\n", threadIndex); 140 | // printf("Plaintext : %08x %08x %08x %08x\n", pt0Init, pt1Init, pt2Init, pt3Init); 141 | // printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3); 142 | // printf("Ciphertext index : %d %d %d %d\n", ctIndex, ctIndex + 1, ctIndex + 2, ctIndex + 3); 143 | // printf("-------------------------------\n"); 144 | //} 145 | 146 | // Increase plaintext 147 | pt3Init += threadCount; 148 | if (pt3Init < threadCount) { 149 | pt2Init++; 150 | } 151 | 152 | // Ciphertext index 153 | ctIndex += threadCount * 4; 154 | 155 | //atomicAdd(&fileEncryptionTotalG, 1); 156 | 157 | if (pt2Init >= pt2Max && pt3Init >= pt3Max) { 158 | break; 159 | } 160 | 161 | } 162 | 163 | //if (threadIndex == 0) { 164 | // 
printf("threadIndex : %d\n", threadIndex); 165 | // printf("Plaintext : %08x %08x %08x %08x\n", pt0Init, pt1Init, pt2Init, pt3Init); 166 | // printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3); 167 | // printf("Ciphertext index : %d %d %d %d\n", ctIndex, ctIndex+1, ctIndex+2, ctIndex+3); 168 | // printf("-------------------------------\n"); 169 | //} 170 | 171 | }


// Shared body of the AES-192 / AES-256 CTR keystream kernels below.
//
// Template parameters:
//   RoundsMinusOne - number of table-based rounds (total rounds - 1)
//   KeySizeInt     - number of 32-bit round-key words; the last four are the final round key
//
// Every thread encrypts the counter blocks  start + i  for
//   i = globalThreadIndex, globalThreadIndex + *threadCountG, ...   while i < *encryptionCountG
// where start is the 64-bit value pt[2]:pt[3] (pt[0] and pt[1] are constant for all blocks),
// and stores the four ciphertext words of block i at ct[4 * i .. 4 * i + 3].
//
// Shared memory: t0S[256][32] + t4S[256][8] + rkS[KeySizeInt] words, statically allocated.
// The tables are filled with block-stride loops, so any 1D block size works.
template <u32 RoundsMinusOne, u32 KeySizeInt>
__device__ __forceinline__ void fileEncryptionCounterKeystream(const u32* pt, u32* ct, const u32* rk, const u32* t0G, const u32* t4G,
	const u32* encryptionCountG, const u32* threadCountG) {

	const u32 warpThreadIndex = threadIdx.x & 31;
	const u32 warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE;

	__shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
	__shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
	__shared__ u32 rkS[KeySizeInt];

	// Replicate every table entry across the columns of its row
	for (u32 row = threadIdx.x; row < TABLE_SIZE; row += blockDim.x) {
		const u32 t0Value = t0G[row];
		for (u32 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
			t0S[row][bankIndex] = t0Value;
		}
		const u32 t4Value = t4G[row];
		for (u32 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) {
			t4S[row][bankIndex] = t4Value;
		}
	}
	for (u32 word = threadIdx.x; word < KeySizeInt; word += blockDim.x) {
		rkS[word] = rk[word];
	}

	// Wait until every thread is ready (reached by all threads: no early exit above)
	__syncthreads();

	const u32 threadCount = *threadCountG;
	if (threadCount == 0) {
		return; // a zero stride would never terminate
	}
	const u64 encryptionCount = *encryptionCountG;
	const u32 pt0Init = pt[0];
	const u32 pt1Init = pt[1];
	const u64 counterStart = ((u64)pt[2] << 32) | pt[3];
	const u64 threadIndex = (u64)blockIdx.x * blockDim.x + threadIdx.x;
	const u32 lastRoundKey = KeySizeInt - 4;

	// Bounding the loop by the block index (not by comparing counter halves) keeps every
	// store inside ct even when the low counter word wraps around.
	for (u64 blockIndex = threadIndex; blockIndex < encryptionCount; blockIndex += threadCount) {

		// Counter block of this iteration; the addition carries from the low into the high word
		const u64 counter = counterStart + blockIndex;

		// First round just XORs input with key.
		u32 s0 = pt0Init ^ rkS[0];
		u32 s1 = pt1Init ^ rkS[1];
		u32 s2 = (u32)(counter >> 32) ^ rkS[2];
		u32 s3 = (u32)counter ^ rkS[3];

		for (u32 roundCount = 0; roundCount < RoundsMinusOne; roundCount++) {

			// Table based round function: T1..T3 are byte rotations of T0
			const u32 rkStart = roundCount * 4 + 4;
			const u32 t0 = t0S[s0 >> 24][warpThreadIndex]
				^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
				^ rkS[rkStart];
			const u32 t1 = t0S[s1 >> 24][warpThreadIndex]
				^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
				^ rkS[rkStart + 1];
			const u32 t2 = t0S[s2 >> 24][warpThreadIndex]
				^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
				^ rkS[rkStart + 2];
			const u32 t3 = t0S[s3 >> 24][warpThreadIndex]
				^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
				^ rkS[rkStart + 3];

			s0 = t0;
			s1 = t1;
			s2 = t2;
			s3 = t3;
		}

		// Last round uses the s-box table directly and XORs the final round key to produce output.
		const u64 ctIndex = blockIndex * 4;
		ct[ctIndex] = (t4S[s0 >> 24][warpThreadIndexSBox] & 0xFF000000)
			^ (t4S[(s1 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
			^ (t4S[(s2 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
			^ (t4S[s3 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
			^ rkS[lastRoundKey];
		ct[ctIndex + 1] = (t4S[s1 >> 24][warpThreadIndexSBox] & 0xFF000000)
			^ (t4S[(s2 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
			^ (t4S[(s3 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
			^ (t4S[s0 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
			^ rkS[lastRoundKey + 1];
		ct[ctIndex + 2] = (t4S[s2 >> 24][warpThreadIndexSBox] & 0xFF000000)
			^ (t4S[(s3 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
			^ (t4S[(s0 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
			^ (t4S[s1 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
			^ rkS[lastRoundKey + 2];
		ct[ctIndex + 3] = (t4S[s3 >> 24][warpThreadIndexSBox] & 0xFF000000)
			^ (t4S[(s0 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
			^ (t4S[(s1 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
			^ (t4S[s2 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
			^ rkS[lastRoundKey + 3];
	}
}


// CTR encryption with one table extended as 32 columns
// 1 Table [256][32] -> arithmetic shift: __byte_perm function
// SBox[256] is partly expanded
//
// AES-192 keystream kernel. Expects a 1D grid of 1D blocks.
//   pt               : 4 words, initial counter block
//   ct               : output, 4 words per encrypted counter block (*encryptionCountG blocks)
//   rk               : AES_192_KEY_SIZE_INT expanded round-key words
//   t0G, t4G         : TABLE_SIZE-entry T0 table and S-box table
//   encryptionCountG : number of counter blocks to encrypt
//   threadCountG     : stride between two blocks handled by the same thread
__global__ void fileEncryption192counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G,
	u32* encryptionCountG, u32* threadCountG) {
	fileEncryptionCounterKeystream<ROUND_COUNT_MIN_1_192, AES_192_KEY_SIZE_INT>(pt, ct, rk, t0G, t4G, encryptionCountG, threadCountG);
}


// CTR encryption with one table extended as 32 columns
// 1 Table [256][32] -> arithmetic shift: __byte_perm function
// SBox[256] is partly expanded
//
// AES-256 keystream kernel. Same parameters and launch layout as the AES-192 kernel above,
// except that rk holds AES_256_KEY_SIZE_INT expanded round-key words.
__global__ void fileEncryption256counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G,
	u32* encryptionCountG, u32* threadCountG) {
	fileEncryptionCounterKeystream<ROUND_COUNT_MIN_1_256, AES_256_KEY_SIZE_INT>(pt, ct, rk, t0G, t4G, encryptionCountG, threadCountG);
}


// Encrypts a file in AES CTR mode: the GPU produces the keystream for every 16-byte block of
// the file, then the host XORs it onto the file contents and writes "<filePath>_ENC".
// The input path, key length and keys are fixed test values.
// Returns 0 on success and when the input file cannot be opened; 1 on any other failure.
__host__ int mainFileEncryption() {
	printf("\n");
	printf("########## AES CTR File Encryption Implementation ##########\n");
	printf("\n");

	// Inputs
	const int chunkSize = 1024;
	const int keyLen = AES_128_KEY_LEN_INT;
	const std::string filePath = "C://file-encryption-test//movie4.mp4";
	const std::string outFilePath = filePath + "_ENC";

	u32 rk128[AES_128_KEY_LEN_INT] = { 0x2B7E1516U, 0x28AED2A6U, 0xABF71588U, 0x09CF4F3CU };
	u32 rk192[AES_192_KEY_LEN_INT] = { 0x8e73b0f7U, 0xda0e6452U, 0xc810f32bU, 0x809079e5U, 0x62f8ead2U, 0x522c6b7bU };
	u32 rk256[AES_256_KEY_LEN_INT] = { 0x603deb10U, 0x15ca71beU, 0x2b73aef0U, 0x857d7781U, 0x1f352c07U, 0x3b6108d7U, 0x2d9810a3U, 0x0914dff4U };

	// Select the key material; rk only aliases one of the host arrays above (never cudaFree it)
	const u32* rk;
	int keySize;
	if (keyLen == AES_128_KEY_LEN_INT) {
		rk = rk128;
		keySize = AES_128_KEY_SIZE_INT;
	} else if (keyLen == AES_192_KEY_LEN_INT) {
		rk = rk192;
		keySize = AES_192_KEY_SIZE_INT;
	} else if (keyLen == AES_256_KEY_LEN_INT) {
		rk = rk256;
		keySize = AES_256_KEY_SIZE_INT;
	} else {
		printf("Unsupported key length: %d\n", keyLen);
		return 1;
	}

	std::fstream fileIn(filePath, std::fstream::in | std::fstream::binary);
	if (!fileIn) {
		printf("File could not be opened: %s\n", filePath.c_str());
		fileIn.close();
		return 0;
	}

	// Get file size (kept in 64 bits so files of 4 GiB and more are not truncated)
	fileIn.seekg(0, fileIn.end);
	const std::streamoff fileEnd = fileIn.tellg();
	fileIn.seekg(0, fileIn.beg);
	if (fileEnd < 0) {
		printf("File size could not be determined: %s\n", filePath.c_str());
		return 1;
	}
	const u64 fileSize = (u64)fileEnd;
	printf("File path : %s\n", filePath.c_str());
	printf("File size in bytes : %llu\n", fileSize);
	printf("Encrypted file path : %s\n", outFilePath.c_str());
	printf("-------------------------------\n");

	// Calculate encryption boundary: one keystream block per started 16 bytes of the file
	const u64 totalBlocks = (fileSize + BYTE_COUNT - 1) / BYTE_COUNT;
	if (totalBlocks > 0xFFFFFFFFULL) {
		// The kernels take the block count as a 32-bit value
		printf("File is too large to be encrypted in one pass: %s\n", filePath.c_str());
		return 1;
	}
	const size_t ciphertextWords = (size_t)totalBlocks * U32_SIZE;
	const size_t ciphertextSize = ciphertextWords * sizeof(u32);

	// Allocate counter block, tables, launch parameters, round keys and keystream.
	// gpuErrorCheck only reports failures, so the pointers are verified below.
	u32 *pt = nullptr, *t0 = nullptr, *t4 = nullptr, *ct = nullptr;
	u32 *encryptionCount = nullptr, *threadCount = nullptr, *roundKeys = nullptr;
	gpuErrorCheck(cudaMallocManaged(&pt, 4 * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t0, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&threadCount, 1 * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&encryptionCount, 1 * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&roundKeys, keySize * sizeof(u32)));
	if (ciphertextSize != 0) {
		gpuErrorCheck(cudaMalloc((void **)&ct, ciphertextSize));
	}

	int status = 0;
	if (!pt || !t0 || !t4 || !threadCount || !encryptionCount || !roundKeys || (ciphertextSize != 0 && !ct)) {
		printf("GPU memory allocation failed\n");
		status = 1;
	} else {
		pt[0] = 0x3243F6A8U;
		pt[1] = 0x885A308DU;
		pt[2] = 0x313198A2U;
		pt[3] = 0x00000000U;

		for (int i = 0; i < TABLE_SIZE; i++) {
			t0[i] = T0[i];
			t4[i] = T4[i];
		}

		threadCount[0] = BLOCKS * THREADS;
		encryptionCount[0] = (u32)totalBlocks;

		printf("Blocks : %d\n", BLOCKS);
		printf("Threads : %d\n", THREADS);
		printf("Total thread count : %u\n", threadCount[0]);
		printf("Total encryptions : %u\n", encryptionCount[0]);
		printf("Total encryptions in byte : %llu\n", (u64)ciphertextSize);
		printf("Each thread encryptions : %.2f\n", encryptionCount[0] / (double)threadCount[0]);
		printf("-------------------------------\n");
		printf("Initial Counter : %08x %08x %08x %08x\n", pt[0], pt[1], pt[2], pt[3]);
		if (keyLen == AES_128_KEY_LEN_INT) {
			printf("Initial Key (%d byte) : %08x %08x %08x %08x\n", AES_128_KEY_LEN_INT * U32_SIZE, rk[0], rk[1], rk[2], rk[3]);
		} else if (keyLen == AES_192_KEY_LEN_INT) {
			printf("Initial Key (%d byte) : %08x %08x %08x %08x %08x %08x\n", AES_192_KEY_LEN_INT * U32_SIZE, rk[0], rk[1], rk[2], rk[3], rk[4], rk[5]);
		} else {
			printf("Initial Key (%d byte) : %08x %08x %08x %08x %08x %08x %08x %08x\n", AES_256_KEY_LEN_INT * U32_SIZE, rk[0], rk[1], rk[2], rk[3], rk[4], rk[5], rk[6], rk[7]);
		}
		printf("-------------------------------\n");

		// Prepare round keys
		if (keyLen == AES_128_KEY_LEN_INT) {
			keyExpansion(rk128, roundKeys);
		} else if (keyLen == AES_192_KEY_LEN_INT) {
			keyExpansion192(rk192, roundKeys);
		} else {
			keyExpansion256(rk256, roundKeys);
		}

		clock_t beginTime = clock();
		// Kernels: BLOCKS x THREADS threads, matching threadCount[0] set above
		if (keyLen == AES_128_KEY_LEN_INT) {
			fileEncryption128counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<<BLOCKS, THREADS>>>(pt, ct, roundKeys, t0, t4, encryptionCount, threadCount);
		} else if (keyLen == AES_192_KEY_LEN_INT) {
			fileEncryption192counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<<BLOCKS, THREADS>>>(pt, ct, roundKeys, t0, t4, encryptionCount, threadCount);
		} else {
			fileEncryption256counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<<BLOCKS, THREADS>>>(pt, ct, roundKeys, t0, t4, encryptionCount, threadCount);
		}

		cudaDeviceSynchronize();
		printf("Time elapsed (Encryption) : %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);
		printLastCUDAError();

		// Copy the keystream back to the host
		beginTime = clock();
		u32 *ctH = new u32[ciphertextWords];
		gpuErrorCheck(cudaMemcpy(ctH, ct, ciphertextSize, cudaMemcpyDeviceToHost));
		printf("Time elapsed (Memcpy) : %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);

		// Open output file
		beginTime = clock();
		std::fstream fileOut(outFilePath, std::fstream::out | std::fstream::binary);
		if (!fileOut) {
			printf("File could not be opened: %s\n", outFilePath.c_str());
			status = 1;
		} else {
			// Allocate file buffer
			char *buffer = new char[chunkSize];
			u64 keystreamByteIndex = 0;
			for (;;) {
				// Read data as a block into buffer; gcount() is chunkSize except for the last part
				fileIn.read(buffer, chunkSize);
				const std::streamsize readByte = fileIn.gcount();
				// XOR every byte with its keystream byte; keystream words are consumed
				// most significant byte first
				for (std::streamsize bufferIndex = 0; bufferIndex < readByte; bufferIndex++, keystreamByteIndex++) {
					const u32 keystreamWord = ctH[keystreamByteIndex / U32_SIZE];
					const u32 shift = (U32_SIZE - 1 - (u32)(keystreamByteIndex % U32_SIZE)) * 8;
					buffer[bufferIndex] ^= (char)(keystreamWord >> shift);
				}
				// Write buffer to output file
				fileOut.write(buffer, readByte);
				// stop
				if (readByte < chunkSize) {
					break;
				}
			}
			printf("Time elapsed (File write) : %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);

			delete[] buffer;
			fileOut.close();
		}
		delete[] ctH;
	}

	// Free allocated arrays (cudaFree accepts null pointers)
	cudaFree(threadCount);
	cudaFree(encryptionCount);
	cudaFree(ct);
	cudaFree(pt);
	cudaFree(roundKeys);
	cudaFree(t0);
	cudaFree(t4);

	fileIn.close();
	return status;
}
-------------------------------------------------------------------------------- /AES_final.h: -------------------------------------------------------------------------------- 1 | typedef unsigned char u8; 2
| typedef unsigned short u16;
typedef unsigned int u32;
typedef unsigned long long u64;

// Launch configuration; BLOCKS * THREADS is used as the total thread count
#define BLOCKS 1024
#define THREADS 1024
// calculateRange() splits 2^TWO_POWER_RANGE keys across all threads
#define TWO_POWER_RANGE 35

#define SHARED_MEM_BANK_SIZE 32
#define S_BOX_BANK_SIZE 8
#define TABLE_SIZE 256
#define RCON_SIZE 15
#define U32_SIZE 4
#define MAX_U32 4294967295
#define MAX_U16 0x0000FFFF
#define BYTE_COUNT 16 // 128 / 8
#define PARTLY_DIVIDE_THRESHOLD 110

// Key lengths in 32-bit words
#define AES_128_KEY_LEN_INT 4
#define AES_192_KEY_LEN_INT 6
#define AES_256_KEY_LEN_INT 8

// Expanded round-key sizes in 32-bit words: 4 * (rounds + 1)
#define AES_128_KEY_SIZE_INT 44
#define AES_192_KEY_SIZE_INT 52
#define AES_256_KEY_SIZE_INT 60

#define ROUND_COUNT 10
#define ROUND_COUNT_MIN_1 9
#define ROUND_COUNT_192 12
#define ROUND_COUNT_MIN_1_192 11
#define ROUND_COUNT_256 14
#define ROUND_COUNT_MIN_1_256 13

// __byte_perm Constants
// u32 t = __byte_perm(x, y, selector);
#define SHIFT_1_RIGHT 17185 // 0x00004321U i.e. ( >> 8 )
#define SHIFT_2_RIGHT 21554 // 0x00005432U i.e. ( >> 16 )
#define SHIFT_3_RIGHT 25923 // 0x00006543U i.e. ( >> 24 )

// Reports a failed CUDA runtime call together with the call site.
#define gpuErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints the error string, file and line to stderr when code is not cudaSuccess.
// NOTE: the process is NOT terminated (the exit call is disabled), so callers must not
// assume that a checked call succeeded; the abort parameter is currently unused.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
	if (code != cudaSuccess) {
		fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
		//if (abort) exit(code);
	}
}

// Prints (and, via cudaGetLastError, clears) the last CUDA runtime error, if there is one.
// inline: this function is defined in a header.
inline void printLastCUDAError(){
	cudaError_t cudaError = cudaGetLastError();
	if (cudaError != cudaSuccess) {
		printf("-----\n");
		printf("ERROR: cudaGetLastError() returned %d: %s\n", cudaError, cudaGetErrorString(cudaError));
		printf("-----\n");
	}
}

// Allocates one managed u64 and stores in it the number of keys each thread has to cover:
// ceil(2^TWO_POWER_RANGE / (BLOCKS * THREADS)).
// Returns the managed pointer (the caller owns it and releases it with cudaFree),
// or nullptr when the allocation failed.
__host__ inline u64* calculateRange() {
	u64* range = nullptr;
	gpuErrorCheck(cudaMallocManaged(&range, 1 * sizeof(u64)));
	if (range == nullptr) {
		// gpuErrorCheck has already reported the failure; do not write through the pointer
		return nullptr;
	}
	int threadCount = BLOCKS * THREADS;
	double keyRange = pow(2, TWO_POWER_RANGE);
	double threadRange = keyRange / threadCount;
	*range = ceil(threadRange);
	return range;
}

// Despite the name this is a 32-bit rotate right: the n bits shifted out on the right
// re-enter on the left. n must be below 32.
__device__ __forceinline__ u32 arithmeticRightShift(u32 x, u32 n) { return (x >> n) | (x << (-n & 31)); }
// Shifts x right by n and ORs in the bits of x selected by the mask n2Power, moved left by
// (16 - n) mod 16. Presumably n2Power is 2^n - 1, giving a 16-bit rotate right - confirm
// against the callers.
__device__ __forceinline__ u32 arithmetic16bitRightShift(u32 x, u32 n, u32 n2Power) { return (x >> n) | ((x & n2Power) << (-n & 15)); }
// Byte permutation of x with the __byte_perm selector n; use the SHIFT_*_RIGHT selectors above.
__device__ __forceinline__ u32 arithmeticRightShiftBytePerm(u32 x, u32 n) { return __byte_perm(x, x, n); }

// ROTATE LEFT
// Both macros are fully parenthesized so they can be used inside larger expressions.
#define ROTL64(x,n) (((x)<<(n))|((x)>>(64-(n))))
#define ROTL16(x,n) ((((x)<<(n))|((x)>>(16-(n))))&0xffff)
u8 SAES[256] = { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }; 88 | 89 | u32 T0[TABLE_SIZE] = { 90 | 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU, 91 | 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U, 92 | 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU, 93 | 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU, 94 | 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U, 95 | 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU, 96 | 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU, 97 | 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU, 98 | 0x75b7b7c2U, 
0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU, 99 | 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU, 100 | 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U, 101 | 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU, 102 | 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU, 103 | 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U, 104 | 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU, 105 | 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU, 106 | 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU, 107 | 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU, 108 | 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU, 109 | 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U, 110 | 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU, 111 | 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU, 112 | 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU, 113 | 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU, 114 | 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U, 115 | 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U, 116 | 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U, 117 | 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U, 118 | 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU, 119 | 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U, 120 | 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U, 121 | 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU, 122 | 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU, 123 | 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U, 124 | 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U, 125 | 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U, 126 | 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU, 127 | 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U, 128 | 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU, 129 | 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U, 130 | 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU, 131 | 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U, 132 | 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 
0xc46262a6U, 133 | 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU, 134 | 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U, 135 | 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U, 136 | 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U, 137 | 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U, 138 | 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U, 139 | 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U, 140 | 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U, 141 | 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U, 142 | 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU, 143 | 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U, 144 | 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U, 145 | 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U, 146 | 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U, 147 | 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U, 148 | 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U, 149 | 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU, 150 | 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U, 151 | 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U, 152 | 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U, 153 | 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU, 154 | }; 155 | u32 T1[TABLE_SIZE] = { 156 | 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU, 157 | 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U, 158 | 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU, 159 | 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U, 160 | 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU, 161 | 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U, 162 | 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU, 163 | 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U, 164 | 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U, 165 | 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU, 166 | 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U, 167 | 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U, 168 | 0x0c080404U, 0x5295c7c7U, 
0x65462323U, 0x5e9dc3c3U, 169 | 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU, 170 | 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U, 171 | 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U, 172 | 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU, 173 | 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U, 174 | 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U, 175 | 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U, 176 | 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU, 177 | 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU, 178 | 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U, 179 | 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU, 180 | 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU, 181 | 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U, 182 | 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU, 183 | 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U, 184 | 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU, 185 | 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U, 186 | 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U, 187 | 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U, 188 | 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU, 189 | 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U, 190 | 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU, 191 | 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U, 192 | 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU, 193 | 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U, 194 | 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U, 195 | 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU, 196 | 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU, 197 | 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU, 198 | 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U, 199 | 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U, 200 | 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU, 201 | 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U, 202 | 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU, 
203 | 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U, 204 | 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU, 205 | 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U, 206 | 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU, 207 | 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU, 208 | 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U, 209 | 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU, 210 | 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U, 211 | 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU, 212 | 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U, 213 | 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U, 214 | 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U, 215 | 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU, 216 | 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU, 217 | 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U, 218 | 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU, 219 | 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U, 220 | }; 221 | u32 T2[TABLE_SIZE] = { 222 | 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU, 223 | 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U, 224 | 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU, 225 | 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U, 226 | 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU, 227 | 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U, 228 | 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU, 229 | 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U, 230 | 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U, 231 | 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU, 232 | 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U, 233 | 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U, 234 | 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U, 235 | 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU, 236 | 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U, 237 | 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U, 238 | 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 
0x1a2e341aU, 239 | 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U, 240 | 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U, 241 | 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U, 242 | 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU, 243 | 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU, 244 | 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U, 245 | 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU, 246 | 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU, 247 | 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U, 248 | 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU, 249 | 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U, 250 | 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU, 251 | 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U, 252 | 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U, 253 | 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U, 254 | 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU, 255 | 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U, 256 | 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU, 257 | 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U, 258 | 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU, 259 | 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U, 260 | 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U, 261 | 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU, 262 | 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU, 263 | 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU, 264 | 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U, 265 | 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U, 266 | 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU, 267 | 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U, 268 | 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU, 269 | 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U, 270 | 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU, 271 | 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U, 272 | 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU, 273 | 
0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU, 274 | 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U, 275 | 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU, 276 | 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U, 277 | 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU, 278 | 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U, 279 | 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U, 280 | 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U, 281 | 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU, 282 | 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU, 283 | 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U, 284 | 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU, 285 | 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U, 286 | }; 287 | u32 T3[TABLE_SIZE] = { 288 | 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U, 289 | 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U, 290 | 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U, 291 | 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU, 292 | 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU, 293 | 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU, 294 | 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U, 295 | 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU, 296 | 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU, 297 | 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U, 298 | 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U, 299 | 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU, 300 | 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU, 301 | 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU, 302 | 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU, 303 | 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU, 304 | 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U, 305 | 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU, 306 | 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU, 307 | 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U, 308 | 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 
0xeded2cc1U, 309 | 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U, 310 | 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U, 311 | 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U, 312 | 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU, 313 | 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U, 314 | 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU, 315 | 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU, 316 | 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U, 317 | 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U, 318 | 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U, 319 | 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU, 320 | 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U, 321 | 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU, 322 | 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU, 323 | 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U, 324 | 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U, 325 | 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU, 326 | 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U, 327 | 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU, 328 | 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U, 329 | 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U, 330 | 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U, 331 | 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U, 332 | 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU, 333 | 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U, 334 | 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU, 335 | 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U, 336 | 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU, 337 | 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U, 338 | 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU, 339 | 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU, 340 | 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU, 341 | 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU, 342 | 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U, 343 | 
0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U, 344 | 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U, 345 | 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U, 346 | 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U, 347 | 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U, 348 | 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU, 349 | 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U, 350 | 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU, 351 | 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU, 352 | }; 353 | u32 T4[TABLE_SIZE] = { 354 | 0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU, 355 | 0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U, 356 | 0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU, 357 | 0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U, 358 | 0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU, 359 | 0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U, 360 | 0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU, 361 | 0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U, 362 | 0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U, 363 | 0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU, 364 | 0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U, 365 | 0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U, 366 | 0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U, 367 | 0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU, 368 | 0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U, 369 | 0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U, 370 | 0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU, 371 | 0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U, 372 | 0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U, 373 | 0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U, 374 | 0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU, 375 | 0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU, 376 | 0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U, 377 | 0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU, 378 | 0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 
0xfbfbfbfbU, 379 | 0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U, 380 | 0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU, 381 | 0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U, 382 | 0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU, 383 | 0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U, 384 | 0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U, 385 | 0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U, 386 | 0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU, 387 | 0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U, 388 | 0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU, 389 | 0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U, 390 | 0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU, 391 | 0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U, 392 | 0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U, 393 | 0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU, 394 | 0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU, 395 | 0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU, 396 | 0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U, 397 | 0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U, 398 | 0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU, 399 | 0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U, 400 | 0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU, 401 | 0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U, 402 | 0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU, 403 | 0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U, 404 | 0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU, 405 | 0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU, 406 | 0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U, 407 | 0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU, 408 | 0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U, 409 | 0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU, 410 | 0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U, 411 | 0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U, 412 | 0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U, 413 | 
0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU, 414 | 0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU, 415 | 0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U, 416 | 0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU, 417 | 0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U, 418 | }; 419 | u32 T4_0[TABLE_SIZE] = { 420 | 0x00000063U, 0x0000007cU, 0x00000077U, 0x0000007bU, 421 | 0x000000f2U, 0x0000006bU, 0x0000006fU, 0x000000c5U, 422 | 0x00000030U, 0x00000001U, 0x00000067U, 0x0000002bU, 423 | 0x000000feU, 0x000000d7U, 0x000000abU, 0x00000076U, 424 | 0x000000caU, 0x00000082U, 0x000000c9U, 0x0000007dU, 425 | 0x000000faU, 0x00000059U, 0x00000047U, 0x000000f0U, 426 | 0x000000adU, 0x000000d4U, 0x000000a2U, 0x000000afU, 427 | 0x0000009cU, 0x000000a4U, 0x00000072U, 0x000000c0U, 428 | 0x000000b7U, 0x000000fdU, 0x00000093U, 0x00000026U, 429 | 0x00000036U, 0x0000003fU, 0x000000f7U, 0x000000ccU, 430 | 0x00000034U, 0x000000a5U, 0x000000e5U, 0x000000f1U, 431 | 0x00000071U, 0x000000d8U, 0x00000031U, 0x00000015U, 432 | 0x00000004U, 0x000000c7U, 0x00000023U, 0x000000c3U, 433 | 0x00000018U, 0x00000096U, 0x00000005U, 0x0000009aU, 434 | 0x00000007U, 0x00000012U, 0x00000080U, 0x000000e2U, 435 | 0x000000ebU, 0x00000027U, 0x000000b2U, 0x00000075U, 436 | 0x00000009U, 0x00000083U, 0x0000002cU, 0x0000001aU, 437 | 0x0000001bU, 0x0000006eU, 0x0000005aU, 0x000000a0U, 438 | 0x00000052U, 0x0000003bU, 0x000000d6U, 0x000000b3U, 439 | 0x00000029U, 0x000000e3U, 0x0000002fU, 0x00000084U, 440 | 0x00000053U, 0x000000d1U, 0x00000000U, 0x000000edU, 441 | 0x00000020U, 0x000000fcU, 0x000000b1U, 0x0000005bU, 442 | 0x0000006aU, 0x000000cbU, 0x000000beU, 0x00000039U, 443 | 0x0000004aU, 0x0000004cU, 0x00000058U, 0x000000cfU, 444 | 0x000000d0U, 0x000000efU, 0x000000aaU, 0x000000fbU, 445 | 0x00000043U, 0x0000004dU, 0x00000033U, 0x00000085U, 446 | 0x00000045U, 0x000000f9U, 0x00000002U, 0x0000007fU, 447 | 0x00000050U, 0x0000003cU, 0x0000009fU, 0x000000a8U, 448 | 0x00000051U, 0x000000a3U, 0x00000040U, 
0x0000008fU, 449 | 0x00000092U, 0x0000009dU, 0x00000038U, 0x000000f5U, 450 | 0x000000bcU, 0x000000b6U, 0x000000daU, 0x00000021U, 451 | 0x00000010U, 0x000000ffU, 0x000000f3U, 0x000000d2U, 452 | 0x000000cdU, 0x0000000cU, 0x00000013U, 0x000000ecU, 453 | 0x0000005fU, 0x00000097U, 0x00000044U, 0x00000017U, 454 | 0x000000c4U, 0x000000a7U, 0x0000007eU, 0x0000003dU, 455 | 0x00000064U, 0x0000005dU, 0x00000019U, 0x00000073U, 456 | 0x00000060U, 0x00000081U, 0x0000004fU, 0x000000dcU, 457 | 0x00000022U, 0x0000002aU, 0x00000090U, 0x00000088U, 458 | 0x00000046U, 0x000000eeU, 0x000000b8U, 0x00000014U, 459 | 0x000000deU, 0x0000005eU, 0x0000000bU, 0x000000dbU, 460 | 0x000000e0U, 0x00000032U, 0x0000003aU, 0x0000000aU, 461 | 0x00000049U, 0x00000006U, 0x00000024U, 0x0000005cU, 462 | 0x000000c2U, 0x000000d3U, 0x000000acU, 0x00000062U, 463 | 0x00000091U, 0x00000095U, 0x000000e4U, 0x00000079U, 464 | 0x000000e7U, 0x000000c8U, 0x00000037U, 0x0000006dU, 465 | 0x0000008dU, 0x000000d5U, 0x0000004eU, 0x000000a9U, 466 | 0x0000006cU, 0x00000056U, 0x000000f4U, 0x000000eaU, 467 | 0x00000065U, 0x0000007aU, 0x000000aeU, 0x00000008U, 468 | 0x000000baU, 0x00000078U, 0x00000025U, 0x0000002eU, 469 | 0x0000001cU, 0x000000a6U, 0x000000b4U, 0x000000c6U, 470 | 0x000000e8U, 0x000000ddU, 0x00000074U, 0x0000001fU, 471 | 0x0000004bU, 0x000000bdU, 0x0000008bU, 0x0000008aU, 472 | 0x00000070U, 0x0000003eU, 0x000000b5U, 0x00000066U, 473 | 0x00000048U, 0x00000003U, 0x000000f6U, 0x0000000eU, 474 | 0x00000061U, 0x00000035U, 0x00000057U, 0x000000b9U, 475 | 0x00000086U, 0x000000c1U, 0x0000001dU, 0x0000009eU, 476 | 0x000000e1U, 0x000000f8U, 0x00000098U, 0x00000011U, 477 | 0x00000069U, 0x000000d9U, 0x0000008eU, 0x00000094U, 478 | 0x0000009bU, 0x0000001eU, 0x00000087U, 0x000000e9U, 479 | 0x000000ceU, 0x00000055U, 0x00000028U, 0x000000dfU, 480 | 0x0000008cU, 0x000000a1U, 0x00000089U, 0x0000000dU, 481 | 0x000000bfU, 0x000000e6U, 0x00000042U, 0x00000068U, 482 | 0x00000041U, 0x00000099U, 0x0000002dU, 0x0000000fU, 483 | 
0x000000b0U, 0x00000054U, 0x000000bbU, 0x00000016U, 484 | }; 485 | u32 T4_1[TABLE_SIZE] = { 486 | 0x00006300U, 0x00007c00U, 0x00007700U, 0x00007b00U, 487 | 0x0000f200U, 0x00006b00U, 0x00006f00U, 0x0000c500U, 488 | 0x00003000U, 0x00000100U, 0x00006700U, 0x00002b00U, 489 | 0x0000fe00U, 0x0000d700U, 0x0000ab00U, 0x00007600U, 490 | 0x0000ca00U, 0x00008200U, 0x0000c900U, 0x00007d00U, 491 | 0x0000fa00U, 0x00005900U, 0x00004700U, 0x0000f000U, 492 | 0x0000ad00U, 0x0000d400U, 0x0000a200U, 0x0000af00U, 493 | 0x00009c00U, 0x0000a400U, 0x00007200U, 0x0000c000U, 494 | 0x0000b700U, 0x0000fd00U, 0x00009300U, 0x00002600U, 495 | 0x00003600U, 0x00003f00U, 0x0000f700U, 0x0000cc00U, 496 | 0x00003400U, 0x0000a500U, 0x0000e500U, 0x0000f100U, 497 | 0x00007100U, 0x0000d800U, 0x00003100U, 0x00001500U, 498 | 0x00000400U, 0x0000c700U, 0x00002300U, 0x0000c300U, 499 | 0x00001800U, 0x00009600U, 0x00000500U, 0x00009a00U, 500 | 0x00000700U, 0x00001200U, 0x00008000U, 0x0000e200U, 501 | 0x0000eb00U, 0x00002700U, 0x0000b200U, 0x00007500U, 502 | 0x00000900U, 0x00008300U, 0x00002c00U, 0x00001a00U, 503 | 0x00001b00U, 0x00006e00U, 0x00005a00U, 0x0000a000U, 504 | 0x00005200U, 0x00003b00U, 0x0000d600U, 0x0000b300U, 505 | 0x00002900U, 0x0000e300U, 0x00002f00U, 0x00008400U, 506 | 0x00005300U, 0x0000d100U, 0x00000000U, 0x0000ed00U, 507 | 0x00002000U, 0x0000fc00U, 0x0000b100U, 0x00005b00U, 508 | 0x00006a00U, 0x0000cb00U, 0x0000be00U, 0x00003900U, 509 | 0x00004a00U, 0x00004c00U, 0x00005800U, 0x0000cf00U, 510 | 0x0000d000U, 0x0000ef00U, 0x0000aa00U, 0x0000fb00U, 511 | 0x00004300U, 0x00004d00U, 0x00003300U, 0x00008500U, 512 | 0x00004500U, 0x0000f900U, 0x00000200U, 0x00007f00U, 513 | 0x00005000U, 0x00003c00U, 0x00009f00U, 0x0000a800U, 514 | 0x00005100U, 0x0000a300U, 0x00004000U, 0x00008f00U, 515 | 0x00009200U, 0x00009d00U, 0x00003800U, 0x0000f500U, 516 | 0x0000bc00U, 0x0000b600U, 0x0000da00U, 0x00002100U, 517 | 0x00001000U, 0x0000ff00U, 0x0000f300U, 0x0000d200U, 518 | 0x0000cd00U, 0x00000c00U, 0x00001300U, 
0x0000ec00U, 519 | 0x00005f00U, 0x00009700U, 0x00004400U, 0x00001700U, 520 | 0x0000c400U, 0x0000a700U, 0x00007e00U, 0x00003d00U, 521 | 0x00006400U, 0x00005d00U, 0x00001900U, 0x00007300U, 522 | 0x00006000U, 0x00008100U, 0x00004f00U, 0x0000dc00U, 523 | 0x00002200U, 0x00002a00U, 0x00009000U, 0x00008800U, 524 | 0x00004600U, 0x0000ee00U, 0x0000b800U, 0x00001400U, 525 | 0x0000de00U, 0x00005e00U, 0x00000b00U, 0x0000db00U, 526 | 0x0000e000U, 0x00003200U, 0x00003a00U, 0x00000a00U, 527 | 0x00004900U, 0x00000600U, 0x00002400U, 0x00005c00U, 528 | 0x0000c200U, 0x0000d300U, 0x0000ac00U, 0x00006200U, 529 | 0x00009100U, 0x00009500U, 0x0000e400U, 0x00007900U, 530 | 0x0000e700U, 0x0000c800U, 0x00003700U, 0x00006d00U, 531 | 0x00008d00U, 0x0000d500U, 0x00004e00U, 0x0000a900U, 532 | 0x00006c00U, 0x00005600U, 0x0000f400U, 0x0000ea00U, 533 | 0x00006500U, 0x00007a00U, 0x0000ae00U, 0x00000800U, 534 | 0x0000ba00U, 0x00007800U, 0x00002500U, 0x00002e00U, 535 | 0x00001c00U, 0x0000a600U, 0x0000b400U, 0x0000c600U, 536 | 0x0000e800U, 0x0000dd00U, 0x00007400U, 0x00001f00U, 537 | 0x00004b00U, 0x0000bd00U, 0x00008b00U, 0x00008a00U, 538 | 0x00007000U, 0x00003e00U, 0x0000b500U, 0x00006600U, 539 | 0x00004800U, 0x00000300U, 0x0000f600U, 0x00000e00U, 540 | 0x00006100U, 0x00003500U, 0x00005700U, 0x0000b900U, 541 | 0x00008600U, 0x0000c100U, 0x00001d00U, 0x00009e00U, 542 | 0x0000e100U, 0x0000f800U, 0x00009800U, 0x00001100U, 543 | 0x00006900U, 0x0000d900U, 0x00008e00U, 0x00009400U, 544 | 0x00009b00U, 0x00001e00U, 0x00008700U, 0x0000e900U, 545 | 0x0000ce00U, 0x00005500U, 0x00002800U, 0x0000df00U, 546 | 0x00008c00U, 0x0000a100U, 0x00008900U, 0x00000d00U, 547 | 0x0000bf00U, 0x0000e600U, 0x00004200U, 0x00006800U, 548 | 0x00004100U, 0x00009900U, 0x00002d00U, 0x00000f00U, 549 | 0x0000b000U, 0x00005400U, 0x0000bb00U, 0x00001600U, 550 | }; 551 | u32 T4_2[TABLE_SIZE] = { 552 | 0x00630000U, 0x007c0000U, 0x00770000U, 0x007b0000U, 553 | 0x00f20000U, 0x006b0000U, 0x006f0000U, 0x00c50000U, 554 | 0x00300000U, 0x00010000U, 
0x00670000U, 0x002b0000U, 555 | 0x00fe0000U, 0x00d70000U, 0x00ab0000U, 0x00760000U, 556 | 0x00ca0000U, 0x00820000U, 0x00c90000U, 0x007d0000U, 557 | 0x00fa0000U, 0x00590000U, 0x00470000U, 0x00f00000U, 558 | 0x00ad0000U, 0x00d40000U, 0x00a20000U, 0x00af0000U, 559 | 0x009c0000U, 0x00a40000U, 0x00720000U, 0x00c00000U, 560 | 0x00b70000U, 0x00fd0000U, 0x00930000U, 0x00260000U, 561 | 0x00360000U, 0x003f0000U, 0x00f70000U, 0x00cc0000U, 562 | 0x00340000U, 0x00a50000U, 0x00e50000U, 0x00f10000U, 563 | 0x00710000U, 0x00d80000U, 0x00310000U, 0x00150000U, 564 | 0x00040000U, 0x00c70000U, 0x00230000U, 0x00c30000U, 565 | 0x00180000U, 0x00960000U, 0x00050000U, 0x009a0000U, 566 | 0x00070000U, 0x00120000U, 0x00800000U, 0x00e20000U, 567 | 0x00eb0000U, 0x00270000U, 0x00b20000U, 0x00750000U, 568 | 0x00090000U, 0x00830000U, 0x002c0000U, 0x001a0000U, 569 | 0x001b0000U, 0x006e0000U, 0x005a0000U, 0x00a00000U, 570 | 0x00520000U, 0x003b0000U, 0x00d60000U, 0x00b30000U, 571 | 0x00290000U, 0x00e30000U, 0x002f0000U, 0x00840000U, 572 | 0x00530000U, 0x00d10000U, 0x00000000U, 0x00ed0000U, 573 | 0x00200000U, 0x00fc0000U, 0x00b10000U, 0x005b0000U, 574 | 0x006a0000U, 0x00cb0000U, 0x00be0000U, 0x00390000U, 575 | 0x004a0000U, 0x004c0000U, 0x00580000U, 0x00cf0000U, 576 | 0x00d00000U, 0x00ef0000U, 0x00aa0000U, 0x00fb0000U, 577 | 0x00430000U, 0x004d0000U, 0x00330000U, 0x00850000U, 578 | 0x00450000U, 0x00f90000U, 0x00020000U, 0x007f0000U, 579 | 0x00500000U, 0x003c0000U, 0x009f0000U, 0x00a80000U, 580 | 0x00510000U, 0x00a30000U, 0x00400000U, 0x008f0000U, 581 | 0x00920000U, 0x009d0000U, 0x00380000U, 0x00f50000U, 582 | 0x00bc0000U, 0x00b60000U, 0x00da0000U, 0x00210000U, 583 | 0x00100000U, 0x00ff0000U, 0x00f30000U, 0x00d20000U, 584 | 0x00cd0000U, 0x000c0000U, 0x00130000U, 0x00ec0000U, 585 | 0x005f0000U, 0x00970000U, 0x00440000U, 0x00170000U, 586 | 0x00c40000U, 0x00a70000U, 0x007e0000U, 0x003d0000U, 587 | 0x00640000U, 0x005d0000U, 0x00190000U, 0x00730000U, 588 | 0x00600000U, 0x00810000U, 0x004f0000U, 0x00dc0000U, 
589 | 0x00220000U, 0x002a0000U, 0x00900000U, 0x00880000U, 590 | 0x00460000U, 0x00ee0000U, 0x00b80000U, 0x00140000U, 591 | 0x00de0000U, 0x005e0000U, 0x000b0000U, 0x00db0000U, 592 | 0x00e00000U, 0x00320000U, 0x003a0000U, 0x000a0000U, 593 | 0x00490000U, 0x00060000U, 0x00240000U, 0x005c0000U, 594 | 0x00c20000U, 0x00d30000U, 0x00ac0000U, 0x00620000U, 595 | 0x00910000U, 0x00950000U, 0x00e40000U, 0x00790000U, 596 | 0x00e70000U, 0x00c80000U, 0x00370000U, 0x006d0000U, 597 | 0x008d0000U, 0x00d50000U, 0x004e0000U, 0x00a90000U, 598 | 0x006c0000U, 0x00560000U, 0x00f40000U, 0x00ea0000U, 599 | 0x00650000U, 0x007a0000U, 0x00ae0000U, 0x00080000U, 600 | 0x00ba0000U, 0x00780000U, 0x00250000U, 0x002e0000U, 601 | 0x001c0000U, 0x00a60000U, 0x00b40000U, 0x00c60000U, 602 | 0x00e80000U, 0x00dd0000U, 0x00740000U, 0x001f0000U, 603 | 0x004b0000U, 0x00bd0000U, 0x008b0000U, 0x008a0000U, 604 | 0x00700000U, 0x003e0000U, 0x00b50000U, 0x00660000U, 605 | 0x00480000U, 0x00030000U, 0x00f60000U, 0x000e0000U, 606 | 0x00610000U, 0x00350000U, 0x00570000U, 0x00b90000U, 607 | 0x00860000U, 0x00c10000U, 0x001d0000U, 0x009e0000U, 608 | 0x00e10000U, 0x00f80000U, 0x00980000U, 0x00110000U, 609 | 0x00690000U, 0x00d90000U, 0x008e0000U, 0x00940000U, 610 | 0x009b0000U, 0x001e0000U, 0x00870000U, 0x00e90000U, 611 | 0x00ce0000U, 0x00550000U, 0x00280000U, 0x00df0000U, 612 | 0x008c0000U, 0x00a10000U, 0x00890000U, 0x000d0000U, 613 | 0x00bf0000U, 0x00e60000U, 0x00420000U, 0x00680000U, 614 | 0x00410000U, 0x00990000U, 0x002d0000U, 0x000f0000U, 615 | 0x00b00000U, 0x00540000U, 0x00bb0000U, 0x00160000U, 616 | }; 617 | u32 T4_3[TABLE_SIZE] = { 618 | 0x63000000U, 0x7c000000U, 0x77000000U, 0x7b000000U, 619 | 0xf2000000U, 0x6b000000U, 0x6f000000U, 0xc5000000U, 620 | 0x30000000U, 0x01000000U, 0x67000000U, 0x2b000000U, 621 | 0xfe000000U, 0xd7000000U, 0xab000000U, 0x76000000U, 622 | 0xca000000U, 0x82000000U, 0xc9000000U, 0x7d000000U, 623 | 0xfa000000U, 0x59000000U, 0x47000000U, 0xf0000000U, 624 | 0xad000000U, 0xd4000000U, 0xa2000000U, 
0xaf000000U, 625 | 0x9c000000U, 0xa4000000U, 0x72000000U, 0xc0000000U, 626 | 0xb7000000U, 0xfd000000U, 0x93000000U, 0x26000000U, 627 | 0x36000000U, 0x3f000000U, 0xf7000000U, 0xcc000000U, 628 | 0x34000000U, 0xa5000000U, 0xe5000000U, 0xf1000000U, 629 | 0x71000000U, 0xd8000000U, 0x31000000U, 0x15000000U, 630 | 0x04000000U, 0xc7000000U, 0x23000000U, 0xc3000000U, 631 | 0x18000000U, 0x96000000U, 0x05000000U, 0x9a000000U, 632 | 0x07000000U, 0x12000000U, 0x80000000U, 0xe2000000U, 633 | 0xeb000000U, 0x27000000U, 0xb2000000U, 0x75000000U, 634 | 0x09000000U, 0x83000000U, 0x2c000000U, 0x1a000000U, 635 | 0x1b000000U, 0x6e000000U, 0x5a000000U, 0xa0000000U, 636 | 0x52000000U, 0x3b000000U, 0xd6000000U, 0xb3000000U, 637 | 0x29000000U, 0xe3000000U, 0x2f000000U, 0x84000000U, 638 | 0x53000000U, 0xd1000000U, 0x00000000U, 0xed000000U, 639 | 0x20000000U, 0xfc000000U, 0xb1000000U, 0x5b000000U, 640 | 0x6a000000U, 0xcb000000U, 0xbe000000U, 0x39000000U, 641 | 0x4a000000U, 0x4c000000U, 0x58000000U, 0xcf000000U, 642 | 0xd0000000U, 0xef000000U, 0xaa000000U, 0xfb000000U, 643 | 0x43000000U, 0x4d000000U, 0x33000000U, 0x85000000U, 644 | 0x45000000U, 0xf9000000U, 0x02000000U, 0x7f000000U, 645 | 0x50000000U, 0x3c000000U, 0x9f000000U, 0xa8000000U, 646 | 0x51000000U, 0xa3000000U, 0x40000000U, 0x8f000000U, 647 | 0x92000000U, 0x9d000000U, 0x38000000U, 0xf5000000U, 648 | 0xbc000000U, 0xb6000000U, 0xda000000U, 0x21000000U, 649 | 0x10000000U, 0xff000000U, 0xf3000000U, 0xd2000000U, 650 | 0xcd000000U, 0x0c000000U, 0x13000000U, 0xec000000U, 651 | 0x5f000000U, 0x97000000U, 0x44000000U, 0x17000000U, 652 | 0xc4000000U, 0xa7000000U, 0x7e000000U, 0x3d000000U, 653 | 0x64000000U, 0x5d000000U, 0x19000000U, 0x73000000U, 654 | 0x60000000U, 0x81000000U, 0x4f000000U, 0xdc000000U, 655 | 0x22000000U, 0x2a000000U, 0x90000000U, 0x88000000U, 656 | 0x46000000U, 0xee000000U, 0xb8000000U, 0x14000000U, 657 | 0xde000000U, 0x5e000000U, 0x0b000000U, 0xdb000000U, 658 | 0xe0000000U, 0x32000000U, 0x3a000000U, 0x0a000000U, 659 | 
0x49000000U, 0x06000000U, 0x24000000U, 0x5c000000U, 660 | 0xc2000000U, 0xd3000000U, 0xac000000U, 0x62000000U, 661 | 0x91000000U, 0x95000000U, 0xe4000000U, 0x79000000U, 662 | 0xe7000000U, 0xc8000000U, 0x37000000U, 0x6d000000U, 663 | 0x8d000000U, 0xd5000000U, 0x4e000000U, 0xa9000000U, 664 | 0x6c000000U, 0x56000000U, 0xf4000000U, 0xea000000U, 665 | 0x65000000U, 0x7a000000U, 0xae000000U, 0x08000000U, 666 | 0xba000000U, 0x78000000U, 0x25000000U, 0x2e000000U, 667 | 0x1c000000U, 0xa6000000U, 0xb4000000U, 0xc6000000U, 668 | 0xe8000000U, 0xdd000000U, 0x74000000U, 0x1f000000U, 669 | 0x4b000000U, 0xbd000000U, 0x8b000000U, 0x8a000000U, 670 | 0x70000000U, 0x3e000000U, 0xb5000000U, 0x66000000U, 671 | 0x48000000U, 0x03000000U, 0xf6000000U, 0x0e000000U, 672 | 0x61000000U, 0x35000000U, 0x57000000U, 0xb9000000U, 673 | 0x86000000U, 0xc1000000U, 0x1d000000U, 0x9e000000U, 674 | 0xe1000000U, 0xf8000000U, 0x98000000U, 0x11000000U, 675 | 0x69000000U, 0xd9000000U, 0x8e000000U, 0x94000000U, 676 | 0x9b000000U, 0x1e000000U, 0x87000000U, 0xe9000000U, 677 | 0xce000000U, 0x55000000U, 0x28000000U, 0xdf000000U, 678 | 0x8c000000U, 0xa1000000U, 0x89000000U, 0x0d000000U, 679 | 0xbf000000U, 0xe6000000U, 0x42000000U, 0x68000000U, 680 | 0x41000000U, 0x99000000U, 0x2d000000U, 0x0f000000U, 681 | 0xb0000000U, 0x54000000U, 0xbb000000U, 0x16000000U, 682 | }; 683 | u32 RCON32[RCON_SIZE] = { 684 | 0x01000000, 0x02000000, 0x04000000, 0x08000000, 685 | 0x10000000, 0x20000000, 0x40000000, 0x80000000, 686 | 0x1B000000, 0x36000000, 0x6C000000, 0xD8000000, 687 | 0xAB000000, 0x4D000000, 0x9A000000 688 | }; 689 | 690 | // Small AES 691 | #define PROB_SIZE_1 16 692 | #define PROB_SIZE_2 256 693 | #define PROB_SIZE_3 4096 694 | #define PROB_SIZE_4 65536 695 | #define ROUND_5 5 696 | 697 | u16 T0_SML[16] = { 698 | 0xc66a, 0x6bbd, 0xa55f, 0x844c, 699 | 0x4226, 0xcef2, 0xe779, 0x4abe, 700 | 0x299b, 0xadd7, 0xeff1, 0x8cd4, 701 | 0x6335, 0x2113, 0x0000, 0x0898 702 | }; 703 | u16 T1_SML[16] = { 704 | 0xac66, 0xd6bb, 0xfa55, 
// Basic exhaustive search
// 4 Tables
//
// Brute-force key search: every thread encrypts the plaintext `pt` under
// `*range` consecutive candidate keys and prints any key whose ciphertext
// equals `ct`. Only key words 2 and 3 are advanced (as one 64-bit counter);
// words 0 and 1 are taken from `rk` unchanged.
//
// Launch layout: 1-D grid of 1-D blocks (only the .x components are used).
// Any block size works; the tables are staged with block-stride loops.
// Static shared memory: 5 * TABLE_SIZE + RCON_SIZE + U32_SIZE words.
//
// Parameters (all dereferenced on the device):
//   pt        plaintext, words 0..3 are read
//   ct        target ciphertext, U32_SIZE words are staged, words 0..3 compared
//             (assumes U32_SIZE >= 4 -- TODO confirm against AES_final.h)
//   rk        first candidate key, words 0..3 are read
//   t0G..t3G  round tables, TABLE_SIZE words each
//   t4G       table whose bytes are masked out for key schedule / last round
//   rconG     round constants, RCON_SIZE words
//   range     number of candidate keys each thread tries
__global__ void exhaustiveSearch(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t1G, u32* t2G, u32* t3G, u32* t4G, u32* rconG, u64* range) {

    // Widen before multiplying so the flat thread id cannot overflow 32 bits.
    const u64 threadIndex = (u64)blockIdx.x * blockDim.x + threadIdx.x;

    // Per-block copies of the lookup tables and the target ciphertext.
    __shared__ u32 t0S[TABLE_SIZE];
    __shared__ u32 t1S[TABLE_SIZE];
    __shared__ u32 t2S[TABLE_SIZE];
    __shared__ u32 t3S[TABLE_SIZE];
    __shared__ u32 t4S[TABLE_SIZE];
    __shared__ u32 rconS[RCON_SIZE];
    __shared__ u32 ctS[U32_SIZE];

    // Block-stride copies: every entry is loaded even when the block has
    // fewer than TABLE_SIZE threads.
    for (int i = threadIdx.x; i < TABLE_SIZE; i += blockDim.x) {
        t0S[i] = t0G[i];
        t1S[i] = t1G[i];
        t2S[i] = t2G[i];
        t3S[i] = t3G[i];
        t4S[i] = t4G[i];
    }
    for (int i = threadIdx.x; i < RCON_SIZE; i += blockDim.x) {
        rconS[i] = rconG[i];
    }
    for (int i = threadIdx.x; i < U32_SIZE; i += blockDim.x) {
        ctS[i] = ct[i];
    }

    // Wait until every thread is ready
    __syncthreads();

    const u32 rk0Init = rk[0];
    const u32 rk1Init = rk[1];

    const u32 pt0Init = pt[0];
    const u32 pt1Init = pt[1];
    const u32 pt2Init = pt[2];
    const u32 pt3Init = pt[3];

    // Treat key words 2..3 as one 64-bit counter and add this thread's
    // offset in 64-bit arithmetic, so the carry from word 3 into word 2 is
    // kept and the split is exactly at 2^32 (wraps modulo 2^64).
    const u64 threadRange = *range;
    const u64 keyLow = (((u64)rk[2] << 32) | (u64)rk[3]) + threadIndex * threadRange;
    u32 rk2Init = (u32)(keyLow >> 32);
    u32 rk3Init = (u32)(keyLow & 0xFFFFFFFFULL);

    for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        u32 rk0, rk1, rk2, rk3;
        rk0 = rk0Init;
        rk1 = rk1Init;
        rk2 = rk2Init;
        rk3 = rk3Init;

        // Create plaintext as 32 bit unsigned integers
        u32 s0, s1, s2, s3;
        s0 = pt0Init;
        s1 = pt1Init;
        s2 = pt2Init;
        s3 = pt3Init;

        // First round just XORs input with key.
        s0 = s0 ^ rk0;
        s1 = s1 ^ rk1;
        s2 = s2 ^ rk2;
        s3 = s3 ^ rk3;

        u32 t0, t1, t2, t3;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {

            // Calculate round key: the four t4S lookups take the bytes of
            // rk3 in rotated order and place one substituted byte in each
            // byte position, then the round constant is XORed in.
            u32 temp = rk3;
            rk0 = rk0 ^
                (t4S[(temp >> 16) & 0xff] & 0xff000000) ^
                (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^
                (t4S[(temp) & 0xff] & 0x0000ff00) ^
                (t4S[(temp >> 24)] & 0x000000ff) ^
                rconS[roundCount];
            rk1 = rk1 ^ rk0;
            rk2 = rk2 ^ rk1;
            rk3 = rk2 ^ rk3;

            // Table based round function
            t0 = t0S[s0 >> 24] ^ t1S[(s1 >> 16) & 0xFF] ^ t2S[(s2 >> 8) & 0xFF] ^ t3S[s3 & 0xFF] ^ rk0;
            t1 = t0S[s1 >> 24] ^ t1S[(s2 >> 16) & 0xFF] ^ t2S[(s3 >> 8) & 0xFF] ^ t3S[s0 & 0xFF] ^ rk1;
            t2 = t0S[s2 >> 24] ^ t1S[(s3 >> 16) & 0xFF] ^ t2S[(s0 >> 8) & 0xFF] ^ t3S[s1 & 0xFF] ^ rk2;
            t3 = t0S[s3 >> 24] ^ t1S[(s0 >> 16) & 0xFF] ^ t2S[(s1 >> 8) & 0xFF] ^ t3S[s2 & 0xFF] ^ rk3;

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;
        }

        // Calculate the last round key
        u32 temp = rk3;
        rk0 = rk0 ^
            (t4S[(temp >> 16) & 0xff] & 0xff000000) ^
            (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^
            (t4S[(temp) & 0xff] & 0x0000ff00) ^
            (t4S[(temp >> 24)] & 0x000000ff) ^
            rconS[ROUND_COUNT_MIN_1];

        // Last round uses s-box directly and XORs to produce output.
        // Output words are computed one at a time and compared immediately,
        // so the remaining round-key words and lookups are skipped as soon
        // as one word mismatches.
        s0 = (t4S[t0 >> 24] & 0xFF000000) ^ (t4S[(t1 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t2 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t3) & 0xFF] & 0x000000FF) ^ rk0;
        if (s0 == ctS[0]) {
            rk1 = rk1 ^ rk0;
            s1 = (t4S[t1 >> 24] & 0xFF000000) ^ (t4S[(t2 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t3 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t0) & 0xFF] & 0x000000FF) ^ rk1;
            if (s1 == ctS[1]) {
                rk2 = rk2 ^ rk1;
                s2 = (t4S[t2 >> 24] & 0xFF000000) ^ (t4S[(t3 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t0 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t1) & 0xFF] & 0x000000FF) ^ rk2;
                if (s2 == ctS[2]) {
                    rk3 = rk2 ^ rk3;
                    s3 = (t4S[t3 >> 24] & 0xFF000000) ^ (t4S[(t0 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t1 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t2) & 0xFF] & 0x000000FF) ^ rk3;
                    if (s3 == ctS[3]) {
                        printf("! Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init);
                        printf("-------------------------------\n");
                    }
                }
            }
        }

        // Next candidate key: increment word 3 and carry into word 2 when
        // it wraps around to zero.
        rk3Init++;
        if (rk3Init == 0) {
            rk2Init++;
        }
    }
}
201 | __syncthreads(); 202 | 203 | u32 rk0Init, rk1Init, rk2Init, rk3Init; 204 | rk0Init = rk[0]; 205 | rk1Init = rk[1]; 206 | rk2Init = rk[2]; 207 | rk3Init = rk[3]; 208 | 209 | u32 pt0Init, pt1Init, pt2Init, pt3Init; 210 | pt0Init = pt[0]; 211 | pt1Init = pt[1]; 212 | pt2Init = pt[2]; 213 | pt3Init = pt[3]; 214 | 215 | u64 threadRange = *range; 216 | u64 threadRangeStart = (u64)threadIndex * threadRange; 217 | rk2Init = rk2Init + threadRangeStart / MAX_U32; 218 | rk3Init = rk3Init + threadRangeStart % MAX_U32; 219 | 220 | for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) { 221 | 222 | u32 rk0, rk1, rk2, rk3; 223 | rk0 = rk0Init; 224 | rk1 = rk1Init; 225 | rk2 = rk2Init; 226 | rk3 = rk3Init; 227 | 228 | // Create plaintext as 32 bit unsigned integers 229 | u32 s0, s1, s2, s3; 230 | s0 = pt0Init; 231 | s1 = pt1Init; 232 | s2 = pt2Init; 233 | s3 = pt3Init; 234 | 235 | // First round just XORs input with key. 236 | s0 = s0 ^ rk0; 237 | s1 = s1 ^ rk1; 238 | s2 = s2 ^ rk2; 239 | s3 = s3 ^ rk3; 240 | 241 | u32 t0, t1, t2, t3; 242 | for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) { 243 | 244 | // Calculate round key 245 | u32 temp = rk3; 246 | rk0 = rk0 ^ 247 | (t4S[(temp >> 16) & 0xff] & 0xff000000) ^ 248 | (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^ 249 | (t4S[(temp) & 0xff] & 0x0000ff00) ^ 250 | (t4S[(temp >> 24)] & 0x000000ff) ^ 251 | rconS[roundCount]; 252 | rk1 = rk1 ^ rk0; 253 | rk2 = rk2 ^ rk1; 254 | rk3 = rk2 ^ rk3; 255 | 256 | // Table based round function 257 | t0 = t0S[s0 >> 24] ^ arithmeticRightShift(t0S[(s1 >> 16) & 0xFF], 8) ^ arithmeticRightShift(t0S[(s2 >> 8) & 0xFF], 16) ^ arithmeticRightShift(t0S[s3 & 0xFF], 24) ^ rk0; 258 | t1 = t0S[s1 >> 24] ^ arithmeticRightShift(t0S[(s2 >> 16) & 0xFF], 8) ^ arithmeticRightShift(t0S[(s3 >> 8) & 0xFF], 16) ^ arithmeticRightShift(t0S[s0 & 0xFF], 24) ^ rk1; 259 | t2 = t0S[s2 >> 24] ^ arithmeticRightShift(t0S[(s3 >> 16) & 0xFF], 8) ^ arithmeticRightShift(t0S[(s0 >> 8) & 0xFF], 16) ^ 
arithmeticRightShift(t0S[s1 & 0xFF], 24) ^ rk2; 260 | t3 = t0S[s3 >> 24] ^ arithmeticRightShift(t0S[(s0 >> 16) & 0xFF], 8) ^ arithmeticRightShift(t0S[(s1 >> 8) & 0xFF], 16) ^ arithmeticRightShift(t0S[s2 & 0xFF], 24) ^ rk3; 261 | 262 | s0 = t0; 263 | s1 = t1; 264 | s2 = t2; 265 | s3 = t3; 266 | 267 | } 268 | 269 | // Calculate the last round key 270 | u32 temp = rk3; 271 | rk0 = rk0 ^ 272 | (t4S[(temp >> 16) & 0xff] & 0xff000000) ^ 273 | (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^ 274 | (t4S[(temp) & 0xff] & 0x0000ff00) ^ 275 | (t4S[(temp >> 24)] & 0x000000ff) ^ 276 | rconS[ROUND_COUNT_MIN_1]; 277 | // Last round uses s-box directly and XORs to produce output. 278 | s0 = (t4S[t0 >> 24] & 0xFF000000) ^ (t4S[(t1 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t2 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t3) & 0xFF] & 0x000000FF) ^ rk0; 279 | if (s0 == ctS[0]) { 280 | rk1 = rk1 ^ rk0; 281 | s1 = (t4S[t1 >> 24] & 0xFF000000) ^ (t4S[(t2 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t3 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t0) & 0xFF] & 0x000000FF) ^ rk1; 282 | if (s1 == ctS[1]) { 283 | rk2 = rk2 ^ rk1; 284 | s2 = (t4S[t2 >> 24] & 0xFF000000) ^ (t4S[(t3 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t0 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t1) & 0xFF] & 0x000000FF) ^ rk2; 285 | if (s2 == ctS[2]) { 286 | rk3 = rk2 ^ rk3; 287 | s3 = (t4S[t3 >> 24] & 0xFF000000) ^ (t4S[(t0 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t1 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t2) & 0xFF] & 0x000000FF) ^ rk3; 288 | if (s3 == ctS[3]) { 289 | printf("! 
Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init); 290 | printf("-------------------------------\n"); 291 | } 292 | } 293 | } 294 | } 295 | 296 | // Overflow 297 | if (rk3Init == MAX_U32) { 298 | rk2Init++; 299 | } 300 | 301 | // Create key as 32 bit unsigned integers 302 | rk3Init++; 303 | } 304 | } 305 | 306 | // Exhaustive search with one table extended as 32 columns 307 | // 1 Table [256][32] -> arithmetic shift: 2 shift 1 and 308 | __global__ void exhaustiveSearchWithOneTableExtendedSharedMemory(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range) { 309 | 310 | int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; 311 | int warpThreadIndex = threadIdx.x & 31; 312 | 313 | // 314 | __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE]; 315 | __shared__ u32 t4S[TABLE_SIZE]; 316 | __shared__ u32 rconS[RCON_SIZE]; 317 | __shared__ u32 ctS[U32_SIZE]; 318 | 319 | if (threadIdx.x < TABLE_SIZE) { 320 | t4S[threadIdx.x] = t4G[threadIdx.x]; 321 | for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { 322 | t0S[threadIdx.x][bankIndex] = t0G[threadIdx.x]; 323 | } 324 | 325 | if (threadIdx.x < RCON_SIZE) { 326 | rconS[threadIdx.x] = rconG[threadIdx.x]; 327 | } 328 | 329 | if (threadIdx.x < U32_SIZE) { 330 | ctS[threadIdx.x] = ct[threadIdx.x]; 331 | } 332 | } 333 | // 334 | 335 | // Wait until every thread is ready 336 | __syncthreads(); 337 | 338 | u32 rk0Init, rk1Init, rk2Init, rk3Init; 339 | rk0Init = rk[0]; 340 | rk1Init = rk[1]; 341 | rk2Init = rk[2]; 342 | rk3Init = rk[3]; 343 | 344 | u32 pt0Init, pt1Init, pt2Init, pt3Init; 345 | pt0Init = pt[0]; 346 | pt1Init = pt[1]; 347 | pt2Init = pt[2]; 348 | pt3Init = pt[3]; 349 | 350 | u64 threadRange = *range; 351 | u64 threadRangeStart = (u64)threadIndex * threadRange; 352 | rk2Init = rk2Init + threadRangeStart / MAX_U32; 353 | rk3Init = rk3Init + threadRangeStart % MAX_U32; 354 | 355 | for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) { 356 | 357 | u32 
rk0, rk1, rk2, rk3; 358 | rk0 = rk0Init; 359 | rk1 = rk1Init; 360 | rk2 = rk2Init; 361 | rk3 = rk3Init; 362 | 363 | // Create plaintext as 32 bit unsigned integers 364 | u32 s0, s1, s2, s3; 365 | s0 = pt0Init; 366 | s1 = pt1Init; 367 | s2 = pt2Init; 368 | s3 = pt3Init; 369 | 370 | // First round just XORs input with key. 371 | s0 = s0 ^ rk0; 372 | s1 = s1 ^ rk1; 373 | s2 = s2 ^ rk2; 374 | s3 = s3 ^ rk3; 375 | 376 | u32 t0, t1, t2, t3; 377 | for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) { 378 | 379 | // Calculate round key 380 | u32 temp = rk3; 381 | rk0 = rk0 ^ 382 | (t4S[(temp >> 16) & 0xff] & 0xff000000) ^ 383 | (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^ 384 | (t4S[(temp) & 0xff] & 0x0000ff00) ^ 385 | (t4S[(temp >> 24)] & 0x000000ff) ^ 386 | rconS[roundCount]; 387 | rk1 = rk1 ^ rk0; 388 | rk2 = rk2 ^ rk1; 389 | rk3 = rk2 ^ rk3; 390 | 391 | // Table based round function 392 | t0 = t0S[s0 >> 24][warpThreadIndex] ^ arithmeticRightShift(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s3 & 0xFF][warpThreadIndex], 24) ^ rk0; 393 | t1 = t0S[s1 >> 24][warpThreadIndex] ^ arithmeticRightShift(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s0 & 0xFF][warpThreadIndex], 24) ^ rk1; 394 | t2 = t0S[s2 >> 24][warpThreadIndex] ^ arithmeticRightShift(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s1 & 0xFF][warpThreadIndex], 24) ^ rk2; 395 | t3 = t0S[s3 >> 24][warpThreadIndex] ^ arithmeticRightShift(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s2 & 0xFF][warpThreadIndex], 24) ^ rk3; 396 | 397 | s0 = t0; 398 | s1 = t1; 399 | s2 = t2; 400 | s3 = t3; 401 | 402 | } 403 | 404 | // Calculate the last round 
key 405 | u32 temp = rk3; 406 | rk0 = rk0 ^ 407 | (t4S[(temp >> 16) & 0xff] & 0xff000000) ^ 408 | (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^ 409 | (t4S[(temp) & 0xff] & 0x0000ff00) ^ 410 | (t4S[(temp >> 24)] & 0x000000ff) ^ 411 | rconS[ROUND_COUNT_MIN_1]; 412 | // Last round uses s-box directly and XORs to produce output. 413 | s0 = (t4S[t0 >> 24] & 0xFF000000) ^ (t4S[(t1 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t2 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t3) & 0xFF] & 0x000000FF) ^ rk0; 414 | if (s0 == ctS[0]) { 415 | rk1 = rk1 ^ rk0; 416 | s1 = (t4S[t1 >> 24] & 0xFF000000) ^ (t4S[(t2 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t3 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t0) & 0xFF] & 0x000000FF) ^ rk1; 417 | if (s1 == ctS[1]) { 418 | rk2 = rk2 ^ rk1; 419 | s2 = (t4S[t2 >> 24] & 0xFF000000) ^ (t4S[(t3 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t0 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t1) & 0xFF] & 0x000000FF) ^ rk2; 420 | if (s2 == ctS[2]) { 421 | rk3 = rk2 ^ rk3; 422 | s3 = (t4S[t3 >> 24] & 0xFF000000) ^ (t4S[(t0 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t1 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t2) & 0xFF] & 0x000000FF) ^ rk3; 423 | if (s3 == ctS[3]) { 424 | printf("! 
Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init); 425 | printf("-------------------------------\n"); 426 | } 427 | } 428 | } 429 | } 430 | 431 | // Overflow 432 | if (rk3Init == MAX_U32) { 433 | rk2Init++; 434 | } 435 | 436 | // Create key as 32 bit unsigned integers 437 | rk3Init++; 438 | } 439 | } 440 | 441 | // Exhaustive search with one table extended as 32 columns 442 | // 1 Table [256][32] -> arithmetic shift: __byte_perm function 443 | __global__ void exhaustiveSearchWithOneTableExtendedSharedMemoryBytePerm(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range) { 444 | 445 | int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; 446 | int warpThreadIndex = threadIdx.x & 31; 447 | 448 | // 449 | __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE]; 450 | __shared__ u32 t4S[TABLE_SIZE]; 451 | __shared__ u32 rconS[RCON_SIZE]; 452 | __shared__ u32 ctS[U32_SIZE]; 453 | 454 | if (threadIdx.x < TABLE_SIZE) { 455 | t4S[threadIdx.x] = t4G[threadIdx.x]; 456 | for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { 457 | t0S[threadIdx.x][bankIndex] = t0G[threadIdx.x]; 458 | } 459 | 460 | if (threadIdx.x < RCON_SIZE) { 461 | rconS[threadIdx.x] = rconG[threadIdx.x]; 462 | } 463 | 464 | if (threadIdx.x < U32_SIZE) { 465 | ctS[threadIdx.x] = ct[threadIdx.x]; 466 | } 467 | } 468 | // 469 | 470 | // Wait until every thread is ready 471 | __syncthreads(); 472 | 473 | u32 rk0Init, rk1Init, rk2Init, rk3Init; 474 | rk0Init = rk[0]; 475 | rk1Init = rk[1]; 476 | rk2Init = rk[2]; 477 | rk3Init = rk[3]; 478 | 479 | u32 pt0Init, pt1Init, pt2Init, pt3Init; 480 | pt0Init = pt[0]; 481 | pt1Init = pt[1]; 482 | pt2Init = pt[2]; 483 | pt3Init = pt[3]; 484 | 485 | u64 threadRange = *range; 486 | u64 threadRangeStart = (u64)threadIndex * threadRange; 487 | rk2Init = rk2Init + threadRangeStart / MAX_U32; 488 | rk3Init = rk3Init + threadRangeStart % MAX_U32; 489 | 490 | for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) { 
491 | 492 | u32 rk0, rk1, rk2, rk3; 493 | rk0 = rk0Init; 494 | rk1 = rk1Init; 495 | rk2 = rk2Init; 496 | rk3 = rk3Init; 497 | 498 | // Create plaintext as 32 bit unsigned integers 499 | u32 s0, s1, s2, s3; 500 | s0 = pt0Init; 501 | s1 = pt1Init; 502 | s2 = pt2Init; 503 | s3 = pt3Init; 504 | 505 | // First round just XORs input with key. 506 | s0 = s0 ^ rk0; 507 | s1 = s1 ^ rk1; 508 | s2 = s2 ^ rk2; 509 | s3 = s3 ^ rk3; 510 | 511 | u32 t0, t1, t2, t3; 512 | for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) { 513 | 514 | // Calculate round key 515 | u32 temp = rk3; 516 | rk0 = rk0 ^ 517 | (t4S[(temp >> 16) & 0xff] & 0xff000000) ^ 518 | (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^ 519 | (t4S[(temp) & 0xff] & 0x0000ff00) ^ 520 | (t4S[(temp >> 24)] & 0x000000ff) ^ 521 | rconS[roundCount]; 522 | rk1 = rk1 ^ rk0; 523 | rk2 = rk2 ^ rk1; 524 | rk3 = rk2 ^ rk3; 525 | 526 | // Table based round function 527 | t0 = t0S[s0 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk0; 528 | t1 = t0S[s1 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk1; 529 | t2 = t0S[s2 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk2; 530 | t3 = t0S[s3 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) 
& 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk3; 531 | 532 | s0 = t0; 533 | s1 = t1; 534 | s2 = t2; 535 | s3 = t3; 536 | 537 | } 538 | 539 | // Calculate the last round key 540 | u32 temp = rk3; 541 | rk0 = rk0 ^ 542 | (t4S[(temp >> 16) & 0xff] & 0xff000000) ^ 543 | (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^ 544 | (t4S[(temp) & 0xff] & 0x0000ff00) ^ 545 | (t4S[(temp >> 24)] & 0x000000ff) ^ 546 | rconS[ROUND_COUNT_MIN_1]; 547 | // Last round uses s-box directly and XORs to produce output. 548 | s0 = (t4S[t0 >> 24] & 0xFF000000) ^ (t4S[(t1 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t2 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t3) & 0xFF] & 0x000000FF) ^ rk0; 549 | if (s0 == ctS[0]) { 550 | rk1 = rk1 ^ rk0; 551 | s1 = (t4S[t1 >> 24] & 0xFF000000) ^ (t4S[(t2 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t3 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t0) & 0xFF] & 0x000000FF) ^ rk1; 552 | if (s1 == ctS[1]) { 553 | rk2 = rk2 ^ rk1; 554 | s2 = (t4S[t2 >> 24] & 0xFF000000) ^ (t4S[(t3 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t0 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t1) & 0xFF] & 0x000000FF) ^ rk2; 555 | if (s2 == ctS[2]) { 556 | rk3 = rk2 ^ rk3; 557 | s3 = (t4S[t3 >> 24] & 0xFF000000) ^ (t4S[(t0 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t1 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t2) & 0xFF] & 0x000000FF) ^ rk3; 558 | if (s3 == ctS[3]) { 559 | printf("! 
Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init); 560 | printf("-------------------------------\n"); 561 | } 562 | } 563 | } 564 | } 565 | 566 | // Overflow 567 | if (rk3Init == MAX_U32) { 568 | rk2Init++; 569 | } 570 | 571 | // Create key as 32 bit unsigned integers 572 | rk3Init++; 573 | } 574 | } 575 | 576 | // Exhaustive search with one table extended as 32 columns 577 | // 1 Table [256][32] -> arithmetic shift: __byte_perm function 578 | // SBox[256] is partly expanded 579 | __global__ void exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range) { 580 | 581 | int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; 582 | int warpThreadIndex = threadIdx.x & 31; 583 | int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE; 584 | 585 | // 586 | __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE]; 587 | __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE]; 588 | __shared__ u32 rconS[RCON_SIZE]; 589 | __shared__ u32 ctS[U32_SIZE]; 590 | 591 | if (threadIdx.x < TABLE_SIZE) { 592 | for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { 593 | t0S[threadIdx.x][bankIndex] = t0G[threadIdx.x]; 594 | } 595 | 596 | for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) { 597 | t4S[threadIdx.x][bankIndex] = t4G[threadIdx.x]; 598 | } 599 | 600 | if (threadIdx.x < RCON_SIZE) { 601 | rconS[threadIdx.x] = rconG[threadIdx.x]; 602 | } 603 | 604 | if (threadIdx.x < U32_SIZE) { 605 | ctS[threadIdx.x] = ct[threadIdx.x]; 606 | } 607 | } 608 | // 609 | 610 | // Wait until every thread is ready 611 | __syncthreads(); 612 | 613 | u32 rk0Init, rk1Init, rk2Init, rk3Init; 614 | rk0Init = rk[0]; 615 | rk1Init = rk[1]; 616 | rk2Init = rk[2]; 617 | rk3Init = rk[3]; 618 | 619 | u32 pt0Init, pt1Init, pt2Init, pt3Init; 620 | pt0Init = pt[0]; 621 | pt1Init = pt[1]; 622 | pt2Init = pt[2]; 623 | pt3Init = pt[3]; 624 | 625 | u64 threadRange = *range; 626 | u64 
threadRangeStart = (u64)threadIndex * threadRange; 627 | rk2Init = rk2Init + threadRangeStart / MAX_U32; 628 | rk3Init = rk3Init + threadRangeStart % MAX_U32; 629 | 630 | for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) { 631 | 632 | u32 rk0, rk1, rk2, rk3; 633 | rk0 = rk0Init; 634 | rk1 = rk1Init; 635 | rk2 = rk2Init; 636 | rk3 = rk3Init; 637 | 638 | // Create plaintext as 32 bit unsigned integers 639 | u32 s0, s1, s2, s3; 640 | s0 = pt0Init; 641 | s1 = pt1Init; 642 | s2 = pt2Init; 643 | s3 = pt3Init; 644 | 645 | // First round just XORs input with key. 646 | s0 = s0 ^ rk0; 647 | s1 = s1 ^ rk1; 648 | s2 = s2 ^ rk2; 649 | s3 = s3 ^ rk3; 650 | 651 | u32 t0, t1, t2, t3; 652 | for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) { 653 | 654 | // Calculate round key 655 | u32 temp = rk3; 656 | rk0 = rk0 ^ 657 | (t4S[(temp >> 16) & 0xff][warpThreadIndexSBox] & 0xff000000) ^ 658 | (t4S[(temp >> 8) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^ 659 | (t4S[(temp) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^ 660 | (t4S[(temp >> 24)][warpThreadIndexSBox] & 0x000000ff) ^ 661 | rconS[roundCount]; 662 | rk1 = rk1 ^ rk0; 663 | rk2 = rk2 ^ rk1; 664 | rk3 = rk2 ^ rk3; 665 | 666 | // Table based round function 667 | t0 = t0S[s0 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk0; 668 | t1 = t0S[s1 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk1; 669 | t2 = t0S[s2 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ 
arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk2; 670 | t3 = t0S[s3 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk3; 671 | 672 | s0 = t0; 673 | s1 = t1; 674 | s2 = t2; 675 | s3 = t3; 676 | 677 | } 678 | 679 | // Calculate the last round key 680 | u32 temp = rk3; 681 | rk0 = rk0 ^ 682 | (t4S[(temp >> 16) & 0xff][warpThreadIndexSBox] & 0xff000000) ^ 683 | (t4S[(temp >> 8) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^ 684 | (t4S[(temp) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^ 685 | (t4S[(temp >> 24)][warpThreadIndexSBox] & 0x000000ff) ^ 686 | rconS[ROUND_COUNT_MIN_1]; 687 | // Last round uses s-box directly and XORs to produce output. 688 | s0 = (t4S[t0 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t1 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t2 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t3) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rk0; 689 | if (s0 == ctS[0]) { 690 | rk1 = rk1 ^ rk0; 691 | s1 = (t4S[t1 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t2 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t3 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t0) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rk1; 692 | if (s1 == ctS[1]) { 693 | rk2 = rk2 ^ rk1; 694 | s2 = (t4S[t2 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t3 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t0 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t1) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rk2; 695 | if (s2 == ctS[2]) { 696 | rk3 = rk2 ^ rk3; 697 | s3 = (t4S[t3 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t0 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ 
(t4S[(t1 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t2) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rk3; 698 | if (s3 == ctS[3]) { 699 | printf("! Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init); 700 | printf("-------------------------------\n"); 701 | } 702 | } 703 | } 704 | } 705 | 706 | // Overflow 707 | if (rk3Init == MAX_U32) { 708 | rk2Init++; 709 | } 710 | 711 | // Create key as 32 bit unsigned integers 712 | rk3Init++; 713 | } 714 | } 715 | __global__ void exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range, u8 *SAES) { 716 | u64 threadIndex = blockIdx.x * blockDim.x + threadIdx.x; 717 | int warpThreadIndex = threadIdx.x & 31; 718 | // int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE; 719 | // 720 | __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE]; 721 | __shared__ u32 rconS[RCON_SIZE]; 722 | __shared__ u32 ctS[U32_SIZE]; 723 | // __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE]; 724 | __shared__ u8 Sbox[64][32][4]; 725 | 726 | if (threadIdx.x < TABLE_SIZE) { 727 | for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { 728 | t0S[threadIdx.x][bankIndex] = t0G[threadIdx.x]; 729 | Sbox[threadIdx.x / 4][bankIndex][threadIdx.x % 4] = SAES[threadIdx.x]; 730 | } 731 | // for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) { t4S[threadIdx.x][bankIndex] = t4G[threadIdx.x]; } 732 | // for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { Sbox[threadIdx.x/4][bankIndex][threadIdx.x % 4] = SAES[threadIdx.x]; } 733 | if (threadIdx.x < U32_SIZE) { ctS[threadIdx.x] = ct[threadIdx.x]; } 734 | if (threadIdx.x < RCON_SIZE) { rconS[threadIdx.x] = rconG[threadIdx.x]; } 735 | 736 | } // 737 | __syncthreads(); // Wait until every thread is ready 738 | u32 rk0Init, rk1Init, rk2Init, rk3Init; 739 | rk0Init = rk[0]; rk1Init = rk[1]; rk2Init = rk[2]; rk3Init = rk[3]; 740 | u32 pt0Init, 
pt1Init, pt2Init, pt3Init; 741 | pt0Init = pt[0]; pt1Init = pt[1]; pt2Init = pt[2]; pt3Init = pt[3]; 742 | u64 threadRange = *range; 743 | u64 threadRangeStart = threadIndex * threadRange; 744 | rk2Init = rk2Init + threadRangeStart / (u64)MAX_U32; 745 | rk3Init = rk3Init + threadRangeStart % (u64)MAX_U32; 746 | for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) { 747 | u32 rk0, rk1, rk2, rk3; 748 | rk0 = rk0Init; rk1 = rk1Init; rk2 = rk2Init; rk3 = rk3Init; 749 | // Create plaintext as 32 bit unsigned integers 750 | u32 s0, s1, s2, s3; 751 | s0 = pt0Init; s1 = pt1Init; s2 = pt2Init; s3 = pt3Init; 752 | // First round just XORs input with key. 753 | s0 = s0 ^ rk0; s1 = s1 ^ rk1; s2 = s2 ^ rk2; s3 = s3 ^ rk3; 754 | u32 t0, t1, t2, t3; 755 | for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) { 756 | // Calculate round key 757 | u32 temp = rk3; 758 | rk0 = rk0 ^ 759 | arithmeticRightShiftBytePerm((u32)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16)) % 4], SHIFT_1_RIGHT) ^ 760 | arithmeticRightShiftBytePerm((u32)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8)) % 4], SHIFT_2_RIGHT) ^ 761 | arithmeticRightShiftBytePerm((u32)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp)) % 4], SHIFT_3_RIGHT) ^ 762 | ((u32)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^ 763 | rconS[roundCount]; 764 | rk1 = rk1 ^ rk0; rk2 = rk2 ^ rk1; rk3 = rk2 ^ rk3; 765 | // Table based round function 766 | t0 = t0S[s0 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk0; 767 | t1 = t0S[s1 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ 
arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk1; 768 | t2 = t0S[s2 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk2; 769 | t3 = t0S[s3 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk3; 770 | s0 = t0; s1 = t1; s2 = t2; s3 = t3; 771 | } 772 | // Calculate the last round key 773 | u32 temp = rk3; 774 | rk0 = rk0 ^ 775 | arithmeticRightShiftBytePerm((u32)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16)) % 4], SHIFT_1_RIGHT) ^ 776 | arithmeticRightShiftBytePerm((u32)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8)) % 4], SHIFT_2_RIGHT) ^ 777 | arithmeticRightShiftBytePerm((u32)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp)) % 4], SHIFT_3_RIGHT) ^ 778 | ((u32)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^ 779 | rconS[ROUND_COUNT_MIN_1]; 780 | // Last round uses s-box directly and XORs to produce output. 
781 | s0 = arithmeticRightShiftBytePerm((u32)Sbox[((t0 >> 24)) / 4][warpThreadIndex][((t0 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t1 >> 16) & 0xff) / 4][warpThreadIndex][((t1 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t2 >> 8) &0xFF) / 4][warpThreadIndex][((t2 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u32)Sbox[((t3 & 0xFF) / 4)][warpThreadIndex][((t3 & 0xFF) % 4)]) ^ rk0; 782 | if (s0 == ctS[0]) { 783 | rk1 = rk1 ^ rk0; 784 | s1 = arithmeticRightShiftBytePerm((u32)Sbox[((t1 >> 24)) / 4][warpThreadIndex][((t1 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t2 >> 16) & 0xff) / 4][warpThreadIndex][((t2 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t3 >> 8) & 0xFF) / 4][warpThreadIndex][((t3 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u32)Sbox[((t0 & 0xFF) / 4)][warpThreadIndex][((t0 & 0xFF) % 4)]) ^ rk1; 785 | if (s1 == ctS[1]) { 786 | rk2 = rk2 ^ rk1; 787 | s2 = arithmeticRightShiftBytePerm((u32)Sbox[((t2 >> 24)) / 4][warpThreadIndex][((t2 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t3 >> 16) & 0xff) / 4][warpThreadIndex][((t3 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t0 >> 8) & 0xFF) / 4][warpThreadIndex][((t0 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u32)Sbox[((t1 & 0xFF) / 4)][warpThreadIndex][((t1 & 0xFF) % 4)]) ^ rk2; 788 | if (s2 == ctS[2]) { 789 | rk3 = rk2 ^ rk3; 790 | s3 = arithmeticRightShiftBytePerm((u32)Sbox[((t3 >> 24)) / 4][warpThreadIndex][((t3 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t0 >> 16) & 0xff) / 4][warpThreadIndex][((t0 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t1 >> 8) & 0xFF) / 4][warpThreadIndex][((t1 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u32)Sbox[((t2 & 0xFF) / 4)][warpThreadIndex][((t2 & 0xFF) % 4)]) ^ rk3; 791 | if (s3 == ctS[3]) { 792 | printf("! 
Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init); 793 | printf("-------------------------------\n"); 794 | } 795 | } 796 | } 797 | } 798 | // Overflow 799 | if (rk3Init == MAX_U32) { rk2Init++; } 800 | rk3Init++; // Create key as 32 bit unsigned integers 801 | } 802 | } 803 | __global__ void exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir2(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range, u8* SAES) { 804 | u64 threadIndex = blockIdx.x * blockDim.x + threadIdx.x; 805 | int warpThreadIndex = threadIdx.x & 31; 806 | // int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE; 807 | // 808 | __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE]; 809 | __shared__ u32 rconS[RCON_SIZE]; 810 | __shared__ u32 ctS[U32_SIZE]; 811 | // __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE]; 812 | __shared__ u8 Sbox[64][32][4]; 813 | 814 | if (threadIdx.x < TABLE_SIZE) { 815 | for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { 816 | t0S[threadIdx.x][bankIndex] = t0G[threadIdx.x]; 817 | Sbox[threadIdx.x / 4][bankIndex][threadIdx.x % 4] = SAES[threadIdx.x]; 818 | } 819 | // for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) { t4S[threadIdx.x][bankIndex] = t4G[threadIdx.x]; } 820 | // for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { Sbox[threadIdx.x/4][bankIndex][threadIdx.x % 4] = SAES[threadIdx.x]; } 821 | if (threadIdx.x < U32_SIZE) { ctS[threadIdx.x] = ct[threadIdx.x]; } 822 | if (threadIdx.x < RCON_SIZE) { rconS[threadIdx.x] = rconG[threadIdx.x]; } 823 | 824 | } // 825 | __syncthreads(); // Wait until every thread is ready 826 | u32 rk0Init, rk1Init, rk2Init, rk3Init; 827 | rk0Init = rk[0]; rk1Init = rk[1]; rk2Init = rk[2]; rk3Init = rk[3]; 828 | u32 pt0Init, pt1Init, pt2Init, pt3Init; 829 | pt0Init = pt[0]; pt1Init = pt[1]; pt2Init = pt[2]; pt3Init = pt[3]; 830 | u64 threadRange = *range; 831 | u64 threadRangeStart = threadIndex * threadRange; 
832 | rk2Init = rk2Init + threadRangeStart / (u64)MAX_U32; 833 | rk3Init = rk3Init + threadRangeStart % (u64)MAX_U32; 834 | for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) { 835 | u32 rk0, rk1, rk2, rk3; 836 | rk0 = rk0Init; rk1 = rk1Init; rk2 = rk2Init; rk3 = rk3Init; 837 | // Create plaintext as 32 bit unsigned integers 838 | u32 s0, s1, s2, s3; 839 | s0 = pt0Init; s1 = pt1Init; s2 = pt2Init; s3 = pt3Init; 840 | // First round just XORs input with key. 841 | s0 = s0 ^ rk0; s1 = s1 ^ rk1; s2 = s2 ^ rk2; s3 = s3 ^ rk3; 842 | u32 t0, t1, t2, t3; 843 | for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) { 844 | // Calculate round key 845 | u32 temp = rk3; 846 | rk0 = rk0 ^ 847 | arithmeticRightShift((u32)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16)) % 4], 8) ^ 848 | arithmeticRightShift((u32)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8)) % 4], 16) ^ 849 | arithmeticRightShift((u32)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp)) % 4], 24) ^ 850 | ((u32)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^ 851 | rconS[roundCount]; 852 | rk1 = rk1 ^ rk0; rk2 = rk2 ^ rk1; rk3 = rk2 ^ rk3; 853 | // Table based round function 854 | t0 = t0S[s0 >> 24][warpThreadIndex] ^ arithmeticRightShift(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s3 & 0xFF][warpThreadIndex], 24) ^ rk0; 855 | t1 = t0S[s1 >> 24][warpThreadIndex] ^ arithmeticRightShift(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s0 & 0xFF][warpThreadIndex], 24) ^ rk1; 856 | t2 = t0S[s2 >> 24][warpThreadIndex] ^ arithmeticRightShift(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s1 & 0xFF][warpThreadIndex], 24) ^ rk2; 857 | t3 = t0S[s3 >> 24][warpThreadIndex] ^ 
arithmeticRightShift(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s2 & 0xFF][warpThreadIndex], 24) ^ rk3; 858 | s0 = t0; s1 = t1; s2 = t2; s3 = t3; 859 | } 860 | // Calculate the last round key 861 | u32 temp = rk3; 862 | rk0 = rk0 ^ 863 | arithmeticRightShift((u32)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16)) % 4], 8) ^ 864 | arithmeticRightShift((u32)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8)) % 4], 16) ^ 865 | arithmeticRightShift((u32)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp)) % 4], 24) ^ 866 | ((u32)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^ 867 | rconS[ROUND_COUNT_MIN_1]; 868 | // Last round uses s-box directly and XORs to produce output. 869 | s0 = arithmeticRightShift((u32)Sbox[((t0 >> 24)) / 4][warpThreadIndex][((t0 >> 24)) % 4], 8) ^ arithmeticRightShift((u32)Sbox[((t1 >> 16) & 0xff) / 4][warpThreadIndex][((t1 >> 16)) % 4], 16) ^ arithmeticRightShift((u32)Sbox[((t2 >> 8) & 0xFF) / 4][warpThreadIndex][((t2 >> 8)) % 4], 24) ^ ((u32)Sbox[((t3 & 0xFF) / 4)][warpThreadIndex][((t3 & 0xFF) % 4)]) ^ rk0; 870 | if (s0 == ctS[0]) { 871 | rk1 = rk1 ^ rk0; 872 | s1 = arithmeticRightShift((u32)Sbox[((t1 >> 24)) / 4][warpThreadIndex][((t1 >> 24)) % 4], 8) ^ arithmeticRightShift((u32)Sbox[((t2 >> 16) & 0xff) / 4][warpThreadIndex][((t2 >> 16)) % 4], 16) ^ arithmeticRightShift((u32)Sbox[((t3 >> 8) & 0xFF) / 4][warpThreadIndex][((t3 >> 8)) % 4], 24) ^ ((u32)Sbox[((t0 & 0xFF) / 4)][warpThreadIndex][((t0 & 0xFF) % 4)]) ^ rk1; 873 | if (s1 == ctS[1]) { 874 | rk2 = rk2 ^ rk1; 875 | s2 = arithmeticRightShift((u32)Sbox[((t2 >> 24)) / 4][warpThreadIndex][((t2 >> 24)) % 4], 8) ^ arithmeticRightShift((u32)Sbox[((t3 >> 16) & 0xff) / 4][warpThreadIndex][((t3 >> 16)) % 4], 16) ^ arithmeticRightShift((u32)Sbox[((t0 >> 8) & 0xFF) / 4][warpThreadIndex][((t0 >> 8)) % 4], 24) ^ ((u32)Sbox[((t1 & 0xFF) / 4)][warpThreadIndex][((t1 & 
0xFF) % 4)]) ^ rk2; 876 | if (s2 == ctS[2]) { 877 | rk3 = rk2 ^ rk3; 878 | s3 = arithmeticRightShift((u32)Sbox[((t3 >> 24)) / 4][warpThreadIndex][((t3 >> 24)) % 4], 8) ^ arithmeticRightShift((u32)Sbox[((t0 >> 16) & 0xff) / 4][warpThreadIndex][((t0 >> 16)) % 4], 16) ^ arithmeticRightShift((u32)Sbox[((t1 >> 8) & 0xFF) / 4][warpThreadIndex][((t1 >> 8)) % 4], 24) ^ ((u32)Sbox[((t2 & 0xFF) / 4)][warpThreadIndex][((t2 & 0xFF) % 4)]) ^ rk3; 879 | if (s3 == ctS[3]) { 880 | printf("! Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init); 881 | printf("-------------------------------\n"); 882 | } 883 | } 884 | } 885 | } 886 | // Overflow 887 | if (rk3Init == MAX_U32) { rk2Init++; } 888 | rk3Init++; // Create key as 32 bit unsigned integers 889 | } 890 | } 891 | /*__global__ void exhaustiveSearchCem(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t1G, u32* t4G, u32* rconG, u64* range, u8* SAES) { 892 | int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; 893 | int warpThreadIndex = threadIdx.x & 31; 894 | // int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE; 895 | // 896 | __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE]; 897 | __shared__ u32 t1S[TABLE_SIZE][SHARED_MEM_BANK_SIZE]; 898 | // __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE]; 899 | __shared__ u8 Sbox[64][32][4]; 900 | __shared__ u32 rconS[RCON_SIZE]; 901 | __shared__ u32 ctS[U32_SIZE]; 902 | if (threadIdx.x < TABLE_SIZE) { 903 | for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { t0S[threadIdx.x][bankIndex] = t0G[threadIdx.x]; } 904 | for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { t1S[threadIdx.x][bankIndex] = t1G[threadIdx.x]; } 905 | // for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) { t4S[threadIdx.x][bankIndex] = t4G[threadIdx.x]; } 906 | for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) { Sbox[threadIdx.x / 4][bankIndex][threadIdx.x % 4] = SAES[threadIdx.x]; } 907 | if (threadIdx.x < RCON_SIZE) { 
rconS[threadIdx.x] = rconG[threadIdx.x]; } 908 | if (threadIdx.x < U32_SIZE) { ctS[threadIdx.x] = ct[threadIdx.x]; } 909 | } // 910 | __syncthreads(); // Wait until every thread is ready 911 | u32 rk0Init, rk1Init, rk2Init, rk3Init; 912 | rk0Init = rk[0]; rk1Init = rk[1]; rk2Init = rk[2]; rk3Init = rk[3]; 913 | u32 pt0Init, pt1Init, pt2Init, pt3Init; 914 | pt0Init = pt[0]; pt1Init = pt[1]; pt2Init = pt[2]; pt3Init = pt[3]; 915 | u64 threadRange = *range; 916 | u64 threadRangeStart = (u64)threadIndex * threadRange; 917 | rk2Init = rk2Init + threadRangeStart / MAX_U32; 918 | rk3Init = rk3Init + threadRangeStart % MAX_U32; 919 | for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) { 920 | u32 rk0, rk1, rk2, rk3; 921 | rk0 = rk0Init; rk1 = rk1Init; rk2 = rk2Init; rk3 = rk3Init; 922 | // Create plaintext as 32 bit unsigned integers 923 | u32 s0, s1, s2, s3; 924 | s0 = pt0Init; s1 = pt1Init; s2 = pt2Init; s3 = pt3Init; 925 | // First round just XORs input with key. 926 | s0 = s0 ^ rk0; s1 = s1 ^ rk1; s2 = s2 ^ rk2; s3 = s3 ^ rk3; 927 | u32 t0, t1, t2, t3; 928 | for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) { 929 | // Calculate round key 930 | u32 temp = rk3; 931 | rk0 = rk0 ^ 932 | arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16) & 0xff) % 4], SHIFT_1_RIGHT) ^ 933 | arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8) & 0xff) % 4], SHIFT_2_RIGHT) ^ 934 | arithmeticRightShiftBytePerm((u64)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp) & 0xff) % 4], SHIFT_3_RIGHT) ^ 935 | ((u64)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^ 936 | rconS[roundCount]; 937 | 938 | rk1 = rk1 ^ rk0; rk2 = rk2 ^ rk1; rk3 = rk2 ^ rk3; 939 | // Table based round function 940 | t0 = t0S[s0 >> 24][warpThreadIndex] ^ t1S[(s1 >> 16) & 0xFF][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ 
arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk0; 941 | t1 = t0S[s1 >> 24][warpThreadIndex] ^ t1S[(s2 >> 16) & 0xFF][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk1; 942 | t2 = t0S[s2 >> 24][warpThreadIndex] ^ t1S[(s3 >> 16) & 0xFF][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk2; 943 | t3 = t0S[s3 >> 24][warpThreadIndex] ^ t1S[(s0 >> 16) & 0xFF][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk3; 944 | s0 = t0; s1 = t1; s2 = t2; s3 = t3; 945 | } 946 | // Calculate the last round key 947 | u32 temp = rk3; 948 | rk0 = rk0 ^ 949 | arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16)) % 4], SHIFT_1_RIGHT) ^ 950 | arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8)) % 4], SHIFT_2_RIGHT) ^ 951 | arithmeticRightShiftBytePerm((u64)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp)) % 4], SHIFT_3_RIGHT) ^ 952 | ((u64)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^ 953 | rconS[ROUND_COUNT_MIN_1]; 954 | // Last round uses s-box directly and XORs to produce output. 
955 | s0 = arithmeticRightShiftBytePerm((u64)Sbox[((t0 >> 24)) / 4][warpThreadIndex][((t0 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t1 >> 16) & 0xff) / 4][warpThreadIndex][((t1 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t2 >> 8) & 0xFF) / 4][warpThreadIndex][((t2 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u64)Sbox[((t3 & 0xFF) / 4)][warpThreadIndex][((t3 & 0xFF) % 4)]) ^ rk0; 956 | if (s0 == ctS[0]) { 957 | rk1 = rk1 ^ rk0; 958 | s1 = arithmeticRightShiftBytePerm((u64)Sbox[((t1 >> 24)) / 4][warpThreadIndex][((t1 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t2 >> 16) & 0xff) / 4][warpThreadIndex][((t2 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t3 >> 8) & 0xFF) / 4][warpThreadIndex][((t3 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u64)Sbox[((t0 & 0xFF) / 4)][warpThreadIndex][((t0 & 0xFF) % 4)]) ^ rk1; 959 | if (s1 == ctS[1]) { 960 | rk2 = rk2 ^ rk1; 961 | s2 = arithmeticRightShiftBytePerm((u64)Sbox[((t2 >> 24)) / 4][warpThreadIndex][((t2 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t3 >> 16) & 0xff) / 4][warpThreadIndex][((t3 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t0 >> 8) & 0xFF) / 4][warpThreadIndex][((t0 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u64)Sbox[((t1 & 0xFF) / 4)][warpThreadIndex][((t1 & 0xFF) % 4)]) ^ rk2; 962 | if (s2 == ctS[2]) { 963 | rk3 = rk2 ^ rk3; 964 | s3 = arithmeticRightShiftBytePerm((u64)Sbox[((t3 >> 24)) / 4][warpThreadIndex][((t3 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t0 >> 16) & 0xff) / 4][warpThreadIndex][((t0 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t1 >> 8) & 0xFF) / 4][warpThreadIndex][((t1 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u64)Sbox[((t2 & 0xFF) / 4)][warpThreadIndex][((t2 & 0xFF) % 4)]) ^ rk3; 965 | if (s3 == ctS[3]) { 966 | printf("! 
Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init);
						printf("-------------------------------\n");
					}
				}
			}
		}
		// Overflow
		if (rk3Init == MAX_U32) { rk2Init++; }
		rk3Init++; // Create key as 32 bit unsigned integers
	}
}*/


// Exhaustive search with one table extended as 32 columns
// 1 Table [256][32] -> arithmetic shift: __byte_perm function
// 4 S-box, each shifted
//
// AES-128 exhaustive key search. T0 is replicated in shared memory, one copy per
// column, and each thread reads only column (threadIdx.x & 31). The four tables
// t4_0..t4_3 are stored once each (not replicated) and are used for the key schedule
// and for the last round without any further shifting.
// NOTE(review): t4_k presumably holds the S-box byte pre-positioned in a different
// byte lane for each k - confirm against the host tables T4_0..T4_3.
//
// Parameters (all must be device-accessible):
//   pt, ct        - 4 words each: the known plaintext block and the ciphertext to match
//   rk            - 4 words: first candidate key; candidates count upwards in (rk[2], rk[3])
//   t0G           - TABLE_SIZE-entry T0 table
//   t4_0G..t4_3G  - TABLE_SIZE-entry S-box tables
//   rconG         - RCON_SIZE key-schedule round constants
//   range         - number of consecutive candidate keys every thread tests
//
// Thread i tests the `*range` keys starting at (initial key + i * *range), where the
// offset is added to the 64-bit value formed by rk[2] (high) and rk[3] (low).
// A matching key is reported with device-side printf; nothing is written back.
//
// Launch: 1-D grid of 1-D blocks. Any block size works because the shared tables are
// filled with a block-stride loop. All shared memory is statically allocated.
__global__ void exhaustiveSearchWithOneTableExtendedSharedMemoryBytePerm4ShiftedSbox(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4_0G, u32* t4_1G, u32* t4_2G, u32* t4_3G, u32* rconG, u64* range) {

	// 64-bit flat index: a signed 32-bit index overflows once the grid exceeds 2^31 threads.
	u64 threadIndex = (u64)blockIdx.x * blockDim.x + threadIdx.x;
	int warpThreadIndex = threadIdx.x & 31;

	// <SHARED MEMORY>
	__shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
	__shared__ u32 t4_0S[TABLE_SIZE];
	__shared__ u32 t4_1S[TABLE_SIZE];
	__shared__ u32 t4_2S[TABLE_SIZE];
	__shared__ u32 t4_3S[TABLE_SIZE];
	__shared__ u32 rconS[RCON_SIZE];
	__shared__ u32 ctS[U32_SIZE];

	// Block-stride fill: every table entry is written even when the block has fewer
	// than TABLE_SIZE threads (the previous `threadIdx.x < TABLE_SIZE` guard left the
	// remaining entries uninitialized in that case).
	for (u32 entry = threadIdx.x; entry < TABLE_SIZE; entry += blockDim.x) {
		t4_0S[entry] = t4_0G[entry];
		t4_1S[entry] = t4_1G[entry];
		t4_2S[entry] = t4_2G[entry];
		t4_3S[entry] = t4_3G[entry];
		// Read global memory once per entry instead of once per column.
		const u32 t0Value = t0G[entry];
		for (u32 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
			t0S[entry][bankIndex] = t0Value;
		}
	}
	for (u32 i = threadIdx.x; i < RCON_SIZE; i += blockDim.x) {
		rconS[i] = rconG[i];
	}
	for (u32 i = threadIdx.x; i < U32_SIZE; i += blockDim.x) {
		ctS[i] = ct[i];
	}
	// </SHARED MEMORY>

	// Shared tables must be complete before any thread reads them
	__syncthreads();

	u32 rk0Init, rk1Init, rk2Init, rk3Init;
	rk0Init = rk[0];
	rk1Init = rk[1];
	rk2Init = rk[2];
	rk3Init = rk[3];

	u32 pt0Init, pt1Init, pt2Init, pt3Init;
	pt0Init = pt[0];
	pt1Init = pt[1];
	pt2Init = pt[2];
	pt3Init = pt[3];

	u64 threadRange = *range;
	u64 threadRangeStart = threadIndex * threadRange;
	// Add the offset to (rk2:rk3) as one 64-bit counter. Splitting the offset with
	// `/ MAX_U32` and `% MAX_U32` divides by 2^32 - 1 instead of 2^32 and drops the
	// carry out of rk3, so threads with large offsets started at the wrong key.
	u64 keyLow64 = (((u64)rk2Init << 32) | (u64)rk3Init) + threadRangeStart;
	rk2Init = (u32)(keyLow64 >> 32);
	rk3Init = (u32)keyLow64;

	for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

		u32 rk0, rk1, rk2, rk3;
		rk0 = rk0Init;
		rk1 = rk1Init;
		rk2 = rk2Init;
		rk3 = rk3Init;

		// Create plaintext as 32 bit unsigned integers
		u32 s0, s1, s2, s3;
		s0 = pt0Init;
		s1 = pt1Init;
		s2 = pt2Init;
		s3 = pt3Init;

		// First round just XORs input with key.
		s0 = s0 ^ rk0;
		s1 = s1 ^ rk1;
		s2 = s2 ^ rk2;
		s3 = s3 ^ rk3;

		u32 t0, t1, t2, t3;
		for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {

			// Calculate round key (computed on the fly, one round ahead of its use)
			u32 temp = rk3;
			rk0 = rk0 ^ t4_3S[(temp >> 16) & 0xff] ^ t4_2S[(temp >> 8) & 0xff] ^ t4_1S[(temp) & 0xff] ^ t4_0S[(temp >> 24)] ^ rconS[roundCount];
			rk1 = rk1 ^ rk0;
			rk2 = rk2 ^ rk1;
			rk3 = rk2 ^ rk3;

			// Table based round function: one T0 lookup per state byte, shifted into place
			t0 = t0S[s0 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk0;
			t1 = t0S[s1 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk1;
			t2 = t0S[s2 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk2;
			t3 = t0S[s3 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk3;

			s0 = t0;
			s1 = t1;
			s2 = t2;
			s3 = t3;

		}

		// Calculate the last round key
		u32 temp = rk3;
		rk0 = rk0 ^ t4_3S[(temp >> 16) & 0xff] ^ t4_2S[(temp >> 8) & 0xff] ^ t4_1S[(temp) & 0xff] ^ t4_0S[(temp >> 24)] ^ rconS[ROUND_COUNT_MIN_1];
		// Last round uses s-box directly and XORs to produce output.
		// The four output words are computed lazily: word k+1 (and its round-key word)
		// is only evaluated when words 0..k already match the target ciphertext.
		s0 = t4_3S[t0 >> 24] ^ t4_2S[(t1 >> 16) & 0xff] ^ t4_1S[(t2 >> 8) & 0xff] ^ t4_0S[(t3) & 0xFF] ^ rk0;
		if (s0 == ctS[0]) {
			rk1 = rk1 ^ rk0;
			s1 = t4_3S[t1 >> 24] ^ t4_2S[(t2 >> 16) & 0xff] ^ t4_1S[(t3 >> 8) & 0xff] ^ t4_0S[(t0) & 0xFF] ^ rk1;
			if (s1 == ctS[1]) {
				rk2 = rk2 ^ rk1;
				s2 = t4_3S[t2 >> 24] ^ t4_2S[(t3 >> 16) & 0xff] ^ t4_1S[(t0 >> 8) & 0xff] ^ t4_0S[(t1) & 0xFF] ^ rk2;
				if (s2 == ctS[2]) {
					rk3 = rk2 ^ rk3;
					s3 = t4_3S[t3 >> 24] ^ t4_2S[(t0 >> 16) & 0xff] ^ t4_1S[(t1 >> 8) & 0xff] ^ t4_0S[(t2) & 0xFF] ^ rk3;
					if (s3 == ctS[3]) {
						printf("! Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init);
						printf("-------------------------------\n");
					}
				}
			}
		}

		// Advance to the next candidate key; carry from rk3 into rk2 on wrap-around
		if (rk3Init == MAX_U32) {
			rk2Init++;
		}

		// Create key as 32 bit unsigned integers
		rk3Init++;
	}
}

// Host driver for the AES-128 exhaustive-search benchmark.
//
// Allocates the plaintext, target ciphertext, starting key, round constants and all
// lookup tables in managed memory, copies the host tables into them, launches the
// kernel selected by `choice`, waits for it, prints the elapsed wall-clock time and
// frees everything.
//
//   choice == 1  -> ...PartlyExtendedSBoxCihangir2
//   choice == 11 -> ...PartlyExtendedSBoxCihangir
//   choice == 2  -> exhaustiveSearchWithOneTableExtendedSharedMemory
//   choice == 22 -> exhaustiveSearchWithOneTableExtendedSharedMemoryBytePerm
//   any other value launches nothing (only the timing line is printed).
//
// Always returns 0. Allocation failures are handled by gpuErrorCheck; launch and
// execution errors are only reported through printLastCUDAError() at the end.
//
// NOTE(review): the execution configuration between `<<` and `> >` on the launch lines
// is empty in this listing; the grid and block sizes must be restored (presumably the
// same values calculateRange() uses to derive the per-thread range) - confirm against
// the project header.
__host__ int main128ExhaustiveSearch(int choice) {
	printf("\n"); printf("########## AES-128 Exhaustive Search Implementation ##########\n"); printf("\n");
	// Allocate plaintext, ciphertext and initial round key
	u32 *pt, *ct, *rk;
	gpuErrorCheck(cudaMallocManaged(&pt, 4 * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&ct, 4 * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&rk, 4 * sizeof(u32)));
	// Known plaintext/ciphertext pair (these look like the FIPS-197 Appendix B example
	// vectors - TODO confirm).
	pt[0] = 0x3243F6A8U; pt[1] = 0x885A308DU; pt[2] = 0x313198A2U; pt[3] = 0xE0370734U;
	// pt[0] = 0; pt[1] = 0; pt[2] = 0; pt[3] = 0;
	ct[0] = 0x3925841DU; ct[1] = 0x02DC09FBU; ct[2] = 0xDC118597U; ct[3] = 0x196A0B32U;
	// aes-cipher-internals.xlsx
	// Starting key for the search: the low 16 bits of the last word are zero and the
	// kernels count upwards from here.
	rk[0] = 0x2B7E1516U; rk[1] = 0x28AED2A6U; rk[2] = 0xABF71588U; rk[3] = 0x09CF0000U;
	// Allocate RCON values
	u32* rcon;
	gpuErrorCheck(cudaMallocManaged(&rcon, RCON_SIZE * sizeof(u32)));
	for (int i = 0; i < RCON_SIZE; i++) { rcon[i] = RCON32[i]; }
	// Allocate Tables
	// Only t0, t4 and SAES_d are passed to the kernels launched below; the remaining
	// tables are allocated and filled but used only by launches that are commented out.
	u32 *t0, *t1, *t2, *t3, *t4, *t4_0, *t4_1, *t4_2, *t4_3;
	u8* SAES_d; // Cihangir
	gpuErrorCheck(cudaMallocManaged(&t0, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t1, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t2, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t3, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4_0, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4_1, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4_2, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4_3, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&SAES_d, 256 * sizeof(u8))); // Cihangir
	// Copy the host-side constant tables into the managed buffers
	for (int i = 0; i < TABLE_SIZE; i++) {
		t0[i] = T0[i]; t1[i] = T1[i]; t2[i] = T2[i]; t3[i] = T3[i]; t4[i] = T4[i];
		t4_0[i] = T4_0[i]; t4_1[i] = T4_1[i]; t4_2[i] = T4_2[i]; t4_3[i] = T4_3[i];
	}
	for (int i = 0; i < 256; i++) SAES_d[i] = SAES[i]; // Cihangir
	printf("-------------------------------\n");
	// Per-thread key count, read by the kernels through *range; released with cudaFree below
	u64* range = calculateRange();
	/* printf("Plaintext : %08x %08x %08x %08x\n", pt[0], pt[1], pt[2], pt[3]);
	printf("Ciphertext : %08x %08x %08x %08x\n", ct[0], ct[1], ct[2], ct[3]);
	printf("Initial Key : %08x %08x %08x %08x\n", rk[0], rk[1], rk[2], rk[3]);
	printf("-------------------------------\n");*/



	// Timing covers the launch plus the synchronization that waits for the kernel to finish
	clock_t beginTime = clock();
	// exhaustiveSearch << > > (pt, ct, rk, t0, t1, t2, t3, t4, rcon, range);
	// exhaustiveSearchWithOneTable<<>>(pt, ct, rk, t0, t4, rcon, range);
	if (choice == 1) exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir2 << > > (pt, ct, rk, t0, t4, rcon, range, SAES_d);
	if (choice == 11) exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir << > > (pt, ct, rk, t0, t4, rcon, range, SAES_d);
	// else if (choice == 2) exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox << > > (pt, ct, rk, t0, t4, rcon, range);
	else if (choice == 2) exhaustiveSearchWithOneTableExtendedSharedMemory << > > (pt, ct, rk, t0, t4, rcon, range);
	else if (choice == 22) exhaustiveSearchWithOneTableExtendedSharedMemoryBytePerm << > > (pt, ct, rk, t0, t4, rcon, range);

	// exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox << > > (pt, ct, rk, t0, t4, rcon, range);
	// exhaustiveSearchCem << > > (pt, ct, rk, t0, t1, t4, rcon, range, SAES_d);
	cudaDeviceSynchronize();
	printf("Time elapsed: %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);

	/* beginTime = clock();
	// Kernels
	//exhaustiveSearch<<>>(pt, ct, rk, t0, t1, t2, t3, t4, rcon, range);
	//exhaustiveSearchWithOneTable<<>>(pt, ct, rk, t0, t4, rcon, range);
	//exhaustiveSearchWithOneTableExtendedSharedMemory<<>>(pt, ct, rk, t0, t4, rcon, range);
	//exhaustiveSearchWithOneTableExtendedSharedMemoryBytePerm<<>>(pt, ct, rk, t0, t4, rcon, range);
	// Fastest
	exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<>>(pt, ct, rk, t0, t4, rcon, range);
	//exhaustiveSearchWithOneTableExtendedSharedMemoryBytePerm4ShiftedSbox<<>>(pt, ct, rk, t0, t4_0, t4_1, t4_2, t4_3, rcon, range);
	cudaDeviceSynchronize();
	printf("Time elapsed: %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);*/


	printf("-------------------------------\n");
	printLastCUDAError();
	// Free allocated arrays
	cudaFree(range); cudaFree(pt); cudaFree(ct); cudaFree(rk); cudaFree(t0); cudaFree(t1); cudaFree(t2); cudaFree(t3); cudaFree(t4);
	cudaFree(t4_0); cudaFree(t4_1); cudaFree(t4_2); cudaFree(t4_3); cudaFree(rcon); cudaFree(SAES_d);
	return 0;
}

--------------------------------------------------------------------------------