├── README.md
├── AES_final.cu
├── 192-ctr.cuh
├── 256-ctr.cuh
├── 256-es.cuh
├── LICENSE
├── 192-es.cuh
├── 128-ctr.cuh
├── file-encryption.cuh
├── AES_final.h
└── 128-es.cuh
/README.md:
--------------------------------------------------------------------------------
1 | **Breakthrough AES Performance on GPUs**
2 |
3 | These are CUDA optimizations of a T-table based implementation of AES that contain zero bank conflicts.
4 |
5 | We achieved
6 |
7 | **315.2** Gbps AES-128 encryption on a **GTX 970**
8 | **878.6** Gbps AES-128 encryption on an **RTX 2070 Super**
9 |
10 | These results are published in https://ieeexplore.ieee.org/document/9422754
11 |
12 | In science, reproducibility of experiments is crucial, but almost none of the GPU optimizations of AES are publicly available. This is why we publish our code here.
13 |
14 | Moreover, comparing different optimization results on different GPUs is almost impossible. If you have a different kind of optimization and want to compare it with ours, please run this code on the same GPU that you used for your own code.
15 |
16 |
17 | **Cihangir Tezcan**, PhD
18 | _Director of Cyber Security Center_
19 | _Head of Department of Cyber Security, Informatics Institute_
20 | _Middle East Technical University_
21 | _Ankara, Turkey_
22 |
--------------------------------------------------------------------------------
/AES_final.cu:
--------------------------------------------------------------------------------
1 | // System includes
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | // CUDA runtime
8 | #include
9 |
10 | // Helper functions and utilities to work with CUDA
11 | //#include
12 | //#include
13 |
14 | #include
15 | //#include
16 |
17 | // Custom header
18 | #include "AES_final.h"
19 | //
20 | #include "128-es.cuh"
21 | #include "128-ctr.cuh"
22 | #include "192-es.cuh"
23 | #include "192-ctr.cuh"
24 | #include "256-es.cuh"
25 | #include "256-ctr.cuh"
26 | //#include "small.cuh"
27 | //#include "silent.cuh"
28 | #include "file-encryption.cuh"
29 |
// Dispatches a menu choice to the matching benchmark entry point.
//
// choice: the number typed at the prompt in main().
//   1, 11, 2, 22 -> main128ExhaustiveSearch, which receives the number itself
//   3..7         -> the single AES-128/192/256 CTR or exhaustive-search driver
//   8            -> every driver once, in menu order
//   anything else prints "Wrong selection".
void selection(int choice) {
    switch (choice) {
    case 1:
    case 11:
    case 2:
    case 22:
        // The AES-128 search driver selects its variant from the same number.
        main128ExhaustiveSearch(choice);
        break;
    case 3:
        main128Ctr();
        break;
    case 4:
        main192ExhaustiveSearch();
        break;
    case 5:
        main192Ctr();
        break;
    case 6:
        main256ExhaustiveSearch();
        break;
    case 7:
        main256Ctr();
        break;
    case 8:
        main128ExhaustiveSearch(1);
        main128Ctr();
        main192ExhaustiveSearch();
        main192Ctr();
        main256ExhaustiveSearch();
        main256Ctr();
        break;
    default:
        printf("Wrong selection\n");
        break;
    }
}
50 |
// Program entry point: selects GPU 0, prints the benchmark menu, reads one
// integer choice from standard input and hands it to selection().
//
// Returns 0 after the choice has been dispatched, 1 when device 0 cannot be
// selected or when the input is not an integer.
int main() {
    // Stop early with a readable message instead of letting every later CUDA
    // call fail when device 0 is unavailable.
    cudaError_t deviceStatus = cudaSetDevice(0);
    if (deviceStatus != cudaSuccess) {
        printf("cudaSetDevice(0) failed: %s\n", cudaGetErrorString(deviceStatus));
        return 1;
    }

    int choice = 0;
    printf(
        "(1) AES-128 Exhaustive Search (no bank conflict, byteperm)\n"
        "(11) AES-128 Exhaustive Search (no bank conflict, arithmetic shift)\n"
        "(2) AES-128 Exhaustive Search (conflicting S-box, arithmetic shift)\n"
        "(22) AES-128 Exhaustive Search (conflicting S-box, byteperm)\n"
        "(3) AES-128 CTR \n"
        "(4) AES-192 Exhaustive Search\n"
        "(5) AES-192 CTR\n"
        "(6) AES-256 Exhaustive Search\n"
        "(7) AES-256 CTR\n"
        "(8) ALL\n"
        "Choice: ");

    // scanf_s is only guaranteed on MSVC; fall back to scanf elsewhere so the
    // file also builds with other host compilers.
#ifdef _MSC_VER
    int fieldsRead = scanf_s("%d", &choice);
#else
    int fieldsRead = scanf("%d", &choice);
#endif
    // A failed conversion leaves choice unset; never dispatch on it.
    if (fieldsRead != 1) {
        printf("Wrong selection\n");
        return 1;
    }

    selection(choice);

    // Entry points that are not offered in the menu and are currently disabled:
    // mainSmall();           // Small AES probability calculation
    // mainSilent();          // Silent
    // mainFileEncryption();  // File Encryption
    return 0;
}
97 |
--------------------------------------------------------------------------------
/192-ctr.cuh:
--------------------------------------------------------------------------------
1 | // System includes
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | // CUDA runtime
8 | #include
9 |
10 | // Helper functions and utilities to work with CUDA
11 | //#include
12 | //#include
13 |
14 | #include
15 | //#include
16 |
17 | // Custom header
18 | //#include "kernel.h"
19 |
20 |
// AES-192 key expansion on the host.
//
// key : six 32-bit key words (read before anything is written to rk)
// rk  : output array of round-key words. rk[0..5] receive the key itself;
//       each schedule step r then writes rk[6r+6 .. 6r+9] and, except for
//       step 7, rk[6r+10] and rk[6r+11]. When ROUND_COUNT_192 is at least 8
//       the last word written is rk[51].
__host__ void keyExpansion192(u32* key, u32* rk) {

    // Running six-word schedule state.
    u32 w[6];
    for (int i = 0; i < 6; i++) {
        w[i] = key[i];
    }
    for (int i = 0; i < 6; i++) {
        rk[i] = w[i];
    }

    for (u8 step = 0; step < ROUND_COUNT_192; step++) {
        // First word: table lookups on the bytes of the previous last word,
        // combined with the step's round constant.
        const u32 last = w[5];
        w[0] ^= T4_3[(last >> 16) & 0xff] ^ T4_2[(last >> 8) & 0xff] ^ T4_1[(last) & 0xff] ^ T4_0[(last >> 24)] ^ RCON32[step];

        // Remaining words are a running XOR chain.
        for (int i = 1; i < 6; i++) {
            w[i] ^= w[i - 1];
        }

        u32* out = rk + step * 6 + 6;
        out[0] = w[0];
        out[1] = w[1];
        out[2] = w[2];
        out[3] = w[3];
        // Step 7 contributes only four words; nothing is written after them.
        if (step == 7) {
            break;
        }
        out[4] = w[4];
        out[5] = w[5];
    }
}
67 |
// AES-192 CTR-mode benchmark kernel (one round table replicated per column,
// final-round table replicated S_BOX_BANK_SIZE times).
//
// Every block first stages its tables in shared memory: each t0G entry is
// copied into all SHARED_MEM_BANK_SIZE columns of its t0S row and each t4G
// entry into all S_BOX_BANK_SIZE columns of its t4S row. A thread afterwards
// only reads column (threadIdx.x & 31) of t0S and that value modulo
// S_BOX_BANK_SIZE of t4S. The three shifted lookups of a round are derived
// from the single table with arithmeticRightShiftBytePerm.
//
// pt    : four-word initial block; words 2 and 3 form the 64-bit counter
// rk    : expanded round keys, AES_192_KEY_SIZE_INT words (the final round
//         reads words 48..51)
// t0G   : TABLE_SIZE-entry round table
// t4G   : TABLE_SIZE-entry table used by the final round
// range : number of consecutive counter values per thread (only the low 32
//         bits of *range are used)
//
// Thread i starts at counter + i * range. The ciphertext is not stored; only
// the thread with global index 1048575 prints its last result.
// Staging uses a block-stride loop, so any block size fills the tables.
__global__ void counter192WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* rk, u32* t0G, u32* t4G, u64* range) {

    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int warpThreadIndex = threadIdx.x & 31;
    int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE;

    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
    __shared__ u32 rkS[AES_192_KEY_SIZE_INT];

    // Block-stride staging: previously only threads with threadIdx.x below
    // TABLE_SIZE copied a row, which left rows unset for smaller blocks.
    for (u32 row = threadIdx.x; row < TABLE_SIZE; row += blockDim.x) {
        const u32 t0Entry = t0G[row];
        for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[row][bankIndex] = t0Entry;
        }

        const u32 t4Entry = t4G[row];
        for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) {
            t4S[row][bankIndex] = t4Entry;
        }
    }
    for (u32 word = threadIdx.x; word < AES_192_KEY_SIZE_INT; word += blockDim.x) {
        rkS[word] = rk[word];
    }

    // Wait until the shared tables and round keys are complete
    __syncthreads();

    u32 pt0Init, pt1Init, pt2Init, pt3Init;
    // Zero-initialised so the printf at the end never reads indeterminate
    // values when the per-thread range is 0.
    u32 s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    pt0Init = pt[0];
    pt1Init = pt[1];
    pt2Init = pt[2];
    pt3Init = pt[3];

    // Advance the 64-bit counter (pt2:pt3) to this thread's first value.
    u32 threadRange = *range;
    u64 threadRangeStart = pt2Init;
    threadRangeStart = threadRangeStart << 32;
    threadRangeStart ^= pt3Init;
    threadRangeStart += (u64)threadIndex * threadRange;
    pt2Init = threadRangeStart >> 32;
    pt3Init = threadRangeStart & 0xFFFFFFFF;

    for (u32 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        // Current counter block as 32-bit words
        s0 = pt0Init;
        s1 = pt1Init;
        s2 = pt2Init;
        s3 = pt3Init;

        // First round just XORs input with key.
        s0 = s0 ^ rkS[0];
        s1 = s1 ^ rkS[1];
        s2 = s2 ^ rkS[2];
        s3 = s3 ^ rkS[3];

        u32 t0, t1, t2, t3;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1_192; roundCount++) {

            // Table based round function; round keys start at word 4.
            u32 rkStart = roundCount * 4 + 4;
            t0 = t0S[s0 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
               ^ rkS[rkStart];
            t1 = t0S[s1 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
               ^ rkS[rkStart + 1];
            t2 = t0S[s2 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
               ^ rkS[rkStart + 2];
            t3 = t0S[s3 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
               ^ rkS[rkStart + 3];

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;

        }

        // Last round: one byte per lookup is masked out of t4S, then the
        // last four round-key words are XORed in.
        s0 = (t4S[t0 >> 24][warpThreadIndexSBox] & 0xFF000000)
           ^ (t4S[(t1 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
           ^ (t4S[(t2 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
           ^ (t4S[(t3) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
           ^ rkS[48];
        s1 = (t4S[t1 >> 24][warpThreadIndexSBox] & 0xFF000000)
           ^ (t4S[(t2 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
           ^ (t4S[(t3 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
           ^ (t4S[(t0) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
           ^ rkS[49];
        s2 = (t4S[t2 >> 24][warpThreadIndexSBox] & 0xFF000000)
           ^ (t4S[(t3 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
           ^ (t4S[(t0 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
           ^ (t4S[(t1) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
           ^ rkS[50];
        s3 = (t4S[t3 >> 24][warpThreadIndexSBox] & 0xFF000000)
           ^ (t4S[(t0 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
           ^ (t4S[(t1 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
           ^ (t4S[(t2) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
           ^ rkS[51];

        // Carry from the low counter word into the high one
        if (pt3Init == MAX_U32) {
            pt2Init++;
        }

        // Next counter value
        pt3Init++;
    }

    // Only global thread 1048575 (2^20 - 1) reports. The counter printed has
    // already been incremented past the block that produced s0..s3.
    // NOTE(review): presumably the last thread of the intended launch; confirm
    // against the launch configuration.
    if (threadIndex == 1048575) {
        printf("Plaintext : %08x %08x %08x %08x\n", pt0Init, pt1Init, pt2Init, pt3Init);
        printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3);
        printf("-------------------------------\n");
    }

}
174 |
// Host driver for the AES-192 counter-mode benchmark.
// Fills managed buffers with a fixed counter block, key and lookup tables,
// expands the key on the host, launches the CTR kernel once, prints the time
// measured with clock() around launch + cudaDeviceSynchronize(), then frees
// every buffer. Always returns 0.
175 | __host__ int main192Ctr() {
176 | printf("\n");
177 | printf("########## AES-192 Counter Mode Implementation ##########\n");
178 | printf("\n");
179 |
180 | // Allocate managed buffers: counter block, reference ciphertext, 6-word key and expanded round keys
181 | u32 *pt, *ct, *rk192, *roundKeys192;
182 | gpuErrorCheck(cudaMallocManaged(&pt, 4 * sizeof(u32)));
183 | gpuErrorCheck(cudaMallocManaged(&ct, 4 * sizeof(u32)));
184 | gpuErrorCheck(cudaMallocManaged(&rk192, 6 * sizeof(u32)));
185 | gpuErrorCheck(cudaMallocManaged(&roundKeys192, AES_192_KEY_SIZE_INT * sizeof(u32)));
186 | // Initial block handed to the kernel as pt (words 2 and 3 are advanced as the counter there)
187 | pt[0] = 0x6bc1bee2U;
188 | pt[1] = 0x2e409f96U;
189 | pt[2] = 0xe93d7e11U;
190 | pt[3] = 0x7393172aU;
191 | // NOTE(review): ct is filled but never passed to the kernel launched below; the same four words are used as ct in main256Ctr - confirm they are intended for the 192-bit key
192 | ct[0] = 0xF3EED1BDU;
193 | ct[1] = 0xB5D2A03CU;
194 | ct[2] = 0x064B5A7EU;
195 | ct[3] = 0x3DB181F8U;
196 | // 192-bit key as six 32-bit words
197 | rk192[0] = 0x8e73b0f7U;
198 | rk192[1] = 0xda0e6452U;
199 | rk192[2] = 0xc810f32bU;
200 | rk192[3] = 0x809079e5U;
201 | rk192[4] = 0x62f8ead2U;
202 | rk192[5] = 0x522c6b7bU;
203 |
204 | // Allocate RCON values (NOTE(review): rcon is not passed to the kernel launched below)
205 | u32* rcon;
206 | gpuErrorCheck(cudaMallocManaged(&rcon, RCON_SIZE * sizeof(u32)));
207 | for (int i = 0; i < RCON_SIZE; i++) {
208 | rcon[i] = RCON32[i];
209 | }
210 |
211 | // Allocate managed copies of the lookup tables; only t0 and t4 are passed to the kernel launched below
212 | u32 *t0, *t1, *t2, *t3, *t4, *t4_0, *t4_1, *t4_2, *t4_3;
213 | gpuErrorCheck(cudaMallocManaged(&t0, TABLE_SIZE * sizeof(u32)));
214 | gpuErrorCheck(cudaMallocManaged(&t1, TABLE_SIZE * sizeof(u32)));
215 | gpuErrorCheck(cudaMallocManaged(&t2, TABLE_SIZE * sizeof(u32)));
216 | gpuErrorCheck(cudaMallocManaged(&t3, TABLE_SIZE * sizeof(u32)));
217 | gpuErrorCheck(cudaMallocManaged(&t4, TABLE_SIZE * sizeof(u32)));
218 | gpuErrorCheck(cudaMallocManaged(&t4_0, TABLE_SIZE * sizeof(u32)));
219 | gpuErrorCheck(cudaMallocManaged(&t4_1, TABLE_SIZE * sizeof(u32)));
220 | gpuErrorCheck(cudaMallocManaged(&t4_2, TABLE_SIZE * sizeof(u32)));
221 | gpuErrorCheck(cudaMallocManaged(&t4_3, TABLE_SIZE * sizeof(u32)));
222 | for (int i = 0; i < TABLE_SIZE; i++) {
223 | t0[i] = T0[i];
224 | t1[i] = T1[i];
225 | t2[i] = T2[i];
226 | t3[i] = T3[i];
227 | t4[i] = T4[i];
228 | t4_0[i] = T4_0[i];
229 | t4_1[i] = T4_1[i];
230 | t4_2[i] = T4_2[i];
231 | t4_3[i] = T4_3[i];
232 | }
233 |
234 | printf("-------------------------------\n");
235 | u64* range = calculateRange(); // handed to the kernel as its range argument and released with cudaFree below
236 | /* printf("Initial Plaintext : %08x %08x %08x %08x\n", pt[0], pt[1], pt[2], pt[3]);
237 | printf("Initial Key : %08x %08x %08x %08x %08x %08x\n", rk192[0], rk192[1], rk192[2], rk192[3], rk192[4], rk192[5]);
238 | printf("-------------------------------\n");*/
239 |
240 | // Key expansion on the host, before the timed region starts
241 | keyExpansion192(rk192, roundKeys192);
242 |
243 | clock_t beginTime = clock(); // timed region: kernel launch through cudaDeviceSynchronize()
244 | // Kernels (NOTE(review): the launch configuration between the angle brackets is missing in this listing - restore it from the original source)
245 | counter192WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<> > (pt, roundKeys192, t0, t4, range);
246 |
247 | cudaDeviceSynchronize(); // return value is not checked here; presumably printLastCUDAError() below reports failures - confirm
248 | printf("Time elapsed: %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);
249 | printf("-------------------------------\n");
250 | printLastCUDAError();
251 |
252 | // Free allocated arrays
253 | cudaFree(range);
254 | cudaFree(pt);
255 | cudaFree(ct);
256 | cudaFree(rk192);
257 | cudaFree(roundKeys192);
258 | cudaFree(t0);
259 | cudaFree(t1);
260 | cudaFree(t2);
261 | cudaFree(t3);
262 | cudaFree(t4);
263 | cudaFree(t4_0);
264 | cudaFree(t4_1);
265 | cudaFree(t4_2);
266 | cudaFree(t4_3);
267 | cudaFree(rcon);
268 |
269 | return 0;
270 | }
--------------------------------------------------------------------------------
/256-ctr.cuh:
--------------------------------------------------------------------------------
1 | // System includes
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | // CUDA runtime
8 | #include
9 |
10 | // Helper functions and utilities to work with CUDA
11 | //#include
12 | //#include
13 |
14 | #include
15 | //#include
16 |
17 | // Custom header
18 | //#include "kernel.h"
19 |

// AES-256 key expansion on the host.
//
// key : eight 32-bit key words (read before anything is written to rk)
// rk  : output array of round-key words. rk[0..7] receive the key itself;
//       each schedule step r then writes rk[8r+8 .. 8r+11] and, except for
//       step 6, rk[8r+12 .. 8r+15]. When ROUND_COUNT_256 is at least 7 the
//       last word written is rk[59].
__host__ void keyExpansion256(u32* key, u32* rk) {

    // Running eight-word schedule state.
    u32 w[8];
    for (int i = 0; i < 8; i++) {
        w[i] = key[i];
    }
    for (int i = 0; i < 8; i++) {
        rk[i] = w[i];
    }

    for (u8 step = 0; step < ROUND_COUNT_256; step++) {
        // First half: rotated-byte table lookups on the previous last word
        // plus the step's round constant, then an XOR chain.
        const u32 last = w[7];
        w[0] ^= T4_3[(last >> 16) & 0xff] ^ T4_2[(last >> 8) & 0xff] ^ T4_1[(last) & 0xff] ^ T4_0[(last >> 24)] ^ RCON32[step];
        w[1] ^= w[0];
        w[2] ^= w[1];
        w[3] ^= w[2];

        // Second half: unrotated table lookups on word 3, no round constant.
        const u32 mid = w[3];
        w[4] ^= T4_3[(mid >> 24) & 0xff] ^ T4_2[(mid >> 16) & 0xff] ^ T4_1[(mid >> 8) & 0xff] ^ T4_0[mid & 0xff];
        w[5] ^= w[4];
        w[6] ^= w[5];
        w[7] ^= w[6];

        u32* out = rk + step * 8 + 8;
        out[0] = w[0];
        out[1] = w[1];
        out[2] = w[2];
        out[3] = w[3];
        // Step 6 contributes only four words; nothing is written after them.
        if (step == 6) {
            break;
        }
        out[4] = w[4];
        out[5] = w[5];
        out[6] = w[6];
        out[7] = w[7];
    }
}
75 |
// AES-256 CTR-mode benchmark kernel (one round table replicated per column,
// final-round table replicated S_BOX_BANK_SIZE times).
//
// Every block first stages its tables in shared memory: each t0G entry is
// copied into all SHARED_MEM_BANK_SIZE columns of its t0S row and each t4G
// entry into all S_BOX_BANK_SIZE columns of its t4S row. A thread afterwards
// only reads column (threadIdx.x & 31) of t0S and that value modulo
// S_BOX_BANK_SIZE of t4S. The three shifted lookups of a round are derived
// from the single table with arithmeticRightShiftBytePerm.
//
// pt    : four-word initial block; words 2 and 3 form the 64-bit counter
// rk    : expanded round keys, AES_256_KEY_SIZE_INT words (the final round
//         reads words 56..59)
// t0G   : TABLE_SIZE-entry round table
// t4G   : TABLE_SIZE-entry table used by the final round
// range : number of consecutive counter values per thread (only the low 32
//         bits of *range are used)
//
// Thread i starts at counter + i * range. The ciphertext is not stored; only
// the thread with global index 1048575 prints its last result.
// Staging uses a block-stride loop, so any block size fills the tables.
__global__ void counter256WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* rk, u32* t0G, u32* t4G, u64* range) {

    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int warpThreadIndex = threadIdx.x & 31;
    int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE;

    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
    __shared__ u32 rkS[AES_256_KEY_SIZE_INT];

    // Block-stride staging: previously only threads with threadIdx.x below
    // TABLE_SIZE copied a row, which left rows unset for smaller blocks.
    for (u32 row = threadIdx.x; row < TABLE_SIZE; row += blockDim.x) {
        const u32 t0Entry = t0G[row];
        for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[row][bankIndex] = t0Entry;
        }

        const u32 t4Entry = t4G[row];
        for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) {
            t4S[row][bankIndex] = t4Entry;
        }
    }
    for (u32 word = threadIdx.x; word < AES_256_KEY_SIZE_INT; word += blockDim.x) {
        rkS[word] = rk[word];
    }

    // Wait until the shared tables and round keys are complete
    __syncthreads();

    u32 pt0Init, pt1Init, pt2Init, pt3Init;
    // Zero-initialised so the printf at the end never reads indeterminate
    // values when the per-thread range is 0.
    u32 s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    pt0Init = pt[0];
    pt1Init = pt[1];
    pt2Init = pt[2];
    pt3Init = pt[3];

    // Advance the 64-bit counter (pt2:pt3) to this thread's first value.
    u32 threadRange = *range;
    u64 threadRangeStart = pt2Init;
    threadRangeStart = threadRangeStart << 32;
    threadRangeStart ^= pt3Init;
    threadRangeStart += (u64)threadIndex * threadRange;
    pt2Init = threadRangeStart >> 32;
    pt3Init = threadRangeStart & 0xFFFFFFFF;

    for (u32 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        // Current counter block as 32-bit words
        s0 = pt0Init;
        s1 = pt1Init;
        s2 = pt2Init;
        s3 = pt3Init;

        // First round just XORs input with key.
        s0 = s0 ^ rkS[0];
        s1 = s1 ^ rkS[1];
        s2 = s2 ^ rkS[2];
        s3 = s3 ^ rkS[3];

        u32 t0, t1, t2, t3;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1_256; roundCount++) {

            // Table based round function; round keys start at word 4.
            u32 rkStart = roundCount * 4 + 4;
            t0 = t0S[s0 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
               ^ rkS[rkStart];
            t1 = t0S[s1 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
               ^ rkS[rkStart + 1];
            t2 = t0S[s2 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
               ^ rkS[rkStart + 2];
            t3 = t0S[s3 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
               ^ rkS[rkStart + 3];

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;

        }

        // Last round: one byte per lookup is masked out of t4S, then the
        // last four round-key words are XORed in.
        s0 = (t4S[t0 >> 24][warpThreadIndexSBox] & 0xFF000000)
           ^ (t4S[(t1 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
           ^ (t4S[(t2 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
           ^ (t4S[(t3) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
           ^ rkS[56];
        s1 = (t4S[t1 >> 24][warpThreadIndexSBox] & 0xFF000000)
           ^ (t4S[(t2 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
           ^ (t4S[(t3 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
           ^ (t4S[(t0) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
           ^ rkS[57];
        s2 = (t4S[t2 >> 24][warpThreadIndexSBox] & 0xFF000000)
           ^ (t4S[(t3 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
           ^ (t4S[(t0 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
           ^ (t4S[(t1) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
           ^ rkS[58];
        s3 = (t4S[t3 >> 24][warpThreadIndexSBox] & 0xFF000000)
           ^ (t4S[(t0 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
           ^ (t4S[(t1 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
           ^ (t4S[(t2) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
           ^ rkS[59];

        // Carry from the low counter word into the high one
        if (pt3Init == MAX_U32) {
            pt2Init++;
        }

        // Next counter value
        pt3Init++;
    }

    // Only global thread 1048575 (2^20 - 1) reports. The counter printed has
    // already been incremented past the block that produced s0..s3.
    // NOTE(review): presumably the last thread of the intended launch; confirm
    // against the launch configuration.
    if (threadIndex == 1048575) {
        printf("Plaintext : %08x %08x %08x %08x\n", pt0Init, pt1Init, pt2Init, pt3Init);
        printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3);
        printf("-------------------------------\n");
    }

}
182 |
// Host driver for the AES-256 counter-mode benchmark.
// Fills managed buffers with a fixed counter block, key and lookup tables,
// expands the key on the host, launches the CTR kernel once, prints the time
// measured with clock() around launch + cudaDeviceSynchronize(), then frees
// every buffer. Always returns 0.
183 | __host__ int main256Ctr() {
184 | printf("\n");
185 | printf("########## AES-256 Counter Mode Implementation ##########\n");
186 | printf("\n");
187 |
188 | // Allocate managed buffers: counter block, reference ciphertext, 8-word key and expanded round keys
189 | u32 *pt, *ct, *rk256, *roundKeys256;
190 | gpuErrorCheck(cudaMallocManaged(&pt, 4 * sizeof(u32)));
191 | gpuErrorCheck(cudaMallocManaged(&ct, 4 * sizeof(u32)));
192 | gpuErrorCheck(cudaMallocManaged(&rk256, 8 * sizeof(u32)));
193 | gpuErrorCheck(cudaMallocManaged(&roundKeys256, AES_256_KEY_SIZE_INT * sizeof(u32)));
194 | // Initial block handed to the kernel as pt (words 2 and 3 are advanced as the counter there)
195 | pt[0] = 0x6bc1bee2U;
196 | pt[1] = 0x2e409f96U;
197 | pt[2] = 0xe93d7e11U;
198 | pt[3] = 0x7393172aU;
199 | // NOTE(review): ct is filled but never passed to the kernel launched below
200 | ct[0] = 0xF3EED1BDU;
201 | ct[1] = 0xB5D2A03CU;
202 | ct[2] = 0x064B5A7EU;
203 | ct[3] = 0x3DB181F8U;
204 | // 256-bit key as eight 32-bit words
205 | rk256[0] = 0x603deb10U;
206 | rk256[1] = 0x15ca71beU;
207 | rk256[2] = 0x2b73aef0U;
208 | rk256[3] = 0x857d7781U;
209 | rk256[4] = 0x1f352c07U;
210 | rk256[5] = 0x3b6108d7U;
211 | rk256[6] = 0x2d9810a3U;
212 | rk256[7] = 0x0914dff4U;
213 |
214 | // Allocate RCON values (NOTE(review): rcon is not passed to the kernel launched below)
215 | u32* rcon;
216 | gpuErrorCheck(cudaMallocManaged(&rcon, RCON_SIZE * sizeof(u32)));
217 | for (int i = 0; i < RCON_SIZE; i++) {
218 | rcon[i] = RCON32[i];
219 | }
220 |
221 | // Allocate managed copies of the lookup tables; only t0 and t4 are passed to the kernel launched below
222 | u32 *t0, *t1, *t2, *t3, *t4, *t4_0, *t4_1, *t4_2, *t4_3;
223 | gpuErrorCheck(cudaMallocManaged(&t0, TABLE_SIZE * sizeof(u32)));
224 | gpuErrorCheck(cudaMallocManaged(&t1, TABLE_SIZE * sizeof(u32)));
225 | gpuErrorCheck(cudaMallocManaged(&t2, TABLE_SIZE * sizeof(u32)));
226 | gpuErrorCheck(cudaMallocManaged(&t3, TABLE_SIZE * sizeof(u32)));
227 | gpuErrorCheck(cudaMallocManaged(&t4, TABLE_SIZE * sizeof(u32)));
228 | gpuErrorCheck(cudaMallocManaged(&t4_0, TABLE_SIZE * sizeof(u32)));
229 | gpuErrorCheck(cudaMallocManaged(&t4_1, TABLE_SIZE * sizeof(u32)));
230 | gpuErrorCheck(cudaMallocManaged(&t4_2, TABLE_SIZE * sizeof(u32)));
231 | gpuErrorCheck(cudaMallocManaged(&t4_3, TABLE_SIZE * sizeof(u32)));
232 | for (int i = 0; i < TABLE_SIZE; i++) {
233 | t0[i] = T0[i];
234 | t1[i] = T1[i];
235 | t2[i] = T2[i];
236 | t3[i] = T3[i];
237 | t4[i] = T4[i];
238 | t4_0[i] = T4_0[i];
239 | t4_1[i] = T4_1[i];
240 | t4_2[i] = T4_2[i];
241 | t4_3[i] = T4_3[i];
242 | }
243 |
244 | printf("-------------------------------\n");
245 | u64* range = calculateRange(); // handed to the kernel as its range argument and released with cudaFree below
246 | /* printf("Initial Plaintext : %08x %08x %08x %08x\n", pt[0], pt[1], pt[2], pt[3]);
247 | printf("Initial Key : %08x %08x %08x %08x %08x %08x %08x %08x\n", rk256[0], rk256[1], rk256[2], rk256[3], rk256[4], rk256[5], rk256[6], rk256[7]);
248 | printf("-------------------------------\n");*/
249 | // Key expansion on the host, before the timed region starts
250 | keyExpansion256(rk256, roundKeys256);
251 | clock_t beginTime = clock(); // timed region: kernel launch through cudaDeviceSynchronize()
252 | // Kernels (NOTE(review): the launch configuration between the angle brackets is missing in this listing - restore it from the original source)
253 | counter256WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<>>(pt, roundKeys256, t0, t4, range);
254 |
255 | cudaDeviceSynchronize(); // return value is not checked here; presumably printLastCUDAError() below reports failures - confirm
256 | printf("Time elapsed: %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);
257 | printf("-------------------------------\n");
258 | printLastCUDAError();
259 |
260 | // Free allocated arrays
261 | cudaFree(range);
262 | cudaFree(pt);
263 | cudaFree(ct);
264 | cudaFree(rk256);
265 | cudaFree(roundKeys256);
266 | cudaFree(t0);
267 | cudaFree(t1);
268 | cudaFree(t2);
269 | cudaFree(t3);
270 | cudaFree(t4);
271 | cudaFree(t4_0);
272 | cudaFree(t4_1);
273 | cudaFree(t4_2);
274 | cudaFree(t4_3);
275 | cudaFree(rcon);
276 |
277 |
278 | return 0;
279 | }
--------------------------------------------------------------------------------
/256-es.cuh:
--------------------------------------------------------------------------------
1 | // System includes
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | // CUDA runtime
8 | #include
9 |
10 | // Helper functions and utilities to work with CUDA
11 | //#include
12 | //#include
13 |
14 | #include
15 | //#include
16 |
17 | // Custom header
18 | //#include "kernel.h"
19 |
20 |
// AES-256 exhaustive key search kernel (one round table replicated per
// column, final-round/key-schedule table replicated S_BOX_BANK_SIZE times).
//
// Every block stages t0G, t4G, the round constants and the target ciphertext
// in shared memory. Each thread then walks `range` consecutive candidate keys
// that differ only in key words 6 and 7 (treated as one 64-bit value),
// computes the round keys on the fly while encrypting pt, and prints the
// candidate when all four ciphertext words match.
//
// pt    : four-word plaintext block
// ct    : target ciphertext; U32_SIZE words are staged, words 0..3 are compared
// rk    : eight-word starting key
// t0G   : TABLE_SIZE-entry round table
// t4G   : TABLE_SIZE-entry table used for the key schedule and final round
// rconG : RCON_SIZE round constants
// range : number of candidate keys per thread (only the low 32 bits of
//         *range are used)
//
// Thread i starts at key + i * range. The offset is added to the combined
// 64-bit value of words 6 and 7, so thread ranges are contiguous and do not
// overlap (the same scheme the CTR kernels use for their counter).
// Staging uses block-stride loops, so any block size fills the tables.
__global__ void exhaustiveSearch256WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range) {

    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int warpThreadIndex = threadIdx.x & 31;
    int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE;

    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
    __shared__ u32 rconS[RCON_SIZE];
    __shared__ u32 ctS[U32_SIZE];

    // Block-stride staging: previously only threads with threadIdx.x below
    // TABLE_SIZE copied a row, which left rows unset for smaller blocks.
    for (u32 row = threadIdx.x; row < TABLE_SIZE; row += blockDim.x) {
        const u32 t0Entry = t0G[row];
        for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[row][bankIndex] = t0Entry;
        }

        const u32 t4Entry = t4G[row];
        for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) {
            t4S[row][bankIndex] = t4Entry;
        }
    }
    for (u32 i = threadIdx.x; i < RCON_SIZE; i += blockDim.x) {
        rconS[i] = rconG[i];
    }
    for (u32 i = threadIdx.x; i < U32_SIZE; i += blockDim.x) {
        ctS[i] = ct[i];
    }

    // Wait until the shared tables, constants and ciphertext are complete
    __syncthreads();

    u32 rk0Init, rk1Init, rk2Init, rk3Init, rk4Init, rk5Init, rk6Init, rk7Init;
    rk0Init = rk[0];
    rk1Init = rk[1];
    rk2Init = rk[2];
    rk3Init = rk[3];
    rk4Init = rk[4];
    rk5Init = rk[5];
    rk6Init = rk[6];
    rk7Init = rk[7];

    u32 pt0Init, pt1Init, pt2Init, pt3Init;
    pt0Init = pt[0];
    pt1Init = pt[1];
    pt2Init = pt[2];
    pt3Init = pt[3];

    // Advance the 64-bit value (rk6:rk7) to this thread's first candidate.
    // The former "/ MAX_U32" and "% MAX_U32" split used 2^32 - 1 as the word
    // modulus and dropped the carry out of the low word.
    u32 threadRange = *range;
    u64 threadKeyStart = ((u64)rk6Init << 32) | rk7Init;
    threadKeyStart += (u64)threadIndex * threadRange;
    rk6Init = (u32)(threadKeyStart >> 32);
    rk7Init = (u32)(threadKeyStart & 0xFFFFFFFF);

    for (u32 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        // Working copy of the candidate key; updated by the key schedule below
        u32 rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7;
        rk0 = rk0Init;
        rk1 = rk1Init;
        rk2 = rk2Init;
        rk3 = rk3Init;
        rk4 = rk4Init;
        rk5 = rk5Init;
        rk6 = rk6Init;
        rk7 = rk7Init;

        // Plaintext as 32 bit unsigned integers
        u32 s0, s1, s2, s3;
        s0 = pt0Init;
        s1 = pt1Init;
        s2 = pt2Init;
        s3 = pt3Init;

        // First round just XORs input with key.
        s0 = s0 ^ rk0;
        s1 = s1 ^ rk1;
        s2 = s2 ^ rk2;
        s3 = s3 ^ rk3;

        u32 t0, t1, t2, t3;
        u8 rconIndex = 0;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1_256; roundCount++) {
            // Table based round function (round key is added afterwards)
            t0 = t0S[s0 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);
            t1 = t0S[s1 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);
            t2 = t0S[s2 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);
            t3 = t0S[s3 >> 24][warpThreadIndex]
               ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
               ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);

            // Add round key: even iterations use the upper four key words
            // as they stand, odd iterations first derive the next eight words.
            if (roundCount % 2 == 0) {
                t0 = t0 ^ rk4;
                t1 = t1 ^ rk5;
                t2 = t2 ^ rk6;
                t3 = t3 ^ rk7;
            } else {
                // Calculate round key
                u32 temp = rk7;
                rk0 = rk0 ^
                    (t4S[(temp >> 16) & 0xff][warpThreadIndexSBox] & 0xff000000) ^
                    (t4S[(temp >> 8) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^
                    (t4S[(temp) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^
                    (t4S[(temp >> 24)][warpThreadIndexSBox] & 0x000000ff) ^
                    rconS[rconIndex++];
                rk1 = rk1 ^ rk0;
                rk2 = rk2 ^ rk1;
                rk3 = rk3 ^ rk2;
                rk4 = rk4 ^
                    (t4S[(rk3 >> 24) & 0xff][warpThreadIndexSBox] & 0xff000000) ^
                    (t4S[(rk3 >> 16) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^
                    (t4S[(rk3 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^
                    (t4S[(rk3) & 0xff][warpThreadIndexSBox] & 0x000000ff);
                rk5 = rk5 ^ rk4;
                rk6 = rk6 ^ rk5;
                rk7 = rk7 ^ rk6;

                t0 = t0 ^ rk0;
                t1 = t1 ^ rk1;
                t2 = t2 ^ rk2;
                t3 = t3 ^ rk3;
            }

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;
        }

        // Calculate the first word of the last round key
        u32 temp = rk7;
        rk0 = rk0 ^
            (t4S[(temp >> 16) & 0xff][warpThreadIndexSBox] & 0xff000000) ^
            (t4S[(temp >> 8) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^
            (t4S[(temp) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^
            (t4S[(temp >> 24)][warpThreadIndexSBox] & 0x000000ff) ^
            rconS[rconIndex++];

        // Last round, one output word at a time: the next word (and its
        // round-key word) is only computed when the previous one matched.
        s0 = (t4S[t0 >> 24][warpThreadIndexSBox] & 0xFF000000)
           ^ (t4S[(t1 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
           ^ (t4S[(t2 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
           ^ (t4S[(t3) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
           ^ rk0;
        if (s0 == ctS[0]) {
            rk1 = rk1 ^ rk0;
            s1 = (t4S[t1 >> 24][warpThreadIndexSBox] & 0xFF000000)
               ^ (t4S[(t2 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
               ^ (t4S[(t3 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
               ^ (t4S[(t0) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
               ^ rk1;
            if (s1 == ctS[1]) {
                rk2 = rk2 ^ rk1;
                s2 = (t4S[t2 >> 24][warpThreadIndexSBox] & 0xFF000000)
                   ^ (t4S[(t3 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
                   ^ (t4S[(t0 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
                   ^ (t4S[(t1) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
                   ^ rk2;
                if (s2 == ctS[2]) {
                    rk3 = rk2 ^ rk3;
                    s3 = (t4S[t3 >> 24][warpThreadIndexSBox] & 0xFF000000)
                       ^ (t4S[(t0 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
                       ^ (t4S[(t1 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
                       ^ (t4S[(t2) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
                       ^ rk3;
                    if (s3 == ctS[3]) {
                        printf("! Found key : %08x %08x %08x %08x %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init, rk4Init, rk5Init, rk6Init, rk7Init);
                        printf("-------------------------------\n");
                    }
                }
            }
        }

        // Carry from key word 7 into key word 6
        if (rk7Init == MAX_U32) {
            rk6Init++;
        }

        // Next candidate key
        rk7Init++;
    }
}
191 |
// Host driver for the AES-256 exhaustive key search benchmark.
//
// Places a fixed plaintext/ciphertext pair and a starting 256-bit key in
// managed memory, copies the round constants and the T0/T4 lookup tables next
// to them, launches the search kernel once and prints the time measured with
// clock() around the launch plus the following device synchronization.
//
// Only T0 and T4 are handed to the kernel, so no other table is allocated.
//
// Returns 0 unconditionally; allocation failures are handled by
// gpuErrorCheck and the post-run status is reported by printLastCUDAError.
__host__ int main256ExhaustiveSearch() {
    printf("\n");
    printf("########## AES-256 Exhaustive Search Implementation ##########\n");
    printf("\n");

    // Allocate plaintext, ciphertext and initial round key
    u32 *pt, *ct, *rk256;
    gpuErrorCheck(cudaMallocManaged(&pt, 4 * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&ct, 4 * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&rk256, 8 * sizeof(u32)));

    // NOTE(review): these values look like the ECB-AES256 example vector from
    // NIST SP 800-38A; confirm against the specification.
    pt[0] = 0x6bc1bee2U;
    pt[1] = 0x2e409f96U;
    pt[2] = 0xe93d7e11U;
    pt[3] = 0x7393172aU;

    ct[0] = 0xF3EED1BDU;
    ct[1] = 0xB5D2A03CU;
    ct[2] = 0x064B5A7EU;
    ct[3] = 0x3DB181F8U;

    rk256[0] = 0x603deb10U;
    rk256[1] = 0x15ca71beU;
    rk256[2] = 0x2b73aef0U;
    rk256[3] = 0x857d7781U;
    rk256[4] = 0x1f352c07U;
    rk256[5] = 0x3b6108d7U;
    rk256[6] = 0x2d9810a3U;
    rk256[7] = 0x0914dff4U;

    // Allocate RCON values
    u32* rcon;
    gpuErrorCheck(cudaMallocManaged(&rcon, RCON_SIZE * sizeof(u32)));
    for (int i = 0; i < RCON_SIZE; i++) {
        rcon[i] = RCON32[i];
    }

    // Allocate the two tables the kernel actually reads
    u32 *t0, *t4;
    gpuErrorCheck(cudaMallocManaged(&t0, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t4, TABLE_SIZE * sizeof(u32)));
    for (int i = 0; i < TABLE_SIZE; i++) {
        t0[i] = T0[i];
        t4[i] = T4[i];
    }

    printf("-------------------------------\n");
    u64* range = calculateRange();
    /*	printf("Plaintext                     : %08x %08x %08x %08x\n", pt[0], pt[1], pt[2], pt[3]);
        printf("Ciphertext                    : %08x %08x %08x %08x\n", ct[0], ct[1], ct[2], ct[3]);
        printf("Initial Key                   : %08x %08x %08x %08x %08x %08x %08x %08x\n", rk256[0], rk256[1], rk256[2], rk256[3], rk256[4], rk256[5], rk256[6], rk256[7]);
        printf("-------------------------------\n");*/

    clock_t beginTime = clock();
    // Kernels
    // NOTE(review): the launch configuration between the chevrons was lost in
    // this copy of the file (it read `<<>>`). BLOCKS and THREADS are assumed to
    // be the project's grid/block size constants -- confirm against AES_final.h.
    exhaustiveSearch256WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<<BLOCKS, THREADS>>>(pt, ct, rk256, t0, t4, rcon, range);

    // The launch is asynchronous: wait for the kernel before reading the clock.
    cudaDeviceSynchronize();
    printf("Time elapsed: %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);
    printf("-------------------------------\n");
    printLastCUDAError();

    // Free allocated arrays
    cudaFree(range);
    cudaFree(pt);
    cudaFree(ct);
    cudaFree(rk256);
    cudaFree(t0);
    cudaFree(t4);
    cudaFree(rcon);

    return 0;
}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [2020] [Cihangir Tezcan]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/192-es.cuh:
--------------------------------------------------------------------------------
1 | // System includes
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | // CUDA runtime
8 | #include
9 |
10 | // Helper functions and utilities to work with CUDA
11 | //#include
12 | //#include
13 |
14 | #include
15 | //#include
16 |
17 | // Custom header
18 | //#include "kernel.h"
19 |
20 |
// AES-192 exhaustive key search.
// 1 Table [TABLE_SIZE][SHARED_MEM_BANK_SIZE] in shared memory; the byte
// rotations of the T-table words are done with arithmeticRightShiftBytePerm.
// The S-box table (T4) is only partly expanded: S_BOX_BANK_SIZE columns.
//
// Every thread encrypts the plaintext under *range consecutive candidate keys
// and prints each key whose ciphertext equals ct. Candidates differ only in
// the last two key words: rk[4]:rk[5] is treated as one 64-bit counter (rk[5]
// is the low word) and thread i starts at that counter plus i * (*range).
//
// Parameters:
//   pt    - plaintext block (words 0..3 are read)
//   ct    - target ciphertext (U32_SIZE words are staged, words 0..3 compared)
//   rk    - starting key (words 0..5 are read)
//   t0G   - T0 table, TABLE_SIZE words
//   t4G   - T4 table, TABLE_SIZE words
//   rconG - round constants, RCON_SIZE words
//   range - number of candidate keys each thread tests
//
// Launch: 1-D grid of 1-D blocks. The shared tables are filled with a
// block-strided loop, so the block size does not have to reach TABLE_SIZE.
__global__ void exhaustiveSearch192WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range) {

    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int warpThreadIndex = threadIdx.x & 31;
    int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE;

    // <SHARED MEMORY>
    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
    __shared__ u32 rconS[RCON_SIZE];
    __shared__ u32 ctS[U32_SIZE];

    // Block-strided fill: every row is written even when the block has fewer
    // than TABLE_SIZE threads. Each row holds the same word in every column so
    // that a lane can always read from its own column.
    for (int row = threadIdx.x; row < TABLE_SIZE; row += blockDim.x) {
        u32 t0Value = t0G[row];
        u32 t4Value = t4G[row];
        for (int bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[row][bankIndex] = t0Value;
        }
        for (int bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) {
            t4S[row][bankIndex] = t4Value;
        }
    }
    for (int i = threadIdx.x; i < RCON_SIZE; i += blockDim.x) {
        rconS[i] = rconG[i];
    }
    for (int i = threadIdx.x; i < U32_SIZE; i += blockDim.x) {
        ctS[i] = ct[i];
    }
    // </SHARED MEMORY>

    // Wait until every thread is ready
    __syncthreads();

    u32 rk0Init, rk1Init, rk2Init, rk3Init, rk4Init, rk5Init;
    rk0Init = rk[0];
    rk1Init = rk[1];
    rk2Init = rk[2];
    rk3Init = rk[3];
    rk4Init = rk[4];
    rk5Init = rk[5];

    u32 pt0Init, pt1Init, pt2Init, pt3Init;
    pt0Init = pt[0];
    pt1Init = pt[1];
    pt2Init = pt[2];
    pt3Init = pt[3];

    // Keep the range in 64 bits (it is stored as u64) and add this thread's
    // start offset to the 64-bit counter rk4Init:rk5Init. The offset is split
    // at bit 32 and the carry out of the low word is propagated, so the
    // per-thread ranges tile the key space without gaps or overlaps.
    u64 threadRange = *range;
    u64 threadRangeStart = (u64)threadIndex * threadRange;
    u64 lowWordSum = (u64)rk5Init + (threadRangeStart & 0xFFFFFFFFULL);
    rk5Init = (u32)lowWordSum;
    rk4Init = rk4Init + (u32)(threadRangeStart >> 32) + (u32)(lowWordSum >> 32);

    for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        // Calculate round keys
        u32 rk0, rk1, rk2, rk3, rk4, rk5;
        rk0 = rk0Init;
        rk1 = rk1Init;
        rk2 = rk2Init;
        rk3 = rk3Init;
        rk4 = rk4Init;
        rk5 = rk5Init;

        // Create plaintext as 32 bit unsigned integers
        u32 s0, s1, s2, s3;
        s0 = pt0Init;
        s1 = pt1Init;
        s2 = pt2Init;
        s3 = pt3Init;

        // First round just XORs input with key.
        s0 = s0 ^ rk0;
        s1 = s1 ^ rk1;
        s2 = s2 ^ rk2;
        s3 = s3 ^ rk3;

        u32 t0, t1, t2, t3;
        u8 rconIndex = 0;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1_192; roundCount++) {
            // Table based round function
            t0 = t0S[s0 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);
            t1 = t0S[s1 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);
            t2 = t0S[s2 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);
            t3 = t0S[s3 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);

            // Add round key. The key schedule produces six words per step but
            // a round consumes four, so the position of the round key inside
            // rk0..rk5 repeats with a period of three rounds.
            if (roundCount % 3 == 0) {
                // First half of the round key is the tail of the current words.
                t0 = t0 ^ rk4;
                t1 = t1 ^ rk5;
                // Calculate round key
                u32 temp = rk5;
                rk0 = rk0 ^
                    (t4S[(temp >> 16) & 0xff][warpThreadIndexSBox] & 0xff000000) ^
                    (t4S[(temp >> 8) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^
                    (t4S[(temp) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^
                    (t4S[(temp >> 24)][warpThreadIndexSBox] & 0x000000ff) ^
                    rconS[rconIndex++];
                rk1 = rk1 ^ rk0;
                rk2 = rk2 ^ rk1;
                rk3 = rk3 ^ rk2;
                rk4 = rk4 ^ rk3;
                rk5 = rk5 ^ rk4;

                // Second half is the head of the freshly computed words.
                t2 = t2 ^ rk0;
                t3 = t3 ^ rk1;
            } else if (roundCount % 3 == 1) {
                t0 = t0 ^ rk2;
                t1 = t1 ^ rk3;
                t2 = t2 ^ rk4;
                t3 = t3 ^ rk5;
            } else {
                // Calculate round key
                u32 temp = rk5;
                rk0 = rk0 ^
                    (t4S[(temp >> 16) & 0xff][warpThreadIndexSBox] & 0xff000000) ^
                    (t4S[(temp >> 8) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^
                    (t4S[(temp) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^
                    (t4S[(temp >> 24)][warpThreadIndexSBox] & 0x000000ff) ^
                    rconS[rconIndex++];
                rk1 = rk1 ^ rk0;
                rk2 = rk2 ^ rk1;
                rk3 = rk3 ^ rk2;
                rk4 = rk4 ^ rk3;
                rk5 = rk5 ^ rk4;

                t0 = t0 ^ rk0;
                t1 = t1 ^ rk1;
                t2 = t2 ^ rk2;
                t3 = t3 ^ rk3;
            }

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;
        }

        // Calculate the last round key
        u32 temp = rk5;
        rk0 = rk0 ^
            (t4S[(temp >> 16) & 0xff][warpThreadIndexSBox] & 0xff000000) ^
            (t4S[(temp >> 8) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^
            (t4S[(temp) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^
            (t4S[(temp >> 24)][warpThreadIndexSBox] & 0x000000ff) ^
            rconS[rconIndex];

        // Last round uses s-box directly and XORs to produce output.
        // The output words are computed one at a time and the remaining ones
        // are skipped as soon as a word differs from the target ciphertext.
        s0 = (t4S[t0 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(t1 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(t2 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[(t3) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rk0;
        if (s0 == ctS[0]) {
            rk1 = rk1 ^ rk0;
            s1 = (t4S[t1 >> 24][warpThreadIndexSBox] & 0xFF000000)
                ^ (t4S[(t2 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
                ^ (t4S[(t3 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
                ^ (t4S[(t0) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
                ^ rk1;
            if (s1 == ctS[1]) {
                rk2 = rk2 ^ rk1;
                s2 = (t4S[t2 >> 24][warpThreadIndexSBox] & 0xFF000000)
                    ^ (t4S[(t3 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
                    ^ (t4S[(t0 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
                    ^ (t4S[(t1) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
                    ^ rk2;
                if (s2 == ctS[2]) {
                    rk3 = rk2 ^ rk3;
                    s3 = (t4S[t3 >> 24][warpThreadIndexSBox] & 0xFF000000)
                        ^ (t4S[(t0 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000)
                        ^ (t4S[(t1 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00)
                        ^ (t4S[(t2) & 0xFF][warpThreadIndexSBox] & 0x000000FF)
                        ^ rk3;
                    if (s3 == ctS[3]) {
                        printf("! Found key : %08x %08x %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init, rk4Init, rk5Init);
                        printf("-------------------------------\n");
                    }
                }
            }
        }

        // Next candidate key: increment the low word and carry into rk4Init
        // when it wraps around to zero.
        rk5Init++;
        if (rk5Init == 0) {
            rk4Init++;
        }
    }
}
// AES-192 exhaustive key search, variant that replaces the 32-bit T4 table by
// a byte-wide S-box held in shared memory as Sbox[64][32][4]: entry e is
// stored at Sbox[e / 4][column][e % 4], with the same byte in all 32 columns
// so that each lane reads from column (threadIdx.x & 31).
//
// Every thread encrypts the plaintext under *range consecutive candidate keys
// and prints each key whose ciphertext equals ct. Candidates differ only in
// the last two key words: rk[4]:rk[5] is treated as one 64-bit counter (rk[5]
// is the low word) and thread i starts at that counter plus i * (*range).
//
// Parameters:
//   pt    - plaintext block (words 0..3 are read)
//   ct    - target ciphertext (U32_SIZE words are staged, words 0..3 compared)
//   rk    - starting key (words 0..5 are read)
//   t0G   - T0 table, TABLE_SIZE words
//   t4G   - not read by this variant; kept so the signature matches the
//           T4-based kernel
//   rconG - round constants, RCON_SIZE words
//   range - number of candidate keys each thread tests
//   SAES  - S-box bytes; 256 entries are read
//
// Launch: 1-D grid of 1-D blocks. The shared tables are filled with
// block-strided loops, so the block size does not have to reach TABLE_SIZE.
__global__ void exhaustiveSearch192WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range, u8* SAES) {
    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int warpThreadIndex = threadIdx.x & 31;

    // <SHARED MEMORY>
    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u8 Sbox[64][32][4];
    __shared__ u32 rconS[RCON_SIZE];
    __shared__ u32 ctS[U32_SIZE];

    for (int row = threadIdx.x; row < TABLE_SIZE; row += blockDim.x) {
        u32 t0Value = t0G[row];
        for (int bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[row][bankIndex] = t0Value;
        }
    }
    // Fill all 32 columns of the S-box: the lookups below index the column
    // with warpThreadIndex, which ranges over 0..31, so every column must hold
    // valid data regardless of the value of S_BOX_BANK_SIZE.
    for (int entry = threadIdx.x; entry < 64 * 4; entry += blockDim.x) {
        u8 sboxValue = SAES[entry];
        for (int bankIndex = 0; bankIndex < 32; bankIndex++) {
            Sbox[entry / 4][bankIndex][entry % 4] = sboxValue;
        }
    }
    for (int i = threadIdx.x; i < RCON_SIZE; i += blockDim.x) {
        rconS[i] = rconG[i];
    }
    for (int i = threadIdx.x; i < U32_SIZE; i += blockDim.x) {
        ctS[i] = ct[i];
    }
    // </SHARED MEMORY>

    // Wait until every thread is ready
    __syncthreads();

    u32 rk0Init, rk1Init, rk2Init, rk3Init, rk4Init, rk5Init;
    rk0Init = rk[0]; rk1Init = rk[1]; rk2Init = rk[2]; rk3Init = rk[3]; rk4Init = rk[4]; rk5Init = rk[5];
    u32 pt0Init, pt1Init, pt2Init, pt3Init;
    pt0Init = pt[0]; pt1Init = pt[1]; pt2Init = pt[2]; pt3Init = pt[3];

    // Keep the range in 64 bits (it is stored as u64) and add this thread's
    // start offset to the 64-bit counter rk4Init:rk5Init. The offset is split
    // at bit 32 and the carry out of the low word is propagated, so the
    // per-thread ranges tile the key space without gaps or overlaps.
    u64 threadRange = *range;
    u64 threadRangeStart = (u64)threadIndex * threadRange;
    u64 lowWordSum = (u64)rk5Init + (threadRangeStart & 0xFFFFFFFFULL);
    rk5Init = (u32)lowWordSum;
    rk4Init = rk4Init + (u32)(threadRangeStart >> 32) + (u32)(lowWordSum >> 32);

    for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {
        // Calculate round keys
        u32 rk0, rk1, rk2, rk3, rk4, rk5;
        rk0 = rk0Init; rk1 = rk1Init; rk2 = rk2Init; rk3 = rk3Init; rk4 = rk4Init; rk5 = rk5Init;
        // Create plaintext as 32 bit unsigned integers
        u32 s0, s1, s2, s3;
        s0 = pt0Init; s1 = pt1Init; s2 = pt2Init; s3 = pt3Init;
        // First round just XORs input with key.
        s0 = s0 ^ rk0; s1 = s1 ^ rk1; s2 = s2 ^ rk2; s3 = s3 ^ rk3;
        u32 t0, t1, t2, t3;
        u8 rconIndex = 0;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1_192; roundCount++) {
            // Table based round function
            t0 = t0S[s0 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);
            t1 = t0S[s1 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);
            t2 = t0S[s2 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);
            t3 = t0S[s3 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT);

            // Add round key. The key schedule produces six words per step but
            // a round consumes four, so the position of the round key inside
            // rk0..rk5 repeats with a period of three rounds.
            if (roundCount % 3 == 0) {
                t0 = t0 ^ rk4; t1 = t1 ^ rk5;
                // Calculate round key
                u32 temp = rk5;
                rk0 = rk0 ^
                    arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16) & 0xff) % 4], SHIFT_1_RIGHT) ^
                    arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8) & 0xff) % 4], SHIFT_2_RIGHT) ^
                    arithmeticRightShiftBytePerm((u64)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp) & 0xff) % 4], SHIFT_3_RIGHT) ^
                    ((u64)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^
                    rconS[rconIndex++];
                rk1 = rk1 ^ rk0; rk2 = rk2 ^ rk1; rk3 = rk3 ^ rk2; rk4 = rk4 ^ rk3; rk5 = rk5 ^ rk4;
                t2 = t2 ^ rk0;
                t3 = t3 ^ rk1;
            }
            else if (roundCount % 3 == 1) {
                t0 = t0 ^ rk2; t1 = t1 ^ rk3; t2 = t2 ^ rk4; t3 = t3 ^ rk5;
            }
            else {
                // Calculate round key
                u32 temp = rk5;
                rk0 = rk0 ^
                    arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16) & 0xff) % 4], SHIFT_1_RIGHT) ^
                    arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8) & 0xff) % 4], SHIFT_2_RIGHT) ^
                    arithmeticRightShiftBytePerm((u64)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp) & 0xff) % 4], SHIFT_3_RIGHT) ^
                    ((u64)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^
                    rconS[rconIndex++];
                rk1 = rk1 ^ rk0; rk2 = rk2 ^ rk1; rk3 = rk3 ^ rk2; rk4 = rk4 ^ rk3; rk5 = rk5 ^ rk4;
                t0 = t0 ^ rk0; t1 = t1 ^ rk1; t2 = t2 ^ rk2; t3 = t3 ^ rk3;
            }
            s0 = t0; s1 = t1; s2 = t2; s3 = t3;
        }

        // Calculate the last round key
        u32 temp = rk5;
        rk0 = rk0 ^
            arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16) & 0xff) % 4], SHIFT_1_RIGHT) ^
            arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8) & 0xff) % 4], SHIFT_2_RIGHT) ^
            arithmeticRightShiftBytePerm((u64)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp) & 0xff) % 4], SHIFT_3_RIGHT) ^
            ((u64)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^
            rconS[rconIndex];

        // Last round uses s-box directly and XORs to produce output.
        // The output words are computed one at a time and the remaining ones
        // are skipped as soon as a word differs from the target ciphertext.
        s0 = arithmeticRightShiftBytePerm((u64)Sbox[((t0 >> 24)) / 4][warpThreadIndex][((t0 >> 24)) % 4], SHIFT_1_RIGHT)
            ^ arithmeticRightShiftBytePerm((u64)Sbox[((t1 >> 16) & 0xff) / 4][warpThreadIndex][((t1 >> 16) & 0xff) % 4], SHIFT_2_RIGHT)
            ^ arithmeticRightShiftBytePerm((u64)Sbox[((t2 >> 8) & 0xFF) / 4][warpThreadIndex][((t2 >> 8) & 0xFF) % 4], SHIFT_3_RIGHT)
            ^ ((u64)Sbox[((t3 & 0xFF) / 4)][warpThreadIndex][((t3 & 0xFF) % 4)])
            ^ rk0;
        if (s0 == ctS[0]) {
            rk1 = rk1 ^ rk0;
            s1 = arithmeticRightShiftBytePerm((u64)Sbox[((t1 >> 24)) / 4][warpThreadIndex][((t1 >> 24)) % 4], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm((u64)Sbox[((t2 >> 16) & 0xff) / 4][warpThreadIndex][((t2 >> 16) & 0xff) % 4], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm((u64)Sbox[((t3 >> 8) & 0xFF) / 4][warpThreadIndex][((t3 >> 8) & 0xFF) % 4], SHIFT_3_RIGHT)
                ^ ((u64)Sbox[((t0 & 0xFF) / 4)][warpThreadIndex][((t0 & 0xFF) % 4)])
                ^ rk1;
            if (s1 == ctS[1]) {
                rk2 = rk2 ^ rk1;
                s2 = arithmeticRightShiftBytePerm((u64)Sbox[((t2 >> 24)) / 4][warpThreadIndex][((t2 >> 24)) % 4], SHIFT_1_RIGHT)
                    ^ arithmeticRightShiftBytePerm((u64)Sbox[((t3 >> 16) & 0xff) / 4][warpThreadIndex][((t3 >> 16) & 0xff) % 4], SHIFT_2_RIGHT)
                    ^ arithmeticRightShiftBytePerm((u64)Sbox[((t0 >> 8) & 0xFF) / 4][warpThreadIndex][((t0 >> 8) & 0xFF) % 4], SHIFT_3_RIGHT)
                    ^ ((u64)Sbox[((t1 & 0xFF) / 4)][warpThreadIndex][((t1 & 0xFF) % 4)])
                    ^ rk2;
                if (s2 == ctS[2]) {
                    rk3 = rk2 ^ rk3;
                    s3 = arithmeticRightShiftBytePerm((u64)Sbox[((t3 >> 24)) / 4][warpThreadIndex][((t3 >> 24)) % 4], SHIFT_1_RIGHT)
                        ^ arithmeticRightShiftBytePerm((u64)Sbox[((t0 >> 16) & 0xff) / 4][warpThreadIndex][((t0 >> 16) & 0xff) % 4], SHIFT_2_RIGHT)
                        ^ arithmeticRightShiftBytePerm((u64)Sbox[((t1 >> 8) & 0xFF) / 4][warpThreadIndex][((t1 >> 8) & 0xFF) % 4], SHIFT_3_RIGHT)
                        ^ ((u64)Sbox[((t2 & 0xFF) / 4)][warpThreadIndex][((t2 & 0xFF) % 4)])
                        ^ rk3;
                    if (s3 == ctS[3]) {
                        printf("! Found key : %08x %08x %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init, rk4Init, rk5Init);
                        printf("-------------------------------\n");
                    }
                }
            }
        }

        // Next candidate key: increment the low word and carry into rk4Init
        // when it wraps around to zero.
        rk5Init++;
        if (rk5Init == 0) { rk4Init++; }
    }
}
306 |
// Host driver for the AES-192 exhaustive-search benchmark.
// Places a plaintext/ciphertext pair, the starting key, the round constants and the
// lookup tables in managed memory, launches the search kernel, prints the time measured
// with clock() around the launch, reports the last CUDA error and releases every buffer.
// Always returns 0.
__host__ int main192ExhaustiveSearch() {
	printf("\n"); printf("########## AES-192 Exhaustive Search Implementation ##########\n"); printf("\n");
	// Allocate plaintext, ciphertext and initial round key
	u32 *pt, *ct, *rk192;
	gpuErrorCheck(cudaMallocManaged(&pt, 4 * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&ct, 4 * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&rk192, 6 * sizeof(u32)));
	pt[0] = 0x6bc1bee2U; pt[1] = 0x2e409f96U; pt[2] = 0xe93d7e11U; pt[3] = 0x7393172aU;
	ct[0] = 0xBD334F1DU; ct[1] = 0x6E45F25FU; ct[2] = 0xF712A214U; ct[3] = 0x571FA5CCU;
	// Key the search starts from (six 32-bit words)
	rk192[0] = 0x8e73b0f7U; rk192[1] = 0xda0e6452U; rk192[2] = 0xc810f32bU; rk192[3] = 0x809079e5U; rk192[4] = 0x62f8ead2U; rk192[5] = 0x522c6b70U;
	// Allocate RCON values
	u32* rcon;
	gpuErrorCheck(cudaMallocManaged(&rcon, RCON_SIZE * sizeof(u32)));
	for (int i = 0; i < RCON_SIZE; i++) { rcon[i] = RCON32[i]; }
	// Allocate Tables
	u32 *t0, *t1, *t2, *t3, *t4, *t4_0, *t4_1, *t4_2, *t4_3;
	u8* SAES_d; // byte-wide S-box copy, only consumed by the alternative kernel below
	gpuErrorCheck(cudaMallocManaged(&t0, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t1, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t2, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t3, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4_0, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4_1, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4_2, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4_3, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&SAES_d, 256 * sizeof(u8)));
	for (int i = 0; i < TABLE_SIZE; i++) { t0[i] = T0[i]; t1[i] = T1[i]; t2[i] = T2[i]; t3[i] = T3[i]; t4[i] = T4[i]; t4_0[i] = T4_0[i]; t4_1[i] = T4_1[i]; t4_2[i] = T4_2[i]; t4_3[i] = T4_3[i]; }
	for (int i = 0; i < 256; i++) SAES_d[i] = SAES[i];
	printf("-------------------------------\n");
	// Per-thread search range; the buffer is released with cudaFree below
	u64* range = calculateRange();
	/* printf("Plaintext : %08x %08x %08x %08x\n", pt[0], pt[1], pt[2], pt[3]);
	printf("Ciphertext : %08x %08x %08x %08x\n", ct[0], ct[1], ct[2], ct[3]);
	printf("Initial Key : %08x %08x %08x %08x %08x %08x\n", rk192[0], rk192[1], rk192[2], rk192[3], rk192[4], rk192[5]);
	printf("-------------------------------\n");*/
	clock_t beginTime = clock();
	// Kernels
	// NOTE(review): BLOCKS and THREADS are the project-wide launch dimensions, defined
	// outside this file (presumably in AES_final.h) - confirm.
	exhaustiveSearch192WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<<BLOCKS, THREADS>>>(pt, ct, rk192, t0, t4, rcon, range);
	// exhaustiveSearch192WithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir<<<BLOCKS, THREADS>>>(pt, ct, rk192, t0, t4, rcon, range, SAES_d);

	// The launch is asynchronous: wait for completion before reading the clock
	cudaDeviceSynchronize();
	printf("Time elapsed: %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);
	printf("-------------------------------\n");
	printLastCUDAError();
	// Free allocated arrays (SAES_d included; it was previously leaked)
	cudaFree(range);cudaFree(pt);cudaFree(ct);cudaFree(rk192);cudaFree(t0);cudaFree(t1);cudaFree(t2);cudaFree(t3);cudaFree(t4);
	cudaFree(t4_0);cudaFree(t4_1);cudaFree(t4_2);cudaFree(t4_3);cudaFree(rcon);cudaFree(SAES_d);
	return 0;
}
--------------------------------------------------------------------------------
/128-ctr.cuh:
--------------------------------------------------------------------------------
1 | // System includes
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | // CUDA runtime
8 | #include
9 |
10 | // Helper functions and utilities to work with CUDA
11 | //#include
12 | //#include
13 |
14 | #include
15 | //#include
16 |
17 | // Custom header
18 | //#include "kernel.h"
19 |
20 |
// Builds the round-key schedule from a four-word key.
//   key : input, four 32-bit key words
//   rk  : output, receives 4 + 4 * ROUND_COUNT words (the key itself first, then four
//         freshly derived words per round)
__host__ void keyExpansion(u32* key, u32* rk) {

	// Running state of the schedule: the four most recently produced words
	u32 w0 = key[0];
	u32 w1 = key[1];
	u32 w2 = key[2];
	u32 w3 = key[3];

	// The first four schedule words are the key unchanged
	rk[0] = w0;
	rk[1] = w1;
	rk[2] = w2;
	rk[3] = w3;

	u32* out = rk + 4;
	for (u8 round = 0; round < ROUND_COUNT; round++, out += 4) {
		// Each byte of the last word selects an entry from one of the four T4_x tables
		u32 mixed = T4_3[(w3 >> 16) & 0xff]
			^ T4_2[(w3 >> 8) & 0xff]
			^ T4_1[w3 & 0xff]
			^ T4_0[w3 >> 24];

		// The first word absorbs the table output and the round constant;
		// every following word is chained onto the one before it
		w0 ^= mixed ^ RCON32[round];
		w1 ^= w0;
		w2 ^= w1;
		w3 ^= w2;

		out[0] = w0;
		out[1] = w1;
		out[2] = w2;
		out[3] = w3;
	}
}
48 |
// AES-128 counter-mode benchmark kernel.
//   * t0G is copied into shared memory as a [TABLE_SIZE][SHARED_MEM_BANK_SIZE] array; every
//     thread reads the column selected by (threadIdx.x & 31)
//   * the three remaining lookups of a round reuse that table through
//     arithmeticRightShiftBytePerm
//   * t4G (used by the last round) is copied into only S_BOX_BANK_SIZE columns
// Parameters:
//   pt    : four-word initial block; words 2 and 3 together form the 64-bit counter
//   rk    : expanded key, AES_128_KEY_SIZE_INT words
//   t0G   : round table, TABLE_SIZE words, global memory
//   t4G   : last-round table, TABLE_SIZE words, global memory
//   range : number of consecutive counter values handled by each thread
// Table row i is filled by the thread with threadIdx.x == i, so a block needs at least
// TABLE_SIZE threads. The ciphertext is not stored; only the thread whose global index
// is 1048575 prints its final state.
__global__ void counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* rk, u32* t0G, u32* t4G, u64* range) {

	int globalId = blockIdx.x * blockDim.x + threadIdx.x;
	int lane = threadIdx.x & 31;
	int sboxLane = lane % S_BOX_BANK_SIZE;

	__shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
	__shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
	__shared__ u32 rkS[AES_128_KEY_SIZE_INT];

	// Stage the tables and the round keys: one row (or key word) per thread
	if (threadIdx.x < TABLE_SIZE) {
		u32 roundEntry = t0G[threadIdx.x];
		for (u8 column = 0; column < SHARED_MEM_BANK_SIZE; column++) {
			t0S[threadIdx.x][column] = roundEntry;
		}

		u32 lastEntry = t4G[threadIdx.x];
		for (u8 column = 0; column < S_BOX_BANK_SIZE; column++) {
			t4S[threadIdx.x][column] = lastEntry;
		}

		if (threadIdx.x < AES_128_KEY_SIZE_INT) {
			rkS[threadIdx.x] = rk[threadIdx.x];
		}
	}

	// Nobody may read the shared copies before they are complete
	__syncthreads();

	u32 pt0Init = pt[0];
	u32 pt1Init = pt[1];
	u32 pt2Init, pt3Init;
	u32 s0, s1, s2, s3;

	// This thread starts globalId * threadRange steps after the initial 64-bit counter
	u32 threadRange = *range;
	u64 counter = (((u64)pt[2]) << 32) ^ pt[3];
	counter += (u64)globalId * threadRange;
	pt2Init = counter >> 32;
	pt3Init = counter & 0xFFFFFFFF;

	for (u32 step = 0; step < threadRange; step++) {

		// Load the current block and apply the first four key words in one go
		s0 = pt0Init ^ rkS[0];
		s1 = pt1Init ^ rkS[1];
		s2 = pt2Init ^ rkS[2];
		s3 = pt3Init ^ rkS[3];

		u32 t0, t1, t2, t3;
		for (u8 round = 0; round < ROUND_COUNT_MIN_1; round++) {

			// Key words of this round start at index 4 * round + 4
			u32 keyBase = round * 4 + 4;

			t0 = t0S[s0 >> 24][lane]
				^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][lane], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][lane], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][lane], SHIFT_3_RIGHT)
				^ rkS[keyBase];
			t1 = t0S[s1 >> 24][lane]
				^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][lane], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][lane], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][lane], SHIFT_3_RIGHT)
				^ rkS[keyBase + 1];
			t2 = t0S[s2 >> 24][lane]
				^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][lane], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][lane], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][lane], SHIFT_3_RIGHT)
				^ rkS[keyBase + 2];
			t3 = t0S[s3 >> 24][lane]
				^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][lane], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][lane], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][lane], SHIFT_3_RIGHT)
				^ rkS[keyBase + 3];

			s0 = t0;
			s1 = t1;
			s2 = t2;
			s3 = t3;
		}

		// Last round: one byte is kept from each t4S entry, then key words 40..43 are applied
		s0 = (t4S[t0 >> 24][sboxLane] & 0xFF000000)
			^ (t4S[(t1 >> 16) & 0xff][sboxLane] & 0x00FF0000)
			^ (t4S[(t2 >> 8) & 0xff][sboxLane] & 0x0000FF00)
			^ (t4S[(t3) & 0xFF][sboxLane] & 0x000000FF)
			^ rkS[40];
		s1 = (t4S[t1 >> 24][sboxLane] & 0xFF000000)
			^ (t4S[(t2 >> 16) & 0xff][sboxLane] & 0x00FF0000)
			^ (t4S[(t3 >> 8) & 0xff][sboxLane] & 0x0000FF00)
			^ (t4S[(t0) & 0xFF][sboxLane] & 0x000000FF)
			^ rkS[41];
		s2 = (t4S[t2 >> 24][sboxLane] & 0xFF000000)
			^ (t4S[(t3 >> 16) & 0xff][sboxLane] & 0x00FF0000)
			^ (t4S[(t0 >> 8) & 0xff][sboxLane] & 0x0000FF00)
			^ (t4S[(t1) & 0xFF][sboxLane] & 0x000000FF)
			^ rkS[42];
		s3 = (t4S[t3 >> 24][sboxLane] & 0xFF000000)
			^ (t4S[(t0 >> 16) & 0xff][sboxLane] & 0x00FF0000)
			^ (t4S[(t1 >> 8) & 0xff][sboxLane] & 0x0000FF00)
			^ (t4S[(t2) & 0xFF][sboxLane] & 0x000000FF)
			^ rkS[43];

		// Advance the 64-bit counter, carrying from the low word into the high word
		if (pt3Init == MAX_U32) {
			pt2Init++;
		}
		pt3Init++;
	}

	// Diagnostic output from a single thread; the counter shown has already been advanced
	if (globalId == 1048575) {
		printf("threadIndex : %d\n", globalId);
		printf("Plaintext : %08x %08x %08x %08x\n", pt0Init, pt1Init, pt2Init, pt3Init);
		printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3);
		printf("-------------------------------\n");
	}
}
// AES-128 counter-mode benchmark kernel; the last round reads a byte-wide S-box.
//   * t0G is copied into shared memory as a [TABLE_SIZE][SHARED_MEM_BANK_SIZE] array; every
//     thread reads the column selected by (threadIdx.x & 31)
//   * S-box byte b is stored at Sbox[b / 4][column][b % 4], once per column
// Parameters:
//   pt    : four-word initial block; words 2 and 3 together form the 64-bit counter
//   rk    : expanded key, AES_128_KEY_SIZE_INT words
//   t0G   : round table, TABLE_SIZE words, global memory
//   t4G   : unused by this variant, kept so the signature stays unchanged
//   range : number of consecutive counter values handled by each thread
//   SAES  : S-box bytes in global memory, indexed by threadIdx.x below TABLE_SIZE
// Table row i is filled by the thread with threadIdx.x == i, so a block needs at least
// TABLE_SIZE threads. The ciphertext is not stored; only the thread whose global index
// is 1048575 prints its final state.
__global__ void counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir(u32* pt, u32* rk, u32* t0G, u32* t4G, u64* range, u8* SAES) {
	u64 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	int warpThreadIndex = threadIdx.x & 31;
	__shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
	__shared__ u8 Sbox[64][32][4];
	__shared__ u32 rkS[AES_128_KEY_SIZE_INT];
	// Stage the round table, the S-box and the round keys in shared memory
	if (threadIdx.x < TABLE_SIZE) {
		for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
			t0S[threadIdx.x][bankIndex] = t0G[threadIdx.x];
			Sbox[threadIdx.x / 4][bankIndex][threadIdx.x % 4] = SAES[threadIdx.x];
		}
		if (threadIdx.x < AES_128_KEY_SIZE_INT) { rkS[threadIdx.x] = rk[threadIdx.x]; }
	}
	// Nobody may read the shared copies before they are complete
	__syncthreads();
	u32 pt0Init, pt1Init, pt2Init, pt3Init;
	// Zero-initialised so the diagnostic printf below never reads indeterminate
	// values when *range is 0 and the loop body does not run
	u32 s0 = 0, s1 = 0, s2 = 0, s3 = 0;
	pt0Init = pt[0]; pt1Init = pt[1]; pt2Init = pt[2]; pt3Init = pt[3];
	// This thread starts threadIndex * threadRange steps after the initial 64-bit counter
	u64 threadRange = *range;
	u64 threadRangeStart = pt2Init;
	threadRangeStart = threadRangeStart << 32;
	threadRangeStart ^= pt3Init;
	threadRangeStart += threadIndex * threadRange;
	pt2Init = threadRangeStart >> 32;
	pt3Init = threadRangeStart & 0xFFFFFFFF;
	// The counter has the same width as threadRange: a u32 counter wraps around and
	// the loop never terminates once *range exceeds 2^32 - 1
	for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {
		// Create plaintext as 32 bit unsigned integers
		s0 = pt0Init; s1 = pt1Init; s2 = pt2Init; s3 = pt3Init;
		// First round just XORs input with key.
		s0 = s0 ^ rkS[0]; s1 = s1 ^ rkS[1]; s2 = s2 ^ rkS[2]; s3 = s3 ^ rkS[3];
		u32 t0, t1, t2, t3;
		for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {
			// Table based round function
			u32 rkStart = roundCount * 4 + 4;
			t0 = t0S[s0 >> 24][warpThreadIndex]
				^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
				^ rkS[rkStart];
			t1 = t0S[s1 >> 24][warpThreadIndex]
				^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
				^ rkS[rkStart + 1];
			t2 = t0S[s2 >> 24][warpThreadIndex]
				^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
				^ rkS[rkStart + 2];
			t3 = t0S[s3 >> 24][warpThreadIndex]
				^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
				^ rkS[rkStart + 3];
			s0 = t0; s1 = t1; s2 = t2; s3 = t3;
		}
		// Last round uses s-box directly and XORs to produce output.
		// Each S-box byte is widened to 32 bits and moved into place with
		// arithmeticRightShiftBytePerm before key words 40..43 are applied.
		s0 = arithmeticRightShiftBytePerm((u32)Sbox[((t0 >> 24)) / 4][warpThreadIndex][((t0 >> 24)) % 4], SHIFT_1_RIGHT)
			^ arithmeticRightShiftBytePerm((u32)Sbox[((t1 >> 16) & 0xff) / 4][warpThreadIndex][((t1 >> 16)) % 4], SHIFT_2_RIGHT)
			^ arithmeticRightShiftBytePerm((u32)Sbox[((t2 >> 8) & 0xFF) / 4][warpThreadIndex][((t2 >> 8)) % 4], SHIFT_3_RIGHT)
			^ ((u32)Sbox[((t3 & 0xFF) / 4)][warpThreadIndex][((t3 & 0xFF) % 4)])
			^ rkS[40];
		s1 = arithmeticRightShiftBytePerm((u32)Sbox[((t1 >> 24)) / 4][warpThreadIndex][((t1 >> 24)) % 4], SHIFT_1_RIGHT)
			^ arithmeticRightShiftBytePerm((u32)Sbox[((t2 >> 16) & 0xff) / 4][warpThreadIndex][((t2 >> 16)) % 4], SHIFT_2_RIGHT)
			^ arithmeticRightShiftBytePerm((u32)Sbox[((t3 >> 8) & 0xFF) / 4][warpThreadIndex][((t3 >> 8)) % 4], SHIFT_3_RIGHT)
			^ ((u32)Sbox[((t0 & 0xFF) / 4)][warpThreadIndex][((t0 & 0xFF) % 4)])
			^ rkS[41];
		s2 = arithmeticRightShiftBytePerm((u32)Sbox[((t2 >> 24)) / 4][warpThreadIndex][((t2 >> 24)) % 4], SHIFT_1_RIGHT)
			^ arithmeticRightShiftBytePerm((u32)Sbox[((t3 >> 16) & 0xff) / 4][warpThreadIndex][((t3 >> 16)) % 4], SHIFT_2_RIGHT)
			^ arithmeticRightShiftBytePerm((u32)Sbox[((t0 >> 8) & 0xFF) / 4][warpThreadIndex][((t0 >> 8)) % 4], SHIFT_3_RIGHT)
			^ ((u32)Sbox[((t1 & 0xFF) / 4)][warpThreadIndex][((t1 & 0xFF) % 4)])
			^ rkS[42];
		s3 = arithmeticRightShiftBytePerm((u32)Sbox[((t3 >> 24)) / 4][warpThreadIndex][((t3 >> 24)) % 4], SHIFT_1_RIGHT)
			^ arithmeticRightShiftBytePerm((u32)Sbox[((t0 >> 16) & 0xff) / 4][warpThreadIndex][((t0 >> 16)) % 4], SHIFT_2_RIGHT)
			^ arithmeticRightShiftBytePerm((u32)Sbox[((t1 >> 8) & 0xFF) / 4][warpThreadIndex][((t1 >> 8)) % 4], SHIFT_3_RIGHT)
			^ ((u32)Sbox[((t2 & 0xFF) / 4)][warpThreadIndex][((t2 & 0xFF) % 4)])
			^ rkS[43];
		// Overflow: carry from the low counter word into the high one
		if (pt3Init == MAX_U32) { pt2Init++; }
		pt3Init++;
	}
	// Diagnostic output from a single thread; the counter shown has already been advanced
	if (threadIndex == 1048575) {
		// threadIndex is 64 bits wide, so it needs a 64-bit conversion specifier
		printf("threadIndex : %llu\n", (unsigned long long)threadIndex);
		printf("Plaintext : %08x %08x %08x %08x\n", pt0Init, pt1Init, pt2Init, pt3Init);
		printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3);
		printf("-------------------------------\n");
	}
}
// AES-128 counter-mode benchmark kernel; same structure as the byte-wide S-box variant,
// but every table word is moved into place with arithmeticRightShift(value, 8 / 16 / 24)
// instead of arithmeticRightShiftBytePerm.
//   * t0G is copied into shared memory as a [TABLE_SIZE][SHARED_MEM_BANK_SIZE] array; every
//     thread reads the column selected by (threadIdx.x & 31)
//   * S-box byte b is stored at Sbox[b / 4][column][b % 4], once per column
// Parameters:
//   pt    : four-word initial block; words 2 and 3 together form the 64-bit counter
//   rk    : expanded key, AES_128_KEY_SIZE_INT words
//   t0G   : round table, TABLE_SIZE words, global memory
//   t4G   : unused by this variant, kept so the signature stays unchanged
//   range : number of consecutive counter values handled by each thread
//   SAES  : S-box bytes in global memory, indexed by threadIdx.x below TABLE_SIZE
// Table row i is filled by the thread with threadIdx.x == i, so a block needs at least
// TABLE_SIZE threads. The ciphertext is not stored; only the thread whose global index
// is 1048575 prints its final state.
__global__ void counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir2(u32* pt, u32* rk, u32* t0G, u32* t4G, u64* range, u8* SAES) {
	u64 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	int warpThreadIndex = threadIdx.x & 31;
	__shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
	__shared__ u8 Sbox[64][32][4];
	__shared__ u32 rkS[AES_128_KEY_SIZE_INT];
	// Stage the round table, the S-box and the round keys in shared memory
	if (threadIdx.x < TABLE_SIZE) {
		for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
			t0S[threadIdx.x][bankIndex] = t0G[threadIdx.x];
			Sbox[threadIdx.x / 4][bankIndex][threadIdx.x % 4] = SAES[threadIdx.x];
		}
		if (threadIdx.x < AES_128_KEY_SIZE_INT) { rkS[threadIdx.x] = rk[threadIdx.x]; }
	}
	// Nobody may read the shared copies before they are complete
	__syncthreads();
	u32 pt0Init, pt1Init, pt2Init, pt3Init;
	// Zero-initialised so the diagnostic printf below never reads indeterminate
	// values when *range is 0 and the loop body does not run
	u32 s0 = 0, s1 = 0, s2 = 0, s3 = 0;
	pt0Init = pt[0]; pt1Init = pt[1]; pt2Init = pt[2]; pt3Init = pt[3];
	// This thread starts threadIndex * threadRange steps after the initial 64-bit counter
	u64 threadRange = *range;
	u64 threadRangeStart = pt2Init;
	threadRangeStart = threadRangeStart << 32;
	threadRangeStart ^= pt3Init;
	threadRangeStart += threadIndex * threadRange;
	pt2Init = threadRangeStart >> 32;
	pt3Init = threadRangeStart & 0xFFFFFFFF;
	// The counter has the same width as threadRange: a u32 counter wraps around and
	// the loop never terminates once *range exceeds 2^32 - 1
	for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {
		// Create plaintext as 32 bit unsigned integers
		s0 = pt0Init; s1 = pt1Init; s2 = pt2Init; s3 = pt3Init;
		// First round just XORs input with key.
		s0 = s0 ^ rkS[0]; s1 = s1 ^ rkS[1]; s2 = s2 ^ rkS[2]; s3 = s3 ^ rkS[3];
		u32 t0, t1, t2, t3;
		for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {
			// Table based round function
			u32 rkStart = roundCount * 4 + 4;
			t0 = t0S[s0 >> 24][warpThreadIndex]
				^ arithmeticRightShift(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], 8)
				^ arithmeticRightShift(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], 16)
				^ arithmeticRightShift(t0S[s3 & 0xFF][warpThreadIndex], 24)
				^ rkS[rkStart];
			t1 = t0S[s1 >> 24][warpThreadIndex]
				^ arithmeticRightShift(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], 8)
				^ arithmeticRightShift(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], 16)
				^ arithmeticRightShift(t0S[s0 & 0xFF][warpThreadIndex], 24)
				^ rkS[rkStart + 1];
			t2 = t0S[s2 >> 24][warpThreadIndex]
				^ arithmeticRightShift(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], 8)
				^ arithmeticRightShift(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], 16)
				^ arithmeticRightShift(t0S[s1 & 0xFF][warpThreadIndex], 24)
				^ rkS[rkStart + 2];
			t3 = t0S[s3 >> 24][warpThreadIndex]
				^ arithmeticRightShift(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], 8)
				^ arithmeticRightShift(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], 16)
				^ arithmeticRightShift(t0S[s2 & 0xFF][warpThreadIndex], 24)
				^ rkS[rkStart + 3];
			s0 = t0; s1 = t1; s2 = t2; s3 = t3;
		}
		// Last round uses s-box directly and XORs to produce output.
		// Each S-box byte is widened and moved into place with arithmeticRightShift
		// before key words 40..43 are applied.
		s0 = arithmeticRightShift((u64)Sbox[((t0 >> 24)) / 4][warpThreadIndex][((t0 >> 24)) % 4], 8)
			^ arithmeticRightShift((u64)Sbox[((t1 >> 16) & 0xff) / 4][warpThreadIndex][((t1 >> 16)) % 4], 16)
			^ arithmeticRightShift((u64)Sbox[((t2 >> 8) & 0xFF) / 4][warpThreadIndex][((t2 >> 8)) % 4], 24)
			^ ((u64)Sbox[((t3 & 0xFF) / 4)][warpThreadIndex][((t3 & 0xFF) % 4)])
			^ rkS[40];
		s1 = arithmeticRightShift((u64)Sbox[((t1 >> 24)) / 4][warpThreadIndex][((t1 >> 24)) % 4], 8)
			^ arithmeticRightShift((u64)Sbox[((t2 >> 16) & 0xff) / 4][warpThreadIndex][((t2 >> 16)) % 4], 16)
			^ arithmeticRightShift((u64)Sbox[((t3 >> 8) & 0xFF) / 4][warpThreadIndex][((t3 >> 8)) % 4], 24)
			^ ((u64)Sbox[((t0 & 0xFF) / 4)][warpThreadIndex][((t0 & 0xFF) % 4)])
			^ rkS[41];
		s2 = arithmeticRightShift((u64)Sbox[((t2 >> 24)) / 4][warpThreadIndex][((t2 >> 24)) % 4], 8)
			^ arithmeticRightShift((u64)Sbox[((t3 >> 16) & 0xff) / 4][warpThreadIndex][((t3 >> 16)) % 4], 16)
			^ arithmeticRightShift((u64)Sbox[((t0 >> 8) & 0xFF) / 4][warpThreadIndex][((t0 >> 8)) % 4], 24)
			^ ((u64)Sbox[((t1 & 0xFF) / 4)][warpThreadIndex][((t1 & 0xFF) % 4)])
			^ rkS[42];
		s3 = arithmeticRightShift((u64)Sbox[((t3 >> 24)) / 4][warpThreadIndex][((t3 >> 24)) % 4], 8)
			^ arithmeticRightShift((u64)Sbox[((t0 >> 16) & 0xff) / 4][warpThreadIndex][((t0 >> 16)) % 4], 16)
			^ arithmeticRightShift((u64)Sbox[((t1 >> 8) & 0xFF) / 4][warpThreadIndex][((t1 >> 8)) % 4], 24)
			^ ((u64)Sbox[((t2 & 0xFF) / 4)][warpThreadIndex][((t2 & 0xFF) % 4)])
			^ rkS[43];
		// Overflow: carry from the low counter word into the high one
		if (pt3Init == MAX_U32) { pt2Init++; }
		pt3Init++;
	}
	// Diagnostic output from a single thread; the counter shown has already been advanced
	if (threadIndex == 1048575) {
		// %I64d is an MSVC-only length modifier; %llu with an explicit cast is portable
		printf("threadIndex : %llu\n", (unsigned long long)threadIndex);
		printf("Plaintext : %08x %08x %08x %08x\n", pt0Init, pt1Init, pt2Init, pt3Init);
		printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3);
		printf("-------------------------------\n");
	}
}
// AES-128 counter-mode benchmark kernel using four separate last-round tables.
//   * t0G is copied into shared memory as a [TABLE_SIZE][SHARED_MEM_BANK_SIZE] array; every
//     thread reads the column selected by (threadIdx.x & 31)
//   * the three remaining lookups of a round reuse that table through
//     arithmeticRightShiftBytePerm
//   * t4_0G..t4_3G are copied into shared memory once each (no extra columns) and are
//     combined in the last round without masking
// Parameters:
//   pt          : four-word initial block; words 2 and 3 together form the 64-bit counter
//   rk          : expanded key, AES_128_KEY_SIZE_INT words
//   t0G         : round table, TABLE_SIZE words, global memory
//   t4_0G-t4_3G : last-round tables, TABLE_SIZE words each, global memory
//   range       : number of consecutive counter values handled by each thread
// Table row i is filled by the thread with threadIdx.x == i, so a block needs at least
// TABLE_SIZE threads. The ciphertext is not stored; only the thread whose global index
// is 1048575 prints its final state.
__global__ void counterWithOneTableExtendedSharedMemoryBytePerm4ShiftedSbox(u32* pt, u32* rk, u32* t0G, u32* t4_0G, u32* t4_1G, u32* t4_2G, u32* t4_3G, u64* range) {

	u32 globalId = blockIdx.x * blockDim.x + threadIdx.x;
	int lane = threadIdx.x & 31;

	__shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
	__shared__ u32 t4_0S[TABLE_SIZE];
	__shared__ u32 t4_1S[TABLE_SIZE];
	__shared__ u32 t4_2S[TABLE_SIZE];
	__shared__ u32 t4_3S[TABLE_SIZE];
	__shared__ u32 rkS[AES_128_KEY_SIZE_INT];

	// Stage the tables and the round keys: one row (or key word) per thread
	if (threadIdx.x < TABLE_SIZE) {
		u32 roundEntry = t0G[threadIdx.x];
		for (u8 column = 0; column < SHARED_MEM_BANK_SIZE; column++) {
			t0S[threadIdx.x][column] = roundEntry;
		}

		t4_0S[threadIdx.x] = t4_0G[threadIdx.x];
		t4_1S[threadIdx.x] = t4_1G[threadIdx.x];
		t4_2S[threadIdx.x] = t4_2G[threadIdx.x];
		t4_3S[threadIdx.x] = t4_3G[threadIdx.x];

		if (threadIdx.x < AES_128_KEY_SIZE_INT) {
			rkS[threadIdx.x] = rk[threadIdx.x];
		}
	}

	// Nobody may read the shared copies before they are complete
	__syncthreads();

	u32 pt0Init = pt[0];
	u32 pt1Init = pt[1];
	u32 pt2Init, pt3Init;
	u32 s0, s1, s2, s3;

	// This thread starts globalId * threadRange steps after the initial 64-bit counter
	u32 threadRange = *range;
	u64 counter = (((u64)pt[2]) << 32) ^ pt[3];
	counter += (u64)globalId * threadRange;
	pt2Init = counter >> 32;
	pt3Init = counter & 0xFFFFFFFF;

	for (u32 step = 0; step < threadRange; step++) {

		// Load the current block and apply the first four key words in one go
		s0 = pt0Init ^ rkS[0];
		s1 = pt1Init ^ rkS[1];
		s2 = pt2Init ^ rkS[2];
		s3 = pt3Init ^ rkS[3];

		u32 t0, t1, t2, t3;
		for (u8 round = 0; round < ROUND_COUNT_MIN_1; round++) {

			// Key words of this round start at index 4 * round + 4
			u32 keyBase = round * 4 + 4;

			t0 = t0S[s0 >> 24][lane]
				^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][lane], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][lane], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][lane], SHIFT_3_RIGHT)
				^ rkS[keyBase];
			t1 = t0S[s1 >> 24][lane]
				^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][lane], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][lane], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][lane], SHIFT_3_RIGHT)
				^ rkS[keyBase + 1];
			t2 = t0S[s2 >> 24][lane]
				^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][lane], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][lane], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][lane], SHIFT_3_RIGHT)
				^ rkS[keyBase + 2];
			t3 = t0S[s3 >> 24][lane]
				^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][lane], SHIFT_1_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][lane], SHIFT_2_RIGHT)
				^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][lane], SHIFT_3_RIGHT)
				^ rkS[keyBase + 3];

			s0 = t0;
			s1 = t1;
			s2 = t2;
			s3 = t3;
		}

		// Last round: one lookup in each of the four t4_x tables, then key words 40..43
		s0 = t4_3S[t0 >> 24] ^ t4_2S[(t1 >> 16) & 0xff] ^ t4_1S[(t2 >> 8) & 0xff] ^ t4_0S[(t3) & 0xFF] ^ rkS[40];
		s1 = t4_3S[t1 >> 24] ^ t4_2S[(t2 >> 16) & 0xff] ^ t4_1S[(t3 >> 8) & 0xff] ^ t4_0S[(t0) & 0xFF] ^ rkS[41];
		s2 = t4_3S[t2 >> 24] ^ t4_2S[(t3 >> 16) & 0xff] ^ t4_1S[(t0 >> 8) & 0xff] ^ t4_0S[(t1) & 0xFF] ^ rkS[42];
		s3 = t4_3S[t3 >> 24] ^ t4_2S[(t0 >> 16) & 0xff] ^ t4_1S[(t1 >> 8) & 0xff] ^ t4_0S[(t2) & 0xFF] ^ rkS[43];

		// Advance the 64-bit counter, carrying from the low word into the high word
		if (pt3Init == MAX_U32) {
			pt2Init++;
		}
		pt3Init++;
	}

	// Diagnostic output from a single thread; the counter shown has already been advanced
	if (globalId == 1048575) {
		printf("Plaintext : %08x %08x %08x %08x\n", pt0Init, pt1Init, pt2Init, pt3Init);
		printf("Ciphertext : %08x %08x %08x %08x\n", s0, s1, s2, s3);
		printf("-------------------------------\n");
	}
}
384 |
// Host driver for the AES-128 counter-mode benchmark.
// Places the initial counter block, the key, the expanded round keys, the round constants
// and the lookup tables in managed memory, launches the selected counter-mode kernel,
// prints the time measured with clock() around the launch, reports the last CUDA error
// and releases every buffer.
// Always returns 0.
__host__ int main128Ctr() {
	printf("\n");
	printf("########## AES-128 Counter Mode Implementation ##########\n");
	printf("\n");

	// Allocate plaintext and every round key
	u32 *pt, *rk, *roundKeys;
	gpuErrorCheck(cudaMallocManaged(&pt, 4 * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&rk, 4 * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&roundKeys, AES_128_KEY_SIZE_INT * sizeof(u32)));

	// Initial block; the kernels treat words 2 and 3 as a 64-bit counter
	pt[0] = 0x3243F6A8U;
	pt[1] = 0x885A308DU;
	pt[2] = 0x313198A2U;
	pt[3] = 0x00000000U;

	rk[0] = 0x2B7E1516U;
	rk[1] = 0x28AED2A6U;
	rk[2] = 0xABF71588U;
	rk[3] = 0x09CF4F3CU;

	// Allocate RCON values
	u32* rcon;
	gpuErrorCheck(cudaMallocManaged(&rcon, RCON_SIZE * sizeof(u32)));
	for (int i = 0; i < RCON_SIZE; i++) {
		rcon[i] = RCON32[i];
	}

	// Allocate Tables
	u32 *t0, *t1, *t2, *t3, *t4, *t4_0, *t4_1, *t4_2, *t4_3;
	u8* SAES_d; // byte-wide S-box copy consumed by the kernel launched below
	gpuErrorCheck(cudaMallocManaged(&t0, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t1, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t2, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t3, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4_0, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4_1, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4_2, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&t4_3, TABLE_SIZE * sizeof(u32)));
	gpuErrorCheck(cudaMallocManaged(&SAES_d, 256 * sizeof(u8)));
	for (int i = 0; i < TABLE_SIZE; i++) {
		t0[i] = T0[i];
		t1[i] = T1[i];
		t2[i] = T2[i];
		t3[i] = T3[i];
		t4[i] = T4[i];
		t4_0[i] = T4_0[i];
		t4_1[i] = T4_1[i];
		t4_2[i] = T4_2[i];
		t4_3[i] = T4_3[i];
	}
	for (int i = 0; i < 256; i++) SAES_d[i] = SAES[i];
	printf("-------------------------------\n");
	// Per-thread counter range; the buffer is released with cudaFree below
	u64* range = calculateRange();
	/* printf("Initial Plaintext : %08x %08x %08x %08x\n", pt[0], pt[1], pt[2], pt[3]);
	printf("Initial Key : %08x %08x %08x %08x\n", rk[0], rk[1], rk[2], rk[3]);
	printf("-------------------------------\n");*/

	// Key expansion
	keyExpansion(rk, roundKeys);

	clock_t beginTime = clock();
	// Kernels
	// NOTE(review): BLOCKS and THREADS are the project-wide launch dimensions, defined
	// outside this file (presumably in AES_final.h) - confirm.
	// counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<<BLOCKS, THREADS>>>(pt, roundKeys, t0, t4, range);
	counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir<<<BLOCKS, THREADS>>>(pt, roundKeys, t0, t4, range, SAES_d);
	// counterWithOneTableExtendedSharedMemoryBytePerm4ShiftedSbox<<<BLOCKS, THREADS>>>(pt, roundKeys, t0, t4_0, t4_1, t4_2, t4_3, range);

	// The launch is asynchronous: wait for completion before reading the clock
	cudaDeviceSynchronize();
	printf("Time elapsed: %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);
	printf("-------------------------------\n");
	printLastCUDAError();
	// Print the plaintext block itself; this line used to print the key words rk[0..3]
	// under the "plaintext" label
	printf("plaintext: %x %x %x %x\n", pt[0], pt[1], pt[2], pt[3]);

	// Free allocated arrays (SAES_d included; it was previously leaked)
	cudaFree(range);
	cudaFree(pt);
	cudaFree(rk);
	cudaFree(roundKeys);
	cudaFree(t0);
	cudaFree(t1);
	cudaFree(t2);
	cudaFree(t3);
	cudaFree(t4);
	cudaFree(t4_0);
	cudaFree(t4_1);
	cudaFree(t4_2);
	cudaFree(t4_3);
	cudaFree(rcon);
	cudaFree(SAES_d);

	return 0;
}
--------------------------------------------------------------------------------
/file-encryption.cuh:
--------------------------------------------------------------------------------
1 | // System includes
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | // CUDA runtime
8 | #include
9 |
10 | // Helper functions and utilities to work with CUDA
11 | //#include
12 | //#include
13 |
14 | #include
15 | //#include
16 |
17 | // Custom header
18 | //#include "kernel.h"
19 |
20 | #include
21 | #include
22 | #include
23 |
24 | __device__ u32 fileEncryptionTotalG = 0;
25 |
26 | // CTR encryption with one table extended as 32 columns
// 1 Table [256][32] -> arithmetic shift: __byte_perm function
// SBox[256] is partly expanded
//
// AES-128 CTR keystream kernel.
// Output block i (words ct[4*i .. 4*i+3]) is the AES-128 encryption of the counter
// block (pt[0], pt[1], C >> 32, C & 0xFFFFFFFF) with C = ((u64)pt[2] << 32 | pt[3]) + i.
// The carry out of the low word goes into pt[2] only; pt[0] and pt[1] never change.
//
// Parameters:
//   pt               - 4 words, initial counter block
//   ct               - output buffer, at least 4 * (*encryptionCountG) words
//   rk               - AES_128_KEY_SIZE_INT expanded round-key words
//   t0G              - TABLE_SIZE-entry T0 table
//   t4G              - TABLE_SIZE-entry T4 table (S-box byte replicated in every byte lane)
//   encryptionCountG - number of 16-byte blocks to produce
//   threadCountG     - distance (in blocks) between two blocks handled by the same
//                      thread; must be non-zero and is expected to equal the number of
//                      launched threads so that every block is produced exactly once
//
// Static shared memory: t0S (TABLE_SIZE x SHARED_MEM_BANK_SIZE words), t4S
// (TABLE_SIZE x S_BOX_BANK_SIZE words) and the round keys. Any 1D block size works.
__global__ void fileEncryption128counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G,
    u32* encryptionCountG, u32* threadCountG) {

    const u32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const u32 warpThreadIndex = threadIdx.x & 31;
    const u32 warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE;

    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
    __shared__ u32 rkS[AES_128_KEY_SIZE_INT];

    // Block-stride fills: every table row is replicated across its columns. Striding by
    // blockDim.x keeps the load correct for blocks with fewer than TABLE_SIZE threads.
    for (u32 cell = threadIdx.x; cell < TABLE_SIZE * SHARED_MEM_BANK_SIZE; cell += blockDim.x) {
        t0S[cell / SHARED_MEM_BANK_SIZE][cell % SHARED_MEM_BANK_SIZE] = t0G[cell / SHARED_MEM_BANK_SIZE];
    }
    for (u32 cell = threadIdx.x; cell < TABLE_SIZE * S_BOX_BANK_SIZE; cell += blockDim.x) {
        t4S[cell / S_BOX_BANK_SIZE][cell % S_BOX_BANK_SIZE] = t4G[cell / S_BOX_BANK_SIZE];
    }
    for (u32 word = threadIdx.x; word < AES_128_KEY_SIZE_INT; word += blockDim.x) {
        rkS[word] = rk[word];
    }

    // Every thread reaches this barrier before any thread may leave the kernel
    __syncthreads();

    const u32 pt0Init = pt[0];
    const u32 pt1Init = pt[1];
    const u64 counterStart = ((u64)pt[2] << 32) | pt[3];
    const u64 encryptionCount = *encryptionCountG;
    const u64 threadCount = *threadCountG;

    // Index of the first word of the final round key
    const u32 lastRoundKey = AES_128_KEY_SIZE_INT - 4;

    // The loop is bounded by the 64-bit block index rather than by comparing the two
    // counter halves separately, so it stays correct when the low counter word wraps.
    for (u64 blockIndex = threadIndex; blockIndex < encryptionCount; blockIndex += threadCount) {

        const u64 counter = counterStart + blockIndex;

        // First round just XORs input with key.
        u32 s0 = pt0Init ^ rkS[0];
        u32 s1 = pt1Init ^ rkS[1];
        u32 s2 = (u32)(counter >> 32) ^ rkS[2];
        u32 s3 = (u32)counter ^ rkS[3];

        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {

            // Table based round function; T1..T3 are derived from T0 by byte rotation
            const u32 rkStart = roundCount * 4 + 4;
            const u32 t0 = t0S[s0 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart];
            const u32 t1 = t0S[s1 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 1];
            const u32 t2 = t0S[s2 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 2];
            const u32 t3 = t0S[s3 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 3];

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;
        }

        // Last round uses s-box directly and XORs with the last round key to produce output.
        const u32 c0 = (t4S[s0 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(s1 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(s2 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[s3 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[lastRoundKey];
        const u32 c1 = (t4S[s1 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(s2 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(s3 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[s0 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[lastRoundKey + 1];
        const u32 c2 = (t4S[s2 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(s3 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(s0 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[s1 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[lastRoundKey + 2];
        const u32 c3 = (t4S[s3 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(s0 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(s1 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[s2 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[lastRoundKey + 3];

        // Store the keystream block; the index is 64-bit so large outputs do not overflow
        const u64 ctIndex = blockIndex * 4;
        ct[ctIndex] = c0;
        ct[ctIndex + 1] = c1;
        ct[ctIndex + 2] = c2;
        ct[ctIndex + 3] = c3;
    }
}
172 |
173 |
// CTR encryption with one table extended as 32 columns
// 1 Table [256][32] -> arithmetic shift: __byte_perm function
// SBox[256] is partly expanded
//
// AES-192 CTR keystream kernel.
// Output block i (words ct[4*i .. 4*i+3]) is the AES-192 encryption of the counter
// block (pt[0], pt[1], C >> 32, C & 0xFFFFFFFF) with C = ((u64)pt[2] << 32 | pt[3]) + i.
// The carry out of the low word goes into pt[2] only; pt[0] and pt[1] never change.
//
// Parameters:
//   pt               - 4 words, initial counter block
//   ct               - output buffer, at least 4 * (*encryptionCountG) words
//   rk               - AES_192_KEY_SIZE_INT expanded round-key words
//   t0G              - TABLE_SIZE-entry T0 table
//   t4G              - TABLE_SIZE-entry T4 table (S-box byte replicated in every byte lane)
//   encryptionCountG - number of 16-byte blocks to produce
//   threadCountG     - distance (in blocks) between two blocks handled by the same
//                      thread; must be non-zero and is expected to equal the number of
//                      launched threads so that every block is produced exactly once
//
// Static shared memory: t0S (TABLE_SIZE x SHARED_MEM_BANK_SIZE words), t4S
// (TABLE_SIZE x S_BOX_BANK_SIZE words) and the round keys. Any 1D block size works.
__global__ void fileEncryption192counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G,
    u32* encryptionCountG, u32* threadCountG) {

    const u32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const u32 warpThreadIndex = threadIdx.x & 31;
    const u32 warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE;

    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
    __shared__ u32 rkS[AES_192_KEY_SIZE_INT];

    // Block-stride fills: every table row is replicated across its columns. Striding by
    // blockDim.x keeps the load correct for blocks with fewer than TABLE_SIZE threads.
    for (u32 cell = threadIdx.x; cell < TABLE_SIZE * SHARED_MEM_BANK_SIZE; cell += blockDim.x) {
        t0S[cell / SHARED_MEM_BANK_SIZE][cell % SHARED_MEM_BANK_SIZE] = t0G[cell / SHARED_MEM_BANK_SIZE];
    }
    for (u32 cell = threadIdx.x; cell < TABLE_SIZE * S_BOX_BANK_SIZE; cell += blockDim.x) {
        t4S[cell / S_BOX_BANK_SIZE][cell % S_BOX_BANK_SIZE] = t4G[cell / S_BOX_BANK_SIZE];
    }
    for (u32 word = threadIdx.x; word < AES_192_KEY_SIZE_INT; word += blockDim.x) {
        rkS[word] = rk[word];
    }

    // Every thread reaches this barrier before any thread may leave the kernel
    __syncthreads();

    const u32 pt0Init = pt[0];
    const u32 pt1Init = pt[1];
    const u64 counterStart = ((u64)pt[2] << 32) | pt[3];
    const u64 encryptionCount = *encryptionCountG;
    const u64 threadCount = *threadCountG;

    // Index of the first word of the final round key
    const u32 lastRoundKey = AES_192_KEY_SIZE_INT - 4;

    // The loop is bounded by the 64-bit block index rather than by comparing the two
    // counter halves separately, so it stays correct when the low counter word wraps.
    for (u64 blockIndex = threadIndex; blockIndex < encryptionCount; blockIndex += threadCount) {

        const u64 counter = counterStart + blockIndex;

        // First round just XORs input with key.
        u32 s0 = pt0Init ^ rkS[0];
        u32 s1 = pt1Init ^ rkS[1];
        u32 s2 = (u32)(counter >> 32) ^ rkS[2];
        u32 s3 = (u32)counter ^ rkS[3];

        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1_192; roundCount++) {

            // Table based round function; T1..T3 are derived from T0 by byte rotation
            const u32 rkStart = roundCount * 4 + 4;
            const u32 t0 = t0S[s0 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart];
            const u32 t1 = t0S[s1 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 1];
            const u32 t2 = t0S[s2 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 2];
            const u32 t3 = t0S[s3 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 3];

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;
        }

        // Last round uses s-box directly and XORs with the last round key to produce output.
        const u32 c0 = (t4S[s0 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(s1 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(s2 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[s3 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[lastRoundKey];
        const u32 c1 = (t4S[s1 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(s2 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(s3 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[s0 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[lastRoundKey + 1];
        const u32 c2 = (t4S[s2 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(s3 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(s0 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[s1 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[lastRoundKey + 2];
        const u32 c3 = (t4S[s3 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(s0 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(s1 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[s2 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[lastRoundKey + 3];

        // Store the keystream block; the index is 64-bit so large outputs do not overflow
        const u64 ctIndex = blockIndex * 4;
        ct[ctIndex] = c0;
        ct[ctIndex + 1] = c1;
        ct[ctIndex + 2] = c2;
        ct[ctIndex + 3] = c3;
    }
}
305 |
306 |
// CTR encryption with one table extended as 32 columns
// 1 Table [256][32] -> arithmetic shift: __byte_perm function
// SBox[256] is partly expanded
//
// AES-256 CTR keystream kernel.
// Output block i (words ct[4*i .. 4*i+3]) is the AES-256 encryption of the counter
// block (pt[0], pt[1], C >> 32, C & 0xFFFFFFFF) with C = ((u64)pt[2] << 32 | pt[3]) + i.
// The carry out of the low word goes into pt[2] only; pt[0] and pt[1] never change.
//
// Parameters:
//   pt               - 4 words, initial counter block
//   ct               - output buffer, at least 4 * (*encryptionCountG) words
//   rk               - AES_256_KEY_SIZE_INT expanded round-key words
//   t0G              - TABLE_SIZE-entry T0 table
//   t4G              - TABLE_SIZE-entry T4 table (S-box byte replicated in every byte lane)
//   encryptionCountG - number of 16-byte blocks to produce
//   threadCountG     - distance (in blocks) between two blocks handled by the same
//                      thread; must be non-zero and is expected to equal the number of
//                      launched threads so that every block is produced exactly once
//
// Static shared memory: t0S (TABLE_SIZE x SHARED_MEM_BANK_SIZE words), t4S
// (TABLE_SIZE x S_BOX_BANK_SIZE words) and the round keys. Any 1D block size works.
__global__ void fileEncryption256counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G,
    u32* encryptionCountG, u32* threadCountG) {

    const u32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const u32 warpThreadIndex = threadIdx.x & 31;
    const u32 warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE;

    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
    __shared__ u32 rkS[AES_256_KEY_SIZE_INT];

    // Block-stride fills: every table row is replicated across its columns. Striding by
    // blockDim.x keeps the load correct for blocks with fewer than TABLE_SIZE threads.
    for (u32 cell = threadIdx.x; cell < TABLE_SIZE * SHARED_MEM_BANK_SIZE; cell += blockDim.x) {
        t0S[cell / SHARED_MEM_BANK_SIZE][cell % SHARED_MEM_BANK_SIZE] = t0G[cell / SHARED_MEM_BANK_SIZE];
    }
    for (u32 cell = threadIdx.x; cell < TABLE_SIZE * S_BOX_BANK_SIZE; cell += blockDim.x) {
        t4S[cell / S_BOX_BANK_SIZE][cell % S_BOX_BANK_SIZE] = t4G[cell / S_BOX_BANK_SIZE];
    }
    for (u32 word = threadIdx.x; word < AES_256_KEY_SIZE_INT; word += blockDim.x) {
        rkS[word] = rk[word];
    }

    // Every thread reaches this barrier before any thread may leave the kernel
    __syncthreads();

    const u32 pt0Init = pt[0];
    const u32 pt1Init = pt[1];
    const u64 counterStart = ((u64)pt[2] << 32) | pt[3];
    const u64 encryptionCount = *encryptionCountG;
    const u64 threadCount = *threadCountG;

    // Index of the first word of the final round key
    const u32 lastRoundKey = AES_256_KEY_SIZE_INT - 4;

    // The loop is bounded by the 64-bit block index rather than by comparing the two
    // counter halves separately, so it stays correct when the low counter word wraps.
    for (u64 blockIndex = threadIndex; blockIndex < encryptionCount; blockIndex += threadCount) {

        const u64 counter = counterStart + blockIndex;

        // First round just XORs input with key.
        u32 s0 = pt0Init ^ rkS[0];
        u32 s1 = pt1Init ^ rkS[1];
        u32 s2 = (u32)(counter >> 32) ^ rkS[2];
        u32 s3 = (u32)counter ^ rkS[3];

        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1_256; roundCount++) {

            // Table based round function; T1..T3 are derived from T0 by byte rotation
            const u32 rkStart = roundCount * 4 + 4;
            const u32 t0 = t0S[s0 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart];
            const u32 t1 = t0S[s1 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 1];
            const u32 t2 = t0S[s2 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 2];
            const u32 t3 = t0S[s3 >> 24][warpThreadIndex]
                ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT)
                ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT)
                ^ rkS[rkStart + 3];

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;
        }

        // Last round uses s-box directly and XORs with the last round key to produce output.
        const u32 c0 = (t4S[s0 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(s1 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(s2 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[s3 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[lastRoundKey];
        const u32 c1 = (t4S[s1 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(s2 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(s3 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[s0 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[lastRoundKey + 1];
        const u32 c2 = (t4S[s2 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(s3 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(s0 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[s1 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[lastRoundKey + 2];
        const u32 c3 = (t4S[s3 >> 24][warpThreadIndexSBox] & 0xFF000000)
            ^ (t4S[(s0 >> 16) & 0xFF][warpThreadIndexSBox] & 0x00FF0000)
            ^ (t4S[(s1 >> 8) & 0xFF][warpThreadIndexSBox] & 0x0000FF00)
            ^ (t4S[s2 & 0xFF][warpThreadIndexSBox] & 0x000000FF)
            ^ rkS[lastRoundKey + 3];

        // Store the keystream block; the index is 64-bit so large outputs do not overflow
        const u64 ctIndex = blockIndex * 4;
        ct[ctIndex] = c0;
        ct[ctIndex + 1] = c1;
        ct[ctIndex + 2] = c2;
        ct[ctIndex + 3] = c3;
    }
}
438 |
439 |
// Reports a failed CUDA runtime call through gpuAssert and returns true only on success.
static bool fileEncryptionCudaOk(cudaError_t code, const char *file, int line) {
    gpuAssert(code, file, line);
    return code == cudaSuccess;
}

// XORs everything that is left in `fileIn` with `keystream` and writes the result to
// `fileOut`. Input bytes are grouped into big-endian 32-bit words; word k is XORed with
// keystream[k]. A trailing group of 1-3 bytes uses the most significant bytes of its
// keystream word. `chunkSize` must be a multiple of U32_SIZE so that only the final
// chunk can end in a partial word; `keystream` must hold one word per (partial) input word.
static void fileEncryptionXorStream(std::fstream &fileIn, std::fstream &fileOut, const u32 *keystream, int chunkSize) {
    char *buffer = new char[chunkSize];
    size_t keystreamIndex = 0;
    for (;;) {
        fileIn.read(buffer, chunkSize);
        // gcount() is chunkSize after a full read and the short count at end of file
        const long readByte = (long)fileIn.gcount();
        for (long wordStart = 0; wordStart < readByte; wordStart += U32_SIZE) {
            long wordLen = readByte - wordStart;
            if (wordLen > U32_SIZE) {
                wordLen = U32_SIZE;
            }
            // Pack the bytes into a word; go through unsigned char so no sign bits leak in
            u32 word = 0;
            for (long byteIndex = 0; byteIndex < wordLen; byteIndex++) {
                word |= (u32)(unsigned char)buffer[wordStart + byteIndex] << ((U32_SIZE - 1 - byteIndex) * 8);
            }
            // XOR with ciphertext
            word ^= keystream[keystreamIndex++];
            // Unpack back into the buffer
            for (long byteIndex = 0; byteIndex < wordLen; byteIndex++) {
                buffer[wordStart + byteIndex] = (char)(word >> ((U32_SIZE - 1 - byteIndex) * 8));
            }
        }
        // Write buffer to output file
        fileOut.write(buffer, readByte);
        // A short read means the end of the input was reached
        if (readByte < chunkSize) {
            break;
        }
    }
    delete[] buffer;
}

// Encrypts the hard-coded input file in AES-CTR mode: the keystream for the whole file is
// generated on the GPU, copied to the host and XORed with the file contents, and the
// result is written to "<input path>_ENC".
// Returns 0 on success and also when the input file cannot be opened; returns 1 when the
// file size cannot be handled, a CUDA call fails or the output file cannot be opened.
__host__ int mainFileEncryption() {
    printf("\n");
    printf("########## AES CTR File Encryption Implementation ##########\n");
    printf("\n");

    // Inputs
    const int chunkSize = 1024; // must be a multiple of U32_SIZE
    int keyLen = AES_128_KEY_LEN_INT;
    const std::string filePath = "C://file-encryption-test//movie4.mp4";
    const std::string outFilePath = filePath + "_ENC";

    std::fstream fileIn(filePath, std::fstream::in | std::fstream::binary);
    if (!fileIn) {
        printf("File could not be opened: %s\n", filePath.c_str());
        return 0;
    }

    // Get file size (kept in 64 bits so files above 4 GiB are not silently truncated)
    fileIn.seekg(0, fileIn.end);
    const std::streamoff fileEnd = fileIn.tellg();
    fileIn.seekg(0, fileIn.beg);
    if (fileEnd < 0) {
        printf("File size could not be determined: %s\n", filePath.c_str());
        return 1;
    }
    const u64 fileSize = (u64)fileEnd;
    printf("File path : %s\n", filePath.c_str());
    printf("File size in bytes : %llu\n", fileSize);
    printf("Encrypted file path : %s\n", outFilePath.c_str());
    printf("-------------------------------\n");

    // Number of 16-byte blocks; the kernels take the count as a single u32
    const u64 totalBlocks = (fileSize + BYTE_COUNT - 1) / BYTE_COUNT;
    if (totalBlocks > 0xFFFFFFFFULL) {
        printf("File is too large: %llu blocks do not fit the 32-bit block count\n", totalBlocks);
        return 1;
    }
    const size_t ciphertextWords = (size_t)totalBlocks * U32_SIZE;
    const size_t ciphertextSize = ciphertextWords * sizeof(u32);

    // Cipher keys
    u32 rk128[AES_128_KEY_LEN_INT] = { 0x2B7E1516U, 0x28AED2A6U, 0xABF71588U, 0x09CF4F3CU };
    u32 rk192[AES_192_KEY_LEN_INT] = { 0x8e73b0f7U, 0xda0e6452U, 0xc810f32bU, 0x809079e5U, 0x62f8ead2U, 0x522c6b7bU };
    u32 rk256[AES_256_KEY_LEN_INT] = { 0x603deb10U, 0x15ca71beU, 0x2b73aef0U, 0x857d7781U,
                                       0x1f352c07U, 0x3b6108d7U, 0x2d9810a3U, 0x0914dff4U };

    // Select the key; rk points at one of the host arrays above and is never freed
    u32 *rk = nullptr;
    int keySize = 0;
    if (keyLen == AES_128_KEY_LEN_INT) {
        rk = rk128;
        keySize = AES_128_KEY_SIZE_INT;
    } else if (keyLen == AES_192_KEY_LEN_INT) {
        rk = rk192;
        keySize = AES_192_KEY_SIZE_INT;
    } else if (keyLen == AES_256_KEY_LEN_INT) {
        rk = rk256;
        keySize = AES_256_KEY_SIZE_INT;
    } else {
        printf("Unsupported key length: %d\n", keyLen);
        return 1;
    }

    // Device-visible buffers; all start as nullptr so the cleanup below is always safe
    u32 *pt = nullptr, *t0 = nullptr, *t4 = nullptr, *ct = nullptr;
    u32 *encryptionCount = nullptr, *threadCount = nullptr, *roundKeys = nullptr;
    u32 *ctH = nullptr;

    bool ok = true;
    ok = fileEncryptionCudaOk(cudaMallocManaged(&pt, 4 * sizeof(u32)), __FILE__, __LINE__) && ok;
    ok = fileEncryptionCudaOk(cudaMallocManaged(&t0, TABLE_SIZE * sizeof(u32)), __FILE__, __LINE__) && ok;
    ok = fileEncryptionCudaOk(cudaMallocManaged(&t4, TABLE_SIZE * sizeof(u32)), __FILE__, __LINE__) && ok;
    ok = fileEncryptionCudaOk(cudaMallocManaged(&threadCount, 1 * sizeof(u32)), __FILE__, __LINE__) && ok;
    ok = fileEncryptionCudaOk(cudaMallocManaged(&encryptionCount, 1 * sizeof(u32)), __FILE__, __LINE__) && ok;
    ok = fileEncryptionCudaOk(cudaMallocManaged(&roundKeys, keySize * sizeof(u32)), __FILE__, __LINE__) && ok;
    ok = fileEncryptionCudaOk(cudaMalloc((void **)&ct, ciphertextSize), __FILE__, __LINE__) && ok;

    if (ok) {
        // Initial counter block
        pt[0] = 0x3243F6A8U;
        pt[1] = 0x885A308DU;
        pt[2] = 0x313198A2U;
        pt[3] = 0x00000000U;

        // Only T0 and T4 are consumed by the file-encryption kernels
        for (int i = 0; i < TABLE_SIZE; i++) {
            t0[i] = T0[i];
            t4[i] = T4[i];
        }

        // Encryption boundary
        threadCount[0] = BLOCKS * THREADS;
        encryptionCount[0] = (u32)totalBlocks;

        printf("Blocks : %d\n", BLOCKS);
        printf("Threads : %d\n", THREADS);
        printf("Total thread count : %u\n", threadCount[0]);
        printf("Total encryptions : %u\n", encryptionCount[0]);
        printf("Total encryptions in byte : %llu\n", (unsigned long long)ciphertextSize);
        printf("Each thread encryptions : %.2f\n", encryptionCount[0] / (double)threadCount[0]);
        printf("-------------------------------\n");
        printf("Initial Counter : %08x %08x %08x %08x\n", pt[0], pt[1], pt[2], pt[3]);
        if (keyLen == AES_128_KEY_LEN_INT) {
            printf("Initial Key (%d byte) : %08x %08x %08x %08x\n", AES_128_KEY_LEN_INT * U32_SIZE, rk[0], rk[1], rk[2], rk[3]);
        } else if (keyLen == AES_192_KEY_LEN_INT) {
            printf("Initial Key (%d byte) : %08x %08x %08x %08x %08x %08x\n", AES_192_KEY_LEN_INT * U32_SIZE, rk[0], rk[1], rk[2], rk[3], rk[4], rk[5]);
        } else {
            printf("Initial Key (%d byte) : %08x %08x %08x %08x %08x %08x %08x %08x\n", AES_256_KEY_LEN_INT * U32_SIZE, rk[0], rk[1], rk[2], rk[3], rk[4], rk[5], rk[6], rk[7]);
        }
        printf("-------------------------------\n");

        // Prepare round keys
        if (keyLen == AES_128_KEY_LEN_INT) {
            keyExpansion(rk128, roundKeys);
        } else if (keyLen == AES_192_KEY_LEN_INT) {
            keyExpansion192(rk192, roundKeys);
        } else {
            keyExpansion256(rk256, roundKeys);
        }

        // Kernels: BLOCKS x THREADS threads, matching threadCount[0] above
        clock_t beginTime = clock();
        if (keyLen == AES_128_KEY_LEN_INT) {
            fileEncryption128counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<<BLOCKS, THREADS>>>(pt, ct, roundKeys, t0, t4, encryptionCount, threadCount);
        } else if (keyLen == AES_192_KEY_LEN_INT) {
            fileEncryption192counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<<BLOCKS, THREADS>>>(pt, ct, roundKeys, t0, t4, encryptionCount, threadCount);
        } else {
            fileEncryption256counterWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox<<<BLOCKS, THREADS>>>(pt, ct, roundKeys, t0, t4, encryptionCount, threadCount);
        }

        // The synchronize result reports execution faults; peeking (not clearing) the last
        // error additionally catches launch failures and leaves it for printLastCUDAError
        const bool syncOk = fileEncryptionCudaOk(cudaDeviceSynchronize(), __FILE__, __LINE__);
        const bool launchOk = cudaPeekAtLastError() == cudaSuccess;
        printf("Time elapsed (Encryption) : %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);
        printLastCUDAError();
        ok = syncOk && launchOk;
    }

    if (ok) {
        // Copy the keystream to the host
        clock_t beginTime = clock();
        ctH = new u32[ciphertextWords];
        ok = fileEncryptionCudaOk(cudaMemcpy(ctH, ct, ciphertextSize, cudaMemcpyDeviceToHost), __FILE__, __LINE__);
        printf("Time elapsed (Memcpy) : %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);
    }

    if (ok) {
        // Open output file and encrypt the input into it
        clock_t beginTime = clock();
        std::fstream fileOut(outFilePath, std::fstream::out | std::fstream::binary);
        if (!fileOut) {
            printf("Output file could not be opened: %s\n", outFilePath.c_str());
            ok = false;
        } else {
            fileEncryptionXorStream(fileIn, fileOut, ctH, chunkSize);
            printf("Time elapsed (File write) : %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);
            fileOut.close();
        }
    }

    // Free allocated arrays (cudaFree and delete[] both accept null pointers)
    delete[] ctH;
    cudaFree(threadCount);
    cudaFree(encryptionCount);
    cudaFree(ct);
    cudaFree(pt);
    cudaFree(roundKeys);
    cudaFree(t0);
    cudaFree(t4);

    fileIn.close();
    return ok ? 0 : 1;
}
--------------------------------------------------------------------------------
/AES_final.h:
--------------------------------------------------------------------------------
// Fixed-width unsigned integer aliases used by the AES code
typedef unsigned char      u8;
typedef unsigned short     u16;
typedef unsigned int       u32;
typedef unsigned long long u64;

// Launch configuration and exhaustive-search range (2^TWO_POWER_RANGE)
#define BLOCKS                  1024
#define THREADS                 1024
#define TWO_POWER_RANGE         35

// Table geometry and general sizes
#define SHARED_MEM_BANK_SIZE    32
#define S_BOX_BANK_SIZE         8
#define TABLE_SIZE              256
#define RCON_SIZE               15
#define U32_SIZE                4
#define MAX_U32                 4294967295
#define MAX_U16                 0x0000FFFF
#define BYTE_COUNT              16 // 128 / 8
#define PARTLY_DIVIDE_THRESHOLD 110

// Cipher key length in 32-bit words
#define AES_128_KEY_LEN_INT     4
#define AES_192_KEY_LEN_INT     6
#define AES_256_KEY_LEN_INT     8

// Expanded key length in 32-bit words: 4 * (rounds + 1)
#define AES_128_KEY_SIZE_INT    44
#define AES_192_KEY_SIZE_INT    52
#define AES_256_KEY_SIZE_INT    60

// Round counts per key size; the *_MIN_1 values are the round count minus one
#define ROUND_COUNT             10
#define ROUND_COUNT_MIN_1       9
#define ROUND_COUNT_192         12
#define ROUND_COUNT_MIN_1_192   11
#define ROUND_COUNT_256         14
#define ROUND_COUNT_MIN_1_256   13

// __byte_perm Constants
// u32 t = __byte_perm(x, y, selector);
#define SHIFT_1_RIGHT           0x00004321 // 17185 i.e. ( >> 8 )
#define SHIFT_2_RIGHT           0x00005432 // 21554 i.e. ( >> 16 )
#define SHIFT_3_RIGHT           0x00006543 // 25923 i.e. ( >> 24 )
40 |
// Wraps a CUDA runtime call and reports a failure together with the call site.
#define gpuErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints a diagnostic to stderr when `code` is not cudaSuccess.
// The `abort` flag is currently ignored: the exit call is disabled, so execution
// always continues after the message.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess) {
        return;
    }
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    //if (abort) exit(code);
}
49 |
// Prints the last error recorded by the CUDA runtime, if any. cudaGetLastError() also
// resets the recorded error, so a second call right after this one reports nothing.
// Declared inline because the definition lives in a header; without it, including the
// header from more than one translation unit produces duplicate definitions.
inline void printLastCUDAError(){
    const cudaError_t cudaError = cudaGetLastError();
    if (cudaError != cudaSuccess) {
        printf("-----\n");
        printf("ERROR: cudaGetLastError() returned %d: %s\n", (int)cudaError, cudaGetErrorString(cudaError));
        printf("-----\n");
    }
}
58 |
// Allocates one managed u64 and stores in it the number of keys each thread has to
// cover: ceil(2^TWO_POWER_RANGE / (BLOCKS * THREADS)).
// Returns the managed pointer (release it with cudaFree), or NULL when the allocation
// failed; the failure is reported through gpuErrorCheck.
// Declared inline because the definition lives in a header.
inline __host__ u64* calculateRange() {
    u64* range = NULL;
    const cudaError_t status = cudaMallocManaged(&range, 1 * sizeof(u64));
    gpuErrorCheck(status);
    if (status != cudaSuccess) {
        // gpuErrorCheck only reports; do not write through an invalid pointer
        return NULL;
    }
    int threadCount = BLOCKS * THREADS;
    double keyRange = pow(2, TWO_POWER_RANGE);
    double threadRange = keyRange / threadCount;
    *range = ceil(threadRange);

    /* printf("Blocks : %d\n", BLOCKS);
    printf("Threads : %d\n", THREADS);
    printf("Total Thread count : %d\n", threadCount);
    printf("Key Range (power) : %d\n", TWO_POWER_RANGE);
    printf("Key Range (decimal) : %.0f\n", keyRange);
    printf("Each Thread Key Range : %.2f\n", threadRange);
    printf("Each Thread Key Range (kernel): %d\n", range[0]);
    printf("Total encryptions : %.0f\n", ceil(threadRange) * threadCount);
    printf("-------------------------------\n");*/

    return range;
}
79 |
// Despite the name this is a 32-bit rotate right: the n bits shifted out at the bottom
// re-enter at the top. n is expected to be in [0, 31].
__device__ u32 arithmeticRightShift(u32 x, u32 n) {
    const u32 shiftedDown = x >> n;
    // (32 - n) & 31 equals (-n & 31) and keeps the left shift at 0 when n is 0
    const u32 wrappedAround = x << ((32u - n) & 31u);
    return shiftedDown | wrappedAround;
}
// Rotate-style right shift within a 16-bit field: x is shifted right by n and the bits
// of x selected by the mask n2Power are moved up by (16 - n) mod 16 positions.
// NOTE(review): n2Power presumably masks the low n bits of x - confirm against callers.
__device__ u32 arithmetic16bitRightShift(u32 x, u32 n, u32 n2Power) {
    const u32 shiftedDown = x >> n;
    // (16 - n) & 15 equals (-n & 15)
    const u32 wrappedAround = (x & n2Power) << ((16u - n) & 15u);
    return shiftedDown | wrappedAround;
}
// Byte-granular rotation of x done with a single __byte_perm: both source operands are
// x and n is the byte selector. The SHIFT_1_RIGHT / SHIFT_2_RIGHT / SHIFT_3_RIGHT
// selectors rotate x right by 8 / 16 / 24 bits.
__device__ u32 arithmeticRightShiftBytePerm(u32 x, u32 n) {
    const u32 permuted = __byte_perm(x, x, n);
    return permuted;
}
83 |
// ROTATE LEFT
// ROTL64 rotates a 64-bit value left by n; n must be in [1, 63] because the right shift
// by (64 - n) is undefined for n == 0.
#define ROTL64(x,n) (((x)<<(n))|((x)>>(64-(n))))
// ROTL16 rotates the low 16 bits of x left by n and masks the result to 16 bits. x must
// not carry bits above bit 15. The whole expansion is parenthesised so the macro can be
// used inside larger expressions (comparisons, additions, shifts) without the trailing
// mask binding to the neighbouring operand.
#define ROTL16(x,n) ((((x)<<(n))|((x)>>(16-(n))))&0xffff)
// AES forward S-box (SubBytes), indexed by the input byte; one row per high nibble.
u8 SAES[256] = {
    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
88 |
// AES encryption T-table 0, indexed by an input byte.
// For S = SAES[i], each word is laid out from most- to least-significant byte as
// (2*S, S, S, 3*S), with the products taken in the AES field GF(2^8)
// (e.g. i = 0: S = 0x63 -> 0xc6, 0x63, 0x63, 0xa5).
// 256 initializers are listed; TABLE_SIZE is defined outside this section.
89 | u32 T0[TABLE_SIZE] = {
90 | 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
91 | 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
92 | 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
93 | 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
94 | 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
95 | 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
96 | 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
97 | 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
98 | 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
99 | 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
100 | 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
101 | 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
102 | 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
103 | 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
104 | 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
105 | 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
106 | 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
107 | 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
108 | 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
109 | 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
110 | 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
111 | 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
112 | 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
113 | 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
114 | 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
115 | 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
116 | 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
117 | 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
118 | 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
119 | 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
120 | 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
121 | 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
122 | 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
123 | 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
124 | 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
125 | 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
126 | 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
127 | 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
128 | 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
129 | 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
130 | 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
131 | 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
132 | 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
133 | 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
134 | 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
135 | 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
136 | 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
137 | 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
138 | 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
139 | 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
140 | 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
141 | 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
142 | 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
143 | 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
144 | 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
145 | 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
146 | 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
147 | 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
148 | 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
149 | 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
150 | 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
151 | 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
152 | 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
153 | 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
154 | };
// AES encryption T-table 1, indexed by an input byte.
// Each word equals the T0 word for the same index rotated right by 8 bits,
// i.e. byte layout (3*S, 2*S, S, S) from most- to least-significant byte
// (e.g. i = 0: 0xc66363a5 -> 0xa5c66363).
155 | u32 T1[TABLE_SIZE] = {
156 | 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
157 | 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
158 | 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
159 | 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
160 | 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
161 | 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
162 | 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
163 | 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
164 | 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
165 | 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
166 | 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
167 | 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
168 | 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
169 | 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
170 | 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
171 | 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
172 | 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
173 | 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
174 | 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
175 | 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
176 | 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
177 | 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
178 | 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
179 | 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
180 | 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
181 | 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
182 | 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
183 | 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
184 | 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
185 | 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
186 | 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
187 | 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
188 | 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
189 | 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
190 | 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
191 | 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
192 | 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
193 | 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
194 | 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
195 | 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
196 | 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
197 | 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
198 | 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
199 | 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
200 | 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
201 | 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
202 | 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
203 | 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
204 | 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
205 | 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
206 | 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
207 | 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
208 | 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
209 | 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
210 | 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
211 | 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
212 | 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
213 | 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
214 | 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
215 | 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
216 | 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
217 | 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
218 | 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
219 | 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
220 | };
// AES encryption T-table 2, indexed by an input byte.
// Each word equals the T0 word for the same index rotated right by 16 bits,
// i.e. byte layout (S, 3*S, 2*S, S) from most- to least-significant byte
// (e.g. i = 0: 0xc66363a5 -> 0x63a5c663).
221 | u32 T2[TABLE_SIZE] = {
222 | 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
223 | 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
224 | 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
225 | 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
226 | 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
227 | 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
228 | 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
229 | 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
230 | 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
231 | 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
232 | 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
233 | 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
234 | 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
235 | 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
236 | 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
237 | 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
238 | 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
239 | 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
240 | 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
241 | 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
242 | 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
243 | 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
244 | 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
245 | 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
246 | 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
247 | 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
248 | 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
249 | 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
250 | 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
251 | 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
252 | 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
253 | 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
254 | 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
255 | 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
256 | 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
257 | 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
258 | 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
259 | 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
260 | 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
261 | 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
262 | 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
263 | 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
264 | 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
265 | 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
266 | 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
267 | 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
268 | 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
269 | 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
270 | 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
271 | 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
272 | 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
273 | 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
274 | 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
275 | 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
276 | 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
277 | 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
278 | 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
279 | 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
280 | 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
281 | 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
282 | 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
283 | 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
284 | 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
285 | 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
286 | };
// AES encryption T-table 3, indexed by an input byte.
// Each word equals the T0 word for the same index rotated right by 24 bits,
// i.e. byte layout (S, S, 3*S, 2*S) from most- to least-significant byte
// (e.g. i = 0: 0xc66363a5 -> 0x6363a5c6).
287 | u32 T3[TABLE_SIZE] = {
288 | 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
289 | 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
290 | 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
291 | 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
292 | 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
293 | 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
294 | 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
295 | 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
296 | 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
297 | 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
298 | 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
299 | 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
300 | 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
301 | 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
302 | 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
303 | 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
304 | 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
305 | 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
306 | 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
307 | 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
308 | 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
309 | 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
310 | 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
311 | 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
312 | 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
313 | 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
314 | 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
315 | 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
316 | 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
317 | 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
318 | 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
319 | 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
320 | 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
321 | 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
322 | 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
323 | 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
324 | 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
325 | 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
326 | 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
327 | 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
328 | 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
329 | 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
330 | 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
331 | 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
332 | 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
333 | 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
334 | 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
335 | 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
336 | 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
337 | 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
338 | 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
339 | 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
340 | 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
341 | 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
342 | 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
343 | 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
344 | 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
345 | 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
346 | 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
347 | 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
348 | 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
349 | 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
350 | 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
351 | 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
352 | };
// S-box-only table, indexed by an input byte.
// Each word holds the S-box byte for that index replicated in all four byte
// positions (e.g. i = 0: 0x63636363), so no MixColumns multiples are included.
// NOTE(review): presumably used for the final AES round, where callers mask out
// the byte they need - confirm against the kernels that read this table.
353 | u32 T4[TABLE_SIZE] = {
354 | 0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU,
355 | 0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U,
356 | 0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU,
357 | 0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U,
358 | 0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU,
359 | 0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U,
360 | 0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU,
361 | 0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U,
362 | 0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U,
363 | 0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU,
364 | 0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U,
365 | 0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U,
366 | 0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U,
367 | 0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU,
368 | 0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U,
369 | 0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U,
370 | 0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU,
371 | 0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U,
372 | 0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U,
373 | 0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U,
374 | 0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU,
375 | 0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU,
376 | 0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U,
377 | 0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU,
378 | 0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU,
379 | 0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U,
380 | 0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU,
381 | 0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U,
382 | 0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU,
383 | 0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U,
384 | 0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U,
385 | 0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U,
386 | 0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU,
387 | 0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U,
388 | 0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU,
389 | 0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U,
390 | 0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU,
391 | 0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U,
392 | 0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U,
393 | 0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU,
394 | 0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU,
395 | 0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU,
396 | 0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U,
397 | 0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U,
398 | 0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU,
399 | 0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U,
400 | 0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU,
401 | 0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U,
402 | 0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU,
403 | 0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U,
404 | 0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU,
405 | 0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU,
406 | 0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U,
407 | 0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU,
408 | 0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U,
409 | 0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU,
410 | 0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U,
411 | 0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U,
412 | 0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U,
413 | 0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU,
414 | 0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU,
415 | 0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U,
416 | 0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU,
417 | 0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U,
418 | };
// S-box table with the substituted byte placed in bits 0-7 of each word and the
// other three bytes zero (e.g. i = 0: 0x00000063). Indexed by an input byte.
419 | u32 T4_0[TABLE_SIZE] = {
420 | 0x00000063U, 0x0000007cU, 0x00000077U, 0x0000007bU,
421 | 0x000000f2U, 0x0000006bU, 0x0000006fU, 0x000000c5U,
422 | 0x00000030U, 0x00000001U, 0x00000067U, 0x0000002bU,
423 | 0x000000feU, 0x000000d7U, 0x000000abU, 0x00000076U,
424 | 0x000000caU, 0x00000082U, 0x000000c9U, 0x0000007dU,
425 | 0x000000faU, 0x00000059U, 0x00000047U, 0x000000f0U,
426 | 0x000000adU, 0x000000d4U, 0x000000a2U, 0x000000afU,
427 | 0x0000009cU, 0x000000a4U, 0x00000072U, 0x000000c0U,
428 | 0x000000b7U, 0x000000fdU, 0x00000093U, 0x00000026U,
429 | 0x00000036U, 0x0000003fU, 0x000000f7U, 0x000000ccU,
430 | 0x00000034U, 0x000000a5U, 0x000000e5U, 0x000000f1U,
431 | 0x00000071U, 0x000000d8U, 0x00000031U, 0x00000015U,
432 | 0x00000004U, 0x000000c7U, 0x00000023U, 0x000000c3U,
433 | 0x00000018U, 0x00000096U, 0x00000005U, 0x0000009aU,
434 | 0x00000007U, 0x00000012U, 0x00000080U, 0x000000e2U,
435 | 0x000000ebU, 0x00000027U, 0x000000b2U, 0x00000075U,
436 | 0x00000009U, 0x00000083U, 0x0000002cU, 0x0000001aU,
437 | 0x0000001bU, 0x0000006eU, 0x0000005aU, 0x000000a0U,
438 | 0x00000052U, 0x0000003bU, 0x000000d6U, 0x000000b3U,
439 | 0x00000029U, 0x000000e3U, 0x0000002fU, 0x00000084U,
440 | 0x00000053U, 0x000000d1U, 0x00000000U, 0x000000edU,
441 | 0x00000020U, 0x000000fcU, 0x000000b1U, 0x0000005bU,
442 | 0x0000006aU, 0x000000cbU, 0x000000beU, 0x00000039U,
443 | 0x0000004aU, 0x0000004cU, 0x00000058U, 0x000000cfU,
444 | 0x000000d0U, 0x000000efU, 0x000000aaU, 0x000000fbU,
445 | 0x00000043U, 0x0000004dU, 0x00000033U, 0x00000085U,
446 | 0x00000045U, 0x000000f9U, 0x00000002U, 0x0000007fU,
447 | 0x00000050U, 0x0000003cU, 0x0000009fU, 0x000000a8U,
448 | 0x00000051U, 0x000000a3U, 0x00000040U, 0x0000008fU,
449 | 0x00000092U, 0x0000009dU, 0x00000038U, 0x000000f5U,
450 | 0x000000bcU, 0x000000b6U, 0x000000daU, 0x00000021U,
451 | 0x00000010U, 0x000000ffU, 0x000000f3U, 0x000000d2U,
452 | 0x000000cdU, 0x0000000cU, 0x00000013U, 0x000000ecU,
453 | 0x0000005fU, 0x00000097U, 0x00000044U, 0x00000017U,
454 | 0x000000c4U, 0x000000a7U, 0x0000007eU, 0x0000003dU,
455 | 0x00000064U, 0x0000005dU, 0x00000019U, 0x00000073U,
456 | 0x00000060U, 0x00000081U, 0x0000004fU, 0x000000dcU,
457 | 0x00000022U, 0x0000002aU, 0x00000090U, 0x00000088U,
458 | 0x00000046U, 0x000000eeU, 0x000000b8U, 0x00000014U,
459 | 0x000000deU, 0x0000005eU, 0x0000000bU, 0x000000dbU,
460 | 0x000000e0U, 0x00000032U, 0x0000003aU, 0x0000000aU,
461 | 0x00000049U, 0x00000006U, 0x00000024U, 0x0000005cU,
462 | 0x000000c2U, 0x000000d3U, 0x000000acU, 0x00000062U,
463 | 0x00000091U, 0x00000095U, 0x000000e4U, 0x00000079U,
464 | 0x000000e7U, 0x000000c8U, 0x00000037U, 0x0000006dU,
465 | 0x0000008dU, 0x000000d5U, 0x0000004eU, 0x000000a9U,
466 | 0x0000006cU, 0x00000056U, 0x000000f4U, 0x000000eaU,
467 | 0x00000065U, 0x0000007aU, 0x000000aeU, 0x00000008U,
468 | 0x000000baU, 0x00000078U, 0x00000025U, 0x0000002eU,
469 | 0x0000001cU, 0x000000a6U, 0x000000b4U, 0x000000c6U,
470 | 0x000000e8U, 0x000000ddU, 0x00000074U, 0x0000001fU,
471 | 0x0000004bU, 0x000000bdU, 0x0000008bU, 0x0000008aU,
472 | 0x00000070U, 0x0000003eU, 0x000000b5U, 0x00000066U,
473 | 0x00000048U, 0x00000003U, 0x000000f6U, 0x0000000eU,
474 | 0x00000061U, 0x00000035U, 0x00000057U, 0x000000b9U,
475 | 0x00000086U, 0x000000c1U, 0x0000001dU, 0x0000009eU,
476 | 0x000000e1U, 0x000000f8U, 0x00000098U, 0x00000011U,
477 | 0x00000069U, 0x000000d9U, 0x0000008eU, 0x00000094U,
478 | 0x0000009bU, 0x0000001eU, 0x00000087U, 0x000000e9U,
479 | 0x000000ceU, 0x00000055U, 0x00000028U, 0x000000dfU,
480 | 0x0000008cU, 0x000000a1U, 0x00000089U, 0x0000000dU,
481 | 0x000000bfU, 0x000000e6U, 0x00000042U, 0x00000068U,
482 | 0x00000041U, 0x00000099U, 0x0000002dU, 0x0000000fU,
483 | 0x000000b0U, 0x00000054U, 0x000000bbU, 0x00000016U,
484 | };
// S-box table with the substituted byte placed in bits 8-15 of each word and the
// other three bytes zero (e.g. i = 0: 0x00006300). Indexed by an input byte.
485 | u32 T4_1[TABLE_SIZE] = {
486 | 0x00006300U, 0x00007c00U, 0x00007700U, 0x00007b00U,
487 | 0x0000f200U, 0x00006b00U, 0x00006f00U, 0x0000c500U,
488 | 0x00003000U, 0x00000100U, 0x00006700U, 0x00002b00U,
489 | 0x0000fe00U, 0x0000d700U, 0x0000ab00U, 0x00007600U,
490 | 0x0000ca00U, 0x00008200U, 0x0000c900U, 0x00007d00U,
491 | 0x0000fa00U, 0x00005900U, 0x00004700U, 0x0000f000U,
492 | 0x0000ad00U, 0x0000d400U, 0x0000a200U, 0x0000af00U,
493 | 0x00009c00U, 0x0000a400U, 0x00007200U, 0x0000c000U,
494 | 0x0000b700U, 0x0000fd00U, 0x00009300U, 0x00002600U,
495 | 0x00003600U, 0x00003f00U, 0x0000f700U, 0x0000cc00U,
496 | 0x00003400U, 0x0000a500U, 0x0000e500U, 0x0000f100U,
497 | 0x00007100U, 0x0000d800U, 0x00003100U, 0x00001500U,
498 | 0x00000400U, 0x0000c700U, 0x00002300U, 0x0000c300U,
499 | 0x00001800U, 0x00009600U, 0x00000500U, 0x00009a00U,
500 | 0x00000700U, 0x00001200U, 0x00008000U, 0x0000e200U,
501 | 0x0000eb00U, 0x00002700U, 0x0000b200U, 0x00007500U,
502 | 0x00000900U, 0x00008300U, 0x00002c00U, 0x00001a00U,
503 | 0x00001b00U, 0x00006e00U, 0x00005a00U, 0x0000a000U,
504 | 0x00005200U, 0x00003b00U, 0x0000d600U, 0x0000b300U,
505 | 0x00002900U, 0x0000e300U, 0x00002f00U, 0x00008400U,
506 | 0x00005300U, 0x0000d100U, 0x00000000U, 0x0000ed00U,
507 | 0x00002000U, 0x0000fc00U, 0x0000b100U, 0x00005b00U,
508 | 0x00006a00U, 0x0000cb00U, 0x0000be00U, 0x00003900U,
509 | 0x00004a00U, 0x00004c00U, 0x00005800U, 0x0000cf00U,
510 | 0x0000d000U, 0x0000ef00U, 0x0000aa00U, 0x0000fb00U,
511 | 0x00004300U, 0x00004d00U, 0x00003300U, 0x00008500U,
512 | 0x00004500U, 0x0000f900U, 0x00000200U, 0x00007f00U,
513 | 0x00005000U, 0x00003c00U, 0x00009f00U, 0x0000a800U,
514 | 0x00005100U, 0x0000a300U, 0x00004000U, 0x00008f00U,
515 | 0x00009200U, 0x00009d00U, 0x00003800U, 0x0000f500U,
516 | 0x0000bc00U, 0x0000b600U, 0x0000da00U, 0x00002100U,
517 | 0x00001000U, 0x0000ff00U, 0x0000f300U, 0x0000d200U,
518 | 0x0000cd00U, 0x00000c00U, 0x00001300U, 0x0000ec00U,
519 | 0x00005f00U, 0x00009700U, 0x00004400U, 0x00001700U,
520 | 0x0000c400U, 0x0000a700U, 0x00007e00U, 0x00003d00U,
521 | 0x00006400U, 0x00005d00U, 0x00001900U, 0x00007300U,
522 | 0x00006000U, 0x00008100U, 0x00004f00U, 0x0000dc00U,
523 | 0x00002200U, 0x00002a00U, 0x00009000U, 0x00008800U,
524 | 0x00004600U, 0x0000ee00U, 0x0000b800U, 0x00001400U,
525 | 0x0000de00U, 0x00005e00U, 0x00000b00U, 0x0000db00U,
526 | 0x0000e000U, 0x00003200U, 0x00003a00U, 0x00000a00U,
527 | 0x00004900U, 0x00000600U, 0x00002400U, 0x00005c00U,
528 | 0x0000c200U, 0x0000d300U, 0x0000ac00U, 0x00006200U,
529 | 0x00009100U, 0x00009500U, 0x0000e400U, 0x00007900U,
530 | 0x0000e700U, 0x0000c800U, 0x00003700U, 0x00006d00U,
531 | 0x00008d00U, 0x0000d500U, 0x00004e00U, 0x0000a900U,
532 | 0x00006c00U, 0x00005600U, 0x0000f400U, 0x0000ea00U,
533 | 0x00006500U, 0x00007a00U, 0x0000ae00U, 0x00000800U,
534 | 0x0000ba00U, 0x00007800U, 0x00002500U, 0x00002e00U,
535 | 0x00001c00U, 0x0000a600U, 0x0000b400U, 0x0000c600U,
536 | 0x0000e800U, 0x0000dd00U, 0x00007400U, 0x00001f00U,
537 | 0x00004b00U, 0x0000bd00U, 0x00008b00U, 0x00008a00U,
538 | 0x00007000U, 0x00003e00U, 0x0000b500U, 0x00006600U,
539 | 0x00004800U, 0x00000300U, 0x0000f600U, 0x00000e00U,
540 | 0x00006100U, 0x00003500U, 0x00005700U, 0x0000b900U,
541 | 0x00008600U, 0x0000c100U, 0x00001d00U, 0x00009e00U,
542 | 0x0000e100U, 0x0000f800U, 0x00009800U, 0x00001100U,
543 | 0x00006900U, 0x0000d900U, 0x00008e00U, 0x00009400U,
544 | 0x00009b00U, 0x00001e00U, 0x00008700U, 0x0000e900U,
545 | 0x0000ce00U, 0x00005500U, 0x00002800U, 0x0000df00U,
546 | 0x00008c00U, 0x0000a100U, 0x00008900U, 0x00000d00U,
547 | 0x0000bf00U, 0x0000e600U, 0x00004200U, 0x00006800U,
548 | 0x00004100U, 0x00009900U, 0x00002d00U, 0x00000f00U,
549 | 0x0000b000U, 0x00005400U, 0x0000bb00U, 0x00001600U,
550 | };
// S-box table with the substituted byte placed in bits 16-23 of each word and the
// other three bytes zero (e.g. i = 0: 0x00630000). Indexed by an input byte.
551 | u32 T4_2[TABLE_SIZE] = {
552 | 0x00630000U, 0x007c0000U, 0x00770000U, 0x007b0000U,
553 | 0x00f20000U, 0x006b0000U, 0x006f0000U, 0x00c50000U,
554 | 0x00300000U, 0x00010000U, 0x00670000U, 0x002b0000U,
555 | 0x00fe0000U, 0x00d70000U, 0x00ab0000U, 0x00760000U,
556 | 0x00ca0000U, 0x00820000U, 0x00c90000U, 0x007d0000U,
557 | 0x00fa0000U, 0x00590000U, 0x00470000U, 0x00f00000U,
558 | 0x00ad0000U, 0x00d40000U, 0x00a20000U, 0x00af0000U,
559 | 0x009c0000U, 0x00a40000U, 0x00720000U, 0x00c00000U,
560 | 0x00b70000U, 0x00fd0000U, 0x00930000U, 0x00260000U,
561 | 0x00360000U, 0x003f0000U, 0x00f70000U, 0x00cc0000U,
562 | 0x00340000U, 0x00a50000U, 0x00e50000U, 0x00f10000U,
563 | 0x00710000U, 0x00d80000U, 0x00310000U, 0x00150000U,
564 | 0x00040000U, 0x00c70000U, 0x00230000U, 0x00c30000U,
565 | 0x00180000U, 0x00960000U, 0x00050000U, 0x009a0000U,
566 | 0x00070000U, 0x00120000U, 0x00800000U, 0x00e20000U,
567 | 0x00eb0000U, 0x00270000U, 0x00b20000U, 0x00750000U,
568 | 0x00090000U, 0x00830000U, 0x002c0000U, 0x001a0000U,
569 | 0x001b0000U, 0x006e0000U, 0x005a0000U, 0x00a00000U,
570 | 0x00520000U, 0x003b0000U, 0x00d60000U, 0x00b30000U,
571 | 0x00290000U, 0x00e30000U, 0x002f0000U, 0x00840000U,
572 | 0x00530000U, 0x00d10000U, 0x00000000U, 0x00ed0000U,
573 | 0x00200000U, 0x00fc0000U, 0x00b10000U, 0x005b0000U,
574 | 0x006a0000U, 0x00cb0000U, 0x00be0000U, 0x00390000U,
575 | 0x004a0000U, 0x004c0000U, 0x00580000U, 0x00cf0000U,
576 | 0x00d00000U, 0x00ef0000U, 0x00aa0000U, 0x00fb0000U,
577 | 0x00430000U, 0x004d0000U, 0x00330000U, 0x00850000U,
578 | 0x00450000U, 0x00f90000U, 0x00020000U, 0x007f0000U,
579 | 0x00500000U, 0x003c0000U, 0x009f0000U, 0x00a80000U,
580 | 0x00510000U, 0x00a30000U, 0x00400000U, 0x008f0000U,
581 | 0x00920000U, 0x009d0000U, 0x00380000U, 0x00f50000U,
582 | 0x00bc0000U, 0x00b60000U, 0x00da0000U, 0x00210000U,
583 | 0x00100000U, 0x00ff0000U, 0x00f30000U, 0x00d20000U,
584 | 0x00cd0000U, 0x000c0000U, 0x00130000U, 0x00ec0000U,
585 | 0x005f0000U, 0x00970000U, 0x00440000U, 0x00170000U,
586 | 0x00c40000U, 0x00a70000U, 0x007e0000U, 0x003d0000U,
587 | 0x00640000U, 0x005d0000U, 0x00190000U, 0x00730000U,
588 | 0x00600000U, 0x00810000U, 0x004f0000U, 0x00dc0000U,
589 | 0x00220000U, 0x002a0000U, 0x00900000U, 0x00880000U,
590 | 0x00460000U, 0x00ee0000U, 0x00b80000U, 0x00140000U,
591 | 0x00de0000U, 0x005e0000U, 0x000b0000U, 0x00db0000U,
592 | 0x00e00000U, 0x00320000U, 0x003a0000U, 0x000a0000U,
593 | 0x00490000U, 0x00060000U, 0x00240000U, 0x005c0000U,
594 | 0x00c20000U, 0x00d30000U, 0x00ac0000U, 0x00620000U,
595 | 0x00910000U, 0x00950000U, 0x00e40000U, 0x00790000U,
596 | 0x00e70000U, 0x00c80000U, 0x00370000U, 0x006d0000U,
597 | 0x008d0000U, 0x00d50000U, 0x004e0000U, 0x00a90000U,
598 | 0x006c0000U, 0x00560000U, 0x00f40000U, 0x00ea0000U,
599 | 0x00650000U, 0x007a0000U, 0x00ae0000U, 0x00080000U,
600 | 0x00ba0000U, 0x00780000U, 0x00250000U, 0x002e0000U,
601 | 0x001c0000U, 0x00a60000U, 0x00b40000U, 0x00c60000U,
602 | 0x00e80000U, 0x00dd0000U, 0x00740000U, 0x001f0000U,
603 | 0x004b0000U, 0x00bd0000U, 0x008b0000U, 0x008a0000U,
604 | 0x00700000U, 0x003e0000U, 0x00b50000U, 0x00660000U,
605 | 0x00480000U, 0x00030000U, 0x00f60000U, 0x000e0000U,
606 | 0x00610000U, 0x00350000U, 0x00570000U, 0x00b90000U,
607 | 0x00860000U, 0x00c10000U, 0x001d0000U, 0x009e0000U,
608 | 0x00e10000U, 0x00f80000U, 0x00980000U, 0x00110000U,
609 | 0x00690000U, 0x00d90000U, 0x008e0000U, 0x00940000U,
610 | 0x009b0000U, 0x001e0000U, 0x00870000U, 0x00e90000U,
611 | 0x00ce0000U, 0x00550000U, 0x00280000U, 0x00df0000U,
612 | 0x008c0000U, 0x00a10000U, 0x00890000U, 0x000d0000U,
613 | 0x00bf0000U, 0x00e60000U, 0x00420000U, 0x00680000U,
614 | 0x00410000U, 0x00990000U, 0x002d0000U, 0x000f0000U,
615 | 0x00b00000U, 0x00540000U, 0x00bb0000U, 0x00160000U,
616 | };
// S-box table with the substituted byte placed in bits 24-31 of each word and the
// other three bytes zero (e.g. i = 0: 0x63000000). Indexed by an input byte.
617 | u32 T4_3[TABLE_SIZE] = {
618 | 0x63000000U, 0x7c000000U, 0x77000000U, 0x7b000000U,
619 | 0xf2000000U, 0x6b000000U, 0x6f000000U, 0xc5000000U,
620 | 0x30000000U, 0x01000000U, 0x67000000U, 0x2b000000U,
621 | 0xfe000000U, 0xd7000000U, 0xab000000U, 0x76000000U,
622 | 0xca000000U, 0x82000000U, 0xc9000000U, 0x7d000000U,
623 | 0xfa000000U, 0x59000000U, 0x47000000U, 0xf0000000U,
624 | 0xad000000U, 0xd4000000U, 0xa2000000U, 0xaf000000U,
625 | 0x9c000000U, 0xa4000000U, 0x72000000U, 0xc0000000U,
626 | 0xb7000000U, 0xfd000000U, 0x93000000U, 0x26000000U,
627 | 0x36000000U, 0x3f000000U, 0xf7000000U, 0xcc000000U,
628 | 0x34000000U, 0xa5000000U, 0xe5000000U, 0xf1000000U,
629 | 0x71000000U, 0xd8000000U, 0x31000000U, 0x15000000U,
630 | 0x04000000U, 0xc7000000U, 0x23000000U, 0xc3000000U,
631 | 0x18000000U, 0x96000000U, 0x05000000U, 0x9a000000U,
632 | 0x07000000U, 0x12000000U, 0x80000000U, 0xe2000000U,
633 | 0xeb000000U, 0x27000000U, 0xb2000000U, 0x75000000U,
634 | 0x09000000U, 0x83000000U, 0x2c000000U, 0x1a000000U,
635 | 0x1b000000U, 0x6e000000U, 0x5a000000U, 0xa0000000U,
636 | 0x52000000U, 0x3b000000U, 0xd6000000U, 0xb3000000U,
637 | 0x29000000U, 0xe3000000U, 0x2f000000U, 0x84000000U,
638 | 0x53000000U, 0xd1000000U, 0x00000000U, 0xed000000U,
639 | 0x20000000U, 0xfc000000U, 0xb1000000U, 0x5b000000U,
640 | 0x6a000000U, 0xcb000000U, 0xbe000000U, 0x39000000U,
641 | 0x4a000000U, 0x4c000000U, 0x58000000U, 0xcf000000U,
642 | 0xd0000000U, 0xef000000U, 0xaa000000U, 0xfb000000U,
643 | 0x43000000U, 0x4d000000U, 0x33000000U, 0x85000000U,
644 | 0x45000000U, 0xf9000000U, 0x02000000U, 0x7f000000U,
645 | 0x50000000U, 0x3c000000U, 0x9f000000U, 0xa8000000U,
646 | 0x51000000U, 0xa3000000U, 0x40000000U, 0x8f000000U,
647 | 0x92000000U, 0x9d000000U, 0x38000000U, 0xf5000000U,
648 | 0xbc000000U, 0xb6000000U, 0xda000000U, 0x21000000U,
649 | 0x10000000U, 0xff000000U, 0xf3000000U, 0xd2000000U,
650 | 0xcd000000U, 0x0c000000U, 0x13000000U, 0xec000000U,
651 | 0x5f000000U, 0x97000000U, 0x44000000U, 0x17000000U,
652 | 0xc4000000U, 0xa7000000U, 0x7e000000U, 0x3d000000U,
653 | 0x64000000U, 0x5d000000U, 0x19000000U, 0x73000000U,
654 | 0x60000000U, 0x81000000U, 0x4f000000U, 0xdc000000U,
655 | 0x22000000U, 0x2a000000U, 0x90000000U, 0x88000000U,
656 | 0x46000000U, 0xee000000U, 0xb8000000U, 0x14000000U,
657 | 0xde000000U, 0x5e000000U, 0x0b000000U, 0xdb000000U,
658 | 0xe0000000U, 0x32000000U, 0x3a000000U, 0x0a000000U,
659 | 0x49000000U, 0x06000000U, 0x24000000U, 0x5c000000U,
660 | 0xc2000000U, 0xd3000000U, 0xac000000U, 0x62000000U,
661 | 0x91000000U, 0x95000000U, 0xe4000000U, 0x79000000U,
662 | 0xe7000000U, 0xc8000000U, 0x37000000U, 0x6d000000U,
663 | 0x8d000000U, 0xd5000000U, 0x4e000000U, 0xa9000000U,
664 | 0x6c000000U, 0x56000000U, 0xf4000000U, 0xea000000U,
665 | 0x65000000U, 0x7a000000U, 0xae000000U, 0x08000000U,
666 | 0xba000000U, 0x78000000U, 0x25000000U, 0x2e000000U,
667 | 0x1c000000U, 0xa6000000U, 0xb4000000U, 0xc6000000U,
668 | 0xe8000000U, 0xdd000000U, 0x74000000U, 0x1f000000U,
669 | 0x4b000000U, 0xbd000000U, 0x8b000000U, 0x8a000000U,
670 | 0x70000000U, 0x3e000000U, 0xb5000000U, 0x66000000U,
671 | 0x48000000U, 0x03000000U, 0xf6000000U, 0x0e000000U,
672 | 0x61000000U, 0x35000000U, 0x57000000U, 0xb9000000U,
673 | 0x86000000U, 0xc1000000U, 0x1d000000U, 0x9e000000U,
674 | 0xe1000000U, 0xf8000000U, 0x98000000U, 0x11000000U,
675 | 0x69000000U, 0xd9000000U, 0x8e000000U, 0x94000000U,
676 | 0x9b000000U, 0x1e000000U, 0x87000000U, 0xe9000000U,
677 | 0xce000000U, 0x55000000U, 0x28000000U, 0xdf000000U,
678 | 0x8c000000U, 0xa1000000U, 0x89000000U, 0x0d000000U,
679 | 0xbf000000U, 0xe6000000U, 0x42000000U, 0x68000000U,
680 | 0x41000000U, 0x99000000U, 0x2d000000U, 0x0f000000U,
681 | 0xb0000000U, 0x54000000U, 0xbb000000U, 0x16000000U,
682 | };
// Round constants for the AES key schedule. Each entry is a 32-bit word whose
// most significant byte carries the constant; the lower three bytes are zero.
u32 RCON32[RCON_SIZE] = {
    0x01000000U, 0x02000000U, 0x04000000U, 0x08000000U, 0x10000000U,
    0x20000000U, 0x40000000U, 0x80000000U, 0x1b000000U, 0x36000000U,
    0x6c000000U, 0xd8000000U, 0xab000000U, 0x4d000000U, 0x9a000000U
};
689 |
// ---------------------------------------------------------------------------
// Small AES
// ---------------------------------------------------------------------------
// Problem sizes (16^1 .. 16^4) and the round count used by the small variant.
#define PROB_SIZE_1		16
#define PROB_SIZE_2		256
#define PROB_SIZE_3		4096
#define PROB_SIZE_4		65536
#define ROUND_5			5

// Sixteen-entry, 16-bit lookup tables for the small variant.
// For most rows T1_SML..T3_SML hold the T0_SML entry rotated right by one,
// two and three nibbles.
// NOTE(review): rows 5, 7, 11 and 15 do not follow that rotation pattern
// (e.g. T0_SML[5] = 0xcef2 but T1_SML[5] = 0x3cee) -- verify these entries
// against the generator before relying on them.
u16 T0_SML[16] = {
    0xc66a, 0x6bbd, 0xa55f, 0x844c, 0x4226, 0xcef2, 0xe779, 0x4abe,
    0x299b, 0xadd7, 0xeff1, 0x8cd4, 0x6335, 0x2113, 0x0000, 0x0898
};
u16 T1_SML[16] = {
    0xac66, 0xd6bb, 0xfa55, 0xc844, 0x6422, 0x3cee, 0x9e77, 0xf4aa,
    0xb299, 0x7add, 0x1eff, 0x58cc, 0x5633, 0x3211, 0x0000, 0x9088
};
u16 T2_SML[16] = {
    0x6ac6, 0xbd6b, 0x5fa5, 0x4c84, 0x2642, 0xf3ce, 0x79e7, 0xbf4a,
    0x9b29, 0xd7ad, 0xf1ef, 0xd58c, 0x3563, 0x1321, 0x0000, 0x9908
};
u16 T3_SML[16] = {
    0x66ac, 0xbbd6, 0x55fa, 0x44c8, 0x2264, 0xef3c, 0x779e, 0xabf4,
    0x99b2, 0xdd7a, 0xff1e, 0xcd58, 0x3356, 0x1132, 0x0000, 0x8990
};
// Every entry repeats one 4-bit value in all four nibbles.
u16 T4_SML[16] = {
    0x6666, 0xbbbb, 0x5555, 0x4444, 0x2222, 0xeeee, 0x7777, 0xaaaa,
    0x9999, 0xdddd, 0xffff, 0xcccc, 0x3333, 0x1111, 0x0000, 0x8888
};
// Round constants for the small variant: the top nibble counts 1..15, the
// last entry is zero.
u16 RCON_SML[16] = {
    0x1000, 0x2000, 0x3000, 0x4000, 0x5000, 0x6000, 0x7000, 0x8000,
    0x9000, 0xa000, 0xb000, 0xc000, 0xd000, 0xe000, 0xf000, 0x0000
};
733 |
// Forward declaration of the program entry point.
int main();
735 |
--------------------------------------------------------------------------------
/128-es.cuh:
--------------------------------------------------------------------------------
1 | // System includes
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | // CUDA runtime
8 | #include
9 |
10 | // Helper functions and utilities to work with CUDA
11 | //#include
12 | //#include
13 |
14 | #include
15 | //#include
16 |
17 | // Custom header
18 | //#include "kernel.h"
19 |
20 |
// Basic exhaustive search
// 4 Tables
//
// Every thread encrypts the plaintext `pt` under `*range` consecutive
// candidate keys and prints each key whose ciphertext equals `ct`.
//   pt, ct   : 4 words each (plaintext / target ciphertext)
//   rk       : 4 words, the first candidate key of the whole search
//   t0G..t3G : round tables, TABLE_SIZE entries each
//   t4G      : table used by the key schedule and the last round; the byte
//              masks below assume every entry carries the substituted byte in
//              all four byte positions -- TODO confirm against the host table
//   rconG    : RCON_SIZE round constants
//   range    : number of keys tested by each thread
// Thread i starts at rk with the 64-bit offset i * (*range) added to the
// counter formed by rk[2] (high word) and rk[3] (low word); rk[0] and rk[1]
// are never modified. Any block size works: the shared tables are filled
// with block-stride loops.
__global__ void exhaustiveSearch(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t1G, u32* t2G, u32* t3G, u32* t4G, u32* rconG, u64* range) {

    // Widen before multiplying so a large grid cannot wrap a 32-bit index.
    u64 threadIndex = (u64)blockIdx.x * blockDim.x + threadIdx.x;

    __shared__ u32 t0S[TABLE_SIZE];
    __shared__ u32 t1S[TABLE_SIZE];
    __shared__ u32 t2S[TABLE_SIZE];
    __shared__ u32 t3S[TABLE_SIZE];
    __shared__ u32 t4S[TABLE_SIZE];
    __shared__ u32 rconS[RCON_SIZE];
    __shared__ u32 ctS[U32_SIZE];

    // Block-stride copies: every entry is written even when the block has
    // fewer threads than the table has entries.
    for (unsigned int i = threadIdx.x; i < TABLE_SIZE; i += blockDim.x) {
        t0S[i] = t0G[i];
        t1S[i] = t1G[i];
        t2S[i] = t2G[i];
        t3S[i] = t3G[i];
        t4S[i] = t4G[i];
    }
    for (unsigned int i = threadIdx.x; i < RCON_SIZE; i += blockDim.x) {
        rconS[i] = rconG[i];
    }
    for (unsigned int i = threadIdx.x; i < U32_SIZE; i += blockDim.x) {
        ctS[i] = ct[i];
    }

    // The shared tables must be complete before any thread reads them.
    __syncthreads();

    u32 rk0Init = rk[0];
    u32 rk1Init = rk[1];
    u32 rk2Init = rk[2];
    u32 rk3Init = rk[3];

    const u32 pt0Init = pt[0];
    const u32 pt1Init = pt[1];
    const u32 pt2Init = pt[2];
    const u32 pt3Init = pt[3];

    // Advance the 64-bit counter rk2:rk3 to this thread's first key. Adding
    // in 64 bits splits the offset at exactly 2^32 and carries from the low
    // word into the high word.
    u64 threadRange = *range;
    u64 threadRangeStart = threadIndex * threadRange;
    u64 keyLow = ((((u64)rk2Init) << 32) | (u64)rk3Init) + threadRangeStart;
    rk2Init = (u32)(keyLow >> 32);
    rk3Init = (u32)keyLow;

    for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        u32 rk0 = rk0Init;
        u32 rk1 = rk1Init;
        u32 rk2 = rk2Init;
        u32 rk3 = rk3Init;

        // First round just XORs input with key.
        u32 s0 = pt0Init ^ rk0;
        u32 s1 = pt1Init ^ rk1;
        u32 s2 = pt2Init ^ rk2;
        u32 s3 = pt3Init ^ rk3;

        u32 t0, t1, t2, t3;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {

            // Calculate round key (computed on the fly, one round at a time)
            u32 temp = rk3;
            rk0 = rk0 ^
                (t4S[(temp >> 16) & 0xff] & 0xff000000) ^
                (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^
                (t4S[(temp) & 0xff] & 0x0000ff00) ^
                (t4S[(temp >> 24)] & 0x000000ff) ^
                rconS[roundCount];
            rk1 = rk1 ^ rk0;
            rk2 = rk2 ^ rk1;
            rk3 = rk2 ^ rk3;

            // Table based round function
            t0 = t0S[s0 >> 24] ^ t1S[(s1 >> 16) & 0xFF] ^ t2S[(s2 >> 8) & 0xFF] ^ t3S[s3 & 0xFF] ^ rk0;
            t1 = t0S[s1 >> 24] ^ t1S[(s2 >> 16) & 0xFF] ^ t2S[(s3 >> 8) & 0xFF] ^ t3S[s0 & 0xFF] ^ rk1;
            t2 = t0S[s2 >> 24] ^ t1S[(s3 >> 16) & 0xFF] ^ t2S[(s0 >> 8) & 0xFF] ^ t3S[s1 & 0xFF] ^ rk2;
            t3 = t0S[s3 >> 24] ^ t1S[(s0 >> 16) & 0xFF] ^ t2S[(s1 >> 8) & 0xFF] ^ t3S[s2 & 0xFF] ^ rk3;

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;
        }

        // Calculate the last round key
        u32 temp = rk3;
        rk0 = rk0 ^
            (t4S[(temp >> 16) & 0xff] & 0xff000000) ^
            (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^
            (t4S[(temp) & 0xff] & 0x0000ff00) ^
            (t4S[(temp >> 24)] & 0x000000ff) ^
            rconS[ROUND_COUNT_MIN_1];

        // Last round uses s-box directly and XORs to produce output. Each
        // output word (and its round-key word) is computed only if all
        // previous words already matched the target ciphertext.
        s0 = (t4S[t0 >> 24] & 0xFF000000) ^ (t4S[(t1 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t2 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t3) & 0xFF] & 0x000000FF) ^ rk0;
        if (s0 == ctS[0]) {
            rk1 = rk1 ^ rk0;
            s1 = (t4S[t1 >> 24] & 0xFF000000) ^ (t4S[(t2 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t3 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t0) & 0xFF] & 0x000000FF) ^ rk1;
            if (s1 == ctS[1]) {
                rk2 = rk2 ^ rk1;
                s2 = (t4S[t2 >> 24] & 0xFF000000) ^ (t4S[(t3 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t0 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t1) & 0xFF] & 0x000000FF) ^ rk2;
                if (s2 == ctS[2]) {
                    rk3 = rk2 ^ rk3;
                    s3 = (t4S[t3 >> 24] & 0xFF000000) ^ (t4S[(t0 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t1 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t2) & 0xFF] & 0x000000FF) ^ rk3;
                    if (s3 == ctS[3]) {
                        printf("! Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init);
                        printf("-------------------------------\n");
                    }
                }
            }
        }

        // Overflow: carry into the next word when the low word wraps
        if (rk3Init == MAX_U32) {
            rk2Init++;
        }

        // Move on to the next candidate key
        rk3Init++;
    }
}
173 |
// Exhaustive search with one table
// 1 Table -> arithmetic shift: 2 shift 1 and
//
// Same search as the four-table kernel, but only t0G is kept in shared
// memory; the other three table values are derived from the t0 entry with
// arithmeticRightShift(entry, 8 / 16 / 24) (defined elsewhere in the project).
//   pt, ct : 4 words each (plaintext / target ciphertext)
//   rk     : 4 words, the first candidate key of the whole search
//   t0G    : round table, TABLE_SIZE entries
//   t4G    : table used by the key schedule and the last round; the byte masks
//            below assume every entry carries the substituted byte in all four
//            byte positions -- TODO confirm against the host table
//   rconG  : RCON_SIZE round constants
//   range  : number of keys tested by each thread
// Thread i starts at rk with the 64-bit offset i * (*range) added to the
// counter formed by rk[2] (high word) and rk[3] (low word). Any block size
// works: the shared tables are filled with block-stride loops.
__global__ void exhaustiveSearchWithOneTable(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range) {

    // Widen before multiplying so a large grid cannot wrap a 32-bit index.
    u64 threadIndex = (u64)blockIdx.x * blockDim.x + threadIdx.x;

    __shared__ u32 t0S[TABLE_SIZE];
    __shared__ u32 t4S[TABLE_SIZE];
    __shared__ u32 rconS[RCON_SIZE];
    __shared__ u32 ctS[U32_SIZE];

    // Block-stride copies: every entry is written even when the block has
    // fewer threads than the table has entries.
    for (unsigned int i = threadIdx.x; i < TABLE_SIZE; i += blockDim.x) {
        t0S[i] = t0G[i];
        t4S[i] = t4G[i];
    }
    for (unsigned int i = threadIdx.x; i < RCON_SIZE; i += blockDim.x) {
        rconS[i] = rconG[i];
    }
    for (unsigned int i = threadIdx.x; i < U32_SIZE; i += blockDim.x) {
        ctS[i] = ct[i];
    }

    // The shared tables must be complete before any thread reads them.
    __syncthreads();

    u32 rk0Init = rk[0];
    u32 rk1Init = rk[1];
    u32 rk2Init = rk[2];
    u32 rk3Init = rk[3];

    const u32 pt0Init = pt[0];
    const u32 pt1Init = pt[1];
    const u32 pt2Init = pt[2];
    const u32 pt3Init = pt[3];

    // Advance the 64-bit counter rk2:rk3 to this thread's first key. Adding
    // in 64 bits splits the offset at exactly 2^32 and carries from the low
    // word into the high word.
    u64 threadRange = *range;
    u64 threadRangeStart = threadIndex * threadRange;
    u64 keyLow = ((((u64)rk2Init) << 32) | (u64)rk3Init) + threadRangeStart;
    rk2Init = (u32)(keyLow >> 32);
    rk3Init = (u32)keyLow;

    for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        u32 rk0 = rk0Init;
        u32 rk1 = rk1Init;
        u32 rk2 = rk2Init;
        u32 rk3 = rk3Init;

        // First round just XORs input with key.
        u32 s0 = pt0Init ^ rk0;
        u32 s1 = pt1Init ^ rk1;
        u32 s2 = pt2Init ^ rk2;
        u32 s3 = pt3Init ^ rk3;

        u32 t0, t1, t2, t3;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {

            // Calculate round key (computed on the fly, one round at a time)
            u32 temp = rk3;
            rk0 = rk0 ^
                (t4S[(temp >> 16) & 0xff] & 0xff000000) ^
                (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^
                (t4S[(temp) & 0xff] & 0x0000ff00) ^
                (t4S[(temp >> 24)] & 0x000000ff) ^
                rconS[roundCount];
            rk1 = rk1 ^ rk0;
            rk2 = rk2 ^ rk1;
            rk3 = rk2 ^ rk3;

            // Table based round function
            t0 = t0S[s0 >> 24] ^ arithmeticRightShift(t0S[(s1 >> 16) & 0xFF], 8) ^ arithmeticRightShift(t0S[(s2 >> 8) & 0xFF], 16) ^ arithmeticRightShift(t0S[s3 & 0xFF], 24) ^ rk0;
            t1 = t0S[s1 >> 24] ^ arithmeticRightShift(t0S[(s2 >> 16) & 0xFF], 8) ^ arithmeticRightShift(t0S[(s3 >> 8) & 0xFF], 16) ^ arithmeticRightShift(t0S[s0 & 0xFF], 24) ^ rk1;
            t2 = t0S[s2 >> 24] ^ arithmeticRightShift(t0S[(s3 >> 16) & 0xFF], 8) ^ arithmeticRightShift(t0S[(s0 >> 8) & 0xFF], 16) ^ arithmeticRightShift(t0S[s1 & 0xFF], 24) ^ rk2;
            t3 = t0S[s3 >> 24] ^ arithmeticRightShift(t0S[(s0 >> 16) & 0xFF], 8) ^ arithmeticRightShift(t0S[(s1 >> 8) & 0xFF], 16) ^ arithmeticRightShift(t0S[s2 & 0xFF], 24) ^ rk3;

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;
        }

        // Calculate the last round key
        u32 temp = rk3;
        rk0 = rk0 ^
            (t4S[(temp >> 16) & 0xff] & 0xff000000) ^
            (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^
            (t4S[(temp) & 0xff] & 0x0000ff00) ^
            (t4S[(temp >> 24)] & 0x000000ff) ^
            rconS[ROUND_COUNT_MIN_1];

        // Last round uses s-box directly and XORs to produce output. Each
        // output word (and its round-key word) is computed only if all
        // previous words already matched the target ciphertext.
        s0 = (t4S[t0 >> 24] & 0xFF000000) ^ (t4S[(t1 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t2 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t3) & 0xFF] & 0x000000FF) ^ rk0;
        if (s0 == ctS[0]) {
            rk1 = rk1 ^ rk0;
            s1 = (t4S[t1 >> 24] & 0xFF000000) ^ (t4S[(t2 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t3 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t0) & 0xFF] & 0x000000FF) ^ rk1;
            if (s1 == ctS[1]) {
                rk2 = rk2 ^ rk1;
                s2 = (t4S[t2 >> 24] & 0xFF000000) ^ (t4S[(t3 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t0 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t1) & 0xFF] & 0x000000FF) ^ rk2;
                if (s2 == ctS[2]) {
                    rk3 = rk2 ^ rk3;
                    s3 = (t4S[t3 >> 24] & 0xFF000000) ^ (t4S[(t0 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t1 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t2) & 0xFF] & 0x000000FF) ^ rk3;
                    if (s3 == ctS[3]) {
                        printf("! Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init);
                        printf("-------------------------------\n");
                    }
                }
            }
        }

        // Overflow: carry into the next word when the low word wraps
        if (rk3Init == MAX_U32) {
            rk2Init++;
        }

        // Move on to the next candidate key
        rk3Init++;
    }
}
305 |
// Exhaustive search with one table extended as 32 columns
// 1 Table [256][32] -> arithmetic shift: 2 shift 1 and
//
// The t0 table is replicated SHARED_MEM_BANK_SIZE times per row and every
// thread always reads column (threadIdx.x & 31), so lanes of a warp never
// share a column. The other table values are derived from the t0 entry with
// arithmeticRightShift(entry, 8 / 16 / 24) (defined elsewhere in the project).
//   pt, ct : 4 words each (plaintext / target ciphertext)
//   rk     : 4 words, the first candidate key of the whole search
//   t0G    : round table, TABLE_SIZE entries
//   t4G    : table used by the key schedule and the last round; the byte masks
//            below assume every entry carries the substituted byte in all four
//            byte positions -- TODO confirm against the host table
//   rconG  : RCON_SIZE round constants
//   range  : number of keys tested by each thread
// Thread i starts at rk with the 64-bit offset i * (*range) added to the
// counter formed by rk[2] (high word) and rk[3] (low word). Any block size
// works: the shared tables are filled with block-stride loops.
__global__ void exhaustiveSearchWithOneTableExtendedSharedMemory(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range) {

    // Widen before multiplying so a large grid cannot wrap a 32-bit index.
    u64 threadIndex = (u64)blockIdx.x * blockDim.x + threadIdx.x;
    int warpThreadIndex = threadIdx.x & 31;

    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4S[TABLE_SIZE];
    __shared__ u32 rconS[RCON_SIZE];
    __shared__ u32 ctS[U32_SIZE];

    // Block-stride copies: every row is written even when the block has
    // fewer threads than the table has entries.
    for (unsigned int row = threadIdx.x; row < TABLE_SIZE; row += blockDim.x) {
        t4S[row] = t4G[row];
        const u32 entry = t0G[row];
        for (unsigned int bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[row][bankIndex] = entry;
        }
    }
    for (unsigned int i = threadIdx.x; i < RCON_SIZE; i += blockDim.x) {
        rconS[i] = rconG[i];
    }
    for (unsigned int i = threadIdx.x; i < U32_SIZE; i += blockDim.x) {
        ctS[i] = ct[i];
    }

    // The shared tables must be complete before any thread reads them.
    __syncthreads();

    u32 rk0Init = rk[0];
    u32 rk1Init = rk[1];
    u32 rk2Init = rk[2];
    u32 rk3Init = rk[3];

    const u32 pt0Init = pt[0];
    const u32 pt1Init = pt[1];
    const u32 pt2Init = pt[2];
    const u32 pt3Init = pt[3];

    // Advance the 64-bit counter rk2:rk3 to this thread's first key. Adding
    // in 64 bits splits the offset at exactly 2^32 and carries from the low
    // word into the high word.
    u64 threadRange = *range;
    u64 threadRangeStart = threadIndex * threadRange;
    u64 keyLow = ((((u64)rk2Init) << 32) | (u64)rk3Init) + threadRangeStart;
    rk2Init = (u32)(keyLow >> 32);
    rk3Init = (u32)keyLow;

    for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        u32 rk0 = rk0Init;
        u32 rk1 = rk1Init;
        u32 rk2 = rk2Init;
        u32 rk3 = rk3Init;

        // First round just XORs input with key.
        u32 s0 = pt0Init ^ rk0;
        u32 s1 = pt1Init ^ rk1;
        u32 s2 = pt2Init ^ rk2;
        u32 s3 = pt3Init ^ rk3;

        u32 t0, t1, t2, t3;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {

            // Calculate round key (computed on the fly, one round at a time)
            u32 temp = rk3;
            rk0 = rk0 ^
                (t4S[(temp >> 16) & 0xff] & 0xff000000) ^
                (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^
                (t4S[(temp) & 0xff] & 0x0000ff00) ^
                (t4S[(temp >> 24)] & 0x000000ff) ^
                rconS[roundCount];
            rk1 = rk1 ^ rk0;
            rk2 = rk2 ^ rk1;
            rk3 = rk2 ^ rk3;

            // Table based round function
            t0 = t0S[s0 >> 24][warpThreadIndex] ^ arithmeticRightShift(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s3 & 0xFF][warpThreadIndex], 24) ^ rk0;
            t1 = t0S[s1 >> 24][warpThreadIndex] ^ arithmeticRightShift(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s0 & 0xFF][warpThreadIndex], 24) ^ rk1;
            t2 = t0S[s2 >> 24][warpThreadIndex] ^ arithmeticRightShift(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s1 & 0xFF][warpThreadIndex], 24) ^ rk2;
            t3 = t0S[s3 >> 24][warpThreadIndex] ^ arithmeticRightShift(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], 8) ^ arithmeticRightShift(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], 16) ^ arithmeticRightShift(t0S[s2 & 0xFF][warpThreadIndex], 24) ^ rk3;

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;
        }

        // Calculate the last round key
        u32 temp = rk3;
        rk0 = rk0 ^
            (t4S[(temp >> 16) & 0xff] & 0xff000000) ^
            (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^
            (t4S[(temp) & 0xff] & 0x0000ff00) ^
            (t4S[(temp >> 24)] & 0x000000ff) ^
            rconS[ROUND_COUNT_MIN_1];

        // Last round uses s-box directly and XORs to produce output. Each
        // output word (and its round-key word) is computed only if all
        // previous words already matched the target ciphertext.
        s0 = (t4S[t0 >> 24] & 0xFF000000) ^ (t4S[(t1 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t2 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t3) & 0xFF] & 0x000000FF) ^ rk0;
        if (s0 == ctS[0]) {
            rk1 = rk1 ^ rk0;
            s1 = (t4S[t1 >> 24] & 0xFF000000) ^ (t4S[(t2 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t3 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t0) & 0xFF] & 0x000000FF) ^ rk1;
            if (s1 == ctS[1]) {
                rk2 = rk2 ^ rk1;
                s2 = (t4S[t2 >> 24] & 0xFF000000) ^ (t4S[(t3 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t0 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t1) & 0xFF] & 0x000000FF) ^ rk2;
                if (s2 == ctS[2]) {
                    rk3 = rk2 ^ rk3;
                    s3 = (t4S[t3 >> 24] & 0xFF000000) ^ (t4S[(t0 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t1 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t2) & 0xFF] & 0x000000FF) ^ rk3;
                    if (s3 == ctS[3]) {
                        printf("! Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init);
                        printf("-------------------------------\n");
                    }
                }
            }
        }

        // Overflow: carry into the next word when the low word wraps
        if (rk3Init == MAX_U32) {
            rk2Init++;
        }

        // Move on to the next candidate key
        rk3Init++;
    }
}
440 |
// Exhaustive search with one table extended as 32 columns
// 1 Table [256][32] -> arithmetic shift: __byte_perm function
//
// The t0 table is replicated SHARED_MEM_BANK_SIZE times per row and every
// thread always reads column (threadIdx.x & 31), so lanes of a warp never
// share a column. The other table values are derived from the t0 entry with
// arithmeticRightShiftBytePerm(entry, SHIFT_n_RIGHT) (defined elsewhere in
// the project).
//   pt, ct : 4 words each (plaintext / target ciphertext)
//   rk     : 4 words, the first candidate key of the whole search
//   t0G    : round table, TABLE_SIZE entries
//   t4G    : table used by the key schedule and the last round; the byte masks
//            below assume every entry carries the substituted byte in all four
//            byte positions -- TODO confirm against the host table
//   rconG  : RCON_SIZE round constants
//   range  : number of keys tested by each thread
// Thread i starts at rk with the 64-bit offset i * (*range) added to the
// counter formed by rk[2] (high word) and rk[3] (low word). Any block size
// works: the shared tables are filled with block-stride loops.
__global__ void exhaustiveSearchWithOneTableExtendedSharedMemoryBytePerm(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range) {

    // Widen before multiplying so a large grid cannot wrap a 32-bit index.
    u64 threadIndex = (u64)blockIdx.x * blockDim.x + threadIdx.x;
    int warpThreadIndex = threadIdx.x & 31;

    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4S[TABLE_SIZE];
    __shared__ u32 rconS[RCON_SIZE];
    __shared__ u32 ctS[U32_SIZE];

    // Block-stride copies: every row is written even when the block has
    // fewer threads than the table has entries.
    for (unsigned int row = threadIdx.x; row < TABLE_SIZE; row += blockDim.x) {
        t4S[row] = t4G[row];
        const u32 entry = t0G[row];
        for (unsigned int bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[row][bankIndex] = entry;
        }
    }
    for (unsigned int i = threadIdx.x; i < RCON_SIZE; i += blockDim.x) {
        rconS[i] = rconG[i];
    }
    for (unsigned int i = threadIdx.x; i < U32_SIZE; i += blockDim.x) {
        ctS[i] = ct[i];
    }

    // The shared tables must be complete before any thread reads them.
    __syncthreads();

    u32 rk0Init = rk[0];
    u32 rk1Init = rk[1];
    u32 rk2Init = rk[2];
    u32 rk3Init = rk[3];

    const u32 pt0Init = pt[0];
    const u32 pt1Init = pt[1];
    const u32 pt2Init = pt[2];
    const u32 pt3Init = pt[3];

    // Advance the 64-bit counter rk2:rk3 to this thread's first key. Adding
    // in 64 bits splits the offset at exactly 2^32 and carries from the low
    // word into the high word.
    u64 threadRange = *range;
    u64 threadRangeStart = threadIndex * threadRange;
    u64 keyLow = ((((u64)rk2Init) << 32) | (u64)rk3Init) + threadRangeStart;
    rk2Init = (u32)(keyLow >> 32);
    rk3Init = (u32)keyLow;

    for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        u32 rk0 = rk0Init;
        u32 rk1 = rk1Init;
        u32 rk2 = rk2Init;
        u32 rk3 = rk3Init;

        // First round just XORs input with key.
        u32 s0 = pt0Init ^ rk0;
        u32 s1 = pt1Init ^ rk1;
        u32 s2 = pt2Init ^ rk2;
        u32 s3 = pt3Init ^ rk3;

        u32 t0, t1, t2, t3;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {

            // Calculate round key (computed on the fly, one round at a time)
            u32 temp = rk3;
            rk0 = rk0 ^
                (t4S[(temp >> 16) & 0xff] & 0xff000000) ^
                (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^
                (t4S[(temp) & 0xff] & 0x0000ff00) ^
                (t4S[(temp >> 24)] & 0x000000ff) ^
                rconS[roundCount];
            rk1 = rk1 ^ rk0;
            rk2 = rk2 ^ rk1;
            rk3 = rk2 ^ rk3;

            // Table based round function
            t0 = t0S[s0 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk0;
            t1 = t0S[s1 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk1;
            t2 = t0S[s2 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk2;
            t3 = t0S[s3 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk3;

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;
        }

        // Calculate the last round key
        u32 temp = rk3;
        rk0 = rk0 ^
            (t4S[(temp >> 16) & 0xff] & 0xff000000) ^
            (t4S[(temp >> 8) & 0xff] & 0x00ff0000) ^
            (t4S[(temp) & 0xff] & 0x0000ff00) ^
            (t4S[(temp >> 24)] & 0x000000ff) ^
            rconS[ROUND_COUNT_MIN_1];

        // Last round uses s-box directly and XORs to produce output. Each
        // output word (and its round-key word) is computed only if all
        // previous words already matched the target ciphertext.
        s0 = (t4S[t0 >> 24] & 0xFF000000) ^ (t4S[(t1 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t2 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t3) & 0xFF] & 0x000000FF) ^ rk0;
        if (s0 == ctS[0]) {
            rk1 = rk1 ^ rk0;
            s1 = (t4S[t1 >> 24] & 0xFF000000) ^ (t4S[(t2 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t3 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t0) & 0xFF] & 0x000000FF) ^ rk1;
            if (s1 == ctS[1]) {
                rk2 = rk2 ^ rk1;
                s2 = (t4S[t2 >> 24] & 0xFF000000) ^ (t4S[(t3 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t0 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t1) & 0xFF] & 0x000000FF) ^ rk2;
                if (s2 == ctS[2]) {
                    rk3 = rk2 ^ rk3;
                    s3 = (t4S[t3 >> 24] & 0xFF000000) ^ (t4S[(t0 >> 16) & 0xff] & 0x00FF0000) ^ (t4S[(t1 >> 8) & 0xff] & 0x0000FF00) ^ (t4S[(t2) & 0xFF] & 0x000000FF) ^ rk3;
                    if (s3 == ctS[3]) {
                        printf("! Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init);
                        printf("-------------------------------\n");
                    }
                }
            }
        }

        // Overflow: carry into the next word when the low word wraps
        if (rk3Init == MAX_U32) {
            rk2Init++;
        }

        // Move on to the next candidate key
        rk3Init++;
    }
}
575 |
// Exhaustive search with one table extended as 32 columns
// 1 Table [256][32] -> arithmetic shift: __byte_perm function
// SBox[256] is partly expanded
//
// The t0 table is replicated SHARED_MEM_BANK_SIZE times per row and read at
// column (threadIdx.x & 31); the t4 table is replicated S_BOX_BANK_SIZE times
// per row and read at column ((threadIdx.x & 31) % S_BOX_BANK_SIZE). The
// other round-table values are derived from the t0 entry with
// arithmeticRightShiftBytePerm(entry, SHIFT_n_RIGHT) (defined elsewhere in
// the project).
//   pt, ct : 4 words each (plaintext / target ciphertext)
//   rk     : 4 words, the first candidate key of the whole search
//   t0G    : round table, TABLE_SIZE entries
//   t4G    : table used by the key schedule and the last round; the byte masks
//            below assume every entry carries the substituted byte in all four
//            byte positions -- TODO confirm against the host table
//   rconG  : RCON_SIZE round constants
//   range  : number of keys tested by each thread
// Thread i starts at rk with the 64-bit offset i * (*range) added to the
// counter formed by rk[2] (high word) and rk[3] (low word). Any block size
// works: the shared tables are filled with block-stride loops.
__global__ void exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range) {

    // Widen before multiplying so a large grid cannot wrap a 32-bit index.
    u64 threadIndex = (u64)blockIdx.x * blockDim.x + threadIdx.x;
    int warpThreadIndex = threadIdx.x & 31;
    int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE;

    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
    __shared__ u32 rconS[RCON_SIZE];
    __shared__ u32 ctS[U32_SIZE];

    // Block-stride copies: every row is written even when the block has
    // fewer threads than the table has entries.
    for (unsigned int row = threadIdx.x; row < TABLE_SIZE; row += blockDim.x) {
        const u32 roundEntry = t0G[row];
        for (unsigned int bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[row][bankIndex] = roundEntry;
        }
        const u32 sboxEntry = t4G[row];
        for (unsigned int bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) {
            t4S[row][bankIndex] = sboxEntry;
        }
    }
    for (unsigned int i = threadIdx.x; i < RCON_SIZE; i += blockDim.x) {
        rconS[i] = rconG[i];
    }
    for (unsigned int i = threadIdx.x; i < U32_SIZE; i += blockDim.x) {
        ctS[i] = ct[i];
    }

    // The shared tables must be complete before any thread reads them.
    __syncthreads();

    u32 rk0Init = rk[0];
    u32 rk1Init = rk[1];
    u32 rk2Init = rk[2];
    u32 rk3Init = rk[3];

    const u32 pt0Init = pt[0];
    const u32 pt1Init = pt[1];
    const u32 pt2Init = pt[2];
    const u32 pt3Init = pt[3];

    // Advance the 64-bit counter rk2:rk3 to this thread's first key. Adding
    // in 64 bits splits the offset at exactly 2^32 and carries from the low
    // word into the high word.
    u64 threadRange = *range;
    u64 threadRangeStart = threadIndex * threadRange;
    u64 keyLow = ((((u64)rk2Init) << 32) | (u64)rk3Init) + threadRangeStart;
    rk2Init = (u32)(keyLow >> 32);
    rk3Init = (u32)keyLow;

    for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        u32 rk0 = rk0Init;
        u32 rk1 = rk1Init;
        u32 rk2 = rk2Init;
        u32 rk3 = rk3Init;

        // First round just XORs input with key.
        u32 s0 = pt0Init ^ rk0;
        u32 s1 = pt1Init ^ rk1;
        u32 s2 = pt2Init ^ rk2;
        u32 s3 = pt3Init ^ rk3;

        u32 t0, t1, t2, t3;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {

            // Calculate round key (computed on the fly, one round at a time)
            u32 temp = rk3;
            rk0 = rk0 ^
                (t4S[(temp >> 16) & 0xff][warpThreadIndexSBox] & 0xff000000) ^
                (t4S[(temp >> 8) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^
                (t4S[(temp) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^
                (t4S[(temp >> 24)][warpThreadIndexSBox] & 0x000000ff) ^
                rconS[roundCount];
            rk1 = rk1 ^ rk0;
            rk2 = rk2 ^ rk1;
            rk3 = rk2 ^ rk3;

            // Table based round function
            t0 = t0S[s0 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk0;
            t1 = t0S[s1 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk1;
            t2 = t0S[s2 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk2;
            t3 = t0S[s3 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk3;

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;
        }

        // Calculate the last round key
        u32 temp = rk3;
        rk0 = rk0 ^
            (t4S[(temp >> 16) & 0xff][warpThreadIndexSBox] & 0xff000000) ^
            (t4S[(temp >> 8) & 0xff][warpThreadIndexSBox] & 0x00ff0000) ^
            (t4S[(temp) & 0xff][warpThreadIndexSBox] & 0x0000ff00) ^
            (t4S[(temp >> 24)][warpThreadIndexSBox] & 0x000000ff) ^
            rconS[ROUND_COUNT_MIN_1];

        // Last round uses s-box directly and XORs to produce output. Each
        // output word (and its round-key word) is computed only if all
        // previous words already matched the target ciphertext.
        s0 = (t4S[t0 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t1 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t2 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t3) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rk0;
        if (s0 == ctS[0]) {
            rk1 = rk1 ^ rk0;
            s1 = (t4S[t1 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t2 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t3 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t0) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rk1;
            if (s1 == ctS[1]) {
                rk2 = rk2 ^ rk1;
                s2 = (t4S[t2 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t3 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t0 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t1) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rk2;
                if (s2 == ctS[2]) {
                    rk3 = rk2 ^ rk3;
                    s3 = (t4S[t3 >> 24][warpThreadIndexSBox] & 0xFF000000) ^ (t4S[(t0 >> 16) & 0xff][warpThreadIndexSBox] & 0x00FF0000) ^ (t4S[(t1 >> 8) & 0xff][warpThreadIndexSBox] & 0x0000FF00) ^ (t4S[(t2) & 0xFF][warpThreadIndexSBox] & 0x000000FF) ^ rk3;
                    if (s3 == ctS[3]) {
                        printf("! Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init);
                        printf("-------------------------------\n");
                    }
                }
            }
        }

        // Overflow: carry into the next word when the low word wraps
        if (rk3Init == MAX_U32) {
            rk2Init++;
        }

        // Move on to the next candidate key
        rk3Init++;
    }
}
715 | __global__ void exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range, u8 *SAES) {
716 | u64 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
717 | int warpThreadIndex = threadIdx.x & 31;
718 | // int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE;
719 | //
720 | __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
721 | __shared__ u32 rconS[RCON_SIZE];
722 | __shared__ u32 ctS[U32_SIZE];
723 | // __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
724 | __shared__ u8 Sbox[64][32][4];
725 |
726 | if (threadIdx.x < TABLE_SIZE) {
727 | for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
728 | t0S[threadIdx.x][bankIndex] = t0G[threadIdx.x];
729 | Sbox[threadIdx.x / 4][bankIndex][threadIdx.x % 4] = SAES[threadIdx.x];
730 | }
731 | // for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) { t4S[threadIdx.x][bankIndex] = t4G[threadIdx.x]; }
732 | // for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { Sbox[threadIdx.x/4][bankIndex][threadIdx.x % 4] = SAES[threadIdx.x]; }
733 | if (threadIdx.x < U32_SIZE) { ctS[threadIdx.x] = ct[threadIdx.x]; }
734 | if (threadIdx.x < RCON_SIZE) { rconS[threadIdx.x] = rconG[threadIdx.x]; }
735 |
736 | } //
737 | __syncthreads(); // Wait until every thread is ready
738 | u32 rk0Init, rk1Init, rk2Init, rk3Init;
739 | rk0Init = rk[0]; rk1Init = rk[1]; rk2Init = rk[2]; rk3Init = rk[3];
740 | u32 pt0Init, pt1Init, pt2Init, pt3Init;
741 | pt0Init = pt[0]; pt1Init = pt[1]; pt2Init = pt[2]; pt3Init = pt[3];
742 | u64 threadRange = *range;
743 | u64 threadRangeStart = threadIndex * threadRange;
744 | rk2Init = rk2Init + threadRangeStart / (u64)MAX_U32;
745 | rk3Init = rk3Init + threadRangeStart % (u64)MAX_U32;
746 | for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {
747 | u32 rk0, rk1, rk2, rk3;
748 | rk0 = rk0Init; rk1 = rk1Init; rk2 = rk2Init; rk3 = rk3Init;
749 | // Create plaintext as 32 bit unsigned integers
750 | u32 s0, s1, s2, s3;
751 | s0 = pt0Init; s1 = pt1Init; s2 = pt2Init; s3 = pt3Init;
752 | // First round just XORs input with key.
753 | s0 = s0 ^ rk0; s1 = s1 ^ rk1; s2 = s2 ^ rk2; s3 = s3 ^ rk3;
754 | u32 t0, t1, t2, t3;
755 | for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {
756 | // Calculate round key
757 | u32 temp = rk3;
758 | rk0 = rk0 ^
759 | arithmeticRightShiftBytePerm((u32)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16)) % 4], SHIFT_1_RIGHT) ^
760 | arithmeticRightShiftBytePerm((u32)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8)) % 4], SHIFT_2_RIGHT) ^
761 | arithmeticRightShiftBytePerm((u32)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp)) % 4], SHIFT_3_RIGHT) ^
762 | ((u32)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^
763 | rconS[roundCount];
764 | rk1 = rk1 ^ rk0; rk2 = rk2 ^ rk1; rk3 = rk2 ^ rk3;
765 | // Table based round function
766 | t0 = t0S[s0 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk0;
767 | t1 = t0S[s1 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk1;
768 | t2 = t0S[s2 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk2;
769 | t3 = t0S[s3 >> 24][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk3;
770 | s0 = t0; s1 = t1; s2 = t2; s3 = t3;
771 | }
772 | // Calculate the last round key
773 | u32 temp = rk3;
774 | rk0 = rk0 ^
775 | arithmeticRightShiftBytePerm((u32)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16)) % 4], SHIFT_1_RIGHT) ^
776 | arithmeticRightShiftBytePerm((u32)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8)) % 4], SHIFT_2_RIGHT) ^
777 | arithmeticRightShiftBytePerm((u32)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp)) % 4], SHIFT_3_RIGHT) ^
778 | ((u32)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^
779 | rconS[ROUND_COUNT_MIN_1];
780 | // Last round uses s-box directly and XORs to produce output.
781 | s0 = arithmeticRightShiftBytePerm((u32)Sbox[((t0 >> 24)) / 4][warpThreadIndex][((t0 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t1 >> 16) & 0xff) / 4][warpThreadIndex][((t1 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t2 >> 8) &0xFF) / 4][warpThreadIndex][((t2 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u32)Sbox[((t3 & 0xFF) / 4)][warpThreadIndex][((t3 & 0xFF) % 4)]) ^ rk0;
782 | if (s0 == ctS[0]) {
783 | rk1 = rk1 ^ rk0;
784 | s1 = arithmeticRightShiftBytePerm((u32)Sbox[((t1 >> 24)) / 4][warpThreadIndex][((t1 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t2 >> 16) & 0xff) / 4][warpThreadIndex][((t2 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t3 >> 8) & 0xFF) / 4][warpThreadIndex][((t3 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u32)Sbox[((t0 & 0xFF) / 4)][warpThreadIndex][((t0 & 0xFF) % 4)]) ^ rk1;
785 | if (s1 == ctS[1]) {
786 | rk2 = rk2 ^ rk1;
787 | s2 = arithmeticRightShiftBytePerm((u32)Sbox[((t2 >> 24)) / 4][warpThreadIndex][((t2 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t3 >> 16) & 0xff) / 4][warpThreadIndex][((t3 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t0 >> 8) & 0xFF) / 4][warpThreadIndex][((t0 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u32)Sbox[((t1 & 0xFF) / 4)][warpThreadIndex][((t1 & 0xFF) % 4)]) ^ rk2;
788 | if (s2 == ctS[2]) {
789 | rk3 = rk2 ^ rk3;
790 | s3 = arithmeticRightShiftBytePerm((u32)Sbox[((t3 >> 24)) / 4][warpThreadIndex][((t3 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t0 >> 16) & 0xff) / 4][warpThreadIndex][((t0 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u32)Sbox[((t1 >> 8) & 0xFF) / 4][warpThreadIndex][((t1 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u32)Sbox[((t2 & 0xFF) / 4)][warpThreadIndex][((t2 & 0xFF) % 4)]) ^ rk3;
791 | if (s3 == ctS[3]) {
792 | printf("! Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init);
793 | printf("-------------------------------\n");
794 | }
795 | }
796 | }
797 | }
798 | // Overflow
799 | if (rk3Init == MAX_U32) { rk2Init++; }
800 | rk3Init++; // Create key as 32 bit unsigned integers
801 | }
802 | }
// Exhaustive AES-128 key search; every loop iteration encrypts the plaintext
// under one candidate key and compares the result with the ciphertext.
//
// Shared-memory lookup tables:
//   t0S  - the table read from t0G; each entry is copied into
//          SHARED_MEM_BANK_SIZE columns and a thread only reads column
//          (threadIdx.x & 31).
//   Sbox - the bytes read from SAES, four consecutive bytes per 32-bit slot,
//          copied into 32 columns and read with the same per-lane column.
// The remaining round-table values are derived on the fly with
// arithmeticRightShift(value, 8 / 16 / 24).
// NOTE(review): arithmeticRightShift is defined elsewhere in the project; its
// use here (moving an S-box byte into the upper byte lanes) assumes it behaves
// as a 32-bit rotate right - confirm.
//
// Parameters:
//   pt, ct - plaintext and expected ciphertext, 4 words each.
//   rk     - first candidate key, 4 words. Words 2 and 3 are advanced as one
//            64-bit counter (word 2 high, word 3 low); words 0 and 1 stay fixed.
//   t0G    - TABLE_SIZE table words.
//   t4G    - not read by this kernel.
//   rconG  - RCON_SIZE round constants.
//   range  - *range is the number of consecutive keys each thread tests.
//   SAES   - TABLE_SIZE S-box bytes.
//
// Launch layout: 1-D grid of 1-D blocks (only the .x components are used).
// Thread i tests the keys starting at rk + i * (*range). A match is reported
// through device-side printf.
__global__ void exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir2(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4G, u32* rconG, u64* range, u8* SAES) {
    // Widen before multiplying so the flat index cannot wrap in 32 bits.
    u64 threadIndex = (u64)blockIdx.x * blockDim.x + threadIdx.x;
    int warpThreadIndex = threadIdx.x & 31;

    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 rconS[RCON_SIZE];
    __shared__ u32 ctS[U32_SIZE];
    __shared__ u8 Sbox[64][32][4];

    // Block-stride staging: every table entry is written even when the block
    // has fewer than TABLE_SIZE threads.
    for (unsigned int entry = threadIdx.x; entry < TABLE_SIZE; entry += blockDim.x) {
        for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[entry][bankIndex] = t0G[entry];
            Sbox[entry / 4][bankIndex][entry % 4] = SAES[entry];
        }
    }
    for (unsigned int word = threadIdx.x; word < U32_SIZE; word += blockDim.x) {
        ctS[word] = ct[word];
    }
    for (unsigned int word = threadIdx.x; word < RCON_SIZE; word += blockDim.x) {
        rconS[word] = rconG[word];
    }
    // All tables must be complete before any thread starts encrypting.
    __syncthreads();

    u32 rk0Init, rk1Init, rk2Init, rk3Init;
    rk0Init = rk[0]; rk1Init = rk[1]; rk2Init = rk[2]; rk3Init = rk[3];
    u32 pt0Init, pt1Init, pt2Init, pt3Init;
    pt0Init = pt[0]; pt1Init = pt[1]; pt2Init = pt[2]; pt3Init = pt[3];

    u64 threadRange = *range;
    u64 threadRangeStart = threadIndex * threadRange;
    // Add the thread offset to key words 2..3 as a single 64-bit value so a
    // wrap of word 3 carries into word 2 instead of being dropped.
    u64 keyCounter = (((u64)rk2Init) << 32) | (u64)rk3Init;
    keyCounter += threadRangeStart;
    rk2Init = (u32)(keyCounter >> 32);
    rk3Init = (u32)keyCounter;

    for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {
        u32 rk0, rk1, rk2, rk3;
        rk0 = rk0Init; rk1 = rk1Init; rk2 = rk2Init; rk3 = rk3Init;

        // Load the plaintext and apply the initial key addition.
        u32 s0, s1, s2, s3;
        s0 = pt0Init ^ rk0;
        s1 = pt1Init ^ rk1;
        s2 = pt2Init ^ rk2;
        s3 = pt3Init ^ rk3;

        u32 t0, t1, t2, t3;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {
            // Derive the next round key from the previous one.
            u32 temp = rk3;
            rk0 = rk0 ^
                arithmeticRightShift((u32)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16)) % 4], 8) ^
                arithmeticRightShift((u32)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8)) % 4], 16) ^
                arithmeticRightShift((u32)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp)) % 4], 24) ^
                ((u32)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^
                rconS[roundCount];
            rk1 = rk1 ^ rk0;
            rk2 = rk2 ^ rk1;
            rk3 = rk2 ^ rk3;

            // Table-based round function.
            t0 = t0S[s0 >> 24][warpThreadIndex] ^
                arithmeticRightShift(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], 8) ^
                arithmeticRightShift(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], 16) ^
                arithmeticRightShift(t0S[s3 & 0xFF][warpThreadIndex], 24) ^ rk0;
            t1 = t0S[s1 >> 24][warpThreadIndex] ^
                arithmeticRightShift(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], 8) ^
                arithmeticRightShift(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], 16) ^
                arithmeticRightShift(t0S[s0 & 0xFF][warpThreadIndex], 24) ^ rk1;
            t2 = t0S[s2 >> 24][warpThreadIndex] ^
                arithmeticRightShift(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], 8) ^
                arithmeticRightShift(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], 16) ^
                arithmeticRightShift(t0S[s1 & 0xFF][warpThreadIndex], 24) ^ rk2;
            t3 = t0S[s3 >> 24][warpThreadIndex] ^
                arithmeticRightShift(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], 8) ^
                arithmeticRightShift(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], 16) ^
                arithmeticRightShift(t0S[s2 & 0xFF][warpThreadIndex], 24) ^ rk3;
            s0 = t0; s1 = t1; s2 = t2; s3 = t3;
        }

        // Last round key: only word 0 is computed up front; words 1..3 are
        // derived lazily below, each only if the previous output word matched.
        u32 temp = rk3;
        rk0 = rk0 ^
            arithmeticRightShift((u32)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16)) % 4], 8) ^
            arithmeticRightShift((u32)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8)) % 4], 16) ^
            arithmeticRightShift((u32)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp)) % 4], 24) ^
            ((u32)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^
            rconS[ROUND_COUNT_MIN_1];

        // Last round uses the S-box directly; stop at the first mismatching word.
        s0 = arithmeticRightShift((u32)Sbox[((t0 >> 24)) / 4][warpThreadIndex][((t0 >> 24)) % 4], 8) ^
            arithmeticRightShift((u32)Sbox[((t1 >> 16) & 0xff) / 4][warpThreadIndex][((t1 >> 16)) % 4], 16) ^
            arithmeticRightShift((u32)Sbox[((t2 >> 8) & 0xFF) / 4][warpThreadIndex][((t2 >> 8)) % 4], 24) ^
            ((u32)Sbox[((t3 & 0xFF) / 4)][warpThreadIndex][((t3 & 0xFF) % 4)]) ^ rk0;
        if (s0 == ctS[0]) {
            rk1 = rk1 ^ rk0;
            s1 = arithmeticRightShift((u32)Sbox[((t1 >> 24)) / 4][warpThreadIndex][((t1 >> 24)) % 4], 8) ^
                arithmeticRightShift((u32)Sbox[((t2 >> 16) & 0xff) / 4][warpThreadIndex][((t2 >> 16)) % 4], 16) ^
                arithmeticRightShift((u32)Sbox[((t3 >> 8) & 0xFF) / 4][warpThreadIndex][((t3 >> 8)) % 4], 24) ^
                ((u32)Sbox[((t0 & 0xFF) / 4)][warpThreadIndex][((t0 & 0xFF) % 4)]) ^ rk1;
            if (s1 == ctS[1]) {
                rk2 = rk2 ^ rk1;
                s2 = arithmeticRightShift((u32)Sbox[((t2 >> 24)) / 4][warpThreadIndex][((t2 >> 24)) % 4], 8) ^
                    arithmeticRightShift((u32)Sbox[((t3 >> 16) & 0xff) / 4][warpThreadIndex][((t3 >> 16)) % 4], 16) ^
                    arithmeticRightShift((u32)Sbox[((t0 >> 8) & 0xFF) / 4][warpThreadIndex][((t0 >> 8)) % 4], 24) ^
                    ((u32)Sbox[((t1 & 0xFF) / 4)][warpThreadIndex][((t1 & 0xFF) % 4)]) ^ rk2;
                if (s2 == ctS[2]) {
                    rk3 = rk2 ^ rk3;
                    s3 = arithmeticRightShift((u32)Sbox[((t3 >> 24)) / 4][warpThreadIndex][((t3 >> 24)) % 4], 8) ^
                        arithmeticRightShift((u32)Sbox[((t0 >> 16) & 0xff) / 4][warpThreadIndex][((t0 >> 16)) % 4], 16) ^
                        arithmeticRightShift((u32)Sbox[((t1 >> 8) & 0xFF) / 4][warpThreadIndex][((t1 >> 8)) % 4], 24) ^
                        ((u32)Sbox[((t2 & 0xFF) / 4)][warpThreadIndex][((t2 & 0xFF) % 4)]) ^ rk3;
                    if (s3 == ctS[3]) {
                        printf("! Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init);
                        printf("-------------------------------\n");
                    }
                }
            }
        }

        // Next candidate key: bump word 3 and carry into word 2 when it wraps.
        rk3Init++;
        if (rk3Init == 0) { rk2Init++; }
    }
}
891 | /*__global__ void exhaustiveSearchCem(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t1G, u32* t4G, u32* rconG, u64* range, u8* SAES) {
892 | int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
893 | int warpThreadIndex = threadIdx.x & 31;
894 | // int warpThreadIndexSBox = warpThreadIndex % S_BOX_BANK_SIZE;
895 | //
896 | __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
897 | __shared__ u32 t1S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
898 | // __shared__ u32 t4S[TABLE_SIZE][S_BOX_BANK_SIZE];
899 | __shared__ u8 Sbox[64][32][4];
900 | __shared__ u32 rconS[RCON_SIZE];
901 | __shared__ u32 ctS[U32_SIZE];
902 | if (threadIdx.x < TABLE_SIZE) {
903 | for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { t0S[threadIdx.x][bankIndex] = t0G[threadIdx.x]; }
904 | for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) { t1S[threadIdx.x][bankIndex] = t1G[threadIdx.x]; }
905 | // for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) { t4S[threadIdx.x][bankIndex] = t4G[threadIdx.x]; }
906 | for (u8 bankIndex = 0; bankIndex < S_BOX_BANK_SIZE; bankIndex++) { Sbox[threadIdx.x / 4][bankIndex][threadIdx.x % 4] = SAES[threadIdx.x]; }
907 | if (threadIdx.x < RCON_SIZE) { rconS[threadIdx.x] = rconG[threadIdx.x]; }
908 | if (threadIdx.x < U32_SIZE) { ctS[threadIdx.x] = ct[threadIdx.x]; }
909 | } //
910 | __syncthreads(); // Wait until every thread is ready
911 | u32 rk0Init, rk1Init, rk2Init, rk3Init;
912 | rk0Init = rk[0]; rk1Init = rk[1]; rk2Init = rk[2]; rk3Init = rk[3];
913 | u32 pt0Init, pt1Init, pt2Init, pt3Init;
914 | pt0Init = pt[0]; pt1Init = pt[1]; pt2Init = pt[2]; pt3Init = pt[3];
915 | u64 threadRange = *range;
916 | u64 threadRangeStart = (u64)threadIndex * threadRange;
917 | rk2Init = rk2Init + threadRangeStart / MAX_U32;
918 | rk3Init = rk3Init + threadRangeStart % MAX_U32;
919 | for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {
920 | u32 rk0, rk1, rk2, rk3;
921 | rk0 = rk0Init; rk1 = rk1Init; rk2 = rk2Init; rk3 = rk3Init;
922 | // Create plaintext as 32 bit unsigned integers
923 | u32 s0, s1, s2, s3;
924 | s0 = pt0Init; s1 = pt1Init; s2 = pt2Init; s3 = pt3Init;
925 | // First round just XORs input with key.
926 | s0 = s0 ^ rk0; s1 = s1 ^ rk1; s2 = s2 ^ rk2; s3 = s3 ^ rk3;
927 | u32 t0, t1, t2, t3;
928 | for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {
929 | // Calculate round key
930 | u32 temp = rk3;
931 | rk0 = rk0 ^
932 | arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16) & 0xff) % 4], SHIFT_1_RIGHT) ^
933 | arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8) & 0xff) % 4], SHIFT_2_RIGHT) ^
934 | arithmeticRightShiftBytePerm((u64)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp) & 0xff) % 4], SHIFT_3_RIGHT) ^
935 | ((u64)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^
936 | rconS[roundCount];
937 |
938 | rk1 = rk1 ^ rk0; rk2 = rk2 ^ rk1; rk3 = rk2 ^ rk3;
939 | // Table based round function
940 | t0 = t0S[s0 >> 24][warpThreadIndex] ^ t1S[(s1 >> 16) & 0xFF][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk0;
941 | t1 = t0S[s1 >> 24][warpThreadIndex] ^ t1S[(s2 >> 16) & 0xFF][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk1;
942 | t2 = t0S[s2 >> 24][warpThreadIndex] ^ t1S[(s3 >> 16) & 0xFF][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk2;
943 | t3 = t0S[s3 >> 24][warpThreadIndex] ^ t1S[(s0 >> 16) & 0xFF][warpThreadIndex] ^ arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk3;
944 | s0 = t0; s1 = t1; s2 = t2; s3 = t3;
945 | }
946 | // Calculate the last round key
947 | u32 temp = rk3;
948 | rk0 = rk0 ^
949 | arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 16) & 0xff) / 4][warpThreadIndex][((temp >> 16)) % 4], SHIFT_1_RIGHT) ^
950 | arithmeticRightShiftBytePerm((u64)Sbox[((temp >> 8) & 0xff) / 4][warpThreadIndex][((temp >> 8)) % 4], SHIFT_2_RIGHT) ^
951 | arithmeticRightShiftBytePerm((u64)Sbox[((temp) & 0xff) / 4][warpThreadIndex][((temp)) % 4], SHIFT_3_RIGHT) ^
952 | ((u64)Sbox[((temp >> 24) / 4)][warpThreadIndex][((temp >> 24) % 4)]) ^
953 | rconS[ROUND_COUNT_MIN_1];
954 | // Last round uses s-box directly and XORs to produce output.
955 | s0 = arithmeticRightShiftBytePerm((u64)Sbox[((t0 >> 24)) / 4][warpThreadIndex][((t0 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t1 >> 16) & 0xff) / 4][warpThreadIndex][((t1 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t2 >> 8) & 0xFF) / 4][warpThreadIndex][((t2 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u64)Sbox[((t3 & 0xFF) / 4)][warpThreadIndex][((t3 & 0xFF) % 4)]) ^ rk0;
956 | if (s0 == ctS[0]) {
957 | rk1 = rk1 ^ rk0;
958 | s1 = arithmeticRightShiftBytePerm((u64)Sbox[((t1 >> 24)) / 4][warpThreadIndex][((t1 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t2 >> 16) & 0xff) / 4][warpThreadIndex][((t2 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t3 >> 8) & 0xFF) / 4][warpThreadIndex][((t3 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u64)Sbox[((t0 & 0xFF) / 4)][warpThreadIndex][((t0 & 0xFF) % 4)]) ^ rk1;
959 | if (s1 == ctS[1]) {
960 | rk2 = rk2 ^ rk1;
961 | s2 = arithmeticRightShiftBytePerm((u64)Sbox[((t2 >> 24)) / 4][warpThreadIndex][((t2 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t3 >> 16) & 0xff) / 4][warpThreadIndex][((t3 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t0 >> 8) & 0xFF) / 4][warpThreadIndex][((t0 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u64)Sbox[((t1 & 0xFF) / 4)][warpThreadIndex][((t1 & 0xFF) % 4)]) ^ rk2;
962 | if (s2 == ctS[2]) {
963 | rk3 = rk2 ^ rk3;
964 | s3 = arithmeticRightShiftBytePerm((u64)Sbox[((t3 >> 24)) / 4][warpThreadIndex][((t3 >> 24)) % 4], SHIFT_1_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t0 >> 16) & 0xff) / 4][warpThreadIndex][((t0 >> 16)) % 4], SHIFT_2_RIGHT) ^ arithmeticRightShiftBytePerm((u64)Sbox[((t1 >> 8) & 0xFF) / 4][warpThreadIndex][((t1 >> 8)) % 4], SHIFT_3_RIGHT) ^ ((u64)Sbox[((t2 & 0xFF) / 4)][warpThreadIndex][((t2 & 0xFF) % 4)]) ^ rk3;
965 | if (s3 == ctS[3]) {
966 | printf("! Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init);
967 | printf("-------------------------------\n");
968 | }
969 | }
970 | }
971 | }
972 | // Overflow
973 | if (rk3Init == MAX_U32) { rk2Init++; }
974 | rk3Init++; // Create key as 32 bit unsigned integers
975 | }
976 | }*/
977 |
978 |
// Exhaustive AES-128 key search with one table extended to 32 columns.
//   1 table [TABLE_SIZE][SHARED_MEM_BANK_SIZE]; the other round-table values
//     are derived with arithmeticRightShiftBytePerm (per the original note,
//     built on the __byte_perm function).
//   4 S-box tables, each shifted (t4_0 .. t4_3), stored once per block
//     without per-lane columns.
// NOTE(review): the byte lane each t4_x table occupies is fixed by the host
// tables, which are not visible here; the lookups below assume t4_3 supplies
// the most significant byte and t4_0 the least significant - confirm.
//
// Parameters:
//   pt, ct        - plaintext and expected ciphertext, 4 words each.
//   rk            - first candidate key, 4 words. Words 2 and 3 are advanced
//                   as one 64-bit counter; words 0 and 1 stay fixed.
//   t0G           - TABLE_SIZE table words.
//   t4_0G..t4_3G  - TABLE_SIZE words each.
//   rconG         - RCON_SIZE round constants.
//   range         - *range is the number of consecutive keys per thread.
//
// Launch layout: 1-D grid of 1-D blocks (only the .x components are used).
// A match is reported through device-side printf.
__global__ void exhaustiveSearchWithOneTableExtendedSharedMemoryBytePerm4ShiftedSbox(u32* pt, u32* ct, u32* rk, u32* t0G, u32* t4_0G, u32* t4_1G, u32* t4_2G, u32* t4_3G, u32* rconG, u64* range) {

    // 64-bit flat index: a signed 32-bit index would turn negative on very
    // large grids and corrupt the per-thread key offset.
    u64 threadIndex = (u64)blockIdx.x * blockDim.x + threadIdx.x;
    int warpThreadIndex = threadIdx.x & 31;

    __shared__ u32 t0S[TABLE_SIZE][SHARED_MEM_BANK_SIZE];
    __shared__ u32 t4_0S[TABLE_SIZE];
    __shared__ u32 t4_1S[TABLE_SIZE];
    __shared__ u32 t4_2S[TABLE_SIZE];
    __shared__ u32 t4_3S[TABLE_SIZE];
    __shared__ u32 rconS[RCON_SIZE];
    __shared__ u32 ctS[U32_SIZE];

    // Block-stride staging: every table entry is written even when the block
    // has fewer than TABLE_SIZE threads.
    for (unsigned int entry = threadIdx.x; entry < TABLE_SIZE; entry += blockDim.x) {
        t4_0S[entry] = t4_0G[entry];
        t4_1S[entry] = t4_1G[entry];
        t4_2S[entry] = t4_2G[entry];
        t4_3S[entry] = t4_3G[entry];
        for (u8 bankIndex = 0; bankIndex < SHARED_MEM_BANK_SIZE; bankIndex++) {
            t0S[entry][bankIndex] = t0G[entry];
        }
    }
    for (unsigned int word = threadIdx.x; word < RCON_SIZE; word += blockDim.x) {
        rconS[word] = rconG[word];
    }
    for (unsigned int word = threadIdx.x; word < U32_SIZE; word += blockDim.x) {
        ctS[word] = ct[word];
    }

    // Wait until every table is fully staged.
    __syncthreads();

    u32 rk0Init, rk1Init, rk2Init, rk3Init;
    rk0Init = rk[0];
    rk1Init = rk[1];
    rk2Init = rk[2];
    rk3Init = rk[3];

    u32 pt0Init, pt1Init, pt2Init, pt3Init;
    pt0Init = pt[0];
    pt1Init = pt[1];
    pt2Init = pt[2];
    pt3Init = pt[3];

    u64 threadRange = *range;
    u64 threadRangeStart = threadIndex * threadRange;
    // Add the thread offset to key words 2..3 as a single 64-bit value so a
    // wrap of word 3 carries into word 2 instead of being dropped.
    u64 keyCounter = (((u64)rk2Init) << 32) | (u64)rk3Init;
    keyCounter += threadRangeStart;
    rk2Init = (u32)(keyCounter >> 32);
    rk3Init = (u32)keyCounter;

    for (u64 rangeCount = 0; rangeCount < threadRange; rangeCount++) {

        u32 rk0, rk1, rk2, rk3;
        rk0 = rk0Init;
        rk1 = rk1Init;
        rk2 = rk2Init;
        rk3 = rk3Init;

        // Load the plaintext and apply the initial key addition.
        u32 s0, s1, s2, s3;
        s0 = pt0Init ^ rk0;
        s1 = pt1Init ^ rk1;
        s2 = pt2Init ^ rk2;
        s3 = pt3Init ^ rk3;

        u32 t0, t1, t2, t3;
        for (u8 roundCount = 0; roundCount < ROUND_COUNT_MIN_1; roundCount++) {

            // Derive the next round key from the previous one.
            u32 temp = rk3;
            rk0 = rk0 ^ t4_3S[(temp >> 16) & 0xff] ^ t4_2S[(temp >> 8) & 0xff] ^ t4_1S[(temp) & 0xff] ^ t4_0S[(temp >> 24)] ^ rconS[roundCount];
            rk1 = rk1 ^ rk0;
            rk2 = rk2 ^ rk1;
            rk3 = rk2 ^ rk3;

            // Table-based round function.
            t0 = t0S[s0 >> 24][warpThreadIndex] ^
                arithmeticRightShiftBytePerm(t0S[(s1 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^
                arithmeticRightShiftBytePerm(t0S[(s2 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^
                arithmeticRightShiftBytePerm(t0S[s3 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk0;
            t1 = t0S[s1 >> 24][warpThreadIndex] ^
                arithmeticRightShiftBytePerm(t0S[(s2 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^
                arithmeticRightShiftBytePerm(t0S[(s3 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^
                arithmeticRightShiftBytePerm(t0S[s0 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk1;
            t2 = t0S[s2 >> 24][warpThreadIndex] ^
                arithmeticRightShiftBytePerm(t0S[(s3 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^
                arithmeticRightShiftBytePerm(t0S[(s0 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^
                arithmeticRightShiftBytePerm(t0S[s1 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk2;
            t3 = t0S[s3 >> 24][warpThreadIndex] ^
                arithmeticRightShiftBytePerm(t0S[(s0 >> 16) & 0xFF][warpThreadIndex], SHIFT_1_RIGHT) ^
                arithmeticRightShiftBytePerm(t0S[(s1 >> 8) & 0xFF][warpThreadIndex], SHIFT_2_RIGHT) ^
                arithmeticRightShiftBytePerm(t0S[s2 & 0xFF][warpThreadIndex], SHIFT_3_RIGHT) ^ rk3;

            s0 = t0;
            s1 = t1;
            s2 = t2;
            s3 = t3;
        }

        // Last round key: only word 0 is computed up front; words 1..3 are
        // derived lazily below, each only if the previous output word matched.
        u32 temp = rk3;
        rk0 = rk0 ^ t4_3S[(temp >> 16) & 0xff] ^ t4_2S[(temp >> 8) & 0xff] ^ t4_1S[(temp) & 0xff] ^ t4_0S[(temp >> 24)] ^ rconS[ROUND_COUNT_MIN_1];

        // Last round uses the shifted S-box tables directly; stop at the
        // first mismatching output word.
        s0 = t4_3S[t0 >> 24] ^ t4_2S[(t1 >> 16) & 0xff] ^ t4_1S[(t2 >> 8) & 0xff] ^ t4_0S[(t3) & 0xFF] ^ rk0;
        if (s0 == ctS[0]) {
            rk1 = rk1 ^ rk0;
            s1 = t4_3S[t1 >> 24] ^ t4_2S[(t2 >> 16) & 0xff] ^ t4_1S[(t3 >> 8) & 0xff] ^ t4_0S[(t0) & 0xFF] ^ rk1;
            if (s1 == ctS[1]) {
                rk2 = rk2 ^ rk1;
                s2 = t4_3S[t2 >> 24] ^ t4_2S[(t3 >> 16) & 0xff] ^ t4_1S[(t0 >> 8) & 0xff] ^ t4_0S[(t1) & 0xFF] ^ rk2;
                if (s2 == ctS[2]) {
                    rk3 = rk2 ^ rk3;
                    s3 = t4_3S[t3 >> 24] ^ t4_2S[(t0 >> 16) & 0xff] ^ t4_1S[(t1 >> 8) & 0xff] ^ t4_0S[(t2) & 0xFF] ^ rk3;
                    if (s3 == ctS[3]) {
                        printf("! Found key : %08x %08x %08x %08x\n", rk0Init, rk1Init, rk2Init, rk3Init);
                        printf("-------------------------------\n");
                    }
                }
            }
        }

        // Next candidate key: bump word 3 and carry into word 2 when it wraps.
        rk3Init++;
        if (rk3Init == 0) {
            rk2Init++;
        }
    }
}
1110 |
// Host driver for the AES-128 exhaustive-search kernels.
//
// Fills managed buffers with a fixed plaintext/ciphertext pair, the first key
// to try, the round constants and the lookup tables, launches the kernel
// selected by `choice`, waits for it to finish, prints the elapsed CPU clock
// time and releases every buffer.
//
// choice:
//    1 - exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir2
//   11 - exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir
//    2 - exhaustiveSearchWithOneTableExtendedSharedMemory
//   22 - exhaustiveSearchWithOneTableExtendedSharedMemoryBytePerm
//   any other value launches nothing; a notice is printed instead.
//
// Returns 0.
//
// NOTE(review): the launch configuration of every kernel call was missing from
// this copy of the source. <<<BLOCKS, THREADS>>> is assumed to be the project's
// launch macros and to match the thread count calculateRange() divides the key
// space by - confirm against the project header before merging.
__host__ int main128ExhaustiveSearch(int choice) {
    printf("\n"); printf("########## AES-128 Exhaustive Search Implementation ##########\n"); printf("\n");

    // Plaintext, ciphertext and the first candidate key.
    u32 *pt, *ct, *rk;
    gpuErrorCheck(cudaMallocManaged(&pt, 4 * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&ct, 4 * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&rk, 4 * sizeof(u32)));
    pt[0] = 0x3243F6A8U; pt[1] = 0x885A308DU; pt[2] = 0x313198A2U; pt[3] = 0xE0370734U;
    // pt[0] = 0; pt[1] = 0; pt[2] = 0; pt[3] = 0;
    ct[0] = 0x3925841DU; ct[1] = 0x02DC09FBU; ct[2] = 0xDC118597U; ct[3] = 0x196A0B32U;
    // Original author's note: aes-cipher-internals.xlsx
    rk[0] = 0x2B7E1516U; rk[1] = 0x28AED2A6U; rk[2] = 0xABF71588U; rk[3] = 0x09CF0000U;

    // Round constants.
    u32* rcon;
    gpuErrorCheck(cudaMallocManaged(&rcon, RCON_SIZE * sizeof(u32)));
    for (int i = 0; i < RCON_SIZE; i++) {
        rcon[i] = RCON32[i];
    }

    // Lookup tables. Only t0, t4 and SAES_d are passed to the kernels launched
    // below; the others are kept for the alternative kernels listed further down.
    u32 *t0, *t1, *t2, *t3, *t4, *t4_0, *t4_1, *t4_2, *t4_3;
    u8* SAES_d; // Cihangir
    gpuErrorCheck(cudaMallocManaged(&t0, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t1, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t2, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t3, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t4, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t4_0, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t4_1, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t4_2, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&t4_3, TABLE_SIZE * sizeof(u32)));
    gpuErrorCheck(cudaMallocManaged(&SAES_d, 256 * sizeof(u8))); // Cihangir
    for (int i = 0; i < TABLE_SIZE; i++) {
        t0[i] = T0[i]; t1[i] = T1[i]; t2[i] = T2[i]; t3[i] = T3[i]; t4[i] = T4[i];
        t4_0[i] = T4_0[i]; t4_1[i] = T4_1[i]; t4_2[i] = T4_2[i]; t4_3[i] = T4_3[i];
    }
    for (int i = 0; i < 256; i++) {
        SAES_d[i] = SAES[i]; // Cihangir
    }

    printf("-------------------------------\n");
    u64* range = calculateRange();
    // Optional debug output:
    // printf("Plaintext : %08x %08x %08x %08x\n", pt[0], pt[1], pt[2], pt[3]);
    // printf("Ciphertext : %08x %08x %08x %08x\n", ct[0], ct[1], ct[2], ct[3]);
    // printf("Initial Key : %08x %08x %08x %08x\n", rk[0], rk[1], rk[2], rk[3]);
    // printf("-------------------------------\n");

    clock_t beginTime = clock();
    // One mutually exclusive chain: exactly one kernel (or none) is launched.
    if (choice == 1) {
        exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir2<<<BLOCKS, THREADS>>>(pt, ct, rk, t0, t4, rcon, range, SAES_d);
    } else if (choice == 11) {
        exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBoxCihangir<<<BLOCKS, THREADS>>>(pt, ct, rk, t0, t4, rcon, range, SAES_d);
    } else if (choice == 2) {
        exhaustiveSearchWithOneTableExtendedSharedMemory<<<BLOCKS, THREADS>>>(pt, ct, rk, t0, t4, rcon, range);
    } else if (choice == 22) {
        exhaustiveSearchWithOneTableExtendedSharedMemoryBytePerm<<<BLOCKS, THREADS>>>(pt, ct, rk, t0, t4, rcon, range);
    } else {
        printf("No exhaustive search kernel is mapped to choice %d; nothing was launched\n", choice);
    }
    // Alternative kernels that were commented out at this point in the
    // original code:
    //   exhaustiveSearch(pt, ct, rk, t0, t1, t2, t3, t4, rcon, range)
    //   exhaustiveSearchWithOneTable(pt, ct, rk, t0, t4, rcon, range)
    //   exhaustiveSearchWithOneTableExtendedSharedMemoryBytePermPartlyExtendedSBox(pt, ct, rk, t0, t4, rcon, range)   (marked "Fastest")
    //   exhaustiveSearchWithOneTableExtendedSharedMemoryBytePerm4ShiftedSbox(pt, ct, rk, t0, t4_0, t4_1, t4_2, t4_3, rcon, range)
    //   exhaustiveSearchCem(pt, ct, rk, t0, t1, t4, rcon, range, SAES_d)

    // The launch is asynchronous: block until the kernel has finished, so the
    // timing below covers it and execution errors are reported here.
    gpuErrorCheck(cudaDeviceSynchronize());
    printf("Time elapsed: %f sec\n", float(clock() - beginTime) / CLOCKS_PER_SEC);

    printf("-------------------------------\n");
    printLastCUDAError();

    // Free allocated arrays.
    cudaFree(range); cudaFree(pt); cudaFree(ct); cudaFree(rk);
    cudaFree(t0); cudaFree(t1); cudaFree(t2); cudaFree(t3); cudaFree(t4);
    cudaFree(t4_0); cudaFree(t4_1); cudaFree(t4_2); cudaFree(t4_3);
    cudaFree(rcon); cudaFree(SAES_d);
    return 0;
}
1188 |
1189 |
--------------------------------------------------------------------------------