├── README.md
├── blake2b.cu
├── blake2b.cuh
├── config.h
├── keccak.cu
├── keccak.cuh
├── md2.cu
├── md2.cuh
├── md5.cu
├── md5.cuh
├── sha1.cu
├── sha1.cuh
├── sha256.cu
└── sha256.cuh


/README.md:
--------------------------------------------------------------------------------
 1 | # CUDA Hashing Algorithms Collection
 2 | #### Author: Matt Zweil & The Mochimo Core Contributor Team
 3 | 
 4 | This repository of CUDA Hash functions is released into the Public Domain.
 5 | 
 6 | To use any of the associated hashing functions, please include the config.h header file.
 7 | 
 8 | Alternatively, you can declare these three typedefs yourself and omit the config.h file:
 9 | 
10 | ```
11 | typedef unsigned char BYTE;
12 | typedef unsigned int  WORD;
13 | typedef unsigned long long LONG;
14 | ```
15 | 
16 | Special thanks to Brad Conte for his original public domain C implementations of some of these algos.
17 | 


--------------------------------------------------------------------------------
/blake2b.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * blake2b.cu CUDA Implementation of BLAKE2B Hashing
  3 |  *
  4 |  * Date: 12 June 2019
  5 |  * Revision: 1
  6 |  *
  7 |  * This file is released into the Public Domain.
  8 |  */
  9 | 
 10 | 
 11 | #include <assert.h>
 12 | extern "C"
 13 | {
 14 | #include "blake2b.cuh"
 15 | }
 16 | #define BLAKE2B_ROUNDS 12
 17 | #define BLAKE2B_BLOCK_LENGTH 128
 18 | #define BLAKE2B_CHAIN_SIZE 8
 19 | #define BLAKE2B_CHAIN_LENGTH (BLAKE2B_CHAIN_SIZE * sizeof(int64_t))
 20 | #define BLAKE2B_STATE_SIZE 16
 21 | #define BLAKE2B_STATE_LENGTH (BLAKE2B_STATE_SIZE * sizeof(int64_t))
 22 | extern "C"
 23 | {
 24 | typedef struct {
 25 | 
 26 |     WORD digestlen;
 27 |     BYTE key[64];
 28 |     WORD keylen;
 29 | 
 30 |     BYTE buff[BLAKE2B_BLOCK_LENGTH];
 31 |     int64_t chain[BLAKE2B_CHAIN_SIZE];
 32 |     int64_t state[BLAKE2B_STATE_SIZE];
 33 | 
 34 |     WORD pos;
 35 |     LONG t0;
 36 |     LONG t1;
 37 |     LONG f0;
 38 | 
 39 | } cuda_blake2b_ctx_t;
 40 | }
 41 | typedef cuda_blake2b_ctx_t CUDA_BLAKE2B_CTX;
 42 | 
 43 | __constant__ CUDA_BLAKE2B_CTX c_CTX;
 44 | 
 45 | __constant__ LONG BLAKE2B_IVS[8] =
 46 | {
 47 |         0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b,
 48 |         0xa54ff53a5f1d36f1, 0x510e527fade682d1, 0x9b05688c2b3e6c1f,
 49 |         0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
 50 | };
 51 | 
 52 | const LONG CPU_BLAKE2B_IVS[8] =
 53 | {
 54 |         0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b,
 55 |         0xa54ff53a5f1d36f1, 0x510e527fade682d1, 0x9b05688c2b3e6c1f,
 56 |         0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
 57 | };
 58 | 
 59 | void cpu_blake2b_init(cuda_blake2b_ctx_t *ctx, BYTE* key, WORD keylen, WORD digestbitlen)
 60 | {
 61 |     memset(ctx, 0, sizeof(cuda_blake2b_ctx_t));
 62 |     memcpy(ctx->buff, key, keylen);
 63 |     memcpy(ctx->key, key, keylen);
 64 |     ctx->keylen = keylen;
 65 | 
 66 |     ctx->digestlen = digestbitlen >> 3;
 67 |     ctx->pos = 0;
 68 |     ctx->t0 = 0;
 69 |     ctx->t1 = 0;
 70 |     ctx->f0 = 0;
 71 |     ctx->chain[0] = CPU_BLAKE2B_IVS[0] ^ (ctx->digestlen | (ctx->keylen << 8) | 0x1010000);
 72 |     ctx->chain[1] = CPU_BLAKE2B_IVS[1];
 73 |     ctx->chain[2] = CPU_BLAKE2B_IVS[2];
 74 |     ctx->chain[3] = CPU_BLAKE2B_IVS[3];
 75 |     ctx->chain[4] = CPU_BLAKE2B_IVS[4];
 76 |     ctx->chain[5] = CPU_BLAKE2B_IVS[5];
 77 |     ctx->chain[6] = CPU_BLAKE2B_IVS[6];
 78 |     ctx->chain[7] = CPU_BLAKE2B_IVS[7];
 79 | 
 80 | 
 81 |     ctx->pos = BLAKE2B_BLOCK_LENGTH;
 82 | }
 83 | 
 84 | 
 85 | 
 86 | __constant__ unsigned char BLAKE2B_SIGMAS[12][16] =
 87 | {
 88 |         { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
 89 |         { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
 90 |         { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
 91 |         { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
 92 |         { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
 93 |         { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
 94 |         { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
 95 |         { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
 96 |         { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
 97 |         { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
 98 |         { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
 99 |         { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
100 | };
101 | 
/*
 * Reads 8 bytes starting at `in` and returns them as one 64-bit word.
 * The bytes are copied verbatim, so the result is the little-endian
 * interpretation only when memory itself is little-endian. On any other
 * byte order the word would have to be assembled explicitly as
 * in[0] | in[1] << 8 | ... | in[7] << 56.
 */
__device__ LONG cuda_blake2b_leuint64(BYTE *in)
{
    LONG word;
    /* memcpy keeps this valid for input pointers that are not 8-byte aligned */
    memcpy(&word, in, sizeof(word));
    return word;
}
114 | 
/*
 * Rotates the 64-bit value `a` right by `b` bits.
 * The count is reduced modulo 64 and the complementary shift is masked,
 * so b == 0 (or a multiple of 64) returns `a` unchanged instead of
 * evaluating an undefined 64-bit shift by 64.
 */
__device__ LONG cuda_blake2b_ROTR64(LONG a, BYTE b)
{
    const unsigned int r = b & 63u;
    return (a >> r) | (a << ((64u - r) & 63u));
}
119 | 
/*
 * BLAKE2b mixing function: mixes message words m1 and m2 into the four
 * state words selected by indices a, b, c, d (expected to be distinct,
 * as they are in every call made by cuda_blake2b_compress).
 *
 * The state is stored as int64_t, but all additions are carried out on
 * unsigned 64-bit values: the algorithm relies on wrap-around modulo 2^64,
 * which is only well defined for unsigned arithmetic.
 */
__device__ void cuda_blake2b_G(cuda_blake2b_ctx_t *ctx, int64_t m1, int64_t m2, int32_t a, int32_t b, int32_t c, int32_t d)
{
    LONG va = (LONG)ctx->state[a];
    LONG vb = (LONG)ctx->state[b];
    LONG vc = (LONG)ctx->state[c];
    LONG vd = (LONG)ctx->state[d];

    /* first half: mixes in m1, rotations 32 and 24 */
    va = va + vb + (LONG)m1;
    vd = cuda_blake2b_ROTR64(vd ^ va, 32);
    vc = vc + vd;
    vb = cuda_blake2b_ROTR64(vb ^ vc, 24);

    /* second half: mixes in m2, rotations 16 and 63 */
    va = va + vb + (LONG)m2;
    vd = cuda_blake2b_ROTR64(vd ^ va, 16);
    vc = vc + vd;
    vb = cuda_blake2b_ROTR64(vb ^ vc, 63);

    ctx->state[a] = (int64_t)va;
    ctx->state[b] = (int64_t)vb;
    ctx->state[c] = (int64_t)vc;
    ctx->state[d] = (int64_t)vd;
}
131 | 
/*
 * Prepares the 16-word working state for one compression:
 *   state[0..7]   = current chaining value
 *   state[8..11]  = IV[0..3]
 *   state[12..14] = IV[4..6] xor t0, t1, f0 respectively
 *   state[15]     = IV[7]
 */
__device__ __forceinline__ void cuda_blake2b_init_state(cuda_blake2b_ctx_t *ctx)
{
    for (int w = 0; w < BLAKE2B_CHAIN_SIZE; w++)
        ctx->state[w] = ctx->chain[w];

    ctx->state[8]  = BLAKE2B_IVS[0];
    ctx->state[9]  = BLAKE2B_IVS[1];
    ctx->state[10] = BLAKE2B_IVS[2];
    ctx->state[11] = BLAKE2B_IVS[3];

    ctx->state[12] = BLAKE2B_IVS[4] ^ ctx->t0;
    ctx->state[13] = BLAKE2B_IVS[5] ^ ctx->t1;
    ctx->state[14] = BLAKE2B_IVS[6] ^ ctx->f0;
    ctx->state[15] = BLAKE2B_IVS[7];
}
143 | 
/*
 * Compresses one 128-byte block into ctx->chain.
 *
 *   in       - base pointer of the input
 *   inoffset - byte offset of the block within `in`
 *
 * The counters t0/t1 and the flag f0 must already hold the values for
 * this block; they are folded into the state by cuda_blake2b_init_state.
 */
__device__ __forceinline__ void cuda_blake2b_compress(cuda_blake2b_ctx_t *ctx, BYTE* in, WORD inoffset)
{
    cuda_blake2b_init_state(ctx);

    /* split the block into sixteen 64-bit message words */
    BYTE *block = in + inoffset;
    LONG m[16];
    for (int w = 0; w < 16; w++)
        m[w] = cuda_blake2b_leuint64(block + 8 * w);

    for (int round = 0; round < BLAKE2B_ROUNDS; round++)
    {
        const unsigned char *sigma = BLAKE2B_SIGMAS[round];

        /* column step */
        cuda_blake2b_G(ctx, m[sigma[0]],  m[sigma[1]],  0, 4,  8, 12);
        cuda_blake2b_G(ctx, m[sigma[2]],  m[sigma[3]],  1, 5,  9, 13);
        cuda_blake2b_G(ctx, m[sigma[4]],  m[sigma[5]],  2, 6, 10, 14);
        cuda_blake2b_G(ctx, m[sigma[6]],  m[sigma[7]],  3, 7, 11, 15);

        /* diagonal step */
        cuda_blake2b_G(ctx, m[sigma[8]],  m[sigma[9]],  0, 5, 10, 15);
        cuda_blake2b_G(ctx, m[sigma[10]], m[sigma[11]], 1, 6, 11, 12);
        cuda_blake2b_G(ctx, m[sigma[12]], m[sigma[13]], 2, 7,  8, 13);
        cuda_blake2b_G(ctx, m[sigma[14]], m[sigma[15]], 3, 4,  9, 14);
    }

    /* fold both halves of the working state back into the chaining value */
    for (int w = 0; w < BLAKE2B_CHAIN_SIZE; w++)
        ctx->chain[w] ^= ctx->state[w] ^ ctx->state[w + 8];
}
167 | 
/*
 * Device-side counterpart of cpu_blake2b_init, for kernels that do not
 * start from the precomputed c_CTX.
 *
 *   ctx          - context to initialise (fully overwritten)
 *   key          - key bytes; may be NULL when keylen is 0
 *   keylen       - key length in bytes, at most 64 (the size of ctx->key)
 *   digestbitlen - requested digest length in bits
 *
 * A non-empty key is staged in ctx->buff as a full, zero-padded first
 * block. Without a key nothing is staged (pos = 0), so no extra all-zero
 * block is hashed ahead of the message.
 */
__device__ void cuda_blake2b_init(cuda_blake2b_ctx_t *ctx, BYTE* key, WORD keylen, WORD digestbitlen)
{
    /* ctx->key is 64 bytes; a longer key would overrun it */
    assert(keylen <= sizeof(ctx->key));

    /* zeroes pos, t0, t1, f0, buff and state in one go */
    memset(ctx, 0, sizeof(cuda_blake2b_ctx_t));

    ctx->keylen = keylen;
    ctx->digestlen = digestbitlen >> 3;

    /* parameter word: digest length | key length << 8 | 0x1010000 */
    ctx->chain[0] = BLAKE2B_IVS[0] ^ (ctx->digestlen | (ctx->keylen << 8) | 0x1010000);
    for (int i = 1; i < BLAKE2B_CHAIN_SIZE; i++)
        ctx->chain[i] = BLAKE2B_IVS[i];

    if (keylen > 0)
    {
        memcpy(ctx->buff, key, keylen);
        memcpy(ctx->key, key, keylen);
        /* mark the padded key block as a full pending block */
        ctx->pos = BLAKE2B_BLOCK_LENGTH;
    }
}
191 | 
/*
 * Feeds `inlen` bytes from `in` into the hash.
 *
 * The most recent block (up to 128 bytes) is always kept in ctx->buff
 * rather than compressed, because the final block has to be compressed by
 * cuda_blake2b_final with the finalisation flag set.
 *
 * Input positions are tracked as 64-bit values and blocks are handed to
 * the compression function by pointer, so inputs of 4 GiB or more are not
 * truncated by the 32-bit `inoffset` parameter of cuda_blake2b_compress.
 */
__device__ void cuda_blake2b_update(cuda_blake2b_ctx_t *ctx, BYTE* in, LONG inlen)
{
    if (inlen == 0)
        return;

    LONG consumed = 0;

    if (ctx->pos)
    {
        const LONG room = BLAKE2B_BLOCK_LENGTH - ctx->pos;

        if (inlen <= room)
        {
            /* everything fits into the pending block; keep it buffered */
            memcpy(ctx->buff + ctx->pos, in, inlen);
            ctx->pos += (WORD)inlen;
            return;
        }

        /* top up the pending block; more input follows, so it is not the last */
        memcpy(ctx->buff + ctx->pos, in, room);
        consumed = room;

        ctx->t0 += BLAKE2B_BLOCK_LENGTH;
        if (ctx->t0 == 0)
            ctx->t1++; /* carry into the high counter word */

        cuda_blake2b_compress(ctx, ctx->buff, 0);
        ctx->pos = 0;
        memset(ctx->buff, 0, BLAKE2B_BLOCK_LENGTH);
    }

    /* compress whole blocks straight from the input while more than one block remains */
    while (inlen - consumed > BLAKE2B_BLOCK_LENGTH)
    {
        ctx->t0 += BLAKE2B_BLOCK_LENGTH;
        if (ctx->t0 == 0)
            ctx->t1++;

        cuda_blake2b_compress(ctx, in + consumed, 0);
        consumed += BLAKE2B_BLOCK_LENGTH;
    }

    /* 1..128 bytes are left: stash them as the new pending block */
    memcpy(ctx->buff, in + consumed, inlen - consumed);
    ctx->pos += (WORD)(inlen - consumed);
}
232 | 
/*
 * Compresses the pending block as the final block and writes
 * ctx->digestlen bytes of digest (at most 64) to `out`.
 *
 * The number of bytes copied per chaining word is computed without
 * unsigned underflow, so digests shorter than 8 bytes no longer write
 * 8 bytes past the caller's output slot.
 */
__device__ void cuda_blake2b_final(cuda_blake2b_ctx_t *ctx, BYTE* out)
{
    ctx->f0 = 0xFFFFFFFFFFFFFFFFULL; /* finalisation flag: all ones */
    ctx->t0 += ctx->pos;             /* count the bytes of the last block */
    if (ctx->pos > 0 && ctx->t0 == 0)
        ctx->t1++;

    cuda_blake2b_compress(ctx, ctx->buff, 0);

    /* wipe the buffered input and the scratch state */
    memset(ctx->buff, 0, BLAKE2B_BLOCK_LENGTH);
    memset(ctx->state, 0, BLAKE2B_STATE_LENGTH);

    for (WORD i = 0; i < BLAKE2B_CHAIN_SIZE; i++)
    {
        const WORD offset = i * 8;
        if (offset >= ctx->digestlen)
            break;

        /* the last word may contribute fewer than 8 bytes */
        const WORD remaining = ctx->digestlen - offset;
        memcpy(out + offset, &ctx->chain[i], remaining < 8 ? remaining : 8);
    }
}
254 | 
/*
 * One thread hashes one message.
 *
 *   indata  - n_batch messages stored back to back, inlen bytes each
 *   outdata - n_batch digests stored back to back, BLAKE2B_BLOCK_SIZE bytes each
 *
 * Launch layout: 1-D grid of 1-D blocks with at least n_batch threads in
 * total; surplus threads exit immediately. Each thread starts from a
 * private copy of c_CTX, which the host must have uploaded beforehand.
 */
__global__ void kernel_blake2b_hash(BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch, WORD BLAKE2B_BLOCK_SIZE)
{
    /* 64-bit index: thread * inlen can exceed 32 bits for large batches */
    const size_t thread = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (thread >= n_batch)
    {
        return;
    }
    BYTE* in = indata + thread * inlen;
    BYTE* out = outdata + thread * BLAKE2B_BLOCK_SIZE;
    CUDA_BLAKE2B_CTX ctx = c_CTX;
    //if not precomputed CTX, call cuda_blake2b_init() with key
    cuda_blake2b_update(&ctx, in, inlen);
    cuda_blake2b_final(&ctx, out);
}
extern "C"
{
/*
 * Host entry point: hashes a batch of equally sized messages with BLAKE2b.
 *
 *   key, keylen - key bytes and their count (at most 64)
 *   in          - n_batch messages stored back to back, inlen bytes each
 *   out         - receives n_batch digests back to back, n_outbit / 8 bytes each
 *   n_outbit    - digest length in bits
 *   n_batch     - number of messages
 *
 * Every CUDA call is checked. The first failure is reported on stdout and
 * the remaining steps are skipped; device buffers are released in all cases.
 */
void mcm_cuda_blake2b_hash_batch(BYTE *key, WORD keylen, BYTE *in, WORD inlen, BYTE *out, WORD n_outbit, WORD n_batch) {
    const WORD BLAKE2B_BLOCK_SIZE = (n_outbit >> 3);
    /* sizes are computed in size_t: inlen * n_batch may not fit in 32 bits */
    const size_t in_bytes = (size_t)inlen * n_batch;
    const size_t out_bytes = (size_t)BLAKE2B_BLOCK_SIZE * n_batch;
    BYTE *cuda_indata = NULL;
    BYTE *cuda_outdata = NULL;
    cudaError_t error;

    assert(keylen <= 64); /* the context's key buffer holds 64 bytes */
    if (n_batch == 0)
        return; /* nothing to hash; a grid of zero blocks is an invalid launch */

    CUDA_BLAKE2B_CTX ctx;
    cpu_blake2b_init(&ctx, key, keylen, n_outbit);

    error = cudaMalloc(&cuda_indata, in_bytes);
    if (error == cudaSuccess)
        error = cudaMalloc(&cuda_outdata, out_bytes);
    if (error == cudaSuccess)
        error = cudaMemcpy(cuda_indata, in, in_bytes, cudaMemcpyHostToDevice);
    if (error == cudaSuccess)
        error = cudaMemcpyToSymbol(c_CTX, &ctx, sizeof(CUDA_BLAKE2B_CTX), 0, cudaMemcpyHostToDevice);

    if (error == cudaSuccess) {
        const WORD thread = 256;
        /* ceiling division written so that n_batch + thread - 1 cannot wrap */
        const WORD block = n_batch / thread + (n_batch % thread != 0);

        kernel_blake2b_hash << < block, thread >> > (cuda_indata, inlen, cuda_outdata, n_batch, BLAKE2B_BLOCK_SIZE);
        error = cudaGetLastError();          /* launch-configuration errors */
    }
    if (error == cudaSuccess)
        error = cudaDeviceSynchronize();     /* errors raised while the kernel ran */
    if (error == cudaSuccess)
        error = cudaMemcpy(out, cuda_outdata, out_bytes, cudaMemcpyDeviceToHost);

    if (error != cudaSuccess) {
        printf("Error cuda blake2b hash: %s \n", cudaGetErrorString(error));
    }
    /* cudaFree accepts NULL, so this is safe after a failed allocation */
    cudaFree(cuda_indata);
    cudaFree(cuda_outdata);
}
}


--------------------------------------------------------------------------------
/blake2b.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * blake2b.cuh CUDA Implementation of BLAKE2B Hashing
 3 |  *
 4 |  * Date: 12 June 2019
 5 |  * Revision: 1
 6 |  *
 7 |  * This file is released into the Public Domain.
 8 |  */
 9 | 
10 | 
11 | #pragma once
12 | #include "config.h"
/*
 * Hashes n_batch messages of inlen bytes each (stored back to back in `in`)
 * with BLAKE2b keyed by key/keylen, and writes n_batch digests of
 * n_outbit / 8 bytes each, back to back, to `out`.
 * All pointers are host pointers; device memory is managed internally.
 */
void mcm_cuda_blake2b_hash_batch(BYTE* key,
                                 WORD keylen,
                                 BYTE * in,
                                 WORD inlen,
                                 BYTE * out,
                                 WORD n_outbit,
                                 WORD n_batch);
14 | 


--------------------------------------------------------------------------------
/config.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Type Definitions for CUDA Hashing Algos
 3 |  *
 4 |  * Date: 12 June 2019
 5 |  * Revision: 1
 6 |  *
 7 |  * This file is released into the Public Domain.
 8 |  */
 9 | 
10 | #pragma once
11 | #define USE_MD2 1
12 | #define USE_MD5 1
13 | #define USE_SHA1 1
14 | #define USE_SHA256 1
15 | 
16 | #define CUDA_HASH 1
17 | #define OCL_HASH 0
18 | 
19 | typedef unsigned char BYTE;         /* element type of all message, key and digest buffers */
20 | typedef unsigned int  WORD;         /* lengths, counts and batch sizes in the host/kernel interfaces */
21 | typedef unsigned long long LONG;    /* 64-bit hash words and counters (filled via 8-byte memcpy in the .cu files) */
22 | 
23 | #include <stdlib.h>
24 | #include <string.h>
25 | #include <stdio.h>
26 | 


--------------------------------------------------------------------------------
/keccak.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * keccak.cu  Implementation of Keccak/SHA3 digest
  3 |  *
  4 |  * Date: 12 June 2019
  5 |  * Revision: 1
  6 |  *
  7 |  * This file is released into the Public Domain.
  8 |  */
  9 |  
 10 |  
 11 | extern "C"
 12 | {
 13 | #include "keccak.cuh"
 14 | }
 15 | 
 16 | #define KECCAK_ROUND 24
 17 | #define KECCAK_STATE_SIZE 25
 18 | #define KECCAK_Q_SIZE 192
 19 | 
 20 | __constant__ LONG CUDA_KECCAK_CONSTS[24] = { 0x0000000000000001, 0x0000000000008082,
 21 |                                           0x800000000000808a, 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, 0x8000000080008081,
 22 |                                           0x8000000000008009, 0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
 23 |                                           0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003, 0x8000000000008002,
 24 |                                           0x8000000000000080, 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, 0x8000000000008080,
 25 |                                           0x0000000080000001, 0x8000000080008008 };
 26 | 
 27 | typedef struct {
 28 | 
 29 |     BYTE sha3_flag;
 30 |     WORD digestbitlen;
 31 |     LONG rate_bits;
 32 |     LONG rate_BYTEs;
 33 |     LONG absorb_round;
 34 | 
 35 |     int64_t state[KECCAK_STATE_SIZE];
 36 |     BYTE q[KECCAK_Q_SIZE];
 37 | 
 38 |     LONG bits_in_queue;
 39 | 
 40 | } cuda_keccak_ctx_t;
 41 | typedef cuda_keccak_ctx_t CUDA_KECCAK_CTX;
 42 | 
 43 | __device__ LONG cuda_keccak_leuint64(void *in)
 44 | {
 45 |     LONG a;
 46 |     memcpy(&a, in, 8);
 47 |     return a;
 48 | }
 49 | 
 50 | __device__ int64_t cuda_keccak_MIN(int64_t a, int64_t b)
 51 | {
 52 |     if (a > b) return b;
 53 |     return a;
 54 | }
 55 | 
 56 | __device__ LONG cuda_keccak_UMIN(LONG a, LONG b)
 57 | {
 58 |     if (a > b) return b;
 59 |     return a;
 60 | }
 61 | 
 62 | __device__ void cuda_keccak_extract(cuda_keccak_ctx_t *ctx)
 63 | {
 64 |     LONG len = ctx->rate_bits >> 6;
 65 |     int64_t a;
 66 |     int s = sizeof(LONG);
 67 | 
 68 |     for (int i = 0;i < len;i++) {
 69 |         a = cuda_keccak_leuint64((int64_t*)&ctx->state[i]);
 70 |         memcpy(ctx->q + (i * s), &a, s);
 71 |     }
 72 | }
 73 | 
 74 | __device__ __forceinline__ LONG cuda_keccak_ROTL64(LONG a, LONG  b)
 75 | {
 76 |     return (a << b) | (a >> (64 - b));
 77 | }
 78 | 
 79 | __device__ void cuda_keccak_permutations(cuda_keccak_ctx_t * ctx)
 80 | {
 81 | 
 82 |     int64_t* A = ctx->state;;
 83 | 
 84 |     int64_t *a00 = A, *a01 = A + 1, *a02 = A + 2, *a03 = A + 3, *a04 = A + 4;
 85 |     int64_t *a05 = A + 5, *a06 = A + 6, *a07 = A + 7, *a08 = A + 8, *a09 = A + 9;
 86 |     int64_t *a10 = A + 10, *a11 = A + 11, *a12 = A + 12, *a13 = A + 13, *a14 = A + 14;
 87 |     int64_t *a15 = A + 15, *a16 = A + 16, *a17 = A + 17, *a18 = A + 18, *a19 = A + 19;
 88 |     int64_t *a20 = A + 20, *a21 = A + 21, *a22 = A + 22, *a23 = A + 23, *a24 = A + 24;
 89 | 
 90 |     for (int i = 0; i < KECCAK_ROUND; i++) {
 91 | 
 92 |         /* Theta */
 93 |         int64_t c0 = *a00 ^ *a05 ^ *a10 ^ *a15 ^ *a20;
 94 |         int64_t c1 = *a01 ^ *a06 ^ *a11 ^ *a16 ^ *a21;
 95 |         int64_t c2 = *a02 ^ *a07 ^ *a12 ^ *a17 ^ *a22;
 96 |         int64_t c3 = *a03 ^ *a08 ^ *a13 ^ *a18 ^ *a23;
 97 |         int64_t c4 = *a04 ^ *a09 ^ *a14 ^ *a19 ^ *a24;
 98 | 
 99 |         int64_t d1 = cuda_keccak_ROTL64(c1, 1) ^ c4;
100 |         int64_t d2 = cuda_keccak_ROTL64(c2, 1) ^ c0;
101 |         int64_t d3 = cuda_keccak_ROTL64(c3, 1) ^ c1;
102 |         int64_t d4 = cuda_keccak_ROTL64(c4, 1) ^ c2;
103 |         int64_t d0 = cuda_keccak_ROTL64(c0, 1) ^ c3;
104 | 
105 |         *a00 ^= d1;
106 |         *a05 ^= d1;
107 |         *a10 ^= d1;
108 |         *a15 ^= d1;
109 |         *a20 ^= d1;
110 |         *a01 ^= d2;
111 |         *a06 ^= d2;
112 |         *a11 ^= d2;
113 |         *a16 ^= d2;
114 |         *a21 ^= d2;
115 |         *a02 ^= d3;
116 |         *a07 ^= d3;
117 |         *a12 ^= d3;
118 |         *a17 ^= d3;
119 |         *a22 ^= d3;
120 |         *a03 ^= d4;
121 |         *a08 ^= d4;
122 |         *a13 ^= d4;
123 |         *a18 ^= d4;
124 |         *a23 ^= d4;
125 |         *a04 ^= d0;
126 |         *a09 ^= d0;
127 |         *a14 ^= d0;
128 |         *a19 ^= d0;
129 |         *a24 ^= d0;
130 | 
131 |         /* Rho pi */
132 |         c1 = cuda_keccak_ROTL64(*a01, 1);
133 |         *a01 = cuda_keccak_ROTL64(*a06, 44);
134 |         *a06 = cuda_keccak_ROTL64(*a09, 20);
135 |         *a09 = cuda_keccak_ROTL64(*a22, 61);
136 |         *a22 = cuda_keccak_ROTL64(*a14, 39);
137 |         *a14 = cuda_keccak_ROTL64(*a20, 18);
138 |         *a20 = cuda_keccak_ROTL64(*a02, 62);
139 |         *a02 = cuda_keccak_ROTL64(*a12, 43);
140 |         *a12 = cuda_keccak_ROTL64(*a13, 25);
141 |         *a13 = cuda_keccak_ROTL64(*a19, 8);
142 |         *a19 = cuda_keccak_ROTL64(*a23, 56);
143 |         *a23 = cuda_keccak_ROTL64(*a15, 41);
144 |         *a15 = cuda_keccak_ROTL64(*a04, 27);
145 |         *a04 = cuda_keccak_ROTL64(*a24, 14);
146 |         *a24 = cuda_keccak_ROTL64(*a21, 2);
147 |         *a21 = cuda_keccak_ROTL64(*a08, 55);
148 |         *a08 = cuda_keccak_ROTL64(*a16, 45);
149 |         *a16 = cuda_keccak_ROTL64(*a05, 36);
150 |         *a05 = cuda_keccak_ROTL64(*a03, 28);
151 |         *a03 = cuda_keccak_ROTL64(*a18, 21);
152 |         *a18 = cuda_keccak_ROTL64(*a17, 15);
153 |         *a17 = cuda_keccak_ROTL64(*a11, 10);
154 |         *a11 = cuda_keccak_ROTL64(*a07, 6);
155 |         *a07 = cuda_keccak_ROTL64(*a10, 3);
156 |         *a10 = c1;
157 | 
158 |         /* Chi */
159 |         c0 = *a00 ^ (~*a01 & *a02);
160 |         c1 = *a01 ^ (~*a02 & *a03);
161 |         *a02 ^= ~*a03 & *a04;
162 |         *a03 ^= ~*a04 & *a00;
163 |         *a04 ^= ~*a00 & *a01;
164 |         *a00 = c0;
165 |         *a01 = c1;
166 | 
167 |         c0 = *a05 ^ (~*a06 & *a07);
168 |         c1 = *a06 ^ (~*a07 & *a08);
169 |         *a07 ^= ~*a08 & *a09;
170 |         *a08 ^= ~*a09 & *a05;
171 |         *a09 ^= ~*a05 & *a06;
172 |         *a05 = c0;
173 |         *a06 = c1;
174 | 
175 |         c0 = *a10 ^ (~*a11 & *a12);
176 |         c1 = *a11 ^ (~*a12 & *a13);
177 |         *a12 ^= ~*a13 & *a14;
178 |         *a13 ^= ~*a14 & *a10;
179 |         *a14 ^= ~*a10 & *a11;
180 |         *a10 = c0;
181 |         *a11 = c1;
182 | 
183 |         c0 = *a15 ^ (~*a16 & *a17);
184 |         c1 = *a16 ^ (~*a17 & *a18);
185 |         *a17 ^= ~*a18 & *a19;
186 |         *a18 ^= ~*a19 & *a15;
187 |         *a19 ^= ~*a15 & *a16;
188 |         *a15 = c0;
189 |         *a16 = c1;
190 | 
191 |         c0 = *a20 ^ (~*a21 & *a22);
192 |         c1 = *a21 ^ (~*a22 & *a23);
193 |         *a22 ^= ~*a23 & *a24;
194 |         *a23 ^= ~*a24 & *a20;
195 |         *a24 ^= ~*a20 & *a21;
196 |         *a20 = c0;
197 |         *a21 = c1;
198 | 
199 |         /* Iota */
200 |         *a00 ^= CUDA_KECCAK_CONSTS[i];
201 |     }
202 | }
203 | 
204 | 
/*
 * Absorbs one rate-sized block: xors ctx->absorb_round 64-bit words read
 * from `in` into the first lanes of the state, then runs the permutation.
 */
__device__ void cuda_keccak_absorb(cuda_keccak_ctx_t *ctx, BYTE* in)
{
    for (LONG lane = 0; lane < ctx->absorb_round; ++lane)
        ctx->state[lane] ^= cuda_keccak_leuint64(in + lane * 8);

    cuda_keccak_permutations(ctx);
}
216 | 
/*
 * Pads the queued input, absorbs it, and switches the context to output
 * mode: on return ctx->q holds the first rate-sized chunk of output and
 * bits_in_queue equals rate_bits.
 *
 * The bit masks are built from 64-bit constants (1ULL). With `1L` the
 * shift was performed on `long`, which is only 32 bits wide on some
 * platforms and therefore wrong for shift counts of 32 or more.
 */
__device__ void cuda_keccak_pad(cuda_keccak_ctx_t *ctx)
{
    /* first padding bit, placed right after the queued message bits */
    ctx->q[ctx->bits_in_queue >> 3] |= (BYTE)(1u << (ctx->bits_in_queue & 7));

    if (++(ctx->bits_in_queue) == ctx->rate_bits) {
        /* the padding bit completed a block: absorb it and start an empty one */
        cuda_keccak_absorb(ctx, ctx->q);
        ctx->bits_in_queue = 0;
    }

    const LONG full = ctx->bits_in_queue >> 6;      /* complete 64-bit lanes queued */
    const LONG partial = ctx->bits_in_queue & 63;   /* leftover bits in the next lane */

    LONG offset = 0;
    for (LONG i = 0; i < full; ++i) {
        ctx->state[i] ^= cuda_keccak_leuint64(ctx->q + offset);
        offset += 8;
    }

    if (partial > 0) {
        /* only the queued bits count; whatever else is in that queue word is masked off */
        const LONG mask = (1ULL << partial) - 1;
        ctx->state[full] ^= cuda_keccak_leuint64(ctx->q + offset) & mask;
    }

    /* final padding bit: the top bit of the last lane of the rate */
    ctx->state[(ctx->rate_bits - 1) >> 6] ^= 9223372036854775808ULL;/* 1 << 63 */

    cuda_keccak_permutations(ctx);
    cuda_keccak_extract(ctx);

    ctx->bits_in_queue = ctx->rate_bits;
}
247 | 
/*
 * Resets `ctx` for a plain Keccak computation (no SHA-3 suffix).
 * digestbitlen must be one of 128, 224, 256, 288, 384, 512.
 * The rate is derived as 1600 - 2 * digestbitlen bits.
 */
__device__ void cuda_keccak_init(cuda_keccak_ctx_t *ctx, WORD digestbitlen)
{
    /* clears the state, the queue, bits_in_queue and sha3_flag */
    memset(ctx, 0, sizeof(cuda_keccak_ctx_t));

    const LONG rate = 1600 - (digestbitlen << 1);

    ctx->sha3_flag = 0;
    ctx->digestbitlen = digestbitlen;
    ctx->rate_bits = rate;
    ctx->rate_BYTEs = rate >> 3;
    ctx->absorb_round = rate >> 6;
    ctx->bits_in_queue = 0;
}
261 | 
/*
 * Resets `ctx` for a SHA-3 computation: identical to cuda_keccak_init,
 * except that sha3_flag is set so cuda_keccak_final appends the SHA-3
 * suffix bits before padding.
 * digestbitlen must be one of 224, 256, 384, 512.
 */
__device__ void cuda_keccak_sha3_init(cuda_keccak_ctx_t *ctx, WORD digestbitlen)
{
    cuda_keccak_init(ctx, digestbitlen);
    /* must follow the init call, which clears the flag */
    ctx->sha3_flag = 1;
}
270 | 
/*
 * Absorbs `inlen` bytes from `in`.
 * Whole rate-sized blocks are absorbed straight from the input whenever
 * the queue is empty; everything else goes through the queue ctx->q,
 * which is absorbed each time it fills up. Leftover bytes stay queued
 * and their count is recorded in bits_in_queue.
 */
__device__ void cuda_keccak_update(cuda_keccak_ctx_t *ctx, BYTE *in, LONG inlen)
{
    /* largest start offset that still leaves a whole block; negative when inlen < rate */
    const int64_t last_full = (int64_t)(inlen - ctx->rate_BYTEs);
    int64_t queued = ctx->bits_in_queue >> 3;   /* bytes currently held in q */
    int64_t done = 0;                           /* input bytes consumed so far */

    while (done < inlen) {
        if (queued == 0 && done <= last_full) {
            /* fast path: the queue is empty, absorb directly from the input */
            while (done <= last_full) {
                cuda_keccak_absorb(ctx, in + done);
                done += ctx->rate_BYTEs;
            }
            continue;
        }

        /* slow path: move as much as fits into the queue */
        const int64_t chunk = cuda_keccak_MIN(ctx->rate_BYTEs - queued, inlen - done);
        memcpy(ctx->q + queued, in + done, chunk);
        queued += chunk;
        done += chunk;

        if (queued == ctx->rate_BYTEs) {
            cuda_keccak_absorb(ctx, ctx->q);
            queued = 0;
        }
    }

    ctx->bits_in_queue = queued << 3;
}
296 | 
/*
 * Finishes the computation and writes digestbitlen / 8 bytes to `out`.
 * For SHA-3 contexts the two suffix bits (byte value 0x02) are queued
 * first; cuda_keccak_pad then pads, absorbs and fills the queue with
 * output, which is copied out and refilled if more output is needed.
 */
__device__ void cuda_keccak_final(cuda_keccak_ctx_t *ctx, BYTE *out)
{
    if (ctx->sha3_flag) {
        /* overwrite (not OR into) the next queue byte with the 2-bit suffix */
        ctx->q[ctx->bits_in_queue >> 3] = (BYTE)0x02;
        ctx->bits_in_queue += 2;
    }

    cuda_keccak_pad(ctx);

    for (LONG produced = 0; produced < ctx->digestbitlen; ) {
        if (ctx->bits_in_queue == 0) {
            /* queue drained: squeeze another rate-sized chunk */
            cuda_keccak_permutations(ctx);
            cuda_keccak_extract(ctx);
            ctx->bits_in_queue = ctx->rate_bits;
        }

        const LONG take = cuda_keccak_UMIN(ctx->bits_in_queue, ctx->digestbitlen - produced);
        /* unread output sits at the tail of the queue */
        const LONG q_offset = ctx->rate_BYTEs - (ctx->bits_in_queue >> 3);

        memcpy(out + (produced >> 3), ctx->q + q_offset, take >> 3);
        ctx->bits_in_queue -= take;
        produced += take;
    }
}
321 | 
/*
 * One thread hashes one message with plain Keccak.
 *
 *   indata  - n_batch messages stored back to back, inlen bytes each
 *   outdata - n_batch digests stored back to back, KECCAK_BLOCK_SIZE bytes each
 *
 * Launch layout: 1-D grid of 1-D blocks with at least n_batch threads in
 * total; surplus threads exit immediately.
 */
__global__ void kernel_keccak_hash(BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch, WORD KECCAK_BLOCK_SIZE)
{
    /* 64-bit index: thread * inlen can exceed 32 bits for large batches */
    const size_t thread = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (thread >= n_batch)
    {
        return;
    }
    BYTE* in = indata + thread * inlen;
    BYTE* out = outdata + thread * KECCAK_BLOCK_SIZE;
    CUDA_KECCAK_CTX ctx;
    cuda_keccak_init(&ctx, KECCAK_BLOCK_SIZE << 3);
    cuda_keccak_update(&ctx, in, inlen);
    cuda_keccak_final(&ctx, out);
}
extern "C"
{
/*
 * Host entry point: hashes a batch of equally sized messages with Keccak.
 *
 *   in       - n_batch messages stored back to back, inlen bytes each
 *   out      - receives n_batch digests back to back, n_outbit / 8 bytes each
 *   n_outbit - digest length in bits
 *   n_batch  - number of messages
 *
 * Every CUDA call is checked. The first failure is reported on stdout and
 * the remaining steps are skipped; device buffers are released in all cases.
 */
void mcm_cuda_keccak_hash_batch(BYTE * in, WORD inlen, BYTE * out, WORD n_outbit, WORD n_batch)
{
    const WORD KECCAK_BLOCK_SIZE = (n_outbit >> 3);
    /* sizes are computed in size_t: inlen * n_batch may not fit in 32 bits */
    const size_t in_bytes = (size_t)inlen * n_batch;
    const size_t out_bytes = (size_t)KECCAK_BLOCK_SIZE * n_batch;
    BYTE *cuda_indata = NULL;
    BYTE *cuda_outdata = NULL;
    cudaError_t error;

    if (n_batch == 0)
        return; /* nothing to hash; a grid of zero blocks is an invalid launch */

    error = cudaMalloc(&cuda_indata, in_bytes);
    if (error == cudaSuccess)
        error = cudaMalloc(&cuda_outdata, out_bytes);
    if (error == cudaSuccess)
        error = cudaMemcpy(cuda_indata, in, in_bytes, cudaMemcpyHostToDevice);

    if (error == cudaSuccess) {
        const WORD thread = 256;
        /* ceiling division written so that n_batch + thread - 1 cannot wrap */
        const WORD block = n_batch / thread + (n_batch % thread != 0);

        kernel_keccak_hash << < block, thread >> > (cuda_indata, inlen, cuda_outdata, n_batch, KECCAK_BLOCK_SIZE);
        error = cudaGetLastError();          /* launch-configuration errors */
    }
    if (error == cudaSuccess)
        error = cudaDeviceSynchronize();     /* errors raised while the kernel ran */
    if (error == cudaSuccess)
        error = cudaMemcpy(out, cuda_outdata, out_bytes, cudaMemcpyDeviceToHost);

    if (error != cudaSuccess) {
        printf("Error cuda keccak hash: %s \n", cudaGetErrorString(error));
    }
    /* cudaFree accepts NULL, so this is safe after a failed allocation */
    cudaFree(cuda_indata);
    cudaFree(cuda_outdata);
}
}


--------------------------------------------------------------------------------
/keccak.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * keccak.cuh CUDA Implementation of Keccak/SHA3 Hashing
 3 |  *
 4 |  * Date: 12 June 2019
 5 |  * Revision: 1
 6 |  *
 7 |  * This file is released into the Public Domain.
 8 |  */
 9 | 
10 | 
11 | #pragma once
12 | #include "config.h"
/*
 * Hashes n_batch messages of inlen bytes each (stored back to back in `in`)
 * with Keccak and writes n_batch digests of n_outbit / 8 bytes each,
 * back to back, to `out`.
 * All pointers are host pointers; device memory is managed internally.
 */
void mcm_cuda_keccak_hash_batch(BYTE * in,
                                WORD inlen,
                                BYTE * out,
                                WORD n_outbit,
                                WORD n_batch);
14 | 


--------------------------------------------------------------------------------
/md2.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * md2.cu CUDA Implementation of MD2 digest
  3 |  *
  4 |  * Date: 12 June 2019
  5 |  * Revision: 1
  6 |  * 
  7 |  * Based on the public domain Reference Implementation in C, by
  8 |  * Brad Conte, original code here:
  9 |  *
 10 |  * https://github.com/B-Con/crypto-algorithms
 11 |  *
 12 |  * This file is released into the Public Domain.
 13 |  */
 14 | 
 15 | 
 16 | /*************************** HEADER FILES ***************************/
 17 | #include <stdlib.h>
 18 | #include <memory.h>
 19 | #include <device_launch_parameters.h>
 20 | #include <cuda_runtime.h>
 21 | extern "C" {
 22 | #include "md2.cuh"
 23 | }
 24 | #define MD2_BLOCK_SIZE 16
 25 | /**************************** STRUCT ********************************/
 26 | typedef struct {
 27 | 	BYTE data[16];
 28 | 	BYTE state[48];
 29 | 	BYTE checksum[16];
 30 | 	int len;
 31 | } CUDA_MD2_CTX;
 32 | 
 33 | /**************************** VARIABLES *****************************/
 34 | __constant__ BYTE s[256] = {
 35 | 	41, 46, 67, 201, 162, 216, 124, 1, 61, 54, 84, 161, 236, 240, 6,
 36 | 	19, 98, 167, 5, 243, 192, 199, 115, 140, 152, 147, 43, 217, 188,
 37 | 	76, 130, 202, 30, 155, 87, 60, 253, 212, 224, 22, 103, 66, 111, 24,
 38 | 	138, 23, 229, 18, 190, 78, 196, 214, 218, 158, 222, 73, 160, 251,
 39 | 	245, 142, 187, 47, 238, 122, 169, 104, 121, 145, 21, 178, 7, 63,
 40 | 	148, 194, 16, 137, 11, 34, 95, 33, 128, 127, 93, 154, 90, 144, 50,
 41 | 	39, 53, 62, 204, 231, 191, 247, 151, 3, 255, 25, 48, 179, 72, 165,
 42 | 	181, 209, 215, 94, 146, 42, 172, 86, 170, 198, 79, 184, 56, 210,
 43 | 	150, 164, 125, 182, 118, 252, 107, 226, 156, 116, 4, 241, 69, 157,
 44 | 	112, 89, 100, 113, 135, 32, 134, 91, 207, 101, 230, 45, 168, 2, 27,
 45 | 	96, 37, 173, 174, 176, 185, 246, 28, 70, 97, 105, 52, 64, 126, 15,
 46 | 	85, 71, 163, 35, 221, 81, 175, 58, 195, 92, 249, 206, 186, 197,
 47 | 	234, 38, 44, 83, 13, 110, 133, 40, 132, 9, 211, 223, 205, 244, 65,
 48 | 	129, 77, 82, 106, 220, 55, 200, 108, 193, 171, 250, 36, 225, 123,
 49 | 	8, 12, 189, 177, 74, 120, 136, 149, 139, 227, 99, 232, 109, 233,
 50 | 	203, 213, 254, 59, 0, 29, 57, 242, 239, 183, 14, 102, 88, 208, 228,
 51 | 	166, 119, 114, 248, 235, 117, 75, 10, 49, 68, 80, 180, 143, 237,
 52 | 	31, 26, 219, 153, 141, 51, 159, 17, 131, 20
 53 | };
 54 | 
 55 | /*********************** FUNCTION DEFINITIONS ***********************/
 56 | __device__ void cuda_md2_transform(CUDA_MD2_CTX *ctx, BYTE data[])
 57 | {
 58 | 	int j,k,t;
 59 | 
 60 | 	//memcpy(&ctx->state[16], data);
 61 | 	for (j=0; j < 16; ++j) {
 62 | 		ctx->state[j + 16] = data[j];
 63 | 		ctx->state[j + 32] = (ctx->state[j+16] ^ ctx->state[j]);
 64 | 	}
 65 | 
 66 | 	t = 0;
 67 | 	for (j = 0; j < 18; ++j) {
 68 | 		for (k = 0; k < 48; ++k) {
 69 | 			ctx->state[k] ^= s[t];
 70 | 			t = ctx->state[k];
 71 | 		}
 72 | 		t = (t+j) & 0xFF;
 73 | 	}
 74 | 
 75 | 	t = ctx->checksum[15];
 76 | 	for (j=0; j < 16; ++j) {
 77 | 		ctx->checksum[j] ^= s[data[j] ^ t];
 78 | 		t = ctx->checksum[j];
 79 | 	}
 80 | }
 81 | 
 82 | __device__ void cuda_md2_init(CUDA_MD2_CTX *ctx)
 83 | {
 84 | 	int i;
 85 | 
 86 | 	for (i=0; i < 48; ++i)
 87 | 		ctx->state[i] = 0;
 88 | 	for (i=0; i < 16; ++i)
 89 | 		ctx->checksum[i] = 0;
 90 | 	ctx->len = 0;
 91 | }
 92 | 
 93 | __device__ void cuda_md2_update(CUDA_MD2_CTX *ctx, const BYTE data[], size_t len)
 94 | {
 95 | 	size_t i;
 96 | 
 97 | 	for (i = 0; i < len; ++i) {
 98 | 		ctx->data[ctx->len] = data[i];
 99 | 		ctx->len++;
100 | 		if (ctx->len == MD2_BLOCK_SIZE) {
101 | 			cuda_md2_transform(ctx, ctx->data);
102 | 			ctx->len = 0;
103 | 		}
104 | 	}
105 | }
106 | 
/*
 * Pads the last block, processes it together with the checksum block and
 * writes the 16-byte digest to `hash`.
 */
__device__ void cuda_md2_final(CUDA_MD2_CTX *ctx, BYTE hash[])
{
	/* every padding byte holds the number of padding bytes added */
	const int pad = MD2_BLOCK_SIZE - ctx->len;

	for (; ctx->len < MD2_BLOCK_SIZE; ctx->len++)
		ctx->data[ctx->len] = pad;

	cuda_md2_transform(ctx, ctx->data);
	/* the running checksum is hashed as one extra block */
	cuda_md2_transform(ctx, ctx->checksum);

	memcpy(hash, ctx->state, MD2_BLOCK_SIZE);
}
121 | 
/*
 * One thread hashes one message with MD2.
 *
 *   indata  - n_batch messages stored back to back, inlen bytes each
 *   outdata - n_batch digests stored back to back, MD2_BLOCK_SIZE bytes each
 *
 * Launch layout: 1-D grid of 1-D blocks with at least n_batch threads in
 * total; surplus threads exit immediately.
 */
__global__ void kernel_md2_hash(BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch)
{
	/* 64-bit index: thread * inlen can exceed 32 bits for large batches */
	const size_t thread = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
	if (thread >= n_batch)
	{
		return;
	}
	BYTE* in = indata + thread * inlen;
	BYTE* out = outdata + thread * MD2_BLOCK_SIZE;
	CUDA_MD2_CTX ctx;
	cuda_md2_init(&ctx);
	cuda_md2_update(&ctx, in, inlen);
	cuda_md2_final(&ctx, out);
}
136 | extern "C" {
137 | /*
138 |  * Hashes n_batch equally sized messages with MD2 on the GPU.
139 |  *
140 |  *   in       host buffer with n_batch messages of inlen bytes each, back to back
141 |  *   inlen    length in bytes of every message
142 |  *   out      host buffer receiving n_batch digests of MD2_BLOCK_SIZE bytes each
143 |  *   n_batch  number of messages; 0 returns immediately
144 |  *
145 |  * The first failing CUDA call is reported on stdout and the remaining steps
146 |  * are skipped, so out is only filled when everything before the copy-back
147 |  * succeeded. Device buffers are released on every path.
148 |  */
149 | void mcm_cuda_md2_hash_batch(BYTE *in, WORD inlen, BYTE *out, WORD n_batch) {
150 | 	// Nothing to hash, and a grid of zero blocks is an invalid launch configuration.
151 | 	if (n_batch == 0)
152 | 		return;
153 | 
154 | 	// Byte counts in size_t: WORD * WORD would wrap at 4 GiB.
155 | 	const size_t in_bytes = (size_t)inlen * n_batch;
156 | 	const size_t out_bytes = (size_t)MD2_BLOCK_SIZE * n_batch;
157 | 	BYTE *cuda_indata = NULL;
158 | 	BYTE *cuda_outdata = NULL;
159 | 
160 | 	cudaError_t error = cudaMalloc(&cuda_indata, in_bytes);
161 | 	if (error == cudaSuccess)
162 | 		error = cudaMalloc(&cuda_outdata, out_bytes);
163 | 	if (error == cudaSuccess)
164 | 		error = cudaMemcpy(cuda_indata, in, in_bytes, cudaMemcpyHostToDevice);
165 | 
166 | 	if (error == cudaSuccess) {
167 | 		WORD thread = 256;
168 | 		// Ceiling division written so that n_batch + thread - 1 cannot overflow.
169 | 		WORD block = n_batch / thread + (n_batch % thread != 0);
170 | 
171 | 		kernel_md2_hash<<<block, thread>>>(cuda_indata, inlen, cuda_outdata, n_batch);
172 | 		error = cudaGetLastError();       // launch-configuration errors
173 | 	}
174 | 	if (error == cudaSuccess)
175 | 		error = cudaDeviceSynchronize();  // errors raised while the kernel ran
176 | 	if (error == cudaSuccess)
177 | 		error = cudaMemcpy(out, cuda_outdata, out_bytes, cudaMemcpyDeviceToHost);
178 | 
179 | 	if (error != cudaSuccess) {
180 | 		printf("Error cuda md2 hash: %s \n", cudaGetErrorString(error));
181 | 	}
182 | 	// cudaFree on a null pointer does nothing, so both calls are safe after a failure.
183 | 	cudaFree(cuda_indata);
184 | 	cudaFree(cuda_outdata);
185 | }
186 | }
158 | 


--------------------------------------------------------------------------------
/md2.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * md2.cuh CUDA Implementation of MD2 digest       
 3 |  *
 4 |  * Date: 12 June 2019
 5 |  * Revision: 1
 6 |  * 
 7 |  * Based on the public domain Reference Implementation in C, by
 8 |  * Brad Conte, original code here:
 9 |  *
10 |  * https://github.com/B-Con/crypto-algorithms
11 |  *
12 |  * This file is released into the Public Domain.
13 |  */
14 | 
15 | 
16 | #pragma once
17 | #include "config.h"
18 | /*
19 |  * Hashes n_batch messages of inlen bytes each (stored back to back in the
20 |  * host buffer in) with MD2 on the GPU and writes one digest per message,
21 |  * back to back, into the host buffer out.
22 |  *
23 |  * The definition has C linkage; the guard below lets C++ code include this
24 |  * header directly. Including it from inside an extern "C" block still works.
25 |  */
26 | #ifdef __cplusplus
27 | extern "C" {
28 | #endif
29 | void mcm_cuda_md2_hash_batch(BYTE* in, WORD inlen, BYTE* out, WORD n_batch);
30 | #ifdef __cplusplus
31 | }
32 | #endif
19 | 


--------------------------------------------------------------------------------
/md5.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * md5.cu CUDA Implementation of MD5 digest       
  3 |  *
  4 |  * Date: 12 June 2019
  5 |  * Revision: 1
  6 |  * 
  7 |  * Based on the public domain Reference Implementation in C, by
  8 |  * Brad Conte, original code here:
  9 |  *
 10 |  * https://github.com/B-Con/crypto-algorithms
 11 |  *
 12 |  * This file is released into the Public Domain.
 13 |  */
 14 | 
 15 | 
 16 | /*************************** HEADER FILES ***************************/
 17 | #include <stdlib.h>
 18 | #include <memory.h>
 19 | extern "C" {
 20 | #include "md5.cuh"
 21 | }
 22 | /****************************** MACROS ******************************/
 23 | #define MD5_BLOCK_SIZE 16               // MD5 outputs a 16 byte digest
 24 | 
 25 | /**************************** DATA TYPES ****************************/
 26 | typedef unsigned char BYTE;             // 8-bit byte
 27 | typedef unsigned int  WORD;             // 32-bit word, change to "long" for 16-bit machines
 28 | 
 29 | typedef struct {                 // streaming state for one MD5 computation
 30 | 	BYTE data[64];               // input bytes buffered until a full 64-byte block is available
 31 | 	WORD datalen;                // number of valid bytes currently held in data
 32 | 	unsigned long long bitlen;   // message bits counted so far (grows by 512 per processed block)
 33 | 	WORD state[4];               // chaining values, set by cuda_md5_init and updated by the transform
 34 | } CUDA_MD5_CTX;
 35 | 
 36 | /****************************** MACROS ******************************/
 37 | #ifndef ROTLEFT
 38 | // Rotates a left by b bits within 32 bits. Arguments are parenthesised so
 39 | // that compound expressions expand correctly.
 40 | #define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b))))
 41 | #endif
 42 | 
 43 | // Bitwise mixing functions, one per group of 16 steps.
 44 | #define F(x,y,z) (((x) & (y)) | (~(x) & (z)))
 45 | #define G(x,y,z) (((x) & (z)) | ((y) & ~(z)))
 46 | #define H(x,y,z) ((x) ^ (y) ^ (z))
 47 | #define I(x,y,z) ((y) ^ ((x) | ~(z)))
 48 | 
 49 | // One step: a = b + rotl(a + f(b,c,d) + m + t, s). a must be a modifiable lvalue.
 50 | #define FF(a,b,c,d,m,s,t) { a += F(b,c,d) + (m) + (t); \
 51 |                             a = (b) + ROTLEFT(a,s); }
 52 | #define GG(a,b,c,d,m,s,t) { a += G(b,c,d) + (m) + (t); \
 53 |                             a = (b) + ROTLEFT(a,s); }
 54 | #define HH(a,b,c,d,m,s,t) { a += H(b,c,d) + (m) + (t); \
 55 |                             a = (b) + ROTLEFT(a,s); }
 56 | #define II(a,b,c,d,m,s,t) { a += I(b,c,d) + (m) + (t); \
 57 |                             a = (b) + ROTLEFT(a,s); }
 54 | 
 55 | /*********************** FUNCTION DEFINITIONS ***********************/
 56 | /*
 57 |  * MD5 compression function: folds one 64-byte block into ctx->state.
 58 |  *
 59 |  *   ctx   context whose state[0..3] is read and updated
 60 |  *   data  the 64 bytes to process
 61 |  */
 62 | __device__ void cuda_md5_transform(CUDA_MD5_CTX *ctx, const BYTE data[])
 63 | {
 64 | 	WORD a, b, c, d, m[16], i, j;
 65 | 
 66 | 	// MD5 reads the block as sixteen little-endian 32-bit words. They are
 67 | 	// assembled byte by byte, so this does not depend on the machine's byte
 68 | 	// order. Each byte is widened to WORD first: shifting a promoted
 69 | 	// (signed) int left by 24 would overflow for bytes >= 0x80.
 70 | 	for (i = 0, j = 0; i < 16; ++i, j += 4)
 71 | 		m[i] = ((WORD)data[j]) | ((WORD)data[j + 1] << 8) | ((WORD)data[j + 2] << 16) | ((WORD)data[j + 3] << 24);
 72 | 
 73 | 	a = ctx->state[0];
 74 | 	b = ctx->state[1];
 75 | 	c = ctx->state[2];
 76 | 	d = ctx->state[3];
 77 | 
 78 | 	FF(a,b,c,d,m[0],  7,0xd76aa478);
 79 | 	FF(d,a,b,c,m[1], 12,0xe8c7b756);
 80 | 	FF(c,d,a,b,m[2], 17,0x242070db);
 81 | 	FF(b,c,d,a,m[3], 22,0xc1bdceee);
 82 | 	FF(a,b,c,d,m[4],  7,0xf57c0faf);
 83 | 	FF(d,a,b,c,m[5], 12,0x4787c62a);
 84 | 	FF(c,d,a,b,m[6], 17,0xa8304613);
 85 | 	FF(b,c,d,a,m[7], 22,0xfd469501);
 86 | 	FF(a,b,c,d,m[8],  7,0x698098d8);
 87 | 	FF(d,a,b,c,m[9], 12,0x8b44f7af);
 88 | 	FF(c,d,a,b,m[10],17,0xffff5bb1);
 89 | 	FF(b,c,d,a,m[11],22,0x895cd7be);
 90 | 	FF(a,b,c,d,m[12], 7,0x6b901122);
 91 | 	FF(d,a,b,c,m[13],12,0xfd987193);
 92 | 	FF(c,d,a,b,m[14],17,0xa679438e);
 93 | 	FF(b,c,d,a,m[15],22,0x49b40821);
 94 | 
 95 | 	GG(a,b,c,d,m[1],  5,0xf61e2562);
 96 | 	GG(d,a,b,c,m[6],  9,0xc040b340);
 97 | 	GG(c,d,a,b,m[11],14,0x265e5a51);
 98 | 	GG(b,c,d,a,m[0], 20,0xe9b6c7aa);
 99 | 	GG(a,b,c,d,m[5],  5,0xd62f105d);
100 | 	GG(d,a,b,c,m[10], 9,0x02441453);
101 | 	GG(c,d,a,b,m[15],14,0xd8a1e681);
102 | 	GG(b,c,d,a,m[4], 20,0xe7d3fbc8);
103 | 	GG(a,b,c,d,m[9],  5,0x21e1cde6);
104 | 	GG(d,a,b,c,m[14], 9,0xc33707d6);
105 | 	GG(c,d,a,b,m[3], 14,0xf4d50d87);
106 | 	GG(b,c,d,a,m[8], 20,0x455a14ed);
107 | 	GG(a,b,c,d,m[13], 5,0xa9e3e905);
108 | 	GG(d,a,b,c,m[2],  9,0xfcefa3f8);
109 | 	GG(c,d,a,b,m[7], 14,0x676f02d9);
110 | 	GG(b,c,d,a,m[12],20,0x8d2a4c8a);
111 | 
112 | 	HH(a,b,c,d,m[5],  4,0xfffa3942);
113 | 	HH(d,a,b,c,m[8], 11,0x8771f681);
114 | 	HH(c,d,a,b,m[11],16,0x6d9d6122);
115 | 	HH(b,c,d,a,m[14],23,0xfde5380c);
116 | 	HH(a,b,c,d,m[1],  4,0xa4beea44);
117 | 	HH(d,a,b,c,m[4], 11,0x4bdecfa9);
118 | 	HH(c,d,a,b,m[7], 16,0xf6bb4b60);
119 | 	HH(b,c,d,a,m[10],23,0xbebfbc70);
120 | 	HH(a,b,c,d,m[13], 4,0x289b7ec6);
121 | 	HH(d,a,b,c,m[0], 11,0xeaa127fa);
122 | 	HH(c,d,a,b,m[3], 16,0xd4ef3085);
123 | 	HH(b,c,d,a,m[6], 23,0x04881d05);
124 | 	HH(a,b,c,d,m[9],  4,0xd9d4d039);
125 | 	HH(d,a,b,c,m[12],11,0xe6db99e5);
126 | 	HH(c,d,a,b,m[15],16,0x1fa27cf8);
127 | 	HH(b,c,d,a,m[2], 23,0xc4ac5665);
128 | 
129 | 	II(a,b,c,d,m[0],  6,0xf4292244);
130 | 	II(d,a,b,c,m[7], 10,0x432aff97);
131 | 	II(c,d,a,b,m[14],15,0xab9423a7);
132 | 	II(b,c,d,a,m[5], 21,0xfc93a039);
133 | 	II(a,b,c,d,m[12], 6,0x655b59c3);
134 | 	II(d,a,b,c,m[3], 10,0x8f0ccc92);
135 | 	II(c,d,a,b,m[10],15,0xffeff47d);
136 | 	II(b,c,d,a,m[1], 21,0x85845dd1);
137 | 	II(a,b,c,d,m[8],  6,0x6fa87e4f);
138 | 	II(d,a,b,c,m[15],10,0xfe2ce6e0);
139 | 	II(c,d,a,b,m[6], 15,0xa3014314);
140 | 	II(b,c,d,a,m[13],21,0x4e0811a1);
141 | 	II(a,b,c,d,m[4],  6,0xf7537e82);
142 | 	II(d,a,b,c,m[11],10,0xbd3af235);
143 | 	II(c,d,a,b,m[2], 15,0x2ad7d2bb);
144 | 	II(b,c,d,a,m[9], 21,0xeb86d391);
145 | 
146 | 	// Feed-forward: add the block's result to the previous chaining values.
147 | 	ctx->state[0] += a;
148 | 	ctx->state[1] += b;
149 | 	ctx->state[2] += c;
150 | 	ctx->state[3] += d;
151 | }
144 | 
145 | /* Prepares ctx for a new message: empty buffer, zero bit count, initial chaining values. */
146 | __device__ void cuda_md5_init(CUDA_MD5_CTX *ctx)
147 | {
148 | 	const WORD initial[4] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476 };
149 | 	int w;
150 | 
151 | 	for (w = 0; w < 4; ++w)
152 | 		ctx->state[w] = initial[w];
153 | 	ctx->bitlen = 0;
154 | 	ctx->datalen = 0;
155 | }
154 | 
155 | /*
156 |  * Appends len bytes from data to the message. Each completed 64-byte block
157 |  * is transformed immediately and adds 512 to the bit count.
158 |  */
159 | __device__ void cuda_md5_update(CUDA_MD5_CTX *ctx, const BYTE data[], size_t len)
160 | {
161 | 	const BYTE *src = data;
162 | 	size_t remaining = len;
163 | 
164 | 	while (remaining > 0) {
165 | 		ctx->data[ctx->datalen] = *src++;
166 | 		ctx->datalen++;
167 | 		--remaining;
168 | 		if (ctx->datalen != 64)
169 | 			continue;
170 | 		cuda_md5_transform(ctx, ctx->data);
171 | 		ctx->bitlen += 512;
172 | 		ctx->datalen = 0;
173 | 	}
174 | }
169 | 
170 | /*
171 |  * Finishes the message: pads the buffered bytes, appends the bit length,
172 |  * runs the last transform(s) and writes the 16-byte digest to hash.
173 |  */
174 | __device__ void cuda_md5_final(CUDA_MD5_CTX *ctx, BYTE hash[])
175 | {
176 | 	size_t fill = ctx->datalen;
177 | 	int shift, word;
178 | 
179 | 	// The 0x80 marker always follows the last message byte.
180 | 	ctx->data[fill++] = 0x80;
181 | 	if (ctx->datalen >= 56) {
182 | 		// No room for the 8 length bytes: flush this block and continue
183 | 		// with a block whose first 56 bytes are zero.
184 | 		for (; fill < 64; ++fill)
185 | 			ctx->data[fill] = 0x00;
186 | 		cuda_md5_transform(ctx, ctx->data);
187 | 		memset(ctx->data, 0, 56);
188 | 	}
189 | 	else {
190 | 		for (; fill < 56; ++fill)
191 | 			ctx->data[fill] = 0x00;
192 | 	}
193 | 
194 | 	// Message length in bits goes into data[56..63], least significant byte first.
195 | 	ctx->bitlen += ctx->datalen * 8;
196 | 	for (shift = 0; shift < 64; shift += 8)
197 | 		ctx->data[56 + shift / 8] = ctx->bitlen >> shift;
198 | 	cuda_md5_transform(ctx, ctx->data);
199 | 
200 | 	// Each state word is written out least significant byte first.
201 | 	for (word = 0; word < 4; ++word)
202 | 		for (shift = 0; shift < 32; shift += 8)
203 | 			hash[word * 4 + shift / 8] = (ctx->state[word] >> shift) & 0x000000ff;
204 | }
211 | 
212 | /*
213 |  * One thread per message. Thread t hashes the inlen bytes at
214 |  * indata + t * inlen and writes MD5_BLOCK_SIZE (16) digest bytes to
215 |  * outdata + t * MD5_BLOCK_SIZE. Indexing uses the x dimension only, and
216 |  * threads with an index >= n_batch exit without touching memory.
217 |  */
218 | __global__ void kernel_md5_hash(BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch)
219 | {
220 | 	// size_t arithmetic: the 32-bit products thread * inlen and
221 | 	// blockIdx.x * blockDim.x wrap around for large batches.
222 | 	size_t thread = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
223 | 	if (thread >= n_batch)
224 | 	{
225 | 		return;
226 | 	}
227 | 	BYTE* in = indata + thread * (size_t)inlen;
228 | 	BYTE* out = outdata + thread * (size_t)MD5_BLOCK_SIZE;
229 | 	CUDA_MD5_CTX ctx;
230 | 	cuda_md5_init(&ctx);
231 | 	cuda_md5_update(&ctx, in, inlen);
232 | 	cuda_md5_final(&ctx, out);
233 | }
226 | 
227 | extern "C"
228 | {
229 | /*
230 |  * Hashes n_batch equally sized messages with MD5 on the GPU.
231 |  *
232 |  *   in       host buffer with n_batch messages of inlen bytes each, back to back
233 |  *   inlen    length in bytes of every message
234 |  *   out      host buffer receiving n_batch digests of MD5_BLOCK_SIZE (16) bytes each
235 |  *   n_batch  number of messages; 0 returns immediately
236 |  *
237 |  * The first failing CUDA call is reported on stdout and the remaining steps
238 |  * are skipped, so out is only filled when everything before the copy-back
239 |  * succeeded. Device buffers are released on every path.
240 |  */
241 | void mcm_cuda_md5_hash_batch(BYTE* in, WORD inlen, BYTE* out, WORD n_batch)
242 | {
243 | 	// Nothing to hash, and a grid of zero blocks is an invalid launch configuration.
244 | 	if (n_batch == 0)
245 | 		return;
246 | 
247 | 	// Byte counts in size_t: WORD * WORD would wrap at 4 GiB.
248 | 	const size_t in_bytes = (size_t)inlen * n_batch;
249 | 	const size_t out_bytes = (size_t)MD5_BLOCK_SIZE * n_batch;
250 | 	BYTE *cuda_indata = NULL;
251 | 	BYTE *cuda_outdata = NULL;
252 | 
253 | 	cudaError_t error = cudaMalloc(&cuda_indata, in_bytes);
254 | 	if (error == cudaSuccess)
255 | 		error = cudaMalloc(&cuda_outdata, out_bytes);
256 | 	if (error == cudaSuccess)
257 | 		error = cudaMemcpy(cuda_indata, in, in_bytes, cudaMemcpyHostToDevice);
258 | 
259 | 	if (error == cudaSuccess) {
260 | 		WORD thread = 256;
261 | 		// Ceiling division written so that n_batch + thread - 1 cannot overflow.
262 | 		WORD block = n_batch / thread + (n_batch % thread != 0);
263 | 
264 | 		kernel_md5_hash<<<block, thread>>>(cuda_indata, inlen, cuda_outdata, n_batch);
265 | 		error = cudaGetLastError();       // launch-configuration errors
266 | 	}
267 | 	if (error == cudaSuccess)
268 | 		error = cudaDeviceSynchronize();  // errors raised while the kernel ran
269 | 	if (error == cudaSuccess)
270 | 		error = cudaMemcpy(out, cuda_outdata, out_bytes, cudaMemcpyDeviceToHost);
271 | 
272 | 	if (error != cudaSuccess) {
273 | 		printf("Error cuda md5 hash: %s \n", cudaGetErrorString(error));
274 | 	}
275 | 	// cudaFree on a null pointer does nothing, so both calls are safe after a failure.
276 | 	cudaFree(cuda_indata);
277 | 	cudaFree(cuda_outdata);
278 | }
279 | }
251 | 


--------------------------------------------------------------------------------
/md5.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * md5.cuh CUDA Implementation of MD5 digest       
 3 |  *
 4 |  * Date: 12 June 2019
 5 |  * Revision: 1
 6 |  * 
 7 |  * Based on the public domain Reference Implementation in C, by
 8 |  * Brad Conte, original code here:
 9 |  *
10 |  * https://github.com/B-Con/crypto-algorithms
11 |  *
12 |  * This file is released into the Public Domain.
13 |  */
14 | 
15 | 
16 | #pragma once
17 | #include "config.h"
18 | /*
19 |  * Hashes n_batch messages of inlen bytes each (stored back to back in the
20 |  * host buffer in) with MD5 on the GPU and writes one 16-byte digest per
21 |  * message, back to back, into the host buffer out.
22 |  *
23 |  * The definition has C linkage; the guard below lets C++ code include this
24 |  * header directly. Including it from inside an extern "C" block still works.
25 |  */
26 | #ifdef __cplusplus
27 | extern "C" {
28 | #endif
29 | void mcm_cuda_md5_hash_batch(BYTE* in, WORD inlen, BYTE* out, WORD n_batch);
30 | #ifdef __cplusplus
31 | }
32 | #endif
19 | 


--------------------------------------------------------------------------------
/sha1.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * sha1.cu CUDA Implementation of SHA1 Hashing       
  3 |  *
  4 |  * Date: 12 June 2019
  5 |  * Revision: 1
  6 |  * 
  7 |  * Based on the public domain Reference Implementation in C, by
  8 |  * Brad Conte, original code here:
  9 |  *
 10 |  * https://github.com/B-Con/crypto-algorithms
 11 |  *
 12 |  * This file is released into the Public Domain.
 13 |  */
 14 | 
 15 |  
 16 | /*************************** HEADER FILES ***************************/
 17 | #include <stdlib.h>
 18 | #include <memory.h>
 19 | extern "C" {
 20 | #include "sha1.cuh"
 21 | }
 22 | 
 23 | /****************************** MACROS ******************************/
 24 | #define SHA1_BLOCK_SIZE 20              // SHA1 outputs a 20 byte digest
 25 | 
 26 | /**************************** DATA TYPES ****************************/
 27 | typedef struct {                 // streaming state for one SHA1 computation
 28 | 	BYTE data[64];               // input bytes buffered until a full 64-byte block is available
 29 | 	WORD datalen;                // number of valid bytes currently held in data
 30 | 	unsigned long long bitlen;   // message bits counted so far (grows by 512 per processed block)
 31 | 	WORD state[5];               // chaining values, set by cuda_sha1_init and updated by the transform
 32 | 	WORD k[4];                   // one additive constant per group of 20 rounds, set by cuda_sha1_init
 33 | } CUDA_SHA1_CTX;
 34 | 
 35 | /****************************** MACROS ******************************/
 36 | #ifndef ROTLEFT
 37 | #define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b))))   // rotate a left by b bits within 32 bits; expects an unsigned 32-bit a
 38 | #endif
 39 | 
 40 | /*********************** FUNCTION DEFINITIONS ***********************/
 41 | /*
 42 |  * SHA1 compression function: folds one 64-byte block into ctx->state.
 43 |  *
 44 |  *   ctx   context whose state[0..4] is read and updated; k[0..3] supplies
 45 |  *         the additive constant for each group of 20 rounds
 46 |  *   data  the 64 bytes to process
 47 |  */
 48 | __device__  __forceinline__ void cuda_sha1_transform(CUDA_SHA1_CTX *ctx, const BYTE data[])
 49 | {
 50 | 	WORD a, b, c, d, e, i, j, t, m[80];
 51 | 
 52 | 	// The block is read as sixteen big-endian 32-bit words. Each byte is
 53 | 	// widened to WORD first: shifting a promoted (signed) int left by 24
 54 | 	// would overflow for bytes >= 0x80.
 55 | 	for (i = 0, j = 0; i < 16; ++i, j += 4)
 56 | 		m[i] = ((WORD)data[j] << 24) | ((WORD)data[j + 1] << 16) | ((WORD)data[j + 2] << 8) | ((WORD)data[j + 3]);
 57 | 	// Expand to 80 words: XOR of four earlier words, rotated left by one bit.
 58 | 	for ( ; i < 80; ++i) {
 59 | 		m[i] = (m[i - 3] ^ m[i - 8] ^ m[i - 14] ^ m[i - 16]);
 60 | 		m[i] = (m[i] << 1) | (m[i] >> 31);
 61 | 	}
 62 | 
 63 | 	a = ctx->state[0];
 64 | 	b = ctx->state[1];
 65 | 	c = ctx->state[2];
 66 | 	d = ctx->state[3];
 67 | 	e = ctx->state[4];
 68 | 
 69 | 	// Rounds 0-19: bitwise select between c and d, controlled by b.
 70 | 	for (i = 0; i < 20; ++i) {
 71 | 		t = ROTLEFT(a, 5) + ((b & c) ^ (~b & d)) + e + ctx->k[0] + m[i];
 72 | 		e = d;
 73 | 		d = c;
 74 | 		c = ROTLEFT(b, 30);
 75 | 		b = a;
 76 | 		a = t;
 77 | 	}
 78 | 	// Rounds 20-39: parity of b, c, d.
 79 | 	for ( ; i < 40; ++i) {
 80 | 		t = ROTLEFT(a, 5) + (b ^ c ^ d) + e + ctx->k[1] + m[i];
 81 | 		e = d;
 82 | 		d = c;
 83 | 		c = ROTLEFT(b, 30);
 84 | 		b = a;
 85 | 		a = t;
 86 | 	}
 87 | 	// Rounds 40-59: bitwise majority of b, c, d.
 88 | 	for ( ; i < 60; ++i) {
 89 | 		t = ROTLEFT(a, 5) + ((b & c) ^ (b & d) ^ (c & d))  + e + ctx->k[2] + m[i];
 90 | 		e = d;
 91 | 		d = c;
 92 | 		c = ROTLEFT(b, 30);
 93 | 		b = a;
 94 | 		a = t;
 95 | 	}
 96 | 	// Rounds 60-79: parity again, with the last constant.
 97 | 	for ( ; i < 80; ++i) {
 98 | 		t = ROTLEFT(a, 5) + (b ^ c ^ d) + e + ctx->k[3] + m[i];
 99 | 		e = d;
100 | 		d = c;
101 | 		c = ROTLEFT(b, 30);
102 | 		b = a;
103 | 		a = t;
104 | 	}
105 | 
106 | 	// Feed-forward: add the block's result to the previous chaining values.
107 | 	ctx->state[0] += a;
108 | 	ctx->state[1] += b;
109 | 	ctx->state[2] += c;
110 | 	ctx->state[3] += d;
111 | 	ctx->state[4] += e;
112 | }
 97 | 
 98 | /* Prepares ctx for a new message: empty buffer, zero bit count, initial state and round constants. */
 99 | __device__ void cuda_sha1_init(CUDA_SHA1_CTX *ctx)
100 | {
101 | 	const WORD initial[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xc3d2e1f0 };
102 | 	const WORD round_k[4] = { 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 };
103 | 	int n;
104 | 
105 | 	for (n = 0; n < 5; ++n)
106 | 		ctx->state[n] = initial[n];
107 | 	for (n = 0; n < 4; ++n)
108 | 		ctx->k[n] = round_k[n];
109 | 	ctx->bitlen = 0;
110 | 	ctx->datalen = 0;
111 | }
112 | 
113 | /*
114 |  * Appends len bytes from data to the message. Each completed 64-byte block
115 |  * is transformed immediately and adds 512 to the bit count.
116 |  */
117 | __device__ void cuda_sha1_update(CUDA_SHA1_CTX *ctx, const BYTE data[], size_t len)
118 | {
119 | 	const BYTE *src = data;
120 | 	const BYTE *stop = data + len;
121 | 
122 | 	while (src != stop) {
123 | 		ctx->data[ctx->datalen] = *src;
124 | 		++src;
125 | 		ctx->datalen++;
126 | 		if (ctx->datalen != 64)
127 | 			continue;
128 | 		cuda_sha1_transform(ctx, ctx->data);
129 | 		ctx->bitlen += 512;
130 | 		ctx->datalen = 0;
131 | 	}
132 | }
127 | 
128 | /*
129 |  * Finishes the message: pads the buffered bytes, appends the bit length,
130 |  * runs the last transform(s) and writes the 20-byte digest to hash.
131 |  */
132 | __device__ void cuda_sha1_final(CUDA_SHA1_CTX *ctx, BYTE hash[])
133 | {
134 | 	WORD fill = ctx->datalen;
135 | 	WORD word, byte;
136 | 	int shift;
137 | 
138 | 	// The 0x80 marker always follows the last message byte.
139 | 	ctx->data[fill++] = 0x80;
140 | 	if (ctx->datalen >= 56) {
141 | 		// No room for the 8 length bytes: flush this block and continue
142 | 		// with a block whose first 56 bytes are zero.
143 | 		for (; fill < 64; ++fill)
144 | 			ctx->data[fill] = 0x00;
145 | 		cuda_sha1_transform(ctx, ctx->data);
146 | 		memset(ctx->data, 0, 56);
147 | 	}
148 | 	else {
149 | 		for (; fill < 56; ++fill)
150 | 			ctx->data[fill] = 0x00;
151 | 	}
152 | 
153 | 	// Message length in bits goes into data[56..63], most significant byte first.
154 | 	ctx->bitlen += ctx->datalen * 8;
155 | 	for (shift = 0; shift < 64; shift += 8)
156 | 		ctx->data[63 - shift / 8] = ctx->bitlen >> shift;
157 | 	cuda_sha1_transform(ctx, ctx->data);
158 | 
159 | 	// Each state word is written out most significant byte first.
160 | 	for (word = 0; word < 5; ++word)
161 | 		for (byte = 0; byte < 4; ++byte)
162 | 			hash[word * 4 + byte] = (ctx->state[word] >> (24 - byte * 8)) & 0x000000ff;
163 | }
170 | 
171 | /*
172 |  * One thread per message. Thread t hashes the inlen bytes at
173 |  * indata + t * inlen and writes SHA1_BLOCK_SIZE (20) digest bytes to
174 |  * outdata + t * SHA1_BLOCK_SIZE. Indexing uses the x dimension only, and
175 |  * threads with an index >= n_batch exit without touching memory.
176 |  */
177 | __global__ void kernel_sha1_hash(BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch)
178 | {
179 | 	// size_t arithmetic: the 32-bit products thread * inlen and
180 | 	// blockIdx.x * blockDim.x wrap around for large batches.
181 | 	size_t thread = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
182 | 	if (thread >= n_batch)
183 | 	{
184 | 		return;
185 | 	}
186 | 	BYTE* in = indata + thread * (size_t)inlen;
187 | 	BYTE* out = outdata + thread * (size_t)SHA1_BLOCK_SIZE;
188 | 	CUDA_SHA1_CTX ctx;
189 | 	cuda_sha1_init(&ctx);
190 | 	cuda_sha1_update(&ctx, in, inlen);
191 | 	cuda_sha1_final(&ctx, out);
192 | }
185 | 
186 | extern "C"
187 | {
188 | /*
189 |  * Hashes n_batch equally sized messages with SHA1 on the GPU.
190 |  *
191 |  *   in       host buffer with n_batch messages of inlen bytes each, back to back
192 |  *   inlen    length in bytes of every message
193 |  *   out      host buffer receiving n_batch digests of SHA1_BLOCK_SIZE (20) bytes each
194 |  *   n_batch  number of messages; 0 returns immediately
195 |  *
196 |  * The first failing CUDA call is reported on stdout and the remaining steps
197 |  * are skipped, so out is only filled when everything before the copy-back
198 |  * succeeded. Device buffers are released on every path.
199 |  */
200 | void mcm_cuda_sha1_hash_batch(BYTE* in, WORD inlen, BYTE* out, WORD n_batch)
201 | {
202 | 	// Nothing to hash, and a grid of zero blocks is an invalid launch configuration.
203 | 	if (n_batch == 0)
204 | 		return;
205 | 
206 | 	// Byte counts in size_t: WORD * WORD would wrap at 4 GiB.
207 | 	const size_t in_bytes = (size_t)inlen * n_batch;
208 | 	const size_t out_bytes = (size_t)SHA1_BLOCK_SIZE * n_batch;
209 | 	BYTE *cuda_indata = NULL;
210 | 	BYTE *cuda_outdata = NULL;
211 | 
212 | 	cudaError_t error = cudaMalloc(&cuda_indata, in_bytes);
213 | 	if (error == cudaSuccess)
214 | 		error = cudaMalloc(&cuda_outdata, out_bytes);
215 | 	if (error == cudaSuccess)
216 | 		error = cudaMemcpy(cuda_indata, in, in_bytes, cudaMemcpyHostToDevice);
217 | 
218 | 	if (error == cudaSuccess) {
219 | 		WORD thread = 256;
220 | 		// Ceiling division written so that n_batch + thread - 1 cannot overflow.
221 | 		WORD block = n_batch / thread + (n_batch % thread != 0);
222 | 
223 | 		kernel_sha1_hash<<<block, thread>>>(cuda_indata, inlen, cuda_outdata, n_batch);
224 | 		error = cudaGetLastError();       // launch-configuration errors
225 | 	}
226 | 	if (error == cudaSuccess)
227 | 		error = cudaDeviceSynchronize();  // errors raised while the kernel ran
228 | 	if (error == cudaSuccess)
229 | 		error = cudaMemcpy(out, cuda_outdata, out_bytes, cudaMemcpyDeviceToHost);
230 | 
231 | 	if (error != cudaSuccess) {
232 | 		printf("Error cuda sha1 hash: %s \n", cudaGetErrorString(error));
233 | 	}
234 | 	// cudaFree on a null pointer does nothing, so both calls are safe after a failure.
235 | 	cudaFree(cuda_indata);
236 | 	cudaFree(cuda_outdata);
237 | }
238 | }
210 | 


--------------------------------------------------------------------------------
/sha1.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * sha1.cuh CUDA Implementation of SHA1 Hashing    
 3 |  *
 4 |  * Date: 12 June 2019
 5 |  * Revision: 1
 6 |  * 
 7 |  * Based on the public domain Reference Implementation in C, by
 8 |  * Brad Conte, original code here:
 9 |  *
10 |  * https://github.com/B-Con/crypto-algorithms
11 |  *
12 |  * This file is released into the Public Domain.
13 |  */
14 | 
15 |  
16 | #pragma once
17 | #include "config.h"
18 | /*
19 |  * Hashes n_batch messages of inlen bytes each (stored back to back in the
20 |  * host buffer in) with SHA1 on the GPU and writes one 20-byte digest per
21 |  * message, back to back, into the host buffer out.
22 |  *
23 |  * The definition has C linkage; the guard below lets C++ code include this
24 |  * header directly. Including it from inside an extern "C" block still works.
25 |  */
26 | #ifdef __cplusplus
27 | extern "C" {
28 | #endif
29 | void mcm_cuda_sha1_hash_batch(BYTE* in, WORD inlen, BYTE* out, WORD n_batch);
30 | #ifdef __cplusplus
31 | }
32 | #endif
19 | 


--------------------------------------------------------------------------------
/sha256.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * sha256.cu CUDA Implementation of SHA256 Hashing
  3 |  *
  4 |  * Date: 12 June 2019
  5 |  * Revision: 1
  6 |  *
  7 |  * Based on the public domain Reference Implementation in C, by
  8 |  * Brad Conte, original code here:
  9 |  *
 10 |  * https://github.com/B-Con/crypto-algorithms
 11 |  *
 12 |  * This file is released into the Public Domain.
 13 |  */
 14 | 
 15 |  
 16 | /*************************** HEADER FILES ***************************/
 17 | #include <stdlib.h>
 18 | #include <memory.h>
 19 | extern "C" {
 20 | #include "sha256.cuh"
 21 | }
 22 | /****************************** MACROS ******************************/
 23 | #define SHA256_BLOCK_SIZE 32            // SHA256 outputs a 32 byte digest
 24 | 
 25 | /**************************** DATA TYPES ****************************/
 26 | 
 27 | typedef struct {                 // streaming state for one SHA256 computation
 28 | 	BYTE data[64];               // input bytes buffered until a full 64-byte block is available
 29 | 	WORD datalen;                // number of valid bytes currently held in data
 30 | 	unsigned long long bitlen;   // message bits counted so far (grows by 512 per processed block)
 31 | 	WORD state[8];               // chaining values, set by cuda_sha256_init and updated by the transform
 32 | } CUDA_SHA256_CTX;
 33 | 
 34 | /****************************** MACROS ******************************/
 35 | #ifndef ROTLEFT
 36 | #define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b))))   // rotate a left by b bits within 32 bits; expects an unsigned 32-bit a
 37 | #endif
 38 | 
 39 | #define ROTRIGHT(a,b) (((a) >> (b)) | ((a) << (32-(b))))   // rotate a right by b bits within 32 bits
 40 | 
 41 | #define CH(x,y,z) (((x) & (y)) ^ (~(x) & (z)))   // bitwise select: y where x has a 1 bit, z where x has a 0 bit
 42 | #define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))   // bitwise majority of x, y and z
 43 | #define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22))   // applied to a when computing t2 in the round loop
 44 | #define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25))   // applied to e when computing t1 in the round loop
 45 | #define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ ((x) >> 3))   // message-schedule mixing of m[i - 15]
 46 | #define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ ((x) >> 10))   // message-schedule mixing of m[i - 2]
 47 | 
 48 | /**************************** VARIABLES *****************************/
 49 | __constant__ WORD k[64] = {   // per-round additive constants: cuda_sha256_transform adds k[i] in round i
 50 | 	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
 51 | 	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
 52 | 	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
 53 | 	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
 54 | 	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
 55 | 	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
 56 | 	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
 57 | 	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 58 | };
 59 | 
 60 | /*********************** FUNCTION DEFINITIONS ***********************/
 61 | /*
 62 |  * SHA256 compression function: folds one 64-byte block into ctx->state.
 63 |  *
 64 |  *   ctx   context whose state[0..7] is read and updated
 65 |  *   data  the 64 bytes to process
 66 |  */
 67 | __device__  __forceinline__ void cuda_sha256_transform(CUDA_SHA256_CTX *ctx, const BYTE data[])
 68 | {
 69 | 	WORD a, b, c, d, e, f, g, h, i, j, t1, t2, m[64];
 70 | 
 71 | 	// The block is read as sixteen big-endian 32-bit words. Each byte is
 72 | 	// widened to WORD first: shifting a promoted (signed) int left by 24
 73 | 	// would overflow for bytes >= 0x80.
 74 | 	for (i = 0, j = 0; i < 16; ++i, j += 4)
 75 | 		m[i] = ((WORD)data[j] << 24) | ((WORD)data[j + 1] << 16) | ((WORD)data[j + 2] << 8) | ((WORD)data[j + 3]);
 76 | 	// Expand the 16 input words into the 64-word message schedule.
 77 | 	for ( ; i < 64; ++i)
 78 | 		m[i] = SIG1(m[i - 2]) + m[i - 7] + SIG0(m[i - 15]) + m[i - 16];
 79 | 
 80 | 	a = ctx->state[0];
 81 | 	b = ctx->state[1];
 82 | 	c = ctx->state[2];
 83 | 	d = ctx->state[3];
 84 | 	e = ctx->state[4];
 85 | 	f = ctx->state[5];
 86 | 	g = ctx->state[6];
 87 | 	h = ctx->state[7];
 88 | 
 89 | 	// 64 rounds; the working variables shift down by one each round.
 90 | 	for (i = 0; i < 64; ++i) {
 91 | 		t1 = h + EP1(e) + CH(e,f,g) + k[i] + m[i];
 92 | 		t2 = EP0(a) + MAJ(a,b,c);
 93 | 		h = g;
 94 | 		g = f;
 95 | 		f = e;
 96 | 		e = d + t1;
 97 | 		d = c;
 98 | 		c = b;
 99 | 		b = a;
100 | 		a = t1 + t2;
101 | 	}
102 | 
103 | 	// Feed-forward: add the block's result to the previous chaining values.
104 | 	ctx->state[0] += a;
105 | 	ctx->state[1] += b;
106 | 	ctx->state[2] += c;
107 | 	ctx->state[3] += d;
108 | 	ctx->state[4] += e;
109 | 	ctx->state[5] += f;
110 | 	ctx->state[6] += g;
111 | 	ctx->state[7] += h;
112 | }
101 | 
102 | /* Prepares ctx for a new message: empty buffer, zero bit count, initial chaining values. */
103 | __device__ void cuda_sha256_init(CUDA_SHA256_CTX *ctx)
104 | {
105 | 	const WORD initial[8] = {
106 | 		0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
107 | 		0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
108 | 	};
109 | 	int w;
110 | 
111 | 	for (w = 0; w < 8; ++w)
112 | 		ctx->state[w] = initial[w];
113 | 	ctx->bitlen = 0;
114 | 	ctx->datalen = 0;
115 | }
115 | 
116 | /*
117 |  * Appends len bytes from data to the message. Each completed 64-byte block
118 |  * is transformed immediately and adds 512 to the bit count.
119 |  *
120 |  *   ctx   context initialised with cuda_sha256_init
121 |  *   data  bytes to append
122 |  *   len   number of bytes in data
123 |  */
124 | __device__ void cuda_sha256_update(CUDA_SHA256_CTX *ctx, const BYTE data[], size_t len)
125 | {
126 | 	// The index has the same width as len: a 32-bit counter would wrap and
127 | 	// never reach len once len is 2^32 or more.
128 | 	size_t i;
129 | 
130 | 	for (i = 0; i < len; ++i) {
131 | 		ctx->data[ctx->datalen] = data[i];
132 | 		ctx->datalen++;
133 | 		if (ctx->datalen == 64) {
134 | 			cuda_sha256_transform(ctx, ctx->data);
135 | 			ctx->bitlen += 512;
136 | 			ctx->datalen = 0;
137 | 		}
138 | 	}
139 | }
130 | 
131 | /*
132 |  * Finishes the message: pads the buffered bytes, appends the bit length,
133 |  * runs the last transform(s) and writes the 32-byte digest to hash.
134 |  */
135 | __device__ void cuda_sha256_final(CUDA_SHA256_CTX *ctx, BYTE hash[])
136 | {
137 | 	WORD fill = ctx->datalen;
138 | 	WORD word, byte;
139 | 	int shift;
140 | 
141 | 	// The 0x80 marker always follows the last message byte.
142 | 	ctx->data[fill++] = 0x80;
143 | 	if (ctx->datalen >= 56) {
144 | 		// No room for the 8 length bytes: flush this block and continue
145 | 		// with a block whose first 56 bytes are zero.
146 | 		for (; fill < 64; ++fill)
147 | 			ctx->data[fill] = 0x00;
148 | 		cuda_sha256_transform(ctx, ctx->data);
149 | 		memset(ctx->data, 0, 56);
150 | 	}
151 | 	else {
152 | 		for (; fill < 56; ++fill)
153 | 			ctx->data[fill] = 0x00;
154 | 	}
155 | 
156 | 	// Message length in bits goes into data[56..63], most significant byte first.
157 | 	ctx->bitlen += ctx->datalen * 8;
158 | 	for (shift = 0; shift < 64; shift += 8)
159 | 		ctx->data[63 - shift / 8] = ctx->bitlen >> shift;
160 | 	cuda_sha256_transform(ctx, ctx->data);
161 | 
162 | 	// Each state word is written out most significant byte first.
163 | 	for (word = 0; word < 8; ++word)
164 | 		for (byte = 0; byte < 4; ++byte)
165 | 			hash[word * 4 + byte] = (ctx->state[word] >> (24 - byte * 8)) & 0x000000ff;
166 | }
176 | 
177 | /*
178 |  * One thread per message. Thread t hashes the inlen bytes at
179 |  * indata + t * inlen and writes SHA256_BLOCK_SIZE (32) digest bytes to
180 |  * outdata + t * SHA256_BLOCK_SIZE. Indexing uses the x dimension only, and
181 |  * threads with an index >= n_batch exit without touching memory.
182 |  */
183 | __global__ void kernel_sha256_hash(BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch)
184 | {
185 | 	// size_t arithmetic: the 32-bit products thread * inlen and
186 | 	// blockIdx.x * blockDim.x wrap around for large batches.
187 | 	size_t thread = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
188 | 	if (thread >= n_batch)
189 | 	{
190 | 		return;
191 | 	}
192 | 	BYTE* in = indata + thread * (size_t)inlen;
193 | 	BYTE* out = outdata + thread * (size_t)SHA256_BLOCK_SIZE;
194 | 	CUDA_SHA256_CTX ctx;
195 | 	cuda_sha256_init(&ctx);
196 | 	cuda_sha256_update(&ctx, in, inlen);
197 | 	cuda_sha256_final(&ctx, out);
198 | }
191 | 
192 | extern "C"
193 | {
194 | /*
195 |  * Hashes n_batch equally sized messages with SHA256 on the GPU.
196 |  *
197 |  *   in       host buffer with n_batch messages of inlen bytes each, back to back
198 |  *   inlen    length in bytes of every message
199 |  *   out      host buffer receiving n_batch digests of SHA256_BLOCK_SIZE (32) bytes each
200 |  *   n_batch  number of messages; 0 returns immediately
201 |  *
202 |  * The first failing CUDA call is reported on stdout and the remaining steps
203 |  * are skipped, so out is only filled when everything before the copy-back
204 |  * succeeded. Device buffers are released on every path.
205 |  */
206 | void mcm_cuda_sha256_hash_batch(BYTE* in, WORD inlen, BYTE* out, WORD n_batch)
207 | {
208 | 	// Nothing to hash, and a grid of zero blocks is an invalid launch configuration.
209 | 	if (n_batch == 0)
210 | 		return;
211 | 
212 | 	// Byte counts in size_t: WORD * WORD would wrap at 4 GiB.
213 | 	const size_t in_bytes = (size_t)inlen * n_batch;
214 | 	const size_t out_bytes = (size_t)SHA256_BLOCK_SIZE * n_batch;
215 | 	BYTE *cuda_indata = NULL;
216 | 	BYTE *cuda_outdata = NULL;
217 | 
218 | 	cudaError_t error = cudaMalloc(&cuda_indata, in_bytes);
219 | 	if (error == cudaSuccess)
220 | 		error = cudaMalloc(&cuda_outdata, out_bytes);
221 | 	if (error == cudaSuccess)
222 | 		error = cudaMemcpy(cuda_indata, in, in_bytes, cudaMemcpyHostToDevice);
223 | 
224 | 	if (error == cudaSuccess) {
225 | 		WORD thread = 256;
226 | 		// Ceiling division written so that n_batch + thread - 1 cannot overflow.
227 | 		WORD block = n_batch / thread + (n_batch % thread != 0);
228 | 
229 | 		kernel_sha256_hash<<<block, thread>>>(cuda_indata, inlen, cuda_outdata, n_batch);
230 | 		error = cudaGetLastError();       // launch-configuration errors
231 | 	}
232 | 	if (error == cudaSuccess)
233 | 		error = cudaDeviceSynchronize();  // errors raised while the kernel ran
234 | 	if (error == cudaSuccess)
235 | 		error = cudaMemcpy(out, cuda_outdata, out_bytes, cudaMemcpyDeviceToHost);
236 | 
237 | 	if (error != cudaSuccess) {
238 | 		printf("Error cuda sha256 hash: %s \n", cudaGetErrorString(error));
239 | 	}
240 | 	// cudaFree on a null pointer does nothing, so both calls are safe after a failure.
241 | 	cudaFree(cuda_indata);
242 | 	cudaFree(cuda_outdata);
243 | }
244 | }
216 | 


--------------------------------------------------------------------------------
/sha256.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * sha256.cuh CUDA Implementation of SHA256 Hashing    
 3 |  *
 4 |  * Date: 12 June 2019
 5 |  * Revision: 1
 6 |  * 
 7 |  * Based on the public domain Reference Implementation in C, by
 8 |  * Brad Conte, original code here:
 9 |  *
10 |  * https://github.com/B-Con/crypto-algorithms
11 |  *
12 |  * This file is released into the Public Domain.
13 |  */
14 | 
15 | 
16 | #pragma once
17 | #include "config.h"
18 | /*
19 |  * Hashes n_batch messages of inlen bytes each (stored back to back in the
20 |  * host buffer in) with SHA256 on the GPU and writes one 32-byte digest per
21 |  * message, back to back, into the host buffer out.
22 |  *
23 |  * The definition has C linkage; the guard below lets C++ code include this
24 |  * header directly. Including it from inside an extern "C" block still works.
25 |  */
26 | #ifdef __cplusplus
27 | extern "C" {
28 | #endif
29 | void mcm_cuda_sha256_hash_batch(BYTE* in, WORD inlen, BYTE* out, WORD n_batch);
30 | #ifdef __cplusplus
31 | }
32 | #endif
19 | 


--------------------------------------------------------------------------------