├── README.md ├── blake2b.cu ├── blake2b.cuh ├── config.h ├── keccak.cu ├── keccak.cuh ├── md2.cu ├── md2.cuh ├── md5.cu ├── md5.cuh ├── sha1.cu ├── sha1.cuh ├── sha256.cu └── sha256.cuh /README.md: -------------------------------------------------------------------------------- 1 | # CUDA Hashing Algorithms Collection 2 | #### Author: Matt Zweil & The Mochimo Core Contributor Team 3 | 4 | This repository of CUDA Hash functions is released into the Public Domain. 5 | 6 | To use any of the associated hashing functions, please include the config.h header file. 7 | 8 | Alternatively, you can declare these three typedefs yourself and omit the config.h file: 9 | 10 | ``` 11 | typedef unsigned char BYTE; 12 | typedef unsigned int WORD; 13 | typedef unsigned long long LONG; 14 | ``` 15 | 16 | Special thanks to Brad Conte for his original public domain C implementations of some of these algorithms. 17 | -------------------------------------------------------------------------------- /blake2b.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * blake2b.cu CUDA Implementation of BLAKE2B Hashing 3 | * 4 | * Date: 12 June 2019 5 | * Revision: 1 6 | * 7 | * This file is released into the Public Domain. 
*/


/* NOTE(review): the header name was lost in this listing; <assert.h> is
 * reconstructed from the assert() calls below - confirm against the repository. */
#include <assert.h>
extern "C"
{
#include "blake2b.cuh"
}
#define BLAKE2B_ROUNDS 12
#define BLAKE2B_BLOCK_LENGTH 128
#define BLAKE2B_KEY_LENGTH 64
#define BLAKE2B_CHAIN_SIZE 8
#define BLAKE2B_CHAIN_LENGTH (BLAKE2B_CHAIN_SIZE * sizeof(int64_t))
#define BLAKE2B_STATE_SIZE 16
#define BLAKE2B_STATE_LENGTH (BLAKE2B_STATE_SIZE * sizeof(int64_t))
extern "C"
{
    /* Streaming BLAKE2b context shared by the host-side and device-side code. */
    typedef struct {

        WORD digestlen;                     /* digest length in bytes */
        BYTE key[BLAKE2B_KEY_LENGTH];       /* copy of the caller's key */
        WORD keylen;                        /* key length in bytes */

        BYTE buff[BLAKE2B_BLOCK_LENGTH];    /* bytes buffered until a block is compressed */
        int64_t chain[BLAKE2B_CHAIN_SIZE];  /* chaining value; the digest is read from here */
        int64_t state[BLAKE2B_STATE_SIZE];  /* working vector of the compression function */

        WORD pos;                           /* number of bytes currently held in buff */
        LONG t0;                            /* byte counter, low word */
        LONG t1;                            /* byte counter, high word */
        LONG f0;                            /* set to all ones before the last compression */

    } cuda_blake2b_ctx_t;
}
typedef cuda_blake2b_ctx_t CUDA_BLAKE2B_CTX;

/* Context precomputed on the host by cpu_blake2b_init(); every thread starts from a copy. */
__constant__ CUDA_BLAKE2B_CTX c_CTX;

/* Initialisation vector, device copy. */
__constant__ LONG BLAKE2B_IVS[8] =
{
    0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b,
    0xa54ff53a5f1d36f1, 0x510e527fade682d1, 0x9b05688c2b3e6c1f,
    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
};

/* Initialisation vector, host copy used by cpu_blake2b_init(). */
const LONG CPU_BLAKE2B_IVS[8] =
{
    0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b,
    0xa54ff53a5f1d36f1, 0x510e527fade682d1, 0x9b05688c2b3e6c1f,
    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
};

/*
 * Host-side context initialisation.
 *
 * ctx          context to initialise (fully overwritten)
 * key          key bytes; may be NULL when keylen is 0
 * keylen       key length in bytes, at most BLAKE2B_KEY_LENGTH
 * digestbitlen requested digest length in bits
 *
 * The key is placed in the block buffer and pos is set to a full block, so
 * the key block is compressed by the first cuda_blake2b_update() call.
 *
 * NOTE(review): pos is set to a full block even when keylen is 0, so an
 * all-zero block is compressed before the message in the unkeyed case. That
 * looks different from RFC 7693, which only processes a key block when a key
 * is present - confirm whether this is intentional before changing it, since
 * a fix would alter every unkeyed digest produced so far.
 */
void cpu_blake2b_init(cuda_blake2b_ctx_t *ctx, BYTE* key, WORD keylen, WORD digestbitlen)
{
    /* key[] holds BLAKE2B_KEY_LENGTH bytes; a longer key would overflow it. */
    assert(keylen <= BLAKE2B_KEY_LENGTH);

    /* Zeroes pos, t0, t1 and f0 along with everything else. */
    memset(ctx, 0, sizeof(cuda_blake2b_ctx_t));
    if (keylen > 0)
    {
        memcpy(ctx->buff, key, keylen);
        memcpy(ctx->key, key, keylen);
    }
    ctx->keylen = keylen;
    ctx->digestlen = digestbitlen >> 3;

    /* Parameter word: digest length, key length, fanout = 1 and depth = 1. */
    ctx->chain[0] = CPU_BLAKE2B_IVS[0] ^ (ctx->digestlen | (ctx->keylen << 8) | 0x1010000);
    for (int i = 1; i < BLAKE2B_CHAIN_SIZE; i++)
        ctx->chain[i] = CPU_BLAKE2B_IVS[i];

    ctx->pos = BLAKE2B_BLOCK_LENGTH;
}



/* Message word permutation used by each round. */
__constant__ unsigned char BLAKE2B_SIGMAS[12][16] =
{
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
    { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
    { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
    { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
    { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
    { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
    { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
    { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
    { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
    { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
    { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};

/* Reads 8 bytes from `in` as a 64-bit word in the memory's native byte order. */
__device__ LONG cuda_blake2b_leuint64(BYTE *in)
{
    LONG a;
    memcpy(&a, in, 8);
    return a;

    /* If memory is not little endian
    BYTE *a = (BYTE *)in;
    return ((LONG)(a[0]) << 0) | ((LONG)(a[1]) << 8) | ((LONG)(a[2]) << 16) | ((LONG)(a[3]) << 24) |((LONG)(a[4]) << 32)
        | ((LONG)(a[5]) << 40) | ((LONG)(a[6]) << 48) | ((LONG)(a[7]) << 56);
    */
}

/* Rotates `a` right by `b` bits; callers in this file pass 16, 24, 32 or 63. */
__device__ LONG cuda_blake2b_ROTR64(LONG a, BYTE b)
{
    return (a >> b) | (a << (64 - b));
}

/* Mixing function G: mixes message words m1 and m2 into state words a, b, c, d. */
__device__ void cuda_blake2b_G(cuda_blake2b_ctx_t *ctx, int64_t m1, int64_t m2, int32_t a, int32_t b, int32_t c, int32_t d)
{
    ctx->state[a] = ctx->state[a] + ctx->state[b] + m1;
    ctx->state[d] = cuda_blake2b_ROTR64(ctx->state[d] ^ ctx->state[a], 32);
    ctx->state[c] = ctx->state[c] + ctx->state[d];
    ctx->state[b] = cuda_blake2b_ROTR64(ctx->state[b] ^ ctx->state[c], 24);
    ctx->state[a] = ctx->state[a] + ctx->state[b] + m2;
    ctx->state[d] = cuda_blake2b_ROTR64(ctx->state[d] ^ ctx->state[a], 16);
    ctx->state[c] = ctx->state[c] + ctx->state[d];
    ctx->state[b] = cuda_blake2b_ROTR64(ctx->state[b] ^ ctx->state[c], 63);
}

/* Builds the 16-word working vector from the chaining value, IV, counters and final flag. */
__device__ __forceinline__ void cuda_blake2b_init_state(cuda_blake2b_ctx_t *ctx)
{
    memcpy(ctx->state, ctx->chain, BLAKE2B_CHAIN_LENGTH);
    for (int i = 0; i < 4; i++)
        ctx->state[BLAKE2B_CHAIN_SIZE + i] = BLAKE2B_IVS[i];

    ctx->state[12] = ctx->t0 ^ BLAKE2B_IVS[4];
    ctx->state[13] = ctx->t1 ^ BLAKE2B_IVS[5];
    ctx->state[14] = ctx->f0 ^ BLAKE2B_IVS[6];
    ctx->state[15] = BLAKE2B_IVS[7];
}

/*
 * Compresses the 128-byte block at in + inoffset into ctx->chain.
 * The counters t0/t1 and the flag f0 must already be updated by the caller.
 */
__device__ __forceinline__ void cuda_blake2b_compress(cuda_blake2b_ctx_t *ctx, BYTE* in, WORD inoffset)
{
    cuda_blake2b_init_state(ctx);

    LONG m[16] = {0};
    for (int j = 0; j < 16; j++)
        m[j] = cuda_blake2b_leuint64(in + inoffset + (j << 3));

    for (int round = 0; round < BLAKE2B_ROUNDS; round++)
    {
        /* Column step. */
        cuda_blake2b_G(ctx, m[BLAKE2B_SIGMAS[round][0]], m[BLAKE2B_SIGMAS[round][1]], 0, 4, 8, 12);
        cuda_blake2b_G(ctx, m[BLAKE2B_SIGMAS[round][2]], m[BLAKE2B_SIGMAS[round][3]], 1, 5, 9, 13);
        cuda_blake2b_G(ctx, m[BLAKE2B_SIGMAS[round][4]], m[BLAKE2B_SIGMAS[round][5]], 2, 6, 10, 14);
        cuda_blake2b_G(ctx, m[BLAKE2B_SIGMAS[round][6]], m[BLAKE2B_SIGMAS[round][7]], 3, 7, 11, 15);
        /* Diagonal step. */
        cuda_blake2b_G(ctx, m[BLAKE2B_SIGMAS[round][8]], m[BLAKE2B_SIGMAS[round][9]], 0, 5, 10, 15);
        cuda_blake2b_G(ctx, m[BLAKE2B_SIGMAS[round][10]], m[BLAKE2B_SIGMAS[round][11]], 1, 6, 11, 12);
        cuda_blake2b_G(ctx, m[BLAKE2B_SIGMAS[round][12]], m[BLAKE2B_SIGMAS[round][13]], 2, 7, 8, 13);
        cuda_blake2b_G(ctx, m[BLAKE2B_SIGMAS[round][14]], m[BLAKE2B_SIGMAS[round][15]], 3, 4, 9, 14);
    }

    for (int offset = 0; offset < BLAKE2B_CHAIN_SIZE; offset++)
        ctx->chain[offset] = ctx->chain[offset] ^ ctx->state[offset] ^ ctx->state[offset + 8];
}

/*
 * Device-side equivalent of cpu_blake2b_init(); same parameters and the same
 * precondition keylen <= BLAKE2B_KEY_LENGTH. The kernel in this file does not
 * call it because it starts from the precomputed c_CTX instead.
 */
__device__ void cuda_blake2b_init(cuda_blake2b_ctx_t *ctx, BYTE* key, WORD keylen, WORD digestbitlen)
{
    /* Zeroes pos, t0, t1 and f0 along with everything else. */
    memset(ctx, 0, sizeof(cuda_blake2b_ctx_t));

    ctx->keylen = keylen;
    ctx->digestlen = digestbitlen >> 3;
    ctx->chain[0] = BLAKE2B_IVS[0] ^ (ctx->digestlen | (ctx->keylen << 8) | 0x1010000);
    for (int i = 1; i < BLAKE2B_CHAIN_SIZE; i++)
        ctx->chain[i] = BLAKE2B_IVS[i];

    if (keylen > 0)
    {
        memcpy(ctx->buff, key, keylen);
        memcpy(ctx->key, key, keylen);
    }
    ctx->pos = BLAKE2B_BLOCK_LENGTH;
}

/*
 * Absorbs inlen bytes from `in`. Full blocks are compressed as they become
 * available; the trailing bytes (always at least one) stay buffered so that
 * cuda_blake2b_final() can compress the last block with the final flag set.
 */
__device__ void cuda_blake2b_update(cuda_blake2b_ctx_t *ctx, BYTE* in, LONG inlen)
{
    if (inlen == 0)
        return;

    WORD start = 0;
    int64_t in_index = 0, block_index = 0;

    if (ctx->pos)
    {
        /* Bytes still needed to complete the buffered block. */
        start = BLAKE2B_BLOCK_LENGTH - ctx->pos;
        if (start < inlen)
        {
            memcpy(ctx->buff + ctx->pos, in, start);
            ctx->t0 += BLAKE2B_BLOCK_LENGTH;

            /* t0 only ever grows by whole blocks here, so a wrap lands exactly on 0. */
            if (ctx->t0 == 0) ctx->t1++;

            cuda_blake2b_compress(ctx, ctx->buff, 0);
            ctx->pos = 0;
            memset(ctx->buff, 0, BLAKE2B_BLOCK_LENGTH);
        }
        else
        {
            /* Everything fits in the buffer: read the whole *in and wait for more. */
            memcpy(ctx->buff + ctx->pos, in, inlen);
            ctx->pos += inlen;
            return;
        }
    }

    /* Negative when fewer than one block remains, which skips the loop. */
    block_index = (int64_t)inlen - BLAKE2B_BLOCK_LENGTH;
    for (in_index = start; in_index < block_index; in_index += BLAKE2B_BLOCK_LENGTH)
    {
        ctx->t0 += BLAKE2B_BLOCK_LENGTH;
        if (ctx->t0 == 0)
            ctx->t1++;

        cuda_blake2b_compress(ctx, in, in_index);
    }

    memcpy(ctx->buff, in + in_index, inlen - in_index);
    ctx->pos += inlen - in_index;
}

/*
 * Compresses the buffered tail as the last block and writes ctx->digestlen
 * bytes of digest to `out`. At most BLAKE2B_CHAIN_LENGTH (64) bytes are
 * written, whatever digestlen says.
 */
__device__ void cuda_blake2b_final(cuda_blake2b_ctx_t *ctx, BYTE* out)
{
    ctx->f0 = 0xFFFFFFFFFFFFFFFFL;
    ctx->t0 += ctx->pos;
    /* Unsigned wrap-around leaves t0 smaller than the amount just added. */
    if (ctx->t0 < ctx->pos)
        ctx->t1++;

    cuda_blake2b_compress(ctx, ctx->buff, 0);
    memset(ctx->buff, 0, BLAKE2B_BLOCK_LENGTH);
    memset(ctx->state, 0, BLAKE2B_STATE_LENGTH);

    for (WORD i = 0; i < BLAKE2B_CHAIN_SIZE; i++)
    {
        WORD offset = i * 8;
        if (offset >= ctx->digestlen)
            break;

        /* Never copy past digestlen, including for digests shorter than 8 bytes. */
        WORD remaining = ctx->digestlen - offset;
        memcpy(out + offset, (BYTE*)(&ctx->chain[i]), remaining < 8 ? remaining : 8);
    }
}

/*
 * One thread hashes one message.
 *
 * indata             n_batch messages of inlen bytes each, stored back to back
 * outdata            n_batch digests of BLAKE2B_BLOCK_SIZE bytes each
 * BLAKE2B_BLOCK_SIZE digest length in bytes (despite the name)
 *
 * Expects a 1D launch with at least n_batch threads; surplus threads return.
 */
__global__ void kernel_blake2b_hash(BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch, WORD BLAKE2B_BLOCK_SIZE)
{
    WORD thread = blockIdx.x * blockDim.x + threadIdx.x;
    if (thread >= n_batch)
    {
        return;
    }
    /* Offsets are widened first: thread * inlen can exceed 32 bits. */
    BYTE* in = indata + (size_t)thread * inlen;
    BYTE* out = outdata + (size_t)thread * BLAKE2B_BLOCK_SIZE;
    CUDA_BLAKE2B_CTX ctx = c_CTX;
    //if not precomputed CTX, call cuda_blake2b_init() with key
    cuda_blake2b_update(&ctx, in, inlen);
    cuda_blake2b_final(&ctx, out);
}
extern "C"
{
    /*
     * Hashes n_batch messages of inlen bytes each with keyed BLAKE2b on the GPU.
     *
     * key      key bytes, keylen <= BLAKE2B_KEY_LENGTH (64)
     * in       host buffer of n_batch * inlen bytes
     * out      host buffer receiving n_batch digests of n_outbit / 8 bytes
     * n_outbit digest length in bits; the device code writes at most 64 bytes
     *          per digest
     *
     * Failures are reported on stdout and leave `out` unspecified; the
     * function has no return value to signal them.
     */
    void mcm_cuda_blake2b_hash_batch(BYTE *key, WORD keylen, BYTE *in, WORD inlen, BYTE *out, WORD n_outbit, WORD n_batch)
    {
        BYTE * cuda_indata = NULL;
        BYTE * cuda_outdata = NULL;
        const WORD BLAKE2B_BLOCK_SIZE = (n_outbit >> 3);
        /* Computed in size_t: the WORD product wraps for large batches. */
        const size_t in_bytes = (size_t)inlen * n_batch;
        const size_t out_bytes = (size_t)BLAKE2B_BLOCK_SIZE * n_batch;
        cudaError_t error = cudaSuccess;

        /* The context's key buffer holds 64 bytes, not 128; also reject in
         * release builds, where assert() compiles to nothing. */
        assert(keylen <= BLAKE2B_KEY_LENGTH); // we must define keylen at host
        if (keylen > BLAKE2B_KEY_LENGTH)
        {
            printf("Error cuda blake2b hash: %s \n", "key length exceeds 64 bytes");
            return;
        }
        /* Nothing to hash; a zero-block launch would be an invalid configuration. */
        if (n_batch == 0)
            return;

        CUDA_BLAKE2B_CTX ctx;
        cpu_blake2b_init(&ctx, key, keylen, n_outbit);

        WORD thread = 256;
        /* Ceiling division without the n_batch + thread - 1 overflow. */
        WORD block = n_batch / thread + (n_batch % thread != 0);

        /* Each step runs only if every earlier one succeeded. */
        error = cudaMalloc(&cuda_indata, in_bytes);
        if (error == cudaSuccess)
            error = cudaMalloc(&cuda_outdata, out_bytes);
        if (error == cudaSuccess)
            error = cudaMemcpy(cuda_indata, in, in_bytes, cudaMemcpyHostToDevice);
        if (error == cudaSuccess)
            error = cudaMemcpyToSymbol(c_CTX, &ctx, sizeof(CUDA_BLAKE2B_CTX), 0, cudaMemcpyHostToDevice);
        if (error == cudaSuccess)
        {
            kernel_blake2b_hash << < block, thread >> > (cuda_indata, inlen, cuda_outdata, n_batch, BLAKE2B_BLOCK_SIZE);
            /* Launch-configuration errors only show up here. */
            error = cudaGetLastError();
        }
        /* Blocking copy: it waits for the kernel and reports its execution errors. */
        if (error == cudaSuccess)
            error = cudaMemcpy(out, cuda_outdata, out_bytes, cudaMemcpyDeviceToHost);
        if (error == cudaSuccess)
            error = cudaDeviceSynchronize();
        if (error != cudaSuccess)
        {
            printf("Error cuda blake2b hash: %s \n", cudaGetErrorString(error));
        }
        /* cudaFree(NULL) is a no-op, so this is safe after a failed allocation. */
        cudaFree(cuda_indata);
        cudaFree(cuda_outdata);
    }
}
--------------------------------------------------------------------------------
/blake2b.cuh:
--------------------------------------------------------------------------------
/*
 * blake2b.cuh CUDA Implementation of BLAKE2B Hashing
 *
 * Date: 12 June 2019
 * Revision: 1
 *
 * This file is released into the Public Domain.
 */


#pragma once
#include "config.h"
/* Hashes n_batch inputs of inlen bytes each with keyed BLAKE2b (keylen <= 64);
 * writes n_batch digests of n_outbit / 8 bytes each to out. */
void mcm_cuda_blake2b_hash_batch(BYTE* key, WORD keylen, BYTE * in, WORD inlen, BYTE * out, WORD n_outbit, WORD n_batch);
--------------------------------------------------------------------------------
/config.h:
--------------------------------------------------------------------------------
/*
 * Type Definitions for CUDA Hashing Algos
 *
 * Date: 12 June 2019
 * Revision: 1
 *
 * This file is released into the Public Domain.
 */

#pragma once
#define USE_MD2 1
#define USE_MD5 1
#define USE_SHA1 1
#define USE_SHA256 1

#define CUDA_HASH 1
#define OCL_HASH 0

typedef unsigned char BYTE;
typedef unsigned int WORD;
typedef unsigned long long LONG;

/* NOTE(review): the three header names were lost in this listing. These are
 * reconstructed from what the sources use (int64_t, printf, memcpy/memset) -
 * confirm against the repository. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
--------------------------------------------------------------------------------
/keccak.cu:
--------------------------------------------------------------------------------
/*
 * keccak.cu Implementation of Keccak/SHA3 digest
 *
 * Date: 12 June 2019
 * Revision: 1
 *
 * This file is released into the Public Domain.
*/


extern "C"
{
#include "keccak.cuh"
}

#define KECCAK_ROUND 24
#define KECCAK_STATE_SIZE 25
#define KECCAK_Q_SIZE 192

/* Round constants XORed into lane 0 in the iota step, one per round. */
__constant__ LONG CUDA_KECCAK_CONSTS[24] = { 0x0000000000000001, 0x0000000000008082,
    0x800000000000808a, 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, 0x8000000080008081,
    0x8000000000008009, 0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
    0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003, 0x8000000000008002,
    0x8000000000000080, 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, 0x8000000000008080,
    0x0000000080000001, 0x8000000080008008 };

/* Streaming Keccak / SHA-3 sponge context. */
typedef struct {

    BYTE sha3_flag;                     /* nonzero: apply the SHA-3 domain suffix in final */
    WORD digestbitlen;                  /* digest length in bits */
    LONG rate_bits;                     /* sponge rate in bits: 1600 - 2 * digestbitlen */
    LONG rate_BYTEs;                    /* sponge rate in bytes */
    LONG absorb_round;                  /* number of 64-bit lanes XORed per absorbed block */

    int64_t state[KECCAK_STATE_SIZE];   /* the 25-lane permutation state */
    BYTE q[KECCAK_Q_SIZE];              /* queue: pending input, later squeezed output */

    LONG bits_in_queue;                 /* number of valid bits in q */

} cuda_keccak_ctx_t;
typedef cuda_keccak_ctx_t CUDA_KECCAK_CTX;

/* Reads 8 bytes from `in` as a 64-bit word in the memory's native byte order. */
__device__ LONG cuda_keccak_leuint64(void *in)
{
    LONG a;
    memcpy(&a, in, 8);
    return a;
}

/* Smaller of two signed values. */
__device__ int64_t cuda_keccak_MIN(int64_t a, int64_t b)
{
    if (a > b) return b;
    return a;
}

/* Smaller of two unsigned values. */
__device__ LONG cuda_keccak_UMIN(LONG a, LONG b)
{
    if (a > b) return b;
    return a;
}

/* Copies the rate portion of the state into the queue q, one lane at a time. */
__device__ void cuda_keccak_extract(cuda_keccak_ctx_t *ctx)
{
    LONG len = ctx->rate_bits >> 6;
    int64_t a;
    int s = sizeof(LONG);

    for (LONG i = 0; i < len; i++) {
        a = cuda_keccak_leuint64((int64_t*)&ctx->state[i]);
        memcpy(ctx->q + (i * s), &a, s);
    }
}

/* Rotates `a` left by `b` bits; callers in this file pass 1..62, never 0 or 64. */
__device__ __forceinline__ LONG cuda_keccak_ROTL64(LONG a, LONG b)
{
    return (a << b) | (a >> (64 - b));
}

/*
 * Applies the KECCAK_ROUND-round permutation to ctx->state in place.
 * Statement order inside each step matters: rho/pi and chi overwrite lanes
 * that later statements still read through the saved temporaries.
 */
__device__ void cuda_keccak_permutations(cuda_keccak_ctx_t * ctx)
{

    int64_t* A = ctx->state;

    int64_t *a00 = A, *a01 = A + 1, *a02 = A + 2, *a03 = A + 3, *a04 = A + 4;
    int64_t *a05 = A + 5, *a06 = A + 6, *a07 = A + 7, *a08 = A + 8, *a09 = A + 9;
    int64_t *a10 = A + 10, *a11 = A + 11, *a12 = A + 12, *a13 = A + 13, *a14 = A + 14;
    int64_t *a15 = A + 15, *a16 = A + 16, *a17 = A + 17, *a18 = A + 18, *a19 = A + 19;
    int64_t *a20 = A + 20, *a21 = A + 21, *a22 = A + 22, *a23 = A + 23, *a24 = A + 24;

    for (int i = 0; i < KECCAK_ROUND; i++) {

        /* Theta */
        int64_t c0 = *a00 ^ *a05 ^ *a10 ^ *a15 ^ *a20;
        int64_t c1 = *a01 ^ *a06 ^ *a11 ^ *a16 ^ *a21;
        int64_t c2 = *a02 ^ *a07 ^ *a12 ^ *a17 ^ *a22;
        int64_t c3 = *a03 ^ *a08 ^ *a13 ^ *a18 ^ *a23;
        int64_t c4 = *a04 ^ *a09 ^ *a14 ^ *a19 ^ *a24;

        int64_t d1 = cuda_keccak_ROTL64(c1, 1) ^ c4;
        int64_t d2 = cuda_keccak_ROTL64(c2, 1) ^ c0;
        int64_t d3 = cuda_keccak_ROTL64(c3, 1) ^ c1;
        int64_t d4 = cuda_keccak_ROTL64(c4, 1) ^ c2;
        int64_t d0 = cuda_keccak_ROTL64(c0, 1) ^ c3;

        *a00 ^= d1;
        *a05 ^= d1;
        *a10 ^= d1;
        *a15 ^= d1;
        *a20 ^= d1;
        *a01 ^= d2;
        *a06 ^= d2;
        *a11 ^= d2;
        *a16 ^= d2;
        *a21 ^= d2;
        *a02 ^= d3;
        *a07 ^= d3;
        *a12 ^= d3;
        *a17 ^= d3;
        *a22 ^= d3;
        *a03 ^= d4;
        *a08 ^= d4;
        *a13 ^= d4;
        *a18 ^= d4;
        *a23 ^= d4;
        *a04 ^= d0;
        *a09 ^= d0;
        *a14 ^= d0;
        *a19 ^= d0;
        *a24 ^= d0;

        /* Rho pi */
        c1 = cuda_keccak_ROTL64(*a01, 1);
        *a01 = cuda_keccak_ROTL64(*a06, 44);
        *a06 = cuda_keccak_ROTL64(*a09, 20);
        *a09 = cuda_keccak_ROTL64(*a22, 61);
        *a22 = cuda_keccak_ROTL64(*a14, 39);
        *a14 = cuda_keccak_ROTL64(*a20, 18);
        *a20 = cuda_keccak_ROTL64(*a02, 62);
        *a02 = cuda_keccak_ROTL64(*a12, 43);
        *a12 = cuda_keccak_ROTL64(*a13, 25);
        *a13 = cuda_keccak_ROTL64(*a19, 8);
        *a19 = cuda_keccak_ROTL64(*a23, 56);
        *a23 = cuda_keccak_ROTL64(*a15, 41);
        *a15 = cuda_keccak_ROTL64(*a04, 27);
        *a04 = cuda_keccak_ROTL64(*a24, 14);
        *a24 = cuda_keccak_ROTL64(*a21, 2);
        *a21 = cuda_keccak_ROTL64(*a08, 55);
        *a08 = cuda_keccak_ROTL64(*a16, 45);
        *a16 = cuda_keccak_ROTL64(*a05, 36);
        *a05 = cuda_keccak_ROTL64(*a03, 28);
        *a03 = cuda_keccak_ROTL64(*a18, 21);
        *a18 = cuda_keccak_ROTL64(*a17, 15);
        *a17 = cuda_keccak_ROTL64(*a11, 10);
        *a11 = cuda_keccak_ROTL64(*a07, 6);
        *a07 = cuda_keccak_ROTL64(*a10, 3);
        *a10 = c1;

        /* Chi */
        c0 = *a00 ^ (~*a01 & *a02);
        c1 = *a01 ^ (~*a02 & *a03);
        *a02 ^= ~*a03 & *a04;
        *a03 ^= ~*a04 & *a00;
        *a04 ^= ~*a00 & *a01;
        *a00 = c0;
        *a01 = c1;

        c0 = *a05 ^ (~*a06 & *a07);
        c1 = *a06 ^ (~*a07 & *a08);
        *a07 ^= ~*a08 & *a09;
        *a08 ^= ~*a09 & *a05;
        *a09 ^= ~*a05 & *a06;
        *a05 = c0;
        *a06 = c1;

        c0 = *a10 ^ (~*a11 & *a12);
        c1 = *a11 ^ (~*a12 & *a13);
        *a12 ^= ~*a13 & *a14;
        *a13 ^= ~*a14 & *a10;
        *a14 ^= ~*a10 & *a11;
        *a10 = c0;
        *a11 = c1;

        c0 = *a15 ^ (~*a16 & *a17);
        c1 = *a16 ^ (~*a17 & *a18);
        *a17 ^= ~*a18 & *a19;
        *a18 ^= ~*a19 & *a15;
        *a19 ^= ~*a15 & *a16;
        *a15 = c0;
        *a16 = c1;

        c0 = *a20 ^ (~*a21 & *a22);
        c1 = *a21 ^ (~*a22 & *a23);
        *a22 ^= ~*a23 & *a24;
        *a23 ^= ~*a24 & *a20;
        *a24 ^= ~*a20 & *a21;
        *a20 = c0;
        *a21 = c1;

        /* Iota */
        *a00 ^= CUDA_KECCAK_CONSTS[i];
    }
}


/* XORs one rate-sized block (absorb_round lanes read from `in`) into the state and permutes. */
__device__ void cuda_keccak_absorb(cuda_keccak_ctx_t *ctx, BYTE* in)
{

    LONG offset = 0;
    for (LONG i = 0; i < ctx->absorb_round; ++i) {
        ctx->state[i] ^= cuda_keccak_leuint64(in + offset);
        offset += 8;
    }

    cuda_keccak_permutations(ctx);
}

/*
 * Applies the closing padding (a 1 bit after the queued bits, a 1 bit in the
 * last bit of the rate), absorbs the final block and fills q with the first
 * rate-sized chunk of output.
 */
__device__ void cuda_keccak_pad(cuda_keccak_ctx_t *ctx)
{
    /* First padding bit, placed right after the queued bits. */
    ctx->q[ctx->bits_in_queue >> 3] |= (BYTE)(1U << (ctx->bits_in_queue & 7));

    if (++(ctx->bits_in_queue) == ctx->rate_bits) {
        cuda_keccak_absorb(ctx, ctx->q);
        ctx->bits_in_queue = 0;
    }

    LONG full = ctx->bits_in_queue >> 6;
    LONG partial = ctx->bits_in_queue & 63;

    LONG offset = 0;
    for (LONG i = 0; i < full; ++i) {
        ctx->state[i] ^= cuda_keccak_leuint64(ctx->q + offset);
        offset += 8;
    }

    if (partial > 0) {
        /* 64-bit unsigned shift: partial goes up to 63, which a long cannot
         * hold where long is 32 bits. The mask also drops stale queue bytes
         * beyond the queued bits. */
        LONG mask = (1ULL << partial) - 1;
        ctx->state[full] ^= cuda_keccak_leuint64(ctx->q + offset) & mask;
    }

    /* Last padding bit: the top bit of the last lane of the rate. */
    ctx->state[(ctx->rate_bits - 1) >> 6] ^= 0x8000000000000000ULL; /* 1 << 63 */

    cuda_keccak_permutations(ctx);
    cuda_keccak_extract(ctx);

    ctx->bits_in_queue = ctx->rate_bits;
}

/*
 * Digestbitlen must be 128 224 256 288 384 512
 * (the rate, 1600 - 2 * digestbitlen bits, must fit in q).
 */
__device__ void cuda_keccak_init(cuda_keccak_ctx_t *ctx, WORD digestbitlen)
{
    memset(ctx, 0, sizeof(cuda_keccak_ctx_t));
    ctx->sha3_flag = 0;
    ctx->digestbitlen = digestbitlen;
    ctx->rate_bits = 1600 - ((ctx->digestbitlen) << 1);
    ctx->rate_BYTEs = ctx->rate_bits >> 3;
    ctx->absorb_round = ctx->rate_bits >> 6;
    ctx->bits_in_queue = 0;
}

/*
 * Digestbitlen must be 224 256 384 512
 * Same as cuda_keccak_init(), plus the SHA-3 domain suffix in final.
 */
__device__ void cuda_keccak_sha3_init(cuda_keccak_ctx_t *ctx, WORD digestbitlen)
{
    cuda_keccak_init(ctx, digestbitlen);
    ctx->sha3_flag = 1;
}

/*
 * Absorbs inlen bytes from `in`. Whole blocks are absorbed straight from the
 * input while the queue is empty; otherwise bytes are staged in q until a
 * full block is available.
 */
__device__ void cuda_keccak_update(cuda_keccak_ctx_t *ctx, BYTE *in, LONG inlen)
{
    int64_t BYTEs = ctx->bits_in_queue >> 3;
    int64_t count = 0;
    while (count < inlen) {
        if (BYTEs == 0 && count <= ((int64_t)(inlen - ctx->rate_BYTEs))) {
            do {
                cuda_keccak_absorb(ctx, in + count);
                count += ctx->rate_BYTEs;
            } while (count <= ((int64_t)(inlen - ctx->rate_BYTEs)));
        } else {
            int64_t partial = cuda_keccak_MIN(ctx->rate_BYTEs - BYTEs, inlen - count);
            memcpy(ctx->q + BYTEs, in + count, partial);

            BYTEs += partial;
            count += partial;

            if (BYTEs == ctx->rate_BYTEs) {
                cuda_keccak_absorb(ctx, ctx->q);
                BYTEs = 0;
            }
        }
    }
    ctx->bits_in_queue = BYTEs << 3;
}

/* Pads, finishes absorbing and squeezes digestbitlen bits of output into `out`. */
__device__ void cuda_keccak_final(cuda_keccak_ctx_t *ctx, BYTE *out)
{
    if (ctx->sha3_flag) {
        /* SHA-3 domain suffix: the two bits "01" follow the message. */
        int mask = (1 << 2) - 1;
        ctx->q[ctx->bits_in_queue >> 3] = (BYTE)(0x02 & mask);
        ctx->bits_in_queue += 2;
    }

    cuda_keccak_pad(ctx);
    LONG i = 0;

    while (i < ctx->digestbitlen) {
        if (ctx->bits_in_queue == 0) {
            /* Queue drained: permute again to squeeze the next chunk. */
            cuda_keccak_permutations(ctx);
            cuda_keccak_extract(ctx);
            ctx->bits_in_queue = ctx->rate_bits;
        }

        LONG partial_block = cuda_keccak_UMIN(ctx->bits_in_queue, ctx->digestbitlen - i);
        memcpy(out + (i >> 3), ctx->q + (ctx->rate_BYTEs - (ctx->bits_in_queue >> 3)), partial_block >> 3);
        ctx->bits_in_queue -= partial_block;
        i += partial_block;
    }
}

/*
 * One thread hashes one message with Keccak (not the SHA-3 variant).
 *
 * indata            n_batch messages of inlen bytes each, stored back to back
 * outdata           n_batch digests of KECCAK_BLOCK_SIZE bytes each
 * KECCAK_BLOCK_SIZE digest length in bytes (despite the name)
 *
 * Expects a 1D launch with at least n_batch threads; surplus threads return.
 */
__global__ void kernel_keccak_hash(BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch, WORD KECCAK_BLOCK_SIZE)
{
    WORD thread = blockIdx.x * blockDim.x + threadIdx.x;
    if (thread >= n_batch)
    {
        return;
    }
    /* Offsets are widened first: thread * inlen can exceed 32 bits. */
    BYTE* in = indata + (size_t)thread * inlen;
    BYTE* out = outdata + (size_t)thread * KECCAK_BLOCK_SIZE;
    CUDA_KECCAK_CTX ctx;
    cuda_keccak_init(&ctx, KECCAK_BLOCK_SIZE << 3);
    cuda_keccak_update(&ctx, in, inlen);
    cuda_keccak_final(&ctx, out);
}
extern "C"
{
    /*
     * Hashes n_batch messages of inlen bytes each with Keccak on the GPU.
     *
     * in       host buffer of n_batch * inlen bytes
     * out      host buffer receiving n_batch digests of n_outbit / 8 bytes
     * n_outbit digest length in bits; see cuda_keccak_init() for valid values
     *
     * Failures are reported on stdout and leave `out` unspecified; the
     * function has no return value to signal them.
     */
    void mcm_cuda_keccak_hash_batch(BYTE * in, WORD inlen, BYTE * out, WORD n_outbit, WORD n_batch)
    {
        BYTE * cuda_indata = NULL;
        BYTE * cuda_outdata = NULL;
        const WORD KECCAK_BLOCK_SIZE = (n_outbit >> 3);
        /* Computed in size_t: the WORD product wraps for large batches. */
        const size_t in_bytes = (size_t)inlen * n_batch;
        const size_t out_bytes = (size_t)KECCAK_BLOCK_SIZE * n_batch;
        cudaError_t error = cudaSuccess;

        /* Nothing to hash; a zero-block launch would be an invalid configuration. */
        if (n_batch == 0)
            return;

        WORD thread = 256;
        /* Ceiling division without the n_batch + thread - 1 overflow. */
        WORD block = n_batch / thread + (n_batch % thread != 0);

        /* Each step runs only if every earlier one succeeded. */
        error = cudaMalloc(&cuda_indata, in_bytes);
        if (error == cudaSuccess)
            error = cudaMalloc(&cuda_outdata, out_bytes);
        if (error == cudaSuccess)
            error = cudaMemcpy(cuda_indata, in, in_bytes, cudaMemcpyHostToDevice);
        if (error == cudaSuccess)
        {
            kernel_keccak_hash << < block, thread >> > (cuda_indata, inlen, cuda_outdata, n_batch, KECCAK_BLOCK_SIZE);
            /* Launch-configuration errors only show up here. */
            error = cudaGetLastError();
        }
        /* Blocking copy: it waits for the kernel and reports its execution errors. */
        if (error == cudaSuccess)
            error = cudaMemcpy(out, cuda_outdata, out_bytes, cudaMemcpyDeviceToHost);
        if (error == cudaSuccess)
            error = cudaDeviceSynchronize();
        if (error != cudaSuccess)
        {
            printf("Error cuda keccak hash: %s \n", cudaGetErrorString(error));
        }
        /* cudaFree(NULL) is a no-op, so this is safe after a failed allocation. */
        cudaFree(cuda_indata);
        cudaFree(cuda_outdata);
    }
}
--------------------------------------------------------------------------------
/keccak.cuh:
--------------------------------------------------------------------------------
/*
 * keccak.cuh CUDA Implementation of Keccak/SHA3 Hashing
 *
 * Date: 12 June 2019
 * Revision: 1
 *
 * This file is released into the Public Domain.
 */


#pragma once
#include "config.h"
/* Hashes n_batch inputs of inlen bytes each with Keccak; writes n_batch
 * digests of n_outbit / 8 bytes each to out. */
void mcm_cuda_keccak_hash_batch(BYTE * in, WORD inlen, BYTE * out, WORD n_outbit, WORD n_batch);
--------------------------------------------------------------------------------
/md2.cu:
--------------------------------------------------------------------------------
/*
 * md2.cu CUDA Implementation of MD2 digest
 *
 * Date: 12 June 2019
 * Revision: 1
 *
 * Based on the public domain Reference Implementation in C, by
 * Brad Conte, original code here:
 *
 * https://github.com/B-Con/crypto-algorithms
 *
 * This file is released into the Public Domain.
13 | */

/*************************** HEADER FILES ***************************/
// NOTE(review): the include targets were stripped from the source text; the
// headers below are the ones this translation unit actually needs
// (printf, memcpy).
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
extern "C" {
#include "md2.cuh"
}
#define MD2_BLOCK_SIZE 16   // bytes per MD2 input block; also the digest size written per message

/**************************** STRUCT ********************************/
// Per-message MD2 working state (one instance per thread).
typedef struct {
    BYTE data[16];      // input bytes buffered until a full block is available
    BYTE state[48];     // working state; bytes 0..15 are copied out as the digest
    BYTE checksum[16];  // running checksum, processed as the last block
    int len;            // number of valid bytes currently in data[]
} CUDA_MD2_CTX;

/**************************** VARIABLES *****************************/
// Substitution table used by both the state mixing and the checksum update.
__constant__ BYTE s[256] = {
    41, 46, 67, 201, 162, 216, 124, 1, 61, 54, 84, 161, 236, 240, 6,
    19, 98, 167, 5, 243, 192, 199, 115, 140, 152, 147, 43, 217, 188,
    76, 130, 202, 30, 155, 87, 60, 253, 212, 224, 22, 103, 66, 111, 24,
    138, 23, 229, 18, 190, 78, 196, 214, 218, 158, 222, 73, 160, 251,
    245, 142, 187, 47, 238, 122, 169, 104, 121, 145, 21, 178, 7, 63,
    148, 194, 16, 137, 11, 34, 95, 33, 128, 127, 93, 154, 90, 144, 50,
    39, 53, 62, 204, 231, 191, 247, 151, 3, 255, 25, 48, 179, 72, 165,
    181, 209, 215, 94, 146, 42, 172, 86, 170, 198, 79, 184, 56, 210,
    150, 164, 125, 182, 118, 252, 107, 226, 156, 116, 4, 241, 69, 157,
    112, 89, 100, 113, 135, 32, 134, 91, 207, 101, 230, 45, 168, 2, 27,
    96, 37, 173, 174, 176, 185, 246, 28, 70, 97, 105, 52, 64, 126, 15,
    85, 71, 163, 35, 221, 81, 175, 58, 195, 92, 249, 206, 186, 197,
    234, 38, 44, 83, 13, 110, 133, 40, 132, 9, 211, 223, 205, 244, 65,
    129, 77, 82, 106, 220, 55, 200, 108, 193, 171, 250, 36, 225, 123,
    8, 12, 189, 177, 74, 120, 136, 149, 139, 227, 99, 232, 109, 233,
    203, 213, 254, 59, 0, 29, 57, 242, 239, 183, 14, 102, 88, 208, 228,
    166, 119, 114, 248, 235, 117, 75, 10, 49, 68, 80, 180, 143, 237,
    31, 26, 219, 153, 141, 51, 159, 17, 131, 20
};

/*********************** FUNCTION DEFINITIONS ***********************/
// Absorb one 16-byte block into ctx->state and fold it into ctx->checksum.
// `data` may alias ctx->checksum (cuda_md2_final() passes it); each
// data[j] is read before checksum[j] is overwritten, so that call is safe.
__device__ void cuda_md2_transform(CUDA_MD2_CTX *ctx, BYTE data[])
{
    int j, k, t;

    // Load the block into state[16..31] and block XOR digest into state[32..47].
    for (j = 0; j < 16; ++j) {
        ctx->state[j + 16] = data[j];
        ctx->state[j + 32] = (ctx->state[j + 16] ^ ctx->state[j]);
    }

    // 18 mixing passes over all 48 state bytes.
    t = 0;
    for (j = 0; j < 18; ++j) {
        for (k = 0; k < 48; ++k) {
            ctx->state[k] ^= s[t];
            t = ctx->state[k];
        }
        t = (t + j) & 0xFF;
    }

    // Checksum update, chained through the previously written byte.
    t = ctx->checksum[15];
    for (j = 0; j < 16; ++j) {
        ctx->checksum[j] ^= s[data[j] ^ t];
        t = ctx->checksum[j];
    }
}

// Reset state, checksum and buffered-byte count to zero.
__device__ void cuda_md2_init(CUDA_MD2_CTX *ctx)
{
    int i;

    for (i = 0; i < 48; ++i)
        ctx->state[i] = 0;
    for (i = 0; i < 16; ++i)
        ctx->checksum[i] = 0;
    ctx->len = 0;
}

// Feed `len` bytes of `data`; a transform runs each time 16 bytes are buffered.
__device__ void cuda_md2_update(CUDA_MD2_CTX *ctx, const BYTE data[], size_t len)
{
    size_t i;

    for (i = 0; i < len; ++i) {
        ctx->data[ctx->len] = data[i];
        ctx->len++;
        if (ctx->len == MD2_BLOCK_SIZE) {
            cuda_md2_transform(ctx, ctx->data);
            ctx->len = 0;
        }
    }
}

// Pad the last block, process it and the checksum block, then write the
// 16-byte digest to `hash`.
__device__ void cuda_md2_final(CUDA_MD2_CTX *ctx, BYTE hash[])
{
    // Every padding byte holds the number of padding bytes (1..16).
    int to_pad = MD2_BLOCK_SIZE - ctx->len;

    while (ctx->len < MD2_BLOCK_SIZE)
        ctx->data[ctx->len++] = (BYTE)to_pad;

    cuda_md2_transform(ctx, ctx->data);
    cuda_md2_transform(ctx, ctx->checksum);

    memcpy(hash, ctx->state, MD2_BLOCK_SIZE);
}

// One thread hashes one message. Expects a 1-D launch with at least n_batch
// threads in total. Message i is read from indata + i * inlen and its digest
// is written to outdata + i * MD2_BLOCK_SIZE. No shared memory is used.
__global__ void kernel_md2_hash(BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch)
{
    // 64-bit index math: thread * inlen can exceed 32 bits for large batches.
    size_t thread = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (thread >= n_batch)
    {
        return;
    }
    BYTE* in = indata + thread * inlen;
    BYTE* out = outdata + thread * MD2_BLOCK_SIZE;
    CUDA_MD2_CTX ctx;
    cuda_md2_init(&ctx);
    cuda_md2_update(&ctx, in, inlen);
    cuda_md2_final(&ctx, out);
}

extern "C" {
// Hash n_batch messages of inlen bytes each on the GPU.
//   in      host buffer holding the messages back to back (inlen * n_batch bytes)
//   inlen   length of every message in bytes (0 hashes the empty message)
//   out     host buffer receiving MD2_BLOCK_SIZE digest bytes per message
//   n_batch number of messages; 0 is a no-op
// On any CUDA failure a message is printed and `out` is left unmodified.
void mcm_cuda_md2_hash_batch(BYTE *in, WORD inlen, BYTE *out, WORD n_batch)
{
    // A zero-block grid is an invalid launch configuration, so bail out early.
    if (n_batch == 0)
        return;

    // Byte counts in size_t so inlen * n_batch cannot wrap at 32 bits.
    const size_t in_bytes = (size_t)inlen * n_batch;
    const size_t out_bytes = (size_t)MD2_BLOCK_SIZE * n_batch;
    BYTE *cuda_indata = NULL;
    BYTE *cuda_outdata = NULL;
    cudaError_t error = cudaSuccess;

    // With inlen == 0 the kernel never dereferences its input pointer.
    if (in_bytes > 0) {
        error = cudaMalloc(&cuda_indata, in_bytes);
        if (error == cudaSuccess)
            error = cudaMemcpy(cuda_indata, in, in_bytes, cudaMemcpyHostToDevice);
    }
    if (error == cudaSuccess)
        error = cudaMalloc(&cuda_outdata, out_bytes);

    if (error == cudaSuccess) {
        const WORD thread = 256;
        const WORD block = (WORD)(((size_t)n_batch + thread - 1) / thread);

        kernel_md2_hash<<<block, thread>>>(cuda_indata, inlen, cuda_outdata, n_batch);
        error = cudaGetLastError();          // launch-configuration errors
    }
    if (error == cudaSuccess)
        error = cudaDeviceSynchronize();     // errors raised while the kernel ran
    if (error == cudaSuccess)
        error = cudaMemcpy(out, cuda_outdata, out_bytes, cudaMemcpyDeviceToHost);

    if (error != cudaSuccess) {
        printf("Error cuda md2 hash: %s \n", cudaGetErrorString(error));
        (void)cudaGetLastError();            // reset the recorded error after reporting it
    }
    cudaFree(cuda_indata);                   // cudaFree(NULL) is a no-op
    cudaFree(cuda_outdata);
}
}
-------------------------------------------------------------------------------- /md2.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * md2.cuh CUDA Implementation of MD2 digest 3 | * 4 | * Date: 12 June 2019 5 | * Revision: 1 6 | * 7 | * Based on the public domain Reference Implementation in C, by 8 | * Brad Conte, original code here: 9 | * 10 | * https://github.com/B-Con/crypto-algorithms 11 | * 12 | * This file is released into the Public Domain. 13 | */ 14 | 15 | 16 | #pragma once 17 | #include "config.h" 18 | void mcm_cuda_md2_hash_batch(BYTE* in, WORD inlen, BYTE* out, WORD n_batch); 19 | -------------------------------------------------------------------------------- /md5.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * md5.cu CUDA Implementation of MD5 digest 3 | * 4 | * Date: 12 June 2019 5 | * Revision: 1 6 | * 7 | * Based on the public domain Reference Implementation in C, by 8 | * Brad Conte, original code here: 9 | * 10 | * https://github.com/B-Con/crypto-algorithms 11 | * 12 | * This file is released into the Public Domain. 
13 | */

/*************************** HEADER FILES ***************************/
// NOTE(review): the include targets were stripped from the source text; the
// headers below are the ones this translation unit actually needs
// (printf, memset).
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
extern "C" {
#include "md5.cuh"
}
/****************************** MACROS ******************************/
#define MD5_BLOCK_SIZE 16               // MD5 outputs a 16 byte digest

/**************************** DATA TYPES ****************************/
// Same definitions the project README lists for config.h; repeating an
// identical typedef is harmless in C++.
typedef unsigned char BYTE;             // 8-bit byte
typedef unsigned int  WORD;             // 32-bit word

// Per-message MD5 working state (one instance per thread).
typedef struct {
    BYTE data[64];              // input bytes buffered until a full 64-byte block is available
    WORD datalen;               // number of valid bytes currently in data[]
    unsigned long long bitlen;  // bits consumed by completed blocks; the tail is added in final
    WORD state[4];              // chaining values A, B, C, D
} CUDA_MD5_CTX;

/****************************** MACROS ******************************/
// Every macro argument is parenthesized so compound expressions expand safely.
#ifndef ROTLEFT
#define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b))))
#endif

#define F(x,y,z) (((x) & (y)) | (~(x) & (z)))
#define G(x,y,z) (((x) & (z)) | ((y) & ~(z)))
#define H(x,y,z) ((x) ^ (y) ^ (z))
#define I(x,y,z) ((y) ^ ((x) | ~(z)))

#define FF(a,b,c,d,m,s,t) do { (a) += F(b,c,d) + (m) + (t); \
                               (a) = (b) + ROTLEFT(a,s); } while (0)
#define GG(a,b,c,d,m,s,t) do { (a) += G(b,c,d) + (m) + (t); \
                               (a) = (b) + ROTLEFT(a,s); } while (0)
#define HH(a,b,c,d,m,s,t) do { (a) += H(b,c,d) + (m) + (t); \
                               (a) = (b) + ROTLEFT(a,s); } while (0)
#define II(a,b,c,d,m,s,t) do { (a) += I(b,c,d) + (m) + (t); \
                               (a) = (b) + ROTLEFT(a,s); } while (0)

/*********************** FUNCTION DEFINITIONS ***********************/
// Process one 64-byte block and fold it into ctx->state.
__device__ void cuda_md5_transform(CUDA_MD5_CTX *ctx, const BYTE data[])
{
    WORD a, b, c, d, m[16], i, j;

    // MD5 reads the block as sixteen little-endian 32-bit words. The words are
    // assembled byte by byte, so the result does not depend on the endianness
    // of the machine. The casts keep the shifts in unsigned arithmetic; a
    // promoted `int` shifted left by 24 could reach the sign bit.
    for (i = 0, j = 0; i < 16; ++i, j += 4)
        m[i] = ((WORD)data[j]) + ((WORD)data[j + 1] << 8) +
               ((WORD)data[j + 2] << 16) + ((WORD)data[j + 3] << 24);

    a = ctx->state[0];
    b = ctx->state[1];
    c = ctx->state[2];
    d = ctx->state[3];

    // Round 1
    FF(a,b,c,d,m[0],  7,0xd76aa478);
    FF(d,a,b,c,m[1], 12,0xe8c7b756);
    FF(c,d,a,b,m[2], 17,0x242070db);
    FF(b,c,d,a,m[3], 22,0xc1bdceee);
    FF(a,b,c,d,m[4],  7,0xf57c0faf);
    FF(d,a,b,c,m[5], 12,0x4787c62a);
    FF(c,d,a,b,m[6], 17,0xa8304613);
    FF(b,c,d,a,m[7], 22,0xfd469501);
    FF(a,b,c,d,m[8],  7,0x698098d8);
    FF(d,a,b,c,m[9], 12,0x8b44f7af);
    FF(c,d,a,b,m[10],17,0xffff5bb1);
    FF(b,c,d,a,m[11],22,0x895cd7be);
    FF(a,b,c,d,m[12], 7,0x6b901122);
    FF(d,a,b,c,m[13],12,0xfd987193);
    FF(c,d,a,b,m[14],17,0xa679438e);
    FF(b,c,d,a,m[15],22,0x49b40821);

    // Round 2
    GG(a,b,c,d,m[1],  5,0xf61e2562);
    GG(d,a,b,c,m[6],  9,0xc040b340);
    GG(c,d,a,b,m[11],14,0x265e5a51);
    GG(b,c,d,a,m[0], 20,0xe9b6c7aa);
    GG(a,b,c,d,m[5],  5,0xd62f105d);
    GG(d,a,b,c,m[10], 9,0x02441453);
    GG(c,d,a,b,m[15],14,0xd8a1e681);
    GG(b,c,d,a,m[4], 20,0xe7d3fbc8);
    GG(a,b,c,d,m[9],  5,0x21e1cde6);
    GG(d,a,b,c,m[14], 9,0xc33707d6);
    GG(c,d,a,b,m[3], 14,0xf4d50d87);
    GG(b,c,d,a,m[8], 20,0x455a14ed);
    GG(a,b,c,d,m[13], 5,0xa9e3e905);
    GG(d,a,b,c,m[2],  9,0xfcefa3f8);
    GG(c,d,a,b,m[7], 14,0x676f02d9);
    GG(b,c,d,a,m[12],20,0x8d2a4c8a);

    // Round 3
    HH(a,b,c,d,m[5],  4,0xfffa3942);
    HH(d,a,b,c,m[8], 11,0x8771f681);
    HH(c,d,a,b,m[11],16,0x6d9d6122);
    HH(b,c,d,a,m[14],23,0xfde5380c);
    HH(a,b,c,d,m[1],  4,0xa4beea44);
    HH(d,a,b,c,m[4], 11,0x4bdecfa9);
    HH(c,d,a,b,m[7], 16,0xf6bb4b60);
    HH(b,c,d,a,m[10],23,0xbebfbc70);
    HH(a,b,c,d,m[13], 4,0x289b7ec6);
    HH(d,a,b,c,m[0], 11,0xeaa127fa);
    HH(c,d,a,b,m[3], 16,0xd4ef3085);
    HH(b,c,d,a,m[6], 23,0x04881d05);
    HH(a,b,c,d,m[9],  4,0xd9d4d039);
    HH(d,a,b,c,m[12],11,0xe6db99e5);
    HH(c,d,a,b,m[15],16,0x1fa27cf8);
    HH(b,c,d,a,m[2], 23,0xc4ac5665);

    // Round 4
    II(a,b,c,d,m[0],  6,0xf4292244);
    II(d,a,b,c,m[7], 10,0x432aff97);
    II(c,d,a,b,m[14],15,0xab9423a7);
    II(b,c,d,a,m[5], 21,0xfc93a039);
    II(a,b,c,d,m[12], 6,0x655b59c3);
    II(d,a,b,c,m[3], 10,0x8f0ccc92);
    II(c,d,a,b,m[10],15,0xffeff47d);
    II(b,c,d,a,m[1], 21,0x85845dd1);
    II(a,b,c,d,m[8],  6,0x6fa87e4f);
    II(d,a,b,c,m[15],10,0xfe2ce6e0);
    II(c,d,a,b,m[6], 15,0xa3014314);
    II(b,c,d,a,m[13],21,0x4e0811a1);
    II(a,b,c,d,m[4],  6,0xf7537e82);
    II(d,a,b,c,m[11],10,0xbd3af235);
    II(c,d,a,b,m[2], 15,0x2ad7d2bb);
    II(b,c,d,a,m[9], 21,0xeb86d391);

    ctx->state[0] += a;
    ctx->state[1] += b;
    ctx->state[2] += c;
    ctx->state[3] += d;
}

// Reset counters and load the initial chaining values.
__device__ void cuda_md5_init(CUDA_MD5_CTX *ctx)
{
    ctx->datalen = 0;
    ctx->bitlen = 0;
    ctx->state[0] = 0x67452301;
    ctx->state[1] = 0xEFCDAB89;
    ctx->state[2] = 0x98BADCFE;
    ctx->state[3] = 0x10325476;
}

// Feed `len` bytes of `data`; a transform runs each time 64 bytes are buffered.
__device__ void cuda_md5_update(CUDA_MD5_CTX *ctx, const BYTE data[], size_t len)
{
    size_t i;

    for (i = 0; i < len; ++i) {
        ctx->data[ctx->datalen] = data[i];
        ctx->datalen++;
        if (ctx->datalen == 64) {
            cuda_md5_transform(ctx, ctx->data);
            ctx->bitlen += 512;
            ctx->datalen = 0;
        }
    }
}

// Apply padding and the length trailer, run the last transform(s) and write
// the 16-byte digest to `hash`.
__device__ void cuda_md5_final(CUDA_MD5_CTX *ctx, BYTE hash[])
{
    size_t i = ctx->datalen;

    // Pad whatever data is left in the buffer: a 0x80 marker, then zeros up to
    // byte 56. If the marker does not fit below byte 56, finish this block and
    // start a fresh all-zero one for the length.
    if (ctx->datalen < 56) {
        ctx->data[i++] = 0x80;
        while (i < 56)
            ctx->data[i++] = 0x00;
    }
    else {
        ctx->data[i++] = 0x80;
        while (i < 64)
            ctx->data[i++] = 0x00;
        cuda_md5_transform(ctx, ctx->data);
        memset(ctx->data, 0, 56);
    }

    // Append the total message length in bits, least significant byte first.
    ctx->bitlen += ctx->datalen * 8;
    ctx->data[56] = (BYTE)(ctx->bitlen);
    ctx->data[57] = (BYTE)(ctx->bitlen >> 8);
    ctx->data[58] = (BYTE)(ctx->bitlen >> 16);
    ctx->data[59] = (BYTE)(ctx->bitlen >> 24);
    ctx->data[60] = (BYTE)(ctx->bitlen >> 32);
    ctx->data[61] = (BYTE)(ctx->bitlen >> 40);
    ctx->data[62] = (BYTE)(ctx->bitlen >> 48);
    ctx->data[63] = (BYTE)(ctx->bitlen >> 56);
    cuda_md5_transform(ctx, ctx->data);

    // Emit each state word least significant byte first (MD5's output order).
    for (i = 0; i < 4; ++i) {
        hash[i]      = (ctx->state[0] >> (i * 8)) & 0x000000ff;
        hash[i + 4]  = (ctx->state[1] >> (i * 8)) & 0x000000ff;
        hash[i + 8]  = (ctx->state[2] >> (i * 8)) & 0x000000ff;
        hash[i + 12] = (ctx->state[3] >> (i * 8)) & 0x000000ff;
    }
}

// One thread hashes one message. Expects a 1-D launch with at least n_batch
// threads in total. Message i is read from indata + i * inlen and its digest
// is written to outdata + i * MD5_BLOCK_SIZE. No shared memory is used.
__global__ void kernel_md5_hash(BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch)
{
    // 64-bit index math: thread * inlen can exceed 32 bits for large batches.
    size_t thread = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (thread >= n_batch)
    {
        return;
    }
    BYTE* in = indata + thread * inlen;
    BYTE* out = outdata + thread * MD5_BLOCK_SIZE;
    CUDA_MD5_CTX ctx;
    cuda_md5_init(&ctx);
    cuda_md5_update(&ctx, in, inlen);
    cuda_md5_final(&ctx, out);
}

extern "C"
{
// Hash n_batch messages of inlen bytes each on the GPU.
//   in      host buffer holding the messages back to back (inlen * n_batch bytes)
//   inlen   length of every message in bytes (0 hashes the empty message)
//   out     host buffer receiving MD5_BLOCK_SIZE digest bytes per message
//   n_batch number of messages; 0 is a no-op
// On any CUDA failure a message is printed and `out` is left unmodified.
void mcm_cuda_md5_hash_batch(BYTE* in, WORD inlen, BYTE* out, WORD n_batch)
{
    // A zero-block grid is an invalid launch configuration, so bail out early.
    if (n_batch == 0)
        return;

    // Byte counts in size_t so inlen * n_batch cannot wrap at 32 bits.
    const size_t in_bytes = (size_t)inlen * n_batch;
    const size_t out_bytes = (size_t)MD5_BLOCK_SIZE * n_batch;
    BYTE *cuda_indata = NULL;
    BYTE *cuda_outdata = NULL;
    cudaError_t error = cudaSuccess;

    // With inlen == 0 the kernel never dereferences its input pointer.
    if (in_bytes > 0) {
        error = cudaMalloc(&cuda_indata, in_bytes);
        if (error == cudaSuccess)
            error = cudaMemcpy(cuda_indata, in, in_bytes, cudaMemcpyHostToDevice);
    }
    if (error == cudaSuccess)
        error = cudaMalloc(&cuda_outdata, out_bytes);

    if (error == cudaSuccess) {
        const WORD thread = 256;
        const WORD block = (WORD)(((size_t)n_batch + thread - 1) / thread);

        kernel_md5_hash<<<block, thread>>>(cuda_indata, inlen, cuda_outdata, n_batch);
        error = cudaGetLastError();          // launch-configuration errors
    }
    if (error == cudaSuccess)
        error = cudaDeviceSynchronize();     // errors raised while the kernel ran
    if (error == cudaSuccess)
        error = cudaMemcpy(out, cuda_outdata, out_bytes, cudaMemcpyDeviceToHost);

    if (error != cudaSuccess) {
        printf("Error cuda md5 hash: %s \n", cudaGetErrorString(error));
        (void)cudaGetLastError();            // reset the recorded error after reporting it
    }
    cudaFree(cuda_indata);                   // cudaFree(NULL) is a no-op
    cudaFree(cuda_outdata);
}
}
-------------------------------------------------------------------------------- /md5.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * md5.cuh CUDA Implementation of MD5 digest 3 | * 4 | * Date: 12 June 2019 5 | * Revision: 1 6 | * 7 | * Based on the public domain Reference Implementation in C, by 8 | * Brad Conte, original code here: 9 | * 10 | * https://github.com/B-Con/crypto-algorithms 11 | * 12 | * This file is released into the Public Domain. 13 | */ 14 | 15 | 16 | #pragma once 17 | #include "config.h" 18 | void mcm_cuda_md5_hash_batch(BYTE* in, WORD inlen, BYTE* out, WORD n_batch); 19 | -------------------------------------------------------------------------------- /sha1.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * sha1.cu CUDA Implementation of SHA1 Hashing 3 | * 4 | * Date: 12 June 2019 5 | * Revision: 1 6 | * 7 | * Based on the public domain Reference Implementation in C, by 8 | * Brad Conte, original code here: 9 | * 10 | * https://github.com/B-Con/crypto-algorithms 11 | * 12 | * This file is released into the Public Domain. 
13 | */

/*************************** HEADER FILES ***************************/
// NOTE(review): the include targets were stripped from the source text; the
// headers below are the ones this translation unit actually needs
// (printf, memset).
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
extern "C" {
#include "sha1.cuh"
}

/****************************** MACROS ******************************/
#define SHA1_BLOCK_SIZE 20              // SHA1 outputs a 20 byte digest

/**************************** DATA TYPES ****************************/
// Per-message SHA-1 working state (one instance per thread).
typedef struct {
    BYTE data[64];              // input bytes buffered until a full 64-byte block is available
    WORD datalen;               // number of valid bytes currently in data[]
    unsigned long long bitlen;  // bits consumed by completed blocks; the tail is added in final
    WORD state[5];              // chaining values
    WORD k[4];                  // additive constants, one per group of 20 rounds
} CUDA_SHA1_CTX;

/****************************** MACROS ******************************/
#ifndef ROTLEFT
#define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b))))
#endif

/*********************** FUNCTION DEFINITIONS ***********************/
// Process one 64-byte block and fold it into ctx->state.
__device__ __forceinline__ void cuda_sha1_transform(CUDA_SHA1_CTX *ctx, const BYTE data[])
{
    WORD a, b, c, d, e, i, j, t, m[80];

    // Assemble sixteen big-endian words. The casts keep the shifts in unsigned
    // arithmetic; a promoted `int` shifted left by 24 could reach the sign bit.
    for (i = 0, j = 0; i < 16; ++i, j += 4)
        m[i] = ((WORD)data[j] << 24) + ((WORD)data[j + 1] << 16) +
               ((WORD)data[j + 2] << 8) + ((WORD)data[j + 3]);
    // Expand to 80 words; each new word is rotated left by one bit.
    for ( ; i < 80; ++i) {
        m[i] = (m[i - 3] ^ m[i - 8] ^ m[i - 14] ^ m[i - 16]);
        m[i] = ROTLEFT(m[i], 1);
    }

    a = ctx->state[0];
    b = ctx->state[1];
    c = ctx->state[2];
    d = ctx->state[3];
    e = ctx->state[4];

    for (i = 0; i < 20; ++i) {
        t = ROTLEFT(a, 5) + ((b & c) ^ (~b & d)) + e + ctx->k[0] + m[i];
        e = d;
        d = c;
        c = ROTLEFT(b, 30);
        b = a;
        a = t;
    }
    for ( ; i < 40; ++i) {
        t = ROTLEFT(a, 5) + (b ^ c ^ d) + e + ctx->k[1] + m[i];
        e = d;
        d = c;
        c = ROTLEFT(b, 30);
        b = a;
        a = t;
    }
    for ( ; i < 60; ++i) {
        t = ROTLEFT(a, 5) + ((b & c) ^ (b & d) ^ (c & d)) + e + ctx->k[2] + m[i];
        e = d;
        d = c;
        c = ROTLEFT(b, 30);
        b = a;
        a = t;
    }
    for ( ; i < 80; ++i) {
        t = ROTLEFT(a, 5) + (b ^ c ^ d) + e + ctx->k[3] + m[i];
        e = d;
        d = c;
        c = ROTLEFT(b, 30);
        b = a;
        a = t;
    }

    ctx->state[0] += a;
    ctx->state[1] += b;
    ctx->state[2] += c;
    ctx->state[3] += d;
    ctx->state[4] += e;
}

// Reset counters and load the initial chaining values and round constants.
__device__ void cuda_sha1_init(CUDA_SHA1_CTX *ctx)
{
    ctx->datalen = 0;
    ctx->bitlen = 0;
    ctx->state[0] = 0x67452301;
    ctx->state[1] = 0xEFCDAB89;
    ctx->state[2] = 0x98BADCFE;
    ctx->state[3] = 0x10325476;
    ctx->state[4] = 0xc3d2e1f0;
    ctx->k[0] = 0x5a827999;
    ctx->k[1] = 0x6ed9eba1;
    ctx->k[2] = 0x8f1bbcdc;
    ctx->k[3] = 0xca62c1d6;
}

// Feed `len` bytes of `data`; a transform runs each time 64 bytes are buffered.
__device__ void cuda_sha1_update(CUDA_SHA1_CTX *ctx, const BYTE data[], size_t len)
{
    size_t i;

    for (i = 0; i < len; ++i) {
        ctx->data[ctx->datalen] = data[i];
        ctx->datalen++;
        if (ctx->datalen == 64) {
            cuda_sha1_transform(ctx, ctx->data);
            ctx->bitlen += 512;
            ctx->datalen = 0;
        }
    }
}

// Apply padding and the length trailer, run the last transform(s) and write
// the 20-byte digest to `hash`.
__device__ void cuda_sha1_final(CUDA_SHA1_CTX *ctx, BYTE hash[])
{
    WORD i = ctx->datalen;

    // Pad whatever data is left in the buffer: a 0x80 marker, then zeros up to
    // byte 56. If the marker does not fit below byte 56, finish this block and
    // start a fresh all-zero one for the length.
    if (ctx->datalen < 56) {
        ctx->data[i++] = 0x80;
        while (i < 56)
            ctx->data[i++] = 0x00;
    }
    else {
        ctx->data[i++] = 0x80;
        while (i < 64)
            ctx->data[i++] = 0x00;
        cuda_sha1_transform(ctx, ctx->data);
        memset(ctx->data, 0, 56);
    }

    // Append the total message length in bits, most significant byte first.
    ctx->bitlen += ctx->datalen * 8;
    ctx->data[63] = (BYTE)(ctx->bitlen);
    ctx->data[62] = (BYTE)(ctx->bitlen >> 8);
    ctx->data[61] = (BYTE)(ctx->bitlen >> 16);
    ctx->data[60] = (BYTE)(ctx->bitlen >> 24);
    ctx->data[59] = (BYTE)(ctx->bitlen >> 32);
    ctx->data[58] = (BYTE)(ctx->bitlen >> 40);
    ctx->data[57] = (BYTE)(ctx->bitlen >> 48);
    ctx->data[56] = (BYTE)(ctx->bitlen >> 56);
    cuda_sha1_transform(ctx, ctx->data);

    // SHA-1 output is big endian: emit each state word most significant byte first.
    for (i = 0; i < 4; ++i) {
        hash[i]      = (ctx->state[0] >> (24 - i * 8)) & 0x000000ff;
        hash[i + 4]  = (ctx->state[1] >> (24 - i * 8)) & 0x000000ff;
        hash[i + 8]  = (ctx->state[2] >> (24 - i * 8)) & 0x000000ff;
        hash[i + 12] = (ctx->state[3] >> (24 - i * 8)) & 0x000000ff;
        hash[i + 16] = (ctx->state[4] >> (24 - i * 8)) & 0x000000ff;
    }
}

// One thread hashes one message. Expects a 1-D launch with at least n_batch
// threads in total. Message i is read from indata + i * inlen and its digest
// is written to outdata + i * SHA1_BLOCK_SIZE. No shared memory is used.
__global__ void kernel_sha1_hash(BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch)
{
    // 64-bit index math: thread * inlen can exceed 32 bits for large batches.
    size_t thread = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (thread >= n_batch)
    {
        return;
    }
    BYTE* in = indata + thread * inlen;
    BYTE* out = outdata + thread * SHA1_BLOCK_SIZE;
    CUDA_SHA1_CTX ctx;
    cuda_sha1_init(&ctx);
    cuda_sha1_update(&ctx, in, inlen);
    cuda_sha1_final(&ctx, out);
}

extern "C"
{
// Hash n_batch messages of inlen bytes each on the GPU.
//   in      host buffer holding the messages back to back (inlen * n_batch bytes)
//   inlen   length of every message in bytes (0 hashes the empty message)
//   out     host buffer receiving SHA1_BLOCK_SIZE digest bytes per message
//   n_batch number of messages; 0 is a no-op
// On any CUDA failure a message is printed and `out` is left unmodified.
void mcm_cuda_sha1_hash_batch(BYTE* in, WORD inlen, BYTE* out, WORD n_batch)
{
    // A zero-block grid is an invalid launch configuration, so bail out early.
    if (n_batch == 0)
        return;

    // Byte counts in size_t so inlen * n_batch cannot wrap at 32 bits.
    const size_t in_bytes = (size_t)inlen * n_batch;
    const size_t out_bytes = (size_t)SHA1_BLOCK_SIZE * n_batch;
    BYTE *cuda_indata = NULL;
    BYTE *cuda_outdata = NULL;
    cudaError_t error = cudaSuccess;

    // With inlen == 0 the kernel never dereferences its input pointer.
    if (in_bytes > 0) {
        error = cudaMalloc(&cuda_indata, in_bytes);
        if (error == cudaSuccess)
            error = cudaMemcpy(cuda_indata, in, in_bytes, cudaMemcpyHostToDevice);
    }
    if (error == cudaSuccess)
        error = cudaMalloc(&cuda_outdata, out_bytes);

    if (error == cudaSuccess) {
        const WORD thread = 256;
        const WORD block = (WORD)(((size_t)n_batch + thread - 1) / thread);

        kernel_sha1_hash<<<block, thread>>>(cuda_indata, inlen, cuda_outdata, n_batch);
        error = cudaGetLastError();          // launch-configuration errors
    }
    if (error == cudaSuccess)
        error = cudaDeviceSynchronize();     // errors raised while the kernel ran
    if (error == cudaSuccess)
        error = cudaMemcpy(out, cuda_outdata, out_bytes, cudaMemcpyDeviceToHost);

    if (error != cudaSuccess) {
        printf("Error cuda sha1 hash: %s \n", cudaGetErrorString(error));
        (void)cudaGetLastError();            // reset the recorded error after reporting it
    }
    cudaFree(cuda_indata);                   // cudaFree(NULL) is a no-op
    cudaFree(cuda_outdata);
}
}
-------------------------------------------------------------------------------- /sha1.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * sha1.cuh CUDA Implementation of SHA1 Hashing 3 | * 4 | * Date: 12 June 2019 5 | * Revision: 1 6 | * 7 | * Based on the public domain Reference Implementation in C, by 8 | * Brad Conte, original code here: 9 | * 10 | * https://github.com/B-Con/crypto-algorithms 11 | * 12 | * This file is released into the Public Domain. 13 | */ 14 | 15 | 16 | #pragma once 17 | #include "config.h" 18 | void mcm_cuda_sha1_hash_batch(BYTE* in, WORD inlen, BYTE* out, WORD n_batch); 19 | -------------------------------------------------------------------------------- /sha256.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * sha256.cu Implementation of SHA256 Hashing 3 | * 4 | * Date: 12 June 2019 5 | * Revision: 1 6 | * * 7 | * Based on the public domain Reference Implementation in C, by 8 | * Brad Conte, original code here: 9 | * 10 | * https://github.com/B-Con/crypto-algorithms 11 | * 12 | * This file is released into the Public Domain. 
13 | */

/*************************** HEADER FILES ***************************/
// NOTE(review): the include targets were stripped from the source text; the
// headers below are the ones this translation unit actually needs
// (printf, memset).
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
extern "C" {
#include "sha256.cuh"
}
/****************************** MACROS ******************************/
#define SHA256_BLOCK_SIZE 32            // SHA256 outputs a 32 byte digest

/**************************** DATA TYPES ****************************/

// Per-message SHA-256 working state (one instance per thread).
typedef struct {
    BYTE data[64];              // input bytes buffered until a full 64-byte block is available
    WORD datalen;               // number of valid bytes currently in data[]
    unsigned long long bitlen;  // bits consumed by completed blocks; the tail is added in final
    WORD state[8];              // chaining values
} CUDA_SHA256_CTX;

/****************************** MACROS ******************************/
#ifndef ROTLEFT
#define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b))))
#endif

#define ROTRIGHT(a,b) (((a) >> (b)) | ((a) << (32-(b))))

#define CH(x,y,z) (((x) & (y)) ^ (~(x) & (z)))
#define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22))
#define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25))
#define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ ((x) >> 3))
#define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ ((x) >> 10))

/**************************** VARIABLES *****************************/
// Round constants, one per compression round.
__constant__ WORD k[64] = {
    0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
    0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
    0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
    0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
    0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
    0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
    0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
    0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
};

/*********************** FUNCTION DEFINITIONS ***********************/
// Process one 64-byte block and fold it into ctx->state.
__device__ __forceinline__ void cuda_sha256_transform(CUDA_SHA256_CTX *ctx, const BYTE data[])
{
    WORD a, b, c, d, e, f, g, h, i, j, t1, t2, m[64];

    // Assemble sixteen big-endian words. The casts keep the shifts in unsigned
    // arithmetic; a promoted `int` shifted left by 24 could reach the sign bit.
    for (i = 0, j = 0; i < 16; ++i, j += 4)
        m[i] = ((WORD)data[j] << 24) | ((WORD)data[j + 1] << 16) |
               ((WORD)data[j + 2] << 8) | ((WORD)data[j + 3]);
    // Expand the message schedule to 64 words.
    for ( ; i < 64; ++i)
        m[i] = SIG1(m[i - 2]) + m[i - 7] + SIG0(m[i - 15]) + m[i - 16];

    a = ctx->state[0];
    b = ctx->state[1];
    c = ctx->state[2];
    d = ctx->state[3];
    e = ctx->state[4];
    f = ctx->state[5];
    g = ctx->state[6];
    h = ctx->state[7];

    for (i = 0; i < 64; ++i) {
        t1 = h + EP1(e) + CH(e,f,g) + k[i] + m[i];
        t2 = EP0(a) + MAJ(a,b,c);
        h = g;
        g = f;
        f = e;
        e = d + t1;
        d = c;
        c = b;
        b = a;
        a = t1 + t2;
    }

    ctx->state[0] += a;
    ctx->state[1] += b;
    ctx->state[2] += c;
    ctx->state[3] += d;
    ctx->state[4] += e;
    ctx->state[5] += f;
    ctx->state[6] += g;
    ctx->state[7] += h;
}

// Reset counters and load the initial chaining values.
__device__ void cuda_sha256_init(CUDA_SHA256_CTX *ctx)
{
    ctx->datalen = 0;
    ctx->bitlen = 0;
    ctx->state[0] = 0x6a09e667;
    ctx->state[1] = 0xbb67ae85;
    ctx->state[2] = 0x3c6ef372;
    ctx->state[3] = 0xa54ff53a;
    ctx->state[4] = 0x510e527f;
    ctx->state[5] = 0x9b05688c;
    ctx->state[6] = 0x1f83d9ab;
    ctx->state[7] = 0x5be0cd19;
}

// Feed `len` bytes of `data`; a transform runs each time 64 bytes are buffered.
__device__ void cuda_sha256_update(CUDA_SHA256_CTX *ctx, const BYTE data[], size_t len)
{
    // size_t index: a 32-bit counter could never reach a len above UINT_MAX.
    size_t i;

    for (i = 0; i < len; ++i) {
        ctx->data[ctx->datalen] = data[i];
        ctx->datalen++;
        if (ctx->datalen == 64) {
            cuda_sha256_transform(ctx, ctx->data);
            ctx->bitlen += 512;
            ctx->datalen = 0;
        }
    }
}

// Apply padding and the length trailer, run the last transform(s) and write
// the 32-byte digest to `hash`.
__device__ void cuda_sha256_final(CUDA_SHA256_CTX *ctx, BYTE hash[])
{
    WORD i = ctx->datalen;

    // Pad whatever data is left in the buffer: a 0x80 marker, then zeros up to
    // byte 56. If the marker does not fit below byte 56, finish this block and
    // start a fresh all-zero one for the length.
    if (ctx->datalen < 56) {
        ctx->data[i++] = 0x80;
        while (i < 56)
            ctx->data[i++] = 0x00;
    }
    else {
        ctx->data[i++] = 0x80;
        while (i < 64)
            ctx->data[i++] = 0x00;
        cuda_sha256_transform(ctx, ctx->data);
        memset(ctx->data, 0, 56);
    }

    // Append the total message length in bits, most significant byte first.
    ctx->bitlen += ctx->datalen * 8;
    ctx->data[63] = (BYTE)(ctx->bitlen);
    ctx->data[62] = (BYTE)(ctx->bitlen >> 8);
    ctx->data[61] = (BYTE)(ctx->bitlen >> 16);
    ctx->data[60] = (BYTE)(ctx->bitlen >> 24);
    ctx->data[59] = (BYTE)(ctx->bitlen >> 32);
    ctx->data[58] = (BYTE)(ctx->bitlen >> 40);
    ctx->data[57] = (BYTE)(ctx->bitlen >> 48);
    ctx->data[56] = (BYTE)(ctx->bitlen >> 56);
    cuda_sha256_transform(ctx, ctx->data);

    // SHA-256 output is big endian: emit each state word most significant byte first.
    for (i = 0; i < 4; ++i) {
        hash[i]      = (ctx->state[0] >> (24 - i * 8)) & 0x000000ff;
        hash[i + 4]  = (ctx->state[1] >> (24 - i * 8)) & 0x000000ff;
        hash[i + 8]  = (ctx->state[2] >> (24 - i * 8)) & 0x000000ff;
        hash[i + 12] = (ctx->state[3] >> (24 - i * 8)) & 0x000000ff;
        hash[i + 16] = (ctx->state[4] >> (24 - i * 8)) & 0x000000ff;
        hash[i + 20] = (ctx->state[5] >> (24 - i * 8)) & 0x000000ff;
        hash[i + 24] = (ctx->state[6] >> (24 - i * 8)) & 0x000000ff;
        hash[i + 28] = (ctx->state[7] >> (24 - i * 8)) & 0x000000ff;
    }
}

// One thread hashes one message. Expects a 1-D launch with at least n_batch
// threads in total. Message i is read from indata + i * inlen and its digest
// is written to outdata + i * SHA256_BLOCK_SIZE. No shared memory is used.
__global__ void kernel_sha256_hash(BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch)
{
    // 64-bit index math: thread * inlen can exceed 32 bits for large batches.
    size_t thread = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (thread >= n_batch)
    {
        return;
    }
    BYTE* in = indata + thread * inlen;
    BYTE* out = outdata + thread * SHA256_BLOCK_SIZE;
    CUDA_SHA256_CTX ctx;
    cuda_sha256_init(&ctx);
    cuda_sha256_update(&ctx, in, inlen);
    cuda_sha256_final(&ctx, out);
}

extern "C"
{
// Hash n_batch messages of inlen bytes each on the GPU.
//   in      host buffer holding the messages back to back (inlen * n_batch bytes)
//   inlen   length of every message in bytes (0 hashes the empty message)
//   out     host buffer receiving SHA256_BLOCK_SIZE digest bytes per message
//   n_batch number of messages; 0 is a no-op
// On any CUDA failure a message is printed and `out` is left unmodified.
void mcm_cuda_sha256_hash_batch(BYTE* in, WORD inlen, BYTE* out, WORD n_batch)
{
    // A zero-block grid is an invalid launch configuration, so bail out early.
    if (n_batch == 0)
        return;

    // Byte counts in size_t so inlen * n_batch cannot wrap at 32 bits.
    const size_t in_bytes = (size_t)inlen * n_batch;
    const size_t out_bytes = (size_t)SHA256_BLOCK_SIZE * n_batch;
    BYTE *cuda_indata = NULL;
    BYTE *cuda_outdata = NULL;
    cudaError_t error = cudaSuccess;

    // With inlen == 0 the kernel never dereferences its input pointer.
    if (in_bytes > 0) {
        error = cudaMalloc(&cuda_indata, in_bytes);
        if (error == cudaSuccess)
            error = cudaMemcpy(cuda_indata, in, in_bytes, cudaMemcpyHostToDevice);
    }
    if (error == cudaSuccess)
        error = cudaMalloc(&cuda_outdata, out_bytes);

    if (error == cudaSuccess) {
        const WORD thread = 256;
        const WORD block = (WORD)(((size_t)n_batch + thread - 1) / thread);

        kernel_sha256_hash<<<block, thread>>>(cuda_indata, inlen, cuda_outdata, n_batch);
        error = cudaGetLastError();          // launch-configuration errors
    }
    if (error == cudaSuccess)
        error = cudaDeviceSynchronize();     // errors raised while the kernel ran
    if (error == cudaSuccess)
        error = cudaMemcpy(out, cuda_outdata, out_bytes, cudaMemcpyDeviceToHost);

    if (error != cudaSuccess) {
        printf("Error cuda sha256 hash: %s \n", cudaGetErrorString(error));
        (void)cudaGetLastError();            // reset the recorded error after reporting it
    }
    cudaFree(cuda_indata);                   // cudaFree(NULL) is a no-op
    cudaFree(cuda_outdata);
}
}
-------------------------------------------------------------------------------- /sha256.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * sha256.cuh CUDA Implementation of SHA256 Hashing 3 | * 4 | * Date: 12 June 2019 5 | * Revision: 1 6 | * 7 | * Based on the public domain Reference Implementation in C, by 8 | * Brad Conte, original code here: 9 | * 10 | * https://github.com/B-Con/crypto-algorithms 11 | * 12 | * This file is released into the Public Domain. 13 | */ 14 | 15 | 16 | #pragma once 17 | #include "config.h" 18 | void mcm_cuda_sha256_hash_batch(BYTE* in, WORD inlen, BYTE* out, WORD n_batch); 19 | --------------------------------------------------------------------------------