├── docs ├── presentation.pdf └── SHA Message Digest Computation on GPU.pdf ├── README └── src ├── parsha256.h ├── Makefile ├── common.h ├── parsha256_kernel.cu ├── sha1_kernel.cu ├── sha1_cpu.cu ├── sha1test.cu └── parsha256test.cu /docs/presentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tadasv/gpgpu_sha/HEAD/docs/presentation.pdf -------------------------------------------------------------------------------- /docs/SHA Message Digest Computation on GPU.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tadasv/gpgpu_sha/HEAD/docs/SHA Message Digest Computation on GPU.pdf -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | This is a project I did for algorithms class in college. It is an implementation of SHA1 and PARSHA-256 algorithms 2 | on GPU. I believe that PARSHA-256 has some bugs due to byte ordering. 3 | 4 | Compilation: cd src && make 5 | Execution: 6 | ./sha1test - SHA-1 performance test 7 | ./parsha256test - PARSHA-256 performance test on GPU 8 | ./parsha256testemu - PARSHA-256 performance test on CPU (emulation mode) 9 | -------------------------------------------------------------------------------- /src/parsha256.h: -------------------------------------------------------------------------------- 1 | #ifndef __PARSHA256_H__ 2 | #define __PARSHA256_H__ 3 | 4 | /* 2 to the power of a */ 5 | #define POW2(a) ((unsigned)1 << (a)) 6 | #define DELTA(i) (POW2(i) * (2 * PARSHA256_BLOCK_SIZE - 2 * PARSHA256_HASH_SIZE - PARSHA256_IV_SIZE) - (PARSHA256_BLOCK_SIZE - 2 * PARSHA256_HASH_SIZE)) 7 | #define LAMDA(i) (POW2(i -1 ) * (2 * PARSHA256_BLOCK_SIZE - 2 * PARSHA256_HASH_SIZE - PARSHA256_IV_SIZE)) 8 | /* Hash function domain in bits */ 9 | #define PARSHA256_BLOCK_SIZE 768 10 | /* Hash function range in bits */ 11 | #define PARSHA256_HASH_SIZE 256 12 | /* Length of IV in bits */ 13 | #define PARSHA256_IV_SIZE 256 14 | /* Available processor tree */ 15 | #define TREE_SIZE 16 16 | 17 | #define PARSHA256_256BITSB 32 18 | #define PARSHA256_512BITSB 64 19 | #define PARSHA256_768BITSB 96 20 | 21 | #endif /* __PARSHA256_H__ */ 22 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | NVCC := /usr/local/cuda/bin/nvcc --ptxas-options=-v 2 | LIBS := -L/usr/local/cuda/sdk/lib -L/usr/local/cuda/lib 3 | INCS := -I/usr/local/cuda/include -I/usr/include/cuda -I./ -I/usr/local/cuda/sdk/common/inc 4 | CFLAGS := $(INCS) -c# -D_DEBUG 5 | LDFLAGS := $(LIBS) -lcuda -lcutil 6 | SHA1OBJS := sha1test.o sha1_cpu.o sha1_kernel.o 7 | PARSHA256OBJS := parsha256test.o parsha256_kernel.o 8 | PARSHA256EMUOBJS := parsha256testemu.o parsha256_kernelemu.o 9 | 10 | 11 | all: sha1test parsha256test parsha256testemu 12 | 13 | # SHA-1 benchmark test 14 | sha1test: $(SHA1OBJS) 15 | $(NVCC) $(LDFLAGS) $(SHA1OBJS) -o sha1test 16 | sha1test.o: sha1test.cu common.h 17 | $(NVCC) $(CFLAGS) sha1test.cu -o sha1test.o 18 | sha1_cpu.o: sha1_cpu.cu common.h 19 | $(NVCC) $(CFLAGS) sha1_cpu.cu -o sha1_cpu.o 20 | sha1_kernel.o: sha1_kernel.cu common.h 21 | $(NVCC) $(CFLAGS) sha1_kernel.cu -o sha1_kernel.o 22 | 23 | # PARSHA-256 benchmark test 24 | parsha256test: $(PARSHA256OBJS) 25 | $(NVCC) $(LDFLAGS) $(PARSHA256OBJS) -o parsha256test 26 | 
parsha256test.o: parsha256test.cu parsha256.h 27 | $(NVCC) $(CFLAGS) parsha256test.cu -o parsha256test.o 28 | parsha256_kernel.o: parsha256_kernel.cu parsha256.h 29 | $(NVCC) $(CFLAGS) parsha256_kernel.cu -o parsha256_kernel.o 30 | 31 | # PARSHA-256 benchmark test in emulation mode 32 | parsha256testemu: $(PARSHA256EMUOBJS) 33 | $(NVCC) -deviceemu $(LDFLAGS) $(PARSHA256EMUOBJS) -o parsha256testemu 34 | parsha256testemu.o: parsha256test.cu parsha256.h 35 | $(NVCC) -deviceemu $(CFLAGS) parsha256test.cu -o parsha256testemu.o 36 | parsha256_kernelemu.o: parsha256_kernel.cu parsha256.h 37 | $(NVCC) -deviceemu $(CFLAGS) parsha256_kernel.cu -o parsha256_kernelemu.o 38 | 39 | clean: 40 | rm -rf *~ 41 | rm -rf *.o 42 | rm -rf sha1test 43 | rm -rf parsha256test 44 | rm -rf parsha256testemu 45 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMMON_H__ 2 | #define __COMMON_H__ 3 | 4 | /* 5 | * 32-bit integer manipulation macros (big endian) 6 | */ 7 | #ifndef GET_UINT32_BE 8 | #define GET_UINT32_BE(n,b,i)\ 9 | {\ 10 | (n) = ( (unsigned long) (b)[(i) ] << 24 )\ 11 | | ( (unsigned long) (b)[(i) + 1] << 16 )\ 12 | | ( (unsigned long) (b)[(i) + 2] << 8 )\ 13 | | ( (unsigned long) (b)[(i) + 3] );\ 14 | } 15 | #endif 16 | 17 | #ifndef RETURN_UINT32_BE 18 | #define RETURN_UINT32_BE(b,i)\ 19 | (\ 20 | ( (unsigned long) (b)[(i) ] << 24 )\ 21 | | ( (unsigned long) (b)[(i) + 1] << 16 )\ 22 | | ( (unsigned long) (b)[(i) + 2] << 8 )\ 23 | | ( (unsigned long) (b)[(i) + 3] )\ 24 | ) 25 | #endif 26 | 27 | 28 | #ifndef GET_UINT32_BE_GPU 29 | #define GET_UINT32_BE_GPU(n,b,i)\ 30 | {\ 31 | (n) = ( (unsigned long) (b)[(i) + 3] << 24 )\ 32 | | ( (unsigned long) (b)[(i) + 2] << 16 )\ 33 | | ( (unsigned long) (b)[(i) + 1] << 8 )\ 34 | | ( (unsigned long) (b)[(i) ] );\ 35 | } 36 | #endif 37 | 38 | 39 | #ifndef PUT_UINT32_BE 40 | #define PUT_UINT32_BE(n,b,i)\ 41 | {\ 42 | (b)[(i) ] = (unsigned char) ( (n) >> 24 ); \ 43 | (b)[(i) + 1] = (unsigned char) ( (n) >> 16 ); \ 44 | (b)[(i) + 2] = (unsigned char) ( (n) >> 8 ); \ 45 | (b)[(i) + 3] = (unsigned char) ( (n) ); \ 46 | } 47 | #endif 48 | 49 | 50 | #define TRUNCLONG(x) (x) 51 | /* Circular rotation to the right for 32 bit word */ 52 | #define ROTATER(x,n) (((x) >> (n)) | ((x) << (32 - (n)))) 53 | /* Shift to the right */ 54 | #define SHIFTR(x,n) ((x) >> (n)) 55 | 56 | /* Little-Endian to Big-Endian for 32 bit word */ 57 | #define LETOBE32(i) (((i) & 0xff) << 24) + (((i) & 0xff00) << 8) + (((i) & 0xff0000) >> 8) + (((i) >> 24) & 0xff) 58 | /* Return number of 0 bytes to pad */ 59 | #define padding_256(len) (((len) & 0x3f) < 56) ? (56 - ((len) & 0x3f)) : (120 - ((len) & 0x3f)) 60 | 61 | 62 | #endif /* __COMMON_H__ */ 63 | 64 | -------------------------------------------------------------------------------- /src/parsha256_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "parsha256.h" 3 | #include 4 | 5 | #define ch_256(x, y, z) ((x & y) ^ (~x & z)) 6 | #define maj_256(x, y, z) ((x & y) ^ (x & z) ^ (y & z)) 7 | #define Sigma0_256(x) (ROTATER(x, 2) ^ ROTATER(x, 13) ^ ROTATER(x, 22)) 8 | #define Sigma1_256(x) (ROTATER(x, 6) ^ ROTATER(x, 11) ^ ROTATER(x, 25)) 9 | #define sigma0_256(x) (ROTATER(x, 7) ^ ROTATER(x, 18) ^ SHIFTR(x, 3)) 10 | #define sigma1_256(x) (ROTATER(x, 17) ^ ROTATER(x, 19) ^ SHIFTR(x, 10)) 11 | 12 | 13 | /* 14 | * Table of round constants. 
15 | * First 32 bits of the fractional parts of the cube roots of the first 64 primes 2..311 16 | */ 17 | __device__ static const unsigned int K256[] = { 18 | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 19 | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 20 | 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 21 | 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 22 | 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 23 | 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 24 | 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 25 | 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 26 | }; 27 | 28 | 29 | /* 30 | * Process one block 31 | */ 32 | __device__ void sha256 (unsigned char *input, unsigned char *output) 33 | { 34 | unsigned long W[64], a, b, c, d, e, f, g, h; 35 | unsigned long a1, b1, c1, d1, e1, f1, g1, h1; 36 | unsigned long t1, t2; 37 | int t; 38 | 39 | for (t = 0; t < 16; t++) 40 | /* Add 32 because first 8 words are intermediate hash state */ 41 | GET_UINT32_BE(W[t], input, t * 4 + 32); 42 | for (; t < 64; t++) 43 | W[t] = sigma1_256(W[t - 2]) + W[t - 7] + sigma0_256(W[t - 15]) + W[t - 16]; 44 | 45 | /* intermediate hash state */ 46 | GET_UINT32_BE(a, input, 0); 47 | GET_UINT32_BE(b, input, 4); 48 | GET_UINT32_BE(c, input, 8); 49 | GET_UINT32_BE(d, input, 12); 50 | GET_UINT32_BE(e, input, 16); 51 | GET_UINT32_BE(f, input, 20); 52 | GET_UINT32_BE(g, input, 24); 53 | GET_UINT32_BE(h, input, 28); 54 | 55 | a1 = a; 56 | b1 = b; 57 | c1 = c; 58 | d1 = d; 59 | e1 = e; 60 | f1 = f; 61 | g1 = g; 62 | h1 = h; 63 | 64 | for (t = 0; t < 64; t++) { 65 | t1 = h + Sigma1_256(e) + ch_256(e, f, g) + K256[t] + W[t]; 66 | t2 = Sigma0_256(a) + maj_256(a, b, c); 67 | h = g; 68 | g = f; 69 | f = e; 70 | e = d + t1; 71 | d = c; 72 | c = b; 73 | b = a; 74 | a = t1 + t2; 75 | } 76 | 77 | a = a + a1; 78 | b = b + b1; 79 | c = c + c1; 80 | d = d + d1; 81 | e = e + e1; 82 | f = f + f1; 83 | g = g + g1; 84 | h = h + h1; 85 | 86 | PUT_UINT32_BE(a, output, 0); 87 | PUT_UINT32_BE(b, output, 4); 88 | PUT_UINT32_BE(c, output, 8); 89 | PUT_UINT32_BE(d, output, 12); 90 | PUT_UINT32_BE(e, output, 16); 91 | PUT_UINT32_BE(f, output, 20); 92 | PUT_UINT32_BE(g, output, 24); 93 | PUT_UINT32_BE(h, output, 28); 94 | } 95 | 96 | 97 | __global__ void parsha256_kernel (unsigned char *input, unsigned char *output, unsigned long total_threads) 98 | { 99 | unsigned long thread_index = blockIdx.x * blockDim.x + threadIdx.x; 100 | 101 | if (thread_index > total_threads - 1) 102 | return; 103 | 104 | sha256(&input[thread_index * 96], &output[thread_index * 32]); 105 | } 106 | -------------------------------------------------------------------------------- /src/sha1_kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SHA-1 GPU implementation. 
3 | * 2008, Tadas Vilkeliskis 4 | */ 5 | #include 6 | #include "common.h" 7 | 8 | 9 | #define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n))) 10 | #define R(t) \ 11 | temp = extended[block_index + t - 3] ^ extended[block_index + t - 8] ^ \ 12 | extended[block_index + t - 14] ^ extended[block_index + t - 16]; \ 13 | extended[block_index + t] = S(temp,1); \ 14 | 15 | 16 | typedef struct { 17 | unsigned long state[5]; 18 | } sha1_gpu_context; 19 | 20 | /* 21 | * Process extended block. 22 | */ 23 | __device__ void sha1_gpu_process (sha1_gpu_context *ctx, unsigned long W[80]) 24 | { 25 | unsigned long A, B, C, D, E; 26 | A = ctx->state[0]; 27 | B = ctx->state[1]; 28 | C = ctx->state[2]; 29 | D = ctx->state[3]; 30 | E = ctx->state[4]; 31 | 32 | #define P(a,b,c,d,e,x) \ 33 | { \ 34 | e += S(a,5) + F(b,c,d) + K + x; b = S(b,30); \ 35 | } 36 | 37 | 38 | #define F(x,y,z) (z ^ (x & (y ^ z))) 39 | #define K 0x5A827999 40 | 41 | P( A, B, C, D, E, W[0] ); 42 | P( E, A, B, C, D, W[1] ); 43 | P( D, E, A, B, C, W[2] ); 44 | P( C, D, E, A, B, W[3] ); 45 | P( B, C, D, E, A, W[4] ); 46 | P( A, B, C, D, E, W[5] ); 47 | P( E, A, B, C, D, W[6] ); 48 | P( D, E, A, B, C, W[7] ); 49 | P( C, D, E, A, B, W[8] ); 50 | P( B, C, D, E, A, W[9] ); 51 | P( A, B, C, D, E, W[10] ); 52 | P( E, A, B, C, D, W[11] ); 53 | P( D, E, A, B, C, W[12] ); 54 | P( C, D, E, A, B, W[13] ); 55 | P( B, C, D, E, A, W[14] ); 56 | P( A, B, C, D, E, W[15] ); 57 | P( E, A, B, C, D, W[16] ); 58 | P( D, E, A, B, C, W[17] ); 59 | P( C, D, E, A, B, W[18] ); 60 | P( B, C, D, E, A, W[19] ); 61 | 62 | #undef K 63 | #undef F 64 | 65 | #define F(x,y,z) (x ^ y ^ z) 66 | #define K 0x6ED9EBA1 67 | 68 | P( A, B, C, D, E, W[20] ); 69 | P( E, A, B, C, D, W[21] ); 70 | P( D, E, A, B, C, W[22] ); 71 | P( C, D, E, A, B, W[23] ); 72 | P( B, C, D, E, A, W[24] ); 73 | P( A, B, C, D, E, W[25] ); 74 | P( E, A, B, C, D, W[26] ); 75 | P( D, E, A, B, C, W[27] ); 76 | P( C, D, E, A, B, W[28] ); 77 | P( B, C, D, E, A, W[29] ); 78 | P( A, B, C, D, E, W[30] ); 79 | P( E, A, B, C, D, W[31] ); 80 | P( D, E, A, B, C, W[32] ); 81 | P( C, D, E, A, B, W[33] ); 82 | P( B, C, D, E, A, W[34] ); 83 | P( A, B, C, D, E, W[35] ); 84 | P( E, A, B, C, D, W[36] ); 85 | P( D, E, A, B, C, W[37] ); 86 | P( C, D, E, A, B, W[38] ); 87 | P( B, C, D, E, A, W[39] ); 88 | 89 | #undef K 90 | #undef F 91 | 92 | #define F(x,y,z) ((x & y) | (z & (x | y))) 93 | #define K 0x8F1BBCDC 94 | 95 | P( A, B, C, D, E, W[40] ); 96 | P( E, A, B, C, D, W[41] ); 97 | P( D, E, A, B, C, W[42] ); 98 | P( C, D, E, A, B, W[43] ); 99 | P( B, C, D, E, A, W[44] ); 100 | P( A, B, C, D, E, W[45] ); 101 | P( E, A, B, C, D, W[46] ); 102 | P( D, E, A, B, C, W[47] ); 103 | P( C, D, E, A, B, W[48] ); 104 | P( B, C, D, E, A, W[49] ); 105 | P( A, B, C, D, E, W[50] ); 106 | P( E, A, B, C, D, W[51] ); 107 | P( D, E, A, B, C, W[52] ); 108 | P( C, D, E, A, B, W[53] ); 109 | P( B, C, D, E, A, W[54] ); 110 | P( A, B, C, D, E, W[55] ); 111 | P( E, A, B, C, D, W[56] ); 112 | P( D, E, A, B, C, W[57] ); 113 | P( C, D, E, A, B, W[58] ); 114 | P( B, C, D, E, A, W[59] ); 115 | 116 | #undef K 117 | #undef F 118 | 119 | #define F(x,y,z) (x ^ y ^ z) 120 | #define K 0xCA62C1D6 121 | 122 | P( A, B, C, D, E, W[60] ); 123 | P( E, A, B, C, D, W[61] ); 124 | P( D, E, A, B, C, W[62] ); 125 | P( C, D, E, A, B, W[63] ); 126 | P( B, C, D, E, A, W[64] ); 127 | P( A, B, C, D, E, W[65] ); 128 | P( E, A, B, C, D, W[66] ); 129 | P( D, E, A, B, C, W[67] ); 130 | P( C, D, E, A, B, W[68] ); 131 | P( B, C, D, E, A, W[69] ); 132 | P( A, B, C, D, E, W[70] ); 133 | 
P( E, A, B, C, D, W[71] ); 134 | P( D, E, A, B, C, W[72] ); 135 | P( C, D, E, A, B, W[73] ); 136 | P( B, C, D, E, A, W[74] ); 137 | P( A, B, C, D, E, W[75] ); 138 | P( E, A, B, C, D, W[76] ); 139 | P( D, E, A, B, C, W[77] ); 140 | P( C, D, E, A, B, W[78] ); 141 | P( B, C, D, E, A, W[79] ); 142 | 143 | #undef K 144 | #undef F 145 | 146 | ctx->state[0] += A; 147 | ctx->state[1] += B; 148 | ctx->state[2] += C; 149 | ctx->state[3] += D; 150 | ctx->state[4] += E; 151 | } 152 | 153 | __global__ void sha1_kernel_global (unsigned char *data, sha1_gpu_context *ctx, int total_threads, unsigned long *extended) 154 | { 155 | int thread_index = threadIdx.x + blockDim.x * blockIdx.x; 156 | int e_index = thread_index * 80; 157 | int block_index = thread_index * 64; 158 | unsigned long temp, t; 159 | 160 | if (thread_index > total_threads -1) 161 | return; 162 | 163 | /* 164 | * Extend 32 block byte block into 80 byte block. 165 | */ 166 | GET_UINT32_BE( extended[e_index ], data + block_index, 0 ); 167 | GET_UINT32_BE( extended[e_index + 1], data + block_index, 4 ); 168 | GET_UINT32_BE( extended[e_index + 2], data + block_index, 8 ); 169 | GET_UINT32_BE( extended[e_index + 3], data + block_index, 12 ); 170 | GET_UINT32_BE( extended[e_index + 4], data + block_index, 16 ); 171 | GET_UINT32_BE( extended[e_index + 5], data + block_index, 20 ); 172 | GET_UINT32_BE( extended[e_index + 6], data + block_index, 24 ); 173 | GET_UINT32_BE( extended[e_index + 7], data + block_index, 28 ); 174 | GET_UINT32_BE( extended[e_index + 8], data + block_index, 32 ); 175 | GET_UINT32_BE( extended[e_index + 9], data + block_index, 36 ); 176 | GET_UINT32_BE( extended[e_index +10], data + block_index, 40 ); 177 | GET_UINT32_BE( extended[e_index +11], data + block_index, 44 ); 178 | GET_UINT32_BE( extended[e_index +12], data + block_index, 48 ); 179 | GET_UINT32_BE( extended[e_index +13], data + block_index, 52 ); 180 | GET_UINT32_BE( extended[e_index +14], data + block_index, 56 ); 181 | GET_UINT32_BE( extended[e_index +15], data + block_index, 60 ); 182 | 183 | for (t = 16; t < 80; t++) { 184 | temp = extended[e_index + t - 3] ^ extended[e_index + t - 8] ^ 185 | extended[e_index + t - 14] ^ extended[e_index + t - 16]; 186 | extended[e_index + t] = S(temp,1); 187 | } 188 | 189 | /* Wait for the last thread and compute intermediate hash values of extended blocks */ 190 | __syncthreads(); 191 | if (thread_index == total_threads - 1) { 192 | for (t = 0; t < total_threads; t++) 193 | sha1_gpu_process (ctx, (unsigned long*)&extended[t * 80]); 194 | } 195 | } 196 | 197 | -------------------------------------------------------------------------------- /src/sha1_cpu.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SHA-1 CPU implementation 3 | */ 4 | #include 5 | #include 6 | #include "common.h" 7 | 8 | typedef struct { 9 | unsigned long total[2]; /* number of bytes processed */ 10 | unsigned long state[5]; /* intermediate digest state */ 11 | unsigned char buffer[64]; /* data block being processed */ 12 | } sha1_cpu_context; 13 | 14 | 15 | static const unsigned char sha1_padding[64] = 16 | { 17 | 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 21 | }; 22 | 23 | 24 | /* 25 | * Prepare SHA-1 for execution. 
26 | */ 27 | void sha1_cpu_starts(sha1_cpu_context* ctx) 28 | { 29 | ctx->total[0] = 0; 30 | ctx->total[1] = 0; 31 | ctx->state[0] = 0x67452301; 32 | ctx->state[1] = 0xEFCDAB89; 33 | ctx->state[2] = 0x98BADCFE; 34 | ctx->state[3] = 0x10325476; 35 | ctx->state[4] = 0xC3D2E1F0; 36 | } 37 | 38 | 39 | /* 40 | * Process one block of data. 41 | */ 42 | static void sha1_cpu_process(sha1_cpu_context *ctx, unsigned char data[64]) 43 | { 44 | unsigned long temp, W[16]={0,}, A, B, C, D, E; 45 | 46 | GET_UINT32_BE( W[ 0], data, 0 ); 47 | GET_UINT32_BE( W[ 1], data, 4 ); 48 | GET_UINT32_BE( W[ 2], data, 8 ); 49 | GET_UINT32_BE( W[ 3], data, 12 ); 50 | GET_UINT32_BE( W[ 4], data, 16 ); 51 | GET_UINT32_BE( W[ 5], data, 20 ); 52 | GET_UINT32_BE( W[ 6], data, 24 ); 53 | GET_UINT32_BE( W[ 7], data, 28 ); 54 | GET_UINT32_BE( W[ 8], data, 32 ); 55 | GET_UINT32_BE( W[ 9], data, 36 ); 56 | GET_UINT32_BE( W[10], data, 40 ); 57 | GET_UINT32_BE( W[11], data, 44 ); 58 | GET_UINT32_BE( W[12], data, 48 ); 59 | GET_UINT32_BE( W[13], data, 52 ); 60 | GET_UINT32_BE( W[14], data, 56 ); 61 | GET_UINT32_BE( W[15], data, 60 ); 62 | 63 | #define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n))) 64 | 65 | #define R(t) \ 66 | ( \ 67 | temp = W[(t - 3) & 0x0F] ^ W[(t - 8) & 0x0F] ^ \ 68 | W[(t - 14) & 0x0F] ^ W[ t & 0x0F], \ 69 | ( W[t & 0x0F] = S(temp,1) ) \ 70 | ) 71 | 72 | #define P(a,b,c,d,e,x) \ 73 | { \ 74 | e += S(a,5) + F(b,c,d) + K + x; b = S(b,30); \ 75 | } 76 | 77 | A = ctx->state[0]; 78 | B = ctx->state[1]; 79 | C = ctx->state[2]; 80 | D = ctx->state[3]; 81 | E = ctx->state[4]; 82 | 83 | #define F(x,y,z) (z ^ (x & (y ^ z))) 84 | #define K 0x5A827999 85 | 86 | P( A, B, C, D, E, W[0] ); 87 | P( E, A, B, C, D, W[1] ); 88 | P( D, E, A, B, C, W[2] ); 89 | P( C, D, E, A, B, W[3] ); 90 | P( B, C, D, E, A, W[4] ); 91 | P( A, B, C, D, E, W[5] ); 92 | P( E, A, B, C, D, W[6] ); 93 | P( D, E, A, B, C, W[7] ); 94 | P( C, D, E, A, B, W[8] ); 95 | P( B, C, D, E, A, W[9] ); 96 | P( A, B, C, D, E, W[10] ); 97 | P( E, A, B, C, D, W[11] ); 98 | P( D, E, A, B, C, W[12] ); 99 | P( C, D, E, A, B, W[13] ); 100 | P( B, C, D, E, A, W[14] ); 101 | P( A, B, C, D, E, W[15] ); 102 | P( E, A, B, C, D, R(16) ); 103 | P( D, E, A, B, C, R(17) ); 104 | P( C, D, E, A, B, R(18) ); 105 | P( B, C, D, E, A, R(19) ); 106 | 107 | #undef K 108 | #undef F 109 | 110 | #define F(x,y,z) (x ^ y ^ z) 111 | #define K 0x6ED9EBA1 112 | 113 | P( A, B, C, D, E, R(20) ); 114 | P( E, A, B, C, D, R(21) ); 115 | P( D, E, A, B, C, R(22) ); 116 | P( C, D, E, A, B, R(23) ); 117 | P( B, C, D, E, A, R(24) ); 118 | P( A, B, C, D, E, R(25) ); 119 | P( E, A, B, C, D, R(26) ); 120 | P( D, E, A, B, C, R(27) ); 121 | P( C, D, E, A, B, R(28) ); 122 | P( B, C, D, E, A, R(29) ); 123 | P( A, B, C, D, E, R(30) ); 124 | P( E, A, B, C, D, R(31) ); 125 | P( D, E, A, B, C, R(32) ); 126 | P( C, D, E, A, B, R(33) ); 127 | P( B, C, D, E, A, R(34) ); 128 | P( A, B, C, D, E, R(35) ); 129 | P( E, A, B, C, D, R(36) ); 130 | P( D, E, A, B, C, R(37) ); 131 | P( C, D, E, A, B, R(38) ); 132 | P( B, C, D, E, A, R(39) ); 133 | 134 | #undef K 135 | #undef F 136 | 137 | #define F(x,y,z) ((x & y) | (z & (x | y))) 138 | #define K 0x8F1BBCDC 139 | 140 | P( A, B, C, D, E, R(40) ); 141 | P( E, A, B, C, D, R(41) ); 142 | P( D, E, A, B, C, R(42) ); 143 | P( C, D, E, A, B, R(43) ); 144 | P( B, C, D, E, A, R(44) ); 145 | P( A, B, C, D, E, R(45) ); 146 | P( E, A, B, C, D, R(46) ); 147 | P( D, E, A, B, C, R(47) ); 148 | P( C, D, E, A, B, R(48) ); 149 | P( B, C, D, E, A, R(49) ); 150 | P( A, B, C, D, E, R(50) ); 151 
| P( E, A, B, C, D, R(51) ); 152 | P( D, E, A, B, C, R(52) ); 153 | P( C, D, E, A, B, R(53) ); 154 | P( B, C, D, E, A, R(54) ); 155 | P( A, B, C, D, E, R(55) ); 156 | P( E, A, B, C, D, R(56) ); 157 | P( D, E, A, B, C, R(57) ); 158 | P( C, D, E, A, B, R(58) ); 159 | P( B, C, D, E, A, R(59) ); 160 | 161 | #undef K 162 | #undef F 163 | 164 | #define F(x,y,z) (x ^ y ^ z) 165 | #define K 0xCA62C1D6 166 | 167 | P( A, B, C, D, E, R(60) ); 168 | P( E, A, B, C, D, R(61) ); 169 | P( D, E, A, B, C, R(62) ); 170 | P( C, D, E, A, B, R(63) ); 171 | P( B, C, D, E, A, R(64) ); 172 | P( A, B, C, D, E, R(65) ); 173 | P( E, A, B, C, D, R(66) ); 174 | P( D, E, A, B, C, R(67) ); 175 | P( C, D, E, A, B, R(68) ); 176 | P( B, C, D, E, A, R(69) ); 177 | P( A, B, C, D, E, R(70) ); 178 | P( E, A, B, C, D, R(71) ); 179 | P( D, E, A, B, C, R(72) ); 180 | P( C, D, E, A, B, R(73) ); 181 | P( B, C, D, E, A, R(74) ); 182 | P( A, B, C, D, E, R(75) ); 183 | P( E, A, B, C, D, R(76) ); 184 | P( D, E, A, B, C, R(77) ); 185 | P( C, D, E, A, B, R(78) ); 186 | P( B, C, D, E, A, R(79) ); 187 | 188 | #undef K 189 | #undef F 190 | 191 | ctx->state[0] += A; 192 | ctx->state[1] += B; 193 | ctx->state[2] += C; 194 | ctx->state[3] += D; 195 | ctx->state[4] += E; 196 | } 197 | 198 | 199 | /* 200 | * Splits input message into blocks and processes them one by one. Also 201 | * checks how many 0 need to be padded and processes the last, padded, block. 202 | */ 203 | void sha1_cpu_update(sha1_cpu_context *ctx, unsigned char *input, int ilen) 204 | { 205 | int fill; 206 | unsigned long left; 207 | 208 | if ( ilen <= 0 ) 209 | return; 210 | 211 | left = ctx->total[0] & 0x3F; 212 | fill = 64 - left; 213 | 214 | ctx->total[0] += ilen; 215 | ctx->total[0] &= 0xFFFFFFFF; 216 | 217 | if (ctx->total[0] < (unsigned long) ilen) 218 | ctx->total[1]++; 219 | 220 | if ( left && ilen >= fill ) { 221 | memcpy((void *) (ctx->buffer + left), (void *) input, fill); 222 | sha1_cpu_process(ctx, ctx->buffer); 223 | input += fill; 224 | ilen -= fill; 225 | left = 0; 226 | } 227 | 228 | while ( ilen >= 64 ) { 229 | sha1_cpu_process(ctx, input); 230 | input += 64; 231 | ilen -= 64; 232 | } 233 | 234 | if ( ilen > 0 ) { 235 | memcpy( (void *) (ctx->buffer + left), (void *) input, ilen ); 236 | } 237 | } 238 | 239 | 240 | /* 241 | * Process padded block and return hash to user. 242 | */ 243 | void sha1_cpu_finish(sha1_cpu_context *ctx, unsigned char *output) 244 | { 245 | unsigned long last, padn; 246 | unsigned long high, low; 247 | unsigned char msglen[8]; 248 | 249 | 250 | high = (ctx->total[0] >> 29) | (ctx->total[1] << 3); 251 | low = (ctx->total[0] << 3); 252 | 253 | PUT_UINT32_BE(high, msglen, 0); 254 | PUT_UINT32_BE(low, msglen, 4); 255 | 256 | last = ctx->total[0] & 0x3F; 257 | padn = (last < 56 ) ? 
( 56 - last ) : ( 120 - last); 258 | 259 | sha1_cpu_update(ctx, (unsigned char *) sha1_padding, padn); 260 | sha1_cpu_update(ctx, msglen, 8); 261 | 262 | PUT_UINT32_BE(ctx->state[0], output, 0); 263 | PUT_UINT32_BE(ctx->state[1], output, 4); 264 | PUT_UINT32_BE(ctx->state[2], output, 8); 265 | PUT_UINT32_BE(ctx->state[3], output, 12); 266 | PUT_UINT32_BE(ctx->state[4], output, 16); 267 | } 268 | 269 | 270 | /* 271 | * Execute SHA-1 272 | */ 273 | void sha1_cpu(unsigned char *input, int ilen, unsigned char *output) { 274 | sha1_cpu_context ctx; 275 | 276 | sha1_cpu_starts( &ctx ); 277 | sha1_cpu_update( &ctx, input, ilen ); 278 | sha1_cpu_finish( &ctx, output ); 279 | 280 | memset( &ctx, 0, sizeof( sha1_cpu_context ) ); 281 | } 282 | 283 | -------------------------------------------------------------------------------- /src/sha1test.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SHA-1 benchmark program. Calculates execution time of SHA-1 on CPU and GPU. 3 | * Also includes function sha1_gpu_global() which prepares SHA-1 to be executed 4 | * on GPU. 5 | * 6 | * 2008, Tadas Vilkeliskis 7 | */ 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "common.h" 13 | 14 | #define MAX_THREADS_PER_BLOCK 128 15 | 16 | typedef struct { 17 | unsigned long state[5]; 18 | } sha1_gpu_context; 19 | 20 | 21 | typedef struct { 22 | unsigned const char *data; 23 | unsigned const char *hash; 24 | } testvector; 25 | 26 | 27 | typedef struct { 28 | unsigned int kernel_timer; /* time spent in kernel */ 29 | unsigned int malloc_timer; /* how much time we spend allocating memory */ 30 | unsigned int memcpy_timer; /* how much time we spend copying from host to device */ 31 | unsigned int free_timer; /* how much time we spend releasing memory */ 32 | } chronometer; 33 | 34 | /* timers used to check performance */ 35 | chronometer chmeter = {0, 0, 0, 0}; 36 | 37 | extern void sha1_cpu (unsigned char *input, int ilen, unsigned char *output); 38 | extern __global__ void sha1_kernel_global (unsigned char *data, sha1_gpu_context *ctx, int total_threads, unsigned long *extended); 39 | 40 | /* 41 | * Run sha1 kernel on GPU 42 | * input - message 43 | * size - message size 44 | * output - buffer to store hash value 45 | * proc - maximum threads per block 46 | */ 47 | void sha1_gpu_global (unsigned char *input, unsigned long size, unsigned char *output, int proc) 48 | { 49 | int total_threads; /* Total number of threads in the grid */ 50 | int blocks_per_grid; /* Number of blocks in the grid */ 51 | int threads_per_block; /* Number of threads in a block */ 52 | int pad, size_be; /* Number of zeros to pad, message size in big-enadian. 
*/ 53 | int total_datablocks; /* Total number of blocks message is split into */ 54 | int i, k; /* Temporary variables */ 55 | unsigned char *d_message; /* Input message on the device */ 56 | unsigned long *d_extended; /* Extended blocks on the device */ 57 | sha1_gpu_context ctx, *d_ctx; /* Intermediate hash states */ 58 | 59 | /* Initialization vector for SHA-1 */ 60 | ctx.state[0] = 0x67452301; 61 | ctx.state[1] = 0xEFCDAB89; 62 | ctx.state[2] = 0x98BADCFE; 63 | ctx.state[3] = 0x10325476; 64 | ctx.state[4] = 0xC3D2E1F0; 65 | 66 | pad = padding_256 (size); 67 | threads_per_block = proc; 68 | blocks_per_grid = 1; 69 | /* How many blocks in the message */ 70 | total_datablocks = (size + pad + 8) / 64; 71 | 72 | if (total_datablocks > threads_per_block) 73 | total_threads = threads_per_block; 74 | else 75 | total_threads = total_datablocks; 76 | 77 | size_be = LETOBE32 (size * 8); 78 | 79 | /* Allocate enough memory on the device */ 80 | CUT_SAFE_CALL (cutResetTimer (chmeter.malloc_timer)); 81 | CUT_SAFE_CALL (cutStartTimer (chmeter.malloc_timer)); 82 | cudaMalloc ((void**)&d_extended, proc * 80 * sizeof(unsigned long)); 83 | CUT_CHECK_ERROR ("d_extended malloc failed"); 84 | cudaMalloc ((void**)&d_message, size + pad + 8); 85 | CUT_CHECK_ERROR ("d_message malloc failed"); 86 | cudaMalloc ((void**)&d_ctx, sizeof (sha1_gpu_context)); 87 | CUT_CHECK_ERROR ("d_ctx malloc failed"); 88 | CUT_SAFE_CALL (cutStopTimer (chmeter.malloc_timer)); 89 | CUT_SAFE_CALL (cutResetTimer (chmeter.memcpy_timer)); 90 | 91 | /* 92 | * Copy the data from host to device and perform padding 93 | */ 94 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 95 | cudaMemcpy (d_ctx, &ctx, sizeof (sha1_gpu_context), cudaMemcpyHostToDevice); 96 | cudaMemcpy (d_message, input, size, cudaMemcpyHostToDevice); 97 | cudaMemset (d_message + size, 0x80, 1); 98 | cudaMemset (d_message + size + 1, 0, pad + 7); 99 | cudaMemcpy (d_message + size + pad + 4, &size_be, 4, cudaMemcpyHostToDevice); 100 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 101 | 102 | /* 103 | * Run the algorithm 104 | */ 105 | i = 0; 106 | k = total_datablocks / total_threads; 107 | CUT_SAFE_CALL (cutResetTimer (chmeter.kernel_timer)); 108 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 109 | if (k - 1 > 0) { 110 | /* 111 | * Kernel is executed multiple times and only one block in the grid is used. 112 | * Since thread synchronization is allowed only within a block. 
113 | */ 114 | for (i = 0; i < k; i++) { 115 | sha1_kernel_global <<<blocks_per_grid, threads_per_block>>>(d_message + threads_per_block * i * 64, d_ctx, threads_per_block, d_extended); 116 | CUT_CHECK_ERROR ("Kernel execution failed"); 117 | /* 118 | * Here I do not perform thread synchronization 119 | * since threads are synchronized in the kernel 120 | */ 121 | } 122 | } 123 | threads_per_block = total_datablocks - (i * total_threads); 124 | sha1_kernel_global <<<blocks_per_grid, threads_per_block>>>(d_message + total_threads * i * 64, d_ctx, threads_per_block, d_extended); 125 | CUT_CHECK_ERROR ("Kernel execution failed"); 126 | 127 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 128 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 129 | cudaMemcpy (&ctx, d_ctx, sizeof(sha1_gpu_context), cudaMemcpyDeviceToHost); 130 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 131 | 132 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 133 | /* Put the hash value in the user's buffer */ 134 | PUT_UINT32_BE( ctx.state[0], output, 0 ); 135 | PUT_UINT32_BE( ctx.state[1], output, 4 ); 136 | PUT_UINT32_BE( ctx.state[2], output, 8 ); 137 | PUT_UINT32_BE( ctx.state[3], output, 12 ); 138 | PUT_UINT32_BE( ctx.state[4], output, 16 ); 139 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 140 | 141 | CUT_SAFE_CALL (cutResetTimer (chmeter.free_timer)); 142 | CUT_SAFE_CALL (cutStartTimer (chmeter.free_timer)); 143 | cudaFree (d_message); 144 | cudaFree (d_ctx); 145 | cudaFree (d_extended); 146 | CUT_SAFE_CALL (cutStopTimer (chmeter.free_timer)); 147 | } 148 | 149 | 150 | int main(int argc, char *argv[]) 151 | { 152 | testvector tv1 = { 153 | (unsigned char *) "abc", 154 | (unsigned char *) "\xa9\x99\x3e\x36\x47\x06\x81\x6a\xba\x3e\x25\x71\x78\x50\xc2\x6c\x9c\xd0\xd8\x9d" 155 | }; 156 | testvector tv2 = { 157 | (unsigned char *) "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 158 | (unsigned char *) "\x84\x98\x3e\x44\x1c\x3b\xd2\x6e\xba\xae\x4a\xa1\xf9\x51\x29\xe5\xe5\x46\x70\xf1" 159 | }; 160 | unsigned char hash[20]; 161 | unsigned char *data = NULL; 162 | int i; 163 | int max_threads_per_block = MAX_THREADS_PER_BLOCK; 164 | 165 | printf ("===================================\n"); 166 | printf ("SHA-1 HASH ALGORITHM BENCHMARK TEST\n"); 167 | printf ("===================================\n"); 168 | 169 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.kernel_timer)); 170 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.malloc_timer)); 171 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.memcpy_timer)); 172 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.free_timer)); 173 | 174 | printf ("\nTesting algorithm correctness...\n"); 175 | 176 | sha1_cpu ((unsigned char*)tv1.data, strlen((const char*)tv1.data), hash); 177 | if (memcmp (hash, tv1.hash, 20) == 0) printf ("CPU TEST 1 PASSED\n"); 178 | else printf ("CPU TEST 1 FAILED\n"); 179 | 180 | sha1_gpu_global ((unsigned char*)tv1.data, strlen((const char*)tv1.data), hash, max_threads_per_block); 181 | if (memcmp (hash, tv1.hash, 20) == 0) printf ("GPU TEST 1 PASSED\n"); 182 | else printf ("GPU TEST 1 FAILED\n"); 183 | 184 | sha1_cpu ((unsigned char*)tv2.data, strlen((const char*)tv2.data), hash); 185 | if (memcmp (hash, tv2.hash, 20) == 0) printf ("CPU TEST 2 PASSED\n"); 186 | else printf ("CPU TEST 2 FAILED\n"); 187 | 188 | sha1_gpu_global ((unsigned char*)tv2.data, strlen((const char*)tv2.data), hash, max_threads_per_block); 189 | if (memcmp (hash, tv2.hash, 20) == 0) printf ("GPU TEST 2 PASSED\n"); 190 | else printf ("GPU TEST 2 FAILED\n"); 191 | 192 | printf
("Done.\n\n"); 193 | printf ("\tSIZE EXEC KERNEL\tcudaMemcpy\tcudaMalloc\tcudaFree\n"); 194 | 195 | for (i = 1000; i < 100000000; i = i * 10) { 196 | data = (unsigned char *) malloc (i); 197 | if (data == NULL) { 198 | printf ("ERROR: Insufficient memory on host\n"); 199 | return -1; 200 | } 201 | 202 | CUT_SAFE_CALL (cutResetTimer (chmeter.kernel_timer)); 203 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 204 | sha1_cpu (data, i, hash); 205 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 206 | printf ("CPU\t%-10d%f\n", i, cutGetTimerValue (chmeter.kernel_timer)); 207 | 208 | CUT_SAFE_CALL (cutResetTimer (chmeter.kernel_timer)); 209 | CUT_SAFE_CALL (cutResetTimer (chmeter.malloc_timer)); 210 | CUT_SAFE_CALL (cutResetTimer (chmeter.memcpy_timer)); 211 | CUT_SAFE_CALL (cutResetTimer (chmeter.free_timer)); 212 | memset (hash, 0, 20); 213 | 214 | sha1_gpu_global (data, i, hash, max_threads_per_block); 215 | printf ("GPU\t%-10d%f\t%f\t%f\t%f\n", i, 216 | cutGetTimerValue (chmeter.kernel_timer), 217 | cutGetTimerValue (chmeter.memcpy_timer), 218 | cutGetTimerValue (chmeter.malloc_timer), 219 | cutGetTimerValue (chmeter.free_timer)); 220 | free (data); 221 | } 222 | 223 | return 0; 224 | } 225 | -------------------------------------------------------------------------------- /src/parsha256test.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * PARSHA-256 benchmark program. Calculates execution time of PARSHA-256 on CPU and GPU. 3 | * Also includes function parsha256_gpu which prepares PARSHA-256 to executes on GPU and 4 | * executes it. 5 | * 6 | * 2008, Tadas Vilkeliskis 7 | */ 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "parsha256.h" 13 | 14 | typedef struct { 15 | unsigned int kernel_timer; /* execution time of the kernel */ 16 | unsigned int malloc_timer; /* time spent on memory allocation */ 17 | unsigned int memcpy_timer; /* time spent on copying memory from hsot to device and vise versa */ 18 | unsigned int free_timer; /* time spent on memory deallocation */ 19 | } chronometer; 20 | 21 | chronometer chmeter = {0, 0, 0, 0}; 22 | 23 | extern __global__ void parsha256_kernel (unsigned char *input, unsigned char *output, unsigned long total_threads); 24 | 25 | void parsha256_gpu (unsigned char *input, unsigned long size, unsigned char *output) 26 | { 27 | unsigned long t; /* effective tree height */ 28 | unsigned char *d_input; /* input buffer on device */ 29 | unsigned char *d_output; /* intermediate hash states */ 30 | int total_threads; /* Total number of threads in the grid */ 31 | int threads_per_block = 128; /* Maximum number of threads per block */ 32 | int total_blocks; /* Total blocks in the grid */ 33 | unsigned char *buffer_ptr; /* Pointer to input buffer */ 34 | unsigned long bytes_read = 0; /* Bytes read from the input */ 35 | unsigned long q, r, b, s, k; 36 | int l1, K1, L1; 37 | /* 38 | * Initialization vector. Length 256 bits. Since reference machine is using 64 bit words 39 | * char array was used instead of word array. I was experiencing some problems with words. 
40 | */ 41 | const unsigned char IV[32] = {0x67, 0xe6, 0x09, 0x6a, 42 | 0x85, 0xae, 0x67, 0xbb, 43 | 0x72, 0xf3, 0x6e, 0x3c, 44 | 0x3a, 0xf5, 0x4f, 0xa5, 45 | 0x7f, 0x52, 0x0e, 0x51, 46 | 0x8c, 0x68, 0x05, 0x9b, 47 | 0xab, 0xd9, 0x83, 0x1f, 48 | 0x19, 0xcd, 0xe0, 0x5b}; 49 | /* Few temporary variables */ 50 | int i, j; 51 | unsigned long tmp1, tmp2; 52 | 53 | size = size * 8; /* bytes to bits */ 54 | 55 | if (size <= 160 * 8) { 56 | /* 57 | * if L <= delta0 = n - l, then return h(h(x||0^(n-l-L)||IV)||bin_(n-m)(L)) 58 | * */ 59 | printf ("Not implemented for size less than %d bits\n", 160 * 8); 60 | return; 61 | } 62 | 63 | /* BEGIN INITIALIZATION */ 64 | /* Determine effective tree height */ 65 | if (size >= DELTA(TREE_SIZE)) 66 | t = TREE_SIZE; 67 | else { 68 | for (i = TREE_SIZE - 1; i >= 1; i--) 69 | if (DELTA(i) <= size && size < DELTA(i + 1)) { 70 | t = i; 71 | i = 0; /* break the loop */ 72 | } 73 | } 74 | 75 | /* Find other parameters needed to complete computation */ 76 | q = r = 0; 77 | if (size > DELTA(t)) { 78 | q = (size - DELTA(t)) / LAMDA(t); 79 | r = (size - DELTA(t)) % LAMDA(t); 80 | if (r == 0) { 81 | q--; 82 | r = LAMDA(t); 83 | } 84 | } 85 | 86 | b = r / (2 * PARSHA256_BLOCK_SIZE - 2 * PARSHA256_HASH_SIZE - PARSHA256_IV_SIZE); 87 | if (r % (2 * PARSHA256_BLOCK_SIZE - 2 * PARSHA256_HASH_SIZE - PARSHA256_IV_SIZE)) 88 | b++; 89 | 90 | /* Total number of processors for the first round */ 91 | total_threads = POW2(t); 92 | #if 0 93 | #ifdef _DEBUG 94 | printf ("tree size: %d\n", t); 95 | printf ("total threads: %d\n", total_threads); 96 | printf ("q, r, b: %d %d %d\n", q, r, b); 97 | #endif 98 | #endif 99 | CUT_SAFE_CALL (cutResetTimer (chmeter.malloc_timer)); 100 | CUT_SAFE_CALL (cutStartTimer (chmeter.malloc_timer)); 101 | /* Allocate enough memory on the device */ 102 | cudaMalloc ((void**)&d_input, total_threads * PARSHA256_768BITSB); 103 | CUT_CHECK_ERROR ("Memory allocation failed"); 104 | cudaMalloc ((void**)&d_output, total_threads * PARSHA256_256BITSB); 105 | CUT_CHECK_ERROR ("Memory allocation failed"); 106 | CUT_SAFE_CALL (cutStopTimer (chmeter.malloc_timer)); 107 | 108 | /* END INITIALIZATION */ 109 | 110 | /* BEGIN FIRST ROUND */ 111 | buffer_ptr = input; 112 | CUT_SAFE_CALL (cutResetTimer (chmeter.memcpy_timer)); 113 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 114 | for (i = 0; i < total_threads; i++) { 115 | /* Copy 512 bits */ 116 | cudaMemcpy(d_input + i * PARSHA256_768BITSB, buffer_ptr, PARSHA256_512BITSB, 117 | cudaMemcpyHostToDevice); 118 | CUT_CHECK_ERROR ("Memory copy failed"); 119 | /* Add 256 bits of IV */ 120 | cudaMemcpy(d_input + i * PARSHA256_768BITSB + PARSHA256_512BITSB, 121 | (unsigned char *)&IV, PARSHA256_256BITSB, cudaMemcpyHostToDevice); 122 | CUT_CHECK_ERROR ("Memory copy failed"); 123 | buffer_ptr += PARSHA256_512BITSB; 124 | bytes_read += PARSHA256_512BITSB; 125 | } 126 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 127 | 128 | /* execute kernel */ 129 | total_blocks = total_threads / threads_per_block + (total_threads % threads_per_block == 0 ? 
0 : 1); 130 | #if 0 131 | #ifdef _DEBUG 132 | printf ("bytes read: %d\n", bytes_read); 133 | printf ("total blocks: %d\n", total_blocks); 134 | printf ("total_threads: %d\n", total_threads); 135 | printf ("threads_per_block: %d\n", threads_per_block); 136 | #endif 137 | #endif 138 | CUT_SAFE_CALL (cutResetTimer (chmeter.kernel_timer)); 139 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 140 | parsha256_kernel <<>> (d_input, d_output, total_threads); 141 | CUT_CHECK_ERROR ("Kernel execution failed"); 142 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 143 | 144 | /* END FIRST ROUND */ 145 | /* BEGIN STEADY STATE */ 146 | tmp2 = q + 1; 147 | for (i = 2; i <= tmp2; i++) { 148 | tmp1 = POW2 (t - 1) - 1; 149 | 150 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 151 | 152 | for (j = 0; j <= tmp1; j++) { 153 | /* Copy intermediate hash states */ 154 | cudaMemcpy (d_input + j * PARSHA256_768BITSB, d_output + j * PARSHA256_512BITSB, 155 | PARSHA256_256BITSB, cudaMemcpyDeviceToDevice); 156 | CUT_CHECK_ERROR ("Memory copy failed"); 157 | cudaMemcpy (d_input + j * PARSHA256_768BITSB + PARSHA256_256BITSB, 158 | d_output + j * PARSHA256_512BITSB + PARSHA256_256BITSB, 159 | PARSHA256_256BITSB, 160 | cudaMemcpyDeviceToDevice); 161 | CUT_CHECK_ERROR ("Memory copy failed"); 162 | /* Copy 256 bits from input message */ 163 | cudaMemcpy (d_input + j * PARSHA256_768BITSB + PARSHA256_512BITSB, buffer_ptr, 164 | PARSHA256_256BITSB, cudaMemcpyHostToDevice); 165 | buffer_ptr += PARSHA256_256BITSB; 166 | bytes_read += PARSHA256_256BITSB; 167 | } 168 | 169 | tmp1 = POW2 (t) - 1; 170 | for (j = POW2 (t - 1); j <= tmp1; j++) { 171 | /* Copy 512 bits */ 172 | cudaMemcpy(d_input + j * PARSHA256_768BITSB, buffer_ptr, PARSHA256_512BITSB, 173 | cudaMemcpyHostToDevice); 174 | CUT_CHECK_ERROR ("Memory copy failed"); 175 | /* Add 256 bits of IV */ 176 | cudaMemcpy(d_input + j * PARSHA256_768BITSB + PARSHA256_512BITSB, 177 | (unsigned char *)&IV, PARSHA256_256BITSB, cudaMemcpyHostToDevice); 178 | CUT_CHECK_ERROR ("Memory copy failed"); 179 | buffer_ptr += PARSHA256_512BITSB; 180 | bytes_read += PARSHA256_512BITSB; 181 | } 182 | 183 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 184 | 185 | /* execute kernel */ 186 | total_blocks = total_threads / threads_per_block + (total_threads % threads_per_block == 0 ? 
0 : 1); 187 | #if 0 188 | #ifdef _DEBUG 189 | printf ("bytes read (steady state): %d\n", bytes_read); 190 | printf ("total blocks: %d\n", total_blocks); 191 | printf ("total_threads: %d\n", total_threads); 192 | printf ("threads_per_block: %d\n", threads_per_block); 193 | #endif 194 | #endif 195 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 196 | parsha256_kernel <<>> (d_input, d_output, total_threads); 197 | CUT_CHECK_ERROR ("Kernel execution failed"); 198 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 199 | } 200 | 201 | tmp1 = POW2(t - 1) - 1; 202 | total_threads = POW2(t - 1) + b - 1; 203 | 204 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 205 | 206 | for (i = 0; i <= tmp1; i++) { 207 | /* Copy intermediate hash states */ 208 | cudaMemcpy (d_input + i * PARSHA256_768BITSB, d_output + i * PARSHA256_512BITSB, 209 | PARSHA256_256BITSB, cudaMemcpyDeviceToDevice); 210 | CUT_CHECK_ERROR ("Memory copy failed"); 211 | cudaMemcpy (d_input + i * PARSHA256_768BITSB + PARSHA256_256BITSB, 212 | d_output + i * PARSHA256_512BITSB + PARSHA256_256BITSB, 213 | PARSHA256_256BITSB, 214 | cudaMemcpyDeviceToDevice); 215 | CUT_CHECK_ERROR ("Memory copy failed"); 216 | /* Copy 256 bits from input message */ 217 | cudaMemcpy (d_input + i * PARSHA256_768BITSB + PARSHA256_512BITSB, buffer_ptr, 218 | PARSHA256_256BITSB, cudaMemcpyHostToDevice); 219 | buffer_ptr += PARSHA256_256BITSB; 220 | bytes_read += PARSHA256_256BITSB; 221 | } 222 | 223 | for (i = POW2(t - 1); i <= total_threads; i++) { 224 | /* Copy 512 bits */ 225 | cudaMemcpy(d_input + i * PARSHA256_768BITSB, buffer_ptr, PARSHA256_512BITSB, 226 | cudaMemcpyHostToDevice); 227 | CUT_CHECK_ERROR ("Memory copy failed"); 228 | /* Add 256 bits of IV */ 229 | cudaMemcpy(d_input + i * PARSHA256_768BITSB + PARSHA256_512BITSB, 230 | (unsigned char *)&IV, PARSHA256_256BITSB, cudaMemcpyHostToDevice); 231 | CUT_CHECK_ERROR ("Memory copy failed"); 232 | buffer_ptr += PARSHA256_512BITSB; 233 | bytes_read += PARSHA256_512BITSB; 234 | } 235 | 236 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 237 | 238 | /* execute kernel */ 239 | total_blocks = total_threads / threads_per_block + (total_threads % threads_per_block == 0 ? 
0 : 1); 240 | #if 0 241 | #ifdef _DEBUG 242 | printf ("bytes read (end game): %d\n", bytes_read); 243 | printf ("total blocks: %d\n", total_blocks); 244 | printf ("total_threads: %d\n", total_threads); 245 | printf ("threads_per_block: %d\n", threads_per_block); 246 | #endif 247 | #endif 248 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 249 | parsha256_kernel <<>> (d_input, d_output, total_threads); 250 | CUT_CHECK_ERROR ("Kernel execution failed"); 251 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 252 | 253 | /* BEGIN FLUSHING */ 254 | tmp1 = q + t + 1; 255 | size = size / 8; /* back to bytes */ 256 | for (i = q + 3; i <= tmp1; i++) { 257 | s = q + t + 2 - i; 258 | k = (b - 1 + POW2 (t - s - 1)) / POW2 (t - s); 259 | l1 = (b - 1 + POW2 (t - s)) / POW2 (t - s); 260 | K1 = POW2 (s - 1) + k; 261 | L1 = POW2 (s - 1) + l1; 262 | 263 | /* zero out the buffer for padding I guess */ 264 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 265 | cudaMemset(d_input, 0, K1 * PARSHA256_256BITSB); 266 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 267 | tmp2 = K1 - 1; 268 | 269 | if (size - bytes_read >= K1 * PARSHA256_256BITSB) 270 | bytes_read += (K1 * PARSHA256_256BITSB); 271 | else 272 | bytes_read += (size - bytes_read); 273 | 274 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 275 | for (j = 0; j <= tmp2; j++) { 276 | /* Copy intermediate hash states */ 277 | cudaMemcpy (d_input + j * PARSHA256_768BITSB, d_output + j * PARSHA256_512BITSB, 278 | PARSHA256_256BITSB, cudaMemcpyDeviceToDevice); 279 | CUT_CHECK_ERROR ("Memory copy failed"); 280 | cudaMemcpy (d_input + j * PARSHA256_768BITSB + PARSHA256_256BITSB, 281 | d_output + j * PARSHA256_512BITSB + PARSHA256_256BITSB, 282 | PARSHA256_256BITSB, 283 | cudaMemcpyDeviceToDevice); 284 | CUT_CHECK_ERROR ("Memory copy failed"); 285 | /* Copy 256 bits from input message */ 286 | cudaMemcpy (d_input + j * PARSHA256_768BITSB + PARSHA256_512BITSB, buffer_ptr, 287 | PARSHA256_256BITSB, cudaMemcpyHostToDevice); 288 | buffer_ptr += PARSHA256_256BITSB; 289 | } 290 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 291 | 292 | /* execute the kernel */ 293 | total_threads = K1; 294 | total_blocks = total_threads / threads_per_block + (total_threads % threads_per_block == 0 ? 
0 : 1); 295 | #if 0 296 | #ifdef _DEBUG 297 | printf ("bytes readi (flushing): %d\n", bytes_read); 298 | printf ("total blocks: %d\n", total_blocks); 299 | printf ("total_threads: %d\n", total_threads); 300 | printf ("threads_per_block: %d\n", threads_per_block); 301 | #endif 302 | #endif 303 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 304 | parsha256_kernel <<>> (d_input, d_output, total_threads); 305 | CUT_CHECK_ERROR ("Kernel execution failed"); 306 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 307 | 308 | tmp2 = L1 - 1; 309 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 310 | for (j = K1; j <= tmp2; j++) { 311 | cudaMemcpy (d_output + j * PARSHA256_256BITSB, d_output + j * PARSHA256_512BITSB, 312 | PARSHA256_256BITSB, cudaMemcpyDeviceToDevice); 313 | } 314 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 315 | } 316 | 317 | total_blocks = 1; 318 | total_threads = 1; 319 | if (b > 0) { 320 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 321 | cudaMemset (d_input, 0, PARSHA256_768BITSB); 322 | /* Copy intermediate hash states */ 323 | cudaMemcpy (d_input, d_output, PARSHA256_256BITSB, cudaMemcpyDeviceToDevice); 324 | CUT_CHECK_ERROR ("Memory copy failed"); 325 | cudaMemcpy (d_input + PARSHA256_256BITSB, d_output + PARSHA256_256BITSB, 326 | PARSHA256_256BITSB, cudaMemcpyDeviceToDevice); 327 | CUT_CHECK_ERROR ("Memory copy failed"); 328 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 329 | 330 | if (size - bytes_read >= PARSHA256_256BITSB) { 331 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 332 | cudaMemcpy (d_input + PARSHA256_512BITSB, buffer_ptr, PARSHA256_256BITSB, 333 | cudaMemcpyHostToDevice); 334 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 335 | buffer_ptr += PARSHA256_256BITSB; 336 | bytes_read += PARSHA256_256BITSB; 337 | } else { 338 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 339 | cudaMemcpy (d_input + PARSHA256_512BITSB, buffer_ptr, size - bytes_read, 340 | cudaMemcpyHostToDevice); 341 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 342 | bytes_read += (size - bytes_read); 343 | } 344 | 345 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 346 | parsha256_kernel <<>> (d_input, d_output, total_threads); 347 | CUT_CHECK_ERROR ("Kernel execution failed"); 348 | // cudaThreadSynchronize(); 349 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 350 | } 351 | 352 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 353 | cudaMemset (d_output + PARSHA256_256BITSB, 0, PARSHA256_512BITSB - 8); 354 | cudaMemcpy (d_input, d_output, PARSHA256_768BITSB, cudaMemcpyDeviceToDevice); 355 | size = size * 8; 356 | /* 357 | * The following line should fail on 32 bit machines. Since reference machine I 358 | * am writing this code on uses 64 bit words thus size of int is 8 bytes. 
359 | */ 360 | cudaMemcpy (d_input + PARSHA256_768BITSB - 8, &size, 8, cudaMemcpyHostToDevice); 361 | 362 | /* Hash one more time */ 363 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 364 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 365 | parsha256_kernel <<>> (d_input, d_output, 1); 366 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 367 | 368 | /* And we are done here */ 369 | cudaMemcpy (output, d_output, PARSHA256_256BITSB, cudaMemcpyDeviceToHost); 370 | 371 | CUT_SAFE_CALL (cutResetTimer (chmeter.free_timer)); 372 | CUT_SAFE_CALL (cutStartTimer (chmeter.free_timer)); 373 | cudaFree (d_input); 374 | cudaFree (d_output); 375 | CUT_SAFE_CALL (cutStopTimer (chmeter.free_timer)); 376 | } 377 | 378 | int main (int argc, char **argv) 379 | { 380 | unsigned char *buffer; 381 | unsigned int size; 382 | unsigned char output[32]; 383 | 384 | printf ("========================================\n"); 385 | printf ("PARSHA-256 HASH ALGORITHM BENCHMARK TEST\n"); 386 | printf ("========================================\n\n"); 387 | 388 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.kernel_timer)); 389 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.malloc_timer)); 390 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.memcpy_timer)); 391 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.free_timer)); 392 | 393 | printf ("SIZE EXEC KERNEL\tcudaMemcpy\tcudaMalloc\tcudaFree\n"); 394 | 395 | for (size = 1000; size <= 100000000; size *= 10) { 396 | buffer = (unsigned char *) malloc (size * sizeof (char)); 397 | if (buffer == NULL) { 398 | printf ("Memory allocation failed\n"); 399 | return -1; 400 | } 401 | 402 | parsha256_gpu (buffer, size, output); 403 | printf ("%-10d%f\t%f\t%f\t%f\n", size, 404 | cutGetTimerValue (chmeter.kernel_timer), 405 | cutGetTimerValue (chmeter.memcpy_timer), 406 | cutGetTimerValue (chmeter.malloc_timer), 407 | cutGetTimerValue (chmeter.free_timer)); 408 | 409 | 410 | free (buffer); 411 | } 412 | } 413 | --------------------------------------------------------------------------------
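
The README notes that the PARSHA-256 port probably has byte-ordering bugs. As an aid for chasing those down, the following is a minimal, standalone host-side sketch that only exercises the byte-order and padding macros already defined in src/common.h; it is not one of the project's source files, and the file name and build line are hypothetical. It illustrates that GET_UINT32_BE performs a big-endian load, GET_UINT32_BE_GPU actually performs a little-endian load despite its name, and LETOBE32 is a plain 32-bit byte swap.

/*
 * byteorder_check.c -- illustrative sanity check for the helpers in
 * src/common.h. Not part of the original repository; the file name and
 * the build line below are assumptions.
 *
 * Possible build: gcc -I./src byteorder_check.c -o byteorder_check
 */
#include <stdio.h>
#include "common.h"

int main (void)
{
    unsigned char bytes[4] = { 0x11, 0x22, 0x33, 0x44 };
    unsigned long n;
    int pad;

    /* GET_UINT32_BE treats bytes[0] as the most significant byte: 0x11223344 */
    GET_UINT32_BE (n, bytes, 0);
    printf ("GET_UINT32_BE     -> %08lx\n", n);

    /* GET_UINT32_BE_GPU treats bytes[0] as the least significant byte,
     * i.e. despite its name it performs a little-endian load: 0x44332211 */
    GET_UINT32_BE_GPU (n, bytes, 0);
    printf ("GET_UINT32_BE_GPU -> %08lx\n", n);

    /* LETOBE32 swaps the byte order of a 32-bit word: 0x11223344 -> 0x44332211 */
    n = LETOBE32 (0x11223344UL);
    printf ("LETOBE32          -> %08lx\n", n);

    /* padding_256 returns the number of pad bytes so that
     * message size + pad + 8 (length field) is a multiple of 64; the first
     * pad byte is later overwritten with 0x80 by the caller in sha1test.cu.
     * For a 3-byte message ("abc"): 3 + 53 + 8 = 64. */
    pad = padding_256 (3);
    printf ("padding_256(3)    -> %d\n", pad);

    return 0;
}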