├── docs ├── presentation.pdf └── SHA Message Digest Computation on GPU.pdf ├── README └── src ├── parsha256.h ├── Makefile ├── common.h ├── parsha256_kernel.cu ├── sha1_kernel.cu ├── sha1_cpu.cu ├── sha1test.cu └── parsha256test.cu /docs/presentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tadasv/gpgpu_sha/HEAD/docs/presentation.pdf -------------------------------------------------------------------------------- /docs/SHA Message Digest Computation on GPU.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tadasv/gpgpu_sha/HEAD/docs/SHA Message Digest Computation on GPU.pdf -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | This is a project I did for algorithms class in college. It is an implementation of SHA1 and PARSHA-256 algorithms 2 | on GPU. I believe that PARSHA-256 has some bugs due to byte ordering. 3 | 4 | Compilation: cd src && make 5 | Execution: 6 | ./sha1test - SHA-1 performance test 7 | ./parsha256test - PARSHA-256 performance test on GPU 8 | ./parsha256testemu - PARSHA-256 performance test on CPU (emulation mode) 9 | -------------------------------------------------------------------------------- /src/parsha256.h: -------------------------------------------------------------------------------- 1 | #ifndef __PARSHA256_H__ 2 | #define __PARSHA256_H__ 3 | 4 | /* 2 to the power of a */ 5 | #define POW2(a) ((unsigned)1 << (a)) 6 | #define DELTA(i) (POW2(i) * (2 * PARSHA256_BLOCK_SIZE - 2 * PARSHA256_HASH_SIZE - PARSHA256_IV_SIZE) - (PARSHA256_BLOCK_SIZE - 2 * PARSHA256_HASH_SIZE)) 7 | #define LAMDA(i) (POW2(i -1 ) * (2 * PARSHA256_BLOCK_SIZE - 2 * PARSHA256_HASH_SIZE - PARSHA256_IV_SIZE)) 8 | /* Hash function domain in bits */ 9 | #define PARSHA256_BLOCK_SIZE 768 10 | /* Hash function range in bits */ 11 | #define PARSHA256_HASH_SIZE 256 12 | /* Length of IV in bits */ 13 | #define PARSHA256_IV_SIZE 256 14 | /* Available processor tree */ 15 | #define TREE_SIZE 16 16 | 17 | #define PARSHA256_256BITSB 32 18 | #define PARSHA256_512BITSB 64 19 | #define PARSHA256_768BITSB 96 20 | 21 | #endif /* __PARSHA256_H__ */ 22 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | NVCC := /usr/local/cuda/bin/nvcc --ptxas-options=-v 2 | LIBS := -L/usr/local/cuda/sdk/lib -L/usr/local/cuda/lib 3 | INCS := -I/usr/local/cuda/include -I/usr/include/cuda -I./ -I/usr/local/cuda/sdk/common/inc 4 | CFLAGS := $(INCS) -c# -D_DEBUG 5 | LDFLAGS := $(LIBS) -lcuda -lcutil 6 | SHA1OBJS := sha1test.o sha1_cpu.o sha1_kernel.o 7 | PARSHA256OBJS := parsha256test.o parsha256_kernel.o 8 | PARSHA256EMUOBJS := parsha256testemu.o parsha256_kernelemu.o 9 | 10 | 11 | all: sha1test parsha256test parsha256testemu 12 | 13 | # SHA-1 benchmark test 14 | sha1test: $(SHA1OBJS) 15 | $(NVCC) $(LDFLAGS) $(SHA1OBJS) -o sha1test 16 | sha1test.o: sha1test.cu common.h 17 | $(NVCC) $(CFLAGS) sha1test.cu -o sha1test.o 18 | sha1_cpu.o: sha1_cpu.cu common.h 19 | $(NVCC) $(CFLAGS) sha1_cpu.cu -o sha1_cpu.o 20 | sha1_kernel.o: sha1_kernel.cu common.h 21 | $(NVCC) $(CFLAGS) sha1_kernel.cu -o sha1_kernel.o 22 | 23 | # PARSHA-256 benchmark test 24 | parsha256test: $(PARSHA256OBJS) 25 | $(NVCC) $(LDFLAGS) $(PARSHA256OBJS) -o parsha256test 26 | 
parsha256test.o: parsha256test.cu parsha256.h 27 | $(NVCC) $(CFLAGS) parsha256test.cu -o parsha256test.o 28 | parsha256_kernel.o: parsha256_kernel.cu parsha256.h 29 | $(NVCC) $(CFLAGS) parsha256_kernel.cu -o parsha256_kernel.o 30 | 31 | # PARSHA-256 benchmark test in emulation mode 32 | parsha256testemu: $(PARSHA256EMUOBJS) 33 | $(NVCC) -deviceemu $(LDFLAGS) $(PARSHA256EMUOBJS) -o parsha256testemu 34 | parsha256testemu.o: parsha256test.cu parsha256.h 35 | $(NVCC) -deviceemu $(CFLAGS) parsha256test.cu -o parsha256testemu.o 36 | parsha256_kernelemu.o: parsha256_kernel.cu parsha256.h 37 | $(NVCC) -deviceemu $(CFLAGS) parsha256_kernel.cu -o parsha256_kernelemu.o 38 | 39 | clean: 40 | rm -rf *~ 41 | rm -rf *.o 42 | rm -rf sha1test 43 | rm -rf parsha256test 44 | rm -rf parsha256testemu 45 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMMON_H__ 2 | #define __COMMON_H__ 3 | 4 | /* 5 | * 32-bit integer manipulation macros (big endian) 6 | */ 7 | #ifndef GET_UINT32_BE 8 | #define GET_UINT32_BE(n,b,i)\ 9 | {\ 10 | (n) = ( (unsigned long) (b)[(i) ] << 24 )\ 11 | | ( (unsigned long) (b)[(i) + 1] << 16 )\ 12 | | ( (unsigned long) (b)[(i) + 2] << 8 )\ 13 | | ( (unsigned long) (b)[(i) + 3] );\ 14 | } 15 | #endif 16 | 17 | #ifndef RETURN_UINT32_BE 18 | #define RETURN_UINT32_BE(b,i)\ 19 | (\ 20 | ( (unsigned long) (b)[(i) ] << 24 )\ 21 | | ( (unsigned long) (b)[(i) + 1] << 16 )\ 22 | | ( (unsigned long) (b)[(i) + 2] << 8 )\ 23 | | ( (unsigned long) (b)[(i) + 3] )\ 24 | ) 25 | #endif 26 | 27 | 28 | #ifndef GET_UINT32_BE_GPU 29 | #define GET_UINT32_BE_GPU(n,b,i)\ 30 | {\ 31 | (n) = ( (unsigned long) (b)[(i) + 3] << 24 )\ 32 | | ( (unsigned long) (b)[(i) + 2] << 16 )\ 33 | | ( (unsigned long) (b)[(i) + 1] << 8 )\ 34 | | ( (unsigned long) (b)[(i) ] );\ 35 | } 36 | #endif 37 | 38 | 39 | #ifndef PUT_UINT32_BE 40 | #define PUT_UINT32_BE(n,b,i)\ 41 | {\ 42 | (b)[(i) ] = (unsigned char) ( (n) >> 24 ); \ 43 | (b)[(i) + 1] = (unsigned char) ( (n) >> 16 ); \ 44 | (b)[(i) + 2] = (unsigned char) ( (n) >> 8 ); \ 45 | (b)[(i) + 3] = (unsigned char) ( (n) ); \ 46 | } 47 | #endif 48 | 49 | 50 | #define TRUNCLONG(x) (x) 51 | /* Circular rotation to the right for 32 bit word */ 52 | #define ROTATER(x,n) (((x) >> (n)) | ((x) << (32 - (n)))) 53 | /* Shift to the right */ 54 | #define SHIFTR(x,n) ((x) >> (n)) 55 | 56 | /* Little-Endian to Big-Endian for 32 bit word */ 57 | #define LETOBE32(i) (((i) & 0xff) << 24) + (((i) & 0xff00) << 8) + (((i) & 0xff0000) >> 8) + (((i) >> 24) & 0xff) 58 | /* Return number of 0 bytes to pad */ 59 | #define padding_256(len) (((len) & 0x3f) < 56) ? (56 - ((len) & 0x3f)) : (120 - ((len) & 0x3f)) 60 | 61 | 62 | #endif /* __COMMON_H__ */ 63 | 64 | -------------------------------------------------------------------------------- /src/parsha256_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "parsha256.h" 3 | #include 4 | 5 | #define ch_256(x, y, z) ((x & y) ^ (~x & z)) 6 | #define maj_256(x, y, z) ((x & y) ^ (x & z) ^ (y & z)) 7 | #define Sigma0_256(x) (ROTATER(x, 2) ^ ROTATER(x, 13) ^ ROTATER(x, 22)) 8 | #define Sigma1_256(x) (ROTATER(x, 6) ^ ROTATER(x, 11) ^ ROTATER(x, 25)) 9 | #define sigma0_256(x) (ROTATER(x, 7) ^ ROTATER(x, 18) ^ SHIFTR(x, 3)) 10 | #define sigma1_256(x) (ROTATER(x, 17) ^ ROTATER(x, 19) ^ SHIFTR(x, 10)) 11 | 12 | 13 | /* 14 | * Table of round constants. 
15 | * First 32 bits of the fractional parts of the cube roots of the first 64 primes 2..311 16 | */ 17 | __device__ static const unsigned int K256[] = { 18 | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 19 | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 20 | 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 21 | 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 22 | 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 23 | 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 24 | 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 25 | 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 26 | }; 27 | 28 | 29 | /* 30 | * Process one block 31 | */ 32 | __device__ void sha256 (unsigned char *input, unsigned char *output) 33 | { 34 | unsigned long W[64], a, b, c, d, e, f, g, h; 35 | unsigned long a1, b1, c1, d1, e1, f1, g1, h1; 36 | unsigned long t1, t2; 37 | int t; 38 | 39 | for (t = 0; t < 16; t++) 40 | /* Add 32 because first 8 words are intermediate hash state */ 41 | GET_UINT32_BE(W[t], input, t * 4 + 32); 42 | for (; t < 64; t++) 43 | W[t] = sigma1_256(W[t - 2]) + W[t - 7] + sigma0_256(W[t - 15]) + W[t - 16]; 44 | 45 | /* intermediate hash state */ 46 | GET_UINT32_BE(a, input, 0); 47 | GET_UINT32_BE(b, input, 4); 48 | GET_UINT32_BE(c, input, 8); 49 | GET_UINT32_BE(d, input, 12); 50 | GET_UINT32_BE(e, input, 16); 51 | GET_UINT32_BE(f, input, 20); 52 | GET_UINT32_BE(g, input, 24); 53 | GET_UINT32_BE(h, input, 28); 54 | 55 | a1 = a; 56 | b1 = b; 57 | c1 = c; 58 | d1 = d; 59 | e1 = e; 60 | f1 = f; 61 | g1 = g; 62 | h1 = h; 63 | 64 | for (t = 0; t < 64; t++) { 65 | t1 = h + Sigma1_256(e) + ch_256(e, f, g) + K256[t] + W[t]; 66 | t2 = Sigma0_256(a) + maj_256(a, b, c); 67 | h = g; 68 | g = f; 69 | f = e; 70 | e = d + t1; 71 | d = c; 72 | c = b; 73 | b = a; 74 | a = t1 + t2; 75 | } 76 | 77 | a = a + a1; 78 | b = b + b1; 79 | c = c + c1; 80 | d = d + d1; 81 | e = e + e1; 82 | f = f + f1; 83 | g = g + g1; 84 | h = h + h1; 85 | 86 | PUT_UINT32_BE(a, output, 0); 87 | PUT_UINT32_BE(b, output, 4); 88 | PUT_UINT32_BE(c, output, 8); 89 | PUT_UINT32_BE(d, output, 12); 90 | PUT_UINT32_BE(e, output, 16); 91 | PUT_UINT32_BE(f, output, 20); 92 | PUT_UINT32_BE(g, output, 24); 93 | PUT_UINT32_BE(h, output, 28); 94 | } 95 | 96 | 97 | __global__ void parsha256_kernel (unsigned char *input, unsigned char *output, unsigned long total_threads) 98 | { 99 | unsigned long thread_index = blockIdx.x * blockDim.x + threadIdx.x; 100 | 101 | if (thread_index > total_threads - 1) 102 | return; 103 | 104 | sha256(&input[thread_index * 96], &output[thread_index * 32]); 105 | } 106 | -------------------------------------------------------------------------------- /src/sha1_kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SHA-1 GPU implementation. 
3 | * 2008, Tadas Vilkeliskis 4 | */ 5 | #include 6 | #include "common.h" 7 | 8 | 9 | #define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n))) 10 | #define R(t) \ 11 | temp = extended[block_index + t - 3] ^ extended[block_index + t - 8] ^ \ 12 | extended[block_index + t - 14] ^ extended[block_index + t - 16]; \ 13 | extended[block_index + t] = S(temp,1); \ 14 | 15 | 16 | typedef struct { 17 | unsigned long state[5]; 18 | } sha1_gpu_context; 19 | 20 | /* 21 | * Process extended block. 22 | */ 23 | __device__ void sha1_gpu_process (sha1_gpu_context *ctx, unsigned long W[80]) 24 | { 25 | unsigned long A, B, C, D, E; 26 | A = ctx->state[0]; 27 | B = ctx->state[1]; 28 | C = ctx->state[2]; 29 | D = ctx->state[3]; 30 | E = ctx->state[4]; 31 | 32 | #define P(a,b,c,d,e,x) \ 33 | { \ 34 | e += S(a,5) + F(b,c,d) + K + x; b = S(b,30); \ 35 | } 36 | 37 | 38 | #define F(x,y,z) (z ^ (x & (y ^ z))) 39 | #define K 0x5A827999 40 | 41 | P( A, B, C, D, E, W[0] ); 42 | P( E, A, B, C, D, W[1] ); 43 | P( D, E, A, B, C, W[2] ); 44 | P( C, D, E, A, B, W[3] ); 45 | P( B, C, D, E, A, W[4] ); 46 | P( A, B, C, D, E, W[5] ); 47 | P( E, A, B, C, D, W[6] ); 48 | P( D, E, A, B, C, W[7] ); 49 | P( C, D, E, A, B, W[8] ); 50 | P( B, C, D, E, A, W[9] ); 51 | P( A, B, C, D, E, W[10] ); 52 | P( E, A, B, C, D, W[11] ); 53 | P( D, E, A, B, C, W[12] ); 54 | P( C, D, E, A, B, W[13] ); 55 | P( B, C, D, E, A, W[14] ); 56 | P( A, B, C, D, E, W[15] ); 57 | P( E, A, B, C, D, W[16] ); 58 | P( D, E, A, B, C, W[17] ); 59 | P( C, D, E, A, B, W[18] ); 60 | P( B, C, D, E, A, W[19] ); 61 | 62 | #undef K 63 | #undef F 64 | 65 | #define F(x,y,z) (x ^ y ^ z) 66 | #define K 0x6ED9EBA1 67 | 68 | P( A, B, C, D, E, W[20] ); 69 | P( E, A, B, C, D, W[21] ); 70 | P( D, E, A, B, C, W[22] ); 71 | P( C, D, E, A, B, W[23] ); 72 | P( B, C, D, E, A, W[24] ); 73 | P( A, B, C, D, E, W[25] ); 74 | P( E, A, B, C, D, W[26] ); 75 | P( D, E, A, B, C, W[27] ); 76 | P( C, D, E, A, B, W[28] ); 77 | P( B, C, D, E, A, W[29] ); 78 | P( A, B, C, D, E, W[30] ); 79 | P( E, A, B, C, D, W[31] ); 80 | P( D, E, A, B, C, W[32] ); 81 | P( C, D, E, A, B, W[33] ); 82 | P( B, C, D, E, A, W[34] ); 83 | P( A, B, C, D, E, W[35] ); 84 | P( E, A, B, C, D, W[36] ); 85 | P( D, E, A, B, C, W[37] ); 86 | P( C, D, E, A, B, W[38] ); 87 | P( B, C, D, E, A, W[39] ); 88 | 89 | #undef K 90 | #undef F 91 | 92 | #define F(x,y,z) ((x & y) | (z & (x | y))) 93 | #define K 0x8F1BBCDC 94 | 95 | P( A, B, C, D, E, W[40] ); 96 | P( E, A, B, C, D, W[41] ); 97 | P( D, E, A, B, C, W[42] ); 98 | P( C, D, E, A, B, W[43] ); 99 | P( B, C, D, E, A, W[44] ); 100 | P( A, B, C, D, E, W[45] ); 101 | P( E, A, B, C, D, W[46] ); 102 | P( D, E, A, B, C, W[47] ); 103 | P( C, D, E, A, B, W[48] ); 104 | P( B, C, D, E, A, W[49] ); 105 | P( A, B, C, D, E, W[50] ); 106 | P( E, A, B, C, D, W[51] ); 107 | P( D, E, A, B, C, W[52] ); 108 | P( C, D, E, A, B, W[53] ); 109 | P( B, C, D, E, A, W[54] ); 110 | P( A, B, C, D, E, W[55] ); 111 | P( E, A, B, C, D, W[56] ); 112 | P( D, E, A, B, C, W[57] ); 113 | P( C, D, E, A, B, W[58] ); 114 | P( B, C, D, E, A, W[59] ); 115 | 116 | #undef K 117 | #undef F 118 | 119 | #define F(x,y,z) (x ^ y ^ z) 120 | #define K 0xCA62C1D6 121 | 122 | P( A, B, C, D, E, W[60] ); 123 | P( E, A, B, C, D, W[61] ); 124 | P( D, E, A, B, C, W[62] ); 125 | P( C, D, E, A, B, W[63] ); 126 | P( B, C, D, E, A, W[64] ); 127 | P( A, B, C, D, E, W[65] ); 128 | P( E, A, B, C, D, W[66] ); 129 | P( D, E, A, B, C, W[67] ); 130 | P( C, D, E, A, B, W[68] ); 131 | P( B, C, D, E, A, W[69] ); 132 | P( A, B, C, D, E, W[70] ); 133 | 
P( E, A, B, C, D, W[71] ); 134 | P( D, E, A, B, C, W[72] ); 135 | P( C, D, E, A, B, W[73] ); 136 | P( B, C, D, E, A, W[74] ); 137 | P( A, B, C, D, E, W[75] ); 138 | P( E, A, B, C, D, W[76] ); 139 | P( D, E, A, B, C, W[77] ); 140 | P( C, D, E, A, B, W[78] ); 141 | P( B, C, D, E, A, W[79] ); 142 | 143 | #undef K 144 | #undef F 145 | 146 | ctx->state[0] += A; 147 | ctx->state[1] += B; 148 | ctx->state[2] += C; 149 | ctx->state[3] += D; 150 | ctx->state[4] += E; 151 | } 152 | 153 | __global__ void sha1_kernel_global (unsigned char *data, sha1_gpu_context *ctx, int total_threads, unsigned long *extended) 154 | { 155 | int thread_index = threadIdx.x + blockDim.x * blockIdx.x; 156 | int e_index = thread_index * 80; 157 | int block_index = thread_index * 64; 158 | unsigned long temp, t; 159 | 160 | if (thread_index > total_threads -1) 161 | return; 162 | 163 | /* 164 | * Extend 32 block byte block into 80 byte block. 165 | */ 166 | GET_UINT32_BE( extended[e_index ], data + block_index, 0 ); 167 | GET_UINT32_BE( extended[e_index + 1], data + block_index, 4 ); 168 | GET_UINT32_BE( extended[e_index + 2], data + block_index, 8 ); 169 | GET_UINT32_BE( extended[e_index + 3], data + block_index, 12 ); 170 | GET_UINT32_BE( extended[e_index + 4], data + block_index, 16 ); 171 | GET_UINT32_BE( extended[e_index + 5], data + block_index, 20 ); 172 | GET_UINT32_BE( extended[e_index + 6], data + block_index, 24 ); 173 | GET_UINT32_BE( extended[e_index + 7], data + block_index, 28 ); 174 | GET_UINT32_BE( extended[e_index + 8], data + block_index, 32 ); 175 | GET_UINT32_BE( extended[e_index + 9], data + block_index, 36 ); 176 | GET_UINT32_BE( extended[e_index +10], data + block_index, 40 ); 177 | GET_UINT32_BE( extended[e_index +11], data + block_index, 44 ); 178 | GET_UINT32_BE( extended[e_index +12], data + block_index, 48 ); 179 | GET_UINT32_BE( extended[e_index +13], data + block_index, 52 ); 180 | GET_UINT32_BE( extended[e_index +14], data + block_index, 56 ); 181 | GET_UINT32_BE( extended[e_index +15], data + block_index, 60 ); 182 | 183 | for (t = 16; t < 80; t++) { 184 | temp = extended[e_index + t - 3] ^ extended[e_index + t - 8] ^ 185 | extended[e_index + t - 14] ^ extended[e_index + t - 16]; 186 | extended[e_index + t] = S(temp,1); 187 | } 188 | 189 | /* Wait for the last thread and compute intermediate hash values of extended blocks */ 190 | __syncthreads(); 191 | if (thread_index == total_threads - 1) { 192 | for (t = 0; t < total_threads; t++) 193 | sha1_gpu_process (ctx, (unsigned long*)&extended[t * 80]); 194 | } 195 | } 196 | 197 | -------------------------------------------------------------------------------- /src/sha1_cpu.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SHA-1 CPU implementation 3 | */ 4 | #include 5 | #include 6 | #include "common.h" 7 | 8 | typedef struct { 9 | unsigned long total[2]; /* number of bytes processed */ 10 | unsigned long state[5]; /* intermediate digest state */ 11 | unsigned char buffer[64]; /* data block being processed */ 12 | } sha1_cpu_context; 13 | 14 | 15 | static const unsigned char sha1_padding[64] = 16 | { 17 | 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 21 | }; 22 | 23 | 24 | /* 25 | * Prepare SHA-1 for execution. 
26 | */ 27 | void sha1_cpu_starts(sha1_cpu_context* ctx) 28 | { 29 | ctx->total[0] = 0; 30 | ctx->total[1] = 0; 31 | ctx->state[0] = 0x67452301; 32 | ctx->state[1] = 0xEFCDAB89; 33 | ctx->state[2] = 0x98BADCFE; 34 | ctx->state[3] = 0x10325476; 35 | ctx->state[4] = 0xC3D2E1F0; 36 | } 37 | 38 | 39 | /* 40 | * Process one block of data. 41 | */ 42 | static void sha1_cpu_process(sha1_cpu_context *ctx, unsigned char data[64]) 43 | { 44 | unsigned long temp, W[16]={0,}, A, B, C, D, E; 45 | 46 | GET_UINT32_BE( W[ 0], data, 0 ); 47 | GET_UINT32_BE( W[ 1], data, 4 ); 48 | GET_UINT32_BE( W[ 2], data, 8 ); 49 | GET_UINT32_BE( W[ 3], data, 12 ); 50 | GET_UINT32_BE( W[ 4], data, 16 ); 51 | GET_UINT32_BE( W[ 5], data, 20 ); 52 | GET_UINT32_BE( W[ 6], data, 24 ); 53 | GET_UINT32_BE( W[ 7], data, 28 ); 54 | GET_UINT32_BE( W[ 8], data, 32 ); 55 | GET_UINT32_BE( W[ 9], data, 36 ); 56 | GET_UINT32_BE( W[10], data, 40 ); 57 | GET_UINT32_BE( W[11], data, 44 ); 58 | GET_UINT32_BE( W[12], data, 48 ); 59 | GET_UINT32_BE( W[13], data, 52 ); 60 | GET_UINT32_BE( W[14], data, 56 ); 61 | GET_UINT32_BE( W[15], data, 60 ); 62 | 63 | #define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n))) 64 | 65 | #define R(t) \ 66 | ( \ 67 | temp = W[(t - 3) & 0x0F] ^ W[(t - 8) & 0x0F] ^ \ 68 | W[(t - 14) & 0x0F] ^ W[ t & 0x0F], \ 69 | ( W[t & 0x0F] = S(temp,1) ) \ 70 | ) 71 | 72 | #define P(a,b,c,d,e,x) \ 73 | { \ 74 | e += S(a,5) + F(b,c,d) + K + x; b = S(b,30); \ 75 | } 76 | 77 | A = ctx->state[0]; 78 | B = ctx->state[1]; 79 | C = ctx->state[2]; 80 | D = ctx->state[3]; 81 | E = ctx->state[4]; 82 | 83 | #define F(x,y,z) (z ^ (x & (y ^ z))) 84 | #define K 0x5A827999 85 | 86 | P( A, B, C, D, E, W[0] ); 87 | P( E, A, B, C, D, W[1] ); 88 | P( D, E, A, B, C, W[2] ); 89 | P( C, D, E, A, B, W[3] ); 90 | P( B, C, D, E, A, W[4] ); 91 | P( A, B, C, D, E, W[5] ); 92 | P( E, A, B, C, D, W[6] ); 93 | P( D, E, A, B, C, W[7] ); 94 | P( C, D, E, A, B, W[8] ); 95 | P( B, C, D, E, A, W[9] ); 96 | P( A, B, C, D, E, W[10] ); 97 | P( E, A, B, C, D, W[11] ); 98 | P( D, E, A, B, C, W[12] ); 99 | P( C, D, E, A, B, W[13] ); 100 | P( B, C, D, E, A, W[14] ); 101 | P( A, B, C, D, E, W[15] ); 102 | P( E, A, B, C, D, R(16) ); 103 | P( D, E, A, B, C, R(17) ); 104 | P( C, D, E, A, B, R(18) ); 105 | P( B, C, D, E, A, R(19) ); 106 | 107 | #undef K 108 | #undef F 109 | 110 | #define F(x,y,z) (x ^ y ^ z) 111 | #define K 0x6ED9EBA1 112 | 113 | P( A, B, C, D, E, R(20) ); 114 | P( E, A, B, C, D, R(21) ); 115 | P( D, E, A, B, C, R(22) ); 116 | P( C, D, E, A, B, R(23) ); 117 | P( B, C, D, E, A, R(24) ); 118 | P( A, B, C, D, E, R(25) ); 119 | P( E, A, B, C, D, R(26) ); 120 | P( D, E, A, B, C, R(27) ); 121 | P( C, D, E, A, B, R(28) ); 122 | P( B, C, D, E, A, R(29) ); 123 | P( A, B, C, D, E, R(30) ); 124 | P( E, A, B, C, D, R(31) ); 125 | P( D, E, A, B, C, R(32) ); 126 | P( C, D, E, A, B, R(33) ); 127 | P( B, C, D, E, A, R(34) ); 128 | P( A, B, C, D, E, R(35) ); 129 | P( E, A, B, C, D, R(36) ); 130 | P( D, E, A, B, C, R(37) ); 131 | P( C, D, E, A, B, R(38) ); 132 | P( B, C, D, E, A, R(39) ); 133 | 134 | #undef K 135 | #undef F 136 | 137 | #define F(x,y,z) ((x & y) | (z & (x | y))) 138 | #define K 0x8F1BBCDC 139 | 140 | P( A, B, C, D, E, R(40) ); 141 | P( E, A, B, C, D, R(41) ); 142 | P( D, E, A, B, C, R(42) ); 143 | P( C, D, E, A, B, R(43) ); 144 | P( B, C, D, E, A, R(44) ); 145 | P( A, B, C, D, E, R(45) ); 146 | P( E, A, B, C, D, R(46) ); 147 | P( D, E, A, B, C, R(47) ); 148 | P( C, D, E, A, B, R(48) ); 149 | P( B, C, D, E, A, R(49) ); 150 | P( A, B, C, D, E, R(50) ); 151 
| P( E, A, B, C, D, R(51) ); 152 | P( D, E, A, B, C, R(52) ); 153 | P( C, D, E, A, B, R(53) ); 154 | P( B, C, D, E, A, R(54) ); 155 | P( A, B, C, D, E, R(55) ); 156 | P( E, A, B, C, D, R(56) ); 157 | P( D, E, A, B, C, R(57) ); 158 | P( C, D, E, A, B, R(58) ); 159 | P( B, C, D, E, A, R(59) ); 160 | 161 | #undef K 162 | #undef F 163 | 164 | #define F(x,y,z) (x ^ y ^ z) 165 | #define K 0xCA62C1D6 166 | 167 | P( A, B, C, D, E, R(60) ); 168 | P( E, A, B, C, D, R(61) ); 169 | P( D, E, A, B, C, R(62) ); 170 | P( C, D, E, A, B, R(63) ); 171 | P( B, C, D, E, A, R(64) ); 172 | P( A, B, C, D, E, R(65) ); 173 | P( E, A, B, C, D, R(66) ); 174 | P( D, E, A, B, C, R(67) ); 175 | P( C, D, E, A, B, R(68) ); 176 | P( B, C, D, E, A, R(69) ); 177 | P( A, B, C, D, E, R(70) ); 178 | P( E, A, B, C, D, R(71) ); 179 | P( D, E, A, B, C, R(72) ); 180 | P( C, D, E, A, B, R(73) ); 181 | P( B, C, D, E, A, R(74) ); 182 | P( A, B, C, D, E, R(75) ); 183 | P( E, A, B, C, D, R(76) ); 184 | P( D, E, A, B, C, R(77) ); 185 | P( C, D, E, A, B, R(78) ); 186 | P( B, C, D, E, A, R(79) ); 187 | 188 | #undef K 189 | #undef F 190 | 191 | ctx->state[0] += A; 192 | ctx->state[1] += B; 193 | ctx->state[2] += C; 194 | ctx->state[3] += D; 195 | ctx->state[4] += E; 196 | } 197 | 198 | 199 | /* 200 | * Splits input message into blocks and processes them one by one. Also 201 | * checks how many 0 need to be padded and processes the last, padded, block. 202 | */ 203 | void sha1_cpu_update(sha1_cpu_context *ctx, unsigned char *input, int ilen) 204 | { 205 | int fill; 206 | unsigned long left; 207 | 208 | if ( ilen <= 0 ) 209 | return; 210 | 211 | left = ctx->total[0] & 0x3F; 212 | fill = 64 - left; 213 | 214 | ctx->total[0] += ilen; 215 | ctx->total[0] &= 0xFFFFFFFF; 216 | 217 | if (ctx->total[0] < (unsigned long) ilen) 218 | ctx->total[1]++; 219 | 220 | if ( left && ilen >= fill ) { 221 | memcpy((void *) (ctx->buffer + left), (void *) input, fill); 222 | sha1_cpu_process(ctx, ctx->buffer); 223 | input += fill; 224 | ilen -= fill; 225 | left = 0; 226 | } 227 | 228 | while ( ilen >= 64 ) { 229 | sha1_cpu_process(ctx, input); 230 | input += 64; 231 | ilen -= 64; 232 | } 233 | 234 | if ( ilen > 0 ) { 235 | memcpy( (void *) (ctx->buffer + left), (void *) input, ilen ); 236 | } 237 | } 238 | 239 | 240 | /* 241 | * Process padded block and return hash to user. 242 | */ 243 | void sha1_cpu_finish(sha1_cpu_context *ctx, unsigned char *output) 244 | { 245 | unsigned long last, padn; 246 | unsigned long high, low; 247 | unsigned char msglen[8]; 248 | 249 | 250 | high = (ctx->total[0] >> 29) | (ctx->total[1] << 3); 251 | low = (ctx->total[0] << 3); 252 | 253 | PUT_UINT32_BE(high, msglen, 0); 254 | PUT_UINT32_BE(low, msglen, 4); 255 | 256 | last = ctx->total[0] & 0x3F; 257 | padn = (last < 56 ) ? 
( 56 - last ) : ( 120 - last); 258 | 259 | sha1_cpu_update(ctx, (unsigned char *) sha1_padding, padn); 260 | sha1_cpu_update(ctx, msglen, 8); 261 | 262 | PUT_UINT32_BE(ctx->state[0], output, 0); 263 | PUT_UINT32_BE(ctx->state[1], output, 4); 264 | PUT_UINT32_BE(ctx->state[2], output, 8); 265 | PUT_UINT32_BE(ctx->state[3], output, 12); 266 | PUT_UINT32_BE(ctx->state[4], output, 16); 267 | } 268 | 269 | 270 | /* 271 | * Execute SHA-1 272 | */ 273 | void sha1_cpu(unsigned char *input, int ilen, unsigned char *output) { 274 | sha1_cpu_context ctx; 275 | 276 | sha1_cpu_starts( &ctx ); 277 | sha1_cpu_update( &ctx, input, ilen ); 278 | sha1_cpu_finish( &ctx, output ); 279 | 280 | memset( &ctx, 0, sizeof( sha1_cpu_context ) ); 281 | } 282 | 283 | -------------------------------------------------------------------------------- /src/sha1test.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SHA-1 benchmark program. Calculates execution time of SHA-1 on CPU and GPU. 3 | * Also includes function sha1_gpu_global() which prepares SHA-1 to be executed 4 | * on GPU. 5 | * 6 | * 2008, Tadas Vilkeliskis 7 | */ 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "common.h" 13 | 14 | #define MAX_THREADS_PER_BLOCK 128 15 | 16 | typedef struct { 17 | unsigned long state[5]; 18 | } sha1_gpu_context; 19 | 20 | 21 | typedef struct { 22 | unsigned const char *data; 23 | unsigned const char *hash; 24 | } testvector; 25 | 26 | 27 | typedef struct { 28 | unsigned int kernel_timer; /* time spent in kernel */ 29 | unsigned int malloc_timer; /* how much time we spend allocating memory */ 30 | unsigned int memcpy_timer; /* how much time we spend copying from host to device */ 31 | unsigned int free_timer; /* how much time we spend releasing memory */ 32 | } chronometer; 33 | 34 | /* timers used to check performance */ 35 | chronometer chmeter = {0, 0, 0, 0}; 36 | 37 | extern void sha1_cpu (unsigned char *input, int ilen, unsigned char *output); 38 | extern __global__ void sha1_kernel_global (unsigned char *data, sha1_gpu_context *ctx, int total_threads, unsigned long *extended); 39 | 40 | /* 41 | * Run sha1 kernel on GPU 42 | * input - message 43 | * size - message size 44 | * output - buffer to store hash value 45 | * proc - maximum threads per block 46 | */ 47 | void sha1_gpu_global (unsigned char *input, unsigned long size, unsigned char *output, int proc) 48 | { 49 | int total_threads; /* Total number of threads in the grid */ 50 | int blocks_per_grid; /* Number of blocks in the grid */ 51 | int threads_per_block; /* Number of threads in a block */ 52 | int pad, size_be; /* Number of zeros to pad, message size in big-enadian. 
*/ 53 | int total_datablocks; /* Total number of blocks message is split into */ 54 | int i, k; /* Temporary variables */ 55 | unsigned char *d_message; /* Input message on the device */ 56 | unsigned long *d_extended; /* Extended blocks on the device */ 57 | sha1_gpu_context ctx, *d_ctx; /* Intermediate hash states */ 58 | 59 | /* Initialization vector for SHA-1 */ 60 | ctx.state[0] = 0x67452301; 61 | ctx.state[1] = 0xEFCDAB89; 62 | ctx.state[2] = 0x98BADCFE; 63 | ctx.state[3] = 0x10325476; 64 | ctx.state[4] = 0xC3D2E1F0; 65 | 66 | pad = padding_256 (size); 67 | threads_per_block = proc; 68 | blocks_per_grid = 1; 69 | /* How many blocks in the message */ 70 | total_datablocks = (size + pad + 8) / 64; 71 | 72 | if (total_datablocks > threads_per_block) 73 | total_threads = threads_per_block; 74 | else 75 | total_threads = total_datablocks; 76 | 77 | size_be = LETOBE32 (size * 8); 78 | 79 | /* Allocate enough memory on the device */ 80 | CUT_SAFE_CALL (cutResetTimer (chmeter.malloc_timer)); 81 | CUT_SAFE_CALL (cutStartTimer (chmeter.malloc_timer)); 82 | cudaMalloc ((void**)&d_extended, proc * 80 * sizeof(unsigned long)); 83 | CUT_CHECK_ERROR ("d_extended malloc failed"); 84 | cudaMalloc ((void**)&d_message, size + pad + 8); 85 | CUT_CHECK_ERROR ("d_message malloc failed"); 86 | cudaMalloc ((void**)&d_ctx, sizeof (sha1_gpu_context)); 87 | CUT_CHECK_ERROR ("d_ctx malloc failed"); 88 | CUT_SAFE_CALL (cutStopTimer (chmeter.malloc_timer)); 89 | CUT_SAFE_CALL (cutResetTimer (chmeter.memcpy_timer)); 90 | 91 | /* 92 | * Copy the data from host to device and perform padding 93 | */ 94 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 95 | cudaMemcpy (d_ctx, &ctx, sizeof (sha1_gpu_context), cudaMemcpyHostToDevice); 96 | cudaMemcpy (d_message, input, size, cudaMemcpyHostToDevice); 97 | cudaMemset (d_message + size, 0x80, 1); 98 | cudaMemset (d_message + size + 1, 0, pad + 7); 99 | cudaMemcpy (d_message + size + pad + 4, &size_be, 4, cudaMemcpyHostToDevice); 100 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 101 | 102 | /* 103 | * Run the algorithm 104 | */ 105 | i = 0; 106 | k = total_datablocks / total_threads; 107 | CUT_SAFE_CALL (cutResetTimer (chmeter.kernel_timer)); 108 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 109 | if (k - 1 > 0) { 110 | /* 111 | * Kernel is executed multiple times and only one block in the grid is used. 112 | * Since thread synchronization is allowed only within a block. 
113 | */ 114 | for (i = 0; i < k; i++) { 115 | sha1_kernel_global <<<blocks_per_grid, threads_per_block>>>(d_message + threads_per_block * i * 64, d_ctx, threads_per_block, d_extended); 116 | CUT_CHECK_ERROR ("Kernel execution failed"); 117 | /* 118 | * Here I do not perform thread synchronization 119 | * since threads are synchronized in the kernel 120 | */ 121 | } 122 | } 123 | threads_per_block = total_datablocks - (i * total_threads); 124 | sha1_kernel_global <<<blocks_per_grid, threads_per_block>>>(d_message + total_threads * i * 64, d_ctx, threads_per_block, d_extended); 125 | CUT_CHECK_ERROR ("Kernel execution failed"); 126 | 127 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 128 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 129 | cudaMemcpy (&ctx, d_ctx, sizeof(sha1_gpu_context), cudaMemcpyDeviceToHost); 130 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 131 | 132 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 133 | /* Put the hash value in the user's buffer */ 134 | PUT_UINT32_BE( ctx.state[0], output, 0 ); 135 | PUT_UINT32_BE( ctx.state[1], output, 4 ); 136 | PUT_UINT32_BE( ctx.state[2], output, 8 ); 137 | PUT_UINT32_BE( ctx.state[3], output, 12 ); 138 | PUT_UINT32_BE( ctx.state[4], output, 16 ); 139 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 140 | 141 | CUT_SAFE_CALL (cutResetTimer (chmeter.free_timer)); 142 | CUT_SAFE_CALL (cutStartTimer (chmeter.free_timer)); 143 | cudaFree (d_message); 144 | cudaFree (d_ctx); 145 | cudaFree (d_extended); 146 | CUT_SAFE_CALL (cutStopTimer (chmeter.free_timer)); 147 | } 148 | 149 | 150 | int main(int argc, char *argv[]) 151 | { 152 | testvector tv1 = { 153 | (unsigned char *) "abc", 154 | (unsigned char *) "\xa9\x99\x3e\x36\x47\x06\x81\x6a\xba\x3e\x25\x71\x78\x50\xc2\x6c\x9c\xd0\xd8\x9d" 155 | }; 156 | testvector tv2 = { 157 | (unsigned char *) "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 158 | (unsigned char *) "\x84\x98\x3e\x44\x1c\x3b\xd2\x6e\xba\xae\x4a\xa1\xf9\x51\x29\xe5\xe5\x46\x70\xf1" 159 | }; 160 | unsigned char hash[20]; 161 | unsigned char *data = NULL; 162 | int i; 163 | int max_threads_per_block = MAX_THREADS_PER_BLOCK; 164 | 165 | printf ("===================================\n"); 166 | printf ("SHA-1 HASH ALGORITHM BENCHMARK TEST\n"); 167 | printf ("===================================\n"); 168 | 169 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.kernel_timer)); 170 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.malloc_timer)); 171 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.memcpy_timer)); 172 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.free_timer)); 173 | 174 | printf ("\nTesting algorithm correctness...\n"); 175 | 176 | sha1_cpu ((unsigned char*)tv1.data, strlen((const char*)tv1.data), hash); 177 | if (memcmp (hash, tv1.hash, 20) == 0) printf ("CPU TEST 1 PASSED\n"); 178 | else printf ("CPU TEST 1 FAILED\n"); 179 | 180 | sha1_gpu_global ((unsigned char*)tv1.data, strlen((const char*)tv1.data), hash, max_threads_per_block); 181 | if (memcmp (hash, tv1.hash, 20) == 0) printf ("GPU TEST 1 PASSED\n"); 182 | else printf ("GPU TEST 1 FAILED\n"); 183 | 184 | sha1_cpu ((unsigned char*)tv2.data, strlen((const char*)tv2.data), hash); 185 | if (memcmp (hash, tv2.hash, 20) == 0) printf ("CPU TEST 2 PASSED\n"); 186 | else printf ("CPU TEST 2 FAILED\n"); 187 | 188 | sha1_gpu_global ((unsigned char*)tv2.data, strlen((const char*)tv2.data), hash, max_threads_per_block); 189 | if (memcmp (hash, tv2.hash, 20) == 0) printf ("GPU TEST 2 PASSED\n"); 190 | else printf ("GPU TEST 2 FAILED\n"); 191 | 192 | printf
("Done.\n\n"); 193 | printf ("\tSIZE EXEC KERNEL\tcudaMemcpy\tcudaMalloc\tcudaFree\n"); 194 | 195 | for (i = 1000; i < 100000000; i = i * 10) { 196 | data = (unsigned char *) malloc (i); 197 | if (data == NULL) { 198 | printf ("ERROR: Insufficient memory on host\n"); 199 | return -1; 200 | } 201 | 202 | CUT_SAFE_CALL (cutResetTimer (chmeter.kernel_timer)); 203 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 204 | sha1_cpu (data, i, hash); 205 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 206 | printf ("CPU\t%-10d%f\n", i, cutGetTimerValue (chmeter.kernel_timer)); 207 | 208 | CUT_SAFE_CALL (cutResetTimer (chmeter.kernel_timer)); 209 | CUT_SAFE_CALL (cutResetTimer (chmeter.malloc_timer)); 210 | CUT_SAFE_CALL (cutResetTimer (chmeter.memcpy_timer)); 211 | CUT_SAFE_CALL (cutResetTimer (chmeter.free_timer)); 212 | memset (hash, 0, 20); 213 | 214 | sha1_gpu_global (data, i, hash, max_threads_per_block); 215 | printf ("GPU\t%-10d%f\t%f\t%f\t%f\n", i, 216 | cutGetTimerValue (chmeter.kernel_timer), 217 | cutGetTimerValue (chmeter.memcpy_timer), 218 | cutGetTimerValue (chmeter.malloc_timer), 219 | cutGetTimerValue (chmeter.free_timer)); 220 | free (data); 221 | } 222 | 223 | return 0; 224 | } 225 | -------------------------------------------------------------------------------- /src/parsha256test.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * PARSHA-256 benchmark program. Calculates execution time of PARSHA-256 on CPU and GPU. 3 | * Also includes function parsha256_gpu which prepares PARSHA-256 to executes on GPU and 4 | * executes it. 5 | * 6 | * 2008, Tadas Vilkeliskis 7 | */ 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "parsha256.h" 13 | 14 | typedef struct { 15 | unsigned int kernel_timer; /* execution time of the kernel */ 16 | unsigned int malloc_timer; /* time spent on memory allocation */ 17 | unsigned int memcpy_timer; /* time spent on copying memory from hsot to device and vise versa */ 18 | unsigned int free_timer; /* time spent on memory deallocation */ 19 | } chronometer; 20 | 21 | chronometer chmeter = {0, 0, 0, 0}; 22 | 23 | extern __global__ void parsha256_kernel (unsigned char *input, unsigned char *output, unsigned long total_threads); 24 | 25 | void parsha256_gpu (unsigned char *input, unsigned long size, unsigned char *output) 26 | { 27 | unsigned long t; /* effective tree height */ 28 | unsigned char *d_input; /* input buffer on device */ 29 | unsigned char *d_output; /* intermediate hash states */ 30 | int total_threads; /* Total number of threads in the grid */ 31 | int threads_per_block = 128; /* Maximum number of threads per block */ 32 | int total_blocks; /* Total blocks in the grid */ 33 | unsigned char *buffer_ptr; /* Pointer to input buffer */ 34 | unsigned long bytes_read = 0; /* Bytes read from the input */ 35 | unsigned long q, r, b, s, k; 36 | int l1, K1, L1; 37 | /* 38 | * Initialization vector. Length 256 bits. Since reference machine is using 64 bit words 39 | * char array was used instead of word array. I was experiencing some problems with words. 
40 | */ 41 | const unsigned char IV[32] = {0x67, 0xe6, 0x09, 0x6a, 42 | 0x85, 0xae, 0x67, 0xbb, 43 | 0x72, 0xf3, 0x6e, 0x3c, 44 | 0x3a, 0xf5, 0x4f, 0xa5, 45 | 0x7f, 0x52, 0x0e, 0x51, 46 | 0x8c, 0x68, 0x05, 0x9b, 47 | 0xab, 0xd9, 0x83, 0x1f, 48 | 0x19, 0xcd, 0xe0, 0x5b}; 49 | /* Few temporary variables */ 50 | int i, j; 51 | unsigned long tmp1, tmp2; 52 | 53 | size = size * 8; /* bytes to bits */ 54 | 55 | if (size <= 160 * 8) { 56 | /* 57 | * if L <= delta0 = n - l, then return h(h(x||0^(n-l-L)||IV)||bin_(n-m)(L)) 58 | * */ 59 | printf ("Not implemented for size less than %d bits\n", 160 * 8); 60 | return; 61 | } 62 | 63 | /* BEGIN INITIALIZATION */ 64 | /* Determine effective tree height */ 65 | if (size >= DELTA(TREE_SIZE)) 66 | t = TREE_SIZE; 67 | else { 68 | for (i = TREE_SIZE - 1; i >= 1; i--) 69 | if (DELTA(i) <= size && size < DELTA(i + 1)) { 70 | t = i; 71 | i = 0; /* break the loop */ 72 | } 73 | } 74 | 75 | /* Find other parameters needed to complete computation */ 76 | q = r = 0; 77 | if (size > DELTA(t)) { 78 | q = (size - DELTA(t)) / LAMDA(t); 79 | r = (size - DELTA(t)) % LAMDA(t); 80 | if (r == 0) { 81 | q--; 82 | r = LAMDA(t); 83 | } 84 | } 85 | 86 | b = r / (2 * PARSHA256_BLOCK_SIZE - 2 * PARSHA256_HASH_SIZE - PARSHA256_IV_SIZE); 87 | if (r % (2 * PARSHA256_BLOCK_SIZE - 2 * PARSHA256_HASH_SIZE - PARSHA256_IV_SIZE)) 88 | b++; 89 | 90 | /* Total number of processors for the first round */ 91 | total_threads = POW2(t); 92 | #if 0 93 | #ifdef _DEBUG 94 | printf ("tree size: %d\n", t); 95 | printf ("total threads: %d\n", total_threads); 96 | printf ("q, r, b: %d %d %d\n", q, r, b); 97 | #endif 98 | #endif 99 | CUT_SAFE_CALL (cutResetTimer (chmeter.malloc_timer)); 100 | CUT_SAFE_CALL (cutStartTimer (chmeter.malloc_timer)); 101 | /* Allocate enough memory on the device */ 102 | cudaMalloc ((void**)&d_input, total_threads * PARSHA256_768BITSB); 103 | CUT_CHECK_ERROR ("Memory allocation failed"); 104 | cudaMalloc ((void**)&d_output, total_threads * PARSHA256_256BITSB); 105 | CUT_CHECK_ERROR ("Memory allocation failed"); 106 | CUT_SAFE_CALL (cutStopTimer (chmeter.malloc_timer)); 107 | 108 | /* END INITIALIZATION */ 109 | 110 | /* BEGIN FIRST ROUND */ 111 | buffer_ptr = input; 112 | CUT_SAFE_CALL (cutResetTimer (chmeter.memcpy_timer)); 113 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 114 | for (i = 0; i < total_threads; i++) { 115 | /* Copy 512 bits */ 116 | cudaMemcpy(d_input + i * PARSHA256_768BITSB, buffer_ptr, PARSHA256_512BITSB, 117 | cudaMemcpyHostToDevice); 118 | CUT_CHECK_ERROR ("Memory copy failed"); 119 | /* Add 256 bits of IV */ 120 | cudaMemcpy(d_input + i * PARSHA256_768BITSB + PARSHA256_512BITSB, 121 | (unsigned char *)&IV, PARSHA256_256BITSB, cudaMemcpyHostToDevice); 122 | CUT_CHECK_ERROR ("Memory copy failed"); 123 | buffer_ptr += PARSHA256_512BITSB; 124 | bytes_read += PARSHA256_512BITSB; 125 | } 126 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 127 | 128 | /* execute kernel */ 129 | total_blocks = total_threads / threads_per_block + (total_threads % threads_per_block == 0 ? 
0 : 1); 130 | #if 0 131 | #ifdef _DEBUG 132 | printf ("bytes read: %d\n", bytes_read); 133 | printf ("total blocks: %d\n", total_blocks); 134 | printf ("total_threads: %d\n", total_threads); 135 | printf ("threads_per_block: %d\n", threads_per_block); 136 | #endif 137 | #endif 138 | CUT_SAFE_CALL (cutResetTimer (chmeter.kernel_timer)); 139 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 140 | parsha256_kernel <<>> (d_input, d_output, total_threads); 141 | CUT_CHECK_ERROR ("Kernel execution failed"); 142 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 143 | 144 | /* END FIRST ROUND */ 145 | /* BEGIN STEADY STATE */ 146 | tmp2 = q + 1; 147 | for (i = 2; i <= tmp2; i++) { 148 | tmp1 = POW2 (t - 1) - 1; 149 | 150 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 151 | 152 | for (j = 0; j <= tmp1; j++) { 153 | /* Copy intermediate hash states */ 154 | cudaMemcpy (d_input + j * PARSHA256_768BITSB, d_output + j * PARSHA256_512BITSB, 155 | PARSHA256_256BITSB, cudaMemcpyDeviceToDevice); 156 | CUT_CHECK_ERROR ("Memory copy failed"); 157 | cudaMemcpy (d_input + j * PARSHA256_768BITSB + PARSHA256_256BITSB, 158 | d_output + j * PARSHA256_512BITSB + PARSHA256_256BITSB, 159 | PARSHA256_256BITSB, 160 | cudaMemcpyDeviceToDevice); 161 | CUT_CHECK_ERROR ("Memory copy failed"); 162 | /* Copy 256 bits from input message */ 163 | cudaMemcpy (d_input + j * PARSHA256_768BITSB + PARSHA256_512BITSB, buffer_ptr, 164 | PARSHA256_256BITSB, cudaMemcpyHostToDevice); 165 | buffer_ptr += PARSHA256_256BITSB; 166 | bytes_read += PARSHA256_256BITSB; 167 | } 168 | 169 | tmp1 = POW2 (t) - 1; 170 | for (j = POW2 (t - 1); j <= tmp1; j++) { 171 | /* Copy 512 bits */ 172 | cudaMemcpy(d_input + j * PARSHA256_768BITSB, buffer_ptr, PARSHA256_512BITSB, 173 | cudaMemcpyHostToDevice); 174 | CUT_CHECK_ERROR ("Memory copy failed"); 175 | /* Add 256 bits of IV */ 176 | cudaMemcpy(d_input + j * PARSHA256_768BITSB + PARSHA256_512BITSB, 177 | (unsigned char *)&IV, PARSHA256_256BITSB, cudaMemcpyHostToDevice); 178 | CUT_CHECK_ERROR ("Memory copy failed"); 179 | buffer_ptr += PARSHA256_512BITSB; 180 | bytes_read += PARSHA256_512BITSB; 181 | } 182 | 183 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 184 | 185 | /* execute kernel */ 186 | total_blocks = total_threads / threads_per_block + (total_threads % threads_per_block == 0 ? 
0 : 1); 187 | #if 0 188 | #ifdef _DEBUG 189 | printf ("bytes read (steady state): %d\n", bytes_read); 190 | printf ("total blocks: %d\n", total_blocks); 191 | printf ("total_threads: %d\n", total_threads); 192 | printf ("threads_per_block: %d\n", threads_per_block); 193 | #endif 194 | #endif 195 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 196 | parsha256_kernel <<>> (d_input, d_output, total_threads); 197 | CUT_CHECK_ERROR ("Kernel execution failed"); 198 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 199 | } 200 | 201 | tmp1 = POW2(t - 1) - 1; 202 | total_threads = POW2(t - 1) + b - 1; 203 | 204 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 205 | 206 | for (i = 0; i <= tmp1; i++) { 207 | /* Copy intermediate hash states */ 208 | cudaMemcpy (d_input + i * PARSHA256_768BITSB, d_output + i * PARSHA256_512BITSB, 209 | PARSHA256_256BITSB, cudaMemcpyDeviceToDevice); 210 | CUT_CHECK_ERROR ("Memory copy failed"); 211 | cudaMemcpy (d_input + i * PARSHA256_768BITSB + PARSHA256_256BITSB, 212 | d_output + i * PARSHA256_512BITSB + PARSHA256_256BITSB, 213 | PARSHA256_256BITSB, 214 | cudaMemcpyDeviceToDevice); 215 | CUT_CHECK_ERROR ("Memory copy failed"); 216 | /* Copy 256 bits from input message */ 217 | cudaMemcpy (d_input + i * PARSHA256_768BITSB + PARSHA256_512BITSB, buffer_ptr, 218 | PARSHA256_256BITSB, cudaMemcpyHostToDevice); 219 | buffer_ptr += PARSHA256_256BITSB; 220 | bytes_read += PARSHA256_256BITSB; 221 | } 222 | 223 | for (i = POW2(t - 1); i <= total_threads; i++) { 224 | /* Copy 512 bits */ 225 | cudaMemcpy(d_input + i * PARSHA256_768BITSB, buffer_ptr, PARSHA256_512BITSB, 226 | cudaMemcpyHostToDevice); 227 | CUT_CHECK_ERROR ("Memory copy failed"); 228 | /* Add 256 bits of IV */ 229 | cudaMemcpy(d_input + i * PARSHA256_768BITSB + PARSHA256_512BITSB, 230 | (unsigned char *)&IV, PARSHA256_256BITSB, cudaMemcpyHostToDevice); 231 | CUT_CHECK_ERROR ("Memory copy failed"); 232 | buffer_ptr += PARSHA256_512BITSB; 233 | bytes_read += PARSHA256_512BITSB; 234 | } 235 | 236 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 237 | 238 | /* execute kernel */ 239 | total_blocks = total_threads / threads_per_block + (total_threads % threads_per_block == 0 ? 
0 : 1); 240 | #if 0 241 | #ifdef _DEBUG 242 | printf ("bytes read (end game): %d\n", bytes_read); 243 | printf ("total blocks: %d\n", total_blocks); 244 | printf ("total_threads: %d\n", total_threads); 245 | printf ("threads_per_block: %d\n", threads_per_block); 246 | #endif 247 | #endif 248 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 249 | parsha256_kernel <<>> (d_input, d_output, total_threads); 250 | CUT_CHECK_ERROR ("Kernel execution failed"); 251 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 252 | 253 | /* BEGIN FLUSHING */ 254 | tmp1 = q + t + 1; 255 | size = size / 8; /* back to bytes */ 256 | for (i = q + 3; i <= tmp1; i++) { 257 | s = q + t + 2 - i; 258 | k = (b - 1 + POW2 (t - s - 1)) / POW2 (t - s); 259 | l1 = (b - 1 + POW2 (t - s)) / POW2 (t - s); 260 | K1 = POW2 (s - 1) + k; 261 | L1 = POW2 (s - 1) + l1; 262 | 263 | /* zero out the buffer for padding I guess */ 264 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 265 | cudaMemset(d_input, 0, K1 * PARSHA256_256BITSB); 266 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 267 | tmp2 = K1 - 1; 268 | 269 | if (size - bytes_read >= K1 * PARSHA256_256BITSB) 270 | bytes_read += (K1 * PARSHA256_256BITSB); 271 | else 272 | bytes_read += (size - bytes_read); 273 | 274 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 275 | for (j = 0; j <= tmp2; j++) { 276 | /* Copy intermediate hash states */ 277 | cudaMemcpy (d_input + j * PARSHA256_768BITSB, d_output + j * PARSHA256_512BITSB, 278 | PARSHA256_256BITSB, cudaMemcpyDeviceToDevice); 279 | CUT_CHECK_ERROR ("Memory copy failed"); 280 | cudaMemcpy (d_input + j * PARSHA256_768BITSB + PARSHA256_256BITSB, 281 | d_output + j * PARSHA256_512BITSB + PARSHA256_256BITSB, 282 | PARSHA256_256BITSB, 283 | cudaMemcpyDeviceToDevice); 284 | CUT_CHECK_ERROR ("Memory copy failed"); 285 | /* Copy 256 bits from input message */ 286 | cudaMemcpy (d_input + j * PARSHA256_768BITSB + PARSHA256_512BITSB, buffer_ptr, 287 | PARSHA256_256BITSB, cudaMemcpyHostToDevice); 288 | buffer_ptr += PARSHA256_256BITSB; 289 | } 290 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 291 | 292 | /* execute the kernel */ 293 | total_threads = K1; 294 | total_blocks = total_threads / threads_per_block + (total_threads % threads_per_block == 0 ? 
0 : 1); 295 | #if 0 296 | #ifdef _DEBUG 297 | printf ("bytes readi (flushing): %d\n", bytes_read); 298 | printf ("total blocks: %d\n", total_blocks); 299 | printf ("total_threads: %d\n", total_threads); 300 | printf ("threads_per_block: %d\n", threads_per_block); 301 | #endif 302 | #endif 303 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 304 | parsha256_kernel <<>> (d_input, d_output, total_threads); 305 | CUT_CHECK_ERROR ("Kernel execution failed"); 306 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 307 | 308 | tmp2 = L1 - 1; 309 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 310 | for (j = K1; j <= tmp2; j++) { 311 | cudaMemcpy (d_output + j * PARSHA256_256BITSB, d_output + j * PARSHA256_512BITSB, 312 | PARSHA256_256BITSB, cudaMemcpyDeviceToDevice); 313 | } 314 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 315 | } 316 | 317 | total_blocks = 1; 318 | total_threads = 1; 319 | if (b > 0) { 320 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 321 | cudaMemset (d_input, 0, PARSHA256_768BITSB); 322 | /* Copy intermediate hash states */ 323 | cudaMemcpy (d_input, d_output, PARSHA256_256BITSB, cudaMemcpyDeviceToDevice); 324 | CUT_CHECK_ERROR ("Memory copy failed"); 325 | cudaMemcpy (d_input + PARSHA256_256BITSB, d_output + PARSHA256_256BITSB, 326 | PARSHA256_256BITSB, cudaMemcpyDeviceToDevice); 327 | CUT_CHECK_ERROR ("Memory copy failed"); 328 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 329 | 330 | if (size - bytes_read >= PARSHA256_256BITSB) { 331 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 332 | cudaMemcpy (d_input + PARSHA256_512BITSB, buffer_ptr, PARSHA256_256BITSB, 333 | cudaMemcpyHostToDevice); 334 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 335 | buffer_ptr += PARSHA256_256BITSB; 336 | bytes_read += PARSHA256_256BITSB; 337 | } else { 338 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 339 | cudaMemcpy (d_input + PARSHA256_512BITSB, buffer_ptr, size - bytes_read, 340 | cudaMemcpyHostToDevice); 341 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 342 | bytes_read += (size - bytes_read); 343 | } 344 | 345 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 346 | parsha256_kernel <<>> (d_input, d_output, total_threads); 347 | CUT_CHECK_ERROR ("Kernel execution failed"); 348 | // cudaThreadSynchronize(); 349 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 350 | } 351 | 352 | CUT_SAFE_CALL (cutStartTimer (chmeter.memcpy_timer)); 353 | cudaMemset (d_output + PARSHA256_256BITSB, 0, PARSHA256_512BITSB - 8); 354 | cudaMemcpy (d_input, d_output, PARSHA256_768BITSB, cudaMemcpyDeviceToDevice); 355 | size = size * 8; 356 | /* 357 | * The following line should fail on 32 bit machines. Since reference machine I 358 | * am writing this code on uses 64 bit words thus size of int is 8 bytes. 
359 | */ 360 | cudaMemcpy (d_input + PARSHA256_768BITSB - 8, &size, 8, cudaMemcpyHostToDevice); 361 | 362 | /* Hash one more time */ 363 | CUT_SAFE_CALL (cutStopTimer (chmeter.memcpy_timer)); 364 | CUT_SAFE_CALL (cutStartTimer (chmeter.kernel_timer)); 365 | parsha256_kernel <<>> (d_input, d_output, 1); 366 | CUT_SAFE_CALL (cutStopTimer (chmeter.kernel_timer)); 367 | 368 | /* And we are done here */ 369 | cudaMemcpy (output, d_output, PARSHA256_256BITSB, cudaMemcpyDeviceToHost); 370 | 371 | CUT_SAFE_CALL (cutResetTimer (chmeter.free_timer)); 372 | CUT_SAFE_CALL (cutStartTimer (chmeter.free_timer)); 373 | cudaFree (d_input); 374 | cudaFree (d_output); 375 | CUT_SAFE_CALL (cutStopTimer (chmeter.free_timer)); 376 | } 377 | 378 | int main (int argc, char **argv) 379 | { 380 | unsigned char *buffer; 381 | unsigned int size; 382 | unsigned char output[32]; 383 | 384 | printf ("========================================\n"); 385 | printf ("PARSHA-256 HASH ALGORITHM BENCHMARK TEST\n"); 386 | printf ("========================================\n\n"); 387 | 388 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.kernel_timer)); 389 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.malloc_timer)); 390 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.memcpy_timer)); 391 | CUT_SAFE_CALL (cutCreateTimer ((unsigned int*)&chmeter.free_timer)); 392 | 393 | printf ("SIZE EXEC KERNEL\tcudaMemcpy\tcudaMalloc\tcudaFree\n"); 394 | 395 | for (size = 1000; size <= 100000000; size *= 10) { 396 | buffer = (unsigned char *) malloc (size * sizeof (char)); 397 | if (buffer == NULL) { 398 | printf ("Memory allocation failed\n"); 399 | return -1; 400 | } 401 | 402 | parsha256_gpu (buffer, size, output); 403 | printf ("%-10d%f\t%f\t%f\t%f\n", size, 404 | cutGetTimerValue (chmeter.kernel_timer), 405 | cutGetTimerValue (chmeter.memcpy_timer), 406 | cutGetTimerValue (chmeter.malloc_timer), 407 | cutGetTimerValue (chmeter.free_timer)); 408 | 409 | 410 | free (buffer); 411 | } 412 | } 413 | --------------------------------------------------------------------------------
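
The README notes that the PARSHA-256 port probably has byte-ordering bugs. As an aid for chasing those down, the following is a minimal, standalone host-side sketch that only exercises the byte-order and padding macros already defined in src/common.h; it is not one of the project's source files, and the file name and build line are hypothetical. It illustrates that GET_UINT32_BE performs a big-endian load, GET_UINT32_BE_GPU actually performs a little-endian load despite its name, and LETOBE32 is a plain 32-bit byte swap.

/*
 * byteorder_check.c -- illustrative sanity check for the helpers in
 * src/common.h. Not part of the original repository; the file name and
 * the build line below are assumptions.
 *
 * Possible build: gcc -I./src byteorder_check.c -o byteorder_check
 */
#include <stdio.h>
#include "common.h"

int main (void)
{
    unsigned char bytes[4] = { 0x11, 0x22, 0x33, 0x44 };
    unsigned long n;
    int pad;

    /* GET_UINT32_BE treats bytes[0] as the most significant byte: 0x11223344 */
    GET_UINT32_BE (n, bytes, 0);
    printf ("GET_UINT32_BE     -> %08lx\n", n);

    /* GET_UINT32_BE_GPU treats bytes[0] as the least significant byte,
     * i.e. despite its name it performs a little-endian load: 0x44332211 */
    GET_UINT32_BE_GPU (n, bytes, 0);
    printf ("GET_UINT32_BE_GPU -> %08lx\n", n);

    /* LETOBE32 swaps the byte order of a 32-bit word: 0x11223344 -> 0x44332211 */
    n = LETOBE32 (0x11223344UL);
    printf ("LETOBE32          -> %08lx\n", n);

    /* padding_256 returns the number of pad bytes so that
     * message size + pad + 8 (length field) is a multiple of 64; the first
     * pad byte is later overwritten with 0x80 by the caller in sha1test.cu.
     * For a 3-byte message ("abc"): 3 + 53 + 8 = 64. */
    pad = padding_256 (3);
    printf ("padding_256(3)    -> %d\n", pad);

    return 0;
}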