├── 5
├── CUDA - Assignment.pdf
├── README.md
├── LICENSE
├── 1_Vector_addition.cu
├── 4_Finding_Maximum_in_Array.cu
├── 3_Parallel_Binary_Search.cu
└── 2_Matrix_addition.cu


/5:
--------------------------------------------------------------------------------
1 | 
2 | https://github.com/AlexDWong/dijkstra-CUDA
3 | 


--------------------------------------------------------------------------------
/CUDA - Assignment.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jishanshaikh4/cuda-programs/HEAD/CUDA - Assignment.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CUDA-Programs
 2 | 
 3 | CUDA Programs for Hadoop/CUDA Lab at MANIT, Bhopal
 4 | 
 5 | 
 6 | 4
 7 | 
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Jishan Shaikh
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/1_Vector_addition.cu:
--------------------------------------------------------------------------------
 1 | // Program for Parallel Vector Addition in CUDA
 2 | // For Hadoop-CUDA Lab
 3 | 
 4 | #include <stdio.h>
 5 | #include <cuda.h>
 6 | #include <stdlib.h>
 7 | #include <time.h>
 8 | 
 9 | #define N 1024         // size of array
10 | 
11 | __global__ void add(int *a,int *b, int *c) {
12 | 	int tid = blockIdx.x *  blockDim.x + threadIdx.x;
13 |         if(tid < N){
14 |           c[tid] = a[tid]+b[tid];
15 |         }
16 | }
17 | 
18 | int main(int argc, char *argv[])  {
19 | 	int T = 10, B = 1;            // threads per block and blocks per grid, taking default values
20 | 	int a[N],b[N],c[N];
21 | 	int *dev_a, *dev_b, *dev_c;
22 | 
23 | 	printf("Size of array = %d\n", N);
24 | 	do {
25 | 		printf("Enter number of threads per block: ");
26 | 		scanf("%d",&T);
27 | 		printf("\nEnter nuumber of blocks per grid: ");
28 | 		scanf("%d",&B);
29 | 		if (T * B != N) printf("Error T x B != N, try again");
30 | 	} while (T * B != N);
31 | 
32 | 	cudaEvent_t start, stop;     // using cuda events to measure time
33 | 	float elapsed_time_ms;       // which is applicable for asynchronous code also
34 | 
35 | 	cudaMalloc((void**)&dev_a,N * sizeof(int));
36 | 	cudaMalloc((void**)&dev_b,N * sizeof(int));
37 | 	cudaMalloc((void**)&dev_c,N * sizeof(int));
38 | 
39 | 	for(int i=0;i<N;i++) {    // load arrays with some numbers
40 | 		a[i] = i;
41 | 		b[i] = i*1;
42 | 	}
43 | 
44 | 	cudaMemcpy(dev_a, a , N*sizeof(int),cudaMemcpyHostToDevice);
45 | 	cudaMemcpy(dev_b, b , N*sizeof(int),cudaMemcpyHostToDevice);
46 | 	cudaMemcpy(dev_c, c , N*sizeof(int),cudaMemcpyHostToDevice);
47 | 
48 | 	cudaEventCreate( &start );     // instrument code to measure start time
49 | 	cudaEventCreate( &stop );
50 | 	cudaEventRecord( start, 0 );
51 | 
52 | 	add<<<B,T>>>(dev_a,dev_b,dev_c);
53 | 
54 | 	cudaMemcpy(c,dev_c,N*sizeof(int),cudaMemcpyDeviceToHost);
55 | 
56 | 	cudaEventRecord( stop, 0 );     // instrument code to measue end time
57 | 	cudaEventSynchronize( stop );
58 | 	cudaEventElapsedTime( &elapsed_time_ms, start, stop );
59 | 
60 | 	for(int i=0;i<N;i++) {
61 | 		printf("%d+%d=%d\n",a[i],b[i],c[i]);
62 | 	}
63 | 
64 | 	printf("Time to calculate results: %f ms.\n", elapsed_time_ms);  // print out execution time
65 | 
66 | 	// clean up
67 | 	cudaFree(dev_a);
68 | 	cudaFree(dev_b);
69 | 	cudaFree(dev_c);
70 | 
71 | 	cudaEventDestroy(start);
72 | 	cudaEventDestroy(stop);
73 | 
74 | 	return 0;
75 | }
76 | 
77 | 


--------------------------------------------------------------------------------
/4_Finding_Maximum_in_Array.cu:
--------------------------------------------------------------------------------
  1 | // Program for Finding Maximum element in CUDA using Reduction technique
  2 | // For Hadoop-CUDA Lab
  3 | // NOTE: THIS PROGRAM USES SOME DEPRECATED FUNCTIONS; HENCE THE WARNINGS!
  4 | 
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #include <math.h>
  8 | #include <time.h>
  9 | #include <cuda.h>
 10 | 
 11 | // Thread block size
 12 | #define BLOCK_SIZE 512
 13 | 
 14 | //  Size of Array
 15 | #define SOA 8192
 16 | 
 17 | // Allocates an array with random integer entries.
 18 | void randomInit(int* data, int size)
 19 | {
 20 | 	for (int i = 0; i < size; ++i)
 21 | 		data[i] = rand_r();
 22 | }
 23 | 
 24 | __global__ void ReductionMax2(int *input, int *results, int n)    //take thread divergence into account
 25 | {	
 26 | 	extern __shared__ int sdata[]; 
 27 | 	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
 28 | 	unsigned int tx = threadIdx.x; 
 29 | 	 //load input into __shared__ memory 
 30 | 	int x = INT_MIN; 
 31 | 	if(i < n) 
 32 | 		x = input[i]; 
 33 | 	sdata[tx] = x; 
 34 | 	__syncthreads();
 35 | 
 36 | 	// block-wide reduction
 37 | 	for(unsigned int offset = blockDim.x>>1; offset > 0; offset >>= 1)
 38 | 	{
 39 | 		__syncthreads();
 40 | 		if(tx < offset)
 41 | 	    {
 42 | 			if(sdata[tx + offset] > sdata[tx])
 43 | 				sdata[tx] = sdata[tx + offset];
 44 | 		}
 45 | 
 46 | 	}
 47 | 
 48 | 		// finally, thread 0 writes the result 
 49 | 	if(threadIdx.x == 0) 
 50 | 	{ 
 51 | 		// the result is per-block 
 52 | 		results[blockIdx.x] = sdata[0]; 
 53 | 	} 
 54 | }
 55 | 
 56 | 
 57 | // get global max element via per-block reductions 
 58 | 	int main() 
 59 | 	{ 
 60 | 		int num_blocks = SOA / BLOCK_SIZE;
 61 | 		
 62 | 		//allocate host memory for array a
 63 | 		unsigned int mem_size_a = sizeof(int) * SOA;
 64 | 		int* h_a = (int*)malloc(mem_size_a);
 65 | 		
 66 | 		//initialize host memory
 67 | 		randomInit(h_a,SOA);
 68 | 
 69 | 		//allocate device memory
 70 | 		int* d_a;
 71 | 		cudaMalloc((void**) &d_a, mem_size_a);
 72 | 
 73 | 		//copy host memory to device
 74 | 		cudaMemcpy(d_a, h_a, mem_size_a, cudaMemcpyHostToDevice);
 75 | 
 76 | 		//allocate device memory for temporary results
 77 | 		unsigned int mem_size_b = sizeof(int) * SOA;
 78 | 		int* d_b;
 79 | 		cudaMalloc((void**) &d_b, mem_size_b);
 80 | 		
 81 | 		//allocate device memory for final result
 82 | 		unsigned int mem_size_c = sizeof(int) * num_blocks;
 83 | 		int* d_c;
 84 | 		cudaMalloc((void**) &d_c, mem_size_c);	
 85 | 
 86 | 		//setup execution parameters
 87 | 		dim3 block(1,BLOCK_SIZE);
 88 | 		dim3 grid(4,4);
 89 | 
 90 | 		//execute the kernel
 91 | 		//first reduce per-block partial maxs
 92 | 		ReductionMax2<<<grid, block>>>(d_a,d_b,SOA);
 93 | 		
 94 | 		//then reduce partial maxs to a final max
 95 | 		ReductionMax2<<<grid, block>>>(d_b,d_c,num_blocks);
 96 | 
 97 |        	// allocate host memory for the result
 98 | 		int* h_c = (int*)malloc(mem_size_c);
 99 | 
100 | 		//copy final result from device to host
101 | 		cudaMemcpy(h_c, d_c, mem_size_c, cudaMemcpyDeviceToHost);
102 | 
103 | 		//clean up memory
104 | 		free(h_a);
105 | 		free(h_c);
106 | 		cudaFree(d_a);
107 | 		cudaFree(d_b);
108 | 		cudaFree(d_c);
109 | 
110 | 		cudaThreadExit();
111 | 
112 | }
113 | 


--------------------------------------------------------------------------------
/3_Parallel_Binary_Search.cu:
--------------------------------------------------------------------------------
  1 | // Program for Parallel Binary Search in CUDA
  2 | // For Hadoop-CUDA Lab
  3 | 
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <cuda.h>
  7 | #include <cuda_runtime.h>
  8 | 
  9 | 
 10 | #include <assert.h>
 11 | 
 12 | __device__ int get_index_to_check(int thread, int num_threads, int set_size, int offset) {
 13 | 
 14 | 	// Integer division trick to round up
 15 | 	return (((set_size + num_threads) / num_threads) * thread) + offset;
 16 | }
 17 | 
 18 | __global__ void p_ary_search(int search, int array_length,  int *arr, int *ret_val ) {
 19 | 
 20 | 	const int num_threads = blockDim.x * gridDim.x;
 21 | 	const int thread = blockIdx.x * blockDim.x + threadIdx.x;
 22 | 	
 23 | 	//ret_val[0] = -1;
 24 | 	//ret_val[1] = offset;
 25 | 
 26 | 	int set_size = array_length;
 27 | 
 28 | 	
 29 | 	while(set_size != 0){
 30 | 		// Get the offset of the array, initially set to 0
 31 | 		int offset = ret_val[1];
 32 | 		
 33 | 		// I think this is necessary in case a thread gets ahead, and resets offset before it's read
 34 | 		// This isn't necessary for the unit tests to pass, but I still like it here
 35 | 		__syncthreads();	
 36 | 
 37 | 		// Get the next index to check
 38 | 		int index_to_check = get_index_to_check(thread, num_threads, set_size, offset);
 39 | 
 40 | 		// If the index is outside the bounds of the array then lets not check it
 41 | 		if (index_to_check < array_length){
 42 | 
 43 | 			// If the next index is outside the bounds of the array, then set it to maximum array size
 44 | 			int next_index_to_check = get_index_to_check(thread + 1, num_threads, set_size, offset);
 45 | 
 46 | 			if (next_index_to_check >= array_length){
 47 | 				next_index_to_check = array_length - 1;
 48 | 			}
 49 | 
 50 | 			// If we're at the mid section of the array reset the offset to this index
 51 | 			if (search > arr[index_to_check] && (search < arr[next_index_to_check])) {
 52 | 				ret_val[1] = index_to_check;
 53 | 			}
 54 | 			else if (search == arr[index_to_check]) {
 55 | 				// Set the return var if we hit it
 56 | 				ret_val[0] = index_to_check;
 57 | 			}	
 58 | 		}
 59 | 
 60 | 		// Since this is a p-ary search divide by our total threads to get the next set size
 61 | 		set_size = set_size / num_threads;
 62 | 		
 63 | 		// Sync up so no threads jump ahead and get a bad offset
 64 | 		__syncthreads();
 65 | 	}
 66 | }
 67 | 
 68 | 
 69 | int chop_position(int search, int *search_array, int array_length)
 70 | {
 71 | 	// Get the size of the array for future use
 72 | 	int array_size = array_length * sizeof(int);
 73 | 
 74 | 	// Don't bother with small arrays
 75 | 	if (array_size == 0) return -1;
 76 | 
 77 | 	// Setup array to use on device
 78 |     int    *dev_arr;
 79 | 	cudaMalloc((void**)&dev_arr, array_size);
 80 | 
 81 | 	// Copy search array values
 82 | 	cudaMemcpy(dev_arr, search_array, array_size, cudaMemcpyHostToDevice);
 83 | 
 84 | 	// return values here and on device
 85 | 	int		*ret_val = (int*)malloc(sizeof(int) * 2);
 86 | 	ret_val[0] = -1; // return value
 87 | 	ret_val[1] = 0; // offset
 88 | 	array_length = array_length % 2 == 0 ? array_length : array_length - 1; // array size
 89 | 
 90 | 	int		*dev_ret_val;
 91 | 	cudaMalloc((void**)&dev_ret_val, sizeof(int) * 2);
 92 | 
 93 | 	// Send in some intialized values
 94 | 	cudaMemcpy(dev_ret_val, ret_val, sizeof(int) * 2, cudaMemcpyHostToDevice);
 95 | 	
 96 | 	// Launch kernel
 97 | 	// This seems to be the best combo for p-ary search
 98 | 	// Optimized around 10-15 registers per thread
 99 | 	p_ary_search<<<16, 64>>>(search, array_length, dev_arr, dev_ret_val);
100 | 
101 | 	// Get results
102 | 	cudaMemcpy(ret_val, dev_ret_val, 2 * sizeof(int), cudaMemcpyDeviceToHost);
103 | 
104 | 	int ret = ret_val[0];
105 | 
106 | 	printf("Ret Val %i    Offset %i\n", ret, ret_val[1]);
107 | 
108 | 	// Free memory on device
109 | 	cudaFree(dev_arr);
110 | 	cudaFree(dev_ret_val);
111 | 	
112 | 	free(ret_val);
113 | 
114 | 	return ret;
115 | }
116 | 
117 | // Test region
118 | static int * build_array(int length) {
119 | 
120 | 	int *ret_val = (int*)malloc(length * sizeof(int));
121 | 
122 | 	for (int i = 0; i < length; i++)
123 | 	{
124 | 		ret_val[i] = i * 2 - 1;
125 | 	}
126 | 
127 | 	return ret_val;
128 | }
129 | 
130 | static void test_array(int length, int search, int index) {
131 | 	
132 | 	printf("Length %i   Search %i    Index %i\n", length, search, index);
133 | 	// assert(index == chop_position(search, build_array(length), length) && "test_small_array()");
134 | 
135 | }
136 | 
137 | static void test_arrays() {	
138 | 	
139 | 	test_array(200, 200, -1);
140 | 	
141 | 	test_array(200, -1, 0);
142 | 	
143 | 	test_array(200, 1, 1);
144 | 	
145 | 	test_array(200, 29, 15);
146 | 	
147 | 	test_array(200, 129, 65);	
148 | 
149 | 	test_array(200, 395, 198);
150 | 	
151 | 	test_array(20000, 395, 198);
152 | 	
153 | 	test_array(2000000, 394, -1);
154 | 	
155 | 	test_array(20000000, 394, -1);
156 | }
157 | 
158 | 
159 | int main(){
160 | 	test_arrays();
161 | }
162 | 


--------------------------------------------------------------------------------
/2_Matrix_addition.cu:
--------------------------------------------------------------------------------
  1 | // Program for Matrix Addition in CUDA
  2 | // For Hadoop-CUDA Lab
  3 | 
  4 | #include <stdio.h>
  5 | #include <cuda.h>
  6 | #include <stdlib.h>
  7 | 
  8 | __global__ void gpu_matrixadd(int *a,int *b, int *c, int N) {
  9 | 
 10 | 	int col = threadIdx.x + blockDim.x * blockIdx.x; 
 11 | 	int row = threadIdx.y + blockDim.y * blockIdx.y;
 12 | 
 13 | 	int index = row * N + col;
 14 | 
 15 |       	if(col < N && row < N)
 16 |           c[index] = a[index]+b[index];
 17 | 
 18 | }
 19 | 
 20 | void cpu_matrixadd(int *a,int *b, int *c, int N) {
 21 | 
 22 | 	int index;
 23 | 	for(int col=0;col < N; col++) 
 24 | 		for(int row=0;row < N; row++) {
 25 | 			index = row * N + col;
 26 |            		c[index] = a[index]+b[index];
 27 | 		}
 28 | }
 29 | 
 30 | int main(int argc, char *argv[])  {
 31 | 
 32 | 	char key;
 33 | 
 34 | 	int i, j; 					// loop counters
 35 | 
 36 | 	int Grid_Dim_x=1, Grid_Dim_y=1;			//Grid structure values
 37 | 	int Block_Dim_x=1, Block_Dim_y=1;		//Block structure values
 38 | 
 39 | 	int noThreads_x, noThreads_y;		// number of threads available in device, each dimension
 40 | 	int noThreads_block;				// number of threads in a block
 41 | 
 42 | 	int N = 10;  					// size of array in each dimension
 43 | 	int *a,*b,*c,*d;
 44 | 	int *dev_a, *dev_b, *dev_c;
 45 | 	int size;					// number of bytes in arrays
 46 | 
 47 | 	cudaEvent_t start, stop;     		// using cuda events to measure time
 48 | 	float elapsed_time_ms;       		// which is applicable for asynchronous code also
 49 | 
 50 | /* --------------------ENTER INPUT PARAMETERS AND DATA -----------------------*/
 51 | 
 52 | do {  // loop to repeat complete program
 53 | 
 54 | 	printf ("Device characteristics -- some limitations (compute capability 1.0)\n");
 55 | 	printf ("		Maximum number of threads per block = 512\n");
 56 | 	printf ("		Maximum sizes of x- and y- dimension of thread block = 512\n");
 57 | 	printf ("		Maximum size of each dimension of grid of thread blocks = 65535\n");
 58 | 	
 59 | 	printf("Enter size of array in one dimension (square array), currently %d\n",N);
 60 | 	scanf("%d",&N);
 61 | 		
 62 | 	do {
 63 | 		printf("\nEnter nuumber of blocks per grid in x dimension), currently %d  : ",Grid_Dim_x);
 64 | 		scanf("%d",&Grid_Dim_x);
 65 | 
 66 | 		printf("\nEnter nuumber of blocks per grid in y dimension), currently %d  : ",Grid_Dim_y);
 67 | 		scanf("%d",&Grid_Dim_y);
 68 | 
 69 | 		printf("\nEnter nuumber of threads per block in x dimension), currently %d  : ",Block_Dim_x);
 70 | 		scanf("%d",&Block_Dim_x);
 71 | 
 72 | 		printf("\nEnter nuumber of threads per block in y dimension), currently %d  : ",Block_Dim_y);
 73 | 		scanf("%d",&Block_Dim_y);
 74 | 
 75 | 		noThreads_x = Grid_Dim_x * Block_Dim_x;		// number of threads in x dimension
 76 | 		noThreads_y = Grid_Dim_y * Block_Dim_y;		// number of threads in y dimension
 77 | 
 78 | 		noThreads_block = Block_Dim_x * Block_Dim_y;	// number of threads in a block
 79 | 
 80 | 		if (noThreads_x < N) printf("Error -- number of threads in x dimension less than number of elements in arrays, try again\n");
 81 | 		else if (noThreads_y < N) printf("Error -- number of threads in y dimension less than number of elements in arrays, try again\n");
 82 | 		else if (noThreads_block > 512) printf("Error -- too many threads in block, try again\n");
 83 | 		else printf("Number of threads not used = %d\n", noThreads_x * noThreads_y - N * N);
 84 | 
 85 | 	} while (noThreads_x < N || noThreads_y < N || noThreads_block > 512);
 86 | 
 87 | 	dim3 Grid(Grid_Dim_x, Grid_Dim_x);		//Grid structure
 88 | 	dim3 Block(Block_Dim_x,Block_Dim_y);	//Block structure, threads/block limited by specific device
 89 | 
 90 | 	size = N * N * sizeof(int);		// number of bytes in total in arrays
 91 | 
 92 | 	a = (int*) malloc(size);		//this time use dynamically allocated memory for arrays on host
 93 | 	b = (int*) malloc(size);
 94 | 	c = (int*) malloc(size);		// results from GPU
 95 | 	d = (int*) malloc(size);		// results from CPU
 96 | 
 97 | 	for(i=0;i < N;i++)			// load arrays with some numbers
 98 | 	for(j=0;j < N;j++) {
 99 | 		a[i * N + j] = i;
100 | 		b[i * N + j] = i;
101 | 	}
102 | 
103 | /* ------------- COMPUTATION DONE ON GPU ----------------------------*/
104 | 
105 | 	cudaMalloc((void**)&dev_a, size);		// allocate memory on device
106 | 	cudaMalloc((void**)&dev_b, size);
107 | 	cudaMalloc((void**)&dev_c, size);
108 | 
109 | 	cudaMemcpy(dev_a, a , size ,cudaMemcpyHostToDevice);
110 | 	cudaMemcpy(dev_b, b , size ,cudaMemcpyHostToDevice);
111 | 	cudaMemcpy(dev_c, c , size ,cudaMemcpyHostToDevice);
112 | 
113 | 	cudaEventCreate(&start);     		// instrument code to measure start time
114 | 	cudaEventCreate(&stop);
115 | 
116 | 	cudaEventRecord(start, 0);
117 | //	cudaEventSynchronize(start);  	// Needed?
118 | 
119 | 	gpu_matrixadd<<<Grid,Block>>>(dev_a,dev_b,dev_c,N);
120 | 
121 | 	cudaMemcpy(c,dev_c, size ,cudaMemcpyDeviceToHost);
122 | 
123 | 	cudaEventRecord(stop, 0);     	// instrument code to measue end time
124 | 	cudaEventSynchronize(stop);
125 | 	cudaEventElapsedTime(&elapsed_time_ms, start, stop );
126 | 
127 | //	for(i=0;i < N;i++) 
128 | //	for(j=0;j < N;j++)
129 | //	   printf("%d+%d=%d\n",a[i * N + j],b[i * N + j],c[i * N + j]);
130 | 
131 | 	printf("Time to calculate results on GPU: %f ms.\n", elapsed_time_ms);  // print out execution time
132 | 
133 | /* ------------- COMPUTATION DONE ON HOST CPU ----------------------------*/
134 | 
135 | 	cudaEventRecord(start, 0);		// use same timing
136 | //	cudaEventSynchronize(start);  	// Needed?
137 | 
138 | 	cpu_matrixadd(a,b,d,N);		// do calculation on host
139 | 
140 | 	cudaEventRecord(stop, 0);     	// instrument code to measue end time
141 | 	cudaEventSynchronize(stop);
142 | 	cudaEventElapsedTime(&elapsed_time_ms, start, stop );
143 | 
144 | 	printf("Time to calculate results on CPU: %f ms.\n", elapsed_time_ms);  // print out execution time
145 | 
146 | /* ------------------- check device creates correct results -----------------*/
147 | 
148 | 	for(i=0;i < N*N;i++) {
149 | 		if (c[i] != d[i]) printf("*********** ERROR in results, CPU and GPU create different answers ********\n");
150 | 		break;
151 | 	}
152 | 
153 | 	printf("\nEnter c to repeat, return to terminate\n");
154 | 	scanf("%c",&key);
155 | 	scanf("%c",&key);
156 | 
157 | } while (key == 'c'); // loop of complete program
158 | 
159 | /* --------------  clean up  ---------------------------------------*/
160 | 	free(a);
161 | 	free(b);
162 | 	free(c);
163 | 	cudaFree(dev_a);
164 | 	cudaFree(dev_b);
165 | 	cudaFree(dev_c);
166 | 
167 | 	cudaEventDestroy(start);
168 | 	cudaEventDestroy(stop);
169 | 
170 | 	return 0;
171 | }
172 | 
173 | 
174 | 
175 | 


--------------------------------------------------------------------------------