├── 5
├── CUDA - Assignment.pdf
├── README.md
├── LICENSE
├── 1_Vector_addition.cu
├── 4_Finding_Maximum_in_Array.cu
├── 3_Parallel_Binary_Search.cu
└── 2_Matrix_addition.cu

--------------------------------------------------------------------------------
/5:
--------------------------------------------------------------------------------
https://github.com/AlexDWong/dijkstra-CUDA
--------------------------------------------------------------------------------
/CUDA - Assignment.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jishanshaikh4/cuda-programs/HEAD/CUDA - Assignment.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CUDA-Programs

CUDA programs for the Hadoop/CUDA Lab at MANIT, Bhopal.
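Contents:

- `1_Vector_addition.cu`: parallel vector addition
- `2_Matrix_addition.cu`: matrix addition, with GPU vs. CPU timing
- `3_Parallel_Binary_Search.cu`: p-ary (parallel) binary search
- `4_Finding_Maximum_in_Array.cu`: maximum of an array via block-wise reduction
- `5`: pointer to an external CUDA Dijkstra implementation

Each program is a standalone `.cu` file. Assuming the CUDA toolkit is installed and `nvcc` is on the PATH, a typical build and run looks like `nvcc 1_Vector_addition.cu -o vector_addition` followed by `./vector_addition` (exact flags may vary with your GPU's compute capability).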
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Jishan Shaikh

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/1_Vector_addition.cu:
--------------------------------------------------------------------------------
// Program for Parallel Vector Addition in CUDA
// For Hadoop-CUDA Lab

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define N 1024 // size of array

__global__ void add(int *a, int *b, int *c) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

int main(int argc, char *argv[]) {
    int T = 10, B = 1; // threads per block and blocks per grid, taking default values
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    printf("Size of array = %d\n", N);
    do {
        printf("Enter number of threads per block: ");
        scanf("%d", &T);
        printf("\nEnter number of blocks per grid: ");
        scanf("%d", &B);
        if (T * B != N) printf("Error: T x B != N, try again\n");
    } while (T * B != N);

    cudaEvent_t start, stop; // using CUDA events to measure time,
    float elapsed_time_ms;   // which works for asynchronous code as well

    cudaMalloc((void**)&dev_a, N * sizeof(int));
    cudaMalloc((void**)&dev_b, N * sizeof(int));
    cudaMalloc((void**)&dev_c, N * sizeof(int));

    for (int i = 0; i < N; i++) { // load arrays with some numbers
        a[i] = i;
        b[i] = i;
    }

    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);

    cudaEventCreate(&start); // instrument code to measure start time
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    add<<<B, T>>>(dev_a, dev_b, dev_c);

    cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);

    cudaEventRecord(stop, 0); // instrument code to measure end time
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_time_ms, start, stop);

    for (int i = 0; i < N; i++) // print the results
        printf("%d + %d = %d\n", a[i], b[i], c[i]);

    printf("Time to calculate results: %f ms.\n", elapsed_time_ms);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}
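
// A minimal grid-stride sketch (a hypothetical variant, not called by the
// main above): it drops the T * B == N restriction because each thread
// strides over the whole array in steps of the total thread count.
__global__ void add_gridstride(int *a, int *b, int *c, int n) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += blockDim.x * gridDim.x) {
        c[i] = a[i] + b[i];
    }
}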
--------------------------------------------------------------------------------
/4_Finding_Maximum_in_Array.cu:
--------------------------------------------------------------------------------
// This program finds the maximum element in an array
// For Hadoop-CUDA Lab

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Thread block size
#define BLOCK_SIZE 512

// Size of array
#define SOA 8192

// Allocates an array with random integer entries.
void randomInit(int* data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = rand();
}

__global__ void ReductionMax2(int *input, int *results, int n) // takes thread divergence into account
{
    extern __shared__ int sdata[];
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int tx = threadIdx.x;

    // load input into __shared__ memory
    int x = INT_MIN;
    if (i < n)
        x = input[i];
    sdata[tx] = x;
    __syncthreads();

    // block-wide reduction
    for (unsigned int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        __syncthreads();
        if (tx < offset)
        {
            if (sdata[tx + offset] > sdata[tx])
                sdata[tx] = sdata[tx + offset];
        }
    }

    // finally, thread 0 writes the per-block result
    if (threadIdx.x == 0)
    {
        results[blockIdx.x] = sdata[0];
    }
}

// get the global maximum element via per-block reductions
int main()
{
    int num_blocks = SOA / BLOCK_SIZE;

    // allocate host memory for array a
    unsigned int mem_size_a = sizeof(int) * SOA;
    int* h_a = (int*)malloc(mem_size_a);

    // initialize host memory
    randomInit(h_a, SOA);

    // allocate device memory
    int* d_a;
    cudaMalloc((void**)&d_a, mem_size_a);

    // copy host memory to device
    cudaMemcpy(d_a, h_a, mem_size_a, cudaMemcpyHostToDevice);

    // allocate device memory for per-block partial results
    unsigned int mem_size_b = sizeof(int) * num_blocks;
    int* d_b;
    cudaMalloc((void**)&d_b, mem_size_b);

    // allocate device memory for the final result
    unsigned int mem_size_c = sizeof(int);
    int* d_c;
    cudaMalloc((void**)&d_c, mem_size_c);

    // execute the kernel:
    // first reduce to per-block partial maxima
    // (the third launch parameter sizes the extern __shared__ buffer)
    ReductionMax2<<<num_blocks, BLOCK_SIZE, BLOCK_SIZE * sizeof(int)>>>(d_a, d_b, SOA);

    // then reduce the partial maxima to a final maximum with a single block
    ReductionMax2<<<1, num_blocks, num_blocks * sizeof(int)>>>(d_b, d_c, num_blocks);

    // allocate host memory for the result
    int* h_c = (int*)malloc(mem_size_c);

    // copy the final result from device to host
    cudaMemcpy(h_c, d_c, mem_size_c, cudaMemcpyDeviceToHost);

    printf("Maximum element = %d\n", h_c[0]);

    // clean up memory
    free(h_a);
    free(h_c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    cudaDeviceReset();

    return 0;
}
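
// A minimal host-side reference check (a hypothetical helper, not called
// from the main above): recomputing the maximum sequentially and comparing
// it with h_c[0] would validate the two-stage reduction.
int hostMax(const int* data, int n)
{
    int best = INT_MIN;
    for (int i = 0; i < n; ++i)
        if (data[i] > best)
            best = data[i];
    return best;
}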
--------------------------------------------------------------------------------
/3_Parallel_Binary_Search.cu:
--------------------------------------------------------------------------------
// Program for Parallel Binary Search in CUDA
// For Hadoop-CUDA Lab

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda.h>

#include <cuda_runtime.h>

__device__ int get_index_to_check(int thread, int num_threads, int set_size, int offset) {

    // Integer division trick to round up
    return (((set_size + num_threads) / num_threads) * thread) + offset;
}

__global__ void p_ary_search(int search, int array_length, int *arr, int *ret_val) {

    const int num_threads = blockDim.x * gridDim.x;
    const int thread = blockIdx.x * blockDim.x + threadIdx.x;

    // ret_val[0] = -1;     (initialized on the host: result index)
    // ret_val[1] = offset; (initialized on the host: search offset)

    int set_size = array_length;

    while (set_size != 0) {
        // Get the offset of the array, initially set to 0
        int offset = ret_val[1];

        // Needed in case a thread gets ahead and resets the offset before the
        // others have read it. Not required for the unit tests to pass, but
        // kept here for safety.
        __syncthreads();

        // Get the next index to check
        int index_to_check = get_index_to_check(thread, num_threads, set_size, offset);

        // If the index is outside the bounds of the array, don't check it
        if (index_to_check < array_length) {

            // If the next index is outside the bounds of the array, clamp it
            // to the last valid index
            int next_index_to_check = get_index_to_check(thread + 1, num_threads, set_size, offset);

            if (next_index_to_check >= array_length) {
                next_index_to_check = array_length - 1;
            }

            // If the target falls between this thread's index and the next,
            // reset the offset to this index
            if (search > arr[index_to_check] && search < arr[next_index_to_check]) {
                ret_val[1] = index_to_check;
            }
            else if (search == arr[index_to_check]) {
                // Set the return value if we hit the target
                ret_val[0] = index_to_check;
            }
        }

        // Since this is a p-ary search, divide by the total thread count
        // to get the next set size
        set_size = set_size / num_threads;

        // Sync up so no thread jumps ahead and reads a bad offset
        __syncthreads();
    }
}

int chop_position(int search, int *search_array, int array_length)
{
    // Get the size of the array in bytes for the device allocations below
    int array_size = array_length * sizeof(int);

    // Don't bother with empty arrays
    if (array_size == 0) return -1;

    // Set up the array on the device
    int *dev_arr;
    cudaMalloc((void**)&dev_arr, array_size);

    // Copy the search array values
    cudaMemcpy(dev_arr, search_array, array_size, cudaMemcpyHostToDevice);

    // Return values, on the host and on the device
    int *ret_val = (int*)malloc(sizeof(int) * 2);
    ret_val[0] = -1; // return value
    ret_val[1] = 0;  // offset
    array_length = array_length % 2 == 0 ? array_length : array_length - 1; // force an even array size

    int *dev_ret_val;
    cudaMalloc((void**)&dev_ret_val, sizeof(int) * 2);

    // Send in the initialized values
    cudaMemcpy(dev_ret_val, ret_val, sizeof(int) * 2, cudaMemcpyHostToDevice);

    // Launch the kernel
    // This seems to be the best combination for p-ary search,
    // optimized around 10-15 registers per thread
    p_ary_search<<<16, 64>>>(search, array_length, dev_arr, dev_ret_val);

    // Get the results
    cudaMemcpy(ret_val, dev_ret_val, 2 * sizeof(int), cudaMemcpyDeviceToHost);

    int ret = ret_val[0];

    printf("Ret Val %i Offset %i\n", ret, ret_val[1]);

    // Free memory on the device
    cudaFree(dev_arr);
    cudaFree(dev_ret_val);

    free(ret_val);

    return ret;
}

// Test region
static int * build_array(int length) {

    int *ret_val = (int*)malloc(length * sizeof(int));

    for (int i = 0; i < length; i++)
    {
        ret_val[i] = i * 2 - 1;
    }

    return ret_val;
}

static void test_array(int length, int search, int index) {

    printf("Length %i Search %i Index %i\n", length, search, index);
    // assert(index == chop_position(search, build_array(length), length) && "test_small_array()");
}

static void test_arrays() {

    test_array(200, 200, -1);
    test_array(200, -1, 0);
    test_array(200, 1, 1);
    test_array(200, 29, 15);
    test_array(200, 129, 65);
    test_array(200, 395, 198);
    test_array(20000, 395, 198);
    test_array(2000000, 394, -1);
    test_array(20000000, 394, -1);
}

int main() {
    test_arrays();
    return 0;
}
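
// A minimal sequential reference (a hypothetical helper, not wired into the
// tests above): a classic host binary search that the commented-out assert
// in test_array() could compare against.
static int host_binary_search(const int *arr, int length, int search) {
    int lo = 0, hi = length - 1;
    while (lo <= hi) {
        int mid = lo + (hi - lo) / 2;
        if (arr[mid] == search) return mid;
        if (arr[mid] < search) lo = mid + 1;
        else hi = mid - 1;
    }
    return -1; // not found
}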
--------------------------------------------------------------------------------
/2_Matrix_addition.cu:
--------------------------------------------------------------------------------
// Program for Matrix Addition in CUDA
// For Hadoop-CUDA Lab

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

__global__ void gpu_matrixadd(int *a, int *b, int *c, int N) {

    int col = threadIdx.x + blockDim.x * blockIdx.x;
    int row = threadIdx.y + blockDim.y * blockIdx.y;

    int index = row * N + col;

    if (col < N && row < N)
        c[index] = a[index] + b[index];
}

void cpu_matrixadd(int *a, int *b, int *c, int N) {

    int index;
    for (int col = 0; col < N; col++)
        for (int row = 0; row < N; row++) {
            index = row * N + col;
            c[index] = a[index] + b[index];
        }
}

int main(int argc, char *argv[]) {

    char key;

    int i, j; // loop counters

    int Grid_Dim_x = 1, Grid_Dim_y = 1;   // grid structure values
    int Block_Dim_x = 1, Block_Dim_y = 1; // block structure values

    int noThreads_x, noThreads_y; // number of threads available in device, each dimension
    int noThreads_block;          // number of threads in a block

    int N = 10; // size of array in each dimension
    int *a, *b, *c, *d;
    int *dev_a, *dev_b, *dev_c;
    int size; // number of bytes in arrays

    cudaEvent_t start, stop; // using CUDA events to measure time,
    float elapsed_time_ms;   // which works for asynchronous code as well

    /* -------------------- ENTER INPUT PARAMETERS AND DATA ----------------------- */

    do { // loop to repeat complete program

        printf("Device characteristics -- some limitations (compute capability 1.0)\n");
        printf("  Maximum number of threads per block = 512\n");
        printf("  Maximum sizes of x- and y-dimension of thread block = 512\n");
        printf("  Maximum size of each dimension of grid of thread blocks = 65535\n");
1.0)\n"); 55 | printf (" Maximum number of threads per block = 512\n"); 56 | printf (" Maximum sizes of x- and y- dimension of thread block = 512\n"); 57 | printf (" Maximum size of each dimension of grid of thread blocks = 65535\n"); 58 | 59 | printf("Enter size of array in one dimension (square array), currently %d\n",N); 60 | scanf("%d",&N); 61 | 62 | do { 63 | printf("\nEnter nuumber of blocks per grid in x dimension), currently %d : ",Grid_Dim_x); 64 | scanf("%d",&Grid_Dim_x); 65 | 66 | printf("\nEnter nuumber of blocks per grid in y dimension), currently %d : ",Grid_Dim_y); 67 | scanf("%d",&Grid_Dim_y); 68 | 69 | printf("\nEnter nuumber of threads per block in x dimension), currently %d : ",Block_Dim_x); 70 | scanf("%d",&Block_Dim_x); 71 | 72 | printf("\nEnter nuumber of threads per block in y dimension), currently %d : ",Block_Dim_y); 73 | scanf("%d",&Block_Dim_y); 74 | 75 | noThreads_x = Grid_Dim_x * Block_Dim_x; // number of threads in x dimension 76 | noThreads_y = Grid_Dim_y * Block_Dim_y; // number of threads in y dimension 77 | 78 | noThreads_block = Block_Dim_x * Block_Dim_y; // number of threads in a block 79 | 80 | if (noThreads_x < N) printf("Error -- number of threads in x dimension less than number of elements in arrays, try again\n"); 81 | else if (noThreads_y < N) printf("Error -- number of threads in y dimension less than number of elements in arrays, try again\n"); 82 | else if (noThreads_block > 512) printf("Error -- too many threads in block, try again\n"); 83 | else printf("Number of threads not used = %d\n", noThreads_x * noThreads_y - N * N); 84 | 85 | } while (noThreads_x < N || noThreads_y < N || noThreads_block > 512); 86 | 87 | dim3 Grid(Grid_Dim_x, Grid_Dim_x); //Grid structure 88 | dim3 Block(Block_Dim_x,Block_Dim_y); //Block structure, threads/block limited by specific device 89 | 90 | size = N * N * sizeof(int); // number of bytes in total in arrays 91 | 92 | a = (int*) malloc(size); //this time use dynamically allocated memory for arrays on host 93 | b = (int*) malloc(size); 94 | c = (int*) malloc(size); // results from GPU 95 | d = (int*) malloc(size); // results from CPU 96 | 97 | for(i=0;i < N;i++) // load arrays with some numbers 98 | for(j=0;j < N;j++) { 99 | a[i * N + j] = i; 100 | b[i * N + j] = i; 101 | } 102 | 103 | /* ------------- COMPUTATION DONE ON GPU ----------------------------*/ 104 | 105 | cudaMalloc((void**)&dev_a, size); // allocate memory on device 106 | cudaMalloc((void**)&dev_b, size); 107 | cudaMalloc((void**)&dev_c, size); 108 | 109 | cudaMemcpy(dev_a, a , size ,cudaMemcpyHostToDevice); 110 | cudaMemcpy(dev_b, b , size ,cudaMemcpyHostToDevice); 111 | cudaMemcpy(dev_c, c , size ,cudaMemcpyHostToDevice); 112 | 113 | cudaEventCreate(&start); // instrument code to measure start time 114 | cudaEventCreate(&stop); 115 | 116 | cudaEventRecord(start, 0); 117 | // cudaEventSynchronize(start); // Needed? 

        /* ------------- COMPUTATION DONE ON HOST CPU ---------------------------- */

        cudaEventRecord(start, 0); // use same timing
        // cudaEventSynchronize(start); // Needed?

        cpu_matrixadd(a, b, d, N); // do calculation on host

        cudaEventRecord(stop, 0); // instrument code to measure end time
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed_time_ms, start, stop);

        printf("Time to calculate results on CPU: %f ms.\n", elapsed_time_ms); // print out execution time

        /* ------------------- check device creates correct results ----------------- */

        for (i = 0; i < N * N; i++) {
            if (c[i] != d[i]) {
                printf("*********** ERROR in results, CPU and GPU create different answers ********\n");
                break;
            }
        }

        printf("\nEnter c to repeat, return to terminate\n");
        scanf("%c", &key);
        scanf("%c", &key);

    } while (key == 'c'); // loop of complete program

    /* -------------- clean up --------------------------------------- */
    free(a);
    free(b);
    free(c);
    free(d);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}
--------------------------------------------------------------------------------