├── CUDA
│   ├── asyncAPI.cu
│   ├── cdpSimplePrint.cu
│   ├── cdpSimpleQuicksort.cu
│   ├── clock.cu
│   ├── cppIntegration.cu
│   ├── cppOverload.cu
│   ├── matrixMul.cu
│   ├── simpleAssert.cu
│   ├── simpleAssert_kernel.cu
│   ├── template_runtime.cu
│   └── vectorAdd.cu
├── MPI
│   ├── SimpleSendRcv.c
│   ├── array_prod.c
│   ├── average_reduce.c
│   ├── average_scatter.c
│   ├── factorial.c
│   ├── lognSum.c
│   ├── matrixMult.c
│   ├── mpi_hello_world.c
│   ├── mpibcast.c
│   └── pieCalculation.c
├── OMP
│   ├── Critical.c
│   ├── Fibonacci.c
│   ├── HelloWorld.c
│   ├── MatrixMul.c
│   ├── ParallelTreeSearch.c
│   ├── PiCalculation.c
│   ├── ReductionPI.c
│   ├── Single.c
│   ├── Sorting.c
│   └── SumOfArrays.c
└── README.md

/CUDA/asyncAPI.cu:
--------------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////
//
// Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////

//
// This sample illustrates the usage of CUDA events for both GPU timing and
// overlapping CPU and GPU execution. Events are inserted into a stream
// of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can
// perform computations while the GPU is executing (including DMA memcopies
// between the host and device). The CPU can query CUDA events to determine
// whether the GPU has completed tasks.
//

// includes, system
#include <stdio.h>

// includes CUDA Runtime
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h>  // helper utility functions

__global__ void increment_kernel(int *g_data, int inc_value)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_data[idx] = g_data[idx] + inc_value;
}

bool correct_output(int *data, const int n, const int x)
{
    for (int i = 0; i < n; i++)
        if (data[i] != x)
        {
            printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
            return false;
        }

    return true;
}

int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;

    printf("[%s] - Starting...\n", argv[0]);

    // This will pick the best possible CUDA capable device
    devID = findCudaDevice(argc, (const char **)argv);

    // get device name
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s]\n", deviceProps.name);

    int n = 16 * 1024 * 1024;
    int nbytes = n * sizeof(int);
    int value = 26;

    // allocate host memory
    int *a = 0;
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
    memset(a, 0, nbytes);

    // allocate device memory
    int *d_a = 0;
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    checkCudaErrors(cudaMemset(d_a, 255, nbytes));

    // set kernel launch configuration
    dim3 threads = dim3(512, 1);
    dim3 blocks  = dim3(n / threads.x, 1);

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    checkCudaErrors(cudaDeviceSynchronize());
    float gpu_time = 0.0f;

    // asynchronously issue work to the GPU (all to stream 0)
    sdkStartTimer(&timer);
    cudaEventRecord(start, 0);
    cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
    increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
    cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
    cudaEventRecord(stop, 0);
    sdkStopTimer(&timer);

    // have CPU do some work while waiting for stage 1 to finish
    unsigned long int counter = 0;

    while (cudaEventQuery(stop) == cudaErrorNotReady)
    {
        counter++;
    }

    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

    // print the cpu and gpu times
    printf("time spent executing by the GPU: %.2f\n", gpu_time);
    printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

    // check the output for correctness
    bool bFinalResults = correct_output(a, n, value);

    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFree(d_a));

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
--------------------------------------------------------------------------------
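Note: all CUDA samples in this directory depend on helper_cuda.h and helper_functions.h from the common/inc folder of the NVIDIA CUDA Samples tree, so they are built with that directory on the include path (e.g. nvcc -I<path-to-samples>/common/inc asyncAPI.cu; the path is illustrative). For a dependency-free view of the pattern asyncAPI.cu demonstrates, here is a minimal self-contained sketch of event-based CPU/GPU overlap; names and sizes are mine and error handling is omitted for brevity:

// minimal_async.cu (illustrative sketch, not part of the original repo)
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>

__global__ void inc(int *p, int v)
{
    p[blockIdx.x * blockDim.x + threadIdx.x] += v;
}

int main()
{
    const int n = 1 << 20;
    const int nbytes = n * sizeof(int);
    int *h = NULL, *d = NULL;

    cudaMallocHost((void **)&h, nbytes);   // pinned host memory, needed for truly async copies
    memset(h, 0, nbytes);
    cudaMalloc((void **)&d, nbytes);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Issue all work asynchronously into stream 0, bracketed by events.
    cudaEventRecord(start, 0);
    cudaMemcpyAsync(d, h, nbytes, cudaMemcpyHostToDevice, 0);
    inc<<<n / 256, 256>>>(d, 26);
    cudaMemcpyAsync(h, d, nbytes, cudaMemcpyDeviceToHost, 0);
    cudaEventRecord(stop, 0);

    // The CPU is free to do other work here; poll the event instead of blocking.
    unsigned long spins = 0;
    while (cudaEventQuery(stop) == cudaErrorNotReady)
    {
        ++spins;
    }

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("GPU pipeline took %.2f ms; CPU spun %lu times; h[0] = %d\n", ms, spins, h[0]);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFreeHost(h);
    cudaFree(d);
    return 0;
}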
/CUDA/cdpSimplePrint.cu:
--------------------------------------------------------------------------------
/**
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <helper_cuda.h>
#include <helper_string.h>

////////////////////////////////////////////////////////////////////////////////
// Variable on the GPU used to generate unique identifiers of blocks.
////////////////////////////////////////////////////////////////////////////////
__device__ int g_uids = 0;

////////////////////////////////////////////////////////////////////////////////
// Print a simple message to signal the block which is currently executing.
////////////////////////////////////////////////////////////////////////////////
__device__ void print_info(int depth, int thread, int uid, int parent_uid)
{
    if (threadIdx.x == 0)
    {
        if (depth == 0)
            printf("BLOCK %d launched by the host\n", uid);
        else
        {
            char buffer[32];

            for (int i = 0 ; i < depth ; ++i)
            {
                buffer[3*i+0] = '|';
                buffer[3*i+1] = ' ';
                buffer[3*i+2] = ' ';
            }

            buffer[3*depth] = '\0';
            printf("%sBLOCK %d launched by thread %d of block %d\n", buffer, uid, thread, parent_uid);
        }
    }

    __syncthreads();
}

////////////////////////////////////////////////////////////////////////////////
// The kernel using CUDA dynamic parallelism.
//
// It generates a unique identifier for each block. Prints the information
// about that block. Finally, if the 'max_depth' has not been reached, the
// block launches new blocks directly from the GPU.
////////////////////////////////////////////////////////////////////////////////
__global__ void cdp_kernel(int max_depth, int depth, int thread, int parent_uid)
{
    // We create a unique ID per block. Thread 0 does that and shares the value with the other threads.
    __shared__ int s_uid;

    if (threadIdx.x == 0)
    {
        s_uid = atomicAdd(&g_uids, 1);
    }

    __syncthreads();

    // We print the ID of the block and information about its parent.
    print_info(depth, thread, s_uid, parent_uid);

    // We launch new blocks if we haven't reached the max_depth yet.
    if (++depth >= max_depth)
    {
        return;
    }

    cdp_kernel<<<gridDim.x, blockDim.x>>>(max_depth, depth, threadIdx.x, s_uid);
}

////////////////////////////////////////////////////////////////////////////////
// Main entry point.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("starting Simple Print (CUDA Dynamic Parallelism)\n");

    // Parse a few command-line arguments.
    int max_depth = 2;

    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
        checkCmdLineFlag(argc, (const char **)argv, "h"))
    {
        printf("Usage: %s depth=<max_depth>\t(where max_depth is a value between 1 and 8).\n", argv[0]);
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "depth"))
    {
        max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth");

        if (max_depth < 1 || max_depth > 8)
        {
            printf("depth parameter has to be between 1 and 8\n");
            exit(EXIT_FAILURE);
        }
    }

    // Find/set the device.
    int device_count = 0, device = -1;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        device = getCmdLineArgumentInt(argc, (const char **)argv, "device");

        cudaDeviceProp properties;
        checkCudaErrors(cudaGetDeviceProperties(&properties, device));

        if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
        {
            std::cout << "Running on GPU " << device << " (" << properties.name << ")" << std::endl;
        }
        else
        {
            std::cout << "ERROR: cdpSimplePrint requires GPU devices with compute SM 3.5 or higher." << std::endl;
            std::cout << "Current GPU device has compute SM " << properties.major << "." << properties.minor << ". Exiting..." << std::endl;
            exit(EXIT_FAILURE);
        }
    }
    else
    {
        checkCudaErrors(cudaGetDeviceCount(&device_count));

        for (int i = 0 ; i < device_count ; ++i)
        {
            cudaDeviceProp properties;
            checkCudaErrors(cudaGetDeviceProperties(&properties, i));

            if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
            {
                device = i;
                std::cout << "Running on GPU " << i << " (" << properties.name << ")" << std::endl;
                break;
            }

            std::cout << "GPU " << i << " (" << properties.name << ") does not support CUDA Dynamic Parallelism" << std::endl;
        }
    }

    if (device == -1)
    {
        std::cerr << "cdpSimplePrint requires GPU devices with compute SM 3.5 or higher. Exiting..." << std::endl;
        exit(EXIT_WAIVED);
    }

    cudaSetDevice(device);

    // Print a message describing what the sample does.
    printf("***************************************************************************\n");
    printf("The CPU launches 2 blocks of 2 threads each. On the device each thread will\n");
    printf("launch 2 blocks of 2 threads each. The GPU will do that recursively\n");
    printf("until it reaches max_depth=%d\n\n", max_depth);
    printf("In total 2");
    int num_blocks = 2, sum = 2;

    for (int i = 1 ; i < max_depth ; ++i)
    {
        num_blocks *= 4;
        printf("+%d", num_blocks);
        sum += num_blocks;
    }

    printf("=%d blocks are launched!!! (%d from the GPU)\n", sum, sum-2);
    printf("***************************************************************************\n\n");

    // We set the recursion limit for CDP to max_depth.
    cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth);

    // Launch the kernel from the CPU.
    printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n\n");
    cdp_kernel<<<2, 2>>>(max_depth, 0, 0, -1);
    checkCudaErrors(cudaGetLastError());

    // Finalize.
    checkCudaErrors(cudaDeviceSynchronize());

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    checkCudaErrors(cudaDeviceReset());

    exit(EXIT_SUCCESS);
}
--------------------------------------------------------------------------------
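cdpSimplePrint.cu and cdpSimpleQuicksort.cu both use CUDA Dynamic Parallelism, so they must be compiled for compute capability 3.5 or higher with relocatable device code and linked against the device runtime (e.g. nvcc -arch=sm_35 -rdc=true file.cu -lcudadevrt). A stripped-down sketch of a device-side launch, with illustrative names:

// cdp_minimal.cu (illustrative) -- build: nvcc -arch=sm_35 -rdc=true cdp_minimal.cu -lcudadevrt
#include <cstdio>

__global__ void child()
{
    printf("  child block %d\n", blockIdx.x);
}

__global__ void parent()
{
    printf("parent launching 2 child blocks\n");
    child<<<2, 1>>>();          // a kernel launch from device code is what requires -rdc=true
    cudaDeviceSynchronize();    // device-side sync, as in these 2015-era samples
                                // (deprecated in recent CUDA toolkits)
}

int main()
{
    parent<<<1, 1>>>();
    cudaDeviceSynchronize();
    return 0;
}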
/CUDA/cdpSimpleQuicksort.cu:
--------------------------------------------------------------------------------
/*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */
#include <iostream>
#include <cstdio>
#include <helper_cuda.h>
#include <helper_string.h>

#define MAX_DEPTH       16
#define INSERTION_SORT  32

////////////////////////////////////////////////////////////////////////////////
// Selection sort used when depth gets too big or the number of elements drops
// below a threshold.
////////////////////////////////////////////////////////////////////////////////
__device__ void selection_sort(unsigned int *data, int left, int right)
{
    for (int i = left ; i <= right ; ++i)
    {
        unsigned min_val = data[i];
        int min_idx = i;

        // Find the smallest value in the range [left, right].
        for (int j = i+1 ; j <= right ; ++j)
        {
            unsigned val_j = data[j];

            if (val_j < min_val)
            {
                min_idx = j;
                min_val = val_j;
            }
        }

        // Swap the values.
        if (i != min_idx)
        {
            data[min_idx] = data[i];
            data[i] = min_val;
        }
    }
}

////////////////////////////////////////////////////////////////////////////////
// Very basic quicksort algorithm, recursively launching the next level.
////////////////////////////////////////////////////////////////////////////////
__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)
{
    // If we're too deep or there are few elements left, fall back to the
    // simple selection sort above...
    if (depth >= MAX_DEPTH || right-left <= INSERTION_SORT)
    {
        selection_sort(data, left, right);
        return;
    }

    unsigned int *lptr = data+left;
    unsigned int *rptr = data+right;
    unsigned int pivot = data[(left+right)/2];

    // Do the partitioning.
    while (lptr <= rptr)
    {
        // Find the next left- and right-hand values to swap
        unsigned int lval = *lptr;
        unsigned int rval = *rptr;

        // Move the left pointer as long as the pointed element is smaller than the pivot.
        while (lval < pivot)
        {
            lptr++;
            lval = *lptr;
        }

        // Move the right pointer as long as the pointed element is larger than the pivot.
        while (rval > pivot)
        {
            rptr--;
            rval = *rptr;
        }

        // If the swap points are valid, do the swap!
        if (lptr <= rptr)
        {
            *lptr++ = rval;
            *rptr-- = lval;
        }
    }

    // Now the recursive part
    int nright = rptr - data;
    int nleft  = lptr - data;

    // Launch a new block to sort the left part.
    if (left < (rptr-data))
    {
        cudaStream_t s;
        cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
        cdp_simple_quicksort<<< 1, 1, 0, s >>>(data, left, nright, depth+1);
        cudaStreamDestroy(s);
    }

    // Launch a new block to sort the right part.
    if ((lptr-data) < right)
    {
        cudaStream_t s1;
        cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
        cdp_simple_quicksort<<< 1, 1, 0, s1 >>>(data, nleft, right, depth+1);
        cudaStreamDestroy(s1);
    }
}

////////////////////////////////////////////////////////////////////////////////
// Call the quicksort kernel from the host.
////////////////////////////////////////////////////////////////////////////////
void run_qsort(unsigned int *data, unsigned int nitems)
{
    // Prepare CDP for the max depth 'MAX_DEPTH'.
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH));

    // Launch on device
    int left = 0;
    int right = nitems-1;
    std::cout << "Launching kernel on the GPU" << std::endl;
    cdp_simple_quicksort<<< 1, 1 >>>(data, left, right, 0);
    checkCudaErrors(cudaDeviceSynchronize());
}

////////////////////////////////////////////////////////////////////////////////
// Initialize data on the host.
////////////////////////////////////////////////////////////////////////////////
void initialize_data(unsigned int *dst, unsigned int nitems)
{
    // Fixed seed for illustration
    srand(2047);

    // Fill dst with random values
    for (unsigned i = 0 ; i < nitems ; i++)
        dst[i] = rand() % nitems ;
}

////////////////////////////////////////////////////////////////////////////////
// Verify the results.
////////////////////////////////////////////////////////////////////////////////
void check_results(int n, unsigned int *results_d)
{
    unsigned int *results_h = new unsigned[n];
    checkCudaErrors(cudaMemcpy(results_h, results_d, n*sizeof(unsigned), cudaMemcpyDeviceToHost));

    for (int i = 1 ; i < n ; ++i)
        if (results_h[i-1] > results_h[i])
        {
            std::cout << "Invalid item[" << i-1 << "]: " << results_h[i-1] << " greater than " << results_h[i] << std::endl;
            exit(EXIT_FAILURE);
        }

    std::cout << "OK" << std::endl;
    delete[] results_h;
}

////////////////////////////////////////////////////////////////////////////////
// Main entry point.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    int num_items = 128;
    bool verbose = false;

    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
        checkCmdLineFlag(argc, (const char **)argv, "h"))
    {
        std::cerr << "Usage: " << argv[0] << " num_items=<num_items>\twhere num_items is the number of items to sort" << std::endl;
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "v"))
    {
        verbose = true;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "num_items"))
    {
        num_items = getCmdLineArgumentInt(argc, (const char **)argv, "num_items");

        if (num_items < 1)
        {
            std::cerr << "ERROR: num_items has to be at least 1" << std::endl;
            exit(EXIT_FAILURE);
        }
    }

    // Get device properties
    int device_count = 0, device = -1;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        device = getCmdLineArgumentInt(argc, (const char **)argv, "device");

        cudaDeviceProp properties;
        checkCudaErrors(cudaGetDeviceProperties(&properties, device));

        if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
        {
            std::cout << "Running on GPU " << device << " (" << properties.name << ")" << std::endl;
        }
        else
        {
            std::cout << "ERROR: cdpSimpleQuicksort requires GPU devices with compute SM 3.5 or higher." << std::endl;
            std::cout << "Current GPU device has compute SM " << properties.major << "." << properties.minor << ". Exiting..." << std::endl;
            exit(EXIT_FAILURE);
        }
    }
    else
    {
        checkCudaErrors(cudaGetDeviceCount(&device_count));

        for (int i = 0 ; i < device_count ; ++i)
        {
            cudaDeviceProp properties;
            checkCudaErrors(cudaGetDeviceProperties(&properties, i));

            if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
            {
                device = i;
                std::cout << "Running on GPU " << i << " (" << properties.name << ")" << std::endl;
                break;
            }

            std::cout << "GPU " << i << " (" << properties.name << ") does not support CUDA Dynamic Parallelism" << std::endl;
        }
    }

    if (device == -1)
    {
        std::cerr << "cdpSimpleQuicksort requires GPU devices with compute SM 3.5 or higher. Exiting..." << std::endl;
        exit(EXIT_WAIVED);
    }

    cudaSetDevice(device);

    // Create input data
    unsigned int *h_data = 0;
    unsigned int *d_data = 0;

    // Allocate CPU memory and initialize data.
    std::cout << "Initializing data:" << std::endl;
    h_data = (unsigned int *)malloc(num_items*sizeof(unsigned int));
    initialize_data(h_data, num_items);

    if (verbose)
    {
        for (int i=0 ; i<num_items ; i++)
            std::cout << "Data [" << i << "]: " << h_data[i] << std::endl;
    }

    // Allocate GPU memory and copy the input over.
    checkCudaErrors(cudaMalloc((void **)&d_data, num_items * sizeof(unsigned int)));
    checkCudaErrors(cudaMemcpy(d_data, h_data, num_items * sizeof(unsigned int), cudaMemcpyHostToDevice));

    // Execute.
    std::cout << "Running quicksort on " << num_items << " elements" << std::endl;
    run_qsort(d_data, num_items);

    // Check result.
    std::cout << "Validating results: ";
    check_results(num_items, d_data);

    free(h_data);
    checkCudaErrors(cudaFree(d_data));

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    checkCudaErrors(cudaDeviceReset());

    exit(EXIT_SUCCESS);
}
--------------------------------------------------------------------------------
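A detail worth noting in cdp_simple_quicksort: each child sort is launched into its own stream created with cudaStreamCreateWithFlags(..., cudaStreamNonBlocking). If both children were launched into the default (NULL) stream of the launching block they would serialize; separate non-blocking streams let the left and right partitions sort concurrently. Destroying the stream immediately after the launch is safe, because cudaStreamDestroy only releases the handle once the work already queued in that stream has completed.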
/CUDA/clock.cu:
--------------------------------------------------------------------------------
/*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

// includes, system
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>

// This kernel computes a standard parallel reduction and evaluates the
// time it takes to do that for each block. The timing results are stored
// in device memory.
__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
{
    // __shared__ float shared[2 * blockDim.x];
    extern __shared__ float shared[];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    if (tid == 0) timer[bid] = clock();

    // Copy input.
    shared[tid] = input[tid];
    shared[tid + blockDim.x] = input[tid + blockDim.x];

    // Perform reduction to find minimum.
    for (int d = blockDim.x; d > 0; d /= 2)
    {
        __syncthreads();

        if (tid < d)
        {
            float f0 = shared[tid];
            float f1 = shared[tid + d];

            if (f1 < f0)
            {
                shared[tid] = f1;
            }
        }
    }

    // Write result.
    if (tid == 0) output[bid] = shared[0];

    __syncthreads();

    if (tid == 0) timer[bid+gridDim.x] = clock();
}


// This example shows how to use the clock function to measure the performance of
// a kernel accurately.
//
// Blocks are executed in parallel and out of order. Since there's no synchronization
// mechanism between blocks, we measure the clock once for each block. The clock
// samples are written to device memory.

#define NUM_BLOCKS  64
#define NUM_THREADS 256

// It's interesting to change the number of blocks and the number of threads to
// understand how to keep the hardware busy.
//
// Here are some numbers I get on my G80:
//    blocks - clocks
//    1      - 3096
//    8      - 3232
//    16     - 3364
//    32     - 4615
//    64     - 9981
//
// With less than 16 blocks some of the multiprocessors of the device are idle. With
// more than 16 you are using all the multiprocessors, but there's only one block per
// multiprocessor and that doesn't allow you to hide the latency of the memory. With
// more than 32 the speed scales linearly.

// Start the main CUDA Sample here
int main(int argc, char **argv)
{
    printf("CUDA Clock sample\n");

    // This will pick the best possible CUDA capable device
    int dev = findCudaDevice(argc, (const char **)argv);

    float *dinput = NULL;
    float *doutput = NULL;
    clock_t *dtimer = NULL;

    clock_t timer[NUM_BLOCKS * 2];
    float input[NUM_THREADS * 2];

    for (int i = 0; i < NUM_THREADS * 2; i++)
    {
        input[i] = (float)i;
    }

    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
    checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));

    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));

    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);

    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));

    checkCudaErrors(cudaFree(dinput));
    checkCudaErrors(cudaFree(doutput));
    checkCudaErrors(cudaFree(dtimer));


    // Compute the difference between the last block end and the first block start.
    clock_t minStart = timer[0];
    clock_t maxEnd = timer[NUM_BLOCKS];

    for (int i = 1; i < NUM_BLOCKS; i++)
    {
        minStart = timer[i] < minStart ? timer[i] : minStart;
        maxEnd = timer[NUM_BLOCKS+i] > maxEnd ? timer[NUM_BLOCKS+i] : maxEnd;
    }

    printf("Total clocks = %d\n", (int)(maxEnd - minStart));


    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    return EXIT_SUCCESS;
}
--------------------------------------------------------------------------------
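clock() returns raw SM clock ticks, so the "Total clocks" figure above is hardware-frequency dependent. If wall-clock time is wanted, the tick count can be scaled by cudaDeviceProp::clockRate, which is reported in kHz, i.e. ticks per millisecond. A small helper of my own, not part of the sample:

#include <time.h>
#include <cuda_runtime.h>

// Convert a span of GPU clock ticks, such as maxEnd - minStart above,
// into milliseconds. clockRate is given in kHz = ticks per millisecond.
static double ticksToMsec(clock_t minStart, clock_t maxEnd, int device)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    return (double)(maxEnd - minStart) / (double)prop.clockRate;
}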
/CUDA/cppIntegration.cu:
--------------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////
//
// Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////

/* Example of integrating CUDA functions into an existing
 * application / framework.
 * Host part of the device code.
 * Compiled with Cuda compiler.
 */

// System includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef MAX
#define MAX(a,b) (a > b ? a : b)
#endif

////////////////////////////////////////////////////////////////////////////////
// declaration, forward

extern "C" void computeGold(char *reference, char *idata, const unsigned int len);
extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len);

///////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality
//! @param g_data  memory to process (in and out)
///////////////////////////////////////////////////////////////////////////////
__global__ void kernel(int *g_data)
{
    // write data to global memory
    const unsigned int tid = threadIdx.x;
    int data = g_data[tid];

    // use integer arithmetic to process all four bytes with one thread
    // this serializes the execution, but is the simplest solution to avoid
    // bank conflicts for this very low number of threads
    // in general it is more efficient to process each byte by a separate thread,
    // to avoid bank conflicts the access pattern should be
    // g_data[4 * wtid + wid], where wtid is the thread id within the half warp
    // and wid is the warp id
    // see also the programming guide for a more in depth discussion.
    g_data[tid] = ((((data <<  0) >> 24) - 10) << 24)
                  | ((((data <<  8) >> 24) - 10) << 16)
                  | ((((data << 16) >> 24) - 10) <<  8)
                  | ((((data << 24) >> 24) - 10) <<  0);
}

///////////////////////////////////////////////////////////////////////////////
//! Demonstration that int2 data can be used in the cpp code
//! @param g_data  memory to process (in and out)
///////////////////////////////////////////////////////////////////////////////
__global__ void
kernel2(int2 *g_data)
{
    // write data to global memory
    const unsigned int tid = threadIdx.x;
    int2 data = g_data[tid];

    // use integer arithmetic to process all four bytes with one thread
    // this serializes the execution, but is the simplest solution to avoid
    // bank conflicts for this very low number of threads
    // in general it is more efficient to process each byte by a separate thread,
    // to avoid bank conflicts the access pattern should be
    // g_data[4 * wtid + wid], where wtid is the thread id within the half warp
    // and wid is the warp id
    // see also the programming guide for a more in depth discussion.
    g_data[tid].x = data.x - data.y;
}

////////////////////////////////////////////////////////////////////////////////
//! Entry point for Cuda functionality on host side
//! @param argc  command line argument count
//! @param argv  command line arguments
//! @param data  data to process on the device
//! @param len   len of \a data
////////////////////////////////////////////////////////////////////////////////
extern "C" bool
runTest(const int argc, const char **argv, char *data, int2 *data_int2, unsigned int len)
{
    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    findCudaDevice(argc, (const char **)argv);

    const unsigned int num_threads = len / 4;
    assert(0 == (len % 4));
    const unsigned int mem_size = sizeof(char) * len;
    const unsigned int mem_size_int2 = sizeof(int2) * len;

    // allocate device memory
    char *d_data;
    checkCudaErrors(cudaMalloc((void **) &d_data, mem_size));
    // copy host memory to device
    checkCudaErrors(cudaMemcpy(d_data, data, mem_size,
                               cudaMemcpyHostToDevice));
    // allocate device memory for int2 version
    int2 *d_data_int2;
    checkCudaErrors(cudaMalloc((void **) &d_data_int2, mem_size_int2));
    // copy host memory to device
    checkCudaErrors(cudaMemcpy(d_data_int2, data_int2, mem_size_int2,
                               cudaMemcpyHostToDevice));

    // setup execution parameters
    dim3 grid(1, 1, 1);
    dim3 threads(num_threads, 1, 1);
    dim3 threads2(len, 1, 1);  // more threads needed for separate int2 version
    // execute the kernel
    kernel<<< grid, threads >>>((int *) d_data);
    kernel2<<< grid, threads2 >>>(d_data_int2);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    // compute reference solutions
    char *reference = (char *) malloc(mem_size);
    computeGold(reference, data, len);
    int2 *reference2 = (int2 *) malloc(mem_size_int2);
    computeGold2(reference2, data_int2, len);

    // copy results from device to host
    checkCudaErrors(cudaMemcpy(data, d_data, mem_size,
                               cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(data_int2, d_data_int2, mem_size_int2,
                               cudaMemcpyDeviceToHost));

    // check result
    bool success = true;

    for (unsigned int i = 0; i < len; i++)
    {
        if (reference[i] != data[i] ||
            reference2[i].x != data_int2[i].x ||
            reference2[i].y != data_int2[i].y)
        {
            success = false;
        }
    }

    // cleanup memory
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFree(d_data_int2));
    free(reference);
    free(reference2);

    return success;
}
--------------------------------------------------------------------------------
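cppIntegration.cu only holds the device half of the sample; computeGold and computeGold2 are declared extern "C" and live in a separate C++ translation unit (cppIntegration_gold.cpp in the NVIDIA samples) that this repository does not include. A plausible reconstruction of what those reference functions must compute, derived from the two kernels above (not the original file):

// Host-side gold reference (reconstruction, not the original source).
#include <vector_types.h>   // for int2

// kernel() subtracts 10 from each of the four bytes packed in an int,
// so the reference simply does it byte by byte.
extern "C" void computeGold(char *reference, char *idata, const unsigned int len)
{
    for (unsigned int i = 0; i < len; ++i)
    {
        reference[i] = idata[i] - 10;
    }
}

// kernel2() computes x = x - y per element and leaves y untouched.
extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len)
{
    for (unsigned int i = 0; i < len; ++i)
    {
        reference[i].x = idata[i].x - idata[i].y;
        reference[i].y = idata[i].y;
    }
}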
/CUDA/cppOverload.cu:
--------------------------------------------------------------------------------
/*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */
#define THREAD_N 256
#define N 1024
#define DIV_UP(a, b) (((a) + (b) - 1) / (b))

// Includes, system
#include <stdio.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_string.h>
#include "cppOverload_kernel.cuh"

const char *sampleName = "C++ Function Overloading";

#define OUTPUT_ATTR(attr)                                         \
    printf("Shared Size:   %d\n", (int)attr.sharedSizeBytes);     \
    printf("Constant Size: %d\n", (int)attr.constSizeBytes);      \
    printf("Local Size:    %d\n", (int)attr.localSizeBytes);      \
    printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock); \
    printf("Number of Registers: %d\n", attr.numRegs);            \
    printf("PTX Version: %d\n", attr.ptxVersion);                 \
    printf("Binary Version: %d\n", attr.binaryVersion);


bool check_func1(int *hInput, int *hOutput, int a)
{
    for (int i = 0; i < N; ++i)
    {
        int cpuRes = hInput[i]*a + i;

        if (hOutput[i] != cpuRes)
        {
            return false;
        }
    }

    return true;
}

bool check_func2(int2 *hInput, int *hOutput, int a)
{
    for (int i = 0; i < N; i++)
    {
        int cpuRes = (hInput[i].x + hInput[i].y)*a + i;

        if (hOutput[i] != cpuRes)
        {
            return false;
        }
    }

    return true;
}

bool check_func3(int *hInput1, int *hInput2, int *hOutput, int a)
{
    for (int i = 0; i < N; i++)
    {
        if (hOutput[i] != (hInput1[i] + hInput2[i])*a + i)
        {
            return false;
        }
    }

    return true;
}

int main(int argc, const char *argv[])
{
    int *hInput  = NULL;
    int *hOutput = NULL;
    int *dInput  = NULL;
    int *dOutput = NULL;

    printf("%s starting...\n", sampleName);

    int deviceCount;
    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
    printf("Device Count: %d\n", deviceCount);

    int deviceID = findCudaDevice(argc, argv);
    cudaDeviceProp prop;
    checkCudaErrors(cudaGetDeviceProperties(&prop, deviceID));
    if (prop.major < 2)
    {
        printf("ERROR: cppOverload requires GPU devices with compute SM 2.0 or higher.\n");
        printf("Current GPU device has compute SM%d.%d, Exiting...", prop.major, prop.minor);
        exit(EXIT_WAIVED);
    }

    checkCudaErrors(cudaSetDevice(deviceID));

    // Allocate device memory
    checkCudaErrors(cudaMalloc(&dInput , sizeof(int)*N*2));
    checkCudaErrors(cudaMalloc(&dOutput, sizeof(int)*N));

    // Allocate host memory
    checkCudaErrors(cudaMallocHost(&hInput , sizeof(int)*N*2));
    checkCudaErrors(cudaMallocHost(&hOutput, sizeof(int)*N));

    for (int i = 0; i < N*2; i++)
    {
        hInput[i] = i;
    }

    // Copy data from host to device
    checkCudaErrors(cudaMemcpy(dInput, hInput, sizeof(int)*N*2, cudaMemcpyHostToDevice));

    // Test C++ overloading
    bool testResult = true;
    bool funcResult = true;
    int a = 1;

    void (*func1)(const int *, int *, int);
    void (*func2)(const int2 *, int *, int);
    void (*func3)(const int *, const int *, int *, int);
    struct cudaFuncAttributes attr;

    // overload function 1
    func1 = simple_kernel;
    memset(&attr, 0, sizeof(attr));
    checkCudaErrors(cudaFuncSetCacheConfig(*func1, cudaFuncCachePreferShared));
    checkCudaErrors(cudaFuncGetAttributes(&attr, *func1));
    OUTPUT_ATTR(attr);
    (*func1)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dOutput, a);
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int)*N, cudaMemcpyDeviceToHost));
    funcResult = check_func1(hInput, hOutput, a);
    printf("simple_kernel(const int *pIn, int *pOut, int a) %s\n\n", funcResult ? "PASSED" : "FAILED");
    testResult &= funcResult;

    // overload function 2
    func2 = simple_kernel;
    memset(&attr, 0, sizeof(attr));
    checkCudaErrors(cudaFuncSetCacheConfig(*func2, cudaFuncCachePreferShared));
    checkCudaErrors(cudaFuncGetAttributes(&attr, *func2));
    OUTPUT_ATTR(attr);
    (*func2)<<<DIV_UP(N, THREAD_N), THREAD_N>>>((int2 *)dInput, dOutput, a);
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int)*N, cudaMemcpyDeviceToHost));
    funcResult = check_func2(reinterpret_cast<int2 *>(hInput), hOutput, a);
    printf("simple_kernel(const int2 *pIn, int *pOut, int a) %s\n\n", funcResult ? "PASSED" : "FAILED");
    testResult &= funcResult;

    // overload function 3
    func3 = simple_kernel;
    memset(&attr, 0, sizeof(attr));
    checkCudaErrors(cudaFuncSetCacheConfig(*func3, cudaFuncCachePreferShared));
    checkCudaErrors(cudaFuncGetAttributes(&attr, *func3));
    OUTPUT_ATTR(attr);
    (*func3)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dInput+N, dOutput, a);
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int)*N, cudaMemcpyDeviceToHost));
    funcResult = check_func3(&hInput[0], &hInput[N], hOutput, a);
    printf("simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) %s\n\n", funcResult ? "PASSED" : "FAILED");
    testResult &= funcResult;

    checkCudaErrors(cudaFree(dInput));
    checkCudaErrors(cudaFree(dOutput));
    checkCudaErrors(cudaFreeHost(hOutput));
    checkCudaErrors(cudaFreeHost(hInput));

    checkCudaErrors(cudaDeviceSynchronize());

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    checkCudaErrors(cudaDeviceReset());

    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
--------------------------------------------------------------------------------
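cppOverload_kernel.cuh is referenced above but is not part of this tree. The three overloads it declares can be inferred from check_func1/2/3 and from the launch configuration DIV_UP(N, THREAD_N) x THREAD_N, which spawns exactly one thread per output element. A plausible reconstruction (not the original header):

// cppOverload_kernel.cuh, reconstructed from the host-side checks above.
__global__ void simple_kernel(const int *pIn, int *pOut, int a)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    pOut[tid] = pIn[tid] * a + tid;                    // matches check_func1
}

__global__ void simple_kernel(const int2 *pIn, int *pOut, int a)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    pOut[tid] = (pIn[tid].x + pIn[tid].y) * a + tid;   // matches check_func2
}

__global__ void simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    pOut[tid] = (pIn1[tid] + pIn2[tid]) * a + tid;     // matches check_func3
}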
/CUDA/matrixMul.cu:
--------------------------------------------------------------------------------
/**
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/**
 * Matrix multiplication: C = A * B.
 * Host code.
 *
 * This sample implements matrix multiplication as described in Chapter 3
 * of the programming guide.
 * It has been written for clarity of exposition to illustrate various CUDA
 * programming principles, not with the goal of providing the most
 * performant generic kernel for matrix multiplication.
 *
 * See also:
 * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
 * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
 * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
 */

// System includes
#include <stdio.h>
#include <assert.h>

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include <helper_functions.h>

/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 */
template <int BLOCK_SIZE> __global__ void
matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep  = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep  = BLOCK_SIZE * wB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
         a <= aEnd;
         a += aStep, b += bStep)
    {

        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll

        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            Csub += As[ty][k] * Bs[k][tx];
        }

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}

void constantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i)
    {
        data[i] = val;
    }
}

/**
 * Run a simple test of matrix multiplication using CUDA
 */
int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
{
    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);

    // Initialize host memory
    const float valB = 0.01f;
    constantInit(h_A, size_A, 1.0f);
    constantInit(h_B, size_B, valB);

    // Allocate device memory
    float *d_A, *d_B, *d_C;

    // Allocate host matrix C
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
    float *h_C = (float *) malloc(mem_size_C);

    if (h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

    cudaError_t error;

    error = cudaMalloc((void **) &d_A, mem_size_A);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_B, mem_size_B);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_C, mem_size_C);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // copy host memory to device
    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_A,h_A) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_B,h_B) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");

    // Performs warmup operation using matrixMul CUDA kernel
    if (block_size == 16)
    {
        matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    else
    {
        matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }

    printf("done\n");

    cudaDeviceSynchronize();

    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start;
    error = cudaEventCreate(&start);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    cudaEvent_t stop;
    error = cudaEventCreate(&stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Record the start event
    error = cudaEventRecord(start, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Execute the kernel
    int nIter = 300;

    for (int j = 0; j < nIter; j++)
    {
        if (block_size == 16)
        {
            matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
        else
        {
            matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
    }

    // Record the stop event
    error = cudaEventRecord(stop, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Wait for the stop event to complete
    error = cudaEventSynchronize(stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf(
        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
        gigaFlops,
        msecPerMatrixMul,
        flopsPerMatrixMul,
        threads.x * threads.y);

    // Copy result from device to host
    error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (h_C,d_C) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    printf("Checking computed result for correctness: ");
    bool correct = true;

    // test relative error by the formula
    //     |<x, y>_cpu - <x, y>_gpu| / <|x|, |y|>  <  eps
    double eps = 1.e-6 ;  // machine zero

    for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
    {
        double abs_err = fabs(h_C[i] - (dimsA.x * valB));
        double dot_length = dimsA.x;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err/abs_val/dot_length ;

        if (rel_err > eps)
        {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // Clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (correct)
    {
        return EXIT_SUCCESS;
    }
    else
    {
        return EXIT_FAILURE;
    }
}


/**
 * Program main
 */
int main(int argc, char **argv)
{
    printf("[Matrix Multiply Using CUDA] - Starting...\n");

    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
        checkCmdLineFlag(argc, (const char **)argv, "?"))
    {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
        printf("  Note: Outer matrix dimensions of A & B matrices must be equal.\n");

        exit(EXIT_SUCCESS);
    }

    // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
    int devID = 0;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
        cudaSetDevice(devID);
    }

    cudaError_t error;
    cudaDeviceProp deviceProp;
    error = cudaGetDevice(&devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
    }

    error = cudaGetDeviceProperties(&deviceProp, devID);

    if (deviceProp.computeMode == cudaComputeModeProhibited)
    {
        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
        exit(EXIT_SUCCESS);
    }

    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
    }
    else
    {
        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
    }

    // Use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;

    dim3 dimsA(5*2*block_size, 5*2*block_size, 1);
    dim3 dimsB(5*4*block_size, 5*2*block_size, 1);

    // width of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "wA"))
    {
        dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
    }

    // height of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "hA"))
    {
        dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
    }

    // width of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "wB"))
    {
        dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
    }

    // height of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "hB"))
    {
        dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
    }

    if (dimsA.x != dimsB.y)
    {
        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
               dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }

    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

    int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);

    exit(matrix_result);
}
--------------------------------------------------------------------------------
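A worked example of the performance figure matrixMul.cu prints: with the defaults above (block_size = 32, so dimsA = 320x320 and dimsB = 640x320), matrix C has 640 x 320 elements and each element takes dimsA.x = 320 multiply-add pairs, so flopsPerMatrixMul = 2 * 320 * 320 * 640 = 131,072,000, about 0.131 GFlop per launch; dividing that by the averaged per-launch time (msecPerMatrixMul / 1000 seconds) yields the GFlop/s number the sample reports.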
/CUDA/simpleAssert.cu:
--------------------------------------------------------------------------------
/*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */
#ifdef _WIN32
#  define WINDOWS_LEAN_AND_MEAN
#  define NOMINMAX
#  include <windows.h>
#else
#  include <sys/utsname.h>
#endif

// Includes, system
#include <stdio.h>
#include <cassert>

// Includes CUDA
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h>  // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h>       // helper functions for CUDA error check

const char *sampleName = "simpleAssert";

////////////////////////////////////////////////////////////////////////////////
// Auto-Verification Code
bool testResult = true;

////////////////////////////////////////////////////////////////////////////////
// Kernels
////////////////////////////////////////////////////////////////////////////////
//! Tests assert function.
//! Threads whose id >= N will print an assertion failed error message.
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int N)
{
    int gtid = blockIdx.x*blockDim.x + threadIdx.x ;
    assert(gtid < N) ;
}

////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, char **argv);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    printf("%s completed, returned %s\n",
           sampleName,
           testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void runTest(int argc, char **argv)
{
    int devID;
    int Nblocks  = 2;
    int Nthreads = 32;
    cudaError_t error ;
    cudaDeviceProp deviceProp;

#ifndef _WIN32
    utsname OS_System_Type;
    uname(&OS_System_Type);

    printf("OS_System_Type.release = %s\n", OS_System_Type.release);

    if (!strcasecmp(OS_System_Type.sysname, "Darwin"))
    {
        printf("simpleAssert is not currently supported on Mac OSX\n\n");
        exit(EXIT_SUCCESS);
    }
    else
    {
        printf("OS Info: <%s>\n\n", OS_System_Type.version);
    }

#endif

    // This will pick the best possible CUDA capable device
    devID = findCudaDevice(argc, (const char **)argv);

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    if (deviceProp.major < 2)
    {
        printf("simpleAssert requires a GPU with compute capability "
               "2.0 or later, exiting...\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice. It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.
    dim3 dimGrid(Nblocks);
    dim3 dimBlock(Nthreads);

    printf("Launch kernel to generate assertion failures\n");
    testKernel<<<dimGrid, dimBlock>>>(60);

    // Synchronize (flushes assert output).
    printf("\n-- Begin assert output\n\n");
    error = cudaDeviceSynchronize();
    printf("\n-- End assert output\n\n");

    // Check for errors and failed asserts in asynchronous kernel launch.
    if (error == cudaErrorAssert)
    {
        printf("Device assert failed as expected, "
               "CUDA error message is: %s\n\n",
               cudaGetErrorString(error));
    }


    testResult = error == cudaErrorAssert;
}
--------------------------------------------------------------------------------
/CUDA/simpleAssert_kernel.cu:
--------------------------------------------------------------------------------

////////////////////////////////////////////////////////////////////////////////
// Kernels
////////////////////////////////////////////////////////////////////////////////
//! Tests assert function.
//! Threads whose id >= N will print an assertion failed error message.
////////////////////////////////////////////////////////////////////////////////


extern "C" __global__ void testKernel(int N)
{
    int gtid = blockIdx.x*blockDim.x + threadIdx.x ;
    assert(gtid < N) ;
}

--------------------------------------------------------------------------------
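For concreteness: runTest launches 2 blocks x 32 threads = 64 threads with N = 60, so gtid covers 0..63 and exactly the four threads with gtid >= 60 fail assert(gtid < N). The kernel then causes cudaDeviceSynchronize() to return cudaErrorAssert, which is the outcome the sample treats as success.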
--------------------------------------------------------------------------------
/CUDA/vectorAdd.cu:
--------------------------------------------------------------------------------
/**
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 */

/**
 * Vector addition: C = A + B.
 *
 * This sample is a very basic sample that implements element by element
 * vector addition. It is the same as the sample illustrating Chapter 2
 * of the programming guide with some additions like error checking.
 */

#include <stdio.h>
#include <stdlib.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

/**
 * CUDA Kernel Device code
 *
 * Computes the vector addition of A and B into C. The 3 vectors have the same
 * number of elements numElements.
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}

/**
 * Host main routine
 */
int
main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host input vector A
    float *h_A = (float *)malloc(size);

    // Allocate the host input vector B
    float *h_B = (float *)malloc(size);

    // Allocate the host output vector C
    float *h_C = (float *)malloc(size);

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    // Allocate the device input vector A
    float *d_A = NULL;
    err = cudaMalloc((void **)&d_A, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device input vector B
    float *d_B = NULL;
    err = cudaMalloc((void **)&d_B, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device output vector C
    float *d_C = NULL;
    err = cudaMalloc((void **)&d_C, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host input vectors A and B in host memory to the device input
    // vectors in device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Verify that the result vector is correct
    for (int i = 0; i < numElements; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    // Free device global memory
    err = cudaFree(d_A);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_B);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_C);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    // Reset the device and exit
    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    err = cudaDeviceReset();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}
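
The launch configuration above uses the standard round-up division idiom: blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock guarantees enough threads even when numElements is not a multiple of the block size, and the `if (i < numElements)` guard in the kernel masks the overshoot. For example, 50000 elements with 256-thread blocks give (50000 + 255) / 256 = 196 blocks, i.e. 50176 threads, of which 176 do nothing. A tiny C check of the idiom:

    #include <assert.h>

    /* Round-up integer division: smallest g with g * block >= n. */
    static int blocks_for(int n, int block)
    {
        return (n + block - 1) / block;
    }

    int main(void)
    {
        assert(blocks_for(50000, 256) == 196); /* 196 * 256 = 50176 >= 50000 */
        assert(blocks_for(512, 256) == 2);     /* exact multiple: no overshoot */
        return 0;
    }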
--------------------------------------------------------------------------------
/MPI/SimpleSendRcv.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    int myid, numprocs;
    int tag, source, destination, count;
    int buffer;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    tag = 1234;
    source = 0;
    destination = 1;
    count = 1;
    if (myid == source) {
        buffer = 5678;
        MPI_Send(&buffer, count, MPI_INT, destination, tag, MPI_COMM_WORLD);
        printf("processor %d sent %d\n", myid, buffer);
    }
    if (myid == destination) {
        MPI_Recv(&buffer, count, MPI_INT, source, tag, MPI_COMM_WORLD, &status);
        printf("processor %d got %d\n", myid, buffer);
    }
    MPI_Finalize();
    return 0;
}
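
MPI_Recv blocks until a message whose source and tag match the request arrives; the status object records the actual envelope, which matters once wildcards are used. A minimal sketch of a wildcard receive, assuming it runs between MPI_Init and MPI_Finalize on a rank that expects one incoming int:

    #include <stdio.h>
    #include <mpi.h>

    /* Receive one int from any rank with any tag and report its envelope. */
    static void recv_any(void)
    {
        int value;
        MPI_Status status;
        MPI_Recv(&value, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG,
                 MPI_COMM_WORLD, &status);
        printf("got %d from rank %d (tag %d)\n",
               value, status.MPI_SOURCE, status.MPI_TAG);
    }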
--------------------------------------------------------------------------------
/MPI/array_prod.c:
--------------------------------------------------------------------------------
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc,char** argv){
    int rank,size,epp;
    int* A1=NULL;
    int* A2=NULL;
    int* Rec1=NULL;
    int* Rec2=NULL;
    int i,n;
    MPI_Init(NULL,NULL);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Comm_size(MPI_COMM_WORLD,&size);
    if(rank==0){
        printf("Enter size of arrays...\n");
        scanf("%d",&n);
        epp=n/size+(n%size==0?0:1);
        A1=(int*)malloc(sizeof(int)*size*epp);
        A2=(int*)malloc(sizeof(int)*size*epp);
        for(i=0;i<size*epp;i++){ A1[i]=1; A2[i]=1; } /* pad the tail with 1s */
        for(i=0;i<n;i++)
            scanf("%d",&A1[i]);
        for(i=0;i<n;i++)
            scanf("%d",&A2[i]);
    }
    /* every rank needs the chunk size chosen by the root */
    MPI_Bcast(&epp,1,MPI_INT,0,MPI_COMM_WORLD);
    Rec1=(int*)malloc(sizeof(int)*epp);
    Rec2=(int*)malloc(sizeof(int)*epp);
    MPI_Scatter(A1,epp,MPI_INT,Rec1,epp,MPI_INT,0,MPI_COMM_WORLD);
    MPI_Scatter(A2,epp,MPI_INT,Rec2,epp,MPI_INT,0,MPI_COMM_WORLD);
    for(i=0;i<epp;i++) /* local element-wise product */
        Rec1[i]*=Rec2[i];
    MPI_Gather(Rec1,epp,MPI_INT,A1,epp,MPI_INT,0,MPI_COMM_WORLD);
    if(rank==0){
        printf("Element-wise product:\n");
        for(i=0;i<n;i++)
            printf("%d ",A1[i]);
        printf("\n");
    }
    MPI_Finalize();
    return 0;
}

--------------------------------------------------------------------------------
/MPI/average_reduce.c:
--------------------------------------------------------------------------------
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc,char** argv){
    int rank,size,epp;
    int* dataSend=NULL;
    int i;
    MPI_Init(NULL,NULL);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Comm_size(MPI_COMM_WORLD,&size);
    epp=2;
    if(rank==0){
        printf("Master creating data...\n");
        dataSend=(int*)malloc(sizeof(int)*size*epp);
        for(i=0;i<size*epp;i++)
            dataSend[i]=i+1;
    }
    int dataRecv[2];
    MPI_Scatter(dataSend,epp,MPI_INT,dataRecv,epp,MPI_INT,0,MPI_COMM_WORLD);
    float localAvg=(dataRecv[0]+dataRecv[1])/2.0f;
    float sumOfAvgs=0.0f;
    MPI_Reduce(&localAvg,&sumOfAvgs,1,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD);
    if(rank==0)
        printf("Average = %f\n",sumOfAvgs/size);
    MPI_Finalize();
    return 0;
}
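
Both programs above follow the same scatter / compute / combine shape. The combine step in isolation, as a runnable sketch: every rank contributes one value and only the root sees the total.

    #include <stdio.h>
    #include <mpi.h>

    int main(int argc, char *argv[])
    {
        int rank, sum = 0;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        /* Each rank contributes its own rank number; MPI_SUM folds them. */
        MPI_Reduce(&rank, &sum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
        if (rank == 0)
            printf("sum of ranks = %d\n", sum); /* 0+1+...+(size-1) */
        MPI_Finalize();
        return 0;
    }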
--------------------------------------------------------------------------------
/MPI/average_scatter.c:
--------------------------------------------------------------------------------
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc,char** argv){
    int rank,size,epp;
    int* dataSend=NULL;
    int i;
    MPI_Init(NULL,NULL);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Comm_size(MPI_COMM_WORLD,&size);
    epp=2;
    if(rank==0){
        printf("Master creating data...\n");
        dataSend=(int*)malloc(sizeof(int)*size*epp);
        for(i=0;i<size*epp;i++)
            dataSend[i]=i+1;
    }
    int dataRecv[2];
    MPI_Scatter(dataSend,epp,MPI_INT,dataRecv,epp,MPI_INT,0,MPI_COMM_WORLD);
    printf("Rank %d got %d and %d, average = %f\n",
           rank,dataRecv[0],dataRecv[1],(dataRecv[0]+dataRecv[1])/2.0f);
    MPI_Finalize();
    return 0;
}

--------------------------------------------------------------------------------
/MPI/factorial.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include "mpi.h"

int main(int argc, char *argv[]){
    int myRank;
    int size;
    int fact;
    int lower,upper;
    int i;
    double local_result = 1.0;
    double total;

    /* initialize MPI */
    MPI_Init(&argc,&argv);
    /* get my rank and the size of the communicator */
    MPI_Comm_rank(MPI_COMM_WORLD, &myRank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* get the input (only if I have rank 0) */
    if(myRank==0){
        printf("Enter a number:");
        scanf("%d",&fact);
    }
    /* since only the process with rank 0 has the input,
     * we must pass it to all the other processes.
     */
    MPI_Bcast(&fact,           /* in/out parameter */
              1,               /* count */
              MPI_INT,         /* datatype */
              0,               /* root */
              MPI_COMM_WORLD); /* communicator */

    /* calculate the upper and lower boundaries
     * for each process
     */
    if(myRank==0)
        lower = 1;
    else
        lower = myRank * (fact / size) + 1;
    if(myRank==(size-1))
        upper = fact;
    else
        upper = (myRank + 1) * (fact / size);

    /* now that we know upper and lower, do the
     * multiplication in our local range
     */
    for(i=lower;i<=upper;i++){
        local_result = local_result * (double)i;
    }

    /* combine all the local results by multiplying them together */
    MPI_Reduce(&local_result,  /* operand */
               &total,         /* result */
               1,              /* count */
               MPI_DOUBLE,     /* datatype */
               MPI_PROD,       /* operator */
               0,              /* root rank */
               MPI_COMM_WORLD);/* communicator */

    /* give the output to the user */
    if(myRank==0){
        printf("The factorial of %d is %lf, and was calculated using %d processes\n",fact,total,size);
    }

    /* shut down MPI */
    MPI_Finalize();

    return 0;
}
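
The boundary formulas above split 1..fact into contiguous slices of fact/size numbers each, with the last rank absorbing the remainder. For fact = 10 and size = 3, fact/size = 3, so the ranks compute 1..3, 4..6, and 7..10, and multiplying the three partial products gives 10!. A quick standalone check of the partitioning:

    #include <stdio.h>

    /* Print each rank's [lower, upper] slice of 1..fact, mirroring factorial.c. */
    int main(void)
    {
        int fact = 10, size = 3;
        for (int myRank = 0; myRank < size; myRank++) {
            int lower = (myRank == 0) ? 1 : myRank * (fact / size) + 1;
            int upper = (myRank == size - 1) ? fact : (myRank + 1) * (fact / size);
            printf("rank %d: %d..%d\n", myRank, lower, upper);
        }
        return 0;
    }

Note the scheme assumes fact >= size; with fewer numbers than processes, fact/size is 0 and the middle ranks get empty (harmless) ranges.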
--------------------------------------------------------------------------------
/MPI/lognSum.c:
--------------------------------------------------------------------------------
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc,char** argv){
    int rank,size;
    int i,n;
    int* A=NULL;

    int D[2],sum;
    MPI_Init(NULL,NULL);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Comm_size(MPI_COMM_WORLD,&size);
    int ctr=size*2;
    A=(int*)malloc(sizeof(int)*size*2);
    if(rank==0){
        printf("Enter %d Elements :\n",ctr);
        for(i=0;i<ctr;i++)
            scanf("%d",&A[i]);
    }
    MPI_Scatter(A,2,MPI_INT,D,2,MPI_INT,0,MPI_COMM_WORLD);
    sum=D[0]+D[1];
    /* pairwise tree combine: log2(size) communication steps;
       assumes the number of processes is a power of two */
    for(i=1;i<size;i*=2){
        if(rank%(2*i)!=0){
            MPI_Send(&sum,1,MPI_INT,rank-i,0,MPI_COMM_WORLD);
            break;
        }
        if(rank+i<size){
            int partial;
            MPI_Recv(&partial,1,MPI_INT,rank+i,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
            sum+=partial;
        }
    }
    if(rank==0)
        printf("Sum = %d\n",sum);
    MPI_Finalize();
    return 0;
}

--------------------------------------------------------------------------------
/MPI/matrixMult.c:
--------------------------------------------------------------------------------
#include <mpi.h>
#include <stdio.h>

#define SIZE 12 /* Size of matrices */

int A[SIZE][SIZE], B[SIZE][SIZE], C[SIZE][SIZE];

void print_matrix(int m[SIZE][SIZE], int n)
{
    int i, j = 0;
    for (i=0; i<n; i++) {
        for (j=0; j<n; j++)
            printf("%4d ", m[i][j]);
        printf("\n");
    }
}

--------------------------------------------------------------------------------
/MPI/mpi_hello_world.c:
--------------------------------------------------------------------------------
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char** argv) {
    MPI_Init(NULL, NULL);
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;
    MPI_Get_processor_name(processor_name, &name_len);
    printf("Hello world from processor %s, rank %d out of %d processors\n",
           processor_name, world_rank, world_size);
    MPI_Finalize();
    return 0;
}

--------------------------------------------------------------------------------
/MPI/mpibcast.c:
--------------------------------------------------------------------------------
#include <mpi.h>
#include <stdio.h>

int main(int argc,char** argv){
    int rank,size;
    int data;
    MPI_Init(NULL,NULL);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Comm_size(MPI_COMM_WORLD,&size);
    if(rank==0)
        data=9000;
    MPI_Bcast(&data,1,MPI_INT,0,MPI_COMM_WORLD);
    if(rank==0){
        printf("Master sends %d\n",data);
    }
    else{
        printf("Slave %d receives %d\n",rank,data);
    }
    MPI_Finalize();
    return 0;
}

--------------------------------------------------------------------------------
/MPI/pieCalculation.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#define M_PI 3.14159265358979323846
#include <math.h>

/* main program */

int main(int argc, char* argv[]) {

    char* usage_fmt = "usage: %s number_of_samples seed\n";
    char* end_ptr_for_strtol;

    /* initialize for MPI */
    if (MPI_Init(&argc, &argv) != MPI_SUCCESS) {
        fprintf(stderr, "MPI initialization error\n");
        return EXIT_FAILURE;
    }
    int nprocs, myID;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myID);

    /* process command-line arguments */
    if (argc != 3) {
        fprintf(stderr, usage_fmt, argv[0]);
        MPI_Finalize(); exit(EXIT_FAILURE);
    }
    long num_samples = strtol(argv[1], &end_ptr_for_strtol, 10);
    if (*end_ptr_for_strtol != '\0') {
        fprintf(stderr, usage_fmt, argv[0]);
        MPI_Finalize(); exit(EXIT_FAILURE);
    }
    long seed = strtol(argv[2], &end_ptr_for_strtol, 10);
    if (*end_ptr_for_strtol != '\0') {
        fprintf(stderr, usage_fmt, argv[0]);
        MPI_Finalize(); exit(EXIT_FAILURE);
    }

    /* do calculation; offset the seed by rank so the per-process
       random streams are not identical */
    srand((unsigned int) seed + myID);
    int count = 0;
    int local_count = 0;
    long i;
    double x, y;
    double pi = 0;
    for (i = myID; i < num_samples; i += nprocs) {
        x = (double) rand() / (double) (RAND_MAX);
        y = (double) rand() / (double) (RAND_MAX);
        if ((x*x + y*y) <= 1.0)
            ++local_count;
    }
    MPI_Reduce(&local_count, &count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    if (myID == 0)
        pi = 4.0 * (double) count / (double) num_samples;

    if (myID == 0) {
        printf("MPI program results with %d processes:\n", nprocs);
        printf("number of samples = %ld, seed = %ld\n", num_samples, seed);
        printf("estimated pi = %12.10f\n", pi);
        printf("difference between estimated pi and math.h M_PI = %12.10f\n",
               fabs(pi - M_PI));
    }

    /* clean up and return */
    MPI_Finalize();
    return EXIT_SUCCESS;
}
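
The sampling loop above uses a cyclic distribution: rank r of p handles samples r, r+p, r+2p, and so on, so the work divides almost evenly with no index bookkeeping. The counting logic in isolation, as a standalone sketch:

    #include <stdio.h>

    /* Cyclic work distribution: rank r of nprocs takes items r, r+nprocs, ... */
    static long items_for_rank(long total, int rank, int nprocs)
    {
        long count = 0;
        for (long i = rank; i < total; i += nprocs)
            count++;
        return count;
    }

    int main(void)
    {
        /* 10 samples over 4 ranks -> 3, 3, 2, 2 */
        for (int r = 0; r < 4; r++)
            printf("rank %d: %ld samples\n", r, items_for_rank(10, r, 4));
        return 0;
    }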
--------------------------------------------------------------------------------
/OMP/Critical.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <omp.h>

int main(){
    omp_set_num_threads(5);
    #pragma omp parallel
    {
        printf("A(%d)\n",omp_get_thread_num());
        printf("B(%d)\n",omp_get_thread_num());
        printf("C(%d)\n",omp_get_thread_num());
        #pragma omp critical
        {
            printf("This is Critical (%d) ",omp_get_thread_num());
            printf("For (%d)",omp_get_thread_num());
            printf("the thread (%d) \n",omp_get_thread_num());
        }
        printf("D(%d)\n",omp_get_thread_num());
        printf("E(%d)\n",omp_get_thread_num());
    }
    return 0;
}

--------------------------------------------------------------------------------
/OMP/Fibonacci.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <omp.h>

int fibonacci(int x){
    int f[2],i;
    if(x==1){
        return 0;
    }
    else if(x==2){
        return 1;
    }
    else {
        #pragma omp parallel for
        for(i=1;i<=2;i++){
            f[i-1]=fibonacci(x-i);
            printf("Thread %d calculates term %d\n",omp_get_thread_num(),x-i);
        }
        return f[0]+f[1];
    }
}

int main(){
    int x;
    omp_set_num_threads(2);
    omp_set_nested(1);
    printf("Enter which term to be found out : ");
    scanf("%d",&x);
    printf("Term %d is : %d\n",x,fibonacci(x));
    return 0;
}

--------------------------------------------------------------------------------
/OMP/HelloWorld.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <omp.h>
int main(){
    #pragma omp parallel
    {
        int ID=omp_get_thread_num();
        printf("Hello by %d\n",ID);
        printf("World by %d\n",ID);
    }
    return 0;
}
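
In Critical.c above, the critical construct serializes only the three printfs inside it; the A..E lines from different threads can still interleave freely. The more typical use of critical is protecting a shared read-modify-write, sketched here:

    #include <stdio.h>
    #include <omp.h>

    int main(void)
    {
        int counter = 0;
        #pragma omp parallel num_threads(5)
        {
            /* Only one thread at a time may execute the critical block,
               so no increment is lost to a race. */
            #pragma omp critical
            {
                counter++;
            }
        }
        printf("counter = %d\n", counter); /* always 5 */
        return 0;
    }

For a single scalar update like this, `#pragma omp atomic` is the lighter-weight alternative; critical is for blocks that do more than one memory operation.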
--------------------------------------------------------------------------------
/OMP/MatrixMul.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <omp.h>
int main(){
    int A[10][10],B[10][10],C[10][10];
    int n,i,j,k;
    printf("Enter the value of N :");
    scanf("%d",&n);
    omp_set_num_threads(n*n*n);

    printf("Enter A:\n");
    for(i=0;i<n;i++)
        for(j=0;j<n;j++)
            scanf("%d",&A[i][j]);

    printf("Enter B:\n");
    for(i=0;i<n;i++)
        for(j=0;j<n;j++)
            scanf("%d",&B[i][j]);

    #pragma omp parallel for private(j,k)
    for(i=0;i<n;i++)
        for(j=0;j<n;j++){
            C[i][j]=0;
            for(k=0;k<n;k++)
                C[i][j]+=A[i][k]*B[k][j];
        }

    printf("C:\n");
    for(i=0;i<n;i++){
        for(j=0;j<n;j++)
            printf("%d ",C[i][j]);
        printf("\n");
    }
    return 0;
}

--------------------------------------------------------------------------------
/OMP/ParallelTreeSearch.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <omp.h>
int tree[512];

void createTree(int i){
    int ch;
    if(i>=512) return; /* stay inside the array */
    printf("Enter node %d Data :",i);
    scanf("%d",&tree[i]);
    printf("Is there a Left node of NODE %d ? [0 if not]: ",i);
    scanf("%d",&ch);
    if(ch!=0)
        createTree(2*i);
    printf("Is there a Right node of NODE %d ? [0 if not]: ",i);
    scanf("%d",&ch);
    if(ch!=0)
        createTree(2*i+1);
}

void searchTree(int data,int i){
    int k;
    if(i<512 && tree[i]!=0){
        if(tree[i]==data){
            printf("Node at %d \n",i);
        }
        else{
            omp_set_num_threads(2);
            #pragma omp parallel for
            for(k=0;k<2;k++){
                searchTree(data,2*i+k);
            }
        }
    }
}

int main(){
    int d;
    printf("Creating tree \n");
    createTree(1);
    omp_set_num_threads(2);
    printf("Enter data to be searched : ");
    scanf("%d",&d);
    searchTree(d,1);
    return 0;
}

--------------------------------------------------------------------------------
/OMP/PiCalculation.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

int main(){
    int n=1024;
    int i=0;
    float x;
    omp_set_num_threads(10);
    float* pi=(float*)malloc(sizeof(float)*n);
    float PI=0.0f;
    #pragma omp parallel for private(x)
    for(i=0;i<n;i++){
        x=(i+0.5f)/n;            /* midpoint of strip i */
        pi[i]=4.0f/(1.0f+x*x);   /* integrand of arctan */
    }
    for(i=0;i<n;i++)
        PI+=pi[i];
    printf("PI = %f\n",PI/n);
    free(pi);
    return 0;
}

--------------------------------------------------------------------------------
/OMP/ReductionPI.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

int main(){
    int n=1024;
    int i=0;
    float x;
    omp_set_num_threads(10);
    float* pi=(float*)malloc(sizeof(float)*n);
    float PI=0.0f;
    #pragma omp parallel for private(x) reduction(+:PI)
    for(i=0;i<n;i++){
        x=(i+0.5f)/n;
        pi[i]=4.0f/(1.0f+x*x);
        PI+=pi[i];
    }
    printf("PI = %f\n",PI/n);
    free(pi);
    return 0;
}

--------------------------------------------------------------------------------
/OMP/Single.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <omp.h>

int main(){
    omp_set_num_threads(5);
    printf("SINGLE\n");
    #pragma omp parallel
    {
        printf("Executed by %d \n",omp_get_thread_num());
        #pragma omp single
        {
            printf("Executed ONLY by %d \n",omp_get_thread_num());
        }
    }
    printf("\nMASTER\n");
    #pragma omp parallel
    {
        printf("Executed by %d \n",omp_get_thread_num());
        #pragma omp master
        {
            printf("Executed ONLY by %d \n",omp_get_thread_num());
        }
    }
    return 0;
}

--------------------------------------------------------------------------------
/OMP/Sorting.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <omp.h>
int main(){
    int A[20],B[20][20],C[20];
    int n;
    int i,j,k;

    printf("Enter the value of N :");
    scanf("%d",&n);
    printf("Enter Array : \n");
    for(i=0;i<n;i++)
        scanf("%d",&A[i]);

    /* rank sort: B[i][j]=1 iff A[j] must come before A[i];
       the tie-break on indices keeps duplicates from colliding */
    #pragma omp parallel for private(j)
    for(i=0;i<n;i++)
        for(j=0;j<n;j++){
            if(A[i]>A[j] || (A[i]==A[j] && i>j))
            {
                B[i][j]=1;
            }
            else
                B[i][j]=0;
        }

    for(i=0;i<n;i++){
        int x=0;
        #pragma omp parallel for reduction(+:x)
        for(j=0;j<n;j++)
            x+=B[i][j];
        C[x]=A[i]; /* row sum = final position of A[i] */
    }

    printf("Sorted Array : \n");
    for(i=0;i<n;i++)
        printf("%d ",C[i]);
    printf("\n");
    return 0;
}

--------------------------------------------------------------------------------
/OMP/SumOfArrays.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <omp.h>
int main(){
    int i,n=5;
    int a[5]={1,2,3,4,5},b[5]={1,1,1,2,1},c[5];
    #pragma omp parallel for
    for(i=0;i<n;i++)
        c[i]=a[i]+b[i];

    for(i=0;i<n;i++)
        printf("%d ",c[i]);
    printf("\n");
    return 0;
}
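
ReductionPI.c and Sorting.c both rely on the reduction clause: each thread accumulates into a private copy of the variable, and OpenMP combines the copies with the named operator when the loop ends, avoiding both races and a critical section. The mechanism in its smallest form:

    #include <stdio.h>
    #include <omp.h>

    int main(void)
    {
        int sum = 0;
        /* Each thread sums its share into a private sum;
           the private copies are added together at the end. */
        #pragma omp parallel for reduction(+:sum)
        for (int i = 1; i <= 100; i++)
            sum += i;
        printf("sum = %d\n", sum); /* 5050, independent of thread count */
        return 0;
    }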
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## To run MPI :

1. mpicc program.c -o program
2. mpirun -np <number of processes> ./program

## To run OpenMP :

1. gcc -fopenmp program.c -o program
2. ./program
3. To change the number of threads : export OMP_NUM_THREADS=<number of threads>
--------------------------------------------------------------------------------
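
The README covers only the MPI and OpenMP folders; a CUDA section in the same style would round it out. A suggested addition, assuming nvcc is on PATH (the samples that include helper_functions.h or helper_cuda.h additionally need -I pointing at the CUDA samples' common/inc directory):

## To run CUDA :

1. nvcc program.cu -o program
2. ./program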