├── .gitignore ├── README.md └── tutorials ├── 01-BasicElements └── hello_cuda │ ├── hello_cuda.cu │ └── hello_cuda.out ├── 02-OrganizationOfThreads ├── organization │ └── main.cu └── organization2 │ └── main.cu ├── 03-IndexCalculation └── 1DIndexCalculation │ └── main.cu ├── 04-MemoryTransfer └── main.cu ├── 05-SumArray └── main.cu ├── 06-ErrorHandling ├── cuda_common.cuh └── main.cu ├── 07-Timing └── main.cu ├── 08-DeviceQuerry └── main.cu ├── 09-Warps └── main.cu ├── 10-WarpDivergence └── main.cu └── 11-Occupancy └── main.cu /.gitignore: -------------------------------------------------------------------------------- 1 | *.i 2 | *.ii 3 | *.gpu 4 | *.ptx 5 | *.cubin 6 | *.fatbin 7 | *.out 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CUDA-tutorial 2 | This is a repo for my training cuda code. 3 | -------------------------------------------------------------------------------- /tutorials/01-BasicElements/hello_cuda/hello_cuda.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | #include "device_launch_parameters.h" 3 | 4 | #include 5 | 6 | 7 | __global__ void hello_cuda() { 8 | 9 | printf("hello CUDA world\n"); 10 | } 11 | 12 | int main(void) 13 | { 14 | // hello_cuda<<<1, 1>>>(); 15 | //hello_cuda<<<1, 20>>>(); 16 | // dim3 block(4); 17 | // dim3 grid(8); 18 | dim3 block(8, 2); 19 | dim3 grid(2, 2); 20 | 21 | hello_cuda<<>>(); 22 | 23 | cudaDeviceSynchronize(); 24 | 25 | cudaDeviceReset(); 26 | 27 | return 0; 28 | 29 | } 30 | -------------------------------------------------------------------------------- /tutorials/01-BasicElements/hello_cuda/hello_cuda.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ybai62868/CUDA-tutorial/93ac81d2e426e4ecb66e39c12fdf849e68c2e1b3/tutorials/01-BasicElements/hello_cuda/hello_cuda.out -------------------------------------------------------------------------------- /tutorials/02-OrganizationOfThreads/organization/main.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | #include "device_launch_parameters.h" 3 | 4 | #include 5 | 6 | 7 | __global__ void print_threadIds() { 8 | printf("blockIdx.x : %d, blockIdx.y : %d, blockIdx.z : %d , blockDim.x : %d, blockDim.y : %d, gridDim.x : %d, gridDim.y : %d \n", 9 | blockIdx.x, blockIdx.y, blockIdx.z, blockDim.x, blockDim.y, 10 | gridDim.x, gridDim.y); 11 | 12 | 13 | } 14 | 15 | 16 | int main(void) 17 | { 18 | int nx, ny; 19 | nx = 16; 20 | ny = 16; 21 | dim3 block(8, 8); 22 | dim3 grid(nx / block.x, ny / block.y); 23 | 24 | print_threadIds <<>> (); 25 | cudaDeviceSynchronize(); 26 | 27 | cudaDeviceReset(); 28 | 29 | 30 | return 0; 31 | } 32 | 33 | -------------------------------------------------------------------------------- /tutorials/02-OrganizationOfThreads/organization2/main.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | #include "device_launch_parameters.h" 3 | 4 | #include 5 | 6 | 7 | __global__ void print_threadIds() { 8 | printf("threadIdx.x : %d, threadIdx.y : %d, threadIdx.z : %d \n", 9 | threadIdx.x, threadIdx.y, threadIdx.z); 10 | 11 | 12 | 13 | } 14 | 15 | 16 | int main(void) 17 | { 18 | int nx, ny; 19 | nx = 16; 20 | ny = 16; 21 | dim3 block(8, 8); 22 | dim3 grid(nx / block.x, ny / block.y); 23 | 24 | print_threadIds <<>> (); 25 | cudaDeviceSynchronize(); 26 | 27 | cudaDeviceReset(); 28 | 29 | 30 | return 0; 31 | } 32 | 33 | -------------------------------------------------------------------------------- /tutorials/03-IndexCalculation/1DIndexCalculation/main.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | #include "device_launch_parameters.h" 3 | 4 | #include 5 | #include 6 | 7 | __global__ void unique_idx_calc_threadIdx(int* input) 8 | { 9 | int tid = threadIdx.x; 10 | printf("threadIdx : %d, value: %d \n", tid, input[tid]); 11 | 12 | } 13 | 14 | __global__ void unique_gid_calculation(int* input) 15 | { 16 | int tid = threadIdx.x; 17 | int offset = blockIdx.x * blockDim.x; 18 | int gid = tid + offset; 19 | printf("blockIdx.x : %d, threadIdx.x : %d, grid : %d, value : %d\n", blockIdx.x, 20 | threadIdx.x, gid, input[gid]); 21 | 22 | } 23 | 24 | 25 | 26 | int main(void) 27 | { 28 | // int array_size = 8; 29 | int array_size = 16; 30 | int array_byte_size = sizeof(int) * array_size; 31 | // int h_data[] = {23, 9, 4, 53, 65, 12, 1, 33}; 32 | int h_data[] = {23, 9, 4, 53, 65, 12, 1, 33, 1, 6, 2, 2, 6, 8, 6, 10}; 33 | 34 | for ( int i = 0;i < array_size;i++ ) { 35 | printf("%d ", h_data[i]); 36 | } 37 | printf("\n \n"); 38 | 39 | int* d_data; 40 | cudaMalloc((void**)&d_data, array_byte_size); 41 | cudaMemcpy(d_data, h_data, array_byte_size, cudaMemcpyHostToDevice); 42 | 43 | //dim3 block(8); 44 | //dim3 grid(1); 45 | 46 | dim3 block(4); 47 | dim3 grid(4); 48 | 49 | 50 | // unique_idx_calc_threadIdx<<>>(d_data); 51 | unique_gid_calculation<<>>(d_data); 52 | cudaDeviceSynchronize(); 53 | 54 | cudaDeviceReset(); 55 | 56 | return 0; 57 | } 58 | -------------------------------------------------------------------------------- /tutorials/04-MemoryTransfer/main.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | #include "device_launch_parameters.h" 3 | 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | __global__ void mem_trs_test(int* input) 11 | { 12 | int grid = blockIdx.x * blockDim.x + threadIdx.x; 13 | printf("tid : %d, gid : %d, value : %d \n", threadIdx.x, grid, input[grid]); 14 | 15 | } 16 | 17 | 18 | int main(void) 19 | { 20 | int size = 128; 21 | int byte_size = size * sizeof(int); 22 | 23 | int* h_input = (int*)malloc(byte_size); 24 | 25 | time_t t; 26 | srand((unsigned)time(&t)); 27 | for ( int i = 0;i < size;i++ ) { 28 | h_input[i] = (int)(rand() & 0xff); 29 | } 30 | 31 | int* d_input; 32 | cudaMalloc((void**)&d_input, byte_size); 33 | cudaMemcpy(d_input, h_input, byte_size, cudaMemcpyHostToDevice); 34 | 35 | dim3 block(64); 36 | dim3 grid(2); 37 | 38 | mem_trs_test<<>>(d_input); 39 | 40 | cudaDeviceSynchronize(); 41 | 42 | cudaFree(d_input); 43 | free(h_input); 44 | 45 | 46 | cudaDeviceReset(); 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /tutorials/05-SumArray/main.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | #include "device_launch_parameters.h" 3 | 4 | //#include "cuda_common.cuh" 5 | #include 6 | 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | 13 | __global__ void sum_array_gpu(int* a, int* b, int* c, int size) 14 | { 15 | int gid = blockIdx.x * blockDim.x + threadIdx.x; 16 | if (gid < size) { 17 | c[gid] = a[gid] + b[gid]; 18 | } 19 | } 20 | 21 | void sum_array_cpu(int* a, int* b, int* c, int size) 22 | { 23 | for ( int i = 0;i < size;i++ ) { 24 | c[i] = a[i] + b[i]; 25 | } 26 | } 27 | 28 | 29 | void compare_arrays(int* a, int* b, int size) 30 | { 31 | for ( int i = 0;i < size;i++ ) { 32 | if (a[i] != b[i]) { 33 | printf("Arrays are different!"); 34 | return; 35 | } 36 | } 37 | printf("Arrays are same\n"); 38 | } 39 | 40 | 41 | int main(void) 42 | { 43 | int size = 10000; 44 | int block_size = 128; 45 | 46 | int NO_BYTES = size * sizeof(int); 47 | 48 | // host pointer 49 | int* h_a, *h_b, *gpu_results; 50 | int* h_c; 51 | h_a = (int*)malloc(NO_BYTES); 52 | h_b = (int*)malloc(NO_BYTES); 53 | h_c = (int*)malloc(NO_BYTES); 54 | gpu_results = (int*)malloc(NO_BYTES); 55 | 56 | 57 | 58 | time_t t; 59 | srand((unsigned)time(&t)); 60 | for (int i = 0;i < size;i++) { 61 | h_a[i] = (int)(rand() & 0xff); 62 | h_b[i] = (int)(rand() & 0xff); 63 | } 64 | 65 | sum_array_cpu(h_a, h_b, h_c, size); 66 | memset(gpu_results, 0, NO_BYTES); 67 | 68 | 69 | // device pointer 70 | int* d_a, *d_b, *d_c; 71 | cudaMalloc((int**)&d_a, NO_BYTES); 72 | cudaMalloc((int**)&d_b, NO_BYTES); 73 | cudaMalloc((int**)&d_c, NO_BYTES); 74 | 75 | cudaMemcpy(d_a, h_a, NO_BYTES, cudaMemcpyHostToDevice); 76 | cudaMemcpy(d_b, h_b, NO_BYTES, cudaMemcpyHostToDevice); 77 | 78 | 79 | // launching the grid 80 | dim3 block(block_size); 81 | dim3 grid((size / block.x) + 1); 82 | 83 | sum_array_gpu<<>> (d_a, d_b, d_c, size); 84 | cudaDeviceSynchronize(); 85 | 86 | 87 | cudaMemcpy(gpu_results, d_c, NO_BYTES, cudaMemcpyDeviceToHost); 88 | 89 | 90 | // array comparison 91 | compare_arrays(h_c, gpu_results, size); 92 | 93 | 94 | 95 | cudaFree(d_a); 96 | cudaFree(d_b); 97 | cudaFree(d_c); 98 | 99 | 100 | free(gpu_results); 101 | cudaDeviceReset(); 102 | 103 | 104 | return 0; 105 | 106 | } 107 | -------------------------------------------------------------------------------- /tutorials/06-ErrorHandling/cuda_common.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUDA_COMMON_H 2 | #define CUDA_COMMON_H 3 | 4 | 5 | #include "cuda_runtime.h" 6 | #include "device_launch_parameters.h" 7 | #include 8 | #include 9 | 10 | 11 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__);} 12 | 13 | inline void gpuAssert(cudaError_t code, const char* file, int line, bool 14 | abort=true) { 15 | if (code != cudaSuccess) { 16 | fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, 17 | line); 18 | if (abort) { 19 | exit(code); 20 | } 21 | } 22 | } 23 | 24 | #endif //!CUDA_COMMON_H 25 | -------------------------------------------------------------------------------- /tutorials/06-ErrorHandling/main.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | #include "device_launch_parameters.h" 3 | 4 | #include "cuda_common.cuh" 5 | #include 6 | 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | 13 | __global__ void sum_array_gpu(int* a, int* b, int* c, int size) 14 | { 15 | int gid = blockIdx.x * blockDim.x + threadIdx.x; 16 | if (gid < size) { 17 | c[gid] = a[gid] + b[gid]; 18 | } 19 | } 20 | 21 | void sum_array_cpu(int* a, int* b, int* c, int size) 22 | { 23 | for ( int i = 0;i < size;i++ ) { 24 | c[i] = a[i] + b[i]; 25 | } 26 | } 27 | 28 | 29 | void compare_arrays(int* a, int* b, int size) 30 | { 31 | for ( int i = 0;i < size;i++ ) { 32 | if (a[i] != b[i]) { 33 | printf("Arrays are different!"); 34 | return; 35 | } 36 | } 37 | printf("Arrays are same\n"); 38 | } 39 | 40 | 41 | int main(void) 42 | { 43 | int size = 10000; 44 | int block_size = 128; 45 | 46 | int NO_BYTES = size * sizeof(int); 47 | 48 | // host pointer 49 | int* h_a, *h_b, *gpu_results; 50 | int* h_c; 51 | h_a = (int*)malloc(NO_BYTES); 52 | h_b = (int*)malloc(NO_BYTES); 53 | h_c = (int*)malloc(NO_BYTES); 54 | gpu_results = (int*)malloc(NO_BYTES); 55 | cudaError error; 56 | 57 | 58 | 59 | time_t t; 60 | srand((unsigned)time(&t)); 61 | for (int i = 0;i < size;i++) { 62 | h_a[i] = (int)(rand() & 0xff); 63 | h_b[i] = (int)(rand() & 0xff); 64 | } 65 | 66 | sum_array_cpu(h_a, h_b, h_c, size); 67 | memset(gpu_results, 0, NO_BYTES); 68 | 69 | 70 | // device pointer 71 | int* d_a, *d_b, *d_c; 72 | error = cudaMalloc((int**)&d_a, NO_BYTES); 73 | if (error != cudaSuccess) { 74 | fprintf(stderr, "Error : %s \n", cudaGetErrorString(error)); 75 | } 76 | 77 | cudaMalloc((int**)&d_b, NO_BYTES); 78 | cudaMalloc((int**)&d_c, NO_BYTES); 79 | 80 | cudaMemcpy(d_a, h_a, NO_BYTES, cudaMemcpyHostToDevice); 81 | cudaMemcpy(d_b, h_b, NO_BYTES, cudaMemcpyHostToDevice); 82 | 83 | 84 | // launching the grid 85 | dim3 block(block_size); 86 | dim3 grid((size / block.x) + 1); 87 | 88 | sum_array_gpu<<>> (d_a, d_b, d_c, size); 89 | cudaDeviceSynchronize(); 90 | 91 | 92 | cudaMemcpy(gpu_results, d_c, NO_BYTES, cudaMemcpyDeviceToHost); 93 | 94 | 95 | // array comparison 96 | compare_arrays(h_c, gpu_results, size); 97 | 98 | 99 | 100 | cudaFree(d_a); 101 | cudaFree(d_b); 102 | cudaFree(d_c); 103 | 104 | 105 | free(gpu_results); 106 | cudaDeviceReset(); 107 | 108 | 109 | return 0; 110 | 111 | } 112 | -------------------------------------------------------------------------------- /tutorials/07-Timing/main.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | #include "device_launch_parameters.h" 3 | 4 | //#include "cuda_common.cuh" 5 | #include 6 | 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | 13 | __global__ void sum_array_gpu(int* a, int* b, int* c, int size) 14 | { 15 | int gid = blockIdx.x * blockDim.x + threadIdx.x; 16 | if (gid < size) { 17 | c[gid] = a[gid] + b[gid]; 18 | } 19 | } 20 | 21 | void sum_array_cpu(int* a, int* b, int* c, int size) 22 | { 23 | for ( int i = 0;i < size;i++ ) { 24 | c[i] = a[i] + b[i]; 25 | } 26 | } 27 | 28 | 29 | void compare_arrays(int* a, int* b, int size) 30 | { 31 | for ( int i = 0;i < size;i++ ) { 32 | if (a[i] != b[i]) { 33 | printf("Arrays are different!"); 34 | return; 35 | } 36 | } 37 | printf("Arrays are same\n"); 38 | } 39 | 40 | 41 | int main(void) 42 | { 43 | int size = 10000; 44 | int block_size = 256; 45 | 46 | int NO_BYTES = size * sizeof(int); 47 | 48 | // host pointer 49 | int* h_a, *h_b, *gpu_results; 50 | int* h_c; 51 | h_a = (int*)malloc(NO_BYTES); 52 | h_b = (int*)malloc(NO_BYTES); 53 | h_c = (int*)malloc(NO_BYTES); 54 | gpu_results = (int*)malloc(NO_BYTES); 55 | 56 | 57 | 58 | time_t t; 59 | srand((unsigned)time(&t)); 60 | for (int i = 0;i < size;i++) { 61 | h_a[i] = (int)(rand() & 0xff); 62 | h_b[i] = (int)(rand() & 0xff); 63 | } 64 | 65 | clock_t cpu_start, cpu_end; 66 | cpu_start = clock(); 67 | sum_array_cpu(h_a, h_b, h_c, size); 68 | cpu_end = clock(); 69 | printf("Sum array CPU execution time : %4.6f \n",(double)((double)(cpu_end - 70 | cpu_start)/CLOCKS_PER_SEC)); 71 | memset(gpu_results, 0, NO_BYTES); 72 | 73 | 74 | // device pointer 75 | int* d_a, *d_b, *d_c; 76 | cudaMalloc((int**)&d_a, NO_BYTES); 77 | cudaMalloc((int**)&d_b, NO_BYTES); 78 | cudaMalloc((int**)&d_c, NO_BYTES); 79 | 80 | clock_t htod_start, htod_end; 81 | htod_start = clock(); 82 | cudaMemcpy(d_a, h_a, NO_BYTES, cudaMemcpyHostToDevice); 83 | cudaMemcpy(d_b, h_b, NO_BYTES, cudaMemcpyHostToDevice); 84 | htod_end = clock(); 85 | printf("Sum array host to device time : %4.6f \n",(double)((double)(htod_end - 86 | htod_start)/CLOCKS_PER_SEC)); 87 | 88 | 89 | // launching the grid 90 | dim3 block(block_size); 91 | dim3 grid((size / block.x) + 1); 92 | 93 | clock_t gpu_start, gpu_end; 94 | gpu_start = clock(); 95 | sum_array_gpu<<>> (d_a, d_b, d_c, size); 96 | gpu_end = clock(); 97 | printf("Sum array GPU execution time : %4.6f \n",(double)((double)(gpu_end - 98 | gpu_start)/CLOCKS_PER_SEC)); 99 | cudaDeviceSynchronize(); 100 | 101 | clock_t dtoh_start, dtoh_end; 102 | dtoh_start = clock(); 103 | cudaMemcpy(gpu_results, d_c, NO_BYTES, cudaMemcpyDeviceToHost); 104 | dtoh_end = clock(); 105 | printf("Sum array GPU total time : %4.6f \n",(double)((double)(dtoh_end - 106 | htod_start)/CLOCKS_PER_SEC)); 107 | 108 | // array comparison 109 | compare_arrays(h_c, gpu_results, size); 110 | 111 | 112 | 113 | cudaFree(d_a); 114 | cudaFree(d_b); 115 | cudaFree(d_c); 116 | 117 | 118 | free(gpu_results); 119 | cudaDeviceReset(); 120 | 121 | 122 | return 0; 123 | 124 | } 125 | -------------------------------------------------------------------------------- /tutorials/08-DeviceQuerry/main.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | #include "device_launch_parameters.h" 3 | #include 4 | 5 | 6 | void query_device() 7 | { 8 | int deviceCount = 0; 9 | cudaGetDeviceCount(&deviceCount); 10 | 11 | if (deviceCount == 0) { 12 | printf("No CUDA Support device found!"); 13 | } 14 | 15 | int devNo = 0; 16 | cudaDeviceProp iProp; 17 | cudaGetDeviceProperties(&iProp, devNo); 18 | 19 | printf("Device %d : %s\n", devNo, iProp.name); 20 | printf("Number of multiprocessors: %d\n", iProp.multiProcessorCount); 21 | printf("clock rate : %d\n", iProp.clockRate); 22 | printf("Compute capability : %d.%d\n", iProp.major, iProp.minor); 23 | printf("Total amount of global memory : %4.2f KB\n", iProp.totalGlobalMem / 24 | 1024.0); 25 | printf("Total amount of constant memory : %4.2f KB\n", iProp.totalConstMem 26 | /1024.0); 27 | printf("Total amount of shared memory per block : %4.2f KB\n", 28 | iProp.sharedMemPerBlock / 1024.0); 29 | printf("Total amount of shared memory per MP : %4.2f KB\n", 30 | iProp.sharedMemPerMultiprocessor / 1024.0); 31 | printf("Warp size : %d\n", iProp.warpSize); 32 | printf("Maximum number of threads per block: %d\n", 33 | iProp.maxThreadsPerBlock); 34 | printf("Maximum number of threads per multiprocessor: %d\n", 35 | iProp.maxThreadsPerMultiProcessor); 36 | printf("Maximum number of warps per multiprocessor: %d\n", 37 | iProp.maxThreadsPerMultiProcessor / 32); 38 | printf("Maximum Grid size : (%d, %d, %d)\n", iProp.maxGridSize[0], 39 | iProp.maxGridSize[1], iProp.maxGridSize[2]); 40 | printf("Maximum block dimension : (%d, %d, %d)\n", iProp.maxThreadsDim[0], 41 | iProp.maxThreadsDim[1], iProp.maxThreadsDim[2]); 42 | 43 | } 44 | 45 | int main(void) 46 | { 47 | query_device(); 48 | 49 | return 0; 50 | 51 | } 52 | -------------------------------------------------------------------------------- /tutorials/09-Warps/main.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | #include "device_launch_parameters.h" 3 | 4 | #include 5 | #include 6 | 7 | __global__ void print_details_of_warps() 8 | { 9 | int gid = blockIdx.y * gridDim.x * blockDim.x + blockIdx.x * blockDim.x + 10 | threadIdx.x; 11 | 12 | int warp_id = threadIdx.x / 32; 13 | 14 | int gbid = blockIdx.y * gridDim.x + blockIdx.x; 15 | 16 | printf("tid : %d, bid.x : %d, bid.y: %d, gid : %d, warp_id : %d, gbid : %d\n", 17 | threadIdx.x, blockIdx.x, blockIdx.y, gid, warp_id, gbid); 18 | } 19 | 20 | 21 | 22 | int main(void) 23 | { 24 | dim3 block_size(42); 25 | dim3 grid_size(2, 2); 26 | 27 | print_details_of_warps<<>>(); 28 | cudaDeviceSynchronize(); 29 | 30 | cudaDeviceReset(); 31 | return 0; 32 | } 33 | -------------------------------------------------------------------------------- /tutorials/10-WarpDivergence/main.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | #include "device_launch_parameters.h" 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | __global__ void code_without_divergence() 9 | { 10 | int gid = blockIdx.x * blockDim.x + threadIdx.x; 11 | 12 | float a, b; 13 | a = b = 0; 14 | int warp_id = gid / 32; 15 | 16 | if (warp_id % 2 == 0) { 17 | a = 100.0; 18 | b = 50.0; 19 | } else { 20 | a = 200.0; 21 | b = 75.0; 22 | } 23 | } 24 | 25 | 26 | __global__ void divergence() 27 | { 28 | int gid = blockIdx.x * blockDim.x + threadIdx.x; 29 | 30 | float a, b; 31 | a = b = 0.0; 32 | if (gid % 2 == 0) { 33 | a = 100.0; 34 | b = 50.0; 35 | } else { 36 | a = 200.0; 37 | b = 75.0; 38 | } 39 | } 40 | 41 | int main(void) 42 | { 43 | int size = 1 << 22; 44 | dim3 block_size(128); 45 | dim3 grid_size((size + block_size.x - 1) / block_size.x); 46 | 47 | 48 | code_without_divergence<<>>(); 49 | cudaDeviceSynchronize(); 50 | 51 | divergence<<>>(); 52 | cudaDeviceSynchronize(); 53 | cudaDeviceReset(); 54 | return 0; 55 | } 56 | -------------------------------------------------------------------------------- /tutorials/11-Occupancy/main.cu: -------------------------------------------------------------------------------- 1 | // nvcc -Xptxas=-v -o a.out main.cu 2 | #include "cuda.h" 3 | #include "cuda_runtime.h" 4 | #include "device_launch_parameters.h" 5 | #include 6 | #include 7 | 8 | __global__ void occupancy_test(int* results) 9 | { 10 | int gid = blockIdx.x * blockDim.x + threadIdx.x; 11 | int x1 = 1; 12 | int x2 = 2; 13 | int x3 = 3; 14 | int x4 = 4; 15 | int x5 = 5; 16 | int x6 = 6; 17 | int x7 = 7; 18 | int x8 = 8; 19 | results[gid] = x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8; 20 | } 21 | 22 | int main(void) 23 | { 24 | 25 | 26 | 27 | return 0; 28 | } 29 | --------------------------------------------------------------------------------