├── Code files ├── Section 1 │ └── 1.5 │ │ ├── Makefile │ │ ├── array-add │ │ └── array-add.cu ├── Section 2 │ ├── 2.2 │ │ ├── Makefile │ │ ├── launch-bounds-1d │ │ ├── launch-bounds-1d.cu │ │ ├── launch-bounds-2d-a │ │ ├── launch-bounds-2d-a.cu │ │ ├── launch-bounds-2d-b │ │ ├── launch-bounds-2d-b.cu │ │ ├── occupancy-api │ │ └── occupancy-api.cu │ ├── 2.3 │ │ ├── Makefile │ │ ├── array-add-bug │ │ └── array-add-bug.cu │ ├── 2.4 │ │ ├── Makefile │ │ ├── array-add-bug │ │ └── array-add-bug.cu │ └── 2.5 │ │ ├── Makefile │ │ ├── error-handling │ │ └── error-handling.cu ├── Section 3 │ ├── 3.1 │ │ ├── Makefile │ │ ├── monochrome │ │ └── monochrome.cu │ ├── 3.2 │ │ ├── Makefile │ │ ├── monochrome │ │ └── monochrome.cu │ ├── 3.3 │ │ ├── Makefile │ │ ├── spotlights │ │ ├── spotlights-2d │ │ ├── spotlights-2d.cu │ │ └── spotlights.cu │ ├── 3.4 │ │ ├── Makefile │ │ ├── manylights-const │ │ ├── manylights-const.cu │ │ ├── manylights1.cu │ │ ├── manylights2 │ │ ├── manylights2.cu │ │ ├── warp │ │ ├── warp-texture │ │ ├── warp-texture.cu │ │ └── warp.cu │ └── 3.5 │ │ ├── Makefile │ │ ├── manylights-ilp │ │ └── manylights-ilp.cu ├── Section 4 │ ├── 4.1 │ │ ├── Makefile │ │ ├── transpose │ │ ├── transpose-shared │ │ ├── transpose-shared.cu │ │ └── transpose.cu │ ├── 4.2 │ │ ├── Makefile │ │ ├── reduce │ │ └── reduce.cu │ ├── 4.3 │ │ ├── Makefile │ │ ├── scan │ │ └── scan.cu │ └── 4.4 │ │ ├── Makefile │ │ ├── filter │ │ └── filter.cu ├── Section 5 │ └── 5.4 │ │ ├── Makefile │ │ ├── thrust │ │ └── thrust.cu ├── Section 6 │ ├── 6.1 │ │ ├── Makefile │ │ ├── reduce-stream │ │ └── reduce-stream.cu │ ├── 6.2 │ │ ├── Makefile │ │ ├── scan-page-locked │ │ ├── scan-page-locked.cu │ │ ├── scan-stream │ │ └── scan-stream.cu │ ├── 6.4 │ │ ├── Makefile │ │ ├── scan-multi-device │ │ └── scan-multi-device.cu │ ├── 6.5 │ │ ├── Makefile │ │ ├── scan-unified │ │ └── scan-unified.cu │ ├── 6.6 │ │ ├── Makefile │ │ ├── bst │ │ └── bst.cu │ └── 6.7 │ │ ├── Makefile │ │ ├── bst-sum │ │ ├── bst-sum-kernels.cu │ │ ├── bst-sum-kernels.cuh │ │ ├── bst-sum-kernels.o │ │ ├── bst-sum.cu │ │ └── bst-sum.o ├── utils.cu ├── utils.h └── utils.o ├── LICENSE ├── README.md ├── Section 1 ├── 1.1_JJ_MC.pptx ├── 1.2_YM_MC.pptx ├── 1.3_YM_MC.pptx ├── 1.4_YM_MC.pptx └── 1.5_YM_MC.pptx ├── Section 2 ├── 2.1_TK_MC.pptx ├── 2.2_TK_MC.pptx ├── 2.3_TK_MC.pptx ├── 2.4_TK_MC.pptx └── 2.5_TK_MC.pptx ├── Section 3 ├── 3.1_YM_MC.pptx ├── 3.2_YM_MC.pptx ├── 3.3_YM_MC.pptx ├── 3.4_YM_MC.pptx └── 3.5_YM_MC.pptx ├── Section 4 ├── 4.1_YM_MC.pptx ├── 4.2_YM_MC.pptx ├── 4.3_YM_MC.pptx └── 4.4_YM_MC.pptx ├── Section 5 ├── 5.1_TK_MC.pptx ├── 5.2_TK_MC.pptx ├── 5.3_TK_MC.pptx └── 5.4_TK_MC.pptx ├── Section 6 ├── 6.1_TK_MC.pptx ├── 6.2_TK_MC.pptx ├── 6.3_TK_MC.pptx ├── 6.4_TK_MC.pptx ├── 6.5_TK_MC.pptx ├── 6.6_TK_MC.pptx └── 6.7_TK_MC.pptx └── Section 7 ├── 7.1_TK_MC.pptx ├── 7.2_TK_MC.pptx └── 7.3_TK_MC.pptx /Code files/Section 1/1.5/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = array-add 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c 
bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 1/1.5/array-add: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 1/1.5/array-add -------------------------------------------------------------------------------- /Code files/Section 1/1.5/array-add.cu: -------------------------------------------------------------------------------- 1 | // Add two arrays using CUDA. 2 | // Example for videos 1.5 and 2.1 3 | 4 | #include <assert.h> 5 | #include <stdio.h> 6 | 7 | // Standard CUDA API functions 8 | #include <cuda_runtime_api.h> 9 | 10 | // Error checking macro 11 | #define cudaCheckError(code) \ 12 | { \ 13 | if ((code) != cudaSuccess) { \ 14 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \ 15 | cudaGetErrorString(code)); \ 16 | } \ 17 | } 18 | 19 | // Host function for array addition 20 | void add_loop(float *dest, int n_elts, const float *a, const float *b) 21 | { 22 | for (int i = 0; i < n_elts; i++) { 23 | dest[i] = a[i] + b[i]; 24 | } 25 | } 26 | 27 | // Device kernel for array addition. 28 | __global__ void add_kernel(float *dest, int n_elts, const float *a, 29 | const float *b) 30 | { 31 | int index = blockIdx.x * blockDim.x + threadIdx.x; 32 | if (index >= n_elts) return; 33 | 34 | dest[index] = a[index] + b[index]; 35 | } 36 | 37 | int main() 38 | { 39 | const int ARRAY_LENGTH = 100; 40 | 41 | // Generate some data on the host 42 | float host_array_a[ARRAY_LENGTH]; 43 | float host_array_b[ARRAY_LENGTH]; 44 | float host_array_dest[ARRAY_LENGTH]; 45 | 46 | for (int i = 0; i < ARRAY_LENGTH; i++) { 47 | host_array_a[i] = 2 * i; 48 | host_array_b[i] = 2 * i + 1; 49 | } 50 | 51 | // Allocate device memory 52 | float *device_array_a, *device_array_b, *device_array_dest; 53 | cudaCheckError(cudaMalloc(&device_array_a, sizeof(host_array_a))); 54 | cudaCheckError(cudaMalloc(&device_array_b, sizeof(host_array_b))); 55 | cudaCheckError(cudaMalloc(&device_array_dest, sizeof(host_array_dest))); 56 | 57 | // Transfer data to device 58 | cudaCheckError(cudaMemcpy(device_array_a, host_array_a, sizeof(host_array_a), 59 | cudaMemcpyHostToDevice)); 60 | cudaCheckError(cudaMemcpy(device_array_b, host_array_b, sizeof(host_array_b), 61 | cudaMemcpyHostToDevice)); 62 | 63 | // Calculate launch configuration 64 | const int BLOCK_SIZE = 128; 65 | int n_blocks = (ARRAY_LENGTH + BLOCK_SIZE - 1) / BLOCK_SIZE; 66 | 67 | // Add arrays on device 68 | add_kernel<<<n_blocks, BLOCK_SIZE>>>(device_array_dest, ARRAY_LENGTH, 69 | device_array_a, device_array_b); 70 | 71 | // Meanwhile, add arrays on the host, for comparison 72 | add_loop(host_array_dest, ARRAY_LENGTH, host_array_a, host_array_b); 73 | 74 | // Copy result back to host and compare 75 | float host_array_tmp[ARRAY_LENGTH]; 76 | cudaCheckError(cudaMemcpy(host_array_tmp, device_array_dest, 77 | sizeof(host_array_tmp), cudaMemcpyDeviceToHost)); 78 | for (int i = 0; i < ARRAY_LENGTH; i++) { 79 | assert(host_array_tmp[i] == host_array_dest[i]); 80 | printf("%g + %g = %g\n", host_array_a[i], host_array_b[i], 81 | host_array_tmp[i]); 82 | } 83 | 84 | return 0; 85 | } 86 | --------------------------------------------------------------------------------
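A note on the launch arithmetic in array-add.cu: (ARRAY_LENGTH + BLOCK_SIZE - 1) / BLOCK_SIZE rounds the grid size up so every element gets a thread, which is why the kernel guards with index >= n_elts. A common alternative, sketched below (not part of the repository; add_kernel_strided is a hypothetical name), is a grid-stride loop that handles any array length without recomputing the grid size:

__global__ void add_kernel_strided(float *dest, int n_elts, const float *a,
                                   const float *b)
{
  // Each thread starts at its global index and advances by the total number
  // of threads in the grid, so any grid size covers any n_elts.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n_elts;
       i += blockDim.x * gridDim.x) {
    dest[i] = a[i] + b[i];
  }
}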
/Code files/Section 2/2.2/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = launch-bounds-1d launch-bounds-2d-a launch-bounds-2d-b occupancy-api 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 2/2.2/launch-bounds-1d: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 2/2.2/launch-bounds-1d -------------------------------------------------------------------------------- /Code files/Section 2/2.2/launch-bounds-1d.cu: -------------------------------------------------------------------------------- 1 | // Demonstration of kernel execution configuration for a one-dimensional 2 | // grid. 3 | // Example for video 2.2. 4 | 5 | #include <stdio.h> 6 | #include <cuda_runtime_api.h> 7 | 8 | // Error checking macro 9 | #define cudaCheckError(code) \ 10 | { \ 11 | if ((code) != cudaSuccess) { \ 12 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \ 13 | cudaGetErrorString(code)); \ 14 | } \ 15 | } 16 | 17 | __global__ void kernel_1d() 18 | { 19 | int index = blockIdx.x * blockDim.x + threadIdx.x; 20 | printf("block %d, thread %d, index %d\n", blockIdx.x, threadIdx.x, index); 21 | } 22 | 23 | int main() 24 | { 25 | kernel_1d<<<4, 8>>>(); 26 | cudaCheckError(cudaDeviceSynchronize()); 27 | } 28 | -------------------------------------------------------------------------------- /Code files/Section 2/2.2/launch-bounds-2d-a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 2/2.2/launch-bounds-2d-a -------------------------------------------------------------------------------- /Code files/Section 2/2.2/launch-bounds-2d-a.cu: -------------------------------------------------------------------------------- 1 | // Example of generating two-dimensional data coordinates from a 2 | // one-dimensional grid. A two-dimensional grid would be better suited here. 3 | // Example for video 2.2.
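// A worked instance of the index math in this file (added note, not in the
// original source): with blockDim.x = 8, block 2 / thread 3 gives
// index = 2 * 8 + 3 = 19, and with width = 16 that maps to
// (x, y) = (19 % 16, 19 / 16) = (3, 1).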
4 | 5 | #include <stdio.h> 6 | #include <cuda_runtime_api.h> 7 | 8 | // Error checking macro 9 | #define cudaCheckError(code) \ 10 | { \ 11 | if ((code) != cudaSuccess) { \ 12 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \ 13 | cudaGetErrorString(code)); \ 14 | } \ 15 | } 16 | 17 | __global__ void kernel_1d(int width) 18 | { 19 | int index = blockIdx.x * blockDim.x + threadIdx.x; 20 | int x = index % width; 21 | int y = index / width; 22 | printf("block %d, thread %d, index (%d, %d)\n", blockIdx.x, threadIdx.x, x, 23 | y); 24 | } 25 | 26 | int main() 27 | { 28 | kernel_1d<<<4, 8>>>(16); 29 | cudaCheckError(cudaDeviceSynchronize()); 30 | } 31 | -------------------------------------------------------------------------------- /Code files/Section 2/2.2/launch-bounds-2d-b: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 2/2.2/launch-bounds-2d-b -------------------------------------------------------------------------------- /Code files/Section 2/2.2/launch-bounds-2d-b.cu: -------------------------------------------------------------------------------- 1 | // Demonstration of kernel execution configuration for a two-dimensional 2 | // grid. 3 | // Example for video 2.2. 4 | 5 | #include <stdio.h> 6 | #include <cuda_runtime_api.h> 7 | 8 | // Error checking macro 9 | #define cudaCheckError(code) \ 10 | { \ 11 | if ((code) != cudaSuccess) { \ 12 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \ 13 | cudaGetErrorString(code)); \ 14 | } \ 15 | } 16 | 17 | __global__ void kernel_2d() 18 | { 19 | int x = blockIdx.x * blockDim.x + threadIdx.x; 20 | int y = blockIdx.y * blockDim.y + threadIdx.y; 21 | printf("block (%d, %d), thread (%d, %d), index (%d, %d)\n", blockIdx.x, 22 | blockIdx.y, threadIdx.x, threadIdx.y, x, y); 23 | } 24 | 25 | int main() 26 | { 27 | dim3 block_dim(8, 2); 28 | dim3 grid_dim(2, 1); 29 | kernel_2d<<<grid_dim, block_dim>>>(); 30 | cudaCheckError(cudaDeviceSynchronize()); 31 | } 32 | -------------------------------------------------------------------------------- /Code files/Section 2/2.2/occupancy-api: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 2/2.2/occupancy-api -------------------------------------------------------------------------------- /Code files/Section 2/2.2/occupancy-api.cu: -------------------------------------------------------------------------------- 1 | // Demonstration of the CUDA occupancy API. 2 | // Example for video 2.2.
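// Usage sketch (an added note, not in the original file): the two values
// computed below are typically fed straight into a launch. For n data
// elements, a caller would round up much like the other examples do:
//   int grid_size = (n + block_size - 1) / block_size;
//   kernel_1d<<<grid_size, block_size>>>();
// min_grid_size is the smallest grid that can still reach full occupancy.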
3 | 4 | #include <stdio.h> 5 | #include <cuda_runtime.h> 6 | 7 | __global__ void kernel_1d() {} 8 | 9 | int main() 10 | { 11 | int block_size; // The launch configurator returned block size 12 | int min_grid_size; // The minimum grid size needed to achieve max occupancy 13 | 14 | cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, kernel_1d, 0, 15 | 0); 16 | 17 | printf("Block size %d\nMin grid size %d\n", block_size, min_grid_size); 18 | } 19 | -------------------------------------------------------------------------------- /Code files/Section 2/2.3/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = array-add-bug 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 2/2.3/array-add-bug: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 2/2.3/array-add-bug -------------------------------------------------------------------------------- /Code files/Section 2/2.3/array-add-bug.cu: -------------------------------------------------------------------------------- 1 | #include <assert.h> 2 | #include <stdio.h> 3 | 4 | // Standard CUDA API functions 5 | #include <cuda_runtime_api.h> 6 | 7 | // Error checking macro 8 | #define cudaCheckError(code) \ 9 | { \ 10 | if ((code) != cudaSuccess) { \ 11 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \ 12 | cudaGetErrorString(code)); \ 13 | } \ 14 | } 15 | 16 | // Host function for array addition 17 | void add_loop(float *dest, int n_elts, const float *a, const float *b) 18 | { 19 | for (int i = 0; i < n_elts; i++) { 20 | dest[i] = a[i] + b[i]; 21 | } 22 | } 23 | 24 | // Device kernel for array addition.
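// Note (added comment, not in the original source): in main() below, the
// kernel launch passes nullptr as input b instead of device_array_b. That is
// the deliberate bug this example demonstrates; tools such as cuda-memcheck
// flag the resulting invalid read.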
25 | __global__ void add_kernel(float *dest, int n_elts, const float *a, 26 | const float *b) 27 | { 28 | int index = blockIdx.x * blockDim.x + threadIdx.x; 29 | if (index >= n_elts) return; 30 | 31 | dest[index] = a[index] + b[index]; 32 | } 33 | 34 | int main() 35 | { 36 | const int ARRAY_LENGTH = 10000; 37 | 38 | // Generate some data on the host 39 | float host_array_a[ARRAY_LENGTH]; 40 | float host_array_b[ARRAY_LENGTH]; 41 | float host_array_dest[ARRAY_LENGTH]; 42 | 43 | for (int i = 0; i < ARRAY_LENGTH; i++) { 44 | host_array_a[i] = 2 * i; 45 | host_array_b[i] = 2 * i + 1; 46 | } 47 | 48 | // Allocate device memory 49 | float *device_array_a, *device_array_b, *device_array_dest; 50 | cudaCheckError(cudaMalloc(&device_array_a, sizeof(host_array_a))); 51 | cudaCheckError(cudaMalloc(&device_array_b, sizeof(host_array_b))); 52 | cudaCheckError(cudaMalloc(&device_array_dest, sizeof(host_array_dest))); 53 | 54 | // Transfer data to device 55 | cudaCheckError(cudaMemcpy(device_array_a, host_array_a, sizeof(host_array_a), 56 | cudaMemcpyHostToDevice)); 57 | cudaCheckError(cudaMemcpy(device_array_b, host_array_b, sizeof(host_array_b), 58 | cudaMemcpyHostToDevice)); 59 | 60 | // Calculate launch configuration 61 | const int BLOCK_SIZE = 128; 62 | int n_blocks = (ARRAY_LENGTH + BLOCK_SIZE - 1) / BLOCK_SIZE; 63 | 64 | // Add arrays on device 65 | add_kernel<<<n_blocks, BLOCK_SIZE>>>(device_array_dest, ARRAY_LENGTH, 66 | device_array_a, nullptr); 67 | 68 | // Meanwhile, add arrays on the host, for comparison 69 | add_loop(host_array_dest, ARRAY_LENGTH, host_array_a, host_array_b); 70 | 71 | // Copy result back to host and compare 72 | float host_array_tmp[ARRAY_LENGTH]; 73 | cudaCheckError(cudaMemcpy(host_array_tmp, device_array_dest, 74 | sizeof(host_array_tmp), cudaMemcpyDeviceToHost)); 75 | 76 | for (int i = 0; i < ARRAY_LENGTH; i++) { 77 | assert(host_array_tmp[i] == host_array_dest[i]); 78 | printf("%g + %g = %g\n", host_array_a[i], host_array_b[i], 79 | host_array_tmp[i]); 80 | } 81 | 82 | return 0; 83 | } 84 | -------------------------------------------------------------------------------- /Code files/Section 2/2.4/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = array-add-bug 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 2/2.4/array-add-bug: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 2/2.4/array-add-bug -------------------------------------------------------------------------------- /Code files/Section 2/2.4/array-add-bug.cu:
-------------------------------------------------------------------------------- 1 | #include <assert.h> 2 | #include <stdio.h> 3 | 4 | // Standard CUDA API functions 5 | #include <cuda_runtime_api.h> 6 | 7 | // Error checking macro 8 | #define cudaCheckError(code) \ 9 | { \ 10 | if ((code) != cudaSuccess) { \ 11 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \ 12 | cudaGetErrorString(code)); \ 13 | } \ 14 | } 15 | 16 | // Host function for array addition 17 | void add_loop(float *dest, int n_elts, const float *a, const float *b) 18 | { 19 | for (int i = 0; i < n_elts; i++) { 20 | dest[i] = a[i] + b[i]; 21 | } 22 | } 23 | 24 | // Device kernel for array addition. 25 | __global__ void add_kernel(float *dest, int n_elts, const float *a, 26 | const float *b) 27 | { 28 | int index = blockIdx.x * blockDim.x + threadIdx.x; 29 | if (index >= n_elts) return; 30 | 31 | dest[index] = a[index] + b[index]; 32 | } 33 | 34 | int main() 35 | { 36 | const int ARRAY_LENGTH = 10000; 37 | 38 | // Generate some data on the host 39 | float host_array_a[ARRAY_LENGTH]; 40 | float host_array_b[ARRAY_LENGTH]; 41 | float host_array_dest[ARRAY_LENGTH]; 42 | 43 | for (int i = 0; i < ARRAY_LENGTH; i++) { 44 | host_array_a[i] = 2 * i; 45 | host_array_b[i] = 2 * i + 1; 46 | } 47 | 48 | // Allocate device memory 49 | float *device_array_a, *device_array_b, *device_array_dest; 50 | cudaCheckError(cudaMalloc(&device_array_a, sizeof(host_array_a))); 51 | cudaCheckError(cudaMalloc(&device_array_b, sizeof(host_array_b))); 52 | cudaCheckError(cudaMalloc(&device_array_dest, sizeof(host_array_dest))); 53 | 54 | // Transfer data to device 55 | cudaCheckError(cudaMemcpy(device_array_a, host_array_a, sizeof(host_array_a), 56 | cudaMemcpyHostToDevice)); 57 | cudaCheckError(cudaMemcpy(device_array_b, host_array_b, sizeof(host_array_b), 58 | cudaMemcpyHostToDevice)); 59 | 60 | // Calculate launch configuration 61 | const int BLOCK_SIZE = 128; 62 | int n_blocks = (ARRAY_LENGTH + BLOCK_SIZE - 1) / BLOCK_SIZE; 63 | 64 | // Add arrays on device 65 | add_kernel<<<n_blocks, BLOCK_SIZE>>>(device_array_dest, ARRAY_LENGTH, 66 | device_array_a, nullptr); 67 | 68 | // Meanwhile, add arrays on the host, for comparison 69 | add_loop(host_array_dest, ARRAY_LENGTH, host_array_a, host_array_b); 70 | 71 | // Copy result back to host and compare 72 | float host_array_tmp[ARRAY_LENGTH]; 73 | cudaCheckError(cudaMemcpy(host_array_tmp, device_array_dest, 74 | sizeof(host_array_tmp), cudaMemcpyDeviceToHost)); 75 | 76 | for (int i = 0; i < ARRAY_LENGTH; i++) { 77 | assert(host_array_tmp[i] == host_array_dest[i]); 78 | printf("%g + %g = %g\n", host_array_a[i], host_array_b[i], 79 | host_array_tmp[i]); 80 | } 81 | 82 | return 0; 83 | } 84 | -------------------------------------------------------------------------------- /Code files/Section 2/2.5/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = error-handling 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum
bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 2/2.5/error-handling: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 2/2.5/error-handling -------------------------------------------------------------------------------- /Code files/Section 2/2.5/error-handling.cu: -------------------------------------------------------------------------------- 1 | // Demonstration of basic CUDA error handling. 2 | // Example for video 2.5. 3 | 4 | #include <stdio.h> 5 | 6 | // Standard CUDA API functions 7 | #include <cuda_runtime_api.h> 8 | 9 | // Error checking macro 10 | #define cudaCheckError(code) \ 11 | { \ 12 | if ((code) != cudaSuccess) { \ 13 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \ 14 | cudaGetErrorString(code)); \ 15 | } \ 16 | } 17 | 18 | __global__ void bad() 19 | { 20 | char *x = nullptr; 21 | *x = 1; 22 | } 23 | 24 | __global__ void good() {} 25 | 26 | int main() 27 | { 28 | int *foo = nullptr; 29 | size_t size = 1lu << 33; 30 | cudaError_t status = cudaMalloc(&foo, size); 31 | const char *message = cudaGetErrorString(status); 32 | 33 | status = cudaGetLastError(); 34 | 35 | status = cudaMalloc(&foo, 16); 36 | message = cudaGetErrorString(status); 37 | 38 | bad<<<1, 1>>>(); 39 | status = cudaDeviceSynchronize(); 40 | message = cudaGetErrorString(status); 41 | 42 | good<<<1, 16>>>(); 43 | status = cudaDeviceSynchronize(); 44 | message = cudaGetErrorString(status); 45 | 46 | cudaCheckError(cudaMalloc(&foo, 16)) 47 | 48 | return 0; 49 | } 50 | -------------------------------------------------------------------------------- /Code files/Section 3/3.1/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = monochrome 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 3/3.1/monochrome: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.1/monochrome -------------------------------------------------------------------------------- /Code files/Section 3/3.1/monochrome.cu: -------------------------------------------------------------------------------- 1 | // Convert a color image to monochrome. 2 | // Example for videos 3.1 and 3.2.
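// Added note: the channel weights in the kernel below (0.3125, 0.5, 0.1875,
// i.e. 5/16, 1/2, 3/16) are power-of-two-friendly approximations of standard
// luma coefficients (Rec. 601 uses roughly 0.299, 0.587, 0.114).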
3 | 4 | #include <assert.h> 5 | #include <stdio.h> 6 | 7 | // Standard CUDA API functions 8 | #include <cuda_runtime_api.h> 9 | 10 | #include "../utils.h" 11 | 12 | __global__ void monochrome(const pixel *source, pixel *dest, int size) 13 | { 14 | int index = blockIdx.x * blockDim.x + threadIdx.x; 15 | if (index >= size) return; 16 | 17 | float value(source[index].red * 0.3125f + source[index].green * 0.5f + 18 | source[index].blue * .1875f); 19 | 20 | dest[index].red = value; 21 | dest[index].green = value; 22 | dest[index].blue = value; 23 | dest[index].alpha = source[index].alpha; 24 | } 25 | 26 | int main(int argc, char **argv) 27 | { 28 | test_params params = set_up_test(argc, argv); 29 | 30 | int pixel_count = params.width * params.height; 31 | int BLOCK_SIZE = 128; 32 | int n_blocks = (pixel_count + BLOCK_SIZE - 1) / BLOCK_SIZE; 33 | 34 | { 35 | KernelTimer t; 36 | monochrome<<<n_blocks, BLOCK_SIZE>>>(params.input_image, 37 | params.output_image, pixel_count); 38 | } 39 | 40 | finish_test(params); 41 | 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /Code files/Section 3/3.2/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = monochrome 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 3/3.2/monochrome: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.2/monochrome -------------------------------------------------------------------------------- /Code files/Section 3/3.2/monochrome.cu: -------------------------------------------------------------------------------- 1 | // Convert a color image to monochrome. 2 | // Example for videos 3.1 and 3.2.
3 | 4 | #include <assert.h> 5 | #include <stdio.h> 6 | 7 | // Standard CUDA API functions 8 | #include <cuda_runtime_api.h> 9 | 10 | #include "../utils.h" 11 | 12 | __global__ void monochrome(const pixel *source, pixel *dest, int size) 13 | { 14 | int index = blockIdx.x * blockDim.x + threadIdx.x; 15 | if (index >= size) return; 16 | 17 | float value(source[index].red * 0.3125f + source[index].green * 0.5f + 18 | source[index].blue * .1875f); 19 | 20 | dest[index].red = value; 21 | dest[index].green = value; 22 | dest[index].blue = value; 23 | dest[index].alpha = source[index].alpha; 24 | } 25 | 26 | int main(int argc, char **argv) 27 | { 28 | test_params params = set_up_test(argc, argv); 29 | 30 | int pixel_count = params.width * params.height; 31 | int BLOCK_SIZE = 128; 32 | int n_blocks = (pixel_count + BLOCK_SIZE - 1) / BLOCK_SIZE; 33 | 34 | { 35 | KernelTimer t; 36 | monochrome<<<n_blocks, BLOCK_SIZE>>>(params.input_image, 37 | params.output_image, pixel_count); 38 | } 39 | 40 | finish_test(params); 41 | 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /Code files/Section 3/3.3/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = spotlights spotlights-2d 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 3/3.3/spotlights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.3/spotlights -------------------------------------------------------------------------------- /Code files/Section 3/3.3/spotlights-2d: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.3/spotlights-2d -------------------------------------------------------------------------------- /Code files/Section 3/3.3/spotlights-2d.cu: -------------------------------------------------------------------------------- 1 | // Render several spotlights on an image. 2 | // Uses a two-dimensional memory layout to ensure coalesced access. 3 | // Example for video 3.3. 4 | 5 | #include <iostream> 6 | #include <math.h> 7 | #include <stdio.h> 8 | 9 | // Standard CUDA API functions 10 | #include <cuda_runtime_api.h> 11 | 12 | #include "../utils.h" 13 | 14 | struct light { 15 | float x; 16 | float y; 17 | float radius; 18 | float brightness; 19 | }; 20 | 21 | __device__ float clamp(float value) { return value > 1.0f ?
1.0f : value; } 22 | 23 | __device__ float light_brightness(float x, float y, unsigned int width, 24 | unsigned int height, const light &light) 25 | { 26 | float norm_x = x / width; 27 | float norm_y = y / height; 28 | 29 | float dx = norm_x - light.x; 30 | float dy = norm_y - light.y; 31 | float distance_squared = dx * dx + dy * dy; 32 | if (distance_squared > light.radius * light.radius) { 33 | return 0; 34 | } 35 | float distance = sqrtf(distance_squared); 36 | 37 | float scaled_distance = distance / light.radius; 38 | if (scaled_distance > 0.8) { 39 | return (1.0f - (scaled_distance - 0.8f) * 5.0f) * light.brightness; 40 | } else { 41 | return light.brightness; 42 | } 43 | } 44 | 45 | template <class T> 46 | __device__ T *pointer2d(T *base_pointer, int x, int y, size_t pitch) 47 | { 48 | return (T *)((char *)base_pointer + y * pitch) + x; 49 | } 50 | 51 | __global__ void spotlights(const image source, image dest, unsigned int width, 52 | unsigned int height, size_t pitch, float ambient, 53 | light light1, light light2, light light3, 54 | light light4) 55 | { 56 | int x = blockIdx.x * blockDim.x + threadIdx.x; 57 | int y = blockIdx.y * blockDim.y + threadIdx.y; 58 | if (x >= width || y >= height) return; 59 | 60 | float brightness = ambient + light_brightness(x, y, width, height, light1) + 61 | light_brightness(x, y, width, height, light2) + 62 | light_brightness(x, y, width, height, light3) + 63 | light_brightness(x, y, width, height, light4); 64 | 65 | *pointer2d(dest.red, x, y, pitch) = 66 | clamp(*pointer2d(source.red, x, y, pitch) * brightness); 67 | *pointer2d(dest.green, x, y, pitch) = 68 | clamp(*pointer2d(source.green, x, y, pitch) * brightness); 69 | *pointer2d(dest.blue, x, y, pitch) = 70 | clamp(*pointer2d(source.blue, x, y, pitch) * brightness); 71 | } 72 | 73 | int main(int argc, char **argv) 74 | { 75 | auto params = set_up_test_planar(argc, argv); 76 | 77 | light light1 = {0.2, 0.1, 0.1, 4.0}; 78 | light light2 = {0.25, 0.2, 0.075, 2.0}; 79 | light light3 = {0.5, 0.5, 0.3, 0.3}; 80 | light light4 = {0.7, 0.65, 0.15, 0.8}; 81 | 82 | image input2d, output2d; 83 | size_t byte_width = params.width * sizeof(float); 84 | size_t pitch; 85 | 86 | // Allocate 2D aligned image 87 | cudaCheckError( 88 | cudaMallocPitch(&input2d.red, &pitch, byte_width, params.height)); 89 | // Copy from 1D to 2D image 90 | cudaCheckError(cudaMemcpy2D(input2d.red, pitch, params.input_image.red, 91 | byte_width, byte_width, params.height, 92 | cudaMemcpyDeviceToDevice)); 93 | std::cout << "Width: " << byte_width << " bytes.
Pitch: " << pitch 94 | << " bytes\n"; 95 | 96 | // Allocate and copy other channels 97 | // Note: pitch will be the same for all of these allocations 98 | cudaCheckError( 99 | cudaMallocPitch(&input2d.green, &pitch, byte_width, params.height)); 100 | cudaCheckError( 101 | cudaMallocPitch(&input2d.blue, &pitch, byte_width, params.height)); 102 | cudaCheckError( 103 | cudaMallocPitch(&output2d.red, &pitch, byte_width, params.height)); 104 | cudaCheckError( 105 | cudaMallocPitch(&output2d.green, &pitch, byte_width, params.height)); 106 | cudaCheckError( 107 | cudaMallocPitch(&output2d.blue, &pitch, byte_width, params.height)); 108 | cudaCheckError(cudaMemcpy2D(input2d.green, pitch, params.input_image.green, 109 | byte_width, byte_width, params.height, 110 | cudaMemcpyDeviceToDevice)); 111 | cudaCheckError(cudaMemcpy2D(input2d.blue, pitch, params.input_image.blue, 112 | byte_width, byte_width, params.height, 113 | cudaMemcpyDeviceToDevice)); 114 | 115 | dim3 BLOCK_DIM(32, 16); 116 | dim3 grid_dim((params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x, 117 | (params.height + BLOCK_DIM.y - 1) / BLOCK_DIM.y); 118 | 119 | { 120 | KernelTimer t; 121 | spotlights<<>>(input2d, output2d, params.width, 122 | params.height, pitch, 0.3, light1, 123 | light2, light3, light4); 124 | } 125 | 126 | cudaCheckError(cudaMemcpy2D(params.output_image.red, byte_width, output2d.red, 127 | pitch, byte_width, params.height, 128 | cudaMemcpyDeviceToDevice)); 129 | cudaCheckError(cudaMemcpy2D(params.output_image.green, byte_width, 130 | output2d.green, pitch, byte_width, params.height, 131 | cudaMemcpyDeviceToDevice)); 132 | cudaCheckError(cudaMemcpy2D(params.output_image.blue, byte_width, 133 | output2d.blue, pitch, byte_width, params.height, 134 | cudaMemcpyDeviceToDevice)); 135 | 136 | free_image(input2d); 137 | free_image(output2d); 138 | 139 | finish_test_planar(params); 140 | 141 | return 0; 142 | } 143 | -------------------------------------------------------------------------------- /Code files/Section 3/3.3/spotlights.cu: -------------------------------------------------------------------------------- 1 | // Render several spotlights on an image. 2 | // Uses a two-dimensional grid with a one-dimensional memory layout, so 3 | // performance is not optimal. 4 | // Example for video 3.3. 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | // Standard CUDA API functions 11 | #include 12 | 13 | #include "../utils.h" 14 | 15 | struct light { 16 | float x; 17 | float y; 18 | float radius; 19 | float brightness; 20 | }; 21 | 22 | __device__ float clamp(float value) { return value > 1.0f ? 
1.0f : value; } 23 | 24 | __device__ float light_brightness(float x, float y, unsigned int width, 25 | unsigned int height, const light &light) 26 | { 27 | float norm_x = x / width; 28 | float norm_y = y / height; 29 | 30 | float dx = norm_x - light.x; 31 | float dy = norm_y - light.y; 32 | float distance_squared = dx * dx + dy * dy; 33 | if (distance_squared > light.radius * light.radius) { 34 | return 0; 35 | } 36 | float distance = sqrtf(distance_squared); 37 | 38 | float scaled_distance = distance / light.radius; 39 | if (scaled_distance > 0.8) { 40 | return (1.0f - (scaled_distance - 0.8f) * 5.0f) * light.brightness; 41 | } else { 42 | return light.brightness; 43 | } 44 | } 45 | 46 | __global__ void spotlights(const image source, image dest, unsigned int width, 47 | unsigned int height, float ambient, light light1, 48 | light light2, light light3, light light4) 49 | { 50 | int x = blockIdx.x * blockDim.x + threadIdx.x; 51 | int y = blockIdx.y * blockDim.y + threadIdx.y; 52 | if (x >= width || y >= height) return; 53 | 54 | int index = y * width + x; 55 | 56 | float brightness = ambient + light_brightness(x, y, width, height, light1) + 57 | light_brightness(x, y, width, height, light2) + 58 | light_brightness(x, y, width, height, light3) + 59 | light_brightness(x, y, width, height, light4); 60 | 61 | dest.red[index] = clamp(source.red[index] * brightness); 62 | dest.green[index] = clamp(source.green[index] * brightness); 63 | dest.blue[index] = clamp(source.blue[index] * brightness); 64 | } 65 | 66 | int main(int argc, char **argv) 67 | { 68 | auto params = set_up_test_planar(argc, argv); 69 | 70 | light light1 = {0.2, 0.1, 0.1, 4.0}; 71 | light light2 = {0.25, 0.2, 0.075, 2.0}; 72 | light light3 = {0.5, 0.5, 0.3, 0.3}; 73 | light light4 = {0.7, 0.65, 0.15, 0.8}; 74 | 75 | dim3 BLOCK_DIM(32, 16); 76 | dim3 grid_dim((params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x, 77 | (params.height + BLOCK_DIM.y - 1) / BLOCK_DIM.y); 78 | 79 | { 80 | KernelTimer t; 81 | spotlights<<<grid_dim, BLOCK_DIM>>>(params.input_image, params.output_image, 82 | params.width, params.height, 0.3, 83 | light1, light2, light3, light4); 84 | } 85 | 86 | finish_test_planar(params); 87 | 88 | return 0; 89 | } 90 | -------------------------------------------------------------------------------- /Code files/Section 3/3.4/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = manylights2 manylights-const warp warp-texture 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 3/3.4/manylights-const: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.4/manylights-const -------------------------------------------------------------------------------- /Code files/Section 3/3.4/manylights-const.cu: -------------------------------------------------------------------------------- 1 | // Render many spotlights on an image, passing the light definitions in 2 | // constant memory. 3 | // Example for video 3.4. 4 | 5 | #include <iostream> 6 | #include <math.h> 7 | #include <stdio.h> 8 | 9 | // Standard CUDA API functions 10 | #include <cuda_runtime_api.h> 11 | 12 | #include "../utils.h" 13 | 14 | struct light { 15 | float x; 16 | float y; 17 | float radius; 18 | float brightness; 19 | }; 20 | 21 | struct lots_of_lights { 22 | unsigned int count; 23 | light lights[1024]; 24 | }; 25 | 26 | __constant__ lots_of_lights dev_lights; 27 | 28 | __device__ float clamp(float value) { return value > 1.0f ? 1.0f : value; } 29 | 30 | __device__ float light_brightness(float x, float y, unsigned int width, 31 | unsigned int height, const light &light) 32 | { 33 | float norm_x = x / width; 34 | float norm_y = y / height; 35 | 36 | float dx = norm_x - light.x; 37 | float dy = norm_y - light.y; 38 | float distance_squared = dx * dx + dy * dy; 39 | if (distance_squared > light.radius * light.radius) { 40 | return 0; 41 | } 42 | float distance = sqrtf(distance_squared); 43 | 44 | float scaled_distance = distance / light.radius; 45 | if (scaled_distance > 0.8) { 46 | return (1.0f - (scaled_distance - 0.8f) * 5.0f) * light.brightness; 47 | } else { 48 | return light.brightness; 49 | } 50 | } 51 | 52 | template <class T> 53 | __device__ T *pointer2d(T *base_pointer, int x, int y, size_t pitch) 54 | { 55 | return (T *)((char *)base_pointer + y * pitch) + x; 56 | } 57 | 58 | __global__ void spotlights(const image source, image dest, unsigned int width, 59 | unsigned int height, size_t pitch, float ambient) 60 | { 61 | int x = blockIdx.x * blockDim.x + threadIdx.x; 62 | int y = blockIdx.y * blockDim.y + threadIdx.y; 63 | if (x >= width || y >= height) return; 64 | 65 | float brightness = ambient; 66 | for (int i = 0; i < dev_lights.count; i++) { 67 | brightness += light_brightness(x, y, width, height, dev_lights.lights[i]); 68 | } 69 | 70 | *pointer2d(dest.red, x, y, pitch) = 71 | clamp(*pointer2d(source.red, x, y, pitch) * brightness); 72 | *pointer2d(dest.green, x, y, pitch) = 73 | clamp(*pointer2d(source.green, x, y, pitch) * brightness); 74 | *pointer2d(dest.blue, x, y, pitch) = 75 | clamp(*pointer2d(source.blue, x, y, pitch) * brightness); 76 | } 77 | 78 | int main(int argc, char **argv) 79 | { 80 | auto params = set_up_test_planar(argc, argv); 81 | 82 | image input2d, output2d; 83 | size_t byte_width = params.width * sizeof(float); 84 | size_t pitch; 85 | 86 | // Allocate 2D aligned image 87 | cudaCheckError( 88 | cudaMallocPitch(&input2d.red, &pitch, byte_width, params.height)); 89 | // Copy from 1D to 2D image 90 | cudaCheckError(cudaMemcpy2D(input2d.red, pitch, params.input_image.red, 91 | byte_width, byte_width, params.height, 92 | cudaMemcpyDeviceToDevice)); 93 | 94 | // Allocate and copy other channels 95 | // Note: pitch will be the same for all of these allocations 96 | cudaCheckError( 97 | cudaMallocPitch(&input2d.green, &pitch, byte_width, params.height)); 98 | cudaCheckError( 99 | cudaMallocPitch(&input2d.blue, &pitch, byte_width, params.height)); 100 | cudaCheckError( 101 | cudaMallocPitch(&output2d.red, &pitch, byte_width, params.height)); 102 | cudaCheckError( 103 |
cudaMallocPitch(&output2d.green, &pitch, byte_width, params.height)); 104 | cudaCheckError( 105 | cudaMallocPitch(&output2d.blue, &pitch, byte_width, params.height)); 106 | cudaCheckError(cudaMemcpy2D(input2d.green, pitch, params.input_image.green, 107 | byte_width, byte_width, params.height, 108 | cudaMemcpyDeviceToDevice)); 109 | cudaCheckError(cudaMemcpy2D(input2d.blue, pitch, params.input_image.blue, 110 | byte_width, byte_width, params.height, 111 | cudaMemcpyDeviceToDevice)); 112 | 113 | lots_of_lights lights = {1024}; 114 | float spacing = 1.0f / 32.0f; 115 | for (int x = 0; x < 32; x++) { 116 | for (int y = 0; y < 32; y++) { 117 | int index = y * 32 + x; 118 | lights.lights[index] = {x * spacing, y * spacing, 0.05, 0.2}; 119 | } 120 | } 121 | 122 | cudaCheckError( 123 | cudaMemcpyToSymbol(dev_lights, &lights, sizeof(lots_of_lights))); 124 | 125 | dim3 BLOCK_DIM(32, 16); 126 | dim3 grid_dim((params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x, 127 | (params.height + BLOCK_DIM.y - 1) / BLOCK_DIM.y); 128 | 129 | { 130 | KernelTimer t; 131 | spotlights<<<grid_dim, BLOCK_DIM>>>(input2d, output2d, params.width, 132 | params.height, pitch, 0.0); 133 | } 134 | 135 | cudaCheckError(cudaMemcpy2D(params.output_image.red, byte_width, output2d.red, 136 | pitch, byte_width, params.height, 137 | cudaMemcpyDeviceToDevice)); 138 | cudaCheckError(cudaMemcpy2D(params.output_image.green, byte_width, 139 | output2d.green, pitch, byte_width, params.height, 140 | cudaMemcpyDeviceToDevice)); 141 | cudaCheckError(cudaMemcpy2D(params.output_image.blue, byte_width, 142 | output2d.blue, pitch, byte_width, params.height, 143 | cudaMemcpyDeviceToDevice)); 144 | 145 | free_image(input2d); 146 | free_image(output2d); 147 | 148 | finish_test_planar(params); 149 | 150 | return 0; 151 | } 152 | -------------------------------------------------------------------------------- /Code files/Section 3/3.4/manylights1.cu: -------------------------------------------------------------------------------- 1 | // Render many spotlights on an image. 2 | // This will not compile as it exceeds the maximum parameter space for 3 | // launching a kernel. 4 | // Example for video 3.4. 5 | 6 | #include <iostream> 7 | #include <math.h> 8 | #include <stdio.h> 9 | 10 | // Standard CUDA API functions 11 | #include <cuda_runtime_api.h> 12 | 13 | #include "../utils.h" 14 | 15 | struct light { 16 | float x; 17 | float y; 18 | float radius; 19 | float brightness; 20 | }; 21 | 22 | struct lots_of_lights { 23 | unsigned int count; 24 | light lights[1024]; 25 | }; 26 | 27 | __device__ float clamp(float value) { return value > 1.0f ?
1.0f : value; } 28 | 29 | __device__ float light_brightness(float x, float y, unsigned int width, 30 | unsigned int height, const light &light) 31 | { 32 | float norm_x = x / width; 33 | float norm_y = y / height; 34 | 35 | float dx = norm_x - light.x; 36 | float dy = norm_y - light.y; 37 | float distance_squared = dx * dx + dy * dy; 38 | if (distance_squared > light.radius * light.radius) { 39 | return 0; 40 | } 41 | float distance = sqrtf(distance_squared); 42 | 43 | float scaled_distance = distance / light.radius; 44 | if (scaled_distance > 0.8) { 45 | return (1.0f - (scaled_distance - 0.8f) * 5.0f) * light.brightness; 46 | } else { 47 | return light.brightness; 48 | } 49 | } 50 | 51 | template <class T> 52 | __device__ T *pointer2d(T *base_pointer, int x, int y, size_t pitch) 53 | { 54 | return (T *)((char *)base_pointer + y * pitch) + x; 55 | } 56 | 57 | __global__ void spotlights(const image source, image dest, unsigned int width, 58 | unsigned int height, size_t pitch, float ambient, 59 | lots_of_lights lights) 60 | { 61 | int x = blockIdx.x * blockDim.x + threadIdx.x; 62 | int y = blockIdx.y * blockDim.y + threadIdx.y; 63 | if (x >= width || y >= height) return; 64 | 65 | float brightness = ambient; 66 | for (int i = 0; i < lights.count; i++) { 67 | brightness += light_brightness(x, y, width, height, lights.lights[i]); 68 | } 69 | 70 | *pointer2d(dest.red, x, y, pitch) = 71 | clamp(*pointer2d(source.red, x, y, pitch) * brightness); 72 | *pointer2d(dest.green, x, y, pitch) = 73 | clamp(*pointer2d(source.green, x, y, pitch) * brightness); 74 | *pointer2d(dest.blue, x, y, pitch) = 75 | clamp(*pointer2d(source.blue, x, y, pitch) * brightness); 76 | } 77 | 78 | int main(int argc, char **argv) 79 | { 80 | auto params = set_up_test_planar(argc, argv); 81 | 82 | image input2d, output2d; 83 | size_t byte_width = params.width * sizeof(float); 84 | size_t pitch; 85 | 86 | // Allocate 2D aligned image 87 | cudaCheckError( 88 | cudaMallocPitch(&input2d.red, &pitch, byte_width, params.height)); 89 | // Copy from 1D to 2D image 90 | cudaCheckError(cudaMemcpy2D(input2d.red, pitch, params.input_image.red, 91 | byte_width, byte_width, params.height, 92 | cudaMemcpyDeviceToDevice)); 93 | 94 | // Allocate and copy other channels 95 | // Note: pitch will be the same for all of these allocations 96 | cudaCheckError( 97 | cudaMallocPitch(&input2d.green, &pitch, byte_width, params.height)); 98 | cudaCheckError( 99 | cudaMallocPitch(&input2d.blue, &pitch, byte_width, params.height)); 100 | cudaCheckError( 101 | cudaMallocPitch(&output2d.red, &pitch, byte_width, params.height)); 102 | cudaCheckError( 103 | cudaMallocPitch(&output2d.green, &pitch, byte_width, params.height)); 104 | cudaCheckError( 105 | cudaMallocPitch(&output2d.blue, &pitch, byte_width, params.height)); 106 | cudaCheckError(cudaMemcpy2D(input2d.green, pitch, params.input_image.green, 107 | byte_width, byte_width, params.height, 108 | cudaMemcpyDeviceToDevice)); 109 | cudaCheckError(cudaMemcpy2D(input2d.blue, pitch, params.input_image.blue, 110 | byte_width, byte_width, params.height, 111 | cudaMemcpyDeviceToDevice)); 112 | 113 | lots_of_lights lights = {1024}; 114 | float spacing = 1.0f / 32.0f; 115 | for (int x = 0; x < 32; x++) { 116 | for (int y = 0; y < 32; y++) { 117 | int index = y * 32 + x; 118 | lights.lights[index] = {x * spacing, y * spacing, 0.1, 0.5}; 119 | } 120 | } 121 | 122 | dim3 BLOCK_DIM(32, 16); 123 | dim3 grid_dim((params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x, 124 | (params.height + BLOCK_DIM.y - 1) / BLOCK_DIM.y); 125 | 126 | {
127 | KernelTimer t; 128 | spotlights<<<grid_dim, BLOCK_DIM>>>(input2d, output2d, params.width, 129 | params.height, pitch, 0.3, lights); 130 | } 131 | 132 | cudaCheckError(cudaMemcpy2D(params.output_image.red, byte_width, output2d.red, 133 | pitch, byte_width, params.height, 134 | cudaMemcpyDeviceToDevice)); 135 | cudaCheckError(cudaMemcpy2D(params.output_image.green, byte_width, 136 | output2d.green, pitch, byte_width, params.height, 137 | cudaMemcpyDeviceToDevice)); 138 | cudaCheckError(cudaMemcpy2D(params.output_image.blue, byte_width, 139 | output2d.blue, pitch, byte_width, params.height, 140 | cudaMemcpyDeviceToDevice)); 141 | 142 | free_image(input2d); 143 | free_image(output2d); 144 | 145 | finish_test_planar(params); 146 | 147 | return 0; 148 | } 149 | -------------------------------------------------------------------------------- /Code files/Section 3/3.4/manylights2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.4/manylights2 -------------------------------------------------------------------------------- /Code files/Section 3/3.4/manylights2.cu: -------------------------------------------------------------------------------- 1 | // Render many spotlights on an image, passing the light definitions in 2 | // global memory. This works, but is inefficient. 3 | // Example for video 3.4. 4 | 5 | #include <iostream> 6 | #include <math.h> 7 | #include <stdio.h> 8 | 9 | // Standard CUDA API functions 10 | #include <cuda_runtime_api.h> 11 | 12 | #include "../utils.h" 13 | 14 | struct light { 15 | float x; 16 | float y; 17 | float radius; 18 | float brightness; 19 | }; 20 | 21 | struct lots_of_lights { 22 | unsigned int count; 23 | light lights[1024]; 24 | }; 25 | 26 | __device__ float clamp(float value) { return value > 1.0f ?
1.0f : value; } 27 | 28 | __device__ float light_brightness(float x, float y, unsigned int width, 29 | unsigned int height, const light &light) 30 | { 31 | float norm_x = x / width; 32 | float norm_y = y / height; 33 | 34 | float dx = norm_x - light.x; 35 | float dy = norm_y - light.y; 36 | float distance_squared = dx * dx + dy * dy; 37 | if (distance_squared > light.radius * light.radius) { 38 | return 0; 39 | } 40 | float distance = sqrtf(distance_squared); 41 | 42 | float scaled_distance = distance / light.radius; 43 | if (scaled_distance > 0.8) { 44 | return (1.0f - (scaled_distance - 0.8f) * 5.0f) * light.brightness; 45 | } else { 46 | return light.brightness; 47 | } 48 | } 49 | 50 | template <class T> 51 | __device__ T *pointer2d(T *base_pointer, int x, int y, size_t pitch) 52 | { 53 | return (T *)((char *)base_pointer + y * pitch) + x; 54 | } 55 | 56 | __global__ void spotlights(const image source, image dest, unsigned int width, 57 | unsigned int height, size_t pitch, float ambient, 58 | lots_of_lights *lights) 59 | { 60 | int x = blockIdx.x * blockDim.x + threadIdx.x; 61 | int y = blockIdx.y * blockDim.y + threadIdx.y; 62 | if (x >= width || y >= height) return; 63 | 64 | float brightness = ambient; 65 | for (int i = 0; i < lights->count; i++) { 66 | brightness += light_brightness(x, y, width, height, lights->lights[i]); 67 | } 68 | 69 | *pointer2d(dest.red, x, y, pitch) = 70 | clamp(*pointer2d(source.red, x, y, pitch) * brightness); 71 | *pointer2d(dest.green, x, y, pitch) = 72 | clamp(*pointer2d(source.green, x, y, pitch) * brightness); 73 | *pointer2d(dest.blue, x, y, pitch) = 74 | clamp(*pointer2d(source.blue, x, y, pitch) * brightness); 75 | } 76 | 77 | int main(int argc, char **argv) 78 | { 79 | auto params = set_up_test_planar(argc, argv); 80 | 81 | image input2d, output2d; 82 | size_t byte_width = params.width * sizeof(float); 83 | size_t pitch; 84 | 85 | // Allocate 2D aligned image 86 | cudaCheckError( 87 | cudaMallocPitch(&input2d.red, &pitch, byte_width, params.height)); 88 | // Copy from 1D to 2D image 89 | cudaCheckError(cudaMemcpy2D(input2d.red, pitch, params.input_image.red, 90 | byte_width, byte_width, params.height, 91 | cudaMemcpyDeviceToDevice)); 92 | 93 | // Allocate and copy other channels 94 | // Note: pitch will be the same for all of these allocations 95 | cudaCheckError( 96 | cudaMallocPitch(&input2d.green, &pitch, byte_width, params.height)); 97 | cudaCheckError( 98 | cudaMallocPitch(&input2d.blue, &pitch, byte_width, params.height)); 99 | cudaCheckError( 100 | cudaMallocPitch(&output2d.red, &pitch, byte_width, params.height)); 101 | cudaCheckError( 102 | cudaMallocPitch(&output2d.green, &pitch, byte_width, params.height)); 103 | cudaCheckError( 104 | cudaMallocPitch(&output2d.blue, &pitch, byte_width, params.height)); 105 | cudaCheckError(cudaMemcpy2D(input2d.green, pitch, params.input_image.green, 106 | byte_width, byte_width, params.height, 107 | cudaMemcpyDeviceToDevice)); 108 | cudaCheckError(cudaMemcpy2D(input2d.blue, pitch, params.input_image.blue, 109 | byte_width, byte_width, params.height, 110 | cudaMemcpyDeviceToDevice)); 111 | 112 | lots_of_lights lights = {1024}; 113 | float spacing = 1.0f / 32.0f; 114 | for (int x = 0; x < 32; x++) { 115 | for (int y = 0; y < 32; y++) { 116 | int index = y * 32 + x; 117 | lights.lights[index] = {x * spacing, y * spacing, 0.05, 0.2}; 118 | } 119 | } 120 | 121 | lots_of_lights *dev_lights; 122 | cudaCheckError(cudaMalloc(&dev_lights, sizeof(lots_of_lights))); 123 | cudaCheckError(cudaMemcpy(dev_lights, &lights,
sizeof(lots_of_lights), 124 | cudaMemcpyHostToDevice)); 125 | 126 | dim3 BLOCK_DIM(32, 16); 127 | dim3 grid_dim((params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x, 128 | (params.height + BLOCK_DIM.y - 1) / BLOCK_DIM.y); 129 | 130 | { 131 | KernelTimer t; 132 | spotlights<<<grid_dim, BLOCK_DIM>>>(input2d, output2d, params.width, 133 | params.height, pitch, 0.0, dev_lights); 134 | } 135 | 136 | cudaCheckError(cudaFree(dev_lights)); 137 | 138 | cudaCheckError(cudaMemcpy2D(params.output_image.red, byte_width, output2d.red, 139 | pitch, byte_width, params.height, 140 | cudaMemcpyDeviceToDevice)); 141 | cudaCheckError(cudaMemcpy2D(params.output_image.green, byte_width, 142 | output2d.green, pitch, byte_width, params.height, 143 | cudaMemcpyDeviceToDevice)); 144 | cudaCheckError(cudaMemcpy2D(params.output_image.blue, byte_width, 145 | output2d.blue, pitch, byte_width, params.height, 146 | cudaMemcpyDeviceToDevice)); 147 | 148 | free_image(input2d); 149 | free_image(output2d); 150 | 151 | finish_test_planar(params); 152 | 153 | return 0; 154 | } 155 | -------------------------------------------------------------------------------- /Code files/Section 3/3.4/warp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.4/warp -------------------------------------------------------------------------------- /Code files/Section 3/3.4/warp-texture: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.4/warp-texture -------------------------------------------------------------------------------- /Code files/Section 3/3.4/warp-texture.cu: -------------------------------------------------------------------------------- 1 | // Image warping using texture memory to improve performance. 2 | // Example for video 3.4.
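// Added note: compared with a plain-memory warp, the texture path in this
// file gets two things from the hardware: cudaFilterModeLinear performs the
// bilinear interpolation that otherwise needs an average_pixels()-style
// helper, and cudaAddressModeBorder returns 0 for out-of-range reads,
// replacing the manual bounds check in get_pixel().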

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

#include "../utils.h"

struct warp_params {
  float matrix[4];
  float inverse_matrix[4];
  float x_shift;
  float y_shift;
};

struct texture_image {
  cudaTextureObject_t red;
  cudaTextureObject_t green;
  cudaTextureObject_t blue;
};

template <typename T>
__device__ T *pointer2d(T *base_pointer, int x, int y, size_t pitch)
{
  return (T *)((char *)base_pointer + y * pitch) + x;
}

__device__ float get_pixel(const float *source, unsigned int width,
                           unsigned int height, size_t pitch, int x, int y)
{
  if (x < 0 || x >= width || y < 0 || y >= height) {
    return 0.0f;
  } else {
    return *pointer2d(source, x, y, pitch);
  }
}

__device__ float average_pixels(const float *source, unsigned int width,
                                unsigned int height, size_t pitch, int x0,
                                float weight_x, int y0, float weight_y)

{
  float p00 = get_pixel(source, width, height, pitch, x0, y0);
  float p01 = get_pixel(source, width, height, pitch, x0, y0 + 1);
  float p10 = get_pixel(source, width, height, pitch, x0 + 1, y0);
  float p11 = get_pixel(source, width, height, pitch, x0 + 1, y0 + 1);

  return (p00 * weight_x + p10 * (1.0f - weight_x)) * weight_y +
         (p01 * weight_x + p11 * (1.0f - weight_x)) * (1.0f - weight_y);
}

__global__ void warp_image(texture_image source, image dest, unsigned int width,
                           unsigned int height, size_t pitch,
                           warp_params params)
{
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= width || y >= height) return;

  float source_x = params.inverse_matrix[0] * x + params.inverse_matrix[1] * y -
                   params.x_shift;
  float source_y = params.inverse_matrix[2] * x + params.inverse_matrix[3] * y -
                   params.y_shift;

  *pointer2d(dest.red, x, y, pitch) =
      tex2D<float>(source.red, source_x, source_y);
  *pointer2d(dest.green, x, y, pitch) =
      tex2D<float>(source.green, source_x, source_y);
  *pointer2d(dest.blue, x, y, pitch) =
      tex2D<float>(source.blue, source_x, source_y);
}

static void mult_matrix(float mat[4], float a, float b, float c, float d)
{
  float dst_a = mat[0] * a + mat[1] * c;
  float dst_b = mat[0] * b + mat[1] * d;
  float dst_c = mat[2] * a + mat[3] * c;
  float dst_d = mat[2] * b + mat[3] * d;

  mat[0] = dst_a;
  mat[1] = dst_b;
  mat[2] = dst_c;
  mat[3] = dst_d;
}

static void invert_matrix(float inverse[4], const float mat[4])
{
  float determinant = mat[0] * mat[3] - mat[1] * mat[2];
  assert(determinant != 0);  // Shouldn't happen if scales are non-zero
  float inverse_determinant = 1.0f / determinant;

  inverse[0] = mat[3] * inverse_determinant;
  inverse[1] = -1 * mat[1] * inverse_determinant;
  inverse[2] = -1 * mat[2] * inverse_determinant;
  inverse[3] = mat[0] * inverse_determinant;
}

int main(int argc, char **argv)
{
  auto params = set_up_test_planar(argc, argv);
  image output2d;
  size_t byte_width = params.width * sizeof(float);
  size_t pitch;

  cudaCheckError(
      cudaMallocPitch(&output2d.red, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.green, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.blue, &pitch, byte_width, params.height));

  // Set up warp parameters
  const float SCALE = 0.65f;
  const float ROTATE_RADS = 0.3;
  warp_params warp;
  // Scaling matrix
  warp.matrix[0] = warp.matrix[3] = SCALE;
  warp.matrix[1] = warp.matrix[2] = 0;
  // Add rotation
  mult_matrix(warp.matrix, cosf(ROTATE_RADS), sinf(ROTATE_RADS),
              -1 * sinf(ROTATE_RADS), cosf(ROTATE_RADS));
  // Kernel will use inverse
  invert_matrix(warp.inverse_matrix, warp.matrix);
  // Add translation
  warp.x_shift = 0.1f * params.width;
  warp.y_shift = 0.3f * params.height;

  // Create arrays: opaque memory layouts optimized for texture
  // fetching. Copy our input images to them.
  cudaChannelFormatDesc channelDesc =
      cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
  cudaArray *red_array, *green_array, *blue_array;
  cudaCheckError(
      cudaMallocArray(&red_array, &channelDesc, params.width, params.height));
  cudaCheckError(cudaMemcpy2DToArray(
      red_array, 0, 0, params.input_image.red, params.width * sizeof(float),
      params.width * sizeof(float), params.height, cudaMemcpyDeviceToDevice));
  cudaCheckError(
      cudaMallocArray(&green_array, &channelDesc, params.width, params.height));
  cudaCheckError(cudaMemcpy2DToArray(
      green_array, 0, 0, params.input_image.green, params.width * sizeof(float),
      params.width * sizeof(float), params.height, cudaMemcpyDeviceToDevice));
  cudaCheckError(
      cudaMallocArray(&blue_array, &channelDesc, params.width, params.height));
  cudaCheckError(cudaMemcpy2DToArray(
      blue_array, 0, 0, params.input_image.blue, params.width * sizeof(float),
      params.width * sizeof(float), params.height, cudaMemcpyDeviceToDevice));

  // Create resource descriptions for each channel, for use in texture setup.
  struct cudaResourceDesc red_resource = {cudaResourceTypeArray};
  red_resource.res.array.array = red_array;
  struct cudaResourceDesc green_resource = {cudaResourceTypeArray};
  green_resource.res.array.array = green_array;
  struct cudaResourceDesc blue_resource = {cudaResourceTypeArray};
  blue_resource.res.array.array = blue_array;

  // Create texture description, specifying settings for texture fetches.
  struct cudaTextureDesc texture_desc = {};
  texture_desc.addressMode[0] = cudaAddressModeBorder;
  texture_desc.addressMode[1] = cudaAddressModeBorder;
  texture_desc.filterMode = cudaFilterModeLinear;
  texture_desc.readMode = cudaReadModeElementType;
  texture_desc.normalizedCoords = 0;

  // Create texture objects which combine the resources and the texture
  // descriptions.
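  // (Editorial note: a texture object is an opaque handle that pairs one
  // resource description with one texture description; the same cudaArray
  // could be referenced by several objects with different filter or address
  // modes. The creation calls below are not wrapped in cudaCheckError in
  // this example, so a setup failure would only surface later, at fetch
  // time.)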
  texture_image source_texture;
  cudaCreateTextureObject(&source_texture.red, &red_resource, &texture_desc,
                          NULL);
  cudaCreateTextureObject(&source_texture.green, &green_resource, &texture_desc,
                          NULL);
  cudaCreateTextureObject(&source_texture.blue, &blue_resource, &texture_desc,
                          NULL);

  dim3 BLOCK_DIM(32, 16);
  dim3 grid_dim((params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x,
                (params.height + BLOCK_DIM.y - 1) / BLOCK_DIM.y);

  {
    KernelTimer t;
    warp_image<<<grid_dim, BLOCK_DIM>>>(source_texture, output2d, params.width,
                                        params.height, pitch, warp);
  }

  cudaCheckError(cudaDestroyTextureObject(source_texture.red));
  cudaCheckError(cudaDestroyTextureObject(source_texture.green));
  cudaCheckError(cudaDestroyTextureObject(source_texture.blue));

  cudaCheckError(cudaFreeArray(red_array));
  cudaCheckError(cudaFreeArray(green_array));
  cudaCheckError(cudaFreeArray(blue_array));

  cudaCheckError(cudaMemcpy2D(params.output_image.red, byte_width, output2d.red,
                              pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(params.output_image.green, byte_width,
                              output2d.green, pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(params.output_image.blue, byte_width,
                              output2d.blue, pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));

  free_image(output2d);

  finish_test_planar(params);

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 3/3.4/warp.cu:
--------------------------------------------------------------------------------
// Image warping using global memory.
// Reads are uncoalesced so performance is not optimal.
// Example for video 3.4.
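//
// (Editorial note: the kernel below gathers each output pixel from a
// rotated and scaled source coordinate, so adjacent threads read source
// addresses that are neither consecutive nor aligned; those reads cannot
// coalesce into full cache-line transactions the way the row-major writes
// do, which is the performance gap the texture version targets.)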

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

#include "../utils.h"

struct warp_params {
  float matrix[4];
  float inverse_matrix[4];
  float x_shift;
  float y_shift;
};

template <typename T>
__device__ T *pointer2d(T *base_pointer, int x, int y, size_t pitch)
{
  return (T *)((char *)base_pointer + y * pitch) + x;
}

__device__ float get_pixel(const float *source, unsigned int width,
                           unsigned int height, size_t pitch, int x, int y)
{
  if (x < 0 || x >= width || y < 0 || y >= height) {
    return 0.0f;
  } else {
    return *pointer2d(source, x, y, pitch);
  }
}

__device__ float average_pixels(const float *source, unsigned int width,
                                unsigned int height, size_t pitch, int x0,
                                float weight_x, int y0, float weight_y)

{
  float p00 = get_pixel(source, width, height, pitch, x0, y0);
  float p01 = get_pixel(source, width, height, pitch, x0, y0 + 1);
  float p10 = get_pixel(source, width, height, pitch, x0 + 1, y0);
  float p11 = get_pixel(source, width, height, pitch, x0 + 1, y0 + 1);

  return (p00 * weight_x + p10 * (1.0f - weight_x)) * weight_y +
         (p01 * weight_x + p11 * (1.0f - weight_x)) * (1.0f - weight_y);
}

__global__ void warp_image(const image source, image dest, unsigned int width,
                           unsigned int height, size_t pitch,
                           warp_params params)
{
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= width || y >= height) return;

  float source_x = params.inverse_matrix[0] * x + params.inverse_matrix[1] * y -
                   params.x_shift;
  float source_y = params.inverse_matrix[2] * x + params.inverse_matrix[3] * y -
                   params.y_shift;

  float x0 = floorf(source_x);
  float weight_x = source_x - x0;
  int x0_int = static_cast<int>(x0);
  float y0 = floorf(source_y);
  float weight_y = source_y - y0;
  int y0_int = static_cast<int>(y0);

  *pointer2d(dest.red, x, y, pitch) = average_pixels(
      source.red, width, height, pitch, x0_int, weight_x, y0_int, weight_y);
  *pointer2d(dest.green, x, y, pitch) = average_pixels(
      source.green, width, height, pitch, x0_int, weight_x, y0_int, weight_y);
  *pointer2d(dest.blue, x, y, pitch) = average_pixels(
      source.blue, width, height, pitch, x0_int, weight_x, y0_int, weight_y);
}

static void mult_matrix(float mat[4], float a, float b, float c, float d)
{
  float dst_a = mat[0] * a + mat[1] * c;
  float dst_b = mat[0] * b + mat[1] * d;
  float dst_c = mat[2] * a + mat[3] * c;
  float dst_d = mat[2] * b + mat[3] * d;

  mat[0] = dst_a;
  mat[1] = dst_b;
  mat[2] = dst_c;
  mat[3] = dst_d;
}

static void invert_matrix(float inverse[4], const float mat[4])
{
  float determinant = mat[0] * mat[3] - mat[1] * mat[2];
  assert(determinant != 0);  // Shouldn't happen if scales are non-zero
  float inverse_determinant = 1.0f / determinant;

  inverse[0] = mat[3] * inverse_determinant;
  inverse[1] = -1 * mat[1] * inverse_determinant;
  inverse[2] = -1 * mat[2] * inverse_determinant;
  inverse[3] = mat[0] * inverse_determinant;
}

int main(int argc, char **argv)
{
  auto params = set_up_test_planar(argc, argv);
  image input2d, output2d;
  size_t byte_width = params.width * sizeof(float);
  size_t pitch;

  // Allocate 2D aligned image
  cudaCheckError(
      cudaMallocPitch(&input2d.red, &pitch, byte_width, params.height));
  // Copy from 1D to 2D image
  cudaCheckError(cudaMemcpy2D(input2d.red, pitch, params.input_image.red,
                              byte_width, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));

  // Allocate and copy other channels
  // Note: pitch will be the same for all of these allocations
  cudaCheckError(
      cudaMallocPitch(&input2d.green, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&input2d.blue, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.red, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.green, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.blue, &pitch, byte_width, params.height));
  cudaCheckError(cudaMemcpy2D(input2d.green, pitch, params.input_image.green,
                              byte_width, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(input2d.blue, pitch, params.input_image.blue,
                              byte_width, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));

  // Set up warp parameters
  const float SCALE = 0.65f;
  const float ROTATE_RADS = 0.3;
  warp_params warp;
  // Scaling matrix
  warp.matrix[0] = warp.matrix[3] = SCALE;
  warp.matrix[1] = warp.matrix[2] = 0;
  // Add rotation
  mult_matrix(warp.matrix, cosf(ROTATE_RADS), sinf(ROTATE_RADS),
              -1 * sinf(ROTATE_RADS), cosf(ROTATE_RADS));
  // Kernel will use inverse
  invert_matrix(warp.inverse_matrix, warp.matrix);
  // Add translation
  warp.x_shift = 0.1f * params.width;
  warp.y_shift = 0.3f * params.height;

  dim3 BLOCK_DIM(32, 16);
  dim3 grid_dim((params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x,
                (params.height + BLOCK_DIM.y - 1) / BLOCK_DIM.y);

  {
    KernelTimer t;
    warp_image<<<grid_dim, BLOCK_DIM>>>(input2d, output2d, params.width,
                                        params.height, pitch, warp);
  }

  cudaCheckError(cudaMemcpy2D(params.output_image.red, byte_width, output2d.red,
                              pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(params.output_image.green, byte_width,
                              output2d.green, pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(params.output_image.blue, byte_width,
                              output2d.blue, pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));

  free_image(input2d);
  free_image(output2d);

  finish_test_planar(params);

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 3/3.5/Makefile:
--------------------------------------------------------------------------------
CUDAFLAGS ?= -g

ALL = manylights-ilp

all: $(ALL)

../utils.o: ../utils.cu ../utils.h
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<

%: %.cu ../utils.o
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^

# Dynamic parallelism requires separate compilation of kernels and
# host code
bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu

bst-sum.o: ../utils.h bst-sum.cu
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu

bst-sum: bst-sum.o bst-sum-kernels.o
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o

clean:
	rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
--------------------------------------------------------------------------------
/Code files/Section 3/3.5/manylights-ilp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.5/manylights-ilp
--------------------------------------------------------------------------------
/Code files/Section 3/3.5/manylights-ilp.cu:
--------------------------------------------------------------------------------
// Render many spotlights on an image, computing multiple results per thread
// in order to increase instruction-level parallelism.
// Example for video 3.5.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

#include "../utils.h"

struct light {
  float x;
  float y;
  float radius;
  float brightness;
};

struct lots_of_lights {
  unsigned int count;
  light lights[1024];
};

__constant__ lots_of_lights dev_lights;

__device__ float clamp(float value) { return value > 1.0f ? 1.0f : value; }

__device__ float light_brightness(float x, float y, unsigned int width,
                                  unsigned int height, const light &light)
{
  float norm_x = x / width;
  float norm_y = y / height;

  float dx = norm_x - light.x;
  float dy = norm_y - light.y;
  float distance_squared = dx * dx + dy * dy;
  if (distance_squared > light.radius * light.radius) {
    return 0;
  }
  float distance = sqrtf(distance_squared);

  float scaled_distance = distance / light.radius;
  if (scaled_distance > 0.8) {
    return (1.0f - (scaled_distance - 0.8f) * 5.0f) * light.brightness;
  } else {
    return light.brightness;
  }
}

template <typename T>
__device__ T *pointer2d(T *base_pointer, int x, int y, size_t pitch)
{
  return (T *)((char *)base_pointer + y * pitch) + x;
}

const int OUTPUTS_PER_THREAD = 2;

__global__ void spotlights(const image source, image dest, unsigned int width,
                           unsigned int height, size_t pitch, float ambient)
{
  for (int i = 0; i < OUTPUTS_PER_THREAD; i++) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = OUTPUTS_PER_THREAD * blockIdx.y * blockDim.y + threadIdx.y +
            i * blockDim.y;
    if (x >= width || y >= height) return;

    float brightness = ambient;
    for (int i = 0; i < dev_lights.count; i++) {
      brightness += light_brightness(x, y, width, height, dev_lights.lights[i]);
    }

    *pointer2d(dest.red, x, y, pitch) =
        clamp(*pointer2d(source.red, x, y, pitch) * brightness);
    *pointer2d(dest.green, x, y, pitch) =
        clamp(*pointer2d(source.green, x, y, pitch) * brightness);
    *pointer2d(dest.blue, x, y, pitch) =
        clamp(*pointer2d(source.blue, x, y, pitch) * brightness);
  }
}

int main(int argc, char **argv)
{
  auto params = set_up_test_planar(argc, argv);

  image input2d, output2d;
  size_t byte_width = params.width * sizeof(float);
  size_t pitch;

  // Allocate 2D aligned image
  cudaCheckError(
      cudaMallocPitch(&input2d.red, &pitch, byte_width, params.height));
  // Copy from 1D to 2D image
  cudaCheckError(cudaMemcpy2D(input2d.red, pitch, params.input_image.red,
                              byte_width, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));

  // Allocate and copy other channels
  // Note: pitch will be the same for all of these allocations
  cudaCheckError(
      cudaMallocPitch(&input2d.green, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&input2d.blue, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.red, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.green, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.blue, &pitch, byte_width, params.height));
  cudaCheckError(cudaMemcpy2D(input2d.green, pitch, params.input_image.green,
                              byte_width, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(input2d.blue, pitch, params.input_image.blue,
                              byte_width, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));

  lots_of_lights lights = {1024};
  float spacing = 1.0f / 32.0f;
  for (int x = 0; x < 32; x++) {
    for (int y = 0; y < 32; y++) {
      int index = y * 32 + x;
      lights.lights[index] = {x * spacing, y * spacing, 0.05, 0.2};
    }
  }

  cudaCheckError(
      cudaMemcpyToSymbol(dev_lights, &lights, sizeof(lots_of_lights)));

  dim3 BLOCK_DIM(32, 16);
  dim3 grid_dim(
      (params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x,
      (params.height + BLOCK_DIM.y - 1) / (BLOCK_DIM.y * OUTPUTS_PER_THREAD));

  {
    KernelTimer t;
    spotlights<<<grid_dim, BLOCK_DIM>>>(input2d, output2d, params.width,
                                        params.height, pitch, 0.0);
  }

  cudaCheckError(cudaMemcpy2D(params.output_image.red, byte_width, output2d.red,
                              pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(params.output_image.green, byte_width,
                              output2d.green, pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(params.output_image.blue, byte_width,
                              output2d.blue, pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));

  free_image(input2d);
  free_image(output2d);

  finish_test_planar(params);

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 4/4.1/Makefile:
--------------------------------------------------------------------------------
CUDAFLAGS ?= -g

ALL = transpose transpose-shared

all: $(ALL)

../utils.o: ../utils.cu ../utils.h
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<

%: %.cu ../utils.o
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^

# Dynamic parallelism requires separate compilation of kernels and
# host code
bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu

bst-sum.o: ../utils.h bst-sum.cu
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu

bst-sum: bst-sum.o bst-sum-kernels.o
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o

clean:
	rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
--------------------------------------------------------------------------------
/Code files/Section 4/4.1/transpose:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 4/4.1/transpose
--------------------------------------------------------------------------------
/Code files/Section 4/4.1/transpose-shared:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 4/4.1/transpose-shared
--------------------------------------------------------------------------------
/Code files/Section 4/4.1/transpose-shared.cu:
--------------------------------------------------------------------------------
// Matrix transpose using shared memory to ensure all writes coalesce.
// Example for video 4.1.

#include <assert.h>
#include <memory>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

// CUDA cooperative groups API
#include <cooperative_groups.h>

#include "../utils.h"

const int TILE_DIM = 16;

// Reference implementation on the host
void transpose_reference(const float *source, float *dest,
                         unsigned int dimension)
{
  for (int y = 0; y < dimension; y++) {
    for (int x = 0; x < dimension; x++) {
      dest[y + x * dimension] = source[x + y * dimension];
    }
  }
}

// Transpose a matrix
// For simplicity, we assume that the matrix is square, and that its
// dimension is a multiple of the block size, so we don't have to worry about
// pitch or bounds checking.
__global__ void transpose(const float *source, float *dest,
                          unsigned int dimension)
{
  // Shared memory to temporarily store data.
  // Note the padding of the Y dimension, to avoid bank conflicts.
  __shared__ float tile[TILE_DIM][TILE_DIM + 1];

  int x_in = blockIdx.x * blockDim.x + threadIdx.x;
  int y_in = blockIdx.y * blockDim.y + threadIdx.y;
  int source_index = x_in + y_in * dimension;

  // Read from global memory to shared memory. Global memory access is
  // aligned.
  tile[threadIdx.y][threadIdx.x] = source[source_index];

  // Wait for all threads in the block to finish, so the shared memory tile
  // is filled.
  cooperative_groups::thread_block block =
      cooperative_groups::this_thread_block();
  cooperative_groups::sync(block);

  // Output coordinates. Note that blockIdx.y is used to determine x_out, and
  // blockIdx.x is used to determine y_out.
  int x_out = blockIdx.y * blockDim.y + threadIdx.x;
  int y_out = blockIdx.x * blockDim.y + threadIdx.y;
  int dest_index = x_out + y_out * dimension;

  // Read from a different index in the shared memory tile, and write to
  // global memory. Global memory access is once again aligned.
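  // (Editorial note: swapping the index roles here is what keeps the store
  // coalesced: thread (tx, ty) wrote tile[ty][tx] from a row of the source
  // and now reads tile[tx][ty], a column of the tile, so consecutive tx
  // values still produce consecutive addresses in dest. The +1 padding in
  // the tile declaration keeps those column reads in distinct shared-memory
  // banks.)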
  dest[dest_index] = tile[threadIdx.x][threadIdx.y];
}

int main(int argc, char **argv)
{
  const unsigned int DIMENSION = 4096;
  const unsigned int COUNT = DIMENSION * DIMENSION;
  std::unique_ptr<float[]> source(new float[COUNT]);
  std::unique_ptr<float[]> dest(new float[COUNT]);

  // Fill source matrix with some arbitrary test values
  for (int i = 0; i < COUNT; i++) {
    source[i] = i;
  }

  // Allocate and fill device memory
  float *source_dev, *dest_dev;
  size_t size = COUNT * sizeof(float);
  cudaCheckError(cudaMalloc(&dest_dev, size));
  cudaCheckError(cudaMalloc(&source_dev, size));
  cudaCheckError(
      cudaMemcpy(source_dev, source.get(), size, cudaMemcpyHostToDevice));

  // Run the kernel
  dim3 block_dim(TILE_DIM, TILE_DIM);
  dim3 grid_dim((DIMENSION + block_dim.x - 1) / block_dim.x,
                (DIMENSION + block_dim.y - 1) / block_dim.y);

  {
    KernelTimer t;
    transpose<<<grid_dim, block_dim>>>(source_dev, dest_dev, DIMENSION);
  }

  // Copy results back to the host
  cudaCheckError(
      cudaMemcpy(dest.get(), dest_dev, size, cudaMemcpyDeviceToHost));
  cudaCheckError(cudaFree(dest_dev));
  cudaCheckError(cudaFree(source_dev));

  // Compare with reference implementation
  std::unique_ptr<float[]> dest_reference(new float[COUNT]);
  transpose_reference(source.get(), dest_reference.get(), DIMENSION);

  for (int i = 0; i < COUNT; i++) {
    assert(dest_reference.get()[i] == dest.get()[i]);
  }

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 4/4.1/transpose.cu:
--------------------------------------------------------------------------------
// Matrix transpose with direct access to global memory.
// Writes are uncoalesced.
// Example for video 4.1.

#include <assert.h>
#include <memory>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

#include "../utils.h"

const int TILE_DIM = 16;

// Reference implementation on the host
void transpose_reference(const float *source, float *dest,
                         unsigned int dimension)
{
  for (int y = 0; y < dimension; y++) {
    for (int x = 0; x < dimension; x++) {
      dest[y + x * dimension] = source[x + y * dimension];
    }
  }
}

// Transpose a matrix
// For simplicity, we assume that the matrix is square, and that its
// dimension is a multiple of the block size, so we don't have to worry about
// pitch or bounds checking.
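// (Editorial note: each thread in the kernel below writes dest at a stride
// of `dimension` floats; with dimension = 4096 the 32 threads of a warp
// store addresses 16 KB apart, so every warp store splits into 32 separate
// transactions. transpose-shared.cu removes exactly this bottleneck.)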
__global__ void transpose(const float *source, float *dest,
                          unsigned int dimension)
{
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;

  int source_index = y * dimension + x;
  int dest_index = x * dimension + y;

  dest[dest_index] = source[source_index];
}

int main(int argc, char **argv)
{
  const unsigned int DIMENSION = 4096;
  const unsigned int COUNT = DIMENSION * DIMENSION;
  std::unique_ptr<float[]> source(new float[COUNT]);
  std::unique_ptr<float[]> dest(new float[COUNT]);

  // Fill source matrix with some arbitrary test values
  for (int i = 0; i < COUNT; i++) {
    source[i] = i;
  }

  // Allocate and fill device memory
  float *source_dev, *dest_dev;
  size_t size = COUNT * sizeof(float);
  cudaCheckError(cudaMalloc(&dest_dev, size));
  cudaCheckError(cudaMalloc(&source_dev, size));
  cudaCheckError(
      cudaMemcpy(source_dev, source.get(), size, cudaMemcpyHostToDevice));

  // Run the kernel
  dim3 block_dim(TILE_DIM, TILE_DIM);
  dim3 grid_dim((DIMENSION + block_dim.x - 1) / block_dim.x,
                (DIMENSION + block_dim.y - 1) / block_dim.y);

  {
    KernelTimer t;
    transpose<<<grid_dim, block_dim>>>(source_dev, dest_dev, DIMENSION);
  }

  // Copy results back to the host
  cudaCheckError(
      cudaMemcpy(dest.get(), dest_dev, size, cudaMemcpyDeviceToHost));
  cudaCheckError(cudaFree(dest_dev));
  cudaCheckError(cudaFree(source_dev));

  // Compare with reference implementation
  std::unique_ptr<float[]> dest_reference(new float[COUNT]);
  transpose_reference(source.get(), dest_reference.get(), DIMENSION);

  for (int i = 0; i < COUNT; i++) {
    assert(dest_reference.get()[i] == dest.get()[i]);
  }

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 4/4.2/Makefile:
--------------------------------------------------------------------------------
CUDAFLAGS ?= -g

ALL = reduce

all: $(ALL)

../utils.o: ../utils.cu ../utils.h
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<

%: %.cu ../utils.o
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^

# Dynamic parallelism requires separate compilation of kernels and
# host code
bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu

bst-sum.o: ../utils.h bst-sum.cu
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu

bst-sum: bst-sum.o bst-sum-kernels.o
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o

clean:
	rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
--------------------------------------------------------------------------------
/Code files/Section 4/4.2/reduce:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 4/4.2/reduce
--------------------------------------------------------------------------------
/Code files/Section 4/4.2/reduce.cu:
--------------------------------------------------------------------------------
// Reduce an array to a single value by summing all of its elements.
// Example for video 4.1.
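//
// (Editorial sketch of the structure: each block folds 2 * blockDim.x input
// elements into one partial sum in dest[blockIdx.x]; the last block to take
// a ticket from an atomic counter then adds the per-block partials serially,
// so the whole reduction finishes in a single kernel launch rather than a
// recursive series of launches.)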

#include <assert.h>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

// CUDA cooperative groups API
#include <cooperative_groups.h>

#include "../utils.h"

__device__ unsigned int blocks_finished = 0;
// Wait for all blocks in the grid to execute this function.
// Returns true for thread 0 of the last block, false for all
// other threads.
__device__ bool wait_for_all_blocks()
{
  // Wait until global write is visible to all other blocks
  __threadfence();

  // Wait for all blocks to finish by atomically incrementing a counter
  bool is_last = false;
  if (threadIdx.x == 0) {
    unsigned int ticket = atomicInc(&blocks_finished, gridDim.x);
    is_last = (ticket == gridDim.x - 1);
  }
  if (is_last) {
    blocks_finished = 0;
  }
  return is_last;
}

__device__ int reduce_block(const int *source, int sdata[],
                            cooperative_groups::thread_block block)
{
  unsigned int index = blockIdx.x * blockDim.x * 2 + threadIdx.x;
  auto tid = threadIdx.x;

  // Add two elements into shared memory
  sdata[tid] = source[index] + source[index + blockDim.x];

  cooperative_groups::sync(block);

  // When shared memory block is filled, reduce within that block.
  for (int stride = 1; stride < blockDim.x; stride *= 2) {
    int index = 2 * stride * tid;
    if (index < blockDim.x) {
      sdata[index] += sdata[index + stride];
    }
    cooperative_groups::sync(block);
  }

  return sdata[0];
}

// Sum the source array. The dest array must have one element per block --
// the first element will contain the final result, and the rest are used for
// temporary storage.
__global__ void reduce(const int *source, int *dest)
{
  extern __shared__ int sdata[];

  int block_result =
      reduce_block(source, sdata, cooperative_groups::this_thread_block());

  // The last thread of each block writes the block result into global memory
  if (threadIdx.x == 0) {
    dest[blockIdx.x] = block_result;
  }

  bool is_last = wait_for_all_blocks();

  // All blocks have passed the threadfence, so all writes are visible to all
  // blocks. Now we can use one thread to sum the results from each block.
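  // (Editorial note: exactly one thread in the whole grid sees is_last ==
  // true, namely thread 0 of whichever block drew the final ticket from
  // atomicInc, so the loop below over gridDim.x partial sums runs once and
  // needs no further synchronization.)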
  if (is_last) {
    int sum = 0;
    for (int i = 0; i < gridDim.x; i++) {
      sum += dest[i];
    }
    // Final sum goes in dest[0]
    dest[0] = sum;
  }
}

int main(int argc, char **argv)
{
  const unsigned int COUNT = 4096 * 4096;
  std::unique_ptr<int[]> source(new int[COUNT]);

  // Fill source matrix with some arbitrary test values
  std::mt19937 rng;
  rng.seed(0);
  std::uniform_int_distribution<int> dist(0, 9);

  for (int i = 0; i < COUNT; i++) {
    source[i] = dist(rng);
  }

  // Allocate and fill device memory
  int *source_dev, *dest_dev;
  size_t size = COUNT * sizeof(int);
  cudaCheckError(cudaMalloc(&source_dev, size));
  cudaCheckError(
      cudaMemcpy(source_dev, source.get(), size, cudaMemcpyHostToDevice));

  // Run the kernel
  int BLOCK_SIZE = 128;
  int n_blocks = (COUNT + BLOCK_SIZE - 1) / (2 * BLOCK_SIZE);

  cudaCheckError(cudaMalloc(&dest_dev, n_blocks * sizeof(int)));

  {
    KernelTimer t;
    size_t shared_memory_size = BLOCK_SIZE * sizeof(int);
    reduce<<<n_blocks, BLOCK_SIZE, shared_memory_size>>>(source_dev, dest_dev);
  }

  // Copy result back to the host
  int result;
  cudaCheckError(
      cudaMemcpy(&result, dest_dev, sizeof(result), cudaMemcpyDeviceToHost));
  cudaCheckError(cudaFree(source_dev));
  cudaCheckError(cudaFree(dest_dev));

  // Compare with reference implementation
  int result_reference = std::accumulate(source.get(), source.get() + COUNT, 0);
  std::cout << "Sum of " << COUNT << " elements: " << result << "\n";
  assert(result_reference == result);

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 4/4.3/Makefile:
--------------------------------------------------------------------------------
CUDAFLAGS ?= -g

ALL = scan

all: $(ALL)

../utils.o: ../utils.cu ../utils.h
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<

%: %.cu ../utils.o
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^

# Dynamic parallelism requires separate compilation of kernels and
# host code
bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu

bst-sum.o: ../utils.h bst-sum.cu
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu

bst-sum: bst-sum.o bst-sum-kernels.o
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o

clean:
	rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
--------------------------------------------------------------------------------
/Code files/Section 4/4.3/scan:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 4/4.3/scan
--------------------------------------------------------------------------------
/Code files/Section 4/4.3/scan.cu:
--------------------------------------------------------------------------------
// Implementation of parallel prefix sum, aka scan.
// Example for video 4.3.
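//
// (Editorial sketch: block_scan below is a Hillis-Steele inclusive scan.
// Each of log2(blockDim.x) rounds adds the value `offset` positions to the
// left and doubles `offset`; for the input 3 1 7 0 the rounds produce
// 3 4 8 7 and then 3 4 11 11, the inclusive prefix sums. The zero-filled
// first half of the shared buffer absorbs the out-of-range reads, which is
// why no bounds checks are needed.)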

#include <assert.h>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

// CUDA cooperative groups API
#include <cooperative_groups.h>

#include "../utils.h"

void scan_reference(const int *source, int *dest, unsigned int count)
{
  int sum = 0;
  for (int i = 0; i < count; i++) {
    sum += source[i];
    dest[i] = sum;
  }
}

const int BLOCK_SIZE = 128;

// Scan using shared memory, within a single block.
__device__ int block_scan(int idata, int shared_data[],
                          cooperative_groups::thread_block block)
{
  // Index into shared memory
  int si = threadIdx.x;
  shared_data[si] = 0;
  si += blockDim.x;
  shared_data[si] = idata;

  for (int offset = 1; offset < blockDim.x; offset *= 2) {
    cooperative_groups::sync(block);
    int t = shared_data[si] + shared_data[si - offset];
    cooperative_groups::sync(block);
    shared_data[si] = t;
  }

  return shared_data[si];
}

// First step of scan: process each block separately
__global__ void scan1(const int *source, int *dest)
{
  // Shared memory buffer. By allocating extra elements we avoid bounds
  // checks on shared memory access.
  __shared__ int shared_data[2 * BLOCK_SIZE];

  // Index into global memory
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  // Load data from global memory
  int idata = source[index];

  // Shared memory scan within this block
  int result =
      block_scan(idata, shared_data, cooperative_groups::this_thread_block());

  // Write back to global memory
  dest[index] = result;
}

// Second step of scan: compute prefix sums for each block
__global__ void scan2(const int *dest, int *block_sums, unsigned int count)
{
  // Shared memory buffer. By allocating extra elements we avoid bounds
  // checks on shared memory access.
  __shared__ int shared_data[2 * BLOCK_SIZE];

  int index = blockIdx.x * blockDim.x + threadIdx.x;

  int idata = (index == 0) ? 0 : dest[index * blockDim.x - 1];
  block_sums[index] =
      block_scan(idata, shared_data, cooperative_groups::this_thread_block());
}

// Final step of scan: add block sums to every result.
__global__ void finish_scan(const int *block_sums, int *dest)
{
  __shared__ int block_sum;

  if (threadIdx.x == 0) {
    block_sum = block_sums[blockIdx.x];
  }
  cooperative_groups::sync(cooperative_groups::this_thread_block());

  int index = blockIdx.x * blockDim.x + threadIdx.x;
  dest[index] += block_sum;
}

int main(int argc, char **argv)
{
  // Maximum possible size with two-level scan.
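  // (Editorial note: scan1 covers BLOCK_SIZE elements per block, and scan2
  // runs as one block over at most BLOCK_SIZE block sums, so with
  // BLOCK_SIZE = 128 this two-level scheme tops out at 128 * 128 = 16384
  // elements; anything larger needs a third level.)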
  const unsigned int COUNT = BLOCK_SIZE * BLOCK_SIZE;
  std::unique_ptr<int[]> source(new int[COUNT]);
  std::unique_ptr<int[]> dest(new int[COUNT]);

  // Fill source matrix with some arbitrary test values
  std::mt19937 rng;
  rng.seed(0);
  std::uniform_int_distribution<int> dist(0, 9);

  for (int i = 0; i < COUNT; i++) {
    source[i] = dist(rng);
  }

  // Allocate and fill device memory
  int *source_dev, *dest_dev;
  size_t size = COUNT * sizeof(int);
  cudaCheckError(cudaMalloc(&source_dev, size));
  cudaCheckError(
      cudaMemcpy(source_dev, source.get(), size, cudaMemcpyHostToDevice));
  cudaCheckError(cudaMalloc(&dest_dev, size));

  int n_blocks1 = (COUNT + BLOCK_SIZE - 1) / BLOCK_SIZE;

  // Temporary buffer for kernel
  int *block_sums;
  cudaCheckError(cudaMalloc(&block_sums, n_blocks1 * sizeof(int)));

  {
    KernelTimer t;

    // Run the kernel
    scan1<<<n_blocks1, BLOCK_SIZE>>>(source_dev, dest_dev);

    int n_blocks2 = (n_blocks1 + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // If we had multiple blocks here, we'd need a third level of scans to
    // get the final result.
    assert(n_blocks2 == 1);
    scan2<<<n_blocks2, BLOCK_SIZE>>>(dest_dev, block_sums, n_blocks1);

    finish_scan<<<n_blocks1, BLOCK_SIZE>>>(block_sums, dest_dev);
  }

  // Copy result back to the host
  cudaCheckError(
      cudaMemcpy(dest.get(), dest_dev, size, cudaMemcpyDeviceToHost));
  cudaCheckError(cudaFree(source_dev));
  cudaCheckError(cudaFree(dest_dev));
  cudaCheckError(cudaFree(block_sums));

  // Compare with reference implementation
  std::unique_ptr<int[]> dest_reference(new int[COUNT]);
  scan_reference(source.get(), dest_reference.get(), COUNT);
  for (int i = 0; i < COUNT; i++) {
    assert(dest_reference.get()[i] == dest.get()[i]);
  }

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 4/4.4/Makefile:
--------------------------------------------------------------------------------
CUDAFLAGS ?= -g

ALL = filter

all: $(ALL)

../utils.o: ../utils.cu ../utils.h
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<

%: %.cu ../utils.o
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^

# Dynamic parallelism requires separate compilation of kernels and
# host code
bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu

bst-sum.o: ../utils.h bst-sum.cu
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu

bst-sum: bst-sum.o bst-sum-kernels.o
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o

clean:
	rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
--------------------------------------------------------------------------------
/Code files/Section 4/4.4/filter:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 4/4.4/filter
--------------------------------------------------------------------------------
/Code files/Section 4/4.4/filter.cu:
--------------------------------------------------------------------------------
// Filter the contents of an array.
// Uses a scan, followed by a separate kernel to fill the output.
// Example for video 4.4.

#include <algorithm>
#include <assert.h>
#include <iterator>
#include <memory>
#include <random>
#include <vector>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

// CUDA cooperative groups API
#include <cooperative_groups.h>

#include "../utils.h"

__host__ __device__ bool divisible_by_three(int value)
{
  return (value % 3) == 0;
}

const int BLOCK_SIZE = 128;

// Scan using shared memory, within a single block.
__device__ int block_scan(int idata, int shared_data[],
                          cooperative_groups::thread_block block)
{
  // Index into shared memory
  int si = threadIdx.x;
  shared_data[si] = 0;
  si += blockDim.x;
  shared_data[si] = idata;

  for (int offset = 1; offset < blockDim.x; offset *= 2) {
    cooperative_groups::sync(block);
    int t = shared_data[si] + shared_data[si - offset];
    cooperative_groups::sync(block);
    shared_data[si] = t;
  }

  return shared_data[si];
}

// First step of scan: process each block separately
__global__ void scan1(const int *source, int *dest)
{
  // Shared memory buffer. By allocating extra elements we avoid bounds
  // checks on shared memory access.
  __shared__ int shared_data[2 * BLOCK_SIZE];

  // Index into global memory
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  // Load data from global memory
  int idata = source[index];

  // Shared memory scan within this block
  int result =
      block_scan(idata, shared_data, cooperative_groups::this_thread_block());

  // Write back to global memory
  dest[index] = result;
}

// Second step of scan: compute prefix sums for each block
__global__ void scan2(const int *dest, int *block_sums, unsigned int count)
{
  // Shared memory buffer. By allocating extra elements we avoid bounds
  // checks on shared memory access.
  __shared__ int shared_data[2 * BLOCK_SIZE];

  int index = blockIdx.x * blockDim.x + threadIdx.x;

  int idata = (index == 0) ? 0 : dest[index * blockDim.x - 1];
  block_sums[index] =
      block_scan(idata, shared_data, cooperative_groups::this_thread_block());
}

// Final step of scan: add block sums to every result.
__global__ void finish_scan(const int *block_sums, int *dest)
{
  __shared__ int block_sum;

  if (threadIdx.x == 0) {
    block_sum = block_sums[blockIdx.x];
  }
  cooperative_groups::sync(cooperative_groups::this_thread_block());

  int index = blockIdx.x * blockDim.x + threadIdx.x;
  dest[index] += block_sum;
}

// Compute prefix sum of source
void scan(const int *source, int *dest, unsigned int count)
{
  int n_blocks1 = (count + BLOCK_SIZE - 1) / BLOCK_SIZE;

  // Temporary buffer for kernel
  int *block_sums;
  cudaCheckError(cudaMalloc(&block_sums, n_blocks1 * sizeof(int)));

  // Run the kernel
  scan1<<<n_blocks1, BLOCK_SIZE>>>(source, dest);

  int n_blocks2 = (n_blocks1 + BLOCK_SIZE - 1) / BLOCK_SIZE;
  // If we had multiple blocks here, we'd need a third level of scans to
  // get the final result.
  assert(n_blocks2 == 1);
  scan2<<<n_blocks2, BLOCK_SIZE>>>(dest, block_sums, n_blocks1);

  finish_scan<<<n_blocks1, BLOCK_SIZE>>>(block_sums, dest);

  cudaCheckError(cudaFree(block_sums));
}

// Test predicate for all elements of source. Fill result with a 1 for values
// that satisfy the predicate, and a 0 otherwise.
__global__ void evaluate_predicate(const int *source, int *result,
                                   unsigned int count)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  if (index < count) {
    result[index] = divisible_by_three(source[index]) ? 1 : 0;
  }
}

// Copy values that satisfy the predicate from source to result, using the
// indices array to place them in the correct position.
__global__ void fill_output(const int *source, const int *indices, int *result,
                            unsigned int count)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  if (index >= count) {
    return;
  }

  int value = source[index];
  if (divisible_by_three(value)) {
    // Subtract 1 from index because scan is inclusive (it counts the current
    // element), so the indices array will contain 1-based indices.
    int output_index = indices[index] - 1;
    result[output_index] = value;
  }
}

int main(int argc, char **argv)
{
  // Maximum possible size with two-level scan.
  const unsigned int COUNT = BLOCK_SIZE * BLOCK_SIZE;
  std::unique_ptr<int[]> source(new int[COUNT]);
  std::unique_ptr<int[]> dest(new int[COUNT]);

  // Fill source matrix with some arbitrary test values
  std::mt19937 rng;
  rng.seed(0);
  std::uniform_int_distribution<int> dist(0, 9);

  for (int i = 0; i < COUNT; i++) {
    source[i] = dist(rng);
  }

  // Allocate and fill device memory
  int *source_dev, *dest_dev;
  // Result of evaluating predicates
  int *predicates;
  // Indices at which to store each result element
  int *indices;
  size_t size = COUNT * sizeof(int);
  // Number of elements in the output array
  int output_count;
  cudaCheckError(cudaMalloc(&source_dev, size));
  cudaCheckError(
      cudaMemcpy(source_dev, source.get(), size, cudaMemcpyHostToDevice));
  cudaCheckError(cudaMalloc(&predicates, size));
  cudaCheckError(cudaMalloc(&indices, size));

  {
    KernelTimer t;

    int n_blocks = (COUNT + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // Test predicate for all source values
    evaluate_predicate<<<n_blocks, BLOCK_SIZE>>>(source_dev, predicates, COUNT);
    // Scan the predicate array to compute output indices
    scan(predicates, indices, COUNT);

    // Find the length of the output from the last index, and allocate the
    // array.
    cudaCheckError(cudaMemcpy(&output_count, indices + COUNT - 1, sizeof(int),
                              cudaMemcpyDeviceToHost));
    cudaCheckError(cudaMalloc(&dest_dev, output_count * sizeof(int)));

    // Copy elements from input to output
    fill_output<<<n_blocks, BLOCK_SIZE>>>(source_dev, indices, dest_dev, COUNT);
  }

  // Copy result back to the host
  cudaCheckError(cudaMemcpy(dest.get(), dest_dev, output_count * sizeof(int),
                            cudaMemcpyDeviceToHost));
  cudaCheckError(cudaFree(source_dev));
  cudaCheckError(cudaFree(dest_dev));
  cudaCheckError(cudaFree(predicates));
  cudaCheckError(cudaFree(indices));

  // Compare with reference implementation
  std::vector<int> dest_reference;
  std::copy_if(source.get(), source.get() + COUNT,
               std::back_inserter(dest_reference), divisible_by_three);
  assert(dest_reference.size() == output_count);
  for (int i = 0; i < output_count; i++) {
    assert(dest_reference[i] == dest.get()[i]);
  }

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 5/5.4/Makefile:
--------------------------------------------------------------------------------
CUDAFLAGS ?= -g

ALL = thrust

all: $(ALL)

../utils.o: ../utils.cu ../utils.h
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<

%: %.cu ../utils.o
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^

# Dynamic parallelism requires separate compilation of kernels and
# host code
bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu

bst-sum.o: ../utils.h bst-sum.cu
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu

bst-sum: bst-sum.o bst-sum-kernels.o
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o

clean:
	rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
--------------------------------------------------------------------------------
/Code files/Section 5/5.4/thrust:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 5/5.4/thrust
--------------------------------------------------------------------------------
/Code files/Section 5/5.4/thrust.cu:
--------------------------------------------------------------------------------
// Demonstration of basic thrust functionality.
// Example for video 5.4.

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <iostream>

int main(void)
{
  // Allocate two device_vectors with 10 elements
  thrust::device_vector<int> vec1(10);
  thrust::device_vector<int> vec2(10);

  // Initialize vec1 to 0,1,2,3, ....
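  // (Editorial note: thrust::sequence is equivalent to the hand-written
  // pattern vec1[i] = i. Like the transform and copy calls below, it
  // dispatches a CUDA kernel under the hood because the iterators come from
  // a device_vector.)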
  thrust::sequence(vec1.begin(), vec1.end());

  // vec2 = -vec1
  thrust::transform(vec1.begin(), vec1.end(), vec2.begin(),
                    thrust::negate<int>());

  // print vec2
  thrust::copy(vec2.begin(), vec2.end(),
               std::ostream_iterator<int>(std::cout, "\n"));

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 6/6.1/Makefile:
--------------------------------------------------------------------------------
CUDAFLAGS ?= -g

ALL = reduce-stream

all: $(ALL)

../utils.o: ../utils.cu ../utils.h
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<

%: %.cu ../utils.o
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^

# Dynamic parallelism requires separate compilation of kernels and
# host code
bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu

bst-sum.o: ../utils.h bst-sum.cu
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu

bst-sum: bst-sum.o bst-sum-kernels.o
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o

clean:
	rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
--------------------------------------------------------------------------------
/Code files/Section 6/6.1/reduce-stream:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.1/reduce-stream
--------------------------------------------------------------------------------
/Code files/Section 6/6.1/reduce-stream.cu:
--------------------------------------------------------------------------------
// Concurrent execution of multiple single-block reductions.
// The reduce kernel is very efficient, but occupancy is low, so multiple
// concurrent launches are needed to achieve good throughput.
// Example for video 6.1.

#include <assert.h>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

// CUDA cooperative groups API
#include <cooperative_groups.h>

#include "../utils.h"

__device__ unsigned int blocks_finished = 0;
// Wait for all blocks in the grid to execute this function.
// Returns true for thread 0 of the last block, false for all
// other threads.
__device__ bool wait_for_all_blocks()
{
  // Wait until global write is visible to all other blocks
  __threadfence();

  // Wait for all blocks to finish by atomically incrementing a counter
  bool is_last = false;
  if (threadIdx.x == 0) {
    unsigned int ticket = atomicInc(&blocks_finished, gridDim.x);
    is_last = (ticket == gridDim.x - 1);
  }
  if (is_last) {
    blocks_finished = 0;
  }
  return is_last;
}

__device__ int reduce_block(int value, int sdata[],
                            cooperative_groups::thread_block block)
{
  auto tid = threadIdx.x;

  // Fill shared memory with initial values
  sdata[tid] = value;

  cooperative_groups::sync(block);

  // When shared memory block is filled, reduce within that block.
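  // (Editorial note: this is the same tree reduction as in reduce.cu; round
  // k adds elements 2^k apart, so blockDim.x partial values collapse to one
  // in log2(blockDim.x) synchronized rounds, e.g. 8 rounds for the 256
  // threads used below.)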
  for (int stride = 1; stride < blockDim.x; stride *= 2) {
    int index = 2 * stride * tid;
    if (index < blockDim.x) {
      sdata[index] += sdata[index + stride];
    }
    cooperative_groups::sync(block);
  }

  return sdata[0];
}

// Sum the source array and store the sum in dest.
// Requires block_size * sizeof(int) bytes of shared memory.

// This kernel should always be launched with a single block. Unlike the
// previous reduce example, it keeps all threads busy and does not store any
// temporary data in global memory. However, occupancy is very low due to
// running a single block.
__global__ void reduce_single_block(const int *source, int *dest,
                                    unsigned int count)
{
  extern __shared__ int sdata[];

  int sum = 0;
  for (int i = threadIdx.x; i < count; i += blockDim.x) {
    sum += source[i];
  }

  sum = reduce_block(sum, sdata, cooperative_groups::this_thread_block());

  // The last thread of the block writes the result into global memory
  if (threadIdx.x == 0) {
    *dest = sum;
  }
}

int main(int argc, char **argv)
{
  const unsigned int COUNT = 4096 * 4096;
  std::unique_ptr<int[]> source(new int[COUNT]);

  // Fill source matrix with some arbitrary test values
  std::mt19937 rng;
  rng.seed(0);
  std::uniform_int_distribution<int> dist(0, 9);

  for (int i = 0; i < COUNT; i++) {
    source[i] = dist(rng);
  }

  const int N_STREAMS = 16;
  int *results[N_STREAMS];
  int *sources[N_STREAMS];
  cudaStream_t stream[N_STREAMS];

  // Create streams, and allocate input and output for each stream.
  size_t size = COUNT * sizeof(int);
  for (int i = 0; i < N_STREAMS; i++) {
    cudaCheckError(cudaStreamCreate(&stream[i]));
    cudaCheckError(cudaMalloc(&results[i], sizeof(int)));
    cudaCheckError(cudaMalloc(&sources[i], size));
    cudaCheckError(
        cudaMemcpy(sources[i], source.get(), size, cudaMemcpyHostToDevice));
  }

  // Run the kernel
  const int BLOCK_SIZE = 256;
  size_t shared_memory_size = BLOCK_SIZE * sizeof(int);

  {
    KernelTimer t;
    for (int i = 0; i < N_STREAMS; i++) {
      // Launch each instance of this kernel in a separate stream.
      reduce_single_block<<<1, BLOCK_SIZE, shared_memory_size, stream[i]>>>(
          sources[i], results[i], COUNT);
    }

    // All work has been dispatched to the device. The kernels will run
    // concurrently if there is room on the device. The host is idle now, and
    // we can do additional concurrent processing on the host.
132 | }
133 |
134 | // Wait for all streams to finish
135 | cudaCheckError(cudaDeviceSynchronize());
136 |
137 | // Copy one result back to the host (every stream computed the same sum)
138 | int result;
139 | cudaCheckError(
140 | cudaMemcpy(&result, results[0], sizeof(result), cudaMemcpyDeviceToHost));
141 | for (int i = 0; i < N_STREAMS; i++) {
142 | cudaCheckError(cudaFree(sources[i]));
143 | cudaCheckError(cudaFree(results[i]));
144 | }
145 |
146 | // Compare with reference implementation
147 | int result_reference = std::accumulate(source.get(), source.get() + COUNT, 0);
148 | std::cout << "Sum of " << COUNT << " elements: " << result << "\n";
149 | assert(result_reference == result);
150 |
151 | return 0;
152 | }
153 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.2/Makefile: --------------------------------------------------------------------------------
1 | CUDAFLAGS ?= -g
2 |
3 | ALL = scan-stream scan-page-locked
4 |
5 | all: $(ALL)
6 |
7 | ../utils.o: ../utils.cu ../utils.h
8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<
9 |
10 | %: %.cu ../utils.o
11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^
12 |
13 | # Dynamic parallelism requires separate compilation of kernels and
14 | # host code
15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu
17 |
18 | bst-sum.o: ../utils.h bst-sum.cu
19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu
20 |
21 | bst-sum: bst-sum.o bst-sum-kernels.o
22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o
23 |
24 | clean:
25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
26 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.2/scan-page-locked: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.2/scan-page-locked -------------------------------------------------------------------------------- /Code files/Section 6/6.2/scan-page-locked.cu: --------------------------------------------------------------------------------
1 | // Run multiple scans in separate streams, using page-locked memory to
2 | // overlap transfers and computation.
3 | // Example for video 6.2.
4 |
5 | #include <cassert>
6 | #include <cstddef>
7 | #include <iostream>
8 | #include <memory>
9 | #include <random>
10 |
11 | // Standard CUDA API functions
12 | #include <cuda_runtime_api.h>
13 |
14 | // CUDA cooperative groups API
15 | #include <cooperative_groups.h>
16 |
17 | #include "../utils.h"
18 |
19 | void scan_reference(const int *source, int *dest, unsigned int count)
20 | {
21 | int sum = 0;
22 | for (int i = 0; i < count; i++) {
23 | sum += source[i];
24 | dest[i] = sum;
25 | }
26 | }
27 |
28 | const int BLOCK_SIZE = 1024;
29 |
30 | // Scan using shared memory, within a single block.
31 | __device__ int block_scan(int idata, int shared_data[],
32 | cooperative_groups::thread_block block)
33 | {
34 | // Index into shared memory
35 | int si = threadIdx.x;
36 | shared_data[si] = 0;
37 | si += blockDim.x;
38 | shared_data[si] = idata;
39 |
40 | for (int offset = 1; offset < blockDim.x; offset *= 2) {
41 | cooperative_groups::sync(block);
42 | int t = shared_data[si] + shared_data[si - offset];
43 | cooperative_groups::sync(block);
44 | shared_data[si] = t;
45 | }
46 |
47 | return shared_data[si];
48 | }
49 |
50 | // First step of scan: process each block separately
51 | __global__ void scan1(const int *source, int *dest)
52 | {
53 | // Shared memory buffer. By allocating extra elements we avoid bounds
54 | // checks on shared memory access.
55 | __shared__ int shared_data[2 * BLOCK_SIZE];
56 |
57 | // Index into global memory
58 | int index = blockIdx.x * blockDim.x + threadIdx.x;
59 |
60 | // Load data from global memory
61 | int idata = source[index];
62 |
63 | // Shared memory scan within this block
64 | int result =
65 | block_scan(idata, shared_data, cooperative_groups::this_thread_block());
66 |
67 | // Write back to global memory
68 | dest[index] = result;
69 | }
70 |
71 | // Second step of scan: compute prefix sums for each block
72 | __global__ void scan2(const int *dest, int *block_sums, unsigned int count)
73 | {
74 | // Shared memory buffer. By allocating extra elements we avoid bounds
75 | // checks on shared memory access.
76 | __shared__ int shared_data[2 * BLOCK_SIZE];
77 |
78 | int index = blockIdx.x * blockDim.x + threadIdx.x;
79 |
80 | int idata = (index == 0) ? 0 : dest[index * blockDim.x - 1];
81 | block_sums[index] =
82 | block_scan(idata, shared_data, cooperative_groups::this_thread_block());
83 | }
84 |
85 | // Final step of scan: add block sums to every result.
86 | __global__ void finish_scan(const int *block_sums, int *dest)
87 | {
88 | __shared__ int block_sum;
89 |
90 | if (threadIdx.x == 0) {
91 | block_sum = block_sums[blockIdx.x];
92 | }
93 | cooperative_groups::sync(cooperative_groups::this_thread_block());
94 |
95 | int index = blockIdx.x * blockDim.x + threadIdx.x;
96 | dest[index] += block_sum;
97 | }
98 |
99 | int main(int argc, char **argv)
100 | {
101 | // Maximum possible size with two-level scan.
102 | const unsigned int COUNT = BLOCK_SIZE * BLOCK_SIZE;
103 | const int N_STREAMS = 2;
104 |
105 | int *sources[N_STREAMS], *dests[N_STREAMS];
106 |
107 | // Fill source arrays with some arbitrary test values
108 | std::mt19937 rng;
109 | rng.seed(0);
110 | std::uniform_int_distribution<int> dist(0, 9);
111 |
112 | for (int i = 0; i < N_STREAMS; i++) {
113 | // Allocate page-locked memory to allow asynchronous transfers.
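// (Pinned pages cannot be swapped out, so the GPU's DMA engine can read
// and write them directly and cudaMemcpyAsync really is asynchronous.
// With ordinary pageable memory the driver has to stage each transfer
// through an internal pinned buffer, and the "async" copy degrades to a
// blocking one with respect to the host.)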
114 | cudaCheckError(cudaMallocHost(&sources[i], COUNT * sizeof(int)));
115 | cudaCheckError(cudaMallocHost(&dests[i], COUNT * sizeof(int)));
116 | for (int j = 0; j < COUNT; j++) {
117 | sources[i][j] = dist(rng);
118 | }
119 | }
120 |
121 | // Allocate device memory and transfer data
122 | int n_blocks1 = (COUNT + BLOCK_SIZE - 1) / BLOCK_SIZE;
123 |
124 | int *sources_dev[N_STREAMS], *dests_dev[N_STREAMS], *block_sums[N_STREAMS];
125 | size_t size = COUNT * sizeof(int);
126 | cudaStream_t stream[N_STREAMS];
127 |
128 | for (int i = 0; i < N_STREAMS; i++) {
129 | cudaCheckError(cudaStreamCreate(&stream[i]));
130 | cudaCheckError(cudaMalloc(&sources_dev[i], size));
131 | cudaCheckError(cudaMalloc(&dests_dev[i], size));
132 | // Temporary buffer for kernels
133 | cudaCheckError(cudaMalloc(&block_sums[i], n_blocks1 * sizeof(int)));
134 | }
135 |
136 | {
137 | KernelTimer t;
138 |
139 | for (int i = 0; i < N_STREAMS; i++) {
140 | // Copy data to device
141 | cudaCheckError(cudaMemcpyAsync(sources_dev[i], sources[i], size,
142 | cudaMemcpyHostToDevice, stream[i]));
143 |
144 | // Run the scan
145 | scan1<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(sources_dev[i],
146 | dests_dev[i]);
147 |
148 | int n_blocks2 = (n_blocks1 + BLOCK_SIZE - 1) / BLOCK_SIZE;
149 | assert(n_blocks2 == 1);
150 | scan2<<<n_blocks2, BLOCK_SIZE, 0, stream[i]>>>(dests_dev[i],
151 | block_sums[i], n_blocks1);
152 |
153 | finish_scan<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(block_sums[i],
154 | dests_dev[i]);
155 |
156 | // Copy results back to the host
157 | cudaCheckError(cudaMemcpyAsync(dests[i], dests_dev[i], size,
158 | cudaMemcpyDeviceToHost, stream[i]));
159 | }
160 | }
161 |
162 | for (int i = 0; i < N_STREAMS; i++) {
163 | cudaCheckError(cudaFree(sources_dev[i]));
164 | cudaCheckError(cudaFree(dests_dev[i]));
165 | cudaCheckError(cudaFree(block_sums[i]));
166 | }
167 |
168 | // Compare with reference implementation
169 | std::unique_ptr<int[]> dest_reference(new int[COUNT]);
170 | for (int i = 0; i < N_STREAMS; i++) {
171 | scan_reference(sources[i], dest_reference.get(), COUNT);
172 | for (int j = 0; j < COUNT; j++) {
173 | assert(dest_reference.get()[j] == dests[i][j]);
174 | }
175 | }
176 |
177 | return 0;
178 | }
179 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.2/scan-stream: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.2/scan-stream -------------------------------------------------------------------------------- /Code files/Section 6/6.2/scan-stream.cu: --------------------------------------------------------------------------------
1 | // Run multiple scans in separate streams.
2 | // Example for video 6.2.
3 |
4 | #include <cassert>
5 | #include <cstddef>
6 | #include <iostream>
7 | #include <memory>
8 | #include <random>
9 |
10 | // Standard CUDA API functions
11 | #include <cuda_runtime_api.h>
12 |
13 | // CUDA cooperative groups API
14 | #include <cooperative_groups.h>
15 |
16 | #include "../utils.h"
17 |
18 | void scan_reference(const int *source, int *dest, unsigned int count)
19 | {
20 | int sum = 0;
21 | for (int i = 0; i < count; i++) {
22 | sum += source[i];
23 | dest[i] = sum;
24 | }
25 | }
26 |
27 | const int BLOCK_SIZE = 1024;
28 |
29 | // Scan using shared memory, within a single block.
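// block_scan below is a Hillis-Steele inclusive scan in shared memory.
// The buffer is 2 * BLOCK_SIZE wide: the lower half is zero-filled and
// each thread owns slot si = threadIdx.x + blockDim.x. On every pass a
// thread adds the value `offset` slots to its left, with offset doubling,
// so reads that would fall off the front land harmlessly in the zero
// padding instead of needing a bounds check. After log2(blockDim.x)
// passes each slot holds the inclusive prefix sum of the block.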
30 | __device__ int block_scan(int idata, int shared_data[], 31 | cooperative_groups::thread_block block) 32 | { 33 | // Index into shared memory 34 | int si = threadIdx.x; 35 | shared_data[si] = 0; 36 | si += blockDim.x; 37 | shared_data[si] = idata; 38 | 39 | for (int offset = 1; offset < blockDim.x; offset *= 2) { 40 | cooperative_groups::sync(block); 41 | int t = shared_data[si] + shared_data[si - offset]; 42 | cooperative_groups::sync(block); 43 | shared_data[si] = t; 44 | } 45 | 46 | return shared_data[si]; 47 | } 48 | 49 | // First step of scan: process each block separately 50 | __global__ void scan1(const int *source, int *dest) 51 | { 52 | // Shared memory buffer. By allocating extra elements we avoid bounds 53 | // checks on shared memory access. 54 | __shared__ int shared_data[2 * BLOCK_SIZE]; 55 | 56 | // Index into global memory 57 | int index = blockIdx.x * blockDim.x + threadIdx.x; 58 | 59 | // Load data from global memory 60 | int idata = source[index]; 61 | 62 | // Shared memory scan within this block 63 | int result = 64 | block_scan(idata, shared_data, cooperative_groups::this_thread_block()); 65 | 66 | // Write back to global memory 67 | dest[index] = result; 68 | } 69 | 70 | // Second step of scan: compute prefix sums for each block 71 | __global__ void scan2(const int *dest, int *block_sums, unsigned int count) 72 | { 73 | // Shared memory buffer. By allocating extra elements we avoid bounds 74 | // checks on shared memory access. 75 | __shared__ int shared_data[2 * BLOCK_SIZE]; 76 | 77 | int index = blockIdx.x * blockDim.x + threadIdx.x; 78 | 79 | int idata = (index == 0) ? 0 : dest[index * blockDim.x - 1]; 80 | block_sums[index] = 81 | block_scan(idata, shared_data, cooperative_groups::this_thread_block()); 82 | } 83 | 84 | // Final step of scan: add block sums to every result. 85 | __global__ void finish_scan(const int *block_sums, int *dest) 86 | { 87 | __shared__ int block_sum; 88 | 89 | if (threadIdx.x == 0) { 90 | block_sum = block_sums[blockIdx.x]; 91 | } 92 | cooperative_groups::sync(cooperative_groups::this_thread_block()); 93 | 94 | int index = blockIdx.x * blockDim.x + threadIdx.x; 95 | dest[index] += block_sum; 96 | } 97 | 98 | int main(int argc, char **argv) 99 | { 100 | // Maximum possible size with two-level scan. 
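// Why BLOCK_SIZE * BLOCK_SIZE is the ceiling: scan1 covers BLOCK_SIZE
// elements per block, and scan2 scans the per-block sums with a single
// block, so it can handle at most BLOCK_SIZE of them. With BLOCK_SIZE =
// 1024 that caps a two-level scan at 1024 * 1024 elements; anything
// larger would need a third kernel level (or a loop).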
101 | const unsigned int COUNT = BLOCK_SIZE * BLOCK_SIZE;
102 | const int N_STREAMS = 2;
103 |
104 | int *sources[N_STREAMS], *dests[N_STREAMS];
105 |
106 | // Fill source arrays with some arbitrary test values
107 | std::mt19937 rng;
108 | rng.seed(0);
109 | std::uniform_int_distribution<int> dist(0, 9);
110 |
111 | for (int i = 0; i < N_STREAMS; i++) {
112 | sources[i] = new int[COUNT];
113 | dests[i] = new int[COUNT];
114 | for (int j = 0; j < COUNT; j++) {
115 | sources[i][j] = dist(rng);
116 | }
117 | }
118 |
119 | // Allocate device memory and transfer data
120 | int n_blocks1 = (COUNT + BLOCK_SIZE - 1) / BLOCK_SIZE;
121 |
122 | int *sources_dev[N_STREAMS], *dests_dev[N_STREAMS], *block_sums[N_STREAMS];
123 | size_t size = COUNT * sizeof(int);
124 | cudaStream_t stream[N_STREAMS];
125 |
126 | for (int i = 0; i < N_STREAMS; i++) {
127 | cudaCheckError(cudaStreamCreate(&stream[i]));
128 | cudaCheckError(cudaMalloc(&sources_dev[i], size));
129 | cudaCheckError(cudaMalloc(&dests_dev[i], size));
130 | // Temporary buffer for kernels
131 | cudaCheckError(cudaMalloc(&block_sums[i], n_blocks1 * sizeof(int)));
132 | }
133 |
134 | // Code in this block will be timed by KernelTimer
135 | {
136 | KernelTimer t;
137 |
138 | // Copy data to device
139 | for (int i = 0; i < N_STREAMS; i++) {
140 | cudaCheckError(
141 | cudaMemcpy(sources_dev[i], sources[i], size, cudaMemcpyHostToDevice));
142 | }
143 |
144 | // Run the scans in separate streams
145 | for (int i = 0; i < N_STREAMS; i++) {
146 | scan1<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(sources_dev[i],
147 | dests_dev[i]);
148 |
149 | int n_blocks2 = (n_blocks1 + BLOCK_SIZE - 1) / BLOCK_SIZE;
150 | assert(n_blocks2 == 1);
151 | scan2<<<n_blocks2, BLOCK_SIZE, 0, stream[i]>>>(dests_dev[i],
152 | block_sums[i], n_blocks1);
153 |
154 | finish_scan<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(block_sums[i],
155 | dests_dev[i]);
156 | }
157 |
158 | // Copy results back to the host
159 | for (int i = 0; i < N_STREAMS; i++) {
160 | cudaCheckError(
161 | cudaMemcpy(dests[i], dests_dev[i], size, cudaMemcpyDeviceToHost));
162 | }
163 | }
164 |
165 | for (int i = 0; i < N_STREAMS; i++) {
166 | cudaCheckError(cudaFree(sources_dev[i]));
167 | cudaCheckError(cudaFree(dests_dev[i]));
168 | cudaCheckError(cudaFree(block_sums[i]));
169 | }
170 |
171 | // Compare with reference implementation
172 | std::unique_ptr<int[]> dest_reference(new int[COUNT]);
173 | for (int i = 0; i < N_STREAMS; i++) {
174 | scan_reference(sources[i], dest_reference.get(), COUNT);
175 | for (int j = 0; j < COUNT; j++) {
176 | assert(dest_reference.get()[j] == dests[i][j]);
177 | }
178 | }
179 |
180 | return 0;
181 | }
182 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.4/Makefile: --------------------------------------------------------------------------------
1 | CUDAFLAGS ?= -g
2 |
3 | ALL = scan-multi-device
4 |
5 | all: $(ALL)
6 |
7 | ../utils.o: ../utils.cu ../utils.h
8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<
9 |
10 | %: %.cu ../utils.o
11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^
12 |
13 | # Dynamic parallelism requires separate compilation of kernels and
14 | # host code
15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu
17 |
18 | bst-sum.o: ../utils.h bst-sum.cu
19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu
20 |
21 | bst-sum: bst-sum.o bst-sum-kernels.o
22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o
23 |
24 | clean:
25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
26 |
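The scan-stream and scan-page-locked examples above differ only in how the host buffers are allocated: with pinned buffers, each stream's copy-in, kernels, and copy-out can overlap with the other stream's work. A minimal sketch of that pipeline pattern, with a placeholder `process` kernel and hypothetical buffer names (none of this is part of the repository), assuming pinned host memory:

#include <cuda_runtime_api.h>

// Placeholder kernel standing in for the scan1/scan2/finish_scan chain.
__global__ void process(float *data, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= 2.0f;
}

int main()
{
  const int N = 1 << 20, N_STREAMS = 2;
  size_t bytes = N * sizeof(float);
  float *host[N_STREAMS], *dev[N_STREAMS];
  cudaStream_t s[N_STREAMS];

  for (int i = 0; i < N_STREAMS; i++) {
    cudaMallocHost(&host[i], bytes);  // pinned, so async copies can overlap
    cudaMalloc(&dev[i], bytes);
    cudaStreamCreate(&s[i]);
  }

  // Each stream runs copy-in -> kernel -> copy-out. Work within a stream
  // executes in order; independent streams are free to overlap.
  for (int i = 0; i < N_STREAMS; i++) {
    cudaMemcpyAsync(dev[i], host[i], bytes, cudaMemcpyHostToDevice, s[i]);
    process<<<(N + 255) / 256, 256, 0, s[i]>>>(dev[i], N);
    cudaMemcpyAsync(host[i], dev[i], bytes, cudaMemcpyDeviceToHost, s[i]);
  }
  cudaDeviceSynchronize();  // drain both pipelines

  for (int i = 0; i < N_STREAMS; i++) {
    cudaStreamDestroy(s[i]);
    cudaFree(dev[i]);
    cudaFreeHost(host[i]);
  }
  return 0;
}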
-------------------------------------------------------------------------------- /Code files/Section 6/6.4/scan-multi-device: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.4/scan-multi-device -------------------------------------------------------------------------------- /Code files/Section 6/6.4/scan-multi-device.cu: --------------------------------------------------------------------------------
1 | // Run multiple scans concurrently across all available devices.
2 | // Example for video 6.4.
3 |
4 | #include <cassert>
5 | #include <cstddef>
6 | #include <iostream>
7 | #include <memory>
8 | #include <random>
9 |
10 | // Standard CUDA API functions
11 | #include <cuda_runtime_api.h>
12 |
13 | // CUDA cooperative groups API
14 | #include <cooperative_groups.h>
15 |
16 | #include "../utils.h"
17 |
18 | void scan_reference(const int *source, int *dest, unsigned int count)
19 | {
20 | int sum = 0;
21 | for (int i = 0; i < count; i++) {
22 | sum += source[i];
23 | dest[i] = sum;
24 | }
25 | }
26 |
27 | const int BLOCK_SIZE = 1024;
28 |
29 | // Scan using shared memory, within a single block.
30 | __device__ int block_scan(int idata, int shared_data[],
31 | cooperative_groups::thread_block block)
32 | {
33 | // Index into shared memory
34 | int si = threadIdx.x;
35 | shared_data[si] = 0;
36 | si += blockDim.x;
37 | shared_data[si] = idata;
38 |
39 | for (int offset = 1; offset < blockDim.x; offset *= 2) {
40 | cooperative_groups::sync(block);
41 | int t = shared_data[si] + shared_data[si - offset];
42 | cooperative_groups::sync(block);
43 | shared_data[si] = t;
44 | }
45 |
46 | return shared_data[si];
47 | }
48 |
49 | // First step of scan: process each block separately
50 | __global__ void scan1(const int *source, int *dest)
51 | {
52 | // Shared memory buffer. By allocating extra elements we avoid bounds
53 | // checks on shared memory access.
54 | __shared__ int shared_data[2 * BLOCK_SIZE];
55 |
56 | // Index into global memory
57 | int index = blockIdx.x * blockDim.x + threadIdx.x;
58 |
59 | // Load data from global memory
60 | int idata = source[index];
61 |
62 | // Shared memory scan within this block
63 | int result =
64 | block_scan(idata, shared_data, cooperative_groups::this_thread_block());
65 |
66 | // Write back to global memory
67 | dest[index] = result;
68 | }
69 |
70 | // Second step of scan: compute prefix sums for each block
71 | __global__ void scan2(const int *dest, int *block_sums, unsigned int count)
72 | {
73 | // Shared memory buffer. By allocating extra elements we avoid bounds
74 | // checks on shared memory access.
75 | __shared__ int shared_data[2 * BLOCK_SIZE];
76 |
77 | int index = blockIdx.x * blockDim.x + threadIdx.x;
78 |
79 | int idata = (index == 0) ? 0 : dest[index * blockDim.x - 1];
80 | block_sums[index] =
81 | block_scan(idata, shared_data, cooperative_groups::this_thread_block());
82 | }
83 |
84 | // Final step of scan: add block sums to every result.
85 | __global__ void finish_scan(const int *block_sums, int *dest)
86 | {
87 | __shared__ int block_sum;
88 |
89 | if (threadIdx.x == 0) {
90 | block_sum = block_sums[blockIdx.x];
91 | }
92 | cooperative_groups::sync(cooperative_groups::this_thread_block());
93 |
94 | int index = blockIdx.x * blockDim.x + threadIdx.x;
95 | dest[index] += block_sum;
96 | }
97 |
98 | int main(int argc, char **argv)
99 | {
100 | // Maximum possible size with two-level scan.
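// Multi-device notes: a stream created by cudaStreamCreate is bound to
// whichever device was current at creation time, and cudaMalloc likewise
// allocates on the current device. That is why the code below calls
// cudaSetDevice before creating each stream and again before enqueueing
// work on it. Also, cudaDeviceSynchronize waits only for the current
// device, so multi-device code has to synchronize each device in turn.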
101 | const unsigned int COUNT = BLOCK_SIZE * BLOCK_SIZE;
102 | const int N_STREAMS = 2;
103 |
104 | int *sources[N_STREAMS], *dests[N_STREAMS];
105 |
106 | // Fill source arrays with some arbitrary test values
107 | std::mt19937 rng;
108 | rng.seed(0);
109 | std::uniform_int_distribution<int> dist(0, 9);
110 |
111 | int device_count;
112 | cudaCheckError(cudaGetDeviceCount(&device_count));
113 |
114 | for (int i = 0; i < N_STREAMS; i++) {
115 | // Allocate page-locked memory to allow asynchronous transfers.
116 | cudaCheckError(cudaMallocHost(&sources[i], COUNT * sizeof(int)));
117 | cudaCheckError(cudaMallocHost(&dests[i], COUNT * sizeof(int)));
118 | for (int j = 0; j < COUNT; j++) {
119 | sources[i][j] = dist(rng);
120 | }
121 | }
122 |
123 | // Allocate device memory and transfer data
124 | int n_blocks1 = (COUNT + BLOCK_SIZE - 1) / BLOCK_SIZE;
125 |
126 | int *sources_dev[N_STREAMS], *dests_dev[N_STREAMS], *block_sums[N_STREAMS];
127 | size_t size = COUNT * sizeof(int);
128 | cudaStream_t stream[N_STREAMS];
129 |
130 | for (int i = 0; i < N_STREAMS; i++) {
131 | int device = i % device_count;
132 | cudaCheckError(cudaSetDevice(device));
133 | std::cout << "Stream " << i << " on device " << device << "\n";
134 | cudaCheckError(cudaStreamCreate(&stream[i]));
135 | cudaCheckError(cudaMalloc(&sources_dev[i], size));
136 | cudaCheckError(cudaMalloc(&dests_dev[i], size));
137 | // Temporary buffer for kernels
138 | cudaCheckError(cudaMalloc(&block_sums[i], n_blocks1 * sizeof(int)));
139 | }
140 |
141 | {
142 | KernelTimer t;
143 |
144 | for (int i = 0; i < N_STREAMS; i++) {
145 | int device = i % device_count;
146 | cudaCheckError(cudaSetDevice(device));
147 |
148 | // Copy data to device
149 | cudaCheckError(cudaMemcpyAsync(sources_dev[i], sources[i], size,
150 | cudaMemcpyHostToDevice, stream[i]));
151 |
152 | // Run the scan
153 | scan1<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(sources_dev[i],
154 | dests_dev[i]);
155 |
156 | int n_blocks2 = (n_blocks1 + BLOCK_SIZE - 1) / BLOCK_SIZE;
157 | assert(n_blocks2 == 1);
158 | scan2<<<n_blocks2, BLOCK_SIZE, 0, stream[i]>>>(dests_dev[i],
159 | block_sums[i], n_blocks1);
160 |
161 | finish_scan<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(block_sums[i],
162 | dests_dev[i]);
163 |
164 | // Copy results back to the host
165 | cudaCheckError(cudaMemcpyAsync(dests[i], dests_dev[i], size,
166 | cudaMemcpyDeviceToHost, stream[i]));
167 | }
168 | }
169 |
170 | for (int i = 0; i < N_STREAMS; i++) {
171 | cudaCheckError(cudaFree(sources_dev[i]));
172 | cudaCheckError(cudaFree(dests_dev[i]));
173 | cudaCheckError(cudaFree(block_sums[i]));
174 | }
175 |
176 | // Compare with reference implementation
177 | std::unique_ptr<int[]> dest_reference(new int[COUNT]);
178 | for (int i = 0; i < N_STREAMS; i++) {
179 | scan_reference(sources[i], dest_reference.get(), COUNT);
180 | for (int j = 0; j < COUNT; j++) {
181 | assert(dest_reference.get()[j] == dests[i][j]);
182 | }
183 | }
184 |
185 | return 0;
186 | }
187 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.5/Makefile: --------------------------------------------------------------------------------
1 | CUDAFLAGS ?= -g
2 |
3 | ALL = scan-unified
4 |
5 | all: $(ALL)
6 |
7 | ../utils.o: ../utils.cu ../utils.h
8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<
9 |
10 | %: %.cu ../utils.o
11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^
12 |
13 | # Dynamic parallelism requires separate compilation of kernels and
14 | # host code
15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu
17 |
18 | bst-sum.o: ../utils.h bst-sum.cu
19 | nvcc -std=c++11
$(CUDAFLAGS) -arch compute_35 -c bst-sum.cu
20 |
21 | bst-sum: bst-sum.o bst-sum-kernels.o
22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o
23 |
24 | clean:
25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
26 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.5/scan-unified: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.5/scan-unified -------------------------------------------------------------------------------- /Code files/Section 6/6.5/scan-unified.cu: --------------------------------------------------------------------------------
1 | // Demonstration of the unified virtual address space. Run multiple scans
2 | // concurrently across all available devices.
3 | // Example for video 6.5.
4 |
5 | #include <cassert>
6 | #include <iostream>
7 | #include <memory>
8 | #include <random>
9 | #include <string>
10 |
11 | // Standard CUDA API functions
12 | #include <cuda_runtime_api.h>
13 |
14 | // CUDA cooperative groups API
15 | #include <cooperative_groups.h>
16 |
17 | #include "../utils.h"
18 |
19 | void scan_reference(const int *source, int *dest, unsigned int count)
20 | {
21 | int sum = 0;
22 | for (int i = 0; i < count; i++) {
23 | sum += source[i];
24 | dest[i] = sum;
25 | }
26 | }
27 |
28 | const int BLOCK_SIZE = 1024;
29 |
30 | // Scan using shared memory, within a single block.
31 | __device__ int block_scan(int idata, int shared_data[],
32 | cooperative_groups::thread_block block)
33 | {
34 | // Index into shared memory
35 | int si = threadIdx.x;
36 | shared_data[si] = 0;
37 | si += blockDim.x;
38 | shared_data[si] = idata;
39 |
40 | for (int offset = 1; offset < blockDim.x; offset *= 2) {
41 | cooperative_groups::sync(block);
42 | int t = shared_data[si] + shared_data[si - offset];
43 | cooperative_groups::sync(block);
44 | shared_data[si] = t;
45 | }
46 |
47 | return shared_data[si];
48 | }
49 |
50 | // First step of scan: process each block separately
51 | __global__ void scan1(const int *source, int *dest)
52 | {
53 | // Shared memory buffer. By allocating extra elements we avoid bounds
54 | // checks on shared memory access.
55 | __shared__ int shared_data[2 * BLOCK_SIZE];
56 |
57 | // Index into global memory
58 | int index = blockIdx.x * blockDim.x + threadIdx.x;
59 |
60 | // Load data from global memory
61 | int idata = source[index];
62 |
63 | // Shared memory scan within this block
64 | int result =
65 | block_scan(idata, shared_data, cooperative_groups::this_thread_block());
66 |
67 | // Write back to global memory
68 | dest[index] = result;
69 | }
70 |
71 | // Second step of scan: compute prefix sums for each block
72 | __global__ void scan2(const int *dest, int *block_sums, unsigned int count)
73 | {
74 | // Shared memory buffer. By allocating extra elements we avoid bounds
75 | // checks on shared memory access.
76 | __shared__ int shared_data[2 * BLOCK_SIZE];
77 |
78 | int index = blockIdx.x * blockDim.x + threadIdx.x;
79 |
80 | int idata = (index == 0) ? 0 : dest[index * blockDim.x - 1];
81 | block_sums[index] =
82 | block_scan(idata, shared_data, cooperative_groups::this_thread_block());
83 | }
84 |
85 | // Final step of scan: add block sums to every result.
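// finish_scan broadcasts one value per block through shared memory:
// thread 0 loads this block's offset from block_sums, the whole block
// synchronizes, and then every thread adds that same offset to its
// element of dest.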
86 | __global__ void finish_scan(const int *block_sums, int *dest)
87 | {
88 | __shared__ int block_sum;
89 |
90 | if (threadIdx.x == 0) {
91 | block_sum = block_sums[blockIdx.x];
92 | }
93 | cooperative_groups::sync(cooperative_groups::this_thread_block());
94 |
95 | int index = blockIdx.x * blockDim.x + threadIdx.x;
96 | dest[index] += block_sum;
97 | }
98 |
99 | static void print_pointer(const std::string &name, const void *pointer)
100 | {
101 | cudaPointerAttributes attributes;
102 | auto result = cudaPointerGetAttributes(&attributes, pointer);
103 |
104 | std::cout << name << ": ";
105 | if (result != cudaSuccess) {
106 | std::cout << "get attributes failed\n";
107 | return;
108 | } else {
109 | switch (attributes.type) {
110 | case cudaMemoryTypeUnregistered:
111 | std::cout << "unregistered";
112 | break;
113 | case cudaMemoryTypeHost:
114 | std::cout << "host memory";
115 | break;
116 | case cudaMemoryTypeDevice:
117 | std::cout << "device " << attributes.device;
118 | break;
119 | case cudaMemoryTypeManaged:
120 | std::cout << "managed";
121 | break;
122 | }
123 | }
124 |
125 | std::cout << "\n";
126 | }
127 |
128 | int main(int argc, char **argv)
129 | {
130 | // Maximum possible size with two-level scan.
131 | const unsigned int COUNT = BLOCK_SIZE * BLOCK_SIZE;
132 | const int N_STREAMS = 2;
133 |
134 | int *sources[N_STREAMS], *dests[N_STREAMS];
135 |
136 | // Fill source arrays with some arbitrary test values
137 | std::mt19937 rng;
138 | rng.seed(0);
139 | std::uniform_int_distribution<int> dist(0, 9);
140 |
141 | int device_count;
142 | cudaCheckError(cudaGetDeviceCount(&device_count));
143 |
144 | for (int i = 0; i < N_STREAMS; i++) {
145 | // Allocate page-locked memory to allow asynchronous transfers.
146 | cudaCheckError(cudaMallocHost(&sources[i], COUNT * sizeof(int)));
147 | cudaCheckError(cudaMallocHost(&dests[i], COUNT * sizeof(int)));
148 | for (int j = 0; j < COUNT; j++) {
149 | sources[i][j] = dist(rng);
150 | }
151 | }
152 |
153 | // Allocate device memory and transfer data
154 | int n_blocks1 = (COUNT + BLOCK_SIZE - 1) / BLOCK_SIZE;
155 |
156 | int *sources_dev[N_STREAMS], *dests_dev[N_STREAMS], *block_sums[N_STREAMS];
157 | size_t size = COUNT * sizeof(int);
158 | cudaStream_t stream[N_STREAMS];
159 |
160 | for (int i = 0; i < N_STREAMS; i++) {
161 | int device = i % device_count;
162 | cudaCheckError(cudaSetDevice(device));
163 | cudaCheckError(cudaStreamCreate(&stream[i]));
164 | cudaCheckError(cudaMalloc(&sources_dev[i], size));
165 | cudaCheckError(cudaMalloc(&dests_dev[i], size));
166 | // Temporary buffer for kernels
167 | cudaCheckError(cudaMalloc(&block_sums[i], n_blocks1 * sizeof(int)));
168 | }
169 |
170 | {
171 | KernelTimer t;
172 |
173 | for (int i = 0; i < N_STREAMS; i++) {
174 | int device = i % device_count;
175 | cudaCheckError(cudaSetDevice(device));
176 |
177 | std::cout << "Stream " << i << " on device " << device << "\n";
178 | print_pointer("source", sources[i]);
179 | print_pointer("source_dev", sources_dev[i]);
180 | print_pointer("dest_dev", dests_dev[i]);
181 | print_pointer("dest", dests[i]);
182 |
183 | // Copy data to device
184 | cudaCheckError(cudaMemcpyAsync(sources_dev[i], sources[i], size,
185 | cudaMemcpyDefault, stream[i]));
186 |
187 | // Run the scan
188 | scan1<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(sources_dev[i],
189 | dests_dev[i]);
190 |
191 | int n_blocks2 = (n_blocks1 + BLOCK_SIZE - 1) / BLOCK_SIZE;
192 | assert(n_blocks2 == 1);
193 | scan2<<<n_blocks2, BLOCK_SIZE, 0, stream[i]>>>(dests_dev[i],
194 | block_sums[i], n_blocks1);
195 |
196 | finish_scan<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(block_sums[i],
197 | dests_dev[i]);
198 |
199 | // Copy results back to the host
200 | cudaCheckError(cudaMemcpyAsync(dests[i], dests_dev[i], size,
201 | cudaMemcpyDefault, stream[i]));
202 | std::cout << "\n";
203 | }
204 | }
205 |
206 | for (int i = 0; i < N_STREAMS; i++) {
207 | cudaCheckError(cudaFree(sources_dev[i]));
208 | cudaCheckError(cudaFree(dests_dev[i]));
209 | cudaCheckError(cudaFree(block_sums[i]));
210 | }
211 |
212 | // Compare with reference implementation
213 | std::unique_ptr<int[]> dest_reference(new int[COUNT]);
214 | for (int i = 0; i < N_STREAMS; i++) {
215 | scan_reference(sources[i], dest_reference.get(), COUNT);
216 | for (int j = 0; j < COUNT; j++) {
217 | assert(dest_reference.get()[j] == dests[i][j]);
218 | }
219 | }
220 |
221 | return 0;
222 | }
223 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.6/Makefile: --------------------------------------------------------------------------------
1 | CUDAFLAGS ?= -g
2 |
3 | ALL = bst
4 |
5 | all: $(ALL)
6 |
7 | ../utils.o: ../utils.cu ../utils.h
8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<
9 |
10 | %: %.cu ../utils.o
11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^
12 |
13 | # Dynamic parallelism requires separate compilation of kernels and
14 | # host code
15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu
17 |
18 | bst-sum.o: ../utils.h bst-sum.cu
19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu
20 |
21 | bst-sum: bst-sum.o bst-sum-kernels.o
22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o
23 |
24 | clean:
25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
26 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.6/bst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.6/bst -------------------------------------------------------------------------------- /Code files/Section 6/6.6/bst.cu: --------------------------------------------------------------------------------
1 | // Build and print a binary search tree on the device, using dynamic global
2 | // memory allocation.
3 | // Example for video 6.6.
4 |
5 | #include <thrust/device_vector.h>
6 | #include <thrust/sequence.h>
7 |
8 | // Standard CUDA API functions
9 | #include <cuda_runtime_api.h>
10 |
11 | #include "../utils.h"
12 |
13 | struct Tree {
14 | int value;
15 | Tree *left;
16 | Tree *right;
17 | };
18 |
19 | // Helper function to construct a binary search tree from a sorted array.
20 | __device__ void build_subtree(Tree *root, const int *source, int left,
21 | int right)
22 | {
23 | int middle = (left + right) / 2;
24 | root->value = source[middle];
25 |
26 | if (middle == left) {
27 | root->left = nullptr;
28 | } else {
29 | root->left = new Tree();
30 | build_subtree(root->left, source, left, middle - 1);
31 | }
32 |
33 | if (middle == right) {
34 | root->right = nullptr;
35 | } else {
36 | root->right = new Tree();
37 | build_subtree(root->right, source, middle + 1, right);
38 | }
39 | }
40 |
41 | // Construct a binary search tree from a sorted array. This kernel should be
42 | // run with a single thread.
43 | __global__ void build_tree(const int *source, unsigned int length, Tree *root)
44 | {
45 | build_subtree(root, source, 0, length - 1);
46 | }
47 |
48 | // Print the nodes of a tree, in order.
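// A note on the allocations above: device-side `new` draws from a
// separate device heap, which defaults to 8 MB. A much larger tree could
// exhaust it; the heap can be grown from the host with
// cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes) before the first
// kernel launch. Memory from device `new` must be freed with device
// `delete`; it cannot be passed to cudaFree on the host.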
49 | __device__ void print_subtree(const Tree *root)
50 | {
51 | if (root->left) {
52 | print_subtree(root->left);
53 | }
54 | printf("%d\n", root->value);
55 | if (root->right) {
56 | print_subtree(root->right);
57 | }
58 | }
59 | __global__ void print_tree(const Tree *root) { print_subtree(root); }
60 |
61 | // Free a device-allocated tree.
62 | __device__ void destroy_subtree(Tree *root)
63 | {
64 | if (root->left) {
65 | destroy_subtree(root->left);
66 | }
67 | if (root->right) {
68 | destroy_subtree(root->right);
69 | }
70 | delete root;
71 | }
72 |
73 | __global__ void destroy_tree(Tree *root)
74 | {
75 | if (root->left) {
76 | destroy_subtree(root->left);
77 | }
78 | if (root->right) {
79 | destroy_subtree(root->right);
80 | }
81 | // Do not destroy root! It was allocated with cudaMalloc and must be freed
82 | // from host code.
83 | }
84 |
85 | int main(int argc, char **argv)
86 | {
87 | const unsigned int COUNT = 128;
88 |
89 | // Create device vector with sequential integers
90 | thrust::device_vector<int> source(COUNT);
91 | thrust::sequence(source.begin(), source.end());
92 |
93 | // Allocate a root for the tree
94 | Tree *root;
95 | cudaCheckError(cudaMalloc(&root, sizeof(Tree)));
96 |
97 | // Build the tree from a sorted array
98 | build_tree<<<1, 1>>>(thrust::raw_pointer_cast(&source[0]), source.size(),
99 | root);
100 |
101 | // Print the tree values, in order
102 | print_tree<<<1, 1>>>(root);
103 |
104 | // Destroy all the subtrees which were allocated with new in device
105 | // code.
106 | destroy_tree<<<1, 1>>>(root);
107 |
108 | // Destroy the root which was allocated with cudaMalloc.
109 | cudaCheckError(cudaFree(root));
110 |
111 | cudaCheckError(cudaDeviceSynchronize());
112 | }
113 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.7/Makefile: --------------------------------------------------------------------------------
1 | CUDAFLAGS ?= -g
2 |
3 | ALL = bst-sum
4 |
5 | all: $(ALL)
6 |
7 | ../utils.o: ../utils.cu ../utils.h
8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<
9 |
10 | %: %.cu ../utils.o
11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^
12 |
13 | # Dynamic parallelism requires separate compilation of kernels and
14 | # host code
15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu
17 |
18 | bst-sum.o: ../utils.h bst-sum.cu
19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu
20 |
21 | bst-sum: bst-sum.o bst-sum-kernels.o
22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o
23 |
24 | clean:
25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
26 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.7/bst-sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.7/bst-sum -------------------------------------------------------------------------------- /Code files/Section 6/6.7/bst-sum-kernels.cu: --------------------------------------------------------------------------------
1 | // Kernel code for bst-sum example, used in video 6.7.
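// These kernels rely on dynamic parallelism: sum_tree launches further
// kernels from device code, into device-side streams. That requires
// compute capability 3.5 or later plus relocatable device code and device
// linking, which is why the Makefile builds this file with
// -arch compute_35 -dc instead of using the pattern rule used elsewhere.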
2 |
3 | #include <cstdio>
4 |
5 | #include "bst-sum-kernels.cuh"
6 |
7 | #define kernelCheckError(code) \
8 | { \
9 | if ((code) != cudaSuccess) { \
10 | printf("Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \
11 | cudaGetErrorString(code)); \
12 | return; \
13 | } \
14 | }
15 |
16 | __device__ void build_subtree(Tree *root, const int *source, int left,
17 | int right)
18 | {
19 | int middle = (left + right) / 2;
20 | root->value = source[middle];
21 |
22 | if (left >= right) {
23 | return;
24 | }
25 |
26 | if (middle == left) {
27 | root->left = nullptr;
28 | } else {
29 | root->left = new Tree();
30 | build_subtree(root->left, source, left, middle - 1);
31 | }
32 |
33 | if (middle == right) {
34 | root->right = nullptr;
35 | } else {
36 | root->right = new Tree();
37 | build_subtree(root->right, source, middle + 1, right);
38 | }
39 | }
40 |
41 | __global__ void build_tree(const int *source, unsigned int length, Tree *root)
42 | {
43 | build_subtree(root, source, 0, length - 1);
44 | }
45 |
46 | __device__ void destroy_subtree(Tree *root)
47 | {
48 | if (root->left) {
49 | destroy_subtree(root->left);
50 | }
51 | if (root->right) {
52 | destroy_subtree(root->right);
53 | }
54 | delete root;
55 | }
56 |
57 | __global__ void destroy_tree(Tree *root)
58 | {
59 | if (root->left) {
60 | destroy_subtree(root->left);
61 | }
62 | if (root->right) {
63 | destroy_subtree(root->right);
64 | }
65 | // Do not destroy root! It was allocated with cudaMalloc and must be freed
66 | // from host code.
67 | }
68 |
69 | __global__ void sum_tree(const Tree *root, int *result)
70 | {
71 | // Allocate temporary global memory for storing subtree results
72 | int *left_sum = new int;
73 | int *right_sum = new int;
74 |
75 | // Create independent streams to sum each subtree
76 | cudaStream_t left_stream, right_stream;
77 | kernelCheckError(
78 | cudaStreamCreateWithFlags(&left_stream, cudaStreamNonBlocking));
79 | kernelCheckError(
80 | cudaStreamCreateWithFlags(&right_stream, cudaStreamNonBlocking));
81 |
82 | if (root->left) {
83 | sum_tree<<<1, 1, 0, left_stream>>>(root->left, left_sum);
84 | } else {
85 | *left_sum = 0;
86 | }
87 |
88 | if (root->right) {
89 | sum_tree<<<1, 1, 0, right_stream>>>(root->right, right_sum);
90 | } else {
91 | *right_sum = 0;
92 | }
93 |
94 | // Wait for both streams to finish
95 | kernelCheckError(cudaDeviceSynchronize());
96 |
97 | *result = root->value + *left_sum + *right_sum;
98 |
99 | kernelCheckError(cudaStreamDestroy(left_stream));
100 | kernelCheckError(cudaStreamDestroy(right_stream));
101 |
102 | delete left_sum;
103 | delete right_sum;
104 | }
105 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.7/bst-sum-kernels.cuh: --------------------------------------------------------------------------------
1 | // Header for bst-sum example, used in video 6.7.
2 |
3 | struct Tree {
4 | int value;
5 | Tree *left;
6 | Tree *right;
7 | };
8 |
9 | __global__ void build_tree(const int *source, unsigned int length, Tree *root);
10 | __global__ void destroy_tree(Tree *root);
11 | __global__ void sum_tree(const Tree *root, int *result);
12 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.7/bst-sum-kernels.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.7/bst-sum-kernels.o -------------------------------------------------------------------------------- /Code files/Section 6/6.7/bst-sum.cu: --------------------------------------------------------------------------------
1 | // Sum the contents of a binary search tree on the device, using dynamic
2 | // parallelism.
3 | // Example for video 6.7.
4 |
5 | #include <thrust/device_vector.h>
6 | #include <thrust/reduce.h>
7 | #include <thrust/sequence.h>
8 |
9 | #include <cassert>
10 |
11 | // Standard CUDA API functions
12 | #include <cuda_runtime_api.h>
13 |
14 | #include "bst-sum-kernels.cuh"
15 | #include "../utils.h"
16 |
17 | int main(int argc, char **argv)
18 | {
19 | const unsigned int COUNT = 128;
20 |
21 | // CUDA needs to reserve some device memory to manage synchronization for
22 | // nested kernels. If we exceed the maximum reserved depth, our kernel will
23 | // fail. This setting is sufficient for 128 elements. It should be adjusted
24 | // if COUNT is changed.
25 | cudaCheckError(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 8));
26 |
27 | // Create device vector with sequential integers
28 | thrust::device_vector<int> source(COUNT);
29 | thrust::sequence(source.begin(), source.end());
30 |
31 | // Build the tree
32 | Tree *root;
33 | cudaCheckError(cudaMalloc(&root, sizeof(Tree)));
34 |
35 | // Build the tree from a sorted array
36 | build_tree<<<1, 1>>>(thrust::raw_pointer_cast(&source[0]), source.size(),
37 | root);
38 |
39 | // Reduce
40 | int *result_dev;
41 | cudaCheckError(cudaMalloc(&result_dev, sizeof(int)));
42 |
43 | sum_tree<<<1, 1>>>(root, result_dev);
44 |
45 | // Check results
46 | int result;
47 | cudaCheckError(
48 | cudaMemcpy(&result, result_dev, sizeof(int), cudaMemcpyDefault));
49 | int reference =
50 | thrust::reduce(source.begin(), source.end(), 0, thrust::plus<int>());
51 |
52 | printf("Sum of %u elements: %d\n", COUNT, result);
53 | assert(result == reference);
54 |
55 | // Clean up
56 | destroy_tree<<<1, 1>>>(root);
57 | cudaCheckError(cudaFree(root));
58 |
59 | cudaCheckError(cudaDeviceSynchronize());
60 | }
61 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.7/bst-sum.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.7/bst-sum.o -------------------------------------------------------------------------------- /Code files/utils.cu: --------------------------------------------------------------------------------
1 | // Utility functions for example programs.
2 |
3 | #include <cassert>
4 | #include <cstdio>
5 | #include <cstdlib>
6 | #include <cstring>
7 | #include <fstream>
8 | #include <iostream>
9 |
10 | #include <memory>
11 |
12 | #include "utils.h"
13 |
14 | const unsigned int HEADER_SIZE = 0x40;
15 | const unsigned int CHANNELS = 3;
16 |
17 | bool loadPPM(const char *file, pixel **data, unsigned int *w, unsigned int *h)
18 | {
19 | FILE *fp = fopen(file, "rb");
20 |
21 | if (!fp) {
22 | std::cerr << "loadPPM() : failed to open file: " << file << "\n";
23 | return false;
24 | }
25 |
26 | // check header
27 | char header[HEADER_SIZE];
28 |
29 | if (fgets(header, HEADER_SIZE, fp) == nullptr) {
30 | std::cerr << "loadPPM(): reading header returned NULL\n";
31 | return false;
32 | }
33 |
34 | if (strncmp(header, "P6", 2)) {
35 | std::cerr << "unsupported image format\n";
36 | return false;
37 | }
38 |
39 | // parse header, read maxval, width and height
40 | unsigned int width = 0;
41 | unsigned int height = 0;
42 | unsigned int maxval = 0;
43 | unsigned int i = 0;
44 |
45 | while (i < 3) {
46 | if (fgets(header, HEADER_SIZE, fp) == NULL) {
47 | std::cerr << "loadPPM() : reading PPM header returned NULL" << std::endl;
48 | return false;
49 | }
50 |
51 | if (header[0] == '#') {
52 | continue;
53 | }
54 |
55 | if (i == 0) {
56 | i += sscanf(header, "%u %u %u", &width, &height, &maxval);
57 | } else if (i == 1) {
58 | i += sscanf(header, "%u %u", &height, &maxval);
59 | } else if (i == 2) {
60 | i += sscanf(header, "%u", &maxval);
61 | }
62 | }
63 |
64 | size_t pixel_count = width * height;
65 | size_t data_size = sizeof(unsigned char) * pixel_count * CHANNELS;
66 | unsigned char *raw_data = static_cast<unsigned char *>(malloc(data_size));
67 | *w = width;
68 | *h = height;
69 |
70 | // read and close file
71 | if (fread(raw_data, sizeof(unsigned char), pixel_count * CHANNELS, fp) == 0) {
72 | std::cerr << "loadPPM() read data returned error.\n";
73 | }
74 | fclose(fp);
75 |
76 | pixel *pixel_data = static_cast<pixel *>(malloc(pixel_count * sizeof(pixel)));
77 | float scale = 1.0f / 255.0f;
78 | for (int i = 0; i < pixel_count; i++) {
79 | pixel_data[i].red = raw_data[3 * i] * scale;
80 | pixel_data[i].green = raw_data[3 * i + 1] * scale;
81 | pixel_data[i].blue = raw_data[3 * i + 2] * scale;
82 | }
83 |
84 | *data = pixel_data;
85 | free(raw_data);
86 |
87 | return true;
88 | }
89 |
90 | void savePPM(const char *file, pixel *data, unsigned int w, unsigned int h)
91 | {
92 | assert(data != nullptr);
93 | assert(w > 0);
94 | assert(h > 0);
95 |
96 | std::fstream fh(file, std::fstream::out | std::fstream::binary);
97 |
98 | if (fh.bad()) {
99 | std::cerr << "savePPM() : open failed.\n";
100 | return;
101 | }
102 |
103 | fh << "P6\n";
104 | fh << w << "\n" << h << "\n" << 0xff << "\n";
105 |
106 | unsigned int pixel_count = w * h;
107 | for (unsigned int i = 0; (i < pixel_count) && fh.good(); ++i) {
108 | fh << static_cast<char>(data[i].red * 255);
109 | fh << static_cast<char>(data[i].green * 255);
110 | fh << static_cast<char>(data[i].blue * 255);
111 | }
112 |
113 | fh.flush();
114 |
115 | if (fh.bad()) {
116 | std::cerr << "savePPM() : writing data failed.\n";
117 | return;
118 | }
119 |
120 | fh.close();
121 | }
122 |
123 | test_params set_up_test(int argc, char **argv)
124 | {
125 | test_params params = {0, 0, nullptr, nullptr, nullptr};
126 |
127 | bool show_help = false;
128 | for (int i = 1; i < argc; i++) {
129 | char *current = argv[i];
130 | if (!strncmp(current, "--", 2)) {
131 | show_help = true;
132 | break;
133 | } else if (params.input_image == nullptr) {
134 | // Load input
135 | pixel *host_image = nullptr;
136 | if (!loadPPM(current, &host_image, &params.width, &params.height)) {
137 | exit(1);
138 | }
139 |
140 | size_t image_size = params.width * params.height * sizeof(pixel);
141 | cudaCheckError(cudaMalloc(&params.input_image, image_size));
142 | cudaCheckError(cudaMalloc(&params.output_image, image_size));
143 | cudaCheckError(cudaMemcpy(params.input_image, host_image, image_size,
144 | cudaMemcpyHostToDevice));
145 |
146 | } else if (params.output_file == nullptr) {
147 | // Save output filename
148 | params.output_file = current;
149 | } else {
150 | show_help = true;
151 | break;
152 | }
153 | }
154 |
155 | if (!params.output_file || !params.input_image) {
156 | show_help = true;
157 | }
158 |
159 | if (show_help) {
160 | std::cout << "Usage: " << argv[0] << " INPUT_FILE OUTPUT_FILE\n";
161 | exit(1);
162 | }
163 |
164 | return params;
165 | }
166 |
167 | void finish_test(const test_params &params)
168 | {
169 | std::unique_ptr<pixel[]> host_image(new pixel[params.width * params.height]);
170 |
171 | cudaCheckError(cudaMemcpy(host_image.get(), params.output_image,
172 | params.width * params.height * sizeof(pixel),
173 | cudaMemcpyDeviceToHost));
174 | if (params.input_image) {
175 | cudaCheckError(cudaFree(params.input_image));
176 | }
177 | if (params.output_image) {
178 | cudaCheckError(cudaFree(params.output_image));
179 | }
180 |
181 | savePPM(params.output_file, host_image.get(), params.width, params.height);
182 | }
183 |
184 | __global__ void unpack_image(image planar, const pixel *packed, int pixel_count)
185 | {
186 | int index = blockIdx.x * blockDim.x + threadIdx.x;
187 | if (index >= pixel_count) return;
188 |
189 | planar.red[index] = packed[index].red;
190 | planar.green[index] = packed[index].green;
191 | planar.blue[index] = packed[index].blue;
192 | }
193 |
194 | __global__ void pack_image(const image planar, pixel *packed, int pixel_count)
195 | {
196 | int index = blockIdx.x * blockDim.x + threadIdx.x;
197 | if (index >= pixel_count) return;
198 |
199 | packed[index].red = planar.red[index];
200 | packed[index].green = planar.green[index];
201 | packed[index].blue = planar.blue[index];
202 | }
203 |
204 | image malloc_image(int pixel_count)
205 | {
206 | image result;
207 | cudaCheckError(cudaMalloc(&result.red, pixel_count * sizeof(float)));
208 | cudaCheckError(cudaMalloc(&result.green, pixel_count * sizeof(float)));
209 | cudaCheckError(cudaMalloc(&result.blue, pixel_count * sizeof(float)));
210 |
211 | return result;
212 | }
213 |
214 | void free_image(const image &img)
215 | {
216 | cudaCheckError(cudaFree(img.red));
217 | cudaCheckError(cudaFree(img.green));
218 | cudaCheckError(cudaFree(img.blue));
219 | }
220 |
221 | const int BLOCK_SIZE = 128;
222 |
223 | test_params_planar set_up_test_planar(int argc, char **argv)
224 | {
225 | test_params params1 = set_up_test(argc, argv);
226 | test_params_planar params = {
227 | params1.width, params1.height, {}, {}, params1.output_file};
228 |
229 | int pixel_count = params.width * params.height;
230 | params.input_image = malloc_image(pixel_count);
231 | params.output_image = malloc_image(pixel_count);
232 |
233 | int n_blocks = (pixel_count + BLOCK_SIZE - 1) / BLOCK_SIZE;
234 | unpack_image<<<n_blocks, BLOCK_SIZE>>>(params.input_image,
235 | params1.input_image, pixel_count);
236 |
237 | cudaCheckError(cudaFree(params1.input_image));
238 | params1.input_image = nullptr;
239 |
240 | return params;
241 | }
242 |
243 | void finish_test_planar(const test_params_planar &params)
244 | {
245 | free_image(params.input_image);
246 |
247 | test_params params1 = {params.width, params.height, nullptr, nullptr,
248 | params.output_file};
249 |
250 | int pixel_count = params.width * params.height;
251 | cudaCheckError(
252 | cudaMalloc(&params1.output_image, pixel_count * sizeof(pixel)));
253 |
254 | int n_blocks = (pixel_count + BLOCK_SIZE - 1) / BLOCK_SIZE;
255 | pack_image<<<n_blocks, BLOCK_SIZE>>>(params.output_image,
256 | params1.output_image, pixel_count);
257 |
258 | free_image(params.output_image);
259 |
260 | finish_test(params1);
261 | }
262 |
263 | KernelTimer::KernelTimer()
264 | {
265 | cudaCheckError(cudaDeviceSynchronize());
266 | start = std::chrono::steady_clock::now();
267 | }
268 |
269 | KernelTimer::~KernelTimer()
270 | {
271 | cudaCheckError(cudaDeviceSynchronize());
272 | auto end = std::chrono::steady_clock::now();
273 | auto elapsed =
274 | std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
275 | .count();
276 | std::cout << "kernel ran in " << elapsed << " ms\n";
277 | }
278 |
-------------------------------------------------------------------------------- /Code files/utils.h: --------------------------------------------------------------------------------
1 | // Utility functions for example programs.
2 |
3 | #ifndef __UTILS_H
4 | #define __UTILS_H
5 |
6 | #include <chrono>
7 |
8 | // Error checking macro
9 | #define cudaCheckError(code) \
10 | { \
11 | if ((code) != cudaSuccess) { \
12 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \
13 | cudaGetErrorString(code)); \
14 | } \
15 | }
16 |
17 | /* A single pixel with floating-point channel values */
18 | struct pixel {
19 | float red;
20 | float green;
21 | float blue;
22 | float alpha;
23 | };
24 |
25 | /* An image with planar layout: separate buffers for each color channel */
26 | struct image {
27 | float *red;
28 | float *green;
29 | float *blue;
30 | };
31 |
32 | bool loadPPM(const char *file, pixel **data, unsigned int *w, unsigned int *h);
33 | void savePPM(const char *file, pixel *data, unsigned int w, unsigned int h);
34 |
35 | struct test_params {
36 | unsigned int width;
37 | unsigned int height;
38 | /* Device pointers to images */
39 | pixel *input_image;
40 | pixel *output_image;
41 | const char *output_file;
42 | };
43 |
44 | struct test_params_planar {
45 | unsigned int width;
46 | unsigned int height;
47 | /* Device pointers to images */
48 | image input_image;
49 | image output_image;
50 | const char *output_file;
51 | };
52 |
53 | test_params set_up_test(int argc, char **argv);
54 | void finish_test(const test_params &params);
55 | test_params_planar set_up_test_planar(int argc, char **argv);
56 | void finish_test_planar(const test_params_planar &params);
57 | void free_image(const image &img);
58 |
59 | class KernelTimer
60 | {
61 | public:
62 | KernelTimer();
63 | ~KernelTimer();
64 |
65 | private:
66 | std::chrono::time_point<std::chrono::steady_clock> start;
67 | };
68 |
69 | #endif // __UTILS_H
70 |
-------------------------------------------------------------------------------- /Code files/utils.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/utils.o -------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Packt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including
without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning-CUDA-10-Programming 2 | Learning CUDA 10 Programming, published by Packt 3 | This is the code repository for [Learning CUDA 10 Programming]( https://www.packtpub.com/programming/learning-cuda-10-programming-video), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the video course from start to finish. 4 | ## About the Video Course 5 | Do you want to write GPU-accelerated applications, but don't know how to get started? With CUDA 10, you can easily add GPU processing to your C and C++ projects. CUDA 10 is the de-facto framework used to develop high-performance, GPU-accelerated applications. 6 | In this course, you will be introduced to CUDA programming through hands-on examples. CUDA provides a general-purpose programming model which gives you access to the tremendous computational power of modern GPUs, as well as powerful libraries for machine learning, image processing, linear algebra, and parallel algorithms. 7 | After working through this course, you will understand the fundamentals of CUDA programming and be able to start using it in your applications right away. 8 | 9 |

What You Will Learn

10 |
11 |
    12 |
  • Use CUDA to speed up your applications using machine learning, image processing, linear algebra, and more functions 13 |
  • Learn to debug CUDA programs and handle errors 14 |
  • Use optimization techniques to get the maximum performance from your CUDA programs 15 |
  • Master the fundamentals of concurrency and parallel algorithms on GPUs 16 |
  • Learn about the wide range of GPU-accelerated libraries included with CUDA 17 |
  • Learn the next steps you can take to continue building your CUDA skills
18 | 19 | ## Instructions and Navigation 20 | ### Assumed Knowledge 21 | If you want to learn how to use parallel and high-performance computing techniques to develop modern applications using GPUs and CUDA, then this course is for you. A good understanding of programming in modern C++ (C++17) is required in order to implement the concepts in this course. 22 | 23 | ### Technical Requirements 24 |
    25 | Minimum Hardware Requirements 26 |
  • OS: Windows, MacOS, or Linux 27 |
  • Processor: any 64-bit Intel or AMD processor 28 |
  • Memory: 2GB of RAM 29 |
  • Storage: 3GB of free space 30 |
31 | 32 |
    33 | Recommended Hardware Requirements 34 |
  • For an optimal experience with hands-on labs and other practical activities, we recommend the following configuration: 35 |
  • OS: Windows 10 version 1703 or higher: Home, Professional, Education and Enterprise (LTSC and S are not supported) 36 | 1.8 GHz or faster processor. Quad-core or better recommended. 37 |
  • Memory: 2GB; 8GB of RAM recommended (2.5 GB minimum if running on a Virtual Machine) 38 |
  • Storage: Minimum of 800 MB up to 210 GB of disk space depending on the features installed. 39 |
  • Video Card that supports a minimum display resolution of 720p (1280 by 720); Visual Studio will work best at a resolution of WXGA (1366 by 768) or higher.
40 | 41 | 42 |
    43 | System Requirements 44 |
  • OS: Windows, MacOS, or Linux 45 |
  • Processor: any 64-bit Intel or AMD processor 46 |
  • Memory: 8GB of RAM 47 |
  • Storage: 30GB of free space
48 | 49 |
    50 | Software Requirements 51 |
  • CUDA Toolkit Version 10.1 or later: https://developer.nvidia.com/cuda-downloads
52 | 53 | 54 | ## Related Products 55 | * [C++ Programming By Example [Video]](https://www.packtpub.com/application-development/c-programming-example-video) 56 | 57 | * [High-Performance Computing with Python 3.x [Video]](https://www.packtpub.com/application-development/high-performance-computing-python-3x-video?utm_source=github&utm_medium=repository&utm_campaign=9781789956252) 58 | 59 | * [Functional Programming in 7 Days [Video]](https://www.packtpub.com/application-development/functional-programming-7-days-video?utm_source=github&utm_medium=repository&utm_campaign=9781788990295) 60 | -------------------------------------------------------------------------------- /Section 1/1.1_JJ_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 1/1.1_JJ_MC.pptx -------------------------------------------------------------------------------- /Section 1/1.2_YM_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 1/1.2_YM_MC.pptx -------------------------------------------------------------------------------- /Section 1/1.3_YM_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 1/1.3_YM_MC.pptx -------------------------------------------------------------------------------- /Section 1/1.4_YM_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 1/1.4_YM_MC.pptx -------------------------------------------------------------------------------- /Section 1/1.5_YM_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 1/1.5_YM_MC.pptx -------------------------------------------------------------------------------- /Section 2/2.1_TK_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 2/2.1_TK_MC.pptx -------------------------------------------------------------------------------- /Section 2/2.2_TK_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 2/2.2_TK_MC.pptx -------------------------------------------------------------------------------- /Section 2/2.3_TK_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 2/2.3_TK_MC.pptx -------------------------------------------------------------------------------- /Section 2/2.4_TK_MC.pptx: -------------------------------------------------------------------------------- 