├── Code files ├── Section 1 │ └── 1.5 │ │ ├── Makefile │ │ ├── array-add │ │ └── array-add.cu ├── Section 2 │ ├── 2.2 │ │ ├── Makefile │ │ ├── launch-bounds-1d │ │ ├── launch-bounds-1d.cu │ │ ├── launch-bounds-2d-a │ │ ├── launch-bounds-2d-a.cu │ │ ├── launch-bounds-2d-b │ │ ├── launch-bounds-2d-b.cu │ │ ├── occupancy-api │ │ └── occupancy-api.cu │ ├── 2.3 │ │ ├── Makefile │ │ ├── array-add-bug │ │ └── array-add-bug.cu │ ├── 2.4 │ │ ├── Makefile │ │ ├── array-add-bug │ │ └── array-add-bug.cu │ └── 2.5 │ │ ├── Makefile │ │ ├── error-handling │ │ └── error-handling.cu ├── Section 3 │ ├── 3.1 │ │ ├── Makefile │ │ ├── monochrome │ │ └── monochrome.cu │ ├── 3.2 │ │ ├── Makefile │ │ ├── monochrome │ │ └── monochrome.cu │ ├── 3.3 │ │ ├── Makefile │ │ ├── spotlights │ │ ├── spotlights-2d │ │ ├── spotlights-2d.cu │ │ └── spotlights.cu │ ├── 3.4 │ │ ├── Makefile │ │ ├── manylights-const │ │ ├── manylights-const.cu │ │ ├── manylights1.cu │ │ ├── manylights2 │ │ ├── manylights2.cu │ │ ├── warp │ │ ├── warp-texture │ │ ├── warp-texture.cu │ │ └── warp.cu │ └── 3.5 │ │ ├── Makefile │ │ ├── manylights-ilp │ │ └── manylights-ilp.cu ├── Section 4 │ ├── 4.1 │ │ ├── Makefile │ │ ├── transpose │ │ ├── transpose-shared │ │ ├── transpose-shared.cu │ │ └── transpose.cu │ ├── 4.2 │ │ ├── Makefile │ │ ├── reduce │ │ └── reduce.cu │ ├── 4.3 │ │ ├── Makefile │ │ ├── scan │ │ └── scan.cu │ └── 4.4 │ │ ├── Makefile │ │ ├── filter │ │ └── filter.cu ├── Section 5 │ └── 5.4 │ │ ├── Makefile │ │ ├── thrust │ │ └── thrust.cu ├── Section 6 │ ├── 6.1 │ │ ├── Makefile │ │ ├── reduce-stream │ │ └── reduce-stream.cu │ ├── 6.2 │ │ ├── Makefile │ │ ├── scan-page-locked │ │ ├── scan-page-locked.cu │ │ ├── scan-stream │ │ └── scan-stream.cu │ ├── 6.4 │ │ ├── Makefile │ │ ├── scan-multi-device │ │ └── scan-multi-device.cu │ ├── 6.5 │ │ ├── Makefile │ │ ├── scan-unified │ │ └── scan-unified.cu │ ├── 6.6 │ │ ├── Makefile │ │ ├── bst │ │ └── bst.cu │ └── 6.7 │ │ ├── Makefile │ │ ├── bst-sum │ │ ├── bst-sum-kernels.cu │ │ ├── bst-sum-kernels.cuh │ │ ├── bst-sum-kernels.o │ │ ├── bst-sum.cu │ │ └── bst-sum.o ├── utils.cu ├── utils.h └── utils.o ├── LICENSE ├── README.md ├── Section 1 ├── 1.1_JJ_MC.pptx ├── 1.2_YM_MC.pptx ├── 1.3_YM_MC.pptx ├── 1.4_YM_MC.pptx └── 1.5_YM_MC.pptx ├── Section 2 ├── 2.1_TK_MC.pptx ├── 2.2_TK_MC.pptx ├── 2.3_TK_MC.pptx ├── 2.4_TK_MC.pptx └── 2.5_TK_MC.pptx ├── Section 3 ├── 3.1_YM_MC.pptx ├── 3.2_YM_MC.pptx ├── 3.3_YM_MC.pptx ├── 3.4_YM_MC.pptx └── 3.5_YM_MC.pptx ├── Section 4 ├── 4.1_YM_MC.pptx ├── 4.2_YM_MC.pptx ├── 4.3_YM_MC.pptx └── 4.4_YM_MC.pptx ├── Section 5 ├── 5.1_TK_MC.pptx ├── 5.2_TK_MC.pptx ├── 5.3_TK_MC.pptx └── 5.4_TK_MC.pptx ├── Section 6 ├── 6.1_TK_MC.pptx ├── 6.2_TK_MC.pptx ├── 6.3_TK_MC.pptx ├── 6.4_TK_MC.pptx ├── 6.5_TK_MC.pptx ├── 6.6_TK_MC.pptx └── 6.7_TK_MC.pptx └── Section 7 ├── 7.1_TK_MC.pptx ├── 7.2_TK_MC.pptx └── 7.3_TK_MC.pptx /Code files/Section 1/1.5/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = array-add 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c 
bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 1/1.5/array-add: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 1/1.5/array-add -------------------------------------------------------------------------------- /Code files/Section 1/1.5/array-add.cu: -------------------------------------------------------------------------------- 1 | // Add two arrays using CUDA. 2 | // Example for videos 1.5 and 2.1 3 | 4 | #include <assert.h> 5 | #include <stdio.h> 6 | 7 | // Standard CUDA API functions 8 | #include <cuda_runtime_api.h> 9 | 10 | // Error checking macro 11 | #define cudaCheckError(code) \ 12 | { \ 13 | if ((code) != cudaSuccess) { \ 14 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \ 15 | cudaGetErrorString(code)); \ 16 | } \ 17 | } 18 | 19 | // Host function for array addition 20 | void add_loop(float *dest, int n_elts, const float *a, const float *b) 21 | { 22 | for (int i = 0; i < n_elts; i++) { 23 | dest[i] = a[i] + b[i]; 24 | } 25 | } 26 | 27 | // Device kernel for array addition. 28 | __global__ void add_kernel(float *dest, int n_elts, const float *a, 29 | const float *b) 30 | { 31 | int index = blockIdx.x * blockDim.x + threadIdx.x; 32 | if (index >= n_elts) return; 33 | 34 | dest[index] = a[index] + b[index]; 35 | } 36 | 37 | int main() 38 | { 39 | const int ARRAY_LENGTH = 100; 40 | 41 | // Generate some data on the host 42 | float host_array_a[ARRAY_LENGTH]; 43 | float host_array_b[ARRAY_LENGTH]; 44 | float host_array_dest[ARRAY_LENGTH]; 45 | 46 | for (int i = 0; i < ARRAY_LENGTH; i++) { 47 | host_array_a[i] = 2 * i; 48 | host_array_b[i] = 2 * i + 1; 49 | } 50 | 51 | // Allocate device memory 52 | float *device_array_a, *device_array_b, *device_array_dest; 53 | cudaCheckError(cudaMalloc(&device_array_a, sizeof(host_array_a))); 54 | cudaCheckError(cudaMalloc(&device_array_b, sizeof(host_array_b))); 55 | cudaCheckError(cudaMalloc(&device_array_dest, sizeof(host_array_dest))); 56 | 57 | // Transfer data to device 58 | cudaCheckError(cudaMemcpy(device_array_a, host_array_a, sizeof(host_array_a), 59 | cudaMemcpyHostToDevice)); 60 | cudaCheckError(cudaMemcpy(device_array_b, host_array_b, sizeof(host_array_b), 61 | cudaMemcpyHostToDevice)); 62 | 63 | // Calculate launch configuration 64 | const int BLOCK_SIZE = 128; 65 | int n_blocks = (ARRAY_LENGTH + BLOCK_SIZE - 1) / BLOCK_SIZE; 66 | 67 | // Add arrays on device 68 | add_kernel<<<n_blocks, BLOCK_SIZE>>>(device_array_dest, ARRAY_LENGTH, 69 | device_array_a, device_array_b); 70 | 71 | // Meanwhile, add arrays on the host, for comparison 72 | add_loop(host_array_dest, ARRAY_LENGTH, host_array_a, host_array_b); 73 | 74 | // Copy result back to host and compare 75 | float host_array_tmp[ARRAY_LENGTH]; 76 | cudaCheckError(cudaMemcpy(host_array_tmp, device_array_dest, 77 | sizeof(host_array_tmp), cudaMemcpyDeviceToHost)); 78 | for (int i = 0; i < ARRAY_LENGTH; i++) { 79 | assert(host_array_tmp[i] == host_array_dest[i]); 80 | printf("%g + %g = %g\n", host_array_a[i], host_array_b[i], 81 | host_array_tmp[i]); 82 | } 83 | 84 | return 0; 85 | } 86 | --------------------------------------------------------------------------------
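A note on the launch arithmetic in array-add.cu: (ARRAY_LENGTH + BLOCK_SIZE - 1) / BLOCK_SIZE rounds the grid size up so every element gets a thread, which is why the kernel guards with index >= n_elts. A common alternative, sketched below (not part of the repository; add_kernel_strided is a hypothetical name), is a grid-stride loop that handles any array length without recomputing the grid size:

__global__ void add_kernel_strided(float *dest, int n_elts, const float *a,
                                   const float *b)
{
  // Each thread starts at its global index and advances by the total number
  // of threads in the grid, so any grid size covers any n_elts.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n_elts;
       i += blockDim.x * gridDim.x) {
    dest[i] = a[i] + b[i];
  }
}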
/Code files/Section 2/2.2/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = launch-bounds-1d launch-bounds-2d-a launch-bounds-2d-b occupancy-api 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 2/2.2/launch-bounds-1d: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 2/2.2/launch-bounds-1d -------------------------------------------------------------------------------- /Code files/Section 2/2.2/launch-bounds-1d.cu: -------------------------------------------------------------------------------- 1 | // Demonstration of kernel execution configuration for a one-dimensional 2 | // grid. 3 | // Example for video 2.2. 4 | 5 | #include <stdio.h> 6 | #include <cuda_runtime_api.h> 7 | 8 | // Error checking macro 9 | #define cudaCheckError(code) \ 10 | { \ 11 | if ((code) != cudaSuccess) { \ 12 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \ 13 | cudaGetErrorString(code)); \ 14 | } \ 15 | } 16 | 17 | __global__ void kernel_1d() 18 | { 19 | int index = blockIdx.x * blockDim.x + threadIdx.x; 20 | printf("block %d, thread %d, index %d\n", blockIdx.x, threadIdx.x, index); 21 | } 22 | 23 | int main() 24 | { 25 | kernel_1d<<<4, 8>>>(); 26 | cudaCheckError(cudaDeviceSynchronize()); 27 | } 28 | -------------------------------------------------------------------------------- /Code files/Section 2/2.2/launch-bounds-2d-a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 2/2.2/launch-bounds-2d-a -------------------------------------------------------------------------------- /Code files/Section 2/2.2/launch-bounds-2d-a.cu: -------------------------------------------------------------------------------- 1 | // Example of generating two-dimensional data coordinates from a 2 | // one-dimensional grid. A two-dimensional grid would be better suited here. 3 | // Example for video 2.2.
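// A worked instance of the index math in this file (added note, not in the
// original source): with blockDim.x = 8, block 2 / thread 3 gives
// index = 2 * 8 + 3 = 19, and with width = 16 that maps to
// (x, y) = (19 % 16, 19 / 16) = (3, 1).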
4 | 5 | #include <stdio.h> 6 | #include <cuda_runtime_api.h> 7 | 8 | // Error checking macro 9 | #define cudaCheckError(code) \ 10 | { \ 11 | if ((code) != cudaSuccess) { \ 12 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \ 13 | cudaGetErrorString(code)); \ 14 | } \ 15 | } 16 | 17 | __global__ void kernel_1d(int width) 18 | { 19 | int index = blockIdx.x * blockDim.x + threadIdx.x; 20 | int x = index % width; 21 | int y = index / width; 22 | printf("block %d, thread %d, index (%d, %d)\n", blockIdx.x, threadIdx.x, x, 23 | y); 24 | } 25 | 26 | int main() 27 | { 28 | kernel_1d<<<4, 8>>>(16); 29 | cudaCheckError(cudaDeviceSynchronize()); 30 | } 31 | -------------------------------------------------------------------------------- /Code files/Section 2/2.2/launch-bounds-2d-b: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 2/2.2/launch-bounds-2d-b -------------------------------------------------------------------------------- /Code files/Section 2/2.2/launch-bounds-2d-b.cu: -------------------------------------------------------------------------------- 1 | // Demonstration of kernel execution configuration for a two-dimensional 2 | // grid. 3 | // Example for video 2.2. 4 | 5 | #include <stdio.h> 6 | #include <cuda_runtime_api.h> 7 | 8 | // Error checking macro 9 | #define cudaCheckError(code) \ 10 | { \ 11 | if ((code) != cudaSuccess) { \ 12 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \ 13 | cudaGetErrorString(code)); \ 14 | } \ 15 | } 16 | 17 | __global__ void kernel_2d() 18 | { 19 | int x = blockIdx.x * blockDim.x + threadIdx.x; 20 | int y = blockIdx.y * blockDim.y + threadIdx.y; 21 | printf("block (%d, %d), thread (%d, %d), index (%d, %d)\n", blockIdx.x, 22 | blockIdx.y, threadIdx.x, threadIdx.y, x, y); 23 | } 24 | 25 | int main() 26 | { 27 | dim3 block_dim(8, 2); 28 | dim3 grid_dim(2, 1); 29 | kernel_2d<<<grid_dim, block_dim>>>(); 30 | cudaCheckError(cudaDeviceSynchronize()); 31 | } 32 | -------------------------------------------------------------------------------- /Code files/Section 2/2.2/occupancy-api: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 2/2.2/occupancy-api -------------------------------------------------------------------------------- /Code files/Section 2/2.2/occupancy-api.cu: -------------------------------------------------------------------------------- 1 | // Demonstration of the CUDA occupancy API. 2 | // Example for video 2.2.
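// Usage sketch (an added note, not in the original file): the two values
// computed below are typically fed straight into a launch. For n data
// elements, a caller would round up much like the other examples do:
//   int grid_size = (n + block_size - 1) / block_size;
//   kernel_1d<<<grid_size, block_size>>>();
// min_grid_size is the smallest grid that can still reach full occupancy.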
3 | 4 | #include <stdio.h> 5 | #include <cuda_runtime.h> 6 | 7 | __global__ void kernel_1d() {} 8 | 9 | int main() 10 | { 11 | int block_size; // The launch configurator returned block size 12 | int min_grid_size; // The minimum grid size needed to achieve max occupancy 13 | 14 | cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, kernel_1d, 0, 15 | 0); 16 | 17 | printf("Block size %d\nMin grid size %d\n", block_size, min_grid_size); 18 | } 19 | -------------------------------------------------------------------------------- /Code files/Section 2/2.3/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = array-add-bug 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 2/2.3/array-add-bug: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 2/2.3/array-add-bug -------------------------------------------------------------------------------- /Code files/Section 2/2.3/array-add-bug.cu: -------------------------------------------------------------------------------- 1 | #include <assert.h> 2 | #include <stdio.h> 3 | 4 | // Standard CUDA API functions 5 | #include <cuda_runtime_api.h> 6 | 7 | // Error checking macro 8 | #define cudaCheckError(code) \ 9 | { \ 10 | if ((code) != cudaSuccess) { \ 11 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \ 12 | cudaGetErrorString(code)); \ 13 | } \ 14 | } 15 | 16 | // Host function for array addition 17 | void add_loop(float *dest, int n_elts, const float *a, const float *b) 18 | { 19 | for (int i = 0; i < n_elts; i++) { 20 | dest[i] = a[i] + b[i]; 21 | } 22 | } 23 | 24 | // Device kernel for array addition.
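// Note (added comment, not in the original source): in main() below, the
// kernel launch passes nullptr as input b instead of device_array_b. That is
// the deliberate bug this example demonstrates; tools such as cuda-memcheck
// flag the resulting invalid read.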
25 | __global__ void add_kernel(float *dest, int n_elts, const float *a, 26 | const float *b) 27 | { 28 | int index = blockIdx.x * blockDim.x + threadIdx.x; 29 | if (index >= n_elts) return; 30 | 31 | dest[index] = a[index] + b[index]; 32 | } 33 | 34 | int main() 35 | { 36 | const int ARRAY_LENGTH = 10000; 37 | 38 | // Generate some data on the host 39 | float host_array_a[ARRAY_LENGTH]; 40 | float host_array_b[ARRAY_LENGTH]; 41 | float host_array_dest[ARRAY_LENGTH]; 42 | 43 | for (int i = 0; i < ARRAY_LENGTH; i++) { 44 | host_array_a[i] = 2 * i; 45 | host_array_b[i] = 2 * i + 1; 46 | } 47 | 48 | // Allocate device memory 49 | float *device_array_a, *device_array_b, *device_array_dest; 50 | cudaCheckError(cudaMalloc(&device_array_a, sizeof(host_array_a))); 51 | cudaCheckError(cudaMalloc(&device_array_b, sizeof(host_array_b))); 52 | cudaCheckError(cudaMalloc(&device_array_dest, sizeof(host_array_dest))); 53 | 54 | // Transfer data to device 55 | cudaCheckError(cudaMemcpy(device_array_a, host_array_a, sizeof(host_array_a), 56 | cudaMemcpyHostToDevice)); 57 | cudaCheckError(cudaMemcpy(device_array_b, host_array_b, sizeof(host_array_b), 58 | cudaMemcpyHostToDevice)); 59 | 60 | // Calculate launch configuration 61 | const int BLOCK_SIZE = 128; 62 | int n_blocks = (ARRAY_LENGTH + BLOCK_SIZE - 1) / BLOCK_SIZE; 63 | 64 | // Add arrays on device 65 | add_kernel<<<n_blocks, BLOCK_SIZE>>>(device_array_dest, ARRAY_LENGTH, 66 | device_array_a, nullptr); 67 | 68 | // Meanwhile, add arrays on the host, for comparison 69 | add_loop(host_array_dest, ARRAY_LENGTH, host_array_a, host_array_b); 70 | 71 | // Copy result back to host and compare 72 | float host_array_tmp[ARRAY_LENGTH]; 73 | cudaCheckError(cudaMemcpy(host_array_tmp, device_array_dest, 74 | sizeof(host_array_tmp), cudaMemcpyDeviceToHost)); 75 | 76 | for (int i = 0; i < ARRAY_LENGTH; i++) { 77 | assert(host_array_tmp[i] == host_array_dest[i]); 78 | printf("%g + %g = %g\n", host_array_a[i], host_array_b[i], 79 | host_array_tmp[i]); 80 | } 81 | 82 | return 0; 83 | } 84 | -------------------------------------------------------------------------------- /Code files/Section 2/2.4/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = array-add-bug 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 2/2.4/array-add-bug: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 2/2.4/array-add-bug -------------------------------------------------------------------------------- /Code files/Section 2/2.4/array-add-bug.cu:
-------------------------------------------------------------------------------- 1 | #include <assert.h> 2 | #include <stdio.h> 3 | 4 | // Standard CUDA API functions 5 | #include <cuda_runtime_api.h> 6 | 7 | // Error checking macro 8 | #define cudaCheckError(code) \ 9 | { \ 10 | if ((code) != cudaSuccess) { \ 11 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \ 12 | cudaGetErrorString(code)); \ 13 | } \ 14 | } 15 | 16 | // Host function for array addition 17 | void add_loop(float *dest, int n_elts, const float *a, const float *b) 18 | { 19 | for (int i = 0; i < n_elts; i++) { 20 | dest[i] = a[i] + b[i]; 21 | } 22 | } 23 | 24 | // Device kernel for array addition. 25 | __global__ void add_kernel(float *dest, int n_elts, const float *a, 26 | const float *b) 27 | { 28 | int index = blockIdx.x * blockDim.x + threadIdx.x; 29 | if (index >= n_elts) return; 30 | 31 | dest[index] = a[index] + b[index]; 32 | } 33 | 34 | int main() 35 | { 36 | const int ARRAY_LENGTH = 10000; 37 | 38 | // Generate some data on the host 39 | float host_array_a[ARRAY_LENGTH]; 40 | float host_array_b[ARRAY_LENGTH]; 41 | float host_array_dest[ARRAY_LENGTH]; 42 | 43 | for (int i = 0; i < ARRAY_LENGTH; i++) { 44 | host_array_a[i] = 2 * i; 45 | host_array_b[i] = 2 * i + 1; 46 | } 47 | 48 | // Allocate device memory 49 | float *device_array_a, *device_array_b, *device_array_dest; 50 | cudaCheckError(cudaMalloc(&device_array_a, sizeof(host_array_a))); 51 | cudaCheckError(cudaMalloc(&device_array_b, sizeof(host_array_b))); 52 | cudaCheckError(cudaMalloc(&device_array_dest, sizeof(host_array_dest))); 53 | 54 | // Transfer data to device 55 | cudaCheckError(cudaMemcpy(device_array_a, host_array_a, sizeof(host_array_a), 56 | cudaMemcpyHostToDevice)); 57 | cudaCheckError(cudaMemcpy(device_array_b, host_array_b, sizeof(host_array_b), 58 | cudaMemcpyHostToDevice)); 59 | 60 | // Calculate launch configuration 61 | const int BLOCK_SIZE = 128; 62 | int n_blocks = (ARRAY_LENGTH + BLOCK_SIZE - 1) / BLOCK_SIZE; 63 | 64 | // Add arrays on device 65 | add_kernel<<<n_blocks, BLOCK_SIZE>>>(device_array_dest, ARRAY_LENGTH, 66 | device_array_a, nullptr); 67 | 68 | // Meanwhile, add arrays on the host, for comparison 69 | add_loop(host_array_dest, ARRAY_LENGTH, host_array_a, host_array_b); 70 | 71 | // Copy result back to host and compare 72 | float host_array_tmp[ARRAY_LENGTH]; 73 | cudaCheckError(cudaMemcpy(host_array_tmp, device_array_dest, 74 | sizeof(host_array_tmp), cudaMemcpyDeviceToHost)); 75 | 76 | for (int i = 0; i < ARRAY_LENGTH; i++) { 77 | assert(host_array_tmp[i] == host_array_dest[i]); 78 | printf("%g + %g = %g\n", host_array_a[i], host_array_b[i], 79 | host_array_tmp[i]); 80 | } 81 | 82 | return 0; 83 | } 84 | -------------------------------------------------------------------------------- /Code files/Section 2/2.5/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = error-handling 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum
bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 2/2.5/error-handling: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 2/2.5/error-handling -------------------------------------------------------------------------------- /Code files/Section 2/2.5/error-handling.cu: -------------------------------------------------------------------------------- 1 | // Demonstration of basic CUDA error handling. 2 | // Example for video 2.5. 3 | 4 | #include <stdio.h> 5 | 6 | // Standard CUDA API functions 7 | #include <cuda_runtime_api.h> 8 | 9 | // Error checking macro 10 | #define cudaCheckError(code) \ 11 | { \ 12 | if ((code) != cudaSuccess) { \ 13 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \ 14 | cudaGetErrorString(code)); \ 15 | } \ 16 | } 17 | 18 | __global__ void bad() 19 | { 20 | char *x = nullptr; 21 | *x = 1; 22 | } 23 | 24 | __global__ void good() {} 25 | 26 | int main() 27 | { 28 | int *foo = nullptr; 29 | size_t size = 1lu << 33; 30 | cudaError_t status = cudaMalloc(&foo, size); 31 | const char *message = cudaGetErrorString(status); 32 | 33 | status = cudaGetLastError(); 34 | 35 | status = cudaMalloc(&foo, 16); 36 | message = cudaGetErrorString(status); 37 | 38 | bad<<<1, 1>>>(); 39 | status = cudaDeviceSynchronize(); 40 | message = cudaGetErrorString(status); 41 | 42 | good<<<1, 16>>>(); 43 | status = cudaDeviceSynchronize(); 44 | message = cudaGetErrorString(status); 45 | 46 | cudaCheckError(cudaMalloc(&foo, 16)) 47 | 48 | return 0; 49 | } 50 | -------------------------------------------------------------------------------- /Code files/Section 3/3.1/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = monochrome 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 3/3.1/monochrome: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.1/monochrome -------------------------------------------------------------------------------- /Code files/Section 3/3.1/monochrome.cu: -------------------------------------------------------------------------------- 1 | // Convert a color image to monochrome. 2 | // Example for videos 3.1 and 3.2.
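// Added note: the channel weights in the kernel below (0.3125, 0.5, 0.1875,
// i.e. 5/16, 1/2, 3/16) are power-of-two-friendly approximations of standard
// luma coefficients (Rec. 601 uses roughly 0.299, 0.587, 0.114).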
3 | 4 | #include <assert.h> 5 | #include <stdio.h> 6 | 7 | // Standard CUDA API functions 8 | #include <cuda_runtime_api.h> 9 | 10 | #include "../utils.h" 11 | 12 | __global__ void monochrome(const pixel *source, pixel *dest, int size) 13 | { 14 | int index = blockIdx.x * blockDim.x + threadIdx.x; 15 | if (index >= size) return; 16 | 17 | float value(source[index].red * 0.3125f + source[index].green * 0.5f + 18 | source[index].blue * .1875f); 19 | 20 | dest[index].red = value; 21 | dest[index].green = value; 22 | dest[index].blue = value; 23 | dest[index].alpha = source[index].alpha; 24 | } 25 | 26 | int main(int argc, char **argv) 27 | { 28 | test_params params = set_up_test(argc, argv); 29 | 30 | int pixel_count = params.width * params.height; 31 | int BLOCK_SIZE = 128; 32 | int n_blocks = (pixel_count + BLOCK_SIZE - 1) / BLOCK_SIZE; 33 | 34 | { 35 | KernelTimer t; 36 | monochrome<<<n_blocks, BLOCK_SIZE>>>(params.input_image, 37 | params.output_image, pixel_count); 38 | } 39 | 40 | finish_test(params); 41 | 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /Code files/Section 3/3.2/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = monochrome 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 3/3.2/monochrome: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.2/monochrome -------------------------------------------------------------------------------- /Code files/Section 3/3.2/monochrome.cu: -------------------------------------------------------------------------------- 1 | // Convert a color image to monochrome. 2 | // Example for videos 3.1 and 3.2.
3 | 4 | #include <assert.h> 5 | #include <stdio.h> 6 | 7 | // Standard CUDA API functions 8 | #include <cuda_runtime_api.h> 9 | 10 | #include "../utils.h" 11 | 12 | __global__ void monochrome(const pixel *source, pixel *dest, int size) 13 | { 14 | int index = blockIdx.x * blockDim.x + threadIdx.x; 15 | if (index >= size) return; 16 | 17 | float value(source[index].red * 0.3125f + source[index].green * 0.5f + 18 | source[index].blue * .1875f); 19 | 20 | dest[index].red = value; 21 | dest[index].green = value; 22 | dest[index].blue = value; 23 | dest[index].alpha = source[index].alpha; 24 | } 25 | 26 | int main(int argc, char **argv) 27 | { 28 | test_params params = set_up_test(argc, argv); 29 | 30 | int pixel_count = params.width * params.height; 31 | int BLOCK_SIZE = 128; 32 | int n_blocks = (pixel_count + BLOCK_SIZE - 1) / BLOCK_SIZE; 33 | 34 | { 35 | KernelTimer t; 36 | monochrome<<<n_blocks, BLOCK_SIZE>>>(params.input_image, 37 | params.output_image, pixel_count); 38 | } 39 | 40 | finish_test(params); 41 | 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /Code files/Section 3/3.3/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = spotlights spotlights-2d 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 3/3.3/spotlights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.3/spotlights -------------------------------------------------------------------------------- /Code files/Section 3/3.3/spotlights-2d: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.3/spotlights-2d -------------------------------------------------------------------------------- /Code files/Section 3/3.3/spotlights-2d.cu: -------------------------------------------------------------------------------- 1 | // Render several spotlights on an image. 2 | // Uses a two-dimensional memory layout to ensure coalesced access. 3 | // Example for video 3.3. 4 | 5 | #include <iostream> 6 | #include <math.h> 7 | #include <stdio.h> 8 | 9 | // Standard CUDA API functions 10 | #include <cuda_runtime_api.h> 11 | 12 | #include "../utils.h" 13 | 14 | struct light { 15 | float x; 16 | float y; 17 | float radius; 18 | float brightness; 19 | }; 20 | 21 | __device__ float clamp(float value) { return value > 1.0f ?
1.0f : value; } 22 | 23 | __device__ float light_brightness(float x, float y, unsigned int width, 24 | unsigned int height, const light &light) 25 | { 26 | float norm_x = x / width; 27 | float norm_y = y / height; 28 | 29 | float dx = norm_x - light.x; 30 | float dy = norm_y - light.y; 31 | float distance_squared = dx * dx + dy * dy; 32 | if (distance_squared > light.radius * light.radius) { 33 | return 0; 34 | } 35 | float distance = sqrtf(distance_squared); 36 | 37 | float scaled_distance = distance / light.radius; 38 | if (scaled_distance > 0.8) { 39 | return (1.0f - (scaled_distance - 0.8f) * 5.0f) * light.brightness; 40 | } else { 41 | return light.brightness; 42 | } 43 | } 44 | 45 | template <class T> 46 | __device__ T *pointer2d(T *base_pointer, int x, int y, size_t pitch) 47 | { 48 | return (T *)((char *)base_pointer + y * pitch) + x; 49 | } 50 | 51 | __global__ void spotlights(const image source, image dest, unsigned int width, 52 | unsigned int height, size_t pitch, float ambient, 53 | light light1, light light2, light light3, 54 | light light4) 55 | { 56 | int x = blockIdx.x * blockDim.x + threadIdx.x; 57 | int y = blockIdx.y * blockDim.y + threadIdx.y; 58 | if (x >= width || y >= height) return; 59 | 60 | float brightness = ambient + light_brightness(x, y, width, height, light1) + 61 | light_brightness(x, y, width, height, light2) + 62 | light_brightness(x, y, width, height, light3) + 63 | light_brightness(x, y, width, height, light4); 64 | 65 | *pointer2d(dest.red, x, y, pitch) = 66 | clamp(*pointer2d(source.red, x, y, pitch) * brightness); 67 | *pointer2d(dest.green, x, y, pitch) = 68 | clamp(*pointer2d(source.green, x, y, pitch) * brightness); 69 | *pointer2d(dest.blue, x, y, pitch) = 70 | clamp(*pointer2d(source.blue, x, y, pitch) * brightness); 71 | } 72 | 73 | int main(int argc, char **argv) 74 | { 75 | auto params = set_up_test_planar(argc, argv); 76 | 77 | light light1 = {0.2, 0.1, 0.1, 4.0}; 78 | light light2 = {0.25, 0.2, 0.075, 2.0}; 79 | light light3 = {0.5, 0.5, 0.3, 0.3}; 80 | light light4 = {0.7, 0.65, 0.15, 0.8}; 81 | 82 | image input2d, output2d; 83 | size_t byte_width = params.width * sizeof(float); 84 | size_t pitch; 85 | 86 | // Allocate 2D aligned image 87 | cudaCheckError( 88 | cudaMallocPitch(&input2d.red, &pitch, byte_width, params.height)); 89 | // Copy from 1D to 2D image 90 | cudaCheckError(cudaMemcpy2D(input2d.red, pitch, params.input_image.red, 91 | byte_width, byte_width, params.height, 92 | cudaMemcpyDeviceToDevice)); 93 | std::cout << "Width: " << byte_width << " bytes.
Pitch: " << pitch 94 | << " bytes\n"; 95 | 96 | // Allocate and copy other channels 97 | // Note: pitch will be the same for all of these allocations 98 | cudaCheckError( 99 | cudaMallocPitch(&input2d.green, &pitch, byte_width, params.height)); 100 | cudaCheckError( 101 | cudaMallocPitch(&input2d.blue, &pitch, byte_width, params.height)); 102 | cudaCheckError( 103 | cudaMallocPitch(&output2d.red, &pitch, byte_width, params.height)); 104 | cudaCheckError( 105 | cudaMallocPitch(&output2d.green, &pitch, byte_width, params.height)); 106 | cudaCheckError( 107 | cudaMallocPitch(&output2d.blue, &pitch, byte_width, params.height)); 108 | cudaCheckError(cudaMemcpy2D(input2d.green, pitch, params.input_image.green, 109 | byte_width, byte_width, params.height, 110 | cudaMemcpyDeviceToDevice)); 111 | cudaCheckError(cudaMemcpy2D(input2d.blue, pitch, params.input_image.blue, 112 | byte_width, byte_width, params.height, 113 | cudaMemcpyDeviceToDevice)); 114 | 115 | dim3 BLOCK_DIM(32, 16); 116 | dim3 grid_dim((params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x, 117 | (params.height + BLOCK_DIM.y - 1) / BLOCK_DIM.y); 118 | 119 | { 120 | KernelTimer t; 121 | spotlights<<>>(input2d, output2d, params.width, 122 | params.height, pitch, 0.3, light1, 123 | light2, light3, light4); 124 | } 125 | 126 | cudaCheckError(cudaMemcpy2D(params.output_image.red, byte_width, output2d.red, 127 | pitch, byte_width, params.height, 128 | cudaMemcpyDeviceToDevice)); 129 | cudaCheckError(cudaMemcpy2D(params.output_image.green, byte_width, 130 | output2d.green, pitch, byte_width, params.height, 131 | cudaMemcpyDeviceToDevice)); 132 | cudaCheckError(cudaMemcpy2D(params.output_image.blue, byte_width, 133 | output2d.blue, pitch, byte_width, params.height, 134 | cudaMemcpyDeviceToDevice)); 135 | 136 | free_image(input2d); 137 | free_image(output2d); 138 | 139 | finish_test_planar(params); 140 | 141 | return 0; 142 | } 143 | -------------------------------------------------------------------------------- /Code files/Section 3/3.3/spotlights.cu: -------------------------------------------------------------------------------- 1 | // Render several spotlights on an image. 2 | // Uses a two-dimensional grid with a one-dimensional memory layout, so 3 | // performance is not optimal. 4 | // Example for video 3.3. 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | // Standard CUDA API functions 11 | #include 12 | 13 | #include "../utils.h" 14 | 15 | struct light { 16 | float x; 17 | float y; 18 | float radius; 19 | float brightness; 20 | }; 21 | 22 | __device__ float clamp(float value) { return value > 1.0f ? 
1.0f : value; } 23 | 24 | __device__ float light_brightness(float x, float y, unsigned int width, 25 | unsigned int height, const light &light) 26 | { 27 | float norm_x = x / width; 28 | float norm_y = y / height; 29 | 30 | float dx = norm_x - light.x; 31 | float dy = norm_y - light.y; 32 | float distance_squared = dx * dx + dy * dy; 33 | if (distance_squared > light.radius * light.radius) { 34 | return 0; 35 | } 36 | float distance = sqrtf(distance_squared); 37 | 38 | float scaled_distance = distance / light.radius; 39 | if (scaled_distance > 0.8) { 40 | return (1.0f - (scaled_distance - 0.8f) * 5.0f) * light.brightness; 41 | } else { 42 | return light.brightness; 43 | } 44 | } 45 | 46 | __global__ void spotlights(const image source, image dest, unsigned int width, 47 | unsigned int height, float ambient, light light1, 48 | light light2, light light3, light light4) 49 | { 50 | int x = blockIdx.x * blockDim.x + threadIdx.x; 51 | int y = blockIdx.y * blockDim.y + threadIdx.y; 52 | if (x >= width || y >= height) return; 53 | 54 | int index = y * width + x; 55 | 56 | float brightness = ambient + light_brightness(x, y, width, height, light1) + 57 | light_brightness(x, y, width, height, light2) + 58 | light_brightness(x, y, width, height, light3) + 59 | light_brightness(x, y, width, height, light4); 60 | 61 | dest.red[index] = clamp(source.red[index] * brightness); 62 | dest.green[index] = clamp(source.green[index] * brightness); 63 | dest.blue[index] = clamp(source.blue[index] * brightness); 64 | } 65 | 66 | int main(int argc, char **argv) 67 | { 68 | auto params = set_up_test_planar(argc, argv); 69 | 70 | light light1 = {0.2, 0.1, 0.1, 4.0}; 71 | light light2 = {0.25, 0.2, 0.075, 2.0}; 72 | light light3 = {0.5, 0.5, 0.3, 0.3}; 73 | light light4 = {0.7, 0.65, 0.15, 0.8}; 74 | 75 | dim3 BLOCK_DIM(32, 16); 76 | dim3 grid_dim((params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x, 77 | (params.height + BLOCK_DIM.y - 1) / BLOCK_DIM.y); 78 | 79 | { 80 | KernelTimer t; 81 | spotlights<<<grid_dim, BLOCK_DIM>>>(params.input_image, params.output_image, 82 | params.width, params.height, 0.3, 83 | light1, light2, light3, light4); 84 | } 85 | 86 | finish_test_planar(params); 87 | 88 | return 0; 89 | } 90 | -------------------------------------------------------------------------------- /Code files/Section 3/3.4/Makefile: -------------------------------------------------------------------------------- 1 | CUDAFLAGS ?= -g 2 | 3 | ALL = manylights2 manylights-const warp warp-texture 4 | 5 | all: $(ALL) 6 | 7 | ../utils.o: ../utils.cu ../utils.h 8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $< 9 | 10 | %: %.cu ../utils.o 11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^ 12 | 13 | # Dynamic parallelism requires separate compilation of kernels and 14 | # host code 15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh 16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu 17 | 18 | bst-sum.o: ../utils.h bst-sum.cu 19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu 20 | 21 | bst-sum: bst-sum.o bst-sum-kernels.o 22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o 23 | 24 | clean: 25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL) 26 | -------------------------------------------------------------------------------- /Code files/Section 3/3.4/manylights-const: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.4/manylights-const -------------------------------------------------------------------------------- /Code files/Section 3/3.4/manylights-const.cu: -------------------------------------------------------------------------------- 1 | // Render many spotlights on an image, passing the light definitions in 2 | // constant memory. 3 | // Example for video 3.4. 4 | 5 | #include <iostream> 6 | #include <math.h> 7 | #include <stdio.h> 8 | 9 | // Standard CUDA API functions 10 | #include <cuda_runtime_api.h> 11 | 12 | #include "../utils.h" 13 | 14 | struct light { 15 | float x; 16 | float y; 17 | float radius; 18 | float brightness; 19 | }; 20 | 21 | struct lots_of_lights { 22 | unsigned int count; 23 | light lights[1024]; 24 | }; 25 | 26 | __constant__ lots_of_lights dev_lights; 27 | 28 | __device__ float clamp(float value) { return value > 1.0f ? 1.0f : value; } 29 | 30 | __device__ float light_brightness(float x, float y, unsigned int width, 31 | unsigned int height, const light &light) 32 | { 33 | float norm_x = x / width; 34 | float norm_y = y / height; 35 | 36 | float dx = norm_x - light.x; 37 | float dy = norm_y - light.y; 38 | float distance_squared = dx * dx + dy * dy; 39 | if (distance_squared > light.radius * light.radius) { 40 | return 0; 41 | } 42 | float distance = sqrtf(distance_squared); 43 | 44 | float scaled_distance = distance / light.radius; 45 | if (scaled_distance > 0.8) { 46 | return (1.0f - (scaled_distance - 0.8f) * 5.0f) * light.brightness; 47 | } else { 48 | return light.brightness; 49 | } 50 | } 51 | 52 | template <class T> 53 | __device__ T *pointer2d(T *base_pointer, int x, int y, size_t pitch) 54 | { 55 | return (T *)((char *)base_pointer + y * pitch) + x; 56 | } 57 | 58 | __global__ void spotlights(const image source, image dest, unsigned int width, 59 | unsigned int height, size_t pitch, float ambient) 60 | { 61 | int x = blockIdx.x * blockDim.x + threadIdx.x; 62 | int y = blockIdx.y * blockDim.y + threadIdx.y; 63 | if (x >= width || y >= height) return; 64 | 65 | float brightness = ambient; 66 | for (int i = 0; i < dev_lights.count; i++) { 67 | brightness += light_brightness(x, y, width, height, dev_lights.lights[i]); 68 | } 69 | 70 | *pointer2d(dest.red, x, y, pitch) = 71 | clamp(*pointer2d(source.red, x, y, pitch) * brightness); 72 | *pointer2d(dest.green, x, y, pitch) = 73 | clamp(*pointer2d(source.green, x, y, pitch) * brightness); 74 | *pointer2d(dest.blue, x, y, pitch) = 75 | clamp(*pointer2d(source.blue, x, y, pitch) * brightness); 76 | } 77 | 78 | int main(int argc, char **argv) 79 | { 80 | auto params = set_up_test_planar(argc, argv); 81 | 82 | image input2d, output2d; 83 | size_t byte_width = params.width * sizeof(float); 84 | size_t pitch; 85 | 86 | // Allocate 2D aligned image 87 | cudaCheckError( 88 | cudaMallocPitch(&input2d.red, &pitch, byte_width, params.height)); 89 | // Copy from 1D to 2D image 90 | cudaCheckError(cudaMemcpy2D(input2d.red, pitch, params.input_image.red, 91 | byte_width, byte_width, params.height, 92 | cudaMemcpyDeviceToDevice)); 93 | 94 | // Allocate and copy other channels 95 | // Note: pitch will be the same for all of these allocations 96 | cudaCheckError( 97 | cudaMallocPitch(&input2d.green, &pitch, byte_width, params.height)); 98 | cudaCheckError( 99 | cudaMallocPitch(&input2d.blue, &pitch, byte_width, params.height)); 100 | cudaCheckError( 101 | cudaMallocPitch(&output2d.red, &pitch, byte_width, params.height)); 102 | cudaCheckError( 103 |
cudaMallocPitch(&output2d.green, &pitch, byte_width, params.height)); 104 | cudaCheckError( 105 | cudaMallocPitch(&output2d.blue, &pitch, byte_width, params.height)); 106 | cudaCheckError(cudaMemcpy2D(input2d.green, pitch, params.input_image.green, 107 | byte_width, byte_width, params.height, 108 | cudaMemcpyDeviceToDevice)); 109 | cudaCheckError(cudaMemcpy2D(input2d.blue, pitch, params.input_image.blue, 110 | byte_width, byte_width, params.height, 111 | cudaMemcpyDeviceToDevice)); 112 | 113 | lots_of_lights lights = {1024}; 114 | float spacing = 1.0f / 32.0f; 115 | for (int x = 0; x < 32; x++) { 116 | for (int y = 0; y < 32; y++) { 117 | int index = y * 32 + x; 118 | lights.lights[index] = {x * spacing, y * spacing, 0.05, 0.2}; 119 | } 120 | } 121 | 122 | cudaCheckError( 123 | cudaMemcpyToSymbol(dev_lights, &lights, sizeof(lots_of_lights))); 124 | 125 | dim3 BLOCK_DIM(32, 16); 126 | dim3 grid_dim((params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x, 127 | (params.height + BLOCK_DIM.y - 1) / BLOCK_DIM.y); 128 | 129 | { 130 | KernelTimer t; 131 | spotlights<<<grid_dim, BLOCK_DIM>>>(input2d, output2d, params.width, 132 | params.height, pitch, 0.0); 133 | } 134 | 135 | cudaCheckError(cudaMemcpy2D(params.output_image.red, byte_width, output2d.red, 136 | pitch, byte_width, params.height, 137 | cudaMemcpyDeviceToDevice)); 138 | cudaCheckError(cudaMemcpy2D(params.output_image.green, byte_width, 139 | output2d.green, pitch, byte_width, params.height, 140 | cudaMemcpyDeviceToDevice)); 141 | cudaCheckError(cudaMemcpy2D(params.output_image.blue, byte_width, 142 | output2d.blue, pitch, byte_width, params.height, 143 | cudaMemcpyDeviceToDevice)); 144 | 145 | free_image(input2d); 146 | free_image(output2d); 147 | 148 | finish_test_planar(params); 149 | 150 | return 0; 151 | } 152 | -------------------------------------------------------------------------------- /Code files/Section 3/3.4/manylights1.cu: -------------------------------------------------------------------------------- 1 | // Render many spotlights on an image. 2 | // This will not compile as it exceeds the maximum parameter space for 3 | // launching a kernel. 4 | // Example for video 3.4. 5 | 6 | #include <iostream> 7 | #include <math.h> 8 | #include <stdio.h> 9 | 10 | // Standard CUDA API functions 11 | #include <cuda_runtime_api.h> 12 | 13 | #include "../utils.h" 14 | 15 | struct light { 16 | float x; 17 | float y; 18 | float radius; 19 | float brightness; 20 | }; 21 | 22 | struct lots_of_lights { 23 | unsigned int count; 24 | light lights[1024]; 25 | }; 26 | 27 | __device__ float clamp(float value) { return value > 1.0f ?
1.0f : value; } 28 | 29 | __device__ float light_brightness(float x, float y, unsigned int width, 30 | unsigned int height, const light &light) 31 | { 32 | float norm_x = x / width; 33 | float norm_y = y / height; 34 | 35 | float dx = norm_x - light.x; 36 | float dy = norm_y - light.y; 37 | float distance_squared = dx * dx + dy * dy; 38 | if (distance_squared > light.radius * light.radius) { 39 | return 0; 40 | } 41 | float distance = sqrtf(distance_squared); 42 | 43 | float scaled_distance = distance / light.radius; 44 | if (scaled_distance > 0.8) { 45 | return (1.0f - (scaled_distance - 0.8f) * 5.0f) * light.brightness; 46 | } else { 47 | return light.brightness; 48 | } 49 | } 50 | 51 | template <class T> 52 | __device__ T *pointer2d(T *base_pointer, int x, int y, size_t pitch) 53 | { 54 | return (T *)((char *)base_pointer + y * pitch) + x; 55 | } 56 | 57 | __global__ void spotlights(const image source, image dest, unsigned int width, 58 | unsigned int height, size_t pitch, float ambient, 59 | lots_of_lights lights) 60 | { 61 | int x = blockIdx.x * blockDim.x + threadIdx.x; 62 | int y = blockIdx.y * blockDim.y + threadIdx.y; 63 | if (x >= width || y >= height) return; 64 | 65 | float brightness = ambient; 66 | for (int i = 0; i < lights.count; i++) { 67 | brightness += light_brightness(x, y, width, height, lights.lights[i]); 68 | } 69 | 70 | *pointer2d(dest.red, x, y, pitch) = 71 | clamp(*pointer2d(source.red, x, y, pitch) * brightness); 72 | *pointer2d(dest.green, x, y, pitch) = 73 | clamp(*pointer2d(source.green, x, y, pitch) * brightness); 74 | *pointer2d(dest.blue, x, y, pitch) = 75 | clamp(*pointer2d(source.blue, x, y, pitch) * brightness); 76 | } 77 | 78 | int main(int argc, char **argv) 79 | { 80 | auto params = set_up_test_planar(argc, argv); 81 | 82 | image input2d, output2d; 83 | size_t byte_width = params.width * sizeof(float); 84 | size_t pitch; 85 | 86 | // Allocate 2D aligned image 87 | cudaCheckError( 88 | cudaMallocPitch(&input2d.red, &pitch, byte_width, params.height)); 89 | // Copy from 1D to 2D image 90 | cudaCheckError(cudaMemcpy2D(input2d.red, pitch, params.input_image.red, 91 | byte_width, byte_width, params.height, 92 | cudaMemcpyDeviceToDevice)); 93 | 94 | // Allocate and copy other channels 95 | // Note: pitch will be the same for all of these allocations 96 | cudaCheckError( 97 | cudaMallocPitch(&input2d.green, &pitch, byte_width, params.height)); 98 | cudaCheckError( 99 | cudaMallocPitch(&input2d.blue, &pitch, byte_width, params.height)); 100 | cudaCheckError( 101 | cudaMallocPitch(&output2d.red, &pitch, byte_width, params.height)); 102 | cudaCheckError( 103 | cudaMallocPitch(&output2d.green, &pitch, byte_width, params.height)); 104 | cudaCheckError( 105 | cudaMallocPitch(&output2d.blue, &pitch, byte_width, params.height)); 106 | cudaCheckError(cudaMemcpy2D(input2d.green, pitch, params.input_image.green, 107 | byte_width, byte_width, params.height, 108 | cudaMemcpyDeviceToDevice)); 109 | cudaCheckError(cudaMemcpy2D(input2d.blue, pitch, params.input_image.blue, 110 | byte_width, byte_width, params.height, 111 | cudaMemcpyDeviceToDevice)); 112 | 113 | lots_of_lights lights = {1024}; 114 | float spacing = 1.0f / 32.0f; 115 | for (int x = 0; x < 32; x++) { 116 | for (int y = 0; y < 32; y++) { 117 | int index = y * 32 + x; 118 | lights.lights[index] = {x * spacing, y * spacing, 0.1, 0.5}; 119 | } 120 | } 121 | 122 | dim3 BLOCK_DIM(32, 16); 123 | dim3 grid_dim((params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x, 124 | (params.height + BLOCK_DIM.y - 1) / BLOCK_DIM.y); 125 | 126 | {
127 | KernelTimer t; 128 | spotlights<<<grid_dim, BLOCK_DIM>>>(input2d, output2d, params.width, 129 | params.height, pitch, 0.3, lights); 130 | } 131 | 132 | cudaCheckError(cudaMemcpy2D(params.output_image.red, byte_width, output2d.red, 133 | pitch, byte_width, params.height, 134 | cudaMemcpyDeviceToDevice)); 135 | cudaCheckError(cudaMemcpy2D(params.output_image.green, byte_width, 136 | output2d.green, pitch, byte_width, params.height, 137 | cudaMemcpyDeviceToDevice)); 138 | cudaCheckError(cudaMemcpy2D(params.output_image.blue, byte_width, 139 | output2d.blue, pitch, byte_width, params.height, 140 | cudaMemcpyDeviceToDevice)); 141 | 142 | free_image(input2d); 143 | free_image(output2d); 144 | 145 | finish_test_planar(params); 146 | 147 | return 0; 148 | } 149 | -------------------------------------------------------------------------------- /Code files/Section 3/3.4/manylights2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.4/manylights2 -------------------------------------------------------------------------------- /Code files/Section 3/3.4/manylights2.cu: -------------------------------------------------------------------------------- 1 | // Render many spotlights on an image, passing the light definitions in 2 | // global memory. This works, but is inefficient. 3 | // Example for video 3.4. 4 | 5 | #include <iostream> 6 | #include <math.h> 7 | #include <stdio.h> 8 | 9 | // Standard CUDA API functions 10 | #include <cuda_runtime_api.h> 11 | 12 | #include "../utils.h" 13 | 14 | struct light { 15 | float x; 16 | float y; 17 | float radius; 18 | float brightness; 19 | }; 20 | 21 | struct lots_of_lights { 22 | unsigned int count; 23 | light lights[1024]; 24 | }; 25 | 26 | __device__ float clamp(float value) { return value > 1.0f ?
1.0f : value; } 27 | 28 | __device__ float light_brightness(float x, float y, unsigned int width, 29 | unsigned int height, const light &light) 30 | { 31 | float norm_x = x / width; 32 | float norm_y = y / height; 33 | 34 | float dx = norm_x - light.x; 35 | float dy = norm_y - light.y; 36 | float distance_squared = dx * dx + dy * dy; 37 | if (distance_squared > light.radius * light.radius) { 38 | return 0; 39 | } 40 | float distance = sqrtf(distance_squared); 41 | 42 | float scaled_distance = distance / light.radius; 43 | if (scaled_distance > 0.8) { 44 | return (1.0f - (scaled_distance - 0.8f) * 5.0f) * light.brightness; 45 | } else { 46 | return light.brightness; 47 | } 48 | } 49 | 50 | template <class T> 51 | __device__ T *pointer2d(T *base_pointer, int x, int y, size_t pitch) 52 | { 53 | return (T *)((char *)base_pointer + y * pitch) + x; 54 | } 55 | 56 | __global__ void spotlights(const image source, image dest, unsigned int width, 57 | unsigned int height, size_t pitch, float ambient, 58 | lots_of_lights *lights) 59 | { 60 | int x = blockIdx.x * blockDim.x + threadIdx.x; 61 | int y = blockIdx.y * blockDim.y + threadIdx.y; 62 | if (x >= width || y >= height) return; 63 | 64 | float brightness = ambient; 65 | for (int i = 0; i < lights->count; i++) { 66 | brightness += light_brightness(x, y, width, height, lights->lights[i]); 67 | } 68 | 69 | *pointer2d(dest.red, x, y, pitch) = 70 | clamp(*pointer2d(source.red, x, y, pitch) * brightness); 71 | *pointer2d(dest.green, x, y, pitch) = 72 | clamp(*pointer2d(source.green, x, y, pitch) * brightness); 73 | *pointer2d(dest.blue, x, y, pitch) = 74 | clamp(*pointer2d(source.blue, x, y, pitch) * brightness); 75 | } 76 | 77 | int main(int argc, char **argv) 78 | { 79 | auto params = set_up_test_planar(argc, argv); 80 | 81 | image input2d, output2d; 82 | size_t byte_width = params.width * sizeof(float); 83 | size_t pitch; 84 | 85 | // Allocate 2D aligned image 86 | cudaCheckError( 87 | cudaMallocPitch(&input2d.red, &pitch, byte_width, params.height)); 88 | // Copy from 1D to 2D image 89 | cudaCheckError(cudaMemcpy2D(input2d.red, pitch, params.input_image.red, 90 | byte_width, byte_width, params.height, 91 | cudaMemcpyDeviceToDevice)); 92 | 93 | // Allocate and copy other channels 94 | // Note: pitch will be the same for all of these allocations 95 | cudaCheckError( 96 | cudaMallocPitch(&input2d.green, &pitch, byte_width, params.height)); 97 | cudaCheckError( 98 | cudaMallocPitch(&input2d.blue, &pitch, byte_width, params.height)); 99 | cudaCheckError( 100 | cudaMallocPitch(&output2d.red, &pitch, byte_width, params.height)); 101 | cudaCheckError( 102 | cudaMallocPitch(&output2d.green, &pitch, byte_width, params.height)); 103 | cudaCheckError( 104 | cudaMallocPitch(&output2d.blue, &pitch, byte_width, params.height)); 105 | cudaCheckError(cudaMemcpy2D(input2d.green, pitch, params.input_image.green, 106 | byte_width, byte_width, params.height, 107 | cudaMemcpyDeviceToDevice)); 108 | cudaCheckError(cudaMemcpy2D(input2d.blue, pitch, params.input_image.blue, 109 | byte_width, byte_width, params.height, 110 | cudaMemcpyDeviceToDevice)); 111 | 112 | lots_of_lights lights = {1024}; 113 | float spacing = 1.0f / 32.0f; 114 | for (int x = 0; x < 32; x++) { 115 | for (int y = 0; y < 32; y++) { 116 | int index = y * 32 + x; 117 | lights.lights[index] = {x * spacing, y * spacing, 0.05, 0.2}; 118 | } 119 | } 120 | 121 | lots_of_lights *dev_lights; 122 | cudaCheckError(cudaMalloc(&dev_lights, sizeof(lots_of_lights))); 123 | cudaCheckError(cudaMemcpy(dev_lights, &lights,
sizeof(lots_of_lights), 124 | cudaMemcpyHostToDevice)); 125 | 126 | dim3 BLOCK_DIM(32, 16); 127 | dim3 grid_dim((params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x, 128 | (params.height + BLOCK_DIM.y - 1) / BLOCK_DIM.y); 129 | 130 | { 131 | KernelTimer t; 132 | spotlights<<<grid_dim, BLOCK_DIM>>>(input2d, output2d, params.width, 133 | params.height, pitch, 0.0, dev_lights); 134 | } 135 | 136 | cudaCheckError(cudaFree(dev_lights)); 137 | 138 | cudaCheckError(cudaMemcpy2D(params.output_image.red, byte_width, output2d.red, 139 | pitch, byte_width, params.height, 140 | cudaMemcpyDeviceToDevice)); 141 | cudaCheckError(cudaMemcpy2D(params.output_image.green, byte_width, 142 | output2d.green, pitch, byte_width, params.height, 143 | cudaMemcpyDeviceToDevice)); 144 | cudaCheckError(cudaMemcpy2D(params.output_image.blue, byte_width, 145 | output2d.blue, pitch, byte_width, params.height, 146 | cudaMemcpyDeviceToDevice)); 147 | 148 | free_image(input2d); 149 | free_image(output2d); 150 | 151 | finish_test_planar(params); 152 | 153 | return 0; 154 | } 155 | -------------------------------------------------------------------------------- /Code files/Section 3/3.4/warp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.4/warp -------------------------------------------------------------------------------- /Code files/Section 3/3.4/warp-texture: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.4/warp-texture -------------------------------------------------------------------------------- /Code files/Section 3/3.4/warp-texture.cu: -------------------------------------------------------------------------------- 1 | // Image warping using texture memory to improve performance. 2 | // Example for video 3.4.
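// Added note: compared with a plain-memory warp, the texture path in this
// file gets two things from the hardware: cudaFilterModeLinear performs the
// bilinear interpolation that otherwise needs an average_pixels()-style
// helper, and cudaAddressModeBorder returns 0 for out-of-range reads,
// replacing the manual bounds check in get_pixel().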

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

#include "../utils.h"

struct warp_params {
  float matrix[4];
  float inverse_matrix[4];
  float x_shift;
  float y_shift;
};

struct texture_image {
  cudaTextureObject_t red;
  cudaTextureObject_t green;
  cudaTextureObject_t blue;
};

template <typename T>
__device__ T *pointer2d(T *base_pointer, int x, int y, size_t pitch)
{
  return (T *)((char *)base_pointer + y * pitch) + x;
}

__device__ float get_pixel(const float *source, unsigned int width,
                           unsigned int height, size_t pitch, int x, int y)
{
  if (x < 0 || x >= width || y < 0 || y >= height) {
    return 0.0f;
  } else {
    return *pointer2d(source, x, y, pitch);
  }
}

__device__ float average_pixels(const float *source, unsigned int width,
                                unsigned int height, size_t pitch, int x0,
                                float weight_x, int y0, float weight_y)

{
  float p00 = get_pixel(source, width, height, pitch, x0, y0);
  float p01 = get_pixel(source, width, height, pitch, x0, y0 + 1);
  float p10 = get_pixel(source, width, height, pitch, x0 + 1, y0);
  float p11 = get_pixel(source, width, height, pitch, x0 + 1, y0 + 1);

  return (p00 * weight_x + p10 * (1.0f - weight_x)) * weight_y +
         (p01 * weight_x + p11 * (1.0f - weight_x)) * (1.0f - weight_y);
}

__global__ void warp_image(texture_image source, image dest, unsigned int width,
                           unsigned int height, size_t pitch,
                           warp_params params)
{
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= width || y >= height) return;

  float source_x = params.inverse_matrix[0] * x + params.inverse_matrix[1] * y -
                   params.x_shift;
  float source_y = params.inverse_matrix[2] * x + params.inverse_matrix[3] * y -
                   params.y_shift;

  *pointer2d(dest.red, x, y, pitch) =
      tex2D<float>(source.red, source_x, source_y);
  *pointer2d(dest.green, x, y, pitch) =
      tex2D<float>(source.green, source_x, source_y);
  *pointer2d(dest.blue, x, y, pitch) =
      tex2D<float>(source.blue, source_x, source_y);
}

static void mult_matrix(float mat[4], float a, float b, float c, float d)
{
  float dst_a = mat[0] * a + mat[1] * c;
  float dst_b = mat[0] * b + mat[1] * d;
  float dst_c = mat[2] * a + mat[3] * c;
  float dst_d = mat[2] * b + mat[3] * d;

  mat[0] = dst_a;
  mat[1] = dst_b;
  mat[2] = dst_c;
  mat[3] = dst_d;
}

static void invert_matrix(float inverse[4], const float mat[4])
{
  float determinant = mat[0] * mat[3] - mat[1] * mat[2];
  assert(determinant != 0);  // Shouldn't happen if scales are non-zero
  float inverse_determinant = 1.0f / determinant;

  inverse[0] = mat[3] * inverse_determinant;
  inverse[1] = -1 * mat[1] * inverse_determinant;
  inverse[2] = -1 * mat[2] * inverse_determinant;
  inverse[3] = mat[0] * inverse_determinant;
}

int main(int argc, char **argv)
{
  auto params = set_up_test_planar(argc, argv);
  image output2d;
  size_t byte_width = params.width * sizeof(float);
  size_t pitch;

  cudaCheckError(
      cudaMallocPitch(&output2d.red, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.green, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.blue, &pitch, byte_width, params.height));

  // Set up warp parameters
  const float SCALE = 0.65f;
  const float ROTATE_RADS = 0.3;
  warp_params warp;
  // Scaling matrix
  warp.matrix[0] = warp.matrix[3] = SCALE;
  warp.matrix[1] = warp.matrix[2] = 0;
  // Add rotation
  mult_matrix(warp.matrix, cosf(ROTATE_RADS), sinf(ROTATE_RADS),
              -1 * sinf(ROTATE_RADS), cosf(ROTATE_RADS));
  // Kernel will use inverse
  invert_matrix(warp.inverse_matrix, warp.matrix);
  // Add translation
  warp.x_shift = 0.1f * params.width;
  warp.y_shift = 0.3f * params.height;

  // Create arrays: opaque memory layouts optimized for texture
  // fetching. Copy our input images to them.
  cudaChannelFormatDesc channelDesc =
      cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
  cudaArray *red_array, *green_array, *blue_array;
  cudaCheckError(
      cudaMallocArray(&red_array, &channelDesc, params.width, params.height));
  cudaCheckError(cudaMemcpy2DToArray(
      red_array, 0, 0, params.input_image.red, params.width * sizeof(float),
      params.width * sizeof(float), params.height, cudaMemcpyDeviceToDevice));
  cudaCheckError(
      cudaMallocArray(&green_array, &channelDesc, params.width, params.height));
  cudaCheckError(cudaMemcpy2DToArray(
      green_array, 0, 0, params.input_image.green, params.width * sizeof(float),
      params.width * sizeof(float), params.height, cudaMemcpyDeviceToDevice));
  cudaCheckError(
      cudaMallocArray(&blue_array, &channelDesc, params.width, params.height));
  cudaCheckError(cudaMemcpy2DToArray(
      blue_array, 0, 0, params.input_image.blue, params.width * sizeof(float),
      params.width * sizeof(float), params.height, cudaMemcpyDeviceToDevice));

  // Create resource descriptions for each channel, for use in texture setup.
  struct cudaResourceDesc red_resource = {cudaResourceTypeArray};
  red_resource.res.array.array = red_array;
  struct cudaResourceDesc green_resource = {cudaResourceTypeArray};
  green_resource.res.array.array = green_array;
  struct cudaResourceDesc blue_resource = {cudaResourceTypeArray};
  blue_resource.res.array.array = blue_array;

  // Create texture description, specifying settings for texture fetches.
  struct cudaTextureDesc texture_desc = {};
  texture_desc.addressMode[0] = cudaAddressModeBorder;
  texture_desc.addressMode[1] = cudaAddressModeBorder;
  texture_desc.filterMode = cudaFilterModeLinear;
  texture_desc.readMode = cudaReadModeElementType;
  texture_desc.normalizedCoords = 0;

  // Create texture objects which combine the resources and the texture
  // descriptions.
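  // (Editorial note: a texture object is an opaque handle that pairs one
  // resource description with one texture description; the same cudaArray
  // could be referenced by several objects with different filter or address
  // modes. The creation calls below are not wrapped in cudaCheckError in
  // this example, so a setup failure would only surface later, at fetch
  // time.)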
  texture_image source_texture;
  cudaCreateTextureObject(&source_texture.red, &red_resource, &texture_desc,
                          NULL);
  cudaCreateTextureObject(&source_texture.green, &green_resource, &texture_desc,
                          NULL);
  cudaCreateTextureObject(&source_texture.blue, &blue_resource, &texture_desc,
                          NULL);

  dim3 BLOCK_DIM(32, 16);
  dim3 grid_dim((params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x,
                (params.height + BLOCK_DIM.y - 1) / BLOCK_DIM.y);

  {
    KernelTimer t;
    warp_image<<<grid_dim, BLOCK_DIM>>>(source_texture, output2d, params.width,
                                        params.height, pitch, warp);
  }

  cudaCheckError(cudaDestroyTextureObject(source_texture.red));
  cudaCheckError(cudaDestroyTextureObject(source_texture.green));
  cudaCheckError(cudaDestroyTextureObject(source_texture.blue));

  cudaCheckError(cudaFreeArray(red_array));
  cudaCheckError(cudaFreeArray(green_array));
  cudaCheckError(cudaFreeArray(blue_array));

  cudaCheckError(cudaMemcpy2D(params.output_image.red, byte_width, output2d.red,
                              pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(params.output_image.green, byte_width,
                              output2d.green, pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(params.output_image.blue, byte_width,
                              output2d.blue, pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));

  free_image(output2d);

  finish_test_planar(params);

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 3/3.4/warp.cu:
--------------------------------------------------------------------------------
// Image warping using global memory.
// Reads are uncoalesced so performance is not optimal.
// Example for video 3.4.
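//
// (Editorial note: the kernel below gathers each output pixel from a
// rotated and scaled source coordinate, so adjacent threads read source
// addresses that are neither consecutive nor aligned; those reads cannot
// coalesce into full cache-line transactions the way the row-major writes
// do, which is the performance gap the texture version targets.)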

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

#include "../utils.h"

struct warp_params {
  float matrix[4];
  float inverse_matrix[4];
  float x_shift;
  float y_shift;
};

template <typename T>
__device__ T *pointer2d(T *base_pointer, int x, int y, size_t pitch)
{
  return (T *)((char *)base_pointer + y * pitch) + x;
}

__device__ float get_pixel(const float *source, unsigned int width,
                           unsigned int height, size_t pitch, int x, int y)
{
  if (x < 0 || x >= width || y < 0 || y >= height) {
    return 0.0f;
  } else {
    return *pointer2d(source, x, y, pitch);
  }
}

__device__ float average_pixels(const float *source, unsigned int width,
                                unsigned int height, size_t pitch, int x0,
                                float weight_x, int y0, float weight_y)

{
  float p00 = get_pixel(source, width, height, pitch, x0, y0);
  float p01 = get_pixel(source, width, height, pitch, x0, y0 + 1);
  float p10 = get_pixel(source, width, height, pitch, x0 + 1, y0);
  float p11 = get_pixel(source, width, height, pitch, x0 + 1, y0 + 1);

  return (p00 * weight_x + p10 * (1.0f - weight_x)) * weight_y +
         (p01 * weight_x + p11 * (1.0f - weight_x)) * (1.0f - weight_y);
}

__global__ void warp_image(const image source, image dest, unsigned int width,
                           unsigned int height, size_t pitch,
                           warp_params params)
{
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= width || y >= height) return;

  float source_x = params.inverse_matrix[0] * x + params.inverse_matrix[1] * y -
                   params.x_shift;
  float source_y = params.inverse_matrix[2] * x + params.inverse_matrix[3] * y -
                   params.y_shift;

  float x0 = floorf(source_x);
  float weight_x = source_x - x0;
  int x0_int = static_cast<int>(x0);
  float y0 = floorf(source_y);
  float weight_y = source_y - y0;
  int y0_int = static_cast<int>(y0);

  *pointer2d(dest.red, x, y, pitch) = average_pixels(
      source.red, width, height, pitch, x0_int, weight_x, y0_int, weight_y);
  *pointer2d(dest.green, x, y, pitch) = average_pixels(
      source.green, width, height, pitch, x0_int, weight_x, y0_int, weight_y);
  *pointer2d(dest.blue, x, y, pitch) = average_pixels(
      source.blue, width, height, pitch, x0_int, weight_x, y0_int, weight_y);
}

static void mult_matrix(float mat[4], float a, float b, float c, float d)
{
  float dst_a = mat[0] * a + mat[1] * c;
  float dst_b = mat[0] * b + mat[1] * d;
  float dst_c = mat[2] * a + mat[3] * c;
  float dst_d = mat[2] * b + mat[3] * d;

  mat[0] = dst_a;
  mat[1] = dst_b;
  mat[2] = dst_c;
  mat[3] = dst_d;
}

static void invert_matrix(float inverse[4], const float mat[4])
{
  float determinant = mat[0] * mat[3] - mat[1] * mat[2];
  assert(determinant != 0);  // Shouldn't happen if scales are non-zero
  float inverse_determinant = 1.0f / determinant;

  inverse[0] = mat[3] * inverse_determinant;
  inverse[1] = -1 * mat[1] * inverse_determinant;
  inverse[2] = -1 * mat[2] * inverse_determinant;
  inverse[3] = mat[0] * inverse_determinant;
}

int main(int argc, char **argv)
{
  auto params = set_up_test_planar(argc, argv);
  image input2d, output2d;
  size_t byte_width = params.width * sizeof(float);
  size_t pitch;

  // Allocate 2D aligned image
  cudaCheckError(
      cudaMallocPitch(&input2d.red, &pitch, byte_width, params.height));
  // Copy from 1D to 2D image
  cudaCheckError(cudaMemcpy2D(input2d.red, pitch, params.input_image.red,
                              byte_width, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));

  // Allocate and copy other channels
  // Note: pitch will be the same for all of these allocations
  cudaCheckError(
      cudaMallocPitch(&input2d.green, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&input2d.blue, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.red, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.green, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.blue, &pitch, byte_width, params.height));
  cudaCheckError(cudaMemcpy2D(input2d.green, pitch, params.input_image.green,
                              byte_width, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(input2d.blue, pitch, params.input_image.blue,
                              byte_width, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));

  // Set up warp parameters
  const float SCALE = 0.65f;
  const float ROTATE_RADS = 0.3;
  warp_params warp;
  // Scaling matrix
  warp.matrix[0] = warp.matrix[3] = SCALE;
  warp.matrix[1] = warp.matrix[2] = 0;
  // Add rotation
  mult_matrix(warp.matrix, cosf(ROTATE_RADS), sinf(ROTATE_RADS),
              -1 * sinf(ROTATE_RADS), cosf(ROTATE_RADS));
  // Kernel will use inverse
  invert_matrix(warp.inverse_matrix, warp.matrix);
  // Add translation
  warp.x_shift = 0.1f * params.width;
  warp.y_shift = 0.3f * params.height;

  dim3 BLOCK_DIM(32, 16);
  dim3 grid_dim((params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x,
                (params.height + BLOCK_DIM.y - 1) / BLOCK_DIM.y);

  {
    KernelTimer t;
    warp_image<<<grid_dim, BLOCK_DIM>>>(input2d, output2d, params.width,
                                        params.height, pitch, warp);
  }

  cudaCheckError(cudaMemcpy2D(params.output_image.red, byte_width, output2d.red,
                              pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(params.output_image.green, byte_width,
                              output2d.green, pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(params.output_image.blue, byte_width,
                              output2d.blue, pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));

  free_image(input2d);
  free_image(output2d);

  finish_test_planar(params);

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 3/3.5/Makefile:
--------------------------------------------------------------------------------
CUDAFLAGS ?= -g

ALL = manylights-ilp

all: $(ALL)

../utils.o: ../utils.cu ../utils.h
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<

%: %.cu ../utils.o
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^

# Dynamic parallelism requires separate compilation of kernels and
# host code
bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu

bst-sum.o: ../utils.h bst-sum.cu
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu

bst-sum: bst-sum.o bst-sum-kernels.o
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o

clean:
	rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
--------------------------------------------------------------------------------
/Code files/Section 3/3.5/manylights-ilp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 3/3.5/manylights-ilp
--------------------------------------------------------------------------------
/Code files/Section 3/3.5/manylights-ilp.cu:
--------------------------------------------------------------------------------
// Render many spotlights on an image, computing multiple results per thread
// in order to increase instruction-level parallelism.
// Example for video 3.5.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

#include "../utils.h"

struct light {
  float x;
  float y;
  float radius;
  float brightness;
};

struct lots_of_lights {
  unsigned int count;
  light lights[1024];
};

__constant__ lots_of_lights dev_lights;

__device__ float clamp(float value) { return value > 1.0f ? 1.0f : value; }

__device__ float light_brightness(float x, float y, unsigned int width,
                                  unsigned int height, const light &light)
{
  float norm_x = x / width;
  float norm_y = y / height;

  float dx = norm_x - light.x;
  float dy = norm_y - light.y;
  float distance_squared = dx * dx + dy * dy;
  if (distance_squared > light.radius * light.radius) {
    return 0;
  }
  float distance = sqrtf(distance_squared);

  float scaled_distance = distance / light.radius;
  if (scaled_distance > 0.8) {
    return (1.0f - (scaled_distance - 0.8f) * 5.0f) * light.brightness;
  } else {
    return light.brightness;
  }
}

template <typename T>
__device__ T *pointer2d(T *base_pointer, int x, int y, size_t pitch)
{
  return (T *)((char *)base_pointer + y * pitch) + x;
}

const int OUTPUTS_PER_THREAD = 2;

__global__ void spotlights(const image source, image dest, unsigned int width,
                           unsigned int height, size_t pitch, float ambient)
{
  for (int i = 0; i < OUTPUTS_PER_THREAD; i++) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = OUTPUTS_PER_THREAD * blockIdx.y * blockDim.y + threadIdx.y +
            i * blockDim.y;
    if (x >= width || y >= height) return;

    float brightness = ambient;
    for (int i = 0; i < dev_lights.count; i++) {
      brightness += light_brightness(x, y, width, height, dev_lights.lights[i]);
    }

    *pointer2d(dest.red, x, y, pitch) =
        clamp(*pointer2d(source.red, x, y, pitch) * brightness);
    *pointer2d(dest.green, x, y, pitch) =
        clamp(*pointer2d(source.green, x, y, pitch) * brightness);
    *pointer2d(dest.blue, x, y, pitch) =
        clamp(*pointer2d(source.blue, x, y, pitch) * brightness);
  }
}

int main(int argc, char **argv)
{
  auto params = set_up_test_planar(argc, argv);

  image input2d, output2d;
  size_t byte_width = params.width * sizeof(float);
  size_t pitch;

  // Allocate 2D aligned image
  cudaCheckError(
      cudaMallocPitch(&input2d.red, &pitch, byte_width, params.height));
  // Copy from 1D to 2D image
  cudaCheckError(cudaMemcpy2D(input2d.red, pitch, params.input_image.red,
                              byte_width, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));

  // Allocate and copy other channels
  // Note: pitch will be the same for all of these allocations
  cudaCheckError(
      cudaMallocPitch(&input2d.green, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&input2d.blue, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.red, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.green, &pitch, byte_width, params.height));
  cudaCheckError(
      cudaMallocPitch(&output2d.blue, &pitch, byte_width, params.height));
  cudaCheckError(cudaMemcpy2D(input2d.green, pitch, params.input_image.green,
                              byte_width, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(input2d.blue, pitch, params.input_image.blue,
                              byte_width, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));

  lots_of_lights lights = {1024};
  float spacing = 1.0f / 32.0f;
  for (int x = 0; x < 32; x++) {
    for (int y = 0; y < 32; y++) {
      int index = y * 32 + x;
      lights.lights[index] = {x * spacing, y * spacing, 0.05, 0.2};
    }
  }

  cudaCheckError(
      cudaMemcpyToSymbol(dev_lights, &lights, sizeof(lots_of_lights)));

  dim3 BLOCK_DIM(32, 16);
  dim3 grid_dim(
      (params.width + BLOCK_DIM.x - 1) / BLOCK_DIM.x,
      (params.height + BLOCK_DIM.y - 1) / (BLOCK_DIM.y * OUTPUTS_PER_THREAD));

  {
    KernelTimer t;
    spotlights<<<grid_dim, BLOCK_DIM>>>(input2d, output2d, params.width,
                                        params.height, pitch, 0.0);
  }

  cudaCheckError(cudaMemcpy2D(params.output_image.red, byte_width, output2d.red,
                              pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(params.output_image.green, byte_width,
                              output2d.green, pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));
  cudaCheckError(cudaMemcpy2D(params.output_image.blue, byte_width,
                              output2d.blue, pitch, byte_width, params.height,
                              cudaMemcpyDeviceToDevice));

  free_image(input2d);
  free_image(output2d);

  finish_test_planar(params);

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 4/4.1/Makefile:
--------------------------------------------------------------------------------
CUDAFLAGS ?= -g

ALL = transpose transpose-shared

all: $(ALL)

../utils.o: ../utils.cu ../utils.h
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<

%: %.cu ../utils.o
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^

# Dynamic parallelism requires separate compilation of kernels and
# host code
bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu

bst-sum.o: ../utils.h bst-sum.cu
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu

bst-sum: bst-sum.o bst-sum-kernels.o
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o

clean:
	rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
--------------------------------------------------------------------------------
/Code files/Section 4/4.1/transpose:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 4/4.1/transpose
--------------------------------------------------------------------------------
/Code files/Section 4/4.1/transpose-shared:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 4/4.1/transpose-shared
--------------------------------------------------------------------------------
/Code files/Section 4/4.1/transpose-shared.cu:
--------------------------------------------------------------------------------
// Matrix transpose using shared memory to ensure all writes coalesce.
// Example for video 4.1.

#include <assert.h>
#include <memory>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

// CUDA cooperative groups API
#include <cooperative_groups.h>

#include "../utils.h"

const int TILE_DIM = 16;

// Reference implementation on the host
void transpose_reference(const float *source, float *dest,
                         unsigned int dimension)
{
  for (int y = 0; y < dimension; y++) {
    for (int x = 0; x < dimension; x++) {
      dest[y + x * dimension] = source[x + y * dimension];
    }
  }
}

// Transpose a matrix
// For simplicity, we assume that the matrix is square, and that its
// dimension is a multiple of the block size, so we don't have to worry about
// pitch or bounds checking.
__global__ void transpose(const float *source, float *dest,
                          unsigned int dimension)
{
  // Shared memory to temporarily store data.
  // Note the padding of the Y dimension, to avoid bank conflicts.
  __shared__ float tile[TILE_DIM][TILE_DIM + 1];

  int x_in = blockIdx.x * blockDim.x + threadIdx.x;
  int y_in = blockIdx.y * blockDim.y + threadIdx.y;
  int source_index = x_in + y_in * dimension;

  // Read from global memory to shared memory. Global memory access is
  // aligned.
  tile[threadIdx.y][threadIdx.x] = source[source_index];

  // Wait for all threads in the block to finish, so the shared memory tile
  // is filled.
  cooperative_groups::thread_block block =
      cooperative_groups::this_thread_block();
  cooperative_groups::sync(block);

  // Output coordinates. Note that blockIdx.y is used to determine x_out, and
  // blockIdx.x is used to determine y_out.
  int x_out = blockIdx.y * blockDim.y + threadIdx.x;
  int y_out = blockIdx.x * blockDim.y + threadIdx.y;
  int dest_index = x_out + y_out * dimension;

  // Read from a different index in the shared memory tile, and write to
  // global memory. Global memory access is once again aligned.
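  // (Editorial note: swapping the index roles here is what keeps the store
  // coalesced: thread (tx, ty) wrote tile[ty][tx] from a row of the source
  // and now reads tile[tx][ty], a column of the tile, so consecutive tx
  // values still produce consecutive addresses in dest. The +1 padding in
  // the tile declaration keeps those column reads in distinct shared-memory
  // banks.)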
  dest[dest_index] = tile[threadIdx.x][threadIdx.y];
}

int main(int argc, char **argv)
{
  const unsigned int DIMENSION = 4096;
  const unsigned int COUNT = DIMENSION * DIMENSION;
  std::unique_ptr<float[]> source(new float[COUNT]);
  std::unique_ptr<float[]> dest(new float[COUNT]);

  // Fill source matrix with some arbitrary test values
  for (int i = 0; i < COUNT; i++) {
    source[i] = i;
  }

  // Allocate and fill device memory
  float *source_dev, *dest_dev;
  size_t size = COUNT * sizeof(float);
  cudaCheckError(cudaMalloc(&dest_dev, size));
  cudaCheckError(cudaMalloc(&source_dev, size));
  cudaCheckError(
      cudaMemcpy(source_dev, source.get(), size, cudaMemcpyHostToDevice));

  // Run the kernel
  dim3 block_dim(TILE_DIM, TILE_DIM);
  dim3 grid_dim((DIMENSION + block_dim.x - 1) / block_dim.x,
                (DIMENSION + block_dim.y - 1) / block_dim.y);

  {
    KernelTimer t;
    transpose<<<grid_dim, block_dim>>>(source_dev, dest_dev, DIMENSION);
  }

  // Copy results back to the host
  cudaCheckError(
      cudaMemcpy(dest.get(), dest_dev, size, cudaMemcpyDeviceToHost));
  cudaCheckError(cudaFree(dest_dev));
  cudaCheckError(cudaFree(source_dev));

  // Compare with reference implementation
  std::unique_ptr<float[]> dest_reference(new float[COUNT]);
  transpose_reference(source.get(), dest_reference.get(), DIMENSION);

  for (int i = 0; i < COUNT; i++) {
    assert(dest_reference.get()[i] == dest.get()[i]);
  }

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 4/4.1/transpose.cu:
--------------------------------------------------------------------------------
// Matrix transpose with direct access to global memory.
// Writes are uncoalesced.
// Example for video 4.1.

#include <assert.h>
#include <memory>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

#include "../utils.h"

const int TILE_DIM = 16;

// Reference implementation on the host
void transpose_reference(const float *source, float *dest,
                         unsigned int dimension)
{
  for (int y = 0; y < dimension; y++) {
    for (int x = 0; x < dimension; x++) {
      dest[y + x * dimension] = source[x + y * dimension];
    }
  }
}

// Transpose a matrix
// For simplicity, we assume that the matrix is square, and that its
// dimension is a multiple of the block size, so we don't have to worry about
// pitch or bounds checking.
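// (Editorial note: each thread in the kernel below writes dest at a stride
// of `dimension` floats; with dimension = 4096 the 32 threads of a warp
// store addresses 16 KB apart, so every warp store splits into 32 separate
// transactions. transpose-shared.cu removes exactly this bottleneck.)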
__global__ void transpose(const float *source, float *dest,
                          unsigned int dimension)
{
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;

  int source_index = y * dimension + x;
  int dest_index = x * dimension + y;

  dest[dest_index] = source[source_index];
}

int main(int argc, char **argv)
{
  const unsigned int DIMENSION = 4096;
  const unsigned int COUNT = DIMENSION * DIMENSION;
  std::unique_ptr<float[]> source(new float[COUNT]);
  std::unique_ptr<float[]> dest(new float[COUNT]);

  // Fill source matrix with some arbitrary test values
  for (int i = 0; i < COUNT; i++) {
    source[i] = i;
  }

  // Allocate and fill device memory
  float *source_dev, *dest_dev;
  size_t size = COUNT * sizeof(float);
  cudaCheckError(cudaMalloc(&dest_dev, size));
  cudaCheckError(cudaMalloc(&source_dev, size));
  cudaCheckError(
      cudaMemcpy(source_dev, source.get(), size, cudaMemcpyHostToDevice));

  // Run the kernel
  dim3 block_dim(TILE_DIM, TILE_DIM);
  dim3 grid_dim((DIMENSION + block_dim.x - 1) / block_dim.x,
                (DIMENSION + block_dim.y - 1) / block_dim.y);

  {
    KernelTimer t;
    transpose<<<grid_dim, block_dim>>>(source_dev, dest_dev, DIMENSION);
  }

  // Copy results back to the host
  cudaCheckError(
      cudaMemcpy(dest.get(), dest_dev, size, cudaMemcpyDeviceToHost));
  cudaCheckError(cudaFree(dest_dev));
  cudaCheckError(cudaFree(source_dev));

  // Compare with reference implementation
  std::unique_ptr<float[]> dest_reference(new float[COUNT]);
  transpose_reference(source.get(), dest_reference.get(), DIMENSION);

  for (int i = 0; i < COUNT; i++) {
    assert(dest_reference.get()[i] == dest.get()[i]);
  }

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 4/4.2/Makefile:
--------------------------------------------------------------------------------
CUDAFLAGS ?= -g

ALL = reduce

all: $(ALL)

../utils.o: ../utils.cu ../utils.h
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<

%: %.cu ../utils.o
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^

# Dynamic parallelism requires separate compilation of kernels and
# host code
bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu

bst-sum.o: ../utils.h bst-sum.cu
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu

bst-sum: bst-sum.o bst-sum-kernels.o
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o

clean:
	rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
--------------------------------------------------------------------------------
/Code files/Section 4/4.2/reduce:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 4/4.2/reduce
--------------------------------------------------------------------------------
/Code files/Section 4/4.2/reduce.cu:
--------------------------------------------------------------------------------
// Reduce an array to a single value by summing all of its elements.
// Example for video 4.1.
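//
// (Editorial sketch of the structure: each block folds 2 * blockDim.x input
// elements into one partial sum in dest[blockIdx.x]; the last block to take
// a ticket from an atomic counter then adds the per-block partials serially,
// so the whole reduction finishes in a single kernel launch rather than a
// recursive series of launches.)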

#include <assert.h>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

// CUDA cooperative groups API
#include <cooperative_groups.h>

#include "../utils.h"

__device__ unsigned int blocks_finished = 0;
// Wait for all blocks in the grid to execute this function.
// Returns true for thread 0 of the last block, false for all
// other threads.
__device__ bool wait_for_all_blocks()
{
  // Wait until global write is visible to all other blocks
  __threadfence();

  // Wait for all blocks to finish by atomically incrementing a counter
  bool is_last = false;
  if (threadIdx.x == 0) {
    unsigned int ticket = atomicInc(&blocks_finished, gridDim.x);
    is_last = (ticket == gridDim.x - 1);
  }
  if (is_last) {
    blocks_finished = 0;
  }
  return is_last;
}

__device__ int reduce_block(const int *source, int sdata[],
                            cooperative_groups::thread_block block)
{
  unsigned int index = blockIdx.x * blockDim.x * 2 + threadIdx.x;
  auto tid = threadIdx.x;

  // Add two elements into shared memory
  sdata[tid] = source[index] + source[index + blockDim.x];

  cooperative_groups::sync(block);

  // When shared memory block is filled, reduce within that block.
  for (int stride = 1; stride < blockDim.x; stride *= 2) {
    int index = 2 * stride * tid;
    if (index < blockDim.x) {
      sdata[index] += sdata[index + stride];
    }
    cooperative_groups::sync(block);
  }

  return sdata[0];
}

// Sum the source array. The dest array must have one element per block --
// the first element will contain the final result, and the rest are used for
// temporary storage.
__global__ void reduce(const int *source, int *dest)
{
  extern __shared__ int sdata[];

  int block_result =
      reduce_block(source, sdata, cooperative_groups::this_thread_block());

  // The last thread of each block writes the block result into global memory
  if (threadIdx.x == 0) {
    dest[blockIdx.x] = block_result;
  }

  bool is_last = wait_for_all_blocks();

  // All blocks have passed the threadfence, so all writes are visible to all
  // blocks. Now we can use one thread to sum the results from each block.
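  // (Editorial note: exactly one thread in the whole grid sees is_last ==
  // true, namely thread 0 of whichever block drew the final ticket from
  // atomicInc, so the loop below over gridDim.x partial sums runs once and
  // needs no further synchronization.)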
  if (is_last) {
    int sum = 0;
    for (int i = 0; i < gridDim.x; i++) {
      sum += dest[i];
    }
    // Final sum goes in dest[0]
    dest[0] = sum;
  }
}

int main(int argc, char **argv)
{
  const unsigned int COUNT = 4096 * 4096;
  std::unique_ptr<int[]> source(new int[COUNT]);

  // Fill source matrix with some arbitrary test values
  std::mt19937 rng;
  rng.seed(0);
  std::uniform_int_distribution<int> dist(0, 9);

  for (int i = 0; i < COUNT; i++) {
    source[i] = dist(rng);
  }

  // Allocate and fill device memory
  int *source_dev, *dest_dev;
  size_t size = COUNT * sizeof(int);
  cudaCheckError(cudaMalloc(&source_dev, size));
  cudaCheckError(
      cudaMemcpy(source_dev, source.get(), size, cudaMemcpyHostToDevice));

  // Run the kernel
  int BLOCK_SIZE = 128;
  int n_blocks = (COUNT + BLOCK_SIZE - 1) / (2 * BLOCK_SIZE);

  cudaCheckError(cudaMalloc(&dest_dev, n_blocks * sizeof(int)));

  {
    KernelTimer t;
    size_t shared_memory_size = BLOCK_SIZE * sizeof(int);
    reduce<<<n_blocks, BLOCK_SIZE, shared_memory_size>>>(source_dev, dest_dev);
  }

  // Copy result back to the host
  int result;
  cudaCheckError(
      cudaMemcpy(&result, dest_dev, sizeof(result), cudaMemcpyDeviceToHost));
  cudaCheckError(cudaFree(source_dev));
  cudaCheckError(cudaFree(dest_dev));

  // Compare with reference implementation
  int result_reference = std::accumulate(source.get(), source.get() + COUNT, 0);
  std::cout << "Sum of " << COUNT << " elements: " << result << "\n";
  assert(result_reference == result);

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 4/4.3/Makefile:
--------------------------------------------------------------------------------
CUDAFLAGS ?= -g

ALL = scan

all: $(ALL)

../utils.o: ../utils.cu ../utils.h
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<

%: %.cu ../utils.o
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^

# Dynamic parallelism requires separate compilation of kernels and
# host code
bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu

bst-sum.o: ../utils.h bst-sum.cu
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu

bst-sum: bst-sum.o bst-sum-kernels.o
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o

clean:
	rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
--------------------------------------------------------------------------------
/Code files/Section 4/4.3/scan:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 4/4.3/scan
--------------------------------------------------------------------------------
/Code files/Section 4/4.3/scan.cu:
--------------------------------------------------------------------------------
// Implementation of parallel prefix sum, aka scan.
// Example for video 4.3.
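//
// (Editorial sketch: block_scan below is a Hillis-Steele inclusive scan.
// Each of log2(blockDim.x) rounds adds the value `offset` positions to the
// left and doubles `offset`; for the input 3 1 7 0 the rounds produce
// 3 4 8 7 and then 3 4 11 11, the inclusive prefix sums. The zero-filled
// first half of the shared buffer absorbs the out-of-range reads, which is
// why no bounds checks are needed.)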

#include <assert.h>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

// CUDA cooperative groups API
#include <cooperative_groups.h>

#include "../utils.h"

void scan_reference(const int *source, int *dest, unsigned int count)
{
  int sum = 0;
  for (int i = 0; i < count; i++) {
    sum += source[i];
    dest[i] = sum;
  }
}

const int BLOCK_SIZE = 128;

// Scan using shared memory, within a single block.
__device__ int block_scan(int idata, int shared_data[],
                          cooperative_groups::thread_block block)
{
  // Index into shared memory
  int si = threadIdx.x;
  shared_data[si] = 0;
  si += blockDim.x;
  shared_data[si] = idata;

  for (int offset = 1; offset < blockDim.x; offset *= 2) {
    cooperative_groups::sync(block);
    int t = shared_data[si] + shared_data[si - offset];
    cooperative_groups::sync(block);
    shared_data[si] = t;
  }

  return shared_data[si];
}

// First step of scan: process each block separately
__global__ void scan1(const int *source, int *dest)
{
  // Shared memory buffer. By allocating extra elements we avoid bounds
  // checks on shared memory access.
  __shared__ int shared_data[2 * BLOCK_SIZE];

  // Index into global memory
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  // Load data from global memory
  int idata = source[index];

  // Shared memory scan within this block
  int result =
      block_scan(idata, shared_data, cooperative_groups::this_thread_block());

  // Write back to global memory
  dest[index] = result;
}

// Second step of scan: compute prefix sums for each block
__global__ void scan2(const int *dest, int *block_sums, unsigned int count)
{
  // Shared memory buffer. By allocating extra elements we avoid bounds
  // checks on shared memory access.
  __shared__ int shared_data[2 * BLOCK_SIZE];

  int index = blockIdx.x * blockDim.x + threadIdx.x;

  int idata = (index == 0) ? 0 : dest[index * blockDim.x - 1];
  block_sums[index] =
      block_scan(idata, shared_data, cooperative_groups::this_thread_block());
}

// Final step of scan: add block sums to every result.
__global__ void finish_scan(const int *block_sums, int *dest)
{
  __shared__ int block_sum;

  if (threadIdx.x == 0) {
    block_sum = block_sums[blockIdx.x];
  }
  cooperative_groups::sync(cooperative_groups::this_thread_block());

  int index = blockIdx.x * blockDim.x + threadIdx.x;
  dest[index] += block_sum;
}

int main(int argc, char **argv)
{
  // Maximum possible size with two-level scan.
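  // (Editorial note: scan1 covers BLOCK_SIZE elements per block, and scan2
  // runs as one block over at most BLOCK_SIZE block sums, so with
  // BLOCK_SIZE = 128 this two-level scheme tops out at 128 * 128 = 16384
  // elements; anything larger needs a third level.)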
  const unsigned int COUNT = BLOCK_SIZE * BLOCK_SIZE;
  std::unique_ptr<int[]> source(new int[COUNT]);
  std::unique_ptr<int[]> dest(new int[COUNT]);

  // Fill source matrix with some arbitrary test values
  std::mt19937 rng;
  rng.seed(0);
  std::uniform_int_distribution<int> dist(0, 9);

  for (int i = 0; i < COUNT; i++) {
    source[i] = dist(rng);
  }

  // Allocate and fill device memory
  int *source_dev, *dest_dev;
  size_t size = COUNT * sizeof(int);
  cudaCheckError(cudaMalloc(&source_dev, size));
  cudaCheckError(
      cudaMemcpy(source_dev, source.get(), size, cudaMemcpyHostToDevice));
  cudaCheckError(cudaMalloc(&dest_dev, size));

  int n_blocks1 = (COUNT + BLOCK_SIZE - 1) / BLOCK_SIZE;

  // Temporary buffer for kernel
  int *block_sums;
  cudaCheckError(cudaMalloc(&block_sums, n_blocks1 * sizeof(int)));

  {
    KernelTimer t;

    // Run the kernel
    scan1<<<n_blocks1, BLOCK_SIZE>>>(source_dev, dest_dev);

    int n_blocks2 = (n_blocks1 + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // If we had multiple blocks here, we'd need a third level of scans to
    // get the final result.
    assert(n_blocks2 == 1);
    scan2<<<n_blocks2, BLOCK_SIZE>>>(dest_dev, block_sums, n_blocks1);

    finish_scan<<<n_blocks1, BLOCK_SIZE>>>(block_sums, dest_dev);
  }

  // Copy result back to the host
  cudaCheckError(
      cudaMemcpy(dest.get(), dest_dev, size, cudaMemcpyDeviceToHost));
  cudaCheckError(cudaFree(source_dev));
  cudaCheckError(cudaFree(dest_dev));
  cudaCheckError(cudaFree(block_sums));

  // Compare with reference implementation
  std::unique_ptr<int[]> dest_reference(new int[COUNT]);
  scan_reference(source.get(), dest_reference.get(), COUNT);
  for (int i = 0; i < COUNT; i++) {
    assert(dest_reference.get()[i] == dest.get()[i]);
  }

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 4/4.4/Makefile:
--------------------------------------------------------------------------------
CUDAFLAGS ?= -g

ALL = filter

all: $(ALL)

../utils.o: ../utils.cu ../utils.h
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<

%: %.cu ../utils.o
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^

# Dynamic parallelism requires separate compilation of kernels and
# host code
bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu

bst-sum.o: ../utils.h bst-sum.cu
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu

bst-sum: bst-sum.o bst-sum-kernels.o
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o

clean:
	rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
--------------------------------------------------------------------------------
/Code files/Section 4/4.4/filter:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 4/4.4/filter
--------------------------------------------------------------------------------
/Code files/Section 4/4.4/filter.cu:
--------------------------------------------------------------------------------
// Filter the contents of an array.
// Uses a scan, followed by a separate kernel to fill the output.
// Example for video 4.4.

#include <algorithm>
#include <assert.h>
#include <iterator>
#include <memory>
#include <random>
#include <vector>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

// CUDA cooperative groups API
#include <cooperative_groups.h>

#include "../utils.h"

__host__ __device__ bool divisible_by_three(int value)
{
  return (value % 3) == 0;
}

const int BLOCK_SIZE = 128;

// Scan using shared memory, within a single block.
__device__ int block_scan(int idata, int shared_data[],
                          cooperative_groups::thread_block block)
{
  // Index into shared memory
  int si = threadIdx.x;
  shared_data[si] = 0;
  si += blockDim.x;
  shared_data[si] = idata;

  for (int offset = 1; offset < blockDim.x; offset *= 2) {
    cooperative_groups::sync(block);
    int t = shared_data[si] + shared_data[si - offset];
    cooperative_groups::sync(block);
    shared_data[si] = t;
  }

  return shared_data[si];
}

// First step of scan: process each block separately
__global__ void scan1(const int *source, int *dest)
{
  // Shared memory buffer. By allocating extra elements we avoid bounds
  // checks on shared memory access.
  __shared__ int shared_data[2 * BLOCK_SIZE];

  // Index into global memory
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  // Load data from global memory
  int idata = source[index];

  // Shared memory scan within this block
  int result =
      block_scan(idata, shared_data, cooperative_groups::this_thread_block());

  // Write back to global memory
  dest[index] = result;
}

// Second step of scan: compute prefix sums for each block
__global__ void scan2(const int *dest, int *block_sums, unsigned int count)
{
  // Shared memory buffer. By allocating extra elements we avoid bounds
  // checks on shared memory access.
  __shared__ int shared_data[2 * BLOCK_SIZE];

  int index = blockIdx.x * blockDim.x + threadIdx.x;

  int idata = (index == 0) ? 0 : dest[index * blockDim.x - 1];
  block_sums[index] =
      block_scan(idata, shared_data, cooperative_groups::this_thread_block());
}

// Final step of scan: add block sums to every result.
__global__ void finish_scan(const int *block_sums, int *dest)
{
  __shared__ int block_sum;

  if (threadIdx.x == 0) {
    block_sum = block_sums[blockIdx.x];
  }
  cooperative_groups::sync(cooperative_groups::this_thread_block());

  int index = blockIdx.x * blockDim.x + threadIdx.x;
  dest[index] += block_sum;
}

// Compute prefix sum of source
void scan(const int *source, int *dest, unsigned int count)
{
  int n_blocks1 = (count + BLOCK_SIZE - 1) / BLOCK_SIZE;

  // Temporary buffer for kernel
  int *block_sums;
  cudaCheckError(cudaMalloc(&block_sums, n_blocks1 * sizeof(int)));

  // Run the kernel
  scan1<<<n_blocks1, BLOCK_SIZE>>>(source, dest);

  int n_blocks2 = (n_blocks1 + BLOCK_SIZE - 1) / BLOCK_SIZE;
  // If we had multiple blocks here, we'd need a third level of scans to
  // get the final result.
  assert(n_blocks2 == 1);
  scan2<<<n_blocks2, BLOCK_SIZE>>>(dest, block_sums, n_blocks1);

  finish_scan<<<n_blocks1, BLOCK_SIZE>>>(block_sums, dest);

  cudaCheckError(cudaFree(block_sums));
}

// Test predicate for all elements of source. Fill result with a 1 for values
// that satisfy the predicate, and a 0 otherwise.
__global__ void evaluate_predicate(const int *source, int *result,
                                   unsigned int count)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  if (index < count) {
    result[index] = divisible_by_three(source[index]) ? 1 : 0;
  }
}

// Copy values that satisfy the predicate from source to result, using the
// indices array to place them in the correct position.
__global__ void fill_output(const int *source, const int *indices, int *result,
                            unsigned int count)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  if (index >= count) {
    return;
  }

  int value = source[index];
  if (divisible_by_three(value)) {
    // Subtract 1 from index because scan is inclusive (it counts the current
    // element), so the indices array will contain 1-based indices.
    int output_index = indices[index] - 1;
    result[output_index] = value;
  }
}

int main(int argc, char **argv)
{
  // Maximum possible size with two-level scan.
  const unsigned int COUNT = BLOCK_SIZE * BLOCK_SIZE;
  std::unique_ptr<int[]> source(new int[COUNT]);
  std::unique_ptr<int[]> dest(new int[COUNT]);

  // Fill source matrix with some arbitrary test values
  std::mt19937 rng;
  rng.seed(0);
  std::uniform_int_distribution<int> dist(0, 9);

  for (int i = 0; i < COUNT; i++) {
    source[i] = dist(rng);
  }

  // Allocate and fill device memory
  int *source_dev, *dest_dev;
  // Result of evaluating predicates
  int *predicates;
  // Indices at which to store each result element
  int *indices;
  size_t size = COUNT * sizeof(int);
  // Number of elements in the output array
  int output_count;
  cudaCheckError(cudaMalloc(&source_dev, size));
  cudaCheckError(
      cudaMemcpy(source_dev, source.get(), size, cudaMemcpyHostToDevice));
  cudaCheckError(cudaMalloc(&predicates, size));
  cudaCheckError(cudaMalloc(&indices, size));

  {
    KernelTimer t;

    int n_blocks = (COUNT + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // Test predicate for all source values
    evaluate_predicate<<<n_blocks, BLOCK_SIZE>>>(source_dev, predicates, COUNT);
    // Scan the predicate array to compute output indices
    scan(predicates, indices, COUNT);

    // Find the length of the output from the last index, and allocate the
    // array.
    cudaCheckError(cudaMemcpy(&output_count, indices + COUNT - 1, sizeof(int),
                              cudaMemcpyDeviceToHost));
    cudaCheckError(cudaMalloc(&dest_dev, output_count * sizeof(int)));

    // Copy elements from input to output
    fill_output<<<n_blocks, BLOCK_SIZE>>>(source_dev, indices, dest_dev, COUNT);
  }

  // Copy result back to the host
  cudaCheckError(cudaMemcpy(dest.get(), dest_dev, output_count * sizeof(int),
                            cudaMemcpyDeviceToHost));
  cudaCheckError(cudaFree(source_dev));
  cudaCheckError(cudaFree(dest_dev));
  cudaCheckError(cudaFree(predicates));
  cudaCheckError(cudaFree(indices));

  // Compare with reference implementation
  std::vector<int> dest_reference;
  std::copy_if(source.get(), source.get() + COUNT,
               std::back_inserter(dest_reference), divisible_by_three);
  assert(dest_reference.size() == output_count);
  for (int i = 0; i < output_count; i++) {
    assert(dest_reference[i] == dest.get()[i]);
  }

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 5/5.4/Makefile:
--------------------------------------------------------------------------------
CUDAFLAGS ?= -g

ALL = thrust

all: $(ALL)

../utils.o: ../utils.cu ../utils.h
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<

%: %.cu ../utils.o
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^

# Dynamic parallelism requires separate compilation of kernels and
# host code
bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu

bst-sum.o: ../utils.h bst-sum.cu
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu

bst-sum: bst-sum.o bst-sum-kernels.o
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o

clean:
	rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
--------------------------------------------------------------------------------
/Code files/Section 5/5.4/thrust:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 5/5.4/thrust
--------------------------------------------------------------------------------
/Code files/Section 5/5.4/thrust.cu:
--------------------------------------------------------------------------------
// Demonstration of basic thrust functionality.
// Example for video 5.4.

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <iostream>

int main(void)
{
  // Allocate two device_vectors with 10 elements
  thrust::device_vector<int> vec1(10);
  thrust::device_vector<int> vec2(10);

  // Initialize vec1 to 0,1,2,3, ....
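  // (Editorial note: thrust::sequence is equivalent to the hand-written
  // pattern vec1[i] = i. Like the transform and copy calls below, it
  // dispatches a CUDA kernel under the hood because the iterators come from
  // a device_vector.)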
  thrust::sequence(vec1.begin(), vec1.end());

  // vec2 = -vec1
  thrust::transform(vec1.begin(), vec1.end(), vec2.begin(),
                    thrust::negate<int>());

  // print vec2
  thrust::copy(vec2.begin(), vec2.end(),
               std::ostream_iterator<int>(std::cout, "\n"));

  return 0;
}
--------------------------------------------------------------------------------
/Code files/Section 6/6.1/Makefile:
--------------------------------------------------------------------------------
CUDAFLAGS ?= -g

ALL = reduce-stream

all: $(ALL)

../utils.o: ../utils.cu ../utils.h
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<

%: %.cu ../utils.o
	nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^

# Dynamic parallelism requires separate compilation of kernels and
# host code
bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu

bst-sum.o: ../utils.h bst-sum.cu
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu

bst-sum: bst-sum.o bst-sum-kernels.o
	nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o

clean:
	rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
--------------------------------------------------------------------------------
/Code files/Section 6/6.1/reduce-stream:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.1/reduce-stream
--------------------------------------------------------------------------------
/Code files/Section 6/6.1/reduce-stream.cu:
--------------------------------------------------------------------------------
// Concurrent execution of multiple single-block reductions.
// The reduce kernel is very efficient, but occupancy is low, so multiple
// concurrent launches are needed to achieve good throughput.
// Example for video 6.1.

#include <assert.h>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>

// Standard CUDA API functions
#include <cuda_runtime_api.h>

// CUDA cooperative groups API
#include <cooperative_groups.h>

#include "../utils.h"

__device__ unsigned int blocks_finished = 0;
// Wait for all blocks in the grid to execute this function.
// Returns true for thread 0 of the last block, false for all
// other threads.
__device__ bool wait_for_all_blocks()
{
  // Wait until global write is visible to all other blocks
  __threadfence();

  // Wait for all blocks to finish by atomically incrementing a counter
  bool is_last = false;
  if (threadIdx.x == 0) {
    unsigned int ticket = atomicInc(&blocks_finished, gridDim.x);
    is_last = (ticket == gridDim.x - 1);
  }
  if (is_last) {
    blocks_finished = 0;
  }
  return is_last;
}

__device__ int reduce_block(int value, int sdata[],
                            cooperative_groups::thread_block block)
{
  auto tid = threadIdx.x;

  // Fill shared memory with initial values
  sdata[tid] = value;

  cooperative_groups::sync(block);

  // When shared memory block is filled, reduce within that block.
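  // (Editorial note: this is the same tree reduction as in reduce.cu; round
  // k adds elements 2^k apart, so blockDim.x partial values collapse to one
  // in log2(blockDim.x) synchronized rounds, e.g. 8 rounds for the 256
  // threads used below.)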
  for (int stride = 1; stride < blockDim.x; stride *= 2) {
    int index = 2 * stride * tid;
    if (index < blockDim.x) {
      sdata[index] += sdata[index + stride];
    }
    cooperative_groups::sync(block);
  }

  return sdata[0];
}

// Sum the source array and store the sum in dest.
// Requires block_size * sizeof(int) bytes of shared memory.

// This kernel should always be launched with a single block. Unlike the
// previous reduce example, it keeps all threads busy and does not store any
// temporary data in global memory. However, occupancy is very low due to
// running a single block.
__global__ void reduce_single_block(const int *source, int *dest,
                                    unsigned int count)
{
  extern __shared__ int sdata[];

  int sum = 0;
  for (int i = threadIdx.x; i < count; i += blockDim.x) {
    sum += source[i];
  }

  sum = reduce_block(sum, sdata, cooperative_groups::this_thread_block());

  // The last thread of the block writes the result into global memory
  if (threadIdx.x == 0) {
    *dest = sum;
  }
}

int main(int argc, char **argv)
{
  const unsigned int COUNT = 4096 * 4096;
  std::unique_ptr<int[]> source(new int[COUNT]);

  // Fill source matrix with some arbitrary test values
  std::mt19937 rng;
  rng.seed(0);
  std::uniform_int_distribution<int> dist(0, 9);

  for (int i = 0; i < COUNT; i++) {
    source[i] = dist(rng);
  }

  const int N_STREAMS = 16;
  int *results[N_STREAMS];
  int *sources[N_STREAMS];
  cudaStream_t stream[N_STREAMS];

  // Create streams, and allocate input and output for each stream.
  size_t size = COUNT * sizeof(int);
  for (int i = 0; i < N_STREAMS; i++) {
    cudaCheckError(cudaStreamCreate(&stream[i]));
    cudaCheckError(cudaMalloc(&results[i], sizeof(int)));
    cudaCheckError(cudaMalloc(&sources[i], size));
    cudaCheckError(
        cudaMemcpy(sources[i], source.get(), size, cudaMemcpyHostToDevice));
  }

  // Run the kernel
  const int BLOCK_SIZE = 256;
  size_t shared_memory_size = BLOCK_SIZE * sizeof(int);

  {
    KernelTimer t;
    for (int i = 0; i < N_STREAMS; i++) {
      // Launch each instance of this kernel in a separate stream.
      reduce_single_block<<<1, BLOCK_SIZE, shared_memory_size, stream[i]>>>(
          sources[i], results[i], COUNT);
    }

    // All work has been dispatched to the device. The kernels will run
    // concurrently if there is room on the device. The host is idle now, and
    // we can do additional concurrent processing on the host.
132 | }
133 |
134 | // Wait for all streams to finish
135 | cudaCheckError(cudaDeviceSynchronize());
136 |
137 | // Copy one result back to the host (every stream computed the same sum)
138 | int result;
139 | cudaCheckError(
140 | cudaMemcpy(&result, results[0], sizeof(result), cudaMemcpyDeviceToHost));
141 | for (int i = 0; i < N_STREAMS; i++) {
142 | cudaCheckError(cudaFree(sources[i]));
143 | cudaCheckError(cudaFree(results[i]));
144 | }
145 |
146 | // Compare with reference implementation
147 | int result_reference = std::accumulate(source.get(), source.get() + COUNT, 0);
148 | std::cout << "Sum of " << COUNT << " elements: " << result << "\n";
149 | assert(result_reference == result);
150 |
151 | return 0;
152 | }
153 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.2/Makefile: --------------------------------------------------------------------------------
1 | CUDAFLAGS ?= -g
2 |
3 | ALL = scan-stream scan-page-locked
4 |
5 | all: $(ALL)
6 |
7 | ../utils.o: ../utils.cu ../utils.h
8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<
9 |
10 | %: %.cu ../utils.o
11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^
12 |
13 | # Dynamic parallelism requires separate compilation of kernels and
14 | # host code
15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu
17 |
18 | bst-sum.o: ../utils.h bst-sum.cu
19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu
20 |
21 | bst-sum: bst-sum.o bst-sum-kernels.o
22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o
23 |
24 | clean:
25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
26 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.2/scan-page-locked: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.2/scan-page-locked -------------------------------------------------------------------------------- /Code files/Section 6/6.2/scan-page-locked.cu: --------------------------------------------------------------------------------
1 | // Run multiple scans in separate streams, using page-locked memory to
2 | // overlap transfers and computation.
3 | // Example for video 6.2.
4 |
5 | #include <cassert>
6 | #include <cstddef>
7 | #include <iostream>
8 | #include <memory>
9 | #include <random>
10 |
11 | // Standard CUDA API functions
12 | #include <cuda_runtime_api.h>
13 |
14 | // CUDA cooperative groups API
15 | #include <cooperative_groups.h>
16 |
17 | #include "../utils.h"
18 |
19 | void scan_reference(const int *source, int *dest, unsigned int count)
20 | {
21 | int sum = 0;
22 | for (int i = 0; i < count; i++) {
23 | sum += source[i];
24 | dest[i] = sum;
25 | }
26 | }
27 |
28 | const int BLOCK_SIZE = 1024;
29 |
30 | // Scan using shared memory, within a single block.
31 | __device__ int block_scan(int idata, int shared_data[],
32 | cooperative_groups::thread_block block)
33 | {
34 | // Index into shared memory
35 | int si = threadIdx.x;
36 | shared_data[si] = 0;
37 | si += blockDim.x;
38 | shared_data[si] = idata;
39 |
40 | for (int offset = 1; offset < blockDim.x; offset *= 2) {
41 | cooperative_groups::sync(block);
42 | int t = shared_data[si] + shared_data[si - offset];
43 | cooperative_groups::sync(block);
44 | shared_data[si] = t;
45 | }
46 |
47 | return shared_data[si];
48 | }
49 |
50 | // First step of scan: process each block separately
51 | __global__ void scan1(const int *source, int *dest)
52 | {
53 | // Shared memory buffer. By allocating extra elements we avoid bounds
54 | // checks on shared memory access.
55 | __shared__ int shared_data[2 * BLOCK_SIZE];
56 |
57 | // Index into global memory
58 | int index = blockIdx.x * blockDim.x + threadIdx.x;
59 |
60 | // Load data from global memory
61 | int idata = source[index];
62 |
63 | // Shared memory scan within this block
64 | int result =
65 | block_scan(idata, shared_data, cooperative_groups::this_thread_block());
66 |
67 | // Write back to global memory
68 | dest[index] = result;
69 | }
70 |
71 | // Second step of scan: compute prefix sums for each block
72 | __global__ void scan2(const int *dest, int *block_sums, unsigned int count)
73 | {
74 | // Shared memory buffer. By allocating extra elements we avoid bounds
75 | // checks on shared memory access.
76 | __shared__ int shared_data[2 * BLOCK_SIZE];
77 |
78 | int index = blockIdx.x * blockDim.x + threadIdx.x;
79 |
80 | int idata = (index == 0) ? 0 : dest[index * blockDim.x - 1];
81 | block_sums[index] =
82 | block_scan(idata, shared_data, cooperative_groups::this_thread_block());
83 | }
84 |
85 | // Final step of scan: add block sums to every result.
86 | __global__ void finish_scan(const int *block_sums, int *dest)
87 | {
88 | __shared__ int block_sum;
89 |
90 | if (threadIdx.x == 0) {
91 | block_sum = block_sums[blockIdx.x];
92 | }
93 | cooperative_groups::sync(cooperative_groups::this_thread_block());
94 |
95 | int index = blockIdx.x * blockDim.x + threadIdx.x;
96 | dest[index] += block_sum;
97 | }
98 |
99 | int main(int argc, char **argv)
100 | {
101 | // Maximum possible size with two-level scan.
102 | const unsigned int COUNT = BLOCK_SIZE * BLOCK_SIZE;
103 | const int N_STREAMS = 2;
104 |
105 | int *sources[N_STREAMS], *dests[N_STREAMS];
106 |
107 | // Fill source arrays with some arbitrary test values
108 | std::mt19937 rng;
109 | rng.seed(0);
110 | std::uniform_int_distribution<int> dist(0, 9);
111 |
112 | for (int i = 0; i < N_STREAMS; i++) {
113 | // Allocate page-locked memory to allow asynchronous transfers.
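// (Pinned pages cannot be swapped out, so the GPU's DMA engine can read
// and write them directly and cudaMemcpyAsync really is asynchronous.
// With ordinary pageable memory the driver has to stage each transfer
// through an internal pinned buffer, and the "async" copy degrades to a
// blocking one with respect to the host.)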
114 | cudaCheckError(cudaMallocHost(&sources[i], COUNT * sizeof(int)));
115 | cudaCheckError(cudaMallocHost(&dests[i], COUNT * sizeof(int)));
116 | for (int j = 0; j < COUNT; j++) {
117 | sources[i][j] = dist(rng);
118 | }
119 | }
120 |
121 | // Allocate device memory and transfer data
122 | int n_blocks1 = (COUNT + BLOCK_SIZE - 1) / BLOCK_SIZE;
123 |
124 | int *sources_dev[N_STREAMS], *dests_dev[N_STREAMS], *block_sums[N_STREAMS];
125 | size_t size = COUNT * sizeof(int);
126 | cudaStream_t stream[N_STREAMS];
127 |
128 | for (int i = 0; i < N_STREAMS; i++) {
129 | cudaCheckError(cudaStreamCreate(&stream[i]));
130 | cudaCheckError(cudaMalloc(&sources_dev[i], size));
131 | cudaCheckError(cudaMalloc(&dests_dev[i], size));
132 | // Temporary buffer for kernels
133 | cudaCheckError(cudaMalloc(&block_sums[i], n_blocks1 * sizeof(int)));
134 | }
135 |
136 | {
137 | KernelTimer t;
138 |
139 | for (int i = 0; i < N_STREAMS; i++) {
140 | // Copy data to device
141 | cudaCheckError(cudaMemcpyAsync(sources_dev[i], sources[i], size,
142 | cudaMemcpyHostToDevice, stream[i]));
143 |
144 | // Run the scan
145 | scan1<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(sources_dev[i],
146 | dests_dev[i]);
147 |
148 | int n_blocks2 = (n_blocks1 + BLOCK_SIZE - 1) / BLOCK_SIZE;
149 | assert(n_blocks2 == 1);
150 | scan2<<<n_blocks2, BLOCK_SIZE, 0, stream[i]>>>(dests_dev[i],
151 | block_sums[i], n_blocks1);
152 |
153 | finish_scan<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(block_sums[i],
154 | dests_dev[i]);
155 |
156 | // Copy results back to the host
157 | cudaCheckError(cudaMemcpyAsync(dests[i], dests_dev[i], size,
158 | cudaMemcpyDeviceToHost, stream[i]));
159 | }
160 | }
161 |
162 | for (int i = 0; i < N_STREAMS; i++) {
163 | cudaCheckError(cudaFree(sources_dev[i]));
164 | cudaCheckError(cudaFree(dests_dev[i]));
165 | cudaCheckError(cudaFree(block_sums[i]));
166 | }
167 |
168 | // Compare with reference implementation
169 | std::unique_ptr<int[]> dest_reference(new int[COUNT]);
170 | for (int i = 0; i < N_STREAMS; i++) {
171 | scan_reference(sources[i], dest_reference.get(), COUNT);
172 | for (int j = 0; j < COUNT; j++) {
173 | assert(dest_reference.get()[j] == dests[i][j]);
174 | }
175 | }
176 |
177 | return 0;
178 | }
179 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.2/scan-stream: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.2/scan-stream -------------------------------------------------------------------------------- /Code files/Section 6/6.2/scan-stream.cu: --------------------------------------------------------------------------------
1 | // Run multiple scans in separate streams.
2 | // Example for video 6.2.
3 |
4 | #include <cassert>
5 | #include <cstddef>
6 | #include <iostream>
7 | #include <memory>
8 | #include <random>
9 |
10 | // Standard CUDA API functions
11 | #include <cuda_runtime_api.h>
12 |
13 | // CUDA cooperative groups API
14 | #include <cooperative_groups.h>
15 |
16 | #include "../utils.h"
17 |
18 | void scan_reference(const int *source, int *dest, unsigned int count)
19 | {
20 | int sum = 0;
21 | for (int i = 0; i < count; i++) {
22 | sum += source[i];
23 | dest[i] = sum;
24 | }
25 | }
26 |
27 | const int BLOCK_SIZE = 1024;
28 |
29 | // Scan using shared memory, within a single block.
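// block_scan below is a Hillis-Steele inclusive scan in shared memory.
// The buffer is 2 * BLOCK_SIZE wide: the lower half is zero-filled and
// each thread owns slot si = threadIdx.x + blockDim.x. On every pass a
// thread adds the value `offset` slots to its left, with offset doubling,
// so reads that would fall off the front land harmlessly in the zero
// padding instead of needing a bounds check. After log2(blockDim.x)
// passes each slot holds the inclusive prefix sum of the block.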
30 | __device__ int block_scan(int idata, int shared_data[], 31 | cooperative_groups::thread_block block) 32 | { 33 | // Index into shared memory 34 | int si = threadIdx.x; 35 | shared_data[si] = 0; 36 | si += blockDim.x; 37 | shared_data[si] = idata; 38 | 39 | for (int offset = 1; offset < blockDim.x; offset *= 2) { 40 | cooperative_groups::sync(block); 41 | int t = shared_data[si] + shared_data[si - offset]; 42 | cooperative_groups::sync(block); 43 | shared_data[si] = t; 44 | } 45 | 46 | return shared_data[si]; 47 | } 48 | 49 | // First step of scan: process each block separately 50 | __global__ void scan1(const int *source, int *dest) 51 | { 52 | // Shared memory buffer. By allocating extra elements we avoid bounds 53 | // checks on shared memory access. 54 | __shared__ int shared_data[2 * BLOCK_SIZE]; 55 | 56 | // Index into global memory 57 | int index = blockIdx.x * blockDim.x + threadIdx.x; 58 | 59 | // Load data from global memory 60 | int idata = source[index]; 61 | 62 | // Shared memory scan within this block 63 | int result = 64 | block_scan(idata, shared_data, cooperative_groups::this_thread_block()); 65 | 66 | // Write back to global memory 67 | dest[index] = result; 68 | } 69 | 70 | // Second step of scan: compute prefix sums for each block 71 | __global__ void scan2(const int *dest, int *block_sums, unsigned int count) 72 | { 73 | // Shared memory buffer. By allocating extra elements we avoid bounds 74 | // checks on shared memory access. 75 | __shared__ int shared_data[2 * BLOCK_SIZE]; 76 | 77 | int index = blockIdx.x * blockDim.x + threadIdx.x; 78 | 79 | int idata = (index == 0) ? 0 : dest[index * blockDim.x - 1]; 80 | block_sums[index] = 81 | block_scan(idata, shared_data, cooperative_groups::this_thread_block()); 82 | } 83 | 84 | // Final step of scan: add block sums to every result. 85 | __global__ void finish_scan(const int *block_sums, int *dest) 86 | { 87 | __shared__ int block_sum; 88 | 89 | if (threadIdx.x == 0) { 90 | block_sum = block_sums[blockIdx.x]; 91 | } 92 | cooperative_groups::sync(cooperative_groups::this_thread_block()); 93 | 94 | int index = blockIdx.x * blockDim.x + threadIdx.x; 95 | dest[index] += block_sum; 96 | } 97 | 98 | int main(int argc, char **argv) 99 | { 100 | // Maximum possible size with two-level scan. 
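// Why BLOCK_SIZE * BLOCK_SIZE is the ceiling: scan1 covers BLOCK_SIZE
// elements per block, and scan2 scans the per-block sums with a single
// block, so it can handle at most BLOCK_SIZE of them. With BLOCK_SIZE =
// 1024 that caps a two-level scan at 1024 * 1024 elements; anything
// larger would need a third kernel level (or a loop).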
101 | const unsigned int COUNT = BLOCK_SIZE * BLOCK_SIZE;
102 | const int N_STREAMS = 2;
103 |
104 | int *sources[N_STREAMS], *dests[N_STREAMS];
105 |
106 | // Fill source arrays with some arbitrary test values
107 | std::mt19937 rng;
108 | rng.seed(0);
109 | std::uniform_int_distribution<int> dist(0, 9);
110 |
111 | for (int i = 0; i < N_STREAMS; i++) {
112 | sources[i] = new int[COUNT];
113 | dests[i] = new int[COUNT];
114 | for (int j = 0; j < COUNT; j++) {
115 | sources[i][j] = dist(rng);
116 | }
117 | }
118 |
119 | // Allocate device memory and transfer data
120 | int n_blocks1 = (COUNT + BLOCK_SIZE - 1) / BLOCK_SIZE;
121 |
122 | int *sources_dev[N_STREAMS], *dests_dev[N_STREAMS], *block_sums[N_STREAMS];
123 | size_t size = COUNT * sizeof(int);
124 | cudaStream_t stream[N_STREAMS];
125 |
126 | for (int i = 0; i < N_STREAMS; i++) {
127 | cudaCheckError(cudaStreamCreate(&stream[i]));
128 | cudaCheckError(cudaMalloc(&sources_dev[i], size));
129 | cudaCheckError(cudaMalloc(&dests_dev[i], size));
130 | // Temporary buffer for kernels
131 | cudaCheckError(cudaMalloc(&block_sums[i], n_blocks1 * sizeof(int)));
132 | }
133 |
134 | // Code in this block will be timed by KernelTimer
135 | {
136 | KernelTimer t;
137 |
138 | // Copy data to device
139 | for (int i = 0; i < N_STREAMS; i++) {
140 | cudaCheckError(
141 | cudaMemcpy(sources_dev[i], sources[i], size, cudaMemcpyHostToDevice));
142 | }
143 |
144 | // Run the scans in separate streams
145 | for (int i = 0; i < N_STREAMS; i++) {
146 | scan1<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(sources_dev[i],
147 | dests_dev[i]);
148 |
149 | int n_blocks2 = (n_blocks1 + BLOCK_SIZE - 1) / BLOCK_SIZE;
150 | assert(n_blocks2 == 1);
151 | scan2<<<n_blocks2, BLOCK_SIZE, 0, stream[i]>>>(dests_dev[i],
152 | block_sums[i], n_blocks1);
153 |
154 | finish_scan<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(block_sums[i],
155 | dests_dev[i]);
156 | }
157 |
158 | // Copy results back to the host
159 | for (int i = 0; i < N_STREAMS; i++) {
160 | cudaCheckError(
161 | cudaMemcpy(dests[i], dests_dev[i], size, cudaMemcpyDeviceToHost));
162 | }
163 | }
164 |
165 | for (int i = 0; i < N_STREAMS; i++) {
166 | cudaCheckError(cudaFree(sources_dev[i]));
167 | cudaCheckError(cudaFree(dests_dev[i]));
168 | cudaCheckError(cudaFree(block_sums[i]));
169 | }
170 |
171 | // Compare with reference implementation
172 | std::unique_ptr<int[]> dest_reference(new int[COUNT]);
173 | for (int i = 0; i < N_STREAMS; i++) {
174 | scan_reference(sources[i], dest_reference.get(), COUNT);
175 | for (int j = 0; j < COUNT; j++) {
176 | assert(dest_reference.get()[j] == dests[i][j]);
177 | }
178 | }
179 |
180 | return 0;
181 | }
182 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.4/Makefile: --------------------------------------------------------------------------------
1 | CUDAFLAGS ?= -g
2 |
3 | ALL = scan-multi-device
4 |
5 | all: $(ALL)
6 |
7 | ../utils.o: ../utils.cu ../utils.h
8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<
9 |
10 | %: %.cu ../utils.o
11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^
12 |
13 | # Dynamic parallelism requires separate compilation of kernels and
14 | # host code
15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu
17 |
18 | bst-sum.o: ../utils.h bst-sum.cu
19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu
20 |
21 | bst-sum: bst-sum.o bst-sum-kernels.o
22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o
23 |
24 | clean:
25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
26 |
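The scan-stream and scan-page-locked examples above differ only in how the host buffers are allocated: with pinned buffers, each stream's copy-in, kernels, and copy-out can overlap with the other stream's work. A minimal sketch of that pipeline pattern, with a placeholder `process` kernel and hypothetical buffer names (none of this is part of the repository), assuming pinned host memory:

#include <cuda_runtime_api.h>

// Placeholder kernel standing in for the scan1/scan2/finish_scan chain.
__global__ void process(float *data, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= 2.0f;
}

int main()
{
  const int N = 1 << 20, N_STREAMS = 2;
  size_t bytes = N * sizeof(float);
  float *host[N_STREAMS], *dev[N_STREAMS];
  cudaStream_t s[N_STREAMS];

  for (int i = 0; i < N_STREAMS; i++) {
    cudaMallocHost(&host[i], bytes);  // pinned, so async copies can overlap
    cudaMalloc(&dev[i], bytes);
    cudaStreamCreate(&s[i]);
  }

  // Each stream runs copy-in -> kernel -> copy-out. Work within a stream
  // executes in order; independent streams are free to overlap.
  for (int i = 0; i < N_STREAMS; i++) {
    cudaMemcpyAsync(dev[i], host[i], bytes, cudaMemcpyHostToDevice, s[i]);
    process<<<(N + 255) / 256, 256, 0, s[i]>>>(dev[i], N);
    cudaMemcpyAsync(host[i], dev[i], bytes, cudaMemcpyDeviceToHost, s[i]);
  }
  cudaDeviceSynchronize();  // drain both pipelines

  for (int i = 0; i < N_STREAMS; i++) {
    cudaStreamDestroy(s[i]);
    cudaFree(dev[i]);
    cudaFreeHost(host[i]);
  }
  return 0;
}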
-------------------------------------------------------------------------------- /Code files/Section 6/6.4/scan-multi-device: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.4/scan-multi-device -------------------------------------------------------------------------------- /Code files/Section 6/6.4/scan-multi-device.cu: --------------------------------------------------------------------------------
1 | // Run multiple scans concurrently across all available devices.
2 | // Example for video 6.4.
3 |
4 | #include <cassert>
5 | #include <cstddef>
6 | #include <iostream>
7 | #include <memory>
8 | #include <random>
9 |
10 | // Standard CUDA API functions
11 | #include <cuda_runtime_api.h>
12 |
13 | // CUDA cooperative groups API
14 | #include <cooperative_groups.h>
15 |
16 | #include "../utils.h"
17 |
18 | void scan_reference(const int *source, int *dest, unsigned int count)
19 | {
20 | int sum = 0;
21 | for (int i = 0; i < count; i++) {
22 | sum += source[i];
23 | dest[i] = sum;
24 | }
25 | }
26 |
27 | const int BLOCK_SIZE = 1024;
28 |
29 | // Scan using shared memory, within a single block.
30 | __device__ int block_scan(int idata, int shared_data[],
31 | cooperative_groups::thread_block block)
32 | {
33 | // Index into shared memory
34 | int si = threadIdx.x;
35 | shared_data[si] = 0;
36 | si += blockDim.x;
37 | shared_data[si] = idata;
38 |
39 | for (int offset = 1; offset < blockDim.x; offset *= 2) {
40 | cooperative_groups::sync(block);
41 | int t = shared_data[si] + shared_data[si - offset];
42 | cooperative_groups::sync(block);
43 | shared_data[si] = t;
44 | }
45 |
46 | return shared_data[si];
47 | }
48 |
49 | // First step of scan: process each block separately
50 | __global__ void scan1(const int *source, int *dest)
51 | {
52 | // Shared memory buffer. By allocating extra elements we avoid bounds
53 | // checks on shared memory access.
54 | __shared__ int shared_data[2 * BLOCK_SIZE];
55 |
56 | // Index into global memory
57 | int index = blockIdx.x * blockDim.x + threadIdx.x;
58 |
59 | // Load data from global memory
60 | int idata = source[index];
61 |
62 | // Shared memory scan within this block
63 | int result =
64 | block_scan(idata, shared_data, cooperative_groups::this_thread_block());
65 |
66 | // Write back to global memory
67 | dest[index] = result;
68 | }
69 |
70 | // Second step of scan: compute prefix sums for each block
71 | __global__ void scan2(const int *dest, int *block_sums, unsigned int count)
72 | {
73 | // Shared memory buffer. By allocating extra elements we avoid bounds
74 | // checks on shared memory access.
75 | __shared__ int shared_data[2 * BLOCK_SIZE];
76 |
77 | int index = blockIdx.x * blockDim.x + threadIdx.x;
78 |
79 | int idata = (index == 0) ? 0 : dest[index * blockDim.x - 1];
80 | block_sums[index] =
81 | block_scan(idata, shared_data, cooperative_groups::this_thread_block());
82 | }
83 |
84 | // Final step of scan: add block sums to every result.
85 | __global__ void finish_scan(const int *block_sums, int *dest)
86 | {
87 | __shared__ int block_sum;
88 |
89 | if (threadIdx.x == 0) {
90 | block_sum = block_sums[blockIdx.x];
91 | }
92 | cooperative_groups::sync(cooperative_groups::this_thread_block());
93 |
94 | int index = blockIdx.x * blockDim.x + threadIdx.x;
95 | dest[index] += block_sum;
96 | }
97 |
98 | int main(int argc, char **argv)
99 | {
100 | // Maximum possible size with two-level scan.
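// Multi-device notes: a stream created by cudaStreamCreate is bound to
// whichever device was current at creation time, and cudaMalloc likewise
// allocates on the current device. That is why the code below calls
// cudaSetDevice before creating each stream and again before enqueueing
// work on it. Also, cudaDeviceSynchronize waits only for the current
// device, so multi-device code has to synchronize each device in turn.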
101 | const unsigned int COUNT = BLOCK_SIZE * BLOCK_SIZE;
102 | const int N_STREAMS = 2;
103 |
104 | int *sources[N_STREAMS], *dests[N_STREAMS];
105 |
106 | // Fill source arrays with some arbitrary test values
107 | std::mt19937 rng;
108 | rng.seed(0);
109 | std::uniform_int_distribution<int> dist(0, 9);
110 |
111 | int device_count;
112 | cudaCheckError(cudaGetDeviceCount(&device_count));
113 |
114 | for (int i = 0; i < N_STREAMS; i++) {
115 | // Allocate page-locked memory to allow asynchronous transfers.
116 | cudaCheckError(cudaMallocHost(&sources[i], COUNT * sizeof(int)));
117 | cudaCheckError(cudaMallocHost(&dests[i], COUNT * sizeof(int)));
118 | for (int j = 0; j < COUNT; j++) {
119 | sources[i][j] = dist(rng);
120 | }
121 | }
122 |
123 | // Allocate device memory and transfer data
124 | int n_blocks1 = (COUNT + BLOCK_SIZE - 1) / BLOCK_SIZE;
125 |
126 | int *sources_dev[N_STREAMS], *dests_dev[N_STREAMS], *block_sums[N_STREAMS];
127 | size_t size = COUNT * sizeof(int);
128 | cudaStream_t stream[N_STREAMS];
129 |
130 | for (int i = 0; i < N_STREAMS; i++) {
131 | int device = i % device_count;
132 | cudaCheckError(cudaSetDevice(device));
133 | std::cout << "Stream " << i << " on device " << device << "\n";
134 | cudaCheckError(cudaStreamCreate(&stream[i]));
135 | cudaCheckError(cudaMalloc(&sources_dev[i], size));
136 | cudaCheckError(cudaMalloc(&dests_dev[i], size));
137 | // Temporary buffer for kernels
138 | cudaCheckError(cudaMalloc(&block_sums[i], n_blocks1 * sizeof(int)));
139 | }
140 |
141 | {
142 | KernelTimer t;
143 |
144 | for (int i = 0; i < N_STREAMS; i++) {
145 | int device = i % device_count;
146 | cudaCheckError(cudaSetDevice(device));
147 |
148 | // Copy data to device
149 | cudaCheckError(cudaMemcpyAsync(sources_dev[i], sources[i], size,
150 | cudaMemcpyHostToDevice, stream[i]));
151 |
152 | // Run the scan
153 | scan1<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(sources_dev[i],
154 | dests_dev[i]);
155 |
156 | int n_blocks2 = (n_blocks1 + BLOCK_SIZE - 1) / BLOCK_SIZE;
157 | assert(n_blocks2 == 1);
158 | scan2<<<n_blocks2, BLOCK_SIZE, 0, stream[i]>>>(dests_dev[i],
159 | block_sums[i], n_blocks1);
160 |
161 | finish_scan<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(block_sums[i],
162 | dests_dev[i]);
163 |
164 | // Copy results back to the host
165 | cudaCheckError(cudaMemcpyAsync(dests[i], dests_dev[i], size,
166 | cudaMemcpyDeviceToHost, stream[i]));
167 | }
168 | }
169 |
170 | for (int i = 0; i < N_STREAMS; i++) {
171 | cudaCheckError(cudaFree(sources_dev[i]));
172 | cudaCheckError(cudaFree(dests_dev[i]));
173 | cudaCheckError(cudaFree(block_sums[i]));
174 | }
175 |
176 | // Compare with reference implementation
177 | std::unique_ptr<int[]> dest_reference(new int[COUNT]);
178 | for (int i = 0; i < N_STREAMS; i++) {
179 | scan_reference(sources[i], dest_reference.get(), COUNT);
180 | for (int j = 0; j < COUNT; j++) {
181 | assert(dest_reference.get()[j] == dests[i][j]);
182 | }
183 | }
184 |
185 | return 0;
186 | }
187 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.5/Makefile: --------------------------------------------------------------------------------
1 | CUDAFLAGS ?= -g
2 |
3 | ALL = scan-unified
4 |
5 | all: $(ALL)
6 |
7 | ../utils.o: ../utils.cu ../utils.h
8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<
9 |
10 | %: %.cu ../utils.o
11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^
12 |
13 | # Dynamic parallelism requires separate compilation of kernels and
14 | # host code
15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu
17 |
18 | bst-sum.o: ../utils.h bst-sum.cu
19 | nvcc -std=c++11
$(CUDAFLAGS) -arch compute_35 -c bst-sum.cu
20 |
21 | bst-sum: bst-sum.o bst-sum-kernels.o
22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o
23 |
24 | clean:
25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
26 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.5/scan-unified: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.5/scan-unified -------------------------------------------------------------------------------- /Code files/Section 6/6.5/scan-unified.cu: --------------------------------------------------------------------------------
1 | // Demonstration of the unified virtual address space. Run multiple scans
2 | // concurrently across all available devices.
3 | // Example for video 6.5.
4 |
5 | #include <cassert>
6 | #include <iostream>
7 | #include <memory>
8 | #include <random>
9 | #include <string>
10 |
11 | // Standard CUDA API functions
12 | #include <cuda_runtime_api.h>
13 |
14 | // CUDA cooperative groups API
15 | #include <cooperative_groups.h>
16 |
17 | #include "../utils.h"
18 |
19 | void scan_reference(const int *source, int *dest, unsigned int count)
20 | {
21 | int sum = 0;
22 | for (int i = 0; i < count; i++) {
23 | sum += source[i];
24 | dest[i] = sum;
25 | }
26 | }
27 |
28 | const int BLOCK_SIZE = 1024;
29 |
30 | // Scan using shared memory, within a single block.
31 | __device__ int block_scan(int idata, int shared_data[],
32 | cooperative_groups::thread_block block)
33 | {
34 | // Index into shared memory
35 | int si = threadIdx.x;
36 | shared_data[si] = 0;
37 | si += blockDim.x;
38 | shared_data[si] = idata;
39 |
40 | for (int offset = 1; offset < blockDim.x; offset *= 2) {
41 | cooperative_groups::sync(block);
42 | int t = shared_data[si] + shared_data[si - offset];
43 | cooperative_groups::sync(block);
44 | shared_data[si] = t;
45 | }
46 |
47 | return shared_data[si];
48 | }
49 |
50 | // First step of scan: process each block separately
51 | __global__ void scan1(const int *source, int *dest)
52 | {
53 | // Shared memory buffer. By allocating extra elements we avoid bounds
54 | // checks on shared memory access.
55 | __shared__ int shared_data[2 * BLOCK_SIZE];
56 |
57 | // Index into global memory
58 | int index = blockIdx.x * blockDim.x + threadIdx.x;
59 |
60 | // Load data from global memory
61 | int idata = source[index];
62 |
63 | // Shared memory scan within this block
64 | int result =
65 | block_scan(idata, shared_data, cooperative_groups::this_thread_block());
66 |
67 | // Write back to global memory
68 | dest[index] = result;
69 | }
70 |
71 | // Second step of scan: compute prefix sums for each block
72 | __global__ void scan2(const int *dest, int *block_sums, unsigned int count)
73 | {
74 | // Shared memory buffer. By allocating extra elements we avoid bounds
75 | // checks on shared memory access.
76 | __shared__ int shared_data[2 * BLOCK_SIZE];
77 |
78 | int index = blockIdx.x * blockDim.x + threadIdx.x;
79 |
80 | int idata = (index == 0) ? 0 : dest[index * blockDim.x - 1];
81 | block_sums[index] =
82 | block_scan(idata, shared_data, cooperative_groups::this_thread_block());
83 | }
84 |
85 | // Final step of scan: add block sums to every result.
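// finish_scan broadcasts one value per block through shared memory:
// thread 0 loads this block's offset from block_sums, the whole block
// synchronizes, and then every thread adds that same offset to its
// element of dest.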
86 | __global__ void finish_scan(const int *block_sums, int *dest)
87 | {
88 | __shared__ int block_sum;
89 |
90 | if (threadIdx.x == 0) {
91 | block_sum = block_sums[blockIdx.x];
92 | }
93 | cooperative_groups::sync(cooperative_groups::this_thread_block());
94 |
95 | int index = blockIdx.x * blockDim.x + threadIdx.x;
96 | dest[index] += block_sum;
97 | }
98 |
99 | static void print_pointer(const std::string &name, const void *pointer)
100 | {
101 | cudaPointerAttributes attributes;
102 | auto result = cudaPointerGetAttributes(&attributes, pointer);
103 |
104 | std::cout << name << ": ";
105 | if (result != cudaSuccess) {
106 | std::cout << "get attributes failed\n";
107 | return;
108 | } else {
109 | switch (attributes.type) {
110 | case cudaMemoryTypeUnregistered:
111 | std::cout << "unregistered";
112 | break;
113 | case cudaMemoryTypeHost:
114 | std::cout << "host memory";
115 | break;
116 | case cudaMemoryTypeDevice:
117 | std::cout << "device " << attributes.device;
118 | break;
119 | case cudaMemoryTypeManaged:
120 | std::cout << "managed";
121 | break;
122 | }
123 | }
124 |
125 | std::cout << "\n";
126 | }
127 |
128 | int main(int argc, char **argv)
129 | {
130 | // Maximum possible size with two-level scan.
131 | const unsigned int COUNT = BLOCK_SIZE * BLOCK_SIZE;
132 | const int N_STREAMS = 2;
133 |
134 | int *sources[N_STREAMS], *dests[N_STREAMS];
135 |
136 | // Fill source arrays with some arbitrary test values
137 | std::mt19937 rng;
138 | rng.seed(0);
139 | std::uniform_int_distribution<int> dist(0, 9);
140 |
141 | int device_count;
142 | cudaCheckError(cudaGetDeviceCount(&device_count));
143 |
144 | for (int i = 0; i < N_STREAMS; i++) {
145 | // Allocate page-locked memory to allow asynchronous transfers.
146 | cudaCheckError(cudaMallocHost(&sources[i], COUNT * sizeof(int)));
147 | cudaCheckError(cudaMallocHost(&dests[i], COUNT * sizeof(int)));
148 | for (int j = 0; j < COUNT; j++) {
149 | sources[i][j] = dist(rng);
150 | }
151 | }
152 |
153 | // Allocate device memory and transfer data
154 | int n_blocks1 = (COUNT + BLOCK_SIZE - 1) / BLOCK_SIZE;
155 |
156 | int *sources_dev[N_STREAMS], *dests_dev[N_STREAMS], *block_sums[N_STREAMS];
157 | size_t size = COUNT * sizeof(int);
158 | cudaStream_t stream[N_STREAMS];
159 |
160 | for (int i = 0; i < N_STREAMS; i++) {
161 | int device = i % device_count;
162 | cudaCheckError(cudaSetDevice(device));
163 | cudaCheckError(cudaStreamCreate(&stream[i]));
164 | cudaCheckError(cudaMalloc(&sources_dev[i], size));
165 | cudaCheckError(cudaMalloc(&dests_dev[i], size));
166 | // Temporary buffer for kernels
167 | cudaCheckError(cudaMalloc(&block_sums[i], n_blocks1 * sizeof(int)));
168 | }
169 |
170 | {
171 | KernelTimer t;
172 |
173 | for (int i = 0; i < N_STREAMS; i++) {
174 | int device = i % device_count;
175 | cudaCheckError(cudaSetDevice(device));
176 |
177 | std::cout << "Stream " << i << " on device " << device << "\n";
178 | print_pointer("source", sources[i]);
179 | print_pointer("source_dev", sources_dev[i]);
180 | print_pointer("dest_dev", dests_dev[i]);
181 | print_pointer("dest", dests[i]);
182 |
183 | // Copy data to device
184 | cudaCheckError(cudaMemcpyAsync(sources_dev[i], sources[i], size,
185 | cudaMemcpyDefault, stream[i]));
186 |
187 | // Run the scan
188 | scan1<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(sources_dev[i],
189 | dests_dev[i]);
190 |
191 | int n_blocks2 = (n_blocks1 + BLOCK_SIZE - 1) / BLOCK_SIZE;
192 | assert(n_blocks2 == 1);
193 | scan2<<<n_blocks2, BLOCK_SIZE, 0, stream[i]>>>(dests_dev[i],
194 | block_sums[i], n_blocks1);
195 |
196 | finish_scan<<<n_blocks1, BLOCK_SIZE, 0, stream[i]>>>(block_sums[i],
197 | dests_dev[i]);
198 |
199 | // Copy results back to the host
200 | cudaCheckError(cudaMemcpyAsync(dests[i], dests_dev[i], size,
201 | cudaMemcpyDefault, stream[i]));
202 | std::cout << "\n";
203 | }
204 | }
205 |
206 | for (int i = 0; i < N_STREAMS; i++) {
207 | cudaCheckError(cudaFree(sources_dev[i]));
208 | cudaCheckError(cudaFree(dests_dev[i]));
209 | cudaCheckError(cudaFree(block_sums[i]));
210 | }
211 |
212 | // Compare with reference implementation
213 | std::unique_ptr<int[]> dest_reference(new int[COUNT]);
214 | for (int i = 0; i < N_STREAMS; i++) {
215 | scan_reference(sources[i], dest_reference.get(), COUNT);
216 | for (int j = 0; j < COUNT; j++) {
217 | assert(dest_reference.get()[j] == dests[i][j]);
218 | }
219 | }
220 |
221 | return 0;
222 | }
223 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.6/Makefile: --------------------------------------------------------------------------------
1 | CUDAFLAGS ?= -g
2 |
3 | ALL = bst
4 |
5 | all: $(ALL)
6 |
7 | ../utils.o: ../utils.cu ../utils.h
8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<
9 |
10 | %: %.cu ../utils.o
11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^
12 |
13 | # Dynamic parallelism requires separate compilation of kernels and
14 | # host code
15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu
17 |
18 | bst-sum.o: ../utils.h bst-sum.cu
19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu
20 |
21 | bst-sum: bst-sum.o bst-sum-kernels.o
22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o
23 |
24 | clean:
25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
26 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.6/bst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.6/bst -------------------------------------------------------------------------------- /Code files/Section 6/6.6/bst.cu: --------------------------------------------------------------------------------
1 | // Build and print a binary search tree on the device, using dynamic global
2 | // memory allocation.
3 | // Example for video 6.6.
4 |
5 | #include <thrust/device_vector.h>
6 | #include <thrust/sequence.h>
7 |
8 | // Standard CUDA API functions
9 | #include <cuda_runtime_api.h>
10 |
11 | #include "../utils.h"
12 |
13 | struct Tree {
14 | int value;
15 | Tree *left;
16 | Tree *right;
17 | };
18 |
19 | // Helper function to construct a binary search tree from a sorted array.
20 | __device__ void build_subtree(Tree *root, const int *source, int left,
21 | int right)
22 | {
23 | int middle = (left + right) / 2;
24 | root->value = source[middle];
25 |
26 | if (middle == left) {
27 | root->left = nullptr;
28 | } else {
29 | root->left = new Tree();
30 | build_subtree(root->left, source, left, middle - 1);
31 | }
32 |
33 | if (middle == right) {
34 | root->right = nullptr;
35 | } else {
36 | root->right = new Tree();
37 | build_subtree(root->right, source, middle + 1, right);
38 | }
39 | }
40 |
41 | // Construct a binary search tree from a sorted array. This kernel should be
42 | // run with a single thread.
43 | __global__ void build_tree(const int *source, unsigned int length, Tree *root)
44 | {
45 | build_subtree(root, source, 0, length - 1);
46 | }
47 |
48 | // Print the nodes of a tree, in order.
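// A note on the allocations above: device-side `new` draws from a
// separate device heap, which defaults to 8 MB. A much larger tree could
// exhaust it; the heap can be grown from the host with
// cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes) before the first
// kernel launch. Memory from device `new` must be freed with device
// `delete`; it cannot be passed to cudaFree on the host.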
49 | __device__ void print_subtree(const Tree *root)
50 | {
51 | if (root->left) {
52 | print_subtree(root->left);
53 | }
54 | printf("%d\n", root->value);
55 | if (root->right) {
56 | print_subtree(root->right);
57 | }
58 | }
59 | __global__ void print_tree(const Tree *root) { print_subtree(root); }
60 |
61 | // Free a device-allocated tree.
62 | __device__ void destroy_subtree(Tree *root)
63 | {
64 | if (root->left) {
65 | destroy_subtree(root->left);
66 | }
67 | if (root->right) {
68 | destroy_subtree(root->right);
69 | }
70 | delete root;
71 | }
72 |
73 | __global__ void destroy_tree(Tree *root)
74 | {
75 | if (root->left) {
76 | destroy_subtree(root->left);
77 | }
78 | if (root->right) {
79 | destroy_subtree(root->right);
80 | }
81 | // Do not destroy root! It was allocated with cudaMalloc and must be freed
82 | // from host code.
83 | }
84 |
85 | int main(int argc, char **argv)
86 | {
87 | const unsigned int COUNT = 128;
88 |
89 | // Create device vector with sequential integers
90 | thrust::device_vector<int> source(COUNT);
91 | thrust::sequence(source.begin(), source.end());
92 |
93 | // Allocate a root for the tree
94 | Tree *root;
95 | cudaCheckError(cudaMalloc(&root, sizeof(Tree)));
96 |
97 | // Build the tree from a sorted array
98 | build_tree<<<1, 1>>>(thrust::raw_pointer_cast(&source[0]), source.size(),
99 | root);
100 |
101 | // Print the tree values, in order
102 | print_tree<<<1, 1>>>(root);
103 |
104 | // Destroy all the subtrees which were allocated with new in device
105 | // code.
106 | destroy_tree<<<1, 1>>>(root);
107 |
108 | // Destroy the root which was allocated with cudaMalloc.
109 | cudaCheckError(cudaFree(root));
110 |
111 | cudaCheckError(cudaDeviceSynchronize());
112 | }
113 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.7/Makefile: --------------------------------------------------------------------------------
1 | CUDAFLAGS ?= -g
2 |
3 | ALL = bst-sum
4 |
5 | all: $(ALL)
6 |
7 | ../utils.o: ../utils.cu ../utils.h
8 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ -c $<
9 |
10 | %: %.cu ../utils.o
11 | nvcc -std=c++11 $(CUDAFLAGS) -o $@ $^
12 |
13 | # Dynamic parallelism requires separate compilation of kernels and
14 | # host code
15 | bst-sum-kernels.o: bst-sum-kernels.cu bst-sum-kernels.cuh
16 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -dc bst-sum-kernels.cu
17 |
18 | bst-sum.o: ../utils.h bst-sum.cu
19 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -c bst-sum.cu
20 |
21 | bst-sum: bst-sum.o bst-sum-kernels.o
22 | nvcc -std=c++11 $(CUDAFLAGS) -arch compute_35 -o bst-sum bst-sum.o bst-sum-kernels.o
23 |
24 | clean:
25 | rm -f ../utils.o bst-sum.o bst-sum-kernels.o manylights1 $(ALL)
26 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.7/bst-sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.7/bst-sum -------------------------------------------------------------------------------- /Code files/Section 6/6.7/bst-sum-kernels.cu: --------------------------------------------------------------------------------
1 | // Kernel code for bst-sum example, used in video 6.7.
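// These kernels rely on dynamic parallelism: sum_tree launches further
// kernels from device code, into device-side streams. That requires
// compute capability 3.5 or later plus relocatable device code and device
// linking, which is why the Makefile builds this file with
// -arch compute_35 -dc instead of using the pattern rule used elsewhere.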
2 |
3 | #include <cstdio>
4 |
5 | #include "bst-sum-kernels.cuh"
6 |
7 | #define kernelCheckError(code) \
8 | { \
9 | if ((code) != cudaSuccess) { \
10 | printf("Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \
11 | cudaGetErrorString(code)); \
12 | return; \
13 | } \
14 | }
15 |
16 | __device__ void build_subtree(Tree *root, const int *source, int left,
17 | int right)
18 | {
19 | int middle = (left + right) / 2;
20 | root->value = source[middle];
21 |
22 | if (left >= right) {
23 | return;
24 | }
25 |
26 | if (middle == left) {
27 | root->left = nullptr;
28 | } else {
29 | root->left = new Tree();
30 | build_subtree(root->left, source, left, middle - 1);
31 | }
32 |
33 | if (middle == right) {
34 | root->right = nullptr;
35 | } else {
36 | root->right = new Tree();
37 | build_subtree(root->right, source, middle + 1, right);
38 | }
39 | }
40 |
41 | __global__ void build_tree(const int *source, unsigned int length, Tree *root)
42 | {
43 | build_subtree(root, source, 0, length - 1);
44 | }
45 |
46 | __device__ void destroy_subtree(Tree *root)
47 | {
48 | if (root->left) {
49 | destroy_subtree(root->left);
50 | }
51 | if (root->right) {
52 | destroy_subtree(root->right);
53 | }
54 | delete root;
55 | }
56 |
57 | __global__ void destroy_tree(Tree *root)
58 | {
59 | if (root->left) {
60 | destroy_subtree(root->left);
61 | }
62 | if (root->right) {
63 | destroy_subtree(root->right);
64 | }
65 | // Do not destroy root! It was allocated with cudaMalloc and must be freed
66 | // from host code.
67 | }
68 |
69 | __global__ void sum_tree(const Tree *root, int *result)
70 | {
71 | // Allocate temporary global memory for storing subtree results
72 | int *left_sum = new int;
73 | int *right_sum = new int;
74 |
75 | // Create independent streams to sum each subtree
76 | cudaStream_t left_stream, right_stream;
77 | kernelCheckError(
78 | cudaStreamCreateWithFlags(&left_stream, cudaStreamNonBlocking));
79 | kernelCheckError(
80 | cudaStreamCreateWithFlags(&right_stream, cudaStreamNonBlocking));
81 |
82 | if (root->left) {
83 | sum_tree<<<1, 1, 0, left_stream>>>(root->left, left_sum);
84 | } else {
85 | *left_sum = 0;
86 | }
87 |
88 | if (root->right) {
89 | sum_tree<<<1, 1, 0, right_stream>>>(root->right, right_sum);
90 | } else {
91 | *right_sum = 0;
92 | }
93 |
94 | // Wait for both streams to finish
95 | kernelCheckError(cudaDeviceSynchronize());
96 |
97 | *result = root->value + *left_sum + *right_sum;
98 |
99 | kernelCheckError(cudaStreamDestroy(left_stream));
100 | kernelCheckError(cudaStreamDestroy(right_stream));
101 |
102 | delete left_sum;
103 | delete right_sum;
104 | }
105 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.7/bst-sum-kernels.cuh: --------------------------------------------------------------------------------
1 | // Header for bst-sum example, used in video 6.7.
2 |
3 | struct Tree {
4 | int value;
5 | Tree *left;
6 | Tree *right;
7 | };
8 |
9 | __global__ void build_tree(const int *source, unsigned int length, Tree *root);
10 | __global__ void destroy_tree(Tree *root);
11 | __global__ void sum_tree(const Tree *root, int *result);
12 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.7/bst-sum-kernels.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.7/bst-sum-kernels.o -------------------------------------------------------------------------------- /Code files/Section 6/6.7/bst-sum.cu: --------------------------------------------------------------------------------
1 | // Sum the contents of a binary search tree on the device, using dynamic
2 | // parallelism.
3 | // Example for video 6.7.
4 |
5 | #include <thrust/device_vector.h>
6 | #include <thrust/reduce.h>
7 | #include <thrust/sequence.h>
8 |
9 | #include <cassert>
10 |
11 | // Standard CUDA API functions
12 | #include <cuda_runtime_api.h>
13 |
14 | #include "bst-sum-kernels.cuh"
15 | #include "../utils.h"
16 |
17 | int main(int argc, char **argv)
18 | {
19 | const unsigned int COUNT = 128;
20 |
21 | // CUDA needs to reserve some device memory to manage synchronization for
22 | // nested kernels. If we exceed the maximum reserved depth, our kernel will
23 | // fail. This setting is sufficient for 128 elements. It should be adjusted
24 | // if COUNT is changed.
25 | cudaCheckError(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 8));
26 |
27 | // Create device vector with sequential integers
28 | thrust::device_vector<int> source(COUNT);
29 | thrust::sequence(source.begin(), source.end());
30 |
31 | // Build the tree
32 | Tree *root;
33 | cudaCheckError(cudaMalloc(&root, sizeof(Tree)));
34 |
35 | // Build the tree from a sorted array
36 | build_tree<<<1, 1>>>(thrust::raw_pointer_cast(&source[0]), source.size(),
37 | root);
38 |
39 | // Reduce
40 | int *result_dev;
41 | cudaCheckError(cudaMalloc(&result_dev, sizeof(int)));
42 |
43 | sum_tree<<<1, 1>>>(root, result_dev);
44 |
45 | // Check results
46 | int result;
47 | cudaCheckError(
48 | cudaMemcpy(&result, result_dev, sizeof(int), cudaMemcpyDefault));
49 | int reference =
50 | thrust::reduce(source.begin(), source.end(), 0, thrust::plus<int>());
51 |
52 | printf("Sum of %u elements: %d\n", COUNT, result);
53 | assert(result == reference);
54 |
55 | // Clean up
56 | destroy_tree<<<1, 1>>>(root);
57 | cudaCheckError(cudaFree(root));
58 |
59 | cudaCheckError(cudaDeviceSynchronize());
60 | }
61 |
-------------------------------------------------------------------------------- /Code files/Section 6/6.7/bst-sum.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/Section 6/6.7/bst-sum.o -------------------------------------------------------------------------------- /Code files/utils.cu: --------------------------------------------------------------------------------
1 | // Utility functions for example programs.
2 |
3 | #include <cassert>
4 | #include <cstdio>
5 | #include <cstdlib>
6 | #include <cstring>
7 | #include <fstream>
8 | #include <iostream>
9 |
10 | #include <memory>
11 |
12 | #include "utils.h"
13 |
14 | const unsigned int HEADER_SIZE = 0x40;
15 | const unsigned int CHANNELS = 3;
16 |
17 | bool loadPPM(const char *file, pixel **data, unsigned int *w, unsigned int *h)
18 | {
19 | FILE *fp = fopen(file, "rb");
20 |
21 | if (!fp) {
22 | std::cerr << "loadPPM() : failed to open file: " << file << "\n";
23 | return false;
24 | }
25 |
26 | // check header
27 | char header[HEADER_SIZE];
28 |
29 | if (fgets(header, HEADER_SIZE, fp) == nullptr) {
30 | std::cerr << "loadPPM(): reading header returned NULL\n";
31 | return false;
32 | }
33 |
34 | if (strncmp(header, "P6", 2)) {
35 | std::cerr << "unsupported image format\n";
36 | return false;
37 | }
38 |
39 | // parse header, read maxval, width and height
40 | unsigned int width = 0;
41 | unsigned int height = 0;
42 | unsigned int maxval = 0;
43 | unsigned int i = 0;
44 |
45 | while (i < 3) {
46 | if (fgets(header, HEADER_SIZE, fp) == NULL) {
47 | std::cerr << "loadPPM() : reading PPM header returned NULL" << std::endl;
48 | return false;
49 | }
50 |
51 | if (header[0] == '#') {
52 | continue;
53 | }
54 |
55 | if (i == 0) {
56 | i += sscanf(header, "%u %u %u", &width, &height, &maxval);
57 | } else if (i == 1) {
58 | i += sscanf(header, "%u %u", &height, &maxval);
59 | } else if (i == 2) {
60 | i += sscanf(header, "%u", &maxval);
61 | }
62 | }
63 |
64 | size_t pixel_count = width * height;
65 | size_t data_size = sizeof(unsigned char) * pixel_count * CHANNELS;
66 | unsigned char *raw_data = static_cast<unsigned char *>(malloc(data_size));
67 | *w = width;
68 | *h = height;
69 |
70 | // read and close file
71 | if (fread(raw_data, sizeof(unsigned char), pixel_count * CHANNELS, fp) == 0) {
72 | std::cerr << "loadPPM() read data returned error.\n";
73 | }
74 | fclose(fp);
75 |
76 | pixel *pixel_data = static_cast<pixel *>(malloc(pixel_count * sizeof(pixel)));
77 | float scale = 1.0f / 255.0f;
78 | for (int i = 0; i < pixel_count; i++) {
79 | pixel_data[i].red = raw_data[3 * i] * scale;
80 | pixel_data[i].green = raw_data[3 * i + 1] * scale;
81 | pixel_data[i].blue = raw_data[3 * i + 2] * scale;
82 | }
83 |
84 | *data = pixel_data;
85 | free(raw_data);
86 |
87 | return true;
88 | }
89 |
90 | void savePPM(const char *file, pixel *data, unsigned int w, unsigned int h)
91 | {
92 | assert(data != nullptr);
93 | assert(w > 0);
94 | assert(h > 0);
95 |
96 | std::fstream fh(file, std::fstream::out | std::fstream::binary);
97 |
98 | if (fh.bad()) {
99 | std::cerr << "savePPM() : open failed.\n";
100 | return;
101 | }
102 |
103 | fh << "P6\n";
104 | fh << w << "\n" << h << "\n" << 0xff << "\n";
105 |
106 | unsigned int pixel_count = w * h;
107 | for (unsigned int i = 0; (i < pixel_count) && fh.good(); ++i) {
108 | fh << static_cast<char>(data[i].red * 255);
109 | fh << static_cast<char>(data[i].green * 255);
110 | fh << static_cast<char>(data[i].blue * 255);
111 | }
112 |
113 | fh.flush();
114 |
115 | if (fh.bad()) {
116 | std::cerr << "savePPM() : writing data failed.\n";
117 | return;
118 | }
119 |
120 | fh.close();
121 | }
122 |
123 | test_params set_up_test(int argc, char **argv)
124 | {
125 | test_params params = {0, 0, nullptr, nullptr, nullptr};
126 |
127 | bool show_help = false;
128 | for (int i = 1; i < argc; i++) {
129 | char *current = argv[i];
130 | if (!strncmp(current, "--", 2)) {
131 | show_help = true;
132 | break;
133 | } else if (params.input_image == nullptr) {
134 | // Load input
135 | pixel *host_image = nullptr;
136 | if (!loadPPM(current, &host_image, &params.width, &params.height)) {
137 | exit(1);
138 | }
139 |
140 | size_t image_size = params.width * params.height * sizeof(pixel);
141 | cudaCheckError(cudaMalloc(&params.input_image, image_size));
142 | cudaCheckError(cudaMalloc(&params.output_image, image_size));
143 | cudaCheckError(cudaMemcpy(params.input_image, host_image, image_size,
144 | cudaMemcpyHostToDevice));
145 |
146 | } else if (params.output_file == nullptr) {
147 | // Save output filename
148 | params.output_file = current;
149 | } else {
150 | show_help = true;
151 | break;
152 | }
153 | }
154 |
155 | if (!params.output_file || !params.input_image) {
156 | show_help = true;
157 | }
158 |
159 | if (show_help) {
160 | std::cout << "Usage: " << argv[0] << " INPUT_FILE OUTPUT_FILE\n";
161 | exit(1);
162 | }
163 |
164 | return params;
165 | }
166 |
167 | void finish_test(const test_params &params)
168 | {
169 | std::unique_ptr<pixel[]> host_image(new pixel[params.width * params.height]);
170 |
171 | cudaCheckError(cudaMemcpy(host_image.get(), params.output_image,
172 | params.width * params.height * sizeof(pixel),
173 | cudaMemcpyDeviceToHost));
174 | if (params.input_image) {
175 | cudaCheckError(cudaFree(params.input_image));
176 | }
177 | if (params.output_image) {
178 | cudaCheckError(cudaFree(params.output_image));
179 | }
180 |
181 | savePPM(params.output_file, host_image.get(), params.width, params.height);
182 | }
183 |
184 | __global__ void unpack_image(image planar, const pixel *packed, int pixel_count)
185 | {
186 | int index = blockIdx.x * blockDim.x + threadIdx.x;
187 | if (index >= pixel_count) return;
188 |
189 | planar.red[index] = packed[index].red;
190 | planar.green[index] = packed[index].green;
191 | planar.blue[index] = packed[index].blue;
192 | }
193 |
194 | __global__ void pack_image(const image planar, pixel *packed, int pixel_count)
195 | {
196 | int index = blockIdx.x * blockDim.x + threadIdx.x;
197 | if (index >= pixel_count) return;
198 |
199 | packed[index].red = planar.red[index];
200 | packed[index].green = planar.green[index];
201 | packed[index].blue = planar.blue[index];
202 | }
203 |
204 | image malloc_image(int pixel_count)
205 | {
206 | image result;
207 | cudaCheckError(cudaMalloc(&result.red, pixel_count * sizeof(float)));
208 | cudaCheckError(cudaMalloc(&result.green, pixel_count * sizeof(float)));
209 | cudaCheckError(cudaMalloc(&result.blue, pixel_count * sizeof(float)));
210 |
211 | return result;
212 | }
213 |
214 | void free_image(const image &img)
215 | {
216 | cudaCheckError(cudaFree(img.red));
217 | cudaCheckError(cudaFree(img.green));
218 | cudaCheckError(cudaFree(img.blue));
219 | }
220 |
221 | const int BLOCK_SIZE = 128;
222 |
223 | test_params_planar set_up_test_planar(int argc, char **argv)
224 | {
225 | test_params params1 = set_up_test(argc, argv);
226 | test_params_planar params = {
227 | params1.width, params1.height, {}, {}, params1.output_file};
228 |
229 | int pixel_count = params.width * params.height;
230 | params.input_image = malloc_image(pixel_count);
231 | params.output_image = malloc_image(pixel_count);
232 |
233 | int n_blocks = (pixel_count + BLOCK_SIZE - 1) / BLOCK_SIZE;
234 | unpack_image<<<n_blocks, BLOCK_SIZE>>>(params.input_image,
235 | params1.input_image, pixel_count);
236 |
237 | cudaCheckError(cudaFree(params1.input_image));
238 | params1.input_image = nullptr;
239 |
240 | return params;
241 | }
242 |
243 | void finish_test_planar(const test_params_planar &params)
244 | {
245 | free_image(params.input_image);
246 |
247 | test_params params1 = {params.width, params.height, nullptr, nullptr,
248 | params.output_file};
249 |
250 | int pixel_count = params.width * params.height;
251 | cudaCheckError(
252 | cudaMalloc(&params1.output_image, pixel_count * sizeof(pixel)));
253 |
254 | int n_blocks = (pixel_count + BLOCK_SIZE - 1) / BLOCK_SIZE;
255 | pack_image<<<n_blocks, BLOCK_SIZE>>>(params.output_image,
256 | params1.output_image, pixel_count);
257 |
258 | free_image(params.output_image);
259 |
260 | finish_test(params1);
261 | }
262 |
263 | KernelTimer::KernelTimer()
264 | {
265 | cudaCheckError(cudaDeviceSynchronize());
266 | start = std::chrono::steady_clock::now();
267 | }
268 |
269 | KernelTimer::~KernelTimer()
270 | {
271 | cudaCheckError(cudaDeviceSynchronize());
272 | auto end = std::chrono::steady_clock::now();
273 | auto elapsed =
274 | std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
275 | .count();
276 | std::cout << "kernel ran in " << elapsed << " ms\n";
277 | }
278 |
-------------------------------------------------------------------------------- /Code files/utils.h: --------------------------------------------------------------------------------
1 | // Utility functions for example programs.
2 |
3 | #ifndef __UTILS_H
4 | #define __UTILS_H
5 |
6 | #include <chrono>
7 |
8 | // Error checking macro
9 | #define cudaCheckError(code) \
10 | { \
11 | if ((code) != cudaSuccess) { \
12 | fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \
13 | cudaGetErrorString(code)); \
14 | } \
15 | }
16 |
17 | /* A single pixel with floating-point channel values */
18 | struct pixel {
19 | float red;
20 | float green;
21 | float blue;
22 | float alpha;
23 | };
24 |
25 | /* An image with planar layout: separate buffers for each color channel */
26 | struct image {
27 | float *red;
28 | float *green;
29 | float *blue;
30 | };
31 |
32 | bool loadPPM(const char *file, pixel **data, unsigned int *w, unsigned int *h);
33 | void savePPM(const char *file, pixel *data, unsigned int w, unsigned int h);
34 |
35 | struct test_params {
36 | unsigned int width;
37 | unsigned int height;
38 | /* Device pointers to images */
39 | pixel *input_image;
40 | pixel *output_image;
41 | const char *output_file;
42 | };
43 |
44 | struct test_params_planar {
45 | unsigned int width;
46 | unsigned int height;
47 | /* Device pointers to images */
48 | image input_image;
49 | image output_image;
50 | const char *output_file;
51 | };
52 |
53 | test_params set_up_test(int argc, char **argv);
54 | void finish_test(const test_params &params);
55 | test_params_planar set_up_test_planar(int argc, char **argv);
56 | void finish_test_planar(const test_params_planar &params);
57 | void free_image(const image &img);
58 |
59 | class KernelTimer
60 | {
61 | public:
62 | KernelTimer();
63 | ~KernelTimer();
64 |
65 | private:
66 | std::chrono::time_point<std::chrono::steady_clock> start;
67 | };
68 |
69 | #endif // __UTILS_H
70 |
-------------------------------------------------------------------------------- /Code files/utils.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Code files/utils.o -------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Packt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including
without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning-CUDA-10-Programming 2 | Learning CUDA 10 Programming, published by Packt 3 | This is the code repository for [Learning CUDA 10 Programming]( https://www.packtpub.com/programming/learning-cuda-10-programming-video), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the video course from start to finish. 4 | ## About the Video Course 5 | Do you want to write GPU-accelerated applications, but don't know how to get started? With CUDA 10, you can easily add GPU processing to your C and C++ projects. CUDA 10 is the de-facto framework used to develop high-performance, GPU-accelerated applications. 6 | In this course, you will be introduced to CUDA programming through hands-on examples. CUDA provides a general-purpose programming model which gives you access to the tremendous computational power of modern GPUs, as well as powerful libraries for machine learning, image processing, linear algebra, and parallel algorithms. 7 | After working through this course, you will understand the fundamentals of CUDA programming and be able to start using it in your applications right away. 8 | 9 |

What You Will Learn

10 |
11 |
    12 |
  • Use CUDA to speed up your applications using machine learning, image processing, linear algebra, and more functions 13 |
  • Learn to debug CUDA programs and handle errors 14 |
  • Use optimization techniques to get the maximum performance from your CUDA programs 15 |
  • Master the fundamentals of concurrency and parallel algorithms on GPUs 16 |
  • Learn about the wide range of GPU-accelerated libraries included with CUDA 17 |
  • Learn the next steps you can take to continue building your CUDA skills
18 | 19 | ## Instructions and Navigation 20 | ### Assumed Knowledge 21 | If you want to learn how to use parallel and high-performance computing techniques to develop modern applications using GPUs and CUDA, then this course is for you. A good understanding of programming in modern C++ (C++17) is required in order to implement the concepts in this course. 22 | 23 | ### Technical Requirements 24 |
    25 | Minimum Hardware Requirements 26 |
  • OS: Windows, MacOS, or Linux 27 |
  • Processor: any 64-bit Intel or AMD processor 28 |
  • Memory: 2GB of RAM 29 |
  • Storage: 3GB of free space 30 |
31 | 32 |
    33 | Recommended Hardware Requirements 34 |
  • For an optimal experience with hands-on labs and other practical activities, we recommend the following configuration: 35 |
  • OS: Windows 10 version 1703 or higher: Home, Professional, Education and Enterprise (LTSC and S are not supported) 36 | 1.8 GHz or faster processor. Quad-core or better recommended. 37 |
  • Memory: 2GB; 8GB of RAM recommended (2.5 GB minimum if running on a Virtual Machine) 38 |
  • Storage: Minimum of 800 MB up to 210 GB of disk space depending on the features installed. 39 |
  • Video Card that supports a minimum display resolution of 720p (1280 by 720); Visual Studio will work best at a resolution of WXGA (1366 by 768) or higher.
40 | 41 | 42 |
    43 | System Requirements 44 |
  • OS: Windows, MacOS, or Linux 45 |
  • Processor: any 64-bit Intel or AMD processor 46 |
  • Memory: 8GB of RAM 47 |
  • Storage: 30GB of free space
48 | 49 |
    50 | Software Requirements 51 |
  • CUDA Toolkit Version 10.1 or later: https://developer.nvidia.com/cuda-downloads
52 | 53 | 54 | ## Related Products 55 | * [C++ Programming By Example [Video]](https://www.packtpub.com/application-development/c-programming-example-video) 56 | 57 | * [High-Performance Computing with Python 3.x [Video]](https://www.packtpub.com/application-development/high-performance-computing-python-3x-video?utm_source=github&utm_medium=repository&utm_campaign=9781789956252) 58 | 59 | * [Functional Programming in 7 Days [Video]](https://www.packtpub.com/application-development/functional-programming-7-days-video?utm_source=github&utm_medium=repository&utm_campaign=9781788990295) 60 | -------------------------------------------------------------------------------- /Section 1/1.1_JJ_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 1/1.1_JJ_MC.pptx -------------------------------------------------------------------------------- /Section 1/1.2_YM_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 1/1.2_YM_MC.pptx -------------------------------------------------------------------------------- /Section 1/1.3_YM_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 1/1.3_YM_MC.pptx -------------------------------------------------------------------------------- /Section 1/1.4_YM_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 1/1.4_YM_MC.pptx -------------------------------------------------------------------------------- /Section 1/1.5_YM_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 1/1.5_YM_MC.pptx -------------------------------------------------------------------------------- /Section 2/2.1_TK_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 2/2.1_TK_MC.pptx -------------------------------------------------------------------------------- /Section 2/2.2_TK_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 2/2.2_TK_MC.pptx -------------------------------------------------------------------------------- /Section 2/2.3_TK_MC.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-CUDA-10-Programming/3a8ff5091a2b20a046dc3f0da916236ab581ad95/Section 2/2.3_TK_MC.pptx -------------------------------------------------------------------------------- /Section 2/2.4_TK_MC.pptx: -------------------------------------------------------------------------------- 