├── .gitattributes ├── materials └── dli-fundamentals-of-accelerated-computing-with-cuda-c-c++-datasheet.pdf ├── README.md ├── cu └── 02-saxpy-solution.cu ├── cuda_cheatsheet.md ├── AC_CUDA_C.md ├── AC_CUDA_C.ipynb └── Streaming and Visual Profiling.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /materials/dli-fundamentals-of-accelerated-computing-with-cuda-c-c++-datasheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jkonvicka/Nvidia-CUDA-course/HEAD/materials/dli-fundamentals-of-accelerated-computing-with-cuda-c-c++-datasheet.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Nvidia-CUDA-course 2 | Fundamentals of Accelerated Computing with CUDA C/C++ 3 | 4 | [💡CUDA cheatsheet](./cuda_cheatsheet.md) 5 | 6 | ## NVIDIA DLI Certificate 7 | ✅ [ce7b1370c4484875811a1a47160a6ea6](https://courses.nvidia.com/certificates/ce7b1370c4484875811a1a47160a6ea6/) 8 | 9 | **Final assessment kernel execution time:** 0.495862 sec [max. 1.3 sec] 10 | 11 | ## Course 12 | 1. [Accelerating Applications with CUDA C/C++](AC_CUDA_C.ipynb) 13 | 2. [Unified Memory](Unified%20Memory.ipynb) 14 | 3. [Streaming and Visual Profiling](Streaming_and_Visual_Profiling.ipynb) 15 | 16 | ## Objectives 17 | By the time you complete this lab, you will be able to: 18 | 19 | * Write, compile, and run C/C++ programs that both call CPU functions and launch GPU kernels. 20 | * Control parallel thread hierarchy using execution configuration. 21 | * Refactor serial loops to execute their iterations in parallel on a GPU. 22 | * Allocate and free memory available to both CPUs and GPUs. 23 | * Handle errors generated by CUDA code. 24 | * Accelerate CPU-only applications. 
25 | 
--------------------------------------------------------------------------------
/cu/02-saxpy-solution.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | 
3 | #define N 2048 * 2048 // Number of elements in each vector
4 | 
5 | __global__ void saxpy(int * a, int * b, int * c)
6 | {
7 |     // Determine our unique global thread ID, so we know which element to process
8 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
9 |     int stride = blockDim.x * gridDim.x;
10 | 
11 |     for (int i = tid; i < N; i += stride)
12 |         c[i] = 2 * a[i] + b[i];
13 | }
14 | 
15 | int main()
16 | {
17 |     int *a, *b, *c;
18 | 
19 |     int size = N * sizeof (int); // The total number of bytes per vector
20 | 
21 |     int deviceId;
22 |     int numberOfSMs;
23 | 
24 |     cudaGetDevice(&deviceId);
25 |     cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
26 | 
27 |     // Allocate memory
28 |     cudaMallocManaged(&a, size);
29 |     cudaMallocManaged(&b, size);
30 |     cudaMallocManaged(&c, size);
31 | 
32 |     // Initialize memory
33 |     for( int i = 0; i < N; ++i )
34 |     {
35 |         a[i] = 2;
36 |         b[i] = 1;
37 |         c[i] = 0;
38 |     }
39 | 
40 |     cudaMemPrefetchAsync(a, size, deviceId);
41 |     cudaMemPrefetchAsync(b, size, deviceId);
42 |     cudaMemPrefetchAsync(c, size, deviceId);
43 | 
44 |     int threads_per_block = 256;
45 |     int number_of_blocks = numberOfSMs * 32;
46 | 
47 |     saxpy <<<number_of_blocks, threads_per_block>>>( a, b, c );
48 | 
49 |     cudaDeviceSynchronize(); // Wait for the GPU to finish
50 | 
51 |     // Print out the first and last 5 values of c for a quality check
52 |     for( int i = 0; i < 5; ++i )
53 |         printf("c[%d] = %d, ", i, c[i]);
54 |     printf ("\n");
55 |     for( int i = N-5; i < N; ++i )
56 |         printf("c[%d] = %d, ", i, c[i]);
57 |     printf ("\n");
58 | 
59 |     // Free all our allocated memory
60 |     cudaFree( a ); cudaFree( b ); cudaFree( c );
61 | }
62 | 
--------------------------------------------------------------------------------
/cuda_cheatsheet.md:
--------------------------------------------------------------------------------
1 | # CUDA cheatsheet
2 | 
3 | ### Kernel declaration
4 | ```cpp
5 | __global__ void GPUFunction()
6 | {
7 |     printf("This function is defined to run on the GPU.\n");
8 | }
9 | ```
10 | ### Launching a kernel
11 | ```cpp
12 | // <<<n_blocks, n_threads_per_block>>>
13 | GPUFunction<<<1, 1>>>();
14 | ```
15 | 
16 | ### CUDA synchronization
17 | ```cpp
18 | cudaDeviceSynchronize();
19 | ```
20 | 
21 | ### Mapping CUDA thread indices to data indices
22 | ```cpp
23 | int data_index = threadIdx.x + blockIdx.x * blockDim.x;
24 | ```
25 | 
26 | ### Computing the number of blocks needed for the data
27 | ```cpp
28 | int number_of_blocks = (N + threads_per_block - 1) / threads_per_block;
29 | ```
30 | 
31 | ### Managing unified (managed) memory
32 | ```cpp
33 | int N = 100;
34 | int *a;
35 | size_t size = N * sizeof(int);
36 | cudaMallocManaged(&a, size);
37 | cudaFree(a);
38 | ```
39 | 
40 | ### Grid-stride loop
41 | ```cpp
42 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
43 | int stride = gridDim.x * blockDim.x;
44 | 
45 | for (int i = idx; i < N; i += stride)
46 | {
47 |     a[i] = a[i]; // do work on a[i]
48 | }
49 | ```
50 | 
51 | ### Managed memory allocation error handling
52 | ```cpp
53 | cudaError_t err;
54 | err = cudaMallocManaged(&a, N); // Assume the existence of `a` and `N`.
55 | 
56 | if (err != cudaSuccess) // `cudaSuccess` is provided by CUDA.
57 | {
58 |     printf("Error: %s\n", cudaGetErrorString(err)); // `cudaGetErrorString` is provided by CUDA.
59 | } 60 | 61 | ``` 62 | 63 | ### Kernel running error handling 64 | ``` cpp 65 | someKernel<<<1, -1>>>(); // -1 is not a valid number of threads. 66 | 67 | cudaError_t err; 68 | err = cudaGetLastError(); // `cudaGetLastError` will return the error from above. 69 | if (err != cudaSuccess) 70 | { 71 | printf("Error: %s\n", cudaGetErrorString(err)); 72 | } 73 | ``` 74 | 75 | ### General CUDA error handling function 76 | ``` cpp 77 | inline cudaError_t checkCuda(cudaError_t result) 78 | { 79 | if (result != cudaSuccess) { 80 | fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); 81 | assert(result == cudaSuccess); 82 | } 83 | return result; 84 | } 85 | ``` -------------------------------------------------------------------------------- /AC_CUDA_C.md: -------------------------------------------------------------------------------- 1 |

# Accelerating Applications with CUDA C/C++
2 | 3 | ![CUDA](./images/CUDA_Logo.jpg) 4 | 5 | Accelerated computing is replacing CPU-only computing as best practice. The parade of breakthroughs driven by accelerated computing, the ever increasing demand for accelerated applications, programming conventions that ease writing them, and constant improvements in the hardware that supports them, are driving this inevitable transition. 6 | 7 | At the center of accelerated computing's success, both in terms of its impressive performance, and its ease of use, is the [CUDA](https://developer.nvidia.com/about-cuda) compute platform. CUDA provides a coding paradigm that extends languages like C, C++, Python, and Fortran, to be capable of running accelerated, massively parallelized code on the world's most performant parallel processors: NVIDIA GPUs. CUDA accelerates applications drastically with little effort, has an ecosystem of highly optimized libraries for [DNN](https://developer.nvidia.com/cudnn), [BLAS](https://developer.nvidia.com/cublas), [graph analytics](https://developer.nvidia.com/nvgraph), [FFT](https://developer.nvidia.com/cufft), and more, and also ships with powerful [command line and visual profilers](https://developer.nvidia.com/nsight-systems). 8 | 9 | CUDA supports many, if not most, of the [world's most performant applications](https://www.nvidia.com/en-us/data-center/gpu-accelerated-applications/catalog/?product_category_id=58,59,60,293,98,172,223,227,228,265,487,488,114,389,220,258,461&search=) in: [Computational Fluid Dynamics](https://www.nvidia.com/en-us/data-center/gpu-accelerated-applications/catalog/?product_category_id=10,12,16,17,19,51,53,71,87,121,124,156,157,195,202,203,204,312,339,340,395,407,448,485,517,528,529,541,245,216,104,462,513,250,492,420,429,490,10,12,16,17,19,51,53,71,87,121,124,156,157,195,202,203,204,312,339,340,395,407,448,485,517,528,529,541,245,216,104,462,513,250,492,420,429,490,10,12,16,17,19,51,53,71,87,121,124,156,157,195,202,203,204,312,339,340,395,407,448,485,517,528,529,541,245,216,104,462,513,250,492,420,429,490&search=), [Molecular Dynamics](https://www.nvidia.com/en-us/data-center/gpu-accelerated-applications/catalog/?product_category_id=8,57,92,123,211,213,237,272,274,282,283,307,325,337,344,345,351,362,365,380,396,398,400,435,507,508,519,8,57,92,123,211,213,237,272,274,282,283,307,325,337,344,345,351,362,365,380,396,398,400,435,507,508,519,8,57,92,123,211,213,237,272,274,282,283,307,325,337,344,345,351,362,365,380,396,398,400,435,507,508,519,8,57,92,123,211,213,237,272,274,282,283,307,325,337,344,345,351,362,365,380,396,398,400,435,507,508,519&search=), [Quantum Chemistry](https://www.nvidia.com/en-us/data-center/gpu-accelerated-applications/catalog/?product_category_id=8,57,92,123,211,213,237,272,274,282,283,307,325,337,344,345,351,362,365,380,396,398,400,435,507,508,519,8,57,92,123,211,213,237,272,274,282,283,307,325,337,344,345,351,362,365,380,396,398,400,435,507,508,519&search=), [Physics](https://www.nvidia.com/en-us/data-center/gpu-accelerated-applications/catalog/?product_category_id=6,24,116,118,119,135,229,231,372,373,392,393,489,493,494,495,496,497,498,67,170,216,281,6,24,116,118,119,135,229,231,372,373,392,393,489,493,494,495,496,497,498,67,170,216,281,6,24,116,118,119,135,229,231,372,373,392,393,489,493,494,495,496,497,498,67,170,216,281,6,24,116,118,119,135,229,231,372,373,392,393,489,493,494,495,496,497,498,67,170,216,281,6,24,116,118,119,135,229,231,372,373,392,393,489,493,494,495,496,497,498,67,170,216,281&search=) and HPC. 
10 | 11 | Learning CUDA will enable you to accelerate your own applications. Accelerated applications perform much faster than their CPU-only counterparts, and make possible computations that would be otherwise prohibited given the limited performance of CPU-only applications. In this lab you will receive an introduction to programming accelerated applications with CUDA C/C++, enough to be able to begin work accelerating your own CPU-only applications for performance gains, and for moving into novel computational territory. 12 | 13 | --- 14 | ## Prerequisites 15 | 16 | To get the most out of this lab you should already be able to: 17 | 18 | - Declare variables, write loops, and use if / else statements in C. 19 | - Define and invoke functions in C. 20 | - Allocate arrays in C. 21 | 22 | No previous CUDA knowledge is required. 23 | 24 | --- 25 | ## Objectives 26 | 27 | By the time you complete this lab, you will be able to: 28 | 29 | - Write, compile, and run C/C++ programs that both call CPU functions and **launch** GPU **kernels**. 30 | - Control parallel **thread hierarchy** using **execution configuration**. 31 | - Refactor serial loops to execute their iterations in parallel on a GPU. 32 | - Allocate and free memory available to both CPUs and GPUs. 33 | - Handle errors generated by CUDA code. 34 | - Accelerate CPU-only applications. 35 | 36 | --- 37 | ## Accelerated Systems 38 | 39 | *Accelerated systems*, also referred to as *heterogeneous systems*, are those composed of both CPUs and GPUs. Accelerated systems run CPU programs which in turn, launch functions that will benefit from the massive parallelism provided by GPUs. This lab environment is an accelerated system which includes an NVIDIA GPU. Information about this GPU can be queried with the `nvidia-smi` (*Systems Management Interface*) command line command. Issue the `nvidia-smi` command now, by `CTRL` + `ENTER` on the code execution cell below. You will find these cells throughout this lab any time you need to execute code. The output from running the command will be printed just below the code execution cell after the code runs. After running the code execution block immediately below, take care to find and note the name of the GPU in the output. 40 | 41 | 42 | ```python 43 | !nvidia-smi 44 | ``` 45 | 46 | Wed Jun 22 07:24:58 2022 47 | +-----------------------------------------------------------------------------+ 48 | | NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 | 49 | |-------------------------------+----------------------+----------------------+ 50 | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | 51 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | 52 | | | | MIG M. | 53 | |===============================+======================+======================| 54 | | 0 Tesla T4 On | 00000000:00:1E.0 Off | 0 | 55 | | N/A 30C P8 9W / 70W | 0MiB / 15109MiB | 0% Default | 56 | | | | N/A | 57 | +-------------------------------+----------------------+----------------------+ 58 | 59 | +-----------------------------------------------------------------------------+ 60 | | Processes: | 61 | | GPU GI CI PID Type Process name GPU Memory | 62 | | ID ID Usage | 63 | |=============================================================================| 64 | | No running processes found | 65 | +-----------------------------------------------------------------------------+ 66 | 67 | 68 | --- 69 | ## GPU-accelerated Vs. 
CPU-only Applications 70 | 71 | The following slides present upcoming material visually, at a high level. Click through the slides before moving on to more detailed coverage of their topics in following sections. 72 | 73 | 74 | 75 | 76 | ```python 77 | %%HTML 78 | 79 |
80 | ``` 81 | 82 | 83 | 84 |
85 | 86 | 87 | 88 | --- 89 | ## Writing Application Code for the GPU 90 | 91 | CUDA provides extensions for many common programming languages, in the case of this lab, C/C++. These language extensions easily allow developers to run functions in their source code on a GPU. 92 | 93 | Below is a `.cu` file (`.cu` is the file extension for CUDA-accelerated programs). It contains two functions, the first which will run on the CPU, the second which will run on the GPU. Spend a little time identifying the differences between the functions, both in terms of how they are defined, and how they are invoked. 94 | 95 | ```cpp 96 | void CPUFunction() 97 | { 98 | printf("This function is defined to run on the CPU.\n"); 99 | } 100 | 101 | __global__ void GPUFunction() 102 | { 103 | printf("This function is defined to run on the GPU.\n"); 104 | } 105 | 106 | int main() 107 | { 108 | CPUFunction(); 109 | 110 | GPUFunction<<<1, 1>>>(); 111 | cudaDeviceSynchronize(); 112 | } 113 | ``` 114 | 115 | Here are some important lines of code to highlight, as well as some other common terms used in accelerated computing: 116 | 117 | `__global__ void GPUFunction()` 118 | - The `__global__` keyword indicates that the following function will run on the GPU, and can be invoked **globally**, which in this context means either by the CPU, or, by the GPU. 119 | - Often, code executed on the CPU is referred to as **host** code, and code running on the GPU is referred to as **device** code. 120 | - Notice the return type `void`. It is required that functions defined with the `__global__` keyword return type `void`. 121 | 122 | `GPUFunction<<<1, 1>>>();` 123 | - Typically, when calling a function to run on the GPU, we call this function a **kernel**, which is **launched**. 124 | - When launching a kernel, we must provide an **execution configuration**, which is done by using the `<<< ... >>>` syntax just prior to passing the kernel any expected arguments. 125 | - At a high level, execution configuration allows programmers to specify the **thread hierarchy** for a kernel launch, which defines the number of thread groupings (called **blocks**), as well as how many **threads** to execute in each block. Execution configuration will be explored at great length later in the lab, but for the time being, notice the kernel is launching with `1` block of threads (the first execution configuration argument) which contains `1` thread (the second configuration argument). 126 | 127 | `cudaDeviceSynchronize();` 128 | - Unlike much C/C++ code, launching kernels is **asynchronous**: the CPU code will continue to execute *without waiting for the kernel launch to complete*. 129 | - A call to `cudaDeviceSynchronize`, a function provided by the CUDA runtime, will cause the host (CPU) code to wait until the device (GPU) code completes, and only then resume execution on the CPU. 130 | 131 | --- 132 | ### Exercise: Write a Hello GPU Kernel 133 | 134 | The [`01-hello-gpu.cu`](../edit/01-hello/01-hello-gpu.cu) (*<---- click on the link of the source file to open it in another tab for editing*) contains a program that is already working. It contains two functions, both with print "Hello from the CPU" messages. Your goal is to refactor the `helloGPU` function in the source file so that it actually runs on the GPU, and prints a message indicating that it does. 135 | 136 | - Refactor the application, before compiling and running it with the `nvcc` command just below (remember, you can execute the contents of the code execution cell by `CTRL + ENTER` it). 
The comments in [`01-hello-gpu.cu`](../edit/01-hello/01-hello-gpu.cu) will assist your work. If you get stuck, or want to check your work, refer to the [solution](../edit/01-hello/solutions/01-hello-gpu-solution.cu). Don't forget to save your changes to the file before compiling and running with the command below. 137 | 138 | 139 | ```python 140 | !nvcc -arch=sm_70 -o hello-gpu 01-hello/01-hello-gpu.cu -run 141 | ``` 142 | 143 | Hello from the CPU. 144 | Hello from the GPU. 145 | 146 | 147 | After successfully refactoring [`01-hello-gpu.cu`](../edit/01-hello/01-hello-gpu.cu), make the following modifications, attempting to compile and run it after each change (by `CTRL + ENTER` clicking on the code execution cell above). When given errors, take the time to read them carefully: familiarity with them will serve you greatly when you begin writing your own accelerated code. 148 | 149 | - Remove the keyword `__global__` from your kernel definition. Take care to note the line number in the error: what do you think is meant in the error by "configured"? Replace `__global__` when finished. 150 | - Remove the execution configuration: does your understanding of "configured" still make sense? Replace the execution configuration when finished. 151 | - Remove the call to `cudaDeviceSynchronize`. Before compiling and running the code, take a guess at what will happen, recalling that kernels are launched asynchronously, and that `cudaDeviceSynchronize` is what makes host execution in wait for kernel execution to complete before proceeding. Replace the call to `cudaDeviceSynchronize` when finished. 152 | - Refactor `01-hello-gpu.cu` so that `Hello from the GPU` prints **before** `Hello from the CPU`. 153 | - Refactor `01-hello-gpu.cu` so that `Hello from the GPU` prints **twice**, once **before** `Hello from the CPU`, and once **after**. 154 | 155 | --- 156 | ### Compiling and Running Accelerated CUDA Code 157 | 158 | This section contains details about the `nvcc` command you issued above to compile and run your `.cu` program. 159 | 160 | The CUDA platform ships with the [**NVIDIA CUDA Compiler**](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html) `nvcc`, which can compile CUDA accelerated applications, both the host, and the device code they contain. For the purposes of this lab, `nvcc` discussion will be pragmatically scoped to suit our immediate needs. After completing the lab anyone interested in a deeper dive into `nvcc` can start with [the documentation](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html). 161 | 162 | `nvcc` will be very familiar to experienced `gcc` users. Compiling, for example, a `some-CUDA.cu` file, is simply: 163 | 164 | `nvcc -arch=sm_70 -o out some-CUDA.cu -run` 165 | - `nvcc` is the command line command for using the `nvcc` compiler. 166 | - `some-CUDA.cu` is passed as the file to compile. 167 | - The `o` flag is used to specify the output file for the compiled program. 168 | - The `arch` flag indicates for which **architecture** the files must be compiled. 
For the present case `sm_70` will serve to compile specifically for the GPU this lab is running on, but for those interested in a deeper dive, please refer to the docs about the [`arch` flag](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-steering-gpu-code-generation), [virtual architecture features](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#gpu-feature-list) and [GPU features](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#gpu-feature-list). 169 | - As a matter of convenience, providing the `run` flag will execute the successfully compiled binary. 170 | 171 | --- 172 | ## CUDA Thread Hierarchy 173 | 174 | The following slides present upcoming material visually, at a high level. Click through the slides before moving on to more detailed coverage of their topics in following sections. 175 | 176 | 177 | ```python 178 | %%HTML 179 | 180 |
181 | ``` 182 | 183 | 184 | 185 |
186 | 187 | 188 | 189 | --- 190 | ## Launching Parallel Kernels 191 | 192 | The execution configuration allows programmers to specify details about launching the kernel to run in parallel on multiple GPU **threads**. More precisely, the execution configuration allows programmers to specify how many groups of threads - called **thread blocks**, or just **blocks** - and how many threads they would like each thread block to contain. The syntax for this is: 193 | 194 | `<<< NUMBER_OF_BLOCKS, NUMBER_OF_THREADS_PER_BLOCK>>>` 195 | 196 | ** The kernel code is executed by every thread in every thread block configured when the kernel is launched**. 197 | 198 | Thus, under the assumption that a kernel called `someKernel` has been defined, the following are true: 199 | - `someKernel<<<1, 1>>>()` is configured to run in a single thread block which has a single thread and will therefore run only once. 200 | - `someKernel<<<1, 10>>>()` is configured to run in a single thread block which has 10 threads and will therefore run 10 times. 201 | - `someKernel<<<10, 1>>>()` is configured to run in 10 thread blocks which each have a single thread and will therefore run 10 times. 202 | - `someKernel<<<10, 10>>>()` is configured to run in 10 thread blocks which each have 10 threads and will therefore run 100 times. 203 | 204 | --- 205 | ### Exercise: Launch Parallel Kernels 206 | 207 | [`01-first-parallel.cu`](../edit/02-first-parallel/01-first-parallel.cu) currently makes a very basic function call that prints the message `This should be running in parallel.` Follow the steps below to refactor it first to run on the GPU, and then, in parallel, both in a single, and then, in multiple thread blocks. Refer to [the solution](../edit/02-first-parallel/solutions/01-first-parallel-solution.cu) if you get stuck. 208 | 209 | - Refactor the `firstParallel` function to launch as a CUDA kernel on the GPU. You should still be able to see the output of the function after compiling and running `01-first-parallel.cu` with the `nvcc` command just below. 210 | - Refactor the `firstParallel` kernel to execute in parallel on 5 threads, all executing in a single thread block. You should see the output message printed 5 times after compiling and running the code. 211 | - Refactor the `firstParallel` kernel again, this time to execute in parallel inside 5 thread blocks, each containing 5 threads. You should see the output message printed 25 times now after compiling and running. 212 | 213 | 214 | ```python 215 | !nvcc -arch=sm_70 -o first-parallel 02-first-parallel/01-first-parallel.cu -run 216 | ``` 217 | 218 | [1] This should be running in parallel. 219 | [1] This should be running in parallel. 220 | [1] This should be running in parallel. 221 | [0] This should be running in parallel. 222 | [0] This should be running in parallel. 223 | [0] This should be running in parallel. 224 | 225 | 226 | --- 227 | 228 | ## CUDA-Provided Thread Hierarchy Variables 229 | 230 | The following slides present upcoming material visually, at a high level. Click through the slides before moving on to more detailed coverage of their topics in following sections. 231 | 232 | 233 | ```python 234 | %%HTML 235 | 236 |
237 | ``` 238 | 239 | 240 | 241 |
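To make the execution configuration rules above concrete, here is a minimal sketch (the kernel name `whoAmI` is made up for illustration) that launches 2 blocks of 3 threads each; the kernel body runs once per thread, so six lines are printed. The built-in variables it prints are covered in the next section.

```cpp
#include <stdio.h>

__global__ void whoAmI()
{
    // Every thread in every block of the launch runs this body once.
    printf("Hello from block %d, thread %d\n", blockIdx.x, threadIdx.x);
}

int main()
{
    whoAmI<<<2, 3>>>();      // 2 blocks x 3 threads = 6 executions of the kernel body
    cudaDeviceSynchronize(); // Wait for the GPU before the program exits
}
```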
242 | 243 | 244 | 245 | --- 246 | ## Thread and Block Indices 247 | 248 | Each thread is given an index within its thread block, starting at `0`. Additionally, each block is given an index, starting at `0`. Just as threads are grouped into thread blocks, blocks are grouped into a **grid**, which is the highest entity in the CUDA thread hierarchy. In summary, CUDA kernels are executed in a grid of 1 or more blocks, with each block containing the same number of 1 or more threads. 249 | 250 | CUDA kernels have access to special variables identifying both the index of the thread (within the block) that is executing the kernel, and, the index of the block (within the grid) that the thread is within. These variables are `threadIdx.x` and `blockIdx.x` respectively. 251 | 252 | --- 253 | ### Exercise: Use Specific Thread and Block Indices 254 | 255 | Currently the [`01-thread-and-block-idx.cu`](../edit/03-indices/01-thread-and-block-idx.cu) file contains a working kernel that is printing a failure message. Open the file to learn how to update the execution configuration so that the success message will print. After refactoring, compile and run the code with the code execution cell below to confirm your work. Refer to [the solution](../edit/03-indices/solutions/01-thread-and-block-idx-solution.cu) if you get stuck. 256 | 257 | 258 | ```python 259 | !nvcc -arch=sm_70 -o thread-and-block-idx 03-indices/01-thread-and-block-idx.cu -run 260 | ``` 261 | 262 | Success! 263 | 264 | 265 | --- 266 | ## Accelerating For Loops 267 | 268 | For loops in CPU-only applications are ripe for acceleration: rather than run each iteration of the loop serially, each iteration of the loop can be run in parallel in its own thread. Consider the following for loop, and notice, though it is obvious, that it controls how many times the loop will execute, as well as defining what will happen for each iteration of the loop: 269 | 270 | ```cpp 271 | int N = 2<<20; 272 | for (int i = 0; i < N; ++i) 273 | { 274 | printf("%d\n", i); 275 | } 276 | ``` 277 | 278 | In order to parallelize this loop, 2 steps must be taken: 279 | 280 | - A kernel must be written to do the work of a **single iteration of the loop**. 281 | - Because the kernel will be agnostic of other running kernels, the execution configuration must be such that the kernel executes the correct number of times, for example, the number of times the loop would have iterated. 282 | 283 | --- 284 | ### Exercise: Accelerating a For Loop with a Single Block of Threads 285 | 286 | Currently, the `loop` function inside [`01-single-block-loop.cu`](../edit/04-loops/01-single-block-loop.cu), runs a for loop that will serially print the numbers `0` through `9`. Refactor the `loop` function to be a CUDA kernel which will launch to execute `N` iterations in parallel. After successfully refactoring, the numbers `0` through `9` should still be printed. Refer to [the solution](../edit/04-loops/solutions/01-single-block-loop-solution.cu) if you get stuck. 
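If you would like to check your mental model before editing the file, a minimal sketch of the single-block pattern is shown below; the names in the actual exercise file and solution may differ.

```cpp
#include <stdio.h>

#define N 10

__global__ void loop()
{
    // One thread per former loop iteration; threadIdx.x plays the role of `i`.
    printf("This is iteration number %d\n", threadIdx.x);
}

int main()
{
    loop<<<1, N>>>();        // A single block containing N threads
    cudaDeviceSynchronize(); // Wait for the kernel to finish before exiting
}
```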
287 | 288 | 289 | ```python 290 | !nvcc -arch=sm_70 -o single-block-loop 04-loops/01-single-block-loop.cu -run 291 | ``` 292 | 293 | This is iteration number 0 294 | This is iteration number 1 295 | This is iteration number 2 296 | This is iteration number 3 297 | This is iteration number 4 298 | This is iteration number 5 299 | This is iteration number 6 300 | This is iteration number 7 301 | This is iteration number 8 302 | This is iteration number 9 303 | 304 | 305 | --- 306 | ## Coordinating Parallel Threads 307 | 308 | The following slides present upcoming material visually, at a high level. Click through the slides before moving on to more detailed coverage of their topics in following sections. 309 | 310 | 311 | ```python 312 | %%HTML 313 | 314 |
315 | ``` 316 | 317 | 318 | 319 |
320 | 321 | 322 | 323 | --- 324 | ## Using Block Dimensions for More Parallelization 325 | 326 | There is a limit to the number of threads that can exist in a thread block: 1024 to be precise. In order to increase the amount of parallelism in accelerated applications, we must be able to coordinate among multiple thread blocks. 327 | 328 | CUDA Kernels have access to a special variable that gives the number of threads in a block: `blockDim.x`. Using this variable, in conjunction with `blockIdx.x` and `threadIdx.x`, increased parallelization can be accomplished by organizing parallel execution across multiple blocks of multiple threads with the idiomatic expression `threadIdx.x + blockIdx.x * blockDim.x`. Here is a detailed example. 329 | 330 | The execution configuration `<<<10, 10>>>` would launch a grid with a total of 100 threads, contained in 10 blocks of 10 threads. We would therefore hope for each thread to have the ability to calculate some index unique to itself between `0` and `99`. 331 | 332 | - If block `blockIdx.x` equals `0`, then `blockIdx.x * blockDim.x` is `0`. Adding to `0` the possible `threadIdx.x` values `0` through `9`, then we can generate the indices `0` through `9` within the 100 thread grid. 333 | - If block `blockIdx.x` equals `1`, then `blockIdx.x * blockDim.x` is `10`. Adding to `10` the possible `threadIdx.x` values `0` through `9`, then we can generate the indices `10` through `19` within the 100 thread grid. 334 | - If block `blockIdx.x` equals `5`, then `blockIdx.x * blockDim.x` is `50`. Adding to `50` the possible `threadIdx.x` values `0` through `9`, then we can generate the indices `50` through `59` within the 100 thread grid. 335 | - If block `blockIdx.x` equals `9`, then `blockIdx.x * blockDim.x` is `90`. Adding to `90` the possible `threadIdx.x` values `0` through `9`, then we can generate the indices `90` through `99` within the 100 thread grid. 336 | 337 | --- 338 | ### Exercise: Accelerating a For Loop with Multiple Blocks of Threads 339 | 340 | Currently, the `loop` function inside [`02-multi-block-loop.cu`](../edit/04-loops/02-multi-block-loop.cu) runs a for loop that will serially print the numbers `0` through `9`. Refactor the `loop` function to be a CUDA kernel which will launch to execute `N` iterations in parallel. After successfully refactoring, the numbers `0` through `9` should still be printed. For this exercise, as an additional constraint, use an execution configuration that launches *at least 2 blocks of threads.* Refer to [the solution](../edit/04-loops/solutions/02-multi-block-loop-solution.cu) if you get stuck. 
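As a reference point, a minimal sketch of the multi-block pattern might look like the following, here using 2 blocks of 5 threads so that exactly `N` threads are launched; the actual exercise file and solution may use different names or configurations.

```cpp
#include <stdio.h>

#define N 10

__global__ void loop()
{
    // Combine the block and thread indices into a unique index across the whole grid.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    printf("Index: %d\n", i);
}

int main()
{
    loop<<<2, 5>>>();        // 2 blocks x 5 threads = exactly N threads
    cudaDeviceSynchronize();
}
```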
341 | 342 | 343 | ```python 344 | !nvcc -arch=sm_70 -o multi-block-loop 04-loops/02-multi-block-loop.cu -run 345 | ``` 346 | 347 | Index: 5 348 | Index: 6 349 | Index: 7 350 | Index: 8 351 | Index: 9 352 | Index: 0 353 | Index: 1 354 | Index: 2 355 | Index: 3 356 | Index: 4 357 | 358 | 359 | --- 360 | ## Allocating Memory to be accessed on the GPU and the CPU 361 | 362 | More recent versions of CUDA (version 6 and later) have made it easy to allocate memory that is available to both the CPU host and any number of GPU devices, and while there are many [intermediate and advanced techniques](http://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#memory-optimizations) for memory management that will support the most optimal performance in accelerated applications, the most basic CUDA memory management technique we will now cover supports fantastic performance gains over CPU-only applications with almost no developer overhead. 363 | 364 | To allocate and free memory, and obtain a pointer that can be referenced in both host and device code, replace calls to `malloc` and `free` with `cudaMallocManaged` and `cudaFree` as in the following example: 365 | 366 | ```cpp 367 | // CPU-only 368 | 369 | int N = 2<<20; 370 | size_t size = N * sizeof(int); 371 | 372 | int *a; 373 | a = (int *)malloc(size); 374 | 375 | // Use `a` in CPU-only program. 376 | 377 | free(a); 378 | ``` 379 | 380 | ```cpp 381 | // Accelerated 382 | 383 | int N = 2<<20; 384 | size_t size = N * sizeof(int); 385 | 386 | int *a; 387 | // Note the address of `a` is passed as first argument. 388 | cudaMallocManaged(&a, size); 389 | 390 | // Use `a` on the CPU and/or on any GPU in the accelerated system. 391 | 392 | cudaFree(a); 393 | ``` 394 | 395 | --- 396 | ### Exercise: Array Manipulation on both the Host and Device 397 | 398 | The [`01-double-elements.cu`](../edit/05-allocate/01-double-elements.cu) program allocates an array, initializes it with integer values on the host, attempts to double each of these values in parallel on the GPU, and then confirms whether or not the doubling operations were successful, on the host. Currently the program will not work: it is attempting to interact on both the host and the device with an array at pointer `a`, but has only allocated the array (using `malloc`) to be accessible on the host. Refactor the application to meet the following conditions, referring to [the solution](../edit/05-allocate/solutions/01-double-elements-solution.cu) if you get stuck: 399 | 400 | - `a` should be available to both host and device code. 401 | - The memory at `a` should be correctly freed. 402 | 403 | 404 | ```python 405 | !nvcc -arch=sm_70 -o double-elements 05-allocate/01-double-elements.cu -run 406 | ``` 407 | 408 | All elements were doubled? TRUE 409 | 410 | 411 | ## Grid Size Work Amount Mismatch 412 | 413 | The following slides present upcoming material visually, at a high level. Click through the slides before moving on to more detailed coverage of their topics in following sections. 414 | 415 | 416 | ```python 417 | %%HTML 418 | 419 |
420 | ``` 421 | 422 | 423 | 424 |
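Before moving on, here is a small end-to-end sketch that combines the managed-memory calls from the *Allocating Memory* section above with a kernel launch; the names and sizes are chosen for illustration and are not taken from the exercise files.

```cpp
#include <stdio.h>

__global__ void doubleElements(int *a)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    a[i] *= 2; // Exactly as many threads as elements are launched below
}

int main()
{
    int N = 1024;
    int *a;
    size_t size = N * sizeof(int);

    cudaMallocManaged(&a, size);          // One pointer, usable in host and device code
    for (int i = 0; i < N; ++i) a[i] = i; // Initialize on the host

    doubleElements<<<4, 256>>>(a);        // 4 blocks x 256 threads = exactly N threads
    cudaDeviceSynchronize();

    printf("a[0] = %d, a[%d] = %d\n", a[0], N - 1, a[N - 1]); // Verify on the host
    cudaFree(a);
}
```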
425 | 
426 | 
427 | 
428 | ---
429 | ## Handling Block Configuration Mismatches to Number of Needed Threads
430 | 
431 | It may be the case that an execution configuration cannot be expressed that will create the exact number of threads needed for parallelizing a loop.
432 | 
433 | A common example has to do with the desire to choose optimal block sizes. For example, due to GPU hardware traits, blocks that contain a number of threads that are a multiple of 32 are often desirable for performance benefits. Assuming that we wanted to launch blocks each containing 256 threads (a multiple of 32), and needed to run 1000 parallel tasks (a trivially small number for ease of explanation), then there is no number of blocks that would produce an exact total of 1000 threads in the grid, since no whole number of 256-thread blocks totals exactly 1000 threads.
434 | 
435 | This scenario can be easily addressed in the following way:
436 | 
437 | - Write an execution configuration that creates **more** threads than necessary to perform the allotted work.
438 | - Pass a value as an argument into the kernel (`N`) that represents the total size of the data set to be processed, or the total threads that are needed to complete the work.
439 | - After calculating the thread's index within the grid (using `tid+bid*bdim`), check that this index does not exceed `N`, and only perform the pertinent work of the kernel if it does not.
440 | 
441 | Here is an example of an idiomatic way to write an execution configuration when both `N` and the number of threads in a block are known, and an exact match between the number of threads in the grid and `N` cannot be guaranteed. It ensures that there are always at least as many threads as needed for `N`, and only 1 additional block's worth of threads extra, at most:
442 | 
443 | ```cpp
444 | // Assume `N` is known
445 | int N = 100000;
446 | 
447 | // Assume we have a desire to set `threads_per_block` exactly to `256`
448 | size_t threads_per_block = 256;
449 | 
450 | // Ensure there are at least `N` threads in the grid, but only 1 block's worth extra
451 | size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block;
452 | 
453 | some_kernel<<<number_of_blocks, threads_per_block>>>(N);
454 | ```
455 | 
456 | Because the execution configuration above results in more threads in the grid than `N`, care will need to be taken inside of the `some_kernel` definition so that `some_kernel` does not attempt to access out of range data elements, when being executed by one of the "extra" threads:
457 | 
458 | ```cpp
459 | __global__ void some_kernel(int N)
460 | {
461 |     int idx = threadIdx.x + blockIdx.x * blockDim.x;
462 | 
463 |     if (idx < N) // Check to make sure `idx` maps to some value within `N`
464 |     {
465 |         // Only do work if it does
466 |     }
467 | }
468 | ```
469 | 
470 | ---
471 | ### Exercise: Accelerating a For Loop with a Mismatched Execution Configuration
472 | 
473 | The program in [`02-mismatched-config-loop.cu`](../edit/05-allocate/02-mismatched-config-loop.cu) allocates memory, using `cudaMallocManaged`, for a 1000-element array of integers, and then seeks to initialize all the values of the array in parallel using a CUDA kernel. This program assumes that both `N` and the number of `threads_per_block` are known. 
Your task is to complete the following two objectives, refer to [the solution](../edit/05-allocate/solutions/02-mismatched-config-loop-solution.cu) if you get stuck: 474 | 475 | - Assign a value to `number_of_blocks` that will make sure there are at least as many threads as there are elements in `a` to work on. 476 | - Update the `initializeElementsTo` kernel to make sure that it does not attempt to work on data elements that are out of range. 477 | 478 | 479 | ```python 480 | !nvcc -arch=sm_70 -o mismatched-config-loop 05-allocate/02-mismatched-config-loop.cu -run 481 | ``` 482 | 483 | SUCCESS! 484 | 485 | 486 | --- 487 | ## Grid-Stride Loops 488 | 489 | The following slides present upcoming material visually, at a high level. Click through the slides before moving on to more detailed coverage of their topics in following sections. 490 | 491 | 492 | ```python 493 | %%HTML 494 | 495 |
496 | ``` 497 | 498 | 499 | 500 |
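Before turning to grid-stride loops, the pieces of the mismatched-configuration pattern above can be seen together in one sketch. The kernel name `initializeElementsTo` comes from the exercise description, but its exact signature and the initial value used here are assumptions; the real file may differ.

```cpp
__global__ void initializeElementsTo(int initialValue, int *a, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)               // The "extra" threads in the last block simply do nothing
        a[i] = initialValue;
}

int main()
{
    int N = 1000;
    int *a;
    cudaMallocManaged(&a, N * sizeof(int));

    size_t threads_per_block = 256;
    // Round up so the grid has at least N threads (here: 4 blocks = 1024 threads)
    size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block;

    initializeElementsTo<<<number_of_blocks, threads_per_block>>>(3, a, N);
    cudaDeviceSynchronize();

    cudaFree(a);
}
```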
501 | 502 | 503 | 504 | --- 505 | ## Data Sets Larger Than the Grid 506 | 507 | Either by choice, often to create the most performant execution configuration, or out of necessity, the number of threads in a grid may be smaller than the size of a data set. Consider an array with 1000 elements, and a grid with 250 threads (using trivial sizes here for ease of explanation). Here, each thread in the grid will need to be used 4 times. One common method to do this is to use a **grid-stride loop** within the kernel. 508 | 509 | In a grid-stride loop, each thread will calculate its unique index within the grid using `tid+bid*bdim`, perform its operation on the element at that index within the array, and then, add to its index the number of threads in the grid and repeat, until it is out of range of the array. For example, for a 500 element array and a 250 thread grid, the thread with index 20 in the grid would: 510 | 511 | - Perform its operation on element 20 of the 500 element array 512 | - Increment its index by 250, the size of the grid, resulting in 270 513 | - Perform its operation on element 270 of the 500 element array 514 | - Increment its index by 250, the size of the grid, resulting in 520 515 | - Because 520 is now out of range for the array, the thread will stop its work 516 | 517 | CUDA provides a special variable giving the number of blocks in a grid, `gridDim.x`. Calculating the total number of threads in a grid then is simply the number of blocks in a grid multiplied by the number of threads in each block, `gridDim.x * blockDim.x`. With this in mind, here is a verbose example of a grid-stride loop within a kernel: 518 | 519 | ```cpp 520 | __global__ void kernel(int *a, int N) 521 | { 522 | int indexWithinTheGrid = threadIdx.x + blockIdx.x * blockDim.x; 523 | int gridStride = gridDim.x * blockDim.x; 524 | 525 | for (int i = indexWithinTheGrid; i < N; i += gridStride) 526 | { 527 | // do work on a[i]; 528 | } 529 | } 530 | ``` 531 | 532 | --- 533 | ### Exercise: Use a Grid-Stride Loop to Manipulate an Array Larger than the Grid 534 | 535 | Refactor [`03-grid-stride-double.cu`](../edit/05-allocate/03-grid-stride-double.cu) to use a grid-stride loop in the `doubleElements` kernel, in order that the grid, which is smaller than `N`, can reuse threads to cover every element in the array. The program will print whether or not every element in the array has been doubled, currently the program accurately prints `FALSE`. Refer to [the solution](../edit/05-allocate/solutions/03-grid-stride-double-solution.cu) if you get stuck. 536 | 537 | 538 | ```python 539 | !nvcc -arch=sm_70 -o grid-stride-double 05-allocate/03-grid-stride-double.cu -run 540 | ``` 541 | 542 | All elements were doubled? TRUE 543 | 544 | 545 | --- 546 | ## Error Handling 547 | 548 | As in any application, error handling in accelerated CUDA code is essential. Many, if not most CUDA functions (see, for example, the [memory management functions](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY)) return a value of type `cudaError_t`, which can be used to check whether or not an error occurred while calling the function. Here is an example where error handling is performed for a call to `cudaMallocManaged`: 549 | 550 | ```cpp 551 | cudaError_t err; 552 | err = cudaMallocManaged(&a, N) // Assume the existence of `a` and `N`. 553 | 554 | if (err != cudaSuccess) // `cudaSuccess` is provided by CUDA. 
555 | { 556 | printf("Error: %s\n", cudaGetErrorString(err)); // `cudaGetErrorString` is provided by CUDA. 557 | } 558 | ``` 559 | 560 | Launching kernels, which are defined to return `void`, do not return a value of type `cudaError_t`. To check for errors occurring at the time of a kernel launch, for example if the launch configuration is erroneous, CUDA provides the `cudaGetLastError` function, which does return a value of type `cudaError_t`. 561 | 562 | ```cpp 563 | /* 564 | * This launch should cause an error, but the kernel itself 565 | * cannot return it. 566 | */ 567 | 568 | someKernel<<<1, -1>>>(); // -1 is not a valid number of threads. 569 | 570 | cudaError_t err; 571 | err = cudaGetLastError(); // `cudaGetLastError` will return the error from above. 572 | if (err != cudaSuccess) 573 | { 574 | printf("Error: %s\n", cudaGetErrorString(err)); 575 | } 576 | ``` 577 | 578 | Finally, in order to catch errors that occur asynchronously, for example during the execution of an asynchronous kernel, it is essential to check the status returned by a subsequent synchronizing CUDA runtime API call, such as `cudaDeviceSynchronize`, which will return an error if one of the kernels launched previously should fail. 579 | 580 | --- 581 | ### Exercise: Add Error Handling 582 | 583 | Currently [`01-add-error-handling.cu`](../edit/06-errors/01-add-error-handling.cu) compiles, runs, and prints that the elements of the array were not successfully doubled. The program does not, however, indicate that there are any errors within it. Refactor the application to handle CUDA errors so that you can learn what is wrong with the program and effectively debug it. You will need to investigate both synchronous errors potentially created when calling CUDA functions, as well as asynchronous errors potentially created while a CUDA kernel is executing. Refer to [the solution](../edit/06-errors/solutions/01-add-error-handling-solution.cu) if you get stuck. 584 | 585 | 586 | ```python 587 | !nvcc -arch=sm_70 -o add-error-handling 06-errors/01-add-error-handling.cu -run 588 | ``` 589 | 590 | Error: invalid configuration argument 591 | All elements were doubled? FALSE 592 | 593 | 594 | --- 595 | ### CUDA Error Handling Function 596 | 597 | It can be helpful to create a macro that wraps CUDA function calls for checking errors. Here is an example, feel free to use it in the remaining exercises: 598 | 599 | ```cpp 600 | #include 601 | #include 602 | 603 | inline cudaError_t checkCuda(cudaError_t result) 604 | { 605 | if (result != cudaSuccess) { 606 | fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); 607 | assert(result == cudaSuccess); 608 | } 609 | return result; 610 | } 611 | 612 | int main() 613 | { 614 | 615 | /* 616 | * The macro can be wrapped around any function returning 617 | * a value of type `cudaError_t`. 618 | */ 619 | 620 | checkCuda( cudaDeviceSynchronize() ) 621 | } 622 | ``` 623 | 624 | --- 625 | ## Summary 626 | 627 | At this point in time you have accomplished all of the following lab objectives: 628 | 629 | - Write, compile, and run C/C++ programs that both call CPU functions and **launch** GPU **kernels**. 630 | - Control parallel **thread hierarchy** using **execution configuration**. 631 | - Refactor serial loops to execute their iterations in parallel on a GPU. 632 | - Allocate and free memory available to both CPUs and GPUs. 633 | - Handle errors generated by CUDA code. 
634 | 635 | Now you will complete the final objective of the lab: 636 | 637 | - Accelerate CPU-only applications. 638 | 639 | --- 640 | ### Final Exercise: Accelerate Vector Addition Application 641 | 642 | The following challenge will give you an opportunity to use everything that you have learned thus far in the lab. It involves accelerating a CPU-only vector addition program, which, while not the most sophisticated program, will give you an opportunity to focus on what you have learned about GPU-accelerating an application with CUDA. After completing this exercise, if you have time and interest, continue on to the *Advanced Content* section for some challenges that involve more complex code bases. 643 | 644 | [`01-vector-add.cu`](../edit/07-vector-add/01-vector-add.cu) contains a functioning CPU-only vector addition application. Accelerate its `addVectorsInto` function to run as a CUDA kernel on the GPU and to do its work in parallel. Consider the following that need to occur, and refer to [the solution](../edit/07-vector-add/solutions/01-vector-add-solution.cu) if you get stuck. 645 | 646 | - Augment the `addVectorsInto` definition so that it is a CUDA kernel. 647 | - Choose and utilize a working execution configuration so that `addVectorsInto` launches as a CUDA kernel. 648 | - Update memory allocations, and memory freeing to reflect that the 3 vectors `a`, `b`, and `result` need to be accessed by host and device code. 649 | - Refactor the body of `addVectorsInto`: it will be launched inside of a single thread, and only needs to do one thread's worth of work on the input vectors. Be certain the thread will never try to access elements outside the range of the input vectors, and take care to note whether or not the thread needs to do work on more than one element of the input vectors. 650 | - Add error handling in locations where CUDA code might otherwise silently fail. 651 | 652 | 653 | ```python 654 | !nvcc -arch=sm_70 -o vector-add 07-vector-add/01-vector-add.cu -run 655 | ``` 656 | 657 | SUCCESS! All values added correctly. 658 | 659 | 660 | --- 661 | ## Advanced Content 662 | 663 | The following exercises provide additional challenge for those with time and interest. They require the use of more advanced techniques, and provide less scaffolding. They are difficult and excellent for your development. 664 | 665 | --- 666 | ## Grids and Blocks of 2 and 3 Dimensions 667 | 668 | Grids and blocks can be defined to have up to 3 dimensions. Defining them with multiple dimensions does not impact their performance in any way, but can be very helpful when dealing with data that has multiple dimensions, for example, 2d matrices. To define either grids or blocks with two or 3 dimensions, use CUDA's `dim3` type as such: 669 | 670 | ```cpp 671 | dim3 threads_per_block(16, 16, 1); 672 | dim3 number_of_blocks(16, 16, 1); 673 | someKernel<<>>(); 674 | ``` 675 | 676 | Given the example just above, the variables `gridDim.x`, `gridDim.y`, `blockDim.x`, and `blockDim.y` inside of `someKernel`, would all be equal to `16`. 677 | 678 | --- 679 | ### Exercise: Accelerate 2D Matrix Multiply Application 680 | 681 | The file [`01-matrix-multiply-2d.cu`](../edit/08-matrix-multiply/01-matrix-multiply-2d.cu) contains a host function `matrixMulCPU` which is fully functional. Your task is to build out the `matrixMulGPU` CUDA kernel. The source code will execute the matrix multiplication with both functions, and compare their answers to verify the correctness of the CUDA kernel you will be writing. 
Use the following guidelines to support your work and refer to [the solution](../edit/08-matrix-multiply/solutions/01-matrix-multiply-2d-solution.cu) if you get stuck: 682 | 683 | - You will need to create an execution configuration whose arguments are both `dim3` values with the `x` and `y` dimensions set to greater than `1`. 684 | - Inside the body of the kernel, you will need to establish the running thread's unique index within the grid per usual, but you should establish two indices for the thread: one for the x axis of the grid, and one for the y axis of the grid. 685 | 686 | 687 | ```python 688 | !nvcc -arch=sm_70 -o matrix-multiply-2d 08-matrix-multiply/01-matrix-multiply-2d.cu -run 689 | ``` 690 | 691 | --- 692 | ### Exercise: Accelerate A Thermal Conductivity Application 693 | 694 | In the following exercise, you will be accelerating an application that simulates the thermal conduction of silver in 2 dimensional space. 695 | 696 | Convert the `step_kernel_mod` function inside [`01-heat-conduction.cu`](../edit/09-heat/01-heat-conduction.cu) to execute on the GPU, and modify the `main` function to properly allocate data for use on CPU and GPU. The `step_kernel_ref` function executes on the CPU and is used for error checking. Because this code involves floating point calculations, different processors, or even simply reordering operations on the same processor, can result in slightly different results. For this reason the error checking code uses an error threshold, instead of looking for an exact match. Refer to [the solution](../edit/09-heat/solutions/01-heat-conduction-solution.cu) if you get stuck. 697 | 698 | 699 | ```python 700 | !nvcc -arch=sm_70 -o heat-conduction 09-heat/01-heat-conduction.cu -run 701 | ``` 702 | 703 | > Credit for the original Heat Conduction CPU source code in this task is given to the article [An OpenACC Example Code for a C-based heat conduction code](http://docplayer.net/30411068-An-openacc-example-code-for-a-c-based-heat-conduction-code.html) from the University of Houston. 704 | -------------------------------------------------------------------------------- /AC_CUDA_C.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

# Accelerating Applications with CUDA C/C++
" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "![CUDA](./images/CUDA_Logo.jpg)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Accelerated computing is replacing CPU-only computing as best practice. The parade of breakthroughs driven by accelerated computing, the ever increasing demand for accelerated applications, programming conventions that ease writing them, and constant improvements in the hardware that supports them, are driving this inevitable transition.\n", 22 | "\n", 23 | "At the center of accelerated computing's success, both in terms of its impressive performance, and its ease of use, is the [CUDA](https://developer.nvidia.com/about-cuda) compute platform. CUDA provides a coding paradigm that extends languages like C, C++, Python, and Fortran, to be capable of running accelerated, massively parallelized code on the world's most performant parallel processors: NVIDIA GPUs. CUDA accelerates applications drastically with little effort, has an ecosystem of highly optimized libraries for [DNN](https://developer.nvidia.com/cudnn), [BLAS](https://developer.nvidia.com/cublas), [graph analytics](https://developer.nvidia.com/nvgraph), [FFT](https://developer.nvidia.com/cufft), and more, and also ships with powerful [command line and visual profilers](https://developer.nvidia.com/nsight-systems).\n", 24 | "\n", 25 | "CUDA supports many, if not most, of the [world's most performant applications](https://www.nvidia.com/en-us/data-center/gpu-accelerated-applications/catalog/?product_category_id=58,59,60,293,98,172,223,227,228,265,487,488,114,389,220,258,461&search=) in: [Computational Fluid Dynamics](https://www.nvidia.com/en-us/data-center/gpu-accelerated-applications/catalog/?product_category_id=10,12,16,17,19,51,53,71,87,121,124,156,157,195,202,203,204,312,339,340,395,407,448,485,517,528,529,541,245,216,104,462,513,250,492,420,429,490,10,12,16,17,19,51,53,71,87,121,124,156,157,195,202,203,204,312,339,340,395,407,448,485,517,528,529,541,245,216,104,462,513,250,492,420,429,490,10,12,16,17,19,51,53,71,87,121,124,156,157,195,202,203,204,312,339,340,395,407,448,485,517,528,529,541,245,216,104,462,513,250,492,420,429,490&search=), [Molecular Dynamics](https://www.nvidia.com/en-us/data-center/gpu-accelerated-applications/catalog/?product_category_id=8,57,92,123,211,213,237,272,274,282,283,307,325,337,344,345,351,362,365,380,396,398,400,435,507,508,519,8,57,92,123,211,213,237,272,274,282,283,307,325,337,344,345,351,362,365,380,396,398,400,435,507,508,519,8,57,92,123,211,213,237,272,274,282,283,307,325,337,344,345,351,362,365,380,396,398,400,435,507,508,519,8,57,92,123,211,213,237,272,274,282,283,307,325,337,344,345,351,362,365,380,396,398,400,435,507,508,519&search=), [Quantum Chemistry](https://www.nvidia.com/en-us/data-center/gpu-accelerated-applications/catalog/?product_category_id=8,57,92,123,211,213,237,272,274,282,283,307,325,337,344,345,351,362,365,380,396,398,400,435,507,508,519,8,57,92,123,211,213,237,272,274,282,283,307,325,337,344,345,351,362,365,380,396,398,400,435,507,508,519&search=), 
[Physics](https://www.nvidia.com/en-us/data-center/gpu-accelerated-applications/catalog/?product_category_id=6,24,116,118,119,135,229,231,372,373,392,393,489,493,494,495,496,497,498,67,170,216,281,6,24,116,118,119,135,229,231,372,373,392,393,489,493,494,495,496,497,498,67,170,216,281,6,24,116,118,119,135,229,231,372,373,392,393,489,493,494,495,496,497,498,67,170,216,281,6,24,116,118,119,135,229,231,372,373,392,393,489,493,494,495,496,497,498,67,170,216,281,6,24,116,118,119,135,229,231,372,373,392,393,489,493,494,495,496,497,498,67,170,216,281&search=) and HPC.\n", 26 | "\n", 27 | "Learning CUDA will enable you to accelerate your own applications. Accelerated applications perform much faster than their CPU-only counterparts, and make possible computations that would be otherwise prohibited given the limited performance of CPU-only applications. In this lab you will receive an introduction to programming accelerated applications with CUDA C/C++, enough to be able to begin work accelerating your own CPU-only applications for performance gains, and for moving into novel computational territory." 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "---\n", 35 | "## Prerequisites\n", 36 | "\n", 37 | "To get the most out of this lab you should already be able to:\n", 38 | "\n", 39 | "- Declare variables, write loops, and use if / else statements in C.\n", 40 | "- Define and invoke functions in C.\n", 41 | "- Allocate arrays in C.\n", 42 | "\n", 43 | "No previous CUDA knowledge is required." 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "---\n", 51 | "## Objectives\n", 52 | "\n", 53 | "By the time you complete this lab, you will be able to:\n", 54 | "\n", 55 | "- Write, compile, and run C/C++ programs that both call CPU functions and **launch** GPU **kernels**.\n", 56 | "- Control parallel **thread hierarchy** using **execution configuration**.\n", 57 | "- Refactor serial loops to execute their iterations in parallel on a GPU.\n", 58 | "- Allocate and free memory available to both CPUs and GPUs.\n", 59 | "- Handle errors generated by CUDA code.\n", 60 | "- Accelerate CPU-only applications." 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "---\n", 68 | "## Accelerated Systems\n", 69 | "\n", 70 | "*Accelerated systems*, also referred to as *heterogeneous systems*, are those composed of both CPUs and GPUs. Accelerated systems run CPU programs which in turn, launch functions that will benefit from the massive parallelism provided by GPUs. This lab environment is an accelerated system which includes an NVIDIA GPU. Information about this GPU can be queried with the `nvidia-smi` (*Systems Management Interface*) command line command. Issue the `nvidia-smi` command now, by `CTRL` + `ENTER` on the code execution cell below. You will find these cells throughout this lab any time you need to execute code. The output from running the command will be printed just below the code execution cell after the code runs. After running the code execution block immediately below, take care to find and note the name of the GPU in the output." 
71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 1, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "Wed Jun 22 07:24:58 2022 \n", 83 | "+-----------------------------------------------------------------------------+\n", 84 | "| NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 85 | "|-------------------------------+----------------------+----------------------+\n", 86 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 87 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 88 | "| | | MIG M. |\n", 89 | "|===============================+======================+======================|\n", 90 | "| 0 Tesla T4 On | 00000000:00:1E.0 Off | 0 |\n", 91 | "| N/A 30C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |\n", 92 | "| | | N/A |\n", 93 | "+-------------------------------+----------------------+----------------------+\n", 94 | " \n", 95 | "+-----------------------------------------------------------------------------+\n", 96 | "| Processes: |\n", 97 | "| GPU GI CI PID Type Process name GPU Memory |\n", 98 | "| ID ID Usage |\n", 99 | "|=============================================================================|\n", 100 | "| No running processes found |\n", 101 | "+-----------------------------------------------------------------------------+\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "!nvidia-smi" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "---\n", 114 | "## GPU-accelerated Vs. CPU-only Applications\n", 115 | "\n", 116 | "The following slides present upcoming material visually, at a high level. Click through the slides before moving on to more detailed coverage of their topics in following sections.\n", 117 | "\n", 118 | "" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 2, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/html": [ 129 | "\n", 130 | "
\n" 131 | ], 132 | "text/plain": [ 133 | "" 134 | ] 135 | }, 136 | "metadata": {}, 137 | "output_type": "display_data" 138 | } 139 | ], 140 | "source": [ 141 | "%%HTML\n", 142 | "\n", 143 | "
" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "---\n", 151 | "## Writing Application Code for the GPU\n", 152 | "\n", 153 | "CUDA provides extensions for many common programming languages, in the case of this lab, C/C++. These language extensions easily allow developers to run functions in their source code on a GPU.\n", 154 | "\n", 155 | "Below is a `.cu` file (`.cu` is the file extension for CUDA-accelerated programs). It contains two functions, the first which will run on the CPU, the second which will run on the GPU. Spend a little time identifying the differences between the functions, both in terms of how they are defined, and how they are invoked.\n", 156 | "\n", 157 | "```cpp\n", 158 | "void CPUFunction()\n", 159 | "{\n", 160 | " printf(\"This function is defined to run on the CPU.\\n\");\n", 161 | "}\n", 162 | "\n", 163 | "__global__ void GPUFunction()\n", 164 | "{\n", 165 | " printf(\"This function is defined to run on the GPU.\\n\");\n", 166 | "}\n", 167 | "\n", 168 | "int main()\n", 169 | "{\n", 170 | " CPUFunction();\n", 171 | "\n", 172 | " GPUFunction<<<1, 1>>>();\n", 173 | " cudaDeviceSynchronize();\n", 174 | "}\n", 175 | "```\n", 176 | "\n", 177 | "Here are some important lines of code to highlight, as well as some other common terms used in accelerated computing:\n", 178 | "\n", 179 | "`__global__ void GPUFunction()`\n", 180 | " - The `__global__` keyword indicates that the following function will run on the GPU, and can be invoked **globally**, which in this context means either by the CPU, or, by the GPU.\n", 181 | " - Often, code executed on the CPU is referred to as **host** code, and code running on the GPU is referred to as **device** code.\n", 182 | " - Notice the return type `void`. It is required that functions defined with the `__global__` keyword return type `void`.\n", 183 | "\n", 184 | "`GPUFunction<<<1, 1>>>();`\n", 185 | " - Typically, when calling a function to run on the GPU, we call this function a **kernel**, which is **launched**.\n", 186 | " - When launching a kernel, we must provide an **execution configuration**, which is done by using the `<<< ... >>>` syntax just prior to passing the kernel any expected arguments.\n", 187 | " - At a high level, execution configuration allows programmers to specify the **thread hierarchy** for a kernel launch, which defines the number of thread groupings (called **blocks**), as well as how many **threads** to execute in each block. Execution configuration will be explored at great length later in the lab, but for the time being, notice the kernel is launching with `1` block of threads (the first execution configuration argument) which contains `1` thread (the second configuration argument).\n", 188 | "\n", 189 | "`cudaDeviceSynchronize();`\n", 190 | " - Unlike much C/C++ code, launching kernels is **asynchronous**: the CPU code will continue to execute *without waiting for the kernel launch to complete*.\n", 191 | " - A call to `cudaDeviceSynchronize`, a function provided by the CUDA runtime, will cause the host (CPU) code to wait until the device (GPU) code completes, and only then resume execution on the CPU." 
192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "---\n", 199 | "### Exercise: Write a Hello GPU Kernel\n", 200 | "\n", 201 | "The [`01-hello-gpu.cu`](../edit/01-hello/01-hello-gpu.cu) (*<---- click on the link of the source file to open it in another tab for editing*) contains a program that is already working. It contains two functions, both of which print \"Hello from the CPU\" messages. Your goal is to refactor the `helloGPU` function in the source file so that it actually runs on the GPU, and prints a message indicating that it does.\n", 202 | "\n", 203 | "- Refactor the application, before compiling and running it with the `nvcc` command just below (remember, you can execute the contents of the code execution cell by pressing `CTRL + ENTER` in it). The comments in [`01-hello-gpu.cu`](../edit/01-hello/01-hello-gpu.cu) will assist your work. If you get stuck, or want to check your work, refer to the [solution](../edit/01-hello/solutions/01-hello-gpu-solution.cu). Don't forget to save your changes to the file before compiling and running with the command below." 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 9, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "Hello from the CPU.\n", 216 | "Hello from the GPU.\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "!nvcc -arch=sm_70 -o hello-gpu 01-hello/01-hello-gpu.cu -run" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "After successfully refactoring [`01-hello-gpu.cu`](../edit/01-hello/01-hello-gpu.cu), make the following modifications, attempting to compile and run it after each change (by `CTRL + ENTER` in the code execution cell above). When given errors, take the time to read them carefully: familiarity with them will serve you greatly when you begin writing your own accelerated code.\n", 229 | "\n", 230 | "- Remove the keyword `__global__` from your kernel definition. Take care to note the line number in the error: what do you think is meant in the error by \"configured\"? Replace `__global__` when finished.\n", 231 | "- Remove the execution configuration: does your understanding of \"configured\" still make sense? Replace the execution configuration when finished.\n", 232 | "- Remove the call to `cudaDeviceSynchronize`. Before compiling and running the code, take a guess at what will happen, recalling that kernels are launched asynchronously, and that `cudaDeviceSynchronize` is what makes host execution wait for kernel execution to complete before proceeding. Replace the call to `cudaDeviceSynchronize` when finished.\n", 233 | "- Refactor `01-hello-gpu.cu` so that `Hello from the GPU` prints **before** `Hello from the CPU`.\n", 234 | "- Refactor `01-hello-gpu.cu` so that `Hello from the GPU` prints **twice**, once **before** `Hello from the CPU`, and once **after**." 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "---\n", 242 | "### Compiling and Running Accelerated CUDA Code\n", 243 | "\n", 244 | "This section contains details about the `nvcc` command you issued above to compile and run your `.cu` program.\n", 245 | "\n", 246 | "The CUDA platform ships with the [**NVIDIA CUDA Compiler**](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html) `nvcc`, which can compile CUDA accelerated applications, both the host, and the device code they contain. 
For the purposes of this lab, `nvcc` discussion will be pragmatically scoped to suit our immediate needs. After completing the lab anyone interested in a deeper dive into `nvcc` can start with [the documentation](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html).\n", 247 | "\n", 248 | "`nvcc` will be very familiar to experienced `gcc` users. Compiling, for example, a `some-CUDA.cu` file, is simply:\n", 249 | "\n", 250 | "`nvcc -arch=sm_70 -o out some-CUDA.cu -run`\n", 251 | " - `nvcc` is the command line command for using the `nvcc` compiler.\n", 252 | " - `some-CUDA.cu` is passed as the file to compile.\n", 253 | " - The `o` flag is used to specify the output file for the compiled program.\n", 254 | " - The `arch` flag indicates for which **architecture** the files must be compiled. For the present case `sm_70` will serve to compile specifically for the GPU this lab is running on, but for those interested in a deeper dive, please refer to the docs about the [`arch` flag](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-steering-gpu-code-generation), [virtual architecture features](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#gpu-feature-list) and [GPU features](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#gpu-feature-list).\n", 255 | " - As a matter of convenience, providing the `run` flag will execute the successfully compiled binary." 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "---\n", 263 | "## CUDA Thread Hierarchy\n", 264 | "\n", 265 | "The following slides present upcoming material visually, at a high level. Click through the slides before moving on to more detailed coverage of their topics in following sections." 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 10, 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "data": { 275 | "text/html": [ 276 | "\n", 277 | "
\n" 278 | ], 279 | "text/plain": [ 280 | "" 281 | ] 282 | }, 283 | "metadata": {}, 284 | "output_type": "display_data" 285 | } 286 | ], 287 | "source": [ 288 | "%%HTML\n", 289 | "\n", 290 | "
" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "---\n", 298 | "## Launching Parallel Kernels\n", 299 | "\n", 300 | "The execution configuration allows programmers to specify details about launching the kernel to run in parallel on multiple GPU **threads**. More precisely, the execution configuration allows programmers to specify how many groups of threads - called **thread blocks**, or just **blocks** - and how many threads they would like each thread block to contain. The syntax for this is:\n", 301 | "\n", 302 | "`<<< NUMBER_OF_BLOCKS, NUMBER_OF_THREADS_PER_BLOCK>>>`\n", 303 | "\n", 304 | "** The kernel code is executed by every thread in every thread block configured when the kernel is launched**.\n", 305 | "\n", 306 | "Thus, under the assumption that a kernel called `someKernel` has been defined, the following are true:\n", 307 | " - `someKernel<<<1, 1>>>()` is configured to run in a single thread block which has a single thread and will therefore run only once.\n", 308 | " - `someKernel<<<1, 10>>>()` is configured to run in a single thread block which has 10 threads and will therefore run 10 times.\n", 309 | " - `someKernel<<<10, 1>>>()` is configured to run in 10 thread blocks which each have a single thread and will therefore run 10 times.\n", 310 | " - `someKernel<<<10, 10>>>()` is configured to run in 10 thread blocks which each have 10 threads and will therefore run 100 times." 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "---\n", 318 | "### Exercise: Launch Parallel Kernels\n", 319 | "\n", 320 | "[`01-first-parallel.cu`](../edit/02-first-parallel/01-first-parallel.cu) currently makes a very basic function call that prints the message `This should be running in parallel.` Follow the steps below to refactor it first to run on the GPU, and then, in parallel, both in a single, and then, in multiple thread blocks. Refer to [the solution](../edit/02-first-parallel/solutions/01-first-parallel-solution.cu) if you get stuck.\n", 321 | "\n", 322 | "- Refactor the `firstParallel` function to launch as a CUDA kernel on the GPU. You should still be able to see the output of the function after compiling and running `01-first-parallel.cu` with the `nvcc` command just below.\n", 323 | "- Refactor the `firstParallel` kernel to execute in parallel on 5 threads, all executing in a single thread block. You should see the output message printed 5 times after compiling and running the code.\n", 324 | "- Refactor the `firstParallel` kernel again, this time to execute in parallel inside 5 thread blocks, each containing 5 threads. You should see the output message printed 25 times now after compiling and running." 
325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 17, 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "name": "stdout", 334 | "output_type": "stream", 335 | "text": [ 336 | "[1] This should be running in parallel.\r\n", 337 | "[1] This should be running in parallel.\r\n", 338 | "[1] This should be running in parallel.\r\n", 339 | "[0] This should be running in parallel.\r\n", 340 | "[0] This should be running in parallel.\r\n", 341 | "[0] This should be running in parallel.\r\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "!nvcc -arch=sm_70 -o first-parallel 02-first-parallel/01-first-parallel.cu -run" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "---\n", 354 | "\n", 355 | "## CUDA-Provided Thread Hierarchy Variables\n", 356 | "\n", 357 | "The following slides present upcoming material visually, at a high level. Click through the slides before moving on to more detailed coverage of their topics in following sections." 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 18, 363 | "metadata": {}, 364 | "outputs": [ 365 | { 366 | "data": { 367 | "text/html": [ 368 | "\n", 369 | "
\n" 370 | ], 371 | "text/plain": [ 372 | "" 373 | ] 374 | }, 375 | "metadata": {}, 376 | "output_type": "display_data" 377 | } 378 | ], 379 | "source": [ 380 | "%%HTML\n", 381 | "\n", 382 | "
" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "---\n", 390 | "## Thread and Block Indices\n", 391 | "\n", 392 | "Each thread is given an index within its thread block, starting at `0`. Additionally, each block is given an index, starting at `0`. Just as threads are grouped into thread blocks, blocks are grouped into a **grid**, which is the highest entity in the CUDA thread hierarchy. In summary, CUDA kernels are executed in a grid of 1 or more blocks, with each block containing the same number of 1 or more threads.\n", 393 | "\n", 394 | "CUDA kernels have access to special variables identifying both the index of the thread (within the block) that is executing the kernel, and, the index of the block (within the grid) that the thread is within. These variables are `threadIdx.x` and `blockIdx.x` respectively." 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "---\n", 402 | "### Exercise: Use Specific Thread and Block Indices\n", 403 | "\n", 404 | "Currently the [`01-thread-and-block-idx.cu`](../edit/03-indices/01-thread-and-block-idx.cu) file contains a working kernel that is printing a failure message. Open the file to learn how to update the execution configuration so that the success message will print. After refactoring, compile and run the code with the code execution cell below to confirm your work. Refer to [the solution](../edit/03-indices/solutions/01-thread-and-block-idx-solution.cu) if you get stuck." 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 20, 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "name": "stdout", 414 | "output_type": "stream", 415 | "text": [ 416 | "Success!\r\n" 417 | ] 418 | } 419 | ], 420 | "source": [ 421 | "!nvcc -arch=sm_70 -o thread-and-block-idx 03-indices/01-thread-and-block-idx.cu -run" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "---\n", 429 | "## Accelerating For Loops\n", 430 | "\n", 431 | "For loops in CPU-only applications are ripe for acceleration: rather than run each iteration of the loop serially, each iteration of the loop can be run in parallel in its own thread. Consider the following for loop, and notice, though it is obvious, that it controls how many times the loop will execute, as well as defining what will happen for each iteration of the loop:\n", 432 | "\n", 433 | "```cpp\n", 434 | "int N = 2<<20;\n", 435 | "for (int i = 0; i < N; ++i)\n", 436 | "{\n", 437 | " printf(\"%d\\n\", i);\n", 438 | "}\n", 439 | "```\n", 440 | "\n", 441 | "In order to parallelize this loop, 2 steps must be taken:\n", 442 | "\n", 443 | "- A kernel must be written to do the work of a **single iteration of the loop**.\n", 444 | "- Because the kernel will be agnostic of other running kernels, the execution configuration must be such that the kernel executes the correct number of times, for example, the number of times the loop would have iterated." 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [ 451 | "---\n", 452 | "### Exercise: Accelerating a For Loop with a Single Block of Threads\n", 453 | "\n", 454 | "Currently, the `loop` function inside [`01-single-block-loop.cu`](../edit/04-loops/01-single-block-loop.cu), runs a for loop that will serially print the numbers `0` through `9`. Refactor the `loop` function to be a CUDA kernel which will launch to execute `N` iterations in parallel. 
After successfully refactoring, the numbers `0` through `9` should still be printed. Refer to [the solution](../edit/04-loops/solutions/01-single-block-loop-solution.cu) if you get stuck." 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 24, 460 | "metadata": {}, 461 | "outputs": [ 462 | { 463 | "name": "stdout", 464 | "output_type": "stream", 465 | "text": [ 466 | "This is iteration number 0\r\n", 467 | "This is iteration number 1\r\n", 468 | "This is iteration number 2\r\n", 469 | "This is iteration number 3\r\n", 470 | "This is iteration number 4\r\n", 471 | "This is iteration number 5\r\n", 472 | "This is iteration number 6\r\n", 473 | "This is iteration number 7\r\n", 474 | "This is iteration number 8\r\n", 475 | "This is iteration number 9\r\n" 476 | ] 477 | } 478 | ], 479 | "source": [ 480 | "!nvcc -arch=sm_70 -o single-block-loop 04-loops/01-single-block-loop.cu -run" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "---\n", 488 | "## Coordinating Parallel Threads\n", 489 | "\n", 490 | "The following slides present upcoming material visually, at a high level. Click through the slides before moving on to more detailed coverage of their topics in following sections." 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 25, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "data": { 500 | "text/html": [ 501 | "\n", 502 | "
\n" 503 | ], 504 | "text/plain": [ 505 | "" 506 | ] 507 | }, 508 | "metadata": {}, 509 | "output_type": "display_data" 510 | } 511 | ], 512 | "source": [ 513 | "%%HTML\n", 514 | "\n", 515 | "
" 516 | ] 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "metadata": {}, 521 | "source": [ 522 | "---\n", 523 | "## Using Block Dimensions for More Parallelization\n", 524 | "\n", 525 | "There is a limit to the number of threads that can exist in a thread block: 1024 to be precise. In order to increase the amount of parallelism in accelerated applications, we must be able to coordinate among multiple thread blocks.\n", 526 | "\n", 527 | "CUDA Kernels have access to a special variable that gives the number of threads in a block: `blockDim.x`. Using this variable, in conjunction with `blockIdx.x` and `threadIdx.x`, increased parallelization can be accomplished by organizing parallel execution across multiple blocks of multiple threads with the idiomatic expression `threadIdx.x + blockIdx.x * blockDim.x`. Here is a detailed example.\n", 528 | "\n", 529 | "The execution configuration `<<<10, 10>>>` would launch a grid with a total of 100 threads, contained in 10 blocks of 10 threads. We would therefore hope for each thread to have the ability to calculate some index unique to itself between `0` and `99`.\n", 530 | "\n", 531 | "- If block `blockIdx.x` equals `0`, then `blockIdx.x * blockDim.x` is `0`. Adding to `0` the possible `threadIdx.x` values `0` through `9`, then we can generate the indices `0` through `9` within the 100 thread grid.\n", 532 | "- If block `blockIdx.x` equals `1`, then `blockIdx.x * blockDim.x` is `10`. Adding to `10` the possible `threadIdx.x` values `0` through `9`, then we can generate the indices `10` through `19` within the 100 thread grid.\n", 533 | "- If block `blockIdx.x` equals `5`, then `blockIdx.x * blockDim.x` is `50`. Adding to `50` the possible `threadIdx.x` values `0` through `9`, then we can generate the indices `50` through `59` within the 100 thread grid.\n", 534 | "- If block `blockIdx.x` equals `9`, then `blockIdx.x * blockDim.x` is `90`. Adding to `90` the possible `threadIdx.x` values `0` through `9`, then we can generate the indices `90` through `99` within the 100 thread grid." 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": {}, 540 | "source": [ 541 | "---\n", 542 | "### Exercise: Accelerating a For Loop with Multiple Blocks of Threads\n", 543 | "\n", 544 | "Currently, the `loop` function inside [`02-multi-block-loop.cu`](../edit/04-loops/02-multi-block-loop.cu) runs a for loop that will serially print the numbers `0` through `9`. Refactor the `loop` function to be a CUDA kernel which will launch to execute `N` iterations in parallel. After successfully refactoring, the numbers `0` through `9` should still be printed. For this exercise, as an additional constraint, use an execution configuration that launches *at least 2 blocks of threads.* Refer to [the solution](../edit/04-loops/solutions/02-multi-block-loop-solution.cu) if you get stuck." 
545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 29, 550 | "metadata": {}, 551 | "outputs": [ 552 | { 553 | "name": "stdout", 554 | "output_type": "stream", 555 | "text": [ 556 | "Index: 5\r\n", 557 | "Index: 6\r\n", 558 | "Index: 7\r\n", 559 | "Index: 8\r\n", 560 | "Index: 9\r\n", 561 | "Index: 0\r\n", 562 | "Index: 1\r\n", 563 | "Index: 2\r\n", 564 | "Index: 3\r\n", 565 | "Index: 4\r\n" 566 | ] 567 | } 568 | ], 569 | "source": [ 570 | "!nvcc -arch=sm_70 -o multi-block-loop 04-loops/02-multi-block-loop.cu -run" 571 | ] 572 | }, 573 | { 574 | "cell_type": "markdown", 575 | "metadata": {}, 576 | "source": [ 577 | "---\n", 578 | "## Allocating Memory to be accessed on the GPU and the CPU\n", 579 | "\n", 580 | "More recent versions of CUDA (version 6 and later) have made it easy to allocate memory that is available to both the CPU host and any number of GPU devices, and while there are many [intermediate and advanced techniques](http://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#memory-optimizations) for memory management that will support the most optimal performance in accelerated applications, the most basic CUDA memory management technique we will now cover supports fantastic performance gains over CPU-only applications with almost no developer overhead.\n", 581 | "\n", 582 | "To allocate and free memory, and obtain a pointer that can be referenced in both host and device code, replace calls to `malloc` and `free` with `cudaMallocManaged` and `cudaFree` as in the following example:\n", 583 | "\n", 584 | "```cpp\n", 585 | "// CPU-only\n", 586 | "\n", 587 | "int N = 2<<20;\n", 588 | "size_t size = N * sizeof(int);\n", 589 | "\n", 590 | "int *a;\n", 591 | "a = (int *)malloc(size);\n", 592 | "\n", 593 | "// Use `a` in CPU-only program.\n", 594 | "\n", 595 | "free(a);\n", 596 | "```\n", 597 | "\n", 598 | "```cpp\n", 599 | "// Accelerated\n", 600 | "\n", 601 | "int N = 2<<20;\n", 602 | "size_t size = N * sizeof(int);\n", 603 | "\n", 604 | "int *a;\n", 605 | "// Note the address of `a` is passed as first argument.\n", 606 | "cudaMallocManaged(&a, size);\n", 607 | "\n", 608 | "// Use `a` on the CPU and/or on any GPU in the accelerated system.\n", 609 | "\n", 610 | "cudaFree(a);\n", 611 | "```" 612 | ] 613 | }, 614 | { 615 | "cell_type": "markdown", 616 | "metadata": {}, 617 | "source": [ 618 | "---\n", 619 | "### Exercise: Array Manipulation on both the Host and Device\n", 620 | "\n", 621 | "The [`01-double-elements.cu`](../edit/05-allocate/01-double-elements.cu) program allocates an array, initializes it with integer values on the host, attempts to double each of these values in parallel on the GPU, and then confirms whether or not the doubling operations were successful, on the host. Currently the program will not work: it is attempting to interact on both the host and the device with an array at pointer `a`, but has only allocated the array (using `malloc`) to be accessible on the host. Refactor the application to meet the following conditions, referring to [the solution](../edit/05-allocate/solutions/01-double-elements-solution.cu) if you get stuck:\n", 622 | "\n", 623 | "- `a` should be available to both host and device code.\n", 624 | "- The memory at `a` should be correctly freed." 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 32, 630 | "metadata": {}, 631 | "outputs": [ 632 | { 633 | "name": "stdout", 634 | "output_type": "stream", 635 | "text": [ 636 | "All elements were doubled? 
TRUE\r\n" 637 | ] 638 | } 639 | ], 640 | "source": [ 641 | "!nvcc -arch=sm_70 -o double-elements 05-allocate/01-double-elements.cu -run" 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": {}, 647 | "source": [ 648 | "## Grid Size Work Amount Mismatch\n", 649 | "\n", 650 | "The following slides present upcoming material visually, at a high level. Click through the slides before moving on to more detailed coverage of their topics in following sections." 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": 33, 656 | "metadata": {}, 657 | "outputs": [ 658 | { 659 | "data": { 660 | "text/html": [ 661 | "\n", 662 | "
\n" 663 | ], 664 | "text/plain": [ 665 | "" 666 | ] 667 | }, 668 | "metadata": {}, 669 | "output_type": "display_data" 670 | } 671 | ], 672 | "source": [ 673 | "%%HTML\n", 674 | "\n", 675 | "
" 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "---\n", 683 | "## Handling Block Configuration Mismatches to Number of Needed Threads\n", 684 | "\n", 685 | "It may be the case that an execution configuration cannot be expressed that will create the exact number of threads needed for parallelizing a loop.\n", 686 | "\n", 687 | "A common example has to do with the desire to choose optimal block sizes. For example, due to GPU hardware traits, blocks that contain a number of threads that are a multiple of 32 are often desirable for performance benefits. Assuming that we wanted to launch blocks each containing 256 threads (a multiple of 32), and needed to run 1000 parallel tasks (a trivially small number for ease of explanation), then there is no number of blocks that would produce an exact total of 1000 threads in the grid, since there is no integer value 32 can be multiplied by to equal exactly 1000.\n", 688 | "\n", 689 | "This scenario can be easily addressed in the following way:\n", 690 | "\n", 691 | "- Write an execution configuration that creates **more** threads than necessary to perform the allotted work.\n", 692 | "- Pass a value as an argument into the kernel (`N`) that represents to the total size of the data set to be processed, or the total threads that are needed to complete the work.\n", 693 | "- After calculating the thread's index within the grid (using `tid+bid*bdim`), check that this index does not exceed `N`, and only perform the pertinent work of the kernel if it does not.\n", 694 | "\n", 695 | "Here is an example of an idiomatic way to write an execution configuration when both `N` and the number of threads in a block are known, and an exact match between the number of threads in the grid and `N` cannot be guaranteed. 
It ensures that there are always at least as many threads as needed for `N`, and only 1 additional block's worth of threads extra, at most:\n", 696 | "\n", 697 | "```cpp\n", 698 | "// Assume `N` is known\n", 699 | "int N = 100000;\n", 700 | "\n", 701 | "// Assume we have a desire to set `threads_per_block` exactly to `256`\n", 702 | "size_t threads_per_block = 256;\n", 703 | "\n", 704 | "// Ensure there are at least `N` threads in the grid, but only 1 block's worth extra\n", 705 | "size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block;\n", 706 | "\n", 707 | "some_kernel<<<number_of_blocks, threads_per_block>>>(N);\n", 708 | "```\n", 709 | "\n", 710 | "Because the execution configuration above results in more threads in the grid than `N`, care will need to be taken inside of the `some_kernel` definition so that `some_kernel` does not attempt to access out of range data elements, when being executed by one of the \"extra\" threads:\n", 711 | "\n", 712 | "```cpp\n", 713 | "__global__ void some_kernel(int N)\n", 714 | "{\n", 715 | " int idx = threadIdx.x + blockIdx.x * blockDim.x;\n", 716 | "\n", 717 | " if (idx < N) // Check to make sure `idx` maps to some value within `N`\n", 718 | " {\n", 719 | " // Only do work if it does\n", 720 | " }\n", 721 | "}\n", 722 | "```" 723 | ] 724 | }, 725 | { 726 | "cell_type": "markdown", 727 | "metadata": {}, 728 | "source": [ 729 | "---\n", 730 | "### Exercise: Accelerating a For Loop with a Mismatched Execution Configuration\n", 731 | "\n", 732 | "The program in [`02-mismatched-config-loop.cu`](../edit/05-allocate/02-mismatched-config-loop.cu) allocates memory, using `cudaMallocManaged` for a 1000 element array of integers, and then seeks to initialize all the values of the array in parallel using a CUDA kernel. This program assumes that both `N` and the number of `threads_per_block` are known. Your task is to complete the following two objectives; refer to [the solution](../edit/05-allocate/solutions/02-mismatched-config-loop-solution.cu) if you get stuck:\n", 733 | "\n", 734 | "- Assign a value to `number_of_blocks` that will make sure there are at least as many threads as there are elements in `a` to work on.\n", 735 | "- Update the `initializeElementsTo` kernel to make sure that it does not attempt to work on data elements that are out of range." 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": 34, 741 | "metadata": {}, 742 | "outputs": [ 743 | { 744 | "name": "stdout", 745 | "output_type": "stream", 746 | "text": [ 747 | "SUCCESS!\r\n" 748 | ] 749 | } 750 | ], 751 | "source": [ 752 | "!nvcc -arch=sm_70 -o mismatched-config-loop 05-allocate/02-mismatched-config-loop.cu -run" 753 | ] 754 | }, 755 | { 756 | "cell_type": "markdown", 757 | "metadata": {}, 758 | "source": [ 759 | "---\n", 760 | "## Grid-Stride Loops\n", 761 | "\n", 762 | "The following slides present upcoming material visually, at a high level. Click through the slides before moving on to more detailed coverage of their topics in following sections." 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": 35, 768 | "metadata": {}, 769 | "outputs": [ 770 | { 771 | "data": { 772 | "text/html": [ 773 | "\n", 774 | "
\n" 775 | ], 776 | "text/plain": [ 777 | "" 778 | ] 779 | }, 780 | "metadata": {}, 781 | "output_type": "display_data" 782 | } 783 | ], 784 | "source": [ 785 | "%%HTML\n", 786 | "\n", 787 | "
" 788 | ] 789 | }, 790 | { 791 | "cell_type": "markdown", 792 | "metadata": {}, 793 | "source": [ 794 | "---\n", 795 | "## Data Sets Larger Than the Grid\n", 796 | "\n", 797 | "Either by choice, often to create the most performant execution configuration, or out of necessity, the number of threads in a grid may be smaller than the size of a data set. Consider an array with 1000 elements, and a grid with 250 threads (using trivial sizes here for ease of explanation). Here, each thread in the grid will need to be used 4 times. One common method to do this is to use a **grid-stride loop** within the kernel.\n", 798 | "\n", 799 | "In a grid-stride loop, each thread will calculate its unique index within the grid using `tid+bid*bdim`, perform its operation on the element at that index within the array, and then, add to its index the number of threads in the grid and repeat, until it is out of range of the array. For example, for a 500 element array and a 250 thread grid, the thread with index 20 in the grid would:\n", 800 | "\n", 801 | "- Perform its operation on element 20 of the 500 element array\n", 802 | "- Increment its index by 250, the size of the grid, resulting in 270\n", 803 | "- Perform its operation on element 270 of the 500 element array\n", 804 | "- Increment its index by 250, the size of the grid, resulting in 520\n", 805 | "- Because 520 is now out of range for the array, the thread will stop its work\n", 806 | "\n", 807 | "CUDA provides a special variable giving the number of blocks in a grid, `gridDim.x`. Calculating the total number of threads in a grid then is simply the number of blocks in a grid multiplied by the number of threads in each block, `gridDim.x * blockDim.x`. With this in mind, here is a verbose example of a grid-stride loop within a kernel:\n", 808 | "\n", 809 | "```cpp\n", 810 | "__global__ void kernel(int *a, int N)\n", 811 | "{\n", 812 | " int indexWithinTheGrid = threadIdx.x + blockIdx.x * blockDim.x;\n", 813 | " int gridStride = gridDim.x * blockDim.x;\n", 814 | "\n", 815 | " for (int i = indexWithinTheGrid; i < N; i += gridStride)\n", 816 | " {\n", 817 | " // do work on a[i];\n", 818 | " }\n", 819 | "}\n", 820 | "```" 821 | ] 822 | }, 823 | { 824 | "cell_type": "markdown", 825 | "metadata": {}, 826 | "source": [ 827 | "---\n", 828 | "### Exercise: Use a Grid-Stride Loop to Manipulate an Array Larger than the Grid\n", 829 | "\n", 830 | "Refactor [`03-grid-stride-double.cu`](../edit/05-allocate/03-grid-stride-double.cu) to use a grid-stride loop in the `doubleElements` kernel, in order that the grid, which is smaller than `N`, can reuse threads to cover every element in the array. The program will print whether or not every element in the array has been doubled, currently the program accurately prints `FALSE`. Refer to [the solution](../edit/05-allocate/solutions/03-grid-stride-double-solution.cu) if you get stuck." 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": 37, 836 | "metadata": {}, 837 | "outputs": [ 838 | { 839 | "name": "stdout", 840 | "output_type": "stream", 841 | "text": [ 842 | "All elements were doubled? TRUE\r\n" 843 | ] 844 | } 845 | ], 846 | "source": [ 847 | "!nvcc -arch=sm_70 -o grid-stride-double 05-allocate/03-grid-stride-double.cu -run" 848 | ] 849 | }, 850 | { 851 | "cell_type": "markdown", 852 | "metadata": {}, 853 | "source": [ 854 | "---\n", 855 | "## Error Handling\n", 856 | "\n", 857 | "As in any application, error handling in accelerated CUDA code is essential. 
Many, if not most CUDA functions (see, for example, the [memory management functions](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY)) return a value of type `cudaError_t`, which can be used to check whether or not an error occurred while calling the function. Here is an example where error handling is performed for a call to `cudaMallocManaged`:\n", 858 | "\n", 859 | "```cpp\n", 860 | "cudaError_t err;\n", 861 | "err = cudaMallocManaged(&a, N) // Assume the existence of `a` and `N`.\n", 862 | "\n", 863 | "if (err != cudaSuccess) // `cudaSuccess` is provided by CUDA.\n", 864 | "{\n", 865 | " printf(\"Error: %s\\n\", cudaGetErrorString(err)); // `cudaGetErrorString` is provided by CUDA.\n", 866 | "}\n", 867 | "```\n", 868 | "\n", 869 | "Launching kernels, which are defined to return `void`, do not return a value of type `cudaError_t`. To check for errors occurring at the time of a kernel launch, for example if the launch configuration is erroneous, CUDA provides the `cudaGetLastError` function, which does return a value of type `cudaError_t`.\n", 870 | "\n", 871 | "```cpp\n", 872 | "/*\n", 873 | " * This launch should cause an error, but the kernel itself\n", 874 | " * cannot return it.\n", 875 | " */\n", 876 | "\n", 877 | "someKernel<<<1, -1>>>(); // -1 is not a valid number of threads.\n", 878 | "\n", 879 | "cudaError_t err;\n", 880 | "err = cudaGetLastError(); // `cudaGetLastError` will return the error from above.\n", 881 | "if (err != cudaSuccess)\n", 882 | "{\n", 883 | " printf(\"Error: %s\\n\", cudaGetErrorString(err));\n", 884 | "}\n", 885 | "```\n", 886 | "\n", 887 | "Finally, in order to catch errors that occur asynchronously, for example during the execution of an asynchronous kernel, it is essential to check the status returned by a subsequent synchronizing CUDA runtime API call, such as `cudaDeviceSynchronize`, which will return an error if one of the kernels launched previously should fail." 888 | ] 889 | }, 890 | { 891 | "cell_type": "markdown", 892 | "metadata": {}, 893 | "source": [ 894 | "---\n", 895 | "### Exercise: Add Error Handling\n", 896 | "\n", 897 | "Currently [`01-add-error-handling.cu`](../edit/06-errors/01-add-error-handling.cu) compiles, runs, and prints that the elements of the array were not successfully doubled. The program does not, however, indicate that there are any errors within it. Refactor the application to handle CUDA errors so that you can learn what is wrong with the program and effectively debug it. You will need to investigate both synchronous errors potentially created when calling CUDA functions, as well as asynchronous errors potentially created while a CUDA kernel is executing. Refer to [the solution](../edit/06-errors/solutions/01-add-error-handling-solution.cu) if you get stuck." 898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": 2, 903 | "metadata": {}, 904 | "outputs": [ 905 | { 906 | "name": "stdout", 907 | "output_type": "stream", 908 | "text": [ 909 | "Error: invalid configuration argument\r\n", 910 | "All elements were doubled? FALSE\r\n" 911 | ] 912 | } 913 | ], 914 | "source": [ 915 | "!nvcc -arch=sm_70 -o add-error-handling 06-errors/01-add-error-handling.cu -run" 916 | ] 917 | }, 918 | { 919 | "cell_type": "markdown", 920 | "metadata": {}, 921 | "source": [ 922 | "---\n", 923 | "### CUDA Error Handling Function\n", 924 | "\n", 925 | "It can be helpful to create a macro that wraps CUDA function calls for checking errors. 
Here is an example, feel free to use it in the remaining exercises:\n", 926 | "\n", 927 | "```cpp\n", 928 | "#include \n", 929 | "#include \n", 930 | "\n", 931 | "inline cudaError_t checkCuda(cudaError_t result)\n", 932 | "{\n", 933 | " if (result != cudaSuccess) {\n", 934 | " fprintf(stderr, \"CUDA Runtime Error: %s\\n\", cudaGetErrorString(result));\n", 935 | " assert(result == cudaSuccess);\n", 936 | " }\n", 937 | " return result;\n", 938 | "}\n", 939 | "\n", 940 | "int main()\n", 941 | "{\n", 942 | "\n", 943 | "/*\n", 944 | " * The macro can be wrapped around any function returning\n", 945 | " * a value of type `cudaError_t`.\n", 946 | " */\n", 947 | "\n", 948 | " checkCuda( cudaDeviceSynchronize() )\n", 949 | "}\n", 950 | "```" 951 | ] 952 | }, 953 | { 954 | "cell_type": "markdown", 955 | "metadata": {}, 956 | "source": [ 957 | "---\n", 958 | "## Summary\n", 959 | "\n", 960 | "At this point in time you have accomplished all of the following lab objectives:\n", 961 | "\n", 962 | "- Write, compile, and run C/C++ programs that both call CPU functions and **launch** GPU **kernels**.\n", 963 | "- Control parallel **thread hierarchy** using **execution configuration**.\n", 964 | "- Refactor serial loops to execute their iterations in parallel on a GPU.\n", 965 | "- Allocate and free memory available to both CPUs and GPUs.\n", 966 | "- Handle errors generated by CUDA code.\n", 967 | "\n", 968 | "Now you will complete the final objective of the lab:\n", 969 | "\n", 970 | "- Accelerate CPU-only applications." 971 | ] 972 | }, 973 | { 974 | "cell_type": "markdown", 975 | "metadata": {}, 976 | "source": [ 977 | "---\n", 978 | "### Final Exercise: Accelerate Vector Addition Application\n", 979 | "\n", 980 | "The following challenge will give you an opportunity to use everything that you have learned thus far in the lab. It involves accelerating a CPU-only vector addition program, which, while not the most sophisticated program, will give you an opportunity to focus on what you have learned about GPU-accelerating an application with CUDA. After completing this exercise, if you have time and interest, continue on to the *Advanced Content* section for some challenges that involve more complex code bases.\n", 981 | "\n", 982 | "[`01-vector-add.cu`](../edit/07-vector-add/01-vector-add.cu) contains a functioning CPU-only vector addition application. Accelerate its `addVectorsInto` function to run as a CUDA kernel on the GPU and to do its work in parallel. Consider the following that need to occur, and refer to [the solution](../edit/07-vector-add/solutions/01-vector-add-solution.cu) if you get stuck.\n", 983 | "\n", 984 | "- Augment the `addVectorsInto` definition so that it is a CUDA kernel.\n", 985 | "- Choose and utilize a working execution configuration so that `addVectorsInto` launches as a CUDA kernel.\n", 986 | "- Update memory allocations, and memory freeing to reflect that the 3 vectors `a`, `b`, and `result` need to be accessed by host and device code.\n", 987 | "- Refactor the body of `addVectorsInto`: it will be launched inside of a single thread, and only needs to do one thread's worth of work on the input vectors. Be certain the thread will never try to access elements outside the range of the input vectors, and take care to note whether or not the thread needs to do work on more than one element of the input vectors.\n", 988 | "- Add error handling in locations where CUDA code might otherwise silently fail." 
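The final bullet above brings together the error-handling patterns from the previous sections. Here is a hedged sketch of where such checks can sit in a small program; it assumes the `checkCuda` helper defined above, and the `writeOnes` kernel is hypothetical rather than taken from the exercise's source files:

```cpp
#include <stdio.h>
#include <assert.h>

// Assumes the `checkCuda` helper from the section above is in scope.

__global__ void writeOnes(int *a, int N)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < N) a[i] = 1;
}

int main()
{
  int N = 1000;
  size_t size = N * sizeof(int);

  int *a;
  checkCuda( cudaMallocManaged(&a, size) );   // Synchronous allocation errors

  size_t threads_per_block = 256;
  size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block;

  writeOnes<<<number_of_blocks, threads_per_block>>>(a, N);
  checkCuda( cudaGetLastError() );            // Errors detectable at launch time,
                                              // such as an invalid configuration

  checkCuda( cudaDeviceSynchronize() );       // Errors raised while the kernel runs

  checkCuda( cudaFree(a) );                   // Synchronous deallocation errors
}
```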
989 | ] 990 | }, 991 | { 992 | "cell_type": "code", 993 | "execution_count": 3, 994 | "metadata": {}, 995 | "outputs": [ 996 | { 997 | "name": "stdout", 998 | "output_type": "stream", 999 | "text": [ 1000 | "SUCCESS! All values added correctly.\r\n" 1001 | ] 1002 | } 1003 | ], 1004 | "source": [ 1005 | "!nvcc -arch=sm_70 -o vector-add 07-vector-add/01-vector-add.cu -run" 1006 | ] 1007 | }, 1008 | { 1009 | "cell_type": "markdown", 1010 | "metadata": {}, 1011 | "source": [ 1012 | "---\n", 1013 | "## Advanced Content\n", 1014 | "\n", 1015 | "The following exercises provide additional challenge for those with time and interest. They require the use of more advanced techniques, and provide less scaffolding. They are difficult and excellent for your development." 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "markdown", 1020 | "metadata": {}, 1021 | "source": [ 1022 | "---\n", 1023 | "## Grids and Blocks of 2 and 3 Dimensions\n", 1024 | "\n", 1025 | "Grids and blocks can be defined to have up to 3 dimensions. Defining them with multiple dimensions does not impact their performance in any way, but can be very helpful when dealing with data that has multiple dimensions, for example, 2d matrices. To define either grids or blocks with two or 3 dimensions, use CUDA's `dim3` type as such:\n", 1026 | "\n", 1027 | "```cpp\n", 1028 | "dim3 threads_per_block(16, 16, 1);\n", 1029 | "dim3 number_of_blocks(16, 16, 1);\n", 1030 | "someKernel<<>>();\n", 1031 | "```\n", 1032 | "\n", 1033 | "Given the example just above, the variables `gridDim.x`, `gridDim.y`, `blockDim.x`, and `blockDim.y` inside of `someKernel`, would all be equal to `16`." 1034 | ] 1035 | }, 1036 | { 1037 | "cell_type": "markdown", 1038 | "metadata": {}, 1039 | "source": [ 1040 | "---\n", 1041 | "### Exercise: Accelerate 2D Matrix Multiply Application\n", 1042 | "\n", 1043 | "The file [`01-matrix-multiply-2d.cu`](../edit/08-matrix-multiply/01-matrix-multiply-2d.cu) contains a host function `matrixMulCPU` which is fully functional. Your task is to build out the `matrixMulGPU` CUDA kernel. The source code will execute the matrix multiplication with both functions, and compare their answers to verify the correctness of the CUDA kernel you will be writing. Use the following guidelines to support your work and refer to [the solution](../edit/08-matrix-multiply/solutions/01-matrix-multiply-2d-solution.cu) if you get stuck:\n", 1044 | "\n", 1045 | "- You will need to create an execution configuration whose arguments are both `dim3` values with the `x` and `y` dimensions set to greater than `1`.\n", 1046 | "- Inside the body of the kernel, you will need to establish the running thread's unique index within the grid per usual, but you should establish two indices for the thread: one for the x axis of the grid, and one for the y axis of the grid." 
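A minimal sketch of the two-index pattern described in the guidelines above (the `printRowCol` kernel is hypothetical, not part of the exercise's source files); it pairs a `dim3` execution configuration with one index calculation per grid axis:

```cpp
#include <stdio.h>

__global__ void printRowCol()
{
  // One index per grid axis, each built the same way as in the 1D case.
  int col = threadIdx.x + blockIdx.x * blockDim.x;
  int row = threadIdx.y + blockIdx.y * blockDim.y;
  printf("row %d, col %d\n", row, col);
}

int main()
{
  dim3 threads_per_block(4, 4, 1);
  dim3 number_of_blocks(2, 2, 1);
  printRowCol<<<number_of_blocks, threads_per_block>>>();
  cudaDeviceSynchronize();
}
```

With 2 blocks of 4 threads along each axis, this launch covers an 8 x 8 arrangement of threads, each reporting its own row and column.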
1047 | ] 1048 | }, 1049 | { 1050 | "cell_type": "code", 1051 | "execution_count": null, 1052 | "metadata": {}, 1053 | "outputs": [], 1054 | "source": [ 1055 | "!nvcc -arch=sm_70 -o matrix-multiply-2d 08-matrix-multiply/01-matrix-multiply-2d.cu -run" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "markdown", 1060 | "metadata": {}, 1061 | "source": [ 1062 | "---\n", 1063 | "### Exercise: Accelerate A Thermal Conductivity Application\n", 1064 | "\n", 1065 | "In the following exercise, you will be accelerating an application that simulates the thermal conduction of silver in 2 dimensional space.\n", 1066 | "\n", 1067 | "Convert the `step_kernel_mod` function inside [`01-heat-conduction.cu`](../edit/09-heat/01-heat-conduction.cu) to execute on the GPU, and modify the `main` function to properly allocate data for use on CPU and GPU. The `step_kernel_ref` function executes on the CPU and is used for error checking. Because this code involves floating point calculations, different processors, or even simply reordering operations on the same processor, can result in slightly different results. For this reason the error checking code uses an error threshold, instead of looking for an exact match. Refer to [the solution](../edit/09-heat/solutions/01-heat-conduction-solution.cu) if you get stuck." 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "code", 1072 | "execution_count": null, 1073 | "metadata": {}, 1074 | "outputs": [], 1075 | "source": [ 1076 | "!nvcc -arch=sm_70 -o heat-conduction 09-heat/01-heat-conduction.cu -run" 1077 | ] 1078 | }, 1079 | { 1080 | "cell_type": "markdown", 1081 | "metadata": {}, 1082 | "source": [ 1083 | "> Credit for the original Heat Conduction CPU source code in this task is given to the article [An OpenACC Example Code for a C-based heat conduction code](http://docplayer.net/30411068-An-openacc-example-code-for-a-c-based-heat-conduction-code.html) from the University of Houston." 1084 | ] 1085 | } 1086 | ], 1087 | "metadata": { 1088 | "kernelspec": { 1089 | "display_name": "Python 3 (ipykernel)", 1090 | "language": "python", 1091 | "name": "python3" 1092 | }, 1093 | "language_info": { 1094 | "codemirror_mode": { 1095 | "name": "ipython", 1096 | "version": 3 1097 | }, 1098 | "file_extension": ".py", 1099 | "mimetype": "text/x-python", 1100 | "name": "python", 1101 | "nbconvert_exporter": "python", 1102 | "pygments_lexer": "ipython3", 1103 | "version": "3.9.7" 1104 | } 1105 | }, 1106 | "nbformat": 4, 1107 | "nbformat_minor": 1 1108 | } 1109 | -------------------------------------------------------------------------------- /Streaming and Visual Profiling.md: -------------------------------------------------------------------------------- 1 |

# Asynchronous Streaming, and Visual Profiling with CUDA C/C++

2 | 3 | ![CUDA](./images/CUDA_Logo.jpg) 4 | 5 | The CUDA toolkit ships with the **Nsight Systems**, a powerful GUI application to support the development of accelerated CUDA applications. Nsight Systems generates a graphical timeline of an accelerated application, with detailed information about CUDA API calls, kernel execution, memory activity, and the use of **CUDA streams**. 6 | 7 | In this lab, you will be using the Nsight Systems timeline to guide you in optimizing accelerated applications. Additionally, you will learn some intermediate CUDA programming techniques to support your work: **unmanaged memory allocation and migration**; **pinning**, or **page-locking** host memory; and **non-default concurrent CUDA streams**. 8 | 9 | At the end of this lab, you will be presented with an assessment, to accelerate and optimize a simple n-body particle simulator, which will allow you to demonstrate the skills you have developed during this course. Those of you who are able to accelerate the simulator while maintaining its correctness, will be granted a certification as proof of your competency. 10 | 11 | --- 12 | ## Prerequisites 13 | 14 | To get the most out of this lab you should already be able to: 15 | 16 | - Write, compile, and run C/C++ programs that both call CPU functions and launch GPU kernels. 17 | - Control parallel thread hierarchy using execution configuration. 18 | - Refactor serial loops to execute their iterations in parallel on a GPU. 19 | - Allocate and free CUDA Unified Memory. 20 | - Understand the behavior of Unified Memory with regard to page faulting and data migrations. 21 | - Use asynchronous memory prefetching to reduce page faults and data migrations. 22 | 23 | ## Objectives 24 | 25 | By the time you complete this lab you will be able to: 26 | 27 | - Use **Nsight Systems** to visually profile the timeline of GPU-accelerated CUDA applications. 28 | - Use Nsight Systems to identify, and exploit, optimization opportunities in GPU-accelerated CUDA applications. 29 | - Utilize CUDA streams for concurrent kernel execution in accelerated applications. 30 | - (**Optional Advanced Content**) Use manual device memory allocation, including allocating pinned memory, in order to asynchronously transfer data in concurrent CUDA streams. 31 | 32 | --- 33 | ## Running Nsight Systems 34 | 35 | For this interactive lab environment, we have set up a remote desktop you can access from your browser, where you will be able to launch and use Nsight Systems. 36 | 37 | You will begin by creating a report file for an already-existing vector addition program, after which you will be walked through a series of steps to open this report file in Nsight Systems, and to make the visual experience nice. 38 | 39 | ### Generate Report File 40 | 41 | [`01-vector-add.cu`](../edit/01-vector-add/01-vector-add.cu) (<-------- click on these links to source files to edit them in the browser) contains a working, accelerated, vector addition application. Use the code execution cell directly below (you can execute it, and any of the code execution cells in this lab by `CTRL` + clicking it) to compile and run it. You should see a message printed that indicates it was successful. 42 | 43 | 44 | ```python 45 | !nvcc -o vector-add-no-prefetch 01-vector-add/01-vector-add.cu -run 46 | ``` 47 | 48 | Success! All values calculated correctly. 49 | 50 | 51 | Next, use `nsys profile --stats=true` to create a report file that you will be able to open in the Nsight Systems visual profiler. 
Here we use the `-o` flag to give the report file a memorable name: 52 | 53 | 54 | ```python 55 | !nsys profile --stats=true -o vector-add-no-prefetch-report ./vector-add-no-prefetch 56 | ``` 57 | 58 | Warning: LBR backtrace method is not supported on this platform. DWARF backtrace method will be used. 59 | WARNING: The command line includes a target application therefore the CPU context-switch scope has been set to process-tree. 60 | Collecting data... 61 | Success! All values calculated correctly. 62 | Processing events... 63 | Saving temporary "/tmp/nsys-report-6a5a-6eb4-e9e5-8890.qdstrm" file to disk... 64 | 65 | Creating final output files... 66 | Processing [==============================================================100%] 67 | Saved report file to "/tmp/nsys-report-6a5a-6eb4-e9e5-8890.qdrep" 68 | Exporting 10235 events: [=================================================100%] 69 | 70 | Exported successfully to 71 | /tmp/nsys-report-6a5a-6eb4-e9e5-8890.sqlite 72 | 73 | 74 | CUDA API Statistics: 75 | 76 | Time(%) Total Time (ns) Num Calls Average Minimum Maximum Name 77 | ------- --------------- --------- ----------- --------- --------- --------------------- 78 | 61.9 246083765 3 82027921.7 17255 246026906 cudaMallocManaged 79 | 32.5 129021842 1 129021842.0 129021842 129021842 cudaDeviceSynchronize 80 | 5.6 22384952 3 7461650.7 6701513 8922765 cudaFree 81 | 0.0 45634 1 45634.0 45634 45634 cudaLaunchKernel 82 | 83 | 84 | 85 | CUDA Kernel Statistics: 86 | 87 | Time(%) Total Time (ns) Instances Average Minimum Maximum Name 88 | ------- --------------- --------- ----------- --------- --------- ------------------------------------------- 89 | 100.0 129012278 1 129012278.0 129012278 129012278 addVectorsInto(float*, float*, float*, int) 90 | 91 | 92 | 93 | CUDA Memory Operation Statistics (by time): 94 | 95 | Time(%) Total Time (ns) Operations Average Minimum Maximum Operation 96 | ------- --------------- ---------- ------- ------- ------- --------------------------------- 97 | 78.8 78911017 8314 9491.3 2143 128189 [CUDA Unified Memory memcpy HtoD] 98 | 21.2 21192428 768 27594.3 1567 159742 [CUDA Unified Memory memcpy DtoH] 99 | 100 | 101 | 102 | CUDA Memory Operation Statistics (by size in KiB): 103 | 104 | Total Operations Average Minimum Maximum Operation 105 | ---------- ---------- ------- ------- -------- --------------------------------- 106 | 393216.000 8314 47.296 4.000 764.000 [CUDA Unified Memory memcpy HtoD] 107 | 131072.000 768 170.667 4.000 1020.000 [CUDA Unified Memory memcpy DtoH] 108 | 109 | 110 | 111 | Operating System Runtime API Statistics: 112 | 113 | Time(%) Total Time (ns) Num Calls Average Minimum Maximum Name 114 | ------- --------------- --------- ---------- ------- --------- -------------- 115 | 83.3 1330490805 72 18479039.0 32582 100127978 poll 116 | 8.8 140558156 63 2231081.8 15224 20538800 sem_timedwait 117 | 6.3 99891506 676 147768.5 1070 18788038 ioctl 118 | 1.6 24916487 94 265069.0 1407 8868714 mmap 119 | 0.1 1469485 82 17920.5 4713 28255 open64 120 | 0.0 214257 3 71419.0 69421 74578 fgets 121 | 0.0 154636 4 38659.0 35417 46043 pthread_create 122 | 0.0 117941 25 4717.6 1563 25048 fopen 123 | 0.0 77800 11 7072.7 4800 11352 write 124 | 0.0 41449 11 3768.1 1443 5929 munmap 125 | 0.0 28638 5 5727.6 3301 8493 open 126 | 0.0 26997 18 1499.8 1113 3778 fclose 127 | 0.0 22073 6 3678.8 1091 9657 fgetc 128 | 0.0 21042 13 1618.6 1047 2401 read 129 | 0.0 14895 3 4965.0 1663 7476 fread 130 | 0.0 10453 7 1493.3 1002 3690 fcntl 131 | 0.0 9649 2 4824.5 4477 5172 socket 132 | 
0.0 7203 1 7203.0 7203 7203 pipe2 133 | 0.0 6243 1 6243.0 6243 6243 connect 134 | 0.0 2535 1 2535.0 2535 2535 bind 135 | 0.0 1461 1 1461.0 1461 1461 listen 136 | 137 | Report file moved to "/dli/task/vector-add-no-prefetch-report.qdrep" 138 | Report file moved to "/dli/task/vector-add-no-prefetch-report.sqlite" 139 | 140 | 141 | 142 | ### Open the Remote Desktop 143 | 144 | Run the next cell to generate a link to the remote desktop. Then, read the instructions that follow in the notebook. 145 | 146 | 147 | ```python 148 | %%js 149 | var port = ((window.location.port == 80) ? "" : (":"+window.location.port)); 150 | var url = 'http://' + window.location.hostname + port + '/nsight/vnc.html?resize=scale'; 151 | let a = document.createElement('a'); 152 | a.setAttribute('href', url) 153 | a.setAttribute('target', '_blank') 154 | a.innerText = 'Click to open remote desktop' 155 | element.append(a); 156 | ``` 157 | 158 | 159 | 160 | 161 | 162 | After clicking the _Connect_ button you will be asked for a password, which is `nvidia`. 163 | 164 | ### Open Nsight Systems 165 | 166 | To open Nsight Systems, double-click the "NVIDIA Nsight Systems" icon on the remote desktop. 167 | 168 | ![open nsight](images/open-nsight-sys.png) 169 | 170 | ### Enable Usage Reporting 171 | 172 | When prompted, click "Yes" to enable usage reporting: 173 | 174 | ![enable usage](images/enable_usage.png) 175 | 176 | ### Select GPU Rows on Top 177 | 178 | When prompted, select _GPU Rows on Top_ and then click _Okay_. 179 | 180 | ![gpu)_rows_on_top](images/gpu_on_top.png) 181 | 182 | ### Open the Report File 183 | 184 | Open this report file by visiting _File_ -> _Open_ from the Nsight Systems menu and select `vector-add-no-prefetch-report.qdrep`: 185 | 186 | ![open-report](images/open-report.png) 187 | 188 | ### Ignore Warnings/Errors 189 | 190 | You can close and ignore any warnings or errors you see, which are just a result of our particular remote desktop environment: 191 | 192 | ![ignore errors](images/ignore-error.png) 193 | 194 | ### Make More Room for the Timelines 195 | 196 | To make your experience nicer, full-screen the profiler, close the _Project Explorer_ and hide the *Events View*: 197 | 198 | ![make nice](images/make-nice.png) 199 | 200 | Your screen should now look like this: 201 | 202 | ![now nice](images/now-nice.png) 203 | 204 | ### Expand the CUDA Unified Memory Timelines 205 | 206 | Next, expand the _CUDA_ -> _Unified memory_ and _Context_ timelines, and close the _Threads_ timelines: 207 | 208 | ![open memory](images/open-memory.png) 209 | 210 | ### Observe Many Memory Transfers 211 | 212 | From a glance you can see that your application is taking about 1 second to run, and that also, during the time when the `addVectorsInto` kernel is running, that there is a lot of UM memory activity: 213 | 214 | ![memory and kernel](images/memory-and-kernel.png) 215 | 216 | Zoom into the memory timelines to see more clearly all the small memory transfers being caused by the on-demand memory page faults. A couple tips: 217 | 218 | 1. You can zoom in and out at any point of the timeline by holding `CTRL` while scrolling your mouse/trackpad 219 | 2. 
You can zoom into any section by click + dragging a rectangle around it, and then selecting _Zoom in_ 220 | 221 | Here's an example of zooming in to see the many small memory transfers: 222 | 223 | ![many transfers](images/many-transfers.png) 224 | 225 | --- 226 | ## Comparing Code Refactors Iteratively with Nsight Systems 227 | 228 | Now that you have Nsight Systems up and running and are comfortable moving around the timelines, you will be profiling a series of programs that were iteratively improved using techniques already familiar to you. Each time you profile, information in the timeline will give information supporting how you should next modify your code. Doing this will further increase your understanding of how various CUDA programming techniques affect application performance. 229 | 230 | ### Exercise: Compare the Timelines of Prefetching vs. Non-Prefetching 231 | 232 | [`01-vector-add-prefetch-solution.cu`](../edit/01-vector-add/solutions/01-vector-add-prefetch-solution.cu) refactors the vector addition application from above so that the 3 vectors needed by its `addVectorsInto` kernel are asynchronously prefetched to the active GPU device prior to launching the kernel (using [`cudaMemPrefetchAsync`](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8dc9199943d421bc8bc7f473df12e42)). Open the source code and identify where in the application these changes were made. 233 | 234 | After reviewing the changes, compile and run the refactored application using the code execution cell directly below. You should see its success message printed. 235 | 236 | 237 | ```python 238 | !nvcc -o vector-add-prefetch 01-vector-add/solutions/01-vector-add-prefetch-solution.cu -run 239 | ``` 240 | 241 | Success! All values calculated correctly. 242 | 243 | 244 | Now create a report file for this version of the application: 245 | 246 | 247 | ```python 248 | !nsys profile --stats=true -o vector-add-prefetch-report ./vector-add-prefetch 249 | ``` 250 | 251 | Warning: LBR backtrace method is not supported on this platform. DWARF backtrace method will be used. 252 | WARNING: The command line includes a target application therefore the CPU context-switch scope has been set to process-tree. 253 | Collecting data... 254 | Success! All values calculated correctly. 255 | Processing events... 256 | Saving temporary "/tmp/nsys-report-02b7-473d-4ff0-86b9.qdstrm" file to disk... 257 | 258 | Creating final output files... 
259 | Processing [==============================================================100%] 260 | Saved report file to "/tmp/nsys-report-02b7-473d-4ff0-86b9.qdrep" 261 | Exporting 2126 events: [==================================================100%] 262 | 263 | Exported successfully to 264 | /tmp/nsys-report-02b7-473d-4ff0-86b9.sqlite 265 | 266 | 267 | CUDA API Statistics: 268 | 269 | Time(%) Total Time (ns) Num Calls Average Minimum Maximum Name 270 | ------- --------------- --------- ---------- -------- --------- --------------------- 271 | 73.9 260466014 3 86822004.7 18188 260391147 cudaMallocManaged 272 | 16.7 58973167 1 58973167.0 58973167 58973167 cudaDeviceSynchronize 273 | 6.4 22708163 3 7569387.7 6837370 8952196 cudaFree 274 | 2.9 10368727 3 3456242.3 6106 10236373 cudaMemPrefetchAsync 275 | 0.0 33007 1 33007.0 33007 33007 cudaLaunchKernel 276 | 277 | 278 | 279 | CUDA Kernel Statistics: 280 | 281 | Time(%) Total Time (ns) Instances Average Minimum Maximum Name 282 | ------- --------------- --------- --------- ------- ------- ------------------------------------------- 283 | 100.0 1697611 1 1697611.0 1697611 1697611 addVectorsInto(float*, float*, float*, int) 284 | 285 | 286 | 287 | CUDA Memory Operation Statistics (by time): 288 | 289 | Time(%) Total Time (ns) Operations Average Minimum Maximum Operation 290 | ------- --------------- ---------- -------- ------- ------- --------------------------------- 291 | 75.7 65658107 192 341969.3 339868 344220 [CUDA Unified Memory memcpy HtoD] 292 | 24.3 21105829 768 27481.5 1631 160062 [CUDA Unified Memory memcpy DtoH] 293 | 294 | 295 | 296 | CUDA Memory Operation Statistics (by size in KiB): 297 | 298 | Total Operations Average Minimum Maximum Operation 299 | ---------- ---------- -------- -------- -------- --------------------------------- 300 | 393216.000 192 2048.000 2048.000 2048.000 [CUDA Unified Memory memcpy HtoD] 301 | 131072.000 768 170.667 4.000 1020.000 [CUDA Unified Memory memcpy DtoH] 302 | 303 | 304 | 305 | Operating System Runtime API Statistics: 306 | 307 | Time(%) Total Time (ns) Num Calls Average Minimum Maximum Name 308 | ------- --------------- --------- ---------- ------- --------- -------------- 309 | 79.8 1274388745 69 18469402.1 2354 100130974 poll 310 | 9.9 158140955 681 232218.7 1039 22553208 ioctl 311 | 7.8 125032904 57 2193559.7 13781 20810303 sem_timedwait 312 | 1.6 25390711 94 270113.9 1488 8895883 mmap 313 | 0.8 12179618 2 6089809.0 41011 12138607 sem_wait 314 | 0.1 1668057 82 20342.2 4819 40370 open64 315 | 0.0 216883 3 72294.3 70550 74923 fgets 316 | 0.0 199047 5 39809.4 32239 53739 pthread_create 317 | 0.0 124619 25 4984.8 1740 19616 fopen 318 | 0.0 112647 12 9387.3 4251 13515 write 319 | 0.0 50935 11 4630.5 1828 13674 munmap 320 | 0.0 28969 5 5793.8 3550 9457 open 321 | 0.0 28270 18 1570.6 1104 4745 fclose 322 | 0.0 23903 17 1406.1 1117 3800 fcntl 323 | 0.0 21582 13 1660.2 1072 2530 read 324 | 0.0 21571 6 3595.2 1104 9659 fgetc 325 | 0.0 13691 1 13691.0 13691 13691 pipe2 326 | 0.0 10171 2 5085.5 4670 5501 socket 327 | 0.0 9623 3 3207.7 2083 4027 fread 328 | 0.0 5704 1 5704.0 5704 5704 connect 329 | 0.0 2150 1 2150.0 2150 2150 bind 330 | 0.0 1550 1 1550.0 1550 1550 listen 331 | 332 | Report file moved to "/dli/task/vector-add-prefetch-report.qdrep" 333 | Report file moved to "/dli/task/vector-add-prefetch-report.sqlite" 334 | 335 | 336 | 337 | Open the report in Nsight Systems, leaving the previous report open for comparison. 
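If it helps while comparing the two timelines, here is a minimal sketch of the prefetching pattern used by the refactor (the vector names, size, and surrounding code are illustrative; consult the solution file for the exact code):

```cpp
#include <cuda_runtime.h>

int main()
{
  int deviceId;
  cudaGetDevice(&deviceId);            // The GPU the vectors will be prefetched to.

  const int N = 2<<24;                 // Illustrative size; the solution may differ.
  size_t size = N * sizeof(float);

  float *a, *b, *c;
  cudaMallocManaged(&a, size);
  cudaMallocManaged(&b, size);
  cudaMallocManaged(&c, size);

  // ... host-side initialization of a, b, and c goes here ...

  // Migrate each vector to the GPU before launching the kernel, so the kernel
  // does not trigger the many small on-demand HtoD transfers seen above.
  cudaMemPrefetchAsync(a, size, deviceId);
  cudaMemPrefetchAsync(b, size, deviceId);
  cudaMemPrefetchAsync(c, size, deviceId);

  // ... `addVectorsInto` launch, `cudaDeviceSynchronize`, and verification follow ...

  cudaFree(a); cudaFree(b); cudaFree(c);
}
```

With that pattern in mind, consider the following: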
338 | 339 | - How does the execution time compare to that of the `addVectorsInto` kernel prior to adding asynchronous prefetching? 340 | - Locate `cudaMemPrefetchAsync` in the *CUDA API* section of the timeline. 341 | - How have the memory transfers changed? 342 | 343 | 344 | ### Exercise: Profile Refactor with Launch Init in Kernel 345 | 346 | In the previous iteration of the vector addition application, the vector data is initialized on the CPU, and therefore needs to be migrated to the GPU before the `addVectorsInto` kernel can operate on it. 347 | 348 | The next iteration of the application, [01-init-kernel-solution.cu](../edit/02-init-kernel/solutions/01-init-kernel-solution.cu), has been refactored to initialize the data in parallel on the GPU. 349 | 350 | Since the initialization now takes place on the GPU, prefetching is done prior to initialization, rather than prior to the vector addition work. Review the source code to identify where these changes have been made. 351 | 352 | After reviewing the changes, compile and run the refactored application using the code execution cell directly below. You should see its success message printed. 353 | 354 | 355 | ```python 356 | !nvcc -o init-kernel 02-init-kernel/solutions/01-init-kernel-solution.cu -run 357 | ``` 358 | 359 | Success! All values calculated correctly. 360 | 361 | 362 | Now create a report file for this version of the application: 363 | 364 | 365 | ```python 366 | !nsys profile --stats=true -o init-kernel-report ./init-kernel 367 | ``` 368 | 369 | Warning: LBR backtrace method is not supported on this platform. DWARF backtrace method will be used. 370 | WARNING: The command line includes a target application therefore the CPU context-switch scope has been set to process-tree. 371 | Collecting data... 372 | Success! All values calculated correctly. 373 | Processing events... 374 | Saving temporary "/tmp/nsys-report-5f32-ace4-b6e6-efee.qdstrm" file to disk... 375 | 376 | Creating final output files... 
377 | Processing [==============================================================100%] 378 | Saved report file to "/tmp/nsys-report-5f32-ace4-b6e6-efee.qdrep" 379 | Exporting 1859 events: [==================================================100%] 380 | 381 | Exported successfully to 382 | /tmp/nsys-report-5f32-ace4-b6e6-efee.sqlite 383 | 384 | 385 | CUDA API Statistics: 386 | 387 | Time(%) Total Time (ns) Num Calls Average Minimum Maximum Name 388 | ------- --------------- --------- ---------- ------- --------- --------------------- 389 | 91.5 252124091 3 84041363.7 22287 252048304 cudaMallocManaged 390 | 6.4 17516020 3 5838673.3 829781 15599715 cudaFree 391 | 1.3 3558920 1 3558920.0 3558920 3558920 cudaDeviceSynchronize 392 | 0.8 2160427 3 720142.3 689193 759977 cudaMemPrefetchAsync 393 | 0.0 52789 4 13197.3 5045 35027 cudaLaunchKernel 394 | 395 | 396 | 397 | CUDA Kernel Statistics: 398 | 399 | Time(%) Total Time (ns) Instances Average Minimum Maximum Name 400 | ------- --------------- --------- --------- ------- ------- ------------------------------------------- 401 | 52.3 1867305 3 622435.0 617145 627960 initWith(float, float*, int) 402 | 47.7 1704011 1 1704011.0 1704011 1704011 addVectorsInto(float*, float*, float*, int) 403 | 404 | 405 | 406 | CUDA Memory Operation Statistics (by time): 407 | 408 | Time(%) Total Time (ns) Operations Average Minimum Maximum Operation 409 | ------- --------------- ---------- ------- ------- ------- --------------------------------- 410 | 100.0 21213610 768 27621.9 1630 160062 [CUDA Unified Memory memcpy DtoH] 411 | 412 | 413 | 414 | CUDA Memory Operation Statistics (by size in KiB): 415 | 416 | Total Operations Average Minimum Maximum Operation 417 | ---------- ---------- ------- ------- -------- --------------------------------- 418 | 131072.000 768 170.667 4.000 1020.000 [CUDA Unified Memory memcpy DtoH] 419 | 420 | 421 | 422 | Operating System Runtime API Statistics: 423 | 424 | Time(%) Total Time (ns) Num Calls Average Minimum Maximum Name 425 | ------- --------------- --------- ---------- ------- --------- -------------- 426 | 74.2 572547650 32 17892114.1 24139 100131583 poll 427 | 14.1 108530709 679 159839.0 1024 18504371 ioctl 428 | 8.8 68114580 27 2522762.2 21853 20563888 sem_timedwait 429 | 2.6 20412497 94 217154.2 1420 15532158 mmap 430 | 0.2 1543347 82 18821.3 5164 34251 open64 431 | 0.0 215139 3 71713.0 69654 75122 fgets 432 | 0.0 159600 4 39900.0 34073 45112 pthread_create 433 | 0.0 132350 25 5294.0 1558 25807 fopen 434 | 0.0 82525 11 7502.3 4169 13477 write 435 | 0.0 45493 11 4135.7 1508 8834 munmap 436 | 0.0 33023 5 6604.6 4063 9975 open 437 | 0.0 28753 18 1597.4 1081 5567 fclose 438 | 0.0 28142 13 2164.8 1317 4375 read 439 | 0.0 23888 6 3981.3 1045 10151 fgetc 440 | 0.0 21889 4 5472.3 2005 12049 fread 441 | 0.0 14733 8 1841.6 1063 6533 fcntl 442 | 0.0 13630 2 6815.0 6462 7168 socket 443 | 0.0 8312 1 8312.0 8312 8312 pipe2 444 | 0.0 7776 1 7776.0 7776 7776 connect 445 | 0.0 3028 1 3028.0 3028 3028 bind 446 | 0.0 1572 1 1572.0 1572 1572 listen 447 | 448 | Report file moved to "/dli/task/init-kernel-report.qdrep" 449 | Report file moved to "/dli/task/init-kernel-report.sqlite" 450 | 451 | 452 | 453 | Open the new report file in Nsight Systems and do the following: 454 | 455 | - Compare the application and `addVectorsInto` run times to the previous version of the application, how did they change? 456 | - Look at the *Kernels* section of the timeline. 
Which of the two kernels (`addVectorsInto` and the initialization kernel) is taking up the majority of the time on the GPU? 457 | - Which of the following does your application contain? 458 | - Data Migration (HtoD) 459 | - Data Migration (DtoH) 460 | 461 | ### Exercise: Profile Refactor with Asynchronous Prefetch Back to the Host 462 | 463 | Currently, the vector addition application verifies the work of the vector addition kernel on the host. The next refactor of the application, [01-prefetch-check-solution.cu](../edit/04-prefetch-check/solutions/01-prefetch-check-solution.cu), asynchronously prefetches the data back to the host for verification. 464 | 465 | After reviewing the changes, compile and run the refactored application using the code execution cell directly below. You should see its success message printed. 466 | 467 | 468 | ```python 469 | !nvcc -o prefetch-to-host 04-prefetch-check/solutions/01-prefetch-check-solution.cu -run 470 | ``` 471 | 472 | Success! All values calculated correctly. 473 | 474 | 475 | Now create a report file for this version of the application: 476 | 477 | 478 | ```python 479 | !nsys profile --stats=true -o prefetch-to-host-report ./prefetch-to-host 480 | ``` 481 | 482 | Warning: LBR backtrace method is not supported on this platform. DWARF backtrace method will be used. 483 | WARNING: The command line includes a target application therefore the CPU context-switch scope has been set to process-tree. 484 | Collecting data... 485 | Success! All values calculated correctly. 486 | Processing events... 487 | Saving temporary "/tmp/nsys-report-ccc2-31d8-5adf-38a3.qdstrm" file to disk... 488 | 489 | Creating final output files... 490 | Processing [==============================================================100%] 491 | Saved report file to "/tmp/nsys-report-ccc2-31d8-5adf-38a3.qdrep" 492 | Exporting 1158 events: [==================================================100%] 493 | 494 | Exported successfully to 495 | /tmp/nsys-report-ccc2-31d8-5adf-38a3.sqlite 496 | 497 | 498 | CUDA API Statistics: 499 | 500 | Time(%) Total Time (ns) Num Calls Average Minimum Maximum Name 501 | ------- --------------- --------- ---------- ------- --------- --------------------- 502 | 81.2 252404214 3 84134738.0 18510 252336626 cudaMallocManaged 503 | 14.1 43878374 4 10969593.5 663019 41769636 cudaMemPrefetchAsync 504 | 3.4 10660288 3 3553429.3 825844 8735064 cudaFree 505 | 1.2 3720615 1 3720615.0 3720615 3720615 cudaDeviceSynchronize 506 | 0.0 47051 4 11762.8 4583 30360 cudaLaunchKernel 507 | 508 | 509 | 510 | CUDA Kernel Statistics: 511 | 512 | Time(%) Total Time (ns) Instances Average Minimum Maximum Name 513 | ------- --------------- --------- --------- ------- ------- ------------------------------------------- 514 | 52.3 1868937 3 622979.0 619928 624664 initWith(float, float*, int) 515 | 47.7 1705355 1 1705355.0 1705355 1705355 addVectorsInto(float*, float*, float*, int) 516 | 517 | 518 | 519 | CUDA Memory Operation Statistics (by time): 520 | 521 | Time(%) Total Time (ns) Operations Average Minimum Maximum Operation 522 | ------- --------------- ---------- -------- ------- ------- --------------------------------- 523 | 100.0 20436902 64 319326.6 319068 320444 [CUDA Unified Memory memcpy DtoH] 524 | 525 | 526 | 527 | CUDA Memory Operation Statistics (by size in KiB): 528 | 529 | Total Operations Average Minimum Maximum Operation 530 | ---------- ---------- -------- -------- -------- --------------------------------- 531 | 131072.000 64 2048.000 2048.000 2048.000 [CUDA Unified 
Memory memcpy DtoH] 532 | 533 | 534 | 535 | Operating System Runtime API Statistics: 536 | 537 | Time(%) Total Time (ns) Num Calls Average Minimum Maximum Name 538 | ------- --------------- --------- ---------- ------- --------- -------------- 539 | 66.1 431214396 27 15970903.6 4223 100128871 poll 540 | 23.0 149919620 684 219180.7 1008 41692932 ioctl 541 | 8.5 55186369 24 2299432.0 21688 20879828 sem_timedwait 542 | 2.1 13671847 94 145445.2 1398 8672313 mmap 543 | 0.2 1574155 82 19197.0 6394 35079 open64 544 | 0.0 218311 3 72770.3 69399 76987 fgets 545 | 0.0 184190 4 46047.5 36837 54460 pthread_create 546 | 0.0 152442 25 6097.7 1623 25245 fopen 547 | 0.0 96930 11 8811.8 4477 15038 write 548 | 0.0 50963 12 4246.9 1589 7939 munmap 549 | 0.0 46298 5 9259.6 4515 16069 open 550 | 0.0 31548 18 1752.7 1149 5936 fclose 551 | 0.0 30225 13 2325.0 1444 3626 read 552 | 0.0 25703 6 4283.8 1085 12284 fgetc 553 | 0.0 19723 2 9861.5 8124 11599 socket 554 | 0.0 18406 12 1533.8 1022 5967 fcntl 555 | 0.0 14393 4 3598.3 2264 5370 fread 556 | 0.0 10377 1 10377.0 10377 10377 pipe2 557 | 0.0 9986 1 9986.0 9986 9986 connect 558 | 0.0 3235 1 3235.0 3235 3235 bind 559 | 0.0 1879 1 1879.0 1879 1879 listen 560 | 561 | Report file moved to "/dli/task/prefetch-to-host-report.qdrep" 562 | Report file moved to "/dli/task/prefetch-to-host-report.sqlite" 563 | 564 | 565 | 566 | Open this report file in Nsight Systems, and do the following: 567 | 568 | - Use the *Unified Memory* section of the timeline to compare and contrast the *Data Migration (DtoH)* events before and after adding prefetching back to the CPU. 569 | 570 | --- 571 | ## Concurrent CUDA Streams 572 | 573 | You are now going to learn about a new concept, **CUDA Streams**. After an introduction to them, you will return to using Nsight Systems to better evaluate their impact on your application's performance. 574 | 575 | The following slides present upcoming material visually, at a high level. Click through the slides before moving on to more detailed coverage of their topics in following sections. 576 | 577 | 578 | ```python 579 | %%HTML 580 | 581 |
582 | ``` 583 | 584 | 585 | 586 |
587 | 588 | 589 | 590 | In CUDA programming, a **stream** is a series of commands that execute in order. In CUDA applications, kernel execution, as well as some memory transfers, occurs within CUDA streams. Up until now, you have not been interacting explicitly with CUDA streams; in fact, your CUDA code has been executing its kernels inside a stream called *the default stream*. 591 | 592 | CUDA programmers can create and utilize non-default CUDA streams in addition to the default stream, and in doing so can perform multiple operations, such as executing multiple kernels, concurrently in different streams. Using multiple streams can add an additional layer of parallelization to your accelerated applications, and offers many more opportunities for application optimization. 593 | 594 | ### Rules Governing the Behavior of CUDA Streams 595 | 596 | There are a few rules concerning the behavior of CUDA streams that should be learned in order to utilize them effectively: 597 | 598 | - Operations within a given stream occur in order. 599 | - Operations in different non-default streams are not guaranteed to execute in any specific order relative to each other. 600 | - The default stream is blocking: it will both wait for all other streams to complete before running, and will block other streams from running until it completes. 601 | 602 | ### Creating, Utilizing, and Destroying Non-Default CUDA Streams 603 | 604 | The following code snippet demonstrates how to create, utilize, and destroy a non-default CUDA stream. Note that to launch a CUDA kernel in a non-default CUDA stream, the stream must be passed as the optional 4th argument of the execution configuration. Up until now you have only utilized the first 2 arguments of the execution configuration: 605 | 606 | ```cpp 607 | cudaStream_t stream; // CUDA streams are of type `cudaStream_t`. 608 | cudaStreamCreate(&stream); // Note that a pointer must be passed to `cudaStreamCreate`. 609 | 610 | someKernel<<<number_of_blocks, threads_per_block, 0, stream>>>(); // `stream` is passed as the 4th EC argument. 611 | 612 | cudaStreamDestroy(stream); // Note that a value, not a pointer, is passed to `cudaStreamDestroy`. 613 | ``` 614 | 615 | Outside the scope of this lab, but worth mentioning, is the optional 3rd argument of the execution configuration. This argument allows programmers to supply the number of bytes in **shared memory** (an advanced topic that will not be covered presently) to be dynamically allocated per block for this kernel launch. The default number of bytes allocated to shared memory per block is `0`, and for the remainder of the lab, you will be passing `0` as this value in order to expose the 4th argument, which is of immediate interest. 616 | 617 | ### Exercise: Predict Default Stream Behavior 618 | 619 | The [01-print-numbers](../edit/05-stream-intro/01-print-numbers.cu) application has a very simple `printNumber` kernel which accepts an integer and prints it. The kernel is only being executed with a single thread inside a single block. However, it is being executed 5 times, using a for-loop, and passing each launch the number of the for-loop's iteration. 620 | 621 | Compile and run [01-print-numbers](../edit/05-stream-intro/01-print-numbers.cu) using the code execution block below. You should see the numbers `0` through `4` printed. 
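For reference, a minimal sketch of what such a kernel and its launch loop might look like follows (the actual file may differ slightly); note that no stream argument is supplied, so every launch is issued into the default stream:

```cpp
#include <stdio.h>

__global__ void printNumber(int number)
{
  printf("%d\n", number);
}

int main()
{
  // 5 launches of a single-block, single-thread kernel. With no 4th execution
  // configuration argument, each launch goes into the blocking default stream.
  for (int i = 0; i < 5; ++i)
  {
    printNumber<<<1, 1>>>(i);
  }

  cudaDeviceSynchronize(); // Wait for all launches to finish before exiting.
}
```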
622 | 623 | 624 | ```python 625 | !nvcc -o print-numbers 05-stream-intro/01-print-numbers.cu -run 626 | ``` 627 | 628 | 0 629 | 1 630 | 2 631 | 3 632 | 4 633 | 634 | 635 | Knowing that by default kernels are executed in the default stream, would you expect that the 5 launches of the `print-numbers` program executed serially, or in parallel? You should be able to mention two features of the default stream to support your answer. Create a report file in the cell below and open it in Nsight Systems to confirm your answer. 636 | 637 | 638 | ```python 639 | !nsys profile --stats=true -o print-numbers-report ./print-numbers 640 | ``` 641 | 642 | Warning: LBR backtrace method is not supported on this platform. DWARF backtrace method will be used. 643 | WARNING: The command line includes a target application therefore the CPU context-switch scope has been set to process-tree. 644 | Collecting data... 645 | 0 646 | 1 647 | 2 648 | 3 649 | 4 650 | Processing events... 651 | Saving temporary "/tmp/nsys-report-e76f-0d9b-67ab-9e93.qdstrm" file to disk... 652 | 653 | Creating final output files... 654 | Processing [==============================================================100%] 655 | Saved report file to "/tmp/nsys-report-e76f-0d9b-67ab-9e93.qdrep" 656 | Exporting 1028 events: [==================================================100%] 657 | 658 | Exported successfully to 659 | /tmp/nsys-report-e76f-0d9b-67ab-9e93.sqlite 660 | 661 | 662 | CUDA API Statistics: 663 | 664 | Time(%) Total Time (ns) Num Calls Average Minimum Maximum Name 665 | ------- --------------- --------- ---------- ------- --------- --------------------- 666 | 99.9 231590227 5 46318045.4 3972 231570322 cudaLaunchKernel 667 | 0.1 274340 1 274340.0 274340 274340 cudaDeviceSynchronize 668 | 669 | 670 | 671 | CUDA Kernel Statistics: 672 | 673 | Time(%) Total Time (ns) Instances Average Minimum Maximum Name 674 | ------- --------------- --------- ------- ------- ------- ---------------- 675 | 100.0 274364 5 54872.8 53087 61695 printNumber(int) 676 | 677 | 678 | 679 | Operating System Runtime API Statistics: 680 | 681 | Time(%) Total Time (ns) Num Calls Average Minimum Maximum Name 682 | ------- --------------- --------- ---------- ------- --------- -------------- 683 | 67.2 230846288 14 16489020.6 24046 100129940 poll 684 | 30.9 106382468 668 159255.2 1101 18435339 ioctl 685 | 0.9 3145982 87 36160.7 1479 984888 mmap 686 | 0.6 1893699 82 23093.9 7394 43232 open64 687 | 0.2 631128 11 57375.3 14440 384134 sem_timedwait 688 | 0.1 217219 3 72406.3 69462 75986 fgets 689 | 0.0 170027 4 42506.8 34711 50220 pthread_create 690 | 0.0 165936 25 6637.4 1585 26231 fopen 691 | 0.0 103666 12 8638.8 4567 15562 write 692 | 0.0 36806 5 7361.2 4820 10487 open 693 | 0.0 31476 7 4496.6 1399 9667 munmap 694 | 0.0 30858 18 1714.3 1153 5721 fclose 695 | 0.0 24766 6 4127.7 1070 11018 fgetc 696 | 0.0 24370 13 1874.6 1013 2926 read 697 | 0.0 19169 2 9584.5 7529 11640 socket 698 | 0.0 17964 2 8982.0 5776 12188 fread 699 | 0.0 13661 9 1517.9 1003 4603 fcntl 700 | 0.0 8429 1 8429.0 8429 8429 pipe2 701 | 0.0 7592 1 7592.0 7592 7592 connect 702 | 0.0 2728 1 2728.0 2728 2728 bind 703 | 0.0 1577 1 1577.0 1577 1577 listen 704 | 705 | Report file moved to "/dli/task/print-numbers-report.qdrep" 706 | Report file moved to "/dli/task/print-numbers-report.sqlite" 707 | 708 | 709 | 710 | ### Exercise: Implement Concurrent CUDA Streams 711 | 712 | Both because all 5 kernel launches occurred in the same stream, you should not be surprised to have seen that the 5 kernels executed 
serially. Additionally, because the default stream is blocking, each kernel launch had to complete before the next launch could begin, which is also consistent with what you observed. 713 | 714 | Refactor [01-print-numbers](../edit/05-stream-intro/01-print-numbers.cu) so that each kernel launch occurs in its own non-default stream. Be sure to destroy the streams you create after they are no longer needed. Compile and run the refactored code with the code execution cell directly below. You should still see the numbers `0` through `4` printed, though not necessarily in ascending order. Refer to [the solution](../edit/05-stream-intro/solutions/01-print-numbers-solution.cu) if you get stuck. 715 | 716 | 717 | ```python 718 | !nvcc -o print-numbers-in-streams 05-stream-intro/01-print-numbers.cu -run 719 | ``` 720 | 721 | 0 722 | 1 723 | 2 724 | 3 725 | 4 726 | 727 | 728 | Now that you are using a different non-default stream for each of the 5 kernel launches, do you expect that they will run serially or in parallel? In addition to what you now know about streams, take into account how trivial the `printNumber` kernel is: even if you predict parallel runs, will the speed at which one kernel completes allow for complete overlap? 729 | 730 | After hypothesizing, open a new report file in Nsight Systems to view its actual behavior. You should notice that now, there are additional rows in the _CUDA_ section for each of the non-default streams you created: 731 | 732 | 733 | ```python 734 | !nsys profile --stats=true -o print-numbers-in-streams-report print-numbers-in-streams 735 | ``` 736 | 737 | Warning: LBR backtrace method is not supported on this platform. DWARF backtrace method will be used. 738 | WARNING: The command line includes a target application therefore the CPU context-switch scope has been set to process-tree. 739 | Collecting data... 740 | 0 741 | 1 742 | 2 743 | 3 744 | 4 745 | Processing events... 746 | Saving temporary "/tmp/nsys-report-80b0-dda7-f5cd-d58e.qdstrm" file to disk... 747 | 748 | Creating final output files... 
749 | Processing [==============================================================100%] 750 | Saved report file to "/tmp/nsys-report-80b0-dda7-f5cd-d58e.qdrep" 751 | Exporting 1031 events: [==================================================100%] 752 | 753 | Exported successfully to 754 | /tmp/nsys-report-80b0-dda7-f5cd-d58e.sqlite 755 | 756 | 757 | CUDA API Statistics: 758 | 759 | Time(%) Total Time (ns) Num Calls Average Minimum Maximum Name 760 | ------- --------------- --------- ---------- ------- --------- --------------------- 761 | 99.9 232188000 5 46437600.0 3983 232166656 cudaLaunchKernel 762 | 0.1 274624 1 274624.0 274624 274624 cudaDeviceSynchronize 763 | 764 | 765 | 766 | CUDA Kernel Statistics: 767 | 768 | Time(%) Total Time (ns) Instances Average Minimum Maximum Name 769 | ------- --------------- --------- ------- ------- ------- ---------------- 770 | 100.0 275516 5 55103.2 53183 62335 printNumber(int) 771 | 772 | 773 | 774 | Operating System Runtime API Statistics: 775 | 776 | Time(%) Total Time (ns) Num Calls Average Minimum Maximum Name 777 | ------- --------------- --------- ---------- ------- --------- -------------- 778 | 67.1 231071908 14 16505136.3 24669 100131150 poll 779 | 31.0 106625128 669 159379.9 1088 18340350 ioctl 780 | 0.9 3074756 87 35342.0 1324 942128 mmap 781 | 0.4 1499785 82 18290.1 5654 30185 open64 782 | 0.3 982950 11 89359.1 22281 659979 sem_timedwait 783 | 0.1 216922 3 72307.3 69507 75987 fgets 784 | 0.1 177391 4 44347.8 35740 52742 pthread_create 785 | 0.0 138453 25 5538.1 1542 25362 fopen 786 | 0.0 93945 12 7828.8 4387 12563 write 787 | 0.0 65779 9 7308.8 1866 33186 munmap 788 | 0.0 43884 7 6269.1 1070 16570 fgetc 789 | 0.0 34449 5 6889.8 4479 9616 open 790 | 0.0 31744 14 2267.4 1342 3974 read 791 | 0.0 28994 18 1610.8 1144 5509 fclose 792 | 0.0 15567 2 7783.5 6599 8968 socket 793 | 0.0 13499 7 1928.4 1027 6238 fcntl 794 | 0.0 9418 2 4709.0 3816 5602 fread 795 | 0.0 8740 1 8740.0 8740 8740 connect 796 | 0.0 7586 1 7586.0 7586 7586 pipe2 797 | 0.0 2759 1 2759.0 2759 2759 bind 798 | 0.0 1730 1 1730.0 1730 1730 listen 799 | 800 | Report file moved to "/dli/task/print-numbers-in-streams-report.qdrep" 801 | Report file moved to "/dli/task/print-numbers-in-streams-report.sqlite" 802 | 803 | 804 | 805 | ![streams print](images/streams-print.png) 806 | 807 | ### Exercise: Use Streams for Concurrent Data Initialization Kernels 808 | 809 | The vector addition application you have been working with, [01-prefetch-check-solution.cu](../edit/04-prefetch-check/solutions/01-prefetch-check-solution.cu), currently launches an initialization kernel 3 times - once each for each of the 3 vectors needing initialization for the `vectorAdd` kernel. Refactor it to launch each of the 3 initialization kernel launches in their own non-default stream. You should still see the success message print when compiling and running with the code execution cell below. Refer to [the solution](../edit/06-stream-init/solutions/01-stream-init-solution.cu) if you get stuck. 810 | 811 | 812 | ```python 813 | !nvcc -o init-in-streams 04-prefetch-check/solutions/01-prefetch-check-solution.cu -run 814 | ``` 815 | 816 | Success! All values calculated correctly. 817 | 818 | 819 | Open a report in Nsight Systems to confirm that your 3 initialization kernel launches are running in their own non-default streams, with some degree of concurrent overlap. 
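If you get stuck, the heart of the refactor looks something like the sketch below. The `initWith(float, float*, int)` signature is taken from the kernel statistics above; the grid/block variables and initialization values are illustrative stand-ins for whatever the solution file actually uses:

```cpp
// Create one non-default stream per initialization kernel so the three
// launches are free to overlap on the GPU rather than serializing in the
// default stream.
cudaStream_t stream1, stream2, stream3;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaStreamCreate(&stream3);

initWith<<<numberOfBlocks, threadsPerBlock, 0, stream1>>>(3, a, N);
initWith<<<numberOfBlocks, threadsPerBlock, 0, stream2>>>(4, b, N);
initWith<<<numberOfBlocks, threadsPerBlock, 0, stream3>>>(0, c, N);

// `addVectorsInto` can still be launched in the default stream, which will
// wait for the non-default streams above to finish before it runs.

cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
cudaStreamDestroy(stream3);
```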
820 | 821 | 822 | ```python 823 | !nsys profile --stats=true -o init-in-streams-report ./init-in-streams 824 | ``` 825 | 826 | Warning: LBR backtrace method is not supported on this platform. DWARF backtrace method will be used. 827 | WARNING: The command line includes a target application therefore the CPU context-switch scope has been set to process-tree. 828 | Collecting data... 829 | Success! All values calculated correctly. 830 | Processing events... 831 | Saving temporary "/tmp/nsys-report-8dc5-ecf0-b876-18b7.qdstrm" file to disk... 832 | 833 | Creating final output files... 834 | Processing [==============================================================100%] 835 | Saved report file to "/tmp/nsys-report-8dc5-ecf0-b876-18b7.qdrep" 836 | Exporting 1159 events: [==================================================100%] 837 | 838 | Exported successfully to 839 | /tmp/nsys-report-8dc5-ecf0-b876-18b7.sqlite 840 | 841 | 842 | CUDA API Statistics: 843 | 844 | Time(%) Total Time (ns) Num Calls Average Minimum Maximum Name 845 | ------- --------------- --------- ---------- ------- --------- --------------------- 846 | 80.9 244573306 3 81524435.3 19891 244517730 cudaMallocManaged 847 | 14.4 43671906 4 10917976.5 668161 41576927 cudaMemPrefetchAsync 848 | 3.5 10527865 3 3509288.3 842234 8773636 cudaFree 849 | 1.2 3561958 1 3561958.0 3561958 3561958 cudaDeviceSynchronize 850 | 0.0 41950 4 10487.5 4543 26473 cudaLaunchKernel 851 | 852 | 853 | 854 | CUDA Kernel Statistics: 855 | 856 | Time(%) Total Time (ns) Instances Average Minimum Maximum Name 857 | ------- --------------- --------- --------- ------- ------- ------------------------------------------- 858 | 52.3 1869641 3 623213.7 615769 628600 initWith(float, float*, int) 859 | 47.7 1702219 1 1702219.0 1702219 1702219 addVectorsInto(float*, float*, float*, int) 860 | 861 | 862 | 863 | CUDA Memory Operation Statistics (by time): 864 | 865 | Time(%) Total Time (ns) Operations Average Minimum Maximum Operation 866 | ------- --------------- ---------- -------- ------- ------- --------------------------------- 867 | 100.0 20436450 64 319319.5 319036 320412 [CUDA Unified Memory memcpy DtoH] 868 | 869 | 870 | 871 | CUDA Memory Operation Statistics (by size in KiB): 872 | 873 | Total Operations Average Minimum Maximum Operation 874 | ---------- ---------- -------- -------- -------- --------------------------------- 875 | 131072.000 64 2048.000 2048.000 2048.000 [CUDA Unified Memory memcpy DtoH] 876 | 877 | 878 | 879 | Operating System Runtime API Statistics: 880 | 881 | Time(%) Total Time (ns) Num Calls Average Minimum Maximum Name 882 | ------- --------------- --------- ---------- ------- --------- -------------- 883 | 67.3 440777481 28 15742052.9 23517 100130959 poll 884 | 21.9 143514488 682 210431.8 1015 41519041 ioctl 885 | 8.4 54711008 23 2378739.5 20803 20541787 sem_timedwait 886 | 2.0 13355760 94 142082.6 1431 8719833 mmap 887 | 0.3 1674280 82 20418.0 4831 37871 open64 888 | 0.0 213829 3 71276.3 69211 74445 fgets 889 | 0.0 148427 4 37106.8 32971 41242 pthread_create 890 | 0.0 116171 25 4646.8 1548 20856 fopen 891 | 0.0 90438 11 8221.6 4346 13884 write 892 | 0.0 40755 11 3705.0 1866 5626 munmap 893 | 0.0 27540 18 1530.0 1109 4290 fclose 894 | 0.0 27466 5 5493.2 3306 8088 open 895 | 0.0 23602 16 1475.1 1024 4901 fcntl 896 | 0.0 22748 13 1749.8 1069 2634 read 897 | 0.0 21690 6 3615.0 1122 9576 fgetc 898 | 0.0 12134 4 3033.5 1844 4366 fread 899 | 0.0 10615 2 5307.5 4884 5731 socket 900 | 0.0 7620 1 7620.0 7620 7620 pipe2 901 | 0.0 6242 1 6242.0 6242 
6242 connect 902 | 0.0 2160 1 2160.0 2160 2160 bind 903 | 0.0 1494 1 1494.0 1494 1494 listen 904 | 905 | Report file moved to "/dli/task/init-in-streams-report.qdrep" 906 | Report file moved to "/dli/task/init-in-streams-report.sqlite" 907 | 908 | 909 | 910 | --- 911 | ## Summary 912 | 913 | At this point in the lab you are able to: 914 | 915 | - Use **Nsight Systems** to visually profile the timeline of GPU-accelerated CUDA applications. 916 | - Use Nsight Systems to identify, and exploit, optimization opportunities in GPU-accelerated CUDA applications. 917 | - Utilize CUDA streams for concurrent kernel execution in accelerated applications. 918 | 919 | At this point you have a wealth of fundamental tools and techniques for accelerating CPU-only applications, and for then optimizing those accelerated applications. In the final exercise, you will have a chance to apply everything that you've learned to accelerate an [n-body](https://en.wikipedia.org/wiki/N-body_problem) simulator, which predicts the individual motions of a group of objects interacting with each other gravitationally. 920 | 921 | --- 922 | ## Final Exercise: Accelerate and Optimize an N-Body Simulator 923 | 924 | An [n-body](https://en.wikipedia.org/wiki/N-body_problem) simulator predicts the individual motions of a group of objects interacting with each other gravitationally. [01-nbody.cu](../edit/09-nbody/01-nbody.cu) contains a simple, though working, n-body simulator for bodies moving through 3-dimensional space. 925 | 926 | In its current CPU-only form, this application takes about 5 seconds to run on 4096 particles, and **20 minutes** to run on 65536 particles. Your task is to GPU-accelerate the program, retaining the correctness of the simulation. 927 | 928 | ### Considerations to Guide Your Work 929 | 930 | Here are some things to consider before beginning your work: 931 | 932 | - Especially for your first refactors, the logic of the application, the `bodyForce` function in particular, can and should remain largely unchanged: focus on accelerating it as easily as possible. 933 | - The code base contains a for-loop inside `main` for integrating the interbody forces calculated by `bodyForce` into the positions of the bodies in the system. This integration needs to occur after `bodyForce` runs, and needs to complete before the next call to `bodyForce`. Keep this in mind when choosing how and where to parallelize. 934 | - Use a **profile-driven** and iterative approach. 935 | - You are not required to add error handling to your code, but you might find it helpful, as you are responsible for your code working correctly. 936 | 937 | **Have Fun!** 938 | 939 | Use this cell to compile the nbody simulator. Although it is initially a CPU-only application, it does accurately simulate the positions of the particles. 940 | 941 | 942 | ```python 943 | !nvcc -std=c++11 -o nbody 09-nbody/01-nbody.cu 944 | ``` 945 | 946 | It is highly recommended you use the profiler to assist your work. Execute the following cell to generate a report file: 947 | 948 | 949 | ```python 950 | !nsys profile --stats=true --force-overwrite=true -o nbody-report ./nbody 951 | ``` 952 | 953 | Warning: LBR backtrace method is not supported on this platform. DWARF backtrace method will be used. 954 | WARNING: The command line includes a target application therefore the CPU context-switch scope has been set to process-tree. 955 | Collecting data... 956 | 0.041 Billion Interactions / second 957 | Processing events... 
958 | Saving temporary "/tmp/nsys-report-ec9f-8f8a-c6f2-19fc.qdstrm" file to disk... 959 | 960 | Creating final output files... 961 | Processing [==============================================================100%] 962 | Saved report file to "/tmp/nsys-report-ec9f-8f8a-c6f2-19fc.qdrep" 963 | Exporting 39 events: [====================================================100%] 964 | 965 | Exported successfully to 966 | /tmp/nsys-report-ec9f-8f8a-c6f2-19fc.sqlite 967 | 968 | 969 | Operating System Runtime API Statistics: 970 | 971 | Time(%) Total Time (ns) Num Calls Average Minimum Maximum Name 972 | ------- --------------- --------- ------- ------- ------- ------- 973 | 32.5 80266 1 80266.0 80266 80266 writev 974 | 30.3 74685 2 37342.5 7774 66911 fopen64 975 | 21.7 53575 1 53575.0 53575 53575 read 976 | 15.5 38284 2 19142.0 2357 35927 fclose 977 | 978 | Report file moved to "/dli/task/nbody-report.qdrep" 979 | Report file moved to "/dli/task/nbody-report.sqlite" 980 | 981 | 982 | 983 | Here we import a function that will run your `nbody` simulator against a various number of particles, checking for performance and accuracy. 984 | 985 | 986 | ```python 987 | from assessment import run_assessment 988 | ``` 989 | 990 | Execute the following cell to run and assess `nbody`: 991 | 992 | 993 | ```python 994 | run_assessment() 995 | ``` 996 | 997 | Running nbody simulator with 4096 bodies 998 | ---------------------------------------- 999 | 1000 | Application should run faster than 0.9s 1001 | Your application ran in: 4.1226s 1002 | Your application is not yet fast enough 1003 | 1004 | 1005 | ## Generate a Certificate 1006 | 1007 | If you passed the assessment, please return to the course page (shown below) and click the "ASSESS TASK" button, which will generate your certificate for the course. 1008 | 1009 | ![run_assessment](./images/run_assessment.png) 1010 | 1011 | ## Advanced Content 1012 | 1013 | The following sections, for those of you with time and interest, introduce more intermediate techniques involving some manual device memory management, and using non-default streams to overlap kernel execution and memory copies. 1014 | 1015 | After learning about each of the techniques below, try to further optimize your nbody simulation using these techniques. 1016 | 1017 | --- 1018 | ## Manual Device Memory Allocation and Copying 1019 | 1020 | While `cudaMallocManaged` and `cudaMemPrefetchAsync` are performant, and greatly simplify memory migration, sometimes it can be worth it to use more manual methods for memory allocation. This is particularly true when it is known that data will only be accessed on the device or host, and the cost of migrating data can be reclaimed in exchange for the fact that no automatic on-demand migration is needed. 1021 | 1022 | Additionally, using manual device memory management can allow for the use of non-default streams for overlapping data transfers with computational work. In this section you will learn some basic manual device memory allocation and copy techniques, before extending these techniques to overlap data copies with computational work. 1023 | 1024 | Here are some CUDA commands for manual device memory management: 1025 | 1026 | - `cudaMalloc` will allocate memory directly to the active GPU. This prevents all GPU page faults. In exchange, the pointer it returns is not available for access by host code. 1027 | - `cudaMallocHost` will allocate memory directly to the CPU. 
It also "pins" the memory, or page-locks it, which allows for asynchronous copying of the memory to and from a GPU. Too much pinned memory can interfere with CPU performance, so use it only with intention. Pinned memory should be freed with `cudaFreeHost`. 1028 | - `cudaMemcpy` can copy (not transfer) memory, either from host to device or from device to host. 1029 | 1030 | ### Manual Device Memory Management Example 1031 | 1032 | Here is a snippet of code that demonstrates the use of the above CUDA API calls. 1033 | 1034 | ```cpp 1035 | int *host_a, *device_a; // Define host-specific and device-specific arrays. 1036 | cudaMalloc(&device_a, size); // `device_a` is immediately available on the GPU. 1037 | cudaMallocHost(&host_a, size); // `host_a` is immediately available on the CPU, and is page-locked, or pinned. 1038 | 1039 | initializeOnHost(host_a, N); // No CPU page faulting since memory is already allocated on the host. 1040 | 1041 | // `cudaMemcpy` takes the destination, source, size, and a CUDA-provided variable for the direction of the copy. 1042 | cudaMemcpy(device_a, host_a, size, cudaMemcpyHostToDevice); 1043 | 1044 | kernel<<<number_of_blocks, threads_per_block>>>(device_a, N); 1045 | 1046 | // `cudaMemcpy` can also copy data from device to host. 1047 | cudaMemcpy(host_a, device_a, size, cudaMemcpyDeviceToHost); 1048 | 1049 | verifyOnHost(host_a, N); 1050 | 1051 | cudaFree(device_a); 1052 | cudaFreeHost(host_a); // Free pinned memory like this. 1053 | ``` 1054 | 1055 | ### Exercise: Manually Allocate Host and Device Memory 1056 | 1057 | The most recent iteration of the vector addition application, [01-stream-init-solution](../edit/06-stream-init/solutions/01-stream-init-solution.cu), uses `cudaMallocManaged` to allocate managed memory that is first used on the device by the initialization kernels, then on the device by the vector add kernel, and then by the host, where the memory is automatically transferred for verification. This is a sensible approach, but it is worth experimenting with some manual device memory allocation and copying to observe its impact on the application's performance. 1058 | 1059 | Refactor the [01-stream-init-solution](../edit/06-stream-init/solutions/01-stream-init-solution.cu) application to **not** use `cudaMallocManaged`. In order to do this you will need to do the following: 1060 | 1061 | - Replace calls to `cudaMallocManaged` with `cudaMalloc`. 1062 | - Create an additional vector that will be used for verification on the host. This is required since the memory allocated with `cudaMalloc` is not available to the host. Allocate this host vector with `cudaMallocHost`. 1063 | - After the `addVectorsInto` kernel completes, use `cudaMemcpy` to copy the vector with the addition results into the host vector you created with `cudaMallocHost`. 1064 | - Use `cudaFreeHost` to free the memory allocated with `cudaMallocHost`. 1065 | 1066 | Refer to [the solution](../edit/07-manual-malloc/solutions/01-manual-malloc-solution.cu) if you get stuck. 1067 | 1068 | 1069 | ```python 1070 | !nvcc -o vector-add-manual-alloc 06-stream-init/solutions/01-stream-init-solution.cu -run 1071 | ``` 1072 | 1073 | After completing the refactor, open a report in Nsight Systems, and use the timeline to do the following: 1074 | 1075 | - Notice that there is no longer a *Unified Memory* section of the timeline. 1076 | - Compare this timeline to that of the previous refactor, noting the run times of `cudaMalloc` in the current application vs. `cudaMallocManaged` in the previous. 
1077 | - Notice how in the current application, work on the initialization kernels does not start until a later time than it did in the previous iteration. Examination of the timeline will show the difference is the time taken by `cudaMallocHost`. This clearly points out the difference between memory transfers, and memory copies. When copying memory, as you are doing presently, the data will exist in 2 different places in the system. In the current case, the allocation of the 4th host-only vector incurs a small cost in performance, compared to only allocating 3 vectors in the previous iteration. 1078 | 1079 | --- 1080 | ## Using Streams to Overlap Data Transfers and Code Execution 1081 | 1082 | The following slides present upcoming material visually, at a high level. Click through the slides before moving on to more detailed coverage of their topics in following sections. 1083 | 1084 | 1085 | ```python 1086 | %%HTML 1087 | 1088 |
1089 | ``` 1090 | 1091 | In addition to `cudaMemcpy` there is `cudaMemcpyAsync`, which can asynchronously copy memory either from host to device or from device to host, as long as the host memory is pinned, which can be done by allocating it with `cudaMallocHost`. 1092 | 1093 | Similar to kernel execution, `cudaMemcpyAsync` is only asynchronous by default with respect to the host. It executes, by default, in the default stream and is therefore a blocking operation with regard to other CUDA operations occurring on the GPU. The `cudaMemcpyAsync` function, however, takes a non-default stream as an optional 5th argument. By passing it a non-default stream, the memory transfer can be concurrent with other CUDA operations occurring in other non-default streams. 1094 | 1095 | A common and useful pattern is to use a combination of pinned host memory, asynchronous memory copies in non-default streams, and kernel executions in non-default streams, to overlap memory transfers with kernel execution. 1096 | 1097 | In the following example, rather than waiting for the entire memory copy to complete before beginning work on the kernel, segments of the required data are copied and worked on, with each copy/work segment running in its own non-default stream. Using this technique, work on parts of the data can begin while memory transfers for later segments occur concurrently. Extra care must be taken when using this technique to calculate segment-specific values for the number of operations and the offset location inside arrays, as shown here: 1098 | 1099 | ```cpp 1100 | int N = 2<<24; 1101 | int size = N * sizeof(int); 1102 | 1103 | int *host_array; 1104 | int *device_array; 1105 | 1106 | cudaMallocHost(&host_array, size); // Pinned host memory allocation. 1107 | cudaMalloc(&device_array, size); // Allocation directly on the active GPU device. 1108 | 1109 | initializeData(host_array, N); // Assume this application needs to initialize on the host. 1110 | 1111 | const int numberOfSegments = 4; // This example demonstrates slicing the work into 4 segments. 1112 | int segmentN = N / numberOfSegments; // A value for a segment's worth of `N` is needed. 1113 | size_t segmentSize = size / numberOfSegments; // A value for a segment's worth of `size` is needed. 1114 | 1115 | // For each of the 4 segments... 1116 | for (int i = 0; i < numberOfSegments; ++i) 1117 | { 1118 | // Calculate the index where this particular segment should operate within the larger arrays. 1119 | int segmentOffset = i * segmentN; 1120 | 1121 | // Create a stream for this segment's worth of copy and work. 1122 | cudaStream_t stream; 1123 | cudaStreamCreate(&stream); 1124 | 1125 | // Asynchronously copy segment's worth of pinned host memory to device over non-default stream. 1126 | cudaMemcpyAsync(&device_array[segmentOffset], // Take care to access correct location in array. 1127 | &host_array[segmentOffset], // Take care to access correct location in array. 1128 | segmentSize, // Only copy a segment's worth of memory. 1129 | cudaMemcpyHostToDevice, 1130 | stream); // Provide optional argument for non-default stream. 1131 | 1132 | // Execute segment's worth of work over same non-default stream as memory copy. 1133 | kernel<<<number_of_blocks, threads_per_block, 0, stream>>>(&device_array[segmentOffset], segmentN); 1134 | 1135 | // `cudaStreamDestroy` will return immediately (is non-blocking), but will not actually destroy stream until 1136 | // all stream operations are complete. 
1137 | cudaStreamDestroy(stream); 1138 | } 1139 | ``` 1140 | 1141 | ### Exercise: Overlap Kernel Execution and Memory Copy Back to Host 1142 | 1143 | The most recent iteration of the vector addition application, [01-manual-malloc-solution.cu](../edit/07-manual-malloc/solutions/01-manual-malloc-solution.cu), is currently performing all of its vector addition work on the GPU before copying the memory back to the host for verification. 1144 | 1145 | Refactor [01-manual-malloc-solution.cu](../edit/07-manual-malloc/solutions/01-manual-malloc-solution.cu) to perform the vector addition in 4 segments, in non-default streams, so that asynchronous memory copies can begin before waiting for all vector addition work to complete. Refer to [the solution](../edit/08-overlap-xfer/solutions/01-overlap-xfer-solution.cu) if you get stuck. 1146 | 1147 | 1148 | ```python 1149 | !nvcc -o vector-add-manual-alloc 07-manual-malloc/solutions/01-manual-malloc-solution.cu -run 1150 | ``` 1151 | 1152 | After completing the refactor, open a report in Nsight Systems, and use the timeline to do the following: 1153 | 1154 | - Note when the device-to-host memory transfers begin: is it before or after all kernel work has completed? 1155 | - Notice that the 4 memory copy segments themselves do not overlap. Even in separate non-default streams, only one memory transfer in a given direction (DtoH here) can occur at a time. The performance gain here comes from the ability to start the transfers earlier than would otherwise be possible, and it is not hard to imagine that in an application doing a less trivial amount of work than a simple addition operation, the memory copies would not only start earlier but would also overlap with kernel execution. 1156 | --------------------------------------------------------------------------------