├── 16_TensorCores ├── src │ └── main.cu └── CMakeLists.txt ├── 00_MyGPU ├── CMakeLists.txt └── src │ └── main.cu ├── 01_HelloGPU ├── CMakeLists.txt └── src │ └── main.cu ├── 06_MemoryBasics ├── CMakeLists.txt └── src │ └── main.cu ├── 14_ManagedMemory ├── CMakeLists.txt └── src │ └── main.cu ├── 03_GridConfiguration ├── CMakeLists.txt └── src │ └── main.cu ├── 02_HostDeviceFunctions ├── CMakeLists.txt └── src │ └── main.cu ├── 09_Streams ├── CMakeLists.txt └── src │ └── main.cu ├── 10_Events ├── CMakeLists.txt └── src │ └── main.cu ├── 15_GraphAPI ├── CMakeLists.txt └── src │ └── main.cu ├── 08_Reductions ├── CMakeLists.txt └── src │ └── main.cu ├── 11_PinnedMemory ├── CMakeLists.txt └── src │ └── main.cu ├── 13_MemoryFences ├── CMakeLists.txt └── src │ └── main.cu ├── 12_ErrorHandling ├── CMakeLists.txt └── src │ └── main.cu ├── 07_Synchronization ├── CMakeLists.txt └── src │ └── main.cu ├── 17_CooperativeGroups ├── CMakeLists.txt └── src │ └── main.cu ├── .gitignore ├── README.md ├── 18_StandardLibrary ├── CMakeLists.txt └── src │ └── main.cu ├── 04_LegacyScheduling ├── CMakeLists.txt └── src │ └── main.cu ├── 05_IndependentThreadScheduling ├── CMakeLists.txt └── src │ └── main.cu ├── CMakeLists.txt └── shared └── include └── utility.h /16_TensorCores/src/main.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CUDA-Tutorial/CodeSamples/HEAD/16_TensorCores/src/main.cu -------------------------------------------------------------------------------- /00_MyGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(00_MyGPU 2 | src/main.cu 3 | ) 4 | 5 | target_include_directories(00_MyGPU PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /01_HelloGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(01_HelloGPU 2 | src/main.cu 3 | ) 4 | 5 | target_include_directories(01_HelloGPU PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /06_MemoryBasics/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(06_MemoryBasics 2 | src/main.cu 3 | ) 4 | 5 | target_include_directories(06_MemoryBasics PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /14_ManagedMemory/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(14_ManagedMemory 2 | src/main.cu 3 | ) 4 | 5 | target_include_directories(14_ManagedMemory PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /03_GridConfiguration/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(03_GridConfiguration 2 | src/main.cu 3 | ) 4 | 5 | target_include_directories(03_GridConfiguration PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /02_HostDeviceFunctions/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(02_HostDeviceFunctions 2 | src/main.cu 3 | ) 4 | 5 | target_include_directories(02_HostDeviceFunctions PRIVATE 
${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /09_Streams/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(09_Streams 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(09_Streams PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /10_Events/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(10_Events 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(10_Events PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /15_GraphAPI/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(15_GraphAPI 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(15_GraphAPI PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /08_Reductions/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(08_Reductions 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(08_Reductions PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /11_PinnedMemory/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(11_PinnedMemory 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(11_PinnedMemory PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /13_MemoryFences/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(13_MemoryFences 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(13_MemoryFences PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /12_ErrorHandling/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(12_ErrorHandling 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(12_ErrorHandling PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /07_Synchronization/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(07_Synchronization 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(07_Synchronization PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /17_CooperativeGroups/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(17_CooperativeGroups 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(17_CooperativeGroups PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | CMakeLists.txt.user 2 | CMakeCache.txt 3 | CMakeFiles 4 | CMakeScripts 5 | Testing 6 | Makefile 7 | cmake_install.cmake 8 | install_manifest.txt 9 | compile_commands.json 10 | CTestTestfile.cmake 11 | _deps 12 | build 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CUDA Tutorial Code Samples 2 | 3 | Here we provide the codebase for samples that accompany the tutorial "CUDA and Applications to Task-based Programming". 4 | 5 | Requirements: 6 | 7 | * Recent Clang/GCC/Microsoft Visual C++ 8 | * CMake 3.20 (**Ubuntu** users please update!) 9 | * CUDA capable GPU with compute capability 5.2 or later 10 | * CUDA Toolkit 9.0 or later 11 | 12 | Recommended: 13 | 14 | * GCC 10/Microsoft Visual C++ 2019 or later 15 | * Nsight Systems 16 | * Nsight Compute 17 | * CUDA capable GPU with compute capability 7.0 or later 18 | * CUDA Toolkit 11.0 or later -------------------------------------------------------------------------------- /18_StandardLibrary/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(ARCHES_SECURED ${CMAKE_CUDA_ARCHITECTURES}) 2 | set(CMAKE_CUDA_ARCHITECTURES "") 3 | 4 | set(ITS_ARCH_FOUND FALSE) 5 | foreach(ARCH ${ARCHES_SECURED}) 6 | if (ARCH GREATER_EQUAL 70) 7 | set(ITS_ARCH_FOUND TRUE) 8 | list(APPEND CMAKE_CUDA_ARCHITECTURES ${ARCH}) 9 | endif() 10 | endforeach() 11 | 12 | if(${ITS_ARCH_FOUND}) 13 | add_executable(18_StandardLibrary 14 | src/main.cu 15 | ) 16 | target_include_directories(18_StandardLibrary PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 17 | else() 18 | message("Not building sample 18 - CMAKE_CUDA_ARCHITECTURES do not include ITS support (>=70)") 19 | message("To define more than one architecture, separate them with semicolon (e.g., 60;70)") 20 | endif() -------------------------------------------------------------------------------- /04_LegacyScheduling/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(ARCHES_SECURED ${CMAKE_CUDA_ARCHITECTURES}) 2 | set(CMAKE_CUDA_ARCHITECTURES "") 3 | 4 | set(LEGACY_ARCH_FOUND FALSE) 5 | foreach(ARCH ${ARCHES_SECURED}) 6 | if (ARCH LESS_EQUAL 60) 7 | set(LEGACY_ARCH_FOUND TRUE) 8 | list(APPEND CMAKE_CUDA_ARCHITECTURES ${ARCH}) 9 | endif() 10 | endforeach() 11 | 12 | if(${LEGACY_ARCH_FOUND}) 13 | add_executable(04_LegacyScheduling 14 | src/main.cu 15 | ../shared/include/utility.h 16 | ) 17 | target_include_directories(04_LegacyScheduling PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 18 | else() 19 | message("Not building sample 04 - CMAKE_CUDA_ARCHITECTURES do not include legacy (<=60)") 20 | message("To define more than one architecture, separate them with semicolon (e.g., 60;70)") 21 | endif() -------------------------------------------------------------------------------- /05_IndependentThreadScheduling/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(ARCHES_SECURED ${CMAKE_CUDA_ARCHITECTURES}) 2 | set(CMAKE_CUDA_ARCHITECTURES "") 3 | 4 | set(ITS_ARCH_FOUND FALSE) 5 | foreach(ARCH ${ARCHES_SECURED}) 6 | if (ARCH GREATER_EQUAL 70) 7 | set(ITS_ARCH_FOUND TRUE) 8 | list(APPEND CMAKE_CUDA_ARCHITECTURES ${ARCH}) 9 | endif() 10 | endforeach() 11 | 12 | 
if(${ITS_ARCH_FOUND}) 13 | add_executable(05_IndependentThreadScheduling 14 | src/main.cu 15 | ../shared/include/utility.h 16 | ) 17 | target_include_directories(05_IndependentThreadScheduling PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 18 | else() 19 | message("Not building sample 05 - CMAKE_CUDA_ARCHITECTURES do not include ITS support (>=70)") 20 | message("To define more than one architecture, separate them with semicolon (e.g., 60;70)") 21 | endif() -------------------------------------------------------------------------------- /01_HelloGPU/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void HelloGPU() 5 | { 6 | // Print a simple message from the GPU 7 | printf("Hello from the GPU!\n"); 8 | } 9 | 10 | int main() 11 | { 12 | std::cout << "==== Sample 01 - Hello GPU ====\n" << std::endl; 13 | // Expected output: 12x "Hello from the GPU!\n" 14 | 15 | // Launch a kernel with 1 block that has 12 threads 16 | HelloGPU<<<1, 12>>>(); 17 | 18 | /* 19 | Synchronize with GPU to wait for printf to finish. 20 | Results of printf are buffered and copied back to 21 | the CPU for I/O after the kernel has finished. 22 | */ 23 | cudaDeviceSynchronize(); 24 | return 0; 25 | } 26 | 27 | /* 28 | Exercises: 29 | 1) Change the message that is printed by the kernel 30 | 2) Write a different kernel (different name, different message) 31 | 3) Call the different kernels multiple times 32 | */ -------------------------------------------------------------------------------- /16_TensorCores/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(ARCHES_SECURED ${CMAKE_CUDA_ARCHITECTURES}) 2 | set(CMAKE_CUDA_ARCHITECTURES "") 3 | 4 | set(ITS_ARCH_FOUND FALSE) 5 | foreach(ARCH ${ARCHES_SECURED}) 6 | if (ARCH GREATER_EQUAL 70) 7 | set(ITS_ARCH_FOUND TRUE) 8 | list(APPEND CMAKE_CUDA_ARCHITECTURES ${ARCH}) 9 | endif() 10 | endforeach() 11 | 12 | if(${ITS_ARCH_FOUND}) 13 | add_executable(16_TensorCores 14 | src/main.cu 15 | ../shared/include/utility.h 16 | ) 17 | target_include_directories(16_TensorCores PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 18 | if (MSVC) 19 | target_link_libraries(16_TensorCores cublas.lib) 20 | else() 21 | target_link_libraries(16_TensorCores cublas) 22 | endif() 23 | else() 24 | message("Not building sample 16 - CMAKE_CUDA_ARCHITECTURES do not include ITS support (>=70)") 25 | message("To define more than one architecture, separate them with semicolon (e.g., 60;70)") 26 | endif() -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | set(CMAKE_CXX_STANDARD 17) 4 | set(CMAKE_CXX_EXTENSIONS OFF) 5 | set(CMAKE_CUDA_STANDARD 17) 6 | 7 | project(TutorialSamples LANGUAGES CUDA CXX) 8 | 9 | add_subdirectory(00_MyGPU) 10 | add_subdirectory(01_HelloGPU) 11 | add_subdirectory(02_HostDeviceFunctions) 12 | add_subdirectory(03_GridConfiguration) 13 | add_subdirectory(04_LegacyScheduling) 14 | add_subdirectory(05_IndependentThreadScheduling) 15 | add_subdirectory(06_MemoryBasics) 16 | add_subdirectory(07_Synchronization) 17 | add_subdirectory(08_Reductions) 18 | add_subdirectory(09_Streams) 19 | add_subdirectory(10_Events) 20 | add_subdirectory(11_PinnedMemory) 21 | add_subdirectory(12_ErrorHandling) 22 | add_subdirectory(13_MemoryFences) 23 | add_subdirectory(14_ManagedMemory) 24 | 
add_subdirectory(15_GraphAPI) 25 | add_subdirectory(16_TensorCores) 26 | add_subdirectory(17_CooperativeGroups) 27 | add_subdirectory(18_StandardLibrary) 28 | 29 | -------------------------------------------------------------------------------- /04_LegacyScheduling/src/main.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime_api.h> 2 | #include <iostream> 3 | #include "../../shared/include/utility.h" 4 | 5 | int main() 6 | { 7 | std::cout << "==== Sample 04 - Legacy Thread Scheduling ====\n" << std::endl; 8 | /* 9 | This code will launch a particular test kernel. 10 | It will launch 4 threads in total. 11 | The program code is structured such that each 12 | thread enters one of 4 possible branches and then 13 | atomically increments a GPU variable N times: 14 | 15 | .---- N operations by Thread 0 16 | ----X 17 | / '---- N operations by Thread 1 18 | ----X 19 | \ .---- N operations by Thread 2 20 | ----X 21 | '---- N operations by Thread 3 22 | 23 | Each thread will document consecutive ranges of 24 | values it observed for the incremented variable. 25 | Basically, this will give us an idea how threads 26 | take turns running in this branching scenario. 27 | 28 | Expected output: 4 consecutive ranges, one for 29 | each thread, taking 128 consecutive turns until 30 | they have completed their N steps. 31 | 32 | Disclaimer: behavior depends somewhat on the compiler's 33 | effort to optimize code. Results may vary. 34 | */ 35 | 36 | constexpr int N = 128; 37 | // Using a utility function for demonstration 38 | samplesutil::run2NestedBranchesForNSteps(N); 39 | return 0; 40 | } 41 | -------------------------------------------------------------------------------- /03_GridConfiguration/src/main.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime_api.h> 2 | #include <iostream> 3 | 4 | __global__ void PrintIDs() 5 | { 6 | // Use built-in variables blockIdx and threadIdx 7 | const auto tID = threadIdx; 8 | const auto bID = blockIdx; 9 | printf("Block Id: %d,%d - Thread Id: %d,%d\n", bID.x, bID.y, tID.x, tID.y); 10 | } 11 | 12 | int main() 13 | { 14 | std::cout << "==== Sample 03 - Grid Configurations ====\n" << std::endl; 15 | /* 16 | Expected output: 17 | Block IDs and Thread IDs for two separate grids 18 | */ 19 | 20 | std::cout << "Small grid: \n"; 21 | // Configure the grid and block dimensions via built-in struct dim3 (X,Y,Z) 22 | const dim3 gridSize_small{ 1, 1, 1 }; 23 | const dim3 blockSize_small{ 4, 4, 1 }; 24 | 25 | // Launch kernel with custom grid 26 | PrintIDs<<<gridSize_small, blockSize_small>>>(); 27 | 28 | // Need to synchronize here to have the GPU and CPU printouts in the correct order 29 | cudaDeviceSynchronize(); 30 | 31 | std::cout << "\nLarger grid: \n"; 32 | const dim3 gridSize_large{ 2, 2, 1 }; 33 | const dim3 blockSize_large{ 16, 16, 1 }; 34 | PrintIDs<<<gridSize_large, blockSize_large>>>(); 35 | cudaDeviceSynchronize(); 36 | 37 | return 0; 38 | } 39 | 40 | /* 41 | Exercises: 42 | 1) Launch a 1D grid with 2D blocks (e.g., 4 x (4 x 4)) 43 | 2) Launch a 2D grid with 2D blocks (e.g., (4 x 4) x (4 x 4)) 44 | 3) Launch a grid where either the grid or block are 3D 45 | 4) Can you find out what the limits are to how many blocks/threads you can launch? 
46 | */ -------------------------------------------------------------------------------- /05_IndependentThreadScheduling/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "../../shared/include/utility.h" 4 | 5 | int main() 6 | { 7 | std::cout << "==== Sample 05 - Independent Thread Scheduling ====\n" << std::endl; 8 | /* 9 | This code will launch a particular test kernel. 10 | It will launch 4 threads in total. 11 | The program code is structured such that each 12 | thread enters one of 4 possible branches and then 13 | atomically increments a GPU variable N times: 14 | 15 | .---- N operations by Thread 0 16 | ----X 17 | / '---- N operations by Thread 1 18 | ----X 19 | \ .---- N operations by Thread 2 20 | ----X 21 | '---- N operations by Thread 3 22 | 23 | Each thread will document consecutive ranges of 24 | values it observed for the incremented variable. 25 | Basically, this will give us an idea how threads 26 | take turns running in this branching scenario. 27 | 28 | Expected output: Many smaller observed value ranges! 29 | Independent thread scheduling is free to switch 30 | between branches to keep the compute units of the 31 | GPU busy with work. Thus, threads can alternate 32 | taking turns to complete their N steps each. 33 | 34 | Disclaimer: behavior is dependent on scheduling at 35 | runtime. Results may vary. 36 | */ 37 | 38 | constexpr int N = 128; 39 | // Using a utility function for demonstration 40 | samplesutil::run2NestedBranchesForNSteps(N); 41 | return 0; 42 | } 43 | -------------------------------------------------------------------------------- /00_MyGPU/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /* 5 | Before you use your GPU to do work, you should know the 6 | most essential things about its capabilities. 7 | */ 8 | int main() 9 | { 10 | // Count CUDA-capable devices on the system 11 | int numDevices; 12 | cudaGetDeviceCount(&numDevices); 13 | 14 | if (numDevices == 0) 15 | { 16 | std::cout << "You have no CUDA devices available!" << std::endl; 17 | return -1; 18 | } 19 | 20 | // Get the ID of the currently selected active CUDA device 21 | int device; 22 | cudaGetDevice(&device); 23 | 24 | // Fetch its properties 25 | cudaDeviceProp props; 26 | cudaGetDeviceProperties(&props, device); 27 | 28 | /* 29 | We only print the most fundamental properties here. cudaDeviceProp 30 | contains a long range of indicators to check for different things 31 | that your GPU may or may not support, as well as factors for 32 | performance. However, the most essential property to know about is 33 | the compute capability of the device. 34 | */ 35 | std::cout << "Model: " << props.name << std::endl; 36 | std::cout << "Compute capability: " << props.major << "." << props.minor << std::endl; 37 | std::cout << "Memory: " << props.totalGlobalMem / float(1 << 30) << " GiB" << std::endl; 38 | std::cout << "Multiprocessors: " << props.multiProcessorCount << std::endl; 39 | std::cout << "Clock rate: " << props.clockRate / float(1'000'000) << " GHz" << std::endl; 40 | 41 | return 0; 42 | } 43 | 44 | /* 45 | Exercises: 46 | 1) Change the behavior such that the properties are not just printed for one, but all available CUDA devices you have! 47 | (Even if you have just one) 48 | 2) Print a few more interesting properties and read up in the specification what they mean. 
49 | */ 50 | -------------------------------------------------------------------------------- /02_HostDeviceFunctions/src/main.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime_api.h> 2 | #include <iostream> 3 | 4 | // Define a function that will only be compiled for and called from host 5 | __host__ void HostOnly() 6 | { 7 | std::cout << "This function may only be called from the host" << std::endl; 8 | } 9 | 10 | // Define a function that will only be compiled for and called from device 11 | __device__ void DeviceOnly() 12 | { 13 | printf("This function may only be called from the device\n"); 14 | } 15 | 16 | // Define a function that will be compiled for both architectures 17 | __host__ __device__ float SquareAnywhere(float x) 18 | { 19 | return x * x; 20 | } 21 | 22 | // Call device and portable functions from a kernel 23 | __global__ void RunGPU(float x) 24 | { 25 | DeviceOnly(); 26 | printf("%f\n", SquareAnywhere(x)); 27 | } 28 | 29 | /* 30 | Call host and portable functions from a regular host function. 31 | Note that, by default, if a function has no architecture 32 | specified, it is assumed to be __host__ by NVCC. 33 | */ 34 | void RunCPU(float x) 35 | { 36 | HostOnly(); 37 | std::cout << SquareAnywhere(x) << std::endl; 38 | } 39 | 40 | int main() 41 | { 42 | std::cout << "==== Sample 02 - Host / Device Functions ====\n" << std::endl; 43 | /* 44 | Expected output: 45 | "This function may only be called from the host" 46 | 1764 47 | "This function may only be called from the device" 48 | 1764.00 49 | */ 50 | 51 | RunCPU(42); 52 | RunGPU<<<1, 1>>>(42); 53 | cudaDeviceSynchronize(); 54 | return 0; 55 | } 56 | 57 | /* 58 | Exercises: 59 | 1) Write a function that prints a message and can run on both the device and host 60 | 2) Revise the function from 1, such that the CPU version uses std::cout. Use the 61 | __CUDA_ARCH__ macro to write code paths that contain architecture-specific code. 62 | */ -------------------------------------------------------------------------------- /14_ManagedMemory/src/main.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime_api.h> 2 | #include <iostream> 3 | 4 | // Managed variables may be defined like device variables 5 | __managed__ unsigned int mFoo; 6 | 7 | // Print a managed variable 8 | __global__ void PrintFoo() 9 | { 10 | printf("mFoo GPU: %d\n", mFoo); 11 | } 12 | 13 | // Print a managed array of integers 14 | __global__ void PrintBar(const int* mBarPtr, unsigned int numEntries) 15 | { 16 | printf("mBar GPU: "); 17 | for (int i = 0; i < numEntries; i++) 18 | printf("%d%s", mBarPtr[i], (i == numEntries - 1) ? "\n" : ", "); 19 | } 20 | 21 | int main() 22 | { 23 | std::cout << "==== Sample 14 - Managed Memory ====\n" << std::endl; 24 | /* 25 | Managed memory reduces code complexity by decoupling physical 26 | memory location from address range. The CUDA runtime will take 27 | care of moving the memory to the location where it is needed. 28 | No copies are required, but care must be taken for concurrent 29 | access. To avoid performance degradation, managed memory should 30 | be prefetched. 
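        In miniature, the pattern demonstrated in this file is (a sketch only; the
        names numBytes, FillOnCPU and SomeKernel are placeholders, error checks omitted):

            int* mPtr;
            cudaMallocManaged((void**)&mPtr, numBytes);
            FillOnCPU(mPtr);                               // plain CPU writes, no cudaMemcpy needed
            cudaMemPrefetchAsync(mPtr, numBytes, device);  // optional, see concurrent access below
            SomeKernel<<<1, 1>>>(mPtr);
            cudaDeviceSynchronize();                       // before touching mPtr on the CPU again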
31 | 32 | Expected output: 33 | mFoo GPU: 14 34 | mBar GPU: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 35 | mBar CPU: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 36 | 37 | CUDA device does (NOT) support concurrent access 38 | mFoo GPU: 42 39 | */ 40 | 41 | constexpr unsigned int VALUE = 14; 42 | 43 | // We may assign values to managed variables on the CPU 44 | mFoo = VALUE; 45 | // Managed variables can be used without explicit transfer 46 | PrintFoo<<<1,1>>>(); 47 | // Wait for printf output 48 | cudaDeviceSynchronize(); 49 | 50 | // We may also allocate managed memory on demand 51 | int* mBarPtr; 52 | cudaMallocManaged((void**)&mBarPtr, VALUE * sizeof(int)); 53 | // Managed memory can be directly initialized on the CPU 54 | for (int i = 0; i < VALUE; i++) 55 | mBarPtr[i] = i; 56 | 57 | /* 58 | If we know ahead of time where managed memory will be used 59 | and performance is essential, we can prefetch it to the 60 | required location. This basically replaces memcpy. Note 61 | however, that this action requires support for the 62 | concurrentAccess property. Support for concurrent access 63 | is queried via device properties. 64 | */ 65 | 66 | int device; 67 | cudaGetDevice(&device); 68 | 69 | cudaDeviceProp prop; 70 | cudaGetDeviceProperties(&prop, device); 71 | // Report support 72 | std::cout << "\nCUDA device does " << (!prop.concurrentManagedAccess ? "NOT " : "") << "support concurrent access\n"; 73 | 74 | // If we can, we prefetch ahead of time 75 | if(prop.concurrentManagedAccess) 76 | cudaMemPrefetchAsync(mBarPtr, VALUE * sizeof(int), device); 77 | // Launch kernel with managed memory pointer as parameter 78 | PrintBar<<<1,1>>>(mBarPtr, VALUE); 79 | // We may also prefetch it back to the CPU 80 | if (prop.concurrentManagedAccess) 81 | cudaMemPrefetchAsync(mBarPtr, VALUE * sizeof(int), cudaCpuDeviceId); 82 | // Wait for GPU printing and prefetching to finish 83 | cudaDeviceSynchronize(); 84 | 85 | std::cout << "mBar CPU: "; 86 | for (int i = 0; i < VALUE; i++) 87 | std::cout << mBarPtr[i] << (i == VALUE - 1 ? "\n" : ", "); 88 | 89 | /* 90 | Devices may or may not support concurrent access to variables. 91 | If they don't, then the CPU must ensure that access to managed 92 | memory does not overlap with GPU kernel execution, even if the 93 | GPU does not use the managed memory in question. Modifying 94 | a variable on the CPU before a kernel is fine, because the kernel 95 | will only be launched if the CPU is done with prior instructions. 96 | */ 97 | 98 | // Handling access to managed memory, depending on device properties 99 | mFoo = 42; 100 | PrintFoo<<<1, 1>>>(); 101 | 102 | if (!prop.concurrentManagedAccess) 103 | // CPU access to managed memory and GPU execution may not overlap 104 | cudaDeviceSynchronize(); 105 | 106 | // Modify on CPU after / during GPU execution 107 | mBarPtr[0] = 20; 108 | 109 | // Wait for results of printf 110 | cudaDeviceSynchronize(); 111 | 112 | return 0; 113 | } 114 | 115 | /* 116 | Exercises: 117 | 1) Write a program computes the squares of the integers from 1 to 1000 and 118 | stores them to managed memory. Print them on the CPU. 119 | 2) Choose one of the programs you used previously that did quite a bit of copying, 120 | and rewrite it here to use managed memory instead. How does the performance compare? 
121 | */ -------------------------------------------------------------------------------- /18_StandardLibrary/src/main.cu: -------------------------------------------------------------------------------- 1 | #include <cuda/std/atomic> 2 | #include <algorithm> 3 | #include <functional> 4 | #include <iostream> 5 | #include <random> 6 | #include <thread> 7 | #include <vector> 8 | 9 | __managed__ cuda::std::atomic<int> mResultCPU{ 0 }, mResultGPU{ 0 }; 10 | 11 | /* 12 | Basic, architecture-agnostic reduction, using global atomics. 13 | Uses portable cuda::std::atomics. Note that even though this code is portable, 14 | it may not necessarily give the best performance. cuda::std::atomics have 15 | system-wide (CPU + GPU) scope. If an algorithm is sure to run on the GPU, better 16 | performance may be achieved using cuda::atomics, which take an additional parameter 17 | "thread_scope" (e.g., "device" for global, "block" for shared memory atomics). 18 | */ 19 | __host__ __device__ void reduceAtomic(int tId, int numThreads, int N, const int* input, cuda::std::atomic<int>& result) 20 | { 21 | if (tId >= N) 22 | return; 23 | 24 | // Compute input portion to be handled by each thread 25 | int perThread = N / numThreads; 26 | int myStart = perThread * tId; 27 | int myEnd = (tId == numThreads - 1) ? N : myStart + perThread; 28 | 29 | // For each value in the assigned portion, atomically add it to accumulated sum. 30 | for (int i = myStart; i < myEnd; i++) 31 | result.fetch_add(input[i], cuda::std::memory_order_relaxed); 32 | } 33 | 34 | __global__ void completeReductionGPU(int N, const int* input) 35 | { 36 | // Launchpad code for calling the architecture-agnostic reduction function 37 | int tId = blockIdx.x * blockDim.x + threadIdx.x; 38 | reduceAtomic(tId, N, N, input, mResultGPU); 39 | } 40 | 41 | template <unsigned int NUM_THREADS> 42 | __host__ void completeReductionCPU(int N, int* mNumbers) 43 | { 44 | /* 45 | Simple multi-threaded launch for computing the reduction 46 | in parallel. This function also makes sure to join on the 47 | threads. Otherwise, we would have to be careful not to 48 | overlap CPU access to managed memory and GPU execution. 49 | Each thread uses the architecture-agnostic implementation 50 | to compute part of the full reduction. 51 | */ 52 | std::vector<std::thread> threads(NUM_THREADS); 53 | for (int i = 0; i < threads.size(); i++) 54 | threads[i] = std::thread(reduceAtomic, i, NUM_THREADS, N, mNumbers, std::ref(mResultCPU)); 55 | for (std::thread& t : threads) 56 | t.join(); 57 | } 58 | 59 | int main() 60 | { 61 | std::cout << "==== Sample 18 - Standard Library ====\n" << std::endl; 62 | /* 63 | The libcu++ standard library allows us to make code more portable. 64 | Users can use familiar concepts from programming for the CPU and 65 | apply them with minimal changes on the GPU as well. In this 66 | example, we show a method for parallel reduction where the same 67 | reduction function can be executed by a thread on the CPU or the 68 | GPU thanks to the support for std:: atomics. 69 | 70 | We use integers in this example. Float atomics are only part 71 | of the standard in C++20. Should be widely available soon! Once 72 | more, the libcu++ standard library is much more powerful than 73 | we can show with just a few samples. We encourage you to check 74 | out the documentation for libcu++, as well as related talks for 75 | more examples of use cases. 76 | 77 | Expected output: the result of a reduction with random integers, 78 | once computed on the CPU and once on the GPU, both of them 79 | yielding the same accumulated value. 
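        The "thread_scope" atomics mentioned at the top of this file are not used in
        this sample. A minimal sketch of what they look like (dCounter is a placeholder
        name, not part of this sample):

            #include <cuda/atomic>
            __device__ cuda::atomic<int, cuda::thread_scope_device> dCounter{0};  // GPU-only scope
            ...
            dCounter.fetch_add(1, cuda::std::memory_order_relaxed);

        With cuda::thread_scope_block, only threads of the same block are expected to
        access the atomic, which may allow cheaper synchronization than the
        system-wide scope of cuda::std::atomic.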
80 | */ 81 | 82 | // Define the number of inputs to reduce, and number of CPU threads 83 | constexpr int N = 1 << 16, CPUThreads = 4; 84 | 85 | // Allocate managed memory and fill it with random numbers 86 | int* mNumbers; 87 | std::default_random_engine eng(42); 88 | std::uniform_int_distribution<int> dist(10, 42); 89 | cudaMallocManaged((void**)&mNumbers, sizeof(int) * N); 90 | std::for_each(mNumbers, mNumbers + N, [&dist, &eng](int& v) { v = dist(eng); }); 91 | 92 | // Launch the reduction with the given number of CPU threads 93 | completeReductionCPU<CPUThreads>(N, mNumbers); 94 | // Launch the reduction on the GPU with as many threads as there are inputs 95 | completeReductionGPU<<<(N + 255) / 256, 256>>>(N, mNumbers); 96 | cudaDeviceSynchronize(); 97 | 98 | // Output both results 99 | std::cout << "Reduction result CPU: " << mResultCPU << "\n" << std::endl; 100 | std::cout << "Reduction result GPU: " << mResultGPU << "\n" << std::endl; 101 | 102 | // Finally, release the managed memory for the inputs 103 | cudaFree(mNumbers); 104 | } 105 | 106 | /* 107 | Exercises: 108 | 1) The CUDA standard library is continuously being expanded. Check out 109 | their documentation and use an include for one of the recent features 110 | and demonstrate it. 111 | 2) Write a simple kernel with a single block that frequently updates a 112 | single cuda::atomic variable. For performance reasons, it should be one with 113 | thread_scope "block". 114 | 3) Try to show that there is a performance difference in 2) between using 115 | the default cuda::std::atomic and the cuda::atomic with block thread_scope. 116 | */ -------------------------------------------------------------------------------- /07_Synchronization/src/main.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime_api.h> 2 | #include <iostream> 3 | #include "../../shared/include/utility.h" 4 | 5 | __global__ void WriteSlow(int* out, int val) 6 | { 7 | samplesutil::WasteTime(1'000'000'000ULL); 8 | // Finally write value 9 | *out = val; 10 | } 11 | 12 | __global__ void Square(int* out) 13 | { 14 | *out = *out * *out; 15 | } 16 | 17 | __global__ void ApproximatePi(bool synchronized) 18 | { 19 | // Create block-shared variable for approximated Pi 20 | __shared__ float sPi; 21 | // Thread 0 computes Pi and stores it to shared memory 22 | if (threadIdx.x == 0) 23 | sPi = samplesutil::GregoryLeibniz(100'000); 24 | 25 | // Boolean decides whether threads synchronize or not 26 | if (synchronized) 27 | __syncthreads(); 28 | 29 | // Every thread should now perform some task with Pi 30 | if (threadIdx.x%32 == 0) 31 | printf("Thread %d thinks Pi = %f\n", threadIdx.x, sPi); 32 | } 33 | 34 | int main() 35 | { 36 | std::cout << "==== Sample 07 - Synchronization ====\n" << std::endl; 37 | /* 38 | Expected output: 39 | Demonstrating implicit synchronization: 40 | 42 squared = 1764 41 | 42 | No __syncthreads after computing a block-shared Pi: 43 | Thread 32 thinks Pi = 0.000000 44 | Thread 64 thinks Pi = 0.000000 45 | Thread 96 thinks Pi = 0.000000 46 | Thread 0 thinks Pi = 3.141586 47 | (or similar. Results may be correct, but not safe!) 48 | 49 | __syncthreads after computing a block-shared Pi: 50 | Thread 64 thinks Pi = 3.141586 51 | Thread 96 thinks Pi = 3.141586 52 | Thread 0 thinks Pi = 3.141586 53 | Thread 32 thinks Pi = 3.141586 54 | (or similar) 55 | */ 56 | 57 | /* 58 | Implicit synchronization between kernels and cudaMemcpy: 59 | 60 | Consider the example below, where we have two kernels. 
The first 61 | kernel writes some data (slowly), the second modifies that data. 62 | Afterwards, we copy the modified data back to the CPU. By default, 63 | CUDA will assume that each command depends on the previous command 64 | and therefore will implicitly synchronize them: a kernel will only 65 | run when previous kernels have finished, note however that the CPU 66 | is free to continue working in the meantime. Similarly, cudaMemcpy 67 | will only start when all previous kernels have finished, but it 68 | will also make the CPU wait until the copy has finished. Hence, we 69 | don't need any other synchronization in this scenario. 70 | */ 71 | std::cout << "Demonstrating implicit synchronization:" << std::endl; 72 | // Allocate some device memory for kernels to work with 73 | int* dFooPtr; 74 | cudaMalloc(&dFooPtr, sizeof(int)); 75 | // First kernel sets device memory to 42 (slowly) 76 | WriteSlow<<<1,1>>>(dFooPtr, 42); 77 | // Second kernel squares value of variable 78 | Square<<<1,1>>>(dFooPtr); 79 | // Finally, we copy the result back to the CPU 80 | int foo; 81 | cudaMemcpy(&foo, dFooPtr, sizeof(int), cudaMemcpyDeviceToHost); 82 | // Print the result of the GPU's computation 83 | std::cout << "42 squared = " << foo << std::endl; 84 | 85 | /* 86 | Block-wide synchronization with syncthreads: 87 | 88 | The following kernels compute an approximation of Pi. 89 | The algorithm used is inherently sequential, therefore 90 | only one thread performs the communication and then 91 | shares the result with all threads in the block. 92 | However, while one thread is busy performing work, the 93 | other threads in the block are free to move ahead. 94 | With __syncthreads, we force all threads in a block to 95 | wait at a given point in the program until all other 96 | threads get there. 97 | */ 98 | std::cout << "\nNo __syncthreads after computing a block-shared Pi:" << std::endl; 99 | // Run once without syncthreads 100 | ApproximatePi<<<1, 128>>>(false); 101 | // Wait for printf to finish 102 | cudaDeviceSynchronize(); 103 | 104 | std::cout << "\n__syncthreads after computing a block-shared Pi:" << std::endl; 105 | // Run again with syncthreads 106 | ApproximatePi<<<1, 128>>>(true); 107 | // Wait for printf to finish 108 | cudaDeviceSynchronize(); 109 | 110 | return 0; 111 | } 112 | 113 | /* 114 | Exercises: 115 | 1) Try launching a simple CUDA kernel 1000-10000 times in a loop, once 116 | with cudaDeviceSynchronize after each launch, once without it. 117 | What's the effect on runtime? Does anything change about the program behavior? 118 | 2) You can also memcpy from device to device. Perform a few of them (e.g. moving a 119 | value from device location A to device location B to C) and then back to CPU and 120 | confirm that everything happened properly in order. 121 | 3) Try running a kernel where the first 16 threads in each warp take one branch, 122 | the other 16 take the other, with a syncthreads in each branch. What happens? 123 | Why? Document what happens when the first 32 threads in a block of size 64 take 124 | one branch, the other 32 the other, with a syncthreads in each branch. What happens 125 | now? Provide your best guess why. 
126 | */ -------------------------------------------------------------------------------- /12_ErrorHandling/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../../shared/include/utility.h" 7 | 8 | __device__ int dVal = 42; 9 | __device__ int dOut; 10 | 11 | // Very simple kernel that updates a variable 12 | __global__ void CopyVal(const int* val) 13 | { 14 | // Simulating a little work 15 | samplesutil::WasteTime(1'000'000ULL); 16 | // Update a global value 17 | dOut = *val; 18 | } 19 | 20 | void checkForErrors() 21 | { 22 | // Catch errors that can be detected without synchronization, clear them 23 | cudaError_t err; 24 | err = cudaGetLastError(); 25 | if (err == cudaSuccess) 26 | std::cout << "cudaGetLastError() before sync found no error" << std::endl; 27 | else 28 | std::cout << "cudaGetLastError() before sync found error: " << cudaGetErrorName(err) << ", CLEARS ERROR" << std::endl; 29 | 30 | // Catch errors that require explicit synchronization, do not clear them 31 | err = cudaDeviceSynchronize(); 32 | if (err == cudaSuccess) 33 | std::cout << "cudaDeviceSynchronize() found no error" << std::endl; 34 | else 35 | std::cout << "cudaDeviceSynchronize() found error: " << cudaGetErrorName(err) << ", KEEPS ERROR" << std::endl; 36 | 37 | // If errors were found via synchronization, cudaGetLastError clears them 38 | err = cudaGetLastError(); 39 | if (err == cudaSuccess) 40 | std::cout << "cudaGetLastError() after sync found no error" << std::endl; 41 | else 42 | std::cout << "cudaGetLastError() after sync found error: " << cudaGetErrorName(err) << ", CLEARS ERROR" << std::endl; 43 | 44 | std::cout << std::endl; 45 | } 46 | 47 | #define PRINT_RUN_CHECK(S) \ 48 | std::cout << #S << std::endl; \ 49 | S; \ 50 | checkForErrors(); 51 | 52 | int main() 53 | { 54 | std::cout << "==== Sample 12 - Error Handling ====\n" << std::endl; 55 | /* 56 | Many functions in the CUDA API return error codes that indicate 57 | that something has gone wrong. However, this error is not 58 | necessarily caused by the function that returns it. Kernels and 59 | asynchronous memcopies, e.g., return immediately and may only 60 | encounter errors after the return value is observed on the CPU. 61 | Such errors can be detected at some later point, for instance by 62 | a synchronous function like cudaMemcpy or cudaDeviceSynchronize, 63 | or by cudaGetLastError after a synchronization. To ensure that 64 | every single CUDA call worked without error, we would have to 65 | sacrifice concurrency and asynchronicity. Hence, error checking 66 | is, in practice, rather opportunistic and happens e.g. at runtime 67 | when an algorithm is synchronized anyway or when we debug misbehaving 68 | code. The error checking in this code is thus not practical and only 69 | serves to illustrate how different mechanisms detect previous errors. 
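        As an aside, many projects wrap API calls in a small helper so that such
        opportunistic checks stay readable. A minimal sketch (the CUDA_CHECK macro
        below is our own illustration, not part of the CUDA API or of this sample):

            #define CUDA_CHECK(call)                                        \
                do {                                                        \
                    cudaError_t e = (call);                                 \
                    if (e != cudaSuccess)                                   \
                        printf("CUDA error %s at %s:%d\n",                  \
                            cudaGetErrorName(e), __FILE__, __LINE__);       \
                } while (0)

            CUDA_CHECK(cudaDeviceSynchronize());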
70 | 71 | Expected output: 72 | 73 | (CopyVal<<<1, 1>>>(validDAddress)) 74 | cudaGetLastError() before sync found no error 75 | cudaDeviceSynchronize() found no error 76 | cudaGetLastError() after sync found no error 77 | 78 | (CopyVal<<<1, (1<<16)>>>(validDAddress)) 79 | cudaGetLastError() before sync found error: cudaErrorInvalidConfiguration, CLEARS ERROR 80 | cudaDeviceSynchronize() found no error 81 | cudaGetLastError() after sync found no error 82 | 83 | (CopyVal<<<1, 1>>>(nullptr)) 84 | cudaGetLastError() before sync found no error 85 | cudaDeviceSynchronize() found error: cudaErrorIllegalAddress, KEEPS ERROR 86 | cudaGetLastError() after sync found error: cudaErrorIllegalAddress, CLEARS ERROR 87 | 88 | cudaErrorInvalidPc: invalid program counter 89 | */ 90 | 91 | int* validDAddress; 92 | // A function may return an error code - should check those for success 93 | cudaError_t err = cudaGetSymbolAddress((void**)&validDAddress, dVal); 94 | 95 | if (err != cudaSuccess) 96 | // If an error occurred, identify it with cudaGetErrorName and react! 97 | std::cout << cudaGetErrorName(err) << std::endl; 98 | // Alternatively, you may peek at the last error to see if the program is ok 99 | err = cudaPeekAtLastError(); 100 | // Getting the last error effectively resets it. Useful after reacting to it 101 | err = cudaGetLastError(); 102 | 103 | /* 104 | Launching a kernel with proper configuration and parameters. 105 | If the system is set up correctly, this should succeed. 106 | */ 107 | PRINT_RUN_CHECK((CopyVal<<<1, 1>>>(validDAddress))); 108 | 109 | /* 110 | Launching a kernel with bigger block than possible. 111 | cudaGetLastError() can catch SOME errors without synchronizing! 112 | */ 113 | PRINT_RUN_CHECK((CopyVal<<<1, (1<<16)>>>(validDAddress))); 114 | 115 | /* 116 | Launching a kernel with invalid address - error occurs after launch. 117 | cudaGetLastError() alone may miss this without synchronization. 118 | */ 119 | PRINT_RUN_CHECK((CopyVal<<<1, 1>>>(nullptr))); 120 | 121 | // For any kind of error, CUDA also provides a more verbose description. 122 | std::cout << cudaGetErrorName(cudaErrorInvalidPc) << ": " << cudaGetErrorString(cudaErrorInvalidPc) << std::endl; 123 | } 124 | 125 | /* 126 | Exercises: 127 | 1) Write a program that creates many pinned large allocations, and stop when 128 | the first error occurs. What is this error? When and why does it occur? 129 | 2) cudaMemcpy can implicitly synchronize the GPU and CPU, hence its return values 130 | can be used to find any errors of kernels that were launched before it. Demonstrate 131 | this for a simple example where a kernel does something illegal that you discover 132 | using cudaMemcpy. 
133 | 3) Try to produce an exotic error that does not occur already occur in this program 134 | */ 135 | -------------------------------------------------------------------------------- /09_Streams/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../../shared/include/utility.h" 9 | 10 | // A simple kernel function to keep threads busy for a while 11 | __global__ void busy() 12 | { 13 | samplesutil::WasteTime(1'000'000'000ULL); 14 | printf("I'm awake!\n"); 15 | } 16 | 17 | constexpr unsigned int KERNEL_CALLS = 2; 18 | 19 | int main() 20 | { 21 | std::cout << "==== Sample 09 - Streams ====\n" << std::endl; 22 | /* 23 | Expected output: "I'm awake!\n" x 4 x KERNEL_CALLS + 4 24 | 25 | If you watch the output carefully or analyze the execution of 26 | this program with NVIDIA Nsight Systems, it should show that the 27 | first group of kernels run consecutively, while the second and 28 | third group run in parallel. 29 | 30 | Finally, there should be two kernels running sequentially, 31 | followed by two kernels running in parallel. 32 | */ 33 | 34 | std::cout << "Running sequential launches" << std::endl; 35 | // Launch the same kernel several times in a row 36 | for (unsigned int i = 0; i < KERNEL_CALLS; i++) 37 | busy<<<1, 1>>>(); 38 | // Synchronize before continuing to get clear separation in Nsight 39 | cudaDeviceSynchronize(); 40 | 41 | std::cout << "\nRunning launches in streams" << std::endl; 42 | // Allocate one stream for each kernel to be launched 43 | cudaStream_t streams[KERNEL_CALLS]; 44 | for (cudaStream_t& s : streams) 45 | { 46 | // Create stream and launch kernel into it 47 | cudaStreamCreate(&s); 48 | busy<<<1, 1, 0, s>>>(); 49 | } 50 | /* 51 | Destroy all streams. It is fine to do that immediately. Will not 52 | implicitly synchronize, but the GPU will continue running their 53 | jobs until they have all been taken care of 54 | */ 55 | for (cudaStream_t& s : streams) 56 | cudaStreamDestroy(s); 57 | cudaDeviceSynchronize(); 58 | 59 | /* 60 | If we don't specify a stream, then the kernel is launched into the default 61 | stream. Also, many operations like cudaDeviceSynchronize and 62 | cudaStreamSynchronize are submitted to the default stream. Usually, only a 63 | single default stream is defined per application, meaning that if you don't 64 | specify streams, you will not be able to benefit from kernels running 65 | concurrently. Hence, any elaborate CUDA application should be using streams. 66 | 67 | However, if the task can be cleanly separated into CPU threads, there is another 68 | option: using per-thread default streams. Each thread will use its own default 69 | stream if we pass the built-in value cudaStreamPerThread as the stream to use. 70 | Kernels can then run concurrently on the GPU by creating multiple CPU threads. 71 | Alternatively, you may set the compiler option "--default-stream per-thread". 72 | This way, CPU threads will use separate default streams if none are specified. 
73 | */ 74 | std::cout << "\nRunning threads with different default streams" << std::endl; 75 | 76 | // Create mutex, condition variable and kernel counter for communication 77 | std::mutex mutex; 78 | std::condition_variable cv; 79 | unsigned int kernelsLaunched = 0; 80 | // Allocate sufficient number of threads 81 | std::thread threads[KERNEL_CALLS]; 82 | // Create a separate thread for each kernel call (task) 83 | for (std::thread& t : threads) 84 | { 85 | t = std::thread([&mutex, &cv, &kernelsLaunched] { 86 | // Launch kernel to thread's default stream 87 | busy<<<1, 1, 0, cudaStreamPerThread>>>(); 88 | /* 89 | Make sure all kernels are submitted before synchronizing, 90 | because cudaStreamSynchronize goes into the default 0 stream: 91 | busy<1> -> sync<0>(1) -> busy<2> -> sync<0>(2)... may serialize. 92 | busy<1> -> busy<2> -> sync<0>(1) -> sync<0>(2)... parallelizes. 93 | */ 94 | std::unique_lock lock(mutex); 95 | ++kernelsLaunched; 96 | cv.wait(lock, [&kernelsLaunched] { return kernelsLaunched == KERNEL_CALLS; }); 97 | cv.notify_all(); 98 | // Synchronize to wait for printf output 99 | cudaStreamSynchronize(cudaStreamPerThread); 100 | }); 101 | } 102 | // Wait for all threads to finish launching their kernels in individual streams 103 | std::for_each(threads, threads + KERNEL_CALLS, [](std::thread& t) {t.join(); }); 104 | 105 | /* 106 | By default, custom created streams will implicitly synchronize with the 107 | default stream. Consider, e.g., a kernel A running in a custom stream, 108 | followed by a kernel B in the default stream. If we use cudaStreamCreate 109 | as above, then A will end before B starts. Alternatively, we may create 110 | custom streams with the flag cudaStreamNonBlocking. In this case, the 111 | custom stream will not synchronize with the default stream anymore. 112 | */ 113 | cudaStream_t customRegular, customNonblocking; 114 | cudaStreamCreate(&customRegular); 115 | cudaStreamCreateWithFlags(&customNonblocking, cudaStreamNonBlocking); 116 | 117 | auto testAB = [](const char* kind, cudaStream_t stream) { 118 | std::cout << "\nLaunching A (custom) -> B (default) with " << kind << " custom stream" << std::endl; 119 | busy<<<1, 1, 0, stream>>>(); 120 | busy<<<1, 1>>>(); 121 | cudaDeviceSynchronize(); 122 | }; 123 | 124 | testAB("regular", customRegular); 125 | testAB("non-blocking", customNonblocking); 126 | 127 | // Clean up generated streams 128 | cudaStreamDestroy(customRegular); 129 | cudaStreamDestroy(customNonblocking); 130 | 131 | return 0; 132 | } 133 | 134 | /* 135 | Exercises: 136 | 1) Streams are a great way to bring task parallelism to the GPU. Think of a small 137 | program that can benefit from running two different kernels at the same time and 138 | write it, along with documentation of its inputs/outputs and usefulness. 
139 | */ -------------------------------------------------------------------------------- /06_MemoryBasics/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | // Declaration of a device variable in constant memory 6 | __constant__ int cFoo; 7 | 8 | __global__ void ReadConstantMemory() 9 | { 10 | printf("GPU: Reading constant memory --> %x\n", cFoo); 11 | } 12 | 13 | // Definition of a device variable in global memory 14 | __device__ const int dFoo = 42; 15 | 16 | __global__ void ReadGlobalMemory(const int* __restrict dBarPtr) 17 | { 18 | printf("GPU: Reading global memory --> %d %x\n", dFoo, *dBarPtr); 19 | } 20 | 21 | __global__ void WriteGlobalMemory(int* __restrict dOutPtr) 22 | { 23 | *dOutPtr = dFoo * dFoo; 24 | } 25 | 26 | __device__ void WriteAndPrintSharedMemory(int* sFoo) 27 | { 28 | // Write a computed result to shared memory for other threads to see 29 | sFoo[threadIdx.x] = 42 * (threadIdx.x + 1); 30 | // We make sure that no thread prints while the other still writes (parallelism!) 31 | __syncwarp(); 32 | // Print own computed result and result by neighbor 33 | printf("ThreadID: %d, sFoo[0]: %d, sFoo[1]: %d\n", threadIdx.x, sFoo[0], sFoo[1]); 34 | } 35 | 36 | __global__ void WriteAndPrintSharedMemoryFixed() 37 | { 38 | // Fixed allocation of two integers in shared memory 39 | __shared__ int sFoo[2]; 40 | // Use it for efficient exchange of information 41 | WriteAndPrintSharedMemory(sFoo); 42 | } 43 | 44 | __global__ void WriteAndPrintSharedMemoryDynamic() 45 | { 46 | // Use dynamically allocated shared memory 47 | extern __shared__ int sFoo[]; 48 | // Use it for efficient exchange of information 49 | WriteAndPrintSharedMemory(sFoo); 50 | } 51 | 52 | int main() 53 | { 54 | std::cout << "==== Sample 06 - Memory Basics ====\n" << std::endl; 55 | /* 56 | Expected output: 57 | GPU: Reading constant memory --> caffe 58 | GPU: Reading global memory --> 42 caffe 59 | CPU: Copied back from GPU --> 1764 60 | 61 | Using static shared memory to share computed results 62 | ThreadID: 0, sFoo[0]: 42, sFoo[1]: 84 63 | ThreadID: 1, sFoo[0]: 42, sFoo[1]: 84 64 | 65 | Using dynamic shared memory to share computed results 66 | ThreadID: 0, sFoo[0]: 42, sFoo[1]: 84 67 | ThreadID: 1, sFoo[0]: 42, sFoo[1]: 84 68 | */ 69 | 70 | const int bar = 0xcaffe; 71 | /* 72 | Uniform variables should best be placed in constant 73 | GPU memory. Can be updated with cudaMemcpyToSymbol. 74 | This syntax is unusual, but this is how it should be 75 | */ 76 | cudaMemcpyToSymbol(cFoo, &bar, sizeof(int)); 77 | ReadConstantMemory<<<1, 1>>>(); 78 | cudaDeviceSynchronize(); 79 | 80 | /* 81 | Larger or read-write data is easiest provisioned by 82 | global memory. Can be allocated with cudaMalloc and 83 | updated with cudaMemcpy. Must be free'd afterward. 84 | */ 85 | int* dBarPtr; 86 | cudaMalloc((void**)&dBarPtr, sizeof(int)); 87 | cudaMemcpy(dBarPtr, &bar, sizeof(int), cudaMemcpyHostToDevice); 88 | ReadGlobalMemory<<<1, 1>>>(dBarPtr); 89 | cudaDeviceSynchronize(); 90 | cudaFree(dBarPtr); 91 | 92 | /* 93 | The CPU may also read back updates from the GPU by 94 | copying the relevant data from global memory after 95 | running the kernel. Notice that here, we do not use 96 | cudaDeviceSynchronize: cudaMemcpy will synchronize 97 | with the CPU automatically. 
98 | */ 99 | int out, *dOutPtr; 100 | cudaMalloc((void**)&dOutPtr, sizeof(int)); 101 | WriteGlobalMemory<<<1,1>>>(dOutPtr); 102 | cudaMemcpy(&out, dOutPtr, sizeof(int), cudaMemcpyDeviceToHost); 103 | cudaFree(dOutPtr); 104 | std::cout << "CPU: Copied back from GPU --> " << out << std::endl; 105 | 106 | /* 107 | For information that is shared only within a single threadblock, 108 | we can also use shared memory, which is usually more efficient than 109 | global memory. Shared memory for a block may be statically allocated 110 | inside the kernel, or dynamically allocated at the kernel launch. In 111 | the latter case, the size of the required shared memory is provided as 112 | the third launch parameter, and the kernel will be able to access the 113 | allocated shared memory via an array with the "extern" decoration. 114 | Below, we use both methods to provide shared memory for a kernel with 115 | two threads that exchange computed integers. 116 | */ 117 | std::cout << "\nUsing static shared memory to share computed results" << std::endl; 118 | WriteAndPrintSharedMemoryFixed<<<1, 2>>>(); 119 | cudaDeviceSynchronize(); 120 | 121 | std::cout << "\nUsing dynamic shared memory to share computed results" << std::endl; 122 | WriteAndPrintSharedMemoryDynamic<<<1, 2, 2 * sizeof(int)>>>(); 123 | cudaDeviceSynchronize(); 124 | 125 | return 0; 126 | } 127 | 128 | /* 129 | Exercises: 130 | 1) Write a function that takes data from constant memory and writes it to global. 131 | Copy it back from the GPU and print on the CPU. 132 | 2) Combine allocation, memcpy and several kernels in succession to produce a 133 | more complex result. E.g., kernel A adds global values X + Y and writes the 134 | result back to global, kernel B multiplies that result by Z. Convince yourself 135 | that the results remain in global memory between kernel launches and that a 136 | kernel or a memcpy that runs after an earlier kernel can safely access the global 137 | data that it produced, even if you don't use cudaDeviceSynchronize inbetween. 138 | 3) Try to write a kernel where one thread writes a value to shared memory without 139 | a syncwarp, so that other threads may fail to see it. You might need a block 140 | size larger than 32 threads for this to happen and you may have to let the writing 141 | thread do some "fake" work to delay its write to shared memory. Or it may work 142 | immediately :) A solution should be provided by the following code sample. 143 | */ -------------------------------------------------------------------------------- /10_Events/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "../../shared/include/utility.h" 6 | 7 | // A kernel that wastes some time 8 | __global__ void SlowKernel() 9 | { 10 | samplesutil::WasteTime(1'000'000'000ULL); 11 | } 12 | 13 | __device__ int dFoo; 14 | 15 | // A kernel that only sets dFoo 16 | __global__ void SetFoo(int foo) 17 | { 18 | dFoo = foo; 19 | } 20 | 21 | // A kernel that prints dFoo 22 | __global__ void PrintFoo() 23 | { 24 | printf("foo: %d\n", dFoo); 25 | } 26 | 27 | int main() 28 | { 29 | std::cout << "==== Sample 10 - Events ====\n" << std::endl; 30 | /* 31 | Using events to measure time and communicate across streams. 32 | 33 | Expected output: 34 | 1) Unrealistically short time with chrono measurements without syncing, 35 | similar times for chrono with syncing and when using CUDA events. 
36 | 2) foo: 42 37 | */ 38 | using namespace std::chrono_literals; 39 | using namespace std::chrono; 40 | 41 | // Create CUDA events 42 | cudaEvent_t start, end; 43 | cudaEventCreate(&start); 44 | cudaEventCreate(&end); 45 | 46 | // Synchronize GPU with CPU to capture adequate time 47 | cudaDeviceSynchronize(); 48 | auto before = std::chrono::system_clock::now(); 49 | 50 | // Record start directly before first relevant GPU command 51 | cudaEventRecord(start); 52 | // Launch a light-weight GPU kernel and heavy GPU kernel 53 | SetFoo<<<1,1>>>(0); 54 | SlowKernel<<<1,1>>>(); 55 | // Record end directly after last relevant GPU command 56 | cudaEventRecord(end); 57 | // Also measure CPU time after last GPU command, without synching 58 | auto afterNoSync = std::chrono::system_clock::now(); 59 | 60 | // Synchronize CPU and GPU 61 | cudaDeviceSynchronize(); 62 | // Measure CPU time after last GPU command, with synching 63 | auto afterSync = std::chrono::system_clock::now(); 64 | 65 | // Print measured CPU time without synchronization 66 | float msCPUNoSync = 1000.f * duration_cast<duration<float>>(afterNoSync - before).count(); 67 | std::cout << "Measured time (chrono, no sync): " << msCPUNoSync << "ms\n"; 68 | 69 | // Print measured CPU time with synchronization 70 | float msCPUSync = 1000.f * duration_cast<duration<float>>(afterSync - before).count(); 71 | std::cout << "Measured time (chrono, sync): " << msCPUSync << "ms\n"; 72 | 73 | // Print GPU time measured with CUDA events 74 | float msGPU; 75 | cudaEventElapsedTime(&msGPU, start, end); 76 | std::cout << "Measured time (CUDA events): " << msGPU << "ms\n"; 77 | 78 | /* 79 | The difference between the two methods, CPU timing and events, is 80 | important when writing more complex projects: kernels are being 81 | launched asynchronously. The launch returns immediately so the CPU 82 | can progress with other jobs. This means that to get a proper timing, 83 | we always have to synchronize CPU and GPU before measuring current time 84 | with chrono. With CUDA events, we can insert them into streams before 85 | and after the actions we want to measure. We can have multiple events 86 | inserted at many different points. We still have to synchronize, but 87 | only when we eventually want to ACCESS the measurements on the CPU 88 | (e.g., once for all timings at the end of a frame to get a report). 89 | 90 | Make sure that you don't try to measure parts of your program with 91 | events that mix GPU and CPU code. Events for start and end should 92 | only enclose code portions with GPU tasks. Otherwise you won't be 93 | sure what you are measuring and might get non-reproducible results! 94 | */ 95 | 96 | //Clean up events 97 | cudaEventDestroy(start); 98 | cudaEventDestroy(end); 99 | 100 | /* 101 | Dependencies across streams: 102 | 103 | Events may also be used to introduce dependencies 104 | across streams. One stream may compute an important 105 | piece of information that another should use. This 106 | dependency can be modelled by recording an event in 107 | one stream and have the target stream wait on this 108 | event. Commands launched to the stream will not 109 | continue until the event is observed. 
110 | */ 111 | 112 | // Create a new event to signal that data is ready 113 | cudaEvent_t fooReady; 114 | cudaEventCreate(&fooReady); 115 | 116 | // Create two streams, one producer, one consumer 117 | cudaStream_t producer, consumer; 118 | cudaStreamCreate(&producer); 119 | cudaStreamCreate(&consumer); 120 | 121 | /* 122 | Enforce the following behavior for producer/consumer streams: 123 | 124 | Producer Consumer 125 | | . 126 | slow kernel . 127 | | . 128 | sets foo . 129 | \____________. 130 | | 131 | print foo 132 | */ 133 | 134 | // Producer stream simulates some hard work 135 | SlowKernel<<<1, 1, 0, producer>>>(); 136 | // Producer sets foo to an important value 137 | SetFoo<<<1, 1, 0, producer>>>(42); 138 | // Producer notifies consumer stream that foo is ready 139 | cudaEventRecord(fooReady, producer); 140 | 141 | // Consumer waits for ready event 142 | cudaStreamWaitEvent(consumer, fooReady); 143 | // Without waiting, consumer MAY print before foo is ready! 144 | PrintFoo<<<1, 1, 0, consumer>>>(); 145 | 146 | // Wait for printf outputs 147 | cudaDeviceSynchronize(); 148 | 149 | return 0; 150 | } 151 | 152 | /* 153 | Exercises: 154 | 1) Write a simple function that calls several different kernels, each of which 155 | should do a bit of work. Use multiple events and just one cuda...Synchronize, 156 | and report in % how much each kernel contributes to the total procedure run time. 157 | */ -------------------------------------------------------------------------------- /11_PinnedMemory/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "../../shared/include/utility.h" 8 | 9 | // Simulate a complex task, but actually only compute a square 10 | __global__ void PerformComplexTask(float input, float* __restrict result) 11 | { 12 | samplesutil::WasteTime(100'000'000ULL); 13 | *result = input * input; 14 | } 15 | 16 | int main() 17 | { 18 | std::cout << "==== Sample 11 - Pinned Memory ====\n" << std::endl; 19 | /* 20 | Pinned memory becomes relevant once we start using streams 21 | and memory transfer enters the mix. The default memcpy operation 22 | cudaMemcpy is, by default synchronous, i.e., when it is called, 23 | the CPU will stall until the memcpy has finished. However, in 24 | many cases we don't want this. Consider the example below, 25 | where we use several streams to compute some expensive result 26 | from different inputs. For each 'task', we launch a kernel to 27 | a separate stream, followed by a memcpy of the result back to 28 | the GPU. 29 | 30 | Ideally, we would like the memory transfers to overlap with kernels 31 | that run in different streams. But if we use cudaMemcpy, the kernel 32 | calls will execute sequentially, because each cudaMemcpy implicitly 33 | synchronizes the default stream with the CPU, and all basic streams 34 | are synchronized with the default stream. However, there is a different 35 | memory transfer function of the name cudaMemcpyAsync, which also takes 36 | an additional stream parameter in which to run. However, using this 37 | function alone is not enough to overlap memory transfer and kernels. 38 | To perform asynchronous memcpy between the device and the host, CUDA 39 | must be sure that the host memory is available in main memory. We 40 | can guarantee this by allocating memory with cudaMallocHost. This is 41 | so-called "pinned" memory, which may never be moved or swapped out. 
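    As a quick sketch of that pattern (illustrative only; dDevicePtr and bytes are
    placeholders, error checking omitted), the pinned route looks like this:

        float* hPinned;                                   // page-locked host buffer
        cudaMallocHost((void**)&hPinned, bytes);          // instead of malloc / new
        cudaStream_t stream;
        cudaStreamCreate(&stream);
        cudaMemcpyAsync(hPinned, dDevicePtr, bytes,       // returns immediately,
                        cudaMemcpyDeviceToHost, stream);  // copy runs in 'stream'
        cudaStreamSynchronize(stream);                    // wait only when the data is needed
        cudaStreamDestroy(stream);
        cudaFreeHost(hPinned);                            // pinned memory has its own free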
42 | If we use pinned memory and cudaMemcpyAsync, then copies and kernels 43 | that run in different streams are free to overlap. 44 | 45 | Expected output: slow performance for all combinations that are not 46 | pinned memory and asynchronous copy, due to implicit synchronization 47 | preventing concurrent execution of kernels. 48 | */ 49 | 50 | constexpr unsigned int TASKS = 4; 51 | 52 | // Allocate result values for GPU to write to 53 | float* dResultsPtr; 54 | cudaMalloc((void**)&dResultsPtr, sizeof(float) * TASKS); 55 | 56 | // Generate necessary streams and events 57 | cudaStream_t streams[TASKS]; 58 | cudaEvent_t events[TASKS]; 59 | for (int i = 0; i < TASKS; i++) 60 | { 61 | cudaStreamCreate(&streams[i]); 62 | cudaEventCreate(&events[i]); 63 | } 64 | 65 | // Two CPU-side memory ranges: one regular and one pinned 66 | float results[TASKS], * results_pinned; 67 | cudaMallocHost((void**)&results_pinned, sizeof(float) * TASKS); 68 | 69 | // We run the tasks with regular/async memcpy 70 | enum class CPYTYPE { MEMCPY, MEMCPYASYNC }; 71 | // We run the tasks with regular/pinned memory 72 | enum class MEMTYPE { REGULAR, PINNED}; 73 | 74 | for (auto cpy : { CPYTYPE::MEMCPY, CPYTYPE::MEMCPYASYNC }) 75 | { 76 | for (auto mem : { MEMTYPE::REGULAR, MEMTYPE::PINNED }) 77 | { 78 | float* dst = (mem == MEMTYPE::PINNED ? results_pinned : results); 79 | 80 | std::cout << "Performing tasks with " << (mem == MEMTYPE::PINNED ? "pinned memory" : "regular memory"); 81 | std::cout << " and " << (cpy == CPYTYPE::MEMCPYASYNC ? "asynchronous" : "regular") << " copy" << std::endl; 82 | 83 | // Reset GPU result 84 | cudaMemset(dResultsPtr, 0, sizeof(float) * TASKS); 85 | 86 | // Synchronize to get adequate CPU time measurements 87 | cudaDeviceSynchronize(); 88 | const auto before = std::chrono::system_clock::now(); 89 | 90 | for (int i = 0; i < TASKS; i++) 91 | { 92 | // Unnecessarily slow kernel 93 | PerformComplexTask<<<1, 1, 0, streams[i]>>>(i+1, dResultsPtr+i); 94 | // Use either regular or asynchronous copy for reading back results 95 | if (cpy == CPYTYPE::MEMCPYASYNC) 96 | cudaMemcpyAsync(&dst[i], dResultsPtr+i, sizeof(float), cudaMemcpyDeviceToHost, streams[i]); 97 | else 98 | cudaMemcpy(&dst[i], dResultsPtr + i, sizeof(float), cudaMemcpyDeviceToHost); 99 | } 100 | 101 | // Wait for results being copied back 102 | for (int i = 0; i < TASKS; i++) 103 | { 104 | // Wait for the current stream 105 | cudaStreamSynchronize(streams[i]); 106 | 107 | // Evaluate result and print 108 | if (dst[i] != (i + 1) * (i + 1)) 109 | std::cout << "Task failed or CPU received wrong value!" << std::endl; 110 | else 111 | std::cout << "Finished task " << i << ", produced output: " << results[i] << std::endl; 112 | } 113 | 114 | const auto after = std::chrono::system_clock::now(); 115 | std::cout << "Time: " << std::chrono::duration_cast>(after-before).count() << "s\n\n"; 116 | } 117 | } 118 | 119 | // Clean up streams 120 | for (cudaStream_t& s : streams) 121 | cudaStreamDestroy(s); 122 | 123 | // Pinned memory should be freed with cudaFreeHost 124 | cudaFreeHost(results_pinned); 125 | } 126 | 127 | /* 128 | Exercises: 129 | 1) Create a sketch of this experiment to illustrate what is happening. 130 | For each of the configurations, try to establish the timeline of what happens on 131 | CPU/GPU and why you get the results you do. If you can, feel free to use NVIDIA 132 | Nsight Systems, which should make this very easy. 
133 | 2) Write a simple program that just compares the performance of copying pinned vs 134 | non-pinned back and forth with cudaMemcpy in a single stream a couple of times. 135 | How does the performance difference develop as you change the size of the copy? 136 | 3) Try to make the streams non-blocking using 137 | cudaCreateStreamWithFlags(..., cudaStreamNonBlocking)! In this case, you 138 | can expect wrong results for cudaMemcpy: Non-blocking means the default 139 | stream won't wait for the custom streams running the kernels to finish 140 | before it starts copying. Can you draw a sketch of what is going wrong? 141 | */ -------------------------------------------------------------------------------- /15_GraphAPI/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "../../shared/include/utility.h" 6 | 7 | // A simple kernel function to keep threads busy for a while 8 | __global__ void busy() 9 | { 10 | samplesutil::WasteTime(1'000'000'000ULL); 11 | printf("I'm awake!\n"); 12 | } 13 | 14 | void runTasksSequentially(unsigned int numTasks) 15 | { 16 | // We use cudaStreamPerThread here. It makes no difference 17 | // for the program flow because we are only single-threaded 18 | // anyway, but capturing the application-wide default stream 19 | // cudaStreamLegacy is not permitted by the graph API. 20 | 21 | for (int i = 0; i < numTasks; i++) 22 | busy << <1, 1, 0, cudaStreamPerThread >> > (); 23 | } 24 | 25 | void runTasksWithStreams(unsigned int numTasks) 26 | { 27 | /* 28 | This stream-based function can be directly captured with graph API. 29 | Events are used to encode dependencies / start / end of capture. 30 | */ 31 | std::vector streams(numTasks); 32 | std::vector finished(numTasks); 33 | for (int i = 0; i < numTasks; i++) 34 | { 35 | cudaStreamCreate(&streams[i]); 36 | cudaEventCreate(&finished[i]); 37 | } 38 | // We need an additional event to represent the capture start 39 | cudaEvent_t start; 40 | cudaEventCreate(&start); 41 | // Immediately record the starting event so other streams can connect to it 42 | cudaEventRecord(start, cudaStreamPerThread); 43 | // All other streams must connect to origin stream via event to get captured 44 | for (int i = 0; i < numTasks; i++) 45 | { 46 | // Establish dependency / connection to origin (is now included in capture) 47 | cudaStreamWaitEvent(streams[i], start); 48 | // Run actual task (kernel) in stream 49 | busy << <1, 1, 0, streams[i] >> > (); 50 | // Record end event of this stream so origin can wait on it 51 | cudaEventRecord(finished[i], streams[i]); 52 | } 53 | // Origin stream waits until all custom streams have finished their task 54 | for (int i = 0; i < numTasks; i++) 55 | { 56 | cudaStreamWaitEvent(cudaStreamPerThread, finished[i]); 57 | cudaStreamDestroy(streams[i]); 58 | } 59 | } 60 | 61 | template 62 | cudaGraphExec_t recordGraphFromFunction(const T& func, P ...params) 63 | { 64 | // Create a graph for recording GPU commands 65 | cudaGraph_t graph; 66 | cudaGraphCreate(&graph, 0); 67 | 68 | // Record a graph, assuming that functions start from thread's default stream 69 | cudaStreamBeginCapture(cudaStreamPerThread, cudaStreamCaptureModeGlobal); 70 | func(params...); 71 | cudaStreamEndCapture(cudaStreamPerThread, &graph); 72 | 73 | // Turn the recorded graph into an executable instance 74 | cudaGraphExec_t instance; 75 | cudaGraphInstantiate(&instance, graph, nullptr, nullptr, 0); 76 | // The recorded graph is no longer 
needed 77 | cudaGraphDestroy(graph); 78 | return instance; 79 | } 80 | 81 | cudaGraphExec_t buildGraphForParallelTasks(unsigned int numTasks) 82 | { 83 | // Set up a graph from scratch 84 | cudaGraph_t graph; 85 | cudaGraphCreate(&graph, 0); 86 | 87 | // Create a node for each kernel in the graph, with grid config and parameters 88 | std::vector nodes(numTasks); 89 | cudaKernelNodeParams params = { reinterpret_cast(busy), {1,1,1}, {1,1,1}, 0, nullptr, nullptr }; 90 | 91 | // Add them to the graph. This simple setup has no dependencies, passing nullptr 92 | for (int i = 0; i < numTasks; i++) 93 | cudaGraphAddKernelNode(&nodes[i], graph, nullptr, 0, ¶ms); 94 | 95 | // Create executable graph, destroy manually built graph 96 | cudaGraphExec_t instance; 97 | cudaGraphInstantiate(&instance, graph, 0, 0, 0); 98 | cudaGraphDestroy(graph); 99 | return instance; 100 | } 101 | 102 | int main() 103 | { 104 | std::cout << "==== Sample 15 - Graph API ====\n" << std::endl; 105 | /* 106 | The graph API enables the creation of well-defined structures that 107 | encode the types, parameters and dependencies of instructions that 108 | the GPU should process. By preparing this information, developers can 109 | decouple the definition and execution of the parallel workload. The 110 | driver is then free to optimize its execution. Graphs may be created 111 | either by setting up graphs manually from scratch or by recording 112 | already available code, which may occasionally require modifications: 113 | CUDA graphs usually do not include synchronization methods with the 114 | CPU. This means that waiting actions (e.g., until all streams have 115 | finished) must be modelled via dependencies/events instead. 116 | 117 | Expected output: 5 x TASKS "I'm awake\n", first two groups launching 118 | sequentially, the last three groups running concurrently. 119 | */ 120 | 121 | constexpr int TASKS = 4; 122 | 123 | std::cout << "Launching multiple tasks sequentially" << std::endl; 124 | // Launching multiple tasks as kernels one after the other 125 | runTasksSequentially(TASKS); 126 | cudaDeviceSynchronize(); 127 | 128 | std::cout << "Running recorded graph from existing sequential code" << std::endl; 129 | // Recording a graph from the existing sequential code and launching its instance 130 | cudaGraphExec_t recordedSequential = recordGraphFromFunction(runTasksSequentially, TASKS); 131 | cudaGraphLaunch(recordedSequential, 0); 132 | cudaDeviceSynchronize(); 133 | 134 | std::cout << "Launching multiple tasks with streams" << std::endl; 135 | // Launching multiple tasks in multiple streams 136 | runTasksWithStreams(TASKS); 137 | cudaDeviceSynchronize(); 138 | 139 | std::cout << "Running recorded graph from existing stream-based code" << std::endl; 140 | // Recording a graph from the existing stream-based code, launching instance 141 | cudaGraphExec_t recordedStreams = recordGraphFromFunction(runTasksWithStreams, TASKS); 142 | cudaGraphLaunch(recordedStreams, 0); 143 | cudaDeviceSynchronize(); 144 | 145 | std::cout << "Running manually-built graph that behaves like streams" << std::endl; 146 | // Example for building a scratch manually without recording 147 | cudaGraphExec_t instanceBuilt = buildGraphForParallelTasks(TASKS); 148 | cudaGraphLaunch(instanceBuilt, 0); 149 | cudaDeviceSynchronize(); 150 | 151 | return 0; 152 | } 153 | 154 | /* 155 | Exercises: 156 | 1) Manually build and instantiate a working CUDA graph that includes a host-side 157 | function node and demonstrate what it does. 
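     A possible starting point for 1) — only a sketch, the callback name and its
     payload are made up, error checks omitted. Note that the host callback must
     not itself call CUDA API functions:

         void CUDART_CB hostPrint(void* userData)
         {
             printf("Host node says: %s\n", (const char*)userData);
         }

         cudaGraphExec_t buildGraphWithHostNode()
         {
             cudaGraph_t graph;
             cudaGraphCreate(&graph, 0);

             static const char msg[] = "hello from the CPU";  // static: outlives the builder
             cudaHostNodeParams hostParams = {};
             hostParams.fn = hostPrint;                        // CPU function to execute
             hostParams.userData = (void*)msg;                 // payload handed to it

             cudaGraphNode_t hostNode;
             cudaGraphAddHostNode(&hostNode, graph, nullptr, 0, &hostParams);

             cudaGraphExec_t instance;
             cudaGraphInstantiate(&instance, graph, nullptr, nullptr, 0);
             cudaGraphDestroy(graph);
             return instance;                                  // run with cudaGraphLaunch + sync
         }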
158 | 2) Manually build and instantiate a working graph that includes a different kind 159 | of node that does not yet occur in this application or in exercise 1). 160 | 3) Given a particular CUDA graph, create a simple GraphViz graph in DOT format of its 161 | structure. Use your solution to sketch out the graphs created by this source code. 162 | */ -------------------------------------------------------------------------------- /13_MemoryFences/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "../../shared/include/utility.h" 4 | 5 | /* 6 | Producer function. 7 | 8 | Following a threadfence (memory barrier) with a volatile yields a release pattern. 9 | Following a threadfence (memory barrier) with an atomic yields a release pattern. 10 | Note however, that neither of these options is ideal. For one, we combine two 11 | operations to achieve a certain behavior. Second, threadfence is a general memory 12 | barrier, and is thus stronger than it may have to be (e.g., release barrier only). 13 | Volta+ actually have support for memory coherency models with proper acquire / 14 | release semantics, which are exposed to the programmer via cuda::std::atomic in 15 | libcu++. 16 | */ 17 | template 18 | __device__ void ProduceFoo(unsigned int id, float* dFooPtr, int *dFooReadyPtr) 19 | { 20 | float pi = samplesutil::GregoryLeibniz(10'000'000); 21 | dFooPtr[id] = pi; 22 | 23 | __threadfence(); 24 | 25 | if (ATOMIC) 26 | atomicExch(&dFooReadyPtr[id], 1); 27 | else 28 | *((volatile int*)&dFooReadyPtr[id]) = 1; 29 | } 30 | 31 | /* 32 | Consumer function. 33 | 34 | Preceding a threadfence (memory barrier) with a volatile yields an acquire pattern. 35 | Preceding a threadfence (memory barrier) with an atomic yields an acquire pattern. 36 | Note however, that neither of these options is ideal. For one, we combine two 37 | operations to achieve a certain behavior. Second, threadfence is a general memory 38 | barrier, and is thus stronger than it may have to be (e.g., acquire barrier only). 39 | Volta+ actually have support for memory coherency models with proper acquire / 40 | release semantics, which are exposed to the programmer via cuda::std::atomic in 41 | libcu++. 42 | */ 43 | template 44 | __device__ void ConsumeFoo(unsigned int id, const float* dFooPtr, int* dFooReadyPtr) 45 | { 46 | if (ATOMIC) 47 | while (atomicAdd(&dFooReadyPtr[id], 0) == 0); 48 | else 49 | while (*((volatile int*)&dFooReadyPtr[id]) == 0); 50 | 51 | __threadfence(); 52 | 53 | printf("Consumer %d thinks Pi is: %f\n", id, dFooPtr[id]); 54 | } 55 | 56 | // Launch either version of a safe producer / consumer scenarios 57 | template 58 | __global__ void ProducerConsumer(float* dFooPtr, int* dFooReadyPtr) 59 | { 60 | int id = (blockIdx.x * blockDim.x + threadIdx.x); 61 | 62 | if (id < N) 63 | ProduceFoo(id, dFooPtr, dFooReadyPtr); 64 | else 65 | ConsumeFoo(id - N, dFooPtr, dFooReadyPtr); 66 | } 67 | 68 | /* 69 | As we have seen before, although we didnt explicilty mention it, 70 | using a syncthreads inside a block is sufficient to make sure that 71 | the other threads can observe the data that was previously written 72 | by another thread in the block. Here we illustrate this again, 73 | with a simple, safe producer / consumer setup, where syncthreads 74 | ensures ordering of operations and visibility of the data for all 75 | threads in the block. 
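    As an aside: on Volta and newer GPUs the same handshake can be expressed with
    libcu++ instead of __threadfence. The following is only a sketch (it assumes
    #include <cuda/atomic> and compute capability 7.0+), not code used by this sample:

        __device__ cuda::atomic<int, cuda::thread_scope_device> dReady{0};
        __device__ float dData;

        __global__ void ProduceRelease(float value)
        {
            dData = value;                                             // plain payload store
            dReady.store(1, cuda::std::memory_order_release);          // payload visible before flag
        }

        __global__ void ConsumeAcquire()
        {
            while (dReady.load(cuda::std::memory_order_acquire) == 0); // spin on the flag
            printf("Consumer sees: %f\n", dData);                      // guaranteed to see the payload
        }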
76 | */ 77 | __global__ void ProducerConsumerShared() 78 | { 79 | extern __shared__ float sFoo[]; 80 | 81 | if (threadIdx.x < blockDim.x/2) 82 | { 83 | float pi = samplesutil::GregoryLeibniz(10'000'000); 84 | sFoo[threadIdx.x] = pi; 85 | } 86 | // Synchronize threads in block AND ensure memory access ordering among them 87 | __syncthreads(); 88 | if (threadIdx.x >= blockDim.x / 2) 89 | { 90 | int cId = threadIdx.x - blockDim.x / 2; 91 | printf("Comsumer %d thinks Pi is %f\n", cId, sFoo[cId]); 92 | } 93 | } 94 | 95 | int main() 96 | { 97 | std::cout << "==== Sample 13 - Memory Fences ====\n" << std::endl; 98 | /* 99 | So far, we have ignored the problem of memory fencing, which 100 | is relevant in multi-threaded applications. We can exchange 101 | information securely via atomic variables, however when we store 102 | data in bulk or need to ensure a particular ordering of observed 103 | events, for instance in a producer/consumer scenario, we need clear 104 | orderings of data accesses that are definite for all involved threads. 105 | For threads within a block, this is trivially achieved by using 106 | syncthreads. For establishing orderings across blocks, CUDA offers 107 | the __threadfence operation. This can be necessary, because the default 108 | atomicXXX operations of CUDA only give us "RELAXED" semantics, i.e., 109 | they have no synchronization effect on other memory. However, combining 110 | a thread fence with relaxed atomics can---much like in C++11---give us 111 | acquire / release semantics. 112 | 113 | At its core, threadfence is a general memory barrier, which makes sure 114 | that all writes below it occur after all writes above it, and that all 115 | reads below it occur after all reads above it. However, there are some 116 | intricacies that make the safe use of threadfence a little tricky. 117 | Understanding all possible scenarios is complex task, and may not be worth 118 | the effort, since modern CUDA offers better alternatives (see material and 119 | samples for CUDA standard library). A basic recipe for safely using 120 | __threadfence is as part of a release-acquire pattern. The PTX ISA states 121 | that a __threadfence, followed by an atomic or volatile memory operation, 122 | yields a release pattern, while a __threadfence preceding an atomic or 123 | volatile memory operation yields an acquire pattern. With these patterns, 124 | we can for instance solve the producer / consumer scenario by using flags 125 | that indicate when data is ready, and securing access to them with proper 126 | acquire / release behavior. 127 | 128 | Expected output: 129 | 130 | Producer / consumer pair in same block 131 | Comsumer 15 thinks Pi is 3.141597 132 | Comsumer 16 thinks Pi is 3.141597 133 | Comsumer 0 thinks Pi is 3.141597 134 | ... 135 | (or similar) 136 | 137 | Producer / consumer pair with volatile + threadfence 138 | Consumer 4 thinks Pi is: 3.141597 139 | ... 140 | (or similar) 141 | 142 | Producer / consumer pair with volatile + atomic 143 | Consumer 4 thinks Pi is: 3.141597 144 | ... 
145 | (or similar) 146 | */ 147 | 148 | constexpr unsigned int N = 8; 149 | constexpr unsigned int blockSize = 4; 150 | 151 | // Compute how many producer / consumer blocks should be launched 152 | unsigned int numBlocks = N / blockSize; 153 | 154 | // Run producer / consumer scenario inside a single block (simple) 155 | std::cout << "\nProducer / consumer pair in same block" << std::endl; 156 | ProducerConsumerShared<<<1, 34, 34 * sizeof(float)>>>(); 157 | cudaDeviceSynchronize(); 158 | 159 | // Allocate and initialize mmeory for global producer / consumer scenario 160 | float* dFooPtr; 161 | int* dFooReadyPtr; 162 | cudaMalloc((void**)&dFooPtr, sizeof(float) * N); 163 | cudaMalloc((void**)&dFooReadyPtr, sizeof(int) * N); 164 | cudaMemset(dFooPtr, 0, sizeof(float) * N); 165 | cudaMemset(dFooReadyPtr, 0, sizeof(int) * N); 166 | 167 | // Producer / consumer scenario across blocks in global memory, using volatile + threadfence 168 | std::cout << "\nProducer / consumer pair with volatile + threadfence" << std::endl; 169 | ProducerConsumer<<>>(dFooPtr, dFooReadyPtr); 170 | cudaDeviceSynchronize(); 171 | 172 | // Producer / consumer scenario across blocks in global memory, using atomic + threadfence 173 | std::cout << "\nProducer / consumer pair with atomic + threadfence" << std::endl; 174 | ProducerConsumer<<>>(dFooPtr, dFooReadyPtr); 175 | cudaDeviceSynchronize(); 176 | 177 | return 0; 178 | } 179 | 180 | /* 181 | Exercises: 182 | 1) TRY to write a program where one thread reliably observes writes in the WRONG 183 | order, due to lack of threadfence (e.g., in your code one thread sets A from 0 184 | to 1, followed by setting B from 0 to 1, but another thread observes A = 0, B = 1, 185 | or something similar). To do this, you may want to make sure those threads run in 186 | different blocks, preferably even on different SMs, and communicate via global 187 | memory, try atomics and volatiles. If you can't manage, document your best attempt. 188 | */ -------------------------------------------------------------------------------- /17_CooperativeGroups/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "../../shared/include/utility.h" 8 | 9 | // Shortening cooperative groups namespace for convenience 10 | namespace cg = cooperative_groups; 11 | 12 | // We keep the result of the reduction in managed memory 13 | __managed__ float mResult; 14 | 15 | template 16 | __global__ void reduceGroups(const float* __restrict input, int N) 17 | { 18 | // Can conveniently obtain groups for grid and block 19 | auto block = cg::this_thread_block(); 20 | unsigned int gId = cg::this_grid().thread_rank(); 21 | 22 | __shared__ float data[BLOCK_SIZE]; 23 | data[block.thread_rank()] = (gId < N ? 
(input[gId] + input[gId + N / 2]) : 0); 24 | 25 | for (int s = blockDim.x / 2; s > 16; s /= 2) 26 | { 27 | // Rather than selecting explicit sync functions, groups offer sync() 28 | block.sync(); 29 | if (block.thread_rank() < s) 30 | data[block.thread_rank()] += data[block.thread_rank() + s]; 31 | } 32 | 33 | // Splitting blocks into warp groups is cleaner than checking threadIdx 34 | auto warp = cg::tiled_partition<32>(block); 35 | if (warp.meta_group_rank() == 0) 36 | { 37 | // Reduction primitives - will be hardware-accelerated on CC 8.0+ 38 | float v = cg::reduce(warp, data[warp.thread_rank()], cg::plus()); 39 | if (warp.thread_rank() == 0) 40 | atomicAdd(&mResult, v); 41 | } 42 | } 43 | 44 | void ReduceWithGroups() 45 | { 46 | constexpr unsigned int BLOCK_SIZE = 256, N = 1'000'000; 47 | 48 | std::cout << "Producing random inputs..." << std::endl; 49 | // Generate some random numbers to reduce 50 | std::vector vals; 51 | float* dValsPtr; 52 | samplesutil::prepareRandomNumbersCPUGPU(N, vals, &dValsPtr); 53 | // Prepare grid configuration for input and used reduction technique 54 | const dim3 blockDim = { BLOCK_SIZE, 1, 1 }; 55 | const dim3 gridDim = { (N / 2 + BLOCK_SIZE) / BLOCK_SIZE, 1, 1 }; 56 | 57 | // Events for measuring run time 58 | cudaEvent_t start, end; 59 | cudaEventCreate(&start); 60 | cudaEventCreate(&end); 61 | 62 | // Setting managed result variable 63 | mResult = 0; 64 | cudaEventRecord(start); 65 | reduceGroups<<>>(dValsPtr, N); 66 | cudaEventRecord(end); 67 | 68 | float ms; 69 | // Synchronizing to event. Event is last, same effect as cudaDeviceSynchronize 70 | cudaEventSynchronize(end); 71 | cudaEventElapsedTime(&ms, start, end); 72 | std::cout << std::setw(20) << "Reduce Groups" << "\t" << ms << "ms \t" << mResult << "\n\n"; 73 | } 74 | 75 | __managed__ unsigned int mHappyNumSum; 76 | __managed__ unsigned int mHappyNumCount; 77 | 78 | __global__ void happyNumbersGroups(unsigned int start, unsigned int N, unsigned int* mHappyNumbers) 79 | { 80 | // Retrieve the input number based on the thread's global id 81 | unsigned int input = cg::this_grid().thread_rank() + start; 82 | // Compute whether or not the input number is in range and "happy" (utility function) 83 | bool happy = ((input-start) < N) && samplesutil::isHappy(input); 84 | // Create a group for the current warp 85 | auto warp = cg::tiled_partition<32>(cg::this_thread_block()); 86 | // Create a child group to separate threads with happy and unhappy numbers 87 | auto g = cg::binary_partition(warp, happy); 88 | 89 | if (happy) 90 | { 91 | // Compute the group's partial result of the sum of happy numbers 92 | unsigned int partial_sum = cg::reduce(g, input, cg::plus()); 93 | // One thread uses aggregate atomics to compute sum and write offset of happy numbers 94 | unsigned int offset; 95 | if (g.thread_rank() == 0) 96 | { 97 | atomicAdd(&mHappyNumSum, partial_sum); 98 | offset = atomicAdd(&mHappyNumCount, g.size()); 99 | } 100 | // Distribute offset information from the thread that computed it to all others 101 | offset = g.shfl(offset, 0); 102 | // Each thread writes their happy number in a unique location 103 | mHappyNumbers[offset + g.thread_rank()] = input; 104 | } 105 | } 106 | 107 | void HappyNummbersWithGroups(unsigned int start, unsigned int N) 108 | { 109 | // Initialize / allocate result storage. 
For brevity, we use managed memory 110 | mHappyNumSum = 0; 111 | mHappyNumCount = 0; 112 | unsigned int* mHappyNumbers; 113 | cudaMallocManaged((void**)&mHappyNumbers, sizeof(unsigned int) * N); 114 | 115 | // Compute count, sum and list of base 10 "happy numbers" from start to start+N 116 | happyNumbersGroups<<<(N + 255) / 256, 256>>>(start, N, mHappyNumbers); 117 | cudaDeviceSynchronize(); 118 | 119 | // Print the count, sum and list of happy numbers in the given range 120 | std::cout << "No. of happy numbers in " << start << " - " << N << ": " << mHappyNumCount << "\n"; 121 | std::cout << "Sum of happy numbers in " << start << " - " << N << ": " << mHappyNumSum << "\n"; 122 | std::cout << "\nList of happy numbers in " << start << " - " << N << ": "; 123 | 124 | // Sort the managed memory happy number list in ascending order 125 | std::sort(mHappyNumbers, mHappyNumbers + mHappyNumCount); 126 | for (int i = 0; i < mHappyNumCount; i++) 127 | std::cout << mHappyNumbers[i] << ((i == mHappyNumCount - 1) ? "\n" : ", "); 128 | } 129 | 130 | int main() 131 | { 132 | std::cout << "==== Sample 17 - Cooperative Groups ====\n" << std::endl; 133 | /* 134 | Cooperative groups are very versatile. They can be created for entire 135 | grids, blocks, warps or opportunistically for converged threads. In 136 | essence, they package a range of recent CUDA features in an interface 137 | that abstracts away the low-level instructions, making CUDA code 138 | easier to understand. As such, cooperative groups have a vast range 139 | of applications. The two examples in this project cannot do them 140 | justice, for further use cases please consider the advanced NVIDIA 141 | CUDA Samples that include detailed, elaborate applications. 142 | 143 | Expected output: 144 | 1) Result of reduction, now computed with cooperative groups 145 | 2) The count, sum and list of the happy numbers in a given range (1-1000) 146 | */ 147 | 148 | std::cout << "==== Computing a Reduction with Cooperative Groups ====" << std::endl; 149 | 150 | ReduceWithGroups(); 151 | 152 | std::cout << "==== Computing Happy Numbers and their Sum ====" << std::endl; 153 | 154 | HappyNummbersWithGroups(1, 1000); 155 | 156 | return 0; 157 | } 158 | 159 | /* 160 | Exercises: 161 | 1) Write a kernel where each thread first computes its ID in a register. 162 | Within each group of 4 consecutive threads, threads should then share their 163 | ID with all others, using shuffling. Write this kernel once with, once without 164 | cooperative groups, and confirm correctness via output. 165 | 2) Launch a COOPERATIVE KERNEL and use grid-wide synchronization to make sure 166 | all threads in the entire grid are at the same point in the program. Can you 167 | think of any use cases for this? Your device will need to support the attribute 168 | cudaDevAttrCooperativeLaunch for this, check if it has it before starting. 169 | 3) Write a simple program with the following tasks A, B, C, each with N threads. 170 | In A, each thread t should compute and store t*t in its output A_out[t]. In B, 171 | each thread t should compute A_out[N - t - 1] - t and store it in its output 172 | B_out[t]. In C, each thread t should compute B_out[N - t - 1] + 4 and store it 173 | in its output C_out[t]. Implement this once using one kernel for each task A, 174 | and once with a single kernel that uses grid synchronization between tasks. 175 | In the single kernel, do you need additional threadfences and/or volatiles? 
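   For 2) and 3), a rough sketch of the cooperative-launch mechanics (the array names
   dA/dB are made up, error checks omitted, and the grid must be small enough to be
   co-resident on the device):

       __global__ void twoPhases(int* a, int* b, unsigned int N)
       {
           cg::grid_group grid = cg::this_grid();
           unsigned int t = grid.thread_rank();
           if (t < N) a[t] = t * t;                    // task A
           grid.sync();                                // grid-wide barrier
           if (t < N) b[t] = a[N - 1 - t] + 1;         // task B reads another block's result
       }

       // Host side: check support first, then use the cooperative launch API
       int supported = 0;
       cudaDeviceGetAttribute(&supported, cudaDevAttrCooperativeLaunch, 0);
       if (supported)
       {
           void* args[] = { &dA, &dB, &N };            // dA, dB: device arrays of N ints
           cudaLaunchCooperativeKernel((void*)twoPhases, dim3((N + 255) / 256), dim3(256), args, 0, nullptr);
           cudaDeviceSynchronize();
       }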
176 | Again, in order to do grid sync, your device will need to support the 177 | cudaDevAttrCooperativeLaunch attribute, check if it has it before starting. 178 | */ -------------------------------------------------------------------------------- /shared/include/utility.h: -------------------------------------------------------------------------------- 1 | // A simple kernel with two nested if clauses, 4 branches. 2 | // Each thread will take a separate branch, and then perform 3 | // N stepts. With legacy scheduling, each branch must be 4 | // finished before execution can continue with the next. 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #ifndef SAMPLES_UTIL_INCLUDED 13 | #define SAMPLES_UTIL_INCLUDED 14 | 15 | // Matrices are stored in row-major order: 16 | // M(row, col) = *(M.elements + row * M.stride + col) 17 | template 18 | struct Matrix { 19 | int width; 20 | int height; 21 | int stride; 22 | F* elements; 23 | }; 24 | 25 | namespace samplesutil 26 | { 27 | // Helper function to let threads spin 28 | __device__ void WasteTime(unsigned long long duration) 29 | { 30 | const unsigned long long int start = clock64(); 31 | while ((clock64() - start) < duration); 32 | } 33 | 34 | __device__ float GregoryLeibniz(unsigned int iterations) 35 | { 36 | float pi = 0.f, m = 1.f; 37 | for (int n = 0; n < iterations; n++, m *= -1.f) 38 | pi += 4.f * (m / (2 * n + 1)); 39 | return pi; 40 | } 41 | 42 | // Get a matrix element 43 | template 44 | __device__ float GetElement(const F* A, int row, int col, unsigned int DIM) 45 | { 46 | return A[row * DIM + col]; 47 | } 48 | 49 | // Set a matrix element 50 | template 51 | __device__ void SetElement(F* A, int row, int col, float value, unsigned int DIM) 52 | { 53 | A[row * DIM + col] = value; 54 | } 55 | 56 | // Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is 57 | // located col sub-matrices to the right and row sub-matrices down 58 | // from the upper-left corner of A 59 | template 60 | __device__ F* GetSubMatrix(F* A, int row, int col, unsigned int BLOCK_SIZE, unsigned int DIM) 61 | { 62 | F* Asub = A + DIM * BLOCK_SIZE * row + BLOCK_SIZE * col; 63 | return Asub; 64 | } 65 | 66 | // Matrix multiplication kernel called by MatMul() 67 | template 68 | __global__ void MatMulKernel(half* A, half* B, float* C) 69 | { 70 | // Block row and column 71 | int blockRow = blockIdx.y; 72 | int blockCol = blockIdx.x; 73 | 74 | // Each thread block computes one sub-matrix Csub of C 75 | float* Csub = GetSubMatrix(C, blockRow, blockCol, BLOCK_SIZE, DIM); 76 | 77 | // Each thread computes one element of Csub 78 | // by accumulating results into Cvalue 79 | float Cvalue = 0; 80 | 81 | // Thread row and column within Csub 82 | int row = threadIdx.y; 83 | int col = threadIdx.x; 84 | 85 | // Loop over all the sub-matrices of A and B that are 86 | // required to compute Csub 87 | // Multiply each pair of sub-matrices together 88 | // and accumulate the results 89 | for (int m = 0; m < (DIM / BLOCK_SIZE); ++m) { 90 | 91 | // Get sub-matrix Asub of A 92 | half* Asub = GetSubMatrix(A, blockRow, m, BLOCK_SIZE, DIM); 93 | 94 | // Get sub-matrix Bsub of B 95 | half* Bsub = GetSubMatrix(B, m, blockCol, BLOCK_SIZE, DIM); 96 | 97 | // Shared memory used to store Asub and Bsub respectively 98 | __shared__ half As[BLOCK_SIZE][BLOCK_SIZE]; 99 | __shared__ half Bs[BLOCK_SIZE][BLOCK_SIZE]; 100 | 101 | // Load Asub and Bsub from device memory to shared memory 102 | // Each thread loads one element of each sub-matrix 103 | 
As[row][col] = GetElement(Asub, row, col, DIM); 104 | Bs[row][col] = GetElement(Bsub, row, col, DIM); 105 | 106 | // Synchronize to make sure the sub-matrices are loaded 107 | // before starting the computation 108 | __syncthreads(); 109 | // Multiply Asub and Bsub together 110 | for (int e = 0; e < BLOCK_SIZE; ++e) 111 | Cvalue += (float)As[row][e] * (float)Bs[e][col]; 112 | 113 | // Synchronize to make sure that the preceding 114 | // computation is done before loading two new 115 | // sub-matrices of A and B in the next iteration 116 | __syncthreads(); 117 | } 118 | 119 | // Write Csub to device memory 120 | // Each thread writes one element 121 | SetElement(Csub, row, col, Cvalue, DIM); 122 | } 123 | 124 | // Matrix multiplication - Host code 125 | // Matrix dimensions are assumed to be multiples of BLOCK_SIZE 126 | template 127 | static void MatMul(const Matrix A, const Matrix B, Matrix C, const K& kernel, dim3 grid, dim3 block) 128 | { 129 | // Load A and B to device memory 130 | Matrix d_A; 131 | d_A.width = d_A.stride = A.width; d_A.height = A.height; 132 | size_t size = A.width * A.height * sizeof(half); 133 | cudaMalloc(&d_A.elements, size); 134 | cudaMemcpy(d_A.elements, A.elements, size, 135 | cudaMemcpyHostToDevice); 136 | Matrix d_B; 137 | d_B.width = d_B.stride = B.width; d_B.height = B.height; 138 | size = B.width * B.height * sizeof(half); 139 | cudaMalloc(&d_B.elements, size); 140 | cudaMemcpy(d_B.elements, B.elements, size, 141 | cudaMemcpyHostToDevice); 142 | 143 | // Allocate C in device memory 144 | Matrix d_C; 145 | d_C.width = d_C.stride = C.width; d_C.height = C.height; 146 | size = C.width * C.height * sizeof(float); 147 | cudaMalloc(&d_C.elements, size); 148 | 149 | cudaEvent_t start, end; 150 | cudaEventCreate(&start); 151 | cudaEventCreate(&end); 152 | 153 | // Invoke kernel 154 | cudaEventRecord(start); 155 | kernel<<>>(d_A, d_B, d_C); 156 | cudaEventRecord(end); 157 | 158 | cudaEventSynchronize(end); 159 | 160 | float ms; 161 | cudaEventElapsedTime(&ms, start, end); 162 | std::cout << "Matrix multiplication took: " << ms << " ms\n" << std::endl; 163 | 164 | // Read C from device memory 165 | cudaMemcpy(C.elements, d_C.elements, size, 166 | cudaMemcpyDeviceToHost); 167 | 168 | // Free device memory 169 | cudaFree(d_A.elements); 170 | cudaFree(d_B.elements); 171 | cudaFree(d_C.elements); 172 | } 173 | 174 | template 175 | static void MatMulConv(Matrix A, Matrix B, Matrix C) 176 | { 177 | dim3 block(BLOCK_SIZE, BLOCK_SIZE); 178 | dim3 grid(B.width / block.x, A.height / block.y); 179 | samplesutil::MatMul(A, B, C, samplesutil::MatMulKernel, grid, block); 180 | } 181 | 182 | static __host__ void prepareRandomNumbersCPUGPU(unsigned int N, std::vector& vals, float** dValsPtr) 183 | { 184 | constexpr float target = 42.f; 185 | // Print expected value, because reference may be off due to floating point (im-)precision 186 | std::cout << "\nExpected value: " << target * N << "\n" << std::endl; 187 | 188 | // Generate a few random inputs to accumulate 189 | std::default_random_engine eng(0xcaffe); 190 | std::normal_distribution dist(target); 191 | vals.resize(N); 192 | std::for_each(vals.begin(), vals.end(), [&dist, &eng](float& f) { f = dist(eng); }); 193 | 194 | // Allocate some global GPU memory to write the inputs to 195 | cudaMalloc((void**)dValsPtr, sizeof(float) * N); 196 | // Expliclity copy the inputs from the CPU to the GPU 197 | cudaMemcpy(*dValsPtr, vals.data(), sizeof(float) * N, cudaMemcpyHostToDevice); 198 | } 199 | 200 | // Define an unsigned integer 
variable that the GPU can work with 201 | __device__ unsigned int step = 0; 202 | 203 | // Increment the GPU variable N times. Whenever a thread observes 204 | // non-consecutive numbers, it prints the latest sequence. Hence, 205 | // every thread documents the turns that it was given by the 206 | // scheduler. 207 | static __device__ void takeNTurns(const char* who, unsigned int N) 208 | { 209 | int lastTurn = -42, turn, start; 210 | for (int i = 0; i < N; i++) 211 | { 212 | turn = atomicInc(&step, 0xFFFFFFFFU); 213 | 214 | bool switchOccurred = (lastTurn != (turn - 1)); 215 | bool done = (i == (N - 1)); 216 | 217 | if (done || ((i > 0) && switchOccurred)) 218 | printf("%s: %d--%d\n", who, start, lastTurn + (done ? 1 : 0)); 219 | 220 | if (switchOccurred) 221 | start = turn; 222 | 223 | lastTurn = turn; 224 | } 225 | } 226 | 227 | static __global__ void testScheduling(int N) 228 | { 229 | if (threadIdx.x < 2) // Branch once 230 | if (threadIdx.x == 0) // Branch again 231 | takeNTurns("Thread 1", N); 232 | else 233 | takeNTurns("Thread 2", N); 234 | else 235 | if (threadIdx.x == 2) // Branch again 236 | takeNTurns("Thread 3", N); 237 | else 238 | takeNTurns("Thread 4", N); 239 | } 240 | 241 | static __host__ void run2NestedBranchesForNSteps(int N) 242 | { 243 | testScheduling<<<1, 4>>>(N); 244 | cudaDeviceSynchronize(); 245 | } 246 | 247 | /* 248 | Computes whether a given number is a "happy number". 249 | https://en.wikipedia.org/wiki/Happy_number 250 | */ 251 | static __host__ __device__ bool isHappy(unsigned int num) 252 | { 253 | while (num != 0 && num != 1 && num != 4) 254 | { 255 | unsigned int next_num = 0; 256 | for (unsigned int n = num; n > 0; n /= 10) 257 | { 258 | unsigned int t = n % 10; 259 | next_num += t * t; 260 | } 261 | num = next_num; 262 | } 263 | return num == 1; 264 | } 265 | } 266 | 267 | #endif -------------------------------------------------------------------------------- /08_Reductions/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "../../shared/include/utility.h" 11 | 12 | // Declare a GPU-visible floating point variable in global memory. 13 | __device__ float dResult; 14 | 15 | /* 16 | The most basic reduction kernel uses atomic operations to accumulate 17 | the individual inputs in a single, device-wide visible variable. 18 | If you have experience with atomics, it is important to note that the 19 | basic atomicXXX instructions of CUDA have RELAXED semantics (scary!). 20 | That means, the threads that operate atomically on them only agree that 21 | there is a particular order for the accesses to that variable and nothing 22 | else (especially no acquire/release semantics). 23 | */ 24 | __global__ void reduceAtomicGlobal(const float* __restrict input, int N) 25 | { 26 | const int id = threadIdx.x + blockIdx.x * blockDim.x; 27 | /* 28 | Since all blocks must have the same number of threads, 29 | we may have to launch more threads than there are 30 | inputs. Superfluous threads should not try to read 31 | from the input (out of bounds access!) 32 | */ 33 | if (id < N) 34 | atomicAdd(&dResult, input[id]); 35 | } 36 | 37 | /* 38 | First improvement: shared memory is much faster than global 39 | memory. Each block can accumulate partial results in isolated 40 | block-wide visible memory. This relieves the contention on 41 | a single global variable that all threads want access to. 
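   Either way, the kernels in this file accumulate into the device symbol dResult
   declared at the top. Host code cannot dereference that symbol directly; main()
   below resets it and reads it back through the symbol-copy API, roughly like this:

       float result = 0.0f;
       cudaMemcpyToSymbol(dResult, &result, sizeof(float));      // reset before a run
       reduceAtomicGlobal<<<(N + 255) / 256, 256>>>(dValsPtr, N);
       cudaMemcpyFromSymbol(&result, dResult, sizeof(float));    // read back (synchronizes)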
42 | */ 43 | __global__ void reduceAtomicShared(const float* __restrict input, int N) 44 | { 45 | const int id = threadIdx.x + blockIdx.x * blockDim.x; 46 | 47 | // Declare a shared float for each block 48 | __shared__ float x; 49 | 50 | // Only one thread should initialize this shared value 51 | if (threadIdx.x == 0) 52 | x = 0.0f; 53 | 54 | /* 55 | Before we continue, we must ensure that all threads 56 | can see this update (initialization) by thread 0 57 | */ 58 | __syncthreads(); 59 | 60 | /* 61 | Every thread in the block adds its input to the 62 | shared variable of the block. 63 | */ 64 | if (id < N) 65 | atomicAdd(&x, input[id]); 66 | 67 | // Wait until all threads have done their part 68 | __syncthreads(); 69 | 70 | /* 71 | Once they are all done, only one thread must add 72 | the block's partial result to the global variable. 73 | */ 74 | if (threadIdx.x == 0) 75 | atomicAdd(&dResult, x); 76 | } 77 | 78 | /* 79 | Second improvement: choosing a more suitable algorithm. 80 | We can exploit the fact that the GPU is massively parallel 81 | and come up with a fitting procedure that uses multiple 82 | iterations. In each iteration, threads accumulate partial 83 | results from the previous iteration. Before, the contented 84 | accesses to one location forced the GPU to perform updates 85 | sequentially O(N). Now, each thread can access its own, 86 | exclusive shared variable in each iteration in parallel, 87 | giving an effective runtime that is closer to O(log N). 88 | */ 89 | template 90 | __global__ void reduceShared(const float* __restrict input, int N) 91 | { 92 | const int id = threadIdx.x + blockIdx.x * blockDim.x; 93 | 94 | /* 95 | Use a larger shared memory region so that each 96 | thread can store its own partial results 97 | */ 98 | __shared__ float data[BLOCK_SIZE]; 99 | /* 100 | Use a new strategy to handle superfluous threads. 101 | To make sure they stay alive and can help with 102 | the reduction, threads without an input simply 103 | produce a '0', which has no effect on the result. 104 | */ 105 | data[threadIdx.x] = (id < N ? input[id] : 0); 106 | 107 | /* 108 | log N iterations to complete. In each step, a thread 109 | accumulates two partial values to form the input for 110 | the next iteration. The sum of all partial results 111 | eventually yields the full result of the reduction. 112 | */ 113 | for (int s = blockDim.x / 2; s > 0; s /= 2) 114 | { 115 | /* 116 | In each iteration, we must make sure that all 117 | threads are done writing the updates of the 118 | previous iteration / the initialization. 119 | */ 120 | __syncthreads(); 121 | if (threadIdx.x < s) 122 | data[threadIdx.x] += data[threadIdx.x + s]; 123 | } 124 | 125 | /* 126 | Note: thread 0 is the last thread to combine two 127 | partial results, and the one who writes to global 128 | memory, therefore no synchronization is required 129 | after the last iteration. 130 | */ 131 | if (threadIdx.x == 0) 132 | atomicAdd(&dResult, data[0]); 133 | } 134 | 135 | /* 136 | Warp-level improvement: using warp-level primitives to 137 | accelerate the final steps of the reduction. Warps 138 | have a fast lane for communication. They are free 139 | to exchange values in registers when they are being 140 | scheduled for execution. Warps will be formed from 141 | consecutive threads in groups of 32. 
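   As an aside (not the code used below): the last-32 step is often written with
   __shfl_down_sync instead, where each lane pulls the register of a higher lane.
   After this loop, lane 0 holds the warp-wide sum; the remaining lanes only hold
   partial sums.

       __device__ float warpSum(float x)
       {
           for (int offset = 16; offset > 0; offset /= 2)
               x += __shfl_down_sync(0xFFFFFFFF, x, offset);
           return x;
       }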
142 | */ 143 | template 144 | __global__ void reduceShuffle(const float* __restrict input, int N) 145 | { 146 | const int id = threadIdx.x + blockIdx.x * blockDim.x; 147 | 148 | __shared__ float data[BLOCK_SIZE]; 149 | data[threadIdx.x] = (id < N ? input[id] : 0); 150 | 151 | // Only use shared memory until last 32 values 152 | for (int s = blockDim.x / 2; s > 16; s /= 2) 153 | { 154 | __syncthreads(); 155 | if (threadIdx.x < s) 156 | data[threadIdx.x] += data[threadIdx.x + s]; 157 | } 158 | 159 | // The last 32 values can be handled with warp-level primitives 160 | float x = data[threadIdx.x]; 161 | if (threadIdx.x < 32) 162 | { 163 | /* 164 | The threads in the first warp shuffle their registers. 165 | This replaces the last 5 iterations of the previous solution. 166 | The mask indicates which threads participate in the shuffle. 167 | The value indicates which register should be shuffled. 168 | The final parameter gives the source thread from which the 169 | current one should receive the shuffled value. Accesses that 170 | are out of range (>= 32) will wrap around, but are not needed 171 | (they will not affect the final result written by thread 0). 172 | In each shuffle, at least half of the threads only participate 173 | so they can provide useful data from the previous shuffle for 174 | lower threads. To keep the code short, we always let all threads 175 | participate, because it is an error to let threads reach a shuffle 176 | instruction that they don't participate in. 177 | */ 178 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 16); 179 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 8); 180 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 4); 181 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 2); 182 | x += __shfl_sync(0xFFFFFFFF, x, 1); 183 | } 184 | 185 | if (threadIdx.x == 0) 186 | atomicAdd(&dResult, x); 187 | } 188 | 189 | /* 190 | Final improvement: half of our threads actually idle after 191 | they have loaded data from global memory to shared! Better 192 | to have threads fetch two values at the start and then let 193 | them all do at least some meaningful work. This means that 194 | compared to all other methods, only half the number of 195 | threads must be launched in the grid! 196 | */ 197 | template 198 | __global__ void reduceFinal(const float* __restrict input, int N) 199 | { 200 | const int id = threadIdx.x + blockIdx.x * blockDim.x; 201 | 202 | __shared__ float data[BLOCK_SIZE]; 203 | // Already combine two values upon load from global memory. 204 | data[threadIdx.x] = id < N / 2 ? input[id] : 0; 205 | data[threadIdx.x] += id + N/2 < N ? input[id + N / 2] : 0; 206 | 207 | for (int s = blockDim.x / 2; s > 16; s /= 2) 208 | { 209 | __syncthreads(); 210 | if (threadIdx.x < s) 211 | data[threadIdx.x] += data[threadIdx.x + s]; 212 | } 213 | 214 | float x = data[threadIdx.x]; 215 | if (threadIdx.x < 32) 216 | { 217 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 16); 218 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 8); 219 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 4); 220 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 2); 221 | x += __shfl_sync(0xFFFFFFFF, x, 1); 222 | } 223 | 224 | if (threadIdx.x == 0) 225 | atomicAdd(&dResult, x); 226 | } 227 | 228 | int main() 229 | { 230 | std::cout << "==== Sample 08 - Reductions ====\n" << std::endl; 231 | /* 232 | Expected output: Accumulated results from CPU and GPU that 233 | approach 42 * NUM_ITEMS (can vary greatly due to floating point 234 | precision limitations). 
235 | 236 | With more sophisticated techniques, reported performance of the 237 | GPU versions (measured runtime in ms) should generally decrease. 238 | */ 239 | 240 | constexpr unsigned int BLOCK_SIZE = 256; 241 | constexpr unsigned int WARMUP_ITERATIONS = 10; 242 | constexpr unsigned int TIMING_ITERATIONS = 20; 243 | constexpr unsigned int N = 100'000'000; 244 | 245 | std::cout << "Producing random inputs...\n" << std::endl; 246 | // Generate some random numbers to reduce 247 | std::vector vals; 248 | float* dValsPtr; 249 | samplesutil::prepareRandomNumbersCPUGPU(N, vals, &dValsPtr); 250 | 251 | std::cout << "==== CPU Reduction ====\n" << std::endl; 252 | // A reference value is computed by sequential reduction 253 | std::cout << "Computed CPU value: " << std::accumulate(vals.cbegin(), vals.cend(), 0.0f) << std::endl; 254 | 255 | std::cout << "==== GPU Reductions ====\n" << std::endl; 256 | /* 257 | Set up a collection of reductions to evaluate for performance. 258 | Each entry gives a technique's name, the kernel to call, and 259 | the number of threads required for each individual technique. 260 | */ 261 | const std::tuple reductionTechniques[] 262 | { 263 | {"Atomic Global", reduceAtomicGlobal, N}, 264 | {"Atomic Shared", reduceAtomicShared, N}, 265 | {"Reduce Shared", reduceShared, N}, 266 | {"Reduce Shuffle", reduceShuffle, N}, 267 | {"Reduce Final", reduceFinal, N / 2 + 1} 268 | }; 269 | 270 | // Evaluate each technique separately 271 | for (const auto& [name, func, numThreads] : reductionTechniques) 272 | { 273 | // Compute the smallest grid to start required threads with a given block size 274 | const dim3 blockDim = { BLOCK_SIZE, 1, 1 }; 275 | const dim3 gridDim = { (numThreads + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1 }; 276 | 277 | // Run several reductions for GPU to warm up 278 | for (int i = 0; i < WARMUP_ITERATIONS; i++) 279 | func<<>>(dValsPtr, N); 280 | 281 | // Synchronize to ensure CPU only records time after warmup is done 282 | cudaDeviceSynchronize(); 283 | const auto before = std::chrono::system_clock::now(); 284 | 285 | float result = 0.0f; 286 | // Run several iterations to get an average measurement 287 | for (int i = 0; i < TIMING_ITERATIONS; i++) 288 | { 289 | // Reset acummulated result to 0 in each run 290 | cudaMemcpyToSymbol(dResult, &result, sizeof(float)); 291 | func<<>>(dValsPtr, N); 292 | } 293 | 294 | // cudaMemcpyFromSymbol will implicitly synchronize CPU and GPU 295 | cudaMemcpyFromSymbol(&result, dResult, sizeof(float)); 296 | 297 | // Can measure time without an extra synchronization 298 | const auto after = std::chrono::system_clock::now(); 299 | const auto elapsed = 1000.f * std::chrono::duration_cast>(after - before).count(); 300 | std::cout << std::setw(20) << name << "\t" << elapsed / TIMING_ITERATIONS << "ms \t" << result << std::endl; 301 | } 302 | 303 | // Free the allocated memory for input 304 | cudaFree(dValsPtr); 305 | return 0; 306 | } 307 | 308 | /* 309 | Exercises: 310 | 1) Change the program so that the methods reduce integer values instead of float. 311 | Can you observe any difference in terms of speed / computed results? 312 | 2) Do you have any other ideas how the reduction could be improved? 313 | Making it even faster should be quite challenging, but if you have 314 | some suggestions, try them out and see how they affect performance! 315 | */ --------------------------------------------------------------------------------
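A final aside on exercise 2 of the reductions sample: a library reduction makes a useful
performance baseline for the hand-written kernels. This is only a sketch (it assumes the
Thrust headers that ship with the CUDA toolkit) and reuses the dValsPtr / N prepared above:

    #include <thrust/reduce.h>
    #include <thrust/execution_policy.h>

    float reduceWithThrust(const float* dValsPtr, int N)
    {
        // Thrust picks its own launch configuration; compare its runtime and result
        // against the kernels timed in main() above.
        return thrust::reduce(thrust::device, dValsPtr, dValsPtr + N, 0.0f);
    }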