├── 16_TensorCores ├── src │ └── main.cu └── CMakeLists.txt ├── 00_MyGPU ├── CMakeLists.txt └── src │ └── main.cu ├── 01_HelloGPU ├── CMakeLists.txt └── src │ └── main.cu ├── 06_MemoryBasics ├── CMakeLists.txt └── src │ └── main.cu ├── 14_ManagedMemory ├── CMakeLists.txt └── src │ └── main.cu ├── 03_GridConfiguration ├── CMakeLists.txt └── src │ └── main.cu ├── 02_HostDeviceFunctions ├── CMakeLists.txt └── src │ └── main.cu ├── 09_Streams ├── CMakeLists.txt └── src │ └── main.cu ├── 10_Events ├── CMakeLists.txt └── src │ └── main.cu ├── 15_GraphAPI ├── CMakeLists.txt └── src │ └── main.cu ├── 08_Reductions ├── CMakeLists.txt └── src │ └── main.cu ├── 11_PinnedMemory ├── CMakeLists.txt └── src │ └── main.cu ├── 13_MemoryFences ├── CMakeLists.txt └── src │ └── main.cu ├── 12_ErrorHandling ├── CMakeLists.txt └── src │ └── main.cu ├── 07_Synchronization ├── CMakeLists.txt └── src │ └── main.cu ├── 17_CooperativeGroups ├── CMakeLists.txt └── src │ └── main.cu ├── .gitignore ├── README.md ├── 18_StandardLibrary ├── CMakeLists.txt └── src │ └── main.cu ├── 04_LegacyScheduling ├── CMakeLists.txt └── src │ └── main.cu ├── 05_IndependentThreadScheduling ├── CMakeLists.txt └── src │ └── main.cu ├── CMakeLists.txt └── shared └── include └── utility.h /16_TensorCores/src/main.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CUDA-Tutorial/CodeSamples/HEAD/16_TensorCores/src/main.cu -------------------------------------------------------------------------------- /00_MyGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(00_MyGPU 2 | src/main.cu 3 | ) 4 | 5 | target_include_directories(00_MyGPU PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /01_HelloGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(01_HelloGPU 2 | src/main.cu 3 | ) 4 | 5 | target_include_directories(01_HelloGPU PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /06_MemoryBasics/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(06_MemoryBasics 2 | src/main.cu 3 | ) 4 | 5 | target_include_directories(06_MemoryBasics PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /14_ManagedMemory/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(14_ManagedMemory 2 | src/main.cu 3 | ) 4 | 5 | target_include_directories(14_ManagedMemory PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /03_GridConfiguration/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(03_GridConfiguration 2 | src/main.cu 3 | ) 4 | 5 | target_include_directories(03_GridConfiguration PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /02_HostDeviceFunctions/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(02_HostDeviceFunctions 2 | src/main.cu 3 | ) 4 | 5 | target_include_directories(02_HostDeviceFunctions PRIVATE 
${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /09_Streams/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(09_Streams 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(09_Streams PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /10_Events/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(10_Events 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(10_Events PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /15_GraphAPI/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(15_GraphAPI 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(15_GraphAPI PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /08_Reductions/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(08_Reductions 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(08_Reductions PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /11_PinnedMemory/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(11_PinnedMemory 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(11_PinnedMemory PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /13_MemoryFences/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(13_MemoryFences 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(13_MemoryFences PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /12_ErrorHandling/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(12_ErrorHandling 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(12_ErrorHandling PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /07_Synchronization/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(07_Synchronization 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(07_Synchronization PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -------------------------------------------------------------------------------- /17_CooperativeGroups/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(17_CooperativeGroups 2 | ../../shared/include/utility.h 3 | src/main.cu 4 | ) 5 | 6 | target_include_directories(17_CooperativeGroups PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | CMakeLists.txt.user 2 | CMakeCache.txt 3 | CMakeFiles 4 | CMakeScripts 5 | Testing 6 | Makefile 7 | cmake_install.cmake 8 | install_manifest.txt 9 | compile_commands.json 10 | CTestTestfile.cmake 11 | _deps 12 | build 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CUDA Tutorial Code Samples 2 | 3 | Here we provide the codebase for samples that accompany the tutorial "CUDA and Applications to Task-based Programming". 4 | 5 | Requirements: 6 | 7 | * Recent Clang/GCC/Microsoft Visual C++ 8 | * CMake 3.20 (**Ubuntu** users please update!) 9 | * CUDA capable GPU with compute capability 5.2 or later 10 | * CUDA Toolkit 9.0 or later 11 | 12 | Recommended: 13 | 14 | * GCC 10/Microsoft Visual C++ 2019 or later 15 | * Nsight Systems 16 | * Nsight Compute 17 | * CUDA capable GPU with compute capability 7.0 or later 18 | * CUDA Toolkit 11.0 or later -------------------------------------------------------------------------------- /18_StandardLibrary/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(ARCHES_SECURED ${CMAKE_CUDA_ARCHITECTURES}) 2 | set(CMAKE_CUDA_ARCHITECTURES "") 3 | 4 | set(ITS_ARCH_FOUND FALSE) 5 | foreach(ARCH ${ARCHES_SECURED}) 6 | if (ARCH GREATER_EQUAL 70) 7 | set(ITS_ARCH_FOUND TRUE) 8 | list(APPEND CMAKE_CUDA_ARCHITECTURES ${ARCH}) 9 | endif() 10 | endforeach() 11 | 12 | if(${ITS_ARCH_FOUND}) 13 | add_executable(18_StandardLibrary 14 | src/main.cu 15 | ) 16 | target_include_directories(18_StandardLibrary PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 17 | else() 18 | message("Not building sample 18 - CMAKE_CUDA_ARCHITECTURES do not include ITS support (>=70)") 19 | message("To define more than one architecture, separate them with semicolon (e.g., 60;70)") 20 | endif() -------------------------------------------------------------------------------- /04_LegacyScheduling/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(ARCHES_SECURED ${CMAKE_CUDA_ARCHITECTURES}) 2 | set(CMAKE_CUDA_ARCHITECTURES "") 3 | 4 | set(LEGACY_ARCH_FOUND FALSE) 5 | foreach(ARCH ${ARCHES_SECURED}) 6 | if (ARCH LESS_EQUAL 60) 7 | set(LEGACY_ARCH_FOUND TRUE) 8 | list(APPEND CMAKE_CUDA_ARCHITECTURES ${ARCH}) 9 | endif() 10 | endforeach() 11 | 12 | if(${LEGACY_ARCH_FOUND}) 13 | add_executable(04_LegacyScheduling 14 | src/main.cu 15 | ../shared/include/utility.h 16 | ) 17 | target_include_directories(04_LegacyScheduling PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 18 | else() 19 | message("Not building sample 04 - CMAKE_CUDA_ARCHITECTURES do not include legacy (<=60)") 20 | message("To define more than one architecture, separate them with semicolon (e.g., 60;70)") 21 | endif() -------------------------------------------------------------------------------- /05_IndependentThreadScheduling/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(ARCHES_SECURED ${CMAKE_CUDA_ARCHITECTURES}) 2 | set(CMAKE_CUDA_ARCHITECTURES "") 3 | 4 | set(ITS_ARCH_FOUND FALSE) 5 | foreach(ARCH ${ARCHES_SECURED}) 6 | if (ARCH GREATER_EQUAL 70) 7 | set(ITS_ARCH_FOUND TRUE) 8 | list(APPEND CMAKE_CUDA_ARCHITECTURES ${ARCH}) 9 | endif() 10 | endforeach() 11 | 12 | 
if(${ITS_ARCH_FOUND}) 13 | add_executable(05_IndependentThreadScheduling 14 | src/main.cu 15 | ../shared/include/utility.h 16 | ) 17 | target_include_directories(05_IndependentThreadScheduling PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 18 | else() 19 | message("Not building sample 05 - CMAKE_CUDA_ARCHITECTURES do not include ITS support (>=70)") 20 | message("To define more than one architecture, separate them with semicolon (e.g., 60;70)") 21 | endif() -------------------------------------------------------------------------------- /01_HelloGPU/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void HelloGPU() 5 | { 6 | // Print a simple message from the GPU 7 | printf("Hello from the GPU!\n"); 8 | } 9 | 10 | int main() 11 | { 12 | std::cout << "==== Sample 01 - Hello GPU ====\n" << std::endl; 13 | // Expected output: 12x "Hello from the GPU!\n" 14 | 15 | // Launch a kernel with 1 block that has 12 threads 16 | HelloGPU<<<1, 12>>>(); 17 | 18 | /* 19 | Synchronize with GPU to wait for printf to finish. 20 | Results of printf are buffered and copied back to 21 | the CPU for I/O after the kernel has finished. 22 | */ 23 | cudaDeviceSynchronize(); 24 | return 0; 25 | } 26 | 27 | /* 28 | Exercises: 29 | 1) Change the message that is printed by the kernel 30 | 2) Write a different kernel (different name, different message) 31 | 3) Call the different kernels multiple times 32 | */ -------------------------------------------------------------------------------- /16_TensorCores/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(ARCHES_SECURED ${CMAKE_CUDA_ARCHITECTURES}) 2 | set(CMAKE_CUDA_ARCHITECTURES "") 3 | 4 | set(ITS_ARCH_FOUND FALSE) 5 | foreach(ARCH ${ARCHES_SECURED}) 6 | if (ARCH GREATER_EQUAL 70) 7 | set(ITS_ARCH_FOUND TRUE) 8 | list(APPEND CMAKE_CUDA_ARCHITECTURES ${ARCH}) 9 | endif() 10 | endforeach() 11 | 12 | if(${ITS_ARCH_FOUND}) 13 | add_executable(16_TensorCores 14 | src/main.cu 15 | ../shared/include/utility.h 16 | ) 17 | target_include_directories(16_TensorCores PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 18 | if (MSVC) 19 | target_link_libraries(16_TensorCores cublas.lib) 20 | else() 21 | target_link_libraries(16_TensorCores cublas) 22 | endif() 23 | else() 24 | message("Not building sample 16 - CMAKE_CUDA_ARCHITECTURES do not include ITS support (>=70)") 25 | message("To define more than one architecture, separate them with semicolon (e.g., 60;70)") 26 | endif() -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | set(CMAKE_CXX_STANDARD 17) 4 | set(CMAKE_CXX_EXTENSIONS OFF) 5 | set(CMAKE_CUDA_STANDARD 17) 6 | 7 | project(TutorialSamples LANGUAGES CUDA CXX) 8 | 9 | add_subdirectory(00_MyGPU) 10 | add_subdirectory(01_HelloGPU) 11 | add_subdirectory(02_HostDeviceFunctions) 12 | add_subdirectory(03_GridConfiguration) 13 | add_subdirectory(04_LegacyScheduling) 14 | add_subdirectory(05_IndependentThreadScheduling) 15 | add_subdirectory(06_MemoryBasics) 16 | add_subdirectory(07_Synchronization) 17 | add_subdirectory(08_Reductions) 18 | add_subdirectory(09_Streams) 19 | add_subdirectory(10_Events) 20 | add_subdirectory(11_PinnedMemory) 21 | add_subdirectory(12_ErrorHandling) 22 | add_subdirectory(13_MemoryFences) 23 | add_subdirectory(14_ManagedMemory) 24 | 
add_subdirectory(15_GraphAPI) 25 | add_subdirectory(16_TensorCores) 26 | add_subdirectory(17_CooperativeGroups) 27 | add_subdirectory(18_StandardLibrary) 28 | 29 | -------------------------------------------------------------------------------- /04_LegacyScheduling/src/main.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime_api.h> 2 | #include <iostream> 3 | #include "../../shared/include/utility.h" 4 | 5 | int main() 6 | { 7 | std::cout << "==== Sample 04 - Legacy Thread Scheduling ====\n" << std::endl; 8 | /* 9 | This code will launch a particular test kernel. 10 | It will launch 4 threads in total. 11 | The program code is structured such that each 12 | thread enters one of 4 possible branches and then 13 | atomically increments a GPU variable N times: 14 | 15 | .---- N operations by Thread 0 16 | ----X 17 | / '---- N operations by Thread 1 18 | ----X 19 | \ .---- N operations by Thread 2 20 | ----X 21 | '---- N operations by Thread 3 22 | 23 | Each thread will document consecutive ranges of 24 | values it observed for the incremented variable. 25 | Basically, this will give us an idea how threads 26 | take turns running in this branching scenario. 27 | 28 | Expected output: 4 consecutive ranges, one for 29 | each thread, taking 128 consecutive turns until 30 | they have completed their N steps. 31 | 32 | Disclaimer: behavior depends somewhat on the compiler's 33 | effort to optimize code. Results may vary. 34 | */ 35 | 36 | constexpr int N = 128; 37 | // Using a utility function for demonstration 38 | samplesutil::run2NestedBranchesForNSteps(N); 39 | return 0; 40 | } 41 | -------------------------------------------------------------------------------- /03_GridConfiguration/src/main.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime_api.h> 2 | #include <iostream> 3 | 4 | __global__ void PrintIDs() 5 | { 6 | // Use built-in variables blockIdx and threadIdx 7 | const auto tID = threadIdx; 8 | const auto bID = blockIdx; 9 | printf("Block Id: %d,%d - Thread Id: %d,%d\n", bID.x, bID.y, tID.x, tID.y); 10 | } 11 | 12 | int main() 13 | { 14 | std::cout << "==== Sample 03 - Grid Configurations ====\n" << std::endl; 15 | /* 16 | Expected output: 17 | Block IDs and Thread IDs for two separate grids 18 | */ 19 | 20 | std::cout << "Small grid: \n"; 21 | // Configure the grid and block dimensions via built-in struct dim3 (X,Y,Z) 22 | const dim3 gridSize_small{ 1, 1, 1 }; 23 | const dim3 blockSize_small{ 4, 4, 1 }; 24 | 25 | // Launch kernel with custom grid 26 | PrintIDs<<<gridSize_small, blockSize_small>>>(); 27 | 28 | // Need to synchronize here to have the GPU and CPU printouts in the correct order 29 | cudaDeviceSynchronize(); 30 | 31 | std::cout << "\nLarger grid: \n"; 32 | const dim3 gridSize_large{ 2, 2, 1 }; 33 | const dim3 blockSize_large{ 16, 16, 1 }; 34 | PrintIDs<<<gridSize_large, blockSize_large>>>(); 35 | cudaDeviceSynchronize(); 36 | 37 | return 0; 38 | } 39 | 40 | /* 41 | Exercises: 42 | 1) Launch a 1D grid with 2D blocks (e.g., 4 x (4 x 4)) 43 | 2) Launch a 2D grid with 2D blocks (e.g., (4 x 4) x (4 x 4)) 44 | 3) Launch a grid where either the grid or block are 3D 45 | 4) Can you find out what the limits are to how many blocks/threads you can launch? 
46 | */ -------------------------------------------------------------------------------- /05_IndependentThreadScheduling/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "../../shared/include/utility.h" 4 | 5 | int main() 6 | { 7 | std::cout << "==== Sample 05 - Independent Thread Scheduling ====\n" << std::endl; 8 | /* 9 | This code will launch a particular test kernel. 10 | It will launch 4 threads in total. 11 | The program code is structured such that each 12 | thread enters one of 4 possible branches and then 13 | atomically increments a GPU variable N times: 14 | 15 | .---- N operations by Thread 0 16 | ----X 17 | / '---- N operations by Thread 1 18 | ----X 19 | \ .---- N operations by Thread 2 20 | ----X 21 | '---- N operations by Thread 3 22 | 23 | Each thread will document consecutive ranges of 24 | values it observed for the incremented variable. 25 | Basically, this will give us an idea how threads 26 | take turns running in this branching scenario. 27 | 28 | Expected output: Many smaller observed value ranges! 29 | Independent thread scheduling is free to switch 30 | between branches to keep the compute units of the 31 | GPU busy with work. Thus, threads can alternate 32 | taking turns to complete their N steps each. 33 | 34 | Disclaimer: behavior is dependent on scheduling at 35 | runtime. Results may vary. 36 | */ 37 | 38 | constexpr int N = 128; 39 | // Using a utility function for demonstration 40 | samplesutil::run2NestedBranchesForNSteps(N); 41 | return 0; 42 | } 43 | -------------------------------------------------------------------------------- /00_MyGPU/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /* 5 | Before you use your GPU to do work, you should know the 6 | most essential things about its capabilities. 7 | */ 8 | int main() 9 | { 10 | // Count CUDA-capable devices on the system 11 | int numDevices; 12 | cudaGetDeviceCount(&numDevices); 13 | 14 | if (numDevices == 0) 15 | { 16 | std::cout << "You have no CUDA devices available!" << std::endl; 17 | return -1; 18 | } 19 | 20 | // Get the ID of the currently selected active CUDA device 21 | int device; 22 | cudaGetDevice(&device); 23 | 24 | // Fetch its properties 25 | cudaDeviceProp props; 26 | cudaGetDeviceProperties(&props, device); 27 | 28 | /* 29 | We only print the most fundamental properties here. cudaDeviceProp 30 | contains a long range of indicators to check for different things 31 | that your GPU may or may not support, as well as factors for 32 | performance. However, the most essential property to know about is 33 | the compute capability of the device. 34 | */ 35 | std::cout << "Model: " << props.name << std::endl; 36 | std::cout << "Compute capability: " << props.major << "." << props.minor << std::endl; 37 | std::cout << "Memory: " << props.totalGlobalMem / float(1 << 30) << " GiB" << std::endl; 38 | std::cout << "Multiprocessors: " << props.multiProcessorCount << std::endl; 39 | std::cout << "Clock rate: " << props.clockRate / float(1'000'000) << " GHz" << std::endl; 40 | 41 | return 0; 42 | } 43 | 44 | /* 45 | Exercises: 46 | 1) Change the behavior such that the properties are not just printed for one, but all available CUDA devices you have! 47 | (Even if you have just one) 48 | 2) Print a few more interesting properties and read up in the specification what they mean. 
49 | */ 50 | -------------------------------------------------------------------------------- /02_HostDeviceFunctions/src/main.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime_api.h> 2 | #include <iostream> 3 | 4 | // Define a function that will only be compiled for and called from host 5 | __host__ void HostOnly() 6 | { 7 | std::cout << "This function may only be called from the host" << std::endl; 8 | } 9 | 10 | // Define a function that will only be compiled for and called from device 11 | __device__ void DeviceOnly() 12 | { 13 | printf("This function may only be called from the device\n"); 14 | } 15 | 16 | // Define a function that will be compiled for both architectures 17 | __host__ __device__ float SquareAnywhere(float x) 18 | { 19 | return x * x; 20 | } 21 | 22 | // Call device and portable functions from a kernel 23 | __global__ void RunGPU(float x) 24 | { 25 | DeviceOnly(); 26 | printf("%f\n", SquareAnywhere(x)); 27 | } 28 | 29 | /* 30 | Call host and portable functions from a regular host function. 31 | Note that, by default, if a function has no architecture 32 | specified, it is assumed to be __host__ by NVCC. 33 | */ 34 | void RunCPU(float x) 35 | { 36 | HostOnly(); 37 | std::cout << SquareAnywhere(x) << std::endl; 38 | } 39 | 40 | int main() 41 | { 42 | std::cout << "==== Sample 02 - Host / Device Functions ====\n" << std::endl; 43 | /* 44 | Expected output: 45 | "This function may only be called from the host" 46 | 1764 47 | "This function may only be called from the device" 48 | 1764.00 49 | */ 50 | 51 | RunCPU(42); 52 | RunGPU<<<1, 1>>>(42); 53 | cudaDeviceSynchronize(); 54 | return 0; 55 | } 56 | 57 | /* 58 | Exercises: 59 | 1) Write a function that prints a message and can run on both the device and host 60 | 2) Revise the function from 1, such that the CPU version uses std::cout. Use the 61 | __CUDA_ARCH__ macro to write code paths that contain architecture-specific code. 62 | */ -------------------------------------------------------------------------------- /14_ManagedMemory/src/main.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime_api.h> 2 | #include <iostream> 3 | 4 | // Managed variables may be defined like device variables 5 | __managed__ unsigned int mFoo; 6 | 7 | // Print a managed variable 8 | __global__ void PrintFoo() 9 | { 10 | printf("mFoo GPU: %d\n", mFoo); 11 | } 12 | 13 | // Print a managed array of integers 14 | __global__ void PrintBar(const int* mBarPtr, unsigned int numEntries) 15 | { 16 | printf("mBar GPU: "); 17 | for (int i = 0; i < numEntries; i++) 18 | printf("%d%s", mBarPtr[i], (i == numEntries - 1) ? "\n" : ", "); 19 | } 20 | 21 | int main() 22 | { 23 | std::cout << "==== Sample 14 - Managed Memory ====\n" << std::endl; 24 | /* 25 | Managed memory reduces code complexity by decoupling physical 26 | memory location from address range. The CUDA runtime will take 27 | care of moving the memory to the location where it is needed. 28 | No copies are required, but care must be taken for concurrent 29 | access. To avoid performance degradation, managed memory should 30 | be prefetched. 
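        In miniature, the pattern demonstrated in this file is (a sketch only; the
        names numBytes, FillOnCPU and SomeKernel are placeholders, error checks omitted):

            int* mPtr;
            cudaMallocManaged((void**)&mPtr, numBytes);
            FillOnCPU(mPtr);                               // plain CPU writes, no cudaMemcpy needed
            cudaMemPrefetchAsync(mPtr, numBytes, device);  // optional, see concurrent access below
            SomeKernel<<<1, 1>>>(mPtr);
            cudaDeviceSynchronize();                       // before touching mPtr on the CPU again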
31 | 32 | Expected output: 33 | mFoo GPU: 14 34 | mBar GPU: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 35 | mBar CPU: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 36 | 37 | CUDA device does (NOT) support concurrent access 38 | mFoo GPU: 42 39 | */ 40 | 41 | constexpr unsigned int VALUE = 14; 42 | 43 | // We may assign values to managed variables on the CPU 44 | mFoo = VALUE; 45 | // Managed variables can be used without explicit transfer 46 | PrintFoo<<<1,1>>>(); 47 | // Wait for printf output 48 | cudaDeviceSynchronize(); 49 | 50 | // We may also allocate managed memory on demand 51 | int* mBarPtr; 52 | cudaMallocManaged((void**)&mBarPtr, VALUE * sizeof(int)); 53 | // Managed memory can be directly initialized on the CPU 54 | for (int i = 0; i < VALUE; i++) 55 | mBarPtr[i] = i; 56 | 57 | /* 58 | If we know ahead of time where managed memory will be used 59 | and performance is essential, we can prefetch it to the 60 | required location. This basically replaces memcpy. Note 61 | however, that this action requires support for the 62 | concurrentAccess property. Support for concurrent access 63 | is queried via device properties. 64 | */ 65 | 66 | int device; 67 | cudaGetDevice(&device); 68 | 69 | cudaDeviceProp prop; 70 | cudaGetDeviceProperties(&prop, device); 71 | // Report support 72 | std::cout << "\nCUDA device does " << (!prop.concurrentManagedAccess ? "NOT " : "") << "support concurrent access\n"; 73 | 74 | // If we can, we prefetch ahead of time 75 | if(prop.concurrentManagedAccess) 76 | cudaMemPrefetchAsync(mBarPtr, VALUE * sizeof(int), device); 77 | // Launch kernel with managed memory pointer as parameter 78 | PrintBar<<<1,1>>>(mBarPtr, VALUE); 79 | // We may also prefetch it back to the CPU 80 | if (prop.concurrentManagedAccess) 81 | cudaMemPrefetchAsync(mBarPtr, VALUE * sizeof(int), cudaCpuDeviceId); 82 | // Wait for GPU printing and prefetching to finish 83 | cudaDeviceSynchronize(); 84 | 85 | std::cout << "mBar CPU: "; 86 | for (int i = 0; i < VALUE; i++) 87 | std::cout << mBarPtr[i] << (i == VALUE - 1 ? "\n" : ", "); 88 | 89 | /* 90 | Devices may or may not support concurrent access to variables. 91 | If they don't, then the CPU must ensure that access to managed 92 | memory does not overlap with GPU kernel execution, even if the 93 | GPU does not use the managed memory in question. Modifying 94 | a variable on the CPU before a kernel is fine, because the kernel 95 | will only be launched if the CPU is done with prior instructions. 96 | */ 97 | 98 | // Handling access to managed memory, depending on device properties 99 | mFoo = 42; 100 | PrintFoo<<<1, 1>>>(); 101 | 102 | if (!prop.concurrentManagedAccess) 103 | // CPU access to managed memory and GPU execution may not overlap 104 | cudaDeviceSynchronize(); 105 | 106 | // Modify on CPU after / during GPU execution 107 | mBarPtr[0] = 20; 108 | 109 | // Wait for results of printf 110 | cudaDeviceSynchronize(); 111 | 112 | return 0; 113 | } 114 | 115 | /* 116 | Exercises: 117 | 1) Write a program computes the squares of the integers from 1 to 1000 and 118 | stores them to managed memory. Print them on the CPU. 119 | 2) Choose one of the programs you used previously that did quite a bit of copying, 120 | and rewrite it here to use managed memory instead. How does the performance compare? 
121 | */ -------------------------------------------------------------------------------- /18_StandardLibrary/src/main.cu: -------------------------------------------------------------------------------- 1 | #include <cuda/std/atomic> 2 | #include <algorithm> 3 | #include <functional> 4 | #include <iostream> 5 | #include <random> 6 | #include <thread> 7 | #include <vector> 8 | 9 | __managed__ cuda::std::atomic<int> mResultCPU{ 0 }, mResultGPU{ 0 }; 10 | 11 | /* 12 | Basic, architecture-agnostic reduction, using global atomics. 13 | Uses portable cuda::std::atomics. Note that even though this code is portable, 14 | it may not necessarily give the best performance. cuda::std::atomics have 15 | system-wide (CPU + GPU) scope. If an algorithm is sure to run on the GPU, better 16 | performance may be achieved using cuda::atomics, which take an additional parameter 17 | "thread_scope" (e.g., "device" for global, "block" for shared memory atomics). 18 | */ 19 | __host__ __device__ void reduceAtomic(int tId, int numThreads, int N, const int* input, cuda::std::atomic<int>& result) 20 | { 21 | if (tId >= N) 22 | return; 23 | 24 | // Compute input portion to be handled by each thread 25 | int perThread = N / numThreads; 26 | int myStart = perThread * tId; 27 | int myEnd = (tId == numThreads - 1) ? N : myStart + perThread; 28 | 29 | // For each value in the assigned portion, atomically add it to accumulated sum. 30 | for (int i = myStart; i < myEnd; i++) 31 | result.fetch_add(input[i], cuda::std::memory_order_relaxed); 32 | } 33 | 34 | __global__ void completeReductionGPU(int N, const int* input) 35 | { 36 | // Launchpad code for calling the architecture-agnostic reduction function 37 | int tId = blockIdx.x * blockDim.x + threadIdx.x; 38 | reduceAtomic(tId, N, N, input, mResultGPU); 39 | } 40 | 41 | template <unsigned int NUM_THREADS> 42 | __host__ void completeReductionCPU(int N, int* mNumbers) 43 | { 44 | /* 45 | Simple multi-threaded launch for computing the reduction 46 | in parallel. This function also makes sure to join on the 47 | threads. Otherwise, we would have to be careful not to 48 | overlap CPU access to managed memory and GPU execution. 49 | Each thread uses the architecture-agnostic implementation 50 | to compute part of the full reduction. 51 | */ 52 | std::vector<std::thread> threads(NUM_THREADS); 53 | for (int i = 0; i < threads.size(); i++) 54 | threads[i] = std::thread(reduceAtomic, i, NUM_THREADS, N, mNumbers, std::ref(mResultCPU)); 55 | for (std::thread& t : threads) 56 | t.join(); 57 | } 58 | 59 | int main() 60 | { 61 | std::cout << "==== Sample 18 - Standard Library ====\n" << std::endl; 62 | /* 63 | The libcu++ standard library allows us to make code more portable. 64 | Users can use familiar concepts from programming for the CPU and 65 | apply them with minimal changes on the GPU as well. In this 66 | example, we show a method for parallel reduction where the same 67 | reduction function can be executed by a thread on the CPU or the 68 | GPU thanks to the support for std:: atomics. 69 | 70 | We use integers in this example. Float atomics are only part 71 | of the standard in C++20. Should be widely available soon! Once 72 | more, the libcu++ standard library is much more powerful than 73 | we can show with just a few samples. We encourage you to check 74 | out the documentation for libcu++, as well as related talks for 75 | more examples of use cases. 76 | 77 | Expected output: the result of a reduction with random integers, 78 | once computed on the CPU and once on the GPU, both of them 79 | yielding the same accumulated value. 
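        The "thread_scope" atomics mentioned at the top of this file are not used in
        this sample. A minimal sketch of what they look like (dCounter is a placeholder
        name, not part of this sample):

            #include <cuda/atomic>
            __device__ cuda::atomic<int, cuda::thread_scope_device> dCounter{0};  // GPU-only scope
            ...
            dCounter.fetch_add(1, cuda::std::memory_order_relaxed);

        With cuda::thread_scope_block, only threads of the same block are expected to
        access the atomic, which may allow cheaper synchronization than the
        system-wide scope of cuda::std::atomic.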
80 | */ 81 | 82 | // Define the number of inputs to reduce, and number of CPU threads 83 | constexpr int N = 1 << 16, CPUThreads = 4; 84 | 85 | // Allocate managed memory and fill it with random numbers 86 | int* mNumbers; 87 | std::default_random_engine eng(42); 88 | std::uniform_int_distribution<int> dist(10, 42); 89 | cudaMallocManaged((void**)&mNumbers, sizeof(int) * N); 90 | std::for_each(mNumbers, mNumbers + N, [&dist, &eng](int& v) { v = dist(eng); }); 91 | 92 | // Launch the reduction with the given number of CPU threads 93 | completeReductionCPU<CPUThreads>(N, mNumbers); 94 | // Launch the reduction on the GPU with as many threads as there are inputs 95 | completeReductionGPU<<<(N + 255) / 256, 256>>>(N, mNumbers); 96 | cudaDeviceSynchronize(); 97 | 98 | // Output both results 99 | std::cout << "Reduction result CPU: " << mResultCPU << "\n" << std::endl; 100 | std::cout << "Reduction result GPU: " << mResultGPU << "\n" << std::endl; 101 | 102 | // Finally, release the managed memory for the inputs 103 | cudaFree(mNumbers); 104 | } 105 | 106 | /* 107 | Exercises: 108 | 1) The CUDA standard library is continuously being expanded. Check out 109 | their documentation and use an include for one of the recent features 110 | and demonstrate it. 111 | 2) Write a simple kernel with a single block that frequently updates a 112 | single cuda::atomic variable. For performance reasons, it should be one with 113 | thread_scope "block". 114 | 3) Try to show that there is a performance difference in 2) between using 115 | the default cuda::std::atomic and the cuda::atomic with block thread_scope. 116 | */ -------------------------------------------------------------------------------- /07_Synchronization/src/main.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime_api.h> 2 | #include <iostream> 3 | #include "../../shared/include/utility.h" 4 | 5 | __global__ void WriteSlow(int* out, int val) 6 | { 7 | samplesutil::WasteTime(1'000'000'000ULL); 8 | // Finally write value 9 | *out = val; 10 | } 11 | 12 | __global__ void Square(int* out) 13 | { 14 | *out = *out * *out; 15 | } 16 | 17 | __global__ void ApproximatePi(bool synchronized) 18 | { 19 | // Create block-shared variable for approximated Pi 20 | __shared__ float sPi; 21 | // Thread 0 computes Pi and stores it to shared memory 22 | if (threadIdx.x == 0) 23 | sPi = samplesutil::GregoryLeibniz(100'000); 24 | 25 | // Boolean decides whether threads synchronize or not 26 | if (synchronized) 27 | __syncthreads(); 28 | 29 | // Every thread should now perform some task with Pi 30 | if (threadIdx.x%32 == 0) 31 | printf("Thread %d thinks Pi = %f\n", threadIdx.x, sPi); 32 | } 33 | 34 | int main() 35 | { 36 | std::cout << "==== Sample 07 - Synchronization ====\n" << std::endl; 37 | /* 38 | Expected output: 39 | Demonstrating implicit synchronization: 40 | 42 squared = 1764 41 | 42 | No __syncthreads after computing a block-shared Pi: 43 | Thread 32 thinks Pi = 0.000000 44 | Thread 64 thinks Pi = 0.000000 45 | Thread 96 thinks Pi = 0.000000 46 | Thread 0 thinks Pi = 3.141586 47 | (or similar. Results may be correct, but not safe!) 48 | 49 | __syncthreads after computing a block-shared Pi: 50 | Thread 64 thinks Pi = 3.141586 51 | Thread 96 thinks Pi = 3.141586 52 | Thread 0 thinks Pi = 3.141586 53 | Thread 32 thinks Pi = 3.141586 54 | (or similar) 55 | */ 56 | 57 | /* 58 | Implicit synchronization between kernels and cudaMemcpy: 59 | 60 | Consider the example below, where we have two kernels. 
The first 61 | kernel writes some data (slowly), the second modifies that data. 62 | Afterwards, we copy the modified data back to the CPU. By default, 63 | CUDA will assume that each command depends on the previous command 64 | and therefore will implicitly synchronize them: a kernel will only 65 | run when previous kernels have finished, note however that the CPU 66 | is free to continue working in the meantime. Similarly, cudaMemcpy 67 | will only start when all previous kernels have finished, but it 68 | will also make the CPU wait until the copy has finished. Hence, we 69 | don't need any other synchronization in this scenario. 70 | */ 71 | std::cout << "Demonstrating implicit synchronization:" << std::endl; 72 | // Allocate some device memory for kernels to work with 73 | int* dFooPtr; 74 | cudaMalloc(&dFooPtr, sizeof(int)); 75 | // First kernel sets device memory to 42 (slowly) 76 | WriteSlow<<<1,1>>>(dFooPtr, 42); 77 | // Second kernel squares value of variable 78 | Square<<<1,1>>>(dFooPtr); 79 | // Finally, we copy the result back to the CPU 80 | int foo; 81 | cudaMemcpy(&foo, dFooPtr, sizeof(int), cudaMemcpyDeviceToHost); 82 | // Print the result of the GPU's computation 83 | std::cout << "42 squared = " << foo << std::endl; 84 | 85 | /* 86 | Block-wide synchronization with syncthreads: 87 | 88 | The following kernels compute an approximation of Pi. 89 | The algorithm used is inherently sequential, therefore 90 | only one thread performs the communication and then 91 | shares the result with all threads in the block. 92 | However, while one thread is busy performing work, the 93 | other threads in the block are free to move ahead. 94 | With __syncthreads, we force all threads in a block to 95 | wait at a given point in the program until all other 96 | threads get there. 97 | */ 98 | std::cout << "\nNo __syncthreads after computing a block-shared Pi:" << std::endl; 99 | // Run once without syncthreads 100 | ApproximatePi<<<1, 128>>>(false); 101 | // Wait for printf to finish 102 | cudaDeviceSynchronize(); 103 | 104 | std::cout << "\n__syncthreads after computing a block-shared Pi:" << std::endl; 105 | // Run again with syncthreads 106 | ApproximatePi<<<1, 128>>>(true); 107 | // Wait for printf to finish 108 | cudaDeviceSynchronize(); 109 | 110 | return 0; 111 | } 112 | 113 | /* 114 | Exercises: 115 | 1) Try launching a simple CUDA kernel 1000-10000 times in a loop, once 116 | with cudaDeviceSynchronize after each launch, once without it. 117 | What's the effect on runtime? Does anything change about the program behavior? 118 | 2) You can also memcpy from device to device. Perform a few of them (e.g. moving a 119 | value from device location A to device location B to C) and then back to CPU and 120 | confirm that everything happened properly in order. 121 | 3) Try running a kernel where the first 16 threads in each warp take one branch, 122 | the other 16 take the other, with a syncthreads in each branch. What happens? 123 | Why? Document what happens when the first 32 threads in a block of size 64 take 124 | one branch, the other 32 the other, with a syncthreads in each branch. What happens 125 | now? Provide your best guess why. 
126 | */ -------------------------------------------------------------------------------- /12_ErrorHandling/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../../shared/include/utility.h" 7 | 8 | __device__ int dVal = 42; 9 | __device__ int dOut; 10 | 11 | // Very simple kernel that updates a variable 12 | __global__ void CopyVal(const int* val) 13 | { 14 | // Simulating a little work 15 | samplesutil::WasteTime(1'000'000ULL); 16 | // Update a global value 17 | dOut = *val; 18 | } 19 | 20 | void checkForErrors() 21 | { 22 | // Catch errors that can be detected without synchronization, clear them 23 | cudaError_t err; 24 | err = cudaGetLastError(); 25 | if (err == cudaSuccess) 26 | std::cout << "cudaGetLastError() before sync found no error" << std::endl; 27 | else 28 | std::cout << "cudaGetLastError() before sync found error: " << cudaGetErrorName(err) << ", CLEARS ERROR" << std::endl; 29 | 30 | // Catch errors that require explicit synchronization, do not clear them 31 | err = cudaDeviceSynchronize(); 32 | if (err == cudaSuccess) 33 | std::cout << "cudaDeviceSynchronize() found no error" << std::endl; 34 | else 35 | std::cout << "cudaDeviceSynchronize() found error: " << cudaGetErrorName(err) << ", KEEPS ERROR" << std::endl; 36 | 37 | // If errors were found via synchronization, cudaGetLastError clears them 38 | err = cudaGetLastError(); 39 | if (err == cudaSuccess) 40 | std::cout << "cudaGetLastError() after sync found no error" << std::endl; 41 | else 42 | std::cout << "cudaGetLastError() after sync found error: " << cudaGetErrorName(err) << ", CLEARS ERROR" << std::endl; 43 | 44 | std::cout << std::endl; 45 | } 46 | 47 | #define PRINT_RUN_CHECK(S) \ 48 | std::cout << #S << std::endl; \ 49 | S; \ 50 | checkForErrors(); 51 | 52 | int main() 53 | { 54 | std::cout << "==== Sample 12 - Error Handling ====\n" << std::endl; 55 | /* 56 | Many functions in the CUDA API return error codes that indicate 57 | that something has gone wrong. However, this error is not 58 | necessarily caused by the function that returns it. Kernels and 59 | asynchronous memcopies, e.g., return immediately and may only 60 | encounter errors after the return value is observed on the CPU. 61 | Such errors can be detected at some later point, for instance by 62 | a synchronous function like cudaMemcpy or cudaDeviceSynchronize, 63 | or by cudaGetLastError after a synchronization. To ensure that 64 | every single CUDA call worked without error, we would have to 65 | sacrifice concurrency and asynchronicity. Hence, error checking 66 | is, in practice, rather opportunistic and happens e.g. at runtime 67 | when an algorithm is synchronized anyway or when we debug misbehaving 68 | code. The error checking in this code is thus not practical and only 69 | serves to illustrate how different mechanisms detect previous errors. 
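        As an aside, many projects wrap API calls in a small helper so that such
        opportunistic checks stay readable. A minimal sketch (the CUDA_CHECK macro
        below is our own illustration, not part of the CUDA API or of this sample):

            #define CUDA_CHECK(call)                                        \
                do {                                                        \
                    cudaError_t e = (call);                                 \
                    if (e != cudaSuccess)                                   \
                        printf("CUDA error %s at %s:%d\n",                  \
                            cudaGetErrorName(e), __FILE__, __LINE__);       \
                } while (0)

            CUDA_CHECK(cudaDeviceSynchronize());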
70 | 71 | Expected output: 72 | 73 | (CopyVal<<<1, 1>>>(validDAddress)) 74 | cudaGetLastError() before sync found no error 75 | cudaDeviceSynchronize() found no error 76 | cudaGetLastError() after sync found no error 77 | 78 | (CopyVal<<<1, (1<<16)>>>(validDAddress)) 79 | cudaGetLastError() before sync found error: cudaErrorInvalidConfiguration, CLEARS ERROR 80 | cudaDeviceSynchronize() found no error 81 | cudaGetLastError() after sync found no error 82 | 83 | (CopyVal<<<1, 1>>>(nullptr)) 84 | cudaGetLastError() before sync found no error 85 | cudaDeviceSynchronize() found error: cudaErrorIllegalAddress, KEEPS ERROR 86 | cudaGetLastError() after sync found error: cudaErrorIllegalAddress, CLEARS ERROR 87 | 88 | cudaErrorInvalidPc: invalid program counter 89 | */ 90 | 91 | int* validDAddress; 92 | // A function may return an error code - should check those for success 93 | cudaError_t err = cudaGetSymbolAddress((void**)&validDAddress, dVal); 94 | 95 | if (err != cudaSuccess) 96 | // If an error occurred, identify it with cudaGetErrorName and react! 97 | std::cout << cudaGetErrorName(err) << std::endl; 98 | // Alternatively, you may peek at the last error to see if the program is ok 99 | err = cudaPeekAtLastError(); 100 | // Getting the last error effectively resets it. Useful after reacting to it 101 | err = cudaGetLastError(); 102 | 103 | /* 104 | Launching a kernel with proper configuration and parameters. 105 | If the system is set up correctly, this should succeed. 106 | */ 107 | PRINT_RUN_CHECK((CopyVal<<<1, 1>>>(validDAddress))); 108 | 109 | /* 110 | Launching a kernel with bigger block than possible. 111 | cudaGetLastError() can catch SOME errors without synchronizing! 112 | */ 113 | PRINT_RUN_CHECK((CopyVal<<<1, (1<<16)>>>(validDAddress))); 114 | 115 | /* 116 | Launching a kernel with invalid address - error occurs after launch. 117 | cudaGetLastError() alone may miss this without synchronization. 118 | */ 119 | PRINT_RUN_CHECK((CopyVal<<<1, 1>>>(nullptr))); 120 | 121 | // For any kind of error, CUDA also provides a more verbose description. 122 | std::cout << cudaGetErrorName(cudaErrorInvalidPc) << ": " << cudaGetErrorString(cudaErrorInvalidPc) << std::endl; 123 | } 124 | 125 | /* 126 | Exercises: 127 | 1) Write a program that creates many pinned large allocations, and stop when 128 | the first error occurs. What is this error? When and why does it occur? 129 | 2) cudaMemcpy can implicitly synchronize the GPU and CPU, hence its return values 130 | can be used to find any errors of kernels that were launched before it. Demonstrate 131 | this for a simple example where a kernel does something illegal that you discover 132 | using cudaMemcpy. 
133 | 3) Try to produce an exotic error that does not occur already occur in this program 134 | */ 135 | -------------------------------------------------------------------------------- /09_Streams/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../../shared/include/utility.h" 9 | 10 | // A simple kernel function to keep threads busy for a while 11 | __global__ void busy() 12 | { 13 | samplesutil::WasteTime(1'000'000'000ULL); 14 | printf("I'm awake!\n"); 15 | } 16 | 17 | constexpr unsigned int KERNEL_CALLS = 2; 18 | 19 | int main() 20 | { 21 | std::cout << "==== Sample 09 - Streams ====\n" << std::endl; 22 | /* 23 | Expected output: "I'm awake!\n" x 4 x KERNEL_CALLS + 4 24 | 25 | If you watch the output carefully or analyze the execution of 26 | this program with NVIDIA Nsight Systems, it should show that the 27 | first group of kernels run consecutively, while the second and 28 | third group run in parallel. 29 | 30 | Finally, there should be two kernels running sequentially, 31 | followed by two kernels running in parallel. 32 | */ 33 | 34 | std::cout << "Running sequential launches" << std::endl; 35 | // Launch the same kernel several times in a row 36 | for (unsigned int i = 0; i < KERNEL_CALLS; i++) 37 | busy<<<1, 1>>>(); 38 | // Synchronize before continuing to get clear separation in Nsight 39 | cudaDeviceSynchronize(); 40 | 41 | std::cout << "\nRunning launches in streams" << std::endl; 42 | // Allocate one stream for each kernel to be launched 43 | cudaStream_t streams[KERNEL_CALLS]; 44 | for (cudaStream_t& s : streams) 45 | { 46 | // Create stream and launch kernel into it 47 | cudaStreamCreate(&s); 48 | busy<<<1, 1, 0, s>>>(); 49 | } 50 | /* 51 | Destroy all streams. It is fine to do that immediately. Will not 52 | implicitly synchronize, but the GPU will continue running their 53 | jobs until they have all been taken care of 54 | */ 55 | for (cudaStream_t& s : streams) 56 | cudaStreamDestroy(s); 57 | cudaDeviceSynchronize(); 58 | 59 | /* 60 | If we don't specify a stream, then the kernel is launched into the default 61 | stream. Also, many operations like cudaDeviceSynchronize and 62 | cudaStreamSynchronize are submitted to the default stream. Usually, only a 63 | single default stream is defined per application, meaning that if you don't 64 | specify streams, you will not be able to benefit from kernels running 65 | concurrently. Hence, any elaborate CUDA application should be using streams. 66 | 67 | However, if the task can be cleanly separated into CPU threads, there is another 68 | option: using per-thread default streams. Each thread will use its own default 69 | stream if we pass the built-in value cudaStreamPerThread as the stream to use. 70 | Kernels can then run concurrently on the GPU by creating multiple CPU threads. 71 | Alternatively, you may set the compiler option "--default-stream per-thread". 72 | This way, CPU threads will use separate default streams if none are specified. 
73 | */ 74 | std::cout << "\nRunning threads with different default streams" << std::endl; 75 | 76 | // Create mutex, condition variable and kernel counter for communication 77 | std::mutex mutex; 78 | std::condition_variable cv; 79 | unsigned int kernelsLaunched = 0; 80 | // Allocate sufficient number of threads 81 | std::thread threads[KERNEL_CALLS]; 82 | // Create a separate thread for each kernel call (task) 83 | for (std::thread& t : threads) 84 | { 85 | t = std::thread([&mutex, &cv, &kernelsLaunched] { 86 | // Launch kernel to thread's default stream 87 | busy<<<1, 1, 0, cudaStreamPerThread>>>(); 88 | /* 89 | Make sure all kernels are submitted before synchronizing, 90 | because cudaStreamSynchronize goes into the default 0 stream: 91 | busy<1> -> sync<0>(1) -> busy<2> -> sync<0>(2)... may serialize. 92 | busy<1> -> busy<2> -> sync<0>(1) -> sync<0>(2)... parallelizes. 93 | */ 94 | std::unique_lock lock(mutex); 95 | ++kernelsLaunched; 96 | cv.wait(lock, [&kernelsLaunched] { return kernelsLaunched == KERNEL_CALLS; }); 97 | cv.notify_all(); 98 | // Synchronize to wait for printf output 99 | cudaStreamSynchronize(cudaStreamPerThread); 100 | }); 101 | } 102 | // Wait for all threads to finish launching their kernels in individual streams 103 | std::for_each(threads, threads + KERNEL_CALLS, [](std::thread& t) {t.join(); }); 104 | 105 | /* 106 | By default, custom created streams will implicitly synchronize with the 107 | default stream. Consider, e.g., a kernel A running in a custom stream, 108 | followed by a kernel B in the default stream. If we use cudaStreamCreate 109 | as above, then A will end before B starts. Alternatively, we may create 110 | custom streams with the flag cudaStreamNonBlocking. In this case, the 111 | custom stream will not synchronize with the default stream anymore. 112 | */ 113 | cudaStream_t customRegular, customNonblocking; 114 | cudaStreamCreate(&customRegular); 115 | cudaStreamCreateWithFlags(&customNonblocking, cudaStreamNonBlocking); 116 | 117 | auto testAB = [](const char* kind, cudaStream_t stream) { 118 | std::cout << "\nLaunching A (custom) -> B (default) with " << kind << " custom stream" << std::endl; 119 | busy<<<1, 1, 0, stream>>>(); 120 | busy<<<1, 1>>>(); 121 | cudaDeviceSynchronize(); 122 | }; 123 | 124 | testAB("regular", customRegular); 125 | testAB("non-blocking", customNonblocking); 126 | 127 | // Clean up generated streams 128 | cudaStreamDestroy(customRegular); 129 | cudaStreamDestroy(customNonblocking); 130 | 131 | return 0; 132 | } 133 | 134 | /* 135 | Exercises: 136 | 1) Streams are a great way to bring task parallelism to the GPU. Think of a small 137 | program that can benefit from running two different kernels at the same time and 138 | write it, along with documentation of its inputs/outputs and usefulness. 
139 | */ -------------------------------------------------------------------------------- /06_MemoryBasics/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | // Declaration of a device variable in constant memory 6 | __constant__ int cFoo; 7 | 8 | __global__ void ReadConstantMemory() 9 | { 10 | printf("GPU: Reading constant memory --> %x\n", cFoo); 11 | } 12 | 13 | // Definition of a device variable in global memory 14 | __device__ const int dFoo = 42; 15 | 16 | __global__ void ReadGlobalMemory(const int* __restrict dBarPtr) 17 | { 18 | printf("GPU: Reading global memory --> %d %x\n", dFoo, *dBarPtr); 19 | } 20 | 21 | __global__ void WriteGlobalMemory(int* __restrict dOutPtr) 22 | { 23 | *dOutPtr = dFoo * dFoo; 24 | } 25 | 26 | __device__ void WriteAndPrintSharedMemory(int* sFoo) 27 | { 28 | // Write a computed result to shared memory for other threads to see 29 | sFoo[threadIdx.x] = 42 * (threadIdx.x + 1); 30 | // We make sure that no thread prints while the other still writes (parallelism!) 31 | __syncwarp(); 32 | // Print own computed result and result by neighbor 33 | printf("ThreadID: %d, sFoo[0]: %d, sFoo[1]: %d\n", threadIdx.x, sFoo[0], sFoo[1]); 34 | } 35 | 36 | __global__ void WriteAndPrintSharedMemoryFixed() 37 | { 38 | // Fixed allocation of two integers in shared memory 39 | __shared__ int sFoo[2]; 40 | // Use it for efficient exchange of information 41 | WriteAndPrintSharedMemory(sFoo); 42 | } 43 | 44 | __global__ void WriteAndPrintSharedMemoryDynamic() 45 | { 46 | // Use dynamically allocated shared memory 47 | extern __shared__ int sFoo[]; 48 | // Use it for efficient exchange of information 49 | WriteAndPrintSharedMemory(sFoo); 50 | } 51 | 52 | int main() 53 | { 54 | std::cout << "==== Sample 06 - Memory Basics ====\n" << std::endl; 55 | /* 56 | Expected output: 57 | GPU: Reading constant memory --> caffe 58 | GPU: Reading global memory --> 42 caffe 59 | CPU: Copied back from GPU --> 1764 60 | 61 | Using static shared memory to share computed results 62 | ThreadID: 0, sFoo[0]: 42, sFoo[1]: 84 63 | ThreadID: 1, sFoo[0]: 42, sFoo[1]: 84 64 | 65 | Using dynamic shared memory to share computed results 66 | ThreadID: 0, sFoo[0]: 42, sFoo[1]: 84 67 | ThreadID: 1, sFoo[0]: 42, sFoo[1]: 84 68 | */ 69 | 70 | const int bar = 0xcaffe; 71 | /* 72 | Uniform variables should best be placed in constant 73 | GPU memory. Can be updated with cudaMemcpyToSymbol. 74 | This syntax is unusual, but this is how it should be 75 | */ 76 | cudaMemcpyToSymbol(cFoo, &bar, sizeof(int)); 77 | ReadConstantMemory<<<1, 1>>>(); 78 | cudaDeviceSynchronize(); 79 | 80 | /* 81 | Larger or read-write data is easiest provisioned by 82 | global memory. Can be allocated with cudaMalloc and 83 | updated with cudaMemcpy. Must be free'd afterward. 84 | */ 85 | int* dBarPtr; 86 | cudaMalloc((void**)&dBarPtr, sizeof(int)); 87 | cudaMemcpy(dBarPtr, &bar, sizeof(int), cudaMemcpyHostToDevice); 88 | ReadGlobalMemory<<<1, 1>>>(dBarPtr); 89 | cudaDeviceSynchronize(); 90 | cudaFree(dBarPtr); 91 | 92 | /* 93 | The CPU may also read back updates from the GPU by 94 | copying the relevant data from global memory after 95 | running the kernel. Notice that here, we do not use 96 | cudaDeviceSynchronize: cudaMemcpy will synchronize 97 | with the CPU automatically. 
98 | */ 99 | int out, *dOutPtr; 100 | cudaMalloc((void**)&dOutPtr, sizeof(int)); 101 | WriteGlobalMemory<<<1,1>>>(dOutPtr); 102 | cudaMemcpy(&out, dOutPtr, sizeof(int), cudaMemcpyDeviceToHost); 103 | cudaFree(dOutPtr); 104 | std::cout << "CPU: Copied back from GPU --> " << out << std::endl; 105 | 106 | /* 107 | For information that is shared only within a single threadblock, 108 | we can also use shared memory, which is usually more efficient than 109 | global memory. Shared memory for a block may be statically allocated 110 | inside the kernel, or dynamically allocated at the kernel launch. In 111 | the latter case, the size of the required shared memory is provided as 112 | the third launch parameter, and the kernel will be able to access the 113 | allocated shared memory via an array with the "extern" decoration. 114 | Below, we use both methods to provide shared memory for a kernel with 115 | two threads that exchange computed integers. 116 | */ 117 | std::cout << "\nUsing static shared memory to share computed results" << std::endl; 118 | WriteAndPrintSharedMemoryFixed<<<1, 2>>>(); 119 | cudaDeviceSynchronize(); 120 | 121 | std::cout << "\nUsing dynamic shared memory to share computed results" << std::endl; 122 | WriteAndPrintSharedMemoryDynamic<<<1, 2, 2 * sizeof(int)>>>(); 123 | cudaDeviceSynchronize(); 124 | 125 | return 0; 126 | } 127 | 128 | /* 129 | Exercises: 130 | 1) Write a function that takes data from constant memory and writes it to global. 131 | Copy it back from the GPU and print on the CPU. 132 | 2) Combine allocation, memcpy and several kernels in succession to produce a 133 | more complex result. E.g., kernel A adds global values X + Y and writes the 134 | result back to global, kernel B multiplies that result by Z. Convince yourself 135 | that the results remain in global memory between kernel launches and that a 136 | kernel or a memcpy that runs after an earlier kernel can safely access the global 137 | data that it produced, even if you don't use cudaDeviceSynchronize inbetween. 138 | 3) Try to write a kernel where one thread writes a value to shared memory without 139 | a syncwarp, so that other threads may fail to see it. You might need a block 140 | size larger than 32 threads for this to happen and you may have to let the writing 141 | thread do some "fake" work to delay its write to shared memory. Or it may work 142 | immediately :) A solution should be provided by the following code sample. 143 | */ -------------------------------------------------------------------------------- /10_Events/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "../../shared/include/utility.h" 6 | 7 | // A kernel that wastes some time 8 | __global__ void SlowKernel() 9 | { 10 | samplesutil::WasteTime(1'000'000'000ULL); 11 | } 12 | 13 | __device__ int dFoo; 14 | 15 | // A kernel that only sets dFoo 16 | __global__ void SetFoo(int foo) 17 | { 18 | dFoo = foo; 19 | } 20 | 21 | // A kernel that prints dFoo 22 | __global__ void PrintFoo() 23 | { 24 | printf("foo: %d\n", dFoo); 25 | } 26 | 27 | int main() 28 | { 29 | std::cout << "==== Sample 10 - Events ====\n" << std::endl; 30 | /* 31 | Using events to measure time and communicate across streams. 32 | 33 | Expected output: 34 | 1) Unrealistically short time with chrono measurements without syncing, 35 | similar times for chrono with syncing and when using CUDA events. 
36 | 2) foo: 42 37 | */ 38 | using namespace std::chrono_literals; 39 | using namespace std::chrono; 40 | 41 | // Create CUDA events 42 | cudaEvent_t start, end; 43 | cudaEventCreate(&start); 44 | cudaEventCreate(&end); 45 | 46 | // Synchronize GPU with CPU to capture adequate time 47 | cudaDeviceSynchronize(); 48 | auto before = std::chrono::system_clock::now(); 49 | 50 | // Record start directly before first relevant GPU command 51 | cudaEventRecord(start); 52 | // Launch a light-weight GPU kernel and heavy GPU kernel 53 | SetFoo<<<1,1>>>(0); 54 | SlowKernel<<<1,1>>>(); 55 | // Record end directly after last relevant GPU command 56 | cudaEventRecord(end); 57 | // Also measure CPU time after last GPU command, without synching 58 | auto afterNoSync = std::chrono::system_clock::now(); 59 | 60 | // Synchronize CPU and GPU 61 | cudaDeviceSynchronize(); 62 | // Measure CPU time after last GPU command, with synching 63 | auto afterSync = std::chrono::system_clock::now(); 64 | 65 | // Print measured CPU time without synchronization 66 | float msCPUNoSync = 1000.f * duration_cast<duration<float>>(afterNoSync - before).count(); 67 | std::cout << "Measured time (chrono, no sync): " << msCPUNoSync << "ms\n"; 68 | 69 | // Print measured CPU time with synchronization 70 | float msCPUSync = 1000.f * duration_cast<duration<float>>(afterSync - before).count(); 71 | std::cout << "Measured time (chrono, sync): " << msCPUSync << "ms\n"; 72 | 73 | // Print GPU time measured with CUDA events 74 | float msGPU; 75 | cudaEventElapsedTime(&msGPU, start, end); 76 | std::cout << "Measured time (CUDA events): " << msGPU << "ms\n"; 77 | 78 | /* 79 | The difference between the two methods, CPU timing and events, is 80 | important when writing more complex projects: kernels are being 81 | launched asynchronously. The launch returns immediately so the CPU 82 | can progress with other jobs. This means that to get a proper timing, 83 | we always have to synchronize CPU and GPU before measuring current time 84 | with chrono. With CUDA events, we can insert them into streams before 85 | and after the actions we want to measure. We can have multiple events 86 | inserted at many different points. We still have to synchronize, but 87 | only when we eventually want to ACCESS the measurements on the CPU 88 | (e.g., once for all timings at the end of a frame to get a report). 89 | 90 | Make sure that you don't try to measure parts of your program with 91 | events that mix GPU and CPU code. Events for start and end should 92 | only enclose code portions with GPU tasks. Otherwise you won't be 93 | sure what you are measuring and might get non-reproducible results! 94 | */ 95 | 96 | //Clean up events 97 | cudaEventDestroy(start); 98 | cudaEventDestroy(end); 99 | 100 | /* 101 | Dependencies across streams: 102 | 103 | Events may also be used to introduce dependencies 104 | across streams. One stream may compute an important 105 | piece of information that another should use. This 106 | dependency can be modelled by recording an event in 107 | one stream and have the target stream wait on this 108 | event. Commands launched to the stream will not 109 | continue until the event is observed. 
110 | */ 111 | 112 | // Create a new event to signal that data is ready 113 | cudaEvent_t fooReady; 114 | cudaEventCreate(&fooReady); 115 | 116 | // Create two streams, one producer, one consumer 117 | cudaStream_t producer, consumer; 118 | cudaStreamCreate(&producer); 119 | cudaStreamCreate(&consumer); 120 | 121 | /* 122 | Enforce the following behavior for producer/consumer streams: 123 | 124 | Producer Consumer 125 | | . 126 | slow kernel . 127 | | . 128 | sets foo . 129 | \____________. 130 | | 131 | print foo 132 | */ 133 | 134 | // Producer stream simulates some hard work 135 | SlowKernel<<<1, 1, 0, producer>>>(); 136 | // Producer sets foo to an important value 137 | SetFoo<<<1, 1, 0, producer>>>(42); 138 | // Producer notifies consumer stream that foo is ready 139 | cudaEventRecord(fooReady, producer); 140 | 141 | // Consumer waits for ready event 142 | cudaStreamWaitEvent(consumer, fooReady); 143 | // Without waiting, consumer MAY print before foo is ready! 144 | PrintFoo<<<1, 1, 0, consumer>>>(); 145 | 146 | // Wait for printf outputs 147 | cudaDeviceSynchronize(); 148 | 149 | return 0; 150 | } 151 | 152 | /* 153 | Exercises: 154 | 1) Write a simple function that calls several different kernels, each of which 155 | should do a bit of work. Use multiple events and just one cuda...Synchronize, 156 | and report in % how much each kernel contributes to the total procedure run time. 157 | */ -------------------------------------------------------------------------------- /11_PinnedMemory/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "../../shared/include/utility.h" 8 | 9 | // Simulate a complex task, but actually only compute a square 10 | __global__ void PerformComplexTask(float input, float* __restrict result) 11 | { 12 | samplesutil::WasteTime(100'000'000ULL); 13 | *result = input * input; 14 | } 15 | 16 | int main() 17 | { 18 | std::cout << "==== Sample 11 - Pinned Memory ====\n" << std::endl; 19 | /* 20 | Pinned memory becomes relevant once we start using streams 21 | and memory transfer enters the mix. The default memcpy operation 22 | cudaMemcpy is, by default synchronous, i.e., when it is called, 23 | the CPU will stall until the memcpy has finished. However, in 24 | many cases we don't want this. Consider the example below, 25 | where we use several streams to compute some expensive result 26 | from different inputs. For each 'task', we launch a kernel to 27 | a separate stream, followed by a memcpy of the result back to 28 | the GPU. 29 | 30 | Ideally, we would like the memory transfers to overlap with kernels 31 | that run in different streams. But if we use cudaMemcpy, the kernel 32 | calls will execute sequentially, because each cudaMemcpy implicitly 33 | synchronizes the default stream with the CPU, and all basic streams 34 | are synchronized with the default stream. However, there is a different 35 | memory transfer function of the name cudaMemcpyAsync, which also takes 36 | an additional stream parameter in which to run. However, using this 37 | function alone is not enough to overlap memory transfer and kernels. 38 | To perform asynchronous memcpy between the device and the host, CUDA 39 | must be sure that the host memory is available in main memory. We 40 | can guarantee this by allocating memory with cudaMallocHost. This is 41 | so-called "pinned" memory, which may never be moved or swapped out. 
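    As a quick sketch of that pattern (illustrative only; dDevicePtr and bytes are
    placeholders, error checking omitted), the pinned route looks like this:

        float* hPinned;                                   // page-locked host buffer
        cudaMallocHost((void**)&hPinned, bytes);          // instead of malloc / new
        cudaStream_t stream;
        cudaStreamCreate(&stream);
        cudaMemcpyAsync(hPinned, dDevicePtr, bytes,       // returns immediately,
                        cudaMemcpyDeviceToHost, stream);  // copy runs in 'stream'
        cudaStreamSynchronize(stream);                    // wait only when the data is needed
        cudaStreamDestroy(stream);
        cudaFreeHost(hPinned);                            // pinned memory has its own free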
42 | If we use pinned memory and cudaMemcpyAsync, then copies and kernels 43 | that run in different streams are free to overlap. 44 | 45 | Expected output: slow performance for all combinations that are not 46 | pinned memory and asynchronous copy, due to implicit synchronization 47 | preventing concurrent execution of kernels. 48 | */ 49 | 50 | constexpr unsigned int TASKS = 4; 51 | 52 | // Allocate result values for GPU to write to 53 | float* dResultsPtr; 54 | cudaMalloc((void**)&dResultsPtr, sizeof(float) * TASKS); 55 | 56 | // Generate necessary streams and events 57 | cudaStream_t streams[TASKS]; 58 | cudaEvent_t events[TASKS]; 59 | for (int i = 0; i < TASKS; i++) 60 | { 61 | cudaStreamCreate(&streams[i]); 62 | cudaEventCreate(&events[i]); 63 | } 64 | 65 | // Two CPU-side memory ranges: one regular and one pinned 66 | float results[TASKS], * results_pinned; 67 | cudaMallocHost((void**)&results_pinned, sizeof(float) * TASKS); 68 | 69 | // We run the tasks with regular/async memcpy 70 | enum class CPYTYPE { MEMCPY, MEMCPYASYNC }; 71 | // We run the tasks with regular/pinned memory 72 | enum class MEMTYPE { REGULAR, PINNED}; 73 | 74 | for (auto cpy : { CPYTYPE::MEMCPY, CPYTYPE::MEMCPYASYNC }) 75 | { 76 | for (auto mem : { MEMTYPE::REGULAR, MEMTYPE::PINNED }) 77 | { 78 | float* dst = (mem == MEMTYPE::PINNED ? results_pinned : results); 79 | 80 | std::cout << "Performing tasks with " << (mem == MEMTYPE::PINNED ? "pinned memory" : "regular memory"); 81 | std::cout << " and " << (cpy == CPYTYPE::MEMCPYASYNC ? "asynchronous" : "regular") << " copy" << std::endl; 82 | 83 | // Reset GPU result 84 | cudaMemset(dResultsPtr, 0, sizeof(float) * TASKS); 85 | 86 | // Synchronize to get adequate CPU time measurements 87 | cudaDeviceSynchronize(); 88 | const auto before = std::chrono::system_clock::now(); 89 | 90 | for (int i = 0; i < TASKS; i++) 91 | { 92 | // Unnecessarily slow kernel 93 | PerformComplexTask<<<1, 1, 0, streams[i]>>>(i+1, dResultsPtr+i); 94 | // Use either regular or asynchronous copy for reading back results 95 | if (cpy == CPYTYPE::MEMCPYASYNC) 96 | cudaMemcpyAsync(&dst[i], dResultsPtr+i, sizeof(float), cudaMemcpyDeviceToHost, streams[i]); 97 | else 98 | cudaMemcpy(&dst[i], dResultsPtr + i, sizeof(float), cudaMemcpyDeviceToHost); 99 | } 100 | 101 | // Wait for results being copied back 102 | for (int i = 0; i < TASKS; i++) 103 | { 104 | // Wait for the current stream 105 | cudaStreamSynchronize(streams[i]); 106 | 107 | // Evaluate result and print 108 | if (dst[i] != (i + 1) * (i + 1)) 109 | std::cout << "Task failed or CPU received wrong value!" << std::endl; 110 | else 111 | std::cout << "Finished task " << i << ", produced output: " << results[i] << std::endl; 112 | } 113 | 114 | const auto after = std::chrono::system_clock::now(); 115 | std::cout << "Time: " << std::chrono::duration_cast>(after-before).count() << "s\n\n"; 116 | } 117 | } 118 | 119 | // Clean up streams 120 | for (cudaStream_t& s : streams) 121 | cudaStreamDestroy(s); 122 | 123 | // Pinned memory should be freed with cudaFreeHost 124 | cudaFreeHost(results_pinned); 125 | } 126 | 127 | /* 128 | Exercises: 129 | 1) Create a sketch of this experiment to illustrate what is happening. 130 | For each of the configurations, try to establish the timeline of what happens on 131 | CPU/GPU and why you get the results you do. If you can, feel free to use NVIDIA 132 | Nsight Systems, which should make this very easy. 
133 | 2) Write a simple program that just compares the performance of copying pinned vs 134 | non-pinned back and forth with cudaMemcpy in a single stream a couple of times. 135 | How does the performance difference develop as you change the size of the copy? 136 | 3) Try to make the streams non-blocking using 137 | cudaCreateStreamWithFlags(..., cudaStreamNonBlocking)! In this case, you 138 | can expect wrong results for cudaMemcpy: Non-blocking means the default 139 | stream won't wait for the custom streams running the kernels to finish 140 | before it starts copying. Can you draw a sketch of what is going wrong? 141 | */ -------------------------------------------------------------------------------- /15_GraphAPI/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "../../shared/include/utility.h" 6 | 7 | // A simple kernel function to keep threads busy for a while 8 | __global__ void busy() 9 | { 10 | samplesutil::WasteTime(1'000'000'000ULL); 11 | printf("I'm awake!\n"); 12 | } 13 | 14 | void runTasksSequentially(unsigned int numTasks) 15 | { 16 | // We use cudaStreamPerThread here. It makes no difference 17 | // for the program flow because we are only single-threaded 18 | // anyway, but capturing the application-wide default stream 19 | // cudaStreamLegacy is not permitted by the graph API. 20 | 21 | for (int i = 0; i < numTasks; i++) 22 | busy << <1, 1, 0, cudaStreamPerThread >> > (); 23 | } 24 | 25 | void runTasksWithStreams(unsigned int numTasks) 26 | { 27 | /* 28 | This stream-based function can be directly captured with graph API. 29 | Events are used to encode dependencies / start / end of capture. 30 | */ 31 | std::vector streams(numTasks); 32 | std::vector finished(numTasks); 33 | for (int i = 0; i < numTasks; i++) 34 | { 35 | cudaStreamCreate(&streams[i]); 36 | cudaEventCreate(&finished[i]); 37 | } 38 | // We need an additional event to represent the capture start 39 | cudaEvent_t start; 40 | cudaEventCreate(&start); 41 | // Immediately record the starting event so other streams can connect to it 42 | cudaEventRecord(start, cudaStreamPerThread); 43 | // All other streams must connect to origin stream via event to get captured 44 | for (int i = 0; i < numTasks; i++) 45 | { 46 | // Establish dependency / connection to origin (is now included in capture) 47 | cudaStreamWaitEvent(streams[i], start); 48 | // Run actual task (kernel) in stream 49 | busy << <1, 1, 0, streams[i] >> > (); 50 | // Record end event of this stream so origin can wait on it 51 | cudaEventRecord(finished[i], streams[i]); 52 | } 53 | // Origin stream waits until all custom streams have finished their task 54 | for (int i = 0; i < numTasks; i++) 55 | { 56 | cudaStreamWaitEvent(cudaStreamPerThread, finished[i]); 57 | cudaStreamDestroy(streams[i]); 58 | } 59 | } 60 | 61 | template 62 | cudaGraphExec_t recordGraphFromFunction(const T& func, P ...params) 63 | { 64 | // Create a graph for recording GPU commands 65 | cudaGraph_t graph; 66 | cudaGraphCreate(&graph, 0); 67 | 68 | // Record a graph, assuming that functions start from thread's default stream 69 | cudaStreamBeginCapture(cudaStreamPerThread, cudaStreamCaptureModeGlobal); 70 | func(params...); 71 | cudaStreamEndCapture(cudaStreamPerThread, &graph); 72 | 73 | // Turn the recorded graph into an executable instance 74 | cudaGraphExec_t instance; 75 | cudaGraphInstantiate(&instance, graph, nullptr, nullptr, 0); 76 | // The recorded graph is no longer 
needed 77 | cudaGraphDestroy(graph); 78 | return instance; 79 | } 80 | 81 | cudaGraphExec_t buildGraphForParallelTasks(unsigned int numTasks) 82 | { 83 | // Set up a graph from scratch 84 | cudaGraph_t graph; 85 | cudaGraphCreate(&graph, 0); 86 | 87 | // Create a node for each kernel in the graph, with grid config and parameters 88 | std::vector nodes(numTasks); 89 | cudaKernelNodeParams params = { reinterpret_cast(busy), {1,1,1}, {1,1,1}, 0, nullptr, nullptr }; 90 | 91 | // Add them to the graph. This simple setup has no dependencies, passing nullptr 92 | for (int i = 0; i < numTasks; i++) 93 | cudaGraphAddKernelNode(&nodes[i], graph, nullptr, 0, ¶ms); 94 | 95 | // Create executable graph, destroy manually built graph 96 | cudaGraphExec_t instance; 97 | cudaGraphInstantiate(&instance, graph, 0, 0, 0); 98 | cudaGraphDestroy(graph); 99 | return instance; 100 | } 101 | 102 | int main() 103 | { 104 | std::cout << "==== Sample 15 - Graph API ====\n" << std::endl; 105 | /* 106 | The graph API enables the creation of well-defined structures that 107 | encode the types, parameters and dependencies of instructions that 108 | the GPU should process. By preparing this information, developers can 109 | decouple the definition and execution of the parallel workload. The 110 | driver is then free to optimize its execution. Graphs may be created 111 | either by setting up graphs manually from scratch or by recording 112 | already available code, which may occasionally require modifications: 113 | CUDA graphs usually do not include synchronization methods with the 114 | CPU. This means that waiting actions (e.g., until all streams have 115 | finished) must be modelled via dependencies/events instead. 116 | 117 | Expected output: 5 x TASKS "I'm awake\n", first two groups launching 118 | sequentially, the last three groups running concurrently. 119 | */ 120 | 121 | constexpr int TASKS = 4; 122 | 123 | std::cout << "Launching multiple tasks sequentially" << std::endl; 124 | // Launching multiple tasks as kernels one after the other 125 | runTasksSequentially(TASKS); 126 | cudaDeviceSynchronize(); 127 | 128 | std::cout << "Running recorded graph from existing sequential code" << std::endl; 129 | // Recording a graph from the existing sequential code and launching its instance 130 | cudaGraphExec_t recordedSequential = recordGraphFromFunction(runTasksSequentially, TASKS); 131 | cudaGraphLaunch(recordedSequential, 0); 132 | cudaDeviceSynchronize(); 133 | 134 | std::cout << "Launching multiple tasks with streams" << std::endl; 135 | // Launching multiple tasks in multiple streams 136 | runTasksWithStreams(TASKS); 137 | cudaDeviceSynchronize(); 138 | 139 | std::cout << "Running recorded graph from existing stream-based code" << std::endl; 140 | // Recording a graph from the existing stream-based code, launching instance 141 | cudaGraphExec_t recordedStreams = recordGraphFromFunction(runTasksWithStreams, TASKS); 142 | cudaGraphLaunch(recordedStreams, 0); 143 | cudaDeviceSynchronize(); 144 | 145 | std::cout << "Running manually-built graph that behaves like streams" << std::endl; 146 | // Example for building a scratch manually without recording 147 | cudaGraphExec_t instanceBuilt = buildGraphForParallelTasks(TASKS); 148 | cudaGraphLaunch(instanceBuilt, 0); 149 | cudaDeviceSynchronize(); 150 | 151 | return 0; 152 | } 153 | 154 | /* 155 | Exercises: 156 | 1) Manually build and instantiate a working CUDA graph that includes a host-side 157 | function node and demonstrate what it does. 
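     A possible starting point for 1) — only a sketch, the callback name and its
     payload are made up, error checks omitted. Note that the host callback must
     not itself call CUDA API functions:

         void CUDART_CB hostPrint(void* userData)
         {
             printf("Host node says: %s\n", (const char*)userData);
         }

         cudaGraphExec_t buildGraphWithHostNode()
         {
             cudaGraph_t graph;
             cudaGraphCreate(&graph, 0);

             static const char msg[] = "hello from the CPU";  // static: outlives the builder
             cudaHostNodeParams hostParams = {};
             hostParams.fn = hostPrint;                        // CPU function to execute
             hostParams.userData = (void*)msg;                 // payload handed to it

             cudaGraphNode_t hostNode;
             cudaGraphAddHostNode(&hostNode, graph, nullptr, 0, &hostParams);

             cudaGraphExec_t instance;
             cudaGraphInstantiate(&instance, graph, nullptr, nullptr, 0);
             cudaGraphDestroy(graph);
             return instance;                                  // run with cudaGraphLaunch + sync
         }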
158 | 2) Manually build and instantiate a working graph that includes a different kind 159 | of node that does not yet occur in this application or in exercise 1). 160 | 3) Given a particular CUDA graph, create a simple GraphViz graph in DOT format of its 161 | structure. Use your solution to sketch out the graphs created by this source code. 162 | */ -------------------------------------------------------------------------------- /13_MemoryFences/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "../../shared/include/utility.h" 4 | 5 | /* 6 | Producer function. 7 | 8 | Following a threadfence (memory barrier) with a volatile yields a release pattern. 9 | Following a threadfence (memory barrier) with an atomic yields a release pattern. 10 | Note however, that neither of these options is ideal. For one, we combine two 11 | operations to achieve a certain behavior. Second, threadfence is a general memory 12 | barrier, and is thus stronger than it may have to be (e.g., release barrier only). 13 | Volta+ actually have support for memory coherency models with proper acquire / 14 | release semantics, which are exposed to the programmer via cuda::std::atomic in 15 | libcu++. 16 | */ 17 | template 18 | __device__ void ProduceFoo(unsigned int id, float* dFooPtr, int *dFooReadyPtr) 19 | { 20 | float pi = samplesutil::GregoryLeibniz(10'000'000); 21 | dFooPtr[id] = pi; 22 | 23 | __threadfence(); 24 | 25 | if (ATOMIC) 26 | atomicExch(&dFooReadyPtr[id], 1); 27 | else 28 | *((volatile int*)&dFooReadyPtr[id]) = 1; 29 | } 30 | 31 | /* 32 | Consumer function. 33 | 34 | Preceding a threadfence (memory barrier) with a volatile yields an acquire pattern. 35 | Preceding a threadfence (memory barrier) with an atomic yields an acquire pattern. 36 | Note however, that neither of these options is ideal. For one, we combine two 37 | operations to achieve a certain behavior. Second, threadfence is a general memory 38 | barrier, and is thus stronger than it may have to be (e.g., acquire barrier only). 39 | Volta+ actually have support for memory coherency models with proper acquire / 40 | release semantics, which are exposed to the programmer via cuda::std::atomic in 41 | libcu++. 42 | */ 43 | template 44 | __device__ void ConsumeFoo(unsigned int id, const float* dFooPtr, int* dFooReadyPtr) 45 | { 46 | if (ATOMIC) 47 | while (atomicAdd(&dFooReadyPtr[id], 0) == 0); 48 | else 49 | while (*((volatile int*)&dFooReadyPtr[id]) == 0); 50 | 51 | __threadfence(); 52 | 53 | printf("Consumer %d thinks Pi is: %f\n", id, dFooPtr[id]); 54 | } 55 | 56 | // Launch either version of a safe producer / consumer scenarios 57 | template 58 | __global__ void ProducerConsumer(float* dFooPtr, int* dFooReadyPtr) 59 | { 60 | int id = (blockIdx.x * blockDim.x + threadIdx.x); 61 | 62 | if (id < N) 63 | ProduceFoo(id, dFooPtr, dFooReadyPtr); 64 | else 65 | ConsumeFoo(id - N, dFooPtr, dFooReadyPtr); 66 | } 67 | 68 | /* 69 | As we have seen before, although we didnt explicilty mention it, 70 | using a syncthreads inside a block is sufficient to make sure that 71 | the other threads can observe the data that was previously written 72 | by another thread in the block. Here we illustrate this again, 73 | with a simple, safe producer / consumer setup, where syncthreads 74 | ensures ordering of operations and visibility of the data for all 75 | threads in the block. 
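    As an aside: on Volta and newer GPUs the same handshake can be expressed with
    libcu++ instead of __threadfence. The following is only a sketch (it assumes
    #include <cuda/atomic> and compute capability 7.0+), not code used by this sample:

        __device__ cuda::atomic<int, cuda::thread_scope_device> dReady{0};
        __device__ float dData;

        __global__ void ProduceRelease(float value)
        {
            dData = value;                                             // plain payload store
            dReady.store(1, cuda::std::memory_order_release);          // payload visible before flag
        }

        __global__ void ConsumeAcquire()
        {
            while (dReady.load(cuda::std::memory_order_acquire) == 0); // spin on the flag
            printf("Consumer sees: %f\n", dData);                      // guaranteed to see the payload
        }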
76 | */ 77 | __global__ void ProducerConsumerShared() 78 | { 79 | extern __shared__ float sFoo[]; 80 | 81 | if (threadIdx.x < blockDim.x/2) 82 | { 83 | float pi = samplesutil::GregoryLeibniz(10'000'000); 84 | sFoo[threadIdx.x] = pi; 85 | } 86 | // Synchronize threads in block AND ensure memory access ordering among them 87 | __syncthreads(); 88 | if (threadIdx.x >= blockDim.x / 2) 89 | { 90 | int cId = threadIdx.x - blockDim.x / 2; 91 | printf("Comsumer %d thinks Pi is %f\n", cId, sFoo[cId]); 92 | } 93 | } 94 | 95 | int main() 96 | { 97 | std::cout << "==== Sample 13 - Memory Fences ====\n" << std::endl; 98 | /* 99 | So far, we have ignored the problem of memory fencing, which 100 | is relevant in multi-threaded applications. We can exchange 101 | information securely via atomic variables, however when we store 102 | data in bulk or need to ensure a particular ordering of observed 103 | events, for instance in a producer/consumer scenario, we need clear 104 | orderings of data accesses that are definite for all involved threads. 105 | For threads within a block, this is trivially achieved by using 106 | syncthreads. For establishing orderings across blocks, CUDA offers 107 | the __threadfence operation. This can be necessary, because the default 108 | atomicXXX operations of CUDA only give us "RELAXED" semantics, i.e., 109 | they have no synchronization effect on other memory. However, combining 110 | a thread fence with relaxed atomics can---much like in C++11---give us 111 | acquire / release semantics. 112 | 113 | At its core, threadfence is a general memory barrier, which makes sure 114 | that all writes below it occur after all writes above it, and that all 115 | reads below it occur after all reads above it. However, there are some 116 | intricacies that make the safe use of threadfence a little tricky. 117 | Understanding all possible scenarios is complex task, and may not be worth 118 | the effort, since modern CUDA offers better alternatives (see material and 119 | samples for CUDA standard library). A basic recipe for safely using 120 | __threadfence is as part of a release-acquire pattern. The PTX ISA states 121 | that a __threadfence, followed by an atomic or volatile memory operation, 122 | yields a release pattern, while a __threadfence preceding an atomic or 123 | volatile memory operation yields an acquire pattern. With these patterns, 124 | we can for instance solve the producer / consumer scenario by using flags 125 | that indicate when data is ready, and securing access to them with proper 126 | acquire / release behavior. 127 | 128 | Expected output: 129 | 130 | Producer / consumer pair in same block 131 | Comsumer 15 thinks Pi is 3.141597 132 | Comsumer 16 thinks Pi is 3.141597 133 | Comsumer 0 thinks Pi is 3.141597 134 | ... 135 | (or similar) 136 | 137 | Producer / consumer pair with volatile + threadfence 138 | Consumer 4 thinks Pi is: 3.141597 139 | ... 140 | (or similar) 141 | 142 | Producer / consumer pair with volatile + atomic 143 | Consumer 4 thinks Pi is: 3.141597 144 | ... 
145 | (or similar) 146 | */ 147 | 148 | constexpr unsigned int N = 8; 149 | constexpr unsigned int blockSize = 4; 150 | 151 | // Compute how many producer / consumer blocks should be launched 152 | unsigned int numBlocks = N / blockSize; 153 | 154 | // Run producer / consumer scenario inside a single block (simple) 155 | std::cout << "\nProducer / consumer pair in same block" << std::endl; 156 | ProducerConsumerShared<<<1, 34, 34 * sizeof(float)>>>(); 157 | cudaDeviceSynchronize(); 158 | 159 | // Allocate and initialize mmeory for global producer / consumer scenario 160 | float* dFooPtr; 161 | int* dFooReadyPtr; 162 | cudaMalloc((void**)&dFooPtr, sizeof(float) * N); 163 | cudaMalloc((void**)&dFooReadyPtr, sizeof(int) * N); 164 | cudaMemset(dFooPtr, 0, sizeof(float) * N); 165 | cudaMemset(dFooReadyPtr, 0, sizeof(int) * N); 166 | 167 | // Producer / consumer scenario across blocks in global memory, using volatile + threadfence 168 | std::cout << "\nProducer / consumer pair with volatile + threadfence" << std::endl; 169 | ProducerConsumer<<>>(dFooPtr, dFooReadyPtr); 170 | cudaDeviceSynchronize(); 171 | 172 | // Producer / consumer scenario across blocks in global memory, using atomic + threadfence 173 | std::cout << "\nProducer / consumer pair with atomic + threadfence" << std::endl; 174 | ProducerConsumer<<>>(dFooPtr, dFooReadyPtr); 175 | cudaDeviceSynchronize(); 176 | 177 | return 0; 178 | } 179 | 180 | /* 181 | Exercises: 182 | 1) TRY to write a program where one thread reliably observes writes in the WRONG 183 | order, due to lack of threadfence (e.g., in your code one thread sets A from 0 184 | to 1, followed by setting B from 0 to 1, but another thread observes A = 0, B = 1, 185 | or something similar). To do this, you may want to make sure those threads run in 186 | different blocks, preferably even on different SMs, and communicate via global 187 | memory, try atomics and volatiles. If you can't manage, document your best attempt. 188 | */ -------------------------------------------------------------------------------- /17_CooperativeGroups/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "../../shared/include/utility.h" 8 | 9 | // Shortening cooperative groups namespace for convenience 10 | namespace cg = cooperative_groups; 11 | 12 | // We keep the result of the reduction in managed memory 13 | __managed__ float mResult; 14 | 15 | template 16 | __global__ void reduceGroups(const float* __restrict input, int N) 17 | { 18 | // Can conveniently obtain groups for grid and block 19 | auto block = cg::this_thread_block(); 20 | unsigned int gId = cg::this_grid().thread_rank(); 21 | 22 | __shared__ float data[BLOCK_SIZE]; 23 | data[block.thread_rank()] = (gId < N ? 
(input[gId] + input[gId + N / 2]) : 0); 24 | 25 | for (int s = blockDim.x / 2; s > 16; s /= 2) 26 | { 27 | // Rather than selecting explicit sync functions, groups offer sync() 28 | block.sync(); 29 | if (block.thread_rank() < s) 30 | data[block.thread_rank()] += data[block.thread_rank() + s]; 31 | } 32 | 33 | // Splitting blocks into warp groups is cleaner than checking threadIdx 34 | auto warp = cg::tiled_partition<32>(block); 35 | if (warp.meta_group_rank() == 0) 36 | { 37 | // Reduction primitives - will be hardware-accelerated on CC 8.0+ 38 | float v = cg::reduce(warp, data[warp.thread_rank()], cg::plus()); 39 | if (warp.thread_rank() == 0) 40 | atomicAdd(&mResult, v); 41 | } 42 | } 43 | 44 | void ReduceWithGroups() 45 | { 46 | constexpr unsigned int BLOCK_SIZE = 256, N = 1'000'000; 47 | 48 | std::cout << "Producing random inputs..." << std::endl; 49 | // Generate some random numbers to reduce 50 | std::vector vals; 51 | float* dValsPtr; 52 | samplesutil::prepareRandomNumbersCPUGPU(N, vals, &dValsPtr); 53 | // Prepare grid configuration for input and used reduction technique 54 | const dim3 blockDim = { BLOCK_SIZE, 1, 1 }; 55 | const dim3 gridDim = { (N / 2 + BLOCK_SIZE) / BLOCK_SIZE, 1, 1 }; 56 | 57 | // Events for measuring run time 58 | cudaEvent_t start, end; 59 | cudaEventCreate(&start); 60 | cudaEventCreate(&end); 61 | 62 | // Setting managed result variable 63 | mResult = 0; 64 | cudaEventRecord(start); 65 | reduceGroups<<>>(dValsPtr, N); 66 | cudaEventRecord(end); 67 | 68 | float ms; 69 | // Synchronizing to event. Event is last, same effect as cudaDeviceSynchronize 70 | cudaEventSynchronize(end); 71 | cudaEventElapsedTime(&ms, start, end); 72 | std::cout << std::setw(20) << "Reduce Groups" << "\t" << ms << "ms \t" << mResult << "\n\n"; 73 | } 74 | 75 | __managed__ unsigned int mHappyNumSum; 76 | __managed__ unsigned int mHappyNumCount; 77 | 78 | __global__ void happyNumbersGroups(unsigned int start, unsigned int N, unsigned int* mHappyNumbers) 79 | { 80 | // Retrieve the input number based on the thread's global id 81 | unsigned int input = cg::this_grid().thread_rank() + start; 82 | // Compute whether or not the input number is in range and "happy" (utility function) 83 | bool happy = ((input-start) < N) && samplesutil::isHappy(input); 84 | // Create a group for the current warp 85 | auto warp = cg::tiled_partition<32>(cg::this_thread_block()); 86 | // Create a child group to separate threads with happy and unhappy numbers 87 | auto g = cg::binary_partition(warp, happy); 88 | 89 | if (happy) 90 | { 91 | // Compute the group's partial result of the sum of happy numbers 92 | unsigned int partial_sum = cg::reduce(g, input, cg::plus()); 93 | // One thread uses aggregate atomics to compute sum and write offset of happy numbers 94 | unsigned int offset; 95 | if (g.thread_rank() == 0) 96 | { 97 | atomicAdd(&mHappyNumSum, partial_sum); 98 | offset = atomicAdd(&mHappyNumCount, g.size()); 99 | } 100 | // Distribute offset information from the thread that computed it to all others 101 | offset = g.shfl(offset, 0); 102 | // Each thread writes their happy number in a unique location 103 | mHappyNumbers[offset + g.thread_rank()] = input; 104 | } 105 | } 106 | 107 | void HappyNummbersWithGroups(unsigned int start, unsigned int N) 108 | { 109 | // Initialize / allocate result storage. 
For brevity, we use managed memory 110 | mHappyNumSum = 0; 111 | mHappyNumCount = 0; 112 | unsigned int* mHappyNumbers; 113 | cudaMallocManaged((void**)&mHappyNumbers, sizeof(unsigned int) * N); 114 | 115 | // Compute count, sum and list of base 10 "happy numbers" from start to start+N 116 | happyNumbersGroups<<<(N + 255) / 256, 256>>>(start, N, mHappyNumbers); 117 | cudaDeviceSynchronize(); 118 | 119 | // Print the count, sum and list of happy numbers in the given range 120 | std::cout << "No. of happy numbers in " << start << " - " << N << ": " << mHappyNumCount << "\n"; 121 | std::cout << "Sum of happy numbers in " << start << " - " << N << ": " << mHappyNumSum << "\n"; 122 | std::cout << "\nList of happy numbers in " << start << " - " << N << ": "; 123 | 124 | // Sort the managed memory happy number list in ascending order 125 | std::sort(mHappyNumbers, mHappyNumbers + mHappyNumCount); 126 | for (int i = 0; i < mHappyNumCount; i++) 127 | std::cout << mHappyNumbers[i] << ((i == mHappyNumCount - 1) ? "\n" : ", "); 128 | } 129 | 130 | int main() 131 | { 132 | std::cout << "==== Sample 17 - Cooperative Groups ====\n" << std::endl; 133 | /* 134 | Cooperative groups are very versatile. They can be created for entire 135 | grids, blocks, warps or opportunistically for converged threads. In 136 | essence, they package a range of recent CUDA features in an interface 137 | that abstracts away the low-level instructions, making CUDA code 138 | easier to understand. As such, cooperative groups have a vast range 139 | of applications. The two examples in this project cannot do them 140 | justice, for further use cases please consider the advanced NVIDIA 141 | CUDA Samples that include detailed, elaborate applications. 142 | 143 | Expected output: 144 | 1) Result of reduction, now computed with cooperative groups 145 | 2) The count, sum and list of the happy numbers in a given range (1-1000) 146 | */ 147 | 148 | std::cout << "==== Computing a Reduction with Cooperative Groups ====" << std::endl; 149 | 150 | ReduceWithGroups(); 151 | 152 | std::cout << "==== Computing Happy Numbers and their Sum ====" << std::endl; 153 | 154 | HappyNummbersWithGroups(1, 1000); 155 | 156 | return 0; 157 | } 158 | 159 | /* 160 | Exercises: 161 | 1) Write a kernel where each thread first computes its ID in a register. 162 | Within each group of 4 consecutive threads, threads should then share their 163 | ID with all others, using shuffling. Write this kernel once with, once without 164 | cooperative groups, and confirm correctness via output. 165 | 2) Launch a COOPERATIVE KERNEL and use grid-wide synchronization to make sure 166 | all threads in the entire grid are at the same point in the program. Can you 167 | think of any use cases for this? Your device will need to support the attribute 168 | cudaDevAttrCooperativeLaunch for this, check if it has it before starting. 169 | 3) Write a simple program with the following tasks A, B, C, each with N threads. 170 | In A, each thread t should compute and store t*t in its output A_out[t]. In B, 171 | each thread t should compute A_out[N - t - 1] - t and store it in its output 172 | B_out[t]. In C, each thread t should compute B_out[N - t - 1] + 4 and store it 173 | in its output C_out[t]. Implement this once using one kernel for each task A, 174 | and once with a single kernel that uses grid synchronization between tasks. 175 | In the single kernel, do you need additional threadfences and/or volatiles? 
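   For 2) and 3), a rough sketch of the cooperative-launch mechanics (the array names
   dA/dB are made up, error checks omitted, and the grid must be small enough to be
   co-resident on the device):

       __global__ void twoPhases(int* a, int* b, unsigned int N)
       {
           cg::grid_group grid = cg::this_grid();
           unsigned int t = grid.thread_rank();
           if (t < N) a[t] = t * t;                    // task A
           grid.sync();                                // grid-wide barrier
           if (t < N) b[t] = a[N - 1 - t] + 1;         // task B reads another block's result
       }

       // Host side: check support first, then use the cooperative launch API
       int supported = 0;
       cudaDeviceGetAttribute(&supported, cudaDevAttrCooperativeLaunch, 0);
       if (supported)
       {
           void* args[] = { &dA, &dB, &N };            // dA, dB: device arrays of N ints
           cudaLaunchCooperativeKernel((void*)twoPhases, dim3((N + 255) / 256), dim3(256), args, 0, nullptr);
           cudaDeviceSynchronize();
       }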
176 | Again, in order to do grid sync, your device will need to support the 177 | cudaDevAttrCooperativeLaunch attribute, check if it has it before starting. 178 | */ -------------------------------------------------------------------------------- /shared/include/utility.h: -------------------------------------------------------------------------------- 1 | // A simple kernel with two nested if clauses, 4 branches. 2 | // Each thread will take a separate branch, and then perform 3 | // N stepts. With legacy scheduling, each branch must be 4 | // finished before execution can continue with the next. 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #ifndef SAMPLES_UTIL_INCLUDED 13 | #define SAMPLES_UTIL_INCLUDED 14 | 15 | // Matrices are stored in row-major order: 16 | // M(row, col) = *(M.elements + row * M.stride + col) 17 | template 18 | struct Matrix { 19 | int width; 20 | int height; 21 | int stride; 22 | F* elements; 23 | }; 24 | 25 | namespace samplesutil 26 | { 27 | // Helper function to let threads spin 28 | __device__ void WasteTime(unsigned long long duration) 29 | { 30 | const unsigned long long int start = clock64(); 31 | while ((clock64() - start) < duration); 32 | } 33 | 34 | __device__ float GregoryLeibniz(unsigned int iterations) 35 | { 36 | float pi = 0.f, m = 1.f; 37 | for (int n = 0; n < iterations; n++, m *= -1.f) 38 | pi += 4.f * (m / (2 * n + 1)); 39 | return pi; 40 | } 41 | 42 | // Get a matrix element 43 | template 44 | __device__ float GetElement(const F* A, int row, int col, unsigned int DIM) 45 | { 46 | return A[row * DIM + col]; 47 | } 48 | 49 | // Set a matrix element 50 | template 51 | __device__ void SetElement(F* A, int row, int col, float value, unsigned int DIM) 52 | { 53 | A[row * DIM + col] = value; 54 | } 55 | 56 | // Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is 57 | // located col sub-matrices to the right and row sub-matrices down 58 | // from the upper-left corner of A 59 | template 60 | __device__ F* GetSubMatrix(F* A, int row, int col, unsigned int BLOCK_SIZE, unsigned int DIM) 61 | { 62 | F* Asub = A + DIM * BLOCK_SIZE * row + BLOCK_SIZE * col; 63 | return Asub; 64 | } 65 | 66 | // Matrix multiplication kernel called by MatMul() 67 | template 68 | __global__ void MatMulKernel(half* A, half* B, float* C) 69 | { 70 | // Block row and column 71 | int blockRow = blockIdx.y; 72 | int blockCol = blockIdx.x; 73 | 74 | // Each thread block computes one sub-matrix Csub of C 75 | float* Csub = GetSubMatrix(C, blockRow, blockCol, BLOCK_SIZE, DIM); 76 | 77 | // Each thread computes one element of Csub 78 | // by accumulating results into Cvalue 79 | float Cvalue = 0; 80 | 81 | // Thread row and column within Csub 82 | int row = threadIdx.y; 83 | int col = threadIdx.x; 84 | 85 | // Loop over all the sub-matrices of A and B that are 86 | // required to compute Csub 87 | // Multiply each pair of sub-matrices together 88 | // and accumulate the results 89 | for (int m = 0; m < (DIM / BLOCK_SIZE); ++m) { 90 | 91 | // Get sub-matrix Asub of A 92 | half* Asub = GetSubMatrix(A, blockRow, m, BLOCK_SIZE, DIM); 93 | 94 | // Get sub-matrix Bsub of B 95 | half* Bsub = GetSubMatrix(B, m, blockCol, BLOCK_SIZE, DIM); 96 | 97 | // Shared memory used to store Asub and Bsub respectively 98 | __shared__ half As[BLOCK_SIZE][BLOCK_SIZE]; 99 | __shared__ half Bs[BLOCK_SIZE][BLOCK_SIZE]; 100 | 101 | // Load Asub and Bsub from device memory to shared memory 102 | // Each thread loads one element of each sub-matrix 103 | 
As[row][col] = GetElement(Asub, row, col, DIM); 104 | Bs[row][col] = GetElement(Bsub, row, col, DIM); 105 | 106 | // Synchronize to make sure the sub-matrices are loaded 107 | // before starting the computation 108 | __syncthreads(); 109 | // Multiply Asub and Bsub together 110 | for (int e = 0; e < BLOCK_SIZE; ++e) 111 | Cvalue += (float)As[row][e] * (float)Bs[e][col]; 112 | 113 | // Synchronize to make sure that the preceding 114 | // computation is done before loading two new 115 | // sub-matrices of A and B in the next iteration 116 | __syncthreads(); 117 | } 118 | 119 | // Write Csub to device memory 120 | // Each thread writes one element 121 | SetElement(Csub, row, col, Cvalue, DIM); 122 | } 123 | 124 | // Matrix multiplication - Host code 125 | // Matrix dimensions are assumed to be multiples of BLOCK_SIZE 126 | template 127 | static void MatMul(const Matrix A, const Matrix B, Matrix C, const K& kernel, dim3 grid, dim3 block) 128 | { 129 | // Load A and B to device memory 130 | Matrix d_A; 131 | d_A.width = d_A.stride = A.width; d_A.height = A.height; 132 | size_t size = A.width * A.height * sizeof(half); 133 | cudaMalloc(&d_A.elements, size); 134 | cudaMemcpy(d_A.elements, A.elements, size, 135 | cudaMemcpyHostToDevice); 136 | Matrix d_B; 137 | d_B.width = d_B.stride = B.width; d_B.height = B.height; 138 | size = B.width * B.height * sizeof(half); 139 | cudaMalloc(&d_B.elements, size); 140 | cudaMemcpy(d_B.elements, B.elements, size, 141 | cudaMemcpyHostToDevice); 142 | 143 | // Allocate C in device memory 144 | Matrix d_C; 145 | d_C.width = d_C.stride = C.width; d_C.height = C.height; 146 | size = C.width * C.height * sizeof(float); 147 | cudaMalloc(&d_C.elements, size); 148 | 149 | cudaEvent_t start, end; 150 | cudaEventCreate(&start); 151 | cudaEventCreate(&end); 152 | 153 | // Invoke kernel 154 | cudaEventRecord(start); 155 | kernel<<>>(d_A, d_B, d_C); 156 | cudaEventRecord(end); 157 | 158 | cudaEventSynchronize(end); 159 | 160 | float ms; 161 | cudaEventElapsedTime(&ms, start, end); 162 | std::cout << "Matrix multiplication took: " << ms << " ms\n" << std::endl; 163 | 164 | // Read C from device memory 165 | cudaMemcpy(C.elements, d_C.elements, size, 166 | cudaMemcpyDeviceToHost); 167 | 168 | // Free device memory 169 | cudaFree(d_A.elements); 170 | cudaFree(d_B.elements); 171 | cudaFree(d_C.elements); 172 | } 173 | 174 | template 175 | static void MatMulConv(Matrix A, Matrix B, Matrix C) 176 | { 177 | dim3 block(BLOCK_SIZE, BLOCK_SIZE); 178 | dim3 grid(B.width / block.x, A.height / block.y); 179 | samplesutil::MatMul(A, B, C, samplesutil::MatMulKernel, grid, block); 180 | } 181 | 182 | static __host__ void prepareRandomNumbersCPUGPU(unsigned int N, std::vector& vals, float** dValsPtr) 183 | { 184 | constexpr float target = 42.f; 185 | // Print expected value, because reference may be off due to floating point (im-)precision 186 | std::cout << "\nExpected value: " << target * N << "\n" << std::endl; 187 | 188 | // Generate a few random inputs to accumulate 189 | std::default_random_engine eng(0xcaffe); 190 | std::normal_distribution dist(target); 191 | vals.resize(N); 192 | std::for_each(vals.begin(), vals.end(), [&dist, &eng](float& f) { f = dist(eng); }); 193 | 194 | // Allocate some global GPU memory to write the inputs to 195 | cudaMalloc((void**)dValsPtr, sizeof(float) * N); 196 | // Expliclity copy the inputs from the CPU to the GPU 197 | cudaMemcpy(*dValsPtr, vals.data(), sizeof(float) * N, cudaMemcpyHostToDevice); 198 | } 199 | 200 | // Define an unsigned integer 
variable that the GPU can work with 201 | __device__ unsigned int step = 0; 202 | 203 | // Increment the GPU variable N times. Whenever a thread observes 204 | // non-consecutive numbers, it prints the latest sequence. Hence, 205 | // every thread documents the turns that it was given by the 206 | // scheduler. 207 | static __device__ void takeNTurns(const char* who, unsigned int N) 208 | { 209 | int lastTurn = -42, turn, start; 210 | for (int i = 0; i < N; i++) 211 | { 212 | turn = atomicInc(&step, 0xFFFFFFFFU); 213 | 214 | bool switchOccurred = (lastTurn != (turn - 1)); 215 | bool done = (i == (N - 1)); 216 | 217 | if (done || ((i > 0) && switchOccurred)) 218 | printf("%s: %d--%d\n", who, start, lastTurn + (done ? 1 : 0)); 219 | 220 | if (switchOccurred) 221 | start = turn; 222 | 223 | lastTurn = turn; 224 | } 225 | } 226 | 227 | static __global__ void testScheduling(int N) 228 | { 229 | if (threadIdx.x < 2) // Branch once 230 | if (threadIdx.x == 0) // Branch again 231 | takeNTurns("Thread 1", N); 232 | else 233 | takeNTurns("Thread 2", N); 234 | else 235 | if (threadIdx.x == 2) // Branch again 236 | takeNTurns("Thread 3", N); 237 | else 238 | takeNTurns("Thread 4", N); 239 | } 240 | 241 | static __host__ void run2NestedBranchesForNSteps(int N) 242 | { 243 | testScheduling<<<1, 4>>>(N); 244 | cudaDeviceSynchronize(); 245 | } 246 | 247 | /* 248 | Computes whether a given number is a "happy number". 249 | https://en.wikipedia.org/wiki/Happy_number 250 | */ 251 | static __host__ __device__ bool isHappy(unsigned int num) 252 | { 253 | while (num != 0 && num != 1 && num != 4) 254 | { 255 | unsigned int next_num = 0; 256 | for (unsigned int n = num; n > 0; n /= 10) 257 | { 258 | unsigned int t = n % 10; 259 | next_num += t * t; 260 | } 261 | num = next_num; 262 | } 263 | return num == 1; 264 | } 265 | } 266 | 267 | #endif -------------------------------------------------------------------------------- /08_Reductions/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "../../shared/include/utility.h" 11 | 12 | // Declare a GPU-visible floating point variable in global memory. 13 | __device__ float dResult; 14 | 15 | /* 16 | The most basic reduction kernel uses atomic operations to accumulate 17 | the individual inputs in a single, device-wide visible variable. 18 | If you have experience with atomics, it is important to note that the 19 | basic atomicXXX instructions of CUDA have RELAXED semantics (scary!). 20 | That means, the threads that operate atomically on them only agree that 21 | there is a particular order for the accesses to that variable and nothing 22 | else (especially no acquire/release semantics). 23 | */ 24 | __global__ void reduceAtomicGlobal(const float* __restrict input, int N) 25 | { 26 | const int id = threadIdx.x + blockIdx.x * blockDim.x; 27 | /* 28 | Since all blocks must have the same number of threads, 29 | we may have to launch more threads than there are 30 | inputs. Superfluous threads should not try to read 31 | from the input (out of bounds access!) 32 | */ 33 | if (id < N) 34 | atomicAdd(&dResult, input[id]); 35 | } 36 | 37 | /* 38 | First improvement: shared memory is much faster than global 39 | memory. Each block can accumulate partial results in isolated 40 | block-wide visible memory. This relieves the contention on 41 | a single global variable that all threads want access to. 
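   Either way, the kernels in this file accumulate into the device symbol dResult
   declared at the top. Host code cannot dereference that symbol directly; main()
   below resets it and reads it back through the symbol-copy API, roughly like this:

       float result = 0.0f;
       cudaMemcpyToSymbol(dResult, &result, sizeof(float));      // reset before a run
       reduceAtomicGlobal<<<(N + 255) / 256, 256>>>(dValsPtr, N);
       cudaMemcpyFromSymbol(&result, dResult, sizeof(float));    // read back (synchronizes)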
42 | */ 43 | __global__ void reduceAtomicShared(const float* __restrict input, int N) 44 | { 45 | const int id = threadIdx.x + blockIdx.x * blockDim.x; 46 | 47 | // Declare a shared float for each block 48 | __shared__ float x; 49 | 50 | // Only one thread should initialize this shared value 51 | if (threadIdx.x == 0) 52 | x = 0.0f; 53 | 54 | /* 55 | Before we continue, we must ensure that all threads 56 | can see this update (initialization) by thread 0 57 | */ 58 | __syncthreads(); 59 | 60 | /* 61 | Every thread in the block adds its input to the 62 | shared variable of the block. 63 | */ 64 | if (id < N) 65 | atomicAdd(&x, input[id]); 66 | 67 | // Wait until all threads have done their part 68 | __syncthreads(); 69 | 70 | /* 71 | Once they are all done, only one thread must add 72 | the block's partial result to the global variable. 73 | */ 74 | if (threadIdx.x == 0) 75 | atomicAdd(&dResult, x); 76 | } 77 | 78 | /* 79 | Second improvement: choosing a more suitable algorithm. 80 | We can exploit the fact that the GPU is massively parallel 81 | and come up with a fitting procedure that uses multiple 82 | iterations. In each iteration, threads accumulate partial 83 | results from the previous iteration. Before, the contented 84 | accesses to one location forced the GPU to perform updates 85 | sequentially O(N). Now, each thread can access its own, 86 | exclusive shared variable in each iteration in parallel, 87 | giving an effective runtime that is closer to O(log N). 88 | */ 89 | template 90 | __global__ void reduceShared(const float* __restrict input, int N) 91 | { 92 | const int id = threadIdx.x + blockIdx.x * blockDim.x; 93 | 94 | /* 95 | Use a larger shared memory region so that each 96 | thread can store its own partial results 97 | */ 98 | __shared__ float data[BLOCK_SIZE]; 99 | /* 100 | Use a new strategy to handle superfluous threads. 101 | To make sure they stay alive and can help with 102 | the reduction, threads without an input simply 103 | produce a '0', which has no effect on the result. 104 | */ 105 | data[threadIdx.x] = (id < N ? input[id] : 0); 106 | 107 | /* 108 | log N iterations to complete. In each step, a thread 109 | accumulates two partial values to form the input for 110 | the next iteration. The sum of all partial results 111 | eventually yields the full result of the reduction. 112 | */ 113 | for (int s = blockDim.x / 2; s > 0; s /= 2) 114 | { 115 | /* 116 | In each iteration, we must make sure that all 117 | threads are done writing the updates of the 118 | previous iteration / the initialization. 119 | */ 120 | __syncthreads(); 121 | if (threadIdx.x < s) 122 | data[threadIdx.x] += data[threadIdx.x + s]; 123 | } 124 | 125 | /* 126 | Note: thread 0 is the last thread to combine two 127 | partial results, and the one who writes to global 128 | memory, therefore no synchronization is required 129 | after the last iteration. 130 | */ 131 | if (threadIdx.x == 0) 132 | atomicAdd(&dResult, data[0]); 133 | } 134 | 135 | /* 136 | Warp-level improvement: using warp-level primitives to 137 | accelerate the final steps of the reduction. Warps 138 | have a fast lane for communication. They are free 139 | to exchange values in registers when they are being 140 | scheduled for execution. Warps will be formed from 141 | consecutive threads in groups of 32. 
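   As an aside (not the code used below): the last-32 step is often written with
   __shfl_down_sync instead, where each lane pulls the register of a higher lane.
   After this loop, lane 0 holds the warp-wide sum; the remaining lanes only hold
   partial sums.

       __device__ float warpSum(float x)
       {
           for (int offset = 16; offset > 0; offset /= 2)
               x += __shfl_down_sync(0xFFFFFFFF, x, offset);
           return x;
       }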
142 | */ 143 | template 144 | __global__ void reduceShuffle(const float* __restrict input, int N) 145 | { 146 | const int id = threadIdx.x + blockIdx.x * blockDim.x; 147 | 148 | __shared__ float data[BLOCK_SIZE]; 149 | data[threadIdx.x] = (id < N ? input[id] : 0); 150 | 151 | // Only use shared memory until last 32 values 152 | for (int s = blockDim.x / 2; s > 16; s /= 2) 153 | { 154 | __syncthreads(); 155 | if (threadIdx.x < s) 156 | data[threadIdx.x] += data[threadIdx.x + s]; 157 | } 158 | 159 | // The last 32 values can be handled with warp-level primitives 160 | float x = data[threadIdx.x]; 161 | if (threadIdx.x < 32) 162 | { 163 | /* 164 | The threads in the first warp shuffle their registers. 165 | This replaces the last 5 iterations of the previous solution. 166 | The mask indicates which threads participate in the shuffle. 167 | The value indicates which register should be shuffled. 168 | The final parameter gives the source thread from which the 169 | current one should receive the shuffled value. Accesses that 170 | are out of range (>= 32) will wrap around, but are not needed 171 | (they will not affect the final result written by thread 0). 172 | In each shuffle, at least half of the threads only participate 173 | so they can provide useful data from the previous shuffle for 174 | lower threads. To keep the code short, we always let all threads 175 | participate, because it is an error to let threads reach a shuffle 176 | instruction that they don't participate in. 177 | */ 178 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 16); 179 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 8); 180 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 4); 181 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 2); 182 | x += __shfl_sync(0xFFFFFFFF, x, 1); 183 | } 184 | 185 | if (threadIdx.x == 0) 186 | atomicAdd(&dResult, x); 187 | } 188 | 189 | /* 190 | Final improvement: half of our threads actually idle after 191 | they have loaded data from global memory to shared! Better 192 | to have threads fetch two values at the start and then let 193 | them all do at least some meaningful work. This means that 194 | compared to all other methods, only half the number of 195 | threads must be launched in the grid! 196 | */ 197 | template 198 | __global__ void reduceFinal(const float* __restrict input, int N) 199 | { 200 | const int id = threadIdx.x + blockIdx.x * blockDim.x; 201 | 202 | __shared__ float data[BLOCK_SIZE]; 203 | // Already combine two values upon load from global memory. 204 | data[threadIdx.x] = id < N / 2 ? input[id] : 0; 205 | data[threadIdx.x] += id + N/2 < N ? input[id + N / 2] : 0; 206 | 207 | for (int s = blockDim.x / 2; s > 16; s /= 2) 208 | { 209 | __syncthreads(); 210 | if (threadIdx.x < s) 211 | data[threadIdx.x] += data[threadIdx.x + s]; 212 | } 213 | 214 | float x = data[threadIdx.x]; 215 | if (threadIdx.x < 32) 216 | { 217 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 16); 218 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 8); 219 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 4); 220 | x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 2); 221 | x += __shfl_sync(0xFFFFFFFF, x, 1); 222 | } 223 | 224 | if (threadIdx.x == 0) 225 | atomicAdd(&dResult, x); 226 | } 227 | 228 | int main() 229 | { 230 | std::cout << "==== Sample 08 - Reductions ====\n" << std::endl; 231 | /* 232 | Expected output: Accumulated results from CPU and GPU that 233 | approach 42 * NUM_ITEMS (can vary greatly due to floating point 234 | precision limitations). 
235 | 236 | With more sophisticated techniques, reported performance of the 237 | GPU versions (measured runtime in ms) should generally decrease. 238 | */ 239 | 240 | constexpr unsigned int BLOCK_SIZE = 256; 241 | constexpr unsigned int WARMUP_ITERATIONS = 10; 242 | constexpr unsigned int TIMING_ITERATIONS = 20; 243 | constexpr unsigned int N = 100'000'000; 244 | 245 | std::cout << "Producing random inputs...\n" << std::endl; 246 | // Generate some random numbers to reduce 247 | std::vector vals; 248 | float* dValsPtr; 249 | samplesutil::prepareRandomNumbersCPUGPU(N, vals, &dValsPtr); 250 | 251 | std::cout << "==== CPU Reduction ====\n" << std::endl; 252 | // A reference value is computed by sequential reduction 253 | std::cout << "Computed CPU value: " << std::accumulate(vals.cbegin(), vals.cend(), 0.0f) << std::endl; 254 | 255 | std::cout << "==== GPU Reductions ====\n" << std::endl; 256 | /* 257 | Set up a collection of reductions to evaluate for performance. 258 | Each entry gives a technique's name, the kernel to call, and 259 | the number of threads required for each individual technique. 260 | */ 261 | const std::tuple reductionTechniques[] 262 | { 263 | {"Atomic Global", reduceAtomicGlobal, N}, 264 | {"Atomic Shared", reduceAtomicShared, N}, 265 | {"Reduce Shared", reduceShared, N}, 266 | {"Reduce Shuffle", reduceShuffle, N}, 267 | {"Reduce Final", reduceFinal, N / 2 + 1} 268 | }; 269 | 270 | // Evaluate each technique separately 271 | for (const auto& [name, func, numThreads] : reductionTechniques) 272 | { 273 | // Compute the smallest grid to start required threads with a given block size 274 | const dim3 blockDim = { BLOCK_SIZE, 1, 1 }; 275 | const dim3 gridDim = { (numThreads + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1 }; 276 | 277 | // Run several reductions for GPU to warm up 278 | for (int i = 0; i < WARMUP_ITERATIONS; i++) 279 | func<<>>(dValsPtr, N); 280 | 281 | // Synchronize to ensure CPU only records time after warmup is done 282 | cudaDeviceSynchronize(); 283 | const auto before = std::chrono::system_clock::now(); 284 | 285 | float result = 0.0f; 286 | // Run several iterations to get an average measurement 287 | for (int i = 0; i < TIMING_ITERATIONS; i++) 288 | { 289 | // Reset acummulated result to 0 in each run 290 | cudaMemcpyToSymbol(dResult, &result, sizeof(float)); 291 | func<<>>(dValsPtr, N); 292 | } 293 | 294 | // cudaMemcpyFromSymbol will implicitly synchronize CPU and GPU 295 | cudaMemcpyFromSymbol(&result, dResult, sizeof(float)); 296 | 297 | // Can measure time without an extra synchronization 298 | const auto after = std::chrono::system_clock::now(); 299 | const auto elapsed = 1000.f * std::chrono::duration_cast>(after - before).count(); 300 | std::cout << std::setw(20) << name << "\t" << elapsed / TIMING_ITERATIONS << "ms \t" << result << std::endl; 301 | } 302 | 303 | // Free the allocated memory for input 304 | cudaFree(dValsPtr); 305 | return 0; 306 | } 307 | 308 | /* 309 | Exercises: 310 | 1) Change the program so that the methods reduce integer values instead of float. 311 | Can you observe any difference in terms of speed / computed results? 312 | 2) Do you have any other ideas how the reduction could be improved? 313 | Making it even faster should be quite challenging, but if you have 314 | some suggestions, try them out and see how they affect performance! 315 | */ --------------------------------------------------------------------------------
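A final aside on exercise 2 of the reductions sample: a library reduction makes a useful
performance baseline for the hand-written kernels. This is only a sketch (it assumes the
Thrust headers that ship with the CUDA toolkit) and reuses the dValsPtr / N prepared above:

    #include <thrust/reduce.h>
    #include <thrust/execution_policy.h>

    float reduceWithThrust(const float* dValsPtr, int N)
    {
        // Thrust picks its own launch configuration; compare its runtime and result
        // against the kernels timed in main() above.
        return thrust::reduce(thrust::device, dValsPtr, dValsPtr + N, 0.0f);
    }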