├── cpu ├── cpp │ ├── .gitignore │ ├── CMakeLists.txt │ ├── cpu.cpp │ ├── task1.hpp │ ├── task3.hpp │ └── task2.hpp ├── rust │ ├── src │ │ ├── task2.rs │ │ ├── task3.rs │ │ ├── task1.rs │ │ └── main.rs │ └── Cargo.toml └── readme.md ├── cuda ├── .gitignore ├── readme.md └── main.cu ├── openmp ├── .gitignore ├── CMakeLists.txt ├── openmp.cpp └── readme.md ├── directx ├── computing │ ├── .gitignore │ ├── CMakeLists.txt │ ├── sum.hlsl │ └── main.cpp ├── graphics │ ├── .gitignore │ ├── CMakeLists.txt │ └── main.cpp └── readme.md ├── vulkan ├── graphics │ ├── .gitignore │ ├── shader.frag │ ├── shader.vert │ ├── CMakeLists.txt │ └── main.cpp ├── compute │ ├── .gitignore │ └── compute.py └── readme.md ├── opengl ├── computing │ ├── .gitignore │ ├── CMakeLists.txt │ └── main.cpp ├── graphics │ ├── .gitignore │ ├── CMakeLists.txt │ └── main.cpp └── readme.md ├── assets └── triangle.gif ├── opencl ├── src │ ├── test_kernel.cl │ └── main.rs ├── Cargo.toml └── readme.md ├── webgpu ├── graphics │ ├── Cargo.toml │ └── src │ │ ├── shader.wgsl │ │ └── main.rs ├── compute │ ├── Cargo.toml │ └── src │ │ ├── shader.wgsl │ │ └── main.rs └── readme.md ├── AcceleratedKernels.jl ├── Project │ ├── Project.toml │ └── src │ │ └── Project.jl └── readme.md ├── metal └── readme.md ├── triton ├── readme.md └── main.py ├── .gitignore ├── sycl ├── main.cpp └── readme.md ├── LICENSE ├── benchmark.py └── readme.rst /cpu/cpp/.gitignore: -------------------------------------------------------------------------------- 1 | build/* -------------------------------------------------------------------------------- /cuda/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /openmp/.gitignore: -------------------------------------------------------------------------------- 1 | build/* -------------------------------------------------------------------------------- 
/directx/computing/.gitignore: -------------------------------------------------------------------------------- 1 | build/* -------------------------------------------------------------------------------- /directx/graphics/.gitignore: -------------------------------------------------------------------------------- 1 | build/* -------------------------------------------------------------------------------- /vulkan/graphics/.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | -------------------------------------------------------------------------------- /vulkan/compute/.gitignore: -------------------------------------------------------------------------------- 1 | *.comp 2 | *.spv 3 | -------------------------------------------------------------------------------- /opengl/computing/.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | include/* 3 | src/* 4 | -------------------------------------------------------------------------------- /opengl/graphics/.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | include/* 3 | src/* 4 | -------------------------------------------------------------------------------- /assets/triangle.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Vincent-Therrien/gpu-arena/HEAD/assets/triangle.gif -------------------------------------------------------------------------------- /opencl/src/test_kernel.cl: -------------------------------------------------------------------------------- 1 | __kernel void add(__global float* buffer) { 2 | buffer[get_global_id(0)] += 2.0; 3 | } 4 | -------------------------------------------------------------------------------- /opencl/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "opencl" 3 | version = 
"0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | ocl = "0.19" 8 | -------------------------------------------------------------------------------- /cpu/rust/src/task2.rs: -------------------------------------------------------------------------------- 1 | /// Task 2: Multiply 2 matrices 2 | pub fn task_2(_n: u32, _threads: u32) -> f64 { 3 | panic!("Unsupported"); 4 | } 5 | -------------------------------------------------------------------------------- /webgpu/graphics/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "graphics" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | glfw = "0.59.0" 8 | wgpu = "24.0.1" 9 | pollster = "0.4.0" 10 | -------------------------------------------------------------------------------- /cpu/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | project(cpu LANGUAGES CXX) 3 | 4 | set(CMAKE_CXX_STANDARD 20) 5 | set(CMAKE_CXX_STANDARD_REQUIRED True) 6 | 7 | add_executable(cpu cpu.cpp) 8 | -------------------------------------------------------------------------------- /openmp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | project(main LANGUAGES CXX) 3 | 4 | set(CMAKE_CXX_STANDARD 20) 5 | set(CMAKE_CXX_STANDARD_REQUIRED True) 6 | find_package(OpenMP) 7 | 8 | add_executable(main openmp.cpp) 9 | -------------------------------------------------------------------------------- /webgpu/compute/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "compute" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | glfw = "0.59.0" 8 | wgpu = "24.0.1" 9 | pollster = "0.4.0" 10 | bytemuck = "1.21.0" 11 | futures-intrusive = "0.5" 12 | 
-------------------------------------------------------------------------------- /cpu/rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cpu" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | clap = { version = "4.5.26", features = ["derive"] } 8 | ndarray = { version = "0.16.0", default-features = false } 9 | ndarray-rand = "0.15.0" 10 | floating-duration = "0.1.2" 11 | -------------------------------------------------------------------------------- /vulkan/graphics/shader.frag: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | layout(location = 0) in vec3 fragColor; 4 | layout(location = 0) out vec4 outColor; 5 | 6 | void main() { 7 | float levels = 10.0; // Number of discrete color levels 8 | vec3 quantizedColor = floor(fragColor.rgb * levels) / (levels - 1.0); 9 | outColor = vec4(quantizedColor, 1.0); 10 | } -------------------------------------------------------------------------------- /AcceleratedKernels.jl/Project/Project.toml: -------------------------------------------------------------------------------- 1 | name = "Project" 2 | uuid = "b047b328-b947-4008-9592-27f6f2e42ee3" 3 | authors = ["Vincent Therrien"] 4 | version = "0.1.0" 5 | 6 | [deps] 7 | AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" 8 | CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" 9 | GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" 10 | -------------------------------------------------------------------------------- /directx/graphics/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | project(DirectX_Project) 3 | 4 | # Set C++ standard 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | # Add the executable 8 | add_executable(graphics main.cpp) 9 | 10 | target_link_libraries(graphics d3d11 dxgi d3dcompiler) 11 | set_target_properties(graphics PROPERTIES 
LINK_FLAGS "/SUBSYSTEM:WINDOWS") 12 | -------------------------------------------------------------------------------- /metal/readme.md: -------------------------------------------------------------------------------- 1 | # Metal 2 | 3 | Metal is a graphics and general-purpose computing framework developed by Apple that targets GPUs. It 4 | only works on Apple operating systems (Mac / iOS). Metal uses the MSL shading language. 5 | 6 | I don't have an Apple device so I couldn't test it `:(`. Refer to the page 7 | https://developer.apple.com/documentation/metal/ if you want to know more! 8 | -------------------------------------------------------------------------------- /vulkan/graphics/shader.vert: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | layout(location = 0) out vec3 fragColor; 4 | 5 | vec2 positions[3] = vec2[]( 6 | vec2(0.0, -0.5), 7 | vec2(0.5, 0.5), 8 | vec2(-0.5, 0.5) 9 | ); 10 | 11 | vec3 colors[3] = vec3[]( 12 | vec3(1.0, 0.0, 0.0), 13 | vec3(0.0, 1.0, 0.0), 14 | vec3(0.0, 0.0, 1.0) 15 | ); 16 | 17 | void main() { 18 | gl_Position = vec4(positions[gl_VertexIndex], 0.0, 1.0); 19 | fragColor = colors[gl_VertexIndex]; 20 | } 21 | -------------------------------------------------------------------------------- /opengl/graphics/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | project(OpenGL_Project) 3 | 4 | # Set C++ standard 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | # Find OpenGL 8 | find_package(OpenGL REQUIRED) 9 | 10 | # Find GLFW 11 | find_package(glfw3 REQUIRED) 12 | 13 | # Include directories 14 | include_directories(include) 15 | 16 | # Create the executable 17 | add_executable(graphics main.cpp src/glad.c) 18 | 19 | # Link libraries 20 | target_link_libraries(graphics glfw OpenGL::GL) 21 | -------------------------------------------------------------------------------- /openmp/openmp.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main() { 6 | const int N = 100000; 7 | std::vector elements(N, 1.0f); 8 | float* array = elements.data(); 9 | float sum = 0.0f; 10 | 11 | #pragma omp target data map(to: array[0:N]) map(tofrom: sum) 12 | { 13 | #pragma omp target teams distribute parallel for reduction(+:sum) 14 | for (int i = 0; i < N; ++i) { 15 | sum += array[i]; 16 | } 17 | } 18 | 19 | std::cout << "Sum: " << sum << std::endl; 20 | 21 | return 0; 22 | } 23 | -------------------------------------------------------------------------------- /vulkan/graphics/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | project(Vulkan_Project) 3 | 4 | # Set C++ standard 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | # Find Vulkan 8 | find_package(Vulkan REQUIRED) 9 | 10 | # Find GLFW 11 | find_package(glfw3 REQUIRED) 12 | 13 | # Find GLM 14 | find_package(glm REQUIRED) 15 | 16 | # Create the executable 17 | add_executable(graphics main.cpp) 18 | 19 | # Link libraries 20 | target_link_libraries(graphics Vulkan::Vulkan glfw) 21 | 22 | # Include GLM 23 | target_include_directories(graphics PRIVATE ${GLM_INCLUDE_DIRS}) 24 | target_compile_definitions(graphics PRIVATE ${GLM_DEFINITIONS}) -------------------------------------------------------------------------------- /directx/computing/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | project(DirectCompute_Project) 3 | 4 | # Set C++ standard 5 | set(CMAKE_CXX_STANDARD 17) 6 | set(CMAKE_CXX_STANDARD_REQUIRED True) 7 | 8 | # Define the source files 9 | set(SOURCES main.cpp) 10 | 11 | # Define the executable 12 | add_executable(computing ${SOURCES}) 13 | 14 | # Link against DirectX 11 libraries 15 | target_link_libraries(computing d3d11 d3dcompiler) 16 | 17 | # Copy 
the compute shader to the output directory 18 | add_custom_command(TARGET computing POST_BUILD 19 | COMMAND ${CMAKE_COMMAND} -E copy_if_different 20 | ${CMAKE_SOURCE_DIR}/sum.hlsl 21 | $/sum.hlsl 22 | ) 23 | -------------------------------------------------------------------------------- /triton/readme.md: -------------------------------------------------------------------------------- 1 | # Triton 2 | 3 | Triton is a high-level GPU programming API developed by OpenAI that targets neural network 4 | acceleration. It uses decorators in Python code to mark computations to accelerate. Since it uses 5 | CUDA as a backend, it is only supported on Nvidia GPUs. Also, it only works on Linux / WSL (as of 6 | March 2025). This project implements sum reduction and a softmax function. 7 | 8 | Relevant links: 9 | 10 | - OpenAI blog post: https://openai.com/index/triton/ 11 | - Github repository: https://github.com/triton-lang/triton 12 | 13 | Run the following commands to try Triton. 14 | 15 | ``` 16 | pip install numpy 17 | pip install torch 18 | pip install triton 19 | python3 main.py 20 | ``` 21 | -------------------------------------------------------------------------------- /opengl/computing/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | project(OpenGL_Project) 3 | 4 | # Set C++ standard 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | # Find OpenGL 8 | find_package(OpenGL REQUIRED) 9 | 10 | # Find GLFW 11 | find_package(glfw3 REQUIRED) 12 | 13 | # Find GLEW 14 | find_package(PkgConfig REQUIRED) 15 | pkg_check_modules(GLEW REQUIRED glew) 16 | 17 | # Add the executable 18 | add_executable(computing main.cpp) 19 | 20 | # Include directories 21 | target_include_directories(computing PRIVATE ${GLEW_INCLUDE_DIRS}) 22 | target_include_directories(computing PRIVATE ${GLFW3_INCLUDE_DIRS}) 23 | 24 | # Link libraries 25 | target_link_libraries(computing PRIVATE ${GLEW_LIBRARIES} glfw OpenGL::GL) 26 
| -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | debug/ 4 | target/ 5 | 6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 8 | Cargo.lock 9 | 10 | # These are backup files generated by rustfmt 11 | **/*.rs.bk 12 | 13 | # MSVC Windows builds of rustc generate these, which store debugging information 14 | *.pdb 15 | 16 | # RustRover 17 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 18 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 19 | # and can be added to the global gitignore or merged into this file. For a more nuclear 20 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
21 | #.idea/ -------------------------------------------------------------------------------- /webgpu/graphics/src/shader.wgsl: -------------------------------------------------------------------------------- 1 | struct VertexPayload { 2 | @builtin(position) position: vec4, 3 | @location(0) color: vec3, 4 | }; 5 | 6 | @vertex 7 | fn vertices(@builtin(vertex_index) i: u32) -> VertexPayload { 8 | 9 | var positions = array, 3>( 10 | vec2(-0.75, -0.75), 11 | vec2( 0.75, -0.75), 12 | vec2( 0.0, 0.75), 13 | ); 14 | 15 | var colors = array, 3>( 16 | vec3(1.0, 0.0, 0.0), 17 | vec3(0.0, 1.0, 0.0), 18 | vec3(0.0, 0.0, 1.0), 19 | ); 20 | 21 | var out: VertexPayload; 22 | out.position = vec4(positions[i], 0.0, 1.0); 23 | out.color = colors[i]; 24 | return out; 25 | } 26 | 27 | @fragment 28 | fn fragment(in: VertexPayload) -> @location(0) vec4 { 29 | let levels: f32 = 10.0; 30 | let quantizedColor: vec3 = floor(in.color * levels) / (levels - 1.0); 31 | return vec4(quantizedColor, 1.0); 32 | } 33 | -------------------------------------------------------------------------------- /AcceleratedKernels.jl/Project/src/Project.jl: -------------------------------------------------------------------------------- 1 | import AcceleratedKernels as AK 2 | using CUDA, GPUArrays 3 | 4 | # Define the binary reduction function: it adds two elements. 5 | f = (a, b) -> a + b 6 | 7 | # Define a neutral element for the operation. This is used when there are more threads than elements 8 | # to reduce. Some elements will be reduced with `0` when they cannot be paired with another element. 9 | GPUArrays.neutral_element(::typeof(f), ::Type{T}) where T = zero(T) 10 | 11 | # Create a vector of elements comprised within the range [-1.0, 1.0]. 12 | v = CuArray(rand(Float32, 10000) * 2.0 .- 1.0) 13 | 14 | # Apply the function f to elements in v to produce the result. `init` defines the initial value of 15 | # the reduction. 
16 | vsum = AK.reduce(f, v; init=zero(eltype(v))) 17 | 18 | # The result of the sum follows a shifted Irwin–Hall distribution. Since there are 10000 elements, 19 | # std = sqrt(10000 / 3) ~ 57.7. So the sum will typically fall in the range [-58, 58]. 20 | print(vsum) 21 | -------------------------------------------------------------------------------- /directx/computing/sum.hlsl: -------------------------------------------------------------------------------- 1 | // Thread group size 2 | #define GROUP_SIZE 256 3 | 4 | // Input and output buffers 5 | RWStructuredBuffer inputBuffer : register(u0); 6 | RWStructuredBuffer outputBuffer : register(u1); 7 | 8 | // Shared memory for local reduction 9 | groupshared float localSum[GROUP_SIZE]; 10 | 11 | [numthreads(GROUP_SIZE, 1, 1)] 12 | void main(uint3 threadID : SV_DispatchThreadID, uint3 groupID : SV_GroupID, uint3 localID : SV_GroupThreadID) { 13 | // Load data into shared memory 14 | localSum[localID.x] = inputBuffer[threadID.x]; 15 | GroupMemoryBarrierWithGroupSync(); 16 | 17 | // Parallel reduction within the group 18 | for (uint stride = GROUP_SIZE / 2; stride > 0; stride /= 2) { 19 | if (localID.x < stride) { 20 | localSum[localID.x] += localSum[localID.x + stride]; 21 | } 22 | GroupMemoryBarrierWithGroupSync(); 23 | } 24 | 25 | // Store the partial sum from each group into the output buffer 26 | if (localID.x == 0) { 27 | outputBuffer[groupID.x] = localSum[0]; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /sycl/main.cpp: -------------------------------------------------------------------------------- 1 | // This code is not functional yet! I have not managed to compile it with oneAPI. 
2 | 3 | #include 4 | #include 5 | 6 | int main() { 7 | sycl::queue q; // Create a SYCL queue (chooses a default device) 8 | 9 | std::cout << "Running on: " << q.get_device().get_info() << "\n"; 10 | 11 | const int N = 1024; 12 | std::vector data(N, 1); // Initialize array with 1s 13 | 14 | { 15 | sycl::buffer buf(data.data(), sycl::range<1>(N)); 16 | q.submit([&](sycl::handler &h) { 17 | sycl::accessor acc(buf, h, sycl::write_only, sycl::no_init); 18 | h.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) { 19 | acc[i] = i[0]; // Set each element to its index 20 | }); 21 | }); 22 | } // Buffer goes out of scope -> Data is copied back 23 | 24 | std::cout << "First 10 elements: "; 25 | for (int i = 0; i < 10; i++) { 26 | std::cout << data[i] << " "; 27 | } 28 | std::cout << "\n"; 29 | 30 | return 0; 31 | } 32 | -------------------------------------------------------------------------------- /cpu/rust/src/task3.rs: -------------------------------------------------------------------------------- 1 | use ndarray::Array; 2 | use ndarray_rand::RandomExt; 3 | use ndarray_rand::rand_distr::Uniform; 4 | use std::time::Instant; 5 | use floating_duration::TimeAsFloat; 6 | use std::thread; 7 | use std::sync::Arc; 8 | 9 | fn parallel_softmax(numbers: Vec, num_threads: usize) -> f32 { 10 | let numbers = Arc::new(numbers); 11 | let chunk_size = (numbers.len() + num_threads - 1) / num_threads; 12 | let mut handles = Vec::new(); 13 | 14 | for i in 0..num_threads { 15 | let numbers = Arc::clone(&numbers); 16 | handles.push(thread::spawn(move || { 17 | numbers.iter().skip(i * chunk_size).take(chunk_size).sum::() 18 | })); 19 | } 20 | 21 | handles.into_iter().map(|h| h.join().unwrap()).sum() 22 | } 23 | 24 | /// Task 3: Softmax function. 
25 | pub fn task_3(n: u32, threads: u32) -> f64 { 26 | let data = Array::random((n as usize, ), Uniform::new(-10.0, 10.0)).to_vec(); 27 | let now = Instant::now(); 28 | { 29 | parallel_softmax(data, threads as usize); 30 | } 31 | now.elapsed().as_fractional_secs() 32 | } 33 | -------------------------------------------------------------------------------- /cuda/readme.md: -------------------------------------------------------------------------------- 1 | # CUDA 2 | 3 | CUDA is a GPU programming framework by Nvidia that works only on their GPUs. You can find more 4 | information at https://developer.nvidia.com/cuda-toolkit. This project uses CUDA to accelerate 5 | sum reduction. 6 | 7 | CUDA was released in 2007 and is currently used in several machine learning projects, like PyTorch 8 | and Tensorflow. It is efficient and convenient to use, but since it is restricted to Nvidia GPUs, 9 | some projects look into alternatives to develop cross-platform applications. For instance, llama.cpp 10 | (https://github.com/ggml-org/llama.cpp) uses multiple GPU backends to support other platforms in 11 | addition to Nvidia GPUs. 
12 | 13 | 14 | ## Build 15 | 16 | On linux, run the following commands: 17 | 18 | ``` 19 | sudo apt install nvidia-cuda-toolkit 20 | mkdir build 21 | cd build 22 | nvcc ../main.cu -o main 23 | ./main 24 | ``` 25 | 26 | 27 | On Windows, download CUDA from https://developer.nvidia.com/cuda-downloads and run the following 28 | commands: 29 | 30 | ``` 31 | mkdir build 32 | cd build 33 | nvcc ..\main.cu -o main 34 | main.exe 35 | ``` 36 | -------------------------------------------------------------------------------- /cpu/rust/src/task1.rs: -------------------------------------------------------------------------------- 1 | use ndarray::Array; 2 | use ndarray_rand::RandomExt; 3 | use ndarray_rand::rand_distr::Uniform; 4 | use std::time::Instant; 5 | use floating_duration::TimeAsFloat; 6 | use std::thread; 7 | use std::sync::Arc; 8 | 9 | fn parallel_sum(numbers: Vec, num_threads: usize) -> f32 { 10 | let numbers = Arc::new(numbers); 11 | let chunk_size = (numbers.len() + num_threads - 1) / num_threads; 12 | let mut handles = Vec::new(); 13 | 14 | for i in 0..num_threads { 15 | let numbers = Arc::clone(&numbers); 16 | handles.push(thread::spawn(move || { 17 | numbers.iter().skip(i * chunk_size).take(chunk_size).sum::() 18 | })); 19 | } 20 | 21 | handles.into_iter().map(|h| h.join().unwrap()).sum() 22 | } 23 | 24 | /// Task 1: Compute the sum of elements in a 1D array. 
25 | pub fn task_1(n: u32, threads: u32) -> f64 { 26 | let data = Array::random((n as usize, ), Uniform::new(-1.0, 1.0)).to_vec(); 27 | let now = Instant::now(); 28 | { 29 | parallel_sum(data, threads as usize); 30 | } 31 | now.elapsed().as_fractional_secs() 32 | } 33 | -------------------------------------------------------------------------------- /webgpu/compute/src/shader.wgsl: -------------------------------------------------------------------------------- 1 | struct DataBuf { 2 | data: array, 3 | } 4 | 5 | @group(0) 6 | @binding(0) 7 | var inputBuffer: DataBuf; 8 | 9 | @group(0) 10 | @binding(1) 11 | var outputBuffer: DataBuf; 12 | 13 | @compute 14 | @workgroup_size(64) 15 | fn main(@builtin(global_invocation_id) global_id: vec3, 16 | @builtin(local_invocation_id) local_id: vec3, 17 | @builtin(workgroup_id) workgroup_id: vec3) { 18 | 19 | let index = global_id.x; 20 | let local_index = local_id.x; 21 | let workgroup_index = workgroup_id.x; 22 | 23 | var offset: u32 = 0; 24 | while (offset < 64) { 25 | if (local_index == 0) { 26 | outputBuffer.data[workgroup_index * 64] += inputBuffer.data[workgroup_index * 64 + offset]; 27 | } 28 | offset += 1; 29 | } 30 | workgroupBarrier(); 31 | 32 | offset = 64; 33 | while (offset < arrayLength(&inputBuffer.data)) { 34 | if (index == 0) { 35 | outputBuffer.data[0] += outputBuffer.data[offset]; 36 | } 37 | offset += 64; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Vincent-Therrien 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to 
permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /opencl/readme.md: -------------------------------------------------------------------------------- 1 | # OpenCL 2 | 3 | OpenCL is a specification by Khronos group intended to accelerate programs across heterogeneous 4 | platforms, like GPUs, but also CPUs, DSPs, and FPGAs. This project uses the Rust library `ocl` to 5 | compile the example, but you can use it with other languages. OpenCL uses the OpenCL C language for 6 | its kernels, which is based on C99. 7 | 8 | I have two bones to pick with OpenCL: 9 | 10 | - OpenCL is stuck in 2011. The last major release (3.0, 2020) defines OpenCL 1.2 (2011) as a 11 | mandatory baseline. Newer features implemented in OpenCL 2.X releases are optional! Consequently, 12 | OpenCL is behind competitors in terms of productivity. 13 | - OpenCL has been described as generally slower than CUDA, but you can minimize the difference by 14 | optimizing your kernels (https://ieeexplore.ieee.org/document/6047190). 15 | 16 | 17 | ## Build 18 | 19 | Run: 20 | 21 | ``` 22 | cargo run 23 | ``` 24 | 25 | Cargo and the OpenCL runtime must be installed on your system. 
To install cargo, refer to 26 | https://doc.rust-lang.org/cargo/getting-started/installation.html. For OpenCL, install the driver 27 | package of your GPU manufacturer. 28 | -------------------------------------------------------------------------------- /cpu/cpp/cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "task1.hpp" 5 | #include "task2.hpp" 6 | #include "task3.hpp" 7 | 8 | void usage() { 9 | std::cout << "Usage:" << std::endl; 10 | std::cout << "cpu " << std::endl; 11 | } 12 | 13 | int main(int argc, char *argv[]) { 14 | if (argc < 5) { 15 | usage(); 16 | return 1; 17 | } 18 | int task, n, n_threads, iterations; 19 | try { 20 | task = stoi(std::string(argv[1])); 21 | n = stoi(std::string(argv[2])); 22 | iterations = stoi(std::string(argv[3])); 23 | n_threads = stoi(std::string(argv[4])); 24 | } 25 | catch (...) { 26 | usage(); 27 | return 1; 28 | } 29 | double duration = 0.0; 30 | for (unsigned int i = 0; i < iterations; i++) { 31 | if (task == 1) { 32 | duration += task_1(n, n_threads); 33 | } 34 | else if (task == 2) { 35 | duration += task_2(n, n_threads); 36 | } 37 | else if (task == 3) { 38 | duration += task_3(n, n_threads); 39 | } 40 | else { 41 | usage(); 42 | break; 43 | } 44 | } 45 | std::cout << "Average duration (s): " << duration << std::endl; 46 | } 47 | -------------------------------------------------------------------------------- /opencl/src/main.rs: -------------------------------------------------------------------------------- 1 | /// Simple example with OpenCL. The function creates a vector and modifies its value on GPU. 
2 | 3 | use ocl::core; 4 | use ocl::ProQue; 5 | 6 | fn sum_reduce() -> ocl::Result<()> { 7 | const TEST_KERNEL_SOURCE: &str = include_str!("./test_kernel.cl"); 8 | let pro_que = ProQue::builder() 9 | .src(TEST_KERNEL_SOURCE) 10 | .dims(1 << 20) 11 | .build()?; 12 | 13 | let buffer = pro_que.create_buffer::()?; 14 | let vec_i = vec![1.0; buffer.len()]; 15 | let mut event = core::Event::null(); 16 | println!("The value at index [{}] is initially '{}'.", 60, vec_i[60]); 17 | unsafe { 18 | let _ = core::enqueue_write_buffer( 19 | &pro_que, 20 | &buffer, 21 | false, 22 | 0, 23 | &vec_i, 24 | None::, 25 | Some(&mut event), 26 | ); 27 | let kernel = pro_que.kernel_builder("add").arg(&buffer).build()?; 28 | kernel.enq()?; 29 | } 30 | 31 | let mut vec = vec![0.0; buffer.len()]; 32 | buffer.read(&mut vec).enq()?; 33 | println!("The value at index [{}] is now '{}'!", 60, vec[60]); 34 | Ok(()) 35 | } 36 | 37 | fn list_devices() { 38 | 39 | } 40 | 41 | fn main() { 42 | let _ = sum_reduce(); 43 | } 44 | -------------------------------------------------------------------------------- /AcceleratedKernels.jl/readme.md: -------------------------------------------------------------------------------- 1 | # AcceleratedKernels.jl 2 | 3 | AcceleratedKernels.jl (https://github.com/JuliaGPU/AcceleratedKernels.jl/tree/main) is a Julia 4 | package for parallel computation on CPU and GPUs. It supports multiple platforms. Visit the page 5 | https://juliagpu.github.io/AcceleratedKernels.jl/stable/ for explanations. 6 | 7 | I'd classify AcceleratedKernels.jl as a "high-level" interface similar to Triton because it lets 8 | developers mix CPU and GPU instructions in the same programming language instead of dividing a 9 | program into regular code and kernel code (e.g. C / GLSL). Julia is appreciated in scientific 10 | computing for its simplicity and performance, so support for GPU programming that further enhances 11 | its performance fits nicely with the language. 
12 | 13 | 14 | ## Run the Example 15 | 16 | - Install Julia if it's not on your system: https://julialang.org/install/. 17 | - Modify the requirements in the file `Project.toml` to target your GPU runtime. I used `CUDA`, but 18 | you might have to use `oneAPI` or `ROCm`, for instance. Refer to https://juliagpu.github.io/AcceleratedKernels.jl/stable/api/using_backends/ 19 | to view the supported backends. 20 | - Open a terminal in the directory `Project`. 21 | - Install the requirements: enter the REPL by running `julia` in a terminal, type `]`, and run 22 | `instantiate`. 23 | - Exit the Julia REPL. 24 | - Run `julia src/Project.jl`. 25 | -------------------------------------------------------------------------------- /directx/readme.md: -------------------------------------------------------------------------------- 1 | # DirectX 2 | 3 | DirectX is a collection of APIs developed by Microsoft to handle graphics, sound effects, and other 4 | multimedia tasks. This project uses Direct3D for graphics and DirectCompute for GPGPU. It works 5 | only on Windows `:(`. 6 | 7 | DirectX uses the HLSL shading language. 8 | 9 | Relevant links: 10 | 11 | - Rendering pipeline: https://learn.microsoft.com/en-us/windows/win32/direct3d12/direct3d-12-graphics 12 | - Compute shaders: https://learn.microsoft.com/en-us/windows/win32/direct3d11/direct3d-11-advanced-stages-compute-shader 13 | 14 | 15 | ## Graphics 16 | 17 | The directory `graphics` is a self-contained C++ project that uses Direct3D to open a window and 18 | display simple graphics. Direct3D uses HLSL (High-Level Shader Language) to write shaders. 19 | 20 | The following snippet shows how to build and run it. This only works on Windows! 21 | 22 | ``` 23 | cd graphics 24 | mkdir build 25 | cd build 26 | cmake .. 27 | cmake --build . 
--config Release 28 | Release\graphics.exe 29 | ``` 30 | 31 | 32 | ## Computing Example 33 | 34 | The directory `computing` is a self-contained C++ project that uses DirectCompute to accelerate 35 | parallel computations. 36 | 37 | The following snippet shows how to build and run it. This only works on Windows! 38 | 39 | ``` 40 | cd computing 41 | mkdir build 42 | cd build 43 | cmake .. 44 | cmake --build . --config Release 45 | copy ..\sum.hlsl Release\sum.hlsl 46 | cd Release 47 | computing.exe 48 | ``` 49 | -------------------------------------------------------------------------------- /cpu/rust/src/main.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | mod task1; 3 | mod task2; 4 | mod task3; 5 | 6 | /// Execute a multi-core accelerated program. 7 | #[derive(Parser)] 8 | #[command(version, about, long_about = None, arg_required_else_help = false)] 9 | struct Cli { 10 | /// Task to execute. One of `1`, `2`, or `3`. 11 | #[arg(long, required = false)] 12 | task: Option, 13 | 14 | /// Input dimension. 15 | #[arg(long, required = false)] 16 | n: Option, 17 | 18 | /// Number of iterations to compute the mean duration. 19 | #[arg(long, required = false)] 20 | iterations: Option, 21 | 22 | /// Number of threads. 23 | #[arg(long, required = false)] 24 | threads: Option, 25 | } 26 | 27 | /// Program entry point.
28 | fn main() { 29 | let cli = Cli::parse(); 30 | let task: u32 = match cli.task { 31 | Some(i) => i, 32 | None => 1, 33 | }; 34 | let n: u32 = match cli.n { 35 | Some(i) => i, 36 | None => 1000, 37 | }; 38 | let iterations: u32 = match cli.iterations { 39 | Some(i) => i, 40 | None => 1, 41 | }; 42 | let threads: u32 = match cli.threads { 43 | Some(i) => i, 44 | None => 1, 45 | }; 46 | let mut total_duration = 0.0; 47 | for _ in 0..iterations { 48 | let duration = match &task { 49 | 1 => task1::task_1(n, threads), 50 | 2 => task2::task_2(n, threads), 51 | 3 => task3::task_3(n, threads), 52 | _ => panic!("Invalid task."), 53 | }; 54 | total_duration += duration; 55 | } 56 | let average = total_duration / (iterations as f64); 57 | println!("Average duration (s): {}", average); 58 | } 59 | -------------------------------------------------------------------------------- /webgpu/readme.md: -------------------------------------------------------------------------------- 1 | # WebGPU 2 | 3 | WebGPU is a GPU API that can use Vulkan, DirectX, or Metal as its backend, making it truly 4 | multi-platform. It uses the WGSL shading language. 5 | 6 | WebGPU has some limitations. For instance, it only supports 32-bit values, so you have to find weird 7 | workarounds to use FP16 numbers. The project burn (https://github.com/tracel-ai/burn) uses WebGPU 8 | to accelerate computations because it is cross-platform, but it sometimes uses SPIR-V to perform 9 | some optimizations. But maybe the specification will evolve to support that more easily. 10 | 11 | Relevant links: 12 | 13 | - WebGPU specification: https://www.w3.org/TR/webgpu/ 14 | - wgpu (Rust library): https://github.com/gfx-rs/wgpu 15 | 16 | 17 | ## Graphics Example 18 | 19 | The directory `graphics` is a self-contained Rust program that uses the `wgpu` library to display 20 | simple graphics. It is based on the project https://github.com/sotrh/learn-wgpu, licensed under the 21 | MIT license. 
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <thread>
#include <vector>

#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 128 // Pad the result vector to avoid false sharing.
#endif

/// @brief Computations carried out by each thread: sum one contiguous chunk of `a`.
/// @param a Input values.
/// @param chunk Zero-based index of the chunk; the chunk covers
///        `[chunk * n_elements, chunk * n_elements + n_elements)`, clipped to the size of `a`.
/// @param n_elements Number of elements per chunk (expected to be non-negative).
/// @param results Accumulators; the chunk adds into `results[chunk * CACHE_LINE_SIZE]`, so the
///        caller must size it accordingly. Slots are spaced out so that threads do not write
///        to neighbouring elements.
void partial_sum(const std::vector<float> &a, int chunk, int n_elements, std::vector<float> &results)
{
    // Index arithmetic in size_t: `chunk * n_elements` could overflow an int for large inputs.
    const std::size_t size = a.size();
    const std::size_t count = static_cast<std::size_t>(std::max(n_elements, 0));
    const std::size_t start = std::min(size, static_cast<std::size_t>(chunk) * count);
    const std::size_t end = std::min(size, start + count);
    float &slot = results[static_cast<std::size_t>(chunk) * CACHE_LINE_SIZE];
    for (std::size_t i = start; i < end; ++i) {
        slot += a[i];
    }
}

/// @brief Dispatching: split `a` into `n_threads` chunks and sum each chunk on its own thread.
///
/// With a single thread the sum is computed on the calling thread, without spawning anything.
/// The total is computed but not returned: this function exists to be timed by `task_1`.
/// @param a Values to sum.
/// @param n_threads Number of worker threads; values below 1 are treated as 1.
void parallel_sum(std::vector<float> &a, int n_threads)
{
    n_threads = std::max(n_threads, 1); // A non-positive count would divide by zero below.
    std::vector<float> results;

    if (n_threads == 1) {
        results.resize(1);
        partial_sum(a, 0, static_cast<int>(a.size()), results);
        return;
    }

    results.resize(static_cast<std::size_t>(n_threads) * CACHE_LINE_SIZE);
    // Ceiling division so that the last chunk absorbs the remainder.
    const int n_elements = static_cast<int>((a.size() + n_threads - 1) / n_threads);

    std::vector<std::thread> threads;
    threads.reserve(static_cast<std::size_t>(n_threads));
    for (int t = 0; t < n_threads; t++) {
        threads.emplace_back(partial_sum, std::cref(a), t, n_elements, std::ref(results));
    }

    float result = 0.0f;
    for (int t = 0; t < n_threads; t++) {
        threads[t].join();
        result += results[static_cast<std::size_t>(t) * CACHE_LINE_SIZE];
    }
    static_cast<void>(result); // Only the elapsed time matters to the caller.
}

/// @brief Time a parallel sum over `n` pseudo-random values in [-1, 1].
/// @param n Number of values; negative values are treated as 0.
/// @param n_threads Number of threads handed to `parallel_sum`.
/// @return Elapsed time of the summation in seconds (input generation is not timed).
double task_1(int n, int n_threads)
{
    std::vector<float> a(static_cast<std::size_t>(std::max(n, 0)));
    for (float &value : a) {
        value = ((static_cast<float>(rand()) / RAND_MAX) - 0.5) * 2.0;
    }
    const auto begin = std::chrono::steady_clock::now();
    parallel_sum(a, n_threads);
    const auto end = std::chrono::steady_clock::now();
    return std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000000.0;
}
5 | 6 | Relevant links: 7 | 8 | - History of OpenGL: https://www.khronos.org/opengl/wiki/History_of_OpenGL 9 | - Rendering pipeline: https://www.khronos.org/opengl/wiki/Rendering_Pipeline_Overview 10 | - Compute shaders: https://www.khronos.org/opengl/wiki/Compute_Shader 11 | 12 | 13 | ## Graphics Example 14 | 15 | The directory `graphics` is a self-contained C++ project that uses OpenGL and additional libraries 16 | to open a window and display simple graphics. 17 | 18 | The following instructions show how to build and run it. This only works on Linux! Use WSL if 19 | necessary! 20 | 21 | 1. Download the GLAD library at https://glad.dav1d.de/. GLAD is used to load OpenGL functions. 22 | 2. Place the files `glad.h` and `glad.c` in the `graphics` directory. 23 | 3. Run the following commands: 24 | 25 | ``` 26 | cd graphics 27 | mkdir include && mv glad.h include/glad.h # Place the GLAD header in an include directory. 28 | mkdir src && mv glad.c src/glad.c # Place the GLAD source file in a source directory. 29 | sudo apt install -y libglew-dev 30 | sudo apt update && sudo apt install -y cmake g++ libglfw3-dev libgl1-mesa-dev xorg-dev 31 | mkdir build 32 | cd build 33 | cmake .. 34 | cmake --build . 35 | ./graphics 36 | ``` 37 | 38 | This project displays a 3D rotating triangle. You can modify the file `graphics/main.cpp` to 39 | understand how it works. 40 | 41 | 42 | ## Computing Example 43 | 44 | The directory `computing` is a self-contained C++ project that uses OpenGL to accelerate parallel 45 | computations. The program computes the sum of elements in an array through sum reduction. 46 | 47 | The following instructions show how to build and run it. This only works on Linux! Use WSL if 48 | necessary! 49 | 50 | Run the following commands: 51 | 52 | ``` 53 | cd computing 54 | sudo apt install -y libglew-dev 55 | sudo apt update && sudo apt install -y cmake g++ libglfw3-dev libgl1-mesa-dev xorg-dev 56 | mkdir build 57 | cd build 58 | cmake .. 59 | cmake --build .
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

#define N 1024 // Array size. Any size works: out-of-range threads load 0.
#define THREADS_PER_BLOCK 256 // Must be a power of 2 for the halving reduction in the kernel.

// Abort with a readable message when a CUDA runtime call fails. Without this, a failed
// allocation, copy, or launch would go unnoticed and the program would print a wrong sum.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": " << cudaGetErrorString(status) << '\n';
        std::exit(EXIT_FAILURE);
    }
}

// CUDA kernel to sum array elements using parallel reduction.
// Each block reduces THREADS_PER_BLOCK consecutive elements of `input` in shared memory and
// writes its partial sum to `output[blockIdx.x]`.
__global__ void sumReduction(const float *input, float *output) {
    __shared__ float sharedData[THREADS_PER_BLOCK];

    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Load elements into shared memory (threads past the end of the array contribute 0).
    sharedData[tid] = (idx < N) ? input[idx] : 0.0f;
    __syncthreads();

    // Perform parallel reduction: halve the number of active threads at every step.
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sharedData[tid] += sharedData[tid + s];
        }
        __syncthreads(); // Every thread must reach the barrier, so it stays outside the `if`.
    }

    // Store result from each block.
    if (tid == 0) {
        output[blockIdx.x] = sharedData[0];
    }
}

// Host function to launch kernel.
// `h_array` must point to N floats in host memory. Returns the sum of those N values; the
// per-block partial sums are added up on the CPU.
float sumArrayOnGPU(float *h_array) {
    float *d_array = nullptr;
    float *d_partialSums = nullptr;
    const int numBlocks = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    checkCuda(cudaMalloc(&d_array, N * sizeof(float)), "cudaMalloc(d_array)");
    checkCuda(cudaMalloc(&d_partialSums, numBlocks * sizeof(float)), "cudaMalloc(d_partialSums)");

    checkCuda(cudaMemcpy(d_array, h_array, N * sizeof(float), cudaMemcpyHostToDevice),
              "host-to-device copy");

    // Launch kernel. Launch failures are only visible through cudaGetLastError().
    sumReduction<<<numBlocks, THREADS_PER_BLOCK>>>(d_array, d_partialSums);
    checkCuda(cudaGetLastError(), "kernel launch");

    // Copy partial sums back to host (this copy also waits for the kernel to finish).
    std::vector<float> h_partialSums(numBlocks);
    checkCuda(cudaMemcpy(h_partialSums.data(), d_partialSums, numBlocks * sizeof(float),
                         cudaMemcpyDeviceToHost),
              "device-to-host copy");

    // Final sum on CPU.
    float totalSum = 0.0f;
    for (float partial : h_partialSums) {
        totalSum += partial;
    }

    // Cleanup.
    checkCuda(cudaFree(d_array), "cudaFree(d_array)");
    checkCuda(cudaFree(d_partialSums), "cudaFree(d_partialSums)");

    return totalSum;
}

// CPU code
int main() {
    float h_array[N];
    for (int i = 0; i < N; i++) {
        h_array[i] = 1.0f;
    }
    float sum = sumArrayOnGPU(h_array);
    std::cout << "Sum: " << sum << '\n';
    return 0;
}
In 36 | OpenGL, this takes around 145 lines (see the computing example of OpenGL in the repository). 37 | 38 | This is because OpenMP lets you flag parallel code and then uses those flags to program the GPU 39 | by itself. In OpenGL / OpenCL / ..., you have to not only write the GPU code, but also: 40 | 41 | - set up the computing pipeline and initialize resources, 42 | - check for errors, 43 | - manage memory transfers or mappings between host and device (i.e. GPU) memory, 44 | - clean up resource. 45 | 46 | Of course, that makes those APIs more flexible and I tend to see them more often used in large 47 | projects than OpenMP, but in some cases, the simplicity of OpenMP makes it a more logical choice. 48 | -------------------------------------------------------------------------------- /sycl/readme.md: -------------------------------------------------------------------------------- 1 | # SYCL 2 | 3 | SYCL is a Khronos group specification that lets you write hardware-accelerated instructions into 4 | regular C++ code instead of manually delegating these computations to kernels. One implementation of 5 | SYCL is oneAPI, by Intel. 6 | 7 | Relevant links: 8 | 9 | - Khronos group page on SYCL: https://www.khronos.org/sycl/ 10 | - Tutorial: https://sycl.tech/getting-started 11 | 12 | This project does not work! I obtain the error described at 13 | https://github.com/intel/llvm/issues/15910. In theory, it could compile if I reinstalled the whole 14 | compiler toolchain `¯\_(ツ)_/¯`. 15 | 16 | 17 | ## Build 18 | 19 | I completed the following steps to build the example, but it does not work yet. 
20 | 21 | Visit the page https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html 22 | to download and install oneAPI, then build the example as described below. 23 | 24 | On Linux, run the following commands: 25 | 26 | ``` 27 | source /opt/intel/oneapi/setvars.sh 28 | icpx -fsycl main.cpp -o main.exe 29 | ``` 30 | 31 | On Windows, run the following commands: 32 | 33 | ``` 34 | "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" 35 | icx -fsycl main.cpp -o main.exe 36 | ``` 37 | 38 | I get the following error message: 39 | 40 | ``` 41 | >icx -fsycl main.cpp -o main.exe 42 | Intel(R) oneAPI DPC++/C++ Compiler for applications running on Intel(R) 64, Version 2024.2.0 Build 20240602 43 | Copyright (C) 1985-2024 Intel Corporation. All rights reserved. 44 | 45 | In file included from gpu-arena\sycl\main.cpp:3: 46 | In file included from C:\Program Files (x86)\Intel\oneAPI\compiler\2024.2\include\sycl\CL\sycl.hpp:11: 47 | In file included from C:\Program Files (x86)\Intel\oneAPI\compiler\2024.2\include\sycl\sycl.hpp:25: 48 | In file included from C:\Program Files (x86)\Intel\oneAPI\compiler\2024.2\include\sycl\accessor.hpp:11: 49 | In file included from C:\Program Files (x86)\Intel\oneAPI\compiler\2024.2\include\sycl\access\access.hpp:14: 50 | In file included from C:\Program Files (x86)\Intel\oneAPI\compiler\2024.2\include\sycl\CL\__spirv\spirv_ops.hpp:25: 51 | In file included from C:\Program Files (x86)\Intel\oneAPI\compiler\2024.2\include\sycl\CL\__spirv\spirv_types.hpp:25: 52 | C:\Program Files (x86)\Intel\oneAPI\compiler\2024.2\include\sycl\detail\defines.hpp(15,10): fatal error: 'climits' file not found 53 | 15 | #include <climits> 54 | | ^~~~~~~~~ 55 | 1 error generated.
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <thread>
#include <vector>

#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 128 // Spacing between per-thread accumulators to avoid false sharing.
#endif

/// @brief Add `exp(a[i])` for one contiguous chunk of `a` into the chunk's accumulator.
/// @param a Input values.
/// @param chunk Zero-based chunk index; the chunk covers
///        `[chunk * n_elements, chunk * n_elements + n_elements)`, clipped to the size of `a`.
/// @param n_elements Number of elements per chunk (expected to be non-negative).
/// @param results Accumulators; the chunk adds into `results[chunk * CACHE_LINE_SIZE]`.
void exponential_partial_sum(const std::vector<float> &a, int chunk, int n_elements, std::vector<float> &results)
{
    const std::size_t size = a.size();
    const std::size_t count = static_cast<std::size_t>(std::max(n_elements, 0));
    const std::size_t start = std::min(size, static_cast<std::size_t>(chunk) * count);
    const std::size_t end = std::min(size, start + count);
    float &slot = results[static_cast<std::size_t>(chunk) * CACHE_LINE_SIZE];
    for (std::size_t i = start; i < end; ++i) {
        slot += std::exp(a[i]); // std::exp selects the float overload; plain exp() may use double.
    }
}

/// @brief Replace every element of one chunk of `a` by `exp(a[i]) / exponential_sum`.
/// @param a Values, modified in place.
/// @param chunk Zero-based chunk index (same chunking as `exponential_partial_sum`).
/// @param n_elements Number of elements per chunk (expected to be non-negative).
/// @param exponential_sum Sum of `exp(x)` over all elements of `a`.
void modify_elements(std::vector<float> &a, int chunk, int n_elements, float exponential_sum)
{
    const std::size_t size = a.size();
    const std::size_t count = static_cast<std::size_t>(std::max(n_elements, 0));
    const std::size_t start = std::min(size, static_cast<std::size_t>(chunk) * count);
    const std::size_t end = std::min(size, start + count);
    for (std::size_t i = start; i < end; ++i) {
        a[i] = std::exp(a[i]) / exponential_sum;
    }
}

/// @brief Apply the softmax function to `a` in place, using `n_threads` threads.
///
/// Two passes: first the sum of exponentials is computed chunk by chunk, then every element is
/// replaced by its exponential divided by that sum. With a single thread both passes run on the
/// calling thread without spawning anything, as `parallel_sum` in task 1 does.
///
/// No maximum is subtracted before exponentiating, so large inputs can overflow; `task_3` only
/// feeds values in [-1, 1].
/// @param a Values, modified in place.
/// @param n_threads Number of worker threads; values below 1 are treated as 1.
void parallel_softmax(std::vector<float> &a, int n_threads)
{
    n_threads = std::max(n_threads, 1); // A non-positive count would divide by zero below.
    std::vector<float> exponential_sums(static_cast<std::size_t>(n_threads) * CACHE_LINE_SIZE);
    // Ceiling division so that the last chunk absorbs the remainder.
    const int n_elements = static_cast<int>((a.size() + n_threads - 1) / n_threads);

    if (n_threads == 1) {
        exponential_partial_sum(a, 0, n_elements, exponential_sums);
        modify_elements(a, 0, n_elements, exponential_sums[0]);
        return;
    }

    std::vector<std::thread> threads;
    threads.reserve(static_cast<std::size_t>(n_threads));

    // Calculate the sum of e raised to the power of all elements.
    for (int t = 0; t < n_threads; t++) {
        threads.emplace_back(exponential_partial_sum, std::cref(a), t, n_elements, std::ref(exponential_sums));
    }

    float exponential_sum = 0.0f;
    for (int t = 0; t < n_threads; t++) {
        threads[t].join();
        exponential_sum += exponential_sums[static_cast<std::size_t>(t) * CACHE_LINE_SIZE];
    }

    // Modify elements in place. Every thread from the first pass has been joined, so the
    // vector of (finished) threads can be reused.
    threads.clear();
    for (int t = 0; t < n_threads; t++) {
        threads.emplace_back(modify_elements, std::ref(a), t, n_elements, exponential_sum);
    }

    for (std::thread &worker : threads) {
        worker.join();
    }
}

/// @brief Time a parallel softmax over `n` pseudo-random values in [-1, 1].
/// @param n Number of values; negative values are treated as 0.
/// @param n_threads Number of threads handed to `parallel_softmax`.
/// @return Elapsed time of the softmax in seconds (input generation is not timed).
double task_3(int n, int n_threads)
{
    std::vector<float> a(static_cast<std::size_t>(std::max(n, 0)));
    for (float &value : a) {
        value = ((static_cast<float>(rand()) / RAND_MAX) - 0.5) * 2.0;
    }
    const auto begin = std::chrono::steady_clock::now();
    parallel_softmax(a, n_threads);
    const auto end = std::chrono::steady_clock::now();
    return std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000000.0;
}
This project is based on the file 18 | https://github.com/Overv/VulkanTutorial/blob/main/code/15_hello_triangle.cpp from the repository 19 | `VulkanTutorial`, available at https://github.com/Overv/VulkanTutorial/tree/main and licensed under 20 | the licenses CC0-1.0 and CC-BY-SA-4.0. 21 | 22 | The following instructions show how to build and run it. This only works on Linux! Use WSL if 23 | necessary! 24 | 25 | ``` 26 | cd graphics 27 | sudo apt install vulkan-tools 28 | sudo apt install libvulkan-dev 29 | sudo apt install vulkan-validationlayers-dev spirv-tools glslang-tools 30 | sudo apt install libglm-dev 31 | mkdir build 32 | cd build 33 | cmake .. 34 | cmake --build . 35 | glslangValidator -V ../shader.vert -o vertices.spv 36 | glslangValidator -V ../shader.frag -o fragment.spv 37 | ./graphics 38 | ``` 39 | 40 | 41 | ## Computing Example 42 | 43 | Vulkan *can* be used for general-purpose computing, but I've found it really impractical. The file 44 | https://github.com/SaschaWillems/Vulkan/blob/master/examples/computeheadless/computeheadless.cpp is 45 | a minimal example that shows how to use Vulkan for computations. It's more than 600 lines long! 46 | 47 | Vulkan has never been intended to be beginner-friendly; it's made for high performance and aimed 48 | at motivated users. Fortunately, the project Kompute (https://kompute.cc/), which is built atop 49 | Vulkan, lets you easily create and run compute shaders. It can be used with a C++ or Python 50 | interface; this repository uses Python. It is an updated version of the example found at 51 | https://kompute.cc/. 52 | 53 | To try Kompute, run the following commands on Linux or WSL: 54 | 55 | ``` 56 | cd compute 57 | sudo apt install glslang-tools # Install glslangValidator 58 | pip install kp # Install kompute. Ideally, use a virtual environment!
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <thread>
#include <vector>

/// @brief Multiply one band of rows of `a` by `b` and accumulate the product into `c`.
///
/// The band is processed tile by tile (loop tiling) so that the parts of `a`, `b`, and `c` in
/// use stay in cache. Each chunk owns a disjoint set of rows of `c`, so chunks can run
/// concurrently without synchronisation.
///
/// All three matrices are assumed to be square with the same dimension as `a`.
/// @param a Left operand.
/// @param b Right operand.
/// @param c Result; the product is added to its current content.
/// @param chunk Zero-based index of the band; the band covers rows
///        `[chunk * n_elements, chunk * n_elements + n_elements)`, clipped to the matrix size.
/// @param n_elements Number of rows per band.
void partial_mul(
    std::vector<std::vector<float>> &a,
    std::vector<std::vector<float>> &b,
    std::vector<std::vector<float>> &c,
    int chunk,
    int n_elements
) {
    constexpr int TILE_SIZE = 32;
    const int N = static_cast<int>(a.size());

    // Rows owned by this chunk. Starting every chunk at row `chunk` and striding by the tile
    // size (ignoring `n_elements`) would make all threads cover almost the whole matrix: racy
    // writes to `c` and results scaled by the number of threads.
    const long long first = static_cast<long long>(chunk) * n_elements;
    const int row_begin = static_cast<int>(std::clamp<long long>(first, 0, N));
    const int row_end = static_cast<int>(std::clamp<long long>(first + n_elements, row_begin, N));

    for (int i = row_begin; i < row_end; i += TILE_SIZE) {
        const int i_end = std::min(i + TILE_SIZE, row_end);
        for (int j = 0; j < N; j += TILE_SIZE) {
            const int j_end = std::min(j + TILE_SIZE, N);
            for (int k = 0; k < N; k += TILE_SIZE) {
                const int k_end = std::min(k + TILE_SIZE, N);
                for (int ii = i; ii < i_end; ++ii) {
                    for (int jj = j; jj < j_end; ++jj) {
                        double sum = 0.0;
                        for (int kk = k; kk < k_end; ++kk) {
                            sum += a[ii][kk] * b[kk][jj];
                        }
                        c[ii][jj] += sum;
                    }
                }
            }
        }
    }
}

/// @brief Compute `c += a * b` by splitting the rows of the result across `n_threads` threads.
///
/// With a single thread the product is computed on the calling thread without spawning
/// anything, as `parallel_sum` in task 1 does.
/// @param a Left operand (square).
/// @param b Right operand (same dimension as `a`).
/// @param c Result (same dimension as `a`); expected to be zero-initialised by the caller.
/// @param n_threads Number of worker threads; values below 1 are treated as 1.
void parallel_mat_mul(
    std::vector<std::vector<float>> &a,
    std::vector<std::vector<float>> &b,
    std::vector<std::vector<float>> &c,
    int n_threads
) {
    n_threads = std::max(n_threads, 1); // A non-positive count would divide by zero below.
    // Ceiling division so that the last band absorbs the remainder.
    const int n_elements = static_cast<int>((a.size() + n_threads - 1) / n_threads);

    if (n_threads == 1) {
        partial_mul(a, b, c, 0, n_elements);
        return;
    }

    std::vector<std::thread> threads;
    threads.reserve(static_cast<std::size_t>(n_threads));
    for (int t = 0; t < n_threads; t++) {
        threads.emplace_back(partial_mul, std::ref(a), std::ref(b), std::ref(c), t, n_elements);
    }
    for (std::thread &worker : threads) {
        worker.join();
    }
}

/// @brief Time the multiplication of two `n` x `n` matrices of pseudo-random values in [-1, 1].
/// @param n Matrix dimension; negative values are treated as 0.
/// @param n_threads Number of threads handed to `parallel_mat_mul`.
/// @return Elapsed time of the multiplication in seconds (input generation is not timed).
double task_2(int n, int n_threads) {
    const std::size_t size = static_cast<std::size_t>(std::max(n, 0));
    std::vector<std::vector<float>> a(size, std::vector<float>(size)); // Input
    std::vector<std::vector<float>> b(size, std::vector<float>(size)); // Input
    std::vector<std::vector<float>> c(size, std::vector<float>(size)); // Result, starts at zero
    for (std::size_t i = 0; i < size; i++) {
        for (std::size_t j = 0; j < size; j++) {
            a[i][j] = ((static_cast<float>(rand()) / RAND_MAX) - 0.5) * 2.0;
            b[i][j] = ((static_cast<float>(rand()) / RAND_MAX) - 0.5) * 2.0;
        }
    }
    const auto begin = std::chrono::steady_clock::now();
    parallel_mat_mul(a, b, c, n_threads);
    const auto end = std::chrono::steady_clock::now();

    return std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000000.0;
}
31 | 32 | 33 | ### Usage 34 | 35 | Run: 36 | 37 | ``` 38 | # Linux 39 | ./build/Release/cpu <task> <n> <iterations> <threads> 40 | 41 | # On the OS that begins with the letter W 42 | build\Release\cpu.exe <task> <n> <iterations> <threads> 43 | ``` 44 | 45 | where: 46 | 47 | - `<task>` is an integer ranging from 1 to 3, inclusively, that designates the task to execute. 48 | - `<n>` is the dimension of the input data. 49 | - `<iterations>` is the number of times that the computation must be repeated. The execution time 50 | reported by the program is the arithmetic mean of the duration of all iterations. 51 | - `<threads>` is the number of threads. 52 | 53 | 54 | ## Rust Version 55 | 56 | This project uses the standard modules `std::thread` and `std::sync::Arc` to implement 57 | multithreading. 58 | 59 | 60 | ### Build 61 | 62 | - Install [cargo](https://doc.rust-lang.org/stable/cargo/) on your system. 63 | - Run the command `cargo build --release`. 64 | 65 | 66 | ### Usage 67 | 68 | Run the command: 69 | 70 | ``` 71 | # Linux 72 | ./target/release/cpu --task <task> --n <n> --iterations <iterations> --threads <threads> 73 | 74 | # Windows 75 | target\release\cpu --task <task> --n <n> --iterations <iterations> --threads <threads> 76 | ``` 77 | 78 | where 79 | 80 | - `<task>` is an integer ranging from 1 to 3, inclusively, that designates the task to execute. 81 | Default: 1 82 | - `<n>` is the dimension of the input data. Default: 1000 83 | - `<iterations>` is the number of times that the computation must be repeated. The execution time reported 84 | by the program is the arithmetic mean of the duration of all iterations. Default: 1 85 | - `<threads>` is the number of threads. Default: 1 86 | -------------------------------------------------------------------------------- /vulkan/compute/compute.py: -------------------------------------------------------------------------------- 1 | """Minimal kompute example. 2 | 3 | This script is an updated version of the example found at https://kompute.cc/.
def compile_source(source):
    """Compile GLSL compute shader source to SPIR-V with ``glslangValidator``.

    The source is written to ``tmp_kp_shader.comp`` in the current working
    directory and compiled to ``tmp_kp_shader.comp.spv``; both files are left
    in place afterwards.

    Args:
        source: GLSL source code of the compute shader.

    Returns:
        The compiled SPIR-V module as ``bytes``.

    Raises:
        FileNotFoundError: If ``glslangValidator`` is not installed.
        subprocess.CalledProcessError: If the shader fails to compile.
    """
    import subprocess  # Local import: only this function needs it.

    with open("tmp_kp_shader.comp", "w") as shader_file:
        shader_file.write(source)
    # `check=True` stops here on a compilation error; otherwise a stale .spv
    # file from a previous run could be returned as if it were the new shader.
    subprocess.run(
        ["glslangValidator", "-V", "tmp_kp_shader.comp", "-o", "tmp_kp_shader.comp.spv"],
        check=True,
    )
    with open("tmp_kp_shader.comp.spv", "rb") as spirv_file:
        return spirv_file.read()
Do other work asynchronously whilst GPU finishes 56 | 57 | sq.eval_await() 58 | 59 | # Prints the first output which is: { 4, 8, 12 } 60 | print(tensor_out_a.data()) 61 | # Prints the first output which is: { 10, 10, 10 } 62 | print(tensor_out_b.data()) 63 | 64 | 65 | if __name__ == "__main__": 66 | 67 | # Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header 68 | # files). This shader shows some of the main components including constants, buffers, etc 69 | shader = """ 70 | #version 450 71 | 72 | layout (local_size_x = 1) in; 73 | 74 | // The input tensors bind index is relative to index in parameter passed 75 | layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; }; 76 | layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; }; 77 | layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; }; 78 | layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; }; 79 | 80 | // Kompute supports push constants updated on dispatch 81 | layout(push_constant) uniform PushConstants { 82 | float val; 83 | } push_const; 84 | 85 | // Kompute also supports spec constants on initialization 86 | layout(constant_id = 0) const float const_one = 0; 87 | 88 | void main() { 89 | uint index = gl_GlobalInvocationID.x; 90 | out_a[index] += uint( in_a[index] * in_b[index] ); 91 | out_b[index] += uint( const_one * push_const.val ); 92 | } 93 | """ 94 | kompute(shader) 95 | -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Build and run multithreaded CPU programs. 3 | 4 | A multithreaded is not necessarily faster than a purely sequential one! 5 | Multithreading entails some overhead when splitting data and coordinating 6 | threads, so a single thread often outperforms multiple ones for small 7 | datasets. This script lets you measure which thread count is most efficient 8 | for a given dataset size. 
def cpu_cpp():
    """Build the C++ CPU program, time it for every size / thread count, and plot the result.

    The program is configured and built with CMake in ``cpu/cpp/build``. It is
    then run for dataset sizes ``MIN_N``, ``10 * MIN_N``, ... up to ``MAX_N``
    and for every thread count in ``durations``; the last whitespace-separated
    token it prints is taken as the mean duration in seconds.

    Raises:
        subprocess.CalledProcessError: If CMake or the benchmarked program fails.
        FileNotFoundError: If the built executable cannot be located.
    """
    source_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cpu", "cpp")
    build_dir = os.path.join(source_dir, "build")
    os.makedirs(build_dir, exist_ok=True)
    subprocess.check_call(["cmake", ".."], cwd=build_dir)
    subprocess.check_call(["cmake", "--build", ".", "--config", "Release"], cwd=build_dir)

    # Multi-config generators (e.g. Visual Studio) place the binary in
    # `build/Release`; single-config ones (Makefiles, Ninja) in `build`.
    # An absolute path is used because a bare name such as "cpu" is looked up
    # in PATH, not in the current directory, on Linux.
    candidates = [
        os.path.join(directory, name)
        for directory in (os.path.join(build_dir, "Release"), build_dir)
        for name in ("cpu.exe", "cpu")
    ]
    program = next((path for path in candidates if os.path.isfile(path)), None)
    if program is None:
        raise FileNotFoundError(f"Could not find the `cpu` executable in {build_dir}")

    x = []
    n = MIN_N
    while n <= MAX_N:
        print(f"N: {n}")
        x.append(n)
        for t in durations:
            print(f" t: {t} = ", end = "")
            completed = subprocess.run(
                [program, TASK, str(n), str(ITERATIONS), str(t)],
                capture_output = True,
                text = True,
                check = True  # Do not try to parse the output of a crashed run.
            )
            durations[t].append(float(completed.stdout.split()[-1]))
            print(durations[t][-1])
        n *= 10

    fig, ax = plt.subplots()
    for t in N_THREADS:
        ax.plot(x, durations[t], label=f"{t}")
    ax.set(xlabel='Number of data points', ylabel='Duration (s)')
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.legend()
    plt.show()
[] 76 | n = MIN_N 77 | while n <= MAX_N: 78 | print(f"N: {n}") 79 | x.append(n) 80 | for t in durations: 81 | print(f" t: {t} = ", end = "") 82 | program = [ 83 | "cargo", "run", "--", 84 | "--task", str(TASK), 85 | "--n", str(n), 86 | "--iterations", str(ITERATIONS), 87 | "--threads", str(t) 88 | ] 89 | v = subprocess.run( 90 | program, 91 | capture_output = True, 92 | text = True 93 | ).stdout 94 | durations[t].append(float(str(v).split(" ")[-1].rstrip())) 95 | print(durations[t][-1]) 96 | n *= 10 97 | 98 | fig, ax = plt.subplots() 99 | for t in N_THREADS: 100 | ax.plot(x, durations[t], label=f"{t}") 101 | ax.set(xlabel='Number of data points', ylabel='Duration (s)') 102 | ax.set_xscale('log') 103 | ax.set_yscale('log') 104 | ax.legend() 105 | plt.show() 106 | 107 | 108 | if BACKEND == "cpu-cpp" or BACKEND == "cpu": 109 | cpu_cpp() 110 | elif BACKEND == "cpu-rust": 111 | cpu_rust() 112 | -------------------------------------------------------------------------------- /triton/main.py: -------------------------------------------------------------------------------- 1 | """Calculate a sum reduction and a softmax function with Triton.""" 2 | 3 | import triton 4 | import triton.language as tl 5 | import torch 6 | 7 | 8 | @triton.jit 9 | def fold( 10 | x_ptr, 11 | middle: tl.constexpr, 12 | end: tl.constexpr, 13 | n_elements, 14 | BLOCK_SIZE: tl.constexpr 15 | ): 16 | """Perform one sum reduction pass. 17 | 18 | This function has to be separate from ``sum_reduction`` because the values 19 | used by the kernel change between iterations. Since Triton must compile the 20 | kernels before executing them, the parameters must be of type 21 | ``tl.constexpr``, not ``int``. 22 | 23 | Args: 24 | x_ptr: Pointer to the input tensor. Modified in place! 25 | middle: Half of the size of the current reduction. 26 | end: Size of the current reduction. 27 | n_elements: Total number of elements in the tensor. Can be of type 28 | ``int`` because it is used for masking only. 
def sum_reduction(x_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    """GPU-accelerated sum reduction.

    This function adds up the elements in an array in pairs. For instance:

        Initial array:    0 1 2 3 4 5 6 7
        First reduction:  4 6 8 10         <- (0 + 4) (1 + 5) (2 + 6) (3 + 7)
        Second reduction: 12 16
        Final reduction:  28

    Each program launched by ``fold`` reduces its own block of ``BLOCK_SIZE``
    elements into the first element of that block, so ``x_ptr[0]`` holds the
    total only when ``BLOCK_SIZE >= n_elements``.

    Refer to the file https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
    for a detailed discussion!

    Args:
        x_ptr: Pointer to the input tensor to sum. Modified in place!
        n_elements: Total number of elements to sum.
        BLOCK_SIZE: Number of elements processed by each program.
            NOTE(review): presumably has to be a power of two, since it is
            halved down to 1 and used as a ``tl.arange`` bound - confirm.
    """
    # One program per block of BLOCK_SIZE elements. Launching `n_elements`
    # programs, as was done before, only adds programs whose offsets are
    # entirely masked out. The grid does not change between passes.
    grid = (triton.cdiv(n_elements, BLOCK_SIZE), )
    stride = BLOCK_SIZE // 2
    while stride > 0:
        fold[grid](x_ptr, stride, stride * 2, n_elements, BLOCK_SIZE)
        stride //= 2
84 | X: Pointer to the input tensor. 85 | stride_xm: Stride in axis 0. 86 | stride_xn: Stride in axis 1. 87 | N: Number of columns. 88 | """ 89 | m = tl.program_id(0) 90 | BLOCK_SIZE: tl.constexpr = 1024 91 | n = tl.arange(0, BLOCK_SIZE) 92 | X = X + m * stride_xm + n * stride_xn 93 | x = tl.load(X, mask=n < N, other=-float('inf')) 94 | z = x - tl.max(x, axis=0) 95 | num = tl.exp(z) 96 | denom = tl.sum(num, axis=0) 97 | y = num / denom 98 | Y = Y + m * stride_ym + n * stride_yn 99 | tl.store(Y, y, mask=n < N) 100 | 101 | 102 | print("Trying out a Softmax function with Triton!") 103 | X = torch.normal(0, 1, size=(4, 4), device='cuda') 104 | Y = torch.empty_like(X) 105 | grid = (X.shape[0], ) 106 | softmax[grid](Y, Y.stride(0), Y.stride(1), 107 | X, X.stride(0), X.stride(1), 108 | X.shape[1]) 109 | print(f"Input values:\n{X}") 110 | print(f"Output values:\n{Y}\n") 111 | -------------------------------------------------------------------------------- /directx/computing/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #pragma comment(lib, "d3d11.lib") 7 | #pragma comment(lib, "d3dcompiler.lib") 8 | 9 | #define GROUP_SIZE 256 10 | 11 | // Error checking macro 12 | #define HR_CHECK(hr) if (FAILED(hr)) { std::cerr << "D3D Error at line " << __LINE__ << std::endl; return -1; } 13 | 14 | int main() { 15 | // Sample data (must be a multiple of GROUP_SIZE for simplicity) 16 | const int dataSize = 1024; 17 | std::vector data(dataSize, 1.0f); // Fill with ones for easy sum verification 18 | int numGroups = dataSize / GROUP_SIZE; 19 | 20 | // Step 1: Initialize Direct3D 21 | ID3D11Device* device = nullptr; 22 | ID3D11DeviceContext* context = nullptr; 23 | D3D_FEATURE_LEVEL featureLevel; 24 | D3D11CreateDevice(nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0, nullptr, 0, D3D11_SDK_VERSION, &device, &featureLevel, &context); 25 | 26 | // Step 2: Compile Compute Shader 27 | ID3DBlob* 
csBlob = nullptr; 28 | ID3DBlob* errorBlob = nullptr; 29 | HRESULT hr = D3DCompileFromFile(L"sum.hlsl", nullptr, nullptr, "main", "cs_5_0", 0, 0, &csBlob, &errorBlob); 30 | if (FAILED(hr)) { 31 | if (errorBlob) std::cout << "Error compiling the shader: " << (char*)errorBlob->GetBufferPointer(); 32 | return -1; 33 | } 34 | 35 | // Step 3: Create Compute Shader 36 | ID3D11ComputeShader* computeShader = nullptr; 37 | HR_CHECK(device->CreateComputeShader(csBlob->GetBufferPointer(), csBlob->GetBufferSize(), nullptr, &computeShader)); 38 | csBlob->Release(); 39 | 40 | // Step 4: Create Buffers (Input & Output) 41 | D3D11_BUFFER_DESC bufferDesc = {}; 42 | bufferDesc.Usage = D3D11_USAGE_DEFAULT; 43 | bufferDesc.ByteWidth = dataSize * sizeof(float); 44 | bufferDesc.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE; 45 | bufferDesc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; 46 | bufferDesc.StructureByteStride = sizeof(float); 47 | 48 | D3D11_SUBRESOURCE_DATA initData = { data.data(), 0, 0 }; 49 | ID3D11Buffer* inputBuffer = nullptr; 50 | HR_CHECK(device->CreateBuffer(&bufferDesc, &initData, &inputBuffer)); 51 | 52 | bufferDesc.ByteWidth = numGroups * sizeof(float); 53 | ID3D11Buffer* outputBuffer = nullptr; 54 | HR_CHECK(device->CreateBuffer(&bufferDesc, nullptr, &outputBuffer)); 55 | 56 | // Step 5: Create Unordered Access Views (UAV) 57 | D3D11_UNORDERED_ACCESS_VIEW_DESC uavDesc = {}; 58 | uavDesc.Format = DXGI_FORMAT_UNKNOWN; 59 | uavDesc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER; 60 | uavDesc.Buffer.NumElements = dataSize; 61 | ID3D11UnorderedAccessView* inputUAV = nullptr; 62 | HR_CHECK(device->CreateUnorderedAccessView(inputBuffer, &uavDesc, &inputUAV)); 63 | 64 | uavDesc.Buffer.NumElements = numGroups; 65 | ID3D11UnorderedAccessView* outputUAV = nullptr; 66 | HR_CHECK(device->CreateUnorderedAccessView(outputBuffer, &uavDesc, &outputUAV)); 67 | 68 | // Step 6: Run the Compute Shader 69 | context->CSSetShader(computeShader, nullptr, 0); 70 | 
context->CSSetUnorderedAccessViews(0, 1, &inputUAV, nullptr); 71 | context->CSSetUnorderedAccessViews(1, 1, &outputUAV, nullptr); 72 | context->Dispatch(numGroups, 1, 1); 73 | 74 | // Step 7: Read Back Result 75 | std::vector partialSums(numGroups); 76 | D3D11_BUFFER_DESC readbackDesc = {}; 77 | readbackDesc.Usage = D3D11_USAGE_STAGING; 78 | readbackDesc.ByteWidth = numGroups * sizeof(float); 79 | readbackDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; 80 | readbackDesc.StructureByteStride = sizeof(float); 81 | ID3D11Buffer* readbackBuffer = nullptr; 82 | HR_CHECK(device->CreateBuffer(&readbackDesc, nullptr, &readbackBuffer)); 83 | 84 | context->CopyResource(readbackBuffer, outputBuffer); 85 | D3D11_MAPPED_SUBRESOURCE mappedResource; 86 | HR_CHECK(context->Map(readbackBuffer, 0, D3D11_MAP_READ, 0, &mappedResource)); 87 | memcpy(partialSums.data(), mappedResource.pData, numGroups * sizeof(float)); 88 | context->Unmap(readbackBuffer, 0); 89 | 90 | // Step 8: Compute Final Sum on CPU 91 | float finalSum = 0.0f; 92 | for (float val : partialSums) { 93 | finalSum += val; 94 | } 95 | 96 | std::cout << "Sum of elements: " << finalSum << std::endl; // Expected: 1024 97 | 98 | // Cleanup 99 | inputUAV->Release(); 100 | outputUAV->Release(); 101 | inputBuffer->Release(); 102 | outputBuffer->Release(); 103 | readbackBuffer->Release(); 104 | computeShader->Release(); 105 | context->Release(); 106 | device->Release(); 107 | 108 | return 0; 109 | } 110 | -------------------------------------------------------------------------------- /opengl/computing/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define ARRAY_SIZE 10240000 // 1024 * 10000 8 | 9 | // Compute shader source (performs parallel reduction sum) 10 | // Refer to https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf for a discussion 11 | // of sum reduction on GPU if you have trouble 
understanding the code. 12 | const char* computeShaderSource = R"( 13 | #version 310 es // This is the only version that works on my system! 14 | #define ARRAY_SIZE 1024 15 | #define HALF_ARRAY_SIZE 512u 16 | 17 | layout(std430, binding = 0) buffer InputBuffer { 18 | float inputData[]; 19 | }; 20 | 21 | layout(std430, binding = 1) buffer OutputBuffer { 22 | float outputData[]; 23 | }; 24 | 25 | shared float groupData[ARRAY_SIZE]; 26 | 27 | layout(local_size_x = ARRAY_SIZE) in; 28 | 29 | void main() { 30 | uint index = gl_GlobalInvocationID.x; 31 | uint local_index = gl_LocalInvocationID.x; 32 | uint groupID = gl_WorkGroupID.x; 33 | 34 | groupData[local_index] = inputData[index]; 35 | barrier(); 36 | 37 | for (uint s = HALF_ARRAY_SIZE; s != 0u; s >>= 1) { 38 | if (local_index < s) { 39 | groupData[local_index] += groupData[local_index + s]; 40 | } 41 | barrier(); 42 | } 43 | 44 | if (local_index == 0u) { 45 | outputData[groupID] = groupData[0]; 46 | } 47 | } 48 | )"; 49 | 50 | GLuint createComputeShader(const char* source) { 51 | GLuint shader = glCreateShader(GL_COMPUTE_SHADER); 52 | glShaderSource(shader, 1, &source, nullptr); 53 | glCompileShader(shader); 54 | 55 | // Check for errors 56 | GLint success; 57 | glGetShaderiv(shader, GL_COMPILE_STATUS, &success); 58 | if (!success) { 59 | char log[512]; 60 | glGetShaderInfoLog(shader, 512, nullptr, log); 61 | std::cerr << "Compute Shader Compilation Error:\n" << log << std::endl; 62 | } 63 | 64 | GLuint program = glCreateProgram(); 65 | glAttachShader(program, shader); 66 | glLinkProgram(program); 67 | 68 | glDeleteShader(shader); 69 | return program; 70 | } 71 | 72 | int main() 73 | { 74 | // Initialize GLFW (no window needed) 75 | if (!glfwInit()) { 76 | std::cerr << "Failed to initialize GLFW\n"; 77 | return -1; 78 | } 79 | glfwWindowHint(GLFW_VISIBLE, GLFW_FALSE); // Hide window 80 | GLFWwindow* window = glfwCreateWindow(100, 100, "Compute Shader", nullptr, nullptr); 81 | glfwMakeContextCurrent(window); 82 | 83 
| // Initialize GLEW 84 | if (glewInit() != GLEW_OK) { 85 | std::cerr << "Failed to initialize GLEW\n"; 86 | return -1; 87 | } 88 | 89 | // Input data 90 | std::vector inputData(ARRAY_SIZE, 1.0f); 91 | size_t dataSize = inputData.size() * sizeof(float); 92 | 93 | GLuint inputBuffer, outputBuffer; 94 | glGenBuffers(1, &inputBuffer); 95 | glGenBuffers(1, &outputBuffer); 96 | 97 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, inputBuffer); 98 | glBufferData(GL_SHADER_STORAGE_BUFFER, dataSize, inputData.data(), GL_DYNAMIC_COPY); 99 | 100 | // The reduced sum is 1024 times smaller than the input. 101 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, outputBuffer); 102 | glBufferData(GL_SHADER_STORAGE_BUFFER, dataSize / 1024, nullptr, GL_DYNAMIC_COPY); 103 | 104 | // Create and run compute shader 105 | GLuint computeProgram = createComputeShader(computeShaderSource); 106 | glUseProgram(computeProgram); 107 | 108 | auto begin = std::chrono::steady_clock::now(); 109 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, inputBuffer); 110 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, outputBuffer); 111 | 112 | int numWorkgroups = ARRAY_SIZE / 1024; 113 | numWorkgroups = numWorkgroups ? numWorkgroups : 1; 114 | 115 | glUseProgram(computeProgram); 116 | glDispatchCompute(numWorkgroups, 1, 1); 117 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 118 | 119 | // Retrieve result 120 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, outputBuffer); 121 | auto end = std::chrono::steady_clock::now(); 122 | float* mappedData = (float*)glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_READ_ONLY); 123 | 124 | // Aggregate the reduced sums. If the input array is bigger than 1024, we have to add up 125 | // multiple values because the array is not totally reduced. We could do another pass of the 126 | // shader to reduce the array again... or we can just use a for loop to add up the sums :). 
127 | float total = 0; 128 | for (unsigned int i = 0; i < numWorkgroups; i++) { 129 | total += mappedData[i]; 130 | } 131 | glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); 132 | 133 | double duration = std::chrono::duration_cast(end - begin).count() / 1000000.0; 134 | 135 | std::cout << "Result: " << total << " (expected " << ARRAY_SIZE << ")." << std::endl; 136 | std::cout << "Duration: " << duration << " s." << std::endl; 137 | 138 | // Cleanup 139 | glDeleteProgram(computeProgram); 140 | glDeleteBuffers(1, &inputBuffer); 141 | glDeleteBuffers(1, &outputBuffer); 142 | glfwDestroyWindow(window); 143 | glfwTerminate(); 144 | return 0; 145 | } 146 | -------------------------------------------------------------------------------- /opengl/graphics/main.cpp: -------------------------------------------------------------------------------- 1 | #include "glad.h" // Load OpenGL functions 2 | #include // Window management 3 | #include 4 | #include 5 | 6 | // Vertex shader source code. This modifies the position of the vertices. 7 | const char* vertexShaderSource = R"( 8 | #version 330 core 9 | 10 | layout(location = 0) in vec3 inPos; 11 | layout(location = 1) in vec4 inColor; 12 | 13 | out vec4 fragColor; 14 | 15 | uniform float angle; 16 | 17 | void main() { 18 | mat2 rotation = mat2( 19 | cos(angle), -sin(angle), 20 | sin(angle), cos(angle) 21 | ); 22 | vec2 rotatedPos = rotation * inPos.xz; 23 | gl_Position = vec4(rotatedPos.x, inPos.y, rotatedPos.y, 1.0); 24 | fragColor = inColor; 25 | } 26 | )"; 27 | 28 | // Fragment (i.e. pixel most of the time) shader source code. This determines the final colors. 
29 | const char* fragmentShaderSource = R"( 30 | #version 330 core 31 | 32 | in vec4 fragColor; 33 | out vec4 outColor; 34 | 35 | void main() { 36 | float levels = 10.0; 37 | vec3 quantizedColor = 38 | floor(fragColor.rgb * levels) 39 | / (levels - 1.0); 40 | outColor = vec4(quantizedColor, 1.0); 41 | } 42 | )"; 43 | 44 | void framebuffer_size_callback(GLFWwindow* window, int width, int height) { 45 | glViewport(0, 0, width, height); 46 | } 47 | 48 | int main() { 49 | // Initialize GLFW 50 | if (!glfwInit()) { 51 | std::cerr << "Failed to initialize GLFW\n"; 52 | return -1; 53 | } 54 | 55 | // Set OpenGL version (3.3 Core Profile) 56 | glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3); 57 | glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3); 58 | glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); 59 | 60 | // Create a window 61 | GLFWwindow* window = glfwCreateWindow(800, 600, "OpenGL!", NULL, NULL); 62 | if (!window) { 63 | std::cerr << "Failed to create GLFW window\n"; 64 | glfwTerminate(); 65 | return -1; 66 | } 67 | 68 | glfwMakeContextCurrent(window); 69 | 70 | // Load OpenGL functions using GLAD 71 | if (!gladLoadGLLoader((GLADloadproc)glfwGetProcAddress)) { 72 | std::cerr << "Failed to initialize GLAD\n"; 73 | return -1; 74 | } 75 | 76 | glViewport(0, 0, 800, 600); 77 | glfwSetFramebufferSizeCallback(window, framebuffer_size_callback); 78 | 79 | // Vertex Data 80 | float vertices[] = { 81 | // Positions // Colors 82 | 0.0f, 0.5f, 0.0f, 1.0f, 0.0f, 0.0f, // Top (Red) 83 | -0.5f, -0.5f, 0.0f, 0.0f, 1.0f, 0.0f, // Left (Green) 84 | 0.5f, -0.5f, 0.0f, 0.0f, 0.0f, 1.0f // Right (Blue) 85 | }; 86 | 87 | // Create a Vertex Buffer Object (VBO) and Vertex Array Object (VAO) 88 | unsigned int VBO, VAO; 89 | glGenVertexArrays(1, &VAO); 90 | glGenBuffers(1, &VBO); 91 | 92 | // Bind the VAO 93 | glBindVertexArray(VAO); 94 | 95 | // Bind and fill VBO 96 | glBindBuffer(GL_ARRAY_BUFFER, VBO); 97 | glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW); 
98 | 99 | // Define vertex attributes (location = 0) 100 | glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 6 * sizeof(float), (void*)0); 101 | glEnableVertexAttribArray(0); 102 | 103 | // Color attribute (location = 1) 104 | glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, 6 * sizeof(float), (void*)(3 * sizeof(float))); 105 | glEnableVertexAttribArray(1); 106 | 107 | // Create and compile Vertex Shader 108 | unsigned int vertexShader = glCreateShader(GL_VERTEX_SHADER); 109 | glShaderSource(vertexShader, 1, &vertexShaderSource, NULL); 110 | glCompileShader(vertexShader); 111 | 112 | // Create and compile Fragment Shader 113 | unsigned int fragmentShader = glCreateShader(GL_FRAGMENT_SHADER); 114 | glShaderSource(fragmentShader, 1, &fragmentShaderSource, NULL); 115 | glCompileShader(fragmentShader); 116 | 117 | // Link shaders into a Shader Program 118 | unsigned int shaderProgram = glCreateProgram(); 119 | glAttachShader(shaderProgram, vertexShader); 120 | glAttachShader(shaderProgram, fragmentShader); 121 | glLinkProgram(shaderProgram); 122 | 123 | // Cleanup 124 | glDeleteShader(vertexShader); 125 | glDeleteShader(fragmentShader); 126 | 127 | // Render Loop 128 | float rotationAngle = 0.0f; 129 | auto now = std::chrono::steady_clock::now(); 130 | while (!glfwWindowShouldClose(window)) { 131 | glClear(GL_COLOR_BUFFER_BIT); // Clear screen 132 | 133 | int angleLocation = glGetUniformLocation(shaderProgram, "angle"); 134 | glUniform1f(angleLocation, rotationAngle); 135 | glUseProgram(shaderProgram); 136 | glBindVertexArray(VAO); 137 | glDrawArrays(GL_TRIANGLES, 0, 3); // Draw triangle 138 | 139 | glfwSwapBuffers(window); 140 | glfwPollEvents(); 141 | 142 | auto end = std::chrono::steady_clock::now(); 143 | auto duration = std::chrono::duration_cast(end - now).count() / 1000.0; 144 | rotationAngle += duration * 0.5; 145 | glBindBuffer(GL_ARRAY_BUFFER, VBO); 146 | glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW); 147 | now = end; 148 | } 149 | 
150 | // Cleanup 151 | glDeleteVertexArrays(1, &VAO); 152 | glDeleteBuffers(1, &VBO); 153 | glDeleteProgram(shaderProgram); 154 | glfwDestroyWindow(window); 155 | glfwTerminate(); 156 | 157 | return 0; 158 | } 159 | -------------------------------------------------------------------------------- /webgpu/compute/src/main.rs: -------------------------------------------------------------------------------- 1 | // Based on the file https://github.com/googlefonts/compute-shader-101, licensed under the MIT 2 | // license 3 | 4 | use std::time::Instant; 5 | use wgpu::{util::DeviceExt, PipelineCompilationOptions}; 6 | use bytemuck; 7 | 8 | const N_ELEMENTS: u32 = 1024; 9 | 10 | async fn run() { 11 | let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor::default()); 12 | let adapter = instance.request_adapter(&Default::default()).await.unwrap(); 13 | let features = adapter.features(); 14 | let (device, queue) = adapter 15 | .request_device( 16 | &wgpu::DeviceDescriptor::default(), 17 | None, 18 | ) 19 | .await 20 | .unwrap(); 21 | let query_set = if false && features.contains(wgpu::Features::TIMESTAMP_QUERY) { 22 | Some(device.create_query_set(&wgpu::QuerySetDescriptor { 23 | count: 2, 24 | ty: wgpu::QueryType::Timestamp, 25 | label: None, 26 | })) 27 | } else { 28 | None 29 | }; 30 | 31 | let start_instant = Instant::now(); 32 | let cs_module = device.create_shader_module(wgpu::ShaderModuleDescriptor { 33 | label: None, 34 | source: wgpu::ShaderSource::Wgsl(include_str!("shader.wgsl").into()), 35 | }); 36 | println!("shader compilation {:?}", start_instant.elapsed()); 37 | let input_v = (0..N_ELEMENTS).map(|i| i as f32).collect::>(); 38 | let input: &[u8] = bytemuck::cast_slice(&input_v); 39 | let input_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { 40 | label: Some("Input"), 41 | contents: input, 42 | usage: wgpu::BufferUsages::STORAGE 43 | | wgpu::BufferUsages::COPY_SRC, 44 | }); 45 | let storage_buffer = 
device.create_buffer(&wgpu::BufferDescriptor { 46 | label: Some("Output"), 47 | size: input.len() as u64, 48 | usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC, 49 | mapped_at_creation: false, 50 | }); 51 | let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor { 52 | label: Some("Staging Buffer"), 53 | size: input.len() as u64, 54 | usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, 55 | mapped_at_creation: false, 56 | }); 57 | let query_buf = device.create_buffer(&wgpu::BufferDescriptor { 58 | label: None, 59 | size: 16, 60 | usage: wgpu::BufferUsages::COPY_SRC | wgpu::BufferUsages::QUERY_RESOLVE, 61 | mapped_at_creation: false, 62 | }); 63 | let query_staging_buf = device.create_buffer(&wgpu::BufferDescriptor { 64 | label: None, 65 | size: 16, 66 | usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, 67 | mapped_at_creation: false, 68 | }); 69 | 70 | let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { 71 | label: None, 72 | entries: &[ 73 | // Input 74 | wgpu::BindGroupLayoutEntry { 75 | binding: 0, 76 | visibility: wgpu::ShaderStages::COMPUTE, 77 | ty: wgpu::BindingType::Buffer { 78 | ty: wgpu::BufferBindingType::Storage { read_only: true }, 79 | has_dynamic_offset: false, 80 | min_binding_size: None, 81 | }, 82 | count: None, 83 | }, 84 | // Output 85 | wgpu::BindGroupLayoutEntry { 86 | binding: 1, 87 | visibility: wgpu::ShaderStages::COMPUTE, 88 | ty: wgpu::BindingType::Buffer { 89 | ty: wgpu::BufferBindingType::Storage { read_only: false }, 90 | has_dynamic_offset: false, 91 | min_binding_size: None, 92 | }, 93 | count: None, 94 | } 95 | ], 96 | }); 97 | let compute_pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { 98 | label: None, 99 | bind_group_layouts: &[&bind_group_layout], 100 | push_constant_ranges: &[], 101 | }); 102 | let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { 103 | label: None, 104 
| layout: Some(&compute_pipeline_layout), 105 | module: &cs_module, 106 | entry_point: Some("main"), 107 | cache: None, 108 | compilation_options: PipelineCompilationOptions::default(), 109 | }); 110 | 111 | let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { 112 | label: None, 113 | layout: &bind_group_layout, 114 | entries: &[ 115 | // Input 116 | wgpu::BindGroupEntry { 117 | binding: 0, 118 | resource: input_buf.as_entire_binding(), 119 | }, 120 | // Output 121 | wgpu::BindGroupEntry { 122 | binding: 1, 123 | resource: storage_buffer.as_entire_binding(), 124 | } 125 | ], 126 | }); 127 | 128 | let mut encoder = device.create_command_encoder(&Default::default()); 129 | if let Some(query_set) = &query_set { 130 | encoder.write_timestamp(query_set, 0); 131 | } 132 | { 133 | let mut cpass = encoder.begin_compute_pass(&Default::default()); 134 | cpass.set_pipeline(&pipeline); 135 | cpass.set_bind_group(0, &bind_group, &[]); 136 | cpass.dispatch_workgroups(input_v.len() as u32, 1, 1); 137 | } 138 | if let Some(query_set) = &query_set { 139 | encoder.write_timestamp(query_set, 1); 140 | } 141 | if let Some(query_set) = &query_set { 142 | encoder.resolve_query_set(query_set, 0..2, &query_buf, 0); 143 | } 144 | encoder.copy_buffer_to_buffer(&query_buf, 0, &query_staging_buf, 0, 16); 145 | encoder.copy_buffer_to_buffer(&storage_buffer, 0, &staging_buffer, 0, input.len() as u64); 146 | queue.submit(Some(encoder.finish())); 147 | 148 | let buf_slice = staging_buffer.slice(..); 149 | let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel(); 150 | 151 | buf_slice.map_async(wgpu::MapMode::Read, move |result| { 152 | sender.send(result).unwrap(); 153 | }); 154 | 155 | device.poll(wgpu::Maintain::Wait); 156 | receiver.receive().await.unwrap().unwrap(); 157 | 158 | let data = buf_slice.get_mapped_range(); 159 | let result: &[f32] = bytemuck::cast_slice(&data); 160 | println!("Sum of the elements in the array: {:?}", result[0]); 161 | 162 | 
drop(data); 163 | staging_buffer.unmap(); 164 | } 165 | 166 | fn main() { 167 | pollster::block_on(run()); 168 | } 169 | -------------------------------------------------------------------------------- /directx/graphics/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // Link required libraries 8 | #pragma comment(lib, "d3d11.lib") 9 | #pragma comment(lib, "d3dcompiler.lib") 10 | 11 | // Direct3D globals 12 | IDXGISwapChain* swapChain = nullptr; 13 | ID3D11Device* device = nullptr; 14 | ID3D11DeviceContext* deviceContext = nullptr; 15 | ID3D11RenderTargetView* renderTargetView = nullptr; 16 | ID3D11VertexShader* vertexShader = nullptr; 17 | ID3D11PixelShader* pixelShader = nullptr; 18 | ID3D11InputLayout* inputLayout = nullptr; 19 | ID3D11Buffer* vertexBuffer = nullptr; 20 | 21 | // Vertex structure 22 | struct Vertex { 23 | DirectX::XMFLOAT3 position; 24 | DirectX::XMFLOAT4 color; 25 | }; 26 | 27 | // Shader source 28 | const char* vertexShaderSource = R"( 29 | struct VS_INPUT { 30 | float3 pos : POSITION; 31 | float4 color : COLOR; 32 | }; 33 | 34 | struct PS_INPUT { 35 | float4 pos : SV_POSITION; 36 | float4 color : COLOR; 37 | }; 38 | 39 | PS_INPUT main(VS_INPUT input) { 40 | PS_INPUT output; 41 | output.pos = float4(input.pos, 1.0); 42 | output.color = input.color; 43 | return output; 44 | } 45 | )"; 46 | 47 | const char* pixelShaderSource = R"( 48 | struct PS_INPUT { 49 | float4 pos : SV_POSITION; 50 | float4 color : COLOR; 51 | }; 52 | 53 | float4 main(PS_INPUT input) : SV_TARGET { 54 | return input.color; 55 | } 56 | )"; 57 | 58 | // Window Procedure 59 | LRESULT CALLBACK WindowProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam) { 60 | if (uMsg == WM_DESTROY) { 61 | PostQuitMessage(0); 62 | return 0; 63 | } 64 | return DefWindowProc(hwnd, uMsg, wParam, lParam); 65 | } 66 | 67 | // Compile shader 68 | ID3DBlob* CompileShader(const char* 
source, const char* entryPoint, const char* target) { 69 | ID3DBlob* shaderBlob = nullptr; 70 | ID3DBlob* errorBlob = nullptr; 71 | if (FAILED(D3DCompile(source, strlen(source), nullptr, nullptr, nullptr, entryPoint, target, 0, 0, &shaderBlob, &errorBlob))) { 72 | if (errorBlob) { 73 | std::cerr << (char*)errorBlob->GetBufferPointer() << std::endl; 74 | errorBlob->Release(); 75 | } 76 | return nullptr; 77 | } 78 | return shaderBlob; 79 | } 80 | 81 | // Initialize Direct3D 82 | bool InitD3D(HWND hwnd) { 83 | DXGI_SWAP_CHAIN_DESC scd = {}; 84 | scd.BufferCount = 1; 85 | scd.BufferDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; 86 | scd.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT; 87 | scd.OutputWindow = hwnd; 88 | scd.SampleDesc.Count = 1; 89 | scd.Windowed = TRUE; 90 | scd.SwapEffect = DXGI_SWAP_EFFECT_DISCARD; 91 | 92 | if (FAILED(D3D11CreateDeviceAndSwapChain(nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0, nullptr, 0, 93 | D3D11_SDK_VERSION, &scd, &swapChain, &device, nullptr, &deviceContext))) { 94 | return false; 95 | } 96 | 97 | D3D11_VIEWPORT viewport = {}; 98 | viewport.Width = 800.0f; 99 | viewport.Height = 600.0f; 100 | viewport.MinDepth = 0.0f; 101 | viewport.MaxDepth = 1.0f; 102 | viewport.TopLeftX = 0; 103 | viewport.TopLeftY = 0; 104 | 105 | deviceContext->RSSetViewports(1, &viewport); 106 | 107 | // Get back buffer 108 | ID3D11Texture2D* backBuffer = nullptr; 109 | swapChain->GetBuffer(0, __uuidof(ID3D11Texture2D), (void**)&backBuffer); 110 | device->CreateRenderTargetView(backBuffer, nullptr, &renderTargetView); 111 | backBuffer->Release(); 112 | deviceContext->OMSetRenderTargets(1, &renderTargetView, nullptr); 113 | 114 | // Compile shaders 115 | ID3DBlob* vsBlob = CompileShader(vertexShaderSource, "main", "vs_5_0"); 116 | ID3DBlob* psBlob = CompileShader(pixelShaderSource, "main", "ps_5_0"); 117 | 118 | if (!vsBlob || !psBlob) return false; 119 | 120 | // Create shaders 121 | device->CreateVertexShader(vsBlob->GetBufferPointer(), vsBlob->GetBufferSize(), 
nullptr, &vertexShader); 122 | device->CreatePixelShader(psBlob->GetBufferPointer(), psBlob->GetBufferSize(), nullptr, &pixelShader); 123 | 124 | // Input layout 125 | D3D11_INPUT_ELEMENT_DESC layoutDesc[] = { 126 | { "POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0 }, 127 | { "COLOR", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0 } 128 | }; 129 | device->CreateInputLayout(layoutDesc, 2, vsBlob->GetBufferPointer(), vsBlob->GetBufferSize(), &inputLayout); 130 | deviceContext->IASetInputLayout(inputLayout); 131 | 132 | vsBlob->Release(); 133 | psBlob->Release(); 134 | 135 | // Triangle vertices 136 | Vertex vertices[] = { 137 | {{ 0.0f, 0.5f, 0.0f }, { 1.0f, 0.0f, 0.0f, 1.0f }}, 138 | {{ 0.5f, -0.5f, 0.0f }, { 0.0f, 0.0f, 1.0f, 1.0f }}, 139 | {{ -0.5f, -0.5f, 0.0f }, { 0.0f, 1.0f, 0.0f, 1.0f }} 140 | }; 141 | 142 | // Create vertex buffer 143 | D3D11_BUFFER_DESC bufferDesc = {}; 144 | bufferDesc.Usage = D3D11_USAGE_DEFAULT; 145 | bufferDesc.ByteWidth = sizeof(vertices); 146 | bufferDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER; 147 | 148 | D3D11_SUBRESOURCE_DATA initData = { vertices }; 149 | device->CreateBuffer(&bufferDesc, &initData, &vertexBuffer); 150 | 151 | return true; 152 | } 153 | 154 | // Render function 155 | void Render() { 156 | float clearColor[4] = { 0.0f, 0.0f, 0.0f, 1.0f }; 157 | deviceContext->ClearRenderTargetView(renderTargetView, clearColor); 158 | 159 | // Set shaders 160 | deviceContext->VSSetShader(vertexShader, nullptr, 0); 161 | deviceContext->PSSetShader(pixelShader, nullptr, 0); 162 | 163 | // Bind vertex buffer 164 | UINT stride = sizeof(Vertex); 165 | UINT offset = 0; 166 | deviceContext->IASetVertexBuffers(0, 1, &vertexBuffer, &stride, &offset); 167 | deviceContext->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST); 168 | 169 | // Draw triangle 170 | deviceContext->Draw(3, 0); 171 | 172 | swapChain->Present(1, 0); 173 | } 174 | 175 | // Cleanup 176 | void CleanupD3D() { 
177 | if (swapChain) swapChain->Release(); 178 | if (renderTargetView) renderTargetView->Release(); 179 | if (deviceContext) deviceContext->Release(); 180 | if (device) device->Release(); 181 | if (vertexShader) vertexShader->Release(); 182 | if (pixelShader) pixelShader->Release(); 183 | if (inputLayout) inputLayout->Release(); 184 | if (vertexBuffer) vertexBuffer->Release(); 185 | } 186 | 187 | // Main function 188 | int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE, LPSTR, int nCmdShow) { 189 | WNDCLASS wc = { 0 }; 190 | wc.lpfnWndProc = WindowProc; 191 | wc.hInstance = hInstance; 192 | wc.lpszClassName = "Direct3DWindowClass"; 193 | RegisterClass(&wc); 194 | 195 | HWND hwnd = CreateWindowEx(0, wc.lpszClassName, "Direct3D Triangle", WS_OVERLAPPEDWINDOW, 196 | 100, 100, 800, 600, nullptr, nullptr, hInstance, nullptr); 197 | ShowWindow(hwnd, nCmdShow); 198 | 199 | if (!InitD3D(hwnd)) return -1; 200 | 201 | MSG msg = {}; 202 | while (msg.message != WM_QUIT) { 203 | if (PeekMessage(&msg, nullptr, 0, 0, PM_REMOVE)) { 204 | TranslateMessage(&msg); 205 | DispatchMessage(&msg); 206 | } else { 207 | Render(); 208 | } 209 | } 210 | 211 | CleanupD3D(); 212 | return 0; 213 | } 214 | -------------------------------------------------------------------------------- /webgpu/graphics/src/main.rs: -------------------------------------------------------------------------------- 1 | // Based on the file https://github.com/sotrh/learn-wgpu, licensed under the MIT 2 | // license 3 | 4 | use glfw::{fail_on_errors, Action, Key, Window, WindowHint, ClientApiHint}; 5 | use std::env::current_dir; 6 | use std::fs; 7 | 8 | pub struct PipelineBuilder { 9 | shader_filename: String, 10 | vertex_entry: String, 11 | fragment_entry: String, 12 | pixel_format: wgpu::TextureFormat, 13 | } 14 | 15 | impl PipelineBuilder { 16 | 17 | pub fn new() -> Self { 18 | PipelineBuilder { 19 | shader_filename: String::new(), 20 | vertex_entry: String::new(), 21 | fragment_entry: String::new(), 22 | 
pixel_format: wgpu::TextureFormat::Rgba8Unorm, 23 | } 24 | } 25 | 26 | pub fn set_shader_module(&mut self, shader_filename: &str, vertex_entry: &str, fragment_entry: &str) { 27 | 28 | self.shader_filename = shader_filename.to_string(); 29 | self.vertex_entry = vertex_entry.to_string(); 30 | self.fragment_entry = fragment_entry.to_string(); 31 | } 32 | 33 | pub fn set_pixel_format(&mut self, pixel_format: wgpu::TextureFormat) { 34 | 35 | self.pixel_format = pixel_format; 36 | } 37 | 38 | pub fn build_pipeline(&self, device: &wgpu::Device) -> wgpu::RenderPipeline { 39 | 40 | let mut filepath = current_dir().unwrap(); 41 | filepath.push("src/"); 42 | filepath.push(self.shader_filename.as_str()); 43 | let filepath = filepath.into_os_string().into_string().unwrap(); 44 | let source_code = fs::read_to_string(filepath).expect("Failed to read the source code."); 45 | 46 | let shader_module_descriptor = wgpu::ShaderModuleDescriptor { 47 | label: Some("Shader Module"), 48 | source: wgpu::ShaderSource::Wgsl(source_code.into()), 49 | }; 50 | let shader_module = device.create_shader_module(shader_module_descriptor); 51 | 52 | let pipeline_layout_descriptor = wgpu::PipelineLayoutDescriptor { 53 | label: Some("Render Pipeline Layout"), 54 | bind_group_layouts: &[], 55 | push_constant_ranges: &[], 56 | }; 57 | let pipeline_layout = device.create_pipeline_layout(&pipeline_layout_descriptor); 58 | 59 | let render_targets = [Some(wgpu::ColorTargetState { 60 | format: self.pixel_format, 61 | blend: Some(wgpu::BlendState::REPLACE), 62 | write_mask: wgpu::ColorWrites::ALL, 63 | })]; 64 | 65 | let render_pipeline_descriptor = wgpu::RenderPipelineDescriptor { 66 | label: Some("Render Pipeline"), 67 | layout: Some(&pipeline_layout), 68 | 69 | vertex: wgpu::VertexState { 70 | module: &shader_module, 71 | entry_point: Some(&self.vertex_entry), 72 | buffers: &[], 73 | compilation_options: wgpu::PipelineCompilationOptions::default(), 74 | }, 75 | 76 | primitive: wgpu::PrimitiveState { 77 | 
topology: wgpu::PrimitiveTopology::TriangleList, 78 | strip_index_format: None, 79 | front_face: wgpu::FrontFace::Ccw, 80 | cull_mode: Some(wgpu::Face::Back), 81 | polygon_mode: wgpu::PolygonMode::Fill, 82 | unclipped_depth: false, 83 | conservative: false, 84 | }, 85 | 86 | fragment: Some(wgpu::FragmentState { 87 | module: &shader_module, 88 | entry_point: Some(&self.fragment_entry), 89 | targets: &render_targets, 90 | compilation_options: wgpu::PipelineCompilationOptions::default(), 91 | }), 92 | 93 | depth_stencil: None, 94 | multisample: wgpu::MultisampleState { 95 | count: 1, 96 | mask: !0, 97 | alpha_to_coverage_enabled: false, 98 | }, 99 | multiview: None, 100 | cache: None, 101 | }; 102 | 103 | device.create_render_pipeline(&render_pipeline_descriptor) 104 | } 105 | } 106 | 107 | struct State<'a> { 108 | instance: wgpu::Instance, 109 | surface: wgpu::Surface<'a>, 110 | device: wgpu::Device, 111 | queue: wgpu::Queue, 112 | config: wgpu::SurfaceConfiguration, 113 | size: (i32, i32), 114 | window: &'a mut Window, 115 | render_pipeline: wgpu::RenderPipeline, 116 | } 117 | 118 | impl<'a> State<'a> { 119 | 120 | async fn new(window: &'a mut Window) -> Self { 121 | 122 | let size = window.get_framebuffer_size(); 123 | 124 | let instance_descriptor = wgpu::InstanceDescriptor { 125 | backends: wgpu::Backends::all(), ..Default::default() 126 | }; 127 | let instance = wgpu::Instance::new(&instance_descriptor); 128 | let surface = instance.create_surface(window.render_context()).unwrap(); 129 | 130 | let adapter_descriptor = wgpu::RequestAdapterOptionsBase { 131 | power_preference: wgpu::PowerPreference::default(), 132 | compatible_surface: Some(&surface), 133 | force_fallback_adapter: false, 134 | }; 135 | let adapter = instance.request_adapter(&adapter_descriptor) 136 | .await.unwrap(); 137 | 138 | let device_descriptor = wgpu::DeviceDescriptor { 139 | required_features: wgpu::Features::empty(), 140 | required_limits: wgpu::Limits::default(), 141 | label: 
Some("Device"), 142 | memory_hints: wgpu::MemoryHints::default(), 143 | }; 144 | let (device, queue) = adapter 145 | .request_device(&device_descriptor, None) 146 | .await.unwrap(); 147 | 148 | 149 | let surface_capabilities = surface.get_capabilities(&adapter); 150 | let surface_format = surface_capabilities 151 | .formats 152 | .iter() 153 | .copied() 154 | .filter(|f | f.is_srgb()) 155 | .next() 156 | .unwrap_or(surface_capabilities.formats[0]); 157 | let config = wgpu::SurfaceConfiguration { 158 | usage: wgpu::TextureUsages::RENDER_ATTACHMENT, 159 | format: surface_format, 160 | width: size.0 as u32, 161 | height: size.1 as u32, 162 | present_mode: surface_capabilities.present_modes[0], 163 | alpha_mode: surface_capabilities.alpha_modes[0], 164 | view_formats: vec![], 165 | desired_maximum_frame_latency: 2 166 | }; 167 | surface.configure(&device, &config); 168 | 169 | let mut pipeline_builder = PipelineBuilder::new(); 170 | pipeline_builder.set_shader_module("shader.wgsl", "vertices", "fragment"); 171 | pipeline_builder.set_pixel_format(config.format); 172 | let render_pipeline = pipeline_builder.build_pipeline(&device); 173 | 174 | Self { 175 | instance, 176 | window, 177 | surface, 178 | device, 179 | queue, 180 | config, 181 | size, 182 | render_pipeline, 183 | } 184 | } 185 | 186 | fn resize(&mut self, new_size: (i32, i32)) { 187 | if new_size.0 > 0 && new_size.1 > 0 { 188 | self.size = new_size; 189 | self.config.width = new_size.0 as u32; 190 | self.config.height = new_size.1 as u32; 191 | self.surface.configure(&self.device, &self.config); 192 | } 193 | } 194 | 195 | fn update_surface(&mut self) { 196 | self.surface = self.instance.create_surface(self.window.render_context()).unwrap(); 197 | } 198 | 199 | fn render(&mut self) -> Result<(), wgpu::SurfaceError>{ 200 | 201 | let drawable = self.surface.get_current_texture()?; 202 | let image_view_descriptor = wgpu::TextureViewDescriptor::default(); 203 | let image_view = 
drawable.texture.create_view(&image_view_descriptor); 204 | 205 | let command_encoder_descriptor = wgpu::CommandEncoderDescriptor { 206 | label: Some("Render Encoder") 207 | }; 208 | let mut command_encoder = self.device.create_command_encoder(&command_encoder_descriptor); 209 | 210 | let color_attachment = wgpu::RenderPassColorAttachment { 211 | view: &image_view, 212 | resolve_target: None, 213 | ops: wgpu::Operations { 214 | load: wgpu::LoadOp::Clear(wgpu::Color { 215 | r: 0.0, 216 | g: 0.0, 217 | b: 0.0, 218 | a: 1.0 219 | }), 220 | store: wgpu::StoreOp::Store, 221 | }, 222 | }; 223 | 224 | let render_pass_descriptor = wgpu::RenderPassDescriptor { 225 | label: Some("Render Pass"), 226 | color_attachments: &[Some(color_attachment)], 227 | depth_stencil_attachment: None, 228 | occlusion_query_set: None, 229 | timestamp_writes: None 230 | }; 231 | 232 | { 233 | let mut renderpass = command_encoder.begin_render_pass(&render_pass_descriptor); 234 | renderpass.set_pipeline(&self.render_pipeline); 235 | renderpass.draw(0..3, 0..1); 236 | } 237 | self.queue.submit(std::iter::once(command_encoder.finish())); 238 | 239 | drawable.present(); 240 | 241 | Ok(()) 242 | } 243 | } 244 | 245 | async fn run() { 246 | 247 | let mut glfw = glfw::init(fail_on_errors!()) 248 | .unwrap(); 249 | glfw.window_hint(WindowHint::ClientApi(ClientApiHint::NoApi)); 250 | let (mut window, events) = 251 | glfw.create_window( 252 | 800, 600, "WGPU Graphics", 253 | glfw::WindowMode::Windowed).unwrap(); 254 | 255 | let mut state = State::new(&mut window).await; 256 | 257 | state.window.set_framebuffer_size_polling(true); 258 | state.window.set_key_polling(true); 259 | state.window.set_mouse_button_polling(true); 260 | state.window.set_pos_polling(true); 261 | 262 | while !state.window.should_close() { 263 | glfw.poll_events(); 264 | for (_, event) in glfw::flush_messages(&events) { 265 | match event { 266 | 267 | glfw::WindowEvent::Key(Key::Escape, _, Action::Press, _) => { 268 | 
state.window.set_should_close(true) 269 | } 270 | 271 | glfw::WindowEvent::Pos(..) => { 272 | state.update_surface(); 273 | state.resize(state.size); 274 | } 275 | 276 | glfw::WindowEvent::FramebufferSize(width, height) => { 277 | state.update_surface(); 278 | state.resize((width, height)); 279 | } 280 | _ => {} 281 | } 282 | } 283 | 284 | match state.render() { 285 | Ok(_) => {}, 286 | Err(wgpu::SurfaceError::Lost | wgpu::SurfaceError::Outdated) => { 287 | state.update_surface(); 288 | state.resize(state.size); 289 | }, 290 | Err(e) => eprintln!("{:?}", e), 291 | } 292 | } 293 | } 294 | 295 | fn main() { 296 | pollster::block_on(run()); 297 | } 298 | -------------------------------------------------------------------------------- /readme.rst: -------------------------------------------------------------------------------- 1 | gpu-arena 2 | ========= 3 | 4 | - `English (en) <#a-guided-tour-of-gpu-programming-frameworks>`_ 5 | - `Français (fr) <#visite-guidée-de-cadres-logiciels-pour-processeurs-graphiques>`_ 6 | 7 | .. image:: assets/triangle.gif 8 | :width: 500 9 | :align: center 10 | :alt: Demonstration of simple 3D graphics. A colored triangle rotates on its vertical axis in 11 | front of a black background. The corners of the triangle are red, blue, and green, and the 12 | center of the triangle is colored in shades of these colors. 13 | 14 | 15 | A Guided Tour of GPU Programming Frameworks 16 | +++++++++++++++++++++++++++++++++++++++++++ 17 | 18 | Self-contained projects that show how to install GPU programming frameworks, build 19 | GPU-accelerated programs, and execute them. Click on the links in the index table below to access 20 | the ``readme`` file of each project for more information. 21 | 22 | The projects are minimal examples, not complete tutorials! Each ``readme`` file provides references 23 | to more detailed resources. Contributions are welcome - you can enrich the current projects and even 24 | add other GPU programming frameworks! 
25 | 26 | 27 | Project Index 28 | ------------- 29 | 30 | Click on the links in the leftmost column to access the corresponding subdirectory. ``Y`` indicates 31 | that the framework supports the application or device. ``N`` indicates that it does not support 32 | them. 33 | 34 | +------------------------------------------+----------------------------+-------------------------------------------+---------------+------------------+ 35 | | Framework | Applications | Devices | Operating | Shading / kernel | 36 | | +----------+-----------------+-----+-------+-------+-----+---------------+ Systems | language | 37 | | | Graphics | General-purpose | CPU |Nvidia | Intel | AMD | Apple Silicon | | | 38 | +==========================================+==========+=================+=====+=======+=======+=====+===============+===============+==================+ 39 | |`OpenGL `__ | Y | Y (since | N | Y | Y | Y | N | Any | GLSL | 40 | | | | version 4.3, | | | | | | (deprecated | | 41 | | | | 2012) | | | | | | on Mac) | | 42 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 43 | |`Metal `__ | Y* | Y* | N | N | N | N | Y | Mac / iOS | MSL | 44 | | | | | | | | | | | | 45 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 46 | |`DirectX `__ | Y | Y | N | Y | Y | Y | N | Windows | HLSL | 47 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 48 | |`Vulkan `__ | Y | Y (implemented | N | Y | Y | Y | N | Any | Anything that | 49 | | | | with kompute) | | | | | | (deprecated | compiles to | 50 | | | | | | | | | | on Mac) | SPIR-V | 51 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 
52 | |`WebGPU `__ | Y | Y | N | Y | Y | Y | Y | Any | WGSL | 53 | | | | | | | | | | | | 54 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 55 | |`CUDA `__ | N | Y | N | Y | N | N | N | Windows, | CUDA | 56 | | | | | | | | | | Linux | | 57 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 58 | |`OpenCL `__ | N | Y | Y | Y | Y | Y | Y | Any | OpenCL C | 59 | | | | | | | | | | (deprecated | | 60 | | | | | | | | | | on Mac) | | 61 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 62 | |`SYCL `__ | N | Y* | Y | Y | Y | Y | Y | Any (CPU-only | C++ extensions | 63 | | | | | | | | | | on Mac) | | 64 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 65 | |`Triton `__ | N | Y | N | Y | N | Y | N | Linux | Decorated Python | 66 | | | | | | | | | | | functions | 67 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 68 | | `OpenMP `__ | N | Y | Y | Y | Y | Y | Y | Any | Compiler | 69 | | | | | | | | | | | directives | 70 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 71 | | `AcceleratedKernels.jl | N | Y | Y | Y | Y | Y | Y | Any | Julia functions | 72 | | `__ | | | | | | | | | | 73 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 74 | |`CPU `__ (baseline) | N | Y | Y | N | N | N | N | Any | N/A | 75 | 
+------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 76 | 77 | - ``*``: The corresponding example is not implemented in the project. 78 | 79 | 80 | Other Frameworks 81 | ---------------- 82 | 83 | There are even more frameworks that can be used to program GPUs! A few of them are listed below; 84 | no example is implemented in this repository, but you can follow the links to learn more about 85 | them. 86 | 87 | - Bend (https://github.com/HigherOrderCO/Bend): a programming language for parallel computing. 88 | - Chapel (https://chapel-lang.org/gpu/): another programming language for parallel computing. 89 | - Mojo (https://www.modular.com/mojo): a programming language for heterogeneous computing. 90 | - oneAPI (https://www.intel.com/content/www/us/en/developer/tools/oneapi/overview.html): A 91 | software stack for high performance computing by Intel. Based on SYCL, but also adds custom 92 | extensions to implement new features. 93 | - OpenACC (https://www.openacc.org/): A parallel computing standard. 94 | - ROCm (https://www.amd.com/en/products/software/rocm.html): A software stack for high performance 95 | computing by AMD. Supports OpenCL, HIP, OpenMP. 96 | - Slang (https://www.khronos.org/news/press/khronos-group-launches-slang-initiative-hosting-open-source-compiler-contributed-by-nvidia): 97 | a shading language and compiler that can target multiple APIs. 98 | - rust-gpu (https://github.com/Rust-GPU/rust-gpu): a framework under development that enables 99 | seamless integration of GPU code into Rust code. It's a little like SYCL but for Rust instead of 100 | C++, and in contrast to SYCL, rust-gpu supports both general-purpose AND graphics applications. 101 | The project is not production-ready as of July 2025. 102 | 103 | 104 | GPU Projects 105 | ------------ 106 | 107 | Some projects that use GPU programming. Don't hesitate to create a PR if you want to add any! 
108 | 109 | - Artificial Intelligence: 110 | - burn (https://github.com/tracel-ai/burn): Deep learning framework that uses WebGPU as its 111 | backend for increased portability. It also uses SPIR-V to perform some optimizations that 112 | WebGPU does not support. 113 | - PyTorch (https://github.com/pytorch/pytorch): Deep learning library that uses CUDA and ROCm for 114 | GPU acceleration. 115 | - TensorFlow (https://github.com/tensorflow/tensorflow): Deep learning library that uses CUDA for 116 | GPU acceleration. 117 | - Graphics: 118 | - Godot Shaders (https://godotshaders.com/): A collection of shaders that can be used in the 119 | Godot game engine. The shaders use a shading language similar to GLSL. 120 | - Physics: 121 | - FluidX3D (https://github.com/ProjectPhysX/FluidX3D): Computational fluid dynamics software 122 | implemented with OpenCL. 123 | - gpu-io (https://github.com/amandaghassaei/gpu-io): A library for running physics simulations in 124 | a browser. Implemented with WebGL. 125 | - PixelFlow (https://github.com/diwi/PixelFlow): A physics simulation framework based on Java 126 | and OpenGL. 127 | - Bioinformatics: 128 | - genome-spy (https://github.com/genome-spy/genome-spy): Toolkit for analyzing genomic data 129 | implemented with WebGL. 130 | - GenomeWorks (https://github.com/NVIDIA-Genomics-Research/GenomeWorks): CUDA-accelerated DNA 131 | analysis and alignment SDK. 132 | - Cryptography: 133 | - hashcat (https://github.com/hashcat/hashcat): Password recovery program implemented with 134 | OpenMP, CUDA, and OpenCL. 135 | 136 | 137 | Additional Resources 138 | -------------------- 139 | 140 | - Step-by-step guide that explains how to optimize a GPU-accelerated program (CUDA): 141 | https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf 142 | - Introduction to CUDA and OpenCL programming: https://parlab.eecs.berkeley.edu/sites/all/parlab/files/CatanzaroIntroToCUDAOpenCL_0.pdf. 
143 | See slide 27 for a comparison of the terminology used in the two frameworks. 144 | 145 | 146 | Benchmarking 147 | ------------ 148 | 149 | Run the Python script ``benchmark.py`` to compare how performance varies with the number of 150 | threads running on the CPU: 151 | 152 | .. code:: bash 153 | 154 | # Linux 155 | python3 benchmark.py 156 | 157 | # OS that begins with the letter W 158 | py benchmark.py 159 | 160 | 161 | ----- 162 | 163 | 164 | Visite guidée de cadres logiciels pour processeurs graphiques 165 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 166 | 167 | Ce dépôt contient des projets sans dépendances qui montrent comment installer un cadre logiciel de 168 | programmation de GPU, comment construire des programmes accélérés par GPU, et comment les exécuter. 169 | Cliquez sur les liens dans le tableau ci-dessous pour accéder à des informations supplémentaires 170 | sur chaque projet. 171 | 172 | Ces projets sont des exemples minimalistes et non des tutoriels complets. Les fichiers 173 | ``readme`` dans chaque sous-répertoire fournissent des ressources plus détaillées. 
174 | 175 | 176 | Index des projets 177 | ------------------ 178 | 179 | +------------------------------------------+----------------------------+-------------------------------------------+---------------+------------------+ 180 | | Cadre logiciel | Applications | Appareils | Systèmes | Langage de  | 181 | | +----------+-----------------+-----+-------+-------+-----+---------------+ d'exploitation| nuanceurs / | 182 | | |Graphique | Calculs généraux| CPU |Nvidia | Intel | AMD | Apple Silicon | | noyaux | 183 | +==========================================+==========+=================+=====+=======+=======+=====+===============+===============+==================+ 184 | |`OpenGL `__ | O | O (depuis la | N | O | O | O | N | Tous | GLSL | 185 | | | | version 4.3, | | | | | | (réprouvé | | 186 | | | | 2012) | | | | | | sur Mac) | | 187 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 188 | |`DirectX `__ | O | O | N | O | O | O | N | Windows | HLSL | 189 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 190 | |`Metal `__ | O* | O* | N | N | N | N | O | Mac / iOS | MSL | 191 | | | | | | | | | | | | 192 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 193 | |`Vulkan `__ | O | O (avec | N | O | O | O | N | Tous | Tout ce qui se | 194 | | | | kompute) | | | | | | (réprouvé | compile vers | 195 | | | | | | | | | | sur Mac) | SPIR-V | 196 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 197 | |`WebGPU `__ | O | O | N | O | O | O | O | Tous | WGSL | 198 | | | | | | | | | | | | 199 | 
+------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 200 | |`CUDA `__ | N | O | N | O | N | N | N | Windows, | CUDA | 201 | | | | | | | | | | Linux | | 202 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 203 | |`OpenCL `__ | N | O | O | O | O | O | O | Tous | OpenCL C | 204 | | | | | | | | | | (réprouvé | | 205 | | | | | | | | | | sur Mac) | | 206 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 207 | |`SYCL `__ | N | O* | O | O | O | O | O | Tous (CPU | Extensions C++ | 208 | | | | | | | | | | seulement sur | | 209 | | | | | | | | | | Mac) | | 210 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 211 | |`Triton `__ | N | O | N | O | N | O | N | Linux | Fonctions | 212 | | | | | | | | | | | Pythons | 213 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 214 | | `OpenMP `__ | N | O | O | O | O | O | O | Tous | Directives de | 215 | | | | | | | | | | | compilateur | 216 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 217 | | `AcceleratedKernels.jl | N | O | O | O | O | O | O | Tous | Fonctions Julia | 218 | | `__ | | | | | | | | | | 219 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 220 | |`CPU `__ | N | O | O | N | N | N | N | Tous | N/A | 221 | 
+------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+ 222 | 223 | - Le signe ``*`` indique que l'exemple correspondant n'est pas inclus dans le projet. 224 | 225 | 226 | Autres cadriciels 227 | ----------------- 228 | 229 | Encore d'autres cadriciels sont disponibles pour programmer des GPU! La liste ci-dessous en présente 230 | quelques-uns. Aucun exemple n'est implémenté pour eux dans ce dépôt, mais vous pouvez suivre les 231 | liens pour en apprendre davantage. 232 | 233 | - Bend (https://github.com/HigherOrderCO/Bend): un langage de programmation pour le calcul 234 | parallèle. 235 | - Chapel (https://chapel-lang.org/gpu/): un autre langage de programmation pour le calcul 236 | parallèle. 237 | - Mojo (https://www.modular.com/mojo): un langage de programmation pour le calcul hétérogène. 238 | - oneAPI (https://www.intel.com/content/www/us/en/developer/tools/oneapi/overview.html): Une pile 239 | logicielle pour le calcul haute performance par Intel. Basée sur SYCL, mais utilise aussi des 240 | extensions spécifiques au projet pour implémenter de nouvelles fonctionnalités. 241 | - OpenACC (https://www.openacc.org/): un standard de calcul parallèle. 242 | - ROCm (https://www.amd.com/fr/products/software/rocm.html): Une pile logicielle pour le calcul 243 | haute performance par AMD. Supporte OpenCL, HIP, OpenMP. 244 | - Slang (https://www.khronos.org/news/press/khronos-group-launches-slang-initiative-hosting-open-source-compiler-contributed-by-nvidia): 245 | un compilateur et langage de nuanceur qui cible plusieurs API. 246 | - rust-gpu (https://github.com/Rust-GPU/rust-gpu): un cadriciel en développement pour intégrer du 247 | code destiné à un GPU dans du code Rust. Similaire à SYCL mais vise Rust au lieu du C++ et permet 248 | de développer des applications graphiques. Pas encore prêt pour la production en juillet 2025. 
249 | 250 | 251 | Projets 252 | ------- 253 | 254 | Quelques projets qui utilisent des GPU. N'hésitez pas à créer un PR pour en ajouter à la liste : 255 | 256 | - Intelligence artificielle : 257 | - burn (https://github.com/tracel-ai/burn) : Cadriciel d'apprentissage profond qui utilise 258 | WebGPU pour améliorer la portabilité. Utilise aussi SPIR-V directement pour effectuer certaines 259 | optimisations que WebGPU ne supporte pas. 260 | - PyTorch (https://github.com/pytorch/pytorch) : Cadriciel d'apprentissage profond qui utilise 261 | CUDA et ROCm. 262 | - TensorFlow (https://github.com/tensorflow/tensorflow) : Cadriciel d'apprentissage profond 263 | qui utilise CUDA. 264 | - Graphisme : 265 | - Godot Shaders (https://godotshaders.com/) : Un ensemble de nuanceurs qui peuvent être utilisés 266 | avec le moteur de jeu Godot. Ils utilisent un langage de nuanceur proche de GLSL. 267 | - Physique : 268 | - FluidX3D (https://github.com/ProjectPhysX/FluidX3D) : Programme de dynamique des fluides 269 | réalisé avec OpenCL. 270 | - gpu-io (https://github.com/amandaghassaei/gpu-io) : Bibliothèque de simulation physique 271 | utilisable dans un navigateur Web. Réalisée avec WebGL. 272 | - PixelFlow (https://github.com/diwi/PixelFlow) : Cadriciel de simulation physique réalisé avec 273 | Java et OpenGL. 274 | - Bioinformatique : 275 | - genome-spy (https://github.com/genome-spy/genome-spy) : Outil d'analyse génomique réalisé avec 276 | WebGL. 277 | - GenomeWorks (https://github.com/NVIDIA-Genomics-Research/GenomeWorks) : Analyse et alignement 278 | d'ADN avec CUDA. 279 | - Cryptographie : 280 | - hashcat (https://github.com/hashcat/hashcat) : Programme de récupération de mots de passe 281 | réalisé avec OpenMP, CUDA et OpenCL. 
282 | 283 | 284 | Ressources additionnelles 285 | ------------------------- 286 | 287 | - Guide d'optimisation de programme pour GPU (CUDA) : 288 | https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf 289 | - Introduction à la programmation sur CUDA et OpenCL : https://parlab.eecs.berkeley.edu/sites/all/parlab/files/CatanzaroIntroToCUDAOpenCL_0.pdf. 290 | Consultez la diapositive 27 pour une comparaison des lexiques utilisés par chaque cadre logiciel. 291 | 292 | 293 | Comparaisons 294 | ------------- 295 | 296 | Exécutez le script ``benchmark.py`` pour comparer les performances d'un programme utilisant 297 | plusieurs fils d'exécution sur CPU: 298 | 299 | .. code:: bash 300 | 301 | # Linux 302 | python3 benchmark.py 303 | 304 | # OS that begins with the letter W 305 | py benchmark.py 306 | -------------------------------------------------------------------------------- /vulkan/graphics/main.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is taken from 3 | * https://github.com/Overv/VulkanTutorial/blob/main/code/15_hello_triangle.cpp, 4 | * from the repository `VulkanTutorial`, available at 5 | * https://github.com/Overv/VulkanTutorial/tree/main and licensed under the 6 | * licenses CC0-1.0 and CC-BY-SA-4.0. 
7 | */ 8 | 9 | #define GLFW_INCLUDE_VULKAN 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | const uint32_t WIDTH = 800; 25 | const uint32_t HEIGHT = 600; 26 | 27 | const int MAX_FRAMES_IN_FLIGHT = 2; 28 | 29 | const std::vector validationLayers = { 30 | "VK_LAYER_KHRONOS_validation" 31 | }; 32 | 33 | const std::vector deviceExtensions = { 34 | VK_KHR_SWAPCHAIN_EXTENSION_NAME 35 | }; 36 | 37 | #ifdef NDEBUG 38 | const bool enableValidationLayers = false; 39 | #else 40 | const bool enableValidationLayers = true; 41 | #endif 42 | 43 | VkResult CreateDebugUtilsMessengerEXT(VkInstance instance, const VkDebugUtilsMessengerCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugUtilsMessengerEXT* pDebugMessenger) { 44 | auto func = (PFN_vkCreateDebugUtilsMessengerEXT) vkGetInstanceProcAddr(instance, "vkCreateDebugUtilsMessengerEXT"); 45 | if (func != nullptr) { 46 | return func(instance, pCreateInfo, pAllocator, pDebugMessenger); 47 | } else { 48 | return VK_ERROR_EXTENSION_NOT_PRESENT; 49 | } 50 | } 51 | 52 | void DestroyDebugUtilsMessengerEXT(VkInstance instance, VkDebugUtilsMessengerEXT debugMessenger, const VkAllocationCallbacks* pAllocator) { 53 | auto func = (PFN_vkDestroyDebugUtilsMessengerEXT) vkGetInstanceProcAddr(instance, "vkDestroyDebugUtilsMessengerEXT"); 54 | if (func != nullptr) { 55 | func(instance, debugMessenger, pAllocator); 56 | } 57 | } 58 | 59 | struct QueueFamilyIndices { 60 | std::optional graphicsFamily; 61 | std::optional presentFamily; 62 | 63 | bool isComplete() { 64 | return graphicsFamily.has_value() && presentFamily.has_value(); 65 | } 66 | }; 67 | 68 | struct SwapChainSupportDetails { 69 | VkSurfaceCapabilitiesKHR capabilities; 70 | std::vector formats; 71 | std::vector presentModes; 72 | }; 73 | 74 | class HelloTriangleApplication { 75 | public: 76 | void run() { 77 | initWindow(); 78 | 
initVulkan(); 79 | mainLoop(); 80 | cleanup(); 81 | } 82 | 83 | private: 84 | GLFWwindow* window; 85 | 86 | VkInstance instance; 87 | VkDebugUtilsMessengerEXT debugMessenger; 88 | VkSurfaceKHR surface; 89 | 90 | VkPhysicalDevice physicalDevice = VK_NULL_HANDLE; 91 | VkDevice device; 92 | 93 | VkQueue graphicsQueue; 94 | VkQueue presentQueue; 95 | 96 | VkSwapchainKHR swapChain; 97 | std::vector swapChainImages; 98 | VkFormat swapChainImageFormat; 99 | VkExtent2D swapChainExtent; 100 | std::vector swapChainImageViews; 101 | std::vector swapChainFramebuffers; 102 | 103 | VkRenderPass renderPass; 104 | VkPipelineLayout pipelineLayout; 105 | VkPipeline graphicsPipeline; 106 | 107 | VkCommandPool commandPool; 108 | VkCommandBuffer commandBuffer; 109 | 110 | VkSemaphore imageAvailableSemaphore; 111 | VkSemaphore renderFinishedSemaphore; 112 | VkFence inFlightFence; 113 | 114 | void initWindow() { 115 | glfwInit(); 116 | 117 | glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); 118 | glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE); 119 | 120 | window = glfwCreateWindow(WIDTH, HEIGHT, "Vulkan", nullptr, nullptr); 121 | } 122 | 123 | void initVulkan() { 124 | createInstance(); 125 | setupDebugMessenger(); 126 | createSurface(); 127 | pickPhysicalDevice(); 128 | createLogicalDevice(); 129 | createSwapChain(); 130 | createImageViews(); 131 | createRenderPass(); 132 | createGraphicsPipeline(); 133 | createFramebuffers(); 134 | createCommandPool(); 135 | createCommandBuffer(); 136 | createSyncObjects(); 137 | } 138 | 139 | void mainLoop() { 140 | while (!glfwWindowShouldClose(window)) { 141 | glfwPollEvents(); 142 | drawFrame(); 143 | } 144 | 145 | vkDeviceWaitIdle(device); 146 | } 147 | 148 | void cleanup() { 149 | vkDestroySemaphore(device, renderFinishedSemaphore, nullptr); 150 | vkDestroySemaphore(device, imageAvailableSemaphore, nullptr); 151 | vkDestroyFence(device, inFlightFence, nullptr); 152 | 153 | vkDestroyCommandPool(device, commandPool, nullptr); 154 | 155 | for (auto 
framebuffer : swapChainFramebuffers) { 156 | vkDestroyFramebuffer(device, framebuffer, nullptr); 157 | } 158 | 159 | vkDestroyPipeline(device, graphicsPipeline, nullptr); 160 | vkDestroyPipelineLayout(device, pipelineLayout, nullptr); 161 | vkDestroyRenderPass(device, renderPass, nullptr); 162 | 163 | for (auto imageView : swapChainImageViews) { 164 | vkDestroyImageView(device, imageView, nullptr); 165 | } 166 | 167 | vkDestroySwapchainKHR(device, swapChain, nullptr); 168 | vkDestroyDevice(device, nullptr); 169 | 170 | if (enableValidationLayers) { 171 | DestroyDebugUtilsMessengerEXT(instance, debugMessenger, nullptr); 172 | } 173 | 174 | vkDestroySurfaceKHR(instance, surface, nullptr); 175 | vkDestroyInstance(instance, nullptr); 176 | 177 | glfwDestroyWindow(window); 178 | 179 | glfwTerminate(); 180 | } 181 | 182 | void createInstance() { 183 | if (enableValidationLayers && !checkValidationLayerSupport()) { 184 | throw std::runtime_error("validation layers requested, but not available!"); 185 | } 186 | 187 | VkApplicationInfo appInfo{}; 188 | appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; 189 | appInfo.pApplicationName = "Hello Triangle"; 190 | appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0); 191 | appInfo.pEngineName = "No Engine"; 192 | appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); 193 | appInfo.apiVersion = VK_API_VERSION_1_0; 194 | 195 | VkInstanceCreateInfo createInfo{}; 196 | createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; 197 | createInfo.pApplicationInfo = &appInfo; 198 | 199 | auto extensions = getRequiredExtensions(); 200 | createInfo.enabledExtensionCount = static_cast(extensions.size()); 201 | createInfo.ppEnabledExtensionNames = extensions.data(); 202 | 203 | VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo{}; 204 | if (enableValidationLayers) { 205 | createInfo.enabledLayerCount = static_cast(validationLayers.size()); 206 | createInfo.ppEnabledLayerNames = validationLayers.data(); 207 | 208 | 
populateDebugMessengerCreateInfo(debugCreateInfo); 209 | createInfo.pNext = (VkDebugUtilsMessengerCreateInfoEXT*) &debugCreateInfo; 210 | } else { 211 | createInfo.enabledLayerCount = 0; 212 | 213 | createInfo.pNext = nullptr; 214 | } 215 | 216 | if (vkCreateInstance(&createInfo, nullptr, &instance) != VK_SUCCESS) { 217 | throw std::runtime_error("failed to create instance!"); 218 | } 219 | } 220 | 221 | void populateDebugMessengerCreateInfo(VkDebugUtilsMessengerCreateInfoEXT& createInfo) { 222 | createInfo = {}; 223 | createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; 224 | createInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; 225 | createInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; 226 | createInfo.pfnUserCallback = debugCallback; 227 | } 228 | 229 | void setupDebugMessenger() { 230 | if (!enableValidationLayers) return; 231 | 232 | VkDebugUtilsMessengerCreateInfoEXT createInfo; 233 | populateDebugMessengerCreateInfo(createInfo); 234 | 235 | if (CreateDebugUtilsMessengerEXT(instance, &createInfo, nullptr, &debugMessenger) != VK_SUCCESS) { 236 | throw std::runtime_error("failed to set up debug messenger!"); 237 | } 238 | } 239 | 240 | void createSurface() { 241 | if (glfwCreateWindowSurface(instance, window, nullptr, &surface) != VK_SUCCESS) { 242 | throw std::runtime_error("failed to create window surface!"); 243 | } 244 | } 245 | 246 | void pickPhysicalDevice() { 247 | uint32_t deviceCount = 0; 248 | vkEnumeratePhysicalDevices(instance, &deviceCount, nullptr); 249 | 250 | if (deviceCount == 0) { 251 | throw std::runtime_error("failed to find GPUs with Vulkan support!"); 252 | } 253 | 254 | std::vector devices(deviceCount); 255 | vkEnumeratePhysicalDevices(instance, &deviceCount, devices.data()); 256 | 
257 | for (const auto& device : devices) { 258 | if (isDeviceSuitable(device)) { 259 | physicalDevice = device; 260 | break; 261 | } 262 | } 263 | 264 | if (physicalDevice == VK_NULL_HANDLE) { 265 | throw std::runtime_error("failed to find a suitable GPU!"); 266 | } 267 | } 268 | 269 | void createLogicalDevice() { 270 | QueueFamilyIndices indices = findQueueFamilies(physicalDevice); 271 | 272 | std::vector queueCreateInfos; 273 | std::set uniqueQueueFamilies = {indices.graphicsFamily.value(), indices.presentFamily.value()}; 274 | 275 | float queuePriority = 1.0f; 276 | for (uint32_t queueFamily : uniqueQueueFamilies) { 277 | VkDeviceQueueCreateInfo queueCreateInfo{}; 278 | queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; 279 | queueCreateInfo.queueFamilyIndex = queueFamily; 280 | queueCreateInfo.queueCount = 1; 281 | queueCreateInfo.pQueuePriorities = &queuePriority; 282 | queueCreateInfos.push_back(queueCreateInfo); 283 | } 284 | 285 | VkPhysicalDeviceFeatures deviceFeatures{}; 286 | 287 | VkDeviceCreateInfo createInfo{}; 288 | createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; 289 | 290 | createInfo.queueCreateInfoCount = static_cast(queueCreateInfos.size()); 291 | createInfo.pQueueCreateInfos = queueCreateInfos.data(); 292 | 293 | createInfo.pEnabledFeatures = &deviceFeatures; 294 | 295 | createInfo.enabledExtensionCount = static_cast(deviceExtensions.size()); 296 | createInfo.ppEnabledExtensionNames = deviceExtensions.data(); 297 | 298 | if (enableValidationLayers) { 299 | createInfo.enabledLayerCount = static_cast(validationLayers.size()); 300 | createInfo.ppEnabledLayerNames = validationLayers.data(); 301 | } else { 302 | createInfo.enabledLayerCount = 0; 303 | } 304 | 305 | if (vkCreateDevice(physicalDevice, &createInfo, nullptr, &device) != VK_SUCCESS) { 306 | throw std::runtime_error("failed to create logical device!"); 307 | } 308 | 309 | vkGetDeviceQueue(device, indices.graphicsFamily.value(), 0, &graphicsQueue); 310 | 
vkGetDeviceQueue(device, indices.presentFamily.value(), 0, &presentQueue); 311 | } 312 | 313 | void createSwapChain() { 314 | SwapChainSupportDetails swapChainSupport = querySwapChainSupport(physicalDevice); 315 | 316 | VkSurfaceFormatKHR surfaceFormat = chooseSwapSurfaceFormat(swapChainSupport.formats); 317 | VkPresentModeKHR presentMode = chooseSwapPresentMode(swapChainSupport.presentModes); 318 | VkExtent2D extent = chooseSwapExtent(swapChainSupport.capabilities); 319 | 320 | uint32_t imageCount = swapChainSupport.capabilities.minImageCount + 1; 321 | if (swapChainSupport.capabilities.maxImageCount > 0 && imageCount > swapChainSupport.capabilities.maxImageCount) { 322 | imageCount = swapChainSupport.capabilities.maxImageCount; 323 | } 324 | 325 | VkSwapchainCreateInfoKHR createInfo{}; 326 | createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; 327 | createInfo.surface = surface; 328 | 329 | createInfo.minImageCount = imageCount; 330 | createInfo.imageFormat = surfaceFormat.format; 331 | createInfo.imageColorSpace = surfaceFormat.colorSpace; 332 | createInfo.imageExtent = extent; 333 | createInfo.imageArrayLayers = 1; 334 | createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; 335 | 336 | QueueFamilyIndices indices = findQueueFamilies(physicalDevice); 337 | uint32_t queueFamilyIndices[] = {indices.graphicsFamily.value(), indices.presentFamily.value()}; 338 | 339 | if (indices.graphicsFamily != indices.presentFamily) { 340 | createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT; 341 | createInfo.queueFamilyIndexCount = 2; 342 | createInfo.pQueueFamilyIndices = queueFamilyIndices; 343 | } else { 344 | createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; 345 | } 346 | 347 | createInfo.preTransform = swapChainSupport.capabilities.currentTransform; 348 | createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; 349 | createInfo.presentMode = presentMode; 350 | createInfo.clipped = VK_TRUE; 351 | 352 | createInfo.oldSwapchain = 
VK_NULL_HANDLE; 353 | 354 | if (vkCreateSwapchainKHR(device, &createInfo, nullptr, &swapChain) != VK_SUCCESS) { 355 | throw std::runtime_error("failed to create swap chain!"); 356 | } 357 | 358 | vkGetSwapchainImagesKHR(device, swapChain, &imageCount, nullptr); 359 | swapChainImages.resize(imageCount); 360 | vkGetSwapchainImagesKHR(device, swapChain, &imageCount, swapChainImages.data()); 361 | 362 | swapChainImageFormat = surfaceFormat.format; 363 | swapChainExtent = extent; 364 | } 365 | 366 | void createImageViews() { 367 | swapChainImageViews.resize(swapChainImages.size()); 368 | 369 | for (size_t i = 0; i < swapChainImages.size(); i++) { 370 | VkImageViewCreateInfo createInfo{}; 371 | createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; 372 | createInfo.image = swapChainImages[i]; 373 | createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; 374 | createInfo.format = swapChainImageFormat; 375 | createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; 376 | createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; 377 | createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; 378 | createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; 379 | createInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; 380 | createInfo.subresourceRange.baseMipLevel = 0; 381 | createInfo.subresourceRange.levelCount = 1; 382 | createInfo.subresourceRange.baseArrayLayer = 0; 383 | createInfo.subresourceRange.layerCount = 1; 384 | 385 | if (vkCreateImageView(device, &createInfo, nullptr, &swapChainImageViews[i]) != VK_SUCCESS) { 386 | throw std::runtime_error("failed to create image views!"); 387 | } 388 | } 389 | } 390 | 391 | void createRenderPass() { 392 | VkAttachmentDescription colorAttachment{}; 393 | colorAttachment.format = swapChainImageFormat; 394 | colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; 395 | colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; 396 | colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; 397 | colorAttachment.stencilLoadOp = 
VK_ATTACHMENT_LOAD_OP_DONT_CARE; 398 | colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; 399 | colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; 400 | colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; 401 | 402 | VkAttachmentReference colorAttachmentRef{}; 403 | colorAttachmentRef.attachment = 0; 404 | colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; 405 | 406 | VkSubpassDescription subpass{}; 407 | subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; 408 | subpass.colorAttachmentCount = 1; 409 | subpass.pColorAttachments = &colorAttachmentRef; 410 | 411 | VkSubpassDependency dependency{}; 412 | dependency.srcSubpass = VK_SUBPASS_EXTERNAL; 413 | dependency.dstSubpass = 0; 414 | dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; 415 | dependency.srcAccessMask = 0; 416 | dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; 417 | dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; 418 | 419 | VkRenderPassCreateInfo renderPassInfo{}; 420 | renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; 421 | renderPassInfo.attachmentCount = 1; 422 | renderPassInfo.pAttachments = &colorAttachment; 423 | renderPassInfo.subpassCount = 1; 424 | renderPassInfo.pSubpasses = &subpass; 425 | renderPassInfo.dependencyCount = 1; 426 | renderPassInfo.pDependencies = &dependency; 427 | 428 | if (vkCreateRenderPass(device, &renderPassInfo, nullptr, &renderPass) != VK_SUCCESS) { 429 | throw std::runtime_error("failed to create render pass!"); 430 | } 431 | } 432 | 433 | void createGraphicsPipeline() { 434 | auto vertShaderCode = readFile("vertices.spv"); 435 | auto fragShaderCode = readFile("fragment.spv"); 436 | 437 | VkShaderModule vertShaderModule = createShaderModule(vertShaderCode); 438 | VkShaderModule fragShaderModule = createShaderModule(fragShaderCode); 439 | 440 | VkPipelineShaderStageCreateInfo vertShaderStageInfo{}; 441 | vertShaderStageInfo.sType 
= VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; 442 | vertShaderStageInfo.stage = VK_SHADER_STAGE_VERTEX_BIT; 443 | vertShaderStageInfo.module = vertShaderModule; 444 | vertShaderStageInfo.pName = "main"; 445 | 446 | VkPipelineShaderStageCreateInfo fragShaderStageInfo{}; 447 | fragShaderStageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; 448 | fragShaderStageInfo.stage = VK_SHADER_STAGE_FRAGMENT_BIT; 449 | fragShaderStageInfo.module = fragShaderModule; 450 | fragShaderStageInfo.pName = "main"; 451 | 452 | VkPipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo}; 453 | 454 | VkPipelineVertexInputStateCreateInfo vertexInputInfo{}; 455 | vertexInputInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; 456 | vertexInputInfo.vertexBindingDescriptionCount = 0; 457 | vertexInputInfo.vertexAttributeDescriptionCount = 0; 458 | 459 | VkPipelineInputAssemblyStateCreateInfo inputAssembly{}; 460 | inputAssembly.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; 461 | inputAssembly.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; 462 | inputAssembly.primitiveRestartEnable = VK_FALSE; 463 | 464 | VkPipelineViewportStateCreateInfo viewportState{}; 465 | viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; 466 | viewportState.viewportCount = 1; 467 | viewportState.scissorCount = 1; 468 | 469 | VkPipelineRasterizationStateCreateInfo rasterizer{}; 470 | rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; 471 | rasterizer.depthClampEnable = VK_FALSE; 472 | rasterizer.rasterizerDiscardEnable = VK_FALSE; 473 | rasterizer.polygonMode = VK_POLYGON_MODE_FILL; 474 | rasterizer.lineWidth = 1.0f; 475 | rasterizer.cullMode = VK_CULL_MODE_BACK_BIT; 476 | rasterizer.frontFace = VK_FRONT_FACE_CLOCKWISE; 477 | rasterizer.depthBiasEnable = VK_FALSE; 478 | 479 | VkPipelineMultisampleStateCreateInfo multisampling{}; 480 | multisampling.sType = 
VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; 481 | multisampling.sampleShadingEnable = VK_FALSE; 482 | multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; 483 | 484 | VkPipelineColorBlendAttachmentState colorBlendAttachment{}; 485 | colorBlendAttachment.colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; 486 | colorBlendAttachment.blendEnable = VK_FALSE; 487 | 488 | VkPipelineColorBlendStateCreateInfo colorBlending{}; 489 | colorBlending.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; 490 | colorBlending.logicOpEnable = VK_FALSE; 491 | colorBlending.logicOp = VK_LOGIC_OP_COPY; 492 | colorBlending.attachmentCount = 1; 493 | colorBlending.pAttachments = &colorBlendAttachment; 494 | colorBlending.blendConstants[0] = 0.0f; 495 | colorBlending.blendConstants[1] = 0.0f; 496 | colorBlending.blendConstants[2] = 0.0f; 497 | colorBlending.blendConstants[3] = 0.0f; 498 | 499 | std::vector dynamicStates = { 500 | VK_DYNAMIC_STATE_VIEWPORT, 501 | VK_DYNAMIC_STATE_SCISSOR 502 | }; 503 | VkPipelineDynamicStateCreateInfo dynamicState{}; 504 | dynamicState.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; 505 | dynamicState.dynamicStateCount = static_cast(dynamicStates.size()); 506 | dynamicState.pDynamicStates = dynamicStates.data(); 507 | 508 | VkPipelineLayoutCreateInfo pipelineLayoutInfo{}; 509 | pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; 510 | pipelineLayoutInfo.setLayoutCount = 0; 511 | pipelineLayoutInfo.pushConstantRangeCount = 0; 512 | 513 | if (vkCreatePipelineLayout(device, &pipelineLayoutInfo, nullptr, &pipelineLayout) != VK_SUCCESS) { 514 | throw std::runtime_error("failed to create pipeline layout!"); 515 | } 516 | 517 | VkGraphicsPipelineCreateInfo pipelineInfo{}; 518 | pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; 519 | pipelineInfo.stageCount = 2; 520 | pipelineInfo.pStages = shaderStages; 
521 | pipelineInfo.pVertexInputState = &vertexInputInfo; 522 | pipelineInfo.pInputAssemblyState = &inputAssembly; 523 | pipelineInfo.pViewportState = &viewportState; 524 | pipelineInfo.pRasterizationState = &rasterizer; 525 | pipelineInfo.pMultisampleState = &multisampling; 526 | pipelineInfo.pColorBlendState = &colorBlending; 527 | pipelineInfo.pDynamicState = &dynamicState; 528 | pipelineInfo.layout = pipelineLayout; 529 | pipelineInfo.renderPass = renderPass; 530 | pipelineInfo.subpass = 0; 531 | pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; 532 | 533 | if (vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &pipelineInfo, nullptr, &graphicsPipeline) != VK_SUCCESS) { 534 | throw std::runtime_error("failed to create graphics pipeline!"); 535 | } 536 | 537 | vkDestroyShaderModule(device, fragShaderModule, nullptr); 538 | vkDestroyShaderModule(device, vertShaderModule, nullptr); 539 | } 540 | 541 | void createFramebuffers() { 542 | swapChainFramebuffers.resize(swapChainImageViews.size()); 543 | 544 | for (size_t i = 0; i < swapChainImageViews.size(); i++) { 545 | VkImageView attachments[] = { 546 | swapChainImageViews[i] 547 | }; 548 | 549 | VkFramebufferCreateInfo framebufferInfo{}; 550 | framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; 551 | framebufferInfo.renderPass = renderPass; 552 | framebufferInfo.attachmentCount = 1; 553 | framebufferInfo.pAttachments = attachments; 554 | framebufferInfo.width = swapChainExtent.width; 555 | framebufferInfo.height = swapChainExtent.height; 556 | framebufferInfo.layers = 1; 557 | 558 | if (vkCreateFramebuffer(device, &framebufferInfo, nullptr, &swapChainFramebuffers[i]) != VK_SUCCESS) { 559 | throw std::runtime_error("failed to create framebuffer!"); 560 | } 561 | } 562 | } 563 | 564 | void createCommandPool() { 565 | QueueFamilyIndices queueFamilyIndices = findQueueFamilies(physicalDevice); 566 | 567 | VkCommandPoolCreateInfo poolInfo{}; 568 | poolInfo.sType = 
VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; 569 | poolInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; 570 | poolInfo.queueFamilyIndex = queueFamilyIndices.graphicsFamily.value(); 571 | 572 | if (vkCreateCommandPool(device, &poolInfo, nullptr, &commandPool) != VK_SUCCESS) { 573 | throw std::runtime_error("failed to create command pool!"); 574 | } 575 | } 576 | 577 | void createCommandBuffer() { 578 | VkCommandBufferAllocateInfo allocInfo{}; 579 | allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; 580 | allocInfo.commandPool = commandPool; 581 | allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; 582 | allocInfo.commandBufferCount = 1; 583 | 584 | if (vkAllocateCommandBuffers(device, &allocInfo, &commandBuffer) != VK_SUCCESS) { 585 | throw std::runtime_error("failed to allocate command buffers!"); 586 | } 587 | } 588 | 589 | void recordCommandBuffer(VkCommandBuffer commandBuffer, uint32_t imageIndex) { 590 | VkCommandBufferBeginInfo beginInfo{}; 591 | beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; 592 | 593 | if (vkBeginCommandBuffer(commandBuffer, &beginInfo) != VK_SUCCESS) { 594 | throw std::runtime_error("failed to begin recording command buffer!"); 595 | } 596 | 597 | VkRenderPassBeginInfo renderPassInfo{}; 598 | renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; 599 | renderPassInfo.renderPass = renderPass; 600 | renderPassInfo.framebuffer = swapChainFramebuffers[imageIndex]; 601 | renderPassInfo.renderArea.offset = {0, 0}; 602 | renderPassInfo.renderArea.extent = swapChainExtent; 603 | 604 | VkClearValue clearColor = {{{0.0f, 0.0f, 0.0f, 1.0f}}}; 605 | renderPassInfo.clearValueCount = 1; 606 | renderPassInfo.pClearValues = &clearColor; 607 | 608 | vkCmdBeginRenderPass(commandBuffer, &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE); 609 | 610 | vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, graphicsPipeline); 611 | 612 | VkViewport viewport{}; 613 | viewport.x = 0.0f; 614 | viewport.y = 
0.0f; 615 | viewport.width = static_cast(swapChainExtent.width); 616 | viewport.height = static_cast(swapChainExtent.height); 617 | viewport.minDepth = 0.0f; 618 | viewport.maxDepth = 1.0f; 619 | vkCmdSetViewport(commandBuffer, 0, 1, &viewport); 620 | 621 | VkRect2D scissor{}; 622 | scissor.offset = {0, 0}; 623 | scissor.extent = swapChainExtent; 624 | vkCmdSetScissor(commandBuffer, 0, 1, &scissor); 625 | 626 | vkCmdDraw(commandBuffer, 3, 1, 0, 0); 627 | 628 | vkCmdEndRenderPass(commandBuffer); 629 | 630 | if (vkEndCommandBuffer(commandBuffer) != VK_SUCCESS) { 631 | throw std::runtime_error("failed to record command buffer!"); 632 | } 633 | } 634 | 635 | void createSyncObjects() { 636 | VkSemaphoreCreateInfo semaphoreInfo{}; 637 | semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; 638 | 639 | VkFenceCreateInfo fenceInfo{}; 640 | fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; 641 | fenceInfo.flags = VK_FENCE_CREATE_SIGNALED_BIT; 642 | 643 | if (vkCreateSemaphore(device, &semaphoreInfo, nullptr, &imageAvailableSemaphore) != VK_SUCCESS || 644 | vkCreateSemaphore(device, &semaphoreInfo, nullptr, &renderFinishedSemaphore) != VK_SUCCESS || 645 | vkCreateFence(device, &fenceInfo, nullptr, &inFlightFence) != VK_SUCCESS) { 646 | throw std::runtime_error("failed to create synchronization objects for a frame!"); 647 | } 648 | 649 | } 650 | 651 | void drawFrame() { 652 | vkWaitForFences(device, 1, &inFlightFence, VK_TRUE, UINT64_MAX); 653 | vkResetFences(device, 1, &inFlightFence); 654 | 655 | uint32_t imageIndex; 656 | vkAcquireNextImageKHR(device, swapChain, UINT64_MAX, imageAvailableSemaphore, VK_NULL_HANDLE, &imageIndex); 657 | 658 | vkResetCommandBuffer(commandBuffer, /*VkCommandBufferResetFlagBits*/ 0); 659 | recordCommandBuffer(commandBuffer, imageIndex); 660 | 661 | VkSubmitInfo submitInfo{}; 662 | submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; 663 | 664 | VkSemaphore waitSemaphores[] = {imageAvailableSemaphore}; 665 | VkPipelineStageFlags 
waitStages[] = {VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT}; 666 | submitInfo.waitSemaphoreCount = 1; 667 | submitInfo.pWaitSemaphores = waitSemaphores; 668 | submitInfo.pWaitDstStageMask = waitStages; 669 | 670 | submitInfo.commandBufferCount = 1; 671 | submitInfo.pCommandBuffers = &commandBuffer; 672 | 673 | VkSemaphore signalSemaphores[] = {renderFinishedSemaphore}; 674 | submitInfo.signalSemaphoreCount = 1; 675 | submitInfo.pSignalSemaphores = signalSemaphores; 676 | 677 | if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, inFlightFence) != VK_SUCCESS) { 678 | throw std::runtime_error("failed to submit draw command buffer!"); 679 | } 680 | 681 | VkPresentInfoKHR presentInfo{}; 682 | presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; 683 | 684 | presentInfo.waitSemaphoreCount = 1; 685 | presentInfo.pWaitSemaphores = signalSemaphores; 686 | 687 | VkSwapchainKHR swapChains[] = {swapChain}; 688 | presentInfo.swapchainCount = 1; 689 | presentInfo.pSwapchains = swapChains; 690 | 691 | presentInfo.pImageIndices = &imageIndex; 692 | 693 | vkQueuePresentKHR(presentQueue, &presentInfo); 694 | } 695 | 696 | VkShaderModule createShaderModule(const std::vector& code) { 697 | VkShaderModuleCreateInfo createInfo{}; 698 | createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; 699 | createInfo.codeSize = code.size(); 700 | createInfo.pCode = reinterpret_cast(code.data()); 701 | 702 | VkShaderModule shaderModule; 703 | if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != VK_SUCCESS) { 704 | throw std::runtime_error("failed to create shader module!"); 705 | } 706 | 707 | return shaderModule; 708 | } 709 | 710 | VkSurfaceFormatKHR chooseSwapSurfaceFormat(const std::vector& availableFormats) { 711 | for (const auto& availableFormat : availableFormats) { 712 | if (availableFormat.format == VK_FORMAT_B8G8R8A8_SRGB && availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) { 713 | return availableFormat; 714 | } 715 | } 716 | 717 | return 
availableFormats[0]; 718 | } 719 | 720 | VkPresentModeKHR chooseSwapPresentMode(const std::vector& availablePresentModes) { 721 | for (const auto& availablePresentMode : availablePresentModes) { 722 | if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) { 723 | return availablePresentMode; 724 | } 725 | } 726 | 727 | return VK_PRESENT_MODE_FIFO_KHR; 728 | } 729 | 730 | VkExtent2D chooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities) { 731 | if (capabilities.currentExtent.width != std::numeric_limits::max()) { 732 | return capabilities.currentExtent; 733 | } else { 734 | int width, height; 735 | glfwGetFramebufferSize(window, &width, &height); 736 | 737 | VkExtent2D actualExtent = { 738 | static_cast(width), 739 | static_cast(height) 740 | }; 741 | 742 | actualExtent.width = std::clamp(actualExtent.width, capabilities.minImageExtent.width, capabilities.maxImageExtent.width); 743 | actualExtent.height = std::clamp(actualExtent.height, capabilities.minImageExtent.height, capabilities.maxImageExtent.height); 744 | 745 | return actualExtent; 746 | } 747 | } 748 | 749 | SwapChainSupportDetails querySwapChainSupport(VkPhysicalDevice device) { 750 | SwapChainSupportDetails details; 751 | 752 | vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, &details.capabilities); 753 | 754 | uint32_t formatCount; 755 | vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, nullptr); 756 | 757 | if (formatCount != 0) { 758 | details.formats.resize(formatCount); 759 | vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, details.formats.data()); 760 | } 761 | 762 | uint32_t presentModeCount; 763 | vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, nullptr); 764 | 765 | if (presentModeCount != 0) { 766 | details.presentModes.resize(presentModeCount); 767 | vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, details.presentModes.data()); 768 | } 769 | 770 | return details; 771 | } 772 | 773 
| bool isDeviceSuitable(VkPhysicalDevice device) { 774 | QueueFamilyIndices indices = findQueueFamilies(device); 775 | 776 | bool extensionsSupported = checkDeviceExtensionSupport(device); 777 | 778 | bool swapChainAdequate = false; 779 | if (extensionsSupported) { 780 | SwapChainSupportDetails swapChainSupport = querySwapChainSupport(device); 781 | swapChainAdequate = !swapChainSupport.formats.empty() && !swapChainSupport.presentModes.empty(); 782 | } 783 | 784 | return indices.isComplete() && extensionsSupported && swapChainAdequate; 785 | } 786 | 787 | bool checkDeviceExtensionSupport(VkPhysicalDevice device) { 788 | uint32_t extensionCount; 789 | vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, nullptr); 790 | 791 | std::vector availableExtensions(extensionCount); 792 | vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, availableExtensions.data()); 793 | 794 | std::set requiredExtensions(deviceExtensions.begin(), deviceExtensions.end()); 795 | 796 | for (const auto& extension : availableExtensions) { 797 | requiredExtensions.erase(extension.extensionName); 798 | } 799 | 800 | return requiredExtensions.empty(); 801 | } 802 | 803 | QueueFamilyIndices findQueueFamilies(VkPhysicalDevice device) { 804 | QueueFamilyIndices indices; 805 | 806 | uint32_t queueFamilyCount = 0; 807 | vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, nullptr); 808 | 809 | std::vector queueFamilies(queueFamilyCount); 810 | vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, queueFamilies.data()); 811 | 812 | int i = 0; 813 | for (const auto& queueFamily : queueFamilies) { 814 | if (queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT) { 815 | indices.graphicsFamily = i; 816 | } 817 | 818 | VkBool32 presentSupport = false; 819 | vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport); 820 | 821 | if (presentSupport) { 822 | indices.presentFamily = i; 823 | } 824 | 825 | if (indices.isComplete()) { 826 | 
break; 827 | } 828 | 829 | i++; 830 | } 831 | 832 | return indices; 833 | } 834 | 835 | std::vector getRequiredExtensions() { 836 | uint32_t glfwExtensionCount = 0; 837 | const char** glfwExtensions; 838 | glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); 839 | 840 | std::vector extensions(glfwExtensions, glfwExtensions + glfwExtensionCount); 841 | 842 | if (enableValidationLayers) { 843 | extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); 844 | } 845 | 846 | return extensions; 847 | } 848 | 849 | bool checkValidationLayerSupport() { 850 | uint32_t layerCount; 851 | vkEnumerateInstanceLayerProperties(&layerCount, nullptr); 852 | 853 | std::vector availableLayers(layerCount); 854 | vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data()); 855 | 856 | for (const char* layerName : validationLayers) { 857 | bool layerFound = false; 858 | 859 | for (const auto& layerProperties : availableLayers) { 860 | if (strcmp(layerName, layerProperties.layerName) == 0) { 861 | layerFound = true; 862 | break; 863 | } 864 | } 865 | 866 | if (!layerFound) { 867 | return false; 868 | } 869 | } 870 | 871 | return true; 872 | } 873 | 874 | static std::vector readFile(const std::string& filename) { 875 | std::ifstream file(filename, std::ios::ate | std::ios::binary); 876 | 877 | if (!file.is_open()) { 878 | throw std::runtime_error("failed to open file!"); 879 | } 880 | 881 | size_t fileSize = (size_t) file.tellg(); 882 | std::vector buffer(fileSize); 883 | 884 | file.seekg(0); 885 | file.read(buffer.data(), fileSize); 886 | 887 | file.close(); 888 | 889 | return buffer; 890 | } 891 | 892 | static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, VkDebugUtilsMessageTypeFlagsEXT messageType, const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData, void* pUserData) { 893 | std::cerr << "validation layer: " << pCallbackData->pMessage << std::endl; 894 | 895 | return VK_FALSE; 896 | } 897 | }; 898 
| 899 | int main() { 900 | HelloTriangleApplication app; 901 | 902 | try { 903 | app.run(); 904 | } catch (const std::exception& e) { 905 | std::cerr << e.what() << std::endl; 906 | return EXIT_FAILURE; 907 | } 908 | 909 | return EXIT_SUCCESS; 910 | } 911 | --------------------------------------------------------------------------------