├── cpu
    ├── cpp
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── cpu.cpp
    │   ├── task1.hpp
    │   ├── task3.hpp
    │   └── task2.hpp
    ├── rust
    │   ├── src
    │   │   ├── task2.rs
    │   │   ├── task3.rs
    │   │   ├── task1.rs
    │   │   └── main.rs
    │   └── Cargo.toml
    └── readme.md
├── cuda
    ├── .gitignore
    ├── readme.md
    └── main.cu
├── openmp
    ├── .gitignore
    ├── CMakeLists.txt
    ├── openmp.cpp
    └── readme.md
├── directx
    ├── computing
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── sum.hlsl
    │   └── main.cpp
    ├── graphics
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   └── main.cpp
    └── readme.md
├── vulkan
    ├── graphics
    │   ├── .gitignore
    │   ├── shader.frag
    │   ├── shader.vert
    │   ├── CMakeLists.txt
    │   └── main.cpp
    ├── compute
    │   ├── .gitignore
    │   └── compute.py
    └── readme.md
├── opengl
    ├── computing
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   └── main.cpp
    ├── graphics
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   └── main.cpp
    └── readme.md
├── assets
    └── triangle.gif
├── opencl
    ├── src
    │   ├── test_kernel.cl
    │   └── main.rs
    ├── Cargo.toml
    └── readme.md
├── webgpu
    ├── graphics
    │   ├── Cargo.toml
    │   └── src
    │   │   ├── shader.wgsl
    │   │   └── main.rs
    ├── compute
    │   ├── Cargo.toml
    │   └── src
    │   │   ├── shader.wgsl
    │   │   └── main.rs
    └── readme.md
├── AcceleratedKernels.jl
    ├── Project
    │   ├── Project.toml
    │   └── src
    │   │   └── Project.jl
    └── readme.md
├── metal
    └── readme.md
├── triton
    ├── readme.md
    └── main.py
├── .gitignore
├── sycl
    ├── main.cpp
    └── readme.md
├── LICENSE
├── benchmark.py
└── readme.rst


/cpu/cpp/.gitignore:
--------------------------------------------------------------------------------
1 | build/*


--------------------------------------------------------------------------------
/cuda/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/openmp/.gitignore:
--------------------------------------------------------------------------------
1 | build/*


--------------------------------------------------------------------------------
/directx/computing/.gitignore:
--------------------------------------------------------------------------------
1 | build/*


--------------------------------------------------------------------------------
/directx/graphics/.gitignore:
--------------------------------------------------------------------------------
1 | build/*


--------------------------------------------------------------------------------
/vulkan/graphics/.gitignore:
--------------------------------------------------------------------------------
1 | build/*
2 | 


--------------------------------------------------------------------------------
/vulkan/compute/.gitignore:
--------------------------------------------------------------------------------
1 | *.comp
2 | *.spv
3 | 


--------------------------------------------------------------------------------
/opengl/computing/.gitignore:
--------------------------------------------------------------------------------
1 | build/*
2 | include/*
3 | src/*
4 | 


--------------------------------------------------------------------------------
/opengl/graphics/.gitignore:
--------------------------------------------------------------------------------
1 | build/*
2 | include/*
3 | src/*
4 | 


--------------------------------------------------------------------------------
/assets/triangle.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vincent-Therrien/gpu-arena/HEAD/assets/triangle.gif


--------------------------------------------------------------------------------
/opencl/src/test_kernel.cl:
--------------------------------------------------------------------------------
1 | // Add 2.0 to the buffer element selected by this work-item's global id (dimension 0).
2 | __kernel void add(__global float* buffer) {
3 |     // `2.0f` keeps the arithmetic in single precision. A bare `2.0` is a double literal,
4 |     // and double-precision support is optional in OpenCL.
5 |     buffer[get_global_id(0)] += 2.0f;
6 | }
7 | 


--------------------------------------------------------------------------------
/opencl/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "opencl"
3 | version = "0.1.0"
4 | edition = "2021"
5 | 
6 | [dependencies]
7 | ocl = "0.19"
8 | 


--------------------------------------------------------------------------------
/cpu/rust/src/task2.rs:
--------------------------------------------------------------------------------
1 | /// Task 2: Multiply 2 matrices.
2 | ///
3 | /// Not implemented here: both arguments are ignored and the call always panics with the
4 | /// message "Unsupported".
5 | pub fn task_2(_n: u32, _threads: u32) -> f64 {
6 |     panic!("Unsupported")
7 | }
8 | 


--------------------------------------------------------------------------------
/webgpu/graphics/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "graphics"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | [dependencies]
 7 | glfw = "0.59.0"
 8 | wgpu = "24.0.1"
 9 | pollster = "0.4.0"
10 | 


--------------------------------------------------------------------------------
/cpu/cpp/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.20)
2 | project(cpu LANGUAGES CXX)
3 | 
4 | set(CMAKE_CXX_STANDARD 20)
5 | set(CMAKE_CXX_STANDARD_REQUIRED True)
6 | 
7 | add_executable(cpu cpu.cpp)
8 | 


--------------------------------------------------------------------------------
/openmp/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.20)
2 | project(main LANGUAGES CXX)
3 | 
4 | set(CMAKE_CXX_STANDARD 20)
5 | set(CMAKE_CXX_STANDARD_REQUIRED True)
6 | find_package(OpenMP)
7 | 
8 | add_executable(main openmp.cpp)
9 | 


--------------------------------------------------------------------------------
/webgpu/compute/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "compute"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | [dependencies]
 7 | glfw = "0.59.0"
 8 | wgpu = "24.0.1"
 9 | pollster = "0.4.0"
10 | bytemuck = "1.21.0"
11 | futures-intrusive = "0.5"
12 | 


--------------------------------------------------------------------------------
/cpu/rust/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "cpu"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | [dependencies]
 7 | clap = { version = "4.5.26", features = ["derive"] }
 8 | ndarray = { version = "0.16.0", default-features = false }
 9 | ndarray-rand = "0.15.0"
10 | floating-duration = "0.1.2"
11 | 


--------------------------------------------------------------------------------
/vulkan/graphics/shader.frag:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | 
 3 | layout(location = 0) in vec3 fragColor;
 4 | layout(location = 0) out vec4 outColor;
 5 | 
 6 | // Posterize the interpolated vertex color into discrete bands and write it out opaque.
 7 | void main() {
 8 |     const float bandCount = 10.0; // Number of discrete color levels
 9 |     vec3 band = floor(fragColor.rgb * bandCount);
10 |     outColor = vec4(band / (bandCount - 1.0), 1.0);
11 | }


--------------------------------------------------------------------------------
/AcceleratedKernels.jl/Project/Project.toml:
--------------------------------------------------------------------------------
 1 | name = "Project"
 2 | uuid = "b047b328-b947-4008-9592-27f6f2e42ee3"
 3 | authors = ["Vincent Therrien"]
 4 | version = "0.1.0"
 5 | 
 6 | [deps]
 7 | AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 8 | CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 9 | GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
10 | 


--------------------------------------------------------------------------------
/directx/graphics/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.10)
 2 | project(DirectX_Project)
 3 | 
 4 | # Set C++ standard
 5 | set(CMAKE_CXX_STANDARD 17)
 6 | 
 7 | # Add the executable
 8 | add_executable(graphics main.cpp)
 9 | 
10 | target_link_libraries(graphics d3d11 dxgi d3dcompiler)
11 | set_target_properties(graphics PROPERTIES LINK_FLAGS "/SUBSYSTEM:WINDOWS")
12 | 


--------------------------------------------------------------------------------
/metal/readme.md:
--------------------------------------------------------------------------------
1 | # Metal
2 | 
3 | Metal is a graphics and general-purpose computing framework developed by Apple that targets GPUs. It
4 | only works on Apple operating systems (Mac / iOS). Metal uses the MSL shading language.
5 | 
6 | I don't have an Apple device so I couldn't test it `:(`. Refer to the page
7 | https://developer.apple.com/documentation/metal/ if you want to know more!
8 | 


--------------------------------------------------------------------------------
/vulkan/graphics/shader.vert:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | 
 3 | layout(location = 0) out vec3 fragColor;
 4 | 
 5 | // Corner positions of the triangle, looked up by vertex index.
 6 | const vec2 kCorners[3] = vec2[3](
 7 |     vec2( 0.0, -0.5),
 8 |     vec2( 0.5,  0.5),
 9 |     vec2(-0.5,  0.5)
10 | );
11 | 
12 | // One primary color per corner, in the same order as kCorners.
13 | const vec3 kCornerColors[3] = vec3[3](
14 |     vec3(1.0, 0.0, 0.0),
15 |     vec3(0.0, 1.0, 0.0),
16 |     vec3(0.0, 0.0, 1.0)
17 | );
18 | 
19 | // Emit the corner selected by gl_VertexIndex and forward its color to the fragment stage.
20 | void main() {
21 |     fragColor = kCornerColors[gl_VertexIndex];
22 |     gl_Position = vec4(kCorners[gl_VertexIndex], 0.0, 1.0);
23 | }
24 | 


--------------------------------------------------------------------------------
/opengl/graphics/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.10)
 2 | project(OpenGL_Project)
 3 | 
 4 | # Set C++ standard
 5 | set(CMAKE_CXX_STANDARD 17)
 6 | 
 7 | # Find OpenGL
 8 | find_package(OpenGL REQUIRED)
 9 | 
10 | # Find GLFW
11 | find_package(glfw3 REQUIRED)
12 | 
13 | # Include directories
14 | include_directories(include)
15 | 
16 | # Create the executable
17 | add_executable(graphics main.cpp src/glad.c)
18 | 
19 | # Link libraries
20 | target_link_libraries(graphics glfw OpenGL::GL)
21 | 


--------------------------------------------------------------------------------
/openmp/openmp.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <vector>
 3 | #include <omp.h>
 4 | 
 5 | /// Sum an array of N ones inside an OpenMP target region and print the result.
 6 | ///
 7 | /// Returns 0 after printing "Sum: <value>" to standard output.
 8 | int main() {
 9 |     const int N = 100000;
10 |     std::vector<float> elements(N, 1.0f);
11 |     float* array = elements.data();
12 |     float sum = 0.0f;
13 | 
14 |     // The map clauses sit on the target construct itself. Under OpenMP 4.5 rules a scalar
15 |     // that is only mapped by an enclosing `target data` region is implicitly firstprivate
16 |     // on the `target` construct, so the reduced value would never reach the host copy.
17 |     #pragma omp target teams distribute parallel for map(to: array[0:N]) map(tofrom: sum) reduction(+:sum)
18 |     for (int i = 0; i < N; ++i) {
19 |         sum += array[i];
20 |     }
21 | 
22 |     std::cout << "Sum: " << sum << std::endl;
23 | 
24 |     return 0;
25 | }
26 | 


--------------------------------------------------------------------------------
/vulkan/graphics/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.10)
 2 | project(Vulkan_Project)
 3 | 
 4 | # Set C++ standard
 5 | set(CMAKE_CXX_STANDARD 17)
 6 | 
 7 | # Find Vulkan
 8 | find_package(Vulkan REQUIRED)
 9 | 
10 | # Find GLFW
11 | find_package(glfw3 REQUIRED)
12 | 
13 | # Find GLM
14 | find_package(glm REQUIRED)
15 | 
16 | # Create the executable
17 | add_executable(graphics main.cpp)
18 | 
19 | # Link libraries
20 | target_link_libraries(graphics Vulkan::Vulkan glfw)
21 | 
22 | # Include GLM
23 | target_include_directories(graphics PRIVATE ${GLM_INCLUDE_DIRS})
24 | target_compile_definitions(graphics PRIVATE ${GLM_DEFINITIONS})


--------------------------------------------------------------------------------
/directx/computing/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.10)
 2 | project(DirectCompute_Project)
 3 | 
 4 | # Set C++ standard
 5 | set(CMAKE_CXX_STANDARD 17)
 6 | set(CMAKE_CXX_STANDARD_REQUIRED True)
 7 | 
 8 | # Define the source files
 9 | set(SOURCES main.cpp)
10 | 
11 | # Define the executable
12 | add_executable(computing ${SOURCES})
13 | 
14 | # Link against DirectX 11 libraries
15 | target_link_libraries(computing d3d11 d3dcompiler)
16 | 
17 | # Copy the compute shader to the output directory
18 | add_custom_command(TARGET computing POST_BUILD
19 |     COMMAND ${CMAKE_COMMAND} -E copy_if_different
20 |         ${CMAKE_SOURCE_DIR}/sum.hlsl
21 |         $<TARGET_FILE_DIR:computing>/sum.hlsl
22 | )
23 | 


--------------------------------------------------------------------------------
/triton/readme.md:
--------------------------------------------------------------------------------
 1 | # Triton
 2 | 
 3 | Triton is a high-level GPU programming API developed by OpenAI that targets neural network
 4 | acceleration. It uses decorators in Python code to mark computations to accelerate. Since it uses
 5 | CUDA as a backend, it is only supported on Nvidia GPUs. Also, it only works on Linux / WSL (as of
 6 | March 2025). This project implements sum reduction and a softmax function.
 7 | 
 8 | Relevant links:
 9 | 
10 | - OpenAI blog post: https://openai.com/index/triton/
11 | - Github repository: https://github.com/triton-lang/triton
12 | 
13 | Run the following commands to try Triton.
14 | 
15 | ```
16 | pip install numpy
17 | pip install torch
18 | pip install triton
19 | python3 main.py
20 | ```
21 | 


--------------------------------------------------------------------------------
/opengl/computing/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.10)
 2 | project(OpenGL_Project)
 3 | 
 4 | # Set C++ standard
 5 | set(CMAKE_CXX_STANDARD 17)
 6 | 
 7 | # Find OpenGL
 8 | find_package(OpenGL REQUIRED)
 9 | 
10 | # Find GLFW
11 | find_package(glfw3 REQUIRED)
12 | 
13 | # Find GLEW
14 | find_package(PkgConfig REQUIRED)
15 | pkg_check_modules(GLEW REQUIRED glew)
16 | 
17 | # Add the executable
18 | add_executable(computing main.cpp)
19 | 
20 | # Include directories
21 | target_include_directories(computing PRIVATE ${GLEW_INCLUDE_DIRS})
22 | target_include_directories(computing PRIVATE ${GLFW3_INCLUDE_DIRS})
23 | 
24 | # Link libraries
25 | target_link_libraries(computing PRIVATE ${GLEW_LIBRARIES} glfw OpenGL::GL)
26 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Generated by Cargo
 2 | # will have compiled files and executables
 3 | debug/
 4 | target/
 5 | 
 6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
 7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
 8 | Cargo.lock
 9 | 
10 | # These are backup files generated by rustfmt
11 | **/*.rs.bk
12 | 
13 | # MSVC Windows builds of rustc generate these, which store debugging information
14 | *.pdb
15 | 
16 | # RustRover
17 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
18 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
19 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
20 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
21 | #.idea/


--------------------------------------------------------------------------------
/webgpu/graphics/src/shader.wgsl:
--------------------------------------------------------------------------------
 1 | // Values handed from the vertex stage to the fragment stage.
 2 | struct VertexPayload {
 3 |     @builtin(position) position: vec4<f32>,
 4 |     @location(0) color: vec3<f32>,
 5 | };
 6 | 
 7 | // Vertex stage: pick one of three hard-coded corners (and its color) by vertex index.
 8 | @vertex
 9 | fn vertices(@builtin(vertex_index) i: u32) -> VertexPayload {
10 |     var corners = array<vec2<f32>, 3>(
11 |         vec2<f32>(-0.75, -0.75),
12 |         vec2<f32>( 0.75, -0.75),
13 |         vec2<f32>(  0.0,  0.75),
14 |     );
15 |     var palette = array<vec3<f32>, 3>(
16 |         vec3<f32>(1.0, 0.0, 0.0),
17 |         vec3<f32>(0.0, 1.0, 0.0),
18 |         vec3<f32>(0.0, 0.0, 1.0),
19 |     );
20 | 
21 |     return VertexPayload(vec4<f32>(corners[i], 0.0, 1.0), palette[i]);
22 | }
23 | 
24 | // Fragment stage: posterize the interpolated color into discrete bands, fully opaque.
25 | @fragment
26 | fn fragment(in: VertexPayload) -> @location(0) vec4<f32> {
27 |     let band_count: f32 = 10.0;
28 |     let band: vec3<f32> = floor(in.color * band_count);
29 |     return vec4<f32>(band / (band_count - 1.0), 1.0);
30 | }
31 | 


--------------------------------------------------------------------------------
/AcceleratedKernels.jl/Project/src/Project.jl:
--------------------------------------------------------------------------------
 1 | import AcceleratedKernels as AK
 2 | using CUDA, GPUArrays
 3 | 
 4 | # Define the reduction operator: the sum of two elements.
 5 | f = (a, b) -> a + b
 6 | 
 7 | # Define a neutral element for the operation. This is used when there are more threads than elements
 8 | # to reduce. Some elements will be reduced with `0` when they cannot be paired with another element.
 9 | GPUArrays.neutral_element(::typeof(f), ::Type{T}) where T = zero(T)
10 | 
11 | # Create a Float32 vector in [-1.0, 1.0). The literals 2f0 / 1f0 avoid promotion to Float64.
12 | v = CuArray(rand(Float32, 10000) .* 2f0 .- 1f0)
13 | 
14 | # Apply the function f to elements in v to produce the result. `init` defines the initial value of
15 | # the reduction.
16 | vsum = AK.reduce(f, v; init=zero(eltype(v)))
17 | 
18 | # The result of the sum follows a shifted Irwin–Hall distribution. Since there are 10000 elements,
19 | # std = sqrt(10000 / 3) ~ 57.7, so roughly two thirds of runs fall in the range [-58, 58].
20 | print(vsum)
21 | 


--------------------------------------------------------------------------------
/directx/computing/sum.hlsl:
--------------------------------------------------------------------------------
 1 | // Thread group size; also the length of the groupshared scratch array below
 2 | #define GROUP_SIZE 256
 3 | 
 4 | // Input buffer (u0) is read once per thread; output buffer (u1) receives one partial sum per group
 5 | RWStructuredBuffer<float> inputBuffer : register(u0);
 6 | RWStructuredBuffer<float> outputBuffer : register(u1);
 7 | 
 8 | // Shared memory for local reduction: one slot per thread of the group
 9 | groupshared float localSum[GROUP_SIZE];
10 | 
11 | [numthreads(GROUP_SIZE, 1, 1)]
12 | void main(uint3 threadID : SV_DispatchThreadID, uint3 groupID : SV_GroupID, uint3 localID : SV_GroupThreadID) {
13 |     // Load data into shared memory, then wait until every thread of the group has stored its value
14 |     localSum[localID.x] = inputBuffer[threadID.x]; // NOTE(review): no bounds check; presumably the dispatch matches the buffer length, confirm in main.cpp
15 |     GroupMemoryBarrierWithGroupSync();
16 | 
17 |     // Parallel reduction within the group: the stride halves on each pass (128, 64, ..., 1)
18 |     for (uint stride = GROUP_SIZE / 2; stride > 0; stride /= 2) {
19 |         if (localID.x < stride) {
20 |             localSum[localID.x] += localSum[localID.x + stride];
21 |         }
22 |         GroupMemoryBarrierWithGroupSync(); // reached by every thread, including those that skipped the add
23 |     }
24 | 
25 |     // Store the partial sum from each group into the output buffer (written by local thread 0 only)
26 |     if (localID.x == 0) {
27 |         outputBuffer[groupID.x] = localSum[0];
28 |     }
29 | }
30 | 


--------------------------------------------------------------------------------
/sycl/main.cpp:
--------------------------------------------------------------------------------
 1 | // This code is not functional yet! I have not managed to compile it with oneAPI.
 2 | 
 3 | // SYCL 2020 header; <CL/sycl.hpp> is only kept by implementations for backward compatibility.
 4 | #include <sycl/sycl.hpp>
 5 | 
 6 | #include <iostream>
 7 | #include <vector>
 8 | 
 9 | /// Fill a 1024-element array with its own indices on the default SYCL device and print
10 | /// the first ten values.
11 | ///
12 | /// Returns 0 after printing the device name and the values to standard output.
13 | int main() {
14 |     sycl::queue q; // Create a SYCL queue (chooses a default device)
15 | 
16 |     std::cout << "Running on: " << q.get_device().get_info<sycl::info::device::name>() << "\n";
17 | 
18 |     const int N = 1024;
19 |     std::vector<int> data(N, 1); // Initialize array with 1s
20 | 
21 |     {
22 |         sycl::buffer<int, 1> buf(data.data(), sycl::range<1>(N));
23 |         q.submit([&](sycl::handler &h) {
24 |             sycl::accessor acc(buf, h, sycl::write_only, sycl::no_init);
25 |             h.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
26 |                 // i[0] is a size_t; the cast makes the narrowing to int explicit.
27 |                 acc[i] = static_cast<int>(i[0]); // Set each element to its index
28 |             });
29 |         });
30 |     } // Buffer goes out of scope -> Data is copied back
31 | 
32 |     std::cout << "First 10 elements: ";
33 |     for (int i = 0; i < 10; i++) {
34 |         std::cout << data[i] << " ";
35 |     }
36 |     std::cout << "\n";
37 | 
38 |     return 0;
39 | }
40 | 


--------------------------------------------------------------------------------
/cpu/rust/src/task3.rs:
--------------------------------------------------------------------------------
 1 | use ndarray::Array;
 2 | use ndarray_rand::RandomExt;
 3 | use ndarray_rand::rand_distr::Uniform;
 4 | use std::time::Instant;
 5 | use floating_duration::TimeAsFloat;
 6 | use std::thread;
 7 | use std::sync::Arc;
 8 | 
 9 | /// Compute the softmax of `numbers` with `num_threads` worker threads.
10 | ///
11 | /// The input is split into contiguous chunks, one per thread. A first pass finds the global
12 | /// maximum, which is subtracted before exponentiating so that `exp` cannot overflow. A second
13 | /// pass exponentiates each chunk and returns a partial sum. The results are then divided by
14 | /// the total while the chunks are concatenated in their original order.
15 | ///
16 | /// A thread count of zero is treated as one. An empty input yields an empty output.
17 | fn parallel_softmax(numbers: Vec<f32>, num_threads: usize) -> Vec<f32> {
18 |     if numbers.is_empty() {
19 |         return Vec::new();
20 |     }
21 |     // Zero threads would make the chunk-size division below panic.
22 |     let num_threads = num_threads.max(1);
23 |     let chunk_size = (numbers.len() + num_threads - 1) / num_threads;
24 |     let numbers = Arc::new(numbers);
25 | 
26 |     // Pass 1: per-chunk maxima, folded into the global maximum.
27 |     // `collect` spawns every thread before any of them is joined.
28 |     let max_handles: Vec<_> = (0..num_threads)
29 |         .map(|i| {
30 |             let numbers = Arc::clone(&numbers);
31 |             thread::spawn(move || {
32 |                 numbers
33 |                     .iter()
34 |                     .skip(i * chunk_size)
35 |                     .take(chunk_size)
36 |                     .fold(f32::NEG_INFINITY, |acc, &x| acc.max(x))
37 |             })
38 |         })
39 |         .collect();
40 |     let max = max_handles
41 |         .into_iter()
42 |         .map(|h| h.join().unwrap())
43 |         .fold(f32::NEG_INFINITY, f32::max);
44 | 
45 |     // Pass 2: exponentiate each chunk and return it together with its partial sum.
46 |     let exp_handles: Vec<_> = (0..num_threads)
47 |         .map(|i| {
48 |             let numbers = Arc::clone(&numbers);
49 |             thread::spawn(move || {
50 |                 let exps: Vec<f32> = numbers
51 |                     .iter()
52 |                     .skip(i * chunk_size)
53 |                     .take(chunk_size)
54 |                     .map(|&x| (x - max).exp())
55 |                     .collect();
56 |                 let partial: f32 = exps.iter().sum();
57 |                 (exps, partial)
58 |             })
59 |         })
60 |         .collect();
61 |     let chunks: Vec<(Vec<f32>, f32)> = exp_handles
62 |         .into_iter()
63 |         .map(|h| h.join().unwrap())
64 |         .collect();
65 | 
66 |     // The maximum element contributes exp(0) = 1, so the total is never zero.
67 |     let total: f32 = chunks.iter().map(|(_, partial)| *partial).sum();
68 |     chunks
69 |         .into_iter()
70 |         .flat_map(|(exps, _)| exps)
71 |         .map(|e| e / total)
72 |         .collect()
73 | }
74 | 
75 | /// Task 3: Softmax function.
76 | ///
77 | /// Generates `n` random values in [-10, 10), runs the softmax over them with `threads`
78 | /// worker threads and returns the elapsed wall-clock time in seconds. Input generation and
79 | /// the release of the result vector are not part of the measured interval.
80 | pub fn task_3(n: u32, threads: u32) -> f64 {
81 |     let data = Array::random((n as usize, ), Uniform::new(-10.0, 10.0)).to_vec();
82 |     let now = Instant::now();
83 |     let probabilities = parallel_softmax(data, threads as usize);
84 |     let elapsed = now.elapsed().as_fractional_secs();
85 |     drop(probabilities);
86 |     elapsed
87 | }
88 | 


--------------------------------------------------------------------------------
/cuda/readme.md:
--------------------------------------------------------------------------------
 1 | # CUDA
 2 | 
 3 | CUDA is a GPU programming framework by Nvidia that works only on their GPUs. You can find more
 4 | information at https://developer.nvidia.com/cuda-toolkit. This project uses CUDA to accelerate
 5 | sum reduction.
 6 | 
 7 | CUDA was released in 2007 and is currently used in several machine learning projects, like PyTorch
 8 | and Tensorflow. It is efficient and convenient to use, but since it is restricted to Nvidia GPUs,
 9 | some projects look into alternatives to develop cross-platform applications. For instance, llama.cpp
10 | (https://github.com/ggml-org/llama.cpp) uses multiple GPU backends to support other platforms in
11 | addition to Nvidia GPUs.
12 | 
13 | 
14 | ## Build
15 | 
16 | On Linux, run the following commands:
17 | 
18 | ```
19 | sudo apt install nvidia-cuda-toolkit
20 | mkdir build
21 | cd build
22 | nvcc ../main.cu -o main
23 | ./main
24 | ```
25 | 
26 | 
27 | On Windows, download CUDA from https://developer.nvidia.com/cuda-downloads and run the following
28 | commands:
29 | 
30 | ```
31 | mkdir build
32 | cd build
33 | nvcc ..\main.cu -o main
34 | main.exe
35 | ```
36 | 


--------------------------------------------------------------------------------
/cpu/rust/src/task1.rs:
--------------------------------------------------------------------------------
 1 | use ndarray::Array;
 2 | use ndarray_rand::RandomExt;
 3 | use ndarray_rand::rand_distr::Uniform;
 4 | use std::time::Instant;
 5 | use floating_duration::TimeAsFloat;
 6 | use std::thread;
 7 | use std::sync::Arc;
 8 | 
 9 | /// Sum `numbers` with `num_threads` worker threads.
10 | ///
11 | /// The input is split into contiguous chunks of equal size (the last one may be shorter or
12 | /// empty). Each thread sums one chunk and the partial sums are added on the calling thread.
13 | /// A thread count of zero is treated as one.
14 | fn parallel_sum(numbers: Vec<f32>, num_threads: usize) -> f32 {
15 |     // Zero threads would make the chunk-size division below panic.
16 |     let num_threads = num_threads.max(1);
17 |     let numbers = Arc::new(numbers);
18 |     let chunk_size = (numbers.len() + num_threads - 1) / num_threads;
19 | 
20 |     // `collect` spawns every thread before any of them is joined.
21 |     let handles: Vec<_> = (0..num_threads)
22 |         .map(|i| {
23 |             let numbers = Arc::clone(&numbers);
24 |             thread::spawn(move || {
25 |                 numbers.iter().skip(i * chunk_size).take(chunk_size).sum::<f32>()
26 |             })
27 |         })
28 |         .collect();
29 | 
30 |     handles.into_iter().map(|h| h.join().unwrap()).sum()
31 | }
32 | 
33 | /// Task 1: Compute the sum of elements in a 1D array.
34 | ///
35 | /// Generates `n` random values in [-1, 1), sums them with `threads` worker threads and
36 | /// returns the elapsed wall-clock time in seconds. Input generation is not timed.
37 | pub fn task_1(n: u32, threads: u32) -> f64 {
38 |     let data = Array::random((n as usize, ), Uniform::new(-1.0, 1.0)).to_vec();
39 |     let now = Instant::now();
40 |     {
41 |         parallel_sum(data, threads as usize);
42 |     }
43 |     now.elapsed().as_fractional_secs()
44 | }
45 | 


--------------------------------------------------------------------------------
/webgpu/compute/src/shader.wgsl:
--------------------------------------------------------------------------------
 1 | struct DataBuf {
 2 |     data: array<f32>, // runtime-sized array; arrayLength() below reports its element count
 3 | }
 4 | 
 5 | @group(0)
 6 | @binding(0)
 7 | var<storage, read> inputBuffer: DataBuf;
 8 | 
 9 | @group(0)
10 | @binding(1)
11 | var<storage, read_write> outputBuffer: DataBuf;
12 | 
13 | @compute
14 | @workgroup_size(64)
15 | fn main(@builtin(global_invocation_id) global_id: vec3<u32>,
16 |         @builtin(local_invocation_id) local_id: vec3<u32>,
17 |         @builtin(workgroup_id) workgroup_id: vec3<u32>) {
18 | 
19 |     let index = global_id.x;
20 |     let local_index = local_id.x;
21 |     let workgroup_index = workgroup_id.x;
22 | 
23 |     var offset: u32 = 0;
24 |     while (offset < 64) { // phase 1: invocation 0 of each workgroup adds that group's 64 inputs one by one
25 |         if (local_index == 0) {
26 |             outputBuffer.data[workgroup_index * 64] += inputBuffer.data[workgroup_index * 64 + offset]; // accumulates in place; assumes the output buffer starts zeroed, TODO confirm in main.rs
27 |         }
28 |         offset += 1;
29 |     }
30 |     workgroupBarrier(); // NOTE(review): only synchronizes invocations of this workgroup, not other workgroups
31 | 
32 |     offset = 64;
33 |     while (offset < arrayLength(&inputBuffer.data)) { // phase 2: global invocation 0 folds every 64th output slot into slot 0
34 |         if (index == 0) {
35 |             outputBuffer.data[0] += outputBuffer.data[offset]; // NOTE(review): reads partial sums written by other workgroups with no ordering guarantee; looks like a race, verify
36 |         }
37 |         offset += 64;
38 |     }
39 | }
40 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 Vincent-Therrien
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/opencl/readme.md:
--------------------------------------------------------------------------------
 1 | # OpenCL
 2 | 
 3 | OpenCL is a specification by the Khronos Group intended to accelerate programs across heterogeneous
 4 | platforms, like GPUs, but also CPUs, DSPs, and FPGAs. This project uses the Rust library `ocl` to
 5 | compile the example, but you can use it with other languages. OpenCL uses the OpenCL C language for
 6 | its kernels, which is based on C99.
 7 | 
 8 | I have two bones to pick with OpenCL:
 9 | 
10 | - OpenCL is stuck in 2011. The last major release (3.0, 2020) defines OpenCL 1.2 (2011) as a
11 |   mandatory baseline. Newer features implemented in OpenCL 2.X releases are optional! Consequently,
12 |   OpenCL is behind competitors in terms of productivity.
13 | - OpenCL has been described as generally slower than CUDA, but you can minimize the difference by
14 |   optimizing your kernels (https://ieeexplore.ieee.org/document/6047190).
15 | 
16 | 
17 | ## Build
18 | 
19 | Run:
20 | 
21 | ```
22 | cargo run
23 | ```
24 | 
25 | Cargo and the OpenCL runtime must be installed on your system. To install cargo, refer to
26 | https://doc.rust-lang.org/cargo/getting-started/installation.html. For OpenCL, install the driver
27 | package of your GPU manufacturer.
28 | 


--------------------------------------------------------------------------------
/cpu/cpp/cpu.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <string>
 3 | 
 4 | #include "task1.hpp"
 5 | #include "task2.hpp"
 6 | #include "task3.hpp"
 7 | 
 8 | /// Print the command-line synopsis to standard output.
 9 | ///
10 | /// The argument order shown matches the order in which main() parses argv.
11 | void usage() {
12 |     std::cout << "Usage:" << std::endl;
13 |     std::cout << "cpu <task> <n> <iterations> <threads>" << std::endl;
14 | }
15 | 
16 | /// Run one benchmark task repeatedly and print its mean duration.
17 | ///
18 | /// Positional arguments: argv[1] = task (1, 2 or 3), argv[2] = n, argv[3] = iterations
19 | /// (at least 1), argv[4] = threads. `n` and `threads` are forwarded to the task function
20 | /// unchanged.
21 | ///
22 | /// Returns 0 on success. Prints the usage text and returns 1 when an argument is missing,
23 | /// is not an integer, names an unknown task, or requests fewer than one iteration.
24 | int main(int argc, char *argv[]) {
25 |     if (argc < 5) {
26 |         usage();
27 |         return 1;
28 |     }
29 |     int task, n, n_threads, iterations;
30 |     try {
31 |         task = std::stoi(std::string(argv[1]));
32 |         n = std::stoi(std::string(argv[2]));
33 |         iterations = std::stoi(std::string(argv[3]));
34 |         n_threads = std::stoi(std::string(argv[4]));
35 |     }
36 |     catch (...) {
37 |         // std::stoi throws on non-numeric or out-of-range input.
38 |         usage();
39 |         return 1;
40 |     }
41 |     // Validate before the loop so that a bad task is reported as a failure, and so that a
42 |     // non-positive iteration count can neither wrap to a huge loop bound nor divide by zero.
43 |     if (task < 1 || task > 3 || iterations < 1) {
44 |         usage();
45 |         return 1;
46 |     }
47 |     double duration = 0.0;
48 |     for (int i = 0; i < iterations; i++) {
49 |         if (task == 1) {
50 |             duration += task_1(n, n_threads);
51 |         }
52 |         else if (task == 2) {
53 |             duration += task_2(n, n_threads);
54 |         }
55 |         else {
56 |             duration += task_3(n, n_threads);
57 |         }
58 |     }
59 |     // The label promises an average, so divide the accumulated time by the run count.
60 |     std::cout << "Average duration (s): " << duration / iterations << std::endl;
61 |     return 0;
62 | }
63 | 


--------------------------------------------------------------------------------
/opencl/src/main.rs:
--------------------------------------------------------------------------------
 1 | /// Simple example with OpenCL. The function creates a vector and modifies its value on GPU.
 2 | 
 3 | use ocl::core;
 4 | use ocl::ProQue;
 5 | 
 6 | /// Upload a vector of ones, run the `add` kernel over it and read it back.
 7 | ///
 8 | /// Prints the value at index 60 before and after the kernel has run. Despite its name the
 9 | /// function performs no reduction; the kernel adds a constant to every element.
10 | ///
11 | /// Returns any error reported while building the program, transferring data or running
12 | /// the kernel.
13 | fn sum_reduce() -> ocl::Result<()> {
14 |     const TEST_KERNEL_SOURCE: &str = include_str!("./test_kernel.cl");
15 |     let pro_que = ProQue::builder()
16 |         .src(TEST_KERNEL_SOURCE)
17 |         .dims(1 << 20)
18 |         .build()?;
19 | 
20 |     let buffer = pro_que.create_buffer::<f32>()?;
21 |     let vec_i = vec![1.0; buffer.len()];
22 |     let mut event = core::Event::null();
23 |     println!("The value at index [{}] is initially '{}'.", 60, vec_i[60]);
24 |     unsafe {
25 |         // Blocking write: `vec_i` must not be freed while the transfer is in flight, which
26 |         // could happen on an early return below. The result is propagated, not discarded.
27 |         core::enqueue_write_buffer(
28 |             &pro_que,
29 |             &buffer,
30 |             true,
31 |             0,
32 |             &vec_i,
33 |             None::<core::Event>,
34 |             Some(&mut event),
35 |         )?;
36 |         let kernel = pro_que.kernel_builder("add").arg(&buffer).build()?;
37 |         kernel.enq()?;
38 |     }
39 | 
40 |     let mut vec = vec![0.0; buffer.len()];
41 |     buffer.read(&mut vec).enq()?;
42 |     println!("The value at index [{}] is now '{}'!", 60, vec[60]);
43 |     Ok(())
44 | }
45 | 
46 | /// Placeholder: currently does nothing and is not called.
47 | fn list_devices() {
48 | 
49 | }
50 | 
51 | /// Run the example; report a failure on standard error and exit with a non-zero status.
52 | fn main() {
53 |     if let Err(error) = sum_reduce() {
54 |         eprintln!("OpenCL example failed: {}", error);
55 |         std::process::exit(1);
56 |     }
57 | }
58 | 


--------------------------------------------------------------------------------
/AcceleratedKernels.jl/readme.md:
--------------------------------------------------------------------------------
 1 | # AcceleratedKernels.jl
 2 | 
 3 | AcceleratedKernels.jl (https://github.com/JuliaGPU/AcceleratedKernels.jl/tree/main) is a Julia
 4 | package for parallel computation on CPU and GPUs. It supports multiple platforms. Visit the page
 5 | https://juliagpu.github.io/AcceleratedKernels.jl/stable/ for explanations.
 6 | 
 7 | I'd classify AcceleratedKernels.jl as a "high-level" interface similar to Triton because it lets
 8 | developers mix CPU and GPU instructions in the same programming language instead of dividing a
 9 | program into regular code and kernel code (e.g. C / GLSL). Julia is appreciated in scientific
10 | computing for its simplicity and performance, so support for GPU programming that further enhances
11 | its performance fits nicely with the language.
12 | 
13 | 
14 | ## Run the Example
15 | 
16 | - Install Julia if it's not on your system: https://julialang.org/install/.
17 | - Modify the requirements in the file `Project.toml` to target your GPU runtime. I used `CUDA`, but
18 |   you might have to use `oneAPI` or `ROCm`, for instance. Refer to https://juliagpu.github.io/AcceleratedKernels.jl/stable/api/using_backends/
19 |   to view the supported backends.
20 | - Open a terminal in the directory `Project`.
21 | - Install the requirements: enter the REPL by running `julia` in a terminal, type `]`, and run
22 |   `instantiate`.
23 | - Exit the Julia REPL.
24 | - Run `julia src/Project.jl`.
25 | 


--------------------------------------------------------------------------------
/directx/readme.md:
--------------------------------------------------------------------------------
 1 | # DirectX
 2 | 
 3 | DirectX is a collection of APIs developed by Microsoft to handle graphics, sound effects, and other
 4 | multimedia tasks. This project uses Direct3D for graphics and DirectCompute for GPGPU. It works
 5 | only on Windows `:(`.
 6 | 
 7 | DirectX uses the HLSL shading language.
 8 | 
 9 | Relevant links:
10 | 
11 | - Rendering pipeline: https://learn.microsoft.com/en-us/windows/win32/direct3d12/direct3d-12-graphics
12 | - Compute shaders: https://learn.microsoft.com/en-us/windows/win32/direct3d11/direct3d-11-advanced-stages-compute-shader
13 | 
14 | 
15 | ## Graphics
16 | 
17 | The directory `graphics` is a self-contained C++ project that uses Direct3D to open a window and
18 | display simple graphics. Direct3D uses HLSL (High-Level Shader Language) to write shaders.
19 | 
20 | The following snippet shows how to build and run it. This only works on Windows!
21 | 
22 | ```
23 | cd graphics
24 | mkdir build
25 | cd build
26 | cmake ..
27 | cmake --build . --config Release
28 | Release\graphics.exe
29 | ```
30 | 
31 | 
32 | ## Computing Example
33 | 
34 | The directory `computing` is a self-contained C++ project that uses DirectCompute to accelerate
35 | parallel computations.
36 | 
37 | The following snippet shows how to build and run it. This only works on Windows!
38 | 
39 | ```
40 | cd computing
41 | mkdir build
42 | cd build
43 | cmake ..
44 | cmake --build . --config Release
45 | copy ..\sum.hlsl Release\sum.hlsl
46 | cd Release
47 | computing.exe
48 | ```
49 | 


--------------------------------------------------------------------------------
/cpu/rust/src/main.rs:
--------------------------------------------------------------------------------
 1 | use clap::Parser;
 2 | mod task1;
 3 | mod task2;
 4 | mod task3;
 5 | 
 6 | /// Execute a multi-core accelerated program.
 7 | #[derive(Parser)] // NOTE(review): clap presumably turns the `///` comments below into --help text; confirm before rewording them.
 8 | #[command(version, about, long_about = None, arg_required_else_help = false)]
 9 | struct Cli {
10 |     /// Task to execute. One of `1`, `2`, or `3`.
11 |     #[arg(long, required = false)]
12 |     task: Option<u32>, // `None` when the option is omitted; `main` then uses 1.
13 | 
14 |     /// Input dimension.
15 |     #[arg(long, required = false)]
16 |     n: Option<u32>, // `None` when the option is omitted; `main` then uses 1000.
17 | 
18 |     /// Number of iterations to compute the mean duration.
19 |     #[arg(long, required = false)]
20 |     iterations: Option<u32>, // `None` when the option is omitted; `main` then uses 1.
21 | 
22 |     /// Number of threads.
23 |     #[arg(long, required = false)]
24 |     threads: Option<u32>, // `None` when the option is omitted; `main` then uses 1.
25 | }
26 | 
27 | /// Program entry point.
28 | ///
29 | /// Parses the command line, runs the selected task `iterations` times, and prints the mean of
30 | /// the values returned by the task functions, labelled as a duration in seconds.
31 | fn main() {
32 |     let cli = Cli::parse();
33 |     // Every option is optional; defaults: task 1, n = 1000, one iteration, one thread.
34 |     let task = cli.task.unwrap_or(1);
35 |     let n = cli.n.unwrap_or(1000);
36 |     let iterations = cli.iterations.unwrap_or(1);
37 |     let threads = cli.threads.unwrap_or(1);
38 | 
39 |     // Reject zero up front: the mean below divides by `iterations` and would print NaN.
40 |     if iterations == 0 {
41 |         eprintln!("--iterations must be at least 1.");
42 |         std::process::exit(2);
43 |     }
44 | 
45 |     // Accumulate the per-iteration results; an unknown task number panics on the first iteration.
46 |     let mut total_duration = 0.0;
47 |     for _ in 0..iterations {
48 |         let duration = match task {
49 |             1 => task1::task_1(n, threads),
50 |             2 => task2::task_2(n, threads),
51 |             3 => task3::task_3(n, threads),
52 |             _ => panic!("Invalid task."),
53 |         };
54 |         total_duration += duration;
55 |     }
56 |     let average = total_duration / (iterations as f64);
57 |     println!("Average duration (s): {}", average);
58 | }
59 | 


--------------------------------------------------------------------------------
/webgpu/readme.md:
--------------------------------------------------------------------------------
 1 | # WebGPU
 2 | 
 3 | WebGPU is a GPU API that can use Vulkan, DirectX, or Metal as its backend, making it truly
 4 | multi-platform. It uses the WGSL shading language.
 5 | 
 6 | WebGPU has some limitations. For instance, it only supports 32-bit values, so you have to find weird
 7 | workarounds to use FP16 numbers. The project burn (https://github.com/tracel-ai/burn) uses WebGPU
 8 | to accelerate computations because it is cross-platform, but it sometimes uses SPIR-V to perform
 9 | some optimizations. But maybe the specification will evolve to support that more easily.
10 | 
11 | Relevant links:
12 | 
13 | - WebGPU specification: https://www.w3.org/TR/webgpu/
14 | - wgpu (Rust library): https://github.com/gfx-rs/wgpu
15 | 
16 | 
17 | ## Graphics Example
18 | 
19 | The directory `graphics` is a self-contained Rust program that uses the `wgpu` library to display
20 | simple graphics. It is based on the project https://github.com/sotrh/learn-wgpu, licensed under the
21 | MIT license.
22 | 
23 | To run the example, execute the following instructions:
24 | 
25 | ```
26 | cd graphics
27 | cargo run
28 | ```
29 | 
30 | 
31 | ## Compute Example
32 | 
33 | The directory `compute` is a self-contained Rust program that uses the `wgpu` library to run
34 | compute shaders. It is based on the project https://github.com/googlefonts/compute-shader-101,
35 | licensed under the MIT license. This current project **sums the elements in an array** with a
36 | compute shader. This code is not optimized! It should use sum reduction, but it uses fixed-length
37 | instead. Refer to the `opengl` compute example in this repository or the file
38 | https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf to see how to program sum
39 | reduction.
40 | 
41 | To run the example, execute the following instructions:
42 | 
43 | ```
44 | cd compute
45 | cargo run
46 | ```
47 | 


--------------------------------------------------------------------------------
/cpu/cpp/task1.hpp:
--------------------------------------------------------------------------------
 1 | #include <chrono>
 2 | #include <random>
 3 | #include <algorithm>
 4 | #include <iterator>
 5 | #include <vector>
 6 | #include <thread>
 7 | 
 8 | #ifndef CACHE_LINE_SIZE
 9 | #define CACHE_LINE_SIZE 128 // Pad the result vector to avoid false sharing.
10 | #endif
11 | 
12 | /// @brief Adds one contiguous slice of `a` to the accumulator reserved for that slice.
13 | /// @param a Input values; the slice [chunk * n_elements, (chunk + 1) * n_elements), clipped to a.size(), is read.
14 | /// @param chunk Zero-based index of the slice to process.
15 | /// @param n_elements Number of elements per slice.
16 | /// @param results Accumulators; slice `chunk` adds into results[chunk * CACHE_LINE_SIZE].
17 | void partial_sum(const std::vector<float> &a, int chunk, int n_elements, std::vector<float> &results)
18 | {
19 |     const int first = chunk * n_elements;
20 |     const int last = std::min(first + n_elements, (int)a.size());
21 |     if (first >= last) return;  // Empty slice: leave the accumulator untouched.
22 |     std::for_each(a.begin() + first, a.begin() + last,
23 |                   [&results, chunk](float value) { results[chunk * CACHE_LINE_SIZE] += value; });
24 | }
25 | 
26 | /// @brief Sums the elements of `a` with `n_threads` threads, one contiguous chunk per thread.
27 | /// @param a Values to add up.
28 | /// @param n_threads Number of threads; values below 1 are treated as 1.
29 | /// @return The total, built by adding the per-chunk sums in chunk order.
30 | float parallel_sum(std::vector<float> &a, int n_threads)
31 | {
32 |     n_threads = std::max(n_threads, 1);  // 0 did no work; a negative count wrapped into a huge resize.
33 |     if (n_threads == 1) {
34 |         std::vector<float> single(1, 0.0f);  // Sum inline: no thread to spawn, no padding needed.
35 |         partial_sum(a, 0, (int)a.size(), single);
36 |         return single[0];
37 |     }
38 |     // One accumulator per thread, CACHE_LINE_SIZE floats apart (padding against false sharing).
39 |     std::vector<float> results(n_threads * CACHE_LINE_SIZE, 0.0f);
40 |     const int n_elements = (a.size() + n_threads - 1) / n_threads;  // Ceiling division, computed once.
41 |     std::vector<std::thread> threads;
42 |     for (int t = 0; t < n_threads; t++) {
43 |         threads.emplace_back(partial_sum, std::ref(a), t, n_elements, std::ref(results));
44 |     }
45 |     float result = 0.0f;
46 |     for (int t = 0; t < n_threads; t++) {
47 |         threads[t].join();
48 |         result += results[t * CACHE_LINE_SIZE];
49 |     }
50 |     return result;
51 | }
52 | /// @brief Times one call to parallel_sum on `n` random floats.
53 | /// @param n Number of input values.
54 | /// @param n_threads Thread count forwarded to parallel_sum.
55 | /// @return Elapsed time in seconds, measured with microsecond resolution.
56 | double task_1(int n, int n_threads)
57 | {
58 |     std::vector<float> a(n);
59 |     std::generate(a.begin(), a.end(), [] { return (((float)rand() / RAND_MAX) - 0.5) * 2.0; });
60 |     const auto begin = std::chrono::steady_clock::now();  // The random fill above is not timed.
61 |     parallel_sum(a, n_threads);
62 |     const auto end = std::chrono::steady_clock::now();
63 |     return std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000000.0;
64 | }
65 | 


--------------------------------------------------------------------------------
/opengl/readme.md:
--------------------------------------------------------------------------------
 1 | # OpenGL
 2 | 
 3 | OpenGL is an application programming interface designed for computer graphics, but it also supports
 4 | general-purpose computing through compute shaders. OpenGL uses GLSL to write shaders.
 5 | 
 6 | Relevant links:
 7 | 
 8 | - History of OpenGL: https://www.khronos.org/opengl/wiki/History_of_OpenGL
 9 | - Rendering pipeline: https://www.khronos.org/opengl/wiki/Rendering_Pipeline_Overview
10 | - Compute shaders: https://www.khronos.org/opengl/wiki/Compute_Shader
11 | 
12 | 
13 | ## Graphics Example
14 | 
15 | The directory `graphics` is a self-contained C++ project that uses OpenGL and additional libraries
16 | to open a window and display simple graphics.
17 | 
18 | The following instructions show how to build and run it. This only works on Linux! Use WSL if
19 | necessary!
20 | 
21 | 1. Download the GLAD library at https://glad.dav1d.de/. GLAD is used to load OpenGL functions.
22 | 2. Place the files `glad.h` and `glad.c` in the `graphics` directory.
23 | 3. Run the following commands:
24 | 
25 | ```
26 | cd graphics
27 | mkdir include && mv glad.h include/glad.h  # Place the GLAD header in an include directory.
28 | mkdir src && mv glad.c src/glad.c  # Place the GLAD source file in a source directory.
29 | sudo apt install -y libglew-dev
30 | sudo apt update && sudo apt install -y cmake g++ libglfw3-dev libgl1-mesa-dev xorg-dev
31 | mkdir build
32 | cd build
33 | cmake ..
34 | cmake --build .
35 | ./graphics
36 | ```
37 | 
38 | This project displays a 3D rotating triangle. You can modify the file `graphics/main.cpp` to
39 | understand how it works.
40 | 
41 | 
42 | ## Computing Example
43 | 
44 | The directory `computing` is a self-contained C++ project that uses OpenGL to accelerate parallel
45 | computations. The program computes the sum of elements in an array through sum reduction.
46 | 
47 | The following instructions show how to build and run it. This only works on Linux! Use WSL if
48 | necessary!
49 | 
50 | Run the following commands:
51 | 
52 | ```
53 | cd computing
54 | sudo apt install -y libglew-dev
55 | sudo apt update && sudo apt install -y cmake g++ libglfw3-dev libgl1-mesa-dev xorg-dev
56 | mkdir build
57 | cd build
58 | cmake ..
59 | cmake --build .
60 | ./computing
61 | ```
62 | 


--------------------------------------------------------------------------------
/cuda/main.cu:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <cuda_runtime.h>
 3 | 
 4 | #define N 1024  // Array size (must be a power of 2 for reduction)
 5 | #define THREADS_PER_BLOCK 256
 6 | 
 7 | // CUDA kernel: every block reduces its slice of `input` to one partial sum, output[blockIdx.x].
 8 | // The halving loop expects blockDim.x to be a power of two that is at most THREADS_PER_BLOCK.
 9 | __global__ void sumReduction(float *input, float *output) {
10 |     __shared__ float sharedData[THREADS_PER_BLOCK];
11 |     const int lane = threadIdx.x;
12 |     const int globalIdx = blockIdx.x * blockDim.x + lane;
13 | 
14 |     // Stage one element per thread; threads past the end of the array contribute zero.
15 |     if (globalIdx < N) {
16 |         sharedData[lane] = input[globalIdx];
17 |     } else {
18 |         sharedData[lane] = 0.0f;
19 |     }
20 |     __syncthreads();
21 | 
22 |     // Each round, the first `stride` threads add the element `stride` slots above their own.
23 |     int stride = blockDim.x / 2;
24 |     while (stride > 0) {
25 |         if (lane < stride) sharedData[lane] += sharedData[lane + stride];
26 |         __syncthreads();  // Outside the `if`: every thread of the block reaches the barrier.
27 |         stride >>= 1;
28 |     }
29 |     if (lane == 0) output[blockIdx.x] = sharedData[0];  // Thread 0 stores the block's total.
30 | }
31 | 
32 | // Host function: sums the N floats of h_array on the GPU, then adds the per-block sums on the CPU.
33 | // The first CUDA error is printed to stderr; the steps after it are skipped in that case.
34 | float sumArrayOnGPU(float *h_array) {
35 |     float *d_array = nullptr, *d_partialSums = nullptr;
36 |     int numBlocks = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
37 |     float *h_partialSums = new float[numBlocks]();  // Zero-filled: a failed run returns 0, not garbage.
38 | 
39 |     cudaError_t status = cudaMalloc(&d_array, N * sizeof(float));
40 |     if (status == cudaSuccess) status = cudaMalloc(&d_partialSums, numBlocks * sizeof(float));
41 |     if (status == cudaSuccess) status = cudaMemcpy(d_array, h_array, N * sizeof(float), cudaMemcpyHostToDevice);
42 |     if (status == cudaSuccess) {
43 |         sumReduction<<<numBlocks, THREADS_PER_BLOCK>>>(d_array, d_partialSums);
44 |         status = cudaGetLastError();  // A launch returns nothing; its error is fetched separately.
45 |     }
46 |     if (status == cudaSuccess)  // Copy partial sums back to host.
47 |         status = cudaMemcpy(h_partialSums, d_partialSums, numBlocks * sizeof(float), cudaMemcpyDeviceToHost);
48 |     if (status != cudaSuccess) std::cerr << "CUDA error: " << cudaGetErrorString(status) << std::endl;
49 | 
50 |     // Final sum on CPU
51 |     float totalSum = 0.0f;
52 |     for (int i = 0; i < numBlocks; i++) {
53 |         totalSum += h_partialSums[i];
54 |     }
55 | 
56 |     // Cleanup (cudaFree accepts the null pointers left behind by a failed allocation).
57 |     cudaFree(d_array);
58 |     cudaFree(d_partialSums);
59 |     delete[] h_partialSums;
60 |     return totalSum;
61 | }
62 | 
63 | // CPU code: fills a host array with ones, sums it on the GPU, and prints the result.
64 | int main() {
65 |     float h_array[N];
66 |     for (float &value : h_array) {
67 |         value = 1.0f;
68 |     }
69 |     const float sum = sumArrayOnGPU(h_array);
70 |     std::cout << "Sum: " << sum << std::endl;
71 |     return 0;
72 | }
73 | 


--------------------------------------------------------------------------------
/openmp/readme.md:
--------------------------------------------------------------------------------
 1 | # OpenMP
 2 | 
 3 | OpenMP is a directive-based API, meaning that code intended to run in parallel is flagged with
 4 | compiler directives. In C and C++, this is done with `#pragma omp ...` directives. OpenMP is also
 5 | compatible with Fortran. The page
 6 | https://www.openmp.org/wp-content/uploads/2021-10-20-Webinar-OpenMP-Offload-Programming-Introduction.pdf
 7 | presents a quick intro to OpenMP and the page https://enccs.github.io/openmp-gpu/ explains GPU
 8 | programming with OpenMP in more detail.
 9 | 
10 | OpenMP contrasts with the other APIs in this repository. In most cases, the CPU and GPU code use
11 | different programming languages. For instance, OpenGL and WebGPU use shading languages
12 | to program GPUs and OpenCL / CUDA use variants of the C programming language. OpenMP integrates
13 | that kind of operation seamlessly into CPU code (I find that it makes it actually most similar to
14 | Triton out of all the other APIs in the repository).
15 | 
16 | 
17 | ## Build the Example
18 | 
19 | To build the OpenMP program, run:
20 | 
21 | ```
22 | mkdir build
23 | cd build
24 | cmake ..
25 | cmake --build . --config Release
26 | ```
27 | 
28 | OpenMP comes installed with the compiler, but you may have to ensure that your GPU drivers are up to
29 | date and that there is a CUDA / ACC / OpenCL / ... runtime available on your system to use it. This
30 | program is just a minimal example of array summation.
31 | 
32 | 
33 | ## Why Is the Example So Small?
34 | 
35 | The program written in `openmp.cpp` takes 22 lines of C++ code to sum the elements in an array. In
36 | OpenGL, this takes around 145 lines (see the computing example of OpenGL in the repository).
37 | 
38 | This is because OpenMP lets you flag parallel code and then uses those flags to program the GPU
39 | by itself. In OpenGL / OpenCL / ..., you have to not only write the GPU code, but also:
40 | 
41 | - set up the computing pipeline and initialize resources,
42 | - check for errors,
43 | - manage memory transfers or mappings between host and device (i.e. GPU) memory,
44 | - clean up resources.
45 | 
46 | Of course, that makes those APIs more flexible and I tend to see them more often used in large
47 | projects than OpenMP, but in some cases, the simplicity of OpenMP makes it a more logical choice.
48 | 


--------------------------------------------------------------------------------
/sycl/readme.md:
--------------------------------------------------------------------------------
 1 | # SYCL
 2 | 
 3 | SYCL is a Khronos group specification that lets you write hardware-accelerated instructions into
 4 | regular C++ code instead of manually delegating these computations to kernels. One implementation of
 5 | SYCL is oneAPI, by Intel.
 6 | 
 7 | Relevant links:
 8 | 
 9 | - Khronos group page on SYCL: https://www.khronos.org/sycl/
10 | - Tutorial: https://sycl.tech/getting-started
11 | 
12 | This project does not work! I obtain the error described at
13 | https://github.com/intel/llvm/issues/15910. In theory, it could compile if I reinstalled the whole
14 | compiler toolchain `¯\_(ツ)_/¯`.
15 | 
16 | 
17 | ## Build
18 | 
19 | I completed the following steps to build the example, but it does not work yet.
20 | 
21 | Visit the page https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html
22 | to download and install oneAPI, then build the example with the commands for your platform.
23 | 
24 | On Linux, run the following commands:
25 | 
26 | ```
27 | source /opt/intel/oneapi/setvars.sh
28 | icpx -fsycl main.cpp -o main.exe
29 | ```
30 | 
31 | On Windows, run the following commands:
32 | 
33 | ```
34 | "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
35 | icx -fsycl main.cpp -o main.exe
36 | ```
37 | 
38 | I get the following error message:
39 | 
40 | ```
41 | >icx -fsycl main.cpp -o main.exe
42 | Intel(R) oneAPI DPC++/C++ Compiler for applications running on Intel(R) 64, Version 2024.2.0 Build 20240602
43 | Copyright (C) 1985-2024 Intel Corporation. All rights reserved.
44 | 
45 | In file included from gpu-arena\sycl\main.cpp:3:
46 | In file included from C:\Program Files (x86)\Intel\oneAPI\compiler\2024.2\include\sycl\CL\sycl.hpp:11:
47 | In file included from C:\Program Files (x86)\Intel\oneAPI\compiler\2024.2\include\sycl\sycl.hpp:25:
48 | In file included from C:\Program Files (x86)\Intel\oneAPI\compiler\2024.2\include\sycl\accessor.hpp:11:
49 | In file included from C:\Program Files (x86)\Intel\oneAPI\compiler\2024.2\include\sycl\access\access.hpp:14:
50 | In file included from C:\Program Files (x86)\Intel\oneAPI\compiler\2024.2\include\sycl\CL\__spirv\spirv_ops.hpp:25:
51 | In file included from C:\Program Files (x86)\Intel\oneAPI\compiler\2024.2\include\sycl\CL\__spirv\spirv_types.hpp:25:
52 | C:\Program Files (x86)\Intel\oneAPI\compiler\2024.2\include\sycl\detail\defines.hpp(15,10): fatal error: 'climits' file not found
53 |    15 | #include <climits>
54 |       |          ^~~~~~~~~
55 | 1 error generated.
56 | ```
57 | 


--------------------------------------------------------------------------------
/cpu/cpp/task3.hpp:
--------------------------------------------------------------------------------
 1 | #include <cmath>
 2 | #include <chrono>
 3 | #include <random>
 4 | #include <algorithm>
 5 | #include <iterator>
 6 | #include <vector>
 7 | #include <thread>
 8 | 
 9 | #ifndef CACHE_LINE_SIZE
10 | #define CACHE_LINE_SIZE 128
11 | #endif
12 | /// @brief Adds exp(a[i]) over slice `chunk` (n_elements wide, clipped to a.size()) to results[chunk * CACHE_LINE_SIZE].
13 | void exponential_partial_sum(const std::vector<float> &a, int chunk, int n_elements, std::vector<float> &results)
14 | {
15 |     const int first = chunk * n_elements;
16 |     const int last = std::min(first + n_elements, (int)a.size());
17 |     if (first >= last) return;  // Empty slice: leave the accumulator untouched.
18 |     std::for_each(a.begin() + first, a.begin() + last,
19 |                   [&results, chunk](float x) { results[chunk * CACHE_LINE_SIZE] += exp(x); });
20 | }
21 | /// @brief Replaces each a[i] in slice `chunk` (n_elements wide, clipped to a.size()) with exp(a[i]) / exponential_sum.
22 | void modify_elements(std::vector<float> &a, int chunk, int n_elements, float exponential_sum)
23 | {
24 |     const int first = chunk * n_elements;
25 |     const int last = std::min(first + n_elements, (int)a.size());
26 |     if (first >= last) return;  // Empty slice: nothing to rewrite.
27 |     std::transform(a.begin() + first, a.begin() + last, a.begin() + first,
28 |                    [exponential_sum](float x) { return exp(x) / exponential_sum; });
29 | }
30 | /// @brief Replaces `a` in place with softmax(a): a[i] = exp(a[i]) / sum_j exp(a[j]).
31 | /// @param a Input values, overwritten in place; exp() is applied to the raw values (no max subtraction).
32 | /// @param n_threads Number of threads per pass; values below 1 are treated as 1.
33 | void parallel_softmax(std::vector<float> &a, int n_threads)
34 | {
35 |     // 0 used to divide by zero below and a negative count wrapped into a huge resize.
36 |     n_threads = std::max(n_threads, 1);
37 |     const int n_elements = (a.size() + n_threads - 1) / n_threads;  // Slice width (ceiling division).
38 |     std::vector<std::thread> threads;
39 |     // One accumulator per thread, CACHE_LINE_SIZE floats apart (padding against false sharing).
40 |     std::vector<float> exponential_sums(n_threads * CACHE_LINE_SIZE, 0.0f);
41 | 
42 |     // Pass 1: every thread sums exp(x) over its own slice.
43 |     for (int t = 0; t < n_threads; t++) {
44 |         threads.emplace_back(exponential_partial_sum, std::ref(a), t, n_elements, std::ref(exponential_sums));
45 |     }
46 |     float exponential_sum = 0.0;
47 |     for (int t = 0; t < n_threads; t++) {
48 |         threads[t].join();
49 |         exponential_sum += exponential_sums[t * CACHE_LINE_SIZE];
50 |     }
51 | 
52 |     // Pass 2: every thread divides the exponentials of its slice by the total, in place.
53 |     threads.clear();
54 |     for (int t = 0; t < n_threads; t++) {
55 |         threads.emplace_back(modify_elements, std::ref(a), t, n_elements, exponential_sum);
56 |     }
57 |     for (std::thread &worker : threads) worker.join();
58 | }
59 | /// @brief Times one call to parallel_softmax on `n` random floats.
60 | /// @param n Number of input values.
61 | /// @param n_threads Thread count forwarded to parallel_softmax.
62 | /// @return Elapsed time in seconds, measured with microsecond resolution.
63 | double task_3(int n, int n_threads)
64 | {
65 |     std::vector<float> a(n);
66 |     std::generate(a.begin(), a.end(), [] { return (((float)rand() / RAND_MAX) - 0.5) * 2.0; });
67 |     const auto begin = std::chrono::steady_clock::now();  // The random fill above is not timed.
68 |     parallel_softmax(a, n_threads);
69 |     const auto end = std::chrono::steady_clock::now();
70 |     return std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000000.0;
71 | }
72 | 


--------------------------------------------------------------------------------
/vulkan/readme.md:
--------------------------------------------------------------------------------
 1 | # Vulkan
 2 | 
 3 | Vulkan is an API designed for computer graphics, but it also supports general-purpose computing
 4 | through compute shaders. Vulkan uses SPIR-V as its shading language, which is not intended to be
 5 | human-readable. You can write the shaders in a human-readable language, such as GLSL or HLSL, and
 6 | compile them to SPIR-V before feeding them to Vulkan. This project uses GLSL and the compiler
 7 | `glslangValidator`.
 8 | 
 9 | Note: The Website https://vulkan-tutorial.com/ is a **much** better resource to learn Vulkan! This
10 | current repository aims at providing simple comparisons between GPU programming APIs, so it only
11 | provides simple examples.
12 | 
13 | 
14 | ## Graphics Example
15 | 
16 | The directory `graphics` is a self-contained C++ project that uses Vulkan and additional libraries
17 | to open a window and display simple graphics. This project is based on the file
18 | https://github.com/Overv/VulkanTutorial/blob/main/code/15_hello_triangle.cpp from the repository
19 | `VulkanTutorial`, available at https://github.com/Overv/VulkanTutorial/tree/main and licensed under
20 | the licenses CC0-1.0 and CC-BY-SA-4.0.
21 | 
22 | The following instructions show how to build and run it. This only works on Linux! Use WSL if
23 | necessary!
24 | 
25 | ```
26 | cd graphics
27 | sudo apt install vulkan-tools
28 | sudo apt install libvulkan-dev
29 | sudo apt install vulkan-validationlayers-dev spirv-tools
30 | sudo apt install libglm-dev
31 | mkdir build
32 | cd build
33 | cmake ..
34 | cmake --build .
35 | glslangValidator -V ../shader.vert -o vertices.spv
36 | glslangValidator -V ../shader.frag -o fragment.spv
37 | ./graphics
38 | ```
39 | 
40 | 
41 | ## Computing Example
42 | 
43 | Vulkan *can* be used for general-purpose computing, but I've found it really impractical. The file
44 | https://github.com/SaschaWillems/Vulkan/blob/master/examples/computeheadless/computeheadless.cpp is
45 | a minimal example that shows how to use Vulkan for computations. It's more than 600 lines long!
46 | 
47 | Vulkan has never been intended to be beginner-friendly; it's made for high performance and aimed
48 | at motivated users. Fortunately, the project Kompute (https://kompute.cc/), which is built atop
49 | Vulkan, lets you easily create and run compute shaders. It can be used with a C++ or Python
50 | interface; this repository uses Python. It is an updated version of the example found at
51 | https://kompute.cc/.
52 | 
53 | To try Kompute, run the following commands on Linux or WSL:
54 | 
55 | ```
56 | cd compute
57 | sudo apt install spirv-tools  # Install glslangValidator
58 | pip install kp  # Install kompute. Ideally, use a virtual environment!
59 | python3 compute.py
60 | ```
61 | 


--------------------------------------------------------------------------------
/cpu/cpp/task2.hpp:
--------------------------------------------------------------------------------
 1 | #include <chrono>
 2 | #include <random>
 3 | #include <algorithm>
 4 | #include <iterator>
 5 | #include <vector>
 6 | #include <thread>
 7 | 
 8 | /// @brief Adds one band of rows of the product a * b to c, tile by tile for cache locality.
 9 | /// @param a Left operand; a, b and c are indexed as N x N matrices with N = a.size().
10 | /// @param b Right operand.
11 | /// @param c Result, accumulated in place; only the rows of band `chunk` are written.
12 | /// @param chunk Band index: rows [chunk * n_elements, (chunk + 1) * n_elements), clipped to N.
13 | /// @param n_elements Number of rows per band.
14 | void partial_mul(
15 |     std::vector<std::vector<float>> &a,
16 |     std::vector<std::vector<float>> &b,
17 |     std::vector<std::vector<float>> &c,
18 |     int chunk,
19 |     int n_elements
20 | ) {
21 |     const int N = a.size();
22 |     const int TILE_SIZE = 32;
23 |     // Bands are disjoint, so concurrent calls with distinct `chunk` values never write the same row.
24 |     const int row_end = std::min((chunk + 1) * n_elements, N);
25 |     for (int i = chunk * n_elements; i < row_end; i += TILE_SIZE) {
26 |         for (int j = 0; j < N; j += TILE_SIZE) {
27 |             for (int k = 0; k < N; k += TILE_SIZE) {
28 |                 for (int ii = i; ii < std::min(i + TILE_SIZE, row_end); ++ii) {
29 |                     for (int jj = j; jj < std::min(j + TILE_SIZE, N); ++jj) {
30 |                         double sum = 0.0;  // Tile-local dot product, accumulated in double.
31 |                         for (int kk = k; kk < std::min(k + TILE_SIZE, N); ++kk) sum += a[ii][kk] * b[kk][jj];
32 |                         c[ii][jj] += sum;
33 |                     }
34 |                 }
35 |             }
36 |         }
37 |     }
38 | }
39 | /// @brief Runs partial_mul on `n_threads` threads (thread t gets chunk = t) and waits for all of them.
40 | /// @param n_threads Number of threads; values below 1 are treated as 1.
41 | void parallel_mat_mul(
42 |     std::vector<std::vector<float>> &a,
43 |     std::vector<std::vector<float>> &b,
44 |     std::vector<std::vector<float>> &c,
45 |     int n_threads
46 | ) {
47 |     n_threads = std::max(n_threads, 1);  // 0 used to divide by zero on the next line.
48 |     const int n_elements = (a.size() + n_threads - 1) / n_threads;  // Rows per thread (ceiling division).
49 |     std::vector<std::thread> threads;
50 |     for (int t = 0; t < n_threads; t++) {
51 |         threads.emplace_back(partial_mul, std::ref(a), std::ref(b), std::ref(c), t, n_elements);
52 |     }
53 |     for (std::thread &worker : threads) worker.join();
54 | }
55 | /// @brief Times one call to parallel_mat_mul on two random n x n matrices.
56 | /// @param n Number of rows and columns.
57 | /// @param n_threads Thread count forwarded to parallel_mat_mul.
58 | /// @return Elapsed time in seconds, measured with microsecond resolution.
59 | double task_2(int n, int n_threads) {
60 |     const std::vector<float> zero_row(n);
61 |     std::vector<std::vector<float>> a(n, zero_row); // Input
62 |     std::vector<std::vector<float>> b(n, zero_row); // Input
63 |     std::vector<std::vector<float>> c(n, zero_row); // Result, starts at zero
64 |     const auto random_value = [] { return (((float)rand() / RAND_MAX) - 0.5) * 2.0; };
65 |     for (int i = 0; i < n; ++i) {
66 |         for (int j = 0; j < n; ++j) {
67 |             a[i][j] = random_value();  // Draw order is kept: a first, then b, element by element.
68 |             b[i][j] = random_value();
69 |         }
70 |     }
71 | 
72 |     // Only the multiplication is timed; allocation and random fill are excluded.
73 |     const auto begin = std::chrono::steady_clock::now();
74 |     parallel_mat_mul(a, b, c, n_threads);
75 |     const auto end = std::chrono::steady_clock::now();
76 |     return std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000000.0;
77 | }
78 | 


--------------------------------------------------------------------------------
/cpu/readme.md:
--------------------------------------------------------------------------------
 1 | # CPU
 2 | 
 3 | Algorithm acceleration with multi-threading on CPUs.
 4 | 
 5 | Most projects in this repository demonstrate how to use GPUs (graphics processing units), which
 6 | excel at processing **large** amounts of ideally **weakly dependent** data. GPUs are inadequate for
 7 | many situations, in which case you can fall back on CPUs (central processing units). This repository
 8 | contains programs written in C++ and Rust to show how to use threads to accelerate algorithms. This
 9 | is one way of achieving parallelism with a CPU.
10 | 
11 | 
12 | ## C++ Version
13 | 
14 | This project uses the class `std::thread` to implement multithreading. It's also possible to use
15 | the function `pthread_create` to launch threads.
16 | 
17 | 
18 | ### Build the Project
19 | 
20 | - Install [cmake](https://cmake.org/) on your system.
21 | - Install [clang](https://clang.llvm.org/) on your system (yes, even if you are on W*ndows!)
22 | - Navigate in the `cpp` directory.
23 | - Create a `build` subdirectory.
24 | - Navigate in the `build` subdirectory.
25 | - Run the command `cmake ..`.
26 | - Run the command `cmake --build . --config Release`.
27 | 
28 | **Note**: The option `--config Release` is important because it instructs cmake to add optimization
29 | flags. Without them, the compiler will not optimize aggressively enough and you might notice that
30 | multithreading *decreases* performance instead of improving it.
31 | 
32 | 
33 | ### Usage
34 | 
35 | Run:
36 | 
37 | ```
38 | # Linux
39 | ./build/Release/cpu <task> <n> <iterations> <threads>
40 | 
41 | # On the OS that begins with the letter W
42 | build\Release\cpu.exe <task> <n> <iterations> <threads>
43 | ```
44 | 
45 | where:
46 | 
47 | - `<task>` is an integer ranging from 1 to 3, inclusively, that designates the task to execute.
48 | - `<n>` is the dimension of the input data.
49 | - `<iterations>` is the number of times that the computation must be repeated. The execution time
50 |   reported by the program is the arithmetic mean of the duration of all iterations.
51 | - `<threads>` is the number of threads.
52 | 
53 | 
54 | ## Rust Version
55 | 
56 | This project uses the standard modules `std::thread` and `std::sync::Arc` to implement
57 | multithreading.
58 | 
59 | 
60 | ### Build
61 | 
62 | - Install [cargo](https://doc.rust-lang.org/stable/cargo/) on your system.
63 | - Run the command `cargo build --release`.
64 | 
65 | 
66 | ### Usage
67 | 
68 | Run the command:
69 | 
70 | ```
71 | # Linux
72 | ./target/release/cpu --task <t> --n <n> --iterations <i> --threads <c>
73 | 
74 | # Windows
75 | target\release\cpu --task <t> --n <n> --iterations <i> --threads <c>
76 | ```
77 | 
78 | where
79 | 
80 | - `<t>` is an integer ranging from 1 to 3, inclusively, that designates the task to execute.
81 |   Default: 1
82 | - `<n>` is the dimension of the input data. Default: 1000
83 | - `<i>` is the number of times that the computation must be repeated. The execution time reported
84 |   by the program is the arithmetic mean of the duration of all iterations. Default: 1
85 | - `<c>` is the number of threads. Default: 1
86 | 


--------------------------------------------------------------------------------
/vulkan/compute/compute.py:
--------------------------------------------------------------------------------
 1 | """Minimal kompute example.
 2 | 
 3 | This script is an updated version of the example found at https://kompute.cc/.
 4 | """
 5 | 
 6 | import os
 7 | import numpy as np
 8 | import kp
 9 | 
10 | 
def compile_source(source):
    """Compile GLSL compute-shader source to SPIR-V with ``glslangValidator``.

    The source is written to ``tmp_kp_shader.comp`` in the current working
    directory and compiled into ``tmp_kp_shader.comp.spv`` next to it.

    Args:
        source: GLSL source code of the compute shader.

    Returns:
        The content of the compiled SPIR-V file as ``bytes``.

    Raises:
        RuntimeError: If ``glslangValidator`` cannot be run or rejects the shader.
    """
    # Context managers close the files deterministically; the source has to be
    # fully written before the external compiler reads it.
    with open("tmp_kp_shader.comp", "w") as shader_file:
        shader_file.write(source)
    status = os.system("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv")
    if status != 0:
        # Without this check, a stale .spv file left over from an earlier run
        # would be returned silently when the compilation fails.
        raise RuntimeError(f"glslangValidator failed (exit status {status})")
    with open("tmp_kp_shader.comp.spv", "rb") as spirv_file:
        return spirv_file.read()
15 | 
16 | 
def kompute(shader):
    """Run ``shader`` on the GPU with Kompute and print the two output tensors.

    Args:
        shader: GLSL compute-shader source. It is compiled to SPIR-V with
            ``compile_source`` before being dispatched.
    """
    # 1. Create Kompute Manager with default settings (device 0, first queue and no extensions)
    mgr = kp.Manager()

    # 2. Create and initialise Kompute Tensors through manager

    # Default tensor constructor simplifies creation of float values
    tensor_in_a = mgr.tensor([2, 2, 2])
    tensor_in_b = mgr.tensor([1, 2, 3])
    # Explicit type constructor supports uint32, int32, double, float and bool
    tensor_out_a = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))
    tensor_out_b = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))

    # The position of each tensor in this list is its binding index in the shader.
    params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]

    # 3. Create algorithm based on shader (supports buffers & push/spec constants)
    workgroup = (3, 1, 1)
    spec_consts = [2]
    push_consts_a = [2]
    push_consts_b = [3]

    # See documentation shader section for compile_source
    spirv = compile_source(shader)

    algo = mgr.algorithm(params, spirv, workgroup, spec_consts, push_consts_a)

    # 4. Run operation synchronously using sequence
    (mgr.sequence()
        .record(kp.OpTensorSyncDevice(params))
        .record(kp.OpAlgoDispatch(algo)) # Binds default push consts provided
        .eval() # evaluates the two recorded ops
        .record(kp.OpAlgoDispatch(algo, push_consts_b)) # Overrides push consts
        .eval()) # evaluates only the last recorded op

    # 5. Sync results from the GPU asynchronously
    sq = mgr.sequence()
    sq.eval_async(kp.OpTensorSyncLocal(params))

    # ... Do other work asynchronously whilst GPU finishes

    sq.eval_await()

    # Prints the first output which is: { 4, 8, 12 }
    # (in_a * in_b = { 2, 4, 6 }, accumulated by the two dispatches)
    print(tensor_out_a.data())
    # Prints the second output which is: { 10, 10, 10 }
    # (spec constant 2 times push constant 2, then 2 times push constant 3: 4 + 6)
    print(tensor_out_b.data())
63 | 
64 | 
if __name__ == "__main__":

    # Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header
    # files). This shader shows some of the main components including constants, buffers, etc.
    # Each invocation handles one element: it accumulates in_a * in_b into out_a and
    # const_one * push_const.val into out_b.
    shader = """
        #version 450

        layout (local_size_x = 1) in;

        // The input tensors bind index is relative to index in parameter passed
        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
        layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
        layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };

        // Kompute supports push constants updated on dispatch
        layout(push_constant) uniform PushConstants {
            float val;
        } push_const;

        // Kompute also supports spec constants on initialization
        layout(constant_id = 0) const float const_one = 0;

        void main() {
            uint index = gl_GlobalInvocationID.x;
            out_a[index] += uint( in_a[index] * in_b[index] );
            out_b[index] += uint( const_one * push_const.val );
        }
    """
    kompute(shader)
95 | 


--------------------------------------------------------------------------------
/benchmark.py:
--------------------------------------------------------------------------------
"""
    Build and run multithreaded CPU programs.

    A multithreaded program is not necessarily faster than a purely sequential
    one! Multithreading entails some overhead when splitting data and
    coordinating threads, so a single thread often outperforms multiple ones
    for small datasets. This script lets you measure which thread count is most
    efficient for a given dataset size.

    Usage:

    $ python3 benchmark.py
"""

import os
import subprocess
import matplotlib.pyplot as plt

BACKEND = "cpu-cpp"  # Which accelerated program to use (either cpu-cpp or cpu-rust).
TASK = "1"  # Refer to the file `cpu/readme.md` for a description of each task.
MIN_N = 10_000  # Minimum dataset size.
MAX_N = 10_000_000  # Maximum dataset size.
ITERATIONS = 5  # Number of iterations to perform. The duration is the average of all iterations.
N_THREADS = (1, 2)  # Number of threads to use in each comparison.

# Measured durations keyed by thread count; one value is appended per dataset size.
durations = {}
for n in N_THREADS:
    durations[n] = []
 29 | 
 30 | 
 31 | def cpu_cpp():
 32 |     os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/cpu/cpp")
 33 |     try:
 34 |         subprocess.check_call(["mkdir", "build"])
 35 |     except:
 36 |         pass
 37 |     os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/cpu/cpp/build")
 38 |     subprocess.check_call(["cmake", ".."])
 39 |     subprocess.check_call(["cmake", "--build", ".", "--config", "Release"])
 40 |     os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/cpu/cpp/build/Release")
 41 |     files = os.listdir()
 42 |     if "cpu.exe" in files:
 43 |         program = "cpu.exe"
 44 |     else:
 45 |         program = "cpu"
 46 | 
 47 |     x = []
 48 |     n = MIN_N
 49 |     while n <= MAX_N:
 50 |         print(f"N: {n}")
 51 |         x.append(n)
 52 |         for t in durations:
 53 |             print(f"   t: {t} = ", end = "")
 54 |             v = subprocess.run(
 55 |                 [program, TASK, str(n), str(ITERATIONS), str(t)],
 56 |                 capture_output = True,
 57 |                 text = True
 58 |             ).stdout
 59 |             durations[t].append(float(str(v).split(" ")[-1].rstrip()))
 60 |             print(durations[t][-1])
 61 |         n *= 10
 62 | 
 63 |     fig, ax = plt.subplots()
 64 |     for t in N_THREADS:
 65 |         ax.plot(x, durations[t], label=f"{t}")
 66 |     ax.set(xlabel='Number of data points', ylabel='Duration (s)')
 67 |     ax.set_xscale('log')
 68 |     ax.set_yscale('log')
 69 |     ax.legend()
 70 |     plt.show()
 71 | 
 72 | 
 73 | def cpu_rust():
 74 |     os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/cpu/rust")
 75 |     x = []
 76 |     n = MIN_N
 77 |     while n <= MAX_N:
 78 |         print(f"N: {n}")
 79 |         x.append(n)
 80 |         for t in durations:
 81 |             print(f"   t: {t} = ", end = "")
 82 |             program = [
 83 |                 "cargo", "run", "--",
 84 |                 "--task", str(TASK),
 85 |                 "--n", str(n),
 86 |                 "--iterations", str(ITERATIONS),
 87 |                 "--threads", str(t)
 88 |             ]
 89 |             v = subprocess.run(
 90 |                 program,
 91 |                 capture_output = True,
 92 |                 text = True
 93 |             ).stdout
 94 |             durations[t].append(float(str(v).split(" ")[-1].rstrip()))
 95 |             print(durations[t][-1])
 96 |         n *= 10
 97 | 
 98 |     fig, ax = plt.subplots()
 99 |     for t in N_THREADS:
100 |         ax.plot(x, durations[t], label=f"{t}")
101 |     ax.set(xlabel='Number of data points', ylabel='Duration (s)')
102 |     ax.set_xscale('log')
103 |     ax.set_yscale('log')
104 |     ax.legend()
105 |     plt.show()
106 | 
107 | 
# Run the benchmark selected by BACKEND ("cpu" is accepted as an alias of "cpu-cpp").
if BACKEND in ("cpu-cpp", "cpu"):
    cpu_cpp()
elif BACKEND == "cpu-rust":
    cpu_rust()
else:
    # An unknown backend used to be ignored silently, which looked like a successful run.
    raise SystemExit(f"Unknown BACKEND {BACKEND!r}: expected 'cpu-cpp' or 'cpu-rust'.")
112 | 


--------------------------------------------------------------------------------
/triton/main.py:
--------------------------------------------------------------------------------
  1 | """Calculate a sum reduction and a softmax function with Triton."""
  2 | 
  3 | import triton
  4 | import triton.language as tl
  5 | import torch
  6 | 
  7 | 
  8 | @triton.jit
  9 | def fold(
 10 |         x_ptr,
 11 |         middle: tl.constexpr,
 12 |         end: tl.constexpr,
 13 |         n_elements,
 14 |         BLOCK_SIZE: tl.constexpr
 15 |     ):
 16 |     """Perform one sum reduction pass.
 17 | 
 18 |     This function has to be separate from ``sum_reduction`` because the values
 19 |     used by the kernel change between iterations. Since Triton must compile the
 20 |     kernels before executing them, the parameters must be of type
 21 |     ``tl.constexpr``, not ``int``.
 22 | 
 23 |     Args:
 24 |         x_ptr: Pointer to the input tensor. Modified in place!
 25 |         middle: Half of the size of the current reduction.
 26 |         end: Size of the current reduction.
 27 |         n_elements: Total number of elements in the tensor. Can be of type
 28 |             ``int`` because it is used for masking only.
 29 |         BLOCK_SIZE: Number of elements processed by each program.
 30 |     """
 31 |     m = tl.program_id(axis=0)
 32 |     block_start = m * BLOCK_SIZE
 33 |     right_offsets = block_start + tl.arange(middle, end)
 34 |     right_x = tl.load(x_ptr + right_offsets, mask=(right_offsets < n_elements))
 35 |     left_offsets = block_start + tl.arange(0, middle)
 36 |     left_x = tl.load(x_ptr + left_offsets, mask=(left_offsets < n_elements))
 37 |     tl.store(x_ptr + left_offsets, left_x + right_x, mask=(left_offsets < n_elements))
 38 | 
 39 | 
 40 | def sum_reduction(x_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
 41 |     """GPU-accelerated sum reduction.
 42 | 
 43 |     This function adds up the elements in an array in pairs. For instance:
 44 | 
 45 |     Initial array:    0  1  2  3  4  5  6  7
 46 |     First reduction:  4  6  8  10            <- (0 + 4) (1 + 5) (2 + 6) (3 + 7)
 47 |     Second reduction: 12 16
 48 |     Final reduction:  28
 49 | 
 50 |     Refer to the file https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
 51 |     for a detailed discussion!
 52 | 
 53 |     Args:
 54 |         x_ptr: Pointer to the input tensor to sum. Modified in place!
 55 |         n_elements: Total number of elements to sum.
 56 |         BLOCK_SIZE: Number of elements processed by each program.
 57 |     """
 58 |     stride = BLOCK_SIZE // 2
 59 |     while stride > 0:
 60 |         grid = (n_elements, )
 61 |         fold[grid](x_ptr, stride, stride * 2, n_elements, BLOCK_SIZE)
 62 |         stride //= 2
 63 | 
 64 | 
# Demo: sum eight values in place. BLOCK_SIZE equals the number of elements, so the
# whole tensor is treated as one block.
print("Trying out sum reduction with Triton!")
X = torch.Tensor((0, 1, 2, 3, 4, 5, 6, 7)).to('cuda')
print(f"    Input values: {X}")
sum_reduction(X, len(X), len(X))
print(f"    Output values: {X}")
# After the last pass, the total is stored in the first element.
print(f"    Sum: {X[0]}\n")
 71 | 
 72 | 
@triton.jit
def softmax(Y, stride_ym, stride_yn, X, stride_xm, stride_xn, N):
    """Compute a row-wise softmax.

    Each program handles the row selected by its program id. The maximum of
    the row is subtracted before exponentiation. Only the first 1024 columns
    (the hard-coded ``BLOCK_SIZE``) of a row are loaded and stored.

    This function is taken from the blog post https://openai.com/index/triton/
    by OpenAI.

    Args:
        Y: Pointer to the output tensor.
        stride_ym: Stride of the output tensor in axis 0.
        stride_yn: Stride of the output tensor in axis 1.
        X: Pointer to the input tensor.
        stride_xm: Stride of the input tensor in axis 0.
        stride_xn: Stride of the input tensor in axis 1.
        N: Number of columns.
    """
    m = tl.program_id(0)
    BLOCK_SIZE: tl.constexpr = 1024
    n = tl.arange(0, BLOCK_SIZE)
    X = X + m * stride_xm + n * stride_xn
    # Masked-off columns are loaded as -inf so that they contribute exp(-inf) = 0 to the sum.
    x = tl.load(X, mask=n < N, other=-float('inf'))
    z = x - tl.max(x, axis=0)
    num = tl.exp(z)
    denom = tl.sum(num, axis=0)
    y = num / denom
    Y = Y + m * stride_ym + n * stride_yn
    tl.store(Y, y, mask=n < N)
100 | 
101 | 
# Demo: apply the softmax kernel to a random 4x4 matrix, one program per row.
print("Trying out a Softmax function with Triton!")
X = torch.normal(0, 1, size=(4, 4), device='cuda')
Y = torch.empty_like(X)
grid = (X.shape[0], )
softmax[grid](Y, Y.stride(0), Y.stride(1),
              X, X.stride(0), X.stride(1),
              X.shape[1])
print(f"Input values:\n{X}")
print(f"Output values:\n{Y}\n")
111 | 


--------------------------------------------------------------------------------
/directx/computing/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <d3d11.h>
  2 | #include <d3dcompiler.h>
  3 | #include <vector>
  4 | #include <iostream>
  5 | 
  6 | #pragma comment(lib, "d3d11.lib")
  7 | #pragma comment(lib, "d3dcompiler.lib")
  8 | 
  9 | #define GROUP_SIZE 256
 10 | 
 11 | // Error checking macro
 12 | #define HR_CHECK(hr) if (FAILED(hr)) { std::cerr << "D3D Error at line " << __LINE__ << std::endl; return -1; }
 13 | 
 14 | int main() {
 15 |     // Sample data (must be a multiple of GROUP_SIZE for simplicity)
 16 |     const int dataSize = 1024;
 17 |     std::vector<float> data(dataSize, 1.0f); // Fill with ones for easy sum verification
 18 |     int numGroups = dataSize / GROUP_SIZE;
 19 | 
 20 |     // Step 1: Initialize Direct3D
 21 |     ID3D11Device* device = nullptr;
 22 |     ID3D11DeviceContext* context = nullptr;
 23 |     D3D_FEATURE_LEVEL featureLevel;
 24 |     D3D11CreateDevice(nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0, nullptr, 0, D3D11_SDK_VERSION, &device, &featureLevel, &context);
 25 | 
 26 |     // Step 2: Compile Compute Shader
 27 |     ID3DBlob* csBlob = nullptr;
 28 |     ID3DBlob* errorBlob = nullptr;
 29 |     HRESULT hr = D3DCompileFromFile(L"sum.hlsl", nullptr, nullptr, "main", "cs_5_0", 0, 0, &csBlob, &errorBlob);
 30 |     if (FAILED(hr)) {
 31 |         if (errorBlob) std::cout << "Error compiling the shader: " << (char*)errorBlob->GetBufferPointer();
 32 |         return -1;
 33 |     }
 34 | 
 35 |     // Step 3: Create Compute Shader
 36 |     ID3D11ComputeShader* computeShader = nullptr;
 37 |     HR_CHECK(device->CreateComputeShader(csBlob->GetBufferPointer(), csBlob->GetBufferSize(), nullptr, &computeShader));
 38 |     csBlob->Release();
 39 | 
 40 |     // Step 4: Create Buffers (Input & Output)
 41 |     D3D11_BUFFER_DESC bufferDesc = {};
 42 |     bufferDesc.Usage = D3D11_USAGE_DEFAULT;
 43 |     bufferDesc.ByteWidth = dataSize * sizeof(float);
 44 |     bufferDesc.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE;
 45 |     bufferDesc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
 46 |     bufferDesc.StructureByteStride = sizeof(float);
 47 | 
 48 |     D3D11_SUBRESOURCE_DATA initData = { data.data(), 0, 0 };
 49 |     ID3D11Buffer* inputBuffer = nullptr;
 50 |     HR_CHECK(device->CreateBuffer(&bufferDesc, &initData, &inputBuffer));
 51 | 
 52 |     bufferDesc.ByteWidth = numGroups * sizeof(float);
 53 |     ID3D11Buffer* outputBuffer = nullptr;
 54 |     HR_CHECK(device->CreateBuffer(&bufferDesc, nullptr, &outputBuffer));
 55 | 
 56 |     // Step 5: Create Unordered Access Views (UAV)
 57 |     D3D11_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
 58 |     uavDesc.Format = DXGI_FORMAT_UNKNOWN;
 59 |     uavDesc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
 60 |     uavDesc.Buffer.NumElements = dataSize;
 61 |     ID3D11UnorderedAccessView* inputUAV = nullptr;
 62 |     HR_CHECK(device->CreateUnorderedAccessView(inputBuffer, &uavDesc, &inputUAV));
 63 | 
 64 |     uavDesc.Buffer.NumElements = numGroups;
 65 |     ID3D11UnorderedAccessView* outputUAV = nullptr;
 66 |     HR_CHECK(device->CreateUnorderedAccessView(outputBuffer, &uavDesc, &outputUAV));
 67 | 
 68 |     // Step 6: Run the Compute Shader
 69 |     context->CSSetShader(computeShader, nullptr, 0);
 70 |     context->CSSetUnorderedAccessViews(0, 1, &inputUAV, nullptr);
 71 |     context->CSSetUnorderedAccessViews(1, 1, &outputUAV, nullptr);
 72 |     context->Dispatch(numGroups, 1, 1);
 73 | 
 74 |     // Step 7: Read Back Result
 75 |     std::vector<float> partialSums(numGroups);
 76 |     D3D11_BUFFER_DESC readbackDesc = {};
 77 |     readbackDesc.Usage = D3D11_USAGE_STAGING;
 78 |     readbackDesc.ByteWidth = numGroups * sizeof(float);
 79 |     readbackDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
 80 |     readbackDesc.StructureByteStride = sizeof(float);
 81 |     ID3D11Buffer* readbackBuffer = nullptr;
 82 |     HR_CHECK(device->CreateBuffer(&readbackDesc, nullptr, &readbackBuffer));
 83 | 
 84 |     context->CopyResource(readbackBuffer, outputBuffer);
 85 |     D3D11_MAPPED_SUBRESOURCE mappedResource;
 86 |     HR_CHECK(context->Map(readbackBuffer, 0, D3D11_MAP_READ, 0, &mappedResource));
 87 |     memcpy(partialSums.data(), mappedResource.pData, numGroups * sizeof(float));
 88 |     context->Unmap(readbackBuffer, 0);
 89 | 
 90 |     // Step 8: Compute Final Sum on CPU
 91 |     float finalSum = 0.0f;
 92 |     for (float val : partialSums) {
 93 |         finalSum += val;
 94 |     }
 95 | 
 96 |     std::cout << "Sum of elements: " << finalSum << std::endl; // Expected: 1024
 97 | 
 98 |     // Cleanup
 99 |     inputUAV->Release();
100 |     outputUAV->Release();
101 |     inputBuffer->Release();
102 |     outputBuffer->Release();
103 |     readbackBuffer->Release();
104 |     computeShader->Release();
105 |     context->Release();
106 |     device->Release();
107 | 
108 |     return 0;
109 | }
110 | 


--------------------------------------------------------------------------------
/opengl/computing/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <vector>
  3 | #include <GL/glew.h>
  4 | #include <GLFW/glfw3.h>
  5 | #include <chrono>
  6 | 
  7 | #define ARRAY_SIZE 10240000  // 1024 * 10000
  8 | 
// Compute shader source (performs parallel reduction sum)
// Each work group of 1024 invocations copies 1024 input values into shared memory, folds
// them in halves until a single value remains and writes that partial sum to
// outputData[groupID].
// Refer to https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf for a discussion
// of sum reduction on GPU if you have trouble understanding the code.
const char* computeShaderSource = R"(
#version 310 es  // This is the only version that works on my system!
#define ARRAY_SIZE 1024
#define HALF_ARRAY_SIZE 512u

layout(std430, binding = 0) buffer InputBuffer {
    float inputData[];
};

layout(std430, binding = 1) buffer OutputBuffer {
    float outputData[];
};

shared float groupData[ARRAY_SIZE];

layout(local_size_x = ARRAY_SIZE) in;

void main() {
    uint index = gl_GlobalInvocationID.x;
    uint local_index = gl_LocalInvocationID.x;
    uint groupID = gl_WorkGroupID.x;

    groupData[local_index] = inputData[index];
    barrier();

    for (uint s = HALF_ARRAY_SIZE; s != 0u; s >>= 1) {
        if (local_index < s) {
            groupData[local_index] += groupData[local_index + s];
        }
        barrier();
    }

    if (local_index == 0u) {
        outputData[groupID] = groupData[0];
    }
}
)";
 49 | 
 50 | GLuint createComputeShader(const char* source) {
 51 |     GLuint shader = glCreateShader(GL_COMPUTE_SHADER);
 52 |     glShaderSource(shader, 1, &source, nullptr);
 53 |     glCompileShader(shader);
 54 | 
 55 |     // Check for errors
 56 |     GLint success;
 57 |     glGetShaderiv(shader, GL_COMPILE_STATUS, &success);
 58 |     if (!success) {
 59 |         char log[512];
 60 |         glGetShaderInfoLog(shader, 512, nullptr, log);
 61 |         std::cerr << "Compute Shader Compilation Error:\n" << log << std::endl;
 62 |     }
 63 | 
 64 |     GLuint program = glCreateProgram();
 65 |     glAttachShader(program, shader);
 66 |     glLinkProgram(program);
 67 | 
 68 |     glDeleteShader(shader);
 69 |     return program;
 70 | }
 71 | 
 72 | int main()
 73 | {
 74 |     // Initialize GLFW (no window needed)
 75 |     if (!glfwInit()) {
 76 |         std::cerr << "Failed to initialize GLFW\n";
 77 |         return -1;
 78 |     }
 79 |     glfwWindowHint(GLFW_VISIBLE, GLFW_FALSE); // Hide window
 80 |     GLFWwindow* window = glfwCreateWindow(100, 100, "Compute Shader", nullptr, nullptr);
 81 |     glfwMakeContextCurrent(window);
 82 | 
 83 |     // Initialize GLEW
 84 |     if (glewInit() != GLEW_OK) {
 85 |         std::cerr << "Failed to initialize GLEW\n";
 86 |         return -1;
 87 |     }
 88 | 
 89 |     // Input data
 90 |     std::vector<float> inputData(ARRAY_SIZE, 1.0f);
 91 |     size_t dataSize = inputData.size() * sizeof(float);
 92 | 
 93 |     GLuint inputBuffer, outputBuffer;
 94 |     glGenBuffers(1, &inputBuffer);
 95 |     glGenBuffers(1, &outputBuffer);
 96 | 
 97 |     glBindBuffer(GL_SHADER_STORAGE_BUFFER, inputBuffer);
 98 |     glBufferData(GL_SHADER_STORAGE_BUFFER, dataSize, inputData.data(), GL_DYNAMIC_COPY);
 99 | 
100 |     // The reduced sum is 1024 times smaller than the input.
101 |     glBindBuffer(GL_SHADER_STORAGE_BUFFER, outputBuffer);
102 |     glBufferData(GL_SHADER_STORAGE_BUFFER, dataSize / 1024, nullptr, GL_DYNAMIC_COPY);
103 | 
104 |     // Create and run compute shader
105 |     GLuint computeProgram = createComputeShader(computeShaderSource);
106 |     glUseProgram(computeProgram);
107 | 
108 |     auto begin = std::chrono::steady_clock::now();
109 |     glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, inputBuffer);
110 |     glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, outputBuffer);
111 | 
112 |     int numWorkgroups = ARRAY_SIZE / 1024;
113 |     numWorkgroups = numWorkgroups ? numWorkgroups : 1;
114 | 
115 |     glUseProgram(computeProgram);
116 |     glDispatchCompute(numWorkgroups, 1, 1);
117 |     glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
118 | 
119 |     // Retrieve result
120 |     glBindBuffer(GL_SHADER_STORAGE_BUFFER, outputBuffer);
121 |     auto end = std::chrono::steady_clock::now();
122 |     float* mappedData = (float*)glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_READ_ONLY);
123 | 
124 |     // Aggregate the reduced sums. If the input array is bigger than 1024, we have to add up
125 |     // multiple values because the array is not totally reduced. We could do another pass of the
126 |     // shader to reduce the array again... or we can just use a for loop to add up the sums :).
127 |     float total = 0;
128 |     for (unsigned int i = 0; i < numWorkgroups; i++) {
129 |         total += mappedData[i];
130 |     }
131 |     glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
132 | 
133 |     double duration = std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000000.0;
134 | 
135 |     std::cout << "Result: " << total << " (expected " << ARRAY_SIZE << ")." << std::endl;
136 |     std::cout << "Duration: " << duration << " s." << std::endl;
137 | 
138 |     // Cleanup
139 |     glDeleteProgram(computeProgram);
140 |     glDeleteBuffers(1, &inputBuffer);
141 |     glDeleteBuffers(1, &outputBuffer);
142 |     glfwDestroyWindow(window);
143 |     glfwTerminate();
144 |     return 0;
145 | }
146 | 


--------------------------------------------------------------------------------
/opengl/graphics/main.cpp:
--------------------------------------------------------------------------------
  1 | #include "glad.h"  // Load OpenGL functions
  2 | #include <GLFW/glfw3.h>  // Window management
  3 | #include <iostream>
  4 | #include <chrono>
  5 | 
// Vertex shader source code. This modifies the position of the vertices: each vertex is
// rotated in the XZ plane by the `angle` uniform, and its color is forwarded to the fragment
// shader.
const char* vertexShaderSource = R"(
    #version 330 core

    layout(location = 0) in vec3 inPos;
    layout(location = 1) in vec4 inColor;

    out vec4 fragColor;

    uniform float angle;

    void main() {
        mat2 rotation = mat2(
            cos(angle), -sin(angle),
            sin(angle),  cos(angle)
        );
        vec2 rotatedPos = rotation * inPos.xz;
        gl_Position = vec4(rotatedPos.x, inPos.y, rotatedPos.y, 1.0);
        fragColor = inColor;
    }
)";
 27 | 
// Fragment (i.e. pixel most of the time) shader source code. This determines the final colors:
// each channel of the interpolated color is quantized with `levels` = 10 and the result is
// written with an alpha of 1.
const char* fragmentShaderSource = R"(
    #version 330 core

    in vec4 fragColor;
    out vec4 outColor;

    void main() {
        float levels = 10.0;
        vec3 quantizedColor =
            floor(fragColor.rgb * levels)
            / (levels - 1.0);
        outColor = vec4(quantizedColor, 1.0);
    }
)";
 43 | 
 44 | void framebuffer_size_callback(GLFWwindow* window, int width, int height) {
 45 |     glViewport(0, 0, width, height);
 46 | }
 47 | 
 48 | int main() {
 49 |     // Initialize GLFW
 50 |     if (!glfwInit()) {
 51 |         std::cerr << "Failed to initialize GLFW\n";
 52 |         return -1;
 53 |     }
 54 | 
 55 |     // Set OpenGL version (3.3 Core Profile)
 56 |     glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
 57 |     glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
 58 |     glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
 59 | 
 60 |     // Create a window
 61 |     GLFWwindow* window = glfwCreateWindow(800, 600, "OpenGL!", NULL, NULL);
 62 |     if (!window) {
 63 |         std::cerr << "Failed to create GLFW window\n";
 64 |         glfwTerminate();
 65 |         return -1;
 66 |     }
 67 | 
 68 |     glfwMakeContextCurrent(window);
 69 | 
 70 |     // Load OpenGL functions using GLAD
 71 |     if (!gladLoadGLLoader((GLADloadproc)glfwGetProcAddress)) {
 72 |         std::cerr << "Failed to initialize GLAD\n";
 73 |         return -1;
 74 |     }
 75 | 
 76 |     glViewport(0, 0, 800, 600);
 77 |     glfwSetFramebufferSizeCallback(window, framebuffer_size_callback);
 78 | 
 79 |     // Vertex Data
 80 |     float vertices[] = {
 81 |        //  Positions        // Colors
 82 |         0.0f,  0.5f, 0.0f,  1.0f, 0.0f, 0.0f,  // Top (Red)
 83 |        -0.5f, -0.5f, 0.0f,  0.0f, 1.0f, 0.0f,  // Left (Green)
 84 |         0.5f, -0.5f, 0.0f,  0.0f, 0.0f, 1.0f   // Right (Blue)
 85 |     };
 86 | 
 87 |     // Create a Vertex Buffer Object (VBO) and Vertex Array Object (VAO)
 88 |     unsigned int VBO, VAO;
 89 |     glGenVertexArrays(1, &VAO);
 90 |     glGenBuffers(1, &VBO);
 91 | 
 92 |     // Bind the VAO
 93 |     glBindVertexArray(VAO);
 94 | 
 95 |     // Bind and fill VBO
 96 |     glBindBuffer(GL_ARRAY_BUFFER, VBO);
 97 |     glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
 98 | 
 99 |     // Define vertex attributes (location = 0)
100 |     glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 6 * sizeof(float), (void*)0);
101 |     glEnableVertexAttribArray(0);
102 | 
103 |     // Color attribute (location = 1)
104 |     glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, 6 * sizeof(float), (void*)(3 * sizeof(float)));
105 |     glEnableVertexAttribArray(1);
106 | 
107 |     // Create and compile Vertex Shader
108 |     unsigned int vertexShader = glCreateShader(GL_VERTEX_SHADER);
109 |     glShaderSource(vertexShader, 1, &vertexShaderSource, NULL);
110 |     glCompileShader(vertexShader);
111 | 
112 |     // Create and compile Fragment Shader
113 |     unsigned int fragmentShader = glCreateShader(GL_FRAGMENT_SHADER);
114 |     glShaderSource(fragmentShader, 1, &fragmentShaderSource, NULL);
115 |     glCompileShader(fragmentShader);
116 | 
117 |     // Link shaders into a Shader Program
118 |     unsigned int shaderProgram = glCreateProgram();
119 |     glAttachShader(shaderProgram, vertexShader);
120 |     glAttachShader(shaderProgram, fragmentShader);
121 |     glLinkProgram(shaderProgram);
122 | 
123 |     // Cleanup
124 |     glDeleteShader(vertexShader);
125 |     glDeleteShader(fragmentShader);
126 | 
127 |     // Render Loop
128 |     float rotationAngle = 0.0f;
129 |     auto now = std::chrono::steady_clock::now();
130 |     while (!glfwWindowShouldClose(window)) {
131 |         glClear(GL_COLOR_BUFFER_BIT);  // Clear screen
132 | 
133 |         int angleLocation = glGetUniformLocation(shaderProgram, "angle");
134 |         glUniform1f(angleLocation, rotationAngle);
135 |         glUseProgram(shaderProgram);
136 |         glBindVertexArray(VAO);
137 |         glDrawArrays(GL_TRIANGLES, 0, 3);  // Draw triangle
138 | 
139 |         glfwSwapBuffers(window);
140 |         glfwPollEvents();
141 | 
142 |         auto end = std::chrono::steady_clock::now();
143 |         auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - now).count() / 1000.0;
144 |         rotationAngle += duration * 0.5;
145 |         glBindBuffer(GL_ARRAY_BUFFER, VBO);
146 |         glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
147 |         now = end;
148 |     }
149 | 
150 |     // Cleanup
151 |     glDeleteVertexArrays(1, &VAO);
152 |     glDeleteBuffers(1, &VBO);
153 |     glDeleteProgram(shaderProgram);
154 |     glfwDestroyWindow(window);
155 |     glfwTerminate();
156 | 
157 |     return 0;
158 | }
159 | 


--------------------------------------------------------------------------------
/webgpu/compute/src/main.rs:
--------------------------------------------------------------------------------
  1 | // Based on the file https://github.com/googlefonts/compute-shader-101, licensed under the MIT
  2 | // license
  3 | 
  4 | use std::time::Instant;
  5 | use wgpu::{util::DeviceExt, PipelineCompilationOptions};
  6 | use bytemuck;
  7 | 
  8 | const N_ELEMENTS: u32 = 1024;
  9 | 
 10 | async fn run() {
 11 |     let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor::default());
 12 |     let adapter = instance.request_adapter(&Default::default()).await.unwrap();
 13 |     let features = adapter.features();
 14 |     let (device, queue) = adapter
 15 |         .request_device(
 16 |             &wgpu::DeviceDescriptor::default(),
 17 |             None,
 18 |         )
 19 |         .await
 20 |         .unwrap();
 21 |     let query_set = if false && features.contains(wgpu::Features::TIMESTAMP_QUERY) {
 22 |         Some(device.create_query_set(&wgpu::QuerySetDescriptor {
 23 |             count: 2,
 24 |             ty: wgpu::QueryType::Timestamp,
 25 |             label: None,
 26 |         }))
 27 |     } else {
 28 |         None
 29 |     };
 30 | 
 31 |     let start_instant = Instant::now();
 32 |     let cs_module = device.create_shader_module(wgpu::ShaderModuleDescriptor {
 33 |         label: None,
 34 |         source: wgpu::ShaderSource::Wgsl(include_str!("shader.wgsl").into()),
 35 |     });
 36 |     println!("shader compilation {:?}", start_instant.elapsed());
 37 |     let input_v = (0..N_ELEMENTS).map(|i| i as f32).collect::<Vec<_>>();
 38 |     let input: &[u8] = bytemuck::cast_slice(&input_v);
 39 |     let input_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
 40 |         label: Some("Input"),
 41 |         contents: input,
 42 |         usage: wgpu::BufferUsages::STORAGE
 43 |             | wgpu::BufferUsages::COPY_SRC,
 44 |     });
 45 |     let storage_buffer = device.create_buffer(&wgpu::BufferDescriptor {
 46 |         label: Some("Output"),
 47 |         size: input.len() as u64,
 48 |         usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
 49 |         mapped_at_creation: false,
 50 |     });
 51 |     let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
 52 |         label: Some("Staging Buffer"),
 53 |         size: input.len() as u64,
 54 |         usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
 55 |         mapped_at_creation: false,
 56 |     });
 57 |     let query_buf = device.create_buffer(&wgpu::BufferDescriptor {
 58 |         label: None,
 59 |         size: 16,
 60 |         usage: wgpu::BufferUsages::COPY_SRC | wgpu::BufferUsages::QUERY_RESOLVE,
 61 |         mapped_at_creation: false,
 62 |     });
 63 |     let query_staging_buf = device.create_buffer(&wgpu::BufferDescriptor {
 64 |         label: None,
 65 |         size: 16,
 66 |         usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
 67 |         mapped_at_creation: false,
 68 |     });
 69 | 
 70 |     let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
 71 |         label: None,
 72 |         entries: &[
 73 |             // Input
 74 |             wgpu::BindGroupLayoutEntry {
 75 |                 binding: 0,
 76 |                 visibility: wgpu::ShaderStages::COMPUTE,
 77 |                 ty: wgpu::BindingType::Buffer {
 78 |                     ty: wgpu::BufferBindingType::Storage { read_only: true },
 79 |                     has_dynamic_offset: false,
 80 |                     min_binding_size: None,
 81 |                 },
 82 |                 count: None,
 83 |             },
 84 |             // Output
 85 |             wgpu::BindGroupLayoutEntry {
 86 |                 binding: 1,
 87 |                 visibility: wgpu::ShaderStages::COMPUTE,
 88 |                 ty: wgpu::BindingType::Buffer {
 89 |                     ty: wgpu::BufferBindingType::Storage { read_only: false },
 90 |                     has_dynamic_offset: false,
 91 |                     min_binding_size: None,
 92 |                 },
 93 |                 count: None,
 94 |             }
 95 |         ],
 96 |     });
 97 |     let compute_pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
 98 |         label: None,
 99 |         bind_group_layouts: &[&bind_group_layout],
100 |         push_constant_ranges: &[],
101 |     });
102 |     let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
103 |         label: None,
104 |         layout: Some(&compute_pipeline_layout),
105 |         module: &cs_module,
106 |         entry_point: Some("main"),
107 |         cache: None,
108 |         compilation_options: PipelineCompilationOptions::default(),
109 |     });
110 | 
111 |     let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
112 |         label: None,
113 |         layout: &bind_group_layout,
114 |         entries: &[
115 |             // Input
116 |             wgpu::BindGroupEntry {
117 |                 binding: 0,
118 |                 resource: input_buf.as_entire_binding(),
119 |             },
120 |             // Output
121 |             wgpu::BindGroupEntry {
122 |                 binding: 1,
123 |                 resource: storage_buffer.as_entire_binding(),
124 |             }
125 |         ],
126 |     });
127 | 
128 |     let mut encoder = device.create_command_encoder(&Default::default());
129 |     if let Some(query_set) = &query_set {
130 |         encoder.write_timestamp(query_set, 0);
131 |     }
132 |     {
133 |         let mut cpass = encoder.begin_compute_pass(&Default::default());
134 |         cpass.set_pipeline(&pipeline);
135 |         cpass.set_bind_group(0, &bind_group, &[]);
136 |         cpass.dispatch_workgroups(input_v.len() as u32, 1, 1);
137 |     }
138 |     if let Some(query_set) = &query_set {
139 |         encoder.write_timestamp(query_set, 1);
140 |     }
141 |     if let Some(query_set) = &query_set {
142 |         encoder.resolve_query_set(query_set, 0..2, &query_buf, 0);
143 |     }
144 |     encoder.copy_buffer_to_buffer(&query_buf, 0, &query_staging_buf, 0, 16);
145 |     encoder.copy_buffer_to_buffer(&storage_buffer, 0, &staging_buffer, 0, input.len() as u64);
146 |     queue.submit(Some(encoder.finish()));
147 | 
148 |     let buf_slice = staging_buffer.slice(..);
149 |     let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel();
150 | 
151 |     buf_slice.map_async(wgpu::MapMode::Read, move |result| {
152 |         sender.send(result).unwrap();
153 |     });
154 | 
155 |     device.poll(wgpu::Maintain::Wait);
156 |     receiver.receive().await.unwrap().unwrap();
157 | 
158 |     let data = buf_slice.get_mapped_range();
159 |     let result: &[f32] = bytemuck::cast_slice(&data);
160 |     println!("Sum of the elements in the array: {:?}", result[0]);
161 | 
162 |     drop(data);
163 |     staging_buffer.unmap();
164 | }
165 | 
166 | fn main() {
167 |     pollster::block_on(run());
168 | }
169 | 


--------------------------------------------------------------------------------
/directx/graphics/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <windows.h>
  2 | #include <d3d11.h>
  3 | #include <d3dcompiler.h>
  4 | #include <DirectXMath.h>
  5 | #include <iostream>
  6 | 
  7 | // Link required libraries
  8 | #pragma comment(lib, "d3d11.lib")
  9 | #pragma comment(lib, "d3dcompiler.lib")
 10 | 
 11 | // Direct3D globals
 12 | IDXGISwapChain* swapChain = nullptr;
 13 | ID3D11Device* device = nullptr;
 14 | ID3D11DeviceContext* deviceContext = nullptr;
 15 | ID3D11RenderTargetView* renderTargetView = nullptr;
 16 | ID3D11VertexShader* vertexShader = nullptr;
 17 | ID3D11PixelShader* pixelShader = nullptr;
 18 | ID3D11InputLayout* inputLayout = nullptr;
 19 | ID3D11Buffer* vertexBuffer = nullptr;
 20 | 
// Vertex structure: one element of the vertex buffer.
// The input layout declared in InitD3D reads `position` at byte offset 0 and
// `color` at byte offset 12, so the member order and types must not change
// without updating that layout.
struct Vertex {
    DirectX::XMFLOAT3 position;  // bound to the POSITION semantic (three 32-bit floats)
    DirectX::XMFLOAT4 color;     // bound to the COLOR semantic (four 32-bit floats)
};
 26 | 
// Shader source
// HLSL vertex shader, compiled at startup by InitD3D with entry point "main"
// and profile "vs_5_0". It widens the incoming position to a float4 with
// w = 1.0 and forwards the vertex color unchanged.
const char* vertexShaderSource = R"(
    struct VS_INPUT {
        float3 pos : POSITION;
        float4 color : COLOR;
    };

    struct PS_INPUT {
        float4 pos : SV_POSITION;
        float4 color : COLOR;
    };

    PS_INPUT main(VS_INPUT input) {
        PS_INPUT output;
        output.pos = float4(input.pos, 1.0);
        output.color = input.color;
        return output;
    }
)";
 46 | 
// HLSL pixel shader, compiled at startup by InitD3D with entry point "main"
// and profile "ps_5_0". It returns the color received from the vertex stage
// without modification.
const char* pixelShaderSource = R"(
    struct PS_INPUT {
        float4 pos : SV_POSITION;
        float4 color : COLOR;
    };

    float4 main(PS_INPUT input) : SV_TARGET {
        return input.color;
    }
)";
 57 | 
 58 | // Window Procedure
 59 | LRESULT CALLBACK WindowProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam) {
 60 |     if (uMsg == WM_DESTROY) {
 61 |         PostQuitMessage(0);
 62 |         return 0;
 63 |     }
 64 |     return DefWindowProc(hwnd, uMsg, wParam, lParam);
 65 | }
 66 | 
 67 | // Compile shader
 68 | ID3DBlob* CompileShader(const char* source, const char* entryPoint, const char* target) {
 69 |     ID3DBlob* shaderBlob = nullptr;
 70 |     ID3DBlob* errorBlob = nullptr;
 71 |     if (FAILED(D3DCompile(source, strlen(source), nullptr, nullptr, nullptr, entryPoint, target, 0, 0, &shaderBlob, &errorBlob))) {
 72 |         if (errorBlob) {
 73 |             std::cerr << (char*)errorBlob->GetBufferPointer() << std::endl;
 74 |             errorBlob->Release();
 75 |         }
 76 |         return nullptr;
 77 |     }
 78 |     return shaderBlob;
 79 | }
 80 | 
 81 | // Initialize Direct3D
 82 | bool InitD3D(HWND hwnd) {
 83 |     DXGI_SWAP_CHAIN_DESC scd = {};
 84 |     scd.BufferCount = 1;
 85 |     scd.BufferDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
 86 |     scd.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
 87 |     scd.OutputWindow = hwnd;
 88 |     scd.SampleDesc.Count = 1;
 89 |     scd.Windowed = TRUE;
 90 |     scd.SwapEffect = DXGI_SWAP_EFFECT_DISCARD;
 91 | 
 92 |     if (FAILED(D3D11CreateDeviceAndSwapChain(nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0, nullptr, 0,
 93 |         D3D11_SDK_VERSION, &scd, &swapChain, &device, nullptr, &deviceContext))) {
 94 |         return false;
 95 |     }
 96 | 
 97 |     D3D11_VIEWPORT viewport = {};
 98 |     viewport.Width = 800.0f;
 99 |     viewport.Height = 600.0f;
100 |     viewport.MinDepth = 0.0f;
101 |     viewport.MaxDepth = 1.0f;
102 |     viewport.TopLeftX = 0;
103 |     viewport.TopLeftY = 0;
104 | 
105 |     deviceContext->RSSetViewports(1, &viewport);
106 | 
107 |     // Get back buffer
108 |     ID3D11Texture2D* backBuffer = nullptr;
109 |     swapChain->GetBuffer(0, __uuidof(ID3D11Texture2D), (void**)&backBuffer);
110 |     device->CreateRenderTargetView(backBuffer, nullptr, &renderTargetView);
111 |     backBuffer->Release();
112 |     deviceContext->OMSetRenderTargets(1, &renderTargetView, nullptr);
113 | 
114 |     // Compile shaders
115 |     ID3DBlob* vsBlob = CompileShader(vertexShaderSource, "main", "vs_5_0");
116 |     ID3DBlob* psBlob = CompileShader(pixelShaderSource, "main", "ps_5_0");
117 | 
118 |     if (!vsBlob || !psBlob) return false;
119 | 
120 |     // Create shaders
121 |     device->CreateVertexShader(vsBlob->GetBufferPointer(), vsBlob->GetBufferSize(), nullptr, &vertexShader);
122 |     device->CreatePixelShader(psBlob->GetBufferPointer(), psBlob->GetBufferSize(), nullptr, &pixelShader);
123 | 
124 |     // Input layout
125 |     D3D11_INPUT_ELEMENT_DESC layoutDesc[] = {
126 |         { "POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0 },
127 |         { "COLOR", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0 }
128 |     };
129 |     device->CreateInputLayout(layoutDesc, 2, vsBlob->GetBufferPointer(), vsBlob->GetBufferSize(), &inputLayout);
130 |     deviceContext->IASetInputLayout(inputLayout);
131 | 
132 |     vsBlob->Release();
133 |     psBlob->Release();
134 | 
135 |     // Triangle vertices
136 |     Vertex vertices[] = {
137 |         {{  0.0f,  0.5f, 0.0f }, { 1.0f, 0.0f, 0.0f, 1.0f }},
138 |         {{  0.5f, -0.5f, 0.0f }, { 0.0f, 0.0f, 1.0f, 1.0f }},
139 |         {{ -0.5f, -0.5f, 0.0f }, { 0.0f, 1.0f, 0.0f, 1.0f }}
140 |     };
141 | 
142 |     // Create vertex buffer
143 |     D3D11_BUFFER_DESC bufferDesc = {};
144 |     bufferDesc.Usage = D3D11_USAGE_DEFAULT;
145 |     bufferDesc.ByteWidth = sizeof(vertices);
146 |     bufferDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
147 | 
148 |     D3D11_SUBRESOURCE_DATA initData = { vertices };
149 |     device->CreateBuffer(&bufferDesc, &initData, &vertexBuffer);
150 | 
151 |     return true;
152 | }
153 | 
154 | // Render function
155 | void Render() {
156 |     float clearColor[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
157 |     deviceContext->ClearRenderTargetView(renderTargetView, clearColor);
158 | 
159 |     // Set shaders
160 |     deviceContext->VSSetShader(vertexShader, nullptr, 0);
161 |     deviceContext->PSSetShader(pixelShader, nullptr, 0);
162 | 
163 |     // Bind vertex buffer
164 |     UINT stride = sizeof(Vertex);
165 |     UINT offset = 0;
166 |     deviceContext->IASetVertexBuffers(0, 1, &vertexBuffer, &stride, &offset);
167 |     deviceContext->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
168 | 
169 |     // Draw triangle
170 |     deviceContext->Draw(3, 0);
171 | 
172 |     swapChain->Present(1, 0);
173 | }
174 | 
175 | // Cleanup
176 | void CleanupD3D() {
177 |     if (swapChain) swapChain->Release();
178 |     if (renderTargetView) renderTargetView->Release();
179 |     if (deviceContext) deviceContext->Release();
180 |     if (device) device->Release();
181 |     if (vertexShader) vertexShader->Release();
182 |     if (pixelShader) pixelShader->Release();
183 |     if (inputLayout) inputLayout->Release();
184 |     if (vertexBuffer) vertexBuffer->Release();
185 | }
186 | 
187 | // Main function
188 | int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE, LPSTR, int nCmdShow) {
189 |     WNDCLASS wc = { 0 };
190 |     wc.lpfnWndProc = WindowProc;
191 |     wc.hInstance = hInstance;
192 |     wc.lpszClassName = "Direct3DWindowClass";
193 |     RegisterClass(&wc);
194 | 
195 |     HWND hwnd = CreateWindowEx(0, wc.lpszClassName, "Direct3D Triangle", WS_OVERLAPPEDWINDOW,
196 |         100, 100, 800, 600, nullptr, nullptr, hInstance, nullptr);
197 |     ShowWindow(hwnd, nCmdShow);
198 | 
199 |     if (!InitD3D(hwnd)) return -1;
200 | 
201 |     MSG msg = {};
202 |     while (msg.message != WM_QUIT) {
203 |         if (PeekMessage(&msg, nullptr, 0, 0, PM_REMOVE)) {
204 |             TranslateMessage(&msg);
205 |             DispatchMessage(&msg);
206 |         } else {
207 |             Render();
208 |         }
209 |     }
210 | 
211 |     CleanupD3D();
212 |     return 0;
213 | }
214 | 


--------------------------------------------------------------------------------
/webgpu/graphics/src/main.rs:
--------------------------------------------------------------------------------
  1 | // Based on the file https://github.com/sotrh/learn-wgpu, licensed under the MIT
  2 | // license
  3 | 
  4 | use glfw::{fail_on_errors, Action, Key, Window, WindowHint, ClientApiHint};
  5 | use std::env::current_dir;
  6 | use std::fs;
  7 | 
  8 | pub struct PipelineBuilder {
  9 |     shader_filename: String,
 10 |     vertex_entry: String,
 11 |     fragment_entry: String,
 12 |     pixel_format: wgpu::TextureFormat,
 13 | }
 14 | 
 15 | impl PipelineBuilder {
 16 | 
 17 |     pub fn new() -> Self {
 18 |         PipelineBuilder {
 19 |             shader_filename: String::new(),
 20 |             vertex_entry: String::new(),
 21 |             fragment_entry: String::new(),
 22 |             pixel_format: wgpu::TextureFormat::Rgba8Unorm,
 23 |         }
 24 |     }
 25 | 
 26 |     pub fn set_shader_module(&mut self, shader_filename: &str, vertex_entry: &str, fragment_entry: &str) {
 27 | 
 28 |         self.shader_filename = shader_filename.to_string();
 29 |         self.vertex_entry = vertex_entry.to_string();
 30 |         self.fragment_entry = fragment_entry.to_string();
 31 |     }
 32 | 
 33 |     pub fn set_pixel_format(&mut self, pixel_format: wgpu::TextureFormat) {
 34 | 
 35 |         self.pixel_format = pixel_format;
 36 |     }
 37 | 
 38 |     pub fn build_pipeline(&self, device: &wgpu::Device) -> wgpu::RenderPipeline {
 39 | 
 40 |         let mut filepath = current_dir().unwrap();
 41 |         filepath.push("src/");
 42 |         filepath.push(self.shader_filename.as_str());
 43 |         let filepath = filepath.into_os_string().into_string().unwrap();
 44 |         let source_code = fs::read_to_string(filepath).expect("Failed to read the source code.");
 45 | 
 46 |         let shader_module_descriptor = wgpu::ShaderModuleDescriptor {
 47 |             label: Some("Shader Module"),
 48 |             source: wgpu::ShaderSource::Wgsl(source_code.into()),
 49 |         };
 50 |         let shader_module = device.create_shader_module(shader_module_descriptor);
 51 | 
 52 |         let pipeline_layout_descriptor = wgpu::PipelineLayoutDescriptor {
 53 |             label: Some("Render Pipeline Layout"),
 54 |             bind_group_layouts: &[],
 55 |             push_constant_ranges: &[],
 56 |         };
 57 |         let pipeline_layout = device.create_pipeline_layout(&pipeline_layout_descriptor);
 58 | 
 59 |         let render_targets = [Some(wgpu::ColorTargetState {
 60 |             format: self.pixel_format,
 61 |             blend: Some(wgpu::BlendState::REPLACE),
 62 |             write_mask: wgpu::ColorWrites::ALL,
 63 |         })];
 64 | 
 65 |         let render_pipeline_descriptor = wgpu::RenderPipelineDescriptor {
 66 |             label: Some("Render Pipeline"),
 67 |             layout: Some(&pipeline_layout),
 68 | 
 69 |             vertex: wgpu::VertexState {
 70 |                 module: &shader_module,
 71 |                 entry_point: Some(&self.vertex_entry),
 72 |                 buffers: &[],
 73 |                 compilation_options: wgpu::PipelineCompilationOptions::default(),
 74 |             },
 75 | 
 76 |             primitive: wgpu::PrimitiveState {
 77 |                 topology: wgpu::PrimitiveTopology::TriangleList,
 78 |                 strip_index_format: None,
 79 |                 front_face: wgpu::FrontFace::Ccw,
 80 |                 cull_mode: Some(wgpu::Face::Back),
 81 |                 polygon_mode: wgpu::PolygonMode::Fill,
 82 |                 unclipped_depth: false,
 83 |                 conservative: false,
 84 |             },
 85 | 
 86 |             fragment: Some(wgpu::FragmentState {
 87 |                 module: &shader_module,
 88 |                 entry_point: Some(&self.fragment_entry),
 89 |                 targets: &render_targets,
 90 |                 compilation_options: wgpu::PipelineCompilationOptions::default(),
 91 |             }),
 92 | 
 93 |             depth_stencil: None,
 94 |             multisample: wgpu::MultisampleState {
 95 |                 count: 1,
 96 |                 mask: !0,
 97 |                 alpha_to_coverage_enabled: false,
 98 |             },
 99 |             multiview: None,
100 |             cache: None,
101 |         };
102 | 
103 |         device.create_render_pipeline(&render_pipeline_descriptor)
104 |     }
105 | }
106 | 
107 | struct State<'a> {
108 |     instance: wgpu::Instance,
109 |     surface: wgpu::Surface<'a>,
110 |     device: wgpu::Device,
111 |     queue: wgpu::Queue,
112 |     config: wgpu::SurfaceConfiguration,
113 |     size: (i32, i32),
114 |     window: &'a mut Window,
115 |     render_pipeline: wgpu::RenderPipeline,
116 | }
117 | 
118 | impl<'a> State<'a> {
119 | 
120 |     async fn new(window: &'a mut Window) -> Self {
121 | 
122 |         let size = window.get_framebuffer_size();
123 | 
124 |         let instance_descriptor = wgpu::InstanceDescriptor {
125 |             backends: wgpu::Backends::all(), ..Default::default()
126 |         };
127 |         let instance = wgpu::Instance::new(&instance_descriptor);
128 |         let surface = instance.create_surface(window.render_context()).unwrap();
129 | 
130 |         let adapter_descriptor = wgpu::RequestAdapterOptionsBase {
131 |             power_preference: wgpu::PowerPreference::default(),
132 |             compatible_surface: Some(&surface),
133 |             force_fallback_adapter: false,
134 |         };
135 |         let adapter = instance.request_adapter(&adapter_descriptor)
136 |             .await.unwrap();
137 | 
138 |         let device_descriptor = wgpu::DeviceDescriptor {
139 |             required_features: wgpu::Features::empty(),
140 |             required_limits: wgpu::Limits::default(),
141 |             label: Some("Device"),
142 |             memory_hints: wgpu::MemoryHints::default(),
143 |         };
144 |         let (device, queue) = adapter
145 |             .request_device(&device_descriptor, None)
146 |             .await.unwrap();
147 | 
148 | 
149 |         let surface_capabilities = surface.get_capabilities(&adapter);
150 |         let surface_format = surface_capabilities
151 |             .formats
152 |             .iter()
153 |             .copied()
154 |             .filter(|f | f.is_srgb())
155 |             .next()
156 |             .unwrap_or(surface_capabilities.formats[0]);
157 |         let config = wgpu::SurfaceConfiguration {
158 |             usage: wgpu::TextureUsages::RENDER_ATTACHMENT,
159 |             format: surface_format,
160 |             width: size.0 as u32,
161 |             height: size.1 as u32,
162 |             present_mode: surface_capabilities.present_modes[0],
163 |             alpha_mode: surface_capabilities.alpha_modes[0],
164 |             view_formats: vec![],
165 |             desired_maximum_frame_latency: 2
166 |         };
167 |         surface.configure(&device, &config);
168 | 
169 |         let mut pipeline_builder = PipelineBuilder::new();
170 |         pipeline_builder.set_shader_module("shader.wgsl", "vertices", "fragment");
171 |         pipeline_builder.set_pixel_format(config.format);
172 |         let render_pipeline = pipeline_builder.build_pipeline(&device);
173 | 
174 |         Self {
175 |             instance,
176 |             window,
177 |             surface,
178 |             device,
179 |             queue,
180 |             config,
181 |             size,
182 |             render_pipeline,
183 |         }
184 |     }
185 | 
186 |     fn resize(&mut self, new_size: (i32, i32)) {
187 |         if new_size.0 > 0 && new_size.1 > 0 {
188 |             self.size = new_size;
189 |             self.config.width = new_size.0 as u32;
190 |             self.config.height = new_size.1 as u32;
191 |             self.surface.configure(&self.device, &self.config);
192 |         }
193 |     }
194 | 
195 |     fn update_surface(&mut self) {
196 |         self.surface = self.instance.create_surface(self.window.render_context()).unwrap();
197 |     }
198 | 
199 |     fn render(&mut self) -> Result<(), wgpu::SurfaceError>{
200 | 
201 |         let drawable = self.surface.get_current_texture()?;
202 |         let image_view_descriptor = wgpu::TextureViewDescriptor::default();
203 |         let image_view = drawable.texture.create_view(&image_view_descriptor);
204 | 
205 |         let command_encoder_descriptor = wgpu::CommandEncoderDescriptor {
206 |             label: Some("Render Encoder")
207 |         };
208 |         let mut command_encoder = self.device.create_command_encoder(&command_encoder_descriptor);
209 | 
210 |         let color_attachment = wgpu::RenderPassColorAttachment {
211 |             view: &image_view,
212 |             resolve_target: None,
213 |             ops: wgpu::Operations {
214 |                 load: wgpu::LoadOp::Clear(wgpu::Color {
215 |                     r: 0.0,
216 |                     g: 0.0,
217 |                     b: 0.0,
218 |                     a: 1.0
219 |                 }),
220 |                 store: wgpu::StoreOp::Store,
221 |             },
222 |         };
223 | 
224 |         let render_pass_descriptor = wgpu::RenderPassDescriptor {
225 |             label: Some("Render Pass"),
226 |             color_attachments: &[Some(color_attachment)],
227 |             depth_stencil_attachment: None,
228 |             occlusion_query_set: None,
229 |             timestamp_writes: None
230 |         };
231 | 
232 |         {
233 |             let mut renderpass = command_encoder.begin_render_pass(&render_pass_descriptor);
234 |             renderpass.set_pipeline(&self.render_pipeline);
235 |             renderpass.draw(0..3, 0..1);
236 |         }
237 |         self.queue.submit(std::iter::once(command_encoder.finish()));
238 | 
239 |         drawable.present();
240 | 
241 |         Ok(())
242 |     }
243 | }
244 | 
245 | async fn run() {
246 | 
247 |     let mut glfw = glfw::init(fail_on_errors!())
248 |         .unwrap();
249 |     glfw.window_hint(WindowHint::ClientApi(ClientApiHint::NoApi));
250 |     let (mut window, events) =
251 |         glfw.create_window(
252 |             800, 600, "WGPU Graphics",
253 |             glfw::WindowMode::Windowed).unwrap();
254 | 
255 |     let mut state = State::new(&mut window).await;
256 | 
257 |     state.window.set_framebuffer_size_polling(true);
258 |     state.window.set_key_polling(true);
259 |     state.window.set_mouse_button_polling(true);
260 |     state.window.set_pos_polling(true);
261 | 
262 |     while !state.window.should_close() {
263 |         glfw.poll_events();
264 |         for (_, event) in glfw::flush_messages(&events) {
265 |             match event {
266 | 
267 |                 glfw::WindowEvent::Key(Key::Escape, _, Action::Press, _) => {
268 |                     state.window.set_should_close(true)
269 |                 }
270 | 
271 |                 glfw::WindowEvent::Pos(..) => {
272 |                     state.update_surface();
273 |                     state.resize(state.size);
274 |                 }
275 | 
276 |                 glfw::WindowEvent::FramebufferSize(width, height) => {
277 |                     state.update_surface();
278 |                     state.resize((width, height));
279 |                 }
280 |                 _ => {}
281 |             }
282 |         }
283 | 
284 |         match state.render() {
285 |             Ok(_) => {},
286 |             Err(wgpu::SurfaceError::Lost | wgpu::SurfaceError::Outdated) => {
287 |                 state.update_surface();
288 |                 state.resize(state.size);
289 |             },
290 |             Err(e) => eprintln!("{:?}", e),
291 |         }
292 |     }
293 | }
294 | 
295 | fn main() {
296 |     pollster::block_on(run());
297 | }
298 | 


--------------------------------------------------------------------------------
/readme.rst:
--------------------------------------------------------------------------------
  1 | gpu-arena
  2 | =========
  3 | 
  4 | - `English (en) <#a-guided-tour-of-gpu-programming-frameworks>`_
  5 | - `Français (fr) <#visite-guidée-de-cadres-logiciels-pour-processeurs-graphiques>`_
  6 | 
  7 | .. image:: assets/triangle.gif
  8 |    :width: 500
  9 |    :align: center
 10 |    :alt: Demonstration of simple 3D graphics. A colored triangle rotates on its vertical axis in
 11 |       front of a black background. The corners of the triangle are red, blue, and green, and the
 12 |       center of the triangle is colored in shades of these colors.
 13 | 
 14 | 
 15 | A Guided Tour of GPU Programming Frameworks
 16 | +++++++++++++++++++++++++++++++++++++++++++
 17 | 
 18 | Self-contained projects that show how to install GPU programming frameworks, build
 19 | GPU-accelerated programs, and execute them. Click on the links in the index table below to access
 20 | the ``readme`` file of each project for more information.
 21 | 
 22 | The projects are minimal examples, not complete tutorials! Each ``readme`` file provides references
 23 | to more detailed resources. Contributions are welcome - you can enrich the current projects and even
 24 | add other GPU programming frameworks!
 25 | 
 26 | 
 27 | Project Index
 28 | -------------
 29 | 
 30 | Click on the links in the leftmost column to access the corresponding subdirectory.  ``Y`` indicates
 31 | that the framework supports the application or device. ``N`` indicates that it does not support
 32 | them.
 33 | 
 34 | +------------------------------------------+----------------------------+-------------------------------------------+---------------+------------------+
 35 | | Framework                                | Applications               | Devices                                   | Operating     | Shading / kernel |
 36 | |                                          +----------+-----------------+-----+-------+-------+-----+---------------+ Systems       | language         |
 37 | |                                          | Graphics | General-purpose | CPU |Nvidia | Intel | AMD | Apple Silicon |               |                  |
 38 | +==========================================+==========+=================+=====+=======+=======+=====+===============+===============+==================+
 39 | |`OpenGL <opengl/readme.md>`__             | Y        | Y (since        | N   | Y     | Y     | Y   | N             | Any           | GLSL             |
 40 | |                                          |          | version 4.3,    |     |       |       |     |               | (deprecated   |                  |
 41 | |                                          |          | 2012)           |     |       |       |     |               | on Mac)       |                  |
 42 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
 43 | |`Metal <metal/readme.md>`__               | Y*       | Y*              | N   | N     | N     | N   | Y             | Mac / iOS     | MSL              |
 44 | |                                          |          |                 |     |       |       |     |               |               |                  |
 45 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
 46 | |`DirectX <directx/readme.md>`__           | Y        | Y               | N   | Y     | Y     | Y   | N             | Windows       | HLSL             |
 47 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
 48 | |`Vulkan <vulkan/readme.md>`__             | Y        | Y (implemented  | N   | Y     | Y     | Y   | N             | Any           | Anything that    |
 49 | |                                          |          | with kompute)   |     |       |       |     |               | (deprecated   | compiles to      |
 50 | |                                          |          |                 |     |       |       |     |               | on Mac)       | SPIR-V           |
 51 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
 52 | |`WebGPU <webgpu/readme.md>`__             | Y        | Y               | N   | Y     | Y     | Y   | Y             | Any           | WGSL             |
 53 | |                                          |          |                 |     |       |       |     |               |               |                  |
 54 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
 55 | |`CUDA <cuda/readme.md>`__                 | N        | Y               | N   | Y     | N     | N   | N             | Windows,      | CUDA             |
 56 | |                                          |          |                 |     |       |       |     |               | Linux         |                  |
 57 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
 58 | |`OpenCL <opencl/readme.md>`__             | N        | Y               | Y   | Y     | Y     | Y   | Y             | Any           | OpenCL C         |
 59 | |                                          |          |                 |     |       |       |     |               | (deprecated   |                  |
 60 | |                                          |          |                 |     |       |       |     |               | on Mac)       |                  |
 61 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
 62 | |`SYCL <sycl/readme.md>`__                 | N        | Y*              | Y   | Y     | Y     | Y   | Y             | Any (CPU-only | C++ extensions   |
 63 | |                                          |          |                 |     |       |       |     |               | on Mac)       |                  |
 64 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
 65 | |`Triton <triton/readme.md>`__             | N        | Y               | N   | Y     | N     | Y   | N             | Linux         | Decorated Python |
 66 | |                                          |          |                 |     |       |       |     |               |               | functions        |
 67 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
 68 | | `OpenMP <openmp/readme.md>`__            | N        | Y               | Y   | Y     | Y     | Y   | Y             | Any           | Compiler         |
 69 | |                                          |          |                 |     |       |       |     |               |               | directives       |
 70 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
 71 | | `AcceleratedKernels.jl                   | N        | Y               | Y   | Y     | Y     | Y   | Y             | Any           | Julia functions  |
 72 | | <AcceleratedKernels.jl/readme.md>`__     |          |                 |     |       |       |     |               |               |                  |
 73 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
 74 | |`CPU <cpu/readme.md>`__ (baseline)        | N        | Y               | Y   | N     | N     | N   | N             | Any           | N/A              |
 75 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
 76 | 
 77 | - ``*``: The corresponding example is not implemented in the project.
 78 | 
 79 | 
 80 | Other Frameworks
 81 | ----------------
 82 | 
 83 | There are even more frameworks that can be used to program GPUs! Below are listed a few of them;
 84 | no example is implemented in this repository, but you can follow the links to learn more about
 85 | them.
 86 | 
 87 | - Bend (https://github.com/HigherOrderCO/Bend): a programming language for parallel computing.
 88 | - Chapel (https://chapel-lang.org/gpu/): another programming language for parallel computing.
 89 | - Mojo (https://www.modular.com/mojo): a programming language for heterogeneous computing.
 90 | - oneAPI (https://www.intel.com/content/www/us/en/developer/tools/oneapi/overview.html): A
 91 |   software stack for high performance computing by Intel. Based on SYCL, but also adds custom
 92 |   extensions to implement new features.
 93 | - OpenACC (https://www.openacc.org/): A parallel computing standard.
 94 | - ROCm (https://www.amd.com/fr/products/software/rocm.html): A software stack for high performance
 95 |   computing by AMD. Supports OpenCL, HIP, OpenMP.
 96 | - Slang (https://www.khronos.org/news/press/khronos-group-launches-slang-initiative-hosting-open-source-compiler-contributed-by-nvidia):
 97 |   a shading language and compiler that can target multiple APIs.
 98 | - rust-gpu (https://github.com/Rust-GPU/rust-gpu): a framework under development that enables
 99 |   seamless integration of GPU code into Rust code. It's a little like SYCL but for Rust instead of
100 |   C++, and in contrast to SYCL, rust-gpu supports both general-purpose AND graphics applications.
101 |   The project is not production-ready as of July 2025.
102 | 
103 | 
104 | GPU Projects
105 | ------------
106 | 
107 | Some projects that use GPU programming. Don't hesitate to create a PR if you want to add any!
108 | 
109 | - Artificial Intelligence:
110 |     - burn (https://github.com/tracel-ai/burn): Deep learning framework that uses WebGPU as its
111 |       backend for increased portability. It also uses SPIR-V to perform some optimizations that
112 |       WebGPU does not support.
113 |     - PyTorch (https://github.com/pytorch/pytorch): Deep learning library that uses CUDA and ROCm for
114 |       GPU acceleration.
115 |     - TensorFlow (https://github.com/tensorflow/tensorflow): Deep learning library that uses CUDA for
116 |       GPU acceleration.
117 | - Graphics:
118 |     - Godot Shaders (https://godotshaders.com/): A collection of shaders that can be used in the
119 |       Godot game engine. It uses a shading language similar to GLSL.
120 | - Physics:
121 |     - FluidX3D (https://github.com/ProjectPhysX/FluidX3D): Computational fluid dynamics software
122 |       implemented with OpenCL.
123 |     - gpu-io (https://github.com/amandaghassaei/gpu-io): A library for running physics simulations in
124 |       a browser. Implemented with WebGL.
125 |     - PixelFlow (https://github.com/diwi/PixelFlow): A physics simulation framework based on Java
126 |       and OpenGL.
127 | - Bioinformatics:
128 |     - genome-spy (https://github.com/genome-spy/genome-spy): Toolkit for analyzing genomic data
129 |       implemented with WebGL.
130 |     - GenomeWorks (https://github.com/NVIDIA-Genomics-Research/GenomeWorks): CUDA-accelerated DNA
131 |       analysis and alignment SDK.
132 | - Cryptography:
133 |     - hashcat (https://github.com/hashcat/hashcat): Password recovery program implemented with
134 |       OpenMP, CUDA, and OpenCL.
135 | 
136 | 
137 | Additional Resources
138 | --------------------
139 | 
140 | - Step-by-step guide that explains how to optimize a GPU-accelerated program (CUDA):
141 |   https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
142 | - Introduction to CUDA and OpenCL programming: https://parlab.eecs.berkeley.edu/sites/all/parlab/files/CatanzaroIntroToCUDAOpenCL_0.pdf.
143 |   Check slide 27 for a comparison of the terminology used by the two frameworks.
144 | 
145 | 
146 | Benchmarking
147 | ------------
148 | 
149 | Run the Python script ``benchmark.py`` to compare how performance varies depending on the number of
150 | threads running on CPU:
151 | 
152 | .. code:: bash
153 | 
154 |    # Linux
155 |    python3 benchmark.py
156 | 
157 |    # OS that begins with the letter W
158 |    py benchmark.py
159 | 
160 | 
161 | -----
162 | 
163 | 
164 | Visite guidée de cadres logiciels pour processeurs graphiques
165 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
166 | 
167 | Ce dépôt contient des projets sans dépendances qui montrent comment installer un cadre logiciel de
168 | programmation de GPU, comment construire des programmes accélérés par GPU, et comment les exécuter.
169 | Cliquez sur les liens dans le tableau ci-dessous pour accéder à des informations supplémentaires
170 | sur chaque projet.
171 | 
172 | Ces projets sont des exemples minimalistes et non des tutoriels complets. Les fichiers
173 | ``readme`` dans chaque sous-répertoire fournissent des ressources plus détaillées.
174 | 
175 | 
176 | Index des projets
177 | -----------------
178 | 
179 | +------------------------------------------+----------------------------+-------------------------------------------+---------------+------------------+
180 | | Cadre logiciel                           | Applications               | Appareils                                 | Systèmes      | Langage de       |
181 | |                                          +----------+-----------------+-----+-------+-------+-----+---------------+ d'exploitation| nuanceurs /      |
182 | |                                          |Graphique | Calculs généraux| CPU |Nvidia | Intel | AMD | Apple Silicon |               | noyaux           |
183 | +==========================================+==========+=================+=====+=======+=======+=====+===============+===============+==================+
184 | |`OpenGL <opengl/readme.md>`__             | O        | O (depuis la    | N   | O     | O     | O   | N             | Tous          | GLSL             |
185 | |                                          |          | version 4.3,    |     |       |       |     |               | (réprouvé     |                  |
186 | |                                          |          | 2012)           |     |       |       |     |               | sur Mac)      |                  |
187 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
188 | |`DirectX <directx/readme.md>`__           | O        | O               | N   | O     | O     | O   | N             | Windows       | HLSL             |
189 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
190 | |`Metal <metal/readme.md>`__               | O*       | O*              | N   | N     | N     | N   | O             | Mac / iOS     | MSL              |
191 | |                                          |          |                 |     |       |       |     |               |               |                  |
192 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
193 | |`Vulkan <vulkan/readme.md>`__             | O        | O (avec         | N   | O     | O     | O   | N             | Tous          | Tout ce qui se   |
194 | |                                          |          | kompute)        |     |       |       |     |               | (réprouvé     | compile vers     |
195 | |                                          |          |                 |     |       |       |     |               | sur Mac)      | SPIR-V           |
196 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
197 | |`WebGPU <webgpu/readme.md>`__             | O        | O               | N   | O     | O     | O   | O             | Tous          | WGSL             |
198 | |                                          |          |                 |     |       |       |     |               |               |                  |
199 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
200 | |`CUDA <cuda/readme.md>`__                 | N        | O               | N   | O     | N     | N   | N             | Windows,      | CUDA             |
201 | |                                          |          |                 |     |       |       |     |               | Linux         |                  |
202 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
203 | |`OpenCL <opencl/readme.md>`__             | N        | O               | O   | O     | O     | O   | O             | Tous          | OpenCL C         |
204 | |                                          |          |                 |     |       |       |     |               | (réprouvé     |                  |
205 | |                                          |          |                 |     |       |       |     |               | sur Mac)      |                  |
206 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
207 | |`SYCL <sycl/readme.md>`__                 | N        | O*              | O   | O     | O     | O   | O             | Tous (CPU     | Extensions C++   |
208 | |                                          |          |                 |     |       |       |     |               | seulement sur |                  |
209 | |                                          |          |                 |     |       |       |     |               | Mac)          |                  |
210 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
211 | |`Triton <triton/readme.md>`__             | N        | O               | N   | O     | N     | O   | N             | Linux         | Fonctions Python |
212 | |                                          |          |                 |     |       |       |     |               |               | décorées         |
213 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
214 | | `OpenMP <openmp/readme.md>`__            | N        | O               | O   | O     | O     | O   | O             | Tous          | Directives de    |
215 | |                                          |          |                 |     |       |       |     |               |               | compilateur      |
216 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
217 | | `AcceleratedKernels.jl                   | N        | O               | O   | O     | O     | O   | O             | Tous          | Fonctions Julia  |
218 | | <AcceleratedKernels.jl/readme.md>`__     |          |                 |     |       |       |     |               |               |                  |
219 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
220 | |`CPU <cpu/readme.md>`__                   | N        | O               | O   | N     | N     | N   | N             | Tous          | N/A              |
221 | +------------------------------------------+----------+-----------------+-----+-------+-------+-----+---------------+---------------+------------------+
222 | 
223 | - Le signe ``*`` indique que l'exemple correspondant n'est pas inclus dans le projet.
224 | 
225 | 
226 | Autres cadriciels
227 | -----------------
228 | 
229 | Encore d'autres cadriciels sont disponibles pour programmer des GPU! La liste ci-dessous en présente
230 | quelques-uns. Aucun exemple n'est implémenté pour eux dans ce dépôt, mais vous pouvez suivre les
231 | liens pour en apprendre davantage.
232 | 
233 | - Bend (https://github.com/HigherOrderCO/Bend): un langage de programmation pour le calcul
234 |   parallèle.
235 | - Chapel (https://chapel-lang.org/gpu/): un autre langage de programmation pour le calcul
236 |   parallèle.
237 | - Mojo (https://www.modular.com/mojo): un langage pour calcul hétérogène.
238 | - oneAPI (https://www.intel.com/content/www/us/en/developer/tools/oneapi/overview.html): Une pile
239 |   logicielle pour le calcul haute performance par Intel. Basé sur SYCL, mais utilise aussi des
240 |   extensions spécifiques au projet pour implémenter de nouvelles fonctionnalités.
241 | - OpenACC (https://www.openacc.org/): un standard de calcul parallèle.
242 | - ROCm (https://www.amd.com/fr/products/software/rocm.html): Une pile logicielle pour le calcul de
243 |   haute performance par AMD. Supporte OpenCL, HIP, OpenMP.
244 | - Slang (https://www.khronos.org/news/press/khronos-group-launches-slang-initiative-hosting-open-source-compiler-contributed-by-nvidia):
245 |   un compilateur et langage de nuanceur qui cible plusieurs API.
246 | - rust-gpu (https://github.com/Rust-GPU/rust-gpu): un cadriciel en développement pour intégrer du
247 |   code destiné à un GPU dans du code Rust. Similaire à SYCL mais vise Rust au lieu du C++ et permet
248 |   de développer des applications graphiques. Pas encore prêt pour la production en juillet 2025.
249 | 
250 | 
251 | Projets
252 | -------
253 | 
254 | Quelques projets qui utilisent des GPU. N'hésitez pas à créer un PR pour en ajouter à la liste :
255 | 
256 | - Intelligence artificielle :
257 |     - burn (https://github.com/tracel-ai/burn) : Cadriciel d'apprentissage profond qui utilise
258 |       WebGPU pour améliorer la portabilité. Utilise aussi SPIR-V directement pour effectuer certaines
259 |       optimisations que WebGPU ne supporte pas.
260 |     - PyTorch (https://github.com/pytorch/pytorch) : Cadriciel d'apprentissage profond qui utilise
261 |       CUDA et ROCm.
262 |     - TensorFlow (https://github.com/tensorflow/tensorflow) :  Cadriciel d'apprentissage profond
263 |       qui utilise CUDA.
264 | - Graphisme :
265 |     - Godot Shaders (https://godotshaders.com/) : Un ensemble de nuanceurs qui peuvent être utilisés
266 |       avec le moteur de jeu Godot. Ils utilisent un langage de nuanceur proche de GLSL.
267 | - Physique :
268 |     - FluidX3D (https://github.com/ProjectPhysX/FluidX3D): Programme de dynamique des fluides
269 |       réalisé avec OpenCL.
270 |     - gpu-io (https://github.com/amandaghassaei/gpu-io): Bibliothèque de simulation physique
271 |       utilisable dans un navigateur Web. Réalisée avec WebGL.
272 |     - PixelFlow (https://github.com/diwi/PixelFlow): Cadriciel de simulation physique réalisé avec
273 |       Java et OpenGL.
274 | - Bioinformatique :
275 |     - genome-spy (https://github.com/genome-spy/genome-spy) : Outil d'analyse génomique réalisé avec
276 |       WebGL.
277 |     - GenomeWorks (https://github.com/NVIDIA-Genomics-Research/GenomeWorks) : Analyse et alignement
278 |       d'ADN avec CUDA.
279 | - Cryptographie :
280 |     - hashcat (https://github.com/hashcat/hashcat) : Programme de récupération de mots de passe
281 |       réalisé avec OpenMP, CUDA et OpenCL.
282 | 
283 | 
284 | Ressources additionnelles
285 | -------------------------
286 | 
287 | - Guide d'optimisation de programme pour GPU (CUDA) :
288 |   https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
289 | - Introduction à la programmation sur CUDA et OpenCL : https://parlab.eecs.berkeley.edu/sites/all/parlab/files/CatanzaroIntroToCUDAOpenCL_0.pdf.
290 |   Consultez la diapositive 27 pour une comparaison des lexiques utilisés par chaque cadre logiciel.
291 | 
292 | 
293 | Comparaisons
294 | -------------
295 | 
296 | Exécutez le script ``benchmark.py`` pour comparer les performances d'un programme utilisant
297 | plusieurs fils d'exécution sur CPU:
298 | 
299 | .. code:: bash
300 | 
301 |    # Linux
302 |    python3 benchmark.py
303 | 
304 |    # OS that begins with the letter W
305 |    py benchmark.py
306 | 


--------------------------------------------------------------------------------
/vulkan/graphics/main.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is taken from
  3 |  * https://github.com/Overv/VulkanTutorial/blob/main/code/15_hello_triangle.cpp,
  4 |  * from the repository `VulkanTutorial`, available at
  5 |  * https://github.com/Overv/VulkanTutorial/tree/main and licensed under the
  6 |  * licenses CC0-1.0 and CC-BY-SA-4.0.
  7 |  */
  8 | 
  9 | #define GLFW_INCLUDE_VULKAN
 10 | #include <GLFW/glfw3.h>
 11 | 
 12 | #include <iostream>
 13 | #include <fstream>
 14 | #include <stdexcept>
 15 | #include <algorithm>
 16 | #include <vector>
 17 | #include <cstring>
 18 | #include <cstdlib>
 19 | #include <cstdint>
 20 | #include <limits>
 21 | #include <optional>
 22 | #include <set>
 23 | 
 24 | const uint32_t WIDTH = 800;  // Window width passed to glfwCreateWindow() in initWindow().
 25 | const uint32_t HEIGHT = 600;  // Window height passed to glfwCreateWindow() in initWindow().
 26 | 
 27 | const int MAX_FRAMES_IN_FLIGHT = 2;  // NOTE(review): not referenced in the part of the file reviewed here; the class holds a single command buffer, semaphore pair and fence.
 28 | 
 29 | const std::vector<const char*> validationLayers = {  // Layer names enabled on the instance and the device when enableValidationLayers is true.
 30 |     "VK_LAYER_KHRONOS_validation"
 31 | };
 32 | 
 33 | const std::vector<const char*> deviceExtensions = {  // Device extensions requested in createLogicalDevice().
 34 |     VK_KHR_SWAPCHAIN_EXTENSION_NAME
 35 | };
 36 | 
 37 | #ifdef NDEBUG
 38 | const bool enableValidationLayers = false;  // NDEBUG defined: validation layers and the debug messenger are skipped.
 39 | #else
 40 | const bool enableValidationLayers = true;  // NDEBUG not defined: validation layers and the debug messenger are set up.
 41 | #endif
 42 | 
 43 | VkResult CreateDebugUtilsMessengerEXT(VkInstance instance, const VkDebugUtilsMessengerCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugUtilsMessengerEXT* pDebugMessenger) {  // Proxy for vkCreateDebugUtilsMessengerEXT: resolves the function at run time and forwards all arguments to it.
 44 |     auto func = (PFN_vkCreateDebugUtilsMessengerEXT) vkGetInstanceProcAddr(instance, "vkCreateDebugUtilsMessengerEXT");  // Look the entry point up by name on this instance.
 45 |     if (func != nullptr) {
 46 |         return func(instance, pCreateInfo, pAllocator, pDebugMessenger);  // Return whatever the real function reports.
 47 |     } else {
 48 |         return VK_ERROR_EXTENSION_NOT_PRESENT;  // Lookup returned nullptr: report the extension as missing.
 49 |     }
 50 | }
 51 | 
 52 | void DestroyDebugUtilsMessengerEXT(VkInstance instance, VkDebugUtilsMessengerEXT debugMessenger, const VkAllocationCallbacks* pAllocator) {  // Proxy for vkDestroyDebugUtilsMessengerEXT: resolves the function at run time and forwards all arguments to it.
 53 |     auto func = (PFN_vkDestroyDebugUtilsMessengerEXT) vkGetInstanceProcAddr(instance, "vkDestroyDebugUtilsMessengerEXT");  // Look the entry point up by name on this instance.
 54 |     if (func != nullptr) {  // If the lookup returned nullptr, nothing is called and no error is reported.
 55 |         func(instance, debugMessenger, pAllocator);
 56 |     }
 57 | }
 58 | 
 59 | struct QueueFamilyIndices {  // Queue family indices used by the application; each one is empty until assigned.
 60 |     std::optional<uint32_t> graphicsFamily;  // Family used to obtain graphicsQueue in createLogicalDevice().
 61 |     std::optional<uint32_t> presentFamily;  // Family used to obtain presentQueue in createLogicalDevice().
 62 | 
 63 |     bool isComplete() {  // True only when both indices hold a value.
 64 |         return graphicsFamily.has_value() && presentFamily.has_value();
 65 |     }
 66 | };
 67 | 
 68 | struct SwapChainSupportDetails {  // Bundle returned by querySwapChainSupport() and consumed by createSwapChain().
 69 |     VkSurfaceCapabilitiesKHR capabilities;  // Read for minImageCount, maxImageCount and currentTransform, and passed to chooseSwapExtent().
 70 |     std::vector<VkSurfaceFormatKHR> formats;  // Candidate list passed to chooseSwapSurfaceFormat().
 71 |     std::vector<VkPresentModeKHR> presentModes;  // Candidate list passed to chooseSwapPresentMode().
 72 | };
 73 | 
 74 | class HelloTriangleApplication {
 75 | public:
 76 |     void run() {  // Entry point: runs the four application phases in order.
 77 |         initWindow();  // Initialise GLFW and create the window.
 78 |         initVulkan();  // Create every Vulkan object the application uses.
 79 |         mainLoop();  // Poll events and draw until the window is asked to close.
 80 |         cleanup();  // Destroy the Vulkan objects, the window, and shut GLFW down.
 81 |     }
 82 | 
 83 | private:
 84 |     GLFWwindow* window;
 85 | 
 86 |     VkInstance instance;
 87 |     VkDebugUtilsMessengerEXT debugMessenger;
 88 |     VkSurfaceKHR surface;
 89 | 
 90 |     VkPhysicalDevice physicalDevice = VK_NULL_HANDLE;
 91 |     VkDevice device;
 92 | 
 93 |     VkQueue graphicsQueue;
 94 |     VkQueue presentQueue;
 95 | 
 96 |     VkSwapchainKHR swapChain;
 97 |     std::vector<VkImage> swapChainImages;
 98 |     VkFormat swapChainImageFormat;
 99 |     VkExtent2D swapChainExtent;
100 |     std::vector<VkImageView> swapChainImageViews;
101 |     std::vector<VkFramebuffer> swapChainFramebuffers;
102 | 
103 |     VkRenderPass renderPass;
104 |     VkPipelineLayout pipelineLayout;
105 |     VkPipeline graphicsPipeline;
106 | 
107 |     VkCommandPool commandPool;
108 |     VkCommandBuffer commandBuffer;
109 | 
110 |     VkSemaphore imageAvailableSemaphore;
111 |     VkSemaphore renderFinishedSemaphore;
112 |     VkFence inFlightFence;
113 | 
114 |     void initWindow() {  // Initialises GLFW and creates the WIDTH x HEIGHT window titled "Vulkan".
115 |         glfwInit();  // NOTE(review): the return value is not checked.
116 | 
117 |         glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API);  // Ask GLFW not to create a client API (OpenGL) context; rendering goes through Vulkan.
118 |         glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE);  // Non-resizable window.
119 | 
120 |         window = glfwCreateWindow(WIDTH, HEIGHT, "Vulkan", nullptr, nullptr);  // NOTE(review): the result is stored without a null check.
121 |     }
122 | 
123 |     void initVulkan() {  // Creates all Vulkan objects; later steps read members written by earlier ones, so the order matters.
124 |         createInstance();  // Writes `instance`.
125 |         setupDebugMessenger();  // Uses `instance`; no-op when validation layers are disabled.
126 |         createSurface();  // Uses `instance` and `window`; writes `surface`.
127 |         pickPhysicalDevice();  // Uses `instance`; writes `physicalDevice`.
128 |         createLogicalDevice();  // Uses `physicalDevice`; writes `device`, `graphicsQueue`, `presentQueue`.
129 |         createSwapChain();  // Uses `device` and `surface`; writes the swap chain, its images, format and extent.
130 |         createImageViews();  // One image view per swap chain image.
131 |         createRenderPass();
132 |         createGraphicsPipeline();
133 |         createFramebuffers();
134 |         createCommandPool();
135 |         createCommandBuffer();
136 |         createSyncObjects();
137 |     }
138 | 
139 |     void mainLoop() {  // Runs the render loop until GLFW reports that the window should close.
140 |         while (!glfwWindowShouldClose(window)) {
141 |             glfwPollEvents();  // Process pending window events.
142 |             drawFrame();  // Render one frame (defined elsewhere in the class).
143 |         }
144 | 
145 |         vkDeviceWaitIdle(device);  // Wait for the device to go idle before run() proceeds to cleanup().
146 |     }
147 | 
148 |     void cleanup() {  // Destroys everything created by initVulkan() and initWindow(): device-level objects, then the device, then instance-level objects, then GLFW.
149 |         vkDestroySemaphore(device, renderFinishedSemaphore, nullptr);
150 |         vkDestroySemaphore(device, imageAvailableSemaphore, nullptr);
151 |         vkDestroyFence(device, inFlightFence, nullptr);
152 | 
153 |         vkDestroyCommandPool(device, commandPool, nullptr);  // `commandBuffer` has no destroy call of its own in this function.
154 | 
155 |         for (auto framebuffer : swapChainFramebuffers) {
156 |             vkDestroyFramebuffer(device, framebuffer, nullptr);
157 |         }
158 | 
159 |         vkDestroyPipeline(device, graphicsPipeline, nullptr);
160 |         vkDestroyPipelineLayout(device, pipelineLayout, nullptr);
161 |         vkDestroyRenderPass(device, renderPass, nullptr);
162 | 
163 |         for (auto imageView : swapChainImageViews) {
164 |             vkDestroyImageView(device, imageView, nullptr);
165 |         }
166 | 
167 |         vkDestroySwapchainKHR(device, swapChain, nullptr);  // The swap chain images themselves are not destroyed individually here.
168 |         vkDestroyDevice(device, nullptr);  // All device-level objects above are destroyed before the device itself.
169 | 
170 |         if (enableValidationLayers) {  // Mirrors setupDebugMessenger(), which creates the messenger only in this case.
171 |             DestroyDebugUtilsMessengerEXT(instance, debugMessenger, nullptr);
172 |         }
173 | 
174 |         vkDestroySurfaceKHR(instance, surface, nullptr);
175 |         vkDestroyInstance(instance, nullptr);  // The instance goes last among the Vulkan objects.
176 | 
177 |         glfwDestroyWindow(window);
178 | 
179 |         glfwTerminate();
180 |     }
181 | 
182 |     void createInstance() {  // Creates the Vulkan instance; throws std::runtime_error if validation layers are requested but unavailable, or if vkCreateInstance fails.
183 |         if (enableValidationLayers && !checkValidationLayerSupport()) {
184 |             throw std::runtime_error("validation layers requested, but not available!");
185 |         }
186 | 
187 |         VkApplicationInfo appInfo{};  // Value-initialised, so fields not set below are zero.
188 |         appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
189 |         appInfo.pApplicationName = "Hello Triangle";
190 |         appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
191 |         appInfo.pEngineName = "No Engine";
192 |         appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
193 |         appInfo.apiVersion = VK_API_VERSION_1_0;
194 | 
195 |         VkInstanceCreateInfo createInfo{};
196 |         createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
197 |         createInfo.pApplicationInfo = &appInfo;
198 | 
199 |         auto extensions = getRequiredExtensions();  // Instance extension names (helper defined elsewhere); the vector must stay alive until vkCreateInstance below.
200 |         createInfo.enabledExtensionCount = static_cast<uint32_t>(extensions.size());
201 |         createInfo.ppEnabledExtensionNames = extensions.data();
202 | 
203 |         VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo{};  // Declared outside the `if` so the pNext pointer set below is still valid when vkCreateInstance runs.
204 |         if (enableValidationLayers) {
205 |             createInfo.enabledLayerCount = static_cast<uint32_t>(validationLayers.size());
206 |             createInfo.ppEnabledLayerNames = validationLayers.data();
207 | 
208 |             populateDebugMessengerCreateInfo(debugCreateInfo);
209 |             createInfo.pNext = (VkDebugUtilsMessengerCreateInfoEXT*) &debugCreateInfo;  // Chain the messenger configuration so vkCreateInstance receives it as well.
210 |         } else {
211 |             createInfo.enabledLayerCount = 0;
212 | 
213 |             createInfo.pNext = nullptr;
214 |         }
215 | 
216 |         if (vkCreateInstance(&createInfo, nullptr, &instance) != VK_SUCCESS) {
217 |             throw std::runtime_error("failed to create instance!");
218 |         }
219 |     }
220 | 
221 |     void populateDebugMessengerCreateInfo(VkDebugUtilsMessengerCreateInfoEXT& createInfo) {  // Fills `createInfo` with the debug messenger settings shared by createInstance() and setupDebugMessenger().
222 |         createInfo = {};  // Reset every field first, so callers may pass an uninitialised struct.
223 |         createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT;
224 |         createInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT;  // Verbose, warning and error; the INFO severity bit is not set.
225 |         createInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT;  // General, validation and performance messages.
226 |         createInfo.pfnUserCallback = debugCallback;  // Callback defined elsewhere in the class.
227 |     }
228 | 
229 |     void setupDebugMessenger() {  // Creates `debugMessenger` when validation layers are enabled; throws std::runtime_error on failure.
230 |         if (!enableValidationLayers) return;  // Nothing to set up without validation layers.
231 | 
232 |         VkDebugUtilsMessengerCreateInfoEXT createInfo;  // Left uninitialised here; populateDebugMessengerCreateInfo() resets it before setting fields.
233 |         populateDebugMessengerCreateInfo(createInfo);
234 | 
235 |         if (CreateDebugUtilsMessengerEXT(instance, &createInfo, nullptr, &debugMessenger) != VK_SUCCESS) {  // Also fails when the extension function cannot be resolved.
236 |             throw std::runtime_error("failed to set up debug messenger!");
237 |         }
238 |     }
239 | 
240 |     void createSurface() {  // Creates `surface` for `window` through GLFW; throws std::runtime_error on failure.
241 |         if (glfwCreateWindowSurface(instance, window, nullptr, &surface) != VK_SUCCESS) {
242 |             throw std::runtime_error("failed to create window surface!");
243 |         }
244 |     }
245 | 
246 |     void pickPhysicalDevice() {  // Selects the first physical device accepted by isDeviceSuitable(); throws std::runtime_error if none is found.
247 |         uint32_t deviceCount = 0;
248 |         vkEnumeratePhysicalDevices(instance, &deviceCount, nullptr);  // First call: null output array, only the count is retrieved.
249 | 
250 |         if (deviceCount == 0) {
251 |             throw std::runtime_error("failed to find GPUs with Vulkan support!");
252 |         }
253 | 
254 |         std::vector<VkPhysicalDevice> devices(deviceCount);
255 |         vkEnumeratePhysicalDevices(instance, &deviceCount, devices.data());  // Second call: fill the array with the device handles.
256 | 
257 |         for (const auto& device : devices) {  // Note: this loop variable shadows the `device` (VkDevice) member inside the loop.
258 |             if (isDeviceSuitable(device)) {
259 |                 physicalDevice = device;
260 |                 break;  // Stop at the first suitable device; later candidates are not examined.
261 |             }
262 |         }
263 | 
264 |         if (physicalDevice == VK_NULL_HANDLE) {  // Still at its initial value: no device passed the check.
265 |             throw std::runtime_error("failed to find a suitable GPU!");
266 |         }
267 |     }
268 | 
269 |     void createLogicalDevice() {  // Creates `device` with one queue per distinct queue family and fetches the graphics and present queue handles; throws std::runtime_error on failure.
270 |         QueueFamilyIndices indices = findQueueFamilies(physicalDevice);
271 | 
272 |         std::vector<VkDeviceQueueCreateInfo> queueCreateInfos;
273 |         std::set<uint32_t> uniqueQueueFamilies = {indices.graphicsFamily.value(), indices.presentFamily.value()};  // The set collapses the two indices into one entry when they are the same family.
274 | 
275 |         float queuePriority = 1.0f;  // Shared by every queue create info below through a pointer, so it must outlive vkCreateDevice.
276 |         for (uint32_t queueFamily : uniqueQueueFamilies) {
277 |             VkDeviceQueueCreateInfo queueCreateInfo{};
278 |             queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
279 |             queueCreateInfo.queueFamilyIndex = queueFamily;
280 |             queueCreateInfo.queueCount = 1;  // A single queue per family.
281 |             queueCreateInfo.pQueuePriorities = &queuePriority;
282 |             queueCreateInfos.push_back(queueCreateInfo);
283 |         }
284 | 
285 |         VkPhysicalDeviceFeatures deviceFeatures{};  // Value-initialised: every feature flag is left at zero.
286 | 
287 |         VkDeviceCreateInfo createInfo{};
288 |         createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
289 | 
290 |         createInfo.queueCreateInfoCount = static_cast<uint32_t>(queueCreateInfos.size());
291 |         createInfo.pQueueCreateInfos = queueCreateInfos.data();
292 | 
293 |         createInfo.pEnabledFeatures = &deviceFeatures;
294 | 
295 |         createInfo.enabledExtensionCount = static_cast<uint32_t>(deviceExtensions.size());
296 |         createInfo.ppEnabledExtensionNames = deviceExtensions.data();
297 | 
298 |         if (enableValidationLayers) {  // NOTE(review): the Vulkan specification describes device-level layer fields as deprecated and ignored; presumably kept for older implementations - confirm.
299 |             createInfo.enabledLayerCount = static_cast<uint32_t>(validationLayers.size());
300 |             createInfo.ppEnabledLayerNames = validationLayers.data();
301 |         } else {
302 |             createInfo.enabledLayerCount = 0;
303 |         }
304 | 
305 |         if (vkCreateDevice(physicalDevice, &createInfo, nullptr, &device) != VK_SUCCESS) {
306 |             throw std::runtime_error("failed to create logical device!");
307 |         }
308 | 
309 |         vkGetDeviceQueue(device, indices.graphicsFamily.value(), 0, &graphicsQueue);  // Queue index 0: only one queue was requested per family.
310 |         vkGetDeviceQueue(device, indices.presentFamily.value(), 0, &presentQueue);
311 |     }
312 | 
313 |     void createSwapChain() {  // Creates `swapChain`, retrieves its images and records the chosen format and extent; throws std::runtime_error on failure.
314 |         SwapChainSupportDetails swapChainSupport = querySwapChainSupport(physicalDevice);
315 | 
316 |         VkSurfaceFormatKHR surfaceFormat = chooseSwapSurfaceFormat(swapChainSupport.formats);
317 |         VkPresentModeKHR presentMode = chooseSwapPresentMode(swapChainSupport.presentModes);
318 |         VkExtent2D extent = chooseSwapExtent(swapChainSupport.capabilities);
319 | 
320 |         uint32_t imageCount = swapChainSupport.capabilities.minImageCount + 1;  // Request one image more than the reported minimum.
321 |         if (swapChainSupport.capabilities.maxImageCount > 0 && imageCount > swapChainSupport.capabilities.maxImageCount) {  // Clamp only when maxImageCount is non-zero; zero is treated as "no upper limit".
322 |             imageCount = swapChainSupport.capabilities.maxImageCount;
323 |         }
324 | 
325 |         VkSwapchainCreateInfoKHR createInfo{};
326 |         createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR;
327 |         createInfo.surface = surface;
328 | 
329 |         createInfo.minImageCount = imageCount;
330 |         createInfo.imageFormat = surfaceFormat.format;
331 |         createInfo.imageColorSpace = surfaceFormat.colorSpace;
332 |         createInfo.imageExtent = extent;
333 |         createInfo.imageArrayLayers = 1;
334 |         createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;  // Images are used as color attachments.
335 | 
336 |         QueueFamilyIndices indices = findQueueFamilies(physicalDevice);
337 |         uint32_t queueFamilyIndices[] = {indices.graphicsFamily.value(), indices.presentFamily.value()};
338 | 
339 |         if (indices.graphicsFamily != indices.presentFamily) {  // Different families: share the images between both.
340 |             createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT;
341 |             createInfo.queueFamilyIndexCount = 2;
342 |             createInfo.pQueueFamilyIndices = queueFamilyIndices;
343 |         } else {
344 |             createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE;  // Same family: the index count and pointer stay zero from value-initialisation.
345 |         }
346 | 
347 |         createInfo.preTransform = swapChainSupport.capabilities.currentTransform;  // Keep the surface's current transform.
348 |         createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR;
349 |         createInfo.presentMode = presentMode;
350 |         createInfo.clipped = VK_TRUE;
351 | 
352 |         createInfo.oldSwapchain = VK_NULL_HANDLE;  // No previous swap chain is being replaced.
353 | 
354 |         if (vkCreateSwapchainKHR(device, &createInfo, nullptr, &swapChain) != VK_SUCCESS) {
355 |             throw std::runtime_error("failed to create swap chain!");
356 |         }
357 | 
358 |         vkGetSwapchainImagesKHR(device, swapChain, &imageCount, nullptr);  // Re-query the count: only a minimum was requested above.
359 |         swapChainImages.resize(imageCount);
360 |         vkGetSwapchainImagesKHR(device, swapChain, &imageCount, swapChainImages.data());
361 | 
362 |         swapChainImageFormat = surfaceFormat.format;  // Stored for createImageViews() and createRenderPass().
363 |         swapChainExtent = extent;
364 |     }
365 | 
366 |     void createImageViews() {
367 |         swapChainImageViews.resize(swapChainImages.size());
368 | 
369 |         for (size_t i = 0; i < swapChainImages.size(); i++) {
370 |             VkImageViewCreateInfo createInfo{};
371 |             createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
372 |             createInfo.image = swapChainImages[i];
373 |             createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D;
374 |             createInfo.format = swapChainImageFormat;
375 |             createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
376 |             createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
377 |             createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
378 |             createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
379 |             createInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
380 |             createInfo.subresourceRange.baseMipLevel = 0;
381 |             createInfo.subresourceRange.levelCount = 1;
382 |             createInfo.subresourceRange.baseArrayLayer = 0;
383 |             createInfo.subresourceRange.layerCount = 1;
384 | 
385 |             if (vkCreateImageView(device, &createInfo, nullptr, &swapChainImageViews[i]) != VK_SUCCESS) {
386 |                 throw std::runtime_error("failed to create image views!");
387 |             }
388 |         }
389 |     }
390 | 
391 |     void createRenderPass() {
392 |         VkAttachmentDescription colorAttachment{};
393 |         colorAttachment.format = swapChainImageFormat;
394 |         colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT;
395 |         colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR;
396 |         colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE;
397 |         colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
398 |         colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE;
399 |         colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
400 |         colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR;
401 | 
402 |         VkAttachmentReference colorAttachmentRef{};
403 |         colorAttachmentRef.attachment = 0;
404 |         colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
405 | 
406 |         VkSubpassDescription subpass{};
407 |         subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS;
408 |         subpass.colorAttachmentCount = 1;
409 |         subpass.pColorAttachments = &colorAttachmentRef;
410 | 
411 |         VkSubpassDependency dependency{};
412 |         dependency.srcSubpass = VK_SUBPASS_EXTERNAL;
413 |         dependency.dstSubpass = 0;
414 |         dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
415 |         dependency.srcAccessMask = 0;
416 |         dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
417 |         dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
418 | 
419 |         VkRenderPassCreateInfo renderPassInfo{};
420 |         renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO;
421 |         renderPassInfo.attachmentCount = 1;
422 |         renderPassInfo.pAttachments = &colorAttachment;
423 |         renderPassInfo.subpassCount = 1;
424 |         renderPassInfo.pSubpasses = &subpass;
425 |         renderPassInfo.dependencyCount = 1;
426 |         renderPassInfo.pDependencies = &dependency;
427 | 
428 |         if (vkCreateRenderPass(device, &renderPassInfo, nullptr, &renderPass) != VK_SUCCESS) {
429 |             throw std::runtime_error("failed to create render pass!");
430 |         }
431 |     }
432 | 
433 |     void createGraphicsPipeline() {
434 |         auto vertShaderCode = readFile("vertices.spv");
435 |         auto fragShaderCode = readFile("fragment.spv");
436 | 
437 |         VkShaderModule vertShaderModule = createShaderModule(vertShaderCode);
438 |         VkShaderModule fragShaderModule = createShaderModule(fragShaderCode);
439 | 
440 |         VkPipelineShaderStageCreateInfo vertShaderStageInfo{};
441 |         vertShaderStageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
442 |         vertShaderStageInfo.stage = VK_SHADER_STAGE_VERTEX_BIT;
443 |         vertShaderStageInfo.module = vertShaderModule;
444 |         vertShaderStageInfo.pName = "main";
445 | 
446 |         VkPipelineShaderStageCreateInfo fragShaderStageInfo{};
447 |         fragShaderStageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
448 |         fragShaderStageInfo.stage = VK_SHADER_STAGE_FRAGMENT_BIT;
449 |         fragShaderStageInfo.module = fragShaderModule;
450 |         fragShaderStageInfo.pName = "main";
451 | 
452 |         VkPipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo};
453 | 
454 |         VkPipelineVertexInputStateCreateInfo vertexInputInfo{};
455 |         vertexInputInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO;
456 |         vertexInputInfo.vertexBindingDescriptionCount = 0;
457 |         vertexInputInfo.vertexAttributeDescriptionCount = 0;
458 | 
459 |         VkPipelineInputAssemblyStateCreateInfo inputAssembly{};
460 |         inputAssembly.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
461 |         inputAssembly.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
462 |         inputAssembly.primitiveRestartEnable = VK_FALSE;
463 | 
464 |         VkPipelineViewportStateCreateInfo viewportState{};
465 |         viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO;
466 |         viewportState.viewportCount = 1;
467 |         viewportState.scissorCount = 1;
468 | 
469 |         VkPipelineRasterizationStateCreateInfo rasterizer{};
470 |         rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO;
471 |         rasterizer.depthClampEnable = VK_FALSE;
472 |         rasterizer.rasterizerDiscardEnable = VK_FALSE;
473 |         rasterizer.polygonMode = VK_POLYGON_MODE_FILL;
474 |         rasterizer.lineWidth = 1.0f;
475 |         rasterizer.cullMode = VK_CULL_MODE_BACK_BIT;
476 |         rasterizer.frontFace = VK_FRONT_FACE_CLOCKWISE;
477 |         rasterizer.depthBiasEnable = VK_FALSE;
478 | 
479 |         VkPipelineMultisampleStateCreateInfo multisampling{};
480 |         multisampling.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO;
481 |         multisampling.sampleShadingEnable = VK_FALSE;
482 |         multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT;
483 | 
484 |         VkPipelineColorBlendAttachmentState colorBlendAttachment{};
485 |         colorBlendAttachment.colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT;
486 |         colorBlendAttachment.blendEnable = VK_FALSE;
487 | 
488 |         VkPipelineColorBlendStateCreateInfo colorBlending{};
489 |         colorBlending.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO;
490 |         colorBlending.logicOpEnable = VK_FALSE;
491 |         colorBlending.logicOp = VK_LOGIC_OP_COPY;
492 |         colorBlending.attachmentCount = 1;
493 |         colorBlending.pAttachments = &colorBlendAttachment;
494 |         colorBlending.blendConstants[0] = 0.0f;
495 |         colorBlending.blendConstants[1] = 0.0f;
496 |         colorBlending.blendConstants[2] = 0.0f;
497 |         colorBlending.blendConstants[3] = 0.0f;
498 | 
499 |         std::vector<VkDynamicState> dynamicStates = {
500 |             VK_DYNAMIC_STATE_VIEWPORT,
501 |             VK_DYNAMIC_STATE_SCISSOR
502 |         };
503 |         VkPipelineDynamicStateCreateInfo dynamicState{};
504 |         dynamicState.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO;
505 |         dynamicState.dynamicStateCount = static_cast<uint32_t>(dynamicStates.size());
506 |         dynamicState.pDynamicStates = dynamicStates.data();
507 | 
508 |         VkPipelineLayoutCreateInfo pipelineLayoutInfo{};
509 |         pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
510 |         pipelineLayoutInfo.setLayoutCount = 0;
511 |         pipelineLayoutInfo.pushConstantRangeCount = 0;
512 | 
513 |         if (vkCreatePipelineLayout(device, &pipelineLayoutInfo, nullptr, &pipelineLayout) != VK_SUCCESS) {
514 |             throw std::runtime_error("failed to create pipeline layout!");
515 |         }
516 | 
517 |         VkGraphicsPipelineCreateInfo pipelineInfo{};
518 |         pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO;
519 |         pipelineInfo.stageCount = 2;
520 |         pipelineInfo.pStages = shaderStages;
521 |         pipelineInfo.pVertexInputState = &vertexInputInfo;
522 |         pipelineInfo.pInputAssemblyState = &inputAssembly;
523 |         pipelineInfo.pViewportState = &viewportState;
524 |         pipelineInfo.pRasterizationState = &rasterizer;
525 |         pipelineInfo.pMultisampleState = &multisampling;
526 |         pipelineInfo.pColorBlendState = &colorBlending;
527 |         pipelineInfo.pDynamicState = &dynamicState;
528 |         pipelineInfo.layout = pipelineLayout;
529 |         pipelineInfo.renderPass = renderPass;
530 |         pipelineInfo.subpass = 0;
531 |         pipelineInfo.basePipelineHandle = VK_NULL_HANDLE;
532 | 
533 |         if (vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &pipelineInfo, nullptr, &graphicsPipeline) != VK_SUCCESS) {
534 |             throw std::runtime_error("failed to create graphics pipeline!");
535 |         }
536 | 
537 |         vkDestroyShaderModule(device, fragShaderModule, nullptr);
538 |         vkDestroyShaderModule(device, vertShaderModule, nullptr);
539 |     }
540 | 
541 |     void createFramebuffers() {
542 |         swapChainFramebuffers.resize(swapChainImageViews.size());
543 | 
544 |         for (size_t i = 0; i < swapChainImageViews.size(); i++) {
545 |             VkImageView attachments[] = {
546 |                 swapChainImageViews[i]
547 |             };
548 | 
549 |             VkFramebufferCreateInfo framebufferInfo{};
550 |             framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO;
551 |             framebufferInfo.renderPass = renderPass;
552 |             framebufferInfo.attachmentCount = 1;
553 |             framebufferInfo.pAttachments = attachments;
554 |             framebufferInfo.width = swapChainExtent.width;
555 |             framebufferInfo.height = swapChainExtent.height;
556 |             framebufferInfo.layers = 1;
557 | 
558 |             if (vkCreateFramebuffer(device, &framebufferInfo, nullptr, &swapChainFramebuffers[i]) != VK_SUCCESS) {
559 |                 throw std::runtime_error("failed to create framebuffer!");
560 |             }
561 |         }
562 |     }
563 | 
564 |     void createCommandPool() {
565 |         QueueFamilyIndices queueFamilyIndices = findQueueFamilies(physicalDevice);
566 | 
567 |         VkCommandPoolCreateInfo poolInfo{};
568 |         poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
569 |         poolInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
570 |         poolInfo.queueFamilyIndex = queueFamilyIndices.graphicsFamily.value();
571 | 
572 |         if (vkCreateCommandPool(device, &poolInfo, nullptr, &commandPool) != VK_SUCCESS) {
573 |             throw std::runtime_error("failed to create command pool!");
574 |         }
575 |     }
576 | 
577 |     void createCommandBuffer() {
578 |         VkCommandBufferAllocateInfo allocInfo{};
579 |         allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
580 |         allocInfo.commandPool = commandPool;
581 |         allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
582 |         allocInfo.commandBufferCount = 1;
583 | 
584 |         if (vkAllocateCommandBuffers(device, &allocInfo, &commandBuffer) != VK_SUCCESS) {
585 |             throw std::runtime_error("failed to allocate command buffers!");
586 |         }
587 |     }
588 | 
589 |     void recordCommandBuffer(VkCommandBuffer commandBuffer, uint32_t imageIndex) {
590 |         VkCommandBufferBeginInfo beginInfo{};
591 |         beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
592 | 
593 |         if (vkBeginCommandBuffer(commandBuffer, &beginInfo) != VK_SUCCESS) {
594 |             throw std::runtime_error("failed to begin recording command buffer!");
595 |         }
596 | 
597 |         VkRenderPassBeginInfo renderPassInfo{};
598 |         renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
599 |         renderPassInfo.renderPass = renderPass;
600 |         renderPassInfo.framebuffer = swapChainFramebuffers[imageIndex];
601 |         renderPassInfo.renderArea.offset = {0, 0};
602 |         renderPassInfo.renderArea.extent = swapChainExtent;
603 | 
604 |         VkClearValue clearColor = {{{0.0f, 0.0f, 0.0f, 1.0f}}};
605 |         renderPassInfo.clearValueCount = 1;
606 |         renderPassInfo.pClearValues = &clearColor;
607 | 
608 |         vkCmdBeginRenderPass(commandBuffer, &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE);
609 | 
610 |         vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, graphicsPipeline);
611 | 
612 |         VkViewport viewport{};
613 |         viewport.x = 0.0f;
614 |         viewport.y = 0.0f;
615 |         viewport.width = static_cast<float>(swapChainExtent.width);
616 |         viewport.height = static_cast<float>(swapChainExtent.height);
617 |         viewport.minDepth = 0.0f;
618 |         viewport.maxDepth = 1.0f;
619 |         vkCmdSetViewport(commandBuffer, 0, 1, &viewport);
620 | 
621 |         VkRect2D scissor{};
622 |         scissor.offset = {0, 0};
623 |         scissor.extent = swapChainExtent;
624 |         vkCmdSetScissor(commandBuffer, 0, 1, &scissor);
625 | 
626 |         vkCmdDraw(commandBuffer, 3, 1, 0, 0);
627 | 
628 |         vkCmdEndRenderPass(commandBuffer);
629 | 
630 |         if (vkEndCommandBuffer(commandBuffer) != VK_SUCCESS) {
631 |             throw std::runtime_error("failed to record command buffer!");
632 |         }
633 |     }
634 | 
635 |     void createSyncObjects() {
636 |         VkSemaphoreCreateInfo semaphoreInfo{};
637 |         semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
638 | 
639 |         VkFenceCreateInfo fenceInfo{};
640 |         fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
641 |         fenceInfo.flags = VK_FENCE_CREATE_SIGNALED_BIT;
642 | 
643 |         if (vkCreateSemaphore(device, &semaphoreInfo, nullptr, &imageAvailableSemaphore) != VK_SUCCESS ||
644 |             vkCreateSemaphore(device, &semaphoreInfo, nullptr, &renderFinishedSemaphore) != VK_SUCCESS ||
645 |             vkCreateFence(device, &fenceInfo, nullptr, &inFlightFence) != VK_SUCCESS) {
646 |             throw std::runtime_error("failed to create synchronization objects for a frame!");
647 |         }
648 | 
649 |     }
650 | 
651 |     void drawFrame() {
652 |         vkWaitForFences(device, 1, &inFlightFence, VK_TRUE, UINT64_MAX);
653 |         vkResetFences(device, 1, &inFlightFence);
654 | 
655 |         uint32_t imageIndex;
656 |         vkAcquireNextImageKHR(device, swapChain, UINT64_MAX, imageAvailableSemaphore, VK_NULL_HANDLE, &imageIndex);
657 | 
658 |         vkResetCommandBuffer(commandBuffer, /*VkCommandBufferResetFlagBits*/ 0);
659 |         recordCommandBuffer(commandBuffer, imageIndex);
660 | 
661 |         VkSubmitInfo submitInfo{};
662 |         submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
663 | 
664 |         VkSemaphore waitSemaphores[] = {imageAvailableSemaphore};
665 |         VkPipelineStageFlags waitStages[] = {VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT};
666 |         submitInfo.waitSemaphoreCount = 1;
667 |         submitInfo.pWaitSemaphores = waitSemaphores;
668 |         submitInfo.pWaitDstStageMask = waitStages;
669 | 
670 |         submitInfo.commandBufferCount = 1;
671 |         submitInfo.pCommandBuffers = &commandBuffer;
672 | 
673 |         VkSemaphore signalSemaphores[] = {renderFinishedSemaphore};
674 |         submitInfo.signalSemaphoreCount = 1;
675 |         submitInfo.pSignalSemaphores = signalSemaphores;
676 | 
677 |         if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, inFlightFence) != VK_SUCCESS) {
678 |             throw std::runtime_error("failed to submit draw command buffer!");
679 |         }
680 | 
681 |         VkPresentInfoKHR presentInfo{};
682 |         presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
683 | 
684 |         presentInfo.waitSemaphoreCount = 1;
685 |         presentInfo.pWaitSemaphores = signalSemaphores;
686 | 
687 |         VkSwapchainKHR swapChains[] = {swapChain};
688 |         presentInfo.swapchainCount = 1;
689 |         presentInfo.pSwapchains = swapChains;
690 | 
691 |         presentInfo.pImageIndices = &imageIndex;
692 | 
693 |         vkQueuePresentKHR(presentQueue, &presentInfo);
694 |     }
695 | 
696 |     VkShaderModule createShaderModule(const std::vector<char>& code) {
697 |         VkShaderModuleCreateInfo createInfo{};
698 |         createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
699 |         createInfo.codeSize = code.size();
700 |         createInfo.pCode = reinterpret_cast<const uint32_t*>(code.data());
701 | 
702 |         VkShaderModule shaderModule;
703 |         if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != VK_SUCCESS) {
704 |             throw std::runtime_error("failed to create shader module!");
705 |         }
706 | 
707 |         return shaderModule;
708 |     }
709 | 
710 |     VkSurfaceFormatKHR chooseSwapSurfaceFormat(const std::vector<VkSurfaceFormatKHR>& availableFormats) {
711 |         for (const auto& availableFormat : availableFormats) {
712 |             if (availableFormat.format == VK_FORMAT_B8G8R8A8_SRGB && availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) {
713 |                 return availableFormat;
714 |             }
715 |         }
716 | 
717 |         return availableFormats[0];
718 |     }
719 | 
720 |     VkPresentModeKHR chooseSwapPresentMode(const std::vector<VkPresentModeKHR>& availablePresentModes) {
721 |         for (const auto& availablePresentMode : availablePresentModes) {
722 |             if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) {
723 |                 return availablePresentMode;
724 |             }
725 |         }
726 | 
727 |         return VK_PRESENT_MODE_FIFO_KHR;
728 |     }
729 | 
730 |     VkExtent2D chooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities) {
731 |         if (capabilities.currentExtent.width != std::numeric_limits<uint32_t>::max()) {
732 |             return capabilities.currentExtent;
733 |         } else {
734 |             int width, height;
735 |             glfwGetFramebufferSize(window, &width, &height);
736 | 
737 |             VkExtent2D actualExtent = {
738 |                 static_cast<uint32_t>(width),
739 |                 static_cast<uint32_t>(height)
740 |             };
741 | 
742 |             actualExtent.width = std::clamp(actualExtent.width, capabilities.minImageExtent.width, capabilities.maxImageExtent.width);
743 |             actualExtent.height = std::clamp(actualExtent.height, capabilities.minImageExtent.height, capabilities.maxImageExtent.height);
744 | 
745 |             return actualExtent;
746 |         }
747 |     }
748 | 
749 |     SwapChainSupportDetails querySwapChainSupport(VkPhysicalDevice device) {
750 |         SwapChainSupportDetails details;
751 | 
752 |         vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, &details.capabilities);
753 | 
754 |         uint32_t formatCount;
755 |         vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, nullptr);
756 | 
757 |         if (formatCount != 0) {
758 |             details.formats.resize(formatCount);
759 |             vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, details.formats.data());
760 |         }
761 | 
762 |         uint32_t presentModeCount;
763 |         vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, nullptr);
764 | 
765 |         if (presentModeCount != 0) {
766 |             details.presentModes.resize(presentModeCount);
767 |             vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, details.presentModes.data());
768 |         }
769 | 
770 |         return details;
771 |     }
772 | 
773 |     bool isDeviceSuitable(VkPhysicalDevice device) {
774 |         QueueFamilyIndices indices = findQueueFamilies(device);
775 | 
776 |         bool extensionsSupported = checkDeviceExtensionSupport(device);
777 | 
778 |         bool swapChainAdequate = false;
779 |         if (extensionsSupported) {
780 |             SwapChainSupportDetails swapChainSupport = querySwapChainSupport(device);
781 |             swapChainAdequate = !swapChainSupport.formats.empty() && !swapChainSupport.presentModes.empty();
782 |         }
783 | 
784 |         return indices.isComplete() && extensionsSupported && swapChainAdequate;
785 |     }
786 | 
787 |     bool checkDeviceExtensionSupport(VkPhysicalDevice device) {
788 |         uint32_t extensionCount;
789 |         vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, nullptr);
790 | 
791 |         std::vector<VkExtensionProperties> availableExtensions(extensionCount);
792 |         vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, availableExtensions.data());
793 | 
794 |         std::set<std::string> requiredExtensions(deviceExtensions.begin(), deviceExtensions.end());
795 | 
796 |         for (const auto& extension : availableExtensions) {
797 |             requiredExtensions.erase(extension.extensionName);
798 |         }
799 | 
800 |         return requiredExtensions.empty();
801 |     }
802 | 
803 |     QueueFamilyIndices findQueueFamilies(VkPhysicalDevice device) {
804 |         QueueFamilyIndices indices;
805 | 
806 |         uint32_t queueFamilyCount = 0;
807 |         vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, nullptr);
808 | 
809 |         std::vector<VkQueueFamilyProperties> queueFamilies(queueFamilyCount);
810 |         vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, queueFamilies.data());
811 | 
812 |         int i = 0;
813 |         for (const auto& queueFamily : queueFamilies) {
814 |             if (queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT) {
815 |                 indices.graphicsFamily = i;
816 |             }
817 | 
818 |             VkBool32 presentSupport = false;
819 |             vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport);
820 | 
821 |             if (presentSupport) {
822 |                 indices.presentFamily = i;
823 |             }
824 | 
825 |             if (indices.isComplete()) {
826 |                 break;
827 |             }
828 | 
829 |             i++;
830 |         }
831 | 
832 |         return indices;
833 |     }
834 | 
835 |     std::vector<const char*> getRequiredExtensions() {
836 |         uint32_t glfwExtensionCount = 0;
837 |         const char** glfwExtensions;
838 |         glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount);
839 | 
840 |         std::vector<const char*> extensions(glfwExtensions, glfwExtensions + glfwExtensionCount);
841 | 
842 |         if (enableValidationLayers) {
843 |             extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME);
844 |         }
845 | 
846 |         return extensions;
847 |     }
848 | 
849 |     bool checkValidationLayerSupport() {
850 |         uint32_t layerCount;
851 |         vkEnumerateInstanceLayerProperties(&layerCount, nullptr);
852 | 
853 |         std::vector<VkLayerProperties> availableLayers(layerCount);
854 |         vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data());
855 | 
856 |         for (const char* layerName : validationLayers) {
857 |             bool layerFound = false;
858 | 
859 |             for (const auto& layerProperties : availableLayers) {
860 |                 if (strcmp(layerName, layerProperties.layerName) == 0) {
861 |                     layerFound = true;
862 |                     break;
863 |                 }
864 |             }
865 | 
866 |             if (!layerFound) {
867 |                 return false;
868 |             }
869 |         }
870 | 
871 |         return true;
872 |     }
873 | 
874 |     static std::vector<char> readFile(const std::string& filename) {
875 |         std::ifstream file(filename, std::ios::ate | std::ios::binary);
876 | 
877 |         if (!file.is_open()) {
878 |             throw std::runtime_error("failed to open file!");
879 |         }
880 | 
881 |         size_t fileSize = (size_t) file.tellg();
882 |         std::vector<char> buffer(fileSize);
883 | 
884 |         file.seekg(0);
885 |         file.read(buffer.data(), fileSize);
886 | 
887 |         file.close();
888 | 
889 |         return buffer;
890 |     }
891 | 
892 |     static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, VkDebugUtilsMessageTypeFlagsEXT messageType, const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData, void* pUserData) {
893 |         std::cerr << "validation layer: " << pCallbackData->pMessage << std::endl;
894 | 
895 |         return VK_FALSE;
896 |     }
897 | };
898 | 
899 | int main() {
900 |     HelloTriangleApplication app;
901 | 
902 |     try {
903 |         app.run();
904 |     } catch (const std::exception& e) {
905 |         std::cerr << e.what() << std::endl;
906 |         return EXIT_FAILURE;
907 |     }
908 | 
909 |     return EXIT_SUCCESS;
910 | }
911 | 


--------------------------------------------------------------------------------