├── Add.comp
├── CMakeLists.txt
├── LICENSE
├── README.md
├── Reduce.comp
├── Scan.comp
├── generateGraphs.py
├── main.cpp
├── reduce.cpp
├── reduce.h
├── scan.cpp
└── scan.h


/Add.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | #extension GL_ARB_separate_shader_objects : enable
 3 | #extension GL_GOOGLE_include_directive : enable
 4 | 
 5 | layout (local_size_x_id = 1) in;
 6 | 
 7 | layout(push_constant) uniform Consts
 8 | {
 9 |   int n;
10 | }consts;
11 | 
12 | layout(std430, binding = 0) buffer Input
13 | {
14 |   float value[];
15 | }i;
16 | 
17 | layout(std430, binding = 1) buffer Output
18 | {
19 |   float value[];
20 | }o;
21 | 
22 | shared float sum;
23 | 
24 | void main()
25 | {
26 |   // Adds i.value[group - 1] to every in-range element of o.value; work group 0 is left untouched.
27 |   // Only invocation 0 writes the shared value, so no other invocation can overwrite it before the barrier.
28 |   if (gl_LocalInvocationID.x == 0)
29 |   {
30 |     sum = gl_WorkGroupID.x > 0 ? i.value[gl_WorkGroupID.x - 1] : 0.0;
31 |   }
32 | 
33 |   // barrier() must be reached by every invocation of the work group, so it is kept
34 |   // outside the per-invocation bounds check below.
35 |   memoryBarrierShared();
36 |   barrier();
37 |   if (gl_WorkGroupID.x > 0 && gl_GlobalInvocationID.x < consts.n)
38 |   {
39 |     o.value[gl_GlobalInvocationID.x] += sum;
40 |   }
41 | }


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.11)
 2 | 
 3 | project(VulkanSubgroups)
 4 | 
 5 | set(CMAKE_CXX_STANDARD 17)
 6 | 
 7 | # cmake file to download and include dependencies
 8 | include(FetchContent)
 9 | 
10 | FetchContent_Declare(vortex2d
11 |                      GIT_REPOSITORY      https://github.com/mmaldacker/Vortex2D.git
12 |                      GIT_TAG             master)
13 | FetchContent_GetProperties(vortex2d)
14 | if(NOT vortex2d_POPULATED)
15 |   FetchContent_Populate(vortex2d)
16 |   add_subdirectory(${vortex2d_SOURCE_DIR} ${vortex2d_BINARY_DIR})
17 | endif()
18 | 
19 | set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE)
20 | FetchContent_Declare(benchmark
21 |                      GIT_REPOSITORY      https://github.com/google/benchmark.git
22 |                      GIT_TAG             v1.4.1)
23 | FetchContent_GetProperties(benchmark) # sets benchmark_POPULATED; without it the check below is always true
24 | if(NOT benchmark_POPULATED)
25 |   FetchContent_Populate(benchmark)
26 |   add_subdirectory(${benchmark_SOURCE_DIR} ${benchmark_BINARY_DIR})
27 | endif()
28 | 
29 | compile_shader(SOURCES
30 |                  ${CMAKE_CURRENT_SOURCE_DIR}/Reduce.comp
31 |                  ${CMAKE_CURRENT_SOURCE_DIR}/Scan.comp
32 |                  ${CMAKE_CURRENT_SOURCE_DIR}/Add.comp
33 |                OUTPUT "vulkansubgroups_spirv"
34 |                VERSION 1.1)
35 | 
36 | add_executable(VulkanSubgroups
37 |     main.cpp
38 |     reduce.h
39 |     reduce.cpp
40 |     scan.h
41 |     scan.cpp
42 |     Reduce.comp
43 |     Scan.comp
44 |     Add.comp
45 |     ${CMAKE_CURRENT_BINARY_DIR}/vulkansubgroups_spirv.cpp
46 |     ${CMAKE_CURRENT_BINARY_DIR}/vulkansubgroups_spirv.h
47 | )
48 | 
49 | target_include_directories(VulkanSubgroups PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
50 | 
51 | if (WIN32)
52 |     vortex2d_copy_dll(VulkanSubgroups)
53 | endif()
54 | 
55 | target_link_libraries(VulkanSubgroups vortex2d benchmark)


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Maximilian Maldacker
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # VulkanSubgroups
2 | Vulkan subgroups example for parallel reduce and scan.
3 | 
4 | Project can be compiled with simple cmake commands on Windows and Linux. Dependencies are downloaded by cmake.
5 | 
6 | Blog post is at [Parallel reduce and scan on the GPU](https://cachemiss.xyz/blog/parallel-reduce-and-scan-on-the-GPU)


--------------------------------------------------------------------------------
/Reduce.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | #extension GL_KHR_shader_subgroup_arithmetic : enable
 3 | 
 4 | layout(std430, binding = 0) buffer Input
 5 | {
 6 |    float inputs[];
 7 | };
 8 | 
 9 | layout(std430, binding = 1) buffer Output
10 | {
11 |    float outputs[];
12 | };
13 | 
14 | layout (local_size_x_id = 1) in;
15 | layout (constant_id = 2) const int sumSubGroupSize = 64;
16 | 
17 | layout(push_constant) uniform PushConsts
18 | {
19 |   int n;
20 | } consts;
21 | 
22 | shared float sdata[sumSubGroupSize];
23 | 
24 | void main()
25 | {
26 |     float sum = 0.0;
27 |     if (gl_GlobalInvocationID.x < consts.n)
28 |     {
29 |         sum = inputs[gl_GlobalInvocationID.x];
30 |     }
31 |     // Invocations at or past consts.n keep 0. Step 1: add up the values within each subgroup.
32 |     sum = subgroupAdd(sum);
33 |     // Invocation 0 of each subgroup stores that subgroup's total in shared memory.
34 |     if (gl_SubgroupInvocationID == 0)
35 |     {
36 |         sdata[gl_SubgroupID] = sum;
37 |     }
38 |     // Wait until every subgroup has written its total before subgroup 0 reads them.
39 |     memoryBarrierShared();
40 |     barrier();
41 |     // Step 2: subgroup 0 adds the stored totals, one per invocation; the other invocations contribute 0.
42 |     if (gl_SubgroupID == 0)
43 |     {
44 |         sum = gl_SubgroupInvocationID < gl_NumSubgroups ? sdata[gl_SubgroupInvocationID] : 0; // NOTE(review): covers all totals only if gl_NumSubgroups <= gl_SubgroupSize
45 |         sum = subgroupAdd(sum);
46 |     }
47 |     // Local invocation 0 writes its sum to this work group's output slot (presumably it belongs to subgroup 0).
48 |     if (gl_LocalInvocationID.x == 0)
49 |     {
50 |         outputs[gl_WorkGroupID.x] = sum;
51 |     }
52 | }
53 | 


--------------------------------------------------------------------------------
/Scan.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | #extension GL_KHR_shader_subgroup_arithmetic : enable
 3 | 
 4 | layout(std430, binding = 0) buffer Input
 5 | {
 6 |    float inputs[];
 7 | };
 8 | 
 9 | layout(std430, binding = 1) buffer Output
10 | {
11 |    float outputs[];
12 | };
13 | 
14 | layout(std430, binding = 2) buffer PartialSums
15 | {
16 |    float partial_sums[];
17 | };
18 | 
19 | layout (local_size_x_id = 1) in;
20 | layout (constant_id = 2) const int sumSubGroupSize = 64;
21 | 
22 | layout(push_constant) uniform PushConsts
23 | {
24 |   int n;
25 | } consts;
26 | 
27 | shared float sdata[sumSubGroupSize];
28 | 
29 | void main()
30 | {
31 |     float sum = 0.0;
32 |     if (gl_GlobalInvocationID.x < consts.n)
33 |     {
34 |         sum = inputs[gl_GlobalInvocationID.x];
35 |     }
36 |     // Invocations at or past consts.n keep 0. Step 1: inclusive prefix sum within each subgroup.
37 |     sum = subgroupInclusiveAdd(sum);
38 |     // The last invocation of a subgroup holds that subgroup's total; store it in shared memory.
39 |     if (gl_SubgroupInvocationID == gl_SubgroupSize - 1)
40 |     {
41 |         sdata[gl_SubgroupID] = sum;
42 |     }
43 | 
44 |     memoryBarrierShared();
45 |     barrier();
46 |     // Step 2: subgroup 0 replaces the stored totals with their inclusive prefix sum.
47 |     if (gl_SubgroupID == 0)
48 |     {
49 |         float warpSum = gl_SubgroupInvocationID < gl_NumSubgroups ? sdata[gl_SubgroupInvocationID] : 0;
50 |         warpSum = subgroupInclusiveAdd(warpSum);
51 |         sdata[gl_SubgroupInvocationID] = warpSum;
52 |     }
53 | 
54 |     memoryBarrierShared();
55 |     barrier();
56 |     // Step 3: every subgroup except the first adds the running total of the subgroups before it.
57 |     float blockSum = 0;
58 |     if (gl_SubgroupID > 0)
59 |     {
60 |         blockSum = sdata[gl_SubgroupID - 1];
61 |     }
62 | 
63 |     sum += blockSum;
64 | 
65 |     if (gl_GlobalInvocationID.x < consts.n)
66 |     {
67 |         outputs[gl_GlobalInvocationID.x] = sum;
68 |     }
69 |     // The last invocation of the work group publishes its inclusive sum as this work group's partial sum.
70 |     if (gl_LocalInvocationID.x == gl_WorkGroupSize.x - 1)
71 |     {
72 |         partial_sums[gl_WorkGroupID.x] = sum;
73 |     }
74 | }
75 | 


--------------------------------------------------------------------------------
/generateGraphs.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | import pygal
 3 | from collections import defaultdict
 4 | # Reads benchmark rows from timing.csv and renders one line chart each for the reduce and scan results.
 5 | reduceChart = pygal.Line(logarithmic=True, title='Reduce', y_title='Time (us)', x_title='Number of elements')
 6 | scanChart = pygal.Line(logarithmic=True, title='Scan', y_title='Time (us)', x_title='Number of elements')
 7 | # Series name -> list of 'real_time' values, in the order the rows appear in the CSV.
 8 | reduceDict = defaultdict(list)
 9 | scanDict = defaultdict(list)
10 | # X-axis labels: the second '/'-separated field of every 'Reduce_GPU_Subgroup' row name.
11 | sizes = []
12 | 
13 | with open('timing.csv', newline='') as csvfile:
14 |     reader = csv.DictReader(csvfile, delimiter=',')
15 |     for row in reader:
16 |         name = row['name'].split('/')
17 |         if name[0] == 'Reduce_GPU_Subgroup':
18 |             sizes.append(name[1])
19 |         if name[0].startswith('Reduce'):
20 |             reduceDict[name[0].replace('Reduce_', '').replace('_', ' ')].append(row['real_time'])
21 |         else:
22 |             scanDict[name[0].replace('Scan_', '').replace('_', ' ')].append(row['real_time'])
23 | # One line per series; values are divided by 1000 (presumably ns -> us, matching the axis title).
24 | for name, values in reduceDict.items():
25 |     reduceChart.add(name, [float(value) / 1000.0 for value in values])
26 | 
27 | reduceChart.x_labels = sizes
28 | reduceChart.render_to_file('reduce.svg')
29 | # Same treatment for the scan series; the labels collected from the reduce rows are reused.
30 | for name, values in scanDict.items():
31 |     scanChart.add(name, [float(value) / 1000.0 for value in values])
32 | 
33 | scanChart.x_labels = sizes
34 | scanChart.render_to_file('scan.svg')


--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
 1 | #include "reduce.h"
 2 | #include "scan.h"
 3 | 
 4 | #include <benchmark/benchmark.h>
 5 | #include <iostream>
 6 | 
 7 | Vortex2D::Renderer::Device* gDevice = nullptr;
 8 | // Entry point: creates the Vulkan instance and device, runs both checks, then the registered benchmarks.
 9 | int main(int argc, char** argv)
10 | {
11 |   Vortex2D::Renderer::Instance instance("VulkanSubgroups", {}, false);
12 |   auto physicalDevice = instance.GetPhysicalDevice();
13 |   // Query the subgroup properties of the physical device and print them.
14 |   auto properties = physicalDevice.getProperties2<vk::PhysicalDeviceProperties2, vk::PhysicalDeviceSubgroupProperties>();
15 |   auto subgroupProperties = properties.get<vk::PhysicalDeviceSubgroupProperties>();
16 | 
17 |   std::cout << "Subgroup size: " << subgroupProperties.subgroupSize << std::endl;
18 |   std::cout << "Subgroup supported operations: " << vk::to_string(subgroupProperties.supportedOperations) << std::endl;
19 |   // Publish the device through the global pointer that the checks and benchmarks dereference.
20 |   Vortex2D::Renderer::Device device(physicalDevice);
21 |   gDevice = &device;
22 |   // Run the one-off checks first, then the benchmarks selected on the command line.
23 |   CheckReduce();
24 |   CheckScan();
25 | 
26 |   benchmark::Initialize(&argc, argv);
27 |   benchmark::RunSpecifiedBenchmarks();
28 | }
29 | 


--------------------------------------------------------------------------------
/reduce.cpp:
--------------------------------------------------------------------------------
  1 | #include "reduce.h"
  2 | #include <benchmark/benchmark.h>
  3 | #include <numeric>
  4 | #include <execution>
  5 | #include <iostream>
  6 | #include "vulkansubgroups_spirv.h"
  7 | 
  8 | namespace
  9 | {
 10 | int NextSize(int size, int localSize) // number of groups of localSize needed to cover size elements
 11 | {
 12 |   return (size + localSize - 1) / localSize; // ceil(size / localSize): no extra group when size is an exact multiple
 13 | }
 14 | }
 15 | // Creates the per-level buffers, binds the reduce shader per level and records the upload/download/reduce commands.
 16 | Reduce::Reduce(const Vortex2D::Renderer::Device& device,
 17 |                int size,
 18 |                int localSize)
 19 |   : mTimer(device)
 20 |   , mUploadCmd(device)
 21 |   , mDownloadCmd(device)
 22 |   , mReduceCmd(device)
 23 |   , mReduceWork(device, {size, localSize}, Vortex2D::SPIRV::Reduce_comp)
 24 |   , mLocalInput(device, size, VMA_MEMORY_USAGE_CPU_TO_GPU)
 25 |   , mLocalOutput(device, 1, VMA_MEMORY_USAGE_GPU_TO_CPU)
 26 | {
 27 |   int n = size;
 28 |   while (n > 1)
 29 |   {
 30 |     mBuffers.emplace_back(device, n, VMA_MEMORY_USAGE_GPU_ONLY);
 31 |     n = NextSize(n, localSize);
 32 |   }
 33 |   // The loop above adds one buffer per level (sizes shrink via NextSize); this one holds the final result.
 34 |   mBuffers.emplace_back(device, 1, VMA_MEMORY_USAGE_GPU_ONLY);
 35 |   // Bind one shader instance per level: level i is given mBuffers[i] and mBuffers[i + 1].
 36 |   n = size;
 37 |   for (std::size_t i = 0; i < mBuffers.size() - 1; i++)
 38 |   {
 39 |     mReduce.emplace_back(mReduceWork.Bind({n, localSize}, {mBuffers[i], mBuffers[i + 1]}));
 40 |     n = NextSize(n, localSize);
 41 |   }
 42 |   // Upload command: mLocalInput -> first buffer.
 43 |   mUploadCmd.Record([&](vk::CommandBuffer commandBuffer)
 44 |   {
 45 |     mBuffers.front().CopyFrom(commandBuffer, mLocalInput);
 46 |   });
 47 |   // Download command: last buffer -> mLocalOutput.
 48 |   mDownloadCmd.Record([&](vk::CommandBuffer commandBuffer)
 49 |   {
 50 |     mLocalOutput.CopyFrom(commandBuffer, mBuffers.back());
 51 |   });
 52 |   // Reduce command: all levels back to back, bracketed by the timer.
 53 |   mReduceCmd.Record([&](vk::CommandBuffer commandBuffer)
 54 |   {
 55 |     mTimer.Start(commandBuffer);
 56 | 
 57 |     int n = size;
 58 |     for (std::size_t i = 0; i < mReduce.size(); i++)
 59 |     {
 60 |       mReduce[i].PushConstant(commandBuffer, n);
 61 |       mReduce[i].Record(commandBuffer);
 62 |       // The next level reads the buffer this level has just written.
 63 |       Vortex2D::Renderer::BufferBarrier(mBuffers[i + 1].Handle(), commandBuffer, vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
 64 | 
 65 |       n = NextSize(n, localSize);
 66 |     }
 67 | 
 68 |     mTimer.Stop(commandBuffer);
 69 |   });
 70 | }
 71 | // Copies `input` into mLocalInput and submits the recorded copy into mBuffers.front(); does not wait for it.
 72 | void Reduce::Upload(const std::vector<float>& input)
 73 | {
 74 |   Vortex2D::Renderer::CopyFrom(mLocalInput, input);
 75 |   mUploadCmd.Submit();
 76 | }
 77 | // Submits the copy of mBuffers.back() into mLocalOutput, waits for it and returns the float read from it.
 78 | float Reduce::Download()
 79 | {
 80 |   mDownloadCmd.Submit();
 81 |   mDownloadCmd.Wait();
 82 | 
 83 |   float total = 0.0;
 84 |   Vortex2D::Renderer::CopyTo(mLocalOutput, total);
 85 |   return total;
 86 | }
 87 | // Submits the recorded reduce command buffer and blocks until Wait() returns.
 88 | void Reduce::Submit()
 89 | {
 90 |   mReduceCmd.Submit();
 91 |   mReduceCmd.Wait();
 92 | }
 93 | // Returns mTimer's elapsed nanoseconds; the timer is started and stopped inside the reduce command buffer.
 94 | uint64_t Reduce::GetElapsedNs()
 95 | {
 96 |   return mTimer.GetElapsedNs();
 97 | }
 98 | // CPU baseline: sums the values 1, 2, ..., N with the sequential execution policy.
 99 | static void Reduce_CPU_Seq(benchmark::State& state)
100 | {
101 |   const auto count = state.range(0);
102 |   std::vector<float> values(count);
103 |   std::iota(values.begin(), values.end(), 1.0f);
104 | 
105 |   for (auto _ : state)
106 |   {
107 |     auto total = std::reduce(std::execution::seq, values.begin(), values.end());
108 |     benchmark::DoNotOptimize(total); // keeps the reduction from being optimised away
109 |   }
110 | }
111 | // CPU baseline: sums the values 1, 2, ..., N with the parallel execution policy.
112 | static void Reduce_CPU_Par(benchmark::State& state)
113 | {
114 |   const auto count = state.range(0);
115 |   std::vector<float> values(count);
116 |   std::iota(values.begin(), values.end(), 1.0f);
117 | 
118 |   for (auto _ : state)
119 |   {
120 |     auto total = std::reduce(std::execution::par, values.begin(), values.end());
121 |     benchmark::DoNotOptimize(total); // keeps the reduction from being optimised away
122 |   }
123 | }
124 | // Benchmarks the subgroup-based GPU reduce; each iteration reports the time taken from Reduce's own timer.
125 | static void Reduce_GPU_Subgroup(benchmark::State& state)
126 | {
127 |   auto size = state.range(0);
128 | 
129 |   Reduce reduce(*gDevice, static_cast<int>(size), 512);
130 |   std::vector<float> inputData(size, 1.0f);
131 |   std::iota(inputData.begin(), inputData.end(), 1.0f);
132 |   reduce.Upload(inputData); // the data used to be built but never uploaded, so the shader read uninitialised memory
133 | 
134 |   for (auto _ : state)
135 |   {
136 |     reduce.Submit();
137 |     state.SetIterationTime(reduce.GetElapsedNs() / 1000000000.0);
138 |   }
139 | }
140 | // Benchmarks Vortex2D::Fluid::ReduceSum on `size` floats, timed with a timer recorded around the bound reduce.
141 | static void Reduce_GPU_SharedMemory(benchmark::State& state)
142 | {
143 |   auto size = state.range(0);
144 | 
145 |   Vortex2D::Renderer::Timer timer(*gDevice);
146 |   Vortex2D::Fluid::ReduceSum reduce(*gDevice, {size, 1});
147 |   // NOTE(review): nothing is uploaded into `input` in this function, so the values being reduced are unspecified.
148 |   Vortex2D::Renderer::Buffer<float> input(*gDevice, size, VMA_MEMORY_USAGE_GPU_ONLY);
149 |   Vortex2D::Renderer::Buffer<float> output(*gDevice, 1, VMA_MEMORY_USAGE_GPU_ONLY);
150 | 
151 |   auto boundReduce = reduce.Bind(input, output);
152 | 
153 |   Vortex2D::Renderer::CommandBuffer cmd(*gDevice);
154 |   cmd.Record([&](vk::CommandBuffer commandBuffer)
155 |   {
156 |     timer.Start(commandBuffer);
157 |     boundReduce.Record(commandBuffer);
158 |     timer.Stop(commandBuffer);
159 |   });
160 |   // Each iteration re-submits the recorded commands, waits, and reports the timer value in seconds.
161 |   for (auto _ : state)
162 |   {
163 |     cmd.Submit();
164 |     cmd.Wait();
165 |     state.SetIterationTime(timer.GetElapsedNs() / 1000000000.0);
166 |   }
167 | }
168 | // Input sizes range from 8 to 8<<20; the GPU variants use the time passed to SetIterationTime (UseManualTime).
169 | BENCHMARK(Reduce_GPU_Subgroup)->Range(8, 8<<20)->UseManualTime();
170 | BENCHMARK(Reduce_GPU_SharedMemory)->Range(8, 8<<20)->UseManualTime();
171 | BENCHMARK(Reduce_CPU_Seq)->Range(8, 8<<20);
172 | BENCHMARK(Reduce_CPU_Par)->Range(8, 8<<20);
173 | // Sanity check: reduces 1..300 on the GPU and prints the result next to the closed-form sum n(n+1)/2.
174 | void CheckReduce()
175 | {
176 |   const int count = 300;
177 |   const int groupSize = 256;
178 | 
179 |   std::vector<float> values(count);
180 |   std::iota(values.begin(), values.end(), 1.0f);
181 | 
182 |   Reduce reducer(*gDevice, count, groupSize);
183 |   reducer.Upload(values);
184 |   reducer.Submit();
185 | 
186 |   std::cout << "Total " << reducer.Download() << std::endl;
187 |   std::cout << "Expected total " << 0.5f * count * (count + 1) << std::endl;
188 | }
189 | 


--------------------------------------------------------------------------------
/reduce.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <Vortex2D/Vortex2D.h>
 4 | 
 5 | extern Vortex2D::Renderer::Device* gDevice;
 6 | // Multi-level GPU sum of a float array using the Reduce.comp shader; see reduce.cpp for the level layout.
 7 | class Reduce
 8 | {
 9 | public:
10 |   Reduce(const Vortex2D::Renderer::Device& device,
11 |          int size,
12 |          int localSize);
13 |   // size: number of floats to reduce; localSize: group size each level divides by (presumably the work-group size).
14 |   void Upload(const std::vector<float>& input); // stage `input` and submit the copy to the first buffer
15 |   float Download(); // copy the last buffer back and return its single value
16 |   void Submit(); // run the recorded reduce commands and wait for them
17 |   uint64_t GetElapsedNs(); // elapsed time reported by mTimer
18 | 
19 | private:
20 |   Vortex2D::Renderer::Timer mTimer;
21 |   Vortex2D::Renderer::CommandBuffer mUploadCmd, mDownloadCmd, mReduceCmd;
22 |   Vortex2D::Renderer::Work mReduceWork; // reduce shader
23 |   std::vector<Vortex2D::Renderer::Buffer<float>> mBuffers; // buffers for input, intermediate results and output
24 |   Vortex2D::Renderer::Buffer<float> mLocalInput, mLocalOutput; // buffers for copying input/output to device
25 |   std::vector<Vortex2D::Renderer::Work::Bound> mReduce; // bound reduce shaders for each level
26 | };
27 | 
28 | void CheckReduce();
29 | 


--------------------------------------------------------------------------------
/scan.cpp:
--------------------------------------------------------------------------------
  1 | #include "scan.h"
  2 | #include <benchmark/benchmark.h>
  3 | #include <iostream>
  4 | #include <numeric>
  5 | #include <execution>
  6 | #include "vulkansubgroups_spirv.h"
  7 | 
  8 | namespace
  9 | {
 10 | int NextSize(int size, int localSize) // number of groups of localSize needed to cover size elements
 11 | {
 12 |   return (size + localSize - 1) / localSize; // ceil(size / localSize): no extra group when size is an exact multiple
 13 | }
 14 | }
 15 | // Creates the per-level buffers, binds the scan/add shaders per level and records the upload/download/scan commands.
 16 | Scan::Scan(const Vortex2D::Renderer::Device& device,
 17 |            int size,
 18 |            int localSize)
 19 |   : mTimer(device)
 20 |   , mUploadCmd(device)
 21 |   , mDownloadCmd(device)
 22 |   , mScanCmd(device)
 23 |   , mScanWork(device, {size, localSize}, Vortex2D::SPIRV::Scan_comp)
 24 |   , mAddWork(device, {NextSize(size, localSize), localSize}, Vortex2D::SPIRV::Add_comp)
 25 |   , mLocalInput(device, size, VMA_MEMORY_USAGE_CPU_TO_GPU)
 26 |   , mLocalOutput(device, size, VMA_MEMORY_USAGE_GPU_TO_CPU)
 27 | {
 28 |   int n = size;
 29 |   while (n > 1)
 30 |   {
 31 |     mBuffers.emplace_back(device, n, VMA_MEMORY_USAGE_GPU_ONLY);
 32 |     n = NextSize(n, localSize);
 33 |   }
 34 |   // mBuffers[0] holds the data; every further buffer receives the work-group totals of the level below it.
 35 |   mBuffers.emplace_back(device, 1, VMA_MEMORY_USAGE_GPU_ONLY);
 36 |   // Level i: mScan[i] scans mBuffers[i] in place and emits totals into mBuffers[i + 1]; mAdd[i] adds them back.
 37 |   n = size;
 38 |   for (std::size_t i = 0; i < mBuffers.size() - 1; i++)
 39 |   {
 40 |     mScan.emplace_back(mScanWork.Bind({n, localSize}, {mBuffers[i], mBuffers[i], mBuffers[i + 1]}));
 41 |     mAdd.emplace_back(mAddWork.Bind({n, localSize}, {mBuffers[i + 1], mBuffers[i]}));
 42 |     n = NextSize(n, localSize);
 43 |   }
 44 | 
 45 |   mUploadCmd.Record([&](vk::CommandBuffer commandBuffer)
 46 |   {
 47 |     mBuffers[0].CopyFrom(commandBuffer, mLocalInput);
 48 |   });
 49 | 
 50 |   mDownloadCmd.Record([&](vk::CommandBuffer commandBuffer)
 51 |   {
 52 |     mLocalOutput.CopyFrom(commandBuffer, mBuffers[0]);
 53 |   });
 54 | 
 55 |   mScanCmd.Record([&](vk::CommandBuffer commandBuffer)
 56 |   {
 57 |     mTimer.Start(commandBuffer);
 58 | 
 59 |     // Pass 1: scan every level; level i leaves one total per work group in mBuffers[i + 1] for level i + 1.
 60 |     std::vector<int> levelSizes; // element count of each level, needed again in pass 2
 61 |     int n = size;
 62 |     for (std::size_t i = 0; i < mScan.size(); i++)
 63 |     {
 64 |       mScan[i].PushConstant(commandBuffer, n);
 65 |       mScan[i].Record(commandBuffer);
 66 |       Vortex2D::Renderer::BufferBarrier(mBuffers[i].Handle(), commandBuffer, vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
 67 |       Vortex2D::Renderer::BufferBarrier(mBuffers[i + 1].Handle(), commandBuffer, vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
 68 |       levelSizes.push_back(n);
 69 |       n = NextSize(n, localSize);
 70 |     }
 71 | 
 72 |     // Pass 2, deepest level first (i = mScan.size() - 1 .. 1): add level i's scanned totals to level i - 1, so each level is final before it is consumed.
 73 |     for (std::size_t i = mScan.size(); i-- > 1;)
 74 |     {
 75 |       mAdd[i - 1].PushConstant(commandBuffer, levelSizes[i - 1]);
 76 |       mAdd[i - 1].Record(commandBuffer);
 77 |       Vortex2D::Renderer::BufferBarrier(mBuffers[i - 1].Handle(), commandBuffer, vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
 78 |     }
 79 | 
 80 |     mTimer.Stop(commandBuffer);
 81 |   });
 82 | }
 83 | // Copies `input` into mLocalInput and submits the recorded copy into mBuffers[0]; does not wait for it.
 84 | void Scan::Upload(const std::vector<float>& input)
 85 | {
 86 |   Vortex2D::Renderer::CopyFrom(mLocalInput, input);
 87 |   mUploadCmd.Submit();
 88 | }
 89 | // Submits the copy of mBuffers[0] into mLocalOutput, waits for it and returns the contents as a vector of floats.
 90 | std::vector<float> Scan::Download()
 91 | {
 92 |   mDownloadCmd.Submit();
 93 |   mDownloadCmd.Wait();
 94 |   // Size() is divided by sizeof(float), so it is presumably a byte count — TODO confirm against Vortex2D.
 95 |   std::vector<float> output(mLocalOutput.Size() / sizeof(float), 0.0f);
 96 |   Vortex2D::Renderer::CopyTo(mLocalOutput, output);
 97 |   return output;
 98 | }
 99 | // Submits the recorded scan command buffer and blocks until Wait() returns.
100 | void Scan::Submit()
101 | {
102 |   mScanCmd.Submit();
103 |   mScanCmd.Wait();
104 | }
105 | // Returns mTimer's elapsed nanoseconds; the timer is started and stopped inside the scan command buffer.
106 | uint64_t Scan::GetElapsedNs()
107 | {
108 |   return mTimer.GetElapsedNs();
109 | }
110 | // Benchmarks the subgroup-based GPU scan; each iteration reports the time taken from Scan's own timer.
111 | static void Scan_GPU_Subgroup(benchmark::State& state)
112 | {
113 |   auto size = state.range(0);
114 | 
115 |   Scan scan(*gDevice, static_cast<int>(size), 512);
116 |   std::vector<float> inputData(size, 1.0f);
117 |   std::iota(inputData.begin(), inputData.end(), 1.0f);
118 |   scan.Upload(inputData); // the data used to be built but never uploaded, so the shader read uninitialised memory
119 | 
120 |   for (auto _ : state)
121 |   {
122 |     scan.Submit();
123 |     state.SetIterationTime(scan.GetElapsedNs() / 1000000000.0);
124 |   }
125 | }
126 | // Benchmarks Vortex2D::Fluid::PrefixScan on `size` floats, timed with a timer recorded around the bound scan.
127 | static void Scan_GPU_SharedMemory(benchmark::State& state)
128 | {
129 |   auto size = state.range(0);
130 | 
131 |   Vortex2D::Renderer::Timer timer(*gDevice);
132 |   Vortex2D::Fluid::PrefixScan scan(*gDevice, {size, 1});
133 | 
134 |   Vortex2D::Renderer::Buffer<float> input(*gDevice, size, VMA_MEMORY_USAGE_GPU_ONLY);
135 |   Vortex2D::Renderer::Buffer<float> output(*gDevice, size, VMA_MEMORY_USAGE_GPU_TO_CPU);
136 | 
137 |   Vortex2D::Renderer::Buffer<Vortex2D::Renderer::DispatchParams> dispatchParams(*gDevice);
138 | 
139 |   auto boundScan = scan.Bind(input, output, dispatchParams);
140 | 
141 |   Vortex2D::Renderer::CommandBuffer cmd(*gDevice);
142 |   cmd.Record([&](vk::CommandBuffer commandBuffer)
143 |   {
144 |     timer.Start(commandBuffer);
145 |     boundScan.Record(commandBuffer);
146 |     timer.Stop(commandBuffer);
147 |   });
148 | 
149 |   // The unused host-side result vector was dropped: nothing is read back in this benchmark.
150 |   // Each iteration re-submits the recorded commands, waits, and reports the timer value in seconds.
151 |   for (auto _ : state)
152 |   {
153 |     cmd.Submit();
154 |     cmd.Wait();
155 |     state.SetIterationTime(timer.GetElapsedNs() / 1000000000.0);
156 |   }
157 | }
158 | // CPU baseline: inclusive prefix sum of 1, 2, ..., N with the sequential execution policy.
159 | static void Scan_CPU_Seq(benchmark::State& state)
160 | {
161 |   const auto count = state.range(0);
162 |   std::vector<float> source(count);
163 |   std::vector<float> scanned(count);
164 |   std::iota(source.begin(), source.end(), 1.0f);
165 | 
166 |   for (auto _ : state)
167 |   {
168 |     auto last = std::inclusive_scan(std::execution::seq, source.begin(), source.end(), scanned.begin());
169 |     benchmark::DoNotOptimize(last); // keeps the scan from being optimised away
170 |   }
171 | }
172 | // CPU baseline: inclusive prefix sum of 1, 2, ..., N with the parallel execution policy.
173 | static void Scan_CPU_Par(benchmark::State& state)
174 | {
175 |   const auto count = state.range(0);
176 |   std::vector<float> source(count);
177 |   std::vector<float> scanned(count);
178 |   std::iota(source.begin(), source.end(), 1.0f);
179 | 
180 |   for (auto _ : state)
181 |   {
182 |     auto last = std::inclusive_scan(std::execution::par, source.begin(), source.end(), scanned.begin());
183 |     benchmark::DoNotOptimize(last); // keeps the scan from being optimised away
184 |   }
185 | }
186 | 
187 | // Input sizes range from 8 to 8<<20; the GPU variants use the time passed to SetIterationTime (UseManualTime).
188 | BENCHMARK(Scan_GPU_Subgroup)->Range(8, 8<<20)->UseManualTime();
189 | BENCHMARK(Scan_GPU_SharedMemory)->Range(8, 8<<20)->UseManualTime();
190 | BENCHMARK(Scan_CPU_Seq)->Range(8, 8<<20);
191 | BENCHMARK(Scan_CPU_Par)->Range(8, 8<<20);
192 | // Sanity check: scans 1..300 on the GPU and prints every element that differs from std::inclusive_scan.
193 | void CheckScan()
194 | {
195 |   int size = 300;
196 |   Scan scan(*gDevice, size, 256);
197 | 
198 |   std::vector<float> inputData(size, 1.0f);
199 |   std::iota(inputData.begin(), inputData.end(), 1.0f);
200 | 
201 |   scan.Upload(inputData);
202 |   scan.Submit();
203 |   auto output = scan.Download();
204 | 
205 |   std::vector<float> expectedOutput(size);
206 |   std::inclusive_scan(std::execution::seq, inputData.begin(), inputData.end(), expectedOutput.begin());
207 |   // Exact comparison is fine here: every partial sum of 1..300 is an integer that a float represents exactly.
208 |   for (std::size_t i = 0; i < expectedOutput.size(); i++)
209 |   {
210 |     if (output[i] != expectedOutput[i])
211 |     {
212 |       std::cout << "Difference at " << i << " values " << output[i] << " != " << expectedOutput[i] << std::endl;
213 |     }
214 |   }
215 | 
216 |   std::cout << "Scan complete" << std::endl;
217 | }
218 | 


--------------------------------------------------------------------------------
/scan.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <Vortex2D/Vortex2D.h>
 4 | 
 5 | extern Vortex2D::Renderer::Device* gDevice;
 6 | // Multi-level GPU inclusive prefix sum of a float array using Scan.comp and Add.comp; see scan.cpp for the levels.
 7 | class Scan
 8 | {
 9 | public:
10 |   Scan(const Vortex2D::Renderer::Device& device,
11 |        int size,
12 |        int localSize);
13 |   // size: number of floats to scan; localSize: group size each level divides by (presumably the work-group size).
14 |   void Upload(const std::vector<float>& input); // stage `input` and submit the copy to the first buffer
15 |   std::vector<float> Download(); // copy the first buffer back and return its contents
16 |   void Submit(); // run the recorded scan commands and wait for them
17 |   uint64_t GetElapsedNs(); // elapsed time reported by mTimer
18 | 
19 | private:
20 |   Vortex2D::Renderer::Timer mTimer;
21 |   Vortex2D::Renderer::CommandBuffer mUploadCmd, mDownloadCmd, mScanCmd;
22 |   Vortex2D::Renderer::Work mScanWork; // scan shader (Scan_comp)
23 |   Vortex2D::Renderer::Work mAddWork; // add shader (Add_comp)
24 | 
25 |   std::vector<Vortex2D::Renderer::Buffer<float>> mBuffers; // buffers for intermediate results
26 |   Vortex2D::Renderer::Buffer<float> mLocalInput, mLocalOutput; // buffers for copying input/output to device
27 |   std::vector<Vortex2D::Renderer::Work::Bound> mScan; // bound scan shaders for each level
28 |   std::vector<Vortex2D::Renderer::Work::Bound> mAdd; // bound add shaders for each level
29 | };
30 | 
31 | void CheckScan();
32 | 


--------------------------------------------------------------------------------