├── Add.comp ├── CMakeLists.txt ├── LICENSE ├── README.md ├── Reduce.comp ├── Scan.comp ├── generateGraphs.py ├── main.cpp ├── reduce.cpp ├── reduce.h ├── scan.cpp └── scan.h /Add.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | #extension GL_ARB_separate_shader_objects : enable 3 | #extension GL_GOOGLE_include_directive : enable 4 | 5 | layout (local_size_x_id = 1) in; 6 | 7 | layout(push_constant) uniform Consts 8 | { 9 | int n; 10 | }consts; 11 | 12 | layout(std430, binding = 0) buffer Input 13 | { 14 | float value[]; 15 | }i; 16 | 17 | layout(std430, binding = 1) buffer Output 18 | { 19 | float value[]; 20 | }o; 21 | 22 | /* Offset shared by the whole work group: the value read from i.value[gl_WorkGroupID.x - 1]. */ 23 | shared float sum; 24 | 25 | /* Adds i.value[gl_WorkGroupID.x - 1] to every element of o.value owned by this work group; work group 0 and elements at or beyond consts.n are left untouched. */ 26 | void main() 27 | { 28 | /* Exactly one invocation writes the shared offset, so no other invocation can overwrite it with 0.0 before it is read. */ 29 | if (gl_LocalInvocationID.x == 0) 30 | { 31 | sum = gl_WorkGroupID.x > 0 ? i.value[gl_WorkGroupID.x - 1] : 0.0; 32 | } 33 | 34 | /* barrier() has to be reached by every invocation of the work group, so it stays outside the per-invocation bounds check below. */ 35 | memoryBarrierShared(); 36 | barrier(); 37 | 38 | if (gl_WorkGroupID.x > 0 && 39 | gl_GlobalInvocationID.x < consts.n) 40 | { 41 | o.value[gl_GlobalInvocationID.x] += sum; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.11) 2 | 3 | project(VulkanSubgroups) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | # cmake file to download and include dependencies 8 | include(FetchContent) 9 | 10 | FetchContent_Declare(vortex2d 11 | GIT_REPOSITORY https://github.com/mmaldacker/Vortex2D.git 12 | GIT_TAG master) 13 | FetchContent_GetProperties(vortex2d) 14 | if(NOT vortex2d_POPULATED) 15 | FetchContent_Populate(vortex2d) 16 | add_subdirectory(${vortex2d_SOURCE_DIR} ${vortex2d_BINARY_DIR}) 17 | endif() 18 | 19 | set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE) 20 | FetchContent_Declare(benchmark 21 | GIT_REPOSITORY https://github.com/google/benchmark.git 22 | GIT_TAG v1.4.1) 23 | if(NOT benchmark_POPULATED) 24 | FetchContent_Populate(benchmark) 25 |
add_subdirectory(${benchmark_SOURCE_DIR} ${benchmark_BINARY_DIR}) 26 | endif() 27 | 28 | compile_shader(SOURCES 29 | ${CMAKE_CURRENT_SOURCE_DIR}/Reduce.comp 30 | ${CMAKE_CURRENT_SOURCE_DIR}/Scan.comp 31 | ${CMAKE_CURRENT_SOURCE_DIR}/Add.comp 32 | OUTPUT "vulkansubgroups_spirv" 33 | VERSION 1.1) 34 | 35 | add_executable(VulkanSubgroups 36 | main.cpp 37 | reduce.h 38 | reduce.cpp 39 | scan.h 40 | scan.cpp 41 | Reduce.comp 42 | Scan.comp 43 | Add.comp 44 | ${CMAKE_CURRENT_BINARY_DIR}/vulkansubgroups_spirv.cpp 45 | ${CMAKE_CURRENT_BINARY_DIR}/vulkansubgroups_spirv.h 46 | ) 47 | 48 | target_include_directories(VulkanSubgroups PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) 49 | 50 | if (WIN32) 51 | vortex2d_copy_dll(VulkanSubgroups) 52 | endif() 53 | 54 | target_link_libraries(VulkanSubgroups vortex2d benchmark) 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Maximilian Maldacker 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VulkanSubgroups 2 | Vulkan subgroups example for reduce and scan. 3 | 4 | The project can be built with standard CMake commands on Windows and Linux. Dependencies are downloaded by CMake. 5 | 6 | The accompanying blog post is [Parallel reduce and scan on the GPU](https://cachemiss.xyz/blog/parallel-reduce-and-scan-on-the-GPU). -------------------------------------------------------------------------------- /Reduce.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | #extension GL_KHR_shader_subgroup_arithmetic : enable 3 | 4 | layout(std430, binding = 0) buffer Input 5 | { 6 | float inputs[]; 7 | }; 8 | 9 | layout(std430, binding = 1) buffer Output 10 | { 11 | float outputs[]; 12 | }; 13 | 14 | layout (local_size_x_id = 1) in; 15 | layout (constant_id = 2) const int sumSubGroupSize = 64; 16 | 17 | layout(push_constant) uniform PushConsts 18 | { 19 | int n; 20 | } consts; 21 | 22 | shared float sdata[sumSubGroupSize]; 23 | 24 | void main() 25 | { 26 | float sum = 0.0; 27 | if (gl_GlobalInvocationID.x < consts.n) 28 | { 29 | sum = inputs[gl_GlobalInvocationID.x]; 30 | } 31 | 32 | sum = subgroupAdd(sum); 33 | 34 | if (gl_SubgroupInvocationID == 0) 35 | { 36 | sdata[gl_SubgroupID] = sum; 37 | } 38 | 39 | memoryBarrierShared(); 40 | barrier(); 41 | 42 | if (gl_SubgroupID == 0) 43 | { 44 | sum = gl_SubgroupInvocationID < gl_NumSubgroups ?
sdata[gl_SubgroupInvocationID] : 0; 45 | sum = subgroupAdd(sum); 46 | } 47 | 48 | if (gl_LocalInvocationID.x == 0) 49 | { 50 | outputs[gl_WorkGroupID.x] = sum; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /Scan.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | #extension GL_KHR_shader_subgroup_arithmetic : enable 3 | 4 | layout(std430, binding = 0) buffer Input 5 | { 6 | float inputs[]; 7 | }; 8 | 9 | layout(std430, binding = 1) buffer Output 10 | { 11 | float outputs[]; 12 | }; 13 | 14 | layout(std430, binding = 2) buffer PartialSums 15 | { 16 | float partial_sums[]; 17 | }; 18 | 19 | layout (local_size_x_id = 1) in; 20 | layout (constant_id = 2) const int sumSubGroupSize = 64; 21 | 22 | layout(push_constant) uniform PushConsts 23 | { 24 | int n; 25 | } consts; 26 | 27 | shared float sdata[sumSubGroupSize]; 28 | 29 | void main() 30 | { 31 | float sum = 0.0; 32 | if (gl_GlobalInvocationID.x < consts.n) 33 | { 34 | sum = inputs[gl_GlobalInvocationID.x]; 35 | } 36 | 37 | sum = subgroupInclusiveAdd(sum); 38 | 39 | if (gl_SubgroupInvocationID == gl_SubgroupSize - 1) 40 | { 41 | sdata[gl_SubgroupID] = sum; 42 | } 43 | 44 | memoryBarrierShared(); 45 | barrier(); 46 | 47 | if (gl_SubgroupID == 0) 48 | { 49 | float warpSum = gl_SubgroupInvocationID < gl_NumSubgroups ? 
sdata[gl_SubgroupInvocationID] : 0; 50 | warpSum = subgroupInclusiveAdd(warpSum); 51 | sdata[gl_SubgroupInvocationID] = warpSum; 52 | } 53 | 54 | memoryBarrierShared(); 55 | barrier(); 56 | 57 | float blockSum = 0; 58 | if (gl_SubgroupID > 0) 59 | { 60 | blockSum = sdata[gl_SubgroupID - 1]; 61 | } 62 | 63 | sum += blockSum; 64 | 65 | if (gl_GlobalInvocationID.x < consts.n) 66 | { 67 | outputs[gl_GlobalInvocationID.x] = sum; 68 | } 69 | 70 | if (gl_LocalInvocationID.x == gl_WorkGroupSize.x - 1) 71 | { 72 | partial_sums[gl_WorkGroupID.x] = sum; 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /generateGraphs.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pygal 3 | from collections import defaultdict 4 | 5 | reduceChart = pygal.Line(logarithmic=True, title='Reduce', y_title='Time (us)', x_title='Number of elements') 6 | scanChart = pygal.Line(logarithmic=True, title='Scan', y_title='Time (us)', x_title='Number of elements') 7 | 8 | reduceDict = defaultdict(list) 9 | scanDict = defaultdict(list) 10 | 11 | sizes = [] 12 | 13 | with open('timing.csv', newline='') as csvfile: 14 | reader = csv.DictReader(csvfile, delimiter=',') 15 | for row in reader: 16 | name = row['name'].split('/') 17 | if name[0] == 'Reduce_GPU_Subgroup': 18 | sizes.append(name[1]) 19 | if name[0].startswith('Reduce'): 20 | reduceDict[name[0].replace('Reduce_', '').replace('_', ' ')].append(row['real_time']) 21 | else: 22 | scanDict[name[0].replace('Scan_', '').replace('_', ' ')].append(row['real_time']) 23 | 24 | for name, values in reduceDict.items(): 25 | reduceChart.add(name, [float(value) / 1000.0 for value in values]) 26 | 27 | reduceChart.x_labels = sizes 28 | reduceChart.render_to_file('reduce.svg') 29 | 30 | for name, values in scanDict.items(): 31 | scanChart.add(name, [float(value) / 1000.0 for value in values]) 32 | 33 | scanChart.x_labels = sizes 34 | 
scanChart.render_to_file('scan.svg') -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #include "reduce.h" 2 | #include "scan.h" 3 | 4 | #include 5 | #include 6 | 7 | Vortex2D::Renderer::Device* gDevice = nullptr; 8 | 9 | int main(int argc, char** argv) 10 | { 11 | Vortex2D::Renderer::Instance instance("VulkanSubgroups", {}, false); 12 | auto physicalDevice = instance.GetPhysicalDevice(); 13 | 14 | auto properties = physicalDevice.getProperties2(); 15 | auto subgroupProperties = properties.get(); 16 | 17 | std::cout << "Subgroup size: " << subgroupProperties.subgroupSize << std::endl; 18 | std::cout << "Subgroup supported operations: " << vk::to_string(subgroupProperties.supportedOperations) << std::endl; 19 | 20 | Vortex2D::Renderer::Device device(physicalDevice); 21 | gDevice = &device; 22 | 23 | CheckReduce(); 24 | CheckScan(); 25 | 26 | benchmark::Initialize(&argc, argv); 27 | benchmark::RunSpecifiedBenchmarks(); 28 | } 29 | -------------------------------------------------------------------------------- /reduce.cpp: -------------------------------------------------------------------------------- 1 | #include "reduce.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "vulkansubgroups_spirv.h" 7 | /* NextSize returns the element count of the next reduction level: ceil(size / localSize), one partial result per group of localSize elements. */ 8 | namespace 9 | { 10 | int NextSize(int size, int localSize) 11 | { 12 | return (size + localSize - 1) / localSize; /* rounds up; an exact multiple must not gain an extra, never-written element */ 13 | } 14 | } 15 | 16 | Reduce::Reduce(const Vortex2D::Renderer::Device& device, 17 | int size, 18 | int localSize) 19 | : mTimer(device) 20 | , mUploadCmd(device) 21 | , mDownloadCmd(device) 22 | , mReduceCmd(device) 23 | , mReduceWork(device, {size, localSize}, Vortex2D::SPIRV::Reduce_comp) 24 | , mLocalInput(device, size, VMA_MEMORY_USAGE_CPU_TO_GPU) 25 | , mLocalOutput(device, 1, VMA_MEMORY_USAGE_GPU_TO_CPU) 26 | { 27 | int n = size; 28 | while (n > 1) 29 | { 30 | mBuffers.emplace_back(device, n,
VMA_MEMORY_USAGE_GPU_ONLY); 31 | n = NextSize(n, localSize); 32 | } 33 | 34 | mBuffers.emplace_back(device, 1, VMA_MEMORY_USAGE_GPU_ONLY); 35 | 36 | n = size; 37 | for (std::size_t i = 0; i < mBuffers.size() - 1; i++) 38 | { 39 | mReduce.emplace_back(mReduceWork.Bind({n, localSize}, {mBuffers[i], mBuffers[i + 1]})); 40 | n = NextSize(n, localSize); 41 | } 42 | 43 | mUploadCmd.Record([&](vk::CommandBuffer commandBuffer) 44 | { 45 | mBuffers.front().CopyFrom(commandBuffer, mLocalInput); 46 | }); 47 | 48 | mDownloadCmd.Record([&](vk::CommandBuffer commandBuffer) 49 | { 50 | mLocalOutput.CopyFrom(commandBuffer, mBuffers.back()); 51 | }); 52 | 53 | mReduceCmd.Record([&](vk::CommandBuffer commandBuffer) 54 | { 55 | mTimer.Start(commandBuffer); 56 | 57 | int n = size; 58 | for (std::size_t i = 0; i < mReduce.size(); i++) 59 | { 60 | mReduce[i].PushConstant(commandBuffer, n); 61 | mReduce[i].Record(commandBuffer); 62 | 63 | Vortex2D::Renderer::BufferBarrier(mBuffers[i + 1].Handle(), commandBuffer, vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead); 64 | 65 | n = NextSize(n, localSize); 66 | } 67 | 68 | mTimer.Stop(commandBuffer); 69 | }); 70 | } 71 | 72 | void Reduce::Upload(const std::vector& input) 73 | { 74 | Vortex2D::Renderer::CopyFrom(mLocalInput, input); 75 | mUploadCmd.Submit(); 76 | } 77 | 78 | float Reduce::Download() 79 | { 80 | mDownloadCmd.Submit(); 81 | mDownloadCmd.Wait(); 82 | 83 | float total = 0.0; 84 | Vortex2D::Renderer::CopyTo(mLocalOutput, total); 85 | return total; 86 | } 87 | 88 | void Reduce::Submit() 89 | { 90 | mReduceCmd.Submit(); 91 | mReduceCmd.Wait(); 92 | } 93 | 94 | uint64_t Reduce::GetElapsedNs() 95 | { 96 | return mTimer.GetElapsedNs(); 97 | } 98 | 99 | static void Reduce_CPU_Seq(benchmark::State& state) 100 | { 101 | auto size = state.range(0); 102 | 103 | std::vector inputData(size, 1.0f); 104 | std::iota(inputData.begin(), inputData.end(), 1.0f); 105 | 106 | for (auto _ : state) 107 | { 108 | 
benchmark::DoNotOptimize(std::reduce(std::execution::seq, inputData.begin(), inputData.end())); 109 | } 110 | } 111 | 112 | static void Reduce_CPU_Par(benchmark::State& state) 113 | { 114 | auto size = state.range(0); 115 | 116 | std::vector inputData(size, 1.0f); 117 | std::iota(inputData.begin(), inputData.end(), 1.0f); 118 | 119 | for (auto _ : state) 120 | { 121 | benchmark::DoNotOptimize(std::reduce(std::execution::par, inputData.begin(), inputData.end())); 122 | } 123 | } 124 | 125 | static void Reduce_GPU_Subgroup(benchmark::State& state) 126 | { 127 | auto size = state.range(0); 128 | 129 | Reduce reduce(*gDevice, size, 512); 130 | 131 | std::vector inputData(size, 1.0f); 132 | std::iota(inputData.begin(), inputData.end(), 1.0f); 133 | 134 | for (auto _ : state) 135 | { 136 | reduce.Submit(); 137 | state.SetIterationTime(reduce.GetElapsedNs() / 1000000000.0); 138 | } 139 | } 140 | 141 | static void Reduce_GPU_SharedMemory(benchmark::State& state) 142 | { 143 | auto size = state.range(0); 144 | 145 | Vortex2D::Renderer::Timer timer(*gDevice); 146 | Vortex2D::Fluid::ReduceSum reduce(*gDevice, {size, 1}); 147 | 148 | Vortex2D::Renderer::Buffer input(*gDevice, size, VMA_MEMORY_USAGE_GPU_ONLY); 149 | Vortex2D::Renderer::Buffer output(*gDevice, 1, VMA_MEMORY_USAGE_GPU_ONLY); 150 | 151 | auto boundReduce = reduce.Bind(input, output); 152 | 153 | Vortex2D::Renderer::CommandBuffer cmd(*gDevice); 154 | cmd.Record([&](vk::CommandBuffer commandBuffer) 155 | { 156 | timer.Start(commandBuffer); 157 | boundReduce.Record(commandBuffer); 158 | timer.Stop(commandBuffer); 159 | }); 160 | 161 | for (auto _ : state) 162 | { 163 | cmd.Submit(); 164 | cmd.Wait(); 165 | state.SetIterationTime(timer.GetElapsedNs() / 1000000000.0); 166 | } 167 | } 168 | 169 | BENCHMARK(Reduce_GPU_Subgroup)->Range(8, 8<<20)->UseManualTime(); 170 | BENCHMARK(Reduce_GPU_SharedMemory)->Range(8, 8<<20)->UseManualTime(); 171 | BENCHMARK(Reduce_CPU_Seq)->Range(8, 8<<20); 172 | 
BENCHMARK(Reduce_CPU_Par)->Range(8, 8<<20); 173 | 174 | void CheckReduce() 175 | { 176 | int size = 300; 177 | Reduce reduce(*gDevice, size, 256); 178 | 179 | std::vector inputData(size, 1.0f); 180 | std::iota(inputData.begin(), inputData.end(), 1.0f); 181 | 182 | reduce.Upload(inputData); 183 | reduce.Submit(); 184 | float total = reduce.Download(); 185 | 186 | std::cout << "Total " << total << std::endl; 187 | std::cout << "Expected total " << 0.5f * size * (size + 1) << std::endl; 188 | } 189 | -------------------------------------------------------------------------------- /reduce.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | extern Vortex2D::Renderer::Device* gDevice; 6 | 7 | class Reduce 8 | { 9 | public: 10 | Reduce(const Vortex2D::Renderer::Device& device, 11 | int size, 12 | int localSize); 13 | 14 | void Upload(const std::vector& input); 15 | float Download(); 16 | void Submit(); 17 | uint64_t GetElapsedNs(); 18 | 19 | private: 20 | Vortex2D::Renderer::Timer mTimer; 21 | Vortex2D::Renderer::CommandBuffer mUploadCmd, mDownloadCmd, mReduceCmd; 22 | Vortex2D::Renderer::Work mReduceWork; // reduce shader 23 | std::vector> mBuffers; // buffers for input, intermediate results and output 24 | Vortex2D::Renderer::Buffer mLocalInput, mLocalOutput; // buffers for copying input/output to device 25 | std::vector mReduce; // bound reduce shaders for each level 26 | }; 27 | 28 | void CheckReduce(); 29 | -------------------------------------------------------------------------------- /scan.cpp: -------------------------------------------------------------------------------- 1 | #include "scan.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "vulkansubgroups_spirv.h" 7 | /* NextSize returns the element count of the next scan level: ceil(size / localSize), one partial sum per group of localSize elements. */ 8 | namespace 9 | { 10 | int NextSize(int size, int localSize) 11 | { 12 | return (size + localSize - 1) / localSize; /* rounds up; an exact multiple must not gain an extra, never-written element */ 13 | } 14 | } 15 | 16 | Scan::Scan(const Vortex2D::Renderer::Device& device, 17 | int size,
18 | int localSize) 19 | : mTimer(device) 20 | , mUploadCmd(device) 21 | , mDownloadCmd(device) 22 | , mScanCmd(device) 23 | , mScanWork(device, {size, localSize}, Vortex2D::SPIRV::Scan_comp) 24 | , mAddWork(device, {NextSize(size, localSize), localSize}, Vortex2D::SPIRV::Add_comp) 25 | , mLocalInput(device, size, VMA_MEMORY_USAGE_CPU_TO_GPU) 26 | , mLocalOutput(device, size, VMA_MEMORY_USAGE_GPU_TO_CPU) 27 | { 28 | int n = size; 29 | while (n > 1) 30 | { 31 | mBuffers.emplace_back(device, n, VMA_MEMORY_USAGE_GPU_ONLY); 32 | n = NextSize(n, localSize); 33 | } 34 | 35 | mBuffers.emplace_back(device, 1, VMA_MEMORY_USAGE_GPU_ONLY); 36 | 37 | n = size; 38 | for (std::size_t i = 0; i < mBuffers.size() - 1; i++) 39 | { 40 | mScan.emplace_back(mScanWork.Bind({n, localSize}, {mBuffers[i], mBuffers[i], mBuffers[i + 1]})); 41 | mAdd.emplace_back(mAddWork.Bind({n, localSize}, {mBuffers[i + 1], mBuffers[i]})); 42 | n = NextSize(n, localSize); 43 | } 44 | 45 | mUploadCmd.Record([&](vk::CommandBuffer commandBuffer) 46 | { 47 | mBuffers[0].CopyFrom(commandBuffer, mLocalInput); 48 | }); 49 | 50 | mDownloadCmd.Record([&](vk::CommandBuffer commandBuffer) 51 | { 52 | mLocalOutput.CopyFrom(commandBuffer, mBuffers[0]); 53 | }); 54 | 55 | mScanCmd.Record([&](vk::CommandBuffer commandBuffer) 56 | { 57 | mTimer.Start(commandBuffer); 58 | 59 | /* Pass 1: scan every level in place, finest first; level i writes its per-work-group totals into mBuffers[i + 1]. */ 60 | /* levelSize[i] is the element count of level i (the push constant n of both shaders); an empty mScan records nothing. */ 61 | std::vector<int> levelSize(mScan.size(), size); 62 | for (std::size_t i = 0; i < mScan.size(); i++) 63 | { 64 | if (i > 0) levelSize[i] = NextSize(levelSize[i - 1], localSize); 65 | mScan[i].PushConstant(commandBuffer, levelSize[i]); 66 | mScan[i].Record(commandBuffer); 67 | Vortex2D::Renderer::BufferBarrier(mBuffers[i].Handle(), commandBuffer, vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead); 68 | Vortex2D::Renderer::BufferBarrier(mBuffers[i + 1].Handle(), commandBuffer,
vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead); 69 | } 70 | 71 | /* Pass 2: add the offsets back from the coarsest level to the finest, so that level i is already a complete scan */ 72 | /* when it is added into level i - 1; adding right after each scan is only correct for two levels. */ 73 | for (std::size_t i = mScan.size(); i-- > 1;) 74 | { 75 | mAdd[i - 1].PushConstant(commandBuffer, levelSize[i - 1]); 76 | mAdd[i - 1].Record(commandBuffer); 77 | Vortex2D::Renderer::BufferBarrier(mBuffers[i - 1].Handle(), commandBuffer, vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead); 78 | } 79 | 80 | mTimer.Stop(commandBuffer); 81 | }); 82 | } 83 | 84 | void Scan::Upload(const std::vector& input) 85 | { 86 | Vortex2D::Renderer::CopyFrom(mLocalInput, input); 87 | mUploadCmd.Submit(); 88 | } 89 | 90 | std::vector Scan::Download() 91 | { 92 | mDownloadCmd.Submit(); 93 | mDownloadCmd.Wait(); 94 | 95 | std::vector output(mLocalOutput.Size() / sizeof(float), 0.0f); 96 | Vortex2D::Renderer::CopyTo(mLocalOutput, output); 97 | return output; 98 | } 99 | 100 | void Scan::Submit() 101 | { 102 | mScanCmd.Submit(); 103 | mScanCmd.Wait(); 104 | } 105 | 106 | uint64_t Scan::GetElapsedNs() 107 | { 108 | return mTimer.GetElapsedNs(); 109 | } 110 | 111 | static void Scan_GPU_Subgroup(benchmark::State& state) 112 | { 113 | auto size = state.range(0); 114 | 115 | Scan scan(*gDevice, size, 512); 116 | 117 | std::vector inputData(size, 1.0f); 118 | std::iota(inputData.begin(), inputData.end(), 1.0f); 119 | 120 | for (auto _ : state) 121 | { 122 | scan.Submit(); 123 | state.SetIterationTime(scan.GetElapsedNs() / 1000000000.0); 124 | } 125 | } 126 | 127 | static void Scan_GPU_SharedMemory(benchmark::State& state) 128 | { 129 | auto size = state.range(0); 130 | 131 | Vortex2D::Renderer::Timer timer(*gDevice); 132 | Vortex2D::Fluid::PrefixScan scan(*gDevice, {size, 1}); 133 | 134 | Vortex2D::Renderer::Buffer input(*gDevice, size, VMA_MEMORY_USAGE_GPU_ONLY); 135 | Vortex2D::Renderer::Buffer output(*gDevice, size, VMA_MEMORY_USAGE_GPU_TO_CPU); 136 | 137 | Vortex2D::Renderer::Buffer
dispatchParams(*gDevice); 138 | 139 | auto boundScan = scan.Bind(input, output, dispatchParams); 140 | 141 | Vortex2D::Renderer::CommandBuffer cmd(*gDevice); 142 | cmd.Record([&](vk::CommandBuffer commandBuffer) 143 | { 144 | timer.Start(commandBuffer); 145 | boundScan.Record(commandBuffer); 146 | timer.Stop(commandBuffer); 147 | }); 148 | 149 | std::vector outputData(size); 150 | 151 | for (auto _ : state) 152 | { 153 | cmd.Submit(); 154 | cmd.Wait(); 155 | state.SetIterationTime(timer.GetElapsedNs() / 1000000000.0); 156 | } 157 | } 158 | 159 | static void Scan_CPU_Seq(benchmark::State& state) 160 | { 161 | auto size = state.range(0); 162 | 163 | std::vector inputData(size, 1.0f); 164 | std::vector outputData(size); 165 | std::iota(inputData.begin(), inputData.end(), 1.0f); 166 | 167 | for (auto _ : state) 168 | { 169 | benchmark::DoNotOptimize(std::inclusive_scan(std::execution::seq, inputData.begin(), inputData.end(), outputData.begin())); 170 | } 171 | } 172 | 173 | static void Scan_CPU_Par(benchmark::State& state) 174 | { 175 | auto size = state.range(0); 176 | 177 | std::vector inputData(size, 1.0f); 178 | std::vector outputData(size); 179 | std::iota(inputData.begin(), inputData.end(), 1.0f); 180 | 181 | for (auto _ : state) 182 | { 183 | benchmark::DoNotOptimize(std::inclusive_scan(std::execution::par, inputData.begin(), inputData.end(), outputData.begin())); 184 | } 185 | } 186 | 187 | 188 | BENCHMARK(Scan_GPU_Subgroup)->Range(8, 8<<20)->UseManualTime(); 189 | BENCHMARK(Scan_GPU_SharedMemory)->Range(8, 8<<20)->UseManualTime(); 190 | BENCHMARK(Scan_CPU_Seq)->Range(8, 8<<20); 191 | BENCHMARK(Scan_CPU_Par)->Range(8, 8<<20); 192 | 193 | void CheckScan() 194 | { 195 | int size = 300; 196 | Scan scan(*gDevice, size, 256); 197 | 198 | std::vector inputData(size, 1.0f); 199 | std::iota(inputData.begin(), inputData.end(), 1.0f); 200 | 201 | scan.Upload(inputData); 202 | scan.Submit(); 203 | auto output = scan.Download(); 204 | 205 | std::vector 
expectedOutput(size); 206 | std::inclusive_scan(std::execution::seq, inputData.begin(), inputData.end(), expectedOutput.begin()); 207 | 208 | for (std::size_t i = 0; i < size; i++) 209 | { 210 | if (output[i] != expectedOutput[i]) 211 | { 212 | std::cout << "Diference at " << i << " values " << output[i] << " != " << expectedOutput[i] << std::endl; 213 | } 214 | } 215 | 216 | std::cout << "Scan complete" << std::endl; 217 | } 218 | -------------------------------------------------------------------------------- /scan.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | extern Vortex2D::Renderer::Device* gDevice; 6 | 7 | class Scan 8 | { 9 | public: 10 | Scan(const Vortex2D::Renderer::Device& device, 11 | int size, 12 | int localSize); 13 | 14 | void Upload(const std::vector& input); 15 | std::vector Download(); 16 | void Submit(); 17 | uint64_t GetElapsedNs(); 18 | 19 | private: 20 | Vortex2D::Renderer::Timer mTimer; 21 | Vortex2D::Renderer::CommandBuffer mUploadCmd, mDownloadCmd, mScanCmd; 22 | Vortex2D::Renderer::Work mScanWork; 23 | Vortex2D::Renderer::Work mAddWork; 24 | 25 | std::vector> mBuffers; // buffers for intermediate results 26 | Vortex2D::Renderer::Buffer mLocalInput, mLocalOutput; // buffers for copying input/output to device 27 | std::vector mScan; // bound scan shaders for each level 28 | std::vector mAdd; // bound add shaders for each level 29 | }; 30 | 31 | void CheckScan(); 32 | --------------------------------------------------------------------------------