├── .gitignore ├── LICENSE ├── README.md ├── build.bat ├── docs └── outline.txt ├── gpu_algorithms ├── __init__.py ├── __main__.py ├── gpu │ ├── __init__.py │ ├── clear_target.hlsl │ ├── meson.build │ ├── prefix_sum.hlsl │ ├── prefix_sum.py │ ├── radix_sort.hlsl │ ├── radix_sort.py │ ├── thread_utils.hlsl │ └── utilities.py ├── meson.build └── native │ ├── ModuleMain.cpp │ └── meson.build ├── meson.build └── pyproject.toml /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__* 2 | tags 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Kleber Garcia 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gpu_algorithms 2 | 3 | Miscellaneous repository of GPU algorithms, used to benchmark and teach GPU compute. 4 | 5 | To build and install an editable package (for development inside this folder), 6 | run: 7 | 8 | ``` 9 | build.bat install-dev 10 | ``` 11 | 12 | To install as a wheel / package for use outside this folder, run: 13 | 14 | ``` 15 | build.bat install 16 | ``` 17 | -------------------------------------------------------------------------------- /build.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | set build_type="%~1" 3 | if %build_type% == "install-dev" ( 4 | set MESONPY_EDITABLE_VERBOSE=1 5 | py -m pip install --no-build-isolation --editable . 6 | ) else if %build_type% == "install" ( 7 | py -m pip install . 8 | ) else echo build.bat : build-type must be install-dev or install 9 | 10 | 11 | -------------------------------------------------------------------------------- /docs/outline.txt: -------------------------------------------------------------------------------- 1 | Title: 2 | Using your GPU for general purpose computing: algorithms, techniques and a compute abstraction layer for Python (CoalPy). 3 | 4 | Abstract: 5 | In recent years there has been a rise in public interest in scalable data processing, machine learning and computer graphics. All of these problem spaces rely more and more on general purpose computing on graphics processing units (GPGPU).
We will present a couple of algorithms designed specifically for the GPU (sorting and prefix sums), discuss their applications and performance, and show how to write your own algorithms using CoalPy (Compute Abstraction Layer for Python). 6 | 7 | Outline: 8 | 9 | * Agenda 10 | * About me 11 | * Summary of Applications of GPGPU 12 | * Performance Results: 13 | * GPU Radix Sort 14 | * GPU Prefix Count 15 | * GPGPU Concepts: 16 | * Graphics APIs 17 | * DX12, Vulkan 18 | * CUDA 19 | * CoalPy 20 | * Introduction to SIMD 21 | * Hierarchy of computation: 22 | * register 23 | * waves 24 | * local data share (LDS), RAM 25 | * Compute kernel dispatches 26 | * Compute kernel groups 27 | * Compute kernel threads 28 | * Prefix sum introduction: 29 | * algorithm 30 | * applications 31 | * GPU Prefix Sum: 32 | * prefix sum in wave 33 | * prefix sum in thread group 34 | * prefix sum in data 35 | * Sorting on the CPU 36 | * Counting sort 37 | * CPU Prefix Sum 38 | * Offsets 39 | * Sorting on the GPU 40 | * Count Scatter 41 | * Prefix batch table 42 | * Prefix global table 43 | * scatter output 44 | * Tutorial for CoalPy 45 | * Q&A 46 | 47 | -------------------------------------------------------------------------------- /gpu_algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kecho/gpu_algorithms/186a4550aa27749761f74592ffbaac54d64419e8/gpu_algorithms/__init__.py -------------------------------------------------------------------------------- /gpu_algorithms/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import coalpy.gpu 4 | import time 5 | from .gpu import prefix_sum 6 | from .gpu import radix_sort 7 | 8 | from . 
import native 9 | 10 | def benchmark_prefix_sum(args): 11 | sample_size = int(args.size) 12 | 13 | #prepare input 14 | rand_array = np.random.randint(0, high=sample_size, size=sample_size) 15 | 16 | print (":: Prefix Sum ::") 17 | 18 | if args.printresults: 19 | print("Input: " + str(rand_array)) 20 | 21 | benchmark_prefix_sum_numpy(sample_size, rand_array, args) 22 | benchmark_prefix_sum_cpu(sample_size, rand_array, args) 23 | benchmark_prefix_sum_gpu(sample_size, rand_array, args) 24 | 25 | def benchmark_prefix_sum_gpu(sample_size, sample_array, args): 26 | print("\t ::: GPU Prefix Sum :::") 27 | 28 | input_buffer = coalpy.gpu.Buffer( 29 | name="input_buffer", 30 | type = coalpy.gpu.BufferType.Standard, 31 | format = coalpy.gpu.Format.R32_UINT, 32 | element_count = sample_size, 33 | stride = 4 #size of uint 34 | ) 35 | 36 | prefix_sum_args = prefix_sum.allocate_args(sample_size) 37 | 38 | cmd_list = coalpy.gpu.CommandList() 39 | cmd_list.begin_marker("upload_resource") 40 | cmd_list.upload_resource( source=sample_array, destination=input_buffer ) 41 | cmd_list.end_marker() 42 | 43 | cmd_list.begin_marker("prefix_sum") 44 | output_buffer = prefix_sum.run(cmd_list, input_buffer, prefix_sum_args, is_exclusive=False) 45 | cmd_list.end_marker() 46 | 47 | coalpy.gpu.begin_collect_markers() 48 | coalpy.gpu.schedule(cmd_list) 49 | marker_results = coalpy.gpu.end_collect_markers() 50 | 51 | if args.printresults: 52 | download_request = coalpy.gpu.ResourceDownloadRequest(resource = output_buffer) 53 | download_request.resolve() 54 | cpu_result_buffer = np.frombuffer(download_request.data_as_bytearray(), dtype='i') 55 | cpu_result_buffer = np.resize(cpu_result_buffer, sample_size) 56 | print("\t Results: " + str(cpu_result_buffer)) 57 | 58 | #calculate time stamp markers 59 | marker_download_request = coalpy.gpu.ResourceDownloadRequest(resource = marker_results.timestamp_buffer) 60 | marker_download_request.resolve() 61 | marker_data = np.frombuffer(marker_download_request.data_as_bytearray(), dtype=np.uint64) 62 | marker_benchmarks = [ 63 | (name, (marker_data[ei]/marker_results.timestamp_frequency - marker_data[bi]/marker_results.timestamp_frequency) * 1000) for (name, pid, bi, ei) in marker_results.markers] 64 | 65 | (_, ellapsed_time) = marker_benchmarks[1] 66 | 67 | print("\t Elapsed time: " + str(ellapsed_time) + " ms.") 68 | print(); 69 | return 70 | 71 | def benchmark_prefix_sum_cpu(sample_size, sample_array, args): 72 | print ("\t ::: CPU (C) Prefix Sum :::") 73 | (time, result) = native.prefix_sum(sample_array) 74 | 75 | if args.printresults: 76 | array_value = np.frombuffer(result, dtype='i') 77 | print ("\t Results: " + str(array_value)) 78 | 79 | print ("\t Elapsed time: " + str(time) + " ms.") 80 | print(); 81 | return 82 | 83 | def benchmark_prefix_sum_numpy(sample_size, sample_array, args): 84 | print("\t ::: Numpy Prefix Sum :::") 85 | cpu_start_time = time.time() 86 | prefix_cpu_result = np.cumsum(sample_array) 87 | ellapsed_seconds = time.time() - cpu_start_time 88 | if args.printresults: 89 | print("\t Result: " + str(prefix_cpu_result)) 90 | print("\t Elapsed time: " + str(ellapsed_seconds * 1000) + " ms.") 91 | print() 92 | return 93 | 94 | 95 | def benchmark_quicksort_numpy(sample_size, rand_array, args): 96 | print ("\t ::: Numpy Quicksort :::") 97 | cpu_start_time = time.time() 98 | sort_result = np.sort(rand_array, axis=-1, kind='quicksort') 99 | ellapsed_seconds = time.time() - cpu_start_time 100 | if args.printresults: 101 | print("\t Results: " + str(sort_result)) 102 | 
print("\t Elapsed time: " + str(ellapsed_seconds * 1000) + " ms.") 103 | print() 104 | return 105 | 106 | def benchmark_radixsort_cpu(sample_size, rand_array, args): 107 | print ("\t ::: CPU (C) Radix Sort :::") 108 | (time, result) = native.radix_sort(rand_array) 109 | 110 | if args.printresults: 111 | array_value = np.frombuffer(result, dtype='i') 112 | print("\t Results: " + str(array_value)) 113 | 114 | print("\t Elapsed time: " + str(time) + " ms.") 115 | print() 116 | return 117 | 118 | def benchmark_radix_sort_gpu(sample_size, sample_array, args): 119 | print("\t ::: GPU Radix Sort :::") 120 | 121 | input_buffer = coalpy.gpu.Buffer( 122 | name="input_buffer", 123 | type = coalpy.gpu.BufferType.Standard, 124 | format = coalpy.gpu.Format.R32_UINT, 125 | element_count = sample_size, 126 | stride = 4 #size of uint 127 | ) 128 | 129 | radix_sort_args = radix_sort.allocate_args(sample_size, args.sort_output_ordering, args.indirect_args) 130 | 131 | indirect_args = None 132 | 133 | cmd_list = coalpy.gpu.CommandList() 134 | 135 | if args.indirect_args: 136 | indirect_args = coalpy.gpu.Buffer("IndirectSortArgs", element_count = 1, format = coalpy.gpu.Format.R32_UINT) 137 | cmd_list.upload_resource(source = [sample_size], destination = indirect_args) 138 | 139 | cmd_list.begin_marker("upload_resource") 140 | cmd_list.upload_resource( source=sample_array, destination=input_buffer ) 141 | cmd_list.end_marker() 142 | 143 | cmd_list.begin_marker("radix_sort") 144 | (output_buffer, count_table_prefix) = radix_sort.run(cmd_list, input_buffer, radix_sort_args, indirect_args) 145 | cmd_list.end_marker() 146 | 147 | coalpy.gpu.begin_collect_markers() 148 | coalpy.gpu.schedule(cmd_list) 149 | marker_results = coalpy.gpu.end_collect_markers() 150 | 151 | if args.printresults: 152 | output_download_request = coalpy.gpu.ResourceDownloadRequest(resource = output_buffer) 153 | output_download_request.resolve() 154 | cpu_result_buffer = np.frombuffer(output_download_request.data_as_bytearray(), dtype='i') 155 | cpu_result_buffer = np.resize(cpu_result_buffer, sample_size) 156 | if args.sort_output_ordering: 157 | for i in range(0, sample_size): 158 | cpu_result_buffer[i] = sample_array[cpu_result_buffer[i]] 159 | 160 | print("\t Results: " + str(cpu_result_buffer)) 161 | 162 | # uncomment to verify sort 163 | #for i in range(1, len(cpu_result_buffer)): 164 | # if (cpu_result_buffer[i - 1 ] > cpu_result_buffer[i]): 165 | # print("ERROR " + str(i)) 166 | 167 | #calculate time stamp markers 168 | marker_download_request = coalpy.gpu.ResourceDownloadRequest(resource = marker_results.timestamp_buffer) 169 | marker_download_request.resolve() 170 | marker_data = np.frombuffer(marker_download_request.data_as_bytearray(), dtype=np.uint64) 171 | marker_benchmarks = [ 172 | (name, (marker_data[ei]/marker_results.timestamp_frequency - marker_data[bi]/marker_results.timestamp_frequency) * 1000) for (name, pid, bi, ei) in marker_results.markers] 173 | 174 | #print (marker_benchmarks) 175 | 176 | (_, ellapsed_time) = marker_benchmarks[1] 177 | 178 | print("\t Elapsed time: " + str(ellapsed_time) + " ms.") 179 | print(); 180 | return 181 | 182 | 183 | def benchmark_sort(args): 184 | sample_size = int(args.size) 185 | 186 | #prepare input 187 | rand_array = np.random.randint(0, high=sample_size, size=sample_size) 188 | 189 | print (":: Sort ::") 190 | 191 | if args.printresults: 192 | print("Input: " + str(rand_array)) 193 | 194 | benchmark_quicksort_numpy(sample_size, rand_array, args) 195 | 
benchmark_radixsort_cpu(sample_size, rand_array, args) 196 | benchmark_radix_sort_gpu(sample_size, rand_array, args) 197 | 198 | 199 | RAND_SEED_DEFAULT = 1999 200 | 201 | if __name__ == '__main__': 202 | parser = argparse.ArgumentParser( 203 | prog="python -m gpu_algorithms", 204 | description = "::gpu_algorithms:: - benchmark tool for GPU algorithms") 205 | parser.add_argument("-s", "--size", default=1600, required=False, help="size of input") 206 | parser.add_argument("-r", "--randseed", default=RAND_SEED_DEFAULT, required=False, help="random seed") 207 | parser.add_argument("-p", "--printresults", action='store_true', help="print inputs/outputs") 208 | parser.add_argument("-g", "--printgpu", action='store_true', help="print the used GPU") 209 | parser.add_argument("-i", "--indirect_args", default=False, action='store_true', help="Use indirect arguments (for sorting only)") 210 | parser.add_argument("-o", "--sort_output_ordering", default=False, action='store_true', help="sort using extra buffer for keys / indices. Adds sampling cost.") 211 | args = parser.parse_args() 212 | 213 | if args.printgpu: 214 | print("Available gpus: " + str(coalpy.gpu.get_adapters())) 215 | print("Current gpu info: " + str(coalpy.gpu.get_current_adapter_info())) 216 | 217 | rand_seed = int(args.randseed) 218 | if rand_seed != RAND_SEED_DEFAULT: 219 | np.random.seed(int(args.randseed)) 220 | 221 | benchmark_prefix_sum(args) 222 | benchmark_sort(args) 223 | -------------------------------------------------------------------------------- /gpu_algorithms/gpu/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pathlib 4 | import coalpy.gpu as g 5 | 6 | def _checkGpu(gpuInfo, substring): 7 | (idx, nm) = gpuInfo 8 | return substring in nm.lower() 9 | 10 | selected_gpu = next((adapter for adapter in g.get_adapters() if _checkGpu(adapter, "nvidia") or _checkGpu(adapter, "amd")), None) 11 | if selected_gpu is not None: 12 | g.get_settings().adapter_index = selected_gpu[0] 13 | 14 | g_module_path = os.path.dirname(pathlib.Path(sys.modules[__name__].__file__)) + "\\" 15 | g.add_data_path(g_module_path) 16 | g.init() 17 | -------------------------------------------------------------------------------- /gpu_algorithms/gpu/clear_target.hlsl: -------------------------------------------------------------------------------- 1 | RWTexture2D g_output : register(u0); 2 | cbuffer Constants : register(b0) 3 | { 4 | float4 clearColor; 5 | } 6 | 7 | [numthreads(8,8,1)] 8 | void csMainClear(int2 dti : SV_DispatchThreadID) 9 | { 10 | g_output[dti] = clearColor; 11 | } 12 | 13 | cbuffer ConstantsUintBuff : register(b0) 14 | { 15 | uint g_uintClearVal; 16 | int g_clearOffset; 17 | int g_clearValSize; 18 | } 19 | 20 | RWBuffer g_output_buff_uint : register(u0); 21 | [numthreads(64,1,1)] 22 | void csMainClearUintBuffer(int3 dti : SV_DispatchThreadID) 23 | { 24 | if (dti.x >= g_clearValSize) 25 | return; 26 | 27 | g_output_buff_uint[g_clearOffset + dti.x] = g_uintClearVal; 28 | } 29 | -------------------------------------------------------------------------------- /gpu_algorithms/gpu/meson.build: -------------------------------------------------------------------------------- 1 | py.install_sources( 2 | [ 3 | '__init__.py', 4 | 'prefix_sum.hlsl', 5 | 'radix_sort.hlsl', 6 | 'thread_utils.hlsl', 7 | 'clear_target.hlsl', 8 | 'prefix_sum.py', 9 | 'radix_sort.py', 10 | 'utilities.py' 11 | ], 12 | subdir:'gpu_algorithms/gpu' 13 | ) 14 | 
-------------------------------------------------------------------------------- /gpu_algorithms/gpu/prefix_sum.hlsl: -------------------------------------------------------------------------------- 1 | // This value must match the group size in prefix_sum.py 2 | #define GROUP_SIZE 128 3 | #define HW_WAVE_SIZE 32 4 | #include "thread_utils.hlsl" 5 | 6 | Buffer<uint> g_inputBuffer : register(t0); 7 | RWBuffer<uint> g_outputBuffer : register(u0); 8 | 9 | cbuffer ConstantsPrefixSum : register(b0) 10 | { 11 | int4 g_bufferArgs0; 12 | } 13 | 14 | #define inputCount g_bufferArgs0.x 15 | #define inputOffset g_bufferArgs0.y 16 | #define outputOffset g_bufferArgs0.z 17 | #define parentOffset g_bufferArgs0.w 18 | 19 | [numthreads(GROUP_SIZE, 1, 1)] 20 | void csPrefixSumOnGroup( 21 | int3 dispatchThreadID : SV_DispatchThreadID, 22 | int groupIndex : SV_GroupIndex) 23 | { 24 | int threadID = dispatchThreadID.x; 25 | uint inputVal = threadID >= inputCount ? 0u : g_inputBuffer[threadID + inputOffset]; 26 | 27 | uint outputVal, count; 28 | ThreadUtils::PrefixExclusive(groupIndex, inputVal, outputVal, count); 29 | 30 | #ifndef EXCLUSIVE_PREFIX 31 | outputVal += inputVal; 32 | #endif 33 | g_outputBuffer[threadID + outputOffset] = outputVal; 34 | } 35 | 36 | [numthreads(GROUP_SIZE, 1, 1)] 37 | void csPrefixSumNextInput(int3 dispatchThreadID : SV_DispatchThreadID, int3 groupID : SV_GroupID) 38 | { 39 | g_outputBuffer[dispatchThreadID.x] = g_inputBuffer[inputOffset + dispatchThreadID.x * GROUP_SIZE + GROUP_SIZE - 1]; 40 | } 41 | 42 | groupshared uint g_parentSum; 43 | 44 | [numthreads(GROUP_SIZE, 1, 1)] 45 | void csPrefixSumResolveParent(int3 dispatchThreadID : SV_DispatchThreadID, int groupIndex : SV_GroupIndex, int3 groupID : SV_GroupID) 46 | { 47 | //if (groupIndex == 0) 48 | // g_parentSum = groupID.x == 0 ? 0 : g_outputBuffer[parentOffset + groupID.x - 1]; 49 | 50 | //no need to do barriers / etc since groupID will trigger a scalar load. We hope!! 51 | uint parentSum = groupID.x == 0 ? 0 : g_outputBuffer[parentOffset + groupID.x - 1]; 52 | int index = outputOffset + dispatchThreadID.x; 53 | #if EXCLUSIVE_PREFIX 54 | uint val = g_outputBuffer[index] - g_inputBuffer[index]; 55 | g_outputBuffer[index] = val + parentSum; 56 | #else 57 | g_outputBuffer[index] += parentSum; 58 | #endif 59 | } 60 | -------------------------------------------------------------------------------- /gpu_algorithms/gpu/prefix_sum.py: -------------------------------------------------------------------------------- 1 | import coalpy.gpu as g 2 | from . 
import utilities as utils 3 | 4 | g_group_size = 128 5 | g_prefix_sum_group = g.Shader(file = "prefix_sum.hlsl", main_function = "csPrefixSumOnGroup") 6 | g_prefix_sum_group_exclusive = g.Shader(file = "prefix_sum.hlsl", main_function = "csPrefixSumOnGroup", defines = ["EXCLUSIVE_PREFIX"]) 7 | g_prefix_sum_next_input = g.Shader(file = "prefix_sum.hlsl", main_function = "csPrefixSumNextInput") 8 | g_prefix_sum_resolve_parent = g.Shader(file = "prefix_sum.hlsl", main_function = "csPrefixSumResolveParent") 9 | g_prefix_sum_resolve_parent_exclusive = g.Shader(file = "prefix_sum.hlsl", main_function = "csPrefixSumResolveParent", defines = ["EXCLUSIVE_PREFIX"]) 10 | 11 | def allocate_args(input_counts): 12 | aligned_bin_count = utils.alignup(input_counts, g_group_size) 13 | reduction_count = 0 14 | c = input_counts 15 | perform_reduction = True 16 | while perform_reduction: 17 | reduction_count += utils.alignup(c, g_group_size) 18 | c = utils.divup(c, g_group_size) 19 | perform_reduction = c > 1 20 | 21 | return (g.Buffer(name = "reductionBufferInput", element_count = aligned_bin_count, format = g.Format.R32_UINT), 22 | g.Buffer(name = "reductionBufferOutput", element_count = reduction_count, format = g.Format.R32_UINT), 23 | input_counts) 24 | 25 | def run(cmd_list, input_buffer, prefix_sum_args, is_exclusive = False, input_counts = -1): 26 | reduction_buffer_in = prefix_sum_args[0] 27 | reduction_buffer_out = prefix_sum_args[1] 28 | if (input_counts == -1): 29 | input_counts = prefix_sum_args[2] 30 | group_count = input_counts 31 | perform_reduction = input_counts > 0 32 | iteration = 0 33 | input_count = 0 34 | input_offset = 0 35 | output_offset = 0 36 | pass_list = [] 37 | while perform_reduction: 38 | input_count = group_count 39 | group_count = utils.divup(group_count, g_group_size) 40 | pass_list.append((input_count, output_offset)) 41 | 42 | cmd_list.dispatch( 43 | x = group_count, y = 1, z = 1, 44 | shader = g_prefix_sum_group_exclusive if is_exclusive and iteration == 0 and group_count == 1 else g_prefix_sum_group, 45 | inputs = input_buffer if iteration == 0 else reduction_buffer_in, 46 | outputs = reduction_buffer_out, 47 | constants = [input_count, 0, output_offset, 0]) 48 | 49 | perform_reduction = group_count > 1 50 | if perform_reduction: 51 | next_group_count = utils.divup(group_count, g_group_size) 52 | cmd_list.dispatch( 53 | x = next_group_count, y = 1, z = 1, 54 | shader = g_prefix_sum_next_input, 55 | inputs = reduction_buffer_out, 56 | outputs = reduction_buffer_in, 57 | constants = [0, output_offset, 0, 0]) 58 | 59 | iteration += 1 60 | output_offset += utils.alignup(input_count, g_group_size) 61 | 62 | for i in range(1, len(pass_list)): 63 | idx = len(pass_list) - 1 - i 64 | (parent_count, parent_offset) = pass_list[idx + 1] 65 | (count, offset) = pass_list[idx] 66 | const = [0, 0, offset, parent_offset] 67 | if i == len(pass_list) - 1 and is_exclusive: 68 | cmd_list.dispatch( 69 | x = utils.divup(count, g_group_size), y = 1, z = 1, 70 | shader = g_prefix_sum_resolve_parent_exclusive, 71 | inputs = input_buffer, 72 | outputs = reduction_buffer_out, 73 | constants = const) 74 | else: 75 | cmd_list.dispatch( 76 | x = utils.divup(count, g_group_size), y = 1, z = 1, 77 | shader = g_prefix_sum_resolve_parent, 78 | outputs = reduction_buffer_out, 79 | constants = const) 80 | return reduction_buffer_out 81 | -------------------------------------------------------------------------------- /gpu_algorithms/gpu/radix_sort.hlsl: 
-------------------------------------------------------------------------------- 1 | #define BITS_PER_RADIX 8 2 | #define RADIX_COUNTS (1 << BITS_PER_RADIX) 3 | #define BATCH_SIZE 1024 4 | 5 | #ifndef GROUP_SIZE 6 | #define GROUP_SIZE 128 7 | #endif 8 | 9 | #define HW_WAVE_SIZE 32 10 | 11 | #include "thread_utils.hlsl" 12 | 13 | #define THREAD_DWORD_COMPONENTS (GROUP_SIZE / DWORD_BIT_SIZE) 14 | 15 | #define FLAGS_IS_FIRST_PASS 1 << 0 16 | #define FLAGS_OUTPUT_ORDERING 1 << 1 17 | 18 | cbuffer RadixArgs : register(b0) 19 | { 20 | uint g_inputCount; 21 | uint g_batchCount; 22 | uint g_radixMask; 23 | uint g_unused0; 24 | 25 | uint g_radixShift; 26 | uint g_flags; 27 | uint g_unused2; 28 | uint g_unused1; 29 | } 30 | 31 | Buffer g_inputIndirectCount : register(t0); 32 | RWBuffer g_outputConstantBuffer : register(u0); 33 | RWBuffer g_outputIndirectBuffer : register(u1); 34 | 35 | [numthreads(1, 1, 1)] 36 | void csWriteIndirectArguments() 37 | { 38 | uint inputCount = g_inputIndirectCount[0]; 39 | uint batchCounts = (inputCount + BATCH_SIZE - 1) / BATCH_SIZE; 40 | 41 | g_outputConstantBuffer[0] = inputCount; 42 | g_outputConstantBuffer[1] = batchCounts; 43 | 44 | g_outputIndirectBuffer[0] = batchCounts; 45 | g_outputIndirectBuffer[1] = 1; 46 | g_outputIndirectBuffer[2] = 1; 47 | } 48 | 49 | 50 | Buffer g_inputBuffer : register(t0); 51 | Buffer g_inputOrdering : register(t1); 52 | RWBuffer g_outputBatchOffset : register(u0); 53 | RWBuffer g_outputRadixTable : register(u1); 54 | 55 | #define RADIX_TABLE_SIZE (RADIX_COUNTS * THREAD_DWORD_COMPONENTS) 56 | groupshared uint gs_localRadixTable[RADIX_TABLE_SIZE]; 57 | groupshared uint gs_radixCounts[RADIX_COUNTS]; 58 | 59 | [numthreads(GROUP_SIZE, 1, 1)] 60 | void csCountScatterBuckets( 61 | int groupIndex : SV_GroupIndex, 62 | int3 groupID : SV_GroupID) 63 | { 64 | int batchIndex = groupID.x; 65 | int batchBegin = batchIndex * BATCH_SIZE; 66 | int batchEnd = min(g_inputCount, batchBegin + BATCH_SIZE); 67 | 68 | int threadComponentOffset = groupIndex >> DWORD_BIT_SIZE_LOG2; // divide by 32 69 | int threadComponentBitIndex = groupIndex & (DWORD_BIT_SIZE - 1); // modulus 32 70 | uint threadPrefixMask[THREAD_DWORD_COMPONENTS]; 71 | 72 | int bi, k, unused; 73 | 74 | for (k = 0; k < THREAD_DWORD_COMPONENTS; ++k) 75 | threadPrefixMask[k] = k >= threadComponentOffset ? (k == threadComponentOffset ? ((1u << threadComponentBitIndex) - 1u) : 0) : ~0; 76 | 77 | for (k = groupIndex; k < RADIX_COUNTS; k += GROUP_SIZE) 78 | gs_radixCounts[k] = 0; 79 | 80 | GroupMemoryBarrierWithGroupSync(); 81 | 82 | uint batchIterations = (batchEnd - batchBegin + GROUP_SIZE - 1)/GROUP_SIZE; 83 | 84 | bool outputsOrdering = (g_flags & FLAGS_OUTPUT_ORDERING) != 0; 85 | bool isFirstPass = (g_flags & FLAGS_IS_FIRST_PASS) != 0; 86 | bool sampleOrdering = outputsOrdering && !isFirstPass; 87 | 88 | [loop] 89 | for (bi = 0; bi < batchIterations; ++bi) 90 | { 91 | uint inputOffset = batchBegin + bi * GROUP_SIZE + groupIndex; 92 | 93 | [loop] 94 | for (k = groupIndex; k < RADIX_TABLE_SIZE; k += GROUP_SIZE) 95 | gs_localRadixTable[k] = 0; 96 | 97 | // wait for writes in gs_localRadixTable 98 | GroupMemoryBarrierWithGroupSync(); 99 | 100 | uint i = inputOffset; 101 | if (sampleOrdering) 102 | i = inputOffset < g_inputCount ? g_inputOrdering[inputOffset] : ~0u; 103 | 104 | uint value = i < g_inputCount ? 
g_inputBuffer[i] : ~0u; 105 | uint radix = (value >> g_radixShift) & g_radixMask; 106 | InterlockedOr(gs_localRadixTable[THREAD_DWORD_COMPONENTS*radix + threadComponentOffset], 1u << threadComponentBitIndex, unused); 107 | 108 | // wait atomics in gs_localRadixTable 109 | GroupMemoryBarrierWithGroupSync(); 110 | 111 | uint localOffset = 0; 112 | 113 | [unroll(THREAD_DWORD_COMPONENTS)] 114 | for (k = 0; k < THREAD_DWORD_COMPONENTS; ++k) 115 | localOffset += countbits(gs_localRadixTable[THREAD_DWORD_COMPONENTS*radix + k] & threadPrefixMask[k]); 116 | 117 | if (inputOffset < g_inputCount) 118 | g_outputBatchOffset[inputOffset] = localOffset + gs_radixCounts[radix]; 119 | 120 | // wait for reads in gs_radixCounts (we are about to write to it) 121 | GroupMemoryBarrierWithGroupSync(); 122 | 123 | for (uint usedRadix = groupIndex; usedRadix < RADIX_COUNTS; usedRadix += GROUP_SIZE) 124 | { 125 | uint localCountForRadix = 0; 126 | 127 | [unroll(THREAD_DWORD_COMPONENTS)] 128 | for (k = 0; k < THREAD_DWORD_COMPONENTS; ++k) 129 | localCountForRadix += countbits(gs_localRadixTable[THREAD_DWORD_COMPONENTS*usedRadix + k]); 130 | 131 | gs_radixCounts[usedRadix] += localCountForRadix; 132 | } 133 | 134 | // wait for reads in gs_localRadixTable. We are about to write to it in the next iteration. 135 | GroupMemoryBarrierWithGroupSync(); 136 | } 137 | 138 | // No need for this barrier!, since the loop above has one. 139 | //GroupMemoryBarrierWithGroupSync(); 140 | 141 | [loop] 142 | for (k = groupIndex; k < RADIX_COUNTS; k += GROUP_SIZE) 143 | g_outputRadixTable[batchIndex * RADIX_COUNTS + k] = gs_radixCounts[k]; 144 | } 145 | 146 | Buffer g_inputCounterTable : register(t0); 147 | RWBuffer g_outputCounterTablePrefix : register(u0); 148 | RWBuffer g_outputRadixTotalCounts : register(u1); 149 | 150 | [numthreads(GROUP_SIZE, 1, 1)] 151 | void csPrefixCountTable( 152 | int groupIndex : SV_GroupIndex, 153 | int3 groupID : SV_GroupID) 154 | { 155 | uint radix = groupID.x; 156 | uint tb = 0; 157 | uint radixCounts = 0; 158 | uint threadBatchesCount = (g_batchCount + GROUP_SIZE - 1)/ GROUP_SIZE; 159 | for (tb = 0; tb < threadBatchesCount; ++tb) 160 | { 161 | uint i = tb * GROUP_SIZE + groupIndex; 162 | 163 | uint countValue = i < g_batchCount ? g_inputCounterTable[i * RADIX_COUNTS + radix] : 0; 164 | 165 | uint batchOffset, batchCount; 166 | ThreadUtils::PrefixExclusive(groupIndex, countValue, batchOffset, batchCount); 167 | 168 | // Mandatory barrier: the prefix above could be using LDS, so the next iteration of this 169 | // loop could cause read / write collisions. 
170 | GroupMemoryBarrierWithGroupSync(); 171 | 172 | if (i < g_batchCount) 173 | g_outputCounterTablePrefix[radix * g_batchCount + i] = batchOffset + radixCounts; 174 | 175 | radixCounts += batchCount; 176 | } 177 | 178 | if (groupIndex == 0) 179 | g_outputRadixTotalCounts[radix] = radixCounts; 180 | } 181 | 182 | Buffer g_inputRadixTotalCounts : register(t0); 183 | RWBuffer g_outputGlobalPrefix : register(u0); 184 | 185 | [numthreads(GROUP_SIZE, 1, 1)] 186 | void csPrefixGlobalTable(int groupIndex : SV_GroupIndex) 187 | { 188 | uint radixVal = g_inputRadixTotalCounts[groupIndex]; 189 | uint offset, unused; 190 | ThreadUtils::PrefixExclusive(groupIndex, radixVal, offset, unused); 191 | g_outputGlobalPrefix[groupIndex] = offset; 192 | } 193 | 194 | Buffer g_inputUnsorted : register(t0); 195 | Buffer g_inputUnsortedOrdering : register(t1); 196 | Buffer g_inputLocalBatchOffset : register(t2); 197 | Buffer g_inputCounterTablePrefix : register(t3); 198 | Buffer g_inputGlobalPrefix : register(t4); 199 | RWBuffer g_outputSorted : register(u0); 200 | 201 | [numthreads(GROUP_SIZE, 1, 1)] 202 | void csScatterOutput( 203 | uint3 dispatchThreadID : SV_DispatchThreadID, 204 | uint groupIndex : SV_GroupIndex, 205 | uint3 groupID : SV_GroupID) 206 | { 207 | uint batchIndex = groupID.x; 208 | uint batchOffset = groupIndex; 209 | uint i = dispatchThreadID.x; 210 | bool outputsOrdering = g_flags & FLAGS_OUTPUT_ORDERING; 211 | if (outputsOrdering && (g_flags & FLAGS_IS_FIRST_PASS) == 0) 212 | i = dispatchThreadID.x < g_inputCount ? g_inputUnsortedOrdering[dispatchThreadID.x] : ~0u; 213 | 214 | uint value = i < g_inputCount ? g_inputUnsorted[i] : ~0; 215 | uint radix = (value >> g_radixShift) & g_radixMask; 216 | if (i < g_inputCount) 217 | { 218 | uint outputIndex = g_inputGlobalPrefix[radix] + g_inputCounterTablePrefix[radix * g_batchCount + batchIndex] + g_inputLocalBatchOffset[dispatchThreadID.x]; 219 | g_outputSorted[outputIndex] = outputsOrdering ? i : value; 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /gpu_algorithms/gpu/radix_sort.py: -------------------------------------------------------------------------------- 1 | import coalpy.gpu as g 2 | from . 
import utilities as utils 3 | 4 | g_group_size = 128 5 | g_batch_size = 1024 6 | g_bits_per_radix = 8 7 | g_bytes_per_radix = int(g_bits_per_radix/8) 8 | g_radix_counts = int(1 << g_bits_per_radix) 9 | g_radix_iterations = int(32/g_bits_per_radix) 10 | 11 | g_write_indirect_args_shader = g.Shader(file = "radix_sort.hlsl", main_function = "csWriteIndirectArguments") 12 | g_count_scatter_shader = g.Shader(file = "radix_sort.hlsl", main_function = "csCountScatterBuckets") 13 | g_prefix_count_table_shader = g.Shader(file = "radix_sort.hlsl", main_function = "csPrefixCountTable", defines = ["GROUP_SIZE=256"]) 14 | g_prefix_global_table_shader = g.Shader(file = "radix_sort.hlsl", main_function = "csPrefixGlobalTable", defines = ["GROUP_SIZE=RADIX_COUNTS"]) 15 | g_scatter_output_shader = g.Shader(file = "radix_sort.hlsl", main_function = "csScatterOutput", defines=["GROUP_SIZE="+str(g_batch_size)]) 16 | 17 | 18 | # Must match flags in radix_sort.hlsl 19 | FLAGS_IS_FIRST_PASS = 1 << 0 20 | FLAGS_OUTPUT_ORDERING = 1 << 1 21 | 22 | def allocate_args(input_counts, output_ordering = False, is_indirect = False): 23 | aligned_batch_count = utils.divup(input_counts, g_batch_size) 24 | count_table_count = aligned_batch_count * g_radix_counts 25 | return ( 26 | g.Buffer(name="localOffsetsBuffer", element_count = input_counts, format = g.Format.R32_UINT), 27 | g.Buffer(name="pingBuffer", element_count = input_counts, format = g.Format.R32_UINT), 28 | g.Buffer(name="pongBuffer", element_count = input_counts, format = g.Format.R32_UINT), 29 | g.Buffer(name="countTableBatchPrefixBuffer", element_count = count_table_count, format = g.Format.R32_UINT), 30 | g.Buffer(name="radixTotalCounts", element_count = g_radix_counts, format = g.Format.R32_UINT), 31 | g.Buffer(name="countTableBuffer", element_count = count_table_count, format = g.Format.R32_UINT), 32 | g.Buffer(name="sortConstants", element_count = 8, format = g.Format.R32_UINT, usage = g.BufferUsage.Constant), 33 | g.Buffer(name="IndirectArgs", element_count = 4, format = g.Format.R32_UINT, usage = g.BufferUsage.IndirectArgs) if is_indirect else None, 34 | input_counts, 35 | output_ordering) 36 | 37 | def run (cmd_list, input_buffer, sort_args, indirect_count_buffer = None): 38 | ( 39 | local_offsets, 40 | ping_buffer, 41 | pong_buffer, 42 | count_table_prefix, 43 | radix_total_counts, 44 | count_table, 45 | constant_buffer, 46 | indirect_args, 47 | input_counts, 48 | output_ordering 49 | ) = sort_args 50 | 51 | if indirect_count_buffer == None and indirect_args != None: 52 | raise Exception("Indirect buffer has to be provided when the sorting uses indirect arguments.") 53 | 54 | batch_counts = utils.divup(input_counts, g_batch_size) 55 | 56 | radix_mask = int((1 << g_bits_per_radix) - 1) 57 | 58 | tmp_input_buffer = ping_buffer 59 | tmp_output_buffer = pong_buffer 60 | 61 | constant_data = [ 62 | int(input_counts), # g_inputCount 63 | int(batch_counts), # g_batchCount 64 | int(radix_mask), # g_radixMask 65 | int(0), # g_unused0 66 | int(0), # g_radixShift, set to 0 67 | int(0), # g_flags, set to 0 68 | int(0),# g_unused1 69 | int(0) ]# g_unused2 70 | 71 | cmd_list.upload_resource( source = constant_data, destination=constant_buffer ) 72 | 73 | if indirect_args != None: 74 | cmd_list.dispatch( 75 | x = 1, y = 1, z = 1, 76 | shader = g_write_indirect_args_shader, 77 | inputs = indirect_count_buffer, 78 | outputs = [ constant_buffer, indirect_args ]) 79 | 80 | for radix_i in range(0, g_radix_iterations): 81 | radix_shift = g_bits_per_radix * radix_i 82 | 
flags = FLAGS_IS_FIRST_PASS if radix_i == 0 else 0 83 | flags = flags | (FLAGS_OUTPUT_ORDERING if output_ordering else 0) 84 | 85 | (tmp_input_buffer, tmp_output_buffer) = (tmp_output_buffer, tmp_input_buffer) 86 | 87 | unsorted_buffer = None 88 | input_ordering = None 89 | 90 | if (flags & FLAGS_OUTPUT_ORDERING) == 0: 91 | unsorted_buffer = input_buffer if radix_i == 0 else tmp_input_buffer 92 | input_ordering = unsorted_buffer # unused so we set it as the unsorted buffer 93 | else: 94 | unsorted_buffer = input_buffer 95 | input_ordering = tmp_input_buffer 96 | 97 | 98 | #patch constant data, only elements that change 99 | constant_data_patch = [ 100 | int(radix_shift), # g_radixShift 101 | int(flags), # g_flags 102 | int(0), 103 | int(0) 104 | ] 105 | cmd_list.upload_resource( source = constant_data_patch, destination=constant_buffer, destination_offset = 4 * 4 ) 106 | 107 | cmd_list.begin_marker("count_scatter") 108 | 109 | if indirect_args == None: 110 | cmd_list.dispatch( 111 | x = batch_counts, y = 1, z = 1, 112 | shader = g_count_scatter_shader, 113 | inputs = [ unsorted_buffer, input_ordering ], 114 | outputs = [ local_offsets, count_table ], 115 | constants = constant_buffer 116 | ) 117 | else: 118 | cmd_list.dispatch( 119 | indirect_args = indirect_args, 120 | shader = g_count_scatter_shader, 121 | inputs = [ unsorted_buffer, input_ordering ], 122 | outputs = [ local_offsets, count_table ], 123 | constants = constant_buffer 124 | ) 125 | 126 | cmd_list.end_marker() 127 | 128 | cmd_list.begin_marker("prefix_batch_table") 129 | cmd_list.dispatch( 130 | x = int(g_radix_counts), y = 1, z = 1, 131 | shader = g_prefix_count_table_shader, 132 | inputs = count_table, 133 | outputs = [count_table_prefix, radix_total_counts], 134 | constants = constant_buffer 135 | ) 136 | cmd_list.end_marker() 137 | 138 | cmd_list.begin_marker("prefix_global_table") 139 | cmd_list.dispatch( 140 | x = 1, y = 1, z = 1, 141 | shader = g_prefix_global_table_shader, 142 | inputs = radix_total_counts, 143 | outputs = count_table 144 | ) 145 | cmd_list.end_marker() 146 | 147 | cmd_list.begin_marker("scatter_output") 148 | cmd_list.dispatch( 149 | x = batch_counts, y = 1, z = 1, 150 | shader = g_scatter_output_shader, 151 | inputs = [unsorted_buffer, input_ordering, local_offsets, count_table_prefix, count_table ], 152 | outputs = tmp_output_buffer, 153 | constants = constant_buffer 154 | ) 155 | cmd_list.end_marker() 156 | 157 | return (tmp_output_buffer, radix_total_counts) 158 | -------------------------------------------------------------------------------- /gpu_algorithms/gpu/thread_utils.hlsl: -------------------------------------------------------------------------------- 1 | #ifndef __THREAD_UTILS__ 2 | #define __THREAD_UTILS__ 3 | 4 | #ifndef GROUP_SIZE 5 | #error "ThreadUtils.hlsl requires definition of GROUP_SIZE" 6 | #endif 7 | 8 | #define DWORD_BIT_SIZE_LOG2 5 9 | #define DWORD_BIT_SIZE (1 << DWORD_BIT_SIZE_LOG2) 10 | 11 | #define THREAD_UTILS_MODE_IMPL_LDS 0 12 | #define THREAD_UTILS_MODE_GROUPWAVE 1 13 | #define THREAD_UTILS_MODE_SUBWAVE_IN_GROUP 2 14 | 15 | #if defined(HW_WAVE_SIZE) 16 | #if (GROUP_SIZE & (HW_WAVE_SIZE - 1)) != 0 17 | #error "Group size must be a multiple of the wave size for this library. Current setup not supported." 18 | #elif (HW_WAVE_SIZE > GROUP_SIZE) 19 | #error "Group size must be less than the wave size for this library. Current setup not supported." 
20 | #elif HW_WAVE_SIZE == GROUP_SIZE 21 | #define THREAD_UTILS_MODE THREAD_UTILS_MODE_GROUPWAVE 22 | #elif HW_WAVE_SIZE < GROUP_SIZE 23 | #define THREAD_UTILS_MODE THREAD_UTILS_MODE_SUBWAVE_IN_GROUP 24 | #else 25 | #error "Unsupported group size / wave size configuration." 26 | #endif 27 | #else 28 | #define THREAD_UTILS_MODE THREAD_UTILS_MODE_IMPL_LDS 29 | #endif 30 | 31 | #ifndef THREAD_UTILS_MODE 32 | #error "THREAD_UTILS_MODE must be defined at this point in this library." 33 | #endif 34 | 35 | namespace ThreadUtils 36 | { 37 | #define BIT_MASK_SIZE ((GROUP_SIZE + DWORD_BIT_SIZE - 1)/ DWORD_BIT_SIZE) 38 | 39 | struct GroupData2 40 | { 41 | uint bitMask0[BIT_MASK_SIZE]; 42 | uint bitMask1[BIT_MASK_SIZE]; 43 | }; 44 | 45 | void PrefixBitSum(uint groupThreadIndex, bool bitValue, out uint outOffset, out uint outCount); 46 | void PrefixBit2Sum(uint groupThreadIndex, bool2 bitValues, out uint2 outOffsets, out uint2 outCounts, out GroupData2 groupData); 47 | void PrefixBit2Sum(uint groupThreadIndex, bool2 bitValues, out uint2 outOffsets, out uint2 outCounts) 48 | { 49 | GroupData2 unused; 50 | PrefixBit2Sum(groupThreadIndex, bitValues, outOffsets, outCounts, unused); 51 | } 52 | void PrefixExclusive(uint groupThreadIndex, uint value, out uint outOffset, out uint outCount); 53 | void PrefixInclusive(uint groupThreadIndex, uint value, out uint outOffset, out uint outCount) 54 | { 55 | PrefixExclusive(groupThreadIndex, value, outOffset, outCount); 56 | outOffset += value; 57 | } 58 | uint CalculateGlobalStorageOffset(RWByteAddressBuffer counterBuffer, uint groupThreadIndex, bool bitValue); 59 | uint CalculateGlobalValueStorageOffset(RWByteAddressBuffer counterBuffer, uint groupThreadIndex, uint valueCount); 60 | 61 | #if THREAD_UTILS_MODE == THREAD_UTILS_MODE_IMPL_LDS 62 | 63 | groupshared uint gs_BitMask0[BIT_MASK_SIZE]; 64 | groupshared uint gs_BitMask1[BIT_MASK_SIZE]; 65 | 66 | void PrefixBitSum(uint groupThreadIndex, bool bitValue, out uint outOffset, out uint outCount) 67 | { 68 | if (groupThreadIndex < BIT_MASK_SIZE) 69 | gs_BitMask0[groupThreadIndex] = 0; 70 | 71 | GroupMemoryBarrierWithGroupSync(); 72 | 73 | uint maskOffset = groupThreadIndex >> DWORD_BIT_SIZE_LOG2; 74 | uint maskBit = (groupThreadIndex & (DWORD_BIT_SIZE - 1)); 75 | uint mask = 1u << maskBit; 76 | uint unused; 77 | 78 | [branch] 79 | if (bitValue) 80 | InterlockedOr(gs_BitMask0[maskOffset], mask, unused); 81 | 82 | GroupMemoryBarrierWithGroupSync(); 83 | 84 | outOffset = 0; 85 | outCount = 0; 86 | { 87 | [unroll] 88 | for (uint i = 0; i < BIT_MASK_SIZE; ++i) 89 | { 90 | uint maskCount = countbits(gs_BitMask0[i]); 91 | if (i < maskOffset) 92 | outOffset += maskCount; 93 | outCount += maskCount; 94 | } 95 | uint v = gs_BitMask0[maskOffset]; 96 | outOffset += countbits((mask - 1u) & v); 97 | } 98 | } 99 | 100 | void PrefixBit2Sum(uint groupThreadIndex,bool2 bitValues,out uint2 outOffsets, out uint2 outCounts, out GroupData2 groupData) 101 | { 102 | if (groupThreadIndex < BIT_MASK_SIZE) 103 | { 104 | gs_BitMask0[groupThreadIndex] = 0; 105 | gs_BitMask1[groupThreadIndex] = 0; 106 | } 107 | 108 | GroupMemoryBarrierWithGroupSync(); 109 | 110 | uint maskOffset = groupThreadIndex >> DWORD_BIT_SIZE_LOG2; 111 | uint maskBit = (groupThreadIndex & (DWORD_BIT_SIZE - 1)); 112 | uint mask = 1u << maskBit; 113 | uint unused; 114 | [branch] 115 | if (bitValues.x) 116 | InterlockedOr(gs_BitMask0[maskOffset], mask, unused); 117 | if (bitValues.y) 118 | InterlockedOr(gs_BitMask1[maskOffset], mask, unused); 119 | 120 | 
GroupMemoryBarrierWithGroupSync(); 121 | 122 | outOffsets = 0; 123 | outCounts = 0; 124 | { 125 | [unroll(BIT_MASK_SIZE)] 126 | for (uint i = 0; i < BIT_MASK_SIZE; ++i) 127 | { 128 | uint2 maskCounts = uint2(countbits(gs_BitMask0[i]), countbits(gs_BitMask1[i])); 129 | if (i < maskOffset) 130 | outOffsets += maskCounts; 131 | outCounts += maskCounts; 132 | } 133 | uint2 v = uint2(gs_BitMask0[maskOffset],gs_BitMask1[maskOffset]); 134 | outOffsets += uint2(countbits((mask - 1u) & v.x), countbits((mask - 1u) & v.y)); 135 | } 136 | 137 | [unroll(BIT_MASK_SIZE)] 138 | for (uint i = 0; i < BIT_MASK_SIZE; ++i) 139 | { 140 | groupData.bitMask0[i] = gs_BitMask0[i]; 141 | groupData.bitMask1[i] = gs_BitMask1[i]; 142 | } 143 | } 144 | 145 | groupshared uint gs_PrefixCache[GROUP_SIZE]; 146 | 147 | void PrefixExclusive(uint groupThreadIndex, uint value, out uint outOffset, out uint outCount) 148 | { 149 | gs_PrefixCache[groupThreadIndex] = value; 150 | 151 | GroupMemoryBarrierWithGroupSync(); 152 | 153 | for (uint i = 1; i < GROUP_SIZE; i <<= 1) 154 | { 155 | uint sampleVal = groupThreadIndex >= i ? gs_PrefixCache[groupThreadIndex - i] : 0u; 156 | 157 | GroupMemoryBarrierWithGroupSync(); 158 | 159 | gs_PrefixCache[groupThreadIndex] += sampleVal; 160 | 161 | GroupMemoryBarrierWithGroupSync(); 162 | } 163 | 164 | outOffset = gs_PrefixCache[groupThreadIndex] - value; 165 | outCount = gs_PrefixCache[GROUP_SIZE - 1]; 166 | } 167 | 168 | uint CalculateGlobalStorageOffset(RWByteAddressBuffer counterBuffer, uint groupThreadIndex, bool bitValue) 169 | { 170 | uint localOffset, totalCount; 171 | PrefixBitSum(groupThreadIndex, bitValue, localOffset, totalCount); 172 | 173 | if (groupThreadIndex == 0 && totalCount > 0) 174 | { 175 | uint globalOffset = 0; 176 | counterBuffer.InterlockedAdd(0, totalCount, globalOffset); 177 | gs_BitMask0[0] = globalOffset; 178 | } 179 | 180 | GroupMemoryBarrierWithGroupSync(); 181 | 182 | return gs_BitMask0[0] + localOffset; 183 | } 184 | 185 | uint CalculateGlobalValueStorageOffset(RWByteAddressBuffer counterBuffer, uint groupThreadIndex, uint valueCount) 186 | { 187 | uint localOffset, totalCount; 188 | PrefixExclusive(groupThreadIndex, valueCount, localOffset, totalCount); 189 | 190 | if (groupThreadIndex == 0 && totalCount > 0) 191 | { 192 | uint globalOffset = 0; 193 | counterBuffer.InterlockedAdd(0, totalCount, globalOffset); 194 | gs_PrefixCache[0] = globalOffset; 195 | } 196 | 197 | GroupMemoryBarrierWithGroupSync(); 198 | 199 | return gs_PrefixCache[0] + localOffset; 200 | } 201 | 202 | #elif THREAD_UTILS_MODE == THREAD_UTILS_MODE_GROUPWAVE || THREAD_UTILS_MODE == THREAD_UTILS_MODE_SUBWAVE_IN_GROUP 203 | 204 | #if THREAD_UTILS_MODE == THREAD_UTILS_MODE_SUBWAVE_IN_GROUP 205 | #define GROUP_WAVE_COMPONENT_SIZE (GROUP_SIZE / HW_WAVE_SIZE) 206 | groupshared uint gs_GroupCache0[GROUP_WAVE_COMPONENT_SIZE]; 207 | groupshared uint gs_GroupCache1[GROUP_WAVE_COMPONENT_SIZE]; 208 | #endif //THREAD_UTILS_MODE == THREAD_UTILS_MODE_SUBWAVE_IN_GROUP 209 | 210 | void PrefixBitSum(uint groupThreadIndex, bool bitValue, out uint outOffset, out uint outCount) 211 | { 212 | uint widx = WaveReadLaneFirst(groupThreadIndex / HW_WAVE_SIZE); 213 | uint prefixOffset = WavePrefixCountBits(bitValue); 214 | uint waveCount = WaveActiveCountBits(bitValue); 215 | 216 | #if THREAD_UTILS_MODE == THREAD_UTILS_MODE_SUBWAVE_IN_GROUP 217 | if (WaveIsFirstLane()) 218 | gs_GroupCache0[widx] = waveCount; 219 | 220 | GroupMemoryBarrierWithGroupSync(); 221 | 222 | outCount = 0; 223 | outOffset = prefixOffset; 224 | 
for (uint pwid = 0; pwid < GROUP_WAVE_COMPONENT_SIZE; ++pwid) 225 | { 226 | uint cacheVal = gs_GroupCache0[pwid]; 227 | if (pwid < widx) 228 | outOffset += cacheVal; 229 | outCount += cacheVal; 230 | } 231 | #else 232 | outOffset = prefixOffset; 233 | outCount = waveCount; 234 | #endif 235 | } 236 | 237 | void PrefixBit2Sum(uint groupThreadIndex, bool2 bitValues, out uint2 outOffsets, out uint2 outCounts, out GroupData2 groupData) 238 | { 239 | uint widx = WaveReadLaneFirst(groupThreadIndex / HW_WAVE_SIZE); 240 | uint2 prefixOffsets = uint2(WavePrefixCountBits(bitValues.x), WavePrefixCountBits(bitValues.y)); 241 | uint2 waveCounts = uint2(WaveActiveCountBits(bitValues.x), WaveActiveCountBits(bitValues.y)); 242 | 243 | #if THREAD_UTILS_MODE == THREAD_UTILS_MODE_SUBWAVE_IN_GROUP 244 | if (WaveIsFirstLane()) 245 | { 246 | gs_GroupCache0[widx] = waveCounts.x; 247 | gs_GroupCache1[widx] = waveCounts.y; 248 | } 249 | 250 | GroupMemoryBarrierWithGroupSync(); 251 | 252 | outCounts = 0; 253 | outOffsets = prefixOffsets; 254 | for (uint pwid = 0; pwid < GROUP_WAVE_COMPONENT_SIZE; ++pwid) 255 | { 256 | uint2 cacheVals = uint2(gs_GroupCache0[pwid], gs_GroupCache1[pwid]); 257 | if (pwid < widx) 258 | outOffsets += cacheVals; 259 | outCounts += cacheVals; 260 | } 261 | #else 262 | outOffsets = prefixOffsets; 263 | outCounts = waveCounts; 264 | #endif 265 | 266 | { 267 | const uint groupOffset = ((groupThreadIndex + DWORD_BIT_SIZE - 1) / DWORD_BIT_SIZE); 268 | [unroll(BIT_MASK_SIZE)] 269 | for (int i = 0; i < BIT_MASK_SIZE; i++) 270 | { 271 | const uint ballotIndex = i % 2; 272 | groupData.bitMask0[i + groupOffset] = WaveActiveBallot(bitValues.x)[ballotIndex]; 273 | groupData.bitMask1[i + groupOffset] = WaveActiveBallot(bitValues.y)[ballotIndex]; 274 | } 275 | } 276 | } 277 | 278 | void PrefixExclusive(uint groupThreadIndex, uint value, out uint outOffset, out uint outCount) 279 | { 280 | uint widx = WaveReadLaneFirst(groupThreadIndex / HW_WAVE_SIZE); 281 | uint prefixOffset = WavePrefixSum(value); 282 | uint waveCount = WaveActiveSum(value); 283 | 284 | #if THREAD_UTILS_MODE == THREAD_UTILS_MODE_SUBWAVE_IN_GROUP 285 | if (WaveIsFirstLane()) 286 | gs_GroupCache0[widx] = waveCount; 287 | 288 | GroupMemoryBarrierWithGroupSync(); 289 | 290 | outCount = 0; 291 | outOffset = prefixOffset; 292 | 293 | for (uint pwid = 0; pwid < GROUP_WAVE_COMPONENT_SIZE; ++pwid) 294 | { 295 | uint cacheVal = gs_GroupCache0[pwid]; 296 | if (pwid < widx) 297 | outOffset += cacheVal; 298 | outCount += cacheVal; 299 | } 300 | #else 301 | outOffset = prefixOffset; 302 | outCount = waveCount; 303 | #endif 304 | } 305 | 306 | uint CalculateGlobalStorageOffset(RWByteAddressBuffer counterBuffer, uint groupThreadIndex, bool bitValue) 307 | { 308 | uint localOffset, totalCount; 309 | PrefixBitSum(groupThreadIndex, bitValue, localOffset, totalCount); 310 | 311 | #if THREAD_UTILS_MODE == THREAD_UTILS_MODE_SUBWAVE_IN_GROUP 312 | if (groupThreadIndex == 0 && totalCount > 0) 313 | { 314 | uint globalOffset = 0; 315 | counterBuffer.InterlockedAdd(0, totalCount, globalOffset); 316 | gs_GroupCache0[0] = globalOffset; 317 | } 318 | 319 | GroupMemoryBarrierWithGroupSync(); 320 | 321 | return gs_GroupCache0[0] + localOffset; 322 | #else 323 | uint globalOffset = 0; 324 | if (WaveIsFirstLane() && totalCount > 0) 325 | counterBuffer.InterlockedAdd(0, totalCount, globalOffset); 326 | 327 | return WaveReadLaneFirst(globalOffset) + localOffset; 328 | #endif 329 | } 330 | 331 | uint CalculateGlobalValueStorageOffset(RWByteAddressBuffer counterBuffer, 
uint groupThreadIndex, uint valueCount) 332 | { 333 | uint localOffset, totalCount; 334 | PrefixExclusive(groupThreadIndex, valueCount, localOffset, totalCount); 335 | 336 | #if THREAD_UTILS_MODE == THREAD_UTILS_MODE_SUBWAVE_IN_GROUP 337 | if (groupThreadIndex == 0 && totalCount > 0) 338 | { 339 | uint globalOffset = 0; 340 | counterBuffer.InterlockedAdd(0, totalCount, globalOffset); 341 | gs_GroupCache0[0] = globalOffset; 342 | } 343 | 344 | GroupMemoryBarrierWithGroupSync(); 345 | 346 | return gs_GroupCache0[0] + localOffset; 347 | #else 348 | uint globalOffset = 0; 349 | if (WaveIsFirstLane() && totalCount > 0) 350 | counterBuffer.InterlockedAdd(0, totalCount, globalOffset); 351 | 352 | return WaveReadLaneFirst(globalOffset) + localOffset; 353 | #endif 354 | } 355 | 356 | #else //THREAD_UTILS_MODE unknown 357 | #error "Implementation for thread utilities not supported." 358 | #endif 359 | 360 | } 361 | 362 | #endif 363 | -------------------------------------------------------------------------------- /gpu_algorithms/gpu/utilities.py: -------------------------------------------------------------------------------- 1 | import coalpy.gpu as g 2 | import math 3 | 4 | g_clear_target_shader = g.Shader(file = "clear_target.hlsl", name = "clear", main_function = "csMainClear" ) 5 | g_clear_uint_buffer_shader = g.Shader(file = "clear_target.hlsl", name = "clear", main_function = "csMainClearUintBuffer" ) 6 | 7 | def clear_texture(cmd_list, color, texture, w, h): 8 | cmd_list.dispatch( 9 | shader = g_clear_target_shader, 10 | constants = color, 11 | x = math.ceil(w / 8), 12 | y = math.ceil(h / 8), 13 | z = 1, 14 | outputs = texture) 15 | 16 | def clear_uint_buffer(cmd_list, clear_val, buff, el_offset, el_count): 17 | cmd_list.dispatch( 18 | shader = g_clear_uint_buffer_shader, 19 | constants = [int(clear_val), int(el_offset), int(el_count)], 20 | outputs = buff, 21 | x = math.ceil(el_count / 64), 22 | y = 1, 23 | z = 1) 24 | 25 | 26 | def divup(a, b): 27 | return int((a + b - 1)/b) 28 | 29 | def alignup(a, b): 30 | return divup(a, b) * b 31 | -------------------------------------------------------------------------------- /gpu_algorithms/meson.build: -------------------------------------------------------------------------------- 1 | py.install_sources( 2 | [ 3 | '__init__.py', 4 | '__main__.py' 5 | ], 6 | subdir:'gpu_algorithms' 7 | ) 8 | 9 | subdir('native') 10 | subdir('gpu') 11 | 12 | -------------------------------------------------------------------------------- /gpu_algorithms/native/ModuleMain.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | PyObject* prefix_sum_python(PyObject* self, PyObject* vargs, PyObject* kwds) 7 | { 8 | const char* arguments[] = { "list_values", nullptr }; 9 | PyObject* list_values = nullptr; 10 | if (!PyArg_ParseTupleAndKeywords(vargs, kwds, "O", const_cast(arguments), &list_values)) 11 | return nullptr; 12 | 13 | if (!PyObject_CheckBuffer(list_values)) 14 | return nullptr; 15 | 16 | Py_buffer buffer_view = {}; 17 | if (PyObject_GetBuffer(list_values, &buffer_view, 0) != 0 || buffer_view.buf == nullptr) 18 | return nullptr; 19 | 20 | if (buffer_view.itemsize != 4) 21 | return 0; 22 | 23 | int* input_vals = reinterpret_cast(buffer_view.buf); 24 | int len = buffer_view.len/buffer_view.itemsize; 25 | 26 | clock_t begin_time = clock(); 27 | int* results = (int*)malloc(buffer_view.len); 28 | int accumulator = 0; 29 | for (int i = 0; i < len; ++i) 30 | { 31 | accumulator += 
input_vals[i]; 32 | results[i] = accumulator; 33 | } 34 | 35 | PyBuffer_Release(&buffer_view); 36 | 37 | PyObject* outputObj = PyBytes_FromStringAndSize((const char*)results, buffer_view.len); 38 | Py_INCREF(outputObj); 39 | free(results); 40 | clock_t end_time = clock(); 41 | double timeMilli = ((double)(end_time-begin_time))/(double)(CLOCKS_PER_SEC) * 1000.0; 42 | return Py_BuildValue("(fO)", timeMilli, outputObj); 43 | } 44 | 45 | PyObject* radix_sort_python(PyObject* self, PyObject* vargs, PyObject* kwds) 46 | { 47 | const char* arguments[] = { "list_values", nullptr }; 48 | PyObject* list_values = nullptr; 49 | if (!PyArg_ParseTupleAndKeywords(vargs, kwds, "O", const_cast(arguments), &list_values)) 50 | return nullptr; 51 | 52 | if (!PyObject_CheckBuffer(list_values)) 53 | return nullptr; 54 | 55 | Py_buffer buffer_view = {}; 56 | if (PyObject_GetBuffer(list_values, &buffer_view, 0) != 0 || buffer_view.buf == nullptr) 57 | return nullptr; 58 | 59 | if (buffer_view.itemsize != 4) 60 | return 0; 61 | 62 | int* input_vals = reinterpret_cast(buffer_view.buf); 63 | int len = buffer_view.len/buffer_view.itemsize; 64 | 65 | clock_t begin_time = clock(); 66 | 67 | const int bits_per_radix = 8; 68 | const int bytes_per_radix = bits_per_radix / 8; 69 | const int count_table_len = (1 << bits_per_radix); 70 | 71 | void* dynamic_memory = malloc(buffer_view.len * 3 + count_table_len * sizeof(int)); 72 | int* ping = (int*)dynamic_memory; 73 | int* pong = &ping[len]; 74 | int* offset_table = pong + len; 75 | int* count_table = offset_table + len; 76 | 77 | int* results = nullptr; 78 | int pass_count = (int)sizeof(int)/bytes_per_radix; 79 | for (int k = 0; k < pass_count; ++k) 80 | { 81 | int* pass_input = k == 0 ? (input_vals) : ((k & 1) == 0 ? ping : pong); 82 | int* pass_output = results = (k & 1) == 0 ? 
pong : ping; 83 | 84 | for (int i = 0; i < count_table_len; ++i) 85 | count_table[i] = 0; 86 | 87 | //counts 88 | for (int i = 0; i < len; ++i) 89 | { 90 | int bucket = (pass_input[i] >> (k * bits_per_radix)) & (count_table_len - 1); 91 | offset_table[i] = count_table[bucket]++; 92 | } 93 | 94 | ////prefix_sum 95 | int accumulator = 0; 96 | for (int i = 0; i < count_table_len; ++i) 97 | { 98 | int v = count_table[i]; 99 | count_table[i] = accumulator; 100 | accumulator += v; 101 | } 102 | 103 | //writes 104 | for (int i = 0; i < len; ++i) 105 | { 106 | int value = pass_input[i]; 107 | int bucket = (value >> (k * bits_per_radix)) & (count_table_len - 1); 108 | pass_output[count_table[bucket] + offset_table[i]] = value; 109 | } 110 | } 111 | 112 | PyBuffer_Release(&buffer_view); 113 | 114 | PyObject* outputObj = PyBytes_FromStringAndSize((const char*)results, buffer_view.len); 115 | Py_INCREF(outputObj); 116 | free(dynamic_memory); 117 | clock_t end_time = clock(); 118 | double timeMilli = ((double)(end_time-begin_time))/(double)(CLOCKS_PER_SEC) * 1000.0; 119 | return Py_BuildValue("(fO)", timeMilli, outputObj); 120 | } 121 | 122 | static PyMethodDef methods[] = { 123 | {"prefix_sum", (PyCFunction)prefix_sum_python, METH_VARARGS | METH_KEYWORDS, NULL}, 124 | {"radix_sort", (PyCFunction)radix_sort_python, METH_VARARGS | METH_KEYWORDS, NULL}, 125 | {NULL, NULL, 0, NULL}, 126 | }; 127 | 128 | static struct PyModuleDef module = { 129 | PyModuleDef_HEAD_INIT, 130 | "native", 131 | NULL, 132 | -1, 133 | methods, 134 | }; 135 | 136 | PyMODINIT_FUNC PyInit_native(void) 137 | { 138 | return PyModule_Create(&module); 139 | } 140 | -------------------------------------------------------------------------------- /gpu_algorithms/native/meson.build: -------------------------------------------------------------------------------- 1 | py.extension_module( 2 | 'native', 3 | 'ModuleMain.cpp', 4 | install: true, 5 | subdir: 'gpu_algorithms' 6 | ) 7 | -------------------------------------------------------------------------------- /meson.build: -------------------------------------------------------------------------------- 1 | project('purelib-and-platlib', 'cpp') 2 | 3 | py = import('python').find_installation(pure: false) 4 | 5 | subdir('gpu_algorithms') 6 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = 'mesonpy' 3 | requires = ['meson-python'] 4 | 5 | [project] 6 | name = 'gpu_algorithms' 7 | version = '0.0.1' 8 | description = 'GPU algorithms package to compare / benchmark.' 9 | readme = 'README.md' 10 | requires-python = '>=3.8' 11 | license = {file = 'LICENSE'} 12 | authors = [ 13 | {name = 'Kleber Garcia', email = 'kecho_garcia@hotmail.com'}, 14 | ] 15 | dependencies = [ 'numpy', 'coalpy' ] 16 | --------------------------------------------------------------------------------