├── ComputeUtils.hlsl
├── ComputeUtils.py
├── ComputeUtilsTests.hlsl
└── LICENSE

/ComputeUtils.hlsl:
--------------------------------------------------------------------------------
#ifndef __COMPUTE_UTILS_HEADER__
#define __COMPUTE_UTILS_HEADER__

#ifndef GROUP_SIZE
#error "ComputeUtils.hlsl requires definition of GROUP_SIZE"
#endif
#ifndef GROUP_SIZE_LOG_2
#if GROUP_SIZE == 32
#define GROUP_SIZE_LOG_2 5
#elif GROUP_SIZE == 64
#define GROUP_SIZE_LOG_2 6
#elif GROUP_SIZE == 128
#define GROUP_SIZE_LOG_2 7
#elif GROUP_SIZE == 256
#define GROUP_SIZE_LOG_2 8
#else
#error "ComputeUtils.hlsl requires definition of GROUP_SIZE_LOG_2, which must be the log2 of GROUP_SIZE"
#endif
#endif

#define DWORD_BIT_SIZE_LOG2 5
#define DWORD_BIT_SIZE (1 << DWORD_BIT_SIZE_LOG2)
#define BIT_MASK_SIZE ((GROUP_SIZE + DWORD_BIT_SIZE - 1) / DWORD_BIT_SIZE)

namespace ComputeUtils
{

groupshared uint gs_BitMask[BIT_MASK_SIZE];

// Exclusive prefix sum of one bit per thread. outOffset receives the number of
// lower-indexed threads whose bit is set; outCount receives the group total.
void PrefixBitSum(
    uint groupThreadIndex,
    bool bitValue,
    out uint outOffset,
    out uint outCount)
{
    if (groupThreadIndex < BIT_MASK_SIZE)
        gs_BitMask[groupThreadIndex] = 0;

    GroupMemoryBarrierWithGroupSync();

    uint maskOffset = groupThreadIndex >> DWORD_BIT_SIZE_LOG2;
    uint maskBit = (groupThreadIndex & (DWORD_BIT_SIZE - 1));
    uint mask = 1u << maskBit;

    [branch]
    if (bitValue)
    {
        uint unused;
        InterlockedOr(gs_BitMask[maskOffset], mask, unused);
    }

    GroupMemoryBarrierWithGroupSync();

    outOffset = 0;
    if (bitValue)
    {
        for (uint i = 0; i < maskOffset; ++i)
            outOffset += countbits(gs_BitMask[i]);
        uint v = gs_BitMask[maskOffset];
        outOffset += countbits((mask - 1u) & v);
    }

    outCount = 0;
    {
        [unroll]
        for (uint i = 0; i < BIT_MASK_SIZE; ++i)
            outCount += countbits(gs_BitMask[i]);
    }
}

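// Usage sketch (illustration only, not part of the original header): PrefixBitSum lets a
// group compact threads that pass a predicate into a dense index range without any global
// atomics. The name gs_Compacted and the predicate are assumptions made for the example.
//
//     groupshared uint gs_Compacted[GROUP_SIZE];
//
//     [numthreads(GROUP_SIZE, 1, 1)]
//     void ExampleCompactMain(uint groupThreadIndex : SV_GroupIndex)
//     {
//         bool keep = (groupThreadIndex & 1u) == 0u; // example predicate: keep even threads
//         uint offset, count;
//         ComputeUtils::PrefixBitSum(groupThreadIndex, keep, offset, count);
//         if (keep)
//             gs_Compacted[offset] = groupThreadIndex; // slots 0..count-1 are densely packed
//     }
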
groupshared uint gs_PrefixCache[GROUP_SIZE];

// Exclusive prefix sum of one uint per thread. outOffset receives the sum of the values
// of lower-indexed threads; outCount receives the group total.
void PrefixExclusive(
    uint groupThreadIndex,
    uint value,
    out uint outOffset,
    out uint outCount)
{
    gs_PrefixCache[groupThreadIndex] = value;

    GroupMemoryBarrierWithGroupSync();

    for (uint i = 1; i < GROUP_SIZE; i <<= 1)
    {
        uint sampleVal = groupThreadIndex >= i ? gs_PrefixCache[groupThreadIndex - i] : 0u;

        GroupMemoryBarrierWithGroupSync();

        gs_PrefixCache[groupThreadIndex] += sampleVal;

        GroupMemoryBarrierWithGroupSync();
    }

    outOffset = gs_PrefixCache[groupThreadIndex] - value;
    outCount = gs_PrefixCache[GROUP_SIZE - 1];
}

// Inclusive variant of PrefixExclusive: outOffset includes this thread's own value.
void PrefixInclusive(
    uint groupThreadIndex,
    uint value,
    out uint outOffset,
    out uint outCount)
{
    PrefixExclusive(groupThreadIndex, value, outOffset, outCount);
    outOffset += value;
}

// Reserves one output slot for every thread whose bitValue is set, using a single
// InterlockedAdd on counterBuffer per group. The return value is this thread's global
// slot when bitValue is set.
uint CalculateGlobalStorageOffset(
    RWByteAddressBuffer counterBuffer,
    uint groupThreadIndex,
    bool bitValue,
    out uint totalCount)
{
    uint localOffset;
    PrefixBitSum(groupThreadIndex, bitValue, localOffset, totalCount);

    // Make sure every thread has finished reading gs_BitMask inside PrefixBitSum
    // before thread 0 reuses gs_BitMask[0] to broadcast the global base offset.
    GroupMemoryBarrierWithGroupSync();

    if (groupThreadIndex == 0 && totalCount > 0)
    {
        uint globalOffset = 0;
        counterBuffer.InterlockedAdd(0, totalCount, globalOffset);
        gs_BitMask[0] = globalOffset;
    }

    GroupMemoryBarrierWithGroupSync();

    return gs_BitMask[0] + localOffset;
}

uint CalculateGlobalStorageOffset(
    RWByteAddressBuffer counterBuffer,
    uint groupThreadIndex,
    bool bitValue)
{
    uint unused0;
    return CalculateGlobalStorageOffset(counterBuffer, groupThreadIndex, bitValue, unused0);
}

// Reserves valueCount output slots for this thread, using a single InterlockedAdd on
// counterBuffer per group. The return value is the first global slot of this thread's run.
uint CalculateGlobalValueStorageOffset(
    RWByteAddressBuffer counterBuffer,
    uint groupThreadIndex,
    uint valueCount,
    out uint totalCount)
{
    uint localOffset;
    PrefixExclusive(groupThreadIndex, valueCount, localOffset, totalCount);

    if (groupThreadIndex == 0 && totalCount > 0)
    {
        uint globalOffset = 0;
        counterBuffer.InterlockedAdd(0, totalCount, globalOffset);
        gs_PrefixCache[0] = globalOffset;
    }

    GroupMemoryBarrierWithGroupSync();

    return gs_PrefixCache[0] + localOffset;
}

uint CalculateGlobalValueStorageOffset(
    RWByteAddressBuffer counterBuffer,
    uint groupThreadIndex,
    uint valueCount)
{
    uint unused0;
    return CalculateGlobalValueStorageOffset(counterBuffer, groupThreadIndex, valueCount, unused0);
}

#define BROADCAST_COMPACT_IDX_PER_DWORD (32/GROUP_SIZE_LOG_2)
#define BROADCAST_COMPACT_DWORDS ((GROUP_SIZE + BROADCAST_COMPACT_IDX_PER_DWORD - 1)/BROADCAST_COMPACT_IDX_PER_DWORD)
groupshared uint gs_BroadcastLocalCount[GROUP_SIZE];
groupshared uint gs_BroadcastPackedIdxMap[BROADCAST_COMPACT_DWORDS];

// Redistributes per-thread work counts across the group. Work items are flattened so each
// thread receives at most one item per call: outParentGroupID identifies the thread that
// produced the item, outIndex is the item's index within that parent for this call, and
// outParentCount is how many of the parent's items were scheduled this call. Returns true
// when the thread was assigned a valid item. Callers typically loop, subtracting
// outParentCount from their remaining work (see ComputeUtilsTests.hlsl).
bool BroadcastWork(
    uint groupThreadIndex,
    uint workCount,
    out uint outIndex,
    out uint outParentCount,
    out uint outParentGroupID)
{
    uint unused0;
    if (groupThreadIndex < BROADCAST_COMPACT_DWORDS)
        gs_BroadcastPackedIdxMap[groupThreadIndex] = 0;

    bool validWorkCount = workCount != 0;
    uint compactIdx;
    PrefixBitSum(groupThreadIndex, validWorkCount, compactIdx, unused0);
    if (validWorkCount)
        InterlockedOr(gs_BroadcastPackedIdxMap[compactIdx/BROADCAST_COMPACT_IDX_PER_DWORD], groupThreadIndex << ((compactIdx%BROADCAST_COMPACT_IDX_PER_DWORD)*GROUP_SIZE_LOG_2), unused0);

    uint groupOffset, groupCount;
    PrefixInclusive(groupThreadIndex, workCount, groupOffset, groupCount);

    if (groupThreadIndex < BIT_MASK_SIZE)
        gs_BitMask[groupThreadIndex] = 0;

    int leftInLDS = max((int)GROUP_SIZE - (int)(groupOffset - workCount), 0);
    int actualWorkCount = min(leftInLDS, workCount);
    gs_BroadcastLocalCount[groupThreadIndex] = actualWorkCount;

    GroupMemoryBarrierWithGroupSync();

    [branch]
    if (actualWorkCount > 0 && groupOffset < GROUP_SIZE)
        InterlockedOr(gs_BitMask[groupOffset >> DWORD_BIT_SIZE_LOG2], 1u << (groupOffset & (DWORD_BIT_SIZE - 1u)), unused0);

    GroupMemoryBarrierWithGroupSync();

    uint compactSampleIdx = 0;
    {
        uint groupDWOffset = groupThreadIndex >> DWORD_BIT_SIZE_LOG2;
        uint groupBitMask = 1u << (groupThreadIndex & (DWORD_BIT_SIZE - 1u));
        for (uint i = 0; i < groupDWOffset; ++i)
            compactSampleIdx += countbits(gs_BitMask[i]);
        uint v = gs_BitMask[groupDWOffset];
        compactSampleIdx += countbits(((groupBitMask - 1u) | groupBitMask) & v);
    }

    outParentGroupID = (gs_BroadcastPackedIdxMap[compactSampleIdx/BROADCAST_COMPACT_IDX_PER_DWORD] >> ((compactSampleIdx%BROADCAST_COMPACT_IDX_PER_DWORD)*GROUP_SIZE_LOG_2)) & ((1u << GROUP_SIZE_LOG_2) - 1u);

    uint parentPrefixExclusive = (outParentGroupID == 0 ? 0 : gs_PrefixCache[outParentGroupID - 1u]);
    outParentCount = gs_BroadcastLocalCount[outParentGroupID];
    outIndex = groupThreadIndex - parentPrefixExclusive;
    return outIndex < outParentCount;
}

}

#endif
--------------------------------------------------------------------------------
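Usage sketch: a minimal stream-compaction kernel built on ComputeUtils::CalculateGlobalStorageOffset might look as follows. The buffer names, register slots and the aliveness predicate are assumptions made for illustration; only the ComputeUtils calls come from the header above. Note that no thread may return before the call, since the helper synchronizes the whole group.

#define GROUP_SIZE 64
#include "ComputeUtils.hlsl"

ByteAddressBuffer g_Particles : register(t0);        // hypothetical input: one uint flag per element
RWByteAddressBuffer g_AliveCounter : register(u0);   // single uint, cleared to zero before dispatch
RWByteAddressBuffer g_AliveList : register(u1);      // compacted output

[numthreads(GROUP_SIZE, 1, 1)]
void CompactAliveMain(
    uint3 dispatchThreadID : SV_DispatchThreadID,
    uint groupThreadIndex : SV_GroupIndex)
{
    // Assumes the dispatch exactly covers g_Particles; an early out before the
    // ComputeUtils call would break its group-wide barriers.
    uint particle = g_Particles.Load(dispatchThreadID.x << 2);
    bool alive = particle != 0u;

    // One InterlockedAdd per group reserves a contiguous range; each surviving
    // thread receives a unique slot inside it.
    uint slot = ComputeUtils::CalculateGlobalStorageOffset(g_AliveCounter, groupThreadIndex, alive);
    if (alive)
        g_AliveList.Store(slot << 2, particle);
}
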
/ComputeUtils.py:
--------------------------------------------------------------------------------
import coalpy.gpu as g
import numpy as nm

## Compute Utils Test Script ##

#variables
g_layout_workloads = [540, 340, 299, 689, 229, 770, 8]
g_round_count = 8450
#g_round_count = 500
g_instance_count = len(g_layout_workloads) * g_round_count
g_total_output_count = sum(g_layout_workloads) * g_round_count
print ("instances: %d output: %d" % (g_instance_count, g_total_output_count))
#######

def load_gpu_buffer(gpu_buffer, numpy_type='int'):
    request = g.ResourceDownloadRequest(resource = gpu_buffer)
    request.resolve()
    return nm.frombuffer(request.data_as_bytearray(), dtype=numpy_type)

def test_broadcast_work(workloads, rounds, output_list, output_list_count):
    expected_mask_workloads = [((1 << v) - 1) for v in workloads]
    instance_counts = len(expected_mask_workloads) * rounds
    expected_masks = [expected_mask_workloads[i%len(expected_mask_workloads)] for i in range(0, instance_counts)]
    output_masks = [0 for i in range(0, instance_counts)]

    for i in range(0, output_list_count):
        output_masks[output_list[(i*2)]] |= 1 << int(output_list[(i*2) + 1])

    for i in range(0, len(output_masks)):
        if (expected_masks[i] != output_masks[i]):
            return False

    return True

g_init_shader = g.Shader(file = "ComputeUtilsTests.hlsl", main_function = "InitInputBufferMain")
g_init_shader.resolve()

g_distribute_shader = g.Shader(file = "ComputeUtilsTests.hlsl", main_function = "DistributeMain")
g_distribute_shader.resolve()

g_distribute_naive_shader = g.Shader(file = "ComputeUtilsTests.hlsl", main_function = "DistributeNaiveMain")
g_distribute_naive_shader.resolve()

work_layout_buffer = g.Buffer(type=g.BufferType.Raw, element_count=len(g_layout_workloads))
work_buffer = g.Buffer(type=g.BufferType.Raw, element_count = g_instance_count * 2)

counter_buffer = g.Buffer(type=g.BufferType.Raw, element_count = 1)
output_counter_buffer = g.Buffer(type=g.BufferType.Raw, element_count = 1)
output_buffer = g.Buffer(type=g.BufferType.Raw, element_count = g_total_output_count * 2)

output_naive_counter_buffer = g.Buffer(type=g.BufferType.Raw, element_count = 1)
output_naive_buffer = g.Buffer(type=g.BufferType.Raw, element_count = g_total_output_count * 2)

debug_counter_buffer = g.Buffer(type=g.BufferType.Raw, element_count = 1)
debug_buffer = g.Buffer(type=g.BufferType.Raw, element_count = 400 * 4)

cmd = g.CommandList()

g.begin_collect_markers()

cmd.upload_resource(
    source = g_layout_workloads,
    destination = work_layout_buffer)

cmd.upload_resource(
    source = [0],
    destination = counter_buffer)

cmd.upload_resource(
    source = [0],
    destination = output_counter_buffer)

cmd.upload_resource(
    source = [0],
    destination = output_naive_counter_buffer)

cmd.dispatch(
    constants = [ int(g_instance_count), int(len(g_layout_workloads)), int(0), int(0)],
    inputs = work_layout_buffer,
    outputs = work_buffer,
    shader = g_init_shader,
    x = int((g_instance_count + 63)/64),
    y = 1,
    z = 1)

cmd.begin_marker("GroupDistribute")
cmd.dispatch(
    constants = [ int(g_instance_count), int(0), int(0), int(0)],
    inputs = work_buffer,
    outputs = [counter_buffer, output_buffer, output_counter_buffer, debug_buffer, debug_counter_buffer],
    shader = g_distribute_shader,
    x = 3500,
    y = 1,
    z = 1)
cmd.end_marker()

cmd.begin_marker("GroupDistributeNaive")
cmd.dispatch(
    constants = [ int(g_instance_count), int(0), int(0), int(0)],
    inputs = work_buffer,
    outputs = [counter_buffer, output_naive_buffer, output_naive_counter_buffer, debug_buffer, debug_counter_buffer],
    shader = g_distribute_naive_shader,
    x = int((g_instance_count + 63)/64),
    y = 1,
    z = 1)
cmd.end_marker()

g.schedule(cmd)

marker_results = g.end_collect_markers()
gpu_timestamps = load_gpu_buffer(marker_results.timestamp_buffer, nm.uint64)
perf_data = [(name, (gpu_timestamps[ets] - gpu_timestamps[bts])/marker_results.timestamp_frequency) for (name, p, bts, ets) in marker_results.markers]
print (perf_data)

# Output
"""
output_counter_buffer_readback = load_gpu_buffer(output_counter_buffer)[0]
output_buffer_readback = load_gpu_buffer(output_buffer)
print("Distributed Received: %d" % output_counter_buffer_readback)
print(test_broadcast_work(g_layout_workloads, g_round_count, output_buffer_readback, output_counter_buffer_readback))

output_counter_buffer_readback = load_gpu_buffer(output_naive_counter_buffer)[0]
output_buffer_readback = load_gpu_buffer(output_naive_buffer)
print("Distributed Naive Received: %d" % output_counter_buffer_readback)
print(test_broadcast_work(g_layout_workloads, g_round_count, output_buffer_readback, output_counter_buffer_readback))
"""
--------------------------------------------------------------------------------
/ComputeUtilsTests.hlsl:
--------------------------------------------------------------------------------
#define GROUP_SIZE 64
#include "ComputeUtils.hlsl"

ByteAddressBuffer g_workLayout : register(t0);
RWByteAddressBuffer g_outputWorkBuffer : register(u0);

cbuffer Constants : register(b0)
{
    uint g_InstanceCount;
    uint g_LayoutWorkloadsCount;
    uint2 g_padding;
}

[numthreads(GROUP_SIZE,1,1)]
void InitInputBufferMain(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    if (dispatchThreadID.x >= g_InstanceCount)
        return;

    uint workLayoutCount = g_workLayout.Load(0);
    uint workID = dispatchThreadID.x;
    uint workCount = g_workLayout.Load((workID % g_LayoutWorkloadsCount) << 2);
    g_outputWorkBuffer.Store2(dispatchThreadID.x << 3, uint2(workID, workCount));
}

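// DistributeMain below uses a persistent-groups scheme: a fixed number of groups keeps
// pulling batches of GROUP_SIZE (instanceID, workCount) pairs by bumping g_CounterBuffer,
// caches them in LDS, and then repeatedly calls ComputeUtils::BroadcastWork so that each
// thread handles one flattened work item per pass. Output space for each pass is reserved
// with a single atomic through ComputeUtils::CalculateGlobalStorageOffset; when a pass
// emits nothing the cached batch is exhausted and the next batch is fetched.
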
ByteAddressBuffer g_InputBuffer : register(t0);
RWByteAddressBuffer g_CounterBuffer : register(u0);
RWByteAddressBuffer g_OutputBuffer : register(u1);
RWByteAddressBuffer g_OutputCounter : register(u2);
RWByteAddressBuffer g_DebugBuffer : register(u3);
RWByteAddressBuffer g_DebugBufferCounter : register(u4);

groupshared bool gs_GroupActive;
groupshared bool gs_LoadInstanceBatch;
groupshared uint gs_GroupInputOffset;
groupshared uint gs_InstanceIDCache[GROUP_SIZE];
groupshared uint gs_InstanceIDWorkLeft[GROUP_SIZE];
groupshared uint gs_InstanceIDWorkOffset[GROUP_SIZE];

[numthreads(GROUP_SIZE, 1, 1)]
void DistributeMain(
    uint3 dispatchThreadID : SV_DispatchThreadID,
    uint groupThreadIndex : SV_GroupIndex)
{
    if (groupThreadIndex == 0)
    {
        gs_GroupActive = true;
        gs_LoadInstanceBatch = true;
    }

    GroupMemoryBarrierWithGroupSync();

    while (gs_GroupActive)
    {
        if (gs_LoadInstanceBatch)
        {
            if (groupThreadIndex == 0)
            {
                uint inputOffset;
                g_CounterBuffer.InterlockedAdd(0, GROUP_SIZE, inputOffset);
                gs_GroupInputOffset = inputOffset;
                gs_GroupActive = inputOffset < g_InstanceCount;
            }

            GroupMemoryBarrierWithGroupSync();

            uint sampleIndex = gs_GroupInputOffset + groupThreadIndex;
            uint2 data = sampleIndex < g_InstanceCount ? g_InputBuffer.Load2(sampleIndex << 3) : uint2(0,0);
            gs_InstanceIDCache[groupThreadIndex] = data.x;
            gs_InstanceIDWorkLeft[groupThreadIndex] = data.y;
            gs_InstanceIDWorkOffset[groupThreadIndex] = 0;
            gs_LoadInstanceBatch = false;

            GroupMemoryBarrierWithGroupSync();
        }

        if (!gs_GroupActive)
            return;

        uint workIndex;
        uint workParentCount;
        uint workParentID;
        bool validWork = ComputeUtils::BroadcastWork(
            groupThreadIndex,
            gs_InstanceIDWorkLeft[groupThreadIndex],
            workIndex, workParentCount, workParentID);

        uint workOffset = workIndex + gs_InstanceIDWorkOffset[workParentID];

        uint outputCount;
        uint outputIndex = ComputeUtils::CalculateGlobalStorageOffset(g_OutputCounter, groupThreadIndex, validWork, outputCount);
        if (validWork)
            g_OutputBuffer.Store2(outputIndex << 3, uint2(gs_InstanceIDCache[workParentID], workOffset));

        if (workIndex == 0)
        {
            gs_InstanceIDWorkLeft[workParentID] -= workParentCount;
            gs_InstanceIDWorkOffset[workParentID] += workParentCount;
        }

        if (outputCount == 0)
            gs_LoadInstanceBatch = true;

        GroupMemoryBarrierWithGroupSync();
    }
}

[numthreads(GROUP_SIZE, 1, 1)]
void DistributeNaiveMain(
    uint3 dispatchThreadID : SV_DispatchThreadID)
{
    if (dispatchThreadID.x >= g_InstanceCount)
        return;

    uint2 instanceJob = g_InputBuffer.Load2(dispatchThreadID.x << 3);
    uint outputOffset;
    g_OutputCounter.InterlockedAdd(0, instanceJob.y, outputOffset);
    for (uint i = 0; i < instanceJob.y; ++i)
        g_OutputBuffer.Store2((outputOffset + i) << 3, uint2(instanceJob.x, i)); // Store2: each record is an 8-byte (instanceID, workIndex) pair
}

--------------------------------------------------------------------------------
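The tests above exercise BroadcastWork and CalculateGlobalStorageOffset; CalculateGlobalValueStorageOffset is not covered. A sketch of how it might be used to append a variable number of values per thread follows; the buffer names, register slots and per-thread counts are assumptions made for illustration.

#define GROUP_SIZE 64
#include "ComputeUtils.hlsl"

ByteAddressBuffer g_ValueCounts : register(t0);       // hypothetical input: one uint count per thread
RWByteAddressBuffer g_AppendCounter : register(u0);   // single uint, cleared to zero before dispatch
RWByteAddressBuffer g_AppendBuffer : register(u1);    // variable-length output

[numthreads(GROUP_SIZE, 1, 1)]
void AppendRunsMain(
    uint3 dispatchThreadID : SV_DispatchThreadID,
    uint groupThreadIndex : SV_GroupIndex)
{
    // Assumes the dispatch exactly covers g_ValueCounts, so no range check is needed.
    uint count = g_ValueCounts.Load(dispatchThreadID.x << 2);

    // The group performs a single InterlockedAdd for all of its values; each thread
    // receives the base index of its own contiguous run inside the reserved range.
    uint base = ComputeUtils::CalculateGlobalValueStorageOffset(g_AppendCounter, groupThreadIndex, count);

    for (uint i = 0; i < count; ++i)
        g_AppendBuffer.Store((base + i) << 2, dispatchThreadID.x); // record the producing thread's id
}
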
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Kleber Garcia

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------