├── LICENSE
├── README.md
└── src
    ├── Init.cuh
    ├── Main.cu
    ├── OneSweep.cu
    ├── OneSweep.cuh
    └── Utils.cuh

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Thomas Smith

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Notice: This repository has been archived.
This repository has been archived. The development and maintenance of its contents has been moved to https://github.com/b0nes164/GPUSorting.

# OneSweep
A simple, library-less CUDA implementation of Adinets and Merrill's [OneSweep](https://arxiv.org/abs/2206.01784) sorting algorithm. Given $2^{28}$ uniform random 32-bit keys, our implementation achieves $\sim$ 10.9 G keys/sec on an RTX 2080 Super, matching the performance achieved with the CUB library.

The purpose of this repo is to demystify the implementation of the algorithm. It is not intended for production use; a proper, fully featured implementation can be found in the [CUB](https://github.com/NVIDIA/cub) library. Notably, our implementation lacks: short-circuit evaluation, support for data types besides `unsigned int`, support for aligned scattering, and tuning for cards other than the 2080 Super. If you would like to run this code yourself, simply grab the latest version of the CUDA toolkit.
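At a high level, the sort is a single read of the keys to build all four digit histograms, one scan kernel that turns them into per-pass offsets, and four 8-bit binning passes that ping-pong the keys between two buffers. The sketch below is for orientation only: it mirrors `DispatchKernels` in `src/Main.cu` and reuses the buffer and launch-constant names declared there; the wrapper name `SortSketch` is made up, and error checking is omitted.

```cuda
//Orientation sketch only; see DispatchKernels in src/Main.cu for the real dispatch.
void SortSketch()
{
    //Build the 256-bin histogram of every 8-bit digit in a single pass over the keys.
    GlobalHistogram<<<globalHistThreadblocks, globalHistThreads>>>(sort, globalHistogram, size);

    //Exclusive-scan each histogram and seed partition 0 of every pass histogram with FLAG_INCLUSIVE.
    Scan<<<radixPasses, radix>>>(globalHistogram, firstPassHistogram, secPassHistogram,
        thirdPassHistogram, fourthPassHistogram);

    //Four least-significant-digit passes, 8 bits each, ping-ponging between the sort and alt buffers.
    DigitBinningPass<<<binningThreadblocks, binningThreads>>>(sort, alt, firstPassHistogram, index, size, 0);
    DigitBinningPass<<<binningThreadblocks, binningThreads>>>(alt, sort, secPassHistogram, index, size, 8);
    DigitBinningPass<<<binningThreadblocks, binningThreads>>>(sort, alt, thirdPassHistogram, index, size, 16);
    DigitBinningPass<<<binningThreadblocks, binningThreads>>>(alt, sort, fourthPassHistogram, index, size, 24);
}
```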

## Strongly Suggested Reading / Bibliography
Andy Adinets and Duane Merrill. "Onesweep: A Faster Least Significant Digit Radix Sort for GPUs." 2022. arXiv: 2206.01784 [cs.DC].

Duane Merrill and Michael Garland. "Single-pass Parallel Prefix Scan with Decoupled Look-back." 2016. URL: https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back

Saman Ashkiani et al. "GPU Multisplit." In: SIGPLAN Not. 51.8 (Feb. 2016). ISSN: 0362-1340. DOI: 10.1145/3016078.2851169. URL: https://doi.org/10.1145/3016078.2851169.
--------------------------------------------------------------------------------
/src/Init.cuh:
--------------------------------------------------------------------------------
#pragma once
#include <stdint.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

//Hybrid LCG-Tausworthe PRNG
//From GPU GEMS 3, Chapter 37
//Authors: Lee Howes and David Thomas
#define TAUS_STEP_1 ((z1 & 4294967294U) << 12) ^ (((z1 << 13) ^ z1) >> 19)
#define TAUS_STEP_2 ((z2 & 4294967288U) << 4) ^ (((z2 << 2) ^ z2) >> 25)
#define TAUS_STEP_3 ((z3 & 4294967280U) << 17) ^ (((z3 << 3) ^ z3) >> 11)
#define LCG_STEP (z4 * 1664525 + 1013904223U)
#define HYBRID_TAUS (z1 ^ z2 ^ z3 ^ z4)

//Initialize the input to a sequence of descending integers.
__global__ void InitDescending(uint32_t* sort, uint32_t size)
{
    for (uint32_t i = threadIdx.x + blockDim.x * blockIdx.x; i < size; i += blockDim.x * gridDim.x)
        sort[i] = size - i;
}

//Initialize the input to random integers. Because this is higher entropy than the descending sequence, and
//because we do not implement short-circuit evaluation, this tends to be significantly faster.
__global__ void InitRandom(uint32_t* sort, uint32_t size, uint32_t seed)
{
    uint32_t idx = threadIdx.x + blockDim.x * blockIdx.x;

    uint32_t z1 = (idx << 2) * seed;
    uint32_t z2 = ((idx << 2) + 1) * seed;
    uint32_t z3 = ((idx << 2) + 2) * seed;
    uint32_t z4 = ((idx << 2) + 3) * seed;

    for (uint32_t i = threadIdx.x + blockDim.x * blockIdx.x; i < size; i += blockDim.x * gridDim.x)
    {
        z1 = TAUS_STEP_1;
        z2 = TAUS_STEP_2;
        z3 = TAUS_STEP_3;
        z4 = LCG_STEP;
        sort[i] = HYBRID_TAUS;
    }
}
--------------------------------------------------------------------------------
/src/Main.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "Init.cuh"
#include "OneSweep.cuh"

const uint32_t size = (1 << 28);
const uint32_t testIterations = 25;

//Disable this when increasing test iterations, otherwise validation will be too slow
//because of the device to host readback speed
const uint32_t performValidation = true;

const uint32_t radix = 256;
const uint32_t radixPasses = 4;
const uint32_t partitionSize = 7680;
const uint32_t globalHistPartitionSize = 65536;
const uint32_t globalHistThreads = 128;
const uint32_t binningThreads = 512;    //2080 super seems to really like 512
const uint32_t binningThreadblocks = (size + partitionSize - 1) / partitionSize;
const uint32_t globalHistThreadblocks = (size + globalHistPartitionSize - 1) / globalHistPartitionSize;
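//Added note (not part of the original constants): partitionSize matches the geometry of the digit
//binning kernel in OneSweep.cu, where 512 binning threads form 16 warps of 32 lanes and each thread
//holds 15 keys, so one partition tile covers 16 * 32 * 15 = 7680 keys. The checks below are
//illustrative only and may be removed.
static_assert(partitionSize == (binningThreads / 32) * 32 * 15, "partition tile must match the binning kernel geometry");
static_assert(globalHistPartitionSize / 4 == 16384, "GlobalHistogram loads keys as uint4 vectors, 16384 per full block partition");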

uint32_t* sort;
uint32_t* alt;
uint32_t* index;
uint32_t* globalHistogram;
uint32_t* firstPassHistogram;
uint32_t* secPassHistogram;
uint32_t* thirdPassHistogram;
uint32_t* fourthPassHistogram;

void InitMemory()
{
    cudaMemset(index, 0, radixPasses * sizeof(uint32_t));
    cudaMemset(globalHistogram, 0, radix * radixPasses * sizeof(uint32_t));
    cudaMemset(firstPassHistogram, 0, radix * binningThreadblocks * sizeof(uint32_t));
    cudaMemset(secPassHistogram, 0, radix * binningThreadblocks * sizeof(uint32_t));
    cudaMemset(thirdPassHistogram, 0, radix * binningThreadblocks * sizeof(uint32_t));
    cudaMemset(fourthPassHistogram, 0, radix * binningThreadblocks * sizeof(uint32_t));
}

void DispatchKernels()
{
    InitMemory();

    cudaDeviceSynchronize();

    GlobalHistogram <<<globalHistThreadblocks, globalHistThreads>>> (sort, globalHistogram, size);

    Scan <<<radixPasses, radix>>> (globalHistogram, firstPassHistogram, secPassHistogram,
        thirdPassHistogram, fourthPassHistogram);

    DigitBinningPass <<<binningThreadblocks, binningThreads>>> (sort, alt, firstPassHistogram,
        index, size, 0);

    DigitBinningPass <<<binningThreadblocks, binningThreads>>> (alt, sort, secPassHistogram,
        index, size, 8);

    DigitBinningPass <<<binningThreadblocks, binningThreads>>> (sort, alt, thirdPassHistogram,
        index, size, 16);

    DigitBinningPass <<<binningThreadblocks, binningThreads>>> (alt, sort, fourthPassHistogram,
        index, size, 24);
}

//Test for correctness
void ValidationTest()
{
    printf("Beginning VALIDATION tests at size %u and %u iterations.\n", size, testIterations);
    uint32_t* validationArray = new uint32_t[size];
    int testsPassed = 0;

    for (uint32_t i = 1; i <= testIterations; ++i)
    {
        InitRandom <<<256, 1024>>> (sort, size, i);
        DispatchKernels();
        cudaDeviceSynchronize();
        cudaMemcpy(validationArray, sort, size * sizeof(uint32_t), cudaMemcpyDeviceToHost);

        bool isCorrect = true;
        for (uint32_t k = 1; k < size; ++k)
        {
            if (validationArray[k] < validationArray[k - 1])
            {
                isCorrect = false;
                break;
            }
        }

        if (isCorrect)
            testsPassed++;
        else
            printf("Test iteration %u failed.\n", i);
    }

    printf("%d/%u tests passed.\n", testsPassed, testIterations);
    delete[] validationArray;
}

//Discard the first result to prep caches and TLB
void TimingTest()
{
    printf("Beginning TIMING tests at size %u and %u iterations.\n", size, testIterations);
    printf("Running ");

    cudaEvent_t start;
    cudaEvent_t stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    float totalTime = 0.0f;
    for (uint32_t i = 0; i <= testIterations; ++i)
    {
        InitRandom <<<256, 1024>>> (sort, size, i);
        cudaDeviceSynchronize();
        cudaEventRecord(start);
        DispatchKernels();
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);

        float millis;
        cudaEventElapsedTime(&millis, start, stop);
        if (i)
            totalTime += millis;

        if ((i & 15) == 0)
            printf(". ");
    }

    printf("\n");
    totalTime /= 1000.0f;
    printf("Total time elapsed: %f\n", totalTime);
    printf("Estimated speed at %u 32-bit elements: %E keys/sec\n", size, size / totalTime * testIterations);
}

int main()
{
    cudaMalloc(&sort, size * sizeof(uint32_t));
    cudaMalloc(&alt, size * sizeof(uint32_t));
    cudaMalloc(&index, radixPasses * sizeof(uint32_t));
    cudaMalloc(&globalHistogram, radix * radixPasses * sizeof(uint32_t));
    cudaMalloc(&firstPassHistogram, binningThreadblocks * radix * sizeof(uint32_t));
    cudaMalloc(&secPassHistogram, binningThreadblocks * radix * sizeof(uint32_t));
    cudaMalloc(&thirdPassHistogram, binningThreadblocks * radix * sizeof(uint32_t));
    cudaMalloc(&fourthPassHistogram, binningThreadblocks * radix * sizeof(uint32_t));

    if (performValidation)
        ValidationTest();
    TimingTest();

    cudaFree(sort);
    cudaFree(alt);
    cudaFree(index);
    cudaFree(globalHistogram);
    cudaFree(firstPassHistogram);
    cudaFree(secPassHistogram);
    cudaFree(thirdPassHistogram);
    cudaFree(fourthPassHistogram);
}
--------------------------------------------------------------------------------
/src/OneSweep.cu:
--------------------------------------------------------------------------------
/******************************************************************************
 * OneSweep Implementation
 *
 * Author: Thomas Smith 2/26/2024
 *
 * Based off of Research by:
 *     Andy Adinets, Nvidia Corporation
 *     Duane Merrill, Nvidia Corporation
 *     https://research.nvidia.com/publication/2022-06_onesweep-faster-least-significant-digit-radix-sort-gpus
 *
 * Copyright (c) 2024 Thomas Smith
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 ******************************************************************************/
#include "OneSweep.cuh"

#define RADIX               256     //Number of digit bins
#define RADIX_MASK          255     //Mask of digit bins, to extract digits
#define RADIX_LOG           8       //log2(RADIX)

#define SEC_RADIX_START     256
#define THIRD_RADIX_START   512
#define FOURTH_RADIX_START  768

//For the upfront global histogram kernel
#define G_HIST_PART_SIZE    65536
#define G_HIST_VEC_SIZE     16384

//For the digit binning
#define BIN_PART_SIZE       7680    //Partition tile size in k_DigitBinning
#define BIN_HISTS_SIZE      4096    //Total size of warp histograms in shared memory in k_DigitBinning
#define BIN_SUB_PART_SIZE   480     //Subpartition tile size of a single warp in k_DigitBinning
#define BIN_WARPS           16      //Warps per threadblock in k_DigitBinning
#define BIN_KEYS_PER_THREAD 15      //Keys per thread in k_DigitBinning
#define BIN_SUB_PART_START  (WARP_INDEX * BIN_SUB_PART_SIZE)    //Starting offset of a subpartition tile
#define BIN_PART_START      (partitionIndex * BIN_PART_SIZE)    //Starting offset of a partition tile

//for the chained scan with decoupled lookback
#define FLAG_NOT_READY      0       //Flag value indicating neither inclusive sum, nor reduction of a partition tile is ready
#define FLAG_REDUCTION      1       //Flag value indicating reduction of a partition tile is ready
#define FLAG_INCLUSIVE      2       //Flag value indicating inclusive sum of a partition tile is ready
#define FLAG_MASK           3       //Mask used to retrieve flag values

__global__ void GlobalHistogram(
    uint32_t* sort,
    uint32_t* globalHistogram,
    uint32_t size)
{
    __shared__ uint32_t s_globalHistFirst[RADIX * 2];
    __shared__ uint32_t s_globalHistSec[RADIX * 2];
    __shared__ uint32_t s_globalHistThird[RADIX * 2];
    __shared__ uint32_t s_globalHistFourth[RADIX * 2];

    //clear shared memory
    for (uint32_t i = threadIdx.x; i < RADIX * 2; i += blockDim.x)
    {
        s_globalHistFirst[i] = 0;
        s_globalHistSec[i] = 0;
        s_globalHistThird[i] = 0;
        s_globalHistFourth[i] = 0;
    }
    __syncthreads();

    //histogram
    {
        //64 threads : 1 histogram in shared memory
        uint32_t* s_wavesHistFirst = &s_globalHistFirst[threadIdx.x / 64 * RADIX];
        uint32_t* s_wavesHistSec = &s_globalHistSec[threadIdx.x / 64 * RADIX];
        uint32_t* s_wavesHistThird = &s_globalHistThird[threadIdx.x / 64 * RADIX];
        uint32_t* s_wavesHistFourth = &s_globalHistFourth[threadIdx.x / 64 * RADIX];

        if (blockIdx.x < gridDim.x - 1)
        {
            const uint32_t partEnd = (blockIdx.x + 1) * G_HIST_VEC_SIZE;
            for (uint32_t i = threadIdx.x + (blockIdx.x * G_HIST_VEC_SIZE); i < partEnd; i += blockDim.x)
            {
                uint4 t[1] = { reinterpret_cast<uint4*>(sort)[i] };

                atomicAdd(&s_wavesHistFirst[reinterpret_cast<uint8_t*>(t)[0]], 1);
                atomicAdd(&s_wavesHistSec[reinterpret_cast<uint8_t*>(t)[1]], 1);
                atomicAdd(&s_wavesHistThird[reinterpret_cast<uint8_t*>(t)[2]], 1);
                atomicAdd(&s_wavesHistFourth[reinterpret_cast<uint8_t*>(t)[3]], 1);

                atomicAdd(&s_wavesHistFirst[reinterpret_cast<uint8_t*>(t)[4]], 1);
                atomicAdd(&s_wavesHistSec[reinterpret_cast<uint8_t*>(t)[5]], 1);
                atomicAdd(&s_wavesHistThird[reinterpret_cast<uint8_t*>(t)[6]], 1);
                atomicAdd(&s_wavesHistFourth[reinterpret_cast<uint8_t*>(t)[7]], 1);

                atomicAdd(&s_wavesHistFirst[reinterpret_cast<uint8_t*>(t)[8]], 1);
                atomicAdd(&s_wavesHistSec[reinterpret_cast<uint8_t*>(t)[9]], 1);
                atomicAdd(&s_wavesHistThird[reinterpret_cast<uint8_t*>(t)[10]], 1);
                atomicAdd(&s_wavesHistFourth[reinterpret_cast<uint8_t*>(t)[11]], 1);

                atomicAdd(&s_wavesHistFirst[reinterpret_cast<uint8_t*>(t)[12]], 1);
                atomicAdd(&s_wavesHistSec[reinterpret_cast<uint8_t*>(t)[13]], 1);
                atomicAdd(&s_wavesHistThird[reinterpret_cast<uint8_t*>(t)[14]], 1);
                atomicAdd(&s_wavesHistFourth[reinterpret_cast<uint8_t*>(t)[15]], 1);
            }
        }

        if (blockIdx.x == gridDim.x - 1)
        {
            for (uint32_t i = threadIdx.x + (blockIdx.x * G_HIST_PART_SIZE); i < size; i += blockDim.x)
            {
                uint32_t t[1] = { sort[i] };
                atomicAdd(&s_wavesHistFirst[reinterpret_cast<uint8_t*>(t)[0]], 1);
                atomicAdd(&s_wavesHistSec[reinterpret_cast<uint8_t*>(t)[1]], 1);
                atomicAdd(&s_wavesHistThird[reinterpret_cast<uint8_t*>(t)[2]], 1);
                atomicAdd(&s_wavesHistFourth[reinterpret_cast<uint8_t*>(t)[3]], 1);
            }
        }
    }
    __syncthreads();

    //reduce and add to device
    for (uint32_t i = threadIdx.x; i < RADIX; i += blockDim.x)
    {
        atomicAdd(&globalHistogram[i], s_globalHistFirst[i] + s_globalHistFirst[i + RADIX]);
        atomicAdd(&globalHistogram[i + SEC_RADIX_START], s_globalHistSec[i] + s_globalHistSec[i + RADIX]);
        atomicAdd(&globalHistogram[i + THIRD_RADIX_START], s_globalHistThird[i] + s_globalHistThird[i + RADIX]);
        atomicAdd(&globalHistogram[i + FOURTH_RADIX_START], s_globalHistFourth[i] + s_globalHistFourth[i + RADIX]);
    }
}
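//Notes on GlobalHistogram (comments added for clarity): keys are read once as uint4 vectors, and each
//of the 16 loaded bytes bumps a bin in one of the four digit histograms, so a single pass over the
//input produces the bin counts for all four radix passes. Every histogram is kept as two RADIX-sized
//copies in shared memory, with each group of 64 threads updating its own copy, which spreads the
//atomic traffic; the copies are then summed and atomically added into the device-wide globalHistogram.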

__global__ void Scan(
    uint32_t* globalHistogram,
    uint32_t* firstPassHistogram,
    uint32_t* secPassHistogram,
    uint32_t* thirdPassHistogram,
    uint32_t* fourthPassHistogram)
{
    __shared__ uint32_t s_scan[RADIX];

    s_scan[threadIdx.x] = InclusiveWarpScanCircularShift(globalHistogram[threadIdx.x + blockIdx.x * RADIX]);
    __syncthreads();

    if (threadIdx.x < (RADIX >> LANE_LOG))
        s_scan[threadIdx.x << LANE_LOG] = ActiveExclusiveWarpScan(s_scan[threadIdx.x << LANE_LOG]);
    __syncthreads();

    switch (blockIdx.x)
    {
    case 0:
        firstPassHistogram[threadIdx.x] =
            (s_scan[threadIdx.x] + (getLaneId() ? __shfl_sync(0xfffffffe, s_scan[threadIdx.x - 1], 1) : 0)) << 2 | FLAG_INCLUSIVE;
        break;
    case 1:
        secPassHistogram[threadIdx.x] =
            (s_scan[threadIdx.x] + (getLaneId() ? __shfl_sync(0xfffffffe, s_scan[threadIdx.x - 1], 1) : 0)) << 2 | FLAG_INCLUSIVE;
        break;
    case 2:
        thirdPassHistogram[threadIdx.x] =
            (s_scan[threadIdx.x] + (getLaneId() ? __shfl_sync(0xfffffffe, s_scan[threadIdx.x - 1], 1) : 0)) << 2 | FLAG_INCLUSIVE;
        break;
    case 3:
        fourthPassHistogram[threadIdx.x] =
            (s_scan[threadIdx.x] + (getLaneId() ? __shfl_sync(0xfffffffe, s_scan[threadIdx.x - 1], 1) : 0)) << 2 | FLAG_INCLUSIVE;
        break;
    default:
        break;
    }
}
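//Notes on Scan (comments added for clarity): each of the four blocks exclusive-scans one RADIX-sized
//slice of globalHistogram (a circular-shift warp scan followed by a scan over the eight warp totals)
//and writes the result into partition slot 0 of that pass's histogram, pre-tagged with FLAG_INCLUSIVE.
//Seeding slot 0 this way guarantees that the decoupled lookback in DigitBinningPass always terminates,
//since every chain of FLAG_REDUCTION entries ends at an inclusive entry.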

__global__ void DigitBinningPass(
    uint32_t* sort,
    uint32_t* alt,
    volatile uint32_t* passHistogram,
    volatile uint32_t* index,
    uint32_t size,
    uint32_t radixShift)
{
    __shared__ uint32_t s_warpHistograms[BIN_PART_SIZE];
    __shared__ uint32_t s_localHistogram[RADIX];
    volatile uint32_t* s_warpHist = &s_warpHistograms[WARP_INDEX << RADIX_LOG];

    //clear shared memory
    for (uint32_t i = threadIdx.x; i < BIN_HISTS_SIZE; i += blockDim.x) //unnecessary work for the last partition but still a win to avoid another barrier
        s_warpHistograms[i] = 0;

    //atomically assign partition tiles
    if (threadIdx.x == 0)
        s_warpHistograms[BIN_PART_SIZE - 1] = atomicAdd((uint32_t*)&index[radixShift >> 3], 1);
    __syncthreads();
    const uint32_t partitionIndex = s_warpHistograms[BIN_PART_SIZE - 1];

    //To handle input sizes not perfect multiples of the partition tile size
    if (partitionIndex < gridDim.x - 1)
    {
        //load keys
        uint32_t keys[BIN_KEYS_PER_THREAD];
        #pragma unroll
        for (uint32_t i = 0, t = getLaneId() + BIN_SUB_PART_START + BIN_PART_START; i < BIN_KEYS_PER_THREAD; ++i, t += LANE_COUNT)
            keys[i] = sort[t];

        uint16_t offsets[BIN_KEYS_PER_THREAD];

        //WLMS
        #pragma unroll
        for (uint32_t i = 0; i < BIN_KEYS_PER_THREAD; ++i)
        {
            //CUB version "match any"
            /*
            unsigned warpFlags;
            #pragma unroll
            for (int k = 0; k < RADIX_LOG; ++k)
            {
                uint32_t mask;
                uint32_t current_bit = 1 << k + radixShift;
                asm("{\n"
                    "    .reg .pred p;\n"
                    "    and.b32 %0, %1, %2;"
                    "    setp.ne.u32 p, %0, 0;\n"
                    "    vote.ballot.sync.b32 %0, p, 0xffffffff;\n"
                    "    @!p not.b32 %0, %0;\n"
                    "}\n" : "=r"(mask) : "r"(keys[i]), "r"(current_bit));
                warpFlags = (k == 0) ? mask : warpFlags & mask;
            }
            const uint32_t bits = __popc(warpFlags & getLaneMaskLt());
            */
            unsigned warpFlags = 0xffffffff;
            #pragma unroll
            for (int k = 0; k < RADIX_LOG; ++k)
            {
                const bool t2 = keys[i] >> k + radixShift & 1;
                warpFlags &= (t2 ? 0 : 0xffffffff) ^ __ballot_sync(0xffffffff, t2);
            }
            const uint32_t bits = __popc(warpFlags & getLaneMaskLt());

            //An alternative, but slightly slower version.
            /*
            offsets[i] = s_warpHist[keys[i] >> radixShift & RADIX_MASK] + bits;
            __syncwarp(0xffffffff);
            if (bits == 0)
                s_warpHist[keys[i] >> radixShift & RADIX_MASK] += __popc(warpFlags);
            __syncwarp(0xffffffff);
            */
            uint32_t preIncrementVal;
            if (bits == 0)
                preIncrementVal = atomicAdd((uint32_t*)&s_warpHist[keys[i] >> radixShift & RADIX_MASK], __popc(warpFlags));

            offsets[i] = __shfl_sync(0xffffffff, preIncrementVal, __ffs(warpFlags) - 1) + bits;
        }
        __syncthreads();
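        //Worked example of the warp-level multisplit (WLMS) ranking above (comment added for clarity):
        //suppose lanes 3, 9, and 20 of a warp hold keys whose current digit is 0x2A. After the ballot
        //loop, warpFlags in each of those lanes has exactly bits 3, 9, and 20 set, and
        //bits = __popc(warpFlags & getLaneMaskLt()) ranks them 0, 1, 2. Only the rank-0 lane performs
        //the shared memory atomicAdd for digit 0x2A, and its pre-increment value is shuffled from the
        //lowest set lane (__ffs(warpFlags) - 1) so every lane in the group obtains its offset into the
        //warp's histogram region.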

        //exclusive prefix sum up the warp histograms
        if (threadIdx.x < RADIX)
        {
            uint32_t reduction = s_warpHistograms[threadIdx.x];
            for (uint32_t i = threadIdx.x + RADIX; i < BIN_HISTS_SIZE; i += RADIX)
            {
                reduction += s_warpHistograms[i];
                s_warpHistograms[i] = reduction - s_warpHistograms[i];
            }

            atomicAdd((uint32_t*)&passHistogram[threadIdx.x + (partitionIndex + 1) * RADIX],
                FLAG_REDUCTION | reduction << 2);

            //begin the exclusive prefix sum across the reductions
            s_localHistogram[threadIdx.x] = InclusiveWarpScanCircularShift(reduction);
        }
        __syncthreads();

        if (threadIdx.x < (RADIX >> LANE_LOG))
            s_localHistogram[threadIdx.x << LANE_LOG] = ActiveExclusiveWarpScan(s_localHistogram[threadIdx.x << LANE_LOG]);
        __syncthreads();

        if (threadIdx.x < RADIX && getLaneId())
            s_localHistogram[threadIdx.x] += __shfl_sync(0xfffffffe, s_localHistogram[threadIdx.x - 1], 1);
        __syncthreads();

        //update offsets
        if (WARP_INDEX)
        {
            #pragma unroll
            for (uint32_t i = 0; i < BIN_KEYS_PER_THREAD; ++i)
            {
                const uint32_t t2 = keys[i] >> radixShift & RADIX_MASK;
                offsets[i] += s_warpHist[t2] + s_localHistogram[t2];
            }
        }
        else
        {
            #pragma unroll
            for (uint32_t i = 0; i < BIN_KEYS_PER_THREAD; ++i)
                offsets[i] += s_localHistogram[keys[i] >> radixShift & RADIX_MASK];
        }
        __syncthreads();

        //scatter keys into shared memory
        #pragma unroll
        for (uint32_t i = 0; i < BIN_KEYS_PER_THREAD; ++i)
            s_warpHistograms[offsets[i]] = keys[i];

        //split the warps into single thread cooperative groups and lookback
        if (threadIdx.x < RADIX)
        {
            uint32_t reduction = 0;
            for (uint32_t k = partitionIndex; k >= 0; )
            {
                const uint32_t flagPayload = passHistogram[threadIdx.x + k * RADIX];

                if ((flagPayload & FLAG_MASK) == FLAG_INCLUSIVE)
                {
                    reduction += flagPayload >> 2;
                    atomicAdd((uint32_t*)&passHistogram[threadIdx.x + (partitionIndex + 1) * RADIX], 1 | (reduction << 2));
                    s_localHistogram[threadIdx.x] = reduction - s_localHistogram[threadIdx.x];
                    break;
                }

                if ((flagPayload & FLAG_MASK) == FLAG_REDUCTION)
                {
                    reduction += flagPayload >> 2;
                    k--;
                }
            }
        }
        __syncthreads();
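        //Notes on the decoupled lookback above (comments added for clarity): a passHistogram entry
        //packs a count above a 2-bit flag as (count << 2) | flag. Each of the first RADIX threads owns
        //one digit and walks backwards through earlier partition slots, summing FLAG_REDUCTION payloads
        //until it reaches a FLAG_INCLUSIVE one. It then atomically upgrades this partition's own entry
        //(adding 1 | (reduction << 2) turns FLAG_REDUCTION into FLAG_INCLUSIVE and folds in the counts)
        //and rewrites s_localHistogram so that adding a key's position in shared memory yields its
        //global scatter destination. This is the chained scan with decoupled lookback of Merrill and
        //Garland, applied per digit.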

        //scatter runs of keys into device memory
        #pragma unroll BIN_KEYS_PER_THREAD
        for (uint32_t i = threadIdx.x; i < BIN_PART_SIZE; i += blockDim.x)
            alt[s_localHistogram[s_warpHistograms[i] >> radixShift & RADIX_MASK] + i] = s_warpHistograms[i];
    }

    //Process the final partition slightly differently
    if (partitionIndex == gridDim.x - 1)
    {
        //immediately begin lookback
        if (threadIdx.x < RADIX)
        {
            if (partitionIndex)
            {
                uint32_t reduction = 0;
                for (uint32_t k = partitionIndex; k >= 0; )
                {
                    const uint32_t flagPayload = passHistogram[threadIdx.x + k * RADIX];

                    if ((flagPayload & FLAG_MASK) == FLAG_INCLUSIVE)
                    {
                        reduction += flagPayload >> 2;
                        s_localHistogram[threadIdx.x] = reduction;
                        break;
                    }

                    if ((flagPayload & FLAG_MASK) == FLAG_REDUCTION)
                    {
                        reduction += flagPayload >> 2;
                        k--;
                    }
                }
            }
            else
            {
                s_localHistogram[threadIdx.x] = passHistogram[threadIdx.x] >> 2;
            }
        }
        __syncthreads();

        const uint32_t partEnd = BIN_PART_START + BIN_PART_SIZE;
        for (uint32_t i = threadIdx.x + BIN_PART_START; i < partEnd; i += blockDim.x)
        {
            uint32_t key;
            uint32_t offset;
            unsigned warpFlags = 0xffffffff;

            if (i < size)
                key = sort[i];

            #pragma unroll
            for (uint32_t k = 0; k < RADIX_LOG; ++k)
            {
                const bool t = key >> k + radixShift & 1;
                warpFlags &= (t ? 0 : 0xffffffff) ^ __ballot_sync(0xffffffff, t);
            }
            const uint32_t bits = __popc(warpFlags & getLaneMaskLt());

            #pragma unroll
            for (uint32_t k = 0; k < BIN_WARPS; ++k)
            {
                uint32_t preIncrementVal;
                if (WARP_INDEX == k && bits == 0 && i < size)
                    preIncrementVal = atomicAdd(&s_localHistogram[key >> radixShift & RADIX_MASK], __popc(warpFlags));

                if (WARP_INDEX == k)
                    offset = __shfl_sync(0xffffffff, preIncrementVal, __ffs(warpFlags) - 1) + bits;
                __syncthreads();
            }

            if (i < size)
                alt[offset] = key;
        }
    }
}
--------------------------------------------------------------------------------
/src/OneSweep.cuh:
--------------------------------------------------------------------------------
#pragma once
#include <stdint.h>
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "Utils.cuh"

__global__ void GlobalHistogram(
    uint32_t* sort,
    uint32_t* globalHistogram,
    uint32_t size);

__global__ void Scan(
    uint32_t* globalHistogram,
    uint32_t* firstPassHistogram,
    uint32_t* secPassHistogram,
    uint32_t* thirdPassHistogram,
    uint32_t* fourthPassHistogram);

__global__ void DigitBinningPass(
    uint32_t* sort,
    uint32_t* alt,
    volatile uint32_t* passHistogram,
    volatile uint32_t* index,
    uint32_t size,
    uint32_t radixShift);
--------------------------------------------------------------------------------
/src/Utils.cuh:
--------------------------------------------------------------------------------
#pragma once
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdint.h>
#include <stdio.h>

//General macros
#define LANE_COUNT  32      //Threads in a warp
#define LANE_MASK   31      //Mask of the lane count
#define LANE_LOG    5       //log2(LANE_COUNT)
#define WARP_INDEX  (threadIdx.x >> LANE_LOG)   //Warp of a thread

//PTX functions
__device__ __forceinline__ uint32_t getLaneId()
{
    uint32_t laneId;
    asm("mov.u32 %0, %%laneid;" : "=r"(laneId));
    return laneId;
}

__device__ __forceinline__ unsigned getLaneMaskLt()
{
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
    return mask;
}
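//Note on the two helpers above (comment added for clarity): %laneid is a thread's index within its
//warp (0-31) and %lanemask_lt is a bitmask of the lanes with a lower index. The binning kernels
//combine the latter with a ballot result, __popc(warpFlags & getLaneMaskLt()), to rank the lanes of
//a warp that share the same digit.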

//Warp scans
__device__ __forceinline__ uint32_t InclusiveWarpScan(uint32_t val)
{
    #pragma unroll
    for (int i = 1; i <= 16; i <<= 1) // 16 = LANE_COUNT >> 1
    {
        const uint32_t t = __shfl_up_sync(0xffffffff, val, i, 32);
        if (getLaneId() >= i) val += t;
    }

    return val;
}

__device__ __forceinline__ uint32_t ActiveInclusiveWarpScan(uint32_t val)
{
    const uint32_t mask = __activemask();
    #pragma unroll
    for (int i = 1; i <= 16; i <<= 1)
    {
        const uint32_t t = __shfl_up_sync(mask, val, i, 32);
        if (getLaneId() >= i) val += t;
    }

    return val;
}

__device__ __forceinline__ uint32_t InclusiveWarpScanCircularShift(uint32_t val)
{
    #pragma unroll
    for (int i = 1; i <= 16; i <<= 1) // 16 = LANE_COUNT >> 1
    {
        const uint32_t t = __shfl_up_sync(0xffffffff, val, i, 32);
        if (getLaneId() >= i) val += t;
    }

    return __shfl_sync(0xffffffff, val, getLaneId() + LANE_MASK & LANE_MASK);
}

__device__ __forceinline__ uint32_t ActiveInclusiveWarpScanCircularShift(uint32_t val)
{
    const uint32_t mask = __activemask();
    #pragma unroll
    for (int i = 1; i <= 16; i <<= 1) // 16 = LANE_COUNT >> 1
    {
        const uint32_t t = __shfl_up_sync(mask, val, i, 32);
        if (getLaneId() >= i) val += t;
    }

    return __shfl_sync(mask, val, getLaneId() + LANE_MASK & LANE_MASK);
}

__device__ __forceinline__ uint32_t ExclusiveWarpScan(uint32_t val)
{
    #pragma unroll
    for (int i = 1; i <= 16; i <<= 1) // 16 = LANE_COUNT >> 1
    {
        const uint32_t t = __shfl_up_sync(0xffffffff, val, i, 32);
        if (getLaneId() >= i) val += t;
    }

    const uint32_t t = __shfl_up_sync(0xffffffff, val, 1, 32);
    return getLaneId() ? t : 0;
}

__device__ __forceinline__ uint32_t ActiveExclusiveWarpScan(uint32_t val)
{
    const uint32_t mask = __activemask();
    #pragma unroll
    for (int i = 1; i <= 16; i <<= 1) // 16 = LANE_COUNT >> 1
    {
        const uint32_t t = __shfl_up_sync(mask, val, i, 32);
        if (getLaneId() >= i) val += t;
    }

    const uint32_t t = __shfl_up_sync(mask, val, 1, 32);
    return getLaneId() ? t : 0;
}
--------------------------------------------------------------------------------