├── .gitignore
├── README.md
├── radix_sort.sln
└── radix_sort
    ├── Makefile
    ├── main.cu
    ├── radix_sort.vcxproj
    ├── scan.cu
    ├── scan.h
    ├── sort.cu
    ├── sort.h
    ├── timer.h
    └── utils.h

/.gitignore:
--------------------------------------------------------------------------------
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore

# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates

# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs

# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/

# Visual Studio 2015 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

# NUNIT
*.VisualState.xml
TestResult.xml

# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c

# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
**/Properties/launchSettings.json

*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc

# Chutzpah Test files
_Chutzpah*

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb

# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap

# TFS 2012 Local Workspace
$tf/

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user

# JustCode is a .NET coding add-in
.JustCode

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# Visual Studio code coverage results
*.coverage
*.coveragexml

# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*

# MightyMoose
*.mm.*
AutoTest.Net/

# Web workbench (sass)
.sass-cache/

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# TODO: Comment the next line if you want to checkin your web deploy settings
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj

# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/

# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/packages/*
# except build/, which is used as an MSBuild target.
!**/packages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/packages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets

# Microsoft Azure Build Output
csx/
*.build.csdef

# Microsoft Azure Emulator
ecf/
rcf/

# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt

# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/

# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs

# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm

# SQL Server files
*.mdf
*.ldf
*.ndf

# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings

# Microsoft Fakes
FakesAssemblies/

# GhostDoc plugin setting file
*.GhostDoc.xml

# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/

# Typescript v1 declaration files
typings/

# Visual Studio 6 build log
*.plg

# Visual Studio 6 workspace options file
*.opt

# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw

# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions

# Paket dependency manager
.paket/paket.exe
paket-files/

# FAKE - F# Make
.fake/

# JetBrains Rider
.idea/
*.sln.iml

# CodeRush
.cr/

# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc

# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config

# Telerik's JustMock configuration file
*.jmconfig

# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# GPU Radix Sort
CUDA implementation of parallel radix sort using Blelloch scan
- Implementation of the 4-way radix sort described in this [paper by Ha, Krüger, and Silva](https://vgc.poly.edu/~csilva/papers/cgf.pdf)
- 2 bits per pass, resulting in a 4-way split each pass
- No per-pass order checking (early exit) yet
- Each block's internal scans use Hillis-Steele instead of Blelloch, since each internal scan's input is roughly the same size as the number of threads per block. At that scale, Hillis-Steele's higher work complexity is a worthwhile trade for a span half that of Blelloch's.
- Each block sorts its own local portion of the global array for greater memory coalescing during global shuffles
- Prefix summing the global block sums uses the [large-scale bank-conflict-free Blelloch scan](https://github.com/mark-poscablo/gpu-prefix-sum), which in turn uses the padded addressing solution for bank conflicts described in this [presentation by Mark Harris](https://www.mimuw.edu.pl/~ps209291/kgkp/slides/scan.pdf)
- For **randomly ordered** 134 million unsigned ints, **this outperforms** `std::sort()` by about **9.84x**
- For **descending ordered** 134 million unsigned ints, **this outperforms** `std::sort()` by about **1.30x**
- The results above were observed on a p2.xlarge AWS instance running the NVIDIA CUDA Toolkit 7.5 AMI, equipped with 12 EC2 Compute Units (4 virtual cores) and 1 NVIDIA K80 (GK210) GPU.
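
For reference, here is a minimal sequential sketch (not part of this repo) of what one pass computes: a stable 4-way counting sort keyed on bits `[shift, shift+1]`. Looping `shift = 0, 2, ..., 30` over it reproduces the ordering the kernels produce in parallel, where the histogram and its exclusive scan are built with block-wise and global scans instead of serial loops:

```cpp
#include <cstddef>
#include <vector>

// One stable counting-sort pass keyed on 2 bits of each element.
std::vector<unsigned int> radix_pass_2bit(const std::vector<unsigned int>& in,
                                          unsigned int shift)
{
    std::size_t counts[4] = {};                // histogram of the 4 digit values
    for (unsigned int v : in)
        ++counts[(v >> shift) & 3];
    std::size_t offsets[4];                    // exclusive scan of the histogram
    std::size_t run = 0;
    for (int d = 0; d < 4; ++d) { offsets[d] = run; run += counts[d]; }
    std::vector<unsigned int> out(in.size());
    for (unsigned int v : in)                  // stable scatter by digit
        out[offsets[(v >> shift) & 3]++] = v;
    return out;
}
```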
--------------------------------------------------------------------------------
/radix_sort.sln:
--------------------------------------------------------------------------------

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25420.1
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "radix_sort", "radix_sort\radix_sort.vcxproj", "{C5311F0C-578C-44CA-952B-6FBE97DDF80D}"
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
		Debug|x64 = Debug|x64
		Debug|x86 = Debug|x86
		Release|x64 = Release|x64
		Release|x86 = Release|x86
	EndGlobalSection
	GlobalSection(ProjectConfigurationPlatforms) = postSolution
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Debug|x64.ActiveCfg = Debug|x64
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Debug|x64.Build.0 = Debug|x64
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Debug|x86.ActiveCfg = Debug|Win32
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Debug|x86.Build.0 = Debug|Win32
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Release|x64.ActiveCfg = Release|x64
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Release|x64.Build.0 = Release|x64
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Release|x86.ActiveCfg = Release|Win32
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Release|x86.Build.0 = Release|Win32
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE
	EndGlobalSection
EndGlobal
--------------------------------------------------------------------------------
/radix_sort/Makefile:
--------------------------------------------------------------------------------
CUDA_PATH ?= "/usr/local/cuda-7.5"
NVCC := $(CUDA_PATH)/bin/nvcc
NVCC_OPTS=-O3 -arch=sm_37 -Xcompiler -Wall -Xcompiler -Wextra -m64

radix_sort: main.cu scan.o sort.o Makefile
	$(NVCC) -o radix_sort main.cu sort.o scan.o $(NVCC_OPTS)

sort.o: sort.cu
	$(NVCC) -c sort.cu $(NVCC_OPTS)

scan.o: scan.cu
	$(NVCC) -c scan.cu $(NVCC_OPTS)

clean:
	rm -f *.o radix_sort
--------------------------------------------------------------------------------
/radix_sort/main.cu:
--------------------------------------------------------------------------------
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <algorithm>
#include <ctime>
#include <cstdlib>
#include <iostream>

#include "sort.h"
#include "utils.h"

void cpu_sort(unsigned int* h_out, unsigned int* h_in, size_t len)
{
	for (size_t i = 0; i < len; ++i)
	{
		h_out[i] = h_in[i];
	}

	std::sort(h_out, h_out + len);
}

void test_cpu_vs_gpu(unsigned int* h_in, unsigned int num_elems)
{
	std::clock_t start;

	unsigned int* h_out_cpu = new unsigned int[num_elems];
	unsigned int* h_out_gpu = new unsigned int[num_elems];

	start = std::clock();
	cpu_sort(h_out_cpu, h_in, num_elems);
	double cpu_duration = (std::clock() - start) / (double)CLOCKS_PER_SEC;
	std::cout << "CPU time: " << cpu_duration << " s" << std::endl;

	unsigned int* d_in;
	unsigned int* d_out;
	checkCudaErrors(cudaMalloc(&d_in, sizeof(unsigned int) * num_elems));
	checkCudaErrors(cudaMalloc(&d_out, sizeof(unsigned int) * num_elems));
	checkCudaErrors(cudaMemcpy(d_in, h_in, sizeof(unsigned int) * num_elems, cudaMemcpyHostToDevice));
	start = std::clock();
	radix_sort(d_out, d_in, num_elems);
	// Kernel launches return asynchronously, so wait for the sort
	//  to actually finish before stopping the clock
	checkCudaErrors(cudaDeviceSynchronize());
	double gpu_duration = (std::clock() - start) / (double)CLOCKS_PER_SEC;
	std::cout << "GPU time: " << gpu_duration << " s" << std::endl;
	checkCudaErrors(cudaMemcpy(h_out_gpu, d_out, sizeof(unsigned int) * num_elems, cudaMemcpyDeviceToHost));
	checkCudaErrors(cudaFree(d_out));
	checkCudaErrors(cudaFree(d_in));

	// Calculate speedup as CPU time / GPU time
	std::cout << "Speedup: " << cpu_duration / gpu_duration << "x" << std::endl;

	// Check for any mismatches between outputs of CPU and GPU
	bool match = true;
	int index_diff = 0;
	for (unsigned int i = 0; i < num_elems; ++i)
	{
		if (h_out_cpu[i] != h_out_gpu[i])
		{
			match = false;
			index_diff = i;
			break;
		}
	}
	std::cout << "Match: " << match << std::endl;

	// Detail the mismatch if any
	if (!match)
	{
		std::cout << "Difference in index: " << index_diff << std::endl;
		std::cout << "CPU: " << h_out_cpu[index_diff] << std::endl;
		std::cout << "GPU Radix Sort: " << h_out_gpu[index_diff] << std::endl;
		int window_sz = 10;

		std::cout << "Contents: " << std::endl;
		std::cout << "CPU: ";
		for (int i = -(window_sz / 2); i < (window_sz / 2); ++i)
		{
			std::cout << h_out_cpu[index_diff + i] << ", ";
		}
		std::cout << std::endl;
		std::cout << "GPU Radix Sort: ";
		for (int i = -(window_sz / 2); i < (window_sz / 2); ++i)
		{
			std::cout << h_out_gpu[index_diff + i] << ", ";
		}
		std::cout << std::endl;
	}

	delete[] h_out_gpu;
	delete[] h_out_cpu;
}

int main()
{
	// Seed the RNG so that every run sorts the same random input
	srand(1);

	for (int i = 27; i < 28; ++i)
	{
		unsigned int num_elems = (1 << i);
		//unsigned int num_elems = 8;
		std::cout << "h_in size: " << num_elems << std::endl;

		unsigned int* h_in = new unsigned int[num_elems];
		unsigned int* h_in_rand = new unsigned int[num_elems];

		for (unsigned int j = 0; j < num_elems; j++)
		{
			h_in[j] = (num_elems - 1) - j;
			h_in_rand[j] = rand() % num_elems;
			//std::cout << h_in[j] << " ";
		}
		//std::cout << std::endl;

		std::cout << "*** i: " << i << " ***" << std::endl;
		for (int j = 0; j < 5; ++j) {
			std::cout << "*****Descending order*****" << std::endl;
			test_cpu_vs_gpu(h_in, num_elems);
			std::cout << "*****Random order*****" << std::endl;
			test_cpu_vs_gpu(h_in_rand, num_elems);
			std::cout << std::endl;
		}

		delete[] h_in;
		delete[] h_in_rand;

		std::cout << std::endl;
	}
}
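
A note on timing: the std::clock bracket above measures host wall time and needs the cudaDeviceSynchronize shown to be meaningful. Alternatively, timer.h (already in the repo, currently unused) provides a cudaEvent-based GpuTimer that is immune to launch asynchrony. A hypothetical helper, not wired into the build, sketching that approach:

#include "sort.h"
#include "timer.h"

// Times radix_sort() on the device itself using the repo's GpuTimer;
// Elapsed() synchronizes on the stop event and returns milliseconds.
float time_radix_sort_ms(unsigned int* d_out, unsigned int* d_in,
                         unsigned int num_elems)
{
    GpuTimer timer;
    timer.Start();
    radix_sort(d_out, d_in, num_elems);
    timer.Stop();
    return timer.Elapsed();
}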
--------------------------------------------------------------------------------
/radix_sort/radix_sort.vcxproj:
--------------------------------------------------------------------------------
[MSBuild XML project file; the markup was lost in extraction and only element text survives. Recoverable settings: Debug and Release configurations for Win32 and x64; project GUID {C5311F0C-578C-44CA-952B-6FBE97DDF80D}; console Application using toolset v140 with the MultiByte character set; warning level Level3, with Disabled optimization and _DEBUG definitions for Debug, and MaxSpeed with NDEBUG for Release (x64 configurations add WIN64); links cudart.lib plus the standard Windows libraries; a post-build step copies "$(CudaToolkitBinDir)\cudart*.dll" to "$(OutDir)"; the x64 configurations build 64-bit CUDA device code for compute_50,sm_50.]
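
scan.cu below implements Blelloch's work-efficient exclusive scan per block. As a reference for the upsweep/downsweep structure, a minimal sequential sketch (illustrative only, not code from the repo; assumes a power-of-two length and omits the bank-conflict padding):

#include <cstddef>

// Sequential model of the Blelloch exclusive scan that gpu_prescan
//  performs per block.
void blelloch_exclusive_scan(unsigned int* data, std::size_t n)
{
    // Upsweep (reduce): build partial sums up a binary tree
    for (std::size_t stride = 1; stride < n; stride <<= 1)
        for (std::size_t i = 2 * stride - 1; i < n; i += 2 * stride)
            data[i] += data[i - stride];
    data[n - 1] = 0; // clear the root
    // Downsweep: push prefix sums back down the tree
    for (std::size_t stride = n / 2; stride >= 1; stride >>= 1)
        for (std::size_t i = 2 * stride - 1; i < n; i += 2 * stride)
        {
            unsigned int t = data[i - stride];
            data[i - stride] = data[i];  // pass the prefix down to the left child
            data[i] += t;                // right child gets prefix + left subtree sum
        }
}

For example, [1, 2, 3, 4] upsweeps to [1, 3, 3, 10], and after clearing the root the downsweep yields the exclusive prefix sums [0, 1, 3, 6].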
--------------------------------------------------------------------------------
/radix_sort/scan.cu:
--------------------------------------------------------------------------------
#include "scan.h"

#define MAX_BLOCK_SZ 128
#define NUM_BANKS 32
#define LOG_NUM_BANKS 5

//#define ZERO_BANK_CONFLICTS

#ifdef ZERO_BANK_CONFLICTS
// Padding of index/NUM_BANKS + index/(NUM_BANKS^2), parenthesized so the
//  shifts happen before the addition
#define CONFLICT_FREE_OFFSET(n) \
	(((n) >> LOG_NUM_BANKS) + ((n) >> (2 * LOG_NUM_BANKS)))
#else
#define CONFLICT_FREE_OFFSET(n) ((n) >> LOG_NUM_BANKS)
#endif

__global__
void gpu_add_block_sums(unsigned int* const d_out,
	const unsigned int* const d_in,
	unsigned int* const d_block_sums,
	const size_t numElems)
{
	//unsigned int glbl_t_idx = blockDim.x * blockIdx.x + threadIdx.x;
	unsigned int d_block_sum_val = d_block_sums[blockIdx.x];

	//unsigned int d_in_val_0 = 0;
	//unsigned int d_in_val_1 = 0;

	// Simple implementation's performance is not significantly (if at all)
	//  better than the previous, more verbose implementation below
	unsigned int cpy_idx = 2 * blockIdx.x * blockDim.x + threadIdx.x;
	if (cpy_idx < numElems)
	{
		d_out[cpy_idx] = d_in[cpy_idx] + d_block_sum_val;
		if (cpy_idx + blockDim.x < numElems)
			d_out[cpy_idx + blockDim.x] = d_in[cpy_idx + blockDim.x] + d_block_sum_val;
	}

	//if (2 * glbl_t_idx < numElems)
	//{
	//	d_out[2 * glbl_t_idx] = d_in[2 * glbl_t_idx] + d_block_sum_val;
	//	if (2 * glbl_t_idx + 1 < numElems)
	//		d_out[2 * glbl_t_idx + 1] = d_in[2 * glbl_t_idx + 1] + d_block_sum_val;
	//}

	//if (2 * glbl_t_idx < numElems)
	//{
	//	d_in_val_0 = d_in[2 * glbl_t_idx];
	//	if (2 * glbl_t_idx + 1 < numElems)
	//		d_in_val_1 = d_in[2 * glbl_t_idx + 1];
	//}
	//else
	//	return;
	//__syncthreads();

	//d_out[2 * glbl_t_idx] = d_in_val_0 + d_block_sum_val;
	//if (2 * glbl_t_idx + 1 < numElems)
	//	d_out[2 * glbl_t_idx + 1] = d_in_val_1 + d_block_sum_val;
}

// Modified version of Mark Harris' implementation of the Blelloch scan
//  according to https://www.mimuw.edu.pl/~ps209291/kgkp/slides/scan.pdf
__global__
void gpu_prescan(unsigned int* const d_out,
	const unsigned int* const d_in,
	unsigned int* const d_block_sums,
	const unsigned int len,
	const unsigned int shmem_sz,
	const unsigned int max_elems_per_block)
{
	// Allocated on invocation
	extern __shared__ unsigned int s_out[];

	int thid = threadIdx.x;
	int ai = thid;
	int bi = thid + blockDim.x;

	// Zero out the shared memory
	// Helpful especially when input size is not a power of two
	s_out[thid] = 0;
	s_out[thid + blockDim.x] = 0;
	// If CONFLICT_FREE_OFFSET is used, shared memory size
	//  must be 2 * blockDim.x + blockDim.x / num_banks
	s_out[thid + blockDim.x + (blockDim.x >> LOG_NUM_BANKS)] = 0;

	__syncthreads();

	// Copy d_in to shared memory
	// Note that d_in's elements are scattered into shared memory
	//  so as to avoid bank conflicts
	unsigned int cpy_idx = max_elems_per_block * blockIdx.x + threadIdx.x;
	if (cpy_idx < len)
	{
		s_out[ai + CONFLICT_FREE_OFFSET(ai)] = d_in[cpy_idx];
		if (cpy_idx + blockDim.x < len)
			s_out[bi + CONFLICT_FREE_OFFSET(bi)] = d_in[cpy_idx + blockDim.x];
	}

	// For both upsweep and downsweep:
	// Sequential indices with conflict-free padding
	//  Amount of padding = target index / num banks
	//  This "shifts" the target indices by one every multiple
	//   of the num banks
	// offset controls the stride and starting index of
	//  target elems at every iteration
	// d just controls which threads are active
	// Sweeps are pivoted on the last element of shared memory

	// Upsweep/Reduce step
	int offset = 1;
	for (int d = max_elems_per_block >> 1; d > 0; d >>= 1)
	{
		__syncthreads();

		if (thid < d)
		{
			int ai = offset * ((thid << 1) + 1) - 1;
			int bi = offset * ((thid << 1) + 2) - 1;
			ai += CONFLICT_FREE_OFFSET(ai);
			bi += CONFLICT_FREE_OFFSET(bi);

			s_out[bi] += s_out[ai];
		}
		offset <<= 1;
	}

	// Save the total sum on the global block sums array
	// Then clear the last element on the shared memory
	if (thid == 0)
	{
		d_block_sums[blockIdx.x] = s_out[max_elems_per_block - 1
			+ CONFLICT_FREE_OFFSET(max_elems_per_block - 1)];
		s_out[max_elems_per_block - 1
			+ CONFLICT_FREE_OFFSET(max_elems_per_block - 1)] = 0;
	}

	// Downsweep step
	for (int d = 1; d < max_elems_per_block; d <<= 1)
	{
		offset >>= 1;
		__syncthreads();

		if (thid < d)
		{
			int ai = offset * ((thid << 1) + 1) - 1;
			int bi = offset * ((thid << 1) + 2) - 1;
			ai += CONFLICT_FREE_OFFSET(ai);
			bi += CONFLICT_FREE_OFFSET(bi);

			unsigned int temp = s_out[ai];
			s_out[ai] = s_out[bi];
			s_out[bi] += temp;
		}
	}
	__syncthreads();

	// Copy contents of shared memory to global memory
	if (cpy_idx < len)
	{
		d_out[cpy_idx] = s_out[ai + CONFLICT_FREE_OFFSET(ai)];
		if (cpy_idx + blockDim.x < len)
			d_out[cpy_idx + blockDim.x] = s_out[bi + CONFLICT_FREE_OFFSET(bi)];
	}
}

void sum_scan_blelloch(unsigned int* const d_out,
	const unsigned int* const d_in,
	const size_t numElems)
{
	// Zero out d_out
	checkCudaErrors(cudaMemset(d_out, 0, numElems * sizeof(unsigned int)));

	// Set up number of threads and blocks

	unsigned int block_sz = MAX_BLOCK_SZ / 2;
	unsigned int max_elems_per_block = 2 * block_sz; // due to binary tree nature of algorithm

	// If input size is not power of two, the remainder will still need a whole block
	// Thus, number of blocks must be the ceiling of input size / max elems that a block can handle
	//unsigned int grid_sz = (unsigned int) std::ceil((double) numElems / (double) max_elems_per_block);
	// UPDATE: Instead of using ceiling and risking miscalculation due to precision, just automatically
	//  add 1 to the grid size when the input size cannot be divided cleanly by the block's capacity
	unsigned int grid_sz = numElems / max_elems_per_block;
	// Take advantage of the fact that integer division drops the decimals
	if (numElems % max_elems_per_block != 0)
		grid_sz += 1;

	// Conflict-free padding requires that shared memory be more than 2 * block_sz
	unsigned int shmem_sz = max_elems_per_block + ((max_elems_per_block) >> LOG_NUM_BANKS);

	// Allocate memory for array of total sums produced by each block
	// Array length must be the same as number of blocks
	unsigned int* d_block_sums;
	checkCudaErrors(cudaMalloc(&d_block_sums, sizeof(unsigned int) * grid_sz));
	checkCudaErrors(cudaMemset(d_block_sums, 0, sizeof(unsigned int) * grid_sz));

	// Sum scan data allocated to each block
	//gpu_sum_scan_blelloch<<<grid_sz, block_sz, sizeof(unsigned int) * max_elems_per_block>>>(d_out, d_in, d_block_sums, numElems);
	gpu_prescan<<<grid_sz, block_sz, sizeof(unsigned int) * shmem_sz>>>(d_out,
		d_in,
		d_block_sums,
		numElems,
		shmem_sz,
		max_elems_per_block);

	// Sum scan total sums produced by each block
	// Use basic implementation if number of total sums is <= 2 * block_sz
	//  (This requires only one block to do the scan)
	if (grid_sz <= max_elems_per_block)
	{
		unsigned int* d_dummy_blocks_sums;
		checkCudaErrors(cudaMalloc(&d_dummy_blocks_sums, sizeof(unsigned int)));
		checkCudaErrors(cudaMemset(d_dummy_blocks_sums, 0, sizeof(unsigned int)));
		//gpu_sum_scan_blelloch<<<1, block_sz, sizeof(unsigned int) * max_elems_per_block>>>(d_block_sums, d_block_sums, d_dummy_blocks_sums, grid_sz);
		gpu_prescan<<<1, block_sz, sizeof(unsigned int) * shmem_sz>>>(d_block_sums,
			d_block_sums,
			d_dummy_blocks_sums,
			grid_sz,
			shmem_sz,
			max_elems_per_block);
		checkCudaErrors(cudaFree(d_dummy_blocks_sums));
	}
	// Else, recurse on this same function as you'll need the full-blown scan
	//  for the block sums
	else
	{
		unsigned int* d_in_block_sums;
		checkCudaErrors(cudaMalloc(&d_in_block_sums, sizeof(unsigned int) * grid_sz));
		checkCudaErrors(cudaMemcpy(d_in_block_sums, d_block_sums, sizeof(unsigned int) * grid_sz, cudaMemcpyDeviceToDevice));
		sum_scan_blelloch(d_block_sums, d_in_block_sums, grid_sz);
		checkCudaErrors(cudaFree(d_in_block_sums));
	}

	//// Uncomment to examine block sums
	//unsigned int* h_block_sums = new unsigned int[grid_sz];
	//checkCudaErrors(cudaMemcpy(h_block_sums, d_block_sums, sizeof(unsigned int) * grid_sz, cudaMemcpyDeviceToHost));
	//std::cout << "Block sums: ";
	//for (int i = 0; i < grid_sz; ++i)
	//{
	//	std::cout << h_block_sums[i] << ", ";
	//}
	//std::cout << std::endl;
	//std::cout << "Block sums length: " << grid_sz << std::endl;
	//delete[] h_block_sums;

	// Add each block's total sum to its scan output
	//  in order to get the final, global scanned array
	gpu_add_block_sums<<<grid_sz, block_sz>>>(d_out, d_out, d_block_sums, numElems);

	checkCudaErrors(cudaFree(d_block_sums));
}
--------------------------------------------------------------------------------
/radix_sort/scan.h:
--------------------------------------------------------------------------------
#ifndef SCAN_H__
#define SCAN_H__

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>

#include "utils.h"
#include "timer.h"

void sum_scan_naive(unsigned int* const d_out,
	const unsigned int* const d_in,
	const size_t numElems);

void sum_scan_blelloch(unsigned int* const d_out,
	const unsigned int* const d_in,
	const size_t numElems);

#endif
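
A quick illustration of the padded addressing the scan above relies on: with 32 banks, shared-memory word i lives in bank i % 32, so the tree sweeps' strided accesses to indices 0, 32, 64, ... would all hit bank 0 and serialize. CONFLICT_FREE_OFFSET spreads them across banks (standalone snippet for illustration, not code from the repo):

#define NUM_BANKS 32
#define LOG_NUM_BANKS 5
#define CONFLICT_FREE_OFFSET(n) ((n) >> LOG_NUM_BANKS)

// Logical indices 0, 32, 64, ... all map to bank 0 unpadded;
//  padded, they land at 0, 33, 66, ... i.e. banks 0, 1, 2, ...
unsigned int padded_index(unsigned int i)
{
    return i + CONFLICT_FREE_OFFSET(i);
}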
--------------------------------------------------------------------------------
/radix_sort/sort.cu:
--------------------------------------------------------------------------------
#include "sort.h"

#define MAX_BLOCK_SZ 128

__global__ void gpu_radix_sort_local(unsigned int* d_out_sorted,
	unsigned int* d_prefix_sums,
	unsigned int* d_block_sums,
	unsigned int input_shift_width,
	unsigned int* d_in,
	unsigned int d_in_len,
	unsigned int max_elems_per_block)
{
	// need shared memory array for:
	// - block's share of the input data (local sort will be put here too)
	// - mask outputs
	// - scanned mask outputs
	// - merged scanned mask outputs ("local prefix sum")
	// - local sums of scanned mask outputs
	// - scanned local sums of scanned mask outputs

	// for all radix combinations:
	//  build mask output for current radix combination
	//  scan mask output
	//  store needed value from current prefix sum array to merged prefix sum array
	//  store total sum of mask output (obtained from scan) to global block sum array
	// calculate local sorted address from local prefix sum and scanned mask output's total sums
	// shuffle input block according to calculated local sorted addresses
	// shuffle local prefix sums according to calculated local sorted addresses
	// copy locally sorted array back to global memory
	// copy local prefix sum array back to global memory

	extern __shared__ unsigned int shmem[];
	unsigned int* s_data = shmem;
	// s_mask_out[] will be scanned in place
	unsigned int s_mask_out_len = max_elems_per_block + 1;
	unsigned int* s_mask_out = &s_data[max_elems_per_block];
	unsigned int* s_merged_scan_mask_out = &s_mask_out[s_mask_out_len];
	unsigned int* s_mask_out_sums = &s_merged_scan_mask_out[max_elems_per_block];
	unsigned int* s_scan_mask_out_sums = &s_mask_out_sums[4];

	unsigned int thid = threadIdx.x;

	// Copy block's portion of global input data to shared memory
	unsigned int cpy_idx = max_elems_per_block * blockIdx.x + thid;
	if (cpy_idx < d_in_len)
		s_data[thid] = d_in[cpy_idx];
	else
		s_data[thid] = 0;

	__syncthreads();

	// To extract the correct 2 bits, we first shift the number
	//  to the right until the correct 2 bits are in the 2 LSBs,
	//  then mask the number with 11 (3) to remove the bits
	//  on the left
	unsigned int t_data = s_data[thid];
	unsigned int t_2bit_extract = (t_data >> input_shift_width) & 3;

	for (unsigned int i = 0; i < 4; ++i)
	{
		// Zero out s_mask_out
		s_mask_out[thid] = 0;
		if (thid == 0)
			s_mask_out[s_mask_out_len - 1] = 0;

		__syncthreads();

		// build bit mask output
		bool val_equals_i = false;
		if (cpy_idx < d_in_len)
		{
			val_equals_i = t_2bit_extract == i;
			s_mask_out[thid] = val_equals_i;
		}
		__syncthreads();

		// Scan mask outputs (Hillis-Steele)
		int partner = 0;
		unsigned int sum = 0;
		unsigned int max_steps = (unsigned int) log2f(max_elems_per_block);
		for (unsigned int d = 0; d < max_steps; d++) {
			partner = thid - (1 << d);
			if (partner >= 0) {
				sum = s_mask_out[thid] + s_mask_out[partner];
			}
			else {
				sum = s_mask_out[thid];
			}
			__syncthreads();
			s_mask_out[thid] = sum;
			__syncthreads();
		}

		// Shift elements to produce the same effect as exclusive scan
		unsigned int cpy_val = 0;
		cpy_val = s_mask_out[thid];
		__syncthreads();
		s_mask_out[thid + 1] = cpy_val;
		__syncthreads();

		if (thid == 0)
		{
			// Zero out first element to produce the same effect as exclusive scan
			s_mask_out[0] = 0;
			unsigned int total_sum = s_mask_out[s_mask_out_len - 1];
			s_mask_out_sums[i] = total_sum;
			d_block_sums[i * gridDim.x + blockIdx.x] = total_sum;
		}
		__syncthreads();

		if (val_equals_i && (cpy_idx < d_in_len))
		{
			s_merged_scan_mask_out[thid] = s_mask_out[thid];
		}

		__syncthreads();
	}

	// Scan mask output sums
	// Just do a naive scan since the array is really small
	if (thid == 0)
	{
		unsigned int run_sum = 0;
		for (unsigned int i = 0; i < 4; ++i)
		{
			s_scan_mask_out_sums[i] = run_sum;
			run_sum += s_mask_out_sums[i];
		}
	}

	__syncthreads();

	// Calculate the new indices of the input elements for sorting
	unsigned int t_prefix_sum = s_merged_scan_mask_out[thid];
	unsigned int new_pos = t_prefix_sum + s_scan_mask_out_sums[t_2bit_extract];

	// Every thread of the block must reach these barriers, so the shared
	//  memory shuffle is done under bounds checks rather than inside one
	//  divergent branch
	__syncthreads();

	// Shuffle the block's input elements to actually sort them
	// Do this step for greater global memory transfer coalescing
	//  in next step
	if (cpy_idx < d_in_len)
	{
		s_data[new_pos] = t_data;
		s_merged_scan_mask_out[new_pos] = t_prefix_sum;
	}

	__syncthreads();

	// Copy block-wise prefix sum results to global memory
	// Copy block-wise sort results to global memory
	if (cpy_idx < d_in_len)
	{
		d_prefix_sums[cpy_idx] = s_merged_scan_mask_out[thid];
		d_out_sorted[cpy_idx] = s_data[thid];
	}
}
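
// Worked example of the local addressing above, for one block holding the
//  2-bit digits [2, 0, 3, 1, 0, 2] (illustrative values, not from the repo):
//   per-digit exclusive prefix sums (s_merged_scan_mask_out) = [0, 0, 0, 0, 1, 1]
//   digit counts (s_mask_out_sums)                           = [2, 1, 2, 1]
//   exclusive scan of the counts (s_scan_mask_out_sums)      = [0, 2, 3, 5]
//   new_pos = prefix_sum + s_scan_mask_out_sums[digit]
//           = [3, 0, 5, 2, 1, 4]
//  which scatters the digits into the locally sorted order [0, 0, 1, 2, 2, 3].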

__global__ void gpu_glbl_shuffle(unsigned int* d_out,
	unsigned int* d_in,
	unsigned int* d_scan_block_sums,
	unsigned int* d_prefix_sums,
	unsigned int input_shift_width,
	unsigned int d_in_len,
	unsigned int max_elems_per_block)
{
	// get d = digit
	// get n = blockIdx
	// get m = local prefix sum array value
	// calculate global position = P_d[n] + m
	// copy input element to final position in d_out

	unsigned int thid = threadIdx.x;
	unsigned int cpy_idx = max_elems_per_block * blockIdx.x + thid;

	if (cpy_idx < d_in_len)
	{
		unsigned int t_data = d_in[cpy_idx];
		unsigned int t_2bit_extract = (t_data >> input_shift_width) & 3;
		unsigned int t_prefix_sum = d_prefix_sums[cpy_idx];
		unsigned int data_glbl_pos = d_scan_block_sums[t_2bit_extract * gridDim.x + blockIdx.x]
			+ t_prefix_sum;
		// No barrier needed here: each thread reads and writes
		//  independent locations
		d_out[data_glbl_pos] = t_data;
	}
}

// An attempt at the gpu radix sort variant described in this paper:
// https://vgc.poly.edu/~csilva/papers/cgf.pdf
void radix_sort(unsigned int* const d_out,
	unsigned int* const d_in,
	unsigned int d_in_len)
{
	unsigned int block_sz = MAX_BLOCK_SZ;
	unsigned int max_elems_per_block = block_sz;
	unsigned int grid_sz = d_in_len / max_elems_per_block;
	// Take advantage of the fact that integer division drops the decimals
	if (d_in_len % max_elems_per_block != 0)
		grid_sz += 1;

	unsigned int* d_prefix_sums;
	unsigned int d_prefix_sums_len = d_in_len;
	checkCudaErrors(cudaMalloc(&d_prefix_sums, sizeof(unsigned int) * d_prefix_sums_len));
	checkCudaErrors(cudaMemset(d_prefix_sums, 0, sizeof(unsigned int) * d_prefix_sums_len));

	unsigned int* d_block_sums;
	unsigned int d_block_sums_len = 4 * grid_sz; // 4-way split
	checkCudaErrors(cudaMalloc(&d_block_sums, sizeof(unsigned int) * d_block_sums_len));
	checkCudaErrors(cudaMemset(d_block_sums, 0, sizeof(unsigned int) * d_block_sums_len));

	unsigned int* d_scan_block_sums;
	checkCudaErrors(cudaMalloc(&d_scan_block_sums, sizeof(unsigned int) * d_block_sums_len));
	checkCudaErrors(cudaMemset(d_scan_block_sums, 0, sizeof(unsigned int) * d_block_sums_len));

	// shared memory consists of 3 arrays the size of the block-wise input
	//  and 2 arrays the size of n in the current n-way split (4)
	unsigned int s_data_len = max_elems_per_block;
	unsigned int s_mask_out_len = max_elems_per_block + 1;
	unsigned int s_merged_scan_mask_out_len = max_elems_per_block;
	unsigned int s_mask_out_sums_len = 4; // 4-way split
	unsigned int s_scan_mask_out_sums_len = 4;
	unsigned int shmem_sz = (s_data_len
		+ s_mask_out_len
		+ s_merged_scan_mask_out_len
		+ s_mask_out_sums_len
		+ s_scan_mask_out_sums_len)
		* sizeof(unsigned int);

	// for every 2 bits from LSB to MSB:
	//  block-wise radix sort (write blocks back to global memory)
	for (unsigned int shift_width = 0; shift_width <= 30; shift_width += 2)
	{
		gpu_radix_sort_local<<<grid_sz, block_sz, shmem_sz>>>(d_out,
			d_prefix_sums,
			d_block_sums,
			shift_width,
			d_in,
			d_in_len,
			max_elems_per_block);

		//unsigned int* h_test = new unsigned int[d_in_len];
		//checkCudaErrors(cudaMemcpy(h_test, d_in, sizeof(unsigned int) * d_in_len, cudaMemcpyDeviceToHost));
		//for (unsigned int i = 0; i < d_in_len; ++i)
		//	std::cout << h_test[i] << " ";
		//std::cout << std::endl;
		//delete[] h_test;

		// scan global block sum array
		sum_scan_blelloch(d_scan_block_sums, d_block_sums, d_block_sums_len);

		// scatter/shuffle block-wise sorted array to final positions
		gpu_glbl_shuffle<<<grid_sz, block_sz>>>(d_in,
			d_out,
			d_scan_block_sums,
			d_prefix_sums,
			shift_width,
			d_in_len,
			max_elems_per_block);
	}
	// Each pass block-sorts d_in's contents into d_out and scatters them back
	//  into d_in, so after the last pass the fully sorted data sits in d_in
	checkCudaErrors(cudaMemcpy(d_out, d_in, sizeof(unsigned int) * d_in_len, cudaMemcpyDeviceToDevice));

	checkCudaErrors(cudaFree(d_scan_block_sums));
	checkCudaErrors(cudaFree(d_block_sums));
	checkCudaErrors(cudaFree(d_prefix_sums));
}
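
The digit-major layout of d_block_sums is what makes the global scatter work: after the exclusive scan, entry [d * gridDim.x + n] holds the global position of the first element with digit d in block n. A small host-side model with invented counts (illustrative only, not code from the repo):

#include <cstdio>

int main()
{
    const unsigned int num_blocks = 2;
    // Digit-major block sums: entry [d * num_blocks + n] is the count of
    //  digit d in block n (block 0 = {2,1,2,1}, block 1 = {1,2,1,2})
    unsigned int block_sums[8] = { 2, 1,  1, 2,  2, 1,  1, 2 };
    unsigned int scan[8];
    unsigned int run = 0;
    for (int i = 0; i < 8; ++i) { scan[i] = run; run += block_sums[i]; } // exclusive scan
    // Global position = scan[d * num_blocks + n] + local prefix sum m,
    //  e.g. the first digit-1 element of block 0:
    unsigned int d = 1, n = 0, m = 0;
    std::printf("%u\n", scan[d * num_blocks + n] + m); // prints 3
}

The three digit-0 elements land at positions 0-2 (two from block 0, then one from block 1), so the first digit-1 element of block 0 lands at position 3, exactly what the scanned digit-major layout reports.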
--------------------------------------------------------------------------------
/radix_sort/sort.h:
--------------------------------------------------------------------------------
#ifndef SORT_H__
#define SORT_H__

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "scan.h"
#include <iostream>

void radix_sort(unsigned int* const d_out,
	unsigned int* const d_in,
	unsigned int d_in_len);

#endif
--------------------------------------------------------------------------------
/radix_sort/timer.h:
--------------------------------------------------------------------------------
// Originally from Udacity (https://www.udacity.com/course/intro-to-parallel-programming--cs344)
// Used only for educational purposes

#ifndef GPU_TIMER_H__
#define GPU_TIMER_H__

#include <cuda_runtime.h>

struct GpuTimer
{
	cudaEvent_t start;
	cudaEvent_t stop;

	GpuTimer()
	{
		cudaEventCreate(&start);
		cudaEventCreate(&stop);
	}

	~GpuTimer()
	{
		cudaEventDestroy(start);
		cudaEventDestroy(stop);
	}

	void Start()
	{
		cudaEventRecord(start, 0);
	}

	void Stop()
	{
		cudaEventRecord(stop, 0);
	}

	float Elapsed()
	{
		float elapsed;
		cudaEventSynchronize(stop);
		cudaEventElapsedTime(&elapsed, start, stop);
		return elapsed;
	}
};

#endif /* GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/radix_sort/utils.h:
--------------------------------------------------------------------------------
// Originally from Udacity (https://www.udacity.com/course/intro-to-parallel-programming--cs344)
// Used only for educational purposes

#ifndef UTILS_H__
#define UTILS_H__

#include <iostream>
#include <iomanip>
#include <algorithm>
#include <cmath>
#include <cassert>
#include <cstdlib>
#include <cuda_runtime.h>

#define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)

template <typename T>
void check(T err, const char* const func, const char* const file, const int line) {
	if (err != cudaSuccess) {
		std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
		std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
		exit(1);
	}
}

template <typename T>
void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
	// check that the GPU result matches the CPU result
	for (size_t i = 0; i < numElem; ++i) {
		if (ref[i] != gpu[i]) {
			std::cerr << "Difference at pos " << i << std::endl;
			// the + is magic to convert char to int without messing
			//  with other types
			std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
				"\nGPU : " << +gpu[i] << std::endl;
			exit(1);
		}
	}
}

template <typename T>
void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
	assert(eps1 >= 0 && eps2 >= 0);
	unsigned long long totalDiff = 0;
	unsigned numSmallDifferences = 0;
	for (size_t i = 0; i < numElem; ++i) {
		// subtract smaller from larger in case of unsigned types
#ifdef _WIN32
		T smaller = std::fmin(ref[i], gpu[i]);
		T larger = std::fmax(ref[i], gpu[i]);
#else
		T smaller = std::min(ref[i], gpu[i]);
		T larger = std::max(ref[i], gpu[i]);
#endif
		T diff = larger - smaller;
		if (diff > 0 && diff <= eps1) {
			numSmallDifferences++;
		}
		else if (diff > eps1) {
			std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
			std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
				"\nGPU : " << +gpu[i] << std::endl;
			exit(1);
		}
		totalDiff += diff * diff;
	}
	double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
	if (percentSmallDifferences > eps2) {
		std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
		std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
		exit(1);
	}
}

// Uses the Autodesk method of image comparison
// Note that the tolerance here is in PIXELS, not a percentage of input pixels
template <typename T>
void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
{
	size_t numBadPixels = 0;
	for (size_t i = 0; i < numElem; ++i) {
#ifdef _WIN32
		T smaller = std::fmin(ref[i], gpu[i]);
		T larger = std::fmax(ref[i], gpu[i]);
#else
		T smaller = std::min(ref[i], gpu[i]);
		T larger = std::max(ref[i], gpu[i]);
#endif
		T diff = larger - smaller;
		if (diff > variance)
			++numBadPixels;
	}

	if (numBadPixels > tolerance) {
		std::cerr << "Too many bad pixels in the image: " << numBadPixels << "/" << tolerance << std::endl;
		exit(1);
	}
}

#endif
--------------------------------------------------------------------------------