├── .gitignore
├── README.md
├── radix_sort.sln
└── radix_sort
    ├── Makefile
    ├── main.cu
    ├── radix_sort.vcxproj
    ├── scan.cu
    ├── scan.h
    ├── sort.cu
    ├── sort.h
    ├── timer.h
    └── utils.h

/.gitignore:
--------------------------------------------------------------------------------
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore

# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates

# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs

# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/

# Visual Studio 2015 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

# NUNIT
*.VisualState.xml
TestResult.xml

# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c

# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
**/Properties/launchSettings.json

*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc

# Chutzpah Test files
_Chutzpah*

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb

# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap

# TFS 2012 Local Workspace
$tf/

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user

# JustCode is a .NET coding add-in
.JustCode

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# Visual Studio code coverage results
*.coverage
*.coveragexml

# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*

# MightyMoose
*.mm.*
AutoTest.Net/

# Web workbench (sass)
.sass-cache/

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# TODO: Comment the next line if you want to checkin your web deploy settings
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj

# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/

# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/packages/*
# except build/, which is used as an MSBuild target.
!**/packages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/packages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets

# Microsoft Azure Build Output
csx/
*.build.csdef

# Microsoft Azure Emulator
ecf/
rcf/

# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt

# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/

# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs

# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm

# SQL Server files
*.mdf
*.ldf
*.ndf

# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings

# Microsoft Fakes
FakesAssemblies/

# GhostDoc plugin setting file
*.GhostDoc.xml

# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/

# Typescript v1 declaration files
typings/

# Visual Studio 6 build log
*.plg

# Visual Studio 6 workspace options file
*.opt

# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw

# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions

# Paket dependency manager
.paket/paket.exe
paket-files/

# FAKE - F# Make
.fake/

# JetBrains Rider
.idea/
*.sln.iml

# CodeRush
.cr/

# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc

# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config

# Telerik's JustMock configuration file
*.jmconfig

# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# GPU Radix Sort
CUDA implementation of parallel radix sort using Blelloch scan
- Implementation of the 4-way radix sort described in this [paper by Ha, Krüger, and Silva](https://vgc.poly.edu/~csilva/papers/cgf.pdf)
- 2 bits per pass, resulting in a 4-way split each pass
- No per-pass order checking (early exit) yet
- Each block's internal scans use Hillis-Steele instead of Blelloch, since each internal scan's input is roughly the same size as the number of threads per block. At that scale, Hillis-Steele's higher work complexity is a worthwhile trade for a span half that of Blelloch's.
- Each block sorts its own local portion of the global array for greater memory coalescing during global shuffles
- Prefix summing the global block sums uses the [large-scale bank-conflict-free Blelloch scan](https://github.com/mark-poscablo/gpu-prefix-sum), which in turn uses the padded addressing solution for bank conflicts described in this [presentation by Mark Harris](https://www.mimuw.edu.pl/~ps209291/kgkp/slides/scan.pdf)
- For **randomly ordered** 134 million unsigned ints, **this outperforms** `std::sort()` by about **9.84x**
- For **descending ordered** 134 million unsigned ints, **this outperforms** `std::sort()` by about **1.30x**
- The results above were observed on a p2.xlarge AWS instance running the NVIDIA CUDA Toolkit 7.5 AMI, equipped with 12 EC2 Compute Units (4 virtual cores) and 1 NVIDIA K80 (GK210) GPU.
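
For reference, here is a minimal sequential sketch (not part of this repo) of what one pass computes: a stable 4-way counting sort keyed on bits `[shift, shift+1]`. Looping `shift = 0, 2, ..., 30` over it reproduces the ordering the kernels produce in parallel, where the histogram and its exclusive scan are built with block-wise and global scans instead of serial loops:

```cpp
#include <cstddef>
#include <vector>

// One stable counting-sort pass keyed on 2 bits of each element.
std::vector<unsigned int> radix_pass_2bit(const std::vector<unsigned int>& in,
                                          unsigned int shift)
{
    std::size_t counts[4] = {};                // histogram of the 4 digit values
    for (unsigned int v : in)
        ++counts[(v >> shift) & 3];
    std::size_t offsets[4];                    // exclusive scan of the histogram
    std::size_t run = 0;
    for (int d = 0; d < 4; ++d) { offsets[d] = run; run += counts[d]; }
    std::vector<unsigned int> out(in.size());
    for (unsigned int v : in)                  // stable scatter by digit
        out[offsets[(v >> shift) & 3]++] = v;
    return out;
}
```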
--------------------------------------------------------------------------------
/radix_sort.sln:
--------------------------------------------------------------------------------

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25420.1
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "radix_sort", "radix_sort\radix_sort.vcxproj", "{C5311F0C-578C-44CA-952B-6FBE97DDF80D}"
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
		Debug|x64 = Debug|x64
		Debug|x86 = Debug|x86
		Release|x64 = Release|x64
		Release|x86 = Release|x86
	EndGlobalSection
	GlobalSection(ProjectConfigurationPlatforms) = postSolution
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Debug|x64.ActiveCfg = Debug|x64
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Debug|x64.Build.0 = Debug|x64
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Debug|x86.ActiveCfg = Debug|Win32
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Debug|x86.Build.0 = Debug|Win32
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Release|x64.ActiveCfg = Release|x64
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Release|x64.Build.0 = Release|x64
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Release|x86.ActiveCfg = Release|Win32
		{C5311F0C-578C-44CA-952B-6FBE97DDF80D}.Release|x86.Build.0 = Release|Win32
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE
	EndGlobalSection
EndGlobal
--------------------------------------------------------------------------------
/radix_sort/Makefile:
--------------------------------------------------------------------------------
CUDA_PATH ?= "/usr/local/cuda-7.5"
NVCC := $(CUDA_PATH)/bin/nvcc
NVCC_OPTS=-O3 -arch=sm_37 -Xcompiler -Wall -Xcompiler -Wextra -m64

radix_sort: main.cu scan.o sort.o Makefile
	$(NVCC) -o radix_sort main.cu sort.o scan.o $(NVCC_OPTS)

sort.o: sort.cu
	$(NVCC) -c sort.cu $(NVCC_OPTS)

scan.o: scan.cu
	$(NVCC) -c scan.cu $(NVCC_OPTS)

clean:
	rm -f *.o radix_sort
--------------------------------------------------------------------------------
/radix_sort/main.cu:
--------------------------------------------------------------------------------
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <algorithm>
#include <ctime>
#include <cstdlib>
#include <iostream>

#include "sort.h"
#include "utils.h"

void cpu_sort(unsigned int* h_out, unsigned int* h_in, size_t len)
{
	for (size_t i = 0; i < len; ++i)
	{
		h_out[i] = h_in[i];
	}

	std::sort(h_out, h_out + len);
}

void test_cpu_vs_gpu(unsigned int* h_in, unsigned int num_elems)
{
	std::clock_t start;

	unsigned int* h_out_cpu = new unsigned int[num_elems];
	unsigned int* h_out_gpu = new unsigned int[num_elems];

	start = std::clock();
	cpu_sort(h_out_cpu, h_in, num_elems);
	double cpu_duration = (std::clock() - start) / (double)CLOCKS_PER_SEC;
	std::cout << "CPU time: " << cpu_duration << " s" << std::endl;

	unsigned int* d_in;
	unsigned int* d_out;
	checkCudaErrors(cudaMalloc(&d_in, sizeof(unsigned int) * num_elems));
	checkCudaErrors(cudaMalloc(&d_out, sizeof(unsigned int) * num_elems));
	checkCudaErrors(cudaMemcpy(d_in, h_in, sizeof(unsigned int) * num_elems, cudaMemcpyHostToDevice));
	start = std::clock();
	radix_sort(d_out, d_in, num_elems);
	// Kernel launches return asynchronously, so wait for the sort
	//  to actually finish before stopping the clock
	checkCudaErrors(cudaDeviceSynchronize());
	double gpu_duration = (std::clock() - start) / (double)CLOCKS_PER_SEC;
	std::cout << "GPU time: " << gpu_duration << " s" << std::endl;
	checkCudaErrors(cudaMemcpy(h_out_gpu, d_out, sizeof(unsigned int) * num_elems, cudaMemcpyDeviceToHost));
	checkCudaErrors(cudaFree(d_out));
	checkCudaErrors(cudaFree(d_in));

	// Calculate speedup as CPU time / GPU time
	std::cout << "Speedup: " << cpu_duration / gpu_duration << "x" << std::endl;

	// Check for any mismatches between outputs of CPU and GPU
	bool match = true;
	int index_diff = 0;
	for (unsigned int i = 0; i < num_elems; ++i)
	{
		if (h_out_cpu[i] != h_out_gpu[i])
		{
			match = false;
			index_diff = i;
			break;
		}
	}
	std::cout << "Match: " << match << std::endl;

	// Detail the mismatch if any
	if (!match)
	{
		std::cout << "Difference in index: " << index_diff << std::endl;
		std::cout << "CPU: " << h_out_cpu[index_diff] << std::endl;
		std::cout << "GPU Radix Sort: " << h_out_gpu[index_diff] << std::endl;
		int window_sz = 10;

		std::cout << "Contents: " << std::endl;
		std::cout << "CPU: ";
		for (int i = -(window_sz / 2); i < (window_sz / 2); ++i)
		{
			std::cout << h_out_cpu[index_diff + i] << ", ";
		}
		std::cout << std::endl;
		std::cout << "GPU Radix Sort: ";
		for (int i = -(window_sz / 2); i < (window_sz / 2); ++i)
		{
			std::cout << h_out_gpu[index_diff + i] << ", ";
		}
		std::cout << std::endl;
	}

	delete[] h_out_gpu;
	delete[] h_out_cpu;
}

int main()
{
	// Seed the RNG so that every run sorts the same random input
	srand(1);

	for (int i = 27; i < 28; ++i)
	{
		unsigned int num_elems = (1 << i);
		//unsigned int num_elems = 8;
		std::cout << "h_in size: " << num_elems << std::endl;

		unsigned int* h_in = new unsigned int[num_elems];
		unsigned int* h_in_rand = new unsigned int[num_elems];

		for (unsigned int j = 0; j < num_elems; j++)
		{
			h_in[j] = (num_elems - 1) - j;
			h_in_rand[j] = rand() % num_elems;
			//std::cout << h_in[j] << " ";
		}
		//std::cout << std::endl;

		std::cout << "*** i: " << i << " ***" << std::endl;
		for (int j = 0; j < 5; ++j) {
			std::cout << "*****Descending order*****" << std::endl;
			test_cpu_vs_gpu(h_in, num_elems);
			std::cout << "*****Random order*****" << std::endl;
			test_cpu_vs_gpu(h_in_rand, num_elems);
			std::cout << std::endl;
		}

		delete[] h_in;
		delete[] h_in_rand;

		std::cout << std::endl;
	}
}
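
A note on timing: the std::clock bracket above measures host wall time and needs the cudaDeviceSynchronize shown to be meaningful. Alternatively, timer.h (already in the repo, currently unused) provides a cudaEvent-based GpuTimer that is immune to launch asynchrony. A hypothetical helper, not wired into the build, sketching that approach:

#include "sort.h"
#include "timer.h"

// Times radix_sort() on the device itself using the repo's GpuTimer;
// Elapsed() synchronizes on the stop event and returns milliseconds.
float time_radix_sort_ms(unsigned int* d_out, unsigned int* d_in,
                         unsigned int num_elems)
{
    GpuTimer timer;
    timer.Start();
    radix_sort(d_out, d_in, num_elems);
    timer.Stop();
    return timer.Elapsed();
}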
--------------------------------------------------------------------------------
/radix_sort/radix_sort.vcxproj:
--------------------------------------------------------------------------------
[MSBuild XML project file; the markup was lost in extraction and only element text survives. Recoverable settings: Debug and Release configurations for Win32 and x64; project GUID {C5311F0C-578C-44CA-952B-6FBE97DDF80D}; console Application using toolset v140 with the MultiByte character set; warning level Level3, with Disabled optimization and _DEBUG definitions for Debug, and MaxSpeed with NDEBUG for Release (x64 configurations add WIN64); links cudart.lib plus the standard Windows libraries; a post-build step copies "$(CudaToolkitBinDir)\cudart*.dll" to "$(OutDir)"; the x64 configurations build 64-bit CUDA device code for compute_50,sm_50.]
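
scan.cu below implements Blelloch's work-efficient exclusive scan per block. As a reference for the upsweep/downsweep structure, a minimal sequential sketch (illustrative only, not code from the repo; assumes a power-of-two length and omits the bank-conflict padding):

#include <cstddef>

// Sequential model of the Blelloch exclusive scan that gpu_prescan
//  performs per block.
void blelloch_exclusive_scan(unsigned int* data, std::size_t n)
{
    // Upsweep (reduce): build partial sums up a binary tree
    for (std::size_t stride = 1; stride < n; stride <<= 1)
        for (std::size_t i = 2 * stride - 1; i < n; i += 2 * stride)
            data[i] += data[i - stride];
    data[n - 1] = 0; // clear the root
    // Downsweep: push prefix sums back down the tree
    for (std::size_t stride = n / 2; stride >= 1; stride >>= 1)
        for (std::size_t i = 2 * stride - 1; i < n; i += 2 * stride)
        {
            unsigned int t = data[i - stride];
            data[i - stride] = data[i];  // pass the prefix down to the left child
            data[i] += t;                // right child gets prefix + left subtree sum
        }
}

For example, [1, 2, 3, 4] upsweeps to [1, 3, 3, 10], and after clearing the root the downsweep yields the exclusive prefix sums [0, 1, 3, 6].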
--------------------------------------------------------------------------------
/radix_sort/scan.cu:
--------------------------------------------------------------------------------
#include "scan.h"

#define MAX_BLOCK_SZ 128
#define NUM_BANKS 32
#define LOG_NUM_BANKS 5

//#define ZERO_BANK_CONFLICTS

#ifdef ZERO_BANK_CONFLICTS
// Padding of index/NUM_BANKS + index/(NUM_BANKS^2), parenthesized so the
//  shifts happen before the addition
#define CONFLICT_FREE_OFFSET(n) \
	(((n) >> LOG_NUM_BANKS) + ((n) >> (2 * LOG_NUM_BANKS)))
#else
#define CONFLICT_FREE_OFFSET(n) ((n) >> LOG_NUM_BANKS)
#endif

__global__
void gpu_add_block_sums(unsigned int* const d_out,
	const unsigned int* const d_in,
	unsigned int* const d_block_sums,
	const size_t numElems)
{
	//unsigned int glbl_t_idx = blockDim.x * blockIdx.x + threadIdx.x;
	unsigned int d_block_sum_val = d_block_sums[blockIdx.x];

	//unsigned int d_in_val_0 = 0;
	//unsigned int d_in_val_1 = 0;

	// Simple implementation's performance is not significantly (if at all)
	//  better than the previous, more verbose implementation below
	unsigned int cpy_idx = 2 * blockIdx.x * blockDim.x + threadIdx.x;
	if (cpy_idx < numElems)
	{
		d_out[cpy_idx] = d_in[cpy_idx] + d_block_sum_val;
		if (cpy_idx + blockDim.x < numElems)
			d_out[cpy_idx + blockDim.x] = d_in[cpy_idx + blockDim.x] + d_block_sum_val;
	}

	//if (2 * glbl_t_idx < numElems)
	//{
	//	d_out[2 * glbl_t_idx] = d_in[2 * glbl_t_idx] + d_block_sum_val;
	//	if (2 * glbl_t_idx + 1 < numElems)
	//		d_out[2 * glbl_t_idx + 1] = d_in[2 * glbl_t_idx + 1] + d_block_sum_val;
	//}

	//if (2 * glbl_t_idx < numElems)
	//{
	//	d_in_val_0 = d_in[2 * glbl_t_idx];
	//	if (2 * glbl_t_idx + 1 < numElems)
	//		d_in_val_1 = d_in[2 * glbl_t_idx + 1];
	//}
	//else
	//	return;
	//__syncthreads();

	//d_out[2 * glbl_t_idx] = d_in_val_0 + d_block_sum_val;
	//if (2 * glbl_t_idx + 1 < numElems)
	//	d_out[2 * glbl_t_idx + 1] = d_in_val_1 + d_block_sum_val;
}

// Modified version of Mark Harris' implementation of the Blelloch scan
//  according to https://www.mimuw.edu.pl/~ps209291/kgkp/slides/scan.pdf
__global__
void gpu_prescan(unsigned int* const d_out,
	const unsigned int* const d_in,
	unsigned int* const d_block_sums,
	const unsigned int len,
	const unsigned int shmem_sz,
	const unsigned int max_elems_per_block)
{
	// Allocated on invocation
	extern __shared__ unsigned int s_out[];

	int thid = threadIdx.x;
	int ai = thid;
	int bi = thid + blockDim.x;

	// Zero out the shared memory
	// Helpful especially when input size is not a power of two
	s_out[thid] = 0;
	s_out[thid + blockDim.x] = 0;
	// If CONFLICT_FREE_OFFSET is used, shared memory size
	//  must be 2 * blockDim.x + blockDim.x / num_banks
	s_out[thid + blockDim.x + (blockDim.x >> LOG_NUM_BANKS)] = 0;

	__syncthreads();

	// Copy d_in to shared memory
	// Note that d_in's elements are scattered into shared memory
	//  so as to avoid bank conflicts
	unsigned int cpy_idx = max_elems_per_block * blockIdx.x + threadIdx.x;
	if (cpy_idx < len)
	{
		s_out[ai + CONFLICT_FREE_OFFSET(ai)] = d_in[cpy_idx];
		if (cpy_idx + blockDim.x < len)
			s_out[bi + CONFLICT_FREE_OFFSET(bi)] = d_in[cpy_idx + blockDim.x];
	}

	// For both upsweep and downsweep:
	// Sequential indices with conflict-free padding
	//  Amount of padding = target index / num banks
	//  This "shifts" the target indices by one every multiple
	//   of the num banks
	// offset controls the stride and starting index of
	//  target elems at every iteration
	// d just controls which threads are active
	// Sweeps are pivoted on the last element of shared memory

	// Upsweep/Reduce step
	int offset = 1;
	for (int d = max_elems_per_block >> 1; d > 0; d >>= 1)
	{
		__syncthreads();

		if (thid < d)
		{
			int ai = offset * ((thid << 1) + 1) - 1;
			int bi = offset * ((thid << 1) + 2) - 1;
			ai += CONFLICT_FREE_OFFSET(ai);
			bi += CONFLICT_FREE_OFFSET(bi);

			s_out[bi] += s_out[ai];
		}
		offset <<= 1;
	}

	// Save the total sum on the global block sums array
	// Then clear the last element on the shared memory
	if (thid == 0)
	{
		d_block_sums[blockIdx.x] = s_out[max_elems_per_block - 1
			+ CONFLICT_FREE_OFFSET(max_elems_per_block - 1)];
		s_out[max_elems_per_block - 1
			+ CONFLICT_FREE_OFFSET(max_elems_per_block - 1)] = 0;
	}

	// Downsweep step
	for (int d = 1; d < max_elems_per_block; d <<= 1)
	{
		offset >>= 1;
		__syncthreads();

		if (thid < d)
		{
			int ai = offset * ((thid << 1) + 1) - 1;
			int bi = offset * ((thid << 1) + 2) - 1;
			ai += CONFLICT_FREE_OFFSET(ai);
			bi += CONFLICT_FREE_OFFSET(bi);

			unsigned int temp = s_out[ai];
			s_out[ai] = s_out[bi];
			s_out[bi] += temp;
		}
	}
	__syncthreads();

	// Copy contents of shared memory to global memory
	if (cpy_idx < len)
	{
		d_out[cpy_idx] = s_out[ai + CONFLICT_FREE_OFFSET(ai)];
		if (cpy_idx + blockDim.x < len)
			d_out[cpy_idx + blockDim.x] = s_out[bi + CONFLICT_FREE_OFFSET(bi)];
	}
}

void sum_scan_blelloch(unsigned int* const d_out,
	const unsigned int* const d_in,
	const size_t numElems)
{
	// Zero out d_out
	checkCudaErrors(cudaMemset(d_out, 0, numElems * sizeof(unsigned int)));

	// Set up number of threads and blocks

	unsigned int block_sz = MAX_BLOCK_SZ / 2;
	unsigned int max_elems_per_block = 2 * block_sz; // due to binary tree nature of algorithm

	// If input size is not power of two, the remainder will still need a whole block
	// Thus, number of blocks must be the ceiling of input size / max elems that a block can handle
	//unsigned int grid_sz = (unsigned int) std::ceil((double) numElems / (double) max_elems_per_block);
	// UPDATE: Instead of using ceiling and risking miscalculation due to precision, just automatically
	//  add 1 to the grid size when the input size cannot be divided cleanly by the block's capacity
	unsigned int grid_sz = numElems / max_elems_per_block;
	// Take advantage of the fact that integer division drops the decimals
	if (numElems % max_elems_per_block != 0)
		grid_sz += 1;

	// Conflict-free padding requires that shared memory be more than 2 * block_sz
	unsigned int shmem_sz = max_elems_per_block + ((max_elems_per_block) >> LOG_NUM_BANKS);

	// Allocate memory for array of total sums produced by each block
	// Array length must be the same as number of blocks
	unsigned int* d_block_sums;
	checkCudaErrors(cudaMalloc(&d_block_sums, sizeof(unsigned int) * grid_sz));
	checkCudaErrors(cudaMemset(d_block_sums, 0, sizeof(unsigned int) * grid_sz));

	// Sum scan data allocated to each block
	//gpu_sum_scan_blelloch<<<grid_sz, block_sz, sizeof(unsigned int) * max_elems_per_block>>>(d_out, d_in, d_block_sums, numElems);
	gpu_prescan<<<grid_sz, block_sz, sizeof(unsigned int) * shmem_sz>>>(d_out,
		d_in,
		d_block_sums,
		numElems,
		shmem_sz,
		max_elems_per_block);

	// Sum scan total sums produced by each block
	// Use basic implementation if number of total sums is <= 2 * block_sz
	//  (This requires only one block to do the scan)
	if (grid_sz <= max_elems_per_block)
	{
		unsigned int* d_dummy_blocks_sums;
		checkCudaErrors(cudaMalloc(&d_dummy_blocks_sums, sizeof(unsigned int)));
		checkCudaErrors(cudaMemset(d_dummy_blocks_sums, 0, sizeof(unsigned int)));
		//gpu_sum_scan_blelloch<<<1, block_sz, sizeof(unsigned int) * max_elems_per_block>>>(d_block_sums, d_block_sums, d_dummy_blocks_sums, grid_sz);
		gpu_prescan<<<1, block_sz, sizeof(unsigned int) * shmem_sz>>>(d_block_sums,
			d_block_sums,
			d_dummy_blocks_sums,
			grid_sz,
			shmem_sz,
			max_elems_per_block);
		checkCudaErrors(cudaFree(d_dummy_blocks_sums));
	}
	// Else, recurse on this same function as you'll need the full-blown scan
	//  for the block sums
	else
	{
		unsigned int* d_in_block_sums;
		checkCudaErrors(cudaMalloc(&d_in_block_sums, sizeof(unsigned int) * grid_sz));
		checkCudaErrors(cudaMemcpy(d_in_block_sums, d_block_sums, sizeof(unsigned int) * grid_sz, cudaMemcpyDeviceToDevice));
		sum_scan_blelloch(d_block_sums, d_in_block_sums, grid_sz);
		checkCudaErrors(cudaFree(d_in_block_sums));
	}

	//// Uncomment to examine block sums
	//unsigned int* h_block_sums = new unsigned int[grid_sz];
	//checkCudaErrors(cudaMemcpy(h_block_sums, d_block_sums, sizeof(unsigned int) * grid_sz, cudaMemcpyDeviceToHost));
	//std::cout << "Block sums: ";
	//for (int i = 0; i < grid_sz; ++i)
	//{
	//	std::cout << h_block_sums[i] << ", ";
	//}
	//std::cout << std::endl;
	//std::cout << "Block sums length: " << grid_sz << std::endl;
	//delete[] h_block_sums;

	// Add each block's total sum to its scan output
	//  in order to get the final, global scanned array
	gpu_add_block_sums<<<grid_sz, block_sz>>>(d_out, d_out, d_block_sums, numElems);

	checkCudaErrors(cudaFree(d_block_sums));
}
--------------------------------------------------------------------------------
/radix_sort/scan.h:
--------------------------------------------------------------------------------
#ifndef SCAN_H__
#define SCAN_H__

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>

#include "utils.h"
#include "timer.h"

void sum_scan_naive(unsigned int* const d_out,
	const unsigned int* const d_in,
	const size_t numElems);

void sum_scan_blelloch(unsigned int* const d_out,
	const unsigned int* const d_in,
	const size_t numElems);

#endif
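
A quick illustration of the padded addressing the scan above relies on: with 32 banks, shared-memory word i lives in bank i % 32, so the tree sweeps' strided accesses to indices 0, 32, 64, ... would all hit bank 0 and serialize. CONFLICT_FREE_OFFSET spreads them across banks (standalone snippet for illustration, not code from the repo):

#define NUM_BANKS 32
#define LOG_NUM_BANKS 5
#define CONFLICT_FREE_OFFSET(n) ((n) >> LOG_NUM_BANKS)

// Logical indices 0, 32, 64, ... all map to bank 0 unpadded;
//  padded, they land at 0, 33, 66, ... i.e. banks 0, 1, 2, ...
unsigned int padded_index(unsigned int i)
{
    return i + CONFLICT_FREE_OFFSET(i);
}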
--------------------------------------------------------------------------------
/radix_sort/sort.cu:
--------------------------------------------------------------------------------
#include "sort.h"

#define MAX_BLOCK_SZ 128

__global__ void gpu_radix_sort_local(unsigned int* d_out_sorted,
	unsigned int* d_prefix_sums,
	unsigned int* d_block_sums,
	unsigned int input_shift_width,
	unsigned int* d_in,
	unsigned int d_in_len,
	unsigned int max_elems_per_block)
{
	// need shared memory array for:
	// - block's share of the input data (local sort will be put here too)
	// - mask outputs
	// - scanned mask outputs
	// - merged scanned mask outputs ("local prefix sum")
	// - local sums of scanned mask outputs
	// - scanned local sums of scanned mask outputs

	// for all radix combinations:
	//  build mask output for current radix combination
	//  scan mask output
	//  store needed value from current prefix sum array to merged prefix sum array
	//  store total sum of mask output (obtained from scan) to global block sum array
	// calculate local sorted address from local prefix sum and scanned mask output's total sums
	// shuffle input block according to calculated local sorted addresses
	// shuffle local prefix sums according to calculated local sorted addresses
	// copy locally sorted array back to global memory
	// copy local prefix sum array back to global memory

	extern __shared__ unsigned int shmem[];
	unsigned int* s_data = shmem;
	// s_mask_out[] will be scanned in place
	unsigned int s_mask_out_len = max_elems_per_block + 1;
	unsigned int* s_mask_out = &s_data[max_elems_per_block];
	unsigned int* s_merged_scan_mask_out = &s_mask_out[s_mask_out_len];
	unsigned int* s_mask_out_sums = &s_merged_scan_mask_out[max_elems_per_block];
	unsigned int* s_scan_mask_out_sums = &s_mask_out_sums[4];

	unsigned int thid = threadIdx.x;

	// Copy block's portion of global input data to shared memory
	unsigned int cpy_idx = max_elems_per_block * blockIdx.x + thid;
	if (cpy_idx < d_in_len)
		s_data[thid] = d_in[cpy_idx];
	else
		s_data[thid] = 0;

	__syncthreads();

	// To extract the correct 2 bits, we first shift the number
	//  to the right until the correct 2 bits are in the 2 LSBs,
	//  then mask the number with 11 (3) to remove the bits
	//  on the left
	unsigned int t_data = s_data[thid];
	unsigned int t_2bit_extract = (t_data >> input_shift_width) & 3;

	for (unsigned int i = 0; i < 4; ++i)
	{
		// Zero out s_mask_out
		s_mask_out[thid] = 0;
		if (thid == 0)
			s_mask_out[s_mask_out_len - 1] = 0;

		__syncthreads();

		// build bit mask output
		bool val_equals_i = false;
		if (cpy_idx < d_in_len)
		{
			val_equals_i = t_2bit_extract == i;
			s_mask_out[thid] = val_equals_i;
		}
		__syncthreads();

		// Scan mask outputs (Hillis-Steele)
		int partner = 0;
		unsigned int sum = 0;
		unsigned int max_steps = (unsigned int) log2f(max_elems_per_block);
		for (unsigned int d = 0; d < max_steps; d++) {
			partner = thid - (1 << d);
			if (partner >= 0) {
				sum = s_mask_out[thid] + s_mask_out[partner];
			}
			else {
				sum = s_mask_out[thid];
			}
			__syncthreads();
			s_mask_out[thid] = sum;
			__syncthreads();
		}

		// Shift elements to produce the same effect as exclusive scan
		unsigned int cpy_val = 0;
		cpy_val = s_mask_out[thid];
		__syncthreads();
		s_mask_out[thid + 1] = cpy_val;
		__syncthreads();

		if (thid == 0)
		{
			// Zero out first element to produce the same effect as exclusive scan
			s_mask_out[0] = 0;
			unsigned int total_sum = s_mask_out[s_mask_out_len - 1];
			s_mask_out_sums[i] = total_sum;
			d_block_sums[i * gridDim.x + blockIdx.x] = total_sum;
		}
		__syncthreads();

		if (val_equals_i && (cpy_idx < d_in_len))
		{
			s_merged_scan_mask_out[thid] = s_mask_out[thid];
		}

		__syncthreads();
	}

	// Scan mask output sums
	// Just do a naive scan since the array is really small
	if (thid == 0)
	{
		unsigned int run_sum = 0;
		for (unsigned int i = 0; i < 4; ++i)
		{
			s_scan_mask_out_sums[i] = run_sum;
			run_sum += s_mask_out_sums[i];
		}
	}

	__syncthreads();

	// Calculate the new indices of the input elements for sorting
	unsigned int t_prefix_sum = s_merged_scan_mask_out[thid];
	unsigned int new_pos = t_prefix_sum + s_scan_mask_out_sums[t_2bit_extract];

	// Every thread of the block must reach these barriers, so the shared
	//  memory shuffle is done under bounds checks rather than inside one
	//  divergent branch
	__syncthreads();

	// Shuffle the block's input elements to actually sort them
	// Do this step for greater global memory transfer coalescing
	//  in next step
	if (cpy_idx < d_in_len)
	{
		s_data[new_pos] = t_data;
		s_merged_scan_mask_out[new_pos] = t_prefix_sum;
	}

	__syncthreads();

	// Copy block-wise prefix sum results to global memory
	// Copy block-wise sort results to global memory
	if (cpy_idx < d_in_len)
	{
		d_prefix_sums[cpy_idx] = s_merged_scan_mask_out[thid];
		d_out_sorted[cpy_idx] = s_data[thid];
	}
}
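
// Worked example of the local addressing above, for one block holding the
//  2-bit digits [2, 0, 3, 1, 0, 2] (illustrative values, not from the repo):
//   per-digit exclusive prefix sums (s_merged_scan_mask_out) = [0, 0, 0, 0, 1, 1]
//   digit counts (s_mask_out_sums)                           = [2, 1, 2, 1]
//   exclusive scan of the counts (s_scan_mask_out_sums)      = [0, 2, 3, 5]
//   new_pos = prefix_sum + s_scan_mask_out_sums[digit]
//           = [3, 0, 5, 2, 1, 4]
//  which scatters the digits into the locally sorted order [0, 0, 1, 2, 2, 3].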

__global__ void gpu_glbl_shuffle(unsigned int* d_out,
	unsigned int* d_in,
	unsigned int* d_scan_block_sums,
	unsigned int* d_prefix_sums,
	unsigned int input_shift_width,
	unsigned int d_in_len,
	unsigned int max_elems_per_block)
{
	// get d = digit
	// get n = blockIdx
	// get m = local prefix sum array value
	// calculate global position = P_d[n] + m
	// copy input element to final position in d_out

	unsigned int thid = threadIdx.x;
	unsigned int cpy_idx = max_elems_per_block * blockIdx.x + thid;

	if (cpy_idx < d_in_len)
	{
		unsigned int t_data = d_in[cpy_idx];
		unsigned int t_2bit_extract = (t_data >> input_shift_width) & 3;
		unsigned int t_prefix_sum = d_prefix_sums[cpy_idx];
		unsigned int data_glbl_pos = d_scan_block_sums[t_2bit_extract * gridDim.x + blockIdx.x]
			+ t_prefix_sum;
		// No barrier needed here: each thread reads and writes
		//  independent locations
		d_out[data_glbl_pos] = t_data;
	}
}

// An attempt at the gpu radix sort variant described in this paper:
// https://vgc.poly.edu/~csilva/papers/cgf.pdf
void radix_sort(unsigned int* const d_out,
	unsigned int* const d_in,
	unsigned int d_in_len)
{
	unsigned int block_sz = MAX_BLOCK_SZ;
	unsigned int max_elems_per_block = block_sz;
	unsigned int grid_sz = d_in_len / max_elems_per_block;
	// Take advantage of the fact that integer division drops the decimals
	if (d_in_len % max_elems_per_block != 0)
		grid_sz += 1;

	unsigned int* d_prefix_sums;
	unsigned int d_prefix_sums_len = d_in_len;
	checkCudaErrors(cudaMalloc(&d_prefix_sums, sizeof(unsigned int) * d_prefix_sums_len));
	checkCudaErrors(cudaMemset(d_prefix_sums, 0, sizeof(unsigned int) * d_prefix_sums_len));

	unsigned int* d_block_sums;
	unsigned int d_block_sums_len = 4 * grid_sz; // 4-way split
	checkCudaErrors(cudaMalloc(&d_block_sums, sizeof(unsigned int) * d_block_sums_len));
	checkCudaErrors(cudaMemset(d_block_sums, 0, sizeof(unsigned int) * d_block_sums_len));

	unsigned int* d_scan_block_sums;
	checkCudaErrors(cudaMalloc(&d_scan_block_sums, sizeof(unsigned int) * d_block_sums_len));
	checkCudaErrors(cudaMemset(d_scan_block_sums, 0, sizeof(unsigned int) * d_block_sums_len));

	// shared memory consists of 3 arrays the size of the block-wise input
	//  and 2 arrays the size of n in the current n-way split (4)
	unsigned int s_data_len = max_elems_per_block;
	unsigned int s_mask_out_len = max_elems_per_block + 1;
	unsigned int s_merged_scan_mask_out_len = max_elems_per_block;
	unsigned int s_mask_out_sums_len = 4; // 4-way split
	unsigned int s_scan_mask_out_sums_len = 4;
	unsigned int shmem_sz = (s_data_len
		+ s_mask_out_len
		+ s_merged_scan_mask_out_len
		+ s_mask_out_sums_len
		+ s_scan_mask_out_sums_len)
		* sizeof(unsigned int);

	// for every 2 bits from LSB to MSB:
	//  block-wise radix sort (write blocks back to global memory)
	for (unsigned int shift_width = 0; shift_width <= 30; shift_width += 2)
	{
		gpu_radix_sort_local<<<grid_sz, block_sz, shmem_sz>>>(d_out,
			d_prefix_sums,
			d_block_sums,
			shift_width,
			d_in,
			d_in_len,
			max_elems_per_block);

		//unsigned int* h_test = new unsigned int[d_in_len];
		//checkCudaErrors(cudaMemcpy(h_test, d_in, sizeof(unsigned int) * d_in_len, cudaMemcpyDeviceToHost));
		//for (unsigned int i = 0; i < d_in_len; ++i)
		//	std::cout << h_test[i] << " ";
		//std::cout << std::endl;
		//delete[] h_test;

		// scan global block sum array
		sum_scan_blelloch(d_scan_block_sums, d_block_sums, d_block_sums_len);

		// scatter/shuffle block-wise sorted array to final positions
		gpu_glbl_shuffle<<<grid_sz, block_sz>>>(d_in,
			d_out,
			d_scan_block_sums,
			d_prefix_sums,
			shift_width,
			d_in_len,
			max_elems_per_block);
	}
	// Each pass block-sorts d_in's contents into d_out and scatters them back
	//  into d_in, so after the last pass the fully sorted data sits in d_in
	checkCudaErrors(cudaMemcpy(d_out, d_in, sizeof(unsigned int) * d_in_len, cudaMemcpyDeviceToDevice));

	checkCudaErrors(cudaFree(d_scan_block_sums));
	checkCudaErrors(cudaFree(d_block_sums));
	checkCudaErrors(cudaFree(d_prefix_sums));
}
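
The digit-major layout of d_block_sums is what makes the global scatter work: after the exclusive scan, entry [d * gridDim.x + n] holds the global position of the first element with digit d in block n. A small host-side model with invented counts (illustrative only, not code from the repo):

#include <cstdio>

int main()
{
    const unsigned int num_blocks = 2;
    // Digit-major block sums: entry [d * num_blocks + n] is the count of
    //  digit d in block n (block 0 = {2,1,2,1}, block 1 = {1,2,1,2})
    unsigned int block_sums[8] = { 2, 1,  1, 2,  2, 1,  1, 2 };
    unsigned int scan[8];
    unsigned int run = 0;
    for (int i = 0; i < 8; ++i) { scan[i] = run; run += block_sums[i]; } // exclusive scan
    // Global position = scan[d * num_blocks + n] + local prefix sum m,
    //  e.g. the first digit-1 element of block 0:
    unsigned int d = 1, n = 0, m = 0;
    std::printf("%u\n", scan[d * num_blocks + n] + m); // prints 3
}

The three digit-0 elements land at positions 0-2 (two from block 0, then one from block 1), so the first digit-1 element of block 0 lands at position 3, exactly what the scanned digit-major layout reports.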
--------------------------------------------------------------------------------
/radix_sort/sort.h:
--------------------------------------------------------------------------------
#ifndef SORT_H__
#define SORT_H__

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "scan.h"
#include <iostream>

void radix_sort(unsigned int* const d_out,
	unsigned int* const d_in,
	unsigned int d_in_len);

#endif
--------------------------------------------------------------------------------
/radix_sort/timer.h:
--------------------------------------------------------------------------------
// Originally from Udacity (https://www.udacity.com/course/intro-to-parallel-programming--cs344)
// Used only for educational purposes

#ifndef GPU_TIMER_H__
#define GPU_TIMER_H__

#include <cuda_runtime.h>

struct GpuTimer
{
	cudaEvent_t start;
	cudaEvent_t stop;

	GpuTimer()
	{
		cudaEventCreate(&start);
		cudaEventCreate(&stop);
	}

	~GpuTimer()
	{
		cudaEventDestroy(start);
		cudaEventDestroy(stop);
	}

	void Start()
	{
		cudaEventRecord(start, 0);
	}

	void Stop()
	{
		cudaEventRecord(stop, 0);
	}

	float Elapsed()
	{
		float elapsed;
		cudaEventSynchronize(stop);
		cudaEventElapsedTime(&elapsed, start, stop);
		return elapsed;
	}
};

#endif /* GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/radix_sort/utils.h:
--------------------------------------------------------------------------------
// Originally from Udacity (https://www.udacity.com/course/intro-to-parallel-programming--cs344)
// Used only for educational purposes

#ifndef UTILS_H__
#define UTILS_H__

#include <iostream>
#include <iomanip>
#include <algorithm>
#include <cmath>
#include <cassert>
#include <cstdlib>
#include <cuda_runtime.h>

#define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)

template <typename T>
void check(T err, const char* const func, const char* const file, const int line) {
	if (err != cudaSuccess) {
		std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
		std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
		exit(1);
	}
}

template <typename T>
void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
	// check that the GPU result matches the CPU result
	for (size_t i = 0; i < numElem; ++i) {
		if (ref[i] != gpu[i]) {
			std::cerr << "Difference at pos " << i << std::endl;
			// the + is magic to convert char to int without messing
			//  with other types
			std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
				"\nGPU : " << +gpu[i] << std::endl;
			exit(1);
		}
	}
}

template <typename T>
void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
	assert(eps1 >= 0 && eps2 >= 0);
	unsigned long long totalDiff = 0;
	unsigned numSmallDifferences = 0;
	for (size_t i = 0; i < numElem; ++i) {
		// subtract smaller from larger in case of unsigned types
#ifdef _WIN32
		T smaller = std::fmin(ref[i], gpu[i]);
		T larger = std::fmax(ref[i], gpu[i]);
#else
		T smaller = std::min(ref[i], gpu[i]);
		T larger = std::max(ref[i], gpu[i]);
#endif
		T diff = larger - smaller;
		if (diff > 0 && diff <= eps1) {
			numSmallDifferences++;
		}
		else if (diff > eps1) {
			std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
			std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
				"\nGPU : " << +gpu[i] << std::endl;
			exit(1);
		}
		totalDiff += diff * diff;
	}
	double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
	if (percentSmallDifferences > eps2) {
		std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
		std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
		exit(1);
	}
}

// Uses the Autodesk method of image comparison
// Note that the tolerance here is in PIXELS, not a percentage of input pixels
template <typename T>
void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
{
	size_t numBadPixels = 0;
	for (size_t i = 0; i < numElem; ++i) {
#ifdef _WIN32
		T smaller = std::fmin(ref[i], gpu[i]);
		T larger = std::fmax(ref[i], gpu[i]);
#else
		T smaller = std::min(ref[i], gpu[i]);
		T larger = std::max(ref[i], gpu[i]);
#endif
		T diff = larger - smaller;
		if (diff > variance)
			++numBadPixels;
	}

	if (numBadPixels > tolerance) {
		std::cerr << "Too many bad pixels in the image: " << numBadPixels << "/" << tolerance << std::endl;
		exit(1);
	}
}

#endif
--------------------------------------------------------------------------------