├── .gitignore
├── .gitmodules
├── LICENSE
├── README.md
├── build_sorting_libs.py
├── condarecipe
│   ├── bld.bat
│   ├── build.sh
│   └── meta.yaml
├── lib
│   ├── cubradixsort.cu
│   ├── dllexport.h
│   ├── mgpucontext.cu
│   └── mgpusort.cu
└── test
    └── test_sorting_libs.py

/.gitignore:
--------------------------------------------------------------------------------
 1 | build
 2 | _build
 3 | __pycache__
 4 | *.pyc
 5 | \#*\#
 6 | *.so
 7 | *.dll
 8 | *.dylib
 9 | .cache
10 | 
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "thirdparty/moderngpu"]
 2 | 	path = thirdparty/moderngpu
 3 | 	url = https://github.com/moderngpu/moderngpu
 4 | [submodule "thirdparty/cub"]
 5 | 	path = thirdparty/cub
 6 | 	url = https://github.com/NVlabs/cub
 7 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 2-Clause License
 2 | 
 3 | Copyright (c) 2017, Anaconda, Inc.
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Pyculib\_sorting
 2 | 
 3 | Pyculib\_sorting provides simplified interfaces to CUDA sorting libraries.
 4 | At present it contains wrappers around:
 5 | 
 6 | * A radix sort implementation from [CUB](http://nvlabs.github.com/cub).
 7 | * A segmented sort implementation from
 8 |   [ModernGPU](http://nvlabs.github.io/moderngpu).
 9 | 
10 | Pyculib\_sorting is predominantly used by [Pyculib](https://github.com/numba/pyculib) to provide
11 | sorting routines.
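
The built libraries are plain shared objects and can be loaded directly with
`ctypes`; the sketch below is illustrative only (it assumes a Linux build and
is run from the repository root), and `test/test_sorting_libs.py` contains the
complete bindings:

```
import ctypes

# Load the radix sort library produced by build_sorting_libs.py
# (the extension is .dll on Windows and .dylib on macOS).
lib = ctypes.CDLL('lib/pyculib_radixsort.so')

# One entry point is exported per key type (radixsort_float,
# radixsort_double, ...); the argument list (temp storage, item count,
# key/value device buffers, stream, descending flag, bit range) is
# defined in lib/cubradixsort.cu.
lib.radixsort_double.restype = ctypes.c_void_p
```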
12 | 
13 | 
14 | ## Requirements
15 | Pyculib\_sorting requires the following programs to build and test:
16 | * Python
17 | * NVIDIA's `nvcc` compiler
18 | 
19 | and the following Python packages:
20 | * pytest
21 | * Numba
22 | 
23 | 
24 | ## Obtaining the source code
25 | Pyculib\_sorting relies on git submodules to access the CUB and ModernGPU source code.
26 | To obtain a code base suitable for building the libraries, run:
27 | 
28 | ```
29 | #> git clone https://github.com/numba/pyculib_sorting.git
30 | 
31 | #> cd pyculib_sorting
32 | 
33 | #> git submodule update --init
34 | ```
35 | 
36 | The URL above may be adjusted to use the `ssh`-based form
37 | `git@github.com:numba/pyculib_sorting.git` as desired.
38 | 
39 | 
40 | ## Building the libraries
41 | 
42 | To build the libraries run:
43 | ```
44 | #> python build_sorting_libs.py
45 | ```
46 | 
47 | 
48 | ## Testing
49 | 
50 | Testing uses pytest and is simply invoked with:
51 | ```
52 | #> pytest
53 | ```
54 | 
55 | 
56 | ## Conda build
57 | 
58 | To create a conda package of Pyculib\_sorting, assuming conda-build is
59 | installed, run:
60 | 
61 | ```
62 | #> conda build condarecipe
63 | ```
64 | 
65 | from the root directory of Pyculib\_sorting.
66 | 
--------------------------------------------------------------------------------
/build_sorting_libs.py:
--------------------------------------------------------------------------------
 1 | # A script to build external dependencies
 2 | 
 3 | import os
 4 | import subprocess
 5 | import platform
 6 | 
 7 | 
 8 | def basedir():
 9 |     return os.path.abspath(os.path.dirname(__file__))
10 | 
11 | 
12 | def cub_include():
13 |     return '-I%s/thirdparty/cub' % basedir()
14 | 
15 | 
16 | def mgpu_include():
17 |     return '-I%s/thirdparty/moderngpu/include' % basedir()
18 | 
19 | 
20 | def lib_dir():
21 |     return '%s/lib' % basedir()
22 | 
23 | 
24 | def run_shell(cmd):
25 |     print(cmd)
26 |     subprocess.check_call(cmd, shell=True)
27 | 
28 | 
29 | def library_extension():
30 |     p = platform.system()
31 |     if p == 'Linux':
32 |         return 'so'
33 |     if p == 'Windows':
34 |         return 'dll'
35 |     if p == 'Darwin':
36 |         return 'dylib'
37 | 
38 | 
39 | def gencode_flags():
40 |     # Generate code for all known architectures
41 |     GENCODE_SMXX = "-gencode arch=compute_{CC},code=sm_{CC}"
42 |     GENCODE_SM20 = GENCODE_SMXX.format(CC=20)
43 |     GENCODE_SM30 = GENCODE_SMXX.format(CC=30)
44 |     GENCODE_SM35 = GENCODE_SMXX.format(CC=35)
45 |     GENCODE_SM37 = GENCODE_SMXX.format(CC=37)
46 |     GENCODE_SM50 = GENCODE_SMXX.format(CC=50)
47 |     GENCODE_SM52 = GENCODE_SMXX.format(CC=52)
48 |     GENCODE_SM53 = GENCODE_SMXX.format(CC=53)
49 | 
50 |     # Provide forward-compatibility to architectures beyond CC 5.3
51 |     GENCODE_COMPUTEXX = "-gencode arch=compute_{CC},code=compute_{CC}"
52 |     GENCODE_COMPUTE53 = GENCODE_COMPUTEXX.format(CC=53)
53 | 
54 |     # Concatenate flags
55 |     SM = []
56 |     SM.append(GENCODE_SM20)
57 |     SM.append(GENCODE_SM30)
58 |     SM.append(GENCODE_SM35)
59 |     SM.append(GENCODE_SM37)
60 |     SM.append(GENCODE_SM50)
61 |     SM.append(GENCODE_SM52)
62 |     SM.append(GENCODE_SM53)
63 |     SM.append(GENCODE_COMPUTE53)
64 |     return ' '.join(SM)
65 | 
66 | 
67 | def build_cuda(srcdir, out, ins, includes):
68 |     # Allow specification of nvcc location in NVCC env var
69 |     nvcc = os.environ.get('NVCC', 'nvcc')
70 | 
71 |     # Build for 32- or 64-bit (tuple.__itemsize__ is the pointer size in bytes)
72 |     optflags = '-m%s --compiler-options "-fPIC"'
73 |     if tuple.__itemsize__ == 4:
74 |         opt = optflags % 32
75 |     elif tuple.__itemsize__ == 8:
76 |         opt = optflags % 64
77 | 
78 |     ext = library_extension()
79 |     output = os.path.join(lib_dir(), '%s.%s' % (out, ext))
80 | 
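    # The assembled command has this general shape (illustrative only; paths
    # and the architecture list are abridged):
    #
    #   nvcc -m64 --compiler-options "-fPIC" -I<basedir>/thirdparty/cub -O3 \
    #       -gencode arch=compute_20,code=sm_20 ... --shared \
    #       -o <basedir>/lib/pyculib_radixsort.so <basedir>/lib/cubradixsort.cu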
inputs = ' '.join([os.path.join(srcdir, p) 81 | for p in ins]) 82 | argtemp = '{opt} {inc} -O3 {gen} --shared -o {out} {inp}' 83 | args = argtemp.format(inc=includes, gen=gencode_flags(), out=output, 84 | inp=inputs, opt=opt) 85 | cmd = ' '.join([nvcc, args]) 86 | run_shell(cmd) 87 | 88 | 89 | def build_radixsort(): 90 | build_cuda(srcdir=lib_dir(), 91 | out='pyculib_radixsort', 92 | ins=['cubradixsort.cu'], 93 | includes=cub_include(), ) 94 | 95 | 96 | def build_mgpusort(): 97 | build_cuda(srcdir=lib_dir(), 98 | out='pyculib_segsort', 99 | ins=['mgpusort.cu'], 100 | includes=mgpu_include(), ) 101 | 102 | 103 | if __name__ == '__main__': 104 | build_radixsort() 105 | build_mgpusort() 106 | -------------------------------------------------------------------------------- /condarecipe/bld.bat: -------------------------------------------------------------------------------- 1 | cd %RECIPE_DIR%\.. 2 | call "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\vcvarsall.bat" amd64 3 | %PYTHON% build_sorting_libs.py 4 | if errorlevel 1 exit 1 5 | 6 | mkdir %PREFIX%\DLLs 7 | copy %RECIPE_DIR%\..\lib\*.dll %PREFIX%\DLLs 8 | -------------------------------------------------------------------------------- /condarecipe/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | function build() { 6 | cd $RECIPE_DIR/.. 7 | python build_sorting_libs.py 8 | } 9 | 10 | build 11 | mkdir -p $PREFIX/lib 12 | 13 | if [ `uname` == Linux ] 14 | then 15 | EXT=so 16 | fi 17 | 18 | if [ `uname` == Darwin ] 19 | then 20 | EXT=dylib 21 | fi 22 | 23 | cp $RECIPE_DIR/../lib/*.$EXT $PREFIX/lib 24 | -------------------------------------------------------------------------------- /condarecipe/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: pyculib_sorting 3 | version: {{ GIT_DESCRIBE_TAG }} 4 | 5 | source: 6 | path: .. 7 | 8 | build: 9 | number: {{ GIT_DESCRIBE_NUMBER|int }} 10 | script_env: 11 | - LD_LIBRARY_PATH # pass cuda libs through for docker builds 12 | 13 | requirements: 14 | build: 15 | - python 16 | run: 17 | 18 | test: 19 | source_files: 20 | - test 21 | requires: 22 | - numba 23 | - numpy 24 | - cudatoolkit 25 | - pytest 26 | commands: 27 | - pytest test -v 28 | 29 | about: 30 | home: https://github.com/numba/pyculib_sorting 31 | license: BSD 32 | license_file: LICENSE 33 | summary: Sorting libraries for Pyculib. 
34 | 
--------------------------------------------------------------------------------
/lib/cubradixsort.cu:
--------------------------------------------------------------------------------
  1 | #include <cub/cub.cuh>
  2 | #include <stdint.h>
  3 | #include "dllexport.h"
  4 | 
  5 | 
  6 | // #define E(X) _debug_check((X), __LINE__, __FILE__)
  7 | #define E(X) _release_check((X))
  8 | 
  9 | 
 10 | struct TempStorage{
 11 |     void * storage;
 12 |     size_t storage_bytes;
 13 | };
 14 | 
 15 | static
 16 | void _release_check(cudaError_t err) {
 17 |     if (err != cudaSuccess) {
 18 |         fprintf(stderr, "Fatal CUDA error:\n");
 19 |         fprintf(stderr, "%s\n", cudaGetErrorString(err));
 20 |         exit(1);
 21 |     }
 22 | }
 23 | 
 24 | static
 25 | void _debug_check(cudaError_t err, int line, const char * filename) {
 26 |     if (err != cudaSuccess) {
 27 |         fprintf(stderr, "Fatal CUDA error:\n");
 28 |         fprintf(stderr, "at %d of %s\n", line, filename);
 29 |         fprintf(stderr, "%s\n", cudaGetErrorString(err));
 30 |         exit(1);
 31 |     }
 32 | }
 33 | 
 34 | static
 35 | void cleanup(TempStorage *ptr) {
 36 |     cudaFree(ptr->storage);
 37 |     delete ptr;
 38 | }
 39 | 
 40 | template <typename Tk, typename Tv>
 41 | struct RadixSort {
 42 | 
 43 | 
 44 |     static
 45 |     TempStorage* sort( TempStorage *temp,
 46 |                        unsigned num_items,
 47 |                        Tk *d_key_buf,
 48 |                        Tk *d_key_alt_buf,
 49 |                        Tv *d_value_buf,
 50 |                        Tv *d_value_alt_buf,
 51 |                        cudaStream_t stream,
 52 |                        int descending,
 53 |                        unsigned begin_bit,
 54 |                        unsigned end_bit )
 55 |     {
 56 |         cub::DoubleBuffer<Tk> d_keys(d_key_buf, d_key_alt_buf);
 57 |         if (temp == 0) {  // First call: CUB only computes storage_bytes below.
 58 |             temp = new TempStorage;
 59 |             temp->storage = 0;
 60 |             temp->storage_bytes = 0;
 61 |         }
 62 |         if (d_value_buf) {
 63 |             // Sort KeyValue pairs
 64 |             cub::DoubleBuffer<Tv> d_values(d_value_buf, d_value_alt_buf);
 65 |             if (descending) {
 66 |                 E(cub::DeviceRadixSort::SortPairsDescending(temp->storage,
 67 |                                                             temp->storage_bytes,
 68 |                                                             d_keys,
 69 |                                                             d_values,
 70 |                                                             num_items,
 71 |                                                             begin_bit,
 72 |                                                             end_bit,
 73 |                                                             stream));
 74 |             } else {
 75 |                 E(cub::DeviceRadixSort::SortPairs( temp->storage,
 76 |                                                    temp->storage_bytes,
 77 |                                                    d_keys,
 78 |                                                    d_values,
 79 |                                                    num_items,
 80 |                                                    begin_bit,
 81 |                                                    end_bit,
 82 |                                                    stream ));
 83 |             }
 84 | 
 85 |             if (temp->storage && d_value_buf != d_values.Current()){
 86 |                 E(cudaMemcpyAsync(d_value_buf, d_value_alt_buf,
 87 |                                   num_items * sizeof(Tv),
 88 |                                   cudaMemcpyDeviceToDevice,
 89 |                                   stream));
 90 |             }
 91 |         } else {
 92 |             // Sort Keys only
 93 |             if (descending) {
 94 |                 E(cub::DeviceRadixSort::SortKeysDescending( temp->storage,
 95 |                                                             temp->storage_bytes,
 96 |                                                             d_keys,
 97 |                                                             num_items,
 98 |                                                             begin_bit,
 99 |                                                             end_bit,
100 |                                                             stream ));
101 |             } else {
102 |                 E(cub::DeviceRadixSort::SortKeys( temp->storage,
103 |                                                   temp->storage_bytes,
104 |                                                   d_keys,
105 |                                                   num_items,
106 |                                                   begin_bit,
107 |                                                   end_bit,
108 |                                                   stream ));
109 |             }
110 |         }
111 | 
112 |         if (temp->storage && d_key_buf != d_keys.Current()){
113 |             E(cudaMemcpyAsync(d_key_buf, d_key_alt_buf, num_items * sizeof(Tk),
114 |                               cudaMemcpyDeviceToDevice, stream));
115 |         }
116 | 
117 |         if (temp->storage == 0) {  // First call: allocate for the real sort.
118 |             E(cudaMalloc(&temp->storage, temp->storage_bytes));
119 |         }
120 | 
121 |         return temp;
122 |     }
123 | };
124 | 
125 | extern "C" {
126 | 
127 | #define WRAP(Fn, Tk, Tv) \
128 | DLLEXPORT void* \
129 | radixsort_ ## Fn( TempStorage *temp, \
130 |                   unsigned num_items, \
131 |                   Tk *d_key_buf, \
132 |                   Tk *d_key_alt_buf, \
133 |                   Tv *d_value_buf, \
134 |                   Tv *d_value_alt_buf, \
135 |                   cudaStream_t stream, \
136 |                   int descending, \
137 |                   unsigned begin_bit, \
138 |                   unsigned end_bit ) { \
139 |     return RadixSort<Tk, Tv>::sort(temp, \
140 |                                    num_items, \
141 |                                    d_key_buf, \
142 |                                    d_key_alt_buf, \
143 |                                    d_value_buf, \
144 |                                    d_value_alt_buf, \
145 | 
stream, \ 146 | descending, \ 147 | begin_bit, \ 148 | end_bit); \ 149 | } 150 | 151 | WRAP(float, float, unsigned) 152 | WRAP(double, double, unsigned) 153 | WRAP(int32, int32_t, unsigned) 154 | WRAP(uint32, uint32_t, unsigned) 155 | WRAP(int64, int64_t, unsigned) 156 | WRAP(uint64, uint64_t, unsigned) 157 | 158 | DLLEXPORT void 159 | radixsort_cleanup(TempStorage *ptr) { 160 | cleanup(ptr); 161 | } 162 | 163 | #undef WRAP 164 | } // end extern "C" 165 | -------------------------------------------------------------------------------- /lib/dllexport.h: -------------------------------------------------------------------------------- 1 | #ifndef DLLEXPORT 2 | 3 | #ifdef _WIN32 4 | #define DLLEXPORT __declspec( dllexport ) 5 | #else 6 | #define DLLEXPORT 7 | #endif 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /lib/mgpucontext.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/ 34 | 35 | #include "util/mgpucontext.h" 36 | 37 | namespace mgpu { 38 | 39 | //////////////////////////////////////////////////////////////////////////////// 40 | // CudaTimer 41 | 42 | void CudaTimer::Start() { 43 | cudaEventRecord(start); 44 | cudaDeviceSynchronize(); 45 | } 46 | double CudaTimer::Split() { 47 | cudaEventRecord(end); 48 | cudaDeviceSynchronize(); 49 | float t; 50 | cudaEventElapsedTime(&t, start, end); 51 | start.Swap(end); 52 | return (t / 1000.0); 53 | } 54 | double CudaTimer::Throughput(int count, int numIterations) { 55 | double elapsed = Split(); 56 | return (double)numIterations * count / elapsed; 57 | } 58 | 59 | //////////////////////////////////////////////////////////////////////////////// 60 | // CudaDevice 61 | 62 | __global__ void KernelVersionShim() { } 63 | 64 | struct DeviceGroup { 65 | int numCudaDevices; 66 | CudaDevice** cudaDevices; 67 | 68 | DeviceGroup() { 69 | numCudaDevices = -1; 70 | cudaDevices = 0; 71 | } 72 | 73 | int GetDeviceCount() { 74 | if(-1 == numCudaDevices) { 75 | cudaError_t error = cudaGetDeviceCount(&numCudaDevices); 76 | if(cudaSuccess != error || numCudaDevices <= 0) { 77 | fprintf(stderr, "ERROR ENUMERATING CUDA DEVICES.\nExiting.\n"); 78 | exit(0); 79 | } 80 | cudaDevices = new CudaDevice*[numCudaDevices]; 81 | memset(cudaDevices, 0, sizeof(CudaDevice*) * numCudaDevices); 82 | } 83 | return numCudaDevices; 84 | } 85 | 86 | CudaDevice* GetByOrdinal(int ordinal) { 87 | if(ordinal >= GetDeviceCount()) return 0; 88 | 89 | if(!cudaDevices[ordinal]) { 90 | // Retrieve the device properties. 91 | CudaDevice* device = cudaDevices[ordinal] = new CudaDevice; 92 | device->_ordinal = ordinal; 93 | cudaError_t error = cudaGetDeviceProperties(&device->_prop, 94 | ordinal); 95 | if(cudaSuccess != error) { 96 | fprintf(stderr, "FAILURE TO CREATE CUDA DEVICE %d\n", ordinal); 97 | exit(0); 98 | } 99 | 100 | // Get the compiler version for this device. 101 | //cudaSetDevice(ordinal); // don't create new context 102 | cudaFuncAttributes attr; 103 | error = cudaFuncGetAttributes(&attr, KernelVersionShim); 104 | if(cudaSuccess == error) 105 | device->_ptxVersion = 10 * attr.ptxVersion; 106 | else { 107 | printf("NOT COMPILED WITH COMPATIBLE PTX VERSION FOR DEVICE" 108 | " %d\n", ordinal); 109 | // The module wasn't compiled with support for this device. 
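                // Leaving _ptxVersion at zero lets callers such as
                // StandardContext() detect the mismatch and abort cleanly.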
110 |                 device->_ptxVersion = 0;
111 |             }
112 |         }
113 |         return cudaDevices[ordinal];
114 |     }
115 | 
116 |     ~DeviceGroup() {
117 |         if(cudaDevices) {
118 |             for(int i = 0; i < numCudaDevices; ++i)
119 |                 delete cudaDevices[i];
120 |             delete [] cudaDevices;
121 |         }
122 |         cudaDeviceReset();
123 |     }
124 | };
125 | 
126 | std::auto_ptr<DeviceGroup> deviceGroup;
127 | 
128 | 
129 | int CudaDevice::DeviceCount() {
130 |     if(!deviceGroup.get())
131 |         deviceGroup.reset(new DeviceGroup);
132 |     return deviceGroup->GetDeviceCount();
133 | }
134 | 
135 | CudaDevice& CudaDevice::ByOrdinal(int ordinal) {
136 |     if(ordinal < 0 || ordinal >= DeviceCount()) {
137 |         fprintf(stderr, "CODE REQUESTED INVALID CUDA DEVICE %d\n", ordinal);
138 |         exit(0);
139 |     }
140 |     return *deviceGroup->GetByOrdinal(ordinal);
141 | }
142 | 
143 | CudaDevice& CudaDevice::Selected() {
144 |     int ordinal;
145 |     cudaError_t error = cudaGetDevice(&ordinal);
146 |     if(cudaSuccess != error) {
147 |         fprintf(stderr, "ERROR RETRIEVING CUDA DEVICE ORDINAL\n");
148 |         exit(0);
149 |     }
150 |     return ByOrdinal(ordinal);
151 | }
152 | 
153 | void CudaDevice::SetActive() {
154 |     cudaError_t error = cudaSetDevice(_ordinal);
155 |     if(cudaSuccess != error) {
156 |         fprintf(stderr, "ERROR SETTING CUDA DEVICE TO ORDINAL %d\n", _ordinal);
157 |         exit(0);
158 |     }
159 | }
160 | 
161 | std::string CudaDevice::DeviceString() const {
162 |     size_t freeMem, totalMem;
163 |     cudaError_t error = cudaMemGetInfo(&freeMem, &totalMem);
164 |     if(cudaSuccess != error) {
165 |         fprintf(stderr, "ERROR RETRIEVING MEM INFO FOR CUDA DEVICE %d\n",
166 |             _ordinal);
167 |         exit(0);
168 |     }
169 | 
170 |     double memBandwidth = (_prop.memoryClockRate * 1000.0) *
171 |         (_prop.memoryBusWidth / 8 * 2) / 1.0e9;
172 | 
173 |     std::string s = stringprintf(
174 |         "%s : %8.3lf Mhz   (Ordinal %d)\n"
175 |         "%d SMs enabled. Compute Capability sm_%d%d\n"
176 |         "FreeMem: %6dMB   TotalMem: %6dMB   %2d-bit pointers.\n"
177 |         "Mem Clock: %8.3lf Mhz x %d bits   (%5.1lf GB/s)\n"
178 |         "ECC %s\n\n",
179 |         _prop.name, _prop.clockRate / 1000.0, _ordinal,
180 |         _prop.multiProcessorCount, _prop.major, _prop.minor,
181 |         (int)(freeMem / (1<< 20)), (int)(totalMem / (1<< 20)), 8 * sizeof(int*),
182 |         _prop.memoryClockRate / 1000.0, _prop.memoryBusWidth, memBandwidth,
183 |         _prop.ECCEnabled ? "Enabled" : "Disabled");
184 |     return s;
185 | }
186 | 
187 | ////////////////////////////////////////////////////////////////////////////////
188 | // CudaContext
189 | 
190 | struct ContextGroup {
191 |     CudaContext** standardContexts;
192 |     int numDevices;
193 | 
194 |     ContextGroup() {
195 |         numDevices = CudaDevice::DeviceCount();
196 |         standardContexts = new CudaContext*[numDevices];
197 |         memset(standardContexts, 0, sizeof(CudaContext*) * numDevices);
198 |     }
199 | 
200 |     CudaContext* GetByOrdinal(int ordinal) {
201 |         if(!standardContexts[ordinal]) {
202 |             CudaDevice& device = CudaDevice::ByOrdinal(ordinal);
203 |             standardContexts[ordinal] = new CudaContext(device, false, true);
204 |         }
205 |         return standardContexts[ordinal];
206 |     }
207 | 
208 |     ~ContextGroup() {
209 |         if(standardContexts) {
210 |             for(int i = 0; i < numDevices; ++i)
211 |                 delete standardContexts[i];
212 |             delete [] standardContexts;
213 |         }
214 |     }
215 | };
216 | std::auto_ptr<ContextGroup> contextGroup;
217 | 
218 | CudaContext::CudaContext(CudaDevice& device, bool newStream, bool standard) :
219 |     _event(cudaEventDisableTiming /*| cudaEventBlockingSync */),
220 |     _stream(0), _noRefCount(standard), _pageLocked(0) {
221 | 
222 |     // Create an allocator.
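    // Standard (shared, cached) contexts use the simple pass-through
    // allocator; contexts made by CreateCudaDevice*() get the caching
    // bucket allocator built in CreateDefaultAlloc() below.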
223 |     if(standard)
224 |         _alloc.reset(new CudaAllocSimple(device));
225 |     else
226 |         _alloc = CreateDefaultAlloc(device);
227 | 
228 |     if(newStream) cudaStreamCreate(&_stream);
229 |     _ownStream = newStream;
230 | 
231 |     // Allocate 4KB of page-locked memory.
232 |     cudaError_t error;
233 |     // error = cudaMallocHost((void**)&_pageLocked, 4096);
234 | 
235 |     // Allocate an auxiliary stream.
236 |     error = cudaStreamCreate(&_auxStream);
237 | }
238 | 
239 | CudaContext::~CudaContext() {
240 |     if(_pageLocked)
241 |         cudaFreeHost(_pageLocked);
242 |     if(_ownStream && _stream)
243 |         cudaStreamDestroy(_stream);
244 |     if(_auxStream)
245 |         cudaStreamDestroy(_auxStream);
246 | }
247 | 
248 | AllocPtr CudaContext::CreateDefaultAlloc(CudaDevice& device) {
249 |     intrusive_ptr<CudaAllocBuckets> alloc(new CudaAllocBuckets(device));
250 |     size_t freeMem, totalMem;
251 | 
252 |     cudaError_t error = cudaMemGetInfo(&freeMem, &totalMem);
253 |     if(cudaSuccess != error) {
254 |         fprintf(stderr, "ERROR RETRIEVING MEM INFO FOR CUDA DEVICE %d\n",
255 |             device.Ordinal());
256 |         exit(0);
257 |     }
258 | 
259 |     // Maintain a buffer of 128MB with max objects of 64MB.
260 |     alloc->SetCapacity(128<< 20, 64<< 20);
261 | 
262 |     return AllocPtr(alloc.get());
263 | }
264 | 
265 | CudaContext& CudaContext::StandardContext(int ordinal) {
266 |     bool setActive = -1 != ordinal;
267 |     if(-1 == ordinal) {
268 |         cudaError_t error = cudaGetDevice(&ordinal);
269 |         if(cudaSuccess != error) {
270 |             fprintf(stderr, "ERROR RETRIEVING CUDA DEVICE ORDINAL\n");
271 |             exit(0);
272 |         }
273 |     }
274 |     int numDevices = CudaDevice::DeviceCount();
275 | 
276 |     if(ordinal < 0 || ordinal >= numDevices) {
277 |         fprintf(stderr, "CODE REQUESTED INVALID CUDA DEVICE %d\n", ordinal);
278 |         exit(0);
279 |     }
280 | 
281 |     if(!contextGroup.get())
282 |         contextGroup.reset(new ContextGroup);
283 | 
284 |     CudaContext& context = //*contextGroup->standardContexts[ordinal];
285 |         *contextGroup->GetByOrdinal(ordinal);
286 |     if(!context.PTXVersion()) {
287 |         fprintf(stderr, "This CUDA executable was not compiled with support"
288 |             " for device %d (sm_%2d)\n", ordinal, context.ArchVersion() / 10);
289 |         exit(0);
290 |     }
291 | 
292 |     if(setActive) context.SetActive();
293 |     return context;
294 | }
295 | 
296 | ContextPtr CreateCudaDevice(int ordinal) {
297 |     CudaDevice& device = CudaDevice::ByOrdinal(ordinal);
298 |     ContextPtr context(new CudaContext(device, false, false));
299 |     return context;
300 | }
301 | ContextPtr CreateCudaDevice(int argc, char** argv, bool printInfo) {
302 |     int ordinal = 0;
303 |     if(argc >= 2 && !sscanf(argv[1], "%d", &ordinal)) {
304 |         fprintf(stderr, "INVALID COMMAND LINE ARGUMENT - NOT A CUDA ORDINAL\n");
305 |         exit(0);
306 |     }
307 |     ContextPtr context = CreateCudaDevice(ordinal);
308 |     if(!context->PTXVersion()) {
309 |         fprintf(stderr, "This CUDA executable was not compiled with support"
310 |             " for device %d (sm_%2d)\n", ordinal, context->ArchVersion() / 10);
311 |         exit(0);
312 |     }
313 | 
314 |     context->SetActive();
315 |     if(printInfo)
316 |         printf("%s\n", context->Device().DeviceString().c_str());
317 |     return context;
318 | }
319 | 
320 | ContextPtr CreateCudaDeviceStream(int ordinal) {
321 |     ContextPtr context(new CudaContext(
322 |         CudaDevice::ByOrdinal(ordinal), true, false));
323 |     return context;
324 | }
325 | 
326 | ContextPtr CreateCudaDeviceStream(int argc, char** argv, bool printInfo) {
327 |     int ordinal = 0;
328 |     if(argc >= 2 && !sscanf(argv[1], "%d", &ordinal)) {
329 |         fprintf(stderr, "INVALID COMMAND LINE ARGUMENT - NOT A CUDA ORDINAL\n");
330 |         exit(0);
331 |     }
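    // ordinal keeps its default of 0 unless a device number was parsed
    // from argv[1] above.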
332 | ContextPtr context = CreateCudaDeviceStream(ordinal); 333 | if(!context->PTXVersion()) { 334 | fprintf(stderr, "This CUDA executable was not compiled with support" 335 | " for device %d (sm_%2d)\n", ordinal, context->ArchVersion() / 10); 336 | exit(0); 337 | } 338 | 339 | context->SetActive(); 340 | if(printInfo) 341 | printf("%s\n", context->Device().DeviceString().c_str()); 342 | return context; 343 | } 344 | 345 | ContextPtr CreateCudaDeviceAttachStream(int ordinal, cudaStream_t stream) { 346 | ContextPtr context(new CudaContext( 347 | CudaDevice::ByOrdinal(ordinal), false, false)); 348 | context->_stream = stream; 349 | return context; 350 | } 351 | 352 | ContextPtr CreateCudaDeviceAttachStream(cudaStream_t stream) { 353 | int ordinal; 354 | cudaGetDevice(&ordinal); 355 | return CreateCudaDeviceAttachStream(ordinal, stream); 356 | } 357 | 358 | //////////////////////////////////////////////////////////////////////////////// 359 | // CudaAllocSimple 360 | 361 | cudaError_t CudaAllocSimple::Malloc(size_t size, void** p) { 362 | cudaError_t error = cudaSuccess; 363 | *p = 0; 364 | if(size) error = cudaMalloc(p, size); 365 | 366 | if(cudaSuccess != error) { 367 | printf("CUDA MALLOC ERROR %d\n", error); 368 | exit(0); 369 | } 370 | 371 | return error; 372 | } 373 | 374 | bool CudaAllocSimple::Free(void* p) { 375 | cudaError_t error = cudaSuccess; 376 | if(p) error = cudaFree(p); 377 | return cudaSuccess == error; 378 | } 379 | 380 | //////////////////////////////////////////////////////////////////////////////// 381 | // CudaAllocBuckets 382 | 383 | CudaAllocBuckets::CudaAllocBuckets(CudaDevice& device) : CudaAlloc(device) { 384 | _maxObjectSize = _capacity = _allocated = _committed = 0; 385 | _counter = 0; 386 | } 387 | 388 | CudaAllocBuckets::~CudaAllocBuckets() { 389 | SetCapacity(0, 0); 390 | assert(!_allocated); 391 | } 392 | 393 | bool CudaAllocBuckets::SanityCheck() const { 394 | // Iterate through all allocated objects and verify sizes. 395 | size_t allocatedCount = 0, committedCount = 0; 396 | for(AddressMap::const_iterator i = _addressMap.begin(); 397 | i != _addressMap.end(); ++i) { 398 | 399 | int bucket = i->second->bucket; 400 | size_t size = (bucket < NumBuckets) ? BucketSizes[bucket] : 0; 401 | allocatedCount += size; 402 | 403 | if(i->second->priority == _priorityMap.end()) 404 | committedCount += size; 405 | } 406 | 407 | return allocatedCount == _allocated && committedCount == _committed; 408 | } 409 | 410 | cudaError_t CudaAllocBuckets::Malloc(size_t size, void** p) { 411 | 412 | // Locate the bucket index and adjust the size of the allocation to the 413 | // bucket size. 414 | size_t allocSize = size; 415 | size_t commitSize = 0; 416 | int bucket = LocateBucket(size); 417 | if(bucket < NumBuckets) 418 | allocSize = commitSize = BucketSizes[bucket]; 419 | 420 | // Peel off an already-allocated node and reuse it. 421 | MemList& list = _memLists[bucket]; 422 | if(list.size() && list.front().priority != _priorityMap.end()) { 423 | MemList::iterator memIt = list.begin(); 424 | 425 | _priorityMap.erase(memIt->priority); 426 | memIt->priority = _priorityMap.end(); 427 | 428 | list.splice(list.end(), list, memIt); 429 | _committed += commitSize; 430 | 431 | *p = memIt->address->first; 432 | return cudaSuccess; 433 | } 434 | 435 | // Shrink if this allocation would put us over the limit. 
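    // Compact() evicts cached-but-unused nodes, oldest first, until the
    // new allocation fits within the configured capacity.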
436 | Compact(commitSize); 437 | 438 | cudaError_t error = cudaSuccess; 439 | *p = 0; 440 | if(size) error = cudaMalloc(p, allocSize); 441 | while((cudaErrorMemoryAllocation == error) && (_committed < _allocated)) { 442 | SetCapacity(_capacity - _capacity / 10, _maxObjectSize); 443 | error = cudaMalloc(p, size); 444 | } 445 | if(cudaSuccess != error) return error; 446 | 447 | MemList::iterator memIt = 448 | _memLists[bucket].insert(_memLists[bucket].end(), MemNode()); 449 | memIt->bucket = bucket; 450 | memIt->address = _addressMap.insert(std::make_pair(*p, memIt)).first; 451 | memIt->priority = _priorityMap.end(); 452 | _allocated += commitSize; 453 | _committed += commitSize; 454 | 455 | assert(SanityCheck()); 456 | 457 | return cudaSuccess; 458 | } 459 | 460 | bool CudaAllocBuckets::Free(void* p) { 461 | AddressMap::iterator it = _addressMap.find(p); 462 | if(it == _addressMap.end()) { 463 | // If the pointer was not found in the address map, cudaFree it anyways 464 | // but return false. 465 | if(p) cudaFree(p); 466 | return false; 467 | } 468 | 469 | // Because we're freeing a page, it had better not be in the priority queue. 470 | MemList::iterator memIt = it->second; 471 | assert(memIt->priority == _priorityMap.end()); 472 | 473 | // Always free allocations larger than the largest bucket 474 | it->second->priority = _priorityMap.insert( 475 | std::make_pair(_counter++ - memIt->bucket, memIt)); 476 | 477 | // Freed nodes are moved to the front, committed nodes are moved to the 478 | // end. 479 | int bucket = memIt->bucket; 480 | size_t commitSize = (bucket < NumBuckets) ? BucketSizes[bucket] : 0; 481 | 482 | MemList& list = _memLists[bucket]; 483 | list.splice(list.begin(), list, memIt); 484 | _committed -= commitSize; 485 | 486 | // Delete data that's not cached. 487 | if(NumBuckets == bucket) 488 | FreeNode(memIt); 489 | 490 | Compact(0); 491 | return true; 492 | } 493 | 494 | void CudaAllocBuckets::Clear() { 495 | Compact(_allocated); 496 | } 497 | 498 | void CudaAllocBuckets::FreeNode(CudaAllocBuckets::MemList::iterator memIt) { 499 | if(memIt->address->first) cudaFree(memIt->address->first); 500 | 501 | int bucket = memIt->bucket; 502 | size_t commitSize = (bucket < NumBuckets) ? BucketSizes[bucket] : 0; 503 | _addressMap.erase(memIt->address); 504 | if(memIt->priority != _priorityMap.end()) 505 | _priorityMap.erase(memIt->priority); 506 | else 507 | _committed -= commitSize; 508 | _allocated -= commitSize; 509 | 510 | _memLists[bucket].erase(memIt); 511 | 512 | assert(SanityCheck()); 513 | } 514 | 515 | void CudaAllocBuckets::Compact(size_t extra) { 516 | while(_allocated + extra > _capacity && _allocated > _committed) { 517 | // Walk the priority queue from beginning to end removing nodes. 518 | MemList::iterator memIt = _priorityMap.begin()->second; 519 | FreeNode(memIt); 520 | } 521 | } 522 | 523 | // Exponentially spaced buckets. 
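// Bucket sizes double at the small end and are spaced progressively more
// finely toward the large end, so rounding a request up to its bucket
// wastes only a bounded fraction of the allocation.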
524 | const size_t CudaAllocBuckets::BucketSizes[CudaAllocBuckets::NumBuckets] = {
525 |          256,       512,      1024,      2048,      4096,      8192,
526 |        12288,     16384,     24576,     32768,     49152,     65536,
527 |        98304,    131072,    174848,    218624,    262144,    349696,
528 |       436992,    524288,    655360,    786432,    917504,   1048576,
529 |      1310720,   1572864,   1835008,   2097152,   2516736,   2936064,
530 |      3355648,   3774976,   4194304,   4893440,   5592576,   6291456,
531 |      6990592,   7689728,   8388608,   9786880,  11184896,  12582912,
532 |     13981184,  15379200,  16777216,  18874368,  20971520,  23068672,
533 |     25165824,  27262976,  29360128,  31457280,  33554432,  36910080,
534 |     40265472,  43620864,  46976256,  50331648,  53687296,  57042688,
535 |     60398080,  63753472,  67108864,  72701440,  78293760,  83886080,
536 |     89478656,  95070976, 100663296, 106255872, 111848192, 117440512,
537 |    123033088, 128625408, 134217728, 143804928, 153391872, 162978816,
538 |    172565760, 182152704, 191739648, 201326592, 210913792, 220500736
539 | };
540 | 
541 | int CudaAllocBuckets::LocateBucket(size_t size) const {
542 |     if(size > _maxObjectSize || size > BucketSizes[NumBuckets - 1])
543 |         return NumBuckets;
544 | 
545 |     return (int)(std::lower_bound(BucketSizes, BucketSizes + NumBuckets, size) -
546 |         BucketSizes);
547 | }
548 | 
549 | } // namespace mgpu
550 | 
--------------------------------------------------------------------------------
/lib/mgpusort.cu:
--------------------------------------------------------------------------------
 1 | #include <cstdint>
 2 | #include <cstdio>
 3 | #include <kernels/segmentedsort.cuh>
 4 | #include "mgpucontext.cu"
 5 | #include "dllexport.h"
 6 | // #include
 7 | 
 8 | namespace mgpu{
 9 |     // Stub: avoids linking ModernGPU's string formatting utilities.
10 |     std::string stringprintf(const char* format, ...) { return std::string(); }
11 | }
12 | 
13 | namespace {
14 | 
15 | using namespace mgpu;
16 | 
17 | template <typename Tkey, typename Tval>
18 | void segsortpairs( Tkey *d_keys,
19 |                    Tval *d_vals,
20 |                    unsigned N,
21 |                    const int *d_segments,
22 |                    unsigned NumSegs,
23 |                    cudaStream_t stream )
24 | {
25 | 
26 |     ContextPtr context = CreateCudaDeviceAttachStream(stream);
27 | 
28 |     SegSortPairsFromIndices(
29 |         d_keys,
30 |         d_vals,
31 |         N,
32 |         d_segments,
33 |         NumSegs,
34 |         *context,
35 |         false);
36 | 
37 | }
38 | 
39 | } // end static namespace
40 | 
41 | 
42 | extern "C" {
43 | 
44 | #define WRAP(F, Tkey, Tval) \
45 | DLLEXPORT void segsortpairs_##F( Tkey *d_keys, \
46 |                                  Tval *d_vals, \
47 |                                  unsigned N, \
48 |                                  const int *d_segments, \
49 |                                  unsigned NumSegs, \
50 |                                  cudaStream_t stream ) \
51 | { segsortpairs(d_keys, d_vals, N, d_segments, NumSegs, stream); }
52 | 
53 | WRAP(int32, int32_t, unsigned)
54 | WRAP(int64, int64_t, unsigned)
55 | WRAP(uint32, uint32_t, unsigned)
56 | WRAP(uint64, uint64_t, unsigned)
57 | WRAP(float32, float, unsigned)
58 | WRAP(float64, double, unsigned)
59 | 
60 | 
61 | }
62 | 
--------------------------------------------------------------------------------
/test/test_sorting_libs.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function, absolute_import, division
 2 | 
 3 | """
 4 | Uses radixsort implementation from CUB which has the following license:
 5 | 
 6 | Copyright (c) 2011, Duane Merrill.  All rights reserved.
 7 | Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
 8 | Redistribution and use in source and binary forms, with or without
 9 | modification, are permitted provided that the following conditions are met:
10 | Redistributions of source code must retain the above copyright
11 | notice, this list of conditions and the following disclaimer.
12 | Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | Neither the name of the NVIDIA CORPORATION nor the 16 | names of its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written permission. 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 22 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 25 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | """ 29 | 30 | """ 31 | Uses segmented sort implementation from ModernGPU which has the following 32 | license: 33 | 34 | Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 35 | 36 | Redistribution and use in source and binary forms, with or without 37 | modification, are permitted provided that the following conditions are met: 38 | * Redistributions of source code must retain the above copyright 39 | notice, this list of conditions and the following disclaimer. 40 | * Redistributions in binary form must reproduce the above copyright 41 | notice, this list of conditions and the following disclaimer in the 42 | documentation and/or other materials provided with the distribution. 43 | * Neither the name of the NVIDIA CORPORATION nor the 44 | names of its contributors may be used to endorse or promote products 45 | derived from this software without specific prior written permission. 46 | 47 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 48 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 | ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 51 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 52 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 53 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 54 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 55 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 56 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
57 | """ 58 | 59 | 60 | import ctypes 61 | import os 62 | import platform 63 | import sys 64 | import warnings 65 | from contextlib import contextmanager 66 | 67 | import pytest 68 | 69 | import numpy as np 70 | from numba import findlib 71 | from numba.cuda.cudadrv.driver import device_pointer 72 | from numba.cuda.cudadrv.drvapi import cu_stream 73 | from numba.cuda.cudadrv.devicearray import auto_device, is_cuda_ndarray 74 | from numba import cuda 75 | 76 | 77 | def run_tests_on_hardware(): 78 | def cuda_compatible(): 79 | if sys.platform.startswith('darwin'): 80 | ver = platform.mac_ver()[0] 81 | # version string can contain two or three components 82 | major, minor = ver.split('.', 1) 83 | if '.' in minor: 84 | minor, micro = minor.split('.', 1) 85 | if (int(major), int(minor)) < (10, 9): 86 | return False 87 | 88 | is_64bits = sys.maxsize > 2**32 89 | if not is_64bits: 90 | return False 91 | 92 | return True 93 | 94 | if cuda_compatible(): 95 | return cuda.is_available() 96 | else: 97 | return False 98 | 99 | 100 | use_hardware = run_tests_on_hardware() 101 | 102 | 103 | def library_extension(): 104 | p = platform.system() 105 | if p == 'Linux': 106 | return 'so' 107 | if p == 'Windows': 108 | return 'dll' 109 | if p == 'Darwin': 110 | return 'dylib' 111 | 112 | 113 | def load_lib(libname): 114 | fullname = 'pyculib_%s.%s' % (libname, library_extension()) 115 | devpath = os.path.join(os.path.dirname(__file__), '..', 'lib') 116 | devlib = os.path.join(os.path.abspath(devpath), fullname) 117 | if os.path.exists(devlib): 118 | libpath = devlib 119 | warnings.warn('Using in-tree library %s' % libpath) 120 | else: 121 | libpath = os.path.join(findlib.get_lib_dir(), fullname) 122 | 123 | return ctypes.CDLL(libpath) 124 | 125 | 126 | radixlib = load_lib('radixsort') 127 | segsortlib = load_lib('segsort') 128 | 129 | 130 | def _bind_radixsort_double(): 131 | _argtypes = [ 132 | ctypes.c_void_p, # temp 133 | ctypes.c_uint, # count 134 | ctypes.c_void_p, # d_key 135 | ctypes.c_void_p, # d_key_alt 136 | ctypes.c_void_p, # d_vals 137 | ctypes.c_void_p, # d_vals_alt 138 | cu_stream, 139 | ctypes.c_int, # descending 140 | ctypes.c_uint, # begin_bit 141 | ctypes.c_uint, # end_bit 142 | ] 143 | dtype = np.float64 144 | fn = getattr(radixlib, "radixsort_double") 145 | fn.argtypes = _argtypes 146 | fn.restype = ctypes.c_void_p 147 | return fn 148 | 149 | 150 | def test_radixsort_bind(): 151 | # checks that the `radixsort_XYZ` symbols bind ok 152 | _known_types = ['float', 'double', 'int32', 'uint32', 'int64', 'uint64'] 153 | for x in _known_types: 154 | getattr(radixlib, "radixsort_{}".format(x)) 155 | 156 | 157 | @pytest.mark.skipif(not use_hardware, reason='No suitable hardware found.') 158 | def test_radixsort_operation(): 159 | # a crude radixsort test 160 | dtype = np.float64 161 | maxcount = 1000 162 | 163 | keys = np.random.rand(maxcount) 164 | reference = np.copy(keys) 165 | 166 | # copy to device 167 | dptr, _ = auto_device(keys) 168 | 169 | def runsort(temp, keys, vals, begin_bit=0, end_bit=None): 170 | stream = 0 171 | begin_bit = 0 172 | dtty = np.dtype(dtype) 173 | end_bit = dtty.itemsize * 8 174 | descending = 0 175 | count = maxcount 176 | if keys: 177 | count = keys.size 178 | 179 | _arysize = int(maxcount * dtty.itemsize) 180 | _sort = _bind_radixsort_double() 181 | 182 | ctx = cuda.current_context() 183 | _temp_keys = ctx.memalloc(_arysize) 184 | 185 | return _sort( 186 | temp, 187 | ctypes.c_uint(count), 188 | device_pointer(keys), 189 | device_pointer(_temp_keys), 190 | None, 191 | 
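            # The two None arguments are d_value_buf and d_value_alt_buf:
            # this is a keys-only sort.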
None, 192 | stream, 193 | descending, 194 | begin_bit, 195 | end_bit 196 | ) 197 | 198 | # tmp storage ref 199 | temp = runsort(None, None, None) 200 | 201 | # do the sort 202 | runsort(temp, dptr, None) 203 | 204 | # copy back 205 | dptr.copy_to_host(keys) 206 | 207 | # compare 208 | np.testing.assert_equal(np.sort(reference), keys) 209 | 210 | 211 | def _bind_segsort_double(): 212 | _argtypes = [ 213 | ctypes.c_void_p, # d_key 214 | ctypes.c_void_p, # d_vals 215 | ctypes.c_uint, # N 216 | ctypes.c_void_p, # segments 217 | ctypes.c_uint, # Nseg 218 | cu_stream, # stream 219 | ] 220 | fn = getattr(segsortlib, 'segsortpairs_float64') 221 | fn.argtypes = _argtypes 222 | return fn 223 | 224 | 225 | def test_segsort_bind(): 226 | # checks that the `segsort_XYZ` symbols bind ok 227 | _known_types = ['float32', 'float64', 'int32', 'uint32', 'int64', 'uint64'] 228 | for x in _known_types: 229 | getattr(segsortlib, "segsortpairs_{}".format(x)) 230 | 231 | 232 | @pytest.mark.skipif(not use_hardware, reason='No suitable hardware found.') 233 | def test_segsort_operation(): 234 | # a crude segsort test 235 | 236 | maxcount = 1000 237 | 238 | keys = np.random.rand(maxcount) 239 | reference = keys.copy() 240 | original = keys.copy() 241 | values = np.arange(keys.size, dtype=np.int32) 242 | segments = np.arange(64, maxcount, 64, dtype=np.int32) 243 | 244 | dptr_keys, _ = auto_device(keys) 245 | keys[:] = 0 246 | dptr_values, _ = auto_device(values) 247 | values[:] = 0 248 | dptr_segments, _ = auto_device(segments) 249 | 250 | def runsort(d_keys, d_vals, d_seg): 251 | _sort = _bind_segsort_double() 252 | _sort(device_pointer(d_keys), 253 | device_pointer(d_vals), 254 | d_keys.size, 255 | device_pointer(d_seg), 256 | d_seg.size, 257 | 0) 258 | 259 | runsort(dptr_keys, dptr_values, dptr_segments) 260 | 261 | # copy back 262 | dptr_keys.copy_to_host(keys) 263 | dptr_values.copy_to_host(values) 264 | 265 | # compare 266 | r = [z for z in segments] 267 | low = [0] + r 268 | high = r + [maxcount] 269 | for x, y in zip(low, high): 270 | reference[x:y].sort() 271 | 272 | np.testing.assert_equal(keys, reference) 273 | np.testing.assert_equal(original[values], reference) 274 | --------------------------------------------------------------------------------