├── .gitignore
├── .gitmodules
├── LICENSE
├── README.md
├── build_sorting_libs.py
├── condarecipe
│   ├── bld.bat
│   ├── build.sh
│   └── meta.yaml
├── lib
│   ├── cubradixsort.cu
│   ├── dllexport.h
│   ├── mgpucontext.cu
│   └── mgpusort.cu
└── test
    └── test_sorting_libs.py

/.gitignore:
--------------------------------------------------------------------------------
 1 | build
 2 | _build
 3 | __pycache__
 4 | *.pyc
 5 | \#*\#
 6 | *.so
 7 | *.dll
 8 | *.dylib
 9 | .cache
10 | 
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "thirdparty/moderngpu"]
 2 | 	path = thirdparty/moderngpu
 3 | 	url = https://github.com/moderngpu/moderngpu
 4 | [submodule "thirdparty/cub"]
 5 | 	path = thirdparty/cub
 6 | 	url = https://github.com/NVlabs/cub
 7 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 2-Clause License
 2 | 
 3 | Copyright (c) 2017, Anaconda, Inc.
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Pyculib\_sorting
 2 | 
 3 | Pyculib\_sorting provides simplified interfaces to CUDA sorting libraries.
 4 | At present it contains wrappers around:
 5 | 
 6 | * A radix sort implementation from [CUB](http://nvlabs.github.com/cub).
 7 | * A segmented sort implementation from
 8 |   [ModernGPU](http://nvlabs.github.io/moderngpu).
 9 | 
10 | Pyculib\_sorting is predominantly used by [Pyculib](https://github.com/numba/pyculib) to provide
11 | sorting routines.
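
The built libraries are plain shared objects and can be loaded directly with
`ctypes`; the sketch below is illustrative only (it assumes a Linux build and
is run from the repository root), and `test/test_sorting_libs.py` contains the
complete bindings:

```
import ctypes

# Load the radix sort library produced by build_sorting_libs.py
# (the extension is .dll on Windows and .dylib on macOS).
lib = ctypes.CDLL('lib/pyculib_radixsort.so')

# One entry point is exported per key type (radixsort_float,
# radixsort_double, ...); the argument list (temp storage, item count,
# key/value device buffers, stream, descending flag, bit range) is
# defined in lib/cubradixsort.cu.
lib.radixsort_double.restype = ctypes.c_void_p
```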
12 | 
13 | 
14 | ## Requirements
15 | Pyculib\_sorting requires the following programs to build and test:
16 | * Python
17 | * NVIDIA's `nvcc` compiler
18 | 
19 | and the following Python packages:
20 | * pytest
21 | * Numba
22 | 
23 | 
24 | ## Obtaining the source code
25 | Pyculib\_sorting relies on git submodules to access the CUB and ModernGPU source code.
26 | To obtain a code base suitable for building the libraries, run:
27 | 
28 | ```
29 | #> git clone https://github.com/numba/pyculib_sorting.git
30 | 
31 | #> cd pyculib_sorting
32 | 
33 | #> git submodule update --init
34 | ```
35 | 
36 | The URL above may be adjusted to use the `ssh`-based form
37 | `git@github.com:numba/pyculib_sorting.git` as desired.
38 | 
39 | 
40 | ## Building the libraries
41 | 
42 | To build the libraries run:
43 | ```
44 | #> python build_sorting_libs.py
45 | ```
46 | 
47 | 
48 | ## Testing
49 | 
50 | Testing uses pytest and is simply invoked with:
51 | ```
52 | #> pytest
53 | ```
54 | 
55 | 
56 | ## Conda build
57 | 
58 | To create a conda package of Pyculib\_sorting, assuming conda-build is
59 | installed, run:
60 | 
61 | ```
62 | #> conda build condarecipe
63 | ```
64 | 
65 | from the root directory of Pyculib\_sorting.
66 | 
--------------------------------------------------------------------------------
/build_sorting_libs.py:
--------------------------------------------------------------------------------
 1 | # A script to build external dependencies
 2 | 
 3 | import os
 4 | import subprocess
 5 | import platform
 6 | 
 7 | 
 8 | def basedir():
 9 |     return os.path.abspath(os.path.dirname(__file__))
10 | 
11 | 
12 | def cub_include():
13 |     return '-I%s/thirdparty/cub' % basedir()
14 | 
15 | 
16 | def mgpu_include():
17 |     return '-I%s/thirdparty/moderngpu/include' % basedir()
18 | 
19 | 
20 | def lib_dir():
21 |     return '%s/lib' % basedir()
22 | 
23 | 
24 | def run_shell(cmd):
25 |     print(cmd)
26 |     subprocess.check_call(cmd, shell=True)
27 | 
28 | 
29 | def library_extension():
30 |     p = platform.system()
31 |     if p == 'Linux':
32 |         return 'so'
33 |     if p == 'Windows':
34 |         return 'dll'
35 |     if p == 'Darwin':
36 |         return 'dylib'
37 | 
38 | 
39 | def gencode_flags():
40 |     # Generate code for all known architectures
41 |     GENCODE_SMXX = "-gencode arch=compute_{CC},code=sm_{CC}"
42 |     GENCODE_SM20 = GENCODE_SMXX.format(CC=20)
43 |     GENCODE_SM30 = GENCODE_SMXX.format(CC=30)
44 |     GENCODE_SM35 = GENCODE_SMXX.format(CC=35)
45 |     GENCODE_SM37 = GENCODE_SMXX.format(CC=37)
46 |     GENCODE_SM50 = GENCODE_SMXX.format(CC=50)
47 |     GENCODE_SM52 = GENCODE_SMXX.format(CC=52)
48 |     GENCODE_SM53 = GENCODE_SMXX.format(CC=53)
49 | 
50 |     # Provide forward-compatibility to architectures beyond CC 5.3
51 |     GENCODE_COMPUTEXX = "-gencode arch=compute_{CC},code=compute_{CC}"
52 |     GENCODE_COMPUTE53 = GENCODE_COMPUTEXX.format(CC=53)
53 | 
54 |     # Concatenate flags
55 |     SM = []
56 |     SM.append(GENCODE_SM20)
57 |     SM.append(GENCODE_SM30)
58 |     SM.append(GENCODE_SM35)
59 |     SM.append(GENCODE_SM37)
60 |     SM.append(GENCODE_SM50)
61 |     SM.append(GENCODE_SM52)
62 |     SM.append(GENCODE_SM53)
63 |     SM.append(GENCODE_COMPUTE53)
64 |     return ' '.join(SM)
65 | 
66 | 
67 | def build_cuda(srcdir, out, ins, includes):
68 |     # Allow specification of nvcc location in NVCC env var
69 |     nvcc = os.environ.get('NVCC', 'nvcc')
70 | 
71 |     # Build for 32- or 64-bit (tuple.__itemsize__ is the pointer size in bytes)
72 |     optflags = '-m%s --compiler-options "-fPIC"'
73 |     if tuple.__itemsize__ == 4:
74 |         opt = optflags % 32
75 |     elif tuple.__itemsize__ == 8:
76 |         opt = optflags % 64
77 | 
78 |     ext = library_extension()
79 |     output = os.path.join(lib_dir(), '%s.%s' % (out, ext))
80 | 
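    # The assembled command has this general shape (illustrative only; paths
    # and the architecture list are abridged):
    #
    #   nvcc -m64 --compiler-options "-fPIC" -I<basedir>/thirdparty/cub -O3 \
    #       -gencode arch=compute_20,code=sm_20 ... --shared \
    #       -o <basedir>/lib/pyculib_radixsort.so <basedir>/lib/cubradixsort.cu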
inputs = ' '.join([os.path.join(srcdir, p) 81 | for p in ins]) 82 | argtemp = '{opt} {inc} -O3 {gen} --shared -o {out} {inp}' 83 | args = argtemp.format(inc=includes, gen=gencode_flags(), out=output, 84 | inp=inputs, opt=opt) 85 | cmd = ' '.join([nvcc, args]) 86 | run_shell(cmd) 87 | 88 | 89 | def build_radixsort(): 90 | build_cuda(srcdir=lib_dir(), 91 | out='pyculib_radixsort', 92 | ins=['cubradixsort.cu'], 93 | includes=cub_include(), ) 94 | 95 | 96 | def build_mgpusort(): 97 | build_cuda(srcdir=lib_dir(), 98 | out='pyculib_segsort', 99 | ins=['mgpusort.cu'], 100 | includes=mgpu_include(), ) 101 | 102 | 103 | if __name__ == '__main__': 104 | build_radixsort() 105 | build_mgpusort() 106 | -------------------------------------------------------------------------------- /condarecipe/bld.bat: -------------------------------------------------------------------------------- 1 | cd %RECIPE_DIR%\.. 2 | call "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\vcvarsall.bat" amd64 3 | %PYTHON% build_sorting_libs.py 4 | if errorlevel 1 exit 1 5 | 6 | mkdir %PREFIX%\DLLs 7 | copy %RECIPE_DIR%\..\lib\*.dll %PREFIX%\DLLs 8 | -------------------------------------------------------------------------------- /condarecipe/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | function build() { 6 | cd $RECIPE_DIR/.. 7 | python build_sorting_libs.py 8 | } 9 | 10 | build 11 | mkdir -p $PREFIX/lib 12 | 13 | if [ `uname` == Linux ] 14 | then 15 | EXT=so 16 | fi 17 | 18 | if [ `uname` == Darwin ] 19 | then 20 | EXT=dylib 21 | fi 22 | 23 | cp $RECIPE_DIR/../lib/*.$EXT $PREFIX/lib 24 | -------------------------------------------------------------------------------- /condarecipe/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: pyculib_sorting 3 | version: {{ GIT_DESCRIBE_TAG }} 4 | 5 | source: 6 | path: .. 7 | 8 | build: 9 | number: {{ GIT_DESCRIBE_NUMBER|int }} 10 | script_env: 11 | - LD_LIBRARY_PATH # pass cuda libs through for docker builds 12 | 13 | requirements: 14 | build: 15 | - python 16 | run: 17 | 18 | test: 19 | source_files: 20 | - test 21 | requires: 22 | - numba 23 | - numpy 24 | - cudatoolkit 25 | - pytest 26 | commands: 27 | - pytest test -v 28 | 29 | about: 30 | home: https://github.com/numba/pyculib_sorting 31 | license: BSD 32 | license_file: LICENSE 33 | summary: Sorting libraries for Pyculib. 
34 | 
--------------------------------------------------------------------------------
/lib/cubradixsort.cu:
--------------------------------------------------------------------------------
  1 | #include <cub/cub.cuh>
  2 | #include <stdint.h>
  3 | #include "dllexport.h"
  4 | 
  5 | 
  6 | // #define E(X) _debug_check((X), __LINE__, __FILE__)
  7 | #define E(X) _release_check((X))
  8 | 
  9 | 
 10 | struct TempStorage{
 11 |     void * storage;
 12 |     size_t storage_bytes;
 13 | };
 14 | 
 15 | static
 16 | void _release_check(cudaError_t err) {
 17 |     if (err != cudaSuccess) {
 18 |         fprintf(stderr, "Fatal CUDA error:\n");
 19 |         fprintf(stderr, "%s\n", cudaGetErrorString(err));
 20 |         exit(1);
 21 |     }
 22 | }
 23 | 
 24 | static
 25 | void _debug_check(cudaError_t err, int line, const char * filename) {
 26 |     if (err != cudaSuccess) {
 27 |         fprintf(stderr, "Fatal CUDA error:\n");
 28 |         fprintf(stderr, "at %d of %s\n", line, filename);
 29 |         fprintf(stderr, "%s\n", cudaGetErrorString(err));
 30 |         exit(1);
 31 |     }
 32 | }
 33 | 
 34 | static
 35 | void cleanup(TempStorage *ptr) {
 36 |     cudaFree(ptr->storage);
 37 |     delete ptr;
 38 | }
 39 | 
 40 | template <typename Tk, typename Tv>
 41 | struct RadixSort {
 42 | 
 43 | 
 44 |     static
 45 |     TempStorage* sort( TempStorage *temp,
 46 |                        unsigned num_items,
 47 |                        Tk *d_key_buf,
 48 |                        Tk *d_key_alt_buf,
 49 |                        Tv *d_value_buf,
 50 |                        Tv *d_value_alt_buf,
 51 |                        cudaStream_t stream,
 52 |                        int descending,
 53 |                        unsigned begin_bit,
 54 |                        unsigned end_bit )
 55 |     {
 56 |         cub::DoubleBuffer<Tk> d_keys(d_key_buf, d_key_alt_buf);
 57 |         if (temp == 0) {  // First call: CUB only computes storage_bytes below.
 58 |             temp = new TempStorage;
 59 |             temp->storage = 0;
 60 |             temp->storage_bytes = 0;
 61 |         }
 62 |         if (d_value_buf) {
 63 |             // Sort KeyValue pairs
 64 |             cub::DoubleBuffer<Tv> d_values(d_value_buf, d_value_alt_buf);
 65 |             if (descending) {
 66 |                 E(cub::DeviceRadixSort::SortPairsDescending(temp->storage,
 67 |                                                             temp->storage_bytes,
 68 |                                                             d_keys,
 69 |                                                             d_values,
 70 |                                                             num_items,
 71 |                                                             begin_bit,
 72 |                                                             end_bit,
 73 |                                                             stream));
 74 |             } else {
 75 |                 E(cub::DeviceRadixSort::SortPairs( temp->storage,
 76 |                                                    temp->storage_bytes,
 77 |                                                    d_keys,
 78 |                                                    d_values,
 79 |                                                    num_items,
 80 |                                                    begin_bit,
 81 |                                                    end_bit,
 82 |                                                    stream ));
 83 |             }
 84 | 
 85 |             if (temp->storage && d_value_buf != d_values.Current()){
 86 |                 E(cudaMemcpyAsync(d_value_buf, d_value_alt_buf,
 87 |                                   num_items * sizeof(Tv),
 88 |                                   cudaMemcpyDeviceToDevice,
 89 |                                   stream));
 90 |             }
 91 |         } else {
 92 |             // Sort Keys only
 93 |             if (descending) {
 94 |                 E(cub::DeviceRadixSort::SortKeysDescending( temp->storage,
 95 |                                                             temp->storage_bytes,
 96 |                                                             d_keys,
 97 |                                                             num_items,
 98 |                                                             begin_bit,
 99 |                                                             end_bit,
100 |                                                             stream ));
101 |             } else {
102 |                 E(cub::DeviceRadixSort::SortKeys( temp->storage,
103 |                                                   temp->storage_bytes,
104 |                                                   d_keys,
105 |                                                   num_items,
106 |                                                   begin_bit,
107 |                                                   end_bit,
108 |                                                   stream ));
109 |             }
110 |         }
111 | 
112 |         if (temp->storage && d_key_buf != d_keys.Current()){
113 |             E(cudaMemcpyAsync(d_key_buf, d_key_alt_buf, num_items * sizeof(Tk),
114 |                               cudaMemcpyDeviceToDevice, stream));
115 |         }
116 | 
117 |         if (temp->storage == 0) {  // First call: allocate for the real sort.
118 |             E(cudaMalloc(&temp->storage, temp->storage_bytes));
119 |         }
120 | 
121 |         return temp;
122 |     }
123 | };
124 | 
125 | extern "C" {
126 | 
127 | #define WRAP(Fn, Tk, Tv) \
128 | DLLEXPORT void* \
129 | radixsort_ ## Fn( TempStorage *temp, \
130 |                   unsigned num_items, \
131 |                   Tk *d_key_buf, \
132 |                   Tk *d_key_alt_buf, \
133 |                   Tv *d_value_buf, \
134 |                   Tv *d_value_alt_buf, \
135 |                   cudaStream_t stream, \
136 |                   int descending, \
137 |                   unsigned begin_bit, \
138 |                   unsigned end_bit ) { \
139 |     return RadixSort<Tk, Tv>::sort(temp, \
140 |                                    num_items, \
141 |                                    d_key_buf, \
142 |                                    d_key_alt_buf, \
143 |                                    d_value_buf, \
144 |                                    d_value_alt_buf, \
145 | 
stream, \ 146 | descending, \ 147 | begin_bit, \ 148 | end_bit); \ 149 | } 150 | 151 | WRAP(float, float, unsigned) 152 | WRAP(double, double, unsigned) 153 | WRAP(int32, int32_t, unsigned) 154 | WRAP(uint32, uint32_t, unsigned) 155 | WRAP(int64, int64_t, unsigned) 156 | WRAP(uint64, uint64_t, unsigned) 157 | 158 | DLLEXPORT void 159 | radixsort_cleanup(TempStorage *ptr) { 160 | cleanup(ptr); 161 | } 162 | 163 | #undef WRAP 164 | } // end extern "C" 165 | -------------------------------------------------------------------------------- /lib/dllexport.h: -------------------------------------------------------------------------------- 1 | #ifndef DLLEXPORT 2 | 3 | #ifdef _WIN32 4 | #define DLLEXPORT __declspec( dllexport ) 5 | #else 6 | #define DLLEXPORT 7 | #endif 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /lib/mgpucontext.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/ 34 | 35 | #include "util/mgpucontext.h" 36 | 37 | namespace mgpu { 38 | 39 | //////////////////////////////////////////////////////////////////////////////// 40 | // CudaTimer 41 | 42 | void CudaTimer::Start() { 43 | cudaEventRecord(start); 44 | cudaDeviceSynchronize(); 45 | } 46 | double CudaTimer::Split() { 47 | cudaEventRecord(end); 48 | cudaDeviceSynchronize(); 49 | float t; 50 | cudaEventElapsedTime(&t, start, end); 51 | start.Swap(end); 52 | return (t / 1000.0); 53 | } 54 | double CudaTimer::Throughput(int count, int numIterations) { 55 | double elapsed = Split(); 56 | return (double)numIterations * count / elapsed; 57 | } 58 | 59 | //////////////////////////////////////////////////////////////////////////////// 60 | // CudaDevice 61 | 62 | __global__ void KernelVersionShim() { } 63 | 64 | struct DeviceGroup { 65 | int numCudaDevices; 66 | CudaDevice** cudaDevices; 67 | 68 | DeviceGroup() { 69 | numCudaDevices = -1; 70 | cudaDevices = 0; 71 | } 72 | 73 | int GetDeviceCount() { 74 | if(-1 == numCudaDevices) { 75 | cudaError_t error = cudaGetDeviceCount(&numCudaDevices); 76 | if(cudaSuccess != error || numCudaDevices <= 0) { 77 | fprintf(stderr, "ERROR ENUMERATING CUDA DEVICES.\nExiting.\n"); 78 | exit(0); 79 | } 80 | cudaDevices = new CudaDevice*[numCudaDevices]; 81 | memset(cudaDevices, 0, sizeof(CudaDevice*) * numCudaDevices); 82 | } 83 | return numCudaDevices; 84 | } 85 | 86 | CudaDevice* GetByOrdinal(int ordinal) { 87 | if(ordinal >= GetDeviceCount()) return 0; 88 | 89 | if(!cudaDevices[ordinal]) { 90 | // Retrieve the device properties. 91 | CudaDevice* device = cudaDevices[ordinal] = new CudaDevice; 92 | device->_ordinal = ordinal; 93 | cudaError_t error = cudaGetDeviceProperties(&device->_prop, 94 | ordinal); 95 | if(cudaSuccess != error) { 96 | fprintf(stderr, "FAILURE TO CREATE CUDA DEVICE %d\n", ordinal); 97 | exit(0); 98 | } 99 | 100 | // Get the compiler version for this device. 101 | //cudaSetDevice(ordinal); // don't create new context 102 | cudaFuncAttributes attr; 103 | error = cudaFuncGetAttributes(&attr, KernelVersionShim); 104 | if(cudaSuccess == error) 105 | device->_ptxVersion = 10 * attr.ptxVersion; 106 | else { 107 | printf("NOT COMPILED WITH COMPATIBLE PTX VERSION FOR DEVICE" 108 | " %d\n", ordinal); 109 | // The module wasn't compiled with support for this device. 
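                // Leaving _ptxVersion at zero lets callers such as
                // StandardContext() detect the mismatch and abort cleanly.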
110 |                 device->_ptxVersion = 0;
111 |             }
112 |         }
113 |         return cudaDevices[ordinal];
114 |     }
115 | 
116 |     ~DeviceGroup() {
117 |         if(cudaDevices) {
118 |             for(int i = 0; i < numCudaDevices; ++i)
119 |                 delete cudaDevices[i];
120 |             delete [] cudaDevices;
121 |         }
122 |         cudaDeviceReset();
123 |     }
124 | };
125 | 
126 | std::auto_ptr<DeviceGroup> deviceGroup;
127 | 
128 | 
129 | int CudaDevice::DeviceCount() {
130 |     if(!deviceGroup.get())
131 |         deviceGroup.reset(new DeviceGroup);
132 |     return deviceGroup->GetDeviceCount();
133 | }
134 | 
135 | CudaDevice& CudaDevice::ByOrdinal(int ordinal) {
136 |     if(ordinal < 0 || ordinal >= DeviceCount()) {
137 |         fprintf(stderr, "CODE REQUESTED INVALID CUDA DEVICE %d\n", ordinal);
138 |         exit(0);
139 |     }
140 |     return *deviceGroup->GetByOrdinal(ordinal);
141 | }
142 | 
143 | CudaDevice& CudaDevice::Selected() {
144 |     int ordinal;
145 |     cudaError_t error = cudaGetDevice(&ordinal);
146 |     if(cudaSuccess != error) {
147 |         fprintf(stderr, "ERROR RETRIEVING CUDA DEVICE ORDINAL\n");
148 |         exit(0);
149 |     }
150 |     return ByOrdinal(ordinal);
151 | }
152 | 
153 | void CudaDevice::SetActive() {
154 |     cudaError_t error = cudaSetDevice(_ordinal);
155 |     if(cudaSuccess != error) {
156 |         fprintf(stderr, "ERROR SETTING CUDA DEVICE TO ORDINAL %d\n", _ordinal);
157 |         exit(0);
158 |     }
159 | }
160 | 
161 | std::string CudaDevice::DeviceString() const {
162 |     size_t freeMem, totalMem;
163 |     cudaError_t error = cudaMemGetInfo(&freeMem, &totalMem);
164 |     if(cudaSuccess != error) {
165 |         fprintf(stderr, "ERROR RETRIEVING MEM INFO FOR CUDA DEVICE %d\n",
166 |             _ordinal);
167 |         exit(0);
168 |     }
169 | 
170 |     double memBandwidth = (_prop.memoryClockRate * 1000.0) *
171 |         (_prop.memoryBusWidth / 8 * 2) / 1.0e9;
172 | 
173 |     std::string s = stringprintf(
174 |         "%s : %8.3lf Mhz   (Ordinal %d)\n"
175 |         "%d SMs enabled. Compute Capability sm_%d%d\n"
176 |         "FreeMem: %6dMB   TotalMem: %6dMB   %2d-bit pointers.\n"
177 |         "Mem Clock: %8.3lf Mhz x %d bits   (%5.1lf GB/s)\n"
178 |         "ECC %s\n\n",
179 |         _prop.name, _prop.clockRate / 1000.0, _ordinal,
180 |         _prop.multiProcessorCount, _prop.major, _prop.minor,
181 |         (int)(freeMem / (1<< 20)), (int)(totalMem / (1<< 20)), 8 * sizeof(int*),
182 |         _prop.memoryClockRate / 1000.0, _prop.memoryBusWidth, memBandwidth,
183 |         _prop.ECCEnabled ? "Enabled" : "Disabled");
184 |     return s;
185 | }
186 | 
187 | ////////////////////////////////////////////////////////////////////////////////
188 | // CudaContext
189 | 
190 | struct ContextGroup {
191 |     CudaContext** standardContexts;
192 |     int numDevices;
193 | 
194 |     ContextGroup() {
195 |         numDevices = CudaDevice::DeviceCount();
196 |         standardContexts = new CudaContext*[numDevices];
197 |         memset(standardContexts, 0, sizeof(CudaContext*) * numDevices);
198 |     }
199 | 
200 |     CudaContext* GetByOrdinal(int ordinal) {
201 |         if(!standardContexts[ordinal]) {
202 |             CudaDevice& device = CudaDevice::ByOrdinal(ordinal);
203 |             standardContexts[ordinal] = new CudaContext(device, false, true);
204 |         }
205 |         return standardContexts[ordinal];
206 |     }
207 | 
208 |     ~ContextGroup() {
209 |         if(standardContexts) {
210 |             for(int i = 0; i < numDevices; ++i)
211 |                 delete standardContexts[i];
212 |             delete [] standardContexts;
213 |         }
214 |     }
215 | };
216 | std::auto_ptr<ContextGroup> contextGroup;
217 | 
218 | CudaContext::CudaContext(CudaDevice& device, bool newStream, bool standard) :
219 |     _event(cudaEventDisableTiming /*| cudaEventBlockingSync */),
220 |     _stream(0), _noRefCount(standard), _pageLocked(0) {
221 | 
222 |     // Create an allocator.
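    // Standard (shared, cached) contexts use the simple pass-through
    // allocator; contexts made by CreateCudaDevice*() get the caching
    // bucket allocator built in CreateDefaultAlloc() below.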
223 |     if(standard)
224 |         _alloc.reset(new CudaAllocSimple(device));
225 |     else
226 |         _alloc = CreateDefaultAlloc(device);
227 | 
228 |     if(newStream) cudaStreamCreate(&_stream);
229 |     _ownStream = newStream;
230 | 
231 |     // Allocate 4KB of page-locked memory.
232 |     cudaError_t error;
233 |     // error = cudaMallocHost((void**)&_pageLocked, 4096);
234 | 
235 |     // Allocate an auxiliary stream.
236 |     error = cudaStreamCreate(&_auxStream);
237 | }
238 | 
239 | CudaContext::~CudaContext() {
240 |     if(_pageLocked)
241 |         cudaFreeHost(_pageLocked);
242 |     if(_ownStream && _stream)
243 |         cudaStreamDestroy(_stream);
244 |     if(_auxStream)
245 |         cudaStreamDestroy(_auxStream);
246 | }
247 | 
248 | AllocPtr CudaContext::CreateDefaultAlloc(CudaDevice& device) {
249 |     intrusive_ptr<CudaAllocBuckets> alloc(new CudaAllocBuckets(device));
250 |     size_t freeMem, totalMem;
251 | 
252 |     cudaError_t error = cudaMemGetInfo(&freeMem, &totalMem);
253 |     if(cudaSuccess != error) {
254 |         fprintf(stderr, "ERROR RETRIEVING MEM INFO FOR CUDA DEVICE %d\n",
255 |             device.Ordinal());
256 |         exit(0);
257 |     }
258 | 
259 |     // Maintain a buffer of 128MB with max objects of 64MB.
260 |     alloc->SetCapacity(128<< 20, 64<< 20);
261 | 
262 |     return AllocPtr(alloc.get());
263 | }
264 | 
265 | CudaContext& CudaContext::StandardContext(int ordinal) {
266 |     bool setActive = -1 != ordinal;
267 |     if(-1 == ordinal) {
268 |         cudaError_t error = cudaGetDevice(&ordinal);
269 |         if(cudaSuccess != error) {
270 |             fprintf(stderr, "ERROR RETRIEVING CUDA DEVICE ORDINAL\n");
271 |             exit(0);
272 |         }
273 |     }
274 |     int numDevices = CudaDevice::DeviceCount();
275 | 
276 |     if(ordinal < 0 || ordinal >= numDevices) {
277 |         fprintf(stderr, "CODE REQUESTED INVALID CUDA DEVICE %d\n", ordinal);
278 |         exit(0);
279 |     }
280 | 
281 |     if(!contextGroup.get())
282 |         contextGroup.reset(new ContextGroup);
283 | 
284 |     CudaContext& context = //*contextGroup->standardContexts[ordinal];
285 |         *contextGroup->GetByOrdinal(ordinal);
286 |     if(!context.PTXVersion()) {
287 |         fprintf(stderr, "This CUDA executable was not compiled with support"
288 |             " for device %d (sm_%2d)\n", ordinal, context.ArchVersion() / 10);
289 |         exit(0);
290 |     }
291 | 
292 |     if(setActive) context.SetActive();
293 |     return context;
294 | }
295 | 
296 | ContextPtr CreateCudaDevice(int ordinal) {
297 |     CudaDevice& device = CudaDevice::ByOrdinal(ordinal);
298 |     ContextPtr context(new CudaContext(device, false, false));
299 |     return context;
300 | }
301 | ContextPtr CreateCudaDevice(int argc, char** argv, bool printInfo) {
302 |     int ordinal = 0;
303 |     if(argc >= 2 && !sscanf(argv[1], "%d", &ordinal)) {
304 |         fprintf(stderr, "INVALID COMMAND LINE ARGUMENT - NOT A CUDA ORDINAL\n");
305 |         exit(0);
306 |     }
307 |     ContextPtr context = CreateCudaDevice(ordinal);
308 |     if(!context->PTXVersion()) {
309 |         fprintf(stderr, "This CUDA executable was not compiled with support"
310 |             " for device %d (sm_%2d)\n", ordinal, context->ArchVersion() / 10);
311 |         exit(0);
312 |     }
313 | 
314 |     context->SetActive();
315 |     if(printInfo)
316 |         printf("%s\n", context->Device().DeviceString().c_str());
317 |     return context;
318 | }
319 | 
320 | ContextPtr CreateCudaDeviceStream(int ordinal) {
321 |     ContextPtr context(new CudaContext(
322 |         CudaDevice::ByOrdinal(ordinal), true, false));
323 |     return context;
324 | }
325 | 
326 | ContextPtr CreateCudaDeviceStream(int argc, char** argv, bool printInfo) {
327 |     int ordinal = 0;
328 |     if(argc >= 2 && !sscanf(argv[1], "%d", &ordinal)) {
329 |         fprintf(stderr, "INVALID COMMAND LINE ARGUMENT - NOT A CUDA ORDINAL\n");
330 |         exit(0);
331 |     }
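    // ordinal keeps its default of 0 unless a device number was parsed
    // from argv[1] above.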
332 | ContextPtr context = CreateCudaDeviceStream(ordinal); 333 | if(!context->PTXVersion()) { 334 | fprintf(stderr, "This CUDA executable was not compiled with support" 335 | " for device %d (sm_%2d)\n", ordinal, context->ArchVersion() / 10); 336 | exit(0); 337 | } 338 | 339 | context->SetActive(); 340 | if(printInfo) 341 | printf("%s\n", context->Device().DeviceString().c_str()); 342 | return context; 343 | } 344 | 345 | ContextPtr CreateCudaDeviceAttachStream(int ordinal, cudaStream_t stream) { 346 | ContextPtr context(new CudaContext( 347 | CudaDevice::ByOrdinal(ordinal), false, false)); 348 | context->_stream = stream; 349 | return context; 350 | } 351 | 352 | ContextPtr CreateCudaDeviceAttachStream(cudaStream_t stream) { 353 | int ordinal; 354 | cudaGetDevice(&ordinal); 355 | return CreateCudaDeviceAttachStream(ordinal, stream); 356 | } 357 | 358 | //////////////////////////////////////////////////////////////////////////////// 359 | // CudaAllocSimple 360 | 361 | cudaError_t CudaAllocSimple::Malloc(size_t size, void** p) { 362 | cudaError_t error = cudaSuccess; 363 | *p = 0; 364 | if(size) error = cudaMalloc(p, size); 365 | 366 | if(cudaSuccess != error) { 367 | printf("CUDA MALLOC ERROR %d\n", error); 368 | exit(0); 369 | } 370 | 371 | return error; 372 | } 373 | 374 | bool CudaAllocSimple::Free(void* p) { 375 | cudaError_t error = cudaSuccess; 376 | if(p) error = cudaFree(p); 377 | return cudaSuccess == error; 378 | } 379 | 380 | //////////////////////////////////////////////////////////////////////////////// 381 | // CudaAllocBuckets 382 | 383 | CudaAllocBuckets::CudaAllocBuckets(CudaDevice& device) : CudaAlloc(device) { 384 | _maxObjectSize = _capacity = _allocated = _committed = 0; 385 | _counter = 0; 386 | } 387 | 388 | CudaAllocBuckets::~CudaAllocBuckets() { 389 | SetCapacity(0, 0); 390 | assert(!_allocated); 391 | } 392 | 393 | bool CudaAllocBuckets::SanityCheck() const { 394 | // Iterate through all allocated objects and verify sizes. 395 | size_t allocatedCount = 0, committedCount = 0; 396 | for(AddressMap::const_iterator i = _addressMap.begin(); 397 | i != _addressMap.end(); ++i) { 398 | 399 | int bucket = i->second->bucket; 400 | size_t size = (bucket < NumBuckets) ? BucketSizes[bucket] : 0; 401 | allocatedCount += size; 402 | 403 | if(i->second->priority == _priorityMap.end()) 404 | committedCount += size; 405 | } 406 | 407 | return allocatedCount == _allocated && committedCount == _committed; 408 | } 409 | 410 | cudaError_t CudaAllocBuckets::Malloc(size_t size, void** p) { 411 | 412 | // Locate the bucket index and adjust the size of the allocation to the 413 | // bucket size. 414 | size_t allocSize = size; 415 | size_t commitSize = 0; 416 | int bucket = LocateBucket(size); 417 | if(bucket < NumBuckets) 418 | allocSize = commitSize = BucketSizes[bucket]; 419 | 420 | // Peel off an already-allocated node and reuse it. 421 | MemList& list = _memLists[bucket]; 422 | if(list.size() && list.front().priority != _priorityMap.end()) { 423 | MemList::iterator memIt = list.begin(); 424 | 425 | _priorityMap.erase(memIt->priority); 426 | memIt->priority = _priorityMap.end(); 427 | 428 | list.splice(list.end(), list, memIt); 429 | _committed += commitSize; 430 | 431 | *p = memIt->address->first; 432 | return cudaSuccess; 433 | } 434 | 435 | // Shrink if this allocation would put us over the limit. 
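    // Compact() evicts cached-but-unused nodes, oldest first, until the
    // new allocation fits within the configured capacity.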
436 | Compact(commitSize); 437 | 438 | cudaError_t error = cudaSuccess; 439 | *p = 0; 440 | if(size) error = cudaMalloc(p, allocSize); 441 | while((cudaErrorMemoryAllocation == error) && (_committed < _allocated)) { 442 | SetCapacity(_capacity - _capacity / 10, _maxObjectSize); 443 | error = cudaMalloc(p, size); 444 | } 445 | if(cudaSuccess != error) return error; 446 | 447 | MemList::iterator memIt = 448 | _memLists[bucket].insert(_memLists[bucket].end(), MemNode()); 449 | memIt->bucket = bucket; 450 | memIt->address = _addressMap.insert(std::make_pair(*p, memIt)).first; 451 | memIt->priority = _priorityMap.end(); 452 | _allocated += commitSize; 453 | _committed += commitSize; 454 | 455 | assert(SanityCheck()); 456 | 457 | return cudaSuccess; 458 | } 459 | 460 | bool CudaAllocBuckets::Free(void* p) { 461 | AddressMap::iterator it = _addressMap.find(p); 462 | if(it == _addressMap.end()) { 463 | // If the pointer was not found in the address map, cudaFree it anyways 464 | // but return false. 465 | if(p) cudaFree(p); 466 | return false; 467 | } 468 | 469 | // Because we're freeing a page, it had better not be in the priority queue. 470 | MemList::iterator memIt = it->second; 471 | assert(memIt->priority == _priorityMap.end()); 472 | 473 | // Always free allocations larger than the largest bucket 474 | it->second->priority = _priorityMap.insert( 475 | std::make_pair(_counter++ - memIt->bucket, memIt)); 476 | 477 | // Freed nodes are moved to the front, committed nodes are moved to the 478 | // end. 479 | int bucket = memIt->bucket; 480 | size_t commitSize = (bucket < NumBuckets) ? BucketSizes[bucket] : 0; 481 | 482 | MemList& list = _memLists[bucket]; 483 | list.splice(list.begin(), list, memIt); 484 | _committed -= commitSize; 485 | 486 | // Delete data that's not cached. 487 | if(NumBuckets == bucket) 488 | FreeNode(memIt); 489 | 490 | Compact(0); 491 | return true; 492 | } 493 | 494 | void CudaAllocBuckets::Clear() { 495 | Compact(_allocated); 496 | } 497 | 498 | void CudaAllocBuckets::FreeNode(CudaAllocBuckets::MemList::iterator memIt) { 499 | if(memIt->address->first) cudaFree(memIt->address->first); 500 | 501 | int bucket = memIt->bucket; 502 | size_t commitSize = (bucket < NumBuckets) ? BucketSizes[bucket] : 0; 503 | _addressMap.erase(memIt->address); 504 | if(memIt->priority != _priorityMap.end()) 505 | _priorityMap.erase(memIt->priority); 506 | else 507 | _committed -= commitSize; 508 | _allocated -= commitSize; 509 | 510 | _memLists[bucket].erase(memIt); 511 | 512 | assert(SanityCheck()); 513 | } 514 | 515 | void CudaAllocBuckets::Compact(size_t extra) { 516 | while(_allocated + extra > _capacity && _allocated > _committed) { 517 | // Walk the priority queue from beginning to end removing nodes. 518 | MemList::iterator memIt = _priorityMap.begin()->second; 519 | FreeNode(memIt); 520 | } 521 | } 522 | 523 | // Exponentially spaced buckets. 
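// Bucket sizes double at the small end and are spaced progressively more
// finely toward the large end, so rounding a request up to its bucket
// wastes only a bounded fraction of the allocation.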
524 | const size_t CudaAllocBuckets::BucketSizes[CudaAllocBuckets::NumBuckets] = {
525 |          256,       512,      1024,      2048,      4096,      8192,
526 |        12288,     16384,     24576,     32768,     49152,     65536,
527 |        98304,    131072,    174848,    218624,    262144,    349696,
528 |       436992,    524288,    655360,    786432,    917504,   1048576,
529 |      1310720,   1572864,   1835008,   2097152,   2516736,   2936064,
530 |      3355648,   3774976,   4194304,   4893440,   5592576,   6291456,
531 |      6990592,   7689728,   8388608,   9786880,  11184896,  12582912,
532 |     13981184,  15379200,  16777216,  18874368,  20971520,  23068672,
533 |     25165824,  27262976,  29360128,  31457280,  33554432,  36910080,
534 |     40265472,  43620864,  46976256,  50331648,  53687296,  57042688,
535 |     60398080,  63753472,  67108864,  72701440,  78293760,  83886080,
536 |     89478656,  95070976, 100663296, 106255872, 111848192, 117440512,
537 |    123033088, 128625408, 134217728, 143804928, 153391872, 162978816,
538 |    172565760, 182152704, 191739648, 201326592, 210913792, 220500736
539 | };
540 | 
541 | int CudaAllocBuckets::LocateBucket(size_t size) const {
542 |     if(size > _maxObjectSize || size > BucketSizes[NumBuckets - 1])
543 |         return NumBuckets;
544 | 
545 |     return (int)(std::lower_bound(BucketSizes, BucketSizes + NumBuckets, size) -
546 |         BucketSizes);
547 | }
548 | 
549 | } // namespace mgpu
550 | 
--------------------------------------------------------------------------------
/lib/mgpusort.cu:
--------------------------------------------------------------------------------
 1 | #include <cstdint>
 2 | #include <cstdio>
 3 | #include <kernels/segmentedsort.cuh>
 4 | #include "mgpucontext.cu"
 5 | #include "dllexport.h"
 6 | // #include
 7 | 
 8 | namespace mgpu{
 9 |     // Stub: avoids linking ModernGPU's string formatting utilities.
10 |     std::string stringprintf(const char* format, ...) { return std::string(); }
11 | }
12 | 
13 | namespace {
14 | 
15 | using namespace mgpu;
16 | 
17 | template <typename Tkey, typename Tval>
18 | void segsortpairs( Tkey *d_keys,
19 |                    Tval *d_vals,
20 |                    unsigned N,
21 |                    const int *d_segments,
22 |                    unsigned NumSegs,
23 |                    cudaStream_t stream )
24 | {
25 | 
26 |     ContextPtr context = CreateCudaDeviceAttachStream(stream);
27 | 
28 |     SegSortPairsFromIndices(
29 |         d_keys,
30 |         d_vals,
31 |         N,
32 |         d_segments,
33 |         NumSegs,
34 |         *context,
35 |         false);
36 | 
37 | }
38 | 
39 | } // end static namespace
40 | 
41 | 
42 | extern "C" {
43 | 
44 | #define WRAP(F, Tkey, Tval) \
45 | DLLEXPORT void segsortpairs_##F( Tkey *d_keys, \
46 |                                  Tval *d_vals, \
47 |                                  unsigned N, \
48 |                                  const int *d_segments, \
49 |                                  unsigned NumSegs, \
50 |                                  cudaStream_t stream ) \
51 | { segsortpairs(d_keys, d_vals, N, d_segments, NumSegs, stream); }
52 | 
53 | WRAP(int32, int32_t, unsigned)
54 | WRAP(int64, int64_t, unsigned)
55 | WRAP(uint32, uint32_t, unsigned)
56 | WRAP(uint64, uint64_t, unsigned)
57 | WRAP(float32, float, unsigned)
58 | WRAP(float64, double, unsigned)
59 | 
60 | 
61 | }
62 | 
--------------------------------------------------------------------------------
/test/test_sorting_libs.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function, absolute_import, division
 2 | 
 3 | """
 4 | Uses radixsort implementation from CUB which has the following license:
 5 | 
 6 | Copyright (c) 2011, Duane Merrill.  All rights reserved.
 7 | Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
 8 | Redistribution and use in source and binary forms, with or without
 9 | modification, are permitted provided that the following conditions are met:
10 | Redistributions of source code must retain the above copyright
11 | notice, this list of conditions and the following disclaimer.
12 | Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | Neither the name of the NVIDIA CORPORATION nor the 16 | names of its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written permission. 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 22 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 25 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | """ 29 | 30 | """ 31 | Uses segmented sort implementation from ModernGPU which has the following 32 | license: 33 | 34 | Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 35 | 36 | Redistribution and use in source and binary forms, with or without 37 | modification, are permitted provided that the following conditions are met: 38 | * Redistributions of source code must retain the above copyright 39 | notice, this list of conditions and the following disclaimer. 40 | * Redistributions in binary form must reproduce the above copyright 41 | notice, this list of conditions and the following disclaimer in the 42 | documentation and/or other materials provided with the distribution. 43 | * Neither the name of the NVIDIA CORPORATION nor the 44 | names of its contributors may be used to endorse or promote products 45 | derived from this software without specific prior written permission. 46 | 47 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 48 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 | ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 51 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 52 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 53 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 54 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 55 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 56 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
57 | """ 58 | 59 | 60 | import ctypes 61 | import os 62 | import platform 63 | import sys 64 | import warnings 65 | from contextlib import contextmanager 66 | 67 | import pytest 68 | 69 | import numpy as np 70 | from numba import findlib 71 | from numba.cuda.cudadrv.driver import device_pointer 72 | from numba.cuda.cudadrv.drvapi import cu_stream 73 | from numba.cuda.cudadrv.devicearray import auto_device, is_cuda_ndarray 74 | from numba import cuda 75 | 76 | 77 | def run_tests_on_hardware(): 78 | def cuda_compatible(): 79 | if sys.platform.startswith('darwin'): 80 | ver = platform.mac_ver()[0] 81 | # version string can contain two or three components 82 | major, minor = ver.split('.', 1) 83 | if '.' in minor: 84 | minor, micro = minor.split('.', 1) 85 | if (int(major), int(minor)) < (10, 9): 86 | return False 87 | 88 | is_64bits = sys.maxsize > 2**32 89 | if not is_64bits: 90 | return False 91 | 92 | return True 93 | 94 | if cuda_compatible(): 95 | return cuda.is_available() 96 | else: 97 | return False 98 | 99 | 100 | use_hardware = run_tests_on_hardware() 101 | 102 | 103 | def library_extension(): 104 | p = platform.system() 105 | if p == 'Linux': 106 | return 'so' 107 | if p == 'Windows': 108 | return 'dll' 109 | if p == 'Darwin': 110 | return 'dylib' 111 | 112 | 113 | def load_lib(libname): 114 | fullname = 'pyculib_%s.%s' % (libname, library_extension()) 115 | devpath = os.path.join(os.path.dirname(__file__), '..', 'lib') 116 | devlib = os.path.join(os.path.abspath(devpath), fullname) 117 | if os.path.exists(devlib): 118 | libpath = devlib 119 | warnings.warn('Using in-tree library %s' % libpath) 120 | else: 121 | libpath = os.path.join(findlib.get_lib_dir(), fullname) 122 | 123 | return ctypes.CDLL(libpath) 124 | 125 | 126 | radixlib = load_lib('radixsort') 127 | segsortlib = load_lib('segsort') 128 | 129 | 130 | def _bind_radixsort_double(): 131 | _argtypes = [ 132 | ctypes.c_void_p, # temp 133 | ctypes.c_uint, # count 134 | ctypes.c_void_p, # d_key 135 | ctypes.c_void_p, # d_key_alt 136 | ctypes.c_void_p, # d_vals 137 | ctypes.c_void_p, # d_vals_alt 138 | cu_stream, 139 | ctypes.c_int, # descending 140 | ctypes.c_uint, # begin_bit 141 | ctypes.c_uint, # end_bit 142 | ] 143 | dtype = np.float64 144 | fn = getattr(radixlib, "radixsort_double") 145 | fn.argtypes = _argtypes 146 | fn.restype = ctypes.c_void_p 147 | return fn 148 | 149 | 150 | def test_radixsort_bind(): 151 | # checks that the `radixsort_XYZ` symbols bind ok 152 | _known_types = ['float', 'double', 'int32', 'uint32', 'int64', 'uint64'] 153 | for x in _known_types: 154 | getattr(radixlib, "radixsort_{}".format(x)) 155 | 156 | 157 | @pytest.mark.skipif(not use_hardware, reason='No suitable hardware found.') 158 | def test_radixsort_operation(): 159 | # a crude radixsort test 160 | dtype = np.float64 161 | maxcount = 1000 162 | 163 | keys = np.random.rand(maxcount) 164 | reference = np.copy(keys) 165 | 166 | # copy to device 167 | dptr, _ = auto_device(keys) 168 | 169 | def runsort(temp, keys, vals, begin_bit=0, end_bit=None): 170 | stream = 0 171 | begin_bit = 0 172 | dtty = np.dtype(dtype) 173 | end_bit = dtty.itemsize * 8 174 | descending = 0 175 | count = maxcount 176 | if keys: 177 | count = keys.size 178 | 179 | _arysize = int(maxcount * dtty.itemsize) 180 | _sort = _bind_radixsort_double() 181 | 182 | ctx = cuda.current_context() 183 | _temp_keys = ctx.memalloc(_arysize) 184 | 185 | return _sort( 186 | temp, 187 | ctypes.c_uint(count), 188 | device_pointer(keys), 189 | device_pointer(_temp_keys), 190 | None, 191 | 
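            # The two None arguments are d_value_buf and d_value_alt_buf:
            # this is a keys-only sort.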
None, 192 | stream, 193 | descending, 194 | begin_bit, 195 | end_bit 196 | ) 197 | 198 | # tmp storage ref 199 | temp = runsort(None, None, None) 200 | 201 | # do the sort 202 | runsort(temp, dptr, None) 203 | 204 | # copy back 205 | dptr.copy_to_host(keys) 206 | 207 | # compare 208 | np.testing.assert_equal(np.sort(reference), keys) 209 | 210 | 211 | def _bind_segsort_double(): 212 | _argtypes = [ 213 | ctypes.c_void_p, # d_key 214 | ctypes.c_void_p, # d_vals 215 | ctypes.c_uint, # N 216 | ctypes.c_void_p, # segments 217 | ctypes.c_uint, # Nseg 218 | cu_stream, # stream 219 | ] 220 | fn = getattr(segsortlib, 'segsortpairs_float64') 221 | fn.argtypes = _argtypes 222 | return fn 223 | 224 | 225 | def test_segsort_bind(): 226 | # checks that the `segsort_XYZ` symbols bind ok 227 | _known_types = ['float32', 'float64', 'int32', 'uint32', 'int64', 'uint64'] 228 | for x in _known_types: 229 | getattr(segsortlib, "segsortpairs_{}".format(x)) 230 | 231 | 232 | @pytest.mark.skipif(not use_hardware, reason='No suitable hardware found.') 233 | def test_segsort_operation(): 234 | # a crude segsort test 235 | 236 | maxcount = 1000 237 | 238 | keys = np.random.rand(maxcount) 239 | reference = keys.copy() 240 | original = keys.copy() 241 | values = np.arange(keys.size, dtype=np.int32) 242 | segments = np.arange(64, maxcount, 64, dtype=np.int32) 243 | 244 | dptr_keys, _ = auto_device(keys) 245 | keys[:] = 0 246 | dptr_values, _ = auto_device(values) 247 | values[:] = 0 248 | dptr_segments, _ = auto_device(segments) 249 | 250 | def runsort(d_keys, d_vals, d_seg): 251 | _sort = _bind_segsort_double() 252 | _sort(device_pointer(d_keys), 253 | device_pointer(d_vals), 254 | d_keys.size, 255 | device_pointer(d_seg), 256 | d_seg.size, 257 | 0) 258 | 259 | runsort(dptr_keys, dptr_values, dptr_segments) 260 | 261 | # copy back 262 | dptr_keys.copy_to_host(keys) 263 | dptr_values.copy_to_host(values) 264 | 265 | # compare 266 | r = [z for z in segments] 267 | low = [0] + r 268 | high = r + [maxcount] 269 | for x, y in zip(low, high): 270 | reference[x:y].sort() 271 | 272 | np.testing.assert_equal(keys, reference) 273 | np.testing.assert_equal(original[values], reference) 274 | --------------------------------------------------------------------------------