├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── build └── .gitignore ├── gpu_library.cu ├── install.bash └── test_mul.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.i 2 | *.ii 3 | *.gpu 4 | *.ptx 5 | *.cubin 6 | *.fatbin 7 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "pybind11"] 2 | path = pybind11 3 | url = https://github.com/pybind/pybind11 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.6) 2 | find_package(CUDA REQUIRED) 3 | find_package(PythonInterp 3.6 REQUIRED) 4 | find_package(PythonLibs 3.6 REQUIRED) 5 | 6 | include_directories( 7 | ${PYTHON_INCLUDE_DIRS} 8 | ./pybind11/include 9 | ) 10 | 11 | link_directories( 12 | /usr/local/cuda/lib64 13 | ) 14 | 15 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") 16 | 17 | cuda_add_library(gpu_library SHARED 18 | gpu_library.cu) 19 | 20 | target_link_libraries(gpu_library 21 | ${PYTHON_LIBRARIES} 22 | cudart) 23 | 24 | set_target_properties(gpu_library PROPERTIES PREFIX "") 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Peter Whidden 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to 
do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pybind11-cuda 2 | 3 | Compiles out of the box with cmake 4 | 5 | Numpy integration 6 | 7 | C++ Templating for composable kernels with generic data types 8 | 9 | Originally based on https://github.com/torstem/demo-cuda-pybind11 10 | 11 | # Prerequisites 12 | 13 | Cuda installed in /usr/local/cuda 14 | 15 | Python 3.6 or greater 16 | 17 | Cmake 3.6 or greater 18 | 19 | # To build 20 | 21 | ```source install.bash``` 22 | 23 | Test it with 24 | ```python3 test_mul.py``` 25 | 26 | -------------------------------------------------------------------------------- /build/.gitignore: -------------------------------------------------------------------------------- 1 | CMakeCache.txt 2 | CMakeFiles 3 | cmake_install.cmake 4 | Makefile 5 | gpu_library.so 6 | -------------------------------------------------------------------------------- /gpu_library.cu: -------------------------------------------------------------------------------- 1 | // NOTE(review): include targets below were stripped by the dump; reconstructed from what the code uses (pybind11 numpy bindings, CUDA runtime API, std::stringstream, ceil) — confirm against upstream. 2 | #include <pybind11/pybind11.h> 3 | #include <pybind11/numpy.h> 4 | #include <cuda_runtime.h> 5 | #include <cuda.h> 6 | #include <iostream> 7 | #include <sstream> 8 | #include <cmath> 9 | // GPU kernel: element-wise in-place multiply, vec[idx] *= scalar, one thread per element. 10 | template <typename T> 11 | __global__ void kernel 12 | (T *vec, T scalar, int num_elements) 13 | { 14 |
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; 14 | if (idx < num_elements) { 15 | vec[idx] = vec[idx] * scalar; 16 | } 17 | } 18 | // Launch kernel<T> over num_elements device elements, 256 threads per block; throws std::runtime_error on launch failure. 19 | template <typename T> 20 | void run_kernel 21 | (T *vec, T scalar, int num_elements) 22 | { 23 | dim3 dimBlock(256, 1, 1); 24 | dim3 dimGrid(ceil((T)num_elements / dimBlock.x)); 25 | 26 | kernel<T><<<dimGrid, dimBlock>>> 27 | (vec, scalar, num_elements); 28 | 29 | cudaError_t error = cudaGetLastError(); 30 | if (error != cudaSuccess) { 31 | std::stringstream strstr; 32 | strstr << "run_kernel launch failed" << std::endl; 33 | strstr << "dimBlock: " << dimBlock.x << ", " << dimBlock.y << std::endl; 34 | strstr << "dimGrid: " << dimGrid.x << ", " << dimGrid.y << std::endl; 35 | strstr << cudaGetErrorString(error); 36 | throw std::runtime_error(strstr.str()); // was `throw strstr.str()`: a bare std::string is not a std::exception, so pybind11 cannot translate it; now consistent with the other throws in this file 37 | } 38 | } 39 | // Copy a 1-D numpy array to the GPU, multiply it by scalar in place, and copy the result back into the caller's buffer. Throws std::runtime_error on shape mismatch or any CUDA error. 40 | template <typename T> 41 | void map_array(pybind11::array_t<T> vec, T scalar) 42 | { 43 | pybind11::buffer_info ha = vec.request(); 44 | 45 | if (ha.ndim != 1) { 46 | std::stringstream strstr; 47 | strstr << "ha.ndim != 1" << std::endl; 48 | strstr << "ha.ndim: " << ha.ndim << std::endl; 49 | throw std::runtime_error(strstr.str()); 50 | } 51 | 52 | int size = ha.shape[0]; 53 | size_t size_bytes = size*sizeof(T); // size_t: an int byte count overflows for arrays > 2 GiB 54 | T *gpu_ptr; 55 | cudaError_t error = cudaMalloc(&gpu_ptr, size_bytes); 56 | 57 | if (error != cudaSuccess) { 58 | throw std::runtime_error(cudaGetErrorString(error)); 59 | } 60 | 61 | T* ptr = reinterpret_cast<T*>(ha.ptr); 62 | error = cudaMemcpy(gpu_ptr, ptr, size_bytes, cudaMemcpyHostToDevice); 63 | if (error != cudaSuccess) { 64 | cudaFree(gpu_ptr); // release device memory before throwing — was leaked on this path 65 | throw std::runtime_error(cudaGetErrorString(error)); 66 | } 67 | 68 | try { 69 | run_kernel<T>(gpu_ptr, scalar, size); 70 | } catch (...) { 71 | cudaFree(gpu_ptr); // a failed launch must not leak the device buffer 72 | throw; 73 | } 74 | 75 | error = cudaMemcpy(ptr, gpu_ptr, size_bytes, cudaMemcpyDeviceToHost); 76 | if (error != cudaSuccess) { 77 | cudaFree(gpu_ptr); // release device memory before throwing — was leaked on this path 78 | throw std::runtime_error(cudaGetErrorString(error)); 79 | } 80 | 81 | error = cudaFree(gpu_ptr); 82 | if (error != cudaSuccess) { 83 | throw std::runtime_error(cudaGetErrorString(error)); 84 | } 85 | } 86 | 87 | PYBIND11_MODULE(gpu_library, m) 88 | { 89 |
m.def("multiply_with_scalar", map_array<double>); // explicit <double> instantiation (stripped by the dump): an uninstantiated template cannot be passed to m.def, and float64 matches numpy's default dtype in test_mul.py 83 | } 84 | -------------------------------------------------------------------------------- /install.bash: -------------------------------------------------------------------------------- 1 | git submodule init 2 | git submodule update 3 | cd build 4 | cmake .. 5 | make 6 | export PYTHONPATH="$PWD:$PYTHONPATH" 7 | cd .. 8 | -------------------------------------------------------------------------------- /test_mul.py: -------------------------------------------------------------------------------- 1 | import gpu_library 2 | import numpy as np 3 | import time 4 | 5 | size = 100000000 6 | arr1 = np.linspace(1.0,100.0, size) 7 | arr2 = np.linspace(1.0,100.0, size) 8 | 9 | runs = 10 10 | factor = 3.0 11 | 12 | t0 = time.time() 13 | for _ in range(runs): 14 | gpu_library.multiply_with_scalar(arr1, factor) 15 | print("gpu time: " + str(time.time()-t0)) 16 | t0 = time.time() 17 | for _ in range(runs): 18 | arr2 = arr2 * factor 19 | print("cpu time: " + str(time.time()-t0)) 20 | 21 | print("results match: " + str(np.allclose(arr1,arr2))) 22 | --------------------------------------------------------------------------------