├── CMakeLists.txt
├── gpu_library.cpp
├── gpu_library.cu
└── test.py

/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 2.8)
find_package(CUDA REQUIRED)
find_package(PythonLibs 2.7 REQUIRED)

include_directories(${PYTHON_INCLUDE_DIRS})

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")

cuda_add_library(gpu_library SHARED
  gpu_library.cpp
  gpu_library.cu)

target_link_libraries(gpu_library
  ${PYTHON_LIBRARIES}
  cudart)

# No "lib" prefix so Python can `import gpu_library` directly.
set_target_properties(gpu_library PROPERTIES PREFIX "")
--------------------------------------------------------------------------------
/gpu_library.cpp:
--------------------------------------------------------------------------------
#include <cuda_runtime.h>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <sstream>
#include <stdexcept>

// Defined in gpu_library.cu: launches the scalar-multiply kernel on the
// default stream. `vec` must be a device pointer to `num_elements` doubles.
void run_kernel(double *vec, double scalar, int num_elements);

// Multiply a 1-D NumPy array of doubles by `scalar`, in place, on the GPU.
//
// Raises RuntimeError (via std::runtime_error) if the array is not 1-D or
// if any CUDA call fails.
void multiply_with_scalar(pybind11::array_t<double> vec, double scalar)
{
  auto ha = vec.request();

  // Validate the shape before touching the buffer.
  if (ha.ndim != 1) {
    std::stringstream strstr;
    strstr << "ha.ndim != 1" << std::endl;
    strstr << "ha.ndim: " << ha.ndim << std::endl;
    throw std::runtime_error(strstr.str());
  }

  // Use the actual array length; a hard-coded size (the original used 10)
  // silently corrupts or over-reads any array of a different length.
  int size = static_cast<int>(ha.shape[0]);
  if (size == 0) {
    return;  // nothing to do for an empty array
  }

  double *gpu_ptr = nullptr;
  cudaError_t error = cudaMalloc(&gpu_ptr, size * sizeof(double));
  if (error != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(error));
  }

  double *ptr = reinterpret_cast<double *>(ha.ptr);
  error = cudaMemcpy(gpu_ptr, ptr, size * sizeof(double), cudaMemcpyHostToDevice);
  if (error != cudaSuccess) {
    cudaFree(gpu_ptr);  // do not leak device memory on the error path
    throw std::runtime_error(cudaGetErrorString(error));
  }

  try {
    run_kernel(gpu_ptr, scalar, size);
  } catch (...) {
    cudaFree(gpu_ptr);
    throw;
  }

  // Blocking memcpy also synchronizes with the kernel on the default stream.
  error = cudaMemcpy(ptr, gpu_ptr, size * sizeof(double), cudaMemcpyDeviceToHost);
  if (error != cudaSuccess) {
    cudaFree(gpu_ptr);
    throw std::runtime_error(cudaGetErrorString(error));
  }

  error = cudaFree(gpu_ptr);
  if (error != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(error));
  }
}

PYBIND11_MODULE(gpu_library, m)
{
  m.def("multiply_with_scalar", multiply_with_scalar);
}
--------------------------------------------------------------------------------
/gpu_library.cu:
--------------------------------------------------------------------------------
#include <cuda_runtime.h>
#include <math.h>
#include <sstream>
#include <stdexcept>

// In-place scale: vec[i] *= scalar for i in [0, num_elements).
// Expects a 1-D launch; the guard handles the partial tail block.
__global__ void kernel(double *vec, double scalar, int num_elements)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num_elements) {
    vec[idx] = vec[idx] * scalar;
  }
}

// Launch the scale kernel on the default stream. `vec` is a device pointer.
// Throws std::runtime_error on launch failure.
void run_kernel(double *vec, double scalar, int num_elements)
{
  dim3 dimBlock(256, 1, 1);
  // Ceil-divide so the grid covers num_elements even when it is not a
  // multiple of the block size.
  dim3 dimGrid((num_elements + dimBlock.x - 1) / dimBlock.x);

  kernel<<<dimGrid, dimBlock>>>(vec, scalar, num_elements);

  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    std::stringstream strstr;
    strstr << "run_kernel launch failed" << std::endl;
    strstr << "dimBlock: " << dimBlock.x << ", " << dimBlock.y << std::endl;
    strstr << "dimGrid: " << dimGrid.x << ", " << dimGrid.y << std::endl;
    strstr << cudaGetErrorString(error);
    // Throw a std::exception-derived type so pybind11 translates it into a
    // Python RuntimeError; throwing a bare std::string (as the original did)
    // is untranslatable and terminates the interpreter.
    throw std::runtime_error(strstr.str());
  }
}
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import sys
sys.path.append('./build')

import gpu_library
import numpy

vec = numpy.linspace(0, 1, 10)

print("before: ", vec)
gpu_library.multiply_with_scalar(vec, 10)
print("after: ", vec)

# Verify against a CPU reference with a tolerance.
assert numpy.allclose(vec, numpy.linspace(0, 1, 10) * 10)
--------------------------------------------------------------------------------