├── .gitignore ├── CHANGELOG ├── CMakeLists.txt ├── LICENSE ├── README.md ├── cmake └── Modules │ └── FindOpenCL.cmake ├── doc └── api.md ├── include ├── clpp11.h ├── cupp11.h └── cxpp11_common.hpp ├── samples ├── advanced.cc ├── device_info.cc ├── simple.cc └── smallest.cc └── test ├── catch.hpp └── unit_tests.cc /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | .* 3 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | 2 | Version 9.0 (2017-10-08): 3 | - Synchronized with the CLBLast's clpp11.h header 4 | - Added custom exception class hierarchy for catching errors 5 | - Removal of custom error codes for program building in favor of throwing exceptions 6 | - Added type aliases for raw types 7 | - Several minor fixes 8 | - Added new methods to the API: 9 | * Platform::Name 10 | * Platform::Vendor 11 | * Platform::Version 12 | * Device::HasExtension 13 | * Device::SupportsFP64 14 | * Device::SupportsFP16 15 | * Device::HasExtension 16 | * Device::AMDBoardName 17 | * Device::NVIDIAComputeCapability 18 | 19 | Version 8.0 (2016-09-27): 20 | - Several minor fixes 21 | - Added new methods to the API: 22 | * GetAllPlatforms 23 | * A new constructor for the Program class based on a binary or IR string (both OpenCL and CUDA) 24 | 25 | Version 7.0 (2016-08-03): 26 | - Re-wrote the OpenCL event implementation with proper memory management 27 | - Updated some return types of device-query information to fix issues on 32-bit systems 28 | - Updated the API documentation 29 | - Refactored some functions to reduce the amount of code 30 | - Added new methods to the API: 31 | * Kernel::GetFunctionName 32 | 33 | Version 6.0 (2016-06-29): 34 | - Added the possibility to use Event pointers, adjusted the Kernel::Launch function to do so 35 | - Added a new constructor for Program based on a binary (OpenCL 
only) 36 | - Fixed a bug when OpenCL 2.0 or newer is installed but the device doesn't support it 37 | - Added new methods to the API: 38 | * Device::VersionNumber (integer version of the string-getter Device::Version) 39 | * Device::IsCPU, Device::IsGPU, Device::IsAMD, Device::IsNVIDIA, Device::IsIntel, Device::IsARM 40 | 41 | Version 5.0 (2016-04-21): 42 | - Buffers can now also be 'not owned' to disable automatic memory freeing afterwards 43 | - Made 'Buffer::Read' and 'Buffer::ReadAsync' constant methods 44 | - Added new methods to the API: 45 | * Event::WaitForCompletion (OpenCL only) 46 | * Kernel::Launch (version with OpenCL waiting list) 47 | 48 | Version 4.0 (2015-11-01): 49 | - Made 'CopyTo' and 'CopyToAsync' constant methods 50 | - Added offset support to the Buffer class (credits go to 'ielhelw') 51 | - Added unit tests for {Event, Device, Context, Queue} classes 52 | - Added compact OpenCL example 53 | - Fixed compiler warnings and errors for Windows using MSVC 54 | - Fixed several general compiler warnings 55 | - Added new methods to the API: 56 | * Device::MaxAllocSize 57 | 58 | Version 3.0 (2015-09-04): 59 | - Renamed the project from 'Claduc' into 'CLCudaAPI' 60 | - SetArgument now takes both l-value and r-value arguments 61 | - Added first version of a test infrastructure 62 | - Added new methods to the API: 63 | * Platform::NumDevices 64 | * Buffer::Buffer (a constructor with default read-write access) 65 | * Buffer::Buffer (a constructor filled with data from C++ start/end iterators) 66 | * Kernel::Launch (version with default OpenCL workgroup size) 67 | 68 | Version 2.0 (2015-07-13): 69 | - Allows device program string to be moved into Program at construction 70 | - Cleaned-up device-information methods 71 | - Added new methods to the API: 72 | * Device::CoreClock 73 | * Device::ComputeUnits 74 | * Device::MemorySize 75 | * Device::MemoryClock 76 | * Device::MemoryBusWidth 77 | * Program::GetIR 78 | * Kernel::SetArguments 79 | 80 | Version 1.0 
(2015-07-09): 81 | - Initial version 82 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | # ================================================================================================== 3 | # This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. This 4 | # project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | # width of 100 characters per line. 6 | # 7 | # Author(s): 8 | # Cedric Nugteren 9 | # 10 | # This provides a simple build infrastructure for the sample programs. The option USE_OPENCL can be 11 | # used to toggle between a CUDA or OpenCL back-end. 12 | # 13 | # ================================================================================================== 14 | # 15 | # Copyright 2015 SURFsara 16 | # 17 | # Licensed under the Apache License, Version 2.0 (the "License"); 18 | # you may not use this file except in compliance with the License. 19 | # You may obtain a copy of the License at 20 | # 21 | # http://www.apache.org/licenses/LICENSE-2.0 22 | # 23 | # Unless required by applicable law or agreed to in writing, software 24 | # distributed under the License is distributed on an "AS IS" BASIS, 25 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | # See the License for the specific language governing permissions and 27 | # limitations under the License. 
28 | # 29 | # ================================================================================================== 30 | 31 | # CMake project details 32 | cmake_minimum_required(VERSION 2.8.10) 33 | project("CLCudaAPI" CXX) 34 | set(CLCudaAPI_VERSION_MAJOR 8) 35 | set(CLCudaAPI_VERSION_MINOR 0) 36 | 37 | # ================================================================================================== 38 | 39 | # Enable tests 40 | option(ENABLE_TESTS "Build test-suite" ON) 41 | 42 | # Select between OpenCL and CUDA back-end 43 | option(USE_OPENCL "Use OpenCL instead of CUDA" ON) 44 | if(USE_OPENCL) 45 | message("-- Building samples with OpenCL") 46 | add_definitions(-DUSE_OPENCL) 47 | else() 48 | message("-- Building samples with CUDA") 49 | endif() 50 | 51 | # ================================================================================================== 52 | 53 | # Compiler-version check (requires at least CMake 2.8.10) 54 | if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) 55 | if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) 56 | message(FATAL_ERROR "GCC version must be at least 4.7") 57 | endif() 58 | elseif(CMAKE_CXX_COMPILER_ID STREQUAL Clang) 59 | if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3) 60 | message(FATAL_ERROR "Clang version must be at least 3.3") 61 | endif() 62 | elseif(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang) 63 | if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) 64 | message(FATAL_ERROR "AppleClang version must be at least 5.0") 65 | endif() 66 | elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel) 67 | if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0) 68 | message(FATAL_ERROR "ICC version must be at least 14.0") 69 | endif() 70 | elseif(MSVC) 71 | if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0) 72 | message(FATAL_ERROR "MS Visual Studio version must be at least 18.0") 73 | endif() 74 | endif() 75 | 76 | # C++ compiler settings 77 | if(MSVC) 78 | set(FLAGS "/Ox") 79 | else() 80 | set(FLAGS "-O3 -std=c++11 -Wall -Wno-comment") 81 | if(CMAKE_CXX_COMPILER_ID 
STREQUAL GNU) 82 | if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0.0) 83 | # GCC does not support attributes on template arguments 84 | # in particular we hit this with the alignment attributes on cl_XXX types 85 | # which are then used to instantiate various templates in CLBlast 86 | set(FLAGS "${FLAGS} -Wno-ignored-attributes") 87 | endif() 88 | endif() 89 | endif() 90 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") 91 | 92 | # ================================================================================================== 93 | 94 | # Package scripts location 95 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") 96 | 97 | # Requires CUDA and OpenCL. The latter is found through the included "FindOpenCL.cmake". 98 | if(USE_OPENCL) 99 | find_package(OpenCL REQUIRED) 100 | else() 101 | find_package(CUDA REQUIRED) 102 | endif() 103 | 104 | # ================================================================================================== 105 | 106 | # Include directories: C++11 headers and OpenCL/CUDA includes 107 | include_directories(${CLCudaAPI_SOURCE_DIR}/include) 108 | if(USE_OPENCL) 109 | include_directories(${OPENCL_INCLUDE_DIRS}) 110 | else() 111 | include_directories(${CUDA_INCLUDE_DIRS}) 112 | endif() 113 | 114 | # Link directories: CUDA toolkit 115 | if(USE_OPENCL) 116 | 117 | else() 118 | link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64) 119 | endif() 120 | 121 | # ================================================================================================== 122 | 123 | # Adds the sample programs 124 | set(SAMPLE_PROGRAMS device_info simple advanced smallest) 125 | foreach(SAMPLE ${SAMPLE_PROGRAMS}) 126 | add_executable(${SAMPLE} samples/${SAMPLE}.cc) 127 | if(USE_OPENCL) 128 | target_link_libraries(${SAMPLE} ${OPENCL_LIBRARIES}) 129 | else() 130 | target_link_libraries(${SAMPLE} cuda nvrtc) 131 | endif() 132 | install(TARGETS ${SAMPLE} DESTINATION bin) 133 | endforeach() 134 | 135 | # 
================================================================================================== 136 | 137 | # Optional: Enable inclusion of the test-suite 138 | if(ENABLE_TESTS) 139 | enable_testing() 140 | include_directories(${CLCudaAPI_SOURCE_DIR}/test) 141 | add_executable(unit_tests test/unit_tests.cc) 142 | if(USE_OPENCL) 143 | target_link_libraries(unit_tests ${OPENCL_LIBRARIES}) 144 | else() 145 | target_link_libraries(unit_tests cuda nvrtc) 146 | endif() 147 | add_test(unit_tests unit_tests) 148 | endif() 149 | 150 | # ================================================================================================== 151 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2015 SURFsara 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | CLCudaAPI: A portable high-level API with CUDA or OpenCL back-end 3 | ================ 4 | 5 | CLCudaAPI provides a C++ interface to the OpenCL API and/or CUDA API. This interface is high-level: all the details of setting up an OpenCL platform and device are handled automatically, as well as for example OpenCL and CUDA memory management. 
A similar high-level API is also provided by Khronos's `cl.hpp`, so why would someone use CLCudaAPI instead? The main reason is portability: CLCudaAPI provides two header files which both implement the exact same API, but with a different back-end. This allows __porting between OpenCL and CUDA by simply changing the header file!__ 6 | 7 | CLCudaAPI is written in C++11 and wraps CUDA and OpenCL objects in smart pointers, thus handling memory management automatically. It uses the CUDA driver API, since this is the closest to the OpenCL API, but it uses the OpenCL terminology, since this is the most generic. It compiles OpenCL and/or CUDA kernels at run-time, possible in CUDA only since release 7.0. CLCudaAPI handles the host API only: it still requires two versions of the kernel (although some simple defines could omit this requirement). 8 | 9 | 10 | What does it look like? 11 | ------------- 12 | 13 | To get started, include either of the two headers: 14 | 15 | ```c++ 16 | #include "clpp11.h" 17 | // or: 18 | #include "cupp11.h" 19 | ``` 20 | 21 | Here is a simple example of setting-up platform 0 and selecting device 2: 22 | 23 | ```c++ 24 | auto platform = CLCudaAPI::Platform(0); 25 | auto device = CLCudaAPI::Device(platform, 2); 26 | ``` 27 | 28 | Next, we'll create a CUDA/OpenCL context and a queue (== CUDA stream) on this device: 29 | 30 | ```c++ 31 | auto context = CLCudaAPI::Context(device); 32 | auto queue = CLCudaAPI::Queue(context, device); 33 | ``` 34 | 35 | And, once the context and queue are created, we can allocate and upload data to the device: 36 | 37 | ```c++ 38 | auto host_mem = std::vector<float>(size); 39 | auto device_mem = CLCudaAPI::Buffer<float>(context, size); 40 | device_mem.Write(queue, size, host_mem); 41 | ``` 42 | 43 | Further examples are included in the `samples` folder. To start with CLCudaAPI, check out `samples/simple.cc`, which shows how to compile and launch a simple kernel. 
The full [CLCudaAPI API reference](doc/api.md) is also available in the current repository. 44 | 45 | 46 | Why would I use CLCudaAPI? 47 | ------------- 48 | 49 | The main reasons to use CLCudaAPI are: 50 | 51 | * __Portability__: the CUDA and OpenCL CLCudaAPI headers implement the exact same API. 52 | * __Memory management__: smart pointers allocate and free memory automatically. 53 | * __Error checking__: all CUDA and OpenCL API calls are automatically checked for errors. 54 | * __Abstraction__: CLCudaAPI provides a higher-level interface than OpenCL, CUDA, and `cl.hpp`. 55 | * __Easy to use__: simply ship two OS/hardware-independent header files, no compilation needed. 56 | * __Low overhead__: all function calls are automatically in-lined by the compiler. 57 | * __Native compiler__: CLCudaAPI code can be compiled with a normal C++ compiler; there is no need to use `nvcc`. 58 | 59 | Nevertheless, there are also several cases when CLCudaAPI is not suitable: 60 | 61 | * When fine-grained control is desired: CLCudaAPI makes abstractions to certain OpenCL/CUDA handles and settings. 62 | * When unsupported features are desired: only the most common cases are currently implemented. Although this is not a fundamental limitation, it is a practical one. For example, OpenGL interoperability and CUDA constant/texture memory are not supported. 63 | * When run-time compilation is not an option: e.g. when compilation overhead is too high. 64 | 65 | What are the pre-requisites? 66 | ------------- 67 | 68 | The requirements to use the CLCudaAPI headers are: 69 | 70 | * CUDA 7.0 or higher 71 | * OpenCL 1.1 or higher 72 | * A C++11 compiler (e.g. GCC 4.7, Clang 3.3, MSVC 2015 or newer) 73 | 74 | If you also want to compile the samples and tests using the provided infrastructure, you'll also need: 75 | 76 | * CMake 2.8.10 or higher 77 | 78 | 79 | How do I compile the included examples with CMake? 
80 | ------------- 81 | 82 | Use CMake to create an out-of-source build: 83 | 84 | ```bash 85 | mkdir build 86 | cd build 87 | cmake -DUSE_OPENCL=ON .. 88 | make 89 | ``` 90 | 91 | Replace `-DUSE_OPENCL=ON` with `-DUSE_OPENCL=OFF` to use CUDA instead of OpenCL as a back-end. After compilation, the `build` folder will contain a binary for each of the sample programs included in the `samples` subfolder. 92 | 93 | 94 | How do I compile the included test-suite with CMake? 95 | ------------- 96 | 97 | Compiling the examples (see above) will also compile the tests (unless `-DENABLE_TESTS=OFF` is set). The tests will use either the OpenCL or CUDA back-end, similar to the samples. After compilation, the tests can be run using CTest or as follows: 98 | 99 | ```bash 100 | ./unit_tests 101 | ``` 102 | 103 | 104 | FAQ 105 | ------------- 106 | 107 | > Q: __After I include the CLCudaAPI CUDA header, the linker finds an undefined reference to `nvrtcGetErrorString`. What should I do?__ 108 | > 109 | > A: You need to link against the NVIDIA Run-Time Compilation Library (NVRTC). For example, pass `-lnvrtc -L/opt/cuda/lib64` to the compiler. 110 | -------------------------------------------------------------------------------- /cmake/Modules/FindOpenCL.cmake: -------------------------------------------------------------------------------- 1 | 2 | # ================================================================================================== 3 | # This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. This 4 | # project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | # width of 100 characters per line. 
6 | # 7 | # Author(s): 8 | # Cedric Nugteren 9 | # 10 | # Defines the following variables: 11 | # OPENCL_FOUND Boolean holding whether or not the OpenCL library was found 12 | # OPENCL_INCLUDE_DIRS The OpenCL include directory 13 | # OPENCL_LIBRARIES The OpenCL library 14 | # 15 | # In case OpenCL is not installed in the default directory, set the OPENCL_ROOT variable to point to 16 | # the root of OpenCL, such that 'OpenCL/cl.h' or 'CL/cl.h' can be found in $OPENCL_ROOT/include. 17 | # This can either be done using an environmental variable (e.g. export OPENCL_ROOT=/path/to/opencl) 18 | # or using a CMake variable (e.g. cmake -DOPENCL_ROOT=/path/to/opencl ..). 19 | # 20 | # ================================================================================================== 21 | # 22 | # Copyright 2015 SURFsara 23 | # 24 | # Licensed under the Apache License, Version 2.0 (the "License"); 25 | # you may not use this file except in compliance with the License. 26 | # You may obtain a copy of the License at 27 | # 28 | # http://www.apache.org/licenses/LICENSE-2.0 29 | # 30 | # Unless required by applicable law or agreed to in writing, software 31 | # distributed under the License is distributed on an "AS IS" BASIS, 32 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 33 | # See the License for the specific language governing permissions and 34 | # limitations under the License. 
35 | # 36 | # ================================================================================================== 37 | 38 | # Sets the possible install locations 39 | set(OPENCL_HINTS 40 | ${OPENCL_ROOT} 41 | $ENV{OPENCL_ROOT} 42 | $ENV{AMDAPPSDKROOT} 43 | $ENV{CUDA_PATH} 44 | $ENV{INTELOCLSDKROOT} 45 | $ENV{NVSDKCOMPUTE_ROOT} 46 | $ENV{ATISTREAMSDKROOT} 47 | ) 48 | set(OPENCL_PATHS 49 | /usr/local/cuda 50 | /opt/cuda 51 | /opt/intel/opencl 52 | /usr 53 | /usr/local 54 | ) 55 | 56 | # Finds the include directories 57 | find_path(OPENCL_INCLUDE_DIRS 58 | NAMES OpenCL/cl.h CL/cl.h 59 | HINTS ${OPENCL_HINTS} 60 | PATH_SUFFIXES include OpenCL/common/inc inc include/x86_64 include/x64 61 | PATHS ${OPENCL_PATHS} 62 | DOC "OpenCL include header OpenCL/cl.h or CL/cl.h" 63 | ) 64 | mark_as_advanced(OPENCL_INCLUDE_DIRS) 65 | 66 | # Finds the library 67 | find_library(OPENCL_LIBRARIES 68 | NAMES OpenCL 69 | HINTS ${OPENCL_HINTS} 70 | PATH_SUFFIXES lib lib64 lib/x86_64 lib/x86_64/sdk lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64 71 | PATHS ${OPENCL_PATHS} 72 | DOC "OpenCL library" 73 | ) 74 | mark_as_advanced(OPENCL_LIBRARIES) 75 | 76 | # ================================================================================================== 77 | 78 | # Notification messages 79 | if(NOT OPENCL_INCLUDE_DIRS) 80 | message(STATUS "Could NOT find 'OpenCL/cl.h' or 'CL/cl.h', install OpenCL or set OPENCL_ROOT") 81 | endif() 82 | if(NOT OPENCL_LIBRARIES) 83 | message(STATUS "Could NOT find OpenCL library, install it or set OPENCL_ROOT") 84 | endif() 85 | 86 | # Determines whether or not OpenCL was found 87 | include(FindPackageHandleStandardArgs) 88 | find_package_handle_standard_args(OpenCL DEFAULT_MSG OPENCL_INCLUDE_DIRS OPENCL_LIBRARIES) 89 | 90 | # ================================================================================================== 91 | -------------------------------------------------------------------------------- /doc/api.md: 
-------------------------------------------------------------------------------- 1 | CLCudaAPI: API reference 2 | ================ 3 | 4 | This file describes the high-level API for both the CUDA and OpenCL back-end of the CLCudaAPI headers. On top of the described API, each class has a constructor which takes the regular OpenCL or CUDA data-type and transforms it into a CLCudaAPI class. Furthermore, each class also implements a `()` operator which returns the regular OpenCL or CUDA data-type. 5 | 6 | 7 | CLCudaAPI::Event 8 | ------------- 9 | 10 | Constructor(s): 11 | 12 | * `Event()`: 13 | Creates a new event, to be used for example when timing kernels. 14 | 15 | Public method(s): 16 | 17 | * `void WaitForCompletion() const`: 18 | Waits for completion of an event (OpenCL) or does nothing (CUDA). 19 | 20 | * `float GetElapsedTime() const`: 21 | Retrieves the elapsed time in milliseconds of the last recorded event (e.g. a device kernel). This method first makes sure that the last event is finished before computing the elapsed time. 22 | 23 | 24 | CLCudaAPI::Platform 25 | ------------- 26 | 27 | Constructor(s): 28 | 29 | * `Platform(const size_t platform_id)`: 30 | When using the OpenCL back-end, this initializes a new OpenCL platform (e.g. AMD SDK, Intel SDK, NVIDIA SDK) specified by the integer `platform_id`. When using the CUDA back-end, this initializes the CUDA driver API. The `platform_id` argument is ignored: there is only one platform. 31 | 32 | Public method(s): 33 | 34 | * `std::string Name() const`: 35 | Retrieves the name of the platform. 36 | 37 | * `std::string Vendor() const`: 38 | Retrieves the name of the vendor of the platform. 39 | 40 | * `std::string Version() const`: 41 | Retrieves which version of an OpenCL platform is used (OpenCL back-end) or which CUDA driver is used (CUDA back-end). 42 | 43 | * `size_t NumDevices() const`: 44 | Retrieves the number of devices on this platform. 
45 | 46 | Non-member function(s): 47 | 48 | * `std::vector GetAllPlatforms()`: 49 | Retrieves a vector containing all available platforms. 50 | 51 | 52 | CLCudaAPI::Device 53 | ------------- 54 | 55 | Constructor(s): 56 | 57 | * `Device(const Platform &platform, const size_t device_id)`: 58 | Initializes a new OpenCL or CUDA device on the specified platform. The `device_id` defines which device should be selected. 59 | 60 | Public method(s): 61 | 62 | * `RawPlatformID PlatformID() const`: 63 | Retrieves the raw `cl_platform_id` ID of the platform used (OpenCL back-end) or a 0 `size_t` in case of the CUDA back-end. 64 | 65 | * `std::string Version() const`: 66 | Retrieves which version of the OpenCL standard is supported (OpenCL back-end) or which CUDA driver is used (CUDA back-end). 67 | 68 | * `size_t VersionNumber() const`: 69 | The same as the `Version()` method, but without text, just the numeric value. 70 | 71 | * `std::string Vendor() const`: 72 | Retrieves the name of the vendor of the device. 73 | 74 | * `std::string Name() const`: 75 | Retrieves the name of the device. 76 | 77 | * `std::string Type() const`: 78 | Retrieves the type of the devices. Possible return values are 'CPU', 'GPU', 'accelerator', or 'default'. 79 | 80 | * `size_t MaxWorkGroupSize() const`: 81 | Retrieves the maximum total number of threads in an OpenCL work-group or CUDA thread-block. 82 | 83 | * `size_t MaxWorkItemDimensions() const`: 84 | Retrieves the maximum number of dimensions (e.g. 2D or 3D) in an OpenCL work-group or CUDA thread-block. 85 | 86 | * `unsigned long LocalMemSize() const`: 87 | Retrieves the maximum amount of on-chip scratchpad memory ('local memory') available to a single OpenCL work-group or CUDA thread-block. 88 | 89 | * `std::string Capabilities() const`: 90 | In case of the OpenCL back-end, this returns a list of the OpenCL extensions supported. For CUDA, this returns the device capability (e.g. SM 3.5). 
91 | 92 | * `bool HasExtension(const std::string &extension) const`: 93 | In case of the OpenCL back-end, queries whether a certain extension is present (as reported by `Capabilities()`). For CUDA, this always returns false. 94 | 95 | * `bool SupportsFP64() const`: 96 | Returns whether or not double-precision floating-point 64-bit is supported by the device. 97 | 98 | * `bool SupportsFP16() const`: 99 | Returns whether or not half-precision floating-point 16-bit is supported by the device. 100 | 101 | * `size_t CoreClock() const`: 102 | Retrieves the device's core clock frequency in MHz. 103 | 104 | * `size_t ComputeUnits() const`: 105 | Retrieves the number of compute units (OpenCL terminology) or multi-processors (CUDA terminology) in the device. 106 | 107 | * `unsigned long MemorySize() const`: 108 | Retrieves the total global memory size. 109 | 110 | * `unsigned long MaxAllocSize() const`: 111 | Retrieves the maximum amount of allocatable global memory per allocation. 112 | 113 | * `size_t MemoryClock() const`: 114 | Retrieves the device's memory clock frequency in MHz (CUDA back-end) or 0 (OpenCL back-end). 115 | 116 | * `size_t MemoryBusWidth() const`: 117 | Retrieves the device's memory bus-width in bits (CUDA back-end) or 0 (OpenCL back-end). 118 | 119 | * `bool IsLocalMemoryValid(const size_t local_mem_usage) const`: 120 | Given a requested amount of local on-chip scratchpad memory, this method returns whether or not this is a valid configuration for this particular device. 121 | 122 | * `bool IsThreadConfigValid(const std::vector &local) const`: 123 | Given a requested OpenCL work-group or CUDA thread-block configuration `local`, this method returns whether or not this is a valid configuration for this particular device. 124 | 125 | * `bool IsCPU() const`: 126 | Determines whether this device is of the CPU type. 127 | 128 | * `bool IsGPU() const`: 129 | Determines whether this device is of the GPU type. 
130 | 131 | * `bool IsAMD() const`: 132 | Determines whether this device is of the AMD brand. 133 | 134 | * `bool IsNVIDIA() const`: 135 | Determines whether this device is of the NVIDIA brand. 136 | 137 | * `bool IsIntel() const`: 138 | Determines whether this device is of the Intel brand. 139 | 140 | * `bool IsARM() const`: 141 | Determines whether this device is of the ARM brand. 142 | 143 | * `std::string AMDBoardName() const`: 144 | Returns the value of `CL_DEVICE_BOARD_NAME_AMD` if present. For the CUDA back-end, this always returns an empty string. 145 | 146 | * `std::string NVIDIAComputeCapability() const`: 147 | Returns the compute capability of an NVIDIA GPU, e.g. SM3.5. For the CUDA back-end, this returns the same as the `Capabilities()` method. 148 | 149 | 150 | CLCudaAPI::Context 151 | ------------- 152 | 153 | Constructor(s): 154 | 155 | * `Context(const Device &device)`: 156 | Initializes a new context on a given device. On top of this context, CLCudaAPI can create new programs, queues and buffers. 157 | 158 | 159 | CLCudaAPI::Program 160 | ------------- 161 | 162 | Constructor(s): 163 | 164 | * `Program(const Context &context, std::string source)`: 165 | Creates a new OpenCL or CUDA program on a given context. A program is a collection of one or more device kernels which form a single compilation unit together. The device-code is passed as a string. Such a string can for example be generated, hard-coded, or read from file at run-time. If passed as an r-value (e.g. using `std::move`), the device-code string is moved instead of copied into the class' member variable. 166 | 167 | * `Program(const Device &device, const Context &context, const std::string& binary)`: 168 | As above, but now the program is constructed based on an already compiled IR or binary of the device kernels. This requires a context corresponding to the binary. This constructor for OpenCL is based on the `clCreateProgramWithBinary` function. 
169 | 170 | Public method(s): 171 | 172 | * `void Build(const Device &device, std::vector &options)`: 173 | This method invokes the OpenCL or CUDA compiler to build the program at run-time for a specific target device. Depending on the back-end, specific options can be passed to the compiler in the form of the `options` vector. Compilation errors generated by the run-time compiler result in an `std::runtime_error` exception, which can be caught. 174 | 175 | * `std::string GetBuildInfo(const Device &device) const`: 176 | Retrieves all compiler warnings and errors generated by the build process. 177 | 178 | * `std::string GetIR() const`: 179 | Retrieves the intermediate representation (IR) of the compiled program. When using the CUDA back-end, this returns the PTX-code. For the OpenCL back-end, this returns either an IR (e.g. PTX) or a binary. This is different per OpenCL implementation. 180 | 181 | CLCudaAPI::Queue 182 | ------------- 183 | 184 | Constructor(s): 185 | 186 | * `Queue(const Context &context, const Device &device)`: 187 | Creates a new queue to enqueue kernel launches and device memory operations. This is analogous to an OpenCL command queue or a CUDA stream. 188 | 189 | Public method(s): 190 | 191 | * `void Finish(Event &event) const` and `void Finish() const`: 192 | Completes all tasks in the queue. In the case of the CUDA back-end, the first form additionally synchronizes on the specified event. 193 | 194 | * `Context GetContext() const`: 195 | Retrieves the CUDA/OpenCL context associated with this queue. 196 | 197 | * `Device GetDevice() const`: 198 | Retrieves the CUDA/OpenCL device associated with this queue. 199 | 200 | 201 | template \ CLCudaAPI::BufferHost 202 | ------------- 203 | 204 | Constructor(s): 205 | 206 | * `BufferHost(const Context &, const size_t size)`: 207 | Initializes a new linear 1D memory buffer on the host of type T. This buffer is allocated with a fixed number of elements given by `size`. 
Note that the buffer's elements are not initialized. In the case of the CUDA back-end, this host buffer is implemented as page-locked memory. The OpenCL back-end uses a regular `std::vector` container. 208 | 209 | Public method(s): 210 | 211 | * `size_t GetSize() const`: 212 | Retrieves the allocated size in bytes. 213 | 214 | * Several `std::vector` methods: 215 | Adds some compatibility with `std::vector` by implementing the `size`, `begin`, `end`, `operator[]`, and `data` methods. 216 | 217 | 218 | template \ CLCudaAPI::Buffer 219 | ------------- 220 | 221 | Constants(s): 222 | 223 | * `enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned }` 224 | Defines the different access types for the buffers. Writing to a read-only buffer will throw an error, as will reading from a write-only buffer. A buffer which is of type `kNotOwned` will not be automatically freed afterwards. 225 | 226 | Constructor(s): 227 | 228 | * `Buffer(const Context &context, const BufferAccess access, const size_t size)`: 229 | Initializes a new linear 1D memory buffer on the device of type T. This buffer is allocated with a fixed number of elements given by `size`. Note that the buffer's elements are not initialized. The buffer can be read-only, write-only, read-write, or not-owned as specified by the `access` argument. 230 | 231 | * `Buffer(const Context &context, const size_t size)`: 232 | As above, but now defaults to read-write access. 233 | 234 | * `template Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end)`: 235 | Creates a new buffer based on data in a linear C++ container (such as `std::vector`). The size is determined by the difference between the end and start iterators. This method both creates a new buffer and writes data to it. It synchronises the queue before returning. 
236 | 237 | Public method(s): 238 | 239 | * `void ReadAsync(const Queue &queue, const size_t size, T* host) const` and 240 | `void ReadAsync(const Queue &queue, const size_t size, std::vector &host)` and 241 | `void ReadAsync(const Queue &queue, const size_t size, BufferHost &host)`: 242 | Copies `size` elements from the current device buffer to the target host buffer. The host buffer has to be pre-allocated with a size of at least `size` elements. This method is a-synchronous: it can return before the copy operation is completed. 243 | 244 | * `void Read(const Queue &queue, const size_t size, T* host) const` and 245 | `void Read(const Queue &queue, const size_t size, std::vector &host)` and 246 | `void Read(const Queue &queue, const size_t size, BufferHost &host)`: 247 | As above, but now completes the operation before returning. 248 | 249 | * `void WriteAsync(const Queue &queue, const size_t size, const T* host)` and 250 | `void WriteAsync(const Queue &queue, const size_t size, const std::vector &host)` and 251 | `void WriteAsync(const Queue &queue, const size_t size, const BufferHost &host)`: 252 | Copies `size` elements from a host buffer to the current device buffer. The device buffer has to be pre-allocated with a size of at least `size` elements. This method is a-synchronous: it can return before the copy operation is completed. 253 | 254 | * `void Write(const Queue &queue, const size_t size, const T* host)` and 255 | `void Write(const Queue &queue, const size_t size, const std::vector &host)` and 256 | `void Write(const Queue &queue, const size_t size, const BufferHost &host)`: 257 | As above, but now completes the operation before returning. 258 | 259 | * `void CopyToAsync(const Queue &queue, const size_t size, const Buffer &destination) const`: 260 | Copies `size` elements from the current device buffer to another device buffer given by `destination`. The destination buffer has to be pre-allocated with a size of at least `size` elements. 
This method is a-synchronous: it can return before the copy operation is completed. 261 | 262 | * `void CopyTo(const Queue &queue, const size_t size, const Buffer &destination) const`: 263 | As above, but now completes the operation before returning. 264 | 265 | * `size_t GetSize() const`: 266 | Retrieves the allocated size in bytes. 267 | 268 | 269 | CLCudaAPI::Kernel 270 | ------------- 271 | 272 | Constructor(s): 273 | 274 | * `Kernel(const Program &program, const std::string &name)`: 275 | Retrieves a new kernel from a compiled program. The kernel name is given as the string `name`. 276 | 277 | Public method(s): 278 | 279 | * `template void SetArgument(const size_t index, const T &value)`: 280 | Method to set a kernel argument (l-value or r-value). The argument `index` specifies the position in the list of kernel arguments. The argument `value` can also be a `CLCudaAPI::Buffer`. 281 | 282 | * `template void SetArguments(Args&... args)`: As above, but now sets all arguments in one go, starting at index 0. This overwrites any previous arguments (if any). The parameter pack `args` takes any number of arguments of different types, including `CLCudaAPI::Buffer`. 283 | 284 | * `unsigned long LocalMemUsage(const Device &device) const`: 285 | Retrieves the amount of on-chip scratchpad memory (local memory in OpenCL, shared memory in CUDA) required by this specific kernel. 286 | 287 | * `std::string GetFunctionName() const `: 288 | Retrieves the name of the kernel (OpenCL only). 289 | 290 | * `Launch(const Queue &queue, const std::vector &global, const std::vector &local, Event &event)`: 291 | Launches a kernel onto the specified queue. This kernel launch is a-synchronous: this method can return before the device kernel is completed. The total number of threads launched is equal to the `global` vector; the number of threads per OpenCL work-group or CUDA thread-block is given by the `local` vector. The elapsed time is recorded into the `event` argument. 
292 | 293 | * `Launch(const Queue &queue, const std::vector &global, const std::vector &local, Event &event, std::vector& waitForEvents)`: As above, but now this kernel is only launched after the other specified events have finished (OpenCL only). If `local` is empty, the kernel-size is determined automatically (OpenCL only). 294 | 295 | -------------------------------------------------------------------------------- /include/clpp11.h: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API 11 | // calls. The main benefits are increased abstraction, automatic memory management, and portability. 12 | // Portability here means that a similar header exists for CUDA with the same classes and 13 | // interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change. 14 | // 15 | // This is version 9.0 of CLCudaAPI. 16 | // 17 | // ================================================================================================= 18 | // 19 | // Copyright 2015 SURFsara 20 | // 21 | // Licensed under the Apache License, Version 2.0 (the "License"); 22 | // you may not use this file except in compliance with the License. 
23 | // You may obtain a copy of the License at 24 | // 25 | // http://www.apache.org/licenses/LICENSE-2.0 26 | // 27 | // Unless required by applicable law or agreed to in writing, software 28 | // distributed under the License is distributed on an "AS IS" BASIS, 29 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 30 | // See the License for the specific language governing permissions and 31 | // limitations under the License. 32 | // 33 | // ================================================================================================= 34 | 35 | #ifndef CLCUDAAPI_CLPP11_H_ 36 | #define CLCUDAAPI_CLPP11_H_ 37 | 38 | // C++ 39 | #include // std::copy 40 | #include // std::string 41 | #include // std::vector 42 | #include // std::shared_ptr 43 | #include // std::accumulate 44 | #include // std::strlen 45 | 46 | // OpenCL 47 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings 48 | #if defined(__APPLE__) || defined(__MACOSX) 49 | #include 50 | #else 51 | #include 52 | #endif 53 | 54 | // Exception classes 55 | #include "cxpp11_common.hpp" 56 | 57 | namespace CLCudaAPI { 58 | // ================================================================================================= 59 | 60 | // Represents a runtime error returned by an OpenCL API function 61 | class CLCudaAPIError : public ErrorCode { 62 | public: 63 | explicit CLCudaAPIError(cl_int status, const std::string &where): 64 | ErrorCode(status, where, "OpenCL error: " + where + ": " + 65 | std::to_string(static_cast(status))) { 66 | } 67 | 68 | static void Check(const cl_int status, const std::string &where) { 69 | if (status != CL_SUCCESS) { 70 | throw CLCudaAPIError(status, where); 71 | } 72 | } 73 | 74 | static void CheckDtor(const cl_int status, const std::string &where) { 75 | if (status != CL_SUCCESS) { 76 | fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPIError(status, where).what()); 77 | } 78 | } 79 | }; 80 | 81 | // Exception returned when 
building a program 82 | using CLCudaAPIBuildError = CLCudaAPIError; 83 | 84 | // ================================================================================================= 85 | 86 | // Error occurred in OpenCL 87 | #define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call)) 88 | 89 | // Error occurred in OpenCL (no-exception version for destructors) 90 | #define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call)) 91 | 92 | // ================================================================================================= 93 | 94 | // C++11 version of 'cl_event' 95 | class Event { 96 | public: 97 | 98 | // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere 99 | explicit Event(const cl_event event): 100 | event_(new cl_event) { 101 | *event_ = event; 102 | } 103 | 104 | // Regular constructor with memory management 105 | explicit Event(): 106 | event_(new cl_event, [](cl_event* e) { 107 | if (*e) { CheckErrorDtor(clReleaseEvent(*e)); } 108 | delete e; 109 | }) { 110 | *event_ = nullptr; 111 | } 112 | 113 | // Waits for completion of this event 114 | void WaitForCompletion() const { 115 | CheckError(clWaitForEvents(1, &(*event_))); 116 | } 117 | 118 | // Retrieves the elapsed time of the last recorded event. 119 | // (Note that there is a bug in Apple's OpenCL implementation of the 'clGetEventProfilingInfo' function: 120 | // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx) 121 | // However, in our case the reply size is fixed to be cl_ulong, so we are not affected. 
122 | float GetElapsedTime() const { 123 | WaitForCompletion(); 124 | const auto bytes = sizeof(cl_ulong); 125 | auto time_start = cl_ulong{0}; 126 | CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr)); 127 | auto time_end = cl_ulong{0}; 128 | CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr)); 129 | return static_cast(time_end - time_start) * 1.0e-6f; 130 | } 131 | 132 | // Accessor to the private data-member 133 | cl_event& operator()() { return *event_; } 134 | const cl_event& operator()() const { return *event_; } 135 | cl_event* pointer() { return &(*event_); } 136 | const cl_event* pointer() const { return &(*event_); } 137 | private: 138 | std::shared_ptr event_; 139 | }; 140 | 141 | // Pointer to an OpenCL event 142 | using EventPointer = cl_event*; 143 | 144 | // ================================================================================================= 145 | 146 | // Raw platform ID type 147 | using RawPlatformID = cl_platform_id; 148 | 149 | // C++11 version of 'cl_platform_id' 150 | class Platform { 151 | public: 152 | 153 | // Constructor based on the regular OpenCL data-type 154 | explicit Platform(const cl_platform_id platform): platform_(platform) { } 155 | 156 | // Initializes the platform 157 | explicit Platform(const size_t platform_id) { 158 | auto num_platforms = cl_uint{0}; 159 | CheckError(clGetPlatformIDs(0, nullptr, &num_platforms)); 160 | if (num_platforms == 0) { 161 | throw RuntimeError("Platform: no platforms found"); 162 | } 163 | if (platform_id >= num_platforms) { 164 | throw RuntimeError("Platform: invalid platform ID "+std::to_string(platform_id)); 165 | } 166 | auto platforms = std::vector(num_platforms); 167 | CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr)); 168 | platform_ = platforms[platform_id]; 169 | } 170 | 171 | // Methods to retrieve platform information 172 | std::string Name() const { return 
GetInfoString(CL_PLATFORM_NAME); } 173 | std::string Vendor() const { return GetInfoString(CL_PLATFORM_VENDOR); } 174 | std::string Version() const { return GetInfoString(CL_PLATFORM_VERSION); } 175 | 176 | // Returns the number of devices on this platform 177 | size_t NumDevices() const { 178 | auto result = cl_uint{0}; 179 | CheckError(clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, nullptr, &result)); 180 | return static_cast(result); 181 | } 182 | 183 | // Accessor to the private data-member 184 | const RawPlatformID& operator()() const { return platform_; } 185 | private: 186 | cl_platform_id platform_; 187 | 188 | // Private helper functions 189 | std::string GetInfoString(const cl_device_info info) const { 190 | auto bytes = size_t{0}; 191 | CheckError(clGetPlatformInfo(platform_, info, 0, nullptr, &bytes)); 192 | auto result = std::string{}; 193 | result.resize(bytes); 194 | CheckError(clGetPlatformInfo(platform_, info, bytes, &result[0], nullptr)); 195 | result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters 196 | return result; 197 | } 198 | }; 199 | 200 | // Retrieves a vector with all platforms 201 | inline std::vector GetAllPlatforms() { 202 | auto num_platforms = cl_uint{0}; 203 | CheckError(clGetPlatformIDs(0, nullptr, &num_platforms)); 204 | auto all_platforms = std::vector(); 205 | for (size_t platform_id = 0; platform_id < static_cast(num_platforms); ++platform_id) { 206 | all_platforms.push_back(Platform(platform_id)); 207 | } 208 | return all_platforms; 209 | } 210 | 211 | // ================================================================================================= 212 | 213 | // Raw device ID type 214 | using RawDeviceID = cl_device_id; 215 | 216 | // C++11 version of 'cl_device_id' 217 | class Device { 218 | public: 219 | 220 | // Constructor based on the regular OpenCL data-type 221 | explicit Device(const cl_device_id device): device_(device) { } 222 | 223 | // Initialize the device. 
Note that this constructor can throw exceptions! 224 | explicit Device(const Platform &platform, const size_t device_id) { 225 | auto num_devices = platform.NumDevices(); 226 | if (num_devices == 0) { 227 | throw RuntimeError("Device: no devices found"); 228 | } 229 | if (device_id >= num_devices) { 230 | throw RuntimeError("Device: invalid device ID "+std::to_string(device_id)); 231 | } 232 | 233 | auto devices = std::vector(num_devices); 234 | CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast(num_devices), 235 | devices.data(), nullptr)); 236 | device_ = devices[device_id]; 237 | } 238 | 239 | // Methods to retrieve device information 240 | RawPlatformID PlatformID() const { return GetInfo(CL_DEVICE_PLATFORM); } 241 | std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); } 242 | size_t VersionNumber() const 243 | { 244 | std::string version_string = Version().substr(7); 245 | // Space separates the end of the OpenCL version number from the beginning of the 246 | // vendor-specific information. 
247 | size_t next_whitespace = version_string.find(' '); 248 | size_t version = (size_t) (100.0 * std::stod(version_string.substr(0, next_whitespace))); 249 | return version; 250 | } 251 | std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); } 252 | std::string Name() const { return GetInfoString(CL_DEVICE_NAME); } 253 | std::string Type() const { 254 | auto type = GetInfo(CL_DEVICE_TYPE); 255 | switch(type) { 256 | case CL_DEVICE_TYPE_CPU: return "CPU"; 257 | case CL_DEVICE_TYPE_GPU: return "GPU"; 258 | case CL_DEVICE_TYPE_ACCELERATOR: return "accelerator"; 259 | default: return "default"; 260 | } 261 | } 262 | size_t MaxWorkGroupSize() const { return GetInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE); } 263 | size_t MaxWorkItemDimensions() const { 264 | return static_cast(GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS)); 265 | } 266 | std::vector MaxWorkItemSizes() const { 267 | return GetInfoVector(CL_DEVICE_MAX_WORK_ITEM_SIZES); 268 | } 269 | unsigned long LocalMemSize() const { 270 | return static_cast(GetInfo(CL_DEVICE_LOCAL_MEM_SIZE)); 271 | } 272 | 273 | std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); } 274 | bool HasExtension(const std::string &extension) const { 275 | const auto extensions = Capabilities(); 276 | return extensions.find(extension) != std::string::npos; 277 | } 278 | bool SupportsFP64() const { 279 | return HasExtension("cl_khr_fp64"); 280 | } 281 | bool SupportsFP16() const { 282 | if (Name() == "Mali-T628") { return true; } // supports fp16 but not cl_khr_fp16 officially 283 | return HasExtension("cl_khr_fp16"); 284 | } 285 | 286 | size_t CoreClock() const { 287 | return static_cast(GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY)); 288 | } 289 | size_t ComputeUnits() const { 290 | return static_cast(GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS)); 291 | } 292 | unsigned long MemorySize() const { 293 | return static_cast(GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE)); 294 | } 295 | unsigned long MaxAllocSize() const { 296 | return 
static_cast(GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE)); 297 | } 298 | size_t MemoryClock() const { return 0; } // Not exposed in OpenCL 299 | size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL 300 | 301 | // Configuration-validity checks 302 | bool IsLocalMemoryValid(const cl_ulong local_mem_usage) const { 303 | return (local_mem_usage <= LocalMemSize()); 304 | } 305 | bool IsThreadConfigValid(const std::vector &local) const { 306 | auto local_size = size_t{1}; 307 | for (const auto &item: local) { local_size *= item; } 308 | for (auto i=size_t{0}; i MaxWorkItemSizes()[i]) { return false; } 310 | } 311 | if (local_size > MaxWorkGroupSize()) { return false; } 312 | if (local.size() > MaxWorkItemDimensions()) { return false; } 313 | return true; 314 | } 315 | 316 | // Query for a specific type of device or brand 317 | bool IsCPU() const { return Type() == "CPU"; } 318 | bool IsGPU() const { return Type() == "GPU"; } 319 | bool IsAMD() const { return Vendor() == "AMD" || 320 | Vendor() == "Advanced Micro Devices, Inc." 
|| 321 | Vendor() == "AuthenticAMD"; } 322 | bool IsNVIDIA() const { return Vendor() == "NVIDIA" || 323 | Vendor() == "NVIDIA Corporation"; } 324 | bool IsIntel() const { return Vendor() == "INTEL" || 325 | Vendor() == "Intel" || 326 | Vendor() == "GenuineIntel" || 327 | Vendor() == "Intel(R) Corporation"; } 328 | bool IsARM() const { return Vendor() == "ARM"; } 329 | 330 | // Platform specific extensions 331 | std::string AMDBoardName() const { // check for 'cl_amd_device_attribute_query' first 332 | #ifndef CL_DEVICE_BOARD_NAME_AMD 333 | #define CL_DEVICE_BOARD_NAME_AMD 0x4038 334 | #endif 335 | return GetInfoString(CL_DEVICE_BOARD_NAME_AMD); 336 | } 337 | std::string NVIDIAComputeCapability() const { // check for 'cl_nv_device_attribute_query' first 338 | #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 339 | #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 340 | #endif 341 | #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 342 | #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 343 | #endif 344 | return std::string{"SM"} + std::to_string(GetInfo(CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV)) + 345 | std::string{"."} + std::to_string(GetInfo(CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV)); 346 | } 347 | 348 | // Accessor to the private data-member 349 | const RawDeviceID& operator()() const { return device_; } 350 | private: 351 | cl_device_id device_; 352 | 353 | // Private helper functions 354 | template 355 | T GetInfo(const cl_device_info info) const { 356 | auto bytes = size_t{0}; 357 | CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); 358 | auto result = T(0); 359 | CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); 360 | return result; 361 | } 362 | template 363 | std::vector GetInfoVector(const cl_device_info info) const { 364 | auto bytes = size_t{0}; 365 | CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); 366 | auto result = std::vector(bytes/sizeof(T)); 367 | CheckError(clGetDeviceInfo(device_, info, bytes, 
result.data(), nullptr)); 368 | return result; 369 | } 370 | std::string GetInfoString(const cl_device_info info) const { 371 | auto bytes = size_t{0}; 372 | CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); 373 | auto result = std::string{}; 374 | result.resize(bytes); 375 | CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr)); 376 | result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters 377 | return result; 378 | } 379 | }; 380 | 381 | // ================================================================================================= 382 | 383 | // Raw context type 384 | using RawContext = cl_context; 385 | 386 | // C++11 version of 'cl_context' 387 | class Context { 388 | public: 389 | 390 | // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere 391 | explicit Context(const cl_context context): 392 | context_(new cl_context) { 393 | *context_ = context; 394 | } 395 | 396 | // Regular constructor with memory management 397 | explicit Context(const Device &device): 398 | context_(new cl_context, [](cl_context* c) { 399 | if (*c) { CheckErrorDtor(clReleaseContext(*c)); } 400 | delete c; 401 | }) { 402 | auto status = CL_SUCCESS; 403 | const cl_device_id dev = device(); 404 | *context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status); 405 | CLCudaAPIError::Check(status, "clCreateContext"); 406 | } 407 | 408 | // Accessor to the private data-member 409 | const RawContext& operator()() const { return *context_; } 410 | RawContext* pointer() const { return &(*context_); } 411 | private: 412 | std::shared_ptr context_; 413 | }; 414 | 415 | // Pointer to an OpenCL context 416 | using ContextPointer = cl_context*; 417 | 418 | // ================================================================================================= 419 | 420 | // C++11 version of 'cl_program'. 
421 | class Program { 422 | public: 423 | Program() = default; 424 | 425 | // Source-based constructor with memory management 426 | explicit Program(const Context &context, const std::string &source): 427 | program_(new cl_program, [](cl_program* p) { 428 | if (*p) { CheckErrorDtor(clReleaseProgram(*p)); } 429 | delete p; 430 | }) { 431 | const char *source_ptr = &source[0]; 432 | const auto length = source.length(); 433 | auto status = CL_SUCCESS; 434 | *program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status); 435 | CLCudaAPIError::Check(status, "clCreateProgramWithSource"); 436 | } 437 | 438 | // Binary-based constructor with memory management 439 | explicit Program(const Device &device, const Context &context, const std::string &binary): 440 | program_(new cl_program, [](cl_program* p) { 441 | if (*p) { CheckErrorDtor(clReleaseProgram(*p)); } 442 | delete p; 443 | }) { 444 | const char *binary_ptr = &binary[0]; 445 | const auto length = binary.length(); 446 | auto status1 = CL_SUCCESS; 447 | auto status2 = CL_SUCCESS; 448 | const auto dev = device(); 449 | *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length, 450 | reinterpret_cast(&binary_ptr), 451 | &status1, &status2); 452 | CLCudaAPIError::Check(status1, "clCreateProgramWithBinary (binary status)"); 453 | CLCudaAPIError::Check(status2, "clCreateProgramWithBinary"); 454 | } 455 | 456 | // Compiles the device program and checks whether or not there are any warnings/errors 457 | void Build(const Device &device, std::vector &options) { 458 | options.push_back("-cl-std=CL1.1"); 459 | auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "}); 460 | const cl_device_id dev = device(); 461 | CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr)); 462 | } 463 | 464 | // Confirms whether a certain status code is an actual compilation error or warning 465 | bool StatusIsCompilationWarningOrError(const cl_int status) 
const { 466 | return (status == CL_BUILD_PROGRAM_FAILURE); 467 | } 468 | 469 | // Retrieves the warning/error message from the compiler (if any) 470 | std::string GetBuildInfo(const Device &device) const { 471 | auto bytes = size_t{0}; 472 | auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG}; 473 | CheckError(clGetProgramBuildInfo(*program_, device(), query, 0, nullptr, &bytes)); 474 | auto result = std::string{}; 475 | result.resize(bytes); 476 | CheckError(clGetProgramBuildInfo(*program_, device(), query, bytes, &result[0], nullptr)); 477 | return result; 478 | } 479 | 480 | // Retrieves a binary or an intermediate representation of the compiled program 481 | std::string GetIR() const { 482 | auto bytes = size_t{0}; 483 | CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr)); 484 | auto result = std::string{}; 485 | result.resize(bytes); 486 | auto result_ptr = result.data(); 487 | CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr)); 488 | return result; 489 | } 490 | 491 | // Accessor to the private data-member 492 | const cl_program& operator()() const { return *program_; } 493 | private: 494 | std::shared_ptr program_; 495 | }; 496 | 497 | // ================================================================================================= 498 | 499 | // Raw command-queue type 500 | using RawCommandQueue = cl_command_queue; 501 | 502 | // C++11 version of 'cl_command_queue' 503 | class Queue { 504 | public: 505 | 506 | // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere 507 | explicit Queue(const cl_command_queue queue): 508 | queue_(new cl_command_queue) { 509 | *queue_ = queue; 510 | } 511 | 512 | // Regular constructor with memory management 513 | explicit Queue(const Context &context, const Device &device): 514 | queue_(new cl_command_queue, [](cl_command_queue* s) { 515 | if (*s) { 
CheckErrorDtor(clReleaseCommandQueue(*s)); } 516 | delete s; 517 | }) { 518 | auto status = CL_SUCCESS; 519 | *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status); 520 | CLCudaAPIError::Check(status, "clCreateCommandQueue"); 521 | } 522 | 523 | // Synchronizes the queue 524 | void Finish(Event &) const { 525 | Finish(); 526 | } 527 | void Finish() const { 528 | CheckError(clFinish(*queue_)); 529 | } 530 | 531 | // Retrieves the corresponding context or device 532 | Context GetContext() const { 533 | auto bytes = size_t{0}; 534 | CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, 0, nullptr, &bytes)); 535 | cl_context result; 536 | CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, bytes, &result, nullptr)); 537 | return Context(result); 538 | } 539 | Device GetDevice() const { 540 | auto bytes = size_t{0}; 541 | CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, 0, nullptr, &bytes)); 542 | cl_device_id result; 543 | CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, bytes, &result, nullptr)); 544 | return Device(result); 545 | } 546 | 547 | // Accessor to the private data-member 548 | const RawCommandQueue& operator()() const { return *queue_; } 549 | private: 550 | std::shared_ptr queue_; 551 | }; 552 | 553 | // ================================================================================================= 554 | 555 | // C++11 version of host memory 556 | template 557 | class BufferHost { 558 | public: 559 | 560 | // Regular constructor with memory management 561 | explicit BufferHost(const Context &, const size_t size): 562 | buffer_(new std::vector(size)) { 563 | } 564 | 565 | // Retrieves the actual allocated size in bytes 566 | size_t GetSize() const { 567 | return buffer_->size()*sizeof(T); 568 | } 569 | 570 | // Compatibility with std::vector 571 | size_t size() const { return buffer_->size(); } 572 | T* begin() { return &(*buffer_)[0]; } 573 | T* end() { return 
&(*buffer_)[buffer_->size()-1]; } 574 | T& operator[](const size_t i) { return (*buffer_)[i]; } 575 | T* data() { return buffer_->data(); } 576 | const T* data() const { return buffer_->data(); } 577 | 578 | private: 579 | std::shared_ptr> buffer_; 580 | }; 581 | 582 | // ================================================================================================= 583 | 584 | // Enumeration of buffer access types 585 | enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned }; 586 | 587 | // C++11 version of 'cl_mem' 588 | template 589 | class Buffer { 590 | public: 591 | 592 | // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere 593 | explicit Buffer(const cl_mem buffer): 594 | buffer_(new cl_mem), 595 | access_(BufferAccess::kNotOwned) { 596 | *buffer_ = buffer; 597 | } 598 | 599 | // Regular constructor with memory management. If this class does not own the buffer object, then 600 | // the memory will not be freed automatically afterwards. 
601 | explicit Buffer(const Context &context, const BufferAccess access, const size_t size): 602 | buffer_(new cl_mem, [access](cl_mem* m) { 603 | if (access != BufferAccess::kNotOwned) { CheckError(clReleaseMemObject(*m)); } 604 | delete m; 605 | }), 606 | access_(access) { 607 | auto flags = cl_mem_flags{CL_MEM_READ_WRITE}; 608 | if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; } 609 | if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; } 610 | auto status = CL_SUCCESS; 611 | *buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status); 612 | CLCudaAPIError::Check(status, "clCreateBuffer"); 613 | } 614 | 615 | // As above, but now with read/write access as a default 616 | explicit Buffer(const Context &context, const size_t size): 617 | Buffer(context, BufferAccess::kReadWrite, size) { 618 | } 619 | 620 | // Constructs a new buffer based on an existing host-container 621 | template 622 | explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end): 623 | Buffer(context, BufferAccess::kReadWrite, static_cast(end - start)) { 624 | auto size = static_cast(end - start); 625 | auto pointer = &*start; 626 | CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), pointer, 0, 627 | nullptr, nullptr)); 628 | queue.Finish(); 629 | } 630 | 631 | // Copies from device to host: reading the device buffer a-synchronously 632 | void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { 633 | if (access_ == BufferAccess::kWriteOnly) { 634 | throw LogicError("Buffer: reading from a write-only buffer"); 635 | } 636 | CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T), 637 | host, 0, nullptr, nullptr)); 638 | } 639 | void ReadAsync(const Queue &queue, const size_t size, std::vector &host, 640 | const size_t offset = 0) const { 641 | if (host.size() < size) { 642 | throw LogicError("Buffer: target 
host buffer is too small"); 643 | } 644 | ReadAsync(queue, size, host.data(), offset); 645 | } 646 | void ReadAsync(const Queue &queue, const size_t size, BufferHost &host, 647 | const size_t offset = 0) const { 648 | if (host.size() < size) { 649 | throw LogicError("Buffer: target host buffer is too small"); 650 | } 651 | ReadAsync(queue, size, host.data(), offset); 652 | } 653 | 654 | // Copies from device to host: reading the device buffer 655 | void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { 656 | ReadAsync(queue, size, host, offset); 657 | queue.Finish(); 658 | } 659 | void Read(const Queue &queue, const size_t size, std::vector &host, 660 | const size_t offset = 0) const { 661 | Read(queue, size, host.data(), offset); 662 | } 663 | void Read(const Queue &queue, const size_t size, BufferHost &host, 664 | const size_t offset = 0) const { 665 | Read(queue, size, host.data(), offset); 666 | } 667 | 668 | // Copies from host to device: writing the device buffer a-synchronously 669 | void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { 670 | if (GetSize() < (offset+size)*sizeof(T)) { 671 | throw LogicError("Buffer: target device buffer is too small"); 672 | } 673 | CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T), 674 | host, 0, nullptr, nullptr)); 675 | } 676 | void WriteAsync(const Queue &queue, const size_t size, const std::vector &host, 677 | const size_t offset = 0) { 678 | WriteAsync(queue, size, host.data(), offset); 679 | } 680 | void WriteAsync(const Queue &queue, const size_t size, const BufferHost &host, 681 | const size_t offset = 0) { 682 | WriteAsync(queue, size, host.data(), offset); 683 | } 684 | 685 | // Copies from host to device: writing the device buffer 686 | void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { 687 | WriteAsync(queue, size, host, offset); 688 | queue.Finish(); 
689 | } 690 | void Write(const Queue &queue, const size_t size, const std::vector &host, 691 | const size_t offset = 0) { 692 | Write(queue, size, host.data(), offset); 693 | } 694 | void Write(const Queue &queue, const size_t size, const BufferHost &host, 695 | const size_t offset = 0) { 696 | Write(queue, size, host.data(), offset); 697 | } 698 | 699 | // Copies the contents of this buffer into another device buffer 700 | void CopyToAsync(const Queue &queue, const size_t size, const Buffer &destination) const { 701 | CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0, 702 | nullptr, nullptr)); 703 | } 704 | void CopyTo(const Queue &queue, const size_t size, const Buffer &destination) const { 705 | CopyToAsync(queue, size, destination); 706 | queue.Finish(); 707 | } 708 | 709 | // Retrieves the actual allocated size in bytes 710 | size_t GetSize() const { 711 | const auto bytes = sizeof(size_t); 712 | auto result = size_t{0}; 713 | CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr)); 714 | return result; 715 | } 716 | 717 | // Accessor to the private data-member 718 | const cl_mem& operator()() const { return *buffer_; } 719 | private: 720 | std::shared_ptr buffer_; 721 | const BufferAccess access_; 722 | }; 723 | 724 | // ================================================================================================= 725 | 726 | // C++11 version of 'cl_kernel' 727 | class Kernel { 728 | public: 729 | 730 | // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere 731 | explicit Kernel(const cl_kernel kernel): 732 | kernel_(new cl_kernel) { 733 | *kernel_ = kernel; 734 | } 735 | 736 | // Regular constructor with memory management 737 | explicit Kernel(const Program &program, const std::string &name): 738 | kernel_(new cl_kernel, [](cl_kernel* k) { 739 | if (*k) { CheckErrorDtor(clReleaseKernel(*k)); } 740 | delete k; 741 | }) { 742 | auto status = CL_SUCCESS; 743 | 
*kernel_ = clCreateKernel(program(), name.c_str(), &status); 744 | CLCudaAPIError::Check(status, "clCreateKernel"); 745 | } 746 | 747 | // Sets a kernel argument at the indicated position 748 | template 749 | void SetArgument(const size_t index, const T &value) { 750 | CheckError(clSetKernelArg(*kernel_, static_cast(index), sizeof(T), &value)); 751 | } 752 | template 753 | void SetArgument(const size_t index, Buffer &value) { 754 | SetArgument(index, value()); 755 | } 756 | 757 | // Sets all arguments in one go using parameter packs. Note that this overwrites previously set 758 | // arguments using 'SetArgument' or 'SetArguments'. 759 | template 760 | void SetArguments(Args&... args) { 761 | SetArgumentsRecursive(0, args...); 762 | } 763 | 764 | // Retrieves the amount of local memory used per work-group for this kernel 765 | unsigned long LocalMemUsage(const Device &device) const { 766 | const auto bytes = sizeof(cl_ulong); 767 | auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE}; 768 | auto result = cl_ulong{0}; 769 | CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr)); 770 | return static_cast(result); 771 | } 772 | 773 | // Retrieves the name of the kernel 774 | std::string GetFunctionName() const { 775 | auto bytes = size_t{0}; 776 | CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes)); 777 | auto result = std::string{}; 778 | result.resize(bytes); 779 | CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, bytes, &result[0], nullptr)); 780 | return std::string{result.c_str()}; // Removes any trailing '\0'-characters 781 | } 782 | 783 | // Launches a kernel onto the specified queue 784 | void Launch(const Queue &queue, const std::vector &global, 785 | const std::vector &local, EventPointer event) { 786 | CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), 787 | nullptr, global.data(), local.data(), 788 | 0, nullptr, event)); 789 | } 790 | 791 | 
// As above, but with an event waiting list 792 | void Launch(const Queue &queue, const std::vector &global, 793 | const std::vector &local, EventPointer event, 794 | const std::vector &waitForEvents) { 795 | 796 | // Builds a plain version of the events waiting list 797 | auto waitForEventsPlain = std::vector(); 798 | for (auto &waitEvent : waitForEvents) { 799 | if (waitEvent()) { waitForEventsPlain.push_back(waitEvent()); } 800 | } 801 | 802 | // Launches the kernel while waiting for other events 803 | CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), 804 | nullptr, global.data(), !local.empty() ? local.data() : nullptr, 805 | static_cast(waitForEventsPlain.size()), 806 | !waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr, 807 | event)); 808 | } 809 | 810 | // Accessor to the private data-member 811 | const cl_kernel& operator()() const { return *kernel_; } 812 | private: 813 | std::shared_ptr kernel_; 814 | 815 | // Internal implementation for the recursive SetArguments function. 816 | template 817 | void SetArgumentsRecursive(const size_t index, T &first) { 818 | SetArgument(index, first); 819 | } 820 | template 821 | void SetArgumentsRecursive(const size_t index, T &first, Args&... args) { 822 | SetArgument(index, first); 823 | SetArgumentsRecursive(index+1, args...); 824 | } 825 | }; 826 | 827 | // ================================================================================================= 828 | } // namespace CLCudaAPI 829 | 830 | // CLCUDAAPI_CLPP11_H_ 831 | #endif 832 | -------------------------------------------------------------------------------- /include/cupp11.h: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. 
The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file implements a bunch of C++11 classes that act as wrappers around CUDA objects and API 11 | // calls. The main benefits are increased abstraction, automatic memory management, and portability. 12 | // Portability here means that a similar header exists for OpenCL with the same classes and 13 | // interfaces. In other words, moving from the CUDA API to the OpenCL API becomes a one-line change. 14 | // 15 | // This is version 9.0 of CLCudaAPI. 16 | // 17 | // ================================================================================================= 18 | // 19 | // Copyright 2015 SURFsara 20 | // 21 | // Licensed under the Apache License, Version 2.0 (the "License"); 22 | // you may not use this file except in compliance with the License. 23 | // You may obtain a copy of the License at 24 | // 25 | // http://www.apache.org/licenses/LICENSE-2.0 26 | // 27 | // Unless required by applicable law or agreed to in writing, software 28 | // distributed under the License is distributed on an "AS IS" BASIS, 29 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 30 | // See the License for the specific language governing permissions and 31 | // limitations under the License. 
32 | // 33 | // ================================================================================================= 34 | 35 | #ifndef CLCUDAAPI_CUPP11_H_ 36 | #define CLCUDAAPI_CUPP11_H_ 37 | 38 | // C++ 39 | #include // std::copy 40 | #include // std::string 41 | #include // std::vector 42 | #include // std::shared_ptr 43 | 44 | // CUDA 45 | #include // CUDA driver API 46 | #include // NVIDIA runtime compilation API 47 | 48 | // Exception classes 49 | #include "cxpp11_common.hpp" 50 | 51 | namespace CLCudaAPI { 52 | // ================================================================================================= 53 | 54 | // Max-length of strings 55 | constexpr auto kStringLength = 256; 56 | 57 | // ================================================================================================= 58 | 59 | // Represents a runtime error returned by a CUDA driver API function 60 | class CLCudaAPIError : public ErrorCode { 61 | public: 62 | explicit CLCudaAPIError(CUresult status, const std::string &where): 63 | ErrorCode(status, where, "CUDA error: " + where + ": " + 64 | GetErrorName(status) + " --> " + GetErrorString(status)) { 65 | } 66 | 67 | static void Check(const CUresult status, const std::string &where) { 68 | if (status != CUDA_SUCCESS) { 69 | throw CLCudaAPIError(status, where); 70 | } 71 | } 72 | 73 | static void CheckDtor(const CUresult status, const std::string &where) { 74 | if (status != CUDA_SUCCESS) { 75 | fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPIError(status, where).what()); 76 | } 77 | } 78 | 79 | private: 80 | std::string GetErrorName(CUresult status) const { 81 | const char* status_code; 82 | cuGetErrorName(status, &status_code); 83 | return std::string(status_code); 84 | } 85 | std::string GetErrorString(CUresult status) const { 86 | const char* status_string; 87 | cuGetErrorString(status, &status_string); 88 | return std::string(status_string); 89 | } 90 | }; 91 | 92 | // Represents a runtime error returned by a CUDA runtime 
compilation API function 93 | class CLCudaAPINVRTCError : public ErrorCode { 94 | public: 95 | explicit CLCudaAPINVRTCError(nvrtcResult status, const std::string &where): 96 | ErrorCode(status, where, "CUDA NVRTC error: " + where + ": " + GetErrorString(status)) { 97 | } 98 | 99 | static void Check(const nvrtcResult status, const std::string &where) { 100 | if (status != NVRTC_SUCCESS) { 101 | throw CLCudaAPINVRTCError(status, where); 102 | } 103 | } 104 | 105 | static void CheckDtor(const nvrtcResult status, const std::string &where) { 106 | if (status != NVRTC_SUCCESS) { 107 | fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPINVRTCError(status, where).what()); 108 | } 109 | } 110 | 111 | private: 112 | std::string GetErrorString(nvrtcResult status) const { 113 | const char* status_string = nvrtcGetErrorString(status); 114 | return std::string(status_string); 115 | } 116 | }; 117 | 118 | // Exception returned when building a program 119 | using CLCudaAPIBuildError = CLCudaAPINVRTCError; 120 | 121 | // ================================================================================================= 122 | 123 | // Error occurred in CUDA driver or runtime compilation API 124 | #define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call)) 125 | #define CheckErrorNVRTC(call) CLCudaAPINVRTCError::Check(call, CLCudaAPINVRTCError::TrimCallString(#call)) 126 | 127 | // Error occurred in CUDA driver or runtime compilation API (no-exception version for destructors) 128 | #define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call)) 129 | #define CheckErrorDtorNVRTC(call) CLCudaAPINVRTCError::CheckDtor(call, CLCudaAPINVRTCError::TrimCallString(#call)) 130 | 131 | // ================================================================================================= 132 | 133 | // C++11 version of two 'CUevent' pointers 134 | class Event { 135 | public: 136 | // Note that there is no constructor based on the 
regular CUDA data-type because of extra state 137 | 138 | // Regular constructor with memory management 139 | explicit Event(): 140 | start_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }), 141 | end_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }) { 142 | CheckError(cuEventCreate(start_.get(), CU_EVENT_DEFAULT)); 143 | CheckError(cuEventCreate(end_.get(), CU_EVENT_DEFAULT)); 144 | } 145 | 146 | // Waits for completion of this event (not implemented for CUDA) 147 | void WaitForCompletion() const { } 148 | 149 | // Retrieves the elapsed time of the last recorded event 150 | float GetElapsedTime() const { 151 | auto result = 0.0f; 152 | cuEventElapsedTime(&result, *start_, *end_); 153 | return result; 154 | } 155 | 156 | // Accessors to the private data-members 157 | const CUevent& start() const { return *start_; } 158 | const CUevent& end() const { return *end_; } 159 | Event* pointer() { return this; } 160 | private: 161 | std::shared_ptr start_; 162 | std::shared_ptr end_; 163 | }; 164 | 165 | // Pointer to a CUDA event 166 | using EventPointer = Event*; 167 | 168 | // ================================================================================================= 169 | 170 | // Raw platform ID type 171 | using RawPlatformID = size_t; 172 | 173 | // The CUDA platform: initializes the CUDA driver API 174 | class Platform { 175 | public: 176 | 177 | // Initializes the platform. Note that the platform ID variable is not actually used for CUDA. 
178 | explicit Platform(const size_t platform_id) : platform_id_(0) { 179 | if (platform_id != 0) { throw LogicError("CUDA back-end requires a platform ID of 0"); } 180 | CheckError(cuInit(0)); 181 | } 182 | 183 | // Methods to retrieve platform information 184 | std::string Name() const { return "CUDA"; } 185 | std::string Vendor() const { return "NVIDIA Corporation"; } 186 | std::string Version() const { 187 | auto result = 0; 188 | CheckError(cuDriverGetVersion(&result)); 189 | return "CUDA driver "+std::to_string(result); 190 | } 191 | 192 | // Returns the number of devices on this platform 193 | size_t NumDevices() const { 194 | auto result = 0; 195 | CheckError(cuDeviceGetCount(&result)); 196 | return static_cast(result); 197 | } 198 | 199 | // Accessor to the raw ID (which doesn't exist in the CUDA back-end, this is always just 0) 200 | const RawPlatformID& operator()() const { return platform_id_; } 201 | private: 202 | const size_t platform_id_; 203 | }; 204 | 205 | // Retrieves a vector with all platforms. Note that there is just one platform in CUDA. 
206 | inline std::vector GetAllPlatforms() { 207 | auto all_platforms = std::vector{ Platform(size_t{0}) }; 208 | return all_platforms; 209 | } 210 | 211 | // ================================================================================================= 212 | 213 | // Raw device ID type 214 | using RawDeviceID = CUdevice; 215 | 216 | // C++11 version of 'CUdevice' 217 | class Device { 218 | public: 219 | 220 | // Constructor based on the regular CUDA data-type 221 | explicit Device(const CUdevice device): device_(device) { } 222 | 223 | // Initialization 224 | explicit Device(const Platform &platform, const size_t device_id) { 225 | auto num_devices = platform.NumDevices(); 226 | if (num_devices == 0) { 227 | throw RuntimeError("Device: no devices found"); 228 | } 229 | if (device_id >= num_devices) { 230 | throw RuntimeError("Device: invalid device ID "+std::to_string(device_id)); 231 | } 232 | 233 | CheckError(cuDeviceGet(&device_, device_id)); 234 | } 235 | 236 | // Methods to retrieve device information 237 | RawPlatformID PlatformID() const { return 0; } 238 | std::string Version() const { 239 | auto result = 0; 240 | CheckError(cuDriverGetVersion(&result)); 241 | return "CUDA driver "+std::to_string(result); 242 | } 243 | size_t VersionNumber() const { 244 | auto result = 0; 245 | CheckError(cuDriverGetVersion(&result)); 246 | return static_cast(result); 247 | } 248 | std::string Vendor() const { return "NVIDIA Corporation"; } 249 | std::string Name() const { 250 | auto result = std::string{}; 251 | result.resize(kStringLength); 252 | CheckError(cuDeviceGetName(&result[0], result.size(), device_)); 253 | return result; 254 | } 255 | std::string Type() const { return "GPU"; } 256 | size_t MaxWorkGroupSize() const {return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); } 257 | size_t MaxWorkItemDimensions() const { return size_t{3}; } 258 | std::vector MaxWorkItemSizes() const { 259 | return std::vector{GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X), 260 | 
GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y), 261 | GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)}; 262 | } 263 | unsigned long LocalMemSize() const { 264 | return static_cast(GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)); 265 | } 266 | 267 | std::string Capabilities() const { 268 | const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); 269 | const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); 270 | return "SM"+std::to_string(major)+"."+std::to_string(minor); 271 | } 272 | bool HasExtension(const std::string &extension) const { return false; } 273 | bool SupportsFP64() const { return true; } 274 | bool SupportsFP16() const { 275 | const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); 276 | const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); 277 | if (major > 5) { return true; } // SM 6.x, 7.x and higher 278 | if (major == 5 && minor == 3) { return true; } // SM 5.3 279 | return false; 280 | } 281 | 282 | size_t CoreClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE); } 283 | size_t ComputeUnits() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); } 284 | unsigned long MemorySize() const { 285 | auto result = size_t{0}; 286 | CheckError(cuDeviceTotalMem(&result, device_)); 287 | return static_cast(result); 288 | } 289 | unsigned long MaxAllocSize() const { return MemorySize(); } 290 | size_t MemoryClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE); } 291 | size_t MemoryBusWidth() const { return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); } 292 | 293 | // Configuration-validity checks 294 | bool IsLocalMemoryValid(const size_t local_mem_usage) const { 295 | return (local_mem_usage <= LocalMemSize()); 296 | } 297 | bool IsThreadConfigValid(const std::vector &local) const { 298 | auto local_size = size_t{1}; 299 | for (const auto &item: local) { local_size *= item; } 300 | for (auto i=size_t{0}; i 
MaxWorkItemSizes()[i]) { return false; } 302 | } 303 | if (local_size > MaxWorkGroupSize()) { return false; } 304 | if (local.size() > MaxWorkItemDimensions()) { return false; } 305 | return true; 306 | } 307 | 308 | // Query for a specific type of device or brand 309 | bool IsCPU() const { return false; } 310 | bool IsGPU() const { return true; } 311 | bool IsAMD() const { return false; } 312 | bool IsNVIDIA() const { return true; } 313 | bool IsIntel() const { return false; } 314 | bool IsARM() const { return false; } 315 | 316 | // Platform specific extensions 317 | std::string AMDBoardName() const { return ""; } 318 | std::string NVIDIAComputeCapability() const { return Capabilities(); } 319 | 320 | // Accessor to the private data-member 321 | const RawDeviceID& operator()() const { return device_; } 322 | private: 323 | CUdevice device_; 324 | 325 | // Private helper function 326 | size_t GetInfo(const CUdevice_attribute info) const { 327 | auto result = 0; 328 | CheckError(cuDeviceGetAttribute(&result, info, device_)); 329 | return static_cast(result); 330 | } 331 | }; 332 | 333 | // ================================================================================================= 334 | 335 | // Raw context type 336 | using RawContext = CUcontext; 337 | 338 | // C++11 version of 'CUcontext' 339 | class Context { 340 | public: 341 | 342 | // Constructor based on the regular CUDA data-type: memory management is handled elsewhere 343 | explicit Context(const CUcontext context): 344 | context_(new CUcontext) { 345 | *context_ = context; 346 | } 347 | 348 | // Regular constructor with memory management 349 | explicit Context(const Device &device): 350 | context_(new CUcontext, [](CUcontext* c) { 351 | if (*c) { CheckErrorDtor(cuCtxDestroy(*c)); } 352 | delete c; 353 | }) { 354 | CheckError(cuCtxCreate(context_.get(), 0, device())); 355 | } 356 | 357 | // Accessor to the private data-member 358 | const RawContext& operator()() const { return *context_; } 359 | 
RawContext* pointer() const { return &(*context_); } 360 | private: 361 | std::shared_ptr context_; 362 | }; 363 | 364 | // Pointer to a raw CUDA context 365 | using ContextPointer = CUcontext*; 366 | 367 | // ================================================================================================= 368 | 369 | // C++11 version of 'nvrtcProgram'. Additionally holds the program's source code. 370 | class Program { 371 | public: 372 | // Note that there is no constructor based on the regular CUDA data-type because of extra state 373 | 374 | // Source-based constructor with memory management 375 | explicit Program(const Context &, std::string source): 376 | program_(new nvrtcProgram, [](nvrtcProgram* p) { 377 | if (*p) { CheckErrorDtorNVRTC(nvrtcDestroyProgram(p)); } 378 | delete p; 379 | }), 380 | source_(std::move(source)), 381 | from_binary_(false) { 382 | const auto source_ptr = &source_[0]; 383 | CheckErrorNVRTC(nvrtcCreateProgram(program_.get(), source_ptr, nullptr, 0, nullptr, nullptr)); 384 | } 385 | 386 | // PTX-based constructor 387 | explicit Program(const Device &device, const Context &context, const std::string &binary): 388 | program_(nullptr), // not used 389 | source_(binary), 390 | from_binary_(true) { 391 | } 392 | 393 | // Compiles the device program and checks whether or not there are any warnings/errors 394 | void Build(const Device &, std::vector &options) { 395 | if (from_binary_) { return; } 396 | auto raw_options = std::vector(); 397 | for (const auto &option: options) { 398 | raw_options.push_back(option.c_str()); 399 | } 400 | auto status = nvrtcCompileProgram(*program_, raw_options.size(), raw_options.data()); 401 | CLCudaAPINVRTCError::Check(status, "nvrtcCompileProgram"); 402 | } 403 | 404 | // Confirms whether a certain status code is an actual compilation error or warning 405 | bool StatusIsCompilationWarningOrError(const nvrtcResult status) const { 406 | return (status == NVRTC_ERROR_INVALID_INPUT); 407 | } 408 | 409 | // 
Retrieves the warning/error message from the compiler (if any) 410 | std::string GetBuildInfo(const Device &) const { 411 | if (from_binary_) { return std::string{}; } 412 | auto bytes = size_t{0}; 413 | CheckErrorNVRTC(nvrtcGetProgramLogSize(*program_, &bytes)); 414 | auto result = std::string{}; 415 | result.resize(bytes); 416 | CheckErrorNVRTC(nvrtcGetProgramLog(*program_, &result[0])); 417 | return result; 418 | } 419 | 420 | // Retrieves an intermediate representation of the compiled program (i.e. PTX) 421 | std::string GetIR() const { 422 | if (from_binary_) { return source_; } // holds the PTX 423 | auto bytes = size_t{0}; 424 | CheckErrorNVRTC(nvrtcGetPTXSize(*program_, &bytes)); 425 | auto result = std::string{}; 426 | result.resize(bytes); 427 | CheckErrorNVRTC(nvrtcGetPTX(*program_, &result[0])); 428 | return result; 429 | } 430 | 431 | // Accessor to the private data-member 432 | const nvrtcProgram& operator()() const { return *program_; } 433 | private: 434 | std::shared_ptr program_; 435 | const std::string source_; 436 | const bool from_binary_; 437 | }; 438 | 439 | // ================================================================================================= 440 | 441 | // Raw command-queue type 442 | using RawCommandQueue = CUstream; 443 | 444 | // C++11 version of 'CUstream' 445 | class Queue { 446 | public: 447 | // Note that there is no constructor based on the regular CUDA data-type because of extra state 448 | 449 | // Regular constructor with memory management 450 | explicit Queue(const Context &context, const Device &device): 451 | queue_(new CUstream, [](CUstream* s) { 452 | if (*s) { CheckErrorDtor(cuStreamDestroy(*s)); } 453 | delete s; 454 | }), 455 | context_(context), 456 | device_(device) { 457 | CheckError(cuStreamCreate(queue_.get(), CU_STREAM_NON_BLOCKING)); 458 | } 459 | 460 | // Synchronizes the queue and optionally also an event 461 | void Finish(Event &event) const { 462 | CheckError(cuEventSynchronize(event.end())); 463 
| Finish(); 464 | } 465 | void Finish() const { 466 | CheckError(cuStreamSynchronize(*queue_)); 467 | } 468 | 469 | // Retrieves the corresponding context or device 470 | Context GetContext() const { return context_; } 471 | Device GetDevice() const { return device_; } 472 | 473 | // Accessor to the private data-member 474 | const RawCommandQueue& operator()() const { return *queue_; } 475 | private: 476 | std::shared_ptr queue_; 477 | const Context context_; 478 | const Device device_; 479 | }; 480 | 481 | // ================================================================================================= 482 | 483 | // C++11 version of page-locked host memory 484 | template 485 | class BufferHost { 486 | public: 487 | 488 | // Regular constructor with memory management 489 | explicit BufferHost(const Context &, const size_t size): 490 | buffer_(new void*, [](void** m) { CheckError(cuMemFreeHost(*m)); delete m; }), 491 | size_(size) { 492 | CheckError(cuMemAllocHost(buffer_.get(), size*sizeof(T))); 493 | } 494 | 495 | // Retrieves the actual allocated size in bytes 496 | size_t GetSize() const { 497 | return size_*sizeof(T); 498 | } 499 | 500 | // Compatibility with std::vector 501 | size_t size() const { return size_; } 502 | T* begin() { return &static_cast(*buffer_)[0]; } 503 | T* end() { return &static_cast(*buffer_)[size_-1]; } 504 | T& operator[](const size_t i) { return static_cast(*buffer_)[i]; } 505 | T* data() { return static_cast(*buffer_); } 506 | const T* data() const { return static_cast(*buffer_); } 507 | 508 | private: 509 | std::shared_ptr buffer_; 510 | const size_t size_; 511 | }; 512 | 513 | // ================================================================================================= 514 | 515 | // Enumeration of buffer access types 516 | enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned }; 517 | 518 | // C++11 version of 'CUdeviceptr' 519 | template 520 | class Buffer { 521 | public: 522 | 523 | // Constructor based 
on the regular CUDA data-type: memory management is handled elsewhere 524 | explicit Buffer(const CUdeviceptr buffer): 525 | buffer_(new CUdeviceptr), 526 | access_(BufferAccess::kNotOwned) { 527 | *buffer_ = buffer; 528 | } 529 | 530 | // Regular constructor with memory management. If this class does not own the buffer object, then 531 | // the memory will not be freed automatically afterwards. 532 | explicit Buffer(const Context &, const BufferAccess access, const size_t size): 533 | buffer_(new CUdeviceptr, [access](CUdeviceptr* m) { 534 | if (access != BufferAccess::kNotOwned) { CheckError(cuMemFree(*m)); } 535 | delete m; 536 | }), 537 | access_(access) { 538 | CheckError(cuMemAlloc(buffer_.get(), size*sizeof(T))); 539 | } 540 | 541 | // As above, but now with read/write access as a default 542 | explicit Buffer(const Context &context, const size_t size): 543 | Buffer(context, BufferAccess::kReadWrite, size) { 544 | } 545 | 546 | // Constructs a new buffer based on an existing host-container 547 | template 548 | explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end): 549 | Buffer(context, BufferAccess::kReadWrite, static_cast(end - start)) { 550 | auto size = static_cast(end - start); 551 | auto pointer = &*start; 552 | CheckError(cuMemcpyHtoDAsync(*buffer_, pointer, size*sizeof(T), queue())); 553 | queue.Finish(); 554 | } 555 | 556 | // Copies from device to host: reading the device buffer a-synchronously 557 | void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { 558 | if (access_ == BufferAccess::kWriteOnly) { 559 | throw LogicError("Buffer: reading from a write-only buffer"); 560 | } 561 | CheckError(cuMemcpyDtoHAsync(host, *buffer_ + offset*sizeof(T), size*sizeof(T), queue())); 562 | } 563 | void ReadAsync(const Queue &queue, const size_t size, std::vector &host, 564 | const size_t offset = 0) const { 565 | if (host.size() < size) { 566 | throw LogicError("Buffer: target host 
buffer is too small"); 567 | } 568 | ReadAsync(queue, size, host.data(), offset); 569 | } 570 | void ReadAsync(const Queue &queue, const size_t size, BufferHost &host, 571 | const size_t offset = 0) const { 572 | if (host.size() < size) { 573 | throw LogicError("Buffer: target host buffer is too small"); 574 | } 575 | ReadAsync(queue, size, host.data(), offset); 576 | } 577 | 578 | // Copies from device to host: reading the device buffer 579 | void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { 580 | ReadAsync(queue, size, host, offset); 581 | queue.Finish(); 582 | } 583 | void Read(const Queue &queue, const size_t size, std::vector &host, 584 | const size_t offset = 0) const { 585 | Read(queue, size, host.data(), offset); 586 | } 587 | void Read(const Queue &queue, const size_t size, BufferHost &host, 588 | const size_t offset = 0) const { 589 | Read(queue, size, host.data(), offset); 590 | } 591 | 592 | // Copies from host to device: writing the device buffer a-synchronously 593 | void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { 594 | if (access_ == BufferAccess::kReadOnly) { 595 | throw LogicError("Buffer: writing to a read-only buffer"); 596 | } 597 | if (GetSize() < (offset+size)*sizeof(T)) { 598 | throw LogicError("Buffer: target device buffer is too small"); 599 | } 600 | CheckError(cuMemcpyHtoDAsync(*buffer_ + offset*sizeof(T), host, size*sizeof(T), queue())); 601 | } 602 | void WriteAsync(const Queue &queue, const size_t size, const std::vector &host, 603 | const size_t offset = 0) { 604 | WriteAsync(queue, size, host.data(), offset); 605 | } 606 | void WriteAsync(const Queue &queue, const size_t size, const BufferHost &host, 607 | const size_t offset = 0) { 608 | WriteAsync(queue, size, host.data(), offset); 609 | } 610 | 611 | // Copies from host to device: writing the device buffer 612 | void Write(const Queue &queue, const size_t size, const T* host, const size_t 
offset = 0) { 613 | WriteAsync(queue, size, host, offset); 614 | queue.Finish(); 615 | } 616 | void Write(const Queue &queue, const size_t size, const std::vector &host, 617 | const size_t offset = 0) { 618 | Write(queue, size, host.data(), offset); 619 | } 620 | void Write(const Queue &queue, const size_t size, const BufferHost &host, 621 | const size_t offset = 0) { 622 | Write(queue, size, host.data(), offset); 623 | } 624 | 625 | // Copies the contents of this buffer into another device buffer 626 | void CopyToAsync(const Queue &queue, const size_t size, const Buffer &destination) const { 627 | CheckError(cuMemcpyDtoDAsync(destination(), *buffer_, size*sizeof(T), queue())); 628 | } 629 | void CopyTo(const Queue &queue, const size_t size, const Buffer &destination) const { 630 | CopyToAsync(queue, size, destination); 631 | queue.Finish(); 632 | } 633 | 634 | // Retrieves the actual allocated size in bytes 635 | size_t GetSize() const { 636 | auto result = size_t{0}; 637 | CheckError(cuMemGetAddressRange(nullptr, &result, *buffer_)); 638 | return result; 639 | } 640 | 641 | // Accessors to the private data-members 642 | CUdeviceptr operator()() const { return *buffer_; } 643 | CUdeviceptr& operator()() { return *buffer_; } 644 | private: 645 | std::shared_ptr buffer_; 646 | const BufferAccess access_; 647 | }; 648 | 649 | // ================================================================================================= 650 | 651 | // C++11 version of 'CUfunction' 652 | class Kernel { 653 | public: 654 | 655 | // Constructor based on the regular CUDA data-type: memory management is handled elsewhere 656 | explicit Kernel(const CUmodule module, const CUfunction kernel): 657 | module_(module), 658 | kernel_(kernel) { 659 | } 660 | 661 | // Regular constructor with memory management 662 | explicit Kernel(const Program &program, const std::string &name) { 663 | CheckError(cuModuleLoadDataEx(&module_, program.GetIR().data(), 0, nullptr, nullptr)); 664 | 
CheckError(cuModuleGetFunction(&kernel_, module_, name.c_str())); 665 | } 666 | 667 | // Sets a kernel argument at the indicated position. This stores both the value of the argument 668 | // (as raw bytes) and the index indicating where this value can be found. 669 | template 670 | void SetArgument(const size_t index, const T &value) { 671 | if (index >= arguments_indices_.size()) { arguments_indices_.resize(index+1); } 672 | arguments_indices_[index] = arguments_data_.size(); 673 | for (auto j=size_t(0); j(&value)[j]); 675 | } 676 | } 677 | template 678 | void SetArgument(const size_t index, Buffer &value) { 679 | SetArgument(index, value()); 680 | } 681 | 682 | // Sets all arguments in one go using parameter packs. Note that this resets all previously set 683 | // arguments using 'SetArgument' or 'SetArguments'. 684 | template 685 | void SetArguments(Args&... args) { 686 | arguments_indices_.clear(); 687 | arguments_data_.clear(); 688 | SetArgumentsRecursive(0, args...); 689 | } 690 | 691 | // Retrieves the amount of local memory used per work-group for this kernel. Note that this the 692 | // shared memory in CUDA terminology. 
693 | unsigned long LocalMemUsage(const Device &) const { 694 | auto result = 0; 695 | CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_)); 696 | return static_cast(result); 697 | } 698 | 699 | // Retrieves the name of the kernel 700 | std::string GetFunctionName() const { 701 | return std::string{"unknown"}; // Not implemented for the CUDA backend 702 | } 703 | 704 | // Launches a kernel onto the specified queue 705 | void Launch(const Queue &queue, const std::vector &global, 706 | const std::vector &local, EventPointer event) { 707 | 708 | // Creates the grid (number of threadblocks) and sets the block sizes (threads per block) 709 | auto grid = std::vector{1, 1, 1}; 710 | auto block = std::vector{1, 1, 1}; 711 | if (global.size() != local.size()) { throw LogicError("invalid thread/workgroup dimensions"); } 712 | for (auto i=size_t{0}; i pointers; 717 | for (auto &index: arguments_indices_) { 718 | pointers.push_back(&arguments_data_[index]); 719 | } 720 | 721 | // Launches the kernel, its execution time is recorded by events 722 | CheckError(cuEventRecord(event->start(), queue())); 723 | CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2], 724 | 0, queue(), pointers.data(), nullptr)); 725 | CheckError(cuEventRecord(event->end(), queue())); 726 | } 727 | 728 | // As above, but with an event waiting list 729 | // TODO: Implement this function 730 | void Launch(const Queue &queue, const std::vector &global, 731 | const std::vector &local, EventPointer event, 732 | std::vector& waitForEvents) { 733 | if (local.size() == 0) { 734 | throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end"); 735 | } 736 | else if (waitForEvents.size() != 0) { 737 | throw LogicError("Kernel: launching with an event waiting list is not implemented for the CUDA back-end"); 738 | } 739 | else { 740 | return Launch(queue, global, local, event); 741 | } 742 | } 743 | 
744 | // Accessors to the private data-members 745 | const CUfunction& operator()() const { return kernel_; } 746 | CUfunction operator()() { return kernel_; } 747 | private: 748 | CUmodule module_; 749 | CUfunction kernel_; 750 | std::vector arguments_indices_; // Indices of the arguments 751 | std::vector arguments_data_; // The arguments data as raw bytes 752 | 753 | // Internal implementation for the recursive SetArguments function. 754 | template 755 | void SetArgumentsRecursive(const size_t index, T &first) { 756 | SetArgument(index, first); 757 | } 758 | template 759 | void SetArgumentsRecursive(const size_t index, T &first, Args&... args) { 760 | SetArgument(index, first); 761 | SetArgumentsRecursive(index+1, args...); 762 | } 763 | }; 764 | 765 | // ================================================================================================= 766 | } // namespace CLCudaAPI 767 | 768 | // CLCUDAAPI_CUPP11_H_ 769 | #endif 770 | -------------------------------------------------------------------------------- /include/cxpp11_common.hpp: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 6 | // 7 | // Author(s): 8 | // Ivan Shapovalov 9 | // Cedric Nugteren 10 | // 11 | // This file contains exception classes as needed for either 'clpp11.hpp' or 'cupp11.h'. 
//
// =================================================================================================

#ifndef CLCUDAAPI_CXPP11_COMMON_H_
#define CLCUDAAPI_CXPP11_COMMON_H_

#include <string>    // std::string
#include <cstring>   // strchr
#include <stdexcept> // std::runtime_error
#include <utility>   // std::forward (was only transitively included before)

namespace CLCudaAPI {
// =================================================================================================

// Basic exception class: represents an error happened inside our code
// (as opposed to an error in C++ runtime)
template <typename Base>
class Error : public Base {
 public:
  // Perfect forwarding of the constructor since "using Base::Base" is not supported by VS 2013
  template <typename... Args>
  Error(Args&&... args):
      Base(std::forward<Args>(args)...) {
  }
};

// =================================================================================================

// Represents a generic device-specific runtime error (returned by an OpenCL or CUDA API function)
class DeviceError : public Error<std::runtime_error> {
 public:
  // Perfect forwarding of the constructor since "using Error<std::runtime_error>::Error" is not
  // supported by VS 2013
  template <typename... Args>
  DeviceError(Args&&... args):
      Error<std::runtime_error>(std::forward<Args>(args)...) {
  }

  // Strips the argument list from a call string, e.g. "clFoo(a, b)" -> "clFoo". Returns the
  // input unchanged when it contains no '('.
  static std::string TrimCallString(const char *where) {
    const char *paren = strchr(where, '(');
    if (paren) {
      return std::string(where, paren);
    } else {
      return std::string(where);
    }
  }
};

// =================================================================================================

// Represents a generic runtime error (aka environmental problem)
class RuntimeError : public Error<std::runtime_error> {
 public:
  explicit RuntimeError(const std::string &reason):
      Error("Run-time error: " + reason) {
  }
};

// =================================================================================================

// Represents a generic logic error (aka failed assertion)
class LogicError : public Error<std::logic_error> {
 public:
  explicit LogicError(const std::string &reason):
      Error("Internal logic error: " + reason) {
  }
};

// =================================================================================================

// Internal exception base class with a status field and a subclass-specific "details" field
// which can be used to recreate an exception
template <typename Base, typename Status>
class ErrorCode : public Base {
 public:
  ErrorCode(Status status, const std::string &details, const std::string &reason):
      Base(reason),
      status_(status),
      details_(details) {
  }

  // The raw status code returned by the underlying OpenCL/CUDA API call
  Status status() const {
    return status_;
  }

  // Subclass-specific context (e.g. the name of the failing API call)
  const std::string& details() const {
    return details_;
  }

 private:
  const Status status_;
  const std::string details_;
};

// =================================================================================================

} // namespace CLCudaAPI

// CLCUDAAPI_CXPP11_COMMON_H_
#endif

// ===================== file: samples/advanced.cc =====================
-------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file demonstrates more advanced usage of the C++11 interfaces to CUDA and OpenCL through 11 | // CLCudaAPI. This includes 2D thread dimensions and asynchronous host-device communication. The 12 | // example conserns a 2D convolution kernel with a very simple hard-coded 3x3 blur filter. 13 | // 14 | // ================================================================================================= 15 | // 16 | // Copyright 2015 SURFsara 17 | // 18 | // Licensed under the Apache License, Version 2.0 (the "License"); 19 | // you may not use this file except in compliance with the License. 20 | // You may obtain a copy of the License at 21 | // 22 | // http://www.apache.org/licenses/LICENSE-2.0 23 | // 24 | // Unless required by applicable law or agreed to in writing, software 25 | // distributed under the License is distributed on an "AS IS" BASIS, 26 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 27 | // See the License for the specific language governing permissions and 28 | // limitations under the License. 
29 | // 30 | // ================================================================================================= 31 | 32 | // Run with either OpenCL or CUDA as a back-end 33 | #if USE_OPENCL 34 | #include "clpp11.h" 35 | #else 36 | #include "cupp11.h" 37 | #endif 38 | 39 | // C++ includes 40 | #include 41 | #include 42 | #include 43 | 44 | // ================================================================================================= 45 | 46 | // This example uses a single monolithic function 47 | int main() { 48 | 49 | // This example passes different options to the run-time compiler based on which back-end is used 50 | #if USE_OPENCL 51 | auto compiler_options = std::vector{}; 52 | #else 53 | auto compiler_options = std::vector{"--gpu-architecture=compute_35"}; 54 | #endif 55 | 56 | // Example CUDA/OpenCL program as a string. Note that this is the first (header) part only, the 57 | // main body of the kernel is common among the two back-ends and is therefore not duplicated. 58 | #if USE_OPENCL 59 | auto program_head = R"( 60 | __kernel void convolution(__global float* x, __global float* y, 61 | const int size_x, const int size_y) { 62 | const int tid_x = get_global_id(0); 63 | const int tid_y = get_global_id(1); 64 | )"; 65 | #else 66 | auto program_head = R"( 67 | extern "C" __global__ void convolution(float* x, float* y, 68 | const int size_x, const int size_y) { 69 | const int tid_x = threadIdx.x + blockDim.x*blockIdx.x; 70 | const int tid_y = threadIdx.y + blockDim.y*blockIdx.y; 71 | )"; 72 | #endif 73 | 74 | // The common body of the OpenCL/CUDA program. This is glued after the 'program_head' string. 75 | // It implements a star-based fixed 3x3 blur filter. 
76 | auto program_tail = R"( 77 | float value = 0.0f; 78 | if (tid_x >= 1 && tid_y >= 1 && tid_x < size_x-1 && tid_y < size_y-1) { 79 | value += 0.2*x[(tid_y+1)*size_x + (tid_x )]; 80 | value += 0.2*x[(tid_y-1)*size_x + (tid_x )]; 81 | value += 0.2*x[(tid_y )*size_x + (tid_x )]; 82 | value += 0.2*x[(tid_y )*size_x + (tid_x+1)]; 83 | value += 0.2*x[(tid_y )*size_x + (tid_x-1)]; 84 | } 85 | y[tid_y*size_x + tid_x] = value; 86 | })"; 87 | auto program_string = std::string{program_head} + std::string{program_tail}; 88 | 89 | // =============================================================================================== 90 | 91 | // Sets the size of the 2D input/output matrices 92 | constexpr auto size_x = size_t{2048}; 93 | constexpr auto size_y = size_t{2048}; 94 | auto size = size_x * size_y; 95 | 96 | // Platform/device settings 97 | constexpr auto platform_id = size_t{0}; 98 | constexpr auto device_id = size_t{0}; 99 | 100 | // Initializes the CLCudaAPI platform and device. This initializes the OpenCL/CUDA back-end and 101 | // selects a specific device on the platform. The device class has methods to retrieve properties 102 | // such as the device name and vendor. More examples of device properties are given in the 103 | // `device_info.cc` sample program. 104 | printf("\n## Initializing...\n"); 105 | auto platform = CLCudaAPI::Platform(platform_id); 106 | auto device = CLCudaAPI::Device(platform, device_id); 107 | printf(" > Running on device '%s' of '%s'\n", device.Name().c_str(), device.Vendor().c_str()); 108 | 109 | // Creates a new CLCudaAPI context and queue for this device. The queue can be used to schedule 110 | // commands such as launching a kernel or performing a device-host memory copy. 
111 | auto context = CLCudaAPI::Context(device); 112 | auto queue = CLCudaAPI::Queue(context, device); 113 | 114 | // Creates a new CLCudaAPI event to be able to time kernels 115 | auto event = CLCudaAPI::Event(); 116 | 117 | // Creates a new program based on the kernel string. Note that the kernel string is moved-out when 118 | // constructing the program to save copying: it should no longer be used in the remainder of this 119 | // function. 120 | auto program = CLCudaAPI::Program(context, std::move(program_string)); 121 | 122 | // Builds this program and checks for any compilation errors. If there are any, they are printed 123 | // and execution is halted. 124 | printf("## Compiling the kernel...\n"); 125 | try { 126 | program.Build(device, compiler_options); 127 | } catch (const CLCudaAPI::CLCudaAPIBuildError &e) { 128 | if (program.StatusIsCompilationWarningOrError(e.status())) { 129 | auto message = program.GetBuildInfo(device); 130 | printf(" > Compiler error(s)/warning(s) found:\n%s\n", message.c_str()); 131 | } 132 | throw; 133 | } 134 | 135 | // Populate host matrices based on CUDA/OpenCL host buffers. When using the CUDA back-end, this 136 | // will create page-locked memories, benefiting from higher bandwidth when copying between the 137 | // host and device. These buffers mimic std::vector to some extend and can therefore be filled 138 | // using either the '[]' operator or range-based for-loops. 139 | auto host_a = CLCudaAPI::BufferHost(context, size); 140 | auto host_b = CLCudaAPI::BufferHost(context, size); 141 | for (auto x=size_t{0}; x(x + y/4); 144 | } 145 | } 146 | for (auto &item: host_b) { item = 0.0f; } 147 | 148 | // Creates two new device buffers and prints the sizes of these device buffers. Both buffers 149 | // in this example are readable and writable. 
150 | printf("## Allocating device memory...\n"); 151 | auto dev_a = CLCudaAPI::Buffer(context, CLCudaAPI::BufferAccess::kReadWrite, size); 152 | auto dev_b = CLCudaAPI::Buffer(context, CLCudaAPI::BufferAccess::kReadWrite, size); 153 | printf(" > Size of buffer A is %zu bytes\n", dev_a.GetSize()); 154 | printf(" > Size of buffer B is %zu bytes\n", dev_b.GetSize()); 155 | 156 | // Copies the matrices to the device a-synchronously. The queue is then finished to ensure that 157 | // the operations are completed before continuing. 158 | dev_a.WriteAsync(queue, size, host_a); 159 | dev_b.WriteAsync(queue, size, host_b); 160 | queue.Finish(); 161 | 162 | // Creates the 'convolution' kernel from the compiled program and sets the four arguments. Note 163 | // that this uses the direct form instead of setting each argument separately. 164 | auto kernel = CLCudaAPI::Kernel(program, "convolution"); 165 | auto size_x_int = static_cast(size_x); 166 | auto size_y_int = static_cast(size_y); 167 | kernel.SetArguments(dev_a, dev_b, size_x_int, size_y_int); 168 | 169 | // Creates a 2-dimensional thread configuration with thread-blocks/work-groups of 16x16 threads 170 | // and a total number of threads equal to the number of elements in the input/output matrices. 171 | constexpr auto kWorkGroupSizeX = size_t{16}; 172 | constexpr auto kWorkGroupSizeY = size_t{16}; 173 | auto global = std::vector{static_cast(size_x), static_cast(size_y)}; 174 | auto local = std::vector{kWorkGroupSizeX, kWorkGroupSizeY}; 175 | 176 | // Makes sure that the thread configuration is legal on this device 177 | if (!device.IsThreadConfigValid(local)) { 178 | printf("## Unsupported local thread configuration for this device, exiting.\n"); 179 | return 1; 180 | } 181 | 182 | // Enqueues the kernel and waits for the result. Note that launching the kernel is always 183 | // a-synchronous and thus requires finishing the queue in order to complete the operation. 
184 | printf("## Running the kernel...\n"); 185 | kernel.Launch(queue, global, local, event.pointer()); 186 | queue.Finish(event); 187 | printf(" > Took %.3lf ms\n", event.GetElapsedTime()); 188 | 189 | // For illustration purposes, this copies the result into a new device buffer. The old result 190 | // buffer 'dev_b' is now no longer used. 191 | auto dev_b_copy = CLCudaAPI::Buffer(context, CLCudaAPI::BufferAccess::kReadWrite, size); 192 | dev_b.CopyTo(queue, size, dev_b_copy); 193 | 194 | // Reads the results back from the new copy into the host memory 195 | dev_b_copy.ReadAsync(queue, size, host_b); 196 | queue.Finish(); 197 | 198 | // Prints the results for a couple of indices to verify that the work has been done 199 | printf("## All done. Sampled verification:\n"); 200 | const auto verification_indices = std::vector{20}; 201 | for (const auto &index: verification_indices) { 202 | printf(" > 0.2*%.lf + 0.2*%.lf + 0.2*%.lf + 0.2*%.lf + 0.2*%.lf = %.2lf\n", 203 | host_a[(index+1)*size_x + (index )], host_a[(index-1)*size_x + (index )], 204 | host_a[(index )*size_x + (index )], host_a[(index )*size_x + (index+1)], 205 | host_a[(index )*size_x + (index-1)], 206 | host_b[index*size_x + index]); 207 | } 208 | 209 | // End of the example: no frees or clean-up needed 210 | return 0; 211 | } 212 | 213 | // ================================================================================================= 214 | -------------------------------------------------------------------------------- /samples/device_info.cc: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 
6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file implements a generic version of 'clinfo' (OpenCL) and 'deviceQuery' (CUDA). This 11 | // demonstrates some of the features of CLCudaAPI's generic Device class. 12 | // 13 | // ================================================================================================= 14 | // 15 | // Copyright 2015 SURFsara 16 | // 17 | // Licensed under the Apache License, Version 2.0 (the "License"); 18 | // you may not use this file except in compliance with the License. 19 | // You may obtain a copy of the License at 20 | // 21 | // http://www.apache.org/licenses/LICENSE-2.0 22 | // 23 | // Unless required by applicable law or agreed to in writing, software 24 | // distributed under the License is distributed on an "AS IS" BASIS, 25 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | // See the License for the specific language governing permissions and 27 | // limitations under the License. 28 | // 29 | // ================================================================================================= 30 | 31 | // C++ includes 32 | #include 33 | #include 34 | #include 35 | 36 | // Run with either OpenCL or CUDA as a back-end 37 | #if USE_OPENCL 38 | #include "clpp11.h" 39 | #else 40 | #include "cupp11.h" 41 | #endif 42 | 43 | // ================================================================================================= 44 | 45 | // Example implementation of a device-query/info program 46 | int main() { 47 | 48 | // Platform/device settings 49 | constexpr auto platform_id = size_t{0}; 50 | constexpr auto device_id = size_t{0}; 51 | 52 | // Initializes the CLCudaAPI platform and device. This initializes the OpenCL/CUDA back-end and 53 | // selects a specific device on the platform. 54 | const auto platform = CLCudaAPI::Platform(platform_id); 55 | const auto device = CLCudaAPI::Device(platform, device_id); 56 | 57 | // Prints information about the chosen device. 
Most of these results should stay the same when 58 | // switching between the CUDA and OpenCL back-ends. 59 | printf("\n## Printing platform information...\n"); 60 | printf(" > Platform ID %zu\n", platform_id); 61 | printf(" > Platform name %s\n", platform.Name().c_str()); 62 | printf(" > Platform vendor %s\n", platform.Vendor().c_str()); 63 | printf(" > Platform version %s\n", platform.Version().c_str()); 64 | printf("\n## Printing device information...\n"); 65 | printf(" > Device ID %zu\n", device_id); 66 | printf(" > Framework version %s\n", device.Version().c_str()); 67 | printf(" > Vendor %s\n", device.Vendor().c_str()); 68 | printf(" > Device name %s\n", device.Name().c_str()); 69 | if (device.HasExtension("cl_amd_device_attribute_query")) { 70 | printf(" > AMD board name %s\n", device.AMDBoardName().c_str()); 71 | } 72 | if (device.HasExtension("cl_nv_device_attribute_query")) { 73 | printf(" > NVIDIA compute capability %s\n", device.NVIDIAComputeCapability().c_str()); 74 | } 75 | printf(" > Device type %s\n", device.Type().c_str()); 76 | printf(" > Max work-group size %zu\n", device.MaxWorkGroupSize()); 77 | printf(" > Max thread dimensions %zu\n", device.MaxWorkItemDimensions()); 78 | printf(" > Max work-group sizes:\n"); 79 | for (auto i=size_t{0}; i Local memory per work-group %zu bytes\n", device.LocalMemSize()); 83 | printf(" > Device capabilities %s\n", device.Capabilities().c_str()); 84 | printf(" > Core clock rate %zu MHz\n", device.CoreClock()); 85 | printf(" > Number of compute units %zu\n", device.ComputeUnits()); 86 | printf(" > Total memory size %zu bytes\n", device.MemorySize()); 87 | printf(" > Maximum allocatable memory %zu bytes\n", device.MaxAllocSize()); 88 | printf(" > Memory clock rate %zu MHz\n", device.MemoryClock()); 89 | printf(" > Memory bus width %zu bits\n", device.MemoryBusWidth()); 90 | 91 | // End of the example: no frees or clean-up needed 92 | return 0; 93 | } 94 | 95 | // 
================================================================================================= 96 | -------------------------------------------------------------------------------- /samples/simple.cc: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file implements a relatively simple toy example, in which an input vector is multiplied by 11 | // a constant to produce an output vector. This example demonstrates the basic usage of the C++11 12 | // interfaces to CUDA and OpenCL through CLCudaAPI. 13 | // 14 | // ================================================================================================= 15 | // 16 | // Copyright 2015 SURFsara 17 | // 18 | // Licensed under the Apache License, Version 2.0 (the "License"); 19 | // you may not use this file except in compliance with the License. 20 | // You may obtain a copy of the License at 21 | // 22 | // http://www.apache.org/licenses/LICENSE-2.0 23 | // 24 | // Unless required by applicable law or agreed to in writing, software 25 | // distributed under the License is distributed on an "AS IS" BASIS, 26 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 27 | // See the License for the specific language governing permissions and 28 | // limitations under the License. 
29 | // 30 | // ================================================================================================= 31 | 32 | // Runs with either OpenCL or CUDA as a back-end 33 | #if USE_OPENCL 34 | #include "clpp11.h" 35 | #else 36 | #include "cupp11.h" 37 | #endif 38 | 39 | // C++ includes 40 | #include 41 | #include 42 | #include 43 | 44 | // ================================================================================================= 45 | 46 | // This example uses a single monolithic function 47 | int main() { 48 | 49 | // Example CUDA/OpenCL program as a string. Note that the strings are loaded here as raw string 50 | // literals (using C++11's R"(string)" syntax). However, they can also be generated in-line or 51 | // perhaps placed in a separate file and loaded at run-time. 52 | #if USE_OPENCL 53 | auto program_string = R"( 54 | __kernel void multiply(__global float* x, __global float* y, const int factor) { 55 | const int tid = get_global_id(0); 56 | y[tid] = x[tid] * factor; 57 | })"; 58 | #else 59 | auto program_string = R"( 60 | extern "C" __global__ void multiply(float* x, float* y, const int factor) { 61 | const int tid = threadIdx.x + blockDim.x*blockIdx.x; 62 | y[tid] = x[tid] * factor; 63 | })"; 64 | #endif 65 | 66 | // =============================================================================================== 67 | 68 | // Sets the size of the vectors and the data-multiplication factor 69 | constexpr auto size = static_cast(2048 * 2048); 70 | auto multiply_factor = 2; 71 | 72 | // Platform/device settings 73 | constexpr auto platform_id = size_t{0}; 74 | constexpr auto device_id = size_t{0}; 75 | 76 | // Initializes the CLCudaAPI platform and device. This initializes the OpenCL/CUDA back-end and 77 | // selects a specific device on the platform. The device class has methods to retrieve properties 78 | // such as the device name and vendor. More examples of device properties are given in the 79 | // `device_info.cc` sample program. 
80 | printf("\n## Initializing...\n"); 81 | auto platform = CLCudaAPI::Platform(platform_id); 82 | auto device = CLCudaAPI::Device(platform, device_id); 83 | printf(" > Running on device '%s' of '%s'\n", device.Name().c_str(), device.Vendor().c_str()); 84 | 85 | // Creates a new CLCudaAPI context and queue for this device. The queue can be used to schedule 86 | // commands such as launching a kernel or performing a device-host memory copy. 87 | auto context = CLCudaAPI::Context(device); 88 | auto queue = CLCudaAPI::Queue(context, device); 89 | 90 | // Creates a new CLCudaAPI event to be able to time kernels 91 | auto event = CLCudaAPI::Event(); 92 | 93 | // Creates a new program based on the kernel string. Then, builds this program and checks for 94 | // any compilation errors. If there are any, they are printed and execution is halted. 95 | printf("## Compiling the kernel...\n"); 96 | auto program = CLCudaAPI::Program(context, program_string); 97 | auto compiler_options = std::vector{}; 98 | try { 99 | program.Build(device, compiler_options); 100 | } catch (const CLCudaAPI::CLCudaAPIBuildError &e) { 101 | if (program.StatusIsCompilationWarningOrError(e.status())) { 102 | auto message = program.GetBuildInfo(device); 103 | printf(" > Compiler error(s)/warning(s) found:\n%s\n", message.c_str()); 104 | } 105 | throw; 106 | } 107 | 108 | // Populates regular host vectors with example data 109 | auto host_a = std::vector(size); 110 | auto host_b = std::vector(size); 111 | for (auto i=size_t{0}; i(i); } 112 | for (auto &item: host_b) { item = 0.0f; } 113 | 114 | // Creates two new device buffers and copies the host data to these device buffers. 115 | auto dev_a = CLCudaAPI::Buffer(context, queue, host_a.begin(), host_a.end()); 116 | auto dev_b = CLCudaAPI::Buffer(context, queue, host_b.begin(), host_b.end()); 117 | 118 | // Creates the 'multiply' kernel from the compiled program and sets the three arguments. 
Note that 119 | // the indices of the arguments have to be set according to their order in the kernel. 120 | auto kernel = CLCudaAPI::Kernel(program, "multiply"); 121 | kernel.SetArgument(0, dev_a); 122 | kernel.SetArgument(1, dev_b); 123 | kernel.SetArgument(2, multiply_factor); 124 | 125 | // Creates a 1-dimensional thread configuration with thread-blocks/work-groups of 256 threads 126 | // and a total number of threads equal to the number of elements in the input/output vectors. 127 | constexpr auto kWorkGroupSize = size_t{256}; 128 | auto global = std::vector{size}; 129 | auto local = std::vector{kWorkGroupSize}; 130 | 131 | // Enqueues the kernel and waits for the result. Note that launching the kernel is always 132 | // a-synchronous and thus requires finishing the queue in order to complete the operation. 133 | printf("## Running the kernel...\n"); 134 | kernel.Launch(queue, global, local, event.pointer()); 135 | queue.Finish(event); 136 | printf(" > Took %.3lf ms\n", event.GetElapsedTime()); 137 | 138 | // Reads the results back to the host memory 139 | dev_b.Read(queue, size, host_b); 140 | 141 | // Prints the results for a couple of indices to verify that the work has been done 142 | printf("## All done. 
Sampled verification:\n"); 143 | const auto verification_indices = std::vector{4, 900}; 144 | for (const auto &index: verification_indices) { 145 | printf(" > %.lf*%d = %.lf\n", host_a[index], multiply_factor, host_b[index]); 146 | } 147 | 148 | // End of the example: no frees or clean-up needed 149 | return 0; 150 | } 151 | 152 | // ================================================================================================= 153 | -------------------------------------------------------------------------------- /samples/smallest.cc: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file implements a compact OpenCL/CUDA example inspired by the 'quest for the smallest OpenCL 11 | // program': http://arrayfire.com/quest-for-the-smallest-opencl-program/ 12 | // 13 | // ================================================================================================= 14 | // 15 | // Copyright 2015 SURFsara 16 | // 17 | // Licensed under the Apache License, Version 2.0 (the "License"); 18 | // you may not use this file except in compliance with the License. 19 | // You may obtain a copy of the License at 20 | // 21 | // http://www.apache.org/licenses/LICENSE-2.0 22 | // 23 | // Unless required by applicable law or agreed to in writing, software 24 | // distributed under the License is distributed on an "AS IS" BASIS, 25 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | // See the License for the specific language governing permissions and 27 | // limitations under the License. 
28 | // 29 | // ================================================================================================= 30 | 31 | // Compile using OpenCL ... 32 | #if USE_OPENCL 33 | #include "clpp11.h" 34 | static auto program_string = R"( 35 | __kernel void add(__global const float* a, __global const float* b, __global float* c) { 36 | unsigned idx = get_global_id(0); 37 | c[idx] = a[idx] + b[idx]; 38 | })"; 39 | 40 | // ... or use CUDA instead 41 | #else 42 | #include "cupp11.h" 43 | static auto program_string = R"( 44 | extern "C" __global__ void add(const float* a, const float* b, float* c) { 45 | unsigned idx = threadIdx.x + blockDim.x*blockIdx.x; 46 | c[idx] = a[idx] + b[idx]; 47 | })"; 48 | #endif 49 | 50 | #include 51 | 52 | int main() { 53 | constexpr auto platform_id = size_t{0}; 54 | constexpr auto device_id = size_t{0}; 55 | auto platform = CLCudaAPI::Platform(platform_id); 56 | auto device = CLCudaAPI::Device(platform, device_id); 57 | auto context = CLCudaAPI::Context(device); 58 | auto queue = CLCudaAPI::Queue(context, device); 59 | auto event = CLCudaAPI::Event(); 60 | 61 | // Creates and populates device memory 62 | constexpr auto elements = size_t{1024}; 63 | auto data = std::vector(elements, 5); 64 | auto a = CLCudaAPI::Buffer(context, CLCudaAPI::BufferAccess::kReadWrite, elements); 65 | auto b = CLCudaAPI::Buffer(context, CLCudaAPI::BufferAccess::kReadWrite, elements); 66 | auto c = CLCudaAPI::Buffer(context, CLCudaAPI::BufferAccess::kReadWrite, elements); 67 | a.Write(queue, elements, data); 68 | b.Write(queue, elements, data); 69 | 70 | // Compiles and launches the kernel 71 | auto program = CLCudaAPI::Program(context, program_string); 72 | auto compiler_options = std::vector{}; 73 | program.Build(device, compiler_options); 74 | auto kernel = CLCudaAPI::Kernel(program, "add"); 75 | kernel.SetArguments(a, b, c); 76 | kernel.Launch(queue, {elements}, {128}, event.pointer()); 77 | queue.Finish(event); 78 | 79 | // Reads the results back to the host 
memory 80 | auto result = std::vector(elements, 0); 81 | c.Read(queue, elements, result); 82 | for (auto &r: result) { printf("%.lf ", r); } 83 | printf("\n"); 84 | return 0; 85 | } 86 | 87 | // ================================================================================================= 88 | -------------------------------------------------------------------------------- /test/unit_tests.cc: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file implements unit tests based on the Catch header-only test framework. 11 | // 12 | // ================================================================================================= 13 | // 14 | // Copyright 2015 SURFsara 15 | // 16 | // Licensed under the Apache License, Version 2.0 (the "License"); 17 | // you may not use this file except in compliance with the License. 18 | // You may obtain a copy of the License at 19 | // 20 | // http://www.apache.org/licenses/LICENSE-2.0 21 | // 22 | // Unless required by applicable law or agreed to in writing, software 23 | // distributed under the License is distributed on an "AS IS" BASIS, 24 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 | // See the License for the specific language governing permissions and 26 | // limitations under the License. 
27 | // 28 | // ================================================================================================= 29 | 30 | // Use Catch 31 | #define CATCH_CONFIG_MAIN 32 | #include "catch.hpp" 33 | 34 | // Runs with either OpenCL or CUDA as a back-end 35 | #if USE_OPENCL 36 | #include 37 | #else 38 | #include 39 | #endif 40 | 41 | // Settings 42 | const size_t kPlatformID = 0; 43 | const size_t kDeviceID = 0; 44 | const size_t kBufferSize = 10; 45 | 46 | // ================================================================================================= 47 | 48 | SCENARIO("events can be created and used", "[Event]") { 49 | GIVEN("An example event") { 50 | #if !USE_OPENCL 51 | auto platform = CLCudaAPI::Platform(kPlatformID); 52 | auto device = CLCudaAPI::Device(platform, kDeviceID); 53 | auto context = CLCudaAPI::Context(device); 54 | auto queue = CLCudaAPI::Queue(context, device); 55 | #endif 56 | auto event = CLCudaAPI::Event(); 57 | 58 | #if USE_OPENCL // Not available for the CUDA version 59 | WHEN("its underlying data-structure is retrieved") { 60 | auto raw_event = event(); 61 | THEN("a copy of this event can be created") { 62 | auto event_copy = CLCudaAPI::Event(raw_event); 63 | REQUIRE(event_copy() == event()); 64 | } 65 | } 66 | #else // Not available for the OpenCL version 67 | WHEN("its underlying data-structures are retrieved") { 68 | auto raw_start = event.start(); 69 | auto raw_end = event.end(); 70 | THEN("their underlying data-structures are not null") { 71 | REQUIRE(raw_start != nullptr); 72 | REQUIRE(raw_end != nullptr); 73 | } 74 | } 75 | #endif 76 | 77 | WHEN("a copy is created using the copy constructor") { 78 | auto event_copy = CLCudaAPI::Event(event); 79 | THEN("its underlying data-structure is unchanged") { 80 | #if USE_OPENCL 81 | REQUIRE(event_copy() == event()); 82 | #else 83 | REQUIRE(event_copy.start() == event.start()); 84 | REQUIRE(event_copy.end() == event.end()); 85 | #endif 86 | } 87 | } 88 | 89 | // TODO: Not working if nothing 
is recorded 90 | //WHEN("the elapsed time is retrieved") { 91 | // auto elapsed_time = event.GetElapsedTime(); 92 | // THEN("its value is valid") { 93 | // REQUIRE(elapsed_time == elapsed_time); 94 | // } 95 | //} 96 | } 97 | } 98 | 99 | // ================================================================================================= 100 | 101 | SCENARIO("platforms can be created and used", "[Platform]") { 102 | GIVEN("An example platform") { 103 | auto platform = CLCudaAPI::Platform(kPlatformID); 104 | auto num_devices = platform.NumDevices(); 105 | 106 | #if USE_OPENCL // Not available for the CUDA version 107 | WHEN("its underlying data-structure is retrieved") { 108 | auto raw_platform = platform(); 109 | THEN("a copy of this platform can be created") { 110 | auto platform_copy = CLCudaAPI::Platform(raw_platform); 111 | REQUIRE(platform_copy.NumDevices() == num_devices); 112 | } 113 | } 114 | #endif 115 | 116 | WHEN("a copy is created using the copy constructor") { 117 | auto platform_copy = CLCudaAPI::Platform(platform); 118 | THEN("the platform's properties remain unchanged") { 119 | REQUIRE(platform_copy.NumDevices() == num_devices); 120 | } 121 | } 122 | } 123 | } 124 | 125 | // ================================================================================================= 126 | 127 | TEST_CASE("a list of all platforms can be retrieved", "[Platform]") { 128 | auto all_platforms = CLCudaAPI::GetAllPlatforms(); 129 | REQUIRE(all_platforms.size() > 0); 130 | for (auto &platform : all_platforms) { 131 | auto num_devices = platform.NumDevices(); 132 | REQUIRE(num_devices > 0); 133 | } 134 | } 135 | 136 | // ================================================================================================= 137 | 138 | SCENARIO("devices can be created and used", "[Device][Platform]") { 139 | GIVEN("An example device on a platform") { 140 | auto platform = CLCudaAPI::Platform(kPlatformID); 141 | auto device = CLCudaAPI::Device(platform, kDeviceID); 142 | 143 | 
GIVEN("...and device properties") { 144 | auto device_version = device.Version(); 145 | auto device_vendor = device.Vendor(); 146 | auto device_name = device.Name(); 147 | auto device_type = device.Type(); 148 | auto device_max_work_group_size = device.MaxWorkGroupSize(); 149 | auto device_max_work_item_dimensions = device.MaxWorkItemDimensions(); 150 | auto device_max_work_item_sizes = device.MaxWorkItemSizes(); 151 | auto device_local_mem_size = device.LocalMemSize(); 152 | auto device_capabilities = device.Capabilities(); 153 | auto device_core_clock = device.CoreClock(); 154 | auto device_compute_units = device.ComputeUnits(); 155 | auto device_memory_size = device.MemorySize(); 156 | auto device_max_alloc_size = device.MaxAllocSize(); 157 | auto device_memory_clock = device.MemoryClock(); 158 | auto device_memory_bus_width = device.MemoryBusWidth(); 159 | 160 | // TODO: test for valid device properties 161 | 162 | WHEN("its underlying data-structure is retrieved") { 163 | auto raw_device = device(); 164 | THEN("a copy of this device can be created") { 165 | auto device_copy = CLCudaAPI::Device(raw_device); 166 | REQUIRE(device_copy.Name() == device_name); // Only verifying device name 167 | } 168 | } 169 | 170 | WHEN("a copy is created using the copy constructor") { 171 | auto device_copy = CLCudaAPI::Device(device); 172 | THEN("the device's properties remain unchanged") { 173 | REQUIRE(device_copy.Name() == device_name); // Only verifying device name 174 | } 175 | } 176 | 177 | WHEN("the local memory size is tested") { 178 | THEN("the maximum local memory size should be considered valid") { 179 | REQUIRE(device.IsLocalMemoryValid(device_local_mem_size) == true); 180 | } 181 | THEN("more than the maximum local memory size should be considered invalid") { 182 | REQUIRE(device.IsLocalMemoryValid(device_local_mem_size+1) == false); 183 | } 184 | } 185 | 186 | WHEN("the local thread configuration is tested") { 187 | THEN("equal to the maximum size in one dimension 
should be considered valid") { 188 | REQUIRE(device.IsThreadConfigValid({device_max_work_item_sizes[0],1,1}) == true); 189 | REQUIRE(device.IsThreadConfigValid({1,device_max_work_item_sizes[1],1}) == true); 190 | REQUIRE(device.IsThreadConfigValid({1,1,device_max_work_item_sizes[2]}) == true); 191 | } 192 | THEN("more than the maximum size in one dimension should be considered invalid") { 193 | REQUIRE(device.IsThreadConfigValid({device_max_work_item_sizes[0]+1,1,1}) == false); 194 | REQUIRE(device.IsThreadConfigValid({1,device_max_work_item_sizes[1]+1,1}) == false); 195 | REQUIRE(device.IsThreadConfigValid({1,1,device_max_work_item_sizes[2]+1}) == false); 196 | } 197 | } 198 | } 199 | } 200 | } 201 | 202 | // ================================================================================================= 203 | 204 | SCENARIO("contexts can be created and used", "[Context][Device][Platform]") { 205 | GIVEN("An example context on a device") { 206 | auto platform = CLCudaAPI::Platform(kPlatformID); 207 | auto device = CLCudaAPI::Device(platform, kDeviceID); 208 | auto context = CLCudaAPI::Context(device); 209 | 210 | WHEN("its underlying data-structure is retrieved") { 211 | auto raw_context = context(); 212 | THEN("a copy of this context can be created") { 213 | auto context_copy = CLCudaAPI::Context(raw_context); 214 | REQUIRE(context_copy() != nullptr); 215 | } 216 | } 217 | 218 | WHEN("a copy is created using the copy constructor") { 219 | auto context_copy = CLCudaAPI::Context(context); 220 | THEN("its underlying data-structure is not null") { 221 | REQUIRE(context_copy() != nullptr); 222 | } 223 | } 224 | } 225 | } 226 | 227 | // ================================================================================================= 228 | 229 | SCENARIO("programs can be created and used", "[Program][Context][Device][Platform]") { 230 | GIVEN("An example program for a specific context and device") { 231 | auto platform = CLCudaAPI::Platform(kPlatformID); 232 | auto 
device = CLCudaAPI::Device(platform, kDeviceID); 233 | auto context = CLCudaAPI::Context(device); 234 | #if USE_OPENCL 235 | auto source = R"( 236 | __kernel void add(__global const float* a, __global const float* b, __global float* c) { 237 | unsigned idx = get_global_id(0); 238 | c[idx] = a[idx] + b[idx]; 239 | })"; 240 | 241 | // ... or use CUDA instead 242 | #else 243 | auto source = R"( 244 | extern "C" __global__ void add(const float* a, const float* b, float* c) { 245 | unsigned idx = threadIdx.x + blockDim.x*blockIdx.x; 246 | c[idx] = a[idx] + b[idx]; 247 | })"; 248 | #endif 249 | auto options = std::vector(); 250 | 251 | auto program = CLCudaAPI::Program(context, source); 252 | program.Build(device, options); 253 | 254 | WHEN("an compiled IR is generated from the compiled program") { 255 | auto ir = program.GetIR(); 256 | THEN("a new program can be created based on the IR") { 257 | auto new_program = CLCudaAPI::Program(device, context, ir); 258 | new_program.Build(device, options); 259 | } 260 | } 261 | } 262 | } 263 | 264 | // ================================================================================================= 265 | 266 | SCENARIO("queues can be created and used", "[Queue][Context][Device][Platform][Event]") { 267 | GIVEN("An example queue associated to a context and device") { 268 | auto platform = CLCudaAPI::Platform(kPlatformID); 269 | auto device = CLCudaAPI::Device(platform, kDeviceID); 270 | auto context = CLCudaAPI::Context(device); 271 | auto queue = CLCudaAPI::Queue(context, device); 272 | 273 | #if USE_OPENCL // Not available for the CUDA version 274 | WHEN("its underlying data-structure is retrieved") { 275 | auto raw_queue = queue(); 276 | THEN("a copy of this queue can be created") { 277 | auto queue_copy = CLCudaAPI::Queue(raw_queue); 278 | REQUIRE(queue_copy() != nullptr); 279 | } 280 | } 281 | #endif 282 | 283 | WHEN("a copy is created using the copy constructor") { 284 | auto queue_copy = CLCudaAPI::Queue(queue); 285 | 
THEN("its underlying data-structure is not null") { 286 | REQUIRE(queue_copy() != nullptr); 287 | } 288 | } 289 | 290 | WHEN("the associated context is retrieved") { 291 | auto context_copy = queue.GetContext(); 292 | THEN("their underlying data-structures match") { 293 | REQUIRE(context_copy() == context()); 294 | } 295 | } 296 | WHEN("the associated device is retrieved") { 297 | auto device_copy = queue.GetDevice(); 298 | THEN("their underlying data-structures match") { 299 | REQUIRE(device_copy() == device()); 300 | } 301 | } 302 | 303 | WHEN("the queue is synchronised") { 304 | queue.Finish(); 305 | THEN("its underlying data-structure is not null") { 306 | REQUIRE(queue() != nullptr); 307 | } 308 | } 309 | WHEN("the queue is synchronised using an event") { 310 | auto event = CLCudaAPI::Event(); 311 | queue.Finish(event); 312 | THEN("its underlying data-structure is not null") { 313 | REQUIRE(queue() != nullptr); 314 | } 315 | } 316 | } 317 | } 318 | 319 | // ================================================================================================= 320 | 321 | SCENARIO("host buffers can be created and used", "[BufferHost][Context][Device][Platform]") { 322 | GIVEN("An example host buffer for a specific context and device") { 323 | auto platform = CLCudaAPI::Platform(kPlatformID); 324 | auto device = CLCudaAPI::Device(platform, kDeviceID); 325 | auto context = CLCudaAPI::Context(device); 326 | auto size = static_cast(kBufferSize); 327 | auto buffer_host = CLCudaAPI::BufferHost(context, size); 328 | 329 | // TODO: Fill in 330 | } 331 | } 332 | 333 | // ================================================================================================= 334 | 335 | SCENARIO("device buffers can be created and used", "[Buffer][Context][Device][Platform]") { 336 | GIVEN("An example device buffer for a specific context and device") { 337 | auto platform = CLCudaAPI::Platform(kPlatformID); 338 | auto device = CLCudaAPI::Device(platform, kDeviceID); 339 | auto context 
= CLCudaAPI::Context(device); 340 | auto size = static_cast(kBufferSize); 341 | auto buffer = CLCudaAPI::Buffer(context, size); 342 | 343 | // TODO: Fill in 344 | } 345 | } 346 | 347 | // ================================================================================================= 348 | 349 | SCENARIO("kernels can be created and used", "[Kernel][Program][Context][Device][Platform]") { 350 | GIVEN("An example device buffer for a specific context and device") { 351 | auto platform = CLCudaAPI::Platform(kPlatformID); 352 | auto device = CLCudaAPI::Device(platform, kDeviceID); 353 | auto context = CLCudaAPI::Context(device); 354 | auto source = std::string{""}; 355 | auto program = CLCudaAPI::Program(context, source); 356 | auto name = std::string{""}; 357 | //auto kernel = CLCudaAPI::Kernel(program, name); 358 | 359 | // TODO: Fill in 360 | } 361 | } 362 | 363 | // ================================================================================================= 364 | --------------------------------------------------------------------------------