├── .gitignore ├── CHANGELOG ├── CMakeLists.txt ├── LICENSE ├── README.md ├── cmake └── Modules │ └── FindOpenCL.cmake ├── doc └── api.md ├── include ├── clpp11.h ├── cupp11.h └── cxpp11_common.hpp ├── samples ├── advanced.cc ├── device_info.cc ├── simple.cc └── smallest.cc └── test ├── catch.hpp └── unit_tests.cc /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | .* 3 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | 2 | Version 9.0 (2017-10-08): 3 | - Synchronized with the CLBLast's clpp11.h header 4 | - Added custom exception class hierarchy for catching errors 5 | - Removal of custom error codes for program building in favor of throwing exceptions 6 | - Added type aliases for raw types 7 | - Several minor fixes 8 | - Added new methods to the API: 9 | * Platform::Name 10 | * Platform::Vendor 11 | * Platform::Version 12 | * Device::HasExtension 13 | * Device::SupportsFP64 14 | * Device::SupportsFP16 15 | * Device::HasExtension 16 | * Device::AMDBoardName 17 | * Device::NVIDIAComputeCapability 18 | 19 | Version 8.0 (2016-09-27): 20 | - Several minor fixes 21 | - Added new methods to the API: 22 | * GetAllPlatforms 23 | * A new constructor for the Program class based on a binary or IR string (both OpenCL and CUDA) 24 | 25 | Version 7.0 (2016-08-03): 26 | - Re-wrote the OpenCL event implementation with proper memory management 27 | - Updated some return types of device-query information to fix issues on 32-bit systems 28 | - Updated the API documentation 29 | - Refactored some functions to reduce the amount of code 30 | - Added new methods to the API: 31 | * Kernel::GetFunctionName 32 | 33 | Version 6.0 (2016-06-29): 34 | - Added the possibility to use Event pointers, adjusted the Kernel::Launch function to do so 35 | - Added a new constructor for Program based on a binary (OpenCL 
only) 36 | - Fixed a bug when OpenCL 2.0 or newer is installed but the device doesn't support it 37 | - Added new methods to the API: 38 | * Device::VersionNumber (integer version of the string-getter Device::Version) 39 | * Device::IsCPU, Device::IsGPU, Device::IsAMD, Device::IsNVIDIA, Device::IsIntel, Device::IsARM 40 | 41 | Version 5.0 (2016-04-21): 42 | - Buffers can now also be 'not owned' to disable automatic memory freeing afterwards 43 | - Made 'Buffer::Read' and 'Buffer::ReadAsync' constant methods 44 | - Added new methods to the API: 45 | * Event::WaitForCompletion (OpenCL only) 46 | * Kernel::Launch (version with OpenCL waiting list) 47 | 48 | Version 4.0 (2015-11-01): 49 | - Made 'CopyTo' and 'CopyToAsync' constant methods 50 | - Added offset support to the Buffer class (credits go to 'ielhelw') 51 | - Added unit tests for {Event, Device, Context, Queue} classes 52 | - Added compact OpenCL example 53 | - Fixed compiler warnings and errors for Windows using MSVC 54 | - Fixed several general compiler warnings 55 | - Added new methods to the API: 56 | * Device::MaxAllocSize 57 | 58 | Version 3.0 (2015-09-04): 59 | - Renamed the project from 'Claduc' into 'CLCudaAPI' 60 | - SetArgument now takes both l-value and r-value arguments 61 | - Added first version of a test infrastructure 62 | - Added new methods to the API: 63 | * Platform::NumDevices 64 | * Buffer::Buffer (a constructor with default read-write access) 65 | * Buffer::Buffer (a constructor filled with data from C++ start/end iterators) 66 | * Kernel::Launch (version with default OpenCL workgroup size) 67 | 68 | Version 2.0 (2015-07-13): 69 | - Allows device program string to be moved into Program at construction 70 | - Cleaned-up device-information methods 71 | - Added new methods to the API: 72 | * Device::CoreClock 73 | * Device::ComputeUnits 74 | * Device::MemorySize 75 | * Device::MemoryClock 76 | * Device::MemoryBusWidth 77 | * Program::GetIR 78 | * Kernel::SetArguments 79 | 80 | Version 1.0 
(2015-07-09): 81 | - Initial version 82 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | # ================================================================================================== 3 | # This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. This 4 | # project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | # width of 100 characters per line. 6 | # 7 | # Author(s): 8 | # Cedric Nugteren 9 | # 10 | # This provides a simple build infrastructure for the sample programs. The option USE_OPENCL can be 11 | # used to toggle between a CUDA or OpenCL back-end. 12 | # 13 | # ================================================================================================== 14 | # 15 | # Copyright 2015 SURFsara 16 | # 17 | # Licensed under the Apache License, Version 2.0 (the "License"); 18 | # you may not use this file except in compliance with the License. 19 | # You may obtain a copy of the License at 20 | # 21 | # http://www.apache.org/licenses/LICENSE-2.0 22 | # 23 | # Unless required by applicable law or agreed to in writing, software 24 | # distributed under the License is distributed on an "AS IS" BASIS, 25 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | # See the License for the specific language governing permissions and 27 | # limitations under the License. 
28 | # 29 | # ================================================================================================== 30 | 31 | # CMake project details 32 | cmake_minimum_required(VERSION 2.8.10) 33 | project("CLCudaAPI" CXX) 34 | set(CLCudaAPI_VERSION_MAJOR 8) 35 | set(CLCudaAPI_VERSION_MINOR 0) 36 | 37 | # ================================================================================================== 38 | 39 | # Enable tests 40 | option(ENABLE_TESTS "Build test-suite" ON) 41 | 42 | # Select between OpenCL and CUDA back-end 43 | option(USE_OPENCL "Use OpenCL instead of CUDA" ON) 44 | if(USE_OPENCL) 45 | message("-- Building samples with OpenCL") 46 | add_definitions(-DUSE_OPENCL) 47 | else() 48 | message("-- Building samples with CUDA") 49 | endif() 50 | 51 | # ================================================================================================== 52 | 53 | # Compiler-version check (requires at least CMake 2.8.10) 54 | if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) 55 | if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) 56 | message(FATAL_ERROR "GCC version must be at least 4.7") 57 | endif() 58 | elseif(CMAKE_CXX_COMPILER_ID STREQUAL Clang) 59 | if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3) 60 | message(FATAL_ERROR "Clang version must be at least 3.3") 61 | endif() 62 | elseif(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang) 63 | if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) 64 | message(FATAL_ERROR "AppleClang version must be at least 5.0") 65 | endif() 66 | elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel) 67 | if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0) 68 | message(FATAL_ERROR "ICC version must be at least 14.0") 69 | endif() 70 | elseif(MSVC) 71 | if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0) 72 | message(FATAL_ERROR "MS Visual Studio version must be at least 18.0") 73 | endif() 74 | endif() 75 | 76 | # C++ compiler settings 77 | if(MSVC) 78 | set(FLAGS "/Ox") 79 | else() 80 | set(FLAGS "-O3 -std=c++11 -Wall -Wno-comment") 81 | if(CMAKE_CXX_COMPILER_ID 
STREQUAL GNU) 82 | if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0.0) 83 | # GCC does not support attributes on template arguments 84 | # in particular we hit this with the alignment attributes on cl_XXX types 85 | # which are then used to instantiate various templates in CLBlast 86 | set(FLAGS "${FLAGS} -Wno-ignored-attributes") 87 | endif() 88 | endif() 89 | endif() 90 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") 91 | 92 | # ================================================================================================== 93 | 94 | # Package scripts location 95 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") 96 | 97 | # Requires CUDA and OpenCL. The latter is found through the included "FindOpenCL.cmake". 98 | if(USE_OPENCL) 99 | find_package(OpenCL REQUIRED) 100 | else() 101 | find_package(CUDA REQUIRED) 102 | endif() 103 | 104 | # ================================================================================================== 105 | 106 | # Include directories: C++11 headers and OpenCL/CUDA includes 107 | include_directories(${CLCudaAPI_SOURCE_DIR}/include) 108 | if(USE_OPENCL) 109 | include_directories(${OPENCL_INCLUDE_DIRS}) 110 | else() 111 | include_directories(${CUDA_INCLUDE_DIRS}) 112 | endif() 113 | 114 | # Link directories: CUDA toolkit 115 | if(USE_OPENCL) 116 | 117 | else() 118 | link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64) 119 | endif() 120 | 121 | # ================================================================================================== 122 | 123 | # Adds the sample programs 124 | set(SAMPLE_PROGRAMS device_info simple advanced smallest) 125 | foreach(SAMPLE ${SAMPLE_PROGRAMS}) 126 | add_executable(${SAMPLE} samples/${SAMPLE}.cc) 127 | if(USE_OPENCL) 128 | target_link_libraries(${SAMPLE} ${OPENCL_LIBRARIES}) 129 | else() 130 | target_link_libraries(${SAMPLE} cuda nvrtc) 131 | endif() 132 | install(TARGETS ${SAMPLE} DESTINATION bin) 133 | endforeach() 134 | 135 | # 
================================================================================================== 136 | 137 | # Optional: Enable inclusion of the test-suite 138 | if(ENABLE_TESTS) 139 | enable_testing() 140 | include_directories(${CLCudaAPI_SOURCE_DIR}/test) 141 | add_executable(unit_tests test/unit_tests.cc) 142 | if(USE_OPENCL) 143 | target_link_libraries(unit_tests ${OPENCL_LIBRARIES}) 144 | else() 145 | target_link_libraries(unit_tests cuda nvrtc) 146 | endif() 147 | add_test(unit_tests unit_tests) 148 | endif() 149 | 150 | # ================================================================================================== 151 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2015 SURFsara 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | CLCudaAPI: A portable high-level API with CUDA or OpenCL back-end 3 | ================ 4 | 5 | CLCudaAPI provides a C++ interface to the OpenCL API and/or CUDA API. This interface is high-level: all the details of setting up an OpenCL platform and device are handled automatically, as well as for example OpenCL and CUDA memory management. 
A similar high-level API is also provided by Khronos's `cl.hpp`, so why would someone use CLCudaAPI instead? The main reason is portability: CLCudaAPI provides two header files which both implement the exact same API, but with a different back-end. This allows __porting between OpenCL and CUDA by simply changing the header file!__ 6 | 7 | CLCudaAPI is written in C++11 and wraps CUDA and OpenCL objects in smart pointers, thus handling memory management automatically. It uses the CUDA driver API, since this is the closest to the OpenCL API, but it uses the OpenCL terminology, since this is the most generic. It compiles OpenCL and/or CUDA kernels at run-time, possible in CUDA only since release 7.0. CLCudaAPI handles the host API only: it still requires two versions of the kernel (although some simple defines could omit this requirement). 8 | 9 | 10 | What does it look like? 11 | ------------- 12 | 13 | To get started, include either of the two headers: 14 | 15 | ```c++ 16 | #include "clpp11.h" 17 | // or: 18 | #include "cupp11.h" 19 | ``` 20 | 21 | Here is a simple example of setting-up platform 0 and selecting device 2: 22 | 23 | ```c++ 24 | auto platform = CLCudaAPI::Platform(0); 25 | auto device = CLCudaAPI::Device(platform, 2); 26 | ``` 27 | 28 | Next, we'll create a CUDA/OpenCL context and a queue (== CUDA stream) on this device: 29 | 30 | ```c++ 31 | auto context = CLCudaAPI::Context(device); 32 | auto queue = CLCudaAPI::Queue(context, device); 33 | ``` 34 | 35 | And, once the context and queue are created, we can allocate and upload data to the device: 36 | 37 | ```c++ 38 | auto host_mem = std::vector<float>(size); 39 | auto device_mem = CLCudaAPI::Buffer<float>(context, size); 40 | device_mem.Write(queue, size, host_mem); 41 | ``` 42 | 43 | Further examples are included in the `samples` folder. To start with CLCudaAPI, check out `samples/simple.cc`, which shows how to compile and launch a simple kernel. 
The full [CLCudaAPI API reference](doc/api.md) is also available in the current repository. 44 | 45 | 46 | Why would I use CLCudaAPI? 47 | ------------- 48 | 49 | The main reasons to use CLCudaAPI are: 50 | 51 | * __Portability__: the CUDA and OpenCL CLCudaAPI headers implement the exact same API. 52 | * __Memory management__: smart pointers allocate and free memory automatically. 53 | * __Error checking__: all CUDA and OpenCL API calls are automatically checked for errors. 54 | * __Abstraction__: CLCudaAPI provides a higher-level interface than OpenCL, CUDA, and `cl.hpp`. 55 | * __Easy to use__: simply ship two OS/hardware-independent header files, no compilation needed. 56 | * __Low overhead__: all function calls are automatically in-lined by the compiler. 57 | * __Native compiler__: CLCudaAPI code can be compiled with a normal C++ compiler; there is no need to use `nvcc`. 58 | 59 | Nevertheless, there are also several cases when CLCudaAPI is not suitable: 60 | 61 | * When fine-grained control is desired: CLCudaAPI makes abstractions to certain OpenCL/CUDA handles and settings. 62 | * When unsupported features are desired: only the most common cases are currently implemented. Although this is not a fundamental limitation, it is a practical one. For example, OpenGL interoperability and CUDA constant/texture memory are not supported. 63 | * When run-time compilation is not an option: e.g. when compilation overhead is too high. 64 | 65 | What are the pre-requisites? 66 | ------------- 67 | 68 | The requirements to use the CLCudaAPI headers are: 69 | 70 | * CUDA 7.0 or higher 71 | * OpenCL 1.1 or higher 72 | * A C++11 compiler (e.g. GCC 4.7, Clang 3.3, MSVC 2015 or newer) 73 | 74 | If you also want to compile the samples and tests using the provided infrastructure, you'll also need: 75 | 76 | * CMake 2.8.10 or higher 77 | 78 | 79 | How do I compile the included examples with CMake? 
80 | ------------- 81 | 82 | Use CMake to create an out-of-source build: 83 | 84 | ```bash 85 | mkdir build 86 | cd build 87 | cmake -DUSE_OPENCL=ON .. 88 | make 89 | ``` 90 | 91 | Replace `-DUSE_OPENCL=ON` with `-DUSE_OPENCL=OFF` to use CUDA instead of OpenCL as a back-end. After compilation, the `build` folder will contain a binary for each of the sample programs included in the `samples` subfolder. 92 | 93 | 94 | How do I compile the included test-suite with CMake? 95 | ------------- 96 | 97 | Compiling the examples (see above) will also compile the tests (unless `-DENABLE_TESTS=OFF` is set). The tests will use either the OpenCL or CUDA back-end, similar to the samples. After compilation, the tests can be run using CTest or as follows: 98 | 99 | ```bash 100 | ./unit_tests 101 | ``` 102 | 103 | 104 | FAQ 105 | ------------- 106 | 107 | > Q: __After I include the CLCudaAPI CUDA header, the linker finds an undefined reference to `nvrtcGetErrorString`. What should I do?__ 108 | > 109 | > A: You need to link against the NVIDIA Run-Time Compilation Library (NVRTC). For example, pass `-lnvrtc -L/opt/cuda/lib64` to the compiler. 110 | -------------------------------------------------------------------------------- /cmake/Modules/FindOpenCL.cmake: -------------------------------------------------------------------------------- 1 | 2 | # ================================================================================================== 3 | # This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. This 4 | # project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | # width of 100 characters per line. 
6 | # 7 | # Author(s): 8 | # Cedric Nugteren 9 | # 10 | # Defines the following variables: 11 | # OPENCL_FOUND Boolean holding whether or not the OpenCL library was found 12 | # OPENCL_INCLUDE_DIRS The OpenCL include directory 13 | # OPENCL_LIBRARIES The OpenCL library 14 | # 15 | # In case OpenCL is not installed in the default directory, set the OPENCL_ROOT variable to point to 16 | # the root of OpenCL, such that 'OpenCL/cl.h' or 'CL/cl.h' can be found in $OPENCL_ROOT/include. 17 | # This can either be done using an environmental variable (e.g. export OPENCL_ROOT=/path/to/opencl) 18 | # or using a CMake variable (e.g. cmake -DOPENCL_ROOT=/path/to/opencl ..). 19 | # 20 | # ================================================================================================== 21 | # 22 | # Copyright 2015 SURFsara 23 | # 24 | # Licensed under the Apache License, Version 2.0 (the "License"); 25 | # you may not use this file except in compliance with the License. 26 | # You may obtain a copy of the License at 27 | # 28 | # http://www.apache.org/licenses/LICENSE-2.0 29 | # 30 | # Unless required by applicable law or agreed to in writing, software 31 | # distributed under the License is distributed on an "AS IS" BASIS, 32 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 33 | # See the License for the specific language governing permissions and 34 | # limitations under the License. 
35 | # 36 | # ================================================================================================== 37 | 38 | # Sets the possible install locations 39 | set(OPENCL_HINTS 40 | ${OPENCL_ROOT} 41 | $ENV{OPENCL_ROOT} 42 | $ENV{AMDAPPSDKROOT} 43 | $ENV{CUDA_PATH} 44 | $ENV{INTELOCLSDKROOT} 45 | $ENV{NVSDKCOMPUTE_ROOT} 46 | $ENV{ATISTREAMSDKROOT} 47 | ) 48 | set(OPENCL_PATHS 49 | /usr/local/cuda 50 | /opt/cuda 51 | /opt/intel/opencl 52 | /usr 53 | /usr/local 54 | ) 55 | 56 | # Finds the include directories 57 | find_path(OPENCL_INCLUDE_DIRS 58 | NAMES OpenCL/cl.h CL/cl.h 59 | HINTS ${OPENCL_HINTS} 60 | PATH_SUFFIXES include OpenCL/common/inc inc include/x86_64 include/x64 61 | PATHS ${OPENCL_PATHS} 62 | DOC "OpenCL include header OpenCL/cl.h or CL/cl.h" 63 | ) 64 | mark_as_advanced(OPENCL_INCLUDE_DIRS) 65 | 66 | # Finds the library 67 | find_library(OPENCL_LIBRARIES 68 | NAMES OpenCL 69 | HINTS ${OPENCL_HINTS} 70 | PATH_SUFFIXES lib lib64 lib/x86_64 lib/x86_64/sdk lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64 71 | PATHS ${OPENCL_PATHS} 72 | DOC "OpenCL library" 73 | ) 74 | mark_as_advanced(OPENCL_LIBRARIES) 75 | 76 | # ================================================================================================== 77 | 78 | # Notification messages 79 | if(NOT OPENCL_INCLUDE_DIRS) 80 | message(STATUS "Could NOT find 'OpenCL/cl.h' or 'CL/cl.h', install OpenCL or set OPENCL_ROOT") 81 | endif() 82 | if(NOT OPENCL_LIBRARIES) 83 | message(STATUS "Could NOT find OpenCL library, install it or set OPENCL_ROOT") 84 | endif() 85 | 86 | # Determines whether or not OpenCL was found 87 | include(FindPackageHandleStandardArgs) 88 | find_package_handle_standard_args(OpenCL DEFAULT_MSG OPENCL_INCLUDE_DIRS OPENCL_LIBRARIES) 89 | 90 | # ================================================================================================== 91 | -------------------------------------------------------------------------------- /doc/api.md: 
-------------------------------------------------------------------------------- 1 | CLCudaAPI: API reference 2 | ================ 3 | 4 | This file describes the high-level API for both the CUDA and OpenCL back-end of the CLCudaAPI headers. On top of the described API, each class has a constructor which takes the regular OpenCL or CUDA data-type and transforms it into a CLCudaAPI class. Furthermore, each class also implements a `()` operator which returns the regular OpenCL or CUDA data-type. 5 | 6 | 7 | CLCudaAPI::Event 8 | ------------- 9 | 10 | Constructor(s): 11 | 12 | * `Event()`: 13 | Creates a new event, to be used for example when timing kernels. 14 | 15 | Public method(s): 16 | 17 | * `void WaitForCompletion() const`: 18 | Waits for completion of an event (OpenCL) or does nothing (CUDA). 19 | 20 | * `float GetElapsedTime() const`: 21 | Retrieves the elapsed time in milliseconds of the last recorded event (e.g. a device kernel). This method first makes sure that the last event is finished before computing the elapsed time. 22 | 23 | 24 | CLCudaAPI::Platform 25 | ------------- 26 | 27 | Constructor(s): 28 | 29 | * `Platform(const size_t platform_id)`: 30 | When using the OpenCL back-end, this initializes a new OpenCL platform (e.g. AMD SDK, Intel SDK, NVIDIA SDK) specified by the integer `platform_id`. When using the CUDA back-end, this initializes the CUDA driver API. The `platform_id` argument is ignored: there is only one platform. 31 | 32 | Public method(s): 33 | 34 | * `std::string Name() const`: 35 | Retrieves the name of the platform. 36 | 37 | * `std::string Vendor() const`: 38 | Retrieves the name of the vendor of the platform. 39 | 40 | * `std::string Version() const`: 41 | Retrieves which version of an OpenCL platform is used (OpenCL back-end) or which CUDA driver is used (CUDA back-end). 42 | 43 | * `size_t NumDevices() const`: 44 | Retrieves the number of devices on this platform. 
45 | 46 | Non-member function(s): 47 | 48 | * `std::vector GetAllPlatforms()`: 49 | Retrieves a vector containing all available platforms. 50 | 51 | 52 | CLCudaAPI::Device 53 | ------------- 54 | 55 | Constructor(s): 56 | 57 | * `Device(const Platform &platform, const size_t device_id)`: 58 | Initializes a new OpenCL or CUDA device on the specified platform. The `device_id` defines which device should be selected. 59 | 60 | Public method(s): 61 | 62 | * `RawPlatformID PlatformID() const`: 63 | Retrieves the raw `cl_platform_id` ID of the platform used (OpenCL back-end) or a 0 `size_t` in case of the CUDA back-end. 64 | 65 | * `std::string Version() const`: 66 | Retrieves which version of the OpenCL standard is supported (OpenCL back-end) or which CUDA driver is used (CUDA back-end). 67 | 68 | * `size_t VersionNumber() const`: 69 | The same as the `Version()` method, but without text, just the numeric value. 70 | 71 | * `std::string Vendor() const`: 72 | Retrieves the name of the vendor of the device. 73 | 74 | * `std::string Name() const`: 75 | Retrieves the name of the device. 76 | 77 | * `std::string Type() const`: 78 | Retrieves the type of the devices. Possible return values are 'CPU', 'GPU', 'accelerator', or 'default'. 79 | 80 | * `size_t MaxWorkGroupSize() const`: 81 | Retrieves the maximum total number of threads in an OpenCL work-group or CUDA thread-block. 82 | 83 | * `size_t MaxWorkItemDimensions() const`: 84 | Retrieves the maximum number of dimensions (e.g. 2D or 3D) in an OpenCL work-group or CUDA thread-block. 85 | 86 | * `unsigned long LocalMemSize() const`: 87 | Retrieves the maximum amount of on-chip scratchpad memory ('local memory') available to a single OpenCL work-group or CUDA thread-block. 88 | 89 | * `std::string Capabilities() const`: 90 | In case of the OpenCL back-end, this returns a list of the OpenCL extensions supported. For CUDA, this returns the device capability (e.g. SM 3.5). 
91 | 92 | * `bool HasExtension(const std::string &extension) const`: 93 | In case of the OpenCL back-end, queries whether a certain extension is present (as reported by `Capabilities()`). For CUDA, this always returns false. 94 | 95 | * `bool SupportsFP64() const`: 96 | Returns whether or not double-precision floating-point 64-bit is supported by the device. 97 | 98 | * `bool SupportsFP16() const`: 99 | Returns whether or not half-precision floating-point 16-bit is supported by the device. 100 | 101 | * `size_t CoreClock() const`: 102 | Retrieves the device's core clock frequency in MHz. 103 | 104 | * `size_t ComputeUnits() const`: 105 | Retrieves the number of compute units (OpenCL terminology) or multi-processors (CUDA terminology) in the device. 106 | 107 | * `unsigned long MemorySize() const`: 108 | Retrieves the total global memory size. 109 | 110 | * `unsigned long MaxAllocSize() const`: 111 | Retrieves the maximum amount of allocatable global memory per allocation. 112 | 113 | * `size_t MemoryClock() const`: 114 | Retrieves the device's memory clock frequency in MHz (CUDA back-end) or 0 (OpenCL back-end). 115 | 116 | * `size_t MemoryBusWidth() const`: 117 | Retrieves the device's memory bus-width in bits (CUDA back-end) or 0 (OpenCL back-end). 118 | 119 | * `bool IsLocalMemoryValid(const size_t local_mem_usage) const`: 120 | Given a requested amount of local on-chip scratchpad memory, this method returns whether or not this is a valid configuration for this particular device. 121 | 122 | * `bool IsThreadConfigValid(const std::vector &local) const`: 123 | Given a requested OpenCL work-group or CUDA thread-block configuration `local`, this method returns whether or not this is a valid configuration for this particular device. 124 | 125 | * `bool IsCPU() const`: 126 | Determines whether this device is of the CPU type. 127 | 128 | * `bool IsGPU() const`: 129 | Determines whether this device is of the GPU type. 
130 | 131 | * `bool IsAMD() const`: 132 | Determines whether this device is of the AMD brand. 133 | 134 | * `bool IsNVIDIA() const`: 135 | Determines whether this device is of the NVIDIA brand. 136 | 137 | * `bool IsIntel() const`: 138 | Determines whether this device is of the Intel brand. 139 | 140 | * `bool IsARM() const`: 141 | Determines whether this device is of the ARM brand. 142 | 143 | * `std::string AMDBoardName() const`: 144 | Returns the value of `CL_DEVICE_BOARD_NAME_AMD` if present. For the CUDA back-end, this always returns an empty string. 145 | 146 | * `std::string NVIDIAComputeCapability() const`: 147 | Returns the compute capability of an NVIDIA GPU, e.g. SM3.5. For the CUDA back-end, this returns the same as the `Capabilities()` method. 148 | 149 | 150 | CLCudaAPI::Context 151 | ------------- 152 | 153 | Constructor(s): 154 | 155 | * `Context(const Device &device)`: 156 | Initializes a new context on a given device. On top of this context, CLCudaAPI can create new programs, queues and buffers. 157 | 158 | 159 | CLCudaAPI::Program 160 | ------------- 161 | 162 | Constructor(s): 163 | 164 | * `Program(const Context &context, std::string source)`: 165 | Creates a new OpenCL or CUDA program on a given context. A program is a collection of one or more device kernels which form a single compilation unit together. The device-code is passed as a string. Such a string can for example be generated, hard-coded, or read from file at run-time. If passed as an r-value (e.g. using `std::move`), the device-code string is moved instead of copied into the class' member variable. 166 | 167 | * `Program(const Device &device, const Context &context, const std::string& binary)`: 168 | As above, but now the program is constructed based on an already compiled IR or binary of the device kernels. This requires a context corresponding to the binary. This constructor for OpenCL is based on the `clCreateProgramWithBinary` function. 
169 | 170 | Public method(s): 171 | 172 | * `void Build(const Device &device, std::vector &options)`: 173 | This method invokes the OpenCL or CUDA compiler to build the program at run-time for a specific target device. Depending on the back-end, specific options can be passed to the compiler in the form of the `options` vector. Compilation errors generated by the run-time compiler result in an `std::runtime_error` exception, which can be caught. 174 | 175 | * `std::string GetBuildInfo(const Device &device) const`: 176 | Retrieves all compiler warnings and errors generated by the build process. 177 | 178 | * `std::string GetIR() const`: 179 | Retrieves the intermediate representation (IR) of the compiled program. When using the CUDA back-end, this returns the PTX-code. For the OpenCL back-end, this returns either an IR (e.g. PTX) or a binary. This is different per OpenCL implementation. 180 | 181 | CLCudaAPI::Queue 182 | ------------- 183 | 184 | Constructor(s): 185 | 186 | * `Queue(const Context &context, const Device &device)`: 187 | Creates a new queue to enqueue kernel launches and device memory operations. This is analogous to an OpenCL command queue or a CUDA stream. 188 | 189 | Public method(s): 190 | 191 | * `void Finish(Event &event) const` and `void Finish() const`: 192 | Completes all tasks in the queue. In the case of the CUDA back-end, the first form additionally synchronizes on the specified event. 193 | 194 | * `Context GetContext() const`: 195 | Retrieves the CUDA/OpenCL context associated with this queue. 196 | 197 | * `Device GetDevice() const`: 198 | Retrieves the CUDA/OpenCL device associated with this queue. 199 | 200 | 201 | template \ CLCudaAPI::BufferHost 202 | ------------- 203 | 204 | Constructor(s): 205 | 206 | * `BufferHost(const Context &, const size_t size)`: 207 | Initializes a new linear 1D memory buffer on the host of type T. This buffer is allocated with a fixed number of elements given by `size`. 
Note that the buffer's elements are not initialized. In the case of the CUDA back-end, this host buffer is implemented as page-locked memory. The OpenCL back-end uses a regular `std::vector` container. 208 | 209 | Public method(s): 210 | 211 | * `size_t GetSize() const`: 212 | Retrieves the allocated size in bytes. 213 | 214 | * Several `std::vector` methods: 215 | Adds some compatibility with `std::vector` by implementing the `size`, `begin`, `end`, `operator[]`, and `data` methods. 216 | 217 | 218 | template \ CLCudaAPI::Buffer 219 | ------------- 220 | 221 | Constants(s): 222 | 223 | * `enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned }` 224 | Defines the different access types for the buffers. Writing to a read-only buffer will throw an error, as will reading from a write-only buffer. A buffer which is of type `kNotOwned` will not be automatically freed afterwards. 225 | 226 | Constructor(s): 227 | 228 | * `Buffer(const Context &context, const BufferAccess access, const size_t size)`: 229 | Initializes a new linear 1D memory buffer on the device of type T. This buffer is allocated with a fixed number of elements given by `size`. Note that the buffer's elements are not initialized. The buffer can be read-only, write-only, read-write, or not-owned as specified by the `access` argument. 230 | 231 | * `Buffer(const Context &context, const size_t size)`: 232 | As above, but now defaults to read-write access. 233 | 234 | * `template Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end)`: 235 | Creates a new buffer based on data in a linear C++ container (such as `std::vector`). The size is determined by the difference between the end and start iterators. This method both creates a new buffer and writes data to it. It synchronises the queue before returning. 
236 | 237 | Public method(s): 238 | 239 | * `void ReadAsync(const Queue &queue, const size_t size, T* host) const` and 240 | `void ReadAsync(const Queue &queue, const size_t size, std::vector &host)` and 241 | `void ReadAsync(const Queue &queue, const size_t size, BufferHost &host)`: 242 | Copies `size` elements from the current device buffer to the target host buffer. The host buffer has to be pre-allocated with a size of at least `size` elements. This method is a-synchronous: it can return before the copy operation is completed. 243 | 244 | * `void Read(const Queue &queue, const size_t size, T* host) const` and 245 | `void Read(const Queue &queue, const size_t size, std::vector &host)` and 246 | `void Read(const Queue &queue, const size_t size, BufferHost &host)`: 247 | As above, but now completes the operation before returning. 248 | 249 | * `void WriteAsync(const Queue &queue, const size_t size, const T* host)` and 250 | `void WriteAsync(const Queue &queue, const size_t size, const std::vector &host)` and 251 | `void WriteAsync(const Queue &queue, const size_t size, const BufferHost &host)`: 252 | Copies `size` elements from a host buffer to the current device buffer. The device buffer has to be pre-allocated with a size of at least `size` elements. This method is a-synchronous: it can return before the copy operation is completed. 253 | 254 | * `void Write(const Queue &queue, const size_t size, const T* host)` and 255 | `void Write(const Queue &queue, const size_t size, const std::vector &host)` and 256 | `void Write(const Queue &queue, const size_t size, const BufferHost &host)`: 257 | As above, but now completes the operation before returning. 258 | 259 | * `void CopyToAsync(const Queue &queue, const size_t size, const Buffer &destination) const`: 260 | Copies `size` elements from the current device buffer to another device buffer given by `destination`. The destination buffer has to be pre-allocated with a size of at least `size` elements. 
This method is a-synchronous: it can return before the copy operation is completed. 261 | 262 | * `void CopyTo(const Queue &queue, const size_t size, const Buffer &destination) const`: 263 | As above, but now completes the operation before returning. 264 | 265 | * `size_t GetSize() const`: 266 | Retrieves the allocated size in bytes. 267 | 268 | 269 | CLCudaAPI::Kernel 270 | ------------- 271 | 272 | Constructor(s): 273 | 274 | * `Kernel(const Program &program, const std::string &name)`: 275 | Retrieves a new kernel from a compiled program. The kernel name is given as the string `name`. 276 | 277 | Public method(s): 278 | 279 | * `template void SetArgument(const size_t index, const T &value)`: 280 | Method to set a kernel argument (l-value or r-value). The argument `index` specifies the position in the list of kernel arguments. The argument `value` can also be a `CLCudaAPI::Buffer`. 281 | 282 | * `template void SetArguments(Args&... args)`: As above, but now sets all arguments in one go, starting at index 0. This overwrites any previous arguments (if any). The parameter pack `args` takes any number of arguments of different types, including `CLCudaAPI::Buffer`. 283 | 284 | * `unsigned long LocalMemUsage(const Device &device) const`: 285 | Retrieves the amount of on-chip scratchpad memory (local memory in OpenCL, shared memory in CUDA) required by this specific kernel. 286 | 287 | * `std::string GetFunctionName() const `: 288 | Retrieves the name of the kernel (OpenCL only). 289 | 290 | * `Launch(const Queue &queue, const std::vector &global, const std::vector &local, Event &event)`: 291 | Launches a kernel onto the specified queue. This kernel launch is a-synchronous: this method can return before the device kernel is completed. The total number of threads launched is equal to the `global` vector; the number of threads per OpenCL work-group or CUDA thread-block is given by the `local` vector. The elapsed time is recorded into the `event` argument. 
292 | 293 | * `Launch(const Queue &queue, const std::vector &global, const std::vector &local, Event &event, std::vector& waitForEvents)`: As above, but now this kernel is only launched after the other specified events have finished (OpenCL only). If `local` is empty, the kernel-size is determined automatically (OpenCL only). 294 | 295 | -------------------------------------------------------------------------------- /include/clpp11.h: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API 11 | // calls. The main benefits are increased abstraction, automatic memory management, and portability. 12 | // Portability here means that a similar header exists for CUDA with the same classes and 13 | // interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change. 14 | // 15 | // This is version 9.0 of CLCudaAPI. 16 | // 17 | // ================================================================================================= 18 | // 19 | // Copyright 2015 SURFsara 20 | // 21 | // Licensed under the Apache License, Version 2.0 (the "License"); 22 | // you may not use this file except in compliance with the License. 
23 | // You may obtain a copy of the License at 24 | // 25 | // http://www.apache.org/licenses/LICENSE-2.0 26 | // 27 | // Unless required by applicable law or agreed to in writing, software 28 | // distributed under the License is distributed on an "AS IS" BASIS, 29 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 30 | // See the License for the specific language governing permissions and 31 | // limitations under the License. 32 | // 33 | // ================================================================================================= 34 | 35 | #ifndef CLCUDAAPI_CLPP11_H_ 36 | #define CLCUDAAPI_CLPP11_H_ 37 | 38 | // C++ 39 | #include // std::copy 40 | #include // std::string 41 | #include // std::vector 42 | #include // std::shared_ptr 43 | #include // std::accumulate 44 | #include // std::strlen 45 | 46 | // OpenCL 47 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings 48 | #if defined(__APPLE__) || defined(__MACOSX) 49 | #include 50 | #else 51 | #include 52 | #endif 53 | 54 | // Exception classes 55 | #include "cxpp11_common.hpp" 56 | 57 | namespace CLCudaAPI { 58 | // ================================================================================================= 59 | 60 | // Represents a runtime error returned by an OpenCL API function 61 | class CLCudaAPIError : public ErrorCode { 62 | public: 63 | explicit CLCudaAPIError(cl_int status, const std::string &where): 64 | ErrorCode(status, where, "OpenCL error: " + where + ": " + 65 | std::to_string(static_cast(status))) { 66 | } 67 | 68 | static void Check(const cl_int status, const std::string &where) { 69 | if (status != CL_SUCCESS) { 70 | throw CLCudaAPIError(status, where); 71 | } 72 | } 73 | 74 | static void CheckDtor(const cl_int status, const std::string &where) { 75 | if (status != CL_SUCCESS) { 76 | fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPIError(status, where).what()); 77 | } 78 | } 79 | }; 80 | 81 | // Exception returned when 
building a program 82 | using CLCudaAPIBuildError = CLCudaAPIError; 83 | 84 | // ================================================================================================= 85 | 86 | // Error occurred in OpenCL 87 | #define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call)) 88 | 89 | // Error occurred in OpenCL (no-exception version for destructors) 90 | #define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call)) 91 | 92 | // ================================================================================================= 93 | 94 | // C++11 version of 'cl_event' 95 | class Event { 96 | public: 97 | 98 | // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere 99 | explicit Event(const cl_event event): 100 | event_(new cl_event) { 101 | *event_ = event; 102 | } 103 | 104 | // Regular constructor with memory management 105 | explicit Event(): 106 | event_(new cl_event, [](cl_event* e) { 107 | if (*e) { CheckErrorDtor(clReleaseEvent(*e)); } 108 | delete e; 109 | }) { 110 | *event_ = nullptr; 111 | } 112 | 113 | // Waits for completion of this event 114 | void WaitForCompletion() const { 115 | CheckError(clWaitForEvents(1, &(*event_))); 116 | } 117 | 118 | // Retrieves the elapsed time of the last recorded event. 119 | // (Note that there is a bug in Apple's OpenCL implementation of the 'clGetEventProfilingInfo' function: 120 | // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx) 121 | // However, in our case the reply size is fixed to be cl_ulong, so we are not affected. 
122 | float GetElapsedTime() const { 123 | WaitForCompletion(); 124 | const auto bytes = sizeof(cl_ulong); 125 | auto time_start = cl_ulong{0}; 126 | CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr)); 127 | auto time_end = cl_ulong{0}; 128 | CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr)); 129 | return static_cast(time_end - time_start) * 1.0e-6f; 130 | } 131 | 132 | // Accessor to the private data-member 133 | cl_event& operator()() { return *event_; } 134 | const cl_event& operator()() const { return *event_; } 135 | cl_event* pointer() { return &(*event_); } 136 | const cl_event* pointer() const { return &(*event_); } 137 | private: 138 | std::shared_ptr event_; 139 | }; 140 | 141 | // Pointer to an OpenCL event 142 | using EventPointer = cl_event*; 143 | 144 | // ================================================================================================= 145 | 146 | // Raw platform ID type 147 | using RawPlatformID = cl_platform_id; 148 | 149 | // C++11 version of 'cl_platform_id' 150 | class Platform { 151 | public: 152 | 153 | // Constructor based on the regular OpenCL data-type 154 | explicit Platform(const cl_platform_id platform): platform_(platform) { } 155 | 156 | // Initializes the platform 157 | explicit Platform(const size_t platform_id) { 158 | auto num_platforms = cl_uint{0}; 159 | CheckError(clGetPlatformIDs(0, nullptr, &num_platforms)); 160 | if (num_platforms == 0) { 161 | throw RuntimeError("Platform: no platforms found"); 162 | } 163 | if (platform_id >= num_platforms) { 164 | throw RuntimeError("Platform: invalid platform ID "+std::to_string(platform_id)); 165 | } 166 | auto platforms = std::vector(num_platforms); 167 | CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr)); 168 | platform_ = platforms[platform_id]; 169 | } 170 | 171 | // Methods to retrieve platform information 172 | std::string Name() const { return 
GetInfoString(CL_PLATFORM_NAME); } 173 | std::string Vendor() const { return GetInfoString(CL_PLATFORM_VENDOR); } 174 | std::string Version() const { return GetInfoString(CL_PLATFORM_VERSION); } 175 | 176 | // Returns the number of devices on this platform 177 | size_t NumDevices() const { 178 | auto result = cl_uint{0}; 179 | CheckError(clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, nullptr, &result)); 180 | return static_cast(result); 181 | } 182 | 183 | // Accessor to the private data-member 184 | const RawPlatformID& operator()() const { return platform_; } 185 | private: 186 | cl_platform_id platform_; 187 | 188 | // Private helper functions 189 | std::string GetInfoString(const cl_device_info info) const { 190 | auto bytes = size_t{0}; 191 | CheckError(clGetPlatformInfo(platform_, info, 0, nullptr, &bytes)); 192 | auto result = std::string{}; 193 | result.resize(bytes); 194 | CheckError(clGetPlatformInfo(platform_, info, bytes, &result[0], nullptr)); 195 | result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters 196 | return result; 197 | } 198 | }; 199 | 200 | // Retrieves a vector with all platforms 201 | inline std::vector GetAllPlatforms() { 202 | auto num_platforms = cl_uint{0}; 203 | CheckError(clGetPlatformIDs(0, nullptr, &num_platforms)); 204 | auto all_platforms = std::vector(); 205 | for (size_t platform_id = 0; platform_id < static_cast(num_platforms); ++platform_id) { 206 | all_platforms.push_back(Platform(platform_id)); 207 | } 208 | return all_platforms; 209 | } 210 | 211 | // ================================================================================================= 212 | 213 | // Raw device ID type 214 | using RawDeviceID = cl_device_id; 215 | 216 | // C++11 version of 'cl_device_id' 217 | class Device { 218 | public: 219 | 220 | // Constructor based on the regular OpenCL data-type 221 | explicit Device(const cl_device_id device): device_(device) { } 222 | 223 | // Initialize the device. 
Note that this constructor can throw exceptions! 224 | explicit Device(const Platform &platform, const size_t device_id) { 225 | auto num_devices = platform.NumDevices(); 226 | if (num_devices == 0) { 227 | throw RuntimeError("Device: no devices found"); 228 | } 229 | if (device_id >= num_devices) { 230 | throw RuntimeError("Device: invalid device ID "+std::to_string(device_id)); 231 | } 232 | 233 | auto devices = std::vector(num_devices); 234 | CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast(num_devices), 235 | devices.data(), nullptr)); 236 | device_ = devices[device_id]; 237 | } 238 | 239 | // Methods to retrieve device information 240 | RawPlatformID PlatformID() const { return GetInfo(CL_DEVICE_PLATFORM); } 241 | std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); } 242 | size_t VersionNumber() const 243 | { 244 | std::string version_string = Version().substr(7); 245 | // Space separates the end of the OpenCL version number from the beginning of the 246 | // vendor-specific information. 
247 | size_t next_whitespace = version_string.find(' '); 248 | size_t version = (size_t) (100.0 * std::stod(version_string.substr(0, next_whitespace))); 249 | return version; 250 | } 251 | std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); } 252 | std::string Name() const { return GetInfoString(CL_DEVICE_NAME); } 253 | std::string Type() const { 254 | auto type = GetInfo(CL_DEVICE_TYPE); 255 | switch(type) { 256 | case CL_DEVICE_TYPE_CPU: return "CPU"; 257 | case CL_DEVICE_TYPE_GPU: return "GPU"; 258 | case CL_DEVICE_TYPE_ACCELERATOR: return "accelerator"; 259 | default: return "default"; 260 | } 261 | } 262 | size_t MaxWorkGroupSize() const { return GetInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE); } 263 | size_t MaxWorkItemDimensions() const { 264 | return static_cast(GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS)); 265 | } 266 | std::vector MaxWorkItemSizes() const { 267 | return GetInfoVector(CL_DEVICE_MAX_WORK_ITEM_SIZES); 268 | } 269 | unsigned long LocalMemSize() const { 270 | return static_cast(GetInfo(CL_DEVICE_LOCAL_MEM_SIZE)); 271 | } 272 | 273 | std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); } 274 | bool HasExtension(const std::string &extension) const { 275 | const auto extensions = Capabilities(); 276 | return extensions.find(extension) != std::string::npos; 277 | } 278 | bool SupportsFP64() const { 279 | return HasExtension("cl_khr_fp64"); 280 | } 281 | bool SupportsFP16() const { 282 | if (Name() == "Mali-T628") { return true; } // supports fp16 but not cl_khr_fp16 officially 283 | return HasExtension("cl_khr_fp16"); 284 | } 285 | 286 | size_t CoreClock() const { 287 | return static_cast(GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY)); 288 | } 289 | size_t ComputeUnits() const { 290 | return static_cast(GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS)); 291 | } 292 | unsigned long MemorySize() const { 293 | return static_cast(GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE)); 294 | } 295 | unsigned long MaxAllocSize() const { 296 | return 
static_cast(GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE)); 297 | } 298 | size_t MemoryClock() const { return 0; } // Not exposed in OpenCL 299 | size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL 300 | 301 | // Configuration-validity checks 302 | bool IsLocalMemoryValid(const cl_ulong local_mem_usage) const { 303 | return (local_mem_usage <= LocalMemSize()); 304 | } 305 | bool IsThreadConfigValid(const std::vector &local) const { 306 | auto local_size = size_t{1}; 307 | for (const auto &item: local) { local_size *= item; } 308 | for (auto i=size_t{0}; i MaxWorkItemSizes()[i]) { return false; } 310 | } 311 | if (local_size > MaxWorkGroupSize()) { return false; } 312 | if (local.size() > MaxWorkItemDimensions()) { return false; } 313 | return true; 314 | } 315 | 316 | // Query for a specific type of device or brand 317 | bool IsCPU() const { return Type() == "CPU"; } 318 | bool IsGPU() const { return Type() == "GPU"; } 319 | bool IsAMD() const { return Vendor() == "AMD" || 320 | Vendor() == "Advanced Micro Devices, Inc." 
|| 321 | Vendor() == "AuthenticAMD"; } 322 | bool IsNVIDIA() const { return Vendor() == "NVIDIA" || 323 | Vendor() == "NVIDIA Corporation"; } 324 | bool IsIntel() const { return Vendor() == "INTEL" || 325 | Vendor() == "Intel" || 326 | Vendor() == "GenuineIntel" || 327 | Vendor() == "Intel(R) Corporation"; } 328 | bool IsARM() const { return Vendor() == "ARM"; } 329 | 330 | // Platform specific extensions 331 | std::string AMDBoardName() const { // check for 'cl_amd_device_attribute_query' first 332 | #ifndef CL_DEVICE_BOARD_NAME_AMD 333 | #define CL_DEVICE_BOARD_NAME_AMD 0x4038 334 | #endif 335 | return GetInfoString(CL_DEVICE_BOARD_NAME_AMD); 336 | } 337 | std::string NVIDIAComputeCapability() const { // check for 'cl_nv_device_attribute_query' first 338 | #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 339 | #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 340 | #endif 341 | #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 342 | #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 343 | #endif 344 | return std::string{"SM"} + std::to_string(GetInfo(CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV)) + 345 | std::string{"."} + std::to_string(GetInfo(CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV)); 346 | } 347 | 348 | // Accessor to the private data-member 349 | const RawDeviceID& operator()() const { return device_; } 350 | private: 351 | cl_device_id device_; 352 | 353 | // Private helper functions 354 | template 355 | T GetInfo(const cl_device_info info) const { 356 | auto bytes = size_t{0}; 357 | CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); 358 | auto result = T(0); 359 | CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); 360 | return result; 361 | } 362 | template 363 | std::vector GetInfoVector(const cl_device_info info) const { 364 | auto bytes = size_t{0}; 365 | CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); 366 | auto result = std::vector(bytes/sizeof(T)); 367 | CheckError(clGetDeviceInfo(device_, info, bytes, 
result.data(), nullptr)); 368 | return result; 369 | } 370 | std::string GetInfoString(const cl_device_info info) const { 371 | auto bytes = size_t{0}; 372 | CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); 373 | auto result = std::string{}; 374 | result.resize(bytes); 375 | CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr)); 376 | result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters 377 | return result; 378 | } 379 | }; 380 | 381 | // ================================================================================================= 382 | 383 | // Raw context type 384 | using RawContext = cl_context; 385 | 386 | // C++11 version of 'cl_context' 387 | class Context { 388 | public: 389 | 390 | // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere 391 | explicit Context(const cl_context context): 392 | context_(new cl_context) { 393 | *context_ = context; 394 | } 395 | 396 | // Regular constructor with memory management 397 | explicit Context(const Device &device): 398 | context_(new cl_context, [](cl_context* c) { 399 | if (*c) { CheckErrorDtor(clReleaseContext(*c)); } 400 | delete c; 401 | }) { 402 | auto status = CL_SUCCESS; 403 | const cl_device_id dev = device(); 404 | *context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status); 405 | CLCudaAPIError::Check(status, "clCreateContext"); 406 | } 407 | 408 | // Accessor to the private data-member 409 | const RawContext& operator()() const { return *context_; } 410 | RawContext* pointer() const { return &(*context_); } 411 | private: 412 | std::shared_ptr context_; 413 | }; 414 | 415 | // Pointer to an OpenCL context 416 | using ContextPointer = cl_context*; 417 | 418 | // ================================================================================================= 419 | 420 | // C++11 version of 'cl_program'. 
421 | class Program { 422 | public: 423 | Program() = default; 424 | 425 | // Source-based constructor with memory management 426 | explicit Program(const Context &context, const std::string &source): 427 | program_(new cl_program, [](cl_program* p) { 428 | if (*p) { CheckErrorDtor(clReleaseProgram(*p)); } 429 | delete p; 430 | }) { 431 | const char *source_ptr = &source[0]; 432 | const auto length = source.length(); 433 | auto status = CL_SUCCESS; 434 | *program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status); 435 | CLCudaAPIError::Check(status, "clCreateProgramWithSource"); 436 | } 437 | 438 | // Binary-based constructor with memory management 439 | explicit Program(const Device &device, const Context &context, const std::string &binary): 440 | program_(new cl_program, [](cl_program* p) { 441 | if (*p) { CheckErrorDtor(clReleaseProgram(*p)); } 442 | delete p; 443 | }) { 444 | const char *binary_ptr = &binary[0]; 445 | const auto length = binary.length(); 446 | auto status1 = CL_SUCCESS; 447 | auto status2 = CL_SUCCESS; 448 | const auto dev = device(); 449 | *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length, 450 | reinterpret_cast(&binary_ptr), 451 | &status1, &status2); 452 | CLCudaAPIError::Check(status1, "clCreateProgramWithBinary (binary status)"); 453 | CLCudaAPIError::Check(status2, "clCreateProgramWithBinary"); 454 | } 455 | 456 | // Compiles the device program and checks whether or not there are any warnings/errors 457 | void Build(const Device &device, std::vector &options) { 458 | options.push_back("-cl-std=CL1.1"); 459 | auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "}); 460 | const cl_device_id dev = device(); 461 | CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr)); 462 | } 463 | 464 | // Confirms whether a certain status code is an actual compilation error or warning 465 | bool StatusIsCompilationWarningOrError(const cl_int status) 
const { 466 | return (status == CL_BUILD_PROGRAM_FAILURE); 467 | } 468 | 469 | // Retrieves the warning/error message from the compiler (if any) 470 | std::string GetBuildInfo(const Device &device) const { 471 | auto bytes = size_t{0}; 472 | auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG}; 473 | CheckError(clGetProgramBuildInfo(*program_, device(), query, 0, nullptr, &bytes)); 474 | auto result = std::string{}; 475 | result.resize(bytes); 476 | CheckError(clGetProgramBuildInfo(*program_, device(), query, bytes, &result[0], nullptr)); 477 | return result; 478 | } 479 | 480 | // Retrieves a binary or an intermediate representation of the compiled program 481 | std::string GetIR() const { 482 | auto bytes = size_t{0}; 483 | CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr)); 484 | auto result = std::string{}; 485 | result.resize(bytes); 486 | auto result_ptr = result.data(); 487 | CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr)); 488 | return result; 489 | } 490 | 491 | // Accessor to the private data-member 492 | const cl_program& operator()() const { return *program_; } 493 | private: 494 | std::shared_ptr program_; 495 | }; 496 | 497 | // ================================================================================================= 498 | 499 | // Raw command-queue type 500 | using RawCommandQueue = cl_command_queue; 501 | 502 | // C++11 version of 'cl_command_queue' 503 | class Queue { 504 | public: 505 | 506 | // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere 507 | explicit Queue(const cl_command_queue queue): 508 | queue_(new cl_command_queue) { 509 | *queue_ = queue; 510 | } 511 | 512 | // Regular constructor with memory management 513 | explicit Queue(const Context &context, const Device &device): 514 | queue_(new cl_command_queue, [](cl_command_queue* s) { 515 | if (*s) { 
CheckErrorDtor(clReleaseCommandQueue(*s)); } 516 | delete s; 517 | }) { 518 | auto status = CL_SUCCESS; 519 | *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status); 520 | CLCudaAPIError::Check(status, "clCreateCommandQueue"); 521 | } 522 | 523 | // Synchronizes the queue 524 | void Finish(Event &) const { 525 | Finish(); 526 | } 527 | void Finish() const { 528 | CheckError(clFinish(*queue_)); 529 | } 530 | 531 | // Retrieves the corresponding context or device 532 | Context GetContext() const { 533 | auto bytes = size_t{0}; 534 | CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, 0, nullptr, &bytes)); 535 | cl_context result; 536 | CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, bytes, &result, nullptr)); 537 | return Context(result); 538 | } 539 | Device GetDevice() const { 540 | auto bytes = size_t{0}; 541 | CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, 0, nullptr, &bytes)); 542 | cl_device_id result; 543 | CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, bytes, &result, nullptr)); 544 | return Device(result); 545 | } 546 | 547 | // Accessor to the private data-member 548 | const RawCommandQueue& operator()() const { return *queue_; } 549 | private: 550 | std::shared_ptr queue_; 551 | }; 552 | 553 | // ================================================================================================= 554 | 555 | // C++11 version of host memory 556 | template 557 | class BufferHost { 558 | public: 559 | 560 | // Regular constructor with memory management 561 | explicit BufferHost(const Context &, const size_t size): 562 | buffer_(new std::vector(size)) { 563 | } 564 | 565 | // Retrieves the actual allocated size in bytes 566 | size_t GetSize() const { 567 | return buffer_->size()*sizeof(T); 568 | } 569 | 570 | // Compatibility with std::vector 571 | size_t size() const { return buffer_->size(); } 572 | T* begin() { return &(*buffer_)[0]; } 573 | T* end() { return 
&(*buffer_)[buffer_->size()-1]; } 574 | T& operator[](const size_t i) { return (*buffer_)[i]; } 575 | T* data() { return buffer_->data(); } 576 | const T* data() const { return buffer_->data(); } 577 | 578 | private: 579 | std::shared_ptr> buffer_; 580 | }; 581 | 582 | // ================================================================================================= 583 | 584 | // Enumeration of buffer access types 585 | enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned }; 586 | 587 | // C++11 version of 'cl_mem' 588 | template 589 | class Buffer { 590 | public: 591 | 592 | // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere 593 | explicit Buffer(const cl_mem buffer): 594 | buffer_(new cl_mem), 595 | access_(BufferAccess::kNotOwned) { 596 | *buffer_ = buffer; 597 | } 598 | 599 | // Regular constructor with memory management. If this class does not own the buffer object, then 600 | // the memory will not be freed automatically afterwards. 
601 | explicit Buffer(const Context &context, const BufferAccess access, const size_t size): 602 | buffer_(new cl_mem, [access](cl_mem* m) { 603 | if (access != BufferAccess::kNotOwned) { CheckError(clReleaseMemObject(*m)); } 604 | delete m; 605 | }), 606 | access_(access) { 607 | auto flags = cl_mem_flags{CL_MEM_READ_WRITE}; 608 | if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; } 609 | if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; } 610 | auto status = CL_SUCCESS; 611 | *buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status); 612 | CLCudaAPIError::Check(status, "clCreateBuffer"); 613 | } 614 | 615 | // As above, but now with read/write access as a default 616 | explicit Buffer(const Context &context, const size_t size): 617 | Buffer(context, BufferAccess::kReadWrite, size) { 618 | } 619 | 620 | // Constructs a new buffer based on an existing host-container 621 | template 622 | explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end): 623 | Buffer(context, BufferAccess::kReadWrite, static_cast(end - start)) { 624 | auto size = static_cast(end - start); 625 | auto pointer = &*start; 626 | CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), pointer, 0, 627 | nullptr, nullptr)); 628 | queue.Finish(); 629 | } 630 | 631 | // Copies from device to host: reading the device buffer a-synchronously 632 | void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { 633 | if (access_ == BufferAccess::kWriteOnly) { 634 | throw LogicError("Buffer: reading from a write-only buffer"); 635 | } 636 | CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T), 637 | host, 0, nullptr, nullptr)); 638 | } 639 | void ReadAsync(const Queue &queue, const size_t size, std::vector &host, 640 | const size_t offset = 0) const { 641 | if (host.size() < size) { 642 | throw LogicError("Buffer: target 
host buffer is too small"); 643 | } 644 | ReadAsync(queue, size, host.data(), offset); 645 | } 646 | void ReadAsync(const Queue &queue, const size_t size, BufferHost &host, 647 | const size_t offset = 0) const { 648 | if (host.size() < size) { 649 | throw LogicError("Buffer: target host buffer is too small"); 650 | } 651 | ReadAsync(queue, size, host.data(), offset); 652 | } 653 | 654 | // Copies from device to host: reading the device buffer 655 | void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { 656 | ReadAsync(queue, size, host, offset); 657 | queue.Finish(); 658 | } 659 | void Read(const Queue &queue, const size_t size, std::vector &host, 660 | const size_t offset = 0) const { 661 | Read(queue, size, host.data(), offset); 662 | } 663 | void Read(const Queue &queue, const size_t size, BufferHost &host, 664 | const size_t offset = 0) const { 665 | Read(queue, size, host.data(), offset); 666 | } 667 | 668 | // Copies from host to device: writing the device buffer a-synchronously 669 | void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { 670 | if (GetSize() < (offset+size)*sizeof(T)) { 671 | throw LogicError("Buffer: target device buffer is too small"); 672 | } 673 | CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T), 674 | host, 0, nullptr, nullptr)); 675 | } 676 | void WriteAsync(const Queue &queue, const size_t size, const std::vector &host, 677 | const size_t offset = 0) { 678 | WriteAsync(queue, size, host.data(), offset); 679 | } 680 | void WriteAsync(const Queue &queue, const size_t size, const BufferHost &host, 681 | const size_t offset = 0) { 682 | WriteAsync(queue, size, host.data(), offset); 683 | } 684 | 685 | // Copies from host to device: writing the device buffer 686 | void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { 687 | WriteAsync(queue, size, host, offset); 688 | queue.Finish(); 
689 | } 690 | void Write(const Queue &queue, const size_t size, const std::vector &host, 691 | const size_t offset = 0) { 692 | Write(queue, size, host.data(), offset); 693 | } 694 | void Write(const Queue &queue, const size_t size, const BufferHost &host, 695 | const size_t offset = 0) { 696 | Write(queue, size, host.data(), offset); 697 | } 698 | 699 | // Copies the contents of this buffer into another device buffer 700 | void CopyToAsync(const Queue &queue, const size_t size, const Buffer &destination) const { 701 | CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0, 702 | nullptr, nullptr)); 703 | } 704 | void CopyTo(const Queue &queue, const size_t size, const Buffer &destination) const { 705 | CopyToAsync(queue, size, destination); 706 | queue.Finish(); 707 | } 708 | 709 | // Retrieves the actual allocated size in bytes 710 | size_t GetSize() const { 711 | const auto bytes = sizeof(size_t); 712 | auto result = size_t{0}; 713 | CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr)); 714 | return result; 715 | } 716 | 717 | // Accessor to the private data-member 718 | const cl_mem& operator()() const { return *buffer_; } 719 | private: 720 | std::shared_ptr buffer_; 721 | const BufferAccess access_; 722 | }; 723 | 724 | // ================================================================================================= 725 | 726 | // C++11 version of 'cl_kernel' 727 | class Kernel { 728 | public: 729 | 730 | // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere 731 | explicit Kernel(const cl_kernel kernel): 732 | kernel_(new cl_kernel) { 733 | *kernel_ = kernel; 734 | } 735 | 736 | // Regular constructor with memory management 737 | explicit Kernel(const Program &program, const std::string &name): 738 | kernel_(new cl_kernel, [](cl_kernel* k) { 739 | if (*k) { CheckErrorDtor(clReleaseKernel(*k)); } 740 | delete k; 741 | }) { 742 | auto status = CL_SUCCESS; 743 | 
*kernel_ = clCreateKernel(program(), name.c_str(), &status); 744 | CLCudaAPIError::Check(status, "clCreateKernel"); 745 | } 746 | 747 | // Sets a kernel argument at the indicated position 748 | template 749 | void SetArgument(const size_t index, const T &value) { 750 | CheckError(clSetKernelArg(*kernel_, static_cast(index), sizeof(T), &value)); 751 | } 752 | template 753 | void SetArgument(const size_t index, Buffer &value) { 754 | SetArgument(index, value()); 755 | } 756 | 757 | // Sets all arguments in one go using parameter packs. Note that this overwrites previously set 758 | // arguments using 'SetArgument' or 'SetArguments'. 759 | template 760 | void SetArguments(Args&... args) { 761 | SetArgumentsRecursive(0, args...); 762 | } 763 | 764 | // Retrieves the amount of local memory used per work-group for this kernel 765 | unsigned long LocalMemUsage(const Device &device) const { 766 | const auto bytes = sizeof(cl_ulong); 767 | auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE}; 768 | auto result = cl_ulong{0}; 769 | CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr)); 770 | return static_cast(result); 771 | } 772 | 773 | // Retrieves the name of the kernel 774 | std::string GetFunctionName() const { 775 | auto bytes = size_t{0}; 776 | CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes)); 777 | auto result = std::string{}; 778 | result.resize(bytes); 779 | CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, bytes, &result[0], nullptr)); 780 | return std::string{result.c_str()}; // Removes any trailing '\0'-characters 781 | } 782 | 783 | // Launches a kernel onto the specified queue 784 | void Launch(const Queue &queue, const std::vector &global, 785 | const std::vector &local, EventPointer event) { 786 | CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), 787 | nullptr, global.data(), local.data(), 788 | 0, nullptr, event)); 789 | } 790 | 791 | 
// As above, but with an event waiting list 792 | void Launch(const Queue &queue, const std::vector &global, 793 | const std::vector &local, EventPointer event, 794 | const std::vector &waitForEvents) { 795 | 796 | // Builds a plain version of the events waiting list 797 | auto waitForEventsPlain = std::vector(); 798 | for (auto &waitEvent : waitForEvents) { 799 | if (waitEvent()) { waitForEventsPlain.push_back(waitEvent()); } 800 | } 801 | 802 | // Launches the kernel while waiting for other events 803 | CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), 804 | nullptr, global.data(), !local.empty() ? local.data() : nullptr, 805 | static_cast(waitForEventsPlain.size()), 806 | !waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr, 807 | event)); 808 | } 809 | 810 | // Accessor to the private data-member 811 | const cl_kernel& operator()() const { return *kernel_; } 812 | private: 813 | std::shared_ptr kernel_; 814 | 815 | // Internal implementation for the recursive SetArguments function. 816 | template 817 | void SetArgumentsRecursive(const size_t index, T &first) { 818 | SetArgument(index, first); 819 | } 820 | template 821 | void SetArgumentsRecursive(const size_t index, T &first, Args&... args) { 822 | SetArgument(index, first); 823 | SetArgumentsRecursive(index+1, args...); 824 | } 825 | }; 826 | 827 | // ================================================================================================= 828 | } // namespace CLCudaAPI 829 | 830 | // CLCUDAAPI_CLPP11_H_ 831 | #endif 832 | -------------------------------------------------------------------------------- /include/cupp11.h: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. 
The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file implements a bunch of C++11 classes that act as wrappers around CUDA objects and API 11 | // calls. The main benefits are increased abstraction, automatic memory management, and portability. 12 | // Portability here means that a similar header exists for OpenCL with the same classes and 13 | // interfaces. In other words, moving from the CUDA API to the OpenCL API becomes a one-line change. 14 | // 15 | // This is version 9.0 of CLCudaAPI. 16 | // 17 | // ================================================================================================= 18 | // 19 | // Copyright 2015 SURFsara 20 | // 21 | // Licensed under the Apache License, Version 2.0 (the "License"); 22 | // you may not use this file except in compliance with the License. 23 | // You may obtain a copy of the License at 24 | // 25 | // http://www.apache.org/licenses/LICENSE-2.0 26 | // 27 | // Unless required by applicable law or agreed to in writing, software 28 | // distributed under the License is distributed on an "AS IS" BASIS, 29 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 30 | // See the License for the specific language governing permissions and 31 | // limitations under the License. 
32 | // 33 | // ================================================================================================= 34 | 35 | #ifndef CLCUDAAPI_CUPP11_H_ 36 | #define CLCUDAAPI_CUPP11_H_ 37 | 38 | // C++ 39 | #include // std::copy 40 | #include // std::string 41 | #include // std::vector 42 | #include // std::shared_ptr 43 | 44 | // CUDA 45 | #include // CUDA driver API 46 | #include // NVIDIA runtime compilation API 47 | 48 | // Exception classes 49 | #include "cxpp11_common.hpp" 50 | 51 | namespace CLCudaAPI { 52 | // ================================================================================================= 53 | 54 | // Max-length of strings 55 | constexpr auto kStringLength = 256; 56 | 57 | // ================================================================================================= 58 | 59 | // Represents a runtime error returned by a CUDA driver API function 60 | class CLCudaAPIError : public ErrorCode { 61 | public: 62 | explicit CLCudaAPIError(CUresult status, const std::string &where): 63 | ErrorCode(status, where, "CUDA error: " + where + ": " + 64 | GetErrorName(status) + " --> " + GetErrorString(status)) { 65 | } 66 | 67 | static void Check(const CUresult status, const std::string &where) { 68 | if (status != CUDA_SUCCESS) { 69 | throw CLCudaAPIError(status, where); 70 | } 71 | } 72 | 73 | static void CheckDtor(const CUresult status, const std::string &where) { 74 | if (status != CUDA_SUCCESS) { 75 | fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPIError(status, where).what()); 76 | } 77 | } 78 | 79 | private: 80 | std::string GetErrorName(CUresult status) const { 81 | const char* status_code; 82 | cuGetErrorName(status, &status_code); 83 | return std::string(status_code); 84 | } 85 | std::string GetErrorString(CUresult status) const { 86 | const char* status_string; 87 | cuGetErrorString(status, &status_string); 88 | return std::string(status_string); 89 | } 90 | }; 91 | 92 | // Represents a runtime error returned by a CUDA runtime 
compilation API function 93 | class CLCudaAPINVRTCError : public ErrorCode { 94 | public: 95 | explicit CLCudaAPINVRTCError(nvrtcResult status, const std::string &where): 96 | ErrorCode(status, where, "CUDA NVRTC error: " + where + ": " + GetErrorString(status)) { 97 | } 98 | 99 | static void Check(const nvrtcResult status, const std::string &where) { 100 | if (status != NVRTC_SUCCESS) { 101 | throw CLCudaAPINVRTCError(status, where); 102 | } 103 | } 104 | 105 | static void CheckDtor(const nvrtcResult status, const std::string &where) { 106 | if (status != NVRTC_SUCCESS) { 107 | fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPINVRTCError(status, where).what()); 108 | } 109 | } 110 | 111 | private: 112 | std::string GetErrorString(nvrtcResult status) const { 113 | const char* status_string = nvrtcGetErrorString(status); 114 | return std::string(status_string); 115 | } 116 | }; 117 | 118 | // Exception returned when building a program 119 | using CLCudaAPIBuildError = CLCudaAPINVRTCError; 120 | 121 | // ================================================================================================= 122 | 123 | // Error occurred in CUDA driver or runtime compilation API 124 | #define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call)) 125 | #define CheckErrorNVRTC(call) CLCudaAPINVRTCError::Check(call, CLCudaAPINVRTCError::TrimCallString(#call)) 126 | 127 | // Error occurred in CUDA driver or runtime compilation API (no-exception version for destructors) 128 | #define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call)) 129 | #define CheckErrorDtorNVRTC(call) CLCudaAPINVRTCError::CheckDtor(call, CLCudaAPINVRTCError::TrimCallString(#call)) 130 | 131 | // ================================================================================================= 132 | 133 | // C++11 version of two 'CUevent' pointers 134 | class Event { 135 | public: 136 | // Note that there is no constructor based on the 
regular CUDA data-type because of extra state 137 | 138 | // Regular constructor with memory management 139 | explicit Event(): 140 | start_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }), 141 | end_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }) { 142 | CheckError(cuEventCreate(start_.get(), CU_EVENT_DEFAULT)); 143 | CheckError(cuEventCreate(end_.get(), CU_EVENT_DEFAULT)); 144 | } 145 | 146 | // Waits for completion of this event (not implemented for CUDA) 147 | void WaitForCompletion() const { } 148 | 149 | // Retrieves the elapsed time of the last recorded event 150 | float GetElapsedTime() const { 151 | auto result = 0.0f; 152 | cuEventElapsedTime(&result, *start_, *end_); 153 | return result; 154 | } 155 | 156 | // Accessors to the private data-members 157 | const CUevent& start() const { return *start_; } 158 | const CUevent& end() const { return *end_; } 159 | Event* pointer() { return this; } 160 | private: 161 | std::shared_ptr start_; 162 | std::shared_ptr end_; 163 | }; 164 | 165 | // Pointer to a CUDA event 166 | using EventPointer = Event*; 167 | 168 | // ================================================================================================= 169 | 170 | // Raw platform ID type 171 | using RawPlatformID = size_t; 172 | 173 | // The CUDA platform: initializes the CUDA driver API 174 | class Platform { 175 | public: 176 | 177 | // Initializes the platform. Note that the platform ID variable is not actually used for CUDA. 
178 | explicit Platform(const size_t platform_id) : platform_id_(0) { 179 | if (platform_id != 0) { throw LogicError("CUDA back-end requires a platform ID of 0"); } 180 | CheckError(cuInit(0)); 181 | } 182 | 183 | // Methods to retrieve platform information 184 | std::string Name() const { return "CUDA"; } 185 | std::string Vendor() const { return "NVIDIA Corporation"; } 186 | std::string Version() const { 187 | auto result = 0; 188 | CheckError(cuDriverGetVersion(&result)); 189 | return "CUDA driver "+std::to_string(result); 190 | } 191 | 192 | // Returns the number of devices on this platform 193 | size_t NumDevices() const { 194 | auto result = 0; 195 | CheckError(cuDeviceGetCount(&result)); 196 | return static_cast(result); 197 | } 198 | 199 | // Accessor to the raw ID (which doesn't exist in the CUDA back-end, this is always just 0) 200 | const RawPlatformID& operator()() const { return platform_id_; } 201 | private: 202 | const size_t platform_id_; 203 | }; 204 | 205 | // Retrieves a vector with all platforms. Note that there is just one platform in CUDA. 
206 | inline std::vector GetAllPlatforms() { 207 | auto all_platforms = std::vector{ Platform(size_t{0}) }; 208 | return all_platforms; 209 | } 210 | 211 | // ================================================================================================= 212 | 213 | // Raw device ID type 214 | using RawDeviceID = CUdevice; 215 | 216 | // C++11 version of 'CUdevice' 217 | class Device { 218 | public: 219 | 220 | // Constructor based on the regular CUDA data-type 221 | explicit Device(const CUdevice device): device_(device) { } 222 | 223 | // Initialization 224 | explicit Device(const Platform &platform, const size_t device_id) { 225 | auto num_devices = platform.NumDevices(); 226 | if (num_devices == 0) { 227 | throw RuntimeError("Device: no devices found"); 228 | } 229 | if (device_id >= num_devices) { 230 | throw RuntimeError("Device: invalid device ID "+std::to_string(device_id)); 231 | } 232 | 233 | CheckError(cuDeviceGet(&device_, device_id)); 234 | } 235 | 236 | // Methods to retrieve device information 237 | RawPlatformID PlatformID() const { return 0; } 238 | std::string Version() const { 239 | auto result = 0; 240 | CheckError(cuDriverGetVersion(&result)); 241 | return "CUDA driver "+std::to_string(result); 242 | } 243 | size_t VersionNumber() const { 244 | auto result = 0; 245 | CheckError(cuDriverGetVersion(&result)); 246 | return static_cast(result); 247 | } 248 | std::string Vendor() const { return "NVIDIA Corporation"; } 249 | std::string Name() const { 250 | auto result = std::string{}; 251 | result.resize(kStringLength); 252 | CheckError(cuDeviceGetName(&result[0], result.size(), device_)); 253 | return result; 254 | } 255 | std::string Type() const { return "GPU"; } 256 | size_t MaxWorkGroupSize() const {return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); } 257 | size_t MaxWorkItemDimensions() const { return size_t{3}; } 258 | std::vector MaxWorkItemSizes() const { 259 | return std::vector{GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X), 260 | 
GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y), 261 | GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)}; 262 | } 263 | unsigned long LocalMemSize() const { 264 | return static_cast(GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)); 265 | } 266 | 267 | std::string Capabilities() const { 268 | const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); 269 | const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); 270 | return "SM"+std::to_string(major)+"."+std::to_string(minor); 271 | } 272 | bool HasExtension(const std::string &extension) const { return false; } 273 | bool SupportsFP64() const { return true; } 274 | bool SupportsFP16() const { 275 | const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); 276 | const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); 277 | if (major > 5) { return true; } // SM 6.x, 7.x and higher 278 | if (major == 5 && minor == 3) { return true; } // SM 5.3 279 | return false; 280 | } 281 | 282 | size_t CoreClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE); } 283 | size_t ComputeUnits() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); } 284 | unsigned long MemorySize() const { 285 | auto result = size_t{0}; 286 | CheckError(cuDeviceTotalMem(&result, device_)); 287 | return static_cast(result); 288 | } 289 | unsigned long MaxAllocSize() const { return MemorySize(); } 290 | size_t MemoryClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE); } 291 | size_t MemoryBusWidth() const { return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); } 292 | 293 | // Configuration-validity checks 294 | bool IsLocalMemoryValid(const size_t local_mem_usage) const { 295 | return (local_mem_usage <= LocalMemSize()); 296 | } 297 | bool IsThreadConfigValid(const std::vector &local) const { 298 | auto local_size = size_t{1}; 299 | for (const auto &item: local) { local_size *= item; } 300 | for (auto i=size_t{0}; i 
MaxWorkItemSizes()[i]) { return false; } 302 | } 303 | if (local_size > MaxWorkGroupSize()) { return false; } 304 | if (local.size() > MaxWorkItemDimensions()) { return false; } 305 | return true; 306 | } 307 | 308 | // Query for a specific type of device or brand 309 | bool IsCPU() const { return false; } 310 | bool IsGPU() const { return true; } 311 | bool IsAMD() const { return false; } 312 | bool IsNVIDIA() const { return true; } 313 | bool IsIntel() const { return false; } 314 | bool IsARM() const { return false; } 315 | 316 | // Platform specific extensions 317 | std::string AMDBoardName() const { return ""; } 318 | std::string NVIDIAComputeCapability() const { return Capabilities(); } 319 | 320 | // Accessor to the private data-member 321 | const RawDeviceID& operator()() const { return device_; } 322 | private: 323 | CUdevice device_; 324 | 325 | // Private helper function 326 | size_t GetInfo(const CUdevice_attribute info) const { 327 | auto result = 0; 328 | CheckError(cuDeviceGetAttribute(&result, info, device_)); 329 | return static_cast(result); 330 | } 331 | }; 332 | 333 | // ================================================================================================= 334 | 335 | // Raw context type 336 | using RawContext = CUcontext; 337 | 338 | // C++11 version of 'CUcontext' 339 | class Context { 340 | public: 341 | 342 | // Constructor based on the regular CUDA data-type: memory management is handled elsewhere 343 | explicit Context(const CUcontext context): 344 | context_(new CUcontext) { 345 | *context_ = context; 346 | } 347 | 348 | // Regular constructor with memory management 349 | explicit Context(const Device &device): 350 | context_(new CUcontext, [](CUcontext* c) { 351 | if (*c) { CheckErrorDtor(cuCtxDestroy(*c)); } 352 | delete c; 353 | }) { 354 | CheckError(cuCtxCreate(context_.get(), 0, device())); 355 | } 356 | 357 | // Accessor to the private data-member 358 | const RawContext& operator()() const { return *context_; } 359 | 
RawContext* pointer() const { return &(*context_); } 360 | private: 361 | std::shared_ptr context_; 362 | }; 363 | 364 | // Pointer to a raw CUDA context 365 | using ContextPointer = CUcontext*; 366 | 367 | // ================================================================================================= 368 | 369 | // C++11 version of 'nvrtcProgram'. Additionally holds the program's source code. 370 | class Program { 371 | public: 372 | // Note that there is no constructor based on the regular CUDA data-type because of extra state 373 | 374 | // Source-based constructor with memory management 375 | explicit Program(const Context &, std::string source): 376 | program_(new nvrtcProgram, [](nvrtcProgram* p) { 377 | if (*p) { CheckErrorDtorNVRTC(nvrtcDestroyProgram(p)); } 378 | delete p; 379 | }), 380 | source_(std::move(source)), 381 | from_binary_(false) { 382 | const auto source_ptr = &source_[0]; 383 | CheckErrorNVRTC(nvrtcCreateProgram(program_.get(), source_ptr, nullptr, 0, nullptr, nullptr)); 384 | } 385 | 386 | // PTX-based constructor 387 | explicit Program(const Device &device, const Context &context, const std::string &binary): 388 | program_(nullptr), // not used 389 | source_(binary), 390 | from_binary_(true) { 391 | } 392 | 393 | // Compiles the device program and checks whether or not there are any warnings/errors 394 | void Build(const Device &, std::vector &options) { 395 | if (from_binary_) { return; } 396 | auto raw_options = std::vector(); 397 | for (const auto &option: options) { 398 | raw_options.push_back(option.c_str()); 399 | } 400 | auto status = nvrtcCompileProgram(*program_, raw_options.size(), raw_options.data()); 401 | CLCudaAPINVRTCError::Check(status, "nvrtcCompileProgram"); 402 | } 403 | 404 | // Confirms whether a certain status code is an actual compilation error or warning 405 | bool StatusIsCompilationWarningOrError(const nvrtcResult status) const { 406 | return (status == NVRTC_ERROR_INVALID_INPUT); 407 | } 408 | 409 | // 
Retrieves the warning/error message from the compiler (if any) 410 | std::string GetBuildInfo(const Device &) const { 411 | if (from_binary_) { return std::string{}; } 412 | auto bytes = size_t{0}; 413 | CheckErrorNVRTC(nvrtcGetProgramLogSize(*program_, &bytes)); 414 | auto result = std::string{}; 415 | result.resize(bytes); 416 | CheckErrorNVRTC(nvrtcGetProgramLog(*program_, &result[0])); 417 | return result; 418 | } 419 | 420 | // Retrieves an intermediate representation of the compiled program (i.e. PTX) 421 | std::string GetIR() const { 422 | if (from_binary_) { return source_; } // holds the PTX 423 | auto bytes = size_t{0}; 424 | CheckErrorNVRTC(nvrtcGetPTXSize(*program_, &bytes)); 425 | auto result = std::string{}; 426 | result.resize(bytes); 427 | CheckErrorNVRTC(nvrtcGetPTX(*program_, &result[0])); 428 | return result; 429 | } 430 | 431 | // Accessor to the private data-member 432 | const nvrtcProgram& operator()() const { return *program_; } 433 | private: 434 | std::shared_ptr program_; 435 | const std::string source_; 436 | const bool from_binary_; 437 | }; 438 | 439 | // ================================================================================================= 440 | 441 | // Raw command-queue type 442 | using RawCommandQueue = CUstream; 443 | 444 | // C++11 version of 'CUstream' 445 | class Queue { 446 | public: 447 | // Note that there is no constructor based on the regular CUDA data-type because of extra state 448 | 449 | // Regular constructor with memory management 450 | explicit Queue(const Context &context, const Device &device): 451 | queue_(new CUstream, [](CUstream* s) { 452 | if (*s) { CheckErrorDtor(cuStreamDestroy(*s)); } 453 | delete s; 454 | }), 455 | context_(context), 456 | device_(device) { 457 | CheckError(cuStreamCreate(queue_.get(), CU_STREAM_NON_BLOCKING)); 458 | } 459 | 460 | // Synchronizes the queue and optionally also an event 461 | void Finish(Event &event) const { 462 | CheckError(cuEventSynchronize(event.end())); 463 
| Finish(); 464 | } 465 | void Finish() const { 466 | CheckError(cuStreamSynchronize(*queue_)); 467 | } 468 | 469 | // Retrieves the corresponding context or device 470 | Context GetContext() const { return context_; } 471 | Device GetDevice() const { return device_; } 472 | 473 | // Accessor to the private data-member 474 | const RawCommandQueue& operator()() const { return *queue_; } 475 | private: 476 | std::shared_ptr queue_; 477 | const Context context_; 478 | const Device device_; 479 | }; 480 | 481 | // ================================================================================================= 482 | 483 | // C++11 version of page-locked host memory 484 | template 485 | class BufferHost { 486 | public: 487 | 488 | // Regular constructor with memory management 489 | explicit BufferHost(const Context &, const size_t size): 490 | buffer_(new void*, [](void** m) { CheckError(cuMemFreeHost(*m)); delete m; }), 491 | size_(size) { 492 | CheckError(cuMemAllocHost(buffer_.get(), size*sizeof(T))); 493 | } 494 | 495 | // Retrieves the actual allocated size in bytes 496 | size_t GetSize() const { 497 | return size_*sizeof(T); 498 | } 499 | 500 | // Compatibility with std::vector 501 | size_t size() const { return size_; } 502 | T* begin() { return &static_cast(*buffer_)[0]; } 503 | T* end() { return &static_cast(*buffer_)[size_-1]; } 504 | T& operator[](const size_t i) { return static_cast(*buffer_)[i]; } 505 | T* data() { return static_cast(*buffer_); } 506 | const T* data() const { return static_cast(*buffer_); } 507 | 508 | private: 509 | std::shared_ptr buffer_; 510 | const size_t size_; 511 | }; 512 | 513 | // ================================================================================================= 514 | 515 | // Enumeration of buffer access types 516 | enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned }; 517 | 518 | // C++11 version of 'CUdeviceptr' 519 | template 520 | class Buffer { 521 | public: 522 | 523 | // Constructor based 
on the regular CUDA data-type: memory management is handled elsewhere 524 | explicit Buffer(const CUdeviceptr buffer): 525 | buffer_(new CUdeviceptr), 526 | access_(BufferAccess::kNotOwned) { 527 | *buffer_ = buffer; 528 | } 529 | 530 | // Regular constructor with memory management. If this class does not own the buffer object, then 531 | // the memory will not be freed automatically afterwards. 532 | explicit Buffer(const Context &, const BufferAccess access, const size_t size): 533 | buffer_(new CUdeviceptr, [access](CUdeviceptr* m) { 534 | if (access != BufferAccess::kNotOwned) { CheckError(cuMemFree(*m)); } 535 | delete m; 536 | }), 537 | access_(access) { 538 | CheckError(cuMemAlloc(buffer_.get(), size*sizeof(T))); 539 | } 540 | 541 | // As above, but now with read/write access as a default 542 | explicit Buffer(const Context &context, const size_t size): 543 | Buffer(context, BufferAccess::kReadWrite, size) { 544 | } 545 | 546 | // Constructs a new buffer based on an existing host-container 547 | template 548 | explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end): 549 | Buffer(context, BufferAccess::kReadWrite, static_cast(end - start)) { 550 | auto size = static_cast(end - start); 551 | auto pointer = &*start; 552 | CheckError(cuMemcpyHtoDAsync(*buffer_, pointer, size*sizeof(T), queue())); 553 | queue.Finish(); 554 | } 555 | 556 | // Copies from device to host: reading the device buffer a-synchronously 557 | void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { 558 | if (access_ == BufferAccess::kWriteOnly) { 559 | throw LogicError("Buffer: reading from a write-only buffer"); 560 | } 561 | CheckError(cuMemcpyDtoHAsync(host, *buffer_ + offset*sizeof(T), size*sizeof(T), queue())); 562 | } 563 | void ReadAsync(const Queue &queue, const size_t size, std::vector &host, 564 | const size_t offset = 0) const { 565 | if (host.size() < size) { 566 | throw LogicError("Buffer: target host 
buffer is too small"); 567 | } 568 | ReadAsync(queue, size, host.data(), offset); 569 | } 570 | void ReadAsync(const Queue &queue, const size_t size, BufferHost &host, 571 | const size_t offset = 0) const { 572 | if (host.size() < size) { 573 | throw LogicError("Buffer: target host buffer is too small"); 574 | } 575 | ReadAsync(queue, size, host.data(), offset); 576 | } 577 | 578 | // Copies from device to host: reading the device buffer 579 | void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { 580 | ReadAsync(queue, size, host, offset); 581 | queue.Finish(); 582 | } 583 | void Read(const Queue &queue, const size_t size, std::vector &host, 584 | const size_t offset = 0) const { 585 | Read(queue, size, host.data(), offset); 586 | } 587 | void Read(const Queue &queue, const size_t size, BufferHost &host, 588 | const size_t offset = 0) const { 589 | Read(queue, size, host.data(), offset); 590 | } 591 | 592 | // Copies from host to device: writing the device buffer a-synchronously 593 | void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { 594 | if (access_ == BufferAccess::kReadOnly) { 595 | throw LogicError("Buffer: writing to a read-only buffer"); 596 | } 597 | if (GetSize() < (offset+size)*sizeof(T)) { 598 | throw LogicError("Buffer: target device buffer is too small"); 599 | } 600 | CheckError(cuMemcpyHtoDAsync(*buffer_ + offset*sizeof(T), host, size*sizeof(T), queue())); 601 | } 602 | void WriteAsync(const Queue &queue, const size_t size, const std::vector &host, 603 | const size_t offset = 0) { 604 | WriteAsync(queue, size, host.data(), offset); 605 | } 606 | void WriteAsync(const Queue &queue, const size_t size, const BufferHost &host, 607 | const size_t offset = 0) { 608 | WriteAsync(queue, size, host.data(), offset); 609 | } 610 | 611 | // Copies from host to device: writing the device buffer 612 | void Write(const Queue &queue, const size_t size, const T* host, const size_t 
offset = 0) { 613 | WriteAsync(queue, size, host, offset); 614 | queue.Finish(); 615 | } 616 | void Write(const Queue &queue, const size_t size, const std::vector &host, 617 | const size_t offset = 0) { 618 | Write(queue, size, host.data(), offset); 619 | } 620 | void Write(const Queue &queue, const size_t size, const BufferHost &host, 621 | const size_t offset = 0) { 622 | Write(queue, size, host.data(), offset); 623 | } 624 | 625 | // Copies the contents of this buffer into another device buffer 626 | void CopyToAsync(const Queue &queue, const size_t size, const Buffer &destination) const { 627 | CheckError(cuMemcpyDtoDAsync(destination(), *buffer_, size*sizeof(T), queue())); 628 | } 629 | void CopyTo(const Queue &queue, const size_t size, const Buffer &destination) const { 630 | CopyToAsync(queue, size, destination); 631 | queue.Finish(); 632 | } 633 | 634 | // Retrieves the actual allocated size in bytes 635 | size_t GetSize() const { 636 | auto result = size_t{0}; 637 | CheckError(cuMemGetAddressRange(nullptr, &result, *buffer_)); 638 | return result; 639 | } 640 | 641 | // Accessors to the private data-members 642 | CUdeviceptr operator()() const { return *buffer_; } 643 | CUdeviceptr& operator()() { return *buffer_; } 644 | private: 645 | std::shared_ptr buffer_; 646 | const BufferAccess access_; 647 | }; 648 | 649 | // ================================================================================================= 650 | 651 | // C++11 version of 'CUfunction' 652 | class Kernel { 653 | public: 654 | 655 | // Constructor based on the regular CUDA data-type: memory management is handled elsewhere 656 | explicit Kernel(const CUmodule module, const CUfunction kernel): 657 | module_(module), 658 | kernel_(kernel) { 659 | } 660 | 661 | // Regular constructor with memory management 662 | explicit Kernel(const Program &program, const std::string &name) { 663 | CheckError(cuModuleLoadDataEx(&module_, program.GetIR().data(), 0, nullptr, nullptr)); 664 | 
CheckError(cuModuleGetFunction(&kernel_, module_, name.c_str())); 665 | } 666 | 667 | // Sets a kernel argument at the indicated position. This stores both the value of the argument 668 | // (as raw bytes) and the index indicating where this value can be found. 669 | template 670 | void SetArgument(const size_t index, const T &value) { 671 | if (index >= arguments_indices_.size()) { arguments_indices_.resize(index+1); } 672 | arguments_indices_[index] = arguments_data_.size(); 673 | for (auto j=size_t(0); j(&value)[j]); 675 | } 676 | } 677 | template 678 | void SetArgument(const size_t index, Buffer &value) { 679 | SetArgument(index, value()); 680 | } 681 | 682 | // Sets all arguments in one go using parameter packs. Note that this resets all previously set 683 | // arguments using 'SetArgument' or 'SetArguments'. 684 | template 685 | void SetArguments(Args&... args) { 686 | arguments_indices_.clear(); 687 | arguments_data_.clear(); 688 | SetArgumentsRecursive(0, args...); 689 | } 690 | 691 | // Retrieves the amount of local memory used per work-group for this kernel. Note that this the 692 | // shared memory in CUDA terminology. 
693 | unsigned long LocalMemUsage(const Device &) const { 694 | auto result = 0; 695 | CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_)); 696 | return static_cast(result); 697 | } 698 | 699 | // Retrieves the name of the kernel 700 | std::string GetFunctionName() const { 701 | return std::string{"unknown"}; // Not implemented for the CUDA backend 702 | } 703 | 704 | // Launches a kernel onto the specified queue 705 | void Launch(const Queue &queue, const std::vector &global, 706 | const std::vector &local, EventPointer event) { 707 | 708 | // Creates the grid (number of threadblocks) and sets the block sizes (threads per block) 709 | auto grid = std::vector{1, 1, 1}; 710 | auto block = std::vector{1, 1, 1}; 711 | if (global.size() != local.size()) { throw LogicError("invalid thread/workgroup dimensions"); } 712 | for (auto i=size_t{0}; i pointers; 717 | for (auto &index: arguments_indices_) { 718 | pointers.push_back(&arguments_data_[index]); 719 | } 720 | 721 | // Launches the kernel, its execution time is recorded by events 722 | CheckError(cuEventRecord(event->start(), queue())); 723 | CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2], 724 | 0, queue(), pointers.data(), nullptr)); 725 | CheckError(cuEventRecord(event->end(), queue())); 726 | } 727 | 728 | // As above, but with an event waiting list 729 | // TODO: Implement this function 730 | void Launch(const Queue &queue, const std::vector &global, 731 | const std::vector &local, EventPointer event, 732 | std::vector& waitForEvents) { 733 | if (local.size() == 0) { 734 | throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end"); 735 | } 736 | else if (waitForEvents.size() != 0) { 737 | throw LogicError("Kernel: launching with an event waiting list is not implemented for the CUDA back-end"); 738 | } 739 | else { 740 | return Launch(queue, global, local, event); 741 | } 742 | } 743 | 
744 | // Accessors to the private data-members 745 | const CUfunction& operator()() const { return kernel_; } 746 | CUfunction operator()() { return kernel_; } 747 | private: 748 | CUmodule module_; 749 | CUfunction kernel_; 750 | std::vector arguments_indices_; // Indices of the arguments 751 | std::vector arguments_data_; // The arguments data as raw bytes 752 | 753 | // Internal implementation for the recursive SetArguments function. 754 | template 755 | void SetArgumentsRecursive(const size_t index, T &first) { 756 | SetArgument(index, first); 757 | } 758 | template 759 | void SetArgumentsRecursive(const size_t index, T &first, Args&... args) { 760 | SetArgument(index, first); 761 | SetArgumentsRecursive(index+1, args...); 762 | } 763 | }; 764 | 765 | // ================================================================================================= 766 | } // namespace CLCudaAPI 767 | 768 | // CLCUDAAPI_CUPP11_H_ 769 | #endif 770 | -------------------------------------------------------------------------------- /include/cxpp11_common.hpp: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 6 | // 7 | // Author(s): 8 | // Ivan Shapovalov 9 | // Cedric Nugteren 10 | // 11 | // This file contains exception classes as needed for either 'clpp11.hpp' or 'cupp11.h'. 
//
// =================================================================================================

#ifndef CLCUDAAPI_CXPP11_COMMON_H_
#define CLCUDAAPI_CXPP11_COMMON_H_

#include <string>    // std::string
#include <cstring>   // strchr
#include <stdexcept> // std::runtime_error
#include <utility>   // std::forward (was only transitively included before)

namespace CLCudaAPI {
// =================================================================================================

// Basic exception class: represents an error happened inside our code
// (as opposed to an error in C++ runtime)
template <typename Base>
class Error : public Base {
 public:
  // Perfect forwarding of the constructor since "using Base::Base" is not supported by VS 2013
  template <typename... Args>
  Error(Args&&... args):
      Base(std::forward<Args>(args)...) {
  }
};

// =================================================================================================

// Represents a generic device-specific runtime error (returned by an OpenCL or CUDA API function)
class DeviceError : public Error<std::runtime_error> {
 public:
  // Perfect forwarding of the constructor since "using Error<std::runtime_error>::Error" is not
  // supported by VS 2013
  template <typename... Args>
  DeviceError(Args&&... args):
      Error<std::runtime_error>(std::forward<Args>(args)...) {
  }

  // Strips the argument list from a call string, e.g. "clFoo(a, b)" -> "clFoo". Returns the
  // input unchanged when it contains no '('.
  static std::string TrimCallString(const char *where) {
    const char *paren = strchr(where, '(');
    if (paren) {
      return std::string(where, paren);
    } else {
      return std::string(where);
    }
  }
};

// =================================================================================================

// Represents a generic runtime error (aka environmental problem)
class RuntimeError : public Error<std::runtime_error> {
 public:
  explicit RuntimeError(const std::string &reason):
      Error("Run-time error: " + reason) {
  }
};

// =================================================================================================

// Represents a generic logic error (aka failed assertion)
class LogicError : public Error<std::logic_error> {
 public:
  explicit LogicError(const std::string &reason):
      Error("Internal logic error: " + reason) {
  }
};

// =================================================================================================

// Internal exception base class with a status field and a subclass-specific "details" field
// which can be used to recreate an exception
template <typename Base, typename Status>
class ErrorCode : public Base {
 public:
  ErrorCode(Status status, const std::string &details, const std::string &reason):
      Base(reason),
      status_(status),
      details_(details) {
  }

  // The raw status code returned by the underlying OpenCL/CUDA API call
  Status status() const {
    return status_;
  }

  // Subclass-specific context (e.g. the name of the failing API call)
  const std::string& details() const {
    return details_;
  }

 private:
  const Status status_;
  const std::string details_;
};

// =================================================================================================

} // namespace CLCudaAPI

// CLCUDAAPI_CXPP11_COMMON_H_
#endif

// ===================== file: samples/advanced.cc =====================
-------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file demonstrates more advanced usage of the C++11 interfaces to CUDA and OpenCL through 11 | // CLCudaAPI. This includes 2D thread dimensions and asynchronous host-device communication. The 12 | // example conserns a 2D convolution kernel with a very simple hard-coded 3x3 blur filter. 13 | // 14 | // ================================================================================================= 15 | // 16 | // Copyright 2015 SURFsara 17 | // 18 | // Licensed under the Apache License, Version 2.0 (the "License"); 19 | // you may not use this file except in compliance with the License. 20 | // You may obtain a copy of the License at 21 | // 22 | // http://www.apache.org/licenses/LICENSE-2.0 23 | // 24 | // Unless required by applicable law or agreed to in writing, software 25 | // distributed under the License is distributed on an "AS IS" BASIS, 26 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 27 | // See the License for the specific language governing permissions and 28 | // limitations under the License. 
29 | // 30 | // ================================================================================================= 31 | 32 | // Run with either OpenCL or CUDA as a back-end 33 | #if USE_OPENCL 34 | #include "clpp11.h" 35 | #else 36 | #include "cupp11.h" 37 | #endif 38 | 39 | // C++ includes 40 | #include 41 | #include 42 | #include 43 | 44 | // ================================================================================================= 45 | 46 | // This example uses a single monolithic function 47 | int main() { 48 | 49 | // This example passes different options to the run-time compiler based on which back-end is used 50 | #if USE_OPENCL 51 | auto compiler_options = std::vector{}; 52 | #else 53 | auto compiler_options = std::vector{"--gpu-architecture=compute_35"}; 54 | #endif 55 | 56 | // Example CUDA/OpenCL program as a string. Note that this is the first (header) part only, the 57 | // main body of the kernel is common among the two back-ends and is therefore not duplicated. 58 | #if USE_OPENCL 59 | auto program_head = R"( 60 | __kernel void convolution(__global float* x, __global float* y, 61 | const int size_x, const int size_y) { 62 | const int tid_x = get_global_id(0); 63 | const int tid_y = get_global_id(1); 64 | )"; 65 | #else 66 | auto program_head = R"( 67 | extern "C" __global__ void convolution(float* x, float* y, 68 | const int size_x, const int size_y) { 69 | const int tid_x = threadIdx.x + blockDim.x*blockIdx.x; 70 | const int tid_y = threadIdx.y + blockDim.y*blockIdx.y; 71 | )"; 72 | #endif 73 | 74 | // The common body of the OpenCL/CUDA program. This is glued after the 'program_head' string. 75 | // It implements a star-based fixed 3x3 blur filter. 
76 | auto program_tail = R"( 77 | float value = 0.0f; 78 | if (tid_x >= 1 && tid_y >= 1 && tid_x < size_x-1 && tid_y < size_y-1) { 79 | value += 0.2*x[(tid_y+1)*size_x + (tid_x )]; 80 | value += 0.2*x[(tid_y-1)*size_x + (tid_x )]; 81 | value += 0.2*x[(tid_y )*size_x + (tid_x )]; 82 | value += 0.2*x[(tid_y )*size_x + (tid_x+1)]; 83 | value += 0.2*x[(tid_y )*size_x + (tid_x-1)]; 84 | } 85 | y[tid_y*size_x + tid_x] = value; 86 | })"; 87 | auto program_string = std::string{program_head} + std::string{program_tail}; 88 | 89 | // =============================================================================================== 90 | 91 | // Sets the size of the 2D input/output matrices 92 | constexpr auto size_x = size_t{2048}; 93 | constexpr auto size_y = size_t{2048}; 94 | auto size = size_x * size_y; 95 | 96 | // Platform/device settings 97 | constexpr auto platform_id = size_t{0}; 98 | constexpr auto device_id = size_t{0}; 99 | 100 | // Initializes the CLCudaAPI platform and device. This initializes the OpenCL/CUDA back-end and 101 | // selects a specific device on the platform. The device class has methods to retrieve properties 102 | // such as the device name and vendor. More examples of device properties are given in the 103 | // `device_info.cc` sample program. 104 | printf("\n## Initializing...\n"); 105 | auto platform = CLCudaAPI::Platform(platform_id); 106 | auto device = CLCudaAPI::Device(platform, device_id); 107 | printf(" > Running on device '%s' of '%s'\n", device.Name().c_str(), device.Vendor().c_str()); 108 | 109 | // Creates a new CLCudaAPI context and queue for this device. The queue can be used to schedule 110 | // commands such as launching a kernel or performing a device-host memory copy. 
111 | auto context = CLCudaAPI::Context(device); 112 | auto queue = CLCudaAPI::Queue(context, device); 113 | 114 | // Creates a new CLCudaAPI event to be able to time kernels 115 | auto event = CLCudaAPI::Event(); 116 | 117 | // Creates a new program based on the kernel string. Note that the kernel string is moved-out when 118 | // constructing the program to save copying: it should no longer be used in the remainder of this 119 | // function. 120 | auto program = CLCudaAPI::Program(context, std::move(program_string)); 121 | 122 | // Builds this program and checks for any compilation errors. If there are any, they are printed 123 | // and execution is halted. 124 | printf("## Compiling the kernel...\n"); 125 | try { 126 | program.Build(device, compiler_options); 127 | } catch (const CLCudaAPI::CLCudaAPIBuildError &e) { 128 | if (program.StatusIsCompilationWarningOrError(e.status())) { 129 | auto message = program.GetBuildInfo(device); 130 | printf(" > Compiler error(s)/warning(s) found:\n%s\n", message.c_str()); 131 | } 132 | throw; 133 | } 134 | 135 | // Populate host matrices based on CUDA/OpenCL host buffers. When using the CUDA back-end, this 136 | // will create page-locked memories, benefiting from higher bandwidth when copying between the 137 | // host and device. These buffers mimic std::vector to some extend and can therefore be filled 138 | // using either the '[]' operator or range-based for-loops. 139 | auto host_a = CLCudaAPI::BufferHost(context, size); 140 | auto host_b = CLCudaAPI::BufferHost(context, size); 141 | for (auto x=size_t{0}; x(x + y/4); 144 | } 145 | } 146 | for (auto &item: host_b) { item = 0.0f; } 147 | 148 | // Creates two new device buffers and prints the sizes of these device buffers. Both buffers 149 | // in this example are readable and writable. 
150 | printf("## Allocating device memory...\n"); 151 | auto dev_a = CLCudaAPI::Buffer(context, CLCudaAPI::BufferAccess::kReadWrite, size); 152 | auto dev_b = CLCudaAPI::Buffer(context, CLCudaAPI::BufferAccess::kReadWrite, size); 153 | printf(" > Size of buffer A is %zu bytes\n", dev_a.GetSize()); 154 | printf(" > Size of buffer B is %zu bytes\n", dev_b.GetSize()); 155 | 156 | // Copies the matrices to the device a-synchronously. The queue is then finished to ensure that 157 | // the operations are completed before continuing. 158 | dev_a.WriteAsync(queue, size, host_a); 159 | dev_b.WriteAsync(queue, size, host_b); 160 | queue.Finish(); 161 | 162 | // Creates the 'convolution' kernel from the compiled program and sets the four arguments. Note 163 | // that this uses the direct form instead of setting each argument separately. 164 | auto kernel = CLCudaAPI::Kernel(program, "convolution"); 165 | auto size_x_int = static_cast(size_x); 166 | auto size_y_int = static_cast(size_y); 167 | kernel.SetArguments(dev_a, dev_b, size_x_int, size_y_int); 168 | 169 | // Creates a 2-dimensional thread configuration with thread-blocks/work-groups of 16x16 threads 170 | // and a total number of threads equal to the number of elements in the input/output matrices. 171 | constexpr auto kWorkGroupSizeX = size_t{16}; 172 | constexpr auto kWorkGroupSizeY = size_t{16}; 173 | auto global = std::vector{static_cast(size_x), static_cast(size_y)}; 174 | auto local = std::vector{kWorkGroupSizeX, kWorkGroupSizeY}; 175 | 176 | // Makes sure that the thread configuration is legal on this device 177 | if (!device.IsThreadConfigValid(local)) { 178 | printf("## Unsupported local thread configuration for this device, exiting.\n"); 179 | return 1; 180 | } 181 | 182 | // Enqueues the kernel and waits for the result. Note that launching the kernel is always 183 | // a-synchronous and thus requires finishing the queue in order to complete the operation. 
184 | printf("## Running the kernel...\n"); 185 | kernel.Launch(queue, global, local, event.pointer()); 186 | queue.Finish(event); 187 | printf(" > Took %.3lf ms\n", event.GetElapsedTime()); 188 | 189 | // For illustration purposes, this copies the result into a new device buffer. The old result 190 | // buffer 'dev_b' is now no longer used. 191 | auto dev_b_copy = CLCudaAPI::Buffer(context, CLCudaAPI::BufferAccess::kReadWrite, size); 192 | dev_b.CopyTo(queue, size, dev_b_copy); 193 | 194 | // Reads the results back from the new copy into the host memory 195 | dev_b_copy.ReadAsync(queue, size, host_b); 196 | queue.Finish(); 197 | 198 | // Prints the results for a couple of indices to verify that the work has been done 199 | printf("## All done. Sampled verification:\n"); 200 | const auto verification_indices = std::vector{20}; 201 | for (const auto &index: verification_indices) { 202 | printf(" > 0.2*%.lf + 0.2*%.lf + 0.2*%.lf + 0.2*%.lf + 0.2*%.lf = %.2lf\n", 203 | host_a[(index+1)*size_x + (index )], host_a[(index-1)*size_x + (index )], 204 | host_a[(index )*size_x + (index )], host_a[(index )*size_x + (index+1)], 205 | host_a[(index )*size_x + (index-1)], 206 | host_b[index*size_x + index]); 207 | } 208 | 209 | // End of the example: no frees or clean-up needed 210 | return 0; 211 | } 212 | 213 | // ================================================================================================= 214 | -------------------------------------------------------------------------------- /samples/device_info.cc: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 
6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file implements a generic version of 'clinfo' (OpenCL) and 'deviceQuery' (CUDA). This 11 | // demonstrates some of the features of CLCudaAPI's generic Device class. 12 | // 13 | // ================================================================================================= 14 | // 15 | // Copyright 2015 SURFsara 16 | // 17 | // Licensed under the Apache License, Version 2.0 (the "License"); 18 | // you may not use this file except in compliance with the License. 19 | // You may obtain a copy of the License at 20 | // 21 | // http://www.apache.org/licenses/LICENSE-2.0 22 | // 23 | // Unless required by applicable law or agreed to in writing, software 24 | // distributed under the License is distributed on an "AS IS" BASIS, 25 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | // See the License for the specific language governing permissions and 27 | // limitations under the License. 28 | // 29 | // ================================================================================================= 30 | 31 | // C++ includes 32 | #include 33 | #include 34 | #include 35 | 36 | // Run with either OpenCL or CUDA as a back-end 37 | #if USE_OPENCL 38 | #include "clpp11.h" 39 | #else 40 | #include "cupp11.h" 41 | #endif 42 | 43 | // ================================================================================================= 44 | 45 | // Example implementation of a device-query/info program 46 | int main() { 47 | 48 | // Platform/device settings 49 | constexpr auto platform_id = size_t{0}; 50 | constexpr auto device_id = size_t{0}; 51 | 52 | // Initializes the CLCudaAPI platform and device. This initializes the OpenCL/CUDA back-end and 53 | // selects a specific device on the platform. 54 | const auto platform = CLCudaAPI::Platform(platform_id); 55 | const auto device = CLCudaAPI::Device(platform, device_id); 56 | 57 | // Prints information about the chosen device. 
Most of these results should stay the same when 58 | // switching between the CUDA and OpenCL back-ends. 59 | printf("\n## Printing platform information...\n"); 60 | printf(" > Platform ID %zu\n", platform_id); 61 | printf(" > Platform name %s\n", platform.Name().c_str()); 62 | printf(" > Platform vendor %s\n", platform.Vendor().c_str()); 63 | printf(" > Platform version %s\n", platform.Version().c_str()); 64 | printf("\n## Printing device information...\n"); 65 | printf(" > Device ID %zu\n", device_id); 66 | printf(" > Framework version %s\n", device.Version().c_str()); 67 | printf(" > Vendor %s\n", device.Vendor().c_str()); 68 | printf(" > Device name %s\n", device.Name().c_str()); 69 | if (device.HasExtension("cl_amd_device_attribute_query")) { 70 | printf(" > AMD board name %s\n", device.AMDBoardName().c_str()); 71 | } 72 | if (device.HasExtension("cl_nv_device_attribute_query")) { 73 | printf(" > NVIDIA compute capability %s\n", device.NVIDIAComputeCapability().c_str()); 74 | } 75 | printf(" > Device type %s\n", device.Type().c_str()); 76 | printf(" > Max work-group size %zu\n", device.MaxWorkGroupSize()); 77 | printf(" > Max thread dimensions %zu\n", device.MaxWorkItemDimensions()); 78 | printf(" > Max work-group sizes:\n"); 79 | for (auto i=size_t{0}; i Local memory per work-group %zu bytes\n", device.LocalMemSize()); 83 | printf(" > Device capabilities %s\n", device.Capabilities().c_str()); 84 | printf(" > Core clock rate %zu MHz\n", device.CoreClock()); 85 | printf(" > Number of compute units %zu\n", device.ComputeUnits()); 86 | printf(" > Total memory size %zu bytes\n", device.MemorySize()); 87 | printf(" > Maximum allocatable memory %zu bytes\n", device.MaxAllocSize()); 88 | printf(" > Memory clock rate %zu MHz\n", device.MemoryClock()); 89 | printf(" > Memory bus width %zu bits\n", device.MemoryBusWidth()); 90 | 91 | // End of the example: no frees or clean-up needed 92 | return 0; 93 | } 94 | 95 | // 
================================================================================================= 96 | -------------------------------------------------------------------------------- /samples/simple.cc: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file implements a relatively simple toy example, in which an input vector is multiplied by 11 | // a constant to produce an output vector. This example demonstrates the basic usage of the C++11 12 | // interfaces to CUDA and OpenCL through CLCudaAPI. 13 | // 14 | // ================================================================================================= 15 | // 16 | // Copyright 2015 SURFsara 17 | // 18 | // Licensed under the Apache License, Version 2.0 (the "License"); 19 | // you may not use this file except in compliance with the License. 20 | // You may obtain a copy of the License at 21 | // 22 | // http://www.apache.org/licenses/LICENSE-2.0 23 | // 24 | // Unless required by applicable law or agreed to in writing, software 25 | // distributed under the License is distributed on an "AS IS" BASIS, 26 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 27 | // See the License for the specific language governing permissions and 28 | // limitations under the License. 
29 | // 30 | // ================================================================================================= 31 | 32 | // Runs with either OpenCL or CUDA as a back-end 33 | #if USE_OPENCL 34 | #include "clpp11.h" 35 | #else 36 | #include "cupp11.h" 37 | #endif 38 | 39 | // C++ includes 40 | #include 41 | #include 42 | #include 43 | 44 | // ================================================================================================= 45 | 46 | // This example uses a single monolithic function 47 | int main() { 48 | 49 | // Example CUDA/OpenCL program as a string. Note that the strings are loaded here as raw string 50 | // literals (using C++11's R"(string)" syntax). However, they can also be generated in-line or 51 | // perhaps placed in a separate file and loaded at run-time. 52 | #if USE_OPENCL 53 | auto program_string = R"( 54 | __kernel void multiply(__global float* x, __global float* y, const int factor) { 55 | const int tid = get_global_id(0); 56 | y[tid] = x[tid] * factor; 57 | })"; 58 | #else 59 | auto program_string = R"( 60 | extern "C" __global__ void multiply(float* x, float* y, const int factor) { 61 | const int tid = threadIdx.x + blockDim.x*blockIdx.x; 62 | y[tid] = x[tid] * factor; 63 | })"; 64 | #endif 65 | 66 | // =============================================================================================== 67 | 68 | // Sets the size of the vectors and the data-multiplication factor 69 | constexpr auto size = static_cast(2048 * 2048); 70 | auto multiply_factor = 2; 71 | 72 | // Platform/device settings 73 | constexpr auto platform_id = size_t{0}; 74 | constexpr auto device_id = size_t{0}; 75 | 76 | // Initializes the CLCudaAPI platform and device. This initializes the OpenCL/CUDA back-end and 77 | // selects a specific device on the platform. The device class has methods to retrieve properties 78 | // such as the device name and vendor. More examples of device properties are given in the 79 | // `device_info.cc` sample program. 
80 | printf("\n## Initializing...\n"); 81 | auto platform = CLCudaAPI::Platform(platform_id); 82 | auto device = CLCudaAPI::Device(platform, device_id); 83 | printf(" > Running on device '%s' of '%s'\n", device.Name().c_str(), device.Vendor().c_str()); 84 | 85 | // Creates a new CLCudaAPI context and queue for this device. The queue can be used to schedule 86 | // commands such as launching a kernel or performing a device-host memory copy. 87 | auto context = CLCudaAPI::Context(device); 88 | auto queue = CLCudaAPI::Queue(context, device); 89 | 90 | // Creates a new CLCudaAPI event to be able to time kernels 91 | auto event = CLCudaAPI::Event(); 92 | 93 | // Creates a new program based on the kernel string. Then, builds this program and checks for 94 | // any compilation errors. If there are any, they are printed and execution is halted. 95 | printf("## Compiling the kernel...\n"); 96 | auto program = CLCudaAPI::Program(context, program_string); 97 | auto compiler_options = std::vector{}; 98 | try { 99 | program.Build(device, compiler_options); 100 | } catch (const CLCudaAPI::CLCudaAPIBuildError &e) { 101 | if (program.StatusIsCompilationWarningOrError(e.status())) { 102 | auto message = program.GetBuildInfo(device); 103 | printf(" > Compiler error(s)/warning(s) found:\n%s\n", message.c_str()); 104 | } 105 | throw; 106 | } 107 | 108 | // Populates regular host vectors with example data 109 | auto host_a = std::vector(size); 110 | auto host_b = std::vector(size); 111 | for (auto i=size_t{0}; i(i); } 112 | for (auto &item: host_b) { item = 0.0f; } 113 | 114 | // Creates two new device buffers and copies the host data to these device buffers. 115 | auto dev_a = CLCudaAPI::Buffer(context, queue, host_a.begin(), host_a.end()); 116 | auto dev_b = CLCudaAPI::Buffer(context, queue, host_b.begin(), host_b.end()); 117 | 118 | // Creates the 'multiply' kernel from the compiled program and sets the three arguments. 
Note that 119 | // the indices of the arguments have to be set according to their order in the kernel. 120 | auto kernel = CLCudaAPI::Kernel(program, "multiply"); 121 | kernel.SetArgument(0, dev_a); 122 | kernel.SetArgument(1, dev_b); 123 | kernel.SetArgument(2, multiply_factor); 124 | 125 | // Creates a 1-dimensional thread configuration with thread-blocks/work-groups of 256 threads 126 | // and a total number of threads equal to the number of elements in the input/output vectors. 127 | constexpr auto kWorkGroupSize = size_t{256}; 128 | auto global = std::vector{size}; 129 | auto local = std::vector{kWorkGroupSize}; 130 | 131 | // Enqueues the kernel and waits for the result. Note that launching the kernel is always 132 | // a-synchronous and thus requires finishing the queue in order to complete the operation. 133 | printf("## Running the kernel...\n"); 134 | kernel.Launch(queue, global, local, event.pointer()); 135 | queue.Finish(event); 136 | printf(" > Took %.3lf ms\n", event.GetElapsedTime()); 137 | 138 | // Reads the results back to the host memory 139 | dev_b.Read(queue, size, host_b); 140 | 141 | // Prints the results for a couple of indices to verify that the work has been done 142 | printf("## All done. 
Sampled verification:\n"); 143 | const auto verification_indices = std::vector{4, 900}; 144 | for (const auto &index: verification_indices) { 145 | printf(" > %.lf*%d = %.lf\n", host_a[index], multiply_factor, host_b[index]); 146 | } 147 | 148 | // End of the example: no frees or clean-up needed 149 | return 0; 150 | } 151 | 152 | // ================================================================================================= 153 | -------------------------------------------------------------------------------- /samples/smallest.cc: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file implements a compact OpenCL/CUDA example inspired by the 'quest for the smallest OpenCL 11 | // program': http://arrayfire.com/quest-for-the-smallest-opencl-program/ 12 | // 13 | // ================================================================================================= 14 | // 15 | // Copyright 2015 SURFsara 16 | // 17 | // Licensed under the Apache License, Version 2.0 (the "License"); 18 | // you may not use this file except in compliance with the License. 19 | // You may obtain a copy of the License at 20 | // 21 | // http://www.apache.org/licenses/LICENSE-2.0 22 | // 23 | // Unless required by applicable law or agreed to in writing, software 24 | // distributed under the License is distributed on an "AS IS" BASIS, 25 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | // See the License for the specific language governing permissions and 27 | // limitations under the License. 
28 | // 29 | // ================================================================================================= 30 | 31 | // Compile using OpenCL ... 32 | #if USE_OPENCL 33 | #include "clpp11.h" 34 | static auto program_string = R"( 35 | __kernel void add(__global const float* a, __global const float* b, __global float* c) { 36 | unsigned idx = get_global_id(0); 37 | c[idx] = a[idx] + b[idx]; 38 | })"; 39 | 40 | // ... or use CUDA instead 41 | #else 42 | #include "cupp11.h" 43 | static auto program_string = R"( 44 | extern "C" __global__ void add(const float* a, const float* b, float* c) { 45 | unsigned idx = threadIdx.x + blockDim.x*blockIdx.x; 46 | c[idx] = a[idx] + b[idx]; 47 | })"; 48 | #endif 49 | 50 | #include 51 | 52 | int main() { 53 | constexpr auto platform_id = size_t{0}; 54 | constexpr auto device_id = size_t{0}; 55 | auto platform = CLCudaAPI::Platform(platform_id); 56 | auto device = CLCudaAPI::Device(platform, device_id); 57 | auto context = CLCudaAPI::Context(device); 58 | auto queue = CLCudaAPI::Queue(context, device); 59 | auto event = CLCudaAPI::Event(); 60 | 61 | // Creates and populates device memory 62 | constexpr auto elements = size_t{1024}; 63 | auto data = std::vector(elements, 5); 64 | auto a = CLCudaAPI::Buffer(context, CLCudaAPI::BufferAccess::kReadWrite, elements); 65 | auto b = CLCudaAPI::Buffer(context, CLCudaAPI::BufferAccess::kReadWrite, elements); 66 | auto c = CLCudaAPI::Buffer(context, CLCudaAPI::BufferAccess::kReadWrite, elements); 67 | a.Write(queue, elements, data); 68 | b.Write(queue, elements, data); 69 | 70 | // Compiles and launches the kernel 71 | auto program = CLCudaAPI::Program(context, program_string); 72 | auto compiler_options = std::vector{}; 73 | program.Build(device, compiler_options); 74 | auto kernel = CLCudaAPI::Kernel(program, "add"); 75 | kernel.SetArguments(a, b, c); 76 | kernel.Launch(queue, {elements}, {128}, event.pointer()); 77 | queue.Finish(event); 78 | 79 | // Reads the results back to the host 
memory 80 | auto result = std::vector(elements, 0); 81 | c.Read(queue, elements, result); 82 | for (auto &r: result) { printf("%.lf ", r); } 83 | printf("\n"); 84 | return 0; 85 | } 86 | 87 | // ================================================================================================= 88 | -------------------------------------------------------------------------------- /test/unit_tests.cc: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLCudaAPI project. The project is licensed under Apache Version 2.0. The 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file implements unit tests based on the Catch header-only test framework. 11 | // 12 | // ================================================================================================= 13 | // 14 | // Copyright 2015 SURFsara 15 | // 16 | // Licensed under the Apache License, Version 2.0 (the "License"); 17 | // you may not use this file except in compliance with the License. 18 | // You may obtain a copy of the License at 19 | // 20 | // http://www.apache.org/licenses/LICENSE-2.0 21 | // 22 | // Unless required by applicable law or agreed to in writing, software 23 | // distributed under the License is distributed on an "AS IS" BASIS, 24 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 | // See the License for the specific language governing permissions and 26 | // limitations under the License. 
27 | // 28 | // ================================================================================================= 29 | 30 | // Use Catch 31 | #define CATCH_CONFIG_MAIN 32 | #include "catch.hpp" 33 | 34 | // Runs with either OpenCL or CUDA as a back-end 35 | #if USE_OPENCL 36 | #include 37 | #else 38 | #include 39 | #endif 40 | 41 | // Settings 42 | const size_t kPlatformID = 0; 43 | const size_t kDeviceID = 0; 44 | const size_t kBufferSize = 10; 45 | 46 | // ================================================================================================= 47 | 48 | SCENARIO("events can be created and used", "[Event]") { 49 | GIVEN("An example event") { 50 | #if !USE_OPENCL 51 | auto platform = CLCudaAPI::Platform(kPlatformID); 52 | auto device = CLCudaAPI::Device(platform, kDeviceID); 53 | auto context = CLCudaAPI::Context(device); 54 | auto queue = CLCudaAPI::Queue(context, device); 55 | #endif 56 | auto event = CLCudaAPI::Event(); 57 | 58 | #if USE_OPENCL // Not available for the CUDA version 59 | WHEN("its underlying data-structure is retrieved") { 60 | auto raw_event = event(); 61 | THEN("a copy of this event can be created") { 62 | auto event_copy = CLCudaAPI::Event(raw_event); 63 | REQUIRE(event_copy() == event()); 64 | } 65 | } 66 | #else // Not available for the OpenCL version 67 | WHEN("its underlying data-structures are retrieved") { 68 | auto raw_start = event.start(); 69 | auto raw_end = event.end(); 70 | THEN("their underlying data-structures are not null") { 71 | REQUIRE(raw_start != nullptr); 72 | REQUIRE(raw_end != nullptr); 73 | } 74 | } 75 | #endif 76 | 77 | WHEN("a copy is created using the copy constructor") { 78 | auto event_copy = CLCudaAPI::Event(event); 79 | THEN("its underlying data-structure is unchanged") { 80 | #if USE_OPENCL 81 | REQUIRE(event_copy() == event()); 82 | #else 83 | REQUIRE(event_copy.start() == event.start()); 84 | REQUIRE(event_copy.end() == event.end()); 85 | #endif 86 | } 87 | } 88 | 89 | // TODO: Not working if nothing 
is recorded 90 | //WHEN("the elapsed time is retrieved") { 91 | // auto elapsed_time = event.GetElapsedTime(); 92 | // THEN("its value is valid") { 93 | // REQUIRE(elapsed_time == elapsed_time); 94 | // } 95 | //} 96 | } 97 | } 98 | 99 | // ================================================================================================= 100 | 101 | SCENARIO("platforms can be created and used", "[Platform]") { 102 | GIVEN("An example platform") { 103 | auto platform = CLCudaAPI::Platform(kPlatformID); 104 | auto num_devices = platform.NumDevices(); 105 | 106 | #if USE_OPENCL // Not available for the CUDA version 107 | WHEN("its underlying data-structure is retrieved") { 108 | auto raw_platform = platform(); 109 | THEN("a copy of this platform can be created") { 110 | auto platform_copy = CLCudaAPI::Platform(raw_platform); 111 | REQUIRE(platform_copy.NumDevices() == num_devices); 112 | } 113 | } 114 | #endif 115 | 116 | WHEN("a copy is created using the copy constructor") { 117 | auto platform_copy = CLCudaAPI::Platform(platform); 118 | THEN("the platform's properties remain unchanged") { 119 | REQUIRE(platform_copy.NumDevices() == num_devices); 120 | } 121 | } 122 | } 123 | } 124 | 125 | // ================================================================================================= 126 | 127 | TEST_CASE("a list of all platforms can be retrieved", "[Platform]") { 128 | auto all_platforms = CLCudaAPI::GetAllPlatforms(); 129 | REQUIRE(all_platforms.size() > 0); 130 | for (auto &platform : all_platforms) { 131 | auto num_devices = platform.NumDevices(); 132 | REQUIRE(num_devices > 0); 133 | } 134 | } 135 | 136 | // ================================================================================================= 137 | 138 | SCENARIO("devices can be created and used", "[Device][Platform]") { 139 | GIVEN("An example device on a platform") { 140 | auto platform = CLCudaAPI::Platform(kPlatformID); 141 | auto device = CLCudaAPI::Device(platform, kDeviceID); 142 | 143 | 
GIVEN("...and device properties") { 144 | auto device_version = device.Version(); 145 | auto device_vendor = device.Vendor(); 146 | auto device_name = device.Name(); 147 | auto device_type = device.Type(); 148 | auto device_max_work_group_size = device.MaxWorkGroupSize(); 149 | auto device_max_work_item_dimensions = device.MaxWorkItemDimensions(); 150 | auto device_max_work_item_sizes = device.MaxWorkItemSizes(); 151 | auto device_local_mem_size = device.LocalMemSize(); 152 | auto device_capabilities = device.Capabilities(); 153 | auto device_core_clock = device.CoreClock(); 154 | auto device_compute_units = device.ComputeUnits(); 155 | auto device_memory_size = device.MemorySize(); 156 | auto device_max_alloc_size = device.MaxAllocSize(); 157 | auto device_memory_clock = device.MemoryClock(); 158 | auto device_memory_bus_width = device.MemoryBusWidth(); 159 | 160 | // TODO: test for valid device properties 161 | 162 | WHEN("its underlying data-structure is retrieved") { 163 | auto raw_device = device(); 164 | THEN("a copy of this device can be created") { 165 | auto device_copy = CLCudaAPI::Device(raw_device); 166 | REQUIRE(device_copy.Name() == device_name); // Only verifying device name 167 | } 168 | } 169 | 170 | WHEN("a copy is created using the copy constructor") { 171 | auto device_copy = CLCudaAPI::Device(device); 172 | THEN("the device's properties remain unchanged") { 173 | REQUIRE(device_copy.Name() == device_name); // Only verifying device name 174 | } 175 | } 176 | 177 | WHEN("the local memory size is tested") { 178 | THEN("the maximum local memory size should be considered valid") { 179 | REQUIRE(device.IsLocalMemoryValid(device_local_mem_size) == true); 180 | } 181 | THEN("more than the maximum local memory size should be considered invalid") { 182 | REQUIRE(device.IsLocalMemoryValid(device_local_mem_size+1) == false); 183 | } 184 | } 185 | 186 | WHEN("the local thread configuration is tested") { 187 | THEN("equal to the maximum size in one dimension 
should be considered valid") { 188 | REQUIRE(device.IsThreadConfigValid({device_max_work_item_sizes[0],1,1}) == true); 189 | REQUIRE(device.IsThreadConfigValid({1,device_max_work_item_sizes[1],1}) == true); 190 | REQUIRE(device.IsThreadConfigValid({1,1,device_max_work_item_sizes[2]}) == true); 191 | } 192 | THEN("more than the maximum size in one dimension should be considered invalid") { 193 | REQUIRE(device.IsThreadConfigValid({device_max_work_item_sizes[0]+1,1,1}) == false); 194 | REQUIRE(device.IsThreadConfigValid({1,device_max_work_item_sizes[1]+1,1}) == false); 195 | REQUIRE(device.IsThreadConfigValid({1,1,device_max_work_item_sizes[2]+1}) == false); 196 | } 197 | } 198 | } 199 | } 200 | } 201 | 202 | // ================================================================================================= 203 | 204 | SCENARIO("contexts can be created and used", "[Context][Device][Platform]") { 205 | GIVEN("An example context on a device") { 206 | auto platform = CLCudaAPI::Platform(kPlatformID); 207 | auto device = CLCudaAPI::Device(platform, kDeviceID); 208 | auto context = CLCudaAPI::Context(device); 209 | 210 | WHEN("its underlying data-structure is retrieved") { 211 | auto raw_context = context(); 212 | THEN("a copy of this context can be created") { 213 | auto context_copy = CLCudaAPI::Context(raw_context); 214 | REQUIRE(context_copy() != nullptr); 215 | } 216 | } 217 | 218 | WHEN("a copy is created using the copy constructor") { 219 | auto context_copy = CLCudaAPI::Context(context); 220 | THEN("its underlying data-structure is not null") { 221 | REQUIRE(context_copy() != nullptr); 222 | } 223 | } 224 | } 225 | } 226 | 227 | // ================================================================================================= 228 | 229 | SCENARIO("programs can be created and used", "[Program][Context][Device][Platform]") { 230 | GIVEN("An example program for a specific context and device") { 231 | auto platform = CLCudaAPI::Platform(kPlatformID); 232 | auto 
device = CLCudaAPI::Device(platform, kDeviceID); 233 | auto context = CLCudaAPI::Context(device); 234 | #if USE_OPENCL 235 | auto source = R"( 236 | __kernel void add(__global const float* a, __global const float* b, __global float* c) { 237 | unsigned idx = get_global_id(0); 238 | c[idx] = a[idx] + b[idx]; 239 | })"; 240 | 241 | // ... or use CUDA instead 242 | #else 243 | auto source = R"( 244 | extern "C" __global__ void add(const float* a, const float* b, float* c) { 245 | unsigned idx = threadIdx.x + blockDim.x*blockIdx.x; 246 | c[idx] = a[idx] + b[idx]; 247 | })"; 248 | #endif 249 | auto options = std::vector(); 250 | 251 | auto program = CLCudaAPI::Program(context, source); 252 | program.Build(device, options); 253 | 254 | WHEN("an compiled IR is generated from the compiled program") { 255 | auto ir = program.GetIR(); 256 | THEN("a new program can be created based on the IR") { 257 | auto new_program = CLCudaAPI::Program(device, context, ir); 258 | new_program.Build(device, options); 259 | } 260 | } 261 | } 262 | } 263 | 264 | // ================================================================================================= 265 | 266 | SCENARIO("queues can be created and used", "[Queue][Context][Device][Platform][Event]") { 267 | GIVEN("An example queue associated to a context and device") { 268 | auto platform = CLCudaAPI::Platform(kPlatformID); 269 | auto device = CLCudaAPI::Device(platform, kDeviceID); 270 | auto context = CLCudaAPI::Context(device); 271 | auto queue = CLCudaAPI::Queue(context, device); 272 | 273 | #if USE_OPENCL // Not available for the CUDA version 274 | WHEN("its underlying data-structure is retrieved") { 275 | auto raw_queue = queue(); 276 | THEN("a copy of this queue can be created") { 277 | auto queue_copy = CLCudaAPI::Queue(raw_queue); 278 | REQUIRE(queue_copy() != nullptr); 279 | } 280 | } 281 | #endif 282 | 283 | WHEN("a copy is created using the copy constructor") { 284 | auto queue_copy = CLCudaAPI::Queue(queue); 285 | 
THEN("its underlying data-structure is not null") { 286 | REQUIRE(queue_copy() != nullptr); 287 | } 288 | } 289 | 290 | WHEN("the associated context is retrieved") { 291 | auto context_copy = queue.GetContext(); 292 | THEN("their underlying data-structures match") { 293 | REQUIRE(context_copy() == context()); 294 | } 295 | } 296 | WHEN("the associated device is retrieved") { 297 | auto device_copy = queue.GetDevice(); 298 | THEN("their underlying data-structures match") { 299 | REQUIRE(device_copy() == device()); 300 | } 301 | } 302 | 303 | WHEN("the queue is synchronised") { 304 | queue.Finish(); 305 | THEN("its underlying data-structure is not null") { 306 | REQUIRE(queue() != nullptr); 307 | } 308 | } 309 | WHEN("the queue is synchronised using an event") { 310 | auto event = CLCudaAPI::Event(); 311 | queue.Finish(event); 312 | THEN("its underlying data-structure is not null") { 313 | REQUIRE(queue() != nullptr); 314 | } 315 | } 316 | } 317 | } 318 | 319 | // ================================================================================================= 320 | 321 | SCENARIO("host buffers can be created and used", "[BufferHost][Context][Device][Platform]") { 322 | GIVEN("An example host buffer for a specific context and device") { 323 | auto platform = CLCudaAPI::Platform(kPlatformID); 324 | auto device = CLCudaAPI::Device(platform, kDeviceID); 325 | auto context = CLCudaAPI::Context(device); 326 | auto size = static_cast(kBufferSize); 327 | auto buffer_host = CLCudaAPI::BufferHost(context, size); 328 | 329 | // TODO: Fill in 330 | } 331 | } 332 | 333 | // ================================================================================================= 334 | 335 | SCENARIO("device buffers can be created and used", "[Buffer][Context][Device][Platform]") { 336 | GIVEN("An example device buffer for a specific context and device") { 337 | auto platform = CLCudaAPI::Platform(kPlatformID); 338 | auto device = CLCudaAPI::Device(platform, kDeviceID); 339 | auto context 
= CLCudaAPI::Context(device); 340 | auto size = static_cast(kBufferSize); 341 | auto buffer = CLCudaAPI::Buffer(context, size); 342 | 343 | // TODO: Fill in 344 | } 345 | } 346 | 347 | // ================================================================================================= 348 | 349 | SCENARIO("kernels can be created and used", "[Kernel][Program][Context][Device][Platform]") { 350 | GIVEN("An example device buffer for a specific context and device") { 351 | auto platform = CLCudaAPI::Platform(kPlatformID); 352 | auto device = CLCudaAPI::Device(platform, kDeviceID); 353 | auto context = CLCudaAPI::Context(device); 354 | auto source = std::string{""}; 355 | auto program = CLCudaAPI::Program(context, source); 356 | auto name = std::string{""}; 357 | //auto kernel = CLCudaAPI::Kernel(program, name); 358 | 359 | // TODO: Fill in 360 | } 361 | } 362 | 363 | // ================================================================================================= 364 | --------------------------------------------------------------------------------