├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── cmake ├── CompileOpenCL.cmake └── EmbedResource.cmake ├── common ├── CMakeLists.txt ├── OCLSample.cpp ├── OCLSample.hpp ├── Sample.hpp └── cl.hpp ├── kernels ├── matrix-multiply-tiled │ ├── matrix-multiply-tiled.cpp │ └── matrix-multiply-tiled.kernel.cpp ├── matrix-multiply │ ├── matrix-multiply.cpp │ └── matrix-multiply.kernel.cpp └── vector-add │ ├── vector-add.cpp │ └── vector-add.kernel.cpp └── opencl ├── CMakeLists.txt ├── blur2d ├── CMakeLists.txt ├── blur2d.cpp └── blur2d_kernel.cl ├── matmul-double ├── CMakeLists.txt ├── matmul-double.cpp └── matmul-double_kernel.cl └── matmul ├── CMakeLists.txt ├── matmul.cpp └── matmul_kernel.cl /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "libclc"] 2 | path = libclc 3 | url = git://git.pcc.me.uk/~peter/libclc.git 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2011 by Justin Holewinski 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | # 22 | 23 | cmake_minimum_required(VERSION 2.6) 24 | 25 | project(llvm-ptx-samples) 26 | 27 | # Locate needed packages 28 | # We use the CUDA package to locate the NVidia SDK for us 29 | find_package(CUDA REQUIRED) 30 | 31 | # Locate Clang 32 | find_program(CLANG_PROGRAM NAMES clang) 33 | if(NOT CLANG_PROGRAM) 34 | message(FATAL_ERROR "Unable to locate Clang.") 35 | else() 36 | message(STATUS "Found Clang: ${CLANG_PROGRAM}") 37 | endif() 38 | 39 | # Locate opt 40 | find_program(OPT_PROGRAM NAMES opt) 41 | if(NOT OPT_PROGRAM) 42 | message(FATAL_ERROR "Unable to locate opt.") 43 | else() 44 | message(STATUS "Found opt: ${OPT_PROGRAM}") 45 | endif() 46 | 47 | # Locate llc 48 | find_program(LLC_PROGRAM NAMES llc) 49 | if(NOT LLC_PROGRAM) 50 | message(FATAL_ERROR "Unable to locate llc.") 51 | else() 52 | message(STATUS "Found llc: ${LLC_PROGRAM}") 53 | endif() 54 | 55 | # Locate OpenCL library 56 | find_library(OPENCL_LIBRARY NAMES OpenCL) 57 | if(NOT OPENCL_LIBRARY) 58 | message(FATAL_ERROR "Unable to locate OpenCL library.") 59 | else() 60 | message(STATUS "Found OpenCL: ${OPENCL_LIBRARY}") 61 | endif() 62 | 63 | # Prep our bin directory 64 | file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 65 | set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}/bin) 66 | 67 | # Include our custom build code 68 | include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/CompileOpenCL.cmake) 69 | include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/EmbedResource.cmake) 70 | 71 | include_directories(${CUDA_INCLUDE_DIRS}) 72 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}) 73 | 74 | add_subdirectory(common) 75 | add_subdirectory(opencl) 76 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2011 by Justin Holewinski 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | LLVM PTX Samples 2 | ================ 3 | 4 | This collection of sample programs highlights the PTX code generation back-end 5 | for the LLVM project. These programs serve as both examples for the usage of 6 | the back-end (as well as the Clang front-end integration) and a simple test 7 | suite. 8 | 9 | These samples are currently being converted to OpenCL. 10 | 11 | 12 | Usage 13 | ----- 14 | 15 | To compile the samples, CMake and the NVidia CUDA toolkit is required, as well as a reasonably up-to-date build of Clang/LLVM which was built with the PTX back-end. It is recommended that you build the examples in a separate directory. 16 | 17 | For most systems, the following build commands can be used: 18 | 19 | $ cd llvm-ptx-samples 20 | $ git submodule init && git submodule update 21 | $ mkdir build 22 | $ cd build 23 | $ cmake .. 24 | $ make 25 | 26 | If CMake complains that it cannot find a CUDA toolkit, please add -DCUDA_TOOLKIT_ROOT_DIR=... to your CMake invocation, specifying the root of your CUDA installation. Similarly, if Clang or the other LLVM tools cannot be found, please adjust your PATH to include them. 27 | 28 | Once the samples are built, they can be executed from the bin/ directory, created within your build directory. For example, to run the matrix multiplication sample, you can execute: 29 | 30 | $ cd bin 31 | $ ./ocl-matmul 32 | 33 | from your build directory. 34 | -------------------------------------------------------------------------------- /cmake/CompileOpenCL.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2011 by Justin Holewinski 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | # 22 | 23 | # Compiler flags 24 | set(LIBCLC ${CMAKE_SOURCE_DIR}/libclc) 25 | set(LIBCLC_FLAGS -I${LIBCLC}/include/ptx -I${LIBCLC}/include/generic -include clc/clc.h -Dcl_clang_storage_class_specifiers) 26 | set(CLANG_FLAGS -target nvptx-unknown-nvcl -S -emit-llvm -O4 ${LIBCLC_FLAGS}) 27 | set(OPT_FLAGS -O3 -loop-unroll) 28 | set(LLC_FLAGS -mcpu=sm_20) 29 | 30 | set(RESOURCE_OUTPUT_DIR ${CMAKE_BINARY_DIR}/bin) 31 | 32 | # By default, _llout is assumed to be relative to RESOURCE_OUTPUT_DIR and 33 | # _srcin is assumed to be relative to CMAKE_CURRENT_SOURCE_DIR 34 | macro(compile_opencl_to_llvmir _llout _srcin) 35 | get_filename_component(_srcin_abs ${_srcin} ABSOLUTE) 36 | add_custom_command(OUTPUT ${RESOURCE_OUTPUT_DIR}/${_llout} 37 | DEPENDS ${_srcin_abs} 38 | COMMAND ${CLANG_PROGRAM} ${CLANG_FLAGS} ${_srcin_abs} -o ${_llout} 39 | WORKING_DIRECTORY ${RESOURCE_OUTPUT_DIR} 40 | COMMENT "Compiling ${_srcin} -> ${_llout}") 41 | add_custom_target(${_llout} DEPENDS ${RESOURCE_OUTPUT_DIR}/${_llout}) 42 | endmacro() 43 | 44 | macro(optimize_llvmir _llout _llin) 45 | set(_llout_abs ${RESOURCE_OUTPUT_DIR}/${_llout}) 46 | set(_llin_abs ${RESOURCE_OUTPUT_DIR}/${_llin}) 47 | add_custom_command(OUTPUT ${_llout_abs} 48 | DEPENDS ${_llin_abs} 49 | COMMAND ${OPT_PROGRAM} -S ${OPT_FLAGS} ${_llin_abs} -o ${_llout_abs} 50 | WORKING_DIRECTORY ${RESOURCE_OUTPUT_DIR} 51 | COMMENT "Optimizing ${_llin} -> ${_llout}") 52 | add_custom_target(${_llout} DEPENDS ${_llout_abs}) 53 | endmacro() 54 | 55 | macro(codegen_ptx _ptxout _llin) 56 | set(_ptxout_abs ${RESOURCE_OUTPUT_DIR}/${_ptxout}) 57 | set(_llin_abs ${RESOURCE_OUTPUT_DIR}/${_llin}) 58 | add_custom_command(OUTPUT ${_ptxout_abs} 59 | DEPENDS ${_llin_abs} 60 | COMMAND ${LLC_PROGRAM} ${LLC_FLAGS} ${_llin_abs} -o ${_ptxout_abs} 61 | WORKING_DIRECTORY ${RESOURCE_OUTPUT_DIR} 62 | COMMENT "Compiling ${_llin} -> ${_ptxout}") 63 | add_custom_target(${_ptxout} DEPENDS ${_ptxout_abs}) 64 | endmacro() 65 | 66 | macro(copy_opencl _clin) 67 | set(_dest_abs ${RESOURCE_OUTPUT_DIR}/${_clin}) 68 | set(_src_abs ${CMAKE_CURRENT_SOURCE_DIR}/${_clin}) 69 | add_custom_command(OUTPUT ${_dest_abs} 70 | DEPENDS ${_src_abs} 71 | COMMAND ${CMAKE_COMMAND} -E copy ${_src_abs} ${_dest_abs} 72 | WORKING_DIRECTORY ${RESOURCE_OUTPUT_DIR} 73 | COMMENT "Copying OpenCL source ${_clin}") 74 | add_custom_target(${_clin} DEPENDS ${_dest_abs}) 75 | endmacro() 76 | 77 | macro(create_opencl_targets _targets _kernel) 78 | set(${_targets}) 79 | compile_opencl_to_llvmir(${_kernel}.ll ${_kernel}.cl) 80 | optimize_llvmir(${_kernel}.opt.ll ${_kernel}.ll) 81 | codegen_ptx(${_kernel}.ptx ${_kernel}.opt.ll) 82 | copy_opencl(${_kernel}.cl) 83 | list(APPEND ${_targets} ${_kernel}.ptx ${_kernel}.cl) 84 | endmacro() 85 | -------------------------------------------------------------------------------- /cmake/EmbedResource.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2011 by Justin Holewinski 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | # 22 | 23 | macro(embed_resource _res _src) 24 | add_custom_command(OUTPUT ${_res} 25 | DEPENDS ${_src} 26 | COMMAND ${XXD_PROGRAM} -i < ${_src} > ${_res} 27 | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) 28 | set_source_files_properties(${_res} PROPERTIES GENERATED TRUE) 29 | add_custom_target(${_res} DEPENDS ${_res}) 30 | endmacro() 31 | -------------------------------------------------------------------------------- /common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2011 by Justin Holewinski 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | # 22 | 23 | set(_sources OCLSample.cpp) 24 | 25 | set(_headers OCLSample.hpp 26 | Sample.hpp) 27 | 28 | add_library(sampleutil STATIC ${_sources} ${_headers}) 29 | -------------------------------------------------------------------------------- /common/OCLSample.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 by Justin Holewinski 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "common/OCLSample.hpp" 28 | 29 | OCLSample::OCLSample() 30 | : numIterations_(4) { 31 | initOpenCL(); 32 | } 33 | 34 | OCLSample::~OCLSample() { 35 | } 36 | 37 | void OCLSample::initialize() { 38 | } 39 | 40 | void OCLSample::createMemoryBuffers() { 41 | } 42 | 43 | void OCLSample::setupKernel(cl::Kernel kernel) { 44 | } 45 | 46 | void OCLSample::finishKernel(cl::Kernel kernel) { 47 | } 48 | 49 | void OCLSample::runKernel(cl::Kernel kernel, cl::Event* evt) { 50 | } 51 | 52 | void OCLSample::run() { 53 | double elapsed, average; 54 | 55 | initialize(); 56 | createMemoryBuffers(); 57 | 58 | std::cout << "------------------------------\n"; 59 | std::cout << "* Source Kernel\n"; 60 | std::cout << "------------------------------\n"; 61 | setupKernel(sourceKernel_); 62 | timeKernel(sourceKernel_, elapsed, average); 63 | finishKernel(sourceKernel_); 64 | 65 | std::cout << "Number of Iterations: " << numIterations_ << "\n"; 66 | std::cout << "Total Time: " << elapsed << " sec\n"; 67 | std::cout << "Average Time: " << average << " sec\n"; 68 | 69 | std::cout << "------------------------------\n"; 70 | std::cout << "* Binary Kernel\n"; 71 | std::cout << "------------------------------\n"; 72 | setupKernel(binaryKernel_); 73 | timeKernel(binaryKernel_, elapsed, average); 74 | finishKernel(binaryKernel_); 75 | 76 | std::cout << "Number of Iterations: " << numIterations_ << "\n"; 77 | std::cout << "Total Time: " << elapsed << " sec\n"; 78 | std::cout << "Average Time: " << average << " sec\n"; 79 | } 80 | 81 | void OCLSample::timeKernel(cl::Kernel kernel, double& elapsed, 82 | double& average) { 83 | cl::Event* events = new cl::Event[numIterations_]; 84 | cl_int result; 85 | 86 | elapsed = 0.0; 87 | 88 | for(unsigned i = 0; i < numIterations_; ++i) { 89 | cl_ulong start, end; 90 | runKernel(kernel, &events[i]); 91 | 92 | queue_.flush(); 93 | events[i].wait(); 94 | 95 | result = events[i].getProfilingInfo(CL_PROFILING_COMMAND_START, 96 | &start); 97 | assert(result == CL_SUCCESS && "Unable to get profiling information"); 98 | result = events[i].getProfilingInfo(CL_PROFILING_COMMAND_END, 99 | &end); 100 | assert(result == CL_SUCCESS && "Unable to get profiling information"); 101 | elapsed += (double)1e-9 * (end - start); 102 | } 103 | 104 | average = elapsed / (double)numIterations_; 105 | 106 | delete [] events; 107 | } 108 | 109 | cl::Program OCLSample::compileSource(const std::string& filename) { 110 | cl_int result; 111 | 112 | std::ifstream kernelStream(filename.c_str()); 113 | std::string source(std::istreambuf_iterator(kernelStream), 114 | (std::istreambuf_iterator())); 115 | kernelStream.close(); 116 | 117 | cl::Program::Sources sources(1, std::make_pair(source.c_str(), 118 | source.size())); 119 | cl::Program program(context_, sources, &result); 120 | assert(result == CL_SUCCESS && "Failed to load program source"); 121 | std::vector devices; 122 | devices.push_back(device_); 123 | result = program.build(devices); 124 | if(result != CL_SUCCESS) { 125 | std::cerr << "Source compilation failed.\n"; 126 | std::cerr << program.getBuildInfo(device_); 127 | assert(false && "Unable to continue"); 128 | } 129 | return program; 130 | } 131 | 132 | cl::Program OCLSample::loadBinary(const std::string& filename) { 133 | cl_int result; 134 | 135 | std::ifstream kernelStream(filename.c_str()); 136 | std::string binary(std::istreambuf_iterator(kernelStream), 137 | (std::istreambuf_iterator())); 138 | kernelStream.close(); 139 | 140 | cl::Program::Binaries binaries(1, std::make_pair(binary.c_str(), 141 | binary.size())); 142 | std::vector devices; 143 | devices.push_back(device_); 144 | cl::Program program(context_, devices, binaries, NULL, &result); 145 | assert(result == CL_SUCCESS && "Failed to load program source"); 146 | result = program.build(devices); 147 | if(result != CL_SUCCESS) { 148 | std::cerr << "Source compilation failed.\n"; 149 | std::cerr << program.getBuildInfo(device_); 150 | assert(false && "Unable to continue"); 151 | } 152 | return program; 153 | } 154 | 155 | void OCLSample::initOpenCL() { 156 | cl_int result; 157 | 158 | // First, select an OpenCL platform 159 | typedef std::vector PlatformVector; 160 | PlatformVector platforms; 161 | result = cl::Platform::get(&platforms); 162 | assert(result == CL_SUCCESS && "Failed to retrieve OpenCL platform"); 163 | assert(platforms.size() > 0 && "No OpenCL platforms found"); 164 | 165 | // For now, just blindly select the first platform. 166 | // @TODO: Implement a check here for the NVidia OpenCL platform. 167 | platform_ = platforms[0]; 168 | 169 | // Create an OpenCL context. 170 | cl_context_properties cps[] = { CL_CONTEXT_PLATFORM, 171 | (cl_context_properties)(platform_)(), 0 }; 172 | context_ = cl::Context(CL_DEVICE_TYPE_GPU, cps, NULL, NULL, &result); 173 | assert(result == CL_SUCCESS && "Failed to create OpenCL context"); 174 | 175 | // Retrieve the available OpenCL devices. 176 | typedef std::vector DeviceVector; 177 | DeviceVector devices; 178 | devices = context_.getInfo(); 179 | assert(devices.size() > 0 && "No OpenCL devices found"); 180 | 181 | // For now, just blindly select the first device. 182 | // @TODO: Implement some sort of device check here. 183 | device_ = devices[0]; 184 | 185 | // Create a command queue 186 | queue_ = cl::CommandQueue(context_, device_, CL_QUEUE_PROFILING_ENABLE, &result); 187 | assert(result == CL_SUCCESS && "Failed to create command queue"); 188 | } 189 | -------------------------------------------------------------------------------- /common/OCLSample.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 by Justin Holewinski 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | #if !defined(OCL_SAMPLE_HPP_INC) 24 | #define OCL_SAMPLE_HPP_INC 1 25 | 26 | #include "common/cl.hpp" 27 | #include "common/Sample.hpp" 28 | /** 29 | * Base class for OpenCL samples. 30 | */ 31 | class OCLSample : public Sample { 32 | public: 33 | 34 | OCLSample(); 35 | 36 | virtual ~OCLSample(); 37 | 38 | virtual void run(); 39 | 40 | protected: 41 | 42 | /** 43 | * Hook for samples to perform any initialization. 44 | */ 45 | virtual void initialize(); 46 | 47 | /** 48 | * Hook for samples to allocate any needed memory buffers. 49 | */ 50 | virtual void createMemoryBuffers(); 51 | 52 | /** 53 | * Hook for samples to perform any kernel setup, such as copying data from 54 | * the host to the device. 55 | */ 56 | virtual void setupKernel(cl::Kernel kernel); 57 | 58 | /** 59 | * Hook for samples to perform any kernel finalization, such as copying data 60 | * from the device to the host. */ 61 | virtual void finishKernel(cl::Kernel kernel); 62 | 63 | /** 64 | * Hook for samples to run the requested kernel. 65 | */ 66 | virtual void runKernel(cl::Kernel kernel, cl::Event* evt); 67 | 68 | cl::Program compileSource(const std::string& source); 69 | cl::Program loadBinary(const std::string& binary); 70 | 71 | void setSourceKernel(cl::Kernel kernel) { 72 | sourceKernel_ = kernel; 73 | } 74 | 75 | cl::Kernel getSourceKernel() { 76 | return sourceKernel_; 77 | } 78 | 79 | void setBinaryKernel(cl::Kernel kernel) { 80 | binaryKernel_ = kernel; 81 | } 82 | 83 | cl::Kernel getBinaryKernel() { 84 | return binaryKernel_; 85 | } 86 | 87 | cl::Context& getContext() { 88 | return context_; 89 | } 90 | 91 | cl::CommandQueue& getCommandQueue() { 92 | return queue_; 93 | } 94 | 95 | unsigned getNumberOfIterations() const { 96 | return numIterations_; 97 | } 98 | 99 | void setNumberOfIterations(unsigned iters) { 100 | numIterations_ = iters; 101 | } 102 | 103 | private: 104 | 105 | void initOpenCL(); 106 | void timeKernel(cl::Kernel kernel, double& elapsed, double& average); 107 | 108 | cl::Platform platform_; 109 | cl::Device device_; 110 | cl::Context context_; 111 | cl::CommandQueue queue_; 112 | cl::Kernel sourceKernel_; 113 | cl::Kernel binaryKernel_; 114 | unsigned numIterations_; 115 | 116 | }; 117 | 118 | #endif 119 | -------------------------------------------------------------------------------- /common/Sample.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 by Justin Holewinski 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | #if !defined(SAMPLE_HPP_INC) 24 | #define SAMPLE_HPP_INC 1 25 | 26 | /** 27 | * Base class from which all samples inherit. 28 | */ 29 | class Sample { 30 | }; 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /common/cl.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2010 The Khronos Group Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and/or associated documentation files (the 6 | * "Materials"), to deal in the Materials without restriction, including 7 | * without limitation the rights to use, copy, modify, merge, publish, 8 | * distribute, sublicense, and/or sell copies of the Materials, and to 9 | * permit persons to whom the Materials are furnished to do so, subject to 10 | * the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included 13 | * in all copies or substantial portions of the Materials. 14 | * 15 | * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 22 | ******************************************************************************/ 23 | 24 | /*! \file 25 | * 26 | * \brief C++ bindings for OpenCL 1.0 (rev 48) and OpenCL 1.1 (rev 33) 27 | * \author Benedict R. Gaster and Laurent Morichetti 28 | * 29 | * Additions and fixes from Brian Cole, March 3rd 2010. 30 | * 31 | * \version 1.1 32 | * \date June 2010 33 | * 34 | * Optional extension support 35 | * 36 | * cl 37 | * cl_ext_device_fission 38 | * #define USE_CL_DEVICE_FISSION 39 | */ 40 | 41 | /*! \mainpage 42 | * \section intro Introduction 43 | * For many large applications C++ is the language of choice and so it seems 44 | * reasonable to define C++ bindings for OpenCL. 45 | * 46 | * 47 | * The interface is contained with a single C++ header file \em cl.hpp and all 48 | * definitions are contained within the namespace \em cl. There is no additional 49 | * requirement to include \em cl.h and to use either the C++ or original C 50 | * bindings it is enough to simply include \em cl.hpp. 51 | * 52 | * The bindings themselves are lightweight and correspond closely to the 53 | * underlying C API. Using the C++ bindings introduces no additional execution 54 | * overhead. 55 | * 56 | * For detail documentation on the bindings see: 57 | * 58 | * The OpenCL C++ Wrapper API 1.1 (revision 04) 59 | * http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.1.pdf 60 | * 61 | * \section example Example 62 | * 63 | * The following example shows a general use case for the C++ 64 | * bindings, including support for the optional exception feature and 65 | * also the supplied vector and string classes, see following sections for 66 | * decriptions of these features. 67 | * 68 | * \code 69 | * #define __CL_ENABLE_EXCEPTIONS 70 | * 71 | * #if defined(__APPLE__) || defined(__MACOSX) 72 | * #include 73 | * #else 74 | * #include 75 | * #endif 76 | * #include 77 | * #include 78 | * #include 79 | * 80 | * const char * helloStr = "__kernel void " 81 | * "hello(void) " 82 | * "{ " 83 | * " " 84 | * "} "; 85 | * 86 | * int 87 | * main(void) 88 | * { 89 | * cl_int err = CL_SUCCESS; 90 | * try { 91 | * 92 | * std::vector platforms; 93 | * cl::Platform::get(&platforms); 94 | * if (platforms.size() == 0) { 95 | * std::cout << "Platform size 0\n"; 96 | * return -1; 97 | * } 98 | * 99 | * cl_context_properties properties[] = 100 | * { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0}; 101 | * cl::Context context(CL_DEVICE_TYPE_CPU, properties); 102 | * 103 | * std::vector devices = context.getInfo(); 104 | * 105 | * cl::Program::Sources source(1, 106 | * std::make_pair(helloStr,strlen(helloStr))); 107 | * cl::Program program_ = cl::Program(context, source); 108 | * program_.build(devices); 109 | * 110 | * cl::Kernel kernel(program_, "hello", &err); 111 | * 112 | * cl::Event event; 113 | * cl::CommandQueue queue(context, devices[0], 0, &err); 114 | * queue.enqueueNDRangeKernel( 115 | * kernel, 116 | * cl::NullRange, 117 | * cl::NDRange(4,4), 118 | * cl::NullRange, 119 | * NULL, 120 | * &event); 121 | * 122 | * event.wait(); 123 | * } 124 | * catch (cl::Error err) { 125 | * std::cerr 126 | * << "ERROR: " 127 | * << err.what() 128 | * << "(" 129 | * << err.err() 130 | * << ")" 131 | * << std::endl; 132 | * } 133 | * 134 | * return EXIT_SUCCESS; 135 | * } 136 | * 137 | * \endcode 138 | * 139 | */ 140 | #ifndef CL_HPP_ 141 | #define CL_HPP_ 142 | 143 | #ifdef _WIN32 144 | #include 145 | #include 146 | #if defined(USE_DX_INTEROP) 147 | #include 148 | #endif 149 | #endif // _WIN32 150 | 151 | // 152 | #if defined(USE_CL_DEVICE_FISSION) 153 | #include 154 | #endif 155 | 156 | #if defined(__APPLE__) || defined(__MACOSX) 157 | #include 158 | #include 159 | #else 160 | #include 161 | #include 162 | #endif // !__APPLE__ 163 | 164 | #if !defined(CL_CALLBACK) 165 | #define CL_CALLBACK 166 | #endif //CL_CALLBACK 167 | 168 | #include 169 | 170 | #if !defined(__NO_STD_VECTOR) 171 | #include 172 | #endif 173 | 174 | #if !defined(__NO_STD_STRING) 175 | #include 176 | #endif 177 | 178 | #if defined(linux) || defined(__APPLE__) || defined(__MACOSX) 179 | # include 180 | #endif // linux 181 | 182 | #include 183 | 184 | /*! \namespace cl 185 | * 186 | * \brief The OpenCL C++ bindings are defined within this namespace. 187 | * 188 | */ 189 | namespace cl { 190 | 191 | #define __INIT_CL_EXT_FCN_PTR(name) \ 192 | if(!pfn_##name) { \ 193 | pfn_##name = (PFN_##name) \ 194 | clGetExtensionFunctionAddress(#name); \ 195 | if(!pfn_##name) { \ 196 | } \ 197 | } 198 | 199 | class Program; 200 | class Device; 201 | class Context; 202 | class CommandQueue; 203 | class Memory; 204 | 205 | #if defined(__CL_ENABLE_EXCEPTIONS) 206 | #include 207 | /*! \class Error 208 | * \brief Exception class 209 | */ 210 | class Error : public std::exception 211 | { 212 | private: 213 | cl_int err_; 214 | const char * errStr_; 215 | public: 216 | /*! Create a new CL error exception for a given error code 217 | * and corresponding message. 218 | */ 219 | Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr) 220 | {} 221 | 222 | ~Error() throw() {} 223 | 224 | /*! \brief Get error string associated with exception 225 | * 226 | * \return A memory pointer to the error message string. 227 | */ 228 | virtual const char * what() const throw () 229 | { 230 | if (errStr_ == NULL) { 231 | return "empty"; 232 | } 233 | else { 234 | return errStr_; 235 | } 236 | } 237 | 238 | /*! \brief Get error code associated with exception 239 | * 240 | * \return The error code. 241 | */ 242 | const cl_int err(void) const { return err_; } 243 | }; 244 | 245 | #define __ERR_STR(x) #x 246 | #else 247 | #define __ERR_STR(x) NULL 248 | #endif // __CL_ENABLE_EXCEPTIONS 249 | 250 | //! \cond DOXYGEN_DETAIL 251 | #if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) 252 | #define __GET_DEVICE_INFO_ERR __ERR_STR(clgetDeviceInfo) 253 | #define __GET_PLATFORM_INFO_ERR __ERR_STR(clGetPlatformInfo) 254 | #define __GET_DEVICE_IDS_ERR __ERR_STR(clGetDeviceIDs) 255 | #define __GET_PLATFORM_IDS_ERR __ERR_STR(clGetPlatformIDs) 256 | #define __GET_CONTEXT_INFO_ERR __ERR_STR(clGetContextInfo) 257 | #define __GET_EVENT_INFO_ERR __ERR_STR(clGetEventInfo) 258 | #define __GET_EVENT_PROFILE_INFO_ERR __ERR_STR(clGetEventProfileInfo) 259 | #define __GET_MEM_OBJECT_INFO_ERR __ERR_STR(clGetMemObjectInfo) 260 | #define __GET_IMAGE_INFO_ERR __ERR_STR(clGetImageInfo) 261 | #define __GET_SAMPLER_INFO_ERR __ERR_STR(clGetSamplerInfo) 262 | #define __GET_KERNEL_INFO_ERR __ERR_STR(clGetKernelInfo) 263 | #define __GET_KERNEL_WORK_GROUP_INFO_ERR __ERR_STR(clGetKernelWorkGroupInfo) 264 | #define __GET_PROGRAM_INFO_ERR __ERR_STR(clGetProgramInfo) 265 | #define __GET_PROGRAM_BUILD_INFO_ERR __ERR_STR(clGetProgramBuildInfo) 266 | #define __GET_COMMAND_QUEUE_INFO_ERR __ERR_STR(clGetCommandQueueInfo) 267 | 268 | #define __CREATE_CONTEXT_FROM_TYPE_ERR __ERR_STR(clCreateContextFromType) 269 | #define __GET_SUPPORTED_IMAGE_FORMATS_ERR __ERR_STR(clGetSupportedImageFormats) 270 | 271 | #define __CREATE_BUFFER_ERR __ERR_STR(clCreateBuffer) 272 | #define __CREATE_SUBBUFFER_ERR __ERR_STR(clCreateSubBuffer) 273 | #define __CREATE_GL_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer) 274 | #define __GET_GL_OBJECT_INFO_ERR __ERR_STR(clGetGLObjectInfo) 275 | #define __CREATE_IMAGE2D_ERR __ERR_STR(clCreateImage2D) 276 | #define __CREATE_IMAGE3D_ERR __ERR_STR(clCreateImage3D) 277 | #define __CREATE_SAMPLER_ERR __ERR_STR(clCreateSampler) 278 | #define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback) 279 | 280 | #define __CREATE_USER_EVENT_ERR __ERR_STR(clCreateUserEvent) 281 | #define __SET_USER_EVENT_STATUS_ERR __ERR_STR(clSetUserEventStatus) 282 | #define __SET_EVENT_CALLBACK_ERR __ERR_STR(clSetEventCallback) 283 | #define __WAIT_FOR_EVENTS_ERR __ERR_STR(clWaitForEvents) 284 | 285 | #define __CREATE_KERNEL_ERR __ERR_STR(clCreateKernel) 286 | #define __SET_KERNEL_ARGS_ERR __ERR_STR(clSetKernelArg) 287 | #define __CREATE_PROGRAM_WITH_SOURCE_ERR __ERR_STR(clCreateProgramWithSource) 288 | #define __CREATE_PROGRAM_WITH_BINARY_ERR __ERR_STR(clCreateProgramWithBinary) 289 | #define __BUILD_PROGRAM_ERR __ERR_STR(clBuildProgram) 290 | #define __CREATE_KERNELS_IN_PROGRAM_ERR __ERR_STR(clCreateKernelsInProgram) 291 | 292 | #define __CREATE_COMMAND_QUEUE_ERR __ERR_STR(clCreateCommandQueue) 293 | #define __SET_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clSetCommandQueueProperty) 294 | #define __ENQUEUE_READ_BUFFER_ERR __ERR_STR(clEnqueueReadBuffer) 295 | #define __ENQUEUE_READ_BUFFER_RECT_ERR __ERR_STR(clEnqueueReadBufferRect) 296 | #define __ENQUEUE_WRITE_BUFFER_ERR __ERR_STR(clEnqueueWriteBuffer) 297 | #define __ENQUEUE_WRITE_BUFFER_RECT_ERR __ERR_STR(clEnqueueWriteBufferRect) 298 | #define __ENQEUE_COPY_BUFFER_ERR __ERR_STR(clEnqueueCopyBuffer) 299 | #define __ENQEUE_COPY_BUFFER_RECT_ERR __ERR_STR(clEnqueueCopyBufferRect) 300 | #define __ENQUEUE_READ_IMAGE_ERR __ERR_STR(clEnqueueReadImage) 301 | #define __ENQUEUE_WRITE_IMAGE_ERR __ERR_STR(clEnqueueWriteImage) 302 | #define __ENQUEUE_COPY_IMAGE_ERR __ERR_STR(clEnqueueCopyImage) 303 | #define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR __ERR_STR(clEnqueueCopyImageToBuffer) 304 | #define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR __ERR_STR(clEnqueueCopyBufferToImage) 305 | #define __ENQUEUE_MAP_BUFFER_ERR __ERR_STR(clEnqueueMapBuffer) 306 | #define __ENQUEUE_MAP_IMAGE_ERR __ERR_STR(clEnqueueMapImage) 307 | #define __ENQUEUE_UNMAP_MEM_OBJECT_ERR __ERR_STR(clEnqueueUnMapMemObject) 308 | #define __ENQUEUE_NDRANGE_KERNEL_ERR __ERR_STR(clEnqueueNDRangeKernel) 309 | #define __ENQUEUE_TASK_ERR __ERR_STR(clEnqueueTask) 310 | #define __ENQUEUE_NATIVE_KERNEL __ERR_STR(clEnqueueNativeKernel) 311 | #define __ENQUEUE_MARKER_ERR __ERR_STR(clEnqueueMarker) 312 | #define __ENQUEUE_WAIT_FOR_EVENTS_ERR __ERR_STR(clEnqueueWaitForEvents) 313 | #define __ENQUEUE_BARRIER_ERR __ERR_STR(clEnqueueBarrier) 314 | 315 | #define __ENQUEUE_ACQUIRE_GL_ERR __ERR_STR(clEnqueueAcquireGLObjects) 316 | #define __ENQUEUE_RELEASE_GL_ERR __ERR_STR(clEnqueueReleaseGLObjects) 317 | 318 | #define __UNLOAD_COMPILER_ERR __ERR_STR(clUnloadCompiler) 319 | 320 | #define __FLUSH_ERR __ERR_STR(clFlush) 321 | #define __FINISH_ERR __ERR_STR(clFinish) 322 | 323 | #define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevicesEXT) 324 | #endif // __CL_USER_OVERRIDE_ERROR_STRINGS 325 | //! \endcond 326 | 327 | /*! \class string 328 | * \brief Simple string class, that provides a limited subset of std::string 329 | * functionality but avoids many of the issues that come with that class. 330 | */ 331 | class string 332 | { 333 | private: 334 | ::size_t size_; 335 | char * str_; 336 | public: 337 | string(void) : size_(0), str_(NULL) 338 | { 339 | } 340 | 341 | string(char * str, ::size_t size) : 342 | size_(size), 343 | str_(NULL) 344 | { 345 | str_ = new char[size_+1]; 346 | if (str_ != NULL) { 347 | memcpy(str_, str, size_ * sizeof(char)); 348 | str_[size_] = '\0'; 349 | } 350 | else { 351 | size_ = 0; 352 | } 353 | } 354 | 355 | string(char * str) : 356 | str_(NULL) 357 | { 358 | size_= ::strlen(str); 359 | str_ = new char[size_ + 1]; 360 | if (str_ != NULL) { 361 | memcpy(str_, str, (size_ + 1) * sizeof(char)); 362 | } 363 | else { 364 | size_ = 0; 365 | } 366 | } 367 | 368 | string& operator=(const string& rhs) 369 | { 370 | if (this == &rhs) { 371 | return *this; 372 | } 373 | 374 | if (rhs.size_ == 0 || rhs.str_ == NULL) { 375 | size_ = 0; 376 | str_ = NULL; 377 | } 378 | else { 379 | size_ = rhs.size_; 380 | str_ = new char[size_ + 1]; 381 | if (str_ != NULL) { 382 | memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char)); 383 | } 384 | else { 385 | size_ = 0; 386 | } 387 | } 388 | 389 | return *this; 390 | } 391 | 392 | string(const string& rhs) 393 | { 394 | *this = rhs; 395 | } 396 | 397 | ~string() 398 | { 399 | if (str_ != NULL) { 400 | delete[] str_; 401 | } 402 | } 403 | 404 | ::size_t size(void) const { return size_; } 405 | ::size_t length(void) const { return size(); } 406 | 407 | const char * c_str(void) const { return (str_) ? str_ : "";} 408 | }; 409 | 410 | #if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING) 411 | #include 412 | typedef std::string STRING_CLASS; 413 | #elif !defined(__USE_DEV_STRING) 414 | typedef cl::string STRING_CLASS; 415 | #endif 416 | 417 | #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR) 418 | #include 419 | #define VECTOR_CLASS std::vector 420 | #elif !defined(__USE_DEV_VECTOR) 421 | #define VECTOR_CLASS cl::vector 422 | #endif 423 | 424 | #if !defined(__MAX_DEFAULT_VECTOR_SIZE) 425 | #define __MAX_DEFAULT_VECTOR_SIZE 10 426 | #endif 427 | 428 | /*! \class vector 429 | * \brief Fixed sized vector implementation that mirroring 430 | * std::vector functionality. 431 | */ 432 | template 433 | class vector 434 | { 435 | private: 436 | T data_[N]; 437 | unsigned int size_; 438 | bool empty_; 439 | public: 440 | vector() : 441 | size_(-1), 442 | empty_(true) 443 | {} 444 | 445 | ~vector() {} 446 | 447 | unsigned int size(void) const 448 | { 449 | return size_ + 1; 450 | } 451 | 452 | void clear() 453 | { 454 | size_ = -1; 455 | empty_ = true; 456 | } 457 | 458 | void push_back (const T& x) 459 | { 460 | if (size() < N) { 461 | size_++; 462 | data_[size_] = x; 463 | empty_ = false; 464 | } 465 | } 466 | 467 | void pop_back(void) 468 | { 469 | if (!empty_) { 470 | data_[size_].~T(); 471 | size_--; 472 | if (size_ == -1) { 473 | empty_ = true; 474 | } 475 | } 476 | } 477 | 478 | vector(const vector& vec) : 479 | size_(vec.size_), 480 | empty_(vec.empty_) 481 | { 482 | if (!empty_) { 483 | memcpy(&data_[0], &vec.data_[0], size() * sizeof(T)); 484 | } 485 | } 486 | 487 | vector(unsigned int size, const T& val = T()) : 488 | size_(-1), 489 | empty_(true) 490 | { 491 | for (unsigned int i = 0; i < size; i++) { 492 | push_back(val); 493 | } 494 | } 495 | 496 | vector& operator=(const vector& rhs) 497 | { 498 | if (this == &rhs) { 499 | return *this; 500 | } 501 | 502 | size_ = rhs.size_; 503 | empty_ = rhs.empty_; 504 | 505 | if (!empty_) { 506 | memcpy(&data_[0], &rhs.data_[0], size() * sizeof(T)); 507 | } 508 | 509 | return *this; 510 | } 511 | 512 | bool operator==(vector &vec) 513 | { 514 | if (empty_ && vec.empty_) { 515 | return true; 516 | } 517 | 518 | if (size() != vec.size()) { 519 | return false; 520 | } 521 | 522 | return memcmp(&data_[0], &vec.data_[0], size() * sizeof(T)) == 0 ? true : false; 523 | } 524 | 525 | operator T* () { return data_; } 526 | operator const T* () const { return data_; } 527 | 528 | bool empty (void) const 529 | { 530 | return empty_; 531 | } 532 | 533 | unsigned int max_size (void) const 534 | { 535 | return N; 536 | } 537 | 538 | unsigned int capacity () const 539 | { 540 | return sizeof(T) * N; 541 | } 542 | 543 | T& operator[](int index) 544 | { 545 | return data_[index]; 546 | } 547 | 548 | T operator[](int index) const 549 | { 550 | return data_[index]; 551 | } 552 | 553 | template 554 | void assign(I start, I end) 555 | { 556 | clear(); 557 | while(start < end) { 558 | push_back(*start); 559 | start++; 560 | } 561 | } 562 | 563 | /*! \class iterator 564 | * \brief Iterator class for vectors 565 | */ 566 | class iterator 567 | { 568 | private: 569 | vector vec_; 570 | int index_; 571 | bool initialized_; 572 | public: 573 | iterator(void) : 574 | index_(-1), 575 | initialized_(false) 576 | { 577 | index_ = -1; 578 | initialized_ = false; 579 | } 580 | 581 | ~iterator(void) {} 582 | 583 | static iterator begin(vector &vec) 584 | { 585 | iterator i; 586 | 587 | if (!vec.empty()) { 588 | i.index_ = 0; 589 | } 590 | 591 | i.vec_ = vec; 592 | i.initialized_ = true; 593 | return i; 594 | } 595 | 596 | static iterator end(vector &vec) 597 | { 598 | iterator i; 599 | 600 | if (!vec.empty()) { 601 | i.index_ = vec.size(); 602 | } 603 | i.vec_ = vec; 604 | i.initialized_ = true; 605 | return i; 606 | } 607 | 608 | bool operator==(iterator i) 609 | { 610 | return ((vec_ == i.vec_) && 611 | (index_ == i.index_) && 612 | (initialized_ == i.initialized_)); 613 | } 614 | 615 | bool operator!=(iterator i) 616 | { 617 | return (!(*this==i)); 618 | } 619 | 620 | void operator++() 621 | { 622 | index_++; 623 | } 624 | 625 | void operator++(int x) 626 | { 627 | index_ += x; 628 | } 629 | 630 | void operator--() 631 | { 632 | index_--; 633 | } 634 | 635 | void operator--(int x) 636 | { 637 | index_ -= x; 638 | } 639 | 640 | T operator *() 641 | { 642 | return vec_[index_]; 643 | } 644 | }; 645 | 646 | iterator begin(void) 647 | { 648 | return iterator::begin(*this); 649 | } 650 | 651 | iterator end(void) 652 | { 653 | return iterator::end(*this); 654 | } 655 | 656 | T& front(void) 657 | { 658 | return data_[0]; 659 | } 660 | 661 | T& back(void) 662 | { 663 | return data_[size_]; 664 | } 665 | 666 | const T& front(void) const 667 | { 668 | return data_[0]; 669 | } 670 | 671 | const T& back(void) const 672 | { 673 | return data_[size_]; 674 | } 675 | }; 676 | 677 | /*! 678 | * \brief size_t class used to interface between C++ and 679 | * OpenCL C calls that require arrays of size_t values, who's 680 | * size is known statically. 681 | */ 682 | template 683 | struct size_t : public cl::vector< ::size_t, N> { }; 684 | 685 | namespace detail { 686 | 687 | // GetInfo help struct 688 | template 689 | struct GetInfoHelper 690 | { 691 | static cl_int 692 | get(Functor f, cl_uint name, T* param) 693 | { 694 | return f(name, sizeof(T), param, NULL); 695 | } 696 | }; 697 | 698 | // Specialized GetInfoHelper for VECTOR_CLASS params 699 | template 700 | struct GetInfoHelper > 701 | { 702 | static cl_int get(Func f, cl_uint name, VECTOR_CLASS* param) 703 | { 704 | ::size_t required; 705 | cl_int err = f(name, 0, NULL, &required); 706 | if (err != CL_SUCCESS) { 707 | return err; 708 | } 709 | 710 | T* value = (T*) alloca(required); 711 | err = f(name, required, value, NULL); 712 | if (err != CL_SUCCESS) { 713 | return err; 714 | } 715 | 716 | param->assign(&value[0], &value[required/sizeof(T)]); 717 | return CL_SUCCESS; 718 | } 719 | }; 720 | 721 | // Specialized for getInfo 722 | template 723 | struct GetInfoHelper > 724 | { 725 | static cl_int 726 | get(Func f, cl_uint name, VECTOR_CLASS* param) 727 | { 728 | cl_uint err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL); 729 | if (err != CL_SUCCESS) { 730 | return err; 731 | } 732 | 733 | return CL_SUCCESS; 734 | } 735 | }; 736 | 737 | // Specialized GetInfoHelper for STRING_CLASS params 738 | template 739 | struct GetInfoHelper 740 | { 741 | static cl_int get(Func f, cl_uint name, STRING_CLASS* param) 742 | { 743 | ::size_t required; 744 | cl_int err = f(name, 0, NULL, &required); 745 | if (err != CL_SUCCESS) { 746 | return err; 747 | } 748 | 749 | char* value = (char*) alloca(required); 750 | err = f(name, required, value, NULL); 751 | if (err != CL_SUCCESS) { 752 | return err; 753 | } 754 | 755 | *param = value; 756 | return CL_SUCCESS; 757 | } 758 | }; 759 | 760 | #define __GET_INFO_HELPER_WITH_RETAIN(CPP_TYPE) \ 761 | namespace detail { \ 762 | template \ 763 | struct GetInfoHelper \ 764 | { \ 765 | static cl_int get(Func f, cl_uint name, CPP_TYPE* param) \ 766 | { \ 767 | cl_uint err = f(name, sizeof(CPP_TYPE), param, NULL); \ 768 | if (err != CL_SUCCESS) { \ 769 | return err; \ 770 | } \ 771 | \ 772 | return ReferenceHandler::retain((*param)()); \ 773 | } \ 774 | }; \ 775 | } 776 | 777 | 778 | #define __PARAM_NAME_INFO_1_0(F) \ 779 | F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \ 780 | F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \ 781 | F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \ 782 | F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \ 783 | F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \ 784 | \ 785 | F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \ 786 | F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \ 787 | F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \ 788 | F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \ 789 | F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \ 790 | F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \ 791 | F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \ 792 | F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \ 793 | F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \ 794 | F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \ 795 | F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \ 796 | F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \ 797 | F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \ 798 | F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_bitfield) \ 799 | F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \ 800 | F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \ 801 | F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \ 802 | F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \ 803 | F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \ 804 | F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \ 805 | F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \ 806 | F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \ 807 | F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_uint) \ 808 | F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \ 809 | F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \ 810 | F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \ 811 | F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \ 812 | F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \ 813 | F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \ 814 | F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\ 815 | F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \ 816 | F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \ 817 | F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \ 818 | F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \ 819 | F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \ 820 | F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \ 821 | F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \ 822 | F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \ 823 | F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \ 824 | F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \ 825 | F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \ 826 | F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \ 827 | F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \ 828 | F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \ 829 | F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \ 830 | F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \ 831 | F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \ 832 | F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \ 833 | F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \ 834 | F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \ 835 | \ 836 | F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \ 837 | F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS) \ 838 | F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS) \ 839 | \ 840 | F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \ 841 | F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \ 842 | F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \ 843 | F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_uint) \ 844 | \ 845 | F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \ 846 | F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \ 847 | F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \ 848 | F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \ 849 | \ 850 | F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \ 851 | F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \ 852 | F(cl_mem_info, CL_MEM_SIZE, ::size_t) \ 853 | F(cl_mem_info, CL_MEM_HOST_PTR, void*) \ 854 | F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \ 855 | F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \ 856 | F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \ 857 | \ 858 | F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \ 859 | F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \ 860 | F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \ 861 | F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \ 862 | F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \ 863 | F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \ 864 | F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \ 865 | \ 866 | F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \ 867 | F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \ 868 | F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_addressing_mode) \ 869 | F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_filter_mode) \ 870 | F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_bool) \ 871 | \ 872 | F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \ 873 | F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \ 874 | F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \ 875 | F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS) \ 876 | F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \ 877 | F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \ 878 | F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS) \ 879 | \ 880 | F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \ 881 | F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \ 882 | F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \ 883 | \ 884 | F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \ 885 | F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \ 886 | F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \ 887 | F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \ 888 | F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \ 889 | \ 890 | F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \ 891 | F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \ 892 | F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \ 893 | \ 894 | F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \ 895 | F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \ 896 | F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \ 897 | F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties) 898 | 899 | #if defined(CL_VERSION_1_1) 900 | #define __PARAM_NAME_INFO_1_1(F) \ 901 | F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\ 902 | F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \ 903 | F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \ 904 | F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \ 905 | F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \ 906 | F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \ 907 | F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \ 908 | F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \ 909 | F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \ 910 | F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \ 911 | F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \ 912 | F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \ 913 | \ 914 | F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \ 915 | F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \ 916 | \ 917 | F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \ 918 | F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \ 919 | \ 920 | F(cl_event_info, CL_EVENT_CONTEXT, cl::Context) 921 | #endif // CL_VERSION_1_1 922 | 923 | #if defined(USE_CL_DEVICE_FISSION) 924 | #define __PARAM_NAME_DEVICE_FISSION(F) \ 925 | F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \ 926 | F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS) \ 927 | F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS) \ 928 | F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \ 929 | F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS) 930 | #endif // USE_CL_DEVICE_FISSION 931 | 932 | template 933 | struct param_traits {}; 934 | 935 | #define __DECLARE_PARAM_TRAITS(token, param_name, T) \ 936 | struct token; \ 937 | template<> \ 938 | struct param_traits \ 939 | { \ 940 | enum { value = param_name }; \ 941 | typedef T param_type; \ 942 | }; 943 | 944 | __PARAM_NAME_INFO_1_0(__DECLARE_PARAM_TRAITS); 945 | #if defined(CL_VERSION_1_1) 946 | __PARAM_NAME_INFO_1_1(__DECLARE_PARAM_TRAITS); 947 | #endif // CL_VERSION_1_1 948 | 949 | #if defined(USE_CL_DEVICE_FISSION) 950 | __PARAM_NAME_DEVICE_FISSION(__DECLARE_PARAM_TRAITS); 951 | #endif // USE_CL_DEVICE_FISSION 952 | 953 | #undef __DECLARE_PARAM_TRAITS 954 | 955 | // Convenience functions 956 | 957 | template 958 | inline cl_int 959 | getInfo(Func f, cl_uint name, T* param) 960 | { 961 | return GetInfoHelper::get(f, name, param); 962 | } 963 | 964 | template 965 | struct GetInfoFunctor0 966 | { 967 | Func f_; const Arg0& arg0_; 968 | cl_int operator ()( 969 | cl_uint param, ::size_t size, void* value, ::size_t* size_ret) 970 | { return f_(arg0_, param, size, value, size_ret); } 971 | }; 972 | 973 | template 974 | struct GetInfoFunctor1 975 | { 976 | Func f_; const Arg0& arg0_; const Arg1& arg1_; 977 | cl_int operator ()( 978 | cl_uint param, ::size_t size, void* value, ::size_t* size_ret) 979 | { return f_(arg0_, arg1_, param, size, value, size_ret); } 980 | }; 981 | 982 | template 983 | inline cl_int 984 | getInfo(Func f, const Arg0& arg0, cl_uint name, T* param) 985 | { 986 | GetInfoFunctor0 f0 = { f, arg0 }; 987 | return GetInfoHelper, T> 988 | ::get(f0, name, param); 989 | } 990 | 991 | template 992 | inline cl_int 993 | getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param) 994 | { 995 | GetInfoFunctor1 f0 = { f, arg0, arg1 }; 996 | return GetInfoHelper, T> 997 | ::get(f0, name, param); 998 | } 999 | 1000 | template 1001 | struct ReferenceHandler 1002 | { }; 1003 | 1004 | template <> 1005 | struct ReferenceHandler 1006 | { 1007 | // cl_device_id does not have retain(). 1008 | static cl_int retain(cl_device_id) 1009 | { return CL_INVALID_DEVICE; } 1010 | // cl_device_id does not have release(). 1011 | static cl_int release(cl_device_id) 1012 | { return CL_INVALID_DEVICE; } 1013 | }; 1014 | 1015 | template <> 1016 | struct ReferenceHandler 1017 | { 1018 | // cl_platform_id does not have retain(). 1019 | static cl_int retain(cl_platform_id) 1020 | { return CL_INVALID_PLATFORM; } 1021 | // cl_platform_id does not have release(). 1022 | static cl_int release(cl_platform_id) 1023 | { return CL_INVALID_PLATFORM; } 1024 | }; 1025 | 1026 | template <> 1027 | struct ReferenceHandler 1028 | { 1029 | static cl_int retain(cl_context context) 1030 | { return ::clRetainContext(context); } 1031 | static cl_int release(cl_context context) 1032 | { return ::clReleaseContext(context); } 1033 | }; 1034 | 1035 | template <> 1036 | struct ReferenceHandler 1037 | { 1038 | static cl_int retain(cl_command_queue queue) 1039 | { return ::clRetainCommandQueue(queue); } 1040 | static cl_int release(cl_command_queue queue) 1041 | { return ::clReleaseCommandQueue(queue); } 1042 | }; 1043 | 1044 | template <> 1045 | struct ReferenceHandler 1046 | { 1047 | static cl_int retain(cl_mem memory) 1048 | { return ::clRetainMemObject(memory); } 1049 | static cl_int release(cl_mem memory) 1050 | { return ::clReleaseMemObject(memory); } 1051 | }; 1052 | 1053 | template <> 1054 | struct ReferenceHandler 1055 | { 1056 | static cl_int retain(cl_sampler sampler) 1057 | { return ::clRetainSampler(sampler); } 1058 | static cl_int release(cl_sampler sampler) 1059 | { return ::clReleaseSampler(sampler); } 1060 | }; 1061 | 1062 | template <> 1063 | struct ReferenceHandler 1064 | { 1065 | static cl_int retain(cl_program program) 1066 | { return ::clRetainProgram(program); } 1067 | static cl_int release(cl_program program) 1068 | { return ::clReleaseProgram(program); } 1069 | }; 1070 | 1071 | template <> 1072 | struct ReferenceHandler 1073 | { 1074 | static cl_int retain(cl_kernel kernel) 1075 | { return ::clRetainKernel(kernel); } 1076 | static cl_int release(cl_kernel kernel) 1077 | { return ::clReleaseKernel(kernel); } 1078 | }; 1079 | 1080 | template <> 1081 | struct ReferenceHandler 1082 | { 1083 | static cl_int retain(cl_event event) 1084 | { return ::clRetainEvent(event); } 1085 | static cl_int release(cl_event event) 1086 | { return ::clReleaseEvent(event); } 1087 | }; 1088 | 1089 | template 1090 | class Wrapper 1091 | { 1092 | public: 1093 | typedef T cl_type; 1094 | 1095 | protected: 1096 | cl_type object_; 1097 | 1098 | public: 1099 | Wrapper() : object_(NULL) { } 1100 | 1101 | ~Wrapper() 1102 | { 1103 | if (object_ != NULL) { release(); } 1104 | } 1105 | 1106 | Wrapper(const Wrapper& rhs) 1107 | { 1108 | object_ = rhs.object_; 1109 | if (object_ != NULL) { retain(); } 1110 | } 1111 | 1112 | Wrapper& operator = (const Wrapper& rhs) 1113 | { 1114 | if (object_ != NULL) { release(); } 1115 | object_ = rhs.object_; 1116 | if (object_ != NULL) { retain(); } 1117 | return *this; 1118 | } 1119 | 1120 | cl_type operator ()() const { return object_; } 1121 | 1122 | cl_type& operator ()() { return object_; } 1123 | 1124 | protected: 1125 | 1126 | cl_int retain() const 1127 | { 1128 | return ReferenceHandler::retain(object_); 1129 | } 1130 | 1131 | cl_int release() const 1132 | { 1133 | return ReferenceHandler::release(object_); 1134 | } 1135 | }; 1136 | 1137 | #if defined(__CL_ENABLE_EXCEPTIONS) 1138 | static inline cl_int errHandler ( 1139 | cl_int err, 1140 | const char * errStr = NULL) throw(Error) 1141 | { 1142 | if (err != CL_SUCCESS) { 1143 | throw Error(err, errStr); 1144 | } 1145 | return err; 1146 | } 1147 | #else 1148 | static inline cl_int errHandler (cl_int err, const char * errStr = NULL) 1149 | { 1150 | return err; 1151 | } 1152 | #endif // __CL_ENABLE_EXCEPTIONS 1153 | 1154 | } // namespace detail 1155 | //! \endcond 1156 | 1157 | /*! \stuct ImageFormat 1158 | * \brief ImageFormat interface fro cl_image_format. 1159 | */ 1160 | struct ImageFormat : public cl_image_format 1161 | { 1162 | ImageFormat(){} 1163 | 1164 | ImageFormat(cl_channel_order order, cl_channel_type type) 1165 | { 1166 | image_channel_order = order; 1167 | image_channel_data_type = type; 1168 | } 1169 | 1170 | ImageFormat& operator = (const ImageFormat& rhs) 1171 | { 1172 | if (this != &rhs) { 1173 | this->image_channel_data_type = rhs.image_channel_data_type; 1174 | this->image_channel_order = rhs.image_channel_order; 1175 | } 1176 | return *this; 1177 | } 1178 | }; 1179 | 1180 | /*! \class Device 1181 | * \brief Device interface for cl_device_id. 1182 | */ 1183 | class Device : public detail::Wrapper 1184 | { 1185 | public: 1186 | Device(cl_device_id device) { object_ = device; } 1187 | 1188 | Device() : detail::Wrapper() { } 1189 | 1190 | Device(const Device& device) : detail::Wrapper(device) { } 1191 | 1192 | Device& operator = (const Device& rhs) 1193 | { 1194 | if (this != &rhs) { 1195 | detail::Wrapper::operator=(rhs); 1196 | } 1197 | return *this; 1198 | } 1199 | 1200 | template 1201 | cl_int getInfo(cl_device_info name, T* param) const 1202 | { 1203 | return detail::errHandler( 1204 | detail::getInfo(&::clGetDeviceInfo, object_, name, param), 1205 | __GET_DEVICE_INFO_ERR); 1206 | } 1207 | 1208 | template typename 1209 | detail::param_traits::param_type 1210 | getInfo(cl_int* err = NULL) const 1211 | { 1212 | typename detail::param_traits< 1213 | detail::cl_device_info, name>::param_type param; 1214 | cl_int result = getInfo(name, ¶m); 1215 | if (err != NULL) { 1216 | *err = result; 1217 | } 1218 | return param; 1219 | } 1220 | 1221 | #if defined(USE_CL_DEVICE_FISSION) 1222 | cl_int createSubDevices( 1223 | const cl_device_partition_property_ext * properties, 1224 | VECTOR_CLASS* devices) 1225 | { 1226 | typedef CL_API_ENTRY cl_int 1227 | ( CL_API_CALL * PFN_clCreateSubDevicesEXT)( 1228 | cl_device_id /*in_device*/, 1229 | const cl_device_partition_property_ext * /* properties */, 1230 | cl_uint /*num_entries*/, 1231 | cl_device_id * /*out_devices*/, 1232 | cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; 1233 | 1234 | static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL; 1235 | __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT); 1236 | 1237 | cl_uint n = 0; 1238 | cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n); 1239 | if (err != CL_SUCCESS) { 1240 | return detail::errHandler(err, __CREATE_SUB_DEVICES); 1241 | } 1242 | 1243 | cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); 1244 | err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL); 1245 | if (err != CL_SUCCESS) { 1246 | return detail::errHandler(err, __CREATE_SUB_DEVICES); 1247 | } 1248 | 1249 | devices->assign(&ids[0], &ids[n]); 1250 | return CL_SUCCESS; 1251 | } 1252 | #endif 1253 | }; 1254 | 1255 | /*! \class Platform 1256 | * \brief Platform interface. 1257 | */ 1258 | class Platform : public detail::Wrapper 1259 | { 1260 | public: 1261 | static const Platform null(); 1262 | 1263 | Platform(cl_platform_id platform) { object_ = platform; } 1264 | 1265 | Platform() : detail::Wrapper() { } 1266 | 1267 | Platform(const Platform& platform) : detail::Wrapper(platform) { } 1268 | 1269 | Platform& operator = (const Platform& rhs) 1270 | { 1271 | if (this != &rhs) { 1272 | detail::Wrapper::operator=(rhs); 1273 | } 1274 | return *this; 1275 | } 1276 | 1277 | cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const 1278 | { 1279 | return detail::errHandler( 1280 | detail::getInfo(&::clGetPlatformInfo, object_, name, param), 1281 | __GET_PLATFORM_INFO_ERR); 1282 | } 1283 | 1284 | template typename 1285 | detail::param_traits::param_type 1286 | getInfo(cl_int* err = NULL) const 1287 | { 1288 | typename detail::param_traits< 1289 | detail::cl_platform_info, name>::param_type param; 1290 | cl_int result = getInfo(name, ¶m); 1291 | if (err != NULL) { 1292 | *err = result; 1293 | } 1294 | return param; 1295 | } 1296 | 1297 | cl_int getDevices( 1298 | cl_device_type type, 1299 | VECTOR_CLASS* devices) const 1300 | { 1301 | cl_uint n = 0; 1302 | cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n); 1303 | if (err != CL_SUCCESS) { 1304 | return detail::errHandler(err, __GET_DEVICE_IDS_ERR); 1305 | } 1306 | 1307 | cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); 1308 | err = ::clGetDeviceIDs(object_, type, n, ids, NULL); 1309 | if (err != CL_SUCCESS) { 1310 | return detail::errHandler(err, __GET_DEVICE_IDS_ERR); 1311 | } 1312 | 1313 | devices->assign(&ids[0], &ids[n]); 1314 | return CL_SUCCESS; 1315 | } 1316 | 1317 | #if defined(USE_DX_INTEROP) 1318 | /*! \brief Get the list of available D3D10 devices. 1319 | * 1320 | * \param d3d_device_source. 1321 | * 1322 | * \param d3d_object. 1323 | * 1324 | * \param d3d_device_set. 1325 | * 1326 | * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device 1327 | * values returned in devices can be used to identify a specific OpenCL 1328 | * device. If \a devices argument is NULL, this argument is ignored. 1329 | * 1330 | * \return One of the following values: 1331 | * - CL_SUCCESS if the function is executed successfully. 1332 | * 1333 | * The application can query specific capabilities of the OpenCL device(s) 1334 | * returned by cl::getDevices. This can be used by the application to 1335 | * determine which device(s) to use. 1336 | * 1337 | * \note In the case that exceptions are enabled and a return value 1338 | * other than CL_SUCCESS is generated, then cl::Error exception is 1339 | * generated. 1340 | */ 1341 | cl_int getDevices( 1342 | cl_d3d10_device_source_khr d3d_device_source, 1343 | void * d3d_object, 1344 | cl_d3d10_device_set_khr d3d_device_set, 1345 | VECTOR_CLASS* devices) const 1346 | { 1347 | typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)( 1348 | cl_platform_id platform, 1349 | cl_d3d10_device_source_khr d3d_device_source, 1350 | void * d3d_object, 1351 | cl_d3d10_device_set_khr d3d_device_set, 1352 | cl_uint num_entries, 1353 | cl_device_id * devices, 1354 | cl_uint* num_devices); 1355 | 1356 | static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL; 1357 | __INIT_CL_EXT_FCN_PTR(clGetDeviceIDsFromD3D10KHR); 1358 | 1359 | cl_uint n = 0; 1360 | cl_int err = pfn_clGetDeviceIDsFromD3D10KHR( 1361 | object_, 1362 | d3d_device_source, 1363 | d3d_object, 1364 | d3d_device_set, 1365 | 0, 1366 | NULL, 1367 | &n); 1368 | if (err != CL_SUCCESS) { 1369 | return detail::errHandler(err, __GET_DEVICE_IDS_ERR); 1370 | } 1371 | 1372 | cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); 1373 | err = pfn_clGetDeviceIDsFromD3D10KHR( 1374 | object_, 1375 | d3d_device_source, 1376 | d3d_object, 1377 | d3d_device_set, 1378 | n, 1379 | ids, 1380 | NULL); 1381 | if (err != CL_SUCCESS) { 1382 | return detail::errHandler(err, __GET_DEVICE_IDS_ERR); 1383 | } 1384 | 1385 | devices->assign(&ids[0], &ids[n]); 1386 | return CL_SUCCESS; 1387 | } 1388 | #endif 1389 | 1390 | static cl_int get( 1391 | VECTOR_CLASS* platforms) 1392 | { 1393 | cl_uint n = 0; 1394 | cl_int err = ::clGetPlatformIDs(0, NULL, &n); 1395 | if (err != CL_SUCCESS) { 1396 | return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); 1397 | } 1398 | 1399 | cl_platform_id* ids = (cl_platform_id*) alloca( 1400 | n * sizeof(cl_platform_id)); 1401 | err = ::clGetPlatformIDs(n, ids, NULL); 1402 | if (err != CL_SUCCESS) { 1403 | return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); 1404 | } 1405 | 1406 | platforms->assign(&ids[0], &ids[n]); 1407 | return CL_SUCCESS; 1408 | } 1409 | }; 1410 | 1411 | static inline cl_int 1412 | UnloadCompiler() 1413 | { 1414 | return ::clUnloadCompiler(); 1415 | } 1416 | 1417 | class Context : public detail::Wrapper 1418 | { 1419 | public: 1420 | Context( 1421 | const VECTOR_CLASS& devices, 1422 | cl_context_properties* properties = NULL, 1423 | void (CL_CALLBACK * notifyFptr)( 1424 | const char *, 1425 | const void *, 1426 | ::size_t, 1427 | void *) = NULL, 1428 | void* data = NULL, 1429 | cl_int* err = NULL) 1430 | { 1431 | cl_int error; 1432 | object_ = ::clCreateContext( 1433 | properties, (cl_uint) devices.size(), 1434 | (cl_device_id*) &devices.front(), 1435 | notifyFptr, data, &error); 1436 | 1437 | detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); 1438 | if (err != NULL) { 1439 | *err = error; 1440 | } 1441 | } 1442 | 1443 | Context( 1444 | cl_device_type type, 1445 | cl_context_properties* properties = NULL, 1446 | void (CL_CALLBACK * notifyFptr)( 1447 | const char *, 1448 | const void *, 1449 | ::size_t, 1450 | void *) = NULL, 1451 | void* data = NULL, 1452 | cl_int* err = NULL) 1453 | { 1454 | cl_int error; 1455 | object_ = ::clCreateContextFromType( 1456 | properties, type, notifyFptr, data, &error); 1457 | 1458 | detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); 1459 | if (err != NULL) { 1460 | *err = error; 1461 | } 1462 | } 1463 | 1464 | Context() : detail::Wrapper() { } 1465 | 1466 | Context(const Context& context) : detail::Wrapper(context) { } 1467 | 1468 | Context& operator = (const Context& rhs) 1469 | { 1470 | if (this != &rhs) { 1471 | detail::Wrapper::operator=(rhs); 1472 | } 1473 | return *this; 1474 | } 1475 | 1476 | template 1477 | cl_int getInfo(cl_context_info name, T* param) const 1478 | { 1479 | return detail::errHandler( 1480 | detail::getInfo(&::clGetContextInfo, object_, name, param), 1481 | __GET_CONTEXT_INFO_ERR); 1482 | } 1483 | 1484 | template typename 1485 | detail::param_traits::param_type 1486 | getInfo(cl_int* err = NULL) const 1487 | { 1488 | typename detail::param_traits< 1489 | detail::cl_context_info, name>::param_type param; 1490 | cl_int result = getInfo(name, ¶m); 1491 | if (err != NULL) { 1492 | *err = result; 1493 | } 1494 | return param; 1495 | } 1496 | 1497 | cl_int getSupportedImageFormats( 1498 | cl_mem_flags flags, 1499 | cl_mem_object_type type, 1500 | VECTOR_CLASS* formats) const 1501 | { 1502 | cl_uint numEntries; 1503 | cl_int err = ::clGetSupportedImageFormats( 1504 | object_, 1505 | flags, 1506 | type, 1507 | 0, 1508 | NULL, 1509 | &numEntries); 1510 | if (err != CL_SUCCESS) { 1511 | return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); 1512 | } 1513 | 1514 | ImageFormat* value = (ImageFormat*) 1515 | alloca(numEntries * sizeof(ImageFormat)); 1516 | err = ::clGetSupportedImageFormats( 1517 | object_, 1518 | flags, 1519 | type, 1520 | numEntries, 1521 | (cl_image_format*) value, 1522 | NULL); 1523 | if (err != CL_SUCCESS) { 1524 | return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); 1525 | } 1526 | 1527 | formats->assign(&value[0], &value[numEntries]); 1528 | return CL_SUCCESS; 1529 | } 1530 | }; 1531 | 1532 | __GET_INFO_HELPER_WITH_RETAIN(cl::Context) 1533 | 1534 | /*! \class Event 1535 | * \brief Event interface for cl_event. 1536 | */ 1537 | class Event : public detail::Wrapper 1538 | { 1539 | public: 1540 | Event() : detail::Wrapper() { } 1541 | 1542 | Event(const Event& event) : detail::Wrapper(event) { } 1543 | 1544 | Event& operator = (const Event& rhs) 1545 | { 1546 | if (this != &rhs) { 1547 | detail::Wrapper::operator=(rhs); 1548 | } 1549 | return *this; 1550 | } 1551 | 1552 | template 1553 | cl_int getInfo(cl_event_info name, T* param) const 1554 | { 1555 | return detail::errHandler( 1556 | detail::getInfo(&::clGetEventInfo, object_, name, param), 1557 | __GET_EVENT_INFO_ERR); 1558 | } 1559 | 1560 | template typename 1561 | detail::param_traits::param_type 1562 | getInfo(cl_int* err = NULL) const 1563 | { 1564 | typename detail::param_traits< 1565 | detail::cl_event_info, name>::param_type param; 1566 | cl_int result = getInfo(name, ¶m); 1567 | if (err != NULL) { 1568 | *err = result; 1569 | } 1570 | return param; 1571 | } 1572 | 1573 | template 1574 | cl_int getProfilingInfo(cl_profiling_info name, T* param) const 1575 | { 1576 | return detail::errHandler(detail::getInfo( 1577 | &::clGetEventProfilingInfo, object_, name, param), 1578 | __GET_EVENT_PROFILE_INFO_ERR); 1579 | } 1580 | 1581 | template typename 1582 | detail::param_traits::param_type 1583 | getProfilingInfo(cl_int* err = NULL) const 1584 | { 1585 | typename detail::param_traits< 1586 | detail::cl_profiling_info, name>::param_type param; 1587 | cl_int result = getProfilingInfo(name, ¶m); 1588 | if (err != NULL) { 1589 | *err = result; 1590 | } 1591 | return param; 1592 | } 1593 | 1594 | cl_int wait() const 1595 | { 1596 | return detail::errHandler( 1597 | ::clWaitForEvents(1, &object_), 1598 | __WAIT_FOR_EVENTS_ERR); 1599 | } 1600 | 1601 | #if defined(CL_VERSION_1_1) 1602 | cl_int setCallback( 1603 | cl_int type, 1604 | void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), 1605 | void * user_data = NULL) 1606 | { 1607 | return detail::errHandler( 1608 | ::clSetEventCallback( 1609 | object_, 1610 | type, 1611 | pfn_notify, 1612 | user_data), 1613 | __SET_EVENT_CALLBACK_ERR); 1614 | } 1615 | #endif 1616 | 1617 | static cl_int 1618 | waitForEvents(const VECTOR_CLASS& events) 1619 | { 1620 | return detail::errHandler( 1621 | ::clWaitForEvents( 1622 | (cl_uint) events.size(), (cl_event*)&events.front()), 1623 | __WAIT_FOR_EVENTS_ERR); 1624 | } 1625 | }; 1626 | 1627 | __GET_INFO_HELPER_WITH_RETAIN(cl::Event) 1628 | 1629 | #if defined(CL_VERSION_1_1) 1630 | /*! \class UserEvent 1631 | * \brief User event interface for cl_event. 1632 | */ 1633 | class UserEvent : public Event 1634 | { 1635 | public: 1636 | UserEvent( 1637 | const Context& context, 1638 | cl_int * err = NULL) 1639 | { 1640 | cl_int error; 1641 | object_ = ::clCreateUserEvent( 1642 | context(), 1643 | &error); 1644 | 1645 | detail::errHandler(error, __CREATE_USER_EVENT_ERR); 1646 | if (err != NULL) { 1647 | *err = error; 1648 | } 1649 | } 1650 | 1651 | UserEvent() : Event() { } 1652 | 1653 | UserEvent(const UserEvent& event) : Event(event) { } 1654 | 1655 | UserEvent& operator = (const UserEvent& rhs) 1656 | { 1657 | if (this != &rhs) { 1658 | Event::operator=(rhs); 1659 | } 1660 | return *this; 1661 | } 1662 | 1663 | cl_int setStatus(cl_int status) 1664 | { 1665 | return detail::errHandler( 1666 | ::clSetUserEventStatus(object_,status), 1667 | __SET_USER_EVENT_STATUS_ERR); 1668 | } 1669 | }; 1670 | #endif 1671 | 1672 | inline static cl_int 1673 | WaitForEvents(const VECTOR_CLASS& events) 1674 | { 1675 | return detail::errHandler( 1676 | ::clWaitForEvents( 1677 | (cl_uint) events.size(), (cl_event*)&events.front()), 1678 | __WAIT_FOR_EVENTS_ERR); 1679 | } 1680 | 1681 | /*! \class Memory 1682 | * \brief Memory interface for cl_mem. 1683 | */ 1684 | class Memory : public detail::Wrapper 1685 | { 1686 | public: 1687 | Memory() : detail::Wrapper() { } 1688 | 1689 | Memory(const Memory& memory) : detail::Wrapper(memory) { } 1690 | 1691 | Memory& operator = (const Memory& rhs) 1692 | { 1693 | if (this != &rhs) { 1694 | detail::Wrapper::operator=(rhs); 1695 | } 1696 | return *this; 1697 | } 1698 | 1699 | template 1700 | cl_int getInfo(cl_mem_info name, T* param) const 1701 | { 1702 | return detail::errHandler( 1703 | detail::getInfo(&::clGetMemObjectInfo, object_, name, param), 1704 | __GET_MEM_OBJECT_INFO_ERR); 1705 | } 1706 | 1707 | template typename 1708 | detail::param_traits::param_type 1709 | getInfo(cl_int* err = NULL) const 1710 | { 1711 | typename detail::param_traits< 1712 | detail::cl_mem_info, name>::param_type param; 1713 | cl_int result = getInfo(name, ¶m); 1714 | if (err != NULL) { 1715 | *err = result; 1716 | } 1717 | return param; 1718 | } 1719 | 1720 | #if defined(CL_VERSION_1_1) 1721 | cl_int setDestructorCallback( 1722 | void (CL_CALLBACK * pfn_notify)(cl_mem, void *), 1723 | void * user_data = NULL) 1724 | { 1725 | return detail::errHandler( 1726 | ::clSetMemObjectDestructorCallback( 1727 | object_, 1728 | pfn_notify, 1729 | user_data), 1730 | __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR); 1731 | } 1732 | #endif 1733 | 1734 | }; 1735 | 1736 | __GET_INFO_HELPER_WITH_RETAIN(cl::Memory) 1737 | 1738 | /*! \class Buffer 1739 | * \brief Memory buffer interface. 1740 | */ 1741 | class Buffer : public Memory 1742 | { 1743 | public: 1744 | Buffer( 1745 | const Context& context, 1746 | cl_mem_flags flags, 1747 | ::size_t size, 1748 | void* host_ptr = NULL, 1749 | cl_int* err = NULL) 1750 | { 1751 | cl_int error; 1752 | object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); 1753 | 1754 | detail::errHandler(error, __CREATE_BUFFER_ERR); 1755 | if (err != NULL) { 1756 | *err = error; 1757 | } 1758 | } 1759 | 1760 | Buffer() : Memory() { } 1761 | 1762 | Buffer(const Buffer& buffer) : Memory(buffer) { } 1763 | 1764 | Buffer& operator = (const Buffer& rhs) 1765 | { 1766 | if (this != &rhs) { 1767 | Memory::operator=(rhs); 1768 | } 1769 | return *this; 1770 | } 1771 | 1772 | #if defined(CL_VERSION_1_1) 1773 | Buffer createSubBuffer( 1774 | cl_mem_flags flags, 1775 | cl_buffer_create_type buffer_create_type, 1776 | const void * buffer_create_info, 1777 | cl_int * err = NULL) 1778 | { 1779 | Buffer result; 1780 | cl_int error; 1781 | result.object_ = ::clCreateSubBuffer( 1782 | object_, 1783 | flags, 1784 | buffer_create_type, 1785 | buffer_create_info, 1786 | &error); 1787 | 1788 | detail::errHandler(error, __CREATE_SUBBUFFER_ERR); 1789 | if (err != NULL) { 1790 | *err = error; 1791 | } 1792 | 1793 | return result; 1794 | } 1795 | #endif 1796 | }; 1797 | 1798 | #if defined (USE_DX_INTEROP) 1799 | class BufferD3D10 : public Buffer 1800 | { 1801 | public: 1802 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)( 1803 | cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer, 1804 | cl_int* errcode_ret); 1805 | 1806 | BufferD3D10( 1807 | const Context& context, 1808 | cl_mem_flags flags, 1809 | ID3D10Buffer* bufobj, 1810 | cl_int * err = NULL) 1811 | { 1812 | static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL; 1813 | __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR); 1814 | 1815 | cl_int error; 1816 | object_ = pfn_clCreateFromD3D10BufferKHR( 1817 | context(), 1818 | flags, 1819 | bufobj, 1820 | &error); 1821 | 1822 | detail::errHandler(error, __CREATE_GL_BUFFER_ERR); 1823 | if (err != NULL) { 1824 | *err = error; 1825 | } 1826 | } 1827 | 1828 | BufferD3D10() : Buffer() { } 1829 | 1830 | BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { } 1831 | 1832 | BufferD3D10& operator = (const BufferD3D10& rhs) 1833 | { 1834 | if (this != &rhs) { 1835 | Buffer::operator=(rhs); 1836 | } 1837 | return *this; 1838 | } 1839 | }; 1840 | #endif 1841 | 1842 | /*! \class BufferGL 1843 | * \brief Memory buffer interface for GL interop. 1844 | */ 1845 | class BufferGL : public Buffer 1846 | { 1847 | public: 1848 | BufferGL( 1849 | const Context& context, 1850 | cl_mem_flags flags, 1851 | GLuint bufobj, 1852 | cl_int * err = NULL) 1853 | { 1854 | cl_int error; 1855 | object_ = ::clCreateFromGLBuffer( 1856 | context(), 1857 | flags, 1858 | bufobj, 1859 | &error); 1860 | 1861 | detail::errHandler(error, __CREATE_GL_BUFFER_ERR); 1862 | if (err != NULL) { 1863 | *err = error; 1864 | } 1865 | } 1866 | 1867 | BufferGL() : Buffer() { } 1868 | 1869 | BufferGL(const BufferGL& buffer) : Buffer(buffer) { } 1870 | 1871 | BufferGL& operator = (const BufferGL& rhs) 1872 | { 1873 | if (this != &rhs) { 1874 | Buffer::operator=(rhs); 1875 | } 1876 | return *this; 1877 | } 1878 | 1879 | cl_int getObjectInfo( 1880 | cl_gl_object_type *type, 1881 | GLuint * gl_object_name) 1882 | { 1883 | return detail::errHandler( 1884 | ::clGetGLObjectInfo(object_,type,gl_object_name), 1885 | __GET_GL_OBJECT_INFO_ERR); 1886 | } 1887 | }; 1888 | 1889 | /*! \class BufferRenderGL 1890 | * \brief Memory buffer interface for GL interop with renderbuffer. 1891 | */ 1892 | class BufferRenderGL : public Buffer 1893 | { 1894 | public: 1895 | BufferRenderGL( 1896 | const Context& context, 1897 | cl_mem_flags flags, 1898 | GLuint bufobj, 1899 | cl_int * err = NULL) 1900 | { 1901 | cl_int error; 1902 | object_ = ::clCreateFromGLRenderbuffer( 1903 | context(), 1904 | flags, 1905 | bufobj, 1906 | &error); 1907 | 1908 | detail::errHandler(error, __CREATE_GL_BUFFER_ERR); 1909 | if (err != NULL) { 1910 | *err = error; 1911 | } 1912 | } 1913 | 1914 | BufferRenderGL() : Buffer() { } 1915 | 1916 | BufferRenderGL(const BufferGL& buffer) : Buffer(buffer) { } 1917 | 1918 | BufferRenderGL& operator = (const BufferRenderGL& rhs) 1919 | { 1920 | if (this != &rhs) { 1921 | Buffer::operator=(rhs); 1922 | } 1923 | return *this; 1924 | } 1925 | 1926 | cl_int getObjectInfo( 1927 | cl_gl_object_type *type, 1928 | GLuint * gl_object_name) 1929 | { 1930 | return detail::errHandler( 1931 | ::clGetGLObjectInfo(object_,type,gl_object_name), 1932 | __GET_GL_OBJECT_INFO_ERR); 1933 | } 1934 | }; 1935 | 1936 | /*! \class Image 1937 | * \brief Base class interface for all images. 1938 | */ 1939 | class Image : public Memory 1940 | { 1941 | protected: 1942 | Image() : Memory() { } 1943 | 1944 | Image(const Image& image) : Memory(image) { } 1945 | 1946 | Image& operator = (const Image& rhs) 1947 | { 1948 | if (this != &rhs) { 1949 | Memory::operator=(rhs); 1950 | } 1951 | return *this; 1952 | } 1953 | public: 1954 | template 1955 | cl_int getImageInfo(cl_image_info name, T* param) const 1956 | { 1957 | return detail::errHandler( 1958 | detail::getInfo(&::clGetImageInfo, object_, name, param), 1959 | __GET_IMAGE_INFO_ERR); 1960 | } 1961 | 1962 | template typename 1963 | detail::param_traits::param_type 1964 | getImageInfo(cl_int* err = NULL) const 1965 | { 1966 | typename detail::param_traits< 1967 | detail::cl_image_info, name>::param_type param; 1968 | cl_int result = getImageInfo(name, ¶m); 1969 | if (err != NULL) { 1970 | *err = result; 1971 | } 1972 | return param; 1973 | } 1974 | }; 1975 | 1976 | /*! \class Image2D 1977 | * \brief Image interface for 2D images. 1978 | */ 1979 | class Image2D : public Image 1980 | { 1981 | public: 1982 | Image2D( 1983 | const Context& context, 1984 | cl_mem_flags flags, 1985 | ImageFormat format, 1986 | ::size_t width, 1987 | ::size_t height, 1988 | ::size_t row_pitch = 0, 1989 | void* host_ptr = NULL, 1990 | cl_int* err = NULL) 1991 | { 1992 | cl_int error; 1993 | object_ = ::clCreateImage2D( 1994 | context(), flags,&format, width, height, row_pitch, host_ptr, &error); 1995 | 1996 | detail::errHandler(error, __CREATE_IMAGE2D_ERR); 1997 | if (err != NULL) { 1998 | *err = error; 1999 | } 2000 | } 2001 | 2002 | Image2D() { } 2003 | 2004 | Image2D(const Image2D& image2D) : Image(image2D) { } 2005 | 2006 | Image2D& operator = (const Image2D& rhs) 2007 | { 2008 | if (this != &rhs) { 2009 | Image::operator=(rhs); 2010 | } 2011 | return *this; 2012 | } 2013 | }; 2014 | 2015 | /*! \class Image2DGL 2016 | * \brief 2D image interface for GL interop. 2017 | */ 2018 | class Image2DGL : public Image2D 2019 | { 2020 | public: 2021 | Image2DGL( 2022 | const Context& context, 2023 | cl_mem_flags flags, 2024 | GLenum target, 2025 | GLint miplevel, 2026 | GLuint texobj, 2027 | cl_int * err = NULL) 2028 | { 2029 | cl_int error; 2030 | object_ = ::clCreateFromGLTexture2D( 2031 | context(), 2032 | flags, 2033 | target, 2034 | miplevel, 2035 | texobj, 2036 | &error); 2037 | 2038 | detail::errHandler(error, __CREATE_GL_BUFFER_ERR); 2039 | if (err != NULL) { 2040 | *err = error; 2041 | } 2042 | } 2043 | 2044 | Image2DGL() : Image2D() { } 2045 | 2046 | Image2DGL(const Image2DGL& image) : Image2D(image) { } 2047 | 2048 | Image2DGL& operator = (const Image2DGL& rhs) 2049 | { 2050 | if (this != &rhs) { 2051 | Image2D::operator=(rhs); 2052 | } 2053 | return *this; 2054 | } 2055 | }; 2056 | 2057 | /*! \class Image3D 2058 | * \brief Image interface for 3D images. 2059 | */ 2060 | class Image3D : public Image 2061 | { 2062 | public: 2063 | Image3D( 2064 | const Context& context, 2065 | cl_mem_flags flags, 2066 | ImageFormat format, 2067 | ::size_t width, 2068 | ::size_t height, 2069 | ::size_t depth, 2070 | ::size_t row_pitch = 0, 2071 | ::size_t slice_pitch = 0, 2072 | void* host_ptr = NULL, 2073 | cl_int* err = NULL) 2074 | { 2075 | cl_int error; 2076 | object_ = ::clCreateImage3D( 2077 | context(), flags, &format, width, height, depth, row_pitch, 2078 | slice_pitch, host_ptr, &error); 2079 | 2080 | detail::errHandler(error, __CREATE_IMAGE3D_ERR); 2081 | if (err != NULL) { 2082 | *err = error; 2083 | } 2084 | } 2085 | 2086 | Image3D() { } 2087 | 2088 | Image3D(const Image3D& image3D) : Image(image3D) { } 2089 | 2090 | Image3D& operator = (const Image3D& rhs) 2091 | { 2092 | if (this != &rhs) { 2093 | Image::operator=(rhs); 2094 | } 2095 | return *this; 2096 | } 2097 | }; 2098 | 2099 | /*! \class Image2DGL 2100 | * \brief 2D image interface for GL interop. 2101 | */ 2102 | class Image3DGL : public Image3D 2103 | { 2104 | public: 2105 | Image3DGL( 2106 | const Context& context, 2107 | cl_mem_flags flags, 2108 | GLenum target, 2109 | GLint miplevel, 2110 | GLuint texobj, 2111 | cl_int * err = NULL) 2112 | { 2113 | cl_int error; 2114 | object_ = ::clCreateFromGLTexture3D( 2115 | context(), 2116 | flags, 2117 | target, 2118 | miplevel, 2119 | texobj, 2120 | &error); 2121 | 2122 | detail::errHandler(error, __CREATE_GL_BUFFER_ERR); 2123 | if (err != NULL) { 2124 | *err = error; 2125 | } 2126 | } 2127 | 2128 | Image3DGL() : Image3D() { } 2129 | 2130 | Image3DGL(const Image3DGL& image) : Image3D(image) { } 2131 | 2132 | Image3DGL& operator = (const Image3DGL& rhs) 2133 | { 2134 | if (this != &rhs) { 2135 | Image3D::operator=(rhs); 2136 | } 2137 | return *this; 2138 | } 2139 | }; 2140 | 2141 | /*! \class Sampler 2142 | * \brief Sampler interface for cl_sampler. 2143 | */ 2144 | class Sampler : public detail::Wrapper 2145 | { 2146 | public: 2147 | Sampler() { } 2148 | 2149 | Sampler( 2150 | const Context& context, 2151 | cl_bool normalized_coords, 2152 | cl_addressing_mode addressing_mode, 2153 | cl_filter_mode filter_mode, 2154 | cl_int* err = NULL) 2155 | { 2156 | cl_int error; 2157 | object_ = ::clCreateSampler( 2158 | context(), 2159 | normalized_coords, 2160 | addressing_mode, 2161 | filter_mode, 2162 | &error); 2163 | 2164 | detail::errHandler(error, __CREATE_SAMPLER_ERR); 2165 | if (err != NULL) { 2166 | *err = error; 2167 | } 2168 | } 2169 | 2170 | Sampler(const Sampler& sampler) : detail::Wrapper(sampler) { } 2171 | 2172 | Sampler& operator = (const Sampler& rhs) 2173 | { 2174 | if (this != &rhs) { 2175 | detail::Wrapper::operator=(rhs); 2176 | } 2177 | return *this; 2178 | } 2179 | 2180 | template 2181 | cl_int getInfo(cl_sampler_info name, T* param) const 2182 | { 2183 | return detail::errHandler( 2184 | detail::getInfo(&::clGetSamplerInfo, object_, name, param), 2185 | __GET_SAMPLER_INFO_ERR); 2186 | } 2187 | 2188 | template typename 2189 | detail::param_traits::param_type 2190 | getInfo(cl_int* err = NULL) const 2191 | { 2192 | typename detail::param_traits< 2193 | detail::cl_sampler_info, name>::param_type param; 2194 | cl_int result = getInfo(name, ¶m); 2195 | if (err != NULL) { 2196 | *err = result; 2197 | } 2198 | return param; 2199 | } 2200 | }; 2201 | 2202 | __GET_INFO_HELPER_WITH_RETAIN(cl::Sampler) 2203 | 2204 | class Program; 2205 | class CommandQueue; 2206 | class Kernel; 2207 | 2208 | /*! \class NDRange 2209 | * \brief NDRange interface 2210 | */ 2211 | class NDRange 2212 | { 2213 | private: 2214 | size_t<3> sizes_; 2215 | cl_uint dimensions_; 2216 | 2217 | public: 2218 | NDRange() 2219 | : dimensions_(0) 2220 | { } 2221 | 2222 | NDRange(::size_t size0) 2223 | : dimensions_(1) 2224 | { 2225 | sizes_.push_back(size0); 2226 | } 2227 | 2228 | NDRange(::size_t size0, ::size_t size1) 2229 | : dimensions_(2) 2230 | { 2231 | sizes_.push_back(size0); 2232 | sizes_.push_back(size1); 2233 | } 2234 | 2235 | NDRange(::size_t size0, ::size_t size1, ::size_t size2) 2236 | : dimensions_(3) 2237 | { 2238 | sizes_.push_back(size0); 2239 | sizes_.push_back(size1); 2240 | sizes_.push_back(size2); 2241 | } 2242 | 2243 | operator const ::size_t*() const { return (const ::size_t*) sizes_; } 2244 | ::size_t dimensions() const { return dimensions_; } 2245 | }; 2246 | 2247 | static const NDRange NullRange; 2248 | 2249 | /*! 2250 | * \struct LocalSpaceArg 2251 | * \brief Local address raper for use with Kernel::setArg 2252 | */ 2253 | struct LocalSpaceArg 2254 | { 2255 | ::size_t size_; 2256 | }; 2257 | 2258 | namespace detail { 2259 | 2260 | template 2261 | struct KernelArgumentHandler 2262 | { 2263 | static ::size_t size(const T&) { return sizeof(T); } 2264 | static T* ptr(T& value) { return &value; } 2265 | }; 2266 | 2267 | template <> 2268 | struct KernelArgumentHandler 2269 | { 2270 | static ::size_t size(const LocalSpaceArg& value) { return value.size_; } 2271 | static void* ptr(LocalSpaceArg&) { return NULL; } 2272 | }; 2273 | 2274 | } 2275 | //! \endcond 2276 | 2277 | inline LocalSpaceArg 2278 | __local(::size_t size) 2279 | { 2280 | LocalSpaceArg ret = { size }; 2281 | return ret; 2282 | } 2283 | 2284 | class KernelFunctor; 2285 | 2286 | /*! \class Kernel 2287 | * \brief Kernel interface that implements cl_kernel 2288 | */ 2289 | class Kernel : public detail::Wrapper 2290 | { 2291 | public: 2292 | inline Kernel(const Program& program, const char* name, cl_int* err = NULL); 2293 | 2294 | Kernel() { } 2295 | 2296 | Kernel(const Kernel& kernel) : detail::Wrapper(kernel) { } 2297 | 2298 | Kernel& operator = (const Kernel& rhs) 2299 | { 2300 | if (this != &rhs) { 2301 | detail::Wrapper::operator=(rhs); 2302 | } 2303 | return *this; 2304 | } 2305 | 2306 | template 2307 | cl_int getInfo(cl_kernel_info name, T* param) const 2308 | { 2309 | return detail::errHandler( 2310 | detail::getInfo(&::clGetKernelInfo, object_, name, param), 2311 | __GET_KERNEL_INFO_ERR); 2312 | } 2313 | 2314 | template typename 2315 | detail::param_traits::param_type 2316 | getInfo(cl_int* err = NULL) const 2317 | { 2318 | typename detail::param_traits< 2319 | detail::cl_kernel_info, name>::param_type param; 2320 | cl_int result = getInfo(name, ¶m); 2321 | if (err != NULL) { 2322 | *err = result; 2323 | } 2324 | return param; 2325 | } 2326 | 2327 | template 2328 | cl_int getWorkGroupInfo( 2329 | const Device& device, cl_kernel_work_group_info name, T* param) const 2330 | { 2331 | return detail::errHandler( 2332 | detail::getInfo( 2333 | &::clGetKernelWorkGroupInfo, object_, device(), name, param), 2334 | __GET_KERNEL_WORK_GROUP_INFO_ERR); 2335 | } 2336 | 2337 | template typename 2338 | detail::param_traits::param_type 2339 | getWorkGroupInfo(const Device& device, cl_int* err = NULL) const 2340 | { 2341 | typename detail::param_traits< 2342 | detail::cl_kernel_work_group_info, name>::param_type param; 2343 | cl_int result = getWorkGroupInfo(device, name, ¶m); 2344 | if (err != NULL) { 2345 | *err = result; 2346 | } 2347 | return param; 2348 | } 2349 | 2350 | template 2351 | cl_int setArg(cl_uint index, T value) 2352 | { 2353 | return detail::errHandler( 2354 | ::clSetKernelArg( 2355 | object_, 2356 | index, 2357 | detail::KernelArgumentHandler::size(value), 2358 | detail::KernelArgumentHandler::ptr(value)), 2359 | __SET_KERNEL_ARGS_ERR); 2360 | } 2361 | 2362 | cl_int setArg(cl_uint index, ::size_t size, void* argPtr) 2363 | { 2364 | return detail::errHandler( 2365 | ::clSetKernelArg(object_, index, size, argPtr), 2366 | __SET_KERNEL_ARGS_ERR); 2367 | } 2368 | 2369 | KernelFunctor bind( 2370 | const CommandQueue& queue, 2371 | const NDRange& offset, 2372 | const NDRange& global, 2373 | const NDRange& local); 2374 | 2375 | KernelFunctor bind( 2376 | const CommandQueue& queue, 2377 | const NDRange& global, 2378 | const NDRange& local); 2379 | }; 2380 | 2381 | __GET_INFO_HELPER_WITH_RETAIN(cl::Kernel) 2382 | 2383 | /*! \class Program 2384 | * \brief Program interface that implements cl_program. 2385 | */ 2386 | class Program : public detail::Wrapper 2387 | { 2388 | public: 2389 | typedef VECTOR_CLASS > Binaries; 2390 | typedef VECTOR_CLASS > Sources; 2391 | 2392 | Program( 2393 | const Context& context, 2394 | const Sources& sources, 2395 | cl_int* err = NULL) 2396 | { 2397 | cl_int error; 2398 | 2399 | const ::size_t n = (::size_t)sources.size(); 2400 | ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t)); 2401 | const char** strings = (const char**) alloca(n * sizeof(const char*)); 2402 | 2403 | for (::size_t i = 0; i < n; ++i) { 2404 | strings[i] = sources[(int)i].first; 2405 | lengths[i] = sources[(int)i].second; 2406 | } 2407 | 2408 | object_ = ::clCreateProgramWithSource( 2409 | context(), (cl_uint)n, strings, lengths, &error); 2410 | 2411 | detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); 2412 | if (err != NULL) { 2413 | *err = error; 2414 | } 2415 | } 2416 | 2417 | Program( 2418 | const Context& context, 2419 | const VECTOR_CLASS& devices, 2420 | const Binaries& binaries, 2421 | VECTOR_CLASS* binaryStatus = NULL, 2422 | cl_int* err = NULL) 2423 | { 2424 | cl_int error; 2425 | const ::size_t n = binaries.size(); 2426 | ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t)); 2427 | const unsigned char** images = (const unsigned char**) alloca(n * sizeof(const void*)); 2428 | 2429 | for (::size_t i = 0; i < n; ++i) { 2430 | images[i] = (const unsigned char*)binaries[(int)i].first; 2431 | lengths[i] = binaries[(int)i].second; 2432 | } 2433 | 2434 | object_ = ::clCreateProgramWithBinary( 2435 | context(), (cl_uint) devices.size(), 2436 | (cl_device_id*)&devices.front(), 2437 | lengths, images, binaryStatus != NULL 2438 | ? (cl_int*) &binaryStatus->front() 2439 | : NULL, &error); 2440 | 2441 | detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); 2442 | if (err != NULL) { 2443 | *err = error; 2444 | } 2445 | } 2446 | 2447 | Program() { } 2448 | 2449 | Program(const Program& program) : detail::Wrapper(program) { } 2450 | 2451 | Program& operator = (const Program& rhs) 2452 | { 2453 | if (this != &rhs) { 2454 | detail::Wrapper::operator=(rhs); 2455 | } 2456 | return *this; 2457 | } 2458 | 2459 | cl_int build( 2460 | const VECTOR_CLASS& devices, 2461 | const char* options = NULL, 2462 | void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, 2463 | void* data = NULL) const 2464 | { 2465 | return detail::errHandler( 2466 | ::clBuildProgram( 2467 | object_, 2468 | (cl_uint) 2469 | devices.size(), 2470 | (cl_device_id*)&devices.front(), 2471 | options, 2472 | notifyFptr, 2473 | data), 2474 | __BUILD_PROGRAM_ERR); 2475 | } 2476 | 2477 | template 2478 | cl_int getInfo(cl_program_info name, T* param) const 2479 | { 2480 | return detail::errHandler( 2481 | detail::getInfo(&::clGetProgramInfo, object_, name, param), 2482 | __GET_PROGRAM_INFO_ERR); 2483 | } 2484 | 2485 | template typename 2486 | detail::param_traits::param_type 2487 | getInfo(cl_int* err = NULL) const 2488 | { 2489 | typename detail::param_traits< 2490 | detail::cl_program_info, name>::param_type param; 2491 | cl_int result = getInfo(name, ¶m); 2492 | if (err != NULL) { 2493 | *err = result; 2494 | } 2495 | return param; 2496 | } 2497 | 2498 | template 2499 | cl_int getBuildInfo( 2500 | const Device& device, cl_program_build_info name, T* param) const 2501 | { 2502 | return detail::errHandler( 2503 | detail::getInfo( 2504 | &::clGetProgramBuildInfo, object_, device(), name, param), 2505 | __GET_PROGRAM_BUILD_INFO_ERR); 2506 | } 2507 | 2508 | template typename 2509 | detail::param_traits::param_type 2510 | getBuildInfo(const Device& device, cl_int* err = NULL) const 2511 | { 2512 | typename detail::param_traits< 2513 | detail::cl_program_build_info, name>::param_type param; 2514 | cl_int result = getBuildInfo(device, name, ¶m); 2515 | if (err != NULL) { 2516 | *err = result; 2517 | } 2518 | return param; 2519 | } 2520 | 2521 | cl_int createKernels(VECTOR_CLASS* kernels) 2522 | { 2523 | cl_uint numKernels; 2524 | cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels); 2525 | if (err != CL_SUCCESS) { 2526 | return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); 2527 | } 2528 | 2529 | Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel)); 2530 | err = ::clCreateKernelsInProgram( 2531 | object_, numKernels, (cl_kernel*) value, NULL); 2532 | if (err != CL_SUCCESS) { 2533 | return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); 2534 | } 2535 | 2536 | kernels->assign(&value[0], &value[numKernels]); 2537 | return CL_SUCCESS; 2538 | } 2539 | }; 2540 | 2541 | __GET_INFO_HELPER_WITH_RETAIN(cl::Program) 2542 | 2543 | inline Kernel::Kernel(const Program& program, const char* name, cl_int* err) 2544 | { 2545 | cl_int error; 2546 | 2547 | object_ = ::clCreateKernel(program(), name, &error); 2548 | detail::errHandler(error, __CREATE_KERNEL_ERR); 2549 | 2550 | if (err != NULL) { 2551 | *err = error; 2552 | } 2553 | 2554 | } 2555 | 2556 | /*! \class CommandQueue 2557 | * \brief CommandQueue interface for cl_command_queue. 2558 | */ 2559 | class CommandQueue : public detail::Wrapper 2560 | { 2561 | public: 2562 | CommandQueue( 2563 | const Context& context, 2564 | const Device& device, 2565 | cl_command_queue_properties properties = 0, 2566 | cl_int* err = NULL) 2567 | { 2568 | cl_int error; 2569 | object_ = ::clCreateCommandQueue( 2570 | context(), device(), properties, &error); 2571 | 2572 | detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); 2573 | if (err != NULL) { 2574 | *err = error; 2575 | } 2576 | } 2577 | 2578 | CommandQueue() { } 2579 | 2580 | CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper(commandQueue) { } 2581 | 2582 | CommandQueue& operator = (const CommandQueue& rhs) 2583 | { 2584 | if (this != &rhs) { 2585 | detail::Wrapper::operator=(rhs); 2586 | } 2587 | return *this; 2588 | } 2589 | 2590 | template 2591 | cl_int getInfo(cl_command_queue_info name, T* param) const 2592 | { 2593 | return detail::errHandler( 2594 | detail::getInfo( 2595 | &::clGetCommandQueueInfo, object_, name, param), 2596 | __GET_COMMAND_QUEUE_INFO_ERR); 2597 | } 2598 | 2599 | template typename 2600 | detail::param_traits::param_type 2601 | getInfo(cl_int* err = NULL) const 2602 | { 2603 | typename detail::param_traits< 2604 | detail::cl_command_queue_info, name>::param_type param; 2605 | cl_int result = getInfo(name, ¶m); 2606 | if (err != NULL) { 2607 | *err = result; 2608 | } 2609 | return param; 2610 | } 2611 | 2612 | cl_int enqueueReadBuffer( 2613 | const Buffer& buffer, 2614 | cl_bool blocking, 2615 | ::size_t offset, 2616 | ::size_t size, 2617 | void* ptr, 2618 | const VECTOR_CLASS* events = NULL, 2619 | Event* event = NULL) const 2620 | { 2621 | return detail::errHandler( 2622 | ::clEnqueueReadBuffer( 2623 | object_, buffer(), blocking, offset, size, 2624 | ptr, 2625 | (events != NULL) ? (cl_uint) events->size() : 0, 2626 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2627 | (cl_event*) event), 2628 | __ENQUEUE_READ_BUFFER_ERR); 2629 | } 2630 | 2631 | cl_int enqueueWriteBuffer( 2632 | const Buffer& buffer, 2633 | cl_bool blocking, 2634 | ::size_t offset, 2635 | ::size_t size, 2636 | const void* ptr, 2637 | const VECTOR_CLASS* events = NULL, 2638 | Event* event = NULL) const 2639 | { 2640 | return detail::errHandler( 2641 | ::clEnqueueWriteBuffer( 2642 | object_, buffer(), blocking, offset, size, 2643 | ptr, 2644 | (events != NULL) ? (cl_uint) events->size() : 0, 2645 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2646 | (cl_event*) event), 2647 | __ENQUEUE_WRITE_BUFFER_ERR); 2648 | } 2649 | 2650 | cl_int enqueueCopyBuffer( 2651 | const Buffer& src, 2652 | const Buffer& dst, 2653 | ::size_t src_offset, 2654 | ::size_t dst_offset, 2655 | ::size_t size, 2656 | const VECTOR_CLASS* events = NULL, 2657 | Event* event = NULL) const 2658 | { 2659 | return detail::errHandler( 2660 | ::clEnqueueCopyBuffer( 2661 | object_, src(), dst(), src_offset, dst_offset, size, 2662 | (events != NULL) ? (cl_uint) events->size() : 0, 2663 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2664 | (cl_event*) event), 2665 | __ENQEUE_COPY_BUFFER_ERR); 2666 | } 2667 | 2668 | #if defined(CL_VERSION_1_1) 2669 | cl_int enqueueReadBufferRect( 2670 | const Buffer& buffer, 2671 | cl_bool blocking, 2672 | const size_t<3>& buffer_offset, 2673 | const size_t<3>& host_offset, 2674 | const size_t<3>& region, 2675 | ::size_t buffer_row_pitch, 2676 | ::size_t buffer_slice_pitch, 2677 | ::size_t host_row_pitch, 2678 | ::size_t host_slice_pitch, 2679 | void *ptr, 2680 | const VECTOR_CLASS* events = NULL, 2681 | Event* event = NULL) const 2682 | { 2683 | return detail::errHandler( 2684 | ::clEnqueueReadBufferRect( 2685 | object_, 2686 | buffer(), 2687 | blocking, 2688 | (const ::size_t *)buffer_offset, 2689 | (const ::size_t *)host_offset, 2690 | (const ::size_t *)region, 2691 | buffer_row_pitch, 2692 | buffer_slice_pitch, 2693 | host_row_pitch, 2694 | host_slice_pitch, 2695 | ptr, 2696 | (events != NULL) ? (cl_uint) events->size() : 0, 2697 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2698 | (cl_event*) event), 2699 | __ENQUEUE_READ_BUFFER_RECT_ERR); 2700 | } 2701 | 2702 | 2703 | cl_int enqueueWriteBufferRect( 2704 | const Buffer& buffer, 2705 | cl_bool blocking, 2706 | const size_t<3>& buffer_offset, 2707 | const size_t<3>& host_offset, 2708 | const size_t<3>& region, 2709 | ::size_t buffer_row_pitch, 2710 | ::size_t buffer_slice_pitch, 2711 | ::size_t host_row_pitch, 2712 | ::size_t host_slice_pitch, 2713 | void *ptr, 2714 | const VECTOR_CLASS* events = NULL, 2715 | Event* event = NULL) const 2716 | { 2717 | return detail::errHandler( 2718 | ::clEnqueueWriteBufferRect( 2719 | object_, 2720 | buffer(), 2721 | blocking, 2722 | (const ::size_t *)buffer_offset, 2723 | (const ::size_t *)host_offset, 2724 | (const ::size_t *)region, 2725 | buffer_row_pitch, 2726 | buffer_slice_pitch, 2727 | host_row_pitch, 2728 | host_slice_pitch, 2729 | ptr, 2730 | (events != NULL) ? (cl_uint) events->size() : 0, 2731 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2732 | (cl_event*) event), 2733 | __ENQUEUE_WRITE_BUFFER_RECT_ERR); 2734 | } 2735 | 2736 | cl_int enqueueCopyBufferRect( 2737 | const Buffer& src, 2738 | const Buffer& dst, 2739 | const size_t<3>& src_origin, 2740 | const size_t<3>& dst_origin, 2741 | const size_t<3>& region, 2742 | ::size_t src_row_pitch, 2743 | ::size_t src_slice_pitch, 2744 | ::size_t dst_row_pitch, 2745 | ::size_t dst_slice_pitch, 2746 | const VECTOR_CLASS* events = NULL, 2747 | Event* event = NULL) const 2748 | { 2749 | return detail::errHandler( 2750 | ::clEnqueueCopyBufferRect( 2751 | object_, 2752 | src(), 2753 | dst(), 2754 | (const ::size_t *)src_origin, 2755 | (const ::size_t *)dst_origin, 2756 | (const ::size_t *)region, 2757 | src_row_pitch, 2758 | src_slice_pitch, 2759 | dst_row_pitch, 2760 | dst_slice_pitch, 2761 | (events != NULL) ? (cl_uint) events->size() : 0, 2762 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2763 | (cl_event*) event), 2764 | __ENQEUE_COPY_BUFFER_RECT_ERR); 2765 | } 2766 | #endif 2767 | 2768 | cl_int enqueueReadImage( 2769 | const Image& image, 2770 | cl_bool blocking, 2771 | const size_t<3>& origin, 2772 | const size_t<3>& region, 2773 | ::size_t row_pitch, 2774 | ::size_t slice_pitch, 2775 | void* ptr, 2776 | const VECTOR_CLASS* events = NULL, 2777 | Event* event = NULL) const 2778 | { 2779 | return detail::errHandler( 2780 | ::clEnqueueReadImage( 2781 | object_, image(), blocking, (const ::size_t *) origin, 2782 | (const ::size_t *) region, row_pitch, slice_pitch, ptr, 2783 | (events != NULL) ? (cl_uint) events->size() : 0, 2784 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2785 | (cl_event*) event), 2786 | __ENQUEUE_READ_IMAGE_ERR); 2787 | } 2788 | 2789 | cl_int enqueueWriteImage( 2790 | const Image& image, 2791 | cl_bool blocking, 2792 | const size_t<3>& origin, 2793 | const size_t<3>& region, 2794 | ::size_t row_pitch, 2795 | ::size_t slice_pitch, 2796 | void* ptr, 2797 | const VECTOR_CLASS* events = NULL, 2798 | Event* event = NULL) const 2799 | { 2800 | return detail::errHandler( 2801 | ::clEnqueueWriteImage( 2802 | object_, image(), blocking, (const ::size_t *) origin, 2803 | (const ::size_t *) region, row_pitch, slice_pitch, ptr, 2804 | (events != NULL) ? (cl_uint) events->size() : 0, 2805 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2806 | (cl_event*) event), 2807 | __ENQUEUE_WRITE_IMAGE_ERR); 2808 | } 2809 | 2810 | cl_int enqueueCopyImage( 2811 | const Image& src, 2812 | const Image& dst, 2813 | const size_t<3>& src_origin, 2814 | const size_t<3>& dst_origin, 2815 | const size_t<3>& region, 2816 | const VECTOR_CLASS* events = NULL, 2817 | Event* event = NULL) const 2818 | { 2819 | return detail::errHandler( 2820 | ::clEnqueueCopyImage( 2821 | object_, src(), dst(), (const ::size_t *) src_origin, 2822 | (const ::size_t *)dst_origin, (const ::size_t *) region, 2823 | (events != NULL) ? (cl_uint) events->size() : 0, 2824 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2825 | (cl_event*) event), 2826 | __ENQUEUE_COPY_IMAGE_ERR); 2827 | } 2828 | 2829 | cl_int enqueueCopyImageToBuffer( 2830 | const Image& src, 2831 | const Buffer& dst, 2832 | const size_t<3>& src_origin, 2833 | const size_t<3>& region, 2834 | ::size_t dst_offset, 2835 | const VECTOR_CLASS* events = NULL, 2836 | Event* event = NULL) const 2837 | { 2838 | return detail::errHandler( 2839 | ::clEnqueueCopyImageToBuffer( 2840 | object_, src(), dst(), (const ::size_t *) src_origin, 2841 | (const ::size_t *) region, dst_offset, 2842 | (events != NULL) ? (cl_uint) events->size() : 0, 2843 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2844 | (cl_event*) event), 2845 | __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR); 2846 | } 2847 | 2848 | cl_int enqueueCopyBufferToImage( 2849 | const Buffer& src, 2850 | const Image& dst, 2851 | ::size_t src_offset, 2852 | const size_t<3>& dst_origin, 2853 | const size_t<3>& region, 2854 | const VECTOR_CLASS* events = NULL, 2855 | Event* event = NULL) const 2856 | { 2857 | return detail::errHandler( 2858 | ::clEnqueueCopyBufferToImage( 2859 | object_, src(), dst(), src_offset, 2860 | (const ::size_t *) dst_origin, (const ::size_t *) region, 2861 | (events != NULL) ? (cl_uint) events->size() : 0, 2862 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2863 | (cl_event*) event), 2864 | __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR); 2865 | } 2866 | 2867 | void* enqueueMapBuffer( 2868 | const Buffer& buffer, 2869 | cl_bool blocking, 2870 | cl_map_flags flags, 2871 | ::size_t offset, 2872 | ::size_t size, 2873 | const VECTOR_CLASS* events = NULL, 2874 | Event* event = NULL, 2875 | cl_int* err = NULL) const 2876 | { 2877 | cl_int error; 2878 | void * result = ::clEnqueueMapBuffer( 2879 | object_, buffer(), blocking, flags, offset, size, 2880 | (events != NULL) ? (cl_uint) events->size() : 0, 2881 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2882 | (cl_event*) event, 2883 | &error); 2884 | 2885 | detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); 2886 | if (err != NULL) { 2887 | *err = error; 2888 | } 2889 | return result; 2890 | } 2891 | 2892 | void* enqueueMapImage( 2893 | const Image& buffer, 2894 | cl_bool blocking, 2895 | cl_map_flags flags, 2896 | const size_t<3>& origin, 2897 | const size_t<3>& region, 2898 | ::size_t * row_pitch, 2899 | ::size_t * slice_pitch, 2900 | const VECTOR_CLASS* events = NULL, 2901 | Event* event = NULL, 2902 | cl_int* err = NULL) const 2903 | { 2904 | cl_int error; 2905 | void * result = ::clEnqueueMapImage( 2906 | object_, buffer(), blocking, flags, 2907 | (const ::size_t *) origin, (const ::size_t *) region, 2908 | row_pitch, slice_pitch, 2909 | (events != NULL) ? (cl_uint) events->size() : 0, 2910 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2911 | (cl_event*) event, 2912 | &error); 2913 | 2914 | detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR); 2915 | if (err != NULL) { 2916 | *err = error; 2917 | } 2918 | return result; 2919 | } 2920 | 2921 | cl_int enqueueUnmapMemObject( 2922 | const Memory& memory, 2923 | void* mapped_ptr, 2924 | const VECTOR_CLASS* events = NULL, 2925 | Event* event = NULL) const 2926 | { 2927 | return detail::errHandler( 2928 | ::clEnqueueUnmapMemObject( 2929 | object_, memory(), mapped_ptr, 2930 | (events != NULL) ? (cl_uint) events->size() : 0, 2931 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2932 | (cl_event*) event), 2933 | __ENQUEUE_UNMAP_MEM_OBJECT_ERR); 2934 | } 2935 | 2936 | cl_int enqueueNDRangeKernel( 2937 | const Kernel& kernel, 2938 | const NDRange& offset, 2939 | const NDRange& global, 2940 | const NDRange& local, 2941 | const VECTOR_CLASS* events = NULL, 2942 | Event* event = NULL) const 2943 | { 2944 | return detail::errHandler( 2945 | ::clEnqueueNDRangeKernel( 2946 | object_, kernel(), (cl_uint) global.dimensions(), 2947 | offset.dimensions() != 0 ? (const ::size_t*) offset : NULL, 2948 | (const ::size_t*) global, 2949 | local.dimensions() != 0 ? (const ::size_t*) local : NULL, 2950 | (events != NULL) ? (cl_uint) events->size() : 0, 2951 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2952 | (cl_event*) event), 2953 | __ENQUEUE_NDRANGE_KERNEL_ERR); 2954 | } 2955 | 2956 | cl_int enqueueTask( 2957 | const Kernel& kernel, 2958 | const VECTOR_CLASS* events = NULL, 2959 | Event* event = NULL) const 2960 | { 2961 | return detail::errHandler( 2962 | ::clEnqueueTask( 2963 | object_, kernel(), 2964 | (events != NULL) ? (cl_uint) events->size() : 0, 2965 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2966 | (cl_event*) event), 2967 | __ENQUEUE_TASK_ERR); 2968 | } 2969 | 2970 | cl_int enqueueNativeKernel( 2971 | void (*userFptr)(void *), 2972 | std::pair args, 2973 | const VECTOR_CLASS* mem_objects = NULL, 2974 | const VECTOR_CLASS* mem_locs = NULL, 2975 | const VECTOR_CLASS* events = NULL, 2976 | Event* event = NULL) const 2977 | { 2978 | cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0) 2979 | ? (cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem)) 2980 | : NULL; 2981 | 2982 | if (mems != NULL) { 2983 | for (unsigned int i = 0; i < mem_objects->size(); i++) { 2984 | mems[i] = ((*mem_objects)[i])(); 2985 | } 2986 | } 2987 | 2988 | return detail::errHandler( 2989 | ::clEnqueueNativeKernel( 2990 | object_, userFptr, args.first, args.second, 2991 | (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, 2992 | mems, 2993 | (mem_locs != NULL) ? (const void **) &mem_locs->front() : NULL, 2994 | (events != NULL) ? (cl_uint) events->size() : 0, 2995 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 2996 | (cl_event*) event), 2997 | __ENQUEUE_NATIVE_KERNEL); 2998 | } 2999 | 3000 | cl_int enqueueMarker(Event* event = NULL) const 3001 | { 3002 | return detail::errHandler( 3003 | ::clEnqueueMarker(object_, (cl_event*) event), 3004 | __ENQUEUE_MARKER_ERR); 3005 | } 3006 | 3007 | cl_int enqueueWaitForEvents(const VECTOR_CLASS& events) const 3008 | { 3009 | return detail::errHandler( 3010 | ::clEnqueueWaitForEvents( 3011 | object_, 3012 | (cl_uint) events.size(), 3013 | (const cl_event*) &events.front()), 3014 | __ENQUEUE_WAIT_FOR_EVENTS_ERR); 3015 | } 3016 | 3017 | cl_int enqueueAcquireGLObjects( 3018 | const VECTOR_CLASS* mem_objects = NULL, 3019 | const VECTOR_CLASS* events = NULL, 3020 | Event* event = NULL) const 3021 | { 3022 | return detail::errHandler( 3023 | ::clEnqueueAcquireGLObjects( 3024 | object_, 3025 | (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, 3026 | (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL, 3027 | (events != NULL) ? (cl_uint) events->size() : 0, 3028 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 3029 | (cl_event*) event), 3030 | __ENQUEUE_ACQUIRE_GL_ERR); 3031 | } 3032 | 3033 | cl_int enqueueReleaseGLObjects( 3034 | const VECTOR_CLASS* mem_objects = NULL, 3035 | const VECTOR_CLASS* events = NULL, 3036 | Event* event = NULL) const 3037 | { 3038 | return detail::errHandler( 3039 | ::clEnqueueReleaseGLObjects( 3040 | object_, 3041 | (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, 3042 | (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL, 3043 | (events != NULL) ? (cl_uint) events->size() : 0, 3044 | (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, 3045 | (cl_event*) event), 3046 | __ENQUEUE_RELEASE_GL_ERR); 3047 | } 3048 | 3049 | #if defined (USE_DX_INTEROP) 3050 | typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)( 3051 | cl_command_queue command_queue, cl_uint num_objects, 3052 | const cl_mem* mem_objects, cl_uint num_events_in_wait_list, 3053 | const cl_event* event_wait_list, cl_event* event); 3054 | typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)( 3055 | cl_command_queue command_queue, cl_uint num_objects, 3056 | const cl_mem* mem_objects, cl_uint num_events_in_wait_list, 3057 | const cl_event* event_wait_list, cl_event* event); 3058 | 3059 | cl_int enqueueAcquireD3D10Objects( 3060 | const VECTOR_CLASS* mem_objects = NULL, 3061 | const VECTOR_CLASS* events = NULL, 3062 | Event* event = NULL) const 3063 | { 3064 | static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL; 3065 | __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR); 3066 | 3067 | return detail::errHandler( 3068 | pfn_clEnqueueAcquireD3D10ObjectsKHR( 3069 | object_, 3070 | (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, 3071 | (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL, 3072 | (events != NULL) ? (cl_uint) events->size() : 0, 3073 | (events != NULL) ? (cl_event*) &events->front() : NULL, 3074 | (cl_event*) event), 3075 | __ENQUEUE_ACQUIRE_GL_ERR); 3076 | } 3077 | 3078 | cl_int enqueueReleaseD3D10Objects( 3079 | const VECTOR_CLASS* mem_objects = NULL, 3080 | const VECTOR_CLASS* events = NULL, 3081 | Event* event = NULL) const 3082 | { 3083 | static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL; 3084 | __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR); 3085 | 3086 | return detail::errHandler( 3087 | pfn_clEnqueueReleaseD3D10ObjectsKHR( 3088 | object_, 3089 | (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, 3090 | (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL, 3091 | (events != NULL) ? (cl_uint) events->size() : 0, 3092 | (events != NULL) ? (cl_event*) &events->front() : NULL, 3093 | (cl_event*) event), 3094 | __ENQUEUE_RELEASE_GL_ERR); 3095 | } 3096 | #endif 3097 | 3098 | cl_int enqueueBarrier() const 3099 | { 3100 | return detail::errHandler( 3101 | ::clEnqueueBarrier(object_), 3102 | __ENQUEUE_BARRIER_ERR); 3103 | } 3104 | 3105 | cl_int flush() const 3106 | { 3107 | return detail::errHandler(::clFlush(object_), __FLUSH_ERR); 3108 | } 3109 | 3110 | cl_int finish() const 3111 | { 3112 | return detail::errHandler(::clFinish(object_), __FINISH_ERR); 3113 | } 3114 | }; 3115 | 3116 | __GET_INFO_HELPER_WITH_RETAIN(cl::CommandQueue) 3117 | 3118 | /*! \class KernelFunctor 3119 | * \brief Kernel functor interface 3120 | * 3121 | * \note Currently only functors of zero to ten arguments are supported. It 3122 | * is straightforward to add more and a more general solution, similar to 3123 | * Boost.Lambda could be followed if required in the future. 3124 | */ 3125 | class KernelFunctor 3126 | { 3127 | private: 3128 | Kernel kernel_; 3129 | CommandQueue queue_; 3130 | NDRange offset_; 3131 | NDRange global_; 3132 | NDRange local_; 3133 | 3134 | cl_int err_; 3135 | public: 3136 | KernelFunctor() { } 3137 | 3138 | KernelFunctor( 3139 | const Kernel& kernel, 3140 | const CommandQueue& queue, 3141 | const NDRange& offset, 3142 | const NDRange& global, 3143 | const NDRange& local) : 3144 | kernel_(kernel), 3145 | queue_(queue), 3146 | offset_(offset), 3147 | global_(global), 3148 | local_(local), 3149 | err_(CL_SUCCESS) 3150 | {} 3151 | 3152 | KernelFunctor& operator=(const KernelFunctor& rhs); 3153 | 3154 | KernelFunctor(const KernelFunctor& rhs); 3155 | 3156 | cl_int getError() { return err_; } 3157 | 3158 | inline Event operator()(const VECTOR_CLASS* events = NULL); 3159 | 3160 | template 3161 | inline Event operator()( 3162 | const A1& a1, 3163 | const VECTOR_CLASS* events = NULL); 3164 | 3165 | template 3166 | inline Event operator()( 3167 | const A1& a1, 3168 | const A2& a2, 3169 | const VECTOR_CLASS* events = NULL); 3170 | 3171 | template 3172 | inline Event operator()( 3173 | const A1& a1, 3174 | const A2& a2, 3175 | const A3& a3, 3176 | const VECTOR_CLASS* events = NULL); 3177 | 3178 | template 3179 | inline Event operator()( 3180 | const A1& a1, 3181 | const A2& a2, 3182 | const A3& a3, 3183 | const A4& a4, 3184 | const VECTOR_CLASS* events = NULL); 3185 | 3186 | template 3187 | inline Event operator()( 3188 | const A1& a1, 3189 | const A2& a2, 3190 | const A3& a3, 3191 | const A4& a4, 3192 | const A5& a5, 3193 | const VECTOR_CLASS* events = NULL); 3194 | 3195 | template 3196 | inline Event operator()( 3197 | const A1& a1, 3198 | const A2& a2, 3199 | const A3& a3, 3200 | const A4& a4, 3201 | const A5& a5, 3202 | const A6& a6, 3203 | const VECTOR_CLASS* events = NULL); 3204 | 3205 | template 3207 | inline Event operator()( 3208 | const A1& a1, 3209 | const A2& a2, 3210 | const A3& a3, 3211 | const A4& a4, 3212 | const A5& a5, 3213 | const A6& a6, 3214 | const A7& a7, 3215 | const VECTOR_CLASS* events = NULL); 3216 | 3217 | template 3219 | inline Event operator()( 3220 | const A1& a1, 3221 | const A2& a2, 3222 | const A3& a3, 3223 | const A4& a4, 3224 | const A5& a5, 3225 | const A6& a6, 3226 | const A7& a7, 3227 | const A8& a8, 3228 | const VECTOR_CLASS* events = NULL); 3229 | 3230 | template 3232 | inline Event operator()( 3233 | const A1& a1, 3234 | const A2& a2, 3235 | const A3& a3, 3236 | const A4& a4, 3237 | const A5& a5, 3238 | const A6& a6, 3239 | const A7& a7, 3240 | const A8& a8, 3241 | const A9& a9, 3242 | const VECTOR_CLASS* events = NULL); 3243 | 3244 | template 3246 | inline Event operator()( 3247 | const A1& a1, 3248 | const A2& a2, 3249 | const A3& a3, 3250 | const A4& a4, 3251 | const A5& a5, 3252 | const A6& a6, 3253 | const A7& a7, 3254 | const A8& a8, 3255 | const A9& a9, 3256 | const A10& a10, 3257 | const VECTOR_CLASS* events = NULL); 3258 | 3259 | template 3262 | inline Event operator()( 3263 | const A1& a1, 3264 | const A2& a2, 3265 | const A3& a3, 3266 | const A4& a4, 3267 | const A5& a5, 3268 | const A6& a6, 3269 | const A7& a7, 3270 | const A8& a8, 3271 | const A9& a9, 3272 | const A10& a10, 3273 | const A11& a11, 3274 | const VECTOR_CLASS* events = NULL); 3275 | 3276 | template 3279 | inline Event operator()( 3280 | const A1& a1, 3281 | const A2& a2, 3282 | const A3& a3, 3283 | const A4& a4, 3284 | const A5& a5, 3285 | const A6& a6, 3286 | const A7& a7, 3287 | const A8& a8, 3288 | const A9& a9, 3289 | const A10& a10, 3290 | const A11& a11, 3291 | const A12& a12, 3292 | const VECTOR_CLASS* events = NULL); 3293 | 3294 | template 3297 | inline Event operator()( 3298 | const A1& a1, 3299 | const A2& a2, 3300 | const A3& a3, 3301 | const A4& a4, 3302 | const A5& a5, 3303 | const A6& a6, 3304 | const A7& a7, 3305 | const A8& a8, 3306 | const A9& a9, 3307 | const A10& a10, 3308 | const A11& a11, 3309 | const A12& a12, 3310 | const A13& a13, 3311 | const VECTOR_CLASS* events = NULL); 3312 | 3313 | template 3316 | inline Event operator()( 3317 | const A1& a1, 3318 | const A2& a2, 3319 | const A3& a3, 3320 | const A4& a4, 3321 | const A5& a5, 3322 | const A6& a6, 3323 | const A7& a7, 3324 | const A8& a8, 3325 | const A9& a9, 3326 | const A10& a10, 3327 | const A11& a11, 3328 | const A12& a12, 3329 | const A13& a13, 3330 | const A14& a14, 3331 | const VECTOR_CLASS* events = NULL); 3332 | 3333 | template 3336 | inline Event operator()( 3337 | const A1& a1, 3338 | const A2& a2, 3339 | const A3& a3, 3340 | const A4& a4, 3341 | const A5& a5, 3342 | const A6& a6, 3343 | const A7& a7, 3344 | const A8& a8, 3345 | const A9& a9, 3346 | const A10& a10, 3347 | const A11& a11, 3348 | const A12& a12, 3349 | const A13& a13, 3350 | const A14& a14, 3351 | const A15& a15, 3352 | const VECTOR_CLASS* events = NULL); 3353 | }; 3354 | 3355 | inline KernelFunctor Kernel::bind( 3356 | const CommandQueue& queue, 3357 | const NDRange& offset, 3358 | const NDRange& global, 3359 | const NDRange& local) 3360 | { 3361 | return KernelFunctor(*this,queue,offset,global,local); 3362 | } 3363 | 3364 | inline KernelFunctor Kernel::bind( 3365 | const CommandQueue& queue, 3366 | const NDRange& global, 3367 | const NDRange& local) 3368 | { 3369 | return KernelFunctor(*this,queue,NullRange,global,local); 3370 | } 3371 | 3372 | inline KernelFunctor& KernelFunctor::operator=(const KernelFunctor& rhs) 3373 | { 3374 | if (this == &rhs) { 3375 | return *this; 3376 | } 3377 | 3378 | kernel_ = rhs.kernel_; 3379 | queue_ = rhs.queue_; 3380 | offset_ = rhs.offset_; 3381 | global_ = rhs.global_; 3382 | local_ = rhs.local_; 3383 | 3384 | return *this; 3385 | } 3386 | 3387 | inline KernelFunctor::KernelFunctor(const KernelFunctor& rhs) : 3388 | kernel_(rhs.kernel_), 3389 | queue_(rhs.queue_), 3390 | offset_(rhs.offset_), 3391 | global_(rhs.global_), 3392 | local_(rhs.local_) 3393 | { 3394 | } 3395 | 3396 | Event KernelFunctor::operator()(const VECTOR_CLASS* events) 3397 | { 3398 | Event event; 3399 | 3400 | err_ = queue_.enqueueNDRangeKernel( 3401 | kernel_, 3402 | offset_, 3403 | global_, 3404 | local_, 3405 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3406 | &event); 3407 | 3408 | return event; 3409 | } 3410 | 3411 | template 3412 | Event KernelFunctor::operator()( 3413 | const A1& a1, 3414 | const VECTOR_CLASS* events) 3415 | { 3416 | Event event; 3417 | 3418 | kernel_.setArg(0,a1); 3419 | 3420 | err_ = queue_.enqueueNDRangeKernel( 3421 | kernel_, 3422 | offset_, 3423 | global_, 3424 | local_, 3425 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3426 | &event); 3427 | 3428 | return event; 3429 | } 3430 | 3431 | template 3432 | Event KernelFunctor::operator()( 3433 | const A1& a1, 3434 | const A2& a2, 3435 | const VECTOR_CLASS* events) 3436 | { 3437 | Event event; 3438 | 3439 | kernel_.setArg(0,a1); 3440 | kernel_.setArg(1,a2); 3441 | 3442 | err_ = queue_.enqueueNDRangeKernel( 3443 | kernel_, 3444 | offset_, 3445 | global_, 3446 | local_, 3447 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3448 | &event); 3449 | 3450 | return event; 3451 | } 3452 | 3453 | template 3454 | Event KernelFunctor::operator()( 3455 | const A1& a1, 3456 | const A2& a2, 3457 | const A3& a3, 3458 | const VECTOR_CLASS* events) 3459 | { 3460 | Event event; 3461 | 3462 | kernel_.setArg(0,a1); 3463 | kernel_.setArg(1,a2); 3464 | kernel_.setArg(2,a3); 3465 | 3466 | err_ = queue_.enqueueNDRangeKernel( 3467 | kernel_, 3468 | offset_, 3469 | global_, 3470 | local_, 3471 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3472 | &event); 3473 | 3474 | return event; 3475 | } 3476 | 3477 | template 3478 | Event KernelFunctor::operator()( 3479 | const A1& a1, 3480 | const A2& a2, 3481 | const A3& a3, 3482 | const A4& a4, 3483 | const VECTOR_CLASS* events) 3484 | { 3485 | Event event; 3486 | 3487 | kernel_.setArg(0,a1); 3488 | kernel_.setArg(1,a2); 3489 | kernel_.setArg(2,a3); 3490 | kernel_.setArg(3,a4); 3491 | 3492 | err_ = queue_.enqueueNDRangeKernel( 3493 | kernel_, 3494 | offset_, 3495 | global_, 3496 | local_, 3497 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3498 | &event); 3499 | 3500 | return event; 3501 | } 3502 | 3503 | template 3504 | Event KernelFunctor::operator()( 3505 | const A1& a1, 3506 | const A2& a2, 3507 | const A3& a3, 3508 | const A4& a4, 3509 | const A5& a5, 3510 | const VECTOR_CLASS* events) 3511 | { 3512 | Event event; 3513 | 3514 | kernel_.setArg(0,a1); 3515 | kernel_.setArg(1,a2); 3516 | kernel_.setArg(2,a3); 3517 | kernel_.setArg(3,a4); 3518 | kernel_.setArg(4,a5); 3519 | 3520 | err_ = queue_.enqueueNDRangeKernel( 3521 | kernel_, 3522 | offset_, 3523 | global_, 3524 | local_, 3525 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3526 | &event); 3527 | 3528 | return event; 3529 | } 3530 | 3531 | template 3533 | Event KernelFunctor::operator()( 3534 | const A1& a1, 3535 | const A2& a2, 3536 | const A3& a3, 3537 | const A4& a4, 3538 | const A5& a5, 3539 | const A6& a6, 3540 | const VECTOR_CLASS* events) 3541 | { 3542 | Event event; 3543 | 3544 | kernel_.setArg(0,a1); 3545 | kernel_.setArg(1,a2); 3546 | kernel_.setArg(2,a3); 3547 | kernel_.setArg(3,a4); 3548 | kernel_.setArg(4,a5); 3549 | kernel_.setArg(5,a6); 3550 | 3551 | err_ = queue_.enqueueNDRangeKernel( 3552 | kernel_, 3553 | offset_, 3554 | global_, 3555 | local_, 3556 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3557 | &event); 3558 | 3559 | return event; 3560 | } 3561 | 3562 | template 3564 | Event KernelFunctor::operator()( 3565 | const A1& a1, 3566 | const A2& a2, 3567 | const A3& a3, 3568 | const A4& a4, 3569 | const A5& a5, 3570 | const A6& a6, 3571 | const A7& a7, 3572 | const VECTOR_CLASS* events) 3573 | { 3574 | Event event; 3575 | 3576 | kernel_.setArg(0,a1); 3577 | kernel_.setArg(1,a2); 3578 | kernel_.setArg(2,a3); 3579 | kernel_.setArg(3,a4); 3580 | kernel_.setArg(4,a5); 3581 | kernel_.setArg(5,a6); 3582 | kernel_.setArg(6,a7); 3583 | 3584 | err_ = queue_.enqueueNDRangeKernel( 3585 | kernel_, 3586 | offset_, 3587 | global_, 3588 | local_, 3589 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3590 | &event); 3591 | 3592 | return event; 3593 | } 3594 | 3595 | template 3597 | Event KernelFunctor::operator()( 3598 | const A1& a1, 3599 | const A2& a2, 3600 | const A3& a3, 3601 | const A4& a4, 3602 | const A5& a5, 3603 | const A6& a6, 3604 | const A7& a7, 3605 | const A8& a8, 3606 | const VECTOR_CLASS* events) 3607 | { 3608 | Event event; 3609 | 3610 | kernel_.setArg(0,a1); 3611 | kernel_.setArg(1,a2); 3612 | kernel_.setArg(2,a3); 3613 | kernel_.setArg(3,a4); 3614 | kernel_.setArg(4,a5); 3615 | kernel_.setArg(5,a6); 3616 | kernel_.setArg(6,a7); 3617 | kernel_.setArg(7,a8); 3618 | 3619 | err_ = queue_.enqueueNDRangeKernel( 3620 | kernel_, 3621 | offset_, 3622 | global_, 3623 | local_, 3624 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3625 | &event); 3626 | 3627 | return event; 3628 | } 3629 | 3630 | template 3632 | Event KernelFunctor::operator()( 3633 | const A1& a1, 3634 | const A2& a2, 3635 | const A3& a3, 3636 | const A4& a4, 3637 | const A5& a5, 3638 | const A6& a6, 3639 | const A7& a7, 3640 | const A8& a8, 3641 | const A9& a9, 3642 | const VECTOR_CLASS* events) 3643 | { 3644 | Event event; 3645 | 3646 | kernel_.setArg(0,a1); 3647 | kernel_.setArg(1,a2); 3648 | kernel_.setArg(2,a3); 3649 | kernel_.setArg(3,a4); 3650 | kernel_.setArg(4,a5); 3651 | kernel_.setArg(5,a6); 3652 | kernel_.setArg(6,a7); 3653 | kernel_.setArg(7,a8); 3654 | kernel_.setArg(8,a9); 3655 | 3656 | err_ = queue_.enqueueNDRangeKernel( 3657 | kernel_, 3658 | offset_, 3659 | global_, 3660 | local_, 3661 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3662 | &event); 3663 | 3664 | return event; 3665 | } 3666 | 3667 | template 3669 | Event KernelFunctor::operator()( 3670 | const A1& a1, 3671 | const A2& a2, 3672 | const A3& a3, 3673 | const A4& a4, 3674 | const A5& a5, 3675 | const A6& a6, 3676 | const A7& a7, 3677 | const A8& a8, 3678 | const A9& a9, 3679 | const A10& a10, 3680 | const VECTOR_CLASS* events) 3681 | { 3682 | Event event; 3683 | 3684 | kernel_.setArg(0,a1); 3685 | kernel_.setArg(1,a2); 3686 | kernel_.setArg(2,a3); 3687 | kernel_.setArg(3,a4); 3688 | kernel_.setArg(4,a5); 3689 | kernel_.setArg(5,a6); 3690 | kernel_.setArg(6,a7); 3691 | kernel_.setArg(7,a8); 3692 | kernel_.setArg(8,a9); 3693 | kernel_.setArg(9,a10); 3694 | 3695 | err_ = queue_.enqueueNDRangeKernel( 3696 | kernel_, 3697 | offset_, 3698 | global_, 3699 | local_, 3700 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3701 | &event); 3702 | 3703 | return event; 3704 | } 3705 | 3706 | template 3709 | Event KernelFunctor::operator()( 3710 | const A1& a1, 3711 | const A2& a2, 3712 | const A3& a3, 3713 | const A4& a4, 3714 | const A5& a5, 3715 | const A6& a6, 3716 | const A7& a7, 3717 | const A8& a8, 3718 | const A9& a9, 3719 | const A10& a10, 3720 | const A11& a11, 3721 | const VECTOR_CLASS* events) 3722 | { 3723 | Event event; 3724 | 3725 | kernel_.setArg(0,a1); 3726 | kernel_.setArg(1,a2); 3727 | kernel_.setArg(2,a3); 3728 | kernel_.setArg(3,a4); 3729 | kernel_.setArg(4,a5); 3730 | kernel_.setArg(5,a6); 3731 | kernel_.setArg(6,a7); 3732 | kernel_.setArg(7,a8); 3733 | kernel_.setArg(8,a9); 3734 | kernel_.setArg(9,a10); 3735 | kernel_.setArg(10,a11); 3736 | 3737 | err_ = queue_.enqueueNDRangeKernel( 3738 | kernel_, 3739 | offset_, 3740 | global_, 3741 | local_, 3742 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3743 | &event); 3744 | 3745 | return event; 3746 | } 3747 | 3748 | template 3751 | Event KernelFunctor::operator()( 3752 | const A1& a1, 3753 | const A2& a2, 3754 | const A3& a3, 3755 | const A4& a4, 3756 | const A5& a5, 3757 | const A6& a6, 3758 | const A7& a7, 3759 | const A8& a8, 3760 | const A9& a9, 3761 | const A10& a10, 3762 | const A11& a11, 3763 | const A12& a12, 3764 | const VECTOR_CLASS* events) 3765 | { 3766 | Event event; 3767 | 3768 | kernel_.setArg(0,a1); 3769 | kernel_.setArg(1,a2); 3770 | kernel_.setArg(2,a3); 3771 | kernel_.setArg(3,a4); 3772 | kernel_.setArg(4,a5); 3773 | kernel_.setArg(5,a6); 3774 | kernel_.setArg(6,a7); 3775 | kernel_.setArg(7,a8); 3776 | kernel_.setArg(8,a9); 3777 | kernel_.setArg(9,a10); 3778 | kernel_.setArg(10,a11); 3779 | kernel_.setArg(11,a12); 3780 | 3781 | err_ = queue_.enqueueNDRangeKernel( 3782 | kernel_, 3783 | offset_, 3784 | global_, 3785 | local_, 3786 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3787 | &event); 3788 | 3789 | return event; 3790 | } 3791 | 3792 | template 3795 | Event KernelFunctor::operator()( 3796 | const A1& a1, 3797 | const A2& a2, 3798 | const A3& a3, 3799 | const A4& a4, 3800 | const A5& a5, 3801 | const A6& a6, 3802 | const A7& a7, 3803 | const A8& a8, 3804 | const A9& a9, 3805 | const A10& a10, 3806 | const A11& a11, 3807 | const A12& a12, 3808 | const A13& a13, 3809 | const VECTOR_CLASS* events) 3810 | { 3811 | Event event; 3812 | 3813 | kernel_.setArg(0,a1); 3814 | kernel_.setArg(1,a2); 3815 | kernel_.setArg(2,a3); 3816 | kernel_.setArg(3,a4); 3817 | kernel_.setArg(4,a5); 3818 | kernel_.setArg(5,a6); 3819 | kernel_.setArg(6,a7); 3820 | kernel_.setArg(7,a8); 3821 | kernel_.setArg(8,a9); 3822 | kernel_.setArg(9,a10); 3823 | kernel_.setArg(10,a11); 3824 | kernel_.setArg(11,a12); 3825 | kernel_.setArg(12,a13); 3826 | 3827 | err_ = queue_.enqueueNDRangeKernel( 3828 | kernel_, 3829 | offset_, 3830 | global_, 3831 | local_, 3832 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3833 | &event); 3834 | 3835 | return event; 3836 | } 3837 | 3838 | template 3841 | Event KernelFunctor::operator()( 3842 | const A1& a1, 3843 | const A2& a2, 3844 | const A3& a3, 3845 | const A4& a4, 3846 | const A5& a5, 3847 | const A6& a6, 3848 | const A7& a7, 3849 | const A8& a8, 3850 | const A9& a9, 3851 | const A10& a10, 3852 | const A11& a11, 3853 | const A12& a12, 3854 | const A13& a13, 3855 | const A14& a14, 3856 | const VECTOR_CLASS* events) 3857 | { 3858 | Event event; 3859 | 3860 | kernel_.setArg(0,a1); 3861 | kernel_.setArg(1,a2); 3862 | kernel_.setArg(2,a3); 3863 | kernel_.setArg(3,a4); 3864 | kernel_.setArg(4,a5); 3865 | kernel_.setArg(5,a6); 3866 | kernel_.setArg(6,a7); 3867 | kernel_.setArg(7,a8); 3868 | kernel_.setArg(8,a9); 3869 | kernel_.setArg(9,a10); 3870 | kernel_.setArg(10,a11); 3871 | kernel_.setArg(11,a12); 3872 | kernel_.setArg(12,a13); 3873 | kernel_.setArg(13,a14); 3874 | 3875 | err_ = queue_.enqueueNDRangeKernel( 3876 | kernel_, 3877 | offset_, 3878 | global_, 3879 | local_, 3880 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3881 | &event); 3882 | 3883 | return event; 3884 | } 3885 | 3886 | template 3889 | Event KernelFunctor::operator()( 3890 | const A1& a1, 3891 | const A2& a2, 3892 | const A3& a3, 3893 | const A4& a4, 3894 | const A5& a5, 3895 | const A6& a6, 3896 | const A7& a7, 3897 | const A8& a8, 3898 | const A9& a9, 3899 | const A10& a10, 3900 | const A11& a11, 3901 | const A12& a12, 3902 | const A13& a13, 3903 | const A14& a14, 3904 | const A15& a15, 3905 | const VECTOR_CLASS* events) 3906 | { 3907 | Event event; 3908 | 3909 | kernel_.setArg(0,a1); 3910 | kernel_.setArg(1,a2); 3911 | kernel_.setArg(2,a3); 3912 | kernel_.setArg(3,a4); 3913 | kernel_.setArg(4,a5); 3914 | kernel_.setArg(5,a6); 3915 | kernel_.setArg(6,a7); 3916 | kernel_.setArg(7,a8); 3917 | kernel_.setArg(8,a9); 3918 | kernel_.setArg(9,a10); 3919 | kernel_.setArg(10,a11); 3920 | kernel_.setArg(11,a12); 3921 | kernel_.setArg(12,a13); 3922 | kernel_.setArg(13,a14); 3923 | kernel_.setArg(14,a15); 3924 | 3925 | err_ = queue_.enqueueNDRangeKernel( 3926 | kernel_, 3927 | offset_, 3928 | global_, 3929 | local_, 3930 | NULL, // bgaster_fixme - do we want to allow wait event lists? 3931 | &event); 3932 | 3933 | return event; 3934 | } 3935 | 3936 | #undef __ERR_STR 3937 | #if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) 3938 | #undef __GET_DEVICE_INFO_ERR 3939 | #undef __GET_PLATFORM_INFO_ERR 3940 | #undef __GET_DEVICE_IDS_ERR 3941 | #undef __GET_CONTEXT_INFO_ERR 3942 | #undef __GET_EVENT_INFO_ERR 3943 | #undef __GET_EVENT_PROFILE_INFO_ERR 3944 | #undef __GET_MEM_OBJECT_INFO_ERR 3945 | #undef __GET_IMAGE_INFO_ERR 3946 | #undef __GET_SAMPLER_INFO_ERR 3947 | #undef __GET_KERNEL_INFO_ERR 3948 | #undef __GET_KERNEL_WORK_GROUP_INFO_ERR 3949 | #undef __GET_PROGRAM_INFO_ERR 3950 | #undef __GET_PROGRAM_BUILD_INFO_ERR 3951 | #undef __GET_COMMAND_QUEUE_INFO_ERR 3952 | 3953 | #undef __CREATE_CONTEXT_FROM_TYPE_ERR 3954 | #undef __GET_SUPPORTED_IMAGE_FORMATS_ERR 3955 | 3956 | #undef __CREATE_BUFFER_ERR 3957 | #undef __CREATE_SUBBUFFER_ERR 3958 | #undef __CREATE_IMAGE2D_ERR 3959 | #undef __CREATE_IMAGE3D_ERR 3960 | #undef __CREATE_SAMPLER_ERR 3961 | #undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR 3962 | 3963 | #undef __CREATE_USER_EVENT_ERR 3964 | #undef __SET_USER_EVENT_STATUS_ERR 3965 | #undef __SET_EVENT_CALLBACK_ERR 3966 | 3967 | #undef __WAIT_FOR_EVENTS_ERR 3968 | 3969 | #undef __CREATE_KERNEL_ERR 3970 | #undef __SET_KERNEL_ARGS_ERR 3971 | #undef __CREATE_PROGRAM_WITH_SOURCE_ERR 3972 | #undef __CREATE_PROGRAM_WITH_BINARY_ERR 3973 | #undef __BUILD_PROGRAM_ERR 3974 | #undef __CREATE_KERNELS_IN_PROGRAM_ERR 3975 | 3976 | #undef __CREATE_COMMAND_QUEUE_ERR 3977 | #undef __SET_COMMAND_QUEUE_PROPERTY_ERR 3978 | #undef __ENQUEUE_READ_BUFFER_ERR 3979 | #undef __ENQUEUE_WRITE_BUFFER_ERR 3980 | #undef __ENQUEUE_READ_BUFFER_RECT_ERR 3981 | #undef __ENQUEUE_WRITE_BUFFER_RECT_ERR 3982 | #undef __ENQEUE_COPY_BUFFER_ERR 3983 | #undef __ENQEUE_COPY_BUFFER_RECT_ERR 3984 | #undef __ENQUEUE_READ_IMAGE_ERR 3985 | #undef __ENQUEUE_WRITE_IMAGE_ERR 3986 | #undef __ENQUEUE_COPY_IMAGE_ERR 3987 | #undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR 3988 | #undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR 3989 | #undef __ENQUEUE_MAP_BUFFER_ERR 3990 | #undef __ENQUEUE_MAP_IMAGE_ERR 3991 | #undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR 3992 | #undef __ENQUEUE_NDRANGE_KERNEL_ERR 3993 | #undef __ENQUEUE_TASK_ERR 3994 | #undef __ENQUEUE_NATIVE_KERNEL 3995 | 3996 | #undef __UNLOAD_COMPILER_ERR 3997 | #endif //__CL_USER_OVERRIDE_ERROR_STRINGS 3998 | 3999 | #undef __GET_INFO_HELPER_WITH_RETAIN 4000 | 4001 | // Extensions 4002 | #undef __INIT_CL_EXT_FCN_PTR 4003 | #undef __CREATE_SUB_DEVICES 4004 | 4005 | #if defined(USE_CL_DEVICE_FISSION) 4006 | #undef __PARAM_NAME_DEVICE_FISSION 4007 | #endif // USE_CL_DEVICE_FISSION 4008 | 4009 | } // namespace cl 4010 | 4011 | #endif // CL_HPP_ 4012 | -------------------------------------------------------------------------------- /kernels/matrix-multiply-tiled/matrix-multiply-tiled.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 by Justin Holewinski 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | 29 | #include "cuda.h" 30 | 31 | 32 | typedef float Real; 33 | 34 | 35 | //==--- Utility Functions --------------------------------------------------== // 36 | const char * statusToString(CUresult error) 37 | { 38 | switch (error) { 39 | case CUDA_SUCCESS: return "No errors"; 40 | case CUDA_ERROR_INVALID_VALUE: return "Invalid value"; 41 | case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory"; 42 | case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized"; 43 | case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized"; 44 | 45 | case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available"; 46 | case CUDA_ERROR_INVALID_DEVICE: return "Invalid device"; 47 | 48 | case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image"; 49 | case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context"; 50 | case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current"; 51 | case CUDA_ERROR_MAP_FAILED: return "Map failed"; 52 | case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed"; 53 | case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped"; 54 | case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped"; 55 | case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU"; 56 | case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired"; 57 | case CUDA_ERROR_NOT_MAPPED: return "Not mapped"; 58 | 59 | case CUDA_ERROR_INVALID_SOURCE: return "Invalid source"; 60 | case CUDA_ERROR_FILE_NOT_FOUND: return "File not found"; 61 | 62 | case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle"; 63 | 64 | case CUDA_ERROR_NOT_FOUND: return "Not found"; 65 | 66 | case CUDA_ERROR_NOT_READY: return "CUDA not ready"; 67 | 68 | case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed"; 69 | case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources"; 70 | case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout"; 71 | case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing"; 72 | 73 | case CUDA_ERROR_UNKNOWN: return "Unknown error"; 74 | default: return "Unknown error ID"; 75 | } 76 | } 77 | 78 | void checkSuccess(CUresult status, 79 | const char *func, 80 | const char *errorBuffer = 0) 81 | { 82 | if (status != CUDA_SUCCESS) { 83 | if (errorBuffer != 0) { 84 | std::cerr << "ERROR LOG:" << std::endl 85 | << errorBuffer << std::endl; 86 | } 87 | 88 | std::cerr << "ERROR: Could not execute '" << func << "', error (" 89 | << status << ") " << statusToString(status) << std::endl; 90 | exit(1); 91 | } 92 | } 93 | 94 | double getTimeStamp() 95 | { 96 | struct timezone Tzp; 97 | struct timeval Tp; 98 | int stat; 99 | stat = gettimeofday (&Tp, &Tzp); 100 | if (stat != 0) 101 | std::cerr << "Error return from gettimeofday: " << stat << "\n"; 102 | return (Tp.tv_sec + Tp.tv_usec * 1.0e-6); 103 | } 104 | 105 | 106 | 107 | //==--- Entry Point --------------------------------------------------------== // 108 | 109 | int main(int argc, 110 | char** argv) { 111 | 112 | CUcontext context; 113 | CUdevice device; 114 | CUmodule module; 115 | CUresult status; 116 | CUfunction function; 117 | 118 | const int kLogSize = 1024; 119 | char logBuffer[kLogSize]; 120 | 121 | int blockSizeX = 16; 122 | int blockSizeY = 16; 123 | int blockSizeMultiple = 100; 124 | int problemSizeX = blockSizeX * blockSizeMultiple; 125 | int problemSizeY = blockSizeY * blockSizeMultiple; 126 | 127 | // Seed random number generator 128 | srand(time(NULL)); 129 | 130 | // Initialize CUDA 131 | std::cout << "Initializing CUDA\n"; 132 | checkSuccess(cuInit(0), "cuInit"); 133 | std::cout << "Selecting first compute device\n"; 134 | checkSuccess(cuDeviceGet(&device, 0), "cuDeviceGet"); 135 | std::cout << "Creating CUDA context\n"; 136 | checkSuccess(cuCtxCreate(&context, 0, device), "cuCtxCreate"); 137 | 138 | // Read the PTX kernel from disk 139 | std::ifstream kernelFile("matrix-multiply-tiled.kernel.ptx"); 140 | if (!kernelFile.is_open()) { 141 | std::cerr << "Failed to open matrix-multiply.kernel.ptx\n"; 142 | return 1; 143 | } 144 | 145 | // Load entire kernel into a string 146 | std::string source(std::istreambuf_iterator(kernelFile), 147 | (std::istreambuf_iterator())); 148 | 149 | // Configure JIT options 150 | CUjit_option jitOptions[] = { CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, 151 | CU_JIT_ERROR_LOG_BUFFER }; 152 | void* jitOptionValues[] = { reinterpret_cast(kLogSize), logBuffer }; 153 | 154 | // Load the kernel onto the device 155 | status = cuModuleLoadDataEx(&module, source.c_str(), 156 | sizeof(jitOptions)/sizeof(jitOptions[0]), 157 | jitOptions, jitOptionValues); 158 | checkSuccess(status, "cuModuleLoadDataEx", logBuffer); 159 | 160 | status = cuModuleGetFunction(&function, module, "matrix_multiply_tiled"); 161 | checkSuccess(status, "cuModuleGetFunction"); 162 | 163 | 164 | // Print some diagnostics about the kernel compilation 165 | int numRegisters; 166 | cuFuncGetAttribute(&numRegisters, CU_FUNC_ATTRIBUTE_NUM_REGS, function); 167 | std::cout << "Register Usage: " << numRegisters << "\n"; 168 | 169 | 170 | // Setup buffers 171 | Real* hostA = new Real[problemSizeX * problemSizeY]; 172 | Real* hostB = new Real[problemSizeX * problemSizeY]; 173 | Real* refC = new Real[problemSizeX * problemSizeY]; 174 | Real* cmpC = new Real[problemSizeX * problemSizeY]; 175 | 176 | std::cout << "Problem Size: " << problemSizeX << " x " << problemSizeY 177 | << "\n"; 178 | 179 | CUdeviceptr deviceA; 180 | CUdeviceptr deviceB; 181 | CUdeviceptr deviceC; 182 | 183 | status = cuMemAlloc(&deviceA, problemSizeX * problemSizeY * sizeof(Real)); 184 | checkSuccess(status, "cuMemAlloc"); 185 | status = cuMemAlloc(&deviceB, problemSizeX * problemSizeY * sizeof(Real)); 186 | checkSuccess(status, "cuMemAlloc"); 187 | status = cuMemAlloc(&deviceC, problemSizeX * problemSizeY * sizeof(Real)); 188 | checkSuccess(status, "cuMemAlloc"); 189 | 190 | 191 | // Populate arrays with test data 192 | for (int i = 0; i < problemSizeX*problemSizeY; ++i) { 193 | hostA[i] = hostB[i] = (Real)rand() / ((Real)RAND_MAX + (Real)1.0); 194 | refC[i] = cmpC[i] = (Real)0.0; 195 | } 196 | 197 | 198 | // Copy buffers to device 199 | status = cuMemcpyHtoD(deviceA, hostA, problemSizeX * problemSizeY * sizeof(Real)); 200 | checkSuccess(status, "cuMemcpyHtoD"); 201 | status = cuMemcpyHtoD(deviceB, hostB, problemSizeX * problemSizeY * sizeof(Real)); 202 | checkSuccess(status, "cuMemcpyHtoD"); 203 | status = cuMemcpyHtoD(deviceC, cmpC, problemSizeX * problemSizeY * sizeof(Real)); 204 | checkSuccess(status, "cuMemcpyHtoD"); 205 | 206 | // Setup block shape 207 | status = cuFuncSetBlockShape(function, blockSizeX, blockSizeY, 1); 208 | checkSuccess(status, "cuFuncSetBlockShape"); 209 | 210 | 211 | // Bind kernel paramters 212 | status = cuParamSetv(function, 0, &deviceA, sizeof(CUdeviceptr)); 213 | checkSuccess(status, "cuParamSetv"); 214 | status = cuParamSetv(function, sizeof(CUdeviceptr), &deviceB, sizeof(CUdeviceptr)); 215 | checkSuccess(status, "cuParamSetv"); 216 | status = cuParamSetv(function, 2*sizeof(CUdeviceptr), &deviceC, sizeof(CUdeviceptr)); 217 | checkSuccess(status, "cuParamSetv"); 218 | 219 | status = cuParamSetSize(function, 3*sizeof(CUdeviceptr)); 220 | checkSuccess(status, "cuParamSetSize"); 221 | 222 | 223 | // Launch the kernel 224 | double deviceStart = getTimeStamp(); 225 | 226 | status = cuLaunchGrid(function, blockSizeMultiple, blockSizeMultiple); 227 | checkSuccess(status, "cuLaunchGrid"); 228 | cuCtxSynchronize(); 229 | 230 | double deviceEnd = getTimeStamp(); 231 | 232 | 233 | // Copy results back to the host 234 | status = cuMemcpyDtoH(cmpC, deviceC, problemSizeX * problemSizeY * sizeof(Real)); 235 | checkSuccess(status, "cuMemoryDtoH"); 236 | 237 | 238 | // Compute the reference solution 239 | double hostStart = getTimeStamp(); 240 | 241 | for (int k = 0; k < problemSizeX; ++k) { 242 | for (int i = 0; i < problemSizeY; ++i) { 243 | for (int j = 0; j < problemSizeX; ++j) { 244 | refC[i*problemSizeX+j] += hostA[i*problemSizeX+k] 245 | * hostB[k*problemSizeX+j]; 246 | } 247 | } 248 | } 249 | 250 | double hostEnd = getTimeStamp(); 251 | 252 | 253 | // Compare the results 254 | Real errorNorm = (Real)0.0; 255 | Real refNorm = (Real)0.0; 256 | 257 | for (int i = 0; i < problemSizeX*problemSizeY; ++i) { 258 | 259 | Real diff = refC[i] - cmpC[i]; 260 | 261 | errorNorm += diff * diff; 262 | 263 | refNorm += refC[i] * refC[i]; 264 | } 265 | 266 | errorNorm = (Real)sqrt((double)errorNorm); 267 | refNorm = (Real)sqrt((double)refNorm); 268 | 269 | if ((errorNorm / refNorm) < (Real)1e-5) { 270 | std::cout << "Host reference comparison test PASSED\n"; 271 | } 272 | else { 273 | std::cout << "Host reference comparison test FAILED\n"; 274 | std::cout << "Error Norm: " << errorNorm << "\n"; 275 | std::cout << "Ref Norm: " << refNorm << "\n"; 276 | } 277 | 278 | #if defined(DEBUG) 279 | for (int i = 0; i < problemSizeY; ++i) { 280 | for (int j = 0; j < problemSizeX; ++j) { 281 | std::cout << refC[i*problemSizeX+j] << ", " << cmpC[i*problemSizeX+j] << "\n"; 282 | } 283 | } 284 | #endif 285 | 286 | double gflopsDevice = ((double)problemSizeX * problemSizeY * problemSizeX * 2) / (deviceEnd - deviceStart) / 1e9; 287 | double gflopsHost = ((double)problemSizeX * problemSizeY * problemSizeX * 2) / (hostEnd - hostStart) / 1e9; 288 | 289 | std::cout << "Device Time: " << (deviceEnd - deviceStart) << "s\n"; 290 | std::cout << "Device GFlop/s: " << gflopsDevice << "\n"; 291 | std::cout << "Host Time: " << (hostEnd - hostStart) << "s\n"; 292 | std::cout << "Host GFlop/s: " << gflopsHost << "\n"; 293 | 294 | return 0; 295 | } 296 | 297 | -------------------------------------------------------------------------------- /kernels/matrix-multiply-tiled/matrix-multiply-tiled.kernel.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 by Justin Holewinski 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | // This must be changed to reflect changes in matrix-multiply-tiled.cpp 24 | #define BLOCK_SIZE 16 25 | 26 | __attribute__((address_space(4))) 27 | float g_scratchA[BLOCK_SIZE][BLOCK_SIZE]; 28 | __attribute__((address_space(4))) 29 | float g_scratchB[BLOCK_SIZE][BLOCK_SIZE]; 30 | 31 | extern "C" 32 | void matrix_multiply_tiled(float* A, 33 | float* B, 34 | float* C) 35 | { 36 | int globalX = __builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x() + __builtin_ptx_read_tid_x(); 37 | int globalY = __builtin_ptx_read_ctaid_y() * __builtin_ptx_read_ntid_y() + __builtin_ptx_read_tid_y(); 38 | int size = __builtin_ptx_read_nctaid_x() * __builtin_ptx_read_ntid_x(); 39 | int k; 40 | float sum = 0.0f; 41 | int numBlocks = size / BLOCK_SIZE; 42 | int b; 43 | 44 | int tidX = __builtin_ptx_read_tid_x(); 45 | int tidY = __builtin_ptx_read_tid_y(); 46 | 47 | for(b = 0; b < numBlocks; ++b) 48 | { 49 | // Populate a cache for A/B 50 | int x; 51 | int y; 52 | 53 | x = b * BLOCK_SIZE + tidX; 54 | y = globalY; 55 | 56 | g_scratchA[tidY][tidX] = A[y * size + x]; 57 | 58 | x = globalX; 59 | y = b * BLOCK_SIZE + tidY; 60 | 61 | g_scratchB[tidY][tidX] = B[y * size + x]; 62 | 63 | __builtin_ptx_bar_sync(0); 64 | 65 | for(k = 0; k < BLOCK_SIZE; ++k) 66 | { 67 | float myA; 68 | float myB; 69 | 70 | myA = g_scratchA[tidY][k]; 71 | myB = g_scratchB[k][tidX]; 72 | 73 | sum += myA * myB; 74 | } 75 | 76 | __builtin_ptx_bar_sync(0); 77 | } 78 | 79 | C[globalY * size + globalX] = sum; 80 | } 81 | -------------------------------------------------------------------------------- /kernels/matrix-multiply/matrix-multiply.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 by Justin Holewinski 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | 29 | #include "cuda.h" 30 | 31 | 32 | typedef float Real; 33 | 34 | 35 | //==--- Utility Functions --------------------------------------------------== // 36 | const char * statusToString(CUresult error) 37 | { 38 | switch (error) { 39 | case CUDA_SUCCESS: return "No errors"; 40 | case CUDA_ERROR_INVALID_VALUE: return "Invalid value"; 41 | case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory"; 42 | case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized"; 43 | case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized"; 44 | 45 | case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available"; 46 | case CUDA_ERROR_INVALID_DEVICE: return "Invalid device"; 47 | 48 | case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image"; 49 | case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context"; 50 | case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current"; 51 | case CUDA_ERROR_MAP_FAILED: return "Map failed"; 52 | case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed"; 53 | case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped"; 54 | case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped"; 55 | case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU"; 56 | case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired"; 57 | case CUDA_ERROR_NOT_MAPPED: return "Not mapped"; 58 | 59 | case CUDA_ERROR_INVALID_SOURCE: return "Invalid source"; 60 | case CUDA_ERROR_FILE_NOT_FOUND: return "File not found"; 61 | 62 | case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle"; 63 | 64 | case CUDA_ERROR_NOT_FOUND: return "Not found"; 65 | 66 | case CUDA_ERROR_NOT_READY: return "CUDA not ready"; 67 | 68 | case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed"; 69 | case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources"; 70 | case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout"; 71 | case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing"; 72 | 73 | case CUDA_ERROR_UNKNOWN: return "Unknown error"; 74 | default: return "Unknown error ID"; 75 | } 76 | } 77 | 78 | void checkSuccess(CUresult status, 79 | const char *func, 80 | const char *errorBuffer = 0) 81 | { 82 | if (status != CUDA_SUCCESS) { 83 | if (errorBuffer != 0) { 84 | std::cerr << "ERROR LOG:" << std::endl 85 | << errorBuffer << std::endl; 86 | } 87 | 88 | std::cerr << "ERROR: Could not execute '" << func << "', error (" 89 | << status << ") " << statusToString(status) << std::endl; 90 | exit(1); 91 | } 92 | } 93 | 94 | double getTimeStamp() 95 | { 96 | struct timezone Tzp; 97 | struct timeval Tp; 98 | int stat; 99 | stat = gettimeofday (&Tp, &Tzp); 100 | if (stat != 0) 101 | std::cerr << "Error return from gettimeofday: " << stat << "\n"; 102 | return (Tp.tv_sec + Tp.tv_usec * 1.0e-6); 103 | } 104 | 105 | 106 | 107 | //==--- Entry Point --------------------------------------------------------== // 108 | 109 | int main(int argc, 110 | char** argv) { 111 | 112 | CUcontext context; 113 | CUdevice device; 114 | CUmodule module; 115 | CUresult status; 116 | CUfunction function; 117 | 118 | const int kLogSize = 1024; 119 | char logBuffer[kLogSize]; 120 | 121 | int blockSizeX = 16; 122 | int blockSizeY = 16; 123 | int blockSizeMultiple = 100; 124 | int problemSizeX = blockSizeX * blockSizeMultiple; 125 | int problemSizeY = blockSizeY * blockSizeMultiple; 126 | 127 | // Seed random number generator 128 | srand(time(NULL)); 129 | 130 | // Initialize CUDA 131 | std::cout << "Initializing CUDA\n"; 132 | checkSuccess(cuInit(0), "cuInit"); 133 | std::cout << "Selecting first compute device\n"; 134 | checkSuccess(cuDeviceGet(&device, 0), "cuDeviceGet"); 135 | std::cout << "Creating CUDA context\n"; 136 | checkSuccess(cuCtxCreate(&context, 0, device), "cuCtxCreate"); 137 | 138 | // Read the PTX kernel from disk 139 | std::ifstream kernelFile("matrix-multiply.kernel.ptx"); 140 | if (!kernelFile.is_open()) { 141 | std::cerr << "Failed to open matrix-multiply.kernel.ptx\n"; 142 | return 1; 143 | } 144 | 145 | // Load entire kernel into a string 146 | std::string source(std::istreambuf_iterator(kernelFile), 147 | (std::istreambuf_iterator())); 148 | 149 | // Configure JIT options 150 | CUjit_option jitOptions[] = { CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, 151 | CU_JIT_ERROR_LOG_BUFFER }; 152 | void* jitOptionValues[] = { reinterpret_cast(kLogSize), logBuffer }; 153 | 154 | // Load the kernel onto the device 155 | status = cuModuleLoadDataEx(&module, source.c_str(), 156 | sizeof(jitOptions)/sizeof(jitOptions[0]), 157 | jitOptions, jitOptionValues); 158 | checkSuccess(status, "cuModuleLoadDataEx", logBuffer); 159 | 160 | status = cuModuleGetFunction(&function, module, "matrix_multiply"); 161 | checkSuccess(status, "cuModuleGetFunction"); 162 | 163 | 164 | // Print some diagnostics about the kernel compilation 165 | int numRegisters; 166 | cuFuncGetAttribute(&numRegisters, CU_FUNC_ATTRIBUTE_NUM_REGS, function); 167 | std::cout << "Register Usage: " << numRegisters << "\n"; 168 | 169 | 170 | // Setup buffers 171 | Real* hostA = new Real[problemSizeX * problemSizeY]; 172 | Real* hostB = new Real[problemSizeX * problemSizeY]; 173 | Real* refC = new Real[problemSizeX * problemSizeY]; 174 | Real* cmpC = new Real[problemSizeX * problemSizeY]; 175 | 176 | std::cout << "Problem Size: " << problemSizeX << " x " << problemSizeY 177 | << "\n"; 178 | 179 | CUdeviceptr deviceA; 180 | CUdeviceptr deviceB; 181 | CUdeviceptr deviceC; 182 | 183 | status = cuMemAlloc(&deviceA, problemSizeX * problemSizeY * sizeof(Real)); 184 | checkSuccess(status, "cuMemAlloc"); 185 | status = cuMemAlloc(&deviceB, problemSizeX * problemSizeY * sizeof(Real)); 186 | checkSuccess(status, "cuMemAlloc"); 187 | status = cuMemAlloc(&deviceC, problemSizeX * problemSizeY * sizeof(Real)); 188 | checkSuccess(status, "cuMemAlloc"); 189 | 190 | 191 | // Populate arrays with test data 192 | for (int i = 0; i < problemSizeX*problemSizeY; ++i) { 193 | hostA[i] = hostB[i] = (Real)rand() / ((Real)RAND_MAX + (Real)1.0); 194 | refC[i] = cmpC[i] = (Real)0.0; 195 | } 196 | 197 | 198 | // Copy buffers to device 199 | status = cuMemcpyHtoD(deviceA, hostA, problemSizeX * problemSizeY * sizeof(Real)); 200 | checkSuccess(status, "cuMemcpyHtoD"); 201 | status = cuMemcpyHtoD(deviceB, hostB, problemSizeX * problemSizeY * sizeof(Real)); 202 | checkSuccess(status, "cuMemcpyHtoD"); 203 | status = cuMemcpyHtoD(deviceC, cmpC, problemSizeX * problemSizeY * sizeof(Real)); 204 | checkSuccess(status, "cuMemcpyHtoD"); 205 | 206 | // Setup block shape 207 | status = cuFuncSetBlockShape(function, blockSizeX, blockSizeY, 1); 208 | checkSuccess(status, "cuFuncSetBlockShape"); 209 | 210 | 211 | // Bind kernel paramters 212 | status = cuParamSetv(function, 0, &deviceA, sizeof(CUdeviceptr)); 213 | checkSuccess(status, "cuParamSetv"); 214 | status = cuParamSetv(function, sizeof(CUdeviceptr), &deviceB, sizeof(CUdeviceptr)); 215 | checkSuccess(status, "cuParamSetv"); 216 | status = cuParamSetv(function, 2*sizeof(CUdeviceptr), &deviceC, sizeof(CUdeviceptr)); 217 | checkSuccess(status, "cuParamSetv"); 218 | 219 | status = cuParamSetSize(function, 3*sizeof(CUdeviceptr)); 220 | checkSuccess(status, "cuParamSetSize"); 221 | 222 | 223 | // Launch the kernel 224 | double deviceStart = getTimeStamp(); 225 | 226 | status = cuLaunchGrid(function, blockSizeMultiple, blockSizeMultiple); 227 | checkSuccess(status, "cuLaunchGrid"); 228 | cuCtxSynchronize(); 229 | 230 | double deviceEnd = getTimeStamp(); 231 | 232 | 233 | // Copy results back to the host 234 | status = cuMemcpyDtoH(cmpC, deviceC, problemSizeX * problemSizeY * sizeof(Real)); 235 | checkSuccess(status, "cuMemoryDtoH"); 236 | 237 | 238 | // Compute the reference solution 239 | double hostStart = getTimeStamp(); 240 | 241 | for (int k = 0; k < problemSizeX; ++k) { 242 | for (int i = 0; i < problemSizeY; ++i) { 243 | for (int j = 0; j < problemSizeX; ++j) { 244 | refC[i*problemSizeX+j] += hostA[i*problemSizeX+k] 245 | * hostB[k*problemSizeX+j]; 246 | } 247 | } 248 | } 249 | 250 | double hostEnd = getTimeStamp(); 251 | 252 | 253 | // Compare the results 254 | Real errorNorm = (Real)0.0; 255 | Real refNorm = (Real)0.0; 256 | 257 | for (int i = 0; i < problemSizeX*problemSizeY; ++i) { 258 | 259 | Real diff = refC[i] - cmpC[i]; 260 | 261 | errorNorm += diff * diff; 262 | 263 | refNorm += refC[i] * refC[i]; 264 | } 265 | 266 | errorNorm = (Real)sqrt((double)errorNorm); 267 | refNorm = (Real)sqrt((double)refNorm); 268 | 269 | if ((errorNorm / refNorm) < (Real)1e-5) { 270 | std::cout << "Host reference comparison test PASSED\n"; 271 | } 272 | else { 273 | std::cout << "Host reference comparison test FAILED\n"; 274 | std::cout << "Error Norm: " << errorNorm << "\n"; 275 | std::cout << "Ref Norm: " << refNorm << "\n"; 276 | } 277 | 278 | #if defined(DEBUG) 279 | for (int i = 0; i < problemSizeY; ++i) { 280 | for (int j = 0; j < problemSizeX; ++j) { 281 | std::cout << refC[i*problemSizeX+j] << ", " << cmpC[i*problemSizeX+j] << "\n"; 282 | } 283 | } 284 | #endif 285 | 286 | double gflopsDevice = ((double)problemSizeX * problemSizeY * problemSizeX * 2) / (deviceEnd - deviceStart) / 1e9; 287 | double gflopsHost = ((double)problemSizeX * problemSizeY * problemSizeX * 2) / (hostEnd - hostStart) / 1e9; 288 | 289 | std::cout << "Device Time: " << (deviceEnd - deviceStart) << "s\n"; 290 | std::cout << "Device GFlop/s: " << gflopsDevice << "\n"; 291 | std::cout << "Host Time: " << (hostEnd - hostStart) << "s\n"; 292 | std::cout << "Host GFlop/s: " << gflopsHost << "\n"; 293 | 294 | return 0; 295 | } 296 | 297 | -------------------------------------------------------------------------------- /kernels/matrix-multiply/matrix-multiply.kernel.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 by Justin Holewinski 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | extern "C" 24 | void matrix_multiply(float* A, 25 | float* B, 26 | float* C) 27 | { 28 | unsigned globalX = 29 | __builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x() 30 | + __builtin_ptx_read_tid_x(); 31 | unsigned globalY = 32 | __builtin_ptx_read_ctaid_y() * __builtin_ptx_read_ntid_y() 33 | + __builtin_ptx_read_tid_y(); 34 | unsigned size = 35 | __builtin_ptx_read_nctaid_x() * __builtin_ptx_read_ntid_x(); 36 | 37 | unsigned k; 38 | float sum = 0.0f; 39 | 40 | for(k = 0; k < size; ++k) 41 | { 42 | sum += A[globalY * size + k] * B[k * size + globalX]; 43 | } 44 | 45 | C[globalY * size + globalX] = sum; 46 | } 47 | -------------------------------------------------------------------------------- /kernels/vector-add/vector-add.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 by Justin Holewinski 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | 29 | #include "cuda.h" 30 | 31 | 32 | typedef float Real; 33 | 34 | 35 | //==--- Utility Functions --------------------------------------------------== // 36 | const char * statusToString(CUresult error) 37 | { 38 | switch (error) { 39 | case CUDA_SUCCESS: return "No errors"; 40 | case CUDA_ERROR_INVALID_VALUE: return "Invalid value"; 41 | case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory"; 42 | case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized"; 43 | case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized"; 44 | 45 | case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available"; 46 | case CUDA_ERROR_INVALID_DEVICE: return "Invalid device"; 47 | 48 | case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image"; 49 | case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context"; 50 | case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current"; 51 | case CUDA_ERROR_MAP_FAILED: return "Map failed"; 52 | case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed"; 53 | case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped"; 54 | case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped"; 55 | case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU"; 56 | case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired"; 57 | case CUDA_ERROR_NOT_MAPPED: return "Not mapped"; 58 | 59 | case CUDA_ERROR_INVALID_SOURCE: return "Invalid source"; 60 | case CUDA_ERROR_FILE_NOT_FOUND: return "File not found"; 61 | 62 | case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle"; 63 | 64 | case CUDA_ERROR_NOT_FOUND: return "Not found"; 65 | 66 | case CUDA_ERROR_NOT_READY: return "CUDA not ready"; 67 | 68 | case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed"; 69 | case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources"; 70 | case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout"; 71 | case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing"; 72 | 73 | case CUDA_ERROR_UNKNOWN: return "Unknown error"; 74 | default: return "Unknown error ID"; 75 | } 76 | } 77 | 78 | void checkSuccess(CUresult status, 79 | const char *func, 80 | const char *errorBuffer = 0) 81 | { 82 | if (status != CUDA_SUCCESS) { 83 | if (errorBuffer != 0) { 84 | std::cerr << "ERROR LOG:" << std::endl 85 | << errorBuffer << std::endl; 86 | } 87 | 88 | std::cerr << "ERROR: Could not execute '" << func << "', error (" 89 | << status << ") " << statusToString(status) << std::endl; 90 | exit(1); 91 | } 92 | } 93 | 94 | double getTimeStamp() 95 | { 96 | struct timezone Tzp; 97 | struct timeval Tp; 98 | int stat; 99 | stat = gettimeofday (&Tp, &Tzp); 100 | if (stat != 0) 101 | std::cerr << "Error return from gettimeofday: " << stat << "\n"; 102 | return (Tp.tv_sec + Tp.tv_usec * 1.0e-6); 103 | } 104 | 105 | 106 | 107 | //==--- Entry Point --------------------------------------------------------== // 108 | 109 | int main(int argc, 110 | char** argv) { 111 | 112 | CUcontext context; 113 | CUdevice device; 114 | CUmodule module; 115 | CUresult status; 116 | CUfunction function; 117 | 118 | const int kLogSize = 1024; 119 | char logBuffer[kLogSize]; 120 | 121 | int blockSizeX = 512; 122 | int blockSizeMultiple = 5000; 123 | int problemSize = blockSizeX * blockSizeMultiple; 124 | 125 | 126 | // Initialize CUDA 127 | std::cout << "Initializing CUDA\n"; 128 | checkSuccess(cuInit(0), "cuInit"); 129 | std::cout << "Selecting first compute device\n"; 130 | checkSuccess(cuDeviceGet(&device, 0), "cuDeviceGet"); 131 | std::cout << "Creating CUDA context\n"; 132 | checkSuccess(cuCtxCreate(&context, 0, device), "cuCtxCreate"); 133 | 134 | // Read the PTX kernel from disk 135 | std::ifstream kernelFile("vector-add.kernel.ptx"); 136 | if (!kernelFile.is_open()) { 137 | std::cerr << "Failed to open vector-add.kernel.ptx\n"; 138 | return 1; 139 | } 140 | 141 | // Load entire kernel into a string 142 | std::string source(std::istreambuf_iterator(kernelFile), 143 | (std::istreambuf_iterator())); 144 | 145 | // Configure JIT options 146 | CUjit_option jitOptions[] = { CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, 147 | CU_JIT_ERROR_LOG_BUFFER }; 148 | void* jitOptionValues[] = { reinterpret_cast(kLogSize), logBuffer }; 149 | 150 | // Load the kernel onto the device 151 | status = cuModuleLoadDataEx(&module, source.c_str(), 152 | sizeof(jitOptions)/sizeof(jitOptions[0]), 153 | jitOptions, jitOptionValues); 154 | checkSuccess(status, "cuModuleLoadDataEx", logBuffer); 155 | 156 | status = cuModuleGetFunction(&function, module, "vector_add"); 157 | checkSuccess(status, "cuModuleGetFunction"); 158 | 159 | 160 | // Print some diagnostics about the kernel compilation 161 | int numRegisters; 162 | cuFuncGetAttribute(&numRegisters, CU_FUNC_ATTRIBUTE_NUM_REGS, function); 163 | std::cout << "Register Usage: " << numRegisters << "\n"; 164 | 165 | 166 | // Setup buffers 167 | Real* hostA = new Real[problemSize]; 168 | Real* hostB = new Real[problemSize]; 169 | Real* refC = new Real[problemSize]; 170 | Real* cmpC = new Real[problemSize]; 171 | 172 | std::cout << "Problem Size: " << problemSize << "\n"; 173 | 174 | CUdeviceptr deviceA; 175 | CUdeviceptr deviceB; 176 | CUdeviceptr deviceC; 177 | 178 | status = cuMemAlloc(&deviceA, problemSize * sizeof(Real)); 179 | checkSuccess(status, "cuMemAlloc"); 180 | status = cuMemAlloc(&deviceB, problemSize * sizeof(Real)); 181 | checkSuccess(status, "cuMemAlloc"); 182 | status = cuMemAlloc(&deviceC, problemSize * sizeof(Real)); 183 | checkSuccess(status, "cuMemAlloc"); 184 | 185 | 186 | // Populate arrays with test data 187 | for (int i = 0; i < problemSize; ++i) { 188 | hostA[i] = hostB[i] = (Real)i; 189 | refC[i] = cmpC[i] = (Real)0.0; 190 | } 191 | 192 | 193 | // Copy buffers to device 194 | status = cuMemcpyHtoD(deviceA, hostA, problemSize * sizeof(Real)); 195 | checkSuccess(status, "cuMemcpyHtoD"); 196 | status = cuMemcpyHtoD(deviceB, hostB, problemSize * sizeof(Real)); 197 | checkSuccess(status, "cuMemcpyHtoD"); 198 | 199 | 200 | // Setup block shape 201 | status = cuFuncSetBlockShape(function, blockSizeX, 1, 1); 202 | checkSuccess(status, "cuFuncSetBlockShape"); 203 | 204 | 205 | // Bind kernel paramters 206 | status = cuParamSetv(function, 0, &deviceA, sizeof(CUdeviceptr)); 207 | checkSuccess(status, "cuParamSetv"); 208 | status = cuParamSetv(function, sizeof(CUdeviceptr), &deviceB, sizeof(CUdeviceptr)); 209 | checkSuccess(status, "cuParamSetv"); 210 | status = cuParamSetv(function, 2*sizeof(CUdeviceptr), &deviceC, sizeof(CUdeviceptr)); 211 | checkSuccess(status, "cuParamSetv"); 212 | status = cuParamSeti(function, 3*sizeof(CUdeviceptr), problemSize); 213 | checkSuccess(status, "cuParamSeti"); 214 | 215 | status = cuParamSetSize(function, 3*sizeof(CUdeviceptr) + sizeof(int)); 216 | checkSuccess(status, "cuParamSetSize"); 217 | 218 | 219 | // Launch the kernel 220 | double deviceStart = getTimeStamp(); 221 | 222 | status = cuLaunchGrid(function, blockSizeMultiple, 1); 223 | checkSuccess(status, "cuLaunchGrid"); 224 | cuCtxSynchronize(); 225 | 226 | double deviceEnd = getTimeStamp(); 227 | 228 | 229 | // Copy results back to the host 230 | status = cuMemcpyDtoH(cmpC, deviceC, problemSize * sizeof(Real)); 231 | checkSuccess(status, "cuMemoryDtoH"); 232 | 233 | 234 | // Compute the reference solution 235 | double hostStart = getTimeStamp(); 236 | 237 | for (int i = 0; i < problemSize; ++i) { 238 | refC[i] = hostA[i] + hostB[i]; 239 | } 240 | 241 | double hostEnd = getTimeStamp(); 242 | 243 | 244 | // Compare the results 245 | int numWrong = 0; 246 | 247 | for (int i = 0; i < problemSize; ++i) { 248 | if (std::abs(refC[i] - cmpC[i]) > (Real)1e-5) { 249 | numWrong++; 250 | } 251 | } 252 | 253 | if (numWrong == 0) { 254 | std::cout << "Host reference comparison test PASSED\n"; 255 | } 256 | else { 257 | std::cout << "Host reference comparison test FAILED\n"; 258 | } 259 | 260 | std::cout << "Device Time: " << (deviceEnd - deviceStart) << "s\n"; 261 | std::cout << "Host Time: " << (hostEnd - hostStart) << "s\n"; 262 | 263 | return 0; 264 | } 265 | 266 | -------------------------------------------------------------------------------- /kernels/vector-add/vector-add.kernel.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 by Justin Holewinski 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | extern "C" 24 | void vector_add(float* A, 25 | float* B, 26 | float* C, 27 | int N) { 28 | 29 | // Determine our global offset into the vector 30 | int myId = (__builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x()) 31 | + __builtin_ptx_read_tid_x(); 32 | 33 | // Perform one component of the vector addition 34 | if (myId < N) { 35 | C[myId] = A[myId] + B[myId]; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /opencl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2011 by Justin Holewinski 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | # 22 | 23 | add_subdirectory(blur2d) 24 | add_subdirectory(matmul) 25 | add_subdirectory(matmul-double) 26 | -------------------------------------------------------------------------------- /opencl/blur2d/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2011 by Justin Holewinski 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | # 22 | 23 | set(_cpp_sources blur2d.cpp) 24 | 25 | create_opencl_targets(_cl_targets blur2d_kernel) 26 | 27 | add_executable(ocl-blur2d ${_cpp_sources}) 28 | target_link_libraries(ocl-blur2d ${OPENCL_LIBRARY} sampleutil) 29 | add_dependencies(ocl-blur2d ${_cl_targets}) 30 | -------------------------------------------------------------------------------- /opencl/blur2d/blur2d.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 by Justin Holewinski 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | #include "common/OCLSample.hpp" 27 | 28 | #define BLOCK_SIZE 16 29 | 30 | class Blur2DSample : public OCLSample { 31 | public: 32 | 33 | Blur2DSample(); 34 | 35 | protected: 36 | 37 | virtual void initialize(); 38 | virtual void createMemoryBuffers(); 39 | virtual void setupKernel(cl::Kernel kernel); 40 | virtual void finishKernel(cl::Kernel kernel); 41 | virtual void runKernel(cl::Kernel kernel, cl::Event* evt); 42 | 43 | private: 44 | 45 | cl::Program programCL_; 46 | cl::Program programPTX_; 47 | 48 | cl::Buffer deviceIn_; 49 | cl::Buffer deviceOut_; 50 | 51 | float* hostIn_; 52 | float* hostOut_; 53 | 54 | unsigned int ProblemSize_; 55 | unsigned int ArraySize_; 56 | }; 57 | 58 | 59 | Blur2DSample::Blur2DSample() { 60 | ProblemSize_ = 4096; 61 | ArraySize_ = (ProblemSize_+2) * (ProblemSize_+2); 62 | } 63 | 64 | void Blur2DSample::initialize() { 65 | cl_int result; 66 | 67 | programCL_ = compileSource("blur2d_kernel.cl"); 68 | programPTX_ = loadBinary("blur2d_kernel.ptx"); 69 | 70 | cl::Kernel kernelCL = cl::Kernel(programCL_, "blur2d", &result); 71 | assert(result == CL_SUCCESS && "Failed to extract kernel"); 72 | cl::Kernel kernelPTX = cl::Kernel(programPTX_, "blur2d", &result); 73 | assert(result == CL_SUCCESS && "Failed to extract kernel"); 74 | 75 | setSourceKernel(kernelCL); 76 | setBinaryKernel(kernelPTX); 77 | 78 | // For this sample, let's run 16 iterations 79 | setNumberOfIterations(16); 80 | } 81 | 82 | void Blur2DSample::runKernel(cl::Kernel kernel, cl::Event* evt) { 83 | cl_int result; 84 | cl::NDRange globalSize(ProblemSize_, ProblemSize_); 85 | cl::NDRange localSize(BLOCK_SIZE, BLOCK_SIZE); 86 | 87 | result = kernel.setArg(0, deviceIn_); 88 | assert(result == CL_SUCCESS && "Failed to set kernel argument 0"); 89 | result = kernel.setArg(1, deviceOut_); 90 | assert(result == CL_SUCCESS && "Failed to set kernel argument 1"); 91 | result = kernel.setArg(2, ProblemSize_); 92 | assert(result == CL_SUCCESS && "Failed to set kernel argument 2"); 93 | 94 | result = getCommandQueue().enqueueNDRangeKernel(kernel, cl::NullRange, 95 | globalSize, localSize, 0, 96 | evt); 97 | assert(result == CL_SUCCESS && "Failed to launch kernel"); 98 | } 99 | 100 | void Blur2DSample::createMemoryBuffers() { 101 | cl_int result; 102 | 103 | // Create host buffers 104 | hostIn_ = new float[ArraySize_]; 105 | hostOut_ = new float[ArraySize_]; 106 | 107 | // Create device buffers 108 | deviceIn_ = cl::Buffer(getContext(), CL_MEM_READ_ONLY, 109 | ArraySize_*sizeof(float), NULL, &result); 110 | assert(result == CL_SUCCESS && "Failed to allocate device buffer"); 111 | deviceOut_ = cl::Buffer(getContext(), CL_MEM_WRITE_ONLY, 112 | ArraySize_*sizeof(float), NULL, &result); 113 | assert(result == CL_SUCCESS && "Failed to allocate device buffer"); 114 | } 115 | 116 | void Blur2DSample::setupKernel(cl::Kernel kernel) { 117 | cl_int result; 118 | 119 | // Copy data to device 120 | result = getCommandQueue().enqueueWriteBuffer(deviceIn_, CL_TRUE, 0, 121 | ArraySize_*sizeof(float), 122 | hostIn_, NULL, NULL); 123 | assert(result == CL_SUCCESS && "Failed to queue data copy to device"); 124 | } 125 | 126 | void Blur2DSample::finishKernel(cl::Kernel kernel) { 127 | cl_int result; 128 | 129 | // Copy data back to host 130 | result = getCommandQueue().enqueueReadBuffer(deviceOut_, CL_TRUE, 0, 131 | ArraySize_*sizeof(float), 132 | hostOut_, NULL, NULL); 133 | assert(result == CL_SUCCESS && "Failed to queue data copy to host"); 134 | } 135 | 136 | int main(int argc, char** argv) { 137 | Blur2DSample sample; 138 | 139 | sample.run(); 140 | 141 | return 0; 142 | } 143 | -------------------------------------------------------------------------------- /opencl/blur2d/blur2d_kernel.cl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 by Justin Holewinski 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | #define ACCESS(buffer,i,j) (buffer[(i)*width+(j)]) 24 | 25 | __kernel 26 | void blur2d(__global float* input, __global float* output, uint width) { 27 | 28 | // Add 1 to account for padding 29 | int globalX = get_global_id(0)+1; 30 | int globalY = get_global_id(1)+1; 31 | 32 | float result = 33 | 0.5f * ACCESS(input, globalX, globalY ) + 34 | 0.1f * ACCESS(input, globalX-1, globalY ) + 35 | 0.1f * ACCESS(input, globalX+1, globalY ) + 36 | 0.1f * ACCESS(input, globalX, globalY-1) + 37 | 0.1f * ACCESS(input, globalX, globalY+1); 38 | 39 | ACCESS(output, globalX, globalY) = result; 40 | } 41 | -------------------------------------------------------------------------------- /opencl/matmul-double/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2011 by Justin Holewinski 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | # 22 | 23 | set(_cpp_sources matmul-double.cpp) 24 | 25 | create_opencl_targets(_cl_targets matmul-double_kernel) 26 | 27 | add_executable(ocl-matmul-double ${_cpp_sources}) 28 | target_link_libraries(ocl-matmul-double ${OPENCL_LIBRARY} sampleutil) 29 | add_dependencies(ocl-matmul-double ${_cl_targets}) 30 | -------------------------------------------------------------------------------- /opencl/matmul-double/matmul-double.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 by Justin Holewinski 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | #include "common/OCLSample.hpp" 27 | 28 | #define BLOCK_SIZE 16 29 | 30 | class MatMulDoubleSample : public OCLSample { 31 | public: 32 | 33 | MatMulDoubleSample(); 34 | 35 | protected: 36 | 37 | virtual void initialize(); 38 | virtual void createMemoryBuffers(); 39 | virtual void setupKernel(cl::Kernel kernel); 40 | virtual void finishKernel(cl::Kernel kernel); 41 | virtual void runKernel(cl::Kernel kernel, cl::Event* evt); 42 | 43 | private: 44 | 45 | cl::Program programCL_; 46 | cl::Program programPTX_; 47 | 48 | cl::Buffer deviceA_; 49 | cl::Buffer deviceB_; 50 | cl::Buffer deviceC_; 51 | 52 | double* hostA_; 53 | double* hostB_; 54 | double* hostC_; 55 | 56 | unsigned int ProblemSize_; 57 | unsigned int ArraySize_; 58 | }; 59 | 60 | 61 | MatMulDoubleSample::MatMulDoubleSample() { 62 | ProblemSize_ = 4096; 63 | ArraySize_ = ProblemSize_ * ProblemSize_; 64 | } 65 | 66 | void MatMulDoubleSample::initialize() { 67 | cl_int result; 68 | 69 | programCL_ = compileSource("matmul-double_kernel.cl"); 70 | programPTX_ = loadBinary("matmul-double_kernel.ptx"); 71 | 72 | cl::Kernel kernelCL = cl::Kernel(programCL_, "matmul", &result); 73 | assert(result == CL_SUCCESS && "Failed to extract kernel"); 74 | cl::Kernel kernelPTX = cl::Kernel(programPTX_, "matmul", &result); 75 | assert(result == CL_SUCCESS && "Failed to extract kernel"); 76 | 77 | setSourceKernel(kernelCL); 78 | setBinaryKernel(kernelPTX); 79 | 80 | // For this sample, let's run 16 iterations 81 | setNumberOfIterations(16); 82 | } 83 | 84 | void MatMulDoubleSample::runKernel(cl::Kernel kernel, cl::Event* evt) { 85 | cl_int result; 86 | cl::NDRange globalSize(ProblemSize_, ProblemSize_); 87 | cl::NDRange localSize(BLOCK_SIZE, BLOCK_SIZE); 88 | 89 | result = kernel.setArg(0, deviceA_); 90 | assert(result == CL_SUCCESS && "Failed to set kernel argument 0"); 91 | result = kernel.setArg(1, deviceB_); 92 | assert(result == CL_SUCCESS && "Failed to set kernel argument 1"); 93 | result = kernel.setArg(2, deviceC_); 94 | assert(result == CL_SUCCESS && "Failed to set kernel argument 2"); 95 | 96 | result = getCommandQueue().enqueueNDRangeKernel(kernel, cl::NullRange, 97 | globalSize, localSize, 0, 98 | evt); 99 | assert(result == CL_SUCCESS && "Failed to launch kernel"); 100 | } 101 | 102 | void MatMulDoubleSample::createMemoryBuffers() { 103 | cl_int result; 104 | 105 | // Create host buffers 106 | hostA_ = new double[ArraySize_]; 107 | hostB_ = new double[ArraySize_]; 108 | hostC_ = new double[ArraySize_]; 109 | 110 | // Create device buffers 111 | deviceA_ = cl::Buffer(getContext(), CL_MEM_READ_ONLY, 112 | ArraySize_*sizeof(double), NULL, &result); 113 | assert(result == CL_SUCCESS && "Failed to allocate device buffer"); 114 | deviceB_ = cl::Buffer(getContext(), CL_MEM_READ_ONLY, 115 | ArraySize_*sizeof(double), NULL, &result); 116 | assert(result == CL_SUCCESS && "Failed to allocate device buffer"); 117 | deviceC_ = cl::Buffer(getContext(), CL_MEM_WRITE_ONLY, 118 | ArraySize_*sizeof(double), NULL, &result); 119 | assert(result == CL_SUCCESS && "Failed to allocate device buffer"); 120 | } 121 | 122 | void MatMulDoubleSample::setupKernel(cl::Kernel kernel) { 123 | cl_int result; 124 | 125 | // Copy data to device 126 | result = getCommandQueue().enqueueWriteBuffer(deviceA_, CL_TRUE, 0, 127 | ArraySize_*sizeof(double), 128 | hostA_, NULL, NULL); 129 | assert(result == CL_SUCCESS && "Failed to queue data copy to device"); 130 | result = getCommandQueue().enqueueWriteBuffer(deviceB_, CL_TRUE, 0, 131 | ArraySize_*sizeof(double), 132 | hostB_, NULL, NULL); 133 | assert(result == CL_SUCCESS && "Failed to queue data copy to device"); 134 | } 135 | 136 | void MatMulDoubleSample::finishKernel(cl::Kernel kernel) { 137 | cl_int result; 138 | 139 | // Copy data back to host 140 | result = getCommandQueue().enqueueReadBuffer(deviceC_, CL_TRUE, 0, 141 | ArraySize_*sizeof(double), 142 | hostC_, NULL, NULL); 143 | assert(result == CL_SUCCESS && "Failed to queue data copy to host"); 144 | } 145 | 146 | int main(int argc, char** argv) { 147 | MatMulDoubleSample sample; 148 | 149 | sample.run(); 150 | 151 | return 0; 152 | } 153 | -------------------------------------------------------------------------------- /opencl/matmul-double/matmul-double_kernel.cl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 by Justin Holewinski 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable 24 | 25 | // This must match BLOCK_SIZE in matmul-double.cpp 26 | #define BLOCK_SIZE 16 27 | 28 | __kernel 29 | void matmul(__global double* A, __global double* B, __global double* C) { 30 | 31 | __local double scratchA[BLOCK_SIZE][BLOCK_SIZE]; 32 | __local double scratchB[BLOCK_SIZE][BLOCK_SIZE]; 33 | 34 | int globalX = get_global_id(0); 35 | int globalY = get_global_id(1); 36 | int size = get_global_size(0); 37 | int k; 38 | double sum = 0.0f; 39 | int numBlocks = size / BLOCK_SIZE; 40 | int b; 41 | 42 | int tidX = get_local_id(0); 43 | int tidY = get_local_id(1); 44 | 45 | for(b = 0; b < numBlocks; ++b) 46 | { 47 | // Populate a cache for A/B 48 | int x; 49 | int y; 50 | 51 | x = b * BLOCK_SIZE + tidX; 52 | y = globalY; 53 | 54 | scratchA[tidY][tidX] = A[y * size + x]; 55 | 56 | x = globalX; 57 | y = b * BLOCK_SIZE + tidY; 58 | 59 | scratchB[tidY][tidX] = B[y * size + x]; 60 | 61 | barrier(CLK_LOCAL_MEM_FENCE); 62 | 63 | for(k = 0; k < BLOCK_SIZE; ++k) 64 | { 65 | double myA; 66 | double myB; 67 | 68 | myA = scratchA[tidY][k]; 69 | myB = scratchB[k][tidX]; 70 | 71 | sum += myA * myB; 72 | } 73 | 74 | barrier(CLK_LOCAL_MEM_FENCE); 75 | } 76 | 77 | C[globalY * size + globalX] = sum; 78 | 79 | } 80 | -------------------------------------------------------------------------------- /opencl/matmul/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2011 by Justin Holewinski 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | # 22 | 23 | set(_cpp_sources matmul.cpp) 24 | 25 | create_opencl_targets(_cl_targets matmul_kernel) 26 | 27 | add_executable(ocl-matmul ${_cpp_sources}) 28 | target_link_libraries(ocl-matmul ${OPENCL_LIBRARY} sampleutil) 29 | add_dependencies(ocl-matmul ${_cl_targets}) 30 | -------------------------------------------------------------------------------- /opencl/matmul/matmul.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 by Justin Holewinski 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | #include "common/OCLSample.hpp" 27 | 28 | #define BLOCK_SIZE 16 29 | 30 | class MatMulSample : public OCLSample { 31 | public: 32 | 33 | MatMulSample(); 34 | 35 | protected: 36 | 37 | virtual void initialize(); 38 | virtual void createMemoryBuffers(); 39 | virtual void setupKernel(cl::Kernel kernel); 40 | virtual void finishKernel(cl::Kernel kernel); 41 | virtual void runKernel(cl::Kernel kernel, cl::Event* evt); 42 | 43 | private: 44 | 45 | cl::Program programCL_; 46 | cl::Program programPTX_; 47 | 48 | cl::Buffer deviceA_; 49 | cl::Buffer deviceB_; 50 | cl::Buffer deviceC_; 51 | 52 | float* hostA_; 53 | float* hostB_; 54 | float* hostC_; 55 | 56 | unsigned int ProblemSize_; 57 | unsigned int ArraySize_; 58 | }; 59 | 60 | 61 | MatMulSample::MatMulSample() { 62 | ProblemSize_ = 4096; 63 | ArraySize_ = ProblemSize_ * ProblemSize_; 64 | } 65 | 66 | void MatMulSample::initialize() { 67 | cl_int result; 68 | 69 | programCL_ = compileSource("matmul_kernel.cl"); 70 | programPTX_ = loadBinary("matmul_kernel.ptx"); 71 | 72 | cl::Kernel kernelCL = cl::Kernel(programCL_, "matmul", &result); 73 | assert(result == CL_SUCCESS && "Failed to extract kernel"); 74 | cl::Kernel kernelPTX = cl::Kernel(programPTX_, "matmul", &result); 75 | assert(result == CL_SUCCESS && "Failed to extract kernel"); 76 | 77 | setSourceKernel(kernelCL); 78 | setBinaryKernel(kernelPTX); 79 | 80 | // For this sample, let's run 16 iterations 81 | setNumberOfIterations(16); 82 | } 83 | 84 | void MatMulSample::runKernel(cl::Kernel kernel, cl::Event* evt) { 85 | cl_int result; 86 | cl::NDRange globalSize(ProblemSize_, ProblemSize_); 87 | cl::NDRange localSize(BLOCK_SIZE, BLOCK_SIZE); 88 | 89 | result = kernel.setArg(0, deviceA_); 90 | assert(result == CL_SUCCESS && "Failed to set kernel argument 0"); 91 | result = kernel.setArg(1, deviceB_); 92 | assert(result == CL_SUCCESS && "Failed to set kernel argument 1"); 93 | result = kernel.setArg(2, deviceC_); 94 | assert(result == CL_SUCCESS && "Failed to set kernel argument 2"); 95 | 96 | result = getCommandQueue().enqueueNDRangeKernel(kernel, cl::NullRange, 97 | globalSize, localSize, 0, 98 | evt); 99 | assert(result == CL_SUCCESS && "Failed to launch kernel"); 100 | } 101 | 102 | void MatMulSample::createMemoryBuffers() { 103 | cl_int result; 104 | 105 | // Create host buffers 106 | hostA_ = new float[ArraySize_]; 107 | hostB_ = new float[ArraySize_]; 108 | hostC_ = new float[ArraySize_]; 109 | 110 | // Create device buffers 111 | deviceA_ = cl::Buffer(getContext(), CL_MEM_READ_ONLY, 112 | ArraySize_*sizeof(float), NULL, &result); 113 | assert(result == CL_SUCCESS && "Failed to allocate device buffer"); 114 | deviceB_ = cl::Buffer(getContext(), CL_MEM_READ_ONLY, 115 | ArraySize_*sizeof(float), NULL, &result); 116 | assert(result == CL_SUCCESS && "Failed to allocate device buffer"); 117 | deviceC_ = cl::Buffer(getContext(), CL_MEM_WRITE_ONLY, 118 | ArraySize_*sizeof(float), NULL, &result); 119 | assert(result == CL_SUCCESS && "Failed to allocate device buffer"); 120 | } 121 | 122 | void MatMulSample::setupKernel(cl::Kernel kernel) { 123 | cl_int result; 124 | 125 | // Copy data to device 126 | result = getCommandQueue().enqueueWriteBuffer(deviceA_, CL_TRUE, 0, 127 | ArraySize_*sizeof(float), 128 | hostA_, NULL, NULL); 129 | assert(result == CL_SUCCESS && "Failed to queue data copy to device"); 130 | result = getCommandQueue().enqueueWriteBuffer(deviceB_, CL_TRUE, 0, 131 | ArraySize_*sizeof(float), 132 | hostB_, NULL, NULL); 133 | assert(result == CL_SUCCESS && "Failed to queue data copy to device"); 134 | } 135 | 136 | void MatMulSample::finishKernel(cl::Kernel kernel) { 137 | cl_int result; 138 | 139 | // Copy data back to host 140 | result = getCommandQueue().enqueueReadBuffer(deviceC_, CL_TRUE, 0, 141 | ArraySize_*sizeof(float), 142 | hostC_, NULL, NULL); 143 | assert(result == CL_SUCCESS && "Failed to queue data copy to host"); 144 | } 145 | 146 | int main(int argc, char** argv) { 147 | MatMulSample sample; 148 | 149 | sample.run(); 150 | 151 | return 0; 152 | } 153 | -------------------------------------------------------------------------------- /opencl/matmul/matmul_kernel.cl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 by Justin Holewinski 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | // This must match BLOCK_SIZE in matmul.cpp 24 | #define BLOCK_SIZE 16 25 | 26 | __kernel 27 | void matmul(__global float* A, __global float* B, __global float* C) { 28 | 29 | __local float scratchA[BLOCK_SIZE][BLOCK_SIZE]; 30 | __local float scratchB[BLOCK_SIZE][BLOCK_SIZE]; 31 | 32 | int globalX = get_global_id(0); 33 | int globalY = get_global_id(1); 34 | int size = get_global_size(0); 35 | int k; 36 | float sum = 0.0f; 37 | int numBlocks = size / BLOCK_SIZE; 38 | int b; 39 | 40 | int tidX = get_local_id(0); 41 | int tidY = get_local_id(1); 42 | 43 | for(b = 0; b < numBlocks; ++b) 44 | { 45 | // Populate a cache for A/B 46 | int x; 47 | int y; 48 | 49 | x = b * BLOCK_SIZE + tidX; 50 | y = globalY; 51 | 52 | scratchA[tidY][tidX] = A[y * size + x]; 53 | 54 | x = globalX; 55 | y = b * BLOCK_SIZE + tidY; 56 | 57 | scratchB[tidY][tidX] = B[y * size + x]; 58 | 59 | barrier(CLK_LOCAL_MEM_FENCE); 60 | 61 | for(k = 0; k < BLOCK_SIZE; ++k) 62 | { 63 | float myA; 64 | float myB; 65 | 66 | myA = scratchA[tidY][k]; 67 | myB = scratchB[k][tidX]; 68 | 69 | sum += myA * myB; 70 | } 71 | 72 | barrier(CLK_LOCAL_MEM_FENCE); 73 | } 74 | 75 | C[globalY * size + globalX] = sum; 76 | 77 | } 78 | --------------------------------------------------------------------------------