├── .gitignore ├── .gitattributes ├── src ├── OpenCL │ ├── lib │ │ ├── OpenCL.lib │ │ └── libOpenCL.so │ └── include │ │ └── CL │ │ ├── opencl.h │ │ ├── cl_version.h │ │ ├── cl_gl.h │ │ └── cl_platform.h ├── kernel.cpp ├── kernel.hpp ├── main.cpp ├── utilities.hpp └── opencl.hpp ├── CITATION.cff ├── make.sh ├── OpenCL-Benchmark.sln ├── LICENSE.md ├── OpenCL-Benchmark.vcxproj └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | .vs/ 3 | OpenCL-Benchmark.vcxproj.user 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | src/OpenCL/** linguist-vendored 2 | src/kernel.cpp linguist-language=OpenCL -------------------------------------------------------------------------------- /src/OpenCL/lib/OpenCL.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProjectPhysX/OpenCL-Benchmark/HEAD/src/OpenCL/lib/OpenCL.lib -------------------------------------------------------------------------------- /src/OpenCL/lib/libOpenCL.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProjectPhysX/OpenCL-Benchmark/HEAD/src/OpenCL/lib/libOpenCL.so -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Lehmann" 5 | given-names: "Moritz" 6 | orcid: "https://orcid.org/0000-0002-4652-8383" 7 | title: "OpenCL-Benchmark" 8 | date-released: 2023-04-30 9 | url: "https://github.com/ProjectPhysX/OpenCL-Benchmark" -------------------------------------------------------------------------------- /make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # command line argument(s): device ID(s); if empty, it will benchmark all available devices 3 | 4 | mkdir -p bin # create directory for executable 5 | rm -f bin/OpenCL-Benchmark # prevent execution of old version if compiling fails 6 | 7 | case "$(uname -a)" in # automatically detect operating system 8 | Darwin*) g++ src/*.cpp -o bin/OpenCL-Benchmark -std=c++17 -pthread -O -Wno-comment -I./src/OpenCL/include -framework OpenCL ;; # macOS 9 | *Android) g++ src/*.cpp -o bin/OpenCL-Benchmark -std=c++17 -pthread -O -Wno-comment -I./src/OpenCL/include -L/system/vendor/lib64 -lOpenCL ;; # Android 10 | * ) g++ src/*.cpp -o bin/OpenCL-Benchmark -std=c++17 -pthread -O -Wno-comment -I./src/OpenCL/include -L./src/OpenCL/lib -lOpenCL ;; # Linux 11 | esac 12 | 13 | if [[ $? 
== 0 ]]; then bin/OpenCL-Benchmark "$@"; fi # run executable only if last compilation was successful 14 | -------------------------------------------------------------------------------- /OpenCL-Benchmark.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.31729.503 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "OpenCL-Benchmark", "OpenCL-Benchmark.vcxproj", "{B07BD873-9CD9-4F0B-AAA8-8AE6FE22F76A}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Release|x64 = Release|x64 11 | EndGlobalSection 12 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 13 | {B07BD873-9CD9-4F0B-AAA8-8AE6FE22F76A}.Release|x64.ActiveCfg = Release|x64 14 | {B07BD873-9CD9-4F0B-AAA8-8AE6FE22F76A}.Release|x64.Build.0 = Release|x64 15 | EndGlobalSection 16 | GlobalSection(SolutionProperties) = preSolution 17 | HideSolutionNode = FALSE 18 | EndGlobalSection 19 | GlobalSection(ExtensibilityGlobals) = postSolution 20 | SolutionGuid = {CF46CF2E-5B57-4081-86EB-6E1333CB46A3} 21 | EndGlobalSection 22 | EndGlobal 23 | -------------------------------------------------------------------------------- /src/OpenCL/include/CL/opencl.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2021 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | #ifndef __OPENCL_H 18 | #define __OPENCL_H 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif /* __OPENCL_H */ 33 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2023-2024 Dr. Moritz Lehmann 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files, to use this software for educational use, non-military research or non-military commercial use, and to alter it and redistribute it freely, subject to the following restrictions: 4 | 5 | 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation should be provided. 6 | 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 7 | 3. This notice may not be removed or altered from any source distribution. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /src/OpenCL/include/CL/cl_version.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2018-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | #ifndef __CL_VERSION_H 18 | #define __CL_VERSION_H 19 | 20 | /* Detect which version to target */ 21 | #if !defined(CL_TARGET_OPENCL_VERSION) 22 | #pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. 
Defaulting to 300 (OpenCL 3.0)") 23 | #define CL_TARGET_OPENCL_VERSION 300 24 | #endif 25 | #if CL_TARGET_OPENCL_VERSION != 100 && \ 26 | CL_TARGET_OPENCL_VERSION != 110 && \ 27 | CL_TARGET_OPENCL_VERSION != 120 && \ 28 | CL_TARGET_OPENCL_VERSION != 200 && \ 29 | CL_TARGET_OPENCL_VERSION != 210 && \ 30 | CL_TARGET_OPENCL_VERSION != 220 && \ 31 | CL_TARGET_OPENCL_VERSION != 300 32 | #pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220, 300). Defaulting to 300 (OpenCL 3.0)") 33 | #undef CL_TARGET_OPENCL_VERSION 34 | #define CL_TARGET_OPENCL_VERSION 300 35 | #endif 36 | 37 | 38 | /* OpenCL Version */ 39 | #if CL_TARGET_OPENCL_VERSION >= 300 && !defined(CL_VERSION_3_0) 40 | #define CL_VERSION_3_0 1 41 | #endif 42 | #if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2) 43 | #define CL_VERSION_2_2 1 44 | #endif 45 | #if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1) 46 | #define CL_VERSION_2_1 1 47 | #endif 48 | #if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0) 49 | #define CL_VERSION_2_0 1 50 | #endif 51 | #if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2) 52 | #define CL_VERSION_1_2 1 53 | #endif 54 | #if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1) 55 | #define CL_VERSION_1_1 1 56 | #endif 57 | #if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0) 58 | #define CL_VERSION_1_0 1 59 | #endif 60 | 61 | /* Allow deprecated APIs for older OpenCL versions. 
*/ 62 | #if CL_TARGET_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS) 63 | #define CL_USE_DEPRECATED_OPENCL_2_2_APIS 64 | #endif 65 | #if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS) 66 | #define CL_USE_DEPRECATED_OPENCL_2_1_APIS 67 | #endif 68 | #if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS) 69 | #define CL_USE_DEPRECATED_OPENCL_2_0_APIS 70 | #endif 71 | #if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) 72 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS 73 | #endif 74 | #if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) 75 | #define CL_USE_DEPRECATED_OPENCL_1_1_APIS 76 | #endif 77 | #if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS) 78 | #define CL_USE_DEPRECATED_OPENCL_1_0_APIS 79 | #endif 80 | 81 | #endif /* __CL_VERSION_H */ 82 | -------------------------------------------------------------------------------- /OpenCL-Benchmark.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Release 6 | x64 7 | 8 | 9 | 10 | 15.0 11 | {B07BD873-9CD9-4F0B-AAA8-8AE6FE22F76A} 12 | OpenCL-Benchmark 13 | 10.0 14 | 15 | 16 | 17 | Application 18 | false 19 | v142 20 | true 21 | MultiByte 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | $(SolutionDir)bin\ 34 | $(SolutionDir)temp\ 35 | 36 | 37 | 38 | Level3 39 | MaxSpeed 40 | true 41 | true 42 | true 43 | true 44 | $(SolutionDir)src\OpenCL\include;%(AdditionalIncludeDirectories) 45 | true 46 | Speed 47 | true 48 | Fast 49 | stdcpp17 50 | 26451;6386;%(DisableSpecificWarnings) 51 | 52 | 53 | Console 54 | true 55 | true 56 | $(SolutionDir)src\OpenCL\lib 57 | OpenCL.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 
| 70 | 71 | 72 | -------------------------------------------------------------------------------- /src/kernel.cpp: -------------------------------------------------------------------------------- 1 | #include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with )+R( 2 | string opencl_c_container() { return R( // ########################## begin of OpenCL C code #################################################################### 3 | 4 | 5 | 6 | int dp4a(const char4 a, const char4 b, const int c) { // 4-wide byte dot product and accumulate 7 | )+"#if cl_nv_compute_capability>=61"+R( // use hardware-supported dp4a on Nvidia Pascal or newer GPUs with inline PTX assembly 8 | int d;)+"asm(\"dp4a.s32.s32\t%0,%1,%2,%3;\":\"=r\"(d):\"r\"(as_int(a)),\"r\"(as_int(b)),\"r\"(c));"+R(return d; 9 | )+"#elif defined(__opencl_c_integer_dot_product_input_4x8bit)"+R( // use hardware-supported dp4a on some Intel GPUs 10 | return c+dot(a, b); // dot_acc_sat(a, b, c); is slow 11 | )+"#elif __has_builtin(__builtin_amdgcn_sdot4)"+R( // use hardware-supported dp4a on older AMD GPUs 12 | return __builtin_amdgcn_sdot4(as_int(a), as_int(b), c, false); 13 | )+"#elif __has_builtin(__builtin_amdgcn_sudot4)"+R( // use hardware-supported dp4a on newer AMD GPUs 14 | return __builtin_amdgcn_sudot4(true, as_int(a), true, as_int(b), c, false); 15 | )+"#elif defined(cl_arm_integer_dot_product_accumulate_int8)"+R( // use hardware-supported dp4a on some ARM GPUs 16 | return arm_dot_acc(a, b, c); 17 | )+"#else"+R( // fallback emulation (compilers will turn this into hardware-supported dp4a instruction if available) 18 | return c+a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w; 19 | )+"#endif"+R( 20 | } 21 | 22 | 23 | 24 | )+"#ifdef cl_khr_fp64"+R( // OpenCL C defines don't work in R() stringification macro 25 | kernel void kernel_double(global float* data) { 26 | double x = (double)get_global_id(0); 27 | double y = 
(double)get_local_id(0); 28 | for(uint i=0u; i<128u; i++) { 29 | x = fma(y, x, y); // 2 operations 30 | y = fma(x, y, x); // 2 operations 31 | } 32 | data[get_global_id(0)] = (float)y; 33 | } 34 | )+"#endif"+R( // cl_khr_fp64 35 | 36 | kernel void kernel_float(global float* data) { 37 | float x = (float)get_global_id(0); 38 | float y = (float)get_local_id(0); 39 | for(uint i=0u; i<512u; i++) { 40 | x = fma(y, x, y); // 2 operations 41 | y = fma(x, y, x); // 2 operations 42 | } 43 | data[get_global_id(0)] = y; 44 | } 45 | 46 | )+"#ifdef cl_khr_fp16"+R( // OpenCL C defines don't work in R() stringification macro 47 | kernel void kernel_half(global float* data) { 48 | half2 x = (half2)((float)get_global_id(0), (float)get_local_id(0)); 49 | half2 y = (half2)((float)get_local_id(0), (float)get_global_id(0)); 50 | for(uint i=0u; i<512u; i++) { 51 | x = y*x+y; // 4 operations 52 | y = x*y+x; // 4 operations 53 | } 54 | data[get_global_id(0)] = (float)y.x+(float)y.y; 55 | } 56 | )+"#endif"+R( // cl_khr_fp16 57 | 58 | kernel void kernel_long(global float* data) { 59 | long x = (long)get_global_id(0); 60 | long y = (long)get_local_id(0); 61 | for(uint i=0u; i<8u; i++) { 62 | x = y*x+y; // 2 operations 63 | y = x*y+x; // 2 operations 64 | } 65 | data[get_global_id(0)] = as_float((int)y); 66 | } 67 | 68 | kernel void kernel_int(global float* data) { 69 | int x = get_global_id(0); 70 | int y = get_local_id(0); 71 | for(uint i=0u; i<512u; i++) { 72 | x = y*x+y; // 2 operations 73 | y = x*y+x; // 2 operations 74 | } 75 | data[get_global_id(0)] = as_float(y); 76 | } 77 | 78 | kernel void kernel_short(global float* data) { 79 | short2 x = as_short2((uint)get_global_id(0)); 80 | short2 y = as_short2((uint)get_local_id(0)); 81 | for(uint i=0u; i<128u; i++) { 82 | x = y*x+y; // 4 operations 83 | y = x*y+x; // 4 operations 84 | } 85 | data[get_global_id(0)] = as_float(y); 86 | } 87 | 88 | kernel void kernel_char(global float* data) { 89 | char4 x = as_char4((uint)get_global_id(0)); 90 
| char4 y = as_char4((uint)get_local_id(0)); 91 | for(uint i=0u; i<64u; i++) { 92 | x = as_char4(dp4a(y, x, as_int(y))); // 8 operations 93 | y = as_char4(dp4a(x, y, as_int(x))); // 8 operations 94 | } 95 | data[get_global_id(0)] = as_float(y); 96 | } 97 | 98 | 99 | 100 | kernel void kernel_coalesced_write(global float* data) { 101 | const uint n = get_global_id(0); 102 | for(uint i=0u; i>23 80 | #define mad_sat(a,b,c) // a*b+c 81 | #define max(x,y) 82 | #define min(x,y) 83 | 84 | // integer and floating-point functions 85 | #define clamp(x,a,b) 86 | #define sign(x) 87 | 88 | // floating-point functions 89 | #define acos(x) 90 | #define acosh(x) 91 | #define acospi(x) // acos(x)/pi 92 | #define asin(x) 93 | #define asinh(x) 94 | #define asinpi(x) // asin(x)/pi 95 | #define atan(x) 96 | #define atan2(x,y) // atan(x/y) 97 | #define atanh(x) 98 | #define atanpi(x) // atan(x)/pi 99 | #define atan2pi(x,y) // atan(x/y)/pi 100 | #define cbrt(x) // x^(1/3) 101 | #define copysign(x,y) // x with sign changed to sign of y 102 | #define cos(x) 103 | #define cosh(x) 104 | #define cospi(x) // cos(pi*x) 105 | #define degrees(x) // x*180/pi 106 | #define erfc(x) // complementary error function 107 | #define erf(x) // error function 108 | #define exp(x) // e^x 109 | #define exp2(x) // 2^x 110 | #define exp10(x) // 10^x 111 | #define expm1(x) // e^x-1 112 | #define fabs(x) // |x| 113 | #define fdim(x,y) // max(x-y,0) 114 | #define floor(x) // (float)((int)x) 115 | #define fma(a,b,c) // a*b+c 116 | #define fmax(x,y) // max(x,y) 117 | #define fmin(x,y) // min(x,y) 118 | #define fmod(x,y) // x%y 119 | #define hypot(x,y) // (x^2+y^2)^(1/2) 120 | #define isfinite(x) // test for finite value 121 | #define isinf(x) // test for infinity 122 | #define isnan(x) // test for NaN 123 | #define isnormal(x) // test for normal value 124 | #define ldexp(x,n) // x*2^n (n is integer) 125 | #define lgamma(x) // log gamma function 126 | #define log(x) // ln(x) 127 | #define log2(x) // log_2(x) 128 | 
#define log10(x) // log_10(x) 129 | #define log1p(x) // ln(1+x) 130 | #define mad(a,b,c) // a*b+c (approximation) 131 | #define maxmag(x,y) // max(|x|,|y|) 132 | #define minmag(x,y) // min(|x|,|y|) 133 | #define native_rsqrt(x) // x^(-1/2) 134 | #define native_sqrt(x) // x^(1/2) 135 | #define pow(x,y) // x^y 136 | #define pown(x,n) // x^n, where n is an integer 137 | #define powr(x,y) // x^y, where x>=0 138 | #define radians(x) // x*pi/180 139 | #define rootn(x,y) // x^(1/y) 140 | #define rsqrt(x) // x^(-1/2), slower, use native_rsqrt(x) instead 141 | #define signbit(x) // test for sign bit 142 | #define sin(x) 143 | #define sinh(x) 144 | #define sinpi(x) // sin(pi*x) 145 | #define sqrt(x) // x^(1/2), slower, use native_sqrt(x) instead 146 | #define step(x,y) // y buffer(device, N, M); 37 | //print_info("Device mormory usage: "+to_string(device.info.memory_used)+" MB"); 38 | 39 | if(device.info.is_fp64_capable) { 40 | print("| Benchmarking ... |"); 41 | Kernel kernel_double(device, N, "kernel_double", buffer); 42 | for(uint i=0u; i17.6f?4:bw_max>8.8f?3:bw_max>4.4f?2:1)+" x16)"+alignr(8u, to_string(bw_bidirectional, 2u))+" GB/s |"); 177 | } 178 | 179 | println("|-----------------------------------------------------------------------------|"); 180 | } 181 | 182 | int main(int argc, char* argv[]) { 183 | vector main_arguments = get_main_arguments(argc, argv); 184 | println(".-----------------------------------------------------------------------------."); 185 | const vector devices = get_devices(); 186 | if((int)main_arguments.size()>0) { 187 | for(uint i=0u; i<(uint)main_arguments.size(); i++) benchmark_device(select_device_with_id(to_int(main_arguments[i]), devices)); 188 | } else { 189 | for(uint i=0u; i<(uint)devices.size(); i++) benchmark_device(devices[i]); 190 | } 191 | #ifdef _WIN32 192 | println("|-----------------------------------------------------------------------------|"); 193 | println("| Done. Press Enter to exit. 
|"); 194 | println("'-----------------------------------------------------------------------------'"); 195 | wait(); 196 | #else // Linux 197 | println("'-----------------------------------------------------------------------------'"); 198 | #endif // Linux 199 | return 0; 200 | } -------------------------------------------------------------------------------- /src/OpenCL/include/CL/cl_gl.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2023 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | #ifndef OPENCL_CL_GL_H_ 18 | #define OPENCL_CL_GL_H_ 19 | 20 | /* 21 | ** This header is generated from the Khronos OpenCL XML API Registry. 
22 | */ 23 | 24 | #include 25 | 26 | /* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */ 27 | #if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES) 28 | #define CL_NO_EXTENSION_PROTOTYPES 29 | #endif 30 | 31 | /* CL_NO_EXTENSION_PROTOTYPES implies 32 | CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and 33 | CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */ 34 | #if defined(CL_NO_EXTENSION_PROTOTYPES) && \ 35 | !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) 36 | #define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES 37 | #endif 38 | #if defined(CL_NO_EXTENSION_PROTOTYPES) && \ 39 | !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) 40 | #define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES 41 | #endif 42 | 43 | #ifdef __cplusplus 44 | extern "C" { 45 | #endif 46 | 47 | /*************************************************************** 48 | * cl_khr_gl_sharing 49 | ***************************************************************/ 50 | #define cl_khr_gl_sharing 1 51 | #define CL_KHR_GL_SHARING_EXTENSION_NAME \ 52 | "cl_khr_gl_sharing" 53 | 54 | 55 | #define CL_KHR_GL_SHARING_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0) 56 | 57 | typedef int cl_GLint; 58 | typedef unsigned int cl_GLenum; 59 | typedef unsigned int cl_GLuint; 60 | 61 | typedef cl_uint cl_gl_context_info; 62 | 63 | /* Error codes */ 64 | #define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 65 | 66 | /* cl_gl_context_info */ 67 | #define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 68 | #define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 69 | 70 | /* Additional cl_context_properties */ 71 | #define CL_GL_CONTEXT_KHR 0x2008 72 | #define CL_EGL_DISPLAY_KHR 0x2009 73 | #define CL_GLX_DISPLAY_KHR 0x200A 74 | #define CL_WGL_HDC_KHR 0x200B 75 | #define CL_CGL_SHAREGROUP_KHR 0x200C 76 | 77 | typedef cl_uint cl_gl_object_type; 78 | typedef cl_uint cl_gl_texture_info; 79 | typedef cl_uint cl_gl_platform_info; 80 | 81 | /* cl_gl_object_type */ 82 | #define CL_GL_OBJECT_BUFFER 0x2000 83 | #define CL_GL_OBJECT_TEXTURE2D 
0x2001 84 | #define CL_GL_OBJECT_TEXTURE3D 0x2002 85 | #define CL_GL_OBJECT_RENDERBUFFER 0x2003 86 | 87 | #if defined(CL_VERSION_1_2) 88 | /* cl_gl_object_type */ 89 | #define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E 90 | #define CL_GL_OBJECT_TEXTURE1D 0x200F 91 | #define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 92 | #define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 93 | 94 | #endif /* defined(CL_VERSION_1_2) */ 95 | 96 | /* cl_gl_texture_info */ 97 | #define CL_GL_TEXTURE_TARGET 0x2004 98 | #define CL_GL_MIPMAP_LEVEL 0x2005 99 | 100 | 101 | typedef cl_int CL_API_CALL 102 | clGetGLContextInfoKHR_t( 103 | const cl_context_properties* properties, 104 | cl_gl_context_info param_name, 105 | size_t param_value_size, 106 | void* param_value, 107 | size_t* param_value_size_ret); 108 | 109 | typedef clGetGLContextInfoKHR_t * 110 | clGetGLContextInfoKHR_fn CL_API_SUFFIX__VERSION_1_0; 111 | 112 | typedef cl_mem CL_API_CALL 113 | clCreateFromGLBuffer_t( 114 | cl_context context, 115 | cl_mem_flags flags, 116 | cl_GLuint bufobj, 117 | cl_int* errcode_ret); 118 | 119 | typedef clCreateFromGLBuffer_t * 120 | clCreateFromGLBuffer_fn CL_API_SUFFIX__VERSION_1_0; 121 | 122 | #if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) 123 | 124 | extern CL_API_ENTRY cl_int CL_API_CALL 125 | clGetGLContextInfoKHR( 126 | const cl_context_properties* properties, 127 | cl_gl_context_info param_name, 128 | size_t param_value_size, 129 | void* param_value, 130 | size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; 131 | 132 | extern CL_API_ENTRY cl_mem CL_API_CALL 133 | clCreateFromGLBuffer( 134 | cl_context context, 135 | cl_mem_flags flags, 136 | cl_GLuint bufobj, 137 | cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; 138 | 139 | #endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ 140 | 141 | #if defined(CL_VERSION_1_2) 142 | 143 | typedef cl_mem CL_API_CALL 144 | clCreateFromGLTexture_t( 145 | cl_context context, 146 | cl_mem_flags flags, 147 | cl_GLenum target, 148 | cl_GLint miplevel, 149 | 
cl_GLuint texture, 150 | cl_int* errcode_ret); 151 | 152 | typedef clCreateFromGLTexture_t * 153 | clCreateFromGLTexture_fn CL_API_SUFFIX__VERSION_1_2; 154 | 155 | #if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) 156 | 157 | extern CL_API_ENTRY cl_mem CL_API_CALL 158 | clCreateFromGLTexture( 159 | cl_context context, 160 | cl_mem_flags flags, 161 | cl_GLenum target, 162 | cl_GLint miplevel, 163 | cl_GLuint texture, 164 | cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; 165 | 166 | #endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ 167 | 168 | #endif /* defined(CL_VERSION_1_2) */ 169 | 170 | 171 | typedef cl_mem CL_API_CALL 172 | clCreateFromGLRenderbuffer_t( 173 | cl_context context, 174 | cl_mem_flags flags, 175 | cl_GLuint renderbuffer, 176 | cl_int* errcode_ret); 177 | 178 | typedef clCreateFromGLRenderbuffer_t * 179 | clCreateFromGLRenderbuffer_fn CL_API_SUFFIX__VERSION_1_0; 180 | 181 | typedef cl_int CL_API_CALL 182 | clGetGLObjectInfo_t( 183 | cl_mem memobj, 184 | cl_gl_object_type* gl_object_type, 185 | cl_GLuint* gl_object_name); 186 | 187 | typedef clGetGLObjectInfo_t * 188 | clGetGLObjectInfo_fn CL_API_SUFFIX__VERSION_1_0; 189 | 190 | typedef cl_int CL_API_CALL 191 | clGetGLTextureInfo_t( 192 | cl_mem memobj, 193 | cl_gl_texture_info param_name, 194 | size_t param_value_size, 195 | void* param_value, 196 | size_t* param_value_size_ret); 197 | 198 | typedef clGetGLTextureInfo_t * 199 | clGetGLTextureInfo_fn CL_API_SUFFIX__VERSION_1_0; 200 | 201 | typedef cl_int CL_API_CALL 202 | clEnqueueAcquireGLObjects_t( 203 | cl_command_queue command_queue, 204 | cl_uint num_objects, 205 | const cl_mem* mem_objects, 206 | cl_uint num_events_in_wait_list, 207 | const cl_event* event_wait_list, 208 | cl_event* event); 209 | 210 | typedef clEnqueueAcquireGLObjects_t * 211 | clEnqueueAcquireGLObjects_fn CL_API_SUFFIX__VERSION_1_0; 212 | 213 | typedef cl_int CL_API_CALL 214 | clEnqueueReleaseGLObjects_t( 215 | cl_command_queue command_queue, 216 | cl_uint 
num_objects, 217 | const cl_mem* mem_objects, 218 | cl_uint num_events_in_wait_list, 219 | const cl_event* event_wait_list, 220 | cl_event* event); 221 | 222 | typedef clEnqueueReleaseGLObjects_t * 223 | clEnqueueReleaseGLObjects_fn CL_API_SUFFIX__VERSION_1_0; 224 | 225 | #if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) 226 | 227 | extern CL_API_ENTRY cl_mem CL_API_CALL 228 | clCreateFromGLRenderbuffer( 229 | cl_context context, 230 | cl_mem_flags flags, 231 | cl_GLuint renderbuffer, 232 | cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; 233 | 234 | extern CL_API_ENTRY cl_int CL_API_CALL 235 | clGetGLObjectInfo( 236 | cl_mem memobj, 237 | cl_gl_object_type* gl_object_type, 238 | cl_GLuint* gl_object_name) CL_API_SUFFIX__VERSION_1_0; 239 | 240 | extern CL_API_ENTRY cl_int CL_API_CALL 241 | clGetGLTextureInfo( 242 | cl_mem memobj, 243 | cl_gl_texture_info param_name, 244 | size_t param_value_size, 245 | void* param_value, 246 | size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; 247 | 248 | extern CL_API_ENTRY cl_int CL_API_CALL 249 | clEnqueueAcquireGLObjects( 250 | cl_command_queue command_queue, 251 | cl_uint num_objects, 252 | const cl_mem* mem_objects, 253 | cl_uint num_events_in_wait_list, 254 | const cl_event* event_wait_list, 255 | cl_event* event) CL_API_SUFFIX__VERSION_1_0; 256 | 257 | extern CL_API_ENTRY cl_int CL_API_CALL 258 | clEnqueueReleaseGLObjects( 259 | cl_command_queue command_queue, 260 | cl_uint num_objects, 261 | const cl_mem* mem_objects, 262 | cl_uint num_events_in_wait_list, 263 | const cl_event* event_wait_list, 264 | cl_event* event) CL_API_SUFFIX__VERSION_1_0; 265 | 266 | #endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ 267 | 268 | /* OpenCL 1.0 APIs that were deprecated in OpenCL 1.2 */ 269 | 270 | typedef cl_mem CL_API_CALL 271 | clCreateFromGLTexture2D_t( 272 | cl_context context, 273 | cl_mem_flags flags, 274 | cl_GLenum target, 275 | cl_GLint miplevel, 276 | cl_GLuint texture, 277 | cl_int* errcode_ret); 278 
| 279 | typedef clCreateFromGLTexture2D_t * 280 | clCreateFromGLTexture2D_fn CL_API_SUFFIX__VERSION_1_1_DEPRECATED; 281 | 282 | typedef cl_mem CL_API_CALL 283 | clCreateFromGLTexture3D_t( 284 | cl_context context, 285 | cl_mem_flags flags, 286 | cl_GLenum target, 287 | cl_GLint miplevel, 288 | cl_GLuint texture, 289 | cl_int* errcode_ret); 290 | 291 | typedef clCreateFromGLTexture3D_t * 292 | clCreateFromGLTexture3D_fn CL_API_SUFFIX__VERSION_1_1_DEPRECATED; 293 | 294 | #if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) 295 | 296 | extern CL_API_ENTRY cl_mem CL_API_CALL 297 | clCreateFromGLTexture2D( 298 | cl_context context, 299 | cl_mem_flags flags, 300 | cl_GLenum target, 301 | cl_GLint miplevel, 302 | cl_GLuint texture, 303 | cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; 304 | 305 | extern CL_API_ENTRY cl_mem CL_API_CALL 306 | clCreateFromGLTexture3D( 307 | cl_context context, 308 | cl_mem_flags flags, 309 | cl_GLenum target, 310 | cl_GLint miplevel, 311 | cl_GLuint texture, 312 | cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; 313 | 314 | #endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ 315 | 316 | /*************************************************************** 317 | * cl_khr_gl_event 318 | ***************************************************************/ 319 | #define cl_khr_gl_event 1 320 | #define CL_KHR_GL_EVENT_EXTENSION_NAME \ 321 | "cl_khr_gl_event" 322 | 323 | 324 | #define CL_KHR_GL_EVENT_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0) 325 | 326 | typedef struct __GLsync * cl_GLsync; 327 | 328 | /* cl_command_type */ 329 | #define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D 330 | 331 | 332 | typedef cl_event CL_API_CALL 333 | clCreateEventFromGLsyncKHR_t( 334 | cl_context context, 335 | cl_GLsync sync, 336 | cl_int* errcode_ret); 337 | 338 | typedef clCreateEventFromGLsyncKHR_t * 339 | clCreateEventFromGLsyncKHR_fn CL_API_SUFFIX__VERSION_1_1; 340 | 341 | #if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) 342 | 
343 | extern CL_API_ENTRY cl_event CL_API_CALL 344 | clCreateEventFromGLsyncKHR( 345 | cl_context context, 346 | cl_GLsync sync, 347 | cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1; 348 | 349 | #endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ 350 | 351 | /*************************************************************** 352 | * cl_khr_gl_depth_images 353 | ***************************************************************/ 354 | #define cl_khr_gl_depth_images 1 355 | #define CL_KHR_GL_DEPTH_IMAGES_EXTENSION_NAME \ 356 | "cl_khr_gl_depth_images" 357 | 358 | 359 | #define CL_KHR_GL_DEPTH_IMAGES_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0) 360 | 361 | /* cl_channel_order */ 362 | #define CL_DEPTH_STENCIL 0x10BE 363 | 364 | /* cl_channel_type */ 365 | #define CL_UNORM_INT24 0x10DF 366 | 367 | /*************************************************************** 368 | * cl_khr_gl_msaa_sharing 369 | ***************************************************************/ 370 | #define cl_khr_gl_msaa_sharing 1 371 | #define CL_KHR_GL_MSAA_SHARING_EXTENSION_NAME \ 372 | "cl_khr_gl_msaa_sharing" 373 | 374 | 375 | #define CL_KHR_GL_MSAA_SHARING_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0) 376 | 377 | /* cl_gl_texture_info */ 378 | #define CL_GL_NUM_SAMPLES 0x2012 379 | 380 | /*************************************************************** 381 | * cl_intel_sharing_format_query_gl 382 | ***************************************************************/ 383 | #define cl_intel_sharing_format_query_gl 1 384 | #define CL_INTEL_SHARING_FORMAT_QUERY_GL_EXTENSION_NAME \ 385 | "cl_intel_sharing_format_query_gl" 386 | 387 | 388 | #define CL_INTEL_SHARING_FORMAT_QUERY_GL_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0) 389 | 390 | /* when cl_khr_gl_sharing is supported */ 391 | 392 | typedef cl_int CL_API_CALL 393 | clGetSupportedGLTextureFormatsINTEL_t( 394 | cl_context context, 395 | cl_mem_flags flags, 396 | cl_mem_object_type image_type, 397 | cl_uint num_entries, 398 | cl_GLenum* gl_formats, 399 
| cl_uint* num_texture_formats); 400 | 401 | typedef clGetSupportedGLTextureFormatsINTEL_t * 402 | clGetSupportedGLTextureFormatsINTEL_fn ; 403 | 404 | #if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) 405 | 406 | extern CL_API_ENTRY cl_int CL_API_CALL 407 | clGetSupportedGLTextureFormatsINTEL( 408 | cl_context context, 409 | cl_mem_flags flags, 410 | cl_mem_object_type image_type, 411 | cl_uint num_entries, 412 | cl_GLenum* gl_formats, 413 | cl_uint* num_texture_formats) ; 414 | 415 | #endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ 416 | 417 | #ifdef __cplusplus 418 | } 419 | #endif 420 | 421 | #endif /* OPENCL_CL_GL_H_ */ 422 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenCL-Benchmark 2 | 3 | A small [OpenCL](https://github.com/ProjectPhysX/OpenCL-Wrapper "OpenCL-Wrapper") benchmark program to measure peak GPU/CPU performance. 4 | 5 | Works with any GPU in Windows, Linux, macOS and Android. 
6 | 7 | 8 | 9 | ## Measurements 10 | - compute performance (`FP64` (scalar), `FP32` (scalar), `FP16` (half2), `INT64` (scalar), `INT32` (scalar), `INT16` (short2), `INT8` (dp4a)) 11 | - the closest possible fraction/multiplier of `measured compute performance` divided by `reported theoretical FP32 performance` is shown in `(round brackets)` 12 | - for example, when OpenCL reports `19.492` TFLOPs/s theoretical FP32, and the benchmark measures `9.512` TFLOPs/s for FP64, the ratio of `(measured FP64)/(theoretical FP32) = 9.512/19.492 = 1/2.05` is rounded to the nearest possible value of `1/2` and reported as such 13 | - for any GPU/CPU architecture, these ratios can only be one of `1/64`, `1/32`, `1/24`, `1/16`, `1/12`, `1/8`, `1/4`, `1/3`, `1/2`, `2/3`, `1x`, `2x`, `4x`, `8x`, `16x`, `32x`, `64x`, with nothing in between 14 | - memory bandwidth (`coalesced`/`misaligned` `read`/`write`) 15 | - PCIe bandwidth (`send`/`receive`/`bidirectional`) 16 | - the PCIe generation is estimated from the measured PCIe bandwidth, assuming an x16 link width 17 | 18 | 19 | 20 | ## How to use? 21 | 22 | ### Windows 23 | - Download and install [Visual Studio Community](https://visualstudio.microsoft.com/de/vs/community/). In Visual Studio Installer, add: 24 | - Desktop development with C++ 25 | - MSVC v142 26 | - Windows 10 SDK 27 | - Open [`OpenCL-Benchmark.sln`](OpenCL-Benchmark.sln) in [Visual Studio Community](https://visualstudio.microsoft.com/de/vs/community/). 28 | - Compile and run by clicking the ► Local Windows Debugger button. 
29 | - To run outside of [Visual Studio Community](https://visualstudio.microsoft.com/de/vs/community/), open Windows CMD in the `OpenCL-Benchmark` folder (type `cmd` in the address bar of File Explorer and press Enter), then run 30 | ``` 31 | OpenCL-Benchmark.exe 32 | ``` 33 | 34 | ### Linux / macOS / Android 35 | - Download, compile and run: 36 | ``` 37 | git clone https://github.com/ProjectPhysX/OpenCL-Benchmark.git 38 | cd OpenCL-Benchmark 39 | chmod +x make.sh 40 | ./make.sh 41 | ``` 42 | - Run 43 | ``` 44 | bin/OpenCL-Benchmark 45 | ``` 46 | 47 | ### Run only for a specified list of devices 48 | - Call `bin\OpenCL-Benchmark.exe 0 2 5` (Windows) or `bin/OpenCL-Benchmark 0 2 5` (Linux/macOS), where the numbers are the IDs of the devices to be benchmarked 49 | 50 | 51 | 52 | ## Examples 53 | ``` 54 | |----------------.------------------------------------------------------------| 55 | | Device ID | 0 | 56 | | Device Name | NVIDIA H100 80GB HBM3 | 57 | | Device Vendor | NVIDIA Corporation | 58 | | Device Driver | 565.57.01 (Linux) | 59 | | OpenCL Version | OpenCL C 3.0 | 60 | | Compute Units | 132 at 1980 MHz (16896 cores, 66.908 TFLOPs/s) | 61 | | Memory, Cache | 81105 MB VRAM, 4224 KB global / 48 KB local | 62 | | Buffer Limits | 20276 MB global, 64 KB constant | 63 | |----------------'------------------------------------------------------------| 64 | | Info: OpenCL C code successfully compiled. 
| 65 | | FP64 compute 31.184 TFLOPs/s (1/2 ) | 66 | | FP32 compute 62.908 TFLOPs/s ( 1x ) | 67 | | FP16 compute 123.749 TFLOPs/s ( 2x ) | 68 | | INT64 compute 3.227 TIOPs/s (1/24) | 69 | | INT32 compute 32.946 TIOPs/s (1/2 ) | 70 | | INT16 compute 30.901 TIOPs/s (1/2 ) | 71 | | INT8 compute 103.204 TIOPs/s ( 2x ) | 72 | | Memory Bandwidth ( coalesced read ) 3025.53 GB/s | 73 | | Memory Bandwidth ( coalesced write) 3055.98 GB/s | 74 | | Memory Bandwidth (misaligned read ) 2102.44 GB/s | 75 | | Memory Bandwidth (misaligned write) 314.25 GB/s | 76 | | PCIe Bandwidth (send ) 10.53 GB/s | 77 | | PCIe Bandwidth ( receive ) 11.47 GB/s | 78 | | PCIe Bandwidth ( bidirectional) (Gen4 x16) 10.91 GB/s | 79 | |-----------------------------------------------------------------------------| 80 | ``` 81 | ``` 82 | |----------------.------------------------------------------------------------| 83 | | Device ID | 0 | 84 | | Device Name | AMD Instinct MI300X | 85 | | Device Vendor | Advanced Micro Devices, Inc. | 86 | | Device Driver | 3635.0 (HSA1.1,LC) (Linux) | 87 | | OpenCL Version | OpenCL C 2.0 | 88 | | Compute Units | 304 at 2100 MHz (19456 cores, 81.715 TFLOPs/s) | 89 | | Memory, Cache | 196592 MB VRAM, 32 KB global / 64 KB local | 90 | | Buffer Limits | 196592 MB global, 201310208 KB constant | 91 | |----------------'------------------------------------------------------------| 92 | | Info: OpenCL C code successfully compiled. 
| 93 | | FP64 compute 54.944 TFLOPs/s (2/3 ) | 94 | | FP32 compute 130.000 TFLOPs/s ( 2x ) | 95 | | FP16 compute 141.320 TFLOPs/s ( 2x ) | 96 | | INT64 compute 3.666 TIOPs/s (1/24) | 97 | | INT32 compute 47.736 TIOPs/s (2/3 ) | 98 | | INT16 compute 69.022 TIOPs/s ( 1x ) | 99 | | INT8 compute 106.178 TIOPs/s ( 1x ) | 100 | | Memory Bandwidth ( coalesced read ) 3756.64 GB/s | 101 | | Memory Bandwidth ( coalesced write) 4686.31 GB/s | 102 | | Memory Bandwidth (misaligned read ) 3881.24 GB/s | 103 | | Memory Bandwidth (misaligned write) 2491.25 GB/s | 104 | | PCIe Bandwidth (send ) 54.57 GB/s | 105 | | PCIe Bandwidth ( receive ) 55.79 GB/s | 106 | | PCIe Bandwidth ( bidirectional) (Gen4 x16) 55.21 GB/s | 107 | |-----------------------------------------------------------------------------| 108 | ``` 109 | ``` 110 | |----------------.------------------------------------------------------------| 111 | | Device ID | 0 | 112 | | Device Name | Intel(R) Arc(TM) B580 Graphics | 113 | | Device Vendor | Intel(R) Corporation | 114 | | Device Driver | 32.0.101.6559 (Windows) | 115 | | OpenCL Version | OpenCL C 3.0 | 116 | | Compute Units | 160 at 2850 MHz (2560 cores, 14.592 TFLOPs/s) | 117 | | Memory, Cache | 12187 MB VRAM, 18432 KB global / 128 KB local | 118 | | Buffer Limits | 11944 MB global, 12230900 KB constant | 119 | |----------------'------------------------------------------------------------| 120 | | Info: OpenCL C code successfully compiled. 
| 121 | | FP64 compute 0.896 TFLOPs/s (1/16) | 122 | | FP32 compute 14.249 TFLOPs/s ( 1x ) | 123 | | FP16 compute 26.547 TFLOPs/s ( 2x ) | 124 | | INT64 compute 0.636 TIOPs/s (1/24) | 125 | | INT32 compute 4.556 TIOPs/s (1/3 ) | 126 | | INT16 compute 37.082 TIOPs/s ( 2x ) | 127 | | INT8 compute 48.668 TIOPs/s ( 4x ) | 128 | | Memory Bandwidth ( coalesced read ) 574.09 GB/s | 129 | | Memory Bandwidth ( coalesced write) 468.07 GB/s | 130 | | Memory Bandwidth (misaligned read ) 796.23 GB/s | 131 | | Memory Bandwidth (misaligned write) 383.15 GB/s | 132 | | PCIe Bandwidth (send ) 4.99 GB/s | 133 | | PCIe Bandwidth ( receive ) 4.87 GB/s | 134 | | PCIe Bandwidth ( bidirectional) (Gen3 x16) 5.11 GB/s | 135 | |-----------------------------------------------------------------------------| 136 | ``` 137 | ``` 138 | |----------------.------------------------------------------------------------| 139 | | Device ID | 0 | 140 | | Device Name | AMD EPYC 9554 64-Core Processor | 141 | | Device Vendor | Intel(R) Corporation | 142 | | Device Driver | 2024.18.10.0.08_160000 (Linux) | 143 | | OpenCL Version | OpenCL C 3.0 | 144 | | Compute Units | 128 at 0 MHz (64 cores, 0.000 TFLOPs/s) | 145 | | Memory, Cache | 386363 MB RAM, 1024 KB global / 256 KB local | 146 | | Buffer Limits | 386363 MB global, 128 KB constant | 147 | |----------------'------------------------------------------------------------| 148 | | Info: OpenCL C code successfully compiled. 
| 149 | | FP64 compute 3.739 TFLOPs/s (1/64) | 150 | | FP32 compute 3.842 TFLOPs/s (1/64) | 151 | | FP16 compute 0.863 TFLOPs/s (1/64) | 152 | | INT64 compute 1.506 TIOPs/s (1/64) | 153 | | INT32 compute 4.240 TIOPs/s (1/64) | 154 | | INT16 compute 8.592 TIOPs/s (1/64) | 155 | | INT8 compute 2.774 TIOPs/s (1/64) | 156 | | Memory Bandwidth ( coalesced read ) 391.09 GB/s | 157 | | Memory Bandwidth ( coalesced write) 167.26 GB/s | 158 | | Memory Bandwidth (misaligned read ) 248.65 GB/s | 159 | | Memory Bandwidth (misaligned write) 156.18 GB/s | 160 | |-----------------------------------------------------------------------------| 161 | ``` 162 | ``` 163 | |----------------.------------------------------------------------------------| 164 | | Device ID | 1 | 165 | | Device Name | Intel(R) UHD Graphics 630 | 166 | | Device Vendor | Intel(R) Corporation | 167 | | Device Driver | 31.0.101.2130 (Windows) | 168 | | OpenCL Version | OpenCL C 3.0 | 169 | | Compute Units | 24 at 1200 MHz (192 cores, 0.461 TFLOPs/s) | 170 | | Memory, Cache | 6500 MB RAM, 768 KB global / 64 KB local | 171 | | Buffer Limits | 3250 MB global, 3328048 KB constant | 172 | |----------------'------------------------------------------------------------| 173 | | Info: OpenCL C code successfully compiled. 
| 174 | | FP64 compute 0.112 TFLOPs/s (1/4 ) | 175 | | FP32 compute 0.437 TFLOPs/s ( 1x ) | 176 | | FP16 compute 0.801 TFLOPs/s ( 2x ) | 177 | | INT64 compute 0.016 TIOPs/s (1/32) | 178 | | INT32 compute 0.149 TIOPs/s (1/3 ) | 179 | | INT16 compute 0.863 TIOPs/s ( 2x ) | 180 | | INT8 compute 0.213 TIOPs/s (1/2 ) | 181 | | Memory Bandwidth ( coalesced read ) 20.98 GB/s | 182 | | Memory Bandwidth ( coalesced write) 25.18 GB/s | 183 | | Memory Bandwidth (misaligned read ) 35.16 GB/s | 184 | | Memory Bandwidth (misaligned write) 16.18 GB/s | 185 | |-----------------------------------------------------------------------------| 186 | ``` 187 | ``` 188 | |----------------.------------------------------------------------------------| 189 | | Device ID | 2 | 190 | | Device Name | Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz | 191 | | Device Vendor | Intel(R) Corporation | 192 | | Device Driver | 2024.17.3.0.08_160000 (Windows) | 193 | | OpenCL Version | OpenCL C 3.0 | 194 | | Compute Units | 12 at 3700 MHz (6 cores, 0.710 TFLOPs/s) | 195 | | Memory, Cache | 16250 MB RAM, 256 KB global / 32 KB local | 196 | | Buffer Limits | 16250 MB global, 128 KB constant | 197 | |----------------'------------------------------------------------------------| 198 | | Info: OpenCL C code successfully compiled. 
| 199 | | FP64 compute 0.151 TFLOPs/s (1/4 ) | 200 | | FP32 compute 0.158 TFLOPs/s (1/4 ) | 201 | | FP16 compute not supported | 202 | | INT64 compute 0.042 TIOPs/s (1/16) | 203 | | INT32 compute 0.063 TIOPs/s (1/12) | 204 | | INT16 compute 0.224 TIOPs/s (1/3 ) | 205 | | INT8 compute 0.059 TIOPs/s (1/12) | 206 | | Memory Bandwidth ( coalesced read ) 16.92 GB/s | 207 | | Memory Bandwidth ( coalesced write) 8.08 GB/s | 208 | | Memory Bandwidth (misaligned read ) 40.02 GB/s | 209 | | Memory Bandwidth (misaligned write) 13.69 GB/s | 210 | |-----------------------------------------------------------------------------| 211 | ``` -------------------------------------------------------------------------------- /src/utilities.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define UTILITIES_REGEX 4 | //#define UTILITIES_FILE 5 | #define CONSOLE_WIDTH 79 6 | #define UTILITIES_NO_CPP17 7 | 8 | #pragma warning(disable:26451) 9 | #pragma warning(disable:6386) 10 | #include 11 | #include 12 | #include 13 | #ifdef UTILITIES_REGEX 14 | #include // contains , , and others 15 | #endif // UTILITIES_REGEX 16 | #include 17 | #include 18 | #include 19 | #undef min 20 | #undef max 21 | using std::string; 22 | using std::vector; 23 | using std::thread; 24 | typedef unsigned char uchar; 25 | typedef unsigned short ushort; 26 | typedef unsigned int uint; 27 | typedef int64_t slong; 28 | typedef uint64_t ulong; 29 | #define pif 3.1415927f 30 | #define pi 3.141592653589793 31 | #define min_char ((char)-128) 32 | #define max_char ((char)127) 33 | #define max_uchar ((uchar)255) 34 | #define min_short ((short)-32768) 35 | #define max_short ((short)32767) 36 | #define max_ushort ((ushort)65535) 37 | #define min_int ((int)-2147483648) 38 | #define max_int 2147483647 39 | #define max_uint 4294967295u 40 | #define min_slong ((slong)-9223372036854775808ll) 41 | #define max_slong 9223372036854775807ll 42 | #define max_ulong 
18446744073709551615ull 43 | #define min_float 1.401298464E-45f 44 | #define max_float 3.402823466E38f 45 | #define epsilon_float 1.192092896E-7f 46 | #define inf_float as_float(0x7F800000) 47 | #define nan_float as_float(0xFFFFFFFF) 48 | #define min_double 4.9406564584124654E-324 49 | #define max_double 1.7976931348623158E308 50 | #define epsilon_double 2.2204460492503131E-16 51 | #define inf_double as_double(0x7FF0000000000000) 52 | #define nan_double as_double(0xFFFFFFFFFFFFFFFF) 53 | 54 | class Clock { 55 | private: 56 | typedef std::chrono::high_resolution_clock clock; 57 | std::chrono::time_point t; 58 | public: 59 | inline Clock() { start(); } 60 | inline void start() { t = clock::now(); } 61 | inline double stop() const { return std::chrono::duration_cast>(clock::now()-t).count(); } 62 | }; 63 | inline void sleep(const double t) { 64 | if(t>0.0) std::this_thread::sleep_for(std::chrono::milliseconds((int)(1E3*t+0.5))); 65 | } 66 | 67 | inline float as_float(const uint x) { 68 | return *(float*)&x; 69 | } 70 | inline uint as_uint(const float x) { 71 | return *(uint*)&x; 72 | } 73 | inline double as_double(const ulong x) { 74 | return *(double*)&x; 75 | } 76 | inline ulong as_ulong(const double x) { 77 | return *(ulong*)&x; 78 | } 79 | 80 | inline float half_to_float(const ushort x) { // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits 81 | const uint e = (x&0x7C00)>>10; // exponent 82 | const uint m = (x&0x03FF)<<13; // mantissa 83 | const uint v = as_uint((float)m)>>23; // evil log2 bit hack to count leading zeros in denormalized format 84 | return as_float((x&0x8000)<<16 | (e!=0)*((e+112)<<23|m) | ((e==0)&(m!=0))*((v-37)<<23|((m<<(150-v))&0x007FE000))); // sign : normalized : denormalized 85 | } 86 | inline ushort float_to_half(const float x) { // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits 
87 | const uint b = as_uint(x)+0x00001000; // round-to-nearest-even: add last bit after truncated mantissa 88 | const uint e = (b&0x7F800000)>>23; // exponent 89 | const uint m = b&0x007FFFFF; // mantissa; in line below: 0x007FF000 = 0x00800000-0x00001000 = decimal indicator flag - initial rounding 90 | return (b&0x80000000)>>16 | (e>112)*((((e-112)<<10)&0x7C00)|m>>13) | ((e<113)&(e>101))*((((0x007FF000+m)>>(125-e))+1)>>1) | (e>143)*0x7FFF; // sign : normalized : denormalized : saturate 91 | } 92 | 93 | inline float sq(const float x) { 94 | return x*x; 95 | } 96 | inline float cb(const float x) { 97 | return x*x*x; 98 | } 99 | inline float pow(const float x, const uint n) { 100 | float r = 1.0f; 101 | for(uint i=0u; i=0.0f ? 1.0f : -1.0f; 108 | } 109 | inline float clamp(const float x, const float a, const float b) { 110 | return fmin(fmax(x, a), b); 111 | } 112 | inline float rsqrt(const float x) { 113 | return 1.0f/sqrt(x); 114 | } 115 | inline float ln(const float x) { 116 | return log(x); // natural logarithm 117 | } 118 | inline float random(const float x=1.0f) { 119 | return x*((float)rand()/(float)RAND_MAX); 120 | } 121 | inline float random_symmetric(const float x=1.0f) { 122 | return 2.0f*x*((float)rand()/(float)RAND_MAX-0.5f); 123 | } 124 | 125 | inline double sq(const double x) { 126 | return x*x; 127 | } 128 | inline double cb(const double x) { 129 | return x*x*x; 130 | } 131 | inline double pow(const double x, const uint n) { 132 | double r = 1.0; 133 | for(uint i=0u; i=0.0 ? 
1.0 : -1.0; 140 | } 141 | inline double clamp(const double x, const double a, const double b) { 142 | return fmin(fmax(x, a), b); 143 | } 144 | inline double rsqrt(const double x) { 145 | return 1.0/sqrt(x); 146 | } 147 | inline double ln(const double x) { 148 | return log(x); // natural logarithm 149 | } 150 | 151 | inline int sq(const int x) { 152 | return x*x; 153 | } 154 | inline int cb(const int x) { 155 | return x*x*x; 156 | } 157 | inline int pow(const int x, const uint n) { 158 | int r = 1; 159 | for(uint i=0u; i>31&1); 166 | } 167 | inline int min(const int x, const int y) { 168 | return xy?x:y; 172 | } 173 | inline int clamp(const int x, const int a, const int b) { 174 | return min(max(x, a), b); 175 | } 176 | 177 | inline uint sq(const uint x) { 178 | return x*x; 179 | } 180 | inline uint cb(const uint x) { 181 | return x*x*x; 182 | } 183 | inline uint pow(const uint x, const uint n) { 184 | uint r = 1u; 185 | for(uint i=0u; iy?x:y; 195 | } 196 | inline uint clamp(const uint x, const uint a, const uint b) { 197 | return min(max(x, a), b); 198 | } 199 | inline uint gcd(uint x, uint y) { // greatest common divisor 200 | if(x*y==0u) return 0u; 201 | uint t; 202 | while(y!=0u) { 203 | t = y; 204 | y = x%y; 205 | x = t; 206 | } 207 | return x; 208 | } 209 | inline uint lcm(const uint x, const uint y) { // least common multiple 210 | return x*y==0u ? 
0u : x*y/gcd(x, y); 211 | } 212 | 213 | inline slong sq(const slong x) { 214 | return x*x; 215 | } 216 | inline slong cb(const slong x) { 217 | return x*x*x; 218 | } 219 | inline slong pow(const slong x, const uint n) { 220 | slong r = 1ll; 221 | for(uint i=0u; i>63&1ll); 228 | } 229 | inline slong min(const slong x, const slong y) { 230 | return xy?x:y; 234 | } 235 | inline slong clamp(const slong x, const slong a, const slong b) { 236 | return min(max(x, a), b); 237 | } 238 | 239 | inline ulong sq(const ulong x) { 240 | return x*x; 241 | } 242 | inline ulong cb(const ulong x) { 243 | return x*x*x; 244 | } 245 | inline ulong pow(const ulong x, const uint n) { 246 | ulong r = 1ull; 247 | for(uint i=0u; iy?x:y; 257 | } 258 | inline ulong clamp(const ulong x, const ulong a, const ulong b) { 259 | return min(max(x, a), b); 260 | } 261 | inline ulong gcd(ulong x, ulong y) { // greatest common divisor 262 | if(x*y==0ull) return 0ull; 263 | ulong t; 264 | while(y!=0ull) { 265 | t = y; 266 | y = x%y; 267 | x = t; 268 | } 269 | return x; 270 | } 271 | inline ulong lcm(const ulong x, const ulong y) { // least common multiple 272 | return x*y==0ull ? 
0ull : x*y/gcd(x, y); 273 | } 274 | 275 | inline int to_int(const float x) { 276 | return (int)(x+0.5f-(float)(x<0.0f)); 277 | } 278 | inline int to_int(const double x) { 279 | return (int)(x+0.5-(double)(x<0.0)); 280 | } 281 | inline uint to_uint(const float x) { 282 | return (uint)fmax(x+0.5f, 0.5f); 283 | } 284 | inline uint to_uint(const double x) { 285 | return (uint)fmax(x+0.5, 0.5); 286 | } 287 | inline slong to_slong(const float x) { 288 | return (slong)(x+0.5f); 289 | } 290 | inline slong to_slong(const double x) { 291 | return (slong)(x+0.5); 292 | } 293 | inline ulong to_ulong(const float x) { 294 | return (ulong)fmax(x+0.5f, 0.5f); 295 | } 296 | inline ulong to_ulong(const double x) { 297 | return (ulong)fmax(x+0.5, 0.5); 298 | } 299 | 300 | inline void split_float(float x, uint& integral, uint& decimal, int& exponent) { 301 | if(x>=10.0f) { // convert to base 10 302 | if(x>=1E32f) { x *= 1E-32f; exponent += 32; } 303 | if(x>=1E16f) { x *= 1E-16f; exponent += 16; } 304 | if(x>= 1E8f) { x *= 1E-8f; exponent += 8; } 305 | if(x>= 1E4f) { x *= 1E-4f; exponent += 4; } 306 | if(x>= 1E2f) { x *= 1E-2f; exponent += 2; } 307 | if(x>= 1E1f) { x *= 1E-1f; exponent += 1; } 308 | } 309 | if(x>0.0f && x<=1.0f) { 310 | if(x<1E-31f) { x *= 1E32f; exponent -= 32; } 311 | if(x<1E-15f) { x *= 1E16f; exponent -= 16; } 312 | if(x< 1E-7f) { x *= 1E8f; exponent -= 8; } 313 | if(x< 1E-3f) { x *= 1E4f; exponent -= 4; } 314 | if(x< 1E-1f) { x *= 1E2f; exponent -= 2; } 315 | if(x< 1E0f) { x *= 1E1f; exponent -= 1; } 316 | } 317 | integral = (uint)x; 318 | float remainder = (x-integral)*1E8f; // 8 decimal digits 319 | decimal = (uint)remainder; 320 | if(remainder-(float)decimal>=0.5f) { // correct rounding of last decimal digit 321 | decimal++; 322 | if(decimal>=100000000u) { // decimal overflow 323 | decimal = 0u; 324 | integral++; 325 | if(integral>=10u) { // decimal overflow causes integral overflow 326 | integral = 1u; 327 | exponent++; 328 | } 329 | } 330 | } 331 | } 332 | 
inline void split_double(double x, uint& integral, ulong& decimal, int& exponent) { 333 | if(x>=10.0) { // convert to base 10 334 | if(x>=1E256) { x *= 1E-256; exponent += 256; } 335 | if(x>=1E128) { x *= 1E-128; exponent += 128; } 336 | if(x>= 1E64) { x *= 1E-64; exponent += 64; } 337 | if(x>= 1E32) { x *= 1E-32; exponent += 32; } 338 | if(x>= 1E16) { x *= 1E-16; exponent += 16; } 339 | if(x>= 1E8) { x *= 1E-8; exponent += 8; } 340 | if(x>= 1E4) { x *= 1E-4; exponent += 4; } 341 | if(x>= 1E2) { x *= 1E-2; exponent += 2; } 342 | if(x>= 1E1) { x *= 1E-1; exponent += 1; } 343 | } 344 | if(x>0.0 && x<=1.0) { 345 | if(x<1E-255) { x *= 1E256; exponent -= 256; } 346 | if(x<1E-127) { x *= 1E128; exponent -= 128; } 347 | if(x< 1E-63) { x *= 1E64; exponent -= 64; } 348 | if(x< 1E-31) { x *= 1E32; exponent -= 32; } 349 | if(x< 1E-15) { x *= 1E16; exponent -= 16; } 350 | if(x< 1E-7) { x *= 1E8; exponent -= 8; } 351 | if(x< 1E-3) { x *= 1E4; exponent -= 4; } 352 | if(x< 1E-1) { x *= 1E2; exponent -= 2; } 353 | if(x< 1E0) { x *= 1E1; exponent -= 1; } 354 | } 355 | integral = (uint)x; 356 | double remainder = (x-integral)*1E16; // 16 decimal digits 357 | decimal = (ulong)remainder; 358 | if(remainder-(double)decimal>=0.5) { // correct rounding of last decimal digit 359 | decimal++; 360 | if(decimal>=10000000000000000ull) { // decimal overflow 361 | decimal = 0ull; 362 | integral++; 363 | if(integral>=10u) { // decimal overflow causes integral overflow 364 | integral = 1u; 365 | exponent++; 366 | } 367 | } 368 | } 369 | } 370 | inline string decimal_to_string_float(uint x, int digits) { 371 | string r = ""; 372 | while((digits--)>0) { 373 | r = (char)(x%10u+48u)+r; 374 | x /= 10u; 375 | } 376 | return r; 377 | } 378 | inline string decimal_to_string_double(ulong x, int digits) { 379 | string r = ""; 380 | while((digits--)>0) { 381 | r = (char)(x%10ull+48ull)+r; 382 | x /= 10ull; 383 | } 384 | return r; 385 | } 386 | 387 | inline vector get_main_arguments(int argc, char* argv[]) { 
388 | return argc>1 ? vector(argv+1, argv+argc) : vector(); 389 | } 390 | 391 | inline string to_string(const string& s){ 392 | return s; 393 | } 394 | inline string to_string(char c) { 395 | return string(1, c); 396 | } 397 | inline string to_string(uchar c) { 398 | return string(1, c); 399 | } 400 | inline string to_string(ulong x) { 401 | string r = ""; 402 | do { 403 | r = (char)(x%10ull+48ull)+r; 404 | x /= 10ull; 405 | } while(x); 406 | return r; 407 | } 408 | inline string to_string(slong x) { 409 | return x>=0ll ? to_string((ulong)x) : "-"+to_string((ulong)(-x)); 410 | } 411 | inline string to_string(uint x) { 412 | string r = ""; 413 | do { 414 | r = (char)(x%10u+48u)+r; 415 | x /= 10u; 416 | } while(x); 417 | return r; 418 | } 419 | inline string to_string(int x) { 420 | return x>=0 ? to_string((uint)x) : "-"+to_string((uint)(-x)); 421 | } 422 | inline string to_string_hex(ulong x) { 423 | string r = ""; 424 | for(uint i=0u; i<16u; i++) { 425 | const uint hex_char = (uint)(x&0xFull); 426 | r = (char)(hex_char+(hex_char<10u ? 48u : 55u))+r; 427 | x >>= 4u; 428 | } 429 | return "0x"+r; 430 | } 431 | inline string to_string_hex(slong x) { 432 | return to_string_hex(*(ulong*)&x); 433 | } 434 | inline string to_string_hex(uint x) { 435 | string r = ""; 436 | for(uint i=0u; i<8u; i++) { 437 | const uint hex_char = x&0xFu; 438 | r = (char)(hex_char+(hex_char<10u ? 
48u : 55u))+r; 439 | x >>= 4u; 440 | } 441 | return "0x"+r; 442 | } 443 | inline string to_string_hex(int x) { 444 | return to_string_hex(*(uint*)&x); 445 | } 446 | inline string to_string(float x) { // convert float to string with full precision ( to_string() prints only 6 decimals) 447 | string s = ""; 448 | if(x<0.0f) { s += "-"; x = -x; } 449 | if(std::isnan(x)) return s+"NaN"; 450 | if(std::isinf(x)) return s+"Inf"; 451 | uint integral, decimal; 452 | int exponent = 0; 453 | split_float(x, integral, decimal, exponent); 454 | return s+to_string(integral)+"."+decimal_to_string_float(decimal, 8)+(exponent!=0?"E"+to_string(exponent):""); 455 | } 456 | inline string to_string(double x) { // convert double to string with full precision ( to_string() prints only 6 decimals) 457 | string s = ""; 458 | if(x<0.0) { s += "-"; x = -x; } 459 | if(std::isnan(x)) return s+"NaN"; 460 | if(std::isinf(x)) return s+"Inf"; 461 | uint integral; 462 | ulong decimal; 463 | int exponent = 0; 464 | split_double(x, integral, decimal, exponent); 465 | return s+to_string(integral)+"."+decimal_to_string_double(decimal, 16)+(exponent!=0?"E"+to_string(exponent):""); 466 | } 467 | inline string to_string(float x, const uint decimals) { // convert float to string with specified number of decimals 468 | string s = ""; 469 | if(x<0.0f) { s += "-"; x = -x; } 470 | if(std::isnan(x)) return s+"NaN"; 471 | if(std::isinf(x)||x>(float)max_ulong) return s+"Inf"; 472 | const float power = pow(10.0f, min(decimals, 8u)); 473 | x += 0.5f/power; // rounding 474 | const ulong integral = (ulong)x; 475 | const uint decimal = (uint)((x-(float)integral)*power); 476 | return s+to_string(integral)+(decimals==0u ? 
"" : "."+decimal_to_string_float(decimal, min((int)decimals, 8))); 477 | } 478 | inline string to_string(double x, const uint decimals) { // convert float to string with specified number of decimals 479 | string s = ""; 480 | if(x<0.0) { s += "-"; x = -x; } 481 | if(std::isnan(x)) return s+"NaN"; 482 | if(std::isinf(x)||x>(double)max_ulong) return s+"Inf"; 483 | const double power = pow(10.0, min(decimals, 16u)); 484 | x += 0.5/power; // rounding 485 | const ulong integral = (ulong)x; 486 | const ulong decimal = (ulong)((x-(double)integral)*power); 487 | return s+to_string(integral)+(decimals==0u ? "" : "."+decimal_to_string_double(decimal, min((int)decimals, 16))); 488 | } 489 | 490 | inline uint length(const string& s) { 491 | return (uint)s.length(); 492 | } 493 | inline bool contains(const string& s, const string& match) { 494 | return s.find(match)!=string::npos; 495 | } 496 | inline bool contains_any(const string& s, const vector& matches) { 497 | for(uint i=0u; i<(uint)matches.size(); i++) if(contains(s, matches[i])) return true; 498 | return false; 499 | } 500 | inline string to_lower(const string& s) { 501 | string r = ""; 502 | for(uint i=0u; i<(uint)s.length(); i++) { 503 | const uchar c = s.at(i); 504 | r += c>64u&&c<91u ? c+32u : c; 505 | } 506 | return r; 507 | } 508 | inline string to_upper(const string& s) { 509 | string r = ""; 510 | for(uint i=0u; i<(uint)s.length(); i++) { 511 | const uchar c = s.at(i); 512 | r += c>96u&&c<123u ? 
c-32u : c; 513 | } 514 | return r; 515 | } 516 | inline bool equals(const string& a, const string& b) { 517 | return to_lower(a)==to_lower(b); 518 | } 519 | inline string replace(const string& s, const string& from, const string& to) { 520 | string r = s; 521 | int p = 0; 522 | while((p=(int)r.find(from, p))!=string::npos) { 523 | r.replace(p, from.length(), to); 524 | p += (int)to.length(); 525 | } 526 | return r; 527 | } 528 | inline string substring(const string& s, const uint start, uint length=max_uint) { 529 | return s.substr(start, min(length, (uint)s.length()-start)); 530 | } 531 | inline string trim(const string& s) { // removes whitespace characters from beginnig and end of string s 532 | const int l = (int)s.length(); 533 | int a=0, b=l-1; 534 | char c; 535 | while(aa && ((c=s[b])==' '||c=='\t'||c=='\n'||c=='\v'||c=='\f'||c=='\r'||c=='\0')) b--; 537 | return s.substr(a, 1+b-a); 538 | } 539 | inline bool begins_with(const string& s, const string& match) { 540 | if(match.size()>s.size()) return false; 541 | else return equal(match.begin(), match.end(), s.begin()); 542 | } 543 | inline bool ends_with(const string& s, const string& match) { 544 | if(match.size()>s.size()) return false; 545 | else return equal(match.rbegin(), match.rend(), s.rbegin()); 546 | } 547 | template inline bool contains(const vector& v, const T& match) { 548 | return find(v.begin(), v.end(), match)!=v.end(); 549 | } 550 | 551 | inline string alignl(const uint n, const string& x="") { // converts x to string with spaces behind such that length is n if x is not longer than n 552 | string s = x; 553 | for(uint i=0u; i inline string alignl(const uint n, const T x) { // converts x to string with spaces behind such that length is n if x does not have more digits than n 563 | return alignl(n, to_string(x)); 564 | } 565 | template inline string alignr(const uint n, const T x) { // converts x to string with spaces in front such that length is n if x does not have more digits than n 566 | 
return alignr(n, to_string(x)); 567 | } 568 | 569 | inline void print(const string& s="") { 570 | std::cout << s; 571 | } 572 | inline void println(const string& s="") { 573 | std::cout << s+"\n"; 574 | } 575 | inline void reprint(const string& s="") { 576 | std::cout << "\r"+s; 577 | } 578 | inline void wait() { 579 | std::cin.get(); 580 | } 581 | template inline void println(const T& x) { 582 | println(to_string(x)); 583 | } 584 | 585 | #ifdef UTILITIES_REGEX 586 | inline vector split_regex(const string& s, const string& separator="\\s+") { 587 | vector r; 588 | const std::regex rgx(separator); 589 | std::sregex_token_iterator token(s.begin(), s.end(), rgx, -1), end; 590 | while(token!=end) { 591 | r.push_back(*token); 592 | token++; 593 | } 594 | return r; 595 | } 596 | inline bool equals_regex(const string& s, const string& match) { // returns true if string exactly matches regex 597 | return regex_match(s.begin(), s.end(), std::regex(match)); 598 | } 599 | inline uint matches_regex(const string& s, const string& match) { // counts number of matches 600 | std::regex words_regex(match); 601 | auto words_begin = std::sregex_iterator(s.begin(), s.end(), words_regex); 602 | auto words_end = std::sregex_iterator(); 603 | return (uint)std::distance(words_begin, words_end); 604 | } 605 | inline bool contains_regex(const string& s, const string& match) { 606 | return matches_regex(s, match)>=1; 607 | } 608 | inline string replace_regex(const string& s, const string& from, const string& to) { 609 | return regex_replace(s, std::regex(from), to); 610 | } 611 | inline bool is_number(const string& s) { 612 | return equals_regex(s, "\\d+(u|l|ul|ll|ull)?")||equals_regex(s, "0x(\\d|[a-fA-F])+(u|l|ul|ll|ull)?")||equals_regex(s, "0b[01]+(u|l|ul|ll|ull)?")||equals_regex(s, "(((\\d+\\.?\\d*|\\.\\d+)([eE][+-]?\\d+[fF]?)?)|(\\d+\\.\\d*|\\.\\d+)[fF]?)"); 613 | } 614 | inline void print_message(const string& message, const string& keyword="", const int keyword_color=-1, const int 
colons=true) { // print formatted message 615 | const uint k=length(keyword)+2u, w=CONSOLE_WIDTH-4u-k; 616 | string p=colons?": ":" ", f=""; 617 | for(uint j=0u; j v = split_regex(message); 619 | uint l = 0u; // length of current line of words 620 | for(uint i=0u; i<(uint)v.size(); i++) { 621 | const string word = v.at(i); 622 | const uint wordlength = length(word); 623 | l += wordlength+1u; // word + space 624 | if(l<=w+1u) { // word fits -> append word and space 625 | p += word+" "; 626 | } else if(wordlength>w) { // word overflows -> split word into next line 627 | p += substring(word, 0, w-(l-wordlength-1u))+" |\n| "+f; 628 | v[i] = substring(v[i], w-(l-wordlength-1u)); i--; // reuse same vector element for overflowing part, decrement i to start next line with this overflowing part 629 | l = 0u; // reset line length 630 | } else { // word does not fit -> fill remaining line with spaces 631 | l = l-length(v.at(i--))-1u; // remove word from line, decrement i to start next line with this word 632 | for(uint j=l; j<=w; j++) p += " "; 633 | p += "|\n| "+f; 634 | l = 0u; // reset line length 635 | } 636 | } 637 | for(uint j=l; j<=w; j++) p += " "; 638 | println("\r| "+keyword+p+"|"); 639 | } 640 | inline void print_error(const string& s) { // print formatted error message 641 | print_message(s, "Error"); 642 | #ifdef _WIN32 643 | print_message("Press Enter to exit.", " ", -1, false); 644 | #endif // _WIN32 645 | string b = ""; 646 | for(int i=0; i // read/write files 751 | #ifndef UTILITIES_NO_CPP17 752 | #include // automatically create directory before writing file, requires C++17 753 | inline vector find_files(const string& path, const string& extension=".*") { 754 | vector files; 755 | if(std::filesystem::is_directory(path)&&std::filesystem::exists(path)) { 756 | for(const auto& entry : std::filesystem::directory_iterator(path)) { 757 | if(extension==".*"||entry.path().extension().string()==extension) files.push_back(entry.path().string()); 758 | } 759 | } 760 | 
return files; 761 | } 762 | #endif // UTILITIES_NO_CPP17 763 | inline void create_folder(const string& path) { // create folder if it not already exists 764 | const int slash_position = (int)path.rfind('/'); // find last slash dividing the path from the filename 765 | if(slash_position==(int)string::npos) return; // no slash found 766 | const string f = path.substr(0, slash_position); // cut off file name if there is any 767 | #ifndef UTILITIES_NO_CPP17 768 | if(!std::filesystem::is_directory(f)||!std::filesystem::exists(f)) std::filesystem::create_directories(f); // create folder if it not already exists 769 | #endif // UTILITIES_NO_CPP17 770 | } 771 | inline string create_file_extension(const string& filename, const string& extension) { 772 | return filename.substr(0, filename.rfind('.'))+(extension.at(0)!='.'?".":"")+extension; // remove existing file extension if existing and replace it with new one 773 | } 774 | inline string read_file(const string& filename) { 775 | std::ifstream file(filename, std::ios::in); 776 | if(file.fail()) print_error("File \""+filename+"\" does not exist!"); 777 | const string r((std::istreambuf_iterator(file)), std::istreambuf_iterator()); 778 | file.close(); 779 | return r; 780 | } 781 | inline void write_file(const string& filename, const string& content="") { 782 | create_folder(filename); 783 | std::ofstream file(filename, std::ios::out); 784 | file.write(content.c_str(), content.length()); 785 | file.close(); 786 | } 787 | #endif // UTILITIES_FILE -------------------------------------------------------------------------------- /src/opencl.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define WORKGROUP_SIZE 64 // needs to be 64 to fully use AMD GPUs 4 | //#define PTX 5 | //#define LOG 6 | 7 | // https://github.com/KhronosGroup/OpenCL-Headers 8 | // https://github.com/KhronosGroup/OpenCL-CLHPP 9 | #define CL_HPP_MINIMUM_OPENCL_VERSION 100 10 | #if 
!defined(__APPLE__) // Windows/Linux/Android 11 | #define CL_HPP_TARGET_OPENCL_VERSION 300 // Windows/Linux/Android can use OpenCL 3.0 12 | #else // macOS 13 | #define CL_HPP_TARGET_OPENCL_VERSION 120 // macOS only supports OpenCL 1.2 14 | #endif // macOS 15 | #include 16 | #include "utilities.hpp" 17 | using cl::Event; 18 | 19 | static const string driver_installation_instructions = 20 | #ifdef _WIN32 21 | R"(|----------------.------------------------------------------------------------' 22 | | AMD GPUs | https://www.amd.com/en/support/download/drivers.html 23 | | Intel GPUs | https://www.intel.com/content/www/us/en/download/785597/intel-arc-iris-xe-graphics-windows.html 24 | | Nvidia GPUs | https://www.nvidia.com/Download/index.aspx 25 | | AMD/Intel CPUs | https://www.intel.com/content/www/us/en/developer/articles/technical/intel-cpu-runtime-for-opencl-applications-with-sycl-support.html 26 | |----------------'------------------------------------------------------------. 27 | | Don't forget to reboot after installation! Press Enter to exit. | 28 | '-----------------------------------------------------------------------------')""\n"; 29 | #else // Linux 30 | string("'-----------------------------------------------------------------------------'\n")+R"( 31 | )"+string("\033[31m")+R"(.-----------------------------------------------------------------------------. 
32 | | AMD GPU Drivers, which contain the OpenCL Runtime | 33 | '-----------------------------------------------------------------------------' 34 | sudo apt update && sudo apt upgrade -y 35 | sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev 36 | mkdir -p ~/amdgpu 37 | wget -P ~/amdgpu https://repo.radeon.com/amdgpu-install/6.4.2.1/ubuntu/noble/amdgpu-install_6.4.60402-1_all.deb 38 | sudo apt install -y ~/amdgpu/amdgpu-install*.deb 39 | sudo amdgpu-install -y --usecase=graphics,rocm,opencl --opencl=rocr 40 | sudo usermod -a -G render,video $(whoami) 41 | rm -r ~/amdgpu 42 | sudo shutdown -r now 43 | 44 | )"+string("\033[36m")+R"(.-----------------------------------------------------------------------------. 45 | | Intel GPU Drivers are already installed, only the OpenCL Runtime is needed | 46 | '-----------------------------------------------------------------------------' 47 | sudo apt update && sudo apt upgrade -y 48 | sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev intel-opencl-icd 49 | sudo usermod -a -G render $(whoami) 50 | sudo shutdown -r now 51 | 52 | )"+string("\033[32m")+R"(.-----------------------------------------------------------------------------. 53 | | Nvidia GPU Drivers, which contain the OpenCL Runtime | 54 | '-----------------------------------------------------------------------------' 55 | sudo apt update && sudo apt upgrade -y 56 | sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev nvidia-driver-580 57 | sudo shutdown -r now 58 | 59 | )"+string("\033[96m")+R"(.-----------------------------------------------------------------------------. 
60 | | CPU Option 1: Intel CPU Runtime for OpenCL (works for both AMD/Intel CPUs) | 61 | '-----------------------------------------------------------------------------' 62 | export OCLV="oclcpuexp-2025.20.6.0.04_224945_rel" 63 | export TBBV="oneapi-tbb-2022.2.0" 64 | sudo apt update && sudo apt upgrade -y 65 | sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev 66 | sudo mkdir -p ~/cpurt /opt/intel/${OCLV} /etc/OpenCL/vendors /etc/ld.so.conf.d 67 | sudo wget -P ~/cpurt https://github.com/intel/llvm/releases/download/2025-WW27/${OCLV}.tar.gz 68 | sudo wget -P ~/cpurt https://github.com/uxlfoundation/oneTBB/releases/download/v2022.2.0/${TBBV}-lin.tgz 69 | sudo tar -zxvf ~/cpurt/${OCLV}.tar.gz -C /opt/intel/${OCLV} 70 | sudo tar -zxvf ~/cpurt/${TBBV}-lin.tgz -C /opt/intel 71 | echo /opt/intel/${OCLV}/x64/libintelocl.so | sudo tee /etc/OpenCL/vendors/intel_expcpu.icd 72 | echo /opt/intel/${OCLV}/x64 | sudo tee /etc/ld.so.conf.d/libintelopenclexp.conf 73 | sudo ln -sf /opt/intel/${TBBV}/lib/intel64/gcc4.8/libtbb.so /opt/intel/${OCLV}/x64 74 | sudo ln -sf /opt/intel/${TBBV}/lib/intel64/gcc4.8/libtbbmalloc.so /opt/intel/${OCLV}/x64 75 | sudo ln -sf /opt/intel/${TBBV}/lib/intel64/gcc4.8/libtbb.so.12 /opt/intel/${OCLV}/x64 76 | sudo ln -sf /opt/intel/${TBBV}/lib/intel64/gcc4.8/libtbbmalloc.so.2 /opt/intel/${OCLV}/x64 77 | sudo ldconfig -f /etc/ld.so.conf.d/libintelopenclexp.conf 78 | sudo rm -r ~/cpurt 79 | 80 | )"+string("\033[33m")+R"(.-----------------------------------------------------------------------------. 
81 | | CPU Option 2: PoCL | 82 | '-----------------------------------------------------------------------------' 83 | sudo apt update && sudo apt upgrade -y 84 | sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev pocl-opencl-icd 85 | 86 | )"+string("\033[0m"); 87 | #endif // Linux 88 | 89 | struct Device_Info { 90 | cl::Device cl_device; // OpenCL device 91 | cl::Context cl_context; // multiple devices in the same context can communicate buffers 92 | uint id = 0u; // unique device ID assigned by get_devices() 93 | string name="", vendor=""; // device name, vendor 94 | string driver_version="", opencl_c_version=""; // device driver version, device OpenCL C version ("1.0", "1.1", "1.2", "2.0", "2.1", "2.2", "3.0") 95 | uint memory = 0u; // global memory in MB 96 | uint memory_used = 0u; // track global memory usage in MB 97 | uint global_cache=0u, local_cache=0u; // global cache in KB, local cache in KB 98 | uint max_global_buffer=0u, max_constant_buffer=0u; // maximum global buffer size in MB, maximum constant buffer size in KB 99 | uint compute_units = 0u; // compute units (CUs) can contain multiple cores depending on the microarchitecture 100 | uint clock_frequency = 0u; // in MHz 101 | bool is_cpu=false, is_gpu=false, uses_ram=false; 102 | uint is_fp64_capable=0u, is_fp32_capable=0u, is_fp16_capable=0u, is_int64_capable=0u, is_int32_capable=0u, is_int16_capable=0u, is_int8_capable=0u, is_dp4a_capable=0u; 103 | uint cores = 0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading) 104 | float tflops = 0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s 105 | uint nvidia_compute_capability = 0u; // compute capability for Nvidia GPUs, for example nvidia_compute_capability=61 means compute capability 6.1 106 | bool patch_intel_gpu_above_4gb = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs 107 | bool patch_nvidia_fp16 = false; // Nvidia 
Pascal and newer GPUs with driver>=520.00 don't report cl_khr_fp16, but do support basic FP16 arithmetic 108 | bool patch_legacy_gpu_fma = false; // some old GPUs have terrible fma performance, so replace with a*b+c 109 | inline Device_Info(const cl::Device& cl_device, const cl::Context& cl_context, const uint id) { 110 | this->cl_device = cl_device; // see https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html 111 | this->cl_context = cl_context; 112 | this->id = id; 113 | name = trim(cl_device.getInfo()); // device name 114 | vendor = trim(cl_device.getInfo()); // device vendor 115 | driver_version = trim(cl_device.getInfo()); // device driver version 116 | opencl_c_version = cl_device.getInfo().substr(9, 3); 117 | memory = (uint)(cl_device.getInfo()/1048576ull); // global memory in MB 118 | global_cache = (uint)(cl_device.getInfo()/1024ull); // global cache in KB 119 | local_cache = (uint)(cl_device.getInfo()/1024ull); // local cache in KB 120 | max_global_buffer = (uint)(min(cl_device.getInfo()/1048576ull, (ulong)memory)); // maximum global buffer size in MB 121 | max_constant_buffer = (uint)(cl_device.getInfo()/1024ull); // maximum constant buffer size in KB 122 | compute_units = (uint)cl_device.getInfo(); // compute units (CUs) can contain multiple cores depending on the microarchitecture 123 | clock_frequency = (uint)cl_device.getInfo(); // in MHz 124 | is_fp64_capable = (uint)cl_device.getInfo()*(uint)contains(cl_device.getInfo(), "cl_khr_fp64"); 125 | is_fp32_capable = (uint)cl_device.getInfo(); 126 | is_fp16_capable = (uint)cl_device.getInfo()*(uint)contains(cl_device.getInfo(), "cl_khr_fp16"); 127 | is_int64_capable = (uint)cl_device.getInfo(); 128 | is_int32_capable = (uint)cl_device.getInfo(); 129 | is_int16_capable = (uint)cl_device.getInfo(); 130 | is_int8_capable = (uint)cl_device.getInfo(); 131 | is_cpu = cl_device.getInfo()==CL_DEVICE_TYPE_CPU; 132 | is_gpu = cl_device.getInfo()==CL_DEVICE_TYPE_GPU; 133 | uses_ram = 
is_cpu||(bool)cl_device.getInfo(); // CPUs or iGPUs 134 | const int vendor_id = (int)cl_device.getInfo(); // AMD=0x1002, Intel=0x8086, Nvidia=0x10DE, Apple=0x1027F00 135 | uint ipc = is_gpu ? 2u : 32u; // IPC (instructions per cycle) is 2 for most GPUs and 32 for most modern CPUs 136 | float cores_per_cu = 1.0f; 137 | #if !defined(__APPLE__) // macOS only supports OpenCL 1.2, OpenCL extensions are missing before OpenCL 3.0 138 | uint max_opencl_c_version = 0u; // device OpenCL C version; cl_device.getInfo().substr(9, 3) is unreliable as it will report 1.2 if 3.0 is available but not 2.X 139 | for(auto& v : cl_device.getInfo()) max_opencl_c_version = max(max_opencl_c_version, 10u*(uint)CL_VERSION_MAJOR(v.version)+CL_VERSION_MINOR(v.version)); 140 | if(max_opencl_c_version>=10u) opencl_c_version = to_string(max_opencl_c_version/10u)+"."+to_string(max_opencl_c_version%10u); 141 | is_dp4a_capable = (uint)contains(cl_device.getInfo(), "cl_khr_integer_dot_product"); 142 | int dp4a_error = 0; 143 | is_dp4a_capable = is_dp4a_capable&&(uint)(cl_device.getInfo(&dp4a_error)==3); 144 | is_dp4a_capable = is_dp4a_capable&&dp4a_error==0; 145 | const auto idpap = cl_device.getInfo(&dp4a_error); 146 | const cl_bool* idpap_bits = (cl_bool*)&idpap; // on some unsupported devices, values are random, so only claim is_dp4a_capable if all bits are set correctly 147 | is_dp4a_capable = is_dp4a_capable&&dp4a_error==0&&idpap_bits[0]==1&&idpap_bits[1]==1&&idpap_bits[2]==1&&idpap_bits[3]==1&&idpap_bits[4]==1&&idpap_bits[5]==1; 148 | if(vendor_id==0x1002) { // AMD GPU/CPU 149 | const bool is_full_profile = trim(cl_device.getInfo())=="FULL_PROFILE"; // rusticl reports "EMBEDDED_PROFILE" 150 | const bool amd_dual_cu = is_gpu&&is_full_profile&&contains_any(to_lower(name), {"gfx10", "gfx11", "gfx12"}); // identify RDNA/RDNA2/RDNA3/RDNA4 GPUs where dual CUs are reported 151 | const bool amd_ipc_4 = is_gpu&&contains_any(to_lower(name), {"gfx11", "gfx12", "gfx942", "gfx950"}); // identify RDNA3/RDNA4 
GPUs (can dual-issue float2) and CDNA3/CDNA4 GPUs (ipc=4 for scalar float) 152 | if(amd_dual_cu) compute_units *= 2u; // some AMD GPUs wrongly report the number of dual CUs as the number of CUs 153 | if(amd_ipc_4) ipc = 4u; // some AMD GPUs support dual-issuging of float2 vector type, or have ipc=4 for scalar float 154 | cores_per_cu = is_gpu ? 64.0f : 0.5f; // 64 cores/CU (GPUs), 1/2 core/CU (CPUs) 155 | const string amd_device_name = trim(cl_device.getInfo()); 156 | if(is_gpu&&length(amd_device_name)>0u) name = amd_device_name; // for AMD GPUs, CL_DEVICE_NAME wrongly outputs chip codename, and CL_DEVICE_BOARD_NAME_AMD outputs actual device name 157 | } else if(vendor_id==0x8086) { // Intel GPU/CPU 158 | const int intel_device_id = (int)cl_device.getInfo(); // also see CL_DEVICE_IP_VERSION_INTEL 159 | const bool intel_16_cores_per_cu = contains({ 0x0BD5, 0x0BDA, 0x64A0, 0xE20B, 0xE20C, 0xE211, 0xE212 }, intel_device_id); // GPU Max 1550, GPU Max 1100, Arc 140V/130V, Arc B580, Arc B570, Arc Pro B60, Arc Pro B50 160 | cores_per_cu = is_gpu ? (intel_16_cores_per_cu ? 
16.0f : 8.0f) : 0.5f; // Intel GPUs have 16 cores/CU (PVC/Xe2) or 8 cores/CU (Xe1), Intel CPUs (with HT) have 1/2 core/CU 161 | if(is_gpu&&!uses_ram) { // fix wrong global memory capacity reporting for Intel dGPUs 162 | #if defined(_WIN32) 163 | memory = (uint)((cl_device.getInfo()*50ull/49ull)/1048576ull); // 98% on Windows https://github.com/intel/compute-runtime/blob/master/shared/source/os_interface/windows/wddm_memory_manager.cpp#L969 164 | #elif defined(__linux__) 165 | memory = (uint)((cl_device.getInfo()*20ull/19ull)/1048576ull); // 95% on Linux https://github.com/intel/compute-runtime/blob/master/shared/source/os_interface/linux/drm_memory_manager.cpp#L1545 166 | #endif // Linux 167 | } 168 | patch_intel_gpu_above_4gb = patch_intel_gpu_above_4gb||(is_gpu&&memory>4096u); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM 169 | if(is_cpu) is_dp4a_capable = 0u; // native dp4a in Intel CPU Runtime for OpenCL is slower than emulated dp4a 170 | } else if(vendor_id==0x10DE||vendor_id==0x13B5) { // Nvidia GPU/CPU 171 | if(is_gpu) nvidia_compute_capability = 10u*(uint)cl_device.getInfo()+(uint)cl_device.getInfo(); 172 | const bool nvidia__32_cores_per_cu = (nvidia_compute_capability <30); // identify Fermi GPUs 173 | const bool nvidia_192_cores_per_cu = (nvidia_compute_capability>=30&&nvidia_compute_capability< 50); // identify Kepler GPUs 174 | const bool nvidia__64_cores_per_cu = (nvidia_compute_capability>=70&&nvidia_compute_capability<=80)||nvidia_compute_capability==60; // identify Volta, Turing, P100, A100, A30 175 | cores_per_cu = is_gpu ? (nvidia__32_cores_per_cu ? 32.0f : nvidia_192_cores_per_cu ? 192.0f : nvidia__64_cores_per_cu ? 
64.0f : 128.0f) : 1.0f; // 32 (Fermi), 192 (Kepler), 64 (Volta, Turing, P100, A100, A30), 128 (Maxwell, Pascal, Ampere, Hopper, Ada, Blackwell) or 1 (CPUs) 176 | patch_nvidia_fp16 = patch_nvidia_fp16||(nvidia_compute_capability>=60&&atof(driver_version.substr(0, 6).c_str())>=520.00); // enable for all Nvidia Pascal or newer GPUs with driver>=520.00 177 | if(patch_nvidia_fp16) is_fp16_capable = 2u; 178 | is_dp4a_capable = (uint)(nvidia_compute_capability>=61u); // Nvidia GPUs with nvidia_compute_capability>=61 don't report dp4a support through cl_khr_integer_dot_product extension, but support it via inline PTX assembly 179 | } else 180 | #endif // Windows / Linux / Android 181 | if(vendor_id==0x1027F00) { // Apple iGPU 182 | cores_per_cu = 128.0f; // Apple ARM GPUs usually have 128 cores/CU 183 | } else if(vendor_id==0x1022||vendor_id==0x10006||vendor_id==0x6C636F70) { // x86 CPUs with PoCL runtime 184 | cores_per_cu = 0.5f; // CPUs typically have 1/2 cores/CU due to SMT/hyperthreading 185 | } else if(contains(to_lower(vendor), "arm")) { // ARM 186 | cores_per_cu = is_gpu ? 
8.0f : 1.0f; // ARM GPUs usually have 8 cores/CU, ARM CPUs have 1 core/CU 187 | uses_ram = false; // CL_MEM_USE_HOST_PTR is broken on ARM iGPUs, so disable zero-copy there 188 | patch_legacy_gpu_fma = true; // enable for all ARM GPUs 189 | } 190 | cores = to_uint((float)compute_units*cores_per_cu); // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading) 191 | tflops = 1E-6f*(float)cores*(float)ipc*(float)clock_frequency; // estimated device floating point performance in TeraFLOPs/s 192 | } 193 | inline Device_Info() {}; // default constructor 194 | }; 195 | 196 | string get_opencl_c_code(); // implemented in kernel.hpp 197 | inline void print_device_info(const Device_Info& d) { // print OpenCL device info 198 | #if defined(_WIN32) 199 | const string os = "Windows"; 200 | #elif defined(__linux__) 201 | const string os = "Linux"; 202 | #elif defined(__APPLE__) 203 | const string os = "macOS"; 204 | #else // unknown operating system 205 | const string os = "unknown operating system"; 206 | #endif // operating system 207 | println("\r|----------------.------------------------------------------------------------|"); 208 | println("| Device ID | "+alignl(58, to_string(d.id) )+" |"); 209 | println("| Device Name | "+alignl(58, d.name )+" |"); 210 | println("| Device Vendor | "+alignl(58, d.vendor )+" |"); 211 | println("| Device Driver | "+alignl(58, d.driver_version+" ("+os+")" )+" |"); 212 | println("| OpenCL Version | "+alignl(58, "OpenCL C "+d.opencl_c_version )+" |"); 213 | println("| Compute Units | "+alignl(58, to_string(d.compute_units)+" at "+to_string(d.clock_frequency)+" MHz ("+to_string(d.cores)+" cores, "+to_string(d.tflops, 3)+" TFLOPs/s)")+" |"); 214 | println("| Memory, Cache | "+alignl(58, to_string(d.memory)+" MB "+(d.uses_ram ? 
"" : "V")+"RAM, "+to_string(d.global_cache)+" KB global / "+to_string(d.local_cache)+" KB local")+" |"); 215 | println("| Buffer Limits | "+alignl(58, to_string(d.max_global_buffer)+" MB global, "+to_string(d.max_constant_buffer)+" KB constant")+" |"); 216 | println("|----------------'------------------------------------------------------------|"); 217 | } 218 | inline vector get_devices(const bool print_info=true) { // returns a vector of all available OpenCL devices 219 | set_environment_variable((char*)"GPU_SINGLE_ALLOC_PERCENT=100"); // fix maximum buffer allocation size limit for AMD GPUs 220 | set_environment_variable((char*)"CL_CONFIG_CPU_FORCE_MAX_MEM_ALLOC_SIZE=17179869183GB"); // fix maximum buffer allocation size limit in Intel CPU Runtime for OpenCL, 2^34-1 is max non-overflowing value 221 | vector devices; // get all devices of all platforms 222 | vector cl_platforms; // get all platforms (drivers) 223 | cl::Platform::get(&cl_platforms); 224 | uint id = 0u; 225 | for(uint i=0u; i<(uint)cl_platforms.size(); i++) { 226 | vector cl_devices; 227 | cl_platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &cl_devices); 228 | //cl::Context cl_context(cl_devices); // same cl::Context for all devices (allocates extra VRAM on all other unused Nvidia GPUs) 229 | for(uint j=0u; j<(uint)cl_devices.size(); j++) { 230 | cl::Context cl_context(cl_devices[j]); // separate cl::Context for each device 231 | devices.push_back(Device_Info(cl_devices[j], cl_context, id++)); 232 | } 233 | } 234 | if((uint)cl_platforms.size()==0u||(uint)devices.size()==0u) { 235 | print_message("No OpenCL devices are available. Please install the drivers for your GPU(s) and/or the CPU Runtime for OpenCL. 
Instructions:", "Error", 12); 236 | print(driver_installation_instructions); 237 | #ifdef _WIN32 238 | wait(); 239 | #endif // Windows 240 | exit(1); 241 | } 242 | if(print_info) { 243 | println("\r|----------------.------------------------------------------------------------|"); 244 | for(uint i=0u; i<(uint)devices.size(); i++) println("| Device ID "+alignr(4u, i)+" | "+alignl(58u, devices[i].name)+" |"); 245 | println("|----------------'------------------------------------------------------------|"); 246 | } 247 | return devices; 248 | } 249 | inline Device_Info select_device_with_most_flops(const vector& devices=get_devices()) { // returns device with best floating-point performance 250 | float best_value = 0.0f; 251 | uint best_i = 0u; 252 | for(uint i=0u; i<(uint)devices.size(); i++) { // find device with highest (estimated) floating point performance 253 | if(devices[i].tflops>best_value) { 254 | best_value = devices[i].tflops; 255 | best_i = i; 256 | } 257 | } 258 | return devices[best_i]; 259 | } 260 | inline Device_Info select_device_with_most_memory(const vector& devices=get_devices()) { // returns device with largest memory capacity 261 | uint best_value = 0u; 262 | uint best_i = 0u; 263 | for(uint i=0u; i<(uint)devices.size(); i++) { // find device with most memory 264 | if(devices[i].memory>best_value) { 265 | best_value = devices[i].memory; 266 | best_i = i; 267 | } 268 | } 269 | return devices[best_i]; 270 | } 271 | inline Device_Info select_device_with_id(const uint id, const vector& devices=get_devices()) { // returns device with specified ID 272 | if(id<(uint)devices.size()) { 273 | return devices[id]; 274 | } else { 275 | print_error("Your selected Device ID ("+to_string(id)+") is wrong."); 276 | return devices[0]; // is never executed, just to avoid compiler warnings 277 | } 278 | } 279 | 280 | class Device { 281 | private: 282 | cl::Program cl_program; 283 | cl::CommandQueue cl_queue; 284 | bool exists = false; 285 | inline string 
enable_device_capabilities() const { return // enable FP64/FP16 capabilities if available 286 | string(info.patch_nvidia_fp16 ? "\n #define cl_khr_fp16" : "")+ // Nvidia Pascal and newer GPUs with driver>=520.00 don't report cl_khr_fp16, but do support basic FP16 arithmetic 287 | string(info.patch_legacy_gpu_fma ? "\n #define fma(a, b, c) ((a)*(b)+(c))" : "")+ // some old GPUs have terrible fma performance, so replace with a*b+c 288 | string(info.nvidia_compute_capability ? "\n #define cl_nv_compute_capability "+to_string(info.nvidia_compute_capability) : "")+ // allows querying Nvidia compute capability for inline PTX 289 | string(info.is_dp4a_capable==0u ? "\n #undef __opencl_c_integer_dot_product_input_4x8bit\n #undef __opencl_c_integer_dot_product_input_4x8bit_packed" : "")+ // patch false dp4a reporting on Intel 290 | "\n #define cl_workgroup_size "+to_string(WORKGROUP_SIZE)+"u" 291 | "\n #ifdef cl_khr_fp64" 292 | "\n #pragma OPENCL EXTENSION cl_khr_fp64 : enable" // make sure cl_khr_fp64 extension is enabled 293 | "\n #endif" 294 | "\n #ifdef cl_khr_fp16" 295 | "\n #pragma OPENCL EXTENSION cl_khr_fp16 : enable" // make sure cl_khr_fp16 extension is enabled 296 | "\n #endif" 297 | "\n #ifdef cl_khr_int64_base_atomics" 298 | "\n #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable" // make sure cl_khr_int64_base_atomics extension is enabled 299 | "\n #endif" 300 | ;} 301 | public: 302 | Device_Info info; 303 | inline Device(const Device_Info& info, const string& opencl_c_code=get_opencl_c_code()) { 304 | print_device_info(info); 305 | this->info = info; 306 | this->cl_queue = cl::CommandQueue(info.cl_context, info.cl_device); // queue to push commands for the device 307 | cl::Program::Sources cl_source; 308 | const string kernel_code = enable_device_capabilities()+"\n"+opencl_c_code; 309 | cl_source.push_back({ kernel_code.c_str(), kernel_code.length() }); 310 | this->cl_program = cl::Program(info.cl_context, cl_source); 311 | const string build_options 
= "-cl-std=CL"+info.opencl_c_version+" -cl-finite-math-only -cl-no-signed-zeros -cl-mad-enable"+(info.patch_intel_gpu_above_4gb ? " -cl-intel-greater-than-4GB-buffer-required" : ""); 312 | #ifndef LOG 313 | int error = cl_program.build({ info.cl_device }, (build_options+" -w").c_str()); // compile OpenCL C code, disable warnings 314 | if(error) print_warning(cl_program.getBuildInfo(info.cl_device)); // print build log 315 | #else // LOG, generate logfile for OpenCL code compilation 316 | int error = cl_program.build({ info.cl_device }, build_options.c_str()); // compile OpenCL C code 317 | const string log = cl_program.getBuildInfo(info.cl_device); 318 | write_file("bin/kernel.log", log); // save build log 319 | if((uint)log.length()>2u) print_warning(log); // print build log 320 | #endif // LOG 321 | if(error) print_error("OpenCL C code compilation failed with error code "+to_string(error)+". Make sure there are no errors in kernel.cpp."); 322 | else print_info("OpenCL C code successfully compiled."); 323 | #ifdef PTX // generate assembly (ptx) file for OpenCL code 324 | write_file("bin/kernel.ptx", (char*)&cl_program.getInfo()[0][0]); // save binary (ptx file) 325 | #endif // PTX 326 | this->exists = true; 327 | } 328 | inline Device() {} // default constructor 329 | inline void barrier(const vector* event_waitlist=nullptr, Event* event_returned=nullptr) { cl_queue.enqueueBarrierWithWaitList(event_waitlist, event_returned); } 330 | inline void finish_queue() { cl_queue.finish(); } 331 | inline cl::Context get_cl_context() const { return info.cl_context; } 332 | inline cl::Program get_cl_program() const { return cl_program; } 333 | inline cl::CommandQueue get_cl_queue() const { return cl_queue; } 334 | inline bool is_initialized() const { return exists; } 335 | }; 336 | 337 | template class Memory { 338 | private: 339 | ulong N = 0ull; // buffer length 340 | uint d = 1u; // buffer dimensions 341 | bool host_buffer_exists = false; 342 | bool device_buffer_exists = 
false; 343 | bool external_host_buffer = false; // Memory object has been created with an externally supplied host buffer/pointer 344 | bool is_zero_copy = false; // if possible (device is CPU or iGPU), and if allowed by user, use zero-copy buffer: host+device buffers are fused into one 345 | T* host_buffer = nullptr; // host buffer 346 | T* host_buffer_unaligned = nullptr; // unaligned host buffer (only required for zero-copy to align host_buffer) 347 | cl::Buffer device_buffer; // device buffer 348 | Device* device = nullptr; // pointer to linked Device 349 | cl::CommandQueue cl_queue; // command queue 350 | inline void initialize_auxiliary_pointers() { 351 | /********/ x = s0 = host_buffer; /******/ if(d>0x4u) s4 = host_buffer+N*0x4ull; if(d>0x8u) s8 = host_buffer+N*0x8ull; if(d>0xCu) sC = host_buffer+N*0xCull; 352 | if(d>0x1u) y = s1 = host_buffer+N; /****/ if(d>0x5u) s5 = host_buffer+N*0x5ull; if(d>0x9u) s9 = host_buffer+N*0x9ull; if(d>0xDu) sD = host_buffer+N*0xDull; 353 | if(d>0x2u) z = s2 = host_buffer+N*0x2ull; if(d>0x6u) s6 = host_buffer+N*0x6ull; if(d>0xAu) sA = host_buffer+N*0xAull; if(d>0xEu) sE = host_buffer+N*0xEull; 354 | if(d>0x3u) w = s3 = host_buffer+N*0x3ull; if(d>0x7u) s7 = host_buffer+N*0x7ull; if(d>0xBu) sB = host_buffer+N*0xBull; if(d>0xFu) sF = host_buffer+N*0xFull; 355 | } 356 | inline void allocate_host_buffer(Device& device, const bool allocate_host, const bool allow_zero_copy) { 357 | if(allocate_host) { 358 | const ulong alignment = allow_zero_copy&&device.info.uses_ram ? 4096ull : 64ull; // host_buffer must be aligned to 4096 Bytes for CL_MEM_USE_HOST_PTR, and to 64 Bytes for optimal enqueueReadBuffer performance on modern CPUs 359 | const ulong padding = allow_zero_copy&&device.info.uses_ram ? 
64ull : 0ull; // for CL_MEM_USE_HOST_PTR, 64 Bytes padding is required because device_buffer capacity in this case must be a multiple of 64 Bytes 360 | host_buffer_unaligned = new T[N*(ulong)d+(alignment+padding)/sizeof(T)]; // over-allocate host_buffer_unaligned by (alignment+padding) Bytes 361 | host_buffer = (T*)((((ulong)host_buffer_unaligned+alignment-1ull)/alignment)*alignment); // align host_buffer by fine-tuning pointer to be a multiple of alignment 362 | initialize_auxiliary_pointers(); 363 | host_buffer_exists = true; 364 | } 365 | } 366 | inline void allocate_device_buffer(Device& device, const bool allocate_device, const bool allow_zero_copy) { 367 | this->device = &device; 368 | this->cl_queue = device.get_cl_queue(); 369 | if(allocate_device) { 370 | device.info.memory_used += (uint)(capacity()/1048576ull); // track device memory usage 371 | if(device.info.memory_used>device.info.memory) print_error("Device \""+device.info.name+"\" does not have enough memory. Allocating another "+to_string((uint)(capacity()/1048576ull))+" MB would use a total of "+to_string(device.info.memory_used)+" MB / "+to_string(device.info.memory)+" MB."); 372 | int error = 0; 373 | is_zero_copy = allow_zero_copy&&host_buffer_exists&&device.info.uses_ram&&(!external_host_buffer||((ulong)host_buffer%4096ull==0ull&&capacity()%64ull==0ull)); 374 | device_buffer = cl::Buffer( // if(is_zero_copy) { don't allocate extra memory on CPUs/iGPUs } else { allocate VRAM on GPUs } 375 | device.get_cl_context(), 376 | CL_MEM_READ_WRITE|((int)is_zero_copy*CL_MEM_USE_HOST_PTR)|((int)device.info.patch_intel_gpu_above_4gb<<23), // for Intel GPUs set flag CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL = (1<<23) 377 | is_zero_copy ? ((capacity()+63ull)/64ull)*64ull : capacity(), // device_buffer capacity must be a multiple of 64 Bytes for CL_MEM_USE_HOST_PTR 378 | is_zero_copy ? 
(void*)host_buffer : nullptr, 379 | &error 380 | ); 381 | if(error==-61) print_error("Memory size is too large at "+to_string((uint)(capacity()/1048576ull))+" MB. Device \""+device.info.name+"\" accepts a maximum buffer size of "+to_string(device.info.max_global_buffer)+" MB."); 382 | else if(error) print_error("Device buffer allocation failed with error code "+to_string(error)+"."); 383 | device_buffer_exists = true; 384 | } 385 | } 386 | public: 387 | T *x=nullptr, *y=nullptr, *z=nullptr, *w=nullptr; // host buffer auxiliary pointers for multi-dimensional array access (array of structures) 388 | T *s0=nullptr, *s1=nullptr, *s2=nullptr, *s3=nullptr, *s4=nullptr, *s5=nullptr, *s6=nullptr, *s7=nullptr, *s8=nullptr, *s9=nullptr, *sA=nullptr, *sB=nullptr, *sC=nullptr, *sD=nullptr, *sE=nullptr, *sF=nullptr; 389 | inline Memory(Device& device, const ulong N, const uint dimensions=1u, const bool allocate_host=true, const bool allocate_device=true, const T value=(T)0, const bool allow_zero_copy=true) { 390 | if(!device.is_initialized()) print_error("No Device selected. Call Device constructor."); 391 | if(N*(ulong)dimensions==0ull) print_error("Memory size must be larger than 0."); 392 | this->N = N; 393 | this->d = dimensions; 394 | allocate_host_buffer(device, allocate_host, allow_zero_copy); // allocate host_buffer first 395 | allocate_device_buffer(device, allocate_device, allow_zero_copy); // allocate device_buffer second 396 | reset(value); 397 | } 398 | inline Memory(Device& device, const ulong N, const uint dimensions, T* const host_buffer, const bool allocate_device=true, const bool allow_zero_copy=true) { 399 | if(!device.is_initialized()) print_error("No Device selected. 
Call Device constructor."); 400 | if(N*(ulong)dimensions==0ull) print_error("Memory size must be larger than 0."); 401 | this->N = N; 402 | this->d = dimensions; 403 | this->host_buffer = host_buffer; 404 | initialize_auxiliary_pointers(); 405 | host_buffer_exists = true; 406 | external_host_buffer = true; 407 | allocate_device_buffer(device, allocate_device, allow_zero_copy); 408 | write_to_device(); 409 | } 410 | inline Memory() {} // default constructor 411 | inline ~Memory() { 412 | delete_buffers(); 413 | } 414 | inline Memory& operator=(Memory&& memory) noexcept { // move assignment 415 | delete_buffers(); // delete existing buffers and restore default state 416 | N = memory.length(); // copy values/pointers from memory 417 | d = memory.dimensions(); 418 | device = memory.device; 419 | cl_queue = memory.device->get_cl_queue(); 420 | if(memory.host_buffer_exists) { 421 | host_buffer = memory.exchange_host_buffer(nullptr); // transfer host_buffer pointer 422 | host_buffer_unaligned = memory.exchange_host_buffer_unaligned(nullptr); // transfer host_buffer_unaligned pointer 423 | initialize_auxiliary_pointers(); 424 | external_host_buffer = memory.external_host_buffer; 425 | host_buffer_exists = true; 426 | } 427 | if(memory.device_buffer_exists) { 428 | device_buffer = memory.get_cl_buffer(); // transfer device_buffer pointer 429 | device->info.memory_used += (uint)(capacity()/1048576ull); // track device memory usage 430 | is_zero_copy = memory.is_zero_copy; 431 | device_buffer_exists = true; 432 | } 433 | return *this; // destructor of memory will be called automatically 434 | } 435 | inline T* const exchange_host_buffer(T* const host_buffer) { // sets host_buffer to new pointer and returns old pointer 436 | T* const swap = this->host_buffer; 437 | this->host_buffer = host_buffer; 438 | return swap; 439 | } 440 | inline T* const exchange_host_buffer_unaligned(T* const host_buffer_unaligned) { // sets host_buffer_unaligned to new pointer and returns old pointer 
441 | T* const swap = this->host_buffer_unaligned; 442 | this->host_buffer_unaligned = host_buffer_unaligned; 443 | return swap; 444 | } 445 | inline void add_host_buffer() { // makes only sense if there is no host buffer yet but an existing device buffer 446 | if(!host_buffer_exists&&device_buffer_exists) { 447 | host_buffer_unaligned = new T[N*(ulong)d]; host_buffer = host_buffer_unaligned; external_host_buffer = false; // own the allocation through host_buffer_unaligned, because that is the pointer delete_host_buffer() frees 448 | initialize_auxiliary_pointers(); 449 | host_buffer_exists = true; // must be set before reading: read_from_device() does nothing while host_buffer_exists is false 450 | read_from_device(); // fill the new host buffer with the current device contents 451 | } else if(!device_buffer_exists) { 452 | print_error("There is no existing device buffer, so can't add host buffer."); 453 | } 454 | } 455 | inline void add_device_buffer(const bool allow_zero_copy=true) { // makes only sense if there is no device buffer yet but an existing host buffer 456 | if(!device_buffer_exists&&host_buffer_exists) { 457 | allocate_device_buffer(*device, true, allow_zero_copy); 458 | write_to_device(); // upload the existing host data into the new device buffer 459 | } else if(!host_buffer_exists) { 460 | print_error("There is no existing host buffer, so can't add device buffer."); 461 | } 462 | } 463 | inline void delete_host_buffer() { // frees the host buffer unless it is externally owned; may be called repeatedly 464 | host_buffer_exists = false; 465 | if(!external_host_buffer) { 466 | host_buffer = nullptr; 467 | delete[] host_buffer_unaligned; host_buffer_unaligned = nullptr; // clear the pointer so a repeated call (e.g. an explicit call followed by the destructor) cannot free it twice 468 | } 469 | if(!device_buffer_exists) { // no buffer left at all: restore the default size 470 | N = 0ull; 471 | d = 1u; 472 | } 473 | } 474 | inline void delete_device_buffer() { // drops the device buffer and updates the memory usage counter of the device 475 | if(device_buffer_exists) device->info.memory_used -= (uint)(capacity()/1048576ull); // track device memory usage 476 | device_buffer_exists = false; 477 | device_buffer = nullptr; 478 | if(!host_buffer_exists) { // no buffer left at all: restore the default size 479 | N = 0ull; 480 | d = 1u; 481 | } 482 | } 483 | inline void delete_buffers() { // device buffer first, so capacity() still reflects N and d when the usage counter is decremented 484 | delete_device_buffer(); 485 | delete_host_buffer(); 486 | } 487 | inline void reset(const T value=(T)0) { 488 | //if(device_buffer_exists) cl_queue.enqueueFillBuffer(device_buffer, value, 0ull, capacity()); // faster than "write_to_device();" 489 | if(host_buffer_exists) std::fill(host_buffer, host_buffer+range(), value); // faster than "for(ulong i=0ull; i* 
event_waitlist=nullptr, Event* event_returned=nullptr) { 506 | if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) { // nothing is transferred unless both buffers exist and the buffer is not zero-copy 507 | cl_queue.enqueueReadBuffer(device_buffer, blocking, 0ull, capacity(), (void*)host_buffer, event_waitlist, event_returned); // device -> host, whole buffer: offset 0, size capacity() 508 | } 509 | } 510 | inline void write_to_device(const bool blocking=true, const vector* event_waitlist=nullptr, Event* event_returned=nullptr) { // copies the entire host buffer to the device buffer 511 | if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) { // nothing is transferred unless both buffers exist and the buffer is not zero-copy 512 | cl_queue.enqueueWriteBuffer(device_buffer, blocking, 0ull, capacity(), (void*)host_buffer, event_waitlist, event_returned); // host -> device, whole buffer: offset 0, size capacity() 513 | } 514 | } 515 | inline void read_from_device(const ulong offset, const ulong length, const bool blocking=true, const vector* event_waitlist=nullptr, Event* event_returned=nullptr) { // copies a sub-range from device to host; offset and length count elements of T, not bytes 516 | if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) { 517 | const ulong safe_offset=min(offset, range()), safe_length=min(length, range()-safe_offset); // clamp the requested window to range() so the transfer cannot run past the end of the buffer 518 | if(safe_length>0ull) cl_queue.enqueueReadBuffer(device_buffer, blocking, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned); // element counts are converted to bytes with sizeof(T); an empty window enqueues nothing 519 | } 520 | } 521 | inline void write_to_device(const ulong offset, const ulong length, const bool blocking=true, const vector* event_waitlist=nullptr, Event* event_returned=nullptr) { // copies a sub-range from host to device; offset and length count elements of T, not bytes 522 | if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) { 523 | const ulong safe_offset=min(offset, range()), safe_length=min(length, range()-safe_offset); // clamp the requested window to range() so the transfer cannot run past the end of the buffer 524 | if(safe_length>0ull) cl_queue.enqueueWriteBuffer(device_buffer, blocking, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned); // element counts are converted to bytes with sizeof(T); an empty window enqueues nothing 525 | } 526 | } 527 | inline void read_from_device_1d(const ulong x0, const ulong x1, const int dimension=-1, const bool blocking=true, const vector* event_waitlist=nullptr, Event* event_returned=nullptr) { // read 1D domain from device, either for all vector dimensions (-1) or for a specified dimension 
| if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) { 529 | const uint i0=(uint)max(0, dimension), i1=dimension<0 ? d : i0+1u; 530 | for(uint i=i0; i0ull) cl_queue.enqueueReadBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned); 533 | } 534 | if(blocking) cl_queue.finish(); 535 | } 536 | } 537 | inline void write_to_device_1d(const ulong x0, const ulong x1, const int dimension=-1, const bool blocking=true, const vector* event_waitlist=nullptr, Event* event_returned=nullptr) { // write 1D domain to device, either for all vector dimensions (-1) or for a specified dimension 538 | if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) { 539 | const uint i0=(uint)max(0, dimension), i1=dimension<0 ? d : i0+1u; 540 | for(uint i=i0; i0ull) cl_queue.enqueueWriteBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned); 543 | } 544 | if(blocking) cl_queue.finish(); 545 | } 546 | } 547 | inline void read_from_device_2d(const ulong x0, const ulong x1, const ulong y0, const ulong y1, const ulong Nx, const ulong Ny, const int dimension=-1, const bool blocking=true, const vector* event_waitlist=nullptr, Event* event_returned=nullptr) { // read 2D domain from device, either for all vector dimensions (-1) or for a specified dimension 548 | if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) { 549 | for(uint y=y0; y0ull) cl_queue.enqueueReadBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned); 555 | } 556 | } 557 | if(blocking) cl_queue.finish(); 558 | } 559 | } 560 | inline void write_to_device_2d(const ulong x0, const ulong x1, const ulong y0, const ulong y1, const ulong Nx, const ulong Ny, const int dimension=-1, const bool blocking=true, const vector* event_waitlist=nullptr, Event* 
event_returned=nullptr) { // write 2D domain to device, either for all vector dimensions (-1) or for a specified dimension 561 | if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) { 562 | for(uint y=y0; y0ull) cl_queue.enqueueWriteBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned); 568 | } 569 | } 570 | if(blocking) cl_queue.finish(); 571 | } 572 | } 573 | inline void read_from_device_3d(const ulong x0, const ulong x1, const ulong y0, const ulong y1, const ulong z0, const ulong z1, const ulong Nx, const ulong Ny, const ulong Nz, const int dimension=-1, const bool blocking=true, const vector* event_waitlist=nullptr, Event* event_returned=nullptr) { // read 3D domain from device, either for all vector dimensions (-1) or for a specified dimension 574 | if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) { 575 | for(uint z=z0; z0ull) cl_queue.enqueueReadBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned); 582 | } 583 | } 584 | } 585 | if(blocking) cl_queue.finish(); 586 | } 587 | } 588 | inline void write_to_device_3d(const ulong x0, const ulong x1, const ulong y0, const ulong y1, const ulong z0, const ulong z1, const ulong Nx, const ulong Ny, const ulong Nz, const int dimension=-1, const bool blocking=true, const vector* event_waitlist=nullptr, Event* event_returned=nullptr) { // write 3D domain to device, either for all vector dimensions (-1) or for a specified dimension 589 | if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) { 590 | for(uint z=z0; z0ull) cl_queue.enqueueWriteBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned); 597 | } 598 | } 599 | } 600 | if(blocking) cl_queue.finish(); 601 | } 602 | } 603 | inline void enqueue_read_from_device(const vector* 
event_waitlist=nullptr, Event* event_returned=nullptr) { read_from_device(false, event_waitlist, event_returned); } // non-blocking variant: forwards with blocking=false 604 | inline void enqueue_write_to_device(const vector* event_waitlist=nullptr, Event* event_returned=nullptr) { write_to_device(false, event_waitlist, event_returned); } // non-blocking variant: forwards with blocking=false 605 | inline void enqueue_read_from_device(const ulong offset, const ulong length, const vector* event_waitlist=nullptr, Event* event_returned=nullptr) { read_from_device(offset, length, false, event_waitlist, event_returned); } // non-blocking sub-range read: forwards with blocking=false 606 | inline void enqueue_write_to_device(const ulong offset, const ulong length, const vector* event_waitlist=nullptr, Event* event_returned=nullptr) { write_to_device(offset, length, false, event_waitlist, event_returned); } // non-blocking sub-range write: forwards with blocking=false 607 | inline void finish_queue() { cl_queue.finish(); } // calls finish() on the command queue of this object, e.g. after the enqueue_* variants above 608 | inline const cl::Buffer& get_cl_buffer() const { return device_buffer; } // read-only access to the underlying OpenCL buffer object 609 | }; 610 | 611 | class Kernel { 612 | private: 613 | ulong N = 0ull; // kernel range 614 | uint number_of_parameters = 0u; // raised by link_parameters() to the position after the last linked parameter 615 | string name = ""; // kernel name, inserted into the messages of check_for_errors() 616 | cl::Kernel cl_kernel; 617 | cl::NDRange cl_range_global, cl_range_local; // both are assigned in set_ranges() 618 | cl::CommandQueue cl_queue; // assigned from device.get_cl_queue() in the constructors 619 | inline void check_for_errors(const int error) { // maps selected OpenCL error codes to readable messages via print_error() 620 | if(error==-48) print_error("There is no OpenCL kernel with name \""+name+"(...)\" in the OpenCL C code! 
Check spelling!"); 621 | if(error<-48&&error>-53) print_error("Parameters for OpenCL kernel \""+name+"(...)\" don't match between C++ and OpenCL C!"); 622 | if(error==-54) print_error("Workgrop size "+to_string(WORKGROUP_SIZE)+" for OpenCL kernel \""+name+"(...)\" is invalid!"); 623 | if(error!=0) print_error("OpenCL kernel \""+name+"(...)\" failed with error code "+to_string(error)+"!"); 624 | } 625 | template inline void link_parameter(const uint position, const Memory& memory) { 626 | check_for_errors(cl_kernel.setArg(position, memory.get_cl_buffer())); 627 | } 628 | template inline void link_parameter(const uint position, const T& constant) { 629 | check_for_errors(cl_kernel.setArg(position, sizeof(T), (void*)&constant)); 630 | } 631 | inline void link_parameters(const uint starting_position) { 632 | number_of_parameters = max(number_of_parameters, starting_position); 633 | } 634 | template inline void link_parameters(const uint starting_position, const T& parameter, const U&... parameters) { 635 | link_parameter(starting_position, parameter); 636 | link_parameters(starting_position+1u, parameters...); 637 | } 638 | public: 639 | template inline Kernel(const Device& device, const ulong N, const string& name, const T&... parameters) { // accepts Memory objects and fundamental data type constants 640 | if(!device.is_initialized()) print_error("No OpenCL Device selected. Call Device constructor."); 641 | this->name = name; 642 | cl_kernel = cl::Kernel(device.get_cl_program(), name.c_str()); 643 | link_parameters(0u, parameters...); // expand variadic template to link kernel parameters 644 | set_ranges(N); 645 | cl_queue = device.get_cl_queue(); 646 | } 647 | template inline Kernel(const Device& device, const ulong N, const uint workgroup_size, const string& name, const T&... parameters) { // accepts Memory objects and fundamental data type constants 648 | if(!device.is_initialized()) print_error("No OpenCL Device selected. 
Call Device constructor."); 649 | this->name = name; cl_kernel = cl::Kernel(device.get_cl_program(), name.c_str()); // store the name first, as the other constructor does: check_for_errors() puts it into every message 650 | link_parameters(0u, parameters...); // expand variadic template to link kernel parameters 651 | set_ranges(N, (ulong)workgroup_size); 652 | cl_queue = device.get_cl_queue(); 653 | } 654 | inline Kernel() {} // default constructor 655 | inline Kernel& set_ranges(const ulong N, const ulong workgroup_size=(ulong)WORKGROUP_SIZE) { // sets the kernel range N and the local (workgroup) size; returns *this for chaining 656 | if(workgroup_size==0ull) print_error("Workgroup size must be larger than 0."); /* guards the division below; presumably print_error() does not return, as in the Memory size check */ this->N = N; 657 | cl_range_global = cl::NDRange(((N+workgroup_size-1ull)/workgroup_size)*workgroup_size); // make global range a multiple of local range 658 | cl_range_local = cl::NDRange(workgroup_size); 659 | return *this; 660 | } 661 | inline const ulong range() const { return N; } // the N last passed to set_ranges(), not the rounded-up global range 662 | inline uint get_number_of_parameters() const { return number_of_parameters; } 663 | template inline Kernel& add_parameters(const T&... parameters) { // add parameters to the list of existing parameters 664 | link_parameters(number_of_parameters, parameters...); // expand variadic template to link kernel parameters 665 | return *this; 666 | } 667 | template inline Kernel& set_parameters(const uint starting_position, const T&... 
parameters) { // set parameters starting at specified position 668 | link_parameters(starting_position, parameters...); // expand variadic template to link kernel parameters 669 | return *this; // allows chaining 670 | } 671 | inline Kernel& enqueue_run(const uint t=1u, const vector* event_waitlist=nullptr, Event* event_returned=nullptr) { 672 | for(uint i=0u; i* event_waitlist=nullptr, Event* event_returned=nullptr) { 678 | enqueue_run(t, event_waitlist, event_returned); // enqueue first ... 679 | finish_queue(); // ... then call finish() on the queue before returning 680 | return *this; 681 | } 682 | inline Kernel& operator()(const uint t=1u, const vector* event_waitlist=nullptr, Event* event_returned=nullptr) { // call operator: shorthand for run() 683 | return run(t, event_waitlist, event_returned); 684 | } 685 | inline Kernel& finish_queue() { // calls finish() on the command queue of this kernel; returns *this for chaining 686 | cl_queue.finish(); 687 | return *this; 688 | } 689 | }; -------------------------------------------------------------------------------- /src/OpenCL/include/CL/cl_platform.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | ******************************************************************************/ 16 | 17 | #ifndef __CL_PLATFORM_H 18 | #define __CL_PLATFORM_H 19 | 20 | #include 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | 26 | #if defined(_WIN32) 27 | #if !defined(CL_API_ENTRY) 28 | #define CL_API_ENTRY 29 | #endif 30 | #if !defined(CL_API_CALL) 31 | #define CL_API_CALL __stdcall 32 | #endif 33 | #if !defined(CL_CALLBACK) 34 | #define CL_CALLBACK __stdcall 35 | #endif 36 | #else 37 | #if !defined(CL_API_ENTRY) 38 | #define CL_API_ENTRY 39 | #endif 40 | #if !defined(CL_API_CALL) 41 | #define CL_API_CALL 42 | #endif 43 | #if !defined(CL_CALLBACK) 44 | #define CL_CALLBACK 45 | #endif 46 | #endif 47 | 48 | /* 49 | * Deprecation flags refer to the last version of the header in which the 50 | * feature was not deprecated. 51 | * 52 | * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without 53 | * deprecation but is deprecated in versions later than 1.1. 54 | */ 55 | 56 | #ifndef CL_API_SUFFIX_USER 57 | #define CL_API_SUFFIX_USER 58 | #endif 59 | 60 | #ifndef CL_API_PREFIX_USER 61 | #define CL_API_PREFIX_USER 62 | #endif 63 | 64 | #define CL_API_SUFFIX_COMMON CL_API_SUFFIX_USER 65 | #define CL_API_PREFIX_COMMON CL_API_PREFIX_USER 66 | 67 | #define CL_API_SUFFIX__VERSION_1_0 CL_API_SUFFIX_COMMON 68 | #define CL_API_SUFFIX__VERSION_1_1 CL_API_SUFFIX_COMMON 69 | #define CL_API_SUFFIX__VERSION_1_2 CL_API_SUFFIX_COMMON 70 | #define CL_API_SUFFIX__VERSION_2_0 CL_API_SUFFIX_COMMON 71 | #define CL_API_SUFFIX__VERSION_2_1 CL_API_SUFFIX_COMMON 72 | #define CL_API_SUFFIX__VERSION_2_2 CL_API_SUFFIX_COMMON 73 | #define CL_API_SUFFIX__VERSION_3_0 CL_API_SUFFIX_COMMON 74 | #define CL_API_SUFFIX__EXPERIMENTAL CL_API_SUFFIX_COMMON 75 | 76 | 77 | #ifdef __GNUC__ 78 | #define CL_API_SUFFIX_DEPRECATED __attribute__((deprecated)) 79 | #define CL_API_PREFIX_DEPRECATED 80 | #elif defined(_MSC_VER) && !defined(__clang__) 81 | #define CL_API_SUFFIX_DEPRECATED 82 | 
#define CL_API_PREFIX_DEPRECATED __declspec(deprecated) 83 | #else 84 | #define CL_API_SUFFIX_DEPRECATED 85 | #define CL_API_PREFIX_DEPRECATED 86 | #endif 87 | 88 | #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS 89 | #define CL_API_SUFFIX__VERSION_1_0_DEPRECATED CL_API_SUFFIX_COMMON 90 | #define CL_API_PREFIX__VERSION_1_0_DEPRECATED CL_API_PREFIX_COMMON 91 | #else 92 | #define CL_API_SUFFIX__VERSION_1_0_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED 93 | #define CL_API_PREFIX__VERSION_1_0_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED 94 | #endif 95 | 96 | #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS 97 | #define CL_API_SUFFIX__VERSION_1_1_DEPRECATED CL_API_SUFFIX_COMMON 98 | #define CL_API_PREFIX__VERSION_1_1_DEPRECATED CL_API_PREFIX_COMMON 99 | #else 100 | #define CL_API_SUFFIX__VERSION_1_1_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED 101 | #define CL_API_PREFIX__VERSION_1_1_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED 102 | #endif 103 | 104 | #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS 105 | #define CL_API_SUFFIX__VERSION_1_2_DEPRECATED CL_API_SUFFIX_COMMON 106 | #define CL_API_PREFIX__VERSION_1_2_DEPRECATED CL_API_PREFIX_COMMON 107 | #else 108 | #define CL_API_SUFFIX__VERSION_1_2_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED 109 | #define CL_API_PREFIX__VERSION_1_2_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED 110 | #endif 111 | 112 | #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS 113 | #define CL_API_SUFFIX__VERSION_2_0_DEPRECATED CL_API_SUFFIX_COMMON 114 | #define CL_API_PREFIX__VERSION_2_0_DEPRECATED CL_API_PREFIX_COMMON 115 | #else 116 | #define CL_API_SUFFIX__VERSION_2_0_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED 117 | #define CL_API_PREFIX__VERSION_2_0_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED 118 | #endif 119 | 120 | #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS 121 | #define CL_API_SUFFIX__VERSION_2_1_DEPRECATED CL_API_SUFFIX_COMMON 122 | #define CL_API_PREFIX__VERSION_2_1_DEPRECATED 
CL_API_PREFIX_COMMON 123 | #else 124 | #define CL_API_SUFFIX__VERSION_2_1_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED 125 | #define CL_API_PREFIX__VERSION_2_1_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED 126 | #endif 127 | 128 | #ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS 129 | #define CL_API_SUFFIX__VERSION_2_2_DEPRECATED CL_API_SUFFIX_COMMON 130 | #define CL_API_PREFIX__VERSION_2_2_DEPRECATED CL_API_PREFIX_COMMON 131 | #else 132 | #define CL_API_SUFFIX__VERSION_2_2_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED 133 | #define CL_API_PREFIX__VERSION_2_2_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED 134 | #endif 135 | 136 | #if (defined (_WIN32) && defined(_MSC_VER)) 137 | 138 | #if defined(__clang__) 139 | #pragma clang diagnostic push 140 | #pragma clang diagnostic ignored "-Wlanguage-extension-token" 141 | #endif 142 | 143 | /* intptr_t is used in cl.h and provided by stddef.h in Visual C++, but not in clang */ 144 | /* stdint.h was missing before Visual Studio 2010, include it for later versions and for clang */ 145 | #if defined(__clang__) || _MSC_VER >= 1600 146 | #include 147 | #endif 148 | 149 | /* scalar types */ 150 | typedef signed __int8 cl_char; 151 | typedef unsigned __int8 cl_uchar; 152 | typedef signed __int16 cl_short; 153 | typedef unsigned __int16 cl_ushort; 154 | typedef signed __int32 cl_int; 155 | typedef unsigned __int32 cl_uint; 156 | typedef signed __int64 cl_long; 157 | typedef unsigned __int64 cl_ulong; 158 | 159 | typedef unsigned __int16 cl_half; 160 | typedef float cl_float; 161 | typedef double cl_double; 162 | 163 | #if defined(__clang__) 164 | #pragma clang diagnostic pop 165 | #endif 166 | 167 | /* Macro names and corresponding values defined by OpenCL */ 168 | #define CL_CHAR_BIT 8 169 | #define CL_SCHAR_MAX 127 170 | #define CL_SCHAR_MIN (-127-1) 171 | #define CL_CHAR_MAX CL_SCHAR_MAX 172 | #define CL_CHAR_MIN CL_SCHAR_MIN 173 | #define CL_UCHAR_MAX 255 174 | #define CL_SHRT_MAX 32767 175 
| #define CL_SHRT_MIN (-32767-1) 176 | #define CL_USHRT_MAX 65535 177 | #define CL_INT_MAX 2147483647 178 | #define CL_INT_MIN (-2147483647-1) 179 | #define CL_UINT_MAX 0xffffffffU 180 | #define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) 181 | #define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) 182 | #define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) 183 | 184 | #define CL_FLT_DIG 6 185 | #define CL_FLT_MANT_DIG 24 186 | #define CL_FLT_MAX_10_EXP +38 187 | #define CL_FLT_MAX_EXP +128 188 | #define CL_FLT_MIN_10_EXP -37 189 | #define CL_FLT_MIN_EXP -125 190 | #define CL_FLT_RADIX 2 191 | #define CL_FLT_MAX 340282346638528859811704183484516925440.0f 192 | #define CL_FLT_MIN 1.175494350822287507969e-38f 193 | #define CL_FLT_EPSILON 1.1920928955078125e-7f 194 | 195 | #define CL_HALF_DIG 3 196 | #define CL_HALF_MANT_DIG 11 197 | #define CL_HALF_MAX_10_EXP +4 198 | #define CL_HALF_MAX_EXP +16 199 | #define CL_HALF_MIN_10_EXP -4 200 | #define CL_HALF_MIN_EXP -13 201 | #define CL_HALF_RADIX 2 202 | #define CL_HALF_MAX 65504.0f 203 | #define CL_HALF_MIN 6.103515625e-05f 204 | #define CL_HALF_EPSILON 9.765625e-04f 205 | 206 | #define CL_DBL_DIG 15 207 | #define CL_DBL_MANT_DIG 53 208 | #define CL_DBL_MAX_10_EXP +308 209 | #define CL_DBL_MAX_EXP +1024 210 | #define CL_DBL_MIN_10_EXP -307 211 | #define CL_DBL_MIN_EXP -1021 212 | #define CL_DBL_RADIX 2 213 | #define CL_DBL_MAX 1.7976931348623158e+308 214 | #define CL_DBL_MIN 2.225073858507201383090e-308 215 | #define CL_DBL_EPSILON 2.220446049250313080847e-16 216 | 217 | #define CL_M_E 2.7182818284590452354 218 | #define CL_M_LOG2E 1.4426950408889634074 219 | #define CL_M_LOG10E 0.43429448190325182765 220 | #define CL_M_LN2 0.69314718055994530942 221 | #define CL_M_LN10 2.30258509299404568402 222 | #define CL_M_PI 3.14159265358979323846 223 | #define CL_M_PI_2 1.57079632679489661923 224 | #define CL_M_PI_4 0.78539816339744830962 225 | #define CL_M_1_PI 0.31830988618379067154 226 | #define CL_M_2_PI 
0.63661977236758134308 227 | #define CL_M_2_SQRTPI 1.12837916709551257390 228 | #define CL_M_SQRT2 1.41421356237309504880 229 | #define CL_M_SQRT1_2 0.70710678118654752440 230 | 231 | #define CL_M_E_F 2.718281828f 232 | #define CL_M_LOG2E_F 1.442695041f 233 | #define CL_M_LOG10E_F 0.434294482f 234 | #define CL_M_LN2_F 0.693147181f 235 | #define CL_M_LN10_F 2.302585093f 236 | #define CL_M_PI_F 3.141592654f 237 | #define CL_M_PI_2_F 1.570796327f 238 | #define CL_M_PI_4_F 0.785398163f 239 | #define CL_M_1_PI_F 0.318309886f 240 | #define CL_M_2_PI_F 0.636619772f 241 | #define CL_M_2_SQRTPI_F 1.128379167f 242 | #define CL_M_SQRT2_F 1.414213562f 243 | #define CL_M_SQRT1_2_F 0.707106781f 244 | 245 | #define CL_NAN (CL_INFINITY - CL_INFINITY) 246 | #define CL_HUGE_VALF ((cl_float) 1e50) 247 | #define CL_HUGE_VAL ((cl_double) 1e500) 248 | #define CL_MAXFLOAT CL_FLT_MAX 249 | #define CL_INFINITY CL_HUGE_VALF 250 | 251 | #else 252 | 253 | #include 254 | 255 | /* scalar types */ 256 | typedef int8_t cl_char; 257 | typedef uint8_t cl_uchar; 258 | typedef int16_t cl_short; 259 | typedef uint16_t cl_ushort; 260 | typedef int32_t cl_int; 261 | typedef uint32_t cl_uint; 262 | typedef int64_t cl_long; 263 | typedef uint64_t cl_ulong; 264 | 265 | typedef uint16_t cl_half; 266 | typedef float cl_float; 267 | typedef double cl_double; 268 | 269 | /* Macro names and corresponding values defined by OpenCL */ 270 | #define CL_CHAR_BIT 8 271 | #define CL_SCHAR_MAX 127 272 | #define CL_SCHAR_MIN (-127-1) 273 | #define CL_CHAR_MAX CL_SCHAR_MAX 274 | #define CL_CHAR_MIN CL_SCHAR_MIN 275 | #define CL_UCHAR_MAX 255 276 | #define CL_SHRT_MAX 32767 277 | #define CL_SHRT_MIN (-32767-1) 278 | #define CL_USHRT_MAX 65535 279 | #define CL_INT_MAX 2147483647 280 | #define CL_INT_MIN (-2147483647-1) 281 | #define CL_UINT_MAX 0xffffffffU 282 | #define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) 283 | #define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) 284 | #define CL_ULONG_MAX ((cl_ulong) 
0xFFFFFFFFFFFFFFFFULL) 285 | 286 | #define CL_FLT_DIG 6 287 | #define CL_FLT_MANT_DIG 24 288 | #define CL_FLT_MAX_10_EXP +38 289 | #define CL_FLT_MAX_EXP +128 290 | #define CL_FLT_MIN_10_EXP -37 291 | #define CL_FLT_MIN_EXP -125 292 | #define CL_FLT_RADIX 2 293 | #define CL_FLT_MAX 340282346638528859811704183484516925440.0f 294 | #define CL_FLT_MIN 1.175494350822287507969e-38f 295 | #define CL_FLT_EPSILON 1.1920928955078125e-7f 296 | 297 | #define CL_HALF_DIG 3 298 | #define CL_HALF_MANT_DIG 11 299 | #define CL_HALF_MAX_10_EXP +4 300 | #define CL_HALF_MAX_EXP +16 301 | #define CL_HALF_MIN_10_EXP -4 302 | #define CL_HALF_MIN_EXP -13 303 | #define CL_HALF_RADIX 2 304 | #define CL_HALF_MAX 65504.0f 305 | #define CL_HALF_MIN 6.103515625e-05f 306 | #define CL_HALF_EPSILON 9.765625e-04f 307 | 308 | #define CL_DBL_DIG 15 309 | #define CL_DBL_MANT_DIG 53 310 | #define CL_DBL_MAX_10_EXP +308 311 | #define CL_DBL_MAX_EXP +1024 312 | #define CL_DBL_MIN_10_EXP -307 313 | #define CL_DBL_MIN_EXP -1021 314 | #define CL_DBL_RADIX 2 315 | #define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 316 | #define CL_DBL_MIN 2.225073858507201383090e-308 317 | #define CL_DBL_EPSILON 2.220446049250313080847e-16 318 | 319 | #define CL_M_E 2.7182818284590452354 320 | #define CL_M_LOG2E 1.4426950408889634074 321 | #define CL_M_LOG10E 0.43429448190325182765 322 | #define CL_M_LN2 0.69314718055994530942 323 | #define CL_M_LN10 2.30258509299404568402 324 | #define CL_M_PI 3.14159265358979323846 325 | #define CL_M_PI_2 1.57079632679489661923 326 | #define CL_M_PI_4 0.78539816339744830962 327 | #define CL_M_1_PI 0.31830988618379067154 328 | #define CL_M_2_PI 0.63661977236758134308 329 | #define CL_M_2_SQRTPI 
1.12837916709551257390 330 | #define CL_M_SQRT2 1.41421356237309504880 331 | #define CL_M_SQRT1_2 0.70710678118654752440 332 | 333 | #define CL_M_E_F 2.718281828f 334 | #define CL_M_LOG2E_F 1.442695041f 335 | #define CL_M_LOG10E_F 0.434294482f 336 | #define CL_M_LN2_F 0.693147181f 337 | #define CL_M_LN10_F 2.302585093f 338 | #define CL_M_PI_F 3.141592654f 339 | #define CL_M_PI_2_F 1.570796327f 340 | #define CL_M_PI_4_F 0.785398163f 341 | #define CL_M_1_PI_F 0.318309886f 342 | #define CL_M_2_PI_F 0.636619772f 343 | #define CL_M_2_SQRTPI_F 1.128379167f 344 | #define CL_M_SQRT2_F 1.414213562f 345 | #define CL_M_SQRT1_2_F 0.707106781f 346 | 347 | #if defined( __GNUC__ ) 348 | #define CL_HUGE_VALF __builtin_huge_valf() 349 | #define CL_HUGE_VAL __builtin_huge_val() 350 | #define CL_NAN __builtin_nanf( "" ) 351 | #else 352 | #define CL_HUGE_VALF ((cl_float) 1e50) 353 | #define CL_HUGE_VAL ((cl_double) 1e500) 354 | float nanf( const char * ); 355 | #define CL_NAN nanf( "" ) 356 | #endif 357 | #define CL_MAXFLOAT CL_FLT_MAX 358 | #define CL_INFINITY CL_HUGE_VALF 359 | 360 | #endif 361 | 362 | #include 363 | 364 | /* 365 | * Vector types 366 | * 367 | * Note: OpenCL requires that all types be naturally aligned. 368 | * This means that vector types must be naturally aligned. 369 | * For example, a vector of four floats must be aligned to 370 | * a 16 byte boundary (calculated as 4 * the natural 4-byte 371 | * alignment of the float). The alignment qualifiers here 372 | * will only function properly if your compiler supports them 373 | * and if you don't actively work to defeat them. For example, 374 | * in order for a cl_float4 to be 16 byte aligned in a struct, 375 | * the start of the struct must itself be 16-byte aligned. 376 | * 377 | * Maintaining proper alignment is the user's responsibility. 378 | */ 379 | 380 | /* Define basic vector types */ 381 | #if defined( __VEC__ ) 382 | #if !defined(__clang__) 383 | #include /* may be omitted depending on compiler. 
AltiVec spec provides no way to detect whether the header is required. */ 384 | #endif 385 | typedef __vector unsigned char __cl_uchar16; 386 | typedef __vector signed char __cl_char16; 387 | typedef __vector unsigned short __cl_ushort8; 388 | typedef __vector signed short __cl_short8; 389 | typedef __vector unsigned int __cl_uint4; 390 | typedef __vector signed int __cl_int4; 391 | typedef __vector float __cl_float4; 392 | #define __CL_UCHAR16__ 1 393 | #define __CL_CHAR16__ 1 394 | #define __CL_USHORT8__ 1 395 | #define __CL_SHORT8__ 1 396 | #define __CL_UINT4__ 1 397 | #define __CL_INT4__ 1 398 | #define __CL_FLOAT4__ 1 399 | #endif 400 | 401 | #if defined( __SSE__ ) 402 | #if defined( __MINGW64__ ) 403 | #include 404 | #else 405 | #include 406 | #endif 407 | #if defined( __GNUC__ ) 408 | typedef float __cl_float4 __attribute__((vector_size(16))); 409 | #else 410 | typedef __m128 __cl_float4; 411 | #endif 412 | #define __CL_FLOAT4__ 1 413 | #endif 414 | 415 | #if defined( __SSE2__ ) 416 | #if defined( __MINGW64__ ) 417 | #include 418 | #else 419 | #include 420 | #endif 421 | #if defined( __GNUC__ ) 422 | typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); 423 | typedef cl_char __cl_char16 __attribute__((vector_size(16))); 424 | typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); 425 | typedef cl_short __cl_short8 __attribute__((vector_size(16))); 426 | typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); 427 | typedef cl_int __cl_int4 __attribute__((vector_size(16))); 428 | typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); 429 | typedef cl_long __cl_long2 __attribute__((vector_size(16))); 430 | typedef cl_double __cl_double2 __attribute__((vector_size(16))); 431 | #else 432 | typedef __m128i __cl_uchar16; 433 | typedef __m128i __cl_char16; 434 | typedef __m128i __cl_ushort8; 435 | typedef __m128i __cl_short8; 436 | typedef __m128i __cl_uint4; 437 | typedef __m128i __cl_int4; 438 | typedef __m128i __cl_ulong2; 439 | 
typedef __m128i __cl_long2; 440 | typedef __m128d __cl_double2; 441 | #endif 442 | #define __CL_UCHAR16__ 1 443 | #define __CL_CHAR16__ 1 444 | #define __CL_USHORT8__ 1 445 | #define __CL_SHORT8__ 1 446 | #define __CL_INT4__ 1 447 | #define __CL_UINT4__ 1 448 | #define __CL_ULONG2__ 1 449 | #define __CL_LONG2__ 1 450 | #define __CL_DOUBLE2__ 1 451 | #endif 452 | 453 | #if defined( __MMX__ ) 454 | #include 455 | #if defined( __GNUC__ ) 456 | typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); 457 | typedef cl_char __cl_char8 __attribute__((vector_size(8))); 458 | typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); 459 | typedef cl_short __cl_short4 __attribute__((vector_size(8))); 460 | typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); 461 | typedef cl_int __cl_int2 __attribute__((vector_size(8))); 462 | typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); 463 | typedef cl_long __cl_long1 __attribute__((vector_size(8))); 464 | typedef cl_float __cl_float2 __attribute__((vector_size(8))); 465 | #else 466 | typedef __m64 __cl_uchar8; 467 | typedef __m64 __cl_char8; 468 | typedef __m64 __cl_ushort4; 469 | typedef __m64 __cl_short4; 470 | typedef __m64 __cl_uint2; 471 | typedef __m64 __cl_int2; 472 | typedef __m64 __cl_ulong1; 473 | typedef __m64 __cl_long1; 474 | typedef __m64 __cl_float2; 475 | #endif 476 | #define __CL_UCHAR8__ 1 477 | #define __CL_CHAR8__ 1 478 | #define __CL_USHORT4__ 1 479 | #define __CL_SHORT4__ 1 480 | #define __CL_INT2__ 1 481 | #define __CL_UINT2__ 1 482 | #define __CL_ULONG1__ 1 483 | #define __CL_LONG1__ 1 484 | #define __CL_FLOAT2__ 1 485 | #endif 486 | 487 | #if defined( __AVX__ ) 488 | #if defined( __MINGW64__ ) 489 | #include 490 | #else 491 | #include 492 | #endif 493 | #if defined( __GNUC__ ) 494 | typedef cl_float __cl_float8 __attribute__((vector_size(32))); 495 | typedef cl_double __cl_double4 __attribute__((vector_size(32))); 496 | #else 497 | typedef __m256 __cl_float8; 498 | typedef 
__m256d __cl_double4; 499 | #endif 500 | #define __CL_FLOAT8__ 1 501 | #define __CL_DOUBLE4__ 1 502 | #endif 503 | 504 | /* Define capabilities for anonymous struct members. */ 505 | #if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L 506 | #define __CL_HAS_ANON_STRUCT__ 1 507 | #define __CL_ANON_STRUCT__ 508 | #elif defined(_WIN32) && defined(_MSC_VER) && !defined(__STDC__) 509 | #define __CL_HAS_ANON_STRUCT__ 1 510 | #define __CL_ANON_STRUCT__ 511 | #elif defined(__GNUC__) && ! defined(__STRICT_ANSI__) 512 | #define __CL_HAS_ANON_STRUCT__ 1 513 | #define __CL_ANON_STRUCT__ __extension__ 514 | #elif defined(__clang__) 515 | #define __CL_HAS_ANON_STRUCT__ 1 516 | #define __CL_ANON_STRUCT__ __extension__ 517 | #else 518 | #define __CL_HAS_ANON_STRUCT__ 0 519 | #define __CL_ANON_STRUCT__ 520 | #endif 521 | 522 | #if defined(_WIN32) && defined(_MSC_VER) && __CL_HAS_ANON_STRUCT__ 523 | /* Disable warning C4201: nonstandard extension used : nameless struct/union */ 524 | #pragma warning( push ) 525 | #pragma warning( disable : 4201 ) 526 | #endif 527 | 528 | /* Define alignment keys */ 529 | #if defined( __GNUC__ ) || defined(__INTEGRITY) 530 | #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) 531 | #elif defined( _WIN32) && (_MSC_VER) 532 | /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ 533 | /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ 534 | /* #include */ 535 | /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ 536 | #define CL_ALIGNED(_x) 537 | #else 538 | #warning Need to implement some method to align data here 539 | #define CL_ALIGNED(_x) 540 | #endif 541 | 542 | /* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ 543 | #if __CL_HAS_ANON_STRUCT__ 544 | /* .xyzw and .s0123...{f|F} are supported */ 545 | #define CL_HAS_NAMED_VECTOR_FIELDS 1 546 | /* .hi and .lo are supported */ 547 | #define CL_HAS_HI_LO_VECTOR_FIELDS 1 548 | #endif 
549 | 550 | /* Define cl_vector types */ 551 | 552 | /* ---- cl_charn ---- */ 553 | typedef union 554 | { 555 | cl_char CL_ALIGNED(2) s[2]; 556 | #if __CL_HAS_ANON_STRUCT__ 557 | __CL_ANON_STRUCT__ struct{ cl_char x, y; }; 558 | __CL_ANON_STRUCT__ struct{ cl_char s0, s1; }; 559 | __CL_ANON_STRUCT__ struct{ cl_char lo, hi; }; 560 | #endif 561 | #if defined( __CL_CHAR2__) 562 | __cl_char2 v2; 563 | #endif 564 | }cl_char2; 565 | 566 | typedef union 567 | { 568 | cl_char CL_ALIGNED(4) s[4]; 569 | #if __CL_HAS_ANON_STRUCT__ 570 | __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; 571 | __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; }; 572 | __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; }; 573 | #endif 574 | #if defined( __CL_CHAR2__) 575 | __cl_char2 v2[2]; 576 | #endif 577 | #if defined( __CL_CHAR4__) 578 | __cl_char4 v4; 579 | #endif 580 | }cl_char4; 581 | 582 | /* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */ 583 | typedef cl_char4 cl_char3; 584 | 585 | typedef union 586 | { 587 | cl_char CL_ALIGNED(8) s[8]; 588 | #if __CL_HAS_ANON_STRUCT__ 589 | __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; 590 | __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; 591 | __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; }; 592 | #endif 593 | #if defined( __CL_CHAR2__) 594 | __cl_char2 v2[4]; 595 | #endif 596 | #if defined( __CL_CHAR4__) 597 | __cl_char4 v4[2]; 598 | #endif 599 | #if defined( __CL_CHAR8__ ) 600 | __cl_char8 v8; 601 | #endif 602 | }cl_char8; 603 | 604 | typedef union 605 | { 606 | cl_char CL_ALIGNED(16) s[16]; 607 | #if __CL_HAS_ANON_STRUCT__ 608 | __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; 609 | __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; 610 | __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; }; 611 | #endif 612 | #if defined( __CL_CHAR2__) 613 | __cl_char2 
v2[8]; 614 | #endif 615 | #if defined( __CL_CHAR4__) 616 | __cl_char4 v4[4]; 617 | #endif 618 | #if defined( __CL_CHAR8__ ) 619 | __cl_char8 v8[2]; 620 | #endif 621 | #if defined( __CL_CHAR16__ ) 622 | __cl_char16 v16; 623 | #endif 624 | }cl_char16; 625 | 626 | 627 | /* ---- cl_ucharn ---- */ 628 | typedef union 629 | { 630 | cl_uchar CL_ALIGNED(2) s[2]; 631 | #if __CL_HAS_ANON_STRUCT__ 632 | __CL_ANON_STRUCT__ struct{ cl_uchar x, y; }; 633 | __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; }; 634 | __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; }; 635 | #endif 636 | #if defined( __cl_uchar2__) 637 | __cl_uchar2 v2; 638 | #endif 639 | }cl_uchar2; 640 | 641 | typedef union 642 | { 643 | cl_uchar CL_ALIGNED(4) s[4]; 644 | #if __CL_HAS_ANON_STRUCT__ 645 | __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; 646 | __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; }; 647 | __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; }; 648 | #endif 649 | #if defined( __CL_UCHAR2__) 650 | __cl_uchar2 v2[2]; 651 | #endif 652 | #if defined( __CL_UCHAR4__) 653 | __cl_uchar4 v4; 654 | #endif 655 | }cl_uchar4; 656 | 657 | /* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. 
*/ 658 | typedef cl_uchar4 cl_uchar3; 659 | 660 | typedef union 661 | { 662 | cl_uchar CL_ALIGNED(8) s[8]; 663 | #if __CL_HAS_ANON_STRUCT__ 664 | __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; 665 | __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; 666 | __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; }; 667 | #endif 668 | #if defined( __CL_UCHAR2__) 669 | __cl_uchar2 v2[4]; 670 | #endif 671 | #if defined( __CL_UCHAR4__) 672 | __cl_uchar4 v4[2]; 673 | #endif 674 | #if defined( __CL_UCHAR8__ ) 675 | __cl_uchar8 v8; 676 | #endif 677 | }cl_uchar8; 678 | 679 | typedef union 680 | { 681 | cl_uchar CL_ALIGNED(16) s[16]; 682 | #if __CL_HAS_ANON_STRUCT__ 683 | __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; 684 | __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; 685 | __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; }; 686 | #endif 687 | #if defined( __CL_UCHAR2__) 688 | __cl_uchar2 v2[8]; 689 | #endif 690 | #if defined( __CL_UCHAR4__) 691 | __cl_uchar4 v4[4]; 692 | #endif 693 | #if defined( __CL_UCHAR8__ ) 694 | __cl_uchar8 v8[2]; 695 | #endif 696 | #if defined( __CL_UCHAR16__ ) 697 | __cl_uchar16 v16; 698 | #endif 699 | }cl_uchar16; 700 | 701 | 702 | /* ---- cl_shortn ---- */ 703 | typedef union 704 | { 705 | cl_short CL_ALIGNED(4) s[2]; 706 | #if __CL_HAS_ANON_STRUCT__ 707 | __CL_ANON_STRUCT__ struct{ cl_short x, y; }; 708 | __CL_ANON_STRUCT__ struct{ cl_short s0, s1; }; 709 | __CL_ANON_STRUCT__ struct{ cl_short lo, hi; }; 710 | #endif 711 | #if defined( __CL_SHORT2__) 712 | __cl_short2 v2; 713 | #endif 714 | }cl_short2; 715 | 716 | typedef union 717 | { 718 | cl_short CL_ALIGNED(8) s[4]; 719 | #if __CL_HAS_ANON_STRUCT__ 720 | __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; 721 | __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; }; 722 | __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; }; 723 | #endif 724 | 
#if defined( __CL_SHORT2__) 725 | __cl_short2 v2[2]; 726 | #endif 727 | #if defined( __CL_SHORT4__) 728 | __cl_short4 v4; 729 | #endif 730 | }cl_short4; 731 | 732 | /* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */ 733 | typedef cl_short4 cl_short3; 734 | 735 | typedef union 736 | { 737 | cl_short CL_ALIGNED(16) s[8]; 738 | #if __CL_HAS_ANON_STRUCT__ 739 | __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; 740 | __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; 741 | __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; }; 742 | #endif 743 | #if defined( __CL_SHORT2__) 744 | __cl_short2 v2[4]; 745 | #endif 746 | #if defined( __CL_SHORT4__) 747 | __cl_short4 v4[2]; 748 | #endif 749 | #if defined( __CL_SHORT8__ ) 750 | __cl_short8 v8; 751 | #endif 752 | }cl_short8; 753 | 754 | typedef union 755 | { 756 | cl_short CL_ALIGNED(32) s[16]; 757 | #if __CL_HAS_ANON_STRUCT__ 758 | __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; 759 | __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; 760 | __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; }; 761 | #endif 762 | #if defined( __CL_SHORT2__) 763 | __cl_short2 v2[8]; 764 | #endif 765 | #if defined( __CL_SHORT4__) 766 | __cl_short4 v4[4]; 767 | #endif 768 | #if defined( __CL_SHORT8__ ) 769 | __cl_short8 v8[2]; 770 | #endif 771 | #if defined( __CL_SHORT16__ ) 772 | __cl_short16 v16; 773 | #endif 774 | }cl_short16; 775 | 776 | 777 | /* ---- cl_ushortn ---- */ 778 | typedef union 779 | { 780 | cl_ushort CL_ALIGNED(4) s[2]; 781 | #if __CL_HAS_ANON_STRUCT__ 782 | __CL_ANON_STRUCT__ struct{ cl_ushort x, y; }; 783 | __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; }; 784 | __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; }; 785 | #endif 786 | #if defined( __CL_USHORT2__) 787 | __cl_ushort2 v2; 788 | #endif 789 | }cl_ushort2; 790 | 791 | typedef union 
792 | { 793 | cl_ushort CL_ALIGNED(8) s[4]; 794 | #if __CL_HAS_ANON_STRUCT__ 795 | __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; 796 | __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; }; 797 | __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; }; 798 | #endif 799 | #if defined( __CL_USHORT2__) 800 | __cl_ushort2 v2[2]; 801 | #endif 802 | #if defined( __CL_USHORT4__) 803 | __cl_ushort4 v4; 804 | #endif 805 | }cl_ushort4; 806 | 807 | /* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */ 808 | typedef cl_ushort4 cl_ushort3; 809 | 810 | typedef union 811 | { 812 | cl_ushort CL_ALIGNED(16) s[8]; 813 | #if __CL_HAS_ANON_STRUCT__ 814 | __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; 815 | __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; 816 | __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; }; 817 | #endif 818 | #if defined( __CL_USHORT2__) 819 | __cl_ushort2 v2[4]; 820 | #endif 821 | #if defined( __CL_USHORT4__) 822 | __cl_ushort4 v4[2]; 823 | #endif 824 | #if defined( __CL_USHORT8__ ) 825 | __cl_ushort8 v8; 826 | #endif 827 | }cl_ushort8; 828 | 829 | typedef union 830 | { 831 | cl_ushort CL_ALIGNED(32) s[16]; 832 | #if __CL_HAS_ANON_STRUCT__ 833 | __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; 834 | __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; 835 | __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; }; 836 | #endif 837 | #if defined( __CL_USHORT2__) 838 | __cl_ushort2 v2[8]; 839 | #endif 840 | #if defined( __CL_USHORT4__) 841 | __cl_ushort4 v4[4]; 842 | #endif 843 | #if defined( __CL_USHORT8__ ) 844 | __cl_ushort8 v8[2]; 845 | #endif 846 | #if defined( __CL_USHORT16__ ) 847 | __cl_ushort16 v16; 848 | #endif 849 | }cl_ushort16; 850 | 851 | 852 | /* ---- cl_halfn ---- */ 853 | typedef union 854 | { 855 | cl_half CL_ALIGNED(4) s[2]; 856 | #if 
__CL_HAS_ANON_STRUCT__ 857 | __CL_ANON_STRUCT__ struct{ cl_half x, y; }; 858 | __CL_ANON_STRUCT__ struct{ cl_half s0, s1; }; 859 | __CL_ANON_STRUCT__ struct{ cl_half lo, hi; }; 860 | #endif 861 | #if defined( __CL_HALF2__) 862 | __cl_half2 v2; 863 | #endif 864 | }cl_half2; 865 | 866 | typedef union 867 | { 868 | cl_half CL_ALIGNED(8) s[4]; 869 | #if __CL_HAS_ANON_STRUCT__ 870 | __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; 871 | __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; }; 872 | __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; }; 873 | #endif 874 | #if defined( __CL_HALF2__) 875 | __cl_half2 v2[2]; 876 | #endif 877 | #if defined( __CL_HALF4__) 878 | __cl_half4 v4; 879 | #endif 880 | }cl_half4; 881 | 882 | /* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */ 883 | typedef cl_half4 cl_half3; 884 | 885 | typedef union 886 | { 887 | cl_half CL_ALIGNED(16) s[8]; 888 | #if __CL_HAS_ANON_STRUCT__ 889 | __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; 890 | __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; }; 891 | __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; }; 892 | #endif 893 | #if defined( __CL_HALF2__) 894 | __cl_half2 v2[4]; 895 | #endif 896 | #if defined( __CL_HALF4__) 897 | __cl_half4 v4[2]; 898 | #endif 899 | #if defined( __CL_HALF8__ ) 900 | __cl_half8 v8; 901 | #endif 902 | }cl_half8; 903 | 904 | typedef union 905 | { 906 | cl_half CL_ALIGNED(32) s[16]; 907 | #if __CL_HAS_ANON_STRUCT__ 908 | __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; 909 | __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; 910 | __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; }; 911 | #endif 912 | #if defined( __CL_HALF2__) 913 | __cl_half2 v2[8]; 914 | #endif 915 | #if defined( __CL_HALF4__) 916 | __cl_half4 v4[4]; 917 | #endif 918 | #if defined( __CL_HALF8__ ) 919 | __cl_half8 v8[2]; 920 | 
#endif 921 | #if defined( __CL_HALF16__ ) 922 | __cl_half16 v16; 923 | #endif 924 | }cl_half16; 925 | 926 | /* ---- cl_intn ---- */ 927 | typedef union 928 | { 929 | cl_int CL_ALIGNED(8) s[2]; 930 | #if __CL_HAS_ANON_STRUCT__ 931 | __CL_ANON_STRUCT__ struct{ cl_int x, y; }; 932 | __CL_ANON_STRUCT__ struct{ cl_int s0, s1; }; 933 | __CL_ANON_STRUCT__ struct{ cl_int lo, hi; }; 934 | #endif 935 | #if defined( __CL_INT2__) 936 | __cl_int2 v2; 937 | #endif 938 | }cl_int2; 939 | 940 | typedef union 941 | { 942 | cl_int CL_ALIGNED(16) s[4]; 943 | #if __CL_HAS_ANON_STRUCT__ 944 | __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; 945 | __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; }; 946 | __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; }; 947 | #endif 948 | #if defined( __CL_INT2__) 949 | __cl_int2 v2[2]; 950 | #endif 951 | #if defined( __CL_INT4__) 952 | __cl_int4 v4; 953 | #endif 954 | }cl_int4; 955 | 956 | /* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */ 957 | typedef cl_int4 cl_int3; 958 | 959 | typedef union 960 | { 961 | cl_int CL_ALIGNED(32) s[8]; 962 | #if __CL_HAS_ANON_STRUCT__ 963 | __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; 964 | __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; 965 | __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; }; 966 | #endif 967 | #if defined( __CL_INT2__) 968 | __cl_int2 v2[4]; 969 | #endif 970 | #if defined( __CL_INT4__) 971 | __cl_int4 v4[2]; 972 | #endif 973 | #if defined( __CL_INT8__ ) 974 | __cl_int8 v8; 975 | #endif 976 | }cl_int8; 977 | 978 | typedef union 979 | { 980 | cl_int CL_ALIGNED(64) s[16]; 981 | #if __CL_HAS_ANON_STRUCT__ 982 | __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; 983 | __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; 984 | __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; }; 985 | #endif 986 | #if defined( __CL_INT2__) 
987 | __cl_int2 v2[8]; 988 | #endif 989 | #if defined( __CL_INT4__) 990 | __cl_int4 v4[4]; 991 | #endif 992 | #if defined( __CL_INT8__ ) 993 | __cl_int8 v8[2]; 994 | #endif 995 | #if defined( __CL_INT16__ ) 996 | __cl_int16 v16; 997 | #endif 998 | }cl_int16; 999 | 1000 | 1001 | /* ---- cl_uintn ---- */ 1002 | typedef union 1003 | { 1004 | cl_uint CL_ALIGNED(8) s[2]; 1005 | #if __CL_HAS_ANON_STRUCT__ 1006 | __CL_ANON_STRUCT__ struct{ cl_uint x, y; }; 1007 | __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; }; 1008 | __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; }; 1009 | #endif 1010 | #if defined( __CL_UINT2__) 1011 | __cl_uint2 v2; 1012 | #endif 1013 | }cl_uint2; 1014 | 1015 | typedef union 1016 | { 1017 | cl_uint CL_ALIGNED(16) s[4]; 1018 | #if __CL_HAS_ANON_STRUCT__ 1019 | __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; 1020 | __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; }; 1021 | __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; }; 1022 | #endif 1023 | #if defined( __CL_UINT2__) 1024 | __cl_uint2 v2[2]; 1025 | #endif 1026 | #if defined( __CL_UINT4__) 1027 | __cl_uint4 v4; 1028 | #endif 1029 | }cl_uint4; 1030 | 1031 | /* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. 
*/ 1032 | typedef cl_uint4 cl_uint3; 1033 | 1034 | typedef union 1035 | { 1036 | cl_uint CL_ALIGNED(32) s[8]; 1037 | #if __CL_HAS_ANON_STRUCT__ 1038 | __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; 1039 | __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; 1040 | __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; }; 1041 | #endif 1042 | #if defined( __CL_UINT2__) 1043 | __cl_uint2 v2[4]; 1044 | #endif 1045 | #if defined( __CL_UINT4__) 1046 | __cl_uint4 v4[2]; 1047 | #endif 1048 | #if defined( __CL_UINT8__ ) 1049 | __cl_uint8 v8; 1050 | #endif 1051 | }cl_uint8; 1052 | 1053 | typedef union 1054 | { 1055 | cl_uint CL_ALIGNED(64) s[16]; 1056 | #if __CL_HAS_ANON_STRUCT__ 1057 | __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; 1058 | __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; 1059 | __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; }; 1060 | #endif 1061 | #if defined( __CL_UINT2__) 1062 | __cl_uint2 v2[8]; 1063 | #endif 1064 | #if defined( __CL_UINT4__) 1065 | __cl_uint4 v4[4]; 1066 | #endif 1067 | #if defined( __CL_UINT8__ ) 1068 | __cl_uint8 v8[2]; 1069 | #endif 1070 | #if defined( __CL_UINT16__ ) 1071 | __cl_uint16 v16; 1072 | #endif 1073 | }cl_uint16; 1074 | 1075 | /* ---- cl_longn ---- */ 1076 | typedef union 1077 | { 1078 | cl_long CL_ALIGNED(16) s[2]; 1079 | #if __CL_HAS_ANON_STRUCT__ 1080 | __CL_ANON_STRUCT__ struct{ cl_long x, y; }; 1081 | __CL_ANON_STRUCT__ struct{ cl_long s0, s1; }; 1082 | __CL_ANON_STRUCT__ struct{ cl_long lo, hi; }; 1083 | #endif 1084 | #if defined( __CL_LONG2__) 1085 | __cl_long2 v2; 1086 | #endif 1087 | }cl_long2; 1088 | 1089 | typedef union 1090 | { 1091 | cl_long CL_ALIGNED(32) s[4]; 1092 | #if __CL_HAS_ANON_STRUCT__ 1093 | __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; 1094 | __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; }; 1095 | __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; 
}; 1096 | #endif 1097 | #if defined( __CL_LONG2__) 1098 | __cl_long2 v2[2]; 1099 | #endif 1100 | #if defined( __CL_LONG4__) 1101 | __cl_long4 v4; 1102 | #endif 1103 | }cl_long4; 1104 | 1105 | /* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */ 1106 | typedef cl_long4 cl_long3; 1107 | 1108 | typedef union 1109 | { 1110 | cl_long CL_ALIGNED(64) s[8]; 1111 | #if __CL_HAS_ANON_STRUCT__ 1112 | __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; 1113 | __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; 1114 | __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; }; 1115 | #endif 1116 | #if defined( __CL_LONG2__) 1117 | __cl_long2 v2[4]; 1118 | #endif 1119 | #if defined( __CL_LONG4__) 1120 | __cl_long4 v4[2]; 1121 | #endif 1122 | #if defined( __CL_LONG8__ ) 1123 | __cl_long8 v8; 1124 | #endif 1125 | }cl_long8; 1126 | 1127 | typedef union 1128 | { 1129 | cl_long CL_ALIGNED(128) s[16]; 1130 | #if __CL_HAS_ANON_STRUCT__ 1131 | __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; 1132 | __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; 1133 | __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; }; 1134 | #endif 1135 | #if defined( __CL_LONG2__) 1136 | __cl_long2 v2[8]; 1137 | #endif 1138 | #if defined( __CL_LONG4__) 1139 | __cl_long4 v4[4]; 1140 | #endif 1141 | #if defined( __CL_LONG8__ ) 1142 | __cl_long8 v8[2]; 1143 | #endif 1144 | #if defined( __CL_LONG16__ ) 1145 | __cl_long16 v16; 1146 | #endif 1147 | }cl_long16; 1148 | 1149 | 1150 | /* ---- cl_ulongn ---- */ 1151 | typedef union 1152 | { 1153 | cl_ulong CL_ALIGNED(16) s[2]; 1154 | #if __CL_HAS_ANON_STRUCT__ 1155 | __CL_ANON_STRUCT__ struct{ cl_ulong x, y; }; 1156 | __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; }; 1157 | __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; }; 1158 | #endif 1159 | #if defined( __CL_ULONG2__) 1160 | __cl_ulong2 v2; 1161 | 
#endif 1162 | }cl_ulong2; 1163 | 1164 | typedef union 1165 | { 1166 | cl_ulong CL_ALIGNED(32) s[4]; 1167 | #if __CL_HAS_ANON_STRUCT__ 1168 | __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; 1169 | __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; }; 1170 | __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; }; 1171 | #endif 1172 | #if defined( __CL_ULONG2__) 1173 | __cl_ulong2 v2[2]; 1174 | #endif 1175 | #if defined( __CL_ULONG4__) 1176 | __cl_ulong4 v4; 1177 | #endif 1178 | }cl_ulong4; 1179 | 1180 | /* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */ 1181 | typedef cl_ulong4 cl_ulong3; 1182 | 1183 | typedef union 1184 | { 1185 | cl_ulong CL_ALIGNED(64) s[8]; 1186 | #if __CL_HAS_ANON_STRUCT__ 1187 | __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; 1188 | __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; 1189 | __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; }; 1190 | #endif 1191 | #if defined( __CL_ULONG2__) 1192 | __cl_ulong2 v2[4]; 1193 | #endif 1194 | #if defined( __CL_ULONG4__) 1195 | __cl_ulong4 v4[2]; 1196 | #endif 1197 | #if defined( __CL_ULONG8__ ) 1198 | __cl_ulong8 v8; 1199 | #endif 1200 | }cl_ulong8; 1201 | 1202 | typedef union 1203 | { 1204 | cl_ulong CL_ALIGNED(128) s[16]; 1205 | #if __CL_HAS_ANON_STRUCT__ 1206 | __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; 1207 | __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; 1208 | __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; }; 1209 | #endif 1210 | #if defined( __CL_ULONG2__) 1211 | __cl_ulong2 v2[8]; 1212 | #endif 1213 | #if defined( __CL_ULONG4__) 1214 | __cl_ulong4 v4[4]; 1215 | #endif 1216 | #if defined( __CL_ULONG8__ ) 1217 | __cl_ulong8 v8[2]; 1218 | #endif 1219 | #if defined( __CL_ULONG16__ ) 1220 | __cl_ulong16 v16; 1221 | #endif 1222 | }cl_ulong16; 1223 | 1224 | 1225 | /* --- cl_floatn ---- */ 1226 | 
1227 | typedef union 1228 | { 1229 | cl_float CL_ALIGNED(8) s[2]; 1230 | #if __CL_HAS_ANON_STRUCT__ 1231 | __CL_ANON_STRUCT__ struct{ cl_float x, y; }; 1232 | __CL_ANON_STRUCT__ struct{ cl_float s0, s1; }; 1233 | __CL_ANON_STRUCT__ struct{ cl_float lo, hi; }; 1234 | #endif 1235 | #if defined( __CL_FLOAT2__) 1236 | __cl_float2 v2; 1237 | #endif 1238 | }cl_float2; 1239 | 1240 | typedef union 1241 | { 1242 | cl_float CL_ALIGNED(16) s[4]; 1243 | #if __CL_HAS_ANON_STRUCT__ 1244 | __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; 1245 | __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; }; 1246 | __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; }; 1247 | #endif 1248 | #if defined( __CL_FLOAT2__) 1249 | __cl_float2 v2[2]; 1250 | #endif 1251 | #if defined( __CL_FLOAT4__) 1252 | __cl_float4 v4; 1253 | #endif 1254 | }cl_float4; 1255 | 1256 | /* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */ 1257 | typedef cl_float4 cl_float3; 1258 | 1259 | typedef union 1260 | { 1261 | cl_float CL_ALIGNED(32) s[8]; 1262 | #if __CL_HAS_ANON_STRUCT__ 1263 | __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; 1264 | __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; 1265 | __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; }; 1266 | #endif 1267 | #if defined( __CL_FLOAT2__) 1268 | __cl_float2 v2[4]; 1269 | #endif 1270 | #if defined( __CL_FLOAT4__) 1271 | __cl_float4 v4[2]; 1272 | #endif 1273 | #if defined( __CL_FLOAT8__ ) 1274 | __cl_float8 v8; 1275 | #endif 1276 | }cl_float8; 1277 | 1278 | typedef union 1279 | { 1280 | cl_float CL_ALIGNED(64) s[16]; 1281 | #if __CL_HAS_ANON_STRUCT__ 1282 | __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; 1283 | __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; 1284 | __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; }; 1285 | #endif 1286 | #if defined( __CL_FLOAT2__) 
1287 | __cl_float2 v2[8]; 1288 | #endif 1289 | #if defined( __CL_FLOAT4__) 1290 | __cl_float4 v4[4]; 1291 | #endif 1292 | #if defined( __CL_FLOAT8__ ) 1293 | __cl_float8 v8[2]; 1294 | #endif 1295 | #if defined( __CL_FLOAT16__ ) 1296 | __cl_float16 v16; 1297 | #endif 1298 | }cl_float16; 1299 | 1300 | /* --- cl_doublen ---- */ 1301 | 1302 | typedef union 1303 | { 1304 | cl_double CL_ALIGNED(16) s[2]; 1305 | #if __CL_HAS_ANON_STRUCT__ 1306 | __CL_ANON_STRUCT__ struct{ cl_double x, y; }; 1307 | __CL_ANON_STRUCT__ struct{ cl_double s0, s1; }; 1308 | __CL_ANON_STRUCT__ struct{ cl_double lo, hi; }; 1309 | #endif 1310 | #if defined( __CL_DOUBLE2__) 1311 | __cl_double2 v2; 1312 | #endif 1313 | }cl_double2; 1314 | 1315 | typedef union 1316 | { 1317 | cl_double CL_ALIGNED(32) s[4]; 1318 | #if __CL_HAS_ANON_STRUCT__ 1319 | __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; 1320 | __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; }; 1321 | __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; }; 1322 | #endif 1323 | #if defined( __CL_DOUBLE2__) 1324 | __cl_double2 v2[2]; 1325 | #endif 1326 | #if defined( __CL_DOUBLE4__) 1327 | __cl_double4 v4; 1328 | #endif 1329 | }cl_double4; 1330 | 1331 | /* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. 
*/ 1332 | typedef cl_double4 cl_double3; 1333 | 1334 | typedef union 1335 | { 1336 | cl_double CL_ALIGNED(64) s[8]; 1337 | #if __CL_HAS_ANON_STRUCT__ 1338 | __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; 1339 | __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; 1340 | __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; }; 1341 | #endif 1342 | #if defined( __CL_DOUBLE2__) 1343 | __cl_double2 v2[4]; 1344 | #endif 1345 | #if defined( __CL_DOUBLE4__) 1346 | __cl_double4 v4[2]; 1347 | #endif 1348 | #if defined( __CL_DOUBLE8__ ) 1349 | __cl_double8 v8; 1350 | #endif 1351 | }cl_double8; 1352 | 1353 | typedef union 1354 | { 1355 | cl_double CL_ALIGNED(128) s[16]; 1356 | #if __CL_HAS_ANON_STRUCT__ 1357 | __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; 1358 | __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; 1359 | __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; }; 1360 | #endif 1361 | #if defined( __CL_DOUBLE2__) 1362 | __cl_double2 v2[8]; 1363 | #endif 1364 | #if defined( __CL_DOUBLE4__) 1365 | __cl_double4 v4[4]; 1366 | #endif 1367 | #if defined( __CL_DOUBLE8__ ) 1368 | __cl_double8 v8[2]; 1369 | #endif 1370 | #if defined( __CL_DOUBLE16__ ) 1371 | __cl_double16 v16; 1372 | #endif 1373 | }cl_double16; 1374 | 1375 | /* Macro to facilitate debugging 1376 | * Usage: 1377 | * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. 
1378 | * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" 1379 | * Each line thereafter of OpenCL C source must end with: \n\ 1380 | * The last line ends in "; 1381 | * 1382 | * Example: 1383 | * 1384 | * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ 1385 | * kernel void foo( int a, float * b ) \n\ 1386 | * { \n\ 1387 | * // my comment \n\ 1388 | * *b[ get_global_id(0)] = a; \n\ 1389 | * } \n\ 1390 | * "; 1391 | * 1392 | * This should correctly set up the line, (column) and file information for your source 1393 | * string so you can do source level debugging. 1394 | */ 1395 | #define __CL_STRINGIFY( _x ) # _x 1396 | #define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) 1397 | #define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" 1398 | 1399 | #ifdef __cplusplus 1400 | } 1401 | #endif 1402 | 1403 | #if defined(_WIN32) && defined(_MSC_VER) && __CL_HAS_ANON_STRUCT__ 1404 | #pragma warning( pop ) 1405 | #endif 1406 | 1407 | #endif /* __CL_PLATFORM_H */ 1408 | --------------------------------------------------------------------------------