├── .gitignore
├── .gitattributes
├── src
    ├── OpenCL
    │   ├── lib
    │   │   ├── OpenCL.lib
    │   │   └── libOpenCL.so
    │   └── include
    │   │   └── CL
    │   │       ├── opencl.h
    │   │       ├── cl_version.h
    │   │       ├── cl_gl.h
    │   │       └── cl_platform.h
    ├── kernel.cpp
    ├── kernel.hpp
    ├── main.cpp
    ├── utilities.hpp
    └── opencl.hpp
├── CITATION.cff
├── make.sh
├── OpenCL-Benchmark.sln
├── LICENSE.md
├── OpenCL-Benchmark.vcxproj
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | bin/
2 | .vs/
3 | OpenCL-Benchmark.vcxproj.user
4 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | src/OpenCL/** linguist-vendored
2 | src/kernel.cpp linguist-language=OpenCL


--------------------------------------------------------------------------------
/src/OpenCL/lib/OpenCL.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectPhysX/OpenCL-Benchmark/HEAD/src/OpenCL/lib/OpenCL.lib


--------------------------------------------------------------------------------
/src/OpenCL/lib/libOpenCL.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProjectPhysX/OpenCL-Benchmark/HEAD/src/OpenCL/lib/libOpenCL.so


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite it as below."
3 | authors:
4 | - family-names: "Lehmann"
5 |   given-names: "Moritz"
6 |   orcid: "https://orcid.org/0000-0002-4652-8383"
7 | title: "OpenCL-Benchmark"
8 | date-released: 2023-04-30
9 | url: "https://github.com/ProjectPhysX/OpenCL-Benchmark"


--------------------------------------------------------------------------------
/make.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # usage: ./make.sh [device ID ...] -- arguments are forwarded to the benchmark; with none, it will benchmark all available devices
 3 | 
 4 | mkdir -p bin # output directory for the executable
 5 | rm -f bin/OpenCL-Benchmark # remove a stale binary so a failed build cannot launch an old version
 6 | common=(src/*.cpp -o bin/OpenCL-Benchmark -std=c++17 -pthread -O -Wno-comment -I./src/OpenCL/include) # compiler arguments shared by every platform
 7 | case "$(uname -a)" in # choose the OpenCL link options from the operating system string
 8 | 	 Darwin*) g++ "${common[@]}" -framework OpenCL               ;; # macOS
 9 | 	*Android) g++ "${common[@]}" -L/system/vendor/lib64 -lOpenCL ;; # Android
10 | 	*       ) g++ "${common[@]}" -L./src/OpenCL/lib -lOpenCL     ;; # Linux
11 | esac
12 | 
13 | if [[ $? == 0 ]]; then bin/OpenCL-Benchmark "$@"; fi # $? is the compiler's exit status: only run the executable after a successful build
14 | 


--------------------------------------------------------------------------------
/OpenCL-Benchmark.sln:
--------------------------------------------------------------------------------
 1 | 
 2 | Microsoft Visual Studio Solution File, Format Version 12.00
 3 | # Visual Studio Version 16
 4 | VisualStudioVersion = 16.0.31729.503
 5 | MinimumVisualStudioVersion = 10.0.40219.1
 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "OpenCL-Benchmark", "OpenCL-Benchmark.vcxproj", "{B07BD873-9CD9-4F0B-AAA8-8AE6FE22F76A}"
 7 | EndProject
 8 | Global
 9 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | 		Release|x64 = Release|x64
11 | 	EndGlobalSection
12 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
13 | 		{B07BD873-9CD9-4F0B-AAA8-8AE6FE22F76A}.Release|x64.ActiveCfg = Release|x64
14 | 		{B07BD873-9CD9-4F0B-AAA8-8AE6FE22F76A}.Release|x64.Build.0 = Release|x64
15 | 	EndGlobalSection
16 | 	GlobalSection(SolutionProperties) = preSolution
17 | 		HideSolutionNode = FALSE
18 | 	EndGlobalSection
19 | 	GlobalSection(ExtensibilityGlobals) = postSolution
20 | 		SolutionGuid = {CF46CF2E-5B57-4081-86EB-6E1333CB46A3}
21 | 	EndGlobalSection
22 | EndGlobal
23 | 


--------------------------------------------------------------------------------
/src/OpenCL/include/CL/opencl.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 |  * Copyright (c) 2008-2021 The Khronos Group Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *    http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  ******************************************************************************/
16 | 
17 | #ifndef __OPENCL_H
18 | #define __OPENCL_H
19 | 
20 | #ifdef __cplusplus
21 | extern "C" {
22 | #endif
23 | 
24 | #include <CL/cl.h>
25 | #include <CL/cl_gl.h>
26 | #include <CL/cl_ext.h>
27 | 
28 | #ifdef __cplusplus
29 | }
30 | #endif
31 | 
32 | #endif  /* __OPENCL_H   */
33 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright (c) 2023-2024 Dr. Moritz Lehmann
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files, to use this software for educational use, non-military research or non-military commercial use, and to alter it and redistribute it freely, subject to the following restrictions:
4 | 
5 | 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation should be provided.
6 | 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
7 | 3. This notice may not be removed or altered from any source distribution.
8 | 
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/src/OpenCL/include/CL/cl_version.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 |  * Copyright (c) 2018-2020 The Khronos Group Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *    http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  ******************************************************************************/
16 | 
17 | #ifndef __CL_VERSION_H
18 | #define __CL_VERSION_H
19 | 
20 | /* Detect which version to target */
21 | #if !defined(CL_TARGET_OPENCL_VERSION)
22 | #pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 300 (OpenCL 3.0)")
23 | #define CL_TARGET_OPENCL_VERSION 300
24 | #endif
25 | #if CL_TARGET_OPENCL_VERSION != 100 && \
26 |     CL_TARGET_OPENCL_VERSION != 110 && \
27 |     CL_TARGET_OPENCL_VERSION != 120 && \
28 |     CL_TARGET_OPENCL_VERSION != 200 && \
29 |     CL_TARGET_OPENCL_VERSION != 210 && \
30 |     CL_TARGET_OPENCL_VERSION != 220 && \
31 |     CL_TARGET_OPENCL_VERSION != 300
32 | #pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220, 300). Defaulting to 300 (OpenCL 3.0)")
33 | #undef CL_TARGET_OPENCL_VERSION
34 | #define CL_TARGET_OPENCL_VERSION 300
35 | #endif
36 | 
37 | 
38 | /* OpenCL Version */
39 | #if CL_TARGET_OPENCL_VERSION >= 300 && !defined(CL_VERSION_3_0)
40 | #define CL_VERSION_3_0  1
41 | #endif
42 | #if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2)
43 | #define CL_VERSION_2_2  1
44 | #endif
45 | #if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1)
46 | #define CL_VERSION_2_1  1
47 | #endif
48 | #if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0)
49 | #define CL_VERSION_2_0  1
50 | #endif
51 | #if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2)
52 | #define CL_VERSION_1_2  1
53 | #endif
54 | #if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1)
55 | #define CL_VERSION_1_1  1
56 | #endif
57 | #if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0)
58 | #define CL_VERSION_1_0  1
59 | #endif
60 | 
61 | /* Allow deprecated APIs for older OpenCL versions. */
62 | #if CL_TARGET_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)
63 | #define CL_USE_DEPRECATED_OPENCL_2_2_APIS
64 | #endif
65 | #if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS)
66 | #define CL_USE_DEPRECATED_OPENCL_2_1_APIS
67 | #endif
68 | #if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS)
69 | #define CL_USE_DEPRECATED_OPENCL_2_0_APIS
70 | #endif
71 | #if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
72 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
73 | #endif
74 | #if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
75 | #define CL_USE_DEPRECATED_OPENCL_1_1_APIS
76 | #endif
77 | #if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS)
78 | #define CL_USE_DEPRECATED_OPENCL_1_0_APIS
79 | #endif
80 | 
81 | #endif  /* __CL_VERSION_H */
82 | 


--------------------------------------------------------------------------------
/OpenCL-Benchmark.vcxproj:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
 3 |   <ItemGroup Label="ProjectConfigurations">
 4 |     <ProjectConfiguration Include="Release|x64">
 5 |       <Configuration>Release</Configuration>
 6 |       <Platform>x64</Platform>
 7 |     </ProjectConfiguration>
 8 |   </ItemGroup>
 9 |   <PropertyGroup Label="Globals">
10 |     <VCProjectVersion>15.0</VCProjectVersion>
11 |     <ProjectGuid>{B07BD873-9CD9-4F0B-AAA8-8AE6FE22F76A}</ProjectGuid>
12 |     <RootNamespace>OpenCL-Benchmark</RootNamespace>
13 |     <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
14 |   </PropertyGroup>
15 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
16 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
17 |     <ConfigurationType>Application</ConfigurationType>
18 |     <UseDebugLibraries>false</UseDebugLibraries>
19 |     <PlatformToolset>v142</PlatformToolset>
20 |     <WholeProgramOptimization>true</WholeProgramOptimization>
21 |     <CharacterSet>MultiByte</CharacterSet>
22 |   </PropertyGroup>
23 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
24 |   <ImportGroup Label="ExtensionSettings">
25 |   </ImportGroup>
26 |   <ImportGroup Label="Shared">
27 |   </ImportGroup>
28 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
29 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
30 |   </ImportGroup>
31 |   <PropertyGroup Label="UserMacros" />
32 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
33 |     <OutDir>$(SolutionDir)bin\</OutDir>
34 |     <IntDir>$(SolutionDir)temp\</IntDir>
35 |   </PropertyGroup>
36 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
37 |     <ClCompile>
38 |       <WarningLevel>Level3</WarningLevel>
39 |       <Optimization>MaxSpeed</Optimization>
40 |       <FunctionLevelLinking>true</FunctionLevelLinking>
41 |       <IntrinsicFunctions>true</IntrinsicFunctions>
42 |       <SDLCheck>true</SDLCheck>
43 |       <ConformanceMode>true</ConformanceMode>
44 |       <AdditionalIncludeDirectories>$(SolutionDir)src\OpenCL\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
45 |       <MultiProcessorCompilation>true</MultiProcessorCompilation>
46 |       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
47 |       <EnableParallelCodeGeneration>true</EnableParallelCodeGeneration>
48 |       <FloatingPointModel>Fast</FloatingPointModel>
49 |       <LanguageStandard>stdcpp17</LanguageStandard>
50 |       <DisableSpecificWarnings>26451;6386;%(DisableSpecificWarnings)</DisableSpecificWarnings>
51 |     </ClCompile>
52 |     <Link>
53 |       <SubSystem>Console</SubSystem>
54 |       <EnableCOMDATFolding>true</EnableCOMDATFolding>
55 |       <OptimizeReferences>true</OptimizeReferences>
56 |       <AdditionalLibraryDirectories>$(SolutionDir)src\OpenCL\lib</AdditionalLibraryDirectories>
57 |       <AdditionalDependencies>OpenCL.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
58 |     </Link>
59 |   </ItemDefinitionGroup>
60 |   <ItemGroup>
61 |     <ClCompile Include="src\kernel.cpp" />
62 |     <ClCompile Include="src\main.cpp" />
63 |   </ItemGroup>
64 |   <ItemGroup>
65 |     <ClInclude Include="src\kernel.hpp" />
66 |     <ClInclude Include="src\opencl.hpp" />
67 |     <ClInclude Include="src\utilities.hpp" />
68 |   </ItemGroup>
69 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
70 |   <ImportGroup Label="ExtensionTargets">
71 |   </ImportGroup>
72 | </Project>


--------------------------------------------------------------------------------
/src/kernel.cpp:
--------------------------------------------------------------------------------
  1 | #include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with )+R(
  2 | string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################
  3 | 
  4 | 
  5 | 
  6 | int dp4a(const char4 a, const char4 b, const int c) { // 4-wide byte dot product and accumulate; the branches below select a device-specific variant when the OpenCL C code is compiled, the #else branch spells out the arithmetic
  7 | )+"#if cl_nv_compute_capability>=61"+R( // use hardware-supported dp4a on Nvidia Pascal or newer GPUs with inline PTX assembly
  8 | 	int d;)+"asm(\"dp4a.s32.s32\t%0,%1,%2,%3;\":\"=r\"(d):\"r\"(as_int(a)),\"r\"(as_int(b)),\"r\"(c));"+R(return d; // the asm statement is spliced in as an ordinary string literal instead of going through R(), presumably because of its quotes and tab escape - confirm
  9 | )+"#elif defined(__opencl_c_integer_dot_product_input_4x8bit)"+R( // use hardware-supported dp4a on some Intel GPUs
 10 | 	return c+dot(a, b); // dot_acc_sat(a, b, c); is slow
 11 | )+"#elif __has_builtin(__builtin_amdgcn_sdot4)"+R( // use hardware-supported dp4a on older AMD GPUs
 12 | 	return __builtin_amdgcn_sdot4(as_int(a), as_int(b), c, false);
 13 | )+"#elif __has_builtin(__builtin_amdgcn_sudot4)"+R( // use hardware-supported dp4a on newer AMD GPUs
 14 | 	return __builtin_amdgcn_sudot4(true, as_int(a), true, as_int(b), c, false);
 15 | )+"#elif defined(cl_arm_integer_dot_product_accumulate_int8)"+R( // use hardware-supported dp4a on some ARM GPUs
 16 | 	return arm_dot_acc(a, b, c);
 17 | )+"#else"+R( // fallback emulation (compilers will turn this into hardware-supported dp4a instruction if available)
 18 | 	return c+a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w; // reference arithmetic: c plus the sum of the four component products
 19 | )+"#endif"+R(
 20 | }
 21 | 
 22 | 
 23 | 
 24 | )+"#ifdef cl_khr_fp64"+R( // OpenCL C defines don't work in R() stringification macro, so the directive is passed as a plain string literal; kernel_double only exists when cl_khr_fp64 is defined
 25 | kernel void kernel_double(global float* data) { // FP64 arithmetic benchmark: 128 iterations x 2 dependent fma() calls
 26 | 	double x = (double)get_global_id(0); // seed from the global index
 27 | 	double y = (double)get_local_id(0); // seed from the local index within the group
 28 | 	for(uint i=0u; i<128u; i++) { // 128 iterations x 4 operations = 512 operations, the factor main.cpp uses for the FP64 result
 29 | 		x = fma(y, x, y); // 2 operations
 30 | 		y = fma(x, y, x); // 2 operations
 31 | 	}
 32 | 	data[get_global_id(0)] = (float)y; // store the final value, converted to float, at the global index
 33 | }
 34 | )+"#endif"+R( // cl_khr_fp64
 35 | 
 36 | kernel void kernel_float(global float* data) { // FP32 arithmetic benchmark: 512 iterations x 2 dependent fma() calls
 37 | 	float x = (float)get_global_id(0); // seed from the global index
 38 | 	float y = (float)get_local_id(0); // seed from the local index within the group
 39 | 	for(uint i=0u; i<512u; i++) { // 512 iterations x 4 operations = 2048 operations, the factor main.cpp uses for the FP32 result
 40 | 		x = fma(y, x, y); // 2 operations
 41 | 		y = fma(x, y, x); // 2 operations
 42 | 	}
 43 | 	data[get_global_id(0)] = y; // store the final value at the global index
 44 | }
 45 | 
 46 | )+"#ifdef cl_khr_fp16"+R( // OpenCL C defines don't work in R() stringification macro, so the directive is passed as a plain string literal; kernel_half only exists when cl_khr_fp16 is defined
 47 | kernel void kernel_half(global float* data) { // FP16 arithmetic benchmark on half2 vectors: 512 iterations x 2 multiply-add statements
 48 | 	half2 x = (half2)((float)get_global_id(0), (float)get_local_id(0)); // seed: (global index, local index)
 49 | 	half2 y = (half2)((float)get_local_id(0), (float)get_global_id(0)); // seed: (local index, global index)
 50 | 	for(uint i=0u; i<512u; i++) { // 512 iterations x 8 operations by the per-line counts below
 51 | 		x = y*x+y; // 4 operations
 52 | 		y = x*y+x; // 4 operations
 53 | 	}
 54 | 	data[get_global_id(0)] = (float)y.x+(float)y.y; // store the sum of both components as one float
 55 | }
 56 | )+"#endif"+R( // cl_khr_fp16
 57 | 
 58 | kernel void kernel_long(global float* data) { // 64-bit integer arithmetic benchmark: 8 iterations x 2 multiply-add statements
 59 | 	long x = (long)get_global_id(0); // seed from the global index
 60 | 	long y = (long)get_local_id(0); // seed from the local index within the group
 61 | 	for(uint i=0u; i<8u; i++) { // 8 iterations x 4 operations by the per-line counts below
 62 | 		x = y*x+y; // 2 operations
 63 | 		y = x*y+x; // 2 operations
 64 | 	}
 65 | 	data[get_global_id(0)] = as_float((int)y); // convert to int, then store those bits reinterpreted as float
 66 | }
 67 | 
 68 | kernel void kernel_int(global float* data) { // 32-bit integer arithmetic benchmark: 512 iterations x 2 multiply-add statements
 69 | 	int x = get_global_id(0); // seed from the global index
 70 | 	int y = get_local_id(0); // seed from the local index within the group
 71 | 	for(uint i=0u; i<512u; i++) { // 512 iterations x 4 operations by the per-line counts below
 72 | 		x = y*x+y; // 2 operations
 73 | 		y = x*y+x; // 2 operations
 74 | 	}
 75 | 	data[get_global_id(0)] = as_float(y); // store the result bits reinterpreted as float
 76 | }
 77 | 
 78 | kernel void kernel_short(global float* data) { // 16-bit integer arithmetic benchmark on short2 vectors: 128 iterations x 2 multiply-add statements
 79 | 	short2 x = as_short2((uint)get_global_id(0)); // seed: bits of the global index reinterpreted as short2
 80 | 	short2 y = as_short2((uint)get_local_id(0)); // seed: bits of the local index reinterpreted as short2
 81 | 	for(uint i=0u; i<128u; i++) { // 128 iterations x 8 operations by the per-line counts below
 82 | 		x = y*x+y; // 4 operations
 83 | 		y = x*y+x; // 4 operations
 84 | 	}
 85 | 	data[get_global_id(0)] = as_float(y); // store the short2 bits reinterpreted as float
 86 | }
 87 | 
 88 | kernel void kernel_char(global float* data) { // 8-bit integer arithmetic benchmark: 64 iterations x 2 dp4a() calls on char4 vectors
 89 | 	char4 x = as_char4((uint)get_global_id(0)); // seed: bits of the global index reinterpreted as char4
 90 | 	char4 y = as_char4((uint)get_local_id(0)); // seed: bits of the local index reinterpreted as char4
 91 | 	for(uint i=0u; i<64u; i++) { // 64 iterations x 16 operations by the per-line counts below
 92 | 		x = as_char4(dp4a(y, x, as_int(y))); // 8 operations
 93 | 		y = as_char4(dp4a(x, y, as_int(x))); // 8 operations
 94 | 	}
 95 | 	data[get_global_id(0)] = as_float(y); // store the char4 bits reinterpreted as float
 96 | }
 97 | 
 98 | 
 99 | 
100 | kernel void kernel_coalesced_write(global float* data) { // memory write benchmark; def_N and def_M are #defines that main.cpp prepends to this code
101 | 	const uint n = get_global_id(0); // global index
102 | 	for(uint i=0u; i<def_M; i++) data[i*def_N+n] = as_float(n); // coalesced write: in each of the def_M passes, consecutive n address consecutive floats
103 | }
104 | kernel void kernel_coalesced_read(global float* data) { // memory read benchmark; def_N and def_M are #defines that main.cpp prepends to this code
105 | 	const uint n = get_global_id(0); // global index
106 | 	float x = 0.0f; // running sum of the values read
107 | 	for(uint i=0u; i<def_M; i++) x += data[i*def_N+n]; // coalesced read: in each of the def_M passes, consecutive n address consecutive floats
108 | 	data[n] = x; // write the sum back at the global index
109 | }
110 | kernel void kernel_misaligned_write(global float* data) { // memory write benchmark; def_M is a #define that main.cpp prepends to this code
111 | 	const uint n = get_global_id(0); // global index
112 | 	for(uint i=0u; i<def_M; i++) data[n*def_M+i] = as_float(n); // misaligned write: each n writes its own run of def_M floats, so consecutive n are def_M floats apart
113 | }
114 | kernel void kernel_misaligned_read(global float* data) { // memory read benchmark; def_M is a #define that main.cpp prepends to this code
115 | 	const uint n = get_global_id(0); // global index
116 | 	float x = 0.0f; // running sum of the values read
117 | 	for(uint i=0u; i<def_M; i++) x += data[n*def_M+i]; // misaligned read: each n reads its own run of def_M floats, so consecutive n are def_M floats apart
118 | 	data[n] = x; // write the sum back at the global index
119 | }
120 | 
121 | 
122 | 
123 | );} // ############################################################### end of OpenCL C code #####################################################################


--------------------------------------------------------------------------------
/src/kernel.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "utilities.hpp"
  4 | #define R(...) string(" "#__VA_ARGS__" ") // evil stringification macro, similar syntax to raw string R"(...)"
  5 | 
  6 | string opencl_c_container(); // outsourced to kernel.cpp
  7 | string get_opencl_c_code() {
  8 | 	// directives that must keep their first argument on the same line; arguments must not contain
  9 | 	// spaces, which is why a #define with two arguments will not work
 10 | 	const char* const directives[] = { "#ifdef", "#ifndef", "#define", "#undef", "#if", "#elif", "#pragma" };
 11 | 	string code = opencl_c_container(); // stringified OpenCL C source, outsourced to kernel.cpp
 12 | 	code = replace(code, " ", "\n"); // first turn every space into a line break
 13 | 	for(const char* const directive : directives) { // then re-attach each directive to the token that follows it
 14 | 		const string name = directive;
 15 | 		code = replace(code, name+"\n", name+" ");
 16 | 	}
 17 | 	return "\n"+code; // the returned code starts with a line break
 18 | }
 19 | 
 20 | // everything below is just for syntax highlighting in the editor, this does not change any functionality
 21 | // full catalogue: https://www.khronos.org/files/opencl30-reference-guide.pdf
 22 | 
 23 | // general
 24 | #define get_global_id(x) // global index, set x=0
 25 | #define get_global_size(x) // global range, set x=0
 26 | #define get_local_id(x) // local index within group, set x=0
 27 | #define get_local_size(x) // group size, set x=0
 28 | #define	get_num_groups(x) // number of groups, set x=0
 29 | #define get_group_id(x) // group ID, set x=0
 30 | #define get_global_offset(x) // global offset, set x=0
 31 | #define get_work_dim // number of dimensions in use
 32 | #define __attribute__(x) // compiler attribute qualifiers
 33 | #define always_inline // compiler attribute qualifier for inlining functions
 34 | #define opencl_unroll_hint // compiler attribute qualifier for loop unrolling
 35 | #define barrier(x) // barrier for local work group, x is CLK_LOCAL_MEM_FENCE or CLK_GLOBAL_MEM_FENCE
 36 | #define mem_fence(x) // orders loads/stores, x is CLK_LOCAL_MEM_FENCE or CLK_GLOBAL_MEM_FENCE
 37 | #define read_mem_fence(x) // orders loads, x is CLK_LOCAL_MEM_FENCE or CLK_GLOBAL_MEM_FENCE
 38 | #define write_mem_fence(x) // orders stores, x is CLK_LOCAL_MEM_FENCE or CLK_GLOBAL_MEM_FENCE
 39 | #define CLK_LOCAL_MEM_FENCE
 40 | #define CLK_GLOBAL_MEM_FENCE
 41 | #define kernel
 42 | #define constant
 43 | #define global
 44 | #define local
 45 | #define __kernel
 46 | #define __constant
 47 | #define __global
 48 | #define __local
 49 | #define __private // private keyword already exists in C++
 50 | 
 51 | // 32-bit integer atomics
 52 | #define atomic_add(p,x) // (*p)+=x
 53 | #define atomic_sub(p,x) // (*p)-=x
 54 | #define atomic_xchg(p,x) // t=(*p);(*p)=x;x=t;
 55 | #define atomic_inc(p) // (*p)+=1
 56 | #define atomic_dec(p) // (*p)-=1
 57 | #define atomic_cmpxchg(p,c,x) // (*p)=((*p)==c?x:(*p))
 58 | #define atomic_max(p,x) // (*p)=max(*p,x)
 59 | #define atomic_min(p,x) // (*p)=min(*p,x)
 60 | #define atomic_and(p,x) // (*p)=(*p)&x
 61 | #define atomic_or(p,x) // (*p)=(*p)|x
 62 | #define atomic_xor(p,x) // (*p)=(*p)^x
 63 | 
 64 | // 64-bit integer atomics (cl_khr_int64_base_atomics extension must be supported by the device)
 65 | #define atom_add(p,x) // (*p)+=x
 66 | #define atom_sub(p,x) // (*p)-=x
 67 | #define atom_xchg(p,x) // t=(*p);(*p)=x;x=t;
 68 | #define atom_inc(p) // (*p)+=1
 69 | #define atom_dec(p) // (*p)-=1
 70 | #define atom_cmpxchg(p,c,x) // (*p)=((*p)==c?x:(*p))
 71 | #define atom_max(p,x) // (*p)=max(*p,x)
 72 | #define atom_min(p,x) // (*p)=min(*p,x)
 73 | #define atom_and(p,x) // (*p)=(*p)&x
 74 | #define atom_or(p,x) // (*p)=(*p)|x
 75 | #define atom_xor(p,x) // (*p)=(*p)^x
 76 | 
 77 | // integer functions
 78 | #define abs(x) // |x|
 79 | #define clz(x) // count leading 0 bits, slow, instead use as_uint((float)((x&0x07FF)<<12))>>23
 80 | #define mad_sat(a,b,c) // a*b+c
 81 | #define max(x,y)
 82 | #define min(x,y)
 83 | 
 84 | // integer and floating-point functions
 85 | #define clamp(x,a,b)
 86 | #define sign(x)
 87 | 
 88 | // floating-point functions
 89 | #define acos(x)
 90 | #define acosh(x)
 91 | #define acospi(x) // acos(x)/pi
 92 | #define asin(x)
 93 | #define asinh(x)
 94 | #define asinpi(x) // asin(x)/pi
 95 | #define atan(x)
 96 | #define atan2(x,y) // atan(x/y)
 97 | #define atanh(x)
 98 | #define atanpi(x) // atan(x)/pi
 99 | #define atan2pi(x,y) // atan(x/y)/pi
100 | #define cbrt(x) // x^(1/3)
101 | #define copysign(x,y) // x with sign changed to sign of y
102 | #define cos(x)
103 | #define cosh(x)
104 | #define cospi(x) // cos(pi*x)
105 | #define degrees(x) // x*180/pi
106 | #define erfc(x) // complementary error function
107 | #define erf(x) // error function
108 | #define exp(x) // e^x
109 | #define exp2(x) // 2^x
110 | #define exp10(x) // 10^x
111 | #define expm1(x) // e^x-1
112 | #define fabs(x) // |x|
113 | #define fdim(x,y) // max(x-y,0)
114 | #define floor(x) // (float)((int)x)
115 | #define fma(a,b,c) // a*b+c
116 | #define fmax(x,y) // max(x,y)
117 | #define fmin(x,y) // min(x,y)
118 | #define fmod(x,y) // x%y
119 | #define hypot(x,y) // (x^2+y^2)^(1/2)
120 | #define isfinite(x) // test for finite value
121 | #define isinf(x) // test for infinity
122 | #define isnan(x) // test for NaN
123 | #define isnormal(x) // test for normal value
124 | #define ldexp(x,n) // x*2^n (n is integer)
125 | #define lgamma(x) // log gamma function
126 | #define log(x) // ln(x)
127 | #define log2(x) // log_2(x)
128 | #define log10(x) // log_10(x)
129 | #define log1p(x) // ln(1+x)
130 | #define mad(a,b,c) // a*b+c (approximation)
131 | #define maxmag(x,y) // max(|x|,|y|)
132 | #define minmag(x,y) // min(|x|,|y|)
133 | #define native_rsqrt(x) // x^(-1/2)
134 | #define native_sqrt(x) // x^(1/2)
135 | #define pow(x,y) // x^y
136 | #define pown(x,n) // x^n, where n is an integer
137 | #define powr(x,y) // x^y, where x>=0
138 | #define radians(x) // x*pi/180
139 | #define rootn(x,y) // x^(1/y)
140 | #define rsqrt(x) // x^(-1/2), slower, use native_rsqrt(x) instead
141 | #define signbit(x) // test for sign bit
142 | #define sin(x)
143 | #define sinh(x)
144 | #define sinpi(x) // sin(pi*x)
145 | #define sqrt(x) // x^(1/2), slower, use native_sqrt(x) instead
146 | #define step(x,y) // y<x ? 0 : 1
147 | #define stepsmooth(a,b,x) // step and interpolate
148 | #define tan(x)
149 | #define tanh(x)
150 | #define tanpi(x) // tan(pi*x)
151 | #define tgamma(x) // gamma function
152 | #define vload_half(o,p) // load half from global memory
153 | #define vstore_half_rte(x,o,p) // store half in global memory
154 | 
155 | // vector functions
156 | #define cross(x,y) // xxy
157 | #define distance(x,y) // |y-x|
158 | #define dot(x,y) // x*y
159 | #define length(x) // |x|
160 | #define normalize(x) // x/|x|
161 | #define fast_distance(x,y) // |y-x|
162 | #define fast_length(x) // |x|
163 | #define fast_normalize(x) // x/|x|
164 | 
165 | // data types
166 | #define half
167 | #define half2
168 | #define half3
169 | #define half4
170 | #define half8
171 | #define half16
172 | #define float2
173 | #define float3
174 | #define float4
175 | #define float8
176 | #define float16
177 | #define double2
178 | #define double3
179 | #define double4
180 | #define double8
181 | #define double16
182 | #define char2
183 | #define char3
184 | #define char4
185 | #define char8
186 | #define char16
187 | #define short2
188 | #define short3
189 | #define short4
190 | #define short8
191 | #define short16
192 | #define int2
193 | #define int3
194 | #define int4
195 | #define int8
196 | #define int16
197 | #define long2
198 | #define long3
199 | #define long4
200 | #define long8
201 | #define long16
202 | #define uchar
203 | #define uchar2
204 | #define uchar3
205 | #define uchar4
206 | #define uchar8
207 | #define uchar16
208 | #define ushort
209 | #define ushort2
210 | #define ushort3
211 | #define ushort4
212 | #define ushort8
213 | #define ushort16
214 | #define uint
215 | #define uint2
216 | #define uint3
217 | #define uint4
218 | #define uint8
219 | #define uint16
220 | #define ulong
221 | #define ulong2
222 | #define ulong3
223 | #define ulong4
224 | #define ulong8
225 | #define ulong16
226 | 
227 | // interpret functions
228 | #define as_half(x)
229 | #define as_half2(x)
230 | #define as_half3(x)
231 | #define as_half4(x)
232 | #define as_half8(x)
233 | #define as_half16(x)
234 | #define as_float(x)
235 | #define as_float2(x)
236 | #define as_float3(x)
237 | #define as_float4(x)
238 | #define as_float8(x)
239 | #define as_float16(x)
240 | #define as_double(x)
241 | #define as_double2(x)
242 | #define as_double3(x)
243 | #define as_double4(x)
244 | #define as_double8(x)
245 | #define as_double16(x)
246 | #define as_char(x)
247 | #define as_char2(x)
248 | #define as_char3(x)
249 | #define as_char4(x)
250 | #define as_char8(x)
251 | #define as_char16(x)
252 | #define as_short2(x)
253 | #define as_short3(x)
254 | #define as_short4(x)
255 | #define as_short8(x)
256 | #define as_short16(x)
257 | #define as_int(x)
258 | #define as_int2(x)
259 | #define as_int3(x)
260 | #define as_int4(x)
261 | #define as_int8(x)
262 | #define as_int16(x)
263 | #define as_long(x)
264 | #define as_long2(x)
265 | #define as_long3(x)
266 | #define as_long4(x)
267 | #define as_long8(x)
268 | #define as_long16(x)
269 | #define as_uchar(x)
270 | #define as_uchar2(x)
271 | #define as_uchar3(x)
272 | #define as_uchar4(x)
273 | #define as_uchar8(x)
274 | #define as_uchar16(x)
275 | #define as_ushort(x)
276 | #define as_ushort2(x)
277 | #define as_ushort3(x)
278 | #define as_ushort4(x)
279 | #define as_ushort8(x)
280 | #define as_ushort16(x)
281 | #define as_uint(x)
282 | #define as_uint2(x)
283 | #define as_uint3(x)
284 | #define as_uint4(x)
285 | #define as_uint8(x)
286 | #define as_uint16(x)
287 | #define as_ulong(x)
288 | #define as_ulong2(x)
289 | #define as_ulong3(x)
290 | #define as_ulong4(x)
291 | #define as_ulong8(x)
292 | #define as_ulong16(x)


--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
  1 | #include "opencl.hpp" // includes "utilities.hpp"
  2 | 
  3 | string fraction(const float x) { // x is a percentage (scaled by 0.01 below); returns the label of the nearest listed ratio in round brackets
  4 | 	const uint count = 17u; // number of entries in both tables
  5 | 	const float ratios[count] = { 1.0f/64.0f, 1.0f/32.0f, 1.0f/24.0f, 1.0f/16.0f, 1.0f/12.0f, 1.0f/8.0f, 1.0f/4.0f, 1.0f/3.0f, 1.0f/2.0f, 2.0f/3.0f, 1.0f, 2.0f, 4.0f, 8.0f, 16.0f, 32.0f, 64.0f };
  6 | 	const string labels[count] = { "1/64", "1/32", "1/24", "1/16", "1/12", "1/8 ", "1/4 ", "1/3 ", "1/2 ", "2/3 ", " 1x ", " 2x ", " 4x ", " 8x ", " 16x", " 32x", " 64x" };
  7 | 	uint best = 0u; // index of the nearest ratio found so far
  8 | 	float best_error = max_float; // smallest sq() deviation found so far
  9 | 	for(uint k=0u; k<count; k++) {
 10 | 		const float error = sq(0.01f*x-ratios[k]);
 11 | 		if(error<=best_error) { // "<=" lets a later entry win an exact tie
 12 | 			best_error = error;
 13 | 			best = k;
 14 | 		}
 15 | 	}
 16 | 	return "("+labels[best]+")";
 17 | }
 18 | 
 19 | void benchmark_device(const Device_Info& device_info) {
 20 | 	const uint N = 4096u*4096u; // kernel range: N*M*sizeof(float) = 1GB memory allocation
 21 | 	const uint M = 16u; // coalescence size
 22 | 	const uint N_kernel = 256u; // iterations for kernel calls
 23 | 	const uint N_memory = 16u; // iterations for PCIe memory transfers
 24 | 
 25 | 	double time_double=max_double, time_float=max_double, time_half=max_double;
 26 | 	double time_long=max_double, time_int=max_double, time_short=max_double, time_char=max_double;
 27 | 	double time_cw=max_double, time_cr=max_double, time_mw=max_double, time_mr=max_double, time_send=max_double, time_receive=max_double, time_bidirectional=max_double;
 28 | 	Clock clock;
 29 | 
 30 | 	const string defines =
 31 | 		"\n	#define def_N "+to_string(N)+"u"
 32 | 		"\n	#define def_M "+to_string(M)+"u"
 33 | 	;
 34 | 	print("| Compiling ...                                                               |");
 35 | 	Device device(device_info, defines+get_opencl_c_code());
 36 | 	Memory<float> buffer(device, N, M);
 37 | 	//print_info("Device mormory usage: "+to_string(device.info.memory_used)+" MB");
 38 | 
 39 | 	if(device.info.is_fp64_capable) {
 40 | 		print("| Benchmarking ...                                                            |");
 41 | 		Kernel kernel_double(device, N, "kernel_double", buffer);
 42 | 		for(uint i=0u; i<N_kernel; i++) {
 43 | 			clock.start();
 44 | 			kernel_double.run();
 45 | 			time_double = fmin(clock.stop(), time_double);
 46 | 		}
 47 | 		const float flops_double = 512.0f*(float)N/(float)time_double*1E-12f;
 48 | 		println("\r| FP64  compute "+alignr(45u, to_string(flops_double, 3u))+" TFLOPs/s "+fraction(100.0f*flops_double/device.info.tflops)+" |");
 49 | 	} else {
 50 | 		println("\r| FP64  compute                                          not supported        |");
 51 | 	}
 52 | 
 53 | 	print("| Benchmarking ...                                                            |");
 54 | 	Kernel kernel_float(device, N, "kernel_float", buffer);
 55 | 	for(uint i=0u; i<N_kernel; i++) {
 56 | 		clock.start();
 57 | 		kernel_float.run();
 58 | 		time_float = fmin(clock.stop(), time_float);
 59 | 	}
 60 | 	const float flops_float = 2048.0f*(float)N/(float)time_float*1E-12f; //const float flops = 32.0f*2.0f*(float)sq(M)*N/(float)timef*1E-12f;
 61 | 	println("\r| FP32  compute "+alignr(45u, to_string(flops_float, 3u))+" TFLOPs/s "+fraction(100.0f*flops_float/device.info.tflops)+" |");
 62 | 
 63 | 	if(device.info.is_fp16_capable) {
 64 | 		print("| Benchmarking ...                                                            |");
 65 | 		Kernel kernel_half(device, N, "kernel_half", buffer);
 66 | 		for(uint i=0u; i<N_kernel; i++) {
 67 | 			clock.start();
 68 | 			kernel_half.run();
 69 | 			time_half = fmin(clock.stop(), time_half);
 70 | 		}
 71 | 		const float flops_half = 4096.0f*(float)N/(float)time_half*1E-12f;
 72 | 		println("\r| FP16  compute "+alignr(45u, to_string(flops_half, 3u))+" TFLOPs/s "+fraction(100.0f*flops_half/device.info.tflops)+" |");
 73 | 	} else {
 74 | 		println("\r| FP16  compute                                          not supported        |");
 75 | 	}
 76 | 
 77 | 	print("| Benchmarking ...                                                            |");
 78 | 	Kernel kernel_long(device, N, "kernel_long", buffer);
 79 | 	for(uint i=0u; i<N_kernel; i++) {
 80 | 		clock.start();
 81 | 		kernel_long.run();
 82 | 		time_long = fmin(clock.stop(), time_long);
 83 | 	}
 84 | 	const float flops_long = 32.0f*(float)N/(float)time_long*1E-12f;
 85 | 	println("\r| INT64 compute "+alignr(45u, to_string(flops_long, 3u))+"  TIOPs/s "+fraction(100.0f*flops_long/device.info.tflops)+" |");
 86 | 
 87 | 	print("| Benchmarking ...                                                            |");
 88 | 	Kernel kernel_int(device, N, "kernel_int", buffer);
 89 | 	for(uint i=0u; i<N_kernel; i++) {
 90 | 		clock.start();
 91 | 		kernel_int.run();
 92 | 		time_int = fmin(clock.stop(), time_int);
 93 | 	}
 94 | 	const float flops_int = 2048.0f*(float)N/(float)time_int*1E-12f;
 95 | 	println("\r| INT32 compute "+alignr(45u, to_string(flops_int, 3u))+"  TIOPs/s "+fraction(100.0f*flops_int/device.info.tflops)+" |");
 96 | 
 97 | 	print("| Benchmarking ...                                                            |");
 98 | 	Kernel kernel_short(device, N, "kernel_short", buffer);
 99 | 	for(uint i=0u; i<N_kernel; i++) {
100 | 		clock.start();
101 | 		kernel_short.run();
102 | 		time_short = fmin(clock.stop(), time_short);
103 | 	}
104 | 	const float flops_short = 1024.0f*(float)N/(float)time_short*1E-12f;
105 | 	println("\r| INT16 compute "+alignr(45u, to_string(flops_short, 3u))+"  TIOPs/s "+fraction(100.0f*flops_short/device.info.tflops)+" |");
106 | 
107 | 	print("| Benchmarking ...                                                            |");
108 | 	Kernel kernel_char(device, N, "kernel_char", buffer);
109 | 	for(uint i=0u; i<N_kernel; i++) {
110 | 		clock.start();
111 | 		kernel_char.run();
112 | 		time_char = fmin(clock.stop(), time_char);
113 | 	}
114 | 	const float flops_char = 1024.0f*(float)N/(float)time_char*1E-12f;
115 | 	println("\r| INT8  compute "+alignr(45u, to_string(flops_char, 3u))+"  TIOPs/s "+fraction(100.0f*flops_char/device.info.tflops)+" |");
116 | 
117 | 	print("| Benchmarking ...                                                            |");
118 | 	Kernel kernel_coalesced_write(device, N, "kernel_coalesced_write" , buffer);
119 | 	for(uint i=0u; i<N_kernel; i++) {
120 | 		clock.start();
121 | 		kernel_coalesced_write.run();
122 | 		time_cw = fmin(clock.stop(), time_cw);
123 | 	}
124 | 	Kernel kernel_coalesced_read(device, N, "kernel_coalesced_read"  , buffer);
125 | 	for(uint i=0u; i<N_kernel; i++) {
126 | 		clock.start();
127 | 		kernel_coalesced_read.run();
128 | 		time_cr = fmin(clock.stop(), time_cr);
129 | 	}
130 | 	println("\r| Memory Bandwidth ( coalesced read      ) "+alignr(29u, to_string(4.0f*(float)N*(float)M/(float)(time_cr-time_cw/(double)M)*1E-9f, 2u))+" GB/s |");
131 | 	println("\r| Memory Bandwidth ( coalesced      write) "+alignr(29u, to_string(4.0f*(float)N*(float)M/(float) time_cw                   *1E-9f, 2u))+" GB/s |");
132 | 
133 | 	print("| Benchmarking ...                                                            |");
134 | 	Kernel kernel_misaligned_write(device, N, "kernel_misaligned_write", buffer);
135 | 	for(uint i=0u; i<N_kernel; i++) {
136 | 		clock.start();
137 | 		kernel_misaligned_write.run();
138 | 		time_mw = fmin(clock.stop(), time_mw);
139 | 	}
140 | 	Kernel kernel_misaligned_read(device, N, "kernel_misaligned_read" , buffer);
141 | 	for(uint i=0u; i<N_kernel; i++) {
142 | 		clock.start();
143 | 		kernel_misaligned_read.run();
144 | 		time_mr = fmin(clock.stop(), time_mr);
145 | 	}
146 | 	println("\r| Memory Bandwidth (misaligned read      ) "+alignr(29u, to_string(4.0f*(float)N*(float)M/(float)(time_mr-time_cw/(double)M)*1E-9f, 2u))+" GB/s |");
147 | 	println("\r| Memory Bandwidth (misaligned      write) "+alignr(29u, to_string(4.0f*(float)N*(float)M/(float) time_mw                   *1E-9f, 2u))+" GB/s |");
148 | 
149 | 	if(!device.info.uses_ram) {
150 | 		print("| Benchmarking ...                                                            |");
151 | 		for(uint i=0u; i<N_memory; i++) {
152 | 			clock.start();
153 | 			buffer.write_to_device();
154 | 			time_send = fmin(clock.stop(), time_send);
155 | 		}
156 | 		const float bw_send = 4.0f*M*N/(float)time_send*1E-9f;
157 | 		println("\r| PCIe   Bandwidth (send                 ) "+alignr(29u, to_string(bw_send, 2u))+" GB/s |");
158 | 		print("| Benchmarking ...                                                            |");
159 | 		for(uint i=0u; i<N_memory; i++) {
160 | 			clock.start();
161 | 			buffer.read_from_device();
162 | 			time_receive = fmin(clock.stop(), time_receive);
163 | 		}
164 | 		const float bw_receive = 4.0f*M*N/(float)time_receive*1E-9f;
165 | 		println("\r| PCIe   Bandwidth (   receive           ) "+alignr(29u, to_string(bw_receive, 2u))+" GB/s |");
166 | 		print("| Benchmarking ...                                                            |");
167 | 		for(uint i=0u; i<N_memory; i++) {
168 | 			clock.start();
169 | 			buffer.read_from_device(N*M/2u, N*M, false);
170 | 			buffer.write_to_device(0u, N*M/2u, false);
171 | 			buffer.finish_queue();
172 | 			time_bidirectional = fmin(clock.stop(), time_bidirectional);
173 | 		}
174 | 		const float bw_bidirectional = 4.0f*M*N/(float)time_bidirectional*1E-9f;
175 | 		const float bw_max = fmax(2.0f*fmax(bw_send, bw_receive), bw_bidirectional);
176 | 		println("\r| PCIe   Bandwidth (        bidirectional)            (Gen"+to_string(bw_max>17.6f?4:bw_max>8.8f?3:bw_max>4.4f?2:1)+" x16)"+alignr(8u, to_string(bw_bidirectional, 2u))+" GB/s |");
177 | 	}
178 | 
179 | 	println("|-----------------------------------------------------------------------------|");
180 | }
181 | 
182 | int main(int argc, char* argv[]) {
183 | 	vector<string> main_arguments = get_main_arguments(argc, argv);
184 | 	println(".-----------------------------------------------------------------------------.");
185 | 	const vector<Device_Info> devices = get_devices();
186 | 	if((int)main_arguments.size()>0) {
187 | 		for(uint i=0u; i<(uint)main_arguments.size(); i++) benchmark_device(select_device_with_id(to_int(main_arguments[i]), devices));
188 | 	} else {
189 | 		for(uint i=0u; i<(uint)devices.size(); i++) benchmark_device(devices[i]);
190 | 	}
191 | #ifdef _WIN32
192 | 	println("|-----------------------------------------------------------------------------|");
193 | 	println("| Done. Press Enter to exit.                                                  |");
194 | 	println("'-----------------------------------------------------------------------------'");
195 | 	wait();
196 | #else // Linux
197 | 	println("'-----------------------------------------------------------------------------'");
198 | #endif // Linux
199 | 	return 0;
200 | }


--------------------------------------------------------------------------------
/src/OpenCL/include/CL/cl_gl.h:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 |  * Copyright (c) 2008-2023 The Khronos Group Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *    http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  ******************************************************************************/
 16 | 
 17 | #ifndef OPENCL_CL_GL_H_
 18 | #define OPENCL_CL_GL_H_
 19 | 
 20 | /*
 21 | ** This header is generated from the Khronos OpenCL XML API Registry.
 22 | */
 23 | 
 24 | #include <CL/cl.h>
 25 | 
/* Prototype-suppression switches: each broader macro defines the narrower
   ones below it, so the guards around the function prototypes further down
   in this header only need to test the narrowest macros. */

/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)
#define CL_NO_EXTENSION_PROTOTYPES
#endif

/* CL_NO_EXTENSION_PROTOTYPES implies
   CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and
   CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
    !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
    !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
 42 | 
 43 | #ifdef __cplusplus
 44 | extern "C" {
 45 | #endif
 46 | 
/***************************************************************
* cl_khr_gl_sharing
***************************************************************/
/* Feature-test macro, extension name string and extension version. */
#define cl_khr_gl_sharing 1
#define CL_KHR_GL_SHARING_EXTENSION_NAME \
    "cl_khr_gl_sharing"


#define CL_KHR_GL_SHARING_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)

/* Local aliases for the OpenGL scalar types used in the signatures below. */
typedef int                 cl_GLint;
typedef unsigned int        cl_GLenum;
typedef unsigned int        cl_GLuint;

/* Type of the param_name argument of clGetGLContextInfoKHR. */
typedef cl_uint             cl_gl_context_info;

/* Error codes */
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR              -1000

/* cl_gl_context_info */
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR                0x2006
#define CL_DEVICES_FOR_GL_CONTEXT_KHR                       0x2007

/* Additional cl_context_properties */
#define CL_GL_CONTEXT_KHR                                   0x2008
#define CL_EGL_DISPLAY_KHR                                  0x2009
#define CL_GLX_DISPLAY_KHR                                  0x200A
#define CL_WGL_HDC_KHR                                      0x200B
#define CL_CGL_SHAREGROUP_KHR                               0x200C

/* Enumeration types; their values are the constant groups that follow. */
typedef cl_uint             cl_gl_object_type;
typedef cl_uint             cl_gl_texture_info;
typedef cl_uint             cl_gl_platform_info;

/* cl_gl_object_type */
#define CL_GL_OBJECT_BUFFER                                 0x2000
#define CL_GL_OBJECT_TEXTURE2D                              0x2001
#define CL_GL_OBJECT_TEXTURE3D                              0x2002
#define CL_GL_OBJECT_RENDERBUFFER                           0x2003

/* Additional object types, only defined when CL_VERSION_1_2 is defined. */
#if defined(CL_VERSION_1_2)
/* cl_gl_object_type */
#define CL_GL_OBJECT_TEXTURE2D_ARRAY                        0x200E
#define CL_GL_OBJECT_TEXTURE1D                              0x200F
#define CL_GL_OBJECT_TEXTURE1D_ARRAY                        0x2010
#define CL_GL_OBJECT_TEXTURE_BUFFER                         0x2011

#endif /* defined(CL_VERSION_1_2) */

/* cl_gl_texture_info */
#define CL_GL_TEXTURE_TARGET                                0x2004
#define CL_GL_MIPMAP_LEVEL                                  0x2005
100 | 
/* Naming pattern used throughout this header: <name>_t is the function type
   of an entry point and <name>_fn is a pointer to that function type. */
typedef cl_int CL_API_CALL
clGetGLContextInfoKHR_t(
    const cl_context_properties* properties,
    cl_gl_context_info param_name,
    size_t param_value_size,
    void* param_value,
    size_t* param_value_size_ret);

typedef clGetGLContextInfoKHR_t *
clGetGLContextInfoKHR_fn CL_API_SUFFIX__VERSION_1_0;

typedef cl_mem CL_API_CALL
clCreateFromGLBuffer_t(
    cl_context context,
    cl_mem_flags flags,
    cl_GLuint bufobj,
    cl_int* errcode_ret);

typedef clCreateFromGLBuffer_t *
clCreateFromGLBuffer_fn CL_API_SUFFIX__VERSION_1_0;

/* Direct prototypes with the same signatures as the _t types above; omitted
   when CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES is defined. */
#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)

extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLContextInfoKHR(
    const cl_context_properties* properties,
    cl_gl_context_info param_name,
    size_t param_value_size,
    void* param_value,
    size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;

extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLBuffer(
    cl_context context,
    cl_mem_flags flags,
    cl_GLuint bufobj,
    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;

#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
140 | 
/* clCreateFromGLTexture: function type, function-pointer type and prototype,
   all declared only when CL_VERSION_1_2 is defined. */
#if defined(CL_VERSION_1_2)

typedef cl_mem CL_API_CALL
clCreateFromGLTexture_t(
    cl_context context,
    cl_mem_flags flags,
    cl_GLenum target,
    cl_GLint miplevel,
    cl_GLuint texture,
    cl_int* errcode_ret);

typedef clCreateFromGLTexture_t *
clCreateFromGLTexture_fn CL_API_SUFFIX__VERSION_1_2;

/* The prototype is additionally omitted when
   CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES is defined. */
#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)

extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture(
    cl_context context,
    cl_mem_flags flags,
    cl_GLenum target,
    cl_GLint miplevel,
    cl_GLuint texture,
    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;

#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */

#endif /* defined(CL_VERSION_1_2) */
169 | 
170 | 
/* Function types (_t) and function-pointer types (_fn) for the remaining
   cl_khr_gl_sharing entry points: renderbuffer import, GL object and texture
   queries, and enqueueing acquire/release of GL objects on a command queue. */
typedef cl_mem CL_API_CALL
clCreateFromGLRenderbuffer_t(
    cl_context context,
    cl_mem_flags flags,
    cl_GLuint renderbuffer,
    cl_int* errcode_ret);

typedef clCreateFromGLRenderbuffer_t *
clCreateFromGLRenderbuffer_fn CL_API_SUFFIX__VERSION_1_0;

typedef cl_int CL_API_CALL
clGetGLObjectInfo_t(
    cl_mem memobj,
    cl_gl_object_type* gl_object_type,
    cl_GLuint* gl_object_name);

typedef clGetGLObjectInfo_t *
clGetGLObjectInfo_fn CL_API_SUFFIX__VERSION_1_0;

typedef cl_int CL_API_CALL
clGetGLTextureInfo_t(
    cl_mem memobj,
    cl_gl_texture_info param_name,
    size_t param_value_size,
    void* param_value,
    size_t* param_value_size_ret);

typedef clGetGLTextureInfo_t *
clGetGLTextureInfo_fn CL_API_SUFFIX__VERSION_1_0;

typedef cl_int CL_API_CALL
clEnqueueAcquireGLObjects_t(
    cl_command_queue command_queue,
    cl_uint num_objects,
    const cl_mem* mem_objects,
    cl_uint num_events_in_wait_list,
    const cl_event* event_wait_list,
    cl_event* event);

typedef clEnqueueAcquireGLObjects_t *
clEnqueueAcquireGLObjects_fn CL_API_SUFFIX__VERSION_1_0;

typedef cl_int CL_API_CALL
clEnqueueReleaseGLObjects_t(
    cl_command_queue command_queue,
    cl_uint num_objects,
    const cl_mem* mem_objects,
    cl_uint num_events_in_wait_list,
    const cl_event* event_wait_list,
    cl_event* event);

typedef clEnqueueReleaseGLObjects_t *
clEnqueueReleaseGLObjects_fn CL_API_SUFFIX__VERSION_1_0;

/* Direct prototypes for the five entry points above; omitted when
   CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES is defined. */
#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)

extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLRenderbuffer(
    cl_context context,
    cl_mem_flags flags,
    cl_GLuint renderbuffer,
    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;

extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLObjectInfo(
    cl_mem memobj,
    cl_gl_object_type* gl_object_type,
    cl_GLuint* gl_object_name) CL_API_SUFFIX__VERSION_1_0;

extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLTextureInfo(
    cl_mem memobj,
    cl_gl_texture_info param_name,
    size_t param_value_size,
    void* param_value,
    size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;

extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGLObjects(
    cl_command_queue command_queue,
    cl_uint num_objects,
    const cl_mem* mem_objects,
    cl_uint num_events_in_wait_list,
    const cl_event* event_wait_list,
    cl_event* event) CL_API_SUFFIX__VERSION_1_0;

extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGLObjects(
    cl_command_queue command_queue,
    cl_uint num_objects,
    const cl_mem* mem_objects,
    cl_uint num_events_in_wait_list,
    const cl_event* event_wait_list,
    cl_event* event) CL_API_SUFFIX__VERSION_1_0;

#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
267 | 
/* OpenCL 1.0 APIs that were deprecated in OpenCL 1.2 */

/* Both entry points take the same parameter list as clCreateFromGLTexture and
   carry the CL_API_SUFFIX__VERSION_1_1_DEPRECATED suffix. */
typedef cl_mem CL_API_CALL
clCreateFromGLTexture2D_t(
    cl_context context,
    cl_mem_flags flags,
    cl_GLenum target,
    cl_GLint miplevel,
    cl_GLuint texture,
    cl_int* errcode_ret);

typedef clCreateFromGLTexture2D_t *
clCreateFromGLTexture2D_fn CL_API_SUFFIX__VERSION_1_1_DEPRECATED;

typedef cl_mem CL_API_CALL
clCreateFromGLTexture3D_t(
    cl_context context,
    cl_mem_flags flags,
    cl_GLenum target,
    cl_GLint miplevel,
    cl_GLuint texture,
    cl_int* errcode_ret);

typedef clCreateFromGLTexture3D_t *
clCreateFromGLTexture3D_fn CL_API_SUFFIX__VERSION_1_1_DEPRECATED;

/* Direct prototypes; omitted when CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES is
   defined. */
#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)

extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture2D(
    cl_context context,
    cl_mem_flags flags,
    cl_GLenum target,
    cl_GLint miplevel,
    cl_GLuint texture,
    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;

extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture3D(
    cl_context context,
    cl_mem_flags flags,
    cl_GLenum target,
    cl_GLint miplevel,
    cl_GLuint texture,
    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;

#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
315 | 
/***************************************************************
* cl_khr_gl_event
***************************************************************/
/* Feature-test macro, extension name string and extension version. */
#define cl_khr_gl_event 1
#define CL_KHR_GL_EVENT_EXTENSION_NAME \
    "cl_khr_gl_event"


#define CL_KHR_GL_EVENT_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)

/* Pointer to struct __GLsync, which is only forward-declared here (opaque
   to this header). */
typedef struct __GLsync *   cl_GLsync;

/* cl_command_type */
#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR                 0x200D


/* Function type (_t) and function-pointer type (_fn) of the entry point that
   returns a cl_event for a cl_GLsync. */
typedef cl_event CL_API_CALL
clCreateEventFromGLsyncKHR_t(
    cl_context context,
    cl_GLsync sync,
    cl_int* errcode_ret);

typedef clCreateEventFromGLsyncKHR_t *
clCreateEventFromGLsyncKHR_fn CL_API_SUFFIX__VERSION_1_1;

/* Direct prototype; omitted when CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES is
   defined. */
#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)

extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromGLsyncKHR(
    cl_context context,
    cl_GLsync sync,
    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1;

#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
350 | 
/***************************************************************
* cl_khr_gl_depth_images
***************************************************************/
/* This extension section declares constants only, no entry points. */
#define cl_khr_gl_depth_images 1
#define CL_KHR_GL_DEPTH_IMAGES_EXTENSION_NAME \
    "cl_khr_gl_depth_images"


#define CL_KHR_GL_DEPTH_IMAGES_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)

/* cl_channel_order */
#define CL_DEPTH_STENCIL                                    0x10BE

/* cl_channel_type */
#define CL_UNORM_INT24                                      0x10DF

/***************************************************************
* cl_khr_gl_msaa_sharing
***************************************************************/
/* This extension section declares constants only, no entry points. */
#define cl_khr_gl_msaa_sharing 1
#define CL_KHR_GL_MSAA_SHARING_EXTENSION_NAME \
    "cl_khr_gl_msaa_sharing"


#define CL_KHR_GL_MSAA_SHARING_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)

/* cl_gl_texture_info */
#define CL_GL_NUM_SAMPLES                                   0x2012
379 | 
/***************************************************************
* cl_intel_sharing_format_query_gl
***************************************************************/
/* Feature-test macro, extension name string and extension version. */
#define cl_intel_sharing_format_query_gl 1
#define CL_INTEL_SHARING_FORMAT_QUERY_GL_EXTENSION_NAME \
    "cl_intel_sharing_format_query_gl"


#define CL_INTEL_SHARING_FORMAT_QUERY_GL_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)

/* when cl_khr_gl_sharing is supported */

/* Function type (_t) and function-pointer type (_fn); unlike the KHR entry
   points in this header, no CL_API_SUFFIX__VERSION_* suffix is attached. */
typedef cl_int CL_API_CALL
clGetSupportedGLTextureFormatsINTEL_t(
    cl_context context,
    cl_mem_flags flags,
    cl_mem_object_type image_type,
    cl_uint num_entries,
    cl_GLenum* gl_formats,
    cl_uint* num_texture_formats);

typedef clGetSupportedGLTextureFormatsINTEL_t *
clGetSupportedGLTextureFormatsINTEL_fn ;

/* This prototype is guarded by the NON_ICD_DISPATCH switch, not the
   ICD_DISPATCH switch used for the KHR entry points in this header. */
#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)

extern CL_API_ENTRY cl_int CL_API_CALL
clGetSupportedGLTextureFormatsINTEL(
    cl_context context,
    cl_mem_flags flags,
    cl_mem_object_type image_type,
    cl_uint num_entries,
    cl_GLenum* gl_formats,
    cl_uint* num_texture_formats) ;

#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
416 | 
417 | #ifdef __cplusplus
418 | }
419 | #endif
420 | 
421 | #endif /* OPENCL_CL_GL_H_ */
422 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # OpenCL-Benchmark
  2 | 
  3 | A small [OpenCL](https://github.com/ProjectPhysX/OpenCL-Wrapper "OpenCL-Wrapper") benchmark program to measure peak GPU/CPU performance.
  4 | 
  5 | Works with any GPU in Windows, Linux, macOS and Android.
  6 | 
  7 | 
  8 | 
  9 | ## Measurements
 10 | - compute performance (`FP64` (scalar), `FP32` (scalar), `FP16` (half2), `INT64` (scalar), `INT32` (scalar), `INT16` (short2), `INT8` (dp4a))
 11 |   - the fraction/multiplier closest to `measured compute performance` divided by `reported theoretical FP32 performance` is shown in `(round brackets)`
 12 |     - for example, when OpenCL reports `19.492` TFLOPs/s theoretical FP32 and the benchmark measures `9.512` TFLOPs/s for FP64, the ratio `(measured FP64)/(theoretical FP32) = 9.512/19.492 = 1/2.05` is rounded to the nearest listed value, `1/2`, and reported as such
 13 |     - these ratios for any GPU/CPU architecture can only be either `1/64`, `1/32`, `1/24`, `1/16`, `1/12`, `1/8`, `1/4`, `1/3`, `1/2`, `2/3`, `1x`, `2x`, `4x`, `8x`, `16x`, `32x`, `64x`, and nothing in between
 14 | - memory bandwidth (`coalesced`/`misaligned` `read`/`write`)
 15 | - PCIe bandwidth (`send`/`receive`/`bidirectional`)
 16 |   - PCIe Gen is estimated based on measured PCIe bandwidth and assumed x16 link width
 17 | 
 18 | 
 19 | 
 20 | ## How to use?
 21 | 
 22 | ### Windows
 23 | - Download and install [Visual Studio Community](https://visualstudio.microsoft.com/de/vs/community/). In Visual Studio Installer, add:
 24 |   - Desktop development with C++
 25 |   - MSVC v142
 26 |   - Windows 10 SDK
 27 | - Open [`OpenCL-Benchmark.sln`](OpenCL-Benchmark.sln) in [Visual Studio Community](https://visualstudio.microsoft.com/de/vs/community/).
 28 | - Compile and run by clicking the <kbd>► Local Windows Debugger</kbd> button.
 29 | - To run outside of [Visual Studio Community](https://visualstudio.microsoft.com/de/vs/community/), open Windows CMD in the `OpenCL-Benchmark` folder (type `cmd` in File Explorer in the directory field and press <kbd>Enter</kbd>), then run
 30 |   ```
 31 |   OpenCL-Benchmark.exe
 32 |   ```
 33 | 
 34 | ### Linux / macOS / Android
 35 | - Download, compile and run:
 36 |   ```
 37 |   git clone https://github.com/ProjectPhysX/OpenCL-Benchmark.git
 38 |   cd OpenCL-Benchmark
 39 |   chmod +x make.sh
 40 |   ./make.sh
 41 |   ```
 42 | - Run
 43 |   ```
 44 |   bin/OpenCL-Benchmark
 45 |   ```
 46 | 
 47 | ### Run only for a specified list of devices
 48 | - call `bin\OpenCL-Benchmark.exe 0 2 5` (Windows) or `bin/OpenCL-Benchmark 0 2 5` (Linux/macOS) with the number(s) being the device IDs to be benchmarked
 49 | 
 50 | 
 51 | 
 52 | ## Examples
 53 | ```
 54 | |----------------.------------------------------------------------------------|
 55 | | Device ID      | 0                                                          |
 56 | | Device Name    | NVIDIA H100 80GB HBM3                                      |
 57 | | Device Vendor  | NVIDIA Corporation                                         |
 58 | | Device Driver  | 565.57.01 (Linux)                                          |
 59 | | OpenCL Version | OpenCL C 3.0                                               |
 60 | | Compute Units  | 132 at 1980 MHz (16896 cores, 66.908 TFLOPs/s)             |
 61 | | Memory, Cache  | 81105 MB VRAM, 4224 KB global / 48 KB local                |
 62 | | Buffer Limits  | 20276 MB global, 64 KB constant                            |
 63 | |----------------'------------------------------------------------------------|
 64 | | Info: OpenCL C code successfully compiled.                                  |
 65 | | FP64  compute                                        31.184 TFLOPs/s (1/2 ) |
 66 | | FP32  compute                                        62.908 TFLOPs/s ( 1x ) |
 67 | | FP16  compute                                       123.749 TFLOPs/s ( 2x ) |
 68 | | INT64 compute                                         3.227  TIOPs/s (1/24) |
 69 | | INT32 compute                                        32.946  TIOPs/s (1/2 ) |
 70 | | INT16 compute                                        30.901  TIOPs/s (1/2 ) |
 71 | | INT8  compute                                       103.204  TIOPs/s ( 2x ) |
 72 | | Memory Bandwidth ( coalesced read      )                       3025.53 GB/s |
 73 | | Memory Bandwidth ( coalesced      write)                       3055.98 GB/s |
 74 | | Memory Bandwidth (misaligned read      )                       2102.44 GB/s |
 75 | | Memory Bandwidth (misaligned      write)                        314.25 GB/s |
 76 | | PCIe   Bandwidth (send                 )                         10.53 GB/s |
 77 | | PCIe   Bandwidth (   receive           )                         11.47 GB/s |
 78 | | PCIe   Bandwidth (        bidirectional)            (Gen4 x16)   10.91 GB/s |
 79 | |-----------------------------------------------------------------------------|
 80 | ```
 81 | ```
 82 | |----------------.------------------------------------------------------------|
 83 | | Device ID      | 0                                                          |
 84 | | Device Name    | AMD Instinct MI300X                                        |
 85 | | Device Vendor  | Advanced Micro Devices, Inc.                               |
 86 | | Device Driver  | 3635.0 (HSA1.1,LC) (Linux)                                 |
 87 | | OpenCL Version | OpenCL C 2.0                                               |
 88 | | Compute Units  | 304 at 2100 MHz (19456 cores, 81.715 TFLOPs/s)             |
 89 | | Memory, Cache  | 196592 MB VRAM, 32 KB global / 64 KB local                 |
 90 | | Buffer Limits  | 196592 MB global, 201310208 KB constant                    |
 91 | |----------------'------------------------------------------------------------|
 92 | | Info: OpenCL C code successfully compiled.                                  |
 93 | | FP64  compute                                        54.944 TFLOPs/s (2/3 ) |
 94 | | FP32  compute                                       130.000 TFLOPs/s ( 2x ) |
 95 | | FP16  compute                                       141.320 TFLOPs/s ( 2x ) |
 96 | | INT64 compute                                         3.666  TIOPs/s (1/24) |
 97 | | INT32 compute                                        47.736  TIOPs/s (2/3 ) |
 98 | | INT16 compute                                        69.022  TIOPs/s ( 1x ) |
 99 | | INT8  compute                                       106.178  TIOPs/s ( 1x ) |
100 | | Memory Bandwidth ( coalesced read      )                       3756.64 GB/s |
101 | | Memory Bandwidth ( coalesced      write)                       4686.31 GB/s |
102 | | Memory Bandwidth (misaligned read      )                       3881.24 GB/s |
103 | | Memory Bandwidth (misaligned      write)                       2491.25 GB/s |
104 | | PCIe   Bandwidth (send                 )                         54.57 GB/s |
105 | | PCIe   Bandwidth (   receive           )                         55.79 GB/s |
106 | | PCIe   Bandwidth (        bidirectional)            (Gen4 x16)   55.21 GB/s |
107 | |-----------------------------------------------------------------------------|
108 | ```
109 | ```
110 | |----------------.------------------------------------------------------------|
111 | | Device ID      | 0                                                          |
112 | | Device Name    | Intel(R) Arc(TM) B580 Graphics                             |
113 | | Device Vendor  | Intel(R) Corporation                                       |
114 | | Device Driver  | 32.0.101.6559 (Windows)                                    |
115 | | OpenCL Version | OpenCL C 3.0                                               |
116 | | Compute Units  | 160 at 2850 MHz (2560 cores, 14.592 TFLOPs/s)              |
117 | | Memory, Cache  | 12187 MB VRAM, 18432 KB global / 128 KB local              |
118 | | Buffer Limits  | 11944 MB global, 12230900 KB constant                      |
119 | |----------------'------------------------------------------------------------|
120 | | Info: OpenCL C code successfully compiled.                                  |
121 | | FP64  compute                                         0.896 TFLOPs/s (1/16) |
122 | | FP32  compute                                        14.249 TFLOPs/s ( 1x ) |
123 | | FP16  compute                                        26.547 TFLOPs/s ( 2x ) |
124 | | INT64 compute                                         0.636  TIOPs/s (1/24) |
125 | | INT32 compute                                         4.556  TIOPs/s (1/3 ) |
126 | | INT16 compute                                        37.082  TIOPs/s ( 2x ) |
127 | | INT8  compute                                        48.668  TIOPs/s ( 4x ) |
128 | | Memory Bandwidth ( coalesced read      )                        574.09 GB/s |
129 | | Memory Bandwidth ( coalesced      write)                        468.07 GB/s |
130 | | Memory Bandwidth (misaligned read      )                        796.23 GB/s |
131 | | Memory Bandwidth (misaligned      write)                        383.15 GB/s |
132 | | PCIe   Bandwidth (send                 )                          4.99 GB/s |
133 | | PCIe   Bandwidth (   receive           )                          4.87 GB/s |
134 | | PCIe   Bandwidth (        bidirectional)            (Gen3 x16)    5.11 GB/s |
135 | |-----------------------------------------------------------------------------|
136 | ```
137 | ```
138 | |----------------.------------------------------------------------------------|
139 | | Device ID      | 0                                                          |
140 | | Device Name    | AMD EPYC 9554 64-Core Processor                            |
141 | | Device Vendor  | Intel(R) Corporation                                       |
142 | | Device Driver  | 2024.18.10.0.08_160000 (Linux)                             |
143 | | OpenCL Version | OpenCL C 3.0                                               |
144 | | Compute Units  | 128 at 0 MHz (64 cores, 0.000 TFLOPs/s)                    |
145 | | Memory, Cache  | 386363 MB RAM, 1024 KB global / 256 KB local               |
146 | | Buffer Limits  | 386363 MB global, 128 KB constant                          |
147 | |----------------'------------------------------------------------------------|
148 | | Info: OpenCL C code successfully compiled.                                  |
149 | | FP64  compute                                         3.739 TFLOPs/s (1/64) |
150 | | FP32  compute                                         3.842 TFLOPs/s (1/64) |
151 | | FP16  compute                                         0.863 TFLOPs/s (1/64) |
152 | | INT64 compute                                         1.506  TIOPs/s (1/64) |
153 | | INT32 compute                                         4.240  TIOPs/s (1/64) |
154 | | INT16 compute                                         8.592  TIOPs/s (1/64) |
155 | | INT8  compute                                         2.774  TIOPs/s (1/64) |
156 | | Memory Bandwidth ( coalesced read      )                        391.09 GB/s |
157 | | Memory Bandwidth ( coalesced      write)                        167.26 GB/s |
158 | | Memory Bandwidth (misaligned read      )                        248.65 GB/s |
159 | | Memory Bandwidth (misaligned      write)                        156.18 GB/s |
160 | |-----------------------------------------------------------------------------|
161 | ```
162 | ```
163 | |----------------.------------------------------------------------------------|
164 | | Device ID      | 1                                                          |
165 | | Device Name    | Intel(R) UHD Graphics 630                                  |
166 | | Device Vendor  | Intel(R) Corporation                                       |
167 | | Device Driver  | 31.0.101.2130 (Windows)                                    |
168 | | OpenCL Version | OpenCL C 3.0                                               |
169 | | Compute Units  | 24 at 1200 MHz (192 cores, 0.461 TFLOPs/s)                 |
170 | | Memory, Cache  | 6500 MB RAM, 768 KB global / 64 KB local                   |
171 | | Buffer Limits  | 3250 MB global, 3328048 KB constant                        |
172 | |----------------'------------------------------------------------------------|
173 | | Info: OpenCL C code successfully compiled.                                  |
174 | | FP64  compute                                         0.112 TFLOPs/s (1/4 ) |
175 | | FP32  compute                                         0.437 TFLOPs/s ( 1x ) |
176 | | FP16  compute                                         0.801 TFLOPs/s ( 2x ) |
177 | | INT64 compute                                         0.016  TIOPs/s (1/32) |
178 | | INT32 compute                                         0.149  TIOPs/s (1/3 ) |
179 | | INT16 compute                                         0.863  TIOPs/s ( 2x ) |
180 | | INT8  compute                                         0.213  TIOPs/s (1/2 ) |
181 | | Memory Bandwidth ( coalesced read      )                         20.98 GB/s |
182 | | Memory Bandwidth ( coalesced      write)                         25.18 GB/s |
183 | | Memory Bandwidth (misaligned read      )                         35.16 GB/s |
184 | | Memory Bandwidth (misaligned      write)                         16.18 GB/s |
185 | |-----------------------------------------------------------------------------|
186 | ```
187 | ```
188 | |----------------.------------------------------------------------------------|
189 | | Device ID      | 2                                                          |
190 | | Device Name    | Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz                   |
191 | | Device Vendor  | Intel(R) Corporation                                       |
192 | | Device Driver  | 2024.17.3.0.08_160000 (Windows)                            |
193 | | OpenCL Version | OpenCL C 3.0                                               |
194 | | Compute Units  | 12 at 3700 MHz (6 cores, 0.710 TFLOPs/s)                   |
195 | | Memory, Cache  | 16250 MB RAM, 256 KB global / 32 KB local                  |
196 | | Buffer Limits  | 16250 MB global, 128 KB constant                           |
197 | |----------------'------------------------------------------------------------|
198 | | Info: OpenCL C code successfully compiled.                                  |
199 | | FP64  compute                                         0.151 TFLOPs/s (1/4 ) |
200 | | FP32  compute                                         0.158 TFLOPs/s (1/4 ) |
201 | | FP16  compute                                          not supported        |
202 | | INT64 compute                                         0.042  TIOPs/s (1/16) |
203 | | INT32 compute                                         0.063  TIOPs/s (1/12) |
204 | | INT16 compute                                         0.224  TIOPs/s (1/3 ) |
205 | | INT8  compute                                         0.059  TIOPs/s (1/12) |
206 | | Memory Bandwidth ( coalesced read      )                         16.92 GB/s |
207 | | Memory Bandwidth ( coalesced      write)                          8.08 GB/s |
208 | | Memory Bandwidth (misaligned read      )                         40.02 GB/s |
209 | | Memory Bandwidth (misaligned      write)                         13.69 GB/s |
210 | |-----------------------------------------------------------------------------|
211 | ```


--------------------------------------------------------------------------------
/src/utilities.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #define UTILITIES_REGEX
  4 | //#define UTILITIES_FILE
  5 | #define CONSOLE_WIDTH 79
  6 | #define UTILITIES_NO_CPP17
  7 | 
  8 | #pragma warning(disable:26451)
  9 | #pragma warning(disable:6386)
#include <cmath>
#include <cstdlib> // atoi, atoll, strtoul, strtoull, rand, exit
#include <cstring> // memcpy
#include <vector>
#include <string>
#ifdef UTILITIES_REGEX
#include <regex> // contains <string>, <vector>, <algorithm> and others
#endif // UTILITIES_REGEX
#include <iostream>
#include <chrono>
#include <thread>
 19 | #undef min
 20 | #undef max
 21 | using std::string;
 22 | using std::vector;
 23 | using std::thread;
// short names for the integer types used throughout this header
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned int uint;
typedef int64_t slong; // signed 64-bit integer
typedef uint64_t ulong; // unsigned 64-bit integer
// mathematical constants
#define pif 3.1415927f // pi as float literal
#define pi 3.141592653589793 // pi as double literal
// smallest/largest values of the integer types
#define min_char ((char)-128)
#define max_char ((char)127)
#define max_uchar ((uchar)255)
#define min_short ((short)-32768)
#define max_short ((short)32767)
#define max_ushort ((ushort)65535)
#define min_int ((int)-2147483648)
#define max_int 2147483647
#define max_uint 4294967295u
#define min_slong ((slong)-9223372036854775808ll)
#define max_slong 9223372036854775807ll
#define max_ulong 18446744073709551615ull
// float limits; inf_float/nan_float expand to calls of as_float() (defined further below) with the corresponding bit pattern
#define min_float 1.401298464E-45f
#define max_float 3.402823466E38f
#define epsilon_float 1.192092896E-7f
#define inf_float as_float(0x7F800000)
#define nan_float as_float(0xFFFFFFFF)
// double limits; inf_double/nan_double expand to calls of as_double() (defined further below) with the corresponding bit pattern
#define min_double 4.9406564584124654E-324
#define max_double 1.7976931348623158E308
#define epsilon_double 2.2204460492503131E-16
#define inf_double as_double(0x7FF0000000000000)
#define nan_double as_double(0xFFFFFFFFFFFFFFFF)
 53 | 
 54 | class Clock {
 55 | private:
 56 | 	typedef std::chrono::high_resolution_clock clock;
 57 | 	std::chrono::time_point<clock> t;
 58 | public:
 59 | 	inline Clock() { start(); }
 60 | 	inline void start() { t = clock::now(); }
 61 | 	inline double stop() const { return std::chrono::duration_cast<std::chrono::duration<double>>(clock::now()-t).count(); }
 62 | };
 63 | inline void sleep(const double t) {
 64 | 	if(t>0.0) std::this_thread::sleep_for(std::chrono::milliseconds((int)(1E3*t+0.5)));
 65 | }
 66 | 
 67 | inline float as_float(const uint x) {
 68 | 	return *(float*)&x;
 69 | }
 70 | inline uint as_uint(const float x) {
 71 | 	return *(uint*)&x;
 72 | }
 73 | inline double as_double(const ulong x) {
 74 | 	return *(double*)&x;
 75 | }
 76 | inline ulong as_ulong(const double x) {
 77 | 	return *(ulong*)&x;
 78 | }
 79 | 
// Converts the 16-bit pattern x (1 sign bit, 5 exponent bits, 10 mantissa bits) into a float by assembling the float's bit pattern and passing it to as_float().
inline float half_to_float(const ushort x) { // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits
	const uint e = (x&0x7C00)>>10; // exponent
	const uint m = (x&0x03FF)<<13; // mantissa
	const uint v = as_uint((float)m)>>23; // evil log2 bit hack to count leading zeros in denormalized format
	// the three OR-ed terms are: sign bit moved from bit 15 to bit 31; for e!=0 the exponent re-biased by 112 plus the mantissa; for e==0 and m!=0 an exponent derived from v plus the left-shifted mantissa
	return as_float((x&0x8000)<<16 | (e!=0)*((e+112)<<23|m) | ((e==0)&(m!=0))*((v-37)<<23|((m<<(150-v))&0x007FE000))); // sign : normalized : denormalized
}
// Converts float x into a 16-bit pattern (1 sign bit, 5 exponent bits, 10 mantissa bits), working on the bits obtained from as_uint().
// Inputs whose (rounded) float exponent field is 101 or below yield only the sign bit; exponent fields above 143 additionally OR in 0x7FFF.
inline ushort float_to_half(const float x) { // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits
	const uint b = as_uint(x)+0x00001000; // round-to-nearest-even: add last bit after truncated mantissa
	const uint e = (b&0x7F800000)>>23; // exponent
	const uint m = b&0x007FFFFF; // mantissa; in line below: 0x007FF000 = 0x00800000-0x00001000 = decimal indicator flag - initial rounding
	return (b&0x80000000)>>16 | (e>112)*((((e-112)<<10)&0x7C00)|m>>13) | ((e<113)&(e>101))*((((0x007FF000+m)>>(125-e))+1)>>1) | (e>143)*0x7FFF; // sign : normalized : denormalized : saturate
}
 92 | 
 93 | inline float sq(const float x) {
 94 | 	return x*x;
 95 | }
 96 | inline float cb(const float x) {
 97 | 	return x*x*x;
 98 | }
 99 | inline float pow(const float x, const uint n) {
100 | 	float r = 1.0f;
101 | 	for(uint i=0u; i<n; i++) {
102 | 		r *= x;
103 | 	}
104 | 	return r;
105 | }
inline float sign(const float x) { // 1 if x>=0, otherwise -1
	if(x>=0.0f) return 1.0f;
	return -1.0f;
}
inline float clamp(const float x, const float a, const float b) { // limit x to the range from a (lower bound) to b (upper bound)
	const auto lower_bounded = fmax(x, a);
	return fmin(lower_bounded, b);
}
inline float rsqrt(const float x) { // reciprocal square root
	const auto root = sqrt(x);
	return 1.0f/root;
}
inline float ln(const float x) { // natural logarithm
	const auto result = log(x);
	return result;
}
inline float random(const float x=1.0f) { // pseudo-random number between 0 and x, based on rand()
	const float unit = (float)rand()/(float)RAND_MAX;
	return x*unit;
}
inline float random_symmetric(const float x=1.0f) { // pseudo-random number between -x and x, based on rand()
	const float centered = (float)rand()/(float)RAND_MAX-0.5f;
	return 2.0f*x*centered;
}
124 | 
inline double sq(const double x) { // x squared
	const double squared = x*x;
	return squared;
}
inline double cb(const double x) { // x cubed
	const double squared = x*x;
	return squared*x;
}
131 | inline double pow(const double x, const uint n) {
132 | 	double r = 1.0;
133 | 	for(uint i=0u; i<n; i++) {
134 | 		r *= x;
135 | 	}
136 | 	return r;
137 | }
inline double sign(const double x) { // 1 if x>=0, otherwise -1
	if(x>=0.0) return 1.0;
	return -1.0;
}
inline double clamp(const double x, const double a, const double b) { // limit x to the range from a (lower bound) to b (upper bound)
	const double lower_bounded = fmax(x, a);
	return fmin(lower_bounded, b);
}
inline double rsqrt(const double x) { // reciprocal square root
	const double root = sqrt(x);
	return 1.0/root;
}
inline double ln(const double x) { // natural logarithm
	const double result = log(x);
	return result;
}
150 | 
inline int sq(const int x) { // x squared
	const int squared = x*x;
	return squared;
}
inline int cb(const int x) { // x cubed
	const int squared = x*x;
	return squared*x;
}
157 | inline int pow(const int x, const uint n) {
158 | 	int r = 1;
159 | 	for(uint i=0u; i<n; i++) {
160 | 		r *= x;
161 | 	}
162 | 	return r;
163 | }
inline int sign(const int x) { // 1 if x>=0, otherwise -1; derived from bit 31 of x without branching
	const int negative = x>>31&1;
	return 1-2*negative;
}
inline int min(const int x, const int y) { // smaller of x and y
	if(x<y) return x;
	return y;
}
inline int max(const int x, const int y) { // larger of x and y
	if(x>y) return x;
	return y;
}
inline int clamp(const int x, const int a, const int b) { // limit x to the range from a (lower bound) to b (upper bound)
	const int lower_bounded = max(x, a);
	return min(lower_bounded, b);
}
176 | 
177 | inline uint sq(const uint x) {
178 | 	return x*x;
179 | }
180 | inline uint cb(const uint x) {
181 | 	return x*x*x;
182 | }
183 | inline uint pow(const uint x, const uint n) {
184 | 	uint r = 1u;
185 | 	for(uint i=0u; i<n; i++) {
186 | 		r *= x;
187 | 	}
188 | 	return r;
189 | }
190 | inline uint min(const uint x, const uint y) {
191 | 	return x<y?x:y;
192 | }
193 | inline uint max(const uint x, const uint y) {
194 | 	return x>y?x:y;
195 | }
196 | inline uint clamp(const uint x, const uint a, const uint b) {
197 | 	return min(max(x, a), b);
198 | }
199 | inline uint gcd(uint x, uint y) { // greatest common divisor
200 | 	if(x*y==0u) return 0u;
201 | 	uint t;
202 | 	while(y!=0u) {
203 | 		t = y;
204 | 		y = x%y;
205 | 		x = t;
206 | 	}
207 | 	return x;
208 | }
209 | inline uint lcm(const uint x, const uint y) { // least common multiple
210 | 	return x*y==0u ? 0u : x*y/gcd(x, y);
211 | }
212 | 
213 | inline slong sq(const slong x) {
214 | 	return x*x;
215 | }
216 | inline slong cb(const slong x) {
217 | 	return x*x*x;
218 | }
219 | inline slong pow(const slong x, const uint n) {
220 | 	slong r = 1ll;
221 | 	for(uint i=0u; i<n; i++) {
222 | 		r *= x;
223 | 	}
224 | 	return r;
225 | }
226 | inline slong sign(const slong x) {
227 | 	return 1ll-2ll*(x>>63&1ll);
228 | }
229 | inline slong min(const slong x, const slong y) {
230 | 	return x<y?x:y;
231 | }
232 | inline slong max(const slong x, const slong y) {
233 | 	return x>y?x:y;
234 | }
235 | inline slong clamp(const slong x, const slong a, const slong b) {
236 | 	return min(max(x, a), b);
237 | }
238 | 
239 | inline ulong sq(const ulong x) {
240 | 	return x*x;
241 | }
242 | inline ulong cb(const ulong x) {
243 | 	return x*x*x;
244 | }
245 | inline ulong pow(const ulong x, const uint n) {
246 | 	ulong r = 1ull;
247 | 	for(uint i=0u; i<n; i++) {
248 | 		r *= x;
249 | 	}
250 | 	return r;
251 | }
252 | inline ulong min(const ulong x, const ulong y) {
253 | 	return x<y?x:y;
254 | }
255 | inline ulong max(const ulong x, const ulong y) {
256 | 	return x>y?x:y;
257 | }
258 | inline ulong clamp(const ulong x, const ulong a, const ulong b) {
259 | 	return min(max(x, a), b);
260 | }
261 | inline ulong gcd(ulong x, ulong y) { // greatest common divisor
262 | 	if(x*y==0ull) return 0ull;
263 | 	ulong t;
264 | 	while(y!=0ull) {
265 | 		t = y;
266 | 		y = x%y;
267 | 		x = t;
268 | 	}
269 | 	return x;
270 | }
271 | inline ulong lcm(const ulong x, const ulong y) { // least common multiple
272 | 	return x*y==0ull ? 0ull : x*y/gcd(x, y);
273 | }
274 | 
inline int to_int(const float x) { // round float to int: add 0.5, subtract 1 if x is negative, then truncate
	const float negative_offset = (float)(x<0.0f);
	return (int)(x+0.5f-negative_offset);
}
inline int to_int(const double x) { // round double to int: add 0.5, subtract 1 if x is negative, then truncate
	const double negative_offset = (double)(x<0.0);
	return (int)(x+0.5-negative_offset);
}
inline uint to_uint(const float x) { // round float to uint: add 0.5 and truncate; the fmax with 0.5 maps negative x to 0
	const auto shifted = fmax(x+0.5f, 0.5f);
	return (uint)shifted;
}
inline uint to_uint(const double x) { // round double to uint: add 0.5 and truncate; the fmax with 0.5 maps negative x to 0
	const auto shifted = fmax(x+0.5, 0.5);
	return (uint)shifted;
}
287 | inline slong to_slong(const float x) {
288 | 	return (slong)(x+0.5f);
289 | }
290 | inline slong to_slong(const double x) {
291 | 	return (slong)(x+0.5);
292 | }
inline ulong to_ulong(const float x) { // round float to ulong: add 0.5 and truncate; the fmax with 0.5 maps negative x to 0
	const auto shifted = fmax(x+0.5f, 0.5f);
	return (ulong)shifted;
}
inline ulong to_ulong(const double x) { // round double to ulong: add 0.5 and truncate; the fmax with 0.5 maps negative x to 0
	const auto shifted = fmax(x+0.5, 0.5);
	return (ulong)shifted;
}
299 | 
// Splits x into the parts of a decimal scientific notation: one leading digit (integral), 8 digits after the decimal point (decimal) and a power-of-ten exponent.
// x is scaled by powers of ten until it is below 10 (for x>=10) or at least 1 (for 0<x<1); every scaling step adds to / subtracts from exponent.
// exponent is only modified incrementally, never assigned, so the caller has to initialize it (to_string(float) in this file passes 0).
// The sign is not handled here; to_string(float) passes the absolute value.
inline void split_float(float x, uint& integral, uint& decimal, int& exponent) {
	if(x>=10.0f) { // convert to base 10
		if(x>=1E32f) { x *= 1E-32f; exponent += 32; }
		if(x>=1E16f) { x *= 1E-16f; exponent += 16; }
		if(x>= 1E8f) { x *=  1E-8f; exponent +=  8; }
		if(x>= 1E4f) { x *=  1E-4f; exponent +=  4; }
		if(x>= 1E2f) { x *=  1E-2f; exponent +=  2; }
		if(x>= 1E1f) { x *=  1E-1f; exponent +=  1; }
	}
	if(x>0.0f && x<=1.0f) { // scale small values up; x==0 is skipped and keeps exponent unchanged
		if(x<1E-31f) { x *=  1E32f; exponent -= 32; }
		if(x<1E-15f) { x *=  1E16f; exponent -= 16; }
		if(x< 1E-7f) { x *=   1E8f; exponent -=  8; }
		if(x< 1E-3f) { x *=   1E4f; exponent -=  4; }
		if(x< 1E-1f) { x *=   1E2f; exponent -=  2; }
		if(x<  1E0f) { x *=   1E1f; exponent -=  1; }
	}
	integral = (uint)x; // digit(s) in front of the decimal point
	float remainder = (x-integral)*1E8f; // 8 decimal digits
	decimal = (uint)remainder;
	if(remainder-(float)decimal>=0.5f) { // correct rounding of last decimal digit
		decimal++;
		if(decimal>=100000000u) { // decimal overflow
			decimal = 0u;
			integral++;
			if(integral>=10u) { // decimal overflow causes integral overflow
				integral = 1u;
				exponent++;
			}
		}
	}
}
// Splits x into the parts of a decimal scientific notation: one leading digit (integral), 16 digits after the decimal point (decimal) and a power-of-ten exponent.
// x is scaled by powers of ten until it is below 10 (for x>=10) or at least 1 (for 0<x<1); every scaling step adds to / subtracts from exponent.
// exponent is only modified incrementally, never assigned, so the caller has to initialize it (to_string(double) in this file passes 0).
// The sign is not handled here; to_string(double) passes the absolute value.
inline void split_double(double x, uint& integral, ulong& decimal, int& exponent) {
	if(x>=10.0) { // convert to base 10
		if(x>=1E256) { x *= 1E-256; exponent += 256; }
		if(x>=1E128) { x *= 1E-128; exponent += 128; }
		if(x>= 1E64) { x *=  1E-64; exponent +=  64; }
		if(x>= 1E32) { x *=  1E-32; exponent +=  32; }
		if(x>= 1E16) { x *=  1E-16; exponent +=  16; }
		if(x>=  1E8) { x *=   1E-8; exponent +=   8; }
		if(x>=  1E4) { x *=   1E-4; exponent +=   4; }
		if(x>=  1E2) { x *=   1E-2; exponent +=   2; }
		if(x>=  1E1) { x *=   1E-1; exponent +=   1; }
	}
	if(x>0.0 && x<=1.0) { // scale small values up; x==0 is skipped and keeps exponent unchanged
		if(x<1E-255) { x *=  1E256; exponent -= 256; }
		if(x<1E-127) { x *=  1E128; exponent -= 128; }
		if(x< 1E-63) { x *=   1E64; exponent -=  64; }
		if(x< 1E-31) { x *=   1E32; exponent -=  32; }
		if(x< 1E-15) { x *=   1E16; exponent -=  16; }
		if(x<  1E-7) { x *=    1E8; exponent -=   8; }
		if(x<  1E-3) { x *=    1E4; exponent -=   4; }
		if(x<  1E-1) { x *=    1E2; exponent -=   2; }
		if(x<   1E0) { x *=    1E1; exponent -=   1; }
	}
	integral = (uint)x; // digit(s) in front of the decimal point
	double remainder = (x-integral)*1E16; // 16 decimal digits
	decimal = (ulong)remainder;
	if(remainder-(double)decimal>=0.5) { // correct rounding of last decimal digit
		decimal++;
		if(decimal>=10000000000000000ull) { // decimal overflow
			decimal = 0ull;
			integral++;
			if(integral>=10u) { // decimal overflow causes integral overflow
				integral = 1u;
				exponent++;
			}
		}
	}
}
inline string decimal_to_string_float(uint x, int digits) { // the lowest 'digits' decimal digits of x as string, padded with leading zeros; empty string for digits<=0
	string r(digits>0 ? (uint)digits : 0u, '0');
	for(int i=digits-1; i>=0; i--) { // fill from the last (least significant) digit to the first
		r[i] = (char)(x%10u+48u); // 48 is the character code of '0'
		x /= 10u;
	}
	return r;
}
inline string decimal_to_string_double(ulong x, int digits) { // the lowest 'digits' decimal digits of x as string, padded with leading zeros; empty string for digits<=0
	string r(digits>0 ? (uint)digits : 0u, '0');
	for(int i=digits-1; i>=0; i--) { // fill from the last (least significant) digit to the first
		r[i] = (char)(x%10ull+48ull); // 48 is the character code of '0'
		x /= 10ull;
	}
	return r;
}
386 | 
inline vector<string> get_main_arguments(int argc, char* argv[]) { // command line arguments argv[1] ... argv[argc-1] as strings (the program name argv[0] is left out)
	vector<string> arguments;
	if(argc>1) arguments.assign(argv+1, argv+argc);
	return arguments;
}
390 | 
391 | inline string to_string(const string& s){
392 | 	return s;
393 | }
394 | inline string to_string(char c) {
395 | 	return string(1, c);
396 | }
397 | inline string to_string(uchar c) {
398 | 	return string(1, c);
399 | }
400 | inline string to_string(ulong x) {
401 | 	string r = "";
402 | 	do {
403 | 		r = (char)(x%10ull+48ull)+r;
404 | 		x /= 10ull;
405 | 	} while(x);
406 | 	return r;
407 | }
408 | inline string to_string(slong x) {
409 | 	return x>=0ll ? to_string((ulong)x) : "-"+to_string((ulong)(-x));
410 | }
411 | inline string to_string(uint x) {
412 | 	string r = "";
413 | 	do {
414 | 		r = (char)(x%10u+48u)+r;
415 | 		x /= 10u;
416 | 	} while(x);
417 | 	return r;
418 | }
419 | inline string to_string(int x) {
420 | 	return x>=0 ? to_string((uint)x) : "-"+to_string((uint)(-x));
421 | }
422 | inline string to_string_hex(ulong x) {
423 | 	string r = "";
424 | 	for(uint i=0u; i<16u; i++) {
425 | 		const uint hex_char = (uint)(x&0xFull);
426 | 		r = (char)(hex_char+(hex_char<10u ? 48u : 55u))+r;
427 | 		x >>= 4u;
428 | 	}
429 | 	return "0x"+r;
430 | }
431 | inline string to_string_hex(slong x) {
432 | 	return to_string_hex(*(ulong*)&x);
433 | }
434 | inline string to_string_hex(uint x) {
435 | 	string r = "";
436 | 	for(uint i=0u; i<8u; i++) {
437 | 		const uint hex_char = x&0xFu;
438 | 		r = (char)(hex_char+(hex_char<10u ? 48u : 55u))+r;
439 | 		x >>= 4u;
440 | 	}
441 | 	return "0x"+r;
442 | }
443 | inline string to_string_hex(int x) {
444 | 	return to_string_hex(*(uint*)&x);
445 | }
446 | inline string to_string(float x) { // convert float to string with full precision (<string> to_string() prints only 6 decimals)
447 | 	string s = "";
448 | 	if(x<0.0f) { s += "-"; x = -x; }
449 | 	if(std::isnan(x)) return s+"NaN";
450 | 	if(std::isinf(x)) return s+"Inf";
451 | 	uint integral, decimal;
452 | 	int exponent = 0;
453 | 	split_float(x, integral, decimal, exponent);
454 | 	return s+to_string(integral)+"."+decimal_to_string_float(decimal, 8)+(exponent!=0?"E"+to_string(exponent):"");
455 | }
456 | inline string to_string(double x) { // convert double to string with full precision (<string> to_string() prints only 6 decimals)
457 | 	string s = "";
458 | 	if(x<0.0) { s += "-"; x = -x; }
459 | 	if(std::isnan(x)) return s+"NaN";
460 | 	if(std::isinf(x)) return s+"Inf";
461 | 	uint integral;
462 | 	ulong decimal;
463 | 	int exponent = 0;
464 | 	split_double(x, integral, decimal, exponent);
465 | 	return s+to_string(integral)+"."+decimal_to_string_double(decimal, 16)+(exponent!=0?"E"+to_string(exponent):"");
466 | }
467 | inline string to_string(float x, const uint decimals) { // convert float to string with specified number of decimals
468 | 	string s = "";
469 | 	if(x<0.0f) { s += "-"; x = -x; }
470 | 	if(std::isnan(x)) return s+"NaN";
471 | 	if(std::isinf(x)||x>(float)max_ulong) return s+"Inf";
472 | 	const float power = pow(10.0f, min(decimals, 8u));
473 | 	x += 0.5f/power; // rounding
474 | 	const ulong integral = (ulong)x;
475 | 	const uint decimal = (uint)((x-(float)integral)*power);
476 | 	return s+to_string(integral)+(decimals==0u ? "" : "."+decimal_to_string_float(decimal, min((int)decimals, 8)));
477 | }
478 | inline string to_string(double x, const uint decimals) { // convert float to string with specified number of decimals
479 | 	string s = "";
480 | 	if(x<0.0) { s += "-"; x = -x; }
481 | 	if(std::isnan(x)) return s+"NaN";
482 | 	if(std::isinf(x)||x>(double)max_ulong) return s+"Inf";
483 | 	const double power = pow(10.0, min(decimals, 16u));
484 | 	x += 0.5/power; // rounding
485 | 	const ulong integral = (ulong)x;
486 | 	const ulong decimal = (ulong)((x-(double)integral)*power);
487 | 	return s+to_string(integral)+(decimals==0u ? "" : "."+decimal_to_string_double(decimal, min((int)decimals, 16)));
488 | }
489 | 
inline uint length(const string& s) { // number of characters in s as uint
	const uint characters = (uint)s.length();
	return characters;
}
inline bool contains(const string& s, const string& match) { // true if match occurs anywhere in s
	const bool found = s.find(match)!=string::npos;
	return found;
}
inline bool contains_any(const string& s, const vector<string>& matches) { // true if at least one element of matches occurs in s
	for(const string& candidate : matches) {
		if(contains(s, candidate)) return true;
	}
	return false;
}
500 | inline string to_lower(const string& s) {
501 | 	string r = "";
502 | 	for(uint i=0u; i<(uint)s.length(); i++) {
503 | 		const uchar c = s.at(i);
504 | 		r += c>64u&&c<91u ? c+32u : c;
505 | 	}
506 | 	return r;
507 | }
508 | inline string to_upper(const string& s) {
509 | 	string r = "";
510 | 	for(uint i=0u; i<(uint)s.length(); i++) {
511 | 		const uchar c = s.at(i);
512 | 		r += c>96u&&c<123u ? c-32u : c;
513 | 	}
514 | 	return r;
515 | }
516 | inline bool equals(const string& a, const string& b) {
517 | 	return to_lower(a)==to_lower(b);
518 | }
inline string replace(const string& s, const string& from, const string& to) { // copy of s with every occurrence of from (searched left to right, non-overlapping) replaced by to; an empty from leaves s unchanged
	string r = s;
	if(from.empty()) return r; // an empty search string matches at every position, so the loop below would never terminate
	string::size_type p = 0; // size_type instead of int: positions beyond 2^31 are not truncated and the comparison with npos is exact
	while((p=r.find(from, p))!=string::npos) {
		r.replace(p, from.length(), to);
		p += to.length(); // continue behind the inserted text, so that it is never searched again
	}
	return r;
}
528 | inline string substring(const string& s, const uint start, uint length=max_uint) {
529 | 	return s.substr(start, min(length, (uint)s.length()-start));
530 | }
inline string trim(const string& s) { // removes whitespace characters (' ', \t, \n, \v, \f, \r) and \0 from beginning and end of string s
	const string blanks(" \t\n\v\f\r\0", 7); // explicit length, so that the trailing \0 is part of the set
	const string::size_type first = s.find_first_not_of(blanks);
	if(first==string::npos) return ""; // s is empty or consists only of characters that are removed
	const string::size_type last = s.find_last_not_of(blanks);
	return s.substr(first, last-first+1);
}
inline bool begins_with(const string& s, const string& match) { // true if the first characters of s are equal to match (always true for empty match)
	return match.size()<=s.size() && s.compare(0, match.size(), match)==0;
}
inline bool ends_with(const string& s, const string& match) { // true if the last characters of s are equal to match (always true for empty match)
	return match.size()<=s.size() && s.compare(s.size()-match.size(), match.size(), match)==0;
}
template<class T> inline bool contains(const vector<T>& v, const T& match) { // true if at least one element of v compares equal to match
	for(const T& element : v) {
		if(element==match) return true;
	}
	return false;
}
550 | 
inline string alignl(const uint n, const string& x="") { // left-aligned: x followed by spaces up to a total length of n; x is returned unchanged if it is not shorter than n
	string s = x;
	if(s.length()<n) s.append(n-s.length(), ' ');
	return s;
}
inline string alignr(const uint n, const string& x="") { // right-aligned: x preceded by spaces up to a total length of n; x is returned unchanged if it is not shorter than n
	if(x.length()>=n) return x;
	return string(n-x.length(), ' ')+x;
}
template<typename T> inline string alignl(const uint n, const T x) { // left-aligned string representation of x, see alignl(n, string)
	const string text = to_string(x);
	return alignl(n, text);
}
template<typename T> inline string alignr(const uint n, const T x) { // right-aligned string representation of x, see alignr(n, string)
	const string text = to_string(x);
	return alignr(n, text);
}
568 | 
inline void print(const string& s="") { // write s to standard output
	std::cout << s;
}
inline void println(const string& s="") { // write s followed by a newline to standard output (as a single string)
	const string line = s+"\n";
	std::cout << line;
}
inline void reprint(const string& s="") { // write a carriage return followed by s to standard output (as a single string)
	const string line = "\r"+s;
	std::cout << line;
}
inline void wait() { // read one character from standard input and discard it
	(void)std::cin.get();
}
template<typename T> inline void println(const T& x) { // write the string representation of x followed by a newline to standard output
	const string text = to_string(x);
	println(text);
}
584 | 
585 | #ifdef UTILITIES_REGEX
inline vector<string> split_regex(const string& s, const string& separator="\\s+") { // pieces of s between the matches of the regex separator (default: runs of whitespace)
	vector<string> pieces;
	const std::regex rgx(separator);
	for(std::sregex_token_iterator token(s.begin(), s.end(), rgx, -1), end; token!=end; ++token) { // submatch index -1 selects the text between matches
		pieces.push_back(token->str());
	}
	return pieces;
}
inline bool equals_regex(const string& s, const string& match) { // returns true if string exactly matches regex
	const std::regex pattern(match);
	return std::regex_match(s, pattern);
}
inline uint matches_regex(const string& s, const string& match) { // counts number of matches
	const std::regex pattern(match);
	uint count = 0u;
	for(std::sregex_iterator it(s.begin(), s.end(), pattern), end; it!=end; ++it) count++;
	return count;
}
inline bool contains_regex(const string& s, const string& match) { // returns true if the regex match is found at least once in s
	const std::regex pattern(match);
	return std::regex_search(s, pattern); // stops at the first match instead of counting all matches
}
inline string replace_regex(const string& s, const string& from, const string& to) { // copy of s with every match of the regex from replaced by to
	const std::regex pattern(from);
	return std::regex_replace(s, pattern, to);
}
611 | inline bool is_number(const string& s) {
612 | 	return equals_regex(s, "\\d+(u|l|ul|ll|ull)?")||equals_regex(s, "0x(\\d|[a-fA-F])+(u|l|ul|ll|ull)?")||equals_regex(s, "0b[01]+(u|l|ul|ll|ull)?")||equals_regex(s, "(((\\d+\\.?\\d*|\\.\\d+)([eE][+-]?\\d+[fF]?)?)|(\\d+\\.\\d*|\\.\\d+)[fF]?)");
613 | }
// Prints message as one or more console lines of the form "| keyword: text |", word-wrapped to CONSOLE_WIDTH.
// message: text to print; it is split into words at whitespace (split_regex with its default separator)
// keyword: label printed in front of the first line; continuation lines are indented by the same width
// keyword_color: not used in this function
// colons: if non-zero, ": " separates keyword and message, otherwise two spaces
inline void print_message(const string& message, const string& keyword="", const int keyword_color=-1, const int colons=true) { // print formatted message
	const uint k=length(keyword)+2u, w=CONSOLE_WIDTH-4u-k; // k: width of keyword plus separator, w: width available for message text per line
	string p=colons?": ":"  ", f=""; // p: output assembled so far (begins with the separator), f: indentation of continuation lines
	for(uint j=0u; j<k; j++) f += " ";
	vector<string> v = split_regex(message);
	uint l = 0u; // length of current line of words
	for(uint i=0u; i<(uint)v.size(); i++) {
		const string word = v.at(i);
		const uint wordlength = length(word);
		l += wordlength+1u; // word + space
		if(l<=w+1u) { // word fits -> append word and space
			p += word+" ";
		} else if(wordlength>w) { // word overflows -> split word into next line
			p += substring(word, 0, w-(l-wordlength-1u))+" |\n| "+f;
			v[i] = substring(v[i], w-(l-wordlength-1u)); i--; // reuse same vector element for overflowing part, decrement i to start next line with this overflowing part
			l = 0u; // reset line length
		} else { // word does not fit -> fill remaining line with spaces
			l = l-length(v.at(i--))-1u; // remove word from line, decrement i to start next line with this word
			for(uint j=l; j<=w; j++) p += " ";
			p += "|\n| "+f;
			l = 0u; // reset line length
		}
	}
	for(uint j=l; j<=w; j++) p += " "; // pad the last line up to the closing border
	println("\r| "+keyword+p+"|");
}
640 | inline void print_error(const string& s) { // print formatted error message
641 | 	print_message(s, "Error");
642 | #ifdef _WIN32
643 | 	print_message("Press Enter to exit.", "     ", -1, false);
644 | #endif // _WIN32
645 | 	string b = "";
646 | 	for(int i=0; i<CONSOLE_WIDTH-2; i++) b += "-";
647 | 	println("'"+b+"'");
648 | #ifdef _WIN32
649 | 	wait();
650 | #endif //_WIN32
651 | 	exit(1);
652 | }
653 | inline void print_warning(const string& s) { // print formatted warning message
654 | 	print_message(s, "Warning");
655 | }
656 | inline void print_info(const string& s) { // print formatted info message
657 | 	print_message(s, "Info");
658 | }
659 | 
660 | inline void parse_sanity_check_error(const string& s, const string& regex, const string& type) {
661 | 	if(!equals_regex(s, regex)) print_error("\""+s+"\" cannot be parsed to "+type+".");
662 | }
663 | inline int to_int(const string& s) {
664 | 	const string t = trim(s);
665 | 	parse_sanity_check_error(t, "[+-]?\\d+", "int");
666 | 	return atoi(t.c_str());
667 | }
668 | inline uint to_uint(const string& s) {
669 | 	const string t = trim(s);
670 | 	parse_sanity_check_error(t, "\\+?\\d+", "uint");
671 | 	return (uint)atoi(t.c_str());
672 | }
673 | inline slong to_slong(const string& s) {
674 | 	const string t = trim(s);
675 | 	parse_sanity_check_error(t, "[+-]?\\d+", "slong");
676 | 	return (slong)atoll(t.c_str());
677 | }
678 | inline ulong to_ulong(const string& s) {
679 | 	const string t = trim(s);
680 | 	parse_sanity_check_error(t, "\\+?\\d+", "ulong");
681 | 	return (ulong)atoll(t.c_str());
682 | }
683 | inline float to_float(const string& s) {
684 | 	const string t = trim(s);
685 | 	parse_sanity_check_error(t, "[+-]?(((\\d+\\.?\\d*|\\.\\d+)([eE][+-]?\\d+[fF]?)?)|(\\d+\\.\\d*|\\.\\d+)[fF]?)", "float");
686 | 	return (float)atof(t.c_str());
687 | }
688 | inline double to_double(const string& s) {
689 | 	const string t = trim(s);
690 | 	parse_sanity_check_error(t, "[+-]?(((\\d+\\.?\\d*|\\.\\d+)([eE][+-]?\\d+[fF]?)?)|(\\d+\\.\\d*|\\.\\d+)[fF]?)", "double");
691 | 	return atof(t.c_str());
692 | }
693 | 
694 | inline bool parse_sanity_check(const string& s, const string& regex) {
695 | 	return equals_regex(s, regex);
696 | }
697 | inline int to_int(const string& s, const int default_value) {
698 | 	const string t = trim(s);
699 | 	return parse_sanity_check(t, "[+-]?\\d+") ? atoi(t.c_str()) : default_value;
700 | }
701 | inline uint to_uint(const string& s, const uint default_value) {
702 | 	const string t = trim(s);
703 | 	return parse_sanity_check(t, "\\+?\\d+") ? (uint)atoi(t.c_str()) : default_value;
704 | }
705 | inline slong to_slong(const string& s, const slong default_value) {
706 | 	const string t = trim(s);
707 | 	return parse_sanity_check(t, "[+-]?\\d+") ? (slong)atoll(t.c_str()) : default_value;
708 | }
709 | inline ulong to_ulong(const string& s, const ulong default_value) {
710 | 	const string t = trim(s);
711 | 	return parse_sanity_check(t, "\\+?\\d+") ? (ulong)atoll(t.c_str()) : default_value;
712 | }
713 | inline float to_float(const string& s, const float default_value) {
714 | 	const string t = trim(s);
715 | 	return parse_sanity_check(t, "[+-]?(((\\d+\\.?\\d*|\\.\\d+)([eE][+-]?\\d+[fF]?)?)|(\\d+\\.\\d*|\\.\\d+)[fF]?)") ? (float)atof(t.c_str()) : default_value;
716 | }
717 | inline double to_double(const string& s, const double default_value) {
718 | 	const string t = trim(s);
719 | 	return parse_sanity_check(t, "[+-]?(((\\d+\\.?\\d*|\\.\\d+)([eE][+-]?\\d+[fF]?)?)|(\\d+\\.\\d*|\\.\\d+)[fF]?)") ? atof(t.c_str()) : default_value;
720 | }
721 | #else // UTILITIES_REGEX
722 | inline void print_message(const string& message, const string& keyword="", const int colons=true) { // print message
723 | 	println(keyword+": "+message);
724 | }
725 | inline void print_error(const string& s) { // print error message
726 | 	println("Error: "+s);
727 | #ifdef _WIN32
728 | 	println("       Press Enter to exit.");
729 | 	wait();
730 | #endif //_WIN32
731 | 	exit(1);
732 | }
733 | inline void print_warning(const string& s) { // print warning message
734 | 	println("Warning: "+s);
735 | }
736 | inline void print_info(const string& s) { // print info message
737 | 	println("Info: "+s);
738 | }
739 | #endif // UTILITIES_REGEX
740 | 
// Sets an environment variable for the current process from a "VARIABLE=VALUE" string.
// s: pointer to the assignment string; with POSIX putenv the pointer itself becomes part of the environment,
//    so it must stay valid for the rest of the program (string literals are fine)
inline void set_environment_variable(char* s) { // usage: set_environment_variable((char*)"VARIABLE=VALUE");
#if defined(_WIN32)
	(void)_putenv(s);
#elif defined(__linux__)||defined(__APPLE__)||defined(__unix__)
	(void) putenv(s); // putenv is POSIX, so it is also available on macOS and other Unix systems, not only on Linux
#else // platform without a known putenv
	(void)s; // nothing to do, avoid unused-parameter warning
#endif // platform
}
748 | 
749 | #ifdef UTILITIES_FILE
750 | #include <fstream> // read/write files
751 | #ifndef UTILITIES_NO_CPP17
752 | #include <filesystem> // automatically create directory before writing file, requires C++17
// Lists the entries directly inside the directory "path" (no recursion).
// extension: ".*" accepts every entry, otherwise only entries whose extension (including the dot) equals this string
// returns the matching paths as strings; empty if path is not an existing directory
inline vector<string> find_files(const string& path, const string& extension=".*") {
	vector<string> result;
	const bool is_existing_directory = std::filesystem::is_directory(path)&&std::filesystem::exists(path);
	if(!is_existing_directory) return result;
	const bool accept_all = (extension==".*");
	for(const auto& entry : std::filesystem::directory_iterator(path)) {
		const std::filesystem::path& candidate = entry.path();
		if(accept_all||candidate.extension().string()==extension) result.push_back(candidate.string());
	}
	return result;
}
762 | #endif // UTILITIES_NO_CPP17
// Creates the directory part of "path" (everything before the last '/'), including missing parent directories.
// Does nothing if path contains no '/', or if the directory part is empty (file directly in the root directory, e.g. "/file.txt").
// Without C++17 (UTILITIES_NO_CPP17 defined) no directory is created.
inline void create_folder(const string& path) { // create folder if it not already exists
	const auto slash_position = path.rfind('/'); // find last slash dividing the path from the filename
	if(slash_position==string::npos) return; // no slash found
	const string f = path.substr(0, slash_position); // cut off file name if there is any
	if(f.empty()) return; // "/file": the root directory always exists, and create_directories("") would throw
#ifndef UTILITIES_NO_CPP17
	if(!std::filesystem::is_directory(f)) std::filesystem::create_directories(f); // create folder if it not already exists (is_directory already implies existence)
#endif // UTILITIES_NO_CPP17
}
// Replaces the file extension of "filename" with "extension" (given with or without leading dot); appends it if there is none.
// Only a dot after the last '/' counts as extension separator, so dots in directory names ("./bin/file", "out.v2/file") are left alone.
// An empty extension returns the filename with its extension removed.
inline string create_file_extension(const string& filename, const string& extension) {
	const auto slash_position = filename.rfind('/'); // end of the directory part, npos if there is none
	const auto dot_position = filename.rfind('.'); // candidate extension separator
	const bool has_extension = dot_position!=string::npos&&(slash_position==string::npos||dot_position>slash_position);
	const string stem = has_extension ? filename.substr(0, dot_position) : filename; // remove existing file extension if existing
	if(extension.empty()) return stem; // nothing to append; also avoids extension.at(0) throwing on an empty string
	return stem+(extension.at(0)!='.' ? "." : "")+extension; // insert the dot only if the caller did not supply it
}
774 | inline string read_file(const string& filename) {
775 | 	std::ifstream file(filename, std::ios::in);
776 | 	if(file.fail()) print_error("File \""+filename+"\" does not exist!");
777 | 	const string r((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
778 | 	file.close();
779 | 	return r;
780 | }
781 | inline void write_file(const string& filename, const string& content="") {
782 | 	create_folder(filename);
783 | 	std::ofstream file(filename, std::ios::out);
784 | 	file.write(content.c_str(), content.length());
785 | 	file.close();
786 | }
787 | #endif // UTILITIES_FILE


--------------------------------------------------------------------------------
/src/opencl.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #define WORKGROUP_SIZE 64 // needs to be 64 to fully use AMD GPUs
  4 | //#define PTX
  5 | //#define LOG
  6 | 
  7 | // https://github.com/KhronosGroup/OpenCL-Headers
  8 | // https://github.com/KhronosGroup/OpenCL-CLHPP
  9 | #define CL_HPP_MINIMUM_OPENCL_VERSION 100
 10 | #if !defined(__APPLE__) // Windows/Linux/Android
 11 | #define CL_HPP_TARGET_OPENCL_VERSION 300 // Windows/Linux/Android can use OpenCL 3.0
 12 | #else // macOS
 13 | #define CL_HPP_TARGET_OPENCL_VERSION 120 // macOS only supports OpenCL 1.2
 14 | #endif // macOS
 15 | #include <CL/opencl.hpp>
 16 | #include "utilities.hpp"
 17 | using cl::Event;
 18 | 
// Text printed by get_devices() when no OpenCL device is found.
// Windows build: a table of driver download links that continues the message box, ending with a "Press Enter to exit." line.
// Other builds (the "#else // Linux" branch): closes the message box, then lists apt-based shell commands per vendor;
// each section is preceded by an ANSI color escape sequence and the text ends with the reset sequence "\033[0m".
// Everything inside the raw string literals R"(...)" is emitted verbatim, including line breaks.
static const string driver_installation_instructions =
#ifdef _WIN32
R"(|----------------.------------------------------------------------------------'
|       AMD GPUs | https://www.amd.com/en/support/download/drivers.html
|     Intel GPUs | https://www.intel.com/content/www/us/en/download/785597/intel-arc-iris-xe-graphics-windows.html
|    Nvidia GPUs | https://www.nvidia.com/Download/index.aspx
| AMD/Intel CPUs | https://www.intel.com/content/www/us/en/developer/articles/technical/intel-cpu-runtime-for-opencl-applications-with-sycl-support.html
|----------------'------------------------------------------------------------.
| Don't forget to reboot after installation! Press Enter to exit.             |
'-----------------------------------------------------------------------------')""\n";
#else // Linux
string("'-----------------------------------------------------------------------------'\n")+R"(
)"+string("\033[31m")+R"(.-----------------------------------------------------------------------------.
| AMD GPU Drivers, which contain the OpenCL Runtime                           |
'-----------------------------------------------------------------------------'
sudo apt update && sudo apt upgrade -y
sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev
mkdir -p ~/amdgpu
wget -P ~/amdgpu https://repo.radeon.com/amdgpu-install/6.4.2.1/ubuntu/noble/amdgpu-install_6.4.60402-1_all.deb
sudo apt install -y ~/amdgpu/amdgpu-install*.deb
sudo amdgpu-install -y --usecase=graphics,rocm,opencl --opencl=rocr
sudo usermod -a -G render,video $(whoami)
rm -r ~/amdgpu
sudo shutdown -r now

)"+string("\033[36m")+R"(.-----------------------------------------------------------------------------.
| Intel GPU Drivers are already installed, only the OpenCL Runtime is needed  |
'-----------------------------------------------------------------------------'
sudo apt update && sudo apt upgrade -y
sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev intel-opencl-icd
sudo usermod -a -G render $(whoami)
sudo shutdown -r now

)"+string("\033[32m")+R"(.-----------------------------------------------------------------------------.
| Nvidia GPU Drivers, which contain the OpenCL Runtime                        |
'-----------------------------------------------------------------------------'
sudo apt update && sudo apt upgrade -y
sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev nvidia-driver-580
sudo shutdown -r now

)"+string("\033[96m")+R"(.-----------------------------------------------------------------------------.
| CPU Option 1: Intel CPU Runtime for OpenCL (works for both AMD/Intel CPUs)  |
'-----------------------------------------------------------------------------'
export OCLV="oclcpuexp-2025.20.6.0.04_224945_rel"
export TBBV="oneapi-tbb-2022.2.0"
sudo apt update && sudo apt upgrade -y
sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev
sudo mkdir -p ~/cpurt /opt/intel/${OCLV} /etc/OpenCL/vendors /etc/ld.so.conf.d
sudo wget -P ~/cpurt https://github.com/intel/llvm/releases/download/2025-WW27/${OCLV}.tar.gz
sudo wget -P ~/cpurt https://github.com/uxlfoundation/oneTBB/releases/download/v2022.2.0/${TBBV}-lin.tgz
sudo tar -zxvf ~/cpurt/${OCLV}.tar.gz -C /opt/intel/${OCLV}
sudo tar -zxvf ~/cpurt/${TBBV}-lin.tgz -C /opt/intel
echo /opt/intel/${OCLV}/x64/libintelocl.so | sudo tee /etc/OpenCL/vendors/intel_expcpu.icd
echo /opt/intel/${OCLV}/x64 | sudo tee /etc/ld.so.conf.d/libintelopenclexp.conf
sudo ln -sf /opt/intel/${TBBV}/lib/intel64/gcc4.8/libtbb.so /opt/intel/${OCLV}/x64
sudo ln -sf /opt/intel/${TBBV}/lib/intel64/gcc4.8/libtbbmalloc.so /opt/intel/${OCLV}/x64
sudo ln -sf /opt/intel/${TBBV}/lib/intel64/gcc4.8/libtbb.so.12 /opt/intel/${OCLV}/x64
sudo ln -sf /opt/intel/${TBBV}/lib/intel64/gcc4.8/libtbbmalloc.so.2 /opt/intel/${OCLV}/x64
sudo ldconfig -f /etc/ld.so.conf.d/libintelopenclexp.conf
sudo rm -r ~/cpurt

)"+string("\033[33m")+R"(.-----------------------------------------------------------------------------.
| CPU Option 2: PoCL                                                          |
'-----------------------------------------------------------------------------'
sudo apt update && sudo apt upgrade -y
sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev pocl-opencl-icd

)"+string("\033[0m");
#endif // Linux
 88 | 
// Describes one OpenCL device: identification strings, memory/cache sizes, capability flags, an estimated core count
// and FP32 throughput, plus "patch_*" flags for vendor-specific workarounds.
// All values are queried from the cl::Device and derived once in the constructor.
struct Device_Info {
	cl::Device cl_device; // OpenCL device
	cl::Context cl_context; // multiple devices in the same context can communicate buffers
	uint id = 0u; // unique device ID assigned by get_devices()
	string name="", vendor=""; // device name, vendor
	string driver_version="", opencl_c_version=""; // device driver version, device OpenCL C version ("1.0", "1.1", "1.2", "2.0", "2.1", "2.2", "3.0")
	uint memory = 0u; // global memory in MB
	uint memory_used = 0u; // track global memory usage in MB
	uint global_cache=0u, local_cache=0u; // global cache in KB, local cache in KB
	uint max_global_buffer=0u, max_constant_buffer=0u; // maximum global buffer size in MB, maximum constant buffer size in KB
	uint compute_units = 0u; // compute units (CUs) can contain multiple cores depending on the microarchitecture
	uint clock_frequency = 0u; // in MHz
	bool is_cpu=false, is_gpu=false, uses_ram=false; // device type flags; uses_ram is set for CPUs and for devices reporting host-unified memory
	uint is_fp64_capable=0u, is_fp32_capable=0u, is_fp16_capable=0u, is_int64_capable=0u, is_int32_capable=0u, is_int16_capable=0u, is_int8_capable=0u, is_dp4a_capable=0u; // 0 means not capable; the fp/int entries hold the native vector width reported by the device
	uint cores = 0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
	float tflops = 0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s
	uint nvidia_compute_capability = 0u; // compute capability for Nvidia GPUs, for example nvidia_compute_capability=61 means compute capability 6.1
	bool patch_intel_gpu_above_4gb = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
	bool patch_nvidia_fp16 = false; // Nvidia Pascal and newer GPUs with driver>=520.00 don't report cl_khr_fp16, but do support basic FP16 arithmetic
	bool patch_legacy_gpu_fma = false; // some old GPUs have terrible fma performance, so replace with a*b+c
	// Queries all properties of cl_device and stores them together with the context and the given ID.
	inline Device_Info(const cl::Device& cl_device, const cl::Context& cl_context, const uint id) {
		this->cl_device = cl_device; // see https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
		this->cl_context = cl_context;
		this->id = id;
		name = trim(cl_device.getInfo<CL_DEVICE_NAME>()); // device name
		vendor = trim(cl_device.getInfo<CL_DEVICE_VENDOR>()); // device vendor
		driver_version = trim(cl_device.getInfo<CL_DRIVER_VERSION>()); // device driver version
		opencl_c_version = cl_device.getInfo<CL_DEVICE_OPENCL_C_VERSION>().substr(9, 3); // 3 characters at offset 9 of the version string (presumably skips an "OpenCL C " prefix); may be overwritten below
		memory = (uint)(cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()/1048576ull); // global memory in MB
		global_cache = (uint)(cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>()/1024ull); // global cache in KB
		local_cache = (uint)(cl_device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()/1024ull); // local cache in KB
		max_global_buffer = (uint)(min(cl_device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>()/1048576ull, (ulong)memory)); // maximum global buffer size in MB
		max_constant_buffer = (uint)(cl_device.getInfo<CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE>()/1024ull); // maximum constant buffer size in KB
		compute_units = (uint)cl_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); // compute units (CUs) can contain multiple cores depending on the microarchitecture
		clock_frequency = (uint)cl_device.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(); // in MHz
		is_fp64_capable = (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE>()*(uint)contains(cl_device.getInfo<CL_DEVICE_EXTENSIONS>(), "cl_khr_fp64"); // native vector width, forced to 0 if the extension is not listed
		is_fp32_capable = (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT>();
		is_fp16_capable = (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF>()*(uint)contains(cl_device.getInfo<CL_DEVICE_EXTENSIONS>(), "cl_khr_fp16"); // native vector width, forced to 0 if the extension is not listed
		is_int64_capable = (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG>();
		is_int32_capable = (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_INT>();
		is_int16_capable = (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT>();
		is_int8_capable = (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR>();
		is_cpu = cl_device.getInfo<CL_DEVICE_TYPE>()==CL_DEVICE_TYPE_CPU;
		is_gpu = cl_device.getInfo<CL_DEVICE_TYPE>()==CL_DEVICE_TYPE_GPU;
		uses_ram = is_cpu||(bool)cl_device.getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>(); // CPUs or iGPUs
		const int vendor_id = (int)cl_device.getInfo<CL_DEVICE_VENDOR_ID>(); // AMD=0x1002, Intel=0x8086, Nvidia=0x10DE, Apple=0x1027F00
		uint ipc = is_gpu ? 2u : 32u; // IPC (instructions per cycle) is 2 for most GPUs and 32 for most modern CPUs
		float cores_per_cu = 1.0f; // default if no vendor branch below matches
#if !defined(__APPLE__) // macOS only supports OpenCL 1.2, OpenCL extensions are missing before OpenCL 3.0
		uint max_opencl_c_version = 0u; // device OpenCL C version; cl_device.getInfo<CL_DEVICE_OPENCL_C_VERSION>().substr(9, 3) is unreliable as it will report 1.2 if 3.0 is available but not 2.X
		for(auto& v : cl_device.getInfo<CL_DEVICE_OPENCL_C_ALL_VERSIONS>()) max_opencl_c_version = max(max_opencl_c_version, 10u*(uint)CL_VERSION_MAJOR(v.version)+CL_VERSION_MINOR(v.version)); // encode major.minor as 10*major+minor and keep the highest
		if(max_opencl_c_version>=10u) opencl_c_version = to_string(max_opencl_c_version/10u)+"."+to_string(max_opencl_c_version%10u); // only overwrite if at least version 1.0 was found in the list
		is_dp4a_capable = (uint)contains(cl_device.getInfo<CL_DEVICE_EXTENSIONS>(), "cl_khr_integer_dot_product");
		int dp4a_error = 0; // receives the error code of the following capability queries
		is_dp4a_capable = is_dp4a_capable&&(uint)(cl_device.getInfo<CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR>(&dp4a_error)==3); // the capability value must equal 3
		is_dp4a_capable = is_dp4a_capable&&dp4a_error==0;
		const auto idpap = cl_device.getInfo<CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR>(&dp4a_error);
		const cl_bool* idpap_bits = (cl_bool*)&idpap; // on some unsupported devices, values are random, so only claim is_dp4a_capable if all bits are set correctly
		is_dp4a_capable = is_dp4a_capable&&dp4a_error==0&&idpap_bits[0]==1&&idpap_bits[1]==1&&idpap_bits[2]==1&&idpap_bits[3]==1&&idpap_bits[4]==1&&idpap_bits[5]==1; // the properties are read as 6 consecutive cl_bool values that all have to be 1
		if(vendor_id==0x1002) { // AMD GPU/CPU
			const bool is_full_profile = trim(cl_device.getInfo<CL_DEVICE_PROFILE>())=="FULL_PROFILE"; // rusticl reports "EMBEDDED_PROFILE"
			const bool amd_dual_cu = is_gpu&&is_full_profile&&contains_any(to_lower(name), {"gfx10", "gfx11", "gfx12"}); // identify RDNA/RDNA2/RDNA3/RDNA4 GPUs where dual CUs are reported
			const bool amd_ipc_4 = is_gpu&&contains_any(to_lower(name), {"gfx11", "gfx12", "gfx942", "gfx950"}); // identify RDNA3/RDNA4 GPUs (can dual-issue float2) and CDNA3/CDNA4 GPUs (ipc=4 for scalar float)
			if(amd_dual_cu) compute_units *= 2u; // some AMD GPUs wrongly report the number of dual CUs as the number of CUs
			if(amd_ipc_4) ipc = 4u; // some AMD GPUs support dual-issuing of float2 vector type, or have ipc=4 for scalar float
			cores_per_cu = is_gpu ? 64.0f : 0.5f; // 64 cores/CU (GPUs), 1/2 core/CU (CPUs)
			const string amd_device_name = trim(cl_device.getInfo<CL_DEVICE_BOARD_NAME_AMD>());
			if(is_gpu&&length(amd_device_name)>0u) name = amd_device_name; // for AMD GPUs, CL_DEVICE_NAME wrongly outputs chip codename, and CL_DEVICE_BOARD_NAME_AMD outputs actual device name
		} else if(vendor_id==0x8086) { // Intel GPU/CPU
			const int intel_device_id = (int)cl_device.getInfo<CL_DEVICE_ID_INTEL>(); // also see CL_DEVICE_IP_VERSION_INTEL
			const bool intel_16_cores_per_cu = contains({ 0x0BD5, 0x0BDA, 0x64A0, 0xE20B, 0xE20C, 0xE211, 0xE212 }, intel_device_id); // GPU Max 1550, GPU Max 1100, Arc 140V/130V, Arc B580, Arc B570, Arc Pro B60, Arc Pro B50
			cores_per_cu = is_gpu ? (intel_16_cores_per_cu ? 16.0f : 8.0f) : 0.5f; // Intel GPUs have 16 cores/CU (PVC/Xe2) or 8 cores/CU (Xe1), Intel CPUs (with HT) have 1/2 core/CU
			if(is_gpu&&!uses_ram) { // fix wrong global memory capacity reporting for Intel dGPUs
#if defined(_WIN32)
				memory = (uint)((cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()*50ull/49ull)/1048576ull); // 98% on Windows https://github.com/intel/compute-runtime/blob/master/shared/source/os_interface/windows/wddm_memory_manager.cpp#L969
#elif defined(__linux__)
				memory = (uint)((cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()*20ull/19ull)/1048576ull); // 95% on Linux   https://github.com/intel/compute-runtime/blob/master/shared/source/os_interface/linux/drm_memory_manager.cpp#L1545
#endif // Linux
			}
			patch_intel_gpu_above_4gb = patch_intel_gpu_above_4gb||(is_gpu&&memory>4096u); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
			if(is_cpu) is_dp4a_capable = 0u; // native dp4a in Intel CPU Runtime for OpenCL is slower than emulated dp4a
		} else if(vendor_id==0x10DE||vendor_id==0x13B5) { // Nvidia GPU/CPU
			if(is_gpu) nvidia_compute_capability = 10u*(uint)cl_device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV>()+(uint)cl_device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV>(); // encoded as 10*major+minor
			const bool nvidia__32_cores_per_cu = (nvidia_compute_capability <30); // identify Fermi GPUs
			const bool nvidia_192_cores_per_cu = (nvidia_compute_capability>=30&&nvidia_compute_capability< 50); // identify Kepler GPUs
			const bool nvidia__64_cores_per_cu = (nvidia_compute_capability>=70&&nvidia_compute_capability<=80)||nvidia_compute_capability==60; // identify Volta, Turing, P100, A100, A30
			cores_per_cu = is_gpu ? (nvidia__32_cores_per_cu ? 32.0f : nvidia_192_cores_per_cu ? 192.0f : nvidia__64_cores_per_cu ? 64.0f : 128.0f) : 1.0f; // 32 (Fermi), 192 (Kepler), 64 (Volta, Turing, P100, A100, A30), 128 (Maxwell, Pascal, Ampere, Hopper, Ada, Blackwell) or 1 (CPUs)
			patch_nvidia_fp16 = patch_nvidia_fp16||(nvidia_compute_capability>=60&&atof(driver_version.substr(0, 6).c_str())>=520.00); // enable for all Nvidia Pascal or newer GPUs with driver>=520.00
			if(patch_nvidia_fp16) is_fp16_capable = 2u;
			is_dp4a_capable = (uint)(nvidia_compute_capability>=61u); // Nvidia GPUs with nvidia_compute_capability>=61 don't report dp4a support through cl_khr_integer_dot_product extension, but support it via inline PTX assembly
		} else // this else continues with the if below, across the #endif; on macOS the if below starts the chain
#endif // Windows / Linux / Android
		if(vendor_id==0x1027F00) { // Apple iGPU
			cores_per_cu = 128.0f; // Apple ARM GPUs usually have 128 cores/CU
		} else if(vendor_id==0x1022||vendor_id==0x10006||vendor_id==0x6C636F70) { // x86 CPUs with PoCL runtime
			cores_per_cu = 0.5f; // CPUs typically have 1/2 cores/CU due to SMT/hyperthreading
		} else if(contains(to_lower(vendor), "arm")) { // ARM
			cores_per_cu = is_gpu ? 8.0f : 1.0f; // ARM GPUs usually have 8 cores/CU, ARM CPUs have 1 core/CU
			uses_ram = false; // CL_MEM_USE_HOST_PTR is broken on ARM iGPUs, so disable zero-copy there
			patch_legacy_gpu_fma = true; // enable for all ARM GPUs
		}
		cores = to_uint((float)compute_units*cores_per_cu); // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
		tflops = 1E-6f*(float)cores*(float)ipc*(float)clock_frequency; // estimated device floating point performance in TeraFLOPs/s
	}
	inline Device_Info() {}; // default constructor
};
195 | 
196 | string get_opencl_c_code(); // implemented in kernel.hpp
197 | inline void print_device_info(const Device_Info& d) { // print OpenCL device info
198 | #if defined(_WIN32)
199 | 	const string os = "Windows";
200 | #elif defined(__linux__)
201 | 	const string os = "Linux";
202 | #elif defined(__APPLE__)
203 | 	const string os = "macOS";
204 | #else // unknown operating system
205 | 	const string os = "unknown operating system";
206 | #endif // operating system
207 | 	println("\r|----------------.------------------------------------------------------------|");
208 | 	println("| Device ID      | "+alignl(58, to_string(d.id)                             )+" |");
209 | 	println("| Device Name    | "+alignl(58, d.name                                      )+" |");
210 | 	println("| Device Vendor  | "+alignl(58, d.vendor                                    )+" |");
211 | 	println("| Device Driver  | "+alignl(58, d.driver_version+" ("+os+")"                )+" |");
212 | 	println("| OpenCL Version | "+alignl(58, "OpenCL C "+d.opencl_c_version              )+" |");
213 | 	println("| Compute Units  | "+alignl(58, to_string(d.compute_units)+" at "+to_string(d.clock_frequency)+" MHz ("+to_string(d.cores)+" cores, "+to_string(d.tflops, 3)+" TFLOPs/s)")+" |");
214 | 	println("| Memory, Cache  | "+alignl(58, to_string(d.memory)+" MB "+(d.uses_ram ? "" : "V")+"RAM, "+to_string(d.global_cache)+" KB global / "+to_string(d.local_cache)+" KB local")+" |");
215 | 	println("| Buffer Limits  | "+alignl(58, to_string(d.max_global_buffer)+" MB global, "+to_string(d.max_constant_buffer)+" KB constant")+" |");
216 | 	println("|----------------'------------------------------------------------------------|");
217 | }
218 | inline vector<Device_Info> get_devices(const bool print_info=true) { // returns a vector of all available OpenCL devices
219 | 	set_environment_variable((char*)"GPU_SINGLE_ALLOC_PERCENT=100"); // fix maximum buffer allocation size limit for AMD GPUs
220 | 	set_environment_variable((char*)"CL_CONFIG_CPU_FORCE_MAX_MEM_ALLOC_SIZE=17179869183GB"); // fix maximum buffer allocation size limit in Intel CPU Runtime for OpenCL, 2^34-1 is max non-overflowing value
221 | 	vector<Device_Info> devices; // get all devices of all platforms
222 | 	vector<cl::Platform> cl_platforms; // get all platforms (drivers)
223 | 	cl::Platform::get(&cl_platforms);
224 | 	uint id = 0u;
225 | 	for(uint i=0u; i<(uint)cl_platforms.size(); i++) {
226 | 		vector<cl::Device> cl_devices;
227 | 		cl_platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &cl_devices);
228 | 		//cl::Context cl_context(cl_devices); // same cl::Context for all devices (allocates extra VRAM on all other unused Nvidia GPUs)
229 | 		for(uint j=0u; j<(uint)cl_devices.size(); j++) {
230 | 			cl::Context cl_context(cl_devices[j]); // separate cl::Context for each device
231 | 			devices.push_back(Device_Info(cl_devices[j], cl_context, id++));
232 | 		}
233 | 	}
234 | 	if((uint)cl_platforms.size()==0u||(uint)devices.size()==0u) {
235 | 		print_message("No OpenCL devices are available. Please install the drivers for your GPU(s) and/or the CPU Runtime for OpenCL. Instructions:", "Error", 12);
236 | 		print(driver_installation_instructions);
237 | #ifdef _WIN32
238 | 		wait();
239 | #endif // Windows
240 | 		exit(1);
241 | 	}
242 | 	if(print_info) {
243 | 		println("\r|----------------.------------------------------------------------------------|");
244 | 		for(uint i=0u; i<(uint)devices.size(); i++) println("| Device ID "+alignr(4u, i)+" | "+alignl(58u, devices[i].name)+" |");
245 | 		println("|----------------'------------------------------------------------------------|");
246 | 	}
247 | 	return devices;
248 | }
249 | inline Device_Info select_device_with_most_flops(const vector<Device_Info>& devices=get_devices()) { // returns device with best floating-point performance
250 | 	float best_value = 0.0f;
251 | 	uint best_i = 0u;
252 | 	for(uint i=0u; i<(uint)devices.size(); i++) { // find device with highest (estimated) floating point performance
253 | 		if(devices[i].tflops>best_value) {
254 | 			best_value = devices[i].tflops;
255 | 			best_i = i;
256 | 		}
257 | 	}
258 | 	return devices[best_i];
259 | }
260 | inline Device_Info select_device_with_most_memory(const vector<Device_Info>& devices=get_devices()) { // returns device with largest memory capacity
261 | 	uint best_value = 0u;
262 | 	uint best_i = 0u;
263 | 	for(uint i=0u; i<(uint)devices.size(); i++) { // find device with most memory
264 | 		if(devices[i].memory>best_value) {
265 | 			best_value = devices[i].memory;
266 | 			best_i = i;
267 | 		}
268 | 	}
269 | 	return devices[best_i];
270 | }
271 | inline Device_Info select_device_with_id(const uint id, const vector<Device_Info>& devices=get_devices()) { // returns device with specified ID
272 | 	if(id<(uint)devices.size()) {
273 | 		return devices[id];
274 | 	} else {
275 | 		print_error("Your selected Device ID ("+to_string(id)+") is wrong.");
276 | 		return devices[0]; // is never executed, just to avoid compiler warnings
277 | 	}
278 | }
279 | 
// Represents one OpenCL device that is ready for use: holds its Device_Info, a command queue and the cl::Program
// compiled from the OpenCL C code for this device.
class Device {
private:
	cl::Program cl_program; // program built in the constructor
	cl::CommandQueue cl_queue; // queue created in the constructor from info.cl_context and info.cl_device
	bool exists = false; // set to true at the end of the parameterized constructor, stays false after default construction
	// Returns OpenCL C preprocessor lines that the constructor places in front of the kernel code:
	// optional patches depending on the flags in info, the work-group size and the extension pragmas.
	inline string enable_device_capabilities() const { return // enable FP64/FP16 capabilities if available
		string(info.patch_nvidia_fp16         ? "\n #define cl_khr_fp16"                : "")+ // Nvidia Pascal and newer GPUs with driver>=520.00 don't report cl_khr_fp16, but do support basic FP16 arithmetic
		string(info.patch_legacy_gpu_fma      ? "\n #define fma(a, b, c) ((a)*(b)+(c))" : "")+ // some old GPUs have terrible fma performance, so replace with a*b+c
		string(info.nvidia_compute_capability ? "\n #define cl_nv_compute_capability "+to_string(info.nvidia_compute_capability) : "")+ // allows querying Nvidia compute capability for inline PTX
		string(info.is_dp4a_capable==0u       ? "\n #undef __opencl_c_integer_dot_product_input_4x8bit\n #undef __opencl_c_integer_dot_product_input_4x8bit_packed" : "")+ // patch false dp4a reporting on Intel
		"\n #define cl_workgroup_size "+to_string(WORKGROUP_SIZE)+"u"
		"\n #ifdef cl_khr_fp64"
		"\n #pragma OPENCL EXTENSION cl_khr_fp64 : enable" // make sure cl_khr_fp64 extension is enabled
		"\n #endif"
		"\n #ifdef cl_khr_fp16"
		"\n #pragma OPENCL EXTENSION cl_khr_fp16 : enable" // make sure cl_khr_fp16 extension is enabled
		"\n #endif"
		"\n #ifdef cl_khr_int64_base_atomics"
		"\n #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable" // make sure cl_khr_int64_base_atomics extension is enabled
		"\n #endif"
	;}
public:
	Device_Info info; // properties of the device, copied from the constructor argument
	// Prints the device info, creates the command queue and compiles opencl_c_code for the device.
	// info:          device to use
	// opencl_c_code: OpenCL C source; defaults to get_opencl_c_code()
	// A failed compilation is reported with print_error().
	inline Device(const Device_Info& info, const string& opencl_c_code=get_opencl_c_code()) {
		print_device_info(info);
		this->info = info;
		this->cl_queue = cl::CommandQueue(info.cl_context, info.cl_device); // queue to push commands for the device
		cl::Program::Sources cl_source;
		const string kernel_code = enable_device_capabilities()+"\n"+opencl_c_code; // capability preamble first, then the actual kernel code
		cl_source.push_back({ kernel_code.c_str(), kernel_code.length() });
		this->cl_program = cl::Program(info.cl_context, cl_source);
		const string build_options = "-cl-std=CL"+info.opencl_c_version+" -cl-finite-math-only -cl-no-signed-zeros -cl-mad-enable"+(info.patch_intel_gpu_above_4gb ? " -cl-intel-greater-than-4GB-buffer-required" : "");
#ifndef LOG
		int error = cl_program.build({ info.cl_device }, (build_options+" -w").c_str()); // compile OpenCL C code, disable warnings
		if(error) print_warning(cl_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(info.cl_device)); // print build log
#else // LOG, generate logfile for OpenCL code compilation
		int error = cl_program.build({ info.cl_device }, build_options.c_str()); // compile OpenCL C code
		const string log = cl_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(info.cl_device);
		write_file("bin/kernel.log", log); // save build log
		if((uint)log.length()>2u) print_warning(log); // print build log
#endif // LOG
		if(error) print_error("OpenCL C code compilation failed with error code "+to_string(error)+". Make sure there are no errors in kernel.cpp.");
		else print_info("OpenCL C code successfully compiled.");
#ifdef PTX // generate assembly (ptx) file for OpenCL code
		write_file("bin/kernel.ptx", (char*)&cl_program.getInfo<CL_PROGRAM_BINARIES>()[0][0]); // save binary (ptx file)
#endif // PTX
		this->exists = true;
	}
	inline Device() {} // default constructor
	inline void barrier(const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) { cl_queue.enqueueBarrierWithWaitList(event_waitlist, event_returned); } // enqueue a barrier into the command queue, optionally with events to wait for and an event to return
	inline void finish_queue() { cl_queue.finish(); } // forwards to cl::CommandQueue::finish() of this device's queue
	inline cl::Context get_cl_context() const { return info.cl_context; }
	inline cl::Program get_cl_program() const { return cl_program; }
	inline cl::CommandQueue get_cl_queue() const { return cl_queue; }
	inline bool is_initialized() const { return exists; } // true only if the parameterized constructor has run to its end
};
336 | 
337 | template<typename T> class Memory {
338 | private:
339 | 	ulong N = 0ull; // buffer length
340 | 	uint d = 1u; // buffer dimensions
341 | 	bool host_buffer_exists = false;
342 | 	bool device_buffer_exists = false;
343 | 	bool external_host_buffer = false; // Memory object has been created with an externally supplied host buffer/pointer
344 | 	bool is_zero_copy = false; // if possible (device is CPU or iGPU), and if allowed by user, use zero-copy buffer: host+device buffers are fused into one
345 | 	T* host_buffer = nullptr; // host buffer
346 | 	T* host_buffer_unaligned = nullptr; // unaligned host buffer (only required for zero-copy to align host_buffer)
347 | 	cl::Buffer device_buffer; // device buffer
348 | 	Device* device = nullptr; // pointer to linked Device
349 | 	cl::CommandQueue cl_queue; // command queue
350 | 	inline void initialize_auxiliary_pointers() {
351 | 		/********/ x = s0 = host_buffer; /******/ if(d>0x4u) s4 = host_buffer+N*0x4ull; if(d>0x8u) s8 = host_buffer+N*0x8ull; if(d>0xCu) sC = host_buffer+N*0xCull;
352 | 		if(d>0x1u) y = s1 = host_buffer+N; /****/ if(d>0x5u) s5 = host_buffer+N*0x5ull; if(d>0x9u) s9 = host_buffer+N*0x9ull; if(d>0xDu) sD = host_buffer+N*0xDull;
353 | 		if(d>0x2u) z = s2 = host_buffer+N*0x2ull; if(d>0x6u) s6 = host_buffer+N*0x6ull; if(d>0xAu) sA = host_buffer+N*0xAull; if(d>0xEu) sE = host_buffer+N*0xEull;
354 | 		if(d>0x3u) w = s3 = host_buffer+N*0x3ull; if(d>0x7u) s7 = host_buffer+N*0x7ull; if(d>0xBu) sB = host_buffer+N*0xBull; if(d>0xFu) sF = host_buffer+N*0xFull;
355 | 	}
356 | 	inline void allocate_host_buffer(Device& device, const bool allocate_host, const bool allow_zero_copy) {
357 | 		if(allocate_host) {
358 | 			const ulong alignment = allow_zero_copy&&device.info.uses_ram ? 4096ull : 64ull; // host_buffer must be aligned to 4096 Bytes for CL_MEM_USE_HOST_PTR, and to 64 Bytes for optimal enqueueReadBuffer performance on modern CPUs
359 | 			const ulong padding   = allow_zero_copy&&device.info.uses_ram ?   64ull :  0ull; // for CL_MEM_USE_HOST_PTR, 64 Bytes padding is required because device_buffer capacity in this case must be a multiple of 64 Bytes
360 | 			host_buffer_unaligned = new T[N*(ulong)d+(alignment+padding)/sizeof(T)]; // over-allocate host_buffer_unaligned by (alignment+padding) Bytes
361 | 			host_buffer = (T*)((((ulong)host_buffer_unaligned+alignment-1ull)/alignment)*alignment); // align host_buffer by fine-tuning pointer to be a multiple of alignment
362 | 			initialize_auxiliary_pointers();
363 | 			host_buffer_exists = true;
364 | 		}
365 | 	}
366 | 	inline void allocate_device_buffer(Device& device, const bool allocate_device, const bool allow_zero_copy) {
367 | 		this->device = &device;
368 | 		this->cl_queue = device.get_cl_queue();
369 | 		if(allocate_device) {
370 | 			device.info.memory_used += (uint)(capacity()/1048576ull); // track device memory usage
371 | 			if(device.info.memory_used>device.info.memory) print_error("Device \""+device.info.name+"\" does not have enough memory. Allocating another "+to_string((uint)(capacity()/1048576ull))+" MB would use a total of "+to_string(device.info.memory_used)+" MB / "+to_string(device.info.memory)+" MB.");
372 | 			int error = 0;
373 | 			is_zero_copy = allow_zero_copy&&host_buffer_exists&&device.info.uses_ram&&(!external_host_buffer||((ulong)host_buffer%4096ull==0ull&&capacity()%64ull==0ull));
374 | 			device_buffer = cl::Buffer( // if(is_zero_copy) { don't allocate extra memory on CPUs/iGPUs } else { allocate VRAM on GPUs }
375 | 				device.get_cl_context(),
376 | 				CL_MEM_READ_WRITE|((int)is_zero_copy*CL_MEM_USE_HOST_PTR)|((int)device.info.patch_intel_gpu_above_4gb<<23), // for Intel GPUs set flag CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL = (1<<23)
377 | 				is_zero_copy ? ((capacity()+63ull)/64ull)*64ull : capacity(), // device_buffer capacity must be a multiple of 64 Bytes for CL_MEM_USE_HOST_PTR
378 | 				is_zero_copy ? (void*)host_buffer : nullptr,
379 | 				&error
380 | 			);
381 | 			if(error==-61) print_error("Memory size is too large at "+to_string((uint)(capacity()/1048576ull))+" MB. Device \""+device.info.name+"\" accepts a maximum buffer size of "+to_string(device.info.max_global_buffer)+" MB.");
382 | 			else if(error) print_error("Device buffer allocation failed with error code "+to_string(error)+".");
383 | 			device_buffer_exists = true;
384 | 		}
385 | 	}
386 | public:
387 | 	T *x=nullptr, *y=nullptr, *z=nullptr, *w=nullptr; // host buffer auxiliary pointers for multi-dimensional array access (array of structures)
388 | 	T *s0=nullptr, *s1=nullptr, *s2=nullptr, *s3=nullptr, *s4=nullptr, *s5=nullptr, *s6=nullptr, *s7=nullptr, *s8=nullptr, *s9=nullptr, *sA=nullptr, *sB=nullptr, *sC=nullptr, *sD=nullptr, *sE=nullptr, *sF=nullptr;
389 | 	inline Memory(Device& device, const ulong N, const uint dimensions=1u, const bool allocate_host=true, const bool allocate_device=true, const T value=(T)0, const bool allow_zero_copy=true) {
390 | 		if(!device.is_initialized()) print_error("No Device selected. Call Device constructor.");
391 | 		if(N*(ulong)dimensions==0ull) print_error("Memory size must be larger than 0.");
392 | 		this->N = N;
393 | 		this->d = dimensions;
394 | 		allocate_host_buffer(device, allocate_host, allow_zero_copy); // allocate host_buffer first
395 | 		allocate_device_buffer(device, allocate_device, allow_zero_copy); // allocate device_buffer second
396 | 		reset(value);
397 | 	}
398 | 	inline Memory(Device& device, const ulong N, const uint dimensions, T* const host_buffer, const bool allocate_device=true, const bool allow_zero_copy=true) {
399 | 		if(!device.is_initialized()) print_error("No Device selected. Call Device constructor.");
400 | 		if(N*(ulong)dimensions==0ull) print_error("Memory size must be larger than 0.");
401 | 		this->N = N;
402 | 		this->d = dimensions;
403 | 		this->host_buffer = host_buffer;
404 | 		initialize_auxiliary_pointers();
405 | 		host_buffer_exists = true;
406 | 		external_host_buffer = true;
407 | 		allocate_device_buffer(device, allocate_device, allow_zero_copy);
408 | 		write_to_device();
409 | 	}
410 | 	inline Memory() {} // default constructor
411 | 	inline ~Memory() {
412 | 		delete_buffers();
413 | 	}
414 | 	inline Memory& operator=(Memory&& memory) noexcept { // move assignment
415 | 		delete_buffers(); // delete existing buffers and restore default state
416 | 		N = memory.length(); // copy values/pointers from memory
417 | 		d = memory.dimensions();
418 | 		device = memory.device;
419 | 		cl_queue = memory.device->get_cl_queue();
420 | 		if(memory.host_buffer_exists) {
421 | 			host_buffer = memory.exchange_host_buffer(nullptr); // transfer host_buffer pointer
422 | 			host_buffer_unaligned = memory.exchange_host_buffer_unaligned(nullptr); // transfer host_buffer_unaligned pointer
423 | 			initialize_auxiliary_pointers();
424 | 			external_host_buffer = memory.external_host_buffer;
425 | 			host_buffer_exists = true;
426 | 		}
427 | 		if(memory.device_buffer_exists) {
428 | 			device_buffer = memory.get_cl_buffer(); // transfer device_buffer pointer
429 | 			device->info.memory_used += (uint)(capacity()/1048576ull); // track device memory usage
430 | 			is_zero_copy = memory.is_zero_copy;
431 | 			device_buffer_exists = true;
432 | 		}
433 | 		return *this; // destructor of memory will be called automatically
434 | 	}
435 | 	inline T* const exchange_host_buffer(T* const host_buffer) { // sets host_buffer to new pointer and returns old pointer
436 | 		T* const swap = this->host_buffer;
437 | 		this->host_buffer = host_buffer;
438 | 		return swap;
439 | 	}
440 | 	inline T* const exchange_host_buffer_unaligned(T* const host_buffer_unaligned) { // sets host_buffer_unaligned to new pointer and returns old pointer
441 | 		T* const swap = this->host_buffer_unaligned;
442 | 		this->host_buffer_unaligned = host_buffer_unaligned;
443 | 		return swap;
444 | 	}
445 | 	inline void add_host_buffer() { // makes only sense if there is no host buffer yet but an existing device buffer
446 | 		if(!host_buffer_exists&&device_buffer_exists) {
447 | 			host_buffer = new T[N*(ulong)d];
448 | 			initialize_auxiliary_pointers();
449 | 			read_from_device();
450 | 			host_buffer_exists = true;
451 | 		} else if(!device_buffer_exists) {
452 | 			print_error("There is no existing device buffer, so can't add host buffer.");
453 | 		}
454 | 	}
455 | 	inline void add_device_buffer(const bool allow_zero_copy=true) { // makes only sense if there is no device buffer yet but an existing host buffer
456 | 		if(!device_buffer_exists&&host_buffer_exists) {
457 | 			allocate_device_buffer(*device, true, allow_zero_copy);
458 | 			write_to_device();
459 | 		} else if(!host_buffer_exists) {
460 | 			print_error("There is no existing host buffer, so can't add device buffer.");
461 | 		}
462 | 	}
463 | 	inline void delete_host_buffer() {
464 | 		host_buffer_exists = false;
465 | 		if(!external_host_buffer) {
466 | 			host_buffer = nullptr;
467 | 			delete[] host_buffer_unaligned;
468 | 		}
469 | 		if(!device_buffer_exists) {
470 | 			N = 0ull;
471 | 			d = 1u;
472 | 		}
473 | 	}
474 | 	inline void delete_device_buffer() {
475 | 		if(device_buffer_exists) device->info.memory_used -= (uint)(capacity()/1048576ull); // track device memory usage
476 | 		device_buffer_exists = false;
477 | 		device_buffer = nullptr;
478 | 		if(!host_buffer_exists) {
479 | 			N = 0ull;
480 | 			d = 1u;
481 | 		}
482 | 	}
483 | 	inline void delete_buffers() {
484 | 		delete_device_buffer();
485 | 		delete_host_buffer();
486 | 	}
487 | 	inline void reset(const T value=(T)0) {
488 | 		//if(device_buffer_exists) cl_queue.enqueueFillBuffer(device_buffer, value, 0ull, capacity()); // faster than "write_to_device();"
489 | 		if(host_buffer_exists) std::fill(host_buffer, host_buffer+range(), value); // faster than "for(ulong i=0ull; i<range(); i++) host_buffer[i] = value;"
490 | 		write_to_device(); // enqueueFillBuffer is broken for large buffers on Nvidia GPUs!
491 | 		//if(device_buffer_exists) cl_queue.finish();
492 | 	}
493 | 	inline const ulong length() const { return N; }
494 | 	inline const uint dimensions() const { return d; }
495 | 	inline const ulong range() const { return N*(ulong)d; }
496 | 	inline const ulong capacity() const { return N*(ulong)d*sizeof(T); } // returns capacity of the buffer in Bytes
497 | 	inline T* const data() { return host_buffer; }
498 | 	inline const T* const data() const { return host_buffer; }
499 | 	inline T* const operator()() { return host_buffer; }
500 | 	inline const T* const operator()() const { return host_buffer; }
501 | 	inline T& operator[](const ulong i) { return host_buffer[i]; }
502 | 	inline const T& operator[](const ulong i) const { return host_buffer[i]; }
503 | 	inline const T operator()(const ulong i) const { return host_buffer[i]; }
504 | 	inline const T operator()(const ulong i, const uint dimension) const { return host_buffer[i+(ulong)dimension*N]; } // array of structures
505 | 	inline void read_from_device(const bool blocking=true, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) {
506 | 		if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) {
507 | 			cl_queue.enqueueReadBuffer(device_buffer, blocking, 0ull, capacity(), (void*)host_buffer, event_waitlist, event_returned);
508 | 		}
509 | 	}
510 | 	inline void write_to_device(const bool blocking=true, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) {
511 | 		if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) {
512 | 			cl_queue.enqueueWriteBuffer(device_buffer, blocking, 0ull, capacity(), (void*)host_buffer, event_waitlist, event_returned);
513 | 		}
514 | 	}
515 | 	inline void read_from_device(const ulong offset, const ulong length, const bool blocking=true, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) {
516 | 		if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) {
517 | 			const ulong safe_offset=min(offset, range()), safe_length=min(length, range()-safe_offset);
518 | 			if(safe_length>0ull) cl_queue.enqueueReadBuffer(device_buffer, blocking, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned);
519 | 		}
520 | 	}
521 | 	inline void write_to_device(const ulong offset, const ulong length, const bool blocking=true, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) {
522 | 		if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) {
523 | 			const ulong safe_offset=min(offset, range()), safe_length=min(length, range()-safe_offset);
524 | 			if(safe_length>0ull) cl_queue.enqueueWriteBuffer(device_buffer, blocking, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned);
525 | 		}
526 | 	}
527 | 	inline void read_from_device_1d(const ulong x0, const ulong x1, const int dimension=-1, const bool blocking=true, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) { // read 1D domain from device, either for all vector dimensions (-1) or for a specified dimension
528 | 		if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) {
529 | 			const uint i0=(uint)max(0, dimension), i1=dimension<0 ? d : i0+1u;
530 | 			for(uint i=i0; i<i1; i++) {
531 | 				const ulong safe_offset=min((ulong)i*N+x0, range()), safe_length=min(x1-x0, range()-safe_offset);
532 | 				if(safe_length>0ull) cl_queue.enqueueReadBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned);
533 | 			}
534 | 			if(blocking) cl_queue.finish();
535 | 		}
536 | 	}
537 | 	inline void write_to_device_1d(const ulong x0, const ulong x1, const int dimension=-1, const bool blocking=true, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) { // write 1D domain to device, either for all vector dimensions (-1) or for a specified dimension
538 | 		if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) {
539 | 			const uint i0=(uint)max(0, dimension), i1=dimension<0 ? d : i0+1u;
540 | 			for(uint i=i0; i<i1; i++) {
541 | 				const ulong safe_offset=min((ulong)i*N+x0, range()), safe_length=min(x1-x0, range()-safe_offset);
542 | 				if(safe_length>0ull) cl_queue.enqueueWriteBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned);
543 | 			}
544 | 			if(blocking) cl_queue.finish();
545 | 		}
546 | 	}
547 | 	inline void read_from_device_2d(const ulong x0, const ulong x1, const ulong y0, const ulong y1, const ulong Nx, const ulong Ny, const int dimension=-1, const bool blocking=true, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) { // read 2D domain from device, either for all vector dimensions (-1) or for a specified dimension
548 | 		if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) {
549 | 			for(uint y=y0; y<y1; y++) {
550 | 				const ulong n = x0+y*Nx;
551 | 				const uint i0=(uint)max(0, dimension), i1=dimension<0 ? d : i0+1u;
552 | 				for(uint i=i0; i<i1; i++) {
553 | 					const ulong safe_offset=min((ulong)i*N+n, range()), safe_length=min(x1-x0, range()-safe_offset);
554 | 					if(safe_length>0ull) cl_queue.enqueueReadBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned);
555 | 				}
556 | 			}
557 | 			if(blocking) cl_queue.finish();
558 | 		}
559 | 	}
560 | 	inline void write_to_device_2d(const ulong x0, const ulong x1, const ulong y0, const ulong y1, const ulong Nx, const ulong Ny, const int dimension=-1, const bool blocking=true, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) { // write 2D domain to device, either for all vector dimensions (-1) or for a specified dimension
561 | 		if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) {
562 | 			for(uint y=y0; y<y1; y++) {
563 | 				const ulong n = x0+y*Nx;
564 | 				const uint i0=(uint)max(0, dimension), i1=dimension<0 ? d : i0+1u;
565 | 				for(uint i=i0; i<i1; i++) {
566 | 					const ulong safe_offset=min((ulong)i*N+n, range()), safe_length=min(x1-x0, range()-safe_offset);
567 | 					if(safe_length>0ull) cl_queue.enqueueWriteBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned);
568 | 				}
569 | 			}
570 | 			if(blocking) cl_queue.finish();
571 | 		}
572 | 	}
573 | 	inline void read_from_device_3d(const ulong x0, const ulong x1, const ulong y0, const ulong y1, const ulong z0, const ulong z1, const ulong Nx, const ulong Ny, const ulong Nz, const int dimension=-1, const bool blocking=true, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) { // read 3D domain from device, either for all vector dimensions (-1) or for a specified dimension
574 | 		if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) {
575 | 			for(uint z=z0; z<z1; z++) {
576 | 				for(uint y=y0; y<y1; y++) {
577 | 					const ulong n = x0+(y+z*Ny)*Nx;
578 | 					const uint i0=(uint)max(0, dimension), i1=dimension<0 ? d : i0+1u;
579 | 					for(uint i=i0; i<i1; i++) {
580 | 						const ulong safe_offset=min((ulong)i*N+n, range()), safe_length=min(x1-x0, range()-safe_offset);
581 | 						if(safe_length>0ull) cl_queue.enqueueReadBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned);
582 | 					}
583 | 				}
584 | 			}
585 | 			if(blocking) cl_queue.finish();
586 | 		}
587 | 	}
588 | 	inline void write_to_device_3d(const ulong x0, const ulong x1, const ulong y0, const ulong y1, const ulong z0, const ulong z1, const ulong Nx, const ulong Ny, const ulong Nz, const int dimension=-1, const bool blocking=true, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) { // write 3D domain to device, either for all vector dimensions (-1) or for a specified dimension
589 | 		if(host_buffer_exists&&device_buffer_exists&&!is_zero_copy) {
590 | 			for(uint z=z0; z<z1; z++) {
591 | 				for(uint y=y0; y<y1; y++) {
592 | 					const ulong n = x0+(y+z*Ny)*Nx;
593 | 					const uint i0=(uint)max(0, dimension), i1=dimension<0 ? d : i0+1u;
594 | 					for(uint i=i0; i<i1; i++) {
595 | 						const ulong safe_offset=min((ulong)i*N+n, range()), safe_length=min(x1-x0, range()-safe_offset);
596 | 						if(safe_length>0ull) cl_queue.enqueueWriteBuffer(device_buffer, false, safe_offset*sizeof(T), safe_length*sizeof(T), (void*)(host_buffer+safe_offset), event_waitlist, event_returned);
597 | 					}
598 | 				}
599 | 			}
600 | 			if(blocking) cl_queue.finish();
601 | 		}
602 | 	}
603 | 	inline void enqueue_read_from_device(const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) { read_from_device(false, event_waitlist, event_returned); }
604 | 	inline void enqueue_write_to_device(const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) { write_to_device(false, event_waitlist, event_returned); }
605 | 	inline void enqueue_read_from_device(const ulong offset, const ulong length, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) { read_from_device(offset, length, false, event_waitlist, event_returned); }
606 | 	inline void enqueue_write_to_device(const ulong offset, const ulong length, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) { write_to_device(offset, length, false, event_waitlist, event_returned); }
607 | 	inline void finish_queue() { cl_queue.finish(); }
608 | 	inline const cl::Buffer& get_cl_buffer() const { return device_buffer; }
609 | };
610 | 
611 | class Kernel {
612 | private:
613 | 	ulong N = 0ull; // kernel range
614 | 	uint number_of_parameters = 0u;
615 | 	string name = "";
616 | 	cl::Kernel cl_kernel;
617 | 	cl::NDRange cl_range_global, cl_range_local;
618 | 	cl::CommandQueue cl_queue;
619 | 	inline void check_for_errors(const int error) {
620 | 		if(error==-48) print_error("There is no OpenCL kernel with name \""+name+"(...)\" in the OpenCL C code! Check spelling!");
621 | 		if(error<-48&&error>-53) print_error("Parameters for OpenCL kernel \""+name+"(...)\" don't match between C++ and OpenCL C!");
622 | 		if(error==-54) print_error("Workgrop size "+to_string(WORKGROUP_SIZE)+" for OpenCL kernel \""+name+"(...)\" is invalid!");
623 | 		if(error!=0) print_error("OpenCL kernel \""+name+"(...)\" failed with error code "+to_string(error)+"!");
624 | 	}
625 | 	template<typename T> inline void link_parameter(const uint position, const Memory<T>& memory) {
626 | 		check_for_errors(cl_kernel.setArg(position, memory.get_cl_buffer()));
627 | 	}
628 | 	template<typename T> inline void link_parameter(const uint position, const T& constant) {
629 | 		check_for_errors(cl_kernel.setArg(position, sizeof(T), (void*)&constant));
630 | 	}
631 | 	inline void link_parameters(const uint starting_position) {
632 | 		number_of_parameters = max(number_of_parameters, starting_position);
633 | 	}
634 | 	template<class T, class... U> inline void link_parameters(const uint starting_position, const T& parameter, const U&... parameters) {
635 | 		link_parameter(starting_position, parameter);
636 | 		link_parameters(starting_position+1u, parameters...);
637 | 	}
638 | public:
639 | 	template<class... T> inline Kernel(const Device& device, const ulong N, const string& name, const T&... parameters) { // accepts Memory<T> objects and fundamental data type constants
640 | 		if(!device.is_initialized()) print_error("No OpenCL Device selected. Call Device constructor.");
641 | 		this->name = name;
642 | 		cl_kernel = cl::Kernel(device.get_cl_program(), name.c_str());
643 | 		link_parameters(0u, parameters...); // expand variadic template to link kernel parameters
644 | 		set_ranges(N);
645 | 		cl_queue = device.get_cl_queue();
646 | 	}
647 | 	template<class... T> inline Kernel(const Device& device, const ulong N, const uint workgroup_size, const string& name, const T&... parameters) { // accepts Memory<T> objects and fundamental data type constants
648 | 		if(!device.is_initialized()) print_error("No OpenCL Device selected. Call Device constructor.");
649 | 		cl_kernel = cl::Kernel(device.get_cl_program(), name.c_str());
650 | 		link_parameters(0u, parameters...); // expand variadic template to link kernel parameters
651 | 		set_ranges(N, (ulong)workgroup_size);
652 | 		cl_queue = device.get_cl_queue();
653 | 	}
654 | 	inline Kernel() {} // default constructor
655 | 	inline Kernel& set_ranges(const ulong N, const ulong workgroup_size=(ulong)WORKGROUP_SIZE) {
656 | 		this->N = N;
657 | 		cl_range_global = cl::NDRange(((N+workgroup_size-1ull)/workgroup_size)*workgroup_size); // make global range a multiple of local range
658 | 		cl_range_local = cl::NDRange(workgroup_size);
659 | 		return *this;
660 | 	}
661 | 	inline const ulong range() const { return N; }
662 | 	inline uint get_number_of_parameters() const { return number_of_parameters; }
663 | 	template<class... T> inline Kernel& add_parameters(const T&... parameters) { // add parameters to the list of existing parameters
664 | 		link_parameters(number_of_parameters, parameters...); // expand variadic template to link kernel parameters
665 | 		return *this;
666 | 	}
667 | 	template<class... T> inline Kernel& set_parameters(const uint starting_position, const T&... parameters) { // set parameters starting at specified position
668 | 		link_parameters(starting_position, parameters...); // expand variadic template to link kernel parameters
669 | 		return *this;
670 | 	}
671 | 	inline Kernel& enqueue_run(const uint t=1u, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) {
672 | 		for(uint i=0u; i<t; i++) {
673 | 			check_for_errors(cl_queue.enqueueNDRangeKernel(cl_kernel, cl::NullRange, cl_range_global, cl_range_local, event_waitlist, event_returned));
674 | 		}
675 | 		return *this;
676 | 	}
677 | 	inline Kernel& run(const uint t=1u, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) {
678 | 		enqueue_run(t, event_waitlist, event_returned);
679 | 		finish_queue();
680 | 		return *this;
681 | 	}
682 | 	inline Kernel& operator()(const uint t=1u, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) {
683 | 		return run(t, event_waitlist, event_returned);
684 | 	}
685 | 	inline Kernel& finish_queue() {
686 | 		cl_queue.finish();
687 | 		return *this;
688 | 	}
689 | };


--------------------------------------------------------------------------------
/src/OpenCL/include/CL/cl_platform.h:
--------------------------------------------------------------------------------
   1 | /*******************************************************************************
   2 |  * Copyright (c) 2008-2020 The Khronos Group Inc.
   3 |  *
   4 |  * Licensed under the Apache License, Version 2.0 (the "License");
   5 |  * you may not use this file except in compliance with the License.
   6 |  * You may obtain a copy of the License at
   7 |  *
   8 |  *    http://www.apache.org/licenses/LICENSE-2.0
   9 |  *
  10 |  * Unless required by applicable law or agreed to in writing, software
  11 |  * distributed under the License is distributed on an "AS IS" BASIS,
  12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 |  * See the License for the specific language governing permissions and
  14 |  * limitations under the License.
  15 |  ******************************************************************************/
  16 | 
  17 | #ifndef __CL_PLATFORM_H
  18 | #define __CL_PLATFORM_H
  19 | 
  20 | #include <CL/cl_version.h>
  21 | 
  22 | #ifdef __cplusplus
  23 | extern "C" {
  24 | #endif
  25 | 
/*
 * Calling-convention macros: CL_API_ENTRY, CL_API_CALL and CL_CALLBACK.
 * Each macro is only defined here if it has not been defined before, so a
 * definition made prior to including this header takes precedence.
 */
#if defined(_WIN32)
    /* Windows: CL_API_CALL and CL_CALLBACK default to __stdcall, CL_API_ENTRY is empty */
    #if !defined(CL_API_ENTRY)
        #define CL_API_ENTRY
    #endif
    #if !defined(CL_API_CALL)
        #define CL_API_CALL     __stdcall
    #endif
    #if !defined(CL_CALLBACK)
        #define CL_CALLBACK     __stdcall
    #endif
#else
    /* all other platforms: the three macros default to empty */
    #if !defined(CL_API_ENTRY)
        #define CL_API_ENTRY
    #endif
    #if !defined(CL_API_CALL)
        #define CL_API_CALL
    #endif
    #if !defined(CL_CALLBACK)
        #define CL_CALLBACK
    #endif
#endif
  47 | 
  48 | /*
  49 |  * Deprecation flags refer to the last version of the header in which the
  50 |  * feature was not deprecated.
  51 |  *
  52 |  * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without
  53 |  * deprecation but is deprecated in versions later than 1.1.
  54 |  */
  55 | 
/* User hook: CL_API_SUFFIX_USER defaults to empty unless it was defined before this header is included. */
#ifndef CL_API_SUFFIX_USER
#define CL_API_SUFFIX_USER
#endif

/* User hook: CL_API_PREFIX_USER defaults to empty unless it was defined before this header is included. */
#ifndef CL_API_PREFIX_USER
#define CL_API_PREFIX_USER
#endif

/* The common suffix/prefix simply forward to the user-provided ones. */
#define CL_API_SUFFIX_COMMON CL_API_SUFFIX_USER
#define CL_API_PREFIX_COMMON CL_API_PREFIX_USER

/* Per-version (non-deprecated) suffixes; all of them expand to the common suffix. */
/* NOTE(review): presumably attached to API declarations in the other CL headers - not visible here. */
#define CL_API_SUFFIX__VERSION_1_0 CL_API_SUFFIX_COMMON
#define CL_API_SUFFIX__VERSION_1_1 CL_API_SUFFIX_COMMON
#define CL_API_SUFFIX__VERSION_1_2 CL_API_SUFFIX_COMMON
#define CL_API_SUFFIX__VERSION_2_0 CL_API_SUFFIX_COMMON
#define CL_API_SUFFIX__VERSION_2_1 CL_API_SUFFIX_COMMON
#define CL_API_SUFFIX__VERSION_2_2 CL_API_SUFFIX_COMMON
#define CL_API_SUFFIX__VERSION_3_0 CL_API_SUFFIX_COMMON
#define CL_API_SUFFIX__EXPERIMENTAL CL_API_SUFFIX_COMMON
  75 | 
  76 | 
/*
 * Compiler-specific deprecation markers:
 *  - compilers defining __GNUC__ get a trailing __attribute__((deprecated)),
 *  - MSVC (but not clang in MSVC mode) gets a leading __declspec(deprecated),
 *  - any other compiler gets no marker at all.
 */
#ifdef __GNUC__
  #define CL_API_SUFFIX_DEPRECATED __attribute__((deprecated))
  #define CL_API_PREFIX_DEPRECATED
#elif defined(_MSC_VER) && !defined(__clang__)
  #define CL_API_SUFFIX_DEPRECATED
  #define CL_API_PREFIX_DEPRECATED __declspec(deprecated)
#else
  #define CL_API_SUFFIX_DEPRECATED
  #define CL_API_PREFIX_DEPRECATED
#endif
  87 | 
/*
 * Per-version deprecation suffix/prefix macros.
 * For every version X_Y: if CL_USE_DEPRECATED_OPENCL_X_Y_APIS is defined, the
 * *_VERSION_X_Y_DEPRECATED macros expand to the common suffix/prefix only (no
 * deprecation marker); otherwise the compiler-specific deprecation marker is
 * appended as well.
 */

/* OpenCL 1.0 */
#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
    #define CL_API_SUFFIX__VERSION_1_0_DEPRECATED CL_API_SUFFIX_COMMON
    #define CL_API_PREFIX__VERSION_1_0_DEPRECATED CL_API_PREFIX_COMMON
#else
    #define CL_API_SUFFIX__VERSION_1_0_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
    #define CL_API_PREFIX__VERSION_1_0_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED
#endif

/* OpenCL 1.1 */
#ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
    #define CL_API_SUFFIX__VERSION_1_1_DEPRECATED CL_API_SUFFIX_COMMON
    #define CL_API_PREFIX__VERSION_1_1_DEPRECATED CL_API_PREFIX_COMMON
#else
    #define CL_API_SUFFIX__VERSION_1_1_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
    #define CL_API_PREFIX__VERSION_1_1_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED
#endif

/* OpenCL 1.2 */
#ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
    #define CL_API_SUFFIX__VERSION_1_2_DEPRECATED CL_API_SUFFIX_COMMON
    #define CL_API_PREFIX__VERSION_1_2_DEPRECATED CL_API_PREFIX_COMMON
#else
    #define CL_API_SUFFIX__VERSION_1_2_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
    #define CL_API_PREFIX__VERSION_1_2_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED
 #endif

/* OpenCL 2.0 */
#ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS
    #define CL_API_SUFFIX__VERSION_2_0_DEPRECATED CL_API_SUFFIX_COMMON
    #define CL_API_PREFIX__VERSION_2_0_DEPRECATED CL_API_PREFIX_COMMON
#else
    #define CL_API_SUFFIX__VERSION_2_0_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
    #define CL_API_PREFIX__VERSION_2_0_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED
#endif

/* OpenCL 2.1 */
#ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS
    #define CL_API_SUFFIX__VERSION_2_1_DEPRECATED CL_API_SUFFIX_COMMON
    #define CL_API_PREFIX__VERSION_2_1_DEPRECATED CL_API_PREFIX_COMMON
#else
    #define CL_API_SUFFIX__VERSION_2_1_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
    #define CL_API_PREFIX__VERSION_2_1_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED
#endif

/* OpenCL 2.2 */
#ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS
    #define CL_API_SUFFIX__VERSION_2_2_DEPRECATED CL_API_SUFFIX_COMMON
    #define CL_API_PREFIX__VERSION_2_2_DEPRECATED CL_API_PREFIX_COMMON
#else
    #define CL_API_SUFFIX__VERSION_2_2_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
    #define CL_API_PREFIX__VERSION_2_2_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED
#endif
 135 | 
 136 | #if (defined (_WIN32) && defined(_MSC_VER)) /* MSVC-compatible compilers on Windows: scalar types are built from the __intN keywords */
 137 | 
 138 | #if defined(__clang__)
 139 | #pragma clang diagnostic push
 140 | #pragma clang diagnostic ignored "-Wlanguage-extension-token"
 141 | #endif
 142 | 
 143 | /* intptr_t is used in cl.h and provided by stddef.h in Visual C++, but not in clang */
 144 | /* stdint.h was missing before Visual Studio 2010, include it for later versions and for clang */
 145 | #if defined(__clang__) || _MSC_VER >= 1600
 146 |     #include <stdint.h>
 147 | #endif
 148 | 
 149 | /* scalar types: fixed-width host-side equivalents of the OpenCL scalar types */
 150 | typedef signed   __int8         cl_char;
 151 | typedef unsigned __int8         cl_uchar;
 152 | typedef signed   __int16        cl_short;
 153 | typedef unsigned __int16        cl_ushort;
 154 | typedef signed   __int32        cl_int;
 155 | typedef unsigned __int32        cl_uint;
 156 | typedef signed   __int64        cl_long;
 157 | typedef unsigned __int64        cl_ulong;
 158 | 
 159 | typedef unsigned __int16        cl_half; /* stored as a raw 16-bit unsigned integer, not as a floating-point type */
 160 | typedef float                   cl_float;
 161 | typedef double                  cl_double;
 162 | 
 163 | #if defined(__clang__) /* restore the diagnostic state saved before the typedefs */
 164 | #pragma clang diagnostic pop
 165 | #endif
 166 | 
 167 | /* Macro names and corresponding values defined by OpenCL: limits of the integer scalar types */
 168 | #define CL_CHAR_BIT         8
 169 | #define CL_SCHAR_MAX        127
 170 | #define CL_SCHAR_MIN        (-127-1)
 171 | #define CL_CHAR_MAX         CL_SCHAR_MAX /* cl_char is a signed type, so its limits alias the SCHAR ones */
 172 | #define CL_CHAR_MIN         CL_SCHAR_MIN
 173 | #define CL_UCHAR_MAX        255
 174 | #define CL_SHRT_MAX         32767
 175 | #define CL_SHRT_MIN         (-32767-1)
 176 | #define CL_USHRT_MAX        65535
 177 | #define CL_INT_MAX          2147483647
 178 | #define CL_INT_MIN          (-2147483647-1) /* written as -MAX-1 so no out-of-range positive literal is needed */
 179 | #define CL_UINT_MAX         0xffffffffU
 180 | #define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
 181 | #define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) /* same -MAX-1 construction for the 64-bit type */
 182 | #define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
 183 | 
 184 | #define CL_FLT_DIG          6 /* CL_FLT_*: limits of cl_float, in the style of the <float.h> FLT_* macros */
 185 | #define CL_FLT_MANT_DIG     24
 186 | #define CL_FLT_MAX_10_EXP   +38
 187 | #define CL_FLT_MAX_EXP      +128
 188 | #define CL_FLT_MIN_10_EXP   -37
 189 | #define CL_FLT_MIN_EXP      -125
 190 | #define CL_FLT_RADIX        2
 191 | #define CL_FLT_MAX          340282346638528859811704183484516925440.0f
 192 | #define CL_FLT_MIN          1.175494350822287507969e-38f
 193 | #define CL_FLT_EPSILON      1.1920928955078125e-7f
 194 | 
 195 | #define CL_HALF_DIG          3 /* CL_HALF_*: limits of the 16-bit half format; the value macros below are float literals */
 196 | #define CL_HALF_MANT_DIG     11
 197 | #define CL_HALF_MAX_10_EXP   +4
 198 | #define CL_HALF_MAX_EXP      +16
 199 | #define CL_HALF_MIN_10_EXP   -4
 200 | #define CL_HALF_MIN_EXP      -13
 201 | #define CL_HALF_RADIX        2
 202 | #define CL_HALF_MAX          65504.0f
 203 | #define CL_HALF_MIN          6.103515625e-05f
 204 | #define CL_HALF_EPSILON      9.765625e-04f
 205 | 
 206 | #define CL_DBL_DIG          15 /* CL_DBL_*: limits of cl_double, in the style of the <float.h> DBL_* macros */
 207 | #define CL_DBL_MANT_DIG     53
 208 | #define CL_DBL_MAX_10_EXP   +308
 209 | #define CL_DBL_MAX_EXP      +1024
 210 | #define CL_DBL_MIN_10_EXP   -307
 211 | #define CL_DBL_MIN_EXP      -1021
 212 | #define CL_DBL_RADIX        2
 213 | #define CL_DBL_MAX          1.7976931348623158e+308 /* short scientific form on this branch; the non-MSVC branch spells out every digit */
 214 | #define CL_DBL_MIN          2.225073858507201383090e-308
 215 | #define CL_DBL_EPSILON      2.220446049250313080847e-16
 216 | 
 217 | #define CL_M_E              2.7182818284590452354 /* CL_M_*: double-precision math constants */
 218 | #define CL_M_LOG2E          1.4426950408889634074
 219 | #define CL_M_LOG10E         0.43429448190325182765
 220 | #define CL_M_LN2            0.69314718055994530942
 221 | #define CL_M_LN10           2.30258509299404568402
 222 | #define CL_M_PI             3.14159265358979323846
 223 | #define CL_M_PI_2           1.57079632679489661923 /* pi / 2 */
 224 | #define CL_M_PI_4           0.78539816339744830962 /* pi / 4 */
 225 | #define CL_M_1_PI           0.31830988618379067154 /* 1 / pi */
 226 | #define CL_M_2_PI           0.63661977236758134308 /* 2 / pi */
 227 | #define CL_M_2_SQRTPI       1.12837916709551257390 /* 2 / sqrt(pi) */
 228 | #define CL_M_SQRT2          1.41421356237309504880
 229 | #define CL_M_SQRT1_2        0.70710678118654752440 /* 1 / sqrt(2) */
 230 | 
 231 | #define CL_M_E_F            2.718281828f /* CL_M_*_F: the same constants as single-precision (f-suffixed) literals */
 232 | #define CL_M_LOG2E_F        1.442695041f
 233 | #define CL_M_LOG10E_F       0.434294482f
 234 | #define CL_M_LN2_F          0.693147181f
 235 | #define CL_M_LN10_F         2.302585093f
 236 | #define CL_M_PI_F           3.141592654f
 237 | #define CL_M_PI_2_F         1.570796327f
 238 | #define CL_M_PI_4_F         0.785398163f
 239 | #define CL_M_1_PI_F         0.318309886f
 240 | #define CL_M_2_PI_F         0.636619772f
 241 | #define CL_M_2_SQRTPI_F     1.128379167f
 242 | #define CL_M_SQRT2_F        1.414213562f
 243 | #define CL_M_SQRT1_2_F      0.707106781f
 244 | 
 245 | #define CL_NAN              (CL_INFINITY - CL_INFINITY) /* CL_INFINITY is defined a few lines below; that is fine because macros expand at the point of use */
 246 | #define CL_HUGE_VALF        ((cl_float) 1e50) /* 1e50 is outside float range; presumably relied upon to convert to +infinity - confirm for the target compiler */
 247 | #define CL_HUGE_VAL         ((cl_double) 1e500) /* 1e500 is outside double range; same assumption as CL_HUGE_VALF */
 248 | #define CL_MAXFLOAT         CL_FLT_MAX
 249 | #define CL_INFINITY         CL_HUGE_VALF
 250 | 
 251 | #else /* not (_WIN32 && _MSC_VER): scalar types come from the <stdint.h> fixed-width typedefs */
 252 | 
 253 | #include <stdint.h>
 254 | 
 255 | /* scalar types: fixed-width host-side equivalents of the OpenCL scalar types */
 256 | typedef int8_t          cl_char;
 257 | typedef uint8_t         cl_uchar;
 258 | typedef int16_t         cl_short;
 259 | typedef uint16_t        cl_ushort;
 260 | typedef int32_t         cl_int;
 261 | typedef uint32_t        cl_uint;
 262 | typedef int64_t         cl_long;
 263 | typedef uint64_t        cl_ulong;
 264 | 
 265 | typedef uint16_t        cl_half; /* stored as a raw 16-bit unsigned integer, not as a floating-point type */
 266 | typedef float           cl_float;
 267 | typedef double          cl_double;
 268 | 
 269 | /* Macro names and corresponding values defined by OpenCL: limits of the integer scalar types */
 270 | #define CL_CHAR_BIT         8
 271 | #define CL_SCHAR_MAX        127
 272 | #define CL_SCHAR_MIN        (-127-1)
 273 | #define CL_CHAR_MAX         CL_SCHAR_MAX /* cl_char is a signed type, so its limits alias the SCHAR ones */
 274 | #define CL_CHAR_MIN         CL_SCHAR_MIN
 275 | #define CL_UCHAR_MAX        255
 276 | #define CL_SHRT_MAX         32767
 277 | #define CL_SHRT_MIN         (-32767-1)
 278 | #define CL_USHRT_MAX        65535
 279 | #define CL_INT_MAX          2147483647
 280 | #define CL_INT_MIN          (-2147483647-1) /* written as -MAX-1 so no out-of-range positive literal is needed */
 281 | #define CL_UINT_MAX         0xffffffffU
 282 | #define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
 283 | #define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) /* same -MAX-1 construction for the 64-bit type */
 284 | #define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
 285 | 
 286 | #define CL_FLT_DIG          6 /* CL_FLT_*: limits of cl_float, in the style of the <float.h> FLT_* macros */
 287 | #define CL_FLT_MANT_DIG     24
 288 | #define CL_FLT_MAX_10_EXP   +38
 289 | #define CL_FLT_MAX_EXP      +128
 290 | #define CL_FLT_MIN_10_EXP   -37
 291 | #define CL_FLT_MIN_EXP      -125
 292 | #define CL_FLT_RADIX        2
 293 | #define CL_FLT_MAX          340282346638528859811704183484516925440.0f
 294 | #define CL_FLT_MIN          1.175494350822287507969e-38f
 295 | #define CL_FLT_EPSILON      1.1920928955078125e-7f
 296 | 
 297 | #define CL_HALF_DIG          3 /* CL_HALF_*: limits of the 16-bit half format; the value macros below are float literals */
 298 | #define CL_HALF_MANT_DIG     11
 299 | #define CL_HALF_MAX_10_EXP   +4
 300 | #define CL_HALF_MAX_EXP      +16
 301 | #define CL_HALF_MIN_10_EXP   -4
 302 | #define CL_HALF_MIN_EXP      -13
 303 | #define CL_HALF_RADIX        2
 304 | #define CL_HALF_MAX          65504.0f
 305 | #define CL_HALF_MIN          6.103515625e-05f
 306 | #define CL_HALF_EPSILON      9.765625e-04f
 307 | 
 308 | #define CL_DBL_DIG          15 /* CL_DBL_*: limits of cl_double, in the style of the <float.h> DBL_* macros */
 309 | #define CL_DBL_MANT_DIG     53
 310 | #define CL_DBL_MAX_10_EXP   +308
 311 | #define CL_DBL_MAX_EXP      +1024
 312 | #define CL_DBL_MIN_10_EXP   -307
 313 | #define CL_DBL_MIN_EXP      -1021
 314 | #define CL_DBL_RADIX        2
 315 | #define CL_DBL_MAX          179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
 316 | #define CL_DBL_MIN          2.225073858507201383090e-308
 317 | #define CL_DBL_EPSILON      2.220446049250313080847e-16
 318 | 
 319 | #define CL_M_E              2.7182818284590452354 /* CL_M_*: double-precision math constants */
 320 | #define CL_M_LOG2E          1.4426950408889634074
 321 | #define CL_M_LOG10E         0.43429448190325182765
 322 | #define CL_M_LN2            0.69314718055994530942
 323 | #define CL_M_LN10           2.30258509299404568402
 324 | #define CL_M_PI             3.14159265358979323846
 325 | #define CL_M_PI_2           1.57079632679489661923 /* pi / 2 */
 326 | #define CL_M_PI_4           0.78539816339744830962 /* pi / 4 */
 327 | #define CL_M_1_PI           0.31830988618379067154 /* 1 / pi */
 328 | #define CL_M_2_PI           0.63661977236758134308 /* 2 / pi */
 329 | #define CL_M_2_SQRTPI       1.12837916709551257390 /* 2 / sqrt(pi) */
 330 | #define CL_M_SQRT2          1.41421356237309504880
 331 | #define CL_M_SQRT1_2        0.70710678118654752440 /* 1 / sqrt(2) */
 332 | 
 333 | #define CL_M_E_F            2.718281828f /* CL_M_*_F: the same constants as single-precision (f-suffixed) literals */
 334 | #define CL_M_LOG2E_F        1.442695041f
 335 | #define CL_M_LOG10E_F       0.434294482f
 336 | #define CL_M_LN2_F          0.693147181f
 337 | #define CL_M_LN10_F         2.302585093f
 338 | #define CL_M_PI_F           3.141592654f
 339 | #define CL_M_PI_2_F         1.570796327f
 340 | #define CL_M_PI_4_F         0.785398163f
 341 | #define CL_M_1_PI_F         0.318309886f
 342 | #define CL_M_2_PI_F         0.636619772f
 343 | #define CL_M_2_SQRTPI_F     1.128379167f
 344 | #define CL_M_SQRT2_F        1.414213562f
 345 | #define CL_M_SQRT1_2_F      0.707106781f
 346 | 
 347 | #if defined( __GNUC__ ) /* GCC-compatible compilers: take infinity and NaN from compiler builtins */
 348 |    #define CL_HUGE_VALF     __builtin_huge_valf()
 349 |    #define CL_HUGE_VAL      __builtin_huge_val()
 350 |    #define CL_NAN           __builtin_nanf( "" )
 351 | #else /* other compilers: out-of-range constants and a hand-declared nanf() */
 352 |    #define CL_HUGE_VALF     ((cl_float) 1e50) /* 1e50 is outside float range; presumably relied upon to convert to +infinity - confirm for the target compiler */
 353 |    #define CL_HUGE_VAL      ((cl_double) 1e500) /* 1e500 is outside double range; same assumption as CL_HUGE_VALF */
 354 |    float nanf( const char * ); /* prototype supplied here directly so that CL_NAN can call nanf() */
 355 |    #define CL_NAN           nanf( "" )
 356 | #endif
 357 | #define CL_MAXFLOAT         CL_FLT_MAX
 358 | #define CL_INFINITY         CL_HUGE_VALF
 359 | 
 360 | #endif /* closes the (_WIN32 && _MSC_VER) / #else conditional */
 361 | 
 362 | #include <stddef.h>
 363 | 
 364 | /*
 365 |  * Vector types
 366 |  *
 367 |  *  Note:   OpenCL requires that all types be naturally aligned.
 368 |  *          This means that vector types must be naturally aligned.
 369 |  *          For example, a vector of four floats must be aligned to
 370 |  *          a 16 byte boundary (calculated as 4 * the natural 4-byte
 371 |  *          alignment of the float).  The alignment qualifiers here
 372 |  *          will only function properly if your compiler supports them
 373 |  *          and if you don't actively work to defeat them.  For example,
 374 |  *          in order for a cl_float4 to be 16 byte aligned in a struct,
 375 |  *          the start of the struct must itself be 16-byte aligned.
 376 |  *
 377 |  *          Maintaining proper alignment is the user's responsibility.
 378 |  */
 379 | 
 380 | /* Define basic vector types: each section below maps a SIMD extension's native types to __cl_* names and sets a matching __CL_*__ flag */
 381 | #if defined( __VEC__ ) /* AltiVec: 16-byte vectors declared with the __vector keyword */
 382 |   #if !defined(__clang__)
 383 |      #include <altivec.h>   /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
 384 |   #endif
 385 |    typedef __vector unsigned char     __cl_uchar16;
 386 |    typedef __vector signed char       __cl_char16;
 387 |    typedef __vector unsigned short    __cl_ushort8;
 388 |    typedef __vector signed short      __cl_short8;
 389 |    typedef __vector unsigned int      __cl_uint4;
 390 |    typedef __vector signed int        __cl_int4;
 391 |    typedef __vector float             __cl_float4;
 392 |    #define  __CL_UCHAR16__  1 /* flags tested by the cl_<type>N unions to decide whether to expose a native vN member */
 393 |    #define  __CL_CHAR16__   1
 394 |    #define  __CL_USHORT8__  1
 395 |    #define  __CL_SHORT8__   1
 396 |    #define  __CL_UINT4__    1
 397 |    #define  __CL_INT4__     1
 398 |    #define  __CL_FLOAT4__   1
 399 | #endif /* __VEC__ */
 400 | 
 401 | #if defined( __SSE__ ) /* SSE: provides the 4 x float native vector */
 402 |     #if defined( __MINGW64__ ) /* MinGW-w64 pulls the intrinsics in through <intrin.h> */
 403 |         #include <intrin.h>
 404 |     #else
 405 |         #include <xmmintrin.h>
 406 |     #endif
 407 |     #if defined( __GNUC__ ) /* GCC-compatible: declare via the vector_size attribute instead of __m128 */
 408 |         typedef float __cl_float4   __attribute__((vector_size(16)));
 409 |     #else
 410 |         typedef __m128 __cl_float4;
 411 |     #endif
 412 |     #define __CL_FLOAT4__   1
 413 | #endif /* __SSE__ */
 414 | 
 415 | #if defined( __SSE2__ ) /* SSE2: provides the 16-byte integer and double native vectors */
 416 |     #if defined( __MINGW64__ ) /* MinGW-w64 pulls the intrinsics in through <intrin.h> */
 417 |         #include <intrin.h>
 418 |     #else
 419 |         #include <emmintrin.h>
 420 |     #endif
 421 |     #if defined( __GNUC__ ) /* GCC-compatible: element-typed vectors via the vector_size attribute */
 422 |         typedef cl_uchar    __cl_uchar16    __attribute__((vector_size(16)));
 423 |         typedef cl_char     __cl_char16     __attribute__((vector_size(16)));
 424 |         typedef cl_ushort   __cl_ushort8    __attribute__((vector_size(16)));
 425 |         typedef cl_short    __cl_short8     __attribute__((vector_size(16)));
 426 |         typedef cl_uint     __cl_uint4      __attribute__((vector_size(16)));
 427 |         typedef cl_int      __cl_int4       __attribute__((vector_size(16)));
 428 |         typedef cl_ulong    __cl_ulong2     __attribute__((vector_size(16)));
 429 |         typedef cl_long     __cl_long2      __attribute__((vector_size(16)));
 430 |         typedef cl_double   __cl_double2    __attribute__((vector_size(16)));
 431 |     #else /* other compilers: every integer vector is the same __m128i type; double uses __m128d */
 432 |         typedef __m128i __cl_uchar16;
 433 |         typedef __m128i __cl_char16;
 434 |         typedef __m128i __cl_ushort8;
 435 |         typedef __m128i __cl_short8;
 436 |         typedef __m128i __cl_uint4;
 437 |         typedef __m128i __cl_int4;
 438 |         typedef __m128i __cl_ulong2;
 439 |         typedef __m128i __cl_long2;
 440 |         typedef __m128d __cl_double2;
 441 |     #endif
 442 |     #define __CL_UCHAR16__  1 /* flags tested by the cl_<type>N unions to decide whether to expose a native vN member */
 443 |     #define __CL_CHAR16__   1
 444 |     #define __CL_USHORT8__  1
 445 |     #define __CL_SHORT8__   1
 446 |     #define __CL_INT4__     1
 447 |     #define __CL_UINT4__    1
 448 |     #define __CL_ULONG2__   1
 449 |     #define __CL_LONG2__    1
 450 |     #define __CL_DOUBLE2__  1
 451 | #endif /* __SSE2__ */
 452 | 
 453 | #if defined( __MMX__ ) /* MMX: provides the 8-byte native vectors */
 454 |     #include <mmintrin.h>
 455 |     #if defined( __GNUC__ ) /* GCC-compatible: element-typed vectors via the vector_size attribute */
 456 |         typedef cl_uchar    __cl_uchar8     __attribute__((vector_size(8)));
 457 |         typedef cl_char     __cl_char8      __attribute__((vector_size(8)));
 458 |         typedef cl_ushort   __cl_ushort4    __attribute__((vector_size(8)));
 459 |         typedef cl_short    __cl_short4     __attribute__((vector_size(8)));
 460 |         typedef cl_uint     __cl_uint2      __attribute__((vector_size(8)));
 461 |         typedef cl_int      __cl_int2       __attribute__((vector_size(8)));
 462 |         typedef cl_ulong    __cl_ulong1     __attribute__((vector_size(8)));
 463 |         typedef cl_long     __cl_long1      __attribute__((vector_size(8)));
 464 |         typedef cl_float    __cl_float2     __attribute__((vector_size(8)));
 465 |     #else /* other compilers: every 8-byte vector, including float2, is the same __m64 type */
 466 |         typedef __m64       __cl_uchar8;
 467 |         typedef __m64       __cl_char8;
 468 |         typedef __m64       __cl_ushort4;
 469 |         typedef __m64       __cl_short4;
 470 |         typedef __m64       __cl_uint2;
 471 |         typedef __m64       __cl_int2;
 472 |         typedef __m64       __cl_ulong1;
 473 |         typedef __m64       __cl_long1;
 474 |         typedef __m64       __cl_float2;
 475 |     #endif
 476 |     #define __CL_UCHAR8__   1 /* flags tested by the cl_<type>N unions to decide whether to expose a native vN member */
 477 |     #define __CL_CHAR8__    1
 478 |     #define __CL_USHORT4__  1
 479 |     #define __CL_SHORT4__   1
 480 |     #define __CL_INT2__     1
 481 |     #define __CL_UINT2__    1
 482 |     #define __CL_ULONG1__   1
 483 |     #define __CL_LONG1__    1
 484 |     #define __CL_FLOAT2__   1
 485 | #endif /* __MMX__ */
 486 | 
 487 | #if defined( __AVX__ ) /* AVX: provides the 32-byte float and double native vectors */
 488 |     #if defined( __MINGW64__ ) /* MinGW-w64 pulls the intrinsics in through <intrin.h> */
 489 |         #include <intrin.h>
 490 |     #else
 491 |         #include <immintrin.h>
 492 |     #endif
 493 |     #if defined( __GNUC__ ) /* GCC-compatible: element-typed vectors via the vector_size attribute */
 494 |         typedef cl_float    __cl_float8     __attribute__((vector_size(32)));
 495 |         typedef cl_double   __cl_double4    __attribute__((vector_size(32)));
 496 |     #else
 497 |         typedef __m256      __cl_float8;
 498 |         typedef __m256d     __cl_double4;
 499 |     #endif
 500 |     #define __CL_FLOAT8__   1
 501 |     #define __CL_DOUBLE4__  1
 502 | #endif /* __AVX__ */
 503 | 
 504 | /* Define capabilities for anonymous struct members: __CL_HAS_ANON_STRUCT__ is 1 or 0, __CL_ANON_STRUCT__ is the keyword (if any) to put before each anonymous struct. */
 505 | #if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L /* C11 or later, compiled as C */
 506 | #define  __CL_HAS_ANON_STRUCT__ 1
 507 | #define  __CL_ANON_STRUCT__
 508 | #elif defined(_WIN32) && defined(_MSC_VER) && !defined(__STDC__) /* MSVC with __STDC__ not defined */
 509 | #define  __CL_HAS_ANON_STRUCT__ 1
 510 | #define  __CL_ANON_STRUCT__
 511 | #elif defined(__GNUC__) && ! defined(__STRICT_ANSI__) /* GCC-compatible, not in strict ANSI mode: mark each use with __extension__ */
 512 | #define  __CL_HAS_ANON_STRUCT__ 1
 513 | #define  __CL_ANON_STRUCT__ __extension__
 514 | #elif defined(__clang__) /* clang that did not match the branches above */
 515 | #define  __CL_HAS_ANON_STRUCT__ 1
 516 | #define  __CL_ANON_STRUCT__ __extension__
 517 | #else /* unknown compiler: named vector fields are disabled, only the s[] arrays remain */
 518 | #define  __CL_HAS_ANON_STRUCT__ 0
 519 | #define  __CL_ANON_STRUCT__
 520 | #endif
 521 | 
 522 | #if defined(_WIN32) && defined(_MSC_VER) && __CL_HAS_ANON_STRUCT__ /* only needed when the unions below actually use anonymous structs */
 523 |    /* Disable warning C4201: nonstandard extension used : nameless struct/union */
 524 |     #pragma warning( push )
 525 |     #pragma warning( disable : 4201 )
 526 | #endif /* NOTE(review): a matching "#pragma warning( pop )" is expected after the vector type definitions - confirm it exists later in this header */
 527 | 
 528 | /* Define alignment keys: CL_ALIGNED(_x) requests _x-byte alignment where the compiler path below supports it */
 529 | #if defined( __GNUC__ ) || defined(__INTEGRITY)
 530 |     #define CL_ALIGNED(_x)          __attribute__ ((aligned(_x)))
 531 | #elif defined( _WIN32) && (_MSC_VER) /* NOTE(review): tests the value of _MSC_VER rather than defined(_MSC_VER) as the other checks in this header do; equivalent whenever _MSC_VER is nonzero */
 532 |     /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements     */
 533 |     /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx                                                 */
 534 |     /* #include <crtdefs.h>                                                                                             */
 535 |     /* #define CL_ALIGNED(_x)          _CRT_ALIGN(_x)                                                                   */
 536 |     #define CL_ALIGNED(_x) /* expands to nothing: no alignment is requested on this path */
 537 | #else
 538 |    #warning  Need to implement some method to align data here
 539 |    #define  CL_ALIGNED(_x) /* fallback: also expands to nothing, after emitting the #warning above */
 540 | #endif
 541 | 
 542 | /* Indicate whether .xyzw, .s0123 and .hi.lo are supported; both macros are left undefined when anonymous structs are unavailable */
 543 | #if __CL_HAS_ANON_STRUCT__
 544 |     /* .xyzw and .s0123...{f|F} are supported */
 545 |     #define CL_HAS_NAMED_VECTOR_FIELDS 1
 546 |     /* .hi and .lo are supported */
 547 |     #define CL_HAS_HI_LO_VECTOR_FIELDS 1
 548 | #endif /* __CL_HAS_ANON_STRUCT__ */
 549 | 
 550 | /* Define cl_vector types */
 551 | 
 552 | /* ---- cl_charn ---- */
 553 | typedef union /* cl_char2: two cl_char elements; every member below overlays the same storage */
 554 | {
 555 |     cl_char  CL_ALIGNED(2) s[2]; /* indexed access; CL_ALIGNED(2) requests 2-byte alignment where supported */
 556 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 557 |    __CL_ANON_STRUCT__ struct{ cl_char  x, y; };
 558 |    __CL_ANON_STRUCT__ struct{ cl_char  s0, s1; };
 559 |    __CL_ANON_STRUCT__ struct{ cl_char  lo, hi; };
 560 | #endif
 561 | #if defined( __CL_CHAR2__) /* native vector view, present only when __CL_CHAR2__ is defined */
 562 |     __cl_char2     v2;
 563 | #endif
 564 | }cl_char2;
 565 | 
 566 | typedef union /* cl_char4: four cl_char elements; every member below overlays the same storage */
 567 | {
 568 |     cl_char  CL_ALIGNED(4) s[4]; /* indexed access; CL_ALIGNED(4) requests 4-byte alignment where supported */
 569 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 570 |    __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };
 571 |    __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3; };
 572 |    __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; }; /* lower and upper halves as cl_char2 */
 573 | #endif
 574 | #if defined( __CL_CHAR2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 575 |     __cl_char2     v2[2];
 576 | #endif
 577 | #if defined( __CL_CHAR4__)
 578 |     __cl_char4     v4;
 579 | #endif
 580 | }cl_char4;
 581 | 
 582 | /* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
 583 | typedef  cl_char4  cl_char3;
 584 | 
 585 | typedef union /* cl_char8: eight cl_char elements; every member below overlays the same storage */
 586 | {
 587 |     cl_char   CL_ALIGNED(8) s[8]; /* indexed access; CL_ALIGNED(8) requests 8-byte alignment where supported */
 588 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 589 |    __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; }; /* names only the first four elements */
 590 |    __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7; };
 591 |    __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; }; /* lower and upper halves as cl_char4 */
 592 | #endif
 593 | #if defined( __CL_CHAR2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 594 |     __cl_char2     v2[4];
 595 | #endif
 596 | #if defined( __CL_CHAR4__)
 597 |     __cl_char4     v4[2];
 598 | #endif
 599 | #if defined( __CL_CHAR8__ )
 600 |     __cl_char8     v8;
 601 | #endif
 602 | }cl_char8;
 603 | 
 604 | typedef union /* cl_char16: sixteen cl_char elements; every member below overlays the same storage */
 605 | {
 606 |     cl_char  CL_ALIGNED(16) s[16]; /* indexed access; CL_ALIGNED(16) requests 16-byte alignment where supported */
 607 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 608 |    __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; /* spacers pad elements 4-9 so the lowercase sa..sf land on elements 10-15 */
 609 |    __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
 610 |    __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; }; /* lower and upper halves as cl_char8 */
 611 | #endif
 612 | #if defined( __CL_CHAR2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 613 |     __cl_char2     v2[8];
 614 | #endif
 615 | #if defined( __CL_CHAR4__)
 616 |     __cl_char4     v4[4];
 617 | #endif
 618 | #if defined( __CL_CHAR8__ )
 619 |     __cl_char8     v8[2];
 620 | #endif
 621 | #if defined( __CL_CHAR16__ )
 622 |     __cl_char16    v16;
 623 | #endif
 624 | }cl_char16;
 625 | 
 626 | 
 627 | /* ---- cl_ucharn ---- */
 628 | typedef union /* cl_uchar2: two cl_uchar elements; every member below overlays the same storage */
 629 | {
 630 |     cl_uchar  CL_ALIGNED(2) s[2]; /* indexed access; CL_ALIGNED(2) requests 2-byte alignment where supported */
 631 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 632 |    __CL_ANON_STRUCT__ struct{ cl_uchar  x, y; };
 633 |    __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1; };
 634 |    __CL_ANON_STRUCT__ struct{ cl_uchar  lo, hi; };
 635 | #endif
 636 | #if defined( __CL_UCHAR2__) /* native vector view; guard spelled in upper case like the cl_uchar4/8/16 guards for the same type */
 637 |     __cl_uchar2     v2;
 638 | #endif
 639 | }cl_uchar2;
 640 | 
 641 | typedef union /* cl_uchar4: four cl_uchar elements; every member below overlays the same storage */
 642 | {
 643 |     cl_uchar  CL_ALIGNED(4) s[4]; /* indexed access; CL_ALIGNED(4) requests 4-byte alignment where supported */
 644 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 645 |    __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };
 646 |    __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3; };
 647 |    __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; }; /* lower and upper halves as cl_uchar2 */
 648 | #endif
 649 | #if defined( __CL_UCHAR2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 650 |     __cl_uchar2     v2[2];
 651 | #endif
 652 | #if defined( __CL_UCHAR4__)
 653 |     __cl_uchar4     v4;
 654 | #endif
 655 | }cl_uchar4;
 656 | 
 657 | /* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
 658 | typedef  cl_uchar4  cl_uchar3;
 659 | 
 660 | typedef union /* cl_uchar8: eight cl_uchar elements; every member below overlays the same storage */
 661 | {
 662 |     cl_uchar   CL_ALIGNED(8) s[8]; /* indexed access; CL_ALIGNED(8) requests 8-byte alignment where supported */
 663 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 664 |    __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; }; /* names only the first four elements */
 665 |    __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7; };
 666 |    __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; }; /* lower and upper halves as cl_uchar4 */
 667 | #endif
 668 | #if defined( __CL_UCHAR2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 669 |     __cl_uchar2     v2[4];
 670 | #endif
 671 | #if defined( __CL_UCHAR4__)
 672 |     __cl_uchar4     v4[2];
 673 | #endif
 674 | #if defined( __CL_UCHAR8__ )
 675 |     __cl_uchar8     v8;
 676 | #endif
 677 | }cl_uchar8;
 678 | 
 679 | typedef union /* cl_uchar16: sixteen cl_uchar elements; every member below overlays the same storage */
 680 | {
 681 |     cl_uchar  CL_ALIGNED(16) s[16]; /* indexed access; CL_ALIGNED(16) requests 16-byte alignment where supported */
 682 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 683 |    __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; /* spacers pad elements 4-9 so the lowercase sa..sf land on elements 10-15 */
 684 |    __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
 685 |    __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; }; /* lower and upper halves as cl_uchar8 */
 686 | #endif
 687 | #if defined( __CL_UCHAR2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 688 |     __cl_uchar2     v2[8];
 689 | #endif
 690 | #if defined( __CL_UCHAR4__)
 691 |     __cl_uchar4     v4[4];
 692 | #endif
 693 | #if defined( __CL_UCHAR8__ )
 694 |     __cl_uchar8     v8[2];
 695 | #endif
 696 | #if defined( __CL_UCHAR16__ )
 697 |     __cl_uchar16    v16;
 698 | #endif
 699 | }cl_uchar16;
 700 | 
 701 | 
 702 | /* ---- cl_shortn ---- */
 703 | typedef union /* cl_short2: two cl_short elements; every member below overlays the same storage */
 704 | {
 705 |     cl_short  CL_ALIGNED(4) s[2]; /* indexed access; CL_ALIGNED(4) requests 4-byte alignment where supported */
 706 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 707 |    __CL_ANON_STRUCT__ struct{ cl_short  x, y; };
 708 |    __CL_ANON_STRUCT__ struct{ cl_short  s0, s1; };
 709 |    __CL_ANON_STRUCT__ struct{ cl_short  lo, hi; };
 710 | #endif
 711 | #if defined( __CL_SHORT2__) /* native vector view, present only when __CL_SHORT2__ is defined */
 712 |     __cl_short2     v2;
 713 | #endif
 714 | }cl_short2;
 715 | 
 716 | typedef union /* cl_short4: four cl_short elements; every member below overlays the same storage */
 717 | {
 718 |     cl_short  CL_ALIGNED(8) s[4]; /* indexed access; CL_ALIGNED(8) requests 8-byte alignment where supported */
 719 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 720 |    __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };
 721 |    __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3; };
 722 |    __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; }; /* lower and upper halves as cl_short2 */
 723 | #endif
 724 | #if defined( __CL_SHORT2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 725 |     __cl_short2     v2[2];
 726 | #endif
 727 | #if defined( __CL_SHORT4__)
 728 |     __cl_short4     v4;
 729 | #endif
 730 | }cl_short4;
 731 | 
 732 | /* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
 733 | typedef  cl_short4  cl_short3;
 734 | 
 735 | typedef union /* cl_short8: eight cl_short elements; every member below overlays the same storage */
 736 | {
 737 |     cl_short   CL_ALIGNED(16) s[8]; /* indexed access; CL_ALIGNED(16) requests 16-byte alignment where supported */
 738 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 739 |    __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; }; /* names only the first four elements */
 740 |    __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7; };
 741 |    __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; }; /* lower and upper halves as cl_short4 */
 742 | #endif
 743 | #if defined( __CL_SHORT2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 744 |     __cl_short2     v2[4];
 745 | #endif
 746 | #if defined( __CL_SHORT4__)
 747 |     __cl_short4     v4[2];
 748 | #endif
 749 | #if defined( __CL_SHORT8__ )
 750 |     __cl_short8     v8;
 751 | #endif
 752 | }cl_short8;
 753 | 
 754 | typedef union /* cl_short16: sixteen cl_short elements; every member below overlays the same storage */
 755 | {
 756 |     cl_short  CL_ALIGNED(32) s[16]; /* indexed access; CL_ALIGNED(32) requests 32-byte alignment where supported */
 757 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 758 |    __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; /* spacers pad elements 4-9 so the lowercase sa..sf land on elements 10-15 */
 759 |    __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
 760 |    __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; }; /* lower and upper halves as cl_short8 */
 761 | #endif
 762 | #if defined( __CL_SHORT2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 763 |     __cl_short2     v2[8];
 764 | #endif
 765 | #if defined( __CL_SHORT4__)
 766 |     __cl_short4     v4[4];
 767 | #endif
 768 | #if defined( __CL_SHORT8__ )
 769 |     __cl_short8     v8[2];
 770 | #endif
 771 | #if defined( __CL_SHORT16__ )
 772 |     __cl_short16    v16;
 773 | #endif
 774 | }cl_short16;
 775 | 
 776 | 
 777 | /* ---- cl_ushortn ---- */
 778 | typedef union /* cl_ushort2: two cl_ushort elements; every member below overlays the same storage */
 779 | {
 780 |     cl_ushort  CL_ALIGNED(4) s[2]; /* indexed access; CL_ALIGNED(4) requests 4-byte alignment where supported */
 781 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 782 |    __CL_ANON_STRUCT__ struct{ cl_ushort  x, y; };
 783 |    __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1; };
 784 |    __CL_ANON_STRUCT__ struct{ cl_ushort  lo, hi; };
 785 | #endif
 786 | #if defined( __CL_USHORT2__) /* native vector view, present only when __CL_USHORT2__ is defined */
 787 |     __cl_ushort2     v2;
 788 | #endif
 789 | }cl_ushort2;
 790 | 
 791 | typedef union /* cl_ushort4: four cl_ushort elements; every member below overlays the same storage */
 792 | {
 793 |     cl_ushort  CL_ALIGNED(8) s[4]; /* indexed access; CL_ALIGNED(8) requests 8-byte alignment where supported */
 794 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 795 |    __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };
 796 |    __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3; };
 797 |    __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; }; /* lower and upper halves as cl_ushort2 */
 798 | #endif
 799 | #if defined( __CL_USHORT2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 800 |     __cl_ushort2     v2[2];
 801 | #endif
 802 | #if defined( __CL_USHORT4__)
 803 |     __cl_ushort4     v4;
 804 | #endif
 805 | }cl_ushort4;
 806 | 
 807 | /* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
 808 | typedef  cl_ushort4  cl_ushort3;
 809 | 
 810 | typedef union /* cl_ushort8: eight cl_ushort elements; every member below overlays the same storage */
 811 | {
 812 |     cl_ushort   CL_ALIGNED(16) s[8]; /* indexed access; CL_ALIGNED(16) requests 16-byte alignment where supported */
 813 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 814 |    __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; }; /* names only the first four elements */
 815 |    __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7; };
 816 |    __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; }; /* lower and upper halves as cl_ushort4 */
 817 | #endif
 818 | #if defined( __CL_USHORT2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 819 |     __cl_ushort2     v2[4];
 820 | #endif
 821 | #if defined( __CL_USHORT4__)
 822 |     __cl_ushort4     v4[2];
 823 | #endif
 824 | #if defined( __CL_USHORT8__ )
 825 |     __cl_ushort8     v8;
 826 | #endif
 827 | }cl_ushort8;
 828 | 
 829 | typedef union /* cl_ushort16: sixteen cl_ushort elements; every member below overlays the same storage */
 830 | {
 831 |     cl_ushort  CL_ALIGNED(32) s[16]; /* indexed access; CL_ALIGNED(32) requests 32-byte alignment where supported */
 832 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 833 |    __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; /* spacers pad elements 4-9 so the lowercase sa..sf land on elements 10-15 */
 834 |    __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
 835 |    __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; }; /* lower and upper halves as cl_ushort8 */
 836 | #endif
 837 | #if defined( __CL_USHORT2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 838 |     __cl_ushort2     v2[8];
 839 | #endif
 840 | #if defined( __CL_USHORT4__)
 841 |     __cl_ushort4     v4[4];
 842 | #endif
 843 | #if defined( __CL_USHORT8__ )
 844 |     __cl_ushort8     v8[2];
 845 | #endif
 846 | #if defined( __CL_USHORT16__ )
 847 |     __cl_ushort16    v16;
 848 | #endif
 849 | }cl_ushort16;
 850 | 
 851 | 
 852 | /* ---- cl_halfn ---- */
 853 | typedef union /* cl_half2: two cl_half elements (cl_half is a 16-bit unsigned integer typedef); all members overlay the same storage */
 854 | {
 855 |     cl_half  CL_ALIGNED(4) s[2]; /* indexed access; CL_ALIGNED(4) requests 4-byte alignment where supported */
 856 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 857 |     __CL_ANON_STRUCT__ struct{ cl_half  x, y; };
 858 |     __CL_ANON_STRUCT__ struct{ cl_half  s0, s1; };
 859 |     __CL_ANON_STRUCT__ struct{ cl_half  lo, hi; };
 860 | #endif
 861 | #if defined( __CL_HALF2__) /* native vector view, present only when __CL_HALF2__ is defined */
 862 |     __cl_half2     v2;
 863 | #endif
 864 | }cl_half2;
 865 | 
 866 | typedef union /* cl_half4: four cl_half elements; every member below overlays the same storage */
 867 | {
 868 |     cl_half  CL_ALIGNED(8) s[4]; /* indexed access; CL_ALIGNED(8) requests 8-byte alignment where supported */
 869 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 870 |     __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w; };
 871 |     __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3; };
 872 |     __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; }; /* lower and upper halves as cl_half2 */
 873 | #endif
 874 | #if defined( __CL_HALF2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 875 |     __cl_half2     v2[2];
 876 | #endif
 877 | #if defined( __CL_HALF4__)
 878 |     __cl_half4     v4;
 879 | #endif
 880 | }cl_half4;
 881 | 
 882 | /* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */
 883 | typedef  cl_half4  cl_half3;
 884 | 
 885 | typedef union /* cl_half8: eight cl_half elements; every member below overlays the same storage */
 886 | {
 887 |     cl_half   CL_ALIGNED(16) s[8]; /* indexed access; CL_ALIGNED(16) requests 16-byte alignment where supported */
 888 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 889 |     __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w; }; /* names only the first four elements */
 890 |     __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3, s4, s5, s6, s7; };
 891 |     __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; }; /* lower and upper halves as cl_half4 */
 892 | #endif
 893 | #if defined( __CL_HALF2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 894 |     __cl_half2     v2[4];
 895 | #endif
 896 | #if defined( __CL_HALF4__)
 897 |     __cl_half4     v4[2];
 898 | #endif
 899 | #if defined( __CL_HALF8__ )
 900 |     __cl_half8     v8;
 901 | #endif
 902 | }cl_half8;
 903 | 
 904 | typedef union /* cl_half16: sixteen cl_half elements; every member below overlays the same storage */
 905 | {
 906 |     cl_half  CL_ALIGNED(32) s[16]; /* indexed access; CL_ALIGNED(32) requests 32-byte alignment where supported */
 907 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 908 |     __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; /* spacers pad elements 4-9 so the lowercase sa..sf land on elements 10-15 */
 909 |     __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
 910 |     __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; }; /* lower and upper halves as cl_half8 */
 911 | #endif
 912 | #if defined( __CL_HALF2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 913 |     __cl_half2     v2[8];
 914 | #endif
 915 | #if defined( __CL_HALF4__)
 916 |     __cl_half4     v4[4];
 917 | #endif
 918 | #if defined( __CL_HALF8__ )
 919 |     __cl_half8     v8[2];
 920 | #endif
 921 | #if defined( __CL_HALF16__ )
 922 |     __cl_half16    v16;
 923 | #endif
 924 | }cl_half16;
 925 | 
 926 | /* ---- cl_intn ---- */
 927 | typedef union /* cl_int2: two cl_int elements; every member below overlays the same storage */
 928 | {
 929 |     cl_int  CL_ALIGNED(8) s[2]; /* indexed access; CL_ALIGNED(8) requests 8-byte alignment where supported */
 930 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 931 |    __CL_ANON_STRUCT__ struct{ cl_int  x, y; };
 932 |    __CL_ANON_STRUCT__ struct{ cl_int  s0, s1; };
 933 |    __CL_ANON_STRUCT__ struct{ cl_int  lo, hi; };
 934 | #endif
 935 | #if defined( __CL_INT2__) /* native vector view, present only when __CL_INT2__ is defined */
 936 |     __cl_int2     v2;
 937 | #endif
 938 | }cl_int2;
 939 | 
 940 | typedef union /* cl_int4: four cl_int elements; every member below overlays the same storage */
 941 | {
 942 |     cl_int  CL_ALIGNED(16) s[4]; /* indexed access; CL_ALIGNED(16) requests 16-byte alignment where supported */
 943 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 944 |    __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };
 945 |    __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3; };
 946 |    __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; }; /* lower and upper halves as cl_int2 */
 947 | #endif
 948 | #if defined( __CL_INT2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 949 |     __cl_int2     v2[2];
 950 | #endif
 951 | #if defined( __CL_INT4__)
 952 |     __cl_int4     v4;
 953 | #endif
 954 | }cl_int4;
 955 | 
 956 | /* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
 957 | typedef  cl_int4  cl_int3;
 958 | 
 959 | typedef union /* cl_int8: eight cl_int elements; every member below overlays the same storage */
 960 | {
 961 |     cl_int   CL_ALIGNED(32) s[8]; /* indexed access; CL_ALIGNED(32) requests 32-byte alignment where supported */
 962 | #if __CL_HAS_ANON_STRUCT__ /* named views require anonymous-struct support */
 963 |    __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; }; /* names only the first four elements */
 964 |    __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7; };
 965 |    __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; }; /* lower and upper halves as cl_int4 */
 966 | #endif
 967 | #if defined( __CL_INT2__) /* native vector views, each present only when its __CL_*__ flag is defined */
 968 |     __cl_int2     v2[4];
 969 | #endif
 970 | #if defined( __CL_INT4__)
 971 |     __cl_int4     v4[2];
 972 | #endif
 973 | #if defined( __CL_INT8__ )
 974 |     __cl_int8     v8;
 975 | #endif
 976 | }cl_int8;
 977 | 
 978 | typedef union /* Sixteen cl_int components in one shared storage area, reachable under several member names. */
 979 | {
 980 |     cl_int  CL_ALIGNED(64) s[16]; /* array view of all sixteen components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
 981 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
 982 |    __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; /* x,y,z,w name components 0-3; the __spacer members only occupy positions 4-9 so that sa..sf land on components 10-15 */
 983 |    __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; /* components 0-15 by hexadecimal digit */
 984 |    __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; }; /* first and second half, each a cl_int8 */
 985 | #endif
 986 | #if defined( __CL_INT2__)
 987 |     __cl_int2     v2[8]; /* compiled only when __CL_INT2__ is defined; __cl_int2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
 988 | #endif
 989 | #if defined( __CL_INT4__)
 990 |     __cl_int4     v4[4]; /* compiled only when __CL_INT4__ is defined */
 991 | #endif
 992 | #if defined( __CL_INT8__ )
 993 |     __cl_int8     v8[2]; /* compiled only when __CL_INT8__ is defined */
 994 | #endif
 995 | #if defined( __CL_INT16__ )
 996 |     __cl_int16    v16; /* compiled only when __CL_INT16__ is defined */
 997 | #endif
 998 | }cl_int16;
 999 | 
1000 | 
1001 | /* ---- cl_uintn ---- */
1002 | typedef union /* Two cl_uint components in one shared storage area, reachable under several member names. */
1003 | {
1004 |     cl_uint  CL_ALIGNED(8) s[2]; /* array view of both components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1005 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1006 |    __CL_ANON_STRUCT__ struct{ cl_uint  x, y; }; /* components 0-1 by letter */
1007 |    __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1; }; /* components 0-1 by number */
1008 |    __CL_ANON_STRUCT__ struct{ cl_uint  lo, hi; }; /* first and second component */
1009 | #endif
1010 | #if defined( __CL_UINT2__)
1011 |     __cl_uint2     v2; /* compiled only when __CL_UINT2__ is defined; __cl_uint2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1012 | #endif
1013 | }cl_uint2;
1014 | 
1015 | typedef union /* Four cl_uint components in one shared storage area, reachable under several member names. */
1016 | {
1017 |     cl_uint  CL_ALIGNED(16) s[4]; /* array view of all four components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1018 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1019 |    __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; }; /* components 0-3 by letter */
1020 |    __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3; }; /* components 0-3 by number */
1021 |    __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; }; /* first and second half, each a cl_uint2 */
1022 | #endif
1023 | #if defined( __CL_UINT2__)
1024 |     __cl_uint2     v2[2]; /* compiled only when __CL_UINT2__ is defined; __cl_uint2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1025 | #endif
1026 | #if defined( __CL_UINT4__)
1027 |     __cl_uint4     v4; /* compiled only when __CL_UINT4__ is defined */
1028 | #endif
1029 | }cl_uint4;
1030 | 
1031 | /* cl_uint3 is identical in size, alignment and behavior to cl_uint4, so it is declared as a plain alias and carries four components of storage. See section 6.1.5 (presumably of the OpenCL C specification -- confirm). */
1032 | typedef  cl_uint4  cl_uint3;
1033 | 
1034 | typedef union /* Eight cl_uint components in one shared storage area, reachable under several member names. */
1035 | {
1036 |     cl_uint   CL_ALIGNED(32) s[8]; /* array view of all eight components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1037 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1038 |    __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; }; /* letter names cover only components 0-3 */
1039 |    __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7; }; /* components 0-7 by number */
1040 |    __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; }; /* first and second half, each a cl_uint4 */
1041 | #endif
1042 | #if defined( __CL_UINT2__)
1043 |     __cl_uint2     v2[4]; /* compiled only when __CL_UINT2__ is defined; __cl_uint2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1044 | #endif
1045 | #if defined( __CL_UINT4__)
1046 |     __cl_uint4     v4[2]; /* compiled only when __CL_UINT4__ is defined */
1047 | #endif
1048 | #if defined( __CL_UINT8__ )
1049 |     __cl_uint8     v8; /* compiled only when __CL_UINT8__ is defined */
1050 | #endif
1051 | }cl_uint8;
1052 | 
1053 | typedef union /* Sixteen cl_uint components in one shared storage area, reachable under several member names. */
1054 | {
1055 |     cl_uint  CL_ALIGNED(64) s[16]; /* array view of all sixteen components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1056 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1057 |    __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; /* x,y,z,w name components 0-3; the __spacer members only occupy positions 4-9 so that sa..sf land on components 10-15 */
1058 |    __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; /* components 0-15 by hexadecimal digit */
1059 |    __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; }; /* first and second half, each a cl_uint8 */
1060 | #endif
1061 | #if defined( __CL_UINT2__)
1062 |     __cl_uint2     v2[8]; /* compiled only when __CL_UINT2__ is defined; __cl_uint2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1063 | #endif
1064 | #if defined( __CL_UINT4__)
1065 |     __cl_uint4     v4[4]; /* compiled only when __CL_UINT4__ is defined */
1066 | #endif
1067 | #if defined( __CL_UINT8__ )
1068 |     __cl_uint8     v8[2]; /* compiled only when __CL_UINT8__ is defined */
1069 | #endif
1070 | #if defined( __CL_UINT16__ )
1071 |     __cl_uint16    v16; /* compiled only when __CL_UINT16__ is defined */
1072 | #endif
1073 | }cl_uint16;
1074 | 
1075 | /* ---- cl_longn ---- */
1076 | typedef union /* Two cl_long components in one shared storage area, reachable under several member names. */
1077 | {
1078 |     cl_long  CL_ALIGNED(16) s[2]; /* array view of both components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1079 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1080 |    __CL_ANON_STRUCT__ struct{ cl_long  x, y; }; /* components 0-1 by letter */
1081 |    __CL_ANON_STRUCT__ struct{ cl_long  s0, s1; }; /* components 0-1 by number */
1082 |    __CL_ANON_STRUCT__ struct{ cl_long  lo, hi; }; /* first and second component */
1083 | #endif
1084 | #if defined( __CL_LONG2__)
1085 |     __cl_long2     v2; /* compiled only when __CL_LONG2__ is defined; __cl_long2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1086 | #endif
1087 | }cl_long2;
1088 | 
1089 | typedef union /* Four cl_long components in one shared storage area, reachable under several member names. */
1090 | {
1091 |     cl_long  CL_ALIGNED(32) s[4]; /* array view of all four components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1092 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1093 |    __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; }; /* components 0-3 by letter */
1094 |    __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3; }; /* components 0-3 by number */
1095 |    __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; }; /* first and second half, each a cl_long2 */
1096 | #endif
1097 | #if defined( __CL_LONG2__)
1098 |     __cl_long2     v2[2]; /* compiled only when __CL_LONG2__ is defined; __cl_long2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1099 | #endif
1100 | #if defined( __CL_LONG4__)
1101 |     __cl_long4     v4; /* compiled only when __CL_LONG4__ is defined */
1102 | #endif
1103 | }cl_long4;
1104 | 
1105 | /* cl_long3 is identical in size, alignment and behavior to cl_long4, so it is declared as a plain alias and carries four components of storage. See section 6.1.5 (presumably of the OpenCL C specification -- confirm). */
1106 | typedef  cl_long4  cl_long3;
1107 | 
1108 | typedef union /* Eight cl_long components in one shared storage area, reachable under several member names. */
1109 | {
1110 |     cl_long   CL_ALIGNED(64) s[8]; /* array view of all eight components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1111 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1112 |    __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; }; /* letter names cover only components 0-3 */
1113 |    __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7; }; /* components 0-7 by number */
1114 |    __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; }; /* first and second half, each a cl_long4 */
1115 | #endif
1116 | #if defined( __CL_LONG2__)
1117 |     __cl_long2     v2[4]; /* compiled only when __CL_LONG2__ is defined; __cl_long2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1118 | #endif
1119 | #if defined( __CL_LONG4__)
1120 |     __cl_long4     v4[2]; /* compiled only when __CL_LONG4__ is defined */
1121 | #endif
1122 | #if defined( __CL_LONG8__ )
1123 |     __cl_long8     v8; /* compiled only when __CL_LONG8__ is defined */
1124 | #endif
1125 | }cl_long8;
1126 | 
1127 | typedef union /* Sixteen cl_long components in one shared storage area, reachable under several member names. */
1128 | {
1129 |     cl_long  CL_ALIGNED(128) s[16]; /* array view of all sixteen components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1130 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1131 |    __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; /* x,y,z,w name components 0-3; the __spacer members only occupy positions 4-9 so that sa..sf land on components 10-15 */
1132 |    __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; /* components 0-15 by hexadecimal digit */
1133 |    __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; }; /* first and second half, each a cl_long8 */
1134 | #endif
1135 | #if defined( __CL_LONG2__)
1136 |     __cl_long2     v2[8]; /* compiled only when __CL_LONG2__ is defined; __cl_long2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1137 | #endif
1138 | #if defined( __CL_LONG4__)
1139 |     __cl_long4     v4[4]; /* compiled only when __CL_LONG4__ is defined */
1140 | #endif
1141 | #if defined( __CL_LONG8__ )
1142 |     __cl_long8     v8[2]; /* compiled only when __CL_LONG8__ is defined */
1143 | #endif
1144 | #if defined( __CL_LONG16__ )
1145 |     __cl_long16    v16; /* compiled only when __CL_LONG16__ is defined */
1146 | #endif
1147 | }cl_long16;
1148 | 
1149 | 
1150 | /* ---- cl_ulongn ---- */
1151 | typedef union /* Two cl_ulong components in one shared storage area, reachable under several member names. */
1152 | {
1153 |     cl_ulong  CL_ALIGNED(16) s[2]; /* array view of both components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1154 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1155 |    __CL_ANON_STRUCT__ struct{ cl_ulong  x, y; }; /* components 0-1 by letter */
1156 |    __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1; }; /* components 0-1 by number */
1157 |    __CL_ANON_STRUCT__ struct{ cl_ulong  lo, hi; }; /* first and second component */
1158 | #endif
1159 | #if defined( __CL_ULONG2__)
1160 |     __cl_ulong2     v2; /* compiled only when __CL_ULONG2__ is defined; __cl_ulong2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1161 | #endif
1162 | }cl_ulong2;
1163 | 
1164 | typedef union /* Four cl_ulong components in one shared storage area, reachable under several member names. */
1165 | {
1166 |     cl_ulong  CL_ALIGNED(32) s[4]; /* array view of all four components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1167 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1168 |    __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; }; /* components 0-3 by letter */
1169 |    __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3; }; /* components 0-3 by number */
1170 |    __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; }; /* first and second half, each a cl_ulong2 */
1171 | #endif
1172 | #if defined( __CL_ULONG2__)
1173 |     __cl_ulong2     v2[2]; /* compiled only when __CL_ULONG2__ is defined; __cl_ulong2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1174 | #endif
1175 | #if defined( __CL_ULONG4__)
1176 |     __cl_ulong4     v4; /* compiled only when __CL_ULONG4__ is defined */
1177 | #endif
1178 | }cl_ulong4;
1179 | 
1180 | /* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4, so it is declared as a plain alias and carries four components of storage. See section 6.1.5 (presumably of the OpenCL C specification -- confirm). */
1181 | typedef  cl_ulong4  cl_ulong3;
1182 | 
1183 | typedef union /* Eight cl_ulong components in one shared storage area, reachable under several member names. */
1184 | {
1185 |     cl_ulong   CL_ALIGNED(64) s[8]; /* array view of all eight components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1186 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1187 |    __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; }; /* letter names cover only components 0-3 */
1188 |    __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7; }; /* components 0-7 by number */
1189 |    __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; }; /* first and second half, each a cl_ulong4 */
1190 | #endif
1191 | #if defined( __CL_ULONG2__)
1192 |     __cl_ulong2     v2[4]; /* compiled only when __CL_ULONG2__ is defined; __cl_ulong2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1193 | #endif
1194 | #if defined( __CL_ULONG4__)
1195 |     __cl_ulong4     v4[2]; /* compiled only when __CL_ULONG4__ is defined */
1196 | #endif
1197 | #if defined( __CL_ULONG8__ )
1198 |     __cl_ulong8     v8; /* compiled only when __CL_ULONG8__ is defined */
1199 | #endif
1200 | }cl_ulong8;
1201 | 
1202 | typedef union /* Sixteen cl_ulong components in one shared storage area, reachable under several member names. */
1203 | {
1204 |     cl_ulong  CL_ALIGNED(128) s[16]; /* array view of all sixteen components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1205 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1206 |    __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; /* x,y,z,w name components 0-3; the __spacer members only occupy positions 4-9 so that sa..sf land on components 10-15 */
1207 |    __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; /* components 0-15 by hexadecimal digit */
1208 |    __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; }; /* first and second half, each a cl_ulong8 */
1209 | #endif
1210 | #if defined( __CL_ULONG2__)
1211 |     __cl_ulong2     v2[8]; /* compiled only when __CL_ULONG2__ is defined; __cl_ulong2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1212 | #endif
1213 | #if defined( __CL_ULONG4__)
1214 |     __cl_ulong4     v4[4]; /* compiled only when __CL_ULONG4__ is defined */
1215 | #endif
1216 | #if defined( __CL_ULONG8__ )
1217 |     __cl_ulong8     v8[2]; /* compiled only when __CL_ULONG8__ is defined */
1218 | #endif
1219 | #if defined( __CL_ULONG16__ )
1220 |     __cl_ulong16    v16; /* compiled only when __CL_ULONG16__ is defined */
1221 | #endif
1222 | }cl_ulong16;
1223 | 
1224 | 
1225 | /* --- cl_floatn ---- */
1226 | 
1227 | typedef union /* Two cl_float components in one shared storage area, reachable under several member names. */
1228 | {
1229 |     cl_float  CL_ALIGNED(8) s[2]; /* array view of both components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1230 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1231 |    __CL_ANON_STRUCT__ struct{ cl_float  x, y; }; /* components 0-1 by letter */
1232 |    __CL_ANON_STRUCT__ struct{ cl_float  s0, s1; }; /* components 0-1 by number */
1233 |    __CL_ANON_STRUCT__ struct{ cl_float  lo, hi; }; /* first and second component */
1234 | #endif
1235 | #if defined( __CL_FLOAT2__)
1236 |     __cl_float2     v2; /* compiled only when __CL_FLOAT2__ is defined; __cl_float2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1237 | #endif
1238 | }cl_float2;
1239 | 
1240 | typedef union /* Four cl_float components in one shared storage area, reachable under several member names. */
1241 | {
1242 |     cl_float  CL_ALIGNED(16) s[4]; /* array view of all four components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1243 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1244 |    __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; }; /* components 0-3 by letter */
1245 |    __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3; }; /* components 0-3 by number */
1246 |    __CL_ANON_STRUCT__ struct{ cl_float2  lo, hi; }; /* first and second half, each a cl_float2 */
1247 | #endif
1248 | #if defined( __CL_FLOAT2__)
1249 |     __cl_float2     v2[2]; /* compiled only when __CL_FLOAT2__ is defined; __cl_float2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1250 | #endif
1251 | #if defined( __CL_FLOAT4__)
1252 |     __cl_float4     v4; /* compiled only when __CL_FLOAT4__ is defined */
1253 | #endif
1254 | }cl_float4;
1255 | 
1256 | /* cl_float3 is identical in size, alignment and behavior to cl_float4, so it is declared as a plain alias and carries four components of storage. See section 6.1.5 (presumably of the OpenCL C specification -- confirm). */
1257 | typedef  cl_float4  cl_float3;
1258 | 
1259 | typedef union /* Eight cl_float components in one shared storage area, reachable under several member names. */
1260 | {
1261 |     cl_float   CL_ALIGNED(32) s[8]; /* array view of all eight components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1262 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1263 |    __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; }; /* letter names cover only components 0-3 */
1264 |    __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3, s4, s5, s6, s7; }; /* components 0-7 by number */
1265 |    __CL_ANON_STRUCT__ struct{ cl_float4  lo, hi; }; /* first and second half, each a cl_float4 */
1266 | #endif
1267 | #if defined( __CL_FLOAT2__)
1268 |     __cl_float2     v2[4]; /* compiled only when __CL_FLOAT2__ is defined; __cl_float2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1269 | #endif
1270 | #if defined( __CL_FLOAT4__)
1271 |     __cl_float4     v4[2]; /* compiled only when __CL_FLOAT4__ is defined */
1272 | #endif
1273 | #if defined( __CL_FLOAT8__ )
1274 |     __cl_float8     v8; /* compiled only when __CL_FLOAT8__ is defined */
1275 | #endif
1276 | }cl_float8;
1277 | 
1278 | typedef union /* Sixteen cl_float components in one shared storage area, reachable under several member names. */
1279 | {
1280 |     cl_float  CL_ALIGNED(64) s[16]; /* array view of all sixteen components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1281 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1282 |    __CL_ANON_STRUCT__ struct{ cl_float  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; /* x,y,z,w name components 0-3; the __spacer members only occupy positions 4-9 so that sa..sf land on components 10-15 */
1283 |    __CL_ANON_STRUCT__ struct{ cl_float  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; /* components 0-15 by hexadecimal digit */
1284 |    __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; }; /* first and second half, each a cl_float8 */
1285 | #endif
1286 | #if defined( __CL_FLOAT2__)
1287 |     __cl_float2     v2[8]; /* compiled only when __CL_FLOAT2__ is defined; __cl_float2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1288 | #endif
1289 | #if defined( __CL_FLOAT4__)
1290 |     __cl_float4     v4[4]; /* compiled only when __CL_FLOAT4__ is defined */
1291 | #endif
1292 | #if defined( __CL_FLOAT8__ )
1293 |     __cl_float8     v8[2]; /* compiled only when __CL_FLOAT8__ is defined */
1294 | #endif
1295 | #if defined( __CL_FLOAT16__ )
1296 |     __cl_float16    v16; /* compiled only when __CL_FLOAT16__ is defined */
1297 | #endif
1298 | }cl_float16;
1299 | 
1300 | /* --- cl_doublen ---- */
1301 | 
1302 | typedef union /* Two cl_double components in one shared storage area, reachable under several member names. */
1303 | {
1304 |     cl_double  CL_ALIGNED(16) s[2]; /* array view of both components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1305 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1306 |    __CL_ANON_STRUCT__ struct{ cl_double  x, y; }; /* components 0-1 by letter */
1307 |    __CL_ANON_STRUCT__ struct{ cl_double s0, s1; }; /* components 0-1 by number */
1308 |    __CL_ANON_STRUCT__ struct{ cl_double lo, hi; }; /* first and second component */
1309 | #endif
1310 | #if defined( __CL_DOUBLE2__)
1311 |     __cl_double2     v2; /* compiled only when __CL_DOUBLE2__ is defined; __cl_double2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1312 | #endif
1313 | }cl_double2;
1314 | 
1315 | typedef union /* Four cl_double components in one shared storage area, reachable under several member names. */
1316 | {
1317 |     cl_double  CL_ALIGNED(32) s[4]; /* array view of all four components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1318 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1319 |    __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; }; /* components 0-3 by letter */
1320 |    __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3; }; /* components 0-3 by number */
1321 |    __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; }; /* first and second half, each a cl_double2 */
1322 | #endif
1323 | #if defined( __CL_DOUBLE2__)
1324 |     __cl_double2     v2[2]; /* compiled only when __CL_DOUBLE2__ is defined; __cl_double2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1325 | #endif
1326 | #if defined( __CL_DOUBLE4__)
1327 |     __cl_double4     v4; /* compiled only when __CL_DOUBLE4__ is defined */
1328 | #endif
1329 | }cl_double4;
1330 | 
1331 | /* cl_double3 is identical in size, alignment and behavior to cl_double4, so it is declared as a plain alias and carries four components of storage. See section 6.1.5 (presumably of the OpenCL C specification -- confirm). */
1332 | typedef  cl_double4  cl_double3;
1333 | 
1334 | typedef union /* Eight cl_double components in one shared storage area, reachable under several member names. */
1335 | {
1336 |     cl_double   CL_ALIGNED(64) s[8]; /* array view of all eight components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1337 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1338 |    __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; }; /* letter names cover only components 0-3 */
1339 |    __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7; }; /* components 0-7 by number */
1340 |    __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; }; /* first and second half, each a cl_double4 */
1341 | #endif
1342 | #if defined( __CL_DOUBLE2__)
1343 |     __cl_double2     v2[4]; /* compiled only when __CL_DOUBLE2__ is defined; __cl_double2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1344 | #endif
1345 | #if defined( __CL_DOUBLE4__)
1346 |     __cl_double4     v4[2]; /* compiled only when __CL_DOUBLE4__ is defined */
1347 | #endif
1348 | #if defined( __CL_DOUBLE8__ )
1349 |     __cl_double8     v8; /* compiled only when __CL_DOUBLE8__ is defined */
1350 | #endif
1351 | }cl_double8;
1352 | 
1353 | typedef union /* Sixteen cl_double components in one shared storage area, reachable under several member names. */
1354 | {
1355 |     cl_double  CL_ALIGNED(128) s[16]; /* array view of all sixteen components; CL_ALIGNED is defined outside this excerpt (presumably an alignment request -- confirm) */
1356 | #if __CL_HAS_ANON_STRUCT__ /* named views are compiled only when this macro is nonzero */
1357 |    __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; /* x,y,z,w name components 0-3; the __spacer members only occupy positions 4-9 so that sa..sf land on components 10-15 */
1358 |    __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; /* components 0-15 by hexadecimal digit */
1359 |    __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; }; /* first and second half, each a cl_double8 */
1360 | #endif
1361 | #if defined( __CL_DOUBLE2__)
1362 |     __cl_double2     v2[8]; /* compiled only when __CL_DOUBLE2__ is defined; __cl_double2 is declared elsewhere (presumably a compiler-native vector type -- confirm) */
1363 | #endif
1364 | #if defined( __CL_DOUBLE4__)
1365 |     __cl_double4     v4[4]; /* compiled only when __CL_DOUBLE4__ is defined */
1366 | #endif
1367 | #if defined( __CL_DOUBLE8__ )
1368 |     __cl_double8     v8[2]; /* compiled only when __CL_DOUBLE8__ is defined */
1369 | #endif
1370 | #if defined( __CL_DOUBLE16__ )
1371 |     __cl_double16    v16; /* compiled only when __CL_DOUBLE16__ is defined */
1372 | #endif
1373 | }cl_double16;
1374 | 
1375 | /* Macro to facilitate debugging of OpenCL C source embedded as a C string literal.
1376 |  * Usage:
1377 |  *   Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.
1378 |  *   The first line ends with:   CL_PROGRAM_STRING_DEBUG_INFO "\
1379 |  *   Each line thereafter of OpenCL C source must end with: \n\
1380 |  *   The last line ends in ";
1381 |  *
1382 |  *   Example:
1383 |  *
1384 |  *   const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
1385 |  *   kernel void foo( int a, global float * b )      \n\
1386 |  *   {                                               \n\
1387 |  *      // my comment                                \n\
1388 |  *      b[ get_global_id(0)] = a;                    \n\
1389 |  *   }                                               \n\
1390 |  *   ";
1391 |  *
1392 |  * This should correctly set up the line, (column) and file information for your source
1393 |  * string so you can do source level debugging.
1394 |  */
1395 | #define  __CL_STRINGIFY( _x )               # _x /* turns the spelling of its argument into a string literal */
1396 | #define  _CL_STRINGIFY( _x )                __CL_STRINGIFY( _x ) /* extra level of indirection so that a macro argument such as __LINE__ is expanded before it is stringified */
1397 | #define  CL_PROGRAM_STRING_DEBUG_INFO       "#line "  _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" /* adjacent literals concatenate into one string: a #line directive naming the current line and file, followed by an empty line */
1398 | 
1399 | #ifdef __cplusplus
1400 | }
1401 | #endif
1402 | 
1403 | #if defined(_WIN32) && defined(_MSC_VER) && __CL_HAS_ANON_STRUCT__
1404 |     #pragma warning( pop )
1405 | #endif
1406 | 
1407 | #endif  /* __CL_PLATFORM_H  */
1408 | 


--------------------------------------------------------------------------------