├── .gitignore ├── OpenCL Tutorials.sln ├── README.md ├── Tutorial 1 ├── Tutorial 1.cpp ├── Tutorial 1.vcxproj ├── Tutorial 1.vcxproj.filters └── kernels │ └── my_kernels.cl ├── Tutorial 2 ├── Tutorial 2.cpp ├── Tutorial 2.vcxproj ├── Tutorial 2.vcxproj.filters ├── kernels │ └── my_kernels.cl ├── test.ppm └── test_large.ppm ├── Tutorial 3 ├── Tutorial 3.cpp ├── Tutorial 3.vcxproj ├── Tutorial 3.vcxproj.filters └── kernels │ └── my_kernels.cl ├── Tutorial 4 ├── Tutorial 4.cpp ├── Tutorial 4.vcxproj └── Tutorial 4.vcxproj.filters ├── images ├── test.ppm └── test_large.ppm └── include ├── CImg.h ├── CL └── cl2.hpp └── Utils.h /.gitignore: -------------------------------------------------------------------------------- 1 | #CMake 2 | build 3 | 4 | #Visual Studio 5 | [Dd]ebug/ 6 | [Dd]ebugPublic/ 7 | [Rr]elease/ 8 | [Rr]eleases/ 9 | x64/* 10 | x86/* 11 | bld/ 12 | [Oo]bj/ 13 | .vs 14 | desktop.ini 15 | 16 | *.opensdf 17 | *.sdf 18 | *.suo 19 | *.user 20 | 21 | !x64/glut32.dll 22 | !x86/glut32.dll 23 | 24 | 25 | -------------------------------------------------------------------------------- /OpenCL Tutorials.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.29613.14 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Tutorial 1", "Tutorial 1\Tutorial 1.vcxproj", "{E99F5DFC-113A-4BC3-8253-90A6AC0C9A9D}" 7 | EndProject 8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Tutorial 2", "Tutorial 2\Tutorial 2.vcxproj", "{9167FEE5-0E64-4275-B2B2-A3F87F3A5C8F}" 9 | EndProject 10 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Tutorial 3", "Tutorial 3\Tutorial 3.vcxproj", "{8CB4B79A-8170-44DE-88DC-C73EACB44CB2}" 11 | EndProject 12 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Tutorial 4", "Tutorial 4\Tutorial 4.vcxproj", 
"{E95D4B5A-1F3F-4A31-931F-7A99CE219124}" 13 | EndProject 14 | Global 15 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 16 | Debug|x64 = Debug|x64 17 | Debug|x86 = Debug|x86 18 | Release|x64 = Release|x64 19 | Release|x86 = Release|x86 20 | EndGlobalSection 21 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 22 | {E99F5DFC-113A-4BC3-8253-90A6AC0C9A9D}.Debug|x64.ActiveCfg = Debug|x64 23 | {E99F5DFC-113A-4BC3-8253-90A6AC0C9A9D}.Debug|x64.Build.0 = Debug|x64 24 | {E99F5DFC-113A-4BC3-8253-90A6AC0C9A9D}.Debug|x86.ActiveCfg = Debug|Win32 25 | {E99F5DFC-113A-4BC3-8253-90A6AC0C9A9D}.Debug|x86.Build.0 = Debug|Win32 26 | {E99F5DFC-113A-4BC3-8253-90A6AC0C9A9D}.Release|x64.ActiveCfg = Release|x64 27 | {E99F5DFC-113A-4BC3-8253-90A6AC0C9A9D}.Release|x64.Build.0 = Release|x64 28 | {E99F5DFC-113A-4BC3-8253-90A6AC0C9A9D}.Release|x86.ActiveCfg = Release|Win32 29 | {E99F5DFC-113A-4BC3-8253-90A6AC0C9A9D}.Release|x86.Build.0 = Release|Win32 30 | {9167FEE5-0E64-4275-B2B2-A3F87F3A5C8F}.Debug|x64.ActiveCfg = Debug|x64 31 | {9167FEE5-0E64-4275-B2B2-A3F87F3A5C8F}.Debug|x64.Build.0 = Debug|x64 32 | {9167FEE5-0E64-4275-B2B2-A3F87F3A5C8F}.Debug|x86.ActiveCfg = Debug|Win32 33 | {9167FEE5-0E64-4275-B2B2-A3F87F3A5C8F}.Debug|x86.Build.0 = Debug|Win32 34 | {9167FEE5-0E64-4275-B2B2-A3F87F3A5C8F}.Release|x64.ActiveCfg = Release|x64 35 | {9167FEE5-0E64-4275-B2B2-A3F87F3A5C8F}.Release|x64.Build.0 = Release|x64 36 | {9167FEE5-0E64-4275-B2B2-A3F87F3A5C8F}.Release|x86.ActiveCfg = Release|Win32 37 | {9167FEE5-0E64-4275-B2B2-A3F87F3A5C8F}.Release|x86.Build.0 = Release|Win32 38 | {8CB4B79A-8170-44DE-88DC-C73EACB44CB2}.Debug|x64.ActiveCfg = Debug|x64 39 | {8CB4B79A-8170-44DE-88DC-C73EACB44CB2}.Debug|x64.Build.0 = Debug|x64 40 | {8CB4B79A-8170-44DE-88DC-C73EACB44CB2}.Debug|x86.ActiveCfg = Debug|Win32 41 | {8CB4B79A-8170-44DE-88DC-C73EACB44CB2}.Debug|x86.Build.0 = Debug|Win32 42 | {8CB4B79A-8170-44DE-88DC-C73EACB44CB2}.Release|x64.ActiveCfg = Release|x64 43 | 
{8CB4B79A-8170-44DE-88DC-C73EACB44CB2}.Release|x64.Build.0 = Release|x64 44 | {8CB4B79A-8170-44DE-88DC-C73EACB44CB2}.Release|x86.ActiveCfg = Release|Win32 45 | {8CB4B79A-8170-44DE-88DC-C73EACB44CB2}.Release|x86.Build.0 = Release|Win32 46 | {E95D4B5A-1F3F-4A31-931F-7A99CE219124}.Debug|x64.ActiveCfg = Debug|x64 47 | {E95D4B5A-1F3F-4A31-931F-7A99CE219124}.Debug|x64.Build.0 = Debug|x64 48 | {E95D4B5A-1F3F-4A31-931F-7A99CE219124}.Debug|x86.ActiveCfg = Debug|Win32 49 | {E95D4B5A-1F3F-4A31-931F-7A99CE219124}.Debug|x86.Build.0 = Debug|Win32 50 | {E95D4B5A-1F3F-4A31-931F-7A99CE219124}.Release|x64.ActiveCfg = Release|x64 51 | {E95D4B5A-1F3F-4A31-931F-7A99CE219124}.Release|x64.Build.0 = Release|x64 52 | {E95D4B5A-1F3F-4A31-931F-7A99CE219124}.Release|x86.ActiveCfg = Release|Win32 53 | {E95D4B5A-1F3F-4A31-931F-7A99CE219124}.Release|x86.Build.0 = Release|Win32 54 | EndGlobalSection 55 | GlobalSection(SolutionProperties) = preSolution 56 | HideSolutionNode = FALSE 57 | EndGlobalSection 58 | GlobalSection(ExtensibilityGlobals) = postSolution 59 | SolutionGuid = {609364E1-9537-43CB-85C9-0D79409A1E2D} 60 | EndGlobalSection 61 | EndGlobal 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenCL Tutorials 2 | 3 | ## Requirements 4 | 5 | The presented tutorials were developed and tested on Windows 10, Visual Studio 2019 and [Intel SDK for OpenCL](https://software.intel.com/en-us/intel-opencl) so that they can be run on Windows PCs in the computing labs. Tutorial 4 also depends on the Boost library. 
If you would like to develop OpenCL programs on your own computer, you have two options: 6 | - replicate the [Windows setup](#windows-setup) from the computing labs; 7 | - use the [multi_os](https://github.com/gcielniak/OpenCL-Tutorials/tree/multi_os) branch, which should allow for running the tutorials on different operating systems, programming environments and OpenCL SDKs. There is limited documentation for this option, however, so you should only choose it if you are comfortable with installing custom libraries on your specific OS. 8 | 9 | ## Windows Setup 10 | - OS + IDE: Windows 10, Visual Studio 2019 11 | - OpenCL SDK: the SDK enables you to develop and compile the OpenCL code. In our case, we use [Intel SDK for OpenCL Applications](https://software.intel.com/en-us/intel-opencl). You are not tied to that choice, however, and can use SDKs by NVIDIA or AMD - just remember to update the project's include and library paths accordingly. Each SDK comes with a range of additional tools which make development of OpenCL programs easier. 12 | - OpenCL runtime: the runtime drivers are necessary to run the OpenCL code on your hardware. Both NVIDIA and AMD GPUs include the OpenCL runtime in their graphics card drivers. For CPUs, you will need to install a dedicated driver from [Intel](https://software.intel.com/en-us/articles/opencl-drivers), or the APP SDK for older AMD processors. Unfortunately, it seems that AMD has dropped OpenCL support for newer CPU models. You can check the existing OpenCL support on your PC using [GPU Caps Viewer](http://www.ozone3d.net/gpu_caps_viewer/). 13 | - Boost library: install a recent version of the [Boost library Windows binaries](https://sourceforge.net/projects/boost/files/boost-binaries/) (e.g. [boost_1_72_0](https://sourceforge.net/projects/boost/files/boost-binaries/1.72.0/boost_1_72_0-msvc-14.2-64.exe/download) for VS2019). Then, set two environment variables from the command line that specify the locations of the include and lib Boost directories. 
For example, with boost_1_72_0 the commands would look as follows: `setx BOOST_INCLUDEDIR "C:\local\boost_1_72_0"` and `setx BOOST_LIBRARYDIR "C:\local\boost_1_72_0\lib64-msvc-14.2"`. 14 | - A useful reference if you are struggling to get going: [OpenCL on Windows](http://streamcomputing.eu/blog/2015-03-16/how-to-install-opencl-on-windows/). 15 | -------------------------------------------------------------------------------- /Tutorial 1/Tutorial 1.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "Utils.h" 5 | 6 | void print_help() { 7 | std::cerr << "Application usage:" << std::endl; 8 | 9 | std::cerr << " -p : select platform " << std::endl; 10 | std::cerr << " -d : select device" << std::endl; 11 | std::cerr << " -l : list all platforms and devices" << std::endl; 12 | std::cerr << " -h : print this message" << std::endl; 13 | } 14 | 15 | int main(int argc, char **argv) { 16 | //Part 1 - handle command line options such as device selection, verbosity, etc. 
17 | int platform_id = 0; 18 | int device_id = 0; 19 | 20 | for (int i = 1; i < argc; i++) { 21 | if ((strcmp(argv[i], "-p") == 0) && (i < (argc - 1))) { platform_id = atoi(argv[++i]); } 22 | else if ((strcmp(argv[i], "-d") == 0) && (i < (argc - 1))) { device_id = atoi(argv[++i]); } 23 | else if (strcmp(argv[i], "-l") == 0) { std::cout << ListPlatformsDevices() << std::endl; } 24 | else if (strcmp(argv[i], "-h") == 0) { print_help(); return 0; } 25 | } 26 | 27 | //detect any potential exceptions 28 | try { 29 | //Part 2 - host operations 30 | //2.1 Select computing devices 31 | cl::Context context = GetContext(platform_id, device_id); 32 | 33 | //display the selected device 34 | std::cout << "Runinng on " << GetPlatformName(platform_id) << ", " << GetDeviceName(platform_id, device_id) << std::endl; 35 | 36 | //create a queue to which we will push commands for the device 37 | cl::CommandQueue queue(context); 38 | 39 | //2.2 Load & build the device code 40 | cl::Program::Sources sources; 41 | 42 | AddSources(sources, "kernels/my_kernels.cl"); 43 | 44 | cl::Program program(context, sources); 45 | 46 | //build and debug the kernel code 47 | try { 48 | program.build(); 49 | } 50 | catch (const cl::Error& err) { 51 | std::cout << "Build Status: " << program.getBuildInfo(context.getInfo()[0]) << std::endl; 52 | std::cout << "Build Options:\t" << program.getBuildInfo(context.getInfo()[0]) << std::endl; 53 | std::cout << "Build Log:\t " << program.getBuildInfo(context.getInfo()[0]) << std::endl; 54 | throw err; 55 | } 56 | 57 | //Part 3 - memory allocation 58 | //host - input 59 | std::vector A = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; //C++11 allows this type of initialisation 60 | std::vector B = { 0, 1, 2, 0, 1, 2, 0, 1, 2, 0 }; 61 | 62 | size_t vector_elements = A.size();//number of elements 63 | size_t vector_size = A.size()*sizeof(int);//size in bytes 64 | 65 | //host - output 66 | std::vector C(vector_elements); 67 | 68 | //device - buffers 69 | cl::Buffer 
buffer_A(context, CL_MEM_READ_WRITE, vector_size); 70 | cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, vector_size); 71 | cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, vector_size); 72 | 73 | //Part 4 - device operations 74 | 75 | //4.1 Copy arrays A and B to device memory 76 | queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, vector_size, &A[0]); 77 | queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, vector_size, &B[0]); 78 | 79 | //4.2 Setup and execute the kernel (i.e. device code) 80 | cl::Kernel kernel_add = cl::Kernel(program, "add"); 81 | kernel_add.setArg(0, buffer_A); 82 | kernel_add.setArg(1, buffer_B); 83 | kernel_add.setArg(2, buffer_C); 84 | 85 | queue.enqueueNDRangeKernel(kernel_add, cl::NullRange, cl::NDRange(vector_elements), cl::NullRange); 86 | 87 | //4.3 Copy the result from device to host 88 | queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, vector_size, &C[0]); 89 | 90 | std::cout << "A = " << A << std::endl; 91 | std::cout << "B = " << B << std::endl; 92 | std::cout << "C = " << C << std::endl; 93 | } 94 | catch (cl::Error err) { 95 | std::cerr << "ERROR: " << err.what() << ", " << getErrorString(err.err()) << std::endl; 96 | } 97 | 98 | return 0; 99 | } -------------------------------------------------------------------------------- /Tutorial 1/Tutorial 1.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Debug 10 | x64 11 | 12 | 13 | Release 14 | Win32 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 23 | Debug 24 | x64 25 | 26 | 27 | Release 28 | x64 29 | 30 | 31 | 32 | {E99F5DFC-113A-4BC3-8253-90A6AC0C9A9D} 33 | Tutorial 1 34 | 10.0 35 | 36 | 37 | 38 | Application 39 | true 40 | v142 41 | Unicode 42 | 43 | 44 | Application 45 | false 46 | v142 47 | Unicode 48 | true 49 | 50 | 51 | Application 52 | true 53 | v142 54 | Unicode 55 | 56 | 57 | Application 58 | false 59 | v142 60 | Unicode 61 | true 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 
| 75 | 76 | 77 | 78 | false 79 | 80 | 81 | true 82 | 83 | 84 | false 85 | $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ 86 | 87 | 88 | true 89 | $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ 90 | 91 | 92 | 93 | 0 94 | 95 | 96 | $(INTELOCLSDKROOT)include;%(AdditionalIncludeDirectories) 97 | Win32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 98 | Level3 99 | ProgramDatabase 100 | 101 | 102 | 103 | $(INTELOCLSDKROOT)lib\x86;%(AdditionalLibraryDirectories) 104 | OpenCL.lib;%(AdditionalDependencies) 105 | true 106 | 107 | 108 | If exist "*.cl" copy "*.cl" "$(OutDir)\" 109 | 110 | 111 | 112 | 113 | 0 114 | 115 | 116 | $(INTELOCLSDKROOT)include;%(AdditionalIncludeDirectories) 117 | Win32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 118 | Level3 119 | ProgramDatabase 120 | 121 | 122 | 123 | $(INTELOCLSDKROOT)lib\x86;%(AdditionalLibraryDirectories) 124 | OpenCL.lib;%(AdditionalDependencies) 125 | true 126 | 127 | 128 | If exist "*.cl" copy "*.cl" "$(OutDir)\" 129 | 130 | 131 | 132 | 133 | 0 134 | 135 | 136 | $(INTELOCLSDKROOT)include;..\include;%(AdditionalIncludeDirectories) 137 | __x86_64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 138 | MaxSpeed 139 | false 140 | Default 141 | MultiThreadedDLL 142 | Level3 143 | ProgramDatabase 144 | 145 | 146 | 147 | $(INTELOCLSDKROOT)lib\x64;%(AdditionalLibraryDirectories) 148 | OpenCL.lib;%(AdditionalDependencies) 149 | true 150 | true 151 | true 152 | Console 153 | 154 | 155 | If exist "*.cl" copy "*.cl" "$(OutDir)\" 156 | 157 | 158 | 159 | 160 | 0 161 | 162 | 163 | $(INTELOCLSDKROOT)include;..\include;%(AdditionalIncludeDirectories) 164 | __x86_64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 165 | Disabled 166 | false 167 | EnableFastChecks 168 | MultiThreadedDebugDLL 169 | Level3 170 | ProgramDatabase 171 | 172 | 173 | 174 | $(INTELOCLSDKROOT)lib\x64;%(AdditionalLibraryDirectories) 175 | OpenCL.lib;%(AdditionalDependencies) 176 | true 177 | Console 178 | 179 | 180 | xcopy /s /i /y "kernels" "$(OutDir)kernels" 181 
| 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | -------------------------------------------------------------------------------- /Tutorial 1/Tutorial 1.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 8 | {11e52fc2-3f25-4b94-920c-911c1673bf6c} 9 | 10 | 11 | {5586229f-c9c4-4574-89bd-30ba09061ac6} 12 | 13 | 14 | 15 | 16 | kernels 17 | 18 | 19 | 20 | 21 | include 22 | 23 | 24 | -------------------------------------------------------------------------------- /Tutorial 1/kernels/my_kernels.cl: -------------------------------------------------------------------------------- 1 | //a simple OpenCL kernel which adds two vectors A and B together into a third vector C 2 | kernel void add(global const int* A, global const int* B, global int* C) { 3 | int id = get_global_id(0); 4 | C[id] = A[id] + B[id]; 5 | } 6 | 7 | //a simple smoothing kernel averaging values in a local window (radius 1); neighbour indices are clamped to [0, global_size-1] so the first and last work-items never read outside A (assumes one work-item per element of A) 8 | kernel void avg_filter(global const int* A, global int* B) { 9 | int id = get_global_id(0); 10 | B[id] = (A[max(id - 1, 0)] + A[id] + A[min(id + 1, (int)get_global_size(0) - 1)])/3; 11 | } 12 | 13 | //a simple 2D kernel 14 | kernel void add2D(global const int* A, global const int* B, global int* C) { 15 | int x = get_global_id(0); 16 | int y = get_global_id(1); 17 | int width = get_global_size(0); 18 | int height = get_global_size(1); 19 | int id = x + y*width; 20 | 21 | printf("id = %d x = %d y = %d w = %d h = %d\n", id, x, y, width, height); 22 | 23 | C[id]= A[id]+ B[id]; 24 | } 25 | -------------------------------------------------------------------------------- /Tutorial 2/Tutorial 2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "Utils.h" 5 | #include "CImg.h" 6 | 7 | using namespace cimg_library; 8 | 9 | void print_help() { 10 | std::cerr << "Application usage:" << std::endl; 11 | 12 | std::cerr << " -p : select platform " 
<< std::endl; 13 | std::cerr << " -d : select device" << std::endl; 14 | std::cerr << " -l : list all platforms and devices" << std::endl; 15 | std::cerr << " -f : input image file (default: test.ppm)" << std::endl; 16 | std::cerr << " -h : print this message" << std::endl; 17 | } 18 | 19 | int main(int argc, char **argv) { 20 | //Part 1 - handle command line options such as device selection, verbosity, etc. 21 | int platform_id = 0; 22 | int device_id = 0; 23 | string image_filename = "test.ppm"; 24 | 25 | for (int i = 1; i < argc; i++) { 26 | if ((strcmp(argv[i], "-p") == 0) && (i < (argc - 1))) { platform_id = atoi(argv[++i]); } 27 | else if ((strcmp(argv[i], "-d") == 0) && (i < (argc - 1))) { device_id = atoi(argv[++i]); } 28 | else if (strcmp(argv[i], "-l") == 0) { std::cout << ListPlatformsDevices() << std::endl; } 29 | else if ((strcmp(argv[i], "-f") == 0) && (i < (argc - 1))) { image_filename = argv[++i]; } 30 | else if (strcmp(argv[i], "-h") == 0) { print_help(); return 0; } 31 | } 32 | 33 | cimg::exception_mode(0); 34 | 35 | //detect any potential exceptions 36 | try { 37 | CImg image_input(image_filename.c_str()); 38 | CImgDisplay disp_input(image_input,"input"); 39 | 40 | //a 3x3 convolution mask implementing an averaging filter 41 | std::vector convolution_mask = { 1.f / 9, 1.f / 9, 1.f / 9, 42 | 1.f / 9, 1.f / 9, 1.f / 9, 43 | 1.f / 9, 1.f / 9, 1.f / 9 }; 44 | 45 | //Part 3 - host operations 46 | //3.1 Select computing devices 47 | cl::Context context = GetContext(platform_id, device_id); 48 | 49 | //display the selected device (platform name, then device name) 50 | std::cout << "Running on " << GetPlatformName(platform_id) << ", " << GetDeviceName(platform_id, device_id) << std::endl; 51 | 52 | //create a queue to which we will push commands for the device 53 | cl::CommandQueue queue(context); 54 | 55 | //3.2 Load & build the device code 56 | cl::Program::Sources sources; 57 | 58 | AddSources(sources, "kernels/my_kernels.cl"); 59 | 60 | cl::Program program(context, sources); 61 | 62 | 
//build and debug the kernel code 63 | try { 64 | program.build(); 65 | } 66 | catch (const cl::Error& err) { 67 | std::cout << "Build Status: " << program.getBuildInfo(context.getInfo()[0]) << std::endl; 68 | std::cout << "Build Options:\t" << program.getBuildInfo(context.getInfo()[0]) << std::endl; 69 | std::cout << "Build Log:\t " << program.getBuildInfo(context.getInfo()[0]) << std::endl; 70 | throw err; 71 | } 72 | 73 | //Part 4 - device operations 74 | 75 | //device - buffers 76 | cl::Buffer dev_image_input(context, CL_MEM_READ_ONLY, image_input.size()); 77 | cl::Buffer dev_image_output(context, CL_MEM_READ_WRITE, image_input.size()); //should be the same as input image 78 | // cl::Buffer dev_convolution_mask(context, CL_MEM_READ_ONLY, convolution_mask.size()*sizeof(float)); 79 | 80 | //4.1 Copy images to device memory 81 | queue.enqueueWriteBuffer(dev_image_input, CL_TRUE, 0, image_input.size(), &image_input.data()[0]); 82 | // queue.enqueueWriteBuffer(dev_convolution_mask, CL_TRUE, 0, convolution_mask.size()*sizeof(float), &convolution_mask[0]); 83 | 84 | //4.2 Setup and execute the kernel (i.e. 
device code) 85 | cl::Kernel kernel = cl::Kernel(program, "identity"); 86 | kernel.setArg(0, dev_image_input); 87 | kernel.setArg(1, dev_image_output); 88 | // kernel.setArg(2, dev_convolution_mask); 89 | 90 | queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(image_input.size()), cl::NullRange); 91 | 92 | vector output_buffer(image_input.size()); 93 | //4.3 Copy the result from device to host 94 | queue.enqueueReadBuffer(dev_image_output, CL_TRUE, 0, output_buffer.size(), &output_buffer.data()[0]); 95 | 96 | CImg output_image(output_buffer.data(), image_input.width(), image_input.height(), image_input.depth(), image_input.spectrum()); 97 | CImgDisplay disp_output(output_image,"output"); 98 | 99 | while (!disp_input.is_closed() && !disp_output.is_closed() 100 | && !disp_input.is_keyESC() && !disp_output.is_keyESC()) { 101 | disp_input.wait(1); 102 | disp_output.wait(1); 103 | } 104 | 105 | } 106 | catch (const cl::Error& err) { 107 | std::cerr << "ERROR: " << err.what() << ", " << getErrorString(err.err()) << std::endl; 108 | } 109 | catch (CImgException& err) { 110 | std::cerr << "ERROR: " << err.what() << std::endl; 111 | } 112 | 113 | return 0; 114 | } 115 | -------------------------------------------------------------------------------- /Tutorial 2/Tutorial 2.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Debug 10 | x64 11 | 12 | 13 | Release 14 | Win32 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 23 | Debug 24 | x64 25 | 26 | 27 | Release 28 | x64 29 | 30 | 31 | 32 | {9167FEE5-0E64-4275-B2B2-A3F87F3A5C8F} 33 | Tutorial 1 34 | 10.0 35 | 36 | 37 | 38 | Application 39 | true 40 | v142 41 | Unicode 42 | 43 | 44 | Application 45 | false 46 | v142 47 | Unicode 48 | true 49 | 50 | 51 | Application 52 | true 53 | v142 54 | Unicode 55 | 56 | 57 | Application 58 | false 59 | v142 60 | Unicode 61 | true 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 
74 | 75 | 76 | 77 | 78 | false 79 | 80 | 81 | true 82 | 83 | 84 | false 85 | $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ 86 | 87 | 88 | true 89 | $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ 90 | 91 | 92 | 93 | 0 94 | 95 | 96 | $(INTELOCLSDKROOT)include;%(AdditionalIncludeDirectories) 97 | Win32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 98 | Level3 99 | ProgramDatabase 100 | 101 | 102 | 103 | $(INTELOCLSDKROOT)lib\x86;%(AdditionalLibraryDirectories) 104 | OpenCL.lib;%(AdditionalDependencies) 105 | true 106 | 107 | 108 | If exist "*.cl" copy "*.cl" "$(OutDir)\" 109 | 110 | 111 | 112 | 113 | 0 114 | 115 | 116 | $(INTELOCLSDKROOT)include;.\Graphics\include\win32;.\Graphics\lodepng;%(AdditionalIncludeDirectories) 117 | Win32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 118 | Level3 119 | ProgramDatabase 120 | 121 | 122 | 123 | $(INTELOCLSDKROOT)lib\x86;.\Graphics\lib\win32\glut;%(AdditionalLibraryDirectories) 124 | OpenCL.lib;glut32.lib;%(AdditionalDependencies) 125 | true 126 | 127 | 128 | If exist "*.cl" copy "*.cl" "$(OutDir)\" 129 | 130 | 131 | 132 | 133 | 0 134 | 135 | 136 | $(INTELOCLSDKROOT)include;..\include;%(AdditionalIncludeDirectories) 137 | __x86_64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 138 | MaxSpeed 139 | false 140 | Default 141 | MultiThreadedDLL 142 | Level3 143 | ProgramDatabase 144 | 145 | 146 | 147 | $(INTELOCLSDKROOT)lib\x64;%(AdditionalLibraryDirectories) 148 | OpenCL.lib;%(AdditionalDependencies) 149 | true 150 | true 151 | true 152 | Console 153 | 154 | 155 | xcopy /y "..\images\*" "$(ProjectDir)" 156 | 157 | 158 | 159 | 160 | 0 161 | 162 | 163 | $(INTELOCLSDKROOT)include;..\include;%(AdditionalIncludeDirectories) 164 | __x86_64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 165 | Disabled 166 | false 167 | EnableFastChecks 168 | MultiThreadedDebugDLL 169 | Level3 170 | ProgramDatabase 171 | 172 | 173 | 174 | $(INTELOCLSDKROOT)lib\x64;%(AdditionalLibraryDirectories) 175 | OpenCL.lib;%(AdditionalDependencies) 176 | 
true 177 | Console 178 | 179 | 180 | xcopy /y "..\images\*" "$(ProjectDir)" 181 | xcopy /s /i /y "kernels" "$(OutDir)kernels" 182 | xcopy /y "..\images\*" "$(OutDir)" 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | -------------------------------------------------------------------------------- /Tutorial 2/Tutorial 2.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 8 | {bddc8ef0-f6c2-4509-accd-01b89c7b41b1} 9 | 10 | 11 | {533d906d-5db8-4839-8b76-d22542cb0f52} 12 | 13 | 14 | 15 | 16 | kernels 17 | 18 | 19 | 20 | 21 | include 22 | 23 | 24 | include 25 | 26 | 27 | -------------------------------------------------------------------------------- /Tutorial 2/kernels/my_kernels.cl: -------------------------------------------------------------------------------- 1 | //a simple OpenCL kernel which copies all pixels from A to B 2 | kernel void identity(global const uchar* A, global uchar* B) { 3 | int id = get_global_id(0); 4 | B[id] = A[id]; 5 | } 6 | 7 | kernel void filter_r(global const uchar* A, global uchar* B) { 8 | int id = get_global_id(0); 9 | int image_size = get_global_size(0)/3; //each image consists of 3 colour channels 10 | int colour_channel = id / image_size; // 0 - red, 1 - green, 2 - blue 11 | 12 | //this is just a copy operation, modify to filter out the individual colour channels 13 | B[id] = A[id]; 14 | } 15 | 16 | //simple ND identity kernel 17 | kernel void identityND(global const uchar* A, global uchar* B) { 18 | int width = get_global_size(0); //image width in pixels 19 | int height = get_global_size(1); //image height in pixels 20 | int image_size = width*height; //image size in pixels 21 | int channels = get_global_size(2); //number of colour channels: 3 for RGB 22 | 23 | int x = get_global_id(0); //current x coord. 24 | int y = get_global_id(1); //current y coord. 
25 | int c = get_global_id(2); //current colour channel 26 | 27 | int id = x + y*width + c*image_size; //global id in 1D space 28 | 29 | B[id] = A[id]; 30 | } 31 | 32 | //2D averaging filter 33 | kernel void avg_filterND(global const uchar* A, global uchar* B) { 34 | int width = get_global_size(0); //image width in pixels 35 | int height = get_global_size(1); //image height in pixels 36 | int image_size = width*height; //image size in pixels 37 | int channels = get_global_size(2); //number of colour channels: 3 for RGB 38 | 39 | int x = get_global_id(0); //current x coord. 40 | int y = get_global_id(1); //current y coord. 41 | int c = get_global_id(2); //current colour channel 42 | 43 | int id = x + y*width + c*image_size; //global id in 1D space 44 | 45 | uint result = 0; 46 | 47 | //simple boundary handling - just copy the original pixel 48 | if ((x == 0) || (x == width-1) || (y == 0) || (y == height-1)) { 49 | result = A[id]; 50 | } else { 51 | for (int i = (x-1); i <= (x+1); i++) 52 | for (int j = (y-1); j <= (y+1); j++) 53 | result += A[i + j*width + c*image_size]; 54 | 55 | result /= 9; 56 | } 57 | 58 | B[id] = (uchar)result; 59 | } 60 | 61 | //2D 3x3 convolution kernel 62 | kernel void convolutionND(global const uchar* A, global uchar* B, constant float* mask) { 63 | int width = get_global_size(0); //image width in pixels 64 | int height = get_global_size(1); //image height in pixels 65 | int image_size = width*height; //image size in pixels 66 | int channels = get_global_size(2); //number of colour channels: 3 for RGB 67 | 68 | int x = get_global_id(0); //current x coord. 69 | int y = get_global_id(1); //current y coord. 
70 | int c = get_global_id(2); //current colour channel 71 | 72 | int id = x + y*width + c*image_size; //global id in 1D space 73 | 74 | float result = 0; 75 | 76 | //simple boundary handling - just copy the original pixel 77 | if ((x == 0) || (x == width-1) || (y == 0) || (y == height-1)) { 78 | result = A[id]; 79 | } else { 80 | for (int i = (x-1); i <= (x+1); i++) 81 | for (int j = (y-1); j <= (y+1); j++) 82 | result += A[i + j*width + c*image_size]*mask[(i-(x-1)) + (j-(y-1))*3]; //row-major index into the 3x3 mask: column offset + 3*row offset, covering all 9 coefficients 83 | } 84 | 85 | B[id] = (uchar)result; 86 | } -------------------------------------------------------------------------------- /Tutorial 2/test.ppm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcielniak/OpenCL-Tutorials/b7fb2dfcc9afea1fc2f24df90db01fd60b6670a8/Tutorial 2/test.ppm -------------------------------------------------------------------------------- /Tutorial 2/test_large.ppm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcielniak/OpenCL-Tutorials/b7fb2dfcc9afea1fc2f24df90db01fd60b6670a8/Tutorial 2/test_large.ppm -------------------------------------------------------------------------------- /Tutorial 3/Tutorial 3.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "Utils.h" 5 | 6 | void print_help() { 7 | std::cerr << "Application usage:" << std::endl; 8 | 9 | std::cerr << " -p : select platform " << std::endl; 10 | std::cerr << " -d : select device" << std::endl; 11 | std::cerr << " -l : list all platforms and devices" << std::endl; 12 | std::cerr << " -h : print this message" << std::endl; 13 | } 14 | 15 | int main(int argc, char **argv) { 16 | //Part 1 - handle command line options such as device selection, verbosity, etc. 
17 | int platform_id = 0; 18 | int device_id = 0; 19 | 20 | for (int i = 1; i < argc; i++) { 21 | if ((strcmp(argv[i], "-p") == 0) && (i < (argc - 1))) { platform_id = atoi(argv[++i]); } 22 | else if ((strcmp(argv[i], "-d") == 0) && (i < (argc - 1))) { device_id = atoi(argv[++i]); } 23 | else if (strcmp(argv[i], "-l") == 0) { std::cout << ListPlatformsDevices() << std::endl; } 24 | else if (strcmp(argv[i], "-h") == 0) { print_help(); return 0;} 25 | } 26 | 27 | //detect any potential exceptions 28 | try { 29 | //Part 2 - host operations 30 | //2.1 Select computing devices 31 | cl::Context context = GetContext(platform_id, device_id); 32 | 33 | //display the selected device 34 | std::cout << "Runinng on " << GetPlatformName(platform_id) << ", " << GetDeviceName(platform_id, device_id) << std::endl; 35 | 36 | //create a queue to which we will push commands for the device 37 | cl::CommandQueue queue(context); 38 | 39 | //2.2 Load & build the device code 40 | cl::Program::Sources sources; 41 | 42 | AddSources(sources, "kernels/my_kernels.cl"); 43 | 44 | cl::Program program(context, sources); 45 | 46 | //build and debug the kernel code 47 | try { 48 | program.build(); 49 | } 50 | catch (const cl::Error& err) { 51 | std::cout << "Build Status: " << program.getBuildInfo(context.getInfo()[0]) << std::endl; 52 | std::cout << "Build Options:\t" << program.getBuildInfo(context.getInfo()[0]) << std::endl; 53 | std::cout << "Build Log:\t " << program.getBuildInfo(context.getInfo()[0]) << std::endl; 54 | throw err; 55 | } 56 | 57 | typedef int mytype; 58 | 59 | //Part 3 - memory allocation 60 | //host - input 61 | std::vector A(10, 1);//allocate 10 elements with an initial value 1 - their sum is 10 so it should be easy to check the results! 
62 | 63 | //the following part adjusts the length of the input vector so it can be run for a specific workgroup size 64 | //if the total input length is divisible by the workgroup size 65 | //this makes the code more efficient 66 | size_t local_size = 10; 67 | 68 | size_t padding_size = A.size() % local_size; 69 | 70 | //if the input vector is not a multiple of the local_size 71 | //insert additional neutral elements (0 for addition) so that the total will not be affected 72 | if (padding_size) { 73 | //create an extra vector with neutral values 74 | std::vector A_ext(local_size-padding_size, 0); 75 | //append that extra vector to our input 76 | A.insert(A.end(), A_ext.begin(), A_ext.end()); 77 | } 78 | 79 | size_t input_elements = A.size();//number of input elements 80 | size_t input_size = A.size()*sizeof(mytype);//size in bytes 81 | size_t nr_groups = input_elements / local_size; 82 | 83 | //host - output 84 | std::vector B(input_elements); 85 | size_t output_size = B.size()*sizeof(mytype);//size in bytes 86 | 87 | //device - buffers 88 | cl::Buffer buffer_A(context, CL_MEM_READ_ONLY, input_size); 89 | cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, output_size); 90 | 91 | //Part 4 - device operations 92 | 93 | //4.1 copy array A to and initialise other arrays on device memory 94 | queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, input_size, &A[0]); 95 | queue.enqueueFillBuffer(buffer_B, 0, 0, output_size);//zero B buffer on device memory 96 | 97 | //4.2 Setup and execute all kernels (i.e. 
device code) 98 | cl::Kernel kernel_1 = cl::Kernel(program, "reduce_add_1"); 99 | kernel_1.setArg(0, buffer_A); 100 | kernel_1.setArg(1, buffer_B); 101 | // kernel_1.setArg(2, cl::Local(local_size*sizeof(mytype)));//local memory size 102 | 103 | //call all kernels in a sequence 104 | queue.enqueueNDRangeKernel(kernel_1, cl::NullRange, cl::NDRange(input_elements), cl::NDRange(local_size)); 105 | 106 | //4.3 Copy the result from device to host 107 | queue.enqueueReadBuffer(buffer_B, CL_TRUE, 0, output_size, &B[0]); 108 | 109 | std::cout << "A = " << A << std::endl; 110 | std::cout << "B = " << B << std::endl; 111 | } 112 | catch (cl::Error err) { 113 | std::cerr << "ERROR: " << err.what() << ", " << getErrorString(err.err()) << std::endl; 114 | } 115 | 116 | return 0; 117 | } -------------------------------------------------------------------------------- /Tutorial 3/Tutorial 3.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Debug 10 | x64 11 | 12 | 13 | Release 14 | Win32 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 23 | Debug 24 | x64 25 | 26 | 27 | Release 28 | x64 29 | 30 | 31 | 32 | {8CB4B79A-8170-44DE-88DC-C73EACB44CB2} 33 | Tutorial 1 34 | 10.0 35 | 36 | 37 | 38 | Application 39 | true 40 | v142 41 | Unicode 42 | 43 | 44 | Application 45 | false 46 | v142 47 | Unicode 48 | true 49 | 50 | 51 | Application 52 | true 53 | v142 54 | Unicode 55 | 56 | 57 | Application 58 | false 59 | v142 60 | Unicode 61 | true 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | false 79 | 80 | 81 | true 82 | 83 | 84 | false 85 | $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ 86 | 87 | 88 | true 89 | $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ 90 | 91 | 92 | 93 | 0 94 | 95 | 96 | $(INTELOCLSDKROOT)include;%(AdditionalIncludeDirectories) 97 | Win32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 98 | Level3 99 | ProgramDatabase 100 
| 101 | 102 | 103 | $(INTELOCLSDKROOT)lib\x86;%(AdditionalLibraryDirectories) 104 | OpenCL.lib;%(AdditionalDependencies) 105 | true 106 | 107 | 108 | If exist "*.cl" copy "*.cl" "$(OutDir)\" 109 | 110 | 111 | 112 | 113 | 0 114 | 115 | 116 | $(INTELOCLSDKROOT)include;%(AdditionalIncludeDirectories) 117 | Win32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 118 | Level3 119 | ProgramDatabase 120 | 121 | 122 | 123 | $(INTELOCLSDKROOT)lib\x86;%(AdditionalLibraryDirectories) 124 | OpenCL.lib;%(AdditionalDependencies) 125 | true 126 | 127 | 128 | If exist "*.cl" copy "*.cl" "$(OutDir)\" 129 | 130 | 131 | 132 | 133 | 0 134 | 135 | 136 | $(INTELOCLSDKROOT)include;..\include;%(AdditionalIncludeDirectories) 137 | __x86_64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 138 | MaxSpeed 139 | false 140 | Default 141 | MultiThreadedDLL 142 | Level3 143 | ProgramDatabase 144 | 145 | 146 | 147 | $(INTELOCLSDKROOT)lib\x64;%(AdditionalLibraryDirectories) 148 | OpenCL.lib;%(AdditionalDependencies) 149 | true 150 | true 151 | true 152 | Console 153 | 154 | 155 | xcopy /s /i /y "kernels" "$(OutDir)kernels" 156 | 157 | 158 | 159 | 160 | 0 161 | 162 | 163 | $(INTELOCLSDKROOT)include;..\include;%(AdditionalIncludeDirectories) 164 | __x86_64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 165 | Disabled 166 | false 167 | EnableFastChecks 168 | MultiThreadedDebugDLL 169 | Level3 170 | ProgramDatabase 171 | 172 | 173 | 174 | $(INTELOCLSDKROOT)lib\x64;%(AdditionalLibraryDirectories) 175 | OpenCL.lib;%(AdditionalDependencies) 176 | true 177 | Console 178 | 179 | 180 | xcopy /s /i /y "kernels" "$(OutDir)kernels" 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | -------------------------------------------------------------------------------- /Tutorial 3/Tutorial 3.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 8 | {bc7a8fec-44e6-4521-a22b-e4595d5b0ed1} 9 | 10 | 11 | 
// Device code for Tutorial 3: parallel reduce, histogram and scan patterns.
//
// NOTE on synchronisation: barrier() only synchronises the work-items of ONE
// work-group. Kernels below that use barrier(CLK_GLOBAL_MEM_FENCE) to order
// accesses to a global buffer are therefore only correct when the whole
// NDRange is launched as a single work-group (global size == local size).

// Fixed 4-step reduce (strides 1, 2, 4, 8), so it sums at most 16 elements.
// A: input values, B: output; after the kernel B[0] holds the sum.
// Requires a single work-group (see note at the top of the file).
kernel void reduce_add_1(global const int* A, global int* B) {
    int id = get_global_id(0);
    int N = get_global_size(0);

    B[id] = A[id]; //copy input to output

    barrier(CLK_GLOBAL_MEM_FENCE); //wait for the work-group to finish copying

    //perform reduce on the output array
    //modulo operator is used to skip a set of values (e.g. 2 in the next line)
    //we also check if the added element is within bounds (i.e. < N)
    if (((id % 2) == 0) && ((id + 1) < N))
        B[id] += B[id + 1];

    barrier(CLK_GLOBAL_MEM_FENCE);

    if (((id % 4) == 0) && ((id + 2) < N))
        B[id] += B[id + 2];

    barrier(CLK_GLOBAL_MEM_FENCE);

    if (((id % 8) == 0) && ((id + 4) < N))
        B[id] += B[id + 4];

    barrier(CLK_GLOBAL_MEM_FENCE);

    if (((id % 16) == 0) && ((id + 8) < N))
        B[id] += B[id + 8];
}

// Flexible-step reduce: same scheme as reduce_add_1 but the stride doubles
// until it reaches the global size, so any N is covered.
// After the kernel B[0] holds the sum. Requires a single work-group.
kernel void reduce_add_2(global const int* A, global int* B) {
    int id = get_global_id(0);
    int N = get_global_size(0);

    B[id] = A[id];

    barrier(CLK_GLOBAL_MEM_FENCE);

    for (int i = 1; i < N; i *= 2) { //i is a stride
        if (!(id % (i * 2)) && ((id + i) < N))
            B[id] += B[id + i];

        barrier(CLK_GLOBAL_MEM_FENCE);
    }
}

// Reduce using local memory (so called privatisation).
// scratch: local buffer with one int per work-item of the group.
// Every work-group reduces its own slice; the group's sum ends up in
// scratch[0] and is therefore written to B at the group's first global index.
kernel void reduce_add_3(global const int* A, global int* B, local int* scratch) {
    int id = get_global_id(0);
    int lid = get_local_id(0);
    int N = get_local_size(0);

    //cache all N values from global memory to local memory
    scratch[lid] = A[id];

    barrier(CLK_LOCAL_MEM_FENCE);//wait for all local threads to finish copying from global to local memory

    for (int i = 1; i < N; i *= 2) {
        if (!(lid % (i * 2)) && ((lid + i) < N))
            scratch[lid] += scratch[lid + i];

        barrier(CLK_LOCAL_MEM_FENCE);
    }

    //copy the cache to output array
    B[id] = scratch[lid];
}

// Reduce using local memory + accumulation of local sums into a single location.
// Works with any number of groups - not optimal!
// B[0] must be initialised by the host (the group sums are atomically ADDED to it).
kernel void reduce_add_4(global const int* A, global int* B, local int* scratch) {
    int id = get_global_id(0);
    int lid = get_local_id(0);
    int N = get_local_size(0);

    //cache all N values from global memory to local memory
    scratch[lid] = A[id];

    barrier(CLK_LOCAL_MEM_FENCE);//wait for all local threads to finish copying from global to local memory

    for (int i = 1; i < N; i *= 2) {
        if (!(lid % (i * 2)) && ((lid + i) < N))
            scratch[lid] += scratch[lid + i];

        barrier(CLK_LOCAL_MEM_FENCE);
    }

    //we add results from all local groups to the first element of the array
    //serial operation! but works for any group size
    if (!lid) {
        atomic_add(&B[0], scratch[lid]);
    }
}

// A very simple histogram implementation.
// Each input value is used directly as a bin index into H, so every value in
// A must be a valid index of H (no range check is performed here).
kernel void hist_simple(global const int* A, global int* H) {
    int id = get_global_id(0);

    //assumes that H has been initialised to 0
    int bin_index = A[id];//take value as a bin index

    atomic_inc(&H[bin_index]);//serial operation, not very efficient!
}

// Hillis-Steele basic inclusive scan.
// Requires additional buffer B to avoid data overwrite; A is used as a second
// working buffer and is overwritten. Requires a single work-group.
// The inclusive scan is always delivered in the caller's B buffer.
kernel void scan_hs(global int* A, global int* B) {
    int id = get_global_id(0);
    int N = get_global_size(0);
    global int* out = B; //the caller's output buffer, before any swapping
    global int* C;

    for (int stride = 1; stride < N; stride *= 2) {
        B[id] = A[id];
        if (id >= stride)
            B[id] += A[id - stride];

        barrier(CLK_GLOBAL_MEM_FENCE); //sync the step

        C = A; A = B; B = C; //swap A & B between steps
    }

    //After the last swap, A points at the buffer holding the newest values.
    //With an even number of steps (or none) that is the caller's A buffer,
    //so copy this work-item's element across; it was written by this same
    //work-item, hence no further synchronisation is needed.
    if (A != out)
        out[id] = A[id];
}

// A double-buffered version of the Hillis-Steele inclusive scan.
// Requires two additional arguments which correspond to two local buffers
// (one int per work-item each). Each work-group scans its own slice
// independently; B receives the per-group inclusive scans.
kernel void scan_add(__global const int* A, global int* B, local int* scratch_1, local int* scratch_2) {
    int id = get_global_id(0);
    int lid = get_local_id(0);
    int N = get_local_size(0);
    local int *scratch_3;//used for buffer swap

    //cache all N values from global memory to local memory
    scratch_1[lid] = A[id];

    barrier(CLK_LOCAL_MEM_FENCE);//wait for all local threads to finish copying from global to local memory

    for (int i = 1; i < N; i *= 2) {
        if (lid >= i)
            scratch_2[lid] = scratch_1[lid] + scratch_1[lid - i];
        else
            scratch_2[lid] = scratch_1[lid];

        barrier(CLK_LOCAL_MEM_FENCE);

        //buffer swap - afterwards scratch_1 always refers to the newest values
        scratch_3 = scratch_2;
        scratch_2 = scratch_1;
        scratch_1 = scratch_3;
    }

    //copy the cache to output array
    B[id] = scratch_1[lid];
}

// Blelloch basic exclusive scan, performed in place on A.
// Requires a single work-group.
// NOTE(review): the up-/down-sweep index pattern assumes N is a power of two.
kernel void scan_bl(global int* A) {
    int id = get_global_id(0);
    int N = get_global_size(0);
    int t;

    //up-sweep
    for (int stride = 1; stride < N; stride *= 2) {
        if (((id + 1) % (stride*2)) == 0)
            A[id] += A[id - stride];

        barrier(CLK_GLOBAL_MEM_FENCE); //sync the step
    }

    //down-sweep
    if (id == 0)
        A[N-1] = 0;//exclusive scan

    barrier(CLK_GLOBAL_MEM_FENCE); //sync the step

    for (int stride = N/2; stride > 0; stride /= 2) {
        if (((id + 1) % (stride*2)) == 0) {
            t = A[id];
            A[id] += A[id - stride]; //reduce
            A[id - stride] = t; //move
        }

        barrier(CLK_GLOBAL_MEM_FENCE); //sync the step
    }
}

// Calculates the block sums: picks the LAST element of every block of
// local_size elements in A (i.e. the block total of an inclusive scan)
// and stores it in B[block index].
kernel void block_sum(global const int* A, global int* B, int local_size) {
    int id = get_global_id(0);
    B[id] = A[(id+1)*local_size-1];
}

// Simple exclusive serial scan based on atomic operations - sufficient for a
// small number of elements. Work-item id adds A[id] to every B[i] with i > id,
// so B must be zero-initialised by the host.
kernel void scan_add_atomic(global int* A, global int* B) {
    int id = get_global_id(0);
    int N = get_global_size(0);
    for (int i = id+1; i < N && id < N; i++)
        atomic_add(&B[i], A[id]);
}

// Adjusts the values stored in partial scans by adding block sums to the
// corresponding blocks: B is indexed by work-group id.
kernel void scan_add_adjust(global int* A, global const int* B) {
    int id = get_global_id(0);
    int gid = get_group_id(0);
    A[id] += B[gid];
}
compute::copy(B.begin(), B.end(), devB.begin()); 26 | 27 | // perform C = A + B 28 | compute::transform(devA.begin(), devA.end(), devB.begin(), devC.begin(), compute::plus()); 29 | 30 | // copy data back to the host 31 | compute::copy(devC.begin(), devC.end(), C.begin()); 32 | 33 | cout << "A = " << A << endl; 34 | cout << "B = " << B << endl; 35 | cout << "C = " << C << endl; 36 | 37 | return 0; 38 | } 39 | -------------------------------------------------------------------------------- /Tutorial 4/Tutorial 4.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | {E95D4B5A-1F3F-4A31-931F-7A99CE219124} 23 | Win32Proj 24 | BComputeExamples 25 | 10.0 26 | Tutorial 4 27 | 28 | 29 | 30 | Application 31 | true 32 | v142 33 | Unicode 34 | 35 | 36 | Application 37 | false 38 | v142 39 | true 40 | Unicode 41 | 42 | 43 | Application 44 | true 45 | v142 46 | Unicode 47 | 48 | 49 | Application 50 | false 51 | v142 52 | true 53 | Unicode 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | true 75 | 76 | 77 | true 78 | $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ 79 | 80 | 81 | false 82 | 83 | 84 | false 85 | $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ 86 | 87 | 88 | 89 | 90 | 91 | Level3 92 | Disabled 93 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 94 | C:\local\boost_1_60_0;C:\Program Files %28x86%29\Intel\OpenCL SDK\5.3\include;%(AdditionalIncludeDirectories) 95 | 96 | 97 | Console 98 | true 99 | C:\local\boost_1_60_0\lib64-msvc-14.0;%(AdditionalLibraryDirectories) 100 | 101 | 102 | 103 | 104 | 105 | 106 | Level3 107 | Disabled 108 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 109 | $(BOOST_INCLUDEDIR);$(INTELOCLSDKROOT)include;..\include;%(AdditionalIncludeDirectories) 110 | 4996 111 | 112 | 113 | Console 
114 | true 115 | $(BOOST_LIBRARYDIR);$(INTELOCLSDKROOT)lib\x64;%(AdditionalLibraryDirectories) 116 | OpenCL.lib;%(AdditionalDependencies) 117 | 118 | 119 | 120 | 121 | Level3 122 | 123 | 124 | MaxSpeed 125 | true 126 | true 127 | WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 128 | 129 | 130 | Console 131 | true 132 | true 133 | true 134 | 135 | 136 | 137 | 138 | Level3 139 | 140 | 141 | MaxSpeed 142 | true 143 | true 144 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 145 | $(BOOST_INCLUDEDIR);$(INTELOCLSDKROOT)include;..\include; 146 | 147 | 148 | Console 149 | true 150 | true 151 | true 152 | $(BOOST_LIBRARYDIR);$(INTELOCLSDKROOT)lib\x64;%(AdditionalLibraryDirectories) 153 | OpenCL.lib;%(AdditionalDependencies) 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /Tutorial 4/Tutorial 4.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 8 | {ba2bbd9b-6f55-4a90-8bc6-289f2bf5ddd5} 9 | 10 | 11 | 12 | 13 | include 14 | 15 | 16 | -------------------------------------------------------------------------------- /images/test.ppm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcielniak/OpenCL-Tutorials/b7fb2dfcc9afea1fc2f24df90db01fd60b6670a8/images/test.ppm -------------------------------------------------------------------------------- /images/test_large.ppm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcielniak/OpenCL-Tutorials/b7fb2dfcc9afea1fc2f24df90db01fd60b6670a8/images/test_large.ppm -------------------------------------------------------------------------------- /include/Utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 
#include 7 | 8 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS 9 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120 10 | #define CL_HPP_TARGET_OPENCL_VERSION 120 11 | #define CL_HPP_ENABLE_EXCEPTIONS 12 | 13 | #include 14 | 15 | using namespace std; 16 | 17 | template 18 | ostream& operator<< (ostream& out, const vector& v) { 19 | if (!v.empty()) { 20 | out << '['; 21 | copy(v.begin(), v.end(), ostream_iterator(out, ", ")); 22 | out << "\b\b]"; 23 | } 24 | return out; 25 | } 26 | 27 | string GetPlatformName(int platform_id) { 28 | vector platforms; 29 | cl::Platform::get(&platforms); 30 | return platforms[platform_id].getInfo(); 31 | } 32 | 33 | string GetDeviceName(int platform_id, int device_id) { 34 | vector platforms; 35 | cl::Platform::get(&platforms); 36 | vector devices; 37 | platforms[platform_id].getDevices((cl_device_type)CL_DEVICE_TYPE_ALL, &devices); 38 | return devices[device_id].getInfo(); 39 | } 40 | 41 | const char *getErrorString(cl_int error) { 42 | switch (error){ 43 | // run-time and JIT compiler errors 44 | case 0: return "CL_SUCCESS"; 45 | case -1: return "CL_DEVICE_NOT_FOUND"; 46 | case -2: return "CL_DEVICE_NOT_AVAILABLE"; 47 | case -3: return "CL_COMPILER_NOT_AVAILABLE"; 48 | case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; 49 | case -5: return "CL_OUT_OF_RESOURCES"; 50 | case -6: return "CL_OUT_OF_HOST_MEMORY"; 51 | case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; 52 | case -8: return "CL_MEM_COPY_OVERLAP"; 53 | case -9: return "CL_IMAGE_FORMAT_MISMATCH"; 54 | case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; 55 | case -11: return "CL_BUILD_PROGRAM_FAILURE"; 56 | case -12: return "CL_MAP_FAILURE"; 57 | case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; 58 | case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; 59 | case -15: return "CL_COMPILE_PROGRAM_FAILURE"; 60 | case -16: return "CL_LINKER_NOT_AVAILABLE"; 61 | case -17: return "CL_LINK_PROGRAM_FAILURE"; 62 | case -18: return "CL_DEVICE_PARTITION_FAILED"; 63 | case -19: return 
"CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; 64 | 65 | // compile-time errors 66 | case -30: return "CL_INVALID_VALUE"; 67 | case -31: return "CL_INVALID_DEVICE_TYPE"; 68 | case -32: return "CL_INVALID_PLATFORM"; 69 | case -33: return "CL_INVALID_DEVICE"; 70 | case -34: return "CL_INVALID_CONTEXT"; 71 | case -35: return "CL_INVALID_QUEUE_PROPERTIES"; 72 | case -36: return "CL_INVALID_COMMAND_QUEUE"; 73 | case -37: return "CL_INVALID_HOST_PTR"; 74 | case -38: return "CL_INVALID_MEM_OBJECT"; 75 | case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; 76 | case -40: return "CL_INVALID_IMAGE_SIZE"; 77 | case -41: return "CL_INVALID_SAMPLER"; 78 | case -42: return "CL_INVALID_BINARY"; 79 | case -43: return "CL_INVALID_BUILD_OPTIONS"; 80 | case -44: return "CL_INVALID_PROGRAM"; 81 | case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; 82 | case -46: return "CL_INVALID_KERNEL_NAME"; 83 | case -47: return "CL_INVALID_KERNEL_DEFINITION"; 84 | case -48: return "CL_INVALID_KERNEL"; 85 | case -49: return "CL_INVALID_ARG_INDEX"; 86 | case -50: return "CL_INVALID_ARG_VALUE"; 87 | case -51: return "CL_INVALID_ARG_SIZE"; 88 | case -52: return "CL_INVALID_KERNEL_ARGS"; 89 | case -53: return "CL_INVALID_WORK_DIMENSION"; 90 | case -54: return "CL_INVALID_WORK_GROUP_SIZE"; 91 | case -55: return "CL_INVALID_WORK_ITEM_SIZE"; 92 | case -56: return "CL_INVALID_GLOBAL_OFFSET"; 93 | case -57: return "CL_INVALID_EVENT_WAIT_LIST"; 94 | case -58: return "CL_INVALID_EVENT"; 95 | case -59: return "CL_INVALID_OPERATION"; 96 | case -60: return "CL_INVALID_GL_OBJECT"; 97 | case -61: return "CL_INVALID_BUFFER_SIZE"; 98 | case -62: return "CL_INVALID_MIP_LEVEL"; 99 | case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; 100 | case -64: return "CL_INVALID_PROPERTY"; 101 | case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; 102 | case -66: return "CL_INVALID_COMPILER_OPTIONS"; 103 | case -67: return "CL_INVALID_LINKER_OPTIONS"; 104 | case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; 105 | 106 | // extension errors 107 | 
case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; 108 | case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; 109 | case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; 110 | case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; 111 | case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; 112 | case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; 113 | default: return "Unknown OpenCL error"; 114 | } 115 | } 116 | 117 | void CheckError(cl_int error) { 118 | if (error != CL_SUCCESS) { 119 | cerr << "OpenCL call failed with error " << getErrorString(error) << endl; 120 | exit(1); 121 | } 122 | } 123 | 124 | void AddSources(cl::Program::Sources& sources, const string& file_name) { 125 | //TODO: add file existence check 126 | ifstream file(file_name); 127 | string* source_code = new string(istreambuf_iterator(file), (istreambuf_iterator())); 128 | sources.push_back((*source_code).c_str()); 129 | } 130 | 131 | string ListPlatformsDevices() { 132 | 133 | stringstream sstream; 134 | vector platforms; 135 | 136 | cl::Platform::get(&platforms); 137 | 138 | sstream << "Found " << platforms.size() << " platform(s):" << endl; 139 | 140 | for (unsigned int i = 0; i < platforms.size(); i++) 141 | { 142 | sstream << "\nPlatform " << i << ", " << platforms[i].getInfo() << ", version: " << platforms[i].getInfo(); 143 | 144 | sstream << ", vendor: " << platforms[i].getInfo() << endl; 145 | // sstream << ", extensions: " << platforms[i].getInfo() << endl; 146 | 147 | vector devices; 148 | 149 | platforms[i].getDevices((cl_device_type)CL_DEVICE_TYPE_ALL, &devices); 150 | 151 | sstream << "\n Found " << devices.size() << " device(s):" << endl; 152 | 153 | for (unsigned int j = 0; j < devices.size(); j++) 154 | { 155 | sstream << "\n Device " << j << ", " << devices[j].getInfo() << ", version: " << devices[j].getInfo(); 156 | 157 | sstream << ", vendor: " << devices[j].getInfo(); 158 | cl_device_type device_type = devices[j].getInfo(); 159 | sstream << ", type: "; 160 | if 
(device_type & CL_DEVICE_TYPE_DEFAULT) 161 | sstream << "DEFAULT "; 162 | if (device_type & CL_DEVICE_TYPE_CPU) 163 | sstream << "CPU "; 164 | if (device_type & CL_DEVICE_TYPE_GPU) 165 | sstream << "GPU "; 166 | if (device_type & CL_DEVICE_TYPE_ACCELERATOR) 167 | sstream << "ACCELERATOR "; 168 | sstream << ", compute units: " << devices[j].getInfo(); 169 | sstream << ", clock freq [MHz]: " << devices[j].getInfo(); 170 | sstream << ", max memory size [B]: " << devices[j].getInfo(); 171 | sstream << ", max allocatable memory [B]: " << devices[j].getInfo(); 172 | 173 | sstream << endl; 174 | } 175 | } 176 | sstream << "----------------------------------------------------------------" << endl; 177 | 178 | return sstream.str(); 179 | } 180 | 181 | cl::Context GetContext(int platform_id, int device_id) { 182 | vector platforms; 183 | 184 | cl::Platform::get(&platforms); 185 | 186 | for (unsigned int i = 0; i < platforms.size(); i++) 187 | { 188 | vector devices; 189 | platforms[i].getDevices((cl_device_type)CL_DEVICE_TYPE_ALL, &devices); 190 | 191 | for (unsigned int j = 0; j < devices.size(); j++) 192 | { 193 | if ((i == platform_id) && (j == device_id)) 194 | return cl::Context({ devices[j] }); 195 | } 196 | } 197 | 198 | return cl::Context(); 199 | } 200 | 201 | enum ProfilingResolution { 202 | PROF_NS = 1, 203 | PROF_US = 1000, 204 | PROF_MS = 1000000, 205 | PROF_S = 1000000000 206 | }; 207 | 208 | string GetFullProfilingInfo(const cl::Event& evnt, ProfilingResolution resolution) { 209 | stringstream sstream; 210 | 211 | sstream << "Queued " << (evnt.getProfilingInfo() - evnt.getProfilingInfo()) / resolution; 212 | sstream << ", Submitted " << (evnt.getProfilingInfo() - evnt.getProfilingInfo()) / resolution; 213 | sstream << ", Executed " << (evnt.getProfilingInfo() - evnt.getProfilingInfo()) / resolution; 214 | sstream << ", Total " << (evnt.getProfilingInfo() - evnt.getProfilingInfo()) / resolution; 215 | 216 | switch (resolution) { 217 | case PROF_NS: sstream << " 
[ns]"; break; 218 | case PROF_US: sstream << " [us]"; break; 219 | case PROF_MS: sstream << " [ms]"; break; 220 | case PROF_S: sstream << " [s]"; break; 221 | default: break; 222 | } 223 | 224 | return sstream.str(); 225 | } --------------------------------------------------------------------------------