├── examples ├── imgData.mat ├── mul_kernel.cl ├── test_mul.m ├── img_test.m └── filter.cl ├── .gitignore ├── get_devices.m ├── compile_linux.m ├── cl_get_devices.m ├── .zenodo.json ├── compile_windows.m ├── test_kernel.cl ├── compile_mac.m ├── cl_dbg_kernel.m ├── src ├── cl_dbg_kernel.cpp ├── cl_get_devices.cpp ├── ocl_dev_mgr.hpp ├── utils.hpp ├── cl_run_kernel.cpp ├── ocl_dev_mgr.cpp └── MatCL.hpp ├── cl_run_kernel.m ├── run_kernel.m └── README.md /examples/imgData.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IANW-Projects/MatCL/HEAD/examples/imgData.mat -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.asv 2 | *.pdb 3 | *.mexa64 4 | *.mexmaci64 5 | *.mexw32 6 | *.mexw64 7 | 8 | CL/* 9 | 10 | -------------------------------------------------------------------------------- /get_devices.m: -------------------------------------------------------------------------------- 1 | %This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. 2 | 3 | clear all 4 | close all 5 | clc 6 | 7 | %% 8 | %To get a list of available OpenCl devices use 'cl_get_devices'. This 9 | %function returns the names of all available devices, the device class 10 | %(CPU, GPU or Other for other or unknown Accelerators) and the available 11 | %device memory in bytes. To choose a device use the index of the 12 | %corresponding entry in the names array. 13 | [dev_name,dev_type,max_mem,wg_size,lw_size,compute_units]=cl_get_devices; 14 | -------------------------------------------------------------------------------- /compile_linux.m: -------------------------------------------------------------------------------- 1 | %This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. 
2 | 3 | close all 4 | clear all 5 | clc 6 | 7 | %Change OpenCL library path according to your setup 8 | mex -g COMPFLAGS='$COMPFLAGS -std=c++11 -O2' '-LC /usr/lib/x86_64-linux-gnu' -lOpenCL src/cl_get_devices.cpp src/ocl_dev_mgr.cpp 9 | mex -g COMPFLAGS='$COMPFLAGS -std=c++11 -O2' '-LC /usr/lib/x86_64-linux-gnu' -lOpenCL src/cl_run_kernel.cpp src/ocl_dev_mgr.cpp 10 | mex -g COMPFLAGS='$COMPFLAGS -std=c++11 -O2' '-LC /usr/lib/x86_64-linux-gnu' -lOpenCL src/cl_dbg_kernel.cpp src/ocl_dev_mgr.cpp 11 | 12 | [dev_name,dev_type,max_mem,wg_size,lw_size]=cl_get_devices; 13 | -------------------------------------------------------------------------------- /cl_get_devices.m: -------------------------------------------------------------------------------- 1 | %CL_GET_DEVICES Enumerate OpenCL devices (returns a list whose i-th entry corresponds to the i-th OpenCL device) 2 | % 3 | % 4 | % [names, dev_class, max_mem, max_wg_size, max_local_work_size, compute_units] = cl_get_devices; 5 | % 6 | % 7 | % Outputs 8 | % ------- 9 | % 10 | % names: Names of all available devices 11 | % dev_class: The device class (CPU, GPU or Other for other or unknown Accelerators) 12 | % max_mem: The available device memory in bytes 13 | % max_wg_size: Max. size of OpenCL work group 14 | % max_local_work_size: Max. size of work items 15 | % compute_units: Number of compute units (e.g. 
CPU cores) of the device 16 | % 17 | -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "MatCL: A new easy-to use OpenCL toolbox for MathWorks Matlab", 3 | "license": "other-open", 4 | "title": "IANW-Projects/MatCL", 5 | "version": "v1.1.2", 6 | "upload_type": "software", 7 | "publication_date": "2019-04-24", 8 | "creators": [ 9 | { 10 | "affiliation": "TU Braunschweig", 11 | "name": "Philip Heinisch" 12 | }, 13 | { 14 | "affiliation": "TU Braunschweig", 15 | "name": "Katharina Ostaszewski" 16 | }, 17 | { 18 | "affiliation": "TU Braunschweig", 19 | "name": "Hendrik Ranocha" 20 | } 21 | ], 22 | "access_right": "open" 23 | } 24 | -------------------------------------------------------------------------------- /compile_windows.m: -------------------------------------------------------------------------------- 1 | %This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. 
2 | 3 | close all 4 | clear all 5 | clc 6 | 7 | %Change OpenCL library path according to your setup 8 | mex -g COMPFLAGS='$COMPFLAGS -O2' '-L C:\Intel\OpenCL\sdk\lib\x64\' '-I C:\Intel\OpenCL\sdk\include\' -lOpenCL src\cl_get_devices.cpp src\ocl_dev_mgr.cpp 9 | mex -g COMPFLAGS='$COMPFLAGS -O2' '-L C:\Intel\OpenCL\sdk\lib\x64\' '-I C:\Intel\OpenCL\sdk\include\' -lOpenCL src\cl_run_kernel.cpp src\ocl_dev_mgr.cpp 10 | mex -g COMPFLAGS='$COMPFLAGS -O2' '-L C:\Intel\OpenCL\sdk\lib\x64\' '-I C:\Intel\OpenCL\sdk\include\' -lOpenCL src\cl_dbg_kernel.cpp src\ocl_dev_mgr.cpp 11 | 12 | [dev_name,dev_type,max_mem,wg_size,lw_size,compute_units]=cl_get_devices; 13 | -------------------------------------------------------------------------------- /test_kernel.cl: -------------------------------------------------------------------------------- 1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */ 2 | 3 | #ifdef cl_khr_fp64 4 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable 5 | #else 6 | #error "IEEE-754 double precision not supported by OpenCL implementation." 7 | #endif 8 | 9 | kernel void test1(global double4 *d_1,global double4 *d_2) 10 | { 11 | uint idx = get_global_id(0); 12 | 13 | //Simple test just add a constant DT to the value of d_1. The constant DT is a kernel define to increase performance 14 | d_1[idx].x=d_1[idx].x+DT; 15 | 16 | }; 17 | 18 | 19 | kernel void test2(global double4 *d_1,global double4 *d_2) 20 | { 21 | uint idx = get_global_id(0); 22 | printf("Test: %d \n",idx); 23 | d_1[idx].w=d_2[idx].w+DT; 24 | 25 | }; 26 | -------------------------------------------------------------------------------- /compile_mac.m: -------------------------------------------------------------------------------- 1 | %This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. 
2 | 3 | close all 4 | clear all 5 | clc 6 | 7 | % Apple does not ship cl2.hpp in general 8 | mkdir('CL'); 9 | websave('CL/cl2.hpp', 'https://github.com/KhronosGroup/OpenCL-CLHPP/releases/download/v2.0.10/cl2.hpp'); 10 | 11 | %Change OpenCL library path according to your setup 12 | mex -g COMPFLAGS='$COMPFLAGS -std=c++11 -O2 -framework OpenCL' -I./ LDFLAGS='$LDFLAGS -framework OpenCL' src/cl_get_devices.cpp src/ocl_dev_mgr.cpp 13 | mex -g COMPFLAGS='$COMPFLAGS -std=c++11 -O2 -framework OpenCL' -I./ LDFLAGS='$LDFLAGS -framework OpenCL' src/cl_run_kernel.cpp src/ocl_dev_mgr.cpp 14 | mex -g COMPFLAGS='$COMPFLAGS -std=c++11 -O2 -framework OpenCL' -I./ LDFLAGS='$LDFLAGS -framework OpenCL' src/cl_dbg_kernel.cpp src/ocl_dev_mgr.cpp 15 | 16 | [dev_name,dev_type,max_mem,wg_size,lw_size]=cl_get_devices; 17 | -------------------------------------------------------------------------------- /examples/mul_kernel.cl: -------------------------------------------------------------------------------- 1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */ 2 | 3 | // enable double precision (not enabled by default) 4 | 5 | #ifdef cl_khr_fp64 6 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable 7 | #else 8 | #error "IEEE-754 double precision not supported by OpenCL implementation." 
9 | #endif 10 | 11 | kernel void MM(const global double *A,const global double *B,global double *C) 12 | { 13 | 14 | // Thread identifiers 15 | const uint globalRow = get_global_id(0); // Row ID of C (0..M) 16 | const uint globalCol = get_global_id(1); // Col ID of C (0..N) 17 | 18 | const uint num_rows=(uint)NR; 19 | const uint num_cols=(uint)NC; 20 | const uint num_i=(uint)NI; 21 | 22 | // Compute a single element (loop over K) 23 | double acc = 0.0f; 24 | 25 | for (uint k=0; k 4 | #include "mex.h" 5 | #include "matrix.h" 6 | #include 7 | #include 8 | #if defined(_WIN32) 9 | #include 10 | #include 11 | #include 12 | #endif 13 | 14 | 15 | #define CL_HPP_ENABLE_EXCEPTIONS 16 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120 17 | #define CL_HPP_TARGET_OPENCL_VERSION 120 18 | 19 | #include 20 | #include "ocl_dev_mgr.hpp" 21 | 22 | 23 | #include "MatCL.hpp" 24 | 25 | 26 | class mystream : public std::streambuf 27 | { 28 | protected: 29 | virtual std::streamsize xsputn(const char *s, std::streamsize n) { mexPrintf("%.*s", n, s); return n; } 30 | virtual int overflow(int c = EOF) { if (c != EOF) { mexPrintf("%.1s", &c); } return 1; } 31 | }; 32 | class scoped_redirect_cout 33 | { 34 | public: 35 | scoped_redirect_cout() { old_buf = std::cout.rdbuf(); std::cout.rdbuf(&mout); } 36 | ~scoped_redirect_cout() { std::cout.rdbuf(old_buf); } 37 | private: 38 | mystream mout; 39 | std::streambuf *old_buf; 40 | }; 41 | static scoped_redirect_cout mycout_redirect; 42 | 43 | 44 | 45 | void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray*prhs[]) 46 | 47 | { 48 | 49 | ocl_dev_mgr& dev_mgr = ocl_dev_mgr::getInstance(); 50 | 51 | uint32_t device = (uint32_t)mxGetScalar(prhs[0]) - 1; 52 | 53 | if (nrhs>2) { 54 | 55 | if (device 4 | #include "mex.h" 5 | #include "matrix.h" 6 | 7 | 8 | #define CL_HPP_ENABLE_EXCEPTIONS 9 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120 10 | #define CL_HPP_TARGET_OPENCL_VERSION 120 11 | 12 | #include 13 | #include "ocl_dev_mgr.hpp" 14 | 15 | 16 | void 
mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray*prhs[] ) { 17 | 18 | uint32_t devices_availble=0; 19 | 20 | ocl_dev_mgr& dev_mgr = ocl_dev_mgr::getInstance(); 21 | devices_availble=dev_mgr.get_avail_dev_num(); 22 | 23 | //get OpenCl device names 24 | mxArray * tmp_str; 25 | mxArray *cell_array_ptr; 26 | mxArray *matrix_ptr; 27 | 28 | char name_string[100]=""; 29 | 30 | 31 | cell_array_ptr = mxCreateCellMatrix((mwSize)devices_availble,1); 32 | for (uint32_t i=0;i 7 | 8 | // disable strange warnings for newer versions of GCC for OpenCL typedefs 9 | #pragma GCC diagnostic ignored "-Wignored-attributes" 10 | 11 | #define CL_HPP_ENABLE_EXCEPTIONS 12 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120 13 | #define CL_HPP_TARGET_OPENCL_VERSION 120 14 | #if defined(__APPLE__) 15 | #define CL_SILENCE_DEPRECATION 16 | #include 17 | #else 18 | #include 19 | #endif 20 | 21 | 22 | class ocl_dev_mgr { 23 | public: 24 | ~ocl_dev_mgr() {}; 25 | 26 | static ocl_dev_mgr& getInstance() { 27 | static ocl_dev_mgr instance; 28 | return instance; 29 | } 30 | 31 | struct ocl_device_info{ 32 | cl::Device device; 33 | std::string name; 34 | cl::Platform platform; 35 | std::string platform_name; 36 | std::string vendor; 37 | cl_device_type type; 38 | std::string ocl_version; 39 | cl_ulong max_mem; 40 | cl_ulong max_mem_alloc; 41 | size_t wg_size; 42 | cl_uint lw_dim; 43 | size_t lw_size; 44 | cl_uint compute_units; 45 | cl_uint copy_perf; 46 | cl_uint double_perf; 47 | cl_uint float_perf; 48 | }; 49 | 50 | std::string getDevicePCIeID(cl_uint avail_device_idx); 51 | cl_ulong init_device(cl_uint avail_device_idx); 52 | cl::CommandQueue& get_queue(cl_uint context_idx, cl_uint queue_idx); 53 | cl::Context& get_context(cl_uint context_idx); 54 | cl::Program& get_program(cl_uint context_idx, std::string const& prog_name); 55 | cl_ulong get_avail_dev_num(); 56 | cl_ulong get_context_num(); 57 | ocl_device_info& get_avail_dev_info(cl_uint avail_device_idx); 58 | ocl_device_info& 
get_context_dev_info(cl_uint context_idx, cl_uint device_idx); 59 | cl_ulong compile_kernel(cl_uint context_idx, std::string const& prog_name, std::string const& options); 60 | cl_ulong get_kernel_names(cl_uint context_idx, std::string const& prog_name, std::vector& found_kernels); 61 | cl_ulong execute_kernel(cl::Kernel& kernel, cl::CommandQueue& queue, 62 | cl::NDRange global_range, cl::NDRange local_range, 63 | std::vector& dev_Buffers); 64 | cl_ulong execute_kernelNA(cl::Kernel& kernel, cl::CommandQueue& queue, 65 | cl::NDRange range_start, cl::NDRange global_range, cl::NDRange local_range); 66 | void execute_kernel_async(cl::Kernel& kernel, cl::CommandQueue& queue, 67 | cl::NDRange global_range, cl::NDRange local_range, 68 | std::vector& dev_Buffers); 69 | bool add_program_url(cl_uint context_idx, std::string prog_name, std::string const& url); 70 | bool add_program_str(cl_uint context_idx, std::string prog_name, std::string kernel); 71 | cl::Kernel* getKernelbyName(cl_uint context_idx, std::string const& prog_name, std::string const& kernel_name); 72 | cl::Kernel* getKernelbyID(cl_uint context_idx, std::string const& prog_name, cl_ulong kernel_id); 73 | std::string getDeviceType(cl_uint avail_device_idx); 74 | void deinitalize(); 75 | 76 | private: 77 | const std::string type_cpu_str = "CPU"; 78 | const std::string type_gpu_str = "GPU"; 79 | const std::string type_acc_str = "ACCELERATOR"; 80 | const std::string type_other_str = "OTHER"; 81 | 82 | struct ocl_context { 83 | cl::Context context; 84 | std::vector queues; 85 | std::vector programs; 86 | std::vector prog_names; 87 | std::vector> kernels; 88 | std::vector> kernel_names; 89 | std::vector devices; 90 | }; 91 | 92 | void initialize(); 93 | ocl_dev_mgr(); 94 | cl_ulong getDeviceList(std::vector& devices); 95 | 96 | std::vector available_devices; 97 | cl_ulong num_available_devices; 98 | std::vector con_list; 99 | }; 100 | 101 | #endif // DEV_MGR_H 102 | 
-------------------------------------------------------------------------------- /run_kernel.m: -------------------------------------------------------------------------------- 1 | %This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. 2 | 3 | clear all 4 | close all 5 | clc 6 | 7 | %% 8 | % Use cl_run_kernel to compile and launch kernels. It is possible to compile 9 | % and run kernels in a two-stage process to increase performance or just use 10 | % a single step approach, that does everything in one go. 11 | 12 | global_range=[10,1,1]; %Set global OpenCl Range. Default indexing is 3D. To use a 1D index set y and z to 1 13 | local_range=[0]; %Let OpenCL decide local range, otherwise specify range explicitly (like global range) 14 | 15 | % Create input data for the kernel 16 | for i=1:20 17 | in1(1,i)=double(1); 18 | in1(2,i)=double(1); 19 | in1(3,i)=double(1); 20 | in1(4,i)=double(1); 21 | 22 | in2(1,i)=double(2); 23 | in2(2,i)=double(2); 24 | in2(3,i)=double(2); 25 | in2(4,i)=double(2); 26 | end 27 | 28 | % This example shows how to only compile the kernel but not run it. The 29 | % arguments are as follows: 30 | % - OpenCl Device ID - see cl_get_devices 31 | % - Kernel file URL 32 | % - Kernel defines, can be used to efficiently define constant values or set 33 | % other compiler arguments 34 | % 35 | %This function returns the compile time (in us) and an array with the names of the compiled kernel 36 | %functions 37 | %The OpenCL optimization flags -cl-mad-enable -cl-no-signed-zeros 38 | %-cl-finite-math-only were tested on different devices and should not cause 39 | %unexpected behaviour 40 | [comp_time,kernels]=cl_run_kernel(1,'test_kernel.cl','-DDT=1.0 -cl-mad-enable -cl-no-signed-zeros -cl-finite-math-only'); 41 | 42 | 43 | % This example shows how to run a precompiled kernel. 
The 44 | % arguments are as follows: 45 | % - OpenCl Device ID - see cl_get_devices 46 | % - Name of the function to run or cell array of kernel names to queue 47 | % multiple kernels 48 | % - Global OpenCL Range used to launch the kernel (see OpenCL NDRange) 49 | % - Local OpenCL Range used to launch the kernel (see OpenCL NDRange). This 50 | % value can be set to 0 to let OpenCL decide the best values 51 | % - List of variables to be used by the kernel - they will be passed in the 52 | % same order to the kernel itself. In case these variables get changed by 53 | % the kernel, the value of the input variable will change automatically 54 | % - read/write flag for the Kernel variables, this can either be scalar(all variables are read&write) or a vector with an entry for each variable: 0 - read&write / 1 - kernel read 55 | % only / 2 - kernel write only. 56 | % 57 | %This function returns the runtime of the actual kernel and the buffer copy time in us 58 | [run_time,copy_time]=cl_run_kernel(1,'test1',global_range,local_range,in1,in2,0); 59 | 60 | % This example shows how to compile and execute a kernel in a single pass. 61 | % The arguments are as follows: 62 | % - OpenCl Device ID - see cl_get_devices 63 | % - Kernel file URL 64 | % - Kernel defines, can be used to efficiently define constant values or set 65 | % other compiler arguments 66 | % - Name of the function to run or cell array of kernel names to queue 67 | % multiple kernels 68 | % - Global OpenCL Range used to launch the kernel (see OpenCL NDRange) 69 | % - Local OpenCL Range used to launch the kernel (see OpenCL NDRange). This 70 | % value can be set to 0 to let OpenCL decide the best values 71 | % - List of variables to be used by the kernel - they will be passed in the 72 | % same order to the kernel itself. 
In case these variables get changed by 73 | % the kernel, the value of the input variable will change automatically 74 | % - read/write flag for the Kernel variables, this can either be scalar(all variables are read&write) or a vector with an entry for each variable: 0 - read&write / 1 - kernel read 75 | % only / 2 - kernel write only. 76 | % 77 | %This function returns the runtime of the actual kernel in ms 78 | [run_time]=cl_run_kernel(1,'test_kernel.cl','-DDT=5.0 -cl-mad-enable -cl-no-signed-zeros -cl-finite-math-only','test2',global_range,local_range,in1,in2,[0 1]); 79 | 80 | 81 | %Same as above. but this functions pipes kernel printf to Matlab 82 | [run_time]=cl_dbg_kernel(1,'test_kernel.cl','-DDT=5.0 -cl-mad-enable -cl-no-signed-zeros -cl-finite-math-only','test2',global_range,local_range,in1,in2,[0 1]); 83 | -------------------------------------------------------------------------------- /src/utils.hpp: -------------------------------------------------------------------------------- 1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */ 2 | 3 | #ifndef UTILS_H 4 | #define UTILS_H 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | #if defined(_WIN32) 15 | #include 16 | typedef cl_ulong uint64_t; 17 | typedef unsigned int uint; 18 | #else 19 | #include 20 | #include 21 | #endif 22 | 23 | class Timer 24 | { 25 | private: 26 | #if defined(_WIN32) 27 | LARGE_INTEGER frequency_; 28 | DWORD startTick_; 29 | LONGLONG prevElapsedTime_; 30 | LARGE_INTEGER startTime_; 31 | #else 32 | struct timespec startTime_; 33 | #endif //_WIN32 34 | 35 | template 36 | T _max(T a,T b) 37 | { 38 | return (a > b ? 
a : b); 39 | } 40 | 41 | uint64_t getTime(unsigned long long scale) 42 | { 43 | uint64_t ticks; 44 | #if defined(_WIN32) 45 | LARGE_INTEGER currentTime; 46 | QueryPerformanceCounter(¤tTime); 47 | LONGLONG elapsedTime = currentTime.QuadPart - startTime_.QuadPart; 48 | 49 | // Compute the number of millisecond ticks elapsed. 50 | unsigned long msecTicks = 51 | (unsigned long)(1000 * elapsedTime / frequency_.QuadPart); 52 | // Check for unexpected leaps in the Win32 performance counter. 53 | // (This is caused by unexpected data across the PCI to ISA 54 | // bridge, aka south bridge. See Microsoft KB274323.) 55 | unsigned long elapsedTicks = GetTickCount() - startTick_; 56 | 57 | signed long msecOff = (signed long)(msecTicks - elapsedTicks); 58 | if (msecOff < -100 || msecOff > 100) { 59 | // Adjust the starting time forwards. 60 | LONGLONG msecAdjustment = 61 | _max(msecOff * 62 | frequency_.QuadPart / 1000, elapsedTime - 63 | prevElapsedTime_); 64 | startTime_.QuadPart += msecAdjustment; 65 | elapsedTime -= msecAdjustment; 66 | } 67 | // Store the current elapsed time for adjustments next time. 68 | prevElapsedTime_ = elapsedTime; 69 | 70 | ticks = (uint64_t)(scale*elapsedTime / frequency_.QuadPart); 71 | #else 72 | struct timespec tp; 73 | ::clock_gettime(CLOCK_MONOTONIC, &tp); 74 | // check for overflow 75 | if ((tp.tv_nsec - startTime_.tv_nsec) < 0) 76 | { 77 | // Remove a second from the second field and add it to the 78 | // nanoseconds field to prevent overflow. 79 | // Then scale 80 | ticks = (uint64_t) (tp.tv_sec - startTime_.tv_sec - 1) * scale 81 | + (uint64_t) ((1000ULL * 1000ULL * 1000ULL) + tp.tv_nsec - startTime_.tv_nsec) 82 | * scale / (1000ULL * 1000ULL * 1000ULL); 83 | } 84 | else 85 | { 86 | ticks = (uint64_t) (tp.tv_sec - startTime_.tv_sec) * scale 87 | + (uint64_t) (tp.tv_nsec - startTime_.tv_nsec) * scale / (1000ULL * 1000ULL * 1000ULL); 88 | } 89 | #endif //_WIN32 90 | 91 | return ticks; 92 | } 93 | 94 | public: 95 | //! 
Constructor 96 | Timer() 97 | { 98 | #if defined(_WIN32) 99 | QueryPerformanceFrequency(&frequency_); 100 | #endif 101 | reset(); 102 | } 103 | 104 | //! Destructor 105 | ~Timer() 106 | { 107 | } 108 | 109 | /*! 110 | * \brief Resets timer such that in essence the elapsed time is zero 111 | * from this point. 112 | */ 113 | void reset() 114 | { 115 | #if defined(_WIN32) 116 | QueryPerformanceCounter(&startTime_); 117 | startTick_ = GetTickCount(); 118 | prevElapsedTime_ = 0; 119 | #else 120 | ::clock_gettime(CLOCK_MONOTONIC, &startTime_); 121 | #endif 122 | } 123 | 124 | /*! 125 | * \brief Calculates the time since the last reset. 126 | * \returns The time in milli seconds since the last reset. 127 | */ 128 | uint64_t getTimeMilliseconds(void) 129 | { 130 | return getTime(1000ULL); 131 | } 132 | 133 | /*! 134 | * \brief Calculates the time since the last reset. 135 | * \returns The time in nano seconds since the last reset. 136 | */ 137 | uint64_t getTimeNanoseconds(void) 138 | { 139 | return getTime(1000ULL * 1000ULL * 1000ULL); 140 | } 141 | 142 | /*! 143 | * \brief Calculates the time since the last reset. 144 | * \returns The time in micro seconds since the last reset. 145 | */ 146 | uint64_t getTimeMicroseconds(void) 147 | { 148 | return getTime(1000ULL * 1000ULL); 149 | } 150 | 151 | /*! 152 | * \brief Calculates the tick rate for millisecond counter. 153 | */ 154 | float getMillisecondsTickRate(void) 155 | { 156 | return 1000.f; 157 | } 158 | 159 | /*! 160 | * \brief Calculates the tick rate for nanosecond counter. 161 | */ 162 | float getNanosecondsTickRate(void) 163 | { 164 | return (float) (1000ULL * 1000ULL * 1000ULL); 165 | } 166 | 167 | /*! 168 | * \brief Calculates the tick rate for microsecond counter. 
169 | */ 170 | float getMicrosecondsTickRate(void) 171 | { 172 | return (float) (1000ULL * 1000ULL); 173 | } 174 | }; 175 | 176 | #endif // UTILS_H 177 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MatCL 2 | 3 | 4 | [![License](https://licensebuttons.net/l/by-nc-nd/3.0/88x31.png)](https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode) 5 | [![DOI](https://zenodo.org/badge/DOI/10.1145/3204919.3204927.svg)](https://doi.org/10.1145/3204919.3204927) 6 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2531474.svg)](https://doi.org/10.5281/zenodo.2531474) 7 | 8 | 9 | MatCL is an OpenCL interface for MathWorks Matlab. This MEX-based toolbox aims at providing a simple and easy to use solution to transfer memory and launch OpenCL kernels from Matlab using a single command. 10 | In comparison to other Matlab OpenCL solutions, MatCL is not just an OpenCL API wrapper but encapsulates the low-level host API calls necessary to initialize devices, create OpenCL buffers from Matlab workspace variables and build and launch kernels. 11 | MatCL is primarily intended to help in the development and testing of OpenCL kernels by allowing to transparently pass data from and to Matlab. 12 | Because MatCL handles the entire low-level process, this toolbox makes it possible to execute kernels without in depth knowledge of the host implementation necessary to support the execution of OpenCL kernels. 13 | MatCL is also optimized to allow efficient execution of OpenCL kernels within Matlab to accelerate computationally intensive tasks without having to rely on Nvidia CUDA. In addition to single command kernel execution, MatCL also allows for an independent two-step kernel compilation and launch workflow to save the kernel compile time and allow efficient repetitive kernel execution. 
14 | 15 | A practical example for how MatCL can be used for scientific research is the [Induction Equation](https://github.com/IANW-Projects/InductionEq) project. 16 | 17 | Tested using Nvidia (Tesla, GTX), AMD (Ryzen, Radeon R9, FirePro) and Intel (Xeon, Core, HD Graphics) devices with Matlab R2015b and up. 18 | 19 | ## Usage 20 | 21 | Usage information for the individual functions is available through the Matlab `help` command (e.g. `help cl_get_devices`) and the documentation browser (e.g. `doc cl_get_devices`). 22 | 23 | - Enumerate OpenCL Devices (returns a list whose i-th entry corresponds to the i-th OpenCL device): 24 | `[names,dev_class,max_mem,max_wg_size,max_local_work_size,compute_units]=cl_get_devices;` 25 | - `names`: Names of all available devices 26 | - `dev_class`: The device class (CPU, GPU or Other for other or unknown Accelerators) 27 | - `max_mem`: The available device memory in bytes 28 | - `max_wg_size`: Max. size of OpenCL work group 29 | - `max_local_work_size`: Max. size of work items 30 | - `compute_units`: Number of compute units (e.g. 
CPU cores) of the device 31 | 32 | - Build Kernel: 33 | `[comp_time,kernels]=cl_run_kernel(ocl_dev_id,'kernel_url.cl','defines');` 34 | - `comp_time`: Microseconds it took to compile the kernels 35 | - `kernels`: List with names of all available kernels 36 | 37 | - `ocl_dev_id`: ID of the OpenCL device to be used 38 | - `kernel_url.cl`: URL of the kernel file 39 | - `defines`: List of OpenCL compiler defines 40 | 41 | - Run Kernel: 42 | `[run_time,copy_time]=cl_run_kernel(ocl_dev_id,{'kernel_function1','kernel_function2'},global_range,local_range,in1,out1,[rw_flags]);` 43 | - `run_time`: Microseconds it took to execute the kernels 44 | - `copy_time`: Microseconds it took to copy all buffers 45 | 46 | - `ocl_dev_id`: ID of the OpenCL device to be used 47 | - `kernel_function`: Cell array of kernel functions to execute (can also be a single string for just one kernel) 48 | - `global_range`: 3D global OpenCL range (see NDRange). If this vector has six entries, the first three define the 3D work offset followed by the 3D work size. 49 | - `local_range`: 3D local OpenCL range (see NDRange) 50 | - `in1, out1`: List of variables to pass from/to kernel 51 | - `rw_flags`: read/write flag for the Kernel variables, this can either be scalar (all variables are read&write) or a vector with an entry for each variable: 0 - read&write / 1 - kernel read only / 2 - kernel write only 52 | 53 | - Build & Run Kernel: 54 | `[run_time]=cl_run_kernel(ocl_dev_id,'kernel_url.cl ','defines ','kernel_function',global_range,local_range,in1,out1,[rw_flags]);` 55 | - `run_time`: Microseconds it took to execute the kernels 56 | 57 | - `ocl_dev_id`: ID of the OpenCL device to be used 58 | - `kernel_url.cl`: URL of the kernel file 59 | - `defines`: List of OpenCL compiler defines 60 | - `kernel_function`: Cell array of kernel functions to execute (can also be a single string for just one kernel) 61 | - `global_range`: 3D global OpenCL range (see NDRange). 
If this vector has six entries, the first three define the 3D work offset followed by the 3D work size. 62 | - `local_range`: Local OpenCL range (see NDRange) 63 | - `in1, out1`: List of variables to pass from/to kernel 64 | - `rw_flags`: read/write flag for the Kernel variables, this can either be scalar (all variables are read&write) or a vector with an entry for each variable: 0 - read&write / 1 - kernel read only / 2 - kernel write only 65 | 66 | - Build & Run Kernel (with Kernel printf redirection): 67 | `[run_time]=cl_dbg_kernel(ocl_dev_id,' kernel_url.cl ','defines ','kernel_function',global_range,local_range,in1,out1,[rw_flags]);` 68 | - `run_time`: Microseconds it took to execute the kernels (might be slower due to printf redirection) 69 | 70 | - `ocl_dev_id`: ID of the OpenCL device to be used 71 | - `kernel_url.cl`: URL of the kernel file 72 | - `defines`: List of OpenCL compiler defines 73 | - `kernel_function`: Cell array of kernel functions to execute (can also be a single string for just one kernel) 74 | - `global_range`: 3D global OpenCL range (see NDRange). If this vector has six entries, the first three define the 3D work offset followed by the 3D work size. 75 | - `local_range`: Local OpenCL range (see NDRange) 76 | - `in1, out1`: List of variables to pass from/to kernel 77 | - `rw_flags`: read/write flag for the Kernel variables, this can either be scalar (all variables are read&write) or a vector with an entry for each variable: 0 - read&write / 1 - kernel read only / 2 - kernel write only 78 | 79 | 80 | ## Setup 81 | 82 | Just use `git clone https://github.com/IANW-Projects/MatCL` and run `compile_linux.m`, `compile_windows.m`, or `compile_mac.m` to compile MatCL. Depending on the OpenCL libraries used, the library path may have to be changed in these files. 83 | Then add the folder `MatCL` to the search path of Matlab. 
84 | **Alternatively, some precompiled binaries are available at https://github.com/IANW-Projects/MatCL/releases.** 85 | 86 | There may be problems with old C/C++ libraries supplied by Matlab under Linux, resulting in errors such as 87 | `Invalid MEX-file '/..../cl_get_devices.mex64'`, followed by many missing symbols. If you use 88 | a Debian based system, install the package `matlab-support` via `sudo apt-get install matlab-support` 89 | and choose the option to rename the GCC libraries of Matlab during setup. 90 | 91 | 92 | ## Reference 93 | 94 | MatCL can be referenced using the DOI [10.1145/3204919.3204927](https://doi.org/10.1145/3204919.3204927) 95 | and the following bibtex entry. 96 | ``` 97 | @inproceedings{heinisch2018MatCL, 98 | title={{MatCL}: {A} new easy-to use {OpenCL} toolbox for {MathWorks} {Matlab}}, 99 | author={Heinisch, Philip and Ostaszewski, Katharina}, 100 | year={2018}, 101 | pages={8:1--8:1}, 102 | booktitle={Proceedings of the International Workshop on OpenCL}, 103 | series={IWOCL '18, May 2018, Oxford (United Kingdom)}, 104 | publisher={ACM}, 105 | address={New York, NY, USA}, 106 | note={\url{https://github.com/IANW-Projects/MatCL}}, 107 | doi={10.1145/3204919.3204927} 108 | } 109 | ``` 110 | The latest release can be cited with 111 | ``` 112 | @misc{MatCLGit, 113 | title={{MatCL}: {A} new easy-to use {OpenCL} toolbox for {MathWorks} {Matlab}}, 114 | author={Heinisch, Philip and Ostaszewski, Katharina and Ranocha, Hendrik}, 115 | month={01}, 116 | year={2019}, 117 | howpublished={\url{https://github.com/IANW-Projects/MatCL}}, 118 | doi={10.5281/zenodo.2531474} 119 | } 120 | ``` 121 | ## License 122 | 123 | This project is licensed under the terms of the Creative Commons [CC BY-NC-ND 4.0](https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode) license. 124 | 125 | 126 | ## Disclaimer 127 | 128 | Product and company names may be trademarks or registered trademarks of their respective holders. 
129 | Use of them does not imply any affiliation with or endorsement by them or their affiliates. 130 | Everything is provided as is and without warranty. Use at your own risk! 131 | -------------------------------------------------------------------------------- /src/cl_run_kernel.cpp: -------------------------------------------------------------------------------- 1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */ 2 | 3 | #include 4 | #include "mex.h" 5 | #include "matrix.h" 6 | #include 7 | #include 8 | #if defined(_WIN32) 9 | #include 10 | #include 11 | #define access _access_s 12 | #else 13 | #include 14 | #endif 15 | 16 | 17 | 18 | 19 | #define CL_HPP_ENABLE_EXCEPTIONS 20 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120 21 | #define CL_HPP_TARGET_OPENCL_VERSION 120 22 | 23 | #include 24 | #include "ocl_dev_mgr.hpp" 25 | 26 | 27 | 28 | #include "MatCL.hpp" 29 | 30 | 31 | class mystream : public std::streambuf 32 | { 33 | protected: 34 | virtual std::streamsize xsputn(const char *s, std::streamsize n) { mexPrintf("%.*s", n, s); return n; } 35 | virtual int overflow(int c = EOF) { if (c != EOF) { mexPrintf("%.1s", &c); } return 1; } 36 | }; 37 | class scoped_redirect_cout 38 | { 39 | public: 40 | scoped_redirect_cout() { old_buf = std::cout.rdbuf(); std::cout.rdbuf(&mout); } 41 | ~scoped_redirect_cout() { std::cout.rdbuf(old_buf); } 42 | private: 43 | mystream mout; 44 | std::streambuf *old_buf; 45 | }; 46 | static scoped_redirect_cout mycout_redirect; 47 | 48 | 49 | 50 | 51 | void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray*prhs[] ) 52 | { 53 | 54 | size_t buflen; 55 | char *buf; 56 | char *settings; 57 | char *kernel_name_c; 58 | bool blocking = CL_FALSE; 59 | uint64_t mem_needed = 0; 60 | 61 | std::vector data_in; 62 | std::vector data_size; 63 | std::vector dev_Buffers; 64 | 65 | uint32_t global_range_x = 1; 66 | uint32_t global_range_y = 1; 67 | uint32_t global_range_z = 1; 68 | uint32_t 
range_start_x = 0; 69 | uint32_t range_start_y = 0; 70 | uint32_t range_start_z = 0; 71 | cl::NDRange range_start = cl::NullRange; 72 | cl::NDRange global_range; 73 | cl::NDRange local_range; 74 | 75 | uint64_t startTransfer, transferTime; 76 | 77 | Timer timer; //used to track performance 78 | 79 | ocl_dev_mgr& dev_mgr = ocl_dev_mgr::getInstance(); 80 | 81 | uint32_t device = (uint32_t)mxGetScalar(prhs[0]) - 1; 82 | 83 | if (nrhs>2) { 84 | 85 | if (device 0) && (mxIsChar(prhs[2]) == 0)) { 93 | old_instance = true; 94 | // mexPrintf("Old instance found, running kernels only...\n"); 95 | } 96 | 97 | //only build kernels - does not execute anything 98 | if (nrhs == 3) { 99 | compile_only = true; 100 | 101 | if (mxIsCell(prhs[1]) == true) { 102 | mexPrintf("Building multiple kernel files...\n"); 103 | } 104 | else { 105 | mexPrintf("Building single kernel file...\n"); 106 | } 107 | } 108 | 109 | if (compile_only == true) { 110 | 111 | mexPrintf("Device: %s\n", dev_mgr.get_avail_dev_info(device).name.c_str()); 112 | 113 | dev_mgr.deinitalize(); 114 | 115 | dev_mgr.init_device(device); 116 | 117 | buflen = mxGetN(prhs[nrhs - 1]) + 1; //get Kernel Settings 118 | settings = (char *)mxMalloc(buflen); 119 | mxGetString(prhs[nrhs - 1], settings, (mwSize)buflen); 120 | 121 | 122 | //get Kernel URL 123 | getKernel_info(plhs, nrhs, prhs, &dev_mgr); 124 | 125 | 126 | uint64_t kernels_found = 0; 127 | uint64_t comp_time; 128 | comp_time= timer.getTimeMicroseconds(); 129 | 130 | kernels_found = dev_mgr.compile_kernel(0, "ocl_Kernel", settings); 131 | 132 | uint64_t *comp_time_ptr; 133 | plhs[0] = mxCreateNumericMatrix(1, 1, mxUINT64_CLASS, mxREAL); 134 | comp_time_ptr = (uint64_t *)mxGetData(plhs[0]); 135 | comp_time_ptr[0] = timer.getTimeMicroseconds()-comp_time; 136 | 137 | // transferTime = (timer.getTimeMicroseconds() - startTransfer); 138 | // mexPrintf("Copy: %d\n", transferTime); 139 | 140 | if (kernels_found > 0) { 141 | mxArray * tmp_str; 142 | mxArray *cell_array_ptr; 
143 | cell_array_ptr = mxCreateCellMatrix((mwSize)kernels_found, 1); 144 | for (uint32_t i = 0; i < kernels_found; i++) { 145 | // mexPrintf("test: %s\n ", dev_mgr.getKernelbyID(i)->getInfo()); 146 | std::string kernel_name(dev_mgr.getKernelbyID(0, "ocl_Kernel",i)->getInfo()); 147 | tmp_str = mxCreateString(kernel_name.c_str()); 148 | mxSetCell(cell_array_ptr, i, mxDuplicateArray(tmp_str)); 149 | } 150 | 151 | plhs[1] = cell_array_ptr; 152 | 153 | } 154 | 155 | // mexLock(); //prevent matlab from unloading mex file to keep context alive 156 | 157 | } 158 | 159 | //this part only runs the kernel 160 | if ((compile_only == false) && (old_instance == true)) { 161 | 162 | uint32_t num_in = (uint32_t)nrhs-5;//Number of input buffers 163 | 164 | uint32_t var_offset = 4; 165 | uint64_t copy_time; 166 | 167 | std::vector kernel_list; 168 | 169 | mwSize cell_dims; 170 | mxArray *cellElement; 171 | if (mxIsCell(prhs[1]) == true) { 172 | 173 | 174 | cell_dims = mxGetNumberOfElements(prhs[1]); 175 | for (uint32_t icell = 0; icell < cell_dims; icell++) { 176 | cellElement = mxGetCell(prhs[1], icell); 177 | 178 | buflen = mxGetN(cellElement) + 1; 179 | //mexPrintf("Size: %d\n", buflen); 180 | 181 | char *kernel_name_c; 182 | kernel_name_c = (char *)mxMalloc(buflen); 183 | mxGetString(cellElement, kernel_name_c, (mwSize)buflen); 184 | kernel_list.push_back(std::string(kernel_name_c)); 185 | // mexPrintf("Kernel-Name: %s\n", kernel_name_c); 186 | } 187 | } 188 | else { 189 | buflen = mxGetN(prhs[1]) + 1; //get Kernel Name 190 | kernel_name_c = (char *)mxMalloc(buflen); 191 | mxGetString(prhs[1], kernel_name_c, (mwSize)buflen); 192 | kernel_list.push_back(std::string(kernel_name_c)); 193 | //mexPrintf("Kernel-Name: %s\n", kernel_name.c_str()); 194 | 195 | } 196 | 197 | //NDRange settings 198 | //global range 199 | 200 | size_t mrows = mxGetM(prhs[2]); 201 | size_t ncols = mxGetN(prhs[2]); 202 | 203 | if ((mxIsDouble(prhs[2]) || (mxGetClassID(prhs[2]) == mxUINT32_CLASS)) && 
!mxIsComplex(prhs[2]) && (mrows * ncols == 3)) { 204 | if (mxIsDouble(prhs[2])) { 205 | double *range_ptr; 206 | range_ptr = mxGetPr(prhs[2]); 207 | 208 | global_range_x = (uint32_t)round(range_ptr[0]); 209 | global_range_y = (uint32_t)round(range_ptr[1]); 210 | global_range_z = (uint32_t)round(range_ptr[2]); 211 | } 212 | else { 213 | uint32_t *range_ptr; 214 | range_ptr = (uint32_t *)mxGetData(prhs[2]); 215 | 216 | global_range_x = (uint32_t)(range_ptr[0]); 217 | global_range_y = (uint32_t)(range_ptr[1]); 218 | global_range_z = (uint32_t)(range_ptr[2]); 219 | 220 | } 221 | 222 | global_range = cl::NDRange(global_range_x, global_range_y, global_range_z); 223 | 224 | } 225 | else { 226 | if ((mxIsDouble(prhs[2]) || (mxGetClassID(prhs[2]) == mxUINT32_CLASS)) && !mxIsComplex(prhs[2]) && (mrows * ncols == 6)) { 227 | if (mxIsDouble(prhs[2])) { 228 | double *range_ptr; 229 | range_ptr = mxGetPr(prhs[2]); 230 | range_start_x = (uint32_t)round(range_ptr[0]); 231 | range_start_y = (uint32_t)round(range_ptr[1]); 232 | range_start_z = (uint32_t)round(range_ptr[2]); 233 | 234 | global_range_x = (uint32_t)round(range_ptr[3]); 235 | global_range_y = (uint32_t)round(range_ptr[4]); 236 | global_range_z = (uint32_t)round(range_ptr[5]); 237 | 238 | } 239 | else { 240 | uint32_t *range_ptr; 241 | range_ptr = (uint32_t *)mxGetData(prhs[2]); 242 | 243 | range_start_x = (uint32_t)(range_ptr[0]); 244 | range_start_y = (uint32_t)(range_ptr[1]); 245 | range_start_z = (uint32_t)(range_ptr[2]); 246 | 247 | global_range_x = (uint32_t)(range_ptr[3]); 248 | global_range_y = (uint32_t)(range_ptr[4]); 249 | global_range_z = (uint32_t)(range_ptr[5]); 250 | } 251 | 252 | range_start = cl::NDRange(range_start_x, range_start_y, range_start_z); 253 | global_range = cl::NDRange(global_range_x, global_range_y, global_range_z); 254 | } 255 | else { 256 | mexErrMsgIdAndTxt("OpenCL:NDRange", "Invalid global range defined!"); 257 | } 258 | } 259 | 260 | //local range 261 | 262 | mrows = mxGetM(prhs[3]); 
263 | ncols = mxGetN(prhs[3]); 264 | 265 | if ((mxIsDouble(prhs[3]) || (mxGetClassID(prhs[3]) == mxUINT32_CLASS)) && !mxIsComplex(prhs[3]) && (mrows + ncols == 4)) { 266 | if (mxIsDouble(prhs[3])) { 267 | double *range_ptr; 268 | range_ptr = mxGetPr(prhs[3]); 269 | global_range_x = (uint32_t)round(range_ptr[0]); 270 | global_range_y = (uint32_t)round(range_ptr[1]); 271 | global_range_z = (uint32_t)round(range_ptr[2]); 272 | 273 | } 274 | else { 275 | uint32_t *range_ptr; 276 | range_ptr = (uint32_t *)mxGetData(prhs[3]); 277 | 278 | global_range_x = (uint32_t)(range_ptr[0]); 279 | global_range_y = (uint32_t)(range_ptr[1]); 280 | global_range_z = (uint32_t)(range_ptr[2]); 281 | } 282 | 283 | local_range = cl::NDRange(global_range_x, global_range_y, global_range_z); 284 | 285 | } 286 | else { 287 | if (mrows + ncols == 2) { 288 | local_range = cl::NullRange; 289 | } 290 | else { 291 | mexErrMsgIdAndTxt("OpenCL:NDRange", "Invalid local range defined!"); 292 | return; 293 | } 294 | } 295 | 296 | runkernel(plhs, nrhs, prhs, kernel_list, num_in,var_offset, &dev_mgr, device, range_start,global_range, local_range, false,false,copy_time); 297 | 298 | uint64_t *copy_time_ptr; 299 | plhs[1] = mxCreateNumericMatrix(1, 1, mxUINT64_CLASS, mxREAL); 300 | copy_time_ptr = (uint64_t *)mxGetData(plhs[1]); 301 | copy_time_ptr[0] = copy_time; 302 | 303 | 304 | 305 | } 306 | 307 | //this part compiles and runs kernel 308 | ////////////////////////////////////////////////////////////////////////////////////////////////////////// 309 | if ((compile_only == false) && (old_instance == false)) { 310 | //mexPrintf("Compile and run...\n"); 311 | 312 | dev_mgr.deinitalize(); 313 | 314 | compilerun(plhs, nrhs, prhs, &dev_mgr, device, false,false); 315 | } 316 | } 317 | 318 | else { 319 | mexErrMsgIdAndTxt("MATLAB:cl_dev", "OpenCl Device not found!"); 320 | } 321 | } else { 322 | mexErrMsgIdAndTxt("MATLAB:syntax", "Incorrect Syntax!"); 323 | } 324 | 325 | 326 | 327 | return; 328 | 329 | } 330 | 
-------------------------------------------------------------------------------- /src/ocl_dev_mgr.cpp: -------------------------------------------------------------------------------- 1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "ocl_dev_mgr.hpp" 9 | 10 | 11 | #if defined(_WIN32) 12 | #include 13 | #define access _access_s 14 | #else 15 | #include 16 | #endif 17 | 18 | 19 | // macros 20 | #define STRINGIZE_(x) #x 21 | #define STRINGIZE(x) STRINGIZE_(x) 22 | 23 | #define ERROR_INFO "Error in line " STRINGIZE(__LINE__) " of " __FILE__ ":\n " 24 | 25 | 26 | // file system functions 27 | 28 | inline bool fileExists(std::string const& filename) 29 | { 30 | return access(filename.c_str(), 0) == 0; 31 | } 32 | 33 | 34 | 35 | inline void compile(cl::Program& cl_prog, char const* options) 36 | { 37 | std::string compile_options = std::string(" ") + std::string(options); 38 | 39 | try { 40 | cl_prog.build(compile_options.c_str()); 41 | } 42 | catch (cl::BuildError error) { 43 | std::string log = error.getBuildLog()[0].second; 44 | std::cerr << ERROR_INFO << "Build error:\n" << log << std::endl; 45 | } 46 | catch (cl::Error err) { 47 | std::cerr << ERROR_INFO << "Exception:" << err.what() << std::endl; 48 | } 49 | } 50 | 51 | 52 | inline std::string loadProgram(std::string const& input_filename) 53 | { 54 | std::ifstream input(input_filename.c_str()); 55 | if (!input.is_open()) { 56 | std::cerr << ERROR_INFO << "Cannot open file '" << input_filename << "'." 
<< std::endl; 57 | exit(1); 58 | } 59 | 60 | return std::string(std::istreambuf_iterator(input), (std::istreambuf_iterator())); 61 | } 62 | 63 | 64 | ocl_dev_mgr::ocl_dev_mgr() { 65 | initialize(); 66 | } 67 | 68 | 69 | cl::Kernel* ocl_dev_mgr::getKernelbyName(cl_uint context_idx, std::string const& prog_name, std::string const& kernel_name) 70 | { 71 | auto it_p = find(con_list.at(context_idx).prog_names.begin(), con_list.at(context_idx).prog_names.end(), prog_name); 72 | if (it_p == con_list.at(context_idx).prog_names.end()) { 73 | return nullptr; 74 | } 75 | 76 | uint32_t idx = distance(con_list.at(context_idx).prog_names.begin(), it_p); 77 | 78 | if (con_list.at(context_idx).kernels.at(idx).size() > 1) { 79 | for (cl_uint i = 0; i < con_list.at(context_idx).kernels.at(idx).size(); i++) { 80 | if (kernel_name == con_list.at(context_idx).kernel_names.at(idx).at(i)) { 81 | return &(con_list.at(context_idx).kernels.at(idx).at(i)); 82 | } 83 | } 84 | } 85 | 86 | return &(con_list.at(context_idx).kernels.at(idx).at(0)); 87 | } 88 | 89 | cl::Kernel* ocl_dev_mgr::getKernelbyID(cl_uint context_idx, std::string const& prog_name, cl_ulong kernel_id) 90 | { 91 | auto it_p = find(con_list.at(context_idx).prog_names.begin(), con_list.at(context_idx).prog_names.end(), prog_name); 92 | if (it_p == con_list.at(context_idx).prog_names.end()) { 93 | return nullptr; 94 | } 95 | 96 | uint32_t idx = distance(con_list.at(context_idx).prog_names.begin(), it_p); 97 | 98 | return &(con_list.at(context_idx).kernels.at(idx).at(kernel_id)); 99 | } 100 | 101 | 102 | std::string ocl_dev_mgr::getDeviceType(cl_uint avail_device_idx) 103 | { 104 | if (available_devices.at(avail_device_idx).type == CL_DEVICE_TYPE_CPU) { 105 | return(type_cpu_str); 106 | } 107 | else if (available_devices.at(avail_device_idx).type == CL_DEVICE_TYPE_GPU) { 108 | return(type_gpu_str); 109 | } 110 | else if (available_devices.at(avail_device_idx).type == CL_DEVICE_TYPE_ACCELERATOR) { 111 | return(type_acc_str); 112 
| } 113 | else { 114 | return(type_other_str); 115 | } 116 | } 117 | 118 | std::string ocl_dev_mgr::getDevicePCIeID(cl_uint avail_device_idx) 119 | { 120 | #define CL_DEVICE_PCI_BUS_ID_NV 0x4008 121 | #define CL_DEVICE_PCI_SLOT_ID_NV 0x4009 122 | #define CL_DEVICE_TOPOLOGY_AMD 0x4037 123 | typedef union 124 | { 125 | struct { cl_uint type; cl_uint data[5]; } raw; 126 | struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie; 127 | } cl_device_topology_amd; 128 | 129 | cl_device_topology_amd amd_topo; 130 | cl_int bus_id; 131 | cl_int slot_id; 132 | std::ostringstream tmp_stream; 133 | 134 | std::size_t found = 0; 135 | found = available_devices.at(avail_device_idx).vendor.find("NVIDIA"); 136 | if (found != std::string::npos) { 137 | available_devices.at(avail_device_idx).device.getInfo(CL_DEVICE_PCI_BUS_ID_NV,&bus_id); 138 | available_devices.at(avail_device_idx).device.getInfo(CL_DEVICE_PCI_SLOT_ID_NV, &slot_id); 139 | 140 | cl_uint domain, bus, dev, func; 141 | domain = bus_id >> 8; 142 | bus = bus_id & 0xff; 143 | tmp_stream << domain << ":" << bus << ":" << slot_id; 144 | } 145 | else 146 | { 147 | found = available_devices.at(avail_device_idx).vendor.find("Advanced Micro Devices"); 148 | if (found != std::string::npos) { 149 | available_devices.at(avail_device_idx).device.getInfo(CL_DEVICE_TOPOLOGY_AMD, &amd_topo); 150 | tmp_stream << "0:" << (unsigned int)amd_topo.pcie.bus << ":" << (unsigned int)amd_topo.pcie.device; //Domain is not returned? 
151 | } 152 | } 153 | 154 | 155 | return tmp_stream.str(); 156 | } 157 | 158 | cl_int bus_id; 159 | cl_int slot_id; 160 | 161 | cl_ulong ocl_dev_mgr::getDeviceList(std::vector& devices) 162 | { 163 | // Get list of platforms 164 | std::vector platforms; 165 | cl::Platform::get(&platforms); 166 | 167 | // Enumerate devices 168 | for (cl::Platform const& platform : platforms) 169 | { 170 | std::vector plat_devices; 171 | platform.getDevices(CL_DEVICE_TYPE_ALL, &plat_devices); 172 | devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); 173 | } 174 | 175 | return devices.size(); 176 | } 177 | 178 | 179 | cl_ulong ocl_dev_mgr::init_device(cl_uint avail_device_idx) 180 | { 181 | ocl_context tmp_context; 182 | 183 | tmp_context.devices.push_back(available_devices.at(avail_device_idx)); 184 | 185 | std::vector tmp_devices; 186 | tmp_devices.push_back(available_devices.at(avail_device_idx).device); 187 | 188 | cl::Context context(tmp_devices, NULL); 189 | tmp_context.context = context; 190 | 191 | tmp_context.queues.push_back(cl::CommandQueue(tmp_context.context, CL_QUEUE_PROFILING_ENABLE)); 192 | //push second queue for async copy 193 | tmp_context.queues.push_back(cl::CommandQueue(tmp_context.context, CL_QUEUE_PROFILING_ENABLE)); 194 | 195 | con_list.push_back(tmp_context); 196 | 197 | return con_list.size(); 198 | } 199 | 200 | cl::CommandQueue& ocl_dev_mgr::get_queue(cl_uint context_idx, cl_uint queue_idx) 201 | { 202 | return con_list.at(context_idx).queues.at(queue_idx); 203 | } 204 | 205 | cl::Context& ocl_dev_mgr::get_context(cl_uint context_idx) 206 | { 207 | return con_list.at(context_idx).context; 208 | } 209 | 210 | cl_ulong ocl_dev_mgr::get_avail_dev_num() 211 | { 212 | return num_available_devices; 213 | } 214 | 215 | cl_ulong ocl_dev_mgr::get_context_num() 216 | { 217 | return con_list.size(); 218 | } 219 | 220 | 221 | bool ocl_dev_mgr::add_program_url(cl_uint context_idx, std::string prog_name, std::string const& url) 222 | { 223 | if 
(!fileExists(url)) { 224 | return false; 225 | } 226 | 227 | return add_program_str(context_idx, prog_name, loadProgram(url)); 228 | } 229 | 230 | bool ocl_dev_mgr::add_program_str(cl_uint context_idx, std::string prog_name, std::string kernel) 231 | { 232 | con_list.at(context_idx).programs.push_back(cl::Program(con_list.at(context_idx).context, kernel)); 233 | con_list.at(context_idx).prog_names.push_back(prog_name); 234 | con_list.at(context_idx).kernels.resize(con_list.at(context_idx).kernels.size() + 1); 235 | con_list.at(context_idx).kernel_names.resize(con_list.at(context_idx).kernel_names.size() + 1); 236 | return true; 237 | } 238 | 239 | 240 | cl::Program& ocl_dev_mgr::get_program(cl_uint context_idx, std::string const& prog_name) 241 | { 242 | auto it_p = find(con_list.at(context_idx).prog_names.begin(), con_list.at(context_idx).prog_names.end(), prog_name); 243 | if (it_p != con_list.at(context_idx).prog_names.end()) { 244 | return con_list.at(context_idx).programs.at(distance(con_list.at(context_idx).prog_names.begin(), it_p)); 245 | } 246 | else { 247 | std::cerr << ERROR_INFO << "Program '" << prog_name << "' not found." << std::endl; 248 | //TODO: Exception? 
249 | return con_list.at(context_idx).programs.at(0); 250 | } 251 | } 252 | 253 | 254 | ocl_dev_mgr::ocl_device_info& ocl_dev_mgr::get_avail_dev_info(cl_uint avail_device_idx) 255 | { 256 | return available_devices.at(avail_device_idx); 257 | } 258 | 259 | 260 | ocl_dev_mgr::ocl_device_info& ocl_dev_mgr::get_context_dev_info(cl_uint context_idx, cl_uint device_idx) 261 | { 262 | return con_list.at(context_idx).devices.at(device_idx); 263 | } 264 | 265 | 266 | // return execution time in µs 267 | cl_ulong ocl_dev_mgr::execute_kernel(cl::Kernel& kernel, cl::CommandQueue& queue, 268 | cl::NDRange global_range, cl::NDRange local_range, 269 | std::vector& dev_Buffers) 270 | { 271 | cl::Event event; 272 | cl_ulong time_start, time_end; 273 | 274 | try { 275 | for (cl_uint i = 0; i < dev_Buffers.size(); i++) { 276 | kernel.setArg(i, *dev_Buffers[i]); 277 | } 278 | 279 | queue.enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range, NULL, &event); 280 | event.wait(); 281 | event.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end); 282 | event.getProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, &time_start); 283 | } 284 | catch (cl::BuildError error) { 285 | std::string log = error.getBuildLog()[0].second; 286 | std::cerr << ERROR_INFO << "Build error:\n" << log << std::endl; 287 | } 288 | catch (cl::Error err) { 289 | std::cerr << ERROR_INFO << "Exception:" << err.what() << std::endl; 290 | } 291 | 292 | return (time_end - time_start) / 1000; 293 | } 294 | 295 | 296 | // return execution time in µs 297 | cl_ulong ocl_dev_mgr::execute_kernelNA(cl::Kernel& kernel, cl::CommandQueue& queue, 298 | cl::NDRange range_start, cl::NDRange global_range, cl::NDRange local_range) 299 | { 300 | cl::Event event; 301 | cl_ulong time_start, time_end; 302 | 303 | try { 304 | queue.enqueueNDRangeKernel(kernel, range_start, global_range, local_range, NULL, &event); 305 | event.wait(); 306 | event.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end); 307 | 
event.getProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, &time_start); 308 | } 309 | catch (cl::BuildError error) { 310 | std::string log = error.getBuildLog()[0].second; 311 | std::cerr << ERROR_INFO << "Build error:\n" << log << std::endl; 312 | } 313 | catch (cl::Error err) { 314 | std::cerr << ERROR_INFO << "Exception:" << err.what() << std::endl; 315 | } 316 | 317 | return (time_end - time_start) / 1000; 318 | } 319 | 320 | // don't return execution time in µs 321 | void ocl_dev_mgr::execute_kernel_async(cl::Kernel& kernel, cl::CommandQueue& queue, 322 | cl::NDRange global_range, cl::NDRange local_range, 323 | std::vector& dev_Buffers) 324 | { 325 | try { 326 | for (cl_uint i = 0; i < dev_Buffers.size(); i++) { 327 | kernel.setArg(i, *dev_Buffers[i]); 328 | } 329 | 330 | queue.enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range, NULL, NULL); 331 | } 332 | catch (cl::BuildError error) { 333 | std::string log = error.getBuildLog()[0].second; 334 | std::cerr << ERROR_INFO << "Build error:\n" << log << std::endl; 335 | } 336 | catch (cl::Error err) { 337 | std::cerr << ERROR_INFO << "Exception:" << err.what() << std::endl; 338 | } 339 | } 340 | 341 | 342 | // Compile kernels and return the number of compiled kernels. 343 | cl_ulong ocl_dev_mgr::compile_kernel(cl_uint context_idx, std::string const& prog_name, std::string const& options) 344 | { 345 | std::string compile_options = std::string(" ") + options; 346 | 347 | auto it_p = find(con_list.at(context_idx).prog_names.begin(), con_list.at(context_idx).prog_names.end(), prog_name); 348 | if (it_p == con_list.at(context_idx).prog_names.end()) { 349 | std::cerr << ERROR_INFO << "Program '" << prog_name << "' not found." << std::endl; 350 | //TODO: Exception? 
351 | return 0; 352 | } 353 | 354 | int32_t idx = distance(con_list.at(context_idx).prog_names.begin(), it_p); 355 | 356 | try { 357 | con_list.at(context_idx).programs.at(idx).build(compile_options.c_str()); 358 | } 359 | catch (cl::BuildError error) { 360 | std::string log = error.getBuildLog()[0].second; 361 | std::cerr << ERROR_INFO << "Build error:\n" << log << std::endl; 362 | } 363 | catch (cl::Error err) { 364 | std::cerr << ERROR_INFO << "Exception:" << err.what() << std::endl; 365 | } 366 | 367 | con_list.at(context_idx).programs.at(idx).createKernels(&(con_list.at(context_idx).kernels.at(idx))); 368 | 369 | con_list.at(context_idx).kernel_names.at(idx).clear(); //make sure to clear kernel_names list 370 | 371 | for (uint32_t i = 0; i < con_list.at(context_idx).kernels.at(idx).size(); i++) { 372 | con_list.at(context_idx).kernel_names.at(idx).push_back(con_list.at(context_idx).kernels.at(idx).at(i).getInfo()); 373 | } 374 | 375 | return con_list.at(context_idx).kernels.at(idx).size(); 376 | } 377 | 378 | 379 | cl_ulong ocl_dev_mgr::get_kernel_names(cl_uint context_idx, std::string const& prog_name, std::vector& found_kernels) 380 | { 381 | auto it_p = find(con_list.at(context_idx).prog_names.begin(), con_list.at(context_idx).prog_names.end(), prog_name); 382 | if (it_p == con_list.at(context_idx).prog_names.end()) { 383 | std::cerr << ERROR_INFO << "Program '" << prog_name << "' not found." << std::endl; 384 | //TODO: Exception? 
385 | return 0; 386 | } 387 | 388 | int32_t idx = distance(con_list.at(context_idx).prog_names.begin(), it_p); 389 | 390 | for (uint32_t kernel_id = 0; kernel_id < con_list.at(context_idx).kernel_names.at(idx).size(); kernel_id++) { 391 | found_kernels.push_back(con_list.at(context_idx).kernel_names.at(idx).at(kernel_id)); 392 | } 393 | 394 | return con_list.at(context_idx).kernel_names.at(idx).size(); 395 | } 396 | 397 | 398 | void ocl_dev_mgr::initialize() 399 | { 400 | std::vector tmp_devices; 401 | getDeviceList(tmp_devices); 402 | num_available_devices = tmp_devices.size(); 403 | 404 | available_devices = std::vector(num_available_devices); 405 | 406 | for (size_t i = 0; i < tmp_devices.size(); i++) { 407 | 408 | available_devices.at(i).device = tmp_devices.at(i); 409 | std::vector tmp_size; 410 | 411 | available_devices.at(i).device.getInfo(CL_DEVICE_GLOBAL_MEM_SIZE, &available_devices.at(i).max_mem); 412 | available_devices.at(i).device.getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &available_devices.at(i).max_mem_alloc); 413 | available_devices.at(i).device.getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, &available_devices.at(i).lw_dim); 414 | available_devices.at(i).device.getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &available_devices.at(i).wg_size); 415 | available_devices.at(i).device.getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &tmp_size); 416 | available_devices.at(i).lw_size = tmp_size.at(0); 417 | available_devices.at(i).device.getInfo(CL_DEVICE_NAME, &available_devices.at(i).name); 418 | available_devices.at(i).device.getInfo(CL_DEVICE_VERSION, &available_devices.at(i).ocl_version); 419 | available_devices.at(i).device.getInfo(CL_DEVICE_TYPE, &available_devices.at(i).type); 420 | available_devices.at(i).device.getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &available_devices.at(i).compute_units); 421 | available_devices.at(i).device.getInfo(CL_DEVICE_PLATFORM, &available_devices.at(i).platform); 422 | available_devices.at(i).device.getInfo(CL_DEVICE_VENDOR, 
&available_devices.at(i).vendor); 423 | available_devices.at(i).platform.getInfo(CL_PLATFORM_NAME, &available_devices.at(i).platform_name); 424 | } 425 | } 426 | 427 | 428 | void ocl_dev_mgr::deinitalize() 429 | { 430 | //Deinitialization should be performed automatically, but there seems to be segfaults 431 | //under certain conditions using Windows, hence the vetor is cleared manually 432 | con_list.clear(); 433 | } -------------------------------------------------------------------------------- /src/MatCL.hpp: -------------------------------------------------------------------------------- 1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */ 2 | 3 | #ifndef MATCL_H 4 | #define MATCL_H 5 | 6 | #include 7 | #include "mex.h" 8 | #include "matrix.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #if defined(_WIN32) 15 | #include 16 | #include 17 | #include 18 | #endif 19 | 20 | 21 | #define CL_HPP_ENABLE_EXCEPTIONS 22 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120 23 | #define CL_HPP_TARGET_OPENCL_VERSION 120 24 | 25 | #include 26 | #include "ocl_dev_mgr.hpp" 27 | 28 | 29 | #include "utils.hpp" 30 | inline bool FileExists(const std::string &Filename) 31 | { 32 | return access(Filename.c_str(), 0) == 0; 33 | } 34 | 35 | inline std::string loadProgram(std::string input) 36 | { 37 | std::ifstream stream(input.c_str()); 38 | if (!stream.is_open()) { 39 | std::cout << "Cannot open file: " << input << std::endl; 40 | exit(1); 41 | } 42 | 43 | return std::string( 44 | std::istreambuf_iterator(stream), 45 | (std::istreambuf_iterator())); 46 | } 47 | 48 | inline void remove_empty_lines(std::istream& in, std::ostream& out) 49 | { 50 | std::string line; 51 | 52 | while (std::getline(in, line)) { 53 | bool is_empty = true; 54 | if (!line.empty()) { 55 | for (uint32_t i = 0; i < line.length(); i++) 56 | { 57 | if ((line.at(i) != 32) && (line.at(i) != '\n')) { 58 | is_empty = false; 59 | } 60 | } 61 | } 62 | 
if (is_empty == false) { 63 | out << line << '\n'; 64 | } 65 | } 66 | } 67 | 68 | 69 | int32_t getKernel_info(mxArray *plhs[], int nrhs, const mxArray*prhs[], ocl_dev_mgr *dev_mgr) { 70 | 71 | size_t buflen; 72 | 73 | //get Kernel URL 74 | std::string kernel_data; 75 | 76 | mwSize cell_dims; 77 | mxArray *cellElement; 78 | if (mxIsCell(prhs[1]) == true) { 79 | 80 | 81 | cell_dims = mxGetNumberOfElements(prhs[1]); 82 | for (uint32_t icell = 0; icell < cell_dims; icell++) { 83 | cellElement = mxGetCell(prhs[1], icell); 84 | 85 | buflen = mxGetN(cellElement) + 1; 86 | //mexPrintf("Size: %d\n", buflen); 87 | 88 | char *kernel_url_c; 89 | kernel_url_c = (char *)mxMalloc(buflen); 90 | mxGetString(cellElement, kernel_url_c, (mwSize)buflen); 91 | std::string kernel_url(kernel_url_c); 92 | //mexPrintf("Kernel-URL: %s\n", kernel_url_c); 93 | if (FileExists(kernel_url) == true) { 94 | kernel_data.append(loadProgram(kernel_url)); 95 | kernel_data.append("\n"); 96 | } 97 | else { 98 | mexErrMsgIdAndTxt("MATLAB:cl_program", "OpenCl Kernel file not found!"); 99 | return -1; 100 | } 101 | 102 | } 103 | dev_mgr->add_program_str(0, "ocl_Kernel", kernel_data); 104 | } 105 | else { 106 | char *kernel_url_c; 107 | buflen = mxGetN(prhs[1]) + 1; 108 | kernel_url_c = (char *)mxMalloc(buflen); 109 | mxGetString(prhs[1], kernel_url_c, (mwSize)buflen); 110 | std::string kernel_url(kernel_url_c); 111 | //mexPrintf("Kernel-URL: %s\n", kernel_url_c); 112 | if (dev_mgr->add_program_url(0, "ocl_Kernel", kernel_url) < 0) { //Add kernel source 113 | mexErrMsgIdAndTxt("MATLAB:cl_program", "OpenCl Kernel file not found!"); 114 | return -1; 115 | } 116 | 117 | } 118 | 119 | 120 | return 0; 121 | 122 | } 123 | 124 | int32_t runkernel(mxArray *plhs[], int nrhs, const mxArray*prhs[], std::vector &kernel_list, uint32_t num_in, uint32_t mvar_offset, ocl_dev_mgr *dev_mgr, uint32_t device, cl::NDRange range_start, cl::NDRange global_range, cl::NDRange local_range, bool debug_mode, bool log_file, uint64_t 
©_time) 125 | { 126 | size_t buflen; 127 | char *buf; 128 | char *settings; 129 | char *kernel_name_c; 130 | bool blocking = CL_FALSE; 131 | uint64_t mem_needed = 0; 132 | 133 | std::vector data_in; 134 | std::vector data_size; 135 | 136 | uint64_t startTransfer, transferTime; 137 | Timer timer; //used to track performance 138 | 139 | //used for kernel printf 140 | #if defined(_WIN32) 141 | COORD buffer_size; 142 | SMALL_RECT rect; 143 | #endif 144 | #if !defined(_WIN32) 145 | char buffer[4096]; 146 | auto fp = fmemopen(buffer, 4096, "w"); 147 | auto old = stdout; 148 | 149 | #endif 150 | 151 | 152 | uint32_t var_offset = mvar_offset; 153 | 154 | //this part compiles and runs kernel 155 | ////////////////////////////////////////////////////////////////////////////////////////////////////////// 156 | 157 | //mexPrintf("Compile and run...\n"); 158 | 159 | if ((debug_mode == true) || ((log_file == true))) { 160 | #if defined(_WIN32) 161 | 162 | AllocConsole(); 163 | 164 | #define con_rows 150 165 | #define con_cols 120 166 | 167 | 168 | //get info un biggest possible console buffer 169 | buffer_size = GetLargestConsoleWindowSize(GetStdHandle(STD_OUTPUT_HANDLE)); 170 | if (buffer_size.X > con_cols) { 171 | buffer_size.X = con_cols; 172 | } 173 | if (buffer_size.Y > con_rows) { 174 | buffer_size.Y = con_rows; 175 | } 176 | rect = { 0, 0, buffer_size.X - 1,buffer_size.Y - 1 }; 177 | //std::cout << buffer_size.X << "%" << buffer_size.Y << std::endl; 178 | SetConsoleScreenBufferSize(GetStdHandle(STD_OUTPUT_HANDLE), buffer_size); 179 | SetConsoleWindowInfo(GetStdHandle(STD_OUTPUT_HANDLE), TRUE, &rect); 180 | 181 | #endif 182 | #if !defined(_WIN32) 183 | 184 | memset(buffer, 0, 4096); 185 | if (!fp) { printf("Error allocating buffer!"); return -1; } 186 | 187 | 188 | stdout = fp; 189 | 190 | #endif 191 | } 192 | 193 | 194 | bool all_rw = mxIsScalar(prhs[var_offset + num_in]); 195 | double *rw_flags_ptr; 196 | 197 | if (all_rw == true) { 198 | //no read/write flags specified 
- treat all as rw buffer 199 | rw_flags_ptr = new double[num_in]; 200 | std::fill(rw_flags_ptr, rw_flags_ptr + num_in, 0); 201 | } 202 | else { 203 | rw_flags_ptr = mxGetPr(prhs[var_offset + num_in]); 204 | } 205 | 206 | uint64_t push_time, pull_time; 207 | push_time = timer.getTimeMicroseconds(); 208 | 209 | //create input OCL buffer 210 | for (uint32_t i = 0; i < num_in; i++) { 211 | 212 | //mxGetM="datatype" 1=cl_float,2=cl_float2,4=cl_float4; 213 | //mxGetN=num_elements 214 | 215 | uint64_t buf_size = mxGetN(prhs[var_offset])*mxGetM(prhs[var_offset]); 216 | 217 | switch (mxGetClassID(prhs[var_offset])) { 218 | case mxSINGLE_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_float)); break; 219 | case mxDOUBLE_CLASS: buf_size = buf_size * uint64_t(sizeof(cl_double)); break; 220 | case mxINT8_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_char)); break; 221 | case mxUINT8_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_uchar)); break; 222 | case mxINT16_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_short)); break; 223 | case mxUINT16_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_ushort)); break; 224 | case mxINT32_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_int)); break; 225 | case mxUINT32_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_uint)); break; 226 | case mxINT64_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_long)); break; 227 | case mxUINT64_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_ulong)); break; 228 | } 229 | //mexPrintf("I Buffer Size: %ld,%d,%d\n", buf_size, mxGetN(prhs[var_offset]), mxGetM(prhs[var_offset])); 230 | // std::cout << buf_size <<"/"<< dev_mgr.get_avail_dev_info(device).max_mem_alloc << std::endl; 231 | data_size.push_back(buf_size); 232 | mem_needed = mem_needed + buf_size; 233 | // mexPrintf("I Buffer Size: %d\n", buf_size); 234 | //mexPrintf("Var Size: %d\n", sizeof(cl_float)); 235 | // mexPrintf("I Datatype: %s\n", mxGetClassName(prhs[var_offset])); 236 | 237 | if (dev_mgr->get_avail_dev_info(device).max_mem_alloc < 
buf_size) { 238 | mexWarnMsgIdAndTxt("OpenCL:Dev_Mem", "Buffer size bigger than CL_DEVICE_MAX_MEM_ALLOC_SIZE!"); 239 | } 240 | if ((mxIsScalar(prhs[var_offset]) == true) && ((uint32_t)round(rw_flags_ptr[i]) == 1)) { 241 | //mexPrintf( "Scalar Var: %d\n",i); 242 | 243 | for (uint32_t kernel_idx = 0; kernel_idx < kernel_list.size(); kernel_idx++) { 244 | switch (mxGetClassID(prhs[var_offset])) { 245 | case mxSINGLE_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_float*)mxGetData(prhs[var_offset])); break; 246 | case mxDOUBLE_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_double*)mxGetData(prhs[var_offset])); break; 247 | case mxINT8_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_char*)mxGetData(prhs[var_offset])); break; 248 | case mxUINT8_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_uchar*)mxGetData(prhs[var_offset])); break; 249 | case mxINT16_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_short*)mxGetData(prhs[var_offset])); break; 250 | case mxUINT16_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_ushort*)mxGetData(prhs[var_offset])); break; 251 | case mxINT32_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_int*)mxGetData(prhs[var_offset])); break; 252 | case mxUINT32_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_uint*)mxGetData(prhs[var_offset])); break; 253 | case mxINT64_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_long*)mxGetData(prhs[var_offset])); break; 254 | case mxUINT64_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_ulong*)mxGetData(prhs[var_offset])); break; 255 | } 256 | } 257 | 258 | // 
mexPrintf("Scalar Var2: %f\n", *(cl_double*)mxGetData(prhs[var_offset])); 259 | 260 | } 261 | else { 262 | //mexPrintf("Vec Var: %d\n", i); 263 | try { 264 | switch ((uint32_t)round(rw_flags_ptr[i])) { 265 | case 0: data_in.push_back(cl::Buffer(dev_mgr->get_context(0), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, data_size.at(i))); dev_mgr->get_queue(0, 0).enqueueWriteBuffer(data_in.at(data_in.size() - 1), blocking, 0, data_size.at(i), mxGetData(prhs[var_offset])); break; 266 | case 1: data_in.push_back(cl::Buffer(dev_mgr->get_context(0), CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, data_size.at(i))); dev_mgr->get_queue(0, 0).enqueueWriteBuffer(data_in.at(data_in.size() - 1), blocking, 0, data_size.at(i), mxGetData(prhs[var_offset])); break; 267 | case 2: data_in.push_back(cl::Buffer(dev_mgr->get_context(0), CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, data_size.at(i))); break; 268 | } 269 | //dev_Buffers.push_back(&(data_in.at(i))); 270 | 271 | for (uint32_t kernel_idx = 0; kernel_idx < kernel_list.size(); kernel_idx++) { 272 | //mexPrintf("Vec Var: %d for Kernel: %d with Name: %s\n", i,kernel_idx, kernel_list.at(kernel_idx)); 273 | dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, data_in.at(data_in.size() - 1)); 274 | } 275 | 276 | } 277 | catch (cl::Error err) { 278 | mexErrMsgIdAndTxt("OpenCL:exception", err.what()); 279 | } 280 | 281 | } 282 | var_offset++; 283 | } 284 | 285 | 286 | if (dev_mgr->get_avail_dev_info(device).max_mem < mem_needed*1.2) { 287 | mexWarnMsgIdAndTxt("OpenCL:Dev_Mem", "Device may be out of memory!"); 288 | } 289 | 290 | 291 | //mexPrintf("kernel: %s\n ", dev_mgr.getKernelbyID(0, "ocl_Kernel", 0)->getInfo()); 292 | //mexPrintf("jernel: %s\n ", dev_mgr.getKernelbyID(0, "ocl_Kernel", 1)->getInfo()); 293 | 294 | uint64_t *exec_time_ptr; 295 | plhs[0] = mxCreateNumericMatrix(1, 1, mxUINT64_CLASS, mxREAL); 296 | exec_time_ptr = (uint64_t *)mxGetData(plhs[0]); 297 | 298 | dev_mgr->get_queue(0, 0).finish();//Buffer Copy 
is asynchornous 299 | push_time = timer.getTimeMicroseconds() - push_time; 300 | 301 | // transferTime = (timer.getTimeMicroseconds() - startTransfer); 302 | // mexPrintf("Copy: %d\n", transferTime); 303 | exec_time_ptr[0] = 0; 304 | for (uint32_t kernel_idx = 0; kernel_idx < kernel_list.size(); kernel_idx++){ 305 | exec_time_ptr[0] = exec_time_ptr[0] + dev_mgr->execute_kernelNA(*(dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))), dev_mgr->get_queue(0, 0), range_start,global_range, local_range); 306 | } 307 | 308 | var_offset = mvar_offset; 309 | 310 | pull_time = timer.getTimeMicroseconds(); 311 | 312 | uint32_t buffer_counter = 0; 313 | 314 | for (uint32_t i = 0; i < num_in; i++) { 315 | 316 | if ((mxIsScalar(prhs[var_offset]) == true) && ((uint32_t)round(rw_flags_ptr[i]) == 1)) { 317 | //mexPrintf( "Scalar Var2: %d\n",i); 318 | // do something? 319 | } 320 | else { 321 | 322 | // mexPrintf("O Buffer Size: %d\n", data_size.at(i)); 323 | // mexPrintf("O Datatype: %s\n", mxGetClassName(prhs[var_offset])); 324 | try { 325 | switch ((uint32_t)round(rw_flags_ptr[i])) { 326 | 327 | case 0: dev_mgr->get_queue(0, 0).enqueueReadBuffer(data_in.at(buffer_counter), blocking, 0, data_size.at(buffer_counter), mxGetData(prhs[var_offset])); break; 328 | case 1: break; 329 | case 2: dev_mgr->get_queue(0, 0).enqueueReadBuffer(data_in.at(buffer_counter), blocking, 0, data_size.at(buffer_counter), mxGetData(prhs[var_offset])); break; 330 | } 331 | 332 | buffer_counter++; 333 | 334 | } 335 | catch (cl::Error err) { 336 | mexErrMsgIdAndTxt("OpenCL:exception", err.what()); 337 | } 338 | } 339 | var_offset++; 340 | } 341 | 342 | if ((debug_mode == true)||((log_file == true))) { 343 | #if defined(_WIN32) 344 | 345 | CONSOLE_SCREEN_BUFFER_INFO csbiInfo; 346 | 347 | GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbiInfo); 348 | 349 | COORD newpos = { 0,0 }; 350 | CHAR_INFO chiBuffer[con_rows*con_cols]; 351 | buffer_size = { 
csbiInfo.srWindow.Right,csbiInfo.srWindow.Bottom }; 352 | 353 | std::stringstream console_output; 354 | std::stringstream final_output; 355 | 356 | //memset(chiBuffer,0,sizeof(CHAR_INFO)); 357 | 358 | rect = { 0, 0, csbiInfo.srWindow.Right, csbiInfo.srWindow.Bottom }; 359 | ReadConsoleOutput(GetStdHandle(STD_OUTPUT_HANDLE), chiBuffer, buffer_size, newpos, &rect); 360 | FreeConsole(); 361 | // std::cout << csbiInfo.dwMaximumWindowSize.X <<"%"<< csbiInfo.dwMaximumWindowSize.Y << std::endl; 362 | // std::cout << csbiInfo.srWindow.Right << "%" << csbiInfo.srWindow.Bottom << std::endl; 363 | for (int32_t i = 0; i < rect.Bottom*rect.Right; i++) { 364 | if ((chiBuffer[i].Char.AsciiChar > 1) && (chiBuffer[i].Char.AsciiChar < 255)) 365 | 366 | if (i % (rect.Right + 1) == 0) { 367 | console_output << std::endl << (char)chiBuffer[i].Char.AsciiChar; 368 | } 369 | else { 370 | console_output << (char)chiBuffer[i].Char.AsciiChar; 371 | } 372 | } 373 | remove_empty_lines(console_output, final_output); 374 | 375 | mxArray * tmp_str; 376 | 377 | tmp_str = mxCreateString(final_output.str().c_str()); 378 | plhs[1] = tmp_str; 379 | 380 | if (debug_mode == true) { 381 | std::cout << final_output.str() << std::endl; 382 | } 383 | 384 | if (log_file == true) { 385 | FILE *fp; 386 | char log_timestamp[100]; 387 | char log_filename[200]; 388 | 389 | time_t now = time(0); 390 | strftime(log_timestamp, 100, "%m-%d_%H-%M-%S", localtime(&now)); 391 | 392 | snprintf(log_filename, 199, "log_%s_%s.txt", log_timestamp, kernel_list.at(0).c_str()); 393 | 394 | // mexPrintf("%s...\n", log_filename); 395 | 396 | fp = fopen(log_filename, "w"); 397 | if (!fp) { 398 | mexErrMsgIdAndTxt("MATLAB:FILE", "Can't create Log file!"); 399 | } 400 | else { 401 | fputs(final_output.str().c_str(), fp); 402 | fclose(fp); 403 | } 404 | } 405 | 406 | 407 | 408 | #endif 409 | #if !defined(_WIN32) 410 | std::fclose(fp); 411 | stdout = old; //reset stdout 412 | mxArray * tmp_str; 413 | 414 | tmp_str = 
mxCreateString(buffer); 415 | plhs[1] = tmp_str; 416 | 417 | if (debug_mode == true) { 418 | std::cout << buffer << std::endl; 419 | } 420 | if (log_file == true) { 421 | FILE *fp; 422 | char log_timestamp[100]; 423 | char log_filename[200]; 424 | 425 | time_t now = time(0); 426 | strftime(log_timestamp, 100, "%m-%d_%H-%M-%S", localtime(&now)); 427 | 428 | snprintf(log_filename, 199, "log_%s_%s.txt", log_timestamp, kernel_list.at(0).c_str()); 429 | // mexPrintf("%s...\n", log_filename); 430 | 431 | fp = fopen(log_filename, "w"); 432 | if (!fp) { 433 | mexErrMsgIdAndTxt("MATLAB:FILE", "Can't create Log file!"); 434 | } 435 | else { 436 | fputs(buffer, fp); 437 | fclose(fp); 438 | } 439 | } 440 | 441 | #endif 442 | } 443 | 444 | 445 | dev_mgr->get_queue(0, 0).finish(); 446 | 447 | pull_time = timer.getTimeMicroseconds() - pull_time; 448 | copy_time= push_time+pull_time; 449 | 450 | 451 | return 0; 452 | 453 | 454 | } 455 | 456 | 457 | int32_t compilerun(mxArray *plhs[], int nrhs, const mxArray*prhs[], ocl_dev_mgr *dev_mgr, uint32_t device, bool debug_mode,bool log_file) 458 | { 459 | 460 | 461 | bool blocking = CL_FALSE; 462 | uint64_t mem_needed = 0; 463 | std::vector kernel_list; 464 | 465 | std::vector data_in; 466 | std::vector data_size; 467 | 468 | uint32_t global_range_x = 1; 469 | uint32_t global_range_y = 1; 470 | uint32_t global_range_z = 1; 471 | uint32_t range_start_x = 0; 472 | uint32_t range_start_y = 0; 473 | uint32_t range_start_z = 0; 474 | cl::NDRange range_start= cl::NullRange; 475 | cl::NDRange global_range; 476 | cl::NDRange local_range; 477 | 478 | uint64_t copy_time; 479 | 480 | Timer timer; //used to track performance 481 | 482 | //this part compiles and runs kernel 483 | ////////////////////////////////////////////////////////////////////////////////////////////////////////// 484 | 485 | //mexPrintf("Compile and run...\n"); 486 | 487 | dev_mgr->init_device(device); 488 | 489 | 490 | size_t buflen; 491 | char *buf; 492 | char *settings; 493 | 
494 | buflen = mxGetN(prhs[2]) + 1; //get Kernel Settings 495 | settings = (char *)mxMalloc(buflen); 496 | mxGetString(prhs[2], settings, (mwSize)buflen); 497 | //mexPrintf("Kernel-Settings: %s\n", settings); 498 | 499 | getKernel_info(plhs, nrhs, prhs, dev_mgr); 500 | 501 | mwSize cell_dims; 502 | mxArray *cellElement; 503 | if (mxIsCell(prhs[3]) == true) { 504 | 505 | 506 | cell_dims = mxGetNumberOfElements(prhs[3]); 507 | for (uint32_t icell = 0; icell < cell_dims; icell++) { 508 | cellElement = mxGetCell(prhs[3], icell); 509 | 510 | buflen = mxGetN(cellElement) + 1; 511 | //mexPrintf("Size: %d\n", buflen); 512 | 513 | char *kernel_name_c; 514 | kernel_name_c = (char *)mxMalloc(buflen); 515 | mxGetString(cellElement, kernel_name_c, (mwSize)buflen); 516 | kernel_list.push_back(std::string(kernel_name_c)); 517 | // mexPrintf("Kernel-Name: %s\n", kernel_name_c); 518 | } 519 | } 520 | else { 521 | char *kernel_name_c; 522 | buflen = mxGetN(prhs[3]) + 1; //get Kernel Name 523 | kernel_name_c = (char *)mxMalloc(buflen); 524 | mxGetString(prhs[3], kernel_name_c, (mwSize)buflen); 525 | kernel_list.push_back(std::string(kernel_name_c)); 526 | //mexPrintf("Kernel-Name: %s\n", kernel_name.c_str()); 527 | 528 | } 529 | 530 | uint64_t kernels_found = 0; 531 | 532 | kernels_found = dev_mgr->compile_kernel(0, "ocl_Kernel", settings); 533 | if (kernels_found == 0) { 534 | mexErrMsgIdAndTxt("OpenCL:Kernel", "No valid kernels found"); 535 | return -1; 536 | } 537 | 538 | 539 | //NDRange settings 540 | //global range 541 | 542 | size_t mrows = mxGetM(prhs[4]); 543 | size_t ncols = mxGetN(prhs[4]); 544 | 545 | if ((mxIsDouble(prhs[4]) || (mxGetClassID(prhs[4])== mxUINT32_CLASS)) && !mxIsComplex(prhs[4]) && (mrows * ncols == 3)) { 546 | if (mxIsDouble(prhs[4])) { 547 | 548 | double *range_ptr; 549 | range_ptr = mxGetPr(prhs[4]); 550 | global_range_x = (uint32_t)round(range_ptr[0]); 551 | global_range_y = (uint32_t)round(range_ptr[1]); 552 | global_range_z = 
(uint32_t)round(range_ptr[2]); 553 | } 554 | else { 555 | uint32_t *range_ptr; 556 | range_ptr = (uint32_t *)mxGetData(prhs[4]); 557 | 558 | global_range_x = (uint32_t)(range_ptr[0]); 559 | global_range_y = (uint32_t)(range_ptr[1]); 560 | global_range_z = (uint32_t)(range_ptr[2]); 561 | 562 | } 563 | global_range = cl::NDRange(global_range_x, global_range_y, global_range_z); 564 | 565 | } 566 | else { 567 | if ((mxIsDouble(prhs[4]) || (mxGetClassID(prhs[4]) == mxUINT32_CLASS)) && !mxIsComplex(prhs[4]) && (mrows * ncols == 6)) { 568 | if (mxIsDouble(prhs[4])) { 569 | double *range_ptr; 570 | range_ptr = mxGetPr(prhs[4]); 571 | 572 | range_start_x = (uint32_t)round(range_ptr[0]); 573 | range_start_y = (uint32_t)round(range_ptr[1]); 574 | range_start_z = (uint32_t)round(range_ptr[2]); 575 | 576 | global_range_x = (uint32_t)round(range_ptr[3]); 577 | global_range_y = (uint32_t)round(range_ptr[4]); 578 | global_range_z = (uint32_t)round(range_ptr[5]); 579 | 580 | } 581 | else { 582 | uint32_t *range_ptr; 583 | range_ptr = (uint32_t *)mxGetData(prhs[4]); 584 | 585 | range_start_x = (uint32_t)(range_ptr[0]); 586 | range_start_y = (uint32_t)(range_ptr[1]); 587 | range_start_z = (uint32_t)(range_ptr[2]); 588 | 589 | global_range_x = (uint32_t)(range_ptr[3]); 590 | global_range_y = (uint32_t)(range_ptr[4]); 591 | global_range_z = (uint32_t)(range_ptr[5]); 592 | } 593 | range_start = cl::NDRange(range_start_x, range_start_y, range_start_z); 594 | global_range = cl::NDRange(global_range_x, global_range_y, global_range_z); 595 | } 596 | else { 597 | mexErrMsgIdAndTxt("OpenCL:NDRange", "Invalid global range defined!"); 598 | return -1; 599 | } 600 | } 601 | 602 | //local range 603 | 604 | mrows = mxGetM(prhs[5]); 605 | ncols = mxGetN(prhs[5]); 606 | 607 | if ((mxIsDouble(prhs[5]) || (mxGetClassID(prhs[5]) == mxUINT32_CLASS)) && (mrows + ncols == 4)) { 608 | if (mxIsDouble(prhs[5])) { 609 | double *range_ptr; 610 | range_ptr = mxGetPr(prhs[5]); 611 | global_range_x = 
(uint32_t)round(range_ptr[0]); 612 | global_range_y = (uint32_t)round(range_ptr[1]); 613 | global_range_z = (uint32_t)round(range_ptr[2]); 614 | } 615 | else { 616 | uint32_t *range_ptr; 617 | range_ptr = (uint32_t *)mxGetData(prhs[5]); 618 | 619 | global_range_x = (uint32_t)(range_ptr[0]); 620 | global_range_y = (uint32_t)(range_ptr[1]); 621 | global_range_z = (uint32_t)(range_ptr[2]); 622 | } 623 | local_range = cl::NDRange(global_range_x, global_range_y, global_range_z); 624 | // printf("Local work Size: %d/%d/%d\n", global_range_x, global_range_y, global_range_z); 625 | 626 | } 627 | else { 628 | if (mrows + ncols == 2) { 629 | local_range = cl::NullRange; 630 | } 631 | else { 632 | mexErrMsgIdAndTxt("OpenCL:NDRange", "Invalid local range defined!"); 633 | return -1; 634 | } 635 | 636 | } 637 | 638 | 639 | 640 | uint32_t num_in = (uint32_t)nrhs - 7;//Number of input buffers 641 | 642 | uint32_t var_offset = 6; 643 | 644 | 645 | 646 | runkernel(plhs, nrhs, prhs, kernel_list, num_in,var_offset, dev_mgr, device, range_start, global_range, local_range, debug_mode,log_file,copy_time); 647 | 648 | if (debug_mode == false) { 649 | uint64_t *copy_time_ptr; 650 | plhs[1] = mxCreateNumericMatrix(1, 1, mxUINT64_CLASS, mxREAL); 651 | copy_time_ptr = (uint64_t *)mxGetData(plhs[1]); 652 | copy_time_ptr[0] = copy_time; 653 | } 654 | 655 | return 0; 656 | } 657 | 658 | #endif // MATCL_H 659 | --------------------------------------------------------------------------------