├── examples
    ├── imgData.mat
    ├── mul_kernel.cl
    ├── test_mul.m
    ├── img_test.m
    └── filter.cl
├── .gitignore
├── get_devices.m
├── compile_linux.m
├── cl_get_devices.m
├── .zenodo.json
├── compile_windows.m
├── test_kernel.cl
├── compile_mac.m
├── cl_dbg_kernel.m
├── src
    ├── cl_dbg_kernel.cpp
    ├── cl_get_devices.cpp
    ├── ocl_dev_mgr.hpp
    ├── utils.hpp
    ├── cl_run_kernel.cpp
    ├── ocl_dev_mgr.cpp
    └── MatCL.hpp
├── cl_run_kernel.m
├── run_kernel.m
└── README.md


/examples/imgData.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IANW-Projects/MatCL/HEAD/examples/imgData.mat


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.asv
 2 | *.pdb
 3 | *.mexa64
 4 | *.mexmaci64
 5 | *.mexw32
 6 | *.mexw64
 7 | 
 8 | CL/*
 9 | 
10 | 


--------------------------------------------------------------------------------
/get_devices.m:
--------------------------------------------------------------------------------
 1 | %This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license.
 2 | 
 3 | clear all
 4 | close all
 5 | clc
 6 | 
 7 | %%
 8 | %To get a list of available OpenCL devices use 'cl_get_devices'. This
 9 | %function returns the names of all available devices, the device class
10 | %(CPU, GPU or Other for other or unknown accelerators), the available
11 | %device memory in bytes, the max. work group size, the max. work item size and the number of compute units.
12 | %To choose a device use the index of the corresponding entry in the names array.
13 | [dev_name,dev_type,max_mem,wg_size,lw_size,compute_units]=cl_get_devices;
14 | 


--------------------------------------------------------------------------------
/compile_linux.m:
--------------------------------------------------------------------------------
 1 | %This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license.
 2 | 
 3 | close all
 4 | clear all
 5 | clc
 6 | %Builds the three MEX functions. On Linux mex takes C++ compiler options from CXXFLAGS (COMPFLAGS is only used by Windows compilers), and -L expects the folder directly after the switch.
 7 | %Change OpenCL library path according to your setup
 8 | mex -g CXXFLAGS='$CXXFLAGS -std=c++11 -O2' -L/usr/lib/x86_64-linux-gnu -lOpenCL src/cl_get_devices.cpp src/ocl_dev_mgr.cpp
 9 | mex -g CXXFLAGS='$CXXFLAGS -std=c++11 -O2' -L/usr/lib/x86_64-linux-gnu -lOpenCL src/cl_run_kernel.cpp src/ocl_dev_mgr.cpp
10 | mex -g CXXFLAGS='$CXXFLAGS -std=c++11 -O2' -L/usr/lib/x86_64-linux-gnu -lOpenCL src/cl_dbg_kernel.cpp src/ocl_dev_mgr.cpp
11 | %Smoke test: list the OpenCL devices found by the freshly built cl_get_devices (all six outputs, as in compile_windows.m)
12 | [dev_name,dev_type,max_mem,wg_size,lw_size,compute_units]=cl_get_devices;
13 | 


--------------------------------------------------------------------------------
/cl_get_devices.m:
--------------------------------------------------------------------------------
 1 | %CL_GET_DEVICES Enumerate OpenCL devices (returns a list whose i-th entry corresponds to the i-th OpenCL device)
 2 | %
 3 | %
 4 | %   [names, dev_class, max_mem, max_wg_size, max_local_work_size, compute_units] = cl_get_devices;
 5 | %
 6 | %
 7 | %   Outputs
 8 | %   -------
 9 | %
10 | %   names: Names of all available devices
11 | %   dev_class: The device class (CPU, GPU or Other for other or unknown Accelerators)
12 | %   max_mem: The available device memory in bytes
13 | %   max_wg_size: Max. size of OpenCL work group
14 | %   max_local_work_size: Max. size of work items
15 | %   compute_units: Number of compute units (e.g. CPU cores) of the device
16 | %
17 | 


--------------------------------------------------------------------------------
/.zenodo.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "description": "MatCL: A new easy-to use OpenCL toolbox for MathWorks Matlab",
 3 |     "license": "other-open",
 4 |     "title": "IANW-Projects/MatCL",
 5 |     "version": "v1.1.2",
 6 |     "upload_type": "software",
 7 |     "publication_date": "2019-04-24",
 8 |     "creators": [
 9 |         {
10 |             "affiliation": "TU Braunschweig",
11 |             "name": "Philip Heinisch"
12 |         },
13 |         {
14 |             "affiliation": "TU Braunschweig",
15 |             "name": "Katharina Ostaszewski"
16 |         },
17 |         {
18 |            "affiliation": "TU Braunschweig",
19 |             "name": "Hendrik Ranocha"
20 |         }
21 |     ],
22 |     "access_right": "open"
23 | }
24 | 


--------------------------------------------------------------------------------
/compile_windows.m:
--------------------------------------------------------------------------------
 1 | %This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license.
 2 | 
 3 | close all
 4 | clear all
 5 | clc
 6 | %Builds the three MEX functions (cl_get_devices, cl_run_kernel, cl_dbg_kernel); each one is compiled together with src\ocl_dev_mgr.cpp
 7 | %Change OpenCL library and include paths according to your setup
 8 | mex -g COMPFLAGS='$COMPFLAGS -O2' '-L C:\Intel\OpenCL\sdk\lib\x64\' '-I C:\Intel\OpenCL\sdk\include\' -lOpenCL src\cl_get_devices.cpp src\ocl_dev_mgr.cpp
 9 | mex -g COMPFLAGS='$COMPFLAGS -O2' '-L C:\Intel\OpenCL\sdk\lib\x64\' '-I C:\Intel\OpenCL\sdk\include\' -lOpenCL src\cl_run_kernel.cpp src\ocl_dev_mgr.cpp
10 | mex -g COMPFLAGS='$COMPFLAGS -O2' '-L C:\Intel\OpenCL\sdk\lib\x64\' '-I C:\Intel\OpenCL\sdk\include\' -lOpenCL src\cl_dbg_kernel.cpp src\ocl_dev_mgr.cpp
11 | %Smoke test: list the OpenCL devices found by the freshly built cl_get_devices
12 | [dev_name,dev_type,max_mem,wg_size,lw_size,compute_units]=cl_get_devices;
13 | 


--------------------------------------------------------------------------------
/test_kernel.cl:
--------------------------------------------------------------------------------
 1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */
 2 | 
 3 | #ifdef cl_khr_fp64
 4 |     #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 5 | #else
 6 |     #error "IEEE-754 double precision not supported by OpenCL implementation."
 7 | #endif
 8 | 
 9 | // test1: adds the build-time constant DT to the x component of every element of d_1; d_2 is not accessed.
10 | kernel void test1(global double4 *d_1,global double4 *d_2)
11 | {
12 |   // DT must be supplied as a compiler define (e.g. -DDT=1.0) instead of being passed as a kernel argument
13 |   const uint gid = get_global_id(0);
14 | 
15 |   d_1[gid].x += DT;
16 | }
17 | 
18 | 
19 | // test2: prints the index of every work item, then stores d_2.w + DT (DT is a compiler define) into d_1.w.
20 | kernel void test2(global double4 *d_1,global double4 *d_2)
21 | {
22 |   uint idx = get_global_id(0);
23 |   printf("Test: %u \n",idx);  // idx is unsigned, so it is printed with %u rather than %d
24 |   d_1[idx].w=d_2[idx].w+DT;
25 | }
26 | 


--------------------------------------------------------------------------------
/compile_mac.m:
--------------------------------------------------------------------------------
 1 | %This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license.
 2 | 
 3 | close all
 4 | clear all
 5 | clc
 6 | 
 7 | % Apple does not ship cl2.hpp in general, so it is downloaded into ./CL (the folder is only created when missing, which avoids a warning on re-runs)
 8 | if ~exist('CL','dir'), mkdir('CL'); end
 9 | websave('CL/cl2.hpp', 'https://github.com/KhronosGroup/OpenCL-CLHPP/releases/download/v2.0.10/cl2.hpp');
10 | 
11 | %Change OpenCL library path according to your setup. On macOS mex takes C++ compiler options from CXXFLAGS (COMPFLAGS is only used by Windows compilers); the OpenCL framework is a linker option and stays in LDFLAGS.
12 | mex -g CXXFLAGS='$CXXFLAGS -std=c++11 -O2' -I./ LDFLAGS='$LDFLAGS -framework OpenCL' src/cl_get_devices.cpp src/ocl_dev_mgr.cpp
13 | mex -g CXXFLAGS='$CXXFLAGS -std=c++11 -O2' -I./ LDFLAGS='$LDFLAGS -framework OpenCL' src/cl_run_kernel.cpp src/ocl_dev_mgr.cpp
14 | mex -g CXXFLAGS='$CXXFLAGS -std=c++11 -O2' -I./ LDFLAGS='$LDFLAGS -framework OpenCL' src/cl_dbg_kernel.cpp src/ocl_dev_mgr.cpp
15 | %Smoke test: list the OpenCL devices found by the freshly built cl_get_devices (all six outputs, as in compile_windows.m)
16 | [dev_name,dev_type,max_mem,wg_size,lw_size,compute_units]=cl_get_devices;
17 | 


--------------------------------------------------------------------------------
/examples/mul_kernel.cl:
--------------------------------------------------------------------------------
 1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */
 2 | 
 3 | // enable double precision (not enabled by default)
 4 | 
 5 | #ifdef cl_khr_fp64
 6 |     #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 7 | #else
 8 |     #error "IEEE-754 double precision not supported by OpenCL implementation."
 9 | #endif
10 | 
11 | // MM: matrix product C = A*B on column-major data. A is NR x NI, B is NI x NC and C is NR x NC.
12 | // Each work item computes one element of C; NR, NC and NI have to be passed as compiler defines.
13 | kernel void MM(const global double *A,const global double *B,global double *C)
14 | {
15 |     // Matrix dimensions fixed at build time
16 |     const uint num_rows=(uint)NR;
17 |     const uint num_cols=(uint)NC;
18 |     const uint num_i=(uint)NI;
19 | 
20 |     // Element of C handled by this work item
21 |     const uint row = get_global_id(0);
22 |     const uint col = get_global_id(1);
23 | 
24 |     // Dot product of row `row` of A with column `col` of B
25 |     double sum = 0.0;
26 |     uint k = 0;
27 |     while (k < num_i) {
28 |         sum += A[k*num_rows + row] * B[col*num_i + k];
29 |         k++;
30 |     }
31 | 
32 |     // Write the finished element back
33 |     C[col*num_rows + row] = sum;
34 | }
35 | 


--------------------------------------------------------------------------------
/cl_dbg_kernel.m:
--------------------------------------------------------------------------------
 1 | %CL_DBG_KERNEL Build and run OpenCL kernels with printf redirection
 2 | %
 3 | % 
 4 | %   [run_time] = cl_dbg_kernel(ocl_dev_id, 'kernel_url.cl', 'defines', 'kernel_function', ...
 5 | %                              global_range, local_range, in1, out1, [rw_flags]);
 6 | %
 7 | %   Inputs
 8 | %   -------
 9 | %   ocl_dev_id: ID of the OpenCL device to be used
10 | %   kernel_url.cl: URL of the kernel file
11 | %   defines: List of OpenCL compiler defines
12 | %   kernel_function: Cell array of kernel functions to execute (can also be a single string for just one kernel)
13 | %   global_range: 3D global OpenCL range (see NDRange). If this vector has six entries, the first three define the 3D work offset followed by the 3D work size.
14 | %   local_range: Local OpenCL range (see NDRange)
15 | %   in1, out1: List of variables to pass from/to kernel
16 | %   rw_flags: read/write flag for the Kernel variables, this can either be scalar (all variables are read&write) or a vector with an entry for each variable: 0 - read&write / 1 - kernel read only / 2 - kernel write only
17 | %
18 | %   Outputs
19 | %   -------
20 | %   run_time:  Microseconds it took to execute the kernels (might be slower due to printf redirection)
21 | 


--------------------------------------------------------------------------------
/examples/test_mul.m:
--------------------------------------------------------------------------------
 1 | % This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license.
 2 | 
 3 | close all
 4 | clear all
 5 | clc
 6 | 
 7 | 
 8 | %%
 9 | %This script shows how to use cl_run_kernel to run a kernel. This example
10 | %kernel implements a matrix multiplication and compares the runtime with
11 | %the internal Matlab implementation
12 | 
13 | %Build a 10000x3 test matrix A; B is its transpose, so A*B is 10000x10000
14 | for i=1:10000
15 | A(i,1)=3;
16 | A(i,2)=2;
17 | A(i,3)=1;
18 | end;
19 | %%%%%%%%%%%
20 | A=double(A);
21 | B=double(A');
22 | %Matrix dimensions, handed to the kernel as compiler defines NR, NC and NI
23 | tmp=size(A);
24 | num_rows=tmp(1);
25 | 
26 | tmp=size(B);
27 | num_cols=tmp(2);
28 | num_i=tmp(1);
29 | %Preallocated result matrix and global range with one work item per element of C
30 | C=double(zeros(num_rows,num_cols));
31 | Range=[num_rows,num_cols,1];
32 | settings = sprintf('-DNR=%d -DNC=%d -DNI=%d -DREAL=double',num_rows,num_cols,num_i);
33 | %Reference: time the built-in Matlab multiplication
34 | tic;
35 | mC=A*B;
36 | toc
37 | 
38 | clearvars mC
39 | 
40 | 
41 | 
42 | %%
43 | %Build and run the kernel in one call. The flags [1 1 2] mark A and B as kernel read only and C as kernel write only; run_time is in microseconds
44 | tic;
45 | [run_time]=cl_run_kernel(1,'mul_kernel.cl',settings,'MM',Range,0,A,B,C,[1 1 2]);
46 | toc;
47 | time_str=sprintf('OpenCL Kernel time is %f seconds.',double(run_time)/1000/1000);
48 |  disp(time_str)
49 | %Reset the result matrix before the second run
50 |  C=double(zeros(num_rows,num_cols));
51 | 
52 | 
53 | %%
54 | %Build and run kernel separately
55 | 
56 | 
57 | %compile kernel
58 |  [comp_time,kernels]=cl_run_kernel(1,'mul_kernel.cl',settings);
59 |  time_str=sprintf('OpenCL Kernel compile time was %f seconds.',double(comp_time)/1000/1000);
60 | disp(time_str)
61 | 
62 | disp("Run Kernel only:")
63 | tic;
64 | %run the precompiled kernel 'MM'
65 | [run_time,copy_time]=cl_run_kernel(1,'MM',Range,0,A,B,C,[1 1 2]);
66 | toc;
67 | 
68 | time_str=sprintf('OpenCL Buffer copy time is: %f seconds.\n OpenCL Kernel runtime is %f seconds.',double(copy_time)/1000/1000,double(run_time)/1000/1000);
69 | disp(time_str)
70 | 


--------------------------------------------------------------------------------
/src/cl_dbg_kernel.cpp:
--------------------------------------------------------------------------------
 1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */
 2 | 
 3 | #include <math.h>
 4 | #include "mex.h"
 5 | #include "matrix.h"
 6 | #include <iostream>
 7 | #include <string>
 8 | #if defined(_WIN32)
 9 | #include <windows.h>
10 | #include <io.h>
11 | #include <process.h>
12 | #endif
13 | 
14 | 
15 | #define CL_HPP_ENABLE_EXCEPTIONS
16 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120
17 | #define CL_HPP_TARGET_OPENCL_VERSION 120
18 | 
19 | #include <CL/cl2.hpp>
20 | #include "ocl_dev_mgr.hpp"
21 | 
22 | 
23 | #include "MatCL.hpp"
24 | 
25 | 
26 | class mystream : public std::streambuf  // stream buffer that forwards everything written to it to the MATLAB console via mexPrintf
27 | {
28 | protected:
29 | 	virtual std::streamsize xsputn(const char *s, std::streamsize n) { mexPrintf("%.*s", static_cast<int>(n), s); return n; }  // "%.*s" consumes an int precision, so the streamsize count is narrowed explicitly
30 | 	virtual int overflow(int c = EOF) { if (c != EOF) { const char ch = static_cast<char>(c); mexPrintf("%.1s", &ch); } return 1; }  // "%s" needs a char*, not an int*; the returned 1 is a non-EOF "success" value
31 | };
32 | // Guard object: installs a mystream as std::cout's buffer on construction and restores the previous buffer on destruction.
33 | class scoped_redirect_cout
34 | {
35 | 	mystream mout;             // replacement buffer installed into std::cout
36 | 	std::streambuf *old_buf;   // buffer std::cout used before the redirect
37 | public:
38 | 	scoped_redirect_cout() : old_buf(std::cout.rdbuf(&mout)) {}   // rdbuf(new_buffer) returns the buffer that was installed before
39 | 	~scoped_redirect_cout() { std::cout.rdbuf(old_buf); }
40 | };
41 | static scoped_redirect_cout mycout_redirect;   // file-scope instance with static storage duration, so the redirect is set up during static initialisation
42 | 
43 | 
44 | 
45 | // MEX entry point of cl_dbg_kernel: builds and runs OpenCL kernels through compilerun()
46 | // (declared in MatCL.hpp) while std::cout is redirected to the MATLAB console.
47 | //
48 | // prhs[0] is the 1-based OpenCL device ID and at least three inputs are required. plhs, nrhs
49 | // and prhs are forwarded to compilerun() unchanged, together with the 0-based device index.
50 | void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray*prhs[])
51 | {
52 | 	// Validate the argument list before reading prhs[0]: with too few inputs it may not exist.
53 | 	if (nrhs <= 2 || mxIsEmpty(prhs[0]) || !(mxIsNumeric(prhs[0]) || mxIsLogical(prhs[0]))) {
54 | 		mexErrMsgIdAndTxt("MATLAB:syntax", "Incorrect Syntax!");
55 | 	}
56 | 
57 | 	ocl_dev_mgr& dev_mgr = ocl_dev_mgr::getInstance();
58 | 
59 | 	// Range-check the ID while it is still a double: zero, negative and NaN values are
60 | 	// rejected here instead of being converted to an unsigned index (undefined for negatives).
61 | 	const double dev_id = mxGetScalar(prhs[0]);
62 | 	const double dev_count = (double)dev_mgr.get_avail_dev_num();
63 | 	if (!(dev_id >= 1.0 && dev_id < dev_count + 1.0)) {
64 | 		mexErrMsgIdAndTxt("MATLAB:cl_dev", "OpenCl Device not found!");
65 | 	}
66 | 	const uint32_t device = (uint32_t)dev_id - 1;
67 | 
68 | 	// NOTE(review): deinitalize() and compilerun() are defined outside this file; the manager
69 | 	// is reset before every debug run and the two trailing flags are passed as before.
70 | 	dev_mgr.deinitalize();
71 | 	compilerun(plhs, nrhs, prhs, &dev_mgr, device, true,false);
72 | 
73 | }
74 | 


--------------------------------------------------------------------------------
/examples/img_test.m:
--------------------------------------------------------------------------------
 1 | % This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license.
 2 | 
 3 | clear all
 4 | clc
 5 | close all
 6 | 
 7 | %Select OpenCL device
 8 | device=1;
 9 | 
10 | %Either load test image from .mat file or take an image using a camera(requires webcam support package)
11 | %cam_img = snapshot(webcam);
12 | load('imgData.mat');
13 | cam_img = rgb2gray(cam_img); %transform to grayscale image
14 | dims=size(cam_img); %get image size: dims(1) rows, dims(2) columns
15 | 
16 | %add artificial noise to the image
17 | nI=imnoise(cam_img,'salt & pepper',0.06);
18 | 
19 | %Run native matlab medfilt function and track execution time
20 | tic
21 | K = medfilt2(nI);
22 | cpu_time=toc;
23 | 
24 | 
25 | %%
26 | %Set OpenCL workgroup dimensions depending on the size of the image(take care of bounds): offset 3 and size-6 keep the 3x3 window of the kernel inside the image
27 | global_range=[3 3 0 dims(2)-6 dims(1)-6 1];
28 | local_range=[0];
29 | 
30 | %Convert data to uint8 and transform to 1x(dims(1)*dims(2)) vector (transposing first stores the image row by row, rows of length dims(2))
31 | imgData=uint8(reshape(nI',[1,dims(1)*dims(2)]));
32 | 
33 | %Preallocate destination array
34 | destI=uint8(zeros(1,(dims(2))*(dims(1))));
35 | 
36 | %Set OpenCL kernel defines
37 | settings=sprintf(' -DWIDTH=%d ', dims(2));
38 | %Precompile filter kernel
39 | [comp_time,kernels]=cl_run_kernel(device,'filter.cl',settings);
40 | %Execute OpenCL median filter kernel and track total execution time
41 | tic
42 | [run_time,copy_time]=cl_run_kernel(device,'filter',global_range,local_range,imgData,destI,0);
43 | ocl_time=toc;
44 | %%
45 | %Output results to console
46 | cl_times=sprintf('Buffer copy time: %.3f ms    Kernel runtime: %.3f ms',double(copy_time)/1000,double(run_time)/1000);
47 | 
48 | %convert image data back to matlab dims(1)xdims(2) style: destI is stored row by row, so it is reshaped to dims(2)xdims(1) and then transposed (reshaping to dims(1)xdims(2) would scramble non-square images)
49 | newImg=reshape(destI,[dims(2),dims(1)])';
50 | 
51 | %Generate figure with results and runtimes
52 | cpu_title=sprintf('CPU Runtime: %.3f ms',cpu_time*1000);
53 | ocl_title=sprintf('OpenCL Runtime: %.3f ms',ocl_time*1000);
54 | 
55 | figure('units','normalized','outerposition',[0 0 1 1])
56 | subplot(2,2,[1,2])
57 | imshow(nI)
58 | title('Original')
59 | 
60 | subplot(2,2,3)
61 | imshow(K)
62 | title(cpu_title)
63 | 
64 | subplot(2,2,4)
65 | imshow(newImg)
66 | title(ocl_title)
67 | xlabel(cl_times)
68 | 


--------------------------------------------------------------------------------
/cl_run_kernel.m:
--------------------------------------------------------------------------------
 1 | %CL_RUN_KERNEL Build and run OpenCL kernels
 2 | %   ------------
 3 | %   Build Kernel
 4 | %   ------------
 5 | %
 6 | %   [comp_time, kernels] = cl_run_kernel(ocl_dev_id, 'kernel_url.cl', 'defines');
 7 | %
 8 | %   Inputs
 9 | %   -------
10 | %   ocl_dev_id: ID of the OpenCL device to be used
11 | %   kernel_url: URL of the kernel file
12 | %   defines: List of OpenCL compiler defines
13 | %
14 | %   Outputs
15 | %   -------
16 | %   comp_time: Microseconds it took to compile the kernels
17 | %   kernels: List with names of all available kernels
18 | %
19 | %
20 | %   ----------
21 | %   Run Kernel
22 | %   ----------
23 | %
24 | %   [run_time, copy_time] = cl_run_kernel(ocl_dev_id, {'kernel_function1','kernel_function2'}, ...
25 | %                                         global_range, local_range, in1, out1, [rw_flags]);
26 | %
27 | %   Inputs
28 | %   -------
29 | %   ocl_dev_id: ID of the OpenCL device to be used
30 | %   kernel_function: Cell array of kernel functions to execute (can also be a single string for just one kernel)
31 | %   global_range: 3D global OpenCL range (see NDRange). If this vector has six entries, the first three define the 3D work offset followed by the 3D work size.
32 | %   local_range: 3D local OpenCL range (see NDRange)
33 | %   in1, out1: List of variables to pass from/to kernel
34 | %   rw_flags: read/write flag for the Kernel variables, this can either be scalar (all variables are read&write) or a vector with an entry for each variable: 0 - read&write / 1 - kernel read only / 2 - kernel write only
35 | %
36 | %   Outputs
37 | %   -------
38 | %   run_time:  Microseconds it took to execute the kernels
39 | %   copy_time:  Microseconds it took to copy all buffers
40 | %
41 | %
42 | %   ------------------
43 | %   Build & Run Kernel
44 | %   ------------------
45 | %
46 | %   [run_time] = cl_run_kernel(ocl_dev_id, 'kernel_url.cl', 'defines', 'kernel_function', ...
47 | %                              global_range, local_range, in1, out1, [rw_flags]);
48 | %
49 | %   Inputs
50 | %   -------
51 | %   ocl_dev_id: ID of the OpenCL device to be used
52 | %   kernel_url.cl: URL of the kernel file
53 | %   defines: List of OpenCL compiler defines
54 | %   kernel_function: Cell array of kernel functions to execute (can also be a single string for just one kernel)
55 | %   global_range: 3D global OpenCL range (see NDRange). If this vector has six entries, the first three define the 3D work offset followed by the 3D work size.
56 | %   local_range: Local OpenCL range (see NDRange)
57 | %   in1, out1: List of variables to pass from/to kernel
58 | %   rw_flags: read/write flag for the Kernel variables, this can either be scalar (all variables are read&write) or a vector with an entry for each variable: 0 - read&write / 1 - kernel read only / 2 - kernel write only
59 | %
60 | %   Outputs
61 | %   -------
62 | %   run_time:  Microseconds it took to execute the kernels
63 | 


--------------------------------------------------------------------------------
/src/cl_get_devices.cpp:
--------------------------------------------------------------------------------
 1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */
 2 | 
 3 | #include <math.h>
 4 | #include "mex.h"
 5 | #include "matrix.h"
 6 | 
 7 | 
 8 | #define CL_HPP_ENABLE_EXCEPTIONS
 9 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120
10 | #define CL_HPP_TARGET_OPENCL_VERSION 120
11 | 
12 | #include <CL/cl2.hpp>
13 | #include "ocl_dev_mgr.hpp"
14 | 
15 | 
16 | // MEX entry point: reports the properties of every OpenCL device known to ocl_dev_mgr.
17 | //
18 | // Outputs (one entry per device, in device-manager order):
19 | //   plhs[0]  cell array of "<name>(<OpenCL version>)" strings
20 | //   plhs[1]  cell array of device type strings (ocl_dev_mgr::getDeviceType)
21 | //   plhs[2]  uint64 column vector, ocl_device_info::max_mem
22 | //   plhs[3]  uint64 column vector, ocl_device_info::wg_size
23 | //   plhs[4]  uint64 column vector, ocl_device_info::lw_size
24 | //   plhs[5]  uint64 column vector, ocl_device_info::compute_units
25 | //
26 | // Only the outputs the caller asked for are created and stored, because plhs is
27 | // only guaranteed to have room for the requested outputs (plus plhs[0], which
28 | // MATLAB accepts even when nlhs is 0). No input arguments are read.
29 | void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray*prhs[] ) {
30 | 
31 |     ocl_dev_mgr& dev_mgr = ocl_dev_mgr::getInstance();
32 |     const uint32_t devices_availble = (uint32_t)dev_mgr.get_avail_dev_num();
33 |     const mwSize num_rows = (mwSize)devices_availble;
34 | 
35 |     // plhs[0]: OpenCl device names
36 |     {
37 |         mxArray *names = mxCreateCellMatrix(num_rows, 1);
38 |         for (uint32_t i = 0; i < devices_availble; i++) {
39 |             const ocl_dev_mgr::ocl_device_info& info = dev_mgr.get_avail_dev_info(i);
40 |             // Built as std::string so long names are not cut off by a fixed-size
41 |             // buffer; c_str() still stops at the first NUL, as "%s" did.
42 |             const std::string label = std::string(info.name.c_str()) + "(" + info.ocl_version.c_str() + ")";
43 |             // The cell array takes ownership of the new string, so no duplicate is needed.
44 |             mxSetCell(names, i, mxCreateString(label.c_str()));
45 |         }
46 |         plhs[0] = names;
47 |     }
48 | 
49 |     // plhs[1]: OpenCl device type
50 |     if (nlhs > 1) {
51 |         mxArray *types = mxCreateCellMatrix(num_rows, 1);
52 |         for (uint32_t i = 0; i < devices_availble; i++) {
53 |             mxSetCell(types, i, mxCreateString(dev_mgr.getDeviceType(i).c_str()));
54 |         }
55 |         plhs[1] = types;
56 |     }
57 | 
58 |     // plhs[2]: OpenCl device mem size
59 |     if (nlhs > 2) {
60 |         mxArray *col = mxCreateNumericMatrix(num_rows, 1, mxUINT64_CLASS, mxREAL);
61 |         uint64_t *data = (uint64_t *)mxGetData(col);
62 |         for (uint32_t i = 0; i < devices_availble; i++) {
63 |             data[i] = dev_mgr.get_avail_dev_info(i).max_mem;
64 |         }
65 |         plhs[2] = col;
66 |     }
67 | 
68 |     // plhs[3]: OpenCl device WorkGroup size
69 |     if (nlhs > 3) {
70 |         mxArray *col = mxCreateNumericMatrix(num_rows, 1, mxUINT64_CLASS, mxREAL);
71 |         uint64_t *data = (uint64_t *)mxGetData(col);
72 |         for (uint32_t i = 0; i < devices_availble; i++) {
73 |             data[i] = dev_mgr.get_avail_dev_info(i).wg_size;
74 |         }
75 |         plhs[3] = col;
76 |     }
77 | 
78 |     // plhs[4]: OpenCl device LocalWork size
79 |     if (nlhs > 4) {
80 |         mxArray *col = mxCreateNumericMatrix(num_rows, 1, mxUINT64_CLASS, mxREAL);
81 |         uint64_t *data = (uint64_t *)mxGetData(col);
82 |         for (uint32_t i = 0; i < devices_availble; i++) {
83 |             data[i] = dev_mgr.get_avail_dev_info(i).lw_size;
84 |         }
85 |         plhs[4] = col;
86 |     }
87 | 
88 |     // plhs[5]: OpenCl device compute units
89 |     if (nlhs > 5) {
90 |         mxArray *col = mxCreateNumericMatrix(num_rows, 1, mxUINT64_CLASS, mxREAL);
91 |         uint64_t *data = (uint64_t *)mxGetData(col);
92 |         for (uint32_t i = 0; i < devices_availble; i++) {
93 |             data[i] = dev_mgr.get_avail_dev_info(i).compute_units;
94 |         }
95 |         plhs[5] = col;
96 |     }
97 | 
98 | }
99 | 


--------------------------------------------------------------------------------
/examples/filter.cl:
--------------------------------------------------------------------------------
  1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */
  2 | 
  3 | //2D median filter OpenCL example: each work item reads the 3x3 neighbourhood of pixel (x, y) from pSrc and writes one filtered pixel to pDst. WIDTH (a compiler define) is the row length used for indexing.
  4 | kernel void filter(global uchar* pSrc, global uchar* pDst)
  5 | {
  6 |     // NOTE(review): no bounds checks are done here, so the launch range must exclude the image border (examples/img_test.m uses an offset of 3 and a size reduced by 6)
  7 | 	const int x = get_global_id(0);
  8 |     const int y = get_global_id(1);
  9 |     // start offsets of the current, the previous and the next image row
 10 | 	const int iOffset = y * (uint)WIDTH;
 11 |     const int iPrev = iOffset - (uint)WIDTH;
 12 |     const int iNext = iOffset + (uint)WIDTH;
 13 | 
 14 |     // transfer pixels within median window to local variables (r0-r2: previous row, r3-r5: current row, r6-r8: next row)
 15 | 	uchar r0,r1,r2,r3,r4,r5,r6,r7,r8;
 16 | 	r0 = pSrc[iPrev + x - 1];
 17 |     r1 = pSrc[iPrev + x];
 18 |     r2 = pSrc[iPrev + x + 1];
 19 | 
 20 |     r3 = pSrc[iOffset + x - 1];
 21 |     r4 = pSrc[iOffset + x];
 22 |     r5 = pSrc[iOffset + x + 1];
 23 | 
 24 |     r6 = pSrc[iNext + x - 1];
 25 |     r7 = pSrc[iNext + x];
 26 |     r8 = pSrc[iNext + x + 1];
 27 |     // uiResult is declared but never used below
 28 |     uchar uiResult = 0;
 29 | 
 30 |         // perform partial bitonic sort to find the median value; every step below is one compare-exchange that puts the smaller value into the first register named on the two assignment lines and the larger into the second
 31 |         uchar uiMin = min(r0, r1);
 32 |         uchar uiMax = max(r0, r1);
 33 |         r0 = uiMin;
 34 |         r1 = uiMax;
 35 | 
 36 |         uiMin = min(r3, r2);
 37 |         uiMax = max(r3, r2);
 38 |         r3 = uiMin;
 39 |         r2 = uiMax;
 40 | 
 41 |         uiMin = min(r2, r0);
 42 |         uiMax = max(r2, r0);
 43 |         r2 = uiMin;
 44 |         r0 = uiMax;
 45 | 
 46 |         uiMin = min(r3, r1);
 47 |         uiMax = max(r3, r1);
 48 |         r3 = uiMin;
 49 |         r1 = uiMax;
 50 | 
 51 |         uiMin = min(r1, r0);
 52 |         uiMax = max(r1, r0);
 53 |         r1 = uiMin;
 54 |         r0 = uiMax;
 55 | 
 56 |         uiMin = min(r3, r2);
 57 |         uiMax = max(r3, r2);
 58 |         r3 = uiMin;
 59 |         r2 = uiMax;
 60 | 
 61 |         uiMin = min(r5, r4);
 62 |         uiMax = max(r5, r4);
 63 |         r5 = uiMin;
 64 |         r4 = uiMax;
 65 | 
 66 |         uiMin = min(r7, r8);
 67 |         uiMax = max(r7, r8);
 68 |         r7 = uiMin;
 69 |         r8 = uiMax;
 70 | 
 71 |         uiMin = min(r6, r8);
 72 |         uiMax = max(r6, r8);
 73 |         r6 = uiMin;
 74 |         r8 = uiMax;
 75 | 
 76 |         uiMin = min(r6, r7);
 77 |         uiMax = max(r6, r7);
 78 |         r6 = uiMin;
 79 |         r7 = uiMax;
 80 | 
 81 |         uiMin = min(r4, r8);
 82 |         uiMax = max(r4, r8);
 83 |         r4 = uiMin;
 84 |         r8 = uiMax;
 85 | 
 86 |         uiMin = min(r4, r6);
 87 |         uiMax = max(r4, r6);
 88 |         r4 = uiMin;
 89 |         r6 = uiMax;
 90 | 
 91 |         uiMin = min(r5, r7);
 92 |         uiMax = max(r5, r7);
 93 |         r5 = uiMin;
 94 |         r7 = uiMax;
 95 | 
 96 |         uiMin = min(r4, r5);
 97 |         uiMax = max(r4, r5);
 98 |         r4 = uiMin;
 99 |         r5 = uiMax;
100 | 
101 |         uiMin = min(r6, r7);
102 |         uiMax = max(r6, r7);
103 |         r6 = uiMin;
104 |         r7 = uiMax;
105 | 
106 |         uiMin = min(r0, r8);
107 |         uiMax = max(r0, r8);
108 |         r0 = uiMin;
109 |         r8 = uiMax;
110 |         // final merge: pairwise maxima, then minima of the partially sorted registers
111 |         r4 = max(r0, r4);
112 |         r5 = max(r1, r5);
113 | 
114 |         r6 = max(r2, r6);
115 |         r7 = max(r3, r7);
116 | 
117 |         r4 = min(r4, r6);
118 |         r5 = min(r5, r7);
119 |     // the smaller of the two remaining candidates is stored as the filtered pixel
120 | 	pDst[iOffset + x] = (uchar)min(r4, r5);
121 | 
122 | }
123 | 


--------------------------------------------------------------------------------
/src/ocl_dev_mgr.hpp:
--------------------------------------------------------------------------------
  1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */
  2 | 
  3 | #ifndef DEV_MGR_H
  4 | #define DEV_MGR_H
  5 | 
  6 | #include <vector>
  7 | 
  8 | // disable strange warnings for newer versions of GCC for OpenCL typedefs
  9 | #pragma GCC diagnostic ignored "-Wignored-attributes"
 10 | 
 11 | #define CL_HPP_ENABLE_EXCEPTIONS
 12 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120
 13 | #define CL_HPP_TARGET_OPENCL_VERSION 120
 14 | #if defined(__APPLE__)
 15 | #define CL_SILENCE_DEPRECATION
 16 | #include <OpenCL/cl2.hpp>
 17 | #else
 18 | #include <CL/cl2.hpp>
 19 | #endif
 20 | 
  21 | // Manager for OpenCL devices, contexts, programs and kernels, accessed through a single shared instance.
  22 | class ocl_dev_mgr {
  23 | public:
  24 |   ~ocl_dev_mgr() {};
  25 |   // Returns the shared instance (function-local static, created on first use).
  26 |   static ocl_dev_mgr& getInstance() {
  27 |     static ocl_dev_mgr instance;
  28 |     return instance;
  29 |   }
  30 |   // Properties stored for one OpenCL device.
  31 |   struct ocl_device_info{
  32 |     cl::Device device;
  33 |     std::string name;
  34 |     cl::Platform platform;
  35 |     std::string platform_name;
  36 |     std::string vendor;
  37 |     cl_device_type type;
  38 |     std::string ocl_version;
  39 |     cl_ulong max_mem;
  40 |     cl_ulong max_mem_alloc;
  41 |     size_t wg_size;
  42 |     cl_uint lw_dim;
  43 |     size_t lw_size;
  44 |     cl_uint compute_units;
  45 |     cl_uint copy_perf;
  46 |     cl_uint double_perf;
  47 |     cl_uint float_perf;
  48 |   };
  49 |   // NOTE(review): the member functions below are only declared here; their exact semantics live in ocl_dev_mgr.cpp. The *_idx parameters presumably index the device/context lists held by this class - confirm there.
  50 |   std::string getDevicePCIeID(cl_uint avail_device_idx);
  51 |   cl_ulong init_device(cl_uint avail_device_idx);
  52 |   cl::CommandQueue& get_queue(cl_uint context_idx, cl_uint queue_idx);
  53 |   cl::Context& get_context(cl_uint context_idx);
  54 |   cl::Program& get_program(cl_uint context_idx, std::string const& prog_name);
  55 |   cl_ulong get_avail_dev_num();
  56 |   cl_ulong get_context_num();
  57 |   ocl_device_info& get_avail_dev_info(cl_uint avail_device_idx);
  58 |   ocl_device_info& get_context_dev_info(cl_uint context_idx, cl_uint device_idx);
  59 |   cl_ulong compile_kernel(cl_uint context_idx, std::string const& prog_name, std::string const& options);
  60 |   cl_ulong get_kernel_names(cl_uint context_idx, std::string const& prog_name, std::vector<std::string>& found_kernels);
  61 |   cl_ulong execute_kernel(cl::Kernel& kernel, cl::CommandQueue& queue,
  62 |   cl::NDRange global_range, cl::NDRange local_range,
  63 |   std::vector<cl::Buffer*>& dev_Buffers);
  64 |   cl_ulong execute_kernelNA(cl::Kernel& kernel, cl::CommandQueue& queue,
  65 |   cl::NDRange range_start, cl::NDRange global_range, cl::NDRange local_range);
  66 |   void execute_kernel_async(cl::Kernel& kernel, cl::CommandQueue& queue,
  67 |   cl::NDRange global_range, cl::NDRange local_range,
  68 |   std::vector<cl::Buffer*>& dev_Buffers);
  69 |   bool add_program_url(cl_uint context_idx, std::string prog_name, std::string const& url);
  70 |   bool add_program_str(cl_uint context_idx, std::string prog_name, std::string kernel);
  71 |   cl::Kernel* getKernelbyName(cl_uint context_idx, std::string const& prog_name, std::string const& kernel_name);
  72 |   cl::Kernel* getKernelbyID(cl_uint context_idx, std::string const& prog_name, cl_ulong kernel_id);
  73 |   std::string getDeviceType(cl_uint avail_device_idx);
  74 |   void deinitalize();
  75 |   // Everything below is internal; the constructor is private, so instances can only be obtained via getInstance().
  76 | private:
  77 |   const std::string type_cpu_str = "CPU";
  78 |   const std::string type_gpu_str = "GPU";
  79 |   const std::string type_acc_str = "ACCELERATOR";
  80 |   const std::string type_other_str = "OTHER";
  81 |   // One OpenCL context bundled with the queues, programs, kernels (and their names) and devices kept for it.
  82 |   struct ocl_context {
  83 |     cl::Context context;
  84 |     std::vector<cl::CommandQueue> queues;
  85 |     std::vector<cl::Program> programs;
  86 |     std::vector<std::string> prog_names;
  87 |     std::vector<std::vector<cl::Kernel>> kernels;
  88 |     std::vector<std::vector<std::string>> kernel_names;
  89 |     std::vector<ocl_device_info> devices;
  90 |   };
  91 |   // NOTE(review): copy construction is not deleted, so the shared instance could be copied by accident.
  92 |   void initialize();
  93 |   ocl_dev_mgr();
  94 |   cl_ulong getDeviceList(std::vector<cl::Device>& devices);
  95 |   // Data members: the device list with its element count, and the list of contexts.
  96 |   std::vector<ocl_device_info> available_devices;
  97 |   cl_ulong num_available_devices;
  98 |   std::vector<ocl_context> con_list;
  99 | };
100 | 
101 | #endif // DEV_MGR_H
102 | 


--------------------------------------------------------------------------------
/run_kernel.m:
--------------------------------------------------------------------------------
 1 | %This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license.
 2 | 
 3 | clear all
 4 | close all
 5 | clc
 6 | 
 7 | %%
 8 | % Use cl_run_kernel to compile and launch kernels. It is possible to compile
 9 | % and run kernels in a two-stage process to increase performance or just use
10 | % a single step approach, that does everything in one go.
11 | 
12 | global_range=[10,1,1];  %Set global OpenCl Range. Default indexing is 3D. To use a 1D index set y and z to 1
13 | local_range=[0];        %Let OpenCL decide local range, otherwise specify range explicitly (like global range)
14 | 
15 | % Create input data for the kernel
16 | for i=1:20
17 |    in1(1,i)=double(1);
18 |    in1(2,i)=double(1);
19 |    in1(3,i)=double(1);
20 |    in1(4,i)=double(1);
21 | 
22 |    in2(1,i)=double(2);
23 |    in2(2,i)=double(2);
24 |    in2(3,i)=double(2);
25 |    in2(4,i)=double(2);
26 | end
27 | 
28 | % This example shows how to only compile the kernel but not run it. The
29 | % arguments are as follows:
30 | % - OpenCl Device ID - see cl_get_devices
31 | % - Kernel file URL
32 | % - Kernel defines, can be used to efficently define constant values or set
33 | % other compiler arguments
34 | %
35 | %This functions returns the compile time (in us) and an array with the names of the compiled kernel
36 | %functions
37 | %The OpenCL optimization flags -cl-mad-enable -cl-no-signed-zeros
38 | %-cl-finite-math-only were tested on diffrent devices and sould not cause
39 | %unexpected behaviour
40 | [comp_time,kernels]=cl_run_kernel(1,'test_kernel.cl','-DDT=1.0 -cl-mad-enable -cl-no-signed-zeros -cl-finite-math-only');
41 | 
42 | 
43 | % This example shows how to run a precompiled kernel. The
44 | % arguments are as follows:
45 | % - OpenCl Device ID - see cl_get_devices
46 | % - Name of the function to run or cell array of kernel names to queue
47 | % multiple kernels
48 | % - Global OpenCL Range used to launch the kernel (see OpenCL NDRange)
49 | % - Local OpenCL Range used to launch the kernel (see OpenCL NDRange). This
50 | % value can be set to 0 to let OpenCL decide the best values
51 | % - List of varaibles to be used by the kernel - they will be passed in the
52 | % same order to the kernel itself. In case these variables get changed by
53 | % the kernel, the value of the input variable will change automatically
54 | % - read/write flag for the Kernel variables, this can either be scalar(all variables are read&write) or a vector with an entry for each variable: 0 - read&write / 1 - kernel read
55 | % only / 2 - kernel write only.
56 | %
57 | %This function returns the runtime of the actual kernel and teh buffer copy time in us
58 | [run_time,copy_time]=cl_run_kernel(1,'test1',global_range,local_range,in1,in2,0);
59 | 
60 | % This example shows how to compile and execute a kernel in a single pass.
61 | %  The arguments are as follows:
62 | % - OpenCl Device ID - see cl_get_devices
63 | % - Kernel file URL
64 | % - Kernel defines, can be used to efficently define constant values or set
65 | % other compiler arguments
66 | % - Name of the function to run or cell array of kernel names to queue
67 | % multiple kernels
68 | % - Global OpenCL Range used to launch the kernel (see OpenCL NDRange)
69 | % - Local OpenCL Range used to launch the kernel (see OpenCL NDRange). This
70 | % value can be set to 0 to let OpenCL decide the best values
71 | % - List of varaibles to be used by the kernel - they will be passed in the
72 | % same order to the kernel itself. In case these variables get changed by
73 | % the kernel, the value of the input variable will change automatically
74 | % - read/write flag for the Kernel variables, this can either be scalar(all variables are read&write) or a vector with an entry for each variable: 0 - read&write / 1 - kernel read
75 | % only / 2 - kernel write only.
76 | %
77 | %This function returns the runtime of the actual kernel in ms
78 | [run_time]=cl_run_kernel(1,'test_kernel.cl','-DDT=5.0 -cl-mad-enable -cl-no-signed-zeros -cl-finite-math-only','test2',global_range,local_range,in1,in2,[0 1]);
79 | 
80 | 
81 |  %Same as above. but this functions pipes kernel printf to Matlab
82 | [run_time]=cl_dbg_kernel(1,'test_kernel.cl','-DDT=5.0 -cl-mad-enable -cl-no-signed-zeros -cl-finite-math-only','test2',global_range,local_range,in1,in2,[0 1]);
83 | 


--------------------------------------------------------------------------------
/src/utils.hpp:
--------------------------------------------------------------------------------
  1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */
  2 | 
  3 | #ifndef UTILS_H
  4 | #define UTILS_H
  5 | 
  6 | #include <stdio.h>
  7 | #include <fstream>
  8 | #include <iostream>
  9 | #include <string>
 10 | #include <iterator>
 11 | #include <vector>
 12 | 
 13 | 
 14 | #if defined(_WIN32)
 15 | #include <windows.h>
 16 | typedef cl_ulong uint64_t;
 17 | typedef unsigned int uint;
 18 | #else
 19 | #include <stdint.h>
 20 | #include <unistd.h>
 21 | #endif
 22 | 
 23 | class Timer
 24 | {
 25 | private:
 26 | #if defined(_WIN32)
 27 |     LARGE_INTEGER frequency_;
 28 |     DWORD         startTick_;
 29 |     LONGLONG      prevElapsedTime_;
 30 |     LARGE_INTEGER startTime_;
 31 | #else
 32 |     struct timespec startTime_;
 33 | #endif //_WIN32
 34 | 
 35 |     template <typename T>
 36 |     T _max(T a,T b)
 37 |     {
 38 |         return (a > b ? a : b);
 39 |     }
 40 | 
 41 |     uint64_t getTime(unsigned long long scale)
 42 |     {
 43 |         uint64_t ticks;
 44 | #if defined(_WIN32)
 45 |         LARGE_INTEGER currentTime;
 46 |         QueryPerformanceCounter(&currentTime);
 47 |         LONGLONG elapsedTime = currentTime.QuadPart - startTime_.QuadPart;
 48 | 
 49 |         // Compute the number of millisecond ticks elapsed.
 50 |         unsigned long msecTicks =
 51 |             (unsigned long)(1000 * elapsedTime / frequency_.QuadPart);
 52 |         // Check for unexpected leaps in the Win32 performance counter.
 53 |         // (This is caused by unexpected data across the PCI to ISA
 54 |         // bridge, aka south bridge.  See Microsoft KB274323.)
 55 |         unsigned long elapsedTicks = GetTickCount() - startTick_;
 56 | 
 57 |         signed long msecOff = (signed long)(msecTicks - elapsedTicks);
 58 |         if (msecOff < -100 || msecOff > 100) {
 59 |                 // Adjust the starting time forwards.
 60 |                 LONGLONG msecAdjustment =
 61 |                     _max(msecOff *
 62 |                         frequency_.QuadPart / 1000, elapsedTime -
 63 |                         prevElapsedTime_);
 64 |                 startTime_.QuadPart += msecAdjustment;
 65 |                 elapsedTime -= msecAdjustment;
 66 |         }
 67 |         // Store the current elapsed time for adjustments next time.
 68 |         prevElapsedTime_ = elapsedTime;
 69 | 
 70 |         ticks = (uint64_t)(scale*elapsedTime / frequency_.QuadPart);
 71 | #else
 72 |         struct timespec tp;
 73 |         ::clock_gettime(CLOCK_MONOTONIC, &tp);
 74 |         // check for overflow
 75 |         if ((tp.tv_nsec - startTime_.tv_nsec) < 0)
 76 |         {
 77 |             // Remove a second from the second field and add it to the
 78 |             // nanoseconds field to prevent overflow.
 79 |             // Then scale
 80 |             ticks = (uint64_t) (tp.tv_sec - startTime_.tv_sec - 1) * scale
 81 |                     + (uint64_t) ((1000ULL * 1000ULL * 1000ULL) + tp.tv_nsec - startTime_.tv_nsec)
 82 |                                   * scale / (1000ULL * 1000ULL * 1000ULL);
 83 |         }
 84 |         else
 85 |         {
 86 |             ticks = (uint64_t) (tp.tv_sec - startTime_.tv_sec) * scale
 87 |                 + (uint64_t) (tp.tv_nsec - startTime_.tv_nsec) * scale / (1000ULL * 1000ULL * 1000ULL);
 88 |         }
 89 | #endif //_WIN32
 90 | 
 91 |         return ticks;
 92 |     }
 93 | 
 94 | public:
 95 |     //! Constructor
 96 |     Timer()
 97 |     {
 98 | #if defined(_WIN32)
 99 |         QueryPerformanceFrequency(&frequency_);
100 | #endif
101 |         reset();
102 |     }
103 | 
104 |     //! Destructor
105 |     ~Timer()
106 |     {
107 |     }
108 | 
109 |     /*!
110 |      * \brief Resets timer such that in essence the elapsed time is zero
111 |      * from this point.
112 |      */
113 |     void reset()
114 |     {
115 | #if defined(_WIN32)
116 |         QueryPerformanceCounter(&startTime_);
117 |         startTick_ = GetTickCount();
118 |         prevElapsedTime_ = 0;
119 | #else
120 |         ::clock_gettime(CLOCK_MONOTONIC, &startTime_);
121 | #endif
122 |     }
123 | 
124 |     /*!
125 |      * \brief Calculates the time since the last reset.
126 |      * \returns The time in milli seconds since the last reset.
127 |      */
128 |     uint64_t getTimeMilliseconds(void)
129 |     {
130 |         return getTime(1000ULL);
131 |     }
132 | 
133 |     /*!
134 |      * \brief Calculates the time since the last reset.
135 |      * \returns The time in nano seconds since the last reset.
136 |      */
137 |     uint64_t getTimeNanoseconds(void)
138 |     {
139 |         return getTime(1000ULL * 1000ULL * 1000ULL);
140 |     }
141 | 
142 |     /*!
143 |      * \brief Calculates the time since the last reset.
144 |      * \returns The time in micro seconds since the last reset.
145 |      */
146 |     uint64_t getTimeMicroseconds(void)
147 |     {
148 |         return getTime(1000ULL * 1000ULL);
149 |     }
150 | 
151 |     /*!
152 |      * \brief Calculates the tick rate for millisecond counter.
153 |      */
154 |     float getMillisecondsTickRate(void)
155 |     {
156 |         return 1000.f;
157 |     }
158 | 
159 |     /*!
160 |      * \brief Calculates the tick rate for nanosecond counter.
161 |      */
162 |     float getNanosecondsTickRate(void)
163 |     {
164 |         return (float) (1000ULL * 1000ULL * 1000ULL);
165 |     }
166 | 
167 |     /*!
168 |      * \brief Calculates the tick rate for microsecond counter.
169 |      */
170 |     float getMicrosecondsTickRate(void)
171 |     {
172 |         return (float) (1000ULL * 1000ULL);
173 |     }
174 | };
175 | 
176 | #endif // UTILS_H
177 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # MatCL
  2 | 
  3 | 
  4 | [![License](https://licensebuttons.net/l/by-nc-nd/3.0/88x31.png)](https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode)
  5 | [![DOI](https://zenodo.org/badge/DOI/10.1145/3204919.3204927.svg)](https://doi.org/10.1145/3204919.3204927)
  6 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2531474.svg)](https://doi.org/10.5281/zenodo.2531474)
  7 | 
  8 | 
  9 | MatCL is an OpenCL interface for MathWorks Matlab. This MEX-based toolbox aims at providing a simple and easy to use solution to transfer memory and launch OpenCL kernels from Matlab using a single command.
 10 | In comparison to other Matlab OpenCL solutions, MatCL is not just an OpenCL API wrapper but encapsulates the low-level host API calls necessary to initialize devices, create OpenCL buffers from Matlab workspace variables and build and launch kernels.
 11 | MatCL is primarily intended to help in the development and testing of OpenCL kernels by allowing to transparently pass data from and to Matlab.
 12 | Because MatCL handles the entire low-level process, this toolbox makes it possible to execute kernels without in depth knowledge of the host implementation necessary to support the execution of OpenCL kernels.
 13 | MatCL is also optimized to allow efficient execution of OpenCL kernels within Matlab to accelerate computationally intensive tasks without having to rely on Nvidia CUDA. In addition to single command kernel execution, MatCL also allows for an independent two-step kernel compilation and launch workflow to save the kernel compile time and allow efficient repetitive kernel execution.
 14 | 
 15 | A practical example for how MatCL can be used for scientific research is the [Induction Equation](https://github.com/IANW-Projects/InductionEq) project.
 16 | 
 17 | Tested using Nvidia (Tesla, GTX), AMD (Ryzen, Radeon R9, FirePro) and Intel (Xeon, Core, HD Graphics) devices with Matlab R2015b and up.
 18 | 
 19 | ## Usage
 20 | 
 21 |  Usage information for the individual functions is available through the Matlab `help` command (e.g. `help cl_get_devices`) and the documentation browser (e.g. `doc cl_get_devices`).
 22 | 
 23 | - Enumerate OpenCL Devices (returns a list whose i-th entry corresponds to the i-th OpenCL device):
 24 |   `[names,dev_class,max_mem,max_wg_size,max_local_work_size,compute_units]=cl_get_devices;`
 25 |   - `names`: Names of all available devices
 26 |   - `dev_class`: The device class (CPU, GPU or Other for other or unknown Accelerators)
 27 |   - `max_mem`: The available device memory in bytes
 28 |   - `max_wg_size`: Max. size of OpenCL work group
 29 |   - `max_local_work_size`: Max. size of work items
 30 |   - `compute_units`: Number of compute units (e.g. CPU cores) of the device
 31 | 
 32 | - Build Kernel:
 33 |   `[comp_time,kernels]=cl_run_kernel(ocl_dev_id,'kernel_url.cl','defines');`
 34 |   - `comp_time`: Microseconds it took to compile the kernels
 35 |   - `kernels`: List with names of all available kernels
 36 | 
 37 |   - `ocl_dev_id`: ID of the OpenCL device to be used
 38 |   - `kernel_url.cl`: URL of the kernel file
 39 |   - `defines`: List of OpenCL compiler defines
 40 | 
 41 | - Run Kernel:
 42 |   `[run_time,copy_time]=cl_run_kernel(ocl_dev_id,{'kernel_function1','kernel_function2'},global_range,local_range,in1,out1,[rw_flags]);`
 43 |   - `run_time`: Microseconds it took to execute the kernels
 44 |   - `copy_time`: Microseconds it took to copy all buffers
 45 | 
 46 |   - `ocl_dev_id`: ID of the OpenCL device to be used
 47 |   - `kernel_function`: Cell array of kernel functions to execute (can also be a single string for just one kernel)
 48 |   - `global_range`: 3D global OpenCL range (see NDRange). If this vector has six entries, the first three define the 3D work offset followed by the 3D work size.
 49 |   - `local_range`: 3D local OpenCL range (see NDRange)
 50 |   - `in1, out1`: List of variables to pass from/to kernel
 51 |   - `rw_flags`: read/write flag for the Kernel variables, this can either be scalar (all variables are read&write) or a vector with an entry for each variable: 0 - read&write / 1 - kernel read only / 2 - kernel write only
 52 | 
 53 | - Build & Run Kernel:
 54 |   `[run_time]=cl_run_kernel(ocl_dev_id,'kernel_url.cl','defines','kernel_function',global_range,local_range,in1,out1,[rw_flags]);`
 55 |   - `run_time`: Microseconds it took to execute the kernels
 56 | 
 57 |   - `ocl_dev_id`: ID of the OpenCL device to be used
 58 |   - `kernel_url.cl`: URL of the kernel file
 59 |   - `defines`: List of OpenCL compiler defines
 60 |   - `kernel_function`: Cell array of kernel functions to execute (can also be a single string for just one kernel)
 61 |   - `global_range`: 3D global OpenCL range (see NDRange). If this vector has six entries, the first three define the 3D work offset followed by the 3D work size.
 62 |   - `local_range`: Local OpenCL range (see NDRange)
 63 |   - `in1, out1`: List of variables to pass from/to kernel
 64 |   - `rw_flags`: read/write flag for the Kernel variables, this can either be scalar (all variables are read&write) or a vector with an entry for each variable: 0 - read&write / 1 - kernel read only / 2 - kernel write only
 65 | 
 66 | - Build & Run Kernel (with Kernel printf redirection):
 67 |   `[run_time]=cl_dbg_kernel(ocl_dev_id,'kernel_url.cl','defines','kernel_function',global_range,local_range,in1,out1,[rw_flags]);`
 68 |   - `run_time`: Microseconds it took to execute the kernels (might be slower due to printf redirection)
 69 | 
 70 |   - `ocl_dev_id`: ID of the OpenCL device to be used
 71 |   - `kernel_url.cl`: URL of the kernel file
 72 |   - `defines`: List of OpenCL compiler defines
 73 |   - `kernel_function`: Cell array of kernel functions to execute (can also be a single string for just one kernel)
 74 |   - `global_range`: 3D global OpenCL range (see NDRange). If this vector has six entries, the first three define the 3D work offset followed by the 3D work size.
 75 |   - `local_range`: Local OpenCL range (see NDRange)
 76 |   - `in1, out1`: List of variables to pass from/to kernel
 77 |   - `rw_flags`: read/write flag for the Kernel variables, this can either be scalar (all variables are read&write) or a vector with an entry for each variable: 0 - read&write / 1 - kernel read only / 2 - kernel write only
 78 | 
 79 | 
 80 | ## Setup
 81 | 
 82 | Just use `git clone https://github.com/IANW-Projects/MatCL` and run `compile_linux.m`, `compile_windows.m`, or `compile_mac.m` to compile MatCL. Depending on the OpenCL libraries used, the library path may have to be changed in these files.
 83 | Then add the folder `MatCL` to the search path of Matlab.
 84 | **Alternatively, some precompiled binaries are available at https://github.com/IANW-Projects/MatCL/releases.**
 85 | 
 86 | There may be problems with old C/C++ libraries supplied by Matlab under Linux, resulting in errors such as
 87 | `Invalid MEX-file '/..../cl_get_devices.mex64'`, followed by many missing symbols. If you use
 88 | a Debian based system, install the package `matlab-support` via `sudo apt-get install matlab-support`
 89 | and choose the option to rename the GCC libraries of Matlab during setup.
 90 | 
 91 | 
 92 | ## Reference
 93 | 
 94 | MatCL can be referenced using the DOI [10.1145/3204919.3204927](https://doi.org/10.1145/3204919.3204927)
 95 | and the following bibtex entry.
 96 | ```
 97 | @inproceedings{heinisch2018MatCL,
 98 |   title={{MatCL}: {A} new easy-to use {OpenCL} toolbox for {MathWorks} {Matlab}},
 99 |   author={Heinisch, Philip and Ostaszewski, Katharina},
100 |   year={2018},
101 |   pages={8:1--8:1},
102 |   booktitle={Proceedings of the International Workshop on OpenCL},
103 |   series={IWOCL '18, May 2018, Oxford (United Kingdom)},
104 |   publisher={ACM},
105 |   address={New York, NY, USA},
106 |   note={\url{https://github.com/IANW-Projects/MatCL}},
107 |   doi={10.1145/3204919.3204927}
108 | }
109 | ```
110 | The latest release can be cited with
111 | ```
112 | @misc{MatCLGit,
113 |   title={{MatCL}: {A} new easy-to use {OpenCL} toolbox for {MathWorks} {Matlab}},
114 |   author={Heinisch, Philip and Ostaszewski, Katharina and Ranocha, Hendrik},
115 |   month={01},
116 |   year={2019},
117 |   howpublished={\url{https://github.com/IANW-Projects/MatCL}},
118 |   doi={10.5281/zenodo.2531474}
119 | }
120 | ```
121 |  ## License
122 | 
123 | This project is licensed under the terms of the Creative Commons [CC BY-NC-ND 4.0](https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode) license.
124 | 
125 | 
126 |  ## Disclaimer
127 | 
128 | Product and company names may be trademarks or registered trademarks of their respective holders.
129 | Use of them does not imply any affiliation with or endorsement by them or their affiliates.
130 | Everything is provided as is and without warranty. Use at your own risk!
131 | 


--------------------------------------------------------------------------------
/src/cl_run_kernel.cpp:
--------------------------------------------------------------------------------
  1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */
  2 | 
  3 | #include <math.h>
  4 | #include "mex.h"
  5 | #include "matrix.h"
  6 | #include <iostream>
  7 | #include <string>
  8 | #if defined(_WIN32)
  9 | #include <windows.h>
 10 | #include <io.h>
 11 | #define access    _access_s
 12 | #else
 13 | #include <unistd.h>
 14 | #endif
 15 | 
 16 | 
 17 | 
 18 | 
 19 | #define CL_HPP_ENABLE_EXCEPTIONS
 20 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120
 21 | #define CL_HPP_TARGET_OPENCL_VERSION 120
 22 | 
 23 | #include <CL/cl2.hpp>
 24 | #include "ocl_dev_mgr.hpp"
 25 | 
 26 | 
 27 | 
 28 | #include "MatCL.hpp"
 29 | 
 30 | 
 31 | class mystream : public std::streambuf
 32 | {
 33 | protected:
 34 | 	virtual std::streamsize xsputn(const char *s, std::streamsize n) { mexPrintf("%.*s", n, s); return n; }
 35 | 	virtual int overflow(int c = EOF) { if (c != EOF) { mexPrintf("%.1s", &c); } return 1; }
 36 | };
 37 | class scoped_redirect_cout
 38 | {
 39 | public:
 40 | 	scoped_redirect_cout() { old_buf = std::cout.rdbuf(); std::cout.rdbuf(&mout); }
 41 | 	~scoped_redirect_cout() { std::cout.rdbuf(old_buf); }
 42 | private:
 43 | 	mystream mout;
 44 | 	std::streambuf *old_buf;
 45 | };
 46 | static scoped_redirect_cout mycout_redirect;
 47 | 
 48 | 
 49 | 
 50 | 
 51 | void mexFunction( int nlhs, mxArray *plhs[],   int nrhs, const mxArray*prhs[] )
 52 | {
 53 | 
 54 | 	size_t buflen;
 55 | 	char *buf;
 56 | 	char *settings;
 57 | 	char *kernel_name_c;
 58 | 	bool blocking = CL_FALSE;
 59 | 	uint64_t mem_needed = 0;
 60 | 
 61 | 	std::vector<cl::Buffer> data_in;
 62 | 	std::vector<uint64_t> data_size;
 63 | 	std::vector<cl::Buffer*> dev_Buffers;
 64 | 
 65 | 	uint32_t global_range_x = 1;
 66 | 	uint32_t global_range_y = 1;
 67 | 	uint32_t global_range_z = 1;
 68 | 	uint32_t range_start_x = 0;
 69 | 	uint32_t range_start_y = 0;
 70 | 	uint32_t range_start_z = 0;
 71 | 	cl::NDRange range_start = cl::NullRange;
 72 | 	cl::NDRange global_range;
 73 | 	cl::NDRange local_range;
 74 | 
 75 | 	uint64_t startTransfer, transferTime;
 76 | 
 77 | 	Timer timer; //used to track performance
 78 | 
 79 | 	ocl_dev_mgr& dev_mgr = ocl_dev_mgr::getInstance();
 80 | 
 81 | 	uint32_t device = (uint32_t)mxGetScalar(prhs[0]) - 1;
 82 | 
 83 |     if (nrhs>2) {
 84 | 
 85 | 	if (device<dev_mgr.get_avail_dev_num() ) {
 86 | 
 87 | 
 88 | 		bool old_instance = false;
 89 | 		bool compile_only = false;
 90 | 
 91 | 		//reuse context  - kernels are already compiled
 92 | 		if ((dev_mgr.get_context_num() > 0) && (mxIsChar(prhs[2]) == 0)) {
 93 | 			old_instance = true;
 94 | 		//	mexPrintf("Old instance found, running kernels only...\n");
 95 | 		}
 96 | 
 97 | 		//only build kernels - does not execute anything
 98 | 		if (nrhs == 3) {
 99 | 			compile_only = true;
100 | 
101 | 			if (mxIsCell(prhs[1]) == true) {
102 | 				mexPrintf("Building multiple kernel files...\n");
103 | 			}
104 | 			else {
105 | 				mexPrintf("Building single kernel file...\n");
106 | 			}
107 | 		}
108 | 
109 | 		if (compile_only == true) {
110 | 
111 | 			mexPrintf("Device:  %s\n", dev_mgr.get_avail_dev_info(device).name.c_str());
112 | 
113 | 			dev_mgr.deinitalize();
114 | 
115 | 			dev_mgr.init_device(device);
116 | 
117 | 			buflen = mxGetN(prhs[nrhs - 1]) + 1; //get Kernel Settings
118 | 			settings = (char *)mxMalloc(buflen);
119 | 			mxGetString(prhs[nrhs - 1], settings, (mwSize)buflen);
120 | 
121 | 
122 | 		//get Kernel URL
123 | 			getKernel_info(plhs, nrhs, prhs, &dev_mgr);
124 | 
125 | 
126 | 			uint64_t kernels_found = 0;
127 | 			uint64_t comp_time;
128 | 			comp_time= timer.getTimeMicroseconds();
129 | 
130 | 			kernels_found = dev_mgr.compile_kernel(0, "ocl_Kernel", settings);
131 | 
132 | 			uint64_t  *comp_time_ptr;
133 | 			plhs[0] = mxCreateNumericMatrix(1, 1, mxUINT64_CLASS, mxREAL);
134 | 			comp_time_ptr = (uint64_t *)mxGetData(plhs[0]);
135 | 			comp_time_ptr[0] = timer.getTimeMicroseconds()-comp_time;
136 | 
137 | 			//  transferTime = (timer.getTimeMicroseconds() - startTransfer);
138 | 			// mexPrintf("Copy:  %d\n", transferTime);
139 | 
140 | 			if (kernels_found > 0) {
141 | 				mxArray * tmp_str;
142 | 				mxArray *cell_array_ptr;
143 | 				cell_array_ptr = mxCreateCellMatrix((mwSize)kernels_found, 1);
144 | 				for (uint32_t i = 0; i < kernels_found; i++) {
145 | 					//	mexPrintf("test: %s\n ", dev_mgr.getKernelbyID(i)->getInfo<CL_KERNEL_FUNCTION_NAME>());
146 | 					std::string kernel_name(dev_mgr.getKernelbyID(0, "ocl_Kernel",i)->getInfo<CL_KERNEL_FUNCTION_NAME>());
147 | 					tmp_str = mxCreateString(kernel_name.c_str());
148 | 					mxSetCell(cell_array_ptr, i, mxDuplicateArray(tmp_str));
149 | 				}
150 | 
151 | 				plhs[1] = cell_array_ptr;
152 | 
153 | 			}
154 | 
155 | 			//	mexLock(); //prevent matlab from unloading mex file to keep context alive
156 | 
157 | 		}
158 | 
159 | 	//this part only runs the kernel
160 | 	if ((compile_only == false) && (old_instance == true)) {
161 | 
162 | 		uint32_t num_in = (uint32_t)nrhs-5;//Number of input buffers
163 | 
164 | 		uint32_t var_offset = 4;
165 | 		uint64_t  copy_time;
166 | 
167 | 		std::vector<std::string> kernel_list;
168 | 
169 | 		mwSize cell_dims;
170 | 		mxArray *cellElement;
171 | 		if (mxIsCell(prhs[1]) == true) {
172 | 
173 | 
174 | 			cell_dims = mxGetNumberOfElements(prhs[1]);
175 | 			for (uint32_t icell = 0; icell < cell_dims; icell++) {
176 | 				cellElement = mxGetCell(prhs[1], icell);
177 | 
178 | 				buflen = mxGetN(cellElement) + 1;
179 | 				//mexPrintf("Size:  %d\n", buflen);
180 | 
181 | 				char *kernel_name_c;
182 | 				kernel_name_c = (char *)mxMalloc(buflen);
183 | 				mxGetString(cellElement, kernel_name_c, (mwSize)buflen);
184 | 				kernel_list.push_back(std::string(kernel_name_c));
185 | 			//	mexPrintf("Kernel-Name:  %s\n", kernel_name_c);
186 | 			}
187 | 		}
188 | 		else {
189 | 			buflen = mxGetN(prhs[1]) + 1; //get Kernel Name
190 | 			kernel_name_c = (char *)mxMalloc(buflen);
191 | 			mxGetString(prhs[1], kernel_name_c, (mwSize)buflen);
192 | 			kernel_list.push_back(std::string(kernel_name_c));
193 | 			//mexPrintf("Kernel-Name:  %s\n", kernel_name.c_str());
194 | 
195 | 		}
196 | 
197 | 		//NDRange settings
198 | 		//global range
199 | 
200 | 		size_t mrows = mxGetM(prhs[2]);
201 | 		size_t ncols = mxGetN(prhs[2]);
202 | 
203 | 		if ((mxIsDouble(prhs[2]) || (mxGetClassID(prhs[2]) == mxUINT32_CLASS)) && !mxIsComplex(prhs[2]) && (mrows * ncols == 3)) {
204 | 			if (mxIsDouble(prhs[2])) {
205 | 				double  *range_ptr;
206 | 			range_ptr = mxGetPr(prhs[2]);
207 | 
208 | 			global_range_x = (uint32_t)round(range_ptr[0]);
209 | 			global_range_y = (uint32_t)round(range_ptr[1]);
210 | 			global_range_z = (uint32_t)round(range_ptr[2]);
211 | 			}
212 | 			else {
213 | 				uint32_t  *range_ptr;
214 | 				range_ptr = (uint32_t *)mxGetData(prhs[2]);
215 | 
216 | 				global_range_x = (uint32_t)(range_ptr[0]);
217 | 				global_range_y = (uint32_t)(range_ptr[1]);
218 | 				global_range_z = (uint32_t)(range_ptr[2]);
219 | 
220 | 			}
221 | 
222 | 			global_range = cl::NDRange(global_range_x, global_range_y, global_range_z);
223 | 
224 | 		}
225 | 		else {
226 | 			if ((mxIsDouble(prhs[2]) || (mxGetClassID(prhs[2]) == mxUINT32_CLASS)) && !mxIsComplex(prhs[2]) && (mrows * ncols == 6)) {
227 | 				if (mxIsDouble(prhs[2])) {
228 | 					double  *range_ptr;
229 | 					range_ptr = mxGetPr(prhs[2]);
230 | 				range_start_x = (uint32_t)round(range_ptr[0]);
231 | 				range_start_y = (uint32_t)round(range_ptr[1]);
232 | 				range_start_z = (uint32_t)round(range_ptr[2]);
233 | 
234 | 				global_range_x = (uint32_t)round(range_ptr[3]);
235 | 				global_range_y = (uint32_t)round(range_ptr[4]);
236 | 				global_range_z = (uint32_t)round(range_ptr[5]);
237 | 
238 | 				}
239 | 				else {
240 | 					uint32_t  *range_ptr;
241 | 					range_ptr = (uint32_t *)mxGetData(prhs[2]);
242 | 
243 | 					range_start_x = (uint32_t)(range_ptr[0]);
244 | 					range_start_y = (uint32_t)(range_ptr[1]);
245 | 					range_start_z = (uint32_t)(range_ptr[2]);
246 | 
247 | 					global_range_x = (uint32_t)(range_ptr[3]);
248 | 					global_range_y = (uint32_t)(range_ptr[4]);
249 | 					global_range_z = (uint32_t)(range_ptr[5]);
250 | 				}
251 | 
252 | 				range_start = cl::NDRange(range_start_x, range_start_y, range_start_z);
253 | 				global_range = cl::NDRange(global_range_x, global_range_y, global_range_z);
254 | 			}
255 | 			else {
256 | 				mexErrMsgIdAndTxt("OpenCL:NDRange", "Invalid global range defined!");
257 | 			}
258 | 		}
259 | 
260 | 		//local range
261 | 
262 | 		mrows = mxGetM(prhs[3]);
263 | 		ncols = mxGetN(prhs[3]);
264 | 
265 | 		if ((mxIsDouble(prhs[3]) || (mxGetClassID(prhs[3]) == mxUINT32_CLASS)) && !mxIsComplex(prhs[3]) && (mrows + ncols == 4)) {
266 | 			if (mxIsDouble(prhs[3])) {
267 | 				double  *range_ptr;
268 | 				range_ptr = mxGetPr(prhs[3]);
269 | 			global_range_x = (uint32_t)round(range_ptr[0]);
270 | 			global_range_y = (uint32_t)round(range_ptr[1]);
271 | 			global_range_z = (uint32_t)round(range_ptr[2]);
272 | 
273 | 		}
274 | 		else {
275 | 			uint32_t  *range_ptr;
276 | 			range_ptr = (uint32_t *)mxGetData(prhs[3]);
277 | 
278 | 			global_range_x = (uint32_t)(range_ptr[0]);
279 | 			global_range_y = (uint32_t)(range_ptr[1]);
280 | 			global_range_z = (uint32_t)(range_ptr[2]);
281 | 		}
282 | 
283 | 			local_range = cl::NDRange(global_range_x, global_range_y, global_range_z);
284 | 
285 | 		}
286 | 		else {
287 | 			if (mrows + ncols == 2) {
288 | 				local_range = cl::NullRange;
289 | 			}
290 | 			else {
291 | 				mexErrMsgIdAndTxt("OpenCL:NDRange", "Invalid local range defined!");
292 | 				return;
293 | 			}
294 | 		}
295 | 
296 | 		runkernel(plhs, nrhs, prhs, kernel_list, num_in,var_offset, &dev_mgr, device, range_start,global_range, local_range, false,false,copy_time);
297 | 
298 | 		uint64_t  *copy_time_ptr;
299 | 		plhs[1] = mxCreateNumericMatrix(1, 1, mxUINT64_CLASS, mxREAL);
300 | 		copy_time_ptr = (uint64_t *)mxGetData(plhs[1]);
301 | 		copy_time_ptr[0] = copy_time;
302 | 
303 | 
304 | 
305 | 	}
306 | 
307 | 	//this part compiles and runs kernel
308 | 	//////////////////////////////////////////////////////////////////////////////////////////////////////////
309 | 	if ((compile_only == false) && (old_instance == false)) {
310 | 		//mexPrintf("Compile and run...\n");
311 | 
312 | 		dev_mgr.deinitalize();
313 | 
314 | 		compilerun(plhs, nrhs, prhs, &dev_mgr, device, false,false);
315 | 	}
316 | 	}
317 | 
318 | 	else {
319 | 		mexErrMsgIdAndTxt("MATLAB:cl_dev", "OpenCl Device not found!");
320 | 	}
321 |     } else {
322 |         mexErrMsgIdAndTxt("MATLAB:syntax", "Incorrect Syntax!");
323 |     }
324 | 
325 | 
326 | 
327 | 	return;
328 | 
329 | }
330 | 


--------------------------------------------------------------------------------
/src/ocl_dev_mgr.cpp:
--------------------------------------------------------------------------------
  1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */
  2 | 
  3 | #include <algorithm>
  4 | #include <fstream>
  5 | #include <iostream>
  6 | #include <iterator>
  7 | #include <sstream> 
  8 | #include "ocl_dev_mgr.hpp"
  9 | 
 10 | 
 11 | #if defined(_WIN32)
 12 | #include <io.h>
 13 | #define access _access_s
 14 | #else
 15 | #include <unistd.h>
 16 | #endif
 17 | 
 18 | 
 19 | // macros
 20 | #define STRINGIZE_(x) #x
 21 | #define STRINGIZE(x) STRINGIZE_(x)
 22 | 
 23 | #define ERROR_INFO "Error in line " STRINGIZE(__LINE__) " of " __FILE__ ":\n "
 24 | 
 25 | 
 26 | // file system functions
 27 | 
 28 | inline bool fileExists(std::string const& filename)
 29 | {
 30 |   return access(filename.c_str(), 0) == 0;
 31 | }
 32 | 
 33 | 
 34 | 
 35 | inline void compile(cl::Program& cl_prog, char const* options)
 36 | {
 37 |   std::string compile_options = std::string(" ") + std::string(options);
 38 | 
 39 | 	try {
 40 |     cl_prog.build(compile_options.c_str());
 41 | 	}
 42 | 	catch (cl::BuildError error) {
 43 |     std::string log = error.getBuildLog()[0].second;
 44 |     std::cerr << ERROR_INFO << "Build error:\n" << log << std::endl;
 45 | 	}
 46 | 	catch (cl::Error err) {
 47 |     std::cerr << ERROR_INFO << "Exception:" << err.what() << std::endl;
 48 | 	}
 49 | }
 50 | 
 51 | 
 52 | inline std::string loadProgram(std::string const& input_filename)
 53 | {
 54 |   std::ifstream input(input_filename.c_str());
 55 |   if (!input.is_open()) {
 56 |     std::cerr << ERROR_INFO << "Cannot open file '" << input_filename << "'." << std::endl;
 57 |     exit(1);
 58 |   }
 59 | 
 60 |   return std::string(std::istreambuf_iterator<char>(input), (std::istreambuf_iterator<char>()));
 61 | }
 62 | 
 63 | 
 64 | ocl_dev_mgr::ocl_dev_mgr() {
 65 |     initialize();
 66 | }
 67 | 
 68 | 
 69 | cl::Kernel* ocl_dev_mgr::getKernelbyName(cl_uint context_idx, std::string const& prog_name, std::string const& kernel_name)
 70 | {
 71 |   auto it_p = find(con_list.at(context_idx).prog_names.begin(), con_list.at(context_idx).prog_names.end(), prog_name);
 72 |   if (it_p == con_list.at(context_idx).prog_names.end()) {
 73 |     return nullptr;
 74 |   }
 75 | 
 76 |   uint32_t idx = distance(con_list.at(context_idx).prog_names.begin(), it_p);
 77 | 
 78 |   if (con_list.at(context_idx).kernels.at(idx).size() > 1) {
 79 |     for (cl_uint i = 0; i < con_list.at(context_idx).kernels.at(idx).size(); i++) {
 80 |       if (kernel_name == con_list.at(context_idx).kernel_names.at(idx).at(i)) {
 81 |         return &(con_list.at(context_idx).kernels.at(idx).at(i));
 82 |       }
 83 |     }
 84 |   }
 85 | 
 86 |   return &(con_list.at(context_idx).kernels.at(idx).at(0));
 87 | }
 88 | 
 89 | cl::Kernel* ocl_dev_mgr::getKernelbyID(cl_uint context_idx, std::string const& prog_name, cl_ulong kernel_id)
 90 | {
 91 |   auto it_p = find(con_list.at(context_idx).prog_names.begin(), con_list.at(context_idx).prog_names.end(), prog_name);
 92 |   if (it_p == con_list.at(context_idx).prog_names.end()) {
 93 |     return nullptr;
 94 |   }
 95 | 
 96 |   uint32_t idx = distance(con_list.at(context_idx).prog_names.begin(), it_p);
 97 | 
 98 |   return &(con_list.at(context_idx).kernels.at(idx).at(kernel_id));
 99 | }
100 | 
101 | 
102 | std::string ocl_dev_mgr::getDeviceType(cl_uint avail_device_idx)
103 | {
104 |   if (available_devices.at(avail_device_idx).type == CL_DEVICE_TYPE_CPU) {
105 |     return(type_cpu_str);
106 |   }
107 |   else if (available_devices.at(avail_device_idx).type == CL_DEVICE_TYPE_GPU) {
108 |     return(type_gpu_str);
109 |   }
110 |   else if (available_devices.at(avail_device_idx).type == CL_DEVICE_TYPE_ACCELERATOR) {
111 |     return(type_acc_str);
112 |   }
113 |   else {
114 |     return(type_other_str);
115 |   }
116 | }
117 | 
118 | std::string ocl_dev_mgr::getDevicePCIeID(cl_uint avail_device_idx)
119 | {
120 | #define CL_DEVICE_PCI_BUS_ID_NV 0x4008
121 | #define CL_DEVICE_PCI_SLOT_ID_NV 0x4009
122 | #define CL_DEVICE_TOPOLOGY_AMD  0x4037
123 | typedef union
124 | {
125 |   struct { cl_uint type; cl_uint data[5]; } raw;
126 |   struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
127 | } cl_device_topology_amd;
128 | 
129 |   cl_device_topology_amd amd_topo;
130 | 	cl_int bus_id;
131 | 	cl_int slot_id;
132 | 	std::ostringstream tmp_stream;
133 | 
134 |   std::size_t found = 0;
135 |   found = available_devices.at(avail_device_idx).vendor.find("NVIDIA");
136 |   if (found != std::string::npos) {
137 |     available_devices.at(avail_device_idx).device.getInfo(CL_DEVICE_PCI_BUS_ID_NV,&bus_id);
138 | 	  available_devices.at(avail_device_idx).device.getInfo(CL_DEVICE_PCI_SLOT_ID_NV, &slot_id);
139 | 
140 | 	  cl_uint domain, bus, dev, func;
141 | 	  domain = bus_id >> 8;
142 | 	  bus = bus_id & 0xff;
143 | 	  tmp_stream << domain << ":" << bus << ":" << slot_id;
144 |   }
145 |   else
146 |   {
147 |     found = available_devices.at(avail_device_idx).vendor.find("Advanced Micro Devices");
148 |     if (found != std::string::npos) {
149 |       available_devices.at(avail_device_idx).device.getInfo(CL_DEVICE_TOPOLOGY_AMD, &amd_topo);
150 |       tmp_stream << "0:" << (unsigned int)amd_topo.pcie.bus << ":" << (unsigned int)amd_topo.pcie.device; //Domain is not returned?
151 |     }
152 |   }
153 | 
154 | 	
155 | 	return tmp_stream.str();
156 | }
157 | 
// NOTE(review): file-scope variables with external linkage. getDevicePCIeID()
// declares its own locals with the same names, which shadow these; no other
// use is visible in this part of the file — confirm whether they are still
// needed.
cl_int bus_id;
cl_int slot_id;
160 | 
161 | cl_ulong ocl_dev_mgr::getDeviceList(std::vector<cl::Device>& devices)
162 | {
163 |   // Get list of platforms 
164 |   std::vector<cl::Platform> platforms;
165 |   cl::Platform::get(&platforms);
166 | 
167 |   // Enumerate devices
168 |   for (cl::Platform const& platform : platforms)
169 |   {
170 |     std::vector<cl::Device> plat_devices;
171 |     platform.getDevices(CL_DEVICE_TYPE_ALL, &plat_devices);
172 |     devices.insert(devices.end(), plat_devices.begin(), plat_devices.end());
173 |   }
174 | 
175 |   return devices.size();
176 | }
177 | 
178 | 
179 | cl_ulong ocl_dev_mgr::init_device(cl_uint avail_device_idx)
180 | {
181 |   ocl_context tmp_context;
182 | 
183 |   tmp_context.devices.push_back(available_devices.at(avail_device_idx));
184 | 
185 |   std::vector<cl::Device> tmp_devices;
186 |   tmp_devices.push_back(available_devices.at(avail_device_idx).device);
187 | 
188 |   cl::Context context(tmp_devices, NULL);
189 |   tmp_context.context = context;
190 | 
191 |   tmp_context.queues.push_back(cl::CommandQueue(tmp_context.context, CL_QUEUE_PROFILING_ENABLE));
192 | 	//push second queue for async copy
193 |   tmp_context.queues.push_back(cl::CommandQueue(tmp_context.context, CL_QUEUE_PROFILING_ENABLE));
194 | 
195 |   con_list.push_back(tmp_context);
196 | 
197 |   return con_list.size();
198 | }
199 | 
200 | cl::CommandQueue& ocl_dev_mgr::get_queue(cl_uint context_idx, cl_uint queue_idx)
201 | {
202 |   return con_list.at(context_idx).queues.at(queue_idx);
203 | }
204 | 
205 | cl::Context& ocl_dev_mgr::get_context(cl_uint context_idx)
206 | {
207 |   return con_list.at(context_idx).context;
208 | }
209 | 
210 | cl_ulong ocl_dev_mgr::get_avail_dev_num()
211 | {
212 |   return num_available_devices;
213 | }
214 | 
215 | cl_ulong ocl_dev_mgr::get_context_num()
216 | {
217 |   return con_list.size();
218 | }
219 | 
220 | 
221 | bool ocl_dev_mgr::add_program_url(cl_uint context_idx, std::string prog_name, std::string const& url)
222 | {
223 |   if (!fileExists(url)) {
224 |     return false;
225 |   }
226 | 
227 |   return add_program_str(context_idx, prog_name, loadProgram(url));
228 | }
229 | 
230 | bool ocl_dev_mgr::add_program_str(cl_uint context_idx, std::string prog_name, std::string kernel)
231 | {
232 |   con_list.at(context_idx).programs.push_back(cl::Program(con_list.at(context_idx).context, kernel));
233 |   con_list.at(context_idx).prog_names.push_back(prog_name);
234 |   con_list.at(context_idx).kernels.resize(con_list.at(context_idx).kernels.size() + 1);
235 |   con_list.at(context_idx).kernel_names.resize(con_list.at(context_idx).kernel_names.size() + 1);
236 |   return true;
237 | }
238 | 
239 | 
240 | cl::Program& ocl_dev_mgr::get_program(cl_uint context_idx, std::string const& prog_name)
241 | {
242 |   auto it_p = find(con_list.at(context_idx).prog_names.begin(), con_list.at(context_idx).prog_names.end(), prog_name);
243 |   if (it_p != con_list.at(context_idx).prog_names.end()) {
244 |     return con_list.at(context_idx).programs.at(distance(con_list.at(context_idx).prog_names.begin(), it_p));
245 |   }
246 |   else {
247 |     std::cerr << ERROR_INFO << "Program '" << prog_name << "' not found." << std::endl;
248 |     //TODO: Exception?
249 |     return con_list.at(context_idx).programs.at(0);
250 |   }
251 | }
252 | 
253 | 
254 | ocl_dev_mgr::ocl_device_info& ocl_dev_mgr::get_avail_dev_info(cl_uint avail_device_idx)
255 | {
256 |   return available_devices.at(avail_device_idx);
257 | }
258 | 
259 | 
260 | ocl_dev_mgr::ocl_device_info& ocl_dev_mgr::get_context_dev_info(cl_uint context_idx, cl_uint device_idx)
261 | {
262 |   return con_list.at(context_idx).devices.at(device_idx);
263 | }
264 | 
265 | 
266 | // return execution time in µs
267 | cl_ulong ocl_dev_mgr::execute_kernel(cl::Kernel& kernel, cl::CommandQueue& queue,
268 |   cl::NDRange global_range, cl::NDRange local_range,
269 |   std::vector<cl::Buffer*>& dev_Buffers)
270 | {
271 |   cl::Event event;
272 |   cl_ulong time_start, time_end;
273 | 
274 |   try {
275 |     for (cl_uint i = 0; i < dev_Buffers.size(); i++) {
276 |       kernel.setArg(i, *dev_Buffers[i]);
277 |     }
278 | 
279 |     queue.enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range, NULL, &event);
280 |     event.wait();
281 |     event.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
282 |     event.getProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, &time_start);
283 |   }
284 |   catch (cl::BuildError error) {
285 |     std::string log = error.getBuildLog()[0].second;
286 |     std::cerr << ERROR_INFO << "Build error:\n" << log << std::endl;
287 | 	}
288 |   catch (cl::Error err) {
289 |     std::cerr << ERROR_INFO << "Exception:" << err.what() << std::endl;
290 |   }
291 | 
292 |   return (time_end - time_start) / 1000;
293 | }
294 | 
295 | 
296 | // return execution time in µs
297 | cl_ulong ocl_dev_mgr::execute_kernelNA(cl::Kernel& kernel, cl::CommandQueue& queue,
298 | cl::NDRange range_start, cl::NDRange global_range, cl::NDRange local_range)
299 | {
300 |   cl::Event event;
301 |   cl_ulong time_start, time_end;
302 | 
303 |   try {
304 |     queue.enqueueNDRangeKernel(kernel, range_start, global_range, local_range, NULL, &event);
305 |     event.wait();
306 |     event.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
307 |     event.getProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, &time_start);
308 | 	}
309 |   catch (cl::BuildError error) {
310 |     std::string log = error.getBuildLog()[0].second;
311 |     std::cerr << ERROR_INFO << "Build error:\n" << log << std::endl;
312 |   }
313 |   catch (cl::Error err) {
314 |     std::cerr << ERROR_INFO << "Exception:" << err.what() << std::endl;
315 |   }
316 | 
317 |   return (time_end - time_start) / 1000;
318 | }
319 | 
320 | // don't return execution time in µs
321 | void ocl_dev_mgr::execute_kernel_async(cl::Kernel& kernel, cl::CommandQueue& queue,
322 |   cl::NDRange global_range, cl::NDRange local_range,
323 |   std::vector<cl::Buffer*>& dev_Buffers)
324 | {
325 |   try {
326 |     for (cl_uint i = 0; i < dev_Buffers.size(); i++) {
327 |     kernel.setArg(i, *dev_Buffers[i]);
328 |     }
329 | 
330 |     queue.enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range, NULL, NULL);
331 |   }
332 |   catch (cl::BuildError error) {
333 |     std::string log = error.getBuildLog()[0].second;
334 |     std::cerr << ERROR_INFO << "Build error:\n" << log << std::endl;
335 |   }
336 |   catch (cl::Error err) {
337 |     std::cerr << ERROR_INFO << "Exception:" << err.what() << std::endl;
338 |   }
339 | }
340 | 
341 | 
342 | // Compile kernels and return the number of compiled kernels.
343 | cl_ulong ocl_dev_mgr::compile_kernel(cl_uint context_idx, std::string const& prog_name, std::string const& options)
344 | {
345 |   std::string compile_options = std::string(" ") + options;
346 | 
347 |   auto it_p = find(con_list.at(context_idx).prog_names.begin(), con_list.at(context_idx).prog_names.end(), prog_name);
348 |   if (it_p == con_list.at(context_idx).prog_names.end()) {
349 |     std::cerr << ERROR_INFO << "Program '" << prog_name << "' not found." << std::endl;
350 |     //TODO: Exception?
351 |     return 0;
352 |   }
353 | 
354 |   int32_t idx = distance(con_list.at(context_idx).prog_names.begin(), it_p);
355 | 
356 |   try {
357 |     con_list.at(context_idx).programs.at(idx).build(compile_options.c_str());
358 |   }
359 |   catch (cl::BuildError error) {
360 |     std::string log = error.getBuildLog()[0].second;
361 |     std::cerr << ERROR_INFO << "Build error:\n" << log << std::endl;
362 |   }
363 |   catch (cl::Error err) {
364 |     std::cerr << ERROR_INFO << "Exception:" << err.what() << std::endl;
365 |   }
366 | 
367 |   con_list.at(context_idx).programs.at(idx).createKernels(&(con_list.at(context_idx).kernels.at(idx)));
368 | 
369 |   con_list.at(context_idx).kernel_names.at(idx).clear(); //make sure to clear kernel_names list
370 | 
371 |   for (uint32_t i = 0; i < con_list.at(context_idx).kernels.at(idx).size(); i++) {
372 |     con_list.at(context_idx).kernel_names.at(idx).push_back(con_list.at(context_idx).kernels.at(idx).at(i).getInfo<CL_KERNEL_FUNCTION_NAME>());
373 |   }
374 | 
375 |   return con_list.at(context_idx).kernels.at(idx).size();
376 | }
377 | 
378 | 
379 | cl_ulong ocl_dev_mgr::get_kernel_names(cl_uint context_idx, std::string const& prog_name, std::vector<std::string>& found_kernels)
380 | {
381 |   auto it_p = find(con_list.at(context_idx).prog_names.begin(), con_list.at(context_idx).prog_names.end(), prog_name);
382 |   if (it_p == con_list.at(context_idx).prog_names.end()) {
383 |     std::cerr << ERROR_INFO << "Program '" << prog_name << "' not found." << std::endl;
384 | 		//TODO: Exception?
385 |     return 0;
386 |   }
387 | 
388 |   int32_t idx = distance(con_list.at(context_idx).prog_names.begin(), it_p);
389 | 
390 |   for (uint32_t kernel_id = 0; kernel_id < con_list.at(context_idx).kernel_names.at(idx).size(); kernel_id++) {
391 |     found_kernels.push_back(con_list.at(context_idx).kernel_names.at(idx).at(kernel_id));
392 |   }
393 | 
394 |   return con_list.at(context_idx).kernel_names.at(idx).size();
395 | }
396 | 
397 | 
398 | void ocl_dev_mgr::initialize()
399 | {
400 |   std::vector<cl::Device> tmp_devices;
401 |   getDeviceList(tmp_devices);
402 |   num_available_devices = tmp_devices.size();
403 | 
404 |   available_devices = std::vector<ocl_device_info>(num_available_devices);
405 | 
406 | for (size_t i = 0; i < tmp_devices.size(); i++) {
407 | 
408 |     available_devices.at(i).device = tmp_devices.at(i);
409 |     std::vector<size_t> tmp_size;
410 | 
411 |     available_devices.at(i).device.getInfo(CL_DEVICE_GLOBAL_MEM_SIZE, &available_devices.at(i).max_mem);
412 |     available_devices.at(i).device.getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &available_devices.at(i).max_mem_alloc);
413 |     available_devices.at(i).device.getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, &available_devices.at(i).lw_dim);
414 |     available_devices.at(i).device.getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &available_devices.at(i).wg_size);
415 |     available_devices.at(i).device.getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &tmp_size);
416 |     available_devices.at(i).lw_size = tmp_size.at(0);
417 |     available_devices.at(i).device.getInfo(CL_DEVICE_NAME, &available_devices.at(i).name);
418 |     available_devices.at(i).device.getInfo(CL_DEVICE_VERSION, &available_devices.at(i).ocl_version);
419 |     available_devices.at(i).device.getInfo(CL_DEVICE_TYPE, &available_devices.at(i).type);
420 |     available_devices.at(i).device.getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &available_devices.at(i).compute_units);
421 |     available_devices.at(i).device.getInfo(CL_DEVICE_PLATFORM, &available_devices.at(i).platform);
422 |     available_devices.at(i).device.getInfo(CL_DEVICE_VENDOR, &available_devices.at(i).vendor);
423 |     available_devices.at(i).platform.getInfo(CL_PLATFORM_NAME, &available_devices.at(i).platform_name);
424 | 	}
425 | }
426 | 
427 | 
428 | void ocl_dev_mgr::deinitalize()
429 | {
430 | 	//Deinitialization should be performed automatically, but there seems to be segfaults
431 | 	//under certain conditions using Windows, hence the vetor is cleared manually
432 |   con_list.clear();
433 | }


--------------------------------------------------------------------------------
/src/MatCL.hpp:
--------------------------------------------------------------------------------
  1 | /* This project is licensed under the terms of the Creative Commons CC BY-NC-ND 4.0 license. */
  2 | 
  3 | #ifndef MATCL_H
  4 | #define MATCL_H
  5 | 
  6 | #include <math.h>
  7 | #include "mex.h"
  8 | #include "matrix.h"
  9 | #include <iostream>
 10 | #include <fstream>
 11 | #include <string>
 12 | #include <time.h>
 13 | #include <sstream>
 14 | #if defined(_WIN32)
 15 | #include <windows.h>
 16 | #include <io.h>
 17 | #include <process.h>
 18 | #endif
 19 | 
 20 | 
 21 | #define CL_HPP_ENABLE_EXCEPTIONS
 22 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120
 23 | #define CL_HPP_TARGET_OPENCL_VERSION 120
 24 | 
 25 | #include <CL/cl2.hpp>
 26 | #include "ocl_dev_mgr.hpp"
 27 | 
 28 | 
 29 | #include "utils.hpp"
 30 | inline bool FileExists(const std::string &Filename)
 31 | {
 32 | 	return access(Filename.c_str(), 0) == 0;
 33 | }
 34 | 
 35 | inline std::string loadProgram(std::string input)
 36 | {
 37 | 	std::ifstream stream(input.c_str());
 38 | 	if (!stream.is_open()) {
 39 | 		std::cout << "Cannot open file: " << input << std::endl;
 40 | 		exit(1);
 41 | 	}
 42 | 
 43 | 	return std::string(
 44 | 		std::istreambuf_iterator<char>(stream),
 45 | 		(std::istreambuf_iterator<char>()));
 46 | }
 47 | 
 48 | inline void remove_empty_lines(std::istream& in, std::ostream& out)
 49 | {
 50 | 	std::string line;
 51 | 
 52 | 	while (std::getline(in, line)) {
 53 | 		bool is_empty = true;
 54 | 		if (!line.empty()) {
 55 | 			for (uint32_t i = 0; i < line.length(); i++)
 56 | 			{
 57 | 				if ((line.at(i) != 32) && (line.at(i) != '\n')) {
 58 | 					is_empty = false;
 59 | 				}
 60 | 			}
 61 | 		}
 62 | 		if (is_empty == false) {
 63 | 			out << line << '\n';
 64 | 		}
 65 | 	}
 66 | }
 67 | 
 68 | 
 69 | int32_t getKernel_info(mxArray *plhs[], int nrhs, const mxArray*prhs[], ocl_dev_mgr *dev_mgr) {
 70 | 
 71 | 	size_t buflen;
 72 | 
 73 | 	//get Kernel URL
 74 | 	std::string kernel_data;
 75 | 
 76 | 	mwSize cell_dims;
 77 | 	mxArray *cellElement;
 78 | 	if (mxIsCell(prhs[1]) == true) {
 79 | 
 80 | 
 81 | 		cell_dims = mxGetNumberOfElements(prhs[1]);
 82 | 		for (uint32_t icell = 0; icell < cell_dims; icell++) {
 83 | 			cellElement = mxGetCell(prhs[1], icell);
 84 | 
 85 | 			buflen = mxGetN(cellElement) + 1;
 86 | 			//mexPrintf("Size:  %d\n", buflen);
 87 | 
 88 | 			char *kernel_url_c;
 89 | 			kernel_url_c = (char *)mxMalloc(buflen);
 90 | 			mxGetString(cellElement, kernel_url_c, (mwSize)buflen);
 91 | 			std::string kernel_url(kernel_url_c);
 92 | 			//mexPrintf("Kernel-URL:  %s\n", kernel_url_c);
 93 | 			if (FileExists(kernel_url) == true) {
 94 | 				kernel_data.append(loadProgram(kernel_url));
 95 | 				kernel_data.append("\n");
 96 | 			}
 97 | 			else {
 98 | 				mexErrMsgIdAndTxt("MATLAB:cl_program", "OpenCl Kernel file not found!");
 99 | 				return -1;
100 | 			}
101 | 
102 | 		}
103 | 		dev_mgr->add_program_str(0, "ocl_Kernel", kernel_data);
104 | 	}
105 | 	else {
106 | 		char *kernel_url_c;
107 | 		buflen = mxGetN(prhs[1]) + 1;
108 | 		kernel_url_c = (char *)mxMalloc(buflen);
109 | 		mxGetString(prhs[1], kernel_url_c, (mwSize)buflen);
110 | 		std::string kernel_url(kernel_url_c);
111 | 		//mexPrintf("Kernel-URL:  %s\n", kernel_url_c);
112 | 		if (dev_mgr->add_program_url(0, "ocl_Kernel", kernel_url) < 0) {  //Add kernel source
113 | 			mexErrMsgIdAndTxt("MATLAB:cl_program", "OpenCl Kernel file not found!");
114 | 			return -1;
115 | 		}
116 | 
117 | 	}
118 | 
119 | 
120 | 	return 0;
121 | 
122 | }
123 | 
124 | int32_t runkernel(mxArray *plhs[], int nrhs, const mxArray*prhs[], std::vector<std::string> &kernel_list, uint32_t num_in, uint32_t mvar_offset, ocl_dev_mgr *dev_mgr, uint32_t device,  cl::NDRange range_start, cl::NDRange global_range, cl::NDRange local_range, bool debug_mode, bool log_file, uint64_t &copy_time)
125 | {
126 | 	size_t buflen;
127 | 	char *buf;
128 | 	char *settings;
129 | 	char *kernel_name_c;
130 | 	bool blocking = CL_FALSE;
131 | 	uint64_t mem_needed = 0;
132 | 
133 | 	std::vector<cl::Buffer> data_in;
134 | 	std::vector<uint64_t> data_size;
135 | 
136 | 	uint64_t startTransfer, transferTime;
137 | 	Timer timer; //used to track performance
138 | 
139 | 				 //used for kernel printf
140 | #if defined(_WIN32)
141 | 	COORD buffer_size;
142 | 	SMALL_RECT rect;
143 | #endif
144 | #if !defined(_WIN32)
145 | 	char buffer[4096];
146 | 	auto fp = fmemopen(buffer, 4096, "w");
147 | 	auto old = stdout;
148 | 
149 | #endif
150 | 
151 | 
152 | 	uint32_t var_offset = mvar_offset;
153 | 
154 | 	//this part compiles and runs kernel
155 | 	//////////////////////////////////////////////////////////////////////////////////////////////////////////
156 | 
157 | 	//mexPrintf("Compile and run...\n");
158 | 
159 | 	if ((debug_mode == true) || ((log_file == true))) {
160 | #if defined(_WIN32)
161 | 
162 | 		AllocConsole();
163 | 
164 | #define con_rows 150
165 | #define con_cols 120
166 | 
167 | 
168 | 		//get info un biggest possible console buffer
169 | 		buffer_size = GetLargestConsoleWindowSize(GetStdHandle(STD_OUTPUT_HANDLE));
170 | 		if (buffer_size.X > con_cols) {
171 | 			buffer_size.X = con_cols;
172 | 		}
173 | 		if (buffer_size.Y > con_rows) {
174 | 			buffer_size.Y = con_rows;
175 | 		}
176 | 		rect = { 0, 0,  buffer_size.X - 1,buffer_size.Y - 1 };
177 | 		//std::cout << buffer_size.X << "%" << buffer_size.Y << std::endl;
178 | 		SetConsoleScreenBufferSize(GetStdHandle(STD_OUTPUT_HANDLE), buffer_size);
179 | 		SetConsoleWindowInfo(GetStdHandle(STD_OUTPUT_HANDLE), TRUE, &rect);
180 | 
181 | #endif
182 | #if !defined(_WIN32)
183 | 
184 | 		memset(buffer, 0, 4096);
185 | 		if (!fp) { printf("Error allocating buffer!"); return -1; }
186 | 
187 | 
188 | 		stdout = fp;
189 | 
190 | #endif
191 | 	}
192 | 
193 | 
194 | 	bool all_rw = mxIsScalar(prhs[var_offset + num_in]);
195 | 	double  *rw_flags_ptr;
196 | 
197 | 	if (all_rw == true) {
198 | 		//no read/write flags specified - treat all as rw buffer
199 | 		rw_flags_ptr = new double[num_in];
200 | 		std::fill(rw_flags_ptr, rw_flags_ptr + num_in, 0);
201 | 	}
202 | 	else {
203 | 		rw_flags_ptr = mxGetPr(prhs[var_offset + num_in]);
204 | 	}
205 | 
206 | 	uint64_t push_time, pull_time;
207 | 	push_time = timer.getTimeMicroseconds();
208 | 
209 | 	//create input OCL buffer
210 | 	for (uint32_t i = 0; i < num_in; i++) {
211 | 
212 | 		//mxGetM="datatype" 1=cl_float,2=cl_float2,4=cl_float4;
213 | 		//mxGetN=num_elements
214 | 
215 | 		uint64_t buf_size = mxGetN(prhs[var_offset])*mxGetM(prhs[var_offset]);
216 | 
217 | 		switch (mxGetClassID(prhs[var_offset])) {
218 | 		case mxSINGLE_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_float)); break;
219 | 		case mxDOUBLE_CLASS: buf_size = buf_size * uint64_t(sizeof(cl_double)); break;
220 | 		case mxINT8_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_char)); break;
221 | 		case mxUINT8_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_uchar)); break;
222 | 		case mxINT16_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_short)); break;
223 | 		case mxUINT16_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_ushort)); break;
224 | 		case mxINT32_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_int)); break;
225 | 		case mxUINT32_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_uint)); break;
226 | 		case mxINT64_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_long)); break;
227 | 		case mxUINT64_CLASS: buf_size = uint64_t(buf_size * sizeof(cl_ulong)); break;
228 | 		}
229 | 		//mexPrintf("I Buffer Size:  %ld,%d,%d\n", buf_size, mxGetN(prhs[var_offset]), mxGetM(prhs[var_offset]));
230 | 		//	std::cout << buf_size <<"/"<< dev_mgr.get_avail_dev_info(device).max_mem_alloc << std::endl;
231 | 		data_size.push_back(buf_size);
232 | 		mem_needed = mem_needed + buf_size;
233 | 		//	 mexPrintf("I Buffer Size:  %d\n", buf_size);
234 | 		//mexPrintf("Var Size:  %d\n", sizeof(cl_float));
235 | 		//	 mexPrintf("I Datatype: %s\n", mxGetClassName(prhs[var_offset]));
236 | 
237 | 		if (dev_mgr->get_avail_dev_info(device).max_mem_alloc < buf_size) {
238 | 			mexWarnMsgIdAndTxt("OpenCL:Dev_Mem", "Buffer size bigger than CL_DEVICE_MAX_MEM_ALLOC_SIZE!");
239 | 		}
240 | 		if ((mxIsScalar(prhs[var_offset]) == true) && ((uint32_t)round(rw_flags_ptr[i]) == 1)) {
241 | 			//mexPrintf( "Scalar Var: %d\n",i);
242 | 
243 | 			for (uint32_t kernel_idx = 0; kernel_idx < kernel_list.size(); kernel_idx++) {
244 | 				switch (mxGetClassID(prhs[var_offset])) {
245 | 				case mxSINGLE_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_float*)mxGetData(prhs[var_offset])); break;
246 | 				case mxDOUBLE_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_double*)mxGetData(prhs[var_offset])); break;
247 | 				case mxINT8_CLASS:   dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_char*)mxGetData(prhs[var_offset])); break;
248 | 				case mxUINT8_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_uchar*)mxGetData(prhs[var_offset])); break;
249 | 				case mxINT16_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_short*)mxGetData(prhs[var_offset])); break;
250 | 				case mxUINT16_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_ushort*)mxGetData(prhs[var_offset])); break;
251 | 				case mxINT32_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_int*)mxGetData(prhs[var_offset])); break;
252 | 				case mxUINT32_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_uint*)mxGetData(prhs[var_offset])); break;
253 | 				case mxINT64_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_long*)mxGetData(prhs[var_offset])); break;
254 | 				case mxUINT64_CLASS: dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, *(cl_ulong*)mxGetData(prhs[var_offset])); break;
255 | 				}
256 | 			}
257 | 
258 | 			//	mexPrintf("Scalar Var2: %f\n", *(cl_double*)mxGetData(prhs[var_offset]));
259 | 
260 | 		}
261 | 		else {
262 | 			//mexPrintf("Vec Var: %d\n", i);
263 | 			try {
264 | 				switch ((uint32_t)round(rw_flags_ptr[i])) {
265 | 				case 0:	data_in.push_back(cl::Buffer(dev_mgr->get_context(0), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, data_size.at(i))); dev_mgr->get_queue(0, 0).enqueueWriteBuffer(data_in.at(data_in.size() - 1), blocking, 0, data_size.at(i), mxGetData(prhs[var_offset]));  break;
266 | 				case 1:	data_in.push_back(cl::Buffer(dev_mgr->get_context(0), CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, data_size.at(i))); dev_mgr->get_queue(0, 0).enqueueWriteBuffer(data_in.at(data_in.size() - 1), blocking, 0, data_size.at(i), mxGetData(prhs[var_offset]));  break;
267 | 				case 2:	data_in.push_back(cl::Buffer(dev_mgr->get_context(0), CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, data_size.at(i))); break;
268 | 				}
269 | 				//dev_Buffers.push_back(&(data_in.at(i)));
270 | 
271 | 				for (uint32_t kernel_idx = 0; kernel_idx < kernel_list.size(); kernel_idx++) {
272 | 					//mexPrintf("Vec Var: %d for Kernel: %d with Name: %s\n", i,kernel_idx, kernel_list.at(kernel_idx));
273 | 					dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))->setArg(i, data_in.at(data_in.size() - 1));
274 | 				}
275 | 
276 | 			}
277 | 			catch (cl::Error err) {
278 | 				mexErrMsgIdAndTxt("OpenCL:exception", err.what());
279 | 			}
280 | 
281 | 		}
282 | 		var_offset++;
283 | 	}
284 | 
285 | 
286 | 	if (dev_mgr->get_avail_dev_info(device).max_mem < mem_needed*1.2) {
287 | 		mexWarnMsgIdAndTxt("OpenCL:Dev_Mem", "Device may be out of memory!");
288 | 	}
289 | 
290 | 
291 | 	//mexPrintf("kernel: %s\n ", dev_mgr.getKernelbyID(0, "ocl_Kernel", 0)->getInfo<CL_KERNEL_FUNCTION_NAME>());
292 | 	//mexPrintf("jernel: %s\n ", dev_mgr.getKernelbyID(0, "ocl_Kernel", 1)->getInfo<CL_KERNEL_FUNCTION_NAME>());
293 | 
294 | 	uint64_t  *exec_time_ptr;
295 | 	plhs[0] = mxCreateNumericMatrix(1, 1, mxUINT64_CLASS, mxREAL);
296 | 	exec_time_ptr = (uint64_t *)mxGetData(plhs[0]);
297 | 
298 | 	dev_mgr->get_queue(0, 0).finish();//Buffer Copy is asynchornous
299 | 	push_time = timer.getTimeMicroseconds() - push_time;
300 | 
301 | 	// transferTime = (timer.getTimeMicroseconds() - startTransfer);
302 | 	// mexPrintf("Copy:  %d\n", transferTime);
303 | 	exec_time_ptr[0] = 0;
304 | 	for (uint32_t kernel_idx = 0; kernel_idx < kernel_list.size(); kernel_idx++){
305 | 		exec_time_ptr[0] = exec_time_ptr[0] + dev_mgr->execute_kernelNA(*(dev_mgr->getKernelbyName(0, "ocl_Kernel", kernel_list.at(kernel_idx))), dev_mgr->get_queue(0, 0), range_start,global_range, local_range);
306 | 	}
307 | 
308 | 	var_offset = mvar_offset;
309 | 
310 | 	pull_time = timer.getTimeMicroseconds();
311 | 
312 | 	uint32_t buffer_counter = 0;
313 | 
314 | 	for (uint32_t i = 0; i < num_in; i++) {
315 | 
316 | 		if ((mxIsScalar(prhs[var_offset]) == true) && ((uint32_t)round(rw_flags_ptr[i]) == 1)) {
317 | 			//mexPrintf( "Scalar Var2: %d\n",i);
318 | 		//  do something?
319 | 		}
320 | 		else {
321 | 
322 | 			//  mexPrintf("O Buffer Size:  %d\n", data_size.at(i));
323 | 			//     mexPrintf("O Datatype: %s\n", mxGetClassName(prhs[var_offset]));
324 | 			try {
325 | 				switch ((uint32_t)round(rw_flags_ptr[i])) {
326 | 
327 | 				case 0:  dev_mgr->get_queue(0, 0).enqueueReadBuffer(data_in.at(buffer_counter), blocking, 0, data_size.at(buffer_counter), mxGetData(prhs[var_offset])); break;
328 | 				case 1: break;
329 | 				case 2: dev_mgr->get_queue(0, 0).enqueueReadBuffer(data_in.at(buffer_counter), blocking, 0, data_size.at(buffer_counter), mxGetData(prhs[var_offset])); break;
330 | 				}
331 | 
332 | 				buffer_counter++;
333 | 
334 | 			}
335 | 			catch (cl::Error err) {
336 | 				mexErrMsgIdAndTxt("OpenCL:exception", err.what());
337 | 			}
338 | 		}
339 | 		var_offset++;
340 | 	}
341 | 
342 | 	if ((debug_mode == true)||((log_file == true))) {
343 | #if defined(_WIN32)
344 | 
345 | 		CONSOLE_SCREEN_BUFFER_INFO csbiInfo;
346 | 
347 | 		GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbiInfo);
348 | 
349 | 		COORD newpos = { 0,0 };
350 | 		CHAR_INFO chiBuffer[con_rows*con_cols];
351 | 		buffer_size = { csbiInfo.srWindow.Right,csbiInfo.srWindow.Bottom };
352 | 
353 | 		std::stringstream console_output;
354 | 		std::stringstream final_output;
355 | 
356 | 		//memset(chiBuffer,0,sizeof(CHAR_INFO));
357 | 
358 | 		rect = { 0, 0,  csbiInfo.srWindow.Right, csbiInfo.srWindow.Bottom };
359 | 		ReadConsoleOutput(GetStdHandle(STD_OUTPUT_HANDLE), chiBuffer, buffer_size, newpos, &rect);
360 | 		FreeConsole();
361 | 		//	std::cout << csbiInfo.dwMaximumWindowSize.X <<"%"<< csbiInfo.dwMaximumWindowSize.Y << std::endl;
362 | 		//	std::cout << csbiInfo.srWindow.Right << "%" << csbiInfo.srWindow.Bottom << std::endl;
363 | 		for (int32_t i = 0; i < rect.Bottom*rect.Right; i++) {
364 | 			if ((chiBuffer[i].Char.AsciiChar > 1) && (chiBuffer[i].Char.AsciiChar < 255))
365 | 
366 | 				if (i % (rect.Right + 1) == 0) {
367 | 					console_output << std::endl << (char)chiBuffer[i].Char.AsciiChar;
368 | 				}
369 | 				else {
370 | 					console_output << (char)chiBuffer[i].Char.AsciiChar;
371 | 				}
372 | 		}
373 | 		remove_empty_lines(console_output, final_output);
374 | 
375 | 		mxArray * tmp_str;
376 | 
377 | 		tmp_str = mxCreateString(final_output.str().c_str());
378 | 		plhs[1] = tmp_str;
379 | 
380 | 		if (debug_mode == true) {
381 | 			std::cout << final_output.str() << std::endl;
382 | 		}
383 | 
384 | 		if (log_file == true) {
385 | 			FILE *fp;
386 | 			char log_timestamp[100];
387 | 			char log_filename[200];
388 | 
389 | 			time_t now = time(0);
390 | 			strftime(log_timestamp, 100, "%m-%d_%H-%M-%S", localtime(&now));
391 | 
392 | 				snprintf(log_filename, 199, "log_%s_%s.txt", log_timestamp, kernel_list.at(0).c_str());
393 | 
394 | 		//	mexPrintf("%s...\n", log_filename);
395 | 
396 | 			fp = fopen(log_filename, "w");
397 | 			if (!fp) {
398 | 				mexErrMsgIdAndTxt("MATLAB:FILE", "Can't create Log file!");
399 | 			}
400 | 			else {
401 | 				fputs(final_output.str().c_str(), fp);
402 | 				fclose(fp);
403 | 			}
404 | 		}
405 | 
406 | 
407 | 
408 | #endif
409 | #if !defined(_WIN32)
410 | 		std::fclose(fp);
411 | 		stdout = old; //reset stdout
412 | 		mxArray * tmp_str;
413 | 
414 | 		tmp_str = mxCreateString(buffer);
415 | 		plhs[1] = tmp_str;
416 | 
417 | 		if (debug_mode == true) {
418 | 			std::cout << buffer << std::endl;
419 | 		}
420 | 		if (log_file == true) {
421 | 			FILE *fp;
422 | 			char log_timestamp[100];
423 | 			char log_filename[200];
424 | 
425 | 			time_t now = time(0);
426 | 			strftime(log_timestamp, 100, "%m-%d_%H-%M-%S", localtime(&now));
427 | 
428 | 			snprintf(log_filename, 199, "log_%s_%s.txt", log_timestamp, kernel_list.at(0).c_str());
429 | 			//	mexPrintf("%s...\n", log_filename);
430 | 
431 | 			fp = fopen(log_filename, "w");
432 | 			if (!fp) {
433 | 				mexErrMsgIdAndTxt("MATLAB:FILE", "Can't create Log file!");
434 | 			}
435 | 			else {
436 | 				fputs(buffer, fp);
437 | 				fclose(fp);
438 | 			}
439 | 		}
440 | 
441 | #endif
442 | 	}
443 | 
444 | 
445 | 	dev_mgr->get_queue(0, 0).finish();
446 | 
447 | 	pull_time = timer.getTimeMicroseconds() - pull_time;
448 | 	copy_time= push_time+pull_time;
449 | 
450 | 
451 | 	return 0;
452 | 
453 | 
454 | }
455 | 
456 | 
457 | int32_t compilerun(mxArray *plhs[], int nrhs, const mxArray*prhs[], ocl_dev_mgr *dev_mgr, uint32_t device, bool debug_mode,bool log_file)
458 | {
459 | 
460 | 
461 | 	bool blocking = CL_FALSE;
462 | 	uint64_t mem_needed = 0;
463 | 	std::vector<std::string> kernel_list;
464 | 
465 | 	std::vector<cl::Buffer> data_in;
466 | 	std::vector<uint64_t> data_size;
467 | 
468 | 	uint32_t global_range_x = 1;
469 | 	uint32_t global_range_y = 1;
470 | 	uint32_t global_range_z = 1;
471 | 	uint32_t range_start_x = 0;
472 | 	uint32_t range_start_y = 0;
473 | 	uint32_t range_start_z = 0;
474 | 	cl::NDRange range_start= cl::NullRange;
475 | 	cl::NDRange global_range;
476 | 	cl::NDRange local_range;
477 | 
478 | 	uint64_t  copy_time;
479 | 
480 | 	Timer timer; //used to track performance
481 | 
482 | 	//this part compiles and runs kernel
483 | 	//////////////////////////////////////////////////////////////////////////////////////////////////////////
484 | 
485 | 	//mexPrintf("Compile and run...\n");
486 | 
487 | 	dev_mgr->init_device(device);
488 | 
489 | 
490 | 	size_t buflen;
491 | 	char *buf;
492 | 	char *settings;
493 | 
494 | 	buflen = mxGetN(prhs[2]) + 1; //get Kernel Settings
495 | 	settings = (char *)mxMalloc(buflen);
496 | 	mxGetString(prhs[2], settings, (mwSize)buflen);
497 | 	//mexPrintf("Kernel-Settings:  %s\n", settings);
498 | 
499 | 	getKernel_info(plhs, nrhs, prhs, dev_mgr);
500 | 
501 | 	mwSize cell_dims;
502 | 	mxArray *cellElement;
503 | 	if (mxIsCell(prhs[3]) == true) {
504 | 
505 | 
506 | 		cell_dims = mxGetNumberOfElements(prhs[3]);
507 | 		for (uint32_t icell = 0; icell < cell_dims; icell++) {
508 | 			cellElement = mxGetCell(prhs[3], icell);
509 | 
510 | 			buflen = mxGetN(cellElement) + 1;
511 | 			//mexPrintf("Size:  %d\n", buflen);
512 | 
513 | 			char *kernel_name_c;
514 | 			kernel_name_c = (char *)mxMalloc(buflen);
515 | 			mxGetString(cellElement, kernel_name_c, (mwSize)buflen);
516 | 			kernel_list.push_back(std::string(kernel_name_c));
517 | 		//	mexPrintf("Kernel-Name:  %s\n", kernel_name_c);
518 | 		}
519 | 	}
520 | 	else {
521 | 		char *kernel_name_c;
522 | 		buflen = mxGetN(prhs[3]) + 1; //get Kernel Name
523 | 		kernel_name_c = (char *)mxMalloc(buflen);
524 | 		mxGetString(prhs[3], kernel_name_c, (mwSize)buflen);
525 | 		kernel_list.push_back(std::string(kernel_name_c));
526 | 		//mexPrintf("Kernel-Name:  %s\n", kernel_name.c_str());
527 | 
528 | 	}
529 | 
530 | 	uint64_t kernels_found = 0;
531 | 
532 | 	kernels_found = dev_mgr->compile_kernel(0, "ocl_Kernel", settings);
533 | 	if (kernels_found == 0) {
534 | 		mexErrMsgIdAndTxt("OpenCL:Kernel", "No valid kernels found");
535 | 		return -1;
536 | 	}
537 | 
538 | 
539 | 	//NDRange settings
540 | 	//global range
541 | 
542 | 	size_t mrows = mxGetM(prhs[4]);
543 | 	size_t ncols = mxGetN(prhs[4]);
544 | 
545 | 	if ((mxIsDouble(prhs[4]) || (mxGetClassID(prhs[4])== mxUINT32_CLASS)) && !mxIsComplex(prhs[4]) && (mrows * ncols == 3)) {
546 | 		if (mxIsDouble(prhs[4])) {
547 | 
548 | 			double  *range_ptr;
549 | 			range_ptr = mxGetPr(prhs[4]);
550 | 				global_range_x = (uint32_t)round(range_ptr[0]);
551 | 				global_range_y = (uint32_t)round(range_ptr[1]);
552 | 				global_range_z = (uint32_t)round(range_ptr[2]);
553 | 		}
554 | 		else {
555 | 			uint32_t  *range_ptr;
556 | 			range_ptr = (uint32_t *)mxGetData(prhs[4]);
557 | 
558 | 			global_range_x = (uint32_t)(range_ptr[0]);
559 | 			global_range_y = (uint32_t)(range_ptr[1]);
560 | 			global_range_z = (uint32_t)(range_ptr[2]);
561 | 
562 | 		}
563 | 		global_range = cl::NDRange(global_range_x, global_range_y, global_range_z);
564 | 
565 | 	}
566 | 	else {
567 | 		if ((mxIsDouble(prhs[4]) || (mxGetClassID(prhs[4]) == mxUINT32_CLASS)) && !mxIsComplex(prhs[4]) && (mrows * ncols == 6)) {
568 | 			if (mxIsDouble(prhs[4])) {
569 | 			double  *range_ptr;
570 | 			range_ptr = mxGetPr(prhs[4]);
571 | 
572 | 			range_start_x = (uint32_t)round(range_ptr[0]);
573 | 			range_start_y = (uint32_t)round(range_ptr[1]);
574 | 			range_start_z = (uint32_t)round(range_ptr[2]);
575 | 
576 | 			global_range_x = (uint32_t)round(range_ptr[3]);
577 | 			global_range_y = (uint32_t)round(range_ptr[4]);
578 | 			global_range_z = (uint32_t)round(range_ptr[5]);
579 | 
580 | 			}
581 | 			else {
582 | 				uint32_t  *range_ptr;
583 | 				range_ptr = (uint32_t *)mxGetData(prhs[4]);
584 | 
585 | 				range_start_x = (uint32_t)(range_ptr[0]);
586 | 				range_start_y = (uint32_t)(range_ptr[1]);
587 | 				range_start_z = (uint32_t)(range_ptr[2]);
588 | 
589 | 				global_range_x = (uint32_t)(range_ptr[3]);
590 | 				global_range_y = (uint32_t)(range_ptr[4]);
591 | 				global_range_z = (uint32_t)(range_ptr[5]);
592 | 			}
593 | 			range_start = cl::NDRange(range_start_x, range_start_y, range_start_z);
594 | 			global_range = cl::NDRange(global_range_x, global_range_y, global_range_z);
595 | 		}
596 | 		else {
597 | 			mexErrMsgIdAndTxt("OpenCL:NDRange", "Invalid global range defined!");
598 | 			return -1;
599 | 		}
600 | 	}
601 | 
602 | 	//local range
603 | 
604 | 	mrows = mxGetM(prhs[5]);
605 | 	ncols = mxGetN(prhs[5]);
606 | 
607 | 	if ((mxIsDouble(prhs[5]) || (mxGetClassID(prhs[5]) == mxUINT32_CLASS)) && (mrows + ncols == 4)) {
608 | 		if (mxIsDouble(prhs[5])) {
609 | 			double  *range_ptr;
610 | 			range_ptr = mxGetPr(prhs[5]);
611 | 			global_range_x = (uint32_t)round(range_ptr[0]);
612 | 			global_range_y = (uint32_t)round(range_ptr[1]);
613 | 			global_range_z = (uint32_t)round(range_ptr[2]);
614 | 		}
615 | 		else {
616 | 			uint32_t  *range_ptr;
617 | 			range_ptr = (uint32_t *)mxGetData(prhs[5]);
618 | 
619 | 			global_range_x = (uint32_t)(range_ptr[0]);
620 | 			global_range_y = (uint32_t)(range_ptr[1]);
621 | 			global_range_z = (uint32_t)(range_ptr[2]);
622 | 		}
623 | 		local_range = cl::NDRange(global_range_x, global_range_y, global_range_z);
624 | 	//	printf("Local work Size: %d/%d/%d\n", global_range_x, global_range_y, global_range_z);
625 | 
626 | 	}
627 | 	else {
628 | 		if (mrows + ncols == 2) {
629 | 			local_range = cl::NullRange;
630 | 		}
631 | 		else {
632 | 			mexErrMsgIdAndTxt("OpenCL:NDRange", "Invalid local range defined!");
633 | 			return -1;
634 | 		}
635 | 
636 | 	}
637 | 
638 | 
639 | 
640 | 	uint32_t num_in = (uint32_t)nrhs - 7;//Number of input buffers
641 | 
642 | 	uint32_t var_offset = 6;
643 | 
644 | 
645 | 
646 | 	runkernel(plhs, nrhs, prhs, kernel_list, num_in,var_offset, dev_mgr, device, range_start, global_range, local_range, debug_mode,log_file,copy_time);
647 | 
648 | 	if (debug_mode == false) {
649 | 		uint64_t  *copy_time_ptr;
650 | 		plhs[1] = mxCreateNumericMatrix(1, 1, mxUINT64_CLASS, mxREAL);
651 | 		copy_time_ptr = (uint64_t *)mxGetData(plhs[1]);
652 | 		copy_time_ptr[0] = copy_time;
653 | 	}
654 | 
655 | 	return 0;
656 | }
657 | 
658 | #endif // MATCL_H
659 | 


--------------------------------------------------------------------------------