├── bin
    └── .keep
├── obj
    └── .keep
├── .gitignore
├── images
    ├── compare.jpg
    └── details.jpg
├── libs
    ├── lib
    │   └── OpenCL.lib
    └── include
    │   ├── CL
    │       ├── opencl.h
    │       ├── cl_gl_ext.h
    │       ├── cl_d3d10_ext.h
    │       ├── cl_d3d11_ext.h
    │       ├── cl_d3d9_ext.h
    │       └── cl_gl.h
    │   └── json
    │       └── gason.h
├── test
    ├── data
    │   ├── color_grid.png
    │   ├── color_grid2.jpg
    │   ├── config_non_parseable.json
    │   ├── color_grid2_luma_swapped.png
    │   ├── config.json
    │   ├── config_invalid_val.json
    │   └── test_cases.json
    ├── specs
    │   ├── SubtractFromAllTest.cpp
    │   ├── LayerDeltasTest_script.py
    │   ├── SumTest.cpp
    │   ├── TestSpecsDeclarations.hpp
    │   ├── ExtractLumaTest.cpp
    │   ├── LumaTests_script.py
    │   ├── SquaredErrorTest.cpp
    │   ├── LastLayerDeltaTest.cpp
    │   ├── SwapLumaTest.cpp
    │   ├── BackpropagationTest_script.py
    │   ├── UpdateParametersTest.cpp
    │   ├── ConfigTest.cpp
    │   ├── LayerTest_script.R
    │   ├── LayerTest.cpp
    │   ├── BackpropagationTest.cpp
    │   └── LayerDeltasTest.cpp
    ├── TestCase.hpp
    ├── TestRunner.cpp
    └── TestCase.cpp
├── src
    ├── kernel
    │   ├── subtract_from_all.cl
    │   ├── greyscale.cl
    │   ├── extract_luma.cl
    │   ├── update_parameters.cl
    │   ├── last_layer_delta.cl
    │   ├── sum.cl
    │   ├── swap_luma.cl
    │   ├── squared_error.cl
    │   ├── layer_uber_kernel.cl
    │   ├── backpropagate.cl
    │   └── layer_deltas.cl
    ├── Config.hpp
    ├── LayerData.hpp
    ├── LayerData.cpp
    ├── opencl
    │   ├── UtilsOpenCL.hpp
    │   ├── Kernel.hpp
    │   ├── Kernel.cpp
    │   └── UtilsOpenCL.cpp
    ├── pch.hpp
    ├── ConfigBasedDataPipeline.hpp
    ├── Config.cpp
    └── DataPipeline.hpp
├── example_config.json
├── LICENSE
├── profile.py
├── makefile
├── generate_training_samples.py
├── schedule_training.py
├── weights_visualize.py
└── README.md


/bin/.keep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/obj/.keep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /bin
2 | /obj/*o
3 | /data
4 | /logs
5 | 


--------------------------------------------------------------------------------
/images/compare.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scthe/cnn-Super-Resolution/HEAD/images/compare.jpg


--------------------------------------------------------------------------------
/images/details.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scthe/cnn-Super-Resolution/HEAD/images/details.jpg


--------------------------------------------------------------------------------
/libs/lib/OpenCL.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scthe/cnn-Super-Resolution/HEAD/libs/lib/OpenCL.lib


--------------------------------------------------------------------------------
/test/data/color_grid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scthe/cnn-Super-Resolution/HEAD/test/data/color_grid.png


--------------------------------------------------------------------------------
/test/data/color_grid2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scthe/cnn-Super-Resolution/HEAD/test/data/color_grid2.jpg


--------------------------------------------------------------------------------
/test/data/config_non_parseable.json:
--------------------------------------------------------------------------------
1 | {
2 | 	"n1": 64,
3 | 	"n2": 32,
4 | 	"f1":
5 | 	"f2": 1,
6 | 	"f3": 5
7 | }
8 | 


--------------------------------------------------------------------------------
/test/data/color_grid2_luma_swapped.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scthe/cnn-Super-Resolution/HEAD/test/data/color_grid2_luma_swapped.png


--------------------------------------------------------------------------------
/src/kernel/subtract_from_all.cl:
--------------------------------------------------------------------------------
/**
 * Subtracts a scalar from the leading `len` elements of a buffer, in place.
 *
 * Only dimension 0 of the global id is used; work items whose id is at or
 * past `len` return without touching memory.
 *
 * @param data   buffer that is read and overwritten
 * @param value  scalar subtracted from each processed element
 * @param len    number of elements of `data` to process
 */
__kernel void sub_from_all(__global float* data,  //
                           __const float value,   //
                           __const uint len) {
  const int gid = get_global_id(0);
  if (gid >= len) {
    return;
  }
  data[gid] -= value;
}
9 | 


--------------------------------------------------------------------------------
/test/data/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "n1": 32,
 3 | 	"n2": 16,
 4 | 	"f1": 9,
 5 | 	"f2": 1,
 6 | 	"f3": 5,
 7 |   "momentum": 123.5,
 8 |   "weight_decay_parameter": 0.1,
 9 |   "learning_rates": [12, 34, 56],
10 | 	"parameters_file": "cnn-parameters-a.json",
11 | 	"parameters_distribution_1": {
12 | 		"mean_w": 0.9,
13 | 		"mean_b": 0.9,
14 | 		"std_deviation_w": 0.9,
15 | 		"std_deviation_b": 0.9
16 | 	},
17 | 	"parameters_distribution_2": {
18 | 		"mean_w": 2.001,
19 | 		"mean_b": 2.001,
20 | 		"std_deviation_w": 2.001,
21 | 		"std_deviation_b": 2.001
22 | 	},
23 | 	"parameters_distribution_3": {
24 | 		"mean_w": 0.001,
25 | 		"mean_b": 0.001,
26 | 		"std_deviation_w": 0.001,
27 | 		"std_deviation_b": 0.001
28 | 	}
29 | }
30 | 


--------------------------------------------------------------------------------
/example_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"n1": 32,
 3 | 	"n2": 16,
 4 | 	"f1": 9,
 5 | 	"f2": 1,
 6 | 	"f3": 5,
 7 | 
 8 | 	"momentum": 0.9,
 9 | 	"weight_decay_parameter": 0.001,
10 | 	"learning_rates": [0.0001, 0.0001, 0.00001],
11 | 	"parameters_file": "data/parameters.json",
12 | 
13 | 	"parameters_distribution_1": {
14 | 		"mean_w": 0.0,
15 | 		"mean_b": 0.0,
16 | 		"std_deviation_w": 0.005,
17 | 		"std_deviation_b": 0.0
18 | 	},
19 | 	"parameters_distribution_2": {
20 | 		"mean_w": 0.0,
21 | 		"mean_b": 0.0,
22 | 		"std_deviation_w": 0.005,
23 | 		"std_deviation_b": 0.0
24 | 	},
25 | 	"parameters_distribution_3": {
26 | 		"mean_w": 0.0,
27 | 		"mean_b": 0.0,
28 | 		"std_deviation_w": 0.005,
29 | 		"std_deviation_b": 0.0
30 | 	}
31 | }
32 | 


--------------------------------------------------------------------------------
/test/data/config_invalid_val.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"n1": 32,
 3 | 	"n2": 16,
 4 | 	"f1": 9,
 5 | 	"f2": 1,
 6 | 	"f3": 5,
 7 | 	"momentum": 123.5,
 8 | 	"weight_decay_parameter": 0.1,
 9 | 	"learning_rates": [12, 34, 56],
10 | 	"parameters_file": "cnn-parameters-a.json",
11 | 	"parameters_distribution_1": {
12 | 		"mean_w": 0.9,
13 | 		"mean_b": 0.9,
14 | 		"std_deviation_w": 0.9,
15 | 		"std_deviation_b": 0.9
16 | 	},
17 | 	"parameters_distribution_2": {
18 | 		"mean_w": 2.001,
19 | 		"mean_b": 2.001,
20 | 		"std_deviation_w": 2.001,
21 | 		"std_deviation_b": 2.001
22 | 	},
23 | 	"parameters_distribution_3": {
24 | 		"mean_w": 9999,
25 | 		"mean_b": 0.001,
26 | 		"std_deviation_w": 0.001,
27 | 		"std_deviation_b": 0.001
28 | 	}
29 | }
30 | 


--------------------------------------------------------------------------------
/src/kernel/greyscale.cl:
--------------------------------------------------------------------------------
 1 | __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
 2 | 															 CLK_ADDRESS_CLAMP_TO_EDGE |
 3 | 															 CLK_FILTER_NEAREST;
 4 | 
 5 | __kernel
 6 | void main(__read_only image2d_t image,
 7 | 					__global uchar* target,
 8 | 					int w, int h){
 9 | 
10 | 	// const uint w = get_global_size(0);
11 | 	// const uint h = get_global_size(1);
12 | 	const int2 pos = {get_global_id(0), get_global_id(1)};
13 | 	float2 normCoor = convert_float2(pos) / (float2)( w, h );
14 | 
15 | 	if(pos.x >= 0 && pos.x < w && pos.y >= 0 && pos.y < h){
16 | 		int idx = pos.y * w + pos.x;
17 | 
18 | 		uint4 pixel_col = read_imageui(image, sampler, pos);
19 | 		target[idx] = (uchar)pixel_col.x;
20 | 	}
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/src/kernel/extract_luma.cl:
--------------------------------------------------------------------------------
 1 | __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |  //
 2 |                                CLK_ADDRESS_CLAMP_TO_EDGE |    //
 3 |                                CLK_FILTER_NEAREST;            //
 4 | 
 5 | __constant float4 rgb2y = {0.299f, 0.587f, 0.114f, 0.0f};
 6 | 
 7 | __kernel void extract_luma(__read_only image2d_t image,  //
 8 |                            __global float* target,       //
 9 |                            int w, int h) {
10 |   const int2 pos = {get_global_id(0), get_global_id(1)};
11 | 
12 |   if (pos.x >= 0 && pos.x < w &&  //
13 |       pos.y >= 0 && pos.y < h) {
14 |     int idx = pos.y * w + pos.x;
15 |     uint4 pixel_col = read_imageui(image, sampler, pos);
16 |     float4 pixel_col_f = convert_float4(pixel_col);
17 | #ifdef NORMALIZE
18 |     target[idx] = dot(pixel_col_f, rgb2y) / 255.0f;
19 | #else
20 |     target[idx] = dot(pixel_col_f, rgb2y);
21 | #endif  // NORMALIZE
22 |   }
23 | }
24 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Marcin Matuszczyk
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/src/Config.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef CONFIG_H
 2 | #define CONFIG_H
 3 | 
 4 | #include "pch.hpp"
 5 | #include <ostream>  // for std::ostream& operator<<(..)
 6 | 
 7 | namespace cnn_sr {
 8 | 
/**
 * Mean / standard deviation pairs for weights (`*_w`) and biases (`*_b`).
 *
 * NOTE(review): presumably describes the random initial parameters of one
 * layer (Config holds three of these under "random parameters") - confirm in
 * Config.cpp.
 */
struct ParametersDistribution {
  /** Keeps the in-class defaults declared below. */
  ParametersDistribution() {}
  /**
   * NOTE(review): the arguments are unnamed; check the definition in
   * Config.cpp for the order of the four values before calling.
   */
  ParametersDistribution(float, float, float, float);

  float mean_w = 0.01f, sd_w = 0.01f;
  float mean_b = 0.0f, sd_b = 0.0f;
};
16 | 
/**
 * Network and training settings.
 *
 * NOTE(review): ConfigReader::read() presumably builds this from a JSON file
 * such as example_config.json - confirm in Config.cpp.
 */
struct Config {
  // NOTE(review): the arguments are unnamed. Judging by the member order they
  // are likely n1, n2, f1, f2, f3, momentum, weight_decay_parameter, the
  // learning rates, the three distributions and the parameters file path -
  // confirm against the definition in Config.cpp.
  Config(size_t, size_t,          //
         size_t, size_t, size_t,  //
         float, float, float*,    //
         ParametersDistribution, ParametersDistribution, ParametersDistribution,
         const char* const = nullptr);

  // Checks a configuration; what happens on failure is defined in Config.cpp.
  static void validate(Config&);

  size_t total_padding() const;

  // core parameters
  // n1/n2: filter counts, f1/f2/f3: spatial sizes (naming follows the schema
  // comment in LayerData.hpp).
  const size_t n1, n2;
  const size_t f1, f2, f3;
  const float momentum, weight_decay_parameter;
  // Three values; NOTE(review): presumably one learning rate per layer.
  float learning_rate[3];
  std::string parameters_file = "";

  // random parameters(weights/biases)
  ParametersDistribution params_distr_1;
  ParametersDistribution params_distr_2;
  ParametersDistribution params_distr_3;
};
40 | 
/** Produces a Config from an external source. */
class ConfigReader {
 public:
  // NOTE(review): the argument is presumably the path of a JSON configuration
  // file, and error reporting for unreadable/invalid files is defined in
  // Config.cpp - confirm there.
  Config read(const char* const);
};
45 | }
46 | 
47 | std::ostream& operator<<(std::ostream&, const cnn_sr::Config&);
48 | 
49 | #endif /* CONFIG_H   */
50 | 


--------------------------------------------------------------------------------
/test/specs/SubtractFromAllTest.cpp:
--------------------------------------------------------------------------------
 1 | #include "TestSpecsDeclarations.hpp"
 2 | 
 3 | #include "../../src/DataPipeline.hpp"
 4 | 
 5 | namespace test {
 6 | namespace specs {
 7 | 
 8 | ///
 9 | /// PIMPL
10 | ///
11 | struct SubtractFromAllTestImpl {};
12 | 
13 | ///
14 | /// SubtractFromAllTest
15 | ///
16 | 
17 | TEST_SPEC_PIMPL(SubtractFromAllTest)
18 | 
19 | void SubtractFromAllTest::init() {}
20 | 
21 | std::string SubtractFromAllTest::name(size_t) {
22 |   return "Subtract from all test";
23 | }
24 | 
25 | size_t SubtractFromAllTest::data_set_count() { return 1; }
26 | 
27 | bool SubtractFromAllTest::operator()(size_t,
28 |                                      cnn_sr::DataPipeline *const pipeline) {
29 |   assert_not_null(pipeline);
30 |   auto _context = pipeline->context();
31 | 
32 |   const size_t data_len = 900;
33 |   const float to_subtract = 450.0f;
34 |   std::vector<float> cpu_data(data_len);
35 |   std::vector<float> expected_buf(data_len);
36 |   for (size_t i = 0; i < data_len; i++) {
37 |     cpu_data[i] = i;
38 |     expected_buf[i] = cpu_data[i] - to_subtract;
39 |   }
40 | 
41 |   // gpu allocate
42 |   auto gpu_buf_data =
43 |       _context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * data_len);
44 |   _context->write_buffer(gpu_buf_data, (void *)&cpu_data[0], true);
45 | 
46 |   pipeline->subtract_from_all(gpu_buf_data, to_subtract);
47 |   assert_equals(pipeline, expected_buf, gpu_buf_data);
48 | 
49 |   return true;
50 | }
51 | 
52 | //
53 | //
54 | }  // namespace specs
55 | }  // namespace test
56 | 


--------------------------------------------------------------------------------
/src/kernel/update_parameters.cl:
--------------------------------------------------------------------------------
 1 | __kernel void update_params(
 2 |     __read_only __global float* weights,                 //
 3 |     __read_only __global float* bias,                    //
 4 |     __read_only __global float* grad_weights,            //
 5 |     __read_only __global float* grad_bias,               //
 6 |     __read_only __global float* previous_delta_weights,  //
 7 |     __read_only __global float* previous_delta_bias,     //
 8 |     __const float momentum,                              //
 9 |     __const float weight_decay_parameter,                //
10 |     __const float learning_rate,                         //
11 |     __const uint batch_size,                             //
12 |     __const uint weights_size,                           //
13 |     __const uint bias_size) {
14 |   const size_t idx = get_global_id(0);
15 | 
16 |   // update weights
17 |   if (idx < weights_size) {
18 |     float weight_value = weights[idx];
19 |     float delta_w = momentum * previous_delta_weights[idx] +
20 |                     learning_rate * grad_weights[idx] +
21 |                     weight_decay_parameter * weight_value;
22 |     weights[idx] = weight_value - delta_w / batch_size;
23 |     previous_delta_weights[idx] = delta_w;
24 |   }
25 | 
26 |   // update bias
27 |   if (idx < bias_size) {
28 |     float delta_b = momentum * previous_delta_bias[idx] +  //
29 |                     learning_rate * grad_bias[idx];
30 |     bias[idx] -= delta_b / batch_size;
31 |     previous_delta_bias[idx] = delta_b;
32 |   }
33 | }
34 | 


--------------------------------------------------------------------------------
/test/specs/LayerDeltasTest_script.py:
--------------------------------------------------------------------------------
 1 | # helper script to generate expected delta values
 2 | # for LayerDeltasTest
 3 | 
 4 | deltas=[0.122, 0.083, 0.064,
 5 |         0.057, 0.075, 0.055,
 6 |         0.025, 0.058, 0.138,
 7 |         0.170, 0.068, 0.144,
 8 |         0.121, 0.013, 0.176,
 9 |         0.065, 0.169, 0.049,
10 |         0.003, 0.181, 0.051,
11 |         0.021, 0.136, 0.062,
12 |         0.066, 0.165, 0.176]
13 | f=3
14 | n_curr = 3
15 | n_prev = 2
16 | 
17 | # 00000    -
18 | # 0___0    0
19 | # 0___0    1
20 | # 0___0    2
21 | # 00000    -
22 | 
23 | d=[None]*25
24 | d[0] = [(0,0)]
25 | d[1] = [(0,0),(0,1)]
26 | d[2] = [(0,0),(0,1),(0,2)]
27 | d[3] = [(0,1),(0,2)]
28 | d[4] = [(0,2)]
29 | 
30 | d[5] = [(0,0),(1,0)]
31 | d[6] = [(0,0),(0,1),(1,0),(1,1)]
32 | d[7] = [(0,0),(0,1),(0,2), (1,0),(1,1),(1,2)]
33 | d[8] = [(0,1),(0,2), (1,1),(1,2)]
34 | d[9] = [(0,2), (1,2)]
35 | 
36 | d[10] = [(0,0),(1,0),(2,0)]
37 | d[11] = [(0,0),(0,1),(1,0),(1,1),(2,0),(2,1)]
38 | d[12] = [(0,0),(0,1),(0,2), (1,0),(1,1),(1,2), (2,0),(2,1),(2,2)]
39 | d[13] = [(0,1),(0,2), (1,1),(1,2), (2,1),(2,2)]
40 | d[14] = [(0,2), (1,2), (2,2)]
41 | 
42 | d[15] = [(2,0),(1,0)]
43 | d[16] = [(2,0),(2,1),(1,0),(1,1)]
44 | d[17] = [(2,0),(2,1),(2,2), (1,0),(1,1),(1,2)]
45 | d[18] = [(2,1),(2,2), (1,1),(1,2)]
46 | d[19] = [(2,2), (1,2)]
47 | 
48 | d[20] = [(2,0)]
49 | d[21] = [(2,0),(2,1)]
50 | d[22] = [(2,0),(2,1),(2,2)]
51 | d[23] = [(2,1),(2,2)]
52 | d[24] = [(2,2)]
53 | 
54 | for xs in d:
55 |   summ = 0
56 |   for row,col in xs:
57 |     idx = row*f*f + col*f
58 |     for k in range(n_curr):
59 |       summ += deltas[idx+k]
60 |   print(summ)
61 | 


--------------------------------------------------------------------------------
/test/specs/SumTest.cpp:
--------------------------------------------------------------------------------
 1 | #include "TestSpecsDeclarations.hpp"
 2 | #include <cstdio>  // snprintf
 3 | #include "../../src/DataPipeline.hpp"
 4 | 
 5 | namespace test {
 6 | namespace specs {
 7 | 
 8 | ///
 9 | /// PIMPL
10 | ///
11 | struct SumTestImpl {};
12 | 
13 | ///
14 | /// SumTest
15 | ///
16 | 
17 | TEST_SPEC_PIMPL(SumTest)
18 | 
19 | void SumTest::init() {}
20 | 
21 | std::string SumTest::name(size_t sq) {
22 |   return sq == 1 ? "Sum all test - squared" : "Sum all test";
23 | }
24 | 
25 | size_t SumTest::data_set_count() { return 2; }
26 | 
27 | bool SumTest::operator()(size_t sq, cnn_sr::DataPipeline *const pipeline) {
28 |   assert_not_null(pipeline);
29 |   auto _context = pipeline->context();
30 | 
31 |   bool squared = sq == 1;
32 |   const size_t data_len = 900;
33 |   long long expected = 0;
34 |   float cpu_data[data_len];
35 |   for (size_t i = 0; i < data_len; i++) {
36 |     cpu_data[i] = i;
37 |     expected += squared ? i * i : i;
38 |   }
39 |   // std::cout << sq << "->" << squared << " exp: " << expected << std::endl;
40 |   auto gpu_buf_data =
41 |       _context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * data_len);
42 |   _context->write_buffer(gpu_buf_data, (void *)cpu_data, true);
43 | 
44 |   cl_ulong result = pipeline->sum(gpu_buf_data, squared);
45 | 
46 |   // ok, we do not expect 100% correct result
47 |   long long margin = 20;
48 |   long long err = expected - result;
49 |   err = err < 0 ? -err : err;
50 |   if (err > margin) {
51 |     char msg_buffer[128];
52 |     snprintf(msg_buffer, sizeof(msg_buffer),  //
53 |              "Expected %lld to be %lld", result, expected);
54 |     throw TestException(msg_buffer);
55 |   }
56 | 
57 |   return true;
58 | }
59 | 
60 | //
61 | //
62 | }  // namespace specs
63 | }  // namespace test
64 | 


--------------------------------------------------------------------------------
/profile.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import time
 3 | import subprocess
 4 | 
 5 | epochs = 100
 6 | seconds_per_epoch = 0.236
 7 | cmd = 'bin\\cnn.exe train dry -c data\config.json --epochs {0:} -i data\\train_samples36'.format(epochs)
 8 | 
 9 | kernel_profile_regex = "Kernel '.*/(.*?]).*?([\-e.\d]+)ns.*?([\-e.\d]+)s"
10 | def get_kernel_profiling_info(out):
11 |   out = out.decode('UTF-8')
12 |   rr = re.findall(kernel_profile_regex, out)
13 |   l = [(x[0], int(x[1]), float(x[2])) for x in rr]
14 |   l = sorted(l, key=lambda x: x[2])
15 |   ts = 0.0
16 |   for _,_,t in l:
17 |       ts += t
18 |   return l, ts
19 | 
20 | 
if __name__ == '__main__':
  # Runs the training executable once and reports wall-clock time. When
  # 'kernel' is given on the command line, 'profile' is appended to the command
  # and the per-kernel timings found in its output are summarised.
  import sys

  kernel_mode = 'kernel' in sys.argv

  cmd_ = cmd.split(' ')
  if kernel_mode:
    cmd_.append('profile')
  print('Command to execute:')
  print('\'' + (' '.join(cmd_)) + '\'')

  est_time = epochs * seconds_per_epoch
  print('Will do {0:} epochs'.format( epochs))
  # True division: '//' floored the minutes, printing 0.000 for short runs.
  print('Estimated required time: {:.3f}s = {:.3f} min'.format(est_time, est_time / 60))

  start = time.time()
  proc = subprocess.Popen(cmd_, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  outs, errs = proc.communicate()
  # Compare by value; 'is not 0' tests object identity.
  if proc.returncode != 0:
    print('---- FAIL ----')
    # Surface whatever the process reported and signal the failure to the
    # caller instead of exiting with status 0.
    if errs:
      sys.stderr.write(errs.decode('UTF-8', 'replace'))
    sys.exit(1)
  end = time.time()
  dt = end - start

  print("Execution time: {:.3f}s = {:.2f}min ({:.5f} s/epoch)".format(dt, dt/60, dt/epochs))

  if kernel_mode:
    kps, kernel_time = get_kernel_profiling_info(outs)
    for name,ns,s in kps:
      name = name.replace('-D ', '').replace('\'', '').replace('[--]','')
      # kernel_time is 0 when every reported time is 0; avoid dividing by it.
      percent = s*100/kernel_time if kernel_time else 0.0
      print("{0:7.4f}s ({1:5.2f}%)- {2:.65}".format(s, percent, name))
    print( "Time spend in kernel: {:f}s".format(kernel_time))
    print("Percent of time spend in kernel: {:.4f}%".format(kernel_time*100/dt))
54 | 


--------------------------------------------------------------------------------
/test/specs/TestSpecsDeclarations.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef TEST_SPECS_DECL_H
 2 | #define TEST_SPECS_DECL_H
 3 | 
 4 | #include "../TestCase.hpp"
 5 | #include "../../src/opencl/Context.hpp"
 6 | 
// Declares a test spec class `X` deriving from TestCase, together with a
// forward declaration of its private implementation struct.
// NOTE(review): CONCATENATE is defined elsewhere; it is assumed to paste its
// two arguments into one identifier (X + Impl -> XImpl) - confirm.
//
// The generated class declares a constructor and destructor, the three
// TestCase overrides (name, operator(), data_set_count) and an
// init(__VA_ARGS__) method whose parameter list is whatever follows X in the
// macro invocation (empty for every spec declared in this header).
// The member definitions have to be provided by the spec's .cpp file.
#define DECLARE_TEST_SPEC(X, ...)                                              \
  struct CONCATENATE(X, Impl);                                                 \
  class X : public TestCase {                                                  \
   public:                                                                     \
    X();                                                                       \
    ~X();                                                                      \
    void init(__VA_ARGS__);                                                    \
    std::string name(size_t data_set_id) override;                             \
    bool operator()(size_t data_set_id, cnn_sr::DataPipeline *const) override; \
    size_t data_set_count() override;                                          \
                                                                               \
   private:                                                                    \
    CONCATENATE(X, Impl) *const _impl = nullptr;                               \
  };
21 | 
// Defines the constructor and destructor of spec class `X`: the constructor
// heap-allocates a value-initialized implementation object into _impl, the
// destructor deletes it. The implementation struct must be a complete type
// at the point of use.
#define TEST_SPEC_PIMPL(X)                      \
  X::X() : _impl(new CONCATENATE(X, Impl)()) {} \
  X::~X() { delete _impl; }
25 | 
26 | namespace test {
27 | namespace specs {
28 | 
29 | DECLARE_TEST_SPEC(ExtractLumaTest)
30 | DECLARE_TEST_SPEC(SwapLumaTest)
31 | DECLARE_TEST_SPEC(SquaredErrorTest)
32 | DECLARE_TEST_SPEC(SubtractFromAllTest)
33 | DECLARE_TEST_SPEC(SumTest)
34 | DECLARE_TEST_SPEC(LayerDeltasTest)
35 | DECLARE_TEST_SPEC(BackpropagationTest)
36 | DECLARE_TEST_SPEC(LayerTest)
37 | DECLARE_TEST_SPEC(LastLayerDeltaTest)
38 | DECLARE_TEST_SPEC(UpdateParametersTest)
39 | DECLARE_TEST_SPEC(ConfigTest)
40 | 
41 | }
42 | }
43 | 
44 | #endif /* TEST_SPECS_DECL_H   */
45 | 


--------------------------------------------------------------------------------
/test/TestCase.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef TEST_CASE_H
 2 | #define TEST_CASE_H
 3 | 
 4 | #include "../src/pch.hpp"
 5 | #include <stdexcept>
 6 | #include <sstream>
 7 | 
 8 | namespace test {
 9 | 
10 | ///
11 | /// utils functions
12 | ///
13 | 
14 | float activation_function(float);
15 | float activation_function_derivative(float);
16 | 
17 | ///
18 | ///  TestException
19 | ///
/**
 * Exception thrown by tests to report a failure (for example SumTest throws
 * it with a formatted message when the result is out of tolerance).
 */
class TestException : public std::runtime_error {
 public:
  TestException();
  /** Constructs the exception from a message. */
  TestException(const char *);
  // User-declared copy constructor; NOTE(review): presumably needed because
  // the std::ostringstream member below is not copyable - confirm in
  // TestCase.cpp.
  TestException(const TestException &);

  virtual const char *what() const throw();

 private:
  // NOTE(review): usage is defined in TestCase.cpp; presumably holds the text
  // returned by what().
  std::ostringstream cnvt;
};
31 | 
32 | ///
33 | /// TestCase etc.
34 | ///
/** Named data set descriptor; carries nothing but its name. */
struct DataSet {
  DataSet(std::string name) : name(name) {}
  /** Leaves the name empty. */
  DataSet() {}
  std::string name;
};
40 | 
41 | class TestCase {
42 |  public:
43 |   ~TestCase() {}
44 | 
45 |   virtual std::string name(size_t data_set_id) = 0;
46 |   virtual bool operator()(size_t data_set_id, cnn_sr::DataPipeline *const) = 0;
47 |   virtual size_t data_set_count() { return 1; }
48 | 
49 |  protected:
50 |   void assert_equals(int expected, int result);
51 |   void assert_equals(float expected, float result);
52 |   void assert_equals(const std::vector<float> &expected,
53 |                      const std::vector<float> &result, bool print = false);
54 |   void assert_equals(cnn_sr::DataPipeline *const,
55 |                      const std::vector<float> &expected, opencl::MemoryHandle,
56 |                      bool print = false);
57 |   void assert_true(bool v, const char *msg);
58 |   void assert_data_set_ok(size_t);
59 | 
60 |   template <typename T>
61 |   void assert_not_null(T *, const char *msg = nullptr);
62 | };
63 | 
64 | ///
65 | /// template implementations
66 | ///
67 | 
68 | template <typename T>
69 | void TestCase::assert_not_null(T *ptr, const char *msg) {
70 |   if (!msg) msg = "Null pointer";
71 |   assert_true(ptr != nullptr, msg);
72 | }
73 | }
74 | 
75 | #endif /* TEST_CASE_H   */
76 | 


--------------------------------------------------------------------------------
/libs/include/CL/opencl.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 |  * Copyright (c) 2008-2010 The Khronos Group Inc.
 3 |  *
 4 |  * Permission is hereby granted, free of charge, to any person obtaining a
 5 |  * copy of this software and/or associated documentation files (the
 6 |  * "Materials"), to deal in the Materials without restriction, including
 7 |  * without limitation the rights to use, copy, modify, merge, publish,
 8 |  * distribute, sublicense, and/or sell copies of the Materials, and to
 9 |  * permit persons to whom the Materials are furnished to do so, subject to
10 |  * the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included
13 |  * in all copies or substantial portions of the Materials.
14 |  *
15 |  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 |  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 |  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 |  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 |  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
22 |  ******************************************************************************/
23 | 
24 | /* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
25 | 
26 | #ifndef __OPENCL_H
27 | #define __OPENCL_H
28 | 
29 | #ifdef __cplusplus
30 | extern "C" {
31 | #endif
32 | 
33 | #ifdef __APPLE__
34 | 
35 | #include <OpenCL/cl.h>
36 | #include <OpenCL/cl_gl.h>
37 | #include <OpenCL/cl_gl_ext.h>
38 | #include <OpenCL/cl_ext.h>
39 | 
40 | #else
41 | 
42 | #include "CL/cl.h"
43 | #include "CL/cl_gl.h"
44 | #include "CL/cl_gl_ext.h"
45 | #include "CL/cl_ext.h"
46 | 
47 | #endif
48 | 
49 | #ifdef __cplusplus
50 | }
51 | #endif
52 | 
53 | #endif  /* __OPENCL_H   */
54 | 
55 | 


--------------------------------------------------------------------------------
/src/LayerData.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef LAYER_DATA_H
 2 | #define LAYER_DATA_H
 3 | 
 4 | #include <vector>
 5 | #include <cstddef>  // for size_t
 6 | #include <ostream>  // for operator<<
 7 | 
 8 | namespace cnn_sr {
 9 | 
/* clang-format off */
/**
 *
 *  Test data schema description (values for each layer provided after '/'):
 *
 *  n_prev_filter_cnt    := INT, filter count for previous layer, values: 1/n1/n2
 *  current_filter_count := INT, filter count for this layer, values: n1/n2/1
 *  f_spatial_size       := INT, spatial size, values: f1/f2/f3
 *  weights              := VECTOR[FLOAT], min size: weight_size
 *                          Each column for different filter (from 1 to current_filter_count)
 *                          Each row for different point in range 0..f_spatial_size^2
 *                          Each paragraph is 1 row of points (f_spatial_size points)
 *  bias                 := VECTOR[FLOAT], min size: bias_size
 *
 * calculated values:
 *  input_size := input_w * input_h * n_prev_filter_cnt * current_filter_count
 *  out_w := input_w - f_spatial_size + 1
 *  out_h := input_h - f_spatial_size + 1
 *  weight_count := f_spatial_size^2 * n_prev_filter_cnt
 *  bias_count := current_filter_count
 */
struct LayerData {
  /* clang-format on */

  LayerData(size_t n_prev_filter_cnt, size_t current_filter_count,
            size_t f_spatial_size);

  static void validate(const LayerData&);

  // setters
  // NOTE(review): the number of floats read from the pointer is decided in
  // LayerData.cpp (presumably weight_size() / bias_size()) - confirm.
  void set_weights(float*);
  void set_bias(float*);
  // getters
  size_t input_size(size_t w, size_t h) const;
  // Writes the output dimensions for a w x h input through the first argument.
  void get_output_dimensions(size_t*, size_t w, size_t h) const;
  size_t weight_size() const;
  size_t bias_size() const;
  // data() instead of &v[0]: indexing element 0 of an empty vector is
  // undefined behaviour, while data() is always valid to call. The returned
  // pointer must not be dereferenced when the vector is empty.
  inline const float* weights_ptr() const { return weights.data(); }
  inline const float* bias_ptr() const { return bias.data(); }

 public:
  const size_t n_prev_filter_cnt;
  const size_t current_filter_count;
  const size_t f_spatial_size;

  /** stale */
  std::vector<float> weights;
  /** stale */
  std::vector<float> bias;
};
60 | }
61 | 
62 | std::ostream& operator<<(std::ostream&, const cnn_sr::LayerData&);
63 | 
64 | #endif /* LAYER_DATA_H   */
65 | 


--------------------------------------------------------------------------------
/src/kernel/last_layer_delta.cl:
--------------------------------------------------------------------------------
 1 | /* clang-format off */
 2 | /**
 3 |  * Delta of the last layer: target = (y - t) * relu'(y), one work item per result pixel.
 4 |  * @param  float*        ground_truth_image expected values t, ground_truth_w * ground_truth_h floats per sample
 5 |  * @param  float*        algo_result        network output y, algo_result_w * algo_result_h floats per sample
 6 |  * @param  float*        target             receives the deltas, laid out like algo_result
 7 |  * @param  uint          ground_truth_w     width of one ground truth image
 8 |  * @param  uint          ground_truth_h     height of one ground truth image
 9 |  * @param  uint          algo_result_w      width of one result image
10 |  * @param  uint          algo_result_h      height of one result image
11 |  * Global ids 0/1 select the column/row inside the result, global id 2 selects the sample.
12 |  */
13 | /* clang-format on */
14 | __kernel void last_layer_delta(__global const float* ground_truth_image,
15 |                                __global const float* algo_result,
16 |                                __global float* target,       //
17 |                                __const uint ground_truth_w,  //
18 |                                __const uint ground_truth_h,  //
19 |                                __const uint algo_result_w,   //
20 |                                __const uint algo_result_h) {
21 |   const int2 pos = {get_global_id(0), get_global_id(1)};  // x=col=i, y=row=j
22 |   const uint sample_id = get_global_id(2);
23 |   const int2 out_size = {algo_result_w, algo_result_h};
24 |   const int idx = (pos.y * algo_result_w) + pos.x;
25 |   const size_t padding = (ground_truth_w - algo_result_w) / 2;
26 |   // first element of this sample in each buffer (constants, not macros)
27 |   const uint image_offset_gt = sample_id * ground_truth_w * ground_truth_h;
28 |   const uint image_offset_algo = sample_id * algo_result_w * algo_result_h;
29 | 
30 |   // size of ground_truth != algo res (padding)
31 |   // The same padding, derived from the widths, is used on top and on the left:
32 |   // ground_truth_idx =
33 |   // (row + padding_on_top_of_image) * width + padding_left + col
34 |   const size_t ground_truth_idx =
35 |       (pos.y + padding) * ground_truth_w + padding + pos.x;
36 | 
37 |   if (pos.x >= 0 && pos.x < out_size.x &&  //
38 |       pos.y >= 0 && pos.y < out_size.y) {
39 |     // usual squared error derivative calculations
40 |     float t = ground_truth_image[image_offset_gt + ground_truth_idx];
41 |     float y = algo_result[image_offset_algo + idx];
42 |     float d = y - t;
43 | 
44 |     // relu derivative, evaluated on the activated output y
45 |     float relu_deriv = y > 0.0f ? 1.0f : 0.0f;
46 | 
47 |     // write result
48 |     target[image_offset_algo + idx] = d * relu_deriv;
49 |   }
50 | }
51 | 


--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
 1 | # $@ - left side of ':'
 2 | # $^ - right side of ':'
 3 | # $< - first of dependencies
 4 | 
 5 | CC = clang++
 6 | VPATH = src/opencl src test test/specs libs/cpp
 7 | IDIR = libs/include
 8 | ODIR = obj
 9 | BINDIR = bin
10 | LIBS = -lm -L libs/lib -l OpenCL
11 | EXECNAME = cnn.exe
12 | # compile flags; the -isystem entries are absolute paths of one local MinGW install
13 | CFLAGS = -std=c++11 \
14 | 	-c \
15 | 	-g \
16 | 	-Wall \
17 | 	-Wextra \
18 | 	-stdlib=libstdc++ \
19 | 	-isystem "C:\programs\install\MinGW\include" \
20 | 	-isystem "C:\programs\install\MinGW\lib\gcc\mingw32\4.7.2\include\c++" \
21 | 	-isystem "C:\programs\install\MinGW\lib\gcc\mingw32\4.7.2\include\c++\mingw32" \
22 | 	-I$(IDIR)
23 | # flags for the g++ link commands of the executable and of the test binary
24 | LFLAGS = -std=c++11 \
25 | 	-l "stdc++" \
26 | 	-I$(IDIR)
27 | # objects shared by the application and the test binary
28 | __OBJ = Config.o \
29 | 	LayerData.o \
30 | 	DataPipeline.o \
31 | 	ConfigBasedDataPipeline.o \
32 | 	pch.o \
33 | 	Context.o \
34 | 	UtilsOpenCL.o \
35 | 	Kernel.o \
36 | 	gason.o
37 | # application objects: Main_cl.o plus the shared ones
38 | _OBJ = Main_cl.o $(__OBJ)
39 | OBJ = $(patsubst %,$(ODIR)/%,$(_OBJ)) # prefix each entry with $(ODIR)/
40 | 
41 | # _TEST_OBJ = TestRunner.o $(__OBJ) TestDataProvider.o LayerDeltasTest.o BackpropagationTest.o
42 | _TEST_OBJ = TestRunner.o $(__OBJ) \
43 | 	TestCase.o \
44 | 	ExtractLumaTest.o \
45 | 	SwapLumaTest.o \
46 | 	SquaredErrorTest.o \
47 | 	SubtractFromAllTest.o \
48 | 	SumTest.o \
49 | 	LayerDeltasTest.o \
50 | 	BackpropagationTest.o \
51 | 	LayerTest.o \
52 | 	LastLayerDeltaTest.o \
53 | 	UpdateParametersTest.o \
54 | 	ConfigTest.o
55 | TEST_OBJ = $(patsubst %,$(ODIR)/%,$(_TEST_OBJ))
56 | 
57 | 
58 | # If the first argument is "run"...
59 | ifeq (run,$(firstword $(MAKECMDGOALS)))
60 |   # use the rest as arguments for "run"
61 |   RUN_ARGS := $(wordlist 2,$(words $(MAKECMDGOALS)),$(MAKECMDGOALS))
62 |   # ...and turn them into do-nothing targets
63 |   $(eval $(RUN_ARGS):;@:)
64 | endif
65 | # These goals are commands, not files; `test` also names a directory here.
66 | .PHONY: build compile run test clean
67 | # default target
68 | build: $(EXECNAME)
69 | 
70 | compile: $(OBJ)
71 | 
72 | # if You pass arguments do it like this:
73 | # 'make run -- ARGS_HERE'
74 | run: $(EXECNAME)
75 | 	@echo -----------------------
76 | 	@$(BINDIR)/$< $(RUN_ARGS)
77 | 
78 | test: $(TEST_OBJ)
79 | 	@echo Linking tests..
80 | 	g++ -o $(BINDIR)/test.exe $^ $(LFLAGS) $(LIBS)
81 | 	@echo -----------------------
82 | 	@$(BINDIR)/test.exe
83 | 
84 | 
85 | clean:
86 | 	rm -f $(ODIR)/*.o
87 | 	rm -f $(BINDIR)/*
88 | 
89 | 
90 | 
91 | $(EXECNAME): $(OBJ)
92 | 	@echo Linking..
93 | 	g++ -o $(BINDIR)/$@ $^ $(LFLAGS) $(LIBS)
94 | 
95 | $(ODIR)/%.o: %.cpp
96 | 	$(CC) -c -o $@ $< $(CFLAGS)
97 | 


--------------------------------------------------------------------------------
/src/kernel/sum.cl:
--------------------------------------------------------------------------------
 1 | /* clang-format off */
 2 | /**
 3 |  * Adds operand to *source as one atomic step with respect to other work items.
 4 |  * @see http://suhorukov.blogspot.com/2011/12/opencl-11-atomic-operations-on-floating.html
 5 |  * @see  http://simpleopencl.blogspot.com/2013/05/atomic-operations-and-floats-in-opencl.html
 6 |  *
 7 |  * @param source   float in global memory that is increased in place
 8 |  * @param operand  amount added to *source
 9 |  */
10 | inline void atomic_add_global(volatile __global float* source, const float operand) {
11 |   /* clang-format on */
12 |   // The float is updated through its 32-bit pattern: read it, compute the
13 |   // sum, then compare-and-swap the new bits in; retry if the swap lost.
14 |   unsigned int seen_bits;    // pattern of the value read from *source
15 |   unsigned int wanted_bits;  // pattern of that value plus operand
16 | 
17 |   // NOTE: atomic_cmpxchg(volatile __global unsigned int *p,
18 |   //                      unsigned int cmp, unsigned int val)
19 |   // returns the previous *p; the store took place only if that equals cmp,
20 |   // so any other result means the value changed in between.
21 | 
22 |   do {
23 |     const float seen = *source;
24 |     seen_bits = as_uint(seen);
25 |     wanted_bits = as_uint(seen + operand);
26 |   } while (atomic_cmpxchg((volatile __global unsigned int*)source,
27 |                           seen_bits,  //
28 |                           wanted_bits) != seen_bits);
29 | }
30 | 
31 | /**
32 |  * Adds data[0..len) to *target; values are squared first when SUM_SQUARED is defined.
33 |  * scratch needs one float per work item; the halving loop covers every element only for a power-of-two local size.
34 |  * Code partially inspired by:
35 |  * http://developer.amd.com/resources/documentation-articles/articles-whitepapers/opencl-optimization-case-study-simple-reductions/
36 |  */
37 | __kernel void sum(__global const float* data,  //
38 |                   volatile __global float* target,   //
39 |                   __local float* scratch,            //
40 |                   __const uint len) {
41 |   const int global_index = get_global_id(0);
42 |   const int local_index = get_local_id(0);
43 |   // each work item stores its (optionally squared) value in local scratch
44 |   float val = global_index < len ? data[global_index] : 0.0f;
45 | #ifdef SUM_SQUARED
46 |   val = val * val;
47 | #endif
48 |   scratch[local_index] = val;
49 | 
50 |   // wait till all work items of the group have written their value
51 |   barrier(CLK_LOCAL_MEM_FENCE);
52 | 
53 |   // every pass folds the upper half of the active range onto the lower half
54 |   for (int offset = get_local_size(0) / 2; offset > 0; offset = offset / 2) {
55 |     if (local_index < offset) {
56 |       float other = scratch[local_index + offset];
57 |       float mine = scratch[local_index];
58 |       scratch[local_index] = mine + other;
59 |     }
60 |     // the next pass reads what this pass wrote
61 |     barrier(CLK_LOCAL_MEM_FENCE);
62 |   }
63 | 
64 |   // one work item per group adds the group's total to the global result
65 |   if (local_index == 0) {
66 |     atomic_add_global(target, scratch[0]);
67 |   }
68 | }
69 | 


--------------------------------------------------------------------------------
/test/specs/ExtractLumaTest.cpp:
--------------------------------------------------------------------------------
 1 | #include "TestSpecsDeclarations.hpp"
 2 | 
 3 | #include "../../src/opencl/UtilsOpenCL.hpp"
 4 | #include "../../src/DataPipeline.hpp"
 5 | 
 6 | auto test_image = "test/data/color_grid.png";
 7 | 
 8 | namespace test {
 9 | namespace specs {
10 | 
11 | ///
12 | /// Data set
13 | ///
14 | struct ExtractLumaDataSet : DataSet {
15 |   bool normalize;  // forwarded to DataPipeline::extract_luma by the test
16 |   ExtractLumaDataSet(bool n, std::string name) : DataSet(name), normalize(n) {}
17 | };
18 | 
19 | ///
20 | /// PIMPL
21 | ///
22 | struct ExtractLumaTestImpl {
23 |   const size_t data_size[2] = {5, 5};  // product = pixel count the test image must have
24 |   const std::vector<float> output = {0.000f, 1.000f, 0.812f, 0.853f, 0.437f,  // expected luma for normalize = true
25 |                                      0.170f, 0.701f, 0.413f, 0.886f, 0.787f,  //
26 |                                      0.430f, 0.299f, 0.587f, 0.114f, 0.707f,  //
27 |                                      0.670f, 0.745f, 0.853f, 0.745f, 0.299f,
28 |                                      0.810f, 0.588f, 0.859f, 0.593f, 0.702f};
29 |   // data set 0 runs with normalize = true, data set 1 with normalize = false
30 |   ExtractLumaDataSet data_sets[2] = {
31 |       ExtractLumaDataSet(true, "normalized"),
32 |       ExtractLumaDataSet(false, "not normalized")};
33 | };
34 | 
35 | ///
36 | /// ExtractLumaTest
37 | ///
38 | 
39 | TEST_SPEC_PIMPL(ExtractLumaTest)
40 | 
41 | void ExtractLumaTest::init() {}
42 | // one data set per entry of ExtractLumaTestImpl::data_sets
43 | size_t ExtractLumaTest::data_set_count() { return 2; }
44 | // fixed prefix followed by the name of the selected data set
45 | std::string ExtractLumaTest::name(size_t data_set_id) {
46 |   assert_data_set_ok(data_set_id);
47 |   return std::string("Extract luma test - ") + _impl->data_sets[data_set_id].name;
48 | }
49 | 
50 | bool ExtractLumaTest::operator()(size_t data_set_id,
51 |                                  cnn_sr::DataPipeline *const pipeline) {
52 |   // Runs extract_luma on the test image and compares with the stored luma.
53 |   assert_not_null(pipeline);
54 |   assert_data_set_ok(data_set_id);
55 |   bool normalize = _impl->data_sets[data_set_id].normalize;
56 | 
57 |   opencl::utils::ImageData data;
58 |   load_image(test_image, data);
59 |   this->assert_true(
60 |       _impl->data_size[0] * _impl->data_size[1] == (size_t)(data.w * data.h),
61 |       "Test image should have exactly data_size[0] * data_size[1] pixels");
62 | 
63 |   opencl::MemoryHandle gpu_buf_raw_img = gpu_nullptr,
64 |                        gpu_buf_luma = gpu_nullptr;
65 |   pipeline->extract_luma(data, gpu_buf_raw_img, gpu_buf_luma, normalize);
66 | 
67 |   // stored values are the normalized ones; otherwise expect them times 255
68 |   std::vector<float> expected = _impl->output;
69 |   for (int i = 0; (!normalize) && (i < data.w * data.h); i++) {
70 |     expected[i] *= 255;
71 |   }
72 |   assert_equals(pipeline, expected, gpu_buf_luma);
73 |   return true;
74 | }
75 | 
76 | //
77 | //
78 | }  // namespace specs
79 | }  // namespace test
80 | 


--------------------------------------------------------------------------------
/test/specs/LumaTests_script.py:
--------------------------------------------------------------------------------
 1 | from PIL import Image
 2 | 
 3 | rgb2y   = [  0.299,    0.587,    0.114,  0.0]  # luma weights for (R, G, B); dot() ignores the 4th entry
 4 | rgb2Cb  = [-0.1687,  -0.3312,      0.5,  0.0]  # Cb weights for (R, G, B)
 5 | rgb2Cr  = [    0.5,  -0.4186,  -0.0813,  0.0]  # Cr weights for (R, G, B)
 6 | YCbCr2r = [1.0,     0.0,    1.4,  0.0]  # the three rows below are applied to (Y, Cb, Cr) in swap_luma
 7 | YCbCr2g = [1.0,  -0.343, -0.711,  0.0]
 8 | YCbCr2b = [1.0,   1.765,    0.0,  0.0]
 9 | 
10 | def dot(a,b):
11 |   return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]
12 | 
13 | 
14 | def extract_luma(img):
15 |   luma_channel = []
16 |   width, height = img.size
17 |   pixels = img.load() # this is not a list, nor is it list()'able
18 |   for y in range(height):
19 |     for x in range(width):
20 |       cpixel = pixels[x, y]
21 |       luma_val = dot(cpixel, rgb2y)
22 |       luma_channel.append(luma_val/255)
23 |   l = int(len(luma_channel)**0.5)
24 |   for i in range(l):
25 |     xs = luma_channel[i*l : (i+1)*l]
26 |     print(', '.join(["{:>6.3}".format(x) for x in xs]))
27 | 
28 | def swap_luma(img, padding, out_path):
29 |   "To verify this works run it on f.e. 256*256 picture, but it may take some time (like 3 min or so)"
30 |   print("Deprecated, see SwapLumaTest.cpp")
31 |   raise Exception('SwapLuma does not produce acceptable result file')
32 |   # NOTE: the raise above always fires, so nothing below this line ever runs.
33 |   img_w, img_h = img.size
34 | 
35 |   # generate luma to swap into
36 |   total_padding = padding * 2
37 |   luma_w,luma_h = img_w - total_padding, img_h - total_padding
38 |   new_luma_size = luma_w * luma_w  # NOTE(review): uses luma_w twice, luma_h is ignored
39 |   new_luma = [(i/new_luma_size) for i in range(new_luma_size)]
40 |   # print(new_luma)
41 | 
42 |   pixels = img.load() # this is not a list, nor is it list()'able
43 |   for y in range(img_w):  # NOTE(review): y runs over img_w and x over img_h
44 |     for x in range(img_h):
45 |       pos_luma = x - padding, y - padding
46 |       idx_luma = pos_luma[1] * luma_w + pos_luma[0]
47 |       # idx = y * img_w + x
48 |       cpixel = pixels[x, y] # 0..255
49 | 
50 |       if pos_luma[0] >= 0 and pos_luma[0] < luma_w and \
51 |          pos_luma[1] >= 0 and pos_luma[1] < luma_h:
52 |         raw_luma = new_luma[idx_luma]
53 |         YCbCr = (raw_luma * 255, # 0..255
54 |                  dot(rgb2Cb, cpixel),
55 |                  dot(rgb2Cr, cpixel))
56 |         clamp = lambda x: int(min(255, max(0, x)))
57 |         new_color = (clamp(dot(YCbCr2r, YCbCr)), \
58 |                      clamp(dot(YCbCr2g, YCbCr)), \
59 |                      clamp(dot(YCbCr2b, YCbCr)))
60 |       else:
61 |         new_color = cpixel
62 |       # print(new_color)
63 |       pixels[x, y] = new_color
64 |       img.save(out_path, "JPEG")  # inside the inner loop: the file is saved after every pixel
65 | 
66 | 
67 | 
68 | if __name__ == '__main__':
69 |   extract_luma_img = Image.open("../data/color_grid.png")  # path is relative to the current directory
70 |   extract_luma(extract_luma_img)
71 | 
72 |   # swap_luma_img = Image.open(  "../data/color_grid2.jpg")
73 |   # swap_luma_img = Image.open(  "../data/color_grid3.png")
74 |   # swap_luma(swap_luma_img, 10, "../data/color_grid2_luma_swapped.png")
75 | 


--------------------------------------------------------------------------------
/libs/include/CL/cl_gl_ext.h:
--------------------------------------------------------------------------------
 1 | /**********************************************************************************
 2 |  * Copyright (c) 2008-2010 The Khronos Group Inc.
 3 |  *
 4 |  * Permission is hereby granted, free of charge, to any person obtaining a
 5 |  * copy of this software and/or associated documentation files (the
 6 |  * "Materials"), to deal in the Materials without restriction, including
 7 |  * without limitation the rights to use, copy, modify, merge, publish,
 8 |  * distribute, sublicense, and/or sell copies of the Materials, and to
 9 |  * permit persons to whom the Materials are furnished to do so, subject to
10 |  * the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included
13 |  * in all copies or substantial portions of the Materials.
14 |  *
15 |  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 |  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 |  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 |  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 |  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
22 |  **********************************************************************************/
23 | 
24 | /* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
25 | 
26 | /* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have           */
27 | /* OpenGL dependencies.                                                         */
28 | 
29 | #ifndef __OPENCL_CL_GL_EXT_H
30 | #define __OPENCL_CL_GL_EXT_H
31 | 
32 | #ifdef __cplusplus
33 | extern "C" {
34 | #endif
35 | 
36 | #ifdef __APPLE__
37 |     #include <OpenCL/cl_gl.h>
38 | #else
39 |     #include "CL/cl_gl.h"
40 | #endif
41 | 
42 | /*
43 |  * For each extension, follow this template
44 |  * / * cl_VEN_extname extension  */
45 | /* #define cl_VEN_extname 1
46 |  * ... define new types, if any
47 |  * ... define new tokens, if any
48 |  * ... define new APIs, if any
49 |  *
50 |  *  If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
51 |  *  This allows us to avoid having to decide whether to include GL headers or GLES here.
52 |  */
53 | 
54 | /*
55 |  *  cl_khr_gl_event  extension
56 |  *  See section 9.9 in the OpenCL 1.1 spec for more information
57 |  */
58 | #define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR     0x200D
59 | 
60 | extern CL_API_ENTRY cl_event CL_API_CALL
61 | clCreateEventFromGLsyncKHR(cl_context           /* context */,
62 |                            cl_GLsync            /* cl_GLsync */,
63 |                            cl_int *             /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
64 | 
65 | #ifdef __cplusplus
66 | }
67 | #endif
68 | 
69 | #endif	/* __OPENCL_CL_GL_EXT_H  */
70 | 


--------------------------------------------------------------------------------
/generate_training_samples.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from os.path import isfile, join
 3 | from random import randint
 4 | 
 5 | import argparse
 6 | from PIL import Image
 7 | 
 8 | img_id = 0  # number used in the next pair of sample file names
 9 | created_files =[]  # (large_path, small_path) of every pair written so far
10 | 
11 | def list_files(dir_path):
12 |   return [f for f in os.listdir(dir_path) if isfile(join(dir_path,f)) ]
13 | 
14 | def process_img(in_dir,out_dir,file_name, out_size,small_scale):
15 |   global img_id, created_files
16 |   in_path = join(in_dir, file_name)
17 |   large_path = join(out_dir, 'sample_{0:}_{1:}.jpg'.format(img_id, "large"))
18 |   small_path = join(out_dir, 'sample_{0:}_{1:}.jpg'.format(img_id, "small"))
19 |   # print( in_path)
20 |   # print(large_path)
21 |   # print(small_path)
22 |   img_id += 1
23 | 
24 |   im = Image.open(in_path)
25 |   if im.width < out_size or im.height < out_size:
26 |     raise Exception('Image \'{0:}\' is smaller then requested out-size'.format(file_name))
27 | 
28 |   crop_upper_left = randint(0,im.width-out_size), randint(0,im.height-out_size)
29 |   large = im.crop((crop_upper_left[0],\
30 |                    crop_upper_left[1], \
31 |                    crop_upper_left[0] + out_size,\
32 |                    crop_upper_left[1] + out_size))
33 |   # size = out_size, out_size
34 |   # im.resize(size, Image.ANTIALIAS)
35 |   large.save(large_path, "JPEG")
36 | 
37 |   small_size = int(out_size/small_scale)
38 |   small1 = large.resize((small_size,small_size), Image.ANTIALIAS)
39 |   small2 = small1.resize((out_size,out_size), Image.ANTIALIAS)
40 |   small2.save(small_path, "JPEG")
41 | 
42 |   created_files.append((large_path, small_path))
43 | 
44 | 
45 | 
46 | if __name__ == '__main__':
47 |   help_text = 'Mass resize images. Usage: ' + \
48 |               '"generate_training_samples.py -i data\\train_samples_raw -o data\\train_samples -s 128 -d 3"'
49 | 
50 |   parser = argparse.ArgumentParser(description=help_text)
51 |   parser.add_argument('--in-dir',   '-i',required=True, help='input directory' )
52 |   parser.add_argument('--out-dir',  '-o',required=True, help='output directory' )
53 |   parser.add_argument('--out-size', '-s',required=True, help='size of output images', type=int)
54 |   parser.add_argument('--degrade-factor', '-d', help='scale factor when producing smaller image', type=float, default=2)
55 |   args = parser.parse_args()
56 | 
57 |   in_files = list_files(args.in_dir)
58 |   # print('Found following files in \''+args.in_dir+'\': ')
59 |   # print(in_files)
60 | 
61 |   os.makedirs(args.out_dir, exist_ok=True)
62 |   for f in in_files:
63 |     try:
64 |       process_img(args.in_dir,args.out_dir,f, args.out_size, args.degrade_factor)
65 |     except IOError:
66 |       print("cannot create train samples for '{0:}'".format(f))
67 |     except Exception as e:
68 |       print(str(e))
69 | 
70 |   if not created_files:
71 |     print('No files were created')
72 |   else:
73 |     print('created {0:} files'.format(len(created_files)))
74 |     # print('\n'.join([item.replace("\\","\\\\") for sublist in created_files for item in sublist]))
75 | 


--------------------------------------------------------------------------------
/src/LayerData.cpp:
--------------------------------------------------------------------------------
 1 | #include "LayerData.hpp"
 2 | 
 3 | // #include <algorithm>  // for std::copy
 4 | #include <cstdio>     // snprintf
 5 | #include <stdexcept>  // std::runtime_error
 6 | 
 7 | namespace cnn_sr {
 8 | 
 9 | LayerData::LayerData(size_t n_prev_filter_cnt, size_t current_filter_count,
10 |                      size_t f_spatial_size)
11 |     : n_prev_filter_cnt(n_prev_filter_cnt),
12 |       current_filter_count(current_filter_count),
13 |       f_spatial_size(f_spatial_size) {
14 |   // Only capacity is reserved here: size() stays 0, so validate() can still
15 |   // notice weights or bias that were never provided.
16 |   weights.reserve(weight_size());
17 |   bias.reserve(bias_size());
18 | }
19 | 
20 | void LayerData::validate(const LayerData& data) {
21 |   // Throws std::runtime_error when weights/bias hold too few values. size_t is cast and printed with %lu: passing it for "%d" is undefined behaviour.
22 |   typedef unsigned long ul;
23 |   char buf[255];
24 |   if (data.weights.size() < data.weight_size()) {
25 |     snprintf(buf, 255,
26 |              "Declared f_spatial_size(%lu)*f_spatial_size(%lu)"
27 |              "*n_prev_filter_cnt(%lu)*current_filter_count(%lu)=%lu"
28 |              " is bigger then weights array (%lu elements)."
29 |              " Expected more elements in weights array. ",
30 |              ul(data.f_spatial_size), ul(data.f_spatial_size),
31 |              ul(data.n_prev_filter_cnt), ul(data.current_filter_count),
32 |              ul(data.weight_size()), ul(data.weights.size()));
33 |     throw std::runtime_error(buf);
34 |   }
35 |   if (data.bias.size() < data.bias_size()) {
36 |     snprintf(buf, 255,
37 |              "Bias array(size=%lu) should have equal size to "
38 |              "current_filter_count(%lu).",
39 |              ul(data.bias.size()), ul(data.bias_size()));
40 |     throw std::runtime_error(buf);
41 |   }
42 | }
43 | 
44 | ///
45 | /// get&set
46 | ///
47 | // Replaces the weights with weight_size() floats read from x (null: no-op); assign() so that a second call cannot append behind old values.
48 | void LayerData::set_weights(float* x) {
49 |   if (x) this->weights.assign(x, x + this->weight_size());
50 | }
51 | // Replaces the bias with bias_size() floats read from x (null: no-op); assign() so that a second call cannot append behind old values.
52 | void LayerData::set_bias(float* x) {
53 |   if (x) this->bias.assign(x, x + this->bias_size());
54 | }
55 | 
56 | void LayerData::get_output_dimensions(size_t* dim_arr, size_t input_w,
57 |                                       size_t input_h) const {
58 |   dim_arr[0] = input_w + 1 - f_spatial_size;  // out_w
59 |   dim_arr[1] = input_h + 1 - f_spatial_size;  // out_h
60 | }
61 | 
62 | size_t LayerData::weight_size() const {
63 |   const size_t per_filter = f_spatial_size * f_spatial_size * n_prev_filter_cnt;
64 |   return per_filter * current_filter_count;  // weights of all filters together
65 | }
66 | 
67 | size_t LayerData::bias_size() const { return current_filter_count; }  // one per filter
68 | 
69 | size_t LayerData::input_size(size_t input_w, size_t input_h) const {
70 |   return n_prev_filter_cnt * (input_w * input_h);  // values per previous filter
71 | }
72 | 
73 | // namespace cnn_sr
74 | }
75 | 
76 | std::ostream& operator<<(std::ostream& os, const cnn_sr::LayerData& data) {
77 |   // one-line summary of the layer geometry and the stored element counts
78 |   os << "Layer {";
79 |   os << " previous filters: " << data.n_prev_filter_cnt;
80 |   os << ", current filters: " << data.current_filter_count;
81 |   os << ", f_spatial_size: " << data.f_spatial_size;
82 |   os << ", weighs.size: " << data.weights.size();
83 |   return os << ", bias.size: " << data.bias.size() << "}";
84 | }
85 | 


--------------------------------------------------------------------------------
/test/specs/SquaredErrorTest.cpp:
--------------------------------------------------------------------------------
 1 | #include "TestSpecsDeclarations.hpp"
 2 | 
 3 | #include <random>  // for std::mt19937
 4 | #include <chrono>  // for random seed
 5 | 
 6 | #include "../../src/DataPipeline.hpp"
 7 | 
 8 | namespace test {
 9 | namespace specs {
10 | 
11 | ///
12 | /// PIMPL
13 | ///
14 | struct SquaredErrorTestImpl {
15 |   const size_t padding = 4;  // ground truth border on each side
16 |   const size_t algo_w = 1000, algo_h = 2000;  // size of the result image
17 | };
18 | 
19 | ///
20 | /// SquaredErrorTest
21 | ///
22 | 
23 | TEST_SPEC_PIMPL(SquaredErrorTest)
24 | 
25 | void SquaredErrorTest::init() {}
26 | // a single data set, so the id is not part of the name
27 | std::string SquaredErrorTest::name(size_t) { return std::string("Squared error test"); }
28 | 
29 | size_t SquaredErrorTest::data_set_count() { return 1; }
30 | 
31 | bool SquaredErrorTest::operator()(size_t,
32 |                                   cnn_sr::DataPipeline *const pipeline) {
33 |   assert_not_null(pipeline);
34 |   auto _context = pipeline->context();
35 |   // Compares DataPipeline::squared_error with a sum computed on the CPU.
36 |   // total padding (from both sides) = padding*2
37 |   const size_t total_padding = _impl->padding * 2,
38 |                ground_truth_w = _impl->algo_w + total_padding,
39 |                ground_truth_h = _impl->algo_h + total_padding,
40 |                algo_size = _impl->algo_w * _impl->algo_h,
41 |                ground_truth_size = ground_truth_w * ground_truth_h;
42 | 
43 |   std::vector<float> cpu_algo_res(algo_size);
44 |   // presumably a sentinel: a kernel that reads the border would inflate the sum
45 |   std::vector<float> cpu_ground_truth(ground_truth_size);
46 |   for (size_t i = 0; i < ground_truth_size; i++) {
47 |     cpu_ground_truth[i] = 99999.0f;
48 |   }
49 | 
50 |   float sum = 0.0f;
51 |   unsigned seed1 = std::chrono::system_clock::now().time_since_epoch().count();
52 |   std::mt19937 generator(seed1);
53 |   for (size_t i = 0; i < algo_size; i++) {
54 |     size_t row = i / _impl->algo_w, col = i % _impl->algo_w,
55 |            g_t_idx =
56 |                (row + _impl->padding) * ground_truth_w + _impl->padding + col;
57 |     cpu_ground_truth[g_t_idx] = generator() % 256;
58 |     cpu_algo_res[i] = (generator() % 2560) / 10.0f;
59 |     // accumulate the expected squared error
60 |     double d = cpu_ground_truth[g_t_idx] - cpu_algo_res[i];
61 |     sum += d * d;
62 |   }
63 | 
64 |   /* clang-format off */
65 |   auto gpu_buf_ground_truth = _context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * ground_truth_size);
66 |   _context->write_buffer(gpu_buf_ground_truth, (void *)&cpu_ground_truth[0], true);
67 |   auto gpu_buf_algo_res = _context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * algo_size);
68 |   _context->write_buffer(gpu_buf_algo_res, (void *)&cpu_algo_res[0], true);
69 |   /* clang-format on */
70 | 
71 |   // exec
72 |   opencl::MemoryHandle tmp_buffer = gpu_nullptr;
73 |   float target = 0.0f;
74 |   pipeline->squared_error(gpu_buf_ground_truth,            //
75 |                           ground_truth_w, ground_truth_h,  //
76 |                           gpu_buf_algo_res,                //
77 |                           tmp_buffer, target, total_padding);
78 |   _context->block();
79 |   assert_equals(sum, target);
80 | 
81 |   return true;
82 | }
83 | 
84 | //
85 | //
86 | }  // namespace specs
87 | }  // namespace test
88 | 


--------------------------------------------------------------------------------
/test/specs/LastLayerDeltaTest.cpp:
--------------------------------------------------------------------------------
 1 | #include "TestSpecsDeclarations.hpp"
 2 | 
 3 | #include <random>  // for std::mt19937
 4 | #include <chrono>  // for random seed
 5 | 
 6 | #include "../../src/DataPipeline.hpp"
 7 | 
 8 | using namespace cnn_sr;
 9 | 
10 | namespace test {
11 | namespace specs {
12 | 
13 | ///
14 | /// PIMPL
15 | ///
16 | struct LastLayerDeltaTestImpl {
17 |   // larger alternative that was used before: algo_w = 1000, algo_h = 2000
18 |   const size_t padding = 4;  // ground truth border on each side
19 |   const size_t algo_w = 6, algo_h = 6;  // size of the result image
20 | };
21 | 
22 | ///
23 | /// LastLayerDeltaTest
24 | ///
25 | 
26 | TEST_SPEC_PIMPL(LastLayerDeltaTest)
27 | 
28 | void LastLayerDeltaTest::init() {}
29 | // a single data set, so the id is not part of the name
30 | std::string LastLayerDeltaTest::name(size_t) { return std::string("Last layer delta test"); }
31 | 
32 | size_t LastLayerDeltaTest::data_set_count() { return 1; }
33 | 
34 | bool LastLayerDeltaTest::operator()(size_t,
35 |                                     cnn_sr::DataPipeline *const pipeline) {
36 |   // Compares DataPipeline::last_layer_delta with deltas computed on the CPU.
37 |   assert_not_null(pipeline);
38 |   auto _context = pipeline->context();
39 | 
40 |   // the ground truth exceeds the result by `padding` on every side
41 |   const size_t border = _impl->padding, both_borders = border * 2;
42 |   const size_t out_w = _impl->algo_w, out_h = _impl->algo_h;
43 |   const size_t truth_w = out_w + both_borders, truth_h = out_h + both_borders;
44 |   const size_t out_len = out_w * out_h, truth_len = truth_w * truth_h;
45 | 
46 |   std::vector<float> result_values(out_len);
47 |   std::vector<float> expected_deltas(out_len);
48 |   // 99999 everywhere; only the centre region is overwritten below
49 |   std::vector<float> truth_values(truth_len, 99999.0f);
50 | 
51 |   // time based seed: the generated data differs between runs
52 |   unsigned seed1 = std::chrono::system_clock::now().time_since_epoch().count();
53 |   std::mt19937 rng(seed1);
54 |   for (size_t i = 0; i < out_len; i++) {
55 |     const size_t row = i / out_w, col = i % out_w;
56 |     const size_t truth_idx = (row + border) * truth_w + border + col;
57 |     // t is drawn before x
58 |     const float t = (rng() % 256) / 100.0f;
59 |     const float x = (rng() % 2560) / 1000.0f;
60 |     const float y = activation_function(x);
61 |     // expected delta: error times the activation derivative
62 |     expected_deltas[i] = (y - t) * activation_function_derivative(x);
63 |     truth_values[truth_idx] = t;
64 |     result_values[i] = y;
65 |   }
66 | 
67 |   // upload both inputs; the output handle starts as gpu_nullptr
68 |   /* clang-format off */
69 |   auto gpu_truth = _context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * truth_len);
70 |   _context->write_buffer(gpu_truth, (void *)&truth_values[0], true);
71 |   auto gpu_result = _context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * out_len);
72 |   _context->write_buffer(gpu_result, (void *)&result_values[0], true);
73 |   /* clang-format on */
74 |   opencl::MemoryHandle gpu_deltas = gpu_nullptr;
75 | 
76 |   // exec
77 |   pipeline->last_layer_delta(gpu_truth, truth_w, truth_h,  //
78 |                              gpu_result, gpu_deltas, both_borders);
79 |   // compare what was written to gpu_deltas with the CPU values
80 |   assert_equals(pipeline, expected_deltas, gpu_deltas);
81 | 
82 |   return true;
83 | }
84 | 
85 | //
86 | //
87 | }  // namespace specs
88 | }  // namespace test
89 | 


--------------------------------------------------------------------------------
/src/opencl/UtilsOpenCL.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_OPENCL_H
 2 | #define UTILS_OPENCL_H
 3 | 
 4 | #include "CL/opencl.h"
 5 | 
 6 | namespace opencl {
 7 | 
 8 | class Kernel;
 9 | 
10 | namespace utils {
11 | 
12 | /**
13 |  * From stb_image documentation:
14 |  *
15 |  * The return value from an image loader is an 'unsigned char *' which points
16 |  * to the pixel data, or NULL on an allocation failure or if the image is
17 |  * corrupt or invalid. The pixel data consists of *y scanlines of *x pixels,
18 |  * with each pixel consisting of N interleaved 8-bit components; the first
19 |  * pixel pointed to is top-left-most in the image. There is no padding between
20 |  * image scanlines or between pixels, regardless of format. The number of
21 |  * components N is 'req_comp' if req_comp is non-zero, or *comp otherwise.
22 |  * If req_comp is non-zero, *comp has the number of components that _would_
23 |  * have been output otherwise. E.g. if you set req_comp to 4, you will always
24 |  * get RGBA output, but you can check *comp to see if it's trivially opaque
25 |  * because e.g. there were only 3 channels in the source image.
26 |  */
27 | struct ImageData {
28 |   ImageData();
29 |   ImageData(int, int, int, unsigned char*);
30 |   ~ImageData();
31 |   // TODO do not allow copy !!! NOTE(review): presumably the destructor releases `data`, so a copy would release it twice - confirm in UtilsOpenCL.cpp
32 |   // description of the pixel buffer; the stb_image note above explains its layout
33 |   int w, h;  // image width and height in pixels
34 |   int bpp;  // bytes per pixel
35 |   unsigned char* data;  // w * h pixels of bpp bytes each, no padding (per the note above)
36 | 
37 |  private:
38 |   bool read_from_file = true;  // NOTE(review): presumably selects how `data` is released - confirm
39 | };
40 | 
41 | /**
42 |  * cl_device_type is a number so we will change it to string
43 |  */
44 | extern char const* device_type_str[9];
45 | 
46 | /**
47 |  * Loads a program file and prepends the cPreamble to the code.
48 |  * @param  cFilename     program filename
49 |  * @param  cPreamble     code that is prepended to the loaded file, typically
50 |  *                       a set of #defines or a header
51 |  * @param  szFinalLength returned length of the code string
52 |  * @return               the source string on success, 0 otherwise
53 |  */
54 | char* load_file(const char* cFilename, const char* cPreamble,
55 |                 size_t* szFinalLength);
56 | 
57 | void load_image(const char*, ImageData&);  // presumably (file path, image to fill) - TODO confirm
58 | 
59 | int write_image(const char*, ImageData&);  // NOTE(review): meaning of the int result is undocumented
60 | 
61 | void write_image(const char* const, float*, size_t, size_t);  // presumably (path, values, w, h) - confirm
62 | 
63 | /**
64 |  * Due to different possible resolutions we may have to recalculate this each
65 |  * time.
66 |  *
67 |  * NOTE: this solution tries to maximize work items per group, as most of
68 |  * the kernels have some __local related optimizations
69 |  *
70 |  * @param kernel           kernel to execute
71 |  * @param dims             work dimensions: 1 for linear, 2 for 2D, 3 for 3D
72 |  * @param global_work_size to be filled, size: dims
73 |  * @param local_work_size  to be filled, size: dims
74 |  * @param work             real work size f.e. array length, image dimensions; size: dims
75 |  * @param print            presumably prints the computed sizes - TODO confirm
76 |  */
77 | void work_sizes(const opencl::Kernel&, size_t dims, size_t* global_work_size,
78 |                 size_t* local_work_size, size_t* work, bool print = false);
79 | 
80 | /**
81 |  * Converts an OpenCL error code to a string.
82 |  *
83 |  * @param  cl_int error code
84 |  * @return        C string for the given code
85 |  */
86 | const char* get_opencl_error_str(cl_int);
87 | }
88 | }
89 | 
90 | #endif /* UTILS_OPENCL_H   */
91 | 


--------------------------------------------------------------------------------
/src/kernel/swap_luma.cl:
--------------------------------------------------------------------------------
 1 | __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |  // integer pixel coordinates
 2 |                                CLK_ADDRESS_CLAMP_TO_EDGE |    // out-of-range reads are clamped
 3 |                                CLK_FILTER_NEAREST;            // no interpolation
 4 | 
 5 | // RGB <-> YCbCr coefficients, see http://www.equasys.de/colorconversion.html
 6 | /* clang-format off */
 7 | __constant float4 rgb2y =  {   0.299f,    0.587f,    0.114f,  0.0f};  // not used by swap_luma
 8 | __constant float4 rgb2Cb = { -0.1687f,  -0.3312f,      0.5f,  0.0f};  // w = 0: alpha drops out of dot()
 9 | __constant float4 rgb2Cr = {     0.5f,  -0.4186f,  -0.0813f,  0.0f};
10 | // __constant float4 rgb2Cb = {-37.797f, -74.203f,   112.0f,  0.0f};
11 | // __constant float4 rgb2Cr = { 112.0f,  -98.786f, -18.214f,  0.0f};
12 | // rows of the inverse transform; each one is dot()-ed with (Y, Cb, Cr, 0)
13 | __constant float4 YCbCr2r = { 1.0f,     0.0f,    1.4f,  0.0f};
14 | __constant float4 YCbCr2g = { 1.0f,  -0.343f, -0.711f,  0.0f};
15 | __constant float4 YCbCr2b = { 1.0f,   1.765f,    0.0f,  0.0f};
16 | /* clang-format on */
17 | 
18 | // Copies original_image into target as packed RGB (3 bytes per pixel). Inside the
19 | // centred luma_w x luma_h window luma comes from new_luma, chroma from the image.
20 | // new_luma is a plain const buffer: access qualifiers are valid only on images.
21 | __kernel void swap_luma(__read_only image2d_t original_image,
22 |                         __global const float* new_luma, __global uchar* target,
23 |                         const uint ground_truth_w, const uint ground_truth_h,
24 |                         const uint luma_w, const uint luma_h) {
25 |   // the same border is used on every side; presumably the caller guarantees that
26 |   // the width and height differences are equal and even - TODO confirm
27 |   const int padding = (int)((ground_truth_w - luma_w) / 2);
28 |   const int2 pos = {get_global_id(0), get_global_id(1)},
29 |              pos_luma = {pos.x - padding, pos.y - padding};
30 | 
31 |   // work items that fall outside of the image have nothing to do
32 |   if (pos.x < 0 || pos.x >= ground_truth_w ||  //
33 |       pos.y < 0 || pos.y >= ground_truth_h)
34 |     return;
35 | 
36 |   const size_t idx = pos.y * ground_truth_w + pos.x;
37 |   const uint4 pixel_col = read_imageui(original_image, sampler, pos);
38 |   const float4 pixel_col_f = convert_float4(pixel_col);
39 |   uint3 new_color;
40 |   if (pos_luma.x < 0 || pos_luma.x >= luma_w ||  //
41 |       pos_luma.y < 0 || pos_luma.y >= luma_h) {
42 |     // outside of the luma window: keep the original pixel (0..255 per channel)
43 |     new_color = pixel_col.xyz;
44 |   } else {
45 |     // combine new luma with chroma from original image
46 |     // to do this we first have to remove old luma
47 |     // NOTE: during conversion we skip +128 / -128 steps as they cancel
48 |     // themselves out
49 |     // TODO after writing tests use matrix version of this code
50 |     const size_t idx_luma = pos_luma.y * luma_w + pos_luma.x;
51 |     const float raw_luma = new_luma[idx_luma];  // 0..1
52 |     float4 YCbCr = {raw_luma * 255.0f,         // 0..255
53 |                     dot(pixel_col_f, rgb2Cb),  // signed chroma (no +128 offset)
54 |                     dot(pixel_col_f, rgb2Cr),  // signed chroma (no +128 offset)
55 |                     0.0f};
56 |     float3 rgb = {dot(YCbCr, YCbCr2r),  //
57 |                   dot(YCbCr, YCbCr2g),  //
58 |                   dot(YCbCr, YCbCr2b)};
59 |     rgb = clamp(rgb, 0.0f, 255.0f);
60 |     // TODO mix luma values in edges of new luma area, to make the transition
61 |     // less jarring
62 |     new_color = convert_uint3(rgb);
63 |   }
64 | 
65 |   // write
66 |   target[idx * 3 + 0] = convert_uchar(new_color.x);
67 |   target[idx * 3 + 1] = convert_uchar(new_color.y);
68 |   target[idx * 3 + 2] = convert_uchar(new_color.z);
69 | }
70 | 


--------------------------------------------------------------------------------
/test/specs/SwapLumaTest.cpp:
--------------------------------------------------------------------------------
 1 | #include "TestSpecsDeclarations.hpp"
 2 | 
 3 | #include "../../src/opencl/UtilsOpenCL.hpp"
 4 | #include "../../src/DataPipeline.hpp"
 5 | 
 6 | /*
 7 |  * Just run the kernel and see if the output is ~what You would expect.
 8 |  * This is quite easy if luma that You swap into has distinctive pattern.
 9 |  * You could also try generating expected output through python
10 |  * (see LumaTests_script.py), but it uses some weird sampling method,
11 |  * which means that result image is slightly blurred.
12 |  */
13 | 
14 | namespace test {
15 | namespace specs {
16 | 
17 | ///
18 | /// PIMPL
19 | ///
20 | struct SwapLumaTestImpl {
21 |   // border (per side, in pixels) around the swapped luma area
22 |   const size_t padding{10};
23 |   const char *const input_img{"test/data/color_grid2.jpg"};
24 |   const char *const expected_img{"test/data/color_grid2_luma_swapped.png"};
25 | };
26 | 
27 | ///
28 | /// SwapLumaTest
29 | ///
30 | 
31 | TEST_SPEC_PIMPL(SwapLumaTest)
32 | 
33 | void SwapLumaTest::init() {}  // nothing to prepare
34 | 
35 | size_t SwapLumaTest::data_set_count() { return 1u; }  // a single data set
36 | 
37 | std::string SwapLumaTest::name(size_t) { return std::string("Swap luma test"); }
38 | 
39 | bool SwapLumaTest::operator()(size_t, cnn_sr::DataPipeline *const pipeline) {
40 |   // Runs swap_luma on input_img with a synthetic gradient as the new luma and
41 |   // compares the GPU result byte-by-byte against expected_img.
42 |   assert_not_null(pipeline);
43 |   auto context = pipeline->context();
44 |   opencl::utils::ImageData img;
45 |   load_image(_impl->input_img, img);
46 | 
47 |   // generate luma to swap into: a gradient rising from 0 towards 1
48 |   // (element count is w * h; the previous w * w was only right for square images)
49 |   size_t luma_w = img.w - 2 * _impl->padding,
50 |          luma_h = img.h - 2 * _impl->padding, new_luma_size = luma_w * luma_h;
51 |   std::vector<float> new_luma(new_luma_size);
52 |   for (size_t i = 0; i < new_luma_size; i++) {
53 |     new_luma[i] = i * 1.0f / new_luma_size;
54 |   }
55 | 
56 |   // image/target handles start as gpu_nullptr; presumably swap_luma allocates them
57 |   opencl::MemoryHandle gpu_buf_raw_img = gpu_nullptr,
58 |                        gpu_buf_target = gpu_nullptr;
59 |   auto gpu_buf_luma =
60 |       context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * new_luma.size());
61 |   context->write_buffer(gpu_buf_luma, (void *)&new_luma[0], true);
62 | 
63 |   // run
64 |   pipeline->swap_luma(img, gpu_buf_raw_img, gpu_buf_luma, gpu_buf_target,
65 |                       luma_w, luma_h);
66 |   // check
67 |   opencl::utils::ImageData expected_img;
68 |   load_image(_impl->expected_img, expected_img);
69 |   size_t result_w = expected_img.w, result_h = expected_img.h,
70 |          result_size = result_w * result_h * 3;  // 3 channels
71 |   std::vector<unsigned char> result(result_size);
72 |   context->read_buffer(gpu_buf_target, (void *)&result[0], true);
73 | 
74 |   // dump image - only for debug !!!
75 |   // opencl::utils::ImageData res_img(result_w, result_h, 3, &result[0]);
76 |   // opencl::utils::write_image("dbg.png", res_img);
77 | 
78 |   for (size_t y = 0; y < result_h; y++) {
79 |     for (size_t x = 0; x < result_w; x++) {
80 |       const size_t idx1 = y * result_w + x;  // pixel index, shared by all channels
81 |       for (size_t ch = 0; ch < 3; ch++) {
82 |         // NOTE: expected_img has 4 channels !
83 |         int r = static_cast<int>(result[idx1 * 3 + ch]);
84 |         int e = static_cast<int>(expected_img.data[idx1 * 4 + ch]);
85 |         assert_equals(e, r);
86 |       }
87 |     }
88 |   }
89 |   return true;
90 | }
91 | 
92 | //
93 | //
94 | }  // namespace specs
95 | }  // namespace test
96 | 


--------------------------------------------------------------------------------
/test/TestRunner.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | 
  3 | #include "TestCase.hpp"
  4 | #include "../src/opencl/Context.hpp"
  5 | #include "../src/DataPipeline.hpp"
  6 | #include "specs/TestSpecsDeclarations.hpp"
  7 | 
  8 | /// Test runner main function.
  9 | /// ADD_TEST(T, ...) defines a T instance with a per-line unique name (__LINE__),
 10 | /// calls init(...) on it and appends its address to the local `cases` vector,
 11 | /// so it must be used at most once per source line, where `cases` is in scope.
 12 | #define ADD_TEST(test_name, ...)                   \
 13 |   test_name CONCATENATE(__test, __LINE__){};       \
 14 |   CONCATENATE(__test, __LINE__).init(__VA_ARGS__); \
 15 |   cases.push_back(&CONCATENATE(__test, __LINE__));
 16 | 
 17 | // Test runner entry point:
 18 | //  1. creates one OpenCL context / DataPipeline that all specs share,
 19 | //  2. registers the specs and runs every data set of each of them,
 20 | //  3. prints a summary and exits with EXIT_SUCCESS or EXIT_FAILURE.
 21 | int main(int, char **) {
 22 |   using namespace test;
 23 |   using namespace test::specs;
 24 | 
 25 |   std::cout << "STARTING TESTS" << std::endl;
 26 | 
 27 |   std::vector<TestCase *> cases;  // registered specs, filled by ADD_TEST
 28 |   std::vector<int> results;       // 1 = passed, 0 = failed; one per data set
 29 | 
 30 |   // one context / pipeline instance serves the whole run
 31 |   opencl::Context context;
 32 |   context.init();
 33 |   cnn_sr::DataPipeline pipeline(&context);
 34 |   pipeline.init(cnn_sr::DataPipeline::LOAD_KERNEL_MISC);
 35 |   // TODO test opt
 36 |   // pipeline.init(true, cnn_sr::DataPipeline::LOAD_KERNEL_MISC);
 37 | 
 38 |   // specs run in the order in which they are registered here
 39 |   // (ADD_TEST builds names from __LINE__: keep one registration per line)
 40 |   ADD_TEST(LayerTest);
 41 |   ADD_TEST(ExtractLumaTest);
 42 |   ADD_TEST(SwapLumaTest);
 43 |   ADD_TEST(SquaredErrorTest);
 44 |   ADD_TEST(SubtractFromAllTest);
 45 |   ADD_TEST(SumTest);
 46 |   ADD_TEST(LayerDeltasTest);
 47 |   ADD_TEST(BackpropagationTest);
 48 |   ADD_TEST(LastLayerDeltaTest);
 49 |   ADD_TEST(UpdateParametersTest);
 50 |   ADD_TEST(ConfigTest);
 51 | 
 52 |   // run phase: a spec reporting zero data sets is still executed once
 53 |   for (TestCase *test : cases) {
 54 |     const size_t reported = test->data_set_count();
 55 |     const size_t data_set_cnt = (reported == 0) ? 1 : reported;
 56 | 
 57 |     for (size_t ds = 0; ds < data_set_cnt; ds++) {
 58 |       // announce the data set, then run it
 59 |       const auto test_name = test->name(ds);
 60 |       std::cout << std::endl
 61 |                 << test_name << ":" << std::endl;
 62 | 
 63 |       bool passed = false;  // stays false when the spec throws
 64 |       try {
 65 |         passed = (*test)(ds, &pipeline);
 66 |       } catch (const std::exception &ex) {
 67 |         std::cout << "[ERROR] " << ex.what() << std::endl;
 68 |       } catch (...) {
 69 |         std::cout << "[ERROR] Undefined exception" << std::endl;
 70 |       }
 71 |       results.push_back(passed ? 1 : 0);
 72 |     }
 73 |   }
 74 | 
 75 |   // report phase: the specs are walked again in the same order, so the n-th
 76 |   // entry of `results` belongs to the n-th (spec, data set) pair
 77 |   std::cout << std::endl
 78 |             << "RESULTS:" << std::endl;
 79 | 
 80 |   int failures = 0;
 81 |   auto outcome = results.cbegin();
 82 |   for (TestCase *test : cases) {
 83 |     const size_t reported = test->data_set_count();
 84 |     const size_t data_set_cnt = (reported == 0) ? 1 : reported;
 85 | 
 86 |     for (size_t ds = 0; ds < data_set_cnt; ds++) {
 87 |       const auto test_name = test->name(ds);
 88 |       const bool passed = *outcome != 0;
 89 |       ++outcome;
 90 | 
 91 |       if (!passed) {
 92 |         // failed data sets are marked with '~'
 93 |         std::cout << "\t~ " << test_name << std::endl;
 94 |         ++failures;
 95 |         continue;
 96 |       }
 97 |       std::cout << "\t  " << test_name << std::endl;
 98 |     }
 99 |   }
100 | 
101 |   // the process exit status tells the caller whether anything failed
102 |   if (failures != 0) {
103 |     std::cout << failures << " of " << results.size() << " failed" << std::endl;
104 |     exit(EXIT_FAILURE);
105 |   }
106 | 
107 |   // all data sets passed
108 |   std::cout << results.size() << " tests completed" << std::endl;
109 |   exit(EXIT_SUCCESS);
110 | }
111 | 


--------------------------------------------------------------------------------
/test/data/test_cases.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "k=1, n=3, f=3, input:5*5": {
  3 |     "n_prev_filter_cnt": 1,
  4 |     "current_filter_count": 3,
  5 |     "f_spatial_size": 3,
  6 | 
  7 |     "input_w": 5,
  8 |     "input_h": 5,
  9 |     "input": [
 10 |       -0.6024,  0.3976,  0.2096,  0.2506, -0.1654,
 11 |       -0.4324,  0.0986, -0.1894,  0.2836,  0.1846,
 12 |       -0.1724, -0.3034, -0.0154, -0.4884,  0.1046,
 13 |        0.0676,  0.1426,  0.2506,  0.1426, -0.3034,
 14 |        0.2076, -0.0144,  0.2566, -0.0094,  0.0996],
 15 | 
 16 |     "output": [
 17 |       0.000, 0.000, 0.399,
 18 |       0.000, 0.776, 0.111,
 19 |       0.517, 0.000, 0.584,
 20 |       0.000, 0.253, 0.000,
 21 |       0.752, 0.000, 0.285,
 22 |       0.000, 0.715, 0.000,
 23 |       0.519, 0.200, 0.443,
 24 |       0.000, 0.726, 0.551,
 25 |       0.688, 0.000, 0.443],
 26 | 
 27 |     "weights": [
 28 |       1.0, 0.0, 0.0,
 29 |       0.0, 1.0, 0.0,
 30 |       1.0, 0.0, 0.0,
 31 | 
 32 |       0.0, 1.0, 0.0,
 33 |       1.0, 0.0, 1.0,
 34 |       0.0, 1.0, 0.0,
 35 | 
 36 |       1.0, 0.0, 0.0,
 37 |       0.0, 1.0, 0.0,
 38 |       1.0, 0.0, 0.0
 39 |     ],
 40 | 
 41 |     "bias": [0.1, 0.2, 0.3]
 42 |   },
 43 | 
 44 | 
 45 | 
 46 |   "k=3, n=2, f=3, input:3*3": {
 47 |     "n_prev_filter_cnt": 3,
 48 |     "current_filter_count": 2,
 49 |     "f_spatial_size": 3,
 50 | 
 51 |     "input_w": 3,
 52 |     "input_h": 3,
 53 |     "input": [
 54 |       0.406, 0.419, 0.598,
 55 |       0.442, 0.685, 0.528,
 56 |       0.627, 0.489, 0.642,
 57 |       0.376, 0.563, 0.499,
 58 |       0.680, 0.371, 0.571,
 59 |       0.390, 0.672, 0.453,
 60 |       0.626, 0.550, 0.609,
 61 |       0.386, 0.674, 0.634,
 62 |       0.666, 0.413, 0.609],
 63 | 
 64 |     "output": [0.169, 0.0],
 65 | 
 66 |     "weights": [
 67 |       -0.369,  0.025,     0.213,  0.058,    0.410, -0.068,
 68 |        0.236,  0.071,    -0.429, -0.104,    0.161,  0.087,
 69 |        0.361, -0.055,     0.273,  0.071,    0.431, -0.095,
 70 | 
 71 |        0.229,  0.378,    -0.178,  0.343,    0.114, -0.409,
 72 |       -0.220, -0.364,     0.711,	0.281,    0.851, -1.001,
 73 |       -0.411,	 0.661,    -0.831, -0.091,    0.281, -0.341,
 74 | 
 75 |       -0.931,	 0.511,     0.141, -0.591,    0.491, -0.921,
 76 |        0.291,	-0.211,     0.151,  0.491,   -0.431, -0.321,
 77 |       -0.631,	 0.301,    -0.001, -0.761,   -0.021,  0.501],
 78 | 
 79 |     "bias": [0.1, 0.2]
 80 |   },
 81 | 
 82 | 
 83 | 
 84 |   "k=3, n=3, f=1, input:3*3": {
 85 |     "n_prev_filter_cnt": 3,
 86 |     "current_filter_count": 3,
 87 |     "f_spatial_size": 1,
 88 | 
 89 |     "input_w": 3,
 90 |     "input_h": 3,
 91 |     "input": [
 92 |       0.406, 0.419, 0.598,
 93 |       0.442, 0.685, 0.528,
 94 |       0.627, 0.489, 0.642,
 95 |       0.376, 0.563, 0.499,
 96 |       0.680, 0.371, 0.571,
 97 |       0.390, 0.672, 0.453,
 98 |       0.626, 0.550, 0.609,
 99 |       0.386, 0.674, 0.634,
100 |       0.666, 0.413, 0.609],
101 | 
102 |     "output": [
103 |       0.369, 0.025, 0.229,
104 |       0.213, 0.058, 0.378,
105 |       0.410, 0.000, 0.178,
106 |       0.236, 0.071, 0.343,
107 |       0.429, 0.000, 0.114,
108 |       0.161, 0.087, 0.409,
109 |       0.361, 0.000, 0.220,
110 |       0.273, 0.071, 0.364,
111 |       0.431, 0.000, 0.132],
112 | 
113 |     "weights": [
114 |        0.20, -0.45, -0.35,    -0.45,  0.16,  0.54,    0.63, -0.10, -0.26],
115 | 
116 |     "bias": [0.1, 0.2, 0.3]
117 |   }
118 | 
119 | }
120 | 


--------------------------------------------------------------------------------
/test/specs/BackpropagationTest_script.py:
--------------------------------------------------------------------------------
  1 | #helper script to generate expected delta values
  2 | #for BackpropagationTest
  3 | 
  4 | inputs = [[  # one flattened 5x5 (see inn) map per previous-layer filter (n_prev = 2)
  5 |   -0.083,   0.075,   -0.058,   -0.068,  -0.013,
  6 |    0.169,   0.181,    0.136,   -0.165,   0.159,
  7 |   -0.112,   0.003,   -0.123,   -0.102,   0.242,
  8 |    0.406,  -0.442,   -0.627,    0.376,   0.680,
  9 |    0.121,  -0.103,    0.106,   -0.036,   0.052],
 10 |  [-0.064,  -0.055,   -0.138,   -0.144,   0.176,
 11 |    0.049,  -0.051,   -0.062,   -0.176,  -0.060,
 12 |    0.228,  -0.138,   -0.027,   -0.061,  -0.069,
 13 |    0.419,   0.685,   -0.489,    0.563,  -0.371,
 14 |   -0.075,   0.031,    0.033,   -0.052,  -0.035]]
 15 | 
 16 | deltas = [0.122, 0.083, 0.064,  # row 1, col 1 (n_curr = 3 values per output pixel)
 17 |           0.057, 0.075, 0.055,  # row 1, col 2
 18 |           0.025, 0.058, 0.138,  # row 1, col 3
 19 | 
 20 |           0.170, 0.068, 0.144,  # row 2, col 1
 21 |           0.121, 0.013, 0.176,  # row 2, col 2
 22 |           0.065, 0.169, 0.049,  # row 2, col 3
 23 | 
 24 |           0.003, 0.181, 0.051,  # row 3, col 1
 25 |           0.021, 0.136, 0.062,  # row 3, col 2
 26 |           0.066, 0.165, 0.176]  # row 3, col 3
 27 | 
 28 | f=3  # filter spatial size (f x f taps)
 29 | out = 3,3  # output size as (w, h)
 30 | inn = 5,5  # input size as (w, h)
 31 | n_curr = 3  # number of filters in the current layer
 32 | n_prev = 2  # number of filters in the previous layer
 33 | 
 34 | w = [1.5] * 54 # 54 = f*f*n_prev*n_curr entries; we will add algo result to this
 35 | 
 36 | def kernel(x,y):
 37 |     for n in range(n_curr):
 38 |         delta_idx = ((y * out[0]) + x) * n_curr + n
 39 |         delta = deltas[delta_idx]
 40 |         for a in range(f):
 41 |             for b in range(f):
 42 |                 for k in range(n_prev):
 43 |                     p = x+b, y+a
 44 |                     val = inputs[k][p[1] * inn[0] + p[0]]
 45 |                     idx = ((a * f) + b) *n_curr*n_prev + k * n_curr + n
 46 |                     w[idx] += val * delta
 47 |                     # w[idx] += val
 48 |                     # w[idx] += delta
 49 | 
 50 | for y in range(out[1]):
 51 |     for x in range(out[0]):
 52 |         kernel(x,y)
 53 | # print('\n'.join(["[{}]\t{:>6.3}".format(i,x) for i,x in enumerate(w)]))
 54 | # print('\n'.join(["[{}]\t{:>6.3}".format(i,x) for i,x in enumerate(w) if i%3==0]))
 55 | # print('\n'.join(["[{}]\t{:>6}".format(i,x) for i,x in enumerate(w)]))
 56 | for i in range(9):
 57 |     xs = w[i*6:(i+1)*6]
 58 |     print(', '.join(["{:>7.5}".format(x) for i,x in enumerate(xs)]))
 59 | 
 60 | 
 61 | print('\n\nbias:')
 62 | bias_res = [
 63 |     sum([x for i,x in enumerate(deltas) if i%3==0]),
 64 |     sum([x for i,x in enumerate(deltas) if i%3==1]),
 65 |     sum([x for i,x in enumerate(deltas) if i%3==2])]
 66 | print(', '.join(["{:>6.3}".format(x) for i,x in enumerate(bias_res)]))
 67 | 
 68 | 
 69 | '''
 70 | ONLY INPUT:
 71 | 
 72 | PY:
 73 | [0]      0.188
 74 | [3]     -0.258
 75 | [6]     -0.121
 76 | [9]     -0.852
 77 | [12]     0.008
 78 | [15]    -0.561
 79 | [18]    -0.409
 80 | [21]     0.614
 81 | [24]    -0.763
 82 | [27]     0.244
 83 | [30]     0.576
 84 | [33]    -0.752
 85 | [36]    -0.771
 86 | [39]     0.667
 87 | [42]    -0.948
 88 | [45]     0.545
 89 | [48]     0.568
 90 | [51]    -0.508
 91 | 
 92 | GPU:
 93 | b[0]    0.188
 94 | b[3]   -0.258
 95 | b[6]   -0.121
 96 | b[9]   -0.852
 97 | b[14]   0.008
 98 | b[17]  -0.561
 99 | 
100 | b[18]  -0.409
101 | b[23]   0.614
102 | b[24]  -0.763
103 | b[29]   0.244
104 | b[32]   0.576
105 | b[35]  -0.752
106 | 
107 | b[38]  -0.771
108 | b[41]   0.667
109 | b[44]  -0.948
110 | b[47]   0.545
111 | b[50]   0.568
112 | b[53]  -0.508
113 | '''
114 | 


--------------------------------------------------------------------------------
/schedule_training.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import time
 3 | import shutil
 4 | import argparse
 5 | import subprocess
 6 | from time import gmtime, strftime
 7 | 
 8 | '''
 9 | typical .bat file:
10 | 
11 | make build
12 | if %errorlevel%==0 (
13 |   bin\cnn.exe train -c data\config.json --epochs 100 -i data\train_samples -o data\parameters.json
14 | )
15 | '''
16 | 
17 | epochs_per_iteration = 500
18 | pars_file = 'data\\parameters.json'
19 | 
20 | seconds_per_epoch = 0.7
21 | #seconds_per_epoch = 0.236
22 | 
23 | cmd = 'bin\\cnn.exe train -c data\config.json --epochs {} -i data\\train_samples'.format(epochs_per_iteration)
24 | 
25 | 
26 | def get_dst_file_path():
27 |   #strftime("%Y-%m-%d %H:%M:%S")
28 |   tt = strftime("%Y-%m-%d--%H-%M-%S")
29 |   log_folder = lambda s: os.path.join('logs', s)
30 |   return log_folder('log_{}.txt'.format(tt)), \
31 |          log_folder('parameters_{}.json'.format(tt)), \
32 |          tt
33 | 
34 | 
35 | seconds_per_unit = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800}
36 | 
37 | def convert_to_seconds(s):
38 |   return int(s[:-1]) * seconds_per_unit[s[-1]]
39 | 
40 | if __name__ == '__main__':
41 |   help_text = 'Start training with either duration or #epochs'
42 |   parser = argparse.ArgumentParser(description=help_text)
43 |   action = parser.add_mutually_exclusive_group(required=True)
44 |   action.add_argument('--duration', '-d', help='Duration, provided as: X[s|m|h|d|w] (s=seconds, m=minutes, h=hours, d=days, w=week)')
45 |   action.add_argument('--epochs',   '-e', type=int, help='Number of epochs')
46 |   parser.add_argument('--dry', action='store_true', required=False, help='Do not output any files')
47 | 
48 |   args = parser.parse_args()
49 |   if args.duration:
50 |     time_in_s = convert_to_seconds(args.duration)
51 |     total_epochs = int(time_in_s / seconds_per_epoch)
52 |   else:
53 |     total_epochs = args.epochs
54 |   total_epochs = max(total_epochs, epochs_per_iteration)
55 | 
56 |   cmd_ = cmd.split(' ')
57 |   if args.dry:
58 |     cmd_.append('dry')
59 |   else:
60 |     cmd_.append('-o')
61 |     cmd_.append(pars_file)
62 |   print('Command to execute:')
63 |   print('\'' + (' '.join(cmd_)) + '\'')
64 | 
65 |   start = time.time()
66 |   iters = total_epochs // epochs_per_iteration
67 |   total_epochs = iters * epochs_per_iteration # last iter have same #epochs as others
68 |   print('Will do {0:} iterations, {1:} epochs per iteration = {2:} total'.format( \
69 |             iters, epochs_per_iteration, iters * epochs_per_iteration))
70 |   est_time = total_epochs * seconds_per_epoch
71 |   print('Estimated required time: {:.3f}s = {:.3f} min'.format(est_time, est_time//60))
72 | 
73 |   for i in range(iters):
74 |     log_path, tmp_params_path, stamp = get_dst_file_path()
75 |     total_epochs_left = (iters - i) * epochs_per_iteration
76 |     print('\n---- {0:} - {1:} (time left: {2:d}min)----'.format(i+1, stamp, int(total_epochs_left*seconds_per_epoch)//60))
77 | 
78 |     # execute training
79 |     with open(log_path, "w") as tmp_log:
80 |       ret_code = subprocess.call(cmd_, stdout=tmp_log, stderr=subprocess.STDOUT)
81 |       print('return code: '+str(ret_code))
82 |       if ret_code is not 0:
83 |         print('---- FAIL ----')
84 |         exit()
85 | 
86 |     # backup results
87 |     if not args.dry:
88 |       print('saving sub results to: \'' + tmp_params_path + '\'')
89 |       shutil.copy2(pars_file, tmp_params_path)
90 | 
91 |   end = time.time()
92 |   dt = end - start
93 |   print("Execution time: {:.3f}s = {:.2f}min ({:.5f} s/epoch)".format(dt, dt/60, dt/total_epochs))
94 | 


--------------------------------------------------------------------------------
/src/pch.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef PCH_H
  2 | #define PCH_H
  3 | 
  4 | #include <string>
  5 | #include <vector>
  6 | // #include <cstddef>  // for size_t
  7 | 
  8 | // TODO use during compilation
  9 | 
 10 | ///
 11 | /// forward declarations
 12 | ///
 13 | /* clang-format off */
 14 | namespace cnn_sr {
 15 |   struct ParametersDistribution;
 16 |   struct Config;
 17 |   class ConfigReader;
 18 |   class DataPipeline;
 19 |   class ConfigBasedDataPipeline;
 20 |   struct LayerData;
 21 |   struct CnnLayerGpuAllocationPool;
 22 | }
 23 | 
 24 | namespace opencl {
 25 |   class Kernel;
 26 |   typedef size_t MemoryHandle;
 27 |   class Context;
 28 | 
 29 |   namespace utils {
 30 |     struct ImageData;
 31 |   }
 32 | }
 33 | /* clang-format on */
 34 | 
 35 | typedef struct _cl_event* cl_event;
 36 | 
 37 | union JsonValue;
 38 | struct JsonNode;
 39 | class JsonAllocator;
 40 | 
 41 | ///
 42 | /// Utils
 43 | ///
 44 | namespace cnn_sr {
 45 | 
 46 | extern bool warn_about_blocking_operation;
 47 | 
 48 | namespace utils {
 49 | 
 50 | void require(bool, const char*);  // presumably (condition, message used on failure) - TODO confirm
 51 | 
 52 | void dump_vector(std::ostream&, std::vector<float>&,
 53 |                  const char* line_prefix = nullptr, size_t per_line = 0,
 54 |                  bool add_line_numbers = false);  // NOTE(review): header includes no <iosfwd>/<ostream>
 55 | 
 56 | template <typename T>
 57 | inline bool is_odd(T x) {  // odd <=> lowest bit set; T has to support operator&
 58 |   return static_cast<bool>(x & 1);
 59 | }
 60 | 
 61 | template <typename T>
 62 | inline bool is_even(T x) {  // defined as the complement of is_odd
 63 |   return is_odd(x) == false;
 64 | }
 65 | 
 66 | size_t closest_power_of_2(int);
 67 | 
 68 | /// Utils - macros
 69 | /// STRINGIFY / CONCATENATE go through a helper macro so that macro arguments
 70 | /// (e.g. __LINE__) are expanded before they are stringified / pasted.
 71 | #define STRINGIFY2(s) #s
 72 | #define STRINGIFY(s) STRINGIFY2(s)
 73 | 
 74 | #define CONCATENATE_DETAIL(x, y) x##y
 75 | #define CONCATENATE(x, y) CONCATENATE_DETAIL(x, y)
 76 | 
 77 | ///
 78 | /// File system
 79 | ///
 80 | #define IOException std::ios_base::failure
 81 | 
 82 | void get_file_content(const char* const, std::stringstream&);
 83 | 
 84 | void list_files(const char* const, std::vector<std::string>&);
 85 | 
 86 | ///
 87 | /// Json utils
 88 | ///
 89 | /** NOTE: we need to hold file content in some persistent place (the
 90 |  * std::string argument), presumably because parsed values point into it */
 91 | void read_json_file(const char* const, JsonValue&, JsonAllocator&, std::string&,
 92 |                     int root_type);
 93 | 
 94 | bool try_read_float(JsonNode&, float&, const char*);
 95 | // (unsigned int)node->value.toNumber();
 96 | bool try_read_uint(JsonNode&, unsigned int&, const char*);
 97 | bool try_read_vector(JsonNode&, std::vector<float>&, const char*);
 98 | bool try_read_string(JsonNode&, std::string&, const char*);
 99 | 
100 | ///
101 | /// Cmd line args parsing
102 | ///
103 | struct ArgOption {
104 |   bool _required = false;  // presumably set by required() - TODO confirm
105 |   std::string _name = "";
106 |   std::string _help = "";  // presumably set by help() - TODO confirm
107 |   std::vector<std::string> _mnemonics;  // NOTE(review): presumably alternative spellings of the option
108 | 
109 |   ArgOption& help(const char*);  // returns ArgOption&, presumably *this for call chaining
110 |   ArgOption& required();
111 | };
112 | 
113 | class Argparse {
114 |   typedef std::pair<ArgOption*, std::string> ArgValue;  // presumably (option, value given for it)
115 | 
116 |  public:
117 |   Argparse(const char*, const char*);  // presumably fills _exec_name / _general_help - confirm order
118 | 
119 |   ArgOption& add_argument(const char*);
120 |   ArgOption& add_argument(const char*, const char*);
121 |   bool parse(size_t, char**);  // presumably (argc, argv); meaning of the result is not visible here
122 |   void print_help();
123 | 
124 |   bool has_arg(const char*);
125 |   const char* value(const char*);
126 |   void value(const char*, size_t&);  // overload with a size_t output argument
127 | 
128 |  private:
129 |   ArgOption& add_argument(size_t, const char**);
130 |   ArgValue* get_value(const char*);
131 | 
132 |   const std::string _general_help, _exec_name;
133 |   std::vector<ArgOption> _options;
134 |   std::vector<ArgValue> _values;
135 | };
136 | }
137 | }
138 | 
139 | #endif /* PCH_H   */
140 | 


--------------------------------------------------------------------------------
/src/opencl/Kernel.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef KERNEL_H
  2 | #define KERNEL_H
  3 | 
  4 | #include "CL/opencl.h"
  5 | #include <iostream>  // for std::ostream& operator<<(..)
  6 | 
  7 | #define MAX_KERNEL_IDENTIFIER_SIZE 128
  8 | 
  9 | namespace opencl {
 10 | 
 11 | // forward declaration
 12 | class Context;
 13 | typedef size_t MemoryHandle;
 14 | 
 15 | class Kernel {
 16 |  public:
 17 |   void init(Context *, cl_kernel, cl_program,  //
 18 |             const char *, const char *);
 19 |   void cleanup();
 20 |   friend std::ostream &operator<<(std::ostream &os, opencl::Kernel &p);
 21 | 
 22 |   size_t current_local_memory();
 23 | 
 24 |   /**
 25 |    * Set the next argument. To be used as a sequence of calls,
 26 |    * where each one sets the next argument.
 27 |    *
 28 |    * @param arg_size  size of the argument f.e. sizeof(cl_mem) | sizeof(cl_int)
 29 |    * @param arg_value void* pointer to argument value
 30 |    */
 31 |   void push_arg(size_t arg_size, const void *);
 32 | 
 33 |   /**
 34 |    * Set the next argument. To be used as a sequence of calls,
 35 |    * where each one sets the next argument.
 36 |    *
 37 |    * @param handle  gpu memory handle
 38 |    */
 39 |   void push_arg(MemoryHandle);
 40 | 
 41 |   /**
 42 |    * Execute the kernel with arguments that were pushed before this call.
 43 |    * After this call You will have to provide all arguments again before
 44 |    * You execute the kernel again.
 45 |    * Also this function provides some basic checks for work_size parameters,
 46 |    * so You can catch mistakes more easily.
 47 |    *
 48 |    * @param  work_dim           number of dimensions
 49 |    * @param  global_work_size   :size_t*, total work size provided as an
 50 |    *                            array with one value
 51 |    *                            for each of the dimensions
 52 |    * @param  local_work_size    :size_t*, work group size
 53 |    * @param  events_to_wait_for [OPT] wait for other operations to finish
 54 |    * @param  event_count        [OPT] presumably the length of events_to_wait_for
 55 |    * @return                    opencl event object
 56 |    */
 57 |   cl_event execute(cl_uint work_dim,                //
 58 |                    const size_t *global_work_size,  //
 59 |                    const size_t *local_work_size,   //
 60 |                    cl_event *events_to_wait_for = nullptr, int event_count = 0);
 61 | 
 62 |   inline size_t get_max_work_group_size() const { return max_work_group_size; }
 63 |   inline Context *get_context() const { return context; }
 64 |   inline cl_ulong get_total_execution_time() const {
 65 |     return execution_time_sum;
 66 |   }
 67 |   inline const char *get_human_identifier() const { return human_identifier; }
 68 | 
 69 |  private:
 70 |   /**
 71 |    * Basic checks for work parameters. Based on:
 72 |    * https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
 73 |    *
 74 |    * Returns nothing. NOTE(review): how a violation is reported is not visible here.
 75 |    */
 76 |   void check_work_parameters(cl_uint work_dim,  //
 77 |                              const size_t *global_work_size,
 78 |                              const size_t *local_work_size);
 79 | 
 80 |  private:
 81 |   cl_kernel kernel_id;
 82 |   cl_program program_id;
 83 |   Context *context;
 84 |   size_t max_work_group_size;
 85 |   cl_ulong private_mem_size;
 86 |   size_t pref_work_group_multiple;
 87 | 
 88 |   size_t arg_stack_size;  // presumably the number of arguments pushed so far - TODO confirm
 89 |   size_t assigned_local_memory;  // by hand (original: "since it does always work", likely "does not")
 90 |   bool initialized = false;
 91 | 
 92 |   /** meaningful only if context->is_running_profile_mode */
 93 |   cl_ulong execution_time_sum = 0;
 94 |   char human_identifier[MAX_KERNEL_IDENTIFIER_SIZE];  // returned by get_human_identifier()
 95 | };
 96 | 
 97 | //
 98 | }
 99 | 
100 | // std::ostream &operator<<(std::ostream &, opencl::Kernel &);
101 | 
102 | #endif /* KERNEL_H   */
103 | 


--------------------------------------------------------------------------------
/src/kernel/squared_error.cl:
--------------------------------------------------------------------------------
 1 | /* clang-format off */
 2 | /**
 3 |  * Atomically adds operand to the float at *source (compare-and-swap loop).
 4 |  * @see http://suhorukov.blogspot.com/2011/12/opencl-11-atomic-operations-on-floating.html
 5 |  * @see  http://simpleopencl.blogspot.com/2013/05/atomic-operations-and-floats-in-opencl.html
 6 |  * @param source  pointer to the __global float that accumulates the sum
 7 |  * @param operand value added to *source
 8 |  */
 9 | inline void atomic_add_global(volatile __global float* source, const float operand) {
10 |   /* clang-format on */
11 |   union {  // same bits seen as float (for the addition) or as uint (for the exchange)
12 |     unsigned int intVal;
13 |     float floatVal;
14 |   } newVal;
15 | 
16 |   union {
17 |     unsigned int intVal;
18 |     float floatVal;
19 |   } prevVal;
20 | 
21 |   // NOTE: atomic_cmpxchg(volatile __global unsigned int *p,
22 |   //                      unsigned int cmp, unsigned int val) returns the old *p
23 |   do {
24 |     prevVal.floatVal = *source;  // snapshot of the current value
25 |     newVal.floatVal = prevVal.floatVal + operand;
26 |   } while (atomic_cmpxchg((volatile __global unsigned int*)source,
27 |                           prevVal.intVal,  // swap only if *source still holds the snapshot
28 |                           newVal.intVal) != prevVal.intVal);  // otherwise retry
29 | }
30 | 
31 | /**
32 |  * Part of mean square error calculations. Here we take 2 same sized,
33 |  * single color channel buffers with image data and get the difference
34 |  * between respective pixels.
35 |  */
36 | __kernel void squared_err(__read_only __global float* ground_truth_image,
37 |                           __read_only __global float* algo_result,
38 |                           __global float* target,       //
39 |                           __local float* scratch,       //
40 |                           __const uint ground_truth_w,  //
41 |                           __const uint ground_truth_h,  //
42 |                           __const uint algo_result_w,   //
43 |                           __const uint algo_result_h) {
44 |   const int2 pos = {get_global_id(0), get_global_id(1)};  // x=col=i, y=row=j
45 |   const uint sample_id = get_global_id(2);
46 |   const int2 out_size = {algo_result_w, algo_result_h};
47 |   const int idx = (pos.y * algo_result_w) + pos.x;
48 |   const size_t padding = (ground_truth_w - algo_result_w) / 2;
49 |   const size_t local_size = get_local_size(1) * get_local_size(0),
50 |                local_index =
51 |                    get_local_id(1) * get_local_size(0) + get_local_id(0);
52 | 
53 | #define IMAGE_OFFSET_GT sample_id* ground_truth_w* ground_truth_h
54 | #define IMAGE_OFFSET_ALGO sample_id* algo_result_w* algo_result_h
55 | 
56 |   // size of ground_truth != algo res (padding)
57 |   // The offset is not const, since it depends on the row we are in
58 |   // algo for ground_truth_idx:
59 |   // (row + padding_on_top_of_image) * width + padding_left + col
60 |   const size_t ground_truth_idx =
61 |       (pos.y + padding) * ground_truth_w + padding + pos.x;
62 | 
63 |   float squared_diff = 0.0f;
64 |   if (pos.x >= 0 && pos.x < out_size.x &&  //
65 |       pos.y >= 0 && pos.y < out_size.y) {
66 |     float t = ground_truth_image[IMAGE_OFFSET_GT + ground_truth_idx];
67 |     float y = algo_result[IMAGE_OFFSET_ALGO + idx];
68 |     float d = y - t;
69 |     squared_diff = d * d;
70 |   }
71 |   scratch[local_index] = squared_diff;
72 | 
73 |   // wait till all kernels from local groups finished
74 |   barrier(CLK_LOCAL_MEM_FENCE);
75 | 
76 |   // add all squared_diffs for local group
77 |   for (int offset = local_size / 2; offset > 0; offset = offset / 2) {
78 |     if (local_index < offset) {
79 |       float other = scratch[local_index + offset];
80 |       float mine = scratch[local_index];
81 |       scratch[local_index] = mine + other;
82 |     }
83 |     // wait for all local kernels to finish previous step
84 |     // and reach stable state
85 |     barrier(CLK_LOCAL_MEM_FENCE);
86 |   }
87 | 
88 |   // add local result to global result
89 |   if (local_index == 0) {
90 |     atomic_add_global(target, scratch[0]);
91 |   }
92 | }
93 | 


--------------------------------------------------------------------------------
/libs/include/json/gason.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <stdint.h>
  4 | #include <stddef.h>
  5 | #include <assert.h>
  6 | 
// Kind of a value held by JsonValue. The tag is stored in a 4 bit field
// (see JSON_VALUE_TAG_MASK), JSON_NULL has all 4 bits set.
enum JsonTag {
    JSON_NUMBER = 0,
    JSON_STRING,
    JSON_ARRAY,
    JSON_OBJECT,
    JSON_TRUE,
    JSON_FALSE,
    JSON_NULL = 0xF
};

struct JsonNode;

// Bit layout of the 64 bit word used by JsonValue for tagged values:
//   * bits 0..46: payload (JSON_VALUE_PAYLOAD_MASK has the low 47 bits set)
//   * bits 47..50: tag, i.e. JsonTag shifted left by JSON_VALUE_TAG_SHIFT
//   * JSON_VALUE_NAN_MASK: bits 51..62 set; JsonValue ORs it into every
//     tagged value
#define JSON_VALUE_PAYLOAD_MASK 0x00007FFFFFFFFFFFULL
#define JSON_VALUE_NAN_MASK 0x7FF8000000000000ULL
#define JSON_VALUE_TAG_MASK 0xF
#define JSON_VALUE_TAG_SHIFT 47
 23 | 
 24 | union JsonValue {
 25 |     uint64_t ival;
 26 |     double fval;
 27 | 
 28 |     JsonValue(double x)
 29 |         : fval(x) {
 30 |     }
 31 |     JsonValue(JsonTag tag = JSON_NULL, void *payload = nullptr) {
 32 |         assert((uint64_t)payload <= JSON_VALUE_PAYLOAD_MASK);
 33 |         ival = JSON_VALUE_NAN_MASK | ((uint64_t)tag << JSON_VALUE_TAG_SHIFT) | (uintptr_t)payload;
 34 |     }
 35 |     bool isDouble() const {
 36 |         return (int64_t)ival <= (int64_t)JSON_VALUE_NAN_MASK;
 37 |     }
 38 |     JsonTag getTag() const {
 39 |         return isDouble() ? JSON_NUMBER : JsonTag((ival >> JSON_VALUE_TAG_SHIFT) & JSON_VALUE_TAG_MASK);
 40 |     }
 41 |     uint64_t getPayload() const {
 42 |         assert(!isDouble());
 43 |         return ival & JSON_VALUE_PAYLOAD_MASK;
 44 |     }
 45 |     double toNumber() const {
 46 |         assert(getTag() == JSON_NUMBER);
 47 |         return fval;
 48 |     }
 49 |     char *toString() const {
 50 |         assert(getTag() == JSON_STRING);
 51 |         return (char *)getPayload();
 52 |     }
 53 |     JsonNode *toNode() const {
 54 |         assert(getTag() == JSON_ARRAY || getTag() == JSON_OBJECT);
 55 |         return (JsonNode *)getPayload();
 56 |     }
 57 | };
 58 | 
// Element of a singly linked list of values. JsonIterator walks the list
// through `next`, and the iterator returned by end() holds a null pointer,
// so the last node is expected to have next == nullptr.
struct JsonNode {
    JsonValue value;
    JsonNode *next;
    // NOTE(review): presumably the member name when the node belongs to a
    // JSON object - confirm against the parser
    char *key;
};
 64 | 
 65 | struct JsonIterator {
 66 |     JsonNode *p;
 67 | 
 68 |     void operator++() {
 69 |         p = p->next;
 70 |     }
 71 |     bool operator!=(const JsonIterator &x) const {
 72 |         return p != x.p;
 73 |     }
 74 |     JsonNode *operator*() const {
 75 |         return p;
 76 |     }
 77 |     JsonNode *operator->() const {
 78 |         return p;
 79 |     }
 80 | };
 81 | 
 82 | inline JsonIterator begin(JsonValue o) {
 83 |     return JsonIterator{o.toNode()};
 84 | }
 85 | inline JsonIterator end(JsonValue) {
 86 |     return JsonIterator{nullptr};
 87 | }
 88 | 
// X-macro table of status codes. Every XX(no, str) row pairs an identifier
// suffix with a human readable description.
#define JSON_ERRNO_MAP(XX)                           \
    XX(OK, "ok")                                     \
    XX(BAD_NUMBER, "bad number")                     \
    XX(BAD_STRING, "bad string")                     \
    XX(BAD_IDENTIFIER, "bad identifier")             \
    XX(STACK_OVERFLOW, "stack overflow")             \
    XX(STACK_UNDERFLOW, "stack underflow")           \
    XX(MISMATCH_BRACKET, "mismatch bracket")         \
    XX(UNEXPECTED_CHARACTER, "unexpected character") \
    XX(UNQUOTED_KEY, "unquoted key")                 \
    XX(BREAKING_BAD, "breaking bad")

// Status codes generated from the table above: each row becomes the
// enumerator JSON_<no>, in table order, so JSON_OK has the value 0.
// The description strings are not used here (presumably they are consumed
// by jsonStrError - confirm in the implementation file).
enum JsonErrno {
#define XX(no, str) JSON_##no,
    JSON_ERRNO_MAP(XX)
#undef XX
};
106 | 
107 | const char *jsonStrError(int err);
108 | 
// Owner of a chain of memory zones (`head`). The object can be moved but
// not copied, the zones are released by deallocate(), which is also called
// from the destructor.
class JsonAllocator {
    struct Zone {
        Zone *next;
        size_t used;
    } *head = nullptr;

public:
    JsonAllocator() = default;
    JsonAllocator(const JsonAllocator &) = delete;
    JsonAllocator &operator=(const JsonAllocator &) = delete;
    // Takes over the zones of `x` and leaves `x` empty.
    JsonAllocator(JsonAllocator &&x) : head(x.head) {
        x.head = nullptr;
    }
    // Releases the zones owned so far, then takes over the zones of `x`.
    // Assigning an object to itself is a no-op.
    JsonAllocator &operator=(JsonAllocator &&x) {
        if (this != &x) {
            // Without this call the current chain would be overwritten and
            // leaked. deallocate() has to cope with an empty chain anyway,
            // as the destructor calls it for moved-from objects.
            deallocate();
            head = x.head;
            x.head = nullptr;
        }
        return *this;
    }
    ~JsonAllocator() {
        deallocate();
    }
    void *allocate(size_t size);
    void deallocate();
};
133 | 
134 | int jsonParse(char *str, char **endptr, JsonValue *value, JsonAllocator &allocator);
135 | 


--------------------------------------------------------------------------------
/test/TestCase.cpp:
--------------------------------------------------------------------------------
#include "TestCase.hpp"

#include <algorithm>  // std::max
#include <cmath>   // std::abs
#include <cstdio>  // snprintf
#include <iostream>
#include <sstream>  // std::ostringstream
#include <string>   // std::string

#include "../src/opencl/Context.hpp"
#include "../src/DataPipeline.hpp"
  9 | 
 10 | namespace test {
 11 | 
 12 | ///
 13 | /// utils functions
 14 | ///
 15 | float activation_function(float x) { return std::max(x, 0.0f); }
 16 | float activation_function_derivative(float x) { return x > 0.0f ? 1.0f : 0.0f; }
 17 | 
 18 | ///
 19 | /// TestException
 20 | ///
 21 | TestException::TestException() : runtime_error("TestException") {
 22 |   // cnvt.str("");
 23 |   cnvt << runtime_error::what() << ": Undefined error";
 24 | }
 25 | 
 26 | TestException::TestException(const char *msg) : runtime_error("TestException") {
 27 |   // cnvt.str("");
 28 |   cnvt << runtime_error::what() << ": " << msg;
 29 | }
 30 | 
 31 | TestException::TestException(const TestException &e)
 32 |     : runtime_error("TestException"), cnvt(e.cnvt.str()) {}
 33 | 
 34 | const char *TestException::what() const throw() { return cnvt.str().c_str(); }
 35 | 
 36 | ///
 37 | /// TestCase
 38 | ///
 39 | void TestCase::assert_equals(int expected, int result) {
 40 |   if (expected != result) {
 41 |     char msg_buffer[128];
 42 |     snprintf(msg_buffer, sizeof(msg_buffer),  //
 43 |              "[INT] Expected %d to be %d", result, expected);
 44 |     throw TestException(msg_buffer);
 45 |   }
 46 | }
 47 | 
 48 | void TestCase::assert_equals(float expected, float result) {
 49 |   // (yeah, this are going to be totally arbitrary numbers)
 50 |   expected = std::abs(expected);
 51 |   float margin = 0.005f;
 52 |   if (expected > 10) margin = 0.15f;
 53 |   if (expected > 100) margin = 1;
 54 |   if (expected > 1000) margin = expected / 10000;
 55 |   float err = expected - std::abs(result);
 56 | 
 57 |   if (err > margin) {
 58 |     char msg_buffer[128];
 59 |     snprintf(msg_buffer, sizeof(msg_buffer),  //
 60 |              "[FLOAT] Expected %f to be %f", result, expected);
 61 |     throw TestException(msg_buffer);
 62 |   }
 63 | }
 64 | 
 65 | void TestCase::assert_true(bool v, const char *msg) {
 66 |   if (!v) {
 67 |     throw TestException(msg);
 68 |   }
 69 | }
 70 | 
 71 | void TestCase::assert_equals(const std::vector<float> &expected,
 72 |                              const std::vector<float> &result, bool print) {
 73 |   if (expected.size() != result.size()) {
 74 |     char msg_buffer[128];
 75 |     snprintf(msg_buffer, sizeof(msg_buffer),  //
 76 |              "Expected vector has %d elements, while result %d. This vectors "
 77 |              "are not equal",
 78 |              expected.size(), result.size());
 79 |     throw TestException(msg_buffer);
 80 |   }
 81 | 
 82 |   for (size_t i = 0; i < expected.size(); i++) {
 83 |     float r = result[i];
 84 |     float e = expected[i];
 85 |     if (print)
 86 |       std::cout << "[" << i << "] expected >\t" << e << "\tgot> " << r
 87 |                 << std::endl;
 88 |     assert_equals(e, r);
 89 |   }
 90 | }
 91 | void TestCase::assert_equals(cnn_sr::DataPipeline *const pipeline,
 92 |                              const std::vector<float> &expected,
 93 |                              opencl::MemoryHandle handle, bool print) {
 94 |   auto context = pipeline->context();
 95 |   auto raw_gpu_mem = context->raw_memory(handle);
 96 |   size_t len = raw_gpu_mem->size / sizeof(cl_float);
 97 |   if (expected.size() != len) {
 98 |     char msg_buffer[128];
 99 |     snprintf(msg_buffer, sizeof(msg_buffer),  //
100 |              "Expected vector has %d elements, while gpu memory holds %d. This "
101 |              "vectors are not equal",
102 |              expected.size(), len);
103 |     throw TestException(msg_buffer);
104 |   }
105 | 
106 |   context->block();
107 |   std::vector<float> gpu_read(len);
108 |   context->read_buffer(handle, (void *)&gpu_read[0], true);
109 |   assert_equals(expected, gpu_read, print);
110 | }
111 | 
112 | void TestCase::assert_data_set_ok(size_t idx) {
113 |   char msg_buffer[128];
114 |   snprintf(msg_buffer, sizeof(msg_buffer),  //
115 |            "Incorrect data set index(%d), there are only %d data sets", idx,
116 |            this->data_set_count());
117 |   assert_true(idx < this->data_set_count(), msg_buffer);
118 | }
119 | }
120 | 


--------------------------------------------------------------------------------
/src/kernel/layer_uber_kernel.cl:
--------------------------------------------------------------------------------
 1 | /**
 2 |  *
 3 |  * Weights are 4D, indexing formula:
 4 |  *   index(w[a,b,n,k]) = a * F_SPATIAL_SIZE * CURRENT_FILTER_COUNT * PREVIOUS_FILTER_COUNT
 5 |  *                     + b * CURRENT_FILTER_COUNT * PREVIOUS_FILTER_COUNT
 6 |  *                     + k * CURRENT_FILTER_COUNT
 7 |  *                     + n
 8 |  *  where:
 9 |  *    a = 0..F_SPATIAL_SIZE
10 |  *    b = 0..F_SPATIAL_SIZE
11 |  *    n = 0..CURRENT_FILTER_COUNT
12 |  *    k = 0..PREVIOUS_FILTER_COUNT
13 |  *
14 |  * macros:
 *   CURRENT_FILTER_COUNT      filter count for current layer
16 |  *
17 |  * @param input                output of previous layer, size:
18 |  *                               * 1st layer: img_w * img_h
19 |  *                               * 2nd layer: (img_w-f1+1) * (img_h-f1+1) * n1
20 |  *                               * 3rd layer: (img_w-f1-f2+2) * (img_h-f1-f2+2) * n2
21 |  * @param target               zeroed output buffer, size:
22 |  *                               * 1st layer: (img_w-f1+1) * (img_h-f1+1) * n1
23 |  *                               * 2nd layer: (img_w-f1-f2+2) * (img_h-f1-f2+2) * n2
24 |  *                               * 3rd layer: (img_w-f1-f2-f3+3) * (img_h-f1-f2-f3+3)
25 |  * @param W                    weights, size:
26 |  *                                * 1st layer: f1*f1    per each filter (total: f1*f1*n1)
27 |  *                                * 2nd layer: f2*f2*n1 per each filter (total: f2*f2*n1*n2)
28 |  *                                * 3rd layer: f3*f3*n2
29 |  * @param B                    biases, size:
30 |  *                                * 1st layer: n1
31 |  *                                * 2nd layer: n2
32 |  *                                * 3rd layer: 1
33 |  * @param input_w              source width
34 |  * @param input_h              source height
35 |  */
36 | __kernel
37 | void forward(__read_only __global float* input,
38 |           __global float* target,
39 |           __read_only __global float* W,
40 |           __read_only __global float* B,
41 |           uint input_w, uint input_h){
42 | 
43 |   // value range: (0..out_w, 0..out_h)
44 |   const int2 pos = {get_global_id(0), get_global_id(1)};
45 |   uint sample_id = get_global_id(2);
46 | 
47 |   const int2 src_size = {input_w, input_h};
48 |   const int2 out_size = {src_size.x - F_SPATIAL_SIZE + 1,
49 |                          src_size.y - F_SPATIAL_SIZE + 1};
50 | 
51 | #define IMAGE_OFFSET_IN  sample_id* PREVIOUS_FILTER_COUNT* input_w* input_h
52 | #define IMAGE_OFFSET_OUT sample_id* CURRENT_FILTER_COUNT* out_size.x* out_size.y
53 | 
54 |   // index on which write to target,
55 |   // will write total of CURRENT_FILTER_COUNT values
56 |   const int out_idx = ((pos.y * out_size.x) + pos.x) * CURRENT_FILTER_COUNT;
57 | 
58 |   // zeroed result cache
59 |   float vals_by_filter[CURRENT_FILTER_COUNT];
60 |   for (size_t filter_id = 0; filter_id < CURRENT_FILTER_COUNT; filter_id++) {
61 |     vals_by_filter[filter_id] = 0.0f;
62 |   }
63 | 
64 |   // value range check
65 |   if(pos.x < 0 || pos.x >= out_size.x || //
66 |      pos.y < 0 || pos.y >= out_size.y)
67 |      return;
68 | 
69 |   // apply weights & write to vals_by_filter
70 |   for (size_t dy = 0; dy < F_SPATIAL_SIZE; dy++) {
71 |     for (size_t dx = 0; dx < F_SPATIAL_SIZE; dx++) {
72 |       int2 input_pos = {pos.x + dx, pos.y + dy};
73 |       int base_input_idx  = ((input_pos.y * input_w) + input_pos.x) * PREVIOUS_FILTER_COUNT;
74 |       size_t w_idx_2D = ((dy * F_SPATIAL_SIZE) + dx) * CURRENT_FILTER_COUNT * PREVIOUS_FILTER_COUNT;
75 | 
76 |       for (size_t k = 0; k < PREVIOUS_FILTER_COUNT; k++) {
77 |         float point_value = input[IMAGE_OFFSET_IN + base_input_idx + k];
78 |         size_t w_idx_3D = w_idx_2D + k * CURRENT_FILTER_COUNT;
79 | 
80 |         for (size_t n = 0; n < CURRENT_FILTER_COUNT; n++) {
81 |           vals_by_filter[n] += W[w_idx_3D + n] * point_value;
82 |         }
83 |       }
84 |     }
85 |   }
86 | 
87 |   // add bias and write cached results to target buffer
88 |   for (size_t filter_id = 0; filter_id < CURRENT_FILTER_COUNT; filter_id++) {
89 |     float result = vals_by_filter[filter_id] + B[filter_id];
90 | #ifdef SKIP_RELU
91 |     target[IMAGE_OFFSET_OUT + out_idx + filter_id] = result;
92 | #else
93 |     target[IMAGE_OFFSET_OUT + out_idx + filter_id] = max(result, 0.0f);
94 | #endif // SKIP_RELU
95 |   }
96 | }
97 | 


--------------------------------------------------------------------------------
/test/specs/UpdateParametersTest.cpp:
--------------------------------------------------------------------------------
  1 | #include "TestSpecsDeclarations.hpp"
  2 | 
  3 | #include <random>  // for std::mt19937
  4 | #include <chrono>  // for random seed
  5 | #
  6 | #include "../../src/DataPipeline.hpp"
  7 | #include "../../src/LayerData.hpp"
  8 | 
  9 | namespace test {
 10 | namespace specs {
 11 | 
 12 | ///
 13 | /// PIMPL
 14 | ///
 15 | struct UpdateParametersTestImpl {
 16 |   const size_t n_prev_filter_cnt = 2, current_filter_count = 400,
 17 |                f_spatial_size = 5, batch_size = 2;
 18 |   // const size_t n_prev_filter_cnt = 2, current_filter_count = 2,
 19 |   //  f_spatial_size = 3;
 20 |   const float momentum = 0.8f, learning_rate = 0.001;
 21 | 
 22 |   void create_data(std::mt19937 &generator, opencl::Context *context,
 23 |                    opencl::MemoryHandle &gpu_current_values,  //
 24 |                    opencl::MemoryHandle &gpu_grad,            //
 25 |                    opencl::MemoryHandle &gpu_previous_delta,  //
 26 |                    std::vector<float> &expected,
 27 |                    std::vector<float> &current_vals,
 28 |                    std::vector<float> &deltas) {
 29 |     size_t len = current_vals.size();
 30 |     std::vector<float> grad(len), previous_delta(len);
 31 |     for (size_t i = 0; i < len; i++) {
 32 |       current_vals[i] = (generator() % 2560) / 10.0f;
 33 |       grad[i] = (generator() % 2560) / 100.0f;
 34 |       previous_delta[i] = (generator() % 2560) / 10.0f;
 35 |       deltas[i] = momentum * previous_delta[i] + learning_rate * grad[i];
 36 |       expected[i] = current_vals[i] - (deltas[i] / batch_size);
 37 |     }
 38 | 
 39 |     // alloc
 40 |     /* clang-format off */
 41 |   gpu_current_values = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * len);
 42 |   gpu_grad           = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * len);
 43 |   gpu_previous_delta = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * len);
 44 |   context->write_buffer(gpu_current_values, (void *)&current_vals[0],   true);
 45 |   context->write_buffer(gpu_grad,           (void *)&grad[0],           true);
 46 |   context->write_buffer(gpu_previous_delta, (void *)&previous_delta[0], true);
 47 |     /* clang-format on */
 48 |   }
 49 | };
 50 | 
 51 | ///
 52 | /// UpdateParametersTest
 53 | ///
 54 | 
 55 | TEST_SPEC_PIMPL(UpdateParametersTest)
 56 | 
 57 | void UpdateParametersTest::init() {}
 58 | 
 59 | size_t UpdateParametersTest::data_set_count() { return 1; }
 60 | 
 61 | std::string UpdateParametersTest::name(size_t) {
 62 |   return "Update parameters test";
 63 | }
 64 | 
 65 | bool UpdateParametersTest::operator()(size_t,
 66 |                                       cnn_sr::DataPipeline *const pipeline) {
 67 |   using namespace cnn_sr;
 68 |   assert_not_null(pipeline);
 69 |   auto context = pipeline->context();
 70 | 
 71 |   // create test data
 72 |   LayerData layer_data(_impl->n_prev_filter_cnt, _impl->current_filter_count,
 73 |                        _impl->f_spatial_size);
 74 |   size_t ws = layer_data.weight_size(), bs = layer_data.bias_size();
 75 | 
 76 |   unsigned seed1 = std::chrono::system_clock::now().time_since_epoch().count();
 77 |   std::mt19937 generator(seed1);
 78 | 
 79 |   LayerAllocationPool gpu_alloc;
 80 |   std::vector<float> expected_w(ws), current_w(ws), new_deltas_w(ws);
 81 |   std::vector<float> expected_b(bs), current_b(bs), new_deltas_b(bs);
 82 |   _impl->create_data(generator, context,
 83 |                      gpu_alloc.weights,                 //
 84 |                      gpu_alloc.accumulating_grad_w,     //
 85 |                      gpu_alloc.previous_batch_delta_w,  //
 86 |                      expected_w, current_w, new_deltas_w);
 87 |   _impl->create_data(generator, context,
 88 |                      gpu_alloc.bias,                    //
 89 |                      gpu_alloc.accumulating_grad_b,     //
 90 |                      gpu_alloc.previous_batch_delta_b,  //
 91 |                      expected_b, current_b, new_deltas_b);
 92 | 
 93 |   layer_data.set_weights(&current_w[0]);
 94 |   layer_data.set_bias(&current_b[0]);
 95 | 
 96 |   pipeline->update_parameters(layer_data, gpu_alloc, _impl->batch_size,
 97 |                               _impl->momentum, 0.0f, _impl->learning_rate);
 98 | 
 99 |   assert_equals(pipeline, expected_w, gpu_alloc.weights);
100 |   assert_equals(pipeline, expected_b, gpu_alloc.bias);
101 |   assert_equals(pipeline, new_deltas_w, gpu_alloc.previous_batch_delta_w);
102 |   assert_equals(pipeline, new_deltas_b, gpu_alloc.previous_batch_delta_b);
103 | 
104 |   return true;
105 | }
106 | 
107 | //
108 | //
109 | }  // namespace specs
110 | }  // namespace test
111 | 


--------------------------------------------------------------------------------
/test/specs/ConfigTest.cpp:
--------------------------------------------------------------------------------
  1 | #include "TestSpecsDeclarations.hpp"
  2 | #include "../../src/Config.hpp"
  3 | 
  4 | namespace test {
  5 | namespace specs {
  6 | 
  7 | ///
  8 | /// Data set
  9 | ///
 10 | struct ConfigDataSet : DataSet {
 11 |   ConfigDataSet(std::string name, const char* cfg_file, bool expect_io_error,
 12 |                 bool expect_invalid_val)
 13 |       : DataSet(name),
 14 |         cfg_file(cfg_file),
 15 |         expect_io_error(expect_io_error),
 16 |         expect_invalid_val(expect_invalid_val) {}
 17 | 
 18 |   const char* cfg_file;
 19 |   bool expect_io_error, expect_invalid_val;
 20 | };
 21 | 
 22 | ///
 23 | /// PIMPL
 24 | ///
/// Data of the config test: the cases to run and the reference config that
/// the read values are compared with.
struct ConfigTestImpl {
  // Cases: (name, config file, expect IO error, expect value mismatch).
  // The last file name does not appear in the test/data listing, which fits
  // the "file nonexistent" case.
  /* clang-format off */
  ConfigDataSet data_sets[4] = {
      ConfigDataSet("ok", "test/data/config.json", false,false),
      ConfigDataSet("invalid value", "test/data/config_invalid_val.json", false,true),
      ConfigDataSet("invalid file", "test/data/config_non_parseable.json", true,false),
      ConfigDataSet("file nonexistent", "test/data/NOPE.json", true,false)};
  /* clang-format on */

  // Reference parameter distributions, one per layer.
  // NOTE(review): which of the four numbers is mean / sd of weights / biases
  // depends on the member order of ParametersDistribution - confirm in
  // Config.hpp
  cnn_sr::ParametersDistribution pd1 = {0.9, 0.9, 0.9, 0.9};
  cnn_sr::ParametersDistribution pd2 = {2.001, 2.001, 2.001, 2.001};
  cnn_sr::ParametersDistribution pd3 = {0.001, 0.001, 0.001, 0.001};
  float learning_rates[3] = {12, 34, 56};
  // Reference config, presumably the content of test/data/config.json.
  // NOTE(review): the initializer is positional, the meaning of each value
  // depends on the declaration of cnn_sr::Config - confirm in Config.hpp
  /* clang-format off */
  cnn_sr::Config correct_result={32, 16,
                                 9, 1, 5,
                                 123.5f,0.1f, learning_rates,
                                 pd1, pd2, pd3,
                                 "cnn-parameters-a.json"};
  /* clang-format on */
};
 46 | 
 47 | ///
 48 | /// ConfigTest
 49 | ///
 50 | 
 51 | TEST_SPEC_PIMPL(ConfigTest)
 52 | 
 53 | void ConfigTest::init() {}
 54 | 
 55 | size_t ConfigTest::data_set_count() { return 4; }
 56 | 
 57 | std::string ConfigTest::name(size_t data_set_id) {
 58 |   assert_data_set_ok(data_set_id);
 59 |   return "Config test - " + _impl->data_sets[data_set_id].name;
 60 | }
 61 | 
 62 | bool params_cmp(cnn_sr::ParametersDistribution a,
 63 |                 cnn_sr::ParametersDistribution b) {
 64 |   return a.mean_w == b.mean_w && a.sd_w == b.sd_w &&  //
 65 |          a.mean_b == b.mean_b && a.sd_b == b.sd_b;
 66 | }
 67 | 
 68 | bool ConfigTest::operator()(size_t data_set_id,
 69 |                             cnn_sr::DataPipeline* const pipeline) {
 70 |   using namespace cnn_sr;
 71 |   assert_not_null(pipeline);
 72 |   assert_data_set_ok(data_set_id);
 73 |   auto data = _impl->data_sets[data_set_id];
 74 |   Config& c2 = _impl->correct_result;
 75 | 
 76 |   bool io_err = false, invalid_val = false;
 77 |   ConfigReader reader;
 78 | 
 79 |   try {
 80 |     Config c1 = reader.read(data.cfg_file);
 81 |     assert_true(c1.n1 == c2.n1 && c1.n2 == c2.n2,
 82 |                 "filter count does not match");
 83 |     assert_true(c1.f1 == c2.f1 && c1.f2 == c2.f2 && c1.f3 == c2.f3,
 84 |                 "filter spatial size does not match");
 85 |     assert_true(c1.momentum == c2.momentum, "momentum does not match");
 86 |     assert_true(c1.weight_decay_parameter == c2.weight_decay_parameter,
 87 |                 "weight decay parameter does not match");
 88 |     assert_true(c1.learning_rate[0] == c2.learning_rate[0]         //
 89 |                     && c1.learning_rate[1] == c2.learning_rate[1]  //
 90 |                     && c1.learning_rate[2] == c2.learning_rate[2],
 91 |                 "learning rate does not match");
 92 |     // std::cout << c1.parameters_file << "'" << std::endl;
 93 |     // std::cout << c2.parameters_file << "'" << std::endl;
 94 |     assert_true(c1.parameters_file.compare(c2.parameters_file) == 0,
 95 |                 "parameters_file does not match");
 96 |     assert_true(params_cmp(c1.params_distr_1, c2.params_distr_1),
 97 |                 "parameters distribution 1 does not match");
 98 |     assert_true(params_cmp(c1.params_distr_2, c2.params_distr_2),
 99 |                 "parameters distribution 2 does not match");
100 |     assert_true(params_cmp(c1.params_distr_3, c2.params_distr_3),
101 |                 "parameters distribution 3 does not match");
102 |   } catch (TestException& e) {
103 |     std::cout << e.what() << std::endl;
104 |     invalid_val = true;
105 |   } catch (IOException& e) {
106 |     std::cout << e.what() << std::endl;
107 |     io_err = true;
108 |   } /* catch (...) {
109 |      assert_true(false, "Unknown error");
110 |    }*/
111 | 
112 |   assert_true(io_err == data.expect_io_error, "Expected IO error");
113 |   assert_true(invalid_val == data.expect_invalid_val,
114 |               "Expected values mismatch");
115 |   return true;
116 | }
117 | 
118 | //
119 | //
120 | }  // namespace specs
121 | }  // namespace test
122 | 


--------------------------------------------------------------------------------
/src/kernel/backpropagate.cl:
--------------------------------------------------------------------------------
  1 | /* clang-format off */
  2 | /**
  3 |  * @see http://suhorukov.blogspot.com/2011/12/opencl-11-atomic-operations-on-floating.html
  4 |  * @see  http://simpleopencl.blogspot.com/2013/05/atomic-operations-and-floats-in-opencl.html
  5 |  *
  6 |  * @param float*   source  [description]
  7 |  * @param float    operand [description]
  8 |  */
  9 | inline void atomic_add_global(volatile __global float* source, const float operand) {
 10 |   /* clang-format on */
 11 |   union {
 12 |     unsigned int intVal;
 13 |     float floatVal;
 14 |   } newVal;
 15 | 
 16 |   union {
 17 |     unsigned int intVal;
 18 |     float floatVal;
 19 |   } prevVal;
 20 | 
 21 |   // NOTE: atomic_cmpxchg(volatile __global unsigned int *p,
 22 |   // 	                    unsigned int cmp, unsigned int val)
 23 |   do {
 24 |     prevVal.floatVal = *source;
 25 |     newVal.floatVal = prevVal.floatVal + operand;
 26 |   } while (atomic_cmpxchg((volatile __global unsigned int*)source,
 27 |                           prevVal.intVal,  //
 28 |                           newVal.intVal) != prevVal.intVal);
 29 | }
 30 | 
 31 | /* clang-format off */
 32 | /**
 33 |  *
 34 |  * Calculate grad_w and grad_b. Very expensive due too barriers & locks.
 35 |  *
 36 |  *
 37 |  * In following notation (l), (l-1) describes relative layer and [...] lower indices.
 38 |  *
 39 |  * Algo for dJ/dw[abnk] on layer (l), where:
 40 |  *   a = 0..spatial_size(l)
 41 |  *   b = 0..spatial_size(l)
 42 |  *   n = 0..filter_count(l)
 43 |  *   k = 0..filter_count(l+1)
 44 |  *
 45 |  * dJ/dw[abnk] = 0
 46 |  * for i = 0..output_w(l):               # for each node where this weight is used
 47 |  * for j = 0..output_h(l):
 48 |  *   for n = 0..filter_count(l):
 49 |  *     for a = 0..spatial_size(l):       # offset to weight
 50 |  *     for b = 0..spatial_size(l):       # (it's kernel size, what You expect ?)
 51 |  *       for k = 0..filter_count(l-1):   # for all inputs
 52 |  *         dJ/dw[abnk] += deltas[i,j,n]  # (1) error for this point
 53 |  *           * layer_input[i+b,j+a,k]    # (2) input at this point
 54 |  */
 55 | /* clang-format on */
 56 | __kernel void backpropagate(__read_only __global float* deltas,       //
 57 |                             __read_only __global float* layer_input,  //
 58 |                             __global float* target_grad_w,            //
 59 |                             __global float* target_grad_b,            //
 60 |                             uint n_current_filter_cnt,                //
 61 |                             uint n_prev_filter_cnt,                   //
 62 |                             uint f_spatial_size,                      //
 63 |                             uint layer_out_w, uint layer_out_h) {
 64 |   const int id = get_global_id(0);
 65 |   const uint sample_id = get_global_id(1);
 66 |   const uint input_w = layer_out_w + f_spatial_size - 1;
 67 |   const uint input_h = layer_out_h + f_spatial_size - 1;
 68 |   // weight dimensions
 69 |   const size_t d2 = n_prev_filter_cnt * n_current_filter_cnt,
 70 |                d3 = d2 * f_spatial_size;
 71 |   const size_t weights_size = d3 * f_spatial_size;
 72 | 
 73 | #define IMAGE_OFFSET_CURR \
 74 |   sample_id* n_current_filter_cnt* layer_out_w* layer_out_h
 75 | #define IMAGE_OFFSET_PREV sample_id* n_prev_filter_cnt* input_w* input_h
 76 | 
 77 |   // reverse id to get weight parameters: a(as dx), b(as dy), n, k
 78 |   int w_tmp = id;
 79 |   const int dy = w_tmp / d3;
 80 |   w_tmp -= dy * d3;
 81 |   const int dx = w_tmp / d2;
 82 |   w_tmp -= dx * d2;
 83 |   const int k = w_tmp / n_current_filter_cnt;
 84 |   const int n =
 85 |       w_tmp - k * n_current_filter_cnt;  // = id % n_current_filter_cnt
 86 | 
 87 |   if (id < weights_size) {
 88 |     float grad_w = 0.0f, grad_b = 0.0f;
 89 |     for (size_t row = 0; row < layer_out_h; row++) {
 90 |       for (size_t col = 0; col < layer_out_w; col++) {
 91 |         // (1) delta[i,j,n](l)
 92 |         int idx = ((row * layer_out_w) + col) * n_current_filter_cnt;
 93 |         float delta = deltas[IMAGE_OFFSET_CURR + idx + n];
 94 |         grad_b += delta;
 95 | 
 96 |         // (2) layer_input[i+b,j+a,k]
 97 |         // NOTE: we normally should be subtracting [dx,dy], but it does
 98 |         // depend on indexing
 99 |         int2 prev_layer_pos = {col + dx, row + dy};
100 |         int prev_layer_idx = ((prev_layer_pos.y * input_w) + prev_layer_pos.x) *
101 |                              n_prev_filter_cnt;
102 | 
103 |         float input = layer_input[IMAGE_OFFSET_PREV + prev_layer_idx + k];
104 |         grad_w += input * delta;
105 |       }
106 |     }
107 | 
108 |     // write
109 |     // NOTE: atomic_add_global is custom function, see beginning of the file
110 |     target_grad_w[id] += grad_w;
111 |     if (k == 0 && dx == 0 && dy == 0)
112 |       atomic_add_global(target_grad_b + n, grad_b);
113 |   }
114 | }
115 | 


--------------------------------------------------------------------------------
/weights_visualize.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import argparse
  3 | from os.path import join
  4 | from pprint import pprint
  5 | from PIL import Image, ImageDraw, ImageColor
  6 | 
  7 | # cfg_file = 'config_f.json'
  8 | # scale = None
  9 | per_weight_cell_padding = 2
 10 | 
 11 | def layer_data(cfg, layer_id):
 12 |   'returns (f,k,n)'
 13 |   read = lambda prop: int(cfg[prop])
 14 |   if layer_id == 1:
 15 |     return read('f1'),1,read('n1')
 16 |   elif layer_id == 2:
 17 |     return read('f2'),read('n1'),read('n2')
 18 |   elif layer_id == 3:
 19 |     return read('f3'),read('n2'),1
 20 |   else:
 21 |     raise Exception("Only 1,2,3 are valid layers")
 22 | 
 23 | def idx(layer, dy,dx,n,k):
 24 |   f, layer_k, layer_n = layer
 25 |   # print('layer: ',layer, ' a,b:',dy,dx, '   n,k: ',n,k)
 26 |   return dy * layer_n * layer_k * f + \
 27 |          dx * layer_n * layer_k + \
 28 |          k  * layer_n + \
 29 |          n
 30 | 
 31 | def filter_weights(weights, layer, curr_n,curr_k):
 32 |   f = layer[0]
 33 |   filter_wx = [0]*(f*f)
 34 |   for dy in range(f):
 35 |     for dx in range(f):
 36 |       w_idx = idx(layer, dy,dx,curr_n,curr_k)
 37 |       # print(dy*f+dx,'/',len(filter_wx))
 38 |       # print(w_idx,'/',len(weights))
 39 |       filter_wx[dy*f+dx] = weights[w_idx]
 40 |   min_w, max_w = min(filter_wx), max(filter_wx)
 41 |   # norm_w = max_w + min_w
 42 |   # print('min_w: {}, max_w: {}'.format(min_w, max_w))
 43 | 
 44 |   a,b=-999,999
 45 |   for dy in range(f):
 46 |     for dx in range(f):
 47 |       w = filter_wx[dy*f+dx]
 48 |       # w = (w-min_w) / (max_w + min_w)
 49 |       w = (w-min_w) / (max_w - min_w) if max_w != min_w else 0.5
 50 |       # w = min(1,max(0,w))
 51 |       yield dy,dx,w
 52 |       a=max(a,w)
 53 |       b=min(b,w)
 54 |   # print('{:8}\t: {:8} \t-> {:8} : {}'.format(min_w, max_w, b, a))
 55 | 
 56 | def visualize(cfg, params, scale, layer_id, out_path):
 57 |   print('--- layer ', layer_id, ' ---')
 58 |   weights = params['layer' + str(layer_id)]['weights']
 59 |   min_w, max_w = min(weights), max(weights)
 60 |   print('min_w: {}, max_w: {}'.format(min_w, max_w))
 61 |   print('overfit: {}'.format(sum([x*x for x in weights])))
 62 | 
 63 |   f, l_k, l_n = layer = layer_data(cfg, layer_id)
 64 |   cell_size = f * scale + 2 * per_weight_cell_padding
 65 |   print(layer)
 66 |   if f == 1:
 67 |     print('f==1, drawing weights would not show anything')
 68 |     return
 69 | 
 70 |   rows = int((l_n*l_k)**0.5)
 71 |   cells_in_row = int((l_n*l_k+rows-1) / rows)
 72 |   print('columns: ', cells_in_row, 'rows: ', rows)
 73 |   # size = cell_size * l_n, cell_size * l_k
 74 |   size = cell_size * cells_in_row, cell_size * rows
 75 | 
 76 |   img = Image.new('RGB', size, color='#000000')
 77 |   filter_img = Image.new('RGB', (f*scale,f*scale))
 78 |   filter_draw = ImageDraw.Draw(filter_img)
 79 | 
 80 |   for n in range(l_n):
 81 |     for k in range(l_k):
 82 |       idx = n * l_k + k
 83 |       row, col = idx // cells_in_row, idx % cells_in_row
 84 |       # print(idx, '\t-> ',row,', ',col)
 85 |       pos = int(cell_size * col + per_weight_cell_padding), \
 86 |             int(cell_size * row + per_weight_cell_padding)
 87 |       for (dy,dx,val) in filter_weights(weights, layer, n,k):
 88 |         v = int(val*255)
 89 |         col = "rgb({0},{0},{0})".format(v)
 90 |         pos_ab = dx*scale, dy*scale
 91 |         pos_ab_ = pos_ab[0] + scale - 1, \
 92 |                   pos_ab[1] + scale - 1
 93 |         filter_draw.rectangle((pos_ab, pos_ab_), fill=col)
 94 |       img.paste(filter_img, pos)
 95 | 
 96 |   img.save(out_path, "PNG")
 97 | 
 98 | if __name__ == '__main__':
 99 |   help_text = 'Draw weights. Usage: ' + \
100 |               '"weights_visualize.py -o data -s 10 data\config_f.json"'
101 | 
102 |   parser = argparse.ArgumentParser(description=help_text)
103 |   parser.add_argument('config', help='config file to analize' )
104 |   parser.add_argument('--parameters-file', '-p', required=False, help='parameters file holding all weights and biases')
105 |   parser.add_argument('--out-dir', '-o', required=False, default='', help='where to store result images')
106 |   parser.add_argument('--scale', '-s', required=False, default=10, type=int, help='scale factor - cause sometimes 10x10 image is too small')
107 |   args = parser.parse_args()
108 | 
109 |   with open(args.config) as data_file:
110 |     cfg = json.load(data_file)
111 |   # pprint(cfg)
112 | 
113 |   if args.parameters_file:
114 |     par_file = args.parameters_file
115 |   elif 'parameters_file' in cfg:
116 |     par_file = cfg['parameters_file']
117 |   else:
118 |     raise Exception('Either write parameter file path to config or provide as parametr')
119 |   print('Parameter file: \'',par_file,'\'')
120 |   with open(par_file) as data_file:
121 |     params = json.load(data_file)
122 |   # pprint(params)
123 | 
124 |   visualize(cfg, params, args.scale, 1, join(args.out_dir, 'weights1.png'))
125 |   visualize(cfg, params, args.scale, 2, join(args.out_dir, 'weights2.png'))
126 |   visualize(cfg, params, args.scale, 3, join(args.out_dir, 'weights3.png'))
127 | 


--------------------------------------------------------------------------------
/libs/include/CL/cl_d3d10_ext.h:
--------------------------------------------------------------------------------
  1 | /**********************************************************************************
  2 |  * Copyright (c) 2008-2009 The Khronos Group Inc.
  3 |  *
  4 |  * Permission is hereby granted, free of charge, to any person obtaining a
  5 |  * copy of this software and/or associated documentation files (the
  6 |  * "Materials"), to deal in the Materials without restriction, including
  7 |  * without limitation the rights to use, copy, modify, merge, publish,
  8 |  * distribute, sublicense, and/or sell copies of the Materials, and to
  9 |  * permit persons to whom the Materials are furnished to do so, subject to
 10 |  * the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included
 13 |  * in all copies or substantial portions of the Materials.
 14 |  *
 15 |  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 16 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 17 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 18 |  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 19 |  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 20 |  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 21 |  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 22 |  **********************************************************************************/
 23 | 
// Declarations for the NVIDIA Direct3D 10 sharing extension
// (cl_nv_d3d10_sharing): error codes, enumerant values and function-pointer
// types. Only types and constants are declared here; no functions are
// defined or prototyped.
#ifndef __OPENCL_CL_D3D10_EXT_H
#define __OPENCL_CL_D3D10_EXT_H

#include <d3d10.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>

#ifdef __cplusplus
extern "C" {
#endif

/******************************************************************************
 * cl_nv_d3d10_sharing                                                        */

// Both enumerations are plain cl_uint values; the valid constants are the
// CL_D3D10_*_NV / CL_*_DEVICES_FOR_D3D10_NV macros defined below.
typedef cl_uint cl_d3d10_device_source_nv;
typedef cl_uint cl_d3d10_device_set_nv;

/******************************************************************************/

// Error Codes
#define CL_INVALID_D3D10_DEVICE_NV             -1002
#define CL_INVALID_D3D10_RESOURCE_NV           -1003
#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_NV  -1004
#define CL_D3D10_RESOURCE_NOT_ACQUIRED_NV      -1005

// cl_d3d10_device_source_nv
#define CL_D3D10_DEVICE_NV                     0x4010
#define CL_D3D10_DXGI_ADAPTER_NV               0x4011

// cl_d3d10_device_set_nv
#define CL_PREFERRED_DEVICES_FOR_D3D10_NV      0x4012
#define CL_ALL_DEVICES_FOR_D3D10_NV            0x4013

// cl_context_info
#define CL_CONTEXT_D3D10_DEVICE_NV             0x4014

// cl_mem_info
#define CL_MEM_D3D10_RESOURCE_NV               0x4015

// cl_image_info
#define CL_IMAGE_D3D10_SUBRESOURCE_NV          0x4016

// cl_command_type
#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_NV    0x4017
#define CL_COMMAND_RELEASE_D3D10_OBJECTS_NV    0x4018

/******************************************************************************/

// Function-pointer types, one per extension entry point. Each typedef name
// is the entry point's name with an `_fn` suffix.
// NOTE(review): presumably the pointers are obtained at run time through the
// OpenCL extension-function lookup - confirm against the extension spec.

// Pointer type for clGetDeviceIDsFromD3D10NV.
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10NV_fn)(
    cl_platform_id            platform,
    cl_d3d10_device_source_nv d3d_device_source,
    void *                    d3d_object,
    cl_d3d10_device_set_nv    d3d_device_set,
    cl_uint                   num_entries, 
    cl_device_id *            devices, 
    cl_uint *                 num_devices) CL_API_SUFFIX__VERSION_1_0;

// Pointer type for clCreateFromD3D10BufferNV.
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferNV_fn)(
    cl_context     context,
    cl_mem_flags   flags,
    ID3D10Buffer * resource,
    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;

// Pointer type for clCreateFromD3D10Texture2DNV.
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DNV_fn)(
    cl_context        context,
    cl_mem_flags      flags,
    ID3D10Texture2D * resource,
    UINT              subresource,
    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;

// Pointer type for clCreateFromD3D10Texture3DNV.
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DNV_fn)(
    cl_context        context,
    cl_mem_flags      flags,
    ID3D10Texture3D * resource,
    UINT              subresource,
    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;

// Pointer type for clEnqueueAcquireD3D10ObjectsNV.
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsNV_fn)(
    cl_command_queue command_queue,
    cl_uint          num_objects,
    const cl_mem *   mem_objects,
    cl_uint          num_events_in_wait_list,
    const cl_event * event_wait_list,
    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;

// Pointer type for clEnqueueReleaseD3D10ObjectsNV.
// NOTE(review): mem_objects is non-const here, unlike in the acquire variant.
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsNV_fn)(
    cl_command_queue command_queue,
    cl_uint          num_objects,
    cl_mem *         mem_objects,
    cl_uint          num_events_in_wait_list,
    const cl_event * event_wait_list,
    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;

#ifdef __cplusplus
}
#endif

#endif  // __OPENCL_CL_D3D10_EXT_H
122 | 
123 | 


--------------------------------------------------------------------------------
/libs/include/CL/cl_d3d11_ext.h:
--------------------------------------------------------------------------------
  1 | /**********************************************************************************
  2 |  * Copyright (c) 2008-2009 The Khronos Group Inc.
  3 |  *
  4 |  * Permission is hereby granted, free of charge, to any person obtaining a
  5 |  * copy of this software and/or associated documentation files (the
  6 |  * "Materials"), to deal in the Materials without restriction, including
  7 |  * without limitation the rights to use, copy, modify, merge, publish,
  8 |  * distribute, sublicense, and/or sell copies of the Materials, and to
  9 |  * permit persons to whom the Materials are furnished to do so, subject to
 10 |  * the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included
 13 |  * in all copies or substantial portions of the Materials.
 14 |  *
 15 |  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 16 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 17 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 18 |  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 19 |  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 20 |  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 21 |  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 22 |  **********************************************************************************/
 23 | 
// Declarations for the NVIDIA Direct3D 11 sharing extension
// (cl_nv_d3d11_sharing): error codes, enumerant values and function-pointer
// types. Only types and constants are declared here; no functions are
// defined or prototyped.
#ifndef __OPENCL_CL_D3D11_EXT_H
#define __OPENCL_CL_D3D11_EXT_H

#include <d3d11.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>

#ifdef __cplusplus
extern "C" {
#endif

/******************************************************************************
 * cl_nv_d3d11_sharing                                                        */

// Both enumerations are plain cl_uint values; the valid constants are the
// CL_D3D11_*_NV / CL_*_DEVICES_FOR_D3D11_NV macros defined below.
typedef cl_uint cl_d3d11_device_source_nv;
typedef cl_uint cl_d3d11_device_set_nv;

/******************************************************************************/

// Error Codes
#define CL_INVALID_D3D11_DEVICE_NV             -1006
#define CL_INVALID_D3D11_RESOURCE_NV           -1007
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_NV  -1008
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_NV      -1009

// cl_d3d11_device_source_nv
#define CL_D3D11_DEVICE_NV                     0x4019
#define CL_D3D11_DXGI_ADAPTER_NV               0x401A

// cl_d3d11_device_set_nv
#define CL_PREFERRED_DEVICES_FOR_D3D11_NV      0x401B
#define CL_ALL_DEVICES_FOR_D3D11_NV            0x401C

// cl_context_info
#define CL_CONTEXT_D3D11_DEVICE_NV             0x401D

// cl_mem_info
#define CL_MEM_D3D11_RESOURCE_NV               0x401E

// cl_image_info
#define CL_IMAGE_D3D11_SUBRESOURCE_NV          0x401F

// cl_command_type
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_NV    0x4020
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_NV    0x4021

/******************************************************************************/

// Function-pointer types, one per extension entry point. Each typedef name
// is the entry point's name with an `_fn` suffix.
// NOTE(review): presumably the pointers are obtained at run time through the
// OpenCL extension-function lookup - confirm against the extension spec.

// Pointer type for clGetDeviceIDsFromD3D11NV.
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11NV_fn)(
    cl_platform_id            platform,
    cl_d3d11_device_source_nv d3d_device_source,
    void *                    d3d_object,
    cl_d3d11_device_set_nv    d3d_device_set,
    cl_uint                   num_entries, 
    cl_device_id *            devices, 
    cl_uint *                 num_devices) CL_API_SUFFIX__VERSION_1_0;

// Pointer type for clCreateFromD3D11BufferNV.
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferNV_fn)(
    cl_context     context,
    cl_mem_flags   flags,
    ID3D11Buffer * resource,
    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;

// Pointer type for clCreateFromD3D11Texture2DNV.
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DNV_fn)(
    cl_context        context,
    cl_mem_flags      flags,
    ID3D11Texture2D * resource,
    UINT              subresource,
    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;

// Pointer type for clCreateFromD3D11Texture3DNV.
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DNV_fn)(
    cl_context        context,
    cl_mem_flags      flags,
    ID3D11Texture3D * resource,
    UINT              subresource,
    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;

// Pointer type for clEnqueueAcquireD3D11ObjectsNV.
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsNV_fn)(
    cl_command_queue command_queue,
    cl_uint          num_objects,
    const cl_mem *   mem_objects,
    cl_uint          num_events_in_wait_list,
    const cl_event * event_wait_list,
    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;

// Pointer type for clEnqueueReleaseD3D11ObjectsNV.
// NOTE(review): mem_objects is non-const here, unlike in the acquire variant.
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsNV_fn)(
    cl_command_queue command_queue,
    cl_uint          num_objects,
    cl_mem *         mem_objects,
    cl_uint          num_events_in_wait_list,
    const cl_event * event_wait_list,
    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;

#ifdef __cplusplus
}
#endif

#endif  // __OPENCL_CL_D3D11_EXT_H
122 | 
123 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## Image super-resolution using deep convolutional neural networks 
  2 | 
  3 | ## Overview
  4 | 
  5 | The super-resolution problem is to upscale an image so that the perceived loss of quality is minimal. For example, after scaling with bicubic interpolation it is apparent that some pixels are just smudged together. The question is: *can AI do a better job?*
  6 | 
  7 | 
  8 | ## Preliminary results
  9 | 
 10 | ![Preliminary result](images/compare.jpg)
 11 | *left: upscaling with bicubic interpolation, right: result of the presented algorithm*
 12 | 
 13 | 
 14 | ![Details](images/details.jpg)
 15 | *Details closeup - left: upscaling with bicubic interpolation, right: result of the presented algorithm*
 16 | 
 17 | 
 18 | As you can see, the presented method achieves a significant improvement around face boundaries, fingers and loose hair strands. On the other hand, the finer details on the dress that require a smooth gradient are lost. There are also numerous artifacts.
 19 | 
 20 | I believe that longer training time could fix some of the mentioned problems.
 21 | 
 22 | 
 23 | ## Usage
 24 | 
 25 | 
 26 | #### General guidelines
 27 | 
 28 | * **the application input is an already scaled image that is then run through various filters**
 29 | * only the luma channel is upscaled (so the image presented above is quite close to an edge case)
 30 | * the best results are achieved if the image has a lot of edges
 31 | * the presented method does not handle textures particularly well
 32 | * you may need to optimize the kernels for your configuration to speed up the learning process
 33 | 
 34 | 
 35 | #### Installation
 36 | 
 37 | You will need OpenCL-capable hardware; a modern GPU is recommended. All required libraries are included in this repository ([libs](/libs)). It is recommended to link with the OpenCL library that can be found in your PATH environment variable. You should also change the paths in the [makefile](makefile) - they are the result of my non-standard MinGW configuration.
 38 | 
 39 | This app was developed using clang & g++; some changes may be needed to make it work in Visual Studio (like list_files() in [pch.cpp](src/pch.cpp)).
 40 | 
 41 | #### Command line
 42 | 
 43 | * **make build** - compile and create executable
 44 | * **make run -- CMD_ARGUMENTS_HERE** - run app with provided arguments (note double dash)
 45 | * **make test** - run all tests
 46 | 
 47 | #### Arguments:
 48 | 
 49 | `cnn [-h] [train] [dry] [profile] --config --in [--out] [--epochs]`
 50 | 
 51 | * **help** - print help
 52 | * **train** - train mode
 53 | * **dry** - do not store result
 54 | * **profile** - print kernel execution times
 55 | * **--config CONFIG** - configuration file
 56 | * **--in IN** - either image we want to upscale or samples directory during training
 57 | * **--out OUT** - output file path (either result image or new set of parameters)
 58 | * **--epochs EPOCHS** - number of epochs during training
 59 | 
 60 | #### Examples
 61 | 
 62 | Start the app: `bin\cnn.exe -c data\config.json -i data\my_image.jpg -o data\result.jpg`
 63 |   
 64 | Learning (500 epochs): `bin\cnn.exe train -c data\config.json --epochs 500 -i data\train_samples -o data\parameters.json`
 65 |   
 66 | Start the learning (100 epochs), do not save results: `bin\cnn.exe train -c data\config.json --epochs 100 -i data\train_samples dry`
 67 | 
 68 | 
 69 | #### Useful scripts:
 70 | 
 71 | * **[generate_training_samples.py](generate_training_samples.py)** - generate ready to use training samples based on images from provided directory 
 72 | * **[weights_visualize.py](weights_visualize.py)** - present weights as images. Layer 1 is particularly informative
 73 | * **[profile.py](profile.py)** - measure total execution time or time spent per OpenCL kernel
 74 | * **[schedule_training.py](schedule_training.py)** - executes the C++ application; specify either the number of epochs or how long the training should continue
 75 | 
 76 | 
 77 | #### Config file ([example](example_config.json))
 78 | 
 79 | Config file is just a simple JSON with following keys:
 80 | * *n1* - number of filters in the first layer
 81 | * *n2* - number of filters in the second layer
 82 | * *f1* - kernel spatial size in first layer
 83 | * *f2* - kernel spatial size in second layer
 84 | * *f3* - kernel spatial size in third layer
 85 | * *momentum* - momentum used during learning
 86 | * *weight_decay_parameter* - used to prevent overfitting
 87 | * *learning_rates* - learning rates used during training
 88 | * *parameters_file* - file that holds all parameters: weights and biases for layers (optional)
 89 | 
 90 | If You do not provide *parameters_file* the parameters will be initialized with random numbers from normal distribution (see example for details how this process can be customized).
 91 | 
 92 | #### Parameters file
 93 | 
 94 | Parameters are described with following simple structure:
 95 | ```js
 96 | {
 97 |   "epochs": 0,
 98 |   "layer1": {
 99 |     "weights": [..],
100 |     "bias": [..]
101 |   },
102 |   "layer2": {
103 |     "weights": [..],
104 |     "bias": [..]
105 |   },
106 |   "layer3": {
107 |     "weights": [..],
108 |     "bias": [..]
109 |   }
110 | }
111 | ```
112 | 
113 | Value for key *epochs* is optional and indicates how many epochs were finished during training process.
114 | 
115 | 
116 | ## References
117 | 
118 | SRCNN [1] was used as the main reference, but the idea was taken from waifu2x[2].
119 | 
120 | * [1] Chao Dong, Chen Change Loy, Kaiming He, Xiaoou Tang, "Image Super-Resolution Using Deep Convolutional Networks", http://arxiv.org/abs/1501.00092
121 | 
122 | * [2] waifu2x, https://github.com/nagadomi/waifu2x
123 |  
124 | If you are interested, I've also written 2 articles on the topic at hand:
125 | 
126 | * [Neural networks: implementation tips](https://scthe.github.io/2015/08/23/neural-networks-implementation-tips.html)
127 | 
128 | * [Backpropagation notes](https://scthe.github.io/2015/08/30/backpropagation-notes.html)
129 | 
130 | 


--------------------------------------------------------------------------------
/test/specs/LayerTest_script.R:
--------------------------------------------------------------------------------
  1 | # run the script with:
  2 | # Rscript test\specs\LayerTest_script.R   (from the repository root, since test/data/test_cases.json is read via a relative path)
  3 | 
# Load rjson; when it is not available yet, install it from the given CRAN
# mirror first and then attach it.
if (!require("rjson")) {
  install.packages("rjson", repos="http://cran.rstudio.com/")
  library("rjson")
}


# Parse the test-case file. The path is relative to the current working
# directory. Every top-level entry is later handed to test_layer().
json_data <- fromJSON(file="test/data/test_cases.json")
 11 | 
 12 | 
 13 | activation_function <- function(x){
 14 |     #1 / (1 + exp(-x))
 15 |     max(x,0)
 16 | }
 17 | 
 18 | split_by_columns <- function(arr, column_count, as_lists=FALSE){
 19 |     if((length(arr) %% column_count) != 0){
 20 |         stop(sprintf("Error: Tried to divide array of size(%d) into %d columns", length(arr), column_count))
 21 |     }
 22 | 
 23 |     result <- c()
 24 |     for (i in 1:column_count) {
 25 |         a <- (i) %% column_count
 26 |         sub_arr <- split(arr, 1:length(arr) %% column_count == a)$`TRUE`
 27 |         result <- c(result, sub_arr)
 28 |     }
 29 | 
 30 |     if(length(arr) != length(result)) {
 31 |         stop(sprintf("Error: Length of provided array(%d) != length of result(%d)", length(arr), length(result)))
 32 |      }
 33 | 
 34 |     if(as_lists){
 35 |         result <- matrix(result, ncol=column_count)
 36 |     }
 37 | 
 38 |     result
 39 | }
 40 | 
 41 | 
# Recompute the output of one convolutional layer for a single test case and
# print how it differs from the expected output stored with the test case.
#
# data              one entry of the test-case JSON; the fields read here are
#                   n_prev_filter_cnt, current_filter_count, f_spatial_size,
#                   input_w, input_h, input, output, weights and bias
# preproces_mean    when TRUE, the mean of the raw input is subtracted from
#                   every input value before the layer is evaluated
# result_multiplier when non-zero, each raw result is multiplied by this
#                   value INSTEAD of being passed through activation_function
# decimal_places    rounding used for the printed values only
#
# Returns the unrounded results as a flat vector (all positions of filter 1,
# then all positions of filter 2, ...).
test_layer <- function(data, preproces_mean=FALSE, result_multiplier=0, decimal_places=3){
    n_prev_filter_cnt <- data$n_prev_filter_cnt
    current_filter_count <- data$current_filter_count
    f_spatial_size <- data$f_spatial_size
    input_size <- c(data$input_w, data$input_h)

    input_raw <- data$input
    output_raw <- data$output
    weights_raw <- data$weights
    bias <- data$bias

    # output is smaller than input by (f_spatial_size - 1) in both dimensions
    out_size <- input_size - c(f_spatial_size, f_spatial_size) + c(1,1)
    # note the order: (height, width, filters)
    out_dims <- c(out_size[2], out_size[1], current_filter_count)
    input_modifier <- if(preproces_mean) mean(input_raw) else 0
    # print(out_size)

    # preprocess data so that we can use native * operator for element-wise multiplication
    # (in json we have format that is suitable to be dumped into kernel indexing)
    input_vec <- split_by_columns(input_raw, n_prev_filter_cnt) - input_modifier
    input <- array(input_vec, c(input_size[1], input_size[2], n_prev_filter_cnt))
    # print(round(input, 3))

    # create submatrices of size f_spatial_size^2 * n_prev_filter_cnt
    # (one per output position; dy is the outer loop, dx the inner one)
    sub_views <- list()
    for (dy in 1:out_size[2]) {
    for (dx in 1:out_size[1]) {
        end_dx <- dx + f_spatial_size - 1
        end_dy <- dy + f_spatial_size - 1
        sub_view <- input[dx:end_dx, dy:end_dy,]
        sub_views[[length(sub_views)+1]] <- sub_view
        # cat("SUBVIEW: ", dx, ":", end_dx, ", ", dy, ":",end_dy, "\n")
        # print(round(sub_view, 3))
    }
    }

    # weights
    # The JSON weights are interleaved by filter; they are first split per
    # filter and then scattered into weights_vec at a recomputed position.
    weights_vec <- c()
    weights_by_filter <- split_by_columns(weights_raw, current_filter_count, as_lists=TRUE)
    for (filter_id in 1:current_filter_count) {
        ws <- weights_by_filter[,filter_id]
        # print(sprintf("Weights for filter %d (len=%d): %s", filter_id, length(ws), paste(ws, collapse=" ")))
        for(i in 1:length(ws)){
            # b: zero-based position divided by f_spatial_size,
            # c: the remainder, d: zero-based filter index
            # a <- (i-1) %/% (f_spatial_size*f_spatial_size)
            b <- (i-1) %/% f_spatial_size
            c <- (i-1) %% f_spatial_size
            d <- filter_id-1
            # NOTE(review): the `a` term is disabled, so only b, c and d
            # determine the target position - confirm this is intended for
            # n_prev_filter_cnt > 1
            idx <- c * f_spatial_size * n_prev_filter_cnt * current_filter_count +
                #    a * n_prev_filter_cnt * current_filter_count +
                   b * current_filter_count +
                   d

            # idx is zero-based, R vectors are one-based
            weights_vec[idx+1] = ws[i]
        }
    }
    # first dimension selects the filter
    weights <- array(weights_vec, c(current_filter_count, f_spatial_size, f_spatial_size, n_prev_filter_cnt))

    # weights - debug print
    # for (filter_id in 1:current_filter_count) {
        # cat("Weights for filter", filter_id, ":\n")
        # print(weights[filter_id,,,])
    # }

    # execute
    # for every filter and every output position:
    #   sum(input window * filter weights) + bias, then multiplier or activation
    result <- c()
    for (filter_id in 1:current_filter_count) {
        B <- bias[filter_id]
        filter_weight <- weights[filter_id,,,]
        # print(filter_weight)

        for (sub_view in sub_views) {
            # print(round(sub_view,3))
            res <- sum(sub_view * filter_weight) + B
            res <- if(result_multiplier != 0) res * result_multiplier
                   else activation_function(res)
            result <- c(result, res)
        }
    }
    res_arr <- array(round(result, decimal_places), out_dims)

    # print status
    # bring the expected output into the same per-filter order as `result`
    output_vec <- split_by_columns(output_raw, current_filter_count)
    output <- array(output_vec, c(out_dims[1], out_dims[2], current_filter_count))
    exp_arr <- array(round(output, decimal_places), out_dims)

    cat("DIFFERENCE - calculated result vs JSON output field (should be ~0 across the board):\n")
    print(round(result-output,decimal_places))
    cat("RESULT:\n")
    print(round(res_arr,decimal_places))
    # cat("EXPECTED:\n")
    # print(round(exp_arr,2))

    result
}
135 | 
# Explanation of the printed matrices, shown once before any test case runs.
help_text <- "How to interpret results:\nResults have OUT_W*OUT_H*CURRENT_FILTER_COUNT numbers printed as OUT_W*OUT_H matrices. With the convention that data (in JSON) for each filter is in the respective column write content of each matrix (column-by-column) into single column (in JSON)."

cat("\n\n", help_text, "\n")

# print(json_data)
# print(class(json_data))
# print(json_data[[1]])
# print(length(json_data))
# print(names(json_data))

# Run every test case found in the JSON file, with its name printed between
# two separator lines.
separator <- '------------------'
for (case_name in names(json_data)) {
    print(separator)
    print(case_name)
    print(separator)
    test_layer(json_data[[case_name]], preproces_mean = FALSE)
}
152 | 
153 | 


--------------------------------------------------------------------------------
/src/kernel/layer_deltas.cl:
--------------------------------------------------------------------------------
  1 | 
  2 | /* clang-format off */
  3 | /**
  4 |  *
  5 |  * Calculate deltas*activation_func_derivative of previous layer.
  6 |  *
  7 |  * In following notation (l), (l-1) describes relative layer and [...] lower indices.
  8 |  *
  9 |  * Algo for delta_ijn on layer (l-1), where:
 10 |  *   i = 0..output_w(l-1),
 11 |  *   j = 0..output_h(l-1),
 12 |  *   n = 0..filter_count(l-1):
 13 |  *
 14 |  * delta[i,j,n](l-1) = 0
 15 |  * for a = 0..spatial_size(l+1):
 16 |  *   for b = 0..spatial_size(l+1):
 17 |  *     for k = 0..filter_count(l+1):
 18 |  *       delta[i,j,n](l-1) += \
 19 |  *         w[abnk](l-1)               # (1) weight of edge between [i,j,n](l-1) and [i+a,j+b,k](l)
 20 |  *         * delta[i-a,j-b,k](l)      # (2) error term for [i-a,j-b,k](l). minus since we have point (i,j) and we asking: 'which output point are we affecting with w[a,b,_,_]'
 21 |  *         * f`(x[i,j,n](l-1) )       # (3) derivative of activation function at measured point
 22 |  *
 23 |  * TODO in (3) should index be x[i+a,j+b,n] or x[i,j,n]?
 24 |  *
 25 |  * macros:
 26 |  * 	CURRENT_FILTER_COUNT                   filter_count(l-1)
 27 |  *
 28 |  * @param  float*      deltas_next_layer   size: output_w(l) * output_h(l) * filter_count(l)
 29 |  * @param  float*      layer_output        size: output_w(l-1) * output_h(l-1) * filter_count(l-1)
 30 |  * @param  float*      target              size: output_w(l-1) * output_h(l-1) * filter_count(l-1)
 31 |  * @param  float*      W                   weights between (l-1) and (l).
 32 |  *                                         WARN: w3 is between (l2) and (l3), w2 -> (l1) and (l2), w1 -> (input) and (l1)
 33 |  *                                         size: f_spatial_size*f_spatial_size*filter_count(l-1)*filter_count(l)
 34 |  * @param  uint        f_spatial_size      spatial/kernel size for (l-1)
 35 |  * @param  uint        f_next_spatial_size spatial/kernel size for (l)
 36 |  * @param  uint        n_next_filter_cnt   filter_count(l)
 37 |  * @param  uint        layer_out_w         output_w(l-1)
 38 |  * @param  uint        layer_out_h         output_h(l-1)
 39 |  * @return {[type]}                        [description]
 40 |  */
 41 | /* clang-format on */
 42 | __kernel void deltas(__global const float* deltas_next_layer,  //
 43 |                      __global const float* layer_output,       //
 44 |                      __global float* target,                   //
 45 |                      __global const float* W,                  //
 46 |                      uint f_spatial_size,                      //
 47 |                      uint f_next_spatial_size,                 //
 48 |                      uint n_next_filter_cnt,                   //
 49 |                      uint layer_out_w, uint layer_out_h) {
 50 |   // x=col=i; range: 0..layer_out_w
 51 |   // y=row=j; range: 0..layer_out_h
 52 |   const int2 pos = {get_global_id(0), get_global_id(1)};
 53 |   const uint sample_id = get_global_id(2);
 54 |   const int2 out_dim = {layer_out_w, layer_out_h};
 55 |   const int idx = ((pos.y * out_dim.x) + pos.x) * CURRENT_FILTER_COUNT;
 56 |   const int2 next_layer_out = {out_dim.x - f_next_spatial_size + 1,
 57 |                                out_dim.y - f_next_spatial_size + 1};
 58 | 
 59 | #define IMAGE_OFFSET_CURR \
 60 |   (sample_id * CURRENT_FILTER_COUNT * layer_out_w * layer_out_h)
 61 | #define IMAGE_OFFSET_NEXT \
 62 |   (sample_id * n_next_filter_cnt * next_layer_out.x * next_layer_out.y)
 63 | 
 64 |   // per-filter accumulators and cached activation derivatives for [i,j,n]
 65 |   float delta_for_filter[CURRENT_FILTER_COUNT];
 66 |   float activation_func_derivatives[CURRENT_FILTER_COUNT];
 67 | 
 68 |   // range check for i,j
 69 |   if (pos.x >= 0 && pos.x < out_dim.x &&  //
 70 |       pos.y >= 0 && pos.y < out_dim.y) {
 71 |     // fill tmp buffer values
 72 |     for (size_t n = 0; n < CURRENT_FILTER_COUNT; n++) {
 73 |       delta_for_filter[n] = 0.0f;
 74 |       // (3) f`( x[i,j,n](l-1) ): 1 for a positive output, 0 otherwise
 75 |       float y_ijn = layer_output[IMAGE_OFFSET_CURR + idx + n];
 76 |       activation_func_derivatives[n] = y_ijn > 0.0f ? 1.0f : 0.0f;
 77 |     }
 78 | 
 79 |     for (size_t dy = 0; dy < f_next_spatial_size; dy++) {
 80 |       for (size_t dx = 0; dx < f_next_spatial_size; dx++) {
 81 |         // NOTE: dy=a, dx=b
 82 |         int2 next_layer_pos = {pos.x - dx, pos.y - dy};
 83 |         size_t w_idx_2D = ((dy * f_next_spatial_size) + dx) *
 84 |                           n_next_filter_cnt * CURRENT_FILTER_COUNT;
 85 | 
 86 |         // (2) delta[i-a,j-b,k](l): map the current position to next layer
 87 |         // coords. Some points may be out of range, f.e. point(i=0,j=0) does
 88 |         // not affect any output through w[a,b] if a!=0 || b!=0. The position
 89 |         // does not depend on k, so index and range check are done once here.
 90 |         const int next_layer_idx =
 91 |             ((next_layer_pos.y * next_layer_out.x) + next_layer_pos.x) *
 92 |             n_next_filter_cnt;
 93 |         const bool in_range =
 94 |             next_layer_pos.x >= 0 && next_layer_pos.x < next_layer_out.x &&
 95 |             next_layer_pos.y >= 0 && next_layer_pos.y < next_layer_out.y;
 96 |         for (size_t k = 0; k < n_next_filter_cnt; k++) {
 97 |           float delta =
 98 |               in_range
 99 |                   ? deltas_next_layer[IMAGE_OFFSET_NEXT + next_layer_idx + k]
100 |                   : 0.0f;
101 | 
102 |           for (size_t n = 0; n < CURRENT_FILTER_COUNT; n++) {
103 |             // (1) w[abnk](l-1)
104 |             // NOTE: n iterates over lower layer's filters
105 |             size_t w_idx = w_idx_2D + n * n_next_filter_cnt + k;
106 |             float w = W[w_idx];
107 | 
108 |             // (3) f`( x[i,j,n](l-1) )
109 |             float activation_func_derivative = activation_func_derivatives[n];
110 | 
111 |             // result
112 |             delta_for_filter[n] += delta * w * activation_func_derivative;
113 |           }
114 |         }
115 | 
116 |         //
117 |       }
118 |     }
119 | 
120 |     // write results
121 |     for (size_t n = 0; n < CURRENT_FILTER_COUNT; n++) {
122 |       target[IMAGE_OFFSET_CURR + idx + n] = delta_for_filter[n];
123 |     }
124 | 
125 |     // end
126 |   }
127 | }
128 | 


--------------------------------------------------------------------------------
/libs/include/CL/cl_d3d9_ext.h:
--------------------------------------------------------------------------------
  1 | /**********************************************************************************
  2 |  * Copyright (c) 2008-2009 The Khronos Group Inc.
  3 |  *
  4 |  * Permission is hereby granted, free of charge, to any person obtaining a
  5 |  * copy of this software and/or associated documentation files (the
  6 |  * "Materials"), to deal in the Materials without restriction, including
  7 |  * without limitation the rights to use, copy, modify, merge, publish,
  8 |  * distribute, sublicense, and/or sell copies of the Materials, and to
  9 |  * permit persons to whom the Materials are furnished to do so, subject to
 10 |  * the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included
 13 |  * in all copies or substantial portions of the Materials.
 14 |  *
 15 |  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 16 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 17 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 18 |  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 19 |  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 20 |  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 21 |  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 22 |  **********************************************************************************/
 23 | 
 24 | #ifndef __OPENCL_CL_D3D9_EXT_H
 25 | #define __OPENCL_CL_D3D9_EXT_H
 26 | 
 27 | #include <d3d9.h>
 28 | #include <CL/cl.h>
 29 | #include <CL/cl_platform.h>
 30 | 
 31 | #ifdef __cplusplus
 32 | extern "C" {
 33 | #endif
 34 | 
 35 | /******************************************************************************
 36 |  * cl_nv_d3d9_sharing                                                         */
 37 | 
 38 | typedef cl_uint cl_d3d9_device_source_nv;
 39 | typedef cl_uint cl_d3d9_device_set_nv;
 40 | 
 41 | /******************************************************************************/
 42 | 
 43 | // Error Codes
 44 | #define CL_INVALID_D3D9_DEVICE_NV              -1010
 45 | #define CL_INVALID_D3D9_RESOURCE_NV            -1011
 46 | #define CL_D3D9_RESOURCE_ALREADY_ACQUIRED_NV   -1012
 47 | #define CL_D3D9_RESOURCE_NOT_ACQUIRED_NV       -1013
 48 | 
 49 | // cl_d3d9_device_source_nv
 50 | #define CL_D3D9_DEVICE_NV                      0x4022
 51 | #define CL_D3D9_ADAPTER_NAME_NV                0x4023
 52 | 
 53 | // cl_d3d9_device_set_nv
 54 | #define CL_PREFERRED_DEVICES_FOR_D3D9_NV       0x4024
 55 | #define CL_ALL_DEVICES_FOR_D3D9_NV             0x4025
 56 | 
 57 | // cl_context_info
 58 | #define CL_CONTEXT_D3D9_DEVICE_NV              0x4026
 59 | 
 60 | // cl_mem_info
 61 | #define CL_MEM_D3D9_RESOURCE_NV                0x4027
 62 | 
 63 | // cl_image_info
 64 | #define CL_IMAGE_D3D9_FACE_NV                  0x4028
 65 | #define CL_IMAGE_D3D9_LEVEL_NV                 0x4029
 66 | 
 67 | // cl_command_type
 68 | #define CL_COMMAND_ACQUIRE_D3D9_OBJECTS_NV     0x402A
 69 | #define CL_COMMAND_RELEASE_D3D9_OBJECTS_NV     0x402B
 70 | 
 71 | /******************************************************************************/
 72 | 
 73 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D9NV_fn)(
 74 |     cl_platform_id            platform,
 75 |     cl_d3d9_device_source_nv  d3d_device_source,
 76 |     void *                    d3d_object,
 77 |     cl_d3d9_device_set_nv     d3d_device_set,
 78 |     cl_uint                   num_entries, 
 79 |     cl_device_id *            devices, 
 80 |     cl_uint *                 num_devices) CL_API_SUFFIX__VERSION_1_0;
 81 | 
 82 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9VertexBufferNV_fn)(
 83 |     cl_context               context,
 84 |     cl_mem_flags             flags,
 85 |     IDirect3DVertexBuffer9 * resource,
 86 |     cl_int *                 errcode_ret) CL_API_SUFFIX__VERSION_1_0;
 87 | 
 88 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9IndexBufferNV_fn)(
 89 |     cl_context              context,
 90 |     cl_mem_flags            flags,
 91 |     IDirect3DIndexBuffer9 * resource,
 92 |     cl_int *                errcode_ret) CL_API_SUFFIX__VERSION_1_0;
 93 | 
 94 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9SurfaceNV_fn)(
 95 |     cl_context          context,
 96 |     cl_mem_flags        flags,
 97 |     IDirect3DSurface9 * resource,
 98 |     cl_int *            errcode_ret) CL_API_SUFFIX__VERSION_1_0;
 99 | 
100 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9TextureNV_fn)(
101 |     cl_context         context,
102 |     cl_mem_flags       flags,
103 |     IDirect3DTexture9 *resource,
104 |     UINT               miplevel,
105 |     cl_int *           errcode_ret) CL_API_SUFFIX__VERSION_1_0;
106 | 
107 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9CubeTextureNV_fn)(
108 |     cl_context              context,
109 |     cl_mem_flags            flags,
110 |     IDirect3DCubeTexture9 * resource,
111 |     D3DCUBEMAP_FACES        facetype,
112 |     UINT                    miplevel,
113 |     cl_int *                errcode_ret) CL_API_SUFFIX__VERSION_1_0;
114 | 
115 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9VolumeTextureNV_fn)(
116 |     cl_context                context,
117 |     cl_mem_flags              flags,
118 |     IDirect3DVolumeTexture9 * resource,
119 |     UINT                      miplevel,
120 |     cl_int *                  errcode_ret) CL_API_SUFFIX__VERSION_1_0;
121 | 
122 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D9ObjectsNV_fn)(
123 |     cl_command_queue command_queue,
124 |     cl_uint num_objects,
125 |     const cl_mem *mem_objects,
126 |     cl_uint num_events_in_wait_list,
127 |     const cl_event *event_wait_list,
128 |     cl_event *event) CL_API_SUFFIX__VERSION_1_0;
129 | 
130 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D9ObjectsNV_fn)(
131 |     cl_command_queue command_queue,
132 |     cl_uint num_objects,
133 |     cl_mem *mem_objects,
134 |     cl_uint num_events_in_wait_list,
135 |     const cl_event *event_wait_list,
136 |     cl_event *event) CL_API_SUFFIX__VERSION_1_0;
137 | 
138 | #ifdef __cplusplus
139 | }
140 | #endif
141 | 
142 | #endif  // __OPENCL_CL_D3D9_EXT_H
143 | 
144 | 


--------------------------------------------------------------------------------
/src/ConfigBasedDataPipeline.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef CONFIG_BASED_DATA_PIPELINE_H
  2 | #define CONFIG_BASED_DATA_PIPELINE_H
  3 | 
  4 | #include "DataPipeline.hpp"
  5 | #include "LayerData.hpp"
  6 | 
  7 | namespace cnn_sr {
  8 | 
  9 | /**
 10 |  * All gpu buffer handles related to single image
 11 |  */
 12 | struct SampleAllocationPool {
 13 |   /** Raw 3 channel image loaded from hard drive */
 14 |   opencl::MemoryHandle input_data = gpu_nullptr;
 15 |   /** Single channel (luma) of size input_w*input_h */
 16 |   opencl::MemoryHandle input_luma = gpu_nullptr;
 17 |   /** Dimensions of original image; zeroed so a fresh pool is never indeterminate */
 18 |   size_t input_w = 0, input_h = 0;
 19 | 
 20 |   /** Training: Raw 3 channel image loaded from hard drive */
 21 |   opencl::MemoryHandle expected_data = gpu_nullptr;
 22 |   /** Training: luma to compare our result to */
 23 |   opencl::MemoryHandle expected_luma = gpu_nullptr;
 24 | 
 25 |   SampleAllocationPool() = default;
 26 | 
 27 |   // private:
 28 |   // SampleAllocationPool(const SampleAllocationPool&) = delete;
 29 |   // SampleAllocationPool& operator=(const SampleAllocationPool&) = delete;
 30 | };
 31 | 
 32 | /** Per-layer pools for the 3 layers plus one SampleAllocationPool per sample */
 33 | struct GpuAllocationPool {
 34 |   LayerAllocationPool layer_1;
 35 |   LayerAllocationPool layer_2;
 36 |   LayerAllocationPool layer_3;
 37 | 
 38 |   std::vector<SampleAllocationPool> samples;
 39 | };
 40 | 
 41 | /**
 42 |  * Class that wraps all low level functions from DataPipeline into something
 43 |  * more usable
 44 |  */
 45 | class ConfigBasedDataPipeline : public DataPipeline {
 46 |  public:
 47 |   ConfigBasedDataPipeline(Config&, opencl::Context*);
 48 | 
 49 |   void init(int load_flags);
 50 | 
 51 |   void set_mini_batch_size(size_t);
 52 | 
 53 |   float execute_batch(bool backpropagate, GpuAllocationPool&,
 54 |                       std::vector<SampleAllocationPool*>&);
 55 | 
 56 |   cl_event forward(LayerAllocationPool& layer_1_alloc,  //
 57 |                    LayerAllocationPool& layer_2_alloc,  //
 58 |                    LayerAllocationPool& layer_3_alloc,  //
 59 |                    SampleAllocationPool& sample);
 60 | 
 61 |  private:
 62 |   void allocate_buffers(size_t, size_t);
 63 | 
 64 |   cl_event forward(LayerAllocationPool& layer_1_alloc,  //
 65 |                    LayerAllocationPool& layer_2_alloc,  //
 66 |                    LayerAllocationPool& layer_3_alloc,  //
 67 |                    size_t w, size_t h, size_t id);
 68 | 
 69 |   /* clang-format off */
 70 |   /**
 71 |    * General backpropagation steps:
 72 |    *   - calculate weight decay (NOTE(review): no such argument in this signature - confirm where the value comes from)
 73 |    *   - calculate deltas for last layer
 74 |    *   - calculate deltas for the other layers in reverse order
 75 |    *   - backpropagate: calculate gradient w, gradient b for all layers
 76 |    *   - update weights and biases (NOTE: requires explicit call to ConfigBasedDataPipeline::update_parameters(...))
 77 |    *
 78 |    * NOTE(review): the parameters are unnamed in this declaration; the
 79 |    * descriptions below follow positional order and should be confirmed
 80 |    * against the definition.
 81 |    *
 82 |    * @param  1st..3rd             allocation pools, presumably for layers 1, 2 and 3 (in this order)
 83 |    * @param  4th..6th             three size_t values, presumably (w, h, id) as in the
 84 |    *                              private forward(...) overload - TODO confirm
 85 |    * @param  ev_to_wait_for       optional event, nullptr by default; presumably waited on
 86 |    *                              before the first step is enqueued
 87 |    * @return                      cl_event, presumably of the last enqueued step
 88 |    */
 89 |   cl_event backpropagate(cnn_sr::LayerAllocationPool&,
 90 |                          cnn_sr::LayerAllocationPool&,
 91 |                          cnn_sr::LayerAllocationPool&,
 92 |                          size_t, size_t, size_t,
 93 |                          cl_event* ev_to_wait_for = nullptr);
 94 |   /* clang-format on */
 95 | 
 96 |  public:
 97 |   /** update weights and biases*/
 98 |   void update_parameters(cnn_sr::LayerAllocationPool&,
 99 |                          cnn_sr::LayerAllocationPool&,
100 |                          cnn_sr::LayerAllocationPool&, size_t batch_size,
101 |                          cl_event* ev_to_wait_for = nullptr);
102 | 
103 |   void write_params_to_file(const char* const file_path,  //
104 |                             cnn_sr::LayerAllocationPool,
105 |                             cnn_sr::LayerAllocationPool,
106 |                             cnn_sr::LayerAllocationPool);
107 | 
108 |   void write_result_image(const char* const, opencl::utils::ImageData&,
109 |                           SampleAllocationPool& sample);
110 | 
111 |   inline const Config* config() { return _config; }
112 |   inline const LayerData* layer_1() { return &layer_data_1; }
113 |   inline const LayerData* layer_2() { return &layer_data_2; }
114 |   inline const LayerData* layer_3() { return &layer_data_3; }
115 | 
116 |  protected:
117 |   void load_kernels(int load_flags);
118 | 
119 |  private:
120 |   void fill_random_parameters(LayerData&, ParametersDistribution&);
121 | 
122 |   size_t load_parameters_file(const char* const);
123 | 
124 |   void create_luma_image(const char* const, opencl::MemoryHandle, size_t,
125 |                          size_t);
126 | 
127 |   // void create_lumas_delta_image(const char* const, SampleAllocationPool& e,
128 |   // AllocationItem&);
129 | 
130 |  private:
131 |   Config* const _config;
132 |   LayerData layer_data_1;
133 |   LayerData layer_data_2;
134 |   LayerData layer_data_3;
135 |   size_t epochs = 0;
136 |   size_t _mini_batch_size = 0;
137 | 
138 |   /* ground truth for batch */
139 |   opencl::MemoryHandle _ground_truth_gpu_buf = gpu_nullptr;
140 |   /** input for layer 1 */
141 |   opencl::MemoryHandle _forward_gpu_buf = gpu_nullptr;
142 |   /** outputs for layers */
143 |   opencl::MemoryHandle _out_1_gpu_buf = gpu_nullptr,  //
144 |       _out_2_gpu_buf = gpu_nullptr,                   //
145 |       _out_3_gpu_buf = gpu_nullptr;
146 |   /** deltas for layers */
147 |   opencl::MemoryHandle _delta_1_gpu_buf = gpu_nullptr,  //
148 |       _delta_2_gpu_buf = gpu_nullptr,                   //
149 |       _delta_3_gpu_buf = gpu_nullptr;
150 | 
151 |   opencl::Kernel* _layer_1_kernel = nullptr;
152 |   opencl::Kernel* _layer_2_kernel = nullptr;
153 |   opencl::Kernel* _layer_3_kernel = nullptr;
154 |   opencl::Kernel* _layer_1_deltas_kernel = nullptr;
155 |   opencl::Kernel* _layer_2_deltas_kernel = nullptr;
156 | };
157 | }
158 | 
159 | #endif /* CONFIG_BASED_DATA_PIPELINE_H   */
160 | 


--------------------------------------------------------------------------------
/test/specs/LayerTest.cpp:
--------------------------------------------------------------------------------
  1 | #include "TestSpecsDeclarations.hpp"
  2 | 
  3 | #include <sstream>
  4 | #include <fstream>
  5 | #include <iostream>
  6 | #include <cstring>
  7 | 
  8 | #include "json/gason.h"
  9 | 
 10 | #include "../../src/DataPipeline.hpp"
 11 | #include "../../src/LayerData.hpp"
 12 | 
 13 | auto test_data_file = "test/data/test_cases.json";
 14 | 
 15 | /* clang-format off */
 16 | /*
 17 |  *
 18 |  * NOTE: use LayerTest_script.R to generate expected output values
 19 |  *
 20 |  *
 21 |  *  Test data schema description (values for each layer provided after '/'):
 22 |  *
 23 |  *  n_prev_filter_cnt    := INT, filter count for previous layer, values: 1/n1/n2
 24 |  *  current_filter_count := INT, filter count for this layer, values: n1/n2/1
 25 |  *  f_spatial_size       := INT, spatial size, values: f1/f2/f3
 26 |  *  input_w              := INT, input dimensions
 27 |  *  input_h              := INT, input dimensions
 28 |  *  input                := VECTOR[FLOAT], min size: input_w * input_h * n_prev_filter_cnt.
 29 |  *                           Each column for different filter(from 1 to n_prev_filter_cnt).
 30 |  *                           Each row for different point in range 0..input_w*input_h
 31 |  *  output               := VECTOR[FLOAT], min size: out_w * out_h * current_filter_count
 32 |  *                           Expected output
 33 |  *  weights              := VECTOR[FLOAT], min size: f_spatial_size^2 * n_prev_filter_cnt * current_filter_count
 34 |  *                           There are f_spatial_size paragraphs
 35 |  *                           Each paragraph consists of f_spatial_size lines, representing 1 row.
 36 |  *                           Each row contains current_filter_count*n_prev_filter_cnt numbers,
 37 |  *                           grouped by n_prev_filter_cnt (n_prev_filter_cnt groups,
 38 |  *                           current_filter_count numbers per each group).
 39 |  *  bias                 := VECTOR[FLOAT], min size: current_filter_count
 40 |  *
 41 |  *
 42 |  * calculated values:
 43 |  *   out_w := input_w - f_spatial_size + 1
 44 |  *   out_h := input_h - f_spatial_size + 1
 45 |  */
 46 | /* clang-format on */
 47 | 
 48 | namespace test {
 49 | namespace specs {
 50 | 
 51 | ///
 52 | /// Data set
 53 | ///
 54 | struct LayerDataSet : DataSet {
 55 |   // zero defaults: a field missing from the test JSON is never indeterminate
 56 |   size_t n_prev_filter_cnt = 0, current_filter_count = 0,  //
 57 |       f_spatial_size = 0,                                  //
 58 |       input_w = 0, input_h = 0;
 59 |   std::vector<float> input;
 60 |   std::vector<float> output;
 61 |   std::vector<float> weights;
 62 |   std::vector<float> bias;
 63 | };
 64 | 
 65 | ///
 66 | /// PIMPL: owns the data sets that read_test_data_from_file() parses
 67 | ///
 68 | struct LayerTestImpl {
 69 |   bool read_test_data_from_file(char const* const file);
 70 | 
 71 |   std::vector<LayerDataSet> data_sets;
 72 | };
 73 | 
 74 | ///
 75 | /// LayerTest
 76 | ///
 77 | 
 78 | TEST_SPEC_PIMPL(LayerTest)
 79 | 
 80 | void LayerTest::init() {
 81 |   // load every data set up front; terminate the process if that fails
 82 |   const bool loaded = _impl->read_test_data_from_file(test_data_file);
 83 |   if (loaded) return;
 84 |   exit(EXIT_FAILURE);
 85 | }
 86 | /** Number of data sets currently held by the implementation object. */
 87 | size_t LayerTest::data_set_count() { return _impl->data_sets.size(); }
 88 | 
 89 | std::string LayerTest::name(size_t data_set_id) {
 90 |   // nothing loaded: still return a printable name, the id is not checked
 91 |   if (data_set_count() == 0) return "Layer test - no data sets provided";
 92 |   assert_data_set_ok(data_set_id);
 93 |   const auto& data_set = _impl->data_sets[data_set_id];
 94 |   return "Layer test - " + data_set.name;
 95 | }
 96 | 
 97 | bool LayerTest::operator()(size_t data_set_id,
 98 |                            cnn_sr::DataPipeline* const pipeline) {
 99 |   if (data_set_count() == 0) return false;
100 | 
101 |   assert_not_null(pipeline);
102 |   assert_data_set_ok(data_set_id);
103 |   auto data = &_impl->data_sets[data_set_id];
104 |   auto _context = pipeline->context();
105 | 
106 |   // convert layer test definition to cnn_sr::LayerData object
107 |   cnn_sr::LayerData layer_data(data->n_prev_filter_cnt,
108 |                                data->current_filter_count,
109 |                                data->f_spatial_size);
110 |   layer_data.set_weights(data->weights.data());
111 |   layer_data.set_bias(data->bias.data());
112 | 
113 |   size_t out_dim[2];
114 |   layer_data.get_output_dimensions(out_dim, data->input_w, data->input_h);
115 |   // alloc input; cl_mem_flags describe kernel-side access and the layer
116 |   // kernel reads this buffer, so it must not be CL_MEM_WRITE_ONLY
117 |   cnn_sr::LayerAllocationPool gpu_alloc;
118 |   opencl::MemoryHandle gpu_output = gpu_nullptr;
119 |   auto gpu_buf_in = _context->allocate(CL_MEM_READ_ONLY,
120 |                                        sizeof(cl_float) * data->input.size());
121 |   _context->write_buffer(gpu_buf_in, (void*)data->input.data(), true);
122 | 
123 |   // create kernel & run
124 |   auto kernel = pipeline->create_layer_kernel(layer_data, false);
125 |   pipeline->execute_layer(*kernel, layer_data, gpu_alloc, gpu_buf_in,
126 |                           data->input_w, data->input_h, gpu_output);
127 |   assert_equals(pipeline, data->output, gpu_output);
128 | 
129 |   return true;
130 | }
131 | 
132 | // Fills `data` from the members of one JSON data-set object: every member is
133 | // passed to every try_read_* call together with the key that call handles
134 | // (presumably a call only acts on a matching key). Always returns true.
135 | 
136 | bool read_layer_data(const JsonValue& object, LayerDataSet& data) {
137 |   // ASSERT(object.getTag() == JSON_TAG_OBJECT);
138 |   using namespace cnn_sr::utils;
139 | 
140 |   for (auto node : object) {
141 |     try_read_uint(*node, data.n_prev_filter_cnt, "n_prev_filter_cnt");
142 |     try_read_uint(*node, data.f_spatial_size, "f_spatial_size");
143 |     try_read_uint(*node, data.current_filter_count, "current_filter_count");
144 |     try_read_uint(*node, data.input_w, "input_w");
145 |     try_read_uint(*node, data.input_h, "input_h");
146 |     try_read_vector(*node, data.input, "input");
147 |     try_read_vector(*node, data.output, "output");
148 |     try_read_vector(*node, data.weights, "weights");
149 |     try_read_vector(*node, data.bias, "bias");
150 |   }
151 | 
152 |   return true;
153 | }
154 | 
155 | bool LayerTestImpl::read_test_data_from_file(char const* const file) {
156 |   std::cout << "Loading layer test data from: '" << file << "'" << std::endl;
157 | 
158 |   JsonValue value;
159 |   JsonAllocator allocator;
160 |   std::string source;
161 |   cnn_sr::utils::read_json_file(file, value, allocator, source, JSON_OBJECT);
162 | 
163 |   // a non-object root yields no data sets and is not reported as an error
164 |   if (value.getTag() != JSON_OBJECT) return true;
165 | 
166 |   bool all_read = true;
167 |   for (auto entry : value) {
168 |     // only object-valued members describe a data set; the key is its name
169 |     if (entry->value.getTag() != JSON_OBJECT) continue;
170 |     data_sets.emplace_back();
171 |     LayerDataSet& data_set = data_sets.back();
172 |     data_set.name = entry->key;
173 |     all_read &= read_layer_data(entry->value, data_set);
174 |   }
175 |   return all_read;
176 | }
177 | 
178 | //
179 | //
180 | }  // namespace specs
181 | }  // namespace test
182 | 


--------------------------------------------------------------------------------
/src/Config.cpp:
--------------------------------------------------------------------------------
  1 | #include "Config.hpp"
  2 | #include <cmath>    // for std::abs
  3 | #include <cstring>  // for strcmp when reading json
  4 | 
  5 | #include "json/gason.h"
  6 | 
  7 | namespace cnn_sr {
  8 | using namespace utils;
  9 | 
 10 | const char* const parameters_keys[3] = {"parameters_distribution_1",
 11 |                                         "parameters_distribution_2",
 12 |                                         "parameters_distribution_3"};
 13 | /** Stores the four distribution parameters exactly as given (no checks here). */
 14 | ParametersDistribution::ParametersDistribution(float mean_w, float mean_b,
 15 |                                                float sd_w, float sd_b)
 16 |     : mean_w(mean_w), sd_w(sd_w), mean_b(mean_b), sd_b(sd_b) {}
 17 | 
 18 | ///
 19 | /// Config: the ctor stores its arguments and copies exactly 3 learning rates
 20 | ///
 21 | Config::Config(size_t n1, size_t n2,                                       //
 22 |                size_t f1, size_t f2, size_t f3,                            //
 23 |                float momentum, float weight_decay, float* learning_rates,  //
 24 |                ParametersDistribution pd1,                                 //
 25 |                ParametersDistribution pd2,                                 //
 26 |                ParametersDistribution pd3,                                 //
 27 |                const char* const parameters_file)
 28 |     : n1(n1),
 29 |       n2(n2),
 30 |       f1(f1),
 31 |       f2(f2),
 32 |       f3(f3),
 33 |       momentum(momentum),
 34 |       weight_decay_parameter(weight_decay),
 35 |       parameters_file(parameters_file),
 36 |       params_distr_1(pd1),
 37 |       params_distr_2(pd2),
 38 |       params_distr_3(pd3) {
 39 |   for (size_t i = 0; i < 3; i++) {  // learning_rates must hold >= 3 floats
 40 |     this->learning_rate[i] = learning_rates[i];
 41 |   }
 42 | }
 43 | /** Sum of (f - 1) over the 3 layers; presumably the size lost to unpadded convolutions. */
 44 | size_t Config::total_padding() const { return f1 + f2 + f3 - 3; }
 45 | /** Validates `config`; every failed check is reported through utils::require. */
 46 | void Config::validate(Config& config) {
 47 |   // spatial size works best if it is an odd number
 48 |   utils::require(is_odd(config.f1), "f1 should be odd");
 49 |   utils::require(is_odd(config.f2), "f2 should be odd");
 50 |   utils::require(is_odd(config.f3), "f3 should be odd");
 51 |   // both filter count and spatial size cannot be 0
 52 |   utils::require(config.n1 > 0, "n1 should be >0");
 53 |   utils::require(config.n2 > 0, "n2 should be >0");
 54 |   utils::require(config.f1 > 0, "f1 should be >0");
 55 |   utils::require(config.f2 > 0, "f2 should be >0");
 56 |   utils::require(config.f3 > 0, "f3 should be >0");
 57 | 
 58 |   // weight decay may be switched off (0), learning rates may not
 59 |   utils::require(config.weight_decay_parameter >= 0,
 60 |                  "weight_decay should be >=0");
 61 |   utils::require(config.learning_rate[0] > 0 && config.learning_rate[1] > 0 &&
 62 |                      config.learning_rate[2] > 0,
 63 |                  "All learning rates should be >0");
 64 | 
 65 |   // ParametersDistribution
 66 |   ParametersDistribution* pd_arr[3] = {&config.params_distr_1,  //
 67 |                                        &config.params_distr_2,  //
 68 |                                        &config.params_distr_3};
 69 |   // sd of weights must be strictly positive, sd of bias may be 0
 70 |   for (auto pd : pd_arr) {
 71 |     utils::require(pd->sd_w > 0, "std dev. for weights should be > 0");
 72 |     utils::require(pd->sd_b >= 0, "std dev. for bias should be >= 0");
 73 |   }
 74 | }
 75 | 
 76 | ///
 77 | /// ConfigReader
 78 | ///
 79 | /** Scratch values collected by ConfigReader::read before the Config is built. */
 80 | struct ConfigHelper {
 81 |   size_t n1 = 0, n2 = 0, f1 = 0, f2 = 0, f3 = 0;  // 0 => rejected by validate
 82 |   float momentum = 0.0f, weight_decay = 0.0f;
 83 |   std::string parameters_file = "";
 84 |   std::vector<float> learning_rates;
 85 | };
 86 | /** Replaces all four values with their absolute value. NOTE(review): this also drops the sign of the means - confirm intended. */
 87 | void fix_params_distribution(ParametersDistribution& d) {
 88 |   d.mean_w = std::abs(d.mean_w);
 89 |   d.mean_b = std::abs(d.mean_b);
 90 |   d.sd_w = std::abs(d.sd_w);
 91 |   d.sd_b = std::abs(d.sd_b);
 92 | }
 93 | /** Reads mean_w / mean_b / std_deviation_w / std_deviation_b from the members of `node` into `data`. */
 94 | void load_parameters_distr(JsonNode* node, ParametersDistribution& data) {
 95 |   for (auto subnode : node->value) {
 96 |     utils::try_read_float(*subnode, data.mean_w, "mean_w");
 97 |     utils::try_read_float(*subnode, data.mean_b, "mean_b");
 98 |     utils::try_read_float(*subnode, data.sd_w, "std_deviation_w");
 99 |     utils::try_read_float(*subnode, data.sd_b, "std_deviation_b");
100 |   }
101 | }
102 | /** Parses the JSON config `file`, builds a Config from it and validates it before returning. */
103 | Config ConfigReader::read(const char* const file) {
104 |   JsonValue value;
105 |   JsonAllocator allocator;
106 |   std::string source;
107 |   utils::read_json_file(file, value, allocator, source, JSON_OBJECT);
108 | 
109 |   ConfigHelper cfg_h;
110 |   ParametersDistribution pd1, pd2, pd3;
111 |   for (auto node : value) {
112 |     auto key = node->key;
113 |     utils::try_read_uint(*node, cfg_h.n1, "n1");
114 |     utils::try_read_uint(*node, cfg_h.n2, "n2");
115 |     utils::try_read_uint(*node, cfg_h.f1, "f1");
116 |     utils::try_read_uint(*node, cfg_h.f2, "f2");
117 |     utils::try_read_uint(*node, cfg_h.f3, "f3");
118 |     utils::try_read_float(*node, cfg_h.momentum, "momentum");
119 |     utils::try_read_float(*node, cfg_h.weight_decay, "weight_decay_parameter");
120 |     utils::try_read_string(*node, cfg_h.parameters_file, "parameters_file");
121 |     utils::try_read_vector(*node, cfg_h.learning_rates, "learning_rates");
122 | 
123 |     if (strcmp(key, parameters_keys[0]) == 0) {
124 |       load_parameters_distr(node, pd1);
125 |     } else if (strcmp(key, parameters_keys[1]) == 0) {
126 |       load_parameters_distr(node, pd2);
127 |     } else if (strcmp(key, parameters_keys[2]) == 0) {
128 |       load_parameters_distr(node, pd3);
129 |     }
130 |   }
131 | 
132 |   fix_params_distribution(pd1);
133 |   fix_params_distribution(pd2);
134 |   fix_params_distribution(pd3);
135 |   utils::require(cfg_h.learning_rates.size() == 3,
136 |                  "Expected 3 learning rates (one per layer) to be provided");
137 |   // NOTE(review): c_str() below is only valid while cfg_h lives - confirm that Config copies the string
138 |   Config cfg(cfg_h.n1, cfg_h.n2,            //
139 |              cfg_h.f1, cfg_h.f2, cfg_h.f3,  //
140 |              cfg_h.momentum, cfg_h.weight_decay,
141 |              &cfg_h.learning_rates[0],  //
142 |              pd1, pd2, pd3,             //
143 |              cfg_h.parameters_file.c_str());
144 |   Config::validate(cfg);
145 | 
146 |   return cfg;
147 | }
148 | }
149 | /** Prints a ParametersDistribution as "{ weights(mean, sd), bias(mean, sd)}". */
150 | std::ostream& operator<<(std::ostream& os,
151 |                          const cnn_sr::ParametersDistribution& pd) {
152 |   /* clang-format off */
153 |   os << "{ weights(" << pd.mean_w << ", " << pd.sd_w
154 |      << "), bias("   << pd.mean_b << ", " << pd.sd_b << ")}";
155 |   /* clang-format on */
156 |   return os;
157 | }
158 | /** Prints a multi-line summary of the config; layer 3 is listed with its spatial size only. */
159 | std::ostream& operator<<(std::ostream& os, const cnn_sr::Config& cfg) {
160 |   /* clang-format off */
161 |   os << "Config {" << std::endl
162 |      << "  parameters file: '" << cfg.parameters_file << "'" << std::endl
163 |      << "  momentum: " << cfg.momentum << std::endl
164 |      << "  learning rates: { " << cfg.learning_rate[0] << ", "
165 |                                << cfg.learning_rate[1] << ", "
166 |                                << cfg.learning_rate[2] << "}" << std::endl
167 |      << "  layer 1: " << cfg.n1 << " filters, " << cfg.f1 << " spatial size" << std::endl
168 |      << "  layer 2: " << cfg.n2 << " filters, " << cfg.f2 << " spatial size" << std::endl
169 |      << "  layer 3: " << cfg.f3 << " spatial size" << std::endl
170 |      << "  parameters dist. 1 " << cfg.params_distr_1 << std::endl
171 |      << "  parameters dist. 2 " << cfg.params_distr_2 << std::endl
172 |      << "  parameters dist. 3 " << cfg.params_distr_3 << "}" << std::endl;
173 |   /* clang-format on */
174 |   return os;
175 | }
176 | 


--------------------------------------------------------------------------------
/libs/include/CL/cl_gl.h:
--------------------------------------------------------------------------------
  1 | /**********************************************************************************
  2 |  * Copyright (c) 2008-2010 The Khronos Group Inc.
  3 |  *
  4 |  * Permission is hereby granted, free of charge, to any person obtaining a
  5 |  * copy of this software and/or associated documentation files (the
  6 |  * "Materials"), to deal in the Materials without restriction, including
  7 |  * without limitation the rights to use, copy, modify, merge, publish,
  8 |  * distribute, sublicense, and/or sell copies of the Materials, and to
  9 |  * permit persons to whom the Materials are furnished to do so, subject to
 10 |  * the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included
 13 |  * in all copies or substantial portions of the Materials.
 14 |  *
 15 |  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 16 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 17 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 18 |  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 19 |  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 20 |  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 21 |  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 22 |  **********************************************************************************/
 23 | 
 24 | /* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
 25 | 
 26 | /*
 27 |  * cl_gl.h contains Khronos-approved (KHR) OpenCL extensions which have
 28 |  * OpenGL dependencies. The application is responsible for #including
 29 |  * OpenGL or OpenGL ES headers before #including cl_gl.h.
 30 |  */
 31 | 
 32 | #ifndef __OPENCL_CL_GL_H
 33 | #define __OPENCL_CL_GL_H
 34 | 
 35 | #ifdef __APPLE__
 36 | #include <OpenCL/cl.h>
 37 | #include <OpenGL/CGLDevice.h>
 38 | #else
 39 | #include "CL/cl.h"
 40 | #endif	
 41 | 
 42 | #ifdef __cplusplus
 43 | extern "C" {
 44 | #endif
 45 | 
 46 | typedef cl_uint     cl_gl_object_type;
 47 | typedef cl_uint     cl_gl_texture_info;
 48 | typedef cl_uint     cl_gl_platform_info;
 49 | typedef struct __GLsync *cl_GLsync;
 50 | 
 51 | /* cl_gl_object_type */
 52 | #define CL_GL_OBJECT_BUFFER             0x2000
 53 | #define CL_GL_OBJECT_TEXTURE2D          0x2001
 54 | #define CL_GL_OBJECT_TEXTURE3D          0x2002
 55 | #define CL_GL_OBJECT_RENDERBUFFER       0x2003
 56 | 
 57 | /* cl_gl_texture_info */
 58 | #define CL_GL_TEXTURE_TARGET            0x2004
 59 | #define CL_GL_MIPMAP_LEVEL              0x2005
 60 | 
 61 | extern CL_API_ENTRY cl_mem CL_API_CALL
 62 | clCreateFromGLBuffer(cl_context     /* context */,
 63 |                      cl_mem_flags   /* flags */,
 64 |                      cl_GLuint      /* bufobj */,
 65 |                      int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 66 | 
 67 | extern CL_API_ENTRY cl_mem CL_API_CALL
 68 | clCreateFromGLTexture2D(cl_context      /* context */,
 69 |                         cl_mem_flags    /* flags */,
 70 |                         cl_GLenum       /* target */,
 71 |                         cl_GLint        /* miplevel */,
 72 |                         cl_GLuint       /* texture */,
 73 |                         cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 74 | 
 75 | extern CL_API_ENTRY cl_mem CL_API_CALL
 76 | clCreateFromGLTexture3D(cl_context      /* context */,
 77 |                         cl_mem_flags    /* flags */,
 78 |                         cl_GLenum       /* target */,
 79 |                         cl_GLint        /* miplevel */,
 80 |                         cl_GLuint       /* texture */,
 81 |                         cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 82 | 
 83 | extern CL_API_ENTRY cl_mem CL_API_CALL
 84 | clCreateFromGLRenderbuffer(cl_context   /* context */,
 85 |                            cl_mem_flags /* flags */,
 86 |                            cl_GLuint    /* renderbuffer */,
 87 |                            cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 88 | 
 89 | extern CL_API_ENTRY cl_int CL_API_CALL
 90 | clGetGLObjectInfo(cl_mem                /* memobj */,
 91 |                   cl_gl_object_type *   /* gl_object_type */,
 92 |                   cl_GLuint *              /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
 93 |                   
 94 | extern CL_API_ENTRY cl_int CL_API_CALL
 95 | clGetGLTextureInfo(cl_mem               /* memobj */,
 96 |                    cl_gl_texture_info   /* param_name */,
 97 |                    size_t               /* param_value_size */,
 98 |                    void *               /* param_value */,
 99 |                    size_t *             /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
100 | 
101 | extern CL_API_ENTRY cl_int CL_API_CALL
102 | clEnqueueAcquireGLObjects(cl_command_queue      /* command_queue */,
103 |                           cl_uint               /* num_objects */,
104 |                           const cl_mem *        /* mem_objects */,
105 |                           cl_uint               /* num_events_in_wait_list */,
106 |                           const cl_event *      /* event_wait_list */,
107 |                           cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
108 | 
109 | extern CL_API_ENTRY cl_int CL_API_CALL
110 | clEnqueueReleaseGLObjects(cl_command_queue      /* command_queue */,
111 |                           cl_uint               /* num_objects */,
112 |                           const cl_mem *        /* mem_objects */,
113 |                           cl_uint               /* num_events_in_wait_list */,
114 |                           const cl_event *      /* event_wait_list */,
115 |                           cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
116 | 
117 | /* cl_khr_gl_sharing extension  */
118 | 
119 | #define cl_khr_gl_sharing 1
120 | 
121 | typedef cl_uint     cl_gl_context_info;
122 | 
123 | /* Additional Error Codes  */
124 | #define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR  -1000
125 | 
126 | /* cl_gl_context_info  */
127 | #define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR    0x2006
128 | #define CL_DEVICES_FOR_GL_CONTEXT_KHR           0x2007
129 | 
130 | /* Additional cl_context_properties  */
131 | #define CL_GL_CONTEXT_KHR                       0x2008
132 | #define CL_EGL_DISPLAY_KHR                      0x2009
133 | #define CL_GLX_DISPLAY_KHR                      0x200A
134 | #define CL_WGL_HDC_KHR                          0x200B
135 | #define CL_CGL_SHAREGROUP_KHR                   0x200C
136 | 
137 | extern CL_API_ENTRY cl_int CL_API_CALL
138 | clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
139 |                       cl_gl_context_info            /* param_name */,
140 |                       size_t                        /* param_value_size */,
141 |                       void *                        /* param_value */,
142 |                       size_t *                      /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
143 | 
144 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
145 |     const cl_context_properties * properties,
146 |     cl_gl_context_info            param_name,
147 |     size_t                        param_value_size,
148 |     void *                        param_value,
149 |     size_t *                      param_value_size_ret);
150 | 
151 | #ifdef __cplusplus
152 | }
153 | #endif
154 | 
155 | #endif  /* __OPENCL_CL_GL_H  */
156 | 


--------------------------------------------------------------------------------
/test/specs/BackpropagationTest.cpp:
--------------------------------------------------------------------------------
  1 | #include "TestSpecsDeclarations.hpp"
  2 | 
  3 | #include "../../src/DataPipeline.hpp"
  4 | #include "../../src/LayerData.hpp"
  5 | 
  6 | using namespace cnn_sr;
  7 | 
///
/// NOTE: generating the expected output only checks that all inputs and
/// deltas are read correctly. To do this, change the following line:
///   'scratch_w[idx] = delta * layer_input[prev_layer_idx + k];'
/// to:
///   'scratch_w[idx] = layer_input[prev_layer_idx + k];'
///   OR
///   'scratch_w[idx] = delta;'
/// Alternatively, use BackpropagationTest_script.py to calculate the values.
///
/// NOTE: data set 1 checks that the kernel works, data set 2 checks that it
/// does not crash when used with a large amount of data
///
 21 | 
 22 | namespace test {
 23 | namespace specs {
 24 | 
 25 | ///
 26 | /// PIMPL
 27 | ///
/// Fixture data for the "value correctness" data set of BackpropagationTest.
/// The test's operator() runs it with a 5x5 input and LayerData(2, 3, 3),
/// i.e. 2 previous-layer filters, 3 current filters, 3x3 kernel.
struct BackpropagationTestImpl {
// INPUT_SIZE = input_dim*n(l-1)
// (here: 5*5 spatial positions * 2 previous-layer filters)
#define INPUT_SIZE 50
  // Layer input; each source line holds the 2 filter values of one position,
  // blank lines separate rows of 5 positions.
  float input[INPUT_SIZE] = {-0.083, -0.064,  //
                             0.075,  -0.055,  //
                             -0.058, -0.138,  //
                             -0.068, -0.144,  //
                             -0.013, 0.176,   //

                             0.169,  0.049,   //
                             0.181,  -0.051,  //
                             0.136,  -0.062,  //
                             -0.165, -0.176,  //
                             0.159,  -0.060,  //

                             -0.112, 0.228,   //
                             0.003,  -0.138,  //
                             -0.123, -0.027,  //
                             -0.102, -0.061,  //
                             0.242,  -0.069,  //

                             0.406,  0.419,   //
                             -0.442, 0.685,   //
                             -0.627, -0.489,  //
                             0.376,  0.563,   //
                             0.680,  -0.371,  //

                             0.121,  -0.075,   //
                             -0.103, 0.031,    //
                             0.106,  0.033,    //
                             -0.036, -0.052,   //
                             0.052,  -0.035};  //

// DELTAS_SIZE = output_dim * n(l)
// (here: 3*3 output positions * 3 current-layer filters)
#define DELTAS_SIZE 27
  // Deltas of the current layer; one source line = 3 filter values of one
  // output position.
  float deltas[DELTAS_SIZE] = {0.122f, 0.083f, 0.064f,  // row 1, col 1
                               0.057f, 0.075f, 0.055f,  // row 1, col 2
                               0.025f, 0.058f, 0.138f,  // row 1, col 3

                               0.170f, 0.068f, 0.144f,  // row 2, col 1
                               0.121f, 0.013f, 0.176f,  // row 2, col 2
                               0.065f, 0.169f, 0.049f,  // row 2, col 3

                               0.003f, 0.181f, 0.051f,   // row 3, col 1
                               0.021f, 0.136f, 0.062f,   // row 3, col 2
                               0.066f, 0.165f, 0.176f};  // row 3, col 3
// Number of weights of the tested layer (3*3 kernel * 2 * 3 filters); also
// used by operator() to size its dummy weights array.
#define WEIGHTS_SIZE 54
  // Value the accumulated weight-gradient buffer is filled with before the
  // kernel runs (passed as `w_init` to execute()).
  const float grad_weights_init_val = 1.5f;
  /* clang-format off */
  // Expected content of accumulating_grad_w after backpropagation
  // (WEIGHTS_SIZE values).
  const std::vector<float> expected_weights = {
     1.5438,  1.4920,  1.5265,            1.4797,  1.4928,  1.4672,
     1.5313,  1.4511,  1.5087,            1.4492,  1.4040,  1.4227,
     1.5157,  1.5271,  1.5191,            1.4377,  1.4467,  1.4474,
     1.4582,  1.4170,  1.4009,            1.5052,  1.5941,  1.4768,
     1.5150,  1.3938,  1.4748,            1.4841,  1.6112,  1.5451,
     1.5445,  1.5892,  1.6088,            1.4503,  1.3907,  1.4047,
     1.4634,  1.4251,  1.4444,            1.6442,  1.4578,  1.6641,
     1.3638,  1.5003,  1.3188,            1.5713,  1.6199,  1.5159,
     1.4713,  1.5962,  1.5414,            1.4491,  1.3937,  1.4882
  };
  /* clang-format on */

  // Expected content of accumulating_grad_b; each value is the sum of
  // `deltas` over all 9 output positions for one filter.
  const std::vector<float> expected_bias = {0.650f, 0.948f, 0.915f};
};
 92 | 
 93 | ///
 94 | /// BackpropagationTest
 95 | ///
 96 | 
 97 | TEST_SPEC_PIMPL(BackpropagationTest)
 98 | 
 99 | void BackpropagationTest::init() {}
100 | 
101 | std::string BackpropagationTest::name(size_t data_set_id) {
102 |   return data_set_id == 0 ?                              //
103 |              "Backpropagation test - value correctness"  //
104 |                           : "Backpropagation test - big data";
105 | }
106 | 
107 | size_t BackpropagationTest::data_set_count() { return 2; }
108 | 
109 | void execute(DataPipeline *pipeline, LayerData &data,    //
110 |              cnn_sr::LayerAllocationPool &gpu_buf,       //
111 |              float *deltas, float *input, float w_init,  //
112 |              size_t input_w, size_t input_h) {
113 |   auto context = pipeline->context();
114 |   size_t output_dim[2];
115 |   data.get_output_dimensions(output_dim, input_w, input_h);
116 |   size_t deltas_size =
117 |              output_dim[0] * output_dim[1] * data.current_filter_count,
118 |          input_size = data.input_size(input_w, input_h);
119 | 
120 |   // gpu memory alloc
121 |   /* clang-format off */
122 |   auto gpu_deltas = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * deltas_size);
123 |   auto gpu_input = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * input_size);
124 |   gpu_buf.accumulating_grad_w = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * data.weight_size());
125 |   /* clang-format on */
126 |   context->write_buffer(gpu_deltas, (void *)deltas, true);
127 |   context->write_buffer(gpu_input, (void *)input, true);
128 |   context->fill_float(gpu_buf.accumulating_grad_w, w_init, true);
129 | 
130 |   // run
131 |   pipeline->backpropagate(data, gpu_input, gpu_deltas, gpu_buf,  //
132 |                           output_dim[0], output_dim[1]);
133 | }
134 | 
135 | bool BackpropagationTest::operator()(size_t data_set_id,
136 |                                      cnn_sr::DataPipeline *const pipeline) {
137 |   assert_not_null(pipeline);
138 |   auto context = pipeline->context();
139 |   cnn_sr::LayerAllocationPool gpu_buf;
140 | 
141 |   if (data_set_id == 0) {
142 |     // data for layer, needs filled up weights&bias to pass validation
143 |     LayerData data(2, 3, 3);  // n_prev_filter_cnt/FILTER_CNT/f_spatial_size
144 |     float w[WEIGHTS_SIZE], bias[10];
145 |     data.set_bias(bias);
146 |     data.set_weights(w);
147 |     execute(pipeline, data, gpu_buf, _impl->deltas, _impl->input,
148 |             _impl->grad_weights_init_val, 5, 5);
149 |     // check results
150 |     std::cout << "checking weights" << std::endl;
151 |     assert_equals(pipeline, _impl->expected_weights,
152 |                   gpu_buf.accumulating_grad_w);
153 |     std::cout << "checking bias" << std::endl;
154 |     assert_equals(pipeline, _impl->expected_bias, gpu_buf.accumulating_grad_b);
155 |   } else {
156 |     LayerData data(32, 16, 3);
157 |     float w[4608], bias[16];
158 |     data.set_bias(bias);
159 |     data.set_weights(w);
160 |     // (we dont care about values and sizes must only be at least enough)
161 |     const size_t input_w = 1024, input_h = 1024;
162 |     std::vector<float> deltas(input_w * input_h * data.current_filter_count),
163 |         input(input_w * input_h * data.n_prev_filter_cnt);
164 |     execute(pipeline, data, gpu_buf,  //
165 |             &deltas[0], &input[0], 0.0f, input_w, input_h);
166 |     context->block();
167 |     // didn't crash? then it's ok
168 |   }
169 | 
170 |   return true;
171 | }
172 | 
173 | //
174 | //
175 | }  // namespace specs
176 | }  // namespace test
177 | 


--------------------------------------------------------------------------------
/test/specs/LayerDeltasTest.cpp:
--------------------------------------------------------------------------------
  1 | #include "TestSpecsDeclarations.hpp"
  2 | 
  3 | #include "../../src/DataPipeline.hpp"
  4 | #include "../../src/LayerData.hpp"
  5 | 
  6 | using namespace cnn_sr;
  7 | 
///
/// NOTE: generating the expected output only checks that all weights, deltas
/// and outputs are read correctly. To do this, change the following line:
///   'delta_for_filter[n] += delta * w * activation_func_derivative;'
/// to:
///   'delta_for_filter[n] += ONLY_ONE_OF_MULTIPLIERS'
///
/// compare results with:
///   * weight: should be [-0.077999..., -0.584] (sum columns 1-3 for first
///             value, columns 4-6 for second)
///   * activation_func_derivative: should be expected_derivative
///                                 (code-generated)
///   * delta: run LayerDeltasTest_script.py
/// If all of the multipliers have correct values, their product will be ok.
///
 23 | 
 24 | namespace test {
 25 | namespace specs {
 26 | 
 27 | ///
 28 | /// PIMPL
 29 | ///
/// Fixture data for LayerDeltasTest. The test's operator() uses it with
/// LayerData(2, 3, 3) as the current layer and a 3x3 output.
struct LayerDeltasTestImpl {
// INPUT_SIZE = input_dim*n(l-1)
// (here: 50 values, 2 previous-layer filter values per source line)
#define INPUT_SIZE 50
  // Pre-activation values of the previous layer; operator() feeds them
  // through activation_function() to build the previous layer's output.
  float input_x[INPUT_SIZE] = {-0.083, -0.064,  //
                               0.075,  -0.055,  //
                               -0.058, -0.138,  //
                               -0.068, -0.144,  //
                               -0.013, 0.176,   //

                               0.169,  0.049,   //
                               0.181,  -0.051,  //
                               0.136,  -0.062,  //
                               -0.165, -0.176,  //
                               0.159,  -0.060,  //

                               -0.112, 0.228,   //
                               0.003,  -0.138,  //
                               -0.123, -0.027,  //
                               -0.102, -0.061,  //
                               0.242,  -0.069,  //

                               0.406,  0.419,   //
                               -0.442, 0.685,   //
                               -0.627, -0.489,  //
                               0.376,  0.563,   //
                               0.680,  -0.371,  //

                               0.121,  -0.075,   //
                               -0.103, 0.031,    //
                               0.106,  0.033,    //
                               -0.036, -0.052,   //
                               0.052,  -0.035};  //

// weights
// WEIGTHS_SIZE = f(l)*f(l)*n(l-1)*n(l)
// n(l)=3    |     n(l-1)=2
#define WEIGHTS_SIZE 54
  /* clang-format off */
// Weights of the current layer, handed to curr_data.set_weights().
float weights[WEIGHTS_SIZE] = {
   -0.369,  0.025,  0.213,     0.058,  0.410, -0.068,
    0.236,  0.071, -0.429,    -0.104,  0.161,  0.087,
    0.361, -0.055,  0.273,     0.071,  0.431, -0.095,

    0.229,  0.378, -0.178,     0.343,  0.114, -0.409,
   -0.220, -0.364,  0.711,     0.281,  0.851, -1.001,
   -0.411,   0.661, -0.831,    -0.091,  0.281, -0.341,

   -0.931,   0.511,  0.141,    -0.591,  0.491, -0.921,
    0.291,  -0.211,  0.151,     0.491, -0.431, -0.321,
   -0.631,   0.301, -0.001,    -0.761, -0.021,  0.501};
/* clang-format on */

// DELTAS_SIZE = output_dim * n(l)
// (here: 3*3 output positions * 3 current-layer filters)
#define DELTAS_SIZE 27
  // Deltas of the current layer, uploaded as `curr_deltas`.
  float deltas[DELTAS_SIZE] = {0.122, 0.083, 0.064,   // row 1, col 1
                               0.057, 0.075, 0.055,   // row 1, col 2
                               0.025, 0.058, 0.138,   // row 1, col 3
                               0.170, 0.068, 0.144,   // row 2, col 1
                               0.121, 0.013, 0.176,   // row 2, col 2
                               0.065, 0.169, 0.049,   // row 2, col 3
                               0.003, 0.181, 0.051,   // row 3, col 1
                               0.021, 0.136, 0.062,   // row 3, col 2
                               0.066, 0.165, 0.176};  // row 3, col 3

  /* clang-format off */
  // Expected deltas of the previous layer (INPUT_SIZE values, same layout as
  // input_x). NOTE(review): entries are 0 exactly where input_x is negative,
  // which looks like a ReLU-style derivative - confirm against
  // activation_function_derivative().
  std::vector<float> expected_output = {
    0,                      0,
    -0.000213999,           0,
    0,                      0,
    0,                      0,
    0,                      0.013663,

    0.017562,               0.05308,
    -0.00359898,            0,
    -0.004519,              0,
    0,                      0,
    -0.059068,              0,

    0,                     -0.012211,
    0.06273,                0,
    0,                      0,
    0,                      0,
    0.108619,               0,

    -0.043191,             -0.198902,
    0,                     -0.118114,
    0,                      0,
    -0.00165999,           -0.062883,
    -0.054512,              0,

    0.096889,               0,
    0,                     -0.095646,
    0.086999,              -0.168827,
    0,                      0,
    0.007843,               0
  };
  /* clang-format on */
};
128 | 
129 | ///
130 | /// LayerDeltasTest
131 | ///
132 | 
133 | TEST_SPEC_PIMPL(LayerDeltasTest)
134 | 
135 | void LayerDeltasTest::init() {}
136 | 
137 | std::string LayerDeltasTest::name(size_t) { return "Layer deltas test"; }
138 | 
139 | size_t LayerDeltasTest::data_set_count() { return 1; }
140 | 
141 | bool LayerDeltasTest::operator()(size_t, cnn_sr::DataPipeline *const pipeline) {
142 |   assert_not_null(pipeline);
143 |   auto context = pipeline->context();
144 | 
145 |   const size_t IGNORED = 10;
146 | 
147 |   // data for layer, needs filled up weights&bias to pass validation
148 |   LayerData prev_data(IGNORED, 2, IGNORED);  // n(l-2), n(l-1), f(l-1)
149 |   LayerData curr_data(2, 3, 3);              // n(l-1), n(l), f(l)
150 |   float bias[3] = {0.0f, 0.0f, 0.0f};
151 |   curr_data.set_bias(bias);
152 |   curr_data.set_weights(_impl->weights);
153 | 
154 |   // previous layer results - used to take care of sigmoid func.
155 |   size_t output_dim[2] = {3, 3};
156 | 
157 |   // all variations with activation function
158 |   float output[INPUT_SIZE];
159 |   std::vector<float> expected_derivative(INPUT_SIZE);
160 |   size_t derivative_repeat_cnt = curr_data.f_spatial_size *
161 |                                  curr_data.f_spatial_size *
162 |                                  curr_data.current_filter_count;
163 |   for (size_t i = 0; i < INPUT_SIZE; i++) {
164 |     float x = _impl->input_x[i];
165 |     output[i] = activation_function(x);
166 |     expected_derivative[i] =
167 |         activation_function_derivative(x) * derivative_repeat_cnt;
168 |   }
169 | 
170 |   // gpu memory alloc
171 |   cnn_sr::LayerAllocationPool curr_gpu_buf;
172 |   opencl::MemoryHandle prev_deltas = gpu_nullptr;
173 |   /* clang-format off */
174 |   auto curr_deltas = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * DELTAS_SIZE);
175 |   auto prev_output = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * INPUT_SIZE);
176 |   /* clang-format on */
177 |   context->write_buffer(curr_deltas, (void *)_impl->deltas, true);
178 |   context->write_buffer(prev_output, (void *)output, true);
179 | 
180 |   // create kernel & run
181 |   auto kernel = pipeline->create_deltas_kernel(prev_data);
182 |   pipeline->calculate_deltas(*kernel,                   //
183 |                              prev_data, curr_data,      //
184 |                              curr_gpu_buf,              //
185 |                              prev_deltas, curr_deltas,  //
186 |                              output_dim[0], output_dim[1], prev_output);
187 |   assert_equals(pipeline, _impl->expected_output, prev_deltas);
188 | 
189 |   // sub test with expected_derivative
190 |   // assert_equals(pipeline, expected_derivative, prev_deltas);
191 | 
192 |   return true;
193 | }
194 | 
195 | //
196 | //
197 | }  // namespace specs
198 | }  // namespace test
199 | 


--------------------------------------------------------------------------------
/src/opencl/Kernel.cpp:
--------------------------------------------------------------------------------
  1 | #include "Kernel.hpp"
  2 | #include "Context.hpp"
  3 | 
  4 | #include <iostream>
  5 | #include <cstdio>
  6 | 
  7 | namespace opencl {
  8 | 
  9 | void Kernel::init(Context *ctx, cl_kernel k, cl_program p, const char *file,
 10 |                   const char *args) {
 11 |   if (initialized) cleanup();
 12 |   this->context = ctx;
 13 |   this->kernel_id = k;
 14 |   this->program_id = p;
 15 |   arg_stack_size = 0;
 16 |   assigned_local_memory = 0;
 17 |   initialized = true;
 18 |   // read parameters
 19 |   cl_int ciErr1;
 20 |   ciErr1 = clGetKernelWorkGroupInfo(k, context->device().device_id,
 21 |                                     CL_KERNEL_WORK_GROUP_SIZE, 1024,
 22 |                                     &max_work_group_size, nullptr);
 23 |   ciErr1 = clGetKernelWorkGroupInfo(k, context->device().device_id,
 24 |                                     CL_KERNEL_PRIVATE_MEM_SIZE, 1024,
 25 |                                     &private_mem_size, nullptr);
 26 |   ciErr1 =
 27 |       clGetKernelWorkGroupInfo(k, context->device().device_id,
 28 |                                CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
 29 |                                1024, &pref_work_group_multiple, nullptr);
 30 |   context->check_error(ciErr1, "Could not get kernel informations");
 31 | 
 32 |   file = file == nullptr ? "??" : file;
 33 |   args = args == nullptr ? "--" : args;
 34 |   if (file != nullptr && args != nullptr) {
 35 |     snprintf(this->human_identifier, MAX_KERNEL_IDENTIFIER_SIZE, "'%s'[%s]",
 36 |              file, args);
 37 |   }
 38 | }
 39 | 
 40 | void Kernel::cleanup() {
 41 |   if (!initialized) return;
 42 |   initialized = false;
 43 | 
 44 |   if (kernel_id) clReleaseKernel(kernel_id);
 45 |   if (program_id) clReleaseProgram(program_id);
 46 | }
 47 | 
 48 | size_t Kernel::current_local_memory() {
 49 |   cl_ulong loc_mem_size;
 50 |   cl_int ciErr1 = clGetKernelWorkGroupInfo(
 51 |       kernel_id, context->device().device_id, CL_KERNEL_LOCAL_MEM_SIZE, 1024,
 52 |       &loc_mem_size, nullptr);
 53 |   context->check_error(ciErr1, "Could not get kernel's local memory usage");
 54 |   return loc_mem_size > assigned_local_memory ? loc_mem_size
 55 |                                               : assigned_local_memory;
 56 | }
 57 | 
 58 | void Kernel::push_arg(size_t arg_size, const void *arg_value) {
 59 |   cl_int ciErr1 =
 60 |       clSetKernelArg(kernel_id, arg_stack_size, arg_size, arg_value);
 61 |   context->check_error(ciErr1, "Could not push kernel argument");
 62 |   ++arg_stack_size;
 63 |   // local memory
 64 |   if (!arg_value) assigned_local_memory += arg_size;
 65 | }
 66 | 
 67 | void Kernel::push_arg(MemoryHandle gpu_buf) {
 68 |   auto mem = context->raw_memory(gpu_buf);
 69 |   this->push_arg(sizeof(cl_mem), (void *)&mem->handle);
 70 | }
 71 | 
 72 | cl_event Kernel::execute(cl_uint work_dim,                //
 73 |                          const size_t *global_work_size,  //
 74 |                          const size_t *local_work_size,   //
 75 |                          cl_event *events_to_wait_for,
 76 |                          int events_to_wait_for_count) {
 77 |   context->check_error(context->is_initialized(),
 78 |                        "Context was not initialized");
 79 |   check_work_parameters(work_dim, global_work_size, local_work_size);
 80 | 
 81 |   // check used amount of local memory
 82 |   char msg_buffer[192];
 83 |   auto used_loc_mem = current_local_memory();
 84 |   if (used_loc_mem > context->device().local_mem_size) {
 85 |     snprintf(msg_buffer, sizeof(msg_buffer),
 86 |              "You are using too much local memory(%d), only %llu is available",
 87 |              used_loc_mem, context->device().local_mem_size);
 88 |     context->check_error(false, msg_buffer);
 89 |   }
 90 | 
 91 |   // correct event parameters
 92 |   if (!events_to_wait_for) events_to_wait_for_count = 0;
 93 |   if (events_to_wait_for_count <= 0) events_to_wait_for = nullptr;
 94 | 
 95 |   arg_stack_size = 0;  // prepare for next invoke
 96 |   assigned_local_memory = 0;
 97 |   cl_command_queue *cmd_queue = context->command_queue();
 98 | 
 99 |   cl_event finish_token;
100 |   cl_int ciErr1 = clEnqueueNDRangeKernel(
101 |       *cmd_queue, kernel_id,              // what and where to execute
102 |       work_dim, nullptr,                  // must be NULL
103 |       global_work_size, local_work_size,  //
104 |       events_to_wait_for_count, events_to_wait_for,  // sync events
105 |       &finish_token);
106 |   context->check_error(ciErr1, "Error in clEnqueueNDRangeKernel");
107 | 
108 |   if (context->is_running_profile_mode()) {
109 |     clWaitForEvents(1, &finish_token);
110 |     cl_ulong start = 0, end = 0;
111 |     clGetEventProfilingInfo(finish_token, CL_PROFILING_COMMAND_START,
112 |                             sizeof(cl_ulong), &start, NULL);
113 |     clGetEventProfilingInfo(finish_token, CL_PROFILING_COMMAND_END,
114 |                             sizeof(cl_ulong), &end, NULL);
115 |     execution_time_sum += (end - start);
116 |   }
117 | 
118 |   return finish_token;
119 | }
120 | 
121 | void Kernel::check_work_parameters(cl_uint work_dim,  //
122 |                                    const size_t *global_work_size,
123 |                                    const size_t *local_work_size) {
124 |   // std::cout << std::endl
125 |   // << "Work size: " << ((unsigned int)work_dim)
126 |   // << "/" << (*global_work_size)
127 |   // << "/" << (*local_work_size) << std::endl;
128 | 
129 |   char msg_buffer[192];
130 |   if (work_dim < 1 || work_dim > 3) {
131 |     snprintf(msg_buffer, sizeof(msg_buffer),
132 |              "Work parameters: 1 <= (work_dim=%d) <= 3", work_dim);
133 |     context->check_error(false, msg_buffer);
134 |   }
135 | 
136 |   auto device = context->device();
137 |   long long device_work_id_range = ((long long)1) << device.address_bits;
138 |   long long real_global_work_size = 1,
139 |             real_local_work_size = 1;  // # of work-items in work-group
140 |   bool local_dims_lte_device_max = true,
141 |        global_dims_divisible_by_local_dims = true;
142 | 
143 |   for (size_t i = 0; i < work_dim; i++) {
144 |     real_global_work_size *= global_work_size[i];
145 |     if (local_work_size) {
146 |       real_local_work_size *= local_work_size[i];
147 |       local_dims_lte_device_max &=
148 |           local_work_size[i] <= device.work_items_for_dims[i];
149 |       global_dims_divisible_by_local_dims &=
150 |           global_work_size[i] % local_work_size[i] == 0;
151 |     }
152 |   }
153 | 
154 | #define WORK_DIMENSIONS_STR "global:[%d,%d,%d], local:[%d,%d,%d]"
155 | #define WORK_DIMENSIONS_VAL global_work_size[0],                     \
156 |                            (work_dim > 1 ? global_work_size[1] : 1), \
157 |                            (work_dim == 3 ? global_work_size[2] : 1),\
158 |                            local_work_size[0],                       \
159 |                            (work_dim > 1 ? local_work_size[1] : 1),  \
160 |                            (work_dim == 3 ? local_work_size[2] : 1)
161 | 
162 |   bool is_ok = true;
163 |   if (!local_dims_lte_device_max) {
164 |     is_ok = false;
165 |     snprintf(msg_buffer, sizeof(msg_buffer),
166 |              "Work parameters: one of local dimensions are bigger "
167 |              "then device allows. " WORK_DIMENSIONS_STR,
168 |              WORK_DIMENSIONS_VAL);
169 |   } else if (!global_dims_divisible_by_local_dims) {
170 |     is_ok = false;
171 |     snprintf(msg_buffer, sizeof(msg_buffer),
172 |              "Work parameters: For each dimension "
173 |              "global_work_size should be multiply of "
174 |              "local_work_size. " WORK_DIMENSIONS_STR,
175 |              WORK_DIMENSIONS_VAL);
176 |   } else if (real_global_work_size > device_work_id_range) {
177 |     is_ok = false;
178 |     snprintf(msg_buffer, sizeof(msg_buffer),
179 |              "Work parameters: global_work_size(%llu) is bigger then device "
180 |              "address_bits(%d) can represent. " WORK_DIMENSIONS_STR,
181 |              real_global_work_size, device.address_bits, WORK_DIMENSIONS_VAL);
182 |   } else if (real_local_work_size > device.max_work_group_size ||
183 |              real_local_work_size > this->max_work_group_size) {
184 |     is_ok = false;
185 |     snprintf(msg_buffer, sizeof(msg_buffer),
186 |              "Work parameters: local_work_size(%llu) is bigger then device(%d) "
187 |              "or kernel(%d) allows. " WORK_DIMENSIONS_STR,
188 |              real_local_work_size, device.max_work_group_size,
189 |              this->max_work_group_size, WORK_DIMENSIONS_VAL);
190 |   }
191 | 
192 |   context->check_error(is_ok, msg_buffer);
193 | }
194 | 
195 | std::ostream &operator<<(std::ostream &os, opencl::Kernel &k) {
196 |   os << "program id: " << k.program_id                                //
197 |      << ", kernel id: " << k.kernel_id                                //
198 |      << ", max_work_group_size: " << k.max_work_group_size            //
199 |      << ", private_mem_size: " << k.private_mem_size                  //
200 |      << ", pref_work_group_multiple: " << k.pref_work_group_multiple  //
201 |      << ", allocated local memory: " << (k.current_local_memory() / 1024)
202 |      << "KB";  //
203 |   return os;
204 | }
205 | }
206 | 


--------------------------------------------------------------------------------
/src/DataPipeline.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef DATA_PIPELINE_H
  2 | #define DATA_PIPELINE_H
  3 | 
  4 | #include "pch.hpp"
  5 | 
  6 | // TODO move this to opencl::Context
  7 | const opencl::MemoryHandle gpu_nullptr = 1 << 30;
  8 | 
  9 | namespace cnn_sr {
 10 | 
 11 | struct LayerAllocationPool {
 12 |   /** Forward: weights, size: f*f*n*k */
 13 |   opencl::MemoryHandle weights = gpu_nullptr;
 14 |   /** Forward: bias, size: n */
 15 |   opencl::MemoryHandle bias = gpu_nullptr;
 16 | 
 17 |   /** Backpropagation: Accumulate gradients through out batch execution,
 18 |       size: f*f*n*k */
 19 |   opencl::MemoryHandle accumulating_grad_w = gpu_nullptr;
 20 |   /** Backpropagation: Accumulate gradients through out batch execution,
 21 |       size: f*f*n*k */
 22 |   opencl::MemoryHandle accumulating_grad_b = gpu_nullptr;
 23 |   /** Backpropagation-momentum: Deltas that we had after previous batch,
 24 |       size: f*f*n*k */
 25 |   opencl::MemoryHandle previous_batch_delta_w = gpu_nullptr;
 26 |   /** Backpropagation-momentum: Deltas that we had after previous batch,
 27 |       size: f*f*n*k */
 28 |   opencl::MemoryHandle previous_batch_delta_b = gpu_nullptr;
 29 | };
 30 | 
 31 | /**
 32 |  * Class used to execute various pipeline methods f.e.:
 33 |  * - luma extraction
 34 |  * - mean squared error
 35 |  * - all CNN methods
 36 |  *
 37 |  * This is quite low level - thin wrappers with validation mostly
 38 |  */
 39 | class DataPipeline {
 40 |  public:
 41 |   static int LOAD_KERNEL_LUMA;
 42 |   static int LOAD_KERNEL_LAYERS;
 43 |   static int LOAD_KERNEL_BACKPROPAGATE;
 44 |   static int LOAD_KERNEL_MISC;
 45 |   static int LOAD_KERNEL_NONE;
 46 |   static int LOAD_KERNEL_ALL;
 47 | 
 48 |   DataPipeline(opencl::Context*);
 49 |   virtual ~DataPipeline() {}
 50 |   virtual void init(int load_flags = DataPipeline::LOAD_KERNEL_ALL);
 51 |   opencl::Context* context();
 52 | 
 53 |   /**
 54 |    * Take image, write it to GPU (gpu_buf_raw_img), and write luma channel
 55 |    * separately to gpu_buf_luma
 56 |    *
 57 |    * used buffers:
 58 |    * 	in  - NONE
 59 |    * 	out - param->gpu_buf_raw_img(with raw image data, all 3 channels)
 60 |    * 	      param->gpu_buf_luma(with luma channel of provided image)
 61 |    */
 62 |   cl_event extract_luma(opencl::utils::ImageData&, opencl::MemoryHandle&,
 63 |                         opencl::MemoryHandle&, bool, cl_event* ev = nullptr);
 64 | 
 65 |   /** Swap luma in image to specified set of values */
 66 |   cl_event swap_luma(opencl::utils::ImageData&,
 67 |                      opencl::MemoryHandle& gpu_buf_org_img,
 68 |                      opencl::MemoryHandle gpu_buf_new_luma,
 69 |                      opencl::MemoryHandle& target,  //
 70 |                      size_t new_luma_w, size_t new_luma_h,
 71 |                      cl_event* ev = nullptr);
 72 | 
 73 |   /**
 74 |    * Forward propagation for single layer.
 75 |    *
 76 |    * used buffers:
 77 |    * 	in  - layer.weights, layer.bias, this layer's input(that means previous
 78 |    *                                                             layer output)
 79 |    * 	out - layer.output
 80 |    */
 81 |   cl_event execute_layer(opencl::Kernel&, const LayerData&,
 82 |                          cnn_sr::LayerAllocationPool&, opencl::MemoryHandle&,
 83 |                          size_t, size_t, size_t id, opencl::MemoryHandle&,
 84 |                          cl_event* ev = nullptr);
 85 | 
 86 |   /**
 87 |    * This function blocks.
 88 |    *
 89 |    * used buffers:
 90 |    * 	in  - orginal image luma, layer_3.output
 91 |    * 	out - this->_tmp_gpu_float
 92 |    *
 93 |    * @param  total_padding        difference in size between ground_truth image
 94 |    *                              and result. Should be equal to f1+f2+f3-3
 95 |    */
 96 |   cl_event squared_error(opencl::MemoryHandle gpu_buf_ground_truth,
 97 |                          size_t ground_truth_w, size_t ground_truth_h,
 98 |                          size_t id, opencl::MemoryHandle gpu_buf_algo_res,
 99 |                          opencl::MemoryHandle tmp_buffer, float& target,
100 |                          size_t total_padding, cl_event* ev = nullptr);
101 | 
102 |   /**
103 |    * Deltas last layer
104 |    *
105 |    * used buffers:
106 |    * 	in  - orginal image luma, layer_3.output
107 |    * 	out - param->gpu_buf_target
108 |    */
109 |   cl_event last_layer_delta(opencl::MemoryHandle gpu_buf_ground_truth,
110 |                             size_t ground_truth_w, size_t ground_truth_h,
111 |                             size_t id, opencl::MemoryHandle gpu_buf_algo_res,
112 |                             opencl::MemoryHandle& gpu_buf_target,
113 |                             size_t total_padding, cl_event* ev = nullptr);
114 | 
115 |   /**
116 |    * Deltas for current layer based on next layer
117 |    *
118 |    * used buffers:
119 |    * 	in  - next_layer.deltas, curr_layer.output, next_layer.weights
120 |    * 	out - curr_layer.deltas
121 |    */
122 |   cl_event calculate_deltas(opencl::Kernel&,  //
123 |                             const LayerData&, const LayerData&,
124 |                             cnn_sr::LayerAllocationPool&,                //
125 |                             opencl::MemoryHandle, opencl::MemoryHandle,  //
126 |                             size_t, size_t, size_t id,                   //
127 |                             opencl::MemoryHandle,                        //
128 |                             cl_event* ev = nullptr);
129 | 
130 |   /**
131 |    * Calculate gradients of weights and bias
132 |    *
133 |    * used buffers:
134 |    * 	in  - layer.deltas, this layer's input(that means previous layer output)
135 |    * 	out - layer.grad_w, layer.grad_b
136 |    */
137 |   cl_event backpropagate(LayerData&, opencl::MemoryHandle layer_input,
138 |                          opencl::MemoryHandle layer_deltas,
139 |                          LayerAllocationPool&,  //
140 |                          size_t layer_out_w, size_t layer_out_h, size_t id,
141 |                          cl_event* ev = nullptr, size_t ev_cnt = 0);
142 | 
143 |   /**
144 |    * Update weights and biases based on gradients and various factors like batch
145 |    * size, momentum, learning rate. Note that we are both using
146 |    * previous_delta_w/previous_delta_b to calculate his layers new
147 |    * weights/biases(READ) and updating theirs values(WRITE).
148 |    *
149 |    * used buffers:
150 |    * 	in  - layer.grad_w, layer.grad_b
151 |    * 	out - layer.weights, layer.bias
152 |    * 	in/out - layer.previous_delta_w, layer.previous_delta_b
153 |    */
154 |   cl_event update_parameters(LayerData&, LayerAllocationPool&,
155 |                              size_t batch_size, float momentum, float w_decay,
156 |                              float learning_rate, cl_event* ev = nullptr);
157 | 
158 |   ///
159 |   /// misc. kernels
160 |   ///
161 | 
162 |   /** Subtract mean value from all elements of the buffer. The mean parameter
163 |    * will have the mean value */
164 |   cl_event subtract_mean(opencl::MemoryHandle, float* mean = nullptr,
165 |                          cl_event* ev = nullptr);
166 | 
167 |   /**
168 |    * Sum all float in buffer. You may choose to square the values before adding
169 |    * them up.
170 |    */
171 |   float sum(opencl::MemoryHandle, bool squared = false, cl_event* ev = nullptr);
172 | 
173 |   /** Subtract provided value from all elements of the buffer */
174 |   cl_event subtract_from_all(opencl::MemoryHandle, float,
175 |                              cl_event* ev = nullptr);
176 | 
177 |   ///
178 |   /// kernel creation - ones that are not created during standard init
179 |   ///
180 |   /** @param  skip_relu:bool skip relu step, writing raw result */
181 |   opencl::Kernel* create_layer_kernel(const LayerData&, bool);
182 |   opencl::Kernel* create_deltas_kernel(const LayerData&);
183 | 
184 |   ///
185 |   /// misc
186 |   ///
187 |   void print_buffer(opencl::MemoryHandle, const char* const, size_t);
188 | 
189 |  protected:
190 |   void check_initialized(int kernel_load_flags);
191 |   virtual void load_kernels(int load_flags);
192 | 
193 |   /** Either allocation has exact size or release it. Memory is deallocated
194 |    * here, but we cannot allocate it with proper size since f.e. allocating
195 |    * image is different then allocating normal buffer.
196 |    */
197 |   bool allocation_has_right_size__(opencl::MemoryHandle, size_t,  //
198 |                                    size_t, const char*);
199 | 
200 |  private:
201 |   void pre_execute_layer_validation(const LayerData&, opencl::MemoryHandle,
202 |                                     size_t, size_t);
203 |   size_t element_count(opencl::MemoryHandle, size_t el_size);
204 | 
205 |  protected:
206 |   opencl::Context* const _context;
207 |   bool _initialized;
208 | 
209 |   /** Single float. Quite useful. */
210 |   opencl::MemoryHandle _tmp_gpu_float = gpu_nullptr;
211 | 
212 |   opencl::Kernel* _luma_kernel_norm = nullptr;
213 |   opencl::Kernel* _luma_kernel_raw = nullptr;
214 |   opencl::Kernel* _swap_luma_kernel = nullptr;
215 |   opencl::Kernel* _squared_error_kernel = nullptr;
216 |   opencl::Kernel* _sum_kernel = nullptr;
217 |   opencl::Kernel* _sum_squared_kernel = nullptr;
218 |   opencl::Kernel* _subtract_from_all_kernel = nullptr;
219 |   opencl::Kernel* _last_layer_delta_kernel = nullptr;
220 |   opencl::Kernel* _update_parameters_kernel = nullptr;
221 |   opencl::Kernel* _backpropagate_kernel = nullptr;
222 | };
223 | }
224 | 
225 | #endif /* DATA_PIPELINE_H   */
226 | 


--------------------------------------------------------------------------------
/src/opencl/UtilsOpenCL.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <stdio.h>
  3 | #include <strings.h>
  4 | #include <stdexcept>
  5 | #include <algorithm>  // f.e. std::minmax_element
  6 | 
  7 | #define STBI_FAILURE_USERMSG
  8 | #define STB_IMAGE_IMPLEMENTATION
  9 | #include "stb/stb_image.h"
 10 | #define STB_IMAGE_WRITE_IMPLEMENTATION
 11 | #include "stb/stb_image_write.h"
 12 | 
 13 | #include "UtilsOpenCL.hpp"
 14 | #include "Kernel.hpp"
 15 | #include "Context.hpp"
 16 | #include "../pch.hpp"
 17 | 
 18 | namespace opencl {
 19 | namespace utils {
 20 | 
 21 | char const *device_type_str[9] = {
 22 |     "-",
 23 |     "default",  // 1
 24 |     "CPU",      // 2
 25 |     "-",
 26 |     "GPU",  // 4
 27 |     "-",           "-", "-",
 28 |     "Accelerator",  // 8
 29 | };
 30 | 
 31 | char *load_file(const char *cFilename, const char *cPreamble,
 32 |                 size_t *szFinalLength) {
 33 |   FILE *pFileStream = NULL;
 34 |   size_t szSourceLength;
 35 | 
 36 | #ifdef _MSC_VER  // Visual studio
 37 |   if (fopen_s(&pFileStream, cFilename, "rb") != 0) {
 38 |     return NULL;
 39 |   }
 40 | #else  // Linux version
 41 |   pFileStream = fopen(cFilename, "rb");
 42 |   if (pFileStream == 0) {
 43 |     return NULL;
 44 |   }
 45 | #endif
 46 | 
 47 |   size_t szPreambleLength = strlen(cPreamble);
 48 | 
 49 |   // get the length of the source code
 50 |   fseek(pFileStream, 0, SEEK_END);
 51 |   szSourceLength = ftell(pFileStream);
 52 |   fseek(pFileStream, 0, SEEK_SET);
 53 | 
 54 |   // allocate a buffer for the source code string and read it in
 55 |   char *cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
 56 |   memcpy(cSourceString, cPreamble, szPreambleLength);
 57 |   if (fread((cSourceString) + szPreambleLength, szSourceLength, 1,
 58 |             pFileStream) != 1) {
 59 |     fclose(pFileStream);
 60 |     free(cSourceString);
 61 |     return 0;
 62 |   }
 63 | 
 64 |   // close the file and return the total length of the combined
 65 |   // (preamble + source) string
 66 |   fclose(pFileStream);
 67 |   if (szFinalLength != 0) {
 68 |     *szFinalLength = szSourceLength + szPreambleLength;
 69 |   }
 70 |   cSourceString[szSourceLength + szPreambleLength] = '\0';
 71 | 
 72 |   return cSourceString;
 73 | }
 74 | 
 75 | ///
 76 | /// images
 77 | ///
 78 | 
 79 | ImageData::ImageData() : w(0), h(0), bpp(0), data(nullptr) {}
 80 | 
 81 | ImageData::ImageData(int w, int h, int bpp, unsigned char *data)
 82 |     : w(w), h(h), bpp(bpp), data(data), read_from_file(false) {}
 83 | 
 84 | ImageData::~ImageData() {
 85 |   if (data && read_from_file) stbi_image_free(data);
 86 | }
 87 | 
 88 | void load_image(const char *filename, ImageData &data) {
 89 |   data.data = stbi_load(filename, &data.w, &data.h, &data.bpp, 4);
 90 |   // TODO CHECK_ALLOCATION(data.data);
 91 | }
 92 | 
 93 | int write_image(const char *filename, ImageData &data) {
 94 |   return stbi_write_png(filename, data.w, data.h, data.bpp, data.data, 0);
 95 | }
 96 | 
 97 | void write_image(const char *const file_path, float *source,  //
 98 |                  size_t w, size_t h) {
 99 |   size_t px_cnt = w * h;
100 |   // normalize values: 0..1
101 |   auto min_max_it = std::minmax_element(source, source + px_cnt);
102 |   float min = *min_max_it.first, max = *min_max_it.second,
103 |         norm_factor = max - min;
104 |   for (size_t i = 0; i < px_cnt; i++) {
105 |     source[i] = (source[i] - min) / norm_factor;
106 |   }
107 | 
108 |   std::cout << "writing image(" << w << "x" << h << ") to: '" << file_path
109 |             << "'" << std::endl;
110 |   std::vector<unsigned char> data(px_cnt * 3);
111 |   for (size_t row = 0; row < h; row++) {
112 |     for (size_t col = 0; col < w; col++) {
113 |       size_t idx = row * w + col;
114 |       float val = source[idx] * 255;
115 |       for (size_t k = 0; k < 3; k++) {
116 |         data[idx * 3 + k] = (unsigned char)val;
117 |       }
118 |     }
119 |   }
120 | 
121 |   ImageData dd(w, h, sizeof(unsigned char) * 3, &data[0]);
122 |   opencl::utils::write_image(file_path, dd);
123 | }
124 | 
125 | ///
126 | /// misc
127 | ///
128 | 
129 | void work_sizes(const opencl::Kernel &kernel, size_t dim,
130 |                 size_t *global_work_size, size_t *local_work_size, size_t *work,
131 |                 bool print) {
132 |   if (dim == 0 || dim > 3) {
133 |     throw std::runtime_error("Work dimesions should be 1,2 or 3");
134 |   }
135 | 
136 |   auto context = kernel.get_context();
137 |   auto device = context->device();
138 |   auto max_local =
139 |       std::min(device.max_work_group_size, kernel.get_max_work_group_size());
140 |   auto max_device_local_size = device.work_items_for_dims;
141 | 
142 |   size_t pow_2[3];
143 |   for (size_t i = 0; i < dim; i++) {
144 |     pow_2[i] = cnn_sr::utils::closest_power_of_2(static_cast<int>(work[i]));
145 |   }
146 | 
147 |   // local_work_size
148 |   // we are doing round robin (see to_update variable) multiplying each
149 |   // dimension by 2 each time. It may not work that good for:
150 |   // max_device_local_size = [1024, 1024, 1], since it stops after 3 iterations
151 |   // On the other note I've had to look up syntax to do{..}while(...);
152 |   size_t tmp[3] = {1, 1, 1}, local_dims_multiplied = 1, to_update = 0;
153 |   bool satisfies_conditions;
154 |   do {
155 |     // copy last correct configuration to local
156 |     memcpy(local_work_size, tmp, dim * sizeof(float));
157 |     tmp[to_update] *= 2;
158 |     local_dims_multiplied *= 2;
159 |     satisfies_conditions = tmp[to_update] <= max_device_local_size[to_update] &&
160 |                            tmp[to_update] <= pow_2[to_update] &&
161 |                            local_dims_multiplied <= max_local;
162 |     to_update = (to_update + 1) % dim;
163 |   } while (satisfies_conditions);
164 | 
165 |   // global_work_size
166 |   for (size_t i = 0; i < dim; i++) {
167 |     global_work_size[i] =
168 |         (pow_2[i] == local_work_size[i])
169 |             ? pow_2[i]
170 |             : ((work[i] / local_work_size[i]) + 1) * local_work_size[i];
171 |   }
172 | 
173 |   bool ok = true;
174 |   for (size_t i = 0; i < dim; i++) {
175 |     ok &= global_work_size[i] >= local_work_size[i];
176 |     ok &= global_work_size[i] >= work[i];
177 |     ok &= local_work_size[i] > 0;
178 |   }
179 | 
180 |   if (!ok) {
181 |     char buf[255];
182 |     snprintf(buf, 255,
183 |              "Tried to create nonstandard work dimensions: global=[%d,%d,%d], "
184 |              "local=[%d,%d,%d]",
185 |              global_work_size[0], (dim > 1 ? global_work_size[1] : 1),
186 |              (dim == 3 ? global_work_size[2] : 1),  //
187 |              local_work_size[0], (dim > 1 ? local_work_size[1] : 1),
188 |              (dim == 3 ? local_work_size[2] : 1));
189 |     throw std::runtime_error(buf);
190 |   }
191 | 
192 |   if (print) {
193 |     std::cout << "global work size: ["                        //
194 |               << global_work_size[0] << ", "                  //
195 |               << (dim > 1 ? global_work_size[1] : 1) << ", "  //
196 |               << (dim == 3 ? global_work_size[2] : 1) << "]" << std::endl;
197 |     std::cout << "local work size: ["                        //
198 |               << local_work_size[0] << ", "                  //
199 |               << (dim > 1 ? local_work_size[1] : 1) << ", "  //
200 |               << (dim == 3 ? local_work_size[2] : 1) << "]" << std::endl;
201 |   }
202 | }
203 | 
204 | const char *get_opencl_error_str(cl_int errorCode) {
205 | #define DECLARE_ERROR(err) \
206 |   case (err):              \
207 |     return #err
208 | 
209 |   switch (errorCode) {
210 |     DECLARE_ERROR(CL_SUCCESS);
211 |     DECLARE_ERROR(CL_DEVICE_NOT_FOUND);
212 |     DECLARE_ERROR(CL_DEVICE_NOT_AVAILABLE);
213 |     DECLARE_ERROR(CL_COMPILER_NOT_AVAILABLE);
214 |     DECLARE_ERROR(CL_MEM_OBJECT_ALLOCATION_FAILURE);
215 |     case CL_OUT_OF_RESOURCES:
216 |       return "CL_OUT_OF_RESOURCES - either running out of memory or possible "
217 |              "watchdog exception. See f.e "
218 |              "https://devtalk.nvidia.com/default/topic/471020/"
219 |              "driver-crashs-while-opencl-app-is-running/";
220 |       DECLARE_ERROR(CL_OUT_OF_HOST_MEMORY);
221 |       DECLARE_ERROR(CL_PROFILING_INFO_NOT_AVAILABLE);
222 |       DECLARE_ERROR(CL_MEM_COPY_OVERLAP);
223 |       DECLARE_ERROR(CL_IMAGE_FORMAT_MISMATCH);
224 |       DECLARE_ERROR(CL_IMAGE_FORMAT_NOT_SUPPORTED);
225 |       DECLARE_ERROR(CL_BUILD_PROGRAM_FAILURE);
226 |       DECLARE_ERROR(CL_MAP_FAILURE);
227 |       DECLARE_ERROR(CL_MISALIGNED_SUB_BUFFER_OFFSET);
228 |       DECLARE_ERROR(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
229 |       DECLARE_ERROR(CL_INVALID_VALUE);
230 |       DECLARE_ERROR(CL_INVALID_DEVICE_TYPE);
231 |       DECLARE_ERROR(CL_INVALID_PLATFORM);
232 |       DECLARE_ERROR(CL_INVALID_DEVICE);
233 |       DECLARE_ERROR(CL_INVALID_CONTEXT);
234 |       DECLARE_ERROR(CL_INVALID_QUEUE_PROPERTIES);
235 |       DECLARE_ERROR(CL_INVALID_COMMAND_QUEUE);
236 |       DECLARE_ERROR(CL_INVALID_HOST_PTR);
237 |       DECLARE_ERROR(CL_INVALID_MEM_OBJECT);
238 |       DECLARE_ERROR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
239 |       DECLARE_ERROR(CL_INVALID_IMAGE_SIZE);
240 |       DECLARE_ERROR(CL_INVALID_SAMPLER);
241 |       DECLARE_ERROR(CL_INVALID_BINARY);
242 |       DECLARE_ERROR(CL_INVALID_BUILD_OPTIONS);
243 |       DECLARE_ERROR(CL_INVALID_PROGRAM);
244 |       DECLARE_ERROR(CL_INVALID_PROGRAM_EXECUTABLE);
245 |       DECLARE_ERROR(CL_INVALID_KERNEL_NAME);
246 |       DECLARE_ERROR(CL_INVALID_KERNEL_DEFINITION);
247 |       DECLARE_ERROR(CL_INVALID_KERNEL);
248 |       DECLARE_ERROR(CL_INVALID_ARG_INDEX);
249 |       DECLARE_ERROR(CL_INVALID_ARG_VALUE);
250 |       DECLARE_ERROR(CL_INVALID_ARG_SIZE);
251 |       DECLARE_ERROR(CL_INVALID_KERNEL_ARGS);
252 |       DECLARE_ERROR(CL_INVALID_WORK_DIMENSION);
253 |       DECLARE_ERROR(CL_INVALID_WORK_GROUP_SIZE);
254 |       DECLARE_ERROR(CL_INVALID_WORK_ITEM_SIZE);
255 |       DECLARE_ERROR(CL_INVALID_GLOBAL_OFFSET);
256 |       DECLARE_ERROR(CL_INVALID_EVENT_WAIT_LIST);
257 |       DECLARE_ERROR(CL_INVALID_EVENT);
258 |       DECLARE_ERROR(CL_INVALID_OPERATION);
259 |       DECLARE_ERROR(CL_INVALID_GL_OBJECT);
260 |       DECLARE_ERROR(CL_INVALID_BUFFER_SIZE);
261 |       DECLARE_ERROR(CL_INVALID_MIP_LEVEL);
262 |       DECLARE_ERROR(CL_INVALID_GLOBAL_WORK_SIZE);
263 |       // DECLARE_ERROR(CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR);
264 |       // DECLARE_ERROR(CL_PLATFORM_NOT_FOUND_KHR);
265 |       // DECLARE_ERROR(CL_INVALID_PROPERTY_EXT);
266 |       // DECLARE_ERROR(CL_DEVICE_PARTITION_FAILED_EXT);
267 |       // DECLARE_ERROR(CL_INVALID_PARTITION_COUNT_EXT);
268 |       DECLARE_ERROR(CL_INVALID_PROPERTY);
269 |     default:
270 |       return "unknown error code";
271 |   }
272 | #undef DECLARE_ERROR
273 | }
274 | 
275 | //
276 | }
277 | }
278 | 


--------------------------------------------------------------------------------